gcc/tree-vect-loop.c
1 /* Loop Vectorization
2 Copyright (C) 2003-2021 Free Software Foundation, Inc.
3 Contributed by Dorit Naishlos <dorit@il.ibm.com> and
4 Ira Rosen <irar@il.ibm.com>
6 This file is part of GCC.
8 GCC is free software; you can redistribute it and/or modify it under
9 the terms of the GNU General Public License as published by the Free
10 Software Foundation; either version 3, or (at your option) any later
11 version.
13 GCC is distributed in the hope that it will be useful, but WITHOUT ANY
14 WARRANTY; without even the implied warranty of MERCHANTABILITY or
15 FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
16 for more details.
18 You should have received a copy of the GNU General Public License
19 along with GCC; see the file COPYING3. If not see
20 <http://www.gnu.org/licenses/>. */
22 #define INCLUDE_ALGORITHM
23 #include "config.h"
24 #include "system.h"
25 #include "coretypes.h"
26 #include "backend.h"
27 #include "target.h"
28 #include "rtl.h"
29 #include "tree.h"
30 #include "gimple.h"
31 #include "cfghooks.h"
32 #include "tree-pass.h"
33 #include "ssa.h"
34 #include "optabs-tree.h"
35 #include "diagnostic-core.h"
36 #include "fold-const.h"
37 #include "stor-layout.h"
38 #include "cfganal.h"
39 #include "gimplify.h"
40 #include "gimple-iterator.h"
41 #include "gimplify-me.h"
42 #include "tree-ssa-loop-ivopts.h"
43 #include "tree-ssa-loop-manip.h"
44 #include "tree-ssa-loop-niter.h"
45 #include "tree-ssa-loop.h"
46 #include "cfgloop.h"
47 #include "tree-scalar-evolution.h"
48 #include "tree-vectorizer.h"
49 #include "gimple-fold.h"
50 #include "cgraph.h"
51 #include "tree-cfg.h"
52 #include "tree-if-conv.h"
53 #include "internal-fn.h"
54 #include "tree-vector-builder.h"
55 #include "vec-perm-indices.h"
56 #include "tree-eh.h"
58 /* Loop Vectorization Pass.
60 This pass tries to vectorize loops.
62 For example, the vectorizer transforms the following simple loop:
64 short a[N]; short b[N]; short c[N]; int i;
66 for (i=0; i<N; i++){
67 a[i] = b[i] + c[i];
70 as if it was manually vectorized by rewriting the source code into:
72 typedef int __attribute__((mode(V8HI))) v8hi;
73 short a[N]; short b[N]; short c[N]; int i;
74 v8hi *pa = (v8hi*)a, *pb = (v8hi*)b, *pc = (v8hi*)c;
75 v8hi va, vb, vc;
77 for (i=0; i<N/8; i++){
78 vb = pb[i];
79 vc = pc[i];
80 va = vb + vc;
81 pa[i] = va;
84 The main entry to this pass is vectorize_loops(), in which
85 the vectorizer applies a set of analyses on a given set of loops,
86 followed by the actual vectorization transformation for the loops that
87 had successfully passed the analysis phase.
88 Throughout this pass we make a distinction between two types of
89 data: scalars (which are represented by SSA_NAMES), and memory references
90 ("data-refs"). These two types of data require different handling both
91 during analysis and transformation. The types of data-refs that the
92 vectorizer currently supports are ARRAY_REFS whose base is an array DECL
93 (not a pointer), and INDIRECT_REFS through pointers; both array and pointer
94 accesses are required to have a simple (consecutive) access pattern.
96 Analysis phase:
97 ===============
98 The driver for the analysis phase is vect_analyze_loop().
99 It applies a set of analyses, some of which rely on the scalar evolution
100 analyzer (scev) developed by Sebastian Pop.
102 During the analysis phase the vectorizer records some information
103 per stmt in a "stmt_vec_info" struct which is attached to each stmt in the
104 loop, as well as general information about the loop as a whole, which is
105 recorded in a "loop_vec_info" struct attached to each loop.
107 Transformation phase:
108 =====================
109 The loop transformation phase scans all the stmts in the loop, and
110 creates a vector stmt (or a sequence of stmts) for each scalar stmt S in
111 the loop that needs to be vectorized. It inserts the vector code sequence
112 just before the scalar stmt S, and records a pointer to the vector code
113 in STMT_VINFO_VEC_STMT (stmt_info) (stmt_info is the stmt_vec_info struct
114 attached to S). This pointer will be used for the vectorization of following
115 stmts which use the def of stmt S. Stmt S is removed if it writes to memory;
116 otherwise, we rely on dead code elimination for removing it.
118 For example, say stmt S1 was vectorized into stmt VS1:
120 VS1: vb = px[i];
121 S1: b = x[i]; STMT_VINFO_VEC_STMT (stmt_info (S1)) = VS1
122 S2: a = b;
124 To vectorize stmt S2, the vectorizer first finds the stmt that defines
125 the operand 'b' (S1), and gets the relevant vector def 'vb' from the
126 vector stmt VS1 pointed to by STMT_VINFO_VEC_STMT (stmt_info (S1)). The
127 resulting sequence would be:
129 VS1: vb = px[i];
130 S1: b = x[i]; STMT_VINFO_VEC_STMT (stmt_info (S1)) = VS1
131 VS2: va = vb;
132 S2: a = b; STMT_VINFO_VEC_STMT (stmt_info (S2)) = VS2
134 Operands that are not SSA_NAMEs, are data-refs that appear in
135 load/store operations (like 'x[i]' in S1), and are handled differently.
137 Target modeling:
138 =================
139 Currently the only target specific information that is used is the
140 size of the vector (in bytes) - "TARGET_VECTORIZE_UNITS_PER_SIMD_WORD".
141 Targets that can support different vector sizes will, for now, need
142 to specify one value for "TARGET_VECTORIZE_UNITS_PER_SIMD_WORD". More
143 flexibility will be added in the future.
145 Since we only vectorize operations whose vector form can be
146 expressed using existing tree codes, to verify that an operation is
147 supported, the vectorizer checks the relevant optab at the relevant
148 machine_mode (e.g., optab_handler (add_optab, V8HImode)). If
149 the value found is CODE_FOR_nothing, then there's no target support, and
150 we can't vectorize the stmt.
152 For additional information on this project see:
153 http://gcc.gnu.org/projects/tree-ssa/vectorization.html
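/* As a stand-alone illustration of the transformation sketched above
   (a sketch only: it uses the GNU vector_size extension instead of the
   internal V8HI mode spelling, assumes N is a multiple of 8, and ignores
   the alignment and aliasing issues the real pass has to handle):

     typedef short v8hi __attribute__ ((vector_size (16)));   8 x short

     void
     vadd (short *a, short *b, short *c, int N)
     {
       v8hi *pa = (v8hi *) a, *pb = (v8hi *) b, *pc = (v8hi *) c;
       for (int i = 0; i < N / 8; i++)
         pa[i] = pb[i] + pc[i];        one vector add per 8 scalar adds
     }
*/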
156 static void vect_estimate_min_profitable_iters (loop_vec_info, int *, int *);
157 static stmt_vec_info vect_is_simple_reduction (loop_vec_info, stmt_vec_info,
158 bool *, bool *);
160 /* Subroutine of vect_determine_vf_for_stmt that handles only one
161 statement. VECTYPE_MAYBE_SET_P is true if STMT_VINFO_VECTYPE
162 may already be set for general statements (not just data refs). */
164 static opt_result
165 vect_determine_vf_for_stmt_1 (vec_info *vinfo, stmt_vec_info stmt_info,
166 bool vectype_maybe_set_p,
167 poly_uint64 *vf)
169 gimple *stmt = stmt_info->stmt;
171 if ((!STMT_VINFO_RELEVANT_P (stmt_info)
172 && !STMT_VINFO_LIVE_P (stmt_info))
173 || gimple_clobber_p (stmt))
175 if (dump_enabled_p ())
176 dump_printf_loc (MSG_NOTE, vect_location, "skip.\n");
177 return opt_result::success ();
180 tree stmt_vectype, nunits_vectype;
181 opt_result res = vect_get_vector_types_for_stmt (vinfo, stmt_info,
182 &stmt_vectype,
183 &nunits_vectype);
184 if (!res)
185 return res;
187 if (stmt_vectype)
189 if (STMT_VINFO_VECTYPE (stmt_info))
190 /* The only case when a vectype had been already set is for stmts
191 that contain a data ref, or for "pattern-stmts" (stmts generated
192 by the vectorizer to represent/replace a certain idiom). */
193 gcc_assert ((STMT_VINFO_DATA_REF (stmt_info)
194 || vectype_maybe_set_p)
195 && STMT_VINFO_VECTYPE (stmt_info) == stmt_vectype);
196 else
197 STMT_VINFO_VECTYPE (stmt_info) = stmt_vectype;
200 if (nunits_vectype)
201 vect_update_max_nunits (vf, nunits_vectype);
203 return opt_result::success ();
206 /* Subroutine of vect_determine_vectorization_factor. Set the vector
207 types of STMT_INFO and all attached pattern statements and update
208 the vectorization factor VF accordingly. Return true on success
209 or false if something prevented vectorization. */
211 static opt_result
212 vect_determine_vf_for_stmt (vec_info *vinfo,
213 stmt_vec_info stmt_info, poly_uint64 *vf)
215 if (dump_enabled_p ())
216 dump_printf_loc (MSG_NOTE, vect_location, "==> examining statement: %G",
217 stmt_info->stmt);
218 opt_result res = vect_determine_vf_for_stmt_1 (vinfo, stmt_info, false, vf);
219 if (!res)
220 return res;
222 if (STMT_VINFO_IN_PATTERN_P (stmt_info)
223 && STMT_VINFO_RELATED_STMT (stmt_info))
225 gimple *pattern_def_seq = STMT_VINFO_PATTERN_DEF_SEQ (stmt_info);
226 stmt_info = STMT_VINFO_RELATED_STMT (stmt_info);
228 /* If a pattern statement has def stmts, analyze them too. */
229 for (gimple_stmt_iterator si = gsi_start (pattern_def_seq);
230 !gsi_end_p (si); gsi_next (&si))
232 stmt_vec_info def_stmt_info = vinfo->lookup_stmt (gsi_stmt (si));
233 if (dump_enabled_p ())
234 dump_printf_loc (MSG_NOTE, vect_location,
235 "==> examining pattern def stmt: %G",
236 def_stmt_info->stmt);
237 res = vect_determine_vf_for_stmt_1 (vinfo, def_stmt_info, true, vf);
238 if (!res)
239 return res;
242 if (dump_enabled_p ())
243 dump_printf_loc (MSG_NOTE, vect_location,
244 "==> examining pattern statement: %G",
245 stmt_info->stmt);
246 res = vect_determine_vf_for_stmt_1 (vinfo, stmt_info, true, vf);
247 if (!res)
248 return res;
251 return opt_result::success ();
254 /* Function vect_determine_vectorization_factor
256 Determine the vectorization factor (VF). VF is the number of data elements
257 that are operated upon in parallel in a single iteration of the vectorized
258 loop. For example, when vectorizing a loop that operates on 4-byte elements,
259 on a target with a vector size (VS) of 16 bytes, the VF is set to 4, since 4
260 elements can fit in a single vector register.
262 We currently support vectorization of loops in which all types operated upon
263 are of the same size. Therefore this function currently sets VF according to
264 the size of the types operated upon, and fails if there are multiple sizes
265 in the loop.
267 VF is also the factor by which the loop iterations are strip-mined, e.g.:
268 original loop:
269 for (i=0; i<N; i++){
270 a[i] = b[i] + c[i];
273 vectorized loop:
274 for (i=0; i<N; i+=VF){
275 a[i:VF] = b[i:VF] + c[i:VF];
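/* A minimal sketch with a concrete factor, assuming VF = 4 and reusing the
   a[i:VF] pseudo-notation from above; the leftover iterations are shown as
   a scalar epilogue here, although the real pass may instead peel,
   version, or use partial vectors:

     int i;
     for (i = 0; i + 4 <= N; i += 4)
       a[i:4] = b[i:4] + c[i:4];        vector body, 4 elements at a time
     for (; i < N; i++)
       a[i] = b[i] + c[i];              scalar epilogue for N % 4 elements
*/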
279 static opt_result
280 vect_determine_vectorization_factor (loop_vec_info loop_vinfo)
282 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
283 basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
284 unsigned nbbs = loop->num_nodes;
285 poly_uint64 vectorization_factor = 1;
286 tree scalar_type = NULL_TREE;
287 gphi *phi;
288 tree vectype;
289 stmt_vec_info stmt_info;
290 unsigned i;
292 DUMP_VECT_SCOPE ("vect_determine_vectorization_factor");
294 for (i = 0; i < nbbs; i++)
296 basic_block bb = bbs[i];
298 for (gphi_iterator si = gsi_start_phis (bb); !gsi_end_p (si);
299 gsi_next (&si))
301 phi = si.phi ();
302 stmt_info = loop_vinfo->lookup_stmt (phi);
303 if (dump_enabled_p ())
304 dump_printf_loc (MSG_NOTE, vect_location, "==> examining phi: %G",
305 phi);
307 gcc_assert (stmt_info);
309 if (STMT_VINFO_RELEVANT_P (stmt_info)
310 || STMT_VINFO_LIVE_P (stmt_info))
312 gcc_assert (!STMT_VINFO_VECTYPE (stmt_info));
313 scalar_type = TREE_TYPE (PHI_RESULT (phi));
315 if (dump_enabled_p ())
316 dump_printf_loc (MSG_NOTE, vect_location,
317 "get vectype for scalar type: %T\n",
318 scalar_type);
320 vectype = get_vectype_for_scalar_type (loop_vinfo, scalar_type);
321 if (!vectype)
322 return opt_result::failure_at (phi,
323 "not vectorized: unsupported "
324 "data-type %T\n",
325 scalar_type);
326 STMT_VINFO_VECTYPE (stmt_info) = vectype;
328 if (dump_enabled_p ())
329 dump_printf_loc (MSG_NOTE, vect_location, "vectype: %T\n",
330 vectype);
332 if (dump_enabled_p ())
334 dump_printf_loc (MSG_NOTE, vect_location, "nunits = ");
335 dump_dec (MSG_NOTE, TYPE_VECTOR_SUBPARTS (vectype));
336 dump_printf (MSG_NOTE, "\n");
339 vect_update_max_nunits (&vectorization_factor, vectype);
343 for (gimple_stmt_iterator si = gsi_start_bb (bb); !gsi_end_p (si);
344 gsi_next (&si))
346 if (is_gimple_debug (gsi_stmt (si)))
347 continue;
348 stmt_info = loop_vinfo->lookup_stmt (gsi_stmt (si));
349 opt_result res
350 = vect_determine_vf_for_stmt (loop_vinfo,
351 stmt_info, &vectorization_factor);
352 if (!res)
353 return res;
357 /* TODO: Analyze cost. Decide if worth while to vectorize. */
358 if (dump_enabled_p ())
360 dump_printf_loc (MSG_NOTE, vect_location, "vectorization factor = ");
361 dump_dec (MSG_NOTE, vectorization_factor);
362 dump_printf (MSG_NOTE, "\n");
365 if (known_le (vectorization_factor, 1U))
366 return opt_result::failure_at (vect_location,
367 "not vectorized: unsupported data-type\n");
368 LOOP_VINFO_VECT_FACTOR (loop_vinfo) = vectorization_factor;
369 return opt_result::success ();
373 /* Function vect_is_simple_iv_evolution.
375 FORNOW: A simple evolution of an induction variable in the loop is
376 considered a polynomial evolution. */
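/* Roughly, in the usual chrec notation {init, +, step}_loopnum, and
   assuming the simple nest

     for (i = 0; i < n; i++)
       s = s + i;

   i has the evolution {0, +, 1}_1, which is "simple" (init 0, step 1),
   while s has the second-degree evolution {s_0, +, {0, +, 1}_1}_1, whose
   evolution part is itself a chrec and is therefore rejected below. */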
378 static bool
379 vect_is_simple_iv_evolution (unsigned loop_nb, tree access_fn, tree * init,
380 tree * step)
382 tree init_expr;
383 tree step_expr;
384 tree evolution_part = evolution_part_in_loop_num (access_fn, loop_nb);
385 basic_block bb;
387 /* When there is no evolution in this loop, the evolution function
388 is not "simple". */
389 if (evolution_part == NULL_TREE)
390 return false;
392 /* When the evolution is a polynomial of degree >= 2
393 the evolution function is not "simple". */
394 if (tree_is_chrec (evolution_part))
395 return false;
397 step_expr = evolution_part;
398 init_expr = unshare_expr (initial_condition_in_loop_num (access_fn, loop_nb));
400 if (dump_enabled_p ())
401 dump_printf_loc (MSG_NOTE, vect_location, "step: %T, init: %T\n",
402 step_expr, init_expr);
404 *init = init_expr;
405 *step = step_expr;
407 if (TREE_CODE (step_expr) != INTEGER_CST
408 && (TREE_CODE (step_expr) != SSA_NAME
409 || ((bb = gimple_bb (SSA_NAME_DEF_STMT (step_expr)))
410 && flow_bb_inside_loop_p (get_loop (cfun, loop_nb), bb))
411 || (!INTEGRAL_TYPE_P (TREE_TYPE (step_expr))
412 && (!SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr))
413 || !flag_associative_math)))
414 && (TREE_CODE (step_expr) != REAL_CST
415 || !flag_associative_math))
417 if (dump_enabled_p ())
418 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
419 "step unknown.\n");
420 return false;
423 return true;
426 /* Return true if PHI, described by STMT_INFO, is the inner PHI in
427 what we are assuming is a double reduction. For example, given
428 a structure like this:
430 outer1:
431 x_1 = PHI <x_4(outer2), ...>;
434 inner:
435 x_2 = PHI <x_1(outer1), ...>;
437 x_3 = ...;
440 outer2:
441 x_4 = PHI <x_3(inner)>;
444 outer loop analysis would treat x_1 as a double reduction phi and
445 this function would then return true for x_2. */
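/* One source form that typically produces this PHI structure, assuming
   sum is only read again after the whole nest (a classic double
   reduction):

     sum = 0;
     for (i = 0; i < n; i++)          outer loop: x_1 / x_4
       for (j = 0; j < m; j++)        inner loop: x_2 / x_3
         sum = sum + a[i][j];
*/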
447 static bool
448 vect_inner_phi_in_double_reduction_p (loop_vec_info loop_vinfo, gphi *phi)
450 use_operand_p use_p;
451 ssa_op_iter op_iter;
452 FOR_EACH_PHI_ARG (use_p, phi, op_iter, SSA_OP_USE)
453 if (stmt_vec_info def_info = loop_vinfo->lookup_def (USE_FROM_PTR (use_p)))
454 if (STMT_VINFO_DEF_TYPE (def_info) == vect_double_reduction_def)
455 return true;
456 return false;
459 /* Function vect_analyze_scalar_cycles_1.
461 Examine the cross iteration def-use cycles of scalar variables
462 in LOOP. LOOP_VINFO represents the loop that is now being
463 considered for vectorization (can be LOOP, or an outer-loop
464 enclosing LOOP). */
466 static void
467 vect_analyze_scalar_cycles_1 (loop_vec_info loop_vinfo, class loop *loop)
469 basic_block bb = loop->header;
470 tree init, step;
471 auto_vec<stmt_vec_info, 64> worklist;
472 gphi_iterator gsi;
473 bool double_reduc, reduc_chain;
475 DUMP_VECT_SCOPE ("vect_analyze_scalar_cycles");
477 /* First - identify all inductions. Reduction detection assumes that all the
478 inductions have been identified, therefore, this order must not be
479 changed. */
480 for (gsi = gsi_start_phis (bb); !gsi_end_p (gsi); gsi_next (&gsi))
482 gphi *phi = gsi.phi ();
483 tree access_fn = NULL;
484 tree def = PHI_RESULT (phi);
485 stmt_vec_info stmt_vinfo = loop_vinfo->lookup_stmt (phi);
487 if (dump_enabled_p ())
488 dump_printf_loc (MSG_NOTE, vect_location, "Analyze phi: %G", phi);
490 /* Skip virtual phi's. The data dependences that are associated with
491 virtual defs/uses (i.e., memory accesses) are analyzed elsewhere. */
492 if (virtual_operand_p (def))
493 continue;
495 STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_unknown_def_type;
497 /* Analyze the evolution function. */
498 access_fn = analyze_scalar_evolution (loop, def);
499 if (access_fn)
501 STRIP_NOPS (access_fn);
502 if (dump_enabled_p ())
503 dump_printf_loc (MSG_NOTE, vect_location,
504 "Access function of PHI: %T\n", access_fn);
505 STMT_VINFO_LOOP_PHI_EVOLUTION_BASE_UNCHANGED (stmt_vinfo)
506 = initial_condition_in_loop_num (access_fn, loop->num);
507 STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_vinfo)
508 = evolution_part_in_loop_num (access_fn, loop->num);
511 if (!access_fn
512 || vect_inner_phi_in_double_reduction_p (loop_vinfo, phi)
513 || !vect_is_simple_iv_evolution (loop->num, access_fn, &init, &step)
514 || (LOOP_VINFO_LOOP (loop_vinfo) != loop
515 && TREE_CODE (step) != INTEGER_CST))
517 worklist.safe_push (stmt_vinfo);
518 continue;
521 gcc_assert (STMT_VINFO_LOOP_PHI_EVOLUTION_BASE_UNCHANGED (stmt_vinfo)
522 != NULL_TREE);
523 gcc_assert (STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_vinfo) != NULL_TREE);
525 if (dump_enabled_p ())
526 dump_printf_loc (MSG_NOTE, vect_location, "Detected induction.\n");
527 STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_induction_def;
531 /* Second - identify all reductions and nested cycles. */
532 while (worklist.length () > 0)
534 stmt_vec_info stmt_vinfo = worklist.pop ();
535 gphi *phi = as_a <gphi *> (stmt_vinfo->stmt);
536 tree def = PHI_RESULT (phi);
538 if (dump_enabled_p ())
539 dump_printf_loc (MSG_NOTE, vect_location, "Analyze phi: %G", phi);
541 gcc_assert (!virtual_operand_p (def)
542 && STMT_VINFO_DEF_TYPE (stmt_vinfo) == vect_unknown_def_type);
544 stmt_vec_info reduc_stmt_info
545 = vect_is_simple_reduction (loop_vinfo, stmt_vinfo, &double_reduc,
546 &reduc_chain);
547 if (reduc_stmt_info)
549 STMT_VINFO_REDUC_DEF (stmt_vinfo) = reduc_stmt_info;
550 STMT_VINFO_REDUC_DEF (reduc_stmt_info) = stmt_vinfo;
551 if (double_reduc)
553 if (dump_enabled_p ())
554 dump_printf_loc (MSG_NOTE, vect_location,
555 "Detected double reduction.\n");
557 STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_double_reduction_def;
558 STMT_VINFO_DEF_TYPE (reduc_stmt_info) = vect_double_reduction_def;
560 else
562 if (loop != LOOP_VINFO_LOOP (loop_vinfo))
564 if (dump_enabled_p ())
565 dump_printf_loc (MSG_NOTE, vect_location,
566 "Detected vectorizable nested cycle.\n");
568 STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_nested_cycle;
570 else
572 if (dump_enabled_p ())
573 dump_printf_loc (MSG_NOTE, vect_location,
574 "Detected reduction.\n");
576 STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_reduction_def;
577 STMT_VINFO_DEF_TYPE (reduc_stmt_info) = vect_reduction_def;
578 /* Store the reduction cycles for possible vectorization in
579 loop-aware SLP if it was not detected as reduction
580 chain. */
581 if (! reduc_chain)
582 LOOP_VINFO_REDUCTIONS (loop_vinfo).safe_push
583 (reduc_stmt_info);
587 else
588 if (dump_enabled_p ())
589 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
590 "Unknown def-use cycle pattern.\n");
595 /* Function vect_analyze_scalar_cycles.
597 Examine the cross iteration def-use cycles of scalar variables, by
598 analyzing the loop-header PHIs of scalar variables. Classify each
599 cycle as one of the following: invariant, induction, reduction, unknown.
600 We do that for the loop represented by LOOP_VINFO, and also for its
601 inner-loop, if it exists.
602 Examples for scalar cycles:
604 Example1: reduction:
606 loop1:
607 for (i=0; i<N; i++)
608 sum += a[i];
610 Example2: induction:
612 loop2:
613 for (i=0; i<N; i++)
614 a[i] = i; */
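/* A further illustrative sketch, assuming the outer loop is the one being
   vectorized: an inner-loop cycle that is private to each outer iteration,

     for (i = 0; i < n; i++)
       {
         x = a[i];
         for (j = 0; j < m; j++)
           x = x + b[j];
         c[i] = x;
       }

   is classified by vect_analyze_scalar_cycles_1 above as a nested cycle
   (vect_nested_cycle) rather than as a reduction of the loop being
   vectorized. */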
616 static void
617 vect_analyze_scalar_cycles (loop_vec_info loop_vinfo)
619 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
621 vect_analyze_scalar_cycles_1 (loop_vinfo, loop);
623 /* When vectorizing an outer-loop, the inner-loop is executed sequentially.
624 Reductions in such inner-loop therefore have different properties than
625 the reductions in the nest that gets vectorized:
626 1. When vectorized, they are executed in the same order as in the original
627 scalar loop, so we can't change the order of computation when
628 vectorizing them.
629 2. FIXME: Inner-loop reductions can be used in the inner-loop, so the
630 current checks are too strict. */
632 if (loop->inner)
633 vect_analyze_scalar_cycles_1 (loop_vinfo, loop->inner);
636 /* Transfer group and reduction information from STMT_INFO to its
637 pattern stmt. */
639 static void
640 vect_fixup_reduc_chain (stmt_vec_info stmt_info)
642 stmt_vec_info firstp = STMT_VINFO_RELATED_STMT (stmt_info);
643 stmt_vec_info stmtp;
644 gcc_assert (!REDUC_GROUP_FIRST_ELEMENT (firstp)
645 && REDUC_GROUP_FIRST_ELEMENT (stmt_info));
646 REDUC_GROUP_SIZE (firstp) = REDUC_GROUP_SIZE (stmt_info);
649 stmtp = STMT_VINFO_RELATED_STMT (stmt_info);
650 gcc_checking_assert (STMT_VINFO_DEF_TYPE (stmtp)
651 == STMT_VINFO_DEF_TYPE (stmt_info));
652 REDUC_GROUP_FIRST_ELEMENT (stmtp) = firstp;
653 stmt_info = REDUC_GROUP_NEXT_ELEMENT (stmt_info);
654 if (stmt_info)
655 REDUC_GROUP_NEXT_ELEMENT (stmtp)
656 = STMT_VINFO_RELATED_STMT (stmt_info);
658 while (stmt_info);
661 /* Fixup scalar cycles that now have their stmts detected as patterns. */
663 static void
664 vect_fixup_scalar_cycles_with_patterns (loop_vec_info loop_vinfo)
666 stmt_vec_info first;
667 unsigned i;
669 FOR_EACH_VEC_ELT (LOOP_VINFO_REDUCTION_CHAINS (loop_vinfo), i, first)
671 stmt_vec_info next = REDUC_GROUP_NEXT_ELEMENT (first);
672 while (next)
674 if ((STMT_VINFO_IN_PATTERN_P (next)
675 != STMT_VINFO_IN_PATTERN_P (first))
676 || STMT_VINFO_REDUC_IDX (vect_stmt_to_vectorize (next)) == -1)
677 break;
678 next = REDUC_GROUP_NEXT_ELEMENT (next);
680 /* If all reduction chain members are well-formed patterns adjust
681 the group to group the pattern stmts instead. */
682 if (! next
683 && STMT_VINFO_REDUC_IDX (vect_stmt_to_vectorize (first)) != -1)
685 if (STMT_VINFO_IN_PATTERN_P (first))
687 vect_fixup_reduc_chain (first);
688 LOOP_VINFO_REDUCTION_CHAINS (loop_vinfo)[i]
689 = STMT_VINFO_RELATED_STMT (first);
692 /* If not all stmt in the chain are patterns or if we failed
693 to update STMT_VINFO_REDUC_IDX dissolve the chain and handle
694 it as regular reduction instead. */
695 else
697 stmt_vec_info vinfo = first;
698 stmt_vec_info last = NULL;
699 while (vinfo)
701 next = REDUC_GROUP_NEXT_ELEMENT (vinfo);
702 REDUC_GROUP_FIRST_ELEMENT (vinfo) = NULL;
703 REDUC_GROUP_NEXT_ELEMENT (vinfo) = NULL;
704 last = vinfo;
705 vinfo = next;
707 STMT_VINFO_DEF_TYPE (vect_stmt_to_vectorize (first))
708 = vect_internal_def;
709 loop_vinfo->reductions.safe_push (vect_stmt_to_vectorize (last));
710 LOOP_VINFO_REDUCTION_CHAINS (loop_vinfo).unordered_remove (i);
711 --i;
716 /* Function vect_get_loop_niters.
718 Determine how many iterations the loop is executed and place it
719 in NUMBER_OF_ITERATIONS. Place the number of latch iterations
720 in NUMBER_OF_ITERATIONSM1. Place the condition under which the
721 niter information holds in ASSUMPTIONS.
723 Return the loop exit condition. */
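/* A small worked example, assuming the source loop

     for (i = 0; i < n; i++)
       ...

   with n known to be positive: the latch runs n - 1 times, so
   NUMBER_OF_ITERATIONSM1 is n - 1 and NUMBER_OF_ITERATIONS, the number of
   header executions, is n. */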
726 static gcond *
727 vect_get_loop_niters (class loop *loop, tree *assumptions,
728 tree *number_of_iterations, tree *number_of_iterationsm1)
730 edge exit = single_exit (loop);
731 class tree_niter_desc niter_desc;
732 tree niter_assumptions, niter, may_be_zero;
733 gcond *cond = get_loop_exit_condition (loop);
735 *assumptions = boolean_true_node;
736 *number_of_iterationsm1 = chrec_dont_know;
737 *number_of_iterations = chrec_dont_know;
738 DUMP_VECT_SCOPE ("get_loop_niters");
740 if (!exit)
741 return cond;
743 may_be_zero = NULL_TREE;
744 if (!number_of_iterations_exit_assumptions (loop, exit, &niter_desc, NULL)
745 || chrec_contains_undetermined (niter_desc.niter))
746 return cond;
748 niter_assumptions = niter_desc.assumptions;
749 may_be_zero = niter_desc.may_be_zero;
750 niter = niter_desc.niter;
752 if (may_be_zero && integer_zerop (may_be_zero))
753 may_be_zero = NULL_TREE;
755 if (may_be_zero)
757 if (COMPARISON_CLASS_P (may_be_zero))
759 /* Try to combine may_be_zero with assumptions, this can simplify
760 computation of niter expression. */
761 if (niter_assumptions && !integer_nonzerop (niter_assumptions))
762 niter_assumptions = fold_build2 (TRUTH_AND_EXPR, boolean_type_node,
763 niter_assumptions,
764 fold_build1 (TRUTH_NOT_EXPR,
765 boolean_type_node,
766 may_be_zero));
767 else
768 niter = fold_build3 (COND_EXPR, TREE_TYPE (niter), may_be_zero,
769 build_int_cst (TREE_TYPE (niter), 0),
770 rewrite_to_non_trapping_overflow (niter));
772 may_be_zero = NULL_TREE;
774 else if (integer_nonzerop (may_be_zero))
776 *number_of_iterationsm1 = build_int_cst (TREE_TYPE (niter), 0);
777 *number_of_iterations = build_int_cst (TREE_TYPE (niter), 1);
778 return cond;
780 else
781 return cond;
784 *assumptions = niter_assumptions;
785 *number_of_iterationsm1 = niter;
787 /* We want the number of loop header executions which is the number
788 of latch executions plus one.
789 ??? For UINT_MAX latch executions this number overflows to zero
790 for loops like do { n++; } while (n != 0); */
791 if (niter && !chrec_contains_undetermined (niter))
792 niter = fold_build2 (PLUS_EXPR, TREE_TYPE (niter), unshare_expr (niter),
793 build_int_cst (TREE_TYPE (niter), 1));
794 *number_of_iterations = niter;
796 return cond;
799 /* Function bb_in_loop_p
801 Used as predicate for dfs order traversal of the loop bbs. */
803 static bool
804 bb_in_loop_p (const_basic_block bb, const void *data)
806 const class loop *const loop = (const class loop *)data;
807 if (flow_bb_inside_loop_p (loop, bb))
808 return true;
809 return false;
813 /* Create and initialize a new loop_vec_info struct for LOOP_IN, as well as
814 stmt_vec_info structs for all the stmts in LOOP_IN. */
816 _loop_vec_info::_loop_vec_info (class loop *loop_in, vec_info_shared *shared)
817 : vec_info (vec_info::loop, init_cost (loop_in, false), shared),
818 loop (loop_in),
819 bbs (XCNEWVEC (basic_block, loop->num_nodes)),
820 num_itersm1 (NULL_TREE),
821 num_iters (NULL_TREE),
822 num_iters_unchanged (NULL_TREE),
823 num_iters_assumptions (NULL_TREE),
824 th (0),
825 versioning_threshold (0),
826 vectorization_factor (0),
827 main_loop_edge (nullptr),
828 skip_main_loop_edge (nullptr),
829 skip_this_loop_edge (nullptr),
830 reusable_accumulators (),
831 max_vectorization_factor (0),
832 mask_skip_niters (NULL_TREE),
833 rgroup_compare_type (NULL_TREE),
834 simd_if_cond (NULL_TREE),
835 unaligned_dr (NULL),
836 peeling_for_alignment (0),
837 ptr_mask (0),
838 ivexpr_map (NULL),
839 scan_map (NULL),
840 slp_unrolling_factor (1),
841 single_scalar_iteration_cost (0),
842 vec_outside_cost (0),
843 vec_inside_cost (0),
844 inner_loop_cost_factor (param_vect_inner_loop_cost_factor),
845 vectorizable (false),
846 can_use_partial_vectors_p (param_vect_partial_vector_usage != 0),
847 using_partial_vectors_p (false),
848 epil_using_partial_vectors_p (false),
849 peeling_for_gaps (false),
850 peeling_for_niter (false),
851 no_data_dependencies (false),
852 has_mask_store (false),
853 scalar_loop_scaling (profile_probability::uninitialized ()),
854 scalar_loop (NULL),
855 orig_loop_info (NULL)
857 /* CHECKME: We want to visit all BBs before their successors (except for
858 latch blocks, for which this assertion wouldn't hold). In the simple
859 case of the loop forms we allow, a dfs order of the BBs would be the same
860 as reversed postorder traversal, so we are safe. */
862 unsigned int nbbs = dfs_enumerate_from (loop->header, 0, bb_in_loop_p,
863 bbs, loop->num_nodes, loop);
864 gcc_assert (nbbs == loop->num_nodes);
866 for (unsigned int i = 0; i < nbbs; i++)
868 basic_block bb = bbs[i];
869 gimple_stmt_iterator si;
871 for (si = gsi_start_phis (bb); !gsi_end_p (si); gsi_next (&si))
873 gimple *phi = gsi_stmt (si);
874 gimple_set_uid (phi, 0);
875 add_stmt (phi);
878 for (si = gsi_start_bb (bb); !gsi_end_p (si); gsi_next (&si))
880 gimple *stmt = gsi_stmt (si);
881 gimple_set_uid (stmt, 0);
882 if (is_gimple_debug (stmt))
883 continue;
884 add_stmt (stmt);
885 /* If .GOMP_SIMD_LANE call for the current loop has 3 arguments, the
886 third argument is the #pragma omp simd if (x) condition, when 0,
887 loop shouldn't be vectorized, when non-zero constant, it should
888 be vectorized normally, otherwise versioned with vectorized loop
889 done if the condition is non-zero at runtime. */
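	   /* For instance, for a source loop such as

		#pragma omp simd if (x)
		for (int i = 0; i < n; i++)
		  a[i] = b[i];

	      a form of the "if (x)" condition is what appears as that
	      third argument (an illustrative sketch, not an exhaustive
	      description of how the front ends lower the pragma).  */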
890 if (loop_in->simduid
891 && is_gimple_call (stmt)
892 && gimple_call_internal_p (stmt)
893 && gimple_call_internal_fn (stmt) == IFN_GOMP_SIMD_LANE
894 && gimple_call_num_args (stmt) >= 3
895 && TREE_CODE (gimple_call_arg (stmt, 0)) == SSA_NAME
896 && (loop_in->simduid
897 == SSA_NAME_VAR (gimple_call_arg (stmt, 0))))
899 tree arg = gimple_call_arg (stmt, 2);
900 if (integer_zerop (arg) || TREE_CODE (arg) == SSA_NAME)
901 simd_if_cond = arg;
902 else
903 gcc_assert (integer_nonzerop (arg));
908 epilogue_vinfos.create (6);
911 /* Free all levels of rgroup CONTROLS. */
913 void
914 release_vec_loop_controls (vec<rgroup_controls> *controls)
916 rgroup_controls *rgc;
917 unsigned int i;
918 FOR_EACH_VEC_ELT (*controls, i, rgc)
919 rgc->controls.release ();
920 controls->release ();
923 /* Free all memory used by the _loop_vec_info, as well as all the
924 stmt_vec_info structs of all the stmts in the loop. */
926 _loop_vec_info::~_loop_vec_info ()
928 free (bbs);
930 release_vec_loop_controls (&masks);
931 release_vec_loop_controls (&lens);
932 delete ivexpr_map;
933 delete scan_map;
934 epilogue_vinfos.release ();
936 /* When we release an epilogue vinfo that we do not intend to use
937 avoid clearing AUX of the main loop which should continue to
938 point to the main loop vinfo since otherwise we'll leak that. */
939 if (loop->aux == this)
940 loop->aux = NULL;
943 /* Return an invariant or register for EXPR and emit necessary
944 computations in the LOOP_VINFO loop preheader. */
946 tree
947 cse_and_gimplify_to_preheader (loop_vec_info loop_vinfo, tree expr)
949 if (is_gimple_reg (expr)
950 || is_gimple_min_invariant (expr))
951 return expr;
953 if (! loop_vinfo->ivexpr_map)
954 loop_vinfo->ivexpr_map = new hash_map<tree_operand_hash, tree>;
955 tree &cached = loop_vinfo->ivexpr_map->get_or_insert (expr);
956 if (! cached)
958 gimple_seq stmts = NULL;
959 cached = force_gimple_operand (unshare_expr (expr),
960 &stmts, true, NULL_TREE);
961 if (stmts)
963 edge e = loop_preheader_edge (LOOP_VINFO_LOOP (loop_vinfo));
964 gsi_insert_seq_on_edge_immediate (e, stmts);
967 return cached;
970 /* Return true if we can use CMP_TYPE as the comparison type to produce
971 all masks required to mask LOOP_VINFO. */
973 static bool
974 can_produce_all_loop_masks_p (loop_vec_info loop_vinfo, tree cmp_type)
976 rgroup_controls *rgm;
977 unsigned int i;
978 FOR_EACH_VEC_ELT (LOOP_VINFO_MASKS (loop_vinfo), i, rgm)
979 if (rgm->type != NULL_TREE
980 && !direct_internal_fn_supported_p (IFN_WHILE_ULT,
981 cmp_type, rgm->type,
982 OPTIMIZE_FOR_SPEED))
983 return false;
984 return true;
987 /* Calculate the maximum number of scalars per iteration for every
988 rgroup in LOOP_VINFO. */
990 static unsigned int
991 vect_get_max_nscalars_per_iter (loop_vec_info loop_vinfo)
993 unsigned int res = 1;
994 unsigned int i;
995 rgroup_controls *rgm;
996 FOR_EACH_VEC_ELT (LOOP_VINFO_MASKS (loop_vinfo), i, rgm)
997 res = MAX (res, rgm->max_nscalars_per_iter);
998 return res;
1001 /* Calculate the minimum precision necessary to represent:
1003 MAX_NITERS * FACTOR
1005 as an unsigned integer, where MAX_NITERS is the maximum number of
1006 loop header iterations for the original scalar form of LOOP_VINFO. */
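/* A small worked example, assuming MAX_NITERS is known to be at most 1000
   and FACTOR is 4: 1000 * 4 = 4000 and 2048 <= 4000 <= 4095, so 12 bits
   are needed and the function returns 12. A stand-alone sketch of the
   same computation for nonzero unsigned values (mirroring what
   wi::min_precision does in the return statement below):

     unsigned
     min_prec_for (unsigned long long max_niters, unsigned factor)
     {
       unsigned long long limit = max_niters * factor;
       unsigned prec = 0;
       while (limit)
         {
           prec++;
           limit >>= 1;
         }
       return prec;
     }
*/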
1008 static unsigned
1009 vect_min_prec_for_max_niters (loop_vec_info loop_vinfo, unsigned int factor)
1011 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1013 /* Get the maximum number of iterations that is representable
1014 in the counter type. */
1015 tree ni_type = TREE_TYPE (LOOP_VINFO_NITERSM1 (loop_vinfo));
1016 widest_int max_ni = wi::to_widest (TYPE_MAX_VALUE (ni_type)) + 1;
1018 /* Get a more refined estimate for the number of iterations. */
1019 widest_int max_back_edges;
1020 if (max_loop_iterations (loop, &max_back_edges))
1021 max_ni = wi::smin (max_ni, max_back_edges + 1);
1023 /* Work out how many bits we need to represent the limit. */
1024 return wi::min_precision (max_ni * factor, UNSIGNED);
1027 /* True if the loop needs peeling or partial vectors when vectorized. */
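/* For example, assuming VF = 4, a known iteration count of 10 and no
   peeling for alignment or gaps: 10 is not a multiple of 4, so two scalar
   iterations would have to be peeled (or the last vector iteration run
   with partially-populated vectors) and the function returns true; with
   an iteration count of 12 it returns false. */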
1029 static bool
1030 vect_need_peeling_or_partial_vectors_p (loop_vec_info loop_vinfo)
1032 unsigned HOST_WIDE_INT const_vf;
1033 HOST_WIDE_INT max_niter
1034 = likely_max_stmt_executions_int (LOOP_VINFO_LOOP (loop_vinfo));
1036 unsigned th = LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo);
1037 if (!th && LOOP_VINFO_ORIG_LOOP_INFO (loop_vinfo))
1038 th = LOOP_VINFO_COST_MODEL_THRESHOLD (LOOP_VINFO_ORIG_LOOP_INFO
1039 (loop_vinfo));
1041 if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
1042 && LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) >= 0)
1044 /* Work out the (constant) number of iterations that need to be
1045 peeled for reasons other than niters. */
1046 unsigned int peel_niter = LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo);
1047 if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo))
1048 peel_niter += 1;
1049 if (!multiple_p (LOOP_VINFO_INT_NITERS (loop_vinfo) - peel_niter,
1050 LOOP_VINFO_VECT_FACTOR (loop_vinfo)))
1051 return true;
1053 else if (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo)
1054 /* ??? When peeling for gaps but not alignment, we could
1055 try to check whether the (variable) niters is known to be
1056 VF * N + 1. That's something of a niche case though. */
1057 || LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo)
1058 || !LOOP_VINFO_VECT_FACTOR (loop_vinfo).is_constant (&const_vf)
1059 || ((tree_ctz (LOOP_VINFO_NITERS (loop_vinfo))
1060 < (unsigned) exact_log2 (const_vf))
1061 /* In case of versioning, check if the maximum number of
1062 iterations is greater than th. If they are identical,
1063 the epilogue is unnecessary. */
1064 && (!LOOP_REQUIRES_VERSIONING (loop_vinfo)
1065 || ((unsigned HOST_WIDE_INT) max_niter
1066 > (th / const_vf) * const_vf))))
1067 return true;
1069 return false;
1072 /* Each statement in LOOP_VINFO can be masked where necessary. Check
1073 whether we can actually generate the masks required. Return true if so,
1074 storing the type of the scalar IV in LOOP_VINFO_RGROUP_COMPARE_TYPE. */
1076 static bool
1077 vect_verify_full_masking (loop_vec_info loop_vinfo)
1079 unsigned int min_ni_width;
1080 unsigned int max_nscalars_per_iter
1081 = vect_get_max_nscalars_per_iter (loop_vinfo);
1083 /* Use a normal loop if there are no statements that need masking.
1084 This only happens in rare degenerate cases: it means that the loop
1085 has no loads, no stores, and no live-out values. */
1086 if (LOOP_VINFO_MASKS (loop_vinfo).is_empty ())
1087 return false;
1089 /* Work out how many bits we need to represent the limit. */
1090 min_ni_width
1091 = vect_min_prec_for_max_niters (loop_vinfo, max_nscalars_per_iter);
1093 /* Find a scalar mode for which WHILE_ULT is supported. */
1094 opt_scalar_int_mode cmp_mode_iter;
1095 tree cmp_type = NULL_TREE;
1096 tree iv_type = NULL_TREE;
1097 widest_int iv_limit = vect_iv_limit_for_partial_vectors (loop_vinfo);
1098 unsigned int iv_precision = UINT_MAX;
1100 if (iv_limit != -1)
1101 iv_precision = wi::min_precision (iv_limit * max_nscalars_per_iter,
1102 UNSIGNED);
1104 FOR_EACH_MODE_IN_CLASS (cmp_mode_iter, MODE_INT)
1106 unsigned int cmp_bits = GET_MODE_BITSIZE (cmp_mode_iter.require ());
1107 if (cmp_bits >= min_ni_width
1108 && targetm.scalar_mode_supported_p (cmp_mode_iter.require ()))
1110 tree this_type = build_nonstandard_integer_type (cmp_bits, true);
1111 if (this_type
1112 && can_produce_all_loop_masks_p (loop_vinfo, this_type))
1114 /* Although we could stop as soon as we find a valid mode,
1115 there are at least two reasons why that's not always the
1116 best choice:
1118 - An IV that's Pmode or wider is more likely to be reusable
1119 in address calculations than an IV that's narrower than
1120 Pmode.
1122 - Doing the comparison in IV_PRECISION or wider allows
1123 a natural 0-based IV, whereas using a narrower comparison
1124 type requires mitigations against wrap-around.
1126 Conversely, if the IV limit is variable, doing the comparison
1127 in a wider type than the original type can introduce
1128 unnecessary extensions, so picking the widest valid mode
1129 is not always a good choice either.
1131 Here we prefer the first IV type that's Pmode or wider,
1132 and the first comparison type that's IV_PRECISION or wider.
1133 (The comparison type must be no wider than the IV type,
1134 to avoid extensions in the vector loop.)
1136 ??? We might want to try continuing beyond Pmode for ILP32
1137 targets if CMP_BITS < IV_PRECISION. */
1138 iv_type = this_type;
1139 if (!cmp_type || iv_precision > TYPE_PRECISION (cmp_type))
1140 cmp_type = this_type;
1141 if (cmp_bits >= GET_MODE_BITSIZE (Pmode))
1142 break;
1147 if (!cmp_type)
1148 return false;
1150 LOOP_VINFO_RGROUP_COMPARE_TYPE (loop_vinfo) = cmp_type;
1151 LOOP_VINFO_RGROUP_IV_TYPE (loop_vinfo) = iv_type;
1152 return true;
1155 /* Check whether we can use vector access with length based on precision
1156 comparison. So far, to keep it simple, we only allow the case that the
1157 precision of the target supported length is larger than the precision
1158 required by loop niters. */
1160 static bool
1161 vect_verify_loop_lens (loop_vec_info loop_vinfo)
1163 if (LOOP_VINFO_LENS (loop_vinfo).is_empty ())
1164 return false;
1166 unsigned int max_nitems_per_iter = 1;
1167 unsigned int i;
1168 rgroup_controls *rgl;
1169 /* Find the maximum number of items per iteration for every rgroup. */
1170 FOR_EACH_VEC_ELT (LOOP_VINFO_LENS (loop_vinfo), i, rgl)
1172 unsigned nitems_per_iter = rgl->max_nscalars_per_iter * rgl->factor;
1173 max_nitems_per_iter = MAX (max_nitems_per_iter, nitems_per_iter);
1176 /* Work out how many bits we need to represent the length limit. */
1177 unsigned int min_ni_prec
1178 = vect_min_prec_for_max_niters (loop_vinfo, max_nitems_per_iter);
1180 /* Now use the maximum of the precisions below for one suitable IV type:
1181 - the IV's natural precision
1182 - the precision needed to hold: the maximum number of scalar
1183 iterations multiplied by the scale factor (min_ni_prec above)
1184 - the Pmode precision
1186 If min_ni_prec is less than the precision of the current niters,
1187 we prefer to still use the niters type. Prefer to use Pmode and
1188 wider IV to avoid narrow conversions. */
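  /* For example, assuming min_ni_prec is 12, the niters type is 32 bits
     wide and Pmode is 64 bits wide: the maximum of the three is 64, so a
     64-bit IV type is chosen below (provided such a scalar mode is
     supported and does not exceed BITS_PER_WORD).  */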
1190 unsigned int ni_prec
1191 = TYPE_PRECISION (TREE_TYPE (LOOP_VINFO_NITERS (loop_vinfo)));
1192 min_ni_prec = MAX (min_ni_prec, ni_prec);
1193 min_ni_prec = MAX (min_ni_prec, GET_MODE_BITSIZE (Pmode));
1195 tree iv_type = NULL_TREE;
1196 opt_scalar_int_mode tmode_iter;
1197 FOR_EACH_MODE_IN_CLASS (tmode_iter, MODE_INT)
1199 scalar_mode tmode = tmode_iter.require ();
1200 unsigned int tbits = GET_MODE_BITSIZE (tmode);
1202 /* ??? Do we really want to construct one IV whose precision exceeds
1203 BITS_PER_WORD? */
1204 if (tbits > BITS_PER_WORD)
1205 break;
1207 /* Find the first available standard integral type. */
1208 if (tbits >= min_ni_prec && targetm.scalar_mode_supported_p (tmode))
1210 iv_type = build_nonstandard_integer_type (tbits, true);
1211 break;
1215 if (!iv_type)
1217 if (dump_enabled_p ())
1218 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1219 "can't vectorize with length-based partial vectors"
1220 " because there is no suitable iv type.\n");
1221 return false;
1224 LOOP_VINFO_RGROUP_COMPARE_TYPE (loop_vinfo) = iv_type;
1225 LOOP_VINFO_RGROUP_IV_TYPE (loop_vinfo) = iv_type;
1227 return true;
1230 /* Calculate the cost of one scalar iteration of the loop. */
1231 static void
1232 vect_compute_single_scalar_iteration_cost (loop_vec_info loop_vinfo)
1234 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1235 basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
1236 int nbbs = loop->num_nodes, factor;
1237 int innerloop_iters, i;
1239 DUMP_VECT_SCOPE ("vect_compute_single_scalar_iteration_cost");
1241 /* Gather costs for statements in the scalar loop. */
1243 /* FORNOW. */
1244 innerloop_iters = 1;
1245 if (loop->inner)
1246 innerloop_iters = LOOP_VINFO_INNER_LOOP_COST_FACTOR (loop_vinfo);
1248 for (i = 0; i < nbbs; i++)
1250 gimple_stmt_iterator si;
1251 basic_block bb = bbs[i];
1253 if (bb->loop_father == loop->inner)
1254 factor = innerloop_iters;
1255 else
1256 factor = 1;
1258 for (si = gsi_start_bb (bb); !gsi_end_p (si); gsi_next (&si))
1260 gimple *stmt = gsi_stmt (si);
1261 stmt_vec_info stmt_info = loop_vinfo->lookup_stmt (stmt);
1263 if (!is_gimple_assign (stmt) && !is_gimple_call (stmt))
1264 continue;
1266 /* Skip stmts that are not vectorized inside the loop. */
1267 stmt_vec_info vstmt_info = vect_stmt_to_vectorize (stmt_info);
1268 if (!STMT_VINFO_RELEVANT_P (vstmt_info)
1269 && (!STMT_VINFO_LIVE_P (vstmt_info)
1270 || !VECTORIZABLE_CYCLE_DEF
1271 (STMT_VINFO_DEF_TYPE (vstmt_info))))
1272 continue;
1274 vect_cost_for_stmt kind;
1275 if (STMT_VINFO_DATA_REF (stmt_info))
1277 if (DR_IS_READ (STMT_VINFO_DATA_REF (stmt_info)))
1278 kind = scalar_load;
1279 else
1280 kind = scalar_store;
1282 else if (vect_nop_conversion_p (stmt_info))
1283 continue;
1284 else
1285 kind = scalar_stmt;
1287 record_stmt_cost (&LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo),
1288 factor, kind, stmt_info, 0, vect_prologue);
1292 /* Now accumulate cost. */
1293 void *target_cost_data = init_cost (loop, true);
1294 stmt_info_for_cost *si;
1295 int j;
1296 FOR_EACH_VEC_ELT (LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo),
1297 j, si)
1298 (void) add_stmt_cost (loop_vinfo, target_cost_data, si->count,
1299 si->kind, si->stmt_info, si->vectype,
1300 si->misalign, vect_body);
1301 unsigned dummy, body_cost = 0;
1302 finish_cost (target_cost_data, &dummy, &body_cost, &dummy);
1303 destroy_cost_data (target_cost_data);
1304 LOOP_VINFO_SINGLE_SCALAR_ITERATION_COST (loop_vinfo) = body_cost;
1308 /* Function vect_analyze_loop_form_1.
1310 Verify that certain CFG restrictions hold, including:
1311 - the loop has a pre-header
1312 - the loop has a single entry and exit
1313 - the loop exit condition is simple enough
1314 - the number of iterations can be analyzed, i.e., a countable loop. The
1315 niter could be analyzed under some assumptions. */
1317 opt_result
1318 vect_analyze_loop_form_1 (class loop *loop, gcond **loop_cond,
1319 tree *assumptions, tree *number_of_iterationsm1,
1320 tree *number_of_iterations, gcond **inner_loop_cond)
1322 DUMP_VECT_SCOPE ("vect_analyze_loop_form");
1324 /* Different restrictions apply when we are considering an inner-most loop,
1325 vs. an outer (nested) loop.
1326 (FORNOW. May want to relax some of these restrictions in the future). */
1328 if (!loop->inner)
1330 /* Inner-most loop. We currently require that the number of BBs is
1331 exactly 2 (the header and latch). Vectorizable inner-most loops
1332 look like this:
1334 (pre-header)
1336 header <--------+
1337 | | |
1338 | +--> latch --+
1340 (exit-bb) */
1342 if (loop->num_nodes != 2)
1343 return opt_result::failure_at (vect_location,
1344 "not vectorized:"
1345 " control flow in loop.\n");
1347 if (empty_block_p (loop->header))
1348 return opt_result::failure_at (vect_location,
1349 "not vectorized: empty loop.\n");
1351 else
1353 class loop *innerloop = loop->inner;
1354 edge entryedge;
1356 /* Nested loop. We currently require that the loop is doubly-nested,
1357 contains a single inner loop, and the number of BBs is exactly 5.
1358 Vectorizable outer-loops look like this:
1360 (pre-header)
1362 header <---+
1364 inner-loop |
1366 tail ------+
1368 (exit-bb)
1370 The inner-loop has the properties expected of inner-most loops
1371 as described above. */
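	 /* A source-level sketch of a nest with this shape, assuming a
	    plain rectangular loop nest:

	      for (i = 0; i < n; i++)        outer loop being analyzed
		for (j = 0; j < m; j++)      single inner-most loop
		  a[i][j] = b[i][j] + 1;
	 */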
1373 if ((loop->inner)->inner || (loop->inner)->next)
1374 return opt_result::failure_at (vect_location,
1375 "not vectorized:"
1376 " multiple nested loops.\n");
1378 if (loop->num_nodes != 5)
1379 return opt_result::failure_at (vect_location,
1380 "not vectorized:"
1381 " control flow in loop.\n");
1383 entryedge = loop_preheader_edge (innerloop);
1384 if (entryedge->src != loop->header
1385 || !single_exit (innerloop)
1386 || single_exit (innerloop)->dest != EDGE_PRED (loop->latch, 0)->src)
1387 return opt_result::failure_at (vect_location,
1388 "not vectorized:"
1389 " unsupported outerloop form.\n");
1391 /* Analyze the inner-loop. */
1392 tree inner_niterm1, inner_niter, inner_assumptions;
1393 opt_result res
1394 = vect_analyze_loop_form_1 (loop->inner, inner_loop_cond,
1395 &inner_assumptions, &inner_niterm1,
1396 &inner_niter, NULL);
1397 if (!res)
1399 if (dump_enabled_p ())
1400 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1401 "not vectorized: Bad inner loop.\n");
1402 return res;
1405 /* Don't support analyzing niter under assumptions for inner
1406 loop. */
1407 if (!integer_onep (inner_assumptions))
1408 return opt_result::failure_at (vect_location,
1409 "not vectorized: Bad inner loop.\n");
1411 if (!expr_invariant_in_loop_p (loop, inner_niter))
1412 return opt_result::failure_at (vect_location,
1413 "not vectorized: inner-loop count not"
1414 " invariant.\n");
1416 if (dump_enabled_p ())
1417 dump_printf_loc (MSG_NOTE, vect_location,
1418 "Considering outer-loop vectorization.\n");
1421 if (!single_exit (loop))
1422 return opt_result::failure_at (vect_location,
1423 "not vectorized: multiple exits.\n");
1424 if (EDGE_COUNT (loop->header->preds) != 2)
1425 return opt_result::failure_at (vect_location,
1426 "not vectorized:"
1427 " too many incoming edges.\n");
1429 /* We assume that the loop exit condition is at the end of the loop, i.e.,
1430 that the loop is represented as a do-while (with a proper if-guard
1431 before the loop if needed), where the loop header contains all the
1432 executable statements, and the latch is empty. */
1433 if (!empty_block_p (loop->latch)
1434 || !gimple_seq_empty_p (phi_nodes (loop->latch)))
1435 return opt_result::failure_at (vect_location,
1436 "not vectorized: latch block not empty.\n");
1438 /* Make sure the exit is not abnormal. */
1439 edge e = single_exit (loop);
1440 if (e->flags & EDGE_ABNORMAL)
1441 return opt_result::failure_at (vect_location,
1442 "not vectorized:"
1443 " abnormal loop exit edge.\n");
1445 *loop_cond = vect_get_loop_niters (loop, assumptions, number_of_iterations,
1446 number_of_iterationsm1);
1447 if (!*loop_cond)
1448 return opt_result::failure_at
1449 (vect_location,
1450 "not vectorized: complicated exit condition.\n");
1452 if (integer_zerop (*assumptions)
1453 || !*number_of_iterations
1454 || chrec_contains_undetermined (*number_of_iterations))
1455 return opt_result::failure_at
1456 (*loop_cond,
1457 "not vectorized: number of iterations cannot be computed.\n");
1459 if (integer_zerop (*number_of_iterations))
1460 return opt_result::failure_at
1461 (*loop_cond,
1462 "not vectorized: number of iterations = 0.\n");
1464 return opt_result::success ();
1467 /* Analyze LOOP form and return a loop_vec_info if it is of suitable form. */
1469 opt_loop_vec_info
1470 vect_analyze_loop_form (class loop *loop, vec_info_shared *shared)
1472 tree assumptions, number_of_iterations, number_of_iterationsm1;
1473 gcond *loop_cond, *inner_loop_cond = NULL;
1475 opt_result res
1476 = vect_analyze_loop_form_1 (loop, &loop_cond,
1477 &assumptions, &number_of_iterationsm1,
1478 &number_of_iterations, &inner_loop_cond);
1479 if (!res)
1480 return opt_loop_vec_info::propagate_failure (res);
1482 loop_vec_info loop_vinfo = new _loop_vec_info (loop, shared);
1483 LOOP_VINFO_NITERSM1 (loop_vinfo) = number_of_iterationsm1;
1484 LOOP_VINFO_NITERS (loop_vinfo) = number_of_iterations;
1485 LOOP_VINFO_NITERS_UNCHANGED (loop_vinfo) = number_of_iterations;
1486 if (!integer_onep (assumptions))
1488 /* We consider to vectorize this loop by versioning it under
1489 some assumptions. In order to do this, we need to clear
1490 existing information computed by scev and niter analyzer. */
1491 scev_reset_htab ();
1492 free_numbers_of_iterations_estimates (loop);
1493 /* Also set flag for this loop so that following scev and niter
1494 analysis are done under the assumptions. */
1495 loop_constraint_set (loop, LOOP_C_FINITE);
1496 /* Also record the assumptions for versioning. */
1497 LOOP_VINFO_NITERS_ASSUMPTIONS (loop_vinfo) = assumptions;
1500 if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
1502 if (dump_enabled_p ())
1504 dump_printf_loc (MSG_NOTE, vect_location,
1505 "Symbolic number of iterations is ");
1506 dump_generic_expr (MSG_NOTE, TDF_DETAILS, number_of_iterations);
1507 dump_printf (MSG_NOTE, "\n");
1511 stmt_vec_info loop_cond_info = loop_vinfo->lookup_stmt (loop_cond);
1512 STMT_VINFO_TYPE (loop_cond_info) = loop_exit_ctrl_vec_info_type;
1513 if (inner_loop_cond)
1515 stmt_vec_info inner_loop_cond_info
1516 = loop_vinfo->lookup_stmt (inner_loop_cond);
1517 STMT_VINFO_TYPE (inner_loop_cond_info) = loop_exit_ctrl_vec_info_type;
1520 gcc_assert (!loop->aux);
1521 loop->aux = loop_vinfo;
1522 return opt_loop_vec_info::success (loop_vinfo);
1527 /* Scan the loop stmts and dependent on whether there are any (non-)SLP
1528 statements update the vectorization factor. */
1530 static void
1531 vect_update_vf_for_slp (loop_vec_info loop_vinfo)
1533 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1534 basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
1535 int nbbs = loop->num_nodes;
1536 poly_uint64 vectorization_factor;
1537 int i;
1539 DUMP_VECT_SCOPE ("vect_update_vf_for_slp");
1541 vectorization_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
1542 gcc_assert (known_ne (vectorization_factor, 0U));
1544 /* If all the stmts in the loop can be SLPed, we perform only SLP, and the
1545 vectorization factor of the loop is the unrolling factor required by
1546 the SLP instances. If that unrolling factor is 1, we say that we
1547 perform pure SLP on the loop - cross-iteration parallelism is not
1548 exploited. */
1549 bool only_slp_in_loop = true;
1550 for (i = 0; i < nbbs; i++)
1552 basic_block bb = bbs[i];
1553 for (gphi_iterator si = gsi_start_phis (bb); !gsi_end_p (si);
1554 gsi_next (&si))
1556 stmt_vec_info stmt_info = loop_vinfo->lookup_stmt (si.phi ());
1557 if (!stmt_info)
1558 continue;
1559 if ((STMT_VINFO_RELEVANT_P (stmt_info)
1560 || VECTORIZABLE_CYCLE_DEF (STMT_VINFO_DEF_TYPE (stmt_info)))
1561 && !PURE_SLP_STMT (stmt_info))
1562 /* STMT needs both SLP and loop-based vectorization. */
1563 only_slp_in_loop = false;
1565 for (gimple_stmt_iterator si = gsi_start_bb (bb); !gsi_end_p (si);
1566 gsi_next (&si))
1568 if (is_gimple_debug (gsi_stmt (si)))
1569 continue;
1570 stmt_vec_info stmt_info = loop_vinfo->lookup_stmt (gsi_stmt (si));
1571 stmt_info = vect_stmt_to_vectorize (stmt_info);
1572 if ((STMT_VINFO_RELEVANT_P (stmt_info)
1573 || VECTORIZABLE_CYCLE_DEF (STMT_VINFO_DEF_TYPE (stmt_info)))
1574 && !PURE_SLP_STMT (stmt_info))
1575 /* STMT needs both SLP and loop-based vectorization. */
1576 only_slp_in_loop = false;
1580 if (only_slp_in_loop)
1582 if (dump_enabled_p ())
1583 dump_printf_loc (MSG_NOTE, vect_location,
1584 "Loop contains only SLP stmts\n");
1585 vectorization_factor = LOOP_VINFO_SLP_UNROLLING_FACTOR (loop_vinfo);
1587 else
1589 if (dump_enabled_p ())
1590 dump_printf_loc (MSG_NOTE, vect_location,
1591 "Loop contains SLP and non-SLP stmts\n");
1592 /* Both the vectorization factor and unroll factor have the form
1593 GET_MODE_SIZE (loop_vinfo->vector_mode) * X for some rational X,
1594 so they must have a common multiple. */
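	 /* E.g., assuming a current factor of 4 and an SLP unrolling factor
	    of 6, the updated vectorization factor computed below would be
	    their least common multiple, 12.  */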
1595 vectorization_factor
1596 = force_common_multiple (vectorization_factor,
1597 LOOP_VINFO_SLP_UNROLLING_FACTOR (loop_vinfo));
1600 LOOP_VINFO_VECT_FACTOR (loop_vinfo) = vectorization_factor;
1601 if (dump_enabled_p ())
1603 dump_printf_loc (MSG_NOTE, vect_location,
1604 "Updating vectorization factor to ");
1605 dump_dec (MSG_NOTE, vectorization_factor);
1606 dump_printf (MSG_NOTE, ".\n");
1610 /* Return true if STMT_INFO describes a double reduction phi and if
1611 the other phi in the reduction is also relevant for vectorization.
1612 This rejects cases such as:
1614 outer1:
1615 x_1 = PHI <x_3(outer2), ...>;
1618 inner:
1619 x_2 = ...;
1622 outer2:
1623 x_3 = PHI <x_2(inner)>;
1625 if nothing in x_2 or elsewhere makes x_1 relevant. */
1627 static bool
1628 vect_active_double_reduction_p (stmt_vec_info stmt_info)
1630 if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_double_reduction_def)
1631 return false;
1633 return STMT_VINFO_RELEVANT_P (STMT_VINFO_REDUC_DEF (stmt_info));
1636 /* Function vect_analyze_loop_operations.
1638 Scan the loop stmts and make sure they are all vectorizable. */
1640 static opt_result
1641 vect_analyze_loop_operations (loop_vec_info loop_vinfo)
1643 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1644 basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
1645 int nbbs = loop->num_nodes;
1646 int i;
1647 stmt_vec_info stmt_info;
1648 bool need_to_vectorize = false;
1649 bool ok;
1651 DUMP_VECT_SCOPE ("vect_analyze_loop_operations");
1653 auto_vec<stmt_info_for_cost> cost_vec;
1655 for (i = 0; i < nbbs; i++)
1657 basic_block bb = bbs[i];
1659 for (gphi_iterator si = gsi_start_phis (bb); !gsi_end_p (si);
1660 gsi_next (&si))
1662 gphi *phi = si.phi ();
1663 ok = true;
1665 stmt_info = loop_vinfo->lookup_stmt (phi);
1666 if (dump_enabled_p ())
1667 dump_printf_loc (MSG_NOTE, vect_location, "examining phi: %G", phi);
1668 if (virtual_operand_p (gimple_phi_result (phi)))
1669 continue;
1671 /* Inner-loop loop-closed exit phi in outer-loop vectorization
1672 (i.e., a phi in the tail of the outer-loop). */
1673 if (! is_loop_header_bb_p (bb))
1675 /* FORNOW: we currently don't support the case that these phis
1676 are not used in the outerloop (unless it is double reduction,
1677 i.e., this phi is vect_reduction_def), because this case
1678 requires us to actually do something here. */
1679 if (STMT_VINFO_LIVE_P (stmt_info)
1680 && !vect_active_double_reduction_p (stmt_info))
1681 return opt_result::failure_at (phi,
1682 "Unsupported loop-closed phi"
1683 " in outer-loop.\n");
1685 /* If PHI is used in the outer loop, we check that its operand
1686 is defined in the inner loop. */
1687 if (STMT_VINFO_RELEVANT_P (stmt_info))
1689 tree phi_op;
1691 if (gimple_phi_num_args (phi) != 1)
1692 return opt_result::failure_at (phi, "unsupported phi");
1694 phi_op = PHI_ARG_DEF (phi, 0);
1695 stmt_vec_info op_def_info = loop_vinfo->lookup_def (phi_op);
1696 if (!op_def_info)
1697 return opt_result::failure_at (phi, "unsupported phi\n");
1699 if (STMT_VINFO_RELEVANT (op_def_info) != vect_used_in_outer
1700 && (STMT_VINFO_RELEVANT (op_def_info)
1701 != vect_used_in_outer_by_reduction))
1702 return opt_result::failure_at (phi, "unsupported phi\n");
1704 if ((STMT_VINFO_DEF_TYPE (stmt_info) == vect_internal_def
1705 || (STMT_VINFO_DEF_TYPE (stmt_info)
1706 == vect_double_reduction_def))
1707 && !vectorizable_lc_phi (loop_vinfo,
1708 stmt_info, NULL, NULL))
1709 return opt_result::failure_at (phi, "unsupported phi\n");
1712 continue;
1715 gcc_assert (stmt_info);
1717 if ((STMT_VINFO_RELEVANT (stmt_info) == vect_used_in_scope
1718 || STMT_VINFO_LIVE_P (stmt_info))
1719 && STMT_VINFO_DEF_TYPE (stmt_info) != vect_induction_def)
1720 /* A scalar-dependence cycle that we don't support. */
1721 return opt_result::failure_at (phi,
1722 "not vectorized:"
1723 " scalar dependence cycle.\n");
1725 if (STMT_VINFO_RELEVANT_P (stmt_info))
1727 need_to_vectorize = true;
1728 if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_induction_def
1729 && ! PURE_SLP_STMT (stmt_info))
1730 ok = vectorizable_induction (loop_vinfo,
1731 stmt_info, NULL, NULL,
1732 &cost_vec);
1733 else if ((STMT_VINFO_DEF_TYPE (stmt_info) == vect_reduction_def
1734 || (STMT_VINFO_DEF_TYPE (stmt_info)
1735 == vect_double_reduction_def)
1736 || STMT_VINFO_DEF_TYPE (stmt_info) == vect_nested_cycle)
1737 && ! PURE_SLP_STMT (stmt_info))
1738 ok = vectorizable_reduction (loop_vinfo,
1739 stmt_info, NULL, NULL, &cost_vec);
1742 /* SLP PHIs are tested by vect_slp_analyze_node_operations. */
1743 if (ok
1744 && STMT_VINFO_LIVE_P (stmt_info)
1745 && !PURE_SLP_STMT (stmt_info))
1746 ok = vectorizable_live_operation (loop_vinfo,
1747 stmt_info, NULL, NULL, NULL,
1748 -1, false, &cost_vec);
1750 if (!ok)
1751 return opt_result::failure_at (phi,
1752 "not vectorized: relevant phi not "
1753 "supported: %G",
1754 static_cast <gimple *> (phi));
1757 for (gimple_stmt_iterator si = gsi_start_bb (bb); !gsi_end_p (si);
1758 gsi_next (&si))
1760 gimple *stmt = gsi_stmt (si);
1761 if (!gimple_clobber_p (stmt)
1762 && !is_gimple_debug (stmt))
1764 opt_result res
1765 = vect_analyze_stmt (loop_vinfo,
1766 loop_vinfo->lookup_stmt (stmt),
1767 &need_to_vectorize,
1768 NULL, NULL, &cost_vec);
1769 if (!res)
1770 return res;
1773 } /* bbs */
1775 add_stmt_costs (loop_vinfo, loop_vinfo->target_cost_data, &cost_vec);
1777 /* All operations in the loop are either irrelevant (they deal with loop
1778 control, or are dead), or only used outside the loop and can be moved
1779 out of the loop (e.g. invariants, inductions). The loop can be
1780 optimized away by scalar optimizations. We're better off not
1781 touching this loop. */
1782 if (!need_to_vectorize)
1784 if (dump_enabled_p ())
1785 dump_printf_loc (MSG_NOTE, vect_location,
1786 "All the computation can be taken out of the loop.\n");
1787 return opt_result::failure_at
1788 (vect_location,
1789 "not vectorized: redundant loop. no profit to vectorize.\n");
1792 return opt_result::success ();
1795 /* Return true if we know that the iteration count is smaller than the
1796 vectorization factor. Return false if it isn't, or if we can't be sure
1797 either way. */
1799 static bool
1800 vect_known_niters_smaller_than_vf (loop_vec_info loop_vinfo)
1802 unsigned int assumed_vf = vect_vf_for_cost (loop_vinfo);
1804 HOST_WIDE_INT max_niter;
1805 if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
1806 max_niter = LOOP_VINFO_INT_NITERS (loop_vinfo);
1807 else
1808 max_niter = max_stmt_executions_int (LOOP_VINFO_LOOP (loop_vinfo));
1810 if (max_niter != -1 && (unsigned HOST_WIDE_INT) max_niter < assumed_vf)
1811 return true;
1813 return false;
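/* Illustrative example (hypothetical loop, not taken from any testcase):
   with

     for (i = 0; i < 3; i++)
       a[i] = b[i] + c[i];

   and an assumed vectorization factor of 4, the known iteration count (3)
   is smaller than the VF, so this function returns true and callers can
   reject full-vector vectorization up front. */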
1816 /* Analyze the cost of the loop described by LOOP_VINFO. Decide if it
1817 is worthwhile to vectorize. Return 1 if definitely yes, 0 if
1818 definitely no, or -1 if it's worth retrying. */
1820 static int
1821 vect_analyze_loop_costing (loop_vec_info loop_vinfo)
1823 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1824 unsigned int assumed_vf = vect_vf_for_cost (loop_vinfo);
1826 /* Only loops that can handle partially-populated vectors can have iteration
1827 counts less than the vectorization factor. */
1828 if (!LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))
1830 if (vect_known_niters_smaller_than_vf (loop_vinfo))
1832 if (dump_enabled_p ())
1833 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1834 "not vectorized: iteration count smaller than "
1835 "vectorization factor.\n");
1836 return 0;
1840 /* If using the "very cheap" model, reject cases in which we'd keep
1841 a copy of the scalar code (even if we might be able to vectorize it). */
1842 if (flag_vect_cost_model == VECT_COST_MODEL_VERY_CHEAP
1843 && (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo)
1844 || LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo)
1845 || LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo)))
1847 if (dump_enabled_p ())
1848 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1849 "some scalar iterations would need to be peeled\n");
1850 return 0;
1853 int min_profitable_iters, min_profitable_estimate;
1854 vect_estimate_min_profitable_iters (loop_vinfo, &min_profitable_iters,
1855 &min_profitable_estimate);
1857 if (min_profitable_iters < 0)
1859 if (dump_enabled_p ())
1860 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1861 "not vectorized: vectorization not profitable.\n");
1862 if (dump_enabled_p ())
1863 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1864 "not vectorized: vector version will never be "
1865 "profitable.\n");
1866 return -1;
1869 int min_scalar_loop_bound = (param_min_vect_loop_bound
1870 * assumed_vf);
1872 /* Use the cost model only if it is more conservative than the
1873 user-specified threshold. */
1874 unsigned int th = (unsigned) MAX (min_scalar_loop_bound,
1875 min_profitable_iters);
1877 LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo) = th;
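/* For example (hypothetical numbers): with --param min-vect-loop-bound=2
   and an assumed VF of 4, min_scalar_loop_bound is 2 * 4 = 8; if the cost
   model computed min_profitable_iters as 10, TH becomes MAX (8, 10) = 10,
   and a loop known to run fewer than 10 iterations is rejected below. */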
1879 if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
1880 && LOOP_VINFO_INT_NITERS (loop_vinfo) < th)
1882 if (dump_enabled_p ())
1883 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1884 "not vectorized: vectorization not profitable.\n");
1885 if (dump_enabled_p ())
1886 dump_printf_loc (MSG_NOTE, vect_location,
1887 "not vectorized: iteration count smaller than user "
1888 "specified loop bound parameter or minimum profitable "
1889 "iterations (whichever is more conservative).\n");
1890 return 0;
1893 /* The static profitability threshold min_profitable_estimate includes
1894 the cost of having to check at runtime whether the scalar loop
1895 should be used instead. If it turns out that we don't need or want
1896 such a check, the threshold we should use for the static estimate
1897 is simply the point at which the vector loop becomes more profitable
1898 than the scalar loop. */
1899 if (min_profitable_estimate > min_profitable_iters
1900 && !LOOP_REQUIRES_VERSIONING (loop_vinfo)
1901 && !LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo)
1902 && !LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo)
1903 && !vect_apply_runtime_profitability_check_p (loop_vinfo))
1905 if (dump_enabled_p ())
1906 dump_printf_loc (MSG_NOTE, vect_location, "no need for a runtime"
1907 " choice between the scalar and vector loops\n");
1908 min_profitable_estimate = min_profitable_iters;
1911 /* If the vector loop needs multiple iterations to be beneficial then
1912 things are probably too close to call, and the conservative thing
1913 would be to stick with the scalar code. */
1914 if (flag_vect_cost_model == VECT_COST_MODEL_VERY_CHEAP
1915 && min_profitable_estimate > (int) vect_vf_for_cost (loop_vinfo))
1917 if (dump_enabled_p ())
1918 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1919 "one iteration of the vector loop would be"
1920 " more expensive than the equivalent number of"
1921 " iterations of the scalar loop\n");
1922 return 0;
1925 HOST_WIDE_INT estimated_niter;
1927 /* If we are vectorizing an epilogue then we know the maximum number of
1928 scalar iterations it will cover is at least one lower than the
1929 vectorization factor of the main loop. */
1930 if (LOOP_VINFO_EPILOGUE_P (loop_vinfo))
1931 estimated_niter
1932 = vect_vf_for_cost (LOOP_VINFO_ORIG_LOOP_INFO (loop_vinfo)) - 1;
1933 else
1935 estimated_niter = estimated_stmt_executions_int (loop);
1936 if (estimated_niter == -1)
1937 estimated_niter = likely_max_stmt_executions_int (loop);
1939 if (estimated_niter != -1
1940 && ((unsigned HOST_WIDE_INT) estimated_niter
1941 < MAX (th, (unsigned) min_profitable_estimate)))
1943 if (dump_enabled_p ())
1944 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1945 "not vectorized: estimated iteration count too "
1946 "small.\n");
1947 if (dump_enabled_p ())
1948 dump_printf_loc (MSG_NOTE, vect_location,
1949 "not vectorized: estimated iteration count smaller "
1950 "than specified loop bound parameter or minimum "
1951 "profitable iterations (whichever is more "
1952 "conservative).\n");
1953 return -1;
1956 return 1;
1959 static opt_result
1960 vect_get_datarefs_in_loop (loop_p loop, basic_block *bbs,
1961 vec<data_reference_p> *datarefs,
1962 unsigned int *n_stmts)
1964 *n_stmts = 0;
1965 for (unsigned i = 0; i < loop->num_nodes; i++)
1966 for (gimple_stmt_iterator gsi = gsi_start_bb (bbs[i]);
1967 !gsi_end_p (gsi); gsi_next (&gsi))
1969 gimple *stmt = gsi_stmt (gsi);
1970 if (is_gimple_debug (stmt))
1971 continue;
1972 ++(*n_stmts);
1973 opt_result res = vect_find_stmt_data_reference (loop, stmt, datarefs,
1974 NULL, 0);
1975 if (!res)
1977 if (is_gimple_call (stmt) && loop->safelen)
1979 tree fndecl = gimple_call_fndecl (stmt), op;
1980 if (fndecl != NULL_TREE)
1982 cgraph_node *node = cgraph_node::get (fndecl);
1983 if (node != NULL && node->simd_clones != NULL)
1985 unsigned int j, n = gimple_call_num_args (stmt);
1986 for (j = 0; j < n; j++)
1988 op = gimple_call_arg (stmt, j);
1989 if (DECL_P (op)
1990 || (REFERENCE_CLASS_P (op)
1991 && get_base_address (op)))
1992 break;
1994 op = gimple_call_lhs (stmt);
1995 /* Ignore #pragma omp declare simd functions
1996 if they don't have data references in the
1997 call stmt itself. */
1998 if (j == n
1999 && !(op
2000 && (DECL_P (op)
2001 || (REFERENCE_CLASS_P (op)
2002 && get_base_address (op)))))
2003 continue;
2007 return res;
2009 /* If dependence analysis will give up due to the limit on the
2010 number of datarefs, stop here and fail fatally. */
2011 if (datarefs->length ()
2012 > (unsigned)param_loop_max_datarefs_for_datadeps)
2013 return opt_result::failure_at (stmt, "exceeded param "
2014 "loop-max-datarefs-for-datadeps\n");
2016 return opt_result::success ();
2019 /* Look for SLP-only access groups and turn each individual access into its own
2020 group. */
2021 static void
2022 vect_dissolve_slp_only_groups (loop_vec_info loop_vinfo)
2024 unsigned int i;
2025 struct data_reference *dr;
2027 DUMP_VECT_SCOPE ("vect_dissolve_slp_only_groups");
2029 vec<data_reference_p> datarefs = LOOP_VINFO_DATAREFS (loop_vinfo);
2030 FOR_EACH_VEC_ELT (datarefs, i, dr)
2032 gcc_assert (DR_REF (dr));
2033 stmt_vec_info stmt_info = loop_vinfo->lookup_stmt (DR_STMT (dr));
2035 /* Check if the load is part of an interleaving chain. */
2036 if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
2038 stmt_vec_info first_element = DR_GROUP_FIRST_ELEMENT (stmt_info);
2039 unsigned int group_size = DR_GROUP_SIZE (first_element);
2041 /* Check if this is an SLP-only group. */
2042 if (!STMT_SLP_TYPE (stmt_info)
2043 && STMT_VINFO_SLP_VECT_ONLY (first_element))
2045 /* Dissolve the group. */
2046 STMT_VINFO_SLP_VECT_ONLY (first_element) = false;
2048 stmt_vec_info vinfo = first_element;
2049 while (vinfo)
2051 stmt_vec_info next = DR_GROUP_NEXT_ELEMENT (vinfo);
2052 DR_GROUP_FIRST_ELEMENT (vinfo) = vinfo;
2053 DR_GROUP_NEXT_ELEMENT (vinfo) = NULL;
2054 DR_GROUP_SIZE (vinfo) = 1;
2055 if (STMT_VINFO_STRIDED_P (first_element))
2056 DR_GROUP_GAP (vinfo) = 0;
2057 else
2058 DR_GROUP_GAP (vinfo) = group_size - 1;
2059 vinfo = next;
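/* For example (hypothetical access group): a store group of size 4 that
   was usable only with SLP is dissolved above into four single-element
   groups; each keeps the original stride by recording a gap of
   group_size - 1 = 3 elements (or 0 for strided accesses), so non-SLP
   vectorization can still handle the individual accesses. */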
2066 /* Determine if operating on full vectors for LOOP_VINFO might leave
2067 some scalar iterations still to do. If so, decide how we should
2068 handle those scalar iterations. The possibilities are:
2070 (1) Make LOOP_VINFO operate on partial vectors instead of full vectors.
2071 In this case:
2073 LOOP_VINFO_USING_PARTIAL_VECTORS_P == true
2074 LOOP_VINFO_EPIL_USING_PARTIAL_VECTORS_P == false
2075 LOOP_VINFO_PEELING_FOR_NITER == false
2077 (2) Make LOOP_VINFO operate on full vectors and use an epilogue loop
2078 to handle the remaining scalar iterations. In this case:
2080 LOOP_VINFO_USING_PARTIAL_VECTORS_P == false
2081 LOOP_VINFO_PEELING_FOR_NITER == true
2083 There are two choices:
2085 (2a) Consider vectorizing the epilogue loop at the same VF as the
2086 main loop, but using partial vectors instead of full vectors.
2087 In this case:
2089 LOOP_VINFO_EPIL_USING_PARTIAL_VECTORS_P == true
2091 (2b) Consider vectorizing the epilogue loop at lower VFs only.
2092 In this case:
2094 LOOP_VINFO_EPIL_USING_PARTIAL_VECTORS_P == false
2096 When FOR_EPILOGUE_P is true, make this determination based on the
2097 assumption that LOOP_VINFO is an epilogue loop, otherwise make it
2098 based on the assumption that LOOP_VINFO is the main loop. The caller
2099 has made sure that the number of iterations is set appropriately for
2100 this value of FOR_EPILOGUE_P. */
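/* For example (hypothetical counts): with 100 scalar iterations and a VF
   of 16, choice (1) executes ceil (100/16) = 7 partially-populated vector
   iterations and needs no scalar epilogue, while choice (2) executes
   100/16 = 6 full-vector iterations and leaves 100 % 16 = 4 scalar
   iterations for an epilogue loop (handled by 2a or 2b). */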
2102 opt_result
2103 vect_determine_partial_vectors_and_peeling (loop_vec_info loop_vinfo,
2104 bool for_epilogue_p)
2106 /* Determine whether there would be any scalar iterations left over. */
2107 bool need_peeling_or_partial_vectors_p
2108 = vect_need_peeling_or_partial_vectors_p (loop_vinfo);
2110 /* Decide whether to vectorize the loop with partial vectors. */
2111 LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo) = false;
2112 LOOP_VINFO_EPIL_USING_PARTIAL_VECTORS_P (loop_vinfo) = false;
2113 if (LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo)
2114 && need_peeling_or_partial_vectors_p)
2116 /* For partial-vector-usage=1, try to push the handling of partial
2117 vectors to the epilogue, with the main loop continuing to operate
2118 on full vectors.
2120 ??? We could then end up failing to use partial vectors if we
2121 decide to peel iterations into a prologue, and if the main loop
2122 then ends up processing fewer than VF iterations. */
2123 if (param_vect_partial_vector_usage == 1
2124 && !LOOP_VINFO_EPILOGUE_P (loop_vinfo)
2125 && !vect_known_niters_smaller_than_vf (loop_vinfo))
2126 LOOP_VINFO_EPIL_USING_PARTIAL_VECTORS_P (loop_vinfo) = true;
2127 else
2128 LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo) = true;
2131 if (dump_enabled_p ())
2133 if (LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))
2134 dump_printf_loc (MSG_NOTE, vect_location,
2135 "operating on partial vectors%s.\n",
2136 for_epilogue_p ? " for epilogue loop" : "");
2137 else
2138 dump_printf_loc (MSG_NOTE, vect_location,
2139 "operating only on full vectors%s.\n",
2140 for_epilogue_p ? " for epilogue loop" : "");
2143 if (for_epilogue_p)
2145 loop_vec_info orig_loop_vinfo = LOOP_VINFO_ORIG_LOOP_INFO (loop_vinfo);
2146 gcc_assert (orig_loop_vinfo);
2147 if (!LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))
2148 gcc_assert (known_lt (LOOP_VINFO_VECT_FACTOR (loop_vinfo),
2149 LOOP_VINFO_VECT_FACTOR (orig_loop_vinfo)));
2152 if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
2153 && !LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))
2155 /* Check that the loop processes at least one full vector. */
2156 poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
2157 tree scalar_niters = LOOP_VINFO_NITERS (loop_vinfo);
2158 if (known_lt (wi::to_widest (scalar_niters), vf))
2159 return opt_result::failure_at (vect_location,
2160 "loop does not have enough iterations"
2161 " to support vectorization.\n");
2163 /* If we need to peel an extra epilogue iteration to handle data
2164 accesses with gaps, check that there are enough scalar iterations
2165 available.
2167 The check above is redundant with this one when peeling for gaps,
2168 but the distinction is useful for diagnostics. */
2169 tree scalar_nitersm1 = LOOP_VINFO_NITERSM1 (loop_vinfo);
2170 if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo)
2171 && known_lt (wi::to_widest (scalar_nitersm1), vf))
2172 return opt_result::failure_at (vect_location,
2173 "loop does not have enough iterations"
2174 " to support peeling for gaps.\n");
2177 LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo)
2178 = (!LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo)
2179 && need_peeling_or_partial_vectors_p);
2181 return opt_result::success ();
2184 /* Function vect_analyze_loop_2.
2186 Apply a set of analyses on LOOP, and create a loop_vec_info struct
2187 for it. The different analyses will record information in the
2188 loop_vec_info struct. */
2189 static opt_result
2190 vect_analyze_loop_2 (loop_vec_info loop_vinfo, bool &fatal, unsigned *n_stmts)
2192 opt_result ok = opt_result::success ();
2193 int res;
2194 unsigned int max_vf = MAX_VECTORIZATION_FACTOR;
2195 poly_uint64 min_vf = 2;
2196 loop_vec_info orig_loop_vinfo = NULL;
2198 /* If we are dealing with an epilogue then orig_loop_vinfo points to the
2199 loop_vec_info of the first vectorized loop. */
2200 if (LOOP_VINFO_EPILOGUE_P (loop_vinfo))
2201 orig_loop_vinfo = LOOP_VINFO_ORIG_LOOP_INFO (loop_vinfo);
2202 else
2203 orig_loop_vinfo = loop_vinfo;
2204 gcc_assert (orig_loop_vinfo);
2206 /* The first group of checks is independent of the vector size. */
2207 fatal = true;
2209 if (LOOP_VINFO_SIMD_IF_COND (loop_vinfo)
2210 && integer_zerop (LOOP_VINFO_SIMD_IF_COND (loop_vinfo)))
2211 return opt_result::failure_at (vect_location,
2212 "not vectorized: simd if(0)\n");
2214 /* Find all data references in the loop (which correspond to vdefs/vuses)
2215 and analyze their evolution in the loop. */
2217 loop_p loop = LOOP_VINFO_LOOP (loop_vinfo);
2219 /* Gather the data references and count stmts in the loop. */
2220 if (!LOOP_VINFO_DATAREFS (loop_vinfo).exists ())
2222 opt_result res
2223 = vect_get_datarefs_in_loop (loop, LOOP_VINFO_BBS (loop_vinfo),
2224 &LOOP_VINFO_DATAREFS (loop_vinfo),
2225 n_stmts);
2226 if (!res)
2228 if (dump_enabled_p ())
2229 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2230 "not vectorized: loop contains function "
2231 "calls or data references that cannot "
2232 "be analyzed\n");
2233 return res;
2235 loop_vinfo->shared->save_datarefs ();
2237 else
2238 loop_vinfo->shared->check_datarefs ();
2240 /* Analyze the data references and also adjust the minimal
2241 vectorization factor according to the loads and stores. */
2243 ok = vect_analyze_data_refs (loop_vinfo, &min_vf, &fatal);
2244 if (!ok)
2246 if (dump_enabled_p ())
2247 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2248 "bad data references.\n");
2249 return ok;
2252 /* Classify all cross-iteration scalar data-flow cycles.
2253 Cross-iteration cycles caused by virtual phis are analyzed separately. */
2254 vect_analyze_scalar_cycles (loop_vinfo);
2256 vect_pattern_recog (loop_vinfo);
2258 vect_fixup_scalar_cycles_with_patterns (loop_vinfo);
2260 /* Analyze the access patterns of the data-refs in the loop (consecutive,
2261 complex, etc.). FORNOW: Only handle consecutive access pattern. */
2263 ok = vect_analyze_data_ref_accesses (loop_vinfo, NULL);
2264 if (!ok)
2266 if (dump_enabled_p ())
2267 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2268 "bad data access.\n");
2269 return ok;
2272 /* Data-flow analysis to detect stmts that do not need to be vectorized. */
2274 ok = vect_mark_stmts_to_be_vectorized (loop_vinfo, &fatal);
2275 if (!ok)
2277 if (dump_enabled_p ())
2278 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2279 "unexpected pattern.\n");
2280 return ok;
2283 /* The rest of the analysis below depends on the vector size in some way, so failures from here on are no longer fatal. */
2284 fatal = false;
2286 /* Analyze data dependences between the data-refs in the loop
2287 and adjust the maximum vectorization factor according to
2288 the dependences.
2289 FORNOW: fail at the first data dependence that we encounter. */
2291 ok = vect_analyze_data_ref_dependences (loop_vinfo, &max_vf);
2292 if (!ok)
2294 if (dump_enabled_p ())
2295 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2296 "bad data dependence.\n");
2297 return ok;
2299 if (max_vf != MAX_VECTORIZATION_FACTOR
2300 && maybe_lt (max_vf, min_vf))
2301 return opt_result::failure_at (vect_location, "bad data dependence.\n");
2302 LOOP_VINFO_MAX_VECT_FACTOR (loop_vinfo) = max_vf;
2304 ok = vect_determine_vectorization_factor (loop_vinfo);
2305 if (!ok)
2307 if (dump_enabled_p ())
2308 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2309 "can't determine vectorization factor.\n");
2310 return ok;
2312 if (max_vf != MAX_VECTORIZATION_FACTOR
2313 && maybe_lt (max_vf, LOOP_VINFO_VECT_FACTOR (loop_vinfo)))
2314 return opt_result::failure_at (vect_location, "bad data dependence.\n");
2316 /* Compute the scalar iteration cost. */
2317 vect_compute_single_scalar_iteration_cost (loop_vinfo);
2319 poly_uint64 saved_vectorization_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
2321 /* Check the SLP opportunities in the loop, analyze and build SLP trees. */
2322 ok = vect_analyze_slp (loop_vinfo, *n_stmts);
2323 if (!ok)
2324 return ok;
2326 /* If there are any SLP instances mark them as pure_slp. */
2327 bool slp = vect_make_slp_decision (loop_vinfo);
2328 if (slp)
2330 /* Find stmts that need to be both vectorized and SLPed. */
2331 vect_detect_hybrid_slp (loop_vinfo);
2333 /* Update the vectorization factor based on the SLP decision. */
2334 vect_update_vf_for_slp (loop_vinfo);
2336 /* Optimize the SLP graph with the vectorization factor fixed. */
2337 vect_optimize_slp (loop_vinfo);
2339 /* Gather the loads reachable from the SLP graph entries. */
2340 vect_gather_slp_loads (loop_vinfo);
2343 bool saved_can_use_partial_vectors_p
2344 = LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo);
2346 /* We don't expect to have to roll back to anything other than an empty
2347 set of rgroups. */
2348 gcc_assert (LOOP_VINFO_MASKS (loop_vinfo).is_empty ());
2350 /* This is the point where we can re-start analysis with SLP forced off. */
2351 start_over:
2353 /* Now the vectorization factor is final. */
2354 poly_uint64 vectorization_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
2355 gcc_assert (known_ne (vectorization_factor, 0U));
2357 if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo) && dump_enabled_p ())
2359 dump_printf_loc (MSG_NOTE, vect_location,
2360 "vectorization_factor = ");
2361 dump_dec (MSG_NOTE, vectorization_factor);
2362 dump_printf (MSG_NOTE, ", niters = %wd\n",
2363 LOOP_VINFO_INT_NITERS (loop_vinfo));
2366 /* Analyze the alignment of the data-refs in the loop.
2367 Fail if a data reference is found that cannot be vectorized. */
2369 ok = vect_analyze_data_refs_alignment (loop_vinfo);
2370 if (!ok)
2372 if (dump_enabled_p ())
2373 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2374 "bad data alignment.\n");
2375 return ok;
2378 /* Prune the list of ddrs to be tested at run-time by versioning for alias.
2379 It is important to call pruning after vect_analyze_data_ref_accesses,
2380 since we use grouping information gathered by interleaving analysis. */
2381 ok = vect_prune_runtime_alias_test_list (loop_vinfo);
2382 if (!ok)
2383 return ok;
2385 /* Do not invoke vect_enhance_data_refs_alignment for epilogue
2386 vectorization, since we do not want to add extra peeling or
2387 add versioning for alignment. */
2388 if (!LOOP_VINFO_EPILOGUE_P (loop_vinfo))
2389 /* This pass will decide on using loop versioning and/or loop peeling in
2390 order to enhance the alignment of data references in the loop. */
2391 ok = vect_enhance_data_refs_alignment (loop_vinfo);
2392 if (!ok)
2393 return ok;
2395 if (slp)
2397 /* Analyze operations in the SLP instances. Note this may
2398 remove unsupported SLP instances which makes the above
2399 SLP kind detection invalid. */
2400 unsigned old_size = LOOP_VINFO_SLP_INSTANCES (loop_vinfo).length ();
2401 vect_slp_analyze_operations (loop_vinfo);
2402 if (LOOP_VINFO_SLP_INSTANCES (loop_vinfo).length () != old_size)
2404 ok = opt_result::failure_at (vect_location,
2405 "unsupported SLP instances\n");
2406 goto again;
2409 /* Check whether any load in ALL SLP instances is possibly permuted. */
2410 slp_tree load_node, slp_root;
2411 unsigned i, x;
2412 slp_instance instance;
2413 bool can_use_lanes = true;
2414 FOR_EACH_VEC_ELT (LOOP_VINFO_SLP_INSTANCES (loop_vinfo), x, instance)
2416 slp_root = SLP_INSTANCE_TREE (instance);
2417 int group_size = SLP_TREE_LANES (slp_root);
2418 tree vectype = SLP_TREE_VECTYPE (slp_root);
2419 bool loads_permuted = false;
2420 FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (instance), i, load_node)
2422 if (!SLP_TREE_LOAD_PERMUTATION (load_node).exists ())
2423 continue;
2424 unsigned j;
2425 stmt_vec_info load_info;
2426 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (load_node), j, load_info)
2427 if (SLP_TREE_LOAD_PERMUTATION (load_node)[j] != j)
2429 loads_permuted = true;
2430 break;
2434 /* If the loads and stores can be handled with load/store-lane
2435 instructions, record it and move on to the next instance. */
2436 if (loads_permuted
2437 && SLP_INSTANCE_KIND (instance) == slp_inst_kind_store
2438 && vect_store_lanes_supported (vectype, group_size, false))
2440 FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (instance), i, load_node)
2442 stmt_vec_info stmt_vinfo = DR_GROUP_FIRST_ELEMENT
2443 (SLP_TREE_SCALAR_STMTS (load_node)[0]);
2444 /* Use SLP for strided accesses (or if we can't
2445 load-lanes). */
2446 if (STMT_VINFO_STRIDED_P (stmt_vinfo)
2447 || ! vect_load_lanes_supported
2448 (STMT_VINFO_VECTYPE (stmt_vinfo),
2449 DR_GROUP_SIZE (stmt_vinfo), false))
2450 break;
2453 can_use_lanes
2454 = can_use_lanes && i == SLP_INSTANCE_LOADS (instance).length ();
2456 if (can_use_lanes && dump_enabled_p ())
2457 dump_printf_loc (MSG_NOTE, vect_location,
2458 "SLP instance %p can use load/store-lanes\n",
2459 instance);
2461 else
2463 can_use_lanes = false;
2464 break;
2468 /* If all SLP instances can use load/store-lanes, abort SLP and try again
2469 with SLP disabled. */
2470 if (can_use_lanes)
2472 ok = opt_result::failure_at (vect_location,
2473 "Built SLP cancelled: can use "
2474 "load/store-lanes\n");
2475 if (dump_enabled_p ())
2476 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2477 "Built SLP cancelled: all SLP instances support "
2478 "load/store-lanes\n");
2479 goto again;
2483 /* Dissolve SLP-only groups. */
2484 vect_dissolve_slp_only_groups (loop_vinfo);
2486 /* Scan all the remaining operations in the loop that are not subject
2487 to SLP and make sure they are vectorizable. */
2488 ok = vect_analyze_loop_operations (loop_vinfo);
2489 if (!ok)
2491 if (dump_enabled_p ())
2492 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2493 "bad operation or unsupported loop bound.\n");
2494 return ok;
2497 /* For now, we don't expect to mix both masking and length approaches for one
2498 loop, so disable the use of partial vectors if both are recorded. */
2499 if (LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo)
2500 && !LOOP_VINFO_MASKS (loop_vinfo).is_empty ()
2501 && !LOOP_VINFO_LENS (loop_vinfo).is_empty ())
2503 if (dump_enabled_p ())
2504 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2505 "can't vectorize a loop with partial vectors"
2506 " because we don't expect to mix different"
2507 " approaches with partial vectors for the"
2508 " same loop.\n");
2509 LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo) = false;
2512 /* If we still have the option of using partial vectors,
2513 check whether we can generate the necessary loop controls. */
2514 if (LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo)
2515 && !vect_verify_full_masking (loop_vinfo)
2516 && !vect_verify_loop_lens (loop_vinfo))
2517 LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo) = false;
2519 /* If we're vectorizing an epilogue loop, the vectorized loop either needs
2520 to be able to handle fewer than VF scalars, or needs to have a lower VF
2521 than the main loop. */
2522 if (LOOP_VINFO_EPILOGUE_P (loop_vinfo)
2523 && !LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo)
2524 && maybe_ge (LOOP_VINFO_VECT_FACTOR (loop_vinfo),
2525 LOOP_VINFO_VECT_FACTOR (orig_loop_vinfo)))
2526 return opt_result::failure_at (vect_location,
2527 "Vectorization factor too high for"
2528 " epilogue loop.\n");
2530 /* Decide whether this loop_vinfo should use partial vectors or peeling,
2531 assuming that the loop will be used as a main loop. We will redo
2532 this analysis later if we instead decide to use the loop as an
2533 epilogue loop. */
2534 ok = vect_determine_partial_vectors_and_peeling (loop_vinfo, false);
2535 if (!ok)
2536 return ok;
2538 /* Check the costings of the loop make vectorizing worthwhile. */
2539 res = vect_analyze_loop_costing (loop_vinfo);
2540 if (res < 0)
2542 ok = opt_result::failure_at (vect_location,
2543 "Loop costings may not be worthwhile.\n");
2544 goto again;
2546 if (!res)
2547 return opt_result::failure_at (vect_location,
2548 "Loop costings not worthwhile.\n");
2550 /* If an epilogue loop is required make sure we can create one. */
2551 if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo)
2552 || LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo))
2554 if (dump_enabled_p ())
2555 dump_printf_loc (MSG_NOTE, vect_location, "epilog loop required\n");
2556 if (!vect_can_advance_ivs_p (loop_vinfo)
2557 || !slpeel_can_duplicate_loop_p (LOOP_VINFO_LOOP (loop_vinfo),
2558 single_exit (LOOP_VINFO_LOOP
2559 (loop_vinfo))))
2561 ok = opt_result::failure_at (vect_location,
2562 "not vectorized: can't create required "
2563 "epilog loop\n");
2564 goto again;
2568 /* During peeling, we need to check if the number of loop iterations is
2569 enough for both the peeled prolog loop and the vector loop. This check
2570 can be merged with the threshold check of loop versioning, so
2571 increase the threshold for this case if necessary.
2573 If we are analyzing an epilogue we still want to check what its
2574 versioning threshold would be. If we decide to vectorize the epilogues we
2575 will want to use the lowest versioning threshold of all epilogues and main
2576 loop. This will enable us to enter a vectorized epilogue even when
2577 versioning the loop. We can't simply check whether the epilogue requires
2578 versioning though since we may have skipped some versioning checks when
2579 analyzing the epilogue. For instance, checks for alias versioning will be
2580 skipped when dealing with epilogues as we assume we already checked them
2581 for the main loop. So instead we always check the 'orig_loop_vinfo'. */
2582 if (LOOP_REQUIRES_VERSIONING (orig_loop_vinfo))
2584 poly_uint64 niters_th = 0;
2585 unsigned int th = LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo);
2587 if (!vect_use_loop_mask_for_alignment_p (loop_vinfo))
2589 /* Niters for peeled prolog loop. */
2590 if (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) < 0)
2592 dr_vec_info *dr_info = LOOP_VINFO_UNALIGNED_DR (loop_vinfo);
2593 tree vectype = STMT_VINFO_VECTYPE (dr_info->stmt);
2594 niters_th += TYPE_VECTOR_SUBPARTS (vectype) - 1;
2596 else
2597 niters_th += LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo);
2600 /* Niters for at least one iteration of vectorized loop. */
2601 if (!LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))
2602 niters_th += LOOP_VINFO_VECT_FACTOR (loop_vinfo);
2603 /* One additional iteration because of peeling for gap. */
2604 if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo))
2605 niters_th += 1;
2607 /* Use the same condition as vect_transform_loop to decide when to use
2608 the cost to determine a versioning threshold. */
2609 if (vect_apply_runtime_profitability_check_p (loop_vinfo)
2610 && ordered_p (th, niters_th))
2611 niters_th = ordered_max (poly_uint64 (th), niters_th);
2613 LOOP_VINFO_VERSIONING_THRESHOLD (loop_vinfo) = niters_th;
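/* For example (hypothetical values): peeling 3 scalar iterations for
   alignment, a VF of 8 and one extra iteration for a gap give
   niters_th = 3 + 8 + 1 = 12; if the runtime profitability check is
   applied and the cost threshold TH is 20, the versioning threshold is
   raised to 20. */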
2616 gcc_assert (known_eq (vectorization_factor,
2617 LOOP_VINFO_VECT_FACTOR (loop_vinfo)));
2619 /* Ok to vectorize! */
2620 return opt_result::success ();
2622 again:
2623 /* Ensure that "ok" is false (with an opt_problem if dumping is enabled). */
2624 gcc_assert (!ok);
2626 /* Try again with SLP forced off but if we didn't do any SLP there is
2627 no point in re-trying. */
2628 if (!slp)
2629 return ok;
2631 /* If there are reduction chains re-trying will fail anyway. */
2632 if (! LOOP_VINFO_REDUCTION_CHAINS (loop_vinfo).is_empty ())
2633 return ok;
2635 /* Likewise if the grouped loads or stores in the SLP cannot be handled
2636 via interleaving or lane instructions. */
2637 slp_instance instance;
2638 slp_tree node;
2639 unsigned i, j;
2640 FOR_EACH_VEC_ELT (LOOP_VINFO_SLP_INSTANCES (loop_vinfo), i, instance)
2642 stmt_vec_info vinfo;
2643 vinfo = SLP_TREE_SCALAR_STMTS (SLP_INSTANCE_TREE (instance))[0];
2644 if (! STMT_VINFO_GROUPED_ACCESS (vinfo))
2645 continue;
2646 vinfo = DR_GROUP_FIRST_ELEMENT (vinfo);
2647 unsigned int size = DR_GROUP_SIZE (vinfo);
2648 tree vectype = STMT_VINFO_VECTYPE (vinfo);
2649 if (! vect_store_lanes_supported (vectype, size, false)
2650 && ! known_eq (TYPE_VECTOR_SUBPARTS (vectype), 1U)
2651 && ! vect_grouped_store_supported (vectype, size))
2652 return opt_result::failure_at (vinfo->stmt,
2653 "unsupported grouped store\n");
2654 FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (instance), j, node)
2656 vinfo = SLP_TREE_SCALAR_STMTS (node)[0];
2657 vinfo = DR_GROUP_FIRST_ELEMENT (vinfo);
2658 bool single_element_p = !DR_GROUP_NEXT_ELEMENT (vinfo);
2659 size = DR_GROUP_SIZE (vinfo);
2660 vectype = STMT_VINFO_VECTYPE (vinfo);
2661 if (! vect_load_lanes_supported (vectype, size, false)
2662 && ! vect_grouped_load_supported (vectype, single_element_p,
2663 size))
2664 return opt_result::failure_at (vinfo->stmt,
2665 "unsupported grouped load\n");
2669 if (dump_enabled_p ())
2670 dump_printf_loc (MSG_NOTE, vect_location,
2671 "re-trying with SLP disabled\n");
2673 /* Roll back state appropriately. No SLP this time. */
2674 slp = false;
2675 /* Restore vectorization factor as it were without SLP. */
2676 LOOP_VINFO_VECT_FACTOR (loop_vinfo) = saved_vectorization_factor;
2677 /* Free the SLP instances. */
2678 FOR_EACH_VEC_ELT (LOOP_VINFO_SLP_INSTANCES (loop_vinfo), j, instance)
2679 vect_free_slp_instance (instance);
2680 LOOP_VINFO_SLP_INSTANCES (loop_vinfo).release ();
2681 /* Reset SLP type to loop_vect on all stmts. */
2682 for (i = 0; i < LOOP_VINFO_LOOP (loop_vinfo)->num_nodes; ++i)
2684 basic_block bb = LOOP_VINFO_BBS (loop_vinfo)[i];
2685 for (gimple_stmt_iterator si = gsi_start_phis (bb);
2686 !gsi_end_p (si); gsi_next (&si))
2688 stmt_vec_info stmt_info = loop_vinfo->lookup_stmt (gsi_stmt (si));
2689 STMT_SLP_TYPE (stmt_info) = loop_vect;
2690 if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_reduction_def
2691 || STMT_VINFO_DEF_TYPE (stmt_info) == vect_double_reduction_def)
2693 /* vectorizable_reduction adjusts reduction stmt def-types,
2694 restore them to that of the PHI. */
2695 STMT_VINFO_DEF_TYPE (STMT_VINFO_REDUC_DEF (stmt_info))
2696 = STMT_VINFO_DEF_TYPE (stmt_info);
2697 STMT_VINFO_DEF_TYPE (vect_stmt_to_vectorize
2698 (STMT_VINFO_REDUC_DEF (stmt_info)))
2699 = STMT_VINFO_DEF_TYPE (stmt_info);
2702 for (gimple_stmt_iterator si = gsi_start_bb (bb);
2703 !gsi_end_p (si); gsi_next (&si))
2705 if (is_gimple_debug (gsi_stmt (si)))
2706 continue;
2707 stmt_vec_info stmt_info = loop_vinfo->lookup_stmt (gsi_stmt (si));
2708 STMT_SLP_TYPE (stmt_info) = loop_vect;
2709 if (STMT_VINFO_IN_PATTERN_P (stmt_info))
2711 stmt_vec_info pattern_stmt_info
2712 = STMT_VINFO_RELATED_STMT (stmt_info);
2713 if (STMT_VINFO_SLP_VECT_ONLY_PATTERN (pattern_stmt_info))
2714 STMT_VINFO_IN_PATTERN_P (stmt_info) = false;
2716 gimple *pattern_def_seq = STMT_VINFO_PATTERN_DEF_SEQ (stmt_info);
2717 STMT_SLP_TYPE (pattern_stmt_info) = loop_vect;
2718 for (gimple_stmt_iterator pi = gsi_start (pattern_def_seq);
2719 !gsi_end_p (pi); gsi_next (&pi))
2720 STMT_SLP_TYPE (loop_vinfo->lookup_stmt (gsi_stmt (pi)))
2721 = loop_vect;
2725 /* Free optimized alias test DDRS. */
2726 LOOP_VINFO_LOWER_BOUNDS (loop_vinfo).truncate (0);
2727 LOOP_VINFO_COMP_ALIAS_DDRS (loop_vinfo).release ();
2728 LOOP_VINFO_CHECK_UNEQUAL_ADDRS (loop_vinfo).release ();
2729 /* Reset target cost data. */
2730 destroy_cost_data (LOOP_VINFO_TARGET_COST_DATA (loop_vinfo));
2731 LOOP_VINFO_TARGET_COST_DATA (loop_vinfo)
2732 = init_cost (LOOP_VINFO_LOOP (loop_vinfo), false);
2733 /* Reset accumulated rgroup information. */
2734 release_vec_loop_controls (&LOOP_VINFO_MASKS (loop_vinfo));
2735 release_vec_loop_controls (&LOOP_VINFO_LENS (loop_vinfo));
2736 /* Reset assorted flags. */
2737 LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo) = false;
2738 LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo) = false;
2739 LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo) = 0;
2740 LOOP_VINFO_VERSIONING_THRESHOLD (loop_vinfo) = 0;
2741 LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo)
2742 = saved_can_use_partial_vectors_p;
2744 goto start_over;
2747 /* Return true if vectorizing a loop using NEW_LOOP_VINFO appears
2748 to be better than vectorizing it using OLD_LOOP_VINFO. Assume that
2749 OLD_LOOP_VINFO is better unless something specifically indicates
2750 otherwise.
2752 Note that this deliberately isn't a partial order. */
2754 static bool
2755 vect_better_loop_vinfo_p (loop_vec_info new_loop_vinfo,
2756 loop_vec_info old_loop_vinfo)
2758 struct loop *loop = LOOP_VINFO_LOOP (new_loop_vinfo);
2759 gcc_assert (LOOP_VINFO_LOOP (old_loop_vinfo) == loop);
2761 poly_int64 new_vf = LOOP_VINFO_VECT_FACTOR (new_loop_vinfo);
2762 poly_int64 old_vf = LOOP_VINFO_VECT_FACTOR (old_loop_vinfo);
2764 /* Always prefer a VF of loop->simdlen over any other VF. */
2765 if (loop->simdlen)
2767 bool new_simdlen_p = known_eq (new_vf, loop->simdlen);
2768 bool old_simdlen_p = known_eq (old_vf, loop->simdlen);
2769 if (new_simdlen_p != old_simdlen_p)
2770 return new_simdlen_p;
2773 /* Limit the VFs to what is likely to be the maximum number of iterations,
2774 to handle cases in which at least one loop_vinfo is fully-masked. */
2775 HOST_WIDE_INT estimated_max_niter;
2776 loop_vec_info main_loop = LOOP_VINFO_ORIG_LOOP_INFO (old_loop_vinfo);
2777 unsigned HOST_WIDE_INT main_vf;
2778 if (main_loop
2779 && LOOP_VINFO_NITERS_KNOWN_P (main_loop)
2780 && LOOP_VINFO_VECT_FACTOR (main_loop).is_constant (&main_vf))
2781 estimated_max_niter = LOOP_VINFO_INT_NITERS (main_loop) % main_vf;
2782 else
2783 estimated_max_niter = likely_max_stmt_executions_int (loop);
2784 if (estimated_max_niter != -1)
2786 if (known_le (estimated_max_niter, new_vf))
2787 new_vf = estimated_max_niter;
2788 if (known_le (estimated_max_niter, old_vf))
2789 old_vf = estimated_max_niter;
2792 /* Check whether the (fractional) cost per scalar iteration is lower
2793 or higher: new_inside_cost / new_vf vs. old_inside_cost / old_vf. */
2794 poly_int64 rel_new = new_loop_vinfo->vec_inside_cost * old_vf;
2795 poly_int64 rel_old = old_loop_vinfo->vec_inside_cost * new_vf;
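/* For example (hypothetical costs): a new body costing 20 at VF 8 against
   an old body costing 12 at VF 4 compares 20 * 4 = 80 with 12 * 8 = 96,
   i.e. the new loop is cheaper per scalar iteration, without having to
   divide the (possibly non-constant) polynomial values. */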
2797 HOST_WIDE_INT est_rel_new_min
2798 = estimated_poly_value (rel_new, POLY_VALUE_MIN);
2799 HOST_WIDE_INT est_rel_new_max
2800 = estimated_poly_value (rel_new, POLY_VALUE_MAX);
2802 HOST_WIDE_INT est_rel_old_min
2803 = estimated_poly_value (rel_old, POLY_VALUE_MIN);
2804 HOST_WIDE_INT est_rel_old_max
2805 = estimated_poly_value (rel_old, POLY_VALUE_MAX);
2807 /* Check first if we can make out an unambiguous total order from the minimum
2808 and maximum estimates. */
2809 if (est_rel_new_min < est_rel_old_min
2810 && est_rel_new_max < est_rel_old_max)
2811 return true;
2812 else if (est_rel_old_min < est_rel_new_min
2813 && est_rel_old_max < est_rel_new_max)
2814 return false;
2815 /* When old_loop_vinfo uses a variable vectorization factor,
2816 we know that it has a lower cost for at least one runtime VF.
2817 However, we don't know how likely that VF is.
2819 One option would be to compare the costs for the estimated VFs.
2820 The problem is that that can put too much pressure on the cost
2821 model. E.g. if the estimated VF is also the lowest possible VF,
2822 and if old_loop_vinfo is 1 unit worse than new_loop_vinfo
2823 for the estimated VF, we'd then choose new_loop_vinfo even
2824 though (a) new_loop_vinfo might not actually be better than
2825 old_loop_vinfo for that VF and (b) it would be significantly
2826 worse at larger VFs.
2828 Here we go for a hacky compromise: pick new_loop_vinfo if it is
2829 no more expensive than old_loop_vinfo even after doubling the
2830 estimated old_loop_vinfo VF. For all but trivial loops, this
2831 ensures that we only pick new_loop_vinfo if it is significantly
2832 better than old_loop_vinfo at the estimated VF. */
2834 if (est_rel_old_min != est_rel_new_min
2835 || est_rel_old_max != est_rel_new_max)
2837 HOST_WIDE_INT est_rel_new_likely
2838 = estimated_poly_value (rel_new, POLY_VALUE_LIKELY);
2839 HOST_WIDE_INT est_rel_old_likely
2840 = estimated_poly_value (rel_old, POLY_VALUE_LIKELY);
2842 return est_rel_new_likely * 2 <= est_rel_old_likely;
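/* For example (hypothetical estimates): if the likely per-scalar-iteration
   cost estimate is 10 for new_loop_vinfo and 30 for old_loop_vinfo, then
   10 * 2 = 20 <= 30 holds and the new loop is preferred; with estimates of
   10 and 15 the test fails and we conservatively keep old_loop_vinfo. */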
2845 /* If there's nothing to choose between the loop bodies, see whether
2846 there's a difference in the prologue and epilogue costs. */
2847 if (new_loop_vinfo->vec_outside_cost != old_loop_vinfo->vec_outside_cost)
2848 return new_loop_vinfo->vec_outside_cost < old_loop_vinfo->vec_outside_cost;
2850 return false;
2853 /* Decide whether to replace OLD_LOOP_VINFO with NEW_LOOP_VINFO. Return
2854 true if we should. */
2856 static bool
2857 vect_joust_loop_vinfos (loop_vec_info new_loop_vinfo,
2858 loop_vec_info old_loop_vinfo)
2860 if (!vect_better_loop_vinfo_p (new_loop_vinfo, old_loop_vinfo))
2861 return false;
2863 if (dump_enabled_p ())
2864 dump_printf_loc (MSG_NOTE, vect_location,
2865 "***** Preferring vector mode %s to vector mode %s\n",
2866 GET_MODE_NAME (new_loop_vinfo->vector_mode),
2867 GET_MODE_NAME (old_loop_vinfo->vector_mode));
2868 return true;
2871 /* If LOOP_VINFO is already a main loop, return it unmodified. Otherwise
2872 try to reanalyze it as a main loop. Return the loop_vinfo on success
2873 and null on failure. */
2875 static loop_vec_info
2876 vect_reanalyze_as_main_loop (loop_vec_info loop_vinfo, unsigned int *n_stmts)
2878 if (!LOOP_VINFO_EPILOGUE_P (loop_vinfo))
2879 return loop_vinfo;
2881 if (dump_enabled_p ())
2882 dump_printf_loc (MSG_NOTE, vect_location,
2883 "***** Reanalyzing as a main loop with vector mode %s\n",
2884 GET_MODE_NAME (loop_vinfo->vector_mode));
2886 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
2887 vec_info_shared *shared = loop_vinfo->shared;
2888 opt_loop_vec_info main_loop_vinfo = vect_analyze_loop_form (loop, shared);
2889 gcc_assert (main_loop_vinfo);
2891 main_loop_vinfo->vector_mode = loop_vinfo->vector_mode;
2893 bool fatal = false;
2894 bool res = vect_analyze_loop_2 (main_loop_vinfo, fatal, n_stmts);
2895 loop->aux = NULL;
2896 if (!res)
2898 if (dump_enabled_p ())
2899 dump_printf_loc (MSG_NOTE, vect_location,
2900 "***** Failed to analyze main loop with vector"
2901 " mode %s\n",
2902 GET_MODE_NAME (loop_vinfo->vector_mode));
2903 delete main_loop_vinfo;
2904 return NULL;
2906 LOOP_VINFO_VECTORIZABLE_P (main_loop_vinfo) = 1;
2907 return main_loop_vinfo;
2910 /* Function vect_analyze_loop.
2912 Apply a set of analyses on LOOP, and create a loop_vec_info struct
2913 for it. The different analyses will record information in the
2914 loop_vec_info struct. */
2915 opt_loop_vec_info
2916 vect_analyze_loop (class loop *loop, vec_info_shared *shared)
2918 auto_vector_modes vector_modes;
2920 /* Autodetect first vector size we try. */
2921 unsigned int autovec_flags
2922 = targetm.vectorize.autovectorize_vector_modes (&vector_modes,
2923 loop->simdlen != 0);
2924 unsigned int mode_i = 0;
2926 DUMP_VECT_SCOPE ("analyze_loop_nest");
2928 if (loop_outer (loop)
2929 && loop_vec_info_for_loop (loop_outer (loop))
2930 && LOOP_VINFO_VECTORIZABLE_P (loop_vec_info_for_loop (loop_outer (loop))))
2931 return opt_loop_vec_info::failure_at (vect_location,
2932 "outer-loop already vectorized.\n");
2934 if (!find_loop_nest (loop, &shared->loop_nest))
2935 return opt_loop_vec_info::failure_at
2936 (vect_location,
2937 "not vectorized: loop nest containing two or more consecutive inner"
2938 " loops cannot be vectorized\n");
2940 unsigned n_stmts = 0;
2941 machine_mode autodetected_vector_mode = VOIDmode;
2942 opt_loop_vec_info first_loop_vinfo = opt_loop_vec_info::success (NULL);
2943 machine_mode next_vector_mode = VOIDmode;
2944 poly_uint64 lowest_th = 0;
2945 unsigned vectorized_loops = 0;
2946 bool pick_lowest_cost_p = ((autovec_flags & VECT_COMPARE_COSTS)
2947 && !unlimited_cost_model (loop));
2949 bool vect_epilogues = false;
2950 opt_result res = opt_result::success ();
2951 unsigned HOST_WIDE_INT simdlen = loop->simdlen;
2952 while (1)
2954 /* Check the CFG characteristics of the loop (nesting, entry/exit). */
2955 opt_loop_vec_info loop_vinfo = vect_analyze_loop_form (loop, shared);
2956 if (!loop_vinfo)
2958 if (dump_enabled_p ())
2959 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2960 "bad loop form.\n");
2961 gcc_checking_assert (first_loop_vinfo == NULL);
2962 return loop_vinfo;
2964 loop_vinfo->vector_mode = next_vector_mode;
2966 bool fatal = false;
2968 /* When pick_lowest_cost_p is true, we should in principle iterate
2969 over all the loop_vec_infos that LOOP_VINFO could replace and
2970 try to vectorize LOOP_VINFO under the same conditions.
2971 E.g. when trying to replace an epilogue loop, we should vectorize
2972 LOOP_VINFO as an epilogue loop with the same VF limit. When trying
2973 to replace the main loop, we should vectorize LOOP_VINFO as a main
2974 loop too.
2976 However, autovectorize_vector_modes is usually sorted as follows:
2978 - Modes that naturally produce lower VFs usually follow modes that
2979 naturally produce higher VFs.
2981 - When modes naturally produce the same VF, maskable modes
2982 usually follow unmaskable ones, so that the maskable mode
2983 can be used to vectorize the epilogue of the unmaskable mode.
2985 This order is preferred because it leads to the maximum
2986 epilogue vectorization opportunities. Targets should only use
2987 a different order if they want to make wide modes available while
2988 disparaging them relative to earlier, smaller modes. The assumption
2989 in that case is that the wider modes are more expensive in some
2990 way that isn't reflected directly in the costs.
2992 There should therefore be few interesting cases in which
2993 LOOP_VINFO fails when treated as an epilogue loop, succeeds when
2994 treated as a standalone loop, and ends up being genuinely cheaper
2995 than FIRST_LOOP_VINFO. */
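/* For example (hypothetical target): with 32-bit elements, a 256-bit
   unmaskable mode (VF 8) would typically be listed before a 128-bit mode
   (VF 4), and a maskable mode with the same VF would follow its unmaskable
   counterpart so that it can be tried for the epilogue. */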
2996 if (vect_epilogues)
2997 LOOP_VINFO_ORIG_LOOP_INFO (loop_vinfo) = first_loop_vinfo;
2999 res = vect_analyze_loop_2 (loop_vinfo, fatal, &n_stmts);
3000 if (mode_i == 0)
3001 autodetected_vector_mode = loop_vinfo->vector_mode;
3002 if (dump_enabled_p ())
3004 if (res)
3005 dump_printf_loc (MSG_NOTE, vect_location,
3006 "***** Analysis succeeded with vector mode %s\n",
3007 GET_MODE_NAME (loop_vinfo->vector_mode));
3008 else
3009 dump_printf_loc (MSG_NOTE, vect_location,
3010 "***** Analysis failed with vector mode %s\n",
3011 GET_MODE_NAME (loop_vinfo->vector_mode));
3014 loop->aux = NULL;
3016 if (!fatal)
3017 while (mode_i < vector_modes.length ()
3018 && vect_chooses_same_modes_p (loop_vinfo, vector_modes[mode_i]))
3020 if (dump_enabled_p ())
3021 dump_printf_loc (MSG_NOTE, vect_location,
3022 "***** The result for vector mode %s would"
3023 " be the same\n",
3024 GET_MODE_NAME (vector_modes[mode_i]));
3025 mode_i += 1;
3028 if (res)
3030 LOOP_VINFO_VECTORIZABLE_P (loop_vinfo) = 1;
3031 vectorized_loops++;
3033 /* Once we hit the desired simdlen for the first time,
3034 discard any previous attempts. */
3035 if (simdlen
3036 && known_eq (LOOP_VINFO_VECT_FACTOR (loop_vinfo), simdlen))
3038 delete first_loop_vinfo;
3039 first_loop_vinfo = opt_loop_vec_info::success (NULL);
3040 LOOP_VINFO_ORIG_LOOP_INFO (loop_vinfo) = NULL;
3041 simdlen = 0;
3043 else if (pick_lowest_cost_p && first_loop_vinfo)
3045 /* Keep trying to roll back vectorization attempts while the
3046 loop_vec_infos they produced were worse than this one. */
3047 vec<loop_vec_info> &vinfos = first_loop_vinfo->epilogue_vinfos;
3048 while (!vinfos.is_empty ()
3049 && vect_joust_loop_vinfos (loop_vinfo, vinfos.last ()))
3051 gcc_assert (vect_epilogues);
3052 delete vinfos.pop ();
3054 if (vinfos.is_empty ()
3055 && vect_joust_loop_vinfos (loop_vinfo, first_loop_vinfo))
3057 loop_vec_info main_loop_vinfo
3058 = vect_reanalyze_as_main_loop (loop_vinfo, &n_stmts);
3059 if (main_loop_vinfo == loop_vinfo)
3061 delete first_loop_vinfo;
3062 first_loop_vinfo = opt_loop_vec_info::success (NULL);
3064 else if (main_loop_vinfo
3065 && vect_joust_loop_vinfos (main_loop_vinfo,
3066 first_loop_vinfo))
3068 delete first_loop_vinfo;
3069 first_loop_vinfo = opt_loop_vec_info::success (NULL);
3070 delete loop_vinfo;
3071 loop_vinfo
3072 = opt_loop_vec_info::success (main_loop_vinfo);
3074 else
3076 if (dump_enabled_p ())
3077 dump_printf_loc (MSG_NOTE, vect_location,
3078 "***** No longer preferring vector"
3079 " mode %s after reanalyzing the loop"
3080 " as a main loop\n",
3081 GET_MODE_NAME
3082 (main_loop_vinfo->vector_mode));
3083 delete main_loop_vinfo;
3088 if (first_loop_vinfo == NULL)
3090 first_loop_vinfo = loop_vinfo;
3091 lowest_th = LOOP_VINFO_VERSIONING_THRESHOLD (first_loop_vinfo);
3093 else if (vect_epilogues
3094 /* For now only allow one epilogue loop. */
3095 && first_loop_vinfo->epilogue_vinfos.is_empty ())
3097 first_loop_vinfo->epilogue_vinfos.safe_push (loop_vinfo);
3098 poly_uint64 th = LOOP_VINFO_VERSIONING_THRESHOLD (loop_vinfo);
3099 gcc_assert (!LOOP_REQUIRES_VERSIONING (loop_vinfo)
3100 || maybe_ne (lowest_th, 0U));
3101 /* Keep track of the known smallest versioning
3102 threshold. */
3103 if (ordered_p (lowest_th, th))
3104 lowest_th = ordered_min (lowest_th, th);
3106 else
3108 delete loop_vinfo;
3109 loop_vinfo = opt_loop_vec_info::success (NULL);
3112 /* Only vectorize epilogues if PARAM_VECT_EPILOGUES_NOMASK is
3113 enabled, SIMDUID is not set, it is the innermost loop and we have
3114 either already found the loop's SIMDLEN or there was no SIMDLEN to
3115 begin with.
3116 TODO: Enable epilogue vectorization for loops with SIMDUID set. */
3117 vect_epilogues = (!simdlen
3118 && loop->inner == NULL
3119 && param_vect_epilogues_nomask
3120 && LOOP_VINFO_PEELING_FOR_NITER (first_loop_vinfo)
3121 && !loop->simduid
3122 /* For now only allow one epilogue loop, but allow
3123 pick_lowest_cost_p to replace it. */
3124 && (first_loop_vinfo->epilogue_vinfos.is_empty ()
3125 || pick_lowest_cost_p));
3127 /* Commit to first_loop_vinfo if we have no reason to try
3128 alternatives. */
3129 if (!simdlen && !vect_epilogues && !pick_lowest_cost_p)
3130 break;
3132 else
3134 delete loop_vinfo;
3135 loop_vinfo = opt_loop_vec_info::success (NULL);
3136 if (fatal)
3138 gcc_checking_assert (first_loop_vinfo == NULL);
3139 break;
3143 /* Handle the case where the original loop can use partial
3144 vectorization, but we only want to adopt it for the epilogue.
3145 The retry should be in the same vector mode as the original. */
3146 if (vect_epilogues
3147 && loop_vinfo
3148 && LOOP_VINFO_EPIL_USING_PARTIAL_VECTORS_P (loop_vinfo))
3150 gcc_assert (LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo)
3151 && !LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo));
3152 if (dump_enabled_p ())
3153 dump_printf_loc (MSG_NOTE, vect_location,
3154 "***** Re-trying analysis with same vector mode"
3155 " %s for epilogue with partial vectors.\n",
3156 GET_MODE_NAME (loop_vinfo->vector_mode));
3157 continue;
3160 if (mode_i < vector_modes.length ()
3161 && VECTOR_MODE_P (autodetected_vector_mode)
3162 && (related_vector_mode (vector_modes[mode_i],
3163 GET_MODE_INNER (autodetected_vector_mode))
3164 == autodetected_vector_mode)
3165 && (related_vector_mode (autodetected_vector_mode,
3166 GET_MODE_INNER (vector_modes[mode_i]))
3167 == vector_modes[mode_i]))
3169 if (dump_enabled_p ())
3170 dump_printf_loc (MSG_NOTE, vect_location,
3171 "***** Skipping vector mode %s, which would"
3172 " repeat the analysis for %s\n",
3173 GET_MODE_NAME (vector_modes[mode_i]),
3174 GET_MODE_NAME (autodetected_vector_mode));
3175 mode_i += 1;
3178 if (mode_i == vector_modes.length ()
3179 || autodetected_vector_mode == VOIDmode)
3180 break;
3182 /* Try the next biggest vector size. */
3183 next_vector_mode = vector_modes[mode_i++];
3184 if (dump_enabled_p ())
3185 dump_printf_loc (MSG_NOTE, vect_location,
3186 "***** Re-trying analysis with vector mode %s\n",
3187 GET_MODE_NAME (next_vector_mode));
3190 if (first_loop_vinfo)
3192 loop->aux = (loop_vec_info) first_loop_vinfo;
3193 if (dump_enabled_p ())
3194 dump_printf_loc (MSG_NOTE, vect_location,
3195 "***** Choosing vector mode %s\n",
3196 GET_MODE_NAME (first_loop_vinfo->vector_mode));
3197 LOOP_VINFO_VERSIONING_THRESHOLD (first_loop_vinfo) = lowest_th;
3198 return first_loop_vinfo;
3201 return opt_loop_vec_info::propagate_failure (res);
3204 /* Return true if there is an in-order reduction function for CODE, storing
3205 it in *REDUC_FN if so. */
3207 static bool
3208 fold_left_reduction_fn (tree_code code, internal_fn *reduc_fn)
3210 switch (code)
3212 case PLUS_EXPR:
3213 *reduc_fn = IFN_FOLD_LEFT_PLUS;
3214 return true;
3216 default:
3217 return false;
3221 /* Function reduction_fn_for_scalar_code
3223 Input:
3224 CODE - tree_code of a reduction operation.
3226 Output:
3227 REDUC_FN - the corresponding internal function to be used to reduce the
3228 vector of partial results into a single scalar result, or IFN_LAST
3229 if the operation is a supported reduction operation, but does not have
3230 such an internal function.
3232 Return FALSE if CODE currently cannot be vectorized as a reduction. */
3234 bool
3235 reduction_fn_for_scalar_code (enum tree_code code, internal_fn *reduc_fn)
3237 switch (code)
3239 case MAX_EXPR:
3240 *reduc_fn = IFN_REDUC_MAX;
3241 return true;
3243 case MIN_EXPR:
3244 *reduc_fn = IFN_REDUC_MIN;
3245 return true;
3247 case PLUS_EXPR:
3248 *reduc_fn = IFN_REDUC_PLUS;
3249 return true;
3251 case BIT_AND_EXPR:
3252 *reduc_fn = IFN_REDUC_AND;
3253 return true;
3255 case BIT_IOR_EXPR:
3256 *reduc_fn = IFN_REDUC_IOR;
3257 return true;
3259 case BIT_XOR_EXPR:
3260 *reduc_fn = IFN_REDUC_XOR;
3261 return true;
3263 case MULT_EXPR:
3264 case MINUS_EXPR:
3265 *reduc_fn = IFN_LAST;
3266 return true;
3268 default:
3269 return false;
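/* For example (hypothetical reduction): a loop such as

     for (i = 0; i < n; i++)
       m = a[i] > m ? a[i] : m;

   is typically folded to a MAX_EXPR reduction, so the vector of partial
   maxima is collapsed to a single scalar with IFN_REDUC_MAX; for MULT_EXPR
   the function returns IFN_LAST and the reduction epilogue has to be
   open-coded instead. */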
3273 /* If there is a neutral value X such that a reduction would not be affected
3274 by the introduction of additional X elements, return that X, otherwise
3275 return null. CODE is the code of the reduction and SCALAR_TYPE is type
3276 of the scalar elements. If the reduction has just a single initial value
3277 then INITIAL_VALUE is that value, otherwise it is null. */
3279 static tree
3280 neutral_op_for_reduction (tree scalar_type, tree_code code, tree initial_value)
3282 switch (code)
3284 case WIDEN_SUM_EXPR:
3285 case DOT_PROD_EXPR:
3286 case SAD_EXPR:
3287 case PLUS_EXPR:
3288 case MINUS_EXPR:
3289 case BIT_IOR_EXPR:
3290 case BIT_XOR_EXPR:
3291 return build_zero_cst (scalar_type);
3293 case MULT_EXPR:
3294 return build_one_cst (scalar_type);
3296 case BIT_AND_EXPR:
3297 return build_all_ones_cst (scalar_type);
3299 case MAX_EXPR:
3300 case MIN_EXPR:
3301 return initial_value;
3303 default:
3304 return NULL_TREE;
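/* For example (hypothetical padding): when a sum reduction is widened to a
   vector that is only partially filled, the unused lanes can be seeded
   with the neutral value 0 without changing the result; a product uses 1,
   a bitwise AND uses all-ones, and MIN/MAX simply reuse the single initial
   value. */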
3308 /* Error reporting helper for vect_is_simple_reduction below. GIMPLE statement
3309 STMT is printed with a message MSG. */
3311 static void
3312 report_vect_op (dump_flags_t msg_type, gimple *stmt, const char *msg)
3314 dump_printf_loc (msg_type, vect_location, "%s%G", msg, stmt);
3317 /* Return true if we need an in-order reduction for operation CODE
3318 on type TYPE, i.e. if the scalar reduction operations must not be
3319 reassociated. */
3321 bool
3322 needs_fold_left_reduction_p (tree type, tree_code code)
3324 /* CHECKME: check for !flag_finite_math_only too? */
3325 if (SCALAR_FLOAT_TYPE_P (type))
3326 switch (code)
3328 case MIN_EXPR:
3329 case MAX_EXPR:
3330 return false;
3332 default:
3333 return !flag_associative_math;
3336 if (INTEGRAL_TYPE_P (type))
3338 if (!operation_no_trapping_overflow (type, code))
3339 return true;
3340 return false;
3343 if (SAT_FIXED_POINT_TYPE_P (type))
3344 return true;
3346 return false;
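/* For example (hypothetical reduction): a float accumulation

     for (i = 0; i < n; i++)
       sum += a[i];

   needs a fold-left (in-order) reduction unless -fassociative-math is in
   effect, because reassociating the additions can change the rounded
   result; likewise a signed integer reduction whose overflow traps
   (e.g. with -ftrapv) must keep the original evaluation order. */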
3349 /* Return true if the reduction PHI in LOOP with latch arg LOOP_ARG
3350 has a handled computation expression. Store the main reduction
3351 operation in *CODE. */
3353 static bool
3354 check_reduction_path (dump_user_location_t loc, loop_p loop, gphi *phi,
3355 tree loop_arg, enum tree_code *code,
3356 vec<std::pair<ssa_op_iter, use_operand_p> > &path)
3358 auto_bitmap visited;
3359 tree lookfor = PHI_RESULT (phi);
3360 ssa_op_iter curri;
3361 use_operand_p curr = op_iter_init_phiuse (&curri, phi, SSA_OP_USE);
3362 while (USE_FROM_PTR (curr) != loop_arg)
3363 curr = op_iter_next_use (&curri);
3364 curri.i = curri.numops;
3367 path.safe_push (std::make_pair (curri, curr));
3368 tree use = USE_FROM_PTR (curr);
3369 if (use == lookfor)
3370 break;
3371 gimple *def = SSA_NAME_DEF_STMT (use);
3372 if (gimple_nop_p (def)
3373 || ! flow_bb_inside_loop_p (loop, gimple_bb (def)))
3375 pop:
3378 std::pair<ssa_op_iter, use_operand_p> x = path.pop ();
3379 curri = x.first;
3380 curr = x.second;
3382 curr = op_iter_next_use (&curri);
3383 /* Skip already visited or non-SSA operands (from iterating
3384 over PHI args). */
3385 while (curr != NULL_USE_OPERAND_P
3386 && (TREE_CODE (USE_FROM_PTR (curr)) != SSA_NAME
3387 || ! bitmap_set_bit (visited,
3388 SSA_NAME_VERSION
3389 (USE_FROM_PTR (curr)))));
3391 while (curr == NULL_USE_OPERAND_P && ! path.is_empty ());
3392 if (curr == NULL_USE_OPERAND_P)
3393 break;
3395 else
3397 if (gimple_code (def) == GIMPLE_PHI)
3398 curr = op_iter_init_phiuse (&curri, as_a <gphi *>(def), SSA_OP_USE);
3399 else
3400 curr = op_iter_init_use (&curri, def, SSA_OP_USE);
3401 while (curr != NULL_USE_OPERAND_P
3402 && (TREE_CODE (USE_FROM_PTR (curr)) != SSA_NAME
3403 || ! bitmap_set_bit (visited,
3404 SSA_NAME_VERSION
3405 (USE_FROM_PTR (curr)))))
3406 curr = op_iter_next_use (&curri);
3407 if (curr == NULL_USE_OPERAND_P)
3408 goto pop;
3411 while (1);
3412 if (dump_file && (dump_flags & TDF_DETAILS))
3414 dump_printf_loc (MSG_NOTE, loc, "reduction path: ");
3415 unsigned i;
3416 std::pair<ssa_op_iter, use_operand_p> *x;
3417 FOR_EACH_VEC_ELT (path, i, x)
3418 dump_printf (MSG_NOTE, "%T ", USE_FROM_PTR (x->second));
3419 dump_printf (MSG_NOTE, "\n");
3422 /* Check whether the reduction path detected is valid. */
3423 bool fail = path.length () == 0;
3424 bool neg = false;
3425 int sign = -1;
3426 *code = ERROR_MARK;
3427 for (unsigned i = 1; i < path.length (); ++i)
3429 gimple *use_stmt = USE_STMT (path[i].second);
3430 tree op = USE_FROM_PTR (path[i].second);
3431 if (! is_gimple_assign (use_stmt)
3432 /* The following makes sure we can compute the operand index
3433 easily, plus it mostly disallows chaining via COND_EXPR condition
3434 operands. */
3435 || (gimple_assign_rhs1_ptr (use_stmt) != path[i].second->use
3436 && (gimple_num_ops (use_stmt) <= 2
3437 || gimple_assign_rhs2_ptr (use_stmt) != path[i].second->use)
3438 && (gimple_num_ops (use_stmt) <= 3
3439 || gimple_assign_rhs3_ptr (use_stmt) != path[i].second->use)))
3441 fail = true;
3442 break;
3444 tree_code use_code = gimple_assign_rhs_code (use_stmt);
3445 if (use_code == MINUS_EXPR)
3447 use_code = PLUS_EXPR;
3448 /* Track whether we negate the reduction value each iteration. */
3449 if (gimple_assign_rhs2 (use_stmt) == op)
3450 neg = ! neg;
3452 if (CONVERT_EXPR_CODE_P (use_code)
3453 && tree_nop_conversion_p (TREE_TYPE (gimple_assign_lhs (use_stmt)),
3454 TREE_TYPE (gimple_assign_rhs1 (use_stmt))))
3456 else if (*code == ERROR_MARK)
3458 *code = use_code;
3459 sign = TYPE_SIGN (TREE_TYPE (gimple_assign_lhs (use_stmt)));
3461 else if (use_code != *code)
3463 fail = true;
3464 break;
3466 else if ((use_code == MIN_EXPR
3467 || use_code == MAX_EXPR)
3468 && sign != TYPE_SIGN (TREE_TYPE (gimple_assign_lhs (use_stmt))))
3470 fail = true;
3471 break;
3473       /* Check that there is only a single stmt the op is used in.  For the
3474          non-value-changing tail and the last stmt allow out-of-loop uses.
3475 ??? We could relax this and handle arbitrary live stmts by
3476 forcing a scalar epilogue for example. */
3477 imm_use_iterator imm_iter;
3478 gimple *op_use_stmt;
3479 unsigned cnt = 0;
3480 FOR_EACH_IMM_USE_STMT (op_use_stmt, imm_iter, op)
3481 if (!is_gimple_debug (op_use_stmt)
3482 && (*code != ERROR_MARK
3483 || flow_bb_inside_loop_p (loop, gimple_bb (op_use_stmt))))
3485 /* We want to allow x + x but not x < 1 ? x : 2. */
3486 if (is_gimple_assign (op_use_stmt)
3487 && gimple_assign_rhs_code (op_use_stmt) == COND_EXPR)
3489 use_operand_p use_p;
3490 FOR_EACH_IMM_USE_ON_STMT (use_p, imm_iter)
3491 cnt++;
3493 else
3494 cnt++;
3496 if (cnt != 1)
3498 fail = true;
3499 break;
3502 return ! fail && ! neg && *code != ERROR_MARK;
3505 bool
3506 check_reduction_path (dump_user_location_t loc, loop_p loop, gphi *phi,
3507 tree loop_arg, enum tree_code code)
3509 auto_vec<std::pair<ssa_op_iter, use_operand_p> > path;
3510 enum tree_code code_;
3511 return (check_reduction_path (loc, loop, phi, loop_arg, &code_, path)
3512 && code_ == code);
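
/* For illustration (source-level sketches with assumed variables): the path
   check accepts cycles whose statements all use one operation code, with
   no-op conversions allowed at either end, e.g.

     for (i = 0; i < n; ++i)
       s = s + a[i];              single PLUS_EXPR step
     for (i = 0; i < n; ++i)
       s = (s + a[i]) + b[i];     two PLUS_EXPR steps, a chain

   but rejects cycles that mix operation codes or negate the running value:

     for (i = 0; i < n; ++i)
       s = (s + a[i]) * b[i];     PLUS_EXPR then MULT_EXPR: code mismatch
     for (i = 0; i < n; ++i)
       s = a[i] - s;              s is the subtrahend, so NEG toggles  */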
3517 /* Function vect_is_simple_reduction
3519 (1) Detect a cross-iteration def-use cycle that represents a simple
3520 reduction computation. We look for the following pattern:
3522 loop_header:
3523 a1 = phi < a0, a2 >
3524 a3 = ...
3525 a2 = operation (a3, a1)
3527    or
3529    a3 = ...
3530 loop_header:
3531 a1 = phi < a0, a2 >
3532 a2 = operation (a3, a1)
3534 such that:
3535 1. operation is commutative and associative and it is safe to
3536 change the order of the computation
3537 2. no uses for a2 in the loop (a2 is used out of the loop)
3538 3. no uses of a1 in the loop besides the reduction operation
3539 4. no uses of a1 outside the loop.
3541 Conditions 1,4 are tested here.
3542 Conditions 2,3 are tested in vect_mark_stmts_to_be_vectorized.
3544 (2) Detect a cross-iteration def-use cycle in nested loops, i.e.,
3545 nested cycles.
3547 (3) Detect cycles of phi nodes in outer-loop vectorization, i.e., double
3548 reductions:
3550 a1 = phi < a0, a2 >
3551 inner loop (def of a3)
3552 a2 = phi < a3 >
3554    (4) Detect condition expressions, i.e.:
3555      for (int i = 0; i < N; i++)
3556        if (a[i] < val)
3557          ret_val = a[i];
3559 */
3561 static stmt_vec_info
3562 vect_is_simple_reduction (loop_vec_info loop_info, stmt_vec_info phi_info,
3563 bool *double_reduc, bool *reduc_chain_p)
3565 gphi *phi = as_a <gphi *> (phi_info->stmt);
3566 gimple *phi_use_stmt = NULL;
3567 imm_use_iterator imm_iter;
3568 use_operand_p use_p;
3570 *double_reduc = false;
3571 *reduc_chain_p = false;
3572 STMT_VINFO_REDUC_TYPE (phi_info) = TREE_CODE_REDUCTION;
3574 tree phi_name = PHI_RESULT (phi);
3575 /* ??? If there are no uses of the PHI result the inner loop reduction
3576 won't be detected as possibly double-reduction by vectorizable_reduction
3577 because that tries to walk the PHI arg from the preheader edge which
3578 can be constant. See PR60382. */
3579 if (has_zero_uses (phi_name))
3580 return NULL;
3581 class loop *loop = (gimple_bb (phi))->loop_father;
3582 unsigned nphi_def_loop_uses = 0;
3583 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, phi_name)
3585 gimple *use_stmt = USE_STMT (use_p);
3586 if (is_gimple_debug (use_stmt))
3587 continue;
3589 if (!flow_bb_inside_loop_p (loop, gimple_bb (use_stmt)))
3591 if (dump_enabled_p ())
3592 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3593 "intermediate value used outside loop.\n");
3595 return NULL;
3598 nphi_def_loop_uses++;
3599 phi_use_stmt = use_stmt;
3602 tree latch_def = PHI_ARG_DEF_FROM_EDGE (phi, loop_latch_edge (loop));
3603 if (TREE_CODE (latch_def) != SSA_NAME)
3605 if (dump_enabled_p ())
3606 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3607 "reduction: not ssa_name: %T\n", latch_def);
3608 return NULL;
3611 stmt_vec_info def_stmt_info = loop_info->lookup_def (latch_def);
3612 if (!def_stmt_info
3613 || !flow_bb_inside_loop_p (loop, gimple_bb (def_stmt_info->stmt)))
3614 return NULL;
3616 bool nested_in_vect_loop
3617 = flow_loop_nested_p (LOOP_VINFO_LOOP (loop_info), loop);
3618 unsigned nlatch_def_loop_uses = 0;
3619 auto_vec<gphi *, 3> lcphis;
3620 bool inner_loop_of_double_reduc = false;
3621 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, latch_def)
3623 gimple *use_stmt = USE_STMT (use_p);
3624 if (is_gimple_debug (use_stmt))
3625 continue;
3626 if (flow_bb_inside_loop_p (loop, gimple_bb (use_stmt)))
3627 nlatch_def_loop_uses++;
3628 else
3630 /* We can have more than one loop-closed PHI. */
3631 lcphis.safe_push (as_a <gphi *> (use_stmt));
3632 if (nested_in_vect_loop
3633 && (STMT_VINFO_DEF_TYPE (loop_info->lookup_stmt (use_stmt))
3634 == vect_double_reduction_def))
3635 inner_loop_of_double_reduc = true;
3639   /* If we are vectorizing an inner reduction then we execute it in the
3640      original order only when we are not dealing with a double
3641      reduction.  */
3642 if (nested_in_vect_loop && !inner_loop_of_double_reduc)
3644 if (dump_enabled_p ())
3645 report_vect_op (MSG_NOTE, def_stmt_info->stmt,
3646 "detected nested cycle: ");
3647 return def_stmt_info;
3650 /* If this isn't a nested cycle or if the nested cycle reduction value
3651      is used outside of the inner loop we cannot handle uses of the reduction
3652 value. */
3653 if (nlatch_def_loop_uses > 1 || nphi_def_loop_uses > 1)
3655 if (dump_enabled_p ())
3656 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3657 "reduction used in loop.\n");
3658 return NULL;
3661 /* If DEF_STMT is a phi node itself, we expect it to have a single argument
3662 defined in the inner loop. */
3663 if (gphi *def_stmt = dyn_cast <gphi *> (def_stmt_info->stmt))
3665 tree op1 = PHI_ARG_DEF (def_stmt, 0);
3666 if (gimple_phi_num_args (def_stmt) != 1
3667 || TREE_CODE (op1) != SSA_NAME)
3669 if (dump_enabled_p ())
3670 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3671 "unsupported phi node definition.\n");
3673 return NULL;
3676 gimple *def1 = SSA_NAME_DEF_STMT (op1);
3677 if (gimple_bb (def1)
3678 && flow_bb_inside_loop_p (loop, gimple_bb (def_stmt))
3679 && loop->inner
3680 && flow_bb_inside_loop_p (loop->inner, gimple_bb (def1))
3681 && is_gimple_assign (def1)
3682 && is_a <gphi *> (phi_use_stmt)
3683 && flow_bb_inside_loop_p (loop->inner, gimple_bb (phi_use_stmt)))
3685 if (dump_enabled_p ())
3686 report_vect_op (MSG_NOTE, def_stmt,
3687 "detected double reduction: ");
3689 *double_reduc = true;
3690 return def_stmt_info;
3693 return NULL;
3696   /* Look for the expression computing latch_def from the loop PHI result.  */
3697 auto_vec<std::pair<ssa_op_iter, use_operand_p> > path;
3698 enum tree_code code;
3699 if (check_reduction_path (vect_location, loop, phi, latch_def, &code,
3700 path))
3702 STMT_VINFO_REDUC_CODE (phi_info) = code;
3703 if (code == COND_EXPR && !nested_in_vect_loop)
3704 STMT_VINFO_REDUC_TYPE (phi_info) = COND_REDUCTION;
3706 /* Fill in STMT_VINFO_REDUC_IDX and gather stmts for an SLP
3707 reduction chain for which the additional restriction is that
3708 all operations in the chain are the same. */
3709 auto_vec<stmt_vec_info, 8> reduc_chain;
3710 unsigned i;
3711 bool is_slp_reduc = !nested_in_vect_loop && code != COND_EXPR;
3712 for (i = path.length () - 1; i >= 1; --i)
3714 gimple *stmt = USE_STMT (path[i].second);
3715 stmt_vec_info stmt_info = loop_info->lookup_stmt (stmt);
3716 STMT_VINFO_REDUC_IDX (stmt_info)
3717 = path[i].second->use - gimple_assign_rhs1_ptr (stmt);
3718 enum tree_code stmt_code = gimple_assign_rhs_code (stmt);
3719 bool leading_conversion = (CONVERT_EXPR_CODE_P (stmt_code)
3720 && (i == 1 || i == path.length () - 1));
3721 if ((stmt_code != code && !leading_conversion)
3722 /* We can only handle the final value in epilogue
3723 generation for reduction chains. */
3724 || (i != 1 && !has_single_use (gimple_assign_lhs (stmt))))
3725 is_slp_reduc = false;
3726          /* For reduction chains we support trailing/leading
3727             conversions.  We do not store those in the actual chain.  */
3728 if (leading_conversion)
3729 continue;
3730 reduc_chain.safe_push (stmt_info);
3732 if (is_slp_reduc && reduc_chain.length () > 1)
3734 for (unsigned i = 0; i < reduc_chain.length () - 1; ++i)
3736 REDUC_GROUP_FIRST_ELEMENT (reduc_chain[i]) = reduc_chain[0];
3737 REDUC_GROUP_NEXT_ELEMENT (reduc_chain[i]) = reduc_chain[i+1];
3739 REDUC_GROUP_FIRST_ELEMENT (reduc_chain.last ()) = reduc_chain[0];
3740 REDUC_GROUP_NEXT_ELEMENT (reduc_chain.last ()) = NULL;
3742 /* Save the chain for further analysis in SLP detection. */
3743 LOOP_VINFO_REDUCTION_CHAINS (loop_info).safe_push (reduc_chain[0]);
3744 REDUC_GROUP_SIZE (reduc_chain[0]) = reduc_chain.length ();
3746 *reduc_chain_p = true;
3747 if (dump_enabled_p ())
3748 dump_printf_loc (MSG_NOTE, vect_location,
3749 "reduction: detected reduction chain\n");
3751 else if (dump_enabled_p ())
3752 dump_printf_loc (MSG_NOTE, vect_location,
3753 "reduction: detected reduction\n");
3755 return def_stmt_info;
3758 if (dump_enabled_p ())
3759 dump_printf_loc (MSG_NOTE, vect_location,
3760 "reduction: unknown pattern\n");
3762 return NULL;
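
/* For illustration (an assumed source fragment): a reduction chain, for
   which *REDUC_CHAIN_P is set, applies the same operation several times to
   one accumulator within a single iteration, e.g.

     for (i = 0; i < n; ++i)
       s = s + a[4*i] + a[4*i+1] + a[4*i+2] + a[4*i+3];

   whereas an SLP reduction without a chain uses independent accumulators
   that are only combined after the loop.  */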
3765 /* Estimate the number of peeled epilogue iterations for LOOP_VINFO.
3766 PEEL_ITERS_PROLOGUE is the number of peeled prologue iterations,
3767 or -1 if not known. */
3769 static int
3770 vect_get_peel_iters_epilogue (loop_vec_info loop_vinfo, int peel_iters_prologue)
3772 int assumed_vf = vect_vf_for_cost (loop_vinfo);
3773 if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo) || peel_iters_prologue == -1)
3775 if (dump_enabled_p ())
3776 dump_printf_loc (MSG_NOTE, vect_location,
3777 "cost model: epilogue peel iters set to vf/2 "
3778 "because loop iterations are unknown .\n");
3779 return assumed_vf / 2;
3781 else
3783 int niters = LOOP_VINFO_INT_NITERS (loop_vinfo);
3784 peel_iters_prologue = MIN (niters, peel_iters_prologue);
3785 int peel_iters_epilogue = (niters - peel_iters_prologue) % assumed_vf;
3786 /* If we need to peel for gaps, but no peeling is required, we have to
3787 peel VF iterations. */
3788 if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo) && !peel_iters_epilogue)
3789 peel_iters_epilogue = assumed_vf;
3790 return peel_iters_epilogue;
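
/* Worked example (numbers assumed): with NITERS = 100, an assumed VF of 8
   and 3 prologue iterations peeled for alignment, the epilogue peels
   (100 - 3) % 8 = 1 iteration.  If PEELING_FOR_GAPS were set and the
   remainder were 0, a full VF (here 8) iterations would be peeled
   instead.  */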
3794 /* Calculate cost of peeling the loop PEEL_ITERS_PROLOGUE times. */
3795 int
3796 vect_get_known_peeling_cost (loop_vec_info loop_vinfo, int peel_iters_prologue,
3797 int *peel_iters_epilogue,
3798 stmt_vector_for_cost *scalar_cost_vec,
3799 stmt_vector_for_cost *prologue_cost_vec,
3800 stmt_vector_for_cost *epilogue_cost_vec)
3802 int retval = 0;
3804 *peel_iters_epilogue
3805 = vect_get_peel_iters_epilogue (loop_vinfo, peel_iters_prologue);
3807 if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
3809     /* If peeled iterations are known but the number of scalar loop
3810        iterations is unknown, count a taken branch per peeled loop.  */
3811 if (peel_iters_prologue > 0)
3812 retval = record_stmt_cost (prologue_cost_vec, 1, cond_branch_taken,
3813 NULL, NULL_TREE, 0, vect_prologue);
3814 if (*peel_iters_epilogue > 0)
3815 retval += record_stmt_cost (epilogue_cost_vec, 1, cond_branch_taken,
3816 NULL, NULL_TREE, 0, vect_epilogue);
3819 stmt_info_for_cost *si;
3820 int j;
3821 if (peel_iters_prologue)
3822 FOR_EACH_VEC_ELT (*scalar_cost_vec, j, si)
3823 retval += record_stmt_cost (prologue_cost_vec,
3824 si->count * peel_iters_prologue,
3825 si->kind, si->stmt_info, si->misalign,
3826 vect_prologue);
3827 if (*peel_iters_epilogue)
3828 FOR_EACH_VEC_ELT (*scalar_cost_vec, j, si)
3829 retval += record_stmt_cost (epilogue_cost_vec,
3830 si->count * *peel_iters_epilogue,
3831 si->kind, si->stmt_info, si->misalign,
3832 vect_epilogue);
3834 return retval;
3837 /* Function vect_estimate_min_profitable_iters
3839 Return the number of iterations required for the vector version of the
3840 loop to be profitable relative to the cost of the scalar version of the
3841 loop.
3843 *RET_MIN_PROFITABLE_NITERS is a cost model profitability threshold
3844 of iterations for vectorization. -1 value means loop vectorization
3845 is not profitable. This returned value may be used for dynamic
3846 profitability check.
3848 *RET_MIN_PROFITABLE_ESTIMATE is a profitability threshold to be used
3849 for static check against estimated number of iterations. */
3851 static void
3852 vect_estimate_min_profitable_iters (loop_vec_info loop_vinfo,
3853 int *ret_min_profitable_niters,
3854 int *ret_min_profitable_estimate)
3856 int min_profitable_iters;
3857 int min_profitable_estimate;
3858 int peel_iters_prologue;
3859 int peel_iters_epilogue;
3860 unsigned vec_inside_cost = 0;
3861 int vec_outside_cost = 0;
3862 unsigned vec_prologue_cost = 0;
3863 unsigned vec_epilogue_cost = 0;
3864 int scalar_single_iter_cost = 0;
3865 int scalar_outside_cost = 0;
3866 int assumed_vf = vect_vf_for_cost (loop_vinfo);
3867 int npeel = LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo);
3868 void *target_cost_data = LOOP_VINFO_TARGET_COST_DATA (loop_vinfo);
3870 /* Cost model disabled. */
3871 if (unlimited_cost_model (LOOP_VINFO_LOOP (loop_vinfo)))
3873 if (dump_enabled_p ())
3874 dump_printf_loc (MSG_NOTE, vect_location, "cost model disabled.\n");
3875 *ret_min_profitable_niters = 0;
3876 *ret_min_profitable_estimate = 0;
3877 return;
3880 /* Requires loop versioning tests to handle misalignment. */
3881 if (LOOP_REQUIRES_VERSIONING_FOR_ALIGNMENT (loop_vinfo))
3883 /* FIXME: Make cost depend on complexity of individual check. */
3884 unsigned len = LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).length ();
3885 (void) add_stmt_cost (loop_vinfo, target_cost_data, len, vector_stmt,
3886 NULL, NULL_TREE, 0, vect_prologue);
3887 if (dump_enabled_p ())
3888 dump_printf (MSG_NOTE,
3889 "cost model: Adding cost of checks for loop "
3890 "versioning to treat misalignment.\n");
3893 /* Requires loop versioning with alias checks. */
3894 if (LOOP_REQUIRES_VERSIONING_FOR_ALIAS (loop_vinfo))
3896 /* FIXME: Make cost depend on complexity of individual check. */
3897 unsigned len = LOOP_VINFO_COMP_ALIAS_DDRS (loop_vinfo).length ();
3898 (void) add_stmt_cost (loop_vinfo, target_cost_data, len, vector_stmt,
3899 NULL, NULL_TREE, 0, vect_prologue);
3900 len = LOOP_VINFO_CHECK_UNEQUAL_ADDRS (loop_vinfo).length ();
3901 if (len)
3902 /* Count LEN - 1 ANDs and LEN comparisons. */
3903 (void) add_stmt_cost (loop_vinfo, target_cost_data, len * 2 - 1,
3904 scalar_stmt, NULL, NULL_TREE, 0, vect_prologue);
3905 len = LOOP_VINFO_LOWER_BOUNDS (loop_vinfo).length ();
3906 if (len)
3908 /* Count LEN - 1 ANDs and LEN comparisons. */
3909 unsigned int nstmts = len * 2 - 1;
3910 /* +1 for each bias that needs adding. */
3911 for (unsigned int i = 0; i < len; ++i)
3912 if (!LOOP_VINFO_LOWER_BOUNDS (loop_vinfo)[i].unsigned_p)
3913 nstmts += 1;
3914 (void) add_stmt_cost (loop_vinfo, target_cost_data, nstmts,
3915 scalar_stmt, NULL, NULL_TREE, 0, vect_prologue);
3917 if (dump_enabled_p ())
3918 dump_printf (MSG_NOTE,
3919 "cost model: Adding cost of checks for loop "
3920 "versioning aliasing.\n");
3923 /* Requires loop versioning with niter checks. */
3924 if (LOOP_REQUIRES_VERSIONING_FOR_NITERS (loop_vinfo))
3926 /* FIXME: Make cost depend on complexity of individual check. */
3927 (void) add_stmt_cost (loop_vinfo, target_cost_data, 1, vector_stmt,
3928 NULL, NULL_TREE, 0, vect_prologue);
3929 if (dump_enabled_p ())
3930 dump_printf (MSG_NOTE,
3931 "cost model: Adding cost of checks for loop "
3932 "versioning niters.\n");
3935 if (LOOP_REQUIRES_VERSIONING (loop_vinfo))
3936 (void) add_stmt_cost (loop_vinfo, target_cost_data, 1, cond_branch_taken,
3937 NULL, NULL_TREE, 0, vect_prologue);
3939 /* Count statements in scalar loop. Using this as scalar cost for a single
3940 iteration for now.
3942 TODO: Add outer loop support.
3944 TODO: Consider assigning different costs to different scalar
3945 statements. */
3947 scalar_single_iter_cost
3948 = LOOP_VINFO_SINGLE_SCALAR_ITERATION_COST (loop_vinfo);
3950 /* Add additional cost for the peeled instructions in prologue and epilogue
3951 loop. (For fully-masked loops there will be no peeling.)
3953 FORNOW: If we don't know the value of peel_iters for prologue or epilogue
3954     at compile time, we assume it's vf/2 (the worst would be vf-1).
3956 TODO: Build an expression that represents peel_iters for prologue and
3957 epilogue to be used in a run-time test. */
3959 bool prologue_need_br_taken_cost = false;
3960 bool prologue_need_br_not_taken_cost = false;
3962 /* Calculate peel_iters_prologue. */
3963 if (vect_use_loop_mask_for_alignment_p (loop_vinfo))
3964 peel_iters_prologue = 0;
3965 else if (npeel < 0)
3967 peel_iters_prologue = assumed_vf / 2;
3968 if (dump_enabled_p ())
3969 dump_printf (MSG_NOTE, "cost model: "
3970 "prologue peel iters set to vf/2.\n");
3972 /* If peeled iterations are unknown, count a taken branch and a not taken
3973 branch per peeled loop. Even if scalar loop iterations are known,
3974 vector iterations are not known since peeled prologue iterations are
3975 not known. Hence guards remain the same. */
3976 prologue_need_br_taken_cost = true;
3977 prologue_need_br_not_taken_cost = true;
3979 else
3981 peel_iters_prologue = npeel;
3982 if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo) && peel_iters_prologue > 0)
3983        /* If peeled iterations are known but the number of scalar loop
3984           iterations is unknown, count a taken branch per peeled loop.  */
3985 prologue_need_br_taken_cost = true;
3988 bool epilogue_need_br_taken_cost = false;
3989 bool epilogue_need_br_not_taken_cost = false;
3991 /* Calculate peel_iters_epilogue. */
3992 if (LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))
3993 /* We need to peel exactly one iteration for gaps. */
3994 peel_iters_epilogue = LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo) ? 1 : 0;
3995 else if (npeel < 0)
3997       /* If peeling for alignment is unknown, the loop bound of the main
3998          loop becomes unknown.  */
3999 peel_iters_epilogue = assumed_vf / 2;
4000 if (dump_enabled_p ())
4001 dump_printf (MSG_NOTE, "cost model: "
4002 "epilogue peel iters set to vf/2 because "
4003 "peeling for alignment is unknown.\n");
4005 /* See the same reason above in peel_iters_prologue calculation. */
4006 epilogue_need_br_taken_cost = true;
4007 epilogue_need_br_not_taken_cost = true;
4009 else
4011 peel_iters_epilogue = vect_get_peel_iters_epilogue (loop_vinfo, npeel);
4012 if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo) && peel_iters_epilogue > 0)
4013        /* If peeled iterations are known but the number of scalar loop
4014           iterations is unknown, count a taken branch per peeled loop.  */
4015 epilogue_need_br_taken_cost = true;
4018 stmt_info_for_cost *si;
4019 int j;
4020 /* Add costs associated with peel_iters_prologue. */
4021 if (peel_iters_prologue)
4022 FOR_EACH_VEC_ELT (LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo), j, si)
4024 (void) add_stmt_cost (loop_vinfo, target_cost_data,
4025 si->count * peel_iters_prologue, si->kind,
4026 si->stmt_info, si->vectype, si->misalign,
4027 vect_prologue);
4030 /* Add costs associated with peel_iters_epilogue. */
4031 if (peel_iters_epilogue)
4032 FOR_EACH_VEC_ELT (LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo), j, si)
4034 (void) add_stmt_cost (loop_vinfo, target_cost_data,
4035 si->count * peel_iters_epilogue, si->kind,
4036 si->stmt_info, si->vectype, si->misalign,
4037 vect_epilogue);
4040 /* Add possible cond_branch_taken/cond_branch_not_taken cost. */
4042 if (prologue_need_br_taken_cost)
4043 (void) add_stmt_cost (loop_vinfo, target_cost_data, 1, cond_branch_taken,
4044 NULL, NULL_TREE, 0, vect_prologue);
4046 if (prologue_need_br_not_taken_cost)
4047 (void) add_stmt_cost (loop_vinfo, target_cost_data, 1,
4048 cond_branch_not_taken, NULL, NULL_TREE, 0,
4049 vect_prologue);
4051 if (epilogue_need_br_taken_cost)
4052 (void) add_stmt_cost (loop_vinfo, target_cost_data, 1, cond_branch_taken,
4053 NULL, NULL_TREE, 0, vect_epilogue);
4055 if (epilogue_need_br_not_taken_cost)
4056 (void) add_stmt_cost (loop_vinfo, target_cost_data, 1,
4057 cond_branch_not_taken, NULL, NULL_TREE, 0,
4058 vect_epilogue);
4060 /* Take care of special costs for rgroup controls of partial vectors. */
4061 if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
4063 /* Calculate how many masks we need to generate. */
4064 unsigned int num_masks = 0;
4065 rgroup_controls *rgm;
4066 unsigned int num_vectors_m1;
4067 FOR_EACH_VEC_ELT (LOOP_VINFO_MASKS (loop_vinfo), num_vectors_m1, rgm)
4068 if (rgm->type)
4069 num_masks += num_vectors_m1 + 1;
4070 gcc_assert (num_masks > 0);
4072 /* In the worst case, we need to generate each mask in the prologue
4073 and in the loop body. One of the loop body mask instructions
4074 replaces the comparison in the scalar loop, and since we don't
4075 count the scalar comparison against the scalar body, we shouldn't
4076 count that vector instruction against the vector body either.
4078 Sometimes we can use unpacks instead of generating prologue
4079 masks and sometimes the prologue mask will fold to a constant,
4080 so the actual prologue cost might be smaller. However, it's
4081 simpler and safer to use the worst-case cost; if this ends up
4082 being the tie-breaker between vectorizing or not, then it's
4083 probably better not to vectorize. */
4084 (void) add_stmt_cost (loop_vinfo, target_cost_data, num_masks,
4085 vector_stmt, NULL, NULL_TREE, 0, vect_prologue);
4086 (void) add_stmt_cost (loop_vinfo, target_cost_data, num_masks - 1,
4087 vector_stmt, NULL, NULL_TREE, 0, vect_body);
4089 else if (LOOP_VINFO_FULLY_WITH_LENGTH_P (loop_vinfo))
4091 /* Referring to the functions vect_set_loop_condition_partial_vectors
4092 and vect_set_loop_controls_directly, we need to generate each
4093 length in the prologue and in the loop body if required. Although
4094 there are some possible optimizations, we consider the worst case
4095 here. */
4097 bool niters_known_p = LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo);
4098 bool need_iterate_p
4099 = (!LOOP_VINFO_EPILOGUE_P (loop_vinfo)
4100 && !vect_known_niters_smaller_than_vf (loop_vinfo));
4102 /* Calculate how many statements to be added. */
4103 unsigned int prologue_stmts = 0;
4104 unsigned int body_stmts = 0;
4106 rgroup_controls *rgc;
4107 unsigned int num_vectors_m1;
4108 FOR_EACH_VEC_ELT (LOOP_VINFO_LENS (loop_vinfo), num_vectors_m1, rgc)
4109 if (rgc->type)
4111 /* May need one SHIFT for nitems_total computation. */
4112 unsigned nitems = rgc->max_nscalars_per_iter * rgc->factor;
4113 if (nitems != 1 && !niters_known_p)
4114 prologue_stmts += 1;
4116 /* May need one MAX and one MINUS for wrap around. */
4117 if (vect_rgroup_iv_might_wrap_p (loop_vinfo, rgc))
4118 prologue_stmts += 2;
4120            /* Need one MAX and one MINUS for each batch limit except for
4121               the first one.  */
4122 prologue_stmts += num_vectors_m1 * 2;
4124 unsigned int num_vectors = num_vectors_m1 + 1;
4126 /* Need to set up lengths in prologue, only one MIN required
4127 for each since start index is zero. */
4128 prologue_stmts += num_vectors;
4130 /* Each may need two MINs and one MINUS to update lengths in body
4131 for next iteration. */
4132 if (need_iterate_p)
4133 body_stmts += 3 * num_vectors;
4136 (void) add_stmt_cost (loop_vinfo, target_cost_data, prologue_stmts,
4137 scalar_stmt, NULL, NULL_TREE, 0, vect_prologue);
4138 (void) add_stmt_cost (loop_vinfo, target_cost_data, body_stmts,
4139 scalar_stmt, NULL, NULL_TREE, 0, vect_body);
4142 /* FORNOW: The scalar outside cost is incremented in one of the
4143 following ways:
4145 1. The vectorizer checks for alignment and aliasing and generates
4146 a condition that allows dynamic vectorization. A cost model
4147 check is ANDED with the versioning condition. Hence scalar code
4148 path now has the added cost of the versioning check.
4150 if (cost > th & versioning_check)
4151 jmp to vector code
4153 Hence run-time scalar is incremented by not-taken branch cost.
4155 2. The vectorizer then checks if a prologue is required. If the
4156 cost model check was not done before during versioning, it has to
4157 be done before the prologue check.
4159 if (cost <= th)
4160 prologue = scalar_iters
4161 if (prologue == 0)
4162 jmp to vector code
4163 else
4164 execute prologue
4165 if (prologue == num_iters)
4166 go to exit
4168 Hence the run-time scalar cost is incremented by a taken branch,
4169 plus a not-taken branch, plus a taken branch cost.
4171 3. The vectorizer then checks if an epilogue is required. If the
4172 cost model check was not done before during prologue check, it
4173 has to be done with the epilogue check.
4175 if (prologue == 0)
4176 jmp to vector code
4177 else
4178 execute prologue
4179 if (prologue == num_iters)
4180 go to exit
4181 vector code:
4182 if ((cost <= th) | (scalar_iters-prologue-epilogue == 0))
4183 jmp to epilogue
4185 Hence the run-time scalar cost should be incremented by 2 taken
4186 branches.
4188     TODO: The back end may reorder the BBs differently and reverse
4189 conditions/branch directions. Change the estimates below to
4190 something more reasonable. */
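
/* Worked example (branch costs assumed; they are target-specific): with
   cond_branch_taken = 3 and cond_branch_not_taken = 1, the code below
   charges the scalar path 1 when the cost check is folded into the loop
   versioning test, 2*3 + 1 = 7 when it is emitted together with a prologue
   check for unknown peeling, and 2*3 = 6 when it is emitted with the
   epilogue check.  */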
4192 /* If the number of iterations is known and we do not do versioning, we can
4193 decide whether to vectorize at compile time. Hence the scalar version
4194     does not carry cost model guard costs.  */
4195 if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
4196 || LOOP_REQUIRES_VERSIONING (loop_vinfo))
4198 /* Cost model check occurs at versioning. */
4199 if (LOOP_REQUIRES_VERSIONING (loop_vinfo))
4200 scalar_outside_cost += vect_get_stmt_cost (cond_branch_not_taken);
4201 else
4203 /* Cost model check occurs at prologue generation. */
4204 if (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) < 0)
4205 scalar_outside_cost += 2 * vect_get_stmt_cost (cond_branch_taken)
4206 + vect_get_stmt_cost (cond_branch_not_taken);
4207 /* Cost model check occurs at epilogue generation. */
4208 else
4209 scalar_outside_cost += 2 * vect_get_stmt_cost (cond_branch_taken);
4213 /* Complete the target-specific cost calculations. */
4214 finish_cost (LOOP_VINFO_TARGET_COST_DATA (loop_vinfo), &vec_prologue_cost,
4215 &vec_inside_cost, &vec_epilogue_cost);
4217 vec_outside_cost = (int)(vec_prologue_cost + vec_epilogue_cost);
4219 /* Stash the costs so that we can compare two loop_vec_infos. */
4220 loop_vinfo->vec_inside_cost = vec_inside_cost;
4221 loop_vinfo->vec_outside_cost = vec_outside_cost;
4223 if (dump_enabled_p ())
4225 dump_printf_loc (MSG_NOTE, vect_location, "Cost model analysis: \n");
4226 dump_printf (MSG_NOTE, " Vector inside of loop cost: %d\n",
4227 vec_inside_cost);
4228 dump_printf (MSG_NOTE, " Vector prologue cost: %d\n",
4229 vec_prologue_cost);
4230 dump_printf (MSG_NOTE, " Vector epilogue cost: %d\n",
4231 vec_epilogue_cost);
4232 dump_printf (MSG_NOTE, " Scalar iteration cost: %d\n",
4233 scalar_single_iter_cost);
4234 dump_printf (MSG_NOTE, " Scalar outside cost: %d\n",
4235 scalar_outside_cost);
4236 dump_printf (MSG_NOTE, " Vector outside cost: %d\n",
4237 vec_outside_cost);
4238 dump_printf (MSG_NOTE, " prologue iterations: %d\n",
4239 peel_iters_prologue);
4240 dump_printf (MSG_NOTE, " epilogue iterations: %d\n",
4241 peel_iters_epilogue);
4244 /* Calculate number of iterations required to make the vector version
4245 profitable, relative to the loop bodies only. The following condition
4246 must hold true:
4247 SIC * niters + SOC > VIC * ((niters - NPEEL) / VF) + VOC
4248 where
4249 SIC = scalar iteration cost, VIC = vector iteration cost,
4250 VOC = vector outside cost, VF = vectorization factor,
4251 NPEEL = prologue iterations + epilogue iterations,
4252 SOC = scalar outside cost for run time cost model check. */
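
/* Worked example (all costs assumed, NPEEL = 0): with SIC = 4, VIC = 12,
   VF = 4, VOC = 30 and SOC = 6, each vector iteration saves
   SIC * VF - VIC = 4 units and the two sides meet at niters = 24, where
   SIC * 24 + SOC = 102 equals VIC * (24 / VF) + VOC = 72 + 30 = 102;
   so at least 25 scalar iterations are needed for the vector variant
   to win.  */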
4254 int saving_per_viter = (scalar_single_iter_cost * assumed_vf
4255 - vec_inside_cost);
4256 if (saving_per_viter <= 0)
4258 if (LOOP_VINFO_LOOP (loop_vinfo)->force_vectorize)
4259 warning_at (vect_location.get_location_t (), OPT_Wopenmp_simd,
4260 "vectorization did not happen for a simd loop");
4262 if (dump_enabled_p ())
4263 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4264 "cost model: the vector iteration cost = %d "
4265 "divided by the scalar iteration cost = %d "
4266 "is greater or equal to the vectorization factor = %d"
4267 ".\n",
4268 vec_inside_cost, scalar_single_iter_cost, assumed_vf);
4269 *ret_min_profitable_niters = -1;
4270 *ret_min_profitable_estimate = -1;
4271 return;
4274 /* ??? The "if" arm is written to handle all cases; see below for what
4275 we would do for !LOOP_VINFO_USING_PARTIAL_VECTORS_P. */
4276 if (LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))
4278 /* Rewriting the condition above in terms of the number of
4279 vector iterations (vniters) rather than the number of
4280 scalar iterations (niters) gives:
4282 SIC * (vniters * VF + NPEEL) + SOC > VIC * vniters + VOC
4284 <==> vniters * (SIC * VF - VIC) > VOC - SIC * NPEEL - SOC
4286 For integer N, X and Y when X > 0:
4288 N * X > Y <==> N >= (Y /[floor] X) + 1. */
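
      /* E.g. (numbers assumed) Y = 10, X = 4: N * 4 > 10 first holds at
         N = 10 / 4 + 1 = 3, since 2 * 4 = 8 <= 10 but 3 * 4 = 12 > 10.  */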
4289 int outside_overhead = (vec_outside_cost
4290 - scalar_single_iter_cost * peel_iters_prologue
4291 - scalar_single_iter_cost * peel_iters_epilogue
4292 - scalar_outside_cost);
4293 /* We're only interested in cases that require at least one
4294 vector iteration. */
4295 int min_vec_niters = 1;
4296 if (outside_overhead > 0)
4297 min_vec_niters = outside_overhead / saving_per_viter + 1;
4299 if (dump_enabled_p ())
4300 dump_printf (MSG_NOTE, " Minimum number of vector iterations: %d\n",
4301 min_vec_niters);
4303 if (LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))
4305 /* Now that we know the minimum number of vector iterations,
4306 find the minimum niters for which the scalar cost is larger:
4308 SIC * niters > VIC * vniters + VOC - SOC
4310 We know that the minimum niters is no more than
4311 vniters * VF + NPEEL, but it might be (and often is) less
4312 than that if a partial vector iteration is cheaper than the
4313 equivalent scalar code. */
4314 int threshold = (vec_inside_cost * min_vec_niters
4315 + vec_outside_cost
4316 - scalar_outside_cost);
4317 if (threshold <= 0)
4318 min_profitable_iters = 1;
4319 else
4320 min_profitable_iters = threshold / scalar_single_iter_cost + 1;
4322 else
4323 /* Convert the number of vector iterations into a number of
4324 scalar iterations. */
4325 min_profitable_iters = (min_vec_niters * assumed_vf
4326 + peel_iters_prologue
4327 + peel_iters_epilogue);
4329 else
4331 min_profitable_iters = ((vec_outside_cost - scalar_outside_cost)
4332 * assumed_vf
4333 - vec_inside_cost * peel_iters_prologue
4334 - vec_inside_cost * peel_iters_epilogue);
4335 if (min_profitable_iters <= 0)
4336 min_profitable_iters = 0;
4337 else
4339 min_profitable_iters /= saving_per_viter;
4341 if ((scalar_single_iter_cost * assumed_vf * min_profitable_iters)
4342 <= (((int) vec_inside_cost * min_profitable_iters)
4343 + (((int) vec_outside_cost - scalar_outside_cost)
4344 * assumed_vf)))
4345 min_profitable_iters++;
4349 if (dump_enabled_p ())
4350 dump_printf (MSG_NOTE,
4351 " Calculated minimum iters for profitability: %d\n",
4352 min_profitable_iters);
4354 if (!LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo)
4355 && min_profitable_iters < (assumed_vf + peel_iters_prologue))
4356 /* We want the vectorized loop to execute at least once. */
4357 min_profitable_iters = assumed_vf + peel_iters_prologue;
4358 else if (min_profitable_iters < peel_iters_prologue)
4359 /* For LOOP_VINFO_USING_PARTIAL_VECTORS_P, we need to ensure the
4360 vectorized loop executes at least once. */
4361 min_profitable_iters = peel_iters_prologue;
4363 if (dump_enabled_p ())
4364 dump_printf_loc (MSG_NOTE, vect_location,
4365 " Runtime profitability threshold = %d\n",
4366 min_profitable_iters);
4368 *ret_min_profitable_niters = min_profitable_iters;
4370 /* Calculate number of iterations required to make the vector version
4371 profitable, relative to the loop bodies only.
4373     The non-vectorized variant costs SIC * niters and must win over the vector
4374     variant on the expected loop trip count.  The following condition must hold:
4375 SIC * niters > VIC * ((niters - NPEEL) / VF) + VOC + SOC */
4377 if (vec_outside_cost <= 0)
4378 min_profitable_estimate = 0;
4379 /* ??? This "else if" arm is written to handle all cases; see below for
4380 what we would do for !LOOP_VINFO_USING_PARTIAL_VECTORS_P. */
4381 else if (LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))
4383 /* This is a repeat of the code above, but with + SOC rather
4384 than - SOC. */
4385 int outside_overhead = (vec_outside_cost
4386 - scalar_single_iter_cost * peel_iters_prologue
4387 - scalar_single_iter_cost * peel_iters_epilogue
4388 + scalar_outside_cost);
4389 int min_vec_niters = 1;
4390 if (outside_overhead > 0)
4391 min_vec_niters = outside_overhead / saving_per_viter + 1;
4393 if (LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))
4395 int threshold = (vec_inside_cost * min_vec_niters
4396 + vec_outside_cost
4397 + scalar_outside_cost);
4398 min_profitable_estimate = threshold / scalar_single_iter_cost + 1;
4400 else
4401 min_profitable_estimate = (min_vec_niters * assumed_vf
4402 + peel_iters_prologue
4403 + peel_iters_epilogue);
4405 else
4407 min_profitable_estimate = ((vec_outside_cost + scalar_outside_cost)
4408 * assumed_vf
4409 - vec_inside_cost * peel_iters_prologue
4410 - vec_inside_cost * peel_iters_epilogue)
4411 / ((scalar_single_iter_cost * assumed_vf)
4412 - vec_inside_cost);
4414 min_profitable_estimate = MAX (min_profitable_estimate, min_profitable_iters);
4415 if (dump_enabled_p ())
4416 dump_printf_loc (MSG_NOTE, vect_location,
4417 " Static estimate profitability threshold = %d\n",
4418 min_profitable_estimate);
4420 *ret_min_profitable_estimate = min_profitable_estimate;
4423 /* Writes into SEL a mask for a vec_perm, equivalent to a vec_shr by OFFSET
4424 vector elements (not bits) for a vector with NELT elements. */
4425 static void
4426 calc_vec_perm_mask_for_shift (unsigned int offset, unsigned int nelt,
4427 vec_perm_builder *sel)
4429 /* The encoding is a single stepped pattern. Any wrap-around is handled
4430 by vec_perm_indices. */
4431 sel->new_vector (nelt, 1, 3);
4432 for (unsigned int i = 0; i < 3; i++)
4433 sel->quick_push (i + offset);
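
/* Example (NELT and OFFSET assumed): for OFFSET = 2 and NELT = 8 the
   stepped encoding pushes { 2, 3, 4 }, which vec_perm_indices expands to
   { 2, 3, 4, 5, 6, 7, 8, 9 }, i.e. a whole-vector shift down by two
   elements; indices 8 and 9 refer to the second vector operand of the
   permutation.  */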
4436 /* Checks whether the target supports whole-vector shifts for vectors of mode
4437 MODE. This is the case if _either_ the platform handles vec_shr_optab, _or_
4438 it supports vec_perm_const with masks for all necessary shift amounts. */
4439 static bool
4440 have_whole_vector_shift (machine_mode mode)
4442 if (optab_handler (vec_shr_optab, mode) != CODE_FOR_nothing)
4443 return true;
4445 /* Variable-length vectors should be handled via the optab. */
4446 unsigned int nelt;
4447 if (!GET_MODE_NUNITS (mode).is_constant (&nelt))
4448 return false;
4450 vec_perm_builder sel;
4451 vec_perm_indices indices;
4452 for (unsigned int i = nelt / 2; i >= 1; i /= 2)
4454 calc_vec_perm_mask_for_shift (i, nelt, &sel);
4455 indices.new_vector (sel, 2, nelt);
4456 if (!can_vec_perm_const_p (mode, indices, false))
4457 return false;
4459 return true;
4462 /* TODO: Close dependency between vect_model_*_cost and vectorizable_*
4463 functions. Design better to avoid maintenance issues. */
4465 /* Function vect_model_reduction_cost.
4467 Models cost for a reduction operation, including the vector ops
4468 generated within the strip-mine loop in some cases, the initial
4469 definition before the loop, and the epilogue code that must be generated. */
4471 static void
4472 vect_model_reduction_cost (loop_vec_info loop_vinfo,
4473 stmt_vec_info stmt_info, internal_fn reduc_fn,
4474 vect_reduction_type reduction_type,
4475 int ncopies, stmt_vector_for_cost *cost_vec)
4477 int prologue_cost = 0, epilogue_cost = 0, inside_cost = 0;
4478 enum tree_code code;
4479 optab optab;
4480 tree vectype;
4481 machine_mode mode;
4482 class loop *loop = NULL;
4484 if (loop_vinfo)
4485 loop = LOOP_VINFO_LOOP (loop_vinfo);
4487 /* Condition reductions generate two reductions in the loop. */
4488 if (reduction_type == COND_REDUCTION)
4489 ncopies *= 2;
4491 vectype = STMT_VINFO_VECTYPE (stmt_info);
4492 mode = TYPE_MODE (vectype);
4493 stmt_vec_info orig_stmt_info = vect_orig_stmt (stmt_info);
4495 code = gimple_assign_rhs_code (orig_stmt_info->stmt);
4497 if (reduction_type == EXTRACT_LAST_REDUCTION)
4498 /* No extra instructions are needed in the prologue. The loop body
4499 operations are costed in vectorizable_condition. */
4500 inside_cost = 0;
4501 else if (reduction_type == FOLD_LEFT_REDUCTION)
4503 /* No extra instructions needed in the prologue. */
4504 prologue_cost = 0;
4506 if (reduc_fn != IFN_LAST)
4507 /* Count one reduction-like operation per vector. */
4508 inside_cost = record_stmt_cost (cost_vec, ncopies, vec_to_scalar,
4509 stmt_info, 0, vect_body);
4510 else
4512 /* Use NELEMENTS extracts and NELEMENTS scalar ops. */
4513 unsigned int nelements = ncopies * vect_nunits_for_cost (vectype);
4514 inside_cost = record_stmt_cost (cost_vec, nelements,
4515 vec_to_scalar, stmt_info, 0,
4516 vect_body);
4517 inside_cost += record_stmt_cost (cost_vec, nelements,
4518 scalar_stmt, stmt_info, 0,
4519 vect_body);
4522 else
4524 /* Add in cost for initial definition.
4525 For cond reduction we have four vectors: initial index, step,
4526 initial result of the data reduction, initial value of the index
4527 reduction. */
4528 int prologue_stmts = reduction_type == COND_REDUCTION ? 4 : 1;
4529 prologue_cost += record_stmt_cost (cost_vec, prologue_stmts,
4530 scalar_to_vec, stmt_info, 0,
4531 vect_prologue);
4534 /* Determine cost of epilogue code.
4536 We have a reduction operator that will reduce the vector in one statement.
4537 Also requires scalar extract. */
4539 if (!loop || !nested_in_vect_loop_p (loop, orig_stmt_info))
4541 if (reduc_fn != IFN_LAST)
4543 if (reduction_type == COND_REDUCTION)
4545              /* An EQ stmt and a COND_EXPR stmt.  */
4546 epilogue_cost += record_stmt_cost (cost_vec, 2,
4547 vector_stmt, stmt_info, 0,
4548 vect_epilogue);
4549 /* Reduction of the max index and a reduction of the found
4550 values. */
4551 epilogue_cost += record_stmt_cost (cost_vec, 2,
4552 vec_to_scalar, stmt_info, 0,
4553 vect_epilogue);
4554 /* A broadcast of the max value. */
4555 epilogue_cost += record_stmt_cost (cost_vec, 1,
4556 scalar_to_vec, stmt_info, 0,
4557 vect_epilogue);
4559 else
4561 epilogue_cost += record_stmt_cost (cost_vec, 1, vector_stmt,
4562 stmt_info, 0, vect_epilogue);
4563 epilogue_cost += record_stmt_cost (cost_vec, 1,
4564 vec_to_scalar, stmt_info, 0,
4565 vect_epilogue);
4568 else if (reduction_type == COND_REDUCTION)
4570 unsigned estimated_nunits = vect_nunits_for_cost (vectype);
4571 /* Extraction of scalar elements. */
4572 epilogue_cost += record_stmt_cost (cost_vec,
4573 2 * estimated_nunits,
4574 vec_to_scalar, stmt_info, 0,
4575 vect_epilogue);
4576 /* Scalar max reductions via COND_EXPR / MAX_EXPR. */
4577 epilogue_cost += record_stmt_cost (cost_vec,
4578 2 * estimated_nunits - 3,
4579 scalar_stmt, stmt_info, 0,
4580 vect_epilogue);
4582 else if (reduction_type == EXTRACT_LAST_REDUCTION
4583 || reduction_type == FOLD_LEFT_REDUCTION)
4584    /* No extra instructions are needed in the epilogue.  */
4586 else
4588 int vec_size_in_bits = tree_to_uhwi (TYPE_SIZE (vectype));
4589 tree bitsize =
4590 TYPE_SIZE (TREE_TYPE (gimple_assign_lhs (orig_stmt_info->stmt)));
4591 int element_bitsize = tree_to_uhwi (bitsize);
4592 int nelements = vec_size_in_bits / element_bitsize;
4594 if (code == COND_EXPR)
4595 code = MAX_EXPR;
4597 optab = optab_for_tree_code (code, vectype, optab_default);
4599 /* We have a whole vector shift available. */
4600 if (optab != unknown_optab
4601 && VECTOR_MODE_P (mode)
4602 && optab_handler (optab, mode) != CODE_FOR_nothing
4603 && have_whole_vector_shift (mode))
4605 /* Final reduction via vector shifts and the reduction operator.
4606 Also requires scalar extract. */
4607 epilogue_cost += record_stmt_cost (cost_vec,
4608 exact_log2 (nelements) * 2,
4609 vector_stmt, stmt_info, 0,
4610 vect_epilogue);
4611 epilogue_cost += record_stmt_cost (cost_vec, 1,
4612 vec_to_scalar, stmt_info, 0,
4613 vect_epilogue);
4615 else
4616 /* Use extracts and reduction op for final reduction. For N
4617 elements, we have N extracts and N-1 reduction ops. */
4618 epilogue_cost += record_stmt_cost (cost_vec,
4619 nelements + nelements - 1,
4620 vector_stmt, stmt_info, 0,
4621 vect_epilogue);
4625 if (dump_enabled_p ())
4626 dump_printf (MSG_NOTE,
4627 "vect_model_reduction_cost: inside_cost = %d, "
4628 "prologue_cost = %d, epilogue_cost = %d .\n", inside_cost,
4629 prologue_cost, epilogue_cost);
4632 /* SEQ is a sequence of instructions that initialize the reduction
4633 described by REDUC_INFO. Emit them in the appropriate place. */
4635 static void
4636 vect_emit_reduction_init_stmts (loop_vec_info loop_vinfo,
4637 stmt_vec_info reduc_info, gimple *seq)
4639 if (reduc_info->reused_accumulator)
4641 /* When reusing an accumulator from the main loop, we only need
4642 initialization instructions if the main loop can be skipped.
4643 In that case, emit the initialization instructions at the end
4644 of the guard block that does the skip. */
4645 edge skip_edge = loop_vinfo->skip_main_loop_edge;
4646 gcc_assert (skip_edge);
4647 gimple_stmt_iterator gsi = gsi_last_bb (skip_edge->src);
4648 gsi_insert_seq_before (&gsi, seq, GSI_SAME_STMT);
4650 else
4652 /* The normal case: emit the initialization instructions on the
4653 preheader edge. */
4654 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
4655 gsi_insert_seq_on_edge_immediate (loop_preheader_edge (loop), seq);
4659 /* Function get_initial_def_for_reduction
4661 Input:
4662 REDUC_INFO - the info_for_reduction
4663 INIT_VAL - the initial value of the reduction variable
4664 NEUTRAL_OP - a value that has no effect on the reduction, as per
4665 neutral_op_for_reduction
4667 Output:
4668 Return a vector variable, initialized according to the operation that
4669 STMT_VINFO performs. This vector will be used as the initial value
4670 of the vector of partial results.
4672 The value we need is a vector in which element 0 has value INIT_VAL
4673 and every other element has value NEUTRAL_OP. */
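
/* Example (values assumed): for a PLUS reduction with INIT_VAL s0 the
   neutral value is 0, so a 4-element initial def is { s0, 0, 0, 0 };
   for MIN/MAX the neutral value is INIT_VAL itself, so the initial def
   degenerates into the splat { s0, s0, s0, s0 } built on the
   operand_equal_p path below.  */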
4675 static tree
4676 get_initial_def_for_reduction (loop_vec_info loop_vinfo,
4677 stmt_vec_info reduc_info,
4678 tree init_val, tree neutral_op)
4680 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
4681 tree scalar_type = TREE_TYPE (init_val);
4682 tree vectype = get_vectype_for_scalar_type (loop_vinfo, scalar_type);
4683 tree init_def;
4684 gimple_seq stmts = NULL;
4686 gcc_assert (vectype);
4688 gcc_assert (POINTER_TYPE_P (scalar_type) || INTEGRAL_TYPE_P (scalar_type)
4689 || SCALAR_FLOAT_TYPE_P (scalar_type));
4691 gcc_assert (nested_in_vect_loop_p (loop, reduc_info)
4692 || loop == (gimple_bb (reduc_info->stmt))->loop_father);
4694 if (operand_equal_p (init_val, neutral_op))
4696 /* If both elements are equal then the vector described above is
4697 just a splat. */
4698 neutral_op = gimple_convert (&stmts, TREE_TYPE (vectype), neutral_op);
4699 init_def = gimple_build_vector_from_val (&stmts, vectype, neutral_op);
4701 else
4703 neutral_op = gimple_convert (&stmts, TREE_TYPE (vectype), neutral_op);
4704 init_val = gimple_convert (&stmts, TREE_TYPE (vectype), init_val);
4705 if (!TYPE_VECTOR_SUBPARTS (vectype).is_constant ())
4707 /* Construct a splat of NEUTRAL_OP and insert INIT_VAL into
4708 element 0. */
4709 init_def = gimple_build_vector_from_val (&stmts, vectype,
4710 neutral_op);
4711 init_def = gimple_build (&stmts, CFN_VEC_SHL_INSERT,
4712 vectype, init_def, init_val);
4714 else
4716 /* Build {INIT_VAL, NEUTRAL_OP, NEUTRAL_OP, ...}. */
4717 tree_vector_builder elts (vectype, 1, 2);
4718 elts.quick_push (init_val);
4719 elts.quick_push (neutral_op);
4720 init_def = gimple_build_vector (&stmts, &elts);
4724 if (stmts)
4725 vect_emit_reduction_init_stmts (loop_vinfo, reduc_info, stmts);
4726 return init_def;
4729 /* Get at the initial defs for the reduction PHIs for REDUC_INFO,
4730 which performs a reduction involving GROUP_SIZE scalar statements.
4731 NUMBER_OF_VECTORS is the number of vector defs to create. If NEUTRAL_OP
4732 is nonnull, introducing extra elements of that value will not change the
4733 result. */
4735 static void
4736 get_initial_defs_for_reduction (loop_vec_info loop_vinfo,
4737 stmt_vec_info reduc_info,
4738 vec<tree> *vec_oprnds,
4739 unsigned int number_of_vectors,
4740 unsigned int group_size, tree neutral_op)
4742 vec<tree> &initial_values = reduc_info->reduc_initial_values;
4743 unsigned HOST_WIDE_INT nunits;
4744 unsigned j, number_of_places_left_in_vector;
4745 tree vector_type = STMT_VINFO_VECTYPE (reduc_info);
4746 unsigned int i;
4748 gcc_assert (group_size == initial_values.length () || neutral_op);
4750 /* NUMBER_OF_COPIES is the number of times we need to use the same values in
4751 created vectors. It is greater than 1 if unrolling is performed.
4753 For example, we have two scalar operands, s1 and s2 (e.g., group of
4754 strided accesses of size two), while NUNITS is four (i.e., four scalars
4755 of this type can be packed in a vector). The output vector will contain
4756 two copies of each scalar operand: {s1, s2, s1, s2}. (NUMBER_OF_COPIES
4757 will be 2).
4759 If REDUC_GROUP_SIZE > NUNITS, the scalars will be split into several
4760 vectors containing the operands.
4762 For example, NUNITS is four as before, and the group size is 8
4763 (s1, s2, ..., s8). We will create two vectors {s1, s2, s3, s4} and
4764 {s5, s6, s7, s8}. */
4766 if (!TYPE_VECTOR_SUBPARTS (vector_type).is_constant (&nunits))
4767 nunits = group_size;
4769 number_of_places_left_in_vector = nunits;
4770 bool constant_p = true;
4771 tree_vector_builder elts (vector_type, nunits, 1);
4772 elts.quick_grow (nunits);
4773 gimple_seq ctor_seq = NULL;
4774 for (j = 0; j < nunits * number_of_vectors; ++j)
4776 tree op;
4777 i = j % group_size;
4779      /* Get the def before the loop.  In a reduction chain we have only
4780         one initial value.  Otherwise we have as many as there are PHIs in the group.  */
4781 if (i >= initial_values.length () || (j > i && neutral_op))
4782 op = neutral_op;
4783 else
4784 op = initial_values[i];
4786 /* Create 'vect_ = {op0,op1,...,opn}'. */
4787 number_of_places_left_in_vector--;
4788 elts[nunits - number_of_places_left_in_vector - 1] = op;
4789 if (!CONSTANT_CLASS_P (op))
4790 constant_p = false;
4792 if (number_of_places_left_in_vector == 0)
4794 tree init;
4795 if (constant_p && !neutral_op
4796 ? multiple_p (TYPE_VECTOR_SUBPARTS (vector_type), nunits)
4797 : known_eq (TYPE_VECTOR_SUBPARTS (vector_type), nunits))
4798 /* Build the vector directly from ELTS. */
4799 init = gimple_build_vector (&ctor_seq, &elts);
4800 else if (neutral_op)
4802 /* Build a vector of the neutral value and shift the
4803 other elements into place. */
4804 init = gimple_build_vector_from_val (&ctor_seq, vector_type,
4805 neutral_op);
4806 int k = nunits;
4807 while (k > 0 && elts[k - 1] == neutral_op)
4808 k -= 1;
4809 while (k > 0)
4811 k -= 1;
4812 init = gimple_build (&ctor_seq, CFN_VEC_SHL_INSERT,
4813 vector_type, init, elts[k]);
4816 else
4818 /* First time round, duplicate ELTS to fill the
4819 required number of vectors. */
4820 duplicate_and_interleave (loop_vinfo, &ctor_seq, vector_type,
4821 elts, number_of_vectors, *vec_oprnds);
4822 break;
4824 vec_oprnds->quick_push (init);
4826 number_of_places_left_in_vector = nunits;
4827 elts.new_vector (vector_type, nunits, 1);
4828 elts.quick_grow (nunits);
4829 constant_p = true;
4832 if (ctor_seq != NULL)
4833 vect_emit_reduction_init_stmts (loop_vinfo, reduc_info, ctor_seq);
4836 /* For a statement STMT_INFO taking part in a reduction operation return
4837    the stmt_vec_info that the meta information is stored on.  */
4839 stmt_vec_info
4840 info_for_reduction (vec_info *vinfo, stmt_vec_info stmt_info)
4842 stmt_info = vect_orig_stmt (stmt_info);
4843 gcc_assert (STMT_VINFO_REDUC_DEF (stmt_info));
4844 if (!is_a <gphi *> (stmt_info->stmt)
4845 || !VECTORIZABLE_CYCLE_DEF (STMT_VINFO_DEF_TYPE (stmt_info)))
4846 stmt_info = STMT_VINFO_REDUC_DEF (stmt_info);
4847 gphi *phi = as_a <gphi *> (stmt_info->stmt);
4848 if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_double_reduction_def)
4850 if (gimple_phi_num_args (phi) == 1)
4851 stmt_info = STMT_VINFO_REDUC_DEF (stmt_info);
4853 else if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_nested_cycle)
4855 stmt_vec_info info = vinfo->lookup_def (vect_phi_initial_value (phi));
4856 if (info && STMT_VINFO_DEF_TYPE (info) == vect_double_reduction_def)
4857 stmt_info = info;
4859 return stmt_info;
4862 /* See if LOOP_VINFO is an epilogue loop whose main loop had a reduction that
4863 REDUC_INFO can build on. Adjust REDUC_INFO and return true if so, otherwise
4864 return false. */
4866 static bool
4867 vect_find_reusable_accumulator (loop_vec_info loop_vinfo,
4868 stmt_vec_info reduc_info)
4870 loop_vec_info main_loop_vinfo = LOOP_VINFO_ORIG_LOOP_INFO (loop_vinfo);
4871 if (!main_loop_vinfo)
4872 return false;
4874 if (STMT_VINFO_REDUC_TYPE (reduc_info) != TREE_CODE_REDUCTION)
4875 return false;
4877 unsigned int num_phis = reduc_info->reduc_initial_values.length ();
4878 auto_vec<tree, 16> main_loop_results (num_phis);
4879 auto_vec<tree, 16> initial_values (num_phis);
4880 if (edge main_loop_edge = loop_vinfo->main_loop_edge)
4882 /* The epilogue loop can be entered either from the main loop or
4883 from an earlier guard block. */
4884 edge skip_edge = loop_vinfo->skip_main_loop_edge;
4885 for (tree incoming_value : reduc_info->reduc_initial_values)
4887 /* Look for:
4889 INCOMING_VALUE = phi<MAIN_LOOP_RESULT(main loop),
4890 INITIAL_VALUE(guard block)>. */
4891 gcc_assert (TREE_CODE (incoming_value) == SSA_NAME);
4893 gphi *phi = as_a <gphi *> (SSA_NAME_DEF_STMT (incoming_value));
4894 gcc_assert (gimple_bb (phi) == main_loop_edge->dest);
4896 tree from_main_loop = PHI_ARG_DEF_FROM_EDGE (phi, main_loop_edge);
4897 tree from_skip = PHI_ARG_DEF_FROM_EDGE (phi, skip_edge);
4899 main_loop_results.quick_push (from_main_loop);
4900 initial_values.quick_push (from_skip);
4903 else
4904 /* The main loop dominates the epilogue loop. */
4905 main_loop_results.splice (reduc_info->reduc_initial_values);
4907 /* See if the main loop has the kind of accumulator we need. */
4908 vect_reusable_accumulator *accumulator
4909 = main_loop_vinfo->reusable_accumulators.get (main_loop_results[0]);
4910 if (!accumulator
4911 || num_phis != accumulator->reduc_info->reduc_scalar_results.length ()
4912 || !std::equal (main_loop_results.begin (), main_loop_results.end (),
4913 accumulator->reduc_info->reduc_scalar_results.begin ()))
4914 return false;
4916 /* Handle the case where we can reduce wider vectors to narrower ones. */
4917 tree vectype = STMT_VINFO_VECTYPE (reduc_info);
4918 tree old_vectype = TREE_TYPE (accumulator->reduc_input);
4919 if (!constant_multiple_p (TYPE_VECTOR_SUBPARTS (old_vectype),
4920 TYPE_VECTOR_SUBPARTS (vectype)))
4921 return false;
4923 /* Non-SLP reductions might apply an adjustment after the reduction
4924 operation, in order to simplify the initialization of the accumulator.
4925 If the epilogue loop carries on from where the main loop left off,
4926 it should apply the same adjustment to the final reduction result.
4928 If the epilogue loop can also be entered directly (rather than via
4929 the main loop), we need to be able to handle that case in the same way,
4930 with the same adjustment. (In principle we could add a PHI node
4931 to select the correct adjustment, but in practice that shouldn't be
4932 necessary.) */
4933 tree main_adjustment
4934 = STMT_VINFO_REDUC_EPILOGUE_ADJUSTMENT (accumulator->reduc_info);
4935 if (loop_vinfo->main_loop_edge && main_adjustment)
4937 gcc_assert (num_phis == 1);
4938 tree initial_value = initial_values[0];
4939 /* Check that we can use INITIAL_VALUE as the adjustment and
4940 initialize the accumulator with a neutral value instead. */
4941 if (!operand_equal_p (initial_value, main_adjustment))
4942 return false;
4943 tree_code code = STMT_VINFO_REDUC_CODE (reduc_info);
4944 initial_values[0] = neutral_op_for_reduction (TREE_TYPE (initial_value),
4945 code, initial_value);
4947 STMT_VINFO_REDUC_EPILOGUE_ADJUSTMENT (reduc_info) = main_adjustment;
4948 reduc_info->reduc_initial_values.truncate (0);
4949 reduc_info->reduc_initial_values.splice (initial_values);
4950 reduc_info->reused_accumulator = accumulator;
4951 return true;
4954 /* Reduce the vector VEC_DEF down to VECTYPE with reduction operation
4955    CODE, appending the generated stmts to SEQ.  Returns a vector def of VECTYPE.  */
4957 static tree
4958 vect_create_partial_epilog (tree vec_def, tree vectype, enum tree_code code,
4959 gimple_seq *seq)
4961 unsigned nunits = TYPE_VECTOR_SUBPARTS (TREE_TYPE (vec_def)).to_constant ();
4962 unsigned nunits1 = TYPE_VECTOR_SUBPARTS (vectype).to_constant ();
4963 tree stype = TREE_TYPE (vectype);
4964 tree new_temp = vec_def;
4965 while (nunits > nunits1)
4967 nunits /= 2;
4968 tree vectype1 = get_related_vectype_for_scalar_type (TYPE_MODE (vectype),
4969 stype, nunits);
4970 unsigned int bitsize = tree_to_uhwi (TYPE_SIZE (vectype1));
4972 /* The target has to make sure we support lowpart/highpart
4973 extraction, either via direct vector extract or through
4974         integer mode punning.  */
4975 tree dst1, dst2;
4976 gimple *epilog_stmt;
4977 if (convert_optab_handler (vec_extract_optab,
4978 TYPE_MODE (TREE_TYPE (new_temp)),
4979 TYPE_MODE (vectype1))
4980 != CODE_FOR_nothing)
4982 /* Extract sub-vectors directly once vec_extract becomes
4983 a conversion optab. */
4984 dst1 = make_ssa_name (vectype1);
4985 epilog_stmt
4986 = gimple_build_assign (dst1, BIT_FIELD_REF,
4987 build3 (BIT_FIELD_REF, vectype1,
4988 new_temp, TYPE_SIZE (vectype1),
4989 bitsize_int (0)));
4990 gimple_seq_add_stmt_without_update (seq, epilog_stmt);
4991 dst2 = make_ssa_name (vectype1);
4992 epilog_stmt
4993 = gimple_build_assign (dst2, BIT_FIELD_REF,
4994 build3 (BIT_FIELD_REF, vectype1,
4995 new_temp, TYPE_SIZE (vectype1),
4996 bitsize_int (bitsize)));
4997 gimple_seq_add_stmt_without_update (seq, epilog_stmt);
4999 else
5001 /* Extract via punning to appropriately sized integer mode
5002 vector. */
5003 tree eltype = build_nonstandard_integer_type (bitsize, 1);
5004 tree etype = build_vector_type (eltype, 2);
5005 gcc_assert (convert_optab_handler (vec_extract_optab,
5006 TYPE_MODE (etype),
5007 TYPE_MODE (eltype))
5008 != CODE_FOR_nothing);
5009 tree tem = make_ssa_name (etype);
5010 epilog_stmt = gimple_build_assign (tem, VIEW_CONVERT_EXPR,
5011 build1 (VIEW_CONVERT_EXPR,
5012 etype, new_temp));
5013 gimple_seq_add_stmt_without_update (seq, epilog_stmt);
5014 new_temp = tem;
5015 tem = make_ssa_name (eltype);
5016 epilog_stmt
5017 = gimple_build_assign (tem, BIT_FIELD_REF,
5018 build3 (BIT_FIELD_REF, eltype,
5019 new_temp, TYPE_SIZE (eltype),
5020 bitsize_int (0)));
5021 gimple_seq_add_stmt_without_update (seq, epilog_stmt);
5022 dst1 = make_ssa_name (vectype1);
5023 epilog_stmt = gimple_build_assign (dst1, VIEW_CONVERT_EXPR,
5024 build1 (VIEW_CONVERT_EXPR,
5025 vectype1, tem));
5026 gimple_seq_add_stmt_without_update (seq, epilog_stmt);
5027 tem = make_ssa_name (eltype);
5028 epilog_stmt
5029 = gimple_build_assign (tem, BIT_FIELD_REF,
5030 build3 (BIT_FIELD_REF, eltype,
5031 new_temp, TYPE_SIZE (eltype),
5032 bitsize_int (bitsize)));
5033 gimple_seq_add_stmt_without_update (seq, epilog_stmt);
5034 dst2 = make_ssa_name (vectype1);
5035 epilog_stmt = gimple_build_assign (dst2, VIEW_CONVERT_EXPR,
5036 build1 (VIEW_CONVERT_EXPR,
5037 vectype1, tem));
5038 gimple_seq_add_stmt_without_update (seq, epilog_stmt);
5041 new_temp = make_ssa_name (vectype1);
5042 epilog_stmt = gimple_build_assign (new_temp, code, dst1, dst2);
5043 gimple_seq_add_stmt_without_update (seq, epilog_stmt);
5046 return new_temp;
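
/* Worked example (vector sizes assumed): reducing an 8-element vector def
   down to a 4-element VECTYPE with CODE = PLUS_EXPR takes one halving
   step: extract the low and high 4-element halves (directly via
   vec_extract, or by punning through a 2-element integer-mode vector) and
   add them, giving { v0+v4, v1+v5, v2+v6, v3+v7 }.  Reducing 16 -> 4
   repeats the step twice.  */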
5049 /* Function vect_create_epilog_for_reduction
5051 Create code at the loop-epilog to finalize the result of a reduction
5052 computation.
5054 STMT_INFO is the scalar reduction stmt that is being vectorized.
5055 SLP_NODE is an SLP node containing a group of reduction statements. The
5056 first one in this group is STMT_INFO.
5057    SLP_NODE_INSTANCE is the SLP node instance containing SLP_NODE.
5061 This function:
5062 1. Completes the reduction def-use cycles.
5063 2. "Reduces" each vector of partial results VECT_DEFS into a single result,
5064 by calling the function specified by REDUC_FN if available, or by
5065 other means (whole-vector shifts or a scalar loop).
5066 The function also creates a new phi node at the loop exit to preserve
5067 loop-closed form, as illustrated below.
5069 The flow at the entry to this function:
5071 loop:
5072 vec_def = phi <vec_init, null> # REDUCTION_PHI
5073 VECT_DEF = vector_stmt # vectorized form of STMT_INFO
5074 s_loop = scalar_stmt # (scalar) STMT_INFO
5075 loop_exit:
5076 s_out0 = phi <s_loop> # (scalar) EXIT_PHI
5077 use <s_out0>
5078 use <s_out0>
5080 The above is transformed by this function into:
5082 loop:
5083 vec_def = phi <vec_init, VECT_DEF> # REDUCTION_PHI
5084 VECT_DEF = vector_stmt # vectorized form of STMT_INFO
5085 s_loop = scalar_stmt # (scalar) STMT_INFO
5086 loop_exit:
5087 s_out0 = phi <s_loop> # (scalar) EXIT_PHI
5088 v_out1 = phi <VECT_DEF> # NEW_EXIT_PHI
5089 v_out2 = reduce <v_out1>
5090 s_out3 = extract_field <v_out2, 0>
5091 s_out4 = adjust_result <s_out3>
5092 use <s_out4>
5093 use <s_out4>
5096 static void
5097 vect_create_epilog_for_reduction (loop_vec_info loop_vinfo,
5098 stmt_vec_info stmt_info,
5099 slp_tree slp_node,
5100 slp_instance slp_node_instance)
5102 stmt_vec_info reduc_info = info_for_reduction (loop_vinfo, stmt_info);
5103 gcc_assert (reduc_info->is_reduc_info);
5104 /* For double reductions we need to get at the inner loop reduction
5105 stmt which has the meta info attached. Our stmt_info is that of the
5106 loop-closed PHI of the inner loop which we remember as
5107 def for the reduction PHI generation. */
5108 bool double_reduc = false;
5109 stmt_vec_info rdef_info = stmt_info;
5110 if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_double_reduction_def)
5112 gcc_assert (!slp_node);
5113 double_reduc = true;
5114 stmt_info = loop_vinfo->lookup_def (gimple_phi_arg_def
5115 (stmt_info->stmt, 0));
5116 stmt_info = vect_stmt_to_vectorize (stmt_info);
5118 gphi *reduc_def_stmt
5119 = as_a <gphi *> (STMT_VINFO_REDUC_DEF (vect_orig_stmt (stmt_info))->stmt);
5120 enum tree_code code = STMT_VINFO_REDUC_CODE (reduc_info);
5121 internal_fn reduc_fn = STMT_VINFO_REDUC_FN (reduc_info);
5122 tree vectype;
5123 machine_mode mode;
5124 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo), *outer_loop = NULL;
5125 basic_block exit_bb;
5126 tree scalar_dest;
5127 tree scalar_type;
5128 gimple *new_phi = NULL, *phi;
5129 gimple_stmt_iterator exit_gsi;
5130 tree new_temp = NULL_TREE, new_name, new_scalar_dest;
5131 gimple *epilog_stmt = NULL;
5132 gimple *exit_phi;
5133 tree bitsize;
5134 tree def;
5135 tree orig_name, scalar_result;
5136 imm_use_iterator imm_iter, phi_imm_iter;
5137 use_operand_p use_p, phi_use_p;
5138 gimple *use_stmt;
5139 auto_vec<tree> reduc_inputs;
5140 int j, i;
5141 vec<tree> &scalar_results = reduc_info->reduc_scalar_results;
5142 unsigned int group_size = 1, k;
5143 auto_vec<gimple *> phis;
5144 /* SLP reduction without reduction chain, e.g.,
5145 # a1 = phi <a2, a0>
5146 # b1 = phi <b2, b0>
5147 a2 = operation (a1)
5148 b2 = operation (b1) */
5149 bool slp_reduc = (slp_node && !REDUC_GROUP_FIRST_ELEMENT (stmt_info));
5150 bool direct_slp_reduc;
5151 tree induction_index = NULL_TREE;
5153 if (slp_node)
5154 group_size = SLP_TREE_LANES (slp_node);
5156 if (nested_in_vect_loop_p (loop, stmt_info))
5158 outer_loop = loop;
5159 loop = loop->inner;
5160 gcc_assert (!slp_node && double_reduc);
5163 vectype = STMT_VINFO_REDUC_VECTYPE (reduc_info);
5164 gcc_assert (vectype);
5165 mode = TYPE_MODE (vectype);
5167 tree induc_val = NULL_TREE;
5168 tree adjustment_def = NULL;
5169 if (slp_node)
5171 else
5173 /* Optimize: for induction condition reduction, if we can't use zero
5174 for induc_val, use initial_def. */
5175 if (STMT_VINFO_REDUC_TYPE (reduc_info) == INTEGER_INDUC_COND_REDUCTION)
5176 induc_val = STMT_VINFO_VEC_INDUC_COND_INITIAL_VAL (reduc_info);
5177 else if (double_reduc)
5179 else
5180 adjustment_def = STMT_VINFO_REDUC_EPILOGUE_ADJUSTMENT (reduc_info);
5183 stmt_vec_info single_live_out_stmt[] = { stmt_info };
5184 array_slice<const stmt_vec_info> live_out_stmts = single_live_out_stmt;
5185 if (slp_reduc)
5186 /* All statements produce live-out values. */
5187 live_out_stmts = SLP_TREE_SCALAR_STMTS (slp_node);
5188 else if (slp_node)
5189 /* The last statement in the reduction chain produces the live-out
5190 value. */
5191 single_live_out_stmt[0] = SLP_TREE_SCALAR_STMTS (slp_node)[group_size - 1];
5193 unsigned vec_num;
5194 int ncopies;
5195 if (slp_node)
5197 vec_num = SLP_TREE_VEC_STMTS (slp_node_instance->reduc_phis).length ();
5198 ncopies = 1;
5200 else
5202 stmt_vec_info reduc_info = loop_vinfo->lookup_stmt (reduc_def_stmt);
5203 vec_num = 1;
5204 ncopies = STMT_VINFO_VEC_STMTS (reduc_info).length ();
5207 /* For cond reductions we want to create a new vector (INDEX_COND_EXPR)
5208 which is updated with the current index of the loop for every match of
5209 the original loop's cond_expr (VEC_STMT). This results in a vector
5210 containing the last time the condition passed for that vector lane.
5211 The first match will be a 1 to allow 0 to be used for non-matching
5212 indexes. If there are no matches at all then the vector will be all
5213 zeroes.
5215 PR92772: This algorithm is broken for architectures that support
5216 masked vectors, but do not provide fold_extract_last. */
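/* For example, with a 4-lane vector and matches only in lanes 1 and 3
(counting from 0) of the first iteration, the final vector is
{0, 2, 0, 4}: indexes are 1-based and 0 means the lane never matched.
Illustrative only. */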
5217 if (STMT_VINFO_REDUC_TYPE (reduc_info) == COND_REDUCTION)
5219 auto_vec<std::pair<tree, bool>, 2> ccompares;
5220 stmt_vec_info cond_info = STMT_VINFO_REDUC_DEF (reduc_info);
5221 cond_info = vect_stmt_to_vectorize (cond_info);
5222 while (cond_info != reduc_info)
5224 if (gimple_assign_rhs_code (cond_info->stmt) == COND_EXPR)
5226 gimple *vec_stmt = STMT_VINFO_VEC_STMTS (cond_info)[0];
5227 gcc_assert (gimple_assign_rhs_code (vec_stmt) == VEC_COND_EXPR);
5228 ccompares.safe_push
5229 (std::make_pair (unshare_expr (gimple_assign_rhs1 (vec_stmt)),
5230 STMT_VINFO_REDUC_IDX (cond_info) == 2));
5232 cond_info
5233 = loop_vinfo->lookup_def (gimple_op (cond_info->stmt,
5234 1 + STMT_VINFO_REDUC_IDX
5235 (cond_info)));
5236 cond_info = vect_stmt_to_vectorize (cond_info);
5238 gcc_assert (ccompares.length () != 0);
5240 tree indx_before_incr, indx_after_incr;
5241 poly_uint64 nunits_out = TYPE_VECTOR_SUBPARTS (vectype);
5242 int scalar_precision
5243 = GET_MODE_PRECISION (SCALAR_TYPE_MODE (TREE_TYPE (vectype)));
5244 tree cr_index_scalar_type = make_unsigned_type (scalar_precision);
5245 tree cr_index_vector_type = get_related_vectype_for_scalar_type
5246 (TYPE_MODE (vectype), cr_index_scalar_type,
5247 TYPE_VECTOR_SUBPARTS (vectype));
5249 /* First we create a simple vector induction variable which starts
5250 with the values {1,2,3,...} (SERIES_VECT) and increments by the
5251 vector size (STEP). */
5253 /* Create a {1,2,3,...} vector. */
5254 tree series_vect = build_index_vector (cr_index_vector_type, 1, 1);
5256 /* Create a vector of the step value. */
5257 tree step = build_int_cst (cr_index_scalar_type, nunits_out);
5258 tree vec_step = build_vector_from_val (cr_index_vector_type, step);
5260 /* Create an induction variable. */
5261 gimple_stmt_iterator incr_gsi;
5262 bool insert_after;
5263 standard_iv_increment_position (loop, &incr_gsi, &insert_after);
5264 create_iv (series_vect, vec_step, NULL_TREE, loop, &incr_gsi,
5265 insert_after, &indx_before_incr, &indx_after_incr);
5267 /* Next create a new phi node vector (NEW_PHI_TREE) which starts
5268 filled with zeros (VEC_ZERO). */
5270 /* Create a vector of 0s. */
5271 tree zero = build_zero_cst (cr_index_scalar_type);
5272 tree vec_zero = build_vector_from_val (cr_index_vector_type, zero);
5274 /* Create a vector phi node. */
5275 tree new_phi_tree = make_ssa_name (cr_index_vector_type);
5276 new_phi = create_phi_node (new_phi_tree, loop->header);
5277 add_phi_arg (as_a <gphi *> (new_phi), vec_zero,
5278 loop_preheader_edge (loop), UNKNOWN_LOCATION);
5280 /* Now take the conditions from the loop's original cond_exprs
5281 and produce new cond_exprs (INDEX_COND_EXPR) which for
5282 every match use values from the induction variable
5283 (INDEX_BEFORE_INCR) and otherwise use values from the phi node
5284 (NEW_PHI_TREE).
5285 Finally, we update the phi (NEW_PHI_TREE) to take the value of
5286 the new cond_expr (INDEX_COND_EXPR). */
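/* An illustrative sketch of what is built below for a single compare:
new_phi_tree = PHI <vec_zero (preheader), index_cond (latch)>
index_cond = VEC_COND_EXPR <ccompare, indx_before_incr, new_phi_tree>
so each lane ends up holding the last index at which its condition
was true, or zero if it never was. */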
5287 gimple_seq stmts = NULL;
5288 for (int i = ccompares.length () - 1; i != -1; --i)
5290 tree ccompare = ccompares[i].first;
5291 if (ccompares[i].second)
5292 new_phi_tree = gimple_build (&stmts, VEC_COND_EXPR,
5293 cr_index_vector_type,
5294 ccompare,
5295 indx_before_incr, new_phi_tree);
5296 else
5297 new_phi_tree = gimple_build (&stmts, VEC_COND_EXPR,
5298 cr_index_vector_type,
5299 ccompare,
5300 new_phi_tree, indx_before_incr);
5302 gsi_insert_seq_before (&incr_gsi, stmts, GSI_SAME_STMT);
5304 /* Update the phi with the vec cond. */
5305 induction_index = new_phi_tree;
5306 add_phi_arg (as_a <gphi *> (new_phi), induction_index,
5307 loop_latch_edge (loop), UNKNOWN_LOCATION);
5310 /* 2. Create epilog code.
5311 The reduction epilog code operates across the elements of the vector
5312 of partial results computed by the vectorized loop.
5313 The reduction epilog code consists of:
5315 step 1: compute the scalar result in a vector (v_out2)
5316 step 2: extract the scalar result (s_out3) from the vector (v_out2)
5317 step 3: adjust the scalar result (s_out3) if needed.
5319 Step 1 can be accomplished using one of the following three schemes:
5320 (scheme 1) using reduc_fn, if available.
5321 (scheme 2) using whole-vector shifts, if available.
5322 (scheme 3) using a scalar loop. In this case steps 1+2 above are
5323 combined.
5325 The overall epilog code looks like this:
5327 s_out0 = phi <s_loop> # original EXIT_PHI
5328 v_out1 = phi <VECT_DEF> # NEW_EXIT_PHI
5329 v_out2 = reduce <v_out1> # step 1
5330 s_out3 = extract_field <v_out2, 0> # step 2
5331 s_out4 = adjust_result <s_out3> # step 3
5333 (step 3 is optional, and steps 1 and 2 may be combined).
5334 Lastly, the uses of s_out0 are replaced by s_out4. */
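/* For instance, a PLUS reduction over a 4-lane v_out1 = {a, b, c, d}
yields s_out3 = a + b + c + d, either directly via reduc_fn (scheme 1)
or via two shift-and-add steps followed by an element extract
(scheme 2). Illustrative only. */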
5337 /* 2.1 Create new loop-exit-phis to preserve loop-closed form:
5338 v_out1 = phi <VECT_DEF>
5339 Store them in NEW_PHIS. */
5340 if (double_reduc)
5341 loop = outer_loop;
5342 exit_bb = single_exit (loop)->dest;
5343 exit_gsi = gsi_after_labels (exit_bb);
5344 reduc_inputs.create (slp_node ? vec_num : ncopies);
5345 for (unsigned i = 0; i < vec_num; i++)
5347 gimple_seq stmts = NULL;
5348 if (slp_node)
5349 def = vect_get_slp_vect_def (slp_node, i);
5350 else
5351 def = gimple_get_lhs (STMT_VINFO_VEC_STMTS (rdef_info)[0]);
5352 for (j = 0; j < ncopies; j++)
5354 tree new_def = copy_ssa_name (def);
5355 phi = create_phi_node (new_def, exit_bb);
5356 if (j)
5357 def = gimple_get_lhs (STMT_VINFO_VEC_STMTS (rdef_info)[j]);
5358 SET_PHI_ARG_DEF (phi, single_exit (loop)->dest_idx, def);
5359 new_def = gimple_convert (&stmts, vectype, new_def);
5360 reduc_inputs.quick_push (new_def);
5362 gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
5365 /* 2.2 Get the relevant tree-code to use in the epilog for schemes 2,3
5366 (i.e. when reduc_fn is not available) and in the final adjustment
5367 code (if needed). Also get the original scalar reduction variable as
5368 defined in the loop. In case STMT is a "pattern-stmt" (i.e. it
5369 represents a reduction pattern), the tree-code and scalar-def are
5370 taken from the original stmt that the pattern-stmt (STMT) replaces.
5371 Otherwise (it is a regular reduction) - the tree-code and scalar-def
5372 are taken from STMT. */
5374 stmt_vec_info orig_stmt_info = vect_orig_stmt (stmt_info);
5375 if (orig_stmt_info != stmt_info)
5377 /* Reduction pattern */
5378 gcc_assert (STMT_VINFO_IN_PATTERN_P (orig_stmt_info));
5379 gcc_assert (STMT_VINFO_RELATED_STMT (orig_stmt_info) == stmt_info);
5382 scalar_dest = gimple_get_lhs (orig_stmt_info->stmt);
5383 scalar_type = TREE_TYPE (scalar_dest);
5384 scalar_results.create (group_size);
5385 new_scalar_dest = vect_create_destination_var (scalar_dest, NULL);
5386 bitsize = TYPE_SIZE (scalar_type);
5388 /* True if we should implement SLP_REDUC using native reduction operations
5389 instead of scalar operations. */
5390 direct_slp_reduc = (reduc_fn != IFN_LAST
5391 && slp_reduc
5392 && !TYPE_VECTOR_SUBPARTS (vectype).is_constant ());
5394 /* In case of reduction chain, e.g.,
5395 # a1 = phi <a3, a0>
5396 a2 = operation (a1)
5397 a3 = operation (a2),
5399 we may end up with more than one vector result. Here we reduce them
5400 to one vector.
5402 The same is true if we couldn't use a single def-use cycle. */
5403 if (REDUC_GROUP_FIRST_ELEMENT (stmt_info)
5404 || direct_slp_reduc
5405 || ncopies > 1)
5407 gimple_seq stmts = NULL;
5408 tree single_input = reduc_inputs[0];
5409 for (k = 1; k < reduc_inputs.length (); k++)
5410 single_input = gimple_build (&stmts, code, vectype,
5411 single_input, reduc_inputs[k]);
5412 gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
5414 reduc_inputs.truncate (0);
5415 reduc_inputs.safe_push (single_input);
5418 tree orig_reduc_input = reduc_inputs[0];
5420 /* If this loop is an epilogue loop that can be skipped after the
5421 main loop, we can only share a reduction operation between the
5422 main loop and the epilogue if we put it at the target of the
5423 skip edge.
5425 We can still reuse accumulators if this check fails. Doing so has
5426 the minor(?) benefit of making the epilogue loop's scalar result
5427 independent of the main loop's scalar result. */
5428 bool unify_with_main_loop_p = false;
5429 if (reduc_info->reused_accumulator
5430 && loop_vinfo->skip_this_loop_edge
5431 && single_succ_p (exit_bb)
5432 && single_succ (exit_bb) == loop_vinfo->skip_this_loop_edge->dest)
5434 unify_with_main_loop_p = true;
5436 basic_block reduc_block = loop_vinfo->skip_this_loop_edge->dest;
5437 reduc_inputs[0] = make_ssa_name (vectype);
5438 gphi *new_phi = create_phi_node (reduc_inputs[0], reduc_block);
5439 add_phi_arg (new_phi, orig_reduc_input, single_succ_edge (exit_bb),
5440 UNKNOWN_LOCATION);
5441 add_phi_arg (new_phi, reduc_info->reused_accumulator->reduc_input,
5442 loop_vinfo->skip_this_loop_edge, UNKNOWN_LOCATION);
5443 exit_gsi = gsi_after_labels (reduc_block);
5446 /* Shouldn't be used beyond this point. */
5447 exit_bb = nullptr;
5449 if (STMT_VINFO_REDUC_TYPE (reduc_info) == COND_REDUCTION
5450 && reduc_fn != IFN_LAST)
5452 /* For condition reductions, we have a vector (REDUC_INPUTS 0) containing
5453 various data values where the condition matched and another vector
5454 (INDUCTION_INDEX) containing all the indexes of those matches. We
5455 need to extract the last matching index (which will be the index with
5456 highest value) and use this to index into the data vector.
5457 For the case where there were no matches, the data vector will contain
5458 all default values and the index vector will be all zeros. */
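/* For example, with REDUC_INPUTS[0] = {d0, d1, d2, d3} and
INDUCTION_INDEX = {0, 2, 0, 4} the maximum index is 4, so the value
extracted below is d3. Illustrative only. */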
5460 /* Get various versions of the type of the vector of indexes. */
5461 tree index_vec_type = TREE_TYPE (induction_index);
5462 gcc_checking_assert (TYPE_UNSIGNED (index_vec_type));
5463 tree index_scalar_type = TREE_TYPE (index_vec_type);
5464 tree index_vec_cmp_type = truth_type_for (index_vec_type);
5466 /* Get an unsigned integer version of the type of the data vector. */
5467 int scalar_precision
5468 = GET_MODE_PRECISION (SCALAR_TYPE_MODE (scalar_type));
5469 tree scalar_type_unsigned = make_unsigned_type (scalar_precision);
5470 tree vectype_unsigned = get_same_sized_vectype (scalar_type_unsigned,
5471 vectype);
5473 /* First we need to create a vector (ZERO_VEC) of zeros and another
5474 vector (MAX_INDEX_VEC) filled with the last matching index, which we
5475 can create using a MAX reduction and then expanding.
5476 In the case where the loop never made any matches, the max index will
5477 be zero. */
5479 /* Vector of {0, 0, 0,...}. */
5480 tree zero_vec = build_zero_cst (vectype);
5482 /* Find maximum value from the vector of found indexes. */
5483 tree max_index = make_ssa_name (index_scalar_type);
5484 gcall *max_index_stmt = gimple_build_call_internal (IFN_REDUC_MAX,
5485 1, induction_index);
5486 gimple_call_set_lhs (max_index_stmt, max_index);
5487 gsi_insert_before (&exit_gsi, max_index_stmt, GSI_SAME_STMT);
5489 /* Vector of {max_index, max_index, max_index,...}. */
5490 tree max_index_vec = make_ssa_name (index_vec_type);
5491 tree max_index_vec_rhs = build_vector_from_val (index_vec_type,
5492 max_index);
5493 gimple *max_index_vec_stmt = gimple_build_assign (max_index_vec,
5494 max_index_vec_rhs);
5495 gsi_insert_before (&exit_gsi, max_index_vec_stmt, GSI_SAME_STMT);
5497 /* Next we compare the new vector (MAX_INDEX_VEC) full of max indexes
5498 with the vector (INDUCTION_INDEX) of found indexes, choosing values
5499 from the data vector (REDUC_INPUTS 0) for matches, 0 (ZERO_VEC)
5500 otherwise. Only one value should match, resulting in a vector
5501 (VEC_COND) with one data value and the rest zeros.
5502 In the case where the loop never made any matches, every index will
5503 match, resulting in a vector with all data values (which will all be
5504 the default value). */
5506 /* Compare the max index vector to the vector of found indexes to find
5507 the position of the max value. */
5508 tree vec_compare = make_ssa_name (index_vec_cmp_type);
5509 gimple *vec_compare_stmt = gimple_build_assign (vec_compare, EQ_EXPR,
5510 induction_index,
5511 max_index_vec);
5512 gsi_insert_before (&exit_gsi, vec_compare_stmt, GSI_SAME_STMT);
5514 /* Use the compare to choose either values from the data vector or
5515 zero. */
5516 tree vec_cond = make_ssa_name (vectype);
5517 gimple *vec_cond_stmt = gimple_build_assign (vec_cond, VEC_COND_EXPR,
5518 vec_compare,
5519 reduc_inputs[0],
5520 zero_vec);
5521 gsi_insert_before (&exit_gsi, vec_cond_stmt, GSI_SAME_STMT);
5523 /* Finally we need to extract the data value from the vector (VEC_COND)
5524 into a scalar (MATCHED_DATA_REDUC). Logically we want to do an OR
5525 reduction, but because this doesn't exist, we can use a MAX reduction
5526 instead. The data value might be signed or a float so we need to cast
5527 it first.
5528 In the case where the loop never made any matches, the data values are
5529 all identical, and so will reduce down correctly. */
5531 /* Make the matched data values unsigned. */
5532 tree vec_cond_cast = make_ssa_name (vectype_unsigned);
5533 tree vec_cond_cast_rhs = build1 (VIEW_CONVERT_EXPR, vectype_unsigned,
5534 vec_cond);
5535 gimple *vec_cond_cast_stmt = gimple_build_assign (vec_cond_cast,
5536 VIEW_CONVERT_EXPR,
5537 vec_cond_cast_rhs);
5538 gsi_insert_before (&exit_gsi, vec_cond_cast_stmt, GSI_SAME_STMT);
5540 /* Reduce down to a scalar value. */
5541 tree data_reduc = make_ssa_name (scalar_type_unsigned);
5542 gcall *data_reduc_stmt = gimple_build_call_internal (IFN_REDUC_MAX,
5543 1, vec_cond_cast);
5544 gimple_call_set_lhs (data_reduc_stmt, data_reduc);
5545 gsi_insert_before (&exit_gsi, data_reduc_stmt, GSI_SAME_STMT);
5547 /* Convert the reduced value back to the result type and set as the
5548 result. */
5549 gimple_seq stmts = NULL;
5550 new_temp = gimple_build (&stmts, VIEW_CONVERT_EXPR, scalar_type,
5551 data_reduc);
5552 gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
5553 scalar_results.safe_push (new_temp);
5555 else if (STMT_VINFO_REDUC_TYPE (reduc_info) == COND_REDUCTION
5556 && reduc_fn == IFN_LAST)
5558 /* Condition reduction without supported IFN_REDUC_MAX. Generate
5559 the equivalent of
5560 idx_val = induction_index[0];
5561 val = data_reduc[0];
5562 for (i = 1; i < nelts; ++i)
5563 if (induction_index[i] > idx_val)
5564 val = data_reduc[i], idx_val = induction_index[i];
5565 return val; */
5567 tree data_eltype = TREE_TYPE (vectype);
5568 tree idx_eltype = TREE_TYPE (TREE_TYPE (induction_index));
5569 unsigned HOST_WIDE_INT el_size = tree_to_uhwi (TYPE_SIZE (idx_eltype));
5570 poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (TREE_TYPE (induction_index));
5571 /* Enforced by vectorizable_reduction, which ensures we have target
5572 support before allowing a conditional reduction on variable-length
5573 vectors. */
5574 unsigned HOST_WIDE_INT v_size = el_size * nunits.to_constant ();
5575 tree idx_val = NULL_TREE, val = NULL_TREE;
5576 for (unsigned HOST_WIDE_INT off = 0; off < v_size; off += el_size)
5578 tree old_idx_val = idx_val;
5579 tree old_val = val;
5580 idx_val = make_ssa_name (idx_eltype);
5581 epilog_stmt = gimple_build_assign (idx_val, BIT_FIELD_REF,
5582 build3 (BIT_FIELD_REF, idx_eltype,
5583 induction_index,
5584 bitsize_int (el_size),
5585 bitsize_int (off)));
5586 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5587 val = make_ssa_name (data_eltype);
5588 epilog_stmt = gimple_build_assign (val, BIT_FIELD_REF,
5589 build3 (BIT_FIELD_REF,
5590 data_eltype,
5591 reduc_inputs[0],
5592 bitsize_int (el_size),
5593 bitsize_int (off)));
5594 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5595 if (off != 0)
5597 tree new_idx_val = idx_val;
5598 if (off != v_size - el_size)
5600 new_idx_val = make_ssa_name (idx_eltype);
5601 epilog_stmt = gimple_build_assign (new_idx_val,
5602 MAX_EXPR, idx_val,
5603 old_idx_val);
5604 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5606 tree new_val = make_ssa_name (data_eltype);
5607 epilog_stmt = gimple_build_assign (new_val,
5608 COND_EXPR,
5609 build2 (GT_EXPR,
5610 boolean_type_node,
5611 idx_val,
5612 old_idx_val),
5613 val, old_val);
5614 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5615 idx_val = new_idx_val;
5616 val = new_val;
5619 /* Convert the reduced value back to the result type and set as the
5620 result. */
5621 gimple_seq stmts = NULL;
5622 val = gimple_convert (&stmts, scalar_type, val);
5623 gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
5624 scalar_results.safe_push (val);
5627 /* 2.3 Create the reduction code, using one of the three schemes described
5628 above. In SLP we simply need to extract all the elements from the
5629 vector (without reducing them), so we use scalar shifts. */
5630 else if (reduc_fn != IFN_LAST && !slp_reduc)
5632 tree tmp;
5633 tree vec_elem_type;
5635 /* Case 1: Create:
5636 v_out2 = reduc_expr <v_out1> */
5638 if (dump_enabled_p ())
5639 dump_printf_loc (MSG_NOTE, vect_location,
5640 "Reduce using direct vector reduction.\n");
5642 gimple_seq stmts = NULL;
5643 vec_elem_type = TREE_TYPE (vectype);
5644 new_temp = gimple_build (&stmts, as_combined_fn (reduc_fn),
5645 vec_elem_type, reduc_inputs[0]);
5646 new_temp = gimple_convert (&stmts, scalar_type, new_temp);
5647 gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
5649 if ((STMT_VINFO_REDUC_TYPE (reduc_info) == INTEGER_INDUC_COND_REDUCTION)
5650 && induc_val)
5652 /* Earlier we set the initial value to be a vector of induc_val
5653 values. Check the result and if it is induc_val then replace
5654 with the original initial value, unless induc_val is
5655 the same as initial_def already. */
5656 tree zcompare = build2 (EQ_EXPR, boolean_type_node, new_temp,
5657 induc_val);
5658 tree initial_def = reduc_info->reduc_initial_values[0];
5660 tmp = make_ssa_name (new_scalar_dest);
5661 epilog_stmt = gimple_build_assign (tmp, COND_EXPR, zcompare,
5662 initial_def, new_temp);
5663 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5664 new_temp = tmp;
5667 scalar_results.safe_push (new_temp);
5669 else if (direct_slp_reduc)
5671 /* Here we create one vector for each of the REDUC_GROUP_SIZE results,
5672 with the elements for other SLP statements replaced with the
5673 neutral value. We can then do a normal reduction on each vector. */
5675 /* Enforced by vectorizable_reduction. */
5676 gcc_assert (reduc_inputs.length () == 1);
5677 gcc_assert (pow2p_hwi (group_size));
5679 gimple_seq seq = NULL;
5681 /* Build a vector {0, 1, 2, ...}, with the same number of elements
5682 and the same element size as VECTYPE. */
5683 tree index = build_index_vector (vectype, 0, 1);
5684 tree index_type = TREE_TYPE (index);
5685 tree index_elt_type = TREE_TYPE (index_type);
5686 tree mask_type = truth_type_for (index_type);
5688 /* Create a vector that, for each element, identifies which of
5689 the REDUC_GROUP_SIZE results should use it. */
5690 tree index_mask = build_int_cst (index_elt_type, group_size - 1);
5691 index = gimple_build (&seq, BIT_AND_EXPR, index_type, index,
5692 build_vector_from_val (index_type, index_mask));
5694 /* Get a neutral vector value. This is simply a splat of the neutral
5695 scalar value if we have one, otherwise the initial scalar value
5696 is itself a neutral value. */
5697 tree vector_identity = NULL_TREE;
5698 tree neutral_op = NULL_TREE;
5699 if (slp_node)
5701 tree initial_value = NULL_TREE;
5702 if (REDUC_GROUP_FIRST_ELEMENT (stmt_info))
5703 initial_value = reduc_info->reduc_initial_values[0];
5704 neutral_op = neutral_op_for_reduction (TREE_TYPE (vectype), code,
5705 initial_value);
5707 if (neutral_op)
5708 vector_identity = gimple_build_vector_from_val (&seq, vectype,
5709 neutral_op);
5710 for (unsigned int i = 0; i < group_size; ++i)
5712 /* If there's no universal neutral value, we can use the
5713 initial scalar value from the original PHI. This is used
5714 for MIN and MAX reduction, for example. */
5715 if (!neutral_op)
5717 tree scalar_value = reduc_info->reduc_initial_values[i];
5718 scalar_value = gimple_convert (&seq, TREE_TYPE (vectype),
5719 scalar_value);
5720 vector_identity = gimple_build_vector_from_val (&seq, vectype,
5721 scalar_value);
5724 /* Calculate the equivalent of:
5726 sel[j] = (index[j] == i);
5728 which selects the elements of REDUC_INPUTS[0] that should
5729 be included in the result. */
5730 tree compare_val = build_int_cst (index_elt_type, i);
5731 compare_val = build_vector_from_val (index_type, compare_val);
5732 tree sel = gimple_build (&seq, EQ_EXPR, mask_type,
5733 index, compare_val);
5735 /* Calculate the equivalent of:
5737 vec = sel ? reduc_inputs[0] : vector_identity;
5739 VEC is now suitable for a full vector reduction. */
5740 tree vec = gimple_build (&seq, VEC_COND_EXPR, vectype,
5741 sel, reduc_inputs[0], vector_identity);
5743 /* Do the reduction and convert it to the appropriate type. */
5744 tree scalar = gimple_build (&seq, as_combined_fn (reduc_fn),
5745 TREE_TYPE (vectype), vec);
5746 scalar = gimple_convert (&seq, scalar_type, scalar);
5747 scalar_results.safe_push (scalar);
5749 gsi_insert_seq_before (&exit_gsi, seq, GSI_SAME_STMT);
5751 else
5753 bool reduce_with_shift;
5754 tree vec_temp;
5756 gcc_assert (slp_reduc || reduc_inputs.length () == 1);
5758 /* See if the target wants to do the final (shift) reduction
5759 in a vector mode of smaller size and first reduce upper/lower
5760 halves against each other. */
5761 enum machine_mode mode1 = mode;
5762 tree stype = TREE_TYPE (vectype);
5763 unsigned nunits = TYPE_VECTOR_SUBPARTS (vectype).to_constant ();
5764 unsigned nunits1 = nunits;
5765 if ((mode1 = targetm.vectorize.split_reduction (mode)) != mode
5766 && reduc_inputs.length () == 1)
5768 nunits1 = GET_MODE_NUNITS (mode1).to_constant ();
5769 /* For SLP reductions we have to make sure lanes match up, but
5770 since we're doing individual element final reduction reducing
5771 vector width here is even more important.
5772 ??? We can also separate lanes with permutes; for the common
5773 case of a power-of-two group size, odd/even extracts would work. */
5774 if (slp_reduc && nunits != nunits1)
5776 nunits1 = least_common_multiple (nunits1, group_size);
5777 gcc_assert (exact_log2 (nunits1) != -1 && nunits1 <= nunits);
5780 if (!slp_reduc
5781 && (mode1 = targetm.vectorize.split_reduction (mode)) != mode)
5782 nunits1 = GET_MODE_NUNITS (mode1).to_constant ();
5784 tree vectype1 = get_related_vectype_for_scalar_type (TYPE_MODE (vectype),
5785 stype, nunits1);
5786 reduce_with_shift = have_whole_vector_shift (mode1);
5787 if (!VECTOR_MODE_P (mode1))
5788 reduce_with_shift = false;
5789 else
5791 optab optab = optab_for_tree_code (code, vectype1, optab_default);
5792 if (optab_handler (optab, mode1) == CODE_FOR_nothing)
5793 reduce_with_shift = false;
5796 /* First reduce the vector to the desired vector size we should
5797 do shift reduction on by combining upper and lower halves. */
5798 gimple_seq stmts = NULL;
5799 new_temp = vect_create_partial_epilog (reduc_inputs[0], vectype1,
5800 code, &stmts);
5801 gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
5802 reduc_inputs[0] = new_temp;
5804 if (reduce_with_shift && !slp_reduc)
5806 int element_bitsize = tree_to_uhwi (bitsize);
5807 /* Enforced by vectorizable_reduction, which disallows SLP reductions
5808 for variable-length vectors and also requires direct target support
5809 for loop reductions. */
5810 int vec_size_in_bits = tree_to_uhwi (TYPE_SIZE (vectype1));
5811 int nelements = vec_size_in_bits / element_bitsize;
5812 vec_perm_builder sel;
5813 vec_perm_indices indices;
5815 int elt_offset;
5817 tree zero_vec = build_zero_cst (vectype1);
5818 /* Case 2: Create:
5819 for (offset = nelements/2; offset >= 1; offset/=2)
5821 Create: va' = vec_shift <va, offset>
5822 Create: va = vop <va, va'>
5823 } */
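/* For example, reducing {a, b, c, d} with PLUS: shifting by 2 and
adding leaves {a+c, b+d, ...}; shifting by 1 and adding then leaves
a+b+c+d in element 0, which is extracted below. Illustrative only. */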
5825 tree rhs;
5827 if (dump_enabled_p ())
5828 dump_printf_loc (MSG_NOTE, vect_location,
5829 "Reduce using vector shifts\n");
5831 gimple_seq stmts = NULL;
5832 new_temp = gimple_convert (&stmts, vectype1, new_temp);
5833 for (elt_offset = nelements / 2;
5834 elt_offset >= 1;
5835 elt_offset /= 2)
5837 calc_vec_perm_mask_for_shift (elt_offset, nelements, &sel);
5838 indices.new_vector (sel, 2, nelements);
5839 tree mask = vect_gen_perm_mask_any (vectype1, indices);
5840 new_name = gimple_build (&stmts, VEC_PERM_EXPR, vectype1,
5841 new_temp, zero_vec, mask);
5842 new_temp = gimple_build (&stmts, code,
5843 vectype1, new_name, new_temp);
5845 gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
5847 /* 2.4 Extract the final scalar result. Create:
5848 s_out3 = extract_field <v_out2, bitpos> */
5850 if (dump_enabled_p ())
5851 dump_printf_loc (MSG_NOTE, vect_location,
5852 "extract scalar result\n");
5854 rhs = build3 (BIT_FIELD_REF, scalar_type, new_temp,
5855 bitsize, bitsize_zero_node);
5856 epilog_stmt = gimple_build_assign (new_scalar_dest, rhs);
5857 new_temp = make_ssa_name (new_scalar_dest, epilog_stmt);
5858 gimple_assign_set_lhs (epilog_stmt, new_temp);
5859 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5860 scalar_results.safe_push (new_temp);
5862 else
5864 /* Case 3: Create:
5865 s = extract_field <v_out2, 0>
5866 for (offset = element_size;
5867 offset < vector_size;
5868 offset += element_size)
5870 Create: s' = extract_field <v_out2, offset>
5871 Create: s = op <s, s'> // For non SLP cases
5872 } */
5874 if (dump_enabled_p ())
5875 dump_printf_loc (MSG_NOTE, vect_location,
5876 "Reduce using scalar code.\n");
5878 int vec_size_in_bits = tree_to_uhwi (TYPE_SIZE (vectype1));
5879 int element_bitsize = tree_to_uhwi (bitsize);
5880 tree compute_type = TREE_TYPE (vectype);
5881 gimple_seq stmts = NULL;
5882 FOR_EACH_VEC_ELT (reduc_inputs, i, vec_temp)
5884 int bit_offset;
5885 new_temp = gimple_build (&stmts, BIT_FIELD_REF, compute_type,
5886 vec_temp, bitsize, bitsize_zero_node);
5888 /* In SLP we don't need to apply reduction operation, so we just
5889 collect s' values in SCALAR_RESULTS. */
5890 if (slp_reduc)
5891 scalar_results.safe_push (new_temp);
5893 for (bit_offset = element_bitsize;
5894 bit_offset < vec_size_in_bits;
5895 bit_offset += element_bitsize)
5897 tree bitpos = bitsize_int (bit_offset);
5898 new_name = gimple_build (&stmts, BIT_FIELD_REF,
5899 compute_type, vec_temp,
5900 bitsize, bitpos);
5901 if (slp_reduc)
5903 /* In SLP we don't need to apply reduction operation, so
5904 we just collect s' values in SCALAR_RESULTS. */
5905 new_temp = new_name;
5906 scalar_results.safe_push (new_name);
5908 else
5909 new_temp = gimple_build (&stmts, code, compute_type,
5910 new_name, new_temp);
5914 /* The only case where we need to reduce scalar results in SLP is
5915 unrolling. If the size of SCALAR_RESULTS is greater than
5916 REDUC_GROUP_SIZE, we reduce them by combining elements modulo
5917 REDUC_GROUP_SIZE. */
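/* E.g. with REDUC_GROUP_SIZE 2 and four scalar results {r0, r1, r2, r3}
the loop below leaves {r0 op r2, r1 op r3}. Illustrative only. */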
5918 if (slp_reduc)
5920 tree res, first_res, new_res;
5922 /* Reduce multiple scalar results in case of SLP unrolling. */
5923 for (j = group_size; scalar_results.iterate (j, &res);
5924 j++)
5926 first_res = scalar_results[j % group_size];
5927 new_res = gimple_build (&stmts, code, compute_type,
5928 first_res, res);
5929 scalar_results[j % group_size] = new_res;
5931 scalar_results.truncate (group_size);
5932 for (k = 0; k < group_size; k++)
5933 scalar_results[k] = gimple_convert (&stmts, scalar_type,
5934 scalar_results[k]);
5936 else
5938 /* Not SLP - we have one scalar to keep in SCALAR_RESULTS. */
5939 new_temp = gimple_convert (&stmts, scalar_type, new_temp);
5940 scalar_results.safe_push (new_temp);
5943 gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
5946 if ((STMT_VINFO_REDUC_TYPE (reduc_info) == INTEGER_INDUC_COND_REDUCTION)
5947 && induc_val)
5949 /* Earlier we set the initial value to be a vector of induc_val
5950 values. Check the result and if it is induc_val then replace
5951 with the original initial value, unless induc_val is
5952 the same as initial_def already. */
5953 tree zcompare = build2 (EQ_EXPR, boolean_type_node, new_temp,
5954 induc_val);
5955 tree initial_def = reduc_info->reduc_initial_values[0];
5957 tree tmp = make_ssa_name (new_scalar_dest);
5958 epilog_stmt = gimple_build_assign (tmp, COND_EXPR, zcompare,
5959 initial_def, new_temp);
5960 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5961 scalar_results[0] = tmp;
5965 /* 2.5 Adjust the final result by the initial value of the reduction
5966 variable. (When such adjustment is not needed, then
5967 'adjustment_def' is zero). For example, if code is PLUS we create:
5968 new_temp = loop_exit_def + adjustment_def */
5970 if (adjustment_def)
5972 gcc_assert (!slp_reduc);
5973 gimple_seq stmts = NULL;
5974 if (double_reduc)
5976 gcc_assert (VECTOR_TYPE_P (TREE_TYPE (adjustment_def)));
5977 adjustment_def = gimple_convert (&stmts, vectype, adjustment_def);
5978 new_temp = gimple_build (&stmts, code, vectype,
5979 reduc_inputs[0], adjustment_def);
5981 else
5983 new_temp = scalar_results[0];
5984 gcc_assert (TREE_CODE (TREE_TYPE (adjustment_def)) != VECTOR_TYPE);
5985 adjustment_def = gimple_convert (&stmts, scalar_type, adjustment_def);
5986 new_temp = gimple_build (&stmts, code, scalar_type,
5987 new_temp, adjustment_def);
5990 epilog_stmt = gimple_seq_last_stmt (stmts);
5991 gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
5992 scalar_results[0] = new_temp;
5995 /* Record this operation if it could be reused by the epilogue loop. */
5996 if (STMT_VINFO_REDUC_TYPE (reduc_info) == TREE_CODE_REDUCTION)
5997 loop_vinfo->reusable_accumulators.put (scalar_results[0],
5998 { orig_reduc_input, reduc_info });
6000 if (double_reduc)
6001 loop = outer_loop;
6003 /* 2.6 Handle the loop-exit phis. Replace the uses of scalar loop-exit
6004 phis with new adjusted scalar results, i.e., replace use <s_out0>
6005 with use <s_out4>.
6007 Transform:
6008 loop_exit:
6009 s_out0 = phi <s_loop> # (scalar) EXIT_PHI
6010 v_out1 = phi <VECT_DEF> # NEW_EXIT_PHI
6011 v_out2 = reduce <v_out1>
6012 s_out3 = extract_field <v_out2, 0>
6013 s_out4 = adjust_result <s_out3>
6014 use <s_out0>
6015 use <s_out0>
6017 into:
6019 loop_exit:
6020 s_out0 = phi <s_loop> # (scalar) EXIT_PHI
6021 v_out1 = phi <VECT_DEF> # NEW_EXIT_PHI
6022 v_out2 = reduce <v_out1>
6023 s_out3 = extract_field <v_out2, 0>
6024 s_out4 = adjust_result <s_out3>
6025 use <s_out4>
6026 use <s_out4> */
6028 gcc_assert (live_out_stmts.size () == scalar_results.length ());
6029 for (k = 0; k < live_out_stmts.size (); k++)
6031 stmt_vec_info scalar_stmt_info = vect_orig_stmt (live_out_stmts[k]);
6032 scalar_dest = gimple_assign_lhs (scalar_stmt_info->stmt);
6034 phis.create (3);
6035 /* Find the loop-closed-use at the loop exit of the original scalar
6036 result. (The reduction result is expected to have two immediate uses,
6037 one at the latch block, and one at the loop exit). For double
6038 reductions we are looking for exit phis of the outer loop. */
6039 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, scalar_dest)
6041 if (!flow_bb_inside_loop_p (loop, gimple_bb (USE_STMT (use_p))))
6043 if (!is_gimple_debug (USE_STMT (use_p)))
6044 phis.safe_push (USE_STMT (use_p));
6046 else
6048 if (double_reduc && gimple_code (USE_STMT (use_p)) == GIMPLE_PHI)
6050 tree phi_res = PHI_RESULT (USE_STMT (use_p));
6052 FOR_EACH_IMM_USE_FAST (phi_use_p, phi_imm_iter, phi_res)
6054 if (!flow_bb_inside_loop_p (loop,
6055 gimple_bb (USE_STMT (phi_use_p)))
6056 && !is_gimple_debug (USE_STMT (phi_use_p)))
6057 phis.safe_push (USE_STMT (phi_use_p));
6063 FOR_EACH_VEC_ELT (phis, i, exit_phi)
6065 /* Replace the uses: */
6066 orig_name = PHI_RESULT (exit_phi);
6068 /* Look for a single use at the target of the skip edge. */
6069 if (unify_with_main_loop_p)
6071 use_operand_p use_p;
6072 gimple *user;
6073 if (!single_imm_use (orig_name, &use_p, &user))
6074 gcc_unreachable ();
6075 orig_name = gimple_get_lhs (user);
6078 scalar_result = scalar_results[k];
6079 FOR_EACH_IMM_USE_STMT (use_stmt, imm_iter, orig_name)
6081 FOR_EACH_IMM_USE_ON_STMT (use_p, imm_iter)
6082 SET_USE (use_p, scalar_result);
6083 update_stmt (use_stmt);
6087 phis.release ();
6091 /* Return a vector of type VECTYPE that is equal to the vector select
6092 operation "MASK ? VEC : IDENTITY". Insert the select statements
6093 before GSI. */
6095 static tree
6096 merge_with_identity (gimple_stmt_iterator *gsi, tree mask, tree vectype,
6097 tree vec, tree identity)
6099 tree cond = make_temp_ssa_name (vectype, NULL, "cond");
6100 gimple *new_stmt = gimple_build_assign (cond, VEC_COND_EXPR,
6101 mask, vec, identity);
6102 gsi_insert_before (gsi, new_stmt, GSI_SAME_STMT);
6103 return cond;
6106 /* Successively apply CODE to each element of VECTOR_RHS, in left-to-right
6107 order, starting with LHS. Insert the extraction statements before GSI and
6108 associate the new scalar SSA names with variable SCALAR_DEST.
6109 Return the SSA name for the result. */
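/* For a 4-element VECTOR_RHS this emits, roughly:
s0 = BIT_FIELD_REF <vector_rhs, 0>; lhs1 = lhs CODE s0;
s1 = BIT_FIELD_REF <vector_rhs, 1>; lhs2 = lhs1 CODE s1;
...
and returns the final accumulator. An illustrative sketch only. */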
6111 static tree
6112 vect_expand_fold_left (gimple_stmt_iterator *gsi, tree scalar_dest,
6113 tree_code code, tree lhs, tree vector_rhs)
6115 tree vectype = TREE_TYPE (vector_rhs);
6116 tree scalar_type = TREE_TYPE (vectype);
6117 tree bitsize = TYPE_SIZE (scalar_type);
6118 unsigned HOST_WIDE_INT vec_size_in_bits = tree_to_uhwi (TYPE_SIZE (vectype));
6119 unsigned HOST_WIDE_INT element_bitsize = tree_to_uhwi (bitsize);
6121 for (unsigned HOST_WIDE_INT bit_offset = 0;
6122 bit_offset < vec_size_in_bits;
6123 bit_offset += element_bitsize)
6125 tree bitpos = bitsize_int (bit_offset);
6126 tree rhs = build3 (BIT_FIELD_REF, scalar_type, vector_rhs,
6127 bitsize, bitpos);
6129 gassign *stmt = gimple_build_assign (scalar_dest, rhs);
6130 rhs = make_ssa_name (scalar_dest, stmt);
6131 gimple_assign_set_lhs (stmt, rhs);
6132 gsi_insert_before (gsi, stmt, GSI_SAME_STMT);
6134 stmt = gimple_build_assign (scalar_dest, code, lhs, rhs);
6135 tree new_name = make_ssa_name (scalar_dest, stmt);
6136 gimple_assign_set_lhs (stmt, new_name);
6137 gsi_insert_before (gsi, stmt, GSI_SAME_STMT);
6138 lhs = new_name;
6140 return lhs;
6143 /* Get a masked internal function equivalent to REDUC_FN. VECTYPE_IN is the
6144 type of the vector input. */
6146 static internal_fn
6147 get_masked_reduction_fn (internal_fn reduc_fn, tree vectype_in)
6149 internal_fn mask_reduc_fn;
6151 switch (reduc_fn)
6153 case IFN_FOLD_LEFT_PLUS:
6154 mask_reduc_fn = IFN_MASK_FOLD_LEFT_PLUS;
6155 break;
6157 default:
6158 return IFN_LAST;
6161 if (direct_internal_fn_supported_p (mask_reduc_fn, vectype_in,
6162 OPTIMIZE_FOR_SPEED))
6163 return mask_reduc_fn;
6164 return IFN_LAST;
6167 /* Perform an in-order reduction (FOLD_LEFT_REDUCTION). STMT_INFO is the
6168 statement that sets the live-out value. REDUC_DEF_STMT is the phi
6169 statement. CODE is the operation performed by STMT_INFO and OPS are
6170 its scalar operands. REDUC_INDEX is the index of the operand in
6171 OPS that is set by REDUC_DEF_STMT. REDUC_FN is the function that
6172 implements in-order reduction, or IFN_LAST if we should open-code it.
6173 VECTYPE_IN is the type of the vector input. MASKS specifies the masks
6174 that should be used to control the operation in a fully-masked loop. */
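/* In a fully-masked loop without a masked reduction function, the
inactive lanes of each vector operand are first replaced with a zero
identity (see merge_with_identity) so that folding them leaves the
accumulator unchanged for the PLUS (and negated MINUS) cases handled
here. A sketch of the intent, not additional semantics. */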
6176 static bool
6177 vectorize_fold_left_reduction (loop_vec_info loop_vinfo,
6178 stmt_vec_info stmt_info,
6179 gimple_stmt_iterator *gsi,
6180 gimple **vec_stmt, slp_tree slp_node,
6181 gimple *reduc_def_stmt,
6182 tree_code code, internal_fn reduc_fn,
6183 tree ops[3], tree vectype_in,
6184 int reduc_index, vec_loop_masks *masks)
6186 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
6187 tree vectype_out = STMT_VINFO_VECTYPE (stmt_info);
6188 internal_fn mask_reduc_fn = get_masked_reduction_fn (reduc_fn, vectype_in);
6190 int ncopies;
6191 if (slp_node)
6192 ncopies = 1;
6193 else
6194 ncopies = vect_get_num_copies (loop_vinfo, vectype_in);
6196 gcc_assert (!nested_in_vect_loop_p (loop, stmt_info));
6197 gcc_assert (ncopies == 1);
6198 gcc_assert (TREE_CODE_LENGTH (code) == binary_op);
6200 if (slp_node)
6201 gcc_assert (known_eq (TYPE_VECTOR_SUBPARTS (vectype_out),
6202 TYPE_VECTOR_SUBPARTS (vectype_in)));
6204 tree op0 = ops[1 - reduc_index];
6206 int group_size = 1;
6207 stmt_vec_info scalar_dest_def_info;
6208 auto_vec<tree> vec_oprnds0;
6209 if (slp_node)
6211 auto_vec<vec<tree> > vec_defs (2);
6212 vect_get_slp_defs (loop_vinfo, slp_node, &vec_defs);
6213 vec_oprnds0.safe_splice (vec_defs[1 - reduc_index]);
6214 vec_defs[0].release ();
6215 vec_defs[1].release ();
6216 group_size = SLP_TREE_SCALAR_STMTS (slp_node).length ();
6217 scalar_dest_def_info = SLP_TREE_SCALAR_STMTS (slp_node)[group_size - 1];
6219 else
6221 vect_get_vec_defs_for_operand (loop_vinfo, stmt_info, 1,
6222 op0, &vec_oprnds0);
6223 scalar_dest_def_info = stmt_info;
6226 tree scalar_dest = gimple_assign_lhs (scalar_dest_def_info->stmt);
6227 tree scalar_type = TREE_TYPE (scalar_dest);
6228 tree reduc_var = gimple_phi_result (reduc_def_stmt);
6230 int vec_num = vec_oprnds0.length ();
6231 gcc_assert (vec_num == 1 || slp_node);
6232 tree vec_elem_type = TREE_TYPE (vectype_out);
6233 gcc_checking_assert (useless_type_conversion_p (scalar_type, vec_elem_type));
6235 tree vector_identity = NULL_TREE;
6236 if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
6237 vector_identity = build_zero_cst (vectype_out);
6239 tree scalar_dest_var = vect_create_destination_var (scalar_dest, NULL);
6240 int i;
6241 tree def0;
6242 FOR_EACH_VEC_ELT (vec_oprnds0, i, def0)
6244 gimple *new_stmt;
6245 tree mask = NULL_TREE;
6246 if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
6247 mask = vect_get_loop_mask (gsi, masks, vec_num, vectype_in, i);
6249 /* Handle MINUS by adding the negative. */
6250 if (reduc_fn != IFN_LAST && code == MINUS_EXPR)
6252 tree negated = make_ssa_name (vectype_out);
6253 new_stmt = gimple_build_assign (negated, NEGATE_EXPR, def0);
6254 gsi_insert_before (gsi, new_stmt, GSI_SAME_STMT);
6255 def0 = negated;
6258 if (mask && mask_reduc_fn == IFN_LAST)
6259 def0 = merge_with_identity (gsi, mask, vectype_out, def0,
6260 vector_identity);
6262 /* On the first iteration the input is simply the scalar phi
6263 result, and for subsequent iterations it is the output of
6264 the preceding operation. */
6265 if (reduc_fn != IFN_LAST || (mask && mask_reduc_fn != IFN_LAST))
6267 if (mask && mask_reduc_fn != IFN_LAST)
6268 new_stmt = gimple_build_call_internal (mask_reduc_fn, 3, reduc_var,
6269 def0, mask);
6270 else
6271 new_stmt = gimple_build_call_internal (reduc_fn, 2, reduc_var,
6272 def0);
6273 /* For chained SLP reductions the output of the previous reduction
6274 operation serves as the input of the next. For the final statement
6275 the output cannot be a temporary - we reuse the original
6276 scalar destination of the last statement. */
6277 if (i != vec_num - 1)
6279 gimple_set_lhs (new_stmt, scalar_dest_var);
6280 reduc_var = make_ssa_name (scalar_dest_var, new_stmt);
6281 gimple_set_lhs (new_stmt, reduc_var);
6284 else
6286 reduc_var = vect_expand_fold_left (gsi, scalar_dest_var, code,
6287 reduc_var, def0);
6288 new_stmt = SSA_NAME_DEF_STMT (reduc_var);
6289 /* Remove the statement, so that we can use the same code paths
6290 as for statements that we've just created. */
6291 gimple_stmt_iterator tmp_gsi = gsi_for_stmt (new_stmt);
6292 gsi_remove (&tmp_gsi, true);
6295 if (i == vec_num - 1)
6297 gimple_set_lhs (new_stmt, scalar_dest);
6298 vect_finish_replace_stmt (loop_vinfo,
6299 scalar_dest_def_info,
6300 new_stmt);
6302 else
6303 vect_finish_stmt_generation (loop_vinfo,
6304 scalar_dest_def_info,
6305 new_stmt, gsi);
6307 if (slp_node)
6308 SLP_TREE_VEC_STMTS (slp_node).quick_push (new_stmt);
6309 else
6311 STMT_VINFO_VEC_STMTS (stmt_info).safe_push (new_stmt);
6312 *vec_stmt = new_stmt;
6316 return true;
6319 /* Function is_nonwrapping_integer_induction.
6321 Check if the integer induction defined by STMT_VINFO (which is part of
6322 loop LOOP) increments without its value ever overflowing. */
6324 static bool
6325 is_nonwrapping_integer_induction (stmt_vec_info stmt_vinfo, class loop *loop)
6327 gphi *phi = as_a <gphi *> (stmt_vinfo->stmt);
6328 tree base = STMT_VINFO_LOOP_PHI_EVOLUTION_BASE_UNCHANGED (stmt_vinfo);
6329 tree step = STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_vinfo);
6330 tree lhs_type = TREE_TYPE (gimple_phi_result (phi));
6331 widest_int ni, max_loop_value, lhs_max;
6332 wi::overflow_type overflow = wi::OVF_NONE;
6334 /* Make sure the loop is integer based. */
6335 if (TREE_CODE (base) != INTEGER_CST
6336 || TREE_CODE (step) != INTEGER_CST)
6337 return false;
6339 /* Check that the max size of the loop will not wrap. */
6341 if (TYPE_OVERFLOW_UNDEFINED (lhs_type))
6342 return true;
6344 if (! max_stmt_executions (loop, &ni))
6345 return false;
6347 max_loop_value = wi::mul (wi::to_widest (step), ni, TYPE_SIGN (lhs_type),
6348 &overflow);
6349 if (overflow)
6350 return false;
6352 max_loop_value = wi::add (wi::to_widest (base), max_loop_value,
6353 TYPE_SIGN (lhs_type), &overflow);
6354 if (overflow)
6355 return false;
6357 return (wi::min_precision (max_loop_value, TYPE_SIGN (lhs_type))
6358 <= TYPE_PRECISION (lhs_type));
6361 /* Check if masking can be supported by inserting a conditional expression.
6362 CODE is the code for the operation. COND_FN is the conditional internal
6363 function, if it exists. VECTYPE_IN is the type of the vector input. */
6364 static bool
6365 use_mask_by_cond_expr_p (enum tree_code code, internal_fn cond_fn,
6366 tree vectype_in)
6368 if (cond_fn != IFN_LAST
6369 && direct_internal_fn_supported_p (cond_fn, vectype_in,
6370 OPTIMIZE_FOR_SPEED))
6371 return false;
6373 switch (code)
6375 case DOT_PROD_EXPR:
6376 case SAD_EXPR:
6377 return true;
6379 default:
6380 return false;
6384 /* Insert a conditional expression to enable masked vectorization. CODE is the
6385 code for the operation. VOP is the array of operands. MASK is the loop
6386 mask. GSI is a statement iterator used to place the new conditional
6387 expression. */
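/* For DOT_PROD_EXPR the inactive lanes of operand 1 are replaced with
zero so their products contribute nothing; for SAD_EXPR they are
replaced with operand 0 so the absolute differences are zero. This
only describes the selects built below. */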
6388 static void
6389 build_vect_cond_expr (enum tree_code code, tree vop[3], tree mask,
6390 gimple_stmt_iterator *gsi)
6392 switch (code)
6394 case DOT_PROD_EXPR:
6396 tree vectype = TREE_TYPE (vop[1]);
6397 tree zero = build_zero_cst (vectype);
6398 tree masked_op1 = make_temp_ssa_name (vectype, NULL, "masked_op1");
6399 gassign *select = gimple_build_assign (masked_op1, VEC_COND_EXPR,
6400 mask, vop[1], zero);
6401 gsi_insert_before (gsi, select, GSI_SAME_STMT);
6402 vop[1] = masked_op1;
6403 break;
6406 case SAD_EXPR:
6408 tree vectype = TREE_TYPE (vop[1]);
6409 tree masked_op1 = make_temp_ssa_name (vectype, NULL, "masked_op1");
6410 gassign *select = gimple_build_assign (masked_op1, VEC_COND_EXPR,
6411 mask, vop[1], vop[0]);
6412 gsi_insert_before (gsi, select, GSI_SAME_STMT);
6413 vop[1] = masked_op1;
6414 break;
6417 default:
6418 gcc_unreachable ();
6422 /* Function vectorizable_reduction.
6424 Check if STMT_INFO performs a reduction operation that can be vectorized.
6425 If VEC_STMT is also passed, vectorize STMT_INFO: create a vectorized
6426 stmt to replace it, put it in VEC_STMT, and insert it at GSI.
6427 Return true if STMT_INFO is vectorizable in this way.
6429 This function also handles reduction idioms (patterns) that have been
6430 recognized in advance during vect_pattern_recog. In this case, STMT_INFO
6431 may be of this form:
6432 X = pattern_expr (arg0, arg1, ..., X)
6433 and its STMT_VINFO_RELATED_STMT points to the last stmt in the original
6434 sequence that had been detected and replaced by the pattern-stmt
6435 (STMT_INFO).
6437 This function also handles reduction of condition expressions, for example:
6438 for (int i = 0; i < N; i++)
6439 if (a[i] < value)
6440 last = a[i];
6441 This is handled by vectorising the loop and creating an additional vector
6442 containing the loop indexes for which "a[i] < value" was true. In the
6443 function epilogue this is reduced to a single max value and then used to
6444 index into the vector of results.
6446 In some cases of reduction patterns, the type of the reduction variable X is
6447 different than the type of the other arguments of STMT_INFO.
6448 In such cases, the vectype that is used when transforming STMT_INFO into
6449 a vector stmt is different than the vectype that is used to determine the
6450 vectorization factor, because it consists of a different number of elements
6451 than the actual number of elements that are being operated upon in parallel.
6453 For example, consider an accumulation of shorts into an int accumulator.
6454 On some targets it's possible to vectorize this pattern operating on 8
6455 shorts at a time (hence, the vectype for purposes of determining the
6456 vectorization factor should be V8HI); on the other hand, the vectype that
6457 is used to create the vector form is actually V4SI (the type of the result).
6459 Upon entry to this function, STMT_VINFO_VECTYPE records the vectype that
6460 indicates what is the actual level of parallelism (V8HI in the example), so
6461 that the right vectorization factor would be derived. This vectype
6462 corresponds to the type of arguments to the reduction stmt, and should *NOT*
6463 be used to create the vectorized stmt. The right vectype for the vectorized
6464 stmt is obtained from the type of the result X:
6465 get_vectype_for_scalar_type (vinfo, TREE_TYPE (X))
6467 This means that, contrary to "regular" reductions (or "regular" stmts in
6468 general), the following equation:
6469 STMT_VINFO_VECTYPE == get_vectype_for_scalar_type (vinfo, TREE_TYPE (X))
6470 does *NOT* necessarily hold for reduction patterns. */
6472 bool
6473 vectorizable_reduction (loop_vec_info loop_vinfo,
6474 stmt_vec_info stmt_info, slp_tree slp_node,
6475 slp_instance slp_node_instance,
6476 stmt_vector_for_cost *cost_vec)
6478 tree scalar_dest;
6479 tree vectype_in = NULL_TREE;
6480 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
6481 enum vect_def_type cond_reduc_dt = vect_unknown_def_type;
6482 stmt_vec_info cond_stmt_vinfo = NULL;
6483 tree scalar_type;
6484 int i;
6485 int ncopies;
6486 bool single_defuse_cycle = false;
6487 bool nested_cycle = false;
6488 bool double_reduc = false;
6489 int vec_num;
6490 tree tem;
6491 tree cr_index_scalar_type = NULL_TREE, cr_index_vector_type = NULL_TREE;
6492 tree cond_reduc_val = NULL_TREE;
6494 /* Make sure it was already recognized as a reduction computation. */
6495 if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_reduction_def
6496 && STMT_VINFO_DEF_TYPE (stmt_info) != vect_double_reduction_def
6497 && STMT_VINFO_DEF_TYPE (stmt_info) != vect_nested_cycle)
6498 return false;
6500 /* The stmt we store reduction analysis meta on. */
6501 stmt_vec_info reduc_info = info_for_reduction (loop_vinfo, stmt_info);
6502 reduc_info->is_reduc_info = true;
6504 if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_nested_cycle)
6506 if (is_a <gphi *> (stmt_info->stmt))
6508 if (slp_node)
6510 /* We eventually need to set a vector type on invariant
6511 arguments. */
6512 unsigned j;
6513 slp_tree child;
6514 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (slp_node), j, child)
6515 if (!vect_maybe_update_slp_op_vectype
6516 (child, SLP_TREE_VECTYPE (slp_node)))
6518 if (dump_enabled_p ())
6519 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6520 "incompatible vector types for "
6521 "invariants\n");
6522 return false;
6525 /* Analysis for double-reduction is done on the outer
6526 loop PHI, nested cycles have no further restrictions. */
6527 STMT_VINFO_TYPE (stmt_info) = cycle_phi_info_type;
6529 else
6530 STMT_VINFO_TYPE (stmt_info) = reduc_vec_info_type;
6531 return true;
6534 stmt_vec_info orig_stmt_of_analysis = stmt_info;
6535 stmt_vec_info phi_info = stmt_info;
6536 if (!is_a <gphi *> (stmt_info->stmt))
6538 STMT_VINFO_TYPE (stmt_info) = reduc_vec_info_type;
6539 return true;
6541 if (slp_node)
6543 slp_node_instance->reduc_phis = slp_node;
6544 /* ??? We're leaving slp_node to point to the PHIs; we only
6545 need it to get at the number of vector stmts, which wasn't
6546 yet initialized for the instance root. */
6548 if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_reduction_def)
6549 stmt_info = vect_stmt_to_vectorize (STMT_VINFO_REDUC_DEF (stmt_info));
6550 else
6552 gcc_assert (STMT_VINFO_DEF_TYPE (stmt_info)
6553 == vect_double_reduction_def);
6554 use_operand_p use_p;
6555 gimple *use_stmt;
6556 bool res = single_imm_use (gimple_phi_result (stmt_info->stmt),
6557 &use_p, &use_stmt);
6558 gcc_assert (res);
6559 phi_info = loop_vinfo->lookup_stmt (use_stmt);
6560 stmt_info = vect_stmt_to_vectorize (STMT_VINFO_REDUC_DEF (phi_info));
6563 /* PHIs should not participate in patterns. */
6564 gcc_assert (!STMT_VINFO_RELATED_STMT (phi_info));
6565 gphi *reduc_def_phi = as_a <gphi *> (phi_info->stmt);
6567 /* Verify that following REDUC_IDX from the latch def leads us back to
6568 the PHI and compute the reduction chain length. Discover the real
6569 reduction operation stmt on the way (stmt_info and slp_for_stmt_info). */
6570 tree reduc_def
6571 = PHI_ARG_DEF_FROM_EDGE (reduc_def_phi,
6572 loop_latch_edge
6573 (gimple_bb (reduc_def_phi)->loop_father));
6574 unsigned reduc_chain_length = 0;
6575 bool only_slp_reduc_chain = true;
6576 stmt_info = NULL;
6577 slp_tree slp_for_stmt_info = slp_node ? slp_node_instance->root : NULL;
6578 while (reduc_def != PHI_RESULT (reduc_def_phi))
6580 stmt_vec_info def = loop_vinfo->lookup_def (reduc_def);
6581 stmt_vec_info vdef = vect_stmt_to_vectorize (def);
6582 if (STMT_VINFO_REDUC_IDX (vdef) == -1)
6584 if (dump_enabled_p ())
6585 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6586 "reduction chain broken by patterns.\n");
6587 return false;
6589 if (!REDUC_GROUP_FIRST_ELEMENT (vdef))
6590 only_slp_reduc_chain = false;
6591 /* ??? For epilogue generation live members of the chain need
6592 to point back to the PHI via their original stmt for
6593 info_for_reduction to work. */
6594 if (STMT_VINFO_LIVE_P (vdef))
6595 STMT_VINFO_REDUC_DEF (def) = phi_info;
6596 gassign *assign = dyn_cast <gassign *> (vdef->stmt);
6597 if (!assign)
6599 if (dump_enabled_p ())
6600 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6601 "reduction chain includes calls.\n");
6602 return false;
6604 if (CONVERT_EXPR_CODE_P (gimple_assign_rhs_code (assign)))
6606 if (!tree_nop_conversion_p (TREE_TYPE (gimple_assign_lhs (assign)),
6607 TREE_TYPE (gimple_assign_rhs1 (assign))))
6609 if (dump_enabled_p ())
6610 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6611 "conversion in the reduction chain.\n");
6612 return false;
6615 else if (!stmt_info)
6616 /* First non-conversion stmt. */
6617 stmt_info = vdef;
6618 reduc_def = gimple_op (vdef->stmt, 1 + STMT_VINFO_REDUC_IDX (vdef));
6619 reduc_chain_length++;
6620 if (!stmt_info && slp_node)
6621 slp_for_stmt_info = SLP_TREE_CHILDREN (slp_for_stmt_info)[0];
6623 /* PHIs should not participate in patterns. */
6624 gcc_assert (!STMT_VINFO_RELATED_STMT (phi_info));
6626 if (nested_in_vect_loop_p (loop, stmt_info))
6628 loop = loop->inner;
6629 nested_cycle = true;
6632 /* STMT_VINFO_REDUC_DEF doesn't point to the first but the last
6633 element. */
6634 if (slp_node && REDUC_GROUP_FIRST_ELEMENT (stmt_info))
6636 gcc_assert (!REDUC_GROUP_NEXT_ELEMENT (stmt_info));
6637 stmt_info = REDUC_GROUP_FIRST_ELEMENT (stmt_info);
6639 if (REDUC_GROUP_FIRST_ELEMENT (stmt_info))
6640 gcc_assert (slp_node
6641 && REDUC_GROUP_FIRST_ELEMENT (stmt_info) == stmt_info);
6643 /* 1. Is vectorizable reduction? */
6644 /* Not supportable if the reduction variable is used in the loop, unless
6645 it's a reduction chain. */
6646 if (STMT_VINFO_RELEVANT (stmt_info) > vect_used_in_outer
6647 && !REDUC_GROUP_FIRST_ELEMENT (stmt_info))
6648 return false;
6650 /* Reductions that are not used even in an enclosing outer-loop,
6651 are expected to be "live" (used out of the loop). */
6652 if (STMT_VINFO_RELEVANT (stmt_info) == vect_unused_in_scope
6653 && !STMT_VINFO_LIVE_P (stmt_info))
6654 return false;
6656 /* 2. Has this been recognized as a reduction pattern?
6658 Check if STMT represents a pattern that has been recognized
6659 in earlier analysis stages. For stmts that represent a pattern,
6660 the STMT_VINFO_RELATED_STMT field records the last stmt in
6661 the original sequence that constitutes the pattern. */
6663 stmt_vec_info orig_stmt_info = STMT_VINFO_RELATED_STMT (stmt_info);
6664 if (orig_stmt_info)
6666 gcc_assert (STMT_VINFO_IN_PATTERN_P (orig_stmt_info));
6667 gcc_assert (!STMT_VINFO_IN_PATTERN_P (stmt_info));
6670 /* 3. Check the operands of the operation. The first operands are defined
6671 inside the loop body. The last operand is the reduction variable,
6672 which is defined by the loop-header-phi. */
6674 tree vectype_out = STMT_VINFO_VECTYPE (stmt_info);
6675 STMT_VINFO_REDUC_VECTYPE (reduc_info) = vectype_out;
6676 gassign *stmt = as_a <gassign *> (stmt_info->stmt);
6677 enum tree_code code = gimple_assign_rhs_code (stmt);
6678 bool lane_reduc_code_p
6679 = (code == DOT_PROD_EXPR || code == WIDEN_SUM_EXPR || code == SAD_EXPR);
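   /* For illustration: a lane-reducing reduction such as DOT_PROD_EXPR
      typically comes from source along the lines of

        signed char a[N], b[N];
        int sum = 0;
        for (int i = 0; i < N; i++)
          sum += a[i] * b[i];

      where a single vector operation folds several narrow input lanes into
      fewer (wider) accumulator lanes.  */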
6680 int op_type = TREE_CODE_LENGTH (code);
6681 enum optab_subtype optab_query_kind = optab_vector;
6682 if (code == DOT_PROD_EXPR
6683 && TYPE_SIGN (TREE_TYPE (gimple_assign_rhs1 (stmt)))
6684 != TYPE_SIGN (TREE_TYPE (gimple_assign_rhs2 (stmt))))
6685 optab_query_kind = optab_vector_mixed_sign;
6688 scalar_dest = gimple_assign_lhs (stmt);
6689 scalar_type = TREE_TYPE (scalar_dest);
6690 if (!POINTER_TYPE_P (scalar_type) && !INTEGRAL_TYPE_P (scalar_type)
6691 && !SCALAR_FLOAT_TYPE_P (scalar_type))
6692 return false;
6694 /* Do not try to vectorize bit-precision reductions. */
6695 if (!type_has_mode_precision_p (scalar_type))
6696 return false;
6698 /* For lane-reducing ops we're reducing the number of reduction PHIs,
6699 which means the only use of the PHI may be in the lane-reducing operation. */
6700 if (lane_reduc_code_p
6701 && reduc_chain_length != 1
6702 && !only_slp_reduc_chain)
6704 if (dump_enabled_p ())
6705 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6706 "lane-reducing reduction with extra stmts.\n");
6707 return false;
6710 /* All uses but the last are expected to be defined in the loop.
6711 The last use is the reduction variable. In case of nested cycle this
6712 assumption is not true: we use reduc_index to record the index of the
6713 reduction variable. */
6714 slp_tree *slp_op = XALLOCAVEC (slp_tree, op_type);
6715 /* We need to skip an extra operand for COND_EXPRs with embedded
6716 comparison. */
6717 unsigned opno_adjust = 0;
6718 if (code == COND_EXPR
6719 && COMPARISON_CLASS_P (gimple_assign_rhs1 (stmt)))
6720 opno_adjust = 1;
6721 for (i = 0; i < op_type; i++)
6723 /* The condition of COND_EXPR is checked in vectorizable_condition(). */
6724 if (i == 0 && code == COND_EXPR)
6725 continue;
6727 stmt_vec_info def_stmt_info;
6728 enum vect_def_type dt;
6729 tree op;
6730 if (!vect_is_simple_use (loop_vinfo, stmt_info, slp_for_stmt_info,
6731 i + opno_adjust, &op, &slp_op[i], &dt, &tem,
6732 &def_stmt_info))
6734 if (dump_enabled_p ())
6735 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6736 "use not simple.\n");
6737 return false;
6739 if (i == STMT_VINFO_REDUC_IDX (stmt_info))
6740 continue;
6742 /* There should be only one cycle def in the stmt, the one
6743 leading to reduc_def. */
6744 if (VECTORIZABLE_CYCLE_DEF (dt))
6745 return false;
6747 /* To properly compute ncopies we are interested in the widest
6748 non-reduction input type in case we're looking at a widening
6749 accumulation that we later handle in vect_transform_reduction. */
6750 if (lane_reduc_code_p
6751 && tem
6752 && (!vectype_in
6753 || (GET_MODE_SIZE (SCALAR_TYPE_MODE (TREE_TYPE (vectype_in)))
6754 < GET_MODE_SIZE (SCALAR_TYPE_MODE (TREE_TYPE (tem))))))
6755 vectype_in = tem;
6757 if (code == COND_EXPR)
6759 /* Record how the non-reduction-def value of COND_EXPR is defined. */
6760 if (dt == vect_constant_def)
6762 cond_reduc_dt = dt;
6763 cond_reduc_val = op;
6765 if (dt == vect_induction_def
6766 && def_stmt_info
6767 && is_nonwrapping_integer_induction (def_stmt_info, loop))
6769 cond_reduc_dt = dt;
6770 cond_stmt_vinfo = def_stmt_info;
6774 if (!vectype_in)
6775 vectype_in = STMT_VINFO_VECTYPE (phi_info);
6776 STMT_VINFO_REDUC_VECTYPE_IN (reduc_info) = vectype_in;
6778 enum vect_reduction_type v_reduc_type = STMT_VINFO_REDUC_TYPE (phi_info);
6779 STMT_VINFO_REDUC_TYPE (reduc_info) = v_reduc_type;
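   /* For illustration: a condition reduction is, roughly, a loop of the form

        int last = -1;
        for (int i = 0; i < N; i++)
          if (a[i] < b[i])
            last = c[i];

      where the value carried across iterations is only updated under a
      condition.  */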
6780 /* If we have a condition reduction, see if we can simplify it further. */
6781 if (v_reduc_type == COND_REDUCTION)
6783 if (slp_node)
6784 return false;
6786 /* When the condition uses the reduction value in the condition, fail. */
6787 if (STMT_VINFO_REDUC_IDX (stmt_info) == 0)
6789 if (dump_enabled_p ())
6790 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6791 "condition depends on previous iteration\n");
6792 return false;
6795 if (reduc_chain_length == 1
6796 && direct_internal_fn_supported_p (IFN_FOLD_EXTRACT_LAST,
6797 vectype_in, OPTIMIZE_FOR_SPEED))
6799 if (dump_enabled_p ())
6800 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6801 "optimizing condition reduction with"
6802 " FOLD_EXTRACT_LAST.\n");
6803 STMT_VINFO_REDUC_TYPE (reduc_info) = EXTRACT_LAST_REDUCTION;
6805 else if (cond_reduc_dt == vect_induction_def)
6807 tree base
6808 = STMT_VINFO_LOOP_PHI_EVOLUTION_BASE_UNCHANGED (cond_stmt_vinfo);
6809 tree step = STMT_VINFO_LOOP_PHI_EVOLUTION_PART (cond_stmt_vinfo);
6811 gcc_assert (TREE_CODE (base) == INTEGER_CST
6812 && TREE_CODE (step) == INTEGER_CST);
6813 cond_reduc_val = NULL_TREE;
6814 enum tree_code cond_reduc_op_code = ERROR_MARK;
6815 tree res = PHI_RESULT (STMT_VINFO_STMT (cond_stmt_vinfo));
6816 if (!types_compatible_p (TREE_TYPE (res), TREE_TYPE (base)))
6818 /* Find a suitable value: one below base for MAX_EXPR, one above base
6819 for MIN_EXPR; punt for now if base is the minimum value of the type
6820 for MAX_EXPR or the maximum value of the type for MIN_EXPR. */
6821 else if (tree_int_cst_sgn (step) == -1)
6823 cond_reduc_op_code = MIN_EXPR;
6824 if (tree_int_cst_sgn (base) == -1)
6825 cond_reduc_val = build_int_cst (TREE_TYPE (base), 0);
6826 else if (tree_int_cst_lt (base,
6827 TYPE_MAX_VALUE (TREE_TYPE (base))))
6828 cond_reduc_val
6829 = int_const_binop (PLUS_EXPR, base, integer_one_node);
6831 else
6833 cond_reduc_op_code = MAX_EXPR;
6834 if (tree_int_cst_sgn (base) == 1)
6835 cond_reduc_val = build_int_cst (TREE_TYPE (base), 0);
6836 else if (tree_int_cst_lt (TYPE_MIN_VALUE (TREE_TYPE (base)),
6837 base))
6838 cond_reduc_val
6839 = int_const_binop (MINUS_EXPR, base, integer_one_node);
6841 if (cond_reduc_val)
6843 if (dump_enabled_p ())
6844 dump_printf_loc (MSG_NOTE, vect_location,
6845 "condition expression based on "
6846 "integer induction.\n");
6847 STMT_VINFO_REDUC_CODE (reduc_info) = cond_reduc_op_code;
6848 STMT_VINFO_VEC_INDUC_COND_INITIAL_VAL (reduc_info)
6849 = cond_reduc_val;
6850 STMT_VINFO_REDUC_TYPE (reduc_info) = INTEGER_INDUC_COND_REDUCTION;
6853 else if (cond_reduc_dt == vect_constant_def)
6855 enum vect_def_type cond_initial_dt;
6856 tree cond_initial_val = vect_phi_initial_value (reduc_def_phi);
6857 vect_is_simple_use (cond_initial_val, loop_vinfo, &cond_initial_dt);
6858 if (cond_initial_dt == vect_constant_def
6859 && types_compatible_p (TREE_TYPE (cond_initial_val),
6860 TREE_TYPE (cond_reduc_val)))
6862 tree e = fold_binary (LE_EXPR, boolean_type_node,
6863 cond_initial_val, cond_reduc_val);
6864 if (e && (integer_onep (e) || integer_zerop (e)))
6866 if (dump_enabled_p ())
6867 dump_printf_loc (MSG_NOTE, vect_location,
6868 "condition expression based on "
6869 "compile time constant.\n");
6870 /* Record reduction code at analysis stage. */
6871 STMT_VINFO_REDUC_CODE (reduc_info)
6872 = integer_onep (e) ? MAX_EXPR : MIN_EXPR;
6873 STMT_VINFO_REDUC_TYPE (reduc_info) = CONST_COND_REDUCTION;
6879 if (STMT_VINFO_LIVE_P (phi_info))
6880 return false;
6882 if (slp_node)
6883 ncopies = 1;
6884 else
6885 ncopies = vect_get_num_copies (loop_vinfo, vectype_in);
6887 gcc_assert (ncopies >= 1);
6889 poly_uint64 nunits_out = TYPE_VECTOR_SUBPARTS (vectype_out);
6891 if (nested_cycle)
6893 gcc_assert (STMT_VINFO_DEF_TYPE (reduc_info)
6894 == vect_double_reduction_def);
6895 double_reduc = true;
6898 /* 4.2. Check support for the epilog operation.
6900 If STMT represents a reduction pattern, then the type of the
6901 reduction variable may be different than the type of the rest
6902 of the arguments. For example, consider the case of accumulation
6903 of shorts into an int accumulator. The original code:
6904 S1: int_a = (int) short_a;
6905 orig_stmt-> S2: int_acc = plus <int_a ,int_acc>;
6907 was replaced with:
6908 STMT: int_acc = widen_sum <short_a, int_acc>
6910 This means that:
6911 1. The tree-code that is used to create the vector operation in the
6912 epilog code (that reduces the partial results) is not the
6913 tree-code of STMT, but is rather the tree-code of the original
6914 stmt from the pattern that STMT is replacing. I.e., in the example
6915 above we want to use 'widen_sum' in the loop, but 'plus' in the
6916 epilog.
6917 2. The type (mode) we use to check available target support
6918 for the vector operation to be created in the *epilog*, is
6919 determined by the type of the reduction variable (in the example
6920 above we'd check this: optab_handler (plus_optab, vect_int_mode)).
6921 However the type (mode) we use to check available target support
6922 for the vector operation to be created *inside the loop*, is
6923 determined by the type of the other arguments to STMT (in the
6924 example we'd check this: optab_handler (widen_sum_optab,
6925 vect_short_mode)).
6927 This is contrary to "regular" reductions, in which the types of all
6928 the arguments are the same as the type of the reduction variable.
6929 For "regular" reductions we can therefore use the same vector type
6930 (and also the same tree-code) when generating the epilog code and
6931 when generating the code inside the loop. */
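   /* Concretely, the widen_sum example above corresponds to source such as

        short s[N];
        int sum = 0;
        for (int i = 0; i < N; i++)
          sum += s[i];

      where the loop body uses a widening sum on vectors of shorts while the
      epilog reduces a single vector of ints with plain addition.  */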
6933 enum tree_code orig_code = STMT_VINFO_REDUC_CODE (phi_info);
6934 STMT_VINFO_REDUC_CODE (reduc_info) = orig_code;
6936 vect_reduction_type reduction_type = STMT_VINFO_REDUC_TYPE (reduc_info);
6937 if (reduction_type == TREE_CODE_REDUCTION)
6939 /* Check whether it's ok to change the order of the computation.
6940 Generally, when vectorizing a reduction we change the order of the
6941 computation. This may change the behavior of the program in some
6942 cases, so we need to check that this is ok. One exception is when
6943 vectorizing an outer-loop: the inner-loop is executed sequentially,
6944 and therefore vectorizing reductions in the inner-loop during
6945 outer-loop vectorization is safe. Likewise when we are vectorizing
6946 a series of reductions using SLP and the VF is one the reductions
6947 are performed in scalar order. */
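      /* For instance, vectorizing

           double s = 0.0;
           for (int i = 0; i < N; i++)
             s += a[i];

         with VF 4 keeps four partial sums that are only combined after the
         loop, which reassociates the additions and can change the rounded
         result; needs_fold_left_reduction_p detects such cases.  */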
6948 if (slp_node
6949 && !REDUC_GROUP_FIRST_ELEMENT (stmt_info)
6950 && known_eq (LOOP_VINFO_VECT_FACTOR (loop_vinfo), 1u))
6952 else if (needs_fold_left_reduction_p (scalar_type, orig_code))
6954 /* When vectorizing a reduction chain w/o SLP the reduction PHI
6955 is not directly used in stmt. */
6956 if (!only_slp_reduc_chain
6957 && reduc_chain_length != 1)
6959 if (dump_enabled_p ())
6960 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6961 "in-order reduction chain without SLP.\n");
6962 return false;
6964 STMT_VINFO_REDUC_TYPE (reduc_info)
6965 = reduction_type = FOLD_LEFT_REDUCTION;
6967 else if (!commutative_tree_code (orig_code)
6968 || !associative_tree_code (orig_code))
6970 if (dump_enabled_p ())
6971 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6972 "reduction: not commutative/associative");
6973 return false;
6977 if ((double_reduc || reduction_type != TREE_CODE_REDUCTION)
6978 && ncopies > 1)
6980 if (dump_enabled_p ())
6981 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6982 "multiple types in double reduction or condition "
6983 "reduction or fold-left reduction.\n");
6984 return false;
6987 internal_fn reduc_fn = IFN_LAST;
6988 if (reduction_type == TREE_CODE_REDUCTION
6989 || reduction_type == FOLD_LEFT_REDUCTION
6990 || reduction_type == INTEGER_INDUC_COND_REDUCTION
6991 || reduction_type == CONST_COND_REDUCTION)
6993 if (reduction_type == FOLD_LEFT_REDUCTION
6994 ? fold_left_reduction_fn (orig_code, &reduc_fn)
6995 : reduction_fn_for_scalar_code (orig_code, &reduc_fn))
6997 if (reduc_fn != IFN_LAST
6998 && !direct_internal_fn_supported_p (reduc_fn, vectype_out,
6999 OPTIMIZE_FOR_SPEED))
7001 if (dump_enabled_p ())
7002 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7003 "reduc op not supported by target.\n");
7005 reduc_fn = IFN_LAST;
7008 else
7010 if (!nested_cycle || double_reduc)
7012 if (dump_enabled_p ())
7013 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7014 "no reduc code for scalar code.\n");
7016 return false;
7020 else if (reduction_type == COND_REDUCTION)
7022 int scalar_precision
7023 = GET_MODE_PRECISION (SCALAR_TYPE_MODE (scalar_type));
7024 cr_index_scalar_type = make_unsigned_type (scalar_precision);
7025 cr_index_vector_type = get_same_sized_vectype (cr_index_scalar_type,
7026 vectype_out);
7028 if (direct_internal_fn_supported_p (IFN_REDUC_MAX, cr_index_vector_type,
7029 OPTIMIZE_FOR_SPEED))
7030 reduc_fn = IFN_REDUC_MAX;
7032 STMT_VINFO_REDUC_FN (reduc_info) = reduc_fn;
7034 if (reduction_type != EXTRACT_LAST_REDUCTION
7035 && (!nested_cycle || double_reduc)
7036 && reduc_fn == IFN_LAST
7037 && !nunits_out.is_constant ())
7039 if (dump_enabled_p ())
7040 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7041 "missing target support for reduction on"
7042 " variable-length vectors.\n");
7043 return false;
7046 /* For SLP reductions, see if there is a neutral value we can use. */
7047 tree neutral_op = NULL_TREE;
7048 if (slp_node)
7050 tree initial_value = NULL_TREE;
7051 if (REDUC_GROUP_FIRST_ELEMENT (stmt_info) != NULL)
7052 initial_value = vect_phi_initial_value (reduc_def_phi);
7053 neutral_op = neutral_op_for_reduction (TREE_TYPE (vectype_out),
7054 orig_code, initial_value);
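   /* The neutral value is the identity element of the reduction operation:
      e.g. 0 for PLUS_EXPR, 1 for MULT_EXPR, all-ones for BIT_AND_EXPR,
      0 for BIT_IOR_EXPR and BIT_XOR_EXPR; for MIN_EXPR and MAX_EXPR the
      initial value itself is used.  */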
7057 if (double_reduc && reduction_type == FOLD_LEFT_REDUCTION)
7059 /* We can't support in-order reductions of code such as this:
7061 for (int i = 0; i < n1; ++i)
7062 for (int j = 0; j < n2; ++j)
7063 l += a[j];
7065 since GCC effectively transforms the loop when vectorizing:
7067 for (int i = 0; i < n1 / VF; ++i)
7068 for (int j = 0; j < n2; ++j)
7069 for (int k = 0; k < VF; ++k)
7070 l += a[j];
7072 which is a reassociation of the original operation. */
7073 if (dump_enabled_p ())
7074 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7075 "in-order double reduction not supported.\n");
7077 return false;
7080 if (reduction_type == FOLD_LEFT_REDUCTION
7081 && slp_node
7082 && !REDUC_GROUP_FIRST_ELEMENT (stmt_info))
7084 /* We cannot use in-order reductions in this case because there is
7085 an implicit reassociation of the operations involved. */
7086 if (dump_enabled_p ())
7087 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7088 "in-order unchained SLP reductions not supported.\n");
7089 return false;
7092 /* For double reductions, and for SLP reductions with a neutral value,
7093 we construct a variable-length initial vector by loading a vector
7094 full of the neutral value and then shift-and-inserting the start
7095 values into the low-numbered elements. */
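   /* E.g. for a sum with start value s and a variable number of lanes the
      initial vector is built roughly as { s, 0, 0, ... }: a splat of the
      neutral value 0 followed by a shift-and-insert of s into lane 0.  */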
7096 if ((double_reduc || neutral_op)
7097 && !nunits_out.is_constant ()
7098 && !direct_internal_fn_supported_p (IFN_VEC_SHL_INSERT,
7099 vectype_out, OPTIMIZE_FOR_SPEED))
7101 if (dump_enabled_p ())
7102 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7103 "reduction on variable-length vectors requires"
7104 " target support for a vector-shift-and-insert"
7105 " operation.\n");
7106 return false;
7109 /* Check extra constraints for variable-length unchained SLP reductions. */
7110 if (STMT_SLP_TYPE (stmt_info)
7111 && !REDUC_GROUP_FIRST_ELEMENT (stmt_info)
7112 && !nunits_out.is_constant ())
7114 /* We checked above that we could build the initial vector when
7115 there's a neutral element value. Check here for the case in
7116 which each SLP statement has its own initial value and in which
7117 that value needs to be repeated for every instance of the
7118 statement within the initial vector. */
7119 unsigned int group_size = SLP_TREE_LANES (slp_node);
7120 if (!neutral_op
7121 && !can_duplicate_and_interleave_p (loop_vinfo, group_size,
7122 TREE_TYPE (vectype_out)))
7124 if (dump_enabled_p ())
7125 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7126 "unsupported form of SLP reduction for"
7127 " variable-length vectors: cannot build"
7128 " initial vector.\n");
7129 return false;
7131 /* The epilogue code relies on the number of elements being a multiple
7132 of the group size. The duplicate-and-interleave approach to setting
7133 up the initial vector does too. */
7134 if (!multiple_p (nunits_out, group_size))
7136 if (dump_enabled_p ())
7137 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7138 "unsupported form of SLP reduction for"
7139 " variable-length vectors: the vector size"
7140 " is not a multiple of the number of results.\n");
7141 return false;
7145 if (reduction_type == COND_REDUCTION)
7147 widest_int ni;
7149 if (! max_loop_iterations (loop, &ni))
7151 if (dump_enabled_p ())
7152 dump_printf_loc (MSG_NOTE, vect_location,
7153 "loop count not known, cannot create cond "
7154 "reduction.\n");
7155 return false;
7157 /* Convert backedges to iterations. */
7158 ni += 1;
7160 /* The additional index will be the same type as the condition. Check
7161 that the loop can fit into this less one (because we'll use up the
7162 zero slot for when there are no matches). */
7163 tree max_index = TYPE_MAX_VALUE (cr_index_scalar_type);
7164 if (wi::geu_p (ni, wi::to_widest (max_index)))
7166 if (dump_enabled_p ())
7167 dump_printf_loc (MSG_NOTE, vect_location,
7168 "loop size is greater than data size.\n");
7169 return false;
7173 /* In case the vectorization factor (VF) is bigger than the number
7174 of elements that we can fit in a vectype (nunits), we have to generate
7175 more than one vector stmt - i.e., we need to "unroll" the
7176 vector stmt by a factor VF/nunits. For more details see documentation
7177 in vectorizable_operation. */
7179 /* If the reduction is used in an outer loop we need to generate
7180 VF intermediate results, like so (e.g. for ncopies=2):
7181 r0 = phi (init, r0)
7182 r1 = phi (init, r1)
7183 r0 = x0 + r0;
7184 r1 = x1 + r1;
7185 (i.e. we generate VF results in 2 registers).
7186 In this case we have a separate def-use cycle for each copy, and therefore
7187 for each copy we get the vector def for the reduction variable from the
7188 respective phi node created for this copy.
7190 Otherwise (the reduction is unused in the loop nest), we can combine
7191 together intermediate results, like so (e.g. for ncopies=2):
7192 r = phi (init, r)
7193 r = x0 + r;
7194 r = x1 + r;
7195 (i.e. we generate VF/2 results in a single register).
7196 In this case for each copy we get the vector def for the reduction variable
7197 from the vectorized reduction operation generated in the previous iteration.
7199 This only works when we see both the reduction PHI and its only consumer
7200 in vectorizable_reduction and there are no intermediate stmts
7201 participating. */
7202 if (ncopies > 1
7203 && (STMT_VINFO_RELEVANT (stmt_info) <= vect_used_only_live)
7204 && reduc_chain_length == 1)
7205 single_defuse_cycle = true;
7207 if (single_defuse_cycle || lane_reduc_code_p)
7209 gcc_assert (code != COND_EXPR);
7211 /* 4. Supportable by target? */
7212 bool ok = true;
7214 /* 4.1. check support for the operation in the loop */
7215 optab optab = optab_for_tree_code (code, vectype_in, optab_query_kind);
7216 if (!optab)
7218 if (dump_enabled_p ())
7219 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7220 "no optab.\n");
7221 ok = false;
7224 machine_mode vec_mode = TYPE_MODE (vectype_in);
7225 if (ok && optab_handler (optab, vec_mode) == CODE_FOR_nothing)
7227 if (dump_enabled_p ())
7228 dump_printf (MSG_NOTE, "op not supported by target.\n");
7229 if (maybe_ne (GET_MODE_SIZE (vec_mode), UNITS_PER_WORD)
7230 || !vect_can_vectorize_without_simd_p (code))
7231 ok = false;
7232 else
7233 if (dump_enabled_p ())
7234 dump_printf (MSG_NOTE, "proceeding using word mode.\n");
7237 if (vect_emulated_vector_p (vectype_in)
7238 && !vect_can_vectorize_without_simd_p (code))
7240 if (dump_enabled_p ())
7241 dump_printf (MSG_NOTE, "using word mode not possible.\n");
7242 return false;
7245 /* lane-reducing operations have to go through vect_transform_reduction.
7246 For the other cases try without the single cycle optimization. */
7247 if (!ok)
7249 if (lane_reduc_code_p)
7250 return false;
7251 else
7252 single_defuse_cycle = false;
7255 STMT_VINFO_FORCE_SINGLE_CYCLE (reduc_info) = single_defuse_cycle;
7257 /* If the reduction stmt is one of the patterns that have a lane
7258 reduction embedded, we cannot handle the case of ! single_defuse_cycle. */
7259 if ((ncopies > 1 && ! single_defuse_cycle)
7260 && lane_reduc_code_p)
7262 if (dump_enabled_p ())
7263 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7264 "multi def-use cycle not possible for lane-reducing "
7265 "reduction operation\n");
7266 return false;
7269 if (slp_node
7270 && !(!single_defuse_cycle
7271 && code != DOT_PROD_EXPR
7272 && code != WIDEN_SUM_EXPR
7273 && code != SAD_EXPR
7274 && reduction_type != FOLD_LEFT_REDUCTION))
7275 for (i = 0; i < op_type; i++)
7276 if (!vect_maybe_update_slp_op_vectype (slp_op[i], vectype_in))
7278 if (dump_enabled_p ())
7279 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7280 "incompatible vector types for invariants\n");
7281 return false;
7284 if (slp_node)
7285 vec_num = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node);
7286 else
7287 vec_num = 1;
7289 vect_model_reduction_cost (loop_vinfo, stmt_info, reduc_fn,
7290 reduction_type, ncopies, cost_vec);
7291 /* Cost the reduction op inside the loop if transformed via
7292 vect_transform_reduction. Otherwise this is costed by the
7293 separate vectorizable_* routines. */
7294 if (single_defuse_cycle
7295 || code == DOT_PROD_EXPR
7296 || code == WIDEN_SUM_EXPR
7297 || code == SAD_EXPR)
7298 record_stmt_cost (cost_vec, ncopies, vector_stmt, stmt_info, 0, vect_body);
7300 if (dump_enabled_p ()
7301 && reduction_type == FOLD_LEFT_REDUCTION)
7302 dump_printf_loc (MSG_NOTE, vect_location,
7303 "using an in-order (fold-left) reduction.\n");
7304 STMT_VINFO_TYPE (orig_stmt_of_analysis) = cycle_phi_info_type;
7305 /* All reductions except single def-use-cycle optimized, lane-reducing
7306 and fold-left ones go through their own vectorizable_* routines. */
7307 if (!single_defuse_cycle
7308 && code != DOT_PROD_EXPR
7309 && code != WIDEN_SUM_EXPR
7310 && code != SAD_EXPR
7311 && reduction_type != FOLD_LEFT_REDUCTION)
7313 stmt_vec_info tem
7314 = vect_stmt_to_vectorize (STMT_VINFO_REDUC_DEF (phi_info));
7315 if (slp_node && REDUC_GROUP_FIRST_ELEMENT (tem))
7317 gcc_assert (!REDUC_GROUP_NEXT_ELEMENT (tem));
7318 tem = REDUC_GROUP_FIRST_ELEMENT (tem);
7320 STMT_VINFO_DEF_TYPE (vect_orig_stmt (tem)) = vect_internal_def;
7321 STMT_VINFO_DEF_TYPE (tem) = vect_internal_def;
7323 else if (loop_vinfo && LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo))
7325 vec_loop_masks *masks = &LOOP_VINFO_MASKS (loop_vinfo);
7326 internal_fn cond_fn = get_conditional_internal_fn (code);
7328 if (reduction_type != FOLD_LEFT_REDUCTION
7329 && !use_mask_by_cond_expr_p (code, cond_fn, vectype_in)
7330 && (cond_fn == IFN_LAST
7331 || !direct_internal_fn_supported_p (cond_fn, vectype_in,
7332 OPTIMIZE_FOR_SPEED)))
7334 if (dump_enabled_p ())
7335 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7336 "can't operate on partial vectors because"
7337 " no conditional operation is available.\n");
7338 LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo) = false;
7340 else if (reduction_type == FOLD_LEFT_REDUCTION
7341 && reduc_fn == IFN_LAST
7342 && !expand_vec_cond_expr_p (vectype_in,
7343 truth_type_for (vectype_in),
7344 SSA_NAME))
7346 if (dump_enabled_p ())
7347 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7348 "can't operate on partial vectors because"
7349 " no conditional operation is available.\n");
7350 LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo) = false;
7352 else
7353 vect_record_loop_mask (loop_vinfo, masks, ncopies * vec_num,
7354 vectype_in, NULL);
7356 return true;
7359 /* Transform the definition stmt STMT_INFO of a reduction PHI backedge
7360 value. */
7362 bool
7363 vect_transform_reduction (loop_vec_info loop_vinfo,
7364 stmt_vec_info stmt_info, gimple_stmt_iterator *gsi,
7365 gimple **vec_stmt, slp_tree slp_node)
7367 tree vectype_out = STMT_VINFO_VECTYPE (stmt_info);
7368 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
7369 int i;
7370 int ncopies;
7371 int vec_num;
7373 stmt_vec_info reduc_info = info_for_reduction (loop_vinfo, stmt_info);
7374 gcc_assert (reduc_info->is_reduc_info);
7376 if (nested_in_vect_loop_p (loop, stmt_info))
7378 loop = loop->inner;
7379 gcc_assert (STMT_VINFO_DEF_TYPE (reduc_info) == vect_double_reduction_def);
7382 gassign *stmt = as_a <gassign *> (stmt_info->stmt);
7383 enum tree_code code = gimple_assign_rhs_code (stmt);
7384 int op_type = TREE_CODE_LENGTH (code);
7386 /* Flatten RHS. */
7387 tree ops[3];
7388 switch (get_gimple_rhs_class (code))
7390 case GIMPLE_TERNARY_RHS:
7391 ops[2] = gimple_assign_rhs3 (stmt);
7392 /* Fall thru. */
7393 case GIMPLE_BINARY_RHS:
7394 ops[0] = gimple_assign_rhs1 (stmt);
7395 ops[1] = gimple_assign_rhs2 (stmt);
7396 break;
7397 default:
7398 gcc_unreachable ();
7401 /* All uses but the last are expected to be defined in the loop.
7402 The last use is the reduction variable. In case of nested cycle this
7403 assumption is not true: we use reduc_index to record the index of the
7404 reduction variable. */
7405 stmt_vec_info phi_info = STMT_VINFO_REDUC_DEF (vect_orig_stmt (stmt_info));
7406 gphi *reduc_def_phi = as_a <gphi *> (phi_info->stmt);
7407 int reduc_index = STMT_VINFO_REDUC_IDX (stmt_info);
7408 tree vectype_in = STMT_VINFO_REDUC_VECTYPE_IN (reduc_info);
7410 if (slp_node)
7412 ncopies = 1;
7413 vec_num = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node);
7415 else
7417 ncopies = vect_get_num_copies (loop_vinfo, vectype_in);
7418 vec_num = 1;
7421 internal_fn cond_fn = get_conditional_internal_fn (code);
7422 vec_loop_masks *masks = &LOOP_VINFO_MASKS (loop_vinfo);
7423 bool mask_by_cond_expr = use_mask_by_cond_expr_p (code, cond_fn, vectype_in);
7425 /* Transform. */
7426 tree new_temp = NULL_TREE;
7427 auto_vec<tree> vec_oprnds0;
7428 auto_vec<tree> vec_oprnds1;
7429 auto_vec<tree> vec_oprnds2;
7430 tree def0;
7432 if (dump_enabled_p ())
7433 dump_printf_loc (MSG_NOTE, vect_location, "transform reduction.\n");
7435 /* FORNOW: Multiple types are not supported for condition. */
7436 if (code == COND_EXPR)
7437 gcc_assert (ncopies == 1);
7439 bool masked_loop_p = LOOP_VINFO_FULLY_MASKED_P (loop_vinfo);
7441 vect_reduction_type reduction_type = STMT_VINFO_REDUC_TYPE (reduc_info);
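   /* A fold-left (in-order) reduction accumulates the elements of each
      vector strictly in their original order, preserving the scalar
      evaluation order (and hence FP rounding) at the cost of a serial
      chain of operations.  */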
7442 if (reduction_type == FOLD_LEFT_REDUCTION)
7444 internal_fn reduc_fn = STMT_VINFO_REDUC_FN (reduc_info);
7445 return vectorize_fold_left_reduction
7446 (loop_vinfo, stmt_info, gsi, vec_stmt, slp_node, reduc_def_phi, code,
7447 reduc_fn, ops, vectype_in, reduc_index, masks);
7450 bool single_defuse_cycle = STMT_VINFO_FORCE_SINGLE_CYCLE (reduc_info);
7451 gcc_assert (single_defuse_cycle
7452 || code == DOT_PROD_EXPR
7453 || code == WIDEN_SUM_EXPR
7454 || code == SAD_EXPR);
7456 /* Create the destination vector */
7457 tree scalar_dest = gimple_assign_lhs (stmt);
7458 tree vec_dest = vect_create_destination_var (scalar_dest, vectype_out);
7460 vect_get_vec_defs (loop_vinfo, stmt_info, slp_node, ncopies,
7461 single_defuse_cycle && reduc_index == 0
7462 ? NULL_TREE : ops[0], &vec_oprnds0,
7463 single_defuse_cycle && reduc_index == 1
7464 ? NULL_TREE : ops[1], &vec_oprnds1,
7465 op_type == ternary_op
7466 && !(single_defuse_cycle && reduc_index == 2)
7467 ? ops[2] : NULL_TREE, &vec_oprnds2);
7468 if (single_defuse_cycle)
7470 gcc_assert (!slp_node);
7471 vect_get_vec_defs_for_operand (loop_vinfo, stmt_info, 1,
7472 ops[reduc_index],
7473 reduc_index == 0 ? &vec_oprnds0
7474 : (reduc_index == 1 ? &vec_oprnds1
7475 : &vec_oprnds2));
7478 FOR_EACH_VEC_ELT (vec_oprnds0, i, def0)
7480 gimple *new_stmt;
7481 tree vop[3] = { def0, vec_oprnds1[i], NULL_TREE };
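      /* In the fully-masked case without mask_by_cond_expr the statement is
         emitted as a conditional internal function call
         COND_<OP> (mask, vop[0], vop[1], vop[0]), which computes
         mask ? vop[0] <op> vop[1] : vop[0] per lane, so inactive lanes
         simply keep the accumulator value.  */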
7482 if (masked_loop_p && !mask_by_cond_expr)
7484 /* Make sure that the reduction accumulator is vop[0]. */
7485 if (reduc_index == 1)
7487 gcc_assert (commutative_tree_code (code));
7488 std::swap (vop[0], vop[1]);
7490 tree mask = vect_get_loop_mask (gsi, masks, vec_num * ncopies,
7491 vectype_in, i);
7492 gcall *call = gimple_build_call_internal (cond_fn, 4, mask,
7493 vop[0], vop[1], vop[0]);
7494 new_temp = make_ssa_name (vec_dest, call);
7495 gimple_call_set_lhs (call, new_temp);
7496 gimple_call_set_nothrow (call, true);
7497 vect_finish_stmt_generation (loop_vinfo, stmt_info, call, gsi);
7498 new_stmt = call;
7500 else
7502 if (op_type == ternary_op)
7503 vop[2] = vec_oprnds2[i];
7505 if (masked_loop_p && mask_by_cond_expr)
7507 tree mask = vect_get_loop_mask (gsi, masks, vec_num * ncopies,
7508 vectype_in, i);
7509 build_vect_cond_expr (code, vop, mask, gsi);
7512 new_stmt = gimple_build_assign (vec_dest, code,
7513 vop[0], vop[1], vop[2]);
7514 new_temp = make_ssa_name (vec_dest, new_stmt);
7515 gimple_assign_set_lhs (new_stmt, new_temp);
7516 vect_finish_stmt_generation (loop_vinfo, stmt_info, new_stmt, gsi);
7519 if (slp_node)
7520 SLP_TREE_VEC_STMTS (slp_node).quick_push (new_stmt);
7521 else if (single_defuse_cycle
7522 && i < ncopies - 1)
7524 if (reduc_index == 0)
7525 vec_oprnds0.safe_push (gimple_get_lhs (new_stmt));
7526 else if (reduc_index == 1)
7527 vec_oprnds1.safe_push (gimple_get_lhs (new_stmt));
7528 else if (reduc_index == 2)
7529 vec_oprnds2.safe_push (gimple_get_lhs (new_stmt));
7531 else
7532 STMT_VINFO_VEC_STMTS (stmt_info).safe_push (new_stmt);
7535 if (!slp_node)
7536 *vec_stmt = STMT_VINFO_VEC_STMTS (stmt_info)[0];
7538 return true;
7541 /* Transform phase of a cycle PHI. */
7543 bool
7544 vect_transform_cycle_phi (loop_vec_info loop_vinfo,
7545 stmt_vec_info stmt_info, gimple **vec_stmt,
7546 slp_tree slp_node, slp_instance slp_node_instance)
7548 tree vectype_out = STMT_VINFO_VECTYPE (stmt_info);
7549 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
7550 int i;
7551 int ncopies;
7552 int j;
7553 bool nested_cycle = false;
7554 int vec_num;
7556 if (nested_in_vect_loop_p (loop, stmt_info))
7558 loop = loop->inner;
7559 nested_cycle = true;
7562 stmt_vec_info reduc_stmt_info = STMT_VINFO_REDUC_DEF (stmt_info);
7563 reduc_stmt_info = vect_stmt_to_vectorize (reduc_stmt_info);
7564 stmt_vec_info reduc_info = info_for_reduction (loop_vinfo, stmt_info);
7565 gcc_assert (reduc_info->is_reduc_info);
7567 if (STMT_VINFO_REDUC_TYPE (reduc_info) == EXTRACT_LAST_REDUCTION
7568 || STMT_VINFO_REDUC_TYPE (reduc_info) == FOLD_LEFT_REDUCTION)
7569 /* Leave the scalar phi in place. */
7570 return true;
7572 tree vectype_in = STMT_VINFO_REDUC_VECTYPE_IN (reduc_info);
7573 /* For a nested cycle we do not fill the above. */
7574 if (!vectype_in)
7575 vectype_in = STMT_VINFO_VECTYPE (stmt_info);
7576 gcc_assert (vectype_in);
7578 if (slp_node)
7580 /* The size vect_schedule_slp_instance computes is off for us. */
7581 vec_num = vect_get_num_vectors (LOOP_VINFO_VECT_FACTOR (loop_vinfo)
7582 * SLP_TREE_LANES (slp_node), vectype_in);
7583 ncopies = 1;
7585 else
7587 vec_num = 1;
7588 ncopies = vect_get_num_copies (loop_vinfo, vectype_in);
7591 /* Check whether we should use a single PHI node and accumulate
7592 vectors to one before the backedge. */
7593 if (STMT_VINFO_FORCE_SINGLE_CYCLE (reduc_info))
7594 ncopies = 1;
7596 /* Create the destination vector */
7597 gphi *phi = as_a <gphi *> (stmt_info->stmt);
7598 tree vec_dest = vect_create_destination_var (gimple_phi_result (phi),
7599 vectype_out);
7601 /* Get the loop-entry arguments. */
7602 tree vec_initial_def = NULL_TREE;
7603 auto_vec<tree> vec_initial_defs;
7604 if (slp_node)
7606 vec_initial_defs.reserve (vec_num);
7607 if (nested_cycle)
7609 unsigned phi_idx = loop_preheader_edge (loop)->dest_idx;
7610 vect_get_slp_defs (SLP_TREE_CHILDREN (slp_node)[phi_idx],
7611 &vec_initial_defs);
7613 else
7615 gcc_assert (slp_node == slp_node_instance->reduc_phis);
7616 vec<tree> &initial_values = reduc_info->reduc_initial_values;
7617 vec<stmt_vec_info> &stmts = SLP_TREE_SCALAR_STMTS (slp_node);
7619 unsigned int num_phis = stmts.length ();
7620 if (REDUC_GROUP_FIRST_ELEMENT (reduc_stmt_info))
7621 num_phis = 1;
7622 initial_values.reserve (num_phis);
7623 for (unsigned int i = 0; i < num_phis; ++i)
7625 gphi *this_phi = as_a<gphi *> (stmts[i]->stmt);
7626 initial_values.quick_push (vect_phi_initial_value (this_phi));
7628 if (vec_num == 1)
7629 vect_find_reusable_accumulator (loop_vinfo, reduc_info);
7630 if (!initial_values.is_empty ())
7632 tree initial_value
7633 = (num_phis == 1 ? initial_values[0] : NULL_TREE);
7634 tree_code code = STMT_VINFO_REDUC_CODE (reduc_info);
7635 tree neutral_op
7636 = neutral_op_for_reduction (TREE_TYPE (vectype_out),
7637 code, initial_value);
7638 get_initial_defs_for_reduction (loop_vinfo, reduc_info,
7639 &vec_initial_defs, vec_num,
7640 stmts.length (), neutral_op);
7644 else
7646 /* Get at the scalar def before the loop, that defines the initial
7647 value of the reduction variable. */
7648 tree initial_def = vect_phi_initial_value (phi);
7649 reduc_info->reduc_initial_values.safe_push (initial_def);
7650 /* Optimize: if initial_def is for REDUC_MAX smaller than the base
7651 and we can't use zero for induc_val, use initial_def. Similarly
7652 for REDUC_MIN and initial_def larger than the base. */
7653 if (STMT_VINFO_REDUC_TYPE (reduc_info) == INTEGER_INDUC_COND_REDUCTION)
7655 tree induc_val = STMT_VINFO_VEC_INDUC_COND_INITIAL_VAL (reduc_info);
7656 if (TREE_CODE (initial_def) == INTEGER_CST
7657 && !integer_zerop (induc_val)
7658 && ((STMT_VINFO_REDUC_CODE (reduc_info) == MAX_EXPR
7659 && tree_int_cst_lt (initial_def, induc_val))
7660 || (STMT_VINFO_REDUC_CODE (reduc_info) == MIN_EXPR
7661 && tree_int_cst_lt (induc_val, initial_def))))
7663 induc_val = initial_def;
7664 /* Communicate to epilogue generation that we used
7665 the initial_def. */
7666 STMT_VINFO_VEC_INDUC_COND_INITIAL_VAL (reduc_info) = NULL_TREE;
7668 vec_initial_def = build_vector_from_val (vectype_out, induc_val);
7670 else if (nested_cycle)
7672 /* Do not use an adjustment def as that case is not supported
7673 correctly if ncopies is not one. */
7674 vect_get_vec_defs_for_operand (loop_vinfo, reduc_stmt_info,
7675 ncopies, initial_def,
7676 &vec_initial_defs);
7678 else if (STMT_VINFO_REDUC_TYPE (reduc_info) == CONST_COND_REDUCTION
7679 || STMT_VINFO_REDUC_TYPE (reduc_info) == COND_REDUCTION)
7680 /* Fill the initial vector with the initial scalar value. */
7681 vec_initial_def
7682 = get_initial_def_for_reduction (loop_vinfo, reduc_stmt_info,
7683 initial_def, initial_def);
7684 else
7686 if (ncopies == 1)
7687 vect_find_reusable_accumulator (loop_vinfo, reduc_info);
7688 if (!reduc_info->reduc_initial_values.is_empty ())
7690 initial_def = reduc_info->reduc_initial_values[0];
7691 enum tree_code code = STMT_VINFO_REDUC_CODE (reduc_info);
7692 tree neutral_op
7693 = neutral_op_for_reduction (TREE_TYPE (initial_def),
7694 code, initial_def);
7695 gcc_assert (neutral_op);
7696 /* Try to simplify the vector initialization by applying an
7697 adjustment after the reduction has been performed. */
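         /* E.g. for

              int sum = 5;
              for (int i = 0; i < N; i++)
                sum += a[i];

            the vector accumulator can start from the neutral value 0, with
            the initial 5 added back once after the epilog reduction via
            STMT_VINFO_REDUC_EPILOGUE_ADJUSTMENT.  */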
7698 if (!reduc_info->reused_accumulator
7699 && STMT_VINFO_DEF_TYPE (stmt_info) == vect_reduction_def
7700 && !operand_equal_p (neutral_op, initial_def))
7702 STMT_VINFO_REDUC_EPILOGUE_ADJUSTMENT (reduc_info)
7703 = initial_def;
7704 initial_def = neutral_op;
7706 vec_initial_def
7707 = get_initial_def_for_reduction (loop_vinfo, reduc_info,
7708 initial_def, neutral_op);
7713 if (vec_initial_def)
7715 vec_initial_defs.create (ncopies);
7716 for (i = 0; i < ncopies; ++i)
7717 vec_initial_defs.quick_push (vec_initial_def);
7720 if (auto *accumulator = reduc_info->reused_accumulator)
7722 tree def = accumulator->reduc_input;
7723 if (!useless_type_conversion_p (vectype_out, TREE_TYPE (def)))
7725 unsigned int nreduc;
7726 bool res = constant_multiple_p (TYPE_VECTOR_SUBPARTS
7727 (TREE_TYPE (def)),
7728 TYPE_VECTOR_SUBPARTS (vectype_out),
7729 &nreduc);
7730 gcc_assert (res);
7731 gimple_seq stmts = NULL;
7732 /* Reduce the single vector to a smaller one. */
7733 if (nreduc != 1)
7735 /* Perform the reduction in the appropriate type. */
7736 tree rvectype = vectype_out;
7737 if (!useless_type_conversion_p (TREE_TYPE (vectype_out),
7738 TREE_TYPE (TREE_TYPE (def))))
7739 rvectype = build_vector_type (TREE_TYPE (TREE_TYPE (def)),
7740 TYPE_VECTOR_SUBPARTS
7741 (vectype_out));
7742 def = vect_create_partial_epilog (def, rvectype,
7743 STMT_VINFO_REDUC_CODE
7744 (reduc_info),
7745 &stmts);
7747 /* Adjust the input so we pick up the partially reduced value
7748 for the skip edge in vect_create_epilog_for_reduction. */
7749 accumulator->reduc_input = def;
7750 if (!useless_type_conversion_p (vectype_out, TREE_TYPE (def)))
7751 def = gimple_convert (&stmts, vectype_out, def);
7752 if (loop_vinfo->main_loop_edge)
7754 /* While we'd like to insert on the edge, this would split
7755 blocks and disturb bookkeeping; we will also eventually
7756 need this on the skip edge. Rely on sinking to fix up
7757 the optimal placement and insert in the predecessor. */
7758 gimple_stmt_iterator gsi
7759 = gsi_last_bb (loop_vinfo->main_loop_edge->src);
7760 /* Insert before a cond that eventually skips the
7761 epilogue. */
7762 if (!gsi_end_p (gsi) && stmt_ends_bb_p (gsi_stmt (gsi)))
7763 gsi_prev (&gsi);
7764 gsi_insert_seq_after (&gsi, stmts, GSI_CONTINUE_LINKING);
7766 else
7767 gsi_insert_seq_on_edge_immediate (loop_preheader_edge (loop),
7768 stmts);
7770 if (loop_vinfo->main_loop_edge)
7771 vec_initial_defs[0]
7772 = vect_get_main_loop_result (loop_vinfo, def,
7773 vec_initial_defs[0]);
7774 else
7775 vec_initial_defs.safe_push (def);
7778 /* Generate the reduction PHIs upfront. */
7779 for (i = 0; i < vec_num; i++)
7781 tree vec_init_def = vec_initial_defs[i];
7782 for (j = 0; j < ncopies; j++)
7784 /* Create the reduction-phi that defines the reduction
7785 operand. */
7786 gphi *new_phi = create_phi_node (vec_dest, loop->header);
7788 /* Set the loop-entry arg of the reduction-phi. */
7789 if (j != 0 && nested_cycle)
7790 vec_init_def = vec_initial_defs[j];
7791 add_phi_arg (new_phi, vec_init_def, loop_preheader_edge (loop),
7792 UNKNOWN_LOCATION);
7794 /* The loop-latch arg is set in epilogue processing. */
7796 if (slp_node)
7797 SLP_TREE_VEC_STMTS (slp_node).quick_push (new_phi);
7798 else
7800 if (j == 0)
7801 *vec_stmt = new_phi;
7802 STMT_VINFO_VEC_STMTS (stmt_info).safe_push (new_phi);
7807 return true;
7810 /* Vectorizes LC PHIs. */
7812 bool
7813 vectorizable_lc_phi (loop_vec_info loop_vinfo,
7814 stmt_vec_info stmt_info, gimple **vec_stmt,
7815 slp_tree slp_node)
7817 if (!loop_vinfo
7818 || !is_a <gphi *> (stmt_info->stmt)
7819 || gimple_phi_num_args (stmt_info->stmt) != 1)
7820 return false;
7822 if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_internal_def
7823 && STMT_VINFO_DEF_TYPE (stmt_info) != vect_double_reduction_def)
7824 return false;
7826 if (!vec_stmt) /* transformation not required. */
7828 /* Deal with copies from externs or constants that disguise as
7829 loop-closed PHI nodes (PR97886). */
7830 if (slp_node
7831 && !vect_maybe_update_slp_op_vectype (SLP_TREE_CHILDREN (slp_node)[0],
7832 SLP_TREE_VECTYPE (slp_node)))
7834 if (dump_enabled_p ())
7835 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7836 "incompatible vector types for invariants\n");
7837 return false;
7839 STMT_VINFO_TYPE (stmt_info) = lc_phi_info_type;
7840 return true;
7843 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
7844 tree scalar_dest = gimple_phi_result (stmt_info->stmt);
7845 basic_block bb = gimple_bb (stmt_info->stmt);
7846 edge e = single_pred_edge (bb);
7847 tree vec_dest = vect_create_destination_var (scalar_dest, vectype);
7848 auto_vec<tree> vec_oprnds;
7849 vect_get_vec_defs (loop_vinfo, stmt_info, slp_node,
7850 !slp_node ? vect_get_num_copies (loop_vinfo, vectype) : 1,
7851 gimple_phi_arg_def (stmt_info->stmt, 0), &vec_oprnds);
7852 for (unsigned i = 0; i < vec_oprnds.length (); i++)
7854 /* Create the vectorized LC PHI node. */
7855 gphi *new_phi = create_phi_node (vec_dest, bb);
7856 add_phi_arg (new_phi, vec_oprnds[i], e, UNKNOWN_LOCATION);
7857 if (slp_node)
7858 SLP_TREE_VEC_STMTS (slp_node).quick_push (new_phi);
7859 else
7860 STMT_VINFO_VEC_STMTS (stmt_info).safe_push (new_phi);
7862 if (!slp_node)
7863 *vec_stmt = STMT_VINFO_VEC_STMTS (stmt_info)[0];
7865 return true;
7868 /* Vectorizes PHIs. */
7870 bool
7871 vectorizable_phi (vec_info *,
7872 stmt_vec_info stmt_info, gimple **vec_stmt,
7873 slp_tree slp_node, stmt_vector_for_cost *cost_vec)
7875 if (!is_a <gphi *> (stmt_info->stmt) || !slp_node)
7876 return false;
7878 if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_internal_def)
7879 return false;
7881 tree vectype = SLP_TREE_VECTYPE (slp_node);
7883 if (!vec_stmt) /* transformation not required. */
7885 slp_tree child;
7886 unsigned i;
7887 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (slp_node), i, child)
7888 if (!child)
7890 if (dump_enabled_p ())
7891 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7892 "PHI node with unvectorized backedge def\n");
7893 return false;
7895 else if (!vect_maybe_update_slp_op_vectype (child, vectype))
7897 if (dump_enabled_p ())
7898 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7899 "incompatible vector types for invariants\n");
7900 return false;
7902 /* For single-argument PHIs assume coalescing which means zero cost
7903 for the scalar and the vector PHIs. This avoids artificially
7904 favoring the vector path (but may pessimize it in some cases). */
7905 if (gimple_phi_num_args (as_a <gphi *> (stmt_info->stmt)) > 1)
7906 record_stmt_cost (cost_vec, SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node),
7907 vector_stmt, stmt_info, vectype, 0, vect_body);
7908 STMT_VINFO_TYPE (stmt_info) = phi_info_type;
7909 return true;
7912 tree scalar_dest = gimple_phi_result (stmt_info->stmt);
7913 basic_block bb = gimple_bb (stmt_info->stmt);
7914 tree vec_dest = vect_create_destination_var (scalar_dest, vectype);
7915 auto_vec<gphi *> new_phis;
7916 for (unsigned i = 0; i < gimple_phi_num_args (stmt_info->stmt); ++i)
7918 slp_tree child = SLP_TREE_CHILDREN (slp_node)[i];
7920 /* Skip not yet vectorized defs. */
7921 if (SLP_TREE_DEF_TYPE (child) == vect_internal_def
7922 && SLP_TREE_VEC_STMTS (child).is_empty ())
7923 continue;
7925 auto_vec<tree> vec_oprnds;
7926 vect_get_slp_defs (SLP_TREE_CHILDREN (slp_node)[i], &vec_oprnds);
7927 if (!new_phis.exists ())
7929 new_phis.create (vec_oprnds.length ());
7930 for (unsigned j = 0; j < vec_oprnds.length (); j++)
7932 /* Create the vectorized LC PHI node. */
7933 new_phis.quick_push (create_phi_node (vec_dest, bb));
7934 SLP_TREE_VEC_STMTS (slp_node).quick_push (new_phis[j]);
7937 edge e = gimple_phi_arg_edge (as_a <gphi *> (stmt_info->stmt), i);
7938 for (unsigned j = 0; j < vec_oprnds.length (); j++)
7939 add_phi_arg (new_phis[j], vec_oprnds[j], e, UNKNOWN_LOCATION);
7941 /* We should have at least one already vectorized child. */
7942 gcc_assert (new_phis.exists ());
7944 return true;
7947 /* Return true if VECTYPE represents a vector that requires lowering
7948 by the vector lowering pass. */
7950 bool
7951 vect_emulated_vector_p (tree vectype)
7953 return (!VECTOR_MODE_P (TYPE_MODE (vectype))
7954 && (!VECTOR_BOOLEAN_TYPE_P (vectype)
7955 || TYPE_PRECISION (TREE_TYPE (vectype)) != 1));
7958 /* Return true if we can emulate CODE on an integer mode representation
7959 of a vector. */
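/* For example, a bitwise AND of two vectors that fit in a machine word can
   be carried out as a single scalar AND on that word; additions can likewise
   be emulated with word-sized bit-twiddling (SWAR) tricks, which is why the
   codes below can be handled without native SIMD support.  */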
7961 bool
7962 vect_can_vectorize_without_simd_p (tree_code code)
7964 switch (code)
7966 case PLUS_EXPR:
7967 case MINUS_EXPR:
7968 case NEGATE_EXPR:
7969 case BIT_AND_EXPR:
7970 case BIT_IOR_EXPR:
7971 case BIT_XOR_EXPR:
7972 case BIT_NOT_EXPR:
7973 return true;
7975 default:
7976 return false;
7980 /* Function vectorizable_induction
7982 Check if STMT_INFO performs an induction computation that can be vectorized.
7983 If VEC_STMT is also passed, vectorize the induction PHI: create a vectorized
7984 phi to replace it, put it in VEC_STMT, and add it to the same basic block.
7985 Return true if STMT_INFO is vectorizable in this way. */
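/* For example (a sketch): given

     for (i = 0; i < n; i++)
       { a[i] = j; j += 2; }

   and VF = 4, the induction PHI for j is replaced by a vector PHI with
   initial value {j, j+2, j+4, j+6} and a loop-body update that adds the
   step vector {8, 8, 8, 8}.  */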
7987 bool
7988 vectorizable_induction (loop_vec_info loop_vinfo,
7989 stmt_vec_info stmt_info,
7990 gimple **vec_stmt, slp_tree slp_node,
7991 stmt_vector_for_cost *cost_vec)
7993 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
7994 unsigned ncopies;
7995 bool nested_in_vect_loop = false;
7996 class loop *iv_loop;
7997 tree vec_def;
7998 edge pe = loop_preheader_edge (loop);
7999 basic_block new_bb;
8000 tree new_vec, vec_init, vec_step, t;
8001 tree new_name;
8002 gimple *new_stmt;
8003 gphi *induction_phi;
8004 tree induc_def, vec_dest;
8005 tree init_expr, step_expr;
8006 poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
8007 unsigned i;
8008 tree expr;
8009 gimple_stmt_iterator si;
8011 gphi *phi = dyn_cast <gphi *> (stmt_info->stmt);
8012 if (!phi)
8013 return false;
8015 if (!STMT_VINFO_RELEVANT_P (stmt_info))
8016 return false;
8018 /* Make sure it was recognized as induction computation. */
8019 if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_induction_def)
8020 return false;
8022 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
8023 poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (vectype);
8025 if (slp_node)
8026 ncopies = 1;
8027 else
8028 ncopies = vect_get_num_copies (loop_vinfo, vectype);
8029 gcc_assert (ncopies >= 1);
8031 /* FORNOW. These restrictions should be relaxed. */
8032 if (nested_in_vect_loop_p (loop, stmt_info))
8034 imm_use_iterator imm_iter;
8035 use_operand_p use_p;
8036 gimple *exit_phi;
8037 edge latch_e;
8038 tree loop_arg;
8040 if (ncopies > 1)
8042 if (dump_enabled_p ())
8043 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
8044 "multiple types in nested loop.\n");
8045 return false;
8048 exit_phi = NULL;
8049 latch_e = loop_latch_edge (loop->inner);
8050 loop_arg = PHI_ARG_DEF_FROM_EDGE (phi, latch_e);
8051 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, loop_arg)
8053 gimple *use_stmt = USE_STMT (use_p);
8054 if (is_gimple_debug (use_stmt))
8055 continue;
8057 if (!flow_bb_inside_loop_p (loop->inner, gimple_bb (use_stmt)))
8059 exit_phi = use_stmt;
8060 break;
8063 if (exit_phi)
8065 stmt_vec_info exit_phi_vinfo = loop_vinfo->lookup_stmt (exit_phi);
8066 if (!(STMT_VINFO_RELEVANT_P (exit_phi_vinfo)
8067 && !STMT_VINFO_LIVE_P (exit_phi_vinfo)))
8069 if (dump_enabled_p ())
8070 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
8071 "inner-loop induction only used outside "
8072 "of the outer vectorized loop.\n");
8073 return false;
8077 nested_in_vect_loop = true;
8078 iv_loop = loop->inner;
8080 else
8081 iv_loop = loop;
8082 gcc_assert (iv_loop == (gimple_bb (phi))->loop_father);
8084 if (slp_node && !nunits.is_constant ())
8086 /* The current SLP code creates the step value element-by-element. */
8087 if (dump_enabled_p ())
8088 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
8089 "SLP induction not supported for variable-length"
8090 " vectors.\n");
8091 return false;
8094 if (!vec_stmt) /* transformation not required. */
8096 unsigned inside_cost = 0, prologue_cost = 0;
8097 if (slp_node)
8099 /* We eventually need to set a vector type on invariant
8100 arguments. */
8101 unsigned j;
8102 slp_tree child;
8103 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (slp_node), j, child)
8104 if (!vect_maybe_update_slp_op_vectype
8105 (child, SLP_TREE_VECTYPE (slp_node)))
8107 if (dump_enabled_p ())
8108 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
8109 "incompatible vector types for "
8110 "invariants\n");
8111 return false;
8113 /* loop cost for vec_loop. */
8114 inside_cost
8115 = record_stmt_cost (cost_vec,
8116 SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node),
8117 vector_stmt, stmt_info, 0, vect_body);
8118 /* prologue cost for vec_init (if not nested) and step. */
8119 prologue_cost = record_stmt_cost (cost_vec, 1 + !nested_in_vect_loop,
8120 scalar_to_vec,
8121 stmt_info, 0, vect_prologue);
8123 else /* if (!slp_node) */
8125 /* loop cost for vec_loop. */
8126 inside_cost = record_stmt_cost (cost_vec, ncopies, vector_stmt,
8127 stmt_info, 0, vect_body);
8128 /* prologue cost for vec_init and vec_step. */
8129 prologue_cost = record_stmt_cost (cost_vec, 2, scalar_to_vec,
8130 stmt_info, 0, vect_prologue);
8132 if (dump_enabled_p ())
8133 dump_printf_loc (MSG_NOTE, vect_location,
8134 "vect_model_induction_cost: inside_cost = %d, "
8135 "prologue_cost = %d .\n", inside_cost,
8136 prologue_cost);
8138 STMT_VINFO_TYPE (stmt_info) = induc_vec_info_type;
8139 DUMP_VECT_SCOPE ("vectorizable_induction");
8140 return true;
8143 /* Transform. */
8145 /* Compute a vector variable, initialized with the first VF values of
8146 the induction variable. E.g., for an iv with IV_PHI='X' and
8147 evolution S, for a vector of 4 units, we want to compute:
8148 [X, X + S, X + 2*S, X + 3*S]. */
8150 if (dump_enabled_p ())
8151 dump_printf_loc (MSG_NOTE, vect_location, "transform induction phi.\n");
8153 step_expr = STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_info);
8154 gcc_assert (step_expr != NULL_TREE);
8155 tree step_vectype = get_same_sized_vectype (TREE_TYPE (step_expr), vectype);
8157 pe = loop_preheader_edge (iv_loop);
8158 /* Find the first insertion point in the BB. */
8159 basic_block bb = gimple_bb (phi);
8160 si = gsi_after_labels (bb);
8162 /* For SLP induction we have to generate several IVs as for example
8163 with group size 3 we need
8164 [i0, i1, i2, i0 + S0] [i1 + S1, i2 + S2, i0 + 2*S0, i1 + 2*S1]
8165 [i2 + 2*S2, i0 + 3*S0, i1 + 3*S1, i2 + 3*S2]. */
8166 if (slp_node)
8168 /* Enforced above. */
8169 unsigned int const_nunits = nunits.to_constant ();
8171 /* The initial values are vectorized, but any lanes > group_size
8172 need adjustment. */
8173 slp_tree init_node
8174 = SLP_TREE_CHILDREN (slp_node)[pe->dest_idx];
8176 /* Gather steps. Since we do not vectorize inductions as
8177 cycles we have to reconstruct the step from SCEV data. */
8178 unsigned group_size = SLP_TREE_LANES (slp_node);
8179 tree *steps = XALLOCAVEC (tree, group_size);
8180 tree *inits = XALLOCAVEC (tree, group_size);
8181 stmt_vec_info phi_info;
8182 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (slp_node), i, phi_info)
8184 steps[i] = STMT_VINFO_LOOP_PHI_EVOLUTION_PART (phi_info);
8185 if (!init_node)
8186 inits[i] = gimple_phi_arg_def (as_a<gphi *> (phi_info->stmt),
8187 pe->dest_idx);
8190 /* Now generate the IVs. */
8191 unsigned nvects = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node);
8192 gcc_assert ((const_nunits * nvects) % group_size == 0);
8193 unsigned nivs;
8194 if (nested_in_vect_loop)
8195 nivs = nvects;
8196 else
8198 /* Compute the number of distinct IVs we need. First reduce
8199 group_size if it is a multiple of const_nunits so we get
8200 one IV for a group_size of 4 but const_nunits 2. */
8201 unsigned group_sizep = group_size;
8202 if (group_sizep % const_nunits == 0)
8203 group_sizep = group_sizep / const_nunits;
8204 nivs = least_common_multiple (group_sizep,
8205 const_nunits) / const_nunits;
8207 tree stept = TREE_TYPE (step_vectype);
8208 tree lupdate_mul = NULL_TREE;
8209 if (!nested_in_vect_loop)
8211 /* The number of iterations covered in one vector iteration. */
8212 unsigned lup_mul = (nvects * const_nunits) / group_size;
8213 lupdate_mul
8214 = build_vector_from_val (step_vectype,
8215 SCALAR_FLOAT_TYPE_P (stept)
8216 ? build_real_from_wide (stept, lup_mul,
8217 UNSIGNED)
8218 : build_int_cstu (stept, lup_mul));
8220 tree peel_mul = NULL_TREE;
8221 gimple_seq init_stmts = NULL;
8222 if (LOOP_VINFO_MASK_SKIP_NITERS (loop_vinfo))
8224 if (SCALAR_FLOAT_TYPE_P (stept))
8225 peel_mul = gimple_build (&init_stmts, FLOAT_EXPR, stept,
8226 LOOP_VINFO_MASK_SKIP_NITERS (loop_vinfo));
8227 else
8228 peel_mul = gimple_convert (&init_stmts, stept,
8229 LOOP_VINFO_MASK_SKIP_NITERS (loop_vinfo));
8230 peel_mul = gimple_build_vector_from_val (&init_stmts,
8231 step_vectype, peel_mul);
8233 unsigned ivn;
8234 auto_vec<tree> vec_steps;
8235 for (ivn = 0; ivn < nivs; ++ivn)
8237 tree_vector_builder step_elts (step_vectype, const_nunits, 1);
8238 tree_vector_builder init_elts (vectype, const_nunits, 1);
8239 tree_vector_builder mul_elts (step_vectype, const_nunits, 1);
8240 for (unsigned eltn = 0; eltn < const_nunits; ++eltn)
8242 /* The scalar steps of the IVs. */
8243 tree elt = steps[(ivn*const_nunits + eltn) % group_size];
8244 elt = gimple_convert (&init_stmts, TREE_TYPE (step_vectype), elt);
8245 step_elts.quick_push (elt);
8246 if (!init_node)
8248 /* The scalar inits of the IVs if not vectorized. */
8249 elt = inits[(ivn*const_nunits + eltn) % group_size];
8250 if (!useless_type_conversion_p (TREE_TYPE (vectype),
8251 TREE_TYPE (elt)))
8252 elt = gimple_build (&init_stmts, VIEW_CONVERT_EXPR,
8253 TREE_TYPE (vectype), elt);
8254 init_elts.quick_push (elt);
8256 /* The number of steps to add to the initial values. */
8257 unsigned mul_elt = (ivn*const_nunits + eltn) / group_size;
8258 mul_elts.quick_push (SCALAR_FLOAT_TYPE_P (stept)
8259 ? build_real_from_wide (stept,
8260 mul_elt, UNSIGNED)
8261 : build_int_cstu (stept, mul_elt));
8263 vec_step = gimple_build_vector (&init_stmts, &step_elts);
8264 vec_steps.safe_push (vec_step);
8265 tree step_mul = gimple_build_vector (&init_stmts, &mul_elts);
8266 if (peel_mul)
8267 step_mul = gimple_build (&init_stmts, PLUS_EXPR, step_vectype,
8268 step_mul, peel_mul);
8269 if (!init_node)
8270 vec_init = gimple_build_vector (&init_stmts, &init_elts);
8272 /* Create the induction-phi that defines the induction-operand. */
8273 vec_dest = vect_get_new_vect_var (vectype, vect_simple_var,
8274 "vec_iv_");
8275 induction_phi = create_phi_node (vec_dest, iv_loop->header);
8276 induc_def = PHI_RESULT (induction_phi);
8278 /* Create the iv update inside the loop */
8279 tree up = vec_step;
8280 if (lupdate_mul)
8281 up = gimple_build (&init_stmts, MULT_EXPR, step_vectype,
8282 vec_step, lupdate_mul);
8283 gimple_seq stmts = NULL;
8284 vec_def = gimple_convert (&stmts, step_vectype, induc_def);
8285 vec_def = gimple_build (&stmts,
8286 PLUS_EXPR, step_vectype, vec_def, up);
8287 vec_def = gimple_convert (&stmts, vectype, vec_def);
8288 gsi_insert_seq_before (&si, stmts, GSI_SAME_STMT);
8289 add_phi_arg (induction_phi, vec_def, loop_latch_edge (iv_loop),
8290 UNKNOWN_LOCATION);
8292 if (init_node)
8293 vec_init = vect_get_slp_vect_def (init_node, ivn);
8294 if (!nested_in_vect_loop
8295 && !integer_zerop (step_mul))
8297 vec_def = gimple_convert (&init_stmts, step_vectype, vec_init);
8298 up = gimple_build (&init_stmts, MULT_EXPR, step_vectype,
8299 vec_step, step_mul);
8300 vec_def = gimple_build (&init_stmts, PLUS_EXPR, step_vectype,
8301 vec_def, up);
8302 vec_init = gimple_convert (&init_stmts, vectype, vec_def);
8305 /* Set the arguments of the phi node: */
8306 add_phi_arg (induction_phi, vec_init, pe, UNKNOWN_LOCATION);
8308 SLP_TREE_VEC_STMTS (slp_node).quick_push (induction_phi);
8310 if (!nested_in_vect_loop)
8312 /* Fill up to the number of vectors we need for the whole group. */
8313 nivs = least_common_multiple (group_size,
8314 const_nunits) / const_nunits;
8315 vec_steps.reserve (nivs-ivn);
8316 for (; ivn < nivs; ++ivn)
8318 SLP_TREE_VEC_STMTS (slp_node)
8319 .quick_push (SLP_TREE_VEC_STMTS (slp_node)[0]);
8320 vec_steps.quick_push (vec_steps[0]);
8324 /* Re-use IVs when we can. We are generating further vector
8325 stmts by adding VF' * stride to the IVs generated above. */
8326 if (ivn < nvects)
8328 unsigned vfp
8329 = least_common_multiple (group_size, const_nunits) / group_size;
8330 tree lupdate_mul
8331 = build_vector_from_val (step_vectype,
8332 SCALAR_FLOAT_TYPE_P (stept)
8333 ? build_real_from_wide (stept,
8334 vfp, UNSIGNED)
8335 : build_int_cstu (stept, vfp));
8336 for (; ivn < nvects; ++ivn)
8338 gimple *iv = SLP_TREE_VEC_STMTS (slp_node)[ivn - nivs];
8339 tree def = gimple_get_lhs (iv);
8340 if (ivn < 2*nivs)
8341 vec_steps[ivn - nivs]
8342 = gimple_build (&init_stmts, MULT_EXPR, step_vectype,
8343 vec_steps[ivn - nivs], lupdate_mul);
8344 gimple_seq stmts = NULL;
8345 def = gimple_convert (&stmts, step_vectype, def);
8346 def = gimple_build (&stmts, PLUS_EXPR, step_vectype,
8347 def, vec_steps[ivn % nivs]);
8348 def = gimple_convert (&stmts, vectype, def);
8349 if (gimple_code (iv) == GIMPLE_PHI)
8350 gsi_insert_seq_before (&si, stmts, GSI_SAME_STMT);
8351 else
8353 gimple_stmt_iterator tgsi = gsi_for_stmt (iv);
8354 gsi_insert_seq_after (&tgsi, stmts, GSI_CONTINUE_LINKING);
8356 SLP_TREE_VEC_STMTS (slp_node)
8357 .quick_push (SSA_NAME_DEF_STMT (def));
8361 new_bb = gsi_insert_seq_on_edge_immediate (pe, init_stmts);
8362 gcc_assert (!new_bb);
8364 return true;
8367 init_expr = vect_phi_initial_value (phi);
8369 gimple_seq stmts = NULL;
8370 if (!nested_in_vect_loop)
8372 /* Convert the initial value to the IV update type. */
8373 tree new_type = TREE_TYPE (step_expr);
8374 init_expr = gimple_convert (&stmts, new_type, init_expr);
8376 /* If we are using the loop mask to "peel" for alignment then we need
8377 to adjust the start value here. */
8378 tree skip_niters = LOOP_VINFO_MASK_SKIP_NITERS (loop_vinfo);
8379 if (skip_niters != NULL_TREE)
8381 if (FLOAT_TYPE_P (vectype))
8382 skip_niters = gimple_build (&stmts, FLOAT_EXPR, new_type,
8383 skip_niters);
8384 else
8385 skip_niters = gimple_convert (&stmts, new_type, skip_niters);
8386 tree skip_step = gimple_build (&stmts, MULT_EXPR, new_type,
8387 skip_niters, step_expr);
8388 init_expr = gimple_build (&stmts, MINUS_EXPR, new_type,
8389 init_expr, skip_step);
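/* Illustrative sketch (assuming, for example, init_expr == 10,
   step_expr == 2 and skip_niters == 3): the start value becomes
   10 - 3*2 = 4, so once the first-iteration mask disables the three
   "peeled" lanes, the first active lane still sees the original
   initial value 10.  */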
8393 if (stmts)
8395 new_bb = gsi_insert_seq_on_edge_immediate (pe, stmts);
8396 gcc_assert (!new_bb);
8399 /* Create the vector that holds the initial_value of the induction. */
8400 if (nested_in_vect_loop)
8402 /* iv_loop is nested in the loop to be vectorized. init_expr has already
8403 been created during vectorization of previous stmts. We obtain it
8404 from the STMT_VINFO_VEC_STMT of the defining stmt. */
8405 auto_vec<tree> vec_inits;
8406 vect_get_vec_defs_for_operand (loop_vinfo, stmt_info, 1,
8407 init_expr, &vec_inits);
8408 vec_init = vec_inits[0];
8409 /* If the initial value is not of the proper type, convert it. */
8410 if (!useless_type_conversion_p (vectype, TREE_TYPE (vec_init)))
8412 new_stmt
8413 = gimple_build_assign (vect_get_new_ssa_name (vectype,
8414 vect_simple_var,
8415 "vec_iv_"),
8416 VIEW_CONVERT_EXPR,
8417 build1 (VIEW_CONVERT_EXPR, vectype,
8418 vec_init));
8419 vec_init = gimple_assign_lhs (new_stmt);
8420 new_bb = gsi_insert_on_edge_immediate (loop_preheader_edge (iv_loop),
8421 new_stmt);
8422 gcc_assert (!new_bb);
8425 else
8427 /* iv_loop is the loop to be vectorized. Create:
8428 vec_init = [X, X+S, X+2*S, X+3*S] (S = step_expr, X = init_expr) */
8429 stmts = NULL;
8430 new_name = gimple_convert (&stmts, TREE_TYPE (step_expr), init_expr);
8432 unsigned HOST_WIDE_INT const_nunits;
8433 if (nunits.is_constant (&const_nunits))
8435 tree_vector_builder elts (step_vectype, const_nunits, 1);
8436 elts.quick_push (new_name);
8437 for (i = 1; i < const_nunits; i++)
8439 /* Create: new_name_i = new_name + step_expr */
8440 new_name = gimple_build (&stmts, PLUS_EXPR, TREE_TYPE (new_name),
8441 new_name, step_expr);
8442 elts.quick_push (new_name);
8444 /* Create a vector from [new_name_0, new_name_1, ...,
8445 new_name_nunits-1] */
8446 vec_init = gimple_build_vector (&stmts, &elts);
8448 else if (INTEGRAL_TYPE_P (TREE_TYPE (step_expr)))
8449 /* Build the initial value directly from a VEC_SERIES_EXPR. */
8450 vec_init = gimple_build (&stmts, VEC_SERIES_EXPR, step_vectype,
8451 new_name, step_expr);
8452 else
8454 /* Build:
8455 [base, base, base, ...]
8456 + (vectype) [0, 1, 2, ...] * [step, step, step, ...]. */
8457 gcc_assert (SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr)));
8458 gcc_assert (flag_associative_math);
8459 tree index = build_index_vector (step_vectype, 0, 1);
8460 tree base_vec = gimple_build_vector_from_val (&stmts, step_vectype,
8461 new_name);
8462 tree step_vec = gimple_build_vector_from_val (&stmts, step_vectype,
8463 step_expr);
8464 vec_init = gimple_build (&stmts, FLOAT_EXPR, step_vectype, index);
8465 vec_init = gimple_build (&stmts, MULT_EXPR, step_vectype,
8466 vec_init, step_vec);
8467 vec_init = gimple_build (&stmts, PLUS_EXPR, step_vectype,
8468 vec_init, base_vec);
8470 vec_init = gimple_convert (&stmts, vectype, vec_init);
8472 if (stmts)
8474 new_bb = gsi_insert_seq_on_edge_immediate (pe, stmts);
8475 gcc_assert (!new_bb);
8480 /* Create the vector that holds the step of the induction. */
8481 if (nested_in_vect_loop)
8482 /* iv_loop is nested in the loop to be vectorized. Generate:
8483 vec_step = [S, S, S, S] */
8484 new_name = step_expr;
8485 else
8487 /* iv_loop is the loop to be vectorized. Generate:
8488 vec_step = [VF*S, VF*S, VF*S, VF*S] */
8489 gimple_seq seq = NULL;
8490 if (SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr)))
8492 expr = build_int_cst (integer_type_node, vf);
8493 expr = gimple_build (&seq, FLOAT_EXPR, TREE_TYPE (step_expr), expr);
8495 else
8496 expr = build_int_cst (TREE_TYPE (step_expr), vf);
8497 new_name = gimple_build (&seq, MULT_EXPR, TREE_TYPE (step_expr),
8498 expr, step_expr);
8499 if (seq)
8501 new_bb = gsi_insert_seq_on_edge_immediate (pe, seq);
8502 gcc_assert (!new_bb);
8506 t = unshare_expr (new_name);
8507 gcc_assert (CONSTANT_CLASS_P (new_name)
8508 || TREE_CODE (new_name) == SSA_NAME);
8509 new_vec = build_vector_from_val (step_vectype, t);
8510 vec_step = vect_init_vector (loop_vinfo, stmt_info,
8511 new_vec, step_vectype, NULL);
8514 /* Create the following def-use cycle:
8515 loop prolog:
8516 vec_init = ...
8517 vec_step = ...
8518 loop:
8519 vec_iv = PHI <vec_init, vec_loop>
8521 STMT
8523 vec_loop = vec_iv + vec_step; */
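/* Illustrative sketch of the cycle above (assuming a simple integer IV
   with X = init_expr = 0, S = step_expr = 1 and VF = 4):
     loop prolog:
       vec_init = { 0, 1, 2, 3 }
       vec_step = { 4, 4, 4, 4 }
     loop:
       vec_iv = PHI <vec_init (preheader), vec_loop (latch)>
       ...
       vec_loop = vec_iv + vec_step;  */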
8525 /* Create the induction-phi that defines the induction-operand. */
8526 vec_dest = vect_get_new_vect_var (vectype, vect_simple_var, "vec_iv_");
8527 induction_phi = create_phi_node (vec_dest, iv_loop->header);
8528 induc_def = PHI_RESULT (induction_phi);
8530 /* Create the iv update inside the loop */
8531 stmts = NULL;
8532 vec_def = gimple_convert (&stmts, step_vectype, induc_def);
8533 vec_def = gimple_build (&stmts, PLUS_EXPR, step_vectype, vec_def, vec_step);
8534 vec_def = gimple_convert (&stmts, vectype, vec_def);
8535 gsi_insert_seq_before (&si, stmts, GSI_SAME_STMT);
8536 new_stmt = SSA_NAME_DEF_STMT (vec_def);
8538 /* Set the arguments of the phi node: */
8539 add_phi_arg (induction_phi, vec_init, pe, UNKNOWN_LOCATION);
8540 add_phi_arg (induction_phi, vec_def, loop_latch_edge (iv_loop),
8541 UNKNOWN_LOCATION);
8543 STMT_VINFO_VEC_STMTS (stmt_info).safe_push (induction_phi);
8544 *vec_stmt = induction_phi;
8546 /* In case the vectorization factor (VF) is bigger than the number
8547 of elements that we can fit in a vectype (nunits), we have to generate
8548 more than one vector stmt - i.e. - we need to "unroll" the
8549 vector stmt by a factor of VF/nunits. For more details see the
8550 documentation in vectorizable_operation. */
8552 if (ncopies > 1)
8554 gimple_seq seq = NULL;
8555 /* FORNOW. This restriction should be relaxed. */
8556 gcc_assert (!nested_in_vect_loop);
8558 /* Create the vector that holds the step of the induction. */
8559 if (SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr)))
8561 expr = build_int_cst (integer_type_node, nunits);
8562 expr = gimple_build (&seq, FLOAT_EXPR, TREE_TYPE (step_expr), expr);
8564 else
8565 expr = build_int_cst (TREE_TYPE (step_expr), nunits);
8566 new_name = gimple_build (&seq, MULT_EXPR, TREE_TYPE (step_expr),
8567 expr, step_expr);
8568 if (seq)
8570 new_bb = gsi_insert_seq_on_edge_immediate (pe, seq);
8571 gcc_assert (!new_bb);
8574 t = unshare_expr (new_name);
8575 gcc_assert (CONSTANT_CLASS_P (new_name)
8576 || TREE_CODE (new_name) == SSA_NAME);
8577 new_vec = build_vector_from_val (step_vectype, t);
8578 vec_step = vect_init_vector (loop_vinfo, stmt_info,
8579 new_vec, step_vectype, NULL);
8581 vec_def = induc_def;
8582 for (i = 1; i < ncopies; i++)
8584 /* vec_i = vec_prev + vec_step */
8585 gimple_seq stmts = NULL;
8586 vec_def = gimple_convert (&stmts, step_vectype, vec_def);
8587 vec_def = gimple_build (&stmts,
8588 PLUS_EXPR, step_vectype, vec_def, vec_step);
8589 vec_def = gimple_convert (&stmts, vectype, vec_def);
8591 gsi_insert_seq_before (&si, stmts, GSI_SAME_STMT);
8592 new_stmt = SSA_NAME_DEF_STMT (vec_def);
8593 STMT_VINFO_VEC_STMTS (stmt_info).safe_push (new_stmt);
8597 if (dump_enabled_p ())
8598 dump_printf_loc (MSG_NOTE, vect_location,
8599 "transform induction: created def-use cycle: %G%G",
8600 induction_phi, SSA_NAME_DEF_STMT (vec_def));
8602 return true;
8605 /* Function vectorizable_live_operation.
8607 STMT_INFO computes a value that is used outside the loop. Check if
8608 it can be supported. */
8610 bool
8611 vectorizable_live_operation (vec_info *vinfo,
8612 stmt_vec_info stmt_info,
8613 gimple_stmt_iterator *gsi,
8614 slp_tree slp_node, slp_instance slp_node_instance,
8615 int slp_index, bool vec_stmt_p,
8616 stmt_vector_for_cost *cost_vec)
8618 loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo);
8619 imm_use_iterator imm_iter;
8620 tree lhs, lhs_type, bitsize;
8621 tree vectype = (slp_node
8622 ? SLP_TREE_VECTYPE (slp_node)
8623 : STMT_VINFO_VECTYPE (stmt_info));
8624 poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (vectype);
8625 int ncopies;
8626 gimple *use_stmt;
8627 auto_vec<tree> vec_oprnds;
8628 int vec_entry = 0;
8629 poly_uint64 vec_index = 0;
8631 gcc_assert (STMT_VINFO_LIVE_P (stmt_info));
8633 /* If a stmt of a reduction is live, vectorize it via
8634 vect_create_epilog_for_reduction. vectorizable_reduction assessed
8635 validity so just trigger the transform here. */
8636 if (STMT_VINFO_REDUC_DEF (vect_orig_stmt (stmt_info)))
8638 if (!vec_stmt_p)
8639 return true;
8640 if (slp_node)
8642 /* For reduction chains the meta-info is attached to
8643 the group leader. */
8644 if (REDUC_GROUP_FIRST_ELEMENT (stmt_info))
8645 stmt_info = REDUC_GROUP_FIRST_ELEMENT (stmt_info);
8646 /* For SLP reductions we vectorize the epilogue for
8647 all involved stmts together. */
8648 else if (slp_index != 0)
8649 return true;
8650 else
8651 /* For SLP reductions the meta-info is attached to
8652 the representative. */
8653 stmt_info = SLP_TREE_REPRESENTATIVE (slp_node);
8655 stmt_vec_info reduc_info = info_for_reduction (loop_vinfo, stmt_info);
8656 gcc_assert (reduc_info->is_reduc_info);
8657 if (STMT_VINFO_REDUC_TYPE (reduc_info) == FOLD_LEFT_REDUCTION
8658 || STMT_VINFO_REDUC_TYPE (reduc_info) == EXTRACT_LAST_REDUCTION)
8659 return true;
8660 vect_create_epilog_for_reduction (loop_vinfo, stmt_info, slp_node,
8661 slp_node_instance);
8662 return true;
8665 /* If STMT is not relevant and it is a simple assignment and its inputs are
8666 invariant then it can remain in place, unvectorized. The original last
8667 scalar value that it computes will be used. */
8668 if (!STMT_VINFO_RELEVANT_P (stmt_info))
8670 gcc_assert (is_simple_and_all_uses_invariant (stmt_info, loop_vinfo));
8671 if (dump_enabled_p ())
8672 dump_printf_loc (MSG_NOTE, vect_location,
8673 "statement is simple and uses invariant. Leaving in "
8674 "place.\n");
8675 return true;
8678 if (slp_node)
8679 ncopies = 1;
8680 else
8681 ncopies = vect_get_num_copies (loop_vinfo, vectype);
8683 if (slp_node)
8685 gcc_assert (slp_index >= 0);
8687 /* Get the last occurrence of the scalar index from the concatenation of
8688 all the slp vectors. Calculate which slp vector it is and the index
8689 within. */
8690 int num_scalar = SLP_TREE_LANES (slp_node);
8691 int num_vec = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node);
8692 poly_uint64 pos = (num_vec * nunits) - num_scalar + slp_index;
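/* Illustrative sketch (assuming num_scalar == 2, num_vec == 2 and
   nunits == 4): the two lanes repeat as 0,1,0,1,0,1,0,1 across the
   8 vector elements, so for slp_index == 0 the last occurrence is at
   pos = 2*4 - 2 + 0 = 6, giving vec_entry == 1 and vec_index == 2.  */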
8694 /* Calculate which vector contains the result, and which lane of
8695 that vector we need. */
8696 if (!can_div_trunc_p (pos, nunits, &vec_entry, &vec_index))
8698 if (dump_enabled_p ())
8699 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
8700 "Cannot determine which vector holds the"
8701 " final result.\n");
8702 return false;
8706 if (!vec_stmt_p)
8708 /* No transformation required. */
8709 if (loop_vinfo && LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo))
8711 if (!direct_internal_fn_supported_p (IFN_EXTRACT_LAST, vectype,
8712 OPTIMIZE_FOR_SPEED))
8714 if (dump_enabled_p ())
8715 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
8716 "can't operate on partial vectors "
8717 "because the target doesn't support extract "
8718 "last reduction.\n");
8719 LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo) = false;
8721 else if (slp_node)
8723 if (dump_enabled_p ())
8724 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
8725 "can't operate on partial vectors "
8726 "because an SLP statement is live after "
8727 "the loop.\n");
8728 LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo) = false;
8730 else if (ncopies > 1)
8732 if (dump_enabled_p ())
8733 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
8734 "can't operate on partial vectors "
8735 "because ncopies is greater than 1.\n");
8736 LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo) = false;
8738 else
8740 gcc_assert (ncopies == 1 && !slp_node);
8741 vect_record_loop_mask (loop_vinfo,
8742 &LOOP_VINFO_MASKS (loop_vinfo),
8743 1, vectype, NULL);
8746 /* ??? Enable for loop costing as well. */
8747 if (!loop_vinfo)
8748 record_stmt_cost (cost_vec, 1, vec_to_scalar, stmt_info, NULL_TREE,
8749 0, vect_epilogue);
8750 return true;
8753 /* Use the lhs of the original scalar statement. */
8754 gimple *stmt = vect_orig_stmt (stmt_info)->stmt;
8755 if (dump_enabled_p ())
8756 dump_printf_loc (MSG_NOTE, vect_location, "extracting lane for live "
8757 "stmt %G", stmt);
8759 lhs = gimple_get_lhs (stmt);
8760 lhs_type = TREE_TYPE (lhs);
8762 bitsize = vector_element_bits_tree (vectype);
8764 /* Get the vectorized lhs of STMT and the lane to use (counted in bits). */
8765 tree vec_lhs, bitstart;
8766 gimple *vec_stmt;
8767 if (slp_node)
8769 gcc_assert (!loop_vinfo || !LOOP_VINFO_FULLY_MASKED_P (loop_vinfo));
8771 /* Get the correct slp vectorized stmt. */
8772 vec_stmt = SLP_TREE_VEC_STMTS (slp_node)[vec_entry];
8773 vec_lhs = gimple_get_lhs (vec_stmt);
8775 /* Get entry to use. */
8776 bitstart = bitsize_int (vec_index);
8777 bitstart = int_const_binop (MULT_EXPR, bitsize, bitstart);
8779 else
8781 /* For multiple copies, get the last copy. */
8782 vec_stmt = STMT_VINFO_VEC_STMTS (stmt_info).last ();
8783 vec_lhs = gimple_get_lhs (vec_stmt);
8785 /* Get the last lane in the vector. */
8786 bitstart = int_const_binop (MULT_EXPR, bitsize, bitsize_int (nunits - 1));
8789 if (loop_vinfo)
8791 /* To ensure that the VEC_LHS for lane-extraction stmts satisfies the
8792 loop-closed PHI requirement, insert one PHI node for it. It looks like:
8793 loop;
8795 # lhs' = PHI <lhs>
8797 loop;
8799 # vec_lhs' = PHI <vec_lhs>
8800 new_tree = lane_extract <vec_lhs', ...>;
8801 lhs' = new_tree; */
8803 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
8804 basic_block exit_bb = single_exit (loop)->dest;
8805 gcc_assert (single_pred_p (exit_bb));
8807 tree vec_lhs_phi = copy_ssa_name (vec_lhs);
8808 gimple *phi = create_phi_node (vec_lhs_phi, exit_bb);
8809 SET_PHI_ARG_DEF (phi, single_exit (loop)->dest_idx, vec_lhs);
8811 gimple_seq stmts = NULL;
8812 tree new_tree;
8813 if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
8815 /* Emit:
8817 SCALAR_RES = EXTRACT_LAST <VEC_LHS, MASK>
8819 where VEC_LHS is the vectorized live-out result and MASK is
8820 the loop mask for the final iteration. */
8821 gcc_assert (ncopies == 1 && !slp_node);
8822 tree scalar_type = TREE_TYPE (STMT_VINFO_VECTYPE (stmt_info));
8823 tree mask = vect_get_loop_mask (gsi, &LOOP_VINFO_MASKS (loop_vinfo),
8824 1, vectype, 0);
8825 tree scalar_res = gimple_build (&stmts, CFN_EXTRACT_LAST, scalar_type,
8826 mask, vec_lhs_phi);
8828 /* Convert the extracted vector element to the scalar type. */
8829 new_tree = gimple_convert (&stmts, lhs_type, scalar_res);
8831 else
8833 tree bftype = TREE_TYPE (vectype);
8834 if (VECTOR_BOOLEAN_TYPE_P (vectype))
8835 bftype = build_nonstandard_integer_type (tree_to_uhwi (bitsize), 1);
8836 new_tree = build3 (BIT_FIELD_REF, bftype,
8837 vec_lhs_phi, bitsize, bitstart);
8838 new_tree = force_gimple_operand (fold_convert (lhs_type, new_tree),
8839 &stmts, true, NULL_TREE);
8842 if (stmts)
8844 gimple_stmt_iterator exit_gsi = gsi_after_labels (exit_bb);
8845 gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
8847 /* Remove the existing PHI fed by LHS and create a copy from NEW_TREE instead. */
8848 tree lhs_phi = NULL_TREE;
8849 gimple_stmt_iterator gsi;
8850 for (gsi = gsi_start_phis (exit_bb);
8851 !gsi_end_p (gsi); gsi_next (&gsi))
8853 gimple *phi = gsi_stmt (gsi);
8854 if ((gimple_phi_arg_def (phi, 0) == lhs))
8856 remove_phi_node (&gsi, false);
8857 lhs_phi = gimple_phi_result (phi);
8858 gimple *copy = gimple_build_assign (lhs_phi, new_tree);
8859 gsi_insert_before (&exit_gsi, copy, GSI_SAME_STMT);
8860 break;
8865 /* Replace uses of LHS with the newly computed result. If the use stmt is
8866 a single-argument PHI, just replace all uses of the PHI result. This is
8867 necessary because the LC SSA PHI defining LHS may precede the new stmt. */
8868 use_operand_p use_p;
8869 FOR_EACH_IMM_USE_STMT (use_stmt, imm_iter, lhs)
8870 if (!flow_bb_inside_loop_p (loop, gimple_bb (use_stmt))
8871 && !is_gimple_debug (use_stmt))
8873 if (gimple_code (use_stmt) == GIMPLE_PHI
8874 && gimple_phi_num_args (use_stmt) == 1)
8876 replace_uses_by (gimple_phi_result (use_stmt), new_tree);
8878 else
8880 FOR_EACH_IMM_USE_ON_STMT (use_p, imm_iter)
8881 SET_USE (use_p, new_tree);
8883 update_stmt (use_stmt);
8886 else
8888 /* For basic-block vectorization simply insert the lane-extraction. */
8889 tree bftype = TREE_TYPE (vectype);
8890 if (VECTOR_BOOLEAN_TYPE_P (vectype))
8891 bftype = build_nonstandard_integer_type (tree_to_uhwi (bitsize), 1);
8892 tree new_tree = build3 (BIT_FIELD_REF, bftype,
8893 vec_lhs, bitsize, bitstart);
8894 gimple_seq stmts = NULL;
8895 new_tree = force_gimple_operand (fold_convert (lhs_type, new_tree),
8896 &stmts, true, NULL_TREE);
8897 if (TREE_CODE (new_tree) == SSA_NAME
8898 && SSA_NAME_OCCURS_IN_ABNORMAL_PHI (lhs))
8899 SSA_NAME_OCCURS_IN_ABNORMAL_PHI (new_tree) = 1;
8900 if (is_a <gphi *> (vec_stmt))
8902 gimple_stmt_iterator si = gsi_after_labels (gimple_bb (vec_stmt));
8903 gsi_insert_seq_before (&si, stmts, GSI_SAME_STMT);
8905 else
8907 gimple_stmt_iterator si = gsi_for_stmt (vec_stmt);
8908 gsi_insert_seq_after (&si, stmts, GSI_SAME_STMT);
8911 /* Replace uses of LHS with the newly computed result. If the use stmt is
8912 a single-argument PHI, just replace all uses of the PHI result. This is
8913 necessary because the LC SSA PHI defining LHS may precede the new stmt. */
8914 use_operand_p use_p;
8915 stmt_vec_info use_stmt_info;
8916 FOR_EACH_IMM_USE_STMT (use_stmt, imm_iter, lhs)
8917 if (!is_gimple_debug (use_stmt)
8918 && (!(use_stmt_info = vinfo->lookup_stmt (use_stmt))
8919 || !PURE_SLP_STMT (vect_stmt_to_vectorize (use_stmt_info))))
8921 /* ??? This can happen when the live lane ends up being
8922 used in a vector construction code-generated by an
8923 external SLP node (and code-generation for that already
8924 happened). See gcc.dg/vect/bb-slp-47.c.
8925 Doing this is what would happen if that vector CTOR
8926 were not code-generated yet so it is not too bad.
8927 ??? In fact we'd likely want to avoid this situation
8928 in the first place. */
8929 if (TREE_CODE (new_tree) == SSA_NAME
8930 && !SSA_NAME_IS_DEFAULT_DEF (new_tree)
8931 && gimple_code (use_stmt) != GIMPLE_PHI
8932 && !vect_stmt_dominates_stmt_p (SSA_NAME_DEF_STMT (new_tree),
8933 use_stmt))
8935 enum tree_code code = gimple_assign_rhs_code (use_stmt);
8936 gcc_assert (code == CONSTRUCTOR
8937 || code == VIEW_CONVERT_EXPR
8938 || CONVERT_EXPR_CODE_P (code));
8939 if (dump_enabled_p ())
8940 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
8941 "Using original scalar computation for "
8942 "live lane because use preceeds vector "
8943 "def\n");
8944 continue;
8946 /* ??? It can also happen that we end up pulling a def into
8947 a loop where replacing out-of-loop uses would require
8948 a new LC SSA PHI node. Retain the original scalar in
8949 those cases as well. PR98064. */
8950 if (TREE_CODE (new_tree) == SSA_NAME
8951 && !SSA_NAME_IS_DEFAULT_DEF (new_tree)
8952 && (gimple_bb (use_stmt)->loop_father
8953 != gimple_bb (vec_stmt)->loop_father)
8954 && !flow_loop_nested_p (gimple_bb (vec_stmt)->loop_father,
8955 gimple_bb (use_stmt)->loop_father))
8957 if (dump_enabled_p ())
8958 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
8959 "Using original scalar computation for "
8960 "live lane because there is an out-of-loop "
8961 "definition for it\n");
8962 continue;
8964 FOR_EACH_IMM_USE_ON_STMT (use_p, imm_iter)
8965 SET_USE (use_p, new_tree);
8966 update_stmt (use_stmt);
8970 return true;
8973 /* Kill any debug uses outside LOOP of SSA names defined in STMT_INFO. */
8975 static void
8976 vect_loop_kill_debug_uses (class loop *loop, stmt_vec_info stmt_info)
8978 ssa_op_iter op_iter;
8979 imm_use_iterator imm_iter;
8980 def_operand_p def_p;
8981 gimple *ustmt;
8983 FOR_EACH_PHI_OR_STMT_DEF (def_p, stmt_info->stmt, op_iter, SSA_OP_DEF)
8985 FOR_EACH_IMM_USE_STMT (ustmt, imm_iter, DEF_FROM_PTR (def_p))
8987 basic_block bb;
8989 if (!is_gimple_debug (ustmt))
8990 continue;
8992 bb = gimple_bb (ustmt);
8994 if (!flow_bb_inside_loop_p (loop, bb))
8996 if (gimple_debug_bind_p (ustmt))
8998 if (dump_enabled_p ())
8999 dump_printf_loc (MSG_NOTE, vect_location,
9000 "killing debug use\n");
9002 gimple_debug_bind_reset_value (ustmt);
9003 update_stmt (ustmt);
9005 else
9006 gcc_unreachable ();
9012 /* Given loop represented by LOOP_VINFO, return true if computation of
9013 LOOP_VINFO_NITERS (= LOOP_VINFO_NITERSM1 + 1) doesn't overflow, false
9014 otherwise. */
9016 static bool
9017 loop_niters_no_overflow (loop_vec_info loop_vinfo)
9019 /* Constant case. */
9020 if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
9022 tree cst_niters = LOOP_VINFO_NITERS (loop_vinfo);
9023 tree cst_nitersm1 = LOOP_VINFO_NITERSM1 (loop_vinfo);
9025 gcc_assert (TREE_CODE (cst_niters) == INTEGER_CST);
9026 gcc_assert (TREE_CODE (cst_nitersm1) == INTEGER_CST);
9027 if (wi::to_widest (cst_nitersm1) < wi::to_widest (cst_niters))
9028 return true;
9031 widest_int max;
9032 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
9033 /* Check the upper bound of loop niters. */
9034 if (get_max_loop_iterations (loop, &max))
9036 tree type = TREE_TYPE (LOOP_VINFO_NITERS (loop_vinfo));
9037 signop sgn = TYPE_SIGN (type);
9038 widest_int type_max = widest_int::from (wi::max_value (type), sgn);
9039 if (max < type_max)
9040 return true;
9042 return false;
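/* Illustrative sketch of the overflow this predicate guards against
   (assuming a 32-bit unsigned niters type): if NITERSM1 == 0xffffffff
   then NITERS = NITERSM1 + 1 wraps to 0, so the constant case requires
   NITERSM1 < NITERS and the symbolic case requires the loop's maximum
   iteration count to be strictly below the type's maximum value.  */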
9045 /* Return a mask type with half the number of elements as OLD_TYPE,
9046 given that it should have mode NEW_MODE. */
9048 tree
9049 vect_halve_mask_nunits (tree old_type, machine_mode new_mode)
9051 poly_uint64 nunits = exact_div (TYPE_VECTOR_SUBPARTS (old_type), 2);
9052 return build_truth_vector_type_for_mode (nunits, new_mode);
9055 /* Return a mask type with twice as many elements as OLD_TYPE,
9056 given that it should have mode NEW_MODE. */
9058 tree
9059 vect_double_mask_nunits (tree old_type, machine_mode new_mode)
9061 poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (old_type) * 2;
9062 return build_truth_vector_type_for_mode (nunits, new_mode);
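/* For example (an illustrative sketch, target permitting): halving a
   16-element mask type yields an 8-element mask type in NEW_MODE, while
   doubling an 8-element mask type yields a 16-element one; the caller is
   responsible for picking a NEW_MODE that can represent that many
   boolean elements.  */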
9065 /* Record that a fully-masked version of LOOP_VINFO would need MASKS to
9066 contain a sequence of NVECTORS masks that each control a vector of type
9067 VECTYPE. If SCALAR_MASK is nonnull, the fully-masked loop would AND
9068 these vector masks with the vector version of SCALAR_MASK. */
9070 void
9071 vect_record_loop_mask (loop_vec_info loop_vinfo, vec_loop_masks *masks,
9072 unsigned int nvectors, tree vectype, tree scalar_mask)
9074 gcc_assert (nvectors != 0);
9075 if (masks->length () < nvectors)
9076 masks->safe_grow_cleared (nvectors, true);
9077 rgroup_controls *rgm = &(*masks)[nvectors - 1];
9078 /* The number of scalars per iteration and the number of vectors are
9079 both compile-time constants. */
9080 unsigned int nscalars_per_iter
9081 = exact_div (nvectors * TYPE_VECTOR_SUBPARTS (vectype),
9082 LOOP_VINFO_VECT_FACTOR (loop_vinfo)).to_constant ();
9084 if (scalar_mask)
9086 scalar_cond_masked_key cond (scalar_mask, nvectors);
9087 loop_vinfo->scalar_cond_masked_set.add (cond);
9090 if (rgm->max_nscalars_per_iter < nscalars_per_iter)
9092 rgm->max_nscalars_per_iter = nscalars_per_iter;
9093 rgm->type = truth_type_for (vectype);
9094 rgm->factor = 1;
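/* Worked example (an illustrative sketch): with a vectorization factor
   of 16 and VECTYPE V8HI (8 subparts), recording NVECTORS == 2 masks
   gives nscalars_per_iter = 2 * 8 / 16 = 1 in rgroup (*masks)[1]; a
   later request with a larger nscalars_per_iter for the same NVECTORS
   simply raises max_nscalars_per_iter, as the code above does.  */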
9098 /* Given a complete set of masks MASKS, extract mask number INDEX
9099 for an rgroup that operates on NVECTORS vectors of type VECTYPE,
9100 where 0 <= INDEX < NVECTORS. Insert any set-up statements before GSI.
9102 See the comment above vec_loop_masks for more details about the mask
9103 arrangement. */
9105 tree
9106 vect_get_loop_mask (gimple_stmt_iterator *gsi, vec_loop_masks *masks,
9107 unsigned int nvectors, tree vectype, unsigned int index)
9109 rgroup_controls *rgm = &(*masks)[nvectors - 1];
9110 tree mask_type = rgm->type;
9112 /* Populate the rgroup's mask array, if this is the first time we've
9113 used it. */
9114 if (rgm->controls.is_empty ())
9116 rgm->controls.safe_grow_cleared (nvectors, true);
9117 for (unsigned int i = 0; i < nvectors; ++i)
9119 tree mask = make_temp_ssa_name (mask_type, NULL, "loop_mask");
9120 /* Provide a dummy definition until the real one is available. */
9121 SSA_NAME_DEF_STMT (mask) = gimple_build_nop ();
9122 rgm->controls[i] = mask;
9126 tree mask = rgm->controls[index];
9127 if (maybe_ne (TYPE_VECTOR_SUBPARTS (mask_type),
9128 TYPE_VECTOR_SUBPARTS (vectype)))
9130 /* A loop mask for data type X can be reused for data type Y
9131 if X has N times more elements than Y and if Y's elements
9132 are N times bigger than X's. In this case each sequence
9133 of N elements in the loop mask will be all-zero or all-one.
9134 We can then view-convert the mask so that each sequence of
9135 N elements is replaced by a single element. */
9136 gcc_assert (multiple_p (TYPE_VECTOR_SUBPARTS (mask_type),
9137 TYPE_VECTOR_SUBPARTS (vectype)));
9138 gimple_seq seq = NULL;
9139 mask_type = truth_type_for (vectype);
9140 mask = gimple_build (&seq, VIEW_CONVERT_EXPR, mask_type, mask);
9141 if (seq)
9142 gsi_insert_seq_before (gsi, seq, GSI_SAME_STMT);
9144 return mask;
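/* Illustrative sketch of the re-use case handled above (assuming the
   rgroup mask was created for 16 x QI data and is now requested for
   8 x HI data): every pair of mask elements is known to be all-zero or
   all-one, so the VIEW_CONVERT_EXPR to the 8-element mask type collapses
   each pair into a single element without changing which scalar
   iterations are active.  */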
9147 /* Record that LOOP_VINFO would need LENS to contain a sequence of NVECTORS
9148 lengths for controlling an operation on VECTYPE. The operation splits
9149 each element of VECTYPE into FACTOR separate subelements, measuring the
9150 length as a number of these subelements. */
9152 void
9153 vect_record_loop_len (loop_vec_info loop_vinfo, vec_loop_lens *lens,
9154 unsigned int nvectors, tree vectype, unsigned int factor)
9156 gcc_assert (nvectors != 0);
9157 if (lens->length () < nvectors)
9158 lens->safe_grow_cleared (nvectors, true);
9159 rgroup_controls *rgl = &(*lens)[nvectors - 1];
9161 /* The number of scalars per iteration, the number of bytes occupied per
9162 scalar and the number of vectors are all compile-time constants. */
9163 unsigned int nscalars_per_iter
9164 = exact_div (nvectors * TYPE_VECTOR_SUBPARTS (vectype),
9165 LOOP_VINFO_VECT_FACTOR (loop_vinfo)).to_constant ();
9167 if (rgl->max_nscalars_per_iter < nscalars_per_iter)
9169 /* For now, we only support cases in which all loads and stores fall back
9170 to VnQI or none do. */
9171 gcc_assert (!rgl->max_nscalars_per_iter
9172 || (rgl->factor == 1 && factor == 1)
9173 || (rgl->max_nscalars_per_iter * rgl->factor
9174 == nscalars_per_iter * factor));
9175 rgl->max_nscalars_per_iter = nscalars_per_iter;
9176 rgl->type = vectype;
9177 rgl->factor = factor;
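/* Illustrative sketch (assuming an operation on V8HI whose length is
   measured in byte-sized subelements): FACTOR == 2 because each HI
   element spans two subelements, and the assert above requires every
   user of the rgroup to agree on nscalars_per_iter * factor, i.e. on
   the number of subelements handled per scalar iteration.  */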
9181 /* Given a complete set of length LENS, extract length number INDEX for an
9182 rgroup that operates on NVECTORS vectors, where 0 <= INDEX < NVECTORS. */
9184 tree
9185 vect_get_loop_len (loop_vec_info loop_vinfo, vec_loop_lens *lens,
9186 unsigned int nvectors, unsigned int index)
9188 rgroup_controls *rgl = &(*lens)[nvectors - 1];
9190 /* Populate the rgroup's len array, if this is the first time we've
9191 used it. */
9192 if (rgl->controls.is_empty ())
9194 rgl->controls.safe_grow_cleared (nvectors, true);
9195 for (unsigned int i = 0; i < nvectors; ++i)
9197 tree len_type = LOOP_VINFO_RGROUP_COMPARE_TYPE (loop_vinfo);
9198 gcc_assert (len_type != NULL_TREE);
9199 tree len = make_temp_ssa_name (len_type, NULL, "loop_len");
9201 /* Provide a dummy definition until the real one is available. */
9202 SSA_NAME_DEF_STMT (len) = gimple_build_nop ();
9203 rgl->controls[i] = len;
9207 return rgl->controls[index];
9210 /* Scale the profiling counters of LOOP, which is vectorized by factor VF,
9211 based on its estimated number of iterations. */
9213 static void
9214 scale_profile_for_vect_loop (class loop *loop, unsigned vf)
9216 edge preheader = loop_preheader_edge (loop);
9217 /* Reduce loop iterations by the vectorization factor. */
9218 gcov_type new_est_niter = niter_for_unrolled_loop (loop, vf);
9219 profile_count freq_h = loop->header->count, freq_e = preheader->count ();
9221 if (freq_h.nonzero_p ())
9223 profile_probability p;
9225 /* Avoid dropping loop body profile counter to 0 because of zero count
9226 in loop's preheader. */
9227 if (!(freq_e == profile_count::zero ()))
9228 freq_e = freq_e.force_nonzero ();
9229 p = freq_e.apply_scale (new_est_niter + 1, 1).probability_in (freq_h);
9230 scale_loop_frequencies (loop, p);
9233 edge exit_e = single_exit (loop);
9234 exit_e->probability = profile_probability::always ()
9235 .apply_scale (1, new_est_niter + 1);
9237 edge exit_l = single_pred_edge (loop->latch);
9238 profile_probability prob = exit_l->probability;
9239 exit_l->probability = exit_e->probability.invert ();
9240 if (prob.initialized_p () && exit_l->probability.initialized_p ())
9241 scale_bbs_frequencies (&loop->latch, 1, exit_l->probability / prob);
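/* Worked example (an illustrative sketch): if the scalar loop was
   estimated to run about 40 iterations and VF == 4, new_est_niter is
   roughly 10; the body counts are then scaled so the header executes
   about (10 + 1) times per preheader entry, and the exit edge gets
   probability 1 / (10 + 1).  */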
9244 /* For a vectorized stmt DEF_STMT_INFO, adjust the latch edge values of all
9245 vectorized PHIs that were originally defined by it. */
9247 static void
9248 maybe_set_vectorized_backedge_value (loop_vec_info loop_vinfo,
9249 stmt_vec_info def_stmt_info)
9251 tree def = gimple_get_lhs (vect_orig_stmt (def_stmt_info)->stmt);
9252 if (!def || TREE_CODE (def) != SSA_NAME)
9253 return;
9254 stmt_vec_info phi_info;
9255 imm_use_iterator iter;
9256 use_operand_p use_p;
9257 FOR_EACH_IMM_USE_FAST (use_p, iter, def)
9258 if (gphi *phi = dyn_cast <gphi *> (USE_STMT (use_p)))
9259 if (gimple_bb (phi)->loop_father->header == gimple_bb (phi)
9260 && (phi_info = loop_vinfo->lookup_stmt (phi))
9261 && STMT_VINFO_RELEVANT_P (phi_info)
9262 && VECTORIZABLE_CYCLE_DEF (STMT_VINFO_DEF_TYPE (phi_info))
9263 && STMT_VINFO_REDUC_TYPE (phi_info) != FOLD_LEFT_REDUCTION
9264 && STMT_VINFO_REDUC_TYPE (phi_info) != EXTRACT_LAST_REDUCTION)
9266 loop_p loop = gimple_bb (phi)->loop_father;
9267 edge e = loop_latch_edge (loop);
9268 if (PHI_ARG_DEF_FROM_EDGE (phi, e) == def)
9270 vec<gimple *> &phi_defs = STMT_VINFO_VEC_STMTS (phi_info);
9271 vec<gimple *> &latch_defs = STMT_VINFO_VEC_STMTS (def_stmt_info);
9272 gcc_assert (phi_defs.length () == latch_defs.length ());
9273 for (unsigned i = 0; i < phi_defs.length (); ++i)
9274 add_phi_arg (as_a <gphi *> (phi_defs[i]),
9275 gimple_get_lhs (latch_defs[i]), e,
9276 gimple_phi_arg_location (phi, e->dest_idx));
9281 /* Vectorize STMT_INFO if relevant, inserting any new instructions before GSI.
9282 When vectorizing STMT_INFO as a store, set *SEEN_STORE to its
9283 stmt_vec_info. */
9285 static bool
9286 vect_transform_loop_stmt (loop_vec_info loop_vinfo, stmt_vec_info stmt_info,
9287 gimple_stmt_iterator *gsi, stmt_vec_info *seen_store)
9289 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
9290 poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
9292 if (dump_enabled_p ())
9293 dump_printf_loc (MSG_NOTE, vect_location,
9294 "------>vectorizing statement: %G", stmt_info->stmt);
9296 if (MAY_HAVE_DEBUG_BIND_STMTS && !STMT_VINFO_LIVE_P (stmt_info))
9297 vect_loop_kill_debug_uses (loop, stmt_info);
9299 if (!STMT_VINFO_RELEVANT_P (stmt_info)
9300 && !STMT_VINFO_LIVE_P (stmt_info))
9301 return false;
9303 if (STMT_VINFO_VECTYPE (stmt_info))
9305 poly_uint64 nunits
9306 = TYPE_VECTOR_SUBPARTS (STMT_VINFO_VECTYPE (stmt_info));
9307 if (!STMT_SLP_TYPE (stmt_info)
9308 && maybe_ne (nunits, vf)
9309 && dump_enabled_p ())
9310 /* For SLP, VF is set according to the unrolling factor and not
9311 to the vector size, hence for SLP this print is not valid. */
9312 dump_printf_loc (MSG_NOTE, vect_location, "multiple-types.\n");
9315 /* Pure SLP statements have already been vectorized. We still need
9316 to apply loop vectorization to hybrid SLP statements. */
9317 if (PURE_SLP_STMT (stmt_info))
9318 return false;
9320 if (dump_enabled_p ())
9321 dump_printf_loc (MSG_NOTE, vect_location, "transform statement.\n");
9323 if (vect_transform_stmt (loop_vinfo, stmt_info, gsi, NULL, NULL))
9324 *seen_store = stmt_info;
9326 return true;
9329 /* Helper function to pass to simplify_replace_tree to enable replacing trees
9330 that are in the hash_map with their corresponding values. */
9332 static tree
9333 find_in_mapping (tree t, void *context)
9335 hash_map<tree,tree>* mapping = (hash_map<tree, tree>*) context;
9337 tree *value = mapping->get (t);
9338 return value ? *value : t;
9341 /* Update EPILOGUE's loop_vec_info. EPILOGUE was constructed as a copy of the
9342 original loop that has now been vectorized.
9344 The inits of the data_references need to be advanced by the number of
9345 iterations of the main loop. This has been computed in vect_do_peeling and
9346 is stored in parameter ADVANCE. We first restore the data_references'
9347 initial offsets with the values recorded in ORIG_DRS_INIT.
9349 Since the loop_vec_info of this EPILOGUE was constructed for the original
9350 loop, its stmt_vec_infos all point to the original statements. These need
9351 to be updated to point to their corresponding copies as well as the SSA_NAMES
9352 in their PATTERN_DEF_SEQs and RELATED_STMTs.
9354 The data_references' connections also need to be updated: their
9355 corresponding dr_vec_infos need to be reconnected to the EPILOGUE's
9356 stmt_vec_infos, their statements need to point to their corresponding copy,
9357 and if they are gather loads or scatter stores then their reference needs
9358 to be updated to point to its corresponding copy. Finally we set
9359 'base_misaligned' to false as we have already peeled for alignment in the
9360 prologue of the main loop. */
9362 static void
9363 update_epilogue_loop_vinfo (class loop *epilogue, tree advance)
9365 loop_vec_info epilogue_vinfo = loop_vec_info_for_loop (epilogue);
9366 auto_vec<gimple *> stmt_worklist;
9367 hash_map<tree,tree> mapping;
9368 gimple *orig_stmt, *new_stmt;
9369 gimple_stmt_iterator epilogue_gsi;
9370 gphi_iterator epilogue_phi_gsi;
9371 stmt_vec_info stmt_vinfo = NULL, related_vinfo;
9372 basic_block *epilogue_bbs = get_loop_body (epilogue);
9373 unsigned i;
9375 free (LOOP_VINFO_BBS (epilogue_vinfo));
9376 LOOP_VINFO_BBS (epilogue_vinfo) = epilogue_bbs;
9378 /* Advance data_reference's with the number of iterations of the previous
9379 loop and its prologue. */
9380 vect_update_inits_of_drs (epilogue_vinfo, advance, PLUS_EXPR);
9383 /* The EPILOGUE loop is a copy of the original loop so they share the same
9384 gimple UIDs. In this loop we update the loop_vec_info of the EPILOGUE to
9385 point to the copied statements. We also create a mapping from the LHSs in
9386 the original loop to the LHSs in the EPILOGUE and create worklists to
9387 update the STMT_VINFO_PATTERN_DEF_SEQs and STMT_VINFO_RELATED_STMTs. */
9388 for (unsigned i = 0; i < epilogue->num_nodes; ++i)
9390 for (epilogue_phi_gsi = gsi_start_phis (epilogue_bbs[i]);
9391 !gsi_end_p (epilogue_phi_gsi); gsi_next (&epilogue_phi_gsi))
9393 new_stmt = epilogue_phi_gsi.phi ();
9395 gcc_assert (gimple_uid (new_stmt) > 0);
9396 stmt_vinfo
9397 = epilogue_vinfo->stmt_vec_infos[gimple_uid (new_stmt) - 1];
9399 orig_stmt = STMT_VINFO_STMT (stmt_vinfo);
9400 STMT_VINFO_STMT (stmt_vinfo) = new_stmt;
9402 mapping.put (gimple_phi_result (orig_stmt),
9403 gimple_phi_result (new_stmt));
9404 /* PHI nodes can not have patterns or related statements. */
9405 gcc_assert (STMT_VINFO_PATTERN_DEF_SEQ (stmt_vinfo) == NULL
9406 && STMT_VINFO_RELATED_STMT (stmt_vinfo) == NULL);
9409 for (epilogue_gsi = gsi_start_bb (epilogue_bbs[i]);
9410 !gsi_end_p (epilogue_gsi); gsi_next (&epilogue_gsi))
9412 new_stmt = gsi_stmt (epilogue_gsi);
9413 if (is_gimple_debug (new_stmt))
9414 continue;
9416 gcc_assert (gimple_uid (new_stmt) > 0);
9417 stmt_vinfo
9418 = epilogue_vinfo->stmt_vec_infos[gimple_uid (new_stmt) - 1];
9420 orig_stmt = STMT_VINFO_STMT (stmt_vinfo);
9421 STMT_VINFO_STMT (stmt_vinfo) = new_stmt;
9423 if (tree old_lhs = gimple_get_lhs (orig_stmt))
9424 mapping.put (old_lhs, gimple_get_lhs (new_stmt));
9426 if (STMT_VINFO_PATTERN_DEF_SEQ (stmt_vinfo))
9428 gimple_seq seq = STMT_VINFO_PATTERN_DEF_SEQ (stmt_vinfo);
9429 for (gimple_stmt_iterator gsi = gsi_start (seq);
9430 !gsi_end_p (gsi); gsi_next (&gsi))
9431 stmt_worklist.safe_push (gsi_stmt (gsi));
9434 related_vinfo = STMT_VINFO_RELATED_STMT (stmt_vinfo);
9435 if (related_vinfo != NULL && related_vinfo != stmt_vinfo)
9437 gimple *stmt = STMT_VINFO_STMT (related_vinfo);
9438 stmt_worklist.safe_push (stmt);
9439 /* Set BB such that the assert in
9440 'get_initial_def_for_reduction' is able to determine that
9441 the BB of the related stmt is inside this loop. */
9442 gimple_set_bb (stmt,
9443 gimple_bb (new_stmt));
9444 related_vinfo = STMT_VINFO_RELATED_STMT (related_vinfo);
9445 gcc_assert (related_vinfo == NULL
9446 || related_vinfo == stmt_vinfo);
9451 /* The PATTERN_DEF_SEQs and RELATED_STMTs in the epilogue were constructed
9452 using the original main loop and thus need to be updated to refer to the
9453 cloned variables used in the epilogue. */
9454 for (unsigned i = 0; i < stmt_worklist.length (); ++i)
9456 gimple *stmt = stmt_worklist[i];
9457 tree *new_op;
9459 for (unsigned j = 1; j < gimple_num_ops (stmt); ++j)
9461 tree op = gimple_op (stmt, j);
9462 if ((new_op = mapping.get(op)))
9463 gimple_set_op (stmt, j, *new_op);
9464 else
9466 /* PR92429: The last argument of simplify_replace_tree disables
9467 folding when replacing arguments. This is required as
9468 otherwise you might end up with different statements than the
9469 ones analyzed in vect_loop_analyze, leading to different
9470 vectorization. */
9471 op = simplify_replace_tree (op, NULL_TREE, NULL_TREE,
9472 &find_in_mapping, &mapping, false);
9473 gimple_set_op (stmt, j, op);
9478 struct data_reference *dr;
9479 vec<data_reference_p> datarefs = LOOP_VINFO_DATAREFS (epilogue_vinfo);
9480 FOR_EACH_VEC_ELT (datarefs, i, dr)
9482 orig_stmt = DR_STMT (dr);
9483 gcc_assert (gimple_uid (orig_stmt) > 0);
9484 stmt_vinfo = epilogue_vinfo->stmt_vec_infos[gimple_uid (orig_stmt) - 1];
9485 /* Data references for gather loads and scatter stores do not use the
9486 updated offset we set using ADVANCE. Instead we have to make sure the
9487 reference in each data reference points to the corresponding copy of
9488 the original in the epilogue. */
9489 if (STMT_VINFO_MEMORY_ACCESS_TYPE (vect_stmt_to_vectorize (stmt_vinfo))
9490 == VMAT_GATHER_SCATTER)
9492 DR_REF (dr)
9493 = simplify_replace_tree (DR_REF (dr), NULL_TREE, NULL_TREE,
9494 &find_in_mapping, &mapping);
9495 DR_BASE_ADDRESS (dr)
9496 = simplify_replace_tree (DR_BASE_ADDRESS (dr), NULL_TREE, NULL_TREE,
9497 &find_in_mapping, &mapping);
9499 DR_STMT (dr) = STMT_VINFO_STMT (stmt_vinfo);
9500 stmt_vinfo->dr_aux.stmt = stmt_vinfo;
9501 /* The vector size of the epilogue is smaller than that of the main loop,
9502 so the alignment requirement is either the same or lower. This means
9503 the DR will by definition be aligned. */
9504 STMT_VINFO_DR_INFO (stmt_vinfo)->base_misaligned = false;
9507 epilogue_vinfo->shared->datarefs_copy.release ();
9508 epilogue_vinfo->shared->save_datarefs ();
9511 /* Function vect_transform_loop.
9513 The analysis phase has determined that the loop is vectorizable.
9514 Vectorize the loop - create vectorized stmts to replace the scalar
9515 stmts in the loop, and update the loop exit condition.
9516 Returns scalar epilogue loop if any. */
9518 class loop *
9519 vect_transform_loop (loop_vec_info loop_vinfo, gimple *loop_vectorized_call)
9521 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
9522 class loop *epilogue = NULL;
9523 basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
9524 int nbbs = loop->num_nodes;
9525 int i;
9526 tree niters_vector = NULL_TREE;
9527 tree step_vector = NULL_TREE;
9528 tree niters_vector_mult_vf = NULL_TREE;
9529 poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
9530 unsigned int lowest_vf = constant_lower_bound (vf);
9531 gimple *stmt;
9532 bool check_profitability = false;
9533 unsigned int th;
9535 DUMP_VECT_SCOPE ("vec_transform_loop");
9537 loop_vinfo->shared->check_datarefs ();
9539 /* Use the more conservative vectorization threshold. If the number
9540 of iterations is constant, assume the cost check has been performed
9541 by our caller. If the threshold makes all loops profitable that
9542 run at least the (estimated) vectorization factor number of times,
9543 checking is pointless, too. */
9544 th = LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo);
9545 if (vect_apply_runtime_profitability_check_p (loop_vinfo))
9547 if (dump_enabled_p ())
9548 dump_printf_loc (MSG_NOTE, vect_location,
9549 "Profitability threshold is %d loop iterations.\n",
9550 th);
9551 check_profitability = true;
9554 /* Make sure there exists a single-predecessor exit bb. Do this before
9555 versioning. */
9556 edge e = single_exit (loop);
9557 if (! single_pred_p (e->dest))
9559 split_loop_exit_edge (e, true);
9560 if (dump_enabled_p ())
9561 dump_printf (MSG_NOTE, "split exit edge\n");
9564 /* Version the loop first, if required, so the profitability check
9565 comes first. */
9567 if (LOOP_REQUIRES_VERSIONING (loop_vinfo))
9569 class loop *sloop
9570 = vect_loop_versioning (loop_vinfo, loop_vectorized_call);
9571 sloop->force_vectorize = false;
9572 check_profitability = false;
9575 /* Make sure there exists a single-predecessor exit bb also on the
9576 scalar loop copy. Do this after versioning but before peeling
9577 so the CFG structure is fine for both the scalar and the if-converted
9578 loop, and slpeel_duplicate_current_defs_from_edges sees matched
9579 loop-closed PHI nodes on the exit. */
9580 if (LOOP_VINFO_SCALAR_LOOP (loop_vinfo))
9582 e = single_exit (LOOP_VINFO_SCALAR_LOOP (loop_vinfo));
9583 if (! single_pred_p (e->dest))
9585 split_loop_exit_edge (e, true);
9586 if (dump_enabled_p ())
9587 dump_printf (MSG_NOTE, "split exit edge of scalar loop\n");
9591 tree niters = vect_build_loop_niters (loop_vinfo);
9592 LOOP_VINFO_NITERS_UNCHANGED (loop_vinfo) = niters;
9593 tree nitersm1 = unshare_expr (LOOP_VINFO_NITERSM1 (loop_vinfo));
9594 bool niters_no_overflow = loop_niters_no_overflow (loop_vinfo);
9595 tree advance;
9596 drs_init_vec orig_drs_init;
9598 epilogue = vect_do_peeling (loop_vinfo, niters, nitersm1, &niters_vector,
9599 &step_vector, &niters_vector_mult_vf, th,
9600 check_profitability, niters_no_overflow,
9601 &advance);
9603 if (LOOP_VINFO_SCALAR_LOOP (loop_vinfo)
9604 && LOOP_VINFO_SCALAR_LOOP_SCALING (loop_vinfo).initialized_p ())
9605 scale_loop_frequencies (LOOP_VINFO_SCALAR_LOOP (loop_vinfo),
9606 LOOP_VINFO_SCALAR_LOOP_SCALING (loop_vinfo));
9608 if (niters_vector == NULL_TREE)
9610 if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
9611 && !LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo)
9612 && known_eq (lowest_vf, vf))
9614 niters_vector
9615 = build_int_cst (TREE_TYPE (LOOP_VINFO_NITERS (loop_vinfo)),
9616 LOOP_VINFO_INT_NITERS (loop_vinfo) / lowest_vf);
9617 step_vector = build_one_cst (TREE_TYPE (niters));
9619 else if (vect_use_loop_mask_for_alignment_p (loop_vinfo))
9620 vect_gen_vector_loop_niters (loop_vinfo, niters, &niters_vector,
9621 &step_vector, niters_no_overflow);
9622 else
9623 /* vect_do_peeling subtracted the number of peeled prologue
9624 iterations from LOOP_VINFO_NITERS. */
9625 vect_gen_vector_loop_niters (loop_vinfo, LOOP_VINFO_NITERS (loop_vinfo),
9626 &niters_vector, &step_vector,
9627 niters_no_overflow);
9630 /* 1) Make sure the loop header has exactly two entries
9631 2) Make sure we have a preheader basic block. */
9633 gcc_assert (EDGE_COUNT (loop->header->preds) == 2);
9635 split_edge (loop_preheader_edge (loop));
9637 if (vect_use_loop_mask_for_alignment_p (loop_vinfo))
9638 /* This will deal with any possible peeling. */
9639 vect_prepare_for_masked_peels (loop_vinfo);
9641 /* Schedule the SLP instances first, then handle loop vectorization
9642 below. */
9643 if (!loop_vinfo->slp_instances.is_empty ())
9645 DUMP_VECT_SCOPE ("scheduling SLP instances");
9646 vect_schedule_slp (loop_vinfo, LOOP_VINFO_SLP_INSTANCES (loop_vinfo));
9649 /* FORNOW: the vectorizer supports only loops whose body consists
9650 of one basic block (header + empty latch). When the vectorizer
9651 supports more involved loop forms, the order by which the BBs are
9652 traversed needs to be reconsidered. */
9654 for (i = 0; i < nbbs; i++)
9656 basic_block bb = bbs[i];
9657 stmt_vec_info stmt_info;
9659 for (gphi_iterator si = gsi_start_phis (bb); !gsi_end_p (si);
9660 gsi_next (&si))
9662 gphi *phi = si.phi ();
9663 if (dump_enabled_p ())
9664 dump_printf_loc (MSG_NOTE, vect_location,
9665 "------>vectorizing phi: %G", phi);
9666 stmt_info = loop_vinfo->lookup_stmt (phi);
9667 if (!stmt_info)
9668 continue;
9670 if (MAY_HAVE_DEBUG_BIND_STMTS && !STMT_VINFO_LIVE_P (stmt_info))
9671 vect_loop_kill_debug_uses (loop, stmt_info);
9673 if (!STMT_VINFO_RELEVANT_P (stmt_info)
9674 && !STMT_VINFO_LIVE_P (stmt_info))
9675 continue;
9677 if (STMT_VINFO_VECTYPE (stmt_info)
9678 && (maybe_ne
9679 (TYPE_VECTOR_SUBPARTS (STMT_VINFO_VECTYPE (stmt_info)), vf))
9680 && dump_enabled_p ())
9681 dump_printf_loc (MSG_NOTE, vect_location, "multiple-types.\n");
9683 if ((STMT_VINFO_DEF_TYPE (stmt_info) == vect_induction_def
9684 || STMT_VINFO_DEF_TYPE (stmt_info) == vect_reduction_def
9685 || STMT_VINFO_DEF_TYPE (stmt_info) == vect_double_reduction_def
9686 || STMT_VINFO_DEF_TYPE (stmt_info) == vect_nested_cycle
9687 || STMT_VINFO_DEF_TYPE (stmt_info) == vect_internal_def)
9688 && ! PURE_SLP_STMT (stmt_info))
9690 if (dump_enabled_p ())
9691 dump_printf_loc (MSG_NOTE, vect_location, "transform phi.\n");
9692 vect_transform_stmt (loop_vinfo, stmt_info, NULL, NULL, NULL);
9696 for (gphi_iterator si = gsi_start_phis (bb); !gsi_end_p (si);
9697 gsi_next (&si))
9699 gphi *phi = si.phi ();
9700 stmt_info = loop_vinfo->lookup_stmt (phi);
9701 if (!stmt_info)
9702 continue;
9704 if (!STMT_VINFO_RELEVANT_P (stmt_info)
9705 && !STMT_VINFO_LIVE_P (stmt_info))
9706 continue;
9708 if ((STMT_VINFO_DEF_TYPE (stmt_info) == vect_induction_def
9709 || STMT_VINFO_DEF_TYPE (stmt_info) == vect_reduction_def
9710 || STMT_VINFO_DEF_TYPE (stmt_info) == vect_double_reduction_def
9711 || STMT_VINFO_DEF_TYPE (stmt_info) == vect_nested_cycle
9712 || STMT_VINFO_DEF_TYPE (stmt_info) == vect_internal_def)
9713 && ! PURE_SLP_STMT (stmt_info))
9714 maybe_set_vectorized_backedge_value (loop_vinfo, stmt_info);
9717 for (gimple_stmt_iterator si = gsi_start_bb (bb);
9718 !gsi_end_p (si);)
9720 stmt = gsi_stmt (si);
9721 /* During vectorization remove existing clobber stmts. */
9722 if (gimple_clobber_p (stmt))
9724 unlink_stmt_vdef (stmt);
9725 gsi_remove (&si, true);
9726 release_defs (stmt);
9728 else
9730 /* Ignore vector stmts created in the outer loop. */
9731 stmt_info = loop_vinfo->lookup_stmt (stmt);
9733 /* vector stmts created in the outer-loop during vectorization of
9734 stmts in an inner-loop may not have a stmt_info, and do not
9735 need to be vectorized. */
9736 stmt_vec_info seen_store = NULL;
9737 if (stmt_info)
9739 if (STMT_VINFO_IN_PATTERN_P (stmt_info))
9741 gimple *def_seq = STMT_VINFO_PATTERN_DEF_SEQ (stmt_info);
9742 for (gimple_stmt_iterator subsi = gsi_start (def_seq);
9743 !gsi_end_p (subsi); gsi_next (&subsi))
9745 stmt_vec_info pat_stmt_info
9746 = loop_vinfo->lookup_stmt (gsi_stmt (subsi));
9747 vect_transform_loop_stmt (loop_vinfo, pat_stmt_info,
9748 &si, &seen_store);
9750 stmt_vec_info pat_stmt_info
9751 = STMT_VINFO_RELATED_STMT (stmt_info);
9752 if (vect_transform_loop_stmt (loop_vinfo, pat_stmt_info,
9753 &si, &seen_store))
9754 maybe_set_vectorized_backedge_value (loop_vinfo,
9755 pat_stmt_info);
9757 else
9759 if (vect_transform_loop_stmt (loop_vinfo, stmt_info, &si,
9760 &seen_store))
9761 maybe_set_vectorized_backedge_value (loop_vinfo,
9762 stmt_info);
9765 gsi_next (&si);
9766 if (seen_store)
9768 if (STMT_VINFO_GROUPED_ACCESS (seen_store))
9769 /* Interleaving. If IS_STORE is TRUE, the
9770 vectorization of the interleaving chain was
9771 completed - free all the stores in the chain. */
9772 vect_remove_stores (loop_vinfo,
9773 DR_GROUP_FIRST_ELEMENT (seen_store));
9774 else
9775 /* Free the attached stmt_vec_info and remove the stmt. */
9776 loop_vinfo->remove_stmt (stmt_info);
9781 /* Stub out scalar statements that must not survive vectorization.
9782 Doing this here helps with grouped statements, or statements that
9783 are involved in patterns. */
9784 for (gimple_stmt_iterator gsi = gsi_start_bb (bb);
9785 !gsi_end_p (gsi); gsi_next (&gsi))
9787 gcall *call = dyn_cast <gcall *> (gsi_stmt (gsi));
9788 if (!call || !gimple_call_internal_p (call))
9789 continue;
9790 internal_fn ifn = gimple_call_internal_fn (call);
9791 if (ifn == IFN_MASK_LOAD)
9793 tree lhs = gimple_get_lhs (call);
9794 if (!VECTOR_TYPE_P (TREE_TYPE (lhs)))
9796 tree zero = build_zero_cst (TREE_TYPE (lhs));
9797 gimple *new_stmt = gimple_build_assign (lhs, zero);
9798 gsi_replace (&gsi, new_stmt, true);
9801 else if (conditional_internal_fn_code (ifn) != ERROR_MARK)
9803 tree lhs = gimple_get_lhs (call);
9804 if (!VECTOR_TYPE_P (TREE_TYPE (lhs)))
9806 tree else_arg
9807 = gimple_call_arg (call, gimple_call_num_args (call) - 1);
9808 gimple *new_stmt = gimple_build_assign (lhs, else_arg);
9809 gsi_replace (&gsi, new_stmt, true);
9813 } /* BBs in loop */
9815 /* The vectorization factor is always > 1, so if we use an IV increment of 1,
9816 a zero NITERS becomes a nonzero NITERS_VECTOR. */
9817 if (integer_onep (step_vector))
9818 niters_no_overflow = true;
9819 vect_set_loop_condition (loop, loop_vinfo, niters_vector, step_vector,
9820 niters_vector_mult_vf, !niters_no_overflow);
9822 unsigned int assumed_vf = vect_vf_for_cost (loop_vinfo);
9823 scale_profile_for_vect_loop (loop, assumed_vf);
9825 /* True if the final iteration might not handle a full vector's
9826 worth of scalar iterations. */
9827 bool final_iter_may_be_partial
9828 = LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo);
9829 /* The minimum number of iterations performed by the epilogue. This
9830 is 1 when peeling for gaps because we always need a final scalar
9831 iteration. */
9832 int min_epilogue_iters = LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo) ? 1 : 0;
9833 /* +1 to convert latch counts to loop iteration counts,
9834 -min_epilogue_iters to remove iterations that cannot be performed
9835 by the vector code. */
9836 int bias_for_lowest = 1 - min_epilogue_iters;
9837 int bias_for_assumed = bias_for_lowest;
9838 int alignment_npeels = LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo);
9839 if (alignment_npeels && LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))
9841 /* When the amount of peeling is known at compile time, the first
9842 iteration will have exactly alignment_npeels active elements.
9843 In the worst case it will have at least one. */
9844 int min_first_active = (alignment_npeels > 0 ? alignment_npeels : 1);
9845 bias_for_lowest += lowest_vf - min_first_active;
9846 bias_for_assumed += assumed_vf - min_first_active;
9848 /* In these calculations the "- 1" converts loop iteration counts
9849 back to latch counts. */
9850 if (loop->any_upper_bound)
9852 loop_vec_info main_vinfo = LOOP_VINFO_ORIG_LOOP_INFO (loop_vinfo);
9853 loop->nb_iterations_upper_bound
9854 = (final_iter_may_be_partial
9855 ? wi::udiv_ceil (loop->nb_iterations_upper_bound + bias_for_lowest,
9856 lowest_vf) - 1
9857 : wi::udiv_floor (loop->nb_iterations_upper_bound + bias_for_lowest,
9858 lowest_vf) - 1);
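/* Illustrative sketch (assuming lowest_vf == 4, no peeling for gaps and
   no partial-vector peeling adjustment, so bias_for_lowest == 1, and a
   latch-count upper bound of 103, i.e. at most 104 iterations): the
   non-partial case gives floor ((103 + 1) / 4) - 1 = 25, i.e. at most
   26 vector iterations expressed again as a latch count.  */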
9859 if (main_vinfo)
9861 unsigned int bound;
9862 poly_uint64 main_iters
9863 = upper_bound (LOOP_VINFO_VECT_FACTOR (main_vinfo),
9864 LOOP_VINFO_COST_MODEL_THRESHOLD (main_vinfo));
9865 main_iters
9866 = upper_bound (main_iters,
9867 LOOP_VINFO_VERSIONING_THRESHOLD (main_vinfo));
9868 if (can_div_away_from_zero_p (main_iters,
9869 LOOP_VINFO_VECT_FACTOR (loop_vinfo),
9870 &bound))
9871 loop->nb_iterations_upper_bound
9872 = wi::umin ((widest_int) (bound - 1),
9873 loop->nb_iterations_upper_bound);
9876 if (loop->any_likely_upper_bound)
9877 loop->nb_iterations_likely_upper_bound
9878 = (final_iter_may_be_partial
9879 ? wi::udiv_ceil (loop->nb_iterations_likely_upper_bound
9880 + bias_for_lowest, lowest_vf) - 1
9881 : wi::udiv_floor (loop->nb_iterations_likely_upper_bound
9882 + bias_for_lowest, lowest_vf) - 1);
9883 if (loop->any_estimate)
9884 loop->nb_iterations_estimate
9885 = (final_iter_may_be_partial
9886 ? wi::udiv_ceil (loop->nb_iterations_estimate + bias_for_assumed,
9887 assumed_vf) - 1
9888 : wi::udiv_floor (loop->nb_iterations_estimate + bias_for_assumed,
9889 assumed_vf) - 1);
9891 if (dump_enabled_p ())
9893 if (!LOOP_VINFO_EPILOGUE_P (loop_vinfo))
9895 dump_printf_loc (MSG_NOTE, vect_location,
9896 "LOOP VECTORIZED\n");
9897 if (loop->inner)
9898 dump_printf_loc (MSG_NOTE, vect_location,
9899 "OUTER LOOP VECTORIZED\n");
9900 dump_printf (MSG_NOTE, "\n");
9902 else
9903 dump_printf_loc (MSG_NOTE, vect_location,
9904 "LOOP EPILOGUE VECTORIZED (MODE=%s)\n",
9905 GET_MODE_NAME (loop_vinfo->vector_mode));
9908 /* Loops vectorized with a variable factor won't benefit from
9909 unrolling/peeling. */
9910 if (!vf.is_constant ())
9912 loop->unroll = 1;
9913 if (dump_enabled_p ())
9914 dump_printf_loc (MSG_NOTE, vect_location, "Disabling unrolling due to"
9915 " variable-length vectorization factor\n");
9917 /* Free SLP instances here because otherwise stmt reference counting
9918 won't work. */
9919 slp_instance instance;
9920 FOR_EACH_VEC_ELT (LOOP_VINFO_SLP_INSTANCES (loop_vinfo), i, instance)
9921 vect_free_slp_instance (instance);
9922 LOOP_VINFO_SLP_INSTANCES (loop_vinfo).release ();
9923 /* Clear up the safelen field since its value is invalid after vectorization,
9924 as the vectorized loop can have loop-carried dependencies. */
9925 loop->safelen = 0;
9927 if (epilogue)
9929 update_epilogue_loop_vinfo (epilogue, advance);
9931 epilogue->simduid = loop->simduid;
9932 epilogue->force_vectorize = loop->force_vectorize;
9933 epilogue->dont_vectorize = false;
9936 return epilogue;
9939 /* The code below is trying to perform a simple optimization - revert
9940 if-conversion for masked stores, i.e. if the mask of a store is zero
9941 do not perform it, and if possible also skip the stored-value producers.
9942 For example,
9943 for (i=0; i<n; i++)
9944 if (c[i])
9946 p1[i] += 1;
9947 p2[i] = p3[i] +2;
9949 this transformation will produce the following semi-hammock:
9951 if (!mask__ifc__42.18_165 == { 0, 0, 0, 0, 0, 0, 0, 0 })
9953 vect__11.19_170 = MASK_LOAD (vectp_p1.20_168, 0B, mask__ifc__42.18_165);
9954 vect__12.22_172 = vect__11.19_170 + vect_cst__171;
9955 MASK_STORE (vectp_p1.23_175, 0B, mask__ifc__42.18_165, vect__12.22_172);
9956 vect__18.25_182 = MASK_LOAD (vectp_p3.26_180, 0B, mask__ifc__42.18_165);
9957 vect__19.28_184 = vect__18.25_182 + vect_cst__183;
9958 MASK_STORE (vectp_p2.29_187, 0B, mask__ifc__42.18_165, vect__19.28_184);
9962 void
9963 optimize_mask_stores (class loop *loop)
9965 basic_block *bbs = get_loop_body (loop);
9966 unsigned nbbs = loop->num_nodes;
9967 unsigned i;
9968 basic_block bb;
9969 class loop *bb_loop;
9970 gimple_stmt_iterator gsi;
9971 gimple *stmt;
9972 auto_vec<gimple *> worklist;
9973 auto_purge_vect_location sentinel;
9975 vect_location = find_loop_location (loop);
9976 /* Pick up all masked stores in loop if any. */
9977 for (i = 0; i < nbbs; i++)
9979 bb = bbs[i];
9980 for (gsi = gsi_start_bb (bb); !gsi_end_p (gsi);
9981 gsi_next (&gsi))
9983 stmt = gsi_stmt (gsi);
9984 if (gimple_call_internal_p (stmt, IFN_MASK_STORE))
9985 worklist.safe_push (stmt);
9989 free (bbs);
9990 if (worklist.is_empty ())
9991 return;
9993 /* Loop has masked stores. */
9994 while (!worklist.is_empty ())
9996 gimple *last, *last_store;
9997 edge e, efalse;
9998 tree mask;
9999 basic_block store_bb, join_bb;
10000 gimple_stmt_iterator gsi_to;
10001 tree vdef, new_vdef;
10002 gphi *phi;
10003 tree vectype;
10004 tree zero;
10006 last = worklist.pop ();
10007 mask = gimple_call_arg (last, 2);
10008 bb = gimple_bb (last);
10009 /* Create then_bb and an if-then structure in the CFG; then_bb belongs to
10010 the same loop as if_bb. It could be different from LOOP when a two-
10011 level loop nest is vectorized and the mask_store belongs to the inner
10012 one. */
10013 e = split_block (bb, last);
10014 bb_loop = bb->loop_father;
10015 gcc_assert (loop == bb_loop || flow_loop_nested_p (loop, bb_loop));
10016 join_bb = e->dest;
10017 store_bb = create_empty_bb (bb);
10018 add_bb_to_loop (store_bb, bb_loop);
10019 e->flags = EDGE_TRUE_VALUE;
10020 efalse = make_edge (bb, store_bb, EDGE_FALSE_VALUE);
10021 /* Put STORE_BB into the likely part. */
10022 efalse->probability = profile_probability::unlikely ();
10023 store_bb->count = efalse->count ();
10024 make_single_succ_edge (store_bb, join_bb, EDGE_FALLTHRU);
10025 if (dom_info_available_p (CDI_DOMINATORS))
10026 set_immediate_dominator (CDI_DOMINATORS, store_bb, bb);
10027 if (dump_enabled_p ())
10028 dump_printf_loc (MSG_NOTE, vect_location,
10029 "Create new block %d to sink mask stores.",
10030 store_bb->index);
10031 /* Create vector comparison with boolean result. */
10032 vectype = TREE_TYPE (mask);
10033 zero = build_zero_cst (vectype);
10034 stmt = gimple_build_cond (EQ_EXPR, mask, zero, NULL_TREE, NULL_TREE);
10035 gsi = gsi_last_bb (bb);
10036 gsi_insert_after (&gsi, stmt, GSI_SAME_STMT);
10037 /* Create a new PHI node for the vdef of the last masked store:
10038 .MEM_2 = VDEF <.MEM_1>
10039 will be converted to
10040 .MEM_3 = VDEF <.MEM_1>
10041 and a new PHI node will be created in the join bb:
10042 .MEM_2 = PHI <.MEM_1, .MEM_3>
10043 */
10044 vdef = gimple_vdef (last);
10045 new_vdef = make_ssa_name (gimple_vop (cfun), last);
10046 gimple_set_vdef (last, new_vdef);
10047 phi = create_phi_node (vdef, join_bb);
10048 add_phi_arg (phi, new_vdef, EDGE_SUCC (store_bb, 0), UNKNOWN_LOCATION);
10050 /* Put all masked stores with the same mask to STORE_BB if possible. */
10051 while (true)
10052 {
10053 gimple_stmt_iterator gsi_from;
10054 gimple *stmt1 = NULL;
10056 /* Move masked store to STORE_BB. */
10057 last_store = last;
10058 gsi = gsi_for_stmt (last);
10059 gsi_from = gsi;
10060 /* Shift GSI to the previous stmt for further traversal. */
10061 gsi_prev (&gsi);
10062 gsi_to = gsi_start_bb (store_bb);
10063 gsi_move_before (&gsi_from, &gsi_to);
10064 /* Set GSI_TO to the start of the now non-empty block. */
10065 gsi_to = gsi_start_bb (store_bb);
10066 if (dump_enabled_p ())
10067 dump_printf_loc (MSG_NOTE, vect_location,
10068 "Move stmt to created bb\n%G", last);
10069 /* Move all stored value producers if possible. */
10070 while (!gsi_end_p (gsi))
10071 {
10072 tree lhs;
10073 imm_use_iterator imm_iter;
10074 use_operand_p use_p;
10075 bool res;
10077 /* Skip debug statements. */
10078 if (is_gimple_debug (gsi_stmt (gsi)))
10079 {
10080 gsi_prev (&gsi);
10081 continue;
10082 }
10083 stmt1 = gsi_stmt (gsi);
10084 /* Do not consider statements writing to memory or having
10085 a volatile operand. */
10086 if (gimple_vdef (stmt1)
10087 || gimple_has_volatile_ops (stmt1))
10088 break;
10089 gsi_from = gsi;
10090 gsi_prev (&gsi);
10091 lhs = gimple_get_lhs (stmt1);
10092 if (!lhs)
10093 break;
10095 /* LHS of vectorized stmt must be SSA_NAME. */
10096 if (TREE_CODE (lhs) != SSA_NAME)
10097 break;
10099 if (!VECTOR_TYPE_P (TREE_TYPE (lhs)))
10100 {
10101 /* Remove dead scalar statement. */
10102 if (has_zero_uses (lhs))
10103 {
10104 gsi_remove (&gsi_from, true);
10105 continue;
10106 }
10107 break;
10108 }
10109 /* Check that LHS does not have uses outside of STORE_BB. */
10110 res = true;
10111 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, lhs)
10112 {
10113 gimple *use_stmt;
10114 use_stmt = USE_STMT (use_p);
10115 if (is_gimple_debug (use_stmt))
10116 continue;
10117 if (gimple_bb (use_stmt) != store_bb)
10118 {
10119 res = false;
10120 break;
10121 }
10122 }
10123 if (!res)
10124 break;
10126 if (gimple_vuse (stmt1)
10127 && gimple_vuse (stmt1) != gimple_vuse (last_store))
10128 break;
10130 /* Can move STMT1 to STORE_BB. */
10131 if (dump_enabled_p ())
10132 dump_printf_loc (MSG_NOTE, vect_location,
10133 "Move stmt to created bb\n%G", stmt1);
10134 gsi_move_before (&gsi_from, &gsi_to);
10135 /* Shift GSI_TO for further insertion. */
10136 gsi_prev (&gsi_to);
10137 }
10138 /* Put other masked stores with the same mask to STORE_BB. */
10139 if (worklist.is_empty ()
10140 || gimple_call_arg (worklist.last (), 2) != mask
10141 || worklist.last () != stmt1)
10142 break;
10143 last = worklist.pop ();
10144 }
10145 add_phi_arg (phi, gimple_vuse (last_store), e, UNKNOWN_LOCATION);
10146 }
10147 }
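/* Editorial note: the following is an illustrative sketch, not part of the
   pass. It shows, in plain scalar C, the runtime effect of the
   transformation performed above: the masked stores and the statements that
   produce their stored values are guarded by an "is the whole mask false?"
   test and skipped when no lane is active. The names masked_store_example,
   mask_all_zero_p and VLEN are hypothetical and exist only for this example;
   the block is guarded out so it does not become part of the compiled file. */
#if 0
#include <stdbool.h>
#include <stddef.h>

#define VLEN 8	/* Hypothetical vector length.  */

/* Return true iff every lane of MASK is false.  */
static bool
mask_all_zero_p (const bool mask[VLEN])
{
  for (size_t i = 0; i < VLEN; i++)
    if (mask[i])
      return false;
  return true;
}

/* One "vector iteration" of the example loop from the comment above:
   only perform the per-lane work when at least one lane is active.  */
static void
masked_store_example (int p1[VLEN], int p2[VLEN], const int p3[VLEN],
		      const bool mask[VLEN])
{
  if (!mask_all_zero_p (mask))
    for (size_t i = 0; i < VLEN; i++)
      if (mask[i])
	{
	  p1[i] += 1;
	  p2[i] = p3[i] + 2;
	}
}
#endif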
10149 /* Decide whether it is possible to use a zero-based induction variable
10150 when vectorizing LOOP_VINFO with partial vectors. If it is, return
10151 the value that the induction variable must be able to hold in order
10152 to ensure that the rgroups eventually have no active vector elements.
10153 Return -1 otherwise. */
10155 widest_int
10156 vect_iv_limit_for_partial_vectors (loop_vec_info loop_vinfo)
10157 {
10158 tree niters_skip = LOOP_VINFO_MASK_SKIP_NITERS (loop_vinfo);
10159 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
10160 unsigned HOST_WIDE_INT max_vf = vect_max_vf (loop_vinfo);
10162 /* Calculate the value that the induction variable must be able
10163 to hit in order to ensure that we end the loop with an all-false mask.
10164 This involves adding the maximum number of inactive trailing scalar
10165 iterations. */
10166 widest_int iv_limit = -1;
10167 if (max_loop_iterations (loop, &iv_limit))
10168 {
10169 if (niters_skip)
10170 {
10171 /* Add the maximum number of skipped iterations to the
10172 maximum iteration count. */
10173 if (TREE_CODE (niters_skip) == INTEGER_CST)
10174 iv_limit += wi::to_widest (niters_skip);
10175 else
10176 iv_limit += max_vf - 1;
10177 }
10178 else if (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo))
10179 /* Make a conservatively-correct assumption. */
10180 iv_limit += max_vf - 1;
10182 /* IV_LIMIT is the maximum number of latch iterations, which is also
10183 the maximum in-range IV value. Round this value down to the previous
10184 vector alignment boundary and then add an extra full iteration. */
10185 poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
10186 iv_limit = (iv_limit & -(int) known_alignment (vf)) + max_vf;
10187 }
10188 return iv_limit;
10189 }
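/* Editorial note: an illustrative sketch, not part of the pass, of the
   rounding step above on plain integers. The real code uses widest_int and
   poly_uint64; the helper name and parameters below are hypothetical. For a
   constant power-of-two VF, known_alignment (vf) and max_vf both equal VF,
   so e.g. with a maximum latch-iteration count of 1003 and VF == 8 we get
   (1003 & -8) + 8 == 1000 + 8 == 1008: the limit is rounded down to the
   previous multiple of the vector length and one full vector iteration is
   added.  */
#if 0
static unsigned long long
iv_limit_example (unsigned long long max_latch_iters, unsigned vf)
{
  /* VF is assumed to be a power of two, so -VF is the round-down mask.  */
  unsigned long long limit = max_latch_iters & -(unsigned long long) vf;
  return limit + vf;
}
#endif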
10191 /* For the given rgroup_controls RGC, check whether an induction variable
10192 would ever hit a value that produces a set of all-false masks or zero
10193 lengths before wrapping around. Return true if it's possible to wrap
10194 around before hitting the desired value, otherwise return false. */
10196 bool
10197 vect_rgroup_iv_might_wrap_p (loop_vec_info loop_vinfo, rgroup_controls *rgc)
10198 {
10199 widest_int iv_limit = vect_iv_limit_for_partial_vectors (loop_vinfo);
10201 if (iv_limit == -1)
10202 return true;
10204 tree compare_type = LOOP_VINFO_RGROUP_COMPARE_TYPE (loop_vinfo);
10205 unsigned int compare_precision = TYPE_PRECISION (compare_type);
10206 unsigned nitems = rgc->max_nscalars_per_iter * rgc->factor;
10208 if (wi::min_precision (iv_limit * nitems, UNSIGNED) > compare_precision)
10209 return true;
10211 return false;
10212 }
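/* Editorial note: an illustrative sketch, not part of the pass, of the
   precision check above on plain integers. The helper name and parameters
   are hypothetical; the real code uses wi::min_precision on widest_int
   values. The IV counts scalar items, so it must be able to represent
   IV_LIMIT * NITEMS; if that value needs more bits than the compare type
   provides, the IV might wrap before the all-false / zero-length value is
   reached.  */
#if 0
#include <stdbool.h>

static bool
iv_might_wrap_example (unsigned long long iv_limit, unsigned nitems,
		       unsigned compare_precision)
{
  /* Assumed not to overflow here; the real code uses arbitrary precision.  */
  unsigned long long max_value = iv_limit * nitems;

  /* Number of bits needed to represent MAX_VALUE as an unsigned value.  */
  unsigned bits_needed = 0;
  while (max_value)
    {
      bits_needed++;
      max_value >>= 1;
    }
  return bits_needed > compare_precision;
}
#endif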