Support TI mode and soft float on PA64
[official-gcc.git] / gcc / tree-vect-loop.c
blobabf87f99d6d4a18a50c4499fc8cccc8dc3ff814b
1 /* Loop Vectorization
2 Copyright (C) 2003-2021 Free Software Foundation, Inc.
3 Contributed by Dorit Naishlos <dorit@il.ibm.com> and
4 Ira Rosen <irar@il.ibm.com>
6 This file is part of GCC.
8 GCC is free software; you can redistribute it and/or modify it under
9 the terms of the GNU General Public License as published by the Free
10 Software Foundation; either version 3, or (at your option) any later
11 version.
13 GCC is distributed in the hope that it will be useful, but WITHOUT ANY
14 WARRANTY; without even the implied warranty of MERCHANTABILITY or
15 FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
16 for more details.
18 You should have received a copy of the GNU General Public License
19 along with GCC; see the file COPYING3. If not see
20 <http://www.gnu.org/licenses/>. */
22 #define INCLUDE_ALGORITHM
23 #include "config.h"
24 #include "system.h"
25 #include "coretypes.h"
26 #include "backend.h"
27 #include "target.h"
28 #include "rtl.h"
29 #include "tree.h"
30 #include "gimple.h"
31 #include "cfghooks.h"
32 #include "tree-pass.h"
33 #include "ssa.h"
34 #include "optabs-tree.h"
35 #include "diagnostic-core.h"
36 #include "fold-const.h"
37 #include "stor-layout.h"
38 #include "cfganal.h"
39 #include "gimplify.h"
40 #include "gimple-iterator.h"
41 #include "gimplify-me.h"
42 #include "tree-ssa-loop-ivopts.h"
43 #include "tree-ssa-loop-manip.h"
44 #include "tree-ssa-loop-niter.h"
45 #include "tree-ssa-loop.h"
46 #include "cfgloop.h"
47 #include "tree-scalar-evolution.h"
48 #include "tree-vectorizer.h"
49 #include "gimple-fold.h"
50 #include "cgraph.h"
51 #include "tree-cfg.h"
52 #include "tree-if-conv.h"
53 #include "internal-fn.h"
54 #include "tree-vector-builder.h"
55 #include "vec-perm-indices.h"
56 #include "tree-eh.h"
58 /* Loop Vectorization Pass.
60 This pass tries to vectorize loops.
62 For example, the vectorizer transforms the following simple loop:
64 short a[N]; short b[N]; short c[N]; int i;
66 for (i=0; i<N; i++){
67 a[i] = b[i] + c[i];
70 as if it was manually vectorized by rewriting the source code into:
72 typedef int __attribute__((mode(V8HI))) v8hi;
73 short a[N]; short b[N]; short c[N]; int i;
74 v8hi *pa = (v8hi*)a, *pb = (v8hi*)b, *pc = (v8hi*)c;
75 v8hi va, vb, vc;
77 for (i=0; i<N/8; i++){
78 vb = pb[i];
79 vc = pc[i];
80 va = vb + vc;
81 pa[i] = va;
84 The main entry to this pass is vectorize_loops(), in which
85 the vectorizer applies a set of analyses on a given set of loops,
86 followed by the actual vectorization transformation for the loops that
87 had successfully passed the analysis phase.
88 Throughout this pass we make a distinction between two types of
89 data: scalars (which are represented by SSA_NAMES), and memory references
90 ("data-refs"). These two types of data require different handling both
91 during analysis and transformation. The types of data-refs that the
92 vectorizer currently supports are ARRAY_REFS which base is an array DECL
93 (not a pointer), and INDIRECT_REFS through pointers; both array and pointer
94 accesses are required to have a simple (consecutive) access pattern.
96 Analysis phase:
97 ===============
98 The driver for the analysis phase is vect_analyze_loop().
99 It applies a set of analyses, some of which rely on the scalar evolution
100 analyzer (scev) developed by Sebastian Pop.
102 During the analysis phase the vectorizer records some information
103 per stmt in a "stmt_vec_info" struct which is attached to each stmt in the
104 loop, as well as general information about the loop as a whole, which is
105 recorded in a "loop_vec_info" struct attached to each loop.
107 Transformation phase:
108 =====================
109 The loop transformation phase scans all the stmts in the loop, and
110 creates a vector stmt (or a sequence of stmts) for each scalar stmt S in
111 the loop that needs to be vectorized. It inserts the vector code sequence
112 just before the scalar stmt S, and records a pointer to the vector code
113 in STMT_VINFO_VEC_STMT (stmt_info) (stmt_info is the stmt_vec_info struct
114 attached to S). This pointer will be used for the vectorization of following
115 stmts which use the def of stmt S. Stmt S is removed if it writes to memory;
116 otherwise, we rely on dead code elimination for removing it.
118 For example, say stmt S1 was vectorized into stmt VS1:
120 VS1: vb = px[i];
121 S1: b = x[i]; STMT_VINFO_VEC_STMT (stmt_info (S1)) = VS1
122 S2: a = b;
124 To vectorize stmt S2, the vectorizer first finds the stmt that defines
125 the operand 'b' (S1), and gets the relevant vector def 'vb' from the
126 vector stmt VS1 pointed to by STMT_VINFO_VEC_STMT (stmt_info (S1)). The
127 resulting sequence would be:
129 VS1: vb = px[i];
130 S1: b = x[i]; STMT_VINFO_VEC_STMT (stmt_info (S1)) = VS1
131 VS2: va = vb;
132 S2: a = b; STMT_VINFO_VEC_STMT (stmt_info (S2)) = VS2
134 Operands that are not SSA_NAMEs, are data-refs that appear in
135 load/store operations (like 'x[i]' in S1), and are handled differently.
137 Target modeling:
138 =================
139 Currently the only target specific information that is used is the
140 size of the vector (in bytes) - "TARGET_VECTORIZE_UNITS_PER_SIMD_WORD".
141 Targets that can support different sizes of vectors, for now will need
142 to specify one value for "TARGET_VECTORIZE_UNITS_PER_SIMD_WORD". More
143 flexibility will be added in the future.
145 Since we only vectorize operations which vector form can be
146 expressed using existing tree codes, to verify that an operation is
147 supported, the vectorizer checks the relevant optab at the relevant
148 machine_mode (e.g, optab_handler (add_optab, V8HImode)). If
149 the value found is CODE_FOR_nothing, then there's no target support, and
150 we can't vectorize the stmt.
152 For additional information on this project see:
153 http://gcc.gnu.org/projects/tree-ssa/vectorization.html
156 static void vect_estimate_min_profitable_iters (loop_vec_info, int *, int *);
157 static stmt_vec_info vect_is_simple_reduction (loop_vec_info, stmt_vec_info,
158 bool *, bool *);
160 /* Subroutine of vect_determine_vf_for_stmt that handles only one
161 statement. VECTYPE_MAYBE_SET_P is true if STMT_VINFO_VECTYPE
162 may already be set for general statements (not just data refs). */
164 static opt_result
165 vect_determine_vf_for_stmt_1 (vec_info *vinfo, stmt_vec_info stmt_info,
166 bool vectype_maybe_set_p,
167 poly_uint64 *vf)
169 gimple *stmt = stmt_info->stmt;
171 if ((!STMT_VINFO_RELEVANT_P (stmt_info)
172 && !STMT_VINFO_LIVE_P (stmt_info))
173 || gimple_clobber_p (stmt))
175 if (dump_enabled_p ())
176 dump_printf_loc (MSG_NOTE, vect_location, "skip.\n");
177 return opt_result::success ();
180 tree stmt_vectype, nunits_vectype;
181 opt_result res = vect_get_vector_types_for_stmt (vinfo, stmt_info,
182 &stmt_vectype,
183 &nunits_vectype);
184 if (!res)
185 return res;
187 if (stmt_vectype)
189 if (STMT_VINFO_VECTYPE (stmt_info))
190 /* The only case when a vectype had been already set is for stmts
191 that contain a data ref, or for "pattern-stmts" (stmts generated
192 by the vectorizer to represent/replace a certain idiom). */
193 gcc_assert ((STMT_VINFO_DATA_REF (stmt_info)
194 || vectype_maybe_set_p)
195 && STMT_VINFO_VECTYPE (stmt_info) == stmt_vectype);
196 else
197 STMT_VINFO_VECTYPE (stmt_info) = stmt_vectype;
200 if (nunits_vectype)
201 vect_update_max_nunits (vf, nunits_vectype);
203 return opt_result::success ();
206 /* Subroutine of vect_determine_vectorization_factor. Set the vector
207 types of STMT_INFO and all attached pattern statements and update
208 the vectorization factor VF accordingly. Return true on success
209 or false if something prevented vectorization. */
211 static opt_result
212 vect_determine_vf_for_stmt (vec_info *vinfo,
213 stmt_vec_info stmt_info, poly_uint64 *vf)
215 if (dump_enabled_p ())
216 dump_printf_loc (MSG_NOTE, vect_location, "==> examining statement: %G",
217 stmt_info->stmt);
218 opt_result res = vect_determine_vf_for_stmt_1 (vinfo, stmt_info, false, vf);
219 if (!res)
220 return res;
222 if (STMT_VINFO_IN_PATTERN_P (stmt_info)
223 && STMT_VINFO_RELATED_STMT (stmt_info))
225 gimple *pattern_def_seq = STMT_VINFO_PATTERN_DEF_SEQ (stmt_info);
226 stmt_info = STMT_VINFO_RELATED_STMT (stmt_info);
228 /* If a pattern statement has def stmts, analyze them too. */
229 for (gimple_stmt_iterator si = gsi_start (pattern_def_seq);
230 !gsi_end_p (si); gsi_next (&si))
232 stmt_vec_info def_stmt_info = vinfo->lookup_stmt (gsi_stmt (si));
233 if (dump_enabled_p ())
234 dump_printf_loc (MSG_NOTE, vect_location,
235 "==> examining pattern def stmt: %G",
236 def_stmt_info->stmt);
237 res = vect_determine_vf_for_stmt_1 (vinfo, def_stmt_info, true, vf);
238 if (!res)
239 return res;
242 if (dump_enabled_p ())
243 dump_printf_loc (MSG_NOTE, vect_location,
244 "==> examining pattern statement: %G",
245 stmt_info->stmt);
246 res = vect_determine_vf_for_stmt_1 (vinfo, stmt_info, true, vf);
247 if (!res)
248 return res;
251 return opt_result::success ();
254 /* Function vect_determine_vectorization_factor
256 Determine the vectorization factor (VF). VF is the number of data elements
257 that are operated upon in parallel in a single iteration of the vectorized
258 loop. For example, when vectorizing a loop that operates on 4byte elements,
259 on a target with vector size (VS) 16byte, the VF is set to 4, since 4
260 elements can fit in a single vector register.
262 We currently support vectorization of loops in which all types operated upon
263 are of the same size. Therefore this function currently sets VF according to
264 the size of the types operated upon, and fails if there are multiple sizes
265 in the loop.
267 VF is also the factor by which the loop iterations are strip-mined, e.g.:
268 original loop:
269 for (i=0; i<N; i++){
270 a[i] = b[i] + c[i];
273 vectorized loop:
274 for (i=0; i<N; i+=VF){
275 a[i:VF] = b[i:VF] + c[i:VF];
279 static opt_result
280 vect_determine_vectorization_factor (loop_vec_info loop_vinfo)
282 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
283 basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
284 unsigned nbbs = loop->num_nodes;
285 poly_uint64 vectorization_factor = 1;
286 tree scalar_type = NULL_TREE;
287 gphi *phi;
288 tree vectype;
289 stmt_vec_info stmt_info;
290 unsigned i;
292 DUMP_VECT_SCOPE ("vect_determine_vectorization_factor");
294 for (i = 0; i < nbbs; i++)
296 basic_block bb = bbs[i];
298 for (gphi_iterator si = gsi_start_phis (bb); !gsi_end_p (si);
299 gsi_next (&si))
301 phi = si.phi ();
302 stmt_info = loop_vinfo->lookup_stmt (phi);
303 if (dump_enabled_p ())
304 dump_printf_loc (MSG_NOTE, vect_location, "==> examining phi: %G",
305 phi);
307 gcc_assert (stmt_info);
309 if (STMT_VINFO_RELEVANT_P (stmt_info)
310 || STMT_VINFO_LIVE_P (stmt_info))
312 gcc_assert (!STMT_VINFO_VECTYPE (stmt_info));
313 scalar_type = TREE_TYPE (PHI_RESULT (phi));
315 if (dump_enabled_p ())
316 dump_printf_loc (MSG_NOTE, vect_location,
317 "get vectype for scalar type: %T\n",
318 scalar_type);
320 vectype = get_vectype_for_scalar_type (loop_vinfo, scalar_type);
321 if (!vectype)
322 return opt_result::failure_at (phi,
323 "not vectorized: unsupported "
324 "data-type %T\n",
325 scalar_type);
326 STMT_VINFO_VECTYPE (stmt_info) = vectype;
328 if (dump_enabled_p ())
329 dump_printf_loc (MSG_NOTE, vect_location, "vectype: %T\n",
330 vectype);
332 if (dump_enabled_p ())
334 dump_printf_loc (MSG_NOTE, vect_location, "nunits = ");
335 dump_dec (MSG_NOTE, TYPE_VECTOR_SUBPARTS (vectype));
336 dump_printf (MSG_NOTE, "\n");
339 vect_update_max_nunits (&vectorization_factor, vectype);
343 for (gimple_stmt_iterator si = gsi_start_bb (bb); !gsi_end_p (si);
344 gsi_next (&si))
346 if (is_gimple_debug (gsi_stmt (si)))
347 continue;
348 stmt_info = loop_vinfo->lookup_stmt (gsi_stmt (si));
349 opt_result res
350 = vect_determine_vf_for_stmt (loop_vinfo,
351 stmt_info, &vectorization_factor);
352 if (!res)
353 return res;
357 /* TODO: Analyze cost. Decide if worth while to vectorize. */
358 if (dump_enabled_p ())
360 dump_printf_loc (MSG_NOTE, vect_location, "vectorization factor = ");
361 dump_dec (MSG_NOTE, vectorization_factor);
362 dump_printf (MSG_NOTE, "\n");
365 if (known_le (vectorization_factor, 1U))
366 return opt_result::failure_at (vect_location,
367 "not vectorized: unsupported data-type\n");
368 LOOP_VINFO_VECT_FACTOR (loop_vinfo) = vectorization_factor;
369 return opt_result::success ();
373 /* Function vect_is_simple_iv_evolution.
375 FORNOW: A simple evolution of an induction variables in the loop is
376 considered a polynomial evolution. */
378 static bool
379 vect_is_simple_iv_evolution (unsigned loop_nb, tree access_fn, tree * init,
380 tree * step)
382 tree init_expr;
383 tree step_expr;
384 tree evolution_part = evolution_part_in_loop_num (access_fn, loop_nb);
385 basic_block bb;
387 /* When there is no evolution in this loop, the evolution function
388 is not "simple". */
389 if (evolution_part == NULL_TREE)
390 return false;
392 /* When the evolution is a polynomial of degree >= 2
393 the evolution function is not "simple". */
394 if (tree_is_chrec (evolution_part))
395 return false;
397 step_expr = evolution_part;
398 init_expr = unshare_expr (initial_condition_in_loop_num (access_fn, loop_nb));
400 if (dump_enabled_p ())
401 dump_printf_loc (MSG_NOTE, vect_location, "step: %T, init: %T\n",
402 step_expr, init_expr);
404 *init = init_expr;
405 *step = step_expr;
407 if (TREE_CODE (step_expr) != INTEGER_CST
408 && (TREE_CODE (step_expr) != SSA_NAME
409 || ((bb = gimple_bb (SSA_NAME_DEF_STMT (step_expr)))
410 && flow_bb_inside_loop_p (get_loop (cfun, loop_nb), bb))
411 || (!INTEGRAL_TYPE_P (TREE_TYPE (step_expr))
412 && (!SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr))
413 || !flag_associative_math)))
414 && (TREE_CODE (step_expr) != REAL_CST
415 || !flag_associative_math))
417 if (dump_enabled_p ())
418 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
419 "step unknown.\n");
420 return false;
423 return true;
426 /* Return true if PHI, described by STMT_INFO, is the inner PHI in
427 what we are assuming is a double reduction. For example, given
428 a structure like this:
430 outer1:
431 x_1 = PHI <x_4(outer2), ...>;
434 inner:
435 x_2 = PHI <x_1(outer1), ...>;
437 x_3 = ...;
440 outer2:
441 x_4 = PHI <x_3(inner)>;
444 outer loop analysis would treat x_1 as a double reduction phi and
445 this function would then return true for x_2. */
447 static bool
448 vect_inner_phi_in_double_reduction_p (loop_vec_info loop_vinfo, gphi *phi)
450 use_operand_p use_p;
451 ssa_op_iter op_iter;
452 FOR_EACH_PHI_ARG (use_p, phi, op_iter, SSA_OP_USE)
453 if (stmt_vec_info def_info = loop_vinfo->lookup_def (USE_FROM_PTR (use_p)))
454 if (STMT_VINFO_DEF_TYPE (def_info) == vect_double_reduction_def)
455 return true;
456 return false;
459 /* Function vect_analyze_scalar_cycles_1.
461 Examine the cross iteration def-use cycles of scalar variables
462 in LOOP. LOOP_VINFO represents the loop that is now being
463 considered for vectorization (can be LOOP, or an outer-loop
464 enclosing LOOP). */
466 static void
467 vect_analyze_scalar_cycles_1 (loop_vec_info loop_vinfo, class loop *loop)
469 basic_block bb = loop->header;
470 tree init, step;
471 auto_vec<stmt_vec_info, 64> worklist;
472 gphi_iterator gsi;
473 bool double_reduc, reduc_chain;
475 DUMP_VECT_SCOPE ("vect_analyze_scalar_cycles");
477 /* First - identify all inductions. Reduction detection assumes that all the
478 inductions have been identified, therefore, this order must not be
479 changed. */
480 for (gsi = gsi_start_phis (bb); !gsi_end_p (gsi); gsi_next (&gsi))
482 gphi *phi = gsi.phi ();
483 tree access_fn = NULL;
484 tree def = PHI_RESULT (phi);
485 stmt_vec_info stmt_vinfo = loop_vinfo->lookup_stmt (phi);
487 if (dump_enabled_p ())
488 dump_printf_loc (MSG_NOTE, vect_location, "Analyze phi: %G", phi);
490 /* Skip virtual phi's. The data dependences that are associated with
491 virtual defs/uses (i.e., memory accesses) are analyzed elsewhere. */
492 if (virtual_operand_p (def))
493 continue;
495 STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_unknown_def_type;
497 /* Analyze the evolution function. */
498 access_fn = analyze_scalar_evolution (loop, def);
499 if (access_fn)
501 STRIP_NOPS (access_fn);
502 if (dump_enabled_p ())
503 dump_printf_loc (MSG_NOTE, vect_location,
504 "Access function of PHI: %T\n", access_fn);
505 STMT_VINFO_LOOP_PHI_EVOLUTION_BASE_UNCHANGED (stmt_vinfo)
506 = initial_condition_in_loop_num (access_fn, loop->num);
507 STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_vinfo)
508 = evolution_part_in_loop_num (access_fn, loop->num);
511 if (!access_fn
512 || vect_inner_phi_in_double_reduction_p (loop_vinfo, phi)
513 || !vect_is_simple_iv_evolution (loop->num, access_fn, &init, &step)
514 || (LOOP_VINFO_LOOP (loop_vinfo) != loop
515 && TREE_CODE (step) != INTEGER_CST))
517 worklist.safe_push (stmt_vinfo);
518 continue;
521 gcc_assert (STMT_VINFO_LOOP_PHI_EVOLUTION_BASE_UNCHANGED (stmt_vinfo)
522 != NULL_TREE);
523 gcc_assert (STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_vinfo) != NULL_TREE);
525 if (dump_enabled_p ())
526 dump_printf_loc (MSG_NOTE, vect_location, "Detected induction.\n");
527 STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_induction_def;
531 /* Second - identify all reductions and nested cycles. */
532 while (worklist.length () > 0)
534 stmt_vec_info stmt_vinfo = worklist.pop ();
535 gphi *phi = as_a <gphi *> (stmt_vinfo->stmt);
536 tree def = PHI_RESULT (phi);
538 if (dump_enabled_p ())
539 dump_printf_loc (MSG_NOTE, vect_location, "Analyze phi: %G", phi);
541 gcc_assert (!virtual_operand_p (def)
542 && STMT_VINFO_DEF_TYPE (stmt_vinfo) == vect_unknown_def_type);
544 stmt_vec_info reduc_stmt_info
545 = vect_is_simple_reduction (loop_vinfo, stmt_vinfo, &double_reduc,
546 &reduc_chain);
547 if (reduc_stmt_info)
549 STMT_VINFO_REDUC_DEF (stmt_vinfo) = reduc_stmt_info;
550 STMT_VINFO_REDUC_DEF (reduc_stmt_info) = stmt_vinfo;
551 if (double_reduc)
553 if (dump_enabled_p ())
554 dump_printf_loc (MSG_NOTE, vect_location,
555 "Detected double reduction.\n");
557 STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_double_reduction_def;
558 STMT_VINFO_DEF_TYPE (reduc_stmt_info) = vect_double_reduction_def;
560 else
562 if (loop != LOOP_VINFO_LOOP (loop_vinfo))
564 if (dump_enabled_p ())
565 dump_printf_loc (MSG_NOTE, vect_location,
566 "Detected vectorizable nested cycle.\n");
568 STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_nested_cycle;
570 else
572 if (dump_enabled_p ())
573 dump_printf_loc (MSG_NOTE, vect_location,
574 "Detected reduction.\n");
576 STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_reduction_def;
577 STMT_VINFO_DEF_TYPE (reduc_stmt_info) = vect_reduction_def;
578 /* Store the reduction cycles for possible vectorization in
579 loop-aware SLP if it was not detected as reduction
580 chain. */
581 if (! reduc_chain)
582 LOOP_VINFO_REDUCTIONS (loop_vinfo).safe_push
583 (reduc_stmt_info);
587 else
588 if (dump_enabled_p ())
589 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
590 "Unknown def-use cycle pattern.\n");
595 /* Function vect_analyze_scalar_cycles.
597 Examine the cross iteration def-use cycles of scalar variables, by
598 analyzing the loop-header PHIs of scalar variables. Classify each
599 cycle as one of the following: invariant, induction, reduction, unknown.
600 We do that for the loop represented by LOOP_VINFO, and also to its
601 inner-loop, if exists.
602 Examples for scalar cycles:
604 Example1: reduction:
606 loop1:
607 for (i=0; i<N; i++)
608 sum += a[i];
610 Example2: induction:
612 loop2:
613 for (i=0; i<N; i++)
614 a[i] = i; */
616 static void
617 vect_analyze_scalar_cycles (loop_vec_info loop_vinfo)
619 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
621 vect_analyze_scalar_cycles_1 (loop_vinfo, loop);
623 /* When vectorizing an outer-loop, the inner-loop is executed sequentially.
624 Reductions in such inner-loop therefore have different properties than
625 the reductions in the nest that gets vectorized:
626 1. When vectorized, they are executed in the same order as in the original
627 scalar loop, so we can't change the order of computation when
628 vectorizing them.
629 2. FIXME: Inner-loop reductions can be used in the inner-loop, so the
630 current checks are too strict. */
632 if (loop->inner)
633 vect_analyze_scalar_cycles_1 (loop_vinfo, loop->inner);
636 /* Transfer group and reduction information from STMT_INFO to its
637 pattern stmt. */
639 static void
640 vect_fixup_reduc_chain (stmt_vec_info stmt_info)
642 stmt_vec_info firstp = STMT_VINFO_RELATED_STMT (stmt_info);
643 stmt_vec_info stmtp;
644 gcc_assert (!REDUC_GROUP_FIRST_ELEMENT (firstp)
645 && REDUC_GROUP_FIRST_ELEMENT (stmt_info));
646 REDUC_GROUP_SIZE (firstp) = REDUC_GROUP_SIZE (stmt_info);
649 stmtp = STMT_VINFO_RELATED_STMT (stmt_info);
650 gcc_checking_assert (STMT_VINFO_DEF_TYPE (stmtp)
651 == STMT_VINFO_DEF_TYPE (stmt_info));
652 REDUC_GROUP_FIRST_ELEMENT (stmtp) = firstp;
653 stmt_info = REDUC_GROUP_NEXT_ELEMENT (stmt_info);
654 if (stmt_info)
655 REDUC_GROUP_NEXT_ELEMENT (stmtp)
656 = STMT_VINFO_RELATED_STMT (stmt_info);
658 while (stmt_info);
661 /* Fixup scalar cycles that now have their stmts detected as patterns. */
663 static void
664 vect_fixup_scalar_cycles_with_patterns (loop_vec_info loop_vinfo)
666 stmt_vec_info first;
667 unsigned i;
669 FOR_EACH_VEC_ELT (LOOP_VINFO_REDUCTION_CHAINS (loop_vinfo), i, first)
671 stmt_vec_info next = REDUC_GROUP_NEXT_ELEMENT (first);
672 while (next)
674 if ((STMT_VINFO_IN_PATTERN_P (next)
675 != STMT_VINFO_IN_PATTERN_P (first))
676 || STMT_VINFO_REDUC_IDX (vect_stmt_to_vectorize (next)) == -1)
677 break;
678 next = REDUC_GROUP_NEXT_ELEMENT (next);
680 /* If all reduction chain members are well-formed patterns adjust
681 the group to group the pattern stmts instead. */
682 if (! next
683 && STMT_VINFO_REDUC_IDX (vect_stmt_to_vectorize (first)) != -1)
685 if (STMT_VINFO_IN_PATTERN_P (first))
687 vect_fixup_reduc_chain (first);
688 LOOP_VINFO_REDUCTION_CHAINS (loop_vinfo)[i]
689 = STMT_VINFO_RELATED_STMT (first);
692 /* If not all stmt in the chain are patterns or if we failed
693 to update STMT_VINFO_REDUC_IDX dissolve the chain and handle
694 it as regular reduction instead. */
695 else
697 stmt_vec_info vinfo = first;
698 stmt_vec_info last = NULL;
699 while (vinfo)
701 next = REDUC_GROUP_NEXT_ELEMENT (vinfo);
702 REDUC_GROUP_FIRST_ELEMENT (vinfo) = NULL;
703 REDUC_GROUP_NEXT_ELEMENT (vinfo) = NULL;
704 last = vinfo;
705 vinfo = next;
707 STMT_VINFO_DEF_TYPE (vect_stmt_to_vectorize (first))
708 = vect_internal_def;
709 loop_vinfo->reductions.safe_push (vect_stmt_to_vectorize (last));
710 LOOP_VINFO_REDUCTION_CHAINS (loop_vinfo).unordered_remove (i);
711 --i;
716 /* Function vect_get_loop_niters.
718 Determine how many iterations the loop is executed and place it
719 in NUMBER_OF_ITERATIONS. Place the number of latch iterations
720 in NUMBER_OF_ITERATIONSM1. Place the condition under which the
721 niter information holds in ASSUMPTIONS.
723 Return the loop exit condition. */
726 static gcond *
727 vect_get_loop_niters (class loop *loop, tree *assumptions,
728 tree *number_of_iterations, tree *number_of_iterationsm1)
730 edge exit = single_exit (loop);
731 class tree_niter_desc niter_desc;
732 tree niter_assumptions, niter, may_be_zero;
733 gcond *cond = get_loop_exit_condition (loop);
735 *assumptions = boolean_true_node;
736 *number_of_iterationsm1 = chrec_dont_know;
737 *number_of_iterations = chrec_dont_know;
738 DUMP_VECT_SCOPE ("get_loop_niters");
740 if (!exit)
741 return cond;
743 may_be_zero = NULL_TREE;
744 if (!number_of_iterations_exit_assumptions (loop, exit, &niter_desc, NULL)
745 || chrec_contains_undetermined (niter_desc.niter))
746 return cond;
748 niter_assumptions = niter_desc.assumptions;
749 may_be_zero = niter_desc.may_be_zero;
750 niter = niter_desc.niter;
752 if (may_be_zero && integer_zerop (may_be_zero))
753 may_be_zero = NULL_TREE;
755 if (may_be_zero)
757 if (COMPARISON_CLASS_P (may_be_zero))
759 /* Try to combine may_be_zero with assumptions, this can simplify
760 computation of niter expression. */
761 if (niter_assumptions && !integer_nonzerop (niter_assumptions))
762 niter_assumptions = fold_build2 (TRUTH_AND_EXPR, boolean_type_node,
763 niter_assumptions,
764 fold_build1 (TRUTH_NOT_EXPR,
765 boolean_type_node,
766 may_be_zero));
767 else
768 niter = fold_build3 (COND_EXPR, TREE_TYPE (niter), may_be_zero,
769 build_int_cst (TREE_TYPE (niter), 0),
770 rewrite_to_non_trapping_overflow (niter));
772 may_be_zero = NULL_TREE;
774 else if (integer_nonzerop (may_be_zero))
776 *number_of_iterationsm1 = build_int_cst (TREE_TYPE (niter), 0);
777 *number_of_iterations = build_int_cst (TREE_TYPE (niter), 1);
778 return cond;
780 else
781 return cond;
784 *assumptions = niter_assumptions;
785 *number_of_iterationsm1 = niter;
787 /* We want the number of loop header executions which is the number
788 of latch executions plus one.
789 ??? For UINT_MAX latch executions this number overflows to zero
790 for loops like do { n++; } while (n != 0); */
791 if (niter && !chrec_contains_undetermined (niter))
792 niter = fold_build2 (PLUS_EXPR, TREE_TYPE (niter), unshare_expr (niter),
793 build_int_cst (TREE_TYPE (niter), 1));
794 *number_of_iterations = niter;
796 return cond;
799 /* Function bb_in_loop_p
801 Used as predicate for dfs order traversal of the loop bbs. */
803 static bool
804 bb_in_loop_p (const_basic_block bb, const void *data)
806 const class loop *const loop = (const class loop *)data;
807 if (flow_bb_inside_loop_p (loop, bb))
808 return true;
809 return false;
813 /* Create and initialize a new loop_vec_info struct for LOOP_IN, as well as
814 stmt_vec_info structs for all the stmts in LOOP_IN. */
816 _loop_vec_info::_loop_vec_info (class loop *loop_in, vec_info_shared *shared)
817 : vec_info (vec_info::loop, shared),
818 loop (loop_in),
819 bbs (XCNEWVEC (basic_block, loop->num_nodes)),
820 num_itersm1 (NULL_TREE),
821 num_iters (NULL_TREE),
822 num_iters_unchanged (NULL_TREE),
823 num_iters_assumptions (NULL_TREE),
824 th (0),
825 versioning_threshold (0),
826 vectorization_factor (0),
827 main_loop_edge (nullptr),
828 skip_main_loop_edge (nullptr),
829 skip_this_loop_edge (nullptr),
830 reusable_accumulators (),
831 max_vectorization_factor (0),
832 mask_skip_niters (NULL_TREE),
833 rgroup_compare_type (NULL_TREE),
834 simd_if_cond (NULL_TREE),
835 unaligned_dr (NULL),
836 peeling_for_alignment (0),
837 ptr_mask (0),
838 ivexpr_map (NULL),
839 scan_map (NULL),
840 slp_unrolling_factor (1),
841 single_scalar_iteration_cost (0),
842 vec_outside_cost (0),
843 vec_inside_cost (0),
844 inner_loop_cost_factor (param_vect_inner_loop_cost_factor),
845 vectorizable (false),
846 can_use_partial_vectors_p (param_vect_partial_vector_usage != 0),
847 using_partial_vectors_p (false),
848 epil_using_partial_vectors_p (false),
849 peeling_for_gaps (false),
850 peeling_for_niter (false),
851 no_data_dependencies (false),
852 has_mask_store (false),
853 scalar_loop_scaling (profile_probability::uninitialized ()),
854 scalar_loop (NULL),
855 orig_loop_info (NULL)
857 /* CHECKME: We want to visit all BBs before their successors (except for
858 latch blocks, for which this assertion wouldn't hold). In the simple
859 case of the loop forms we allow, a dfs order of the BBs would the same
860 as reversed postorder traversal, so we are safe. */
862 unsigned int nbbs = dfs_enumerate_from (loop->header, 0, bb_in_loop_p,
863 bbs, loop->num_nodes, loop);
864 gcc_assert (nbbs == loop->num_nodes);
866 for (unsigned int i = 0; i < nbbs; i++)
868 basic_block bb = bbs[i];
869 gimple_stmt_iterator si;
871 for (si = gsi_start_phis (bb); !gsi_end_p (si); gsi_next (&si))
873 gimple *phi = gsi_stmt (si);
874 gimple_set_uid (phi, 0);
875 add_stmt (phi);
878 for (si = gsi_start_bb (bb); !gsi_end_p (si); gsi_next (&si))
880 gimple *stmt = gsi_stmt (si);
881 gimple_set_uid (stmt, 0);
882 if (is_gimple_debug (stmt))
883 continue;
884 add_stmt (stmt);
885 /* If .GOMP_SIMD_LANE call for the current loop has 3 arguments, the
886 third argument is the #pragma omp simd if (x) condition, when 0,
887 loop shouldn't be vectorized, when non-zero constant, it should
888 be vectorized normally, otherwise versioned with vectorized loop
889 done if the condition is non-zero at runtime. */
890 if (loop_in->simduid
891 && is_gimple_call (stmt)
892 && gimple_call_internal_p (stmt)
893 && gimple_call_internal_fn (stmt) == IFN_GOMP_SIMD_LANE
894 && gimple_call_num_args (stmt) >= 3
895 && TREE_CODE (gimple_call_arg (stmt, 0)) == SSA_NAME
896 && (loop_in->simduid
897 == SSA_NAME_VAR (gimple_call_arg (stmt, 0))))
899 tree arg = gimple_call_arg (stmt, 2);
900 if (integer_zerop (arg) || TREE_CODE (arg) == SSA_NAME)
901 simd_if_cond = arg;
902 else
903 gcc_assert (integer_nonzerop (arg));
908 epilogue_vinfos.create (6);
911 /* Free all levels of rgroup CONTROLS. */
913 void
914 release_vec_loop_controls (vec<rgroup_controls> *controls)
916 rgroup_controls *rgc;
917 unsigned int i;
918 FOR_EACH_VEC_ELT (*controls, i, rgc)
919 rgc->controls.release ();
920 controls->release ();
923 /* Free all memory used by the _loop_vec_info, as well as all the
924 stmt_vec_info structs of all the stmts in the loop. */
926 _loop_vec_info::~_loop_vec_info ()
928 free (bbs);
930 release_vec_loop_controls (&masks);
931 release_vec_loop_controls (&lens);
932 delete ivexpr_map;
933 delete scan_map;
934 epilogue_vinfos.release ();
936 /* When we release an epiloge vinfo that we do not intend to use
937 avoid clearing AUX of the main loop which should continue to
938 point to the main loop vinfo since otherwise we'll leak that. */
939 if (loop->aux == this)
940 loop->aux = NULL;
943 /* Return an invariant or register for EXPR and emit necessary
944 computations in the LOOP_VINFO loop preheader. */
946 tree
947 cse_and_gimplify_to_preheader (loop_vec_info loop_vinfo, tree expr)
949 if (is_gimple_reg (expr)
950 || is_gimple_min_invariant (expr))
951 return expr;
953 if (! loop_vinfo->ivexpr_map)
954 loop_vinfo->ivexpr_map = new hash_map<tree_operand_hash, tree>;
955 tree &cached = loop_vinfo->ivexpr_map->get_or_insert (expr);
956 if (! cached)
958 gimple_seq stmts = NULL;
959 cached = force_gimple_operand (unshare_expr (expr),
960 &stmts, true, NULL_TREE);
961 if (stmts)
963 edge e = loop_preheader_edge (LOOP_VINFO_LOOP (loop_vinfo));
964 gsi_insert_seq_on_edge_immediate (e, stmts);
967 return cached;
970 /* Return true if we can use CMP_TYPE as the comparison type to produce
971 all masks required to mask LOOP_VINFO. */
973 static bool
974 can_produce_all_loop_masks_p (loop_vec_info loop_vinfo, tree cmp_type)
976 rgroup_controls *rgm;
977 unsigned int i;
978 FOR_EACH_VEC_ELT (LOOP_VINFO_MASKS (loop_vinfo), i, rgm)
979 if (rgm->type != NULL_TREE
980 && !direct_internal_fn_supported_p (IFN_WHILE_ULT,
981 cmp_type, rgm->type,
982 OPTIMIZE_FOR_SPEED))
983 return false;
984 return true;
987 /* Calculate the maximum number of scalars per iteration for every
988 rgroup in LOOP_VINFO. */
990 static unsigned int
991 vect_get_max_nscalars_per_iter (loop_vec_info loop_vinfo)
993 unsigned int res = 1;
994 unsigned int i;
995 rgroup_controls *rgm;
996 FOR_EACH_VEC_ELT (LOOP_VINFO_MASKS (loop_vinfo), i, rgm)
997 res = MAX (res, rgm->max_nscalars_per_iter);
998 return res;
1001 /* Calculate the minimum precision necessary to represent:
1003 MAX_NITERS * FACTOR
1005 as an unsigned integer, where MAX_NITERS is the maximum number of
1006 loop header iterations for the original scalar form of LOOP_VINFO. */
1008 static unsigned
1009 vect_min_prec_for_max_niters (loop_vec_info loop_vinfo, unsigned int factor)
1011 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1013 /* Get the maximum number of iterations that is representable
1014 in the counter type. */
1015 tree ni_type = TREE_TYPE (LOOP_VINFO_NITERSM1 (loop_vinfo));
1016 widest_int max_ni = wi::to_widest (TYPE_MAX_VALUE (ni_type)) + 1;
1018 /* Get a more refined estimate for the number of iterations. */
1019 widest_int max_back_edges;
1020 if (max_loop_iterations (loop, &max_back_edges))
1021 max_ni = wi::smin (max_ni, max_back_edges + 1);
1023 /* Work out how many bits we need to represent the limit. */
1024 return wi::min_precision (max_ni * factor, UNSIGNED);
1027 /* True if the loop needs peeling or partial vectors when vectorized. */
1029 static bool
1030 vect_need_peeling_or_partial_vectors_p (loop_vec_info loop_vinfo)
1032 unsigned HOST_WIDE_INT const_vf;
1033 HOST_WIDE_INT max_niter
1034 = likely_max_stmt_executions_int (LOOP_VINFO_LOOP (loop_vinfo));
1036 unsigned th = LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo);
1037 if (!th && LOOP_VINFO_ORIG_LOOP_INFO (loop_vinfo))
1038 th = LOOP_VINFO_COST_MODEL_THRESHOLD (LOOP_VINFO_ORIG_LOOP_INFO
1039 (loop_vinfo));
1041 if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
1042 && LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) >= 0)
1044 /* Work out the (constant) number of iterations that need to be
1045 peeled for reasons other than niters. */
1046 unsigned int peel_niter = LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo);
1047 if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo))
1048 peel_niter += 1;
1049 if (!multiple_p (LOOP_VINFO_INT_NITERS (loop_vinfo) - peel_niter,
1050 LOOP_VINFO_VECT_FACTOR (loop_vinfo)))
1051 return true;
1053 else if (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo)
1054 /* ??? When peeling for gaps but not alignment, we could
1055 try to check whether the (variable) niters is known to be
1056 VF * N + 1. That's something of a niche case though. */
1057 || LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo)
1058 || !LOOP_VINFO_VECT_FACTOR (loop_vinfo).is_constant (&const_vf)
1059 || ((tree_ctz (LOOP_VINFO_NITERS (loop_vinfo))
1060 < (unsigned) exact_log2 (const_vf))
1061 /* In case of versioning, check if the maximum number of
1062 iterations is greater than th. If they are identical,
1063 the epilogue is unnecessary. */
1064 && (!LOOP_REQUIRES_VERSIONING (loop_vinfo)
1065 || ((unsigned HOST_WIDE_INT) max_niter
1066 > (th / const_vf) * const_vf))))
1067 return true;
1069 return false;
1072 /* Each statement in LOOP_VINFO can be masked where necessary. Check
1073 whether we can actually generate the masks required. Return true if so,
1074 storing the type of the scalar IV in LOOP_VINFO_RGROUP_COMPARE_TYPE. */
1076 static bool
1077 vect_verify_full_masking (loop_vec_info loop_vinfo)
1079 unsigned int min_ni_width;
1080 unsigned int max_nscalars_per_iter
1081 = vect_get_max_nscalars_per_iter (loop_vinfo);
1083 /* Use a normal loop if there are no statements that need masking.
1084 This only happens in rare degenerate cases: it means that the loop
1085 has no loads, no stores, and no live-out values. */
1086 if (LOOP_VINFO_MASKS (loop_vinfo).is_empty ())
1087 return false;
1089 /* Work out how many bits we need to represent the limit. */
1090 min_ni_width
1091 = vect_min_prec_for_max_niters (loop_vinfo, max_nscalars_per_iter);
1093 /* Find a scalar mode for which WHILE_ULT is supported. */
1094 opt_scalar_int_mode cmp_mode_iter;
1095 tree cmp_type = NULL_TREE;
1096 tree iv_type = NULL_TREE;
1097 widest_int iv_limit = vect_iv_limit_for_partial_vectors (loop_vinfo);
1098 unsigned int iv_precision = UINT_MAX;
1100 if (iv_limit != -1)
1101 iv_precision = wi::min_precision (iv_limit * max_nscalars_per_iter,
1102 UNSIGNED);
1104 FOR_EACH_MODE_IN_CLASS (cmp_mode_iter, MODE_INT)
1106 unsigned int cmp_bits = GET_MODE_BITSIZE (cmp_mode_iter.require ());
1107 if (cmp_bits >= min_ni_width
1108 && targetm.scalar_mode_supported_p (cmp_mode_iter.require ()))
1110 tree this_type = build_nonstandard_integer_type (cmp_bits, true);
1111 if (this_type
1112 && can_produce_all_loop_masks_p (loop_vinfo, this_type))
1114 /* Although we could stop as soon as we find a valid mode,
1115 there are at least two reasons why that's not always the
1116 best choice:
1118 - An IV that's Pmode or wider is more likely to be reusable
1119 in address calculations than an IV that's narrower than
1120 Pmode.
1122 - Doing the comparison in IV_PRECISION or wider allows
1123 a natural 0-based IV, whereas using a narrower comparison
1124 type requires mitigations against wrap-around.
1126 Conversely, if the IV limit is variable, doing the comparison
1127 in a wider type than the original type can introduce
1128 unnecessary extensions, so picking the widest valid mode
1129 is not always a good choice either.
1131 Here we prefer the first IV type that's Pmode or wider,
1132 and the first comparison type that's IV_PRECISION or wider.
1133 (The comparison type must be no wider than the IV type,
1134 to avoid extensions in the vector loop.)
1136 ??? We might want to try continuing beyond Pmode for ILP32
1137 targets if CMP_BITS < IV_PRECISION. */
1138 iv_type = this_type;
1139 if (!cmp_type || iv_precision > TYPE_PRECISION (cmp_type))
1140 cmp_type = this_type;
1141 if (cmp_bits >= GET_MODE_BITSIZE (Pmode))
1142 break;
1147 if (!cmp_type)
1148 return false;
1150 LOOP_VINFO_RGROUP_COMPARE_TYPE (loop_vinfo) = cmp_type;
1151 LOOP_VINFO_RGROUP_IV_TYPE (loop_vinfo) = iv_type;
1152 return true;
1155 /* Check whether we can use vector access with length based on precison
1156 comparison. So far, to keep it simple, we only allow the case that the
1157 precision of the target supported length is larger than the precision
1158 required by loop niters. */
1160 static bool
1161 vect_verify_loop_lens (loop_vec_info loop_vinfo)
1163 if (LOOP_VINFO_LENS (loop_vinfo).is_empty ())
1164 return false;
1166 unsigned int max_nitems_per_iter = 1;
1167 unsigned int i;
1168 rgroup_controls *rgl;
1169 /* Find the maximum number of items per iteration for every rgroup. */
1170 FOR_EACH_VEC_ELT (LOOP_VINFO_LENS (loop_vinfo), i, rgl)
1172 unsigned nitems_per_iter = rgl->max_nscalars_per_iter * rgl->factor;
1173 max_nitems_per_iter = MAX (max_nitems_per_iter, nitems_per_iter);
1176 /* Work out how many bits we need to represent the length limit. */
1177 unsigned int min_ni_prec
1178 = vect_min_prec_for_max_niters (loop_vinfo, max_nitems_per_iter);
1180 /* Now use the maximum of below precisions for one suitable IV type:
1181 - the IV's natural precision
1182 - the precision needed to hold: the maximum number of scalar
1183 iterations multiplied by the scale factor (min_ni_prec above)
1184 - the Pmode precision
1186 If min_ni_prec is less than the precision of the current niters,
1187 we perfer to still use the niters type. Prefer to use Pmode and
1188 wider IV to avoid narrow conversions. */
1190 unsigned int ni_prec
1191 = TYPE_PRECISION (TREE_TYPE (LOOP_VINFO_NITERS (loop_vinfo)));
1192 min_ni_prec = MAX (min_ni_prec, ni_prec);
1193 min_ni_prec = MAX (min_ni_prec, GET_MODE_BITSIZE (Pmode));
1195 tree iv_type = NULL_TREE;
1196 opt_scalar_int_mode tmode_iter;
1197 FOR_EACH_MODE_IN_CLASS (tmode_iter, MODE_INT)
1199 scalar_mode tmode = tmode_iter.require ();
1200 unsigned int tbits = GET_MODE_BITSIZE (tmode);
1202 /* ??? Do we really want to construct one IV whose precision exceeds
1203 BITS_PER_WORD? */
1204 if (tbits > BITS_PER_WORD)
1205 break;
1207 /* Find the first available standard integral type. */
1208 if (tbits >= min_ni_prec && targetm.scalar_mode_supported_p (tmode))
1210 iv_type = build_nonstandard_integer_type (tbits, true);
1211 break;
1215 if (!iv_type)
1217 if (dump_enabled_p ())
1218 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1219 "can't vectorize with length-based partial vectors"
1220 " because there is no suitable iv type.\n");
1221 return false;
1224 LOOP_VINFO_RGROUP_COMPARE_TYPE (loop_vinfo) = iv_type;
1225 LOOP_VINFO_RGROUP_IV_TYPE (loop_vinfo) = iv_type;
1227 return true;
1230 /* Calculate the cost of one scalar iteration of the loop. */
1231 static void
1232 vect_compute_single_scalar_iteration_cost (loop_vec_info loop_vinfo)
1234 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1235 basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
1236 int nbbs = loop->num_nodes, factor;
1237 int innerloop_iters, i;
1239 DUMP_VECT_SCOPE ("vect_compute_single_scalar_iteration_cost");
1241 /* Gather costs for statements in the scalar loop. */
1243 /* FORNOW. */
1244 innerloop_iters = 1;
1245 if (loop->inner)
1246 innerloop_iters = LOOP_VINFO_INNER_LOOP_COST_FACTOR (loop_vinfo);
1248 for (i = 0; i < nbbs; i++)
1250 gimple_stmt_iterator si;
1251 basic_block bb = bbs[i];
1253 if (bb->loop_father == loop->inner)
1254 factor = innerloop_iters;
1255 else
1256 factor = 1;
1258 for (si = gsi_start_bb (bb); !gsi_end_p (si); gsi_next (&si))
1260 gimple *stmt = gsi_stmt (si);
1261 stmt_vec_info stmt_info = loop_vinfo->lookup_stmt (stmt);
1263 if (!is_gimple_assign (stmt) && !is_gimple_call (stmt))
1264 continue;
1266 /* Skip stmts that are not vectorized inside the loop. */
1267 stmt_vec_info vstmt_info = vect_stmt_to_vectorize (stmt_info);
1268 if (!STMT_VINFO_RELEVANT_P (vstmt_info)
1269 && (!STMT_VINFO_LIVE_P (vstmt_info)
1270 || !VECTORIZABLE_CYCLE_DEF
1271 (STMT_VINFO_DEF_TYPE (vstmt_info))))
1272 continue;
1274 vect_cost_for_stmt kind;
1275 if (STMT_VINFO_DATA_REF (stmt_info))
1277 if (DR_IS_READ (STMT_VINFO_DATA_REF (stmt_info)))
1278 kind = scalar_load;
1279 else
1280 kind = scalar_store;
1282 else if (vect_nop_conversion_p (stmt_info))
1283 continue;
1284 else
1285 kind = scalar_stmt;
1287 /* We are using vect_prologue here to avoid scaling twice
1288 by the inner loop factor. */
1289 record_stmt_cost (&LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo),
1290 factor, kind, stmt_info, 0, vect_prologue);
1294 /* Now accumulate cost. */
1295 vector_costs *target_cost_data = init_cost (loop_vinfo, true);
1296 stmt_info_for_cost *si;
1297 int j;
1298 FOR_EACH_VEC_ELT (LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo),
1299 j, si)
1300 (void) add_stmt_cost (target_cost_data, si->count,
1301 si->kind, si->stmt_info, si->vectype,
1302 si->misalign, si->where);
1303 unsigned prologue_cost = 0, body_cost = 0, epilogue_cost = 0;
1304 finish_cost (target_cost_data, &prologue_cost, &body_cost,
1305 &epilogue_cost);
1306 delete target_cost_data;
1307 LOOP_VINFO_SINGLE_SCALAR_ITERATION_COST (loop_vinfo)
1308 = prologue_cost + body_cost + epilogue_cost;
1312 /* Function vect_analyze_loop_form.
1314 Verify that certain CFG restrictions hold, including:
1315 - the loop has a pre-header
1316 - the loop has a single entry and exit
1317 - the loop exit condition is simple enough
1318 - the number of iterations can be analyzed, i.e, a countable loop. The
1319 niter could be analyzed under some assumptions. */
1321 opt_result
1322 vect_analyze_loop_form (class loop *loop, vect_loop_form_info *info)
1324 DUMP_VECT_SCOPE ("vect_analyze_loop_form");
1326 /* Different restrictions apply when we are considering an inner-most loop,
1327 vs. an outer (nested) loop.
1328 (FORNOW. May want to relax some of these restrictions in the future). */
1330 info->inner_loop_cond = NULL;
1331 if (!loop->inner)
1333 /* Inner-most loop. We currently require that the number of BBs is
1334 exactly 2 (the header and latch). Vectorizable inner-most loops
1335 look like this:
1337 (pre-header)
1339 header <--------+
1340 | | |
1341 | +--> latch --+
1343 (exit-bb) */
1345 if (loop->num_nodes != 2)
1346 return opt_result::failure_at (vect_location,
1347 "not vectorized:"
1348 " control flow in loop.\n");
1350 if (empty_block_p (loop->header))
1351 return opt_result::failure_at (vect_location,
1352 "not vectorized: empty loop.\n");
1354 else
1356 class loop *innerloop = loop->inner;
1357 edge entryedge;
1359 /* Nested loop. We currently require that the loop is doubly-nested,
1360 contains a single inner loop, and the number of BBs is exactly 5.
1361 Vectorizable outer-loops look like this:
1363 (pre-header)
1365 header <---+
1367 inner-loop |
1369 tail ------+
1371 (exit-bb)
1373 The inner-loop has the properties expected of inner-most loops
1374 as described above. */
1376 if ((loop->inner)->inner || (loop->inner)->next)
1377 return opt_result::failure_at (vect_location,
1378 "not vectorized:"
1379 " multiple nested loops.\n");
1381 if (loop->num_nodes != 5)
1382 return opt_result::failure_at (vect_location,
1383 "not vectorized:"
1384 " control flow in loop.\n");
1386 entryedge = loop_preheader_edge (innerloop);
1387 if (entryedge->src != loop->header
1388 || !single_exit (innerloop)
1389 || single_exit (innerloop)->dest != EDGE_PRED (loop->latch, 0)->src)
1390 return opt_result::failure_at (vect_location,
1391 "not vectorized:"
1392 " unsupported outerloop form.\n");
1394 /* Analyze the inner-loop. */
1395 vect_loop_form_info inner;
1396 opt_result res = vect_analyze_loop_form (loop->inner, &inner);
1397 if (!res)
1399 if (dump_enabled_p ())
1400 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1401 "not vectorized: Bad inner loop.\n");
1402 return res;
1405 /* Don't support analyzing niter under assumptions for inner
1406 loop. */
1407 if (!integer_onep (inner.assumptions))
1408 return opt_result::failure_at (vect_location,
1409 "not vectorized: Bad inner loop.\n");
1411 if (!expr_invariant_in_loop_p (loop, inner.number_of_iterations))
1412 return opt_result::failure_at (vect_location,
1413 "not vectorized: inner-loop count not"
1414 " invariant.\n");
1416 if (dump_enabled_p ())
1417 dump_printf_loc (MSG_NOTE, vect_location,
1418 "Considering outer-loop vectorization.\n");
1419 info->inner_loop_cond = inner.loop_cond;
1422 if (!single_exit (loop))
1423 return opt_result::failure_at (vect_location,
1424 "not vectorized: multiple exits.\n");
1425 if (EDGE_COUNT (loop->header->preds) != 2)
1426 return opt_result::failure_at (vect_location,
1427 "not vectorized:"
1428 " too many incoming edges.\n");
1430 /* We assume that the loop exit condition is at the end of the loop. i.e,
1431 that the loop is represented as a do-while (with a proper if-guard
1432 before the loop if needed), where the loop header contains all the
1433 executable statements, and the latch is empty. */
1434 if (!empty_block_p (loop->latch)
1435 || !gimple_seq_empty_p (phi_nodes (loop->latch)))
1436 return opt_result::failure_at (vect_location,
1437 "not vectorized: latch block not empty.\n");
1439 /* Make sure the exit is not abnormal. */
1440 edge e = single_exit (loop);
1441 if (e->flags & EDGE_ABNORMAL)
1442 return opt_result::failure_at (vect_location,
1443 "not vectorized:"
1444 " abnormal loop exit edge.\n");
1446 info->loop_cond
1447 = vect_get_loop_niters (loop, &info->assumptions,
1448 &info->number_of_iterations,
1449 &info->number_of_iterationsm1);
1450 if (!info->loop_cond)
1451 return opt_result::failure_at
1452 (vect_location,
1453 "not vectorized: complicated exit condition.\n");
1455 if (integer_zerop (info->assumptions)
1456 || !info->number_of_iterations
1457 || chrec_contains_undetermined (info->number_of_iterations))
1458 return opt_result::failure_at
1459 (info->loop_cond,
1460 "not vectorized: number of iterations cannot be computed.\n");
1462 if (integer_zerop (info->number_of_iterations))
1463 return opt_result::failure_at
1464 (info->loop_cond,
1465 "not vectorized: number of iterations = 0.\n");
1467 return opt_result::success ();
1470 /* Create a loop_vec_info for LOOP with SHARED and the
1471 vect_analyze_loop_form result. */
1473 loop_vec_info
1474 vect_create_loop_vinfo (class loop *loop, vec_info_shared *shared,
1475 const vect_loop_form_info *info)
1477 loop_vec_info loop_vinfo = new _loop_vec_info (loop, shared);
1478 LOOP_VINFO_NITERSM1 (loop_vinfo) = info->number_of_iterationsm1;
1479 LOOP_VINFO_NITERS (loop_vinfo) = info->number_of_iterations;
1480 LOOP_VINFO_NITERS_UNCHANGED (loop_vinfo) = info->number_of_iterations;
1481 if (!integer_onep (info->assumptions))
1483 /* We consider to vectorize this loop by versioning it under
1484 some assumptions. In order to do this, we need to clear
1485 existing information computed by scev and niter analyzer. */
1486 scev_reset_htab ();
1487 free_numbers_of_iterations_estimates (loop);
1488 /* Also set flag for this loop so that following scev and niter
1489 analysis are done under the assumptions. */
1490 loop_constraint_set (loop, LOOP_C_FINITE);
1491 /* Also record the assumptions for versioning. */
1492 LOOP_VINFO_NITERS_ASSUMPTIONS (loop_vinfo) = info->assumptions;
1495 if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
1497 if (dump_enabled_p ())
1499 dump_printf_loc (MSG_NOTE, vect_location,
1500 "Symbolic number of iterations is ");
1501 dump_generic_expr (MSG_NOTE, TDF_DETAILS, info->number_of_iterations);
1502 dump_printf (MSG_NOTE, "\n");
1506 stmt_vec_info loop_cond_info = loop_vinfo->lookup_stmt (info->loop_cond);
1507 STMT_VINFO_TYPE (loop_cond_info) = loop_exit_ctrl_vec_info_type;
1508 if (info->inner_loop_cond)
1510 stmt_vec_info inner_loop_cond_info
1511 = loop_vinfo->lookup_stmt (info->inner_loop_cond);
1512 STMT_VINFO_TYPE (inner_loop_cond_info) = loop_exit_ctrl_vec_info_type;
1513 /* If we have an estimate on the number of iterations of the inner
1514 loop use that to limit the scale for costing, otherwise use
1515 --param vect-inner-loop-cost-factor literally. */
1516 widest_int nit;
1517 if (estimated_stmt_executions (loop->inner, &nit))
1518 LOOP_VINFO_INNER_LOOP_COST_FACTOR (loop_vinfo)
1519 = wi::smin (nit, param_vect_inner_loop_cost_factor).to_uhwi ();
1522 return loop_vinfo;
1527 /* Scan the loop stmts and dependent on whether there are any (non-)SLP
1528 statements update the vectorization factor. */
1530 static void
1531 vect_update_vf_for_slp (loop_vec_info loop_vinfo)
1533 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1534 basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
1535 int nbbs = loop->num_nodes;
1536 poly_uint64 vectorization_factor;
1537 int i;
1539 DUMP_VECT_SCOPE ("vect_update_vf_for_slp");
1541 vectorization_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
1542 gcc_assert (known_ne (vectorization_factor, 0U));
1544 /* If all the stmts in the loop can be SLPed, we perform only SLP, and
1545 vectorization factor of the loop is the unrolling factor required by
1546 the SLP instances. If that unrolling factor is 1, we say, that we
1547 perform pure SLP on loop - cross iteration parallelism is not
1548 exploited. */
1549 bool only_slp_in_loop = true;
1550 for (i = 0; i < nbbs; i++)
1552 basic_block bb = bbs[i];
1553 for (gphi_iterator si = gsi_start_phis (bb); !gsi_end_p (si);
1554 gsi_next (&si))
1556 stmt_vec_info stmt_info = loop_vinfo->lookup_stmt (si.phi ());
1557 if (!stmt_info)
1558 continue;
1559 if ((STMT_VINFO_RELEVANT_P (stmt_info)
1560 || VECTORIZABLE_CYCLE_DEF (STMT_VINFO_DEF_TYPE (stmt_info)))
1561 && !PURE_SLP_STMT (stmt_info))
1562 /* STMT needs both SLP and loop-based vectorization. */
1563 only_slp_in_loop = false;
1565 for (gimple_stmt_iterator si = gsi_start_bb (bb); !gsi_end_p (si);
1566 gsi_next (&si))
1568 if (is_gimple_debug (gsi_stmt (si)))
1569 continue;
1570 stmt_vec_info stmt_info = loop_vinfo->lookup_stmt (gsi_stmt (si));
1571 stmt_info = vect_stmt_to_vectorize (stmt_info);
1572 if ((STMT_VINFO_RELEVANT_P (stmt_info)
1573 || VECTORIZABLE_CYCLE_DEF (STMT_VINFO_DEF_TYPE (stmt_info)))
1574 && !PURE_SLP_STMT (stmt_info))
1575 /* STMT needs both SLP and loop-based vectorization. */
1576 only_slp_in_loop = false;
1580 if (only_slp_in_loop)
1582 if (dump_enabled_p ())
1583 dump_printf_loc (MSG_NOTE, vect_location,
1584 "Loop contains only SLP stmts\n");
1585 vectorization_factor = LOOP_VINFO_SLP_UNROLLING_FACTOR (loop_vinfo);
1587 else
1589 if (dump_enabled_p ())
1590 dump_printf_loc (MSG_NOTE, vect_location,
1591 "Loop contains SLP and non-SLP stmts\n");
1592 /* Both the vectorization factor and unroll factor have the form
1593 GET_MODE_SIZE (loop_vinfo->vector_mode) * X for some rational X,
1594 so they must have a common multiple. */
1595 vectorization_factor
1596 = force_common_multiple (vectorization_factor,
1597 LOOP_VINFO_SLP_UNROLLING_FACTOR (loop_vinfo));
1600 LOOP_VINFO_VECT_FACTOR (loop_vinfo) = vectorization_factor;
1601 if (dump_enabled_p ())
1603 dump_printf_loc (MSG_NOTE, vect_location,
1604 "Updating vectorization factor to ");
1605 dump_dec (MSG_NOTE, vectorization_factor);
1606 dump_printf (MSG_NOTE, ".\n");
1610 /* Return true if STMT_INFO describes a double reduction phi and if
1611 the other phi in the reduction is also relevant for vectorization.
1612 This rejects cases such as:
1614 outer1:
1615 x_1 = PHI <x_3(outer2), ...>;
1618 inner:
1619 x_2 = ...;
1622 outer2:
1623 x_3 = PHI <x_2(inner)>;
1625 if nothing in x_2 or elsewhere makes x_1 relevant. */
1627 static bool
1628 vect_active_double_reduction_p (stmt_vec_info stmt_info)
1630 if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_double_reduction_def)
1631 return false;
1633 return STMT_VINFO_RELEVANT_P (STMT_VINFO_REDUC_DEF (stmt_info));
1636 /* Function vect_analyze_loop_operations.
1638 Scan the loop stmts and make sure they are all vectorizable. */
1640 static opt_result
1641 vect_analyze_loop_operations (loop_vec_info loop_vinfo)
1643 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1644 basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
1645 int nbbs = loop->num_nodes;
1646 int i;
1647 stmt_vec_info stmt_info;
1648 bool need_to_vectorize = false;
1649 bool ok;
1651 DUMP_VECT_SCOPE ("vect_analyze_loop_operations");
1653 auto_vec<stmt_info_for_cost> cost_vec;
1655 for (i = 0; i < nbbs; i++)
1657 basic_block bb = bbs[i];
1659 for (gphi_iterator si = gsi_start_phis (bb); !gsi_end_p (si);
1660 gsi_next (&si))
1662 gphi *phi = si.phi ();
1663 ok = true;
1665 stmt_info = loop_vinfo->lookup_stmt (phi);
1666 if (dump_enabled_p ())
1667 dump_printf_loc (MSG_NOTE, vect_location, "examining phi: %G", phi);
1668 if (virtual_operand_p (gimple_phi_result (phi)))
1669 continue;
1671 /* Inner-loop loop-closed exit phi in outer-loop vectorization
1672 (i.e., a phi in the tail of the outer-loop). */
1673 if (! is_loop_header_bb_p (bb))
1675 /* FORNOW: we currently don't support the case that these phis
1676 are not used in the outerloop (unless it is double reduction,
1677 i.e., this phi is vect_reduction_def), cause this case
1678 requires to actually do something here. */
1679 if (STMT_VINFO_LIVE_P (stmt_info)
1680 && !vect_active_double_reduction_p (stmt_info))
1681 return opt_result::failure_at (phi,
1682 "Unsupported loop-closed phi"
1683 " in outer-loop.\n");
1685 /* If PHI is used in the outer loop, we check that its operand
1686 is defined in the inner loop. */
1687 if (STMT_VINFO_RELEVANT_P (stmt_info))
1689 tree phi_op;
1691 if (gimple_phi_num_args (phi) != 1)
1692 return opt_result::failure_at (phi, "unsupported phi");
1694 phi_op = PHI_ARG_DEF (phi, 0);
1695 stmt_vec_info op_def_info = loop_vinfo->lookup_def (phi_op);
1696 if (!op_def_info)
1697 return opt_result::failure_at (phi, "unsupported phi\n");
1699 if (STMT_VINFO_RELEVANT (op_def_info) != vect_used_in_outer
1700 && (STMT_VINFO_RELEVANT (op_def_info)
1701 != vect_used_in_outer_by_reduction))
1702 return opt_result::failure_at (phi, "unsupported phi\n");
1704 if ((STMT_VINFO_DEF_TYPE (stmt_info) == vect_internal_def
1705 || (STMT_VINFO_DEF_TYPE (stmt_info)
1706 == vect_double_reduction_def))
1707 && !vectorizable_lc_phi (loop_vinfo,
1708 stmt_info, NULL, NULL))
1709 return opt_result::failure_at (phi, "unsupported phi\n");
1712 continue;
1715 gcc_assert (stmt_info);
1717 if ((STMT_VINFO_RELEVANT (stmt_info) == vect_used_in_scope
1718 || STMT_VINFO_LIVE_P (stmt_info))
1719 && STMT_VINFO_DEF_TYPE (stmt_info) != vect_induction_def)
1720 /* A scalar-dependence cycle that we don't support. */
1721 return opt_result::failure_at (phi,
1722 "not vectorized:"
1723 " scalar dependence cycle.\n");
1725 if (STMT_VINFO_RELEVANT_P (stmt_info))
1727 need_to_vectorize = true;
1728 if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_induction_def
1729 && ! PURE_SLP_STMT (stmt_info))
1730 ok = vectorizable_induction (loop_vinfo,
1731 stmt_info, NULL, NULL,
1732 &cost_vec);
1733 else if ((STMT_VINFO_DEF_TYPE (stmt_info) == vect_reduction_def
1734 || (STMT_VINFO_DEF_TYPE (stmt_info)
1735 == vect_double_reduction_def)
1736 || STMT_VINFO_DEF_TYPE (stmt_info) == vect_nested_cycle)
1737 && ! PURE_SLP_STMT (stmt_info))
1738 ok = vectorizable_reduction (loop_vinfo,
1739 stmt_info, NULL, NULL, &cost_vec);
1742 /* SLP PHIs are tested by vect_slp_analyze_node_operations. */
1743 if (ok
1744 && STMT_VINFO_LIVE_P (stmt_info)
1745 && !PURE_SLP_STMT (stmt_info))
1746 ok = vectorizable_live_operation (loop_vinfo,
1747 stmt_info, NULL, NULL, NULL,
1748 -1, false, &cost_vec);
1750 if (!ok)
1751 return opt_result::failure_at (phi,
1752 "not vectorized: relevant phi not "
1753 "supported: %G",
1754 static_cast <gimple *> (phi));
1757 for (gimple_stmt_iterator si = gsi_start_bb (bb); !gsi_end_p (si);
1758 gsi_next (&si))
1760 gimple *stmt = gsi_stmt (si);
1761 if (!gimple_clobber_p (stmt)
1762 && !is_gimple_debug (stmt))
1764 opt_result res
1765 = vect_analyze_stmt (loop_vinfo,
1766 loop_vinfo->lookup_stmt (stmt),
1767 &need_to_vectorize,
1768 NULL, NULL, &cost_vec);
1769 if (!res)
1770 return res;
1773 } /* bbs */
1775 add_stmt_costs (loop_vinfo->target_cost_data, &cost_vec);
1777 /* All operations in the loop are either irrelevant (deal with loop
1778 control, or dead), or only used outside the loop and can be moved
1779 out of the loop (e.g. invariants, inductions). The loop can be
1780 optimized away by scalar optimizations. We're better off not
1781 touching this loop. */
1782 if (!need_to_vectorize)
1784 if (dump_enabled_p ())
1785 dump_printf_loc (MSG_NOTE, vect_location,
1786 "All the computation can be taken out of the loop.\n");
1787 return opt_result::failure_at
1788 (vect_location,
1789 "not vectorized: redundant loop. no profit to vectorize.\n");
1792 return opt_result::success ();
1795 /* Return true if we know that the iteration count is smaller than the
1796 vectorization factor. Return false if it isn't, or if we can't be sure
1797 either way. */
1799 static bool
1800 vect_known_niters_smaller_than_vf (loop_vec_info loop_vinfo)
1802 unsigned int assumed_vf = vect_vf_for_cost (loop_vinfo);
1804 HOST_WIDE_INT max_niter;
1805 if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
1806 max_niter = LOOP_VINFO_INT_NITERS (loop_vinfo);
1807 else
1808 max_niter = max_stmt_executions_int (LOOP_VINFO_LOOP (loop_vinfo));
1810 if (max_niter != -1 && (unsigned HOST_WIDE_INT) max_niter < assumed_vf)
1811 return true;
1813 return false;
1816 /* Analyze the cost of the loop described by LOOP_VINFO. Decide if it
1817 is worthwhile to vectorize. Return 1 if definitely yes, 0 if
1818 definitely no, or -1 if it's worth retrying. */
1820 static int
1821 vect_analyze_loop_costing (loop_vec_info loop_vinfo)
1823 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1824 unsigned int assumed_vf = vect_vf_for_cost (loop_vinfo);
1826 /* Only loops that can handle partially-populated vectors can have iteration
1827 counts less than the vectorization factor. */
1828 if (!LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))
1830 if (vect_known_niters_smaller_than_vf (loop_vinfo))
1832 if (dump_enabled_p ())
1833 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1834 "not vectorized: iteration count smaller than "
1835 "vectorization factor.\n");
1836 return 0;
1840 /* If using the "very cheap" model. reject cases in which we'd keep
1841 a copy of the scalar code (even if we might be able to vectorize it). */
1842 if (loop_cost_model (loop) == VECT_COST_MODEL_VERY_CHEAP
1843 && (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo)
1844 || LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo)
1845 || LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo)))
1847 if (dump_enabled_p ())
1848 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1849 "some scalar iterations would need to be peeled\n");
1850 return 0;
1853 int min_profitable_iters, min_profitable_estimate;
1854 vect_estimate_min_profitable_iters (loop_vinfo, &min_profitable_iters,
1855 &min_profitable_estimate);
1857 if (min_profitable_iters < 0)
1859 if (dump_enabled_p ())
1860 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1861 "not vectorized: vectorization not profitable.\n");
1862 if (dump_enabled_p ())
1863 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1864 "not vectorized: vector version will never be "
1865 "profitable.\n");
1866 return -1;
1869 int min_scalar_loop_bound = (param_min_vect_loop_bound
1870 * assumed_vf);
1872 /* Use the cost model only if it is more conservative than user specified
1873 threshold. */
1874 unsigned int th = (unsigned) MAX (min_scalar_loop_bound,
1875 min_profitable_iters);
1877 LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo) = th;
1879 if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
1880 && LOOP_VINFO_INT_NITERS (loop_vinfo) < th)
1882 if (dump_enabled_p ())
1883 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1884 "not vectorized: vectorization not profitable.\n");
1885 if (dump_enabled_p ())
1886 dump_printf_loc (MSG_NOTE, vect_location,
1887 "not vectorized: iteration count smaller than user "
1888 "specified loop bound parameter or minimum profitable "
1889 "iterations (whichever is more conservative).\n");
1890 return 0;
1893 /* The static profitablity threshold min_profitable_estimate includes
1894 the cost of having to check at runtime whether the scalar loop
1895 should be used instead. If it turns out that we don't need or want
1896 such a check, the threshold we should use for the static estimate
1897 is simply the point at which the vector loop becomes more profitable
1898 than the scalar loop. */
1899 if (min_profitable_estimate > min_profitable_iters
1900 && !LOOP_REQUIRES_VERSIONING (loop_vinfo)
1901 && !LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo)
1902 && !LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo)
1903 && !vect_apply_runtime_profitability_check_p (loop_vinfo))
1905 if (dump_enabled_p ())
1906 dump_printf_loc (MSG_NOTE, vect_location, "no need for a runtime"
1907 " choice between the scalar and vector loops\n");
1908 min_profitable_estimate = min_profitable_iters;
1911 /* If the vector loop needs multiple iterations to be beneficial then
1912 things are probably too close to call, and the conservative thing
1913 would be to stick with the scalar code. */
1914 if (loop_cost_model (loop) == VECT_COST_MODEL_VERY_CHEAP
1915 && min_profitable_estimate > (int) vect_vf_for_cost (loop_vinfo))
1917 if (dump_enabled_p ())
1918 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1919 "one iteration of the vector loop would be"
1920 " more expensive than the equivalent number of"
1921 " iterations of the scalar loop\n");
1922 return 0;
1925 HOST_WIDE_INT estimated_niter;
1927 /* If we are vectorizing an epilogue then we know the maximum number of
1928 scalar iterations it will cover is at least one lower than the
1929 vectorization factor of the main loop. */
1930 if (LOOP_VINFO_EPILOGUE_P (loop_vinfo))
1931 estimated_niter
1932 = vect_vf_for_cost (LOOP_VINFO_ORIG_LOOP_INFO (loop_vinfo)) - 1;
1933 else
1935 estimated_niter = estimated_stmt_executions_int (loop);
1936 if (estimated_niter == -1)
1937 estimated_niter = likely_max_stmt_executions_int (loop);
1939 if (estimated_niter != -1
1940 && ((unsigned HOST_WIDE_INT) estimated_niter
1941 < MAX (th, (unsigned) min_profitable_estimate)))
1943 if (dump_enabled_p ())
1944 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1945 "not vectorized: estimated iteration count too "
1946 "small.\n");
1947 if (dump_enabled_p ())
1948 dump_printf_loc (MSG_NOTE, vect_location,
1949 "not vectorized: estimated iteration count smaller "
1950 "than specified loop bound parameter or minimum "
1951 "profitable iterations (whichever is more "
1952 "conservative).\n");
1953 return -1;
1956 return 1;
1959 static opt_result
1960 vect_get_datarefs_in_loop (loop_p loop, basic_block *bbs,
1961 vec<data_reference_p> *datarefs,
1962 unsigned int *n_stmts)
1964 *n_stmts = 0;
1965 for (unsigned i = 0; i < loop->num_nodes; i++)
1966 for (gimple_stmt_iterator gsi = gsi_start_bb (bbs[i]);
1967 !gsi_end_p (gsi); gsi_next (&gsi))
1969 gimple *stmt = gsi_stmt (gsi);
1970 if (is_gimple_debug (stmt))
1971 continue;
1972 ++(*n_stmts);
1973 opt_result res = vect_find_stmt_data_reference (loop, stmt, datarefs,
1974 NULL, 0);
1975 if (!res)
1977 if (is_gimple_call (stmt) && loop->safelen)
1979 tree fndecl = gimple_call_fndecl (stmt), op;
1980 if (fndecl != NULL_TREE)
1982 cgraph_node *node = cgraph_node::get (fndecl);
1983 if (node != NULL && node->simd_clones != NULL)
1985 unsigned int j, n = gimple_call_num_args (stmt);
1986 for (j = 0; j < n; j++)
1988 op = gimple_call_arg (stmt, j);
1989 if (DECL_P (op)
1990 || (REFERENCE_CLASS_P (op)
1991 && get_base_address (op)))
1992 break;
1994 op = gimple_call_lhs (stmt);
1995 /* Ignore #pragma omp declare simd functions
1996 if they don't have data references in the
1997 call stmt itself. */
1998 if (j == n
1999 && !(op
2000 && (DECL_P (op)
2001 || (REFERENCE_CLASS_P (op)
2002 && get_base_address (op)))))
2003 continue;
2007 return res;
2009 /* If dependence analysis will give up due to the limit on the
2010 number of datarefs stop here and fail fatally. */
2011 if (datarefs->length ()
2012 > (unsigned)param_loop_max_datarefs_for_datadeps)
2013 return opt_result::failure_at (stmt, "exceeded param "
2014 "loop-max-datarefs-for-datadeps\n");
2016 return opt_result::success ();
2019 /* Look for SLP-only access groups and turn each individual access into its own
2020 group. */
2021 static void
2022 vect_dissolve_slp_only_groups (loop_vec_info loop_vinfo)
2024 unsigned int i;
2025 struct data_reference *dr;
2027 DUMP_VECT_SCOPE ("vect_dissolve_slp_only_groups");
2029 vec<data_reference_p> datarefs = LOOP_VINFO_DATAREFS (loop_vinfo);
2030 FOR_EACH_VEC_ELT (datarefs, i, dr)
2032 gcc_assert (DR_REF (dr));
2033 stmt_vec_info stmt_info = loop_vinfo->lookup_stmt (DR_STMT (dr));
2035 /* Check if the load is a part of an interleaving chain. */
2036 if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
2038 stmt_vec_info first_element = DR_GROUP_FIRST_ELEMENT (stmt_info);
2039 dr_vec_info *dr_info = STMT_VINFO_DR_INFO (first_element);
2040 unsigned int group_size = DR_GROUP_SIZE (first_element);
2042 /* Check if SLP-only groups. */
2043 if (!STMT_SLP_TYPE (stmt_info)
2044 && STMT_VINFO_SLP_VECT_ONLY (first_element))
2046 /* Dissolve the group. */
2047 STMT_VINFO_SLP_VECT_ONLY (first_element) = false;
2049 stmt_vec_info vinfo = first_element;
2050 while (vinfo)
2052 stmt_vec_info next = DR_GROUP_NEXT_ELEMENT (vinfo);
2053 DR_GROUP_FIRST_ELEMENT (vinfo) = vinfo;
2054 DR_GROUP_NEXT_ELEMENT (vinfo) = NULL;
2055 DR_GROUP_SIZE (vinfo) = 1;
2056 if (STMT_VINFO_STRIDED_P (first_element))
2057 DR_GROUP_GAP (vinfo) = 0;
2058 else
2059 DR_GROUP_GAP (vinfo) = group_size - 1;
2060 /* Duplicate and adjust alignment info, it needs to
2061 be present on each group leader, see dr_misalignment. */
2062 if (vinfo != first_element)
2064 dr_vec_info *dr_info2 = STMT_VINFO_DR_INFO (vinfo);
2065 dr_info2->target_alignment = dr_info->target_alignment;
2066 int misalignment = dr_info->misalignment;
2067 if (misalignment != DR_MISALIGNMENT_UNKNOWN)
2069 HOST_WIDE_INT diff
2070 = (TREE_INT_CST_LOW (DR_INIT (dr_info2->dr))
2071 - TREE_INT_CST_LOW (DR_INIT (dr_info->dr)));
2072 unsigned HOST_WIDE_INT align_c
2073 = dr_info->target_alignment.to_constant ();
2074 misalignment = (misalignment + diff) % align_c;
2076 dr_info2->misalignment = misalignment;
2078 vinfo = next;
2085 /* Determine if operating on full vectors for LOOP_VINFO might leave
2086 some scalar iterations still to do. If so, decide how we should
2087 handle those scalar iterations. The possibilities are:
2089 (1) Make LOOP_VINFO operate on partial vectors instead of full vectors.
2090 In this case:
2092 LOOP_VINFO_USING_PARTIAL_VECTORS_P == true
2093 LOOP_VINFO_EPIL_USING_PARTIAL_VECTORS_P == false
2094 LOOP_VINFO_PEELING_FOR_NITER == false
2096 (2) Make LOOP_VINFO operate on full vectors and use an epilogue loop
2097 to handle the remaining scalar iterations. In this case:
2099 LOOP_VINFO_USING_PARTIAL_VECTORS_P == false
2100 LOOP_VINFO_PEELING_FOR_NITER == true
2102 There are two choices:
2104 (2a) Consider vectorizing the epilogue loop at the same VF as the
2105 main loop, but using partial vectors instead of full vectors.
2106 In this case:
2108 LOOP_VINFO_EPIL_USING_PARTIAL_VECTORS_P == true
2110 (2b) Consider vectorizing the epilogue loop at lower VFs only.
2111 In this case:
2113 LOOP_VINFO_EPIL_USING_PARTIAL_VECTORS_P == false
2115 When FOR_EPILOGUE_P is true, make this determination based on the
2116 assumption that LOOP_VINFO is an epilogue loop, otherwise make it
2117 based on the assumption that LOOP_VINFO is the main loop. The caller
2118 has made sure that the number of iterations is set appropriately for
2119 this value of FOR_EPILOGUE_P. */
2121 opt_result
2122 vect_determine_partial_vectors_and_peeling (loop_vec_info loop_vinfo,
2123 bool for_epilogue_p)
2125 /* Determine whether there would be any scalar iterations left over. */
2126 bool need_peeling_or_partial_vectors_p
2127 = vect_need_peeling_or_partial_vectors_p (loop_vinfo);
2129 /* Decide whether to vectorize the loop with partial vectors. */
2130 LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo) = false;
2131 LOOP_VINFO_EPIL_USING_PARTIAL_VECTORS_P (loop_vinfo) = false;
2132 if (LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo)
2133 && need_peeling_or_partial_vectors_p)
2135 /* For partial-vector-usage=1, try to push the handling of partial
2136 vectors to the epilogue, with the main loop continuing to operate
2137 on full vectors.
2139 ??? We could then end up failing to use partial vectors if we
2140 decide to peel iterations into a prologue, and if the main loop
2141 then ends up processing fewer than VF iterations. */
2142 if (param_vect_partial_vector_usage == 1
2143 && !LOOP_VINFO_EPILOGUE_P (loop_vinfo)
2144 && !vect_known_niters_smaller_than_vf (loop_vinfo))
2145 LOOP_VINFO_EPIL_USING_PARTIAL_VECTORS_P (loop_vinfo) = true;
2146 else
2147 LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo) = true;
2150 if (dump_enabled_p ())
2152 if (LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))
2153 dump_printf_loc (MSG_NOTE, vect_location,
2154 "operating on partial vectors%s.\n",
2155 for_epilogue_p ? " for epilogue loop" : "");
2156 else
2157 dump_printf_loc (MSG_NOTE, vect_location,
2158 "operating only on full vectors%s.\n",
2159 for_epilogue_p ? " for epilogue loop" : "");
2162 if (for_epilogue_p)
2164 loop_vec_info orig_loop_vinfo = LOOP_VINFO_ORIG_LOOP_INFO (loop_vinfo);
2165 gcc_assert (orig_loop_vinfo);
2166 if (!LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))
2167 gcc_assert (known_lt (LOOP_VINFO_VECT_FACTOR (loop_vinfo),
2168 LOOP_VINFO_VECT_FACTOR (orig_loop_vinfo)));
2171 if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
2172 && !LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))
2174 /* Check that the loop processes at least one full vector. */
2175 poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
2176 tree scalar_niters = LOOP_VINFO_NITERS (loop_vinfo);
2177 if (known_lt (wi::to_widest (scalar_niters), vf))
2178 return opt_result::failure_at (vect_location,
2179 "loop does not have enough iterations"
2180 " to support vectorization.\n");
2182 /* If we need to peel an extra epilogue iteration to handle data
2183 accesses with gaps, check that there are enough scalar iterations
2184 available.
2186 The check above is redundant with this one when peeling for gaps,
2187 but the distinction is useful for diagnostics. */
2188 tree scalar_nitersm1 = LOOP_VINFO_NITERSM1 (loop_vinfo);
2189 if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo)
2190 && known_lt (wi::to_widest (scalar_nitersm1), vf))
2191 return opt_result::failure_at (vect_location,
2192 "loop does not have enough iterations"
2193 " to support peeling for gaps.\n");
2196 LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo)
2197 = (!LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo)
2198 && need_peeling_or_partial_vectors_p);
2200 return opt_result::success ();
2203 /* Function vect_analyze_loop_2.
2205 Apply a set of analyses on LOOP, and create a loop_vec_info struct
2206 for it. The different analyses will record information in the
2207 loop_vec_info struct. */
2208 static opt_result
2209 vect_analyze_loop_2 (loop_vec_info loop_vinfo, bool &fatal)
2211 opt_result ok = opt_result::success ();
2212 int res;
2213 unsigned int max_vf = MAX_VECTORIZATION_FACTOR;
2214 poly_uint64 min_vf = 2;
2215 loop_vec_info orig_loop_vinfo = NULL;
2217 /* If we are dealing with an epilogue then orig_loop_vinfo points to the
2218 loop_vec_info of the first vectorized loop. */
2219 if (LOOP_VINFO_EPILOGUE_P (loop_vinfo))
2220 orig_loop_vinfo = LOOP_VINFO_ORIG_LOOP_INFO (loop_vinfo);
2221 else
2222 orig_loop_vinfo = loop_vinfo;
2223 gcc_assert (orig_loop_vinfo);
2225 /* The first group of checks is independent of the vector size. */
2226 fatal = true;
2228 if (LOOP_VINFO_SIMD_IF_COND (loop_vinfo)
2229 && integer_zerop (LOOP_VINFO_SIMD_IF_COND (loop_vinfo)))
2230 return opt_result::failure_at (vect_location,
2231 "not vectorized: simd if(0)\n");
2233 /* Find all data references in the loop (which correspond to vdefs/vuses)
2234 and analyze their evolution in the loop. */
2236 loop_p loop = LOOP_VINFO_LOOP (loop_vinfo);
2238 /* Gather the data references and count stmts in the loop. */
2239 if (!LOOP_VINFO_DATAREFS (loop_vinfo).exists ())
2241 opt_result res
2242 = vect_get_datarefs_in_loop (loop, LOOP_VINFO_BBS (loop_vinfo),
2243 &LOOP_VINFO_DATAREFS (loop_vinfo),
2244 &LOOP_VINFO_N_STMTS (loop_vinfo));
2245 if (!res)
2247 if (dump_enabled_p ())
2248 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2249 "not vectorized: loop contains function "
2250 "calls or data references that cannot "
2251 "be analyzed\n");
2252 return res;
2254 loop_vinfo->shared->save_datarefs ();
2256 else
2257 loop_vinfo->shared->check_datarefs ();
2259 /* Analyze the data references and also adjust the minimal
2260 vectorization factor according to the loads and stores. */
2262 ok = vect_analyze_data_refs (loop_vinfo, &min_vf, &fatal);
2263 if (!ok)
2265 if (dump_enabled_p ())
2266 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2267 "bad data references.\n");
2268 return ok;
2271 /* Classify all cross-iteration scalar data-flow cycles.
2272 Cross-iteration cycles caused by virtual phis are analyzed separately. */
2273 vect_analyze_scalar_cycles (loop_vinfo);
2275 vect_pattern_recog (loop_vinfo);
2277 vect_fixup_scalar_cycles_with_patterns (loop_vinfo);
2279 /* Analyze the access patterns of the data-refs in the loop (consecutive,
2280 complex, etc.). FORNOW: Only handle consecutive access pattern. */
2282 ok = vect_analyze_data_ref_accesses (loop_vinfo, NULL);
2283 if (!ok)
2285 if (dump_enabled_p ())
2286 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2287 "bad data access.\n");
2288 return ok;
2291 /* Data-flow analysis to detect stmts that do not need to be vectorized. */
2293 ok = vect_mark_stmts_to_be_vectorized (loop_vinfo, &fatal);
2294 if (!ok)
2296 if (dump_enabled_p ())
2297 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2298 "unexpected pattern.\n");
2299 return ok;
2302 /* While the rest of the analysis below depends on it in some way. */
2303 fatal = false;
2305 /* Analyze data dependences between the data-refs in the loop
2306 and adjust the maximum vectorization factor according to
2307 the dependences.
2308 FORNOW: fail at the first data dependence that we encounter. */
2310 ok = vect_analyze_data_ref_dependences (loop_vinfo, &max_vf);
2311 if (!ok)
2313 if (dump_enabled_p ())
2314 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2315 "bad data dependence.\n");
2316 return ok;
2318 if (max_vf != MAX_VECTORIZATION_FACTOR
2319 && maybe_lt (max_vf, min_vf))
2320 return opt_result::failure_at (vect_location, "bad data dependence.\n");
2321 LOOP_VINFO_MAX_VECT_FACTOR (loop_vinfo) = max_vf;
2323 ok = vect_determine_vectorization_factor (loop_vinfo);
2324 if (!ok)
2326 if (dump_enabled_p ())
2327 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2328 "can't determine vectorization factor.\n");
2329 return ok;
2331 if (max_vf != MAX_VECTORIZATION_FACTOR
2332 && maybe_lt (max_vf, LOOP_VINFO_VECT_FACTOR (loop_vinfo)))
2333 return opt_result::failure_at (vect_location, "bad data dependence.\n");
2335 /* Compute the scalar iteration cost. */
2336 vect_compute_single_scalar_iteration_cost (loop_vinfo);
2338 poly_uint64 saved_vectorization_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
2340 /* Check the SLP opportunities in the loop, analyze and build SLP trees. */
2341 ok = vect_analyze_slp (loop_vinfo, LOOP_VINFO_N_STMTS (loop_vinfo));
2342 if (!ok)
2343 return ok;
2345 /* If there are any SLP instances mark them as pure_slp. */
2346 bool slp = vect_make_slp_decision (loop_vinfo);
2347 if (slp)
2349 /* Find stmts that need to be both vectorized and SLPed. */
2350 vect_detect_hybrid_slp (loop_vinfo);
2352 /* Update the vectorization factor based on the SLP decision. */
2353 vect_update_vf_for_slp (loop_vinfo);
2355 /* Optimize the SLP graph with the vectorization factor fixed. */
2356 vect_optimize_slp (loop_vinfo);
2358 /* Gather the loads reachable from the SLP graph entries. */
2359 vect_gather_slp_loads (loop_vinfo);
2362 bool saved_can_use_partial_vectors_p
2363 = LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo);
2365 /* We don't expect to have to roll back to anything other than an empty
2366 set of rgroups. */
2367 gcc_assert (LOOP_VINFO_MASKS (loop_vinfo).is_empty ());
2369 /* This is the point where we can re-start analysis with SLP forced off. */
2370 start_over:
2372 /* Now the vectorization factor is final. */
2373 poly_uint64 vectorization_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
2374 gcc_assert (known_ne (vectorization_factor, 0U));
2376 if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo) && dump_enabled_p ())
2378 dump_printf_loc (MSG_NOTE, vect_location,
2379 "vectorization_factor = ");
2380 dump_dec (MSG_NOTE, vectorization_factor);
2381 dump_printf (MSG_NOTE, ", niters = %wd\n",
2382 LOOP_VINFO_INT_NITERS (loop_vinfo));
2385 LOOP_VINFO_TARGET_COST_DATA (loop_vinfo) = init_cost (loop_vinfo, false);
2387 /* Analyze the alignment of the data-refs in the loop.
2388 Fail if a data reference is found that cannot be vectorized. */
2390 ok = vect_analyze_data_refs_alignment (loop_vinfo);
2391 if (!ok)
2393 if (dump_enabled_p ())
2394 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2395 "bad data alignment.\n");
2396 return ok;
2399 /* Prune the list of ddrs to be tested at run-time by versioning for alias.
2400 It is important to call pruning after vect_analyze_data_ref_accesses,
2401 since we use grouping information gathered by interleaving analysis. */
2402 ok = vect_prune_runtime_alias_test_list (loop_vinfo);
2403 if (!ok)
2404 return ok;
2406 /* Do not invoke vect_enhance_data_refs_alignment for epilogue
2407 vectorization, since we do not want to add extra peeling or
2408 add versioning for alignment. */
2409 if (!LOOP_VINFO_EPILOGUE_P (loop_vinfo))
2410 /* This pass will decide on using loop versioning and/or loop peeling in
2411 order to enhance the alignment of data references in the loop. */
2412 ok = vect_enhance_data_refs_alignment (loop_vinfo);
2413 if (!ok)
2414 return ok;
2416 if (slp)
2418 /* Analyze operations in the SLP instances. Note this may
2419 remove unsupported SLP instances which makes the above
2420 SLP kind detection invalid. */
2421 unsigned old_size = LOOP_VINFO_SLP_INSTANCES (loop_vinfo).length ();
2422 vect_slp_analyze_operations (loop_vinfo);
2423 if (LOOP_VINFO_SLP_INSTANCES (loop_vinfo).length () != old_size)
2425 ok = opt_result::failure_at (vect_location,
2426 "unsupported SLP instances\n");
2427 goto again;
2430 /* Check whether any load in ALL SLP instances is possibly permuted. */
2431 slp_tree load_node, slp_root;
2432 unsigned i, x;
2433 slp_instance instance;
2434 bool can_use_lanes = true;
2435 FOR_EACH_VEC_ELT (LOOP_VINFO_SLP_INSTANCES (loop_vinfo), x, instance)
2437 slp_root = SLP_INSTANCE_TREE (instance);
2438 int group_size = SLP_TREE_LANES (slp_root);
2439 tree vectype = SLP_TREE_VECTYPE (slp_root);
2440 bool loads_permuted = false;
2441 FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (instance), i, load_node)
2443 if (!SLP_TREE_LOAD_PERMUTATION (load_node).exists ())
2444 continue;
2445 unsigned j;
2446 stmt_vec_info load_info;
2447 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (load_node), j, load_info)
2448 if (SLP_TREE_LOAD_PERMUTATION (load_node)[j] != j)
2450 loads_permuted = true;
2451 break;
2455 /* If the loads and stores can be handled with load/store-lane
2456 instructions record it and move on to the next instance. */
2457 if (loads_permuted
2458 && SLP_INSTANCE_KIND (instance) == slp_inst_kind_store
2459 && vect_store_lanes_supported (vectype, group_size, false))
2461 FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (instance), i, load_node)
2463 stmt_vec_info stmt_vinfo = DR_GROUP_FIRST_ELEMENT
2464 (SLP_TREE_SCALAR_STMTS (load_node)[0]);
2465 /* Use SLP for strided accesses (or if we can't
2466 load-lanes). */
2467 if (STMT_VINFO_STRIDED_P (stmt_vinfo)
2468 || ! vect_load_lanes_supported
2469 (STMT_VINFO_VECTYPE (stmt_vinfo),
2470 DR_GROUP_SIZE (stmt_vinfo), false))
2471 break;
2474 can_use_lanes
2475 = can_use_lanes && i == SLP_INSTANCE_LOADS (instance).length ();
2477 if (can_use_lanes && dump_enabled_p ())
2478 dump_printf_loc (MSG_NOTE, vect_location,
2479 "SLP instance %p can use load/store-lanes\n",
2480 instance);
2482 else
2484 can_use_lanes = false;
2485 break;
2489 /* If all SLP instances can use load/store-lanes abort SLP and try again
2490 with SLP disabled. */
2491 if (can_use_lanes)
2493 ok = opt_result::failure_at (vect_location,
2494 "Built SLP cancelled: can use "
2495 "load/store-lanes\n");
2496 if (dump_enabled_p ())
2497 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2498 "Built SLP cancelled: all SLP instances support "
2499 "load/store-lanes\n");
2500 goto again;
2504 /* Dissolve SLP-only groups. */
2505 vect_dissolve_slp_only_groups (loop_vinfo);
2507 /* Scan all the remaining operations in the loop that are not subject
2508 to SLP and make sure they are vectorizable. */
2509 ok = vect_analyze_loop_operations (loop_vinfo);
2510 if (!ok)
2512 if (dump_enabled_p ())
2513 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2514 "bad operation or unsupported loop bound.\n");
2515 return ok;
2518 /* For now, we don't expect to mix both masking and length approaches for one
2519 loop, disable it if both are recorded. */
2520 if (LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo)
2521 && !LOOP_VINFO_MASKS (loop_vinfo).is_empty ()
2522 && !LOOP_VINFO_LENS (loop_vinfo).is_empty ())
2524 if (dump_enabled_p ())
2525 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2526 "can't vectorize a loop with partial vectors"
2527 " because we don't expect to mix different"
2528 " approaches with partial vectors for the"
2529 " same loop.\n");
2530 LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo) = false;
2533 /* If we still have the option of using partial vectors,
2534 check whether we can generate the necessary loop controls. */
2535 if (LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo)
2536 && !vect_verify_full_masking (loop_vinfo)
2537 && !vect_verify_loop_lens (loop_vinfo))
2538 LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo) = false;
2540 /* If we're vectorizing an epilogue loop, the vectorized loop either needs
2541 to be able to handle fewer than VF scalars, or needs to have a lower VF
2542 than the main loop. */
2543 if (LOOP_VINFO_EPILOGUE_P (loop_vinfo)
2544 && !LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo)
2545 && maybe_ge (LOOP_VINFO_VECT_FACTOR (loop_vinfo),
2546 LOOP_VINFO_VECT_FACTOR (orig_loop_vinfo)))
2547 return opt_result::failure_at (vect_location,
2548 "Vectorization factor too high for"
2549 " epilogue loop.\n");
2551 /* Decide whether this loop_vinfo should use partial vectors or peeling,
2552 assuming that the loop will be used as a main loop. We will redo
2553 this analysis later if we instead decide to use the loop as an
2554 epilogue loop. */
2555 ok = vect_determine_partial_vectors_and_peeling (loop_vinfo, false);
2556 if (!ok)
2557 return ok;
2559 /* Check the costings of the loop make vectorizing worthwhile. */
2560 res = vect_analyze_loop_costing (loop_vinfo);
2561 if (res < 0)
2563 ok = opt_result::failure_at (vect_location,
2564 "Loop costings may not be worthwhile.\n");
2565 goto again;
2567 if (!res)
2568 return opt_result::failure_at (vect_location,
2569 "Loop costings not worthwhile.\n");
2571 /* If an epilogue loop is required make sure we can create one. */
2572 if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo)
2573 || LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo))
2575 if (dump_enabled_p ())
2576 dump_printf_loc (MSG_NOTE, vect_location, "epilog loop required\n");
2577 if (!vect_can_advance_ivs_p (loop_vinfo)
2578 || !slpeel_can_duplicate_loop_p (LOOP_VINFO_LOOP (loop_vinfo),
2579 single_exit (LOOP_VINFO_LOOP
2580 (loop_vinfo))))
2582 ok = opt_result::failure_at (vect_location,
2583 "not vectorized: can't create required "
2584 "epilog loop\n");
2585 goto again;
2589 /* During peeling, we need to check if number of loop iterations is
2590 enough for both peeled prolog loop and vector loop. This check
2591 can be merged along with threshold check of loop versioning, so
2592 increase threshold for this case if necessary.
2594 If we are analyzing an epilogue we still want to check what its
2595 versioning threshold would be. If we decide to vectorize the epilogues we
2596 will want to use the lowest versioning threshold of all epilogues and main
2597 loop. This will enable us to enter a vectorized epilogue even when
2598 versioning the loop. We can't simply check whether the epilogue requires
2599 versioning though since we may have skipped some versioning checks when
2600 analyzing the epilogue. For instance, checks for alias versioning will be
2601 skipped when dealing with epilogues as we assume we already checked them
2602 for the main loop. So instead we always check the 'orig_loop_vinfo'. */
2603 if (LOOP_REQUIRES_VERSIONING (orig_loop_vinfo))
2605 poly_uint64 niters_th = 0;
2606 unsigned int th = LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo);
2608 if (!vect_use_loop_mask_for_alignment_p (loop_vinfo))
2610 /* Niters for peeled prolog loop. */
2611 if (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) < 0)
2613 dr_vec_info *dr_info = LOOP_VINFO_UNALIGNED_DR (loop_vinfo);
2614 tree vectype = STMT_VINFO_VECTYPE (dr_info->stmt);
2615 niters_th += TYPE_VECTOR_SUBPARTS (vectype) - 1;
2617 else
2618 niters_th += LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo);
2621 /* Niters for at least one iteration of vectorized loop. */
2622 if (!LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))
2623 niters_th += LOOP_VINFO_VECT_FACTOR (loop_vinfo);
2624 /* One additional iteration because of peeling for gap. */
2625 if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo))
2626 niters_th += 1;
2628 /* Use the same condition as vect_transform_loop to decide when to use
2629 the cost to determine a versioning threshold. */
2630 if (vect_apply_runtime_profitability_check_p (loop_vinfo)
2631 && ordered_p (th, niters_th))
2632 niters_th = ordered_max (poly_uint64 (th), niters_th);
2634 LOOP_VINFO_VERSIONING_THRESHOLD (loop_vinfo) = niters_th;
2637 gcc_assert (known_eq (vectorization_factor,
2638 LOOP_VINFO_VECT_FACTOR (loop_vinfo)));
2640 /* Ok to vectorize! */
2641 LOOP_VINFO_VECTORIZABLE_P (loop_vinfo) = 1;
2642 return opt_result::success ();
2644 again:
2645 /* Ensure that "ok" is false (with an opt_problem if dumping is enabled). */
2646 gcc_assert (!ok);
2648 /* Try again with SLP forced off but if we didn't do any SLP there is
2649 no point in re-trying. */
2650 if (!slp)
2651 return ok;
2653 /* If there are reduction chains re-trying will fail anyway. */
2654 if (! LOOP_VINFO_REDUCTION_CHAINS (loop_vinfo).is_empty ())
2655 return ok;
2657 /* Likewise if the grouped loads or stores in the SLP cannot be handled
2658 via interleaving or lane instructions. */
2659 slp_instance instance;
2660 slp_tree node;
2661 unsigned i, j;
2662 FOR_EACH_VEC_ELT (LOOP_VINFO_SLP_INSTANCES (loop_vinfo), i, instance)
2664 stmt_vec_info vinfo;
2665 vinfo = SLP_TREE_SCALAR_STMTS (SLP_INSTANCE_TREE (instance))[0];
2666 if (! STMT_VINFO_GROUPED_ACCESS (vinfo))
2667 continue;
2668 vinfo = DR_GROUP_FIRST_ELEMENT (vinfo);
2669 unsigned int size = DR_GROUP_SIZE (vinfo);
2670 tree vectype = STMT_VINFO_VECTYPE (vinfo);
2671 if (! vect_store_lanes_supported (vectype, size, false)
2672 && ! known_eq (TYPE_VECTOR_SUBPARTS (vectype), 1U)
2673 && ! vect_grouped_store_supported (vectype, size))
2674 return opt_result::failure_at (vinfo->stmt,
2675 "unsupported grouped store\n");
2676 FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (instance), j, node)
2678 vinfo = SLP_TREE_SCALAR_STMTS (node)[0];
2679 vinfo = DR_GROUP_FIRST_ELEMENT (vinfo);
2680 bool single_element_p = !DR_GROUP_NEXT_ELEMENT (vinfo);
2681 size = DR_GROUP_SIZE (vinfo);
2682 vectype = STMT_VINFO_VECTYPE (vinfo);
2683 if (! vect_load_lanes_supported (vectype, size, false)
2684 && ! vect_grouped_load_supported (vectype, single_element_p,
2685 size))
2686 return opt_result::failure_at (vinfo->stmt,
2687 "unsupported grouped load\n");
2691 if (dump_enabled_p ())
2692 dump_printf_loc (MSG_NOTE, vect_location,
2693 "re-trying with SLP disabled\n");
2695 /* Roll back state appropriately. No SLP this time. */
2696 slp = false;
2697 /* Restore vectorization factor as it were without SLP. */
2698 LOOP_VINFO_VECT_FACTOR (loop_vinfo) = saved_vectorization_factor;
2699 /* Free the SLP instances. */
2700 FOR_EACH_VEC_ELT (LOOP_VINFO_SLP_INSTANCES (loop_vinfo), j, instance)
2701 vect_free_slp_instance (instance);
2702 LOOP_VINFO_SLP_INSTANCES (loop_vinfo).release ();
2703 /* Reset SLP type to loop_vect on all stmts. */
2704 for (i = 0; i < LOOP_VINFO_LOOP (loop_vinfo)->num_nodes; ++i)
2706 basic_block bb = LOOP_VINFO_BBS (loop_vinfo)[i];
2707 for (gimple_stmt_iterator si = gsi_start_phis (bb);
2708 !gsi_end_p (si); gsi_next (&si))
2710 stmt_vec_info stmt_info = loop_vinfo->lookup_stmt (gsi_stmt (si));
2711 STMT_SLP_TYPE (stmt_info) = loop_vect;
2712 if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_reduction_def
2713 || STMT_VINFO_DEF_TYPE (stmt_info) == vect_double_reduction_def)
2715 /* vectorizable_reduction adjusts reduction stmt def-types,
2716 restore them to that of the PHI. */
2717 STMT_VINFO_DEF_TYPE (STMT_VINFO_REDUC_DEF (stmt_info))
2718 = STMT_VINFO_DEF_TYPE (stmt_info);
2719 STMT_VINFO_DEF_TYPE (vect_stmt_to_vectorize
2720 (STMT_VINFO_REDUC_DEF (stmt_info)))
2721 = STMT_VINFO_DEF_TYPE (stmt_info);
2724 for (gimple_stmt_iterator si = gsi_start_bb (bb);
2725 !gsi_end_p (si); gsi_next (&si))
2727 if (is_gimple_debug (gsi_stmt (si)))
2728 continue;
2729 stmt_vec_info stmt_info = loop_vinfo->lookup_stmt (gsi_stmt (si));
2730 STMT_SLP_TYPE (stmt_info) = loop_vect;
2731 if (STMT_VINFO_IN_PATTERN_P (stmt_info))
2733 stmt_vec_info pattern_stmt_info
2734 = STMT_VINFO_RELATED_STMT (stmt_info);
2735 if (STMT_VINFO_SLP_VECT_ONLY_PATTERN (pattern_stmt_info))
2736 STMT_VINFO_IN_PATTERN_P (stmt_info) = false;
2738 gimple *pattern_def_seq = STMT_VINFO_PATTERN_DEF_SEQ (stmt_info);
2739 STMT_SLP_TYPE (pattern_stmt_info) = loop_vect;
2740 for (gimple_stmt_iterator pi = gsi_start (pattern_def_seq);
2741 !gsi_end_p (pi); gsi_next (&pi))
2742 STMT_SLP_TYPE (loop_vinfo->lookup_stmt (gsi_stmt (pi)))
2743 = loop_vect;
2747 /* Free optimized alias test DDRS. */
2748 LOOP_VINFO_LOWER_BOUNDS (loop_vinfo).truncate (0);
2749 LOOP_VINFO_COMP_ALIAS_DDRS (loop_vinfo).release ();
2750 LOOP_VINFO_CHECK_UNEQUAL_ADDRS (loop_vinfo).release ();
2751 /* Reset target cost data. */
2752 delete LOOP_VINFO_TARGET_COST_DATA (loop_vinfo);
2753 LOOP_VINFO_TARGET_COST_DATA (loop_vinfo) = nullptr;
2754 /* Reset accumulated rgroup information. */
2755 release_vec_loop_controls (&LOOP_VINFO_MASKS (loop_vinfo));
2756 release_vec_loop_controls (&LOOP_VINFO_LENS (loop_vinfo));
2757 /* Reset assorted flags. */
2758 LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo) = false;
2759 LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo) = false;
2760 LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo) = 0;
2761 LOOP_VINFO_VERSIONING_THRESHOLD (loop_vinfo) = 0;
2762 LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo)
2763 = saved_can_use_partial_vectors_p;
2765 goto start_over;
2768 /* Return true if vectorizing a loop using NEW_LOOP_VINFO appears
2769 to be better than vectorizing it using OLD_LOOP_VINFO. Assume that
2770 OLD_LOOP_VINFO is better unless something specifically indicates
2771 otherwise.
2773 Note that this deliberately isn't a partial order. */
2775 static bool
2776 vect_better_loop_vinfo_p (loop_vec_info new_loop_vinfo,
2777 loop_vec_info old_loop_vinfo)
2779 struct loop *loop = LOOP_VINFO_LOOP (new_loop_vinfo);
2780 gcc_assert (LOOP_VINFO_LOOP (old_loop_vinfo) == loop);
2782 poly_int64 new_vf = LOOP_VINFO_VECT_FACTOR (new_loop_vinfo);
2783 poly_int64 old_vf = LOOP_VINFO_VECT_FACTOR (old_loop_vinfo);
2785 /* Always prefer a VF of loop->simdlen over any other VF. */
2786 if (loop->simdlen)
2788 bool new_simdlen_p = known_eq (new_vf, loop->simdlen);
2789 bool old_simdlen_p = known_eq (old_vf, loop->simdlen);
2790 if (new_simdlen_p != old_simdlen_p)
2791 return new_simdlen_p;
2794 /* Limit the VFs to what is likely to be the maximum number of iterations,
2795 to handle cases in which at least one loop_vinfo is fully-masked. */
2796 HOST_WIDE_INT estimated_max_niter;
2797 loop_vec_info main_loop = LOOP_VINFO_ORIG_LOOP_INFO (old_loop_vinfo);
2798 unsigned HOST_WIDE_INT main_vf;
2799 if (main_loop
2800 && LOOP_VINFO_NITERS_KNOWN_P (main_loop)
2801 && LOOP_VINFO_VECT_FACTOR (main_loop).is_constant (&main_vf))
2802 estimated_max_niter = LOOP_VINFO_INT_NITERS (main_loop) % main_vf;
2803 else
2804 estimated_max_niter = likely_max_stmt_executions_int (loop);
2805 if (estimated_max_niter != -1)
2807 if (known_le (estimated_max_niter, new_vf))
2808 new_vf = estimated_max_niter;
2809 if (known_le (estimated_max_niter, old_vf))
2810 old_vf = estimated_max_niter;
2813 /* Check whether the (fractional) cost per scalar iteration is lower
2814 or higher: new_inside_cost / new_vf vs. old_inside_cost / old_vf. */
2815 poly_int64 rel_new = new_loop_vinfo->vec_inside_cost * old_vf;
2816 poly_int64 rel_old = old_loop_vinfo->vec_inside_cost * new_vf;
2818 HOST_WIDE_INT est_rel_new_min
2819 = estimated_poly_value (rel_new, POLY_VALUE_MIN);
2820 HOST_WIDE_INT est_rel_new_max
2821 = estimated_poly_value (rel_new, POLY_VALUE_MAX);
2823 HOST_WIDE_INT est_rel_old_min
2824 = estimated_poly_value (rel_old, POLY_VALUE_MIN);
2825 HOST_WIDE_INT est_rel_old_max
2826 = estimated_poly_value (rel_old, POLY_VALUE_MAX);
2828 /* Check first if we can make out an unambigous total order from the minimum
2829 and maximum estimates. */
2830 if (est_rel_new_min < est_rel_old_min
2831 && est_rel_new_max < est_rel_old_max)
2832 return true;
2833 else if (est_rel_old_min < est_rel_new_min
2834 && est_rel_old_max < est_rel_new_max)
2835 return false;
2836 /* When old_loop_vinfo uses a variable vectorization factor,
2837 we know that it has a lower cost for at least one runtime VF.
2838 However, we don't know how likely that VF is.
2840 One option would be to compare the costs for the estimated VFs.
2841 The problem is that that can put too much pressure on the cost
2842 model. E.g. if the estimated VF is also the lowest possible VF,
2843 and if old_loop_vinfo is 1 unit worse than new_loop_vinfo
2844 for the estimated VF, we'd then choose new_loop_vinfo even
2845 though (a) new_loop_vinfo might not actually be better than
2846 old_loop_vinfo for that VF and (b) it would be significantly
2847 worse at larger VFs.
2849 Here we go for a hacky compromise: pick new_loop_vinfo if it is
2850 no more expensive than old_loop_vinfo even after doubling the
2851 estimated old_loop_vinfo VF. For all but trivial loops, this
2852 ensures that we only pick new_loop_vinfo if it is significantly
2853 better than old_loop_vinfo at the estimated VF. */
2855 if (est_rel_old_min != est_rel_new_min
2856 || est_rel_old_max != est_rel_new_max)
2858 HOST_WIDE_INT est_rel_new_likely
2859 = estimated_poly_value (rel_new, POLY_VALUE_LIKELY);
2860 HOST_WIDE_INT est_rel_old_likely
2861 = estimated_poly_value (rel_old, POLY_VALUE_LIKELY);
2863 return est_rel_new_likely * 2 <= est_rel_old_likely;
2866 /* If there's nothing to choose between the loop bodies, see whether
2867 there's a difference in the prologue and epilogue costs. */
2868 if (new_loop_vinfo->vec_outside_cost != old_loop_vinfo->vec_outside_cost)
2869 return new_loop_vinfo->vec_outside_cost < old_loop_vinfo->vec_outside_cost;
2871 return false;
2874 /* Decide whether to replace OLD_LOOP_VINFO with NEW_LOOP_VINFO. Return
2875 true if we should. */
2877 static bool
2878 vect_joust_loop_vinfos (loop_vec_info new_loop_vinfo,
2879 loop_vec_info old_loop_vinfo)
2881 if (!vect_better_loop_vinfo_p (new_loop_vinfo, old_loop_vinfo))
2882 return false;
2884 if (dump_enabled_p ())
2885 dump_printf_loc (MSG_NOTE, vect_location,
2886 "***** Preferring vector mode %s to vector mode %s\n",
2887 GET_MODE_NAME (new_loop_vinfo->vector_mode),
2888 GET_MODE_NAME (old_loop_vinfo->vector_mode));
2889 return true;
2892 /* Analyze LOOP with VECTOR_MODES[MODE_I] and as epilogue if MAIN_LOOP_VINFO is
2893 not NULL. Set AUTODETECTED_VECTOR_MODE if VOIDmode and advance
2894 MODE_I to the next mode useful to analyze.
2895 Return the loop_vinfo on success and wrapped null on failure. */
2897 static opt_loop_vec_info
2898 vect_analyze_loop_1 (class loop *loop, vec_info_shared *shared,
2899 const vect_loop_form_info *loop_form_info,
2900 loop_vec_info main_loop_vinfo,
2901 const vector_modes &vector_modes, unsigned &mode_i,
2902 machine_mode &autodetected_vector_mode,
2903 bool &fatal)
2905 loop_vec_info loop_vinfo
2906 = vect_create_loop_vinfo (loop, shared, loop_form_info);
2907 if (main_loop_vinfo)
2908 LOOP_VINFO_ORIG_LOOP_INFO (loop_vinfo) = main_loop_vinfo;
2910 machine_mode vector_mode = vector_modes[mode_i];
2911 loop_vinfo->vector_mode = vector_mode;
2913 /* Run the main analysis. */
2914 opt_result res = vect_analyze_loop_2 (loop_vinfo, fatal);
2915 if (dump_enabled_p ())
2916 dump_printf_loc (MSG_NOTE, vect_location,
2917 "***** Analysis %s with vector mode %s\n",
2918 res ? "succeeded" : " failed",
2919 GET_MODE_NAME (loop_vinfo->vector_mode));
2921 /* Remember the autodetected vector mode. */
2922 if (vector_mode == VOIDmode)
2923 autodetected_vector_mode = loop_vinfo->vector_mode;
2925 /* Advance mode_i, first skipping modes that would result in the
2926 same analysis result. */
2927 while (mode_i + 1 < vector_modes.length ()
2928 && vect_chooses_same_modes_p (loop_vinfo,
2929 vector_modes[mode_i + 1]))
2931 if (dump_enabled_p ())
2932 dump_printf_loc (MSG_NOTE, vect_location,
2933 "***** The result for vector mode %s would"
2934 " be the same\n",
2935 GET_MODE_NAME (vector_modes[mode_i + 1]));
2936 mode_i += 1;
2938 if (mode_i + 1 < vector_modes.length ()
2939 && VECTOR_MODE_P (autodetected_vector_mode)
2940 && (related_vector_mode (vector_modes[mode_i + 1],
2941 GET_MODE_INNER (autodetected_vector_mode))
2942 == autodetected_vector_mode)
2943 && (related_vector_mode (autodetected_vector_mode,
2944 GET_MODE_INNER (vector_modes[mode_i + 1]))
2945 == vector_modes[mode_i + 1]))
2947 if (dump_enabled_p ())
2948 dump_printf_loc (MSG_NOTE, vect_location,
2949 "***** Skipping vector mode %s, which would"
2950 " repeat the analysis for %s\n",
2951 GET_MODE_NAME (vector_modes[mode_i + 1]),
2952 GET_MODE_NAME (autodetected_vector_mode));
2953 mode_i += 1;
2955 mode_i++;
2957 if (!res)
2959 delete loop_vinfo;
2960 if (fatal)
2961 gcc_checking_assert (main_loop_vinfo == NULL);
2962 return opt_loop_vec_info::propagate_failure (res);
2965 return opt_loop_vec_info::success (loop_vinfo);
2968 /* Function vect_analyze_loop.
2970 Apply a set of analyses on LOOP, and create a loop_vec_info struct
2971 for it. The different analyses will record information in the
2972 loop_vec_info struct. */
2973 opt_loop_vec_info
2974 vect_analyze_loop (class loop *loop, vec_info_shared *shared)
2976 DUMP_VECT_SCOPE ("analyze_loop_nest");
2978 if (loop_outer (loop)
2979 && loop_vec_info_for_loop (loop_outer (loop))
2980 && LOOP_VINFO_VECTORIZABLE_P (loop_vec_info_for_loop (loop_outer (loop))))
2981 return opt_loop_vec_info::failure_at (vect_location,
2982 "outer-loop already vectorized.\n");
2984 if (!find_loop_nest (loop, &shared->loop_nest))
2985 return opt_loop_vec_info::failure_at
2986 (vect_location,
2987 "not vectorized: loop nest containing two or more consecutive inner"
2988 " loops cannot be vectorized\n");
2990 /* Analyze the loop form. */
2991 vect_loop_form_info loop_form_info;
2992 opt_result res = vect_analyze_loop_form (loop, &loop_form_info);
2993 if (!res)
2995 if (dump_enabled_p ())
2996 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2997 "bad loop form.\n");
2998 return opt_loop_vec_info::propagate_failure (res);
3001 /* When pick_lowest_cost_p is true, we should in principle iterate
3002 over all the loop_vec_infos that LOOP_VINFO could replace and
3003 try to vectorize LOOP_VINFO under the same conditions.
3004 E.g. when trying to replace an epilogue loop, we should vectorize
3005 LOOP_VINFO as an epilogue loop with the same VF limit. When trying
3006 to replace the main loop, we should vectorize LOOP_VINFO as a main
3007 loop too.
3009 However, autovectorize_vector_modes is usually sorted as follows:
3011 - Modes that naturally produce lower VFs usually follow modes that
3012 naturally produce higher VFs.
3014 - When modes naturally produce the same VF, maskable modes
3015 usually follow unmaskable ones, so that the maskable mode
3016 can be used to vectorize the epilogue of the unmaskable mode.
3018 This order is preferred because it leads to the maximum
3019 epilogue vectorization opportunities. Targets should only use
3020 a different order if they want to make wide modes available while
3021 disparaging them relative to earlier, smaller modes. The assumption
3022 in that case is that the wider modes are more expensive in some
3023 way that isn't reflected directly in the costs.
3025 There should therefore be few interesting cases in which
3026 LOOP_VINFO fails when treated as an epilogue loop, succeeds when
3027 treated as a standalone loop, and ends up being genuinely cheaper
3028 than FIRST_LOOP_VINFO. */
3030 auto_vector_modes vector_modes;
3031 /* Autodetect first vector size we try. */
3032 vector_modes.safe_push (VOIDmode);
3033 unsigned int autovec_flags
3034 = targetm.vectorize.autovectorize_vector_modes (&vector_modes,
3035 loop->simdlen != 0);
3036 bool pick_lowest_cost_p = ((autovec_flags & VECT_COMPARE_COSTS)
3037 && !unlimited_cost_model (loop));
3038 machine_mode autodetected_vector_mode = VOIDmode;
3039 opt_loop_vec_info first_loop_vinfo = opt_loop_vec_info::success (NULL);
3040 unsigned int mode_i = 0;
3041 unsigned int first_loop_i = 0;
3042 unsigned int first_loop_next_i = 0;
3043 unsigned HOST_WIDE_INT simdlen = loop->simdlen;
3045 /* First determine the main loop vectorization mode. */
3046 while (1)
3048 unsigned int loop_vinfo_i = mode_i;
3049 bool fatal;
3050 opt_loop_vec_info loop_vinfo
3051 = vect_analyze_loop_1 (loop, shared, &loop_form_info,
3052 NULL, vector_modes, mode_i,
3053 autodetected_vector_mode, fatal);
3054 if (fatal)
3055 break;
3057 if (loop_vinfo)
3059 /* Once we hit the desired simdlen for the first time,
3060 discard any previous attempts. */
3061 if (simdlen
3062 && known_eq (LOOP_VINFO_VECT_FACTOR (loop_vinfo), simdlen))
3064 delete first_loop_vinfo;
3065 first_loop_vinfo = opt_loop_vec_info::success (NULL);
3066 simdlen = 0;
3068 else if (pick_lowest_cost_p && first_loop_vinfo)
3070 /* Keep trying to roll back vectorization attempts while the
3071 loop_vec_infos they produced were worse than this one. */
3072 if (vect_joust_loop_vinfos (loop_vinfo, first_loop_vinfo))
3074 delete first_loop_vinfo;
3075 first_loop_vinfo = opt_loop_vec_info::success (NULL);
3078 if (first_loop_vinfo == NULL)
3080 first_loop_vinfo = loop_vinfo;
3081 first_loop_i = loop_vinfo_i;
3082 first_loop_next_i = mode_i;
3084 else
3086 delete loop_vinfo;
3087 loop_vinfo = opt_loop_vec_info::success (NULL);
3090 /* Commit to first_loop_vinfo if we have no reason to try
3091 alternatives. */
3092 if (!simdlen && !pick_lowest_cost_p)
3093 break;
3095 if (mode_i == vector_modes.length ()
3096 || autodetected_vector_mode == VOIDmode)
3097 break;
3099 /* Try the next biggest vector size. */
3100 if (dump_enabled_p ())
3101 dump_printf_loc (MSG_NOTE, vect_location,
3102 "***** Re-trying analysis with vector mode %s\n",
3103 GET_MODE_NAME (vector_modes[mode_i]));
3105 if (!first_loop_vinfo)
3106 return opt_loop_vec_info::propagate_failure (res);
3108 if (dump_enabled_p ())
3109 dump_printf_loc (MSG_NOTE, vect_location,
3110 "***** Choosing vector mode %s\n",
3111 GET_MODE_NAME (first_loop_vinfo->vector_mode));
3113 /* Only vectorize epilogues if PARAM_VECT_EPILOGUES_NOMASK is
3114 enabled, SIMDUID is not set, it is the innermost loop and we have
3115 either already found the loop's SIMDLEN or there was no SIMDLEN to
3116 begin with.
3117 TODO: Enable epilogue vectorization for loops with SIMDUID set. */
3118 bool vect_epilogues = (!simdlen
3119 && loop->inner == NULL
3120 && param_vect_epilogues_nomask
3121 && LOOP_VINFO_PEELING_FOR_NITER (first_loop_vinfo)
3122 && !loop->simduid);
3123 if (!vect_epilogues)
3124 return first_loop_vinfo;
3126 /* Now analyze first_loop_vinfo for epilogue vectorization. */
3127 poly_uint64 lowest_th = LOOP_VINFO_VERSIONING_THRESHOLD (first_loop_vinfo);
3129 /* Handle the case that the original loop can use partial
3130 vectorization, but want to only adopt it for the epilogue.
3131 The retry should be in the same mode as original. */
3132 if (LOOP_VINFO_EPIL_USING_PARTIAL_VECTORS_P (first_loop_vinfo))
3134 gcc_assert (LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (first_loop_vinfo)
3135 && !LOOP_VINFO_USING_PARTIAL_VECTORS_P (first_loop_vinfo));
3136 if (dump_enabled_p ())
3137 dump_printf_loc (MSG_NOTE, vect_location,
3138 "***** Re-trying analysis with same vector mode"
3139 " %s for epilogue with partial vectors.\n",
3140 GET_MODE_NAME (first_loop_vinfo->vector_mode));
3141 mode_i = first_loop_i;
3143 else
3145 mode_i = first_loop_next_i;
3146 if (mode_i == vector_modes.length ())
3147 return first_loop_vinfo;
3150 /* ??? If first_loop_vinfo was using VOIDmode then we probably
3151 want to instead search for the corresponding mode in vector_modes[]. */
3153 while (1)
3155 bool fatal;
3156 opt_loop_vec_info loop_vinfo
3157 = vect_analyze_loop_1 (loop, shared, &loop_form_info,
3158 first_loop_vinfo,
3159 vector_modes, mode_i,
3160 autodetected_vector_mode, fatal);
3161 if (fatal)
3162 break;
3164 if (loop_vinfo)
3166 if (pick_lowest_cost_p)
3168 /* Keep trying to roll back vectorization attempts while the
3169 loop_vec_infos they produced were worse than this one. */
3170 vec<loop_vec_info> &vinfos = first_loop_vinfo->epilogue_vinfos;
3171 while (!vinfos.is_empty ()
3172 && vect_joust_loop_vinfos (loop_vinfo, vinfos.last ()))
3174 gcc_assert (vect_epilogues);
3175 delete vinfos.pop ();
3178 /* For now only allow one epilogue loop. */
3179 if (first_loop_vinfo->epilogue_vinfos.is_empty ())
3181 first_loop_vinfo->epilogue_vinfos.safe_push (loop_vinfo);
3182 poly_uint64 th = LOOP_VINFO_VERSIONING_THRESHOLD (loop_vinfo);
3183 gcc_assert (!LOOP_REQUIRES_VERSIONING (loop_vinfo)
3184 || maybe_ne (lowest_th, 0U));
3185 /* Keep track of the known smallest versioning
3186 threshold. */
3187 if (ordered_p (lowest_th, th))
3188 lowest_th = ordered_min (lowest_th, th);
3190 else
3192 delete loop_vinfo;
3193 loop_vinfo = opt_loop_vec_info::success (NULL);
3196 /* For now only allow one epilogue loop, but allow
3197 pick_lowest_cost_p to replace it, so commit to the
3198 first epilogue if we have no reason to try alternatives. */
3199 if (!pick_lowest_cost_p)
3200 break;
3203 if (mode_i == vector_modes.length ())
3204 break;
3206 /* Try the next biggest vector size. */
3207 if (dump_enabled_p ())
3208 dump_printf_loc (MSG_NOTE, vect_location,
3209 "***** Re-trying epilogue analysis with vector "
3210 "mode %s\n", GET_MODE_NAME (vector_modes[mode_i]));
3213 if (!first_loop_vinfo->epilogue_vinfos.is_empty ())
3215 LOOP_VINFO_VERSIONING_THRESHOLD (first_loop_vinfo) = lowest_th;
3216 if (dump_enabled_p ())
3217 dump_printf_loc (MSG_NOTE, vect_location,
3218 "***** Choosing epilogue vector mode %s\n",
3219 GET_MODE_NAME
3220 (first_loop_vinfo->epilogue_vinfos[0]->vector_mode));
3223 return first_loop_vinfo;
3226 /* Return true if there is an in-order reduction function for CODE, storing
3227 it in *REDUC_FN if so. */
3229 static bool
3230 fold_left_reduction_fn (tree_code code, internal_fn *reduc_fn)
3232 switch (code)
3234 case PLUS_EXPR:
3235 *reduc_fn = IFN_FOLD_LEFT_PLUS;
3236 return true;
3238 default:
3239 return false;
3243 /* Function reduction_fn_for_scalar_code
3245 Input:
3246 CODE - tree_code of a reduction operations.
3248 Output:
3249 REDUC_FN - the corresponding internal function to be used to reduce the
3250 vector of partial results into a single scalar result, or IFN_LAST
3251 if the operation is a supported reduction operation, but does not have
3252 such an internal function.
3254 Return FALSE if CODE currently cannot be vectorized as reduction. */
3256 bool
3257 reduction_fn_for_scalar_code (enum tree_code code, internal_fn *reduc_fn)
3259 switch (code)
3261 case MAX_EXPR:
3262 *reduc_fn = IFN_REDUC_MAX;
3263 return true;
3265 case MIN_EXPR:
3266 *reduc_fn = IFN_REDUC_MIN;
3267 return true;
3269 case PLUS_EXPR:
3270 *reduc_fn = IFN_REDUC_PLUS;
3271 return true;
3273 case BIT_AND_EXPR:
3274 *reduc_fn = IFN_REDUC_AND;
3275 return true;
3277 case BIT_IOR_EXPR:
3278 *reduc_fn = IFN_REDUC_IOR;
3279 return true;
3281 case BIT_XOR_EXPR:
3282 *reduc_fn = IFN_REDUC_XOR;
3283 return true;
3285 case MULT_EXPR:
3286 case MINUS_EXPR:
3287 *reduc_fn = IFN_LAST;
3288 return true;
3290 default:
3291 return false;
3295 /* If there is a neutral value X such that a reduction would not be affected
3296 by the introduction of additional X elements, return that X, otherwise
3297 return null. CODE is the code of the reduction and SCALAR_TYPE is type
3298 of the scalar elements. If the reduction has just a single initial value
3299 then INITIAL_VALUE is that value, otherwise it is null. */
3301 static tree
3302 neutral_op_for_reduction (tree scalar_type, tree_code code, tree initial_value)
3304 switch (code)
3306 case WIDEN_SUM_EXPR:
3307 case DOT_PROD_EXPR:
3308 case SAD_EXPR:
3309 case PLUS_EXPR:
3310 case MINUS_EXPR:
3311 case BIT_IOR_EXPR:
3312 case BIT_XOR_EXPR:
3313 return build_zero_cst (scalar_type);
3315 case MULT_EXPR:
3316 return build_one_cst (scalar_type);
3318 case BIT_AND_EXPR:
3319 return build_all_ones_cst (scalar_type);
3321 case MAX_EXPR:
3322 case MIN_EXPR:
3323 return initial_value;
3325 default:
3326 return NULL_TREE;
3330 /* Error reporting helper for vect_is_simple_reduction below. GIMPLE statement
3331 STMT is printed with a message MSG. */
3333 static void
3334 report_vect_op (dump_flags_t msg_type, gimple *stmt, const char *msg)
3336 dump_printf_loc (msg_type, vect_location, "%s%G", msg, stmt);
3339 /* Return true if we need an in-order reduction for operation CODE
3340 on type TYPE. NEED_WRAPPING_INTEGRAL_OVERFLOW is true if integer
3341 overflow must wrap. */
3343 bool
3344 needs_fold_left_reduction_p (tree type, tree_code code)
3346 /* CHECKME: check for !flag_finite_math_only too? */
3347 if (SCALAR_FLOAT_TYPE_P (type))
3348 switch (code)
3350 case MIN_EXPR:
3351 case MAX_EXPR:
3352 return false;
3354 default:
3355 return !flag_associative_math;
3358 if (INTEGRAL_TYPE_P (type))
3360 if (!operation_no_trapping_overflow (type, code))
3361 return true;
3362 return false;
3365 if (SAT_FIXED_POINT_TYPE_P (type))
3366 return true;
3368 return false;
3371 /* Return true if the reduction PHI in LOOP with latch arg LOOP_ARG and
3372 has a handled computation expression. Store the main reduction
3373 operation in *CODE. */
3375 static bool
3376 check_reduction_path (dump_user_location_t loc, loop_p loop, gphi *phi,
3377 tree loop_arg, enum tree_code *code,
3378 vec<std::pair<ssa_op_iter, use_operand_p> > &path)
3380 auto_bitmap visited;
3381 tree lookfor = PHI_RESULT (phi);
3382 ssa_op_iter curri;
3383 use_operand_p curr = op_iter_init_phiuse (&curri, phi, SSA_OP_USE);
3384 while (USE_FROM_PTR (curr) != loop_arg)
3385 curr = op_iter_next_use (&curri);
3386 curri.i = curri.numops;
3389 path.safe_push (std::make_pair (curri, curr));
3390 tree use = USE_FROM_PTR (curr);
3391 if (use == lookfor)
3392 break;
3393 gimple *def = SSA_NAME_DEF_STMT (use);
3394 if (gimple_nop_p (def)
3395 || ! flow_bb_inside_loop_p (loop, gimple_bb (def)))
3397 pop:
3400 std::pair<ssa_op_iter, use_operand_p> x = path.pop ();
3401 curri = x.first;
3402 curr = x.second;
3404 curr = op_iter_next_use (&curri);
3405 /* Skip already visited or non-SSA operands (from iterating
3406 over PHI args). */
3407 while (curr != NULL_USE_OPERAND_P
3408 && (TREE_CODE (USE_FROM_PTR (curr)) != SSA_NAME
3409 || ! bitmap_set_bit (visited,
3410 SSA_NAME_VERSION
3411 (USE_FROM_PTR (curr)))));
3413 while (curr == NULL_USE_OPERAND_P && ! path.is_empty ());
3414 if (curr == NULL_USE_OPERAND_P)
3415 break;
3417 else
3419 if (gimple_code (def) == GIMPLE_PHI)
3420 curr = op_iter_init_phiuse (&curri, as_a <gphi *>(def), SSA_OP_USE);
3421 else
3422 curr = op_iter_init_use (&curri, def, SSA_OP_USE);
3423 while (curr != NULL_USE_OPERAND_P
3424 && (TREE_CODE (USE_FROM_PTR (curr)) != SSA_NAME
3425 || ! bitmap_set_bit (visited,
3426 SSA_NAME_VERSION
3427 (USE_FROM_PTR (curr)))))
3428 curr = op_iter_next_use (&curri);
3429 if (curr == NULL_USE_OPERAND_P)
3430 goto pop;
3433 while (1);
3434 if (dump_file && (dump_flags & TDF_DETAILS))
3436 dump_printf_loc (MSG_NOTE, loc, "reduction path: ");
3437 unsigned i;
3438 std::pair<ssa_op_iter, use_operand_p> *x;
3439 FOR_EACH_VEC_ELT (path, i, x)
3440 dump_printf (MSG_NOTE, "%T ", USE_FROM_PTR (x->second));
3441 dump_printf (MSG_NOTE, "\n");
3444 /* Check whether the reduction path detected is valid. */
3445 bool fail = path.length () == 0;
3446 bool neg = false;
3447 int sign = -1;
3448 *code = ERROR_MARK;
3449 for (unsigned i = 1; i < path.length (); ++i)
3451 gimple *use_stmt = USE_STMT (path[i].second);
3452 tree op = USE_FROM_PTR (path[i].second);
3453 if (! is_gimple_assign (use_stmt)
3454 /* The following make sure we can compute the operand index
3455 easily plus it mostly disallows chaining via COND_EXPR condition
3456 operands. */
3457 || (gimple_assign_rhs1_ptr (use_stmt) != path[i].second->use
3458 && (gimple_num_ops (use_stmt) <= 2
3459 || gimple_assign_rhs2_ptr (use_stmt) != path[i].second->use)
3460 && (gimple_num_ops (use_stmt) <= 3
3461 || gimple_assign_rhs3_ptr (use_stmt) != path[i].second->use)))
3463 fail = true;
3464 break;
3466 tree_code use_code = gimple_assign_rhs_code (use_stmt);
3467 if (use_code == MINUS_EXPR)
3469 use_code = PLUS_EXPR;
3470 /* Track whether we negate the reduction value each iteration. */
3471 if (gimple_assign_rhs2 (use_stmt) == op)
3472 neg = ! neg;
3474 if (CONVERT_EXPR_CODE_P (use_code)
3475 && tree_nop_conversion_p (TREE_TYPE (gimple_assign_lhs (use_stmt)),
3476 TREE_TYPE (gimple_assign_rhs1 (use_stmt))))
3478 else if (*code == ERROR_MARK)
3480 *code = use_code;
3481 sign = TYPE_SIGN (TREE_TYPE (gimple_assign_lhs (use_stmt)));
3483 else if (use_code != *code)
3485 fail = true;
3486 break;
3488 else if ((use_code == MIN_EXPR
3489 || use_code == MAX_EXPR)
3490 && sign != TYPE_SIGN (TREE_TYPE (gimple_assign_lhs (use_stmt))))
3492 fail = true;
3493 break;
3495 /* Check there's only a single stmt the op is used on. For the
3496 not value-changing tail and the last stmt allow out-of-loop uses.
3497 ??? We could relax this and handle arbitrary live stmts by
3498 forcing a scalar epilogue for example. */
3499 imm_use_iterator imm_iter;
3500 gimple *op_use_stmt;
3501 unsigned cnt = 0;
3502 FOR_EACH_IMM_USE_STMT (op_use_stmt, imm_iter, op)
3503 if (!is_gimple_debug (op_use_stmt)
3504 && (*code != ERROR_MARK
3505 || flow_bb_inside_loop_p (loop, gimple_bb (op_use_stmt))))
3507 /* We want to allow x + x but not x < 1 ? x : 2. */
3508 if (is_gimple_assign (op_use_stmt)
3509 && gimple_assign_rhs_code (op_use_stmt) == COND_EXPR)
3511 use_operand_p use_p;
3512 FOR_EACH_IMM_USE_ON_STMT (use_p, imm_iter)
3513 cnt++;
3515 else
3516 cnt++;
3518 if (cnt != 1)
3520 fail = true;
3521 break;
3524 return ! fail && ! neg && *code != ERROR_MARK;
3527 bool
3528 check_reduction_path (dump_user_location_t loc, loop_p loop, gphi *phi,
3529 tree loop_arg, enum tree_code code)
3531 auto_vec<std::pair<ssa_op_iter, use_operand_p> > path;
3532 enum tree_code code_;
3533 return (check_reduction_path (loc, loop, phi, loop_arg, &code_, path)
3534 && code_ == code);
3539 /* Function vect_is_simple_reduction
3541 (1) Detect a cross-iteration def-use cycle that represents a simple
3542 reduction computation. We look for the following pattern:
3544 loop_header:
3545 a1 = phi < a0, a2 >
3546 a3 = ...
3547 a2 = operation (a3, a1)
3551 a3 = ...
3552 loop_header:
3553 a1 = phi < a0, a2 >
3554 a2 = operation (a3, a1)
3556 such that:
3557 1. operation is commutative and associative and it is safe to
3558 change the order of the computation
3559 2. no uses for a2 in the loop (a2 is used out of the loop)
3560 3. no uses of a1 in the loop besides the reduction operation
3561 4. no uses of a1 outside the loop.
3563 Conditions 1,4 are tested here.
3564 Conditions 2,3 are tested in vect_mark_stmts_to_be_vectorized.
3566 (2) Detect a cross-iteration def-use cycle in nested loops, i.e.,
3567 nested cycles.
3569 (3) Detect cycles of phi nodes in outer-loop vectorization, i.e., double
3570 reductions:
3572 a1 = phi < a0, a2 >
3573 inner loop (def of a3)
3574 a2 = phi < a3 >
3576 (4) Detect condition expressions, ie:
3577 for (int i = 0; i < N; i++)
3578 if (a[i] < val)
3579 ret_val = a[i];
3583 static stmt_vec_info
3584 vect_is_simple_reduction (loop_vec_info loop_info, stmt_vec_info phi_info,
3585 bool *double_reduc, bool *reduc_chain_p)
3587 gphi *phi = as_a <gphi *> (phi_info->stmt);
3588 gimple *phi_use_stmt = NULL;
3589 imm_use_iterator imm_iter;
3590 use_operand_p use_p;
3592 *double_reduc = false;
3593 *reduc_chain_p = false;
3594 STMT_VINFO_REDUC_TYPE (phi_info) = TREE_CODE_REDUCTION;
3596 tree phi_name = PHI_RESULT (phi);
3597 /* ??? If there are no uses of the PHI result the inner loop reduction
3598 won't be detected as possibly double-reduction by vectorizable_reduction
3599 because that tries to walk the PHI arg from the preheader edge which
3600 can be constant. See PR60382. */
3601 if (has_zero_uses (phi_name))
3602 return NULL;
3603 class loop *loop = (gimple_bb (phi))->loop_father;
3604 unsigned nphi_def_loop_uses = 0;
3605 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, phi_name)
3607 gimple *use_stmt = USE_STMT (use_p);
3608 if (is_gimple_debug (use_stmt))
3609 continue;
3611 if (!flow_bb_inside_loop_p (loop, gimple_bb (use_stmt)))
3613 if (dump_enabled_p ())
3614 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3615 "intermediate value used outside loop.\n");
3617 return NULL;
3620 nphi_def_loop_uses++;
3621 phi_use_stmt = use_stmt;
3624 tree latch_def = PHI_ARG_DEF_FROM_EDGE (phi, loop_latch_edge (loop));
3625 if (TREE_CODE (latch_def) != SSA_NAME)
3627 if (dump_enabled_p ())
3628 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3629 "reduction: not ssa_name: %T\n", latch_def);
3630 return NULL;
3633 stmt_vec_info def_stmt_info = loop_info->lookup_def (latch_def);
3634 if (!def_stmt_info
3635 || !flow_bb_inside_loop_p (loop, gimple_bb (def_stmt_info->stmt)))
3636 return NULL;
3638 bool nested_in_vect_loop
3639 = flow_loop_nested_p (LOOP_VINFO_LOOP (loop_info), loop);
3640 unsigned nlatch_def_loop_uses = 0;
3641 auto_vec<gphi *, 3> lcphis;
3642 bool inner_loop_of_double_reduc = false;
3643 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, latch_def)
3645 gimple *use_stmt = USE_STMT (use_p);
3646 if (is_gimple_debug (use_stmt))
3647 continue;
3648 if (flow_bb_inside_loop_p (loop, gimple_bb (use_stmt)))
3649 nlatch_def_loop_uses++;
3650 else
3652 /* We can have more than one loop-closed PHI. */
3653 lcphis.safe_push (as_a <gphi *> (use_stmt));
3654 if (nested_in_vect_loop
3655 && (STMT_VINFO_DEF_TYPE (loop_info->lookup_stmt (use_stmt))
3656 == vect_double_reduction_def))
3657 inner_loop_of_double_reduc = true;
3661 /* If we are vectorizing an inner reduction we are executing that
3662 in the original order only in case we are not dealing with a
3663 double reduction. */
3664 if (nested_in_vect_loop && !inner_loop_of_double_reduc)
3666 if (dump_enabled_p ())
3667 report_vect_op (MSG_NOTE, def_stmt_info->stmt,
3668 "detected nested cycle: ");
3669 return def_stmt_info;
3672 /* If this isn't a nested cycle or if the nested cycle reduction value
3673 is used ouside of the inner loop we cannot handle uses of the reduction
3674 value. */
3675 if (nlatch_def_loop_uses > 1 || nphi_def_loop_uses > 1)
3677 if (dump_enabled_p ())
3678 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3679 "reduction used in loop.\n");
3680 return NULL;
3683 /* If DEF_STMT is a phi node itself, we expect it to have a single argument
3684 defined in the inner loop. */
3685 if (gphi *def_stmt = dyn_cast <gphi *> (def_stmt_info->stmt))
3687 tree op1 = PHI_ARG_DEF (def_stmt, 0);
3688 if (gimple_phi_num_args (def_stmt) != 1
3689 || TREE_CODE (op1) != SSA_NAME)
3691 if (dump_enabled_p ())
3692 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3693 "unsupported phi node definition.\n");
3695 return NULL;
3698 gimple *def1 = SSA_NAME_DEF_STMT (op1);
3699 if (gimple_bb (def1)
3700 && flow_bb_inside_loop_p (loop, gimple_bb (def_stmt))
3701 && loop->inner
3702 && flow_bb_inside_loop_p (loop->inner, gimple_bb (def1))
3703 && is_gimple_assign (def1)
3704 && is_a <gphi *> (phi_use_stmt)
3705 && flow_bb_inside_loop_p (loop->inner, gimple_bb (phi_use_stmt)))
3707 if (dump_enabled_p ())
3708 report_vect_op (MSG_NOTE, def_stmt,
3709 "detected double reduction: ");
3711 *double_reduc = true;
3712 return def_stmt_info;
3715 return NULL;
3718 /* Look for the expression computing latch_def from then loop PHI result. */
3719 auto_vec<std::pair<ssa_op_iter, use_operand_p> > path;
3720 enum tree_code code;
3721 if (check_reduction_path (vect_location, loop, phi, latch_def, &code,
3722 path))
3724 STMT_VINFO_REDUC_CODE (phi_info) = code;
3725 if (code == COND_EXPR && !nested_in_vect_loop)
3726 STMT_VINFO_REDUC_TYPE (phi_info) = COND_REDUCTION;
3728 /* Fill in STMT_VINFO_REDUC_IDX and gather stmts for an SLP
3729 reduction chain for which the additional restriction is that
3730 all operations in the chain are the same. */
3731 auto_vec<stmt_vec_info, 8> reduc_chain;
3732 unsigned i;
3733 bool is_slp_reduc = !nested_in_vect_loop && code != COND_EXPR;
3734 for (i = path.length () - 1; i >= 1; --i)
3736 gimple *stmt = USE_STMT (path[i].second);
3737 stmt_vec_info stmt_info = loop_info->lookup_stmt (stmt);
3738 STMT_VINFO_REDUC_IDX (stmt_info)
3739 = path[i].second->use - gimple_assign_rhs1_ptr (stmt);
3740 enum tree_code stmt_code = gimple_assign_rhs_code (stmt);
3741 bool leading_conversion = (CONVERT_EXPR_CODE_P (stmt_code)
3742 && (i == 1 || i == path.length () - 1));
3743 if ((stmt_code != code && !leading_conversion)
3744 /* We can only handle the final value in epilogue
3745 generation for reduction chains. */
3746 || (i != 1 && !has_single_use (gimple_assign_lhs (stmt))))
3747 is_slp_reduc = false;
3748 /* For reduction chains we support a trailing/leading
3749 conversions. We do not store those in the actual chain. */
3750 if (leading_conversion)
3751 continue;
3752 reduc_chain.safe_push (stmt_info);
3754 if (is_slp_reduc && reduc_chain.length () > 1)
3756 for (unsigned i = 0; i < reduc_chain.length () - 1; ++i)
3758 REDUC_GROUP_FIRST_ELEMENT (reduc_chain[i]) = reduc_chain[0];
3759 REDUC_GROUP_NEXT_ELEMENT (reduc_chain[i]) = reduc_chain[i+1];
3761 REDUC_GROUP_FIRST_ELEMENT (reduc_chain.last ()) = reduc_chain[0];
3762 REDUC_GROUP_NEXT_ELEMENT (reduc_chain.last ()) = NULL;
3764 /* Save the chain for further analysis in SLP detection. */
3765 LOOP_VINFO_REDUCTION_CHAINS (loop_info).safe_push (reduc_chain[0]);
3766 REDUC_GROUP_SIZE (reduc_chain[0]) = reduc_chain.length ();
3768 *reduc_chain_p = true;
3769 if (dump_enabled_p ())
3770 dump_printf_loc (MSG_NOTE, vect_location,
3771 "reduction: detected reduction chain\n");
3773 else if (dump_enabled_p ())
3774 dump_printf_loc (MSG_NOTE, vect_location,
3775 "reduction: detected reduction\n");
3777 return def_stmt_info;
3780 if (dump_enabled_p ())
3781 dump_printf_loc (MSG_NOTE, vect_location,
3782 "reduction: unknown pattern\n");
3784 return NULL;
3787 /* Estimate the number of peeled epilogue iterations for LOOP_VINFO.
3788 PEEL_ITERS_PROLOGUE is the number of peeled prologue iterations,
3789 or -1 if not known. */
3791 static int
3792 vect_get_peel_iters_epilogue (loop_vec_info loop_vinfo, int peel_iters_prologue)
3794 int assumed_vf = vect_vf_for_cost (loop_vinfo);
3795 if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo) || peel_iters_prologue == -1)
3797 if (dump_enabled_p ())
3798 dump_printf_loc (MSG_NOTE, vect_location,
3799 "cost model: epilogue peel iters set to vf/2 "
3800 "because loop iterations are unknown .\n");
3801 return assumed_vf / 2;
3803 else
3805 int niters = LOOP_VINFO_INT_NITERS (loop_vinfo);
3806 peel_iters_prologue = MIN (niters, peel_iters_prologue);
3807 int peel_iters_epilogue = (niters - peel_iters_prologue) % assumed_vf;
3808 /* If we need to peel for gaps, but no peeling is required, we have to
3809 peel VF iterations. */
3810 if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo) && !peel_iters_epilogue)
3811 peel_iters_epilogue = assumed_vf;
3812 return peel_iters_epilogue;
3816 /* Calculate cost of peeling the loop PEEL_ITERS_PROLOGUE times. */
3818 vect_get_known_peeling_cost (loop_vec_info loop_vinfo, int peel_iters_prologue,
3819 int *peel_iters_epilogue,
3820 stmt_vector_for_cost *scalar_cost_vec,
3821 stmt_vector_for_cost *prologue_cost_vec,
3822 stmt_vector_for_cost *epilogue_cost_vec)
3824 int retval = 0;
3826 *peel_iters_epilogue
3827 = vect_get_peel_iters_epilogue (loop_vinfo, peel_iters_prologue);
3829 if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
3831 /* If peeled iterations are known but number of scalar loop
3832 iterations are unknown, count a taken branch per peeled loop. */
3833 if (peel_iters_prologue > 0)
3834 retval = record_stmt_cost (prologue_cost_vec, 1, cond_branch_taken,
3835 NULL, NULL_TREE, 0, vect_prologue);
3836 if (*peel_iters_epilogue > 0)
3837 retval += record_stmt_cost (epilogue_cost_vec, 1, cond_branch_taken,
3838 NULL, NULL_TREE, 0, vect_epilogue);
3841 stmt_info_for_cost *si;
3842 int j;
3843 if (peel_iters_prologue)
3844 FOR_EACH_VEC_ELT (*scalar_cost_vec, j, si)
3845 retval += record_stmt_cost (prologue_cost_vec,
3846 si->count * peel_iters_prologue,
3847 si->kind, si->stmt_info, si->misalign,
3848 vect_prologue);
3849 if (*peel_iters_epilogue)
3850 FOR_EACH_VEC_ELT (*scalar_cost_vec, j, si)
3851 retval += record_stmt_cost (epilogue_cost_vec,
3852 si->count * *peel_iters_epilogue,
3853 si->kind, si->stmt_info, si->misalign,
3854 vect_epilogue);
3856 return retval;
3859 /* Function vect_estimate_min_profitable_iters
3861 Return the number of iterations required for the vector version of the
3862 loop to be profitable relative to the cost of the scalar version of the
3863 loop.
3865 *RET_MIN_PROFITABLE_NITERS is a cost model profitability threshold
3866 of iterations for vectorization. -1 value means loop vectorization
3867 is not profitable. This returned value may be used for dynamic
3868 profitability check.
3870 *RET_MIN_PROFITABLE_ESTIMATE is a profitability threshold to be used
3871 for static check against estimated number of iterations. */
3873 static void
3874 vect_estimate_min_profitable_iters (loop_vec_info loop_vinfo,
3875 int *ret_min_profitable_niters,
3876 int *ret_min_profitable_estimate)
3878 int min_profitable_iters;
3879 int min_profitable_estimate;
3880 int peel_iters_prologue;
3881 int peel_iters_epilogue;
3882 unsigned vec_inside_cost = 0;
3883 int vec_outside_cost = 0;
3884 unsigned vec_prologue_cost = 0;
3885 unsigned vec_epilogue_cost = 0;
3886 int scalar_single_iter_cost = 0;
3887 int scalar_outside_cost = 0;
3888 int assumed_vf = vect_vf_for_cost (loop_vinfo);
3889 int npeel = LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo);
3890 vector_costs *target_cost_data = LOOP_VINFO_TARGET_COST_DATA (loop_vinfo);
3892 /* Cost model disabled. */
3893 if (unlimited_cost_model (LOOP_VINFO_LOOP (loop_vinfo)))
3895 if (dump_enabled_p ())
3896 dump_printf_loc (MSG_NOTE, vect_location, "cost model disabled.\n");
3897 *ret_min_profitable_niters = 0;
3898 *ret_min_profitable_estimate = 0;
3899 return;
3902 /* Requires loop versioning tests to handle misalignment. */
3903 if (LOOP_REQUIRES_VERSIONING_FOR_ALIGNMENT (loop_vinfo))
3905 /* FIXME: Make cost depend on complexity of individual check. */
3906 unsigned len = LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).length ();
3907 (void) add_stmt_cost (target_cost_data, len, vector_stmt,
3908 NULL, NULL_TREE, 0, vect_prologue);
3909 if (dump_enabled_p ())
3910 dump_printf (MSG_NOTE,
3911 "cost model: Adding cost of checks for loop "
3912 "versioning to treat misalignment.\n");
3915 /* Requires loop versioning with alias checks. */
3916 if (LOOP_REQUIRES_VERSIONING_FOR_ALIAS (loop_vinfo))
3918 /* FIXME: Make cost depend on complexity of individual check. */
3919 unsigned len = LOOP_VINFO_COMP_ALIAS_DDRS (loop_vinfo).length ();
3920 (void) add_stmt_cost (target_cost_data, len, vector_stmt,
3921 NULL, NULL_TREE, 0, vect_prologue);
3922 len = LOOP_VINFO_CHECK_UNEQUAL_ADDRS (loop_vinfo).length ();
3923 if (len)
3924 /* Count LEN - 1 ANDs and LEN comparisons. */
3925 (void) add_stmt_cost (target_cost_data, len * 2 - 1,
3926 scalar_stmt, NULL, NULL_TREE, 0, vect_prologue);
3927 len = LOOP_VINFO_LOWER_BOUNDS (loop_vinfo).length ();
3928 if (len)
3930 /* Count LEN - 1 ANDs and LEN comparisons. */
3931 unsigned int nstmts = len * 2 - 1;
3932 /* +1 for each bias that needs adding. */
3933 for (unsigned int i = 0; i < len; ++i)
3934 if (!LOOP_VINFO_LOWER_BOUNDS (loop_vinfo)[i].unsigned_p)
3935 nstmts += 1;
3936 (void) add_stmt_cost (target_cost_data, nstmts,
3937 scalar_stmt, NULL, NULL_TREE, 0, vect_prologue);
3939 if (dump_enabled_p ())
3940 dump_printf (MSG_NOTE,
3941 "cost model: Adding cost of checks for loop "
3942 "versioning aliasing.\n");
3945 /* Requires loop versioning with niter checks. */
3946 if (LOOP_REQUIRES_VERSIONING_FOR_NITERS (loop_vinfo))
3948 /* FIXME: Make cost depend on complexity of individual check. */
3949 (void) add_stmt_cost (target_cost_data, 1, vector_stmt,
3950 NULL, NULL_TREE, 0, vect_prologue);
3951 if (dump_enabled_p ())
3952 dump_printf (MSG_NOTE,
3953 "cost model: Adding cost of checks for loop "
3954 "versioning niters.\n");
3957 if (LOOP_REQUIRES_VERSIONING (loop_vinfo))
3958 (void) add_stmt_cost (target_cost_data, 1, cond_branch_taken,
3959 NULL, NULL_TREE, 0, vect_prologue);
3961 /* Count statements in scalar loop. Using this as scalar cost for a single
3962 iteration for now.
3964 TODO: Add outer loop support.
3966 TODO: Consider assigning different costs to different scalar
3967 statements. */
3969 scalar_single_iter_cost
3970 = LOOP_VINFO_SINGLE_SCALAR_ITERATION_COST (loop_vinfo);
3972 /* Add additional cost for the peeled instructions in prologue and epilogue
3973 loop. (For fully-masked loops there will be no peeling.)
3975 FORNOW: If we don't know the value of peel_iters for prologue or epilogue
3976 at compile-time - we assume it's vf/2 (the worst would be vf-1).
3978 TODO: Build an expression that represents peel_iters for prologue and
3979 epilogue to be used in a run-time test. */
3981 bool prologue_need_br_taken_cost = false;
3982 bool prologue_need_br_not_taken_cost = false;
3984 /* Calculate peel_iters_prologue. */
3985 if (vect_use_loop_mask_for_alignment_p (loop_vinfo))
3986 peel_iters_prologue = 0;
3987 else if (npeel < 0)
3989 peel_iters_prologue = assumed_vf / 2;
3990 if (dump_enabled_p ())
3991 dump_printf (MSG_NOTE, "cost model: "
3992 "prologue peel iters set to vf/2.\n");
3994 /* If peeled iterations are unknown, count a taken branch and a not taken
3995 branch per peeled loop. Even if scalar loop iterations are known,
3996 vector iterations are not known since peeled prologue iterations are
3997 not known. Hence guards remain the same. */
3998 prologue_need_br_taken_cost = true;
3999 prologue_need_br_not_taken_cost = true;
4001 else
4003 peel_iters_prologue = npeel;
4004 if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo) && peel_iters_prologue > 0)
4005 /* If peeled iterations are known but number of scalar loop
4006 iterations are unknown, count a taken branch per peeled loop. */
4007 prologue_need_br_taken_cost = true;
4010 bool epilogue_need_br_taken_cost = false;
4011 bool epilogue_need_br_not_taken_cost = false;
4013 /* Calculate peel_iters_epilogue. */
4014 if (LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))
4015 /* We need to peel exactly one iteration for gaps. */
4016 peel_iters_epilogue = LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo) ? 1 : 0;
4017 else if (npeel < 0)
4019 /* If peeling for alignment is unknown, loop bound of main loop
4020 becomes unknown. */
4021 peel_iters_epilogue = assumed_vf / 2;
4022 if (dump_enabled_p ())
4023 dump_printf (MSG_NOTE, "cost model: "
4024 "epilogue peel iters set to vf/2 because "
4025 "peeling for alignment is unknown.\n");
4027 /* See the same reason above in peel_iters_prologue calculation. */
4028 epilogue_need_br_taken_cost = true;
4029 epilogue_need_br_not_taken_cost = true;
4031 else
4033 peel_iters_epilogue = vect_get_peel_iters_epilogue (loop_vinfo, npeel);
4034 if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo) && peel_iters_epilogue > 0)
4035 /* If peeled iterations are known but number of scalar loop
4036 iterations are unknown, count a taken branch per peeled loop. */
4037 epilogue_need_br_taken_cost = true;
4040 stmt_info_for_cost *si;
4041 int j;
4042 /* Add costs associated with peel_iters_prologue. */
4043 if (peel_iters_prologue)
4044 FOR_EACH_VEC_ELT (LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo), j, si)
4046 (void) add_stmt_cost (target_cost_data,
4047 si->count * peel_iters_prologue, si->kind,
4048 si->stmt_info, si->vectype, si->misalign,
4049 vect_prologue);
4052 /* Add costs associated with peel_iters_epilogue. */
4053 if (peel_iters_epilogue)
4054 FOR_EACH_VEC_ELT (LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo), j, si)
4056 (void) add_stmt_cost (target_cost_data,
4057 si->count * peel_iters_epilogue, si->kind,
4058 si->stmt_info, si->vectype, si->misalign,
4059 vect_epilogue);
4062 /* Add possible cond_branch_taken/cond_branch_not_taken cost. */
4064 if (prologue_need_br_taken_cost)
4065 (void) add_stmt_cost (target_cost_data, 1, cond_branch_taken,
4066 NULL, NULL_TREE, 0, vect_prologue);
4068 if (prologue_need_br_not_taken_cost)
4069 (void) add_stmt_cost (target_cost_data, 1,
4070 cond_branch_not_taken, NULL, NULL_TREE, 0,
4071 vect_prologue);
4073 if (epilogue_need_br_taken_cost)
4074 (void) add_stmt_cost (target_cost_data, 1, cond_branch_taken,
4075 NULL, NULL_TREE, 0, vect_epilogue);
4077 if (epilogue_need_br_not_taken_cost)
4078 (void) add_stmt_cost (target_cost_data, 1,
4079 cond_branch_not_taken, NULL, NULL_TREE, 0,
4080 vect_epilogue);
4082 /* Take care of special costs for rgroup controls of partial vectors. */
4083 if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
4085 /* Calculate how many masks we need to generate. */
4086 unsigned int num_masks = 0;
4087 rgroup_controls *rgm;
4088 unsigned int num_vectors_m1;
4089 FOR_EACH_VEC_ELT (LOOP_VINFO_MASKS (loop_vinfo), num_vectors_m1, rgm)
4090 if (rgm->type)
4091 num_masks += num_vectors_m1 + 1;
4092 gcc_assert (num_masks > 0);
4094 /* In the worst case, we need to generate each mask in the prologue
4095 and in the loop body. One of the loop body mask instructions
4096 replaces the comparison in the scalar loop, and since we don't
4097 count the scalar comparison against the scalar body, we shouldn't
4098 count that vector instruction against the vector body either.
4100 Sometimes we can use unpacks instead of generating prologue
4101 masks and sometimes the prologue mask will fold to a constant,
4102 so the actual prologue cost might be smaller. However, it's
4103 simpler and safer to use the worst-case cost; if this ends up
4104 being the tie-breaker between vectorizing or not, then it's
4105 probably better not to vectorize. */
4106 (void) add_stmt_cost (target_cost_data, num_masks,
4107 vector_stmt, NULL, NULL_TREE, 0, vect_prologue);
4108 (void) add_stmt_cost (target_cost_data, num_masks - 1,
4109 vector_stmt, NULL, NULL_TREE, 0, vect_body);
4111 else if (LOOP_VINFO_FULLY_WITH_LENGTH_P (loop_vinfo))
4113 /* Referring to the functions vect_set_loop_condition_partial_vectors
4114 and vect_set_loop_controls_directly, we need to generate each
4115 length in the prologue and in the loop body if required. Although
4116 there are some possible optimizations, we consider the worst case
4117 here. */
4119 bool niters_known_p = LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo);
4120 bool need_iterate_p
4121 = (!LOOP_VINFO_EPILOGUE_P (loop_vinfo)
4122 && !vect_known_niters_smaller_than_vf (loop_vinfo));
4124 /* Calculate how many statements to be added. */
4125 unsigned int prologue_stmts = 0;
4126 unsigned int body_stmts = 0;
4128 rgroup_controls *rgc;
4129 unsigned int num_vectors_m1;
4130 FOR_EACH_VEC_ELT (LOOP_VINFO_LENS (loop_vinfo), num_vectors_m1, rgc)
4131 if (rgc->type)
4133 /* May need one SHIFT for nitems_total computation. */
4134 unsigned nitems = rgc->max_nscalars_per_iter * rgc->factor;
4135 if (nitems != 1 && !niters_known_p)
4136 prologue_stmts += 1;
4138 /* May need one MAX and one MINUS for wrap around. */
4139 if (vect_rgroup_iv_might_wrap_p (loop_vinfo, rgc))
4140 prologue_stmts += 2;
4142 /* Need one MAX and one MINUS for each batch limit excepting for
4143 the 1st one. */
4144 prologue_stmts += num_vectors_m1 * 2;
4146 unsigned int num_vectors = num_vectors_m1 + 1;
4148 /* Need to set up lengths in prologue, only one MIN required
4149 for each since start index is zero. */
4150 prologue_stmts += num_vectors;
4152 /* Each may need two MINs and one MINUS to update lengths in body
4153 for next iteration. */
4154 if (need_iterate_p)
4155 body_stmts += 3 * num_vectors;
4158 (void) add_stmt_cost (target_cost_data, prologue_stmts,
4159 scalar_stmt, NULL, NULL_TREE, 0, vect_prologue);
4160 (void) add_stmt_cost (target_cost_data, body_stmts,
4161 scalar_stmt, NULL, NULL_TREE, 0, vect_body);
4164 /* FORNOW: The scalar outside cost is incremented in one of the
4165 following ways:
4167 1. The vectorizer checks for alignment and aliasing and generates
4168 a condition that allows dynamic vectorization. A cost model
4169 check is ANDED with the versioning condition. Hence scalar code
4170 path now has the added cost of the versioning check.
4172 if (cost > th & versioning_check)
4173 jmp to vector code
4175 Hence run-time scalar is incremented by not-taken branch cost.
4177 2. The vectorizer then checks if a prologue is required. If the
4178 cost model check was not done before during versioning, it has to
4179 be done before the prologue check.
4181 if (cost <= th)
4182 prologue = scalar_iters
4183 if (prologue == 0)
4184 jmp to vector code
4185 else
4186 execute prologue
4187 if (prologue == num_iters)
4188 go to exit
4190 Hence the run-time scalar cost is incremented by a taken branch,
4191 plus a not-taken branch, plus a taken branch cost.
4193 3. The vectorizer then checks if an epilogue is required. If the
4194 cost model check was not done before during prologue check, it
4195 has to be done with the epilogue check.
4197 if (prologue == 0)
4198 jmp to vector code
4199 else
4200 execute prologue
4201 if (prologue == num_iters)
4202 go to exit
4203 vector code:
4204 if ((cost <= th) | (scalar_iters-prologue-epilogue == 0))
4205 jmp to epilogue
4207 Hence the run-time scalar cost should be incremented by 2 taken
4208 branches.
4210 TODO: The back end may reorder the BBS's differently and reverse
4211 conditions/branch directions. Change the estimates below to
4212 something more reasonable. */
4214 /* If the number of iterations is known and we do not do versioning, we can
4215 decide whether to vectorize at compile time. Hence the scalar version
4216 do not carry cost model guard costs. */
4217 if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
4218 || LOOP_REQUIRES_VERSIONING (loop_vinfo))
4220 /* Cost model check occurs at versioning. */
4221 if (LOOP_REQUIRES_VERSIONING (loop_vinfo))
4222 scalar_outside_cost += vect_get_stmt_cost (cond_branch_not_taken);
4223 else
4225 /* Cost model check occurs at prologue generation. */
4226 if (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) < 0)
4227 scalar_outside_cost += 2 * vect_get_stmt_cost (cond_branch_taken)
4228 + vect_get_stmt_cost (cond_branch_not_taken);
4229 /* Cost model check occurs at epilogue generation. */
4230 else
4231 scalar_outside_cost += 2 * vect_get_stmt_cost (cond_branch_taken);
4235 /* Complete the target-specific cost calculations. */
4236 finish_cost (LOOP_VINFO_TARGET_COST_DATA (loop_vinfo), &vec_prologue_cost,
4237 &vec_inside_cost, &vec_epilogue_cost);
4239 vec_outside_cost = (int)(vec_prologue_cost + vec_epilogue_cost);
4241 /* Stash the costs so that we can compare two loop_vec_infos. */
4242 loop_vinfo->vec_inside_cost = vec_inside_cost;
4243 loop_vinfo->vec_outside_cost = vec_outside_cost;
4245 if (dump_enabled_p ())
4247 dump_printf_loc (MSG_NOTE, vect_location, "Cost model analysis: \n");
4248 dump_printf (MSG_NOTE, " Vector inside of loop cost: %d\n",
4249 vec_inside_cost);
4250 dump_printf (MSG_NOTE, " Vector prologue cost: %d\n",
4251 vec_prologue_cost);
4252 dump_printf (MSG_NOTE, " Vector epilogue cost: %d\n",
4253 vec_epilogue_cost);
4254 dump_printf (MSG_NOTE, " Scalar iteration cost: %d\n",
4255 scalar_single_iter_cost);
4256 dump_printf (MSG_NOTE, " Scalar outside cost: %d\n",
4257 scalar_outside_cost);
4258 dump_printf (MSG_NOTE, " Vector outside cost: %d\n",
4259 vec_outside_cost);
4260 dump_printf (MSG_NOTE, " prologue iterations: %d\n",
4261 peel_iters_prologue);
4262 dump_printf (MSG_NOTE, " epilogue iterations: %d\n",
4263 peel_iters_epilogue);
4266 /* Calculate number of iterations required to make the vector version
4267 profitable, relative to the loop bodies only. The following condition
4268 must hold true:
4269 SIC * niters + SOC > VIC * ((niters - NPEEL) / VF) + VOC
4270 where
4271 SIC = scalar iteration cost, VIC = vector iteration cost,
4272 VOC = vector outside cost, VF = vectorization factor,
4273 NPEEL = prologue iterations + epilogue iterations,
4274 SOC = scalar outside cost for run time cost model check. */
4276 int saving_per_viter = (scalar_single_iter_cost * assumed_vf
4277 - vec_inside_cost);
4278 if (saving_per_viter <= 0)
4280 if (LOOP_VINFO_LOOP (loop_vinfo)->force_vectorize)
4281 warning_at (vect_location.get_location_t (), OPT_Wopenmp_simd,
4282 "vectorization did not happen for a simd loop");
4284 if (dump_enabled_p ())
4285 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4286 "cost model: the vector iteration cost = %d "
4287 "divided by the scalar iteration cost = %d "
4288 "is greater or equal to the vectorization factor = %d"
4289 ".\n",
4290 vec_inside_cost, scalar_single_iter_cost, assumed_vf);
4291 *ret_min_profitable_niters = -1;
4292 *ret_min_profitable_estimate = -1;
4293 return;
4296 /* ??? The "if" arm is written to handle all cases; see below for what
4297 we would do for !LOOP_VINFO_USING_PARTIAL_VECTORS_P. */
4298 if (LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))
4300 /* Rewriting the condition above in terms of the number of
4301 vector iterations (vniters) rather than the number of
4302 scalar iterations (niters) gives:
4304 SIC * (vniters * VF + NPEEL) + SOC > VIC * vniters + VOC
4306 <==> vniters * (SIC * VF - VIC) > VOC - SIC * NPEEL - SOC
4308 For integer N, X and Y when X > 0:
4310 N * X > Y <==> N >= (Y /[floor] X) + 1. */
4311 int outside_overhead = (vec_outside_cost
4312 - scalar_single_iter_cost * peel_iters_prologue
4313 - scalar_single_iter_cost * peel_iters_epilogue
4314 - scalar_outside_cost);
4315 /* We're only interested in cases that require at least one
4316 vector iteration. */
4317 int min_vec_niters = 1;
4318 if (outside_overhead > 0)
4319 min_vec_niters = outside_overhead / saving_per_viter + 1;
4321 if (dump_enabled_p ())
4322 dump_printf (MSG_NOTE, " Minimum number of vector iterations: %d\n",
4323 min_vec_niters);
4325 if (LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))
4327 /* Now that we know the minimum number of vector iterations,
4328 find the minimum niters for which the scalar cost is larger:
4330 SIC * niters > VIC * vniters + VOC - SOC
4332 We know that the minimum niters is no more than
4333 vniters * VF + NPEEL, but it might be (and often is) less
4334 than that if a partial vector iteration is cheaper than the
4335 equivalent scalar code. */
4336 int threshold = (vec_inside_cost * min_vec_niters
4337 + vec_outside_cost
4338 - scalar_outside_cost);
4339 if (threshold <= 0)
4340 min_profitable_iters = 1;
4341 else
4342 min_profitable_iters = threshold / scalar_single_iter_cost + 1;
4344 else
4345 /* Convert the number of vector iterations into a number of
4346 scalar iterations. */
4347 min_profitable_iters = (min_vec_niters * assumed_vf
4348 + peel_iters_prologue
4349 + peel_iters_epilogue);
4351 else
4353 min_profitable_iters = ((vec_outside_cost - scalar_outside_cost)
4354 * assumed_vf
4355 - vec_inside_cost * peel_iters_prologue
4356 - vec_inside_cost * peel_iters_epilogue);
4357 if (min_profitable_iters <= 0)
4358 min_profitable_iters = 0;
4359 else
4361 min_profitable_iters /= saving_per_viter;
4363 if ((scalar_single_iter_cost * assumed_vf * min_profitable_iters)
4364 <= (((int) vec_inside_cost * min_profitable_iters)
4365 + (((int) vec_outside_cost - scalar_outside_cost)
4366 * assumed_vf)))
4367 min_profitable_iters++;
4371 if (dump_enabled_p ())
4372 dump_printf (MSG_NOTE,
4373 " Calculated minimum iters for profitability: %d\n",
4374 min_profitable_iters);
4376 if (!LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo)
4377 && min_profitable_iters < (assumed_vf + peel_iters_prologue))
4378 /* We want the vectorized loop to execute at least once. */
4379 min_profitable_iters = assumed_vf + peel_iters_prologue;
4380 else if (min_profitable_iters < peel_iters_prologue)
4381 /* For LOOP_VINFO_USING_PARTIAL_VECTORS_P, we need to ensure the
4382 vectorized loop executes at least once. */
4383 min_profitable_iters = peel_iters_prologue;
4385 if (dump_enabled_p ())
4386 dump_printf_loc (MSG_NOTE, vect_location,
4387 " Runtime profitability threshold = %d\n",
4388 min_profitable_iters);
4390 *ret_min_profitable_niters = min_profitable_iters;
4392 /* Calculate number of iterations required to make the vector version
4393 profitable, relative to the loop bodies only.
4395 Non-vectorized variant is SIC * niters and it must win over vector
4396 variant on the expected loop trip count. The following condition must hold true:
4397 SIC * niters > VIC * ((niters - NPEEL) / VF) + VOC + SOC */
4399 if (vec_outside_cost <= 0)
4400 min_profitable_estimate = 0;
4401 /* ??? This "else if" arm is written to handle all cases; see below for
4402 what we would do for !LOOP_VINFO_USING_PARTIAL_VECTORS_P. */
4403 else if (LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))
4405 /* This is a repeat of the code above, but with + SOC rather
4406 than - SOC. */
4407 int outside_overhead = (vec_outside_cost
4408 - scalar_single_iter_cost * peel_iters_prologue
4409 - scalar_single_iter_cost * peel_iters_epilogue
4410 + scalar_outside_cost);
4411 int min_vec_niters = 1;
4412 if (outside_overhead > 0)
4413 min_vec_niters = outside_overhead / saving_per_viter + 1;
4415 if (LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))
4417 int threshold = (vec_inside_cost * min_vec_niters
4418 + vec_outside_cost
4419 + scalar_outside_cost);
4420 min_profitable_estimate = threshold / scalar_single_iter_cost + 1;
4422 else
4423 min_profitable_estimate = (min_vec_niters * assumed_vf
4424 + peel_iters_prologue
4425 + peel_iters_epilogue);
4427 else
4429 min_profitable_estimate = ((vec_outside_cost + scalar_outside_cost)
4430 * assumed_vf
4431 - vec_inside_cost * peel_iters_prologue
4432 - vec_inside_cost * peel_iters_epilogue)
4433 / ((scalar_single_iter_cost * assumed_vf)
4434 - vec_inside_cost);
4436 min_profitable_estimate = MAX (min_profitable_estimate, min_profitable_iters);
4437 if (dump_enabled_p ())
4438 dump_printf_loc (MSG_NOTE, vect_location,
4439 " Static estimate profitability threshold = %d\n",
4440 min_profitable_estimate);
4442 *ret_min_profitable_estimate = min_profitable_estimate;
4445 /* Writes into SEL a mask for a vec_perm, equivalent to a vec_shr by OFFSET
4446 vector elements (not bits) for a vector with NELT elements. */
4447 static void
4448 calc_vec_perm_mask_for_shift (unsigned int offset, unsigned int nelt,
4449 vec_perm_builder *sel)
4451 /* The encoding is a single stepped pattern. Any wrap-around is handled
4452 by vec_perm_indices. */
4453 sel->new_vector (nelt, 1, 3);
4454 for (unsigned int i = 0; i < 3; i++)
4455 sel->quick_push (i + offset);
4458 /* Checks whether the target supports whole-vector shifts for vectors of mode
4459 MODE. This is the case if _either_ the platform handles vec_shr_optab, _or_
4460 it supports vec_perm_const with masks for all necessary shift amounts. */
4461 static bool
4462 have_whole_vector_shift (machine_mode mode)
4464 if (optab_handler (vec_shr_optab, mode) != CODE_FOR_nothing)
4465 return true;
4467 /* Variable-length vectors should be handled via the optab. */
4468 unsigned int nelt;
4469 if (!GET_MODE_NUNITS (mode).is_constant (&nelt))
4470 return false;
4472 vec_perm_builder sel;
4473 vec_perm_indices indices;
4474 for (unsigned int i = nelt / 2; i >= 1; i /= 2)
4476 calc_vec_perm_mask_for_shift (i, nelt, &sel);
4477 indices.new_vector (sel, 2, nelt);
4478 if (!can_vec_perm_const_p (mode, indices, false))
4479 return false;
4481 return true;
4484 /* TODO: Close dependency between vect_model_*_cost and vectorizable_*
4485 functions. Design better to avoid maintenance issues. */
4487 /* Function vect_model_reduction_cost.
4489 Models cost for a reduction operation, including the vector ops
4490 generated within the strip-mine loop in some cases, the initial
4491 definition before the loop, and the epilogue code that must be generated. */
4493 static void
4494 vect_model_reduction_cost (loop_vec_info loop_vinfo,
4495 stmt_vec_info stmt_info, internal_fn reduc_fn,
4496 vect_reduction_type reduction_type,
4497 int ncopies, stmt_vector_for_cost *cost_vec)
4499 int prologue_cost = 0, epilogue_cost = 0, inside_cost = 0;
4500 enum tree_code code;
4501 optab optab;
4502 tree vectype;
4503 machine_mode mode;
4504 class loop *loop = NULL;
4506 if (loop_vinfo)
4507 loop = LOOP_VINFO_LOOP (loop_vinfo);
4509 /* Condition reductions generate two reductions in the loop. */
4510 if (reduction_type == COND_REDUCTION)
4511 ncopies *= 2;
4513 vectype = STMT_VINFO_VECTYPE (stmt_info);
4514 mode = TYPE_MODE (vectype);
4515 stmt_vec_info orig_stmt_info = vect_orig_stmt (stmt_info);
4517 code = gimple_assign_rhs_code (orig_stmt_info->stmt);
4519 if (reduction_type == EXTRACT_LAST_REDUCTION)
4520 /* No extra instructions are needed in the prologue. The loop body
4521 operations are costed in vectorizable_condition. */
4522 inside_cost = 0;
4523 else if (reduction_type == FOLD_LEFT_REDUCTION)
4525 /* No extra instructions needed in the prologue. */
4526 prologue_cost = 0;
4528 if (reduc_fn != IFN_LAST)
4529 /* Count one reduction-like operation per vector. */
4530 inside_cost = record_stmt_cost (cost_vec, ncopies, vec_to_scalar,
4531 stmt_info, 0, vect_body);
4532 else
4534 /* Use NELEMENTS extracts and NELEMENTS scalar ops. */
4535 unsigned int nelements = ncopies * vect_nunits_for_cost (vectype);
4536 inside_cost = record_stmt_cost (cost_vec, nelements,
4537 vec_to_scalar, stmt_info, 0,
4538 vect_body);
4539 inside_cost += record_stmt_cost (cost_vec, nelements,
4540 scalar_stmt, stmt_info, 0,
4541 vect_body);
4544 else
4546 /* Add in cost for initial definition.
4547 For cond reduction we have four vectors: initial index, step,
4548 initial result of the data reduction, initial value of the index
4549 reduction. */
4550 int prologue_stmts = reduction_type == COND_REDUCTION ? 4 : 1;
4551 prologue_cost += record_stmt_cost (cost_vec, prologue_stmts,
4552 scalar_to_vec, stmt_info, 0,
4553 vect_prologue);
4556 /* Determine cost of epilogue code.
4558 We have a reduction operator that will reduce the vector in one statement.
4559 Also requires scalar extract. */
4561 if (!loop || !nested_in_vect_loop_p (loop, orig_stmt_info))
4563 if (reduc_fn != IFN_LAST)
4565 if (reduction_type == COND_REDUCTION)
4567 /* An EQ stmt and an COND_EXPR stmt. */
4568 epilogue_cost += record_stmt_cost (cost_vec, 2,
4569 vector_stmt, stmt_info, 0,
4570 vect_epilogue);
4571 /* Reduction of the max index and a reduction of the found
4572 values. */
4573 epilogue_cost += record_stmt_cost (cost_vec, 2,
4574 vec_to_scalar, stmt_info, 0,
4575 vect_epilogue);
4576 /* A broadcast of the max value. */
4577 epilogue_cost += record_stmt_cost (cost_vec, 1,
4578 scalar_to_vec, stmt_info, 0,
4579 vect_epilogue);
4581 else
4583 epilogue_cost += record_stmt_cost (cost_vec, 1, vector_stmt,
4584 stmt_info, 0, vect_epilogue);
4585 epilogue_cost += record_stmt_cost (cost_vec, 1,
4586 vec_to_scalar, stmt_info, 0,
4587 vect_epilogue);
4590 else if (reduction_type == COND_REDUCTION)
4592 unsigned estimated_nunits = vect_nunits_for_cost (vectype);
4593 /* Extraction of scalar elements. */
4594 epilogue_cost += record_stmt_cost (cost_vec,
4595 2 * estimated_nunits,
4596 vec_to_scalar, stmt_info, 0,
4597 vect_epilogue);
4598 /* Scalar max reductions via COND_EXPR / MAX_EXPR. */
4599 epilogue_cost += record_stmt_cost (cost_vec,
4600 2 * estimated_nunits - 3,
4601 scalar_stmt, stmt_info, 0,
4602 vect_epilogue);
4604 else if (reduction_type == EXTRACT_LAST_REDUCTION
4605 || reduction_type == FOLD_LEFT_REDUCTION)
4606 /* No extra instructions need in the epilogue. */
4608 else
4610 int vec_size_in_bits = tree_to_uhwi (TYPE_SIZE (vectype));
4611 tree bitsize =
4612 TYPE_SIZE (TREE_TYPE (gimple_assign_lhs (orig_stmt_info->stmt)));
4613 int element_bitsize = tree_to_uhwi (bitsize);
4614 int nelements = vec_size_in_bits / element_bitsize;
4616 if (code == COND_EXPR)
4617 code = MAX_EXPR;
4619 optab = optab_for_tree_code (code, vectype, optab_default);
4621 /* We have a whole vector shift available. */
4622 if (optab != unknown_optab
4623 && VECTOR_MODE_P (mode)
4624 && optab_handler (optab, mode) != CODE_FOR_nothing
4625 && have_whole_vector_shift (mode))
4627 /* Final reduction via vector shifts and the reduction operator.
4628 Also requires scalar extract. */
4629 epilogue_cost += record_stmt_cost (cost_vec,
4630 exact_log2 (nelements) * 2,
4631 vector_stmt, stmt_info, 0,
4632 vect_epilogue);
4633 epilogue_cost += record_stmt_cost (cost_vec, 1,
4634 vec_to_scalar, stmt_info, 0,
4635 vect_epilogue);
4637 else
4638 /* Use extracts and reduction op for final reduction. For N
4639 elements, we have N extracts and N-1 reduction ops. */
4640 epilogue_cost += record_stmt_cost (cost_vec,
4641 nelements + nelements - 1,
4642 vector_stmt, stmt_info, 0,
4643 vect_epilogue);
4647 if (dump_enabled_p ())
4648 dump_printf (MSG_NOTE,
4649 "vect_model_reduction_cost: inside_cost = %d, "
4650 "prologue_cost = %d, epilogue_cost = %d .\n", inside_cost,
4651 prologue_cost, epilogue_cost);
4654 /* SEQ is a sequence of instructions that initialize the reduction
4655 described by REDUC_INFO. Emit them in the appropriate place. */
4657 static void
4658 vect_emit_reduction_init_stmts (loop_vec_info loop_vinfo,
4659 stmt_vec_info reduc_info, gimple *seq)
4661 if (reduc_info->reused_accumulator)
4663 /* When reusing an accumulator from the main loop, we only need
4664 initialization instructions if the main loop can be skipped.
4665 In that case, emit the initialization instructions at the end
4666 of the guard block that does the skip. */
4667 edge skip_edge = loop_vinfo->skip_main_loop_edge;
4668 gcc_assert (skip_edge);
4669 gimple_stmt_iterator gsi = gsi_last_bb (skip_edge->src);
4670 gsi_insert_seq_before (&gsi, seq, GSI_SAME_STMT);
4672 else
4674 /* The normal case: emit the initialization instructions on the
4675 preheader edge. */
4676 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
4677 gsi_insert_seq_on_edge_immediate (loop_preheader_edge (loop), seq);
4681 /* Function get_initial_def_for_reduction
4683 Input:
4684 REDUC_INFO - the info_for_reduction
4685 INIT_VAL - the initial value of the reduction variable
4686 NEUTRAL_OP - a value that has no effect on the reduction, as per
4687 neutral_op_for_reduction
4689 Output:
4690 Return a vector variable, initialized according to the operation that
4691 STMT_VINFO performs. This vector will be used as the initial value
4692 of the vector of partial results.
4694 The value we need is a vector in which element 0 has value INIT_VAL
4695 and every other element has value NEUTRAL_OP. */
4697 static tree
4698 get_initial_def_for_reduction (loop_vec_info loop_vinfo,
4699 stmt_vec_info reduc_info,
4700 tree init_val, tree neutral_op)
4702 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
4703 tree scalar_type = TREE_TYPE (init_val);
4704 tree vectype = get_vectype_for_scalar_type (loop_vinfo, scalar_type);
4705 tree init_def;
4706 gimple_seq stmts = NULL;
4708 gcc_assert (vectype);
4710 gcc_assert (POINTER_TYPE_P (scalar_type) || INTEGRAL_TYPE_P (scalar_type)
4711 || SCALAR_FLOAT_TYPE_P (scalar_type));
4713 gcc_assert (nested_in_vect_loop_p (loop, reduc_info)
4714 || loop == (gimple_bb (reduc_info->stmt))->loop_father);
4716 if (operand_equal_p (init_val, neutral_op))
4718 /* If both elements are equal then the vector described above is
4719 just a splat. */
4720 neutral_op = gimple_convert (&stmts, TREE_TYPE (vectype), neutral_op);
4721 init_def = gimple_build_vector_from_val (&stmts, vectype, neutral_op);
4723 else
4725 neutral_op = gimple_convert (&stmts, TREE_TYPE (vectype), neutral_op);
4726 init_val = gimple_convert (&stmts, TREE_TYPE (vectype), init_val);
4727 if (!TYPE_VECTOR_SUBPARTS (vectype).is_constant ())
4729 /* Construct a splat of NEUTRAL_OP and insert INIT_VAL into
4730 element 0. */
4731 init_def = gimple_build_vector_from_val (&stmts, vectype,
4732 neutral_op);
4733 init_def = gimple_build (&stmts, CFN_VEC_SHL_INSERT,
4734 vectype, init_def, init_val);
4736 else
4738 /* Build {INIT_VAL, NEUTRAL_OP, NEUTRAL_OP, ...}. */
4739 tree_vector_builder elts (vectype, 1, 2);
4740 elts.quick_push (init_val);
4741 elts.quick_push (neutral_op);
4742 init_def = gimple_build_vector (&stmts, &elts);
4746 if (stmts)
4747 vect_emit_reduction_init_stmts (loop_vinfo, reduc_info, stmts);
4748 return init_def;
4751 /* Get at the initial defs for the reduction PHIs for REDUC_INFO,
4752 which performs a reduction involving GROUP_SIZE scalar statements.
4753 NUMBER_OF_VECTORS is the number of vector defs to create. If NEUTRAL_OP
4754 is nonnull, introducing extra elements of that value will not change the
4755 result. */
4757 static void
4758 get_initial_defs_for_reduction (loop_vec_info loop_vinfo,
4759 stmt_vec_info reduc_info,
4760 vec<tree> *vec_oprnds,
4761 unsigned int number_of_vectors,
4762 unsigned int group_size, tree neutral_op)
4764 vec<tree> &initial_values = reduc_info->reduc_initial_values;
4765 unsigned HOST_WIDE_INT nunits;
4766 unsigned j, number_of_places_left_in_vector;
4767 tree vector_type = STMT_VINFO_VECTYPE (reduc_info);
4768 unsigned int i;
4770 gcc_assert (group_size == initial_values.length () || neutral_op);
4772 /* NUMBER_OF_COPIES is the number of times we need to use the same values in
4773 created vectors. It is greater than 1 if unrolling is performed.
4775 For example, we have two scalar operands, s1 and s2 (e.g., group of
4776 strided accesses of size two), while NUNITS is four (i.e., four scalars
4777 of this type can be packed in a vector). The output vector will contain
4778 two copies of each scalar operand: {s1, s2, s1, s2}. (NUMBER_OF_COPIES
4779 will be 2).
4781 If REDUC_GROUP_SIZE > NUNITS, the scalars will be split into several
4782 vectors containing the operands.
4784 For example, NUNITS is four as before, and the group size is 8
4785 (s1, s2, ..., s8). We will create two vectors {s1, s2, s3, s4} and
4786 {s5, s6, s7, s8}. */
4788 if (!TYPE_VECTOR_SUBPARTS (vector_type).is_constant (&nunits))
4789 nunits = group_size;
4791 number_of_places_left_in_vector = nunits;
4792 bool constant_p = true;
4793 tree_vector_builder elts (vector_type, nunits, 1);
4794 elts.quick_grow (nunits);
4795 gimple_seq ctor_seq = NULL;
4796 for (j = 0; j < nunits * number_of_vectors; ++j)
4798 tree op;
4799 i = j % group_size;
4801 /* Get the def before the loop. In reduction chain we have only
4802 one initial value. Else we have as many as PHIs in the group. */
4803 if (i >= initial_values.length () || (j > i && neutral_op))
4804 op = neutral_op;
4805 else
4806 op = initial_values[i];
4808 /* Create 'vect_ = {op0,op1,...,opn}'. */
4809 number_of_places_left_in_vector--;
4810 elts[nunits - number_of_places_left_in_vector - 1] = op;
4811 if (!CONSTANT_CLASS_P (op))
4812 constant_p = false;
4814 if (number_of_places_left_in_vector == 0)
4816 tree init;
4817 if (constant_p && !neutral_op
4818 ? multiple_p (TYPE_VECTOR_SUBPARTS (vector_type), nunits)
4819 : known_eq (TYPE_VECTOR_SUBPARTS (vector_type), nunits))
4820 /* Build the vector directly from ELTS. */
4821 init = gimple_build_vector (&ctor_seq, &elts);
4822 else if (neutral_op)
4824 /* Build a vector of the neutral value and shift the
4825 other elements into place. */
4826 init = gimple_build_vector_from_val (&ctor_seq, vector_type,
4827 neutral_op);
4828 int k = nunits;
4829 while (k > 0 && elts[k - 1] == neutral_op)
4830 k -= 1;
4831 while (k > 0)
4833 k -= 1;
4834 init = gimple_build (&ctor_seq, CFN_VEC_SHL_INSERT,
4835 vector_type, init, elts[k]);
4838 else
4840 /* First time round, duplicate ELTS to fill the
4841 required number of vectors. */
4842 duplicate_and_interleave (loop_vinfo, &ctor_seq, vector_type,
4843 elts, number_of_vectors, *vec_oprnds);
4844 break;
4846 vec_oprnds->quick_push (init);
4848 number_of_places_left_in_vector = nunits;
4849 elts.new_vector (vector_type, nunits, 1);
4850 elts.quick_grow (nunits);
4851 constant_p = true;
4854 if (ctor_seq != NULL)
4855 vect_emit_reduction_init_stmts (loop_vinfo, reduc_info, ctor_seq);
4858 /* For a statement STMT_INFO taking part in a reduction operation return
4859 the stmt_vec_info the meta information is stored on. */
4861 stmt_vec_info
4862 info_for_reduction (vec_info *vinfo, stmt_vec_info stmt_info)
4864 stmt_info = vect_orig_stmt (stmt_info);
4865 gcc_assert (STMT_VINFO_REDUC_DEF (stmt_info));
4866 if (!is_a <gphi *> (stmt_info->stmt)
4867 || !VECTORIZABLE_CYCLE_DEF (STMT_VINFO_DEF_TYPE (stmt_info)))
4868 stmt_info = STMT_VINFO_REDUC_DEF (stmt_info);
4869 gphi *phi = as_a <gphi *> (stmt_info->stmt);
4870 if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_double_reduction_def)
4872 if (gimple_phi_num_args (phi) == 1)
4873 stmt_info = STMT_VINFO_REDUC_DEF (stmt_info);
4875 else if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_nested_cycle)
4877 stmt_vec_info info = vinfo->lookup_def (vect_phi_initial_value (phi));
4878 if (info && STMT_VINFO_DEF_TYPE (info) == vect_double_reduction_def)
4879 stmt_info = info;
4881 return stmt_info;
4884 /* See if LOOP_VINFO is an epilogue loop whose main loop had a reduction that
4885 REDUC_INFO can build on. Adjust REDUC_INFO and return true if so, otherwise
4886 return false. */
4888 static bool
4889 vect_find_reusable_accumulator (loop_vec_info loop_vinfo,
4890 stmt_vec_info reduc_info)
4892 loop_vec_info main_loop_vinfo = LOOP_VINFO_ORIG_LOOP_INFO (loop_vinfo);
4893 if (!main_loop_vinfo)
4894 return false;
4896 if (STMT_VINFO_REDUC_TYPE (reduc_info) != TREE_CODE_REDUCTION)
4897 return false;
4899 unsigned int num_phis = reduc_info->reduc_initial_values.length ();
4900 auto_vec<tree, 16> main_loop_results (num_phis);
4901 auto_vec<tree, 16> initial_values (num_phis);
4902 if (edge main_loop_edge = loop_vinfo->main_loop_edge)
4904 /* The epilogue loop can be entered either from the main loop or
4905 from an earlier guard block. */
4906 edge skip_edge = loop_vinfo->skip_main_loop_edge;
4907 for (tree incoming_value : reduc_info->reduc_initial_values)
4909 /* Look for:
4911 INCOMING_VALUE = phi<MAIN_LOOP_RESULT(main loop),
4912 INITIAL_VALUE(guard block)>. */
4913 gcc_assert (TREE_CODE (incoming_value) == SSA_NAME);
4915 gphi *phi = as_a <gphi *> (SSA_NAME_DEF_STMT (incoming_value));
4916 gcc_assert (gimple_bb (phi) == main_loop_edge->dest);
4918 tree from_main_loop = PHI_ARG_DEF_FROM_EDGE (phi, main_loop_edge);
4919 tree from_skip = PHI_ARG_DEF_FROM_EDGE (phi, skip_edge);
4921 main_loop_results.quick_push (from_main_loop);
4922 initial_values.quick_push (from_skip);
4925 else
4926 /* The main loop dominates the epilogue loop. */
4927 main_loop_results.splice (reduc_info->reduc_initial_values);
4929 /* See if the main loop has the kind of accumulator we need. */
4930 vect_reusable_accumulator *accumulator
4931 = main_loop_vinfo->reusable_accumulators.get (main_loop_results[0]);
4932 if (!accumulator
4933 || num_phis != accumulator->reduc_info->reduc_scalar_results.length ()
4934 || !std::equal (main_loop_results.begin (), main_loop_results.end (),
4935 accumulator->reduc_info->reduc_scalar_results.begin ()))
4936 return false;
4938 /* Handle the case where we can reduce wider vectors to narrower ones. */
4939 tree vectype = STMT_VINFO_VECTYPE (reduc_info);
4940 tree old_vectype = TREE_TYPE (accumulator->reduc_input);
4941 if (!constant_multiple_p (TYPE_VECTOR_SUBPARTS (old_vectype),
4942 TYPE_VECTOR_SUBPARTS (vectype)))
4943 return false;
4945 /* Non-SLP reductions might apply an adjustment after the reduction
4946 operation, in order to simplify the initialization of the accumulator.
4947 If the epilogue loop carries on from where the main loop left off,
4948 it should apply the same adjustment to the final reduction result.
4950 If the epilogue loop can also be entered directly (rather than via
4951 the main loop), we need to be able to handle that case in the same way,
4952 with the same adjustment. (In principle we could add a PHI node
4953 to select the correct adjustment, but in practice that shouldn't be
4954 necessary.) */
4955 tree main_adjustment
4956 = STMT_VINFO_REDUC_EPILOGUE_ADJUSTMENT (accumulator->reduc_info);
4957 if (loop_vinfo->main_loop_edge && main_adjustment)
4959 gcc_assert (num_phis == 1);
4960 tree initial_value = initial_values[0];
4961 /* Check that we can use INITIAL_VALUE as the adjustment and
4962 initialize the accumulator with a neutral value instead. */
4963 if (!operand_equal_p (initial_value, main_adjustment))
4964 return false;
4965 tree_code code = STMT_VINFO_REDUC_CODE (reduc_info);
4966 initial_values[0] = neutral_op_for_reduction (TREE_TYPE (initial_value),
4967 code, initial_value);
4969 STMT_VINFO_REDUC_EPILOGUE_ADJUSTMENT (reduc_info) = main_adjustment;
4970 reduc_info->reduc_initial_values.truncate (0);
4971 reduc_info->reduc_initial_values.splice (initial_values);
4972 reduc_info->reused_accumulator = accumulator;
4973 return true;
4976 /* Reduce the vector VEC_DEF down to VECTYPE with reduction operation
4977 CODE emitting stmts before GSI. Returns a vector def of VECTYPE. */
4979 static tree
4980 vect_create_partial_epilog (tree vec_def, tree vectype, enum tree_code code,
4981 gimple_seq *seq)
4983 unsigned nunits = TYPE_VECTOR_SUBPARTS (TREE_TYPE (vec_def)).to_constant ();
4984 unsigned nunits1 = TYPE_VECTOR_SUBPARTS (vectype).to_constant ();
4985 tree stype = TREE_TYPE (vectype);
4986 tree new_temp = vec_def;
4987 while (nunits > nunits1)
4989 nunits /= 2;
4990 tree vectype1 = get_related_vectype_for_scalar_type (TYPE_MODE (vectype),
4991 stype, nunits);
4992 unsigned int bitsize = tree_to_uhwi (TYPE_SIZE (vectype1));
4994 /* The target has to make sure we support lowpart/highpart
4995 extraction, either via direct vector extract or through
4996 an integer mode punning. */
4997 tree dst1, dst2;
4998 gimple *epilog_stmt;
4999 if (convert_optab_handler (vec_extract_optab,
5000 TYPE_MODE (TREE_TYPE (new_temp)),
5001 TYPE_MODE (vectype1))
5002 != CODE_FOR_nothing)
5004 /* Extract sub-vectors directly once vec_extract becomes
5005 a conversion optab. */
5006 dst1 = make_ssa_name (vectype1);
5007 epilog_stmt
5008 = gimple_build_assign (dst1, BIT_FIELD_REF,
5009 build3 (BIT_FIELD_REF, vectype1,
5010 new_temp, TYPE_SIZE (vectype1),
5011 bitsize_int (0)));
5012 gimple_seq_add_stmt_without_update (seq, epilog_stmt);
5013 dst2 = make_ssa_name (vectype1);
5014 epilog_stmt
5015 = gimple_build_assign (dst2, BIT_FIELD_REF,
5016 build3 (BIT_FIELD_REF, vectype1,
5017 new_temp, TYPE_SIZE (vectype1),
5018 bitsize_int (bitsize)));
5019 gimple_seq_add_stmt_without_update (seq, epilog_stmt);
5021 else
5023 /* Extract via punning to appropriately sized integer mode
5024 vector. */
5025 tree eltype = build_nonstandard_integer_type (bitsize, 1);
5026 tree etype = build_vector_type (eltype, 2);
5027 gcc_assert (convert_optab_handler (vec_extract_optab,
5028 TYPE_MODE (etype),
5029 TYPE_MODE (eltype))
5030 != CODE_FOR_nothing);
5031 tree tem = make_ssa_name (etype);
5032 epilog_stmt = gimple_build_assign (tem, VIEW_CONVERT_EXPR,
5033 build1 (VIEW_CONVERT_EXPR,
5034 etype, new_temp));
5035 gimple_seq_add_stmt_without_update (seq, epilog_stmt);
5036 new_temp = tem;
5037 tem = make_ssa_name (eltype);
5038 epilog_stmt
5039 = gimple_build_assign (tem, BIT_FIELD_REF,
5040 build3 (BIT_FIELD_REF, eltype,
5041 new_temp, TYPE_SIZE (eltype),
5042 bitsize_int (0)));
5043 gimple_seq_add_stmt_without_update (seq, epilog_stmt);
5044 dst1 = make_ssa_name (vectype1);
5045 epilog_stmt = gimple_build_assign (dst1, VIEW_CONVERT_EXPR,
5046 build1 (VIEW_CONVERT_EXPR,
5047 vectype1, tem));
5048 gimple_seq_add_stmt_without_update (seq, epilog_stmt);
5049 tem = make_ssa_name (eltype);
5050 epilog_stmt
5051 = gimple_build_assign (tem, BIT_FIELD_REF,
5052 build3 (BIT_FIELD_REF, eltype,
5053 new_temp, TYPE_SIZE (eltype),
5054 bitsize_int (bitsize)));
5055 gimple_seq_add_stmt_without_update (seq, epilog_stmt);
5056 dst2 = make_ssa_name (vectype1);
5057 epilog_stmt = gimple_build_assign (dst2, VIEW_CONVERT_EXPR,
5058 build1 (VIEW_CONVERT_EXPR,
5059 vectype1, tem));
5060 gimple_seq_add_stmt_without_update (seq, epilog_stmt);
5063 new_temp = make_ssa_name (vectype1);
5064 epilog_stmt = gimple_build_assign (new_temp, code, dst1, dst2);
5065 gimple_seq_add_stmt_without_update (seq, epilog_stmt);
5068 return new_temp;
5071 /* Function vect_create_epilog_for_reduction
5073 Create code at the loop-epilog to finalize the result of a reduction
5074 computation.
5076 STMT_INFO is the scalar reduction stmt that is being vectorized.
5077 SLP_NODE is an SLP node containing a group of reduction statements. The
5078 first one in this group is STMT_INFO.
5079 SLP_NODE_INSTANCE is the SLP node instance containing SLP_NODE
5080 REDUC_INDEX says which rhs operand of the STMT_INFO is the reduction phi
5081 (counting from 0)
5083 This function:
5084 1. Completes the reduction def-use cycles.
5085 2. "Reduces" each vector of partial results VECT_DEFS into a single result,
5086 by calling the function specified by REDUC_FN if available, or by
5087 other means (whole-vector shifts or a scalar loop).
5088 The function also creates a new phi node at the loop exit to preserve
5089 loop-closed form, as illustrated below.
5091 The flow at the entry to this function:
5093 loop:
5094 vec_def = phi <vec_init, null> # REDUCTION_PHI
5095 VECT_DEF = vector_stmt # vectorized form of STMT_INFO
5096 s_loop = scalar_stmt # (scalar) STMT_INFO
5097 loop_exit:
5098 s_out0 = phi <s_loop> # (scalar) EXIT_PHI
5099 use <s_out0>
5100 use <s_out0>
5102 The above is transformed by this function into:
5104 loop:
5105 vec_def = phi <vec_init, VECT_DEF> # REDUCTION_PHI
5106 VECT_DEF = vector_stmt # vectorized form of STMT_INFO
5107 s_loop = scalar_stmt # (scalar) STMT_INFO
5108 loop_exit:
5109 s_out0 = phi <s_loop> # (scalar) EXIT_PHI
5110 v_out1 = phi <VECT_DEF> # NEW_EXIT_PHI
5111 v_out2 = reduce <v_out1>
5112 s_out3 = extract_field <v_out2, 0>
5113 s_out4 = adjust_result <s_out3>
5114 use <s_out4>
5115 use <s_out4>
5118 static void
5119 vect_create_epilog_for_reduction (loop_vec_info loop_vinfo,
5120 stmt_vec_info stmt_info,
5121 slp_tree slp_node,
5122 slp_instance slp_node_instance)
5124 stmt_vec_info reduc_info = info_for_reduction (loop_vinfo, stmt_info);
5125 gcc_assert (reduc_info->is_reduc_info);
5126 /* For double reductions we need to get at the inner loop reduction
5127 stmt which has the meta info attached. Our stmt_info is that of the
5128 loop-closed PHI of the inner loop which we remember as
5129 def for the reduction PHI generation. */
5130 bool double_reduc = false;
5131 stmt_vec_info rdef_info = stmt_info;
5132 if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_double_reduction_def)
5134 gcc_assert (!slp_node);
5135 double_reduc = true;
5136 stmt_info = loop_vinfo->lookup_def (gimple_phi_arg_def
5137 (stmt_info->stmt, 0));
5138 stmt_info = vect_stmt_to_vectorize (stmt_info);
5140 gphi *reduc_def_stmt
5141 = as_a <gphi *> (STMT_VINFO_REDUC_DEF (vect_orig_stmt (stmt_info))->stmt);
5142 enum tree_code code = STMT_VINFO_REDUC_CODE (reduc_info);
5143 internal_fn reduc_fn = STMT_VINFO_REDUC_FN (reduc_info);
5144 tree vectype;
5145 machine_mode mode;
5146 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo), *outer_loop = NULL;
5147 basic_block exit_bb;
5148 tree scalar_dest;
5149 tree scalar_type;
5150 gimple *new_phi = NULL, *phi;
5151 gimple_stmt_iterator exit_gsi;
5152 tree new_temp = NULL_TREE, new_name, new_scalar_dest;
5153 gimple *epilog_stmt = NULL;
5154 gimple *exit_phi;
5155 tree bitsize;
5156 tree def;
5157 tree orig_name, scalar_result;
5158 imm_use_iterator imm_iter, phi_imm_iter;
5159 use_operand_p use_p, phi_use_p;
5160 gimple *use_stmt;
5161 auto_vec<tree> reduc_inputs;
5162 int j, i;
5163 vec<tree> &scalar_results = reduc_info->reduc_scalar_results;
5164 unsigned int group_size = 1, k;
5165 auto_vec<gimple *> phis;
5166 /* SLP reduction without reduction chain, e.g.,
5167 # a1 = phi <a2, a0>
5168 # b1 = phi <b2, b0>
5169 a2 = operation (a1)
5170 b2 = operation (b1) */
5171 bool slp_reduc = (slp_node && !REDUC_GROUP_FIRST_ELEMENT (stmt_info));
5172 bool direct_slp_reduc;
5173 tree induction_index = NULL_TREE;
5175 if (slp_node)
5176 group_size = SLP_TREE_LANES (slp_node);
5178 if (nested_in_vect_loop_p (loop, stmt_info))
5180 outer_loop = loop;
5181 loop = loop->inner;
5182 gcc_assert (!slp_node && double_reduc);
5185 vectype = STMT_VINFO_REDUC_VECTYPE (reduc_info);
5186 gcc_assert (vectype);
5187 mode = TYPE_MODE (vectype);
5189 tree induc_val = NULL_TREE;
5190 tree adjustment_def = NULL;
5191 if (slp_node)
5193 else
5195 /* Optimize: for induction condition reduction, if we can't use zero
5196 for induc_val, use initial_def. */
5197 if (STMT_VINFO_REDUC_TYPE (reduc_info) == INTEGER_INDUC_COND_REDUCTION)
5198 induc_val = STMT_VINFO_VEC_INDUC_COND_INITIAL_VAL (reduc_info);
5199 else if (double_reduc)
5201 else
5202 adjustment_def = STMT_VINFO_REDUC_EPILOGUE_ADJUSTMENT (reduc_info);
5205 stmt_vec_info single_live_out_stmt[] = { stmt_info };
5206 array_slice<const stmt_vec_info> live_out_stmts = single_live_out_stmt;
5207 if (slp_reduc)
5208 /* All statements produce live-out values. */
5209 live_out_stmts = SLP_TREE_SCALAR_STMTS (slp_node);
5210 else if (slp_node)
5211 /* The last statement in the reduction chain produces the live-out
5212 value. */
5213 single_live_out_stmt[0] = SLP_TREE_SCALAR_STMTS (slp_node)[group_size - 1];
5215 unsigned vec_num;
5216 int ncopies;
5217 if (slp_node)
5219 vec_num = SLP_TREE_VEC_STMTS (slp_node_instance->reduc_phis).length ();
5220 ncopies = 1;
5222 else
5224 stmt_vec_info reduc_info = loop_vinfo->lookup_stmt (reduc_def_stmt);
5225 vec_num = 1;
5226 ncopies = STMT_VINFO_VEC_STMTS (reduc_info).length ();
5229 /* For cond reductions we want to create a new vector (INDEX_COND_EXPR)
5230 which is updated with the current index of the loop for every match of
5231 the original loop's cond_expr (VEC_STMT). This results in a vector
5232 containing the last time the condition passed for that vector lane.
5233 The first match will be a 1 to allow 0 to be used for non-matching
5234 indexes. If there are no matches at all then the vector will be all
5235 zeroes.
5237 PR92772: This algorithm is broken for architectures that support
5238 masked vectors, but do not provide fold_extract_last. */
5239 if (STMT_VINFO_REDUC_TYPE (reduc_info) == COND_REDUCTION)
5241 auto_vec<std::pair<tree, bool>, 2> ccompares;
5242 stmt_vec_info cond_info = STMT_VINFO_REDUC_DEF (reduc_info);
5243 cond_info = vect_stmt_to_vectorize (cond_info);
5244 while (cond_info != reduc_info)
5246 if (gimple_assign_rhs_code (cond_info->stmt) == COND_EXPR)
5248 gimple *vec_stmt = STMT_VINFO_VEC_STMTS (cond_info)[0];
5249 gcc_assert (gimple_assign_rhs_code (vec_stmt) == VEC_COND_EXPR);
5250 ccompares.safe_push
5251 (std::make_pair (unshare_expr (gimple_assign_rhs1 (vec_stmt)),
5252 STMT_VINFO_REDUC_IDX (cond_info) == 2));
5254 cond_info
5255 = loop_vinfo->lookup_def (gimple_op (cond_info->stmt,
5256 1 + STMT_VINFO_REDUC_IDX
5257 (cond_info)));
5258 cond_info = vect_stmt_to_vectorize (cond_info);
5260 gcc_assert (ccompares.length () != 0);
5262 tree indx_before_incr, indx_after_incr;
5263 poly_uint64 nunits_out = TYPE_VECTOR_SUBPARTS (vectype);
5264 int scalar_precision
5265 = GET_MODE_PRECISION (SCALAR_TYPE_MODE (TREE_TYPE (vectype)));
5266 tree cr_index_scalar_type = make_unsigned_type (scalar_precision);
5267 tree cr_index_vector_type = get_related_vectype_for_scalar_type
5268 (TYPE_MODE (vectype), cr_index_scalar_type,
5269 TYPE_VECTOR_SUBPARTS (vectype));
5271 /* First we create a simple vector induction variable which starts
5272 with the values {1,2,3,...} (SERIES_VECT) and increments by the
5273 vector size (STEP). */
5275 /* Create a {1,2,3,...} vector. */
5276 tree series_vect = build_index_vector (cr_index_vector_type, 1, 1);
5278 /* Create a vector of the step value. */
5279 tree step = build_int_cst (cr_index_scalar_type, nunits_out);
5280 tree vec_step = build_vector_from_val (cr_index_vector_type, step);
5282 /* Create an induction variable. */
5283 gimple_stmt_iterator incr_gsi;
5284 bool insert_after;
5285 standard_iv_increment_position (loop, &incr_gsi, &insert_after);
5286 create_iv (series_vect, vec_step, NULL_TREE, loop, &incr_gsi,
5287 insert_after, &indx_before_incr, &indx_after_incr);
5289 /* Next create a new phi node vector (NEW_PHI_TREE) which starts
5290 filled with zeros (VEC_ZERO). */
5292 /* Create a vector of 0s. */
5293 tree zero = build_zero_cst (cr_index_scalar_type);
5294 tree vec_zero = build_vector_from_val (cr_index_vector_type, zero);
5296 /* Create a vector phi node. */
5297 tree new_phi_tree = make_ssa_name (cr_index_vector_type);
5298 new_phi = create_phi_node (new_phi_tree, loop->header);
5299 add_phi_arg (as_a <gphi *> (new_phi), vec_zero,
5300 loop_preheader_edge (loop), UNKNOWN_LOCATION);
5302 /* Now take the condition from the loops original cond_exprs
5303 and produce a new cond_exprs (INDEX_COND_EXPR) which for
5304 every match uses values from the induction variable
5305 (INDEX_BEFORE_INCR) otherwise uses values from the phi node
5306 (NEW_PHI_TREE).
5307 Finally, we update the phi (NEW_PHI_TREE) to take the value of
5308 the new cond_expr (INDEX_COND_EXPR). */
5309 gimple_seq stmts = NULL;
5310 for (int i = ccompares.length () - 1; i != -1; --i)
5312 tree ccompare = ccompares[i].first;
5313 if (ccompares[i].second)
5314 new_phi_tree = gimple_build (&stmts, VEC_COND_EXPR,
5315 cr_index_vector_type,
5316 ccompare,
5317 indx_before_incr, new_phi_tree);
5318 else
5319 new_phi_tree = gimple_build (&stmts, VEC_COND_EXPR,
5320 cr_index_vector_type,
5321 ccompare,
5322 new_phi_tree, indx_before_incr);
5324 gsi_insert_seq_before (&incr_gsi, stmts, GSI_SAME_STMT);
5326 /* Update the phi with the vec cond. */
5327 induction_index = new_phi_tree;
5328 add_phi_arg (as_a <gphi *> (new_phi), induction_index,
5329 loop_latch_edge (loop), UNKNOWN_LOCATION);
5332 /* 2. Create epilog code.
5333 The reduction epilog code operates across the elements of the vector
5334 of partial results computed by the vectorized loop.
5335 The reduction epilog code consists of:
5337 step 1: compute the scalar result in a vector (v_out2)
5338 step 2: extract the scalar result (s_out3) from the vector (v_out2)
5339 step 3: adjust the scalar result (s_out3) if needed.
5341 Step 1 can be accomplished using one the following three schemes:
5342 (scheme 1) using reduc_fn, if available.
5343 (scheme 2) using whole-vector shifts, if available.
5344 (scheme 3) using a scalar loop. In this case steps 1+2 above are
5345 combined.
5347 The overall epilog code looks like this:
5349 s_out0 = phi <s_loop> # original EXIT_PHI
5350 v_out1 = phi <VECT_DEF> # NEW_EXIT_PHI
5351 v_out2 = reduce <v_out1> # step 1
5352 s_out3 = extract_field <v_out2, 0> # step 2
5353 s_out4 = adjust_result <s_out3> # step 3
5355 (step 3 is optional, and steps 1 and 2 may be combined).
5356 Lastly, the uses of s_out0 are replaced by s_out4. */
5359 /* 2.1 Create new loop-exit-phis to preserve loop-closed form:
5360 v_out1 = phi <VECT_DEF>
5361 Store them in NEW_PHIS. */
5362 if (double_reduc)
5363 loop = outer_loop;
5364 exit_bb = single_exit (loop)->dest;
5365 exit_gsi = gsi_after_labels (exit_bb);
5366 reduc_inputs.create (slp_node ? vec_num : ncopies);
5367 for (unsigned i = 0; i < vec_num; i++)
5369 gimple_seq stmts = NULL;
5370 if (slp_node)
5371 def = vect_get_slp_vect_def (slp_node, i);
5372 else
5373 def = gimple_get_lhs (STMT_VINFO_VEC_STMTS (rdef_info)[0]);
5374 for (j = 0; j < ncopies; j++)
5376 tree new_def = copy_ssa_name (def);
5377 phi = create_phi_node (new_def, exit_bb);
5378 if (j)
5379 def = gimple_get_lhs (STMT_VINFO_VEC_STMTS (rdef_info)[j]);
5380 SET_PHI_ARG_DEF (phi, single_exit (loop)->dest_idx, def);
5381 new_def = gimple_convert (&stmts, vectype, new_def);
5382 reduc_inputs.quick_push (new_def);
5384 gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
5387 /* 2.2 Get the relevant tree-code to use in the epilog for schemes 2,3
5388 (i.e. when reduc_fn is not available) and in the final adjustment
5389 code (if needed). Also get the original scalar reduction variable as
5390 defined in the loop. In case STMT is a "pattern-stmt" (i.e. - it
5391 represents a reduction pattern), the tree-code and scalar-def are
5392 taken from the original stmt that the pattern-stmt (STMT) replaces.
5393 Otherwise (it is a regular reduction) - the tree-code and scalar-def
5394 are taken from STMT. */
5396 stmt_vec_info orig_stmt_info = vect_orig_stmt (stmt_info);
5397 if (orig_stmt_info != stmt_info)
5399 /* Reduction pattern */
5400 gcc_assert (STMT_VINFO_IN_PATTERN_P (orig_stmt_info));
5401 gcc_assert (STMT_VINFO_RELATED_STMT (orig_stmt_info) == stmt_info);
5404 scalar_dest = gimple_get_lhs (orig_stmt_info->stmt);
5405 scalar_type = TREE_TYPE (scalar_dest);
5406 scalar_results.create (group_size);
5407 new_scalar_dest = vect_create_destination_var (scalar_dest, NULL);
5408 bitsize = TYPE_SIZE (scalar_type);
5410 /* True if we should implement SLP_REDUC using native reduction operations
5411 instead of scalar operations. */
5412 direct_slp_reduc = (reduc_fn != IFN_LAST
5413 && slp_reduc
5414 && !TYPE_VECTOR_SUBPARTS (vectype).is_constant ());
5416 /* In case of reduction chain, e.g.,
5417 # a1 = phi <a3, a0>
5418 a2 = operation (a1)
5419 a3 = operation (a2),
5421 we may end up with more than one vector result. Here we reduce them
5422 to one vector.
5424 The same is true if we couldn't use a single defuse cycle. */
5425 if (REDUC_GROUP_FIRST_ELEMENT (stmt_info)
5426 || direct_slp_reduc
5427 || ncopies > 1)
5429 gimple_seq stmts = NULL;
5430 tree single_input = reduc_inputs[0];
5431 for (k = 1; k < reduc_inputs.length (); k++)
5432 single_input = gimple_build (&stmts, code, vectype,
5433 single_input, reduc_inputs[k]);
5434 gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
5436 reduc_inputs.truncate (0);
5437 reduc_inputs.safe_push (single_input);
5440 tree orig_reduc_input = reduc_inputs[0];
5442 /* If this loop is an epilogue loop that can be skipped after the
5443 main loop, we can only share a reduction operation between the
5444 main loop and the epilogue if we put it at the target of the
5445 skip edge.
5447 We can still reuse accumulators if this check fails. Doing so has
5448 the minor(?) benefit of making the epilogue loop's scalar result
5449 independent of the main loop's scalar result. */
5450 bool unify_with_main_loop_p = false;
5451 if (reduc_info->reused_accumulator
5452 && loop_vinfo->skip_this_loop_edge
5453 && single_succ_p (exit_bb)
5454 && single_succ (exit_bb) == loop_vinfo->skip_this_loop_edge->dest)
5456 unify_with_main_loop_p = true;
5458 basic_block reduc_block = loop_vinfo->skip_this_loop_edge->dest;
5459 reduc_inputs[0] = make_ssa_name (vectype);
5460 gphi *new_phi = create_phi_node (reduc_inputs[0], reduc_block);
5461 add_phi_arg (new_phi, orig_reduc_input, single_succ_edge (exit_bb),
5462 UNKNOWN_LOCATION);
5463 add_phi_arg (new_phi, reduc_info->reused_accumulator->reduc_input,
5464 loop_vinfo->skip_this_loop_edge, UNKNOWN_LOCATION);
5465 exit_gsi = gsi_after_labels (reduc_block);
5468 /* Shouldn't be used beyond this point. */
5469 exit_bb = nullptr;
5471 if (STMT_VINFO_REDUC_TYPE (reduc_info) == COND_REDUCTION
5472 && reduc_fn != IFN_LAST)
5474 /* For condition reductions, we have a vector (REDUC_INPUTS 0) containing
5475 various data values where the condition matched and another vector
5476 (INDUCTION_INDEX) containing all the indexes of those matches. We
5477 need to extract the last matching index (which will be the index with
5478 highest value) and use this to index into the data vector.
5479 For the case where there were no matches, the data vector will contain
5480 all default values and the index vector will be all zeros. */
5482 /* Get various versions of the type of the vector of indexes. */
5483 tree index_vec_type = TREE_TYPE (induction_index);
5484 gcc_checking_assert (TYPE_UNSIGNED (index_vec_type));
5485 tree index_scalar_type = TREE_TYPE (index_vec_type);
5486 tree index_vec_cmp_type = truth_type_for (index_vec_type);
5488 /* Get an unsigned integer version of the type of the data vector. */
5489 int scalar_precision
5490 = GET_MODE_PRECISION (SCALAR_TYPE_MODE (scalar_type));
5491 tree scalar_type_unsigned = make_unsigned_type (scalar_precision);
5492 tree vectype_unsigned = get_same_sized_vectype (scalar_type_unsigned,
5493 vectype);
5495 /* First we need to create a vector (ZERO_VEC) of zeros and another
5496 vector (MAX_INDEX_VEC) filled with the last matching index, which we
5497 can create using a MAX reduction and then expanding.
5498 In the case where the loop never made any matches, the max index will
5499 be zero. */
5501 /* Vector of {0, 0, 0,...}. */
5502 tree zero_vec = build_zero_cst (vectype);
5504 /* Find maximum value from the vector of found indexes. */
5505 tree max_index = make_ssa_name (index_scalar_type);
5506 gcall *max_index_stmt = gimple_build_call_internal (IFN_REDUC_MAX,
5507 1, induction_index);
5508 gimple_call_set_lhs (max_index_stmt, max_index);
5509 gsi_insert_before (&exit_gsi, max_index_stmt, GSI_SAME_STMT);
5511 /* Vector of {max_index, max_index, max_index,...}. */
5512 tree max_index_vec = make_ssa_name (index_vec_type);
5513 tree max_index_vec_rhs = build_vector_from_val (index_vec_type,
5514 max_index);
5515 gimple *max_index_vec_stmt = gimple_build_assign (max_index_vec,
5516 max_index_vec_rhs);
5517 gsi_insert_before (&exit_gsi, max_index_vec_stmt, GSI_SAME_STMT);
5519 /* Next we compare the new vector (MAX_INDEX_VEC) full of max indexes
5520 with the vector (INDUCTION_INDEX) of found indexes, choosing values
5521 from the data vector (REDUC_INPUTS 0) for matches, 0 (ZERO_VEC)
5522 otherwise. Only one value should match, resulting in a vector
5523 (VEC_COND) with one data value and the rest zeros.
5524 In the case where the loop never made any matches, every index will
5525 match, resulting in a vector with all data values (which will all be
5526 the default value). */
5528 /* Compare the max index vector to the vector of found indexes to find
5529 the position of the max value. */
5530 tree vec_compare = make_ssa_name (index_vec_cmp_type);
5531 gimple *vec_compare_stmt = gimple_build_assign (vec_compare, EQ_EXPR,
5532 induction_index,
5533 max_index_vec);
5534 gsi_insert_before (&exit_gsi, vec_compare_stmt, GSI_SAME_STMT);
5536 /* Use the compare to choose either values from the data vector or
5537 zero. */
5538 tree vec_cond = make_ssa_name (vectype);
5539 gimple *vec_cond_stmt = gimple_build_assign (vec_cond, VEC_COND_EXPR,
5540 vec_compare,
5541 reduc_inputs[0],
5542 zero_vec);
5543 gsi_insert_before (&exit_gsi, vec_cond_stmt, GSI_SAME_STMT);
5545 /* Finally we need to extract the data value from the vector (VEC_COND)
5546 into a scalar (MATCHED_DATA_REDUC). Logically we want to do a OR
5547 reduction, but because this doesn't exist, we can use a MAX reduction
5548 instead. The data value might be signed or a float so we need to cast
5549 it first.
5550 In the case where the loop never made any matches, the data values are
5551 all identical, and so will reduce down correctly. */
5553 /* Make the matched data values unsigned. */
5554 tree vec_cond_cast = make_ssa_name (vectype_unsigned);
5555 tree vec_cond_cast_rhs = build1 (VIEW_CONVERT_EXPR, vectype_unsigned,
5556 vec_cond);
5557 gimple *vec_cond_cast_stmt = gimple_build_assign (vec_cond_cast,
5558 VIEW_CONVERT_EXPR,
5559 vec_cond_cast_rhs);
5560 gsi_insert_before (&exit_gsi, vec_cond_cast_stmt, GSI_SAME_STMT);
5562 /* Reduce down to a scalar value. */
5563 tree data_reduc = make_ssa_name (scalar_type_unsigned);
5564 gcall *data_reduc_stmt = gimple_build_call_internal (IFN_REDUC_MAX,
5565 1, vec_cond_cast);
5566 gimple_call_set_lhs (data_reduc_stmt, data_reduc);
5567 gsi_insert_before (&exit_gsi, data_reduc_stmt, GSI_SAME_STMT);
5569 /* Convert the reduced value back to the result type and set as the
5570 result. */
5571 gimple_seq stmts = NULL;
5572 new_temp = gimple_build (&stmts, VIEW_CONVERT_EXPR, scalar_type,
5573 data_reduc);
5574 gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
5575 scalar_results.safe_push (new_temp);
5577 else if (STMT_VINFO_REDUC_TYPE (reduc_info) == COND_REDUCTION
5578 && reduc_fn == IFN_LAST)
5580 /* Condition reduction without supported IFN_REDUC_MAX. Generate
5581 idx = 0;
5582 idx_val = induction_index[0];
5583 val = data_reduc[0];
5584 for (idx = 0, val = init, i = 0; i < nelts; ++i)
5585 if (induction_index[i] > idx_val)
5586 val = data_reduc[i], idx_val = induction_index[i];
5587 return val; */
5589 tree data_eltype = TREE_TYPE (vectype);
5590 tree idx_eltype = TREE_TYPE (TREE_TYPE (induction_index));
5591 unsigned HOST_WIDE_INT el_size = tree_to_uhwi (TYPE_SIZE (idx_eltype));
5592 poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (TREE_TYPE (induction_index));
5593 /* Enforced by vectorizable_reduction, which ensures we have target
5594 support before allowing a conditional reduction on variable-length
5595 vectors. */
5596 unsigned HOST_WIDE_INT v_size = el_size * nunits.to_constant ();
5597 tree idx_val = NULL_TREE, val = NULL_TREE;
5598 for (unsigned HOST_WIDE_INT off = 0; off < v_size; off += el_size)
5600 tree old_idx_val = idx_val;
5601 tree old_val = val;
5602 idx_val = make_ssa_name (idx_eltype);
5603 epilog_stmt = gimple_build_assign (idx_val, BIT_FIELD_REF,
5604 build3 (BIT_FIELD_REF, idx_eltype,
5605 induction_index,
5606 bitsize_int (el_size),
5607 bitsize_int (off)));
5608 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5609 val = make_ssa_name (data_eltype);
5610 epilog_stmt = gimple_build_assign (val, BIT_FIELD_REF,
5611 build3 (BIT_FIELD_REF,
5612 data_eltype,
5613 reduc_inputs[0],
5614 bitsize_int (el_size),
5615 bitsize_int (off)));
5616 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5617 if (off != 0)
5619 tree new_idx_val = idx_val;
5620 if (off != v_size - el_size)
5622 new_idx_val = make_ssa_name (idx_eltype);
5623 epilog_stmt = gimple_build_assign (new_idx_val,
5624 MAX_EXPR, idx_val,
5625 old_idx_val);
5626 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5628 tree new_val = make_ssa_name (data_eltype);
5629 epilog_stmt = gimple_build_assign (new_val,
5630 COND_EXPR,
5631 build2 (GT_EXPR,
5632 boolean_type_node,
5633 idx_val,
5634 old_idx_val),
5635 val, old_val);
5636 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5637 idx_val = new_idx_val;
5638 val = new_val;
5641 /* Convert the reduced value back to the result type and set as the
5642 result. */
5643 gimple_seq stmts = NULL;
5644 val = gimple_convert (&stmts, scalar_type, val);
5645 gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
5646 scalar_results.safe_push (val);
5649 /* 2.3 Create the reduction code, using one of the three schemes described
5650 above. In SLP we simply need to extract all the elements from the
5651 vector (without reducing them), so we use scalar shifts. */
5652 else if (reduc_fn != IFN_LAST && !slp_reduc)
5654 tree tmp;
5655 tree vec_elem_type;
5657 /* Case 1: Create:
5658 v_out2 = reduc_expr <v_out1> */
5660 if (dump_enabled_p ())
5661 dump_printf_loc (MSG_NOTE, vect_location,
5662 "Reduce using direct vector reduction.\n");
5664 gimple_seq stmts = NULL;
5665 vec_elem_type = TREE_TYPE (vectype);
5666 new_temp = gimple_build (&stmts, as_combined_fn (reduc_fn),
5667 vec_elem_type, reduc_inputs[0]);
5668 new_temp = gimple_convert (&stmts, scalar_type, new_temp);
5669 gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
5671 if ((STMT_VINFO_REDUC_TYPE (reduc_info) == INTEGER_INDUC_COND_REDUCTION)
5672 && induc_val)
5674 /* Earlier we set the initial value to be a vector if induc_val
5675 values. Check the result and if it is induc_val then replace
5676 with the original initial value, unless induc_val is
5677 the same as initial_def already. */
5678 tree zcompare = build2 (EQ_EXPR, boolean_type_node, new_temp,
5679 induc_val);
5680 tree initial_def = reduc_info->reduc_initial_values[0];
5682 tmp = make_ssa_name (new_scalar_dest);
5683 epilog_stmt = gimple_build_assign (tmp, COND_EXPR, zcompare,
5684 initial_def, new_temp);
5685 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5686 new_temp = tmp;
5689 scalar_results.safe_push (new_temp);
5691 else if (direct_slp_reduc)
5693 /* Here we create one vector for each of the REDUC_GROUP_SIZE results,
5694 with the elements for other SLP statements replaced with the
5695 neutral value. We can then do a normal reduction on each vector. */
5697 /* Enforced by vectorizable_reduction. */
5698 gcc_assert (reduc_inputs.length () == 1);
5699 gcc_assert (pow2p_hwi (group_size));
5701 gimple_seq seq = NULL;
5703 /* Build a vector {0, 1, 2, ...}, with the same number of elements
5704 and the same element size as VECTYPE. */
5705 tree index = build_index_vector (vectype, 0, 1);
5706 tree index_type = TREE_TYPE (index);
5707 tree index_elt_type = TREE_TYPE (index_type);
5708 tree mask_type = truth_type_for (index_type);
5710 /* Create a vector that, for each element, identifies which of
5711 the REDUC_GROUP_SIZE results should use it. */
5712 tree index_mask = build_int_cst (index_elt_type, group_size - 1);
5713 index = gimple_build (&seq, BIT_AND_EXPR, index_type, index,
5714 build_vector_from_val (index_type, index_mask));
5716 /* Get a neutral vector value. This is simply a splat of the neutral
5717 scalar value if we have one, otherwise the initial scalar value
5718 is itself a neutral value. */
5719 tree vector_identity = NULL_TREE;
5720 tree neutral_op = NULL_TREE;
5721 if (slp_node)
5723 tree initial_value = NULL_TREE;
5724 if (REDUC_GROUP_FIRST_ELEMENT (stmt_info))
5725 initial_value = reduc_info->reduc_initial_values[0];
5726 neutral_op = neutral_op_for_reduction (TREE_TYPE (vectype), code,
5727 initial_value);
5729 if (neutral_op)
5730 vector_identity = gimple_build_vector_from_val (&seq, vectype,
5731 neutral_op);
5732 for (unsigned int i = 0; i < group_size; ++i)
5734 /* If there's no univeral neutral value, we can use the
5735 initial scalar value from the original PHI. This is used
5736 for MIN and MAX reduction, for example. */
5737 if (!neutral_op)
5739 tree scalar_value = reduc_info->reduc_initial_values[i];
5740 scalar_value = gimple_convert (&seq, TREE_TYPE (vectype),
5741 scalar_value);
5742 vector_identity = gimple_build_vector_from_val (&seq, vectype,
5743 scalar_value);
5746 /* Calculate the equivalent of:
5748 sel[j] = (index[j] == i);
5750 which selects the elements of REDUC_INPUTS[0] that should
5751 be included in the result. */
5752 tree compare_val = build_int_cst (index_elt_type, i);
5753 compare_val = build_vector_from_val (index_type, compare_val);
5754 tree sel = gimple_build (&seq, EQ_EXPR, mask_type,
5755 index, compare_val);
5757 /* Calculate the equivalent of:
5759 vec = seq ? reduc_inputs[0] : vector_identity;
5761 VEC is now suitable for a full vector reduction. */
5762 tree vec = gimple_build (&seq, VEC_COND_EXPR, vectype,
5763 sel, reduc_inputs[0], vector_identity);
5765 /* Do the reduction and convert it to the appropriate type. */
5766 tree scalar = gimple_build (&seq, as_combined_fn (reduc_fn),
5767 TREE_TYPE (vectype), vec);
5768 scalar = gimple_convert (&seq, scalar_type, scalar);
5769 scalar_results.safe_push (scalar);
5771 gsi_insert_seq_before (&exit_gsi, seq, GSI_SAME_STMT);
5773 else
5775 bool reduce_with_shift;
5776 tree vec_temp;
5778 gcc_assert (slp_reduc || reduc_inputs.length () == 1);
5780 /* See if the target wants to do the final (shift) reduction
5781 in a vector mode of smaller size and first reduce upper/lower
5782 halves against each other. */
5783 enum machine_mode mode1 = mode;
5784 tree stype = TREE_TYPE (vectype);
5785 unsigned nunits = TYPE_VECTOR_SUBPARTS (vectype).to_constant ();
5786 unsigned nunits1 = nunits;
5787 if ((mode1 = targetm.vectorize.split_reduction (mode)) != mode
5788 && reduc_inputs.length () == 1)
5790 nunits1 = GET_MODE_NUNITS (mode1).to_constant ();
5791 /* For SLP reductions we have to make sure lanes match up, but
5792 since we're doing individual element final reduction reducing
5793 vector width here is even more important.
5794 ??? We can also separate lanes with permutes, for the common
5795 case of power-of-two group-size odd/even extracts would work. */
5796 if (slp_reduc && nunits != nunits1)
5798 nunits1 = least_common_multiple (nunits1, group_size);
5799 gcc_assert (exact_log2 (nunits1) != -1 && nunits1 <= nunits);
5802 if (!slp_reduc
5803 && (mode1 = targetm.vectorize.split_reduction (mode)) != mode)
5804 nunits1 = GET_MODE_NUNITS (mode1).to_constant ();
5806 tree vectype1 = get_related_vectype_for_scalar_type (TYPE_MODE (vectype),
5807 stype, nunits1);
5808 reduce_with_shift = have_whole_vector_shift (mode1);
5809 if (!VECTOR_MODE_P (mode1))
5810 reduce_with_shift = false;
5811 else
5813 optab optab = optab_for_tree_code (code, vectype1, optab_default);
5814 if (optab_handler (optab, mode1) == CODE_FOR_nothing)
5815 reduce_with_shift = false;
5818 /* First reduce the vector to the desired vector size we should
5819 do shift reduction on by combining upper and lower halves. */
5820 gimple_seq stmts = NULL;
5821 new_temp = vect_create_partial_epilog (reduc_inputs[0], vectype1,
5822 code, &stmts);
5823 gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
5824 reduc_inputs[0] = new_temp;
5826 if (reduce_with_shift && !slp_reduc)
5828 int element_bitsize = tree_to_uhwi (bitsize);
5829 /* Enforced by vectorizable_reduction, which disallows SLP reductions
5830 for variable-length vectors and also requires direct target support
5831 for loop reductions. */
5832 int vec_size_in_bits = tree_to_uhwi (TYPE_SIZE (vectype1));
5833 int nelements = vec_size_in_bits / element_bitsize;
5834 vec_perm_builder sel;
5835 vec_perm_indices indices;
5837 int elt_offset;
5839 tree zero_vec = build_zero_cst (vectype1);
5840 /* Case 2: Create:
5841 for (offset = nelements/2; offset >= 1; offset/=2)
5843 Create: va' = vec_shift <va, offset>
5844 Create: va = vop <va, va'>
5845 } */
5847 tree rhs;
5849 if (dump_enabled_p ())
5850 dump_printf_loc (MSG_NOTE, vect_location,
5851 "Reduce using vector shifts\n");
5853 gimple_seq stmts = NULL;
5854 new_temp = gimple_convert (&stmts, vectype1, new_temp);
5855 for (elt_offset = nelements / 2;
5856 elt_offset >= 1;
5857 elt_offset /= 2)
5859 calc_vec_perm_mask_for_shift (elt_offset, nelements, &sel);
5860 indices.new_vector (sel, 2, nelements);
5861 tree mask = vect_gen_perm_mask_any (vectype1, indices);
5862 new_name = gimple_build (&stmts, VEC_PERM_EXPR, vectype1,
5863 new_temp, zero_vec, mask);
5864 new_temp = gimple_build (&stmts, code,
5865 vectype1, new_name, new_temp);
5867 gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
5869 /* 2.4 Extract the final scalar result. Create:
5870 s_out3 = extract_field <v_out2, bitpos> */
5872 if (dump_enabled_p ())
5873 dump_printf_loc (MSG_NOTE, vect_location,
5874 "extract scalar result\n");
5876 rhs = build3 (BIT_FIELD_REF, scalar_type, new_temp,
5877 bitsize, bitsize_zero_node);
5878 epilog_stmt = gimple_build_assign (new_scalar_dest, rhs);
5879 new_temp = make_ssa_name (new_scalar_dest, epilog_stmt);
5880 gimple_assign_set_lhs (epilog_stmt, new_temp);
5881 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5882 scalar_results.safe_push (new_temp);
5884 else
5886 /* Case 3: Create:
5887 s = extract_field <v_out2, 0>
5888 for (offset = element_size;
5889 offset < vector_size;
5890 offset += element_size;)
5892 Create: s' = extract_field <v_out2, offset>
5893 Create: s = op <s, s'> // For non SLP cases
5894 } */
5896 if (dump_enabled_p ())
5897 dump_printf_loc (MSG_NOTE, vect_location,
5898 "Reduce using scalar code.\n");
5900 int vec_size_in_bits = tree_to_uhwi (TYPE_SIZE (vectype1));
5901 int element_bitsize = tree_to_uhwi (bitsize);
5902 tree compute_type = TREE_TYPE (vectype);
5903 gimple_seq stmts = NULL;
5904 FOR_EACH_VEC_ELT (reduc_inputs, i, vec_temp)
5906 int bit_offset;
5907 new_temp = gimple_build (&stmts, BIT_FIELD_REF, compute_type,
5908 vec_temp, bitsize, bitsize_zero_node);
5910 /* In SLP we don't need to apply reduction operation, so we just
5911 collect s' values in SCALAR_RESULTS. */
5912 if (slp_reduc)
5913 scalar_results.safe_push (new_temp);
5915 for (bit_offset = element_bitsize;
5916 bit_offset < vec_size_in_bits;
5917 bit_offset += element_bitsize)
5919 tree bitpos = bitsize_int (bit_offset);
5920 new_name = gimple_build (&stmts, BIT_FIELD_REF,
5921 compute_type, vec_temp,
5922 bitsize, bitpos);
5923 if (slp_reduc)
5925 /* In SLP we don't need to apply reduction operation, so
5926 we just collect s' values in SCALAR_RESULTS. */
5927 new_temp = new_name;
5928 scalar_results.safe_push (new_name);
5930 else
5931 new_temp = gimple_build (&stmts, code, compute_type,
5932 new_name, new_temp);
5936 /* The only case where we need to reduce scalar results in SLP, is
5937 unrolling. If the size of SCALAR_RESULTS is greater than
5938 REDUC_GROUP_SIZE, we reduce them combining elements modulo
5939 REDUC_GROUP_SIZE. */
5940 if (slp_reduc)
5942 tree res, first_res, new_res;
5944 /* Reduce multiple scalar results in case of SLP unrolling. */
5945 for (j = group_size; scalar_results.iterate (j, &res);
5946 j++)
5948 first_res = scalar_results[j % group_size];
5949 new_res = gimple_build (&stmts, code, compute_type,
5950 first_res, res);
5951 scalar_results[j % group_size] = new_res;
5953 scalar_results.truncate (group_size);
5954 for (k = 0; k < group_size; k++)
5955 scalar_results[k] = gimple_convert (&stmts, scalar_type,
5956 scalar_results[k]);
5958 else
5960 /* Not SLP - we have one scalar to keep in SCALAR_RESULTS. */
5961 new_temp = gimple_convert (&stmts, scalar_type, new_temp);
5962 scalar_results.safe_push (new_temp);
5965 gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
5968 if ((STMT_VINFO_REDUC_TYPE (reduc_info) == INTEGER_INDUC_COND_REDUCTION)
5969 && induc_val)
5971 /* Earlier we set the initial value to be a vector if induc_val
5972 values. Check the result and if it is induc_val then replace
5973 with the original initial value, unless induc_val is
5974 the same as initial_def already. */
5975 tree zcompare = build2 (EQ_EXPR, boolean_type_node, new_temp,
5976 induc_val);
5977 tree initial_def = reduc_info->reduc_initial_values[0];
5979 tree tmp = make_ssa_name (new_scalar_dest);
5980 epilog_stmt = gimple_build_assign (tmp, COND_EXPR, zcompare,
5981 initial_def, new_temp);
5982 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5983 scalar_results[0] = tmp;
5987 /* 2.5 Adjust the final result by the initial value of the reduction
5988 variable. (When such adjustment is not needed, then
5989 'adjustment_def' is zero). For example, if code is PLUS we create:
5990 new_temp = loop_exit_def + adjustment_def */
5992 if (adjustment_def)
5994 gcc_assert (!slp_reduc);
5995 gimple_seq stmts = NULL;
5996 if (double_reduc)
5998 gcc_assert (VECTOR_TYPE_P (TREE_TYPE (adjustment_def)));
5999 adjustment_def = gimple_convert (&stmts, vectype, adjustment_def);
6000 new_temp = gimple_build (&stmts, code, vectype,
6001 reduc_inputs[0], adjustment_def);
6003 else
6005 new_temp = scalar_results[0];
6006 gcc_assert (TREE_CODE (TREE_TYPE (adjustment_def)) != VECTOR_TYPE);
6007 adjustment_def = gimple_convert (&stmts, scalar_type, adjustment_def);
6008 new_temp = gimple_build (&stmts, code, scalar_type,
6009 new_temp, adjustment_def);
6012 epilog_stmt = gimple_seq_last_stmt (stmts);
6013 gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
6014 scalar_results[0] = new_temp;
6017 /* Record this operation if it could be reused by the epilogue loop. */
6018 if (STMT_VINFO_REDUC_TYPE (reduc_info) == TREE_CODE_REDUCTION)
6019 loop_vinfo->reusable_accumulators.put (scalar_results[0],
6020 { orig_reduc_input, reduc_info });
6022 if (double_reduc)
6023 loop = outer_loop;
6025 /* 2.6 Handle the loop-exit phis. Replace the uses of scalar loop-exit
6026 phis with new adjusted scalar results, i.e., replace use <s_out0>
6027 with use <s_out4>.
6029 Transform:
6030 loop_exit:
6031 s_out0 = phi <s_loop> # (scalar) EXIT_PHI
6032 v_out1 = phi <VECT_DEF> # NEW_EXIT_PHI
6033 v_out2 = reduce <v_out1>
6034 s_out3 = extract_field <v_out2, 0>
6035 s_out4 = adjust_result <s_out3>
6036 use <s_out0>
6037 use <s_out0>
6039 into:
6041 loop_exit:
6042 s_out0 = phi <s_loop> # (scalar) EXIT_PHI
6043 v_out1 = phi <VECT_DEF> # NEW_EXIT_PHI
6044 v_out2 = reduce <v_out1>
6045 s_out3 = extract_field <v_out2, 0>
6046 s_out4 = adjust_result <s_out3>
6047 use <s_out4>
6048 use <s_out4> */
6050 gcc_assert (live_out_stmts.size () == scalar_results.length ());
6051 for (k = 0; k < live_out_stmts.size (); k++)
6053 stmt_vec_info scalar_stmt_info = vect_orig_stmt (live_out_stmts[k]);
6054 scalar_dest = gimple_assign_lhs (scalar_stmt_info->stmt);
6056 phis.create (3);
6057 /* Find the loop-closed-use at the loop exit of the original scalar
6058 result. (The reduction result is expected to have two immediate uses,
6059 one at the latch block, and one at the loop exit). For double
6060 reductions we are looking for exit phis of the outer loop. */
6061 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, scalar_dest)
6063 if (!flow_bb_inside_loop_p (loop, gimple_bb (USE_STMT (use_p))))
6065 if (!is_gimple_debug (USE_STMT (use_p)))
6066 phis.safe_push (USE_STMT (use_p));
6068 else
6070 if (double_reduc && gimple_code (USE_STMT (use_p)) == GIMPLE_PHI)
6072 tree phi_res = PHI_RESULT (USE_STMT (use_p));
6074 FOR_EACH_IMM_USE_FAST (phi_use_p, phi_imm_iter, phi_res)
6076 if (!flow_bb_inside_loop_p (loop,
6077 gimple_bb (USE_STMT (phi_use_p)))
6078 && !is_gimple_debug (USE_STMT (phi_use_p)))
6079 phis.safe_push (USE_STMT (phi_use_p));
6085 FOR_EACH_VEC_ELT (phis, i, exit_phi)
6087 /* Replace the uses: */
6088 orig_name = PHI_RESULT (exit_phi);
6090 /* Look for a single use at the target of the skip edge. */
6091 if (unify_with_main_loop_p)
6093 use_operand_p use_p;
6094 gimple *user;
6095 if (!single_imm_use (orig_name, &use_p, &user))
6096 gcc_unreachable ();
6097 orig_name = gimple_get_lhs (user);
6100 scalar_result = scalar_results[k];
6101 FOR_EACH_IMM_USE_STMT (use_stmt, imm_iter, orig_name)
6103 FOR_EACH_IMM_USE_ON_STMT (use_p, imm_iter)
6104 SET_USE (use_p, scalar_result);
6105 update_stmt (use_stmt);
6109 phis.release ();
6113 /* Return a vector of type VECTYPE that is equal to the vector select
6114 operation "MASK ? VEC : IDENTITY". Insert the select statements
6115 before GSI. */
6117 static tree
6118 merge_with_identity (gimple_stmt_iterator *gsi, tree mask, tree vectype,
6119 tree vec, tree identity)
6121 tree cond = make_temp_ssa_name (vectype, NULL, "cond");
6122 gimple *new_stmt = gimple_build_assign (cond, VEC_COND_EXPR,
6123 mask, vec, identity);
6124 gsi_insert_before (gsi, new_stmt, GSI_SAME_STMT);
6125 return cond;
6128 /* Successively apply CODE to each element of VECTOR_RHS, in left-to-right
6129 order, starting with LHS. Insert the extraction statements before GSI and
6130 associate the new scalar SSA names with variable SCALAR_DEST.
6131 Return the SSA name for the result. */
6133 static tree
6134 vect_expand_fold_left (gimple_stmt_iterator *gsi, tree scalar_dest,
6135 tree_code code, tree lhs, tree vector_rhs)
6137 tree vectype = TREE_TYPE (vector_rhs);
6138 tree scalar_type = TREE_TYPE (vectype);
6139 tree bitsize = TYPE_SIZE (scalar_type);
6140 unsigned HOST_WIDE_INT vec_size_in_bits = tree_to_uhwi (TYPE_SIZE (vectype));
6141 unsigned HOST_WIDE_INT element_bitsize = tree_to_uhwi (bitsize);
6143 for (unsigned HOST_WIDE_INT bit_offset = 0;
6144 bit_offset < vec_size_in_bits;
6145 bit_offset += element_bitsize)
6147 tree bitpos = bitsize_int (bit_offset);
6148 tree rhs = build3 (BIT_FIELD_REF, scalar_type, vector_rhs,
6149 bitsize, bitpos);
6151 gassign *stmt = gimple_build_assign (scalar_dest, rhs);
6152 rhs = make_ssa_name (scalar_dest, stmt);
6153 gimple_assign_set_lhs (stmt, rhs);
6154 gsi_insert_before (gsi, stmt, GSI_SAME_STMT);
6156 stmt = gimple_build_assign (scalar_dest, code, lhs, rhs);
6157 tree new_name = make_ssa_name (scalar_dest, stmt);
6158 gimple_assign_set_lhs (stmt, new_name);
6159 gsi_insert_before (gsi, stmt, GSI_SAME_STMT);
6160 lhs = new_name;
6162 return lhs;
6165 /* Get a masked internal function equivalent to REDUC_FN. VECTYPE_IN is the
6166 type of the vector input. */
6168 static internal_fn
6169 get_masked_reduction_fn (internal_fn reduc_fn, tree vectype_in)
6171 internal_fn mask_reduc_fn;
6173 switch (reduc_fn)
6175 case IFN_FOLD_LEFT_PLUS:
6176 mask_reduc_fn = IFN_MASK_FOLD_LEFT_PLUS;
6177 break;
6179 default:
6180 return IFN_LAST;
6183 if (direct_internal_fn_supported_p (mask_reduc_fn, vectype_in,
6184 OPTIMIZE_FOR_SPEED))
6185 return mask_reduc_fn;
6186 return IFN_LAST;
6189 /* Perform an in-order reduction (FOLD_LEFT_REDUCTION). STMT_INFO is the
6190 statement that sets the live-out value. REDUC_DEF_STMT is the phi
6191 statement. CODE is the operation performed by STMT_INFO and OPS are
6192 its scalar operands. REDUC_INDEX is the index of the operand in
6193 OPS that is set by REDUC_DEF_STMT. REDUC_FN is the function that
6194 implements in-order reduction, or IFN_LAST if we should open-code it.
6195 VECTYPE_IN is the type of the vector input. MASKS specifies the masks
6196 that should be used to control the operation in a fully-masked loop. */
6198 static bool
6199 vectorize_fold_left_reduction (loop_vec_info loop_vinfo,
6200 stmt_vec_info stmt_info,
6201 gimple_stmt_iterator *gsi,
6202 gimple **vec_stmt, slp_tree slp_node,
6203 gimple *reduc_def_stmt,
6204 tree_code code, internal_fn reduc_fn,
6205 tree ops[3], tree vectype_in,
6206 int reduc_index, vec_loop_masks *masks)
6208 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
6209 tree vectype_out = STMT_VINFO_VECTYPE (stmt_info);
6210 internal_fn mask_reduc_fn = get_masked_reduction_fn (reduc_fn, vectype_in);
6212 int ncopies;
6213 if (slp_node)
6214 ncopies = 1;
6215 else
6216 ncopies = vect_get_num_copies (loop_vinfo, vectype_in);
6218 gcc_assert (!nested_in_vect_loop_p (loop, stmt_info));
6219 gcc_assert (ncopies == 1);
6220 gcc_assert (TREE_CODE_LENGTH (code) == binary_op);
6222 if (slp_node)
6223 gcc_assert (known_eq (TYPE_VECTOR_SUBPARTS (vectype_out),
6224 TYPE_VECTOR_SUBPARTS (vectype_in)));
6226 tree op0 = ops[1 - reduc_index];
6228 int group_size = 1;
6229 stmt_vec_info scalar_dest_def_info;
6230 auto_vec<tree> vec_oprnds0;
6231 if (slp_node)
6233 auto_vec<vec<tree> > vec_defs (2);
6234 vect_get_slp_defs (loop_vinfo, slp_node, &vec_defs);
6235 vec_oprnds0.safe_splice (vec_defs[1 - reduc_index]);
6236 vec_defs[0].release ();
6237 vec_defs[1].release ();
6238 group_size = SLP_TREE_SCALAR_STMTS (slp_node).length ();
6239 scalar_dest_def_info = SLP_TREE_SCALAR_STMTS (slp_node)[group_size - 1];
6241 else
6243 vect_get_vec_defs_for_operand (loop_vinfo, stmt_info, 1,
6244 op0, &vec_oprnds0);
6245 scalar_dest_def_info = stmt_info;
6248 tree scalar_dest = gimple_assign_lhs (scalar_dest_def_info->stmt);
6249 tree scalar_type = TREE_TYPE (scalar_dest);
6250 tree reduc_var = gimple_phi_result (reduc_def_stmt);
6252 int vec_num = vec_oprnds0.length ();
6253 gcc_assert (vec_num == 1 || slp_node);
6254 tree vec_elem_type = TREE_TYPE (vectype_out);
6255 gcc_checking_assert (useless_type_conversion_p (scalar_type, vec_elem_type));
6257 tree vector_identity = NULL_TREE;
6258 if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
6259 vector_identity = build_zero_cst (vectype_out);
6261 tree scalar_dest_var = vect_create_destination_var (scalar_dest, NULL);
6262 int i;
6263 tree def0;
6264 FOR_EACH_VEC_ELT (vec_oprnds0, i, def0)
6266 gimple *new_stmt;
6267 tree mask = NULL_TREE;
6268 if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
6269 mask = vect_get_loop_mask (gsi, masks, vec_num, vectype_in, i);
6271 /* Handle MINUS by adding the negative. */
6272 if (reduc_fn != IFN_LAST && code == MINUS_EXPR)
6274 tree negated = make_ssa_name (vectype_out);
6275 new_stmt = gimple_build_assign (negated, NEGATE_EXPR, def0);
6276 gsi_insert_before (gsi, new_stmt, GSI_SAME_STMT);
6277 def0 = negated;
6280 if (mask && mask_reduc_fn == IFN_LAST)
6281 def0 = merge_with_identity (gsi, mask, vectype_out, def0,
6282 vector_identity);
6284 /* On the first iteration the input is simply the scalar phi
6285 result, and for subsequent iterations it is the output of
6286 the preceding operation. */
6287 if (reduc_fn != IFN_LAST || (mask && mask_reduc_fn != IFN_LAST))
6289 if (mask && mask_reduc_fn != IFN_LAST)
6290 new_stmt = gimple_build_call_internal (mask_reduc_fn, 3, reduc_var,
6291 def0, mask);
6292 else
6293 new_stmt = gimple_build_call_internal (reduc_fn, 2, reduc_var,
6294 def0);
6295 /* For chained SLP reductions the output of the previous reduction
6296 operation serves as the input of the next. For the final statement
6297 the output cannot be a temporary - we reuse the original
6298 scalar destination of the last statement. */
6299 if (i != vec_num - 1)
6301 gimple_set_lhs (new_stmt, scalar_dest_var);
6302 reduc_var = make_ssa_name (scalar_dest_var, new_stmt);
6303 gimple_set_lhs (new_stmt, reduc_var);
6306 else
6308 reduc_var = vect_expand_fold_left (gsi, scalar_dest_var, code,
6309 reduc_var, def0);
6310 new_stmt = SSA_NAME_DEF_STMT (reduc_var);
6311 /* Remove the statement, so that we can use the same code paths
6312 as for statements that we've just created. */
6313 gimple_stmt_iterator tmp_gsi = gsi_for_stmt (new_stmt);
6314 gsi_remove (&tmp_gsi, true);
6317 if (i == vec_num - 1)
6319 gimple_set_lhs (new_stmt, scalar_dest);
6320 vect_finish_replace_stmt (loop_vinfo,
6321 scalar_dest_def_info,
6322 new_stmt);
6324 else
6325 vect_finish_stmt_generation (loop_vinfo,
6326 scalar_dest_def_info,
6327 new_stmt, gsi);
6329 if (slp_node)
6330 SLP_TREE_VEC_STMTS (slp_node).quick_push (new_stmt);
6331 else
6333 STMT_VINFO_VEC_STMTS (stmt_info).safe_push (new_stmt);
6334 *vec_stmt = new_stmt;
6338 return true;
6341 /* Function is_nonwrapping_integer_induction.
6343 Check if STMT_VINO (which is part of loop LOOP) both increments and
6344 does not cause overflow. */
6346 static bool
6347 is_nonwrapping_integer_induction (stmt_vec_info stmt_vinfo, class loop *loop)
6349 gphi *phi = as_a <gphi *> (stmt_vinfo->stmt);
6350 tree base = STMT_VINFO_LOOP_PHI_EVOLUTION_BASE_UNCHANGED (stmt_vinfo);
6351 tree step = STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_vinfo);
6352 tree lhs_type = TREE_TYPE (gimple_phi_result (phi));
6353 widest_int ni, max_loop_value, lhs_max;
6354 wi::overflow_type overflow = wi::OVF_NONE;
6356 /* Make sure the loop is integer based. */
6357 if (TREE_CODE (base) != INTEGER_CST
6358 || TREE_CODE (step) != INTEGER_CST)
6359 return false;
6361 /* Check that the max size of the loop will not wrap. */
6363 if (TYPE_OVERFLOW_UNDEFINED (lhs_type))
6364 return true;
6366 if (! max_stmt_executions (loop, &ni))
6367 return false;
6369 max_loop_value = wi::mul (wi::to_widest (step), ni, TYPE_SIGN (lhs_type),
6370 &overflow);
6371 if (overflow)
6372 return false;
6374 max_loop_value = wi::add (wi::to_widest (base), max_loop_value,
6375 TYPE_SIGN (lhs_type), &overflow);
6376 if (overflow)
6377 return false;
6379 return (wi::min_precision (max_loop_value, TYPE_SIGN (lhs_type))
6380 <= TYPE_PRECISION (lhs_type));
6383 /* Check if masking can be supported by inserting a conditional expression.
6384 CODE is the code for the operation. COND_FN is the conditional internal
6385 function, if it exists. VECTYPE_IN is the type of the vector input. */
6386 static bool
6387 use_mask_by_cond_expr_p (enum tree_code code, internal_fn cond_fn,
6388 tree vectype_in)
6390 if (cond_fn != IFN_LAST
6391 && direct_internal_fn_supported_p (cond_fn, vectype_in,
6392 OPTIMIZE_FOR_SPEED))
6393 return false;
6395 switch (code)
6397 case DOT_PROD_EXPR:
6398 case SAD_EXPR:
6399 return true;
6401 default:
6402 return false;
6406 /* Insert a conditional expression to enable masked vectorization. CODE is the
6407 code for the operation. VOP is the array of operands. MASK is the loop
6408 mask. GSI is a statement iterator used to place the new conditional
6409 expression. */
6410 static void
6411 build_vect_cond_expr (enum tree_code code, tree vop[3], tree mask,
6412 gimple_stmt_iterator *gsi)
6414 switch (code)
6416 case DOT_PROD_EXPR:
6418 tree vectype = TREE_TYPE (vop[1]);
6419 tree zero = build_zero_cst (vectype);
6420 tree masked_op1 = make_temp_ssa_name (vectype, NULL, "masked_op1");
6421 gassign *select = gimple_build_assign (masked_op1, VEC_COND_EXPR,
6422 mask, vop[1], zero);
6423 gsi_insert_before (gsi, select, GSI_SAME_STMT);
6424 vop[1] = masked_op1;
6425 break;
6428 case SAD_EXPR:
6430 tree vectype = TREE_TYPE (vop[1]);
6431 tree masked_op1 = make_temp_ssa_name (vectype, NULL, "masked_op1");
6432 gassign *select = gimple_build_assign (masked_op1, VEC_COND_EXPR,
6433 mask, vop[1], vop[0]);
6434 gsi_insert_before (gsi, select, GSI_SAME_STMT);
6435 vop[1] = masked_op1;
6436 break;
6439 default:
6440 gcc_unreachable ();
6444 /* Function vectorizable_reduction.
6446 Check if STMT_INFO performs a reduction operation that can be vectorized.
6447 If VEC_STMT is also passed, vectorize STMT_INFO: create a vectorized
6448 stmt to replace it, put it in VEC_STMT, and insert it at GSI.
6449 Return true if STMT_INFO is vectorizable in this way.
6451 This function also handles reduction idioms (patterns) that have been
6452 recognized in advance during vect_pattern_recog. In this case, STMT_INFO
6453 may be of this form:
6454 X = pattern_expr (arg0, arg1, ..., X)
6455 and its STMT_VINFO_RELATED_STMT points to the last stmt in the original
6456 sequence that had been detected and replaced by the pattern-stmt
6457 (STMT_INFO).
6459 This function also handles reduction of condition expressions, for example:
6460 for (int i = 0; i < N; i++)
6461 if (a[i] < value)
6462 last = a[i];
6463 This is handled by vectorising the loop and creating an additional vector
6464 containing the loop indexes for which "a[i] < value" was true. In the
6465 function epilogue this is reduced to a single max value and then used to
6466 index into the vector of results.
6468 In some cases of reduction patterns, the type of the reduction variable X is
6469 different than the type of the other arguments of STMT_INFO.
6470 In such cases, the vectype that is used when transforming STMT_INFO into
6471 a vector stmt is different than the vectype that is used to determine the
6472 vectorization factor, because it consists of a different number of elements
6473 than the actual number of elements that are being operated upon in parallel.
6475 For example, consider an accumulation of shorts into an int accumulator.
6476 On some targets it's possible to vectorize this pattern operating on 8
6477 shorts at a time (hence, the vectype for purposes of determining the
6478 vectorization factor should be V8HI); on the other hand, the vectype that
6479 is used to create the vector form is actually V4SI (the type of the result).
6481 Upon entry to this function, STMT_VINFO_VECTYPE records the vectype that
6482 indicates what is the actual level of parallelism (V8HI in the example), so
6483 that the right vectorization factor would be derived. This vectype
6484 corresponds to the type of arguments to the reduction stmt, and should *NOT*
6485 be used to create the vectorized stmt. The right vectype for the vectorized
6486 stmt is obtained from the type of the result X:
6487 get_vectype_for_scalar_type (vinfo, TREE_TYPE (X))
6489 This means that, contrary to "regular" reductions (or "regular" stmts in
6490 general), the following equation:
6491 STMT_VINFO_VECTYPE == get_vectype_for_scalar_type (vinfo, TREE_TYPE (X))
6492 does *NOT* necessarily hold for reduction patterns. */
6494 bool
6495 vectorizable_reduction (loop_vec_info loop_vinfo,
6496 stmt_vec_info stmt_info, slp_tree slp_node,
6497 slp_instance slp_node_instance,
6498 stmt_vector_for_cost *cost_vec)
6500 tree scalar_dest;
6501 tree vectype_in = NULL_TREE;
6502 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
6503 enum vect_def_type cond_reduc_dt = vect_unknown_def_type;
6504 stmt_vec_info cond_stmt_vinfo = NULL;
6505 tree scalar_type;
6506 int i;
6507 int ncopies;
6508 bool single_defuse_cycle = false;
6509 bool nested_cycle = false;
6510 bool double_reduc = false;
6511 int vec_num;
6512 tree tem;
6513 tree cr_index_scalar_type = NULL_TREE, cr_index_vector_type = NULL_TREE;
6514 tree cond_reduc_val = NULL_TREE;
6516 /* Make sure it was already recognized as a reduction computation. */
6517 if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_reduction_def
6518 && STMT_VINFO_DEF_TYPE (stmt_info) != vect_double_reduction_def
6519 && STMT_VINFO_DEF_TYPE (stmt_info) != vect_nested_cycle)
6520 return false;
6522 /* The stmt we store reduction analysis meta on. */
6523 stmt_vec_info reduc_info = info_for_reduction (loop_vinfo, stmt_info);
6524 reduc_info->is_reduc_info = true;
6526 if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_nested_cycle)
6528 if (is_a <gphi *> (stmt_info->stmt))
6530 if (slp_node)
6532 /* We eventually need to set a vector type on invariant
6533 arguments. */
6534 unsigned j;
6535 slp_tree child;
6536 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (slp_node), j, child)
6537 if (!vect_maybe_update_slp_op_vectype
6538 (child, SLP_TREE_VECTYPE (slp_node)))
6540 if (dump_enabled_p ())
6541 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6542 "incompatible vector types for "
6543 "invariants\n");
6544 return false;
6547 /* Analysis for double-reduction is done on the outer
6548 loop PHI, nested cycles have no further restrictions. */
6549 STMT_VINFO_TYPE (stmt_info) = cycle_phi_info_type;
6551 else
6552 STMT_VINFO_TYPE (stmt_info) = reduc_vec_info_type;
6553 return true;
6556 stmt_vec_info orig_stmt_of_analysis = stmt_info;
6557 stmt_vec_info phi_info = stmt_info;
6558 if (!is_a <gphi *> (stmt_info->stmt))
6560 STMT_VINFO_TYPE (stmt_info) = reduc_vec_info_type;
6561 return true;
6563 if (slp_node)
6565 slp_node_instance->reduc_phis = slp_node;
6566 /* ??? We're leaving slp_node to point to the PHIs, we only
6567 need it to get at the number of vector stmts which wasn't
6568 yet initialized for the instance root. */
6570 if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_reduction_def)
6571 stmt_info = vect_stmt_to_vectorize (STMT_VINFO_REDUC_DEF (stmt_info));
6572 else
6574 gcc_assert (STMT_VINFO_DEF_TYPE (stmt_info)
6575 == vect_double_reduction_def);
6576 use_operand_p use_p;
6577 gimple *use_stmt;
6578 bool res = single_imm_use (gimple_phi_result (stmt_info->stmt),
6579 &use_p, &use_stmt);
6580 gcc_assert (res);
6581 phi_info = loop_vinfo->lookup_stmt (use_stmt);
6582 stmt_info = vect_stmt_to_vectorize (STMT_VINFO_REDUC_DEF (phi_info));
6585 /* PHIs should not participate in patterns. */
6586 gcc_assert (!STMT_VINFO_RELATED_STMT (phi_info));
6587 gphi *reduc_def_phi = as_a <gphi *> (phi_info->stmt);
6589 /* Verify following REDUC_IDX from the latch def leads us back to the PHI
6590 and compute the reduction chain length. Discover the real
6591 reduction operation stmt on the way (stmt_info and slp_for_stmt_info). */
6592 tree reduc_def
6593 = PHI_ARG_DEF_FROM_EDGE (reduc_def_phi,
6594 loop_latch_edge
6595 (gimple_bb (reduc_def_phi)->loop_father));
6596 unsigned reduc_chain_length = 0;
6597 bool only_slp_reduc_chain = true;
6598 stmt_info = NULL;
6599 slp_tree slp_for_stmt_info = slp_node ? slp_node_instance->root : NULL;
6600 while (reduc_def != PHI_RESULT (reduc_def_phi))
6602 stmt_vec_info def = loop_vinfo->lookup_def (reduc_def);
6603 stmt_vec_info vdef = vect_stmt_to_vectorize (def);
6604 if (STMT_VINFO_REDUC_IDX (vdef) == -1)
6606 if (dump_enabled_p ())
6607 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6608 "reduction chain broken by patterns.\n");
6609 return false;
6611 if (!REDUC_GROUP_FIRST_ELEMENT (vdef))
6612 only_slp_reduc_chain = false;
6613 /* ??? For epilogue generation live members of the chain need
6614 to point back to the PHI via their original stmt for
6615 info_for_reduction to work. */
6616 if (STMT_VINFO_LIVE_P (vdef))
6617 STMT_VINFO_REDUC_DEF (def) = phi_info;
6618 gassign *assign = dyn_cast <gassign *> (vdef->stmt);
6619 if (!assign)
6621 if (dump_enabled_p ())
6622 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6623 "reduction chain includes calls.\n");
6624 return false;
6626 if (CONVERT_EXPR_CODE_P (gimple_assign_rhs_code (assign)))
6628 if (!tree_nop_conversion_p (TREE_TYPE (gimple_assign_lhs (assign)),
6629 TREE_TYPE (gimple_assign_rhs1 (assign))))
6631 if (dump_enabled_p ())
6632 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6633 "conversion in the reduction chain.\n");
6634 return false;
6637 else if (!stmt_info)
6638 /* First non-conversion stmt. */
6639 stmt_info = vdef;
6640 reduc_def = gimple_op (vdef->stmt, 1 + STMT_VINFO_REDUC_IDX (vdef));
6641 reduc_chain_length++;
6642 if (!stmt_info && slp_node)
6643 slp_for_stmt_info = SLP_TREE_CHILDREN (slp_for_stmt_info)[0];
6645 /* PHIs should not participate in patterns. */
6646 gcc_assert (!STMT_VINFO_RELATED_STMT (phi_info));
6648 if (nested_in_vect_loop_p (loop, stmt_info))
6650 loop = loop->inner;
6651 nested_cycle = true;
6654 /* STMT_VINFO_REDUC_DEF doesn't point to the first but the last
6655 element. */
6656 if (slp_node && REDUC_GROUP_FIRST_ELEMENT (stmt_info))
6658 gcc_assert (!REDUC_GROUP_NEXT_ELEMENT (stmt_info));
6659 stmt_info = REDUC_GROUP_FIRST_ELEMENT (stmt_info);
6661 if (REDUC_GROUP_FIRST_ELEMENT (stmt_info))
6662 gcc_assert (slp_node
6663 && REDUC_GROUP_FIRST_ELEMENT (stmt_info) == stmt_info);
6665 /* 1. Is vectorizable reduction? */
6666 /* Not supportable if the reduction variable is used in the loop, unless
6667 it's a reduction chain. */
6668 if (STMT_VINFO_RELEVANT (stmt_info) > vect_used_in_outer
6669 && !REDUC_GROUP_FIRST_ELEMENT (stmt_info))
6670 return false;
6672 /* Reductions that are not used even in an enclosing outer-loop,
6673 are expected to be "live" (used out of the loop). */
6674 if (STMT_VINFO_RELEVANT (stmt_info) == vect_unused_in_scope
6675 && !STMT_VINFO_LIVE_P (stmt_info))
6676 return false;
6678 /* 2. Has this been recognized as a reduction pattern?
6680 Check if STMT represents a pattern that has been recognized
6681 in earlier analysis stages. For stmts that represent a pattern,
6682 the STMT_VINFO_RELATED_STMT field records the last stmt in
6683 the original sequence that constitutes the pattern. */
6685 stmt_vec_info orig_stmt_info = STMT_VINFO_RELATED_STMT (stmt_info);
6686 if (orig_stmt_info)
6688 gcc_assert (STMT_VINFO_IN_PATTERN_P (orig_stmt_info));
6689 gcc_assert (!STMT_VINFO_IN_PATTERN_P (stmt_info));
6692 /* 3. Check the operands of the operation. The first operands are defined
6693 inside the loop body. The last operand is the reduction variable,
6694 which is defined by the loop-header-phi. */
6696 tree vectype_out = STMT_VINFO_VECTYPE (stmt_info);
6697 STMT_VINFO_REDUC_VECTYPE (reduc_info) = vectype_out;
6698 gassign *stmt = as_a <gassign *> (stmt_info->stmt);
6699 enum tree_code code = gimple_assign_rhs_code (stmt);
6700 bool lane_reduc_code_p
6701 = (code == DOT_PROD_EXPR || code == WIDEN_SUM_EXPR || code == SAD_EXPR);
6702 int op_type = TREE_CODE_LENGTH (code);
6703 enum optab_subtype optab_query_kind = optab_vector;
6704 if (code == DOT_PROD_EXPR
6705 && TYPE_SIGN (TREE_TYPE (gimple_assign_rhs1 (stmt)))
6706 != TYPE_SIGN (TREE_TYPE (gimple_assign_rhs2 (stmt))))
6707 optab_query_kind = optab_vector_mixed_sign;
6710 scalar_dest = gimple_assign_lhs (stmt);
6711 scalar_type = TREE_TYPE (scalar_dest);
6712 if (!POINTER_TYPE_P (scalar_type) && !INTEGRAL_TYPE_P (scalar_type)
6713 && !SCALAR_FLOAT_TYPE_P (scalar_type))
6714 return false;
6716 /* Do not try to vectorize bit-precision reductions. */
6717 if (!type_has_mode_precision_p (scalar_type))
6718 return false;
6720 /* For lane-reducing ops we're reducing the number of reduction PHIs
6721 which means the only use of that may be in the lane-reducing operation. */
6722 if (lane_reduc_code_p
6723 && reduc_chain_length != 1
6724 && !only_slp_reduc_chain)
6726 if (dump_enabled_p ())
6727 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6728 "lane-reducing reduction with extra stmts.\n");
6729 return false;
6732 /* All uses but the last are expected to be defined in the loop.
6733 The last use is the reduction variable. In case of nested cycle this
6734 assumption is not true: we use reduc_index to record the index of the
6735 reduction variable. */
6736 slp_tree *slp_op = XALLOCAVEC (slp_tree, op_type);
6737 /* We need to skip an extra operand for COND_EXPRs with embedded
6738 comparison. */
6739 unsigned opno_adjust = 0;
6740 if (code == COND_EXPR
6741 && COMPARISON_CLASS_P (gimple_assign_rhs1 (stmt)))
6742 opno_adjust = 1;
6743 for (i = 0; i < op_type; i++)
6745 /* The condition of COND_EXPR is checked in vectorizable_condition(). */
6746 if (i == 0 && code == COND_EXPR)
6747 continue;
6749 stmt_vec_info def_stmt_info;
6750 enum vect_def_type dt;
6751 tree op;
6752 if (!vect_is_simple_use (loop_vinfo, stmt_info, slp_for_stmt_info,
6753 i + opno_adjust, &op, &slp_op[i], &dt, &tem,
6754 &def_stmt_info))
6756 if (dump_enabled_p ())
6757 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6758 "use not simple.\n");
6759 return false;
6761 if (i == STMT_VINFO_REDUC_IDX (stmt_info))
6762 continue;
6764 /* There should be only one cycle def in the stmt, the one
6765 leading to reduc_def. */
6766 if (VECTORIZABLE_CYCLE_DEF (dt))
6767 return false;
6769 /* To properly compute ncopies we are interested in the widest
6770 non-reduction input type in case we're looking at a widening
6771 accumulation that we later handle in vect_transform_reduction. */
6772 if (lane_reduc_code_p
6773 && tem
6774 && (!vectype_in
6775 || (GET_MODE_SIZE (SCALAR_TYPE_MODE (TREE_TYPE (vectype_in)))
6776 < GET_MODE_SIZE (SCALAR_TYPE_MODE (TREE_TYPE (tem))))))
6777 vectype_in = tem;
6779 if (code == COND_EXPR)
6781 /* Record how the non-reduction-def value of COND_EXPR is defined. */
6782 if (dt == vect_constant_def)
6784 cond_reduc_dt = dt;
6785 cond_reduc_val = op;
6787 if (dt == vect_induction_def
6788 && def_stmt_info
6789 && is_nonwrapping_integer_induction (def_stmt_info, loop))
6791 cond_reduc_dt = dt;
6792 cond_stmt_vinfo = def_stmt_info;
6796 if (!vectype_in)
6797 vectype_in = STMT_VINFO_VECTYPE (phi_info);
6798 STMT_VINFO_REDUC_VECTYPE_IN (reduc_info) = vectype_in;
6800 enum vect_reduction_type v_reduc_type = STMT_VINFO_REDUC_TYPE (phi_info);
6801 STMT_VINFO_REDUC_TYPE (reduc_info) = v_reduc_type;
6802 /* If we have a condition reduction, see if we can simplify it further. */
6803 if (v_reduc_type == COND_REDUCTION)
6805 if (slp_node)
6806 return false;
6808 /* When the condition uses the reduction value in the condition, fail. */
6809 if (STMT_VINFO_REDUC_IDX (stmt_info) == 0)
6811 if (dump_enabled_p ())
6812 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6813 "condition depends on previous iteration\n");
6814 return false;
6817 if (reduc_chain_length == 1
6818 && direct_internal_fn_supported_p (IFN_FOLD_EXTRACT_LAST,
6819 vectype_in, OPTIMIZE_FOR_SPEED))
6821 if (dump_enabled_p ())
6822 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6823 "optimizing condition reduction with"
6824 " FOLD_EXTRACT_LAST.\n");
6825 STMT_VINFO_REDUC_TYPE (reduc_info) = EXTRACT_LAST_REDUCTION;
6827 else if (cond_reduc_dt == vect_induction_def)
6829 tree base
6830 = STMT_VINFO_LOOP_PHI_EVOLUTION_BASE_UNCHANGED (cond_stmt_vinfo);
6831 tree step = STMT_VINFO_LOOP_PHI_EVOLUTION_PART (cond_stmt_vinfo);
6833 gcc_assert (TREE_CODE (base) == INTEGER_CST
6834 && TREE_CODE (step) == INTEGER_CST);
6835 cond_reduc_val = NULL_TREE;
6836 enum tree_code cond_reduc_op_code = ERROR_MARK;
6837 tree res = PHI_RESULT (STMT_VINFO_STMT (cond_stmt_vinfo));
6838 if (!types_compatible_p (TREE_TYPE (res), TREE_TYPE (base)))
6840 /* Find a suitable value, for MAX_EXPR below base, for MIN_EXPR
6841 above base; punt if base is the minimum value of the type for
6842 MAX_EXPR or maximum value of the type for MIN_EXPR for now. */
6843 else if (tree_int_cst_sgn (step) == -1)
6845 cond_reduc_op_code = MIN_EXPR;
6846 if (tree_int_cst_sgn (base) == -1)
6847 cond_reduc_val = build_int_cst (TREE_TYPE (base), 0);
6848 else if (tree_int_cst_lt (base,
6849 TYPE_MAX_VALUE (TREE_TYPE (base))))
6850 cond_reduc_val
6851 = int_const_binop (PLUS_EXPR, base, integer_one_node);
6853 else
6855 cond_reduc_op_code = MAX_EXPR;
6856 if (tree_int_cst_sgn (base) == 1)
6857 cond_reduc_val = build_int_cst (TREE_TYPE (base), 0);
6858 else if (tree_int_cst_lt (TYPE_MIN_VALUE (TREE_TYPE (base)),
6859 base))
6860 cond_reduc_val
6861 = int_const_binop (MINUS_EXPR, base, integer_one_node);
6863 if (cond_reduc_val)
6865 if (dump_enabled_p ())
6866 dump_printf_loc (MSG_NOTE, vect_location,
6867 "condition expression based on "
6868 "integer induction.\n");
6869 STMT_VINFO_REDUC_CODE (reduc_info) = cond_reduc_op_code;
6870 STMT_VINFO_VEC_INDUC_COND_INITIAL_VAL (reduc_info)
6871 = cond_reduc_val;
6872 STMT_VINFO_REDUC_TYPE (reduc_info) = INTEGER_INDUC_COND_REDUCTION;
6875 else if (cond_reduc_dt == vect_constant_def)
6877 enum vect_def_type cond_initial_dt;
6878 tree cond_initial_val = vect_phi_initial_value (reduc_def_phi);
6879 vect_is_simple_use (cond_initial_val, loop_vinfo, &cond_initial_dt);
6880 if (cond_initial_dt == vect_constant_def
6881 && types_compatible_p (TREE_TYPE (cond_initial_val),
6882 TREE_TYPE (cond_reduc_val)))
6884 tree e = fold_binary (LE_EXPR, boolean_type_node,
6885 cond_initial_val, cond_reduc_val);
6886 if (e && (integer_onep (e) || integer_zerop (e)))
6888 if (dump_enabled_p ())
6889 dump_printf_loc (MSG_NOTE, vect_location,
6890 "condition expression based on "
6891 "compile time constant.\n");
6892 /* Record reduction code at analysis stage. */
6893 STMT_VINFO_REDUC_CODE (reduc_info)
6894 = integer_onep (e) ? MAX_EXPR : MIN_EXPR;
6895 STMT_VINFO_REDUC_TYPE (reduc_info) = CONST_COND_REDUCTION;
6901 if (STMT_VINFO_LIVE_P (phi_info))
6902 return false;
6904 if (slp_node)
6905 ncopies = 1;
6906 else
6907 ncopies = vect_get_num_copies (loop_vinfo, vectype_in);
6909 gcc_assert (ncopies >= 1);
6911 poly_uint64 nunits_out = TYPE_VECTOR_SUBPARTS (vectype_out);
6913 if (nested_cycle)
6915 gcc_assert (STMT_VINFO_DEF_TYPE (reduc_info)
6916 == vect_double_reduction_def);
6917 double_reduc = true;
6920 /* 4.2. Check support for the epilog operation.
6922 If STMT represents a reduction pattern, then the type of the
6923 reduction variable may be different than the type of the rest
6924 of the arguments. For example, consider the case of accumulation
6925 of shorts into an int accumulator; The original code:
6926 S1: int_a = (int) short_a;
6927 orig_stmt-> S2: int_acc = plus <int_a ,int_acc>;
6929 was replaced with:
6930 STMT: int_acc = widen_sum <short_a, int_acc>
6932 This means that:
6933 1. The tree-code that is used to create the vector operation in the
6934 epilog code (that reduces the partial results) is not the
6935 tree-code of STMT, but is rather the tree-code of the original
6936 stmt from the pattern that STMT is replacing. I.e, in the example
6937 above we want to use 'widen_sum' in the loop, but 'plus' in the
6938 epilog.
6939 2. The type (mode) we use to check available target support
6940 for the vector operation to be created in the *epilog*, is
6941 determined by the type of the reduction variable (in the example
6942 above we'd check this: optab_handler (plus_optab, vect_int_mode])).
6943 However the type (mode) we use to check available target support
6944 for the vector operation to be created *inside the loop*, is
6945 determined by the type of the other arguments to STMT (in the
6946 example we'd check this: optab_handler (widen_sum_optab,
6947 vect_short_mode)).
6949 This is contrary to "regular" reductions, in which the types of all
6950 the arguments are the same as the type of the reduction variable.
6951 For "regular" reductions we can therefore use the same vector type
6952 (and also the same tree-code) when generating the epilog code and
6953 when generating the code inside the loop. */
6955 enum tree_code orig_code = STMT_VINFO_REDUC_CODE (phi_info);
6956 STMT_VINFO_REDUC_CODE (reduc_info) = orig_code;
6958 vect_reduction_type reduction_type = STMT_VINFO_REDUC_TYPE (reduc_info);
6959 if (reduction_type == TREE_CODE_REDUCTION)
6961 /* Check whether it's ok to change the order of the computation.
6962 Generally, when vectorizing a reduction we change the order of the
6963 computation. This may change the behavior of the program in some
6964 cases, so we need to check that this is ok. One exception is when
6965 vectorizing an outer-loop: the inner-loop is executed sequentially,
6966 and therefore vectorizing reductions in the inner-loop during
6967 outer-loop vectorization is safe. Likewise when we are vectorizing
6968 a series of reductions using SLP and the VF is one the reductions
6969 are performed in scalar order. */
6970 if (slp_node
6971 && !REDUC_GROUP_FIRST_ELEMENT (stmt_info)
6972 && known_eq (LOOP_VINFO_VECT_FACTOR (loop_vinfo), 1u))
6974 else if (needs_fold_left_reduction_p (scalar_type, orig_code))
6976 /* When vectorizing a reduction chain w/o SLP the reduction PHI
6977 is not directy used in stmt. */
6978 if (!only_slp_reduc_chain
6979 && reduc_chain_length != 1)
6981 if (dump_enabled_p ())
6982 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6983 "in-order reduction chain without SLP.\n");
6984 return false;
6986 STMT_VINFO_REDUC_TYPE (reduc_info)
6987 = reduction_type = FOLD_LEFT_REDUCTION;
6989 else if (!commutative_tree_code (orig_code)
6990 || !associative_tree_code (orig_code))
6992 if (dump_enabled_p ())
6993 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6994 "reduction: not commutative/associative");
6995 return false;
6999 if ((double_reduc || reduction_type != TREE_CODE_REDUCTION)
7000 && ncopies > 1)
7002 if (dump_enabled_p ())
7003 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7004 "multiple types in double reduction or condition "
7005 "reduction or fold-left reduction.\n");
7006 return false;
7009 internal_fn reduc_fn = IFN_LAST;
7010 if (reduction_type == TREE_CODE_REDUCTION
7011 || reduction_type == FOLD_LEFT_REDUCTION
7012 || reduction_type == INTEGER_INDUC_COND_REDUCTION
7013 || reduction_type == CONST_COND_REDUCTION)
7015 if (reduction_type == FOLD_LEFT_REDUCTION
7016 ? fold_left_reduction_fn (orig_code, &reduc_fn)
7017 : reduction_fn_for_scalar_code (orig_code, &reduc_fn))
7019 if (reduc_fn != IFN_LAST
7020 && !direct_internal_fn_supported_p (reduc_fn, vectype_out,
7021 OPTIMIZE_FOR_SPEED))
7023 if (dump_enabled_p ())
7024 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7025 "reduc op not supported by target.\n");
7027 reduc_fn = IFN_LAST;
7030 else
7032 if (!nested_cycle || double_reduc)
7034 if (dump_enabled_p ())
7035 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7036 "no reduc code for scalar code.\n");
7038 return false;
7042 else if (reduction_type == COND_REDUCTION)
7044 int scalar_precision
7045 = GET_MODE_PRECISION (SCALAR_TYPE_MODE (scalar_type));
7046 cr_index_scalar_type = make_unsigned_type (scalar_precision);
7047 cr_index_vector_type = get_same_sized_vectype (cr_index_scalar_type,
7048 vectype_out);
7050 if (direct_internal_fn_supported_p (IFN_REDUC_MAX, cr_index_vector_type,
7051 OPTIMIZE_FOR_SPEED))
7052 reduc_fn = IFN_REDUC_MAX;
7054 STMT_VINFO_REDUC_FN (reduc_info) = reduc_fn;
7056 if (reduction_type != EXTRACT_LAST_REDUCTION
7057 && (!nested_cycle || double_reduc)
7058 && reduc_fn == IFN_LAST
7059 && !nunits_out.is_constant ())
7061 if (dump_enabled_p ())
7062 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7063 "missing target support for reduction on"
7064 " variable-length vectors.\n");
7065 return false;
7068 /* For SLP reductions, see if there is a neutral value we can use. */
7069 tree neutral_op = NULL_TREE;
7070 if (slp_node)
7072 tree initial_value = NULL_TREE;
7073 if (REDUC_GROUP_FIRST_ELEMENT (stmt_info) != NULL)
7074 initial_value = vect_phi_initial_value (reduc_def_phi);
7075 neutral_op = neutral_op_for_reduction (TREE_TYPE (vectype_out),
7076 orig_code, initial_value);
7079 if (double_reduc && reduction_type == FOLD_LEFT_REDUCTION)
7081 /* We can't support in-order reductions of code such as this:
7083 for (int i = 0; i < n1; ++i)
7084 for (int j = 0; j < n2; ++j)
7085 l += a[j];
7087 since GCC effectively transforms the loop when vectorizing:
7089 for (int i = 0; i < n1 / VF; ++i)
7090 for (int j = 0; j < n2; ++j)
7091 for (int k = 0; k < VF; ++k)
7092 l += a[j];
7094 which is a reassociation of the original operation. */
7095 if (dump_enabled_p ())
7096 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7097 "in-order double reduction not supported.\n");
7099 return false;
7102 if (reduction_type == FOLD_LEFT_REDUCTION
7103 && slp_node
7104 && !REDUC_GROUP_FIRST_ELEMENT (stmt_info))
7106 /* We cannot use in-order reductions in this case because there is
7107 an implicit reassociation of the operations involved. */
7108 if (dump_enabled_p ())
7109 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7110 "in-order unchained SLP reductions not supported.\n");
7111 return false;
7114 /* For double reductions, and for SLP reductions with a neutral value,
7115 we construct a variable-length initial vector by loading a vector
7116 full of the neutral value and then shift-and-inserting the start
7117 values into the low-numbered elements. */
7118 if ((double_reduc || neutral_op)
7119 && !nunits_out.is_constant ()
7120 && !direct_internal_fn_supported_p (IFN_VEC_SHL_INSERT,
7121 vectype_out, OPTIMIZE_FOR_SPEED))
7123 if (dump_enabled_p ())
7124 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7125 "reduction on variable-length vectors requires"
7126 " target support for a vector-shift-and-insert"
7127 " operation.\n");
7128 return false;
7131 /* Check extra constraints for variable-length unchained SLP reductions. */
7132 if (STMT_SLP_TYPE (stmt_info)
7133 && !REDUC_GROUP_FIRST_ELEMENT (stmt_info)
7134 && !nunits_out.is_constant ())
7136 /* We checked above that we could build the initial vector when
7137 there's a neutral element value. Check here for the case in
7138 which each SLP statement has its own initial value and in which
7139 that value needs to be repeated for every instance of the
7140 statement within the initial vector. */
7141 unsigned int group_size = SLP_TREE_LANES (slp_node);
7142 if (!neutral_op
7143 && !can_duplicate_and_interleave_p (loop_vinfo, group_size,
7144 TREE_TYPE (vectype_out)))
7146 if (dump_enabled_p ())
7147 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7148 "unsupported form of SLP reduction for"
7149 " variable-length vectors: cannot build"
7150 " initial vector.\n");
7151 return false;
7153 /* The epilogue code relies on the number of elements being a multiple
7154 of the group size. The duplicate-and-interleave approach to setting
7155 up the initial vector does too. */
7156 if (!multiple_p (nunits_out, group_size))
7158 if (dump_enabled_p ())
7159 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7160 "unsupported form of SLP reduction for"
7161 " variable-length vectors: the vector size"
7162 " is not a multiple of the number of results.\n");
7163 return false;
7167 if (reduction_type == COND_REDUCTION)
7169 widest_int ni;
7171 if (! max_loop_iterations (loop, &ni))
7173 if (dump_enabled_p ())
7174 dump_printf_loc (MSG_NOTE, vect_location,
7175 "loop count not known, cannot create cond "
7176 "reduction.\n");
7177 return false;
7179 /* Convert backedges to iterations. */
7180 ni += 1;
7182 /* The additional index will be the same type as the condition. Check
7183 that the loop can fit into this less one (because we'll use up the
7184 zero slot for when there are no matches). */
7185 tree max_index = TYPE_MAX_VALUE (cr_index_scalar_type);
7186 if (wi::geu_p (ni, wi::to_widest (max_index)))
7188 if (dump_enabled_p ())
7189 dump_printf_loc (MSG_NOTE, vect_location,
7190 "loop size is greater than data size.\n");
7191 return false;
7195 /* In case the vectorization factor (VF) is bigger than the number
7196 of elements that we can fit in a vectype (nunits), we have to generate
7197 more than one vector stmt - i.e - we need to "unroll" the
7198 vector stmt by a factor VF/nunits. For more details see documentation
7199 in vectorizable_operation. */
7201 /* If the reduction is used in an outer loop we need to generate
7202 VF intermediate results, like so (e.g. for ncopies=2):
7203 r0 = phi (init, r0)
7204 r1 = phi (init, r1)
7205 r0 = x0 + r0;
7206 r1 = x1 + r1;
7207 (i.e. we generate VF results in 2 registers).
7208 In this case we have a separate def-use cycle for each copy, and therefore
7209 for each copy we get the vector def for the reduction variable from the
7210 respective phi node created for this copy.
7212 Otherwise (the reduction is unused in the loop nest), we can combine
7213 together intermediate results, like so (e.g. for ncopies=2):
7214 r = phi (init, r)
7215 r = x0 + r;
7216 r = x1 + r;
7217 (i.e. we generate VF/2 results in a single register).
7218 In this case for each copy we get the vector def for the reduction variable
7219 from the vectorized reduction operation generated in the previous iteration.
7221 This only works when we see both the reduction PHI and its only consumer
7222 in vectorizable_reduction and there are no intermediate stmts
7223 participating. */
7224 if (ncopies > 1
7225 && (STMT_VINFO_RELEVANT (stmt_info) <= vect_used_only_live)
7226 && reduc_chain_length == 1)
7227 single_defuse_cycle = true;
7229 if (single_defuse_cycle || lane_reduc_code_p)
7231 gcc_assert (code != COND_EXPR);
7233 /* 4. Supportable by target? */
7234 bool ok = true;
7236 /* 4.1. check support for the operation in the loop */
7237 optab optab = optab_for_tree_code (code, vectype_in, optab_query_kind);
7238 if (!optab)
7240 if (dump_enabled_p ())
7241 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7242 "no optab.\n");
7243 ok = false;
7246 machine_mode vec_mode = TYPE_MODE (vectype_in);
7247 if (ok && optab_handler (optab, vec_mode) == CODE_FOR_nothing)
7249 if (dump_enabled_p ())
7250 dump_printf (MSG_NOTE, "op not supported by target.\n");
7251 if (maybe_ne (GET_MODE_SIZE (vec_mode), UNITS_PER_WORD)
7252 || !vect_can_vectorize_without_simd_p (code))
7253 ok = false;
7254 else
7255 if (dump_enabled_p ())
7256 dump_printf (MSG_NOTE, "proceeding using word mode.\n");
7259 if (vect_emulated_vector_p (vectype_in)
7260 && !vect_can_vectorize_without_simd_p (code))
7262 if (dump_enabled_p ())
7263 dump_printf (MSG_NOTE, "using word mode not possible.\n");
7264 return false;
7267 /* lane-reducing operations have to go through vect_transform_reduction.
7268 For the other cases try without the single cycle optimization. */
7269 if (!ok)
7271 if (lane_reduc_code_p)
7272 return false;
7273 else
7274 single_defuse_cycle = false;
7277 STMT_VINFO_FORCE_SINGLE_CYCLE (reduc_info) = single_defuse_cycle;
7279 /* If the reduction stmt is one of the patterns that have lane
7280 reduction embedded we cannot handle the case of ! single_defuse_cycle. */
7281 if ((ncopies > 1 && ! single_defuse_cycle)
7282 && lane_reduc_code_p)
7284 if (dump_enabled_p ())
7285 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7286 "multi def-use cycle not possible for lane-reducing "
7287 "reduction operation\n");
7288 return false;
7291 if (slp_node
7292 && !(!single_defuse_cycle
7293 && code != DOT_PROD_EXPR
7294 && code != WIDEN_SUM_EXPR
7295 && code != SAD_EXPR
7296 && reduction_type != FOLD_LEFT_REDUCTION))
7297 for (i = 0; i < op_type; i++)
7298 if (!vect_maybe_update_slp_op_vectype (slp_op[i], vectype_in))
7300 if (dump_enabled_p ())
7301 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7302 "incompatible vector types for invariants\n");
7303 return false;
7306 if (slp_node)
7307 vec_num = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node);
7308 else
7309 vec_num = 1;
7311 vect_model_reduction_cost (loop_vinfo, stmt_info, reduc_fn,
7312 reduction_type, ncopies, cost_vec);
7313 /* Cost the reduction op inside the loop if transformed via
7314 vect_transform_reduction. Otherwise this is costed by the
7315 separate vectorizable_* routines. */
7316 if (single_defuse_cycle
7317 || code == DOT_PROD_EXPR
7318 || code == WIDEN_SUM_EXPR
7319 || code == SAD_EXPR)
7320 record_stmt_cost (cost_vec, ncopies, vector_stmt, stmt_info, 0, vect_body);
7322 if (dump_enabled_p ()
7323 && reduction_type == FOLD_LEFT_REDUCTION)
7324 dump_printf_loc (MSG_NOTE, vect_location,
7325 "using an in-order (fold-left) reduction.\n");
7326 STMT_VINFO_TYPE (orig_stmt_of_analysis) = cycle_phi_info_type;
7327 /* All but single defuse-cycle optimized, lane-reducing and fold-left
7328 reductions go through their own vectorizable_* routines. */
7329 if (!single_defuse_cycle
7330 && code != DOT_PROD_EXPR
7331 && code != WIDEN_SUM_EXPR
7332 && code != SAD_EXPR
7333 && reduction_type != FOLD_LEFT_REDUCTION)
7335 stmt_vec_info tem
7336 = vect_stmt_to_vectorize (STMT_VINFO_REDUC_DEF (phi_info));
7337 if (slp_node && REDUC_GROUP_FIRST_ELEMENT (tem))
7339 gcc_assert (!REDUC_GROUP_NEXT_ELEMENT (tem));
7340 tem = REDUC_GROUP_FIRST_ELEMENT (tem);
7342 STMT_VINFO_DEF_TYPE (vect_orig_stmt (tem)) = vect_internal_def;
7343 STMT_VINFO_DEF_TYPE (tem) = vect_internal_def;
7345 else if (loop_vinfo && LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo))
7347 vec_loop_masks *masks = &LOOP_VINFO_MASKS (loop_vinfo);
7348 internal_fn cond_fn = get_conditional_internal_fn (code);
7350 if (reduction_type != FOLD_LEFT_REDUCTION
7351 && !use_mask_by_cond_expr_p (code, cond_fn, vectype_in)
7352 && (cond_fn == IFN_LAST
7353 || !direct_internal_fn_supported_p (cond_fn, vectype_in,
7354 OPTIMIZE_FOR_SPEED)))
7356 if (dump_enabled_p ())
7357 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7358 "can't operate on partial vectors because"
7359 " no conditional operation is available.\n");
7360 LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo) = false;
7362 else if (reduction_type == FOLD_LEFT_REDUCTION
7363 && reduc_fn == IFN_LAST
7364 && !expand_vec_cond_expr_p (vectype_in,
7365 truth_type_for (vectype_in),
7366 SSA_NAME))
7368 if (dump_enabled_p ())
7369 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7370 "can't operate on partial vectors because"
7371 " no conditional operation is available.\n");
7372 LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo) = false;
7374 else
7375 vect_record_loop_mask (loop_vinfo, masks, ncopies * vec_num,
7376 vectype_in, NULL);
7378 return true;
7381 /* Transform the definition stmt STMT_INFO of a reduction PHI backedge
7382 value. */
7384 bool
7385 vect_transform_reduction (loop_vec_info loop_vinfo,
7386 stmt_vec_info stmt_info, gimple_stmt_iterator *gsi,
7387 gimple **vec_stmt, slp_tree slp_node)
7389 tree vectype_out = STMT_VINFO_VECTYPE (stmt_info);
7390 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
7391 int i;
7392 int ncopies;
7393 int vec_num;
7395 stmt_vec_info reduc_info = info_for_reduction (loop_vinfo, stmt_info);
7396 gcc_assert (reduc_info->is_reduc_info);
7398 if (nested_in_vect_loop_p (loop, stmt_info))
7400 loop = loop->inner;
7401 gcc_assert (STMT_VINFO_DEF_TYPE (reduc_info) == vect_double_reduction_def);
7404 gassign *stmt = as_a <gassign *> (stmt_info->stmt);
7405 enum tree_code code = gimple_assign_rhs_code (stmt);
7406 int op_type = TREE_CODE_LENGTH (code);
7408 /* Flatten RHS. */
7409 tree ops[3];
7410 switch (get_gimple_rhs_class (code))
7412 case GIMPLE_TERNARY_RHS:
7413 ops[2] = gimple_assign_rhs3 (stmt);
7414 /* Fall thru. */
7415 case GIMPLE_BINARY_RHS:
7416 ops[0] = gimple_assign_rhs1 (stmt);
7417 ops[1] = gimple_assign_rhs2 (stmt);
7418 break;
7419 default:
7420 gcc_unreachable ();
7423 /* All uses but the last are expected to be defined in the loop.
7424 The last use is the reduction variable. In case of nested cycle this
7425 assumption is not true: we use reduc_index to record the index of the
7426 reduction variable. */
7427 stmt_vec_info phi_info = STMT_VINFO_REDUC_DEF (vect_orig_stmt (stmt_info));
7428 gphi *reduc_def_phi = as_a <gphi *> (phi_info->stmt);
7429 int reduc_index = STMT_VINFO_REDUC_IDX (stmt_info);
7430 tree vectype_in = STMT_VINFO_REDUC_VECTYPE_IN (reduc_info);
7432 if (slp_node)
7434 ncopies = 1;
7435 vec_num = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node);
7437 else
7439 ncopies = vect_get_num_copies (loop_vinfo, vectype_in);
7440 vec_num = 1;
7443 internal_fn cond_fn = get_conditional_internal_fn (code);
7444 vec_loop_masks *masks = &LOOP_VINFO_MASKS (loop_vinfo);
7445 bool mask_by_cond_expr = use_mask_by_cond_expr_p (code, cond_fn, vectype_in);
7447 /* Transform. */
7448 tree new_temp = NULL_TREE;
7449 auto_vec<tree> vec_oprnds0;
7450 auto_vec<tree> vec_oprnds1;
7451 auto_vec<tree> vec_oprnds2;
7452 tree def0;
7454 if (dump_enabled_p ())
7455 dump_printf_loc (MSG_NOTE, vect_location, "transform reduction.\n");
7457 /* FORNOW: Multiple types are not supported for condition. */
7458 if (code == COND_EXPR)
7459 gcc_assert (ncopies == 1);
7461 bool masked_loop_p = LOOP_VINFO_FULLY_MASKED_P (loop_vinfo);
7463 vect_reduction_type reduction_type = STMT_VINFO_REDUC_TYPE (reduc_info);
7464 if (reduction_type == FOLD_LEFT_REDUCTION)
7466 internal_fn reduc_fn = STMT_VINFO_REDUC_FN (reduc_info);
7467 return vectorize_fold_left_reduction
7468 (loop_vinfo, stmt_info, gsi, vec_stmt, slp_node, reduc_def_phi, code,
7469 reduc_fn, ops, vectype_in, reduc_index, masks);
7472 bool single_defuse_cycle = STMT_VINFO_FORCE_SINGLE_CYCLE (reduc_info);
7473 gcc_assert (single_defuse_cycle
7474 || code == DOT_PROD_EXPR
7475 || code == WIDEN_SUM_EXPR
7476 || code == SAD_EXPR);
7478 /* Create the destination vector */
7479 tree scalar_dest = gimple_assign_lhs (stmt);
7480 tree vec_dest = vect_create_destination_var (scalar_dest, vectype_out);
7482 vect_get_vec_defs (loop_vinfo, stmt_info, slp_node, ncopies,
7483 single_defuse_cycle && reduc_index == 0
7484 ? NULL_TREE : ops[0], &vec_oprnds0,
7485 single_defuse_cycle && reduc_index == 1
7486 ? NULL_TREE : ops[1], &vec_oprnds1,
7487 op_type == ternary_op
7488 && !(single_defuse_cycle && reduc_index == 2)
7489 ? ops[2] : NULL_TREE, &vec_oprnds2);
7490 if (single_defuse_cycle)
7492 gcc_assert (!slp_node);
7493 vect_get_vec_defs_for_operand (loop_vinfo, stmt_info, 1,
7494 ops[reduc_index],
7495 reduc_index == 0 ? &vec_oprnds0
7496 : (reduc_index == 1 ? &vec_oprnds1
7497 : &vec_oprnds2));
7500 FOR_EACH_VEC_ELT (vec_oprnds0, i, def0)
7502 gimple *new_stmt;
7503 tree vop[3] = { def0, vec_oprnds1[i], NULL_TREE };
7504 if (masked_loop_p && !mask_by_cond_expr)
7506 /* Make sure that the reduction accumulator is vop[0]. */
7507 if (reduc_index == 1)
7509 gcc_assert (commutative_tree_code (code));
7510 std::swap (vop[0], vop[1]);
7512 tree mask = vect_get_loop_mask (gsi, masks, vec_num * ncopies,
7513 vectype_in, i);
7514 gcall *call = gimple_build_call_internal (cond_fn, 4, mask,
7515 vop[0], vop[1], vop[0]);
7516 new_temp = make_ssa_name (vec_dest, call);
7517 gimple_call_set_lhs (call, new_temp);
7518 gimple_call_set_nothrow (call, true);
7519 vect_finish_stmt_generation (loop_vinfo, stmt_info, call, gsi);
7520 new_stmt = call;
7522 else
7524 if (op_type == ternary_op)
7525 vop[2] = vec_oprnds2[i];
7527 if (masked_loop_p && mask_by_cond_expr)
7529 tree mask = vect_get_loop_mask (gsi, masks, vec_num * ncopies,
7530 vectype_in, i);
7531 build_vect_cond_expr (code, vop, mask, gsi);
7534 new_stmt = gimple_build_assign (vec_dest, code,
7535 vop[0], vop[1], vop[2]);
7536 new_temp = make_ssa_name (vec_dest, new_stmt);
7537 gimple_assign_set_lhs (new_stmt, new_temp);
7538 vect_finish_stmt_generation (loop_vinfo, stmt_info, new_stmt, gsi);
7541 if (slp_node)
7542 SLP_TREE_VEC_STMTS (slp_node).quick_push (new_stmt);
7543 else if (single_defuse_cycle
7544 && i < ncopies - 1)
7546 if (reduc_index == 0)
7547 vec_oprnds0.safe_push (gimple_get_lhs (new_stmt));
7548 else if (reduc_index == 1)
7549 vec_oprnds1.safe_push (gimple_get_lhs (new_stmt));
7550 else if (reduc_index == 2)
7551 vec_oprnds2.safe_push (gimple_get_lhs (new_stmt));
7553 else
7554 STMT_VINFO_VEC_STMTS (stmt_info).safe_push (new_stmt);
7557 if (!slp_node)
7558 *vec_stmt = STMT_VINFO_VEC_STMTS (stmt_info)[0];
7560 return true;
7563 /* Transform phase of a cycle PHI. */
7565 bool
7566 vect_transform_cycle_phi (loop_vec_info loop_vinfo,
7567 stmt_vec_info stmt_info, gimple **vec_stmt,
7568 slp_tree slp_node, slp_instance slp_node_instance)
7570 tree vectype_out = STMT_VINFO_VECTYPE (stmt_info);
7571 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
7572 int i;
7573 int ncopies;
7574 int j;
7575 bool nested_cycle = false;
7576 int vec_num;
7578 if (nested_in_vect_loop_p (loop, stmt_info))
7580 loop = loop->inner;
7581 nested_cycle = true;
7584 stmt_vec_info reduc_stmt_info = STMT_VINFO_REDUC_DEF (stmt_info);
7585 reduc_stmt_info = vect_stmt_to_vectorize (reduc_stmt_info);
7586 stmt_vec_info reduc_info = info_for_reduction (loop_vinfo, stmt_info);
7587 gcc_assert (reduc_info->is_reduc_info);
7589 if (STMT_VINFO_REDUC_TYPE (reduc_info) == EXTRACT_LAST_REDUCTION
7590 || STMT_VINFO_REDUC_TYPE (reduc_info) == FOLD_LEFT_REDUCTION)
7591 /* Leave the scalar phi in place. */
7592 return true;
7594 tree vectype_in = STMT_VINFO_REDUC_VECTYPE_IN (reduc_info);
7595 /* For a nested cycle we do not fill the above. */
7596 if (!vectype_in)
7597 vectype_in = STMT_VINFO_VECTYPE (stmt_info);
7598 gcc_assert (vectype_in);
7600 if (slp_node)
7602 /* The size vect_schedule_slp_instance computes is off for us. */
7603 vec_num = vect_get_num_vectors (LOOP_VINFO_VECT_FACTOR (loop_vinfo)
7604 * SLP_TREE_LANES (slp_node), vectype_in);
7605 ncopies = 1;
7607 else
7609 vec_num = 1;
7610 ncopies = vect_get_num_copies (loop_vinfo, vectype_in);
7613 /* Check whether we should use a single PHI node and accumulate
7614 vectors to one before the backedge. */
7615 if (STMT_VINFO_FORCE_SINGLE_CYCLE (reduc_info))
7616 ncopies = 1;
7618 /* Create the destination vector */
7619 gphi *phi = as_a <gphi *> (stmt_info->stmt);
7620 tree vec_dest = vect_create_destination_var (gimple_phi_result (phi),
7621 vectype_out);
7623 /* Get the loop-entry arguments. */
7624 tree vec_initial_def = NULL_TREE;
7625 auto_vec<tree> vec_initial_defs;
7626 if (slp_node)
7628 vec_initial_defs.reserve (vec_num);
7629 if (nested_cycle)
7631 unsigned phi_idx = loop_preheader_edge (loop)->dest_idx;
7632 vect_get_slp_defs (SLP_TREE_CHILDREN (slp_node)[phi_idx],
7633 &vec_initial_defs);
7635 else
7637 gcc_assert (slp_node == slp_node_instance->reduc_phis);
7638 vec<tree> &initial_values = reduc_info->reduc_initial_values;
7639 vec<stmt_vec_info> &stmts = SLP_TREE_SCALAR_STMTS (slp_node);
7641 unsigned int num_phis = stmts.length ();
7642 if (REDUC_GROUP_FIRST_ELEMENT (reduc_stmt_info))
7643 num_phis = 1;
7644 initial_values.reserve (num_phis);
7645 for (unsigned int i = 0; i < num_phis; ++i)
7647 gphi *this_phi = as_a<gphi *> (stmts[i]->stmt);
7648 initial_values.quick_push (vect_phi_initial_value (this_phi));
7650 if (vec_num == 1)
7651 vect_find_reusable_accumulator (loop_vinfo, reduc_info);
7652 if (!initial_values.is_empty ())
7654 tree initial_value
7655 = (num_phis == 1 ? initial_values[0] : NULL_TREE);
7656 tree_code code = STMT_VINFO_REDUC_CODE (reduc_info);
7657 tree neutral_op
7658 = neutral_op_for_reduction (TREE_TYPE (vectype_out),
7659 code, initial_value);
7660 get_initial_defs_for_reduction (loop_vinfo, reduc_info,
7661 &vec_initial_defs, vec_num,
7662 stmts.length (), neutral_op);
7666 else
7668 /* Get at the scalar def before the loop, that defines the initial
7669 value of the reduction variable. */
7670 tree initial_def = vect_phi_initial_value (phi);
7671 reduc_info->reduc_initial_values.safe_push (initial_def);
7672 /* Optimize: if initial_def is for REDUC_MAX smaller than the base
7673 and we can't use zero for induc_val, use initial_def. Similarly
7674 for REDUC_MIN and initial_def larger than the base. */
7675 if (STMT_VINFO_REDUC_TYPE (reduc_info) == INTEGER_INDUC_COND_REDUCTION)
7677 tree induc_val = STMT_VINFO_VEC_INDUC_COND_INITIAL_VAL (reduc_info);
7678 if (TREE_CODE (initial_def) == INTEGER_CST
7679 && !integer_zerop (induc_val)
7680 && ((STMT_VINFO_REDUC_CODE (reduc_info) == MAX_EXPR
7681 && tree_int_cst_lt (initial_def, induc_val))
7682 || (STMT_VINFO_REDUC_CODE (reduc_info) == MIN_EXPR
7683 && tree_int_cst_lt (induc_val, initial_def))))
7685 induc_val = initial_def;
7686 /* Communicate we used the initial_def to epilouge
7687 generation. */
7688 STMT_VINFO_VEC_INDUC_COND_INITIAL_VAL (reduc_info) = NULL_TREE;
7690 vec_initial_def = build_vector_from_val (vectype_out, induc_val);
7692 else if (nested_cycle)
7694 /* Do not use an adjustment def as that case is not supported
7695 correctly if ncopies is not one. */
7696 vect_get_vec_defs_for_operand (loop_vinfo, reduc_stmt_info,
7697 ncopies, initial_def,
7698 &vec_initial_defs);
7700 else if (STMT_VINFO_REDUC_TYPE (reduc_info) == CONST_COND_REDUCTION
7701 || STMT_VINFO_REDUC_TYPE (reduc_info) == COND_REDUCTION)
7702 /* Fill the initial vector with the initial scalar value. */
7703 vec_initial_def
7704 = get_initial_def_for_reduction (loop_vinfo, reduc_stmt_info,
7705 initial_def, initial_def);
7706 else
7708 if (ncopies == 1)
7709 vect_find_reusable_accumulator (loop_vinfo, reduc_info);
7710 if (!reduc_info->reduc_initial_values.is_empty ())
7712 initial_def = reduc_info->reduc_initial_values[0];
7713 enum tree_code code = STMT_VINFO_REDUC_CODE (reduc_info);
7714 tree neutral_op
7715 = neutral_op_for_reduction (TREE_TYPE (initial_def),
7716 code, initial_def);
7717 gcc_assert (neutral_op);
7718 /* Try to simplify the vector initialization by applying an
7719 adjustment after the reduction has been performed. */
7720 if (!reduc_info->reused_accumulator
7721 && STMT_VINFO_DEF_TYPE (stmt_info) == vect_reduction_def
7722 && !operand_equal_p (neutral_op, initial_def))
7724 STMT_VINFO_REDUC_EPILOGUE_ADJUSTMENT (reduc_info)
7725 = initial_def;
7726 initial_def = neutral_op;
7728 vec_initial_def
7729 = get_initial_def_for_reduction (loop_vinfo, reduc_info,
7730 initial_def, neutral_op);
7735 if (vec_initial_def)
7737 vec_initial_defs.create (ncopies);
7738 for (i = 0; i < ncopies; ++i)
7739 vec_initial_defs.quick_push (vec_initial_def);
7742 if (auto *accumulator = reduc_info->reused_accumulator)
7744 tree def = accumulator->reduc_input;
7745 if (!useless_type_conversion_p (vectype_out, TREE_TYPE (def)))
7747 unsigned int nreduc;
7748 bool res = constant_multiple_p (TYPE_VECTOR_SUBPARTS
7749 (TREE_TYPE (def)),
7750 TYPE_VECTOR_SUBPARTS (vectype_out),
7751 &nreduc);
7752 gcc_assert (res);
7753 gimple_seq stmts = NULL;
7754 /* Reduce the single vector to a smaller one. */
7755 if (nreduc != 1)
7757 /* Perform the reduction in the appropriate type. */
7758 tree rvectype = vectype_out;
7759 if (!useless_type_conversion_p (TREE_TYPE (vectype_out),
7760 TREE_TYPE (TREE_TYPE (def))))
7761 rvectype = build_vector_type (TREE_TYPE (TREE_TYPE (def)),
7762 TYPE_VECTOR_SUBPARTS
7763 (vectype_out));
7764 def = vect_create_partial_epilog (def, rvectype,
7765 STMT_VINFO_REDUC_CODE
7766 (reduc_info),
7767 &stmts);
7769 /* The epilogue loop might use a different vector mode, like
7770 VNx2DI vs. V2DI. */
7771 if (TYPE_MODE (vectype_out) != TYPE_MODE (TREE_TYPE (def)))
7773 tree reduc_type = build_vector_type_for_mode
7774 (TREE_TYPE (TREE_TYPE (def)), TYPE_MODE (vectype_out));
7775 def = gimple_convert (&stmts, reduc_type, def);
7777 /* Adjust the input so we pick up the partially reduced value
7778 for the skip edge in vect_create_epilog_for_reduction. */
7779 accumulator->reduc_input = def;
7780 /* And the reduction could be carried out using a different sign. */
7781 if (!useless_type_conversion_p (vectype_out, TREE_TYPE (def)))
7782 def = gimple_convert (&stmts, vectype_out, def);
7783 if (loop_vinfo->main_loop_edge)
7785 /* While we'd like to insert on the edge this will split
7786 blocks and disturb bookkeeping, we also will eventually
7787 need this on the skip edge. Rely on sinking to
7788 fixup optimal placement and insert in the pred. */
7789 gimple_stmt_iterator gsi
7790 = gsi_last_bb (loop_vinfo->main_loop_edge->src);
7791 /* Insert before a cond that eventually skips the
7792 epilogue. */
7793 if (!gsi_end_p (gsi) && stmt_ends_bb_p (gsi_stmt (gsi)))
7794 gsi_prev (&gsi);
7795 gsi_insert_seq_after (&gsi, stmts, GSI_CONTINUE_LINKING);
7797 else
7798 gsi_insert_seq_on_edge_immediate (loop_preheader_edge (loop),
7799 stmts);
7801 if (loop_vinfo->main_loop_edge)
7802 vec_initial_defs[0]
7803 = vect_get_main_loop_result (loop_vinfo, def,
7804 vec_initial_defs[0]);
7805 else
7806 vec_initial_defs.safe_push (def);
7809 /* Generate the reduction PHIs upfront. */
7810 for (i = 0; i < vec_num; i++)
7812 tree vec_init_def = vec_initial_defs[i];
7813 for (j = 0; j < ncopies; j++)
7815 /* Create the reduction-phi that defines the reduction
7816 operand. */
7817 gphi *new_phi = create_phi_node (vec_dest, loop->header);
7819 /* Set the loop-entry arg of the reduction-phi. */
7820 if (j != 0 && nested_cycle)
7821 vec_init_def = vec_initial_defs[j];
7822 add_phi_arg (new_phi, vec_init_def, loop_preheader_edge (loop),
7823 UNKNOWN_LOCATION);
7825 /* The loop-latch arg is set in epilogue processing. */
7827 if (slp_node)
7828 SLP_TREE_VEC_STMTS (slp_node).quick_push (new_phi);
7829 else
7831 if (j == 0)
7832 *vec_stmt = new_phi;
7833 STMT_VINFO_VEC_STMTS (stmt_info).safe_push (new_phi);
7838 return true;
7841 /* Vectorizes LC PHIs. */
7843 bool
7844 vectorizable_lc_phi (loop_vec_info loop_vinfo,
7845 stmt_vec_info stmt_info, gimple **vec_stmt,
7846 slp_tree slp_node)
7848 if (!loop_vinfo
7849 || !is_a <gphi *> (stmt_info->stmt)
7850 || gimple_phi_num_args (stmt_info->stmt) != 1)
7851 return false;
7853 if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_internal_def
7854 && STMT_VINFO_DEF_TYPE (stmt_info) != vect_double_reduction_def)
7855 return false;
7857 if (!vec_stmt) /* transformation not required. */
7859 /* Deal with copies from externs or constants that disguise as
7860 loop-closed PHI nodes (PR97886). */
7861 if (slp_node
7862 && !vect_maybe_update_slp_op_vectype (SLP_TREE_CHILDREN (slp_node)[0],
7863 SLP_TREE_VECTYPE (slp_node)))
7865 if (dump_enabled_p ())
7866 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7867 "incompatible vector types for invariants\n");
7868 return false;
7870 STMT_VINFO_TYPE (stmt_info) = lc_phi_info_type;
7871 return true;
7874 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
7875 tree scalar_dest = gimple_phi_result (stmt_info->stmt);
7876 basic_block bb = gimple_bb (stmt_info->stmt);
7877 edge e = single_pred_edge (bb);
7878 tree vec_dest = vect_create_destination_var (scalar_dest, vectype);
7879 auto_vec<tree> vec_oprnds;
7880 vect_get_vec_defs (loop_vinfo, stmt_info, slp_node,
7881 !slp_node ? vect_get_num_copies (loop_vinfo, vectype) : 1,
7882 gimple_phi_arg_def (stmt_info->stmt, 0), &vec_oprnds);
7883 for (unsigned i = 0; i < vec_oprnds.length (); i++)
7885 /* Create the vectorized LC PHI node. */
7886 gphi *new_phi = create_phi_node (vec_dest, bb);
7887 add_phi_arg (new_phi, vec_oprnds[i], e, UNKNOWN_LOCATION);
7888 if (slp_node)
7889 SLP_TREE_VEC_STMTS (slp_node).quick_push (new_phi);
7890 else
7891 STMT_VINFO_VEC_STMTS (stmt_info).safe_push (new_phi);
7893 if (!slp_node)
7894 *vec_stmt = STMT_VINFO_VEC_STMTS (stmt_info)[0];
7896 return true;
7899 /* Vectorizes PHIs. */
7901 bool
7902 vectorizable_phi (vec_info *,
7903 stmt_vec_info stmt_info, gimple **vec_stmt,
7904 slp_tree slp_node, stmt_vector_for_cost *cost_vec)
7906 if (!is_a <gphi *> (stmt_info->stmt) || !slp_node)
7907 return false;
7909 if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_internal_def)
7910 return false;
7912 tree vectype = SLP_TREE_VECTYPE (slp_node);
7914 if (!vec_stmt) /* transformation not required. */
7916 slp_tree child;
7917 unsigned i;
7918 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (slp_node), i, child)
7919 if (!child)
7921 if (dump_enabled_p ())
7922 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7923 "PHI node with unvectorized backedge def\n");
7924 return false;
7926 else if (!vect_maybe_update_slp_op_vectype (child, vectype))
7928 if (dump_enabled_p ())
7929 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7930 "incompatible vector types for invariants\n");
7931 return false;
7933 /* For single-argument PHIs assume coalescing which means zero cost
7934 for the scalar and the vector PHIs. This avoids artificially
7935 favoring the vector path (but may pessimize it in some cases). */
7936 if (gimple_phi_num_args (as_a <gphi *> (stmt_info->stmt)) > 1)
7937 record_stmt_cost (cost_vec, SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node),
7938 vector_stmt, stmt_info, vectype, 0, vect_body);
7939 STMT_VINFO_TYPE (stmt_info) = phi_info_type;
7940 return true;
7943 tree scalar_dest = gimple_phi_result (stmt_info->stmt);
7944 basic_block bb = gimple_bb (stmt_info->stmt);
7945 tree vec_dest = vect_create_destination_var (scalar_dest, vectype);
7946 auto_vec<gphi *> new_phis;
7947 for (unsigned i = 0; i < gimple_phi_num_args (stmt_info->stmt); ++i)
7949 slp_tree child = SLP_TREE_CHILDREN (slp_node)[i];
7951 /* Skip not yet vectorized defs. */
7952 if (SLP_TREE_DEF_TYPE (child) == vect_internal_def
7953 && SLP_TREE_VEC_STMTS (child).is_empty ())
7954 continue;
7956 auto_vec<tree> vec_oprnds;
7957 vect_get_slp_defs (SLP_TREE_CHILDREN (slp_node)[i], &vec_oprnds);
7958 if (!new_phis.exists ())
7960 new_phis.create (vec_oprnds.length ());
7961 for (unsigned j = 0; j < vec_oprnds.length (); j++)
7963 /* Create the vectorized LC PHI node. */
7964 new_phis.quick_push (create_phi_node (vec_dest, bb));
7965 SLP_TREE_VEC_STMTS (slp_node).quick_push (new_phis[j]);
7968 edge e = gimple_phi_arg_edge (as_a <gphi *> (stmt_info->stmt), i);
7969 for (unsigned j = 0; j < vec_oprnds.length (); j++)
7970 add_phi_arg (new_phis[j], vec_oprnds[j], e, UNKNOWN_LOCATION);
7972 /* We should have at least one already vectorized child. */
7973 gcc_assert (new_phis.exists ());
7975 return true;
7978 /* Return true if VECTYPE represents a vector that requires lowering
7979 by the vector lowering pass. */
7981 bool
7982 vect_emulated_vector_p (tree vectype)
7984 return (!VECTOR_MODE_P (TYPE_MODE (vectype))
7985 && (!VECTOR_BOOLEAN_TYPE_P (vectype)
7986 || TYPE_PRECISION (TREE_TYPE (vectype)) != 1));
7989 /* Return true if we can emulate CODE on an integer mode representation
7990 of a vector. */
7992 bool
7993 vect_can_vectorize_without_simd_p (tree_code code)
7995 switch (code)
7997 case PLUS_EXPR:
7998 case MINUS_EXPR:
7999 case NEGATE_EXPR:
8000 case BIT_AND_EXPR:
8001 case BIT_IOR_EXPR:
8002 case BIT_XOR_EXPR:
8003 case BIT_NOT_EXPR:
8004 return true;
8006 default:
8007 return false;
8011 /* Function vectorizable_induction
8013 Check if STMT_INFO performs an induction computation that can be vectorized.
8014 If VEC_STMT is also passed, vectorize the induction PHI: create a vectorized
8015 phi to replace it, put it in VEC_STMT, and add it to the same basic block.
8016 Return true if STMT_INFO is vectorizable in this way. */
8018 bool
8019 vectorizable_induction (loop_vec_info loop_vinfo,
8020 stmt_vec_info stmt_info,
8021 gimple **vec_stmt, slp_tree slp_node,
8022 stmt_vector_for_cost *cost_vec)
8024 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
8025 unsigned ncopies;
8026 bool nested_in_vect_loop = false;
8027 class loop *iv_loop;
8028 tree vec_def;
8029 edge pe = loop_preheader_edge (loop);
8030 basic_block new_bb;
8031 tree new_vec, vec_init, vec_step, t;
8032 tree new_name;
8033 gimple *new_stmt;
8034 gphi *induction_phi;
8035 tree induc_def, vec_dest;
8036 tree init_expr, step_expr;
8037 poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
8038 unsigned i;
8039 tree expr;
8040 gimple_stmt_iterator si;
8042 gphi *phi = dyn_cast <gphi *> (stmt_info->stmt);
8043 if (!phi)
8044 return false;
8046 if (!STMT_VINFO_RELEVANT_P (stmt_info))
8047 return false;
8049 /* Make sure it was recognized as induction computation. */
8050 if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_induction_def)
8051 return false;
8053 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
8054 poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (vectype);
8056 if (slp_node)
8057 ncopies = 1;
8058 else
8059 ncopies = vect_get_num_copies (loop_vinfo, vectype);
8060 gcc_assert (ncopies >= 1);
8062 /* FORNOW. These restrictions should be relaxed. */
8063 if (nested_in_vect_loop_p (loop, stmt_info))
8065 imm_use_iterator imm_iter;
8066 use_operand_p use_p;
8067 gimple *exit_phi;
8068 edge latch_e;
8069 tree loop_arg;
8071 if (ncopies > 1)
8073 if (dump_enabled_p ())
8074 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
8075 "multiple types in nested loop.\n");
8076 return false;
8079 exit_phi = NULL;
8080 latch_e = loop_latch_edge (loop->inner);
8081 loop_arg = PHI_ARG_DEF_FROM_EDGE (phi, latch_e);
8082 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, loop_arg)
8084 gimple *use_stmt = USE_STMT (use_p);
8085 if (is_gimple_debug (use_stmt))
8086 continue;
8088 if (!flow_bb_inside_loop_p (loop->inner, gimple_bb (use_stmt)))
8090 exit_phi = use_stmt;
8091 break;
8094 if (exit_phi)
8096 stmt_vec_info exit_phi_vinfo = loop_vinfo->lookup_stmt (exit_phi);
8097 if (!(STMT_VINFO_RELEVANT_P (exit_phi_vinfo)
8098 && !STMT_VINFO_LIVE_P (exit_phi_vinfo)))
8100 if (dump_enabled_p ())
8101 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
8102 "inner-loop induction only used outside "
8103 "of the outer vectorized loop.\n");
8104 return false;
8108 nested_in_vect_loop = true;
8109 iv_loop = loop->inner;
8111 else
8112 iv_loop = loop;
8113 gcc_assert (iv_loop == (gimple_bb (phi))->loop_father);
8115 if (slp_node && !nunits.is_constant ())
8117 /* The current SLP code creates the step value element-by-element. */
8118 if (dump_enabled_p ())
8119 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
8120 "SLP induction not supported for variable-length"
8121 " vectors.\n");
8122 return false;
8125 if (!vec_stmt) /* transformation not required. */
8127 unsigned inside_cost = 0, prologue_cost = 0;
8128 if (slp_node)
8130 /* We eventually need to set a vector type on invariant
8131 arguments. */
8132 unsigned j;
8133 slp_tree child;
8134 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (slp_node), j, child)
8135 if (!vect_maybe_update_slp_op_vectype
8136 (child, SLP_TREE_VECTYPE (slp_node)))
8138 if (dump_enabled_p ())
8139 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
8140 "incompatible vector types for "
8141 "invariants\n");
8142 return false;
8144 /* loop cost for vec_loop. */
8145 inside_cost
8146 = record_stmt_cost (cost_vec,
8147 SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node),
8148 vector_stmt, stmt_info, 0, vect_body);
8149 /* prologue cost for vec_init (if not nested) and step. */
8150 prologue_cost = record_stmt_cost (cost_vec, 1 + !nested_in_vect_loop,
8151 scalar_to_vec,
8152 stmt_info, 0, vect_prologue);
8154 else /* if (!slp_node) */
8156 /* loop cost for vec_loop. */
8157 inside_cost = record_stmt_cost (cost_vec, ncopies, vector_stmt,
8158 stmt_info, 0, vect_body);
8159 /* prologue cost for vec_init and vec_step. */
8160 prologue_cost = record_stmt_cost (cost_vec, 2, scalar_to_vec,
8161 stmt_info, 0, vect_prologue);
8163 if (dump_enabled_p ())
8164 dump_printf_loc (MSG_NOTE, vect_location,
8165 "vect_model_induction_cost: inside_cost = %d, "
8166 "prologue_cost = %d .\n", inside_cost,
8167 prologue_cost);
8169 STMT_VINFO_TYPE (stmt_info) = induc_vec_info_type;
8170 DUMP_VECT_SCOPE ("vectorizable_induction");
8171 return true;
8174 /* Transform. */
8176 /* Compute a vector variable, initialized with the first VF values of
8177 the induction variable. E.g., for an iv with IV_PHI='X' and
8178 evolution S, for a vector of 4 units, we want to compute:
8179 [X, X + S, X + 2*S, X + 3*S]. */
8181 if (dump_enabled_p ())
8182 dump_printf_loc (MSG_NOTE, vect_location, "transform induction phi.\n");
8184 step_expr = STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_info);
8185 gcc_assert (step_expr != NULL_TREE);
8186 tree step_vectype = get_same_sized_vectype (TREE_TYPE (step_expr), vectype);
8188 pe = loop_preheader_edge (iv_loop);
8189 /* Find the first insertion point in the BB. */
8190 basic_block bb = gimple_bb (phi);
8191 si = gsi_after_labels (bb);
8193 /* For SLP induction we have to generate several IVs as for example
8194 with group size 3 we need
8195 [i0, i1, i2, i0 + S0] [i1 + S1, i2 + S2, i0 + 2*S0, i1 + 2*S1]
8196 [i2 + 2*S2, i0 + 3*S0, i1 + 3*S1, i2 + 3*S2]. */
8197 if (slp_node)
8199 /* Enforced above. */
8200 unsigned int const_nunits = nunits.to_constant ();
8202 /* The initial values are vectorized, but any lanes > group_size
8203 need adjustment. */
8204 slp_tree init_node
8205 = SLP_TREE_CHILDREN (slp_node)[pe->dest_idx];
8207 /* Gather steps. Since we do not vectorize inductions as
8208 cycles we have to reconstruct the step from SCEV data. */
8209 unsigned group_size = SLP_TREE_LANES (slp_node);
8210 tree *steps = XALLOCAVEC (tree, group_size);
8211 tree *inits = XALLOCAVEC (tree, group_size);
8212 stmt_vec_info phi_info;
8213 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (slp_node), i, phi_info)
8215 steps[i] = STMT_VINFO_LOOP_PHI_EVOLUTION_PART (phi_info);
8216 if (!init_node)
8217 inits[i] = gimple_phi_arg_def (as_a<gphi *> (phi_info->stmt),
8218 pe->dest_idx);
8221 /* Now generate the IVs. */
8222 unsigned nvects = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node);
8223 gcc_assert ((const_nunits * nvects) % group_size == 0);
8224 unsigned nivs;
8225 if (nested_in_vect_loop)
8226 nivs = nvects;
8227 else
8229 /* Compute the number of distinct IVs we need. First reduce
8230 group_size if it is a multiple of const_nunits so we get
8231 one IV for a group_size of 4 but const_nunits 2. */
8232 unsigned group_sizep = group_size;
8233 if (group_sizep % const_nunits == 0)
8234 group_sizep = group_sizep / const_nunits;
8235 nivs = least_common_multiple (group_sizep,
8236 const_nunits) / const_nunits;
8238 tree stept = TREE_TYPE (step_vectype);
8239 tree lupdate_mul = NULL_TREE;
8240 if (!nested_in_vect_loop)
8242 /* The number of iterations covered in one vector iteration. */
8243 unsigned lup_mul = (nvects * const_nunits) / group_size;
8244 lupdate_mul
8245 = build_vector_from_val (step_vectype,
8246 SCALAR_FLOAT_TYPE_P (stept)
8247 ? build_real_from_wide (stept, lup_mul,
8248 UNSIGNED)
8249 : build_int_cstu (stept, lup_mul));
8251 tree peel_mul = NULL_TREE;
8252 gimple_seq init_stmts = NULL;
8253 if (LOOP_VINFO_MASK_SKIP_NITERS (loop_vinfo))
8255 if (SCALAR_FLOAT_TYPE_P (stept))
8256 peel_mul = gimple_build (&init_stmts, FLOAT_EXPR, stept,
8257 LOOP_VINFO_MASK_SKIP_NITERS (loop_vinfo));
8258 else
8259 peel_mul = gimple_convert (&init_stmts, stept,
8260 LOOP_VINFO_MASK_SKIP_NITERS (loop_vinfo));
8261 peel_mul = gimple_build_vector_from_val (&init_stmts,
8262 step_vectype, peel_mul);
8264 unsigned ivn;
8265 auto_vec<tree> vec_steps;
8266 for (ivn = 0; ivn < nivs; ++ivn)
8268 tree_vector_builder step_elts (step_vectype, const_nunits, 1);
8269 tree_vector_builder init_elts (vectype, const_nunits, 1);
8270 tree_vector_builder mul_elts (step_vectype, const_nunits, 1);
8271 for (unsigned eltn = 0; eltn < const_nunits; ++eltn)
8273 /* The scalar steps of the IVs. */
8274 tree elt = steps[(ivn*const_nunits + eltn) % group_size];
8275 elt = gimple_convert (&init_stmts, TREE_TYPE (step_vectype), elt);
8276 step_elts.quick_push (elt);
8277 if (!init_node)
8279 /* The scalar inits of the IVs if not vectorized. */
8280 elt = inits[(ivn*const_nunits + eltn) % group_size];
8281 if (!useless_type_conversion_p (TREE_TYPE (vectype),
8282 TREE_TYPE (elt)))
8283 elt = gimple_build (&init_stmts, VIEW_CONVERT_EXPR,
8284 TREE_TYPE (vectype), elt);
8285 init_elts.quick_push (elt);
8287 /* The number of steps to add to the initial values. */
8288 unsigned mul_elt = (ivn*const_nunits + eltn) / group_size;
8289 mul_elts.quick_push (SCALAR_FLOAT_TYPE_P (stept)
8290 ? build_real_from_wide (stept,
8291 mul_elt, UNSIGNED)
8292 : build_int_cstu (stept, mul_elt));
8294 vec_step = gimple_build_vector (&init_stmts, &step_elts);
8295 vec_steps.safe_push (vec_step);
8296 tree step_mul = gimple_build_vector (&init_stmts, &mul_elts);
8297 if (peel_mul)
8298 step_mul = gimple_build (&init_stmts, PLUS_EXPR, step_vectype,
8299 step_mul, peel_mul);
8300 if (!init_node)
8301 vec_init = gimple_build_vector (&init_stmts, &init_elts);
8303 /* Create the induction-phi that defines the induction-operand. */
8304 vec_dest = vect_get_new_vect_var (vectype, vect_simple_var,
8305 "vec_iv_");
8306 induction_phi = create_phi_node (vec_dest, iv_loop->header);
8307 induc_def = PHI_RESULT (induction_phi);
8309 /* Create the iv update inside the loop */
8310 tree up = vec_step;
8311 if (lupdate_mul)
8312 up = gimple_build (&init_stmts, MULT_EXPR, step_vectype,
8313 vec_step, lupdate_mul);
8314 gimple_seq stmts = NULL;
8315 vec_def = gimple_convert (&stmts, step_vectype, induc_def);
8316 vec_def = gimple_build (&stmts,
8317 PLUS_EXPR, step_vectype, vec_def, up);
8318 vec_def = gimple_convert (&stmts, vectype, vec_def);
8319 gsi_insert_seq_before (&si, stmts, GSI_SAME_STMT);
8320 add_phi_arg (induction_phi, vec_def, loop_latch_edge (iv_loop),
8321 UNKNOWN_LOCATION);
8323 if (init_node)
8324 vec_init = vect_get_slp_vect_def (init_node, ivn);
8325 if (!nested_in_vect_loop
8326 && !integer_zerop (step_mul))
8328 vec_def = gimple_convert (&init_stmts, step_vectype, vec_init);
8329 up = gimple_build (&init_stmts, MULT_EXPR, step_vectype,
8330 vec_step, step_mul);
8331 vec_def = gimple_build (&init_stmts, PLUS_EXPR, step_vectype,
8332 vec_def, up);
8333 vec_init = gimple_convert (&init_stmts, vectype, vec_def);
8336 /* Set the arguments of the phi node: */
8337 add_phi_arg (induction_phi, vec_init, pe, UNKNOWN_LOCATION);
8339 SLP_TREE_VEC_STMTS (slp_node).quick_push (induction_phi);
8341 if (!nested_in_vect_loop)
8343 /* Fill up to the number of vectors we need for the whole group. */
8344 nivs = least_common_multiple (group_size,
8345 const_nunits) / const_nunits;
8346 vec_steps.reserve (nivs-ivn);
8347 for (; ivn < nivs; ++ivn)
8349 SLP_TREE_VEC_STMTS (slp_node)
8350 .quick_push (SLP_TREE_VEC_STMTS (slp_node)[0]);
8351 vec_steps.quick_push (vec_steps[0]);
8355 /* Re-use IVs when we can. We are generating further vector
8356 stmts by adding VF' * stride to the IVs generated above. */
8357 if (ivn < nvects)
8359 unsigned vfp
8360 = least_common_multiple (group_size, const_nunits) / group_size;
8361 tree lupdate_mul
8362 = build_vector_from_val (step_vectype,
8363 SCALAR_FLOAT_TYPE_P (stept)
8364 ? build_real_from_wide (stept,
8365 vfp, UNSIGNED)
8366 : build_int_cstu (stept, vfp));
8367 for (; ivn < nvects; ++ivn)
8369 gimple *iv = SLP_TREE_VEC_STMTS (slp_node)[ivn - nivs];
8370 tree def = gimple_get_lhs (iv);
8371 if (ivn < 2*nivs)
8372 vec_steps[ivn - nivs]
8373 = gimple_build (&init_stmts, MULT_EXPR, step_vectype,
8374 vec_steps[ivn - nivs], lupdate_mul);
8375 gimple_seq stmts = NULL;
8376 def = gimple_convert (&stmts, step_vectype, def);
8377 def = gimple_build (&stmts, PLUS_EXPR, step_vectype,
8378 def, vec_steps[ivn % nivs]);
8379 def = gimple_convert (&stmts, vectype, def);
8380 if (gimple_code (iv) == GIMPLE_PHI)
8381 gsi_insert_seq_before (&si, stmts, GSI_SAME_STMT);
8382 else
8384 gimple_stmt_iterator tgsi = gsi_for_stmt (iv);
8385 gsi_insert_seq_after (&tgsi, stmts, GSI_CONTINUE_LINKING);
8387 SLP_TREE_VEC_STMTS (slp_node)
8388 .quick_push (SSA_NAME_DEF_STMT (def));
8392 new_bb = gsi_insert_seq_on_edge_immediate (pe, init_stmts);
8393 gcc_assert (!new_bb);
8395 return true;
8398 init_expr = vect_phi_initial_value (phi);
8400 gimple_seq stmts = NULL;
8401 if (!nested_in_vect_loop)
8403 /* Convert the initial value to the IV update type. */
8404 tree new_type = TREE_TYPE (step_expr);
8405 init_expr = gimple_convert (&stmts, new_type, init_expr);
8407 /* If we are using the loop mask to "peel" for alignment then we need
8408 to adjust the start value here. */
8409 tree skip_niters = LOOP_VINFO_MASK_SKIP_NITERS (loop_vinfo);
8410 if (skip_niters != NULL_TREE)
8412 if (FLOAT_TYPE_P (vectype))
8413 skip_niters = gimple_build (&stmts, FLOAT_EXPR, new_type,
8414 skip_niters);
8415 else
8416 skip_niters = gimple_convert (&stmts, new_type, skip_niters);
8417 tree skip_step = gimple_build (&stmts, MULT_EXPR, new_type,
8418 skip_niters, step_expr);
8419 init_expr = gimple_build (&stmts, MINUS_EXPR, new_type,
8420 init_expr, skip_step);
8424 if (stmts)
8426 new_bb = gsi_insert_seq_on_edge_immediate (pe, stmts);
8427 gcc_assert (!new_bb);
8430 /* Create the vector that holds the initial_value of the induction. */
8431 if (nested_in_vect_loop)
8433 /* iv_loop is nested in the loop to be vectorized. init_expr had already
8434 been created during vectorization of previous stmts. We obtain it
8435 from the STMT_VINFO_VEC_STMT of the defining stmt. */
8436 auto_vec<tree> vec_inits;
8437 vect_get_vec_defs_for_operand (loop_vinfo, stmt_info, 1,
8438 init_expr, &vec_inits);
8439 vec_init = vec_inits[0];
8440 /* If the initial value is not of proper type, convert it. */
8441 if (!useless_type_conversion_p (vectype, TREE_TYPE (vec_init)))
8443 new_stmt
8444 = gimple_build_assign (vect_get_new_ssa_name (vectype,
8445 vect_simple_var,
8446 "vec_iv_"),
8447 VIEW_CONVERT_EXPR,
8448 build1 (VIEW_CONVERT_EXPR, vectype,
8449 vec_init));
8450 vec_init = gimple_assign_lhs (new_stmt);
8451 new_bb = gsi_insert_on_edge_immediate (loop_preheader_edge (iv_loop),
8452 new_stmt);
8453 gcc_assert (!new_bb);
8456 else
8458 /* iv_loop is the loop to be vectorized. Create:
8459 vec_init = [X, X+S, X+2*S, X+3*S] (S = step_expr, X = init_expr) */
8460 stmts = NULL;
8461 new_name = gimple_convert (&stmts, TREE_TYPE (step_expr), init_expr);
8463 unsigned HOST_WIDE_INT const_nunits;
8464 if (nunits.is_constant (&const_nunits))
8466 tree_vector_builder elts (step_vectype, const_nunits, 1);
8467 elts.quick_push (new_name);
8468 for (i = 1; i < const_nunits; i++)
8470 /* Create: new_name_i = new_name + step_expr */
8471 new_name = gimple_build (&stmts, PLUS_EXPR, TREE_TYPE (new_name),
8472 new_name, step_expr);
8473 elts.quick_push (new_name);
8475 /* Create a vector from [new_name_0, new_name_1, ...,
8476 new_name_nunits-1] */
8477 vec_init = gimple_build_vector (&stmts, &elts);
8479 else if (INTEGRAL_TYPE_P (TREE_TYPE (step_expr)))
8480 /* Build the initial value directly from a VEC_SERIES_EXPR. */
8481 vec_init = gimple_build (&stmts, VEC_SERIES_EXPR, step_vectype,
8482 new_name, step_expr);
8483 else
8485 /* Build:
8486 [base, base, base, ...]
8487 + (vectype) [0, 1, 2, ...] * [step, step, step, ...]. */
8488 gcc_assert (SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr)));
8489 gcc_assert (flag_associative_math);
8490 tree index = build_index_vector (step_vectype, 0, 1);
8491 tree base_vec = gimple_build_vector_from_val (&stmts, step_vectype,
8492 new_name);
8493 tree step_vec = gimple_build_vector_from_val (&stmts, step_vectype,
8494 step_expr);
8495 vec_init = gimple_build (&stmts, FLOAT_EXPR, step_vectype, index);
8496 vec_init = gimple_build (&stmts, MULT_EXPR, step_vectype,
8497 vec_init, step_vec);
8498 vec_init = gimple_build (&stmts, PLUS_EXPR, step_vectype,
8499 vec_init, base_vec);
8501 vec_init = gimple_convert (&stmts, vectype, vec_init);
8503 if (stmts)
8505 new_bb = gsi_insert_seq_on_edge_immediate (pe, stmts);
8506 gcc_assert (!new_bb);
8511 /* Create the vector that holds the step of the induction. */
8512 if (nested_in_vect_loop)
8513 /* iv_loop is nested in the loop to be vectorized. Generate:
8514 vec_step = [S, S, S, S] */
8515 new_name = step_expr;
8516 else
8518 /* iv_loop is the loop to be vectorized. Generate:
8519 vec_step = [VF*S, VF*S, VF*S, VF*S] */
8520 gimple_seq seq = NULL;
8521 if (SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr)))
8523 expr = build_int_cst (integer_type_node, vf);
8524 expr = gimple_build (&seq, FLOAT_EXPR, TREE_TYPE (step_expr), expr);
8526 else
8527 expr = build_int_cst (TREE_TYPE (step_expr), vf);
8528 new_name = gimple_build (&seq, MULT_EXPR, TREE_TYPE (step_expr),
8529 expr, step_expr);
8530 if (seq)
8532 new_bb = gsi_insert_seq_on_edge_immediate (pe, seq);
8533 gcc_assert (!new_bb);
8537 t = unshare_expr (new_name);
8538 gcc_assert (CONSTANT_CLASS_P (new_name)
8539 || TREE_CODE (new_name) == SSA_NAME);
8540 new_vec = build_vector_from_val (step_vectype, t);
8541 vec_step = vect_init_vector (loop_vinfo, stmt_info,
8542 new_vec, step_vectype, NULL);
8545 /* Create the following def-use cycle:
8546 loop prolog:
8547 vec_init = ...
8548 vec_step = ...
8549 loop:
8550 vec_iv = PHI <vec_init, vec_loop>
8552 STMT
8554 vec_loop = vec_iv + vec_step; */
8556 /* Create the induction-phi that defines the induction-operand. */
8557 vec_dest = vect_get_new_vect_var (vectype, vect_simple_var, "vec_iv_");
8558 induction_phi = create_phi_node (vec_dest, iv_loop->header);
8559 induc_def = PHI_RESULT (induction_phi);
8561 /* Create the iv update inside the loop */
8562 stmts = NULL;
8563 vec_def = gimple_convert (&stmts, step_vectype, induc_def);
8564 vec_def = gimple_build (&stmts, PLUS_EXPR, step_vectype, vec_def, vec_step);
8565 vec_def = gimple_convert (&stmts, vectype, vec_def);
8566 gsi_insert_seq_before (&si, stmts, GSI_SAME_STMT);
8567 new_stmt = SSA_NAME_DEF_STMT (vec_def);
8569 /* Set the arguments of the phi node: */
8570 add_phi_arg (induction_phi, vec_init, pe, UNKNOWN_LOCATION);
8571 add_phi_arg (induction_phi, vec_def, loop_latch_edge (iv_loop),
8572 UNKNOWN_LOCATION);
8574 STMT_VINFO_VEC_STMTS (stmt_info).safe_push (induction_phi);
8575 *vec_stmt = induction_phi;
8577 /* In case that vectorization factor (VF) is bigger than the number
8578 of elements that we can fit in a vectype (nunits), we have to generate
8579 more than one vector stmt - i.e - we need to "unroll" the
8580 vector stmt by a factor VF/nunits. For more details see documentation
8581 in vectorizable_operation. */
8583 if (ncopies > 1)
8585 gimple_seq seq = NULL;
8586 /* FORNOW. This restriction should be relaxed. */
8587 gcc_assert (!nested_in_vect_loop);
8589 /* Create the vector that holds the step of the induction. */
8590 if (SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr)))
8592 expr = build_int_cst (integer_type_node, nunits);
8593 expr = gimple_build (&seq, FLOAT_EXPR, TREE_TYPE (step_expr), expr);
8595 else
8596 expr = build_int_cst (TREE_TYPE (step_expr), nunits);
8597 new_name = gimple_build (&seq, MULT_EXPR, TREE_TYPE (step_expr),
8598 expr, step_expr);
8599 if (seq)
8601 new_bb = gsi_insert_seq_on_edge_immediate (pe, seq);
8602 gcc_assert (!new_bb);
8605 t = unshare_expr (new_name);
8606 gcc_assert (CONSTANT_CLASS_P (new_name)
8607 || TREE_CODE (new_name) == SSA_NAME);
8608 new_vec = build_vector_from_val (step_vectype, t);
8609 vec_step = vect_init_vector (loop_vinfo, stmt_info,
8610 new_vec, step_vectype, NULL);
8612 vec_def = induc_def;
8613 for (i = 1; i < ncopies; i++)
8615 /* vec_i = vec_prev + vec_step */
8616 gimple_seq stmts = NULL;
8617 vec_def = gimple_convert (&stmts, step_vectype, vec_def);
8618 vec_def = gimple_build (&stmts,
8619 PLUS_EXPR, step_vectype, vec_def, vec_step);
8620 vec_def = gimple_convert (&stmts, vectype, vec_def);
8622 gsi_insert_seq_before (&si, stmts, GSI_SAME_STMT);
8623 new_stmt = SSA_NAME_DEF_STMT (vec_def);
8624 STMT_VINFO_VEC_STMTS (stmt_info).safe_push (new_stmt);
8628 if (dump_enabled_p ())
8629 dump_printf_loc (MSG_NOTE, vect_location,
8630 "transform induction: created def-use cycle: %G%G",
8631 induction_phi, SSA_NAME_DEF_STMT (vec_def));
8633 return true;
8636 /* Function vectorizable_live_operation.
8638 STMT_INFO computes a value that is used outside the loop. Check if
8639 it can be supported. */
8641 bool
8642 vectorizable_live_operation (vec_info *vinfo,
8643 stmt_vec_info stmt_info,
8644 gimple_stmt_iterator *gsi,
8645 slp_tree slp_node, slp_instance slp_node_instance,
8646 int slp_index, bool vec_stmt_p,
8647 stmt_vector_for_cost *cost_vec)
8649 loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo);
8650 imm_use_iterator imm_iter;
8651 tree lhs, lhs_type, bitsize;
8652 tree vectype = (slp_node
8653 ? SLP_TREE_VECTYPE (slp_node)
8654 : STMT_VINFO_VECTYPE (stmt_info));
8655 poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (vectype);
8656 int ncopies;
8657 gimple *use_stmt;
8658 auto_vec<tree> vec_oprnds;
8659 int vec_entry = 0;
8660 poly_uint64 vec_index = 0;
8662 gcc_assert (STMT_VINFO_LIVE_P (stmt_info));
8664 /* If a stmt of a reduction is live, vectorize it via
8665 vect_create_epilog_for_reduction. vectorizable_reduction assessed
8666 validity so just trigger the transform here. */
8667 if (STMT_VINFO_REDUC_DEF (vect_orig_stmt (stmt_info)))
8669 if (!vec_stmt_p)
8670 return true;
8671 if (slp_node)
8673 /* For reduction chains the meta-info is attached to
8674 the group leader. */
8675 if (REDUC_GROUP_FIRST_ELEMENT (stmt_info))
8676 stmt_info = REDUC_GROUP_FIRST_ELEMENT (stmt_info);
8677 /* For SLP reductions we vectorize the epilogue for
8678 all involved stmts together. */
8679 else if (slp_index != 0)
8680 return true;
8681 else
8682 /* For SLP reductions the meta-info is attached to
8683 the representative. */
8684 stmt_info = SLP_TREE_REPRESENTATIVE (slp_node);
8686 stmt_vec_info reduc_info = info_for_reduction (loop_vinfo, stmt_info);
8687 gcc_assert (reduc_info->is_reduc_info);
8688 if (STMT_VINFO_REDUC_TYPE (reduc_info) == FOLD_LEFT_REDUCTION
8689 || STMT_VINFO_REDUC_TYPE (reduc_info) == EXTRACT_LAST_REDUCTION)
8690 return true;
8691 vect_create_epilog_for_reduction (loop_vinfo, stmt_info, slp_node,
8692 slp_node_instance);
8693 return true;
8696 /* If STMT is not relevant and it is a simple assignment and its inputs are
8697 invariant then it can remain in place, unvectorized. The original last
8698 scalar value that it computes will be used. */
8699 if (!STMT_VINFO_RELEVANT_P (stmt_info))
8701 gcc_assert (is_simple_and_all_uses_invariant (stmt_info, loop_vinfo));
8702 if (dump_enabled_p ())
8703 dump_printf_loc (MSG_NOTE, vect_location,
8704 "statement is simple and uses invariant. Leaving in "
8705 "place.\n");
8706 return true;
8709 if (slp_node)
8710 ncopies = 1;
8711 else
8712 ncopies = vect_get_num_copies (loop_vinfo, vectype);
8714 if (slp_node)
8716 gcc_assert (slp_index >= 0);
8718 /* Get the last occurrence of the scalar index from the concatenation of
8719 all the slp vectors. Calculate which slp vector it is and the index
8720 within. */
8721 int num_scalar = SLP_TREE_LANES (slp_node);
8722 int num_vec = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node);
8723 poly_uint64 pos = (num_vec * nunits) - num_scalar + slp_index;
8725 /* Calculate which vector contains the result, and which lane of
8726 that vector we need. */
8727 if (!can_div_trunc_p (pos, nunits, &vec_entry, &vec_index))
8729 if (dump_enabled_p ())
8730 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
8731 "Cannot determine which vector holds the"
8732 " final result.\n");
8733 return false;
8737 if (!vec_stmt_p)
8739 /* No transformation required. */
8740 if (loop_vinfo && LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo))
8742 if (!direct_internal_fn_supported_p (IFN_EXTRACT_LAST, vectype,
8743 OPTIMIZE_FOR_SPEED))
8745 if (dump_enabled_p ())
8746 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
8747 "can't operate on partial vectors "
8748 "because the target doesn't support extract "
8749 "last reduction.\n");
8750 LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo) = false;
8752 else if (slp_node)
8754 if (dump_enabled_p ())
8755 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
8756 "can't operate on partial vectors "
8757 "because an SLP statement is live after "
8758 "the loop.\n");
8759 LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo) = false;
8761 else if (ncopies > 1)
8763 if (dump_enabled_p ())
8764 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
8765 "can't operate on partial vectors "
8766 "because ncopies is greater than 1.\n");
8767 LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo) = false;
8769 else
8771 gcc_assert (ncopies == 1 && !slp_node);
8772 vect_record_loop_mask (loop_vinfo,
8773 &LOOP_VINFO_MASKS (loop_vinfo),
8774 1, vectype, NULL);
8777 /* ??? Enable for loop costing as well. */
8778 if (!loop_vinfo)
8779 record_stmt_cost (cost_vec, 1, vec_to_scalar, stmt_info, NULL_TREE,
8780 0, vect_epilogue);
8781 return true;
8784 /* Use the lhs of the original scalar statement. */
8785 gimple *stmt = vect_orig_stmt (stmt_info)->stmt;
8786 if (dump_enabled_p ())
8787 dump_printf_loc (MSG_NOTE, vect_location, "extracting lane for live "
8788 "stmt %G", stmt);
8790 lhs = gimple_get_lhs (stmt);
8791 lhs_type = TREE_TYPE (lhs);
8793 bitsize = vector_element_bits_tree (vectype);
8795 /* Get the vectorized lhs of STMT and the lane to use (counted in bits). */
8796 tree vec_lhs, bitstart;
8797 gimple *vec_stmt;
8798 if (slp_node)
8800 gcc_assert (!loop_vinfo || !LOOP_VINFO_FULLY_MASKED_P (loop_vinfo));
8802 /* Get the correct slp vectorized stmt. */
8803 vec_stmt = SLP_TREE_VEC_STMTS (slp_node)[vec_entry];
8804 vec_lhs = gimple_get_lhs (vec_stmt);
8806 /* Get entry to use. */
8807 bitstart = bitsize_int (vec_index);
8808 bitstart = int_const_binop (MULT_EXPR, bitsize, bitstart);
8810 else
8812 /* For multiple copies, get the last copy. */
8813 vec_stmt = STMT_VINFO_VEC_STMTS (stmt_info).last ();
8814 vec_lhs = gimple_get_lhs (vec_stmt);
8816 /* Get the last lane in the vector. */
8817 bitstart = int_const_binop (MULT_EXPR, bitsize, bitsize_int (nunits - 1));
8820 if (loop_vinfo)
8822 /* Ensure the VEC_LHS for lane extraction stmts satisfy loop-closed PHI
8823 requirement, insert one phi node for it. It looks like:
8824 loop;
8826 # lhs' = PHI <lhs>
8828 loop;
8830 # vec_lhs' = PHI <vec_lhs>
8831 new_tree = lane_extract <vec_lhs', ...>;
8832 lhs' = new_tree; */
8834 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
8835 basic_block exit_bb = single_exit (loop)->dest;
8836 gcc_assert (single_pred_p (exit_bb));
8838 tree vec_lhs_phi = copy_ssa_name (vec_lhs);
8839 gimple *phi = create_phi_node (vec_lhs_phi, exit_bb);
8840 SET_PHI_ARG_DEF (phi, single_exit (loop)->dest_idx, vec_lhs);
8842 gimple_seq stmts = NULL;
8843 tree new_tree;
8844 if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
8846 /* Emit:
8848 SCALAR_RES = EXTRACT_LAST <VEC_LHS, MASK>
8850 where VEC_LHS is the vectorized live-out result and MASK is
8851 the loop mask for the final iteration. */
8852 gcc_assert (ncopies == 1 && !slp_node);
8853 tree scalar_type = TREE_TYPE (STMT_VINFO_VECTYPE (stmt_info));
8854 tree mask = vect_get_loop_mask (gsi, &LOOP_VINFO_MASKS (loop_vinfo),
8855 1, vectype, 0);
8856 tree scalar_res = gimple_build (&stmts, CFN_EXTRACT_LAST, scalar_type,
8857 mask, vec_lhs_phi);
8859 /* Convert the extracted vector element to the scalar type. */
8860 new_tree = gimple_convert (&stmts, lhs_type, scalar_res);
8862 else
8864 tree bftype = TREE_TYPE (vectype);
8865 if (VECTOR_BOOLEAN_TYPE_P (vectype))
8866 bftype = build_nonstandard_integer_type (tree_to_uhwi (bitsize), 1);
8867 new_tree = build3 (BIT_FIELD_REF, bftype,
8868 vec_lhs_phi, bitsize, bitstart);
8869 new_tree = force_gimple_operand (fold_convert (lhs_type, new_tree),
8870 &stmts, true, NULL_TREE);
8873 if (stmts)
8875 gimple_stmt_iterator exit_gsi = gsi_after_labels (exit_bb);
8876 gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
8878 /* Remove existing phi from lhs and create one copy from new_tree. */
8879 tree lhs_phi = NULL_TREE;
8880 gimple_stmt_iterator gsi;
8881 for (gsi = gsi_start_phis (exit_bb);
8882 !gsi_end_p (gsi); gsi_next (&gsi))
8884 gimple *phi = gsi_stmt (gsi);
8885 if ((gimple_phi_arg_def (phi, 0) == lhs))
8887 remove_phi_node (&gsi, false);
8888 lhs_phi = gimple_phi_result (phi);
8889 gimple *copy = gimple_build_assign (lhs_phi, new_tree);
8890 gsi_insert_before (&exit_gsi, copy, GSI_SAME_STMT);
8891 break;
8896 /* Replace use of lhs with newly computed result. If the use stmt is a
8897 single arg PHI, just replace all uses of PHI result. It's necessary
8898 because lcssa PHI defining lhs may be before newly inserted stmt. */
8899 use_operand_p use_p;
8900 FOR_EACH_IMM_USE_STMT (use_stmt, imm_iter, lhs)
8901 if (!flow_bb_inside_loop_p (loop, gimple_bb (use_stmt))
8902 && !is_gimple_debug (use_stmt))
8904 if (gimple_code (use_stmt) == GIMPLE_PHI
8905 && gimple_phi_num_args (use_stmt) == 1)
8907 replace_uses_by (gimple_phi_result (use_stmt), new_tree);
8909 else
8911 FOR_EACH_IMM_USE_ON_STMT (use_p, imm_iter)
8912 SET_USE (use_p, new_tree);
8914 update_stmt (use_stmt);
8917 else
8919 /* For basic-block vectorization simply insert the lane-extraction. */
8920 tree bftype = TREE_TYPE (vectype);
8921 if (VECTOR_BOOLEAN_TYPE_P (vectype))
8922 bftype = build_nonstandard_integer_type (tree_to_uhwi (bitsize), 1);
8923 tree new_tree = build3 (BIT_FIELD_REF, bftype,
8924 vec_lhs, bitsize, bitstart);
8925 gimple_seq stmts = NULL;
8926 new_tree = force_gimple_operand (fold_convert (lhs_type, new_tree),
8927 &stmts, true, NULL_TREE);
8928 if (TREE_CODE (new_tree) == SSA_NAME
8929 && SSA_NAME_OCCURS_IN_ABNORMAL_PHI (lhs))
8930 SSA_NAME_OCCURS_IN_ABNORMAL_PHI (new_tree) = 1;
8931 if (is_a <gphi *> (vec_stmt))
8933 gimple_stmt_iterator si = gsi_after_labels (gimple_bb (vec_stmt));
8934 gsi_insert_seq_before (&si, stmts, GSI_SAME_STMT);
8936 else
8938 gimple_stmt_iterator si = gsi_for_stmt (vec_stmt);
8939 gsi_insert_seq_after (&si, stmts, GSI_SAME_STMT);
8942 /* Replace use of lhs with newly computed result. If the use stmt is a
8943 single arg PHI, just replace all uses of PHI result. It's necessary
8944 because lcssa PHI defining lhs may be before newly inserted stmt. */
8945 use_operand_p use_p;
8946 stmt_vec_info use_stmt_info;
8947 FOR_EACH_IMM_USE_STMT (use_stmt, imm_iter, lhs)
8948 if (!is_gimple_debug (use_stmt)
8949 && (!(use_stmt_info = vinfo->lookup_stmt (use_stmt))
8950 || !PURE_SLP_STMT (vect_stmt_to_vectorize (use_stmt_info))))
8952 /* ??? This can happen when the live lane ends up being
8953 used in a vector construction code-generated by an
8954 external SLP node (and code-generation for that already
8955 happened). See gcc.dg/vect/bb-slp-47.c.
8956 Doing this is what would happen if that vector CTOR
8957 were not code-generated yet so it is not too bad.
8958 ??? In fact we'd likely want to avoid this situation
8959 in the first place. */
8960 if (TREE_CODE (new_tree) == SSA_NAME
8961 && !SSA_NAME_IS_DEFAULT_DEF (new_tree)
8962 && gimple_code (use_stmt) != GIMPLE_PHI
8963 && !vect_stmt_dominates_stmt_p (SSA_NAME_DEF_STMT (new_tree),
8964 use_stmt))
8966 enum tree_code code = gimple_assign_rhs_code (use_stmt);
8967 gcc_assert (code == CONSTRUCTOR
8968 || code == VIEW_CONVERT_EXPR
8969 || CONVERT_EXPR_CODE_P (code));
8970 if (dump_enabled_p ())
8971 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
8972 "Using original scalar computation for "
8973 "live lane because use preceeds vector "
8974 "def\n");
8975 continue;
8977 /* ??? It can also happen that we end up pulling a def into
8978 a loop where replacing out-of-loop uses would require
8979 a new LC SSA PHI node. Retain the original scalar in
8980 those cases as well. PR98064. */
8981 if (TREE_CODE (new_tree) == SSA_NAME
8982 && !SSA_NAME_IS_DEFAULT_DEF (new_tree)
8983 && (gimple_bb (use_stmt)->loop_father
8984 != gimple_bb (vec_stmt)->loop_father)
8985 && !flow_loop_nested_p (gimple_bb (vec_stmt)->loop_father,
8986 gimple_bb (use_stmt)->loop_father))
8988 if (dump_enabled_p ())
8989 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
8990 "Using original scalar computation for "
8991 "live lane because there is an out-of-loop "
8992 "definition for it\n");
8993 continue;
8995 FOR_EACH_IMM_USE_ON_STMT (use_p, imm_iter)
8996 SET_USE (use_p, new_tree);
8997 update_stmt (use_stmt);
9001 return true;
9004 /* Kill any debug uses outside LOOP of SSA names defined in STMT_INFO. */
9006 static void
9007 vect_loop_kill_debug_uses (class loop *loop, stmt_vec_info stmt_info)
9009 ssa_op_iter op_iter;
9010 imm_use_iterator imm_iter;
9011 def_operand_p def_p;
9012 gimple *ustmt;
9014 FOR_EACH_PHI_OR_STMT_DEF (def_p, stmt_info->stmt, op_iter, SSA_OP_DEF)
9016 FOR_EACH_IMM_USE_STMT (ustmt, imm_iter, DEF_FROM_PTR (def_p))
9018 basic_block bb;
9020 if (!is_gimple_debug (ustmt))
9021 continue;
9023 bb = gimple_bb (ustmt);
9025 if (!flow_bb_inside_loop_p (loop, bb))
9027 if (gimple_debug_bind_p (ustmt))
9029 if (dump_enabled_p ())
9030 dump_printf_loc (MSG_NOTE, vect_location,
9031 "killing debug use\n");
9033 gimple_debug_bind_reset_value (ustmt);
9034 update_stmt (ustmt);
9036 else
9037 gcc_unreachable ();
9043 /* Given loop represented by LOOP_VINFO, return true if computation of
9044 LOOP_VINFO_NITERS (= LOOP_VINFO_NITERSM1 + 1) doesn't overflow, false
9045 otherwise. */
9047 static bool
9048 loop_niters_no_overflow (loop_vec_info loop_vinfo)
9050 /* Constant case. */
9051 if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
9053 tree cst_niters = LOOP_VINFO_NITERS (loop_vinfo);
9054 tree cst_nitersm1 = LOOP_VINFO_NITERSM1 (loop_vinfo);
9056 gcc_assert (TREE_CODE (cst_niters) == INTEGER_CST);
9057 gcc_assert (TREE_CODE (cst_nitersm1) == INTEGER_CST);
9058 if (wi::to_widest (cst_nitersm1) < wi::to_widest (cst_niters))
9059 return true;
9062 widest_int max;
9063 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
9064 /* Check the upper bound of loop niters. */
9065 if (get_max_loop_iterations (loop, &max))
9067 tree type = TREE_TYPE (LOOP_VINFO_NITERS (loop_vinfo));
9068 signop sgn = TYPE_SIGN (type);
9069 widest_int type_max = widest_int::from (wi::max_value (type), sgn);
9070 if (max < type_max)
9071 return true;
9073 return false;
9076 /* Return a mask type with half the number of elements as OLD_TYPE,
9077 given that it should have mode NEW_MODE. */
9079 tree
9080 vect_halve_mask_nunits (tree old_type, machine_mode new_mode)
9082 poly_uint64 nunits = exact_div (TYPE_VECTOR_SUBPARTS (old_type), 2);
9083 return build_truth_vector_type_for_mode (nunits, new_mode);
9086 /* Return a mask type with twice as many elements as OLD_TYPE,
9087 given that it should have mode NEW_MODE. */
9089 tree
9090 vect_double_mask_nunits (tree old_type, machine_mode new_mode)
9092 poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (old_type) * 2;
9093 return build_truth_vector_type_for_mode (nunits, new_mode);
9096 /* Record that a fully-masked version of LOOP_VINFO would need MASKS to
9097 contain a sequence of NVECTORS masks that each control a vector of type
9098 VECTYPE. If SCALAR_MASK is nonnull, the fully-masked loop would AND
9099 these vector masks with the vector version of SCALAR_MASK. */
9101 void
9102 vect_record_loop_mask (loop_vec_info loop_vinfo, vec_loop_masks *masks,
9103 unsigned int nvectors, tree vectype, tree scalar_mask)
9105 gcc_assert (nvectors != 0);
9106 if (masks->length () < nvectors)
9107 masks->safe_grow_cleared (nvectors, true);
9108 rgroup_controls *rgm = &(*masks)[nvectors - 1];
9109 /* The number of scalars per iteration and the number of vectors are
9110 both compile-time constants. */
9111 unsigned int nscalars_per_iter
9112 = exact_div (nvectors * TYPE_VECTOR_SUBPARTS (vectype),
9113 LOOP_VINFO_VECT_FACTOR (loop_vinfo)).to_constant ();
9115 if (scalar_mask)
9117 scalar_cond_masked_key cond (scalar_mask, nvectors);
9118 loop_vinfo->scalar_cond_masked_set.add (cond);
9121 if (rgm->max_nscalars_per_iter < nscalars_per_iter)
9123 rgm->max_nscalars_per_iter = nscalars_per_iter;
9124 rgm->type = truth_type_for (vectype);
9125 rgm->factor = 1;
9129 /* Given a complete set of masks MASKS, extract mask number INDEX
9130 for an rgroup that operates on NVECTORS vectors of type VECTYPE,
9131 where 0 <= INDEX < NVECTORS. Insert any set-up statements before GSI.
9133 See the comment above vec_loop_masks for more details about the mask
9134 arrangement. */
9136 tree
9137 vect_get_loop_mask (gimple_stmt_iterator *gsi, vec_loop_masks *masks,
9138 unsigned int nvectors, tree vectype, unsigned int index)
9140 rgroup_controls *rgm = &(*masks)[nvectors - 1];
9141 tree mask_type = rgm->type;
9143 /* Populate the rgroup's mask array, if this is the first time we've
9144 used it. */
9145 if (rgm->controls.is_empty ())
9147 rgm->controls.safe_grow_cleared (nvectors, true);
9148 for (unsigned int i = 0; i < nvectors; ++i)
9150 tree mask = make_temp_ssa_name (mask_type, NULL, "loop_mask");
9151 /* Provide a dummy definition until the real one is available. */
9152 SSA_NAME_DEF_STMT (mask) = gimple_build_nop ();
9153 rgm->controls[i] = mask;
9157 tree mask = rgm->controls[index];
9158 if (maybe_ne (TYPE_VECTOR_SUBPARTS (mask_type),
9159 TYPE_VECTOR_SUBPARTS (vectype)))
9161 /* A loop mask for data type X can be reused for data type Y
9162 if X has N times more elements than Y and if Y's elements
9163 are N times bigger than X's. In this case each sequence
9164 of N elements in the loop mask will be all-zero or all-one.
9165 We can then view-convert the mask so that each sequence of
9166 N elements is replaced by a single element. */
9167 gcc_assert (multiple_p (TYPE_VECTOR_SUBPARTS (mask_type),
9168 TYPE_VECTOR_SUBPARTS (vectype)));
9169 gimple_seq seq = NULL;
9170 mask_type = truth_type_for (vectype);
9171 mask = gimple_build (&seq, VIEW_CONVERT_EXPR, mask_type, mask);
9172 if (seq)
9173 gsi_insert_seq_before (gsi, seq, GSI_SAME_STMT);
9175 return mask;
9178 /* Record that LOOP_VINFO would need LENS to contain a sequence of NVECTORS
9179 lengths for controlling an operation on VECTYPE. The operation splits
9180 each element of VECTYPE into FACTOR separate subelements, measuring the
9181 length as a number of these subelements. */
9183 void
9184 vect_record_loop_len (loop_vec_info loop_vinfo, vec_loop_lens *lens,
9185 unsigned int nvectors, tree vectype, unsigned int factor)
9187 gcc_assert (nvectors != 0);
9188 if (lens->length () < nvectors)
9189 lens->safe_grow_cleared (nvectors, true);
9190 rgroup_controls *rgl = &(*lens)[nvectors - 1];
9192 /* The number of scalars per iteration, scalar occupied bytes and
9193 the number of vectors are both compile-time constants. */
9194 unsigned int nscalars_per_iter
9195 = exact_div (nvectors * TYPE_VECTOR_SUBPARTS (vectype),
9196 LOOP_VINFO_VECT_FACTOR (loop_vinfo)).to_constant ();
9198 if (rgl->max_nscalars_per_iter < nscalars_per_iter)
9200 /* For now, we only support cases in which all loads and stores fall back
9201 to VnQI or none do. */
9202 gcc_assert (!rgl->max_nscalars_per_iter
9203 || (rgl->factor == 1 && factor == 1)
9204 || (rgl->max_nscalars_per_iter * rgl->factor
9205 == nscalars_per_iter * factor));
9206 rgl->max_nscalars_per_iter = nscalars_per_iter;
9207 rgl->type = vectype;
9208 rgl->factor = factor;
9212 /* Given a complete set of length LENS, extract length number INDEX for an
9213 rgroup that operates on NVECTORS vectors, where 0 <= INDEX < NVECTORS. */
9215 tree
9216 vect_get_loop_len (loop_vec_info loop_vinfo, vec_loop_lens *lens,
9217 unsigned int nvectors, unsigned int index)
9219 rgroup_controls *rgl = &(*lens)[nvectors - 1];
9221 /* Populate the rgroup's len array, if this is the first time we've
9222 used it. */
9223 if (rgl->controls.is_empty ())
9225 rgl->controls.safe_grow_cleared (nvectors, true);
9226 for (unsigned int i = 0; i < nvectors; ++i)
9228 tree len_type = LOOP_VINFO_RGROUP_COMPARE_TYPE (loop_vinfo);
9229 gcc_assert (len_type != NULL_TREE);
9230 tree len = make_temp_ssa_name (len_type, NULL, "loop_len");
9232 /* Provide a dummy definition until the real one is available. */
9233 SSA_NAME_DEF_STMT (len) = gimple_build_nop ();
9234 rgl->controls[i] = len;
9238 return rgl->controls[index];
9241 /* Scale profiling counters by estimation for LOOP which is vectorized
9242 by factor VF. */
9244 static void
9245 scale_profile_for_vect_loop (class loop *loop, unsigned vf)
9247 edge preheader = loop_preheader_edge (loop);
9248 /* Reduce loop iterations by the vectorization factor. */
9249 gcov_type new_est_niter = niter_for_unrolled_loop (loop, vf);
9250 profile_count freq_h = loop->header->count, freq_e = preheader->count ();
9252 if (freq_h.nonzero_p ())
9254 profile_probability p;
9256 /* Avoid dropping loop body profile counter to 0 because of zero count
9257 in loop's preheader. */
9258 if (!(freq_e == profile_count::zero ()))
9259 freq_e = freq_e.force_nonzero ();
9260 p = freq_e.apply_scale (new_est_niter + 1, 1).probability_in (freq_h);
9261 scale_loop_frequencies (loop, p);
9264 edge exit_e = single_exit (loop);
9265 exit_e->probability = profile_probability::always ()
9266 .apply_scale (1, new_est_niter + 1);
9268 edge exit_l = single_pred_edge (loop->latch);
9269 profile_probability prob = exit_l->probability;
9270 exit_l->probability = exit_e->probability.invert ();
9271 if (prob.initialized_p () && exit_l->probability.initialized_p ())
9272 scale_bbs_frequencies (&loop->latch, 1, exit_l->probability / prob);
9275 /* For a vectorized stmt DEF_STMT_INFO adjust all vectorized PHI
9276 latch edge values originally defined by it. */
9278 static void
9279 maybe_set_vectorized_backedge_value (loop_vec_info loop_vinfo,
9280 stmt_vec_info def_stmt_info)
9282 tree def = gimple_get_lhs (vect_orig_stmt (def_stmt_info)->stmt);
9283 if (!def || TREE_CODE (def) != SSA_NAME)
9284 return;
9285 stmt_vec_info phi_info;
9286 imm_use_iterator iter;
9287 use_operand_p use_p;
9288 FOR_EACH_IMM_USE_FAST (use_p, iter, def)
9289 if (gphi *phi = dyn_cast <gphi *> (USE_STMT (use_p)))
9290 if (gimple_bb (phi)->loop_father->header == gimple_bb (phi)
9291 && (phi_info = loop_vinfo->lookup_stmt (phi))
9292 && STMT_VINFO_RELEVANT_P (phi_info)
9293 && VECTORIZABLE_CYCLE_DEF (STMT_VINFO_DEF_TYPE (phi_info))
9294 && STMT_VINFO_REDUC_TYPE (phi_info) != FOLD_LEFT_REDUCTION
9295 && STMT_VINFO_REDUC_TYPE (phi_info) != EXTRACT_LAST_REDUCTION)
9297 loop_p loop = gimple_bb (phi)->loop_father;
9298 edge e = loop_latch_edge (loop);
9299 if (PHI_ARG_DEF_FROM_EDGE (phi, e) == def)
9301 vec<gimple *> &phi_defs = STMT_VINFO_VEC_STMTS (phi_info);
9302 vec<gimple *> &latch_defs = STMT_VINFO_VEC_STMTS (def_stmt_info);
9303 gcc_assert (phi_defs.length () == latch_defs.length ());
9304 for (unsigned i = 0; i < phi_defs.length (); ++i)
9305 add_phi_arg (as_a <gphi *> (phi_defs[i]),
9306 gimple_get_lhs (latch_defs[i]), e,
9307 gimple_phi_arg_location (phi, e->dest_idx));
9312 /* Vectorize STMT_INFO if relevant, inserting any new instructions before GSI.
9313 When vectorizing STMT_INFO as a store, set *SEEN_STORE to its
9314 stmt_vec_info. */
9316 static bool
9317 vect_transform_loop_stmt (loop_vec_info loop_vinfo, stmt_vec_info stmt_info,
9318 gimple_stmt_iterator *gsi, stmt_vec_info *seen_store)
9320 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
9321 poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
9323 if (dump_enabled_p ())
9324 dump_printf_loc (MSG_NOTE, vect_location,
9325 "------>vectorizing statement: %G", stmt_info->stmt);
9327 if (MAY_HAVE_DEBUG_BIND_STMTS && !STMT_VINFO_LIVE_P (stmt_info))
9328 vect_loop_kill_debug_uses (loop, stmt_info);
9330 if (!STMT_VINFO_RELEVANT_P (stmt_info)
9331 && !STMT_VINFO_LIVE_P (stmt_info))
9332 return false;
9334 if (STMT_VINFO_VECTYPE (stmt_info))
9336 poly_uint64 nunits
9337 = TYPE_VECTOR_SUBPARTS (STMT_VINFO_VECTYPE (stmt_info));
9338 if (!STMT_SLP_TYPE (stmt_info)
9339 && maybe_ne (nunits, vf)
9340 && dump_enabled_p ())
9341 /* For SLP VF is set according to unrolling factor, and not
9342 to vector size, hence for SLP this print is not valid. */
9343 dump_printf_loc (MSG_NOTE, vect_location, "multiple-types.\n");
9346 /* Pure SLP statements have already been vectorized. We still need
9347 to apply loop vectorization to hybrid SLP statements. */
9348 if (PURE_SLP_STMT (stmt_info))
9349 return false;
9351 if (dump_enabled_p ())
9352 dump_printf_loc (MSG_NOTE, vect_location, "transform statement.\n");
9354 if (vect_transform_stmt (loop_vinfo, stmt_info, gsi, NULL, NULL))
9355 *seen_store = stmt_info;
9357 return true;
9360 /* Helper function to pass to simplify_replace_tree to enable replacing tree's
9361 in the hash_map with its corresponding values. */
9363 static tree
9364 find_in_mapping (tree t, void *context)
9366 hash_map<tree,tree>* mapping = (hash_map<tree, tree>*) context;
9368 tree *value = mapping->get (t);
9369 return value ? *value : t;
9372 /* Update EPILOGUE's loop_vec_info. EPILOGUE was constructed as a copy of the
9373 original loop that has now been vectorized.
9375 The inits of the data_references need to be advanced with the number of
9376 iterations of the main loop. This has been computed in vect_do_peeling and
9377 is stored in parameter ADVANCE. We first restore the data_references
9378 initial offset with the values recored in ORIG_DRS_INIT.
9380 Since the loop_vec_info of this EPILOGUE was constructed for the original
9381 loop, its stmt_vec_infos all point to the original statements. These need
9382 to be updated to point to their corresponding copies as well as the SSA_NAMES
9383 in their PATTERN_DEF_SEQs and RELATED_STMTs.
9385 The data_reference's connections also need to be updated. Their
9386 corresponding dr_vec_info need to be reconnected to the EPILOGUE's
9387 stmt_vec_infos, their statements need to point to their corresponding copy,
9388 if they are gather loads or scatter stores then their reference needs to be
9389 updated to point to its corresponding copy and finally we set
9390 'base_misaligned' to false as we have already peeled for alignment in the
9391 prologue of the main loop. */
9393 static void
9394 update_epilogue_loop_vinfo (class loop *epilogue, tree advance)
9396 loop_vec_info epilogue_vinfo = loop_vec_info_for_loop (epilogue);
9397 auto_vec<gimple *> stmt_worklist;
9398 hash_map<tree,tree> mapping;
9399 gimple *orig_stmt, *new_stmt;
9400 gimple_stmt_iterator epilogue_gsi;
9401 gphi_iterator epilogue_phi_gsi;
9402 stmt_vec_info stmt_vinfo = NULL, related_vinfo;
9403 basic_block *epilogue_bbs = get_loop_body (epilogue);
9404 unsigned i;
9406 free (LOOP_VINFO_BBS (epilogue_vinfo));
9407 LOOP_VINFO_BBS (epilogue_vinfo) = epilogue_bbs;
9409 /* Advance data_reference's with the number of iterations of the previous
9410 loop and its prologue. */
9411 vect_update_inits_of_drs (epilogue_vinfo, advance, PLUS_EXPR);
9414 /* The EPILOGUE loop is a copy of the original loop so they share the same
9415 gimple UIDs. In this loop we update the loop_vec_info of the EPILOGUE to
9416 point to the copied statements. We also create a mapping of all LHS' in
9417 the original loop and all the LHS' in the EPILOGUE and create worklists to
9418 update teh STMT_VINFO_PATTERN_DEF_SEQs and STMT_VINFO_RELATED_STMTs. */
9419 for (unsigned i = 0; i < epilogue->num_nodes; ++i)
9421 for (epilogue_phi_gsi = gsi_start_phis (epilogue_bbs[i]);
9422 !gsi_end_p (epilogue_phi_gsi); gsi_next (&epilogue_phi_gsi))
9424 new_stmt = epilogue_phi_gsi.phi ();
9426 gcc_assert (gimple_uid (new_stmt) > 0);
9427 stmt_vinfo
9428 = epilogue_vinfo->stmt_vec_infos[gimple_uid (new_stmt) - 1];
9430 orig_stmt = STMT_VINFO_STMT (stmt_vinfo);
9431 STMT_VINFO_STMT (stmt_vinfo) = new_stmt;
9433 mapping.put (gimple_phi_result (orig_stmt),
9434 gimple_phi_result (new_stmt));
9435 /* PHI nodes can not have patterns or related statements. */
9436 gcc_assert (STMT_VINFO_PATTERN_DEF_SEQ (stmt_vinfo) == NULL
9437 && STMT_VINFO_RELATED_STMT (stmt_vinfo) == NULL);
9440 for (epilogue_gsi = gsi_start_bb (epilogue_bbs[i]);
9441 !gsi_end_p (epilogue_gsi); gsi_next (&epilogue_gsi))
9443 new_stmt = gsi_stmt (epilogue_gsi);
9444 if (is_gimple_debug (new_stmt))
9445 continue;
9447 gcc_assert (gimple_uid (new_stmt) > 0);
9448 stmt_vinfo
9449 = epilogue_vinfo->stmt_vec_infos[gimple_uid (new_stmt) - 1];
9451 orig_stmt = STMT_VINFO_STMT (stmt_vinfo);
9452 STMT_VINFO_STMT (stmt_vinfo) = new_stmt;
9454 if (tree old_lhs = gimple_get_lhs (orig_stmt))
9455 mapping.put (old_lhs, gimple_get_lhs (new_stmt));
9457 if (STMT_VINFO_PATTERN_DEF_SEQ (stmt_vinfo))
9459 gimple_seq seq = STMT_VINFO_PATTERN_DEF_SEQ (stmt_vinfo);
9460 for (gimple_stmt_iterator gsi = gsi_start (seq);
9461 !gsi_end_p (gsi); gsi_next (&gsi))
9462 stmt_worklist.safe_push (gsi_stmt (gsi));
9465 related_vinfo = STMT_VINFO_RELATED_STMT (stmt_vinfo);
9466 if (related_vinfo != NULL && related_vinfo != stmt_vinfo)
9468 gimple *stmt = STMT_VINFO_STMT (related_vinfo);
9469 stmt_worklist.safe_push (stmt);
9470 /* Set BB such that the assert in
9471 'get_initial_def_for_reduction' is able to determine that
9472 the BB of the related stmt is inside this loop. */
9473 gimple_set_bb (stmt,
9474 gimple_bb (new_stmt));
9475 related_vinfo = STMT_VINFO_RELATED_STMT (related_vinfo);
9476 gcc_assert (related_vinfo == NULL
9477 || related_vinfo == stmt_vinfo);
9482 /* The PATTERN_DEF_SEQs and RELATED_STMTs in the epilogue were constructed
9483 using the original main loop and thus need to be updated to refer to the
9484 cloned variables used in the epilogue. */
9485 for (unsigned i = 0; i < stmt_worklist.length (); ++i)
9487 gimple *stmt = stmt_worklist[i];
9488 tree *new_op;
9490 for (unsigned j = 1; j < gimple_num_ops (stmt); ++j)
9492 tree op = gimple_op (stmt, j);
9493 if ((new_op = mapping.get(op)))
9494 gimple_set_op (stmt, j, *new_op);
9495 else
9497 /* PR92429: The last argument of simplify_replace_tree disables
9498 folding when replacing arguments. This is required as
9499 otherwise you might end up with different statements than the
9500 ones analyzed in vect_loop_analyze, leading to different
9501 vectorization. */
9502 op = simplify_replace_tree (op, NULL_TREE, NULL_TREE,
9503 &find_in_mapping, &mapping, false);
9504 gimple_set_op (stmt, j, op);
9509 struct data_reference *dr;
9510 vec<data_reference_p> datarefs = LOOP_VINFO_DATAREFS (epilogue_vinfo);
9511 FOR_EACH_VEC_ELT (datarefs, i, dr)
9513 orig_stmt = DR_STMT (dr);
9514 gcc_assert (gimple_uid (orig_stmt) > 0);
9515 stmt_vinfo = epilogue_vinfo->stmt_vec_infos[gimple_uid (orig_stmt) - 1];
9516 /* Data references for gather loads and scatter stores do not use the
9517 updated offset we set using ADVANCE. Instead we have to make sure the
9518 reference in the data references point to the corresponding copy of
9519 the original in the epilogue. */
9520 if (STMT_VINFO_MEMORY_ACCESS_TYPE (vect_stmt_to_vectorize (stmt_vinfo))
9521 == VMAT_GATHER_SCATTER)
9523 DR_REF (dr)
9524 = simplify_replace_tree (DR_REF (dr), NULL_TREE, NULL_TREE,
9525 &find_in_mapping, &mapping);
9526 DR_BASE_ADDRESS (dr)
9527 = simplify_replace_tree (DR_BASE_ADDRESS (dr), NULL_TREE, NULL_TREE,
9528 &find_in_mapping, &mapping);
9530 DR_STMT (dr) = STMT_VINFO_STMT (stmt_vinfo);
9531 stmt_vinfo->dr_aux.stmt = stmt_vinfo;
9532 /* The vector size of the epilogue is smaller than that of the main loop
9533 so the alignment is either the same or lower. This means the dr will
9534 thus by definition be aligned. */
9535 STMT_VINFO_DR_INFO (stmt_vinfo)->base_misaligned = false;
9538 epilogue_vinfo->shared->datarefs_copy.release ();
9539 epilogue_vinfo->shared->save_datarefs ();
9542 /* Function vect_transform_loop.
9544 The analysis phase has determined that the loop is vectorizable.
9545 Vectorize the loop - created vectorized stmts to replace the scalar
9546 stmts in the loop, and update the loop exit condition.
9547 Returns scalar epilogue loop if any. */
9549 class loop *
9550 vect_transform_loop (loop_vec_info loop_vinfo, gimple *loop_vectorized_call)
9552 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
9553 class loop *epilogue = NULL;
9554 basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
9555 int nbbs = loop->num_nodes;
9556 int i;
9557 tree niters_vector = NULL_TREE;
9558 tree step_vector = NULL_TREE;
9559 tree niters_vector_mult_vf = NULL_TREE;
9560 poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
9561 unsigned int lowest_vf = constant_lower_bound (vf);
9562 gimple *stmt;
9563 bool check_profitability = false;
9564 unsigned int th;
9566 DUMP_VECT_SCOPE ("vec_transform_loop");
9568 loop_vinfo->shared->check_datarefs ();
9570 /* Use the more conservative vectorization threshold. If the number
9571 of iterations is constant assume the cost check has been performed
9572 by our caller. If the threshold makes all loops profitable that
9573 run at least the (estimated) vectorization factor number of times
9574 checking is pointless, too. */
9575 th = LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo);
9576 if (vect_apply_runtime_profitability_check_p (loop_vinfo))
9578 if (dump_enabled_p ())
9579 dump_printf_loc (MSG_NOTE, vect_location,
9580 "Profitability threshold is %d loop iterations.\n",
9581 th);
9582 check_profitability = true;
9585 /* Make sure there exists a single-predecessor exit bb. Do this before
9586 versioning. */
9587 edge e = single_exit (loop);
9588 if (! single_pred_p (e->dest))
9590 split_loop_exit_edge (e, true);
9591 if (dump_enabled_p ())
9592 dump_printf (MSG_NOTE, "split exit edge\n");
9595 /* Version the loop first, if required, so the profitability check
9596 comes first. */
9598 if (LOOP_REQUIRES_VERSIONING (loop_vinfo))
9600 class loop *sloop
9601 = vect_loop_versioning (loop_vinfo, loop_vectorized_call);
9602 sloop->force_vectorize = false;
9603 check_profitability = false;
9606 /* Make sure there exists a single-predecessor exit bb also on the
9607 scalar loop copy. Do this after versioning but before peeling
9608 so CFG structure is fine for both scalar and if-converted loop
9609 to make slpeel_duplicate_current_defs_from_edges face matched
9610 loop closed PHI nodes on the exit. */
9611 if (LOOP_VINFO_SCALAR_LOOP (loop_vinfo))
9613 e = single_exit (LOOP_VINFO_SCALAR_LOOP (loop_vinfo));
9614 if (! single_pred_p (e->dest))
9616 split_loop_exit_edge (e, true);
9617 if (dump_enabled_p ())
9618 dump_printf (MSG_NOTE, "split exit edge of scalar loop\n");
9622 tree niters = vect_build_loop_niters (loop_vinfo);
9623 LOOP_VINFO_NITERS_UNCHANGED (loop_vinfo) = niters;
9624 tree nitersm1 = unshare_expr (LOOP_VINFO_NITERSM1 (loop_vinfo));
9625 bool niters_no_overflow = loop_niters_no_overflow (loop_vinfo);
9626 tree advance;
9627 drs_init_vec orig_drs_init;
9629 epilogue = vect_do_peeling (loop_vinfo, niters, nitersm1, &niters_vector,
9630 &step_vector, &niters_vector_mult_vf, th,
9631 check_profitability, niters_no_overflow,
9632 &advance);
9634 if (LOOP_VINFO_SCALAR_LOOP (loop_vinfo)
9635 && LOOP_VINFO_SCALAR_LOOP_SCALING (loop_vinfo).initialized_p ())
9636 scale_loop_frequencies (LOOP_VINFO_SCALAR_LOOP (loop_vinfo),
9637 LOOP_VINFO_SCALAR_LOOP_SCALING (loop_vinfo));
9639 if (niters_vector == NULL_TREE)
9641 if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
9642 && !LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo)
9643 && known_eq (lowest_vf, vf))
9645 niters_vector
9646 = build_int_cst (TREE_TYPE (LOOP_VINFO_NITERS (loop_vinfo)),
9647 LOOP_VINFO_INT_NITERS (loop_vinfo) / lowest_vf);
9648 step_vector = build_one_cst (TREE_TYPE (niters));
9650 else if (vect_use_loop_mask_for_alignment_p (loop_vinfo))
9651 vect_gen_vector_loop_niters (loop_vinfo, niters, &niters_vector,
9652 &step_vector, niters_no_overflow);
9653 else
9654 /* vect_do_peeling subtracted the number of peeled prologue
9655 iterations from LOOP_VINFO_NITERS. */
9656 vect_gen_vector_loop_niters (loop_vinfo, LOOP_VINFO_NITERS (loop_vinfo),
9657 &niters_vector, &step_vector,
9658 niters_no_overflow);
9661 /* 1) Make sure the loop header has exactly two entries
9662 2) Make sure we have a preheader basic block. */
9664 gcc_assert (EDGE_COUNT (loop->header->preds) == 2);
9666 split_edge (loop_preheader_edge (loop));
9668 if (vect_use_loop_mask_for_alignment_p (loop_vinfo))
9669 /* This will deal with any possible peeling. */
9670 vect_prepare_for_masked_peels (loop_vinfo);
9672 /* Schedule the SLP instances first, then handle loop vectorization
9673 below. */
9674 if (!loop_vinfo->slp_instances.is_empty ())
9676 DUMP_VECT_SCOPE ("scheduling SLP instances");
9677 vect_schedule_slp (loop_vinfo, LOOP_VINFO_SLP_INSTANCES (loop_vinfo));
9680 /* FORNOW: the vectorizer supports only loops which body consist
9681 of one basic block (header + empty latch). When the vectorizer will
9682 support more involved loop forms, the order by which the BBs are
9683 traversed need to be reconsidered. */
9685 for (i = 0; i < nbbs; i++)
9687 basic_block bb = bbs[i];
9688 stmt_vec_info stmt_info;
9690 for (gphi_iterator si = gsi_start_phis (bb); !gsi_end_p (si);
9691 gsi_next (&si))
9693 gphi *phi = si.phi ();
9694 if (dump_enabled_p ())
9695 dump_printf_loc (MSG_NOTE, vect_location,
9696 "------>vectorizing phi: %G", phi);
9697 stmt_info = loop_vinfo->lookup_stmt (phi);
9698 if (!stmt_info)
9699 continue;
9701 if (MAY_HAVE_DEBUG_BIND_STMTS && !STMT_VINFO_LIVE_P (stmt_info))
9702 vect_loop_kill_debug_uses (loop, stmt_info);
9704 if (!STMT_VINFO_RELEVANT_P (stmt_info)
9705 && !STMT_VINFO_LIVE_P (stmt_info))
9706 continue;
9708 if (STMT_VINFO_VECTYPE (stmt_info)
9709 && (maybe_ne
9710 (TYPE_VECTOR_SUBPARTS (STMT_VINFO_VECTYPE (stmt_info)), vf))
9711 && dump_enabled_p ())
9712 dump_printf_loc (MSG_NOTE, vect_location, "multiple-types.\n");
9714 if ((STMT_VINFO_DEF_TYPE (stmt_info) == vect_induction_def
9715 || STMT_VINFO_DEF_TYPE (stmt_info) == vect_reduction_def
9716 || STMT_VINFO_DEF_TYPE (stmt_info) == vect_double_reduction_def
9717 || STMT_VINFO_DEF_TYPE (stmt_info) == vect_nested_cycle
9718 || STMT_VINFO_DEF_TYPE (stmt_info) == vect_internal_def)
9719 && ! PURE_SLP_STMT (stmt_info))
9721 if (dump_enabled_p ())
9722 dump_printf_loc (MSG_NOTE, vect_location, "transform phi.\n");
9723 vect_transform_stmt (loop_vinfo, stmt_info, NULL, NULL, NULL);
9727 for (gphi_iterator si = gsi_start_phis (bb); !gsi_end_p (si);
9728 gsi_next (&si))
9730 gphi *phi = si.phi ();
9731 stmt_info = loop_vinfo->lookup_stmt (phi);
9732 if (!stmt_info)
9733 continue;
9735 if (!STMT_VINFO_RELEVANT_P (stmt_info)
9736 && !STMT_VINFO_LIVE_P (stmt_info))
9737 continue;
9739 if ((STMT_VINFO_DEF_TYPE (stmt_info) == vect_induction_def
9740 || STMT_VINFO_DEF_TYPE (stmt_info) == vect_reduction_def
9741 || STMT_VINFO_DEF_TYPE (stmt_info) == vect_double_reduction_def
9742 || STMT_VINFO_DEF_TYPE (stmt_info) == vect_nested_cycle
9743 || STMT_VINFO_DEF_TYPE (stmt_info) == vect_internal_def)
9744 && ! PURE_SLP_STMT (stmt_info))
9745 maybe_set_vectorized_backedge_value (loop_vinfo, stmt_info);
9748 for (gimple_stmt_iterator si = gsi_start_bb (bb);
9749 !gsi_end_p (si);)
9751 stmt = gsi_stmt (si);
9752 /* During vectorization remove existing clobber stmts. */
9753 if (gimple_clobber_p (stmt))
9755 unlink_stmt_vdef (stmt);
9756 gsi_remove (&si, true);
9757 release_defs (stmt);
9759 else
9761 /* Ignore vector stmts created in the outer loop. */
9762 stmt_info = loop_vinfo->lookup_stmt (stmt);
9764 /* vector stmts created in the outer-loop during vectorization of
9765 stmts in an inner-loop may not have a stmt_info, and do not
9766 need to be vectorized. */
9767 stmt_vec_info seen_store = NULL;
9768 if (stmt_info)
9770 if (STMT_VINFO_IN_PATTERN_P (stmt_info))
9772 gimple *def_seq = STMT_VINFO_PATTERN_DEF_SEQ (stmt_info);
9773 for (gimple_stmt_iterator subsi = gsi_start (def_seq);
9774 !gsi_end_p (subsi); gsi_next (&subsi))
9776 stmt_vec_info pat_stmt_info
9777 = loop_vinfo->lookup_stmt (gsi_stmt (subsi));
9778 vect_transform_loop_stmt (loop_vinfo, pat_stmt_info,
9779 &si, &seen_store);
9781 stmt_vec_info pat_stmt_info
9782 = STMT_VINFO_RELATED_STMT (stmt_info);
9783 if (vect_transform_loop_stmt (loop_vinfo, pat_stmt_info,
9784 &si, &seen_store))
9785 maybe_set_vectorized_backedge_value (loop_vinfo,
9786 pat_stmt_info);
9788 else
9790 if (vect_transform_loop_stmt (loop_vinfo, stmt_info, &si,
9791 &seen_store))
9792 maybe_set_vectorized_backedge_value (loop_vinfo,
9793 stmt_info);
9796 gsi_next (&si);
9797 if (seen_store)
9799 if (STMT_VINFO_GROUPED_ACCESS (seen_store))
9800 /* Interleaving. If IS_STORE is TRUE, the
9801 vectorization of the interleaving chain was
9802 completed - free all the stores in the chain. */
9803 vect_remove_stores (loop_vinfo,
9804 DR_GROUP_FIRST_ELEMENT (seen_store));
9805 else
9806 /* Free the attached stmt_vec_info and remove the stmt. */
9807 loop_vinfo->remove_stmt (stmt_info);
9812 /* Stub out scalar statements that must not survive vectorization.
9813 Doing this here helps with grouped statements, or statements that
9814 are involved in patterns. */
9815 for (gimple_stmt_iterator gsi = gsi_start_bb (bb);
9816 !gsi_end_p (gsi); gsi_next (&gsi))
9818 gcall *call = dyn_cast <gcall *> (gsi_stmt (gsi));
9819 if (!call || !gimple_call_internal_p (call))
9820 continue;
9821 internal_fn ifn = gimple_call_internal_fn (call);
9822 if (ifn == IFN_MASK_LOAD)
9824 tree lhs = gimple_get_lhs (call);
9825 if (!VECTOR_TYPE_P (TREE_TYPE (lhs)))
9827 tree zero = build_zero_cst (TREE_TYPE (lhs));
9828 gimple *new_stmt = gimple_build_assign (lhs, zero);
9829 gsi_replace (&gsi, new_stmt, true);
9832 else if (conditional_internal_fn_code (ifn) != ERROR_MARK)
9834 tree lhs = gimple_get_lhs (call);
9835 if (!VECTOR_TYPE_P (TREE_TYPE (lhs)))
9837 tree else_arg
9838 = gimple_call_arg (call, gimple_call_num_args (call) - 1);
9839 gimple *new_stmt = gimple_build_assign (lhs, else_arg);
9840 gsi_replace (&gsi, new_stmt, true);
9844 } /* BBs in loop */
9846 /* The vectorization factor is always > 1, so if we use an IV increment of 1.
9847 a zero NITERS becomes a nonzero NITERS_VECTOR. */
9848 if (integer_onep (step_vector))
9849 niters_no_overflow = true;
9850 vect_set_loop_condition (loop, loop_vinfo, niters_vector, step_vector,
9851 niters_vector_mult_vf, !niters_no_overflow);
9853 unsigned int assumed_vf = vect_vf_for_cost (loop_vinfo);
9854 scale_profile_for_vect_loop (loop, assumed_vf);
9856 /* True if the final iteration might not handle a full vector's
9857 worth of scalar iterations. */
9858 bool final_iter_may_be_partial
9859 = LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo);
9860 /* The minimum number of iterations performed by the epilogue. This
9861 is 1 when peeling for gaps because we always need a final scalar
9862 iteration. */
9863 int min_epilogue_iters = LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo) ? 1 : 0;
9864 /* +1 to convert latch counts to loop iteration counts,
9865 -min_epilogue_iters to remove iterations that cannot be performed
9866 by the vector code. */
9867 int bias_for_lowest = 1 - min_epilogue_iters;
9868 int bias_for_assumed = bias_for_lowest;
9869 int alignment_npeels = LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo);
9870 if (alignment_npeels && LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))
9872 /* When the amount of peeling is known at compile time, the first
9873 iteration will have exactly alignment_npeels active elements.
9874 In the worst case it will have at least one. */
9875 int min_first_active = (alignment_npeels > 0 ? alignment_npeels : 1);
9876 bias_for_lowest += lowest_vf - min_first_active;
9877 bias_for_assumed += assumed_vf - min_first_active;
9879 /* In these calculations the "- 1" converts loop iteration counts
9880 back to latch counts. */
9881 if (loop->any_upper_bound)
9883 loop_vec_info main_vinfo = LOOP_VINFO_ORIG_LOOP_INFO (loop_vinfo);
9884 loop->nb_iterations_upper_bound
9885 = (final_iter_may_be_partial
9886 ? wi::udiv_ceil (loop->nb_iterations_upper_bound + bias_for_lowest,
9887 lowest_vf) - 1
9888 : wi::udiv_floor (loop->nb_iterations_upper_bound + bias_for_lowest,
9889 lowest_vf) - 1);
9890 if (main_vinfo)
9892 unsigned int bound;
9893 poly_uint64 main_iters
9894 = upper_bound (LOOP_VINFO_VECT_FACTOR (main_vinfo),
9895 LOOP_VINFO_COST_MODEL_THRESHOLD (main_vinfo));
9896 main_iters
9897 = upper_bound (main_iters,
9898 LOOP_VINFO_VERSIONING_THRESHOLD (main_vinfo));
9899 if (can_div_away_from_zero_p (main_iters,
9900 LOOP_VINFO_VECT_FACTOR (loop_vinfo),
9901 &bound))
9902 loop->nb_iterations_upper_bound
9903 = wi::umin ((widest_int) (bound - 1),
9904 loop->nb_iterations_upper_bound);
9907 if (loop->any_likely_upper_bound)
9908 loop->nb_iterations_likely_upper_bound
9909 = (final_iter_may_be_partial
9910 ? wi::udiv_ceil (loop->nb_iterations_likely_upper_bound
9911 + bias_for_lowest, lowest_vf) - 1
9912 : wi::udiv_floor (loop->nb_iterations_likely_upper_bound
9913 + bias_for_lowest, lowest_vf) - 1);
9914 if (loop->any_estimate)
9915 loop->nb_iterations_estimate
9916 = (final_iter_may_be_partial
9917 ? wi::udiv_ceil (loop->nb_iterations_estimate + bias_for_assumed,
9918 assumed_vf) - 1
9919 : wi::udiv_floor (loop->nb_iterations_estimate + bias_for_assumed,
9920 assumed_vf) - 1);
9922 if (dump_enabled_p ())
9924 if (!LOOP_VINFO_EPILOGUE_P (loop_vinfo))
9926 dump_printf_loc (MSG_NOTE, vect_location,
9927 "LOOP VECTORIZED\n");
9928 if (loop->inner)
9929 dump_printf_loc (MSG_NOTE, vect_location,
9930 "OUTER LOOP VECTORIZED\n");
9931 dump_printf (MSG_NOTE, "\n");
9933 else
9934 dump_printf_loc (MSG_NOTE, vect_location,
9935 "LOOP EPILOGUE VECTORIZED (MODE=%s)\n",
9936 GET_MODE_NAME (loop_vinfo->vector_mode));
9939 /* Loops vectorized with a variable factor won't benefit from
9940 unrolling/peeling. */
9941 if (!vf.is_constant ())
9943 loop->unroll = 1;
9944 if (dump_enabled_p ())
9945 dump_printf_loc (MSG_NOTE, vect_location, "Disabling unrolling due to"
9946 " variable-length vectorization factor\n");
9948 /* Free SLP instances here because otherwise stmt reference counting
9949 won't work. */
9950 slp_instance instance;
9951 FOR_EACH_VEC_ELT (LOOP_VINFO_SLP_INSTANCES (loop_vinfo), i, instance)
9952 vect_free_slp_instance (instance);
9953 LOOP_VINFO_SLP_INSTANCES (loop_vinfo).release ();
9954 /* Clear-up safelen field since its value is invalid after vectorization
9955 since vectorized loop can have loop-carried dependencies. */
9956 loop->safelen = 0;
9958 if (epilogue)
9960 update_epilogue_loop_vinfo (epilogue, advance);
9962 epilogue->simduid = loop->simduid;
9963 epilogue->force_vectorize = loop->force_vectorize;
9964 epilogue->dont_vectorize = false;
9967 return epilogue;
9970 /* The code below is trying to perform simple optimization - revert
9971 if-conversion for masked stores, i.e. if the mask of a store is zero
9972 do not perform it and all stored value producers also if possible.
9973 For example,
9974 for (i=0; i<n; i++)
9975 if (c[i])
9977 p1[i] += 1;
9978 p2[i] = p3[i] +2;
9980 this transformation will produce the following semi-hammock:
9982 if (!mask__ifc__42.18_165 == { 0, 0, 0, 0, 0, 0, 0, 0 })
9984 vect__11.19_170 = MASK_LOAD (vectp_p1.20_168, 0B, mask__ifc__42.18_165);
9985 vect__12.22_172 = vect__11.19_170 + vect_cst__171;
9986 MASK_STORE (vectp_p1.23_175, 0B, mask__ifc__42.18_165, vect__12.22_172);
9987 vect__18.25_182 = MASK_LOAD (vectp_p3.26_180, 0B, mask__ifc__42.18_165);
9988 vect__19.28_184 = vect__18.25_182 + vect_cst__183;
9989 MASK_STORE (vectp_p2.29_187, 0B, mask__ifc__42.18_165, vect__19.28_184);
9993 void
9994 optimize_mask_stores (class loop *loop)
9996 basic_block *bbs = get_loop_body (loop);
9997 unsigned nbbs = loop->num_nodes;
9998 unsigned i;
9999 basic_block bb;
10000 class loop *bb_loop;
10001 gimple_stmt_iterator gsi;
10002 gimple *stmt;
10003 auto_vec<gimple *> worklist;
10004 auto_purge_vect_location sentinel;
10006 vect_location = find_loop_location (loop);
10007 /* Pick up all masked stores in loop if any. */
10008 for (i = 0; i < nbbs; i++)
10010 bb = bbs[i];
10011 for (gsi = gsi_start_bb (bb); !gsi_end_p (gsi);
10012 gsi_next (&gsi))
10014 stmt = gsi_stmt (gsi);
10015 if (gimple_call_internal_p (stmt, IFN_MASK_STORE))
10016 worklist.safe_push (stmt);
10020 free (bbs);
10021 if (worklist.is_empty ())
10022 return;
10024 /* Loop has masked stores. */
10025 while (!worklist.is_empty ())
10027 gimple *last, *last_store;
10028 edge e, efalse;
10029 tree mask;
10030 basic_block store_bb, join_bb;
10031 gimple_stmt_iterator gsi_to;
10032 tree vdef, new_vdef;
10033 gphi *phi;
10034 tree vectype;
10035 tree zero;
10037 last = worklist.pop ();
10038 mask = gimple_call_arg (last, 2);
10039 bb = gimple_bb (last);
10040 /* Create then_bb and if-then structure in CFG, then_bb belongs to
10041 the same loop as if_bb. It could be different to LOOP when two
10042 level loop-nest is vectorized and mask_store belongs to the inner
10043 one. */
10044 e = split_block (bb, last);
10045 bb_loop = bb->loop_father;
10046 gcc_assert (loop == bb_loop || flow_loop_nested_p (loop, bb_loop));
10047 join_bb = e->dest;
10048 store_bb = create_empty_bb (bb);
10049 add_bb_to_loop (store_bb, bb_loop);
10050 e->flags = EDGE_TRUE_VALUE;
10051 efalse = make_edge (bb, store_bb, EDGE_FALSE_VALUE);
10052 /* Put STORE_BB to likely part. */
10053 efalse->probability = profile_probability::unlikely ();
10054 store_bb->count = efalse->count ();
10055 make_single_succ_edge (store_bb, join_bb, EDGE_FALLTHRU);
10056 if (dom_info_available_p (CDI_DOMINATORS))
10057 set_immediate_dominator (CDI_DOMINATORS, store_bb, bb);
10058 if (dump_enabled_p ())
10059 dump_printf_loc (MSG_NOTE, vect_location,
10060 "Create new block %d to sink mask stores.",
10061 store_bb->index);
10062 /* Create vector comparison with boolean result. */
10063 vectype = TREE_TYPE (mask);
10064 zero = build_zero_cst (vectype);
10065 stmt = gimple_build_cond (EQ_EXPR, mask, zero, NULL_TREE, NULL_TREE);
10066 gsi = gsi_last_bb (bb);
10067 gsi_insert_after (&gsi, stmt, GSI_SAME_STMT);
10068 /* Create new PHI node for vdef of the last masked store:
10069 .MEM_2 = VDEF <.MEM_1>
10070 will be converted to
10071 .MEM.3 = VDEF <.MEM_1>
10072 and new PHI node will be created in join bb
10073 .MEM_2 = PHI <.MEM_1, .MEM_3>
10075 vdef = gimple_vdef (last);
10076 new_vdef = make_ssa_name (gimple_vop (cfun), last);
10077 gimple_set_vdef (last, new_vdef);
10078 phi = create_phi_node (vdef, join_bb);
10079 add_phi_arg (phi, new_vdef, EDGE_SUCC (store_bb, 0), UNKNOWN_LOCATION);
10081 /* Put all masked stores with the same mask to STORE_BB if possible. */
10082 while (true)
10084 gimple_stmt_iterator gsi_from;
10085 gimple *stmt1 = NULL;
10087 /* Move masked store to STORE_BB. */
10088 last_store = last;
10089 gsi = gsi_for_stmt (last);
10090 gsi_from = gsi;
10091 /* Shift GSI to the previous stmt for further traversal. */
10092 gsi_prev (&gsi);
10093 gsi_to = gsi_start_bb (store_bb);
10094 gsi_move_before (&gsi_from, &gsi_to);
10095 /* Setup GSI_TO to the non-empty block start. */
10096 gsi_to = gsi_start_bb (store_bb);
10097 if (dump_enabled_p ())
10098 dump_printf_loc (MSG_NOTE, vect_location,
10099 "Move stmt to created bb\n%G", last);
10100 /* Move all stored value producers if possible. */
10101 while (!gsi_end_p (gsi))
10103 tree lhs;
10104 imm_use_iterator imm_iter;
10105 use_operand_p use_p;
10106 bool res;
10108 /* Skip debug statements. */
10109 if (is_gimple_debug (gsi_stmt (gsi)))
10111 gsi_prev (&gsi);
10112 continue;
10114 stmt1 = gsi_stmt (gsi);
10115 /* Do not consider statements writing to memory or having
10116 volatile operand. */
10117 if (gimple_vdef (stmt1)
10118 || gimple_has_volatile_ops (stmt1))
10119 break;
10120 gsi_from = gsi;
10121 gsi_prev (&gsi);
10122 lhs = gimple_get_lhs (stmt1);
10123 if (!lhs)
10124 break;
10126 /* LHS of vectorized stmt must be SSA_NAME. */
10127 if (TREE_CODE (lhs) != SSA_NAME)
10128 break;
10130 if (!VECTOR_TYPE_P (TREE_TYPE (lhs)))
10132 /* Remove dead scalar statement. */
10133 if (has_zero_uses (lhs))
10135 gsi_remove (&gsi_from, true);
10136 continue;
10140 /* Check that LHS does not have uses outside of STORE_BB. */
10141 res = true;
10142 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, lhs)
10144 gimple *use_stmt;
10145 use_stmt = USE_STMT (use_p);
10146 if (is_gimple_debug (use_stmt))
10147 continue;
10148 if (gimple_bb (use_stmt) != store_bb)
10150 res = false;
10151 break;
10154 if (!res)
10155 break;
10157 if (gimple_vuse (stmt1)
10158 && gimple_vuse (stmt1) != gimple_vuse (last_store))
10159 break;
10161 /* Can move STMT1 to STORE_BB. */
10162 if (dump_enabled_p ())
10163 dump_printf_loc (MSG_NOTE, vect_location,
10164 "Move stmt to created bb\n%G", stmt1);
10165 gsi_move_before (&gsi_from, &gsi_to);
10166 /* Shift GSI_TO for further insertion. */
10167 gsi_prev (&gsi_to);
10169 /* Put other masked stores with the same mask to STORE_BB. */
10170 if (worklist.is_empty ()
10171 || gimple_call_arg (worklist.last (), 2) != mask
10172 || worklist.last () != stmt1)
10173 break;
10174 last = worklist.pop ();
10176 add_phi_arg (phi, gimple_vuse (last_store), e, UNKNOWN_LOCATION);
10180 /* Decide whether it is possible to use a zero-based induction variable
10181 when vectorizing LOOP_VINFO with partial vectors. If it is, return
10182 the value that the induction variable must be able to hold in order
10183 to ensure that the rgroups eventually have no active vector elements.
10184 Return -1 otherwise. */
10186 widest_int
10187 vect_iv_limit_for_partial_vectors (loop_vec_info loop_vinfo)
10189 tree niters_skip = LOOP_VINFO_MASK_SKIP_NITERS (loop_vinfo);
10190 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
10191 unsigned HOST_WIDE_INT max_vf = vect_max_vf (loop_vinfo);
10193 /* Calculate the value that the induction variable must be able
10194 to hit in order to ensure that we end the loop with an all-false mask.
10195 This involves adding the maximum number of inactive trailing scalar
10196 iterations. */
10197 widest_int iv_limit = -1;
10198 if (max_loop_iterations (loop, &iv_limit))
10200 if (niters_skip)
10202 /* Add the maximum number of skipped iterations to the
10203 maximum iteration count. */
10204 if (TREE_CODE (niters_skip) == INTEGER_CST)
10205 iv_limit += wi::to_widest (niters_skip);
10206 else
10207 iv_limit += max_vf - 1;
10209 else if (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo))
10210 /* Make a conservatively-correct assumption. */
10211 iv_limit += max_vf - 1;
10213 /* IV_LIMIT is the maximum number of latch iterations, which is also
10214 the maximum in-range IV value. Round this value down to the previous
10215 vector alignment boundary and then add an extra full iteration. */
10216 poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
10217 iv_limit = (iv_limit & -(int) known_alignment (vf)) + max_vf;
10219 return iv_limit;
10222 /* For the given rgroup_controls RGC, check whether an induction variable
10223 would ever hit a value that produces a set of all-false masks or zero
10224 lengths before wrapping around. Return true if it's possible to wrap
10225 around before hitting the desirable value, otherwise return false. */
10227 bool
10228 vect_rgroup_iv_might_wrap_p (loop_vec_info loop_vinfo, rgroup_controls *rgc)
10230 widest_int iv_limit = vect_iv_limit_for_partial_vectors (loop_vinfo);
10232 if (iv_limit == -1)
10233 return true;
10235 tree compare_type = LOOP_VINFO_RGROUP_COMPARE_TYPE (loop_vinfo);
10236 unsigned int compare_precision = TYPE_PRECISION (compare_type);
10237 unsigned nitems = rgc->max_nscalars_per_iter * rgc->factor;
10239 if (wi::min_precision (iv_limit * nitems, UNSIGNED) > compare_precision)
10240 return true;
10242 return false;