[official-gcc.git] / gcc / tree-vect-loop.c
1 /* Loop Vectorization
2 Copyright (C) 2003-2021 Free Software Foundation, Inc.
3 Contributed by Dorit Naishlos <dorit@il.ibm.com> and
4 Ira Rosen <irar@il.ibm.com>
6 This file is part of GCC.
8 GCC is free software; you can redistribute it and/or modify it under
9 the terms of the GNU General Public License as published by the Free
10 Software Foundation; either version 3, or (at your option) any later
11 version.
13 GCC is distributed in the hope that it will be useful, but WITHOUT ANY
14 WARRANTY; without even the implied warranty of MERCHANTABILITY or
15 FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
16 for more details.
18 You should have received a copy of the GNU General Public License
19 along with GCC; see the file COPYING3. If not see
20 <http://www.gnu.org/licenses/>. */
22 #define INCLUDE_ALGORITHM
23 #include "config.h"
24 #include "system.h"
25 #include "coretypes.h"
26 #include "backend.h"
27 #include "target.h"
28 #include "rtl.h"
29 #include "tree.h"
30 #include "gimple.h"
31 #include "cfghooks.h"
32 #include "tree-pass.h"
33 #include "ssa.h"
34 #include "optabs-tree.h"
35 #include "diagnostic-core.h"
36 #include "fold-const.h"
37 #include "stor-layout.h"
38 #include "cfganal.h"
39 #include "gimplify.h"
40 #include "gimple-iterator.h"
41 #include "gimplify-me.h"
42 #include "tree-ssa-loop-ivopts.h"
43 #include "tree-ssa-loop-manip.h"
44 #include "tree-ssa-loop-niter.h"
45 #include "tree-ssa-loop.h"
46 #include "cfgloop.h"
47 #include "tree-scalar-evolution.h"
48 #include "tree-vectorizer.h"
49 #include "gimple-fold.h"
50 #include "cgraph.h"
51 #include "tree-cfg.h"
52 #include "tree-if-conv.h"
53 #include "internal-fn.h"
54 #include "tree-vector-builder.h"
55 #include "vec-perm-indices.h"
56 #include "tree-eh.h"
58 /* Loop Vectorization Pass.
60 This pass tries to vectorize loops.
62 For example, the vectorizer transforms the following simple loop:
64 short a[N]; short b[N]; short c[N]; int i;
66 for (i=0; i<N; i++){
67 a[i] = b[i] + c[i];
70 as if it was manually vectorized by rewriting the source code into:
72 typedef int __attribute__((mode(V8HI))) v8hi;
73 short a[N]; short b[N]; short c[N]; int i;
74 v8hi *pa = (v8hi*)a, *pb = (v8hi*)b, *pc = (v8hi*)c;
75 v8hi va, vb, vc;
77 for (i=0; i<N/8; i++){
78 vb = pb[i];
79 vc = pc[i];
80 va = vb + vc;
81 pa[i] = va;
84 The main entry to this pass is vectorize_loops(), in which
85 the vectorizer applies a set of analyses on a given set of loops,
86 followed by the actual vectorization transformation for the loops that
87 had successfully passed the analysis phase.
88 Throughout this pass we make a distinction between two types of
89 data: scalars (which are represented by SSA_NAMES), and memory references
90 ("data-refs"). These two types of data require different handling both
91 during analysis and transformation. The types of data-refs that the
92 vectorizer currently supports are ARRAY_REFS whose base is an array DECL
93 (not a pointer), and INDIRECT_REFS through pointers; both array and pointer
94 accesses are required to have a simple (consecutive) access pattern.
96 Analysis phase:
97 ===============
98 The driver for the analysis phase is vect_analyze_loop().
99 It applies a set of analyses, some of which rely on the scalar evolution
100 analyzer (scev) developed by Sebastian Pop.
102 During the analysis phase the vectorizer records some information
103 per stmt in a "stmt_vec_info" struct which is attached to each stmt in the
104 loop, as well as general information about the loop as a whole, which is
105 recorded in a "loop_vec_info" struct attached to each loop.
107 Transformation phase:
108 =====================
109 The loop transformation phase scans all the stmts in the loop, and
110 creates a vector stmt (or a sequence of stmts) for each scalar stmt S in
111 the loop that needs to be vectorized. It inserts the vector code sequence
112 just before the scalar stmt S, and records a pointer to the vector code
113 in STMT_VINFO_VEC_STMT (stmt_info) (stmt_info is the stmt_vec_info struct
114 attached to S). This pointer will be used for the vectorization of following
115 stmts which use the def of stmt S. Stmt S is removed if it writes to memory;
116 otherwise, we rely on dead code elimination for removing it.
118 For example, say stmt S1 was vectorized into stmt VS1:
120 VS1: vb = px[i];
121 S1: b = x[i]; STMT_VINFO_VEC_STMT (stmt_info (S1)) = VS1
122 S2: a = b;
124 To vectorize stmt S2, the vectorizer first finds the stmt that defines
125 the operand 'b' (S1), and gets the relevant vector def 'vb' from the
126 vector stmt VS1 pointed to by STMT_VINFO_VEC_STMT (stmt_info (S1)). The
127 resulting sequence would be:
129 VS1: vb = px[i];
130 S1: b = x[i]; STMT_VINFO_VEC_STMT (stmt_info (S1)) = VS1
131 VS2: va = vb;
132 S2: a = b; STMT_VINFO_VEC_STMT (stmt_info (S2)) = VS2
134 Operands that are not SSA_NAMEs are data-refs that appear in
135 load/store operations (like 'x[i]' in S1), and are handled differently.
137 Target modeling:
138 =================
139 Currently the only target-specific information that is used is the
140 size of the vector (in bytes) - "TARGET_VECTORIZE_UNITS_PER_SIMD_WORD".
141 Targets that can support different sizes of vectors will, for now, need
142 to specify one value for "TARGET_VECTORIZE_UNITS_PER_SIMD_WORD". More
143 flexibility will be added in the future.
145 Since we only vectorize operations whose vector form can be
146 expressed using existing tree codes, to verify that an operation is
147 supported, the vectorizer checks the relevant optab at the relevant
148 machine_mode (e.g., optab_handler (add_optab, V8HImode)). If
149 the value found is CODE_FOR_nothing, then there's no target support, and
150 we can't vectorize the stmt.
152 For additional information on this project see:
153 http://gcc.gnu.org/projects/tree-ssa/vectorization.html
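/* Illustrative sketch (not part of this file): the transformation described
   above, written as standalone C using GCC's generic vector extensions
   instead of the mode(V8HI) attribute.  The function names and the
   assumption that N is a multiple of 8 are purely for illustration; the
   real pass handles leftover iterations with an epilogue loop or with
   partial vectors.  */

#include <stddef.h>

#define N 1024

typedef short v8hi __attribute__ ((vector_size (16)));	/* 8 x 16-bit */

short a[N], b[N], c[N];

/* Scalar form, as the user wrote it.  */
void
add_scalar (void)
{
  for (size_t i = 0; i < N; i++)
    a[i] = b[i] + c[i];
}

/* Hand-vectorized form, roughly what the vectorizer produces.  */
void
add_vectorized (void)
{
  v8hi *pa = (v8hi *) a, *pb = (v8hi *) b, *pc = (v8hi *) c;
  for (size_t i = 0; i < N / 8; i++)
    pa[i] = pb[i] + pc[i];		/* one vector add per 8 elements */
}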
156 static void vect_estimate_min_profitable_iters (loop_vec_info, int *, int *);
157 static stmt_vec_info vect_is_simple_reduction (loop_vec_info, stmt_vec_info,
158 bool *, bool *);
160 /* Subroutine of vect_determine_vf_for_stmt that handles only one
161 statement. VECTYPE_MAYBE_SET_P is true if STMT_VINFO_VECTYPE
162 may already be set for general statements (not just data refs). */
164 static opt_result
165 vect_determine_vf_for_stmt_1 (vec_info *vinfo, stmt_vec_info stmt_info,
166 bool vectype_maybe_set_p,
167 poly_uint64 *vf)
169 gimple *stmt = stmt_info->stmt;
171 if ((!STMT_VINFO_RELEVANT_P (stmt_info)
172 && !STMT_VINFO_LIVE_P (stmt_info))
173 || gimple_clobber_p (stmt))
175 if (dump_enabled_p ())
176 dump_printf_loc (MSG_NOTE, vect_location, "skip.\n");
177 return opt_result::success ();
180 tree stmt_vectype, nunits_vectype;
181 opt_result res = vect_get_vector_types_for_stmt (vinfo, stmt_info,
182 &stmt_vectype,
183 &nunits_vectype);
184 if (!res)
185 return res;
187 if (stmt_vectype)
189 if (STMT_VINFO_VECTYPE (stmt_info))
190 /* The only case when a vectype had been already set is for stmts
191 that contain a data ref, or for "pattern-stmts" (stmts generated
192 by the vectorizer to represent/replace a certain idiom). */
193 gcc_assert ((STMT_VINFO_DATA_REF (stmt_info)
194 || vectype_maybe_set_p)
195 && STMT_VINFO_VECTYPE (stmt_info) == stmt_vectype);
196 else
197 STMT_VINFO_VECTYPE (stmt_info) = stmt_vectype;
200 if (nunits_vectype)
201 vect_update_max_nunits (vf, nunits_vectype);
203 return opt_result::success ();
206 /* Subroutine of vect_determine_vectorization_factor. Set the vector
207 types of STMT_INFO and all attached pattern statements and update
208 the vectorization factor VF accordingly. Return true on success
209 or false if something prevented vectorization. */
211 static opt_result
212 vect_determine_vf_for_stmt (vec_info *vinfo,
213 stmt_vec_info stmt_info, poly_uint64 *vf)
215 if (dump_enabled_p ())
216 dump_printf_loc (MSG_NOTE, vect_location, "==> examining statement: %G",
217 stmt_info->stmt);
218 opt_result res = vect_determine_vf_for_stmt_1 (vinfo, stmt_info, false, vf);
219 if (!res)
220 return res;
222 if (STMT_VINFO_IN_PATTERN_P (stmt_info)
223 && STMT_VINFO_RELATED_STMT (stmt_info))
225 gimple *pattern_def_seq = STMT_VINFO_PATTERN_DEF_SEQ (stmt_info);
226 stmt_info = STMT_VINFO_RELATED_STMT (stmt_info);
228 /* If a pattern statement has def stmts, analyze them too. */
229 for (gimple_stmt_iterator si = gsi_start (pattern_def_seq);
230 !gsi_end_p (si); gsi_next (&si))
232 stmt_vec_info def_stmt_info = vinfo->lookup_stmt (gsi_stmt (si));
233 if (dump_enabled_p ())
234 dump_printf_loc (MSG_NOTE, vect_location,
235 "==> examining pattern def stmt: %G",
236 def_stmt_info->stmt);
237 res = vect_determine_vf_for_stmt_1 (vinfo, def_stmt_info, true, vf);
238 if (!res)
239 return res;
242 if (dump_enabled_p ())
243 dump_printf_loc (MSG_NOTE, vect_location,
244 "==> examining pattern statement: %G",
245 stmt_info->stmt);
246 res = vect_determine_vf_for_stmt_1 (vinfo, stmt_info, true, vf);
247 if (!res)
248 return res;
251 return opt_result::success ();
254 /* Function vect_determine_vectorization_factor
256 Determine the vectorization factor (VF). VF is the number of data elements
257 that are operated upon in parallel in a single iteration of the vectorized
258 loop. For example, when vectorizing a loop that operates on 4-byte elements,
259 on a target with a vector size (VS) of 16 bytes, the VF is set to 4, since 4
260 elements can fit in a single vector register.
262 We currently support vectorization of loops in which all types operated upon
263 are of the same size. Therefore this function currently sets VF according to
264 the size of the types operated upon, and fails if there are multiple sizes
265 in the loop.
267 VF is also the factor by which the loop iterations are strip-mined, e.g.:
268 original loop:
269 for (i=0; i<N; i++){
270 a[i] = b[i] + c[i];
273 vectorized loop:
274 for (i=0; i<N; i+=VF){
275 a[i:VF] = b[i:VF] + c[i:VF];
279 static opt_result
280 vect_determine_vectorization_factor (loop_vec_info loop_vinfo)
282 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
283 basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
284 unsigned nbbs = loop->num_nodes;
285 poly_uint64 vectorization_factor = 1;
286 tree scalar_type = NULL_TREE;
287 gphi *phi;
288 tree vectype;
289 stmt_vec_info stmt_info;
290 unsigned i;
292 DUMP_VECT_SCOPE ("vect_determine_vectorization_factor");
294 for (i = 0; i < nbbs; i++)
296 basic_block bb = bbs[i];
298 for (gphi_iterator si = gsi_start_phis (bb); !gsi_end_p (si);
299 gsi_next (&si))
301 phi = si.phi ();
302 stmt_info = loop_vinfo->lookup_stmt (phi);
303 if (dump_enabled_p ())
304 dump_printf_loc (MSG_NOTE, vect_location, "==> examining phi: %G",
305 phi);
307 gcc_assert (stmt_info);
309 if (STMT_VINFO_RELEVANT_P (stmt_info)
310 || STMT_VINFO_LIVE_P (stmt_info))
312 gcc_assert (!STMT_VINFO_VECTYPE (stmt_info));
313 scalar_type = TREE_TYPE (PHI_RESULT (phi));
315 if (dump_enabled_p ())
316 dump_printf_loc (MSG_NOTE, vect_location,
317 "get vectype for scalar type: %T\n",
318 scalar_type);
320 vectype = get_vectype_for_scalar_type (loop_vinfo, scalar_type);
321 if (!vectype)
322 return opt_result::failure_at (phi,
323 "not vectorized: unsupported "
324 "data-type %T\n",
325 scalar_type);
326 STMT_VINFO_VECTYPE (stmt_info) = vectype;
328 if (dump_enabled_p ())
329 dump_printf_loc (MSG_NOTE, vect_location, "vectype: %T\n",
330 vectype);
332 if (dump_enabled_p ())
334 dump_printf_loc (MSG_NOTE, vect_location, "nunits = ");
335 dump_dec (MSG_NOTE, TYPE_VECTOR_SUBPARTS (vectype));
336 dump_printf (MSG_NOTE, "\n");
339 vect_update_max_nunits (&vectorization_factor, vectype);
343 for (gimple_stmt_iterator si = gsi_start_bb (bb); !gsi_end_p (si);
344 gsi_next (&si))
346 if (is_gimple_debug (gsi_stmt (si)))
347 continue;
348 stmt_info = loop_vinfo->lookup_stmt (gsi_stmt (si));
349 opt_result res
350 = vect_determine_vf_for_stmt (loop_vinfo,
351 stmt_info, &vectorization_factor);
352 if (!res)
353 return res;
357 /* TODO: Analyze cost. Decide if it is worthwhile to vectorize. */
358 if (dump_enabled_p ())
360 dump_printf_loc (MSG_NOTE, vect_location, "vectorization factor = ");
361 dump_dec (MSG_NOTE, vectorization_factor);
362 dump_printf (MSG_NOTE, "\n");
365 if (known_le (vectorization_factor, 1U))
366 return opt_result::failure_at (vect_location,
367 "not vectorized: unsupported data-type\n");
368 LOOP_VINFO_VECT_FACTOR (loop_vinfo) = vectorization_factor;
369 return opt_result::success ();
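/* Worked example (illustrative, not part of this file): with 16-byte vectors
   and 4-byte int elements the VF computed above is 16 / 4 = 4, and the loop
   is strip-mined by 4.  The function name and the explicit scalar epilogue
   are assumptions for the sketch; the pass materializes the epilogue (or
   uses partial vectors) itself.  */

void
add_strip_mined (int *a, const int *b, const int *c, unsigned n)
{
  unsigned i = 0;
  for (; i + 4 <= n; i += 4)		/* vectorized body, VF = 4 */
    for (unsigned j = 0; j < 4; j++)
      a[i + j] = b[i + j] + c[i + j];	/* stands for a single vector add */
  for (; i < n; i++)			/* scalar epilogue for n % 4 */
    a[i] = b[i] + c[i];
}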
373 /* Function vect_is_simple_iv_evolution.
375 FORNOW: A simple evolution of an induction variable in the loop is
376 considered a polynomial evolution. */
378 static bool
379 vect_is_simple_iv_evolution (unsigned loop_nb, tree access_fn, tree * init,
380 tree * step)
382 tree init_expr;
383 tree step_expr;
384 tree evolution_part = evolution_part_in_loop_num (access_fn, loop_nb);
385 basic_block bb;
387 /* When there is no evolution in this loop, the evolution function
388 is not "simple". */
389 if (evolution_part == NULL_TREE)
390 return false;
392 /* When the evolution is a polynomial of degree >= 2
393 the evolution function is not "simple". */
394 if (tree_is_chrec (evolution_part))
395 return false;
397 step_expr = evolution_part;
398 init_expr = unshare_expr (initial_condition_in_loop_num (access_fn, loop_nb));
400 if (dump_enabled_p ())
401 dump_printf_loc (MSG_NOTE, vect_location, "step: %T, init: %T\n",
402 step_expr, init_expr);
404 *init = init_expr;
405 *step = step_expr;
407 if (TREE_CODE (step_expr) != INTEGER_CST
408 && (TREE_CODE (step_expr) != SSA_NAME
409 || ((bb = gimple_bb (SSA_NAME_DEF_STMT (step_expr)))
410 && flow_bb_inside_loop_p (get_loop (cfun, loop_nb), bb))
411 || (!INTEGRAL_TYPE_P (TREE_TYPE (step_expr))
412 && (!SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr))
413 || !flag_associative_math)))
414 && (TREE_CODE (step_expr) != REAL_CST
415 || !flag_associative_math))
417 if (dump_enabled_p ())
418 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
419 "step unknown.\n");
420 return false;
423 return true;
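/* Illustrative examples (not part of this file) of what the check above
   accepts as a "simple" evolution, i.e. a degree-1 polynomial
   {init, +, step} with a usable step:

     for (i = 0; i < n; i++)           i evolves as {0, +, 1}      simple
     for (p = q; p < e; p += 4)        p evolves as {q, +, 4}      simple
     for (i = 1; i < n; i *= 2)        geometric, not polynomial   rejected
     for (i = 0; i < n; i += j, j++)   the step itself evolves, so the
                                       tree_is_chrec test above rejects it.  */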
426 /* Return true if PHI, described by STMT_INFO, is the inner PHI in
427 what we are assuming is a double reduction. For example, given
428 a structure like this:
430 outer1:
431 x_1 = PHI <x_4(outer2), ...>;
434 inner:
435 x_2 = PHI <x_1(outer1), ...>;
437 x_3 = ...;
440 outer2:
441 x_4 = PHI <x_3(inner)>;
444 outer loop analysis would treat x_1 as a double reduction phi and
445 this function would then return true for x_2. */
447 static bool
448 vect_inner_phi_in_double_reduction_p (loop_vec_info loop_vinfo, gphi *phi)
450 use_operand_p use_p;
451 ssa_op_iter op_iter;
452 FOR_EACH_PHI_ARG (use_p, phi, op_iter, SSA_OP_USE)
453 if (stmt_vec_info def_info = loop_vinfo->lookup_def (USE_FROM_PTR (use_p)))
454 if (STMT_VINFO_DEF_TYPE (def_info) == vect_double_reduction_def)
455 return true;
456 return false;
459 /* Function vect_analyze_scalar_cycles_1.
461 Examine the cross iteration def-use cycles of scalar variables
462 in LOOP. LOOP_VINFO represents the loop that is now being
463 considered for vectorization (can be LOOP, or an outer-loop
464 enclosing LOOP). */
466 static void
467 vect_analyze_scalar_cycles_1 (loop_vec_info loop_vinfo, class loop *loop)
469 basic_block bb = loop->header;
470 tree init, step;
471 auto_vec<stmt_vec_info, 64> worklist;
472 gphi_iterator gsi;
473 bool double_reduc, reduc_chain;
475 DUMP_VECT_SCOPE ("vect_analyze_scalar_cycles");
477 /* First - identify all inductions. Reduction detection assumes that all the
478 inductions have been identified; therefore, this order must not be
479 changed. */
480 for (gsi = gsi_start_phis (bb); !gsi_end_p (gsi); gsi_next (&gsi))
482 gphi *phi = gsi.phi ();
483 tree access_fn = NULL;
484 tree def = PHI_RESULT (phi);
485 stmt_vec_info stmt_vinfo = loop_vinfo->lookup_stmt (phi);
487 if (dump_enabled_p ())
488 dump_printf_loc (MSG_NOTE, vect_location, "Analyze phi: %G", phi);
490 /* Skip virtual phi's. The data dependences that are associated with
491 virtual defs/uses (i.e., memory accesses) are analyzed elsewhere. */
492 if (virtual_operand_p (def))
493 continue;
495 STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_unknown_def_type;
497 /* Analyze the evolution function. */
498 access_fn = analyze_scalar_evolution (loop, def);
499 if (access_fn)
501 STRIP_NOPS (access_fn);
502 if (dump_enabled_p ())
503 dump_printf_loc (MSG_NOTE, vect_location,
504 "Access function of PHI: %T\n", access_fn);
505 STMT_VINFO_LOOP_PHI_EVOLUTION_BASE_UNCHANGED (stmt_vinfo)
506 = initial_condition_in_loop_num (access_fn, loop->num);
507 STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_vinfo)
508 = evolution_part_in_loop_num (access_fn, loop->num);
511 if (!access_fn
512 || vect_inner_phi_in_double_reduction_p (loop_vinfo, phi)
513 || !vect_is_simple_iv_evolution (loop->num, access_fn, &init, &step)
514 || (LOOP_VINFO_LOOP (loop_vinfo) != loop
515 && TREE_CODE (step) != INTEGER_CST))
517 worklist.safe_push (stmt_vinfo);
518 continue;
521 gcc_assert (STMT_VINFO_LOOP_PHI_EVOLUTION_BASE_UNCHANGED (stmt_vinfo)
522 != NULL_TREE);
523 gcc_assert (STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_vinfo) != NULL_TREE);
525 if (dump_enabled_p ())
526 dump_printf_loc (MSG_NOTE, vect_location, "Detected induction.\n");
527 STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_induction_def;
531 /* Second - identify all reductions and nested cycles. */
532 while (worklist.length () > 0)
534 stmt_vec_info stmt_vinfo = worklist.pop ();
535 gphi *phi = as_a <gphi *> (stmt_vinfo->stmt);
536 tree def = PHI_RESULT (phi);
538 if (dump_enabled_p ())
539 dump_printf_loc (MSG_NOTE, vect_location, "Analyze phi: %G", phi);
541 gcc_assert (!virtual_operand_p (def)
542 && STMT_VINFO_DEF_TYPE (stmt_vinfo) == vect_unknown_def_type);
544 stmt_vec_info reduc_stmt_info
545 = vect_is_simple_reduction (loop_vinfo, stmt_vinfo, &double_reduc,
546 &reduc_chain);
547 if (reduc_stmt_info)
549 STMT_VINFO_REDUC_DEF (stmt_vinfo) = reduc_stmt_info;
550 STMT_VINFO_REDUC_DEF (reduc_stmt_info) = stmt_vinfo;
551 if (double_reduc)
553 if (dump_enabled_p ())
554 dump_printf_loc (MSG_NOTE, vect_location,
555 "Detected double reduction.\n");
557 STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_double_reduction_def;
558 STMT_VINFO_DEF_TYPE (reduc_stmt_info) = vect_double_reduction_def;
560 else
562 if (loop != LOOP_VINFO_LOOP (loop_vinfo))
564 if (dump_enabled_p ())
565 dump_printf_loc (MSG_NOTE, vect_location,
566 "Detected vectorizable nested cycle.\n");
568 STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_nested_cycle;
570 else
572 if (dump_enabled_p ())
573 dump_printf_loc (MSG_NOTE, vect_location,
574 "Detected reduction.\n");
576 STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_reduction_def;
577 STMT_VINFO_DEF_TYPE (reduc_stmt_info) = vect_reduction_def;
578 /* Store the reduction cycles for possible vectorization in
579 loop-aware SLP if it was not detected as reduction
580 chain. */
581 if (! reduc_chain)
582 LOOP_VINFO_REDUCTIONS (loop_vinfo).safe_push
583 (reduc_stmt_info);
587 else
588 if (dump_enabled_p ())
589 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
590 "Unknown def-use cycle pattern.\n");
595 /* Function vect_analyze_scalar_cycles.
597 Examine the cross iteration def-use cycles of scalar variables, by
598 analyzing the loop-header PHIs of scalar variables. Classify each
599 cycle as one of the following: invariant, induction, reduction, unknown.
600 We do that for the loop represented by LOOP_VINFO, and also to its
601 inner-loop, if it exists.
602 Examples for scalar cycles:
604 Example1: reduction:
606 loop1:
607 for (i=0; i<N; i++)
608 sum += a[i];
610 Example2: induction:
612 loop2:
613 for (i=0; i<N; i++)
614 a[i] = i; */
616 static void
617 vect_analyze_scalar_cycles (loop_vec_info loop_vinfo)
619 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
621 vect_analyze_scalar_cycles_1 (loop_vinfo, loop);
623 /* When vectorizing an outer-loop, the inner-loop is executed sequentially.
624 Reductions in such inner-loop therefore have different properties than
625 the reductions in the nest that gets vectorized:
626 1. When vectorized, they are executed in the same order as in the original
627 scalar loop, so we can't change the order of computation when
628 vectorizing them.
629 2. FIXME: Inner-loop reductions can be used in the inner-loop, so the
630 current checks are too strict. */
632 if (loop->inner)
633 vect_analyze_scalar_cycles_1 (loop_vinfo, loop->inner);
636 /* Transfer group and reduction information from STMT_INFO to its
637 pattern stmt. */
639 static void
640 vect_fixup_reduc_chain (stmt_vec_info stmt_info)
642 stmt_vec_info firstp = STMT_VINFO_RELATED_STMT (stmt_info);
643 stmt_vec_info stmtp;
644 gcc_assert (!REDUC_GROUP_FIRST_ELEMENT (firstp)
645 && REDUC_GROUP_FIRST_ELEMENT (stmt_info));
646 REDUC_GROUP_SIZE (firstp) = REDUC_GROUP_SIZE (stmt_info);
649 stmtp = STMT_VINFO_RELATED_STMT (stmt_info);
650 gcc_checking_assert (STMT_VINFO_DEF_TYPE (stmtp)
651 == STMT_VINFO_DEF_TYPE (stmt_info));
652 REDUC_GROUP_FIRST_ELEMENT (stmtp) = firstp;
653 stmt_info = REDUC_GROUP_NEXT_ELEMENT (stmt_info);
654 if (stmt_info)
655 REDUC_GROUP_NEXT_ELEMENT (stmtp)
656 = STMT_VINFO_RELATED_STMT (stmt_info);
658 while (stmt_info);
661 /* Fixup scalar cycles that now have their stmts detected as patterns. */
663 static void
664 vect_fixup_scalar_cycles_with_patterns (loop_vec_info loop_vinfo)
666 stmt_vec_info first;
667 unsigned i;
669 FOR_EACH_VEC_ELT (LOOP_VINFO_REDUCTION_CHAINS (loop_vinfo), i, first)
671 stmt_vec_info next = REDUC_GROUP_NEXT_ELEMENT (first);
672 while (next)
674 if ((STMT_VINFO_IN_PATTERN_P (next)
675 != STMT_VINFO_IN_PATTERN_P (first))
676 || STMT_VINFO_REDUC_IDX (vect_stmt_to_vectorize (next)) == -1)
677 break;
678 next = REDUC_GROUP_NEXT_ELEMENT (next);
680 /* If all reduction chain members are well-formed patterns, adjust
681 the group to group the pattern stmts instead. */
682 if (! next
683 && STMT_VINFO_REDUC_IDX (vect_stmt_to_vectorize (first)) != -1)
685 if (STMT_VINFO_IN_PATTERN_P (first))
687 vect_fixup_reduc_chain (first);
688 LOOP_VINFO_REDUCTION_CHAINS (loop_vinfo)[i]
689 = STMT_VINFO_RELATED_STMT (first);
692 /* If not all stmts in the chain are patterns, or if we failed
693 to update STMT_VINFO_REDUC_IDX, dissolve the chain and handle
694 it as a regular reduction instead. */
695 else
697 stmt_vec_info vinfo = first;
698 stmt_vec_info last = NULL;
699 while (vinfo)
701 next = REDUC_GROUP_NEXT_ELEMENT (vinfo);
702 REDUC_GROUP_FIRST_ELEMENT (vinfo) = NULL;
703 REDUC_GROUP_NEXT_ELEMENT (vinfo) = NULL;
704 last = vinfo;
705 vinfo = next;
707 STMT_VINFO_DEF_TYPE (vect_stmt_to_vectorize (first))
708 = vect_internal_def;
709 loop_vinfo->reductions.safe_push (vect_stmt_to_vectorize (last));
710 LOOP_VINFO_REDUCTION_CHAINS (loop_vinfo).unordered_remove (i);
711 --i;
716 /* Function vect_get_loop_niters.
718 Determine how many iterations the loop executes and place it
719 in NUMBER_OF_ITERATIONS. Place the number of latch iterations
720 in NUMBER_OF_ITERATIONSM1. Place the condition under which the
721 niter information holds in ASSUMPTIONS.
723 Return the loop exit condition. */
726 static gcond *
727 vect_get_loop_niters (class loop *loop, tree *assumptions,
728 tree *number_of_iterations, tree *number_of_iterationsm1)
730 edge exit = single_exit (loop);
731 class tree_niter_desc niter_desc;
732 tree niter_assumptions, niter, may_be_zero;
733 gcond *cond = get_loop_exit_condition (loop);
735 *assumptions = boolean_true_node;
736 *number_of_iterationsm1 = chrec_dont_know;
737 *number_of_iterations = chrec_dont_know;
738 DUMP_VECT_SCOPE ("get_loop_niters");
740 if (!exit)
741 return cond;
743 may_be_zero = NULL_TREE;
744 if (!number_of_iterations_exit_assumptions (loop, exit, &niter_desc, NULL)
745 || chrec_contains_undetermined (niter_desc.niter))
746 return cond;
748 niter_assumptions = niter_desc.assumptions;
749 may_be_zero = niter_desc.may_be_zero;
750 niter = niter_desc.niter;
752 if (may_be_zero && integer_zerop (may_be_zero))
753 may_be_zero = NULL_TREE;
755 if (may_be_zero)
757 if (COMPARISON_CLASS_P (may_be_zero))
759 /* Try to combine may_be_zero with assumptions, this can simplify
760 computation of niter expression. */
761 if (niter_assumptions && !integer_nonzerop (niter_assumptions))
762 niter_assumptions = fold_build2 (TRUTH_AND_EXPR, boolean_type_node,
763 niter_assumptions,
764 fold_build1 (TRUTH_NOT_EXPR,
765 boolean_type_node,
766 may_be_zero));
767 else
768 niter = fold_build3 (COND_EXPR, TREE_TYPE (niter), may_be_zero,
769 build_int_cst (TREE_TYPE (niter), 0),
770 rewrite_to_non_trapping_overflow (niter));
772 may_be_zero = NULL_TREE;
774 else if (integer_nonzerop (may_be_zero))
776 *number_of_iterationsm1 = build_int_cst (TREE_TYPE (niter), 0);
777 *number_of_iterations = build_int_cst (TREE_TYPE (niter), 1);
778 return cond;
780 else
781 return cond;
784 *assumptions = niter_assumptions;
785 *number_of_iterationsm1 = niter;
787 /* We want the number of loop header executions, which is the number
788 of latch executions plus one.
789 ??? For UINT_MAX latch executions this number overflows to zero
790 for loops like do { n++; } while (n != 0); */
791 if (niter && !chrec_contains_undetermined (niter))
792 niter = fold_build2 (PLUS_EXPR, TREE_TYPE (niter), unshare_expr (niter),
793 build_int_cst (TREE_TYPE (niter), 1));
794 *number_of_iterations = niter;
796 return cond;
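/* Worked example (illustrative, not part of this file): for

     for (i = 0; i < n; i++)  body;        with n known to be nonzero

   the latch runs n - 1 times, so NUMBER_OF_ITERATIONSM1 is n - 1 and
   NUMBER_OF_ITERATIONS (header executions) is n.  For

     do { n++; } while (n != 0);

   starting from n == 0 the latch runs UINT_MAX times, and the "+ 1" above
   wraps the header count to 0, which is the overflow noted in the ???
   comment.  */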
799 /* Function bb_in_loop_p
801 Used as predicate for dfs order traversal of the loop bbs. */
803 static bool
804 bb_in_loop_p (const_basic_block bb, const void *data)
806 const class loop *const loop = (const class loop *)data;
807 if (flow_bb_inside_loop_p (loop, bb))
808 return true;
809 return false;
813 /* Create and initialize a new loop_vec_info struct for LOOP_IN, as well as
814 stmt_vec_info structs for all the stmts in LOOP_IN. */
816 _loop_vec_info::_loop_vec_info (class loop *loop_in, vec_info_shared *shared)
817 : vec_info (vec_info::loop, shared),
818 loop (loop_in),
819 bbs (XCNEWVEC (basic_block, loop->num_nodes)),
820 num_itersm1 (NULL_TREE),
821 num_iters (NULL_TREE),
822 num_iters_unchanged (NULL_TREE),
823 num_iters_assumptions (NULL_TREE),
824 th (0),
825 versioning_threshold (0),
826 vectorization_factor (0),
827 main_loop_edge (nullptr),
828 skip_main_loop_edge (nullptr),
829 skip_this_loop_edge (nullptr),
830 reusable_accumulators (),
831 max_vectorization_factor (0),
832 mask_skip_niters (NULL_TREE),
833 rgroup_compare_type (NULL_TREE),
834 simd_if_cond (NULL_TREE),
835 unaligned_dr (NULL),
836 peeling_for_alignment (0),
837 ptr_mask (0),
838 ivexpr_map (NULL),
839 scan_map (NULL),
840 slp_unrolling_factor (1),
841 single_scalar_iteration_cost (0),
842 vec_outside_cost (0),
843 vec_inside_cost (0),
844 inner_loop_cost_factor (param_vect_inner_loop_cost_factor),
845 vectorizable (false),
846 can_use_partial_vectors_p (param_vect_partial_vector_usage != 0),
847 using_partial_vectors_p (false),
848 epil_using_partial_vectors_p (false),
849 peeling_for_gaps (false),
850 peeling_for_niter (false),
851 no_data_dependencies (false),
852 has_mask_store (false),
853 scalar_loop_scaling (profile_probability::uninitialized ()),
854 scalar_loop (NULL),
855 orig_loop_info (NULL)
857 /* CHECKME: We want to visit all BBs before their successors (except for
858 latch blocks, for which this assertion wouldn't hold). In the simple
859 case of the loop forms we allow, a dfs order of the BBs would be the same
860 as reversed postorder traversal, so we are safe. */
862 unsigned int nbbs = dfs_enumerate_from (loop->header, 0, bb_in_loop_p,
863 bbs, loop->num_nodes, loop);
864 gcc_assert (nbbs == loop->num_nodes);
866 for (unsigned int i = 0; i < nbbs; i++)
868 basic_block bb = bbs[i];
869 gimple_stmt_iterator si;
871 for (si = gsi_start_phis (bb); !gsi_end_p (si); gsi_next (&si))
873 gimple *phi = gsi_stmt (si);
874 gimple_set_uid (phi, 0);
875 add_stmt (phi);
878 for (si = gsi_start_bb (bb); !gsi_end_p (si); gsi_next (&si))
880 gimple *stmt = gsi_stmt (si);
881 gimple_set_uid (stmt, 0);
882 if (is_gimple_debug (stmt))
883 continue;
884 add_stmt (stmt);
885 /* If the .GOMP_SIMD_LANE call for the current loop has 3 arguments, the
886 third argument is the #pragma omp simd if (x) condition: when it is 0,
887 the loop shouldn't be vectorized; when it is a non-zero constant, it
888 should be vectorized normally; otherwise the loop is versioned, with the
889 vectorized copy used if the condition is non-zero at runtime. */
890 if (loop_in->simduid
891 && is_gimple_call (stmt)
892 && gimple_call_internal_p (stmt)
893 && gimple_call_internal_fn (stmt) == IFN_GOMP_SIMD_LANE
894 && gimple_call_num_args (stmt) >= 3
895 && TREE_CODE (gimple_call_arg (stmt, 0)) == SSA_NAME
896 && (loop_in->simduid
897 == SSA_NAME_VAR (gimple_call_arg (stmt, 0))))
899 tree arg = gimple_call_arg (stmt, 2);
900 if (integer_zerop (arg) || TREE_CODE (arg) == SSA_NAME)
901 simd_if_cond = arg;
902 else
903 gcc_assert (integer_nonzerop (arg));
908 epilogue_vinfos.create (6);
911 /* Free all levels of rgroup CONTROLS. */
913 void
914 release_vec_loop_controls (vec<rgroup_controls> *controls)
916 rgroup_controls *rgc;
917 unsigned int i;
918 FOR_EACH_VEC_ELT (*controls, i, rgc)
919 rgc->controls.release ();
920 controls->release ();
923 /* Free all memory used by the _loop_vec_info, as well as all the
924 stmt_vec_info structs of all the stmts in the loop. */
926 _loop_vec_info::~_loop_vec_info ()
928 free (bbs);
930 release_vec_loop_controls (&masks);
931 release_vec_loop_controls (&lens);
932 delete ivexpr_map;
933 delete scan_map;
934 epilogue_vinfos.release ();
936 /* When we release an epilogue vinfo that we do not intend to use
937 avoid clearing AUX of the main loop which should continue to
938 point to the main loop vinfo since otherwise we'll leak that. */
939 if (loop->aux == this)
940 loop->aux = NULL;
943 /* Return an invariant or register for EXPR and emit necessary
944 computations in the LOOP_VINFO loop preheader. */
946 tree
947 cse_and_gimplify_to_preheader (loop_vec_info loop_vinfo, tree expr)
949 if (is_gimple_reg (expr)
950 || is_gimple_min_invariant (expr))
951 return expr;
953 if (! loop_vinfo->ivexpr_map)
954 loop_vinfo->ivexpr_map = new hash_map<tree_operand_hash, tree>;
955 tree &cached = loop_vinfo->ivexpr_map->get_or_insert (expr);
956 if (! cached)
958 gimple_seq stmts = NULL;
959 cached = force_gimple_operand (unshare_expr (expr),
960 &stmts, true, NULL_TREE);
961 if (stmts)
963 edge e = loop_preheader_edge (LOOP_VINFO_LOOP (loop_vinfo));
964 gsi_insert_seq_on_edge_immediate (e, stmts);
967 return cached;
970 /* Return true if we can use CMP_TYPE as the comparison type to produce
971 all masks required to mask LOOP_VINFO. */
973 static bool
974 can_produce_all_loop_masks_p (loop_vec_info loop_vinfo, tree cmp_type)
976 rgroup_controls *rgm;
977 unsigned int i;
978 FOR_EACH_VEC_ELT (LOOP_VINFO_MASKS (loop_vinfo), i, rgm)
979 if (rgm->type != NULL_TREE
980 && !direct_internal_fn_supported_p (IFN_WHILE_ULT,
981 cmp_type, rgm->type,
982 OPTIMIZE_FOR_SPEED))
983 return false;
984 return true;
987 /* Calculate the maximum number of scalars per iteration for every
988 rgroup in LOOP_VINFO. */
990 static unsigned int
991 vect_get_max_nscalars_per_iter (loop_vec_info loop_vinfo)
993 unsigned int res = 1;
994 unsigned int i;
995 rgroup_controls *rgm;
996 FOR_EACH_VEC_ELT (LOOP_VINFO_MASKS (loop_vinfo), i, rgm)
997 res = MAX (res, rgm->max_nscalars_per_iter);
998 return res;
1001 /* Calculate the minimum precision necessary to represent:
1003 MAX_NITERS * FACTOR
1005 as an unsigned integer, where MAX_NITERS is the maximum number of
1006 loop header iterations for the original scalar form of LOOP_VINFO. */
1008 static unsigned
1009 vect_min_prec_for_max_niters (loop_vec_info loop_vinfo, unsigned int factor)
1011 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1013 /* Get the maximum number of iterations that is representable
1014 in the counter type. */
1015 tree ni_type = TREE_TYPE (LOOP_VINFO_NITERSM1 (loop_vinfo));
1016 widest_int max_ni = wi::to_widest (TYPE_MAX_VALUE (ni_type)) + 1;
1018 /* Get a more refined estimate for the number of iterations. */
1019 widest_int max_back_edges;
1020 if (max_loop_iterations (loop, &max_back_edges))
1021 max_ni = wi::smin (max_ni, max_back_edges + 1);
1023 /* Work out how many bits we need to represent the limit. */
1024 return wi::min_precision (max_ni * factor, UNSIGNED);
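/* Worked example (illustrative, not part of this file): if the niter type
   is a 32-bit unsigned type, MAX_NITERS is at most 2^32; with FACTOR = 2
   (two scalars per iteration in the widest rgroup) the product can reach
   2^33, so the minimum precision returned here is 34 bits.  A tighter
   max_loop_iterations bound, e.g. at most 1000 iterations, would shrink
   this to wi::min_precision (2000, UNSIGNED) = 11 bits.  */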
1027 /* True if the loop needs peeling or partial vectors when vectorized. */
1029 static bool
1030 vect_need_peeling_or_partial_vectors_p (loop_vec_info loop_vinfo)
1032 unsigned HOST_WIDE_INT const_vf;
1033 HOST_WIDE_INT max_niter
1034 = likely_max_stmt_executions_int (LOOP_VINFO_LOOP (loop_vinfo));
1036 unsigned th = LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo);
1037 if (!th && LOOP_VINFO_ORIG_LOOP_INFO (loop_vinfo))
1038 th = LOOP_VINFO_COST_MODEL_THRESHOLD (LOOP_VINFO_ORIG_LOOP_INFO
1039 (loop_vinfo));
1041 if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
1042 && LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) >= 0)
1044 /* Work out the (constant) number of iterations that need to be
1045 peeled for reasons other than niters. */
1046 unsigned int peel_niter = LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo);
1047 if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo))
1048 peel_niter += 1;
1049 if (!multiple_p (LOOP_VINFO_INT_NITERS (loop_vinfo) - peel_niter,
1050 LOOP_VINFO_VECT_FACTOR (loop_vinfo)))
1051 return true;
1053 else if (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo)
1054 /* ??? When peeling for gaps but not alignment, we could
1055 try to check whether the (variable) niters is known to be
1056 VF * N + 1. That's something of a niche case though. */
1057 || LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo)
1058 || !LOOP_VINFO_VECT_FACTOR (loop_vinfo).is_constant (&const_vf)
1059 || ((tree_ctz (LOOP_VINFO_NITERS (loop_vinfo))
1060 < (unsigned) exact_log2 (const_vf))
1061 /* In case of versioning, check if the maximum number of
1062 iterations is greater than th. If they are identical,
1063 the epilogue is unnecessary. */
1064 && (!LOOP_REQUIRES_VERSIONING (loop_vinfo)
1065 || ((unsigned HOST_WIDE_INT) max_niter
1066 > (th / const_vf) * const_vf))))
1067 return true;
1069 return false;
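/* Worked examples (illustrative, not part of this file), assuming a known
   constant VF of 8 and a known iteration count:

     niters = 64, no peeling for alignment or gaps:
       64 is a multiple of 8, so no epilogue is needed and this returns
       false.

     niters = 64, peeling for gaps (peel_niter = 1):
       64 - 1 = 63 is not a multiple of 8, so an epilogue (or partial
       vectors) is needed and this returns true.  */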
1072 /* Each statement in LOOP_VINFO can be masked where necessary. Check
1073 whether we can actually generate the masks required. Return true if so,
1074 storing the type of the scalar IV in LOOP_VINFO_RGROUP_COMPARE_TYPE. */
1076 static bool
1077 vect_verify_full_masking (loop_vec_info loop_vinfo)
1079 unsigned int min_ni_width;
1080 unsigned int max_nscalars_per_iter
1081 = vect_get_max_nscalars_per_iter (loop_vinfo);
1083 /* Use a normal loop if there are no statements that need masking.
1084 This only happens in rare degenerate cases: it means that the loop
1085 has no loads, no stores, and no live-out values. */
1086 if (LOOP_VINFO_MASKS (loop_vinfo).is_empty ())
1087 return false;
1089 /* Work out how many bits we need to represent the limit. */
1090 min_ni_width
1091 = vect_min_prec_for_max_niters (loop_vinfo, max_nscalars_per_iter);
1093 /* Find a scalar mode for which WHILE_ULT is supported. */
1094 opt_scalar_int_mode cmp_mode_iter;
1095 tree cmp_type = NULL_TREE;
1096 tree iv_type = NULL_TREE;
1097 widest_int iv_limit = vect_iv_limit_for_partial_vectors (loop_vinfo);
1098 unsigned int iv_precision = UINT_MAX;
1100 if (iv_limit != -1)
1101 iv_precision = wi::min_precision (iv_limit * max_nscalars_per_iter,
1102 UNSIGNED);
1104 FOR_EACH_MODE_IN_CLASS (cmp_mode_iter, MODE_INT)
1106 unsigned int cmp_bits = GET_MODE_BITSIZE (cmp_mode_iter.require ());
1107 if (cmp_bits >= min_ni_width
1108 && targetm.scalar_mode_supported_p (cmp_mode_iter.require ()))
1110 tree this_type = build_nonstandard_integer_type (cmp_bits, true);
1111 if (this_type
1112 && can_produce_all_loop_masks_p (loop_vinfo, this_type))
1114 /* Although we could stop as soon as we find a valid mode,
1115 there are at least two reasons why that's not always the
1116 best choice:
1118 - An IV that's Pmode or wider is more likely to be reusable
1119 in address calculations than an IV that's narrower than
1120 Pmode.
1122 - Doing the comparison in IV_PRECISION or wider allows
1123 a natural 0-based IV, whereas using a narrower comparison
1124 type requires mitigations against wrap-around.
1126 Conversely, if the IV limit is variable, doing the comparison
1127 in a wider type than the original type can introduce
1128 unnecessary extensions, so picking the widest valid mode
1129 is not always a good choice either.
1131 Here we prefer the first IV type that's Pmode or wider,
1132 and the first comparison type that's IV_PRECISION or wider.
1133 (The comparison type must be no wider than the IV type,
1134 to avoid extensions in the vector loop.)
1136 ??? We might want to try continuing beyond Pmode for ILP32
1137 targets if CMP_BITS < IV_PRECISION. */
1138 iv_type = this_type;
1139 if (!cmp_type || iv_precision > TYPE_PRECISION (cmp_type))
1140 cmp_type = this_type;
1141 if (cmp_bits >= GET_MODE_BITSIZE (Pmode))
1142 break;
1147 if (!cmp_type)
1148 return false;
1150 LOOP_VINFO_RGROUP_COMPARE_TYPE (loop_vinfo) = cmp_type;
1151 LOOP_VINFO_RGROUP_IV_TYPE (loop_vinfo) = iv_type;
1152 return true;
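/* Conceptual sketch (illustrative, not part of this file) of the fully
   masked loop that this check enables, with VF = 4.  The per-lane
   comparison plays the role of IFN_WHILE_ULT, which computes the whole
   mask in one operation; the function name is an assumption for the
   sketch.  */

void
masked_copy (int *dst, const int *src, unsigned n)
{
  for (unsigned i = 0; i < n; i += 4)
    for (unsigned lane = 0; lane < 4; lane++)
      if (i + lane < n)			/* lane is active iff i + lane < n */
	dst[i + lane] = src[i + lane];
}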
1155 /* Check whether we can use vector accesses with length based on precision
1156 comparison. So far, to keep it simple, we only allow the case in which the
1157 precision of the target-supported length is larger than the precision
1158 required by the loop niters. */
1160 static bool
1161 vect_verify_loop_lens (loop_vec_info loop_vinfo)
1163 if (LOOP_VINFO_LENS (loop_vinfo).is_empty ())
1164 return false;
1166 unsigned int max_nitems_per_iter = 1;
1167 unsigned int i;
1168 rgroup_controls *rgl;
1169 /* Find the maximum number of items per iteration for every rgroup. */
1170 FOR_EACH_VEC_ELT (LOOP_VINFO_LENS (loop_vinfo), i, rgl)
1172 unsigned nitems_per_iter = rgl->max_nscalars_per_iter * rgl->factor;
1173 max_nitems_per_iter = MAX (max_nitems_per_iter, nitems_per_iter);
1176 /* Work out how many bits we need to represent the length limit. */
1177 unsigned int min_ni_prec
1178 = vect_min_prec_for_max_niters (loop_vinfo, max_nitems_per_iter);
1180 /* Now use the maximum of below precisions for one suitable IV type:
1181 - the IV's natural precision
1182 - the precision needed to hold: the maximum number of scalar
1183 iterations multiplied by the scale factor (min_ni_prec above)
1184 - the Pmode precision
1186 If min_ni_prec is less than the precision of the current niters,
1187 we prefer to still use the niters type. Prefer to use Pmode and
1188 wider IV to avoid narrow conversions. */
1190 unsigned int ni_prec
1191 = TYPE_PRECISION (TREE_TYPE (LOOP_VINFO_NITERS (loop_vinfo)));
1192 min_ni_prec = MAX (min_ni_prec, ni_prec);
1193 min_ni_prec = MAX (min_ni_prec, GET_MODE_BITSIZE (Pmode));
1195 tree iv_type = NULL_TREE;
1196 opt_scalar_int_mode tmode_iter;
1197 FOR_EACH_MODE_IN_CLASS (tmode_iter, MODE_INT)
1199 scalar_mode tmode = tmode_iter.require ();
1200 unsigned int tbits = GET_MODE_BITSIZE (tmode);
1202 /* ??? Do we really want to construct one IV whose precision exceeds
1203 BITS_PER_WORD? */
1204 if (tbits > BITS_PER_WORD)
1205 break;
1207 /* Find the first available standard integral type. */
1208 if (tbits >= min_ni_prec && targetm.scalar_mode_supported_p (tmode))
1210 iv_type = build_nonstandard_integer_type (tbits, true);
1211 break;
1215 if (!iv_type)
1217 if (dump_enabled_p ())
1218 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1219 "can't vectorize with length-based partial vectors"
1220 " because there is no suitable iv type.\n");
1221 return false;
1224 LOOP_VINFO_RGROUP_COMPARE_TYPE (loop_vinfo) = iv_type;
1225 LOOP_VINFO_RGROUP_IV_TYPE (loop_vinfo) = iv_type;
1227 return true;
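/* Conceptual sketch (illustrative, not part of this file) of a
   length-controlled loop, as used on targets whose loads and stores take
   an explicit length operand.  Instead of a per-lane mask, each iteration
   processes min (remaining, VF) elements; the function name and VF = 4
   are assumptions for the sketch.  */

void
len_copy (int *dst, const int *src, unsigned n)
{
  const unsigned vf = 4;
  for (unsigned i = 0; i < n; i += vf)
    {
      unsigned len = n - i < vf ? n - i : vf;	/* the "length" control */
      for (unsigned j = 0; j < len; j++)
	dst[i + j] = src[i + j];
    }
}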
1230 /* Calculate the cost of one scalar iteration of the loop. */
1231 static void
1232 vect_compute_single_scalar_iteration_cost (loop_vec_info loop_vinfo)
1234 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1235 basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
1236 int nbbs = loop->num_nodes, factor;
1237 int innerloop_iters, i;
1239 DUMP_VECT_SCOPE ("vect_compute_single_scalar_iteration_cost");
1241 /* Gather costs for statements in the scalar loop. */
1243 /* FORNOW. */
1244 innerloop_iters = 1;
1245 if (loop->inner)
1246 innerloop_iters = LOOP_VINFO_INNER_LOOP_COST_FACTOR (loop_vinfo);
1248 for (i = 0; i < nbbs; i++)
1250 gimple_stmt_iterator si;
1251 basic_block bb = bbs[i];
1253 if (bb->loop_father == loop->inner)
1254 factor = innerloop_iters;
1255 else
1256 factor = 1;
1258 for (si = gsi_start_bb (bb); !gsi_end_p (si); gsi_next (&si))
1260 gimple *stmt = gsi_stmt (si);
1261 stmt_vec_info stmt_info = loop_vinfo->lookup_stmt (stmt);
1263 if (!is_gimple_assign (stmt) && !is_gimple_call (stmt))
1264 continue;
1266 /* Skip stmts that are not vectorized inside the loop. */
1267 stmt_vec_info vstmt_info = vect_stmt_to_vectorize (stmt_info);
1268 if (!STMT_VINFO_RELEVANT_P (vstmt_info)
1269 && (!STMT_VINFO_LIVE_P (vstmt_info)
1270 || !VECTORIZABLE_CYCLE_DEF
1271 (STMT_VINFO_DEF_TYPE (vstmt_info))))
1272 continue;
1274 vect_cost_for_stmt kind;
1275 if (STMT_VINFO_DATA_REF (stmt_info))
1277 if (DR_IS_READ (STMT_VINFO_DATA_REF (stmt_info)))
1278 kind = scalar_load;
1279 else
1280 kind = scalar_store;
1282 else if (vect_nop_conversion_p (stmt_info))
1283 continue;
1284 else
1285 kind = scalar_stmt;
1287 /* We are using vect_prologue here to avoid scaling twice
1288 by the inner loop factor. */
1289 record_stmt_cost (&LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo),
1290 factor, kind, stmt_info, 0, vect_prologue);
1294 /* Now accumulate cost. */
1295 vector_costs *target_cost_data = init_cost (loop_vinfo, true);
1296 stmt_info_for_cost *si;
1297 int j;
1298 FOR_EACH_VEC_ELT (LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo),
1299 j, si)
1300 (void) add_stmt_cost (target_cost_data, si->count,
1301 si->kind, si->stmt_info, si->vectype,
1302 si->misalign, si->where);
1303 unsigned prologue_cost = 0, body_cost = 0, epilogue_cost = 0;
1304 finish_cost (target_cost_data, &prologue_cost, &body_cost,
1305 &epilogue_cost);
1306 delete target_cost_data;
1307 LOOP_VINFO_SINGLE_SCALAR_ITERATION_COST (loop_vinfo)
1308 = prologue_cost + body_cost + epilogue_cost;
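/* Worked example (illustrative, not part of this file): for

     for (i = 0; i < n; i++)
       a[i] = b[i] + c[i];

   one scalar iteration records two scalar_load costs (b[i] and c[i]), one
   scalar_store cost (a[i]) and one scalar_stmt cost for the addition.
   With a target cost of 1 per scalar statement that gives a
   single-scalar-iteration cost of about 4 (loop-control statements may add
   a little more); statements inside an inner loop would be scaled by
   LOOP_VINFO_INNER_LOOP_COST_FACTOR instead of 1.  */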
1312 /* Function vect_analyze_loop_form.
1314 Verify that certain CFG restrictions hold, including:
1315 - the loop has a pre-header
1316 - the loop has a single entry and exit
1317 - the loop exit condition is simple enough
1318 - the number of iterations can be analyzed, i.e., a countable loop. The
1319 niter could be analyzed under some assumptions. */
1321 opt_result
1322 vect_analyze_loop_form (class loop *loop, vect_loop_form_info *info)
1324 DUMP_VECT_SCOPE ("vect_analyze_loop_form");
1326 /* Different restrictions apply when we are considering an inner-most loop,
1327 vs. an outer (nested) loop.
1328 (FORNOW. May want to relax some of these restrictions in the future). */
1330 info->inner_loop_cond = NULL;
1331 if (!loop->inner)
1333 /* Inner-most loop. We currently require that the number of BBs is
1334 exactly 2 (the header and latch). Vectorizable inner-most loops
1335 look like this:
1337 (pre-header)
1339 header <--------+
1340 | | |
1341 | +--> latch --+
1343 (exit-bb) */
1345 if (loop->num_nodes != 2)
1346 return opt_result::failure_at (vect_location,
1347 "not vectorized:"
1348 " control flow in loop.\n");
1350 if (empty_block_p (loop->header))
1351 return opt_result::failure_at (vect_location,
1352 "not vectorized: empty loop.\n");
1354 else
1356 class loop *innerloop = loop->inner;
1357 edge entryedge;
1359 /* Nested loop. We currently require that the loop is doubly-nested,
1360 contains a single inner loop, and the number of BBs is exactly 5.
1361 Vectorizable outer-loops look like this:
1363 (pre-header)
1365 header <---+
1367 inner-loop |
1369 tail ------+
1371 (exit-bb)
1373 The inner-loop has the properties expected of inner-most loops
1374 as described above. */
1376 if ((loop->inner)->inner || (loop->inner)->next)
1377 return opt_result::failure_at (vect_location,
1378 "not vectorized:"
1379 " multiple nested loops.\n");
1381 if (loop->num_nodes != 5)
1382 return opt_result::failure_at (vect_location,
1383 "not vectorized:"
1384 " control flow in loop.\n");
1386 entryedge = loop_preheader_edge (innerloop);
1387 if (entryedge->src != loop->header
1388 || !single_exit (innerloop)
1389 || single_exit (innerloop)->dest != EDGE_PRED (loop->latch, 0)->src)
1390 return opt_result::failure_at (vect_location,
1391 "not vectorized:"
1392 " unsupported outerloop form.\n");
1394 /* Analyze the inner-loop. */
1395 vect_loop_form_info inner;
1396 opt_result res = vect_analyze_loop_form (loop->inner, &inner);
1397 if (!res)
1399 if (dump_enabled_p ())
1400 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1401 "not vectorized: Bad inner loop.\n");
1402 return res;
1405 /* Don't support analyzing niter under assumptions for inner
1406 loop. */
1407 if (!integer_onep (inner.assumptions))
1408 return opt_result::failure_at (vect_location,
1409 "not vectorized: Bad inner loop.\n");
1411 if (!expr_invariant_in_loop_p (loop, inner.number_of_iterations))
1412 return opt_result::failure_at (vect_location,
1413 "not vectorized: inner-loop count not"
1414 " invariant.\n");
1416 if (dump_enabled_p ())
1417 dump_printf_loc (MSG_NOTE, vect_location,
1418 "Considering outer-loop vectorization.\n");
1419 info->inner_loop_cond = inner.loop_cond;
1422 if (!single_exit (loop))
1423 return opt_result::failure_at (vect_location,
1424 "not vectorized: multiple exits.\n");
1425 if (EDGE_COUNT (loop->header->preds) != 2)
1426 return opt_result::failure_at (vect_location,
1427 "not vectorized:"
1428 " too many incoming edges.\n");
1430 /* We assume that the loop exit condition is at the end of the loop, i.e.,
1431 that the loop is represented as a do-while (with a proper if-guard
1432 before the loop if needed), where the loop header contains all the
1433 executable statements, and the latch is empty. */
1434 if (!empty_block_p (loop->latch)
1435 || !gimple_seq_empty_p (phi_nodes (loop->latch)))
1436 return opt_result::failure_at (vect_location,
1437 "not vectorized: latch block not empty.\n");
1439 /* Make sure the exit is not abnormal. */
1440 edge e = single_exit (loop);
1441 if (e->flags & EDGE_ABNORMAL)
1442 return opt_result::failure_at (vect_location,
1443 "not vectorized:"
1444 " abnormal loop exit edge.\n");
1446 info->loop_cond
1447 = vect_get_loop_niters (loop, &info->assumptions,
1448 &info->number_of_iterations,
1449 &info->number_of_iterationsm1);
1450 if (!info->loop_cond)
1451 return opt_result::failure_at
1452 (vect_location,
1453 "not vectorized: complicated exit condition.\n");
1455 if (integer_zerop (info->assumptions)
1456 || !info->number_of_iterations
1457 || chrec_contains_undetermined (info->number_of_iterations))
1458 return opt_result::failure_at
1459 (info->loop_cond,
1460 "not vectorized: number of iterations cannot be computed.\n");
1462 if (integer_zerop (info->number_of_iterations))
1463 return opt_result::failure_at
1464 (info->loop_cond,
1465 "not vectorized: number of iterations = 0.\n");
1467 return opt_result::success ();
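/* Illustrative examples (not part of this file) of the form checks above,
   for the inner-most case:

     for (i = 0; i < n; i++)              straight-line body, single exit:
       a[i] = b[i];                       accepted (2 BBs: header + latch)

     for (i = 0; i < n; i++)              extra control flow in the body:
       if (c[i]) a[i] = 0;                rejected here; it can only be
                                          vectorized after if-conversion
                                          flattens the branch

     for (i = 0; i < n; i++)              a second way out of the loop:
       if (a[i]) break;                   rejected ("multiple exits")  */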
1470 /* Create a loop_vec_info for LOOP with SHARED and the
1471 vect_analyze_loop_form result. */
1473 loop_vec_info
1474 vect_create_loop_vinfo (class loop *loop, vec_info_shared *shared,
1475 const vect_loop_form_info *info)
1477 loop_vec_info loop_vinfo = new _loop_vec_info (loop, shared);
1478 LOOP_VINFO_NITERSM1 (loop_vinfo) = info->number_of_iterationsm1;
1479 LOOP_VINFO_NITERS (loop_vinfo) = info->number_of_iterations;
1480 LOOP_VINFO_NITERS_UNCHANGED (loop_vinfo) = info->number_of_iterations;
1481 if (!integer_onep (info->assumptions))
1483 /* We consider vectorizing this loop by versioning it under
1484 some assumptions. In order to do this, we need to clear
1485 existing information computed by scev and niter analyzer. */
1486 scev_reset_htab ();
1487 free_numbers_of_iterations_estimates (loop);
1488 /* Also set flag for this loop so that following scev and niter
1489 analysis are done under the assumptions. */
1490 loop_constraint_set (loop, LOOP_C_FINITE);
1491 /* Also record the assumptions for versioning. */
1492 LOOP_VINFO_NITERS_ASSUMPTIONS (loop_vinfo) = info->assumptions;
1495 if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
1497 if (dump_enabled_p ())
1499 dump_printf_loc (MSG_NOTE, vect_location,
1500 "Symbolic number of iterations is ");
1501 dump_generic_expr (MSG_NOTE, TDF_DETAILS, info->number_of_iterations);
1502 dump_printf (MSG_NOTE, "\n");
1506 stmt_vec_info loop_cond_info = loop_vinfo->lookup_stmt (info->loop_cond);
1507 STMT_VINFO_TYPE (loop_cond_info) = loop_exit_ctrl_vec_info_type;
1508 if (info->inner_loop_cond)
1510 stmt_vec_info inner_loop_cond_info
1511 = loop_vinfo->lookup_stmt (info->inner_loop_cond);
1512 STMT_VINFO_TYPE (inner_loop_cond_info) = loop_exit_ctrl_vec_info_type;
1513 /* If we have an estimate on the number of iterations of the inner
1514 loop, use that to limit the scale for costing, otherwise use
1515 --param vect-inner-loop-cost-factor literally. */
1516 widest_int nit;
1517 if (estimated_stmt_executions (loop->inner, &nit))
1518 LOOP_VINFO_INNER_LOOP_COST_FACTOR (loop_vinfo)
1519 = wi::smin (nit, param_vect_inner_loop_cost_factor).to_uhwi ();
1522 return loop_vinfo;
1527 /* Scan the loop stmts and, depending on whether there are any (non-)SLP
1528 statements, update the vectorization factor. */
1530 static void
1531 vect_update_vf_for_slp (loop_vec_info loop_vinfo)
1533 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1534 basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
1535 int nbbs = loop->num_nodes;
1536 poly_uint64 vectorization_factor;
1537 int i;
1539 DUMP_VECT_SCOPE ("vect_update_vf_for_slp");
1541 vectorization_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
1542 gcc_assert (known_ne (vectorization_factor, 0U));
1544 /* If all the stmts in the loop can be SLPed, we perform only SLP, and
1545 vectorization factor of the loop is the unrolling factor required by
1546 the SLP instances. If that unrolling factor is 1, we say that we
1547 perform pure SLP on the loop - cross-iteration parallelism is not
1548 exploited.
1549 bool only_slp_in_loop = true;
1550 for (i = 0; i < nbbs; i++)
1552 basic_block bb = bbs[i];
1553 for (gphi_iterator si = gsi_start_phis (bb); !gsi_end_p (si);
1554 gsi_next (&si))
1556 stmt_vec_info stmt_info = loop_vinfo->lookup_stmt (si.phi ());
1557 if (!stmt_info)
1558 continue;
1559 if ((STMT_VINFO_RELEVANT_P (stmt_info)
1560 || VECTORIZABLE_CYCLE_DEF (STMT_VINFO_DEF_TYPE (stmt_info)))
1561 && !PURE_SLP_STMT (stmt_info))
1562 /* STMT needs both SLP and loop-based vectorization. */
1563 only_slp_in_loop = false;
1565 for (gimple_stmt_iterator si = gsi_start_bb (bb); !gsi_end_p (si);
1566 gsi_next (&si))
1568 if (is_gimple_debug (gsi_stmt (si)))
1569 continue;
1570 stmt_vec_info stmt_info = loop_vinfo->lookup_stmt (gsi_stmt (si));
1571 stmt_info = vect_stmt_to_vectorize (stmt_info);
1572 if ((STMT_VINFO_RELEVANT_P (stmt_info)
1573 || VECTORIZABLE_CYCLE_DEF (STMT_VINFO_DEF_TYPE (stmt_info)))
1574 && !PURE_SLP_STMT (stmt_info))
1575 /* STMT needs both SLP and loop-based vectorization. */
1576 only_slp_in_loop = false;
1580 if (only_slp_in_loop)
1582 if (dump_enabled_p ())
1583 dump_printf_loc (MSG_NOTE, vect_location,
1584 "Loop contains only SLP stmts\n");
1585 vectorization_factor = LOOP_VINFO_SLP_UNROLLING_FACTOR (loop_vinfo);
1587 else
1589 if (dump_enabled_p ())
1590 dump_printf_loc (MSG_NOTE, vect_location,
1591 "Loop contains SLP and non-SLP stmts\n");
1592 /* Both the vectorization factor and unroll factor have the form
1593 GET_MODE_SIZE (loop_vinfo->vector_mode) * X for some rational X,
1594 so they must have a common multiple. */
1595 vectorization_factor
1596 = force_common_multiple (vectorization_factor,
1597 LOOP_VINFO_SLP_UNROLLING_FACTOR (loop_vinfo));
1600 LOOP_VINFO_VECT_FACTOR (loop_vinfo) = vectorization_factor;
1601 if (dump_enabled_p ())
1603 dump_printf_loc (MSG_NOTE, vect_location,
1604 "Updating vectorization factor to ");
1605 dump_dec (MSG_NOTE, vectorization_factor);
1606 dump_printf (MSG_NOTE, ".\n");
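/* Worked example (illustrative, not part of this file): if loop-based
   analysis chose VF = 4 but an SLP instance requires an unrolling factor
   of 8, the common multiple computed above raises the VF to 8; with an
   SLP unrolling factor of 2 the VF would stay at 4.  In the pure-SLP case
   the VF simply becomes the SLP unrolling factor.  */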
1610 /* Return true if STMT_INFO describes a double reduction phi and if
1611 the other phi in the reduction is also relevant for vectorization.
1612 This rejects cases such as:
1614 outer1:
1615 x_1 = PHI <x_3(outer2), ...>;
1618 inner:
1619 x_2 = ...;
1622 outer2:
1623 x_3 = PHI <x_2(inner)>;
1625 if nothing in x_2 or elsewhere makes x_1 relevant. */
1627 static bool
1628 vect_active_double_reduction_p (stmt_vec_info stmt_info)
1630 if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_double_reduction_def)
1631 return false;
1633 return STMT_VINFO_RELEVANT_P (STMT_VINFO_REDUC_DEF (stmt_info));
1636 /* Function vect_analyze_loop_operations.
1638 Scan the loop stmts and make sure they are all vectorizable. */
1640 static opt_result
1641 vect_analyze_loop_operations (loop_vec_info loop_vinfo)
1643 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1644 basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
1645 int nbbs = loop->num_nodes;
1646 int i;
1647 stmt_vec_info stmt_info;
1648 bool need_to_vectorize = false;
1649 bool ok;
1651 DUMP_VECT_SCOPE ("vect_analyze_loop_operations");
1653 auto_vec<stmt_info_for_cost> cost_vec;
1655 for (i = 0; i < nbbs; i++)
1657 basic_block bb = bbs[i];
1659 for (gphi_iterator si = gsi_start_phis (bb); !gsi_end_p (si);
1660 gsi_next (&si))
1662 gphi *phi = si.phi ();
1663 ok = true;
1665 stmt_info = loop_vinfo->lookup_stmt (phi);
1666 if (dump_enabled_p ())
1667 dump_printf_loc (MSG_NOTE, vect_location, "examining phi: %G", phi);
1668 if (virtual_operand_p (gimple_phi_result (phi)))
1669 continue;
1671 /* Inner-loop loop-closed exit phi in outer-loop vectorization
1672 (i.e., a phi in the tail of the outer-loop). */
1673 if (! is_loop_header_bb_p (bb))
1675 /* FORNOW: we currently don't support the case that these phis
1676 are not used in the outerloop (unless it is double reduction,
1677 i.e., this phi is vect_reduction_def), because this case
1678 requires actually doing something here. */
1679 if (STMT_VINFO_LIVE_P (stmt_info)
1680 && !vect_active_double_reduction_p (stmt_info))
1681 return opt_result::failure_at (phi,
1682 "Unsupported loop-closed phi"
1683 " in outer-loop.\n");
1685 /* If PHI is used in the outer loop, we check that its operand
1686 is defined in the inner loop. */
1687 if (STMT_VINFO_RELEVANT_P (stmt_info))
1689 tree phi_op;
1691 if (gimple_phi_num_args (phi) != 1)
1692 return opt_result::failure_at (phi, "unsupported phi");
1694 phi_op = PHI_ARG_DEF (phi, 0);
1695 stmt_vec_info op_def_info = loop_vinfo->lookup_def (phi_op);
1696 if (!op_def_info)
1697 return opt_result::failure_at (phi, "unsupported phi\n");
1699 if (STMT_VINFO_RELEVANT (op_def_info) != vect_used_in_outer
1700 && (STMT_VINFO_RELEVANT (op_def_info)
1701 != vect_used_in_outer_by_reduction))
1702 return opt_result::failure_at (phi, "unsupported phi\n");
1704 if ((STMT_VINFO_DEF_TYPE (stmt_info) == vect_internal_def
1705 || (STMT_VINFO_DEF_TYPE (stmt_info)
1706 == vect_double_reduction_def))
1707 && !vectorizable_lc_phi (loop_vinfo,
1708 stmt_info, NULL, NULL))
1709 return opt_result::failure_at (phi, "unsupported phi\n");
1712 continue;
1715 gcc_assert (stmt_info);
1717 if ((STMT_VINFO_RELEVANT (stmt_info) == vect_used_in_scope
1718 || STMT_VINFO_LIVE_P (stmt_info))
1719 && STMT_VINFO_DEF_TYPE (stmt_info) != vect_induction_def)
1720 /* A scalar-dependence cycle that we don't support. */
1721 return opt_result::failure_at (phi,
1722 "not vectorized:"
1723 " scalar dependence cycle.\n");
1725 if (STMT_VINFO_RELEVANT_P (stmt_info))
1727 need_to_vectorize = true;
1728 if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_induction_def
1729 && ! PURE_SLP_STMT (stmt_info))
1730 ok = vectorizable_induction (loop_vinfo,
1731 stmt_info, NULL, NULL,
1732 &cost_vec);
1733 else if ((STMT_VINFO_DEF_TYPE (stmt_info) == vect_reduction_def
1734 || (STMT_VINFO_DEF_TYPE (stmt_info)
1735 == vect_double_reduction_def)
1736 || STMT_VINFO_DEF_TYPE (stmt_info) == vect_nested_cycle)
1737 && ! PURE_SLP_STMT (stmt_info))
1738 ok = vectorizable_reduction (loop_vinfo,
1739 stmt_info, NULL, NULL, &cost_vec);
1742 /* SLP PHIs are tested by vect_slp_analyze_node_operations. */
1743 if (ok
1744 && STMT_VINFO_LIVE_P (stmt_info)
1745 && !PURE_SLP_STMT (stmt_info))
1746 ok = vectorizable_live_operation (loop_vinfo,
1747 stmt_info, NULL, NULL, NULL,
1748 -1, false, &cost_vec);
1750 if (!ok)
1751 return opt_result::failure_at (phi,
1752 "not vectorized: relevant phi not "
1753 "supported: %G",
1754 static_cast <gimple *> (phi));
1757 for (gimple_stmt_iterator si = gsi_start_bb (bb); !gsi_end_p (si);
1758 gsi_next (&si))
1760 gimple *stmt = gsi_stmt (si);
1761 if (!gimple_clobber_p (stmt)
1762 && !is_gimple_debug (stmt))
1764 opt_result res
1765 = vect_analyze_stmt (loop_vinfo,
1766 loop_vinfo->lookup_stmt (stmt),
1767 &need_to_vectorize,
1768 NULL, NULL, &cost_vec);
1769 if (!res)
1770 return res;
1773 } /* bbs */
1775 add_stmt_costs (loop_vinfo->target_cost_data, &cost_vec);
1777 /* All operations in the loop are either irrelevant (they deal with loop
1778 control, or are dead), or only used outside the loop and can be moved
1779 out of the loop (e.g. invariants, inductions). The loop can be
1780 optimized away by scalar optimizations. We're better off not
1781 touching this loop. */
1782 if (!need_to_vectorize)
1784 if (dump_enabled_p ())
1785 dump_printf_loc (MSG_NOTE, vect_location,
1786 "All the computation can be taken out of the loop.\n");
1787 return opt_result::failure_at
1788 (vect_location,
1789 "not vectorized: redundant loop. no profit to vectorize.\n");
1792 return opt_result::success ();
1795 /* Return true if we know that the iteration count is smaller than the
1796 vectorization factor. Return false if it isn't, or if we can't be sure
1797 either way. */
1799 static bool
1800 vect_known_niters_smaller_than_vf (loop_vec_info loop_vinfo)
1802 unsigned int assumed_vf = vect_vf_for_cost (loop_vinfo);
1804 HOST_WIDE_INT max_niter;
1805 if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
1806 max_niter = LOOP_VINFO_INT_NITERS (loop_vinfo);
1807 else
1808 max_niter = max_stmt_executions_int (LOOP_VINFO_LOOP (loop_vinfo));
1810 if (max_niter != -1 && (unsigned HOST_WIDE_INT) max_niter < assumed_vf)
1811 return true;
1813 return false;
1816 /* Analyze the cost of the loop described by LOOP_VINFO. Decide if it
1817 is worthwhile to vectorize. Return 1 if definitely yes, 0 if
1818 definitely no, or -1 if it's worth retrying. */
1820 static int
1821 vect_analyze_loop_costing (loop_vec_info loop_vinfo)
1823 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1824 unsigned int assumed_vf = vect_vf_for_cost (loop_vinfo);
1826 /* Only loops that can handle partially-populated vectors can have iteration
1827 counts less than the vectorization factor. */
1828 if (!LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))
1830 if (vect_known_niters_smaller_than_vf (loop_vinfo))
1832 if (dump_enabled_p ())
1833 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1834 "not vectorized: iteration count smaller than "
1835 "vectorization factor.\n");
1836 return 0;
1840 /* If using the "very cheap" model, reject cases in which we'd keep
1841 a copy of the scalar code (even if we might be able to vectorize it). */
1842 if (loop_cost_model (loop) == VECT_COST_MODEL_VERY_CHEAP
1843 && (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo)
1844 || LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo)
1845 || LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo)))
1847 if (dump_enabled_p ())
1848 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1849 "some scalar iterations would need to be peeled\n");
1850 return 0;
1853 int min_profitable_iters, min_profitable_estimate;
1854 vect_estimate_min_profitable_iters (loop_vinfo, &min_profitable_iters,
1855 &min_profitable_estimate);
1857 if (min_profitable_iters < 0)
1859 if (dump_enabled_p ())
1860 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1861 "not vectorized: vectorization not profitable.\n");
1862 if (dump_enabled_p ())
1863 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1864 "not vectorized: vector version will never be "
1865 "profitable.\n");
1866 return -1;
1869 int min_scalar_loop_bound = (param_min_vect_loop_bound
1870 * assumed_vf);
1872 /* Use the cost model only if it is more conservative than the
1873 user-specified threshold. */
1874 unsigned int th = (unsigned) MAX (min_scalar_loop_bound,
1875 min_profitable_iters);
1877 LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo) = th;
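/* For illustration only (made-up numbers): with param_min_vect_loop_bound == 0,
   assumed_vf == 4 and min_profitable_iters == 10, min_scalar_loop_bound is
   0 * 4 == 0 and th becomes MAX (0, 10) == 10, so a loop whose iteration
   count is known and smaller than 10 is rejected just below.  */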
1879 if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
1880 && LOOP_VINFO_INT_NITERS (loop_vinfo) < th)
1882 if (dump_enabled_p ())
1883 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1884 "not vectorized: vectorization not profitable.\n");
1885 if (dump_enabled_p ())
1886 dump_printf_loc (MSG_NOTE, vect_location,
1887 "not vectorized: iteration count smaller than user "
1888 "specified loop bound parameter or minimum profitable "
1889 "iterations (whichever is more conservative).\n");
1890 return 0;
1893 /* The static profitability threshold min_profitable_estimate includes
1894 the cost of having to check at runtime whether the scalar loop
1895 should be used instead. If it turns out that we don't need or want
1896 such a check, the threshold we should use for the static estimate
1897 is simply the point at which the vector loop becomes more profitable
1898 than the scalar loop. */
1899 if (min_profitable_estimate > min_profitable_iters
1900 && !LOOP_REQUIRES_VERSIONING (loop_vinfo)
1901 && !LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo)
1902 && !LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo)
1903 && !vect_apply_runtime_profitability_check_p (loop_vinfo))
1905 if (dump_enabled_p ())
1906 dump_printf_loc (MSG_NOTE, vect_location, "no need for a runtime"
1907 " choice between the scalar and vector loops\n");
1908 min_profitable_estimate = min_profitable_iters;
1911 /* If the vector loop needs multiple iterations to be beneficial then
1912 things are probably too close to call, and the conservative thing
1913 would be to stick with the scalar code. */
1914 if (loop_cost_model (loop) == VECT_COST_MODEL_VERY_CHEAP
1915 && min_profitable_estimate > (int) vect_vf_for_cost (loop_vinfo))
1917 if (dump_enabled_p ())
1918 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1919 "one iteration of the vector loop would be"
1920 " more expensive than the equivalent number of"
1921 " iterations of the scalar loop\n");
1922 return 0;
1925 HOST_WIDE_INT estimated_niter;
1927 /* If we are vectorizing an epilogue then we know the maximum number of
1928 scalar iterations it will cover is at least one lower than the
1929 vectorization factor of the main loop. */
1930 if (LOOP_VINFO_EPILOGUE_P (loop_vinfo))
1931 estimated_niter
1932 = vect_vf_for_cost (LOOP_VINFO_ORIG_LOOP_INFO (loop_vinfo)) - 1;
1933 else
1935 estimated_niter = estimated_stmt_executions_int (loop);
1936 if (estimated_niter == -1)
1937 estimated_niter = likely_max_stmt_executions_int (loop);
1939 if (estimated_niter != -1
1940 && ((unsigned HOST_WIDE_INT) estimated_niter
1941 < MAX (th, (unsigned) min_profitable_estimate)))
1943 if (dump_enabled_p ())
1944 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1945 "not vectorized: estimated iteration count too "
1946 "small.\n");
1947 if (dump_enabled_p ())
1948 dump_printf_loc (MSG_NOTE, vect_location,
1949 "not vectorized: estimated iteration count smaller "
1950 "than specified loop bound parameter or minimum "
1951 "profitable iterations (whichever is more "
1952 "conservative).\n");
1953 return -1;
1956 return 1;
1959 static opt_result
1960 vect_get_datarefs_in_loop (loop_p loop, basic_block *bbs,
1961 vec<data_reference_p> *datarefs,
1962 unsigned int *n_stmts)
1964 *n_stmts = 0;
1965 for (unsigned i = 0; i < loop->num_nodes; i++)
1966 for (gimple_stmt_iterator gsi = gsi_start_bb (bbs[i]);
1967 !gsi_end_p (gsi); gsi_next (&gsi))
1969 gimple *stmt = gsi_stmt (gsi);
1970 if (is_gimple_debug (stmt))
1971 continue;
1972 ++(*n_stmts);
1973 opt_result res = vect_find_stmt_data_reference (loop, stmt, datarefs,
1974 NULL, 0);
1975 if (!res)
1977 if (is_gimple_call (stmt) && loop->safelen)
1979 tree fndecl = gimple_call_fndecl (stmt), op;
1980 if (fndecl != NULL_TREE)
1982 cgraph_node *node = cgraph_node::get (fndecl);
1983 if (node != NULL && node->simd_clones != NULL)
1985 unsigned int j, n = gimple_call_num_args (stmt);
1986 for (j = 0; j < n; j++)
1988 op = gimple_call_arg (stmt, j);
1989 if (DECL_P (op)
1990 || (REFERENCE_CLASS_P (op)
1991 && get_base_address (op)))
1992 break;
1994 op = gimple_call_lhs (stmt);
1995 /* Ignore #pragma omp declare simd functions
1996 if they don't have data references in the
1997 call stmt itself. */
1998 if (j == n
1999 && !(op
2000 && (DECL_P (op)
2001 || (REFERENCE_CLASS_P (op)
2002 && get_base_address (op)))))
2003 continue;
2007 return res;
2009 /* If dependence analysis will give up due to the limit on the
2010 number of datarefs, stop here and fail fatally. */
2011 if (datarefs->length ()
2012 > (unsigned)param_loop_max_datarefs_for_datadeps)
2013 return opt_result::failure_at (stmt, "exceeded param "
2014 "loop-max-datarefs-for-datadeps\n");
2016 return opt_result::success ();
2019 /* Look for SLP-only access groups and turn each individual access into its own
2020 group. */
2021 static void
2022 vect_dissolve_slp_only_groups (loop_vec_info loop_vinfo)
2024 unsigned int i;
2025 struct data_reference *dr;
2027 DUMP_VECT_SCOPE ("vect_dissolve_slp_only_groups");
2029 vec<data_reference_p> datarefs = LOOP_VINFO_DATAREFS (loop_vinfo);
2030 FOR_EACH_VEC_ELT (datarefs, i, dr)
2032 gcc_assert (DR_REF (dr));
2033 stmt_vec_info stmt_info = loop_vinfo->lookup_stmt (DR_STMT (dr));
2035 /* Check if the load is a part of an interleaving chain. */
2036 if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
2038 stmt_vec_info first_element = DR_GROUP_FIRST_ELEMENT (stmt_info);
2039 dr_vec_info *dr_info = STMT_VINFO_DR_INFO (first_element);
2040 unsigned int group_size = DR_GROUP_SIZE (first_element);
2042 /* Check for SLP-only groups. */
2043 if (!STMT_SLP_TYPE (stmt_info)
2044 && STMT_VINFO_SLP_VECT_ONLY (first_element))
2046 /* Dissolve the group. */
2047 STMT_VINFO_SLP_VECT_ONLY (first_element) = false;
2049 stmt_vec_info vinfo = first_element;
2050 while (vinfo)
2052 stmt_vec_info next = DR_GROUP_NEXT_ELEMENT (vinfo);
2053 DR_GROUP_FIRST_ELEMENT (vinfo) = vinfo;
2054 DR_GROUP_NEXT_ELEMENT (vinfo) = NULL;
2055 DR_GROUP_SIZE (vinfo) = 1;
2056 if (STMT_VINFO_STRIDED_P (first_element))
2057 DR_GROUP_GAP (vinfo) = 0;
2058 else
2059 DR_GROUP_GAP (vinfo) = group_size - 1;
2060 /* Duplicate and adjust alignment info; it needs to
2061 be present on each group leader, see dr_misalignment. */
2062 if (vinfo != first_element)
2064 dr_vec_info *dr_info2 = STMT_VINFO_DR_INFO (vinfo);
2065 dr_info2->target_alignment = dr_info->target_alignment;
2066 int misalignment = dr_info->misalignment;
2067 if (misalignment != DR_MISALIGNMENT_UNKNOWN)
2069 HOST_WIDE_INT diff
2070 = (TREE_INT_CST_LOW (DR_INIT (dr_info2->dr))
2071 - TREE_INT_CST_LOW (DR_INIT (dr_info->dr)));
2072 unsigned HOST_WIDE_INT align_c
2073 = dr_info->target_alignment.to_constant ();
2074 misalignment = (misalignment + diff) % align_c;
2076 dr_info2->misalignment = misalignment;
2078 vinfo = next;
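/* A worked example of the misalignment adjustment above (illustrative
   numbers only): if the group leader is misaligned by 8 bytes against a
   16-byte target alignment and the dissolved member starts 4 bytes later
   (diff == 4), its misalignment becomes (8 + 4) % 16 == 12.  */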
2085 /* Determine if operating on full vectors for LOOP_VINFO might leave
2086 some scalar iterations still to do. If so, decide how we should
2087 handle those scalar iterations. The possibilities are:
2089 (1) Make LOOP_VINFO operate on partial vectors instead of full vectors.
2090 In this case:
2092 LOOP_VINFO_USING_PARTIAL_VECTORS_P == true
2093 LOOP_VINFO_EPIL_USING_PARTIAL_VECTORS_P == false
2094 LOOP_VINFO_PEELING_FOR_NITER == false
2096 (2) Make LOOP_VINFO operate on full vectors and use an epilogue loop
2097 to handle the remaining scalar iterations. In this case:
2099 LOOP_VINFO_USING_PARTIAL_VECTORS_P == false
2100 LOOP_VINFO_PEELING_FOR_NITER == true
2102 There are two choices:
2104 (2a) Consider vectorizing the epilogue loop at the same VF as the
2105 main loop, but using partial vectors instead of full vectors.
2106 In this case:
2108 LOOP_VINFO_EPIL_USING_PARTIAL_VECTORS_P == true
2110 (2b) Consider vectorizing the epilogue loop at lower VFs only.
2111 In this case:
2113 LOOP_VINFO_EPIL_USING_PARTIAL_VECTORS_P == false
2115 When FOR_EPILOGUE_P is true, make this determination based on the
2116 assumption that LOOP_VINFO is an epilogue loop, otherwise make it
2117 based on the assumption that LOOP_VINFO is the main loop. The caller
2118 has made sure that the number of iterations is set appropriately for
2119 this value of FOR_EPILOGUE_P. */
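/* A rough sketch of the decision implemented below (illustrative only;
   the real code also consults --param vect-partial-vector-usage and the
   epilogue status):

     if (LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo)
         && need_peeling_or_partial_vectors_p)
       {
         if (pushing partial vectors to the epilogue)
           LOOP_VINFO_EPIL_USING_PARTIAL_VECTORS_P (loop_vinfo) = true;  // (2a)
         else
           LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo) = true;       // (1)
       }
     LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo)
       = (!LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo)
          && need_peeling_or_partial_vectors_p);                         // (2)
*/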
2121 opt_result
2122 vect_determine_partial_vectors_and_peeling (loop_vec_info loop_vinfo,
2123 bool for_epilogue_p)
2125 /* Determine whether there would be any scalar iterations left over. */
2126 bool need_peeling_or_partial_vectors_p
2127 = vect_need_peeling_or_partial_vectors_p (loop_vinfo);
2129 /* Decide whether to vectorize the loop with partial vectors. */
2130 LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo) = false;
2131 LOOP_VINFO_EPIL_USING_PARTIAL_VECTORS_P (loop_vinfo) = false;
2132 if (LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo)
2133 && need_peeling_or_partial_vectors_p)
2135 /* For partial-vector-usage=1, try to push the handling of partial
2136 vectors to the epilogue, with the main loop continuing to operate
2137 on full vectors.
2139 ??? We could then end up failing to use partial vectors if we
2140 decide to peel iterations into a prologue, and if the main loop
2141 then ends up processing fewer than VF iterations. */
2142 if (param_vect_partial_vector_usage == 1
2143 && !LOOP_VINFO_EPILOGUE_P (loop_vinfo)
2144 && !vect_known_niters_smaller_than_vf (loop_vinfo))
2145 LOOP_VINFO_EPIL_USING_PARTIAL_VECTORS_P (loop_vinfo) = true;
2146 else
2147 LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo) = true;
2150 if (dump_enabled_p ())
2152 if (LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))
2153 dump_printf_loc (MSG_NOTE, vect_location,
2154 "operating on partial vectors%s.\n",
2155 for_epilogue_p ? " for epilogue loop" : "");
2156 else
2157 dump_printf_loc (MSG_NOTE, vect_location,
2158 "operating only on full vectors%s.\n",
2159 for_epilogue_p ? " for epilogue loop" : "");
2162 if (for_epilogue_p)
2164 loop_vec_info orig_loop_vinfo = LOOP_VINFO_ORIG_LOOP_INFO (loop_vinfo);
2165 gcc_assert (orig_loop_vinfo);
2166 if (!LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))
2167 gcc_assert (known_lt (LOOP_VINFO_VECT_FACTOR (loop_vinfo),
2168 LOOP_VINFO_VECT_FACTOR (orig_loop_vinfo)));
2171 if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
2172 && !LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))
2174 /* Check that the loop processes at least one full vector. */
2175 poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
2176 tree scalar_niters = LOOP_VINFO_NITERS (loop_vinfo);
2177 if (known_lt (wi::to_widest (scalar_niters), vf))
2178 return opt_result::failure_at (vect_location,
2179 "loop does not have enough iterations"
2180 " to support vectorization.\n");
2182 /* If we need to peel an extra epilogue iteration to handle data
2183 accesses with gaps, check that there are enough scalar iterations
2184 available.
2186 The check above is redundant with this one when peeling for gaps,
2187 but the distinction is useful for diagnostics. */
2188 tree scalar_nitersm1 = LOOP_VINFO_NITERSM1 (loop_vinfo);
2189 if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo)
2190 && known_lt (wi::to_widest (scalar_nitersm1), vf))
2191 return opt_result::failure_at (vect_location,
2192 "loop does not have enough iterations"
2193 " to support peeling for gaps.\n");
2196 LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo)
2197 = (!LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo)
2198 && need_peeling_or_partial_vectors_p);
2200 return opt_result::success ();
2203 /* Function vect_analyze_loop_2.
2205 Apply a set of analyses on LOOP, and create a loop_vec_info struct
2206 for it. The different analyses will record information in the
2207 loop_vec_info struct. */
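/* In outline (for orientation only), the analysis below proceeds as:
   gather data references -> analyze data refs -> classify scalar cycles and
   recognize patterns -> analyze access patterns -> mark relevant stmts ->
   analyze dependences -> determine the vectorization factor -> SLP analysis ->
   alignment analysis and enhancement -> prune runtime alias checks ->
   analyze the remaining loop operations -> decide on partial vectors and
   peeling -> cost the loop.  On some failures it rolls back and retries
   with SLP disabled (the "again" label).  */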
2208 static opt_result
2209 vect_analyze_loop_2 (loop_vec_info loop_vinfo, bool &fatal)
2211 opt_result ok = opt_result::success ();
2212 int res;
2213 unsigned int max_vf = MAX_VECTORIZATION_FACTOR;
2214 poly_uint64 min_vf = 2;
2215 loop_vec_info orig_loop_vinfo = NULL;
2217 /* If we are dealing with an epilogue then orig_loop_vinfo points to the
2218 loop_vec_info of the first vectorized loop. */
2219 if (LOOP_VINFO_EPILOGUE_P (loop_vinfo))
2220 orig_loop_vinfo = LOOP_VINFO_ORIG_LOOP_INFO (loop_vinfo);
2221 else
2222 orig_loop_vinfo = loop_vinfo;
2223 gcc_assert (orig_loop_vinfo);
2225 /* The first group of checks is independent of the vector size. */
2226 fatal = true;
2228 if (LOOP_VINFO_SIMD_IF_COND (loop_vinfo)
2229 && integer_zerop (LOOP_VINFO_SIMD_IF_COND (loop_vinfo)))
2230 return opt_result::failure_at (vect_location,
2231 "not vectorized: simd if(0)\n");
2233 /* Find all data references in the loop (which correspond to vdefs/vuses)
2234 and analyze their evolution in the loop. */
2236 loop_p loop = LOOP_VINFO_LOOP (loop_vinfo);
2238 /* Gather the data references and count stmts in the loop. */
2239 if (!LOOP_VINFO_DATAREFS (loop_vinfo).exists ())
2241 opt_result res
2242 = vect_get_datarefs_in_loop (loop, LOOP_VINFO_BBS (loop_vinfo),
2243 &LOOP_VINFO_DATAREFS (loop_vinfo),
2244 &LOOP_VINFO_N_STMTS (loop_vinfo));
2245 if (!res)
2247 if (dump_enabled_p ())
2248 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2249 "not vectorized: loop contains function "
2250 "calls or data references that cannot "
2251 "be analyzed\n");
2252 return res;
2254 loop_vinfo->shared->save_datarefs ();
2256 else
2257 loop_vinfo->shared->check_datarefs ();
2259 /* Analyze the data references and also adjust the minimal
2260 vectorization factor according to the loads and stores. */
2262 ok = vect_analyze_data_refs (loop_vinfo, &min_vf, &fatal);
2263 if (!ok)
2265 if (dump_enabled_p ())
2266 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2267 "bad data references.\n");
2268 return ok;
2271 /* Classify all cross-iteration scalar data-flow cycles.
2272 Cross-iteration cycles caused by virtual phis are analyzed separately. */
2273 vect_analyze_scalar_cycles (loop_vinfo);
2275 vect_pattern_recog (loop_vinfo);
2277 vect_fixup_scalar_cycles_with_patterns (loop_vinfo);
2279 /* Analyze the access patterns of the data-refs in the loop (consecutive,
2280 complex, etc.). FORNOW: Only handle consecutive access pattern. */
2282 ok = vect_analyze_data_ref_accesses (loop_vinfo, NULL);
2283 if (!ok)
2285 if (dump_enabled_p ())
2286 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2287 "bad data access.\n");
2288 return ok;
2291 /* Data-flow analysis to detect stmts that do not need to be vectorized. */
2293 ok = vect_mark_stmts_to_be_vectorized (loop_vinfo, &fatal);
2294 if (!ok)
2296 if (dump_enabled_p ())
2297 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2298 "unexpected pattern.\n");
2299 return ok;
2302 /* While the rest of the analysis below depends on the vector size in some way. */
2303 fatal = false;
2305 /* Analyze data dependences between the data-refs in the loop
2306 and adjust the maximum vectorization factor according to
2307 the dependences.
2308 FORNOW: fail at the first data dependence that we encounter. */
2310 ok = vect_analyze_data_ref_dependences (loop_vinfo, &max_vf);
2311 if (!ok)
2313 if (dump_enabled_p ())
2314 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2315 "bad data dependence.\n");
2316 return ok;
2318 if (max_vf != MAX_VECTORIZATION_FACTOR
2319 && maybe_lt (max_vf, min_vf))
2320 return opt_result::failure_at (vect_location, "bad data dependence.\n");
2321 LOOP_VINFO_MAX_VECT_FACTOR (loop_vinfo) = max_vf;
2323 ok = vect_determine_vectorization_factor (loop_vinfo);
2324 if (!ok)
2326 if (dump_enabled_p ())
2327 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2328 "can't determine vectorization factor.\n");
2329 return ok;
2331 if (max_vf != MAX_VECTORIZATION_FACTOR
2332 && maybe_lt (max_vf, LOOP_VINFO_VECT_FACTOR (loop_vinfo)))
2333 return opt_result::failure_at (vect_location, "bad data dependence.\n");
2335 /* Compute the scalar iteration cost. */
2336 vect_compute_single_scalar_iteration_cost (loop_vinfo);
2338 poly_uint64 saved_vectorization_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
2340 /* Check the SLP opportunities in the loop, analyze and build SLP trees. */
2341 ok = vect_analyze_slp (loop_vinfo, LOOP_VINFO_N_STMTS (loop_vinfo));
2342 if (!ok)
2343 return ok;
2345 /* If there are any SLP instances mark them as pure_slp. */
2346 bool slp = vect_make_slp_decision (loop_vinfo);
2347 if (slp)
2349 /* Find stmts that need to be both vectorized and SLPed. */
2350 vect_detect_hybrid_slp (loop_vinfo);
2352 /* Update the vectorization factor based on the SLP decision. */
2353 vect_update_vf_for_slp (loop_vinfo);
2355 /* Optimize the SLP graph with the vectorization factor fixed. */
2356 vect_optimize_slp (loop_vinfo);
2358 /* Gather the loads reachable from the SLP graph entries. */
2359 vect_gather_slp_loads (loop_vinfo);
2362 bool saved_can_use_partial_vectors_p
2363 = LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo);
2365 /* We don't expect to have to roll back to anything other than an empty
2366 set of rgroups. */
2367 gcc_assert (LOOP_VINFO_MASKS (loop_vinfo).is_empty ());
2369 /* This is the point where we can re-start analysis with SLP forced off. */
2370 start_over:
2372 /* Now the vectorization factor is final. */
2373 poly_uint64 vectorization_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
2374 gcc_assert (known_ne (vectorization_factor, 0U));
2376 if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo) && dump_enabled_p ())
2378 dump_printf_loc (MSG_NOTE, vect_location,
2379 "vectorization_factor = ");
2380 dump_dec (MSG_NOTE, vectorization_factor);
2381 dump_printf (MSG_NOTE, ", niters = %wd\n",
2382 LOOP_VINFO_INT_NITERS (loop_vinfo));
2385 LOOP_VINFO_TARGET_COST_DATA (loop_vinfo) = init_cost (loop_vinfo, false);
2387 /* Analyze the alignment of the data-refs in the loop.
2388 Fail if a data reference is found that cannot be vectorized. */
2390 ok = vect_analyze_data_refs_alignment (loop_vinfo);
2391 if (!ok)
2393 if (dump_enabled_p ())
2394 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2395 "bad data alignment.\n");
2396 return ok;
2399 /* Prune the list of ddrs to be tested at run-time by versioning for alias.
2400 It is important to call pruning after vect_analyze_data_ref_accesses,
2401 since we use grouping information gathered by interleaving analysis. */
2402 ok = vect_prune_runtime_alias_test_list (loop_vinfo);
2403 if (!ok)
2404 return ok;
2406 /* Do not invoke vect_enhance_data_refs_alignment for epilogue
2407 vectorization, since we do not want to add extra peeling or
2408 add versioning for alignment. */
2409 if (!LOOP_VINFO_EPILOGUE_P (loop_vinfo))
2410 /* This pass will decide on using loop versioning and/or loop peeling in
2411 order to enhance the alignment of data references in the loop. */
2412 ok = vect_enhance_data_refs_alignment (loop_vinfo);
2413 if (!ok)
2414 return ok;
2416 if (slp)
2418 /* Analyze operations in the SLP instances. Note this may
2419 remove unsupported SLP instances which makes the above
2420 SLP kind detection invalid. */
2421 unsigned old_size = LOOP_VINFO_SLP_INSTANCES (loop_vinfo).length ();
2422 vect_slp_analyze_operations (loop_vinfo);
2423 if (LOOP_VINFO_SLP_INSTANCES (loop_vinfo).length () != old_size)
2425 ok = opt_result::failure_at (vect_location,
2426 "unsupported SLP instances\n");
2427 goto again;
2430 /* Check whether any load in ALL SLP instances is possibly permuted. */
2431 slp_tree load_node, slp_root;
2432 unsigned i, x;
2433 slp_instance instance;
2434 bool can_use_lanes = true;
2435 FOR_EACH_VEC_ELT (LOOP_VINFO_SLP_INSTANCES (loop_vinfo), x, instance)
2437 slp_root = SLP_INSTANCE_TREE (instance);
2438 int group_size = SLP_TREE_LANES (slp_root);
2439 tree vectype = SLP_TREE_VECTYPE (slp_root);
2440 bool loads_permuted = false;
2441 FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (instance), i, load_node)
2443 if (!SLP_TREE_LOAD_PERMUTATION (load_node).exists ())
2444 continue;
2445 unsigned j;
2446 stmt_vec_info load_info;
2447 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (load_node), j, load_info)
2448 if (SLP_TREE_LOAD_PERMUTATION (load_node)[j] != j)
2450 loads_permuted = true;
2451 break;
2455 /* If the loads and stores can be handled with load/store-lane
2456 instructions record it and move on to the next instance. */
2457 if (loads_permuted
2458 && SLP_INSTANCE_KIND (instance) == slp_inst_kind_store
2459 && vect_store_lanes_supported (vectype, group_size, false))
2461 FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (instance), i, load_node)
2463 stmt_vec_info stmt_vinfo = DR_GROUP_FIRST_ELEMENT
2464 (SLP_TREE_SCALAR_STMTS (load_node)[0]);
2465 /* Use SLP for strided accesses (or if we can't use
2466 load-lanes). */
2467 if (STMT_VINFO_STRIDED_P (stmt_vinfo)
2468 || ! vect_load_lanes_supported
2469 (STMT_VINFO_VECTYPE (stmt_vinfo),
2470 DR_GROUP_SIZE (stmt_vinfo), false))
2471 break;
2474 can_use_lanes
2475 = can_use_lanes && i == SLP_INSTANCE_LOADS (instance).length ();
2477 if (can_use_lanes && dump_enabled_p ())
2478 dump_printf_loc (MSG_NOTE, vect_location,
2479 "SLP instance %p can use load/store-lanes\n",
2480 instance);
2482 else
2484 can_use_lanes = false;
2485 break;
2489 /* If all SLP instances can use load/store-lanes abort SLP and try again
2490 with SLP disabled. */
2491 if (can_use_lanes)
2493 ok = opt_result::failure_at (vect_location,
2494 "Built SLP cancelled: can use "
2495 "load/store-lanes\n");
2496 if (dump_enabled_p ())
2497 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2498 "Built SLP cancelled: all SLP instances support "
2499 "load/store-lanes\n");
2500 goto again;
2504 /* Dissolve SLP-only groups. */
2505 vect_dissolve_slp_only_groups (loop_vinfo);
2507 /* Scan all the remaining operations in the loop that are not subject
2508 to SLP and make sure they are vectorizable. */
2509 ok = vect_analyze_loop_operations (loop_vinfo);
2510 if (!ok)
2512 if (dump_enabled_p ())
2513 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2514 "bad operation or unsupported loop bound.\n");
2515 return ok;
2518 /* For now, we don't expect to mix both masking and length approaches for one
2519 loop; disable the use of partial vectors if both are recorded. */
2520 if (LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo)
2521 && !LOOP_VINFO_MASKS (loop_vinfo).is_empty ()
2522 && !LOOP_VINFO_LENS (loop_vinfo).is_empty ())
2524 if (dump_enabled_p ())
2525 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2526 "can't vectorize a loop with partial vectors"
2527 " because we don't expect to mix different"
2528 " approaches with partial vectors for the"
2529 " same loop.\n");
2530 LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo) = false;
2533 /* If we still have the option of using partial vectors,
2534 check whether we can generate the necessary loop controls. */
2535 if (LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo)
2536 && !vect_verify_full_masking (loop_vinfo)
2537 && !vect_verify_loop_lens (loop_vinfo))
2538 LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo) = false;
2540 /* If we're vectorizing an epilogue loop, the vectorized loop either needs
2541 to be able to handle fewer than VF scalars, or needs to have a lower VF
2542 than the main loop. */
2543 if (LOOP_VINFO_EPILOGUE_P (loop_vinfo)
2544 && !LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo)
2545 && maybe_ge (LOOP_VINFO_VECT_FACTOR (loop_vinfo),
2546 LOOP_VINFO_VECT_FACTOR (orig_loop_vinfo)))
2547 return opt_result::failure_at (vect_location,
2548 "Vectorization factor too high for"
2549 " epilogue loop.\n");
2551 /* Decide whether this loop_vinfo should use partial vectors or peeling,
2552 assuming that the loop will be used as a main loop. We will redo
2553 this analysis later if we instead decide to use the loop as an
2554 epilogue loop. */
2555 ok = vect_determine_partial_vectors_and_peeling (loop_vinfo, false);
2556 if (!ok)
2557 return ok;
2559 /* Check the costings of the loop make vectorizing worthwhile. */
2560 res = vect_analyze_loop_costing (loop_vinfo);
2561 if (res < 0)
2563 ok = opt_result::failure_at (vect_location,
2564 "Loop costings may not be worthwhile.\n");
2565 goto again;
2567 if (!res)
2568 return opt_result::failure_at (vect_location,
2569 "Loop costings not worthwhile.\n");
2571 /* If an epilogue loop is required make sure we can create one. */
2572 if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo)
2573 || LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo))
2575 if (dump_enabled_p ())
2576 dump_printf_loc (MSG_NOTE, vect_location, "epilog loop required\n");
2577 if (!vect_can_advance_ivs_p (loop_vinfo)
2578 || !slpeel_can_duplicate_loop_p (LOOP_VINFO_LOOP (loop_vinfo),
2579 single_exit (LOOP_VINFO_LOOP
2580 (loop_vinfo))))
2582 ok = opt_result::failure_at (vect_location,
2583 "not vectorized: can't create required "
2584 "epilog loop\n");
2585 goto again;
2589 /* During peeling, we need to check if the number of loop iterations is
2590 enough for both the peeled prolog loop and the vector loop. This check
2591 can be merged along with threshold check of loop versioning, so
2592 increase threshold for this case if necessary.
2594 If we are analyzing an epilogue we still want to check what its
2595 versioning threshold would be. If we decide to vectorize the epilogues we
2596 will want to use the lowest versioning threshold of all epilogues and main
2597 loop. This will enable us to enter a vectorized epilogue even when
2598 versioning the loop. We can't simply check whether the epilogue requires
2599 versioning though since we may have skipped some versioning checks when
2600 analyzing the epilogue. For instance, checks for alias versioning will be
2601 skipped when dealing with epilogues as we assume we already checked them
2602 for the main loop. So instead we always check the 'orig_loop_vinfo'. */
2603 if (LOOP_REQUIRES_VERSIONING (orig_loop_vinfo))
2605 poly_uint64 niters_th = 0;
2606 unsigned int th = LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo);
2608 if (!vect_use_loop_mask_for_alignment_p (loop_vinfo))
2610 /* Niters for peeled prolog loop. */
2611 if (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) < 0)
2613 dr_vec_info *dr_info = LOOP_VINFO_UNALIGNED_DR (loop_vinfo);
2614 tree vectype = STMT_VINFO_VECTYPE (dr_info->stmt);
2615 niters_th += TYPE_VECTOR_SUBPARTS (vectype) - 1;
2617 else
2618 niters_th += LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo);
2621 /* Niters for at least one iteration of vectorized loop. */
2622 if (!LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))
2623 niters_th += LOOP_VINFO_VECT_FACTOR (loop_vinfo);
2624 /* One additional iteration because of peeling for gaps. */
2625 if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo))
2626 niters_th += 1;
2628 /* Use the same condition as vect_transform_loop to decide when to use
2629 the cost to determine a versioning threshold. */
2630 if (vect_apply_runtime_profitability_check_p (loop_vinfo)
2631 && ordered_p (th, niters_th))
2632 niters_th = ordered_max (poly_uint64 (th), niters_th);
2634 LOOP_VINFO_VERSIONING_THRESHOLD (loop_vinfo) = niters_th;
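/* Worked example (illustrative numbers, assuming the runtime profitability
   check applies): with no peeling for alignment, a vectorization factor of 4,
   no peeling for gaps and a cost-model threshold th == 10, niters_th is
   0 + 4 and is then raised to MAX (10, 4) == 10 by the ordered_max call,
   so the versioned check requires at least 10 iterations at runtime.  */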
2637 gcc_assert (known_eq (vectorization_factor,
2638 LOOP_VINFO_VECT_FACTOR (loop_vinfo)));
2640 /* Ok to vectorize! */
2641 LOOP_VINFO_VECTORIZABLE_P (loop_vinfo) = 1;
2642 return opt_result::success ();
2644 again:
2645 /* Ensure that "ok" is false (with an opt_problem if dumping is enabled). */
2646 gcc_assert (!ok);
2648 /* Try again with SLP forced off but if we didn't do any SLP there is
2649 no point in re-trying. */
2650 if (!slp)
2651 return ok;
2653 /* If there are reduction chains re-trying will fail anyway. */
2654 if (! LOOP_VINFO_REDUCTION_CHAINS (loop_vinfo).is_empty ())
2655 return ok;
2657 /* Likewise if the grouped loads or stores in the SLP cannot be handled
2658 via interleaving or lane instructions. */
2659 slp_instance instance;
2660 slp_tree node;
2661 unsigned i, j;
2662 FOR_EACH_VEC_ELT (LOOP_VINFO_SLP_INSTANCES (loop_vinfo), i, instance)
2664 stmt_vec_info vinfo;
2665 vinfo = SLP_TREE_SCALAR_STMTS (SLP_INSTANCE_TREE (instance))[0];
2666 if (! STMT_VINFO_GROUPED_ACCESS (vinfo))
2667 continue;
2668 vinfo = DR_GROUP_FIRST_ELEMENT (vinfo);
2669 unsigned int size = DR_GROUP_SIZE (vinfo);
2670 tree vectype = STMT_VINFO_VECTYPE (vinfo);
2671 if (! vect_store_lanes_supported (vectype, size, false)
2672 && ! known_eq (TYPE_VECTOR_SUBPARTS (vectype), 1U)
2673 && ! vect_grouped_store_supported (vectype, size))
2674 return opt_result::failure_at (vinfo->stmt,
2675 "unsupported grouped store\n");
2676 FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (instance), j, node)
2678 vinfo = SLP_TREE_SCALAR_STMTS (node)[0];
2679 vinfo = DR_GROUP_FIRST_ELEMENT (vinfo);
2680 bool single_element_p = !DR_GROUP_NEXT_ELEMENT (vinfo);
2681 size = DR_GROUP_SIZE (vinfo);
2682 vectype = STMT_VINFO_VECTYPE (vinfo);
2683 if (! vect_load_lanes_supported (vectype, size, false)
2684 && ! vect_grouped_load_supported (vectype, single_element_p,
2685 size))
2686 return opt_result::failure_at (vinfo->stmt,
2687 "unsupported grouped load\n");
2691 if (dump_enabled_p ())
2692 dump_printf_loc (MSG_NOTE, vect_location,
2693 "re-trying with SLP disabled\n");
2695 /* Roll back state appropriately. No SLP this time. */
2696 slp = false;
2697 /* Restore vectorization factor as it were without SLP. */
2698 LOOP_VINFO_VECT_FACTOR (loop_vinfo) = saved_vectorization_factor;
2699 /* Free the SLP instances. */
2700 FOR_EACH_VEC_ELT (LOOP_VINFO_SLP_INSTANCES (loop_vinfo), j, instance)
2701 vect_free_slp_instance (instance);
2702 LOOP_VINFO_SLP_INSTANCES (loop_vinfo).release ();
2703 /* Reset SLP type to loop_vect on all stmts. */
2704 for (i = 0; i < LOOP_VINFO_LOOP (loop_vinfo)->num_nodes; ++i)
2706 basic_block bb = LOOP_VINFO_BBS (loop_vinfo)[i];
2707 for (gimple_stmt_iterator si = gsi_start_phis (bb);
2708 !gsi_end_p (si); gsi_next (&si))
2710 stmt_vec_info stmt_info = loop_vinfo->lookup_stmt (gsi_stmt (si));
2711 STMT_SLP_TYPE (stmt_info) = loop_vect;
2712 if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_reduction_def
2713 || STMT_VINFO_DEF_TYPE (stmt_info) == vect_double_reduction_def)
2715 /* vectorizable_reduction adjusts reduction stmt def-types,
2716 restore them to that of the PHI. */
2717 STMT_VINFO_DEF_TYPE (STMT_VINFO_REDUC_DEF (stmt_info))
2718 = STMT_VINFO_DEF_TYPE (stmt_info);
2719 STMT_VINFO_DEF_TYPE (vect_stmt_to_vectorize
2720 (STMT_VINFO_REDUC_DEF (stmt_info)))
2721 = STMT_VINFO_DEF_TYPE (stmt_info);
2724 for (gimple_stmt_iterator si = gsi_start_bb (bb);
2725 !gsi_end_p (si); gsi_next (&si))
2727 if (is_gimple_debug (gsi_stmt (si)))
2728 continue;
2729 stmt_vec_info stmt_info = loop_vinfo->lookup_stmt (gsi_stmt (si));
2730 STMT_SLP_TYPE (stmt_info) = loop_vect;
2731 if (STMT_VINFO_IN_PATTERN_P (stmt_info))
2733 stmt_vec_info pattern_stmt_info
2734 = STMT_VINFO_RELATED_STMT (stmt_info);
2735 if (STMT_VINFO_SLP_VECT_ONLY_PATTERN (pattern_stmt_info))
2736 STMT_VINFO_IN_PATTERN_P (stmt_info) = false;
2738 gimple *pattern_def_seq = STMT_VINFO_PATTERN_DEF_SEQ (stmt_info);
2739 STMT_SLP_TYPE (pattern_stmt_info) = loop_vect;
2740 for (gimple_stmt_iterator pi = gsi_start (pattern_def_seq);
2741 !gsi_end_p (pi); gsi_next (&pi))
2742 STMT_SLP_TYPE (loop_vinfo->lookup_stmt (gsi_stmt (pi)))
2743 = loop_vect;
2747 /* Free optimized alias test DDRS. */
2748 LOOP_VINFO_LOWER_BOUNDS (loop_vinfo).truncate (0);
2749 LOOP_VINFO_COMP_ALIAS_DDRS (loop_vinfo).release ();
2750 LOOP_VINFO_CHECK_UNEQUAL_ADDRS (loop_vinfo).release ();
2751 /* Reset target cost data. */
2752 delete LOOP_VINFO_TARGET_COST_DATA (loop_vinfo);
2753 LOOP_VINFO_TARGET_COST_DATA (loop_vinfo) = nullptr;
2754 /* Reset accumulated rgroup information. */
2755 release_vec_loop_controls (&LOOP_VINFO_MASKS (loop_vinfo));
2756 release_vec_loop_controls (&LOOP_VINFO_LENS (loop_vinfo));
2757 /* Reset assorted flags. */
2758 LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo) = false;
2759 LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo) = false;
2760 LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo) = 0;
2761 LOOP_VINFO_VERSIONING_THRESHOLD (loop_vinfo) = 0;
2762 LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo)
2763 = saved_can_use_partial_vectors_p;
2765 goto start_over;
2768 /* Return true if vectorizing a loop using NEW_LOOP_VINFO appears
2769 to be better than vectorizing it using OLD_LOOP_VINFO. Assume that
2770 OLD_LOOP_VINFO is better unless something specifically indicates
2771 otherwise.
2773 Note that this deliberately isn't a partial order. */
2775 static bool
2776 vect_better_loop_vinfo_p (loop_vec_info new_loop_vinfo,
2777 loop_vec_info old_loop_vinfo)
2779 struct loop *loop = LOOP_VINFO_LOOP (new_loop_vinfo);
2780 gcc_assert (LOOP_VINFO_LOOP (old_loop_vinfo) == loop);
2782 poly_int64 new_vf = LOOP_VINFO_VECT_FACTOR (new_loop_vinfo);
2783 poly_int64 old_vf = LOOP_VINFO_VECT_FACTOR (old_loop_vinfo);
2785 /* Always prefer a VF of loop->simdlen over any other VF. */
2786 if (loop->simdlen)
2788 bool new_simdlen_p = known_eq (new_vf, loop->simdlen);
2789 bool old_simdlen_p = known_eq (old_vf, loop->simdlen);
2790 if (new_simdlen_p != old_simdlen_p)
2791 return new_simdlen_p;
2794 /* Limit the VFs to what is likely to be the maximum number of iterations,
2795 to handle cases in which at least one loop_vinfo is fully-masked. */
2796 HOST_WIDE_INT estimated_max_niter;
2797 loop_vec_info main_loop = LOOP_VINFO_ORIG_LOOP_INFO (old_loop_vinfo);
2798 unsigned HOST_WIDE_INT main_vf;
2799 if (main_loop
2800 && LOOP_VINFO_NITERS_KNOWN_P (main_loop)
2801 && LOOP_VINFO_VECT_FACTOR (main_loop).is_constant (&main_vf))
2802 estimated_max_niter = LOOP_VINFO_INT_NITERS (main_loop) % main_vf;
2803 else
2804 estimated_max_niter = likely_max_stmt_executions_int (loop);
2805 if (estimated_max_niter != -1)
2807 if (known_le (estimated_max_niter, new_vf))
2808 new_vf = estimated_max_niter;
2809 if (known_le (estimated_max_niter, old_vf))
2810 old_vf = estimated_max_niter;
2813 /* Check whether the (fractional) cost per scalar iteration is lower
2814 or higher: new_inside_cost / new_vf vs. old_inside_cost / old_vf. */
2815 poly_int64 rel_new = new_loop_vinfo->vec_inside_cost * old_vf;
2816 poly_int64 rel_old = old_loop_vinfo->vec_inside_cost * new_vf;
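/* For example (illustrative constant VFs and costs): new_inside_cost == 20
   at new_vf == 8 versus old_inside_cost == 12 at old_vf == 4 gives
   rel_new == 20 * 4 == 80 and rel_old == 12 * 8 == 96, i.e. the new
   loop_vinfo is cheaper per scalar iteration (20/8 == 2.5 vs. 12/4 == 3),
   without having to divide by a possibly non-constant VF.  */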
2818 HOST_WIDE_INT est_rel_new_min
2819 = estimated_poly_value (rel_new, POLY_VALUE_MIN);
2820 HOST_WIDE_INT est_rel_new_max
2821 = estimated_poly_value (rel_new, POLY_VALUE_MAX);
2823 HOST_WIDE_INT est_rel_old_min
2824 = estimated_poly_value (rel_old, POLY_VALUE_MIN);
2825 HOST_WIDE_INT est_rel_old_max
2826 = estimated_poly_value (rel_old, POLY_VALUE_MAX);
2828 /* Check first if we can make out an unambiguous total order from the minimum
2829 and maximum estimates. */
2830 if (est_rel_new_min < est_rel_old_min
2831 && est_rel_new_max < est_rel_old_max)
2832 return true;
2833 else if (est_rel_old_min < est_rel_new_min
2834 && est_rel_old_max < est_rel_new_max)
2835 return false;
2836 /* When old_loop_vinfo uses a variable vectorization factor,
2837 we know that it has a lower cost for at least one runtime VF.
2838 However, we don't know how likely that VF is.
2840 One option would be to compare the costs for the estimated VFs.
2841 The problem is that that can put too much pressure on the cost
2842 model. E.g. if the estimated VF is also the lowest possible VF,
2843 and if old_loop_vinfo is 1 unit worse than new_loop_vinfo
2844 for the estimated VF, we'd then choose new_loop_vinfo even
2845 though (a) new_loop_vinfo might not actually be better than
2846 old_loop_vinfo for that VF and (b) it would be significantly
2847 worse at larger VFs.
2849 Here we go for a hacky compromise: pick new_loop_vinfo if it is
2850 no more expensive than old_loop_vinfo even after doubling the
2851 estimated old_loop_vinfo VF. For all but trivial loops, this
2852 ensures that we only pick new_loop_vinfo if it is significantly
2853 better than old_loop_vinfo at the estimated VF. */
2855 if (est_rel_old_min != est_rel_new_min
2856 || est_rel_old_max != est_rel_new_max)
2858 HOST_WIDE_INT est_rel_new_likely
2859 = estimated_poly_value (rel_new, POLY_VALUE_LIKELY);
2860 HOST_WIDE_INT est_rel_old_likely
2861 = estimated_poly_value (rel_old, POLY_VALUE_LIKELY);
2863 return est_rel_new_likely * 2 <= est_rel_old_likely;
2866 /* If there's nothing to choose between the loop bodies, see whether
2867 there's a difference in the prologue and epilogue costs. */
2868 if (new_loop_vinfo->vec_outside_cost != old_loop_vinfo->vec_outside_cost)
2869 return new_loop_vinfo->vec_outside_cost < old_loop_vinfo->vec_outside_cost;
2871 return false;
2874 /* Decide whether to replace OLD_LOOP_VINFO with NEW_LOOP_VINFO. Return
2875 true if we should. */
2877 static bool
2878 vect_joust_loop_vinfos (loop_vec_info new_loop_vinfo,
2879 loop_vec_info old_loop_vinfo)
2881 if (!vect_better_loop_vinfo_p (new_loop_vinfo, old_loop_vinfo))
2882 return false;
2884 if (dump_enabled_p ())
2885 dump_printf_loc (MSG_NOTE, vect_location,
2886 "***** Preferring vector mode %s to vector mode %s\n",
2887 GET_MODE_NAME (new_loop_vinfo->vector_mode),
2888 GET_MODE_NAME (old_loop_vinfo->vector_mode));
2889 return true;
2892 /* Analyze LOOP with VECTOR_MODES[MODE_I] and as an epilogue if MAIN_LOOP_VINFO
2893 is not NULL. Set AUTODETECTED_VECTOR_MODE if VECTOR_MODES[MODE_I] is VOIDmode
2894 and advance MODE_I to the next mode worth analyzing.
2895 Return the loop_vinfo on success and wrapped null on failure. */
2897 static opt_loop_vec_info
2898 vect_analyze_loop_1 (class loop *loop, vec_info_shared *shared,
2899 const vect_loop_form_info *loop_form_info,
2900 loop_vec_info main_loop_vinfo,
2901 const vector_modes &vector_modes, unsigned &mode_i,
2902 machine_mode &autodetected_vector_mode,
2903 bool &fatal)
2905 loop_vec_info loop_vinfo
2906 = vect_create_loop_vinfo (loop, shared, loop_form_info);
2907 if (main_loop_vinfo)
2908 LOOP_VINFO_ORIG_LOOP_INFO (loop_vinfo) = main_loop_vinfo;
2910 machine_mode vector_mode = vector_modes[mode_i];
2911 loop_vinfo->vector_mode = vector_mode;
2913 /* Run the main analysis. */
2914 opt_result res = vect_analyze_loop_2 (loop_vinfo, fatal);
2915 if (dump_enabled_p ())
2916 dump_printf_loc (MSG_NOTE, vect_location,
2917 "***** Analysis %s with vector mode %s\n",
2918 res ? "succeeded" : "failed",
2919 GET_MODE_NAME (loop_vinfo->vector_mode));
2921 /* Remember the autodetected vector mode. */
2922 if (vector_mode == VOIDmode)
2923 autodetected_vector_mode = loop_vinfo->vector_mode;
2925 /* Advance mode_i, first skipping modes that would result in the
2926 same analysis result. */
2927 while (mode_i + 1 < vector_modes.length ()
2928 && vect_chooses_same_modes_p (loop_vinfo,
2929 vector_modes[mode_i + 1]))
2931 if (dump_enabled_p ())
2932 dump_printf_loc (MSG_NOTE, vect_location,
2933 "***** The result for vector mode %s would"
2934 " be the same\n",
2935 GET_MODE_NAME (vector_modes[mode_i + 1]));
2936 mode_i += 1;
2938 if (mode_i + 1 < vector_modes.length ()
2939 && VECTOR_MODE_P (autodetected_vector_mode)
2940 && (related_vector_mode (vector_modes[mode_i + 1],
2941 GET_MODE_INNER (autodetected_vector_mode))
2942 == autodetected_vector_mode)
2943 && (related_vector_mode (autodetected_vector_mode,
2944 GET_MODE_INNER (vector_modes[mode_i + 1]))
2945 == vector_modes[mode_i + 1]))
2947 if (dump_enabled_p ())
2948 dump_printf_loc (MSG_NOTE, vect_location,
2949 "***** Skipping vector mode %s, which would"
2950 " repeat the analysis for %s\n",
2951 GET_MODE_NAME (vector_modes[mode_i + 1]),
2952 GET_MODE_NAME (autodetected_vector_mode));
2953 mode_i += 1;
2955 mode_i++;
2957 if (!res)
2959 delete loop_vinfo;
2960 if (fatal)
2961 gcc_checking_assert (main_loop_vinfo == NULL);
2962 return opt_loop_vec_info::propagate_failure (res);
2965 return opt_loop_vec_info::success (loop_vinfo);
2968 /* Function vect_analyze_loop.
2970 Apply a set of analyses on LOOP, and create a loop_vec_info struct
2971 for it. The different analyses will record information in the
2972 loop_vec_info struct. */
2973 opt_loop_vec_info
2974 vect_analyze_loop (class loop *loop, vec_info_shared *shared)
2976 DUMP_VECT_SCOPE ("analyze_loop_nest");
2978 if (loop_outer (loop)
2979 && loop_vec_info_for_loop (loop_outer (loop))
2980 && LOOP_VINFO_VECTORIZABLE_P (loop_vec_info_for_loop (loop_outer (loop))))
2981 return opt_loop_vec_info::failure_at (vect_location,
2982 "outer-loop already vectorized.\n");
2984 if (!find_loop_nest (loop, &shared->loop_nest))
2985 return opt_loop_vec_info::failure_at
2986 (vect_location,
2987 "not vectorized: loop nest containing two or more consecutive inner"
2988 " loops cannot be vectorized\n");
2990 /* Analyze the loop form. */
2991 vect_loop_form_info loop_form_info;
2992 opt_result res = vect_analyze_loop_form (loop, &loop_form_info);
2993 if (!res)
2995 if (dump_enabled_p ())
2996 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2997 "bad loop form.\n");
2998 return opt_loop_vec_info::propagate_failure (res);
3001 auto_vector_modes vector_modes;
3002 /* Autodetect first vector size we try. */
3003 vector_modes.safe_push (VOIDmode);
3004 unsigned int autovec_flags
3005 = targetm.vectorize.autovectorize_vector_modes (&vector_modes,
3006 loop->simdlen != 0);
3007 bool pick_lowest_cost_p = ((autovec_flags & VECT_COMPARE_COSTS)
3008 && !unlimited_cost_model (loop));
3009 machine_mode autodetected_vector_mode = VOIDmode;
3010 opt_loop_vec_info first_loop_vinfo = opt_loop_vec_info::success (NULL);
3011 unsigned int mode_i = 0;
3012 unsigned int first_loop_i = 0;
3013 unsigned int first_loop_next_i = 0;
3014 unsigned HOST_WIDE_INT simdlen = loop->simdlen;
3016 /* First determine the main loop vectorization mode, either the first
3017 one that works, starting with auto-detecting the vector mode and then
3018 following the targets order of preference, or the one with the
3019 lowest cost if pick_lowest_cost_p. */
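/* For example (illustrative only): if the target advertises modes M1 and M2
   in that order and VECT_COMPARE_COSTS is not set, the first mode whose
   analysis succeeds is committed to; with VECT_COMPARE_COSTS the remaining
   modes are analyzed as well and vect_joust_loop_vinfos keeps whichever
   candidate looks cheaper.  */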
3020 while (1)
3022 unsigned int loop_vinfo_i = mode_i;
3023 bool fatal;
3024 opt_loop_vec_info loop_vinfo
3025 = vect_analyze_loop_1 (loop, shared, &loop_form_info,
3026 NULL, vector_modes, mode_i,
3027 autodetected_vector_mode, fatal);
3028 if (fatal)
3029 break;
3031 if (loop_vinfo)
3033 /* Once we hit the desired simdlen for the first time,
3034 discard any previous attempts. */
3035 if (simdlen
3036 && known_eq (LOOP_VINFO_VECT_FACTOR (loop_vinfo), simdlen))
3038 delete first_loop_vinfo;
3039 first_loop_vinfo = opt_loop_vec_info::success (NULL);
3040 simdlen = 0;
3042 else if (pick_lowest_cost_p
3043 && first_loop_vinfo
3044 && vect_joust_loop_vinfos (loop_vinfo, first_loop_vinfo))
3046 /* Pick loop_vinfo over first_loop_vinfo. */
3047 delete first_loop_vinfo;
3048 first_loop_vinfo = opt_loop_vec_info::success (NULL);
3050 if (first_loop_vinfo == NULL)
3052 first_loop_vinfo = loop_vinfo;
3053 first_loop_i = loop_vinfo_i;
3054 first_loop_next_i = mode_i;
3056 else
3058 delete loop_vinfo;
3059 loop_vinfo = opt_loop_vec_info::success (NULL);
3062 /* Commit to first_loop_vinfo if we have no reason to try
3063 alternatives. */
3064 if (!simdlen && !pick_lowest_cost_p)
3065 break;
3067 if (mode_i == vector_modes.length ()
3068 || autodetected_vector_mode == VOIDmode)
3069 break;
3071 /* Try the next biggest vector size. */
3072 if (dump_enabled_p ())
3073 dump_printf_loc (MSG_NOTE, vect_location,
3074 "***** Re-trying analysis with vector mode %s\n",
3075 GET_MODE_NAME (vector_modes[mode_i]));
3077 if (!first_loop_vinfo)
3078 return opt_loop_vec_info::propagate_failure (res);
3080 if (dump_enabled_p ())
3081 dump_printf_loc (MSG_NOTE, vect_location,
3082 "***** Choosing vector mode %s\n",
3083 GET_MODE_NAME (first_loop_vinfo->vector_mode));
3085 /* Only vectorize epilogues if PARAM_VECT_EPILOGUES_NOMASK is
3086 enabled, SIMDUID is not set, it is the innermost loop, the main loop
3087 needs peeling for niters, and we have either already found the loop's
3088 SIMDLEN or there was no SIMDLEN to begin with.
3089 TODO: Enable epilogue vectorization for loops with SIMDUID set. */
3090 bool vect_epilogues = (!simdlen
3091 && loop->inner == NULL
3092 && param_vect_epilogues_nomask
3093 && LOOP_VINFO_PEELING_FOR_NITER (first_loop_vinfo)
3094 && !loop->simduid);
3095 if (!vect_epilogues)
3096 return first_loop_vinfo;
3098 /* Now analyze first_loop_vinfo for epilogue vectorization. */
3099 poly_uint64 lowest_th = LOOP_VINFO_VERSIONING_THRESHOLD (first_loop_vinfo);
3101 /* Handle the case in which the original loop can use partial
3102 vectorization, but we only want to adopt it for the epilogue loop.
3103 The retry should use the same vector mode as the original loop. */
3104 if (LOOP_VINFO_EPIL_USING_PARTIAL_VECTORS_P (first_loop_vinfo))
3106 gcc_assert (LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (first_loop_vinfo)
3107 && !LOOP_VINFO_USING_PARTIAL_VECTORS_P (first_loop_vinfo));
3108 if (dump_enabled_p ())
3109 dump_printf_loc (MSG_NOTE, vect_location,
3110 "***** Re-trying analysis with same vector mode"
3111 " %s for epilogue with partial vectors.\n",
3112 GET_MODE_NAME (first_loop_vinfo->vector_mode));
3113 mode_i = first_loop_i;
3115 else
3117 mode_i = first_loop_next_i;
3118 if (mode_i == vector_modes.length ())
3119 return first_loop_vinfo;
3122 /* ??? If first_loop_vinfo was using VOIDmode then we probably
3123 want to instead search for the corresponding mode in vector_modes[]. */
3125 while (1)
3127 bool fatal;
3128 opt_loop_vec_info loop_vinfo
3129 = vect_analyze_loop_1 (loop, shared, &loop_form_info,
3130 first_loop_vinfo,
3131 vector_modes, mode_i,
3132 autodetected_vector_mode, fatal);
3133 if (fatal)
3134 break;
3136 if (loop_vinfo)
3138 if (pick_lowest_cost_p)
3140 /* Keep trying to roll back vectorization attempts while the
3141 loop_vec_infos they produced were worse than this one. */
3142 vec<loop_vec_info> &vinfos = first_loop_vinfo->epilogue_vinfos;
3143 while (!vinfos.is_empty ()
3144 && vect_joust_loop_vinfos (loop_vinfo, vinfos.last ()))
3146 gcc_assert (vect_epilogues);
3147 delete vinfos.pop ();
3150 /* For now only allow one epilogue loop. */
3151 if (first_loop_vinfo->epilogue_vinfos.is_empty ())
3153 first_loop_vinfo->epilogue_vinfos.safe_push (loop_vinfo);
3154 poly_uint64 th = LOOP_VINFO_VERSIONING_THRESHOLD (loop_vinfo);
3155 gcc_assert (!LOOP_REQUIRES_VERSIONING (loop_vinfo)
3156 || maybe_ne (lowest_th, 0U));
3157 /* Keep track of the known smallest versioning
3158 threshold. */
3159 if (ordered_p (lowest_th, th))
3160 lowest_th = ordered_min (lowest_th, th);
3162 else
3164 delete loop_vinfo;
3165 loop_vinfo = opt_loop_vec_info::success (NULL);
3168 /* For now only allow one epilogue loop, but allow
3169 pick_lowest_cost_p to replace it, so commit to the
3170 first epilogue if we have no reason to try alternatives. */
3171 if (!pick_lowest_cost_p)
3172 break;
3175 if (mode_i == vector_modes.length ())
3176 break;
3178 /* Try the next biggest vector size. */
3179 if (dump_enabled_p ())
3180 dump_printf_loc (MSG_NOTE, vect_location,
3181 "***** Re-trying epilogue analysis with vector "
3182 "mode %s\n", GET_MODE_NAME (vector_modes[mode_i]));
3185 if (!first_loop_vinfo->epilogue_vinfos.is_empty ())
3187 LOOP_VINFO_VERSIONING_THRESHOLD (first_loop_vinfo) = lowest_th;
3188 if (dump_enabled_p ())
3189 dump_printf_loc (MSG_NOTE, vect_location,
3190 "***** Choosing epilogue vector mode %s\n",
3191 GET_MODE_NAME
3192 (first_loop_vinfo->epilogue_vinfos[0]->vector_mode));
3195 return first_loop_vinfo;
3198 /* Return true if there is an in-order reduction function for CODE, storing
3199 it in *REDUC_FN if so. */
3201 static bool
3202 fold_left_reduction_fn (tree_code code, internal_fn *reduc_fn)
3204 switch (code)
3206 case PLUS_EXPR:
3207 *reduc_fn = IFN_FOLD_LEFT_PLUS;
3208 return true;
3210 default:
3211 return false;
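/* For illustration: an in-order (fold-left) reduction such as

     double sum = init;
     for (int i = 0; i < n; i++)
       sum += x[i];

   must be computed as ((((init + x[0]) + x[1]) + x[2]) + ...), preserving
   the original evaluation order rather than forming a tree of partial sums;
   IFN_FOLD_LEFT_PLUS provides exactly that for PLUS_EXPR.  */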
3215 /* Function reduction_fn_for_scalar_code
3217 Input:
3218 CODE - tree_code of a reduction operation.
3220 Output:
3221 REDUC_FN - the corresponding internal function to be used to reduce the
3222 vector of partial results into a single scalar result, or IFN_LAST
3223 if the operation is a supported reduction operation, but does not have
3224 such an internal function.
3226 Return FALSE if CODE currently cannot be vectorized as reduction. */
3228 bool
3229 reduction_fn_for_scalar_code (enum tree_code code, internal_fn *reduc_fn)
3231 switch (code)
3233 case MAX_EXPR:
3234 *reduc_fn = IFN_REDUC_MAX;
3235 return true;
3237 case MIN_EXPR:
3238 *reduc_fn = IFN_REDUC_MIN;
3239 return true;
3241 case PLUS_EXPR:
3242 *reduc_fn = IFN_REDUC_PLUS;
3243 return true;
3245 case BIT_AND_EXPR:
3246 *reduc_fn = IFN_REDUC_AND;
3247 return true;
3249 case BIT_IOR_EXPR:
3250 *reduc_fn = IFN_REDUC_IOR;
3251 return true;
3253 case BIT_XOR_EXPR:
3254 *reduc_fn = IFN_REDUC_XOR;
3255 return true;
3257 case MULT_EXPR:
3258 case MINUS_EXPR:
3259 *reduc_fn = IFN_LAST;
3260 return true;
3262 default:
3263 return false;
3267 /* If there is a neutral value X such that a reduction would not be affected
3268 by the introduction of additional X elements, return that X, otherwise
3269 return null. CODE is the code of the reduction and SCALAR_TYPE is type
3270 of the scalar elements. If the reduction has just a single initial value
3271 then INITIAL_VALUE is that value, otherwise it is null. */
3273 static tree
3274 neutral_op_for_reduction (tree scalar_type, tree_code code, tree initial_value)
3276 switch (code)
3278 case WIDEN_SUM_EXPR:
3279 case DOT_PROD_EXPR:
3280 case SAD_EXPR:
3281 case PLUS_EXPR:
3282 case MINUS_EXPR:
3283 case BIT_IOR_EXPR:
3284 case BIT_XOR_EXPR:
3285 return build_zero_cst (scalar_type);
3287 case MULT_EXPR:
3288 return build_one_cst (scalar_type);
3290 case BIT_AND_EXPR:
3291 return build_all_ones_cst (scalar_type);
3293 case MAX_EXPR:
3294 case MIN_EXPR:
3295 return initial_value;
3297 default:
3298 return NULL_TREE;
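/* For illustration: the neutral value lets extra vector lanes be filled
   without changing the result, e.g. padding a PLUS_EXPR reduction with 0
   ({1, 2, 3, 0} sums to the same value as {1, 2, 3}), a MULT_EXPR
   reduction with 1, or a BIT_AND_EXPR reduction with all-ones.  MIN_EXPR
   and MAX_EXPR have no universal neutral element, so the single initial
   value (if any) is used instead.  */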
3302 /* Error reporting helper for vect_is_simple_reduction below. GIMPLE statement
3303 STMT is printed with a message MSG. */
3305 static void
3306 report_vect_op (dump_flags_t msg_type, gimple *stmt, const char *msg)
3308 dump_printf_loc (msg_type, vect_location, "%s%G", msg, stmt);
3311 /* Return true if we need an in-order reduction for operation CODE
3312 on type TYPE. */
3315 bool
3316 needs_fold_left_reduction_p (tree type, tree_code code)
3318 /* CHECKME: check for !flag_finite_math_only too? */
3319 if (SCALAR_FLOAT_TYPE_P (type))
3320 switch (code)
3322 case MIN_EXPR:
3323 case MAX_EXPR:
3324 return false;
3326 default:
3327 return !flag_associative_math;
3330 if (INTEGRAL_TYPE_P (type))
3332 if (!operation_no_trapping_overflow (type, code))
3333 return true;
3334 return false;
3337 if (SAT_FIXED_POINT_TYPE_P (type))
3338 return true;
3340 return false;
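/* For illustration: floating-point addition is not associative, e.g. with
   doubles (1e16 + 1.0) - 1e16 evaluates to 0.0 rather than 1.0, so without
   -fassociative-math a floating-point PLUS_EXPR reduction must be computed
   in order (fold-left) to preserve the scalar result.  */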
3343 /* Return true if the reduction PHI in LOOP with latch arg LOOP_ARG
3344 has a handled computation expression. Store the main reduction
3345 operation in *CODE. */
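/* For illustration (hypothetical SSA names): for a chained reduction

     s_1 = PHI <s_0(preheader), s_3(latch)>
     t_2 = s_1 + x_4;
     s_3 = t_2 + y_5;

   the path from the latch argument s_3 back to the PHI result s_1 is
   s_3 -> t_2 -> s_1, every statement on it uses PLUS_EXPR, and *CODE is
   set to PLUS_EXPR; mixing different codes along the path makes it
   invalid.  */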
3347 static bool
3348 check_reduction_path (dump_user_location_t loc, loop_p loop, gphi *phi,
3349 tree loop_arg, enum tree_code *code,
3350 vec<std::pair<ssa_op_iter, use_operand_p> > &path)
3352 auto_bitmap visited;
3353 tree lookfor = PHI_RESULT (phi);
3354 ssa_op_iter curri;
3355 use_operand_p curr = op_iter_init_phiuse (&curri, phi, SSA_OP_USE);
3356 while (USE_FROM_PTR (curr) != loop_arg)
3357 curr = op_iter_next_use (&curri);
3358 curri.i = curri.numops;
3361 path.safe_push (std::make_pair (curri, curr));
3362 tree use = USE_FROM_PTR (curr);
3363 if (use == lookfor)
3364 break;
3365 gimple *def = SSA_NAME_DEF_STMT (use);
3366 if (gimple_nop_p (def)
3367 || ! flow_bb_inside_loop_p (loop, gimple_bb (def)))
3369 pop:
3372 std::pair<ssa_op_iter, use_operand_p> x = path.pop ();
3373 curri = x.first;
3374 curr = x.second;
3376 curr = op_iter_next_use (&curri);
3377 /* Skip already visited or non-SSA operands (from iterating
3378 over PHI args). */
3379 while (curr != NULL_USE_OPERAND_P
3380 && (TREE_CODE (USE_FROM_PTR (curr)) != SSA_NAME
3381 || ! bitmap_set_bit (visited,
3382 SSA_NAME_VERSION
3383 (USE_FROM_PTR (curr)))));
3385 while (curr == NULL_USE_OPERAND_P && ! path.is_empty ());
3386 if (curr == NULL_USE_OPERAND_P)
3387 break;
3389 else
3391 if (gimple_code (def) == GIMPLE_PHI)
3392 curr = op_iter_init_phiuse (&curri, as_a <gphi *>(def), SSA_OP_USE);
3393 else
3394 curr = op_iter_init_use (&curri, def, SSA_OP_USE);
3395 while (curr != NULL_USE_OPERAND_P
3396 && (TREE_CODE (USE_FROM_PTR (curr)) != SSA_NAME
3397 || ! bitmap_set_bit (visited,
3398 SSA_NAME_VERSION
3399 (USE_FROM_PTR (curr)))))
3400 curr = op_iter_next_use (&curri);
3401 if (curr == NULL_USE_OPERAND_P)
3402 goto pop;
3405 while (1);
3406 if (dump_file && (dump_flags & TDF_DETAILS))
3408 dump_printf_loc (MSG_NOTE, loc, "reduction path: ");
3409 unsigned i;
3410 std::pair<ssa_op_iter, use_operand_p> *x;
3411 FOR_EACH_VEC_ELT (path, i, x)
3412 dump_printf (MSG_NOTE, "%T ", USE_FROM_PTR (x->second));
3413 dump_printf (MSG_NOTE, "\n");
3416 /* Check whether the reduction path detected is valid. */
3417 bool fail = path.length () == 0;
3418 bool neg = false;
3419 int sign = -1;
3420 *code = ERROR_MARK;
3421 for (unsigned i = 1; i < path.length (); ++i)
3423 gimple *use_stmt = USE_STMT (path[i].second);
3424 tree op = USE_FROM_PTR (path[i].second);
3425 if (! is_gimple_assign (use_stmt)
3426 /* The following makes sure we can compute the operand index
3427 easily, plus it mostly disallows chaining via COND_EXPR condition
3428 operands. */
3429 || (gimple_assign_rhs1_ptr (use_stmt) != path[i].second->use
3430 && (gimple_num_ops (use_stmt) <= 2
3431 || gimple_assign_rhs2_ptr (use_stmt) != path[i].second->use)
3432 && (gimple_num_ops (use_stmt) <= 3
3433 || gimple_assign_rhs3_ptr (use_stmt) != path[i].second->use)))
3435 fail = true;
3436 break;
3438 tree_code use_code = gimple_assign_rhs_code (use_stmt);
3439 if (use_code == MINUS_EXPR)
3441 use_code = PLUS_EXPR;
3442 /* Track whether we negate the reduction value each iteration. */
3443 if (gimple_assign_rhs2 (use_stmt) == op)
3444 neg = ! neg;
3446 if (CONVERT_EXPR_CODE_P (use_code)
3447 && tree_nop_conversion_p (TREE_TYPE (gimple_assign_lhs (use_stmt)),
3448 TREE_TYPE (gimple_assign_rhs1 (use_stmt))))
3450 else if (*code == ERROR_MARK)
3452 *code = use_code;
3453 sign = TYPE_SIGN (TREE_TYPE (gimple_assign_lhs (use_stmt)));
3455 else if (use_code != *code)
3457 fail = true;
3458 break;
3460 else if ((use_code == MIN_EXPR
3461 || use_code == MAX_EXPR)
3462 && sign != TYPE_SIGN (TREE_TYPE (gimple_assign_lhs (use_stmt))))
3464 fail = true;
3465 break;
3467 /* Check that the op is used in only a single stmt. For the
3468 non-value-changing tail and the last stmt allow out-of-loop uses.
3469 ??? We could relax this and handle arbitrary live stmts by
3470 forcing a scalar epilogue for example. */
3471 imm_use_iterator imm_iter;
3472 gimple *op_use_stmt;
3473 unsigned cnt = 0;
3474 FOR_EACH_IMM_USE_STMT (op_use_stmt, imm_iter, op)
3475 if (!is_gimple_debug (op_use_stmt)
3476 && (*code != ERROR_MARK
3477 || flow_bb_inside_loop_p (loop, gimple_bb (op_use_stmt))))
3479 /* We want to allow x + x but not x < 1 ? x : 2. */
3480 if (is_gimple_assign (op_use_stmt)
3481 && gimple_assign_rhs_code (op_use_stmt) == COND_EXPR)
3483 use_operand_p use_p;
3484 FOR_EACH_IMM_USE_ON_STMT (use_p, imm_iter)
3485 cnt++;
3487 else
3488 cnt++;
3490 if (cnt != 1)
3492 fail = true;
3493 break;
3496 return ! fail && ! neg && *code != ERROR_MARK;
3499 bool
3500 check_reduction_path (dump_user_location_t loc, loop_p loop, gphi *phi,
3501 tree loop_arg, enum tree_code code)
3503 auto_vec<std::pair<ssa_op_iter, use_operand_p> > path;
3504 enum tree_code code_;
3505 return (check_reduction_path (loc, loop, phi, loop_arg, &code_, path)
3506 && code_ == code);
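/* Illustrative sketch (added for exposition, not part of this pass):
   the kind of scalar cycle whose def-use path check_reduction_path
   walks.  Plain C with a hypothetical sketch_* name; the SSA names
   in the comment are schematic only.  */
static int
sketch_reduction_cycle (const int *a, int n)
{
  int sum = 0;
  for (int i = 0; i < n; i++)
    /* In SSA form this is roughly
         sum_1 = PHI <0 (preheader), sum_2 (latch)>
         sum_2 = sum_1 + a[i];
       check_reduction_path walks from the latch value sum_2 back to
       the PHI result sum_1 and records PLUS_EXPR in *CODE.  */
    sum += a[i];
  return sum;
}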
3511 /* Function vect_is_simple_reduction
3513 (1) Detect a cross-iteration def-use cycle that represents a simple
3514 reduction computation. We look for the following pattern:
3516 loop_header:
3517 a1 = phi < a0, a2 >
3518 a3 = ...
3519 a2 = operation (a3, a1)
3523 a3 = ...
3524 loop_header:
3525 a1 = phi < a0, a2 >
3526 a2 = operation (a3, a1)
3528 such that:
3529 1. operation is commutative and associative and it is safe to
3530 change the order of the computation
3531 2. no uses for a2 in the loop (a2 is used out of the loop)
3532 3. no uses of a1 in the loop besides the reduction operation
3533 4. no uses of a1 outside the loop.
3535 Conditions 1,4 are tested here.
3536 Conditions 2,3 are tested in vect_mark_stmts_to_be_vectorized.
3538 (2) Detect a cross-iteration def-use cycle in nested loops, i.e.,
3539 nested cycles.
3541 (3) Detect cycles of phi nodes in outer-loop vectorization, i.e., double
3542 reductions:
3544 a1 = phi < a0, a2 >
3545 inner loop (def of a3)
3546 a2 = phi < a3 >
3548 (4) Detect condition expressions, i.e.:
3549 for (int i = 0; i < N; i++)
3550 if (a[i] < val)
3551 ret_val = a[i];
3555 static stmt_vec_info
3556 vect_is_simple_reduction (loop_vec_info loop_info, stmt_vec_info phi_info,
3557 bool *double_reduc, bool *reduc_chain_p)
3559 gphi *phi = as_a <gphi *> (phi_info->stmt);
3560 gimple *phi_use_stmt = NULL;
3561 imm_use_iterator imm_iter;
3562 use_operand_p use_p;
3564 *double_reduc = false;
3565 *reduc_chain_p = false;
3566 STMT_VINFO_REDUC_TYPE (phi_info) = TREE_CODE_REDUCTION;
3568 tree phi_name = PHI_RESULT (phi);
3569 /* ??? If there are no uses of the PHI result the inner loop reduction
3570 won't be detected as possibly double-reduction by vectorizable_reduction
3571 because that tries to walk the PHI arg from the preheader edge which
3572 can be constant. See PR60382. */
3573 if (has_zero_uses (phi_name))
3574 return NULL;
3575 class loop *loop = (gimple_bb (phi))->loop_father;
3576 unsigned nphi_def_loop_uses = 0;
3577 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, phi_name)
3579 gimple *use_stmt = USE_STMT (use_p);
3580 if (is_gimple_debug (use_stmt))
3581 continue;
3583 if (!flow_bb_inside_loop_p (loop, gimple_bb (use_stmt)))
3585 if (dump_enabled_p ())
3586 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3587 "intermediate value used outside loop.\n");
3589 return NULL;
3592 nphi_def_loop_uses++;
3593 phi_use_stmt = use_stmt;
3596 tree latch_def = PHI_ARG_DEF_FROM_EDGE (phi, loop_latch_edge (loop));
3597 if (TREE_CODE (latch_def) != SSA_NAME)
3599 if (dump_enabled_p ())
3600 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3601 "reduction: not ssa_name: %T\n", latch_def);
3602 return NULL;
3605 stmt_vec_info def_stmt_info = loop_info->lookup_def (latch_def);
3606 if (!def_stmt_info
3607 || !flow_bb_inside_loop_p (loop, gimple_bb (def_stmt_info->stmt)))
3608 return NULL;
3610 bool nested_in_vect_loop
3611 = flow_loop_nested_p (LOOP_VINFO_LOOP (loop_info), loop);
3612 unsigned nlatch_def_loop_uses = 0;
3613 auto_vec<gphi *, 3> lcphis;
3614 bool inner_loop_of_double_reduc = false;
3615 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, latch_def)
3617 gimple *use_stmt = USE_STMT (use_p);
3618 if (is_gimple_debug (use_stmt))
3619 continue;
3620 if (flow_bb_inside_loop_p (loop, gimple_bb (use_stmt)))
3621 nlatch_def_loop_uses++;
3622 else
3624 /* We can have more than one loop-closed PHI. */
3625 lcphis.safe_push (as_a <gphi *> (use_stmt));
3626 if (nested_in_vect_loop
3627 && (STMT_VINFO_DEF_TYPE (loop_info->lookup_stmt (use_stmt))
3628 == vect_double_reduction_def))
3629 inner_loop_of_double_reduc = true;
3633 /* If we are vectorizing an inner reduction we are executing that
3634 in the original order only if we are not dealing with a
3635 double reduction. */
3636 if (nested_in_vect_loop && !inner_loop_of_double_reduc)
3638 if (dump_enabled_p ())
3639 report_vect_op (MSG_NOTE, def_stmt_info->stmt,
3640 "detected nested cycle: ");
3641 return def_stmt_info;
3644 /* If this isn't a nested cycle or if the nested cycle reduction value
3645 is used outside of the inner loop we cannot handle uses of the reduction
3646 value. */
3647 if (nlatch_def_loop_uses > 1 || nphi_def_loop_uses > 1)
3649 if (dump_enabled_p ())
3650 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3651 "reduction used in loop.\n");
3652 return NULL;
3655 /* If DEF_STMT is a phi node itself, we expect it to have a single argument
3656 defined in the inner loop. */
3657 if (gphi *def_stmt = dyn_cast <gphi *> (def_stmt_info->stmt))
3659 tree op1 = PHI_ARG_DEF (def_stmt, 0);
3660 if (gimple_phi_num_args (def_stmt) != 1
3661 || TREE_CODE (op1) != SSA_NAME)
3663 if (dump_enabled_p ())
3664 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3665 "unsupported phi node definition.\n");
3667 return NULL;
3670 gimple *def1 = SSA_NAME_DEF_STMT (op1);
3671 if (gimple_bb (def1)
3672 && flow_bb_inside_loop_p (loop, gimple_bb (def_stmt))
3673 && loop->inner
3674 && flow_bb_inside_loop_p (loop->inner, gimple_bb (def1))
3675 && is_gimple_assign (def1)
3676 && is_a <gphi *> (phi_use_stmt)
3677 && flow_bb_inside_loop_p (loop->inner, gimple_bb (phi_use_stmt)))
3679 if (dump_enabled_p ())
3680 report_vect_op (MSG_NOTE, def_stmt,
3681 "detected double reduction: ");
3683 *double_reduc = true;
3684 return def_stmt_info;
3687 return NULL;
3690 /* Look for the expression computing latch_def from the loop PHI result. */
3691 auto_vec<std::pair<ssa_op_iter, use_operand_p> > path;
3692 enum tree_code code;
3693 if (check_reduction_path (vect_location, loop, phi, latch_def, &code,
3694 path))
3696 STMT_VINFO_REDUC_CODE (phi_info) = code;
3697 if (code == COND_EXPR && !nested_in_vect_loop)
3698 STMT_VINFO_REDUC_TYPE (phi_info) = COND_REDUCTION;
3700 /* Fill in STMT_VINFO_REDUC_IDX and gather stmts for an SLP
3701 reduction chain for which the additional restriction is that
3702 all operations in the chain are the same. */
3703 auto_vec<stmt_vec_info, 8> reduc_chain;
3704 unsigned i;
3705 bool is_slp_reduc = !nested_in_vect_loop && code != COND_EXPR;
3706 for (i = path.length () - 1; i >= 1; --i)
3708 gimple *stmt = USE_STMT (path[i].second);
3709 stmt_vec_info stmt_info = loop_info->lookup_stmt (stmt);
3710 STMT_VINFO_REDUC_IDX (stmt_info)
3711 = path[i].second->use - gimple_assign_rhs1_ptr (stmt);
3712 enum tree_code stmt_code = gimple_assign_rhs_code (stmt);
3713 bool leading_conversion = (CONVERT_EXPR_CODE_P (stmt_code)
3714 && (i == 1 || i == path.length () - 1));
3715 if ((stmt_code != code && !leading_conversion)
3716 /* We can only handle the final value in epilogue
3717 generation for reduction chains. */
3718 || (i != 1 && !has_single_use (gimple_assign_lhs (stmt))))
3719 is_slp_reduc = false;
3720 /* For reduction chains we support trailing/leading
3721 conversions. We do not store those in the actual chain. */
3722 if (leading_conversion)
3723 continue;
3724 reduc_chain.safe_push (stmt_info);
3726 if (is_slp_reduc && reduc_chain.length () > 1)
3728 for (unsigned i = 0; i < reduc_chain.length () - 1; ++i)
3730 REDUC_GROUP_FIRST_ELEMENT (reduc_chain[i]) = reduc_chain[0];
3731 REDUC_GROUP_NEXT_ELEMENT (reduc_chain[i]) = reduc_chain[i+1];
3733 REDUC_GROUP_FIRST_ELEMENT (reduc_chain.last ()) = reduc_chain[0];
3734 REDUC_GROUP_NEXT_ELEMENT (reduc_chain.last ()) = NULL;
3736 /* Save the chain for further analysis in SLP detection. */
3737 LOOP_VINFO_REDUCTION_CHAINS (loop_info).safe_push (reduc_chain[0]);
3738 REDUC_GROUP_SIZE (reduc_chain[0]) = reduc_chain.length ();
3740 *reduc_chain_p = true;
3741 if (dump_enabled_p ())
3742 dump_printf_loc (MSG_NOTE, vect_location,
3743 "reduction: detected reduction chain\n");
3745 else if (dump_enabled_p ())
3746 dump_printf_loc (MSG_NOTE, vect_location,
3747 "reduction: detected reduction\n");
3749 return def_stmt_info;
3752 if (dump_enabled_p ())
3753 dump_printf_loc (MSG_NOTE, vect_location,
3754 "reduction: unknown pattern\n");
3756 return NULL;
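/* Illustrative sketch (added for exposition, not part of this pass):
   roughly the source shape detected as a reduction chain above, i.e.
   several statements per iteration that all apply the same operation
   and each feed only the next statement.  Plain C, hypothetical
   sketch_* name.  */
static int
sketch_reduction_chain (const int *a, int n)
{
  int s = 0;
  for (int i = 0; i < n / 4; i++)
    {
      s = s + a[4 * i];
      s = s + a[4 * i + 1];
      s = s + a[4 * i + 2];
      s = s + a[4 * i + 3];
    }
  return s;
}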
3759 /* Estimate the number of peeled epilogue iterations for LOOP_VINFO.
3760 PEEL_ITERS_PROLOGUE is the number of peeled prologue iterations,
3761 or -1 if not known. */
3763 static int
3764 vect_get_peel_iters_epilogue (loop_vec_info loop_vinfo, int peel_iters_prologue)
3766 int assumed_vf = vect_vf_for_cost (loop_vinfo);
3767 if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo) || peel_iters_prologue == -1)
3769 if (dump_enabled_p ())
3770 dump_printf_loc (MSG_NOTE, vect_location,
3771 "cost model: epilogue peel iters set to vf/2 "
3772 "because loop iterations are unknown.\n");
3773 return assumed_vf / 2;
3775 else
3777 int niters = LOOP_VINFO_INT_NITERS (loop_vinfo);
3778 peel_iters_prologue = MIN (niters, peel_iters_prologue);
3779 int peel_iters_epilogue = (niters - peel_iters_prologue) % assumed_vf;
3780 /* If we need to peel for gaps, but no epilogue peeling would otherwise
3781 be required, we have to peel VF iterations. */
3782 if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo) && !peel_iters_epilogue)
3783 peel_iters_epilogue = assumed_vf;
3784 return peel_iters_epilogue;
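/* Illustrative sketch (added for exposition, not part of this pass):
   the arithmetic above with plain ints.  The sketch_* name and the
   example numbers are hypothetical; niters < 0 stands in for an
   unknown trip count.  */
static int
sketch_peel_iters_epilogue (int niters, int peel_prologue, int vf,
			    int peeling_for_gaps)
{
  if (niters < 0 || peel_prologue < 0)
    /* Unknown trip count or prologue peel: assume vf/2, as above.  */
    return vf / 2;
  if (peel_prologue > niters)
    peel_prologue = niters;
  int epilogue = (niters - peel_prologue) % vf;
  /* Peeling for gaps needs at least one full vector's worth, e.g.
     niters = 23, peel_prologue = 3, vf = 4 gives (23 - 3) % 4 = 0,
     which is bumped to 4 when peeling for gaps is required.  */
  if (peeling_for_gaps && epilogue == 0)
    epilogue = vf;
  return epilogue;
}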
3788 /* Calculate cost of peeling the loop PEEL_ITERS_PROLOGUE times. */
3790 vect_get_known_peeling_cost (loop_vec_info loop_vinfo, int peel_iters_prologue,
3791 int *peel_iters_epilogue,
3792 stmt_vector_for_cost *scalar_cost_vec,
3793 stmt_vector_for_cost *prologue_cost_vec,
3794 stmt_vector_for_cost *epilogue_cost_vec)
3796 int retval = 0;
3798 *peel_iters_epilogue
3799 = vect_get_peel_iters_epilogue (loop_vinfo, peel_iters_prologue);
3801 if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
3803 /* If peeled iterations are known but the number of scalar loop
3804 iterations is unknown, count a taken branch per peeled loop. */
3805 if (peel_iters_prologue > 0)
3806 retval = record_stmt_cost (prologue_cost_vec, 1, cond_branch_taken,
3807 NULL, NULL_TREE, 0, vect_prologue);
3808 if (*peel_iters_epilogue > 0)
3809 retval += record_stmt_cost (epilogue_cost_vec, 1, cond_branch_taken,
3810 NULL, NULL_TREE, 0, vect_epilogue);
3813 stmt_info_for_cost *si;
3814 int j;
3815 if (peel_iters_prologue)
3816 FOR_EACH_VEC_ELT (*scalar_cost_vec, j, si)
3817 retval += record_stmt_cost (prologue_cost_vec,
3818 si->count * peel_iters_prologue,
3819 si->kind, si->stmt_info, si->misalign,
3820 vect_prologue);
3821 if (*peel_iters_epilogue)
3822 FOR_EACH_VEC_ELT (*scalar_cost_vec, j, si)
3823 retval += record_stmt_cost (epilogue_cost_vec,
3824 si->count * *peel_iters_epilogue,
3825 si->kind, si->stmt_info, si->misalign,
3826 vect_epilogue);
3828 return retval;
3831 /* Function vect_estimate_min_profitable_iters
3833 Return the number of iterations required for the vector version of the
3834 loop to be profitable relative to the cost of the scalar version of the
3835 loop.
3837 *RET_MIN_PROFITABLE_NITERS is a cost model profitability threshold
3838 of iterations for vectorization. A value of -1 means loop vectorization
3839 is not profitable. This returned value may be used for a dynamic
3840 profitability check.
3842 *RET_MIN_PROFITABLE_ESTIMATE is a profitability threshold to be used
3843 for static check against estimated number of iterations. */
3845 static void
3846 vect_estimate_min_profitable_iters (loop_vec_info loop_vinfo,
3847 int *ret_min_profitable_niters,
3848 int *ret_min_profitable_estimate)
3850 int min_profitable_iters;
3851 int min_profitable_estimate;
3852 int peel_iters_prologue;
3853 int peel_iters_epilogue;
3854 unsigned vec_inside_cost = 0;
3855 int vec_outside_cost = 0;
3856 unsigned vec_prologue_cost = 0;
3857 unsigned vec_epilogue_cost = 0;
3858 int scalar_single_iter_cost = 0;
3859 int scalar_outside_cost = 0;
3860 int assumed_vf = vect_vf_for_cost (loop_vinfo);
3861 int npeel = LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo);
3862 vector_costs *target_cost_data = LOOP_VINFO_TARGET_COST_DATA (loop_vinfo);
3864 /* Cost model disabled. */
3865 if (unlimited_cost_model (LOOP_VINFO_LOOP (loop_vinfo)))
3867 if (dump_enabled_p ())
3868 dump_printf_loc (MSG_NOTE, vect_location, "cost model disabled.\n");
3869 *ret_min_profitable_niters = 0;
3870 *ret_min_profitable_estimate = 0;
3871 return;
3874 /* Requires loop versioning tests to handle misalignment. */
3875 if (LOOP_REQUIRES_VERSIONING_FOR_ALIGNMENT (loop_vinfo))
3877 /* FIXME: Make cost depend on complexity of individual check. */
3878 unsigned len = LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).length ();
3879 (void) add_stmt_cost (target_cost_data, len, vector_stmt,
3880 NULL, NULL_TREE, 0, vect_prologue);
3881 if (dump_enabled_p ())
3882 dump_printf (MSG_NOTE,
3883 "cost model: Adding cost of checks for loop "
3884 "versioning to treat misalignment.\n");
3887 /* Requires loop versioning with alias checks. */
3888 if (LOOP_REQUIRES_VERSIONING_FOR_ALIAS (loop_vinfo))
3890 /* FIXME: Make cost depend on complexity of individual check. */
3891 unsigned len = LOOP_VINFO_COMP_ALIAS_DDRS (loop_vinfo).length ();
3892 (void) add_stmt_cost (target_cost_data, len, vector_stmt,
3893 NULL, NULL_TREE, 0, vect_prologue);
3894 len = LOOP_VINFO_CHECK_UNEQUAL_ADDRS (loop_vinfo).length ();
3895 if (len)
3896 /* Count LEN - 1 ANDs and LEN comparisons. */
3897 (void) add_stmt_cost (target_cost_data, len * 2 - 1,
3898 scalar_stmt, NULL, NULL_TREE, 0, vect_prologue);
3899 len = LOOP_VINFO_LOWER_BOUNDS (loop_vinfo).length ();
3900 if (len)
3902 /* Count LEN - 1 ANDs and LEN comparisons. */
3903 unsigned int nstmts = len * 2 - 1;
3904 /* +1 for each bias that needs adding. */
3905 for (unsigned int i = 0; i < len; ++i)
3906 if (!LOOP_VINFO_LOWER_BOUNDS (loop_vinfo)[i].unsigned_p)
3907 nstmts += 1;
3908 (void) add_stmt_cost (target_cost_data, nstmts,
3909 scalar_stmt, NULL, NULL_TREE, 0, vect_prologue);
3911 if (dump_enabled_p ())
3912 dump_printf (MSG_NOTE,
3913 "cost model: Adding cost of checks for loop "
3914 "versioning aliasing.\n");
3917 /* Requires loop versioning with niter checks. */
3918 if (LOOP_REQUIRES_VERSIONING_FOR_NITERS (loop_vinfo))
3920 /* FIXME: Make cost depend on complexity of individual check. */
3921 (void) add_stmt_cost (target_cost_data, 1, vector_stmt,
3922 NULL, NULL_TREE, 0, vect_prologue);
3923 if (dump_enabled_p ())
3924 dump_printf (MSG_NOTE,
3925 "cost model: Adding cost of checks for loop "
3926 "versioning niters.\n");
3929 if (LOOP_REQUIRES_VERSIONING (loop_vinfo))
3930 (void) add_stmt_cost (target_cost_data, 1, cond_branch_taken,
3931 NULL, NULL_TREE, 0, vect_prologue);
3933 /* Count statements in scalar loop. Using this as scalar cost for a single
3934 iteration for now.
3936 TODO: Add outer loop support.
3938 TODO: Consider assigning different costs to different scalar
3939 statements. */
3941 scalar_single_iter_cost
3942 = LOOP_VINFO_SINGLE_SCALAR_ITERATION_COST (loop_vinfo);
3944 /* Add additional cost for the peeled instructions in prologue and epilogue
3945 loop. (For fully-masked loops there will be no peeling.)
3947 FORNOW: If we don't know the value of peel_iters for prologue or epilogue
3948 at compile time, we assume it's vf/2 (the worst case would be vf-1).
3950 TODO: Build an expression that represents peel_iters for prologue and
3951 epilogue to be used in a run-time test. */
3953 bool prologue_need_br_taken_cost = false;
3954 bool prologue_need_br_not_taken_cost = false;
3956 /* Calculate peel_iters_prologue. */
3957 if (vect_use_loop_mask_for_alignment_p (loop_vinfo))
3958 peel_iters_prologue = 0;
3959 else if (npeel < 0)
3961 peel_iters_prologue = assumed_vf / 2;
3962 if (dump_enabled_p ())
3963 dump_printf (MSG_NOTE, "cost model: "
3964 "prologue peel iters set to vf/2.\n");
3966 /* If peeled iterations are unknown, count a taken branch and a not taken
3967 branch per peeled loop. Even if scalar loop iterations are known,
3968 vector iterations are not known since peeled prologue iterations are
3969 not known. Hence guards remain the same. */
3970 prologue_need_br_taken_cost = true;
3971 prologue_need_br_not_taken_cost = true;
3973 else
3975 peel_iters_prologue = npeel;
3976 if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo) && peel_iters_prologue > 0)
3977 /* If peeled iterations are known but the number of scalar loop
3978 iterations is unknown, count a taken branch per peeled loop. */
3979 prologue_need_br_taken_cost = true;
3982 bool epilogue_need_br_taken_cost = false;
3983 bool epilogue_need_br_not_taken_cost = false;
3985 /* Calculate peel_iters_epilogue. */
3986 if (LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))
3987 /* We need to peel exactly one iteration for gaps. */
3988 peel_iters_epilogue = LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo) ? 1 : 0;
3989 else if (npeel < 0)
3991 /* If peeling for alignment is unknown, the loop bound of the main loop
3992 becomes unknown. */
3993 peel_iters_epilogue = assumed_vf / 2;
3994 if (dump_enabled_p ())
3995 dump_printf (MSG_NOTE, "cost model: "
3996 "epilogue peel iters set to vf/2 because "
3997 "peeling for alignment is unknown.\n");
3999 /* See the same reason above in peel_iters_prologue calculation. */
4000 epilogue_need_br_taken_cost = true;
4001 epilogue_need_br_not_taken_cost = true;
4003 else
4005 peel_iters_epilogue = vect_get_peel_iters_epilogue (loop_vinfo, npeel);
4006 if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo) && peel_iters_epilogue > 0)
4007 /* If peeled iterations are known but the number of scalar loop
4008 iterations is unknown, count a taken branch per peeled loop. */
4009 epilogue_need_br_taken_cost = true;
4012 stmt_info_for_cost *si;
4013 int j;
4014 /* Add costs associated with peel_iters_prologue. */
4015 if (peel_iters_prologue)
4016 FOR_EACH_VEC_ELT (LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo), j, si)
4018 (void) add_stmt_cost (target_cost_data,
4019 si->count * peel_iters_prologue, si->kind,
4020 si->stmt_info, si->vectype, si->misalign,
4021 vect_prologue);
4024 /* Add costs associated with peel_iters_epilogue. */
4025 if (peel_iters_epilogue)
4026 FOR_EACH_VEC_ELT (LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo), j, si)
4028 (void) add_stmt_cost (target_cost_data,
4029 si->count * peel_iters_epilogue, si->kind,
4030 si->stmt_info, si->vectype, si->misalign,
4031 vect_epilogue);
4034 /* Add possible cond_branch_taken/cond_branch_not_taken cost. */
4036 if (prologue_need_br_taken_cost)
4037 (void) add_stmt_cost (target_cost_data, 1, cond_branch_taken,
4038 NULL, NULL_TREE, 0, vect_prologue);
4040 if (prologue_need_br_not_taken_cost)
4041 (void) add_stmt_cost (target_cost_data, 1,
4042 cond_branch_not_taken, NULL, NULL_TREE, 0,
4043 vect_prologue);
4045 if (epilogue_need_br_taken_cost)
4046 (void) add_stmt_cost (target_cost_data, 1, cond_branch_taken,
4047 NULL, NULL_TREE, 0, vect_epilogue);
4049 if (epilogue_need_br_not_taken_cost)
4050 (void) add_stmt_cost (target_cost_data, 1,
4051 cond_branch_not_taken, NULL, NULL_TREE, 0,
4052 vect_epilogue);
4054 /* Take care of special costs for rgroup controls of partial vectors. */
4055 if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
4057 /* Calculate how many masks we need to generate. */
4058 unsigned int num_masks = 0;
4059 rgroup_controls *rgm;
4060 unsigned int num_vectors_m1;
4061 FOR_EACH_VEC_ELT (LOOP_VINFO_MASKS (loop_vinfo), num_vectors_m1, rgm)
4062 if (rgm->type)
4063 num_masks += num_vectors_m1 + 1;
4064 gcc_assert (num_masks > 0);
4066 /* In the worst case, we need to generate each mask in the prologue
4067 and in the loop body. One of the loop body mask instructions
4068 replaces the comparison in the scalar loop, and since we don't
4069 count the scalar comparison against the scalar body, we shouldn't
4070 count that vector instruction against the vector body either.
4072 Sometimes we can use unpacks instead of generating prologue
4073 masks and sometimes the prologue mask will fold to a constant,
4074 so the actual prologue cost might be smaller. However, it's
4075 simpler and safer to use the worst-case cost; if this ends up
4076 being the tie-breaker between vectorizing or not, then it's
4077 probably better not to vectorize. */
4078 (void) add_stmt_cost (target_cost_data, num_masks,
4079 vector_stmt, NULL, NULL_TREE, 0, vect_prologue);
4080 (void) add_stmt_cost (target_cost_data, num_masks - 1,
4081 vector_stmt, NULL, NULL_TREE, 0, vect_body);
4083 else if (LOOP_VINFO_FULLY_WITH_LENGTH_P (loop_vinfo))
4085 /* Referring to the functions vect_set_loop_condition_partial_vectors
4086 and vect_set_loop_controls_directly, we need to generate each
4087 length in the prologue and in the loop body if required. Although
4088 there are some possible optimizations, we consider the worst case
4089 here. */
4091 bool niters_known_p = LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo);
4092 bool need_iterate_p
4093 = (!LOOP_VINFO_EPILOGUE_P (loop_vinfo)
4094 && !vect_known_niters_smaller_than_vf (loop_vinfo));
4096 /* Calculate how many statements to be added. */
4097 unsigned int prologue_stmts = 0;
4098 unsigned int body_stmts = 0;
4100 rgroup_controls *rgc;
4101 unsigned int num_vectors_m1;
4102 FOR_EACH_VEC_ELT (LOOP_VINFO_LENS (loop_vinfo), num_vectors_m1, rgc)
4103 if (rgc->type)
4105 /* May need one SHIFT for nitems_total computation. */
4106 unsigned nitems = rgc->max_nscalars_per_iter * rgc->factor;
4107 if (nitems != 1 && !niters_known_p)
4108 prologue_stmts += 1;
4110 /* May need one MAX and one MINUS for wrap around. */
4111 if (vect_rgroup_iv_might_wrap_p (loop_vinfo, rgc))
4112 prologue_stmts += 2;
4114 /* Need one MAX and one MINUS for each batch limit except for
4115 the first one. */
4116 prologue_stmts += num_vectors_m1 * 2;
4118 unsigned int num_vectors = num_vectors_m1 + 1;
4120 /* Need to set up lengths in prologue, only one MIN required
4121 for each since start index is zero. */
4122 prologue_stmts += num_vectors;
4124 /* Each may need two MINs and one MINUS to update lengths in body
4125 for next iteration. */
4126 if (need_iterate_p)
4127 body_stmts += 3 * num_vectors;
4130 (void) add_stmt_cost (target_cost_data, prologue_stmts,
4131 scalar_stmt, NULL, NULL_TREE, 0, vect_prologue);
4132 (void) add_stmt_cost (target_cost_data, body_stmts,
4133 scalar_stmt, NULL, NULL_TREE, 0, vect_body);
4136 /* FORNOW: The scalar outside cost is incremented in one of the
4137 following ways:
4139 1. The vectorizer checks for alignment and aliasing and generates
4140 a condition that allows dynamic vectorization. A cost model
4141 check is ANDED with the versioning condition. Hence scalar code
4142 path now has the added cost of the versioning check.
4144 if (cost > th & versioning_check)
4145 jmp to vector code
4147 Hence the run-time scalar cost is incremented by a not-taken branch cost.
4149 2. The vectorizer then checks if a prologue is required. If the
4150 cost model check was not done before during versioning, it has to
4151 be done before the prologue check.
4153 if (cost <= th)
4154 prologue = scalar_iters
4155 if (prologue == 0)
4156 jmp to vector code
4157 else
4158 execute prologue
4159 if (prologue == num_iters)
4160 go to exit
4162 Hence the run-time scalar cost is incremented by a taken branch,
4163 plus a not-taken branch, plus a taken branch cost.
4165 3. The vectorizer then checks if an epilogue is required. If the
4166 cost model check was not done before during prologue check, it
4167 has to be done with the epilogue check.
4169 if (prologue == 0)
4170 jmp to vector code
4171 else
4172 execute prologue
4173 if (prologue == num_iters)
4174 go to exit
4175 vector code:
4176 if ((cost <= th) | (scalar_iters-prologue-epilogue == 0))
4177 jmp to epilogue
4179 Hence the run-time scalar cost should be incremented by 2 taken
4180 branches.
4182 TODO: The back end may reorder the BBs differently and reverse
4183 conditions/branch directions. Change the estimates below to
4184 something more reasonable. */
4186 /* If the number of iterations is known and we do not do versioning, we can
4187 decide whether to vectorize at compile time. Hence the scalar version
4188 does not carry cost model guard costs. */
4189 if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
4190 || LOOP_REQUIRES_VERSIONING (loop_vinfo))
4192 /* Cost model check occurs at versioning. */
4193 if (LOOP_REQUIRES_VERSIONING (loop_vinfo))
4194 scalar_outside_cost += vect_get_stmt_cost (cond_branch_not_taken);
4195 else
4197 /* Cost model check occurs at prologue generation. */
4198 if (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) < 0)
4199 scalar_outside_cost += 2 * vect_get_stmt_cost (cond_branch_taken)
4200 + vect_get_stmt_cost (cond_branch_not_taken);
4201 /* Cost model check occurs at epilogue generation. */
4202 else
4203 scalar_outside_cost += 2 * vect_get_stmt_cost (cond_branch_taken);
4207 /* Complete the target-specific cost calculations. */
4208 finish_cost (LOOP_VINFO_TARGET_COST_DATA (loop_vinfo), &vec_prologue_cost,
4209 &vec_inside_cost, &vec_epilogue_cost);
4211 vec_outside_cost = (int)(vec_prologue_cost + vec_epilogue_cost);
4213 /* Stash the costs so that we can compare two loop_vec_infos. */
4214 loop_vinfo->vec_inside_cost = vec_inside_cost;
4215 loop_vinfo->vec_outside_cost = vec_outside_cost;
4217 if (dump_enabled_p ())
4219 dump_printf_loc (MSG_NOTE, vect_location, "Cost model analysis: \n");
4220 dump_printf (MSG_NOTE, " Vector inside of loop cost: %d\n",
4221 vec_inside_cost);
4222 dump_printf (MSG_NOTE, " Vector prologue cost: %d\n",
4223 vec_prologue_cost);
4224 dump_printf (MSG_NOTE, " Vector epilogue cost: %d\n",
4225 vec_epilogue_cost);
4226 dump_printf (MSG_NOTE, " Scalar iteration cost: %d\n",
4227 scalar_single_iter_cost);
4228 dump_printf (MSG_NOTE, " Scalar outside cost: %d\n",
4229 scalar_outside_cost);
4230 dump_printf (MSG_NOTE, " Vector outside cost: %d\n",
4231 vec_outside_cost);
4232 dump_printf (MSG_NOTE, " prologue iterations: %d\n",
4233 peel_iters_prologue);
4234 dump_printf (MSG_NOTE, " epilogue iterations: %d\n",
4235 peel_iters_epilogue);
4238 /* Calculate number of iterations required to make the vector version
4239 profitable, relative to the loop bodies only. The following condition
4240 must hold true:
4241 SIC * niters + SOC > VIC * ((niters - NPEEL) / VF) + VOC
4242 where
4243 SIC = scalar iteration cost, VIC = vector iteration cost,
4244 VOC = vector outside cost, VF = vectorization factor,
4245 NPEEL = prologue iterations + epilogue iterations,
4246 SOC = scalar outside cost for run time cost model check. */
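/* Worked instance (added for exposition; the numbers are hypothetical):
   with SIC = 4, VIC = 6, VF = 4, NPEEL = 2, VOC = 40 and SOC = 0 the
   condition 4 * niters > 6 * ((niters - 2) / 4) + 40 first holds at
   niters = 15, so roughly 15 scalar iterations are needed before the
   vector loop pays for its outside-of-loop overhead.  */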
4248 int saving_per_viter = (scalar_single_iter_cost * assumed_vf
4249 - vec_inside_cost);
4250 if (saving_per_viter <= 0)
4252 if (LOOP_VINFO_LOOP (loop_vinfo)->force_vectorize)
4253 warning_at (vect_location.get_location_t (), OPT_Wopenmp_simd,
4254 "vectorization did not happen for a simd loop");
4256 if (dump_enabled_p ())
4257 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4258 "cost model: the vector iteration cost = %d "
4259 "divided by the scalar iteration cost = %d "
4260 "is greater or equal to the vectorization factor = %d"
4261 ".\n",
4262 vec_inside_cost, scalar_single_iter_cost, assumed_vf);
4263 *ret_min_profitable_niters = -1;
4264 *ret_min_profitable_estimate = -1;
4265 return;
4268 /* ??? The "if" arm is written to handle all cases; see below for what
4269 we would do for !LOOP_VINFO_USING_PARTIAL_VECTORS_P. */
4270 if (LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))
4272 /* Rewriting the condition above in terms of the number of
4273 vector iterations (vniters) rather than the number of
4274 scalar iterations (niters) gives:
4276 SIC * (vniters * VF + NPEEL) + SOC > VIC * vniters + VOC
4278 <==> vniters * (SIC * VF - VIC) > VOC - SIC * NPEEL - SOC
4280 For integer N, X and Y when X > 0:
4282 N * X > Y <==> N >= (Y /[floor] X) + 1. */
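/* Worked instance of the identity above (added for exposition):
   with X = 10 (saving_per_viter) and Y = 37 (outside_overhead), the
   smallest N with 10 * N > 37 is 37 / 10 + 1 = 4.  */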
4283 int outside_overhead = (vec_outside_cost
4284 - scalar_single_iter_cost * peel_iters_prologue
4285 - scalar_single_iter_cost * peel_iters_epilogue
4286 - scalar_outside_cost);
4287 /* We're only interested in cases that require at least one
4288 vector iteration. */
4289 int min_vec_niters = 1;
4290 if (outside_overhead > 0)
4291 min_vec_niters = outside_overhead / saving_per_viter + 1;
4293 if (dump_enabled_p ())
4294 dump_printf (MSG_NOTE, " Minimum number of vector iterations: %d\n",
4295 min_vec_niters);
4297 if (LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))
4299 /* Now that we know the minimum number of vector iterations,
4300 find the minimum niters for which the scalar cost is larger:
4302 SIC * niters > VIC * vniters + VOC - SOC
4304 We know that the minimum niters is no more than
4305 vniters * VF + NPEEL, but it might be (and often is) less
4306 than that if a partial vector iteration is cheaper than the
4307 equivalent scalar code. */
4308 int threshold = (vec_inside_cost * min_vec_niters
4309 + vec_outside_cost
4310 - scalar_outside_cost);
4311 if (threshold <= 0)
4312 min_profitable_iters = 1;
4313 else
4314 min_profitable_iters = threshold / scalar_single_iter_cost + 1;
4316 else
4317 /* Convert the number of vector iterations into a number of
4318 scalar iterations. */
4319 min_profitable_iters = (min_vec_niters * assumed_vf
4320 + peel_iters_prologue
4321 + peel_iters_epilogue);
4323 else
4325 min_profitable_iters = ((vec_outside_cost - scalar_outside_cost)
4326 * assumed_vf
4327 - vec_inside_cost * peel_iters_prologue
4328 - vec_inside_cost * peel_iters_epilogue);
4329 if (min_profitable_iters <= 0)
4330 min_profitable_iters = 0;
4331 else
4333 min_profitable_iters /= saving_per_viter;
4335 if ((scalar_single_iter_cost * assumed_vf * min_profitable_iters)
4336 <= (((int) vec_inside_cost * min_profitable_iters)
4337 + (((int) vec_outside_cost - scalar_outside_cost)
4338 * assumed_vf)))
4339 min_profitable_iters++;
4343 if (dump_enabled_p ())
4344 dump_printf (MSG_NOTE,
4345 " Calculated minimum iters for profitability: %d\n",
4346 min_profitable_iters);
4348 if (!LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo)
4349 && min_profitable_iters < (assumed_vf + peel_iters_prologue))
4350 /* We want the vectorized loop to execute at least once. */
4351 min_profitable_iters = assumed_vf + peel_iters_prologue;
4352 else if (min_profitable_iters < peel_iters_prologue)
4353 /* For LOOP_VINFO_USING_PARTIAL_VECTORS_P, we need to ensure the
4354 vectorized loop executes at least once. */
4355 min_profitable_iters = peel_iters_prologue;
4357 if (dump_enabled_p ())
4358 dump_printf_loc (MSG_NOTE, vect_location,
4359 " Runtime profitability threshold = %d\n",
4360 min_profitable_iters);
4362 *ret_min_profitable_niters = min_profitable_iters;
4364 /* Calculate number of iterations required to make the vector version
4365 profitable, relative to the loop bodies only.
4367 The non-vectorized variant costs SIC * niters and it must win over the
4368 vector variant on the expected loop trip count. The following must hold:
4369 SIC * niters > VIC * ((niters - NPEEL) / VF) + VOC + SOC */
4371 if (vec_outside_cost <= 0)
4372 min_profitable_estimate = 0;
4373 /* ??? This "else if" arm is written to handle all cases; see below for
4374 what we would do for !LOOP_VINFO_USING_PARTIAL_VECTORS_P. */
4375 else if (LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))
4377 /* This is a repeat of the code above, but with + SOC rather
4378 than - SOC. */
4379 int outside_overhead = (vec_outside_cost
4380 - scalar_single_iter_cost * peel_iters_prologue
4381 - scalar_single_iter_cost * peel_iters_epilogue
4382 + scalar_outside_cost);
4383 int min_vec_niters = 1;
4384 if (outside_overhead > 0)
4385 min_vec_niters = outside_overhead / saving_per_viter + 1;
4387 if (LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))
4389 int threshold = (vec_inside_cost * min_vec_niters
4390 + vec_outside_cost
4391 + scalar_outside_cost);
4392 min_profitable_estimate = threshold / scalar_single_iter_cost + 1;
4394 else
4395 min_profitable_estimate = (min_vec_niters * assumed_vf
4396 + peel_iters_prologue
4397 + peel_iters_epilogue);
4399 else
4401 min_profitable_estimate = ((vec_outside_cost + scalar_outside_cost)
4402 * assumed_vf
4403 - vec_inside_cost * peel_iters_prologue
4404 - vec_inside_cost * peel_iters_epilogue)
4405 / ((scalar_single_iter_cost * assumed_vf)
4406 - vec_inside_cost);
4408 min_profitable_estimate = MAX (min_profitable_estimate, min_profitable_iters);
4409 if (dump_enabled_p ())
4410 dump_printf_loc (MSG_NOTE, vect_location,
4411 " Static estimate profitability threshold = %d\n",
4412 min_profitable_estimate);
4414 *ret_min_profitable_estimate = min_profitable_estimate;
4417 /* Writes into SEL a mask for a vec_perm, equivalent to a vec_shr by OFFSET
4418 vector elements (not bits) for a vector with NELT elements. */
4419 static void
4420 calc_vec_perm_mask_for_shift (unsigned int offset, unsigned int nelt,
4421 vec_perm_builder *sel)
4423 /* The encoding is a single stepped pattern. Any wrap-around is handled
4424 by vec_perm_indices. */
4425 sel->new_vector (nelt, 1, 3);
4426 for (unsigned int i = 0; i < 3; i++)
4427 sel->quick_push (i + offset);
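/* Illustrative sketch (added for exposition, not part of this pass):
   the full selector the stepped encoding above expands to, computed
   with plain ints.  For OFFSET = 2 and NELT = 8 it is
   {2, 3, 4, 5, 6, 7, 8, 9}; indices >= NELT refer to the second
   vec_perm operand.  The sketch_* name is hypothetical.  */
static void
sketch_shift_sel (unsigned int offset, unsigned int nelt,
		  unsigned int *sel)
{
  /* Single stepped pattern: element I selects input element OFFSET + I.  */
  for (unsigned int i = 0; i < nelt; i++)
    sel[i] = offset + i;
}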
4430 /* Checks whether the target supports whole-vector shifts for vectors of mode
4431 MODE. This is the case if _either_ the platform handles vec_shr_optab, _or_
4432 it supports vec_perm_const with masks for all necessary shift amounts. */
4433 static bool
4434 have_whole_vector_shift (machine_mode mode)
4436 if (optab_handler (vec_shr_optab, mode) != CODE_FOR_nothing)
4437 return true;
4439 /* Variable-length vectors should be handled via the optab. */
4440 unsigned int nelt;
4441 if (!GET_MODE_NUNITS (mode).is_constant (&nelt))
4442 return false;
4444 vec_perm_builder sel;
4445 vec_perm_indices indices;
4446 for (unsigned int i = nelt / 2; i >= 1; i /= 2)
4448 calc_vec_perm_mask_for_shift (i, nelt, &sel);
4449 indices.new_vector (sel, 2, nelt);
4450 if (!can_vec_perm_const_p (mode, indices, false))
4451 return false;
4453 return true;
4456 /* TODO: Close dependency between vect_model_*_cost and vectorizable_*
4457 functions. Design better to avoid maintenance issues. */
4459 /* Function vect_model_reduction_cost.
4461 Models cost for a reduction operation, including the vector ops
4462 generated within the strip-mine loop in some cases, the initial
4463 definition before the loop, and the epilogue code that must be generated. */
4465 static void
4466 vect_model_reduction_cost (loop_vec_info loop_vinfo,
4467 stmt_vec_info stmt_info, internal_fn reduc_fn,
4468 vect_reduction_type reduction_type,
4469 int ncopies, stmt_vector_for_cost *cost_vec)
4471 int prologue_cost = 0, epilogue_cost = 0, inside_cost = 0;
4472 enum tree_code code;
4473 optab optab;
4474 tree vectype;
4475 machine_mode mode;
4476 class loop *loop = NULL;
4478 if (loop_vinfo)
4479 loop = LOOP_VINFO_LOOP (loop_vinfo);
4481 /* Condition reductions generate two reductions in the loop. */
4482 if (reduction_type == COND_REDUCTION)
4483 ncopies *= 2;
4485 vectype = STMT_VINFO_VECTYPE (stmt_info);
4486 mode = TYPE_MODE (vectype);
4487 stmt_vec_info orig_stmt_info = vect_orig_stmt (stmt_info);
4489 code = gimple_assign_rhs_code (orig_stmt_info->stmt);
4491 if (reduction_type == EXTRACT_LAST_REDUCTION)
4492 /* No extra instructions are needed in the prologue. The loop body
4493 operations are costed in vectorizable_condition. */
4494 inside_cost = 0;
4495 else if (reduction_type == FOLD_LEFT_REDUCTION)
4497 /* No extra instructions needed in the prologue. */
4498 prologue_cost = 0;
4500 if (reduc_fn != IFN_LAST)
4501 /* Count one reduction-like operation per vector. */
4502 inside_cost = record_stmt_cost (cost_vec, ncopies, vec_to_scalar,
4503 stmt_info, 0, vect_body);
4504 else
4506 /* Use NELEMENTS extracts and NELEMENTS scalar ops. */
4507 unsigned int nelements = ncopies * vect_nunits_for_cost (vectype);
4508 inside_cost = record_stmt_cost (cost_vec, nelements,
4509 vec_to_scalar, stmt_info, 0,
4510 vect_body);
4511 inside_cost += record_stmt_cost (cost_vec, nelements,
4512 scalar_stmt, stmt_info, 0,
4513 vect_body);
4516 else
4518 /* Add in cost for initial definition.
4519 For cond reduction we have four vectors: initial index, step,
4520 initial result of the data reduction, initial value of the index
4521 reduction. */
4522 int prologue_stmts = reduction_type == COND_REDUCTION ? 4 : 1;
4523 prologue_cost += record_stmt_cost (cost_vec, prologue_stmts,
4524 scalar_to_vec, stmt_info, 0,
4525 vect_prologue);
4528 /* Determine cost of epilogue code.
4530 We have a reduction operator that will reduce the vector in one statement.
4531 Also requires scalar extract. */
4533 if (!loop || !nested_in_vect_loop_p (loop, orig_stmt_info))
4535 if (reduc_fn != IFN_LAST)
4537 if (reduction_type == COND_REDUCTION)
4539 /* An EQ stmt and a COND_EXPR stmt. */
4540 epilogue_cost += record_stmt_cost (cost_vec, 2,
4541 vector_stmt, stmt_info, 0,
4542 vect_epilogue);
4543 /* Reduction of the max index and a reduction of the found
4544 values. */
4545 epilogue_cost += record_stmt_cost (cost_vec, 2,
4546 vec_to_scalar, stmt_info, 0,
4547 vect_epilogue);
4548 /* A broadcast of the max value. */
4549 epilogue_cost += record_stmt_cost (cost_vec, 1,
4550 scalar_to_vec, stmt_info, 0,
4551 vect_epilogue);
4553 else
4555 epilogue_cost += record_stmt_cost (cost_vec, 1, vector_stmt,
4556 stmt_info, 0, vect_epilogue);
4557 epilogue_cost += record_stmt_cost (cost_vec, 1,
4558 vec_to_scalar, stmt_info, 0,
4559 vect_epilogue);
4562 else if (reduction_type == COND_REDUCTION)
4564 unsigned estimated_nunits = vect_nunits_for_cost (vectype);
4565 /* Extraction of scalar elements. */
4566 epilogue_cost += record_stmt_cost (cost_vec,
4567 2 * estimated_nunits,
4568 vec_to_scalar, stmt_info, 0,
4569 vect_epilogue);
4570 /* Scalar max reductions via COND_EXPR / MAX_EXPR. */
4571 epilogue_cost += record_stmt_cost (cost_vec,
4572 2 * estimated_nunits - 3,
4573 scalar_stmt, stmt_info, 0,
4574 vect_epilogue);
4576 else if (reduction_type == EXTRACT_LAST_REDUCTION
4577 || reduction_type == FOLD_LEFT_REDUCTION)
4578 /* No extra instructions are needed in the epilogue. */
4580 else
4582 int vec_size_in_bits = tree_to_uhwi (TYPE_SIZE (vectype));
4583 tree bitsize =
4584 TYPE_SIZE (TREE_TYPE (gimple_assign_lhs (orig_stmt_info->stmt)));
4585 int element_bitsize = tree_to_uhwi (bitsize);
4586 int nelements = vec_size_in_bits / element_bitsize;
4588 if (code == COND_EXPR)
4589 code = MAX_EXPR;
4591 optab = optab_for_tree_code (code, vectype, optab_default);
4593 /* We have a whole vector shift available. */
4594 if (optab != unknown_optab
4595 && VECTOR_MODE_P (mode)
4596 && optab_handler (optab, mode) != CODE_FOR_nothing
4597 && have_whole_vector_shift (mode))
4599 /* Final reduction via vector shifts and the reduction operator.
4600 Also requires scalar extract. */
4601 epilogue_cost += record_stmt_cost (cost_vec,
4602 exact_log2 (nelements) * 2,
4603 vector_stmt, stmt_info, 0,
4604 vect_epilogue);
4605 epilogue_cost += record_stmt_cost (cost_vec, 1,
4606 vec_to_scalar, stmt_info, 0,
4607 vect_epilogue);
4609 else
4610 /* Use extracts and reduction op for final reduction. For N
4611 elements, we have N extracts and N-1 reduction ops. */
4612 epilogue_cost += record_stmt_cost (cost_vec,
4613 nelements + nelements - 1,
4614 vector_stmt, stmt_info, 0,
4615 vect_epilogue);
4619 if (dump_enabled_p ())
4620 dump_printf (MSG_NOTE,
4621 "vect_model_reduction_cost: inside_cost = %d, "
4622 "prologue_cost = %d, epilogue_cost = %d .\n", inside_cost,
4623 prologue_cost, epilogue_cost);
4626 /* SEQ is a sequence of instructions that initialize the reduction
4627 described by REDUC_INFO. Emit them in the appropriate place. */
4629 static void
4630 vect_emit_reduction_init_stmts (loop_vec_info loop_vinfo,
4631 stmt_vec_info reduc_info, gimple *seq)
4633 if (reduc_info->reused_accumulator)
4635 /* When reusing an accumulator from the main loop, we only need
4636 initialization instructions if the main loop can be skipped.
4637 In that case, emit the initialization instructions at the end
4638 of the guard block that does the skip. */
4639 edge skip_edge = loop_vinfo->skip_main_loop_edge;
4640 gcc_assert (skip_edge);
4641 gimple_stmt_iterator gsi = gsi_last_bb (skip_edge->src);
4642 gsi_insert_seq_before (&gsi, seq, GSI_SAME_STMT);
4644 else
4646 /* The normal case: emit the initialization instructions on the
4647 preheader edge. */
4648 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
4649 gsi_insert_seq_on_edge_immediate (loop_preheader_edge (loop), seq);
4653 /* Function get_initial_def_for_reduction
4655 Input:
4656 REDUC_INFO - the info_for_reduction
4657 INIT_VAL - the initial value of the reduction variable
4658 NEUTRAL_OP - a value that has no effect on the reduction, as per
4659 neutral_op_for_reduction
4661 Output:
4662 Return a vector variable, initialized according to the operation that
4663 STMT_VINFO performs. This vector will be used as the initial value
4664 of the vector of partial results.
4666 The value we need is a vector in which element 0 has value INIT_VAL
4667 and every other element has value NEUTRAL_OP. */
4669 static tree
4670 get_initial_def_for_reduction (loop_vec_info loop_vinfo,
4671 stmt_vec_info reduc_info,
4672 tree init_val, tree neutral_op)
4674 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
4675 tree scalar_type = TREE_TYPE (init_val);
4676 tree vectype = get_vectype_for_scalar_type (loop_vinfo, scalar_type);
4677 tree init_def;
4678 gimple_seq stmts = NULL;
4680 gcc_assert (vectype);
4682 gcc_assert (POINTER_TYPE_P (scalar_type) || INTEGRAL_TYPE_P (scalar_type)
4683 || SCALAR_FLOAT_TYPE_P (scalar_type));
4685 gcc_assert (nested_in_vect_loop_p (loop, reduc_info)
4686 || loop == (gimple_bb (reduc_info->stmt))->loop_father);
4688 if (operand_equal_p (init_val, neutral_op))
4690 /* If both elements are equal then the vector described above is
4691 just a splat. */
4692 neutral_op = gimple_convert (&stmts, TREE_TYPE (vectype), neutral_op);
4693 init_def = gimple_build_vector_from_val (&stmts, vectype, neutral_op);
4695 else
4697 neutral_op = gimple_convert (&stmts, TREE_TYPE (vectype), neutral_op);
4698 init_val = gimple_convert (&stmts, TREE_TYPE (vectype), init_val);
4699 if (!TYPE_VECTOR_SUBPARTS (vectype).is_constant ())
4701 /* Construct a splat of NEUTRAL_OP and insert INIT_VAL into
4702 element 0. */
4703 init_def = gimple_build_vector_from_val (&stmts, vectype,
4704 neutral_op);
4705 init_def = gimple_build (&stmts, CFN_VEC_SHL_INSERT,
4706 vectype, init_def, init_val);
4708 else
4710 /* Build {INIT_VAL, NEUTRAL_OP, NEUTRAL_OP, ...}. */
4711 tree_vector_builder elts (vectype, 1, 2);
4712 elts.quick_push (init_val);
4713 elts.quick_push (neutral_op);
4714 init_def = gimple_build_vector (&stmts, &elts);
4718 if (stmts)
4719 vect_emit_reduction_init_stmts (loop_vinfo, reduc_info, stmts);
4720 return init_def;
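/* Illustrative sketch (added for exposition, not part of this pass):
   the vector built above, for a 4-element vector and plain ints.  For
   a sum reduction INIT_VAL goes into element 0 and the neutral value
   (0 for PLUS) everywhere else; when INIT_VAL equals NEUTRAL_OP the
   result is just a splat.  The sketch_* name is hypothetical.  */
static void
sketch_initial_def (int init_val, int neutral_op, int out[4])
{
  out[0] = init_val;	/* e.g. {5, 0, 0, 0} for init_val = 5, PLUS.  */
  for (int i = 1; i < 4; i++)
    out[i] = neutral_op;
}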
4723 /* Get at the initial defs for the reduction PHIs for REDUC_INFO,
4724 which performs a reduction involving GROUP_SIZE scalar statements.
4725 NUMBER_OF_VECTORS is the number of vector defs to create. If NEUTRAL_OP
4726 is nonnull, introducing extra elements of that value will not change the
4727 result. */
4729 static void
4730 get_initial_defs_for_reduction (loop_vec_info loop_vinfo,
4731 stmt_vec_info reduc_info,
4732 vec<tree> *vec_oprnds,
4733 unsigned int number_of_vectors,
4734 unsigned int group_size, tree neutral_op)
4736 vec<tree> &initial_values = reduc_info->reduc_initial_values;
4737 unsigned HOST_WIDE_INT nunits;
4738 unsigned j, number_of_places_left_in_vector;
4739 tree vector_type = STMT_VINFO_VECTYPE (reduc_info);
4740 unsigned int i;
4742 gcc_assert (group_size == initial_values.length () || neutral_op);
4744 /* NUMBER_OF_COPIES is the number of times we need to use the same values in
4745 created vectors. It is greater than 1 if unrolling is performed.
4747 For example, we have two scalar operands, s1 and s2 (e.g., group of
4748 strided accesses of size two), while NUNITS is four (i.e., four scalars
4749 of this type can be packed in a vector). The output vector will contain
4750 two copies of each scalar operand: {s1, s2, s1, s2}. (NUMBER_OF_COPIES
4751 will be 2).
4753 If REDUC_GROUP_SIZE > NUNITS, the scalars will be split into several
4754 vectors containing the operands.
4756 For example, NUNITS is four as before, and the group size is 8
4757 (s1, s2, ..., s8). We will create two vectors {s1, s2, s3, s4} and
4758 {s5, s6, s7, s8}. */
4760 if (!TYPE_VECTOR_SUBPARTS (vector_type).is_constant (&nunits))
4761 nunits = group_size;
4763 number_of_places_left_in_vector = nunits;
4764 bool constant_p = true;
4765 tree_vector_builder elts (vector_type, nunits, 1);
4766 elts.quick_grow (nunits);
4767 gimple_seq ctor_seq = NULL;
4768 for (j = 0; j < nunits * number_of_vectors; ++j)
4770 tree op;
4771 i = j % group_size;
4773 /* Get the def before the loop. In a reduction chain we have only
4774 one initial value. Else we have as many as there are PHIs in the group. */
4775 if (i >= initial_values.length () || (j > i && neutral_op))
4776 op = neutral_op;
4777 else
4778 op = initial_values[i];
4780 /* Create 'vect_ = {op0,op1,...,opn}'. */
4781 number_of_places_left_in_vector--;
4782 elts[nunits - number_of_places_left_in_vector - 1] = op;
4783 if (!CONSTANT_CLASS_P (op))
4784 constant_p = false;
4786 if (number_of_places_left_in_vector == 0)
4788 tree init;
4789 if (constant_p && !neutral_op
4790 ? multiple_p (TYPE_VECTOR_SUBPARTS (vector_type), nunits)
4791 : known_eq (TYPE_VECTOR_SUBPARTS (vector_type), nunits))
4792 /* Build the vector directly from ELTS. */
4793 init = gimple_build_vector (&ctor_seq, &elts);
4794 else if (neutral_op)
4796 /* Build a vector of the neutral value and shift the
4797 other elements into place. */
4798 init = gimple_build_vector_from_val (&ctor_seq, vector_type,
4799 neutral_op);
4800 int k = nunits;
4801 while (k > 0 && elts[k - 1] == neutral_op)
4802 k -= 1;
4803 while (k > 0)
4805 k -= 1;
4806 init = gimple_build (&ctor_seq, CFN_VEC_SHL_INSERT,
4807 vector_type, init, elts[k]);
4810 else
4812 /* First time round, duplicate ELTS to fill the
4813 required number of vectors. */
4814 duplicate_and_interleave (loop_vinfo, &ctor_seq, vector_type,
4815 elts, number_of_vectors, *vec_oprnds);
4816 break;
4818 vec_oprnds->quick_push (init);
4820 number_of_places_left_in_vector = nunits;
4821 elts.new_vector (vector_type, nunits, 1);
4822 elts.quick_grow (nunits);
4823 constant_p = true;
4826 if (ctor_seq != NULL)
4827 vect_emit_reduction_init_stmts (loop_vinfo, reduc_info, ctor_seq);
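/* Worked instance (added for exposition; the values are hypothetical):
   with GROUP_SIZE = 2 initial values {s1, s2}, NUNITS = 4 and a
   neutral value n, the loop above fills the first vector as
   {s1, s2, n, n}, since positions with j > i take NEUTRAL_OP.  */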
4830 /* For a statement STMT_INFO taking part in a reduction operation return
4831 the stmt_vec_info the meta information is stored on. */
4833 stmt_vec_info
4834 info_for_reduction (vec_info *vinfo, stmt_vec_info stmt_info)
4836 stmt_info = vect_orig_stmt (stmt_info);
4837 gcc_assert (STMT_VINFO_REDUC_DEF (stmt_info));
4838 if (!is_a <gphi *> (stmt_info->stmt)
4839 || !VECTORIZABLE_CYCLE_DEF (STMT_VINFO_DEF_TYPE (stmt_info)))
4840 stmt_info = STMT_VINFO_REDUC_DEF (stmt_info);
4841 gphi *phi = as_a <gphi *> (stmt_info->stmt);
4842 if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_double_reduction_def)
4844 if (gimple_phi_num_args (phi) == 1)
4845 stmt_info = STMT_VINFO_REDUC_DEF (stmt_info);
4847 else if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_nested_cycle)
4849 stmt_vec_info info = vinfo->lookup_def (vect_phi_initial_value (phi));
4850 if (info && STMT_VINFO_DEF_TYPE (info) == vect_double_reduction_def)
4851 stmt_info = info;
4853 return stmt_info;
4856 /* See if LOOP_VINFO is an epilogue loop whose main loop had a reduction that
4857 REDUC_INFO can build on. Adjust REDUC_INFO and return true if so, otherwise
4858 return false. */
4860 static bool
4861 vect_find_reusable_accumulator (loop_vec_info loop_vinfo,
4862 stmt_vec_info reduc_info)
4864 loop_vec_info main_loop_vinfo = LOOP_VINFO_ORIG_LOOP_INFO (loop_vinfo);
4865 if (!main_loop_vinfo)
4866 return false;
4868 if (STMT_VINFO_REDUC_TYPE (reduc_info) != TREE_CODE_REDUCTION)
4869 return false;
4871 unsigned int num_phis = reduc_info->reduc_initial_values.length ();
4872 auto_vec<tree, 16> main_loop_results (num_phis);
4873 auto_vec<tree, 16> initial_values (num_phis);
4874 if (edge main_loop_edge = loop_vinfo->main_loop_edge)
4876 /* The epilogue loop can be entered either from the main loop or
4877 from an earlier guard block. */
4878 edge skip_edge = loop_vinfo->skip_main_loop_edge;
4879 for (tree incoming_value : reduc_info->reduc_initial_values)
4881 /* Look for:
4883 INCOMING_VALUE = phi<MAIN_LOOP_RESULT(main loop),
4884 INITIAL_VALUE(guard block)>. */
4885 gcc_assert (TREE_CODE (incoming_value) == SSA_NAME);
4887 gphi *phi = as_a <gphi *> (SSA_NAME_DEF_STMT (incoming_value));
4888 gcc_assert (gimple_bb (phi) == main_loop_edge->dest);
4890 tree from_main_loop = PHI_ARG_DEF_FROM_EDGE (phi, main_loop_edge);
4891 tree from_skip = PHI_ARG_DEF_FROM_EDGE (phi, skip_edge);
4893 main_loop_results.quick_push (from_main_loop);
4894 initial_values.quick_push (from_skip);
4897 else
4898 /* The main loop dominates the epilogue loop. */
4899 main_loop_results.splice (reduc_info->reduc_initial_values);
4901 /* See if the main loop has the kind of accumulator we need. */
4902 vect_reusable_accumulator *accumulator
4903 = main_loop_vinfo->reusable_accumulators.get (main_loop_results[0]);
4904 if (!accumulator
4905 || num_phis != accumulator->reduc_info->reduc_scalar_results.length ()
4906 || !std::equal (main_loop_results.begin (), main_loop_results.end (),
4907 accumulator->reduc_info->reduc_scalar_results.begin ()))
4908 return false;
4910 /* Handle the case where we can reduce wider vectors to narrower ones. */
4911 tree vectype = STMT_VINFO_VECTYPE (reduc_info);
4912 tree old_vectype = TREE_TYPE (accumulator->reduc_input);
4913 if (!constant_multiple_p (TYPE_VECTOR_SUBPARTS (old_vectype),
4914 TYPE_VECTOR_SUBPARTS (vectype)))
4915 return false;
4917 /* Non-SLP reductions might apply an adjustment after the reduction
4918 operation, in order to simplify the initialization of the accumulator.
4919 If the epilogue loop carries on from where the main loop left off,
4920 it should apply the same adjustment to the final reduction result.
4922 If the epilogue loop can also be entered directly (rather than via
4923 the main loop), we need to be able to handle that case in the same way,
4924 with the same adjustment. (In principle we could add a PHI node
4925 to select the correct adjustment, but in practice that shouldn't be
4926 necessary.) */
4927 tree main_adjustment
4928 = STMT_VINFO_REDUC_EPILOGUE_ADJUSTMENT (accumulator->reduc_info);
4929 if (loop_vinfo->main_loop_edge && main_adjustment)
4931 gcc_assert (num_phis == 1);
4932 tree initial_value = initial_values[0];
4933 /* Check that we can use INITIAL_VALUE as the adjustment and
4934 initialize the accumulator with a neutral value instead. */
4935 if (!operand_equal_p (initial_value, main_adjustment))
4936 return false;
4937 tree_code code = STMT_VINFO_REDUC_CODE (reduc_info);
4938 initial_values[0] = neutral_op_for_reduction (TREE_TYPE (initial_value),
4939 code, initial_value);
4941 STMT_VINFO_REDUC_EPILOGUE_ADJUSTMENT (reduc_info) = main_adjustment;
4942 reduc_info->reduc_initial_values.truncate (0);
4943 reduc_info->reduc_initial_values.splice (initial_values);
4944 reduc_info->reused_accumulator = accumulator;
4945 return true;
4948 /* Reduce the vector VEC_DEF down to VECTYPE with reduction operation
4949 CODE, appending the emitted stmts to SEQ. Returns a vector def of VECTYPE. */
4951 static tree
4952 vect_create_partial_epilog (tree vec_def, tree vectype, enum tree_code code,
4953 gimple_seq *seq)
4955 unsigned nunits = TYPE_VECTOR_SUBPARTS (TREE_TYPE (vec_def)).to_constant ();
4956 unsigned nunits1 = TYPE_VECTOR_SUBPARTS (vectype).to_constant ();
4957 tree stype = TREE_TYPE (vectype);
4958 tree new_temp = vec_def;
4959 while (nunits > nunits1)
4961 nunits /= 2;
4962 tree vectype1 = get_related_vectype_for_scalar_type (TYPE_MODE (vectype),
4963 stype, nunits);
4964 unsigned int bitsize = tree_to_uhwi (TYPE_SIZE (vectype1));
4966 /* The target has to make sure we support lowpart/highpart
4967 extraction, either via direct vector extract or through
4968 integer mode punning. */
4969 tree dst1, dst2;
4970 gimple *epilog_stmt;
4971 if (convert_optab_handler (vec_extract_optab,
4972 TYPE_MODE (TREE_TYPE (new_temp)),
4973 TYPE_MODE (vectype1))
4974 != CODE_FOR_nothing)
4976 /* Extract sub-vectors directly once vec_extract becomes
4977 a conversion optab. */
4978 dst1 = make_ssa_name (vectype1);
4979 epilog_stmt
4980 = gimple_build_assign (dst1, BIT_FIELD_REF,
4981 build3 (BIT_FIELD_REF, vectype1,
4982 new_temp, TYPE_SIZE (vectype1),
4983 bitsize_int (0)));
4984 gimple_seq_add_stmt_without_update (seq, epilog_stmt);
4985 dst2 = make_ssa_name (vectype1);
4986 epilog_stmt
4987 = gimple_build_assign (dst2, BIT_FIELD_REF,
4988 build3 (BIT_FIELD_REF, vectype1,
4989 new_temp, TYPE_SIZE (vectype1),
4990 bitsize_int (bitsize)));
4991 gimple_seq_add_stmt_without_update (seq, epilog_stmt);
4993 else
4995 /* Extract via punning to appropriately sized integer mode
4996 vector. */
4997 tree eltype = build_nonstandard_integer_type (bitsize, 1);
4998 tree etype = build_vector_type (eltype, 2);
4999 gcc_assert (convert_optab_handler (vec_extract_optab,
5000 TYPE_MODE (etype),
5001 TYPE_MODE (eltype))
5002 != CODE_FOR_nothing);
5003 tree tem = make_ssa_name (etype);
5004 epilog_stmt = gimple_build_assign (tem, VIEW_CONVERT_EXPR,
5005 build1 (VIEW_CONVERT_EXPR,
5006 etype, new_temp));
5007 gimple_seq_add_stmt_without_update (seq, epilog_stmt);
5008 new_temp = tem;
5009 tem = make_ssa_name (eltype);
5010 epilog_stmt
5011 = gimple_build_assign (tem, BIT_FIELD_REF,
5012 build3 (BIT_FIELD_REF, eltype,
5013 new_temp, TYPE_SIZE (eltype),
5014 bitsize_int (0)));
5015 gimple_seq_add_stmt_without_update (seq, epilog_stmt);
5016 dst1 = make_ssa_name (vectype1);
5017 epilog_stmt = gimple_build_assign (dst1, VIEW_CONVERT_EXPR,
5018 build1 (VIEW_CONVERT_EXPR,
5019 vectype1, tem));
5020 gimple_seq_add_stmt_without_update (seq, epilog_stmt);
5021 tem = make_ssa_name (eltype);
5022 epilog_stmt
5023 = gimple_build_assign (tem, BIT_FIELD_REF,
5024 build3 (BIT_FIELD_REF, eltype,
5025 new_temp, TYPE_SIZE (eltype),
5026 bitsize_int (bitsize)));
5027 gimple_seq_add_stmt_without_update (seq, epilog_stmt);
5028 dst2 = make_ssa_name (vectype1);
5029 epilog_stmt = gimple_build_assign (dst2, VIEW_CONVERT_EXPR,
5030 build1 (VIEW_CONVERT_EXPR,
5031 vectype1, tem));
5032 gimple_seq_add_stmt_without_update (seq, epilog_stmt);
5035 new_temp = make_ssa_name (vectype1);
5036 epilog_stmt = gimple_build_assign (new_temp, code, dst1, dst2);
5037 gimple_seq_add_stmt_without_update (seq, epilog_stmt);
5040 return new_temp;
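/* Illustrative sketch (added for exposition, not part of this pass):
   the halving step above with plain ints, assuming CODE is PLUS_EXPR.
   A "vector" of 8 partial sums is reduced to 4 by combining its low
   and high halves; repeating this until the requested width is
   reached mirrors the while loop over nunits.  The sketch_* name is
   hypothetical.  */
static void
sketch_halve_partials (const int in[8], int out[4])
{
  for (int i = 0; i < 4; i++)
    out[i] = in[i] + in[i + 4];	/* lowpart OP highpart.  */
}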
5043 /* Function vect_create_epilog_for_reduction
5045 Create code at the loop-epilog to finalize the result of a reduction
5046 computation.
5048 STMT_INFO is the scalar reduction stmt that is being vectorized.
5049 SLP_NODE is an SLP node containing a group of reduction statements. The
5050 first one in this group is STMT_INFO.
5051 SLP_NODE_INSTANCE is the SLP node instance containing SLP_NODE.
5052 REDUC_INDEX says which rhs operand of STMT_INFO is the reduction phi
5053 (counting from 0).
5055 This function:
5056 1. Completes the reduction def-use cycles.
5057 2. "Reduces" each vector of partial results VECT_DEFS into a single result,
5058 by calling the function specified by REDUC_FN if available, or by
5059 other means (whole-vector shifts or a scalar loop).
5060 The function also creates a new phi node at the loop exit to preserve
5061 loop-closed form, as illustrated below.
5063 The flow at the entry to this function:
5065 loop:
5066 vec_def = phi <vec_init, null> # REDUCTION_PHI
5067 VECT_DEF = vector_stmt # vectorized form of STMT_INFO
5068 s_loop = scalar_stmt # (scalar) STMT_INFO
5069 loop_exit:
5070 s_out0 = phi <s_loop> # (scalar) EXIT_PHI
5071 use <s_out0>
5072 use <s_out0>
5074 The above is transformed by this function into:
5076 loop:
5077 vec_def = phi <vec_init, VECT_DEF> # REDUCTION_PHI
5078 VECT_DEF = vector_stmt # vectorized form of STMT_INFO
5079 s_loop = scalar_stmt # (scalar) STMT_INFO
5080 loop_exit:
5081 s_out0 = phi <s_loop> # (scalar) EXIT_PHI
5082 v_out1 = phi <VECT_DEF> # NEW_EXIT_PHI
5083 v_out2 = reduce <v_out1>
5084 s_out3 = extract_field <v_out2, 0>
5085 s_out4 = adjust_result <s_out3>
5086 use <s_out4>
5087 use <s_out4>
5090 static void
5091 vect_create_epilog_for_reduction (loop_vec_info loop_vinfo,
5092 stmt_vec_info stmt_info,
5093 slp_tree slp_node,
5094 slp_instance slp_node_instance)
5096 stmt_vec_info reduc_info = info_for_reduction (loop_vinfo, stmt_info);
5097 gcc_assert (reduc_info->is_reduc_info);
5098 /* For double reductions we need to get at the inner loop reduction
5099 stmt which has the meta info attached. Our stmt_info is that of the
5100 loop-closed PHI of the inner loop which we remember as
5101 def for the reduction PHI generation. */
5102 bool double_reduc = false;
5103 stmt_vec_info rdef_info = stmt_info;
5104 if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_double_reduction_def)
5106 gcc_assert (!slp_node);
5107 double_reduc = true;
5108 stmt_info = loop_vinfo->lookup_def (gimple_phi_arg_def
5109 (stmt_info->stmt, 0));
5110 stmt_info = vect_stmt_to_vectorize (stmt_info);
5112 gphi *reduc_def_stmt
5113 = as_a <gphi *> (STMT_VINFO_REDUC_DEF (vect_orig_stmt (stmt_info))->stmt);
5114 enum tree_code code = STMT_VINFO_REDUC_CODE (reduc_info);
5115 internal_fn reduc_fn = STMT_VINFO_REDUC_FN (reduc_info);
5116 tree vectype;
5117 machine_mode mode;
5118 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo), *outer_loop = NULL;
5119 basic_block exit_bb;
5120 tree scalar_dest;
5121 tree scalar_type;
5122 gimple *new_phi = NULL, *phi;
5123 gimple_stmt_iterator exit_gsi;
5124 tree new_temp = NULL_TREE, new_name, new_scalar_dest;
5125 gimple *epilog_stmt = NULL;
5126 gimple *exit_phi;
5127 tree bitsize;
5128 tree def;
5129 tree orig_name, scalar_result;
5130 imm_use_iterator imm_iter, phi_imm_iter;
5131 use_operand_p use_p, phi_use_p;
5132 gimple *use_stmt;
5133 auto_vec<tree> reduc_inputs;
5134 int j, i;
5135 vec<tree> &scalar_results = reduc_info->reduc_scalar_results;
5136 unsigned int group_size = 1, k;
5137 auto_vec<gimple *> phis;
5138 /* SLP reduction without reduction chain, e.g.,
5139 # a1 = phi <a2, a0>
5140 # b1 = phi <b2, b0>
5141 a2 = operation (a1)
5142 b2 = operation (b1) */
5143 bool slp_reduc = (slp_node && !REDUC_GROUP_FIRST_ELEMENT (stmt_info));
5144 bool direct_slp_reduc;
5145 tree induction_index = NULL_TREE;
5147 if (slp_node)
5148 group_size = SLP_TREE_LANES (slp_node);
5150 if (nested_in_vect_loop_p (loop, stmt_info))
5152 outer_loop = loop;
5153 loop = loop->inner;
5154 gcc_assert (!slp_node && double_reduc);
5157 vectype = STMT_VINFO_REDUC_VECTYPE (reduc_info);
5158 gcc_assert (vectype);
5159 mode = TYPE_MODE (vectype);
5161 tree induc_val = NULL_TREE;
5162 tree adjustment_def = NULL;
5163 if (slp_node)
5165 else
5167 /* Optimize: for induction condition reduction, if we can't use zero
5168 for induc_val, use initial_def. */
5169 if (STMT_VINFO_REDUC_TYPE (reduc_info) == INTEGER_INDUC_COND_REDUCTION)
5170 induc_val = STMT_VINFO_VEC_INDUC_COND_INITIAL_VAL (reduc_info);
5171 else if (double_reduc)
5173 else
5174 adjustment_def = STMT_VINFO_REDUC_EPILOGUE_ADJUSTMENT (reduc_info);
5177 stmt_vec_info single_live_out_stmt[] = { stmt_info };
5178 array_slice<const stmt_vec_info> live_out_stmts = single_live_out_stmt;
5179 if (slp_reduc)
5180 /* All statements produce live-out values. */
5181 live_out_stmts = SLP_TREE_SCALAR_STMTS (slp_node);
5182 else if (slp_node)
5183 /* The last statement in the reduction chain produces the live-out
5184 value. */
5185 single_live_out_stmt[0] = SLP_TREE_SCALAR_STMTS (slp_node)[group_size - 1];
5187 unsigned vec_num;
5188 int ncopies;
5189 if (slp_node)
5191 vec_num = SLP_TREE_VEC_STMTS (slp_node_instance->reduc_phis).length ();
5192 ncopies = 1;
5194 else
5196 stmt_vec_info reduc_info = loop_vinfo->lookup_stmt (reduc_def_stmt);
5197 vec_num = 1;
5198 ncopies = STMT_VINFO_VEC_STMTS (reduc_info).length ();
5201 /* For cond reductions we want to create a new vector (INDEX_COND_EXPR)
5202 which is updated with the current index of the loop for every match of
5203 the original loop's cond_expr (VEC_STMT). This results in a vector
5204 containing the last time the condition passed for that vector lane.
5205 The first match will be a 1 to allow 0 to be used for non-matching
5206 indexes. If there are no matches at all then the vector will be all
5207 zeroes.
5209 PR92772: This algorithm is broken for architectures that support
5210 masked vectors, but do not provide fold_extract_last. */
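/* Concretely, the vector induction variable created below starts at
   {1,2,3,...} and is bumped by the number of lanes each iteration, so
   after the loop each lane of INDUCTION_INDEX holds one plus the scalar
   iteration index of the last match in that lane, or 0 if the lane
   never matched. */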
5211 if (STMT_VINFO_REDUC_TYPE (reduc_info) == COND_REDUCTION)
5213 auto_vec<std::pair<tree, bool>, 2> ccompares;
5214 stmt_vec_info cond_info = STMT_VINFO_REDUC_DEF (reduc_info);
5215 cond_info = vect_stmt_to_vectorize (cond_info);
5216 while (cond_info != reduc_info)
5218 if (gimple_assign_rhs_code (cond_info->stmt) == COND_EXPR)
5220 gimple *vec_stmt = STMT_VINFO_VEC_STMTS (cond_info)[0];
5221 gcc_assert (gimple_assign_rhs_code (vec_stmt) == VEC_COND_EXPR);
5222 ccompares.safe_push
5223 (std::make_pair (unshare_expr (gimple_assign_rhs1 (vec_stmt)),
5224 STMT_VINFO_REDUC_IDX (cond_info) == 2));
5226 cond_info
5227 = loop_vinfo->lookup_def (gimple_op (cond_info->stmt,
5228 1 + STMT_VINFO_REDUC_IDX
5229 (cond_info)));
5230 cond_info = vect_stmt_to_vectorize (cond_info);
5232 gcc_assert (ccompares.length () != 0);
5234 tree indx_before_incr, indx_after_incr;
5235 poly_uint64 nunits_out = TYPE_VECTOR_SUBPARTS (vectype);
5236 int scalar_precision
5237 = GET_MODE_PRECISION (SCALAR_TYPE_MODE (TREE_TYPE (vectype)));
5238 tree cr_index_scalar_type = make_unsigned_type (scalar_precision);
5239 tree cr_index_vector_type = get_related_vectype_for_scalar_type
5240 (TYPE_MODE (vectype), cr_index_scalar_type,
5241 TYPE_VECTOR_SUBPARTS (vectype));
5243 /* First we create a simple vector induction variable which starts
5244 with the values {1,2,3,...} (SERIES_VECT) and increments by the
5245 vector size (STEP). */
5247 /* Create a {1,2,3,...} vector. */
5248 tree series_vect = build_index_vector (cr_index_vector_type, 1, 1);
5250 /* Create a vector of the step value. */
5251 tree step = build_int_cst (cr_index_scalar_type, nunits_out);
5252 tree vec_step = build_vector_from_val (cr_index_vector_type, step);
5254 /* Create an induction variable. */
5255 gimple_stmt_iterator incr_gsi;
5256 bool insert_after;
5257 standard_iv_increment_position (loop, &incr_gsi, &insert_after);
5258 create_iv (series_vect, vec_step, NULL_TREE, loop, &incr_gsi,
5259 insert_after, &indx_before_incr, &indx_after_incr);
5261 /* Next create a new phi node vector (NEW_PHI_TREE) which starts
5262 filled with zeros (VEC_ZERO). */
5264 /* Create a vector of 0s. */
5265 tree zero = build_zero_cst (cr_index_scalar_type);
5266 tree vec_zero = build_vector_from_val (cr_index_vector_type, zero);
5268 /* Create a vector phi node. */
5269 tree new_phi_tree = make_ssa_name (cr_index_vector_type);
5270 new_phi = create_phi_node (new_phi_tree, loop->header);
5271 add_phi_arg (as_a <gphi *> (new_phi), vec_zero,
5272 loop_preheader_edge (loop), UNKNOWN_LOCATION);
5274 /* Now take the condition from the loop's original cond_exprs
5275 and produce new cond_exprs (INDEX_COND_EXPR) which for
5276 every match uses values from the induction variable
5277 (INDEX_BEFORE_INCR) otherwise uses values from the phi node
5278 (NEW_PHI_TREE).
5279 Finally, we update the phi (NEW_PHI_TREE) to take the value of
5280 the new cond_expr (INDEX_COND_EXPR). */
5281 gimple_seq stmts = NULL;
5282 for (int i = ccompares.length () - 1; i != -1; --i)
5284 tree ccompare = ccompares[i].first;
5285 if (ccompares[i].second)
5286 new_phi_tree = gimple_build (&stmts, VEC_COND_EXPR,
5287 cr_index_vector_type,
5288 ccompare,
5289 indx_before_incr, new_phi_tree);
5290 else
5291 new_phi_tree = gimple_build (&stmts, VEC_COND_EXPR,
5292 cr_index_vector_type,
5293 ccompare,
5294 new_phi_tree, indx_before_incr);
5296 gsi_insert_seq_before (&incr_gsi, stmts, GSI_SAME_STMT);
5298 /* Update the phi with the vec cond. */
5299 induction_index = new_phi_tree;
5300 add_phi_arg (as_a <gphi *> (new_phi), induction_index,
5301 loop_latch_edge (loop), UNKNOWN_LOCATION);
5304 /* 2. Create epilog code.
5305 The reduction epilog code operates across the elements of the vector
5306 of partial results computed by the vectorized loop.
5307 The reduction epilog code consists of:
5309 step 1: compute the scalar result in a vector (v_out2)
5310 step 2: extract the scalar result (s_out3) from the vector (v_out2)
5311 step 3: adjust the scalar result (s_out3) if needed.
5313 Step 1 can be accomplished using one of the following three schemes:
5314 (scheme 1) using reduc_fn, if available.
5315 (scheme 2) using whole-vector shifts, if available.
5316 (scheme 3) using a scalar loop. In this case steps 1+2 above are
5317 combined.
5319 The overall epilog code looks like this:
5321 s_out0 = phi <s_loop> # original EXIT_PHI
5322 v_out1 = phi <VECT_DEF> # NEW_EXIT_PHI
5323 v_out2 = reduce <v_out1> # step 1
5324 s_out3 = extract_field <v_out2, 0> # step 2
5325 s_out4 = adjust_result <s_out3> # step 3
5327 (step 3 is optional, and steps 1 and 2 may be combined).
5328 Lastly, the uses of s_out0 are replaced by s_out4. */
5331 /* 2.1 Create new loop-exit-phis to preserve loop-closed form:
5332 v_out1 = phi <VECT_DEF>
5333 Store them in NEW_PHIS. */
5334 if (double_reduc)
5335 loop = outer_loop;
5336 exit_bb = single_exit (loop)->dest;
5337 exit_gsi = gsi_after_labels (exit_bb);
5338 reduc_inputs.create (slp_node ? vec_num : ncopies);
5339 for (unsigned i = 0; i < vec_num; i++)
5341 gimple_seq stmts = NULL;
5342 if (slp_node)
5343 def = vect_get_slp_vect_def (slp_node, i);
5344 else
5345 def = gimple_get_lhs (STMT_VINFO_VEC_STMTS (rdef_info)[0]);
5346 for (j = 0; j < ncopies; j++)
5348 tree new_def = copy_ssa_name (def);
5349 phi = create_phi_node (new_def, exit_bb);
5350 if (j)
5351 def = gimple_get_lhs (STMT_VINFO_VEC_STMTS (rdef_info)[j]);
5352 SET_PHI_ARG_DEF (phi, single_exit (loop)->dest_idx, def);
5353 new_def = gimple_convert (&stmts, vectype, new_def);
5354 reduc_inputs.quick_push (new_def);
5356 gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
5359 /* 2.2 Get the relevant tree-code to use in the epilog for schemes 2,3
5360 (i.e. when reduc_fn is not available) and in the final adjustment
5361 code (if needed). Also get the original scalar reduction variable as
5362 defined in the loop. In case STMT is a "pattern-stmt" (i.e. - it
5363 represents a reduction pattern), the tree-code and scalar-def are
5364 taken from the original stmt that the pattern-stmt (STMT) replaces.
5365 Otherwise (it is a regular reduction) - the tree-code and scalar-def
5366 are taken from STMT. */
5368 stmt_vec_info orig_stmt_info = vect_orig_stmt (stmt_info);
5369 if (orig_stmt_info != stmt_info)
5371 /* Reduction pattern */
5372 gcc_assert (STMT_VINFO_IN_PATTERN_P (orig_stmt_info));
5373 gcc_assert (STMT_VINFO_RELATED_STMT (orig_stmt_info) == stmt_info);
5376 scalar_dest = gimple_get_lhs (orig_stmt_info->stmt);
5377 scalar_type = TREE_TYPE (scalar_dest);
5378 scalar_results.create (group_size);
5379 new_scalar_dest = vect_create_destination_var (scalar_dest, NULL);
5380 bitsize = TYPE_SIZE (scalar_type);
5382 /* True if we should implement SLP_REDUC using native reduction operations
5383 instead of scalar operations. */
5384 direct_slp_reduc = (reduc_fn != IFN_LAST
5385 && slp_reduc
5386 && !TYPE_VECTOR_SUBPARTS (vectype).is_constant ());
5388 /* In case of reduction chain, e.g.,
5389 # a1 = phi <a3, a0>
5390 a2 = operation (a1)
5391 a3 = operation (a2),
5393 we may end up with more than one vector result. Here we reduce them
5394 to one vector.
5396 The same is true if we couldn't use a single def-use cycle. */
5397 if (REDUC_GROUP_FIRST_ELEMENT (stmt_info)
5398 || direct_slp_reduc
5399 || ncopies > 1)
5401 gimple_seq stmts = NULL;
5402 tree single_input = reduc_inputs[0];
5403 for (k = 1; k < reduc_inputs.length (); k++)
5404 single_input = gimple_build (&stmts, code, vectype,
5405 single_input, reduc_inputs[k]);
5406 gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
5408 reduc_inputs.truncate (0);
5409 reduc_inputs.safe_push (single_input);
5412 tree orig_reduc_input = reduc_inputs[0];
5414 /* If this loop is an epilogue loop that can be skipped after the
5415 main loop, we can only share a reduction operation between the
5416 main loop and the epilogue if we put it at the target of the
5417 skip edge.
5419 We can still reuse accumulators if this check fails. Doing so has
5420 the minor(?) benefit of making the epilogue loop's scalar result
5421 independent of the main loop's scalar result. */
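/* When the check below succeeds, the rest of the epilogue is emitted at
   the target of the skip edge, fed by a PHI that merges this loop's
   partial result with the accumulator reused from the main loop. */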
5422 bool unify_with_main_loop_p = false;
5423 if (reduc_info->reused_accumulator
5424 && loop_vinfo->skip_this_loop_edge
5425 && single_succ_p (exit_bb)
5426 && single_succ (exit_bb) == loop_vinfo->skip_this_loop_edge->dest)
5428 unify_with_main_loop_p = true;
5430 basic_block reduc_block = loop_vinfo->skip_this_loop_edge->dest;
5431 reduc_inputs[0] = make_ssa_name (vectype);
5432 gphi *new_phi = create_phi_node (reduc_inputs[0], reduc_block);
5433 add_phi_arg (new_phi, orig_reduc_input, single_succ_edge (exit_bb),
5434 UNKNOWN_LOCATION);
5435 add_phi_arg (new_phi, reduc_info->reused_accumulator->reduc_input,
5436 loop_vinfo->skip_this_loop_edge, UNKNOWN_LOCATION);
5437 exit_gsi = gsi_after_labels (reduc_block);
5440 /* Shouldn't be used beyond this point. */
5441 exit_bb = nullptr;
5443 if (STMT_VINFO_REDUC_TYPE (reduc_info) == COND_REDUCTION
5444 && reduc_fn != IFN_LAST)
5446 /* For condition reductions, we have a vector (REDUC_INPUTS 0) containing
5447 various data values where the condition matched and another vector
5448 (INDUCTION_INDEX) containing all the indexes of those matches. We
5449 need to extract the last matching index (which will be the index with
5450 highest value) and use this to index into the data vector.
5451 For the case where there were no matches, the data vector will contain
5452 all default values and the index vector will be all zeros. */
5454 /* Get various versions of the type of the vector of indexes. */
5455 tree index_vec_type = TREE_TYPE (induction_index);
5456 gcc_checking_assert (TYPE_UNSIGNED (index_vec_type));
5457 tree index_scalar_type = TREE_TYPE (index_vec_type);
5458 tree index_vec_cmp_type = truth_type_for (index_vec_type);
5460 /* Get an unsigned integer version of the type of the data vector. */
5461 int scalar_precision
5462 = GET_MODE_PRECISION (SCALAR_TYPE_MODE (scalar_type));
5463 tree scalar_type_unsigned = make_unsigned_type (scalar_precision);
5464 tree vectype_unsigned = get_same_sized_vectype (scalar_type_unsigned,
5465 vectype);
5467 /* First we need to create a vector (ZERO_VEC) of zeros and another
5468 vector (MAX_INDEX_VEC) filled with the last matching index, which we
5469 can create using a MAX reduction and then expanding.
5470 In the case where the loop never made any matches, the max index will
5471 be zero. */
5473 /* Vector of {0, 0, 0,...}. */
5474 tree zero_vec = build_zero_cst (vectype);
5476 /* Find maximum value from the vector of found indexes. */
5477 tree max_index = make_ssa_name (index_scalar_type);
5478 gcall *max_index_stmt = gimple_build_call_internal (IFN_REDUC_MAX,
5479 1, induction_index);
5480 gimple_call_set_lhs (max_index_stmt, max_index);
5481 gsi_insert_before (&exit_gsi, max_index_stmt, GSI_SAME_STMT);
5483 /* Vector of {max_index, max_index, max_index,...}. */
5484 tree max_index_vec = make_ssa_name (index_vec_type);
5485 tree max_index_vec_rhs = build_vector_from_val (index_vec_type,
5486 max_index);
5487 gimple *max_index_vec_stmt = gimple_build_assign (max_index_vec,
5488 max_index_vec_rhs);
5489 gsi_insert_before (&exit_gsi, max_index_vec_stmt, GSI_SAME_STMT);
5491 /* Next we compare the new vector (MAX_INDEX_VEC) full of max indexes
5492 with the vector (INDUCTION_INDEX) of found indexes, choosing values
5493 from the data vector (REDUC_INPUTS 0) for matches, 0 (ZERO_VEC)
5494 otherwise. Only one value should match, resulting in a vector
5495 (VEC_COND) with one data value and the rest zeros.
5496 In the case where the loop never made any matches, every index will
5497 match, resulting in a vector with all data values (which will all be
5498 the default value). */
5500 /* Compare the max index vector to the vector of found indexes to find
5501 the position of the max value. */
5502 tree vec_compare = make_ssa_name (index_vec_cmp_type);
5503 gimple *vec_compare_stmt = gimple_build_assign (vec_compare, EQ_EXPR,
5504 induction_index,
5505 max_index_vec);
5506 gsi_insert_before (&exit_gsi, vec_compare_stmt, GSI_SAME_STMT);
5508 /* Use the compare to choose either values from the data vector or
5509 zero. */
5510 tree vec_cond = make_ssa_name (vectype);
5511 gimple *vec_cond_stmt = gimple_build_assign (vec_cond, VEC_COND_EXPR,
5512 vec_compare,
5513 reduc_inputs[0],
5514 zero_vec);
5515 gsi_insert_before (&exit_gsi, vec_cond_stmt, GSI_SAME_STMT);
5517 /* Finally we need to extract the data value from the vector (VEC_COND)
5518 into a scalar (MATCHED_DATA_REDUC). Logically we want to do an OR
5519 reduction, but because this doesn't exist, we can use a MAX reduction
5520 instead. The data value might be signed or a float so we need to cast
5521 it first.
5522 In the case where the loop never made any matches, the data values are
5523 all identical, and so will reduce down correctly. */
5525 /* Make the matched data values unsigned. */
5526 tree vec_cond_cast = make_ssa_name (vectype_unsigned);
5527 tree vec_cond_cast_rhs = build1 (VIEW_CONVERT_EXPR, vectype_unsigned,
5528 vec_cond);
5529 gimple *vec_cond_cast_stmt = gimple_build_assign (vec_cond_cast,
5530 VIEW_CONVERT_EXPR,
5531 vec_cond_cast_rhs);
5532 gsi_insert_before (&exit_gsi, vec_cond_cast_stmt, GSI_SAME_STMT);
5534 /* Reduce down to a scalar value. */
5535 tree data_reduc = make_ssa_name (scalar_type_unsigned);
5536 gcall *data_reduc_stmt = gimple_build_call_internal (IFN_REDUC_MAX,
5537 1, vec_cond_cast);
5538 gimple_call_set_lhs (data_reduc_stmt, data_reduc);
5539 gsi_insert_before (&exit_gsi, data_reduc_stmt, GSI_SAME_STMT);
5541 /* Convert the reduced value back to the result type and set as the
5542 result. */
5543 gimple_seq stmts = NULL;
5544 new_temp = gimple_build (&stmts, VIEW_CONVERT_EXPR, scalar_type,
5545 data_reduc);
5546 gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
5547 scalar_results.safe_push (new_temp);
5549 else if (STMT_VINFO_REDUC_TYPE (reduc_info) == COND_REDUCTION
5550 && reduc_fn == IFN_LAST)
5552 /* Condition reduction without supported IFN_REDUC_MAX. Generate
5553 idx = 0;
5554 idx_val = induction_index[0];
5555 val = data_reduc[0];
5556 for (idx = 0, val = init, i = 0; i < nelts; ++i)
5557 if (induction_index[i] > idx_val)
5558 val = data_reduc[i], idx_val = induction_index[i];
5559 return val; */
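/* The loop below is fully unrolled at compile time: for each element
   offset we extract the index and data lanes with BIT_FIELD_REFs and
   keep the data value whose index is the running maximum. */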
5561 tree data_eltype = TREE_TYPE (vectype);
5562 tree idx_eltype = TREE_TYPE (TREE_TYPE (induction_index));
5563 unsigned HOST_WIDE_INT el_size = tree_to_uhwi (TYPE_SIZE (idx_eltype));
5564 poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (TREE_TYPE (induction_index));
5565 /* Enforced by vectorizable_reduction, which ensures we have target
5566 support before allowing a conditional reduction on variable-length
5567 vectors. */
5568 unsigned HOST_WIDE_INT v_size = el_size * nunits.to_constant ();
5569 tree idx_val = NULL_TREE, val = NULL_TREE;
5570 for (unsigned HOST_WIDE_INT off = 0; off < v_size; off += el_size)
5572 tree old_idx_val = idx_val;
5573 tree old_val = val;
5574 idx_val = make_ssa_name (idx_eltype);
5575 epilog_stmt = gimple_build_assign (idx_val, BIT_FIELD_REF,
5576 build3 (BIT_FIELD_REF, idx_eltype,
5577 induction_index,
5578 bitsize_int (el_size),
5579 bitsize_int (off)));
5580 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5581 val = make_ssa_name (data_eltype);
5582 epilog_stmt = gimple_build_assign (val, BIT_FIELD_REF,
5583 build3 (BIT_FIELD_REF,
5584 data_eltype,
5585 reduc_inputs[0],
5586 bitsize_int (el_size),
5587 bitsize_int (off)));
5588 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5589 if (off != 0)
5591 tree new_idx_val = idx_val;
5592 if (off != v_size - el_size)
5594 new_idx_val = make_ssa_name (idx_eltype);
5595 epilog_stmt = gimple_build_assign (new_idx_val,
5596 MAX_EXPR, idx_val,
5597 old_idx_val);
5598 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5600 tree new_val = make_ssa_name (data_eltype);
5601 epilog_stmt = gimple_build_assign (new_val,
5602 COND_EXPR,
5603 build2 (GT_EXPR,
5604 boolean_type_node,
5605 idx_val,
5606 old_idx_val),
5607 val, old_val);
5608 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5609 idx_val = new_idx_val;
5610 val = new_val;
5613 /* Convert the reduced value back to the result type and set as the
5614 result. */
5615 gimple_seq stmts = NULL;
5616 val = gimple_convert (&stmts, scalar_type, val);
5617 gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
5618 scalar_results.safe_push (val);
5621 /* 2.3 Create the reduction code, using one of the three schemes described
5622 above. In SLP we simply need to extract all the elements from the
5623 vector (without reducing them), so we use scalar shifts. */
5624 else if (reduc_fn != IFN_LAST && !slp_reduc)
5626 tree tmp;
5627 tree vec_elem_type;
5629 /* Case 1: Create:
5630 v_out2 = reduc_expr <v_out1> */
5632 if (dump_enabled_p ())
5633 dump_printf_loc (MSG_NOTE, vect_location,
5634 "Reduce using direct vector reduction.\n");
5636 gimple_seq stmts = NULL;
5637 vec_elem_type = TREE_TYPE (vectype);
5638 new_temp = gimple_build (&stmts, as_combined_fn (reduc_fn),
5639 vec_elem_type, reduc_inputs[0]);
5640 new_temp = gimple_convert (&stmts, scalar_type, new_temp);
5641 gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
5643 if ((STMT_VINFO_REDUC_TYPE (reduc_info) == INTEGER_INDUC_COND_REDUCTION)
5644 && induc_val)
5646 /* Earlier we set the initial value to be a vector of induc_val
5647 values. Check the result and if it is induc_val then replace
5648 with the original initial value, unless induc_val is
5649 the same as initial_def already. */
5650 tree zcompare = build2 (EQ_EXPR, boolean_type_node, new_temp,
5651 induc_val);
5652 tree initial_def = reduc_info->reduc_initial_values[0];
5654 tmp = make_ssa_name (new_scalar_dest);
5655 epilog_stmt = gimple_build_assign (tmp, COND_EXPR, zcompare,
5656 initial_def, new_temp);
5657 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5658 new_temp = tmp;
5661 scalar_results.safe_push (new_temp);
5663 else if (direct_slp_reduc)
5665 /* Here we create one vector for each of the REDUC_GROUP_SIZE results,
5666 with the elements for other SLP statements replaced with the
5667 neutral value. We can then do a normal reduction on each vector. */
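/* For example, with GROUP_SIZE == 2 the even-numbered lanes belong to
   the first scalar result and the odd-numbered lanes to the second;
   each iteration of the loop below masks out the other group's lanes
   (replacing them with the identity) and reduces what remains. */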
5669 /* Enforced by vectorizable_reduction. */
5670 gcc_assert (reduc_inputs.length () == 1);
5671 gcc_assert (pow2p_hwi (group_size));
5673 gimple_seq seq = NULL;
5675 /* Build a vector {0, 1, 2, ...}, with the same number of elements
5676 and the same element size as VECTYPE. */
5677 tree index = build_index_vector (vectype, 0, 1);
5678 tree index_type = TREE_TYPE (index);
5679 tree index_elt_type = TREE_TYPE (index_type);
5680 tree mask_type = truth_type_for (index_type);
5682 /* Create a vector that, for each element, identifies which of
5683 the REDUC_GROUP_SIZE results should use it. */
5684 tree index_mask = build_int_cst (index_elt_type, group_size - 1);
5685 index = gimple_build (&seq, BIT_AND_EXPR, index_type, index,
5686 build_vector_from_val (index_type, index_mask));
5688 /* Get a neutral vector value. This is simply a splat of the neutral
5689 scalar value if we have one, otherwise the initial scalar value
5690 is itself a neutral value. */
5691 tree vector_identity = NULL_TREE;
5692 tree neutral_op = NULL_TREE;
5693 if (slp_node)
5695 tree initial_value = NULL_TREE;
5696 if (REDUC_GROUP_FIRST_ELEMENT (stmt_info))
5697 initial_value = reduc_info->reduc_initial_values[0];
5698 neutral_op = neutral_op_for_reduction (TREE_TYPE (vectype), code,
5699 initial_value);
5701 if (neutral_op)
5702 vector_identity = gimple_build_vector_from_val (&seq, vectype,
5703 neutral_op);
5704 for (unsigned int i = 0; i < group_size; ++i)
5706 /* If there's no universal neutral value, we can use the
5707 initial scalar value from the original PHI. This is used
5708 for MIN and MAX reduction, for example. */
5709 if (!neutral_op)
5711 tree scalar_value = reduc_info->reduc_initial_values[i];
5712 scalar_value = gimple_convert (&seq, TREE_TYPE (vectype),
5713 scalar_value);
5714 vector_identity = gimple_build_vector_from_val (&seq, vectype,
5715 scalar_value);
5718 /* Calculate the equivalent of:
5720 sel[j] = (index[j] == i);
5722 which selects the elements of REDUC_INPUTS[0] that should
5723 be included in the result. */
5724 tree compare_val = build_int_cst (index_elt_type, i);
5725 compare_val = build_vector_from_val (index_type, compare_val);
5726 tree sel = gimple_build (&seq, EQ_EXPR, mask_type,
5727 index, compare_val);
5729 /* Calculate the equivalent of:
5731 vec = sel ? reduc_inputs[0] : vector_identity;
5733 VEC is now suitable for a full vector reduction. */
5734 tree vec = gimple_build (&seq, VEC_COND_EXPR, vectype,
5735 sel, reduc_inputs[0], vector_identity);
5737 /* Do the reduction and convert it to the appropriate type. */
5738 tree scalar = gimple_build (&seq, as_combined_fn (reduc_fn),
5739 TREE_TYPE (vectype), vec);
5740 scalar = gimple_convert (&seq, scalar_type, scalar);
5741 scalar_results.safe_push (scalar);
5743 gsi_insert_seq_before (&exit_gsi, seq, GSI_SAME_STMT);
5745 else
5747 bool reduce_with_shift;
5748 tree vec_temp;
5750 gcc_assert (slp_reduc || reduc_inputs.length () == 1);
5752 /* See if the target wants to do the final (shift) reduction
5753 in a vector mode of smaller size and first reduce upper/lower
5754 halves against each other. */
5755 enum machine_mode mode1 = mode;
5756 tree stype = TREE_TYPE (vectype);
5757 unsigned nunits = TYPE_VECTOR_SUBPARTS (vectype).to_constant ();
5758 unsigned nunits1 = nunits;
5759 if ((mode1 = targetm.vectorize.split_reduction (mode)) != mode
5760 && reduc_inputs.length () == 1)
5762 nunits1 = GET_MODE_NUNITS (mode1).to_constant ();
5763 /* For SLP reductions we have to make sure lanes match up, but
5764 since we're doing an individual-element final reduction, reducing
5765 the vector width here is even more important.
5766 ??? We can also separate lanes with permutes; for the common
5767 case of a power-of-two group size, odd/even extracts would work. */
5768 if (slp_reduc && nunits != nunits1)
5770 nunits1 = least_common_multiple (nunits1, group_size);
5771 gcc_assert (exact_log2 (nunits1) != -1 && nunits1 <= nunits);
5774 if (!slp_reduc
5775 && (mode1 = targetm.vectorize.split_reduction (mode)) != mode)
5776 nunits1 = GET_MODE_NUNITS (mode1).to_constant ();
5778 tree vectype1 = get_related_vectype_for_scalar_type (TYPE_MODE (vectype),
5779 stype, nunits1);
5780 reduce_with_shift = have_whole_vector_shift (mode1);
5781 if (!VECTOR_MODE_P (mode1))
5782 reduce_with_shift = false;
5783 else
5785 optab optab = optab_for_tree_code (code, vectype1, optab_default);
5786 if (optab_handler (optab, mode1) == CODE_FOR_nothing)
5787 reduce_with_shift = false;
5790 /* First reduce the vector to the desired vector size we should
5791 do shift reduction on by combining upper and lower halves. */
5792 gimple_seq stmts = NULL;
5793 new_temp = vect_create_partial_epilog (reduc_inputs[0], vectype1,
5794 code, &stmts);
5795 gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
5796 reduc_inputs[0] = new_temp;
5798 if (reduce_with_shift && !slp_reduc)
5800 int element_bitsize = tree_to_uhwi (bitsize);
5801 /* Enforced by vectorizable_reduction, which disallows SLP reductions
5802 for variable-length vectors and also requires direct target support
5803 for loop reductions. */
5804 int vec_size_in_bits = tree_to_uhwi (TYPE_SIZE (vectype1));
5805 int nelements = vec_size_in_bits / element_bitsize;
5806 vec_perm_builder sel;
5807 vec_perm_indices indices;
5809 int elt_offset;
5811 tree zero_vec = build_zero_cst (vectype1);
5812 /* Case 2: Create:
5813 for (offset = nelements/2; offset >= 1; offset/=2)
5815 Create: va' = vec_shift <va, offset>
5816 Create: va = vop <va, va'>
5817 } */
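/* E.g. for a V4SI addition this emits two shift/op pairs: a
   VEC_PERM_EXPR moving the upper two lanes down followed by a PLUS,
   then a shift by one lane and a final PLUS; the scalar result is
   read out of lane 0 in step 2.4 below. */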
5819 tree rhs;
5821 if (dump_enabled_p ())
5822 dump_printf_loc (MSG_NOTE, vect_location,
5823 "Reduce using vector shifts\n");
5825 gimple_seq stmts = NULL;
5826 new_temp = gimple_convert (&stmts, vectype1, new_temp);
5827 for (elt_offset = nelements / 2;
5828 elt_offset >= 1;
5829 elt_offset /= 2)
5831 calc_vec_perm_mask_for_shift (elt_offset, nelements, &sel);
5832 indices.new_vector (sel, 2, nelements);
5833 tree mask = vect_gen_perm_mask_any (vectype1, indices);
5834 new_name = gimple_build (&stmts, VEC_PERM_EXPR, vectype1,
5835 new_temp, zero_vec, mask);
5836 new_temp = gimple_build (&stmts, code,
5837 vectype1, new_name, new_temp);
5839 gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
5841 /* 2.4 Extract the final scalar result. Create:
5842 s_out3 = extract_field <v_out2, bitpos> */
5844 if (dump_enabled_p ())
5845 dump_printf_loc (MSG_NOTE, vect_location,
5846 "extract scalar result\n");
5848 rhs = build3 (BIT_FIELD_REF, scalar_type, new_temp,
5849 bitsize, bitsize_zero_node);
5850 epilog_stmt = gimple_build_assign (new_scalar_dest, rhs);
5851 new_temp = make_ssa_name (new_scalar_dest, epilog_stmt);
5852 gimple_assign_set_lhs (epilog_stmt, new_temp);
5853 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5854 scalar_results.safe_push (new_temp);
5856 else
5858 /* Case 3: Create:
5859 s = extract_field <v_out2, 0>
5860 for (offset = element_size;
5861 offset < vector_size;
5862 offset += element_size;)
5864 Create: s' = extract_field <v_out2, offset>
5865 Create: s = op <s, s'> // For non SLP cases
5866 } */
5868 if (dump_enabled_p ())
5869 dump_printf_loc (MSG_NOTE, vect_location,
5870 "Reduce using scalar code.\n");
5872 int vec_size_in_bits = tree_to_uhwi (TYPE_SIZE (vectype1));
5873 int element_bitsize = tree_to_uhwi (bitsize);
5874 tree compute_type = TREE_TYPE (vectype);
5875 gimple_seq stmts = NULL;
5876 FOR_EACH_VEC_ELT (reduc_inputs, i, vec_temp)
5878 int bit_offset;
5879 new_temp = gimple_build (&stmts, BIT_FIELD_REF, compute_type,
5880 vec_temp, bitsize, bitsize_zero_node);
5882 /* In SLP we don't need to apply the reduction operation, so we just
5883 collect the s' values in SCALAR_RESULTS. */
5884 if (slp_reduc)
5885 scalar_results.safe_push (new_temp);
5887 for (bit_offset = element_bitsize;
5888 bit_offset < vec_size_in_bits;
5889 bit_offset += element_bitsize)
5891 tree bitpos = bitsize_int (bit_offset);
5892 new_name = gimple_build (&stmts, BIT_FIELD_REF,
5893 compute_type, vec_temp,
5894 bitsize, bitpos);
5895 if (slp_reduc)
5897 /* In SLP we don't need to apply the reduction operation, so
5898 we just collect the s' values in SCALAR_RESULTS. */
5899 new_temp = new_name;
5900 scalar_results.safe_push (new_name);
5902 else
5903 new_temp = gimple_build (&stmts, code, compute_type,
5904 new_name, new_temp);
5908 /* The only case where we need to reduce scalar results in SLP is
5909 unrolling. If the size of SCALAR_RESULTS is greater than
5910 REDUC_GROUP_SIZE, we reduce them by combining elements modulo
5911 REDUC_GROUP_SIZE. */
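/* For instance, with GROUP_SIZE == 2 and four collected results
   s0..s3 this computes s0 = s0 CODE s2 and s1 = s1 CODE s3 and then
   truncates SCALAR_RESULTS back to two entries. */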
5912 if (slp_reduc)
5914 tree res, first_res, new_res;
5916 /* Reduce multiple scalar results in case of SLP unrolling. */
5917 for (j = group_size; scalar_results.iterate (j, &res);
5918 j++)
5920 first_res = scalar_results[j % group_size];
5921 new_res = gimple_build (&stmts, code, compute_type,
5922 first_res, res);
5923 scalar_results[j % group_size] = new_res;
5925 scalar_results.truncate (group_size);
5926 for (k = 0; k < group_size; k++)
5927 scalar_results[k] = gimple_convert (&stmts, scalar_type,
5928 scalar_results[k]);
5930 else
5932 /* Not SLP - we have one scalar to keep in SCALAR_RESULTS. */
5933 new_temp = gimple_convert (&stmts, scalar_type, new_temp);
5934 scalar_results.safe_push (new_temp);
5937 gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
5940 if ((STMT_VINFO_REDUC_TYPE (reduc_info) == INTEGER_INDUC_COND_REDUCTION)
5941 && induc_val)
5943 /* Earlier we set the initial value to be a vector of induc_val
5944 values. Check the result and if it is induc_val then replace
5945 with the original initial value, unless induc_val is
5946 the same as initial_def already. */
5947 tree zcompare = build2 (EQ_EXPR, boolean_type_node, new_temp,
5948 induc_val);
5949 tree initial_def = reduc_info->reduc_initial_values[0];
5951 tree tmp = make_ssa_name (new_scalar_dest);
5952 epilog_stmt = gimple_build_assign (tmp, COND_EXPR, zcompare,
5953 initial_def, new_temp);
5954 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5955 scalar_results[0] = tmp;
5959 /* 2.5 Adjust the final result by the initial value of the reduction
5960 variable. (When such adjustment is not needed, then
5961 'adjustment_def' is zero). For example, if code is PLUS we create:
5962 new_temp = loop_exit_def + adjustment_def */
5964 if (adjustment_def)
5966 gcc_assert (!slp_reduc);
5967 gimple_seq stmts = NULL;
5968 if (double_reduc)
5970 gcc_assert (VECTOR_TYPE_P (TREE_TYPE (adjustment_def)));
5971 adjustment_def = gimple_convert (&stmts, vectype, adjustment_def);
5972 new_temp = gimple_build (&stmts, code, vectype,
5973 reduc_inputs[0], adjustment_def);
5975 else
5977 new_temp = scalar_results[0];
5978 gcc_assert (TREE_CODE (TREE_TYPE (adjustment_def)) != VECTOR_TYPE);
5979 adjustment_def = gimple_convert (&stmts, scalar_type, adjustment_def);
5980 new_temp = gimple_build (&stmts, code, scalar_type,
5981 new_temp, adjustment_def);
5984 epilog_stmt = gimple_seq_last_stmt (stmts);
5985 gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
5986 scalar_results[0] = new_temp;
5989 /* Record this operation if it could be reused by the epilogue loop. */
5990 if (STMT_VINFO_REDUC_TYPE (reduc_info) == TREE_CODE_REDUCTION)
5991 loop_vinfo->reusable_accumulators.put (scalar_results[0],
5992 { orig_reduc_input, reduc_info });
5994 if (double_reduc)
5995 loop = outer_loop;
5997 /* 2.6 Handle the loop-exit phis. Replace the uses of scalar loop-exit
5998 phis with new adjusted scalar results, i.e., replace use <s_out0>
5999 with use <s_out4>.
6001 Transform:
6002 loop_exit:
6003 s_out0 = phi <s_loop> # (scalar) EXIT_PHI
6004 v_out1 = phi <VECT_DEF> # NEW_EXIT_PHI
6005 v_out2 = reduce <v_out1>
6006 s_out3 = extract_field <v_out2, 0>
6007 s_out4 = adjust_result <s_out3>
6008 use <s_out0>
6009 use <s_out0>
6011 into:
6013 loop_exit:
6014 s_out0 = phi <s_loop> # (scalar) EXIT_PHI
6015 v_out1 = phi <VECT_DEF> # NEW_EXIT_PHI
6016 v_out2 = reduce <v_out1>
6017 s_out3 = extract_field <v_out2, 0>
6018 s_out4 = adjust_result <s_out3>
6019 use <s_out4>
6020 use <s_out4> */
6022 gcc_assert (live_out_stmts.size () == scalar_results.length ());
6023 for (k = 0; k < live_out_stmts.size (); k++)
6025 stmt_vec_info scalar_stmt_info = vect_orig_stmt (live_out_stmts[k]);
6026 scalar_dest = gimple_assign_lhs (scalar_stmt_info->stmt);
6028 phis.create (3);
6029 /* Find the loop-closed-use at the loop exit of the original scalar
6030 result. (The reduction result is expected to have two immediate uses,
6031 one at the latch block, and one at the loop exit). For double
6032 reductions we are looking for exit phis of the outer loop. */
6033 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, scalar_dest)
6035 if (!flow_bb_inside_loop_p (loop, gimple_bb (USE_STMT (use_p))))
6037 if (!is_gimple_debug (USE_STMT (use_p)))
6038 phis.safe_push (USE_STMT (use_p));
6040 else
6042 if (double_reduc && gimple_code (USE_STMT (use_p)) == GIMPLE_PHI)
6044 tree phi_res = PHI_RESULT (USE_STMT (use_p));
6046 FOR_EACH_IMM_USE_FAST (phi_use_p, phi_imm_iter, phi_res)
6048 if (!flow_bb_inside_loop_p (loop,
6049 gimple_bb (USE_STMT (phi_use_p)))
6050 && !is_gimple_debug (USE_STMT (phi_use_p)))
6051 phis.safe_push (USE_STMT (phi_use_p));
6057 FOR_EACH_VEC_ELT (phis, i, exit_phi)
6059 /* Replace the uses: */
6060 orig_name = PHI_RESULT (exit_phi);
6062 /* Look for a single use at the target of the skip edge. */
6063 if (unify_with_main_loop_p)
6065 use_operand_p use_p;
6066 gimple *user;
6067 if (!single_imm_use (orig_name, &use_p, &user))
6068 gcc_unreachable ();
6069 orig_name = gimple_get_lhs (user);
6072 scalar_result = scalar_results[k];
6073 FOR_EACH_IMM_USE_STMT (use_stmt, imm_iter, orig_name)
6075 FOR_EACH_IMM_USE_ON_STMT (use_p, imm_iter)
6076 SET_USE (use_p, scalar_result);
6077 update_stmt (use_stmt);
6081 phis.release ();
6085 /* Return a vector of type VECTYPE that is equal to the vector select
6086 operation "MASK ? VEC : IDENTITY". Insert the select statements
6087 before GSI. */
6089 static tree
6090 merge_with_identity (gimple_stmt_iterator *gsi, tree mask, tree vectype,
6091 tree vec, tree identity)
6093 tree cond = make_temp_ssa_name (vectype, NULL, "cond");
6094 gimple *new_stmt = gimple_build_assign (cond, VEC_COND_EXPR,
6095 mask, vec, identity);
6096 gsi_insert_before (gsi, new_stmt, GSI_SAME_STMT);
6097 return cond;
6100 /* Successively apply CODE to each element of VECTOR_RHS, in left-to-right
6101 order, starting with LHS. Insert the extraction statements before GSI and
6102 associate the new scalar SSA names with variable SCALAR_DEST.
6103 Return the SSA name for the result. */
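/* For example, for a V4SF VECTOR_RHS and PLUS_EXPR this emits four
   BIT_FIELD_REF extracts, each immediately folded into the running
   scalar result, preserving the strict left-to-right evaluation order
   that an in-order (fold-left) reduction requires. */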
6105 static tree
6106 vect_expand_fold_left (gimple_stmt_iterator *gsi, tree scalar_dest,
6107 tree_code code, tree lhs, tree vector_rhs)
6109 tree vectype = TREE_TYPE (vector_rhs);
6110 tree scalar_type = TREE_TYPE (vectype);
6111 tree bitsize = TYPE_SIZE (scalar_type);
6112 unsigned HOST_WIDE_INT vec_size_in_bits = tree_to_uhwi (TYPE_SIZE (vectype));
6113 unsigned HOST_WIDE_INT element_bitsize = tree_to_uhwi (bitsize);
6115 for (unsigned HOST_WIDE_INT bit_offset = 0;
6116 bit_offset < vec_size_in_bits;
6117 bit_offset += element_bitsize)
6119 tree bitpos = bitsize_int (bit_offset);
6120 tree rhs = build3 (BIT_FIELD_REF, scalar_type, vector_rhs,
6121 bitsize, bitpos);
6123 gassign *stmt = gimple_build_assign (scalar_dest, rhs);
6124 rhs = make_ssa_name (scalar_dest, stmt);
6125 gimple_assign_set_lhs (stmt, rhs);
6126 gsi_insert_before (gsi, stmt, GSI_SAME_STMT);
6128 stmt = gimple_build_assign (scalar_dest, code, lhs, rhs);
6129 tree new_name = make_ssa_name (scalar_dest, stmt);
6130 gimple_assign_set_lhs (stmt, new_name);
6131 gsi_insert_before (gsi, stmt, GSI_SAME_STMT);
6132 lhs = new_name;
6134 return lhs;
6137 /* Get a masked internal function equivalent to REDUC_FN. VECTYPE_IN is the
6138 type of the vector input. */
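/* Currently only IFN_FOLD_LEFT_PLUS has a masked counterpart
   (IFN_MASK_FOLD_LEFT_PLUS), and even that is returned only when the
   target supports it for VECTYPE_IN; otherwise IFN_LAST is returned. */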
6140 static internal_fn
6141 get_masked_reduction_fn (internal_fn reduc_fn, tree vectype_in)
6143 internal_fn mask_reduc_fn;
6145 switch (reduc_fn)
6147 case IFN_FOLD_LEFT_PLUS:
6148 mask_reduc_fn = IFN_MASK_FOLD_LEFT_PLUS;
6149 break;
6151 default:
6152 return IFN_LAST;
6155 if (direct_internal_fn_supported_p (mask_reduc_fn, vectype_in,
6156 OPTIMIZE_FOR_SPEED))
6157 return mask_reduc_fn;
6158 return IFN_LAST;
6161 /* Perform an in-order reduction (FOLD_LEFT_REDUCTION). STMT_INFO is the
6162 statement that sets the live-out value. REDUC_DEF_STMT is the phi
6163 statement. CODE is the operation performed by STMT_INFO and OPS are
6164 its scalar operands. REDUC_INDEX is the index of the operand in
6165 OPS that is set by REDUC_DEF_STMT. REDUC_FN is the function that
6166 implements in-order reduction, or IFN_LAST if we should open-code it.
6167 VECTYPE_IN is the type of the vector input. MASKS specifies the masks
6168 that should be used to control the operation in a fully-masked loop. */
6170 static bool
6171 vectorize_fold_left_reduction (loop_vec_info loop_vinfo,
6172 stmt_vec_info stmt_info,
6173 gimple_stmt_iterator *gsi,
6174 gimple **vec_stmt, slp_tree slp_node,
6175 gimple *reduc_def_stmt,
6176 tree_code code, internal_fn reduc_fn,
6177 tree ops[3], tree vectype_in,
6178 int reduc_index, vec_loop_masks *masks)
6180 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
6181 tree vectype_out = STMT_VINFO_VECTYPE (stmt_info);
6182 internal_fn mask_reduc_fn = get_masked_reduction_fn (reduc_fn, vectype_in);
6184 int ncopies;
6185 if (slp_node)
6186 ncopies = 1;
6187 else
6188 ncopies = vect_get_num_copies (loop_vinfo, vectype_in);
6190 gcc_assert (!nested_in_vect_loop_p (loop, stmt_info));
6191 gcc_assert (ncopies == 1);
6192 gcc_assert (TREE_CODE_LENGTH (code) == binary_op);
6194 if (slp_node)
6195 gcc_assert (known_eq (TYPE_VECTOR_SUBPARTS (vectype_out),
6196 TYPE_VECTOR_SUBPARTS (vectype_in)));
6198 tree op0 = ops[1 - reduc_index];
6200 int group_size = 1;
6201 stmt_vec_info scalar_dest_def_info;
6202 auto_vec<tree> vec_oprnds0;
6203 if (slp_node)
6205 auto_vec<vec<tree> > vec_defs (2);
6206 vect_get_slp_defs (loop_vinfo, slp_node, &vec_defs);
6207 vec_oprnds0.safe_splice (vec_defs[1 - reduc_index]);
6208 vec_defs[0].release ();
6209 vec_defs[1].release ();
6210 group_size = SLP_TREE_SCALAR_STMTS (slp_node).length ();
6211 scalar_dest_def_info = SLP_TREE_SCALAR_STMTS (slp_node)[group_size - 1];
6213 else
6215 vect_get_vec_defs_for_operand (loop_vinfo, stmt_info, 1,
6216 op0, &vec_oprnds0);
6217 scalar_dest_def_info = stmt_info;
6220 tree scalar_dest = gimple_assign_lhs (scalar_dest_def_info->stmt);
6221 tree scalar_type = TREE_TYPE (scalar_dest);
6222 tree reduc_var = gimple_phi_result (reduc_def_stmt);
6224 int vec_num = vec_oprnds0.length ();
6225 gcc_assert (vec_num == 1 || slp_node);
6226 tree vec_elem_type = TREE_TYPE (vectype_out);
6227 gcc_checking_assert (useless_type_conversion_p (scalar_type, vec_elem_type));
6229 tree vector_identity = NULL_TREE;
6230 if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
6231 vector_identity = build_zero_cst (vectype_out);
6233 tree scalar_dest_var = vect_create_destination_var (scalar_dest, NULL);
6234 int i;
6235 tree def0;
6236 FOR_EACH_VEC_ELT (vec_oprnds0, i, def0)
6238 gimple *new_stmt;
6239 tree mask = NULL_TREE;
6240 if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
6241 mask = vect_get_loop_mask (gsi, masks, vec_num, vectype_in, i);
6243 /* Handle MINUS by adding the negative. */
6244 if (reduc_fn != IFN_LAST && code == MINUS_EXPR)
6246 tree negated = make_ssa_name (vectype_out);
6247 new_stmt = gimple_build_assign (negated, NEGATE_EXPR, def0);
6248 gsi_insert_before (gsi, new_stmt, GSI_SAME_STMT);
6249 def0 = negated;
6252 if (mask && mask_reduc_fn == IFN_LAST)
6253 def0 = merge_with_identity (gsi, mask, vectype_out, def0,
6254 vector_identity);
6256 /* On the first iteration the input is simply the scalar phi
6257 result, and for subsequent iterations it is the output of
6258 the preceding operation. */
6259 if (reduc_fn != IFN_LAST || (mask && mask_reduc_fn != IFN_LAST))
6261 if (mask && mask_reduc_fn != IFN_LAST)
6262 new_stmt = gimple_build_call_internal (mask_reduc_fn, 3, reduc_var,
6263 def0, mask);
6264 else
6265 new_stmt = gimple_build_call_internal (reduc_fn, 2, reduc_var,
6266 def0);
6267 /* For chained SLP reductions the output of the previous reduction
6268 operation serves as the input of the next. For the final statement
6269 the output cannot be a temporary - we reuse the original
6270 scalar destination of the last statement. */
6271 if (i != vec_num - 1)
6273 gimple_set_lhs (new_stmt, scalar_dest_var);
6274 reduc_var = make_ssa_name (scalar_dest_var, new_stmt);
6275 gimple_set_lhs (new_stmt, reduc_var);
6278 else
6280 reduc_var = vect_expand_fold_left (gsi, scalar_dest_var, code,
6281 reduc_var, def0);
6282 new_stmt = SSA_NAME_DEF_STMT (reduc_var);
6283 /* Remove the statement, so that we can use the same code paths
6284 as for statements that we've just created. */
6285 gimple_stmt_iterator tmp_gsi = gsi_for_stmt (new_stmt);
6286 gsi_remove (&tmp_gsi, true);
6289 if (i == vec_num - 1)
6291 gimple_set_lhs (new_stmt, scalar_dest);
6292 vect_finish_replace_stmt (loop_vinfo,
6293 scalar_dest_def_info,
6294 new_stmt);
6296 else
6297 vect_finish_stmt_generation (loop_vinfo,
6298 scalar_dest_def_info,
6299 new_stmt, gsi);
6301 if (slp_node)
6302 SLP_TREE_VEC_STMTS (slp_node).quick_push (new_stmt);
6303 else
6305 STMT_VINFO_VEC_STMTS (stmt_info).safe_push (new_stmt);
6306 *vec_stmt = new_stmt;
6310 return true;
6313 /* Function is_nonwrapping_integer_induction.
6315 Check if STMT_VINFO (which is part of loop LOOP) both increments and
6316 does not cause overflow.
6318 static bool
6319 is_nonwrapping_integer_induction (stmt_vec_info stmt_vinfo, class loop *loop)
6321 gphi *phi = as_a <gphi *> (stmt_vinfo->stmt);
6322 tree base = STMT_VINFO_LOOP_PHI_EVOLUTION_BASE_UNCHANGED (stmt_vinfo);
6323 tree step = STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_vinfo);
6324 tree lhs_type = TREE_TYPE (gimple_phi_result (phi));
6325 widest_int ni, max_loop_value, lhs_max;
6326 wi::overflow_type overflow = wi::OVF_NONE;
6328 /* Make sure the loop is integer based. */
6329 if (TREE_CODE (base) != INTEGER_CST
6330 || TREE_CODE (step) != INTEGER_CST)
6331 return false;
6333 /* Check that the max size of the loop will not wrap. */
6335 if (TYPE_OVERFLOW_UNDEFINED (lhs_type))
6336 return true;
6338 if (! max_stmt_executions (loop, &ni))
6339 return false;
6341 max_loop_value = wi::mul (wi::to_widest (step), ni, TYPE_SIGN (lhs_type),
6342 &overflow);
6343 if (overflow)
6344 return false;
6346 max_loop_value = wi::add (wi::to_widest (base), max_loop_value,
6347 TYPE_SIGN (lhs_type), &overflow);
6348 if (overflow)
6349 return false;
6351 return (wi::min_precision (max_loop_value, TYPE_SIGN (lhs_type))
6352 <= TYPE_PRECISION (lhs_type));
6355 /* Check if masking can be supported by inserting a conditional expression.
6356 CODE is the code for the operation. COND_FN is the conditional internal
6357 function, if it exists. VECTYPE_IN is the type of the vector input. */
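/* Note the inverted logic: if the target already provides a usable
   conditional internal function for CODE we return false, since masking
   will go through COND_FN instead; only DOT_PROD_EXPR and SAD_EXPR can
   be masked by rewriting an operand with a VEC_COND_EXPR. */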
6358 static bool
6359 use_mask_by_cond_expr_p (enum tree_code code, internal_fn cond_fn,
6360 tree vectype_in)
6362 if (cond_fn != IFN_LAST
6363 && direct_internal_fn_supported_p (cond_fn, vectype_in,
6364 OPTIMIZE_FOR_SPEED))
6365 return false;
6367 switch (code)
6369 case DOT_PROD_EXPR:
6370 case SAD_EXPR:
6371 return true;
6373 default:
6374 return false;
6378 /* Insert a conditional expression to enable masked vectorization. CODE is the
6379 code for the operation. VOP is the array of operands. MASK is the loop
6380 mask. GSI is a statement iterator used to place the new conditional
6381 expression. */
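/* For DOT_PROD_EXPR the inactive lanes of operand 1 are replaced with
   zero, so they contribute nothing to the dot product; for SAD_EXPR they
   are replaced with operand 0, making the absolute difference (and hence
   the contribution of those lanes) zero. */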
6382 static void
6383 build_vect_cond_expr (enum tree_code code, tree vop[3], tree mask,
6384 gimple_stmt_iterator *gsi)
6386 switch (code)
6388 case DOT_PROD_EXPR:
6390 tree vectype = TREE_TYPE (vop[1]);
6391 tree zero = build_zero_cst (vectype);
6392 tree masked_op1 = make_temp_ssa_name (vectype, NULL, "masked_op1");
6393 gassign *select = gimple_build_assign (masked_op1, VEC_COND_EXPR,
6394 mask, vop[1], zero);
6395 gsi_insert_before (gsi, select, GSI_SAME_STMT);
6396 vop[1] = masked_op1;
6397 break;
6400 case SAD_EXPR:
6402 tree vectype = TREE_TYPE (vop[1]);
6403 tree masked_op1 = make_temp_ssa_name (vectype, NULL, "masked_op1");
6404 gassign *select = gimple_build_assign (masked_op1, VEC_COND_EXPR,
6405 mask, vop[1], vop[0]);
6406 gsi_insert_before (gsi, select, GSI_SAME_STMT);
6407 vop[1] = masked_op1;
6408 break;
6411 default:
6412 gcc_unreachable ();
6416 /* Function vectorizable_reduction.
6418 Check if STMT_INFO performs a reduction operation that can be vectorized.
6419 If VEC_STMT is also passed, vectorize STMT_INFO: create a vectorized
6420 stmt to replace it, put it in VEC_STMT, and insert it at GSI.
6421 Return true if STMT_INFO is vectorizable in this way.
6423 This function also handles reduction idioms (patterns) that have been
6424 recognized in advance during vect_pattern_recog. In this case, STMT_INFO
6425 may be of this form:
6426 X = pattern_expr (arg0, arg1, ..., X)
6427 and its STMT_VINFO_RELATED_STMT points to the last stmt in the original
6428 sequence that had been detected and replaced by the pattern-stmt
6429 (STMT_INFO).
6431 This function also handles reduction of condition expressions, for example:
6432 for (int i = 0; i < N; i++)
6433 if (a[i] < value)
6434 last = a[i];
6435 This is handled by vectorising the loop and creating an additional vector
6436 containing the loop indexes for which "a[i] < value" was true. In the
6437 function epilogue this is reduced to a single max value and then used to
6438 index into the vector of results.
6440 In some cases of reduction patterns, the type of the reduction variable X is
6441 different than the type of the other arguments of STMT_INFO.
6442 In such cases, the vectype that is used when transforming STMT_INFO into
6443 a vector stmt is different than the vectype that is used to determine the
6444 vectorization factor, because it consists of a different number of elements
6445 than the actual number of elements that are being operated upon in parallel.
6447 For example, consider an accumulation of shorts into an int accumulator.
6448 On some targets it's possible to vectorize this pattern operating on 8
6449 shorts at a time (hence, the vectype for purposes of determining the
6450 vectorization factor should be V8HI); on the other hand, the vectype that
6451 is used to create the vector form is actually V4SI (the type of the result).
6453 Upon entry to this function, STMT_VINFO_VECTYPE records the vectype that
6454 indicates what is the actual level of parallelism (V8HI in the example), so
6455 that the right vectorization factor would be derived. This vectype
6456 corresponds to the type of arguments to the reduction stmt, and should *NOT*
6457 be used to create the vectorized stmt. The right vectype for the vectorized
6458 stmt is obtained from the type of the result X:
6459 get_vectype_for_scalar_type (vinfo, TREE_TYPE (X))
6461 This means that, contrary to "regular" reductions (or "regular" stmts in
6462 general), the following equation:
6463 STMT_VINFO_VECTYPE == get_vectype_for_scalar_type (vinfo, TREE_TYPE (X))
6464 does *NOT* necessarily hold for reduction patterns. */
6466 bool
6467 vectorizable_reduction (loop_vec_info loop_vinfo,
6468 stmt_vec_info stmt_info, slp_tree slp_node,
6469 slp_instance slp_node_instance,
6470 stmt_vector_for_cost *cost_vec)
6472 tree scalar_dest;
6473 tree vectype_in = NULL_TREE;
6474 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
6475 enum vect_def_type cond_reduc_dt = vect_unknown_def_type;
6476 stmt_vec_info cond_stmt_vinfo = NULL;
6477 tree scalar_type;
6478 int i;
6479 int ncopies;
6480 bool single_defuse_cycle = false;
6481 bool nested_cycle = false;
6482 bool double_reduc = false;
6483 int vec_num;
6484 tree tem;
6485 tree cr_index_scalar_type = NULL_TREE, cr_index_vector_type = NULL_TREE;
6486 tree cond_reduc_val = NULL_TREE;
6488 /* Make sure it was already recognized as a reduction computation. */
6489 if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_reduction_def
6490 && STMT_VINFO_DEF_TYPE (stmt_info) != vect_double_reduction_def
6491 && STMT_VINFO_DEF_TYPE (stmt_info) != vect_nested_cycle)
6492 return false;
6494 /* The stmt we store reduction analysis meta on. */
6495 stmt_vec_info reduc_info = info_for_reduction (loop_vinfo, stmt_info);
6496 reduc_info->is_reduc_info = true;
6498 if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_nested_cycle)
6500 if (is_a <gphi *> (stmt_info->stmt))
6502 if (slp_node)
6504 /* We eventually need to set a vector type on invariant
6505 arguments. */
6506 unsigned j;
6507 slp_tree child;
6508 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (slp_node), j, child)
6509 if (!vect_maybe_update_slp_op_vectype
6510 (child, SLP_TREE_VECTYPE (slp_node)))
6512 if (dump_enabled_p ())
6513 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6514 "incompatible vector types for "
6515 "invariants\n");
6516 return false;
6519 /* Analysis for double-reduction is done on the outer
6520 loop PHI, nested cycles have no further restrictions. */
6521 STMT_VINFO_TYPE (stmt_info) = cycle_phi_info_type;
6523 else
6524 STMT_VINFO_TYPE (stmt_info) = reduc_vec_info_type;
6525 return true;
6528 stmt_vec_info orig_stmt_of_analysis = stmt_info;
6529 stmt_vec_info phi_info = stmt_info;
6530 if (!is_a <gphi *> (stmt_info->stmt))
6532 STMT_VINFO_TYPE (stmt_info) = reduc_vec_info_type;
6533 return true;
6535 if (slp_node)
6537 slp_node_instance->reduc_phis = slp_node;
6538 /* ??? We're leaving slp_node to point to the PHIs; we only
6539 need it to get at the number of vector stmts, which wasn't
6540 yet initialized for the instance root. */
6542 if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_reduction_def)
6543 stmt_info = vect_stmt_to_vectorize (STMT_VINFO_REDUC_DEF (stmt_info));
6544 else
6546 gcc_assert (STMT_VINFO_DEF_TYPE (stmt_info)
6547 == vect_double_reduction_def);
6548 use_operand_p use_p;
6549 gimple *use_stmt;
6550 bool res = single_imm_use (gimple_phi_result (stmt_info->stmt),
6551 &use_p, &use_stmt);
6552 gcc_assert (res);
6553 phi_info = loop_vinfo->lookup_stmt (use_stmt);
6554 stmt_info = vect_stmt_to_vectorize (STMT_VINFO_REDUC_DEF (phi_info));
6557 /* PHIs should not participate in patterns. */
6558 gcc_assert (!STMT_VINFO_RELATED_STMT (phi_info));
6559 gphi *reduc_def_phi = as_a <gphi *> (phi_info->stmt);
6561 /* Verify that following REDUC_IDX from the latch def leads us back to the PHI
6562 and compute the reduction chain length. Discover the real
6563 reduction operation stmt on the way (stmt_info and slp_for_stmt_info). */
6564 tree reduc_def
6565 = PHI_ARG_DEF_FROM_EDGE (reduc_def_phi,
6566 loop_latch_edge
6567 (gimple_bb (reduc_def_phi)->loop_father));
6568 unsigned reduc_chain_length = 0;
6569 bool only_slp_reduc_chain = true;
6570 stmt_info = NULL;
6571 slp_tree slp_for_stmt_info = slp_node ? slp_node_instance->root : NULL;
6572 while (reduc_def != PHI_RESULT (reduc_def_phi))
6574 stmt_vec_info def = loop_vinfo->lookup_def (reduc_def);
6575 stmt_vec_info vdef = vect_stmt_to_vectorize (def);
6576 if (STMT_VINFO_REDUC_IDX (vdef) == -1)
6578 if (dump_enabled_p ())
6579 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6580 "reduction chain broken by patterns.\n");
6581 return false;
6583 if (!REDUC_GROUP_FIRST_ELEMENT (vdef))
6584 only_slp_reduc_chain = false;
6585 /* ??? For epilogue generation live members of the chain need
6586 to point back to the PHI via their original stmt for
6587 info_for_reduction to work. */
6588 if (STMT_VINFO_LIVE_P (vdef))
6589 STMT_VINFO_REDUC_DEF (def) = phi_info;
6590 gassign *assign = dyn_cast <gassign *> (vdef->stmt);
6591 if (!assign)
6593 if (dump_enabled_p ())
6594 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6595 "reduction chain includes calls.\n");
6596 return false;
6598 if (CONVERT_EXPR_CODE_P (gimple_assign_rhs_code (assign)))
6600 if (!tree_nop_conversion_p (TREE_TYPE (gimple_assign_lhs (assign)),
6601 TREE_TYPE (gimple_assign_rhs1 (assign))))
6603 if (dump_enabled_p ())
6604 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6605 "conversion in the reduction chain.\n");
6606 return false;
6609 else if (!stmt_info)
6610 /* First non-conversion stmt. */
6611 stmt_info = vdef;
6612 reduc_def = gimple_op (vdef->stmt, 1 + STMT_VINFO_REDUC_IDX (vdef));
6613 reduc_chain_length++;
6614 if (!stmt_info && slp_node)
6615 slp_for_stmt_info = SLP_TREE_CHILDREN (slp_for_stmt_info)[0];
6617 /* PHIs should not participate in patterns. */
6618 gcc_assert (!STMT_VINFO_RELATED_STMT (phi_info));
6620 if (nested_in_vect_loop_p (loop, stmt_info))
6622 loop = loop->inner;
6623 nested_cycle = true;
6626 /* STMT_VINFO_REDUC_DEF doesn't point to the first but the last
6627 element. */
6628 if (slp_node && REDUC_GROUP_FIRST_ELEMENT (stmt_info))
6630 gcc_assert (!REDUC_GROUP_NEXT_ELEMENT (stmt_info));
6631 stmt_info = REDUC_GROUP_FIRST_ELEMENT (stmt_info);
6633 if (REDUC_GROUP_FIRST_ELEMENT (stmt_info))
6634 gcc_assert (slp_node
6635 && REDUC_GROUP_FIRST_ELEMENT (stmt_info) == stmt_info);
6637 /* 1. Is vectorizable reduction? */
6638 /* Not supportable if the reduction variable is used in the loop, unless
6639 it's a reduction chain. */
6640 if (STMT_VINFO_RELEVANT (stmt_info) > vect_used_in_outer
6641 && !REDUC_GROUP_FIRST_ELEMENT (stmt_info))
6642 return false;
6644 /* Reductions that are not used even in an enclosing outer-loop
6645 are expected to be "live" (used out of the loop). */
6646 if (STMT_VINFO_RELEVANT (stmt_info) == vect_unused_in_scope
6647 && !STMT_VINFO_LIVE_P (stmt_info))
6648 return false;
6650 /* 2. Has this been recognized as a reduction pattern?
6652 Check if STMT represents a pattern that has been recognized
6653 in earlier analysis stages. For stmts that represent a pattern,
6654 the STMT_VINFO_RELATED_STMT field records the last stmt in
6655 the original sequence that constitutes the pattern. */
6657 stmt_vec_info orig_stmt_info = STMT_VINFO_RELATED_STMT (stmt_info);
6658 if (orig_stmt_info)
6660 gcc_assert (STMT_VINFO_IN_PATTERN_P (orig_stmt_info));
6661 gcc_assert (!STMT_VINFO_IN_PATTERN_P (stmt_info));
6664 /* 3. Check the operands of the operation. The first operands are defined
6665 inside the loop body. The last operand is the reduction variable,
6666 which is defined by the loop-header-phi. */
6668 tree vectype_out = STMT_VINFO_VECTYPE (stmt_info);
6669 STMT_VINFO_REDUC_VECTYPE (reduc_info) = vectype_out;
6670 gassign *stmt = as_a <gassign *> (stmt_info->stmt);
6671 enum tree_code code = gimple_assign_rhs_code (stmt);
6672 bool lane_reduc_code_p
6673 = (code == DOT_PROD_EXPR || code == WIDEN_SUM_EXPR || code == SAD_EXPR);
6674 int op_type = TREE_CODE_LENGTH (code);
6675 enum optab_subtype optab_query_kind = optab_vector;
6676 if (code == DOT_PROD_EXPR
6677 && TYPE_SIGN (TREE_TYPE (gimple_assign_rhs1 (stmt)))
6678 != TYPE_SIGN (TREE_TYPE (gimple_assign_rhs2 (stmt))))
6679 optab_query_kind = optab_vector_mixed_sign;
6682 scalar_dest = gimple_assign_lhs (stmt);
6683 scalar_type = TREE_TYPE (scalar_dest);
6684 if (!POINTER_TYPE_P (scalar_type) && !INTEGRAL_TYPE_P (scalar_type)
6685 && !SCALAR_FLOAT_TYPE_P (scalar_type))
6686 return false;
6688 /* Do not try to vectorize bit-precision reductions. */
6689 if (!type_has_mode_precision_p (scalar_type))
6690 return false;
6692 /* For lane-reducing ops we're reducing the number of reduction PHIs
6693 which means the only use of that may be in the lane-reducing operation. */
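/* "Lane-reducing" means DOT_PROD_EXPR, WIDEN_SUM_EXPR and SAD_EXPR:
   several narrow input lanes are accumulated into each lane of a
   wider accumulator vector (e.g. a dot-product of two V8HI vectors
   accumulated into V4SI), so the accumulator has fewer lanes than
   the inputs.  */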
6694 if (lane_reduc_code_p
6695 && reduc_chain_length != 1
6696 && !only_slp_reduc_chain)
6698 if (dump_enabled_p ())
6699 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6700 "lane-reducing reduction with extra stmts.\n");
6701 return false;
6704 /* All uses but the last are expected to be defined in the loop.
6705 The last use is the reduction variable. In case of nested cycle this
6706 assumption is not true: we use reduc_index to record the index of the
6707 reduction variable. */
6708 slp_tree *slp_op = XALLOCAVEC (slp_tree, op_type);
6709 /* We need to skip an extra operand for COND_EXPRs with embedded
6710 comparison. */
6711 unsigned opno_adjust = 0;
6712 if (code == COND_EXPR
6713 && COMPARISON_CLASS_P (gimple_assign_rhs1 (stmt)))
6714 opno_adjust = 1;
6715 for (i = 0; i < op_type; i++)
6717 /* The condition of COND_EXPR is checked in vectorizable_condition(). */
6718 if (i == 0 && code == COND_EXPR)
6719 continue;
6721 stmt_vec_info def_stmt_info;
6722 enum vect_def_type dt;
6723 tree op;
6724 if (!vect_is_simple_use (loop_vinfo, stmt_info, slp_for_stmt_info,
6725 i + opno_adjust, &op, &slp_op[i], &dt, &tem,
6726 &def_stmt_info))
6728 if (dump_enabled_p ())
6729 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6730 "use not simple.\n");
6731 return false;
6733 if (i == STMT_VINFO_REDUC_IDX (stmt_info))
6734 continue;
6736 /* There should be only one cycle def in the stmt, the one
6737 leading to reduc_def. */
6738 if (VECTORIZABLE_CYCLE_DEF (dt))
6739 return false;
6741 /* To properly compute ncopies we are interested in the widest
6742 non-reduction input type in case we're looking at a widening
6743 accumulation that we later handle in vect_transform_reduction. */
6744 if (lane_reduc_code_p
6745 && tem
6746 && (!vectype_in
6747 || (GET_MODE_SIZE (SCALAR_TYPE_MODE (TREE_TYPE (vectype_in)))
6748 < GET_MODE_SIZE (SCALAR_TYPE_MODE (TREE_TYPE (tem))))))
6749 vectype_in = tem;
6751 if (code == COND_EXPR)
6753 /* Record how the non-reduction-def value of COND_EXPR is defined. */
6754 if (dt == vect_constant_def)
6756 cond_reduc_dt = dt;
6757 cond_reduc_val = op;
6759 if (dt == vect_induction_def
6760 && def_stmt_info
6761 && is_nonwrapping_integer_induction (def_stmt_info, loop))
6763 cond_reduc_dt = dt;
6764 cond_stmt_vinfo = def_stmt_info;
6768 if (!vectype_in)
6769 vectype_in = STMT_VINFO_VECTYPE (phi_info);
6770 STMT_VINFO_REDUC_VECTYPE_IN (reduc_info) = vectype_in;
6772 enum vect_reduction_type v_reduc_type = STMT_VINFO_REDUC_TYPE (phi_info);
6773 STMT_VINFO_REDUC_TYPE (reduc_info) = v_reduc_type;
6774 /* If we have a condition reduction, see if we can simplify it further. */
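/* A condition reduction looks like

     for (i = 0; i < n; ++i)
       if (a[i] < val)
	 last = i;

   i.e. the reduction value only changes on iterations for which the
   condition holds (illustrative example; the value assigned may also
   be a constant or some other loop value).  */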
6775 if (v_reduc_type == COND_REDUCTION)
6777 if (slp_node)
6778 return false;
6780 /* When the condition uses the reduction value in the condition, fail. */
6781 if (STMT_VINFO_REDUC_IDX (stmt_info) == 0)
6783 if (dump_enabled_p ())
6784 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6785 "condition depends on previous iteration\n");
6786 return false;
6789 if (reduc_chain_length == 1
6790 && direct_internal_fn_supported_p (IFN_FOLD_EXTRACT_LAST,
6791 vectype_in, OPTIMIZE_FOR_SPEED))
6793 if (dump_enabled_p ())
6794 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6795 "optimizing condition reduction with"
6796 " FOLD_EXTRACT_LAST.\n");
6797 STMT_VINFO_REDUC_TYPE (reduc_info) = EXTRACT_LAST_REDUCTION;
6799 else if (cond_reduc_dt == vect_induction_def)
6801 tree base
6802 = STMT_VINFO_LOOP_PHI_EVOLUTION_BASE_UNCHANGED (cond_stmt_vinfo);
6803 tree step = STMT_VINFO_LOOP_PHI_EVOLUTION_PART (cond_stmt_vinfo);
6805 gcc_assert (TREE_CODE (base) == INTEGER_CST
6806 && TREE_CODE (step) == INTEGER_CST);
6807 cond_reduc_val = NULL_TREE;
6808 enum tree_code cond_reduc_op_code = ERROR_MARK;
6809 tree res = PHI_RESULT (STMT_VINFO_STMT (cond_stmt_vinfo));
6810 if (!types_compatible_p (TREE_TYPE (res), TREE_TYPE (base)))
6812 /* Find a suitable value, for MAX_EXPR below base, for MIN_EXPR
6813 above base; punt if base is the minimum value of the type for
6814 MAX_EXPR or maximum value of the type for MIN_EXPR for now. */
6815 else if (tree_int_cst_sgn (step) == -1)
6817 cond_reduc_op_code = MIN_EXPR;
6818 if (tree_int_cst_sgn (base) == -1)
6819 cond_reduc_val = build_int_cst (TREE_TYPE (base), 0);
6820 else if (tree_int_cst_lt (base,
6821 TYPE_MAX_VALUE (TREE_TYPE (base))))
6822 cond_reduc_val
6823 = int_const_binop (PLUS_EXPR, base, integer_one_node);
6825 else
6827 cond_reduc_op_code = MAX_EXPR;
6828 if (tree_int_cst_sgn (base) == 1)
6829 cond_reduc_val = build_int_cst (TREE_TYPE (base), 0);
6830 else if (tree_int_cst_lt (TYPE_MIN_VALUE (TREE_TYPE (base)),
6831 base))
6832 cond_reduc_val
6833 = int_const_binop (MINUS_EXPR, base, integer_one_node);
6835 if (cond_reduc_val)
6837 if (dump_enabled_p ())
6838 dump_printf_loc (MSG_NOTE, vect_location,
6839 "condition expression based on "
6840 "integer induction.\n");
6841 STMT_VINFO_REDUC_CODE (reduc_info) = cond_reduc_op_code;
6842 STMT_VINFO_VEC_INDUC_COND_INITIAL_VAL (reduc_info)
6843 = cond_reduc_val;
6844 STMT_VINFO_REDUC_TYPE (reduc_info) = INTEGER_INDUC_COND_REDUCTION;
6847 else if (cond_reduc_dt == vect_constant_def)
6849 enum vect_def_type cond_initial_dt;
6850 tree cond_initial_val = vect_phi_initial_value (reduc_def_phi);
6851 vect_is_simple_use (cond_initial_val, loop_vinfo, &cond_initial_dt);
6852 if (cond_initial_dt == vect_constant_def
6853 && types_compatible_p (TREE_TYPE (cond_initial_val),
6854 TREE_TYPE (cond_reduc_val)))
6856 tree e = fold_binary (LE_EXPR, boolean_type_node,
6857 cond_initial_val, cond_reduc_val);
6858 if (e && (integer_onep (e) || integer_zerop (e)))
6860 if (dump_enabled_p ())
6861 dump_printf_loc (MSG_NOTE, vect_location,
6862 "condition expression based on "
6863 "compile time constant.\n");
6864 /* Record reduction code at analysis stage. */
6865 STMT_VINFO_REDUC_CODE (reduc_info)
6866 = integer_onep (e) ? MAX_EXPR : MIN_EXPR;
6867 STMT_VINFO_REDUC_TYPE (reduc_info) = CONST_COND_REDUCTION;
6873 if (STMT_VINFO_LIVE_P (phi_info))
6874 return false;
6876 if (slp_node)
6877 ncopies = 1;
6878 else
6879 ncopies = vect_get_num_copies (loop_vinfo, vectype_in);
6881 gcc_assert (ncopies >= 1);
6883 poly_uint64 nunits_out = TYPE_VECTOR_SUBPARTS (vectype_out);
6885 if (nested_cycle)
6887 gcc_assert (STMT_VINFO_DEF_TYPE (reduc_info)
6888 == vect_double_reduction_def);
6889 double_reduc = true;
6892 /* 4.2. Check support for the epilog operation.
6894 If STMT represents a reduction pattern, then the type of the
6895 reduction variable may be different than the type of the rest
6896 of the arguments. For example, consider the case of accumulation
6897 of shorts into an int accumulator; the original code:
6898 S1: int_a = (int) short_a;
6899 orig_stmt-> S2: int_acc = plus <int_a ,int_acc>;
6901 was replaced with:
6902 STMT: int_acc = widen_sum <short_a, int_acc>
6904 This means that:
6905 1. The tree-code that is used to create the vector operation in the
6906 epilog code (that reduces the partial results) is not the
6907 tree-code of STMT, but is rather the tree-code of the original
6908 stmt from the pattern that STMT is replacing. I.e., in the example
6909 above we want to use 'widen_sum' in the loop, but 'plus' in the
6910 epilog.
6911 2. The type (mode) we use to check available target support
6912 for the vector operation to be created in the *epilog*, is
6913 determined by the type of the reduction variable (in the example
6914 above we'd check this: optab_handler (plus_optab, vect_int_mode)).
6915 However the type (mode) we use to check available target support
6916 for the vector operation to be created *inside the loop*, is
6917 determined by the type of the other arguments to STMT (in the
6918 example we'd check this: optab_handler (widen_sum_optab,
6919 vect_short_mode)).
6921 This is contrary to "regular" reductions, in which the types of all
6922 the arguments are the same as the type of the reduction variable.
6923 For "regular" reductions we can therefore use the same vector type
6924 (and also the same tree-code) when generating the epilog code and
6925 when generating the code inside the loop. */
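/* In other words, for the widen_sum example the in-loop statement is
   checked against the narrow vector type (that of short_a) while the
   epilogue combines the partial results with a plain addition on the
   wide vector type (that of int_acc); orig_code below records that
   epilogue operation.  */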
6927 enum tree_code orig_code = STMT_VINFO_REDUC_CODE (phi_info);
6928 STMT_VINFO_REDUC_CODE (reduc_info) = orig_code;
6930 vect_reduction_type reduction_type = STMT_VINFO_REDUC_TYPE (reduc_info);
6931 if (reduction_type == TREE_CODE_REDUCTION)
6933 /* Check whether it's ok to change the order of the computation.
6934 Generally, when vectorizing a reduction we change the order of the
6935 computation. This may change the behavior of the program in some
6936 cases, so we need to check that this is ok. One exception is when
6937 vectorizing an outer-loop: the inner-loop is executed sequentially,
6938 and therefore vectorizing reductions in the inner-loop during
6939 outer-loop vectorization is safe. Likewise when we are vectorizing
6940 a series of reductions using SLP and the VF is one the reductions
6941 are performed in scalar order. */
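/* For example, a summation of floats compiled without -ffast-math
   (or -fassociative-math) must not be reassociated;
   needs_fold_left_reduction_p detects that case below and the
   reduction is then carried out strictly in order via
   FOLD_LEFT_REDUCTION.  */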
6942 if (slp_node
6943 && !REDUC_GROUP_FIRST_ELEMENT (stmt_info)
6944 && known_eq (LOOP_VINFO_VECT_FACTOR (loop_vinfo), 1u))
6946 else if (needs_fold_left_reduction_p (scalar_type, orig_code))
6948 /* When vectorizing a reduction chain w/o SLP the reduction PHI
6949 is not directly used in the stmt. */
6950 if (!only_slp_reduc_chain
6951 && reduc_chain_length != 1)
6953 if (dump_enabled_p ())
6954 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6955 "in-order reduction chain without SLP.\n");
6956 return false;
6958 STMT_VINFO_REDUC_TYPE (reduc_info)
6959 = reduction_type = FOLD_LEFT_REDUCTION;
6961 else if (!commutative_tree_code (orig_code)
6962 || !associative_tree_code (orig_code))
6964 if (dump_enabled_p ())
6965 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6966 "reduction: not commutative/associative");
6967 return false;
6971 if ((double_reduc || reduction_type != TREE_CODE_REDUCTION)
6972 && ncopies > 1)
6974 if (dump_enabled_p ())
6975 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6976 "multiple types in double reduction or condition "
6977 "reduction or fold-left reduction.\n");
6978 return false;
6981 internal_fn reduc_fn = IFN_LAST;
6982 if (reduction_type == TREE_CODE_REDUCTION
6983 || reduction_type == FOLD_LEFT_REDUCTION
6984 || reduction_type == INTEGER_INDUC_COND_REDUCTION
6985 || reduction_type == CONST_COND_REDUCTION)
6987 if (reduction_type == FOLD_LEFT_REDUCTION
6988 ? fold_left_reduction_fn (orig_code, &reduc_fn)
6989 : reduction_fn_for_scalar_code (orig_code, &reduc_fn))
6991 if (reduc_fn != IFN_LAST
6992 && !direct_internal_fn_supported_p (reduc_fn, vectype_out,
6993 OPTIMIZE_FOR_SPEED))
6995 if (dump_enabled_p ())
6996 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6997 "reduc op not supported by target.\n");
6999 reduc_fn = IFN_LAST;
7002 else
7004 if (!nested_cycle || double_reduc)
7006 if (dump_enabled_p ())
7007 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7008 "no reduc code for scalar code.\n");
7010 return false;
7014 else if (reduction_type == COND_REDUCTION)
7016 int scalar_precision
7017 = GET_MODE_PRECISION (SCALAR_TYPE_MODE (scalar_type));
7018 cr_index_scalar_type = make_unsigned_type (scalar_precision);
7019 cr_index_vector_type = get_same_sized_vectype (cr_index_scalar_type,
7020 vectype_out);
7022 if (direct_internal_fn_supported_p (IFN_REDUC_MAX, cr_index_vector_type,
7023 OPTIMIZE_FOR_SPEED))
7024 reduc_fn = IFN_REDUC_MAX;
7026 STMT_VINFO_REDUC_FN (reduc_info) = reduc_fn;
7028 if (reduction_type != EXTRACT_LAST_REDUCTION
7029 && (!nested_cycle || double_reduc)
7030 && reduc_fn == IFN_LAST
7031 && !nunits_out.is_constant ())
7033 if (dump_enabled_p ())
7034 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7035 "missing target support for reduction on"
7036 " variable-length vectors.\n");
7037 return false;
7040 /* For SLP reductions, see if there is a neutral value we can use. */
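/* The neutral value is the identity of the reduction operation,
   e.g. 0 for addition, 1 for multiplication, all-ones for bitwise
   AND; for MIN and MAX the initial value itself is used.  */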
7041 tree neutral_op = NULL_TREE;
7042 if (slp_node)
7044 tree initial_value = NULL_TREE;
7045 if (REDUC_GROUP_FIRST_ELEMENT (stmt_info) != NULL)
7046 initial_value = vect_phi_initial_value (reduc_def_phi);
7047 neutral_op = neutral_op_for_reduction (TREE_TYPE (vectype_out),
7048 orig_code, initial_value);
7051 if (double_reduc && reduction_type == FOLD_LEFT_REDUCTION)
7053 /* We can't support in-order reductions of code such as this:
7055 for (int i = 0; i < n1; ++i)
7056 for (int j = 0; j < n2; ++j)
7057 l += a[j];
7059 since GCC effectively transforms the loop when vectorizing:
7061 for (int i = 0; i < n1 / VF; ++i)
7062 for (int j = 0; j < n2; ++j)
7063 for (int k = 0; k < VF; ++k)
7064 l += a[j];
7066 which is a reassociation of the original operation. */
7067 if (dump_enabled_p ())
7068 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7069 "in-order double reduction not supported.\n");
7071 return false;
7074 if (reduction_type == FOLD_LEFT_REDUCTION
7075 && slp_node
7076 && !REDUC_GROUP_FIRST_ELEMENT (stmt_info))
7078 /* We cannot use in-order reductions in this case because there is
7079 an implicit reassociation of the operations involved. */
7080 if (dump_enabled_p ())
7081 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7082 "in-order unchained SLP reductions not supported.\n");
7083 return false;
7086 /* For double reductions, and for SLP reductions with a neutral value,
7087 we construct a variable-length initial vector by loading a vector
7088 full of the neutral value and then shift-and-inserting the start
7089 values into the low-numbered elements. */
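/* Schematically: start from { n, n, ..., n } with n the neutral value
   and shift-and-insert the start value s to get { s, n, ..., n }.  */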
7090 if ((double_reduc || neutral_op)
7091 && !nunits_out.is_constant ()
7092 && !direct_internal_fn_supported_p (IFN_VEC_SHL_INSERT,
7093 vectype_out, OPTIMIZE_FOR_SPEED))
7095 if (dump_enabled_p ())
7096 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7097 "reduction on variable-length vectors requires"
7098 " target support for a vector-shift-and-insert"
7099 " operation.\n");
7100 return false;
7103 /* Check extra constraints for variable-length unchained SLP reductions. */
7104 if (STMT_SLP_TYPE (stmt_info)
7105 && !REDUC_GROUP_FIRST_ELEMENT (stmt_info)
7106 && !nunits_out.is_constant ())
7108 /* We checked above that we could build the initial vector when
7109 there's a neutral element value. Check here for the case in
7110 which each SLP statement has its own initial value and in which
7111 that value needs to be repeated for every instance of the
7112 statement within the initial vector. */
7113 unsigned int group_size = SLP_TREE_LANES (slp_node);
7114 if (!neutral_op
7115 && !can_duplicate_and_interleave_p (loop_vinfo, group_size,
7116 TREE_TYPE (vectype_out)))
7118 if (dump_enabled_p ())
7119 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7120 "unsupported form of SLP reduction for"
7121 " variable-length vectors: cannot build"
7122 " initial vector.\n");
7123 return false;
7125 /* The epilogue code relies on the number of elements being a multiple
7126 of the group size. The duplicate-and-interleave approach to setting
7127 up the initial vector does too. */
7128 if (!multiple_p (nunits_out, group_size))
7130 if (dump_enabled_p ())
7131 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7132 "unsupported form of SLP reduction for"
7133 " variable-length vectors: the vector size"
7134 " is not a multiple of the number of results.\n");
7135 return false;
7139 if (reduction_type == COND_REDUCTION)
7141 widest_int ni;
7143 if (! max_loop_iterations (loop, &ni))
7145 if (dump_enabled_p ())
7146 dump_printf_loc (MSG_NOTE, vect_location,
7147 "loop count not known, cannot create cond "
7148 "reduction.\n");
7149 return false;
7151 /* Convert backedges to iterations. */
7152 ni += 1;
7154 /* The additional index will be the same type as the condition. Check
7155 that the loop can fit into this less one (because we'll use up the
7156 zero slot for when there are no matches). */
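/* E.g. with an 8-bit index type (maximum value 255) at most 254
   iterations can be handled, since index zero is reserved for the
   no-match case.  */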
7157 tree max_index = TYPE_MAX_VALUE (cr_index_scalar_type);
7158 if (wi::geu_p (ni, wi::to_widest (max_index)))
7160 if (dump_enabled_p ())
7161 dump_printf_loc (MSG_NOTE, vect_location,
7162 "loop size is greater than data size.\n");
7163 return false;
7167 /* In case the vectorization factor (VF) is bigger than the number
7168 of elements that we can fit in a vectype (nunits), we have to generate
7169 more than one vector stmt - i.e - we need to "unroll" the
7170 vector stmt by a factor VF/nunits. For more details see documentation
7171 in vectorizable_operation. */
7173 /* If the reduction is used in an outer loop we need to generate
7174 VF intermediate results, like so (e.g. for ncopies=2):
7175 r0 = phi (init, r0)
7176 r1 = phi (init, r1)
7177 r0 = x0 + r0;
7178 r1 = x1 + r1;
7179 (i.e. we generate VF results in 2 registers).
7180 In this case we have a separate def-use cycle for each copy, and therefore
7181 for each copy we get the vector def for the reduction variable from the
7182 respective phi node created for this copy.
7184 Otherwise (the reduction is unused in the loop nest), we can combine
7185 together intermediate results, like so (e.g. for ncopies=2):
7186 r = phi (init, r)
7187 r = x0 + r;
7188 r = x1 + r;
7189 (i.e. we generate VF/2 results in a single register).
7190 In this case for each copy we get the vector def for the reduction variable
7191 from the vectorized reduction operation generated in the previous iteration.
7193 This only works when we see both the reduction PHI and its only consumer
7194 in vectorizable_reduction and there are no intermediate stmts
7195 participating. */
7196 if (ncopies > 1
7197 && (STMT_VINFO_RELEVANT (stmt_info) <= vect_used_only_live)
7198 && reduc_chain_length == 1)
7199 single_defuse_cycle = true;
7201 if (single_defuse_cycle || lane_reduc_code_p)
7203 gcc_assert (code != COND_EXPR);
7205 /* 4. Supportable by target? */
7206 bool ok = true;
7208 /* 4.1. check support for the operation in the loop */
7209 optab optab = optab_for_tree_code (code, vectype_in, optab_query_kind);
7210 if (!optab)
7212 if (dump_enabled_p ())
7213 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7214 "no optab.\n");
7215 ok = false;
7218 machine_mode vec_mode = TYPE_MODE (vectype_in);
7219 if (ok && optab_handler (optab, vec_mode) == CODE_FOR_nothing)
7221 if (dump_enabled_p ())
7222 dump_printf (MSG_NOTE, "op not supported by target.\n");
7223 if (maybe_ne (GET_MODE_SIZE (vec_mode), UNITS_PER_WORD)
7224 || !vect_can_vectorize_without_simd_p (code))
7225 ok = false;
7226 else
7227 if (dump_enabled_p ())
7228 dump_printf (MSG_NOTE, "proceeding using word mode.\n");
7231 if (vect_emulated_vector_p (vectype_in)
7232 && !vect_can_vectorize_without_simd_p (code))
7234 if (dump_enabled_p ())
7235 dump_printf (MSG_NOTE, "using word mode not possible.\n");
7236 return false;
7239 /* lane-reducing operations have to go through vect_transform_reduction.
7240 For the other cases try without the single cycle optimization. */
7241 if (!ok)
7243 if (lane_reduc_code_p)
7244 return false;
7245 else
7246 single_defuse_cycle = false;
7249 STMT_VINFO_FORCE_SINGLE_CYCLE (reduc_info) = single_defuse_cycle;
7251 /* If the reduction stmt is one of the patterns that have lane
7252 reduction embedded we cannot handle the case of ! single_defuse_cycle. */
7253 if ((ncopies > 1 && ! single_defuse_cycle)
7254 && lane_reduc_code_p)
7256 if (dump_enabled_p ())
7257 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7258 "multi def-use cycle not possible for lane-reducing "
7259 "reduction operation\n");
7260 return false;
7263 if (slp_node
7264 && !(!single_defuse_cycle
7265 && code != DOT_PROD_EXPR
7266 && code != WIDEN_SUM_EXPR
7267 && code != SAD_EXPR
7268 && reduction_type != FOLD_LEFT_REDUCTION))
7269 for (i = 0; i < op_type; i++)
7270 if (!vect_maybe_update_slp_op_vectype (slp_op[i], vectype_in))
7272 if (dump_enabled_p ())
7273 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7274 "incompatible vector types for invariants\n");
7275 return false;
7278 if (slp_node)
7279 vec_num = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node);
7280 else
7281 vec_num = 1;
7283 vect_model_reduction_cost (loop_vinfo, stmt_info, reduc_fn,
7284 reduction_type, ncopies, cost_vec);
7285 /* Cost the reduction op inside the loop if transformed via
7286 vect_transform_reduction. Otherwise this is costed by the
7287 separate vectorizable_* routines. */
7288 if (single_defuse_cycle
7289 || code == DOT_PROD_EXPR
7290 || code == WIDEN_SUM_EXPR
7291 || code == SAD_EXPR)
7292 record_stmt_cost (cost_vec, ncopies, vector_stmt, stmt_info, 0, vect_body);
7294 if (dump_enabled_p ()
7295 && reduction_type == FOLD_LEFT_REDUCTION)
7296 dump_printf_loc (MSG_NOTE, vect_location,
7297 "using an in-order (fold-left) reduction.\n");
7298 STMT_VINFO_TYPE (orig_stmt_of_analysis) = cycle_phi_info_type;
7299 /* All but single defuse-cycle optimized, lane-reducing and fold-left
7300 reductions go through their own vectorizable_* routines. */
7301 if (!single_defuse_cycle
7302 && code != DOT_PROD_EXPR
7303 && code != WIDEN_SUM_EXPR
7304 && code != SAD_EXPR
7305 && reduction_type != FOLD_LEFT_REDUCTION)
7307 stmt_vec_info tem
7308 = vect_stmt_to_vectorize (STMT_VINFO_REDUC_DEF (phi_info));
7309 if (slp_node && REDUC_GROUP_FIRST_ELEMENT (tem))
7311 gcc_assert (!REDUC_GROUP_NEXT_ELEMENT (tem));
7312 tem = REDUC_GROUP_FIRST_ELEMENT (tem);
7314 STMT_VINFO_DEF_TYPE (vect_orig_stmt (tem)) = vect_internal_def;
7315 STMT_VINFO_DEF_TYPE (tem) = vect_internal_def;
7317 else if (loop_vinfo && LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo))
7319 vec_loop_masks *masks = &LOOP_VINFO_MASKS (loop_vinfo);
7320 internal_fn cond_fn = get_conditional_internal_fn (code);
7322 if (reduction_type != FOLD_LEFT_REDUCTION
7323 && !use_mask_by_cond_expr_p (code, cond_fn, vectype_in)
7324 && (cond_fn == IFN_LAST
7325 || !direct_internal_fn_supported_p (cond_fn, vectype_in,
7326 OPTIMIZE_FOR_SPEED)))
7328 if (dump_enabled_p ())
7329 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7330 "can't operate on partial vectors because"
7331 " no conditional operation is available.\n");
7332 LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo) = false;
7334 else if (reduction_type == FOLD_LEFT_REDUCTION
7335 && reduc_fn == IFN_LAST
7336 && !expand_vec_cond_expr_p (vectype_in,
7337 truth_type_for (vectype_in),
7338 SSA_NAME))
7340 if (dump_enabled_p ())
7341 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7342 "can't operate on partial vectors because"
7343 " no conditional operation is available.\n");
7344 LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo) = false;
7346 else
7347 vect_record_loop_mask (loop_vinfo, masks, ncopies * vec_num,
7348 vectype_in, NULL);
7350 return true;
7353 /* Transform the definition stmt STMT_INFO of a reduction PHI backedge
7354 value. */
7356 bool
7357 vect_transform_reduction (loop_vec_info loop_vinfo,
7358 stmt_vec_info stmt_info, gimple_stmt_iterator *gsi,
7359 gimple **vec_stmt, slp_tree slp_node)
7361 tree vectype_out = STMT_VINFO_VECTYPE (stmt_info);
7362 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
7363 int i;
7364 int ncopies;
7365 int vec_num;
7367 stmt_vec_info reduc_info = info_for_reduction (loop_vinfo, stmt_info);
7368 gcc_assert (reduc_info->is_reduc_info);
7370 if (nested_in_vect_loop_p (loop, stmt_info))
7372 loop = loop->inner;
7373 gcc_assert (STMT_VINFO_DEF_TYPE (reduc_info) == vect_double_reduction_def);
7376 gassign *stmt = as_a <gassign *> (stmt_info->stmt);
7377 enum tree_code code = gimple_assign_rhs_code (stmt);
7378 int op_type = TREE_CODE_LENGTH (code);
7380 /* Flatten RHS. */
7381 tree ops[3];
7382 switch (get_gimple_rhs_class (code))
7384 case GIMPLE_TERNARY_RHS:
7385 ops[2] = gimple_assign_rhs3 (stmt);
7386 /* Fall thru. */
7387 case GIMPLE_BINARY_RHS:
7388 ops[0] = gimple_assign_rhs1 (stmt);
7389 ops[1] = gimple_assign_rhs2 (stmt);
7390 break;
7391 default:
7392 gcc_unreachable ();
7395 /* All uses but the last are expected to be defined in the loop.
7396 The last use is the reduction variable. In case of nested cycle this
7397 assumption is not true: we use reduc_index to record the index of the
7398 reduction variable. */
7399 stmt_vec_info phi_info = STMT_VINFO_REDUC_DEF (vect_orig_stmt (stmt_info));
7400 gphi *reduc_def_phi = as_a <gphi *> (phi_info->stmt);
7401 int reduc_index = STMT_VINFO_REDUC_IDX (stmt_info);
7402 tree vectype_in = STMT_VINFO_REDUC_VECTYPE_IN (reduc_info);
7404 if (slp_node)
7406 ncopies = 1;
7407 vec_num = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node);
7409 else
7411 ncopies = vect_get_num_copies (loop_vinfo, vectype_in);
7412 vec_num = 1;
7415 internal_fn cond_fn = get_conditional_internal_fn (code);
7416 vec_loop_masks *masks = &LOOP_VINFO_MASKS (loop_vinfo);
7417 bool mask_by_cond_expr = use_mask_by_cond_expr_p (code, cond_fn, vectype_in);
7419 /* Transform. */
7420 tree new_temp = NULL_TREE;
7421 auto_vec<tree> vec_oprnds0;
7422 auto_vec<tree> vec_oprnds1;
7423 auto_vec<tree> vec_oprnds2;
7424 tree def0;
7426 if (dump_enabled_p ())
7427 dump_printf_loc (MSG_NOTE, vect_location, "transform reduction.\n");
7429 /* FORNOW: Multiple types are not supported for condition. */
7430 if (code == COND_EXPR)
7431 gcc_assert (ncopies == 1);
7433 bool masked_loop_p = LOOP_VINFO_FULLY_MASKED_P (loop_vinfo);
7435 vect_reduction_type reduction_type = STMT_VINFO_REDUC_TYPE (reduc_info);
7436 if (reduction_type == FOLD_LEFT_REDUCTION)
7438 internal_fn reduc_fn = STMT_VINFO_REDUC_FN (reduc_info);
7439 return vectorize_fold_left_reduction
7440 (loop_vinfo, stmt_info, gsi, vec_stmt, slp_node, reduc_def_phi, code,
7441 reduc_fn, ops, vectype_in, reduc_index, masks);
7444 bool single_defuse_cycle = STMT_VINFO_FORCE_SINGLE_CYCLE (reduc_info);
7445 gcc_assert (single_defuse_cycle
7446 || code == DOT_PROD_EXPR
7447 || code == WIDEN_SUM_EXPR
7448 || code == SAD_EXPR);
7450 /* Create the destination vector */
7451 tree scalar_dest = gimple_assign_lhs (stmt);
7452 tree vec_dest = vect_create_destination_var (scalar_dest, vectype_out);
7454 vect_get_vec_defs (loop_vinfo, stmt_info, slp_node, ncopies,
7455 single_defuse_cycle && reduc_index == 0
7456 ? NULL_TREE : ops[0], &vec_oprnds0,
7457 single_defuse_cycle && reduc_index == 1
7458 ? NULL_TREE : ops[1], &vec_oprnds1,
7459 op_type == ternary_op
7460 && !(single_defuse_cycle && reduc_index == 2)
7461 ? ops[2] : NULL_TREE, &vec_oprnds2);
7462 if (single_defuse_cycle)
7464 gcc_assert (!slp_node);
7465 vect_get_vec_defs_for_operand (loop_vinfo, stmt_info, 1,
7466 ops[reduc_index],
7467 reduc_index == 0 ? &vec_oprnds0
7468 : (reduc_index == 1 ? &vec_oprnds1
7469 : &vec_oprnds2));
7472 FOR_EACH_VEC_ELT (vec_oprnds0, i, def0)
7474 gimple *new_stmt;
7475 tree vop[3] = { def0, vec_oprnds1[i], NULL_TREE };
7476 if (masked_loop_p && !mask_by_cond_expr)
7478 /* Make sure that the reduction accumulator is vop[0]. */
7479 if (reduc_index == 1)
7481 gcc_assert (commutative_tree_code (code));
7482 std::swap (vop[0], vop[1]);
7484 tree mask = vect_get_loop_mask (gsi, masks, vec_num * ncopies,
7485 vectype_in, i);
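/* Emit the operation as a conditional internal function call
   MASK ? vop[0] CODE vop[1] : vop[0], e.g. .COND_ADD (mask, acc, x, acc)
   for a masked summation, so that inactive lanes keep the accumulator
   value unchanged.  */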
7486 gcall *call = gimple_build_call_internal (cond_fn, 4, mask,
7487 vop[0], vop[1], vop[0]);
7488 new_temp = make_ssa_name (vec_dest, call);
7489 gimple_call_set_lhs (call, new_temp);
7490 gimple_call_set_nothrow (call, true);
7491 vect_finish_stmt_generation (loop_vinfo, stmt_info, call, gsi);
7492 new_stmt = call;
7494 else
7496 if (op_type == ternary_op)
7497 vop[2] = vec_oprnds2[i];
7499 if (masked_loop_p && mask_by_cond_expr)
7501 tree mask = vect_get_loop_mask (gsi, masks, vec_num * ncopies,
7502 vectype_in, i);
7503 build_vect_cond_expr (code, vop, mask, gsi);
7506 new_stmt = gimple_build_assign (vec_dest, code,
7507 vop[0], vop[1], vop[2]);
7508 new_temp = make_ssa_name (vec_dest, new_stmt);
7509 gimple_assign_set_lhs (new_stmt, new_temp);
7510 vect_finish_stmt_generation (loop_vinfo, stmt_info, new_stmt, gsi);
7513 if (slp_node)
7514 SLP_TREE_VEC_STMTS (slp_node).quick_push (new_stmt);
7515 else if (single_defuse_cycle
7516 && i < ncopies - 1)
7518 if (reduc_index == 0)
7519 vec_oprnds0.safe_push (gimple_get_lhs (new_stmt));
7520 else if (reduc_index == 1)
7521 vec_oprnds1.safe_push (gimple_get_lhs (new_stmt));
7522 else if (reduc_index == 2)
7523 vec_oprnds2.safe_push (gimple_get_lhs (new_stmt));
7525 else
7526 STMT_VINFO_VEC_STMTS (stmt_info).safe_push (new_stmt);
7529 if (!slp_node)
7530 *vec_stmt = STMT_VINFO_VEC_STMTS (stmt_info)[0];
7532 return true;
7535 /* Transform phase of a cycle PHI. */
7537 bool
7538 vect_transform_cycle_phi (loop_vec_info loop_vinfo,
7539 stmt_vec_info stmt_info, gimple **vec_stmt,
7540 slp_tree slp_node, slp_instance slp_node_instance)
7542 tree vectype_out = STMT_VINFO_VECTYPE (stmt_info);
7543 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
7544 int i;
7545 int ncopies;
7546 int j;
7547 bool nested_cycle = false;
7548 int vec_num;
7550 if (nested_in_vect_loop_p (loop, stmt_info))
7552 loop = loop->inner;
7553 nested_cycle = true;
7556 stmt_vec_info reduc_stmt_info = STMT_VINFO_REDUC_DEF (stmt_info);
7557 reduc_stmt_info = vect_stmt_to_vectorize (reduc_stmt_info);
7558 stmt_vec_info reduc_info = info_for_reduction (loop_vinfo, stmt_info);
7559 gcc_assert (reduc_info->is_reduc_info);
7561 if (STMT_VINFO_REDUC_TYPE (reduc_info) == EXTRACT_LAST_REDUCTION
7562 || STMT_VINFO_REDUC_TYPE (reduc_info) == FOLD_LEFT_REDUCTION)
7563 /* Leave the scalar phi in place. */
7564 return true;
7566 tree vectype_in = STMT_VINFO_REDUC_VECTYPE_IN (reduc_info);
7567 /* For a nested cycle we do not fill the above. */
7568 if (!vectype_in)
7569 vectype_in = STMT_VINFO_VECTYPE (stmt_info);
7570 gcc_assert (vectype_in);
7572 if (slp_node)
7574 /* The size vect_schedule_slp_instance computes is off for us. */
7575 vec_num = vect_get_num_vectors (LOOP_VINFO_VECT_FACTOR (loop_vinfo)
7576 * SLP_TREE_LANES (slp_node), vectype_in);
7577 ncopies = 1;
7579 else
7581 vec_num = 1;
7582 ncopies = vect_get_num_copies (loop_vinfo, vectype_in);
7585 /* Check whether we should use a single PHI node and accumulate
7586 vectors to one before the backedge. */
7587 if (STMT_VINFO_FORCE_SINGLE_CYCLE (reduc_info))
7588 ncopies = 1;
7590 /* Create the destination vector */
7591 gphi *phi = as_a <gphi *> (stmt_info->stmt);
7592 tree vec_dest = vect_create_destination_var (gimple_phi_result (phi),
7593 vectype_out);
7595 /* Get the loop-entry arguments. */
7596 tree vec_initial_def = NULL_TREE;
7597 auto_vec<tree> vec_initial_defs;
7598 if (slp_node)
7600 vec_initial_defs.reserve (vec_num);
7601 if (nested_cycle)
7603 unsigned phi_idx = loop_preheader_edge (loop)->dest_idx;
7604 vect_get_slp_defs (SLP_TREE_CHILDREN (slp_node)[phi_idx],
7605 &vec_initial_defs);
7607 else
7609 gcc_assert (slp_node == slp_node_instance->reduc_phis);
7610 vec<tree> &initial_values = reduc_info->reduc_initial_values;
7611 vec<stmt_vec_info> &stmts = SLP_TREE_SCALAR_STMTS (slp_node);
7613 unsigned int num_phis = stmts.length ();
7614 if (REDUC_GROUP_FIRST_ELEMENT (reduc_stmt_info))
7615 num_phis = 1;
7616 initial_values.reserve (num_phis);
7617 for (unsigned int i = 0; i < num_phis; ++i)
7619 gphi *this_phi = as_a<gphi *> (stmts[i]->stmt);
7620 initial_values.quick_push (vect_phi_initial_value (this_phi));
7622 if (vec_num == 1)
7623 vect_find_reusable_accumulator (loop_vinfo, reduc_info);
7624 if (!initial_values.is_empty ())
7626 tree initial_value
7627 = (num_phis == 1 ? initial_values[0] : NULL_TREE);
7628 tree_code code = STMT_VINFO_REDUC_CODE (reduc_info);
7629 tree neutral_op
7630 = neutral_op_for_reduction (TREE_TYPE (vectype_out),
7631 code, initial_value);
7632 get_initial_defs_for_reduction (loop_vinfo, reduc_info,
7633 &vec_initial_defs, vec_num,
7634 stmts.length (), neutral_op);
7638 else
7640 /* Get at the scalar def before the loop, that defines the initial
7641 value of the reduction variable. */
7642 tree initial_def = vect_phi_initial_value (phi);
7643 reduc_info->reduc_initial_values.safe_push (initial_def);
7644 /* Optimize: if initial_def is for REDUC_MAX smaller than the base
7645 and we can't use zero for induc_val, use initial_def. Similarly
7646 for REDUC_MIN and initial_def larger than the base. */
7647 if (STMT_VINFO_REDUC_TYPE (reduc_info) == INTEGER_INDUC_COND_REDUCTION)
7649 tree induc_val = STMT_VINFO_VEC_INDUC_COND_INITIAL_VAL (reduc_info);
7650 if (TREE_CODE (initial_def) == INTEGER_CST
7651 && !integer_zerop (induc_val)
7652 && ((STMT_VINFO_REDUC_CODE (reduc_info) == MAX_EXPR
7653 && tree_int_cst_lt (initial_def, induc_val))
7654 || (STMT_VINFO_REDUC_CODE (reduc_info) == MIN_EXPR
7655 && tree_int_cst_lt (induc_val, initial_def))))
7657 induc_val = initial_def;
7658 /* Communicate we used the initial_def to epilogue
7659 generation. */
7660 STMT_VINFO_VEC_INDUC_COND_INITIAL_VAL (reduc_info) = NULL_TREE;
7662 vec_initial_def = build_vector_from_val (vectype_out, induc_val);
7664 else if (nested_cycle)
7666 /* Do not use an adjustment def as that case is not supported
7667 correctly if ncopies is not one. */
7668 vect_get_vec_defs_for_operand (loop_vinfo, reduc_stmt_info,
7669 ncopies, initial_def,
7670 &vec_initial_defs);
7672 else if (STMT_VINFO_REDUC_TYPE (reduc_info) == CONST_COND_REDUCTION
7673 || STMT_VINFO_REDUC_TYPE (reduc_info) == COND_REDUCTION)
7674 /* Fill the initial vector with the initial scalar value. */
7675 vec_initial_def
7676 = get_initial_def_for_reduction (loop_vinfo, reduc_stmt_info,
7677 initial_def, initial_def);
7678 else
7680 if (ncopies == 1)
7681 vect_find_reusable_accumulator (loop_vinfo, reduc_info);
7682 if (!reduc_info->reduc_initial_values.is_empty ())
7684 initial_def = reduc_info->reduc_initial_values[0];
7685 enum tree_code code = STMT_VINFO_REDUC_CODE (reduc_info);
7686 tree neutral_op
7687 = neutral_op_for_reduction (TREE_TYPE (initial_def),
7688 code, initial_def);
7689 gcc_assert (neutral_op);
7690 /* Try to simplify the vector initialization by applying an
7691 adjustment after the reduction has been performed. */
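/* E.g. for a summation with scalar initial value 10 the vector
   accumulator starts from the neutral { 0, ..., 0 } and the 10 is
   added back to the final scalar result in the epilogue.  */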
7692 if (!reduc_info->reused_accumulator
7693 && STMT_VINFO_DEF_TYPE (stmt_info) == vect_reduction_def
7694 && !operand_equal_p (neutral_op, initial_def))
7696 STMT_VINFO_REDUC_EPILOGUE_ADJUSTMENT (reduc_info)
7697 = initial_def;
7698 initial_def = neutral_op;
7700 vec_initial_def
7701 = get_initial_def_for_reduction (loop_vinfo, reduc_info,
7702 initial_def, neutral_op);
7707 if (vec_initial_def)
7709 vec_initial_defs.create (ncopies);
7710 for (i = 0; i < ncopies; ++i)
7711 vec_initial_defs.quick_push (vec_initial_def);
7714 if (auto *accumulator = reduc_info->reused_accumulator)
7716 tree def = accumulator->reduc_input;
7717 if (!useless_type_conversion_p (vectype_out, TREE_TYPE (def)))
7719 unsigned int nreduc;
7720 bool res = constant_multiple_p (TYPE_VECTOR_SUBPARTS
7721 (TREE_TYPE (def)),
7722 TYPE_VECTOR_SUBPARTS (vectype_out),
7723 &nreduc);
7724 gcc_assert (res);
7725 gimple_seq stmts = NULL;
7726 /* Reduce the single vector to a smaller one. */
7727 if (nreduc != 1)
7729 /* Perform the reduction in the appropriate type. */
7730 tree rvectype = vectype_out;
7731 if (!useless_type_conversion_p (TREE_TYPE (vectype_out),
7732 TREE_TYPE (TREE_TYPE (def))))
7733 rvectype = build_vector_type (TREE_TYPE (TREE_TYPE (def)),
7734 TYPE_VECTOR_SUBPARTS
7735 (vectype_out));
7736 def = vect_create_partial_epilog (def, rvectype,
7737 STMT_VINFO_REDUC_CODE
7738 (reduc_info),
7739 &stmts);
7741 /* The epilogue loop might use a different vector mode, like
7742 VNx2DI vs. V2DI. */
7743 if (TYPE_MODE (vectype_out) != TYPE_MODE (TREE_TYPE (def)))
7745 tree reduc_type = build_vector_type_for_mode
7746 (TREE_TYPE (TREE_TYPE (def)), TYPE_MODE (vectype_out));
7747 def = gimple_convert (&stmts, reduc_type, def);
7749 /* Adjust the input so we pick up the partially reduced value
7750 for the skip edge in vect_create_epilog_for_reduction. */
7751 accumulator->reduc_input = def;
7752 /* And the reduction could be carried out using a different sign. */
7753 if (!useless_type_conversion_p (vectype_out, TREE_TYPE (def)))
7754 def = gimple_convert (&stmts, vectype_out, def);
7755 if (loop_vinfo->main_loop_edge)
7757 /* While we'd like to insert on the edge this will split
7758 blocks and disturb bookkeeping, we also will eventually
7759 need this on the skip edge. Rely on sinking to
7760 fixup optimal placement and insert in the pred. */
7761 gimple_stmt_iterator gsi
7762 = gsi_last_bb (loop_vinfo->main_loop_edge->src);
7763 /* Insert before a cond that eventually skips the
7764 epilogue. */
7765 if (!gsi_end_p (gsi) && stmt_ends_bb_p (gsi_stmt (gsi)))
7766 gsi_prev (&gsi);
7767 gsi_insert_seq_after (&gsi, stmts, GSI_CONTINUE_LINKING);
7769 else
7770 gsi_insert_seq_on_edge_immediate (loop_preheader_edge (loop),
7771 stmts);
7773 if (loop_vinfo->main_loop_edge)
7774 vec_initial_defs[0]
7775 = vect_get_main_loop_result (loop_vinfo, def,
7776 vec_initial_defs[0]);
7777 else
7778 vec_initial_defs.safe_push (def);
7781 /* Generate the reduction PHIs upfront. */
7782 for (i = 0; i < vec_num; i++)
7784 tree vec_init_def = vec_initial_defs[i];
7785 for (j = 0; j < ncopies; j++)
7787 /* Create the reduction-phi that defines the reduction
7788 operand. */
7789 gphi *new_phi = create_phi_node (vec_dest, loop->header);
7791 /* Set the loop-entry arg of the reduction-phi. */
7792 if (j != 0 && nested_cycle)
7793 vec_init_def = vec_initial_defs[j];
7794 add_phi_arg (new_phi, vec_init_def, loop_preheader_edge (loop),
7795 UNKNOWN_LOCATION);
7797 /* The loop-latch arg is set in epilogue processing. */
7799 if (slp_node)
7800 SLP_TREE_VEC_STMTS (slp_node).quick_push (new_phi);
7801 else
7803 if (j == 0)
7804 *vec_stmt = new_phi;
7805 STMT_VINFO_VEC_STMTS (stmt_info).safe_push (new_phi);
7810 return true;
7813 /* Vectorizes LC PHIs. */
7815 bool
7816 vectorizable_lc_phi (loop_vec_info loop_vinfo,
7817 stmt_vec_info stmt_info, gimple **vec_stmt,
7818 slp_tree slp_node)
7820 if (!loop_vinfo
7821 || !is_a <gphi *> (stmt_info->stmt)
7822 || gimple_phi_num_args (stmt_info->stmt) != 1)
7823 return false;
7825 if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_internal_def
7826 && STMT_VINFO_DEF_TYPE (stmt_info) != vect_double_reduction_def)
7827 return false;
7829 if (!vec_stmt) /* transformation not required. */
7831 /* Deal with copies from externs or constants that are disguised as
7832 loop-closed PHI nodes (PR97886). */
7833 if (slp_node
7834 && !vect_maybe_update_slp_op_vectype (SLP_TREE_CHILDREN (slp_node)[0],
7835 SLP_TREE_VECTYPE (slp_node)))
7837 if (dump_enabled_p ())
7838 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7839 "incompatible vector types for invariants\n");
7840 return false;
7842 STMT_VINFO_TYPE (stmt_info) = lc_phi_info_type;
7843 return true;
7846 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
7847 tree scalar_dest = gimple_phi_result (stmt_info->stmt);
7848 basic_block bb = gimple_bb (stmt_info->stmt);
7849 edge e = single_pred_edge (bb);
7850 tree vec_dest = vect_create_destination_var (scalar_dest, vectype);
7851 auto_vec<tree> vec_oprnds;
7852 vect_get_vec_defs (loop_vinfo, stmt_info, slp_node,
7853 !slp_node ? vect_get_num_copies (loop_vinfo, vectype) : 1,
7854 gimple_phi_arg_def (stmt_info->stmt, 0), &vec_oprnds);
7855 for (unsigned i = 0; i < vec_oprnds.length (); i++)
7857 /* Create the vectorized LC PHI node. */
7858 gphi *new_phi = create_phi_node (vec_dest, bb);
7859 add_phi_arg (new_phi, vec_oprnds[i], e, UNKNOWN_LOCATION);
7860 if (slp_node)
7861 SLP_TREE_VEC_STMTS (slp_node).quick_push (new_phi);
7862 else
7863 STMT_VINFO_VEC_STMTS (stmt_info).safe_push (new_phi);
7865 if (!slp_node)
7866 *vec_stmt = STMT_VINFO_VEC_STMTS (stmt_info)[0];
7868 return true;
7871 /* Vectorizes PHIs. */
7873 bool
7874 vectorizable_phi (vec_info *,
7875 stmt_vec_info stmt_info, gimple **vec_stmt,
7876 slp_tree slp_node, stmt_vector_for_cost *cost_vec)
7878 if (!is_a <gphi *> (stmt_info->stmt) || !slp_node)
7879 return false;
7881 if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_internal_def)
7882 return false;
7884 tree vectype = SLP_TREE_VECTYPE (slp_node);
7886 if (!vec_stmt) /* transformation not required. */
7888 slp_tree child;
7889 unsigned i;
7890 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (slp_node), i, child)
7891 if (!child)
7893 if (dump_enabled_p ())
7894 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7895 "PHI node with unvectorized backedge def\n");
7896 return false;
7898 else if (!vect_maybe_update_slp_op_vectype (child, vectype))
7900 if (dump_enabled_p ())
7901 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7902 "incompatible vector types for invariants\n");
7903 return false;
7905 /* For single-argument PHIs assume coalescing which means zero cost
7906 for the scalar and the vector PHIs. This avoids artificially
7907 favoring the vector path (but may pessimize it in some cases). */
7908 if (gimple_phi_num_args (as_a <gphi *> (stmt_info->stmt)) > 1)
7909 record_stmt_cost (cost_vec, SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node),
7910 vector_stmt, stmt_info, vectype, 0, vect_body);
7911 STMT_VINFO_TYPE (stmt_info) = phi_info_type;
7912 return true;
7915 tree scalar_dest = gimple_phi_result (stmt_info->stmt);
7916 basic_block bb = gimple_bb (stmt_info->stmt);
7917 tree vec_dest = vect_create_destination_var (scalar_dest, vectype);
7918 auto_vec<gphi *> new_phis;
7919 for (unsigned i = 0; i < gimple_phi_num_args (stmt_info->stmt); ++i)
7921 slp_tree child = SLP_TREE_CHILDREN (slp_node)[i];
7923 /* Skip not yet vectorized defs. */
7924 if (SLP_TREE_DEF_TYPE (child) == vect_internal_def
7925 && SLP_TREE_VEC_STMTS (child).is_empty ())
7926 continue;
7928 auto_vec<tree> vec_oprnds;
7929 vect_get_slp_defs (SLP_TREE_CHILDREN (slp_node)[i], &vec_oprnds);
7930 if (!new_phis.exists ())
7932 new_phis.create (vec_oprnds.length ());
7933 for (unsigned j = 0; j < vec_oprnds.length (); j++)
7935 /* Create the vectorized LC PHI node. */
7936 new_phis.quick_push (create_phi_node (vec_dest, bb));
7937 SLP_TREE_VEC_STMTS (slp_node).quick_push (new_phis[j]);
7940 edge e = gimple_phi_arg_edge (as_a <gphi *> (stmt_info->stmt), i);
7941 for (unsigned j = 0; j < vec_oprnds.length (); j++)
7942 add_phi_arg (new_phis[j], vec_oprnds[j], e, UNKNOWN_LOCATION);
7944 /* We should have at least one already vectorized child. */
7945 gcc_assert (new_phis.exists ());
7947 return true;
7950 /* Return true if VECTYPE represents a vector that requires lowering
7951 by the vector lowering pass. */
7953 bool
7954 vect_emulated_vector_p (tree vectype)
7956 return (!VECTOR_MODE_P (TYPE_MODE (vectype))
7957 && (!VECTOR_BOOLEAN_TYPE_P (vectype)
7958 || TYPE_PRECISION (TREE_TYPE (vectype)) != 1));
7961 /* Return true if we can emulate CODE on an integer mode representation
7962 of a vector. */
7964 bool
7965 vect_can_vectorize_without_simd_p (tree_code code)
7967 switch (code)
7969 case PLUS_EXPR:
7970 case MINUS_EXPR:
7971 case NEGATE_EXPR:
7972 case BIT_AND_EXPR:
7973 case BIT_IOR_EXPR:
7974 case BIT_XOR_EXPR:
7975 case BIT_NOT_EXPR:
7976 return true;
7978 default:
7979 return false;
7983 /* Function vectorizable_induction
7985 Check if STMT_INFO performs an induction computation that can be vectorized.
7986 If VEC_STMT is also passed, vectorize the induction PHI: create a vectorized
7987 phi to replace it, put it in VEC_STMT, and add it to the same basic block.
7988 Return true if STMT_INFO is vectorizable in this way. */
7990 bool
7991 vectorizable_induction (loop_vec_info loop_vinfo,
7992 stmt_vec_info stmt_info,
7993 gimple **vec_stmt, slp_tree slp_node,
7994 stmt_vector_for_cost *cost_vec)
7996 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
7997 unsigned ncopies;
7998 bool nested_in_vect_loop = false;
7999 class loop *iv_loop;
8000 tree vec_def;
8001 edge pe = loop_preheader_edge (loop);
8002 basic_block new_bb;
8003 tree new_vec, vec_init, vec_step, t;
8004 tree new_name;
8005 gimple *new_stmt;
8006 gphi *induction_phi;
8007 tree induc_def, vec_dest;
8008 tree init_expr, step_expr;
8009 poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
8010 unsigned i;
8011 tree expr;
8012 gimple_stmt_iterator si;
8014 gphi *phi = dyn_cast <gphi *> (stmt_info->stmt);
8015 if (!phi)
8016 return false;
8018 if (!STMT_VINFO_RELEVANT_P (stmt_info))
8019 return false;
8021 /* Make sure it was recognized as induction computation. */
8022 if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_induction_def)
8023 return false;
8025 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
8026 poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (vectype);
8028 if (slp_node)
8029 ncopies = 1;
8030 else
8031 ncopies = vect_get_num_copies (loop_vinfo, vectype);
8032 gcc_assert (ncopies >= 1);
8034 /* FORNOW. These restrictions should be relaxed. */
8035 if (nested_in_vect_loop_p (loop, stmt_info))
8037 imm_use_iterator imm_iter;
8038 use_operand_p use_p;
8039 gimple *exit_phi;
8040 edge latch_e;
8041 tree loop_arg;
8043 if (ncopies > 1)
8045 if (dump_enabled_p ())
8046 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
8047 "multiple types in nested loop.\n");
8048 return false;
8051 exit_phi = NULL;
8052 latch_e = loop_latch_edge (loop->inner);
8053 loop_arg = PHI_ARG_DEF_FROM_EDGE (phi, latch_e);
8054 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, loop_arg)
8056 gimple *use_stmt = USE_STMT (use_p);
8057 if (is_gimple_debug (use_stmt))
8058 continue;
8060 if (!flow_bb_inside_loop_p (loop->inner, gimple_bb (use_stmt)))
8062 exit_phi = use_stmt;
8063 break;
8066 if (exit_phi)
8068 stmt_vec_info exit_phi_vinfo = loop_vinfo->lookup_stmt (exit_phi);
8069 if (!(STMT_VINFO_RELEVANT_P (exit_phi_vinfo)
8070 && !STMT_VINFO_LIVE_P (exit_phi_vinfo)))
8072 if (dump_enabled_p ())
8073 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
8074 "inner-loop induction only used outside "
8075 "of the outer vectorized loop.\n");
8076 return false;
8080 nested_in_vect_loop = true;
8081 iv_loop = loop->inner;
8083 else
8084 iv_loop = loop;
8085 gcc_assert (iv_loop == (gimple_bb (phi))->loop_father);
8087 if (slp_node && !nunits.is_constant ())
8089 /* The current SLP code creates the step value element-by-element. */
8090 if (dump_enabled_p ())
8091 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
8092 "SLP induction not supported for variable-length"
8093 " vectors.\n");
8094 return false;
8097 if (!vec_stmt) /* transformation not required. */
8099 unsigned inside_cost = 0, prologue_cost = 0;
8100 if (slp_node)
8102 /* We eventually need to set a vector type on invariant
8103 arguments. */
8104 unsigned j;
8105 slp_tree child;
8106 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (slp_node), j, child)
8107 if (!vect_maybe_update_slp_op_vectype
8108 (child, SLP_TREE_VECTYPE (slp_node)))
8110 if (dump_enabled_p ())
8111 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
8112 "incompatible vector types for "
8113 "invariants\n");
8114 return false;
8116 /* loop cost for vec_loop. */
8117 inside_cost
8118 = record_stmt_cost (cost_vec,
8119 SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node),
8120 vector_stmt, stmt_info, 0, vect_body);
8121 /* prologue cost for vec_init (if not nested) and step. */
8122 prologue_cost = record_stmt_cost (cost_vec, 1 + !nested_in_vect_loop,
8123 scalar_to_vec,
8124 stmt_info, 0, vect_prologue);
8126 else /* if (!slp_node) */
8128 /* loop cost for vec_loop. */
8129 inside_cost = record_stmt_cost (cost_vec, ncopies, vector_stmt,
8130 stmt_info, 0, vect_body);
8131 /* prologue cost for vec_init and vec_step. */
8132 prologue_cost = record_stmt_cost (cost_vec, 2, scalar_to_vec,
8133 stmt_info, 0, vect_prologue);
8135 if (dump_enabled_p ())
8136 dump_printf_loc (MSG_NOTE, vect_location,
8137 "vect_model_induction_cost: inside_cost = %d, "
8138 "prologue_cost = %d .\n", inside_cost,
8139 prologue_cost);
8141 STMT_VINFO_TYPE (stmt_info) = induc_vec_info_type;
8142 DUMP_VECT_SCOPE ("vectorizable_induction");
8143 return true;
8146 /* Transform. */
8148 /* Compute a vector variable, initialized with the first VF values of
8149 the induction variable. E.g., for an iv with IV_PHI='X' and
8150 evolution S, for a vector of 4 units, we want to compute:
8151 [X, X + S, X + 2*S, X + 3*S]. */
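/* Inside the loop the vector IV is then advanced by VF*S per vector
   iteration, e.g. [X, X+S, X+2*S, X+3*S] becomes
   [X+4*S, X+5*S, X+6*S, X+7*S] for the 4-unit example above.  */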
8153 if (dump_enabled_p ())
8154 dump_printf_loc (MSG_NOTE, vect_location, "transform induction phi.\n");
8156 step_expr = STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_info);
8157 gcc_assert (step_expr != NULL_TREE);
8158 tree step_vectype = get_same_sized_vectype (TREE_TYPE (step_expr), vectype);
8160 pe = loop_preheader_edge (iv_loop);
8161 /* Find the first insertion point in the BB. */
8162 basic_block bb = gimple_bb (phi);
8163 si = gsi_after_labels (bb);
8165 /* For SLP induction we have to generate several IVs as for example
8166 with group size 3 we need
8167 [i0, i1, i2, i0 + S0] [i1 + S1, i2 + S2, i0 + 2*S0, i1 + 2*S1]
8168 [i2 + 2*S2, i0 + 3*S0, i1 + 3*S1, i2 + 3*S2]. */
8169 if (slp_node)
8171 /* Enforced above. */
8172 unsigned int const_nunits = nunits.to_constant ();
8174 /* The initial values are vectorized, but any lanes > group_size
8175 need adjustment. */
8176 slp_tree init_node
8177 = SLP_TREE_CHILDREN (slp_node)[pe->dest_idx];
8179 /* Gather steps. Since we do not vectorize inductions as
8180 cycles we have to reconstruct the step from SCEV data. */
8181 unsigned group_size = SLP_TREE_LANES (slp_node);
8182 tree *steps = XALLOCAVEC (tree, group_size);
8183 tree *inits = XALLOCAVEC (tree, group_size);
8184 stmt_vec_info phi_info;
8185 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (slp_node), i, phi_info)
8187 steps[i] = STMT_VINFO_LOOP_PHI_EVOLUTION_PART (phi_info);
8188 if (!init_node)
8189 inits[i] = gimple_phi_arg_def (as_a<gphi *> (phi_info->stmt),
8190 pe->dest_idx);
8193 /* Now generate the IVs. */
8194 unsigned nvects = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node);
8195 gcc_assert ((const_nunits * nvects) % group_size == 0);
8196 unsigned nivs;
8197 if (nested_in_vect_loop)
8198 nivs = nvects;
8199 else
8201 /* Compute the number of distinct IVs we need. First reduce
8202 group_size if it is a multiple of const_nunits so we get
8203 one IV for a group_size of 4 but const_nunits 2. */
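/* E.g. group_size 4, const_nunits 2: group_sizep becomes 2 and
   nivs = lcm (2, 2) / 2 = 1.  For group_size 3, const_nunits 4
   nothing divides, so nivs = lcm (3, 4) / 4 = 3.  */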
8204 unsigned group_sizep = group_size;
8205 if (group_sizep % const_nunits == 0)
8206 group_sizep = group_sizep / const_nunits;
8207 nivs = least_common_multiple (group_sizep,
8208 const_nunits) / const_nunits;
8210 tree stept = TREE_TYPE (step_vectype);
8211 tree lupdate_mul = NULL_TREE;
8212 if (!nested_in_vect_loop)
8214 /* The number of iterations covered in one vector iteration. */
8215 unsigned lup_mul = (nvects * const_nunits) / group_size;
8216 lupdate_mul
8217 = build_vector_from_val (step_vectype,
8218 SCALAR_FLOAT_TYPE_P (stept)
8219 ? build_real_from_wide (stept, lup_mul,
8220 UNSIGNED)
8221 : build_int_cstu (stept, lup_mul));
8223 tree peel_mul = NULL_TREE;
8224 gimple_seq init_stmts = NULL;
8225 if (LOOP_VINFO_MASK_SKIP_NITERS (loop_vinfo))
8227 if (SCALAR_FLOAT_TYPE_P (stept))
8228 peel_mul = gimple_build (&init_stmts, FLOAT_EXPR, stept,
8229 LOOP_VINFO_MASK_SKIP_NITERS (loop_vinfo));
8230 else
8231 peel_mul = gimple_convert (&init_stmts, stept,
8232 LOOP_VINFO_MASK_SKIP_NITERS (loop_vinfo));
8233 peel_mul = gimple_build_vector_from_val (&init_stmts,
8234 step_vectype, peel_mul);
8236 unsigned ivn;
8237 auto_vec<tree> vec_steps;
8238 for (ivn = 0; ivn < nivs; ++ivn)
8240 tree_vector_builder step_elts (step_vectype, const_nunits, 1);
8241 tree_vector_builder init_elts (vectype, const_nunits, 1);
8242 tree_vector_builder mul_elts (step_vectype, const_nunits, 1);
8243 for (unsigned eltn = 0; eltn < const_nunits; ++eltn)
8245 /* The scalar steps of the IVs. */
8246 tree elt = steps[(ivn*const_nunits + eltn) % group_size];
8247 elt = gimple_convert (&init_stmts, TREE_TYPE (step_vectype), elt);
8248 step_elts.quick_push (elt);
8249 if (!init_node)
8251 /* The scalar inits of the IVs if not vectorized. */
8252 elt = inits[(ivn*const_nunits + eltn) % group_size];
8253 if (!useless_type_conversion_p (TREE_TYPE (vectype),
8254 TREE_TYPE (elt)))
8255 elt = gimple_build (&init_stmts, VIEW_CONVERT_EXPR,
8256 TREE_TYPE (vectype), elt);
8257 init_elts.quick_push (elt);
8259 /* The number of steps to add to the initial values. */
8260 unsigned mul_elt = (ivn*const_nunits + eltn) / group_size;
8261 mul_elts.quick_push (SCALAR_FLOAT_TYPE_P (stept)
8262 ? build_real_from_wide (stept,
8263 mul_elt, UNSIGNED)
8264 : build_int_cstu (stept, mul_elt));
8266 vec_step = gimple_build_vector (&init_stmts, &step_elts);
8267 vec_steps.safe_push (vec_step);
8268 tree step_mul = gimple_build_vector (&init_stmts, &mul_elts);
8269 if (peel_mul)
8270 step_mul = gimple_build (&init_stmts, PLUS_EXPR, step_vectype,
8271 step_mul, peel_mul);
8272 if (!init_node)
8273 vec_init = gimple_build_vector (&init_stmts, &init_elts);
8275 /* Create the induction-phi that defines the induction-operand. */
8276 vec_dest = vect_get_new_vect_var (vectype, vect_simple_var,
8277 "vec_iv_");
8278 induction_phi = create_phi_node (vec_dest, iv_loop->header);
8279 induc_def = PHI_RESULT (induction_phi);
8281 /* Create the iv update inside the loop */
8282 tree up = vec_step;
8283 if (lupdate_mul)
8284 up = gimple_build (&init_stmts, MULT_EXPR, step_vectype,
8285 vec_step, lupdate_mul);
8286 gimple_seq stmts = NULL;
8287 vec_def = gimple_convert (&stmts, step_vectype, induc_def);
8288 vec_def = gimple_build (&stmts,
8289 PLUS_EXPR, step_vectype, vec_def, up);
8290 vec_def = gimple_convert (&stmts, vectype, vec_def);
8291 gsi_insert_seq_before (&si, stmts, GSI_SAME_STMT);
8292 add_phi_arg (induction_phi, vec_def, loop_latch_edge (iv_loop),
8293 UNKNOWN_LOCATION);
8295 if (init_node)
8296 vec_init = vect_get_slp_vect_def (init_node, ivn);
8297 if (!nested_in_vect_loop
8298 && !integer_zerop (step_mul))
8300 vec_def = gimple_convert (&init_stmts, step_vectype, vec_init);
8301 up = gimple_build (&init_stmts, MULT_EXPR, step_vectype,
8302 vec_step, step_mul);
8303 vec_def = gimple_build (&init_stmts, PLUS_EXPR, step_vectype,
8304 vec_def, up);
8305 vec_init = gimple_convert (&init_stmts, vectype, vec_def);
8308 /* Set the arguments of the phi node: */
8309 add_phi_arg (induction_phi, vec_init, pe, UNKNOWN_LOCATION);
8311 SLP_TREE_VEC_STMTS (slp_node).quick_push (induction_phi);
8313 if (!nested_in_vect_loop)
8315 /* Fill up to the number of vectors we need for the whole group. */
8316 nivs = least_common_multiple (group_size,
8317 const_nunits) / const_nunits;
8318 vec_steps.reserve (nivs-ivn);
8319 for (; ivn < nivs; ++ivn)
8321 SLP_TREE_VEC_STMTS (slp_node)
8322 .quick_push (SLP_TREE_VEC_STMTS (slp_node)[0]);
8323 vec_steps.quick_push (vec_steps[0]);
8327 /* Re-use IVs when we can. We are generating further vector
8328 stmts by adding VF' * stride to the IVs generated above. */
8329 if (ivn < nvects)
8331 unsigned vfp
8332 = least_common_multiple (group_size, const_nunits) / group_size;
8333 tree lupdate_mul
8334 = build_vector_from_val (step_vectype,
8335 SCALAR_FLOAT_TYPE_P (stept)
8336 ? build_real_from_wide (stept,
8337 vfp, UNSIGNED)
8338 : build_int_cstu (stept, vfp));
8339 for (; ivn < nvects; ++ivn)
8341 gimple *iv = SLP_TREE_VEC_STMTS (slp_node)[ivn - nivs];
8342 tree def = gimple_get_lhs (iv);
8343 if (ivn < 2*nivs)
8344 vec_steps[ivn - nivs]
8345 = gimple_build (&init_stmts, MULT_EXPR, step_vectype,
8346 vec_steps[ivn - nivs], lupdate_mul);
8347 gimple_seq stmts = NULL;
8348 def = gimple_convert (&stmts, step_vectype, def);
8349 def = gimple_build (&stmts, PLUS_EXPR, step_vectype,
8350 def, vec_steps[ivn % nivs]);
8351 def = gimple_convert (&stmts, vectype, def);
8352 if (gimple_code (iv) == GIMPLE_PHI)
8353 gsi_insert_seq_before (&si, stmts, GSI_SAME_STMT);
8354 else
8356 gimple_stmt_iterator tgsi = gsi_for_stmt (iv);
8357 gsi_insert_seq_after (&tgsi, stmts, GSI_CONTINUE_LINKING);
8359 SLP_TREE_VEC_STMTS (slp_node)
8360 .quick_push (SSA_NAME_DEF_STMT (def));
8364 new_bb = gsi_insert_seq_on_edge_immediate (pe, init_stmts);
8365 gcc_assert (!new_bb);
8367 return true;
8370 init_expr = vect_phi_initial_value (phi);
8372 gimple_seq stmts = NULL;
8373 if (!nested_in_vect_loop)
8375 /* Convert the initial value to the IV update type. */
8376 tree new_type = TREE_TYPE (step_expr);
8377 init_expr = gimple_convert (&stmts, new_type, init_expr);
8379 /* If we are using the loop mask to "peel" for alignment then we need
8380 to adjust the start value here. */
8381 tree skip_niters = LOOP_VINFO_MASK_SKIP_NITERS (loop_vinfo);
8382 if (skip_niters != NULL_TREE)
8384 if (FLOAT_TYPE_P (vectype))
8385 skip_niters = gimple_build (&stmts, FLOAT_EXPR, new_type,
8386 skip_niters);
8387 else
8388 skip_niters = gimple_convert (&stmts, new_type, skip_niters);
8389 tree skip_step = gimple_build (&stmts, MULT_EXPR, new_type,
8390 skip_niters, step_expr);
8391 init_expr = gimple_build (&stmts, MINUS_EXPR, new_type,
8392 init_expr, skip_step);
8396 if (stmts)
8398 new_bb = gsi_insert_seq_on_edge_immediate (pe, stmts);
8399 gcc_assert (!new_bb);
8402 /* Create the vector that holds the initial_value of the induction. */
8403 if (nested_in_vect_loop)
8405 /* iv_loop is nested in the loop to be vectorized. init_expr had already
8406 been created during vectorization of previous stmts. We obtain it
8407 from the STMT_VINFO_VEC_STMT of the defining stmt. */
8408 auto_vec<tree> vec_inits;
8409 vect_get_vec_defs_for_operand (loop_vinfo, stmt_info, 1,
8410 init_expr, &vec_inits);
8411 vec_init = vec_inits[0];
8412 /* If the initial value is not of proper type, convert it. */
8413 if (!useless_type_conversion_p (vectype, TREE_TYPE (vec_init)))
8415 new_stmt
8416 = gimple_build_assign (vect_get_new_ssa_name (vectype,
8417 vect_simple_var,
8418 "vec_iv_"),
8419 VIEW_CONVERT_EXPR,
8420 build1 (VIEW_CONVERT_EXPR, vectype,
8421 vec_init));
8422 vec_init = gimple_assign_lhs (new_stmt);
8423 new_bb = gsi_insert_on_edge_immediate (loop_preheader_edge (iv_loop),
8424 new_stmt);
8425 gcc_assert (!new_bb);
8428 else
8430 /* iv_loop is the loop to be vectorized. Create:
8431 vec_init = [X, X+S, X+2*S, X+3*S] (S = step_expr, X = init_expr) */
8432 stmts = NULL;
8433 new_name = gimple_convert (&stmts, TREE_TYPE (step_expr), init_expr);
8435 unsigned HOST_WIDE_INT const_nunits;
8436 if (nunits.is_constant (&const_nunits))
8438 tree_vector_builder elts (step_vectype, const_nunits, 1);
8439 elts.quick_push (new_name);
8440 for (i = 1; i < const_nunits; i++)
8442 /* Create: new_name_i = new_name + step_expr */
8443 new_name = gimple_build (&stmts, PLUS_EXPR, TREE_TYPE (new_name),
8444 new_name, step_expr);
8445 elts.quick_push (new_name);
8447 /* Create a vector from [new_name_0, new_name_1, ...,
8448 new_name_nunits-1] */
8449 vec_init = gimple_build_vector (&stmts, &elts);
8451 else if (INTEGRAL_TYPE_P (TREE_TYPE (step_expr)))
8452 /* Build the initial value directly from a VEC_SERIES_EXPR. */
8453 vec_init = gimple_build (&stmts, VEC_SERIES_EXPR, step_vectype,
8454 new_name, step_expr);
8455 else
8457 /* Build:
8458 [base, base, base, ...]
8459 + (vectype) [0, 1, 2, ...] * [step, step, step, ...]. */
8460 gcc_assert (SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr)));
8461 gcc_assert (flag_associative_math);
8462 tree index = build_index_vector (step_vectype, 0, 1);
8463 tree base_vec = gimple_build_vector_from_val (&stmts, step_vectype,
8464 new_name);
8465 tree step_vec = gimple_build_vector_from_val (&stmts, step_vectype,
8466 step_expr);
8467 vec_init = gimple_build (&stmts, FLOAT_EXPR, step_vectype, index);
8468 vec_init = gimple_build (&stmts, MULT_EXPR, step_vectype,
8469 vec_init, step_vec);
8470 vec_init = gimple_build (&stmts, PLUS_EXPR, step_vectype,
8471 vec_init, base_vec);
8473 vec_init = gimple_convert (&stmts, vectype, vec_init);
8475 if (stmts)
8477 new_bb = gsi_insert_seq_on_edge_immediate (pe, stmts);
8478 gcc_assert (!new_bb);
8483 /* Create the vector that holds the step of the induction. */
8484 if (nested_in_vect_loop)
8485 /* iv_loop is nested in the loop to be vectorized. Generate:
8486 vec_step = [S, S, S, S] */
8487 new_name = step_expr;
8488 else
8490 /* iv_loop is the loop to be vectorized. Generate:
8491 vec_step = [VF*S, VF*S, VF*S, VF*S] */
8492 gimple_seq seq = NULL;
8493 if (SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr)))
8495 expr = build_int_cst (integer_type_node, vf);
8496 expr = gimple_build (&seq, FLOAT_EXPR, TREE_TYPE (step_expr), expr);
8498 else
8499 expr = build_int_cst (TREE_TYPE (step_expr), vf);
8500 new_name = gimple_build (&seq, MULT_EXPR, TREE_TYPE (step_expr),
8501 expr, step_expr);
8502 if (seq)
8504 new_bb = gsi_insert_seq_on_edge_immediate (pe, seq);
8505 gcc_assert (!new_bb);
8509 t = unshare_expr (new_name);
8510 gcc_assert (CONSTANT_CLASS_P (new_name)
8511 || TREE_CODE (new_name) == SSA_NAME);
8512 new_vec = build_vector_from_val (step_vectype, t);
8513 vec_step = vect_init_vector (loop_vinfo, stmt_info,
8514 new_vec, step_vectype, NULL);
8517 /* Create the following def-use cycle:
8518 loop prolog:
8519 vec_init = ...
8520 vec_step = ...
8521 loop:
8522 vec_iv = PHI <vec_init, vec_loop>
8524 STMT
8526 vec_loop = vec_iv + vec_step; */
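/* For a 4-lane integer IV with initial value X and step S this amounts
   to roughly (names schematic):
     loop prolog:
       vec_init = { X, X+S, X+2*S, X+3*S };
       vec_step = { 4*S, 4*S, 4*S, 4*S };
     loop:
       vec_iv_1 = PHI <vec_init (preheader), vec_iv_2 (latch)>
       ...
       vec_iv_2 = vec_iv_1 + vec_step;  */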
8528 /* Create the induction-phi that defines the induction-operand. */
8529 vec_dest = vect_get_new_vect_var (vectype, vect_simple_var, "vec_iv_");
8530 induction_phi = create_phi_node (vec_dest, iv_loop->header);
8531 induc_def = PHI_RESULT (induction_phi);
8533 /* Create the iv update inside the loop */
8534 stmts = NULL;
8535 vec_def = gimple_convert (&stmts, step_vectype, induc_def);
8536 vec_def = gimple_build (&stmts, PLUS_EXPR, step_vectype, vec_def, vec_step);
8537 vec_def = gimple_convert (&stmts, vectype, vec_def);
8538 gsi_insert_seq_before (&si, stmts, GSI_SAME_STMT);
8539 new_stmt = SSA_NAME_DEF_STMT (vec_def);
8541 /* Set the arguments of the phi node: */
8542 add_phi_arg (induction_phi, vec_init, pe, UNKNOWN_LOCATION);
8543 add_phi_arg (induction_phi, vec_def, loop_latch_edge (iv_loop),
8544 UNKNOWN_LOCATION);
8546 STMT_VINFO_VEC_STMTS (stmt_info).safe_push (induction_phi);
8547 *vec_stmt = induction_phi;
8549 /* In case the vectorization factor (VF) is bigger than the number
8550 of elements that we can fit in a vectype (nunits), we have to generate
8551 more than one vector stmt, i.e., we need to "unroll" the
8552 vector stmt by a factor of VF/nunits. For more details see the
8553 documentation in vectorizable_operation. */
8555 if (ncopies > 1)
8557 gimple_seq seq = NULL;
8558 /* FORNOW. This restriction should be relaxed. */
8559 gcc_assert (!nested_in_vect_loop);
8561 /* Create the vector that holds the step of the induction. */
8562 if (SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr)))
8564 expr = build_int_cst (integer_type_node, nunits);
8565 expr = gimple_build (&seq, FLOAT_EXPR, TREE_TYPE (step_expr), expr);
8567 else
8568 expr = build_int_cst (TREE_TYPE (step_expr), nunits);
8569 new_name = gimple_build (&seq, MULT_EXPR, TREE_TYPE (step_expr),
8570 expr, step_expr);
8571 if (seq)
8573 new_bb = gsi_insert_seq_on_edge_immediate (pe, seq);
8574 gcc_assert (!new_bb);
8577 t = unshare_expr (new_name);
8578 gcc_assert (CONSTANT_CLASS_P (new_name)
8579 || TREE_CODE (new_name) == SSA_NAME);
8580 new_vec = build_vector_from_val (step_vectype, t);
8581 vec_step = vect_init_vector (loop_vinfo, stmt_info,
8582 new_vec, step_vectype, NULL);
8584 vec_def = induc_def;
8585 for (i = 1; i < ncopies; i++)
8587 /* vec_i = vec_prev + vec_step */
8588 gimple_seq stmts = NULL;
8589 vec_def = gimple_convert (&stmts, step_vectype, vec_def);
8590 vec_def = gimple_build (&stmts,
8591 PLUS_EXPR, step_vectype, vec_def, vec_step);
8592 vec_def = gimple_convert (&stmts, vectype, vec_def);
8594 gsi_insert_seq_before (&si, stmts, GSI_SAME_STMT);
8595 new_stmt = SSA_NAME_DEF_STMT (vec_def);
8596 STMT_VINFO_VEC_STMTS (stmt_info).safe_push (new_stmt);
8600 if (dump_enabled_p ())
8601 dump_printf_loc (MSG_NOTE, vect_location,
8602 "transform induction: created def-use cycle: %G%G",
8603 induction_phi, SSA_NAME_DEF_STMT (vec_def));
8605 return true;
8608 /* Function vectorizable_live_operation.
8610 STMT_INFO computes a value that is used outside the loop. Check if
8611 it can be supported. */
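/* A typical example is a loop whose final scalar result is read after
   the loop, e.g. roughly:

     for (i = 0; i < n; i++)
       last = a[i];
     use (last);

   Here the definition of LAST is "live": after vectorization the last
   lane of the final vector of loaded values has to be extracted to feed
   the out-of-loop use.  */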
8613 bool
8614 vectorizable_live_operation (vec_info *vinfo,
8615 stmt_vec_info stmt_info,
8616 gimple_stmt_iterator *gsi,
8617 slp_tree slp_node, slp_instance slp_node_instance,
8618 int slp_index, bool vec_stmt_p,
8619 stmt_vector_for_cost *cost_vec)
8621 loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo);
8622 imm_use_iterator imm_iter;
8623 tree lhs, lhs_type, bitsize;
8624 tree vectype = (slp_node
8625 ? SLP_TREE_VECTYPE (slp_node)
8626 : STMT_VINFO_VECTYPE (stmt_info));
8627 poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (vectype);
8628 int ncopies;
8629 gimple *use_stmt;
8630 auto_vec<tree> vec_oprnds;
8631 int vec_entry = 0;
8632 poly_uint64 vec_index = 0;
8634 gcc_assert (STMT_VINFO_LIVE_P (stmt_info));
8636 /* If a stmt of a reduction is live, vectorize it via
8637 vect_create_epilog_for_reduction. vectorizable_reduction assessed
8638 validity so just trigger the transform here. */
8639 if (STMT_VINFO_REDUC_DEF (vect_orig_stmt (stmt_info)))
8641 if (!vec_stmt_p)
8642 return true;
8643 if (slp_node)
8645 /* For reduction chains the meta-info is attached to
8646 the group leader. */
8647 if (REDUC_GROUP_FIRST_ELEMENT (stmt_info))
8648 stmt_info = REDUC_GROUP_FIRST_ELEMENT (stmt_info);
8649 /* For SLP reductions we vectorize the epilogue for
8650 all involved stmts together. */
8651 else if (slp_index != 0)
8652 return true;
8653 else
8654 /* For SLP reductions the meta-info is attached to
8655 the representative. */
8656 stmt_info = SLP_TREE_REPRESENTATIVE (slp_node);
8658 stmt_vec_info reduc_info = info_for_reduction (loop_vinfo, stmt_info);
8659 gcc_assert (reduc_info->is_reduc_info);
8660 if (STMT_VINFO_REDUC_TYPE (reduc_info) == FOLD_LEFT_REDUCTION
8661 || STMT_VINFO_REDUC_TYPE (reduc_info) == EXTRACT_LAST_REDUCTION)
8662 return true;
8663 vect_create_epilog_for_reduction (loop_vinfo, stmt_info, slp_node,
8664 slp_node_instance);
8665 return true;
8668 /* If STMT is not relevant and it is a simple assignment and its inputs are
8669 invariant then it can remain in place, unvectorized. The original last
8670 scalar value that it computes will be used. */
8671 if (!STMT_VINFO_RELEVANT_P (stmt_info))
8673 gcc_assert (is_simple_and_all_uses_invariant (stmt_info, loop_vinfo));
8674 if (dump_enabled_p ())
8675 dump_printf_loc (MSG_NOTE, vect_location,
8676 "statement is simple and uses invariant. Leaving in "
8677 "place.\n");
8678 return true;
8681 if (slp_node)
8682 ncopies = 1;
8683 else
8684 ncopies = vect_get_num_copies (loop_vinfo, vectype);
8686 if (slp_node)
8688 gcc_assert (slp_index >= 0);
8690 /* Get the last occurrence of the scalar index from the concatenation of
8691 all the slp vectors. Calculate which slp vector it is and the index
8692 within. */
8693 int num_scalar = SLP_TREE_LANES (slp_node);
8694 int num_vec = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node);
8695 poly_uint64 pos = (num_vec * nunits) - num_scalar + slp_index;
8697 /* Calculate which vector contains the result, and which lane of
8698 that vector we need. */
8699 if (!can_div_trunc_p (pos, nunits, &vec_entry, &vec_index))
8701 if (dump_enabled_p ())
8702 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
8703 "Cannot determine which vector holds the"
8704 " final result.\n");
8705 return false;
8709 if (!vec_stmt_p)
8711 /* No transformation required. */
8712 if (loop_vinfo && LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo))
8714 if (!direct_internal_fn_supported_p (IFN_EXTRACT_LAST, vectype,
8715 OPTIMIZE_FOR_SPEED))
8717 if (dump_enabled_p ())
8718 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
8719 "can't operate on partial vectors "
8720 "because the target doesn't support extract "
8721 "last reduction.\n");
8722 LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo) = false;
8724 else if (slp_node)
8726 if (dump_enabled_p ())
8727 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
8728 "can't operate on partial vectors "
8729 "because an SLP statement is live after "
8730 "the loop.\n");
8731 LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo) = false;
8733 else if (ncopies > 1)
8735 if (dump_enabled_p ())
8736 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
8737 "can't operate on partial vectors "
8738 "because ncopies is greater than 1.\n");
8739 LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo) = false;
8741 else
8743 gcc_assert (ncopies == 1 && !slp_node);
8744 vect_record_loop_mask (loop_vinfo,
8745 &LOOP_VINFO_MASKS (loop_vinfo),
8746 1, vectype, NULL);
8749 /* ??? Enable for loop costing as well. */
8750 if (!loop_vinfo)
8751 record_stmt_cost (cost_vec, 1, vec_to_scalar, stmt_info, NULL_TREE,
8752 0, vect_epilogue);
8753 return true;
8756 /* Use the lhs of the original scalar statement. */
8757 gimple *stmt = vect_orig_stmt (stmt_info)->stmt;
8758 if (dump_enabled_p ())
8759 dump_printf_loc (MSG_NOTE, vect_location, "extracting lane for live "
8760 "stmt %G", stmt);
8762 lhs = gimple_get_lhs (stmt);
8763 lhs_type = TREE_TYPE (lhs);
8765 bitsize = vector_element_bits_tree (vectype);
8767 /* Get the vectorized lhs of STMT and the lane to use (counted in bits). */
8768 tree vec_lhs, bitstart;
8769 gimple *vec_stmt;
8770 if (slp_node)
8772 gcc_assert (!loop_vinfo || !LOOP_VINFO_FULLY_MASKED_P (loop_vinfo));
8774 /* Get the correct slp vectorized stmt. */
8775 vec_stmt = SLP_TREE_VEC_STMTS (slp_node)[vec_entry];
8776 vec_lhs = gimple_get_lhs (vec_stmt);
8778 /* Get entry to use. */
8779 bitstart = bitsize_int (vec_index);
8780 bitstart = int_const_binop (MULT_EXPR, bitsize, bitstart);
8782 else
8784 /* For multiple copies, get the last copy. */
8785 vec_stmt = STMT_VINFO_VEC_STMTS (stmt_info).last ();
8786 vec_lhs = gimple_get_lhs (vec_stmt);
8788 /* Get the last lane in the vector. */
8789 bitstart = int_const_binop (MULT_EXPR, bitsize, bitsize_int (nunits - 1));
8792 if (loop_vinfo)
8794 /* To ensure that VEC_LHS for the lane-extraction stmts satisfies the
8795 loop-closed-PHI requirement, insert one PHI node for it. It looks like:
8796 loop;
8798 # lhs' = PHI <lhs>
8800 loop;
8802 # vec_lhs' = PHI <vec_lhs>
8803 new_tree = lane_extract <vec_lhs', ...>;
8804 lhs' = new_tree; */
8806 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
8807 basic_block exit_bb = single_exit (loop)->dest;
8808 gcc_assert (single_pred_p (exit_bb));
8810 tree vec_lhs_phi = copy_ssa_name (vec_lhs);
8811 gimple *phi = create_phi_node (vec_lhs_phi, exit_bb);
8812 SET_PHI_ARG_DEF (phi, single_exit (loop)->dest_idx, vec_lhs);
8814 gimple_seq stmts = NULL;
8815 tree new_tree;
8816 if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
8818 /* Emit:
8820 SCALAR_RES = EXTRACT_LAST <VEC_LHS, MASK>
8822 where VEC_LHS is the vectorized live-out result and MASK is
8823 the loop mask for the final iteration. */
8824 gcc_assert (ncopies == 1 && !slp_node);
8825 tree scalar_type = TREE_TYPE (STMT_VINFO_VECTYPE (stmt_info));
8826 tree mask = vect_get_loop_mask (gsi, &LOOP_VINFO_MASKS (loop_vinfo),
8827 1, vectype, 0);
8828 tree scalar_res = gimple_build (&stmts, CFN_EXTRACT_LAST, scalar_type,
8829 mask, vec_lhs_phi);
8831 /* Convert the extracted vector element to the scalar type. */
8832 new_tree = gimple_convert (&stmts, lhs_type, scalar_res);
8834 else
8836 tree bftype = TREE_TYPE (vectype);
8837 if (VECTOR_BOOLEAN_TYPE_P (vectype))
8838 bftype = build_nonstandard_integer_type (tree_to_uhwi (bitsize), 1);
8839 new_tree = build3 (BIT_FIELD_REF, bftype,
8840 vec_lhs_phi, bitsize, bitstart);
8841 new_tree = force_gimple_operand (fold_convert (lhs_type, new_tree),
8842 &stmts, true, NULL_TREE);
8845 if (stmts)
8847 gimple_stmt_iterator exit_gsi = gsi_after_labels (exit_bb);
8848 gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
8850 /* Remove existing phi from lhs and create one copy from new_tree. */
8851 tree lhs_phi = NULL_TREE;
8852 gimple_stmt_iterator gsi;
8853 for (gsi = gsi_start_phis (exit_bb);
8854 !gsi_end_p (gsi); gsi_next (&gsi))
8856 gimple *phi = gsi_stmt (gsi);
8857 if ((gimple_phi_arg_def (phi, 0) == lhs))
8859 remove_phi_node (&gsi, false);
8860 lhs_phi = gimple_phi_result (phi);
8861 gimple *copy = gimple_build_assign (lhs_phi, new_tree);
8862 gsi_insert_before (&exit_gsi, copy, GSI_SAME_STMT);
8863 break;
8868 /* Replace use of lhs with newly computed result. If the use stmt is a
8869 single arg PHI, just replace all uses of PHI result. It's necessary
8870 because lcssa PHI defining lhs may be before newly inserted stmt. */
8871 use_operand_p use_p;
8872 FOR_EACH_IMM_USE_STMT (use_stmt, imm_iter, lhs)
8873 if (!flow_bb_inside_loop_p (loop, gimple_bb (use_stmt))
8874 && !is_gimple_debug (use_stmt))
8876 if (gimple_code (use_stmt) == GIMPLE_PHI
8877 && gimple_phi_num_args (use_stmt) == 1)
8879 replace_uses_by (gimple_phi_result (use_stmt), new_tree);
8881 else
8883 FOR_EACH_IMM_USE_ON_STMT (use_p, imm_iter)
8884 SET_USE (use_p, new_tree);
8886 update_stmt (use_stmt);
8889 else
8891 /* For basic-block vectorization simply insert the lane-extraction. */
8892 tree bftype = TREE_TYPE (vectype);
8893 if (VECTOR_BOOLEAN_TYPE_P (vectype))
8894 bftype = build_nonstandard_integer_type (tree_to_uhwi (bitsize), 1);
8895 tree new_tree = build3 (BIT_FIELD_REF, bftype,
8896 vec_lhs, bitsize, bitstart);
8897 gimple_seq stmts = NULL;
8898 new_tree = force_gimple_operand (fold_convert (lhs_type, new_tree),
8899 &stmts, true, NULL_TREE);
8900 if (TREE_CODE (new_tree) == SSA_NAME
8901 && SSA_NAME_OCCURS_IN_ABNORMAL_PHI (lhs))
8902 SSA_NAME_OCCURS_IN_ABNORMAL_PHI (new_tree) = 1;
8903 if (is_a <gphi *> (vec_stmt))
8905 gimple_stmt_iterator si = gsi_after_labels (gimple_bb (vec_stmt));
8906 gsi_insert_seq_before (&si, stmts, GSI_SAME_STMT);
8908 else
8910 gimple_stmt_iterator si = gsi_for_stmt (vec_stmt);
8911 gsi_insert_seq_after (&si, stmts, GSI_SAME_STMT);
8914 /* Replace use of lhs with newly computed result. If the use stmt is a
8915 single arg PHI, just replace all uses of PHI result. It's necessary
8916 because lcssa PHI defining lhs may be before newly inserted stmt. */
8917 use_operand_p use_p;
8918 stmt_vec_info use_stmt_info;
8919 FOR_EACH_IMM_USE_STMT (use_stmt, imm_iter, lhs)
8920 if (!is_gimple_debug (use_stmt)
8921 && (!(use_stmt_info = vinfo->lookup_stmt (use_stmt))
8922 || !PURE_SLP_STMT (vect_stmt_to_vectorize (use_stmt_info))))
8924 /* ??? This can happen when the live lane ends up being
8925 used in a vector construction code-generated by an
8926 external SLP node (and code-generation for that already
8927 happened). See gcc.dg/vect/bb-slp-47.c.
8928 Doing this is what would happen if that vector CTOR
8929 were not code-generated yet so it is not too bad.
8930 ??? In fact we'd likely want to avoid this situation
8931 in the first place. */
8932 if (TREE_CODE (new_tree) == SSA_NAME
8933 && !SSA_NAME_IS_DEFAULT_DEF (new_tree)
8934 && gimple_code (use_stmt) != GIMPLE_PHI
8935 && !vect_stmt_dominates_stmt_p (SSA_NAME_DEF_STMT (new_tree),
8936 use_stmt))
8938 enum tree_code code = gimple_assign_rhs_code (use_stmt);
8939 gcc_assert (code == CONSTRUCTOR
8940 || code == VIEW_CONVERT_EXPR
8941 || CONVERT_EXPR_CODE_P (code));
8942 if (dump_enabled_p ())
8943 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
8944 "Using original scalar computation for "
8945 "live lane because use preceeds vector "
8946 "def\n");
8947 continue;
8949 /* ??? It can also happen that we end up pulling a def into
8950 a loop where replacing out-of-loop uses would require
8951 a new LC SSA PHI node. Retain the original scalar in
8952 those cases as well. PR98064. */
8953 if (TREE_CODE (new_tree) == SSA_NAME
8954 && !SSA_NAME_IS_DEFAULT_DEF (new_tree)
8955 && (gimple_bb (use_stmt)->loop_father
8956 != gimple_bb (vec_stmt)->loop_father)
8957 && !flow_loop_nested_p (gimple_bb (vec_stmt)->loop_father,
8958 gimple_bb (use_stmt)->loop_father))
8960 if (dump_enabled_p ())
8961 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
8962 "Using original scalar computation for "
8963 "live lane because there is an out-of-loop "
8964 "definition for it\n");
8965 continue;
8967 FOR_EACH_IMM_USE_ON_STMT (use_p, imm_iter)
8968 SET_USE (use_p, new_tree);
8969 update_stmt (use_stmt);
8973 return true;
8976 /* Kill any debug uses outside LOOP of SSA names defined in STMT_INFO. */
8978 static void
8979 vect_loop_kill_debug_uses (class loop *loop, stmt_vec_info stmt_info)
8981 ssa_op_iter op_iter;
8982 imm_use_iterator imm_iter;
8983 def_operand_p def_p;
8984 gimple *ustmt;
8986 FOR_EACH_PHI_OR_STMT_DEF (def_p, stmt_info->stmt, op_iter, SSA_OP_DEF)
8988 FOR_EACH_IMM_USE_STMT (ustmt, imm_iter, DEF_FROM_PTR (def_p))
8990 basic_block bb;
8992 if (!is_gimple_debug (ustmt))
8993 continue;
8995 bb = gimple_bb (ustmt);
8997 if (!flow_bb_inside_loop_p (loop, bb))
8999 if (gimple_debug_bind_p (ustmt))
9001 if (dump_enabled_p ())
9002 dump_printf_loc (MSG_NOTE, vect_location,
9003 "killing debug use\n");
9005 gimple_debug_bind_reset_value (ustmt);
9006 update_stmt (ustmt);
9008 else
9009 gcc_unreachable ();
9015 /* Given loop represented by LOOP_VINFO, return true if computation of
9016 LOOP_VINFO_NITERS (= LOOP_VINFO_NITERSM1 + 1) doesn't overflow, false
9017 otherwise. */
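/* For example, if NITERSM1 is computed in a 32-bit unsigned type and can
   be 0xffffffff, then NITERS = NITERSM1 + 1 wraps around to zero, so we
   must return false.  Conversely, if the number of latch iterations is
   known to be strictly smaller than the maximum value of the type, the
   + 1 cannot wrap.  */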
9019 static bool
9020 loop_niters_no_overflow (loop_vec_info loop_vinfo)
9022 /* Constant case. */
9023 if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
9025 tree cst_niters = LOOP_VINFO_NITERS (loop_vinfo);
9026 tree cst_nitersm1 = LOOP_VINFO_NITERSM1 (loop_vinfo);
9028 gcc_assert (TREE_CODE (cst_niters) == INTEGER_CST);
9029 gcc_assert (TREE_CODE (cst_nitersm1) == INTEGER_CST);
9030 if (wi::to_widest (cst_nitersm1) < wi::to_widest (cst_niters))
9031 return true;
9034 widest_int max;
9035 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
9036 /* Check the upper bound of loop niters. */
9037 if (get_max_loop_iterations (loop, &max))
9039 tree type = TREE_TYPE (LOOP_VINFO_NITERS (loop_vinfo));
9040 signop sgn = TYPE_SIGN (type);
9041 widest_int type_max = widest_int::from (wi::max_value (type), sgn);
9042 if (max < type_max)
9043 return true;
9045 return false;
9048 /* Return a mask type with half the number of elements as OLD_TYPE,
9049 given that it should have mode NEW_MODE. */
9051 tree
9052 vect_halve_mask_nunits (tree old_type, machine_mode new_mode)
9054 poly_uint64 nunits = exact_div (TYPE_VECTOR_SUBPARTS (old_type), 2);
9055 return build_truth_vector_type_for_mode (nunits, new_mode);
9058 /* Return a mask type with twice as many elements as OLD_TYPE,
9059 given that it should have mode NEW_MODE. */
9061 tree
9062 vect_double_mask_nunits (tree old_type, machine_mode new_mode)
9064 poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (old_type) * 2;
9065 return build_truth_vector_type_for_mode (nunits, new_mode);
9068 /* Record that a fully-masked version of LOOP_VINFO would need MASKS to
9069 contain a sequence of NVECTORS masks that each control a vector of type
9070 VECTYPE. If SCALAR_MASK is nonnull, the fully-masked loop would AND
9071 these vector masks with the vector version of SCALAR_MASK. */
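/* For example, the analysis part of vectorizable_live_operation above
   records the single mask it needs roughly as

     vect_record_loop_mask (loop_vinfo, &LOOP_VINFO_MASKS (loop_vinfo),
			    1, vectype, NULL);

   i.e. one mask controlling vectors of VECTYPE, with no extra scalar
   condition to AND in.  */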
9073 void
9074 vect_record_loop_mask (loop_vec_info loop_vinfo, vec_loop_masks *masks,
9075 unsigned int nvectors, tree vectype, tree scalar_mask)
9077 gcc_assert (nvectors != 0);
9078 if (masks->length () < nvectors)
9079 masks->safe_grow_cleared (nvectors, true);
9080 rgroup_controls *rgm = &(*masks)[nvectors - 1];
9081 /* The number of scalars per iteration and the number of vectors are
9082 both compile-time constants. */
9083 unsigned int nscalars_per_iter
9084 = exact_div (nvectors * TYPE_VECTOR_SUBPARTS (vectype),
9085 LOOP_VINFO_VECT_FACTOR (loop_vinfo)).to_constant ();
9087 if (scalar_mask)
9089 scalar_cond_masked_key cond (scalar_mask, nvectors);
9090 loop_vinfo->scalar_cond_masked_set.add (cond);
9093 if (rgm->max_nscalars_per_iter < nscalars_per_iter)
9095 rgm->max_nscalars_per_iter = nscalars_per_iter;
9096 rgm->type = truth_type_for (vectype);
9097 rgm->factor = 1;
9101 /* Given a complete set of masks MASKS, extract mask number INDEX
9102 for an rgroup that operates on NVECTORS vectors of type VECTYPE,
9103 where 0 <= INDEX < NVECTORS. Insert any set-up statements before GSI.
9105 See the comment above vec_loop_masks for more details about the mask
9106 arrangement. */
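/* For example, the transform part of vectorizable_live_operation above
   fetches the mask it recorded during analysis roughly as

     tree mask = vect_get_loop_mask (gsi, &LOOP_VINFO_MASKS (loop_vinfo),
				     1, vectype, 0);

   and then feeds MASK to an EXTRACT_LAST reduction.  */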
9108 tree
9109 vect_get_loop_mask (gimple_stmt_iterator *gsi, vec_loop_masks *masks,
9110 unsigned int nvectors, tree vectype, unsigned int index)
9112 rgroup_controls *rgm = &(*masks)[nvectors - 1];
9113 tree mask_type = rgm->type;
9115 /* Populate the rgroup's mask array, if this is the first time we've
9116 used it. */
9117 if (rgm->controls.is_empty ())
9119 rgm->controls.safe_grow_cleared (nvectors, true);
9120 for (unsigned int i = 0; i < nvectors; ++i)
9122 tree mask = make_temp_ssa_name (mask_type, NULL, "loop_mask");
9123 /* Provide a dummy definition until the real one is available. */
9124 SSA_NAME_DEF_STMT (mask) = gimple_build_nop ();
9125 rgm->controls[i] = mask;
9129 tree mask = rgm->controls[index];
9130 if (maybe_ne (TYPE_VECTOR_SUBPARTS (mask_type),
9131 TYPE_VECTOR_SUBPARTS (vectype)))
9133 /* A loop mask for data type X can be reused for data type Y
9134 if X has N times more elements than Y and if Y's elements
9135 are N times bigger than X's. In this case each sequence
9136 of N elements in the loop mask will be all-zero or all-one.
9137 We can then view-convert the mask so that each sequence of
9138 N elements is replaced by a single element. */
9139 gcc_assert (multiple_p (TYPE_VECTOR_SUBPARTS (mask_type),
9140 TYPE_VECTOR_SUBPARTS (vectype)));
9141 gimple_seq seq = NULL;
9142 mask_type = truth_type_for (vectype);
9143 mask = gimple_build (&seq, VIEW_CONVERT_EXPR, mask_type, mask);
9144 if (seq)
9145 gsi_insert_seq_before (gsi, seq, GSI_SAME_STMT);
9147 return mask;
9150 /* Record that LOOP_VINFO would need LENS to contain a sequence of NVECTORS
9151 lengths for controlling an operation on VECTYPE. The operation splits
9152 each element of VECTYPE into FACTOR separate subelements, measuring the
9153 length as a number of these subelements. */
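/* For example, if a length-controlled access on vectors of 4-byte elements
   is implemented as an access on vectors of bytes, the caller would pass
   FACTOR == 4 and the recorded lengths are then measured in bytes rather
   than in whole elements.  */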
9155 void
9156 vect_record_loop_len (loop_vec_info loop_vinfo, vec_loop_lens *lens,
9157 unsigned int nvectors, tree vectype, unsigned int factor)
9159 gcc_assert (nvectors != 0);
9160 if (lens->length () < nvectors)
9161 lens->safe_grow_cleared (nvectors, true);
9162 rgroup_controls *rgl = &(*lens)[nvectors - 1];
9164 /* The number of scalars per iteration, the bytes each scalar occupies
9165 and the number of vectors are all compile-time constants. */
9166 unsigned int nscalars_per_iter
9167 = exact_div (nvectors * TYPE_VECTOR_SUBPARTS (vectype),
9168 LOOP_VINFO_VECT_FACTOR (loop_vinfo)).to_constant ();
9170 if (rgl->max_nscalars_per_iter < nscalars_per_iter)
9172 /* For now, we only support cases in which all loads and stores fall back
9173 to VnQI or none do. */
9174 gcc_assert (!rgl->max_nscalars_per_iter
9175 || (rgl->factor == 1 && factor == 1)
9176 || (rgl->max_nscalars_per_iter * rgl->factor
9177 == nscalars_per_iter * factor));
9178 rgl->max_nscalars_per_iter = nscalars_per_iter;
9179 rgl->type = vectype;
9180 rgl->factor = factor;
9184 /* Given a complete set of length LENS, extract length number INDEX for an
9185 rgroup that operates on NVECTORS vectors, where 0 <= INDEX < NVECTORS. */
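/* As with the loop masks above, the controls are created lazily: on first
   use each length gets a temporary SSA name with a dummy definition, and
   the real definitions are filled in later.  */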
9187 tree
9188 vect_get_loop_len (loop_vec_info loop_vinfo, vec_loop_lens *lens,
9189 unsigned int nvectors, unsigned int index)
9191 rgroup_controls *rgl = &(*lens)[nvectors - 1];
9193 /* Populate the rgroup's len array, if this is the first time we've
9194 used it. */
9195 if (rgl->controls.is_empty ())
9197 rgl->controls.safe_grow_cleared (nvectors, true);
9198 for (unsigned int i = 0; i < nvectors; ++i)
9200 tree len_type = LOOP_VINFO_RGROUP_COMPARE_TYPE (loop_vinfo);
9201 gcc_assert (len_type != NULL_TREE);
9202 tree len = make_temp_ssa_name (len_type, NULL, "loop_len");
9204 /* Provide a dummy definition until the real one is available. */
9205 SSA_NAME_DEF_STMT (len) = gimple_build_nop ();
9206 rgl->controls[i] = len;
9210 return rgl->controls[index];
9213 /* Scale profiling counters by estimation for LOOP which is vectorized
9214 by factor VF. */
9216 static void
9217 scale_profile_for_vect_loop (class loop *loop, unsigned vf)
9219 edge preheader = loop_preheader_edge (loop);
9220 /* Reduce loop iterations by the vectorization factor. */
9221 gcov_type new_est_niter = niter_for_unrolled_loop (loop, vf);
9222 profile_count freq_h = loop->header->count, freq_e = preheader->count ();
9224 if (freq_h.nonzero_p ())
9226 profile_probability p;
9228 /* Avoid dropping loop body profile counter to 0 because of zero count
9229 in loop's preheader. */
9230 if (!(freq_e == profile_count::zero ()))
9231 freq_e = freq_e.force_nonzero ();
9232 p = freq_e.apply_scale (new_est_niter + 1, 1).probability_in (freq_h);
9233 scale_loop_frequencies (loop, p);
9236 edge exit_e = single_exit (loop);
9237 exit_e->probability = profile_probability::always ()
9238 .apply_scale (1, new_est_niter + 1);
9240 edge exit_l = single_pred_edge (loop->latch);
9241 profile_probability prob = exit_l->probability;
9242 exit_l->probability = exit_e->probability.invert ();
9243 if (prob.initialized_p () && exit_l->probability.initialized_p ())
9244 scale_bbs_frequencies (&loop->latch, 1, exit_l->probability / prob);
9247 /* For a vectorized stmt DEF_STMT_INFO adjust all vectorized PHI
9248 latch edge values originally defined by it. */
9250 static void
9251 maybe_set_vectorized_backedge_value (loop_vec_info loop_vinfo,
9252 stmt_vec_info def_stmt_info)
9254 tree def = gimple_get_lhs (vect_orig_stmt (def_stmt_info)->stmt);
9255 if (!def || TREE_CODE (def) != SSA_NAME)
9256 return;
9257 stmt_vec_info phi_info;
9258 imm_use_iterator iter;
9259 use_operand_p use_p;
9260 FOR_EACH_IMM_USE_FAST (use_p, iter, def)
9261 if (gphi *phi = dyn_cast <gphi *> (USE_STMT (use_p)))
9262 if (gimple_bb (phi)->loop_father->header == gimple_bb (phi)
9263 && (phi_info = loop_vinfo->lookup_stmt (phi))
9264 && STMT_VINFO_RELEVANT_P (phi_info)
9265 && VECTORIZABLE_CYCLE_DEF (STMT_VINFO_DEF_TYPE (phi_info))
9266 && STMT_VINFO_REDUC_TYPE (phi_info) != FOLD_LEFT_REDUCTION
9267 && STMT_VINFO_REDUC_TYPE (phi_info) != EXTRACT_LAST_REDUCTION)
9269 loop_p loop = gimple_bb (phi)->loop_father;
9270 edge e = loop_latch_edge (loop);
9271 if (PHI_ARG_DEF_FROM_EDGE (phi, e) == def)
9273 vec<gimple *> &phi_defs = STMT_VINFO_VEC_STMTS (phi_info);
9274 vec<gimple *> &latch_defs = STMT_VINFO_VEC_STMTS (def_stmt_info);
9275 gcc_assert (phi_defs.length () == latch_defs.length ());
9276 for (unsigned i = 0; i < phi_defs.length (); ++i)
9277 add_phi_arg (as_a <gphi *> (phi_defs[i]),
9278 gimple_get_lhs (latch_defs[i]), e,
9279 gimple_phi_arg_location (phi, e->dest_idx));
9284 /* Vectorize STMT_INFO if relevant, inserting any new instructions before GSI.
9285 When vectorizing STMT_INFO as a store, set *SEEN_STORE to its
9286 stmt_vec_info. */
9288 static bool
9289 vect_transform_loop_stmt (loop_vec_info loop_vinfo, stmt_vec_info stmt_info,
9290 gimple_stmt_iterator *gsi, stmt_vec_info *seen_store)
9292 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
9293 poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
9295 if (dump_enabled_p ())
9296 dump_printf_loc (MSG_NOTE, vect_location,
9297 "------>vectorizing statement: %G", stmt_info->stmt);
9299 if (MAY_HAVE_DEBUG_BIND_STMTS && !STMT_VINFO_LIVE_P (stmt_info))
9300 vect_loop_kill_debug_uses (loop, stmt_info);
9302 if (!STMT_VINFO_RELEVANT_P (stmt_info)
9303 && !STMT_VINFO_LIVE_P (stmt_info))
9304 return false;
9306 if (STMT_VINFO_VECTYPE (stmt_info))
9308 poly_uint64 nunits
9309 = TYPE_VECTOR_SUBPARTS (STMT_VINFO_VECTYPE (stmt_info));
9310 if (!STMT_SLP_TYPE (stmt_info)
9311 && maybe_ne (nunits, vf)
9312 && dump_enabled_p ())
9313 /* For SLP, VF is set according to the unrolling factor, and not
9314 to the vector size, hence for SLP this print is not valid. */
9315 dump_printf_loc (MSG_NOTE, vect_location, "multiple-types.\n");
9318 /* Pure SLP statements have already been vectorized. We still need
9319 to apply loop vectorization to hybrid SLP statements. */
9320 if (PURE_SLP_STMT (stmt_info))
9321 return false;
9323 if (dump_enabled_p ())
9324 dump_printf_loc (MSG_NOTE, vect_location, "transform statement.\n");
9326 if (vect_transform_stmt (loop_vinfo, stmt_info, gsi, NULL, NULL))
9327 *seen_store = stmt_info;
9329 return true;
9332 /* Helper function to pass to simplify_replace_tree to enable replacing trees
9333 that appear in the hash_map with their corresponding values. */
9335 static tree
9336 find_in_mapping (tree t, void *context)
9338 hash_map<tree,tree>* mapping = (hash_map<tree, tree>*) context;
9340 tree *value = mapping->get (t);
9341 return value ? *value : t;
9344 /* Update EPILOGUE's loop_vec_info. EPILOGUE was constructed as a copy of the
9345 original loop that has now been vectorized.
9347 The inits of the data_references need to be advanced with the number of
9348 iterations of the main loop. This has been computed in vect_do_peeling and
9349 is stored in parameter ADVANCE. We first restore the data_references
9350 initial offset with the values recorded in ORIG_DRS_INIT.
9352 Since the loop_vec_info of this EPILOGUE was constructed for the original
9353 loop, its stmt_vec_infos all point to the original statements. These need
9354 to be updated to point to their corresponding copies as well as the SSA_NAMES
9355 in their PATTERN_DEF_SEQs and RELATED_STMTs.
9357 The data_reference's connections also need to be updated. Their
9358 corresponding dr_vec_info need to be reconnected to the EPILOGUE's
9359 stmt_vec_infos, their statements need to point to their corresponding copy;
9360 if they are gather loads or scatter stores then their reference needs to be
9361 updated to point to its corresponding copy; and finally we set
9362 'base_misaligned' to false as we have already peeled for alignment in the
9363 prologue of the main loop. */
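/* In outline this is done in three steps: walk the PHIs and statements of
   the EPILOGUE to retarget their stmt_vec_infos and to record a mapping
   from each original LHS to its copy, then replay that mapping over the
   gathered PATTERN_DEF_SEQ and RELATED_STMT worklist, and finally fix up
   the data references as described above.  */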
9365 static void
9366 update_epilogue_loop_vinfo (class loop *epilogue, tree advance)
9368 loop_vec_info epilogue_vinfo = loop_vec_info_for_loop (epilogue);
9369 auto_vec<gimple *> stmt_worklist;
9370 hash_map<tree,tree> mapping;
9371 gimple *orig_stmt, *new_stmt;
9372 gimple_stmt_iterator epilogue_gsi;
9373 gphi_iterator epilogue_phi_gsi;
9374 stmt_vec_info stmt_vinfo = NULL, related_vinfo;
9375 basic_block *epilogue_bbs = get_loop_body (epilogue);
9376 unsigned i;
9378 free (LOOP_VINFO_BBS (epilogue_vinfo));
9379 LOOP_VINFO_BBS (epilogue_vinfo) = epilogue_bbs;
9381 /* Advance the data_references by the number of iterations of the previous
9382 loop and its prologue. */
9383 vect_update_inits_of_drs (epilogue_vinfo, advance, PLUS_EXPR);
9386 /* The EPILOGUE loop is a copy of the original loop so they share the same
9387 gimple UIDs. In this loop we update the loop_vec_info of the EPILOGUE to
9388 point to the copied statements. We also create a mapping of all LHS' in
9389 the original loop and all the LHS' in the EPILOGUE and create worklists to
9390 update the STMT_VINFO_PATTERN_DEF_SEQs and STMT_VINFO_RELATED_STMTs. */
9391 for (unsigned i = 0; i < epilogue->num_nodes; ++i)
9393 for (epilogue_phi_gsi = gsi_start_phis (epilogue_bbs[i]);
9394 !gsi_end_p (epilogue_phi_gsi); gsi_next (&epilogue_phi_gsi))
9396 new_stmt = epilogue_phi_gsi.phi ();
9398 gcc_assert (gimple_uid (new_stmt) > 0);
9399 stmt_vinfo
9400 = epilogue_vinfo->stmt_vec_infos[gimple_uid (new_stmt) - 1];
9402 orig_stmt = STMT_VINFO_STMT (stmt_vinfo);
9403 STMT_VINFO_STMT (stmt_vinfo) = new_stmt;
9405 mapping.put (gimple_phi_result (orig_stmt),
9406 gimple_phi_result (new_stmt));
9407 /* PHI nodes can not have patterns or related statements. */
9408 gcc_assert (STMT_VINFO_PATTERN_DEF_SEQ (stmt_vinfo) == NULL
9409 && STMT_VINFO_RELATED_STMT (stmt_vinfo) == NULL);
9412 for (epilogue_gsi = gsi_start_bb (epilogue_bbs[i]);
9413 !gsi_end_p (epilogue_gsi); gsi_next (&epilogue_gsi))
9415 new_stmt = gsi_stmt (epilogue_gsi);
9416 if (is_gimple_debug (new_stmt))
9417 continue;
9419 gcc_assert (gimple_uid (new_stmt) > 0);
9420 stmt_vinfo
9421 = epilogue_vinfo->stmt_vec_infos[gimple_uid (new_stmt) - 1];
9423 orig_stmt = STMT_VINFO_STMT (stmt_vinfo);
9424 STMT_VINFO_STMT (stmt_vinfo) = new_stmt;
9426 if (tree old_lhs = gimple_get_lhs (orig_stmt))
9427 mapping.put (old_lhs, gimple_get_lhs (new_stmt));
9429 if (STMT_VINFO_PATTERN_DEF_SEQ (stmt_vinfo))
9431 gimple_seq seq = STMT_VINFO_PATTERN_DEF_SEQ (stmt_vinfo);
9432 for (gimple_stmt_iterator gsi = gsi_start (seq);
9433 !gsi_end_p (gsi); gsi_next (&gsi))
9434 stmt_worklist.safe_push (gsi_stmt (gsi));
9437 related_vinfo = STMT_VINFO_RELATED_STMT (stmt_vinfo);
9438 if (related_vinfo != NULL && related_vinfo != stmt_vinfo)
9440 gimple *stmt = STMT_VINFO_STMT (related_vinfo);
9441 stmt_worklist.safe_push (stmt);
9442 /* Set BB such that the assert in
9443 'get_initial_def_for_reduction' is able to determine that
9444 the BB of the related stmt is inside this loop. */
9445 gimple_set_bb (stmt,
9446 gimple_bb (new_stmt));
9447 related_vinfo = STMT_VINFO_RELATED_STMT (related_vinfo);
9448 gcc_assert (related_vinfo == NULL
9449 || related_vinfo == stmt_vinfo);
9454 /* The PATTERN_DEF_SEQs and RELATED_STMTs in the epilogue were constructed
9455 using the original main loop and thus need to be updated to refer to the
9456 cloned variables used in the epilogue. */
9457 for (unsigned i = 0; i < stmt_worklist.length (); ++i)
9459 gimple *stmt = stmt_worklist[i];
9460 tree *new_op;
9462 for (unsigned j = 1; j < gimple_num_ops (stmt); ++j)
9464 tree op = gimple_op (stmt, j);
9465 if ((new_op = mapping.get(op)))
9466 gimple_set_op (stmt, j, *new_op);
9467 else
9469 /* PR92429: The last argument of simplify_replace_tree disables
9470 folding when replacing arguments. This is required as
9471 otherwise you might end up with different statements than the
9472 ones analyzed in vect_loop_analyze, leading to different
9473 vectorization. */
9474 op = simplify_replace_tree (op, NULL_TREE, NULL_TREE,
9475 &find_in_mapping, &mapping, false);
9476 gimple_set_op (stmt, j, op);
9481 struct data_reference *dr;
9482 vec<data_reference_p> datarefs = LOOP_VINFO_DATAREFS (epilogue_vinfo);
9483 FOR_EACH_VEC_ELT (datarefs, i, dr)
9485 orig_stmt = DR_STMT (dr);
9486 gcc_assert (gimple_uid (orig_stmt) > 0);
9487 stmt_vinfo = epilogue_vinfo->stmt_vec_infos[gimple_uid (orig_stmt) - 1];
9488 /* Data references for gather loads and scatter stores do not use the
9489 updated offset we set using ADVANCE. Instead we have to make sure the
9490 reference in each data reference points to the corresponding copy of
9491 the original in the epilogue. */
9492 if (STMT_VINFO_MEMORY_ACCESS_TYPE (vect_stmt_to_vectorize (stmt_vinfo))
9493 == VMAT_GATHER_SCATTER)
9495 DR_REF (dr)
9496 = simplify_replace_tree (DR_REF (dr), NULL_TREE, NULL_TREE,
9497 &find_in_mapping, &mapping);
9498 DR_BASE_ADDRESS (dr)
9499 = simplify_replace_tree (DR_BASE_ADDRESS (dr), NULL_TREE, NULL_TREE,
9500 &find_in_mapping, &mapping);
9502 DR_STMT (dr) = STMT_VINFO_STMT (stmt_vinfo);
9503 stmt_vinfo->dr_aux.stmt = stmt_vinfo;
9504 /* The vector size of the epilogue is smaller than that of the main loop,
9505 so the alignment is either the same or lower. This means the dr will,
9506 by definition, be aligned. */
9507 STMT_VINFO_DR_INFO (stmt_vinfo)->base_misaligned = false;
9510 epilogue_vinfo->shared->datarefs_copy.release ();
9511 epilogue_vinfo->shared->save_datarefs ();
9514 /* Function vect_transform_loop.
9516 The analysis phase has determined that the loop is vectorizable.
9517 Vectorize the loop - create vectorized stmts to replace the scalar
9518 stmts in the loop, and update the loop exit condition.
9519 Returns scalar epilogue loop if any. */
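/* Roughly, the transformation below proceeds as follows: version the loop
   if required, peel prologue/epilogue loops via vect_do_peeling, compute
   the number of vector iterations, schedule any SLP instances, vectorize
   the remaining relevant statements basic block by basic block, rewrite
   the loop exit condition, scale the profile, and finally update the
   loop_vec_info of the epilogue loop, if one was created.  */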
9521 class loop *
9522 vect_transform_loop (loop_vec_info loop_vinfo, gimple *loop_vectorized_call)
9524 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
9525 class loop *epilogue = NULL;
9526 basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
9527 int nbbs = loop->num_nodes;
9528 int i;
9529 tree niters_vector = NULL_TREE;
9530 tree step_vector = NULL_TREE;
9531 tree niters_vector_mult_vf = NULL_TREE;
9532 poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
9533 unsigned int lowest_vf = constant_lower_bound (vf);
9534 gimple *stmt;
9535 bool check_profitability = false;
9536 unsigned int th;
9538 DUMP_VECT_SCOPE ("vec_transform_loop");
9540 loop_vinfo->shared->check_datarefs ();
9542 /* Use the more conservative vectorization threshold. If the number
9543 of iterations is constant, assume the cost check has been performed
9544 by our caller. If the threshold makes all loops profitable that
9545 run at least the (estimated) vectorization factor number of times,
9546 checking is pointless, too. */
9547 th = LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo);
9548 if (vect_apply_runtime_profitability_check_p (loop_vinfo))
9550 if (dump_enabled_p ())
9551 dump_printf_loc (MSG_NOTE, vect_location,
9552 "Profitability threshold is %d loop iterations.\n",
9553 th);
9554 check_profitability = true;
9557 /* Make sure there exists a single-predecessor exit bb. Do this before
9558 versioning. */
9559 edge e = single_exit (loop);
9560 if (! single_pred_p (e->dest))
9562 split_loop_exit_edge (e, true);
9563 if (dump_enabled_p ())
9564 dump_printf (MSG_NOTE, "split exit edge\n");
9567 /* Version the loop first, if required, so the profitability check
9568 comes first. */
9570 if (LOOP_REQUIRES_VERSIONING (loop_vinfo))
9572 class loop *sloop
9573 = vect_loop_versioning (loop_vinfo, loop_vectorized_call);
9574 sloop->force_vectorize = false;
9575 check_profitability = false;
9578 /* Make sure there exists a single-predecessor exit bb also on the
9579 scalar loop copy. Do this after versioning but before peeling
9580 so CFG structure is fine for both scalar and if-converted loop
9581 to make slpeel_duplicate_current_defs_from_edges face matched
9582 loop closed PHI nodes on the exit. */
9583 if (LOOP_VINFO_SCALAR_LOOP (loop_vinfo))
9585 e = single_exit (LOOP_VINFO_SCALAR_LOOP (loop_vinfo));
9586 if (! single_pred_p (e->dest))
9588 split_loop_exit_edge (e, true);
9589 if (dump_enabled_p ())
9590 dump_printf (MSG_NOTE, "split exit edge of scalar loop\n");
9594 tree niters = vect_build_loop_niters (loop_vinfo);
9595 LOOP_VINFO_NITERS_UNCHANGED (loop_vinfo) = niters;
9596 tree nitersm1 = unshare_expr (LOOP_VINFO_NITERSM1 (loop_vinfo));
9597 bool niters_no_overflow = loop_niters_no_overflow (loop_vinfo);
9598 tree advance;
9599 drs_init_vec orig_drs_init;
9601 epilogue = vect_do_peeling (loop_vinfo, niters, nitersm1, &niters_vector,
9602 &step_vector, &niters_vector_mult_vf, th,
9603 check_profitability, niters_no_overflow,
9604 &advance);
9606 if (LOOP_VINFO_SCALAR_LOOP (loop_vinfo)
9607 && LOOP_VINFO_SCALAR_LOOP_SCALING (loop_vinfo).initialized_p ())
9608 scale_loop_frequencies (LOOP_VINFO_SCALAR_LOOP (loop_vinfo),
9609 LOOP_VINFO_SCALAR_LOOP_SCALING (loop_vinfo));
9611 if (niters_vector == NULL_TREE)
9613 if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
9614 && !LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo)
9615 && known_eq (lowest_vf, vf))
9617 niters_vector
9618 = build_int_cst (TREE_TYPE (LOOP_VINFO_NITERS (loop_vinfo)),
9619 LOOP_VINFO_INT_NITERS (loop_vinfo) / lowest_vf);
9620 step_vector = build_one_cst (TREE_TYPE (niters));
9622 else if (vect_use_loop_mask_for_alignment_p (loop_vinfo))
9623 vect_gen_vector_loop_niters (loop_vinfo, niters, &niters_vector,
9624 &step_vector, niters_no_overflow);
9625 else
9626 /* vect_do_peeling subtracted the number of peeled prologue
9627 iterations from LOOP_VINFO_NITERS. */
9628 vect_gen_vector_loop_niters (loop_vinfo, LOOP_VINFO_NITERS (loop_vinfo),
9629 &niters_vector, &step_vector,
9630 niters_no_overflow);
9633 /* 1) Make sure the loop header has exactly two entries
9634 2) Make sure we have a preheader basic block. */
9636 gcc_assert (EDGE_COUNT (loop->header->preds) == 2);
9638 split_edge (loop_preheader_edge (loop));
9640 if (vect_use_loop_mask_for_alignment_p (loop_vinfo))
9641 /* This will deal with any possible peeling. */
9642 vect_prepare_for_masked_peels (loop_vinfo);
9644 /* Schedule the SLP instances first, then handle loop vectorization
9645 below. */
9646 if (!loop_vinfo->slp_instances.is_empty ())
9648 DUMP_VECT_SCOPE ("scheduling SLP instances");
9649 vect_schedule_slp (loop_vinfo, LOOP_VINFO_SLP_INSTANCES (loop_vinfo));
9652 /* FORNOW: the vectorizer supports only loops whose body consists
9653 of one basic block (header + empty latch). When the vectorizer
9654 supports more involved loop forms, the order in which the BBs are
9655 traversed will need to be reconsidered. */
9657 for (i = 0; i < nbbs; i++)
9659 basic_block bb = bbs[i];
9660 stmt_vec_info stmt_info;
9662 for (gphi_iterator si = gsi_start_phis (bb); !gsi_end_p (si);
9663 gsi_next (&si))
9665 gphi *phi = si.phi ();
9666 if (dump_enabled_p ())
9667 dump_printf_loc (MSG_NOTE, vect_location,
9668 "------>vectorizing phi: %G", phi);
9669 stmt_info = loop_vinfo->lookup_stmt (phi);
9670 if (!stmt_info)
9671 continue;
9673 if (MAY_HAVE_DEBUG_BIND_STMTS && !STMT_VINFO_LIVE_P (stmt_info))
9674 vect_loop_kill_debug_uses (loop, stmt_info);
9676 if (!STMT_VINFO_RELEVANT_P (stmt_info)
9677 && !STMT_VINFO_LIVE_P (stmt_info))
9678 continue;
9680 if (STMT_VINFO_VECTYPE (stmt_info)
9681 && (maybe_ne
9682 (TYPE_VECTOR_SUBPARTS (STMT_VINFO_VECTYPE (stmt_info)), vf))
9683 && dump_enabled_p ())
9684 dump_printf_loc (MSG_NOTE, vect_location, "multiple-types.\n");
9686 if ((STMT_VINFO_DEF_TYPE (stmt_info) == vect_induction_def
9687 || STMT_VINFO_DEF_TYPE (stmt_info) == vect_reduction_def
9688 || STMT_VINFO_DEF_TYPE (stmt_info) == vect_double_reduction_def
9689 || STMT_VINFO_DEF_TYPE (stmt_info) == vect_nested_cycle
9690 || STMT_VINFO_DEF_TYPE (stmt_info) == vect_internal_def)
9691 && ! PURE_SLP_STMT (stmt_info))
9693 if (dump_enabled_p ())
9694 dump_printf_loc (MSG_NOTE, vect_location, "transform phi.\n");
9695 vect_transform_stmt (loop_vinfo, stmt_info, NULL, NULL, NULL);
9699 for (gphi_iterator si = gsi_start_phis (bb); !gsi_end_p (si);
9700 gsi_next (&si))
9702 gphi *phi = si.phi ();
9703 stmt_info = loop_vinfo->lookup_stmt (phi);
9704 if (!stmt_info)
9705 continue;
9707 if (!STMT_VINFO_RELEVANT_P (stmt_info)
9708 && !STMT_VINFO_LIVE_P (stmt_info))
9709 continue;
9711 if ((STMT_VINFO_DEF_TYPE (stmt_info) == vect_induction_def
9712 || STMT_VINFO_DEF_TYPE (stmt_info) == vect_reduction_def
9713 || STMT_VINFO_DEF_TYPE (stmt_info) == vect_double_reduction_def
9714 || STMT_VINFO_DEF_TYPE (stmt_info) == vect_nested_cycle
9715 || STMT_VINFO_DEF_TYPE (stmt_info) == vect_internal_def)
9716 && ! PURE_SLP_STMT (stmt_info))
9717 maybe_set_vectorized_backedge_value (loop_vinfo, stmt_info);
9720 for (gimple_stmt_iterator si = gsi_start_bb (bb);
9721 !gsi_end_p (si);)
9723 stmt = gsi_stmt (si);
9724 /* During vectorization remove existing clobber stmts. */
9725 if (gimple_clobber_p (stmt))
9727 unlink_stmt_vdef (stmt);
9728 gsi_remove (&si, true);
9729 release_defs (stmt);
9731 else
9733 /* Ignore vector stmts created in the outer loop. */
9734 stmt_info = loop_vinfo->lookup_stmt (stmt);
9736 /* vector stmts created in the outer-loop during vectorization of
9737 stmts in an inner-loop may not have a stmt_info, and do not
9738 need to be vectorized. */
9739 stmt_vec_info seen_store = NULL;
9740 if (stmt_info)
9742 if (STMT_VINFO_IN_PATTERN_P (stmt_info))
9744 gimple *def_seq = STMT_VINFO_PATTERN_DEF_SEQ (stmt_info);
9745 for (gimple_stmt_iterator subsi = gsi_start (def_seq);
9746 !gsi_end_p (subsi); gsi_next (&subsi))
9748 stmt_vec_info pat_stmt_info
9749 = loop_vinfo->lookup_stmt (gsi_stmt (subsi));
9750 vect_transform_loop_stmt (loop_vinfo, pat_stmt_info,
9751 &si, &seen_store);
9753 stmt_vec_info pat_stmt_info
9754 = STMT_VINFO_RELATED_STMT (stmt_info);
9755 if (vect_transform_loop_stmt (loop_vinfo, pat_stmt_info,
9756 &si, &seen_store))
9757 maybe_set_vectorized_backedge_value (loop_vinfo,
9758 pat_stmt_info);
9760 else
9762 if (vect_transform_loop_stmt (loop_vinfo, stmt_info, &si,
9763 &seen_store))
9764 maybe_set_vectorized_backedge_value (loop_vinfo,
9765 stmt_info);
9768 gsi_next (&si);
9769 if (seen_store)
9771 if (STMT_VINFO_GROUPED_ACCESS (seen_store))
9772 /* Interleaving. The vectorization of the
9773 interleaving chain was completed - free
9774 all the stores in the chain. */
9775 vect_remove_stores (loop_vinfo,
9776 DR_GROUP_FIRST_ELEMENT (seen_store));
9777 else
9778 /* Free the attached stmt_vec_info and remove the stmt. */
9779 loop_vinfo->remove_stmt (stmt_info);
9784 /* Stub out scalar statements that must not survive vectorization.
9785 Doing this here helps with grouped statements, or statements that
9786 are involved in patterns. */
9787 for (gimple_stmt_iterator gsi = gsi_start_bb (bb);
9788 !gsi_end_p (gsi); gsi_next (&gsi))
9790 gcall *call = dyn_cast <gcall *> (gsi_stmt (gsi));
9791 if (!call || !gimple_call_internal_p (call))
9792 continue;
9793 internal_fn ifn = gimple_call_internal_fn (call);
9794 if (ifn == IFN_MASK_LOAD)
9796 tree lhs = gimple_get_lhs (call);
9797 if (!VECTOR_TYPE_P (TREE_TYPE (lhs)))
9799 tree zero = build_zero_cst (TREE_TYPE (lhs));
9800 gimple *new_stmt = gimple_build_assign (lhs, zero);
9801 gsi_replace (&gsi, new_stmt, true);
9804 else if (conditional_internal_fn_code (ifn) != ERROR_MARK)
9806 tree lhs = gimple_get_lhs (call);
9807 if (!VECTOR_TYPE_P (TREE_TYPE (lhs)))
9809 tree else_arg
9810 = gimple_call_arg (call, gimple_call_num_args (call) - 1);
9811 gimple *new_stmt = gimple_build_assign (lhs, else_arg);
9812 gsi_replace (&gsi, new_stmt, true);
9816 } /* BBs in loop */
9818 /* The vectorization factor is always > 1, so if we use an IV increment of 1,
9819 a zero NITERS becomes a nonzero NITERS_VECTOR. */
9820 if (integer_onep (step_vector))
9821 niters_no_overflow = true;
9822 vect_set_loop_condition (loop, loop_vinfo, niters_vector, step_vector,
9823 niters_vector_mult_vf, !niters_no_overflow);
9825 unsigned int assumed_vf = vect_vf_for_cost (loop_vinfo);
9826 scale_profile_for_vect_loop (loop, assumed_vf);
9828 /* True if the final iteration might not handle a full vector's
9829 worth of scalar iterations. */
9830 bool final_iter_may_be_partial
9831 = LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo);
9832 /* The minimum number of iterations performed by the epilogue. This
9833 is 1 when peeling for gaps because we always need a final scalar
9834 iteration. */
9835 int min_epilogue_iters = LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo) ? 1 : 0;
9836 /* +1 to convert latch counts to loop iteration counts,
9837 -min_epilogue_iters to remove iterations that cannot be performed
9838 by the vector code. */
9839 int bias_for_lowest = 1 - min_epilogue_iters;
9840 int bias_for_assumed = bias_for_lowest;
9841 int alignment_npeels = LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo);
9842 if (alignment_npeels && LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))
9844 /* When the amount of peeling is known at compile time, the first
9845 iteration will have exactly alignment_npeels active elements.
9846 In the worst case it will have at least one. */
9847 int min_first_active = (alignment_npeels > 0 ? alignment_npeels : 1);
9848 bias_for_lowest += lowest_vf - min_first_active;
9849 bias_for_assumed += assumed_vf - min_first_active;
9851 /* In these calculations the "- 1" converts loop iteration counts
9852 back to latch counts. */
9853 if (loop->any_upper_bound)
9855 loop_vec_info main_vinfo = LOOP_VINFO_ORIG_LOOP_INFO (loop_vinfo);
9856 loop->nb_iterations_upper_bound
9857 = (final_iter_may_be_partial
9858 ? wi::udiv_ceil (loop->nb_iterations_upper_bound + bias_for_lowest,
9859 lowest_vf) - 1
9860 : wi::udiv_floor (loop->nb_iterations_upper_bound + bias_for_lowest,
9861 lowest_vf) - 1);
9862 if (main_vinfo)
9864 unsigned int bound;
9865 poly_uint64 main_iters
9866 = upper_bound (LOOP_VINFO_VECT_FACTOR (main_vinfo),
9867 LOOP_VINFO_COST_MODEL_THRESHOLD (main_vinfo));
9868 main_iters
9869 = upper_bound (main_iters,
9870 LOOP_VINFO_VERSIONING_THRESHOLD (main_vinfo));
9871 if (can_div_away_from_zero_p (main_iters,
9872 LOOP_VINFO_VECT_FACTOR (loop_vinfo),
9873 &bound))
9874 loop->nb_iterations_upper_bound
9875 = wi::umin ((widest_int) (bound - 1),
9876 loop->nb_iterations_upper_bound);
9879 if (loop->any_likely_upper_bound)
9880 loop->nb_iterations_likely_upper_bound
9881 = (final_iter_may_be_partial
9882 ? wi::udiv_ceil (loop->nb_iterations_likely_upper_bound
9883 + bias_for_lowest, lowest_vf) - 1
9884 : wi::udiv_floor (loop->nb_iterations_likely_upper_bound
9885 + bias_for_lowest, lowest_vf) - 1);
9886 if (loop->any_estimate)
9887 loop->nb_iterations_estimate
9888 = (final_iter_may_be_partial
9889 ? wi::udiv_ceil (loop->nb_iterations_estimate + bias_for_assumed,
9890 assumed_vf) - 1
9891 : wi::udiv_floor (loop->nb_iterations_estimate + bias_for_assumed,
9892 assumed_vf) - 1);
9894 if (dump_enabled_p ())
9896 if (!LOOP_VINFO_EPILOGUE_P (loop_vinfo))
9898 dump_printf_loc (MSG_NOTE, vect_location,
9899 "LOOP VECTORIZED\n");
9900 if (loop->inner)
9901 dump_printf_loc (MSG_NOTE, vect_location,
9902 "OUTER LOOP VECTORIZED\n");
9903 dump_printf (MSG_NOTE, "\n");
9905 else
9906 dump_printf_loc (MSG_NOTE, vect_location,
9907 "LOOP EPILOGUE VECTORIZED (MODE=%s)\n",
9908 GET_MODE_NAME (loop_vinfo->vector_mode));
9911 /* Loops vectorized with a variable factor won't benefit from
9912 unrolling/peeling. */
9913 if (!vf.is_constant ())
9915 loop->unroll = 1;
9916 if (dump_enabled_p ())
9917 dump_printf_loc (MSG_NOTE, vect_location, "Disabling unrolling due to"
9918 " variable-length vectorization factor\n");
9920 /* Free SLP instances here because otherwise stmt reference counting
9921 won't work. */
9922 slp_instance instance;
9923 FOR_EACH_VEC_ELT (LOOP_VINFO_SLP_INSTANCES (loop_vinfo), i, instance)
9924 vect_free_slp_instance (instance);
9925 LOOP_VINFO_SLP_INSTANCES (loop_vinfo).release ();
9926 /* Clear the safelen field since its value is invalid after vectorization:
9927 the vectorized loop can have loop-carried dependencies. */
9928 loop->safelen = 0;
9930 if (epilogue)
9932 update_epilogue_loop_vinfo (epilogue, advance);
9934 epilogue->simduid = loop->simduid;
9935 epilogue->force_vectorize = loop->force_vectorize;
9936 epilogue->dont_vectorize = false;
9939 return epilogue;
9942 /* The code below tries to perform a simple optimization - revert
9943 if-conversion for masked stores, i.e. if the mask of a store is zero,
9944 skip the store and, if possible, the producers of the stored values too.
9945 For example,
9946 for (i=0; i<n; i++)
9947 if (c[i])
9949 p1[i] += 1;
9950 p2[i] = p3[i] +2;
9952 this transformation will produce the following semi-hammock:
9954 if (!mask__ifc__42.18_165 == { 0, 0, 0, 0, 0, 0, 0, 0 })
9956 vect__11.19_170 = MASK_LOAD (vectp_p1.20_168, 0B, mask__ifc__42.18_165);
9957 vect__12.22_172 = vect__11.19_170 + vect_cst__171;
9958 MASK_STORE (vectp_p1.23_175, 0B, mask__ifc__42.18_165, vect__12.22_172);
9959 vect__18.25_182 = MASK_LOAD (vectp_p3.26_180, 0B, mask__ifc__42.18_165);
9960 vect__19.28_184 = vect__18.25_182 + vect_cst__183;
9961 MASK_STORE (vectp_p2.29_187, 0B, mask__ifc__42.18_165, vect__19.28_184);
9965 void
9966 optimize_mask_stores (class loop *loop)
9968 basic_block *bbs = get_loop_body (loop);
9969 unsigned nbbs = loop->num_nodes;
9970 unsigned i;
9971 basic_block bb;
9972 class loop *bb_loop;
9973 gimple_stmt_iterator gsi;
9974 gimple *stmt;
9975 auto_vec<gimple *> worklist;
9976 auto_purge_vect_location sentinel;
9978 vect_location = find_loop_location (loop);
9979 /* Pick up all masked stores in loop if any. */
9980 for (i = 0; i < nbbs; i++)
9982 bb = bbs[i];
9983 for (gsi = gsi_start_bb (bb); !gsi_end_p (gsi);
9984 gsi_next (&gsi))
9986 stmt = gsi_stmt (gsi);
9987 if (gimple_call_internal_p (stmt, IFN_MASK_STORE))
9988 worklist.safe_push (stmt);
9992 free (bbs);
9993 if (worklist.is_empty ())
9994 return;
9996 /* Loop has masked stores. */
9997 while (!worklist.is_empty ())
9999 gimple *last, *last_store;
10000 edge e, efalse;
10001 tree mask;
10002 basic_block store_bb, join_bb;
10003 gimple_stmt_iterator gsi_to;
10004 tree vdef, new_vdef;
10005 gphi *phi;
10006 tree vectype;
10007 tree zero;
10009 last = worklist.pop ();
10010 mask = gimple_call_arg (last, 2);
10011 bb = gimple_bb (last);
10012 /* Create then_bb and the if-then structure in the CFG; then_bb belongs to
10013 the same loop as if_bb. It could be different from LOOP when a
10014 two-level loop nest is vectorized and the mask_store belongs to the inner
10015 one. */
10016 e = split_block (bb, last);
10017 bb_loop = bb->loop_father;
10018 gcc_assert (loop == bb_loop || flow_loop_nested_p (loop, bb_loop));
10019 join_bb = e->dest;
10020 store_bb = create_empty_bb (bb);
10021 add_bb_to_loop (store_bb, bb_loop);
10022 e->flags = EDGE_TRUE_VALUE;
10023 efalse = make_edge (bb, store_bb, EDGE_FALSE_VALUE);
      /* Give STORE_BB the profile of the (unlikely) false edge.  */
10025 efalse->probability = profile_probability::unlikely ();
10026 store_bb->count = efalse->count ();
10027 make_single_succ_edge (store_bb, join_bb, EDGE_FALLTHRU);
10028 if (dom_info_available_p (CDI_DOMINATORS))
10029 set_immediate_dominator (CDI_DOMINATORS, store_bb, bb);
10030 if (dump_enabled_p ())
10031 dump_printf_loc (MSG_NOTE, vect_location,
10032 "Create new block %d to sink mask stores.",
10033 store_bb->index);
10034 /* Create vector comparison with boolean result. */
10035 vectype = TREE_TYPE (mask);
10036 zero = build_zero_cst (vectype);
10037 stmt = gimple_build_cond (EQ_EXPR, mask, zero, NULL_TREE, NULL_TREE);
10038 gsi = gsi_last_bb (bb);
10039 gsi_insert_after (&gsi, stmt, GSI_SAME_STMT);
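      /* At this point BB ends with the comparison MASK == {0,...}: its true
	 edge goes straight to JOIN_BB, and its false edge enters STORE_BB,
	 which falls through to JOIN_BB.  STORE_BB is therefore only entered
	 when at least one mask element is set.  */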
      /* Create a new PHI node for the vdef of the last masked store:
	   .MEM_2 = VDEF <.MEM_1>
	 will be converted to
	   .MEM_3 = VDEF <.MEM_1>
	 and a new PHI node will be created in the join bb:
	   .MEM_2 = PHI <.MEM_1, .MEM_3>  */
10047 vdef = gimple_vdef (last);
10048 new_vdef = make_ssa_name (gimple_vop (cfun), last);
10049 gimple_set_vdef (last, new_vdef);
10050 phi = create_phi_node (vdef, join_bb);
10051 add_phi_arg (phi, new_vdef, EDGE_SUCC (store_bb, 0), UNKNOWN_LOCATION);
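      /* The PHI argument for the other incoming edge E (the path that skips
	 STORE_BB) is added at the bottom of the loop below, once the vuse of
	 the last store actually sunk is known.  */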
10053 /* Put all masked stores with the same mask to STORE_BB if possible. */
10054 while (true)
10056 gimple_stmt_iterator gsi_from;
10057 gimple *stmt1 = NULL;
10059 /* Move masked store to STORE_BB. */
10060 last_store = last;
10061 gsi = gsi_for_stmt (last);
10062 gsi_from = gsi;
10063 /* Shift GSI to the previous stmt for further traversal. */
10064 gsi_prev (&gsi);
10065 gsi_to = gsi_start_bb (store_bb);
10066 gsi_move_before (&gsi_from, &gsi_to);
	  /* Set up GSI_TO at the start of the (now non-empty) STORE_BB.  */
10068 gsi_to = gsi_start_bb (store_bb);
10069 if (dump_enabled_p ())
10070 dump_printf_loc (MSG_NOTE, vect_location,
10071 "Move stmt to created bb\n%G", last);
10072 /* Move all stored value producers if possible. */
10073 while (!gsi_end_p (gsi))
10075 tree lhs;
10076 imm_use_iterator imm_iter;
10077 use_operand_p use_p;
10078 bool res;
10080 /* Skip debug statements. */
10081 if (is_gimple_debug (gsi_stmt (gsi)))
10083 gsi_prev (&gsi);
10084 continue;
10086 stmt1 = gsi_stmt (gsi);
	      /* Do not consider statements that write to memory or have a
		 volatile operand.  */
10089 if (gimple_vdef (stmt1)
10090 || gimple_has_volatile_ops (stmt1))
10091 break;
10092 gsi_from = gsi;
10093 gsi_prev (&gsi);
10094 lhs = gimple_get_lhs (stmt1);
10095 if (!lhs)
10096 break;
10098 /* LHS of vectorized stmt must be SSA_NAME. */
10099 if (TREE_CODE (lhs) != SSA_NAME)
10100 break;
10102 if (!VECTOR_TYPE_P (TREE_TYPE (lhs)))
10104 /* Remove dead scalar statement. */
10105 if (has_zero_uses (lhs))
10107 gsi_remove (&gsi_from, true);
10108 continue;
10112 /* Check that LHS does not have uses outside of STORE_BB. */
10113 res = true;
10114 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, lhs)
10116 gimple *use_stmt;
10117 use_stmt = USE_STMT (use_p);
10118 if (is_gimple_debug (use_stmt))
10119 continue;
10120 if (gimple_bb (use_stmt) != store_bb)
10122 res = false;
10123 break;
10126 if (!res)
10127 break;
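	      /* STMT1 may only be sunk if it reads the same memory state as
		 LAST_STORE; a different VUSE would mean the statement could
		 observe a different value once moved into STORE_BB.  */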
10129 if (gimple_vuse (stmt1)
10130 && gimple_vuse (stmt1) != gimple_vuse (last_store))
10131 break;
10133 /* Can move STMT1 to STORE_BB. */
10134 if (dump_enabled_p ())
10135 dump_printf_loc (MSG_NOTE, vect_location,
10136 "Move stmt to created bb\n%G", stmt1);
10137 gsi_move_before (&gsi_from, &gsi_to);
10138 /* Shift GSI_TO for further insertion. */
10139 gsi_prev (&gsi_to);
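	  /* The walk above stopped at STMT1.  Merging into this STORE_BB can
	     only continue if STMT1 is itself the next masked store on the
	     worklist and uses the same mask; otherwise the remaining stores
	     are handled by further iterations of the outer loop.  */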
10141 /* Put other masked stores with the same mask to STORE_BB. */
10142 if (worklist.is_empty ()
10143 || gimple_call_arg (worklist.last (), 2) != mask
10144 || worklist.last () != stmt1)
10145 break;
10146 last = worklist.pop ();
10148 add_phi_arg (phi, gimple_vuse (last_store), e, UNKNOWN_LOCATION);
10152 /* Decide whether it is possible to use a zero-based induction variable
10153 when vectorizing LOOP_VINFO with partial vectors. If it is, return
10154 the value that the induction variable must be able to hold in order
10155 to ensure that the rgroups eventually have no active vector elements.
10156 Return -1 otherwise. */
10158 widest_int
10159 vect_iv_limit_for_partial_vectors (loop_vec_info loop_vinfo)
10161 tree niters_skip = LOOP_VINFO_MASK_SKIP_NITERS (loop_vinfo);
10162 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
10163 unsigned HOST_WIDE_INT max_vf = vect_max_vf (loop_vinfo);
10165 /* Calculate the value that the induction variable must be able
10166 to hit in order to ensure that we end the loop with an all-false mask.
10167 This involves adding the maximum number of inactive trailing scalar
10168 iterations. */
10169 widest_int iv_limit = -1;
10170 if (max_loop_iterations (loop, &iv_limit))
10172 if (niters_skip)
10174 /* Add the maximum number of skipped iterations to the
10175 maximum iteration count. */
10176 if (TREE_CODE (niters_skip) == INTEGER_CST)
10177 iv_limit += wi::to_widest (niters_skip);
10178 else
10179 iv_limit += max_vf - 1;
10181 else if (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo))
10182 /* Make a conservatively-correct assumption. */
10183 iv_limit += max_vf - 1;
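    /* In both cases above at most VF - 1 scalar iterations are skipped or
       peeled, so adding MAX_VF - 1 is a conservative upper bound.  */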
10185 /* IV_LIMIT is the maximum number of latch iterations, which is also
10186 the maximum in-range IV value. Round this value down to the previous
10187 vector alignment boundary and then add an extra full iteration. */
10188 poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
10189 iv_limit = (iv_limit & -(int) known_alignment (vf)) + max_vf;
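      /* For example (with made-up numbers): if VF is the constant 16,
	 MAX_VF is 16 and the loop above computed IV_LIMIT = 103, then the
	 statement above yields (103 & -16) + 16 = 96 + 16 = 112, the largest
	 value the IV must be able to represent.  */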
10191 return iv_limit;
/* For the given rgroup_controls RGC, check whether an induction variable
   would ever hit a value that produces a set of all-false masks or zero
   lengths before wrapping around.  Return true if the IV might wrap around
   before hitting that desired value, otherwise return false.  */
10199 bool
10200 vect_rgroup_iv_might_wrap_p (loop_vec_info loop_vinfo, rgroup_controls *rgc)
10202 widest_int iv_limit = vect_iv_limit_for_partial_vectors (loop_vinfo);
10204 if (iv_limit == -1)
10205 return true;
10207 tree compare_type = LOOP_VINFO_RGROUP_COMPARE_TYPE (loop_vinfo);
10208 unsigned int compare_precision = TYPE_PRECISION (compare_type);
10209 unsigned nitems = rgc->max_nscalars_per_iter * rgc->factor;
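  /* The IV counts scalar items, so it must be able to reach
     IV_LIMIT * NITEMS.  If representing that value needs more bits than
     COMPARE_TYPE provides (e.g. 33 bits against a 32-bit compare type), the
     IV could wrap around before an all-false control is produced.  */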
10211 if (wi::min_precision (iv_limit * nitems, UNSIGNED) > compare_precision)
10212 return true;
10214 return false;