gcc/tree-vect-loop.cc
1 /* Loop Vectorization
2 Copyright (C) 2003-2022 Free Software Foundation, Inc.
3 Contributed by Dorit Naishlos <dorit@il.ibm.com> and
4 Ira Rosen <irar@il.ibm.com>
6 This file is part of GCC.
8 GCC is free software; you can redistribute it and/or modify it under
9 the terms of the GNU General Public License as published by the Free
10 Software Foundation; either version 3, or (at your option) any later
11 version.
13 GCC is distributed in the hope that it will be useful, but WITHOUT ANY
14 WARRANTY; without even the implied warranty of MERCHANTABILITY or
15 FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
16 for more details.
18 You should have received a copy of the GNU General Public License
19 along with GCC; see the file COPYING3. If not see
20 <http://www.gnu.org/licenses/>. */
22 #define INCLUDE_ALGORITHM
23 #include "config.h"
24 #include "system.h"
25 #include "coretypes.h"
26 #include "backend.h"
27 #include "target.h"
28 #include "rtl.h"
29 #include "tree.h"
30 #include "gimple.h"
31 #include "cfghooks.h"
32 #include "tree-pass.h"
33 #include "ssa.h"
34 #include "optabs-tree.h"
35 #include "diagnostic-core.h"
36 #include "fold-const.h"
37 #include "stor-layout.h"
38 #include "cfganal.h"
39 #include "gimplify.h"
40 #include "gimple-iterator.h"
41 #include "gimplify-me.h"
42 #include "tree-ssa-loop-ivopts.h"
43 #include "tree-ssa-loop-manip.h"
44 #include "tree-ssa-loop-niter.h"
45 #include "tree-ssa-loop.h"
46 #include "cfgloop.h"
47 #include "tree-scalar-evolution.h"
48 #include "tree-vectorizer.h"
49 #include "gimple-fold.h"
50 #include "cgraph.h"
51 #include "tree-cfg.h"
52 #include "tree-if-conv.h"
53 #include "internal-fn.h"
54 #include "tree-vector-builder.h"
55 #include "vec-perm-indices.h"
56 #include "tree-eh.h"
57 #include "case-cfn-macros.h"
59 /* Loop Vectorization Pass.
61 This pass tries to vectorize loops.
63 For example, the vectorizer transforms the following simple loop:
65 short a[N]; short b[N]; short c[N]; int i;
67 for (i=0; i<N; i++){
68 a[i] = b[i] + c[i];
71 as if it was manually vectorized by rewriting the source code into:
73 typedef int __attribute__((mode(V8HI))) v8hi;
74 short a[N]; short b[N]; short c[N]; int i;
75 v8hi *pa = (v8hi*)a, *pb = (v8hi*)b, *pc = (v8hi*)c;
76 v8hi va, vb, vc;
78 for (i=0; i<N/8; i++){
79 vb = pb[i];
80 vc = pc[i];
81 va = vb + vc;
82 pa[i] = va;
85 The main entry to this pass is vectorize_loops(), in which
86 the vectorizer applies a set of analyses on a given set of loops,
87 followed by the actual vectorization transformation for the loops that
88 had successfully passed the analysis phase.
89 Throughout this pass we make a distinction between two types of
90 data: scalars (which are represented by SSA_NAMES), and memory references
91 ("data-refs"). These two types of data require different handling both
92 during analysis and transformation. The types of data-refs that the
93 vectorizer currently supports are ARRAY_REFS whose base is an array DECL
94 (not a pointer), and INDIRECT_REFS through pointers; both array and pointer
95 accesses are required to have a simple (consecutive) access pattern.
97 Analysis phase:
98 ===============
99 The driver for the analysis phase is vect_analyze_loop().
100 It applies a set of analyses, some of which rely on the scalar evolution
101 analyzer (scev) developed by Sebastian Pop.
103 During the analysis phase the vectorizer records some information
104 per stmt in a "stmt_vec_info" struct which is attached to each stmt in the
105 loop, as well as general information about the loop as a whole, which is
106 recorded in a "loop_vec_info" struct attached to each loop.
108 Transformation phase:
109 =====================
110 The loop transformation phase scans all the stmts in the loop, and
111 creates a vector stmt (or a sequence of stmts) for each scalar stmt S in
112 the loop that needs to be vectorized. It inserts the vector code sequence
113 just before the scalar stmt S, and records a pointer to the vector code
114 in STMT_VINFO_VEC_STMT (stmt_info) (stmt_info is the stmt_vec_info struct
115 attached to S). This pointer will be used for the vectorization of following
116 stmts which use the def of stmt S. Stmt S is removed if it writes to memory;
117 otherwise, we rely on dead code elimination for removing it.
119 For example, say stmt S1 was vectorized into stmt VS1:
121 VS1: vb = px[i];
122 S1: b = x[i]; STMT_VINFO_VEC_STMT (stmt_info (S1)) = VS1
123 S2: a = b;
125 To vectorize stmt S2, the vectorizer first finds the stmt that defines
126 the operand 'b' (S1), and gets the relevant vector def 'vb' from the
127 vector stmt VS1 pointed to by STMT_VINFO_VEC_STMT (stmt_info (S1)). The
128 resulting sequence would be:
130 VS1: vb = px[i];
131 S1: b = x[i]; STMT_VINFO_VEC_STMT (stmt_info (S1)) = VS1
132 VS2: va = vb;
133 S2: a = b; STMT_VINFO_VEC_STMT (stmt_info (S2)) = VS2
135 Operands that are not SSA_NAMEs are data-refs that appear in
136 load/store operations (like 'x[i]' in S1), and are handled differently.
138 Target modeling:
139 =================
140 Currently the only target specific information that is used is the
141 size of the vector (in bytes) - "TARGET_VECTORIZE_UNITS_PER_SIMD_WORD".
142 Targets that can support different vector sizes will, for now, need
143 to specify one value for "TARGET_VECTORIZE_UNITS_PER_SIMD_WORD". More
144 flexibility will be added in the future.
146 Since we only vectorize operations whose vector form can be
147 expressed using existing tree codes, to verify that an operation is
148 supported, the vectorizer checks the relevant optab at the relevant
149 machine_mode (e.g, optab_handler (add_optab, V8HImode)). If
150 the value found is CODE_FOR_nothing, then there's no target support, and
151 we can't vectorize the stmt.
153 For additional information on this project see:
154 http://gcc.gnu.org/projects/tree-ssa/vectorization.html
157 static void vect_estimate_min_profitable_iters (loop_vec_info, int *, int *,
158 unsigned *);
159 static stmt_vec_info vect_is_simple_reduction (loop_vec_info, stmt_vec_info,
160 bool *, bool *);
162 /* Subroutine of vect_determine_vf_for_stmt that handles only one
163 statement. VECTYPE_MAYBE_SET_P is true if STMT_VINFO_VECTYPE
164 may already be set for general statements (not just data refs). */
166 static opt_result
167 vect_determine_vf_for_stmt_1 (vec_info *vinfo, stmt_vec_info stmt_info,
168 bool vectype_maybe_set_p,
169 poly_uint64 *vf)
171 gimple *stmt = stmt_info->stmt;
173 if ((!STMT_VINFO_RELEVANT_P (stmt_info)
174 && !STMT_VINFO_LIVE_P (stmt_info))
175 || gimple_clobber_p (stmt))
177 if (dump_enabled_p ())
178 dump_printf_loc (MSG_NOTE, vect_location, "skip.\n");
179 return opt_result::success ();
182 tree stmt_vectype, nunits_vectype;
183 opt_result res = vect_get_vector_types_for_stmt (vinfo, stmt_info,
184 &stmt_vectype,
185 &nunits_vectype);
186 if (!res)
187 return res;
189 if (stmt_vectype)
191 if (STMT_VINFO_VECTYPE (stmt_info))
192 /* The only case when a vectype had been already set is for stmts
193 that contain a data ref, or for "pattern-stmts" (stmts generated
194 by the vectorizer to represent/replace a certain idiom). */
195 gcc_assert ((STMT_VINFO_DATA_REF (stmt_info)
196 || vectype_maybe_set_p)
197 && STMT_VINFO_VECTYPE (stmt_info) == stmt_vectype);
198 else
199 STMT_VINFO_VECTYPE (stmt_info) = stmt_vectype;
202 if (nunits_vectype)
203 vect_update_max_nunits (vf, nunits_vectype);
205 return opt_result::success ();
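/* A minimal illustration of the update above (not taken from the sources):
   if the running VF is 4 and a statement's nunits vectype turns out to be
   V8HI (8 units), vect_update_max_nunits raises the VF to a common
   multiple of 4 and 8, i.e. 8, so that one vector iteration covers whole
   vectors for every statement in the loop.  */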
208 /* Subroutine of vect_determine_vectorization_factor. Set the vector
209 types of STMT_INFO and all attached pattern statements and update
210 the vectorization factor VF accordingly. Return true on success
211 or false if something prevented vectorization. */
213 static opt_result
214 vect_determine_vf_for_stmt (vec_info *vinfo,
215 stmt_vec_info stmt_info, poly_uint64 *vf)
217 if (dump_enabled_p ())
218 dump_printf_loc (MSG_NOTE, vect_location, "==> examining statement: %G",
219 stmt_info->stmt);
220 opt_result res = vect_determine_vf_for_stmt_1 (vinfo, stmt_info, false, vf);
221 if (!res)
222 return res;
224 if (STMT_VINFO_IN_PATTERN_P (stmt_info)
225 && STMT_VINFO_RELATED_STMT (stmt_info))
227 gimple *pattern_def_seq = STMT_VINFO_PATTERN_DEF_SEQ (stmt_info);
228 stmt_info = STMT_VINFO_RELATED_STMT (stmt_info);
230 /* If a pattern statement has def stmts, analyze them too. */
231 for (gimple_stmt_iterator si = gsi_start (pattern_def_seq);
232 !gsi_end_p (si); gsi_next (&si))
234 stmt_vec_info def_stmt_info = vinfo->lookup_stmt (gsi_stmt (si));
235 if (dump_enabled_p ())
236 dump_printf_loc (MSG_NOTE, vect_location,
237 "==> examining pattern def stmt: %G",
238 def_stmt_info->stmt);
239 res = vect_determine_vf_for_stmt_1 (vinfo, def_stmt_info, true, vf);
240 if (!res)
241 return res;
244 if (dump_enabled_p ())
245 dump_printf_loc (MSG_NOTE, vect_location,
246 "==> examining pattern statement: %G",
247 stmt_info->stmt);
248 res = vect_determine_vf_for_stmt_1 (vinfo, stmt_info, true, vf);
249 if (!res)
250 return res;
253 return opt_result::success ();
256 /* Function vect_determine_vectorization_factor
258 Determine the vectorization factor (VF). VF is the number of data elements
259 that are operated upon in parallel in a single iteration of the vectorized
260 loop. For example, when vectorizing a loop that operates on 4byte elements,
261 on a target with vector size (VS) 16byte, the VF is set to 4, since 4
262 elements can fit in a single vector register.
264 We currently support vectorization of loops in which all types operated upon
265 are of the same size. Therefore this function currently sets VF according to
266 the size of the types operated upon, and fails if there are multiple sizes
267 in the loop.
269 VF is also the factor by which the loop iterations are strip-mined, e.g.:
270 original loop:
271 for (i=0; i<N; i++){
272 a[i] = b[i] + c[i];
275 vectorized loop:
276 for (i=0; i<N; i+=VF){
277 a[i:VF] = b[i:VF] + c[i:VF];
281 static opt_result
282 vect_determine_vectorization_factor (loop_vec_info loop_vinfo)
284 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
285 basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
286 unsigned nbbs = loop->num_nodes;
287 poly_uint64 vectorization_factor = 1;
288 tree scalar_type = NULL_TREE;
289 gphi *phi;
290 tree vectype;
291 stmt_vec_info stmt_info;
292 unsigned i;
294 DUMP_VECT_SCOPE ("vect_determine_vectorization_factor");
296 for (i = 0; i < nbbs; i++)
298 basic_block bb = bbs[i];
300 for (gphi_iterator si = gsi_start_phis (bb); !gsi_end_p (si);
301 gsi_next (&si))
303 phi = si.phi ();
304 stmt_info = loop_vinfo->lookup_stmt (phi);
305 if (dump_enabled_p ())
306 dump_printf_loc (MSG_NOTE, vect_location, "==> examining phi: %G",
307 phi);
309 gcc_assert (stmt_info);
311 if (STMT_VINFO_RELEVANT_P (stmt_info)
312 || STMT_VINFO_LIVE_P (stmt_info))
314 gcc_assert (!STMT_VINFO_VECTYPE (stmt_info));
315 scalar_type = TREE_TYPE (PHI_RESULT (phi));
317 if (dump_enabled_p ())
318 dump_printf_loc (MSG_NOTE, vect_location,
319 "get vectype for scalar type: %T\n",
320 scalar_type);
322 vectype = get_vectype_for_scalar_type (loop_vinfo, scalar_type);
323 if (!vectype)
324 return opt_result::failure_at (phi,
325 "not vectorized: unsupported "
326 "data-type %T\n",
327 scalar_type);
328 STMT_VINFO_VECTYPE (stmt_info) = vectype;
330 if (dump_enabled_p ())
331 dump_printf_loc (MSG_NOTE, vect_location, "vectype: %T\n",
332 vectype);
334 if (dump_enabled_p ())
336 dump_printf_loc (MSG_NOTE, vect_location, "nunits = ");
337 dump_dec (MSG_NOTE, TYPE_VECTOR_SUBPARTS (vectype));
338 dump_printf (MSG_NOTE, "\n");
341 vect_update_max_nunits (&vectorization_factor, vectype);
345 for (gimple_stmt_iterator si = gsi_start_bb (bb); !gsi_end_p (si);
346 gsi_next (&si))
348 if (is_gimple_debug (gsi_stmt (si)))
349 continue;
350 stmt_info = loop_vinfo->lookup_stmt (gsi_stmt (si));
351 opt_result res
352 = vect_determine_vf_for_stmt (loop_vinfo,
353 stmt_info, &vectorization_factor);
354 if (!res)
355 return res;
359 /* TODO: Analyze cost. Decide if it is worthwhile to vectorize. */
360 if (dump_enabled_p ())
362 dump_printf_loc (MSG_NOTE, vect_location, "vectorization factor = ");
363 dump_dec (MSG_NOTE, vectorization_factor);
364 dump_printf (MSG_NOTE, "\n");
367 if (known_le (vectorization_factor, 1U))
368 return opt_result::failure_at (vect_location,
369 "not vectorized: unsupported data-type\n");
370 LOOP_VINFO_VECT_FACTOR (loop_vinfo) = vectorization_factor;
371 return opt_result::success ();
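/* Worked example for the function above (illustrative only): on a target
   with 16-byte vectors, a loop whose statements all operate on 4-byte ints
   gets vectype V4SI, so the vectorization factor is 16 / 4 = 4 and the
   loop is conceptually strip-mined as

       for (i = 0; i < (N / 4) * 4; i += 4)
         a[i:4] = b[i:4] + c[i:4];

   with the remaining N % 4 iterations handled later by a scalar epilogue
   or by partial vectors.  */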
375 /* Function vect_is_simple_iv_evolution.
377 FORNOW: A simple evolution of an induction variable in the loop is
378 considered a polynomial evolution. */
380 static bool
381 vect_is_simple_iv_evolution (unsigned loop_nb, tree access_fn, tree * init,
382 tree * step)
384 tree init_expr;
385 tree step_expr;
386 tree evolution_part = evolution_part_in_loop_num (access_fn, loop_nb);
387 basic_block bb;
389 /* When there is no evolution in this loop, the evolution function
390 is not "simple". */
391 if (evolution_part == NULL_TREE)
392 return false;
394 /* When the evolution is a polynomial of degree >= 2
395 the evolution function is not "simple". */
396 if (tree_is_chrec (evolution_part))
397 return false;
399 step_expr = evolution_part;
400 init_expr = unshare_expr (initial_condition_in_loop_num (access_fn, loop_nb));
402 if (dump_enabled_p ())
403 dump_printf_loc (MSG_NOTE, vect_location, "step: %T, init: %T\n",
404 step_expr, init_expr);
406 *init = init_expr;
407 *step = step_expr;
409 if (TREE_CODE (step_expr) != INTEGER_CST
410 && (TREE_CODE (step_expr) != SSA_NAME
411 || ((bb = gimple_bb (SSA_NAME_DEF_STMT (step_expr)))
412 && flow_bb_inside_loop_p (get_loop (cfun, loop_nb), bb))
413 || (!INTEGRAL_TYPE_P (TREE_TYPE (step_expr))
414 && (!SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr))
415 || !flag_associative_math)))
416 && (TREE_CODE (step_expr) != REAL_CST
417 || !flag_associative_math))
419 if (dump_enabled_p ())
420 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
421 "step unknown.\n");
422 return false;
425 return true;
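/* For illustration, consider the classic induction variable

       i_1 = PHI <0(preheader), i_2(latch)>;
       i_2 = i_1 + 4;

   Its access function is the chrec {0, +, 4}_loop, whose evolution part is
   the constant 4, so the function above returns true with *INIT = 0 and
   *STEP = 4.  A second-degree evolution such as
   {0, +, {1, +, 1}_loop}_loop is rejected because its evolution part is
   itself a chrec.  */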
428 /* Return true if PHI, described by STMT_INFO, is the inner PHI in
429 what we are assuming is a double reduction. For example, given
430 a structure like this:
432 outer1:
433 x_1 = PHI <x_4(outer2), ...>;
436 inner:
437 x_2 = PHI <x_1(outer1), ...>;
439 x_3 = ...;
442 outer2:
443 x_4 = PHI <x_3(inner)>;
446 outer loop analysis would treat x_1 as a double reduction phi and
447 this function would then return true for x_2. */
449 static bool
450 vect_inner_phi_in_double_reduction_p (loop_vec_info loop_vinfo, gphi *phi)
452 use_operand_p use_p;
453 ssa_op_iter op_iter;
454 FOR_EACH_PHI_ARG (use_p, phi, op_iter, SSA_OP_USE)
455 if (stmt_vec_info def_info = loop_vinfo->lookup_def (USE_FROM_PTR (use_p)))
456 if (STMT_VINFO_DEF_TYPE (def_info) == vect_double_reduction_def)
457 return true;
458 return false;
461 /* Function vect_analyze_scalar_cycles_1.
463 Examine the cross iteration def-use cycles of scalar variables
464 in LOOP. LOOP_VINFO represents the loop that is now being
465 considered for vectorization (can be LOOP, or an outer-loop
466 enclosing LOOP). */
468 static void
469 vect_analyze_scalar_cycles_1 (loop_vec_info loop_vinfo, class loop *loop)
471 basic_block bb = loop->header;
472 tree init, step;
473 auto_vec<stmt_vec_info, 64> worklist;
474 gphi_iterator gsi;
475 bool double_reduc, reduc_chain;
477 DUMP_VECT_SCOPE ("vect_analyze_scalar_cycles");
479 /* First - identify all inductions. Reduction detection assumes that all the
480 inductions have been identified, therefore, this order must not be
481 changed. */
482 for (gsi = gsi_start_phis (bb); !gsi_end_p (gsi); gsi_next (&gsi))
484 gphi *phi = gsi.phi ();
485 tree access_fn = NULL;
486 tree def = PHI_RESULT (phi);
487 stmt_vec_info stmt_vinfo = loop_vinfo->lookup_stmt (phi);
489 if (dump_enabled_p ())
490 dump_printf_loc (MSG_NOTE, vect_location, "Analyze phi: %G", phi);
492 /* Skip virtual phi's. The data dependences that are associated with
493 virtual defs/uses (i.e., memory accesses) are analyzed elsewhere. */
494 if (virtual_operand_p (def))
495 continue;
497 STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_unknown_def_type;
499 /* Analyze the evolution function. */
500 access_fn = analyze_scalar_evolution (loop, def);
501 if (access_fn)
503 STRIP_NOPS (access_fn);
504 if (dump_enabled_p ())
505 dump_printf_loc (MSG_NOTE, vect_location,
506 "Access function of PHI: %T\n", access_fn);
507 STMT_VINFO_LOOP_PHI_EVOLUTION_BASE_UNCHANGED (stmt_vinfo)
508 = initial_condition_in_loop_num (access_fn, loop->num);
509 STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_vinfo)
510 = evolution_part_in_loop_num (access_fn, loop->num);
513 if (!access_fn
514 || vect_inner_phi_in_double_reduction_p (loop_vinfo, phi)
515 || !vect_is_simple_iv_evolution (loop->num, access_fn, &init, &step)
516 || (LOOP_VINFO_LOOP (loop_vinfo) != loop
517 && TREE_CODE (step) != INTEGER_CST))
519 worklist.safe_push (stmt_vinfo);
520 continue;
523 gcc_assert (STMT_VINFO_LOOP_PHI_EVOLUTION_BASE_UNCHANGED (stmt_vinfo)
524 != NULL_TREE);
525 gcc_assert (STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_vinfo) != NULL_TREE);
527 if (dump_enabled_p ())
528 dump_printf_loc (MSG_NOTE, vect_location, "Detected induction.\n");
529 STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_induction_def;
533 /* Second - identify all reductions and nested cycles. */
534 while (worklist.length () > 0)
536 stmt_vec_info stmt_vinfo = worklist.pop ();
537 gphi *phi = as_a <gphi *> (stmt_vinfo->stmt);
538 tree def = PHI_RESULT (phi);
540 if (dump_enabled_p ())
541 dump_printf_loc (MSG_NOTE, vect_location, "Analyze phi: %G", phi);
543 gcc_assert (!virtual_operand_p (def)
544 && STMT_VINFO_DEF_TYPE (stmt_vinfo) == vect_unknown_def_type);
546 stmt_vec_info reduc_stmt_info
547 = vect_is_simple_reduction (loop_vinfo, stmt_vinfo, &double_reduc,
548 &reduc_chain);
549 if (reduc_stmt_info)
551 STMT_VINFO_REDUC_DEF (stmt_vinfo) = reduc_stmt_info;
552 STMT_VINFO_REDUC_DEF (reduc_stmt_info) = stmt_vinfo;
553 if (double_reduc)
555 if (dump_enabled_p ())
556 dump_printf_loc (MSG_NOTE, vect_location,
557 "Detected double reduction.\n");
559 STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_double_reduction_def;
560 STMT_VINFO_DEF_TYPE (reduc_stmt_info) = vect_double_reduction_def;
562 else
564 if (loop != LOOP_VINFO_LOOP (loop_vinfo))
566 if (dump_enabled_p ())
567 dump_printf_loc (MSG_NOTE, vect_location,
568 "Detected vectorizable nested cycle.\n");
570 STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_nested_cycle;
572 else
574 if (dump_enabled_p ())
575 dump_printf_loc (MSG_NOTE, vect_location,
576 "Detected reduction.\n");
578 STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_reduction_def;
579 STMT_VINFO_DEF_TYPE (reduc_stmt_info) = vect_reduction_def;
580 /* Store the reduction cycles for possible vectorization in
581 loop-aware SLP if it was not detected as reduction
582 chain. */
583 if (! reduc_chain)
584 LOOP_VINFO_REDUCTIONS (loop_vinfo).safe_push
585 (reduc_stmt_info);
589 else
590 if (dump_enabled_p ())
591 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
592 "Unknown def-use cycle pattern.\n");
597 /* Function vect_analyze_scalar_cycles.
599 Examine the cross iteration def-use cycles of scalar variables, by
600 analyzing the loop-header PHIs of scalar variables. Classify each
601 cycle as one of the following: invariant, induction, reduction, unknown.
602 We do that for the loop represented by LOOP_VINFO, and also for its
603 inner-loop, if it exists.
604 Examples for scalar cycles:
606 Example1: reduction:
608 loop1:
609 for (i=0; i<N; i++)
610 sum += a[i];
612 Example2: induction:
614 loop2:
615 for (i=0; i<N; i++)
616 a[i] = i; */
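/* Tying the examples above to the classification done here (for
   orientation only): the loop-header PHI of 'sum' in Example1 is detected
   by the second phase as vect_reduction_def, the PHI of 'i' in Example2 is
   detected by the first phase as vect_induction_def, and PHIs matching
   neither keep vect_unknown_def_type and are dumped as an unknown
   def-use cycle.  */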
618 static void
619 vect_analyze_scalar_cycles (loop_vec_info loop_vinfo)
621 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
623 vect_analyze_scalar_cycles_1 (loop_vinfo, loop);
625 /* When vectorizing an outer-loop, the inner-loop is executed sequentially.
626 Reductions in such inner-loop therefore have different properties than
627 the reductions in the nest that gets vectorized:
628 1. When vectorized, they are executed in the same order as in the original
629 scalar loop, so we can't change the order of computation when
630 vectorizing them.
631 2. FIXME: Inner-loop reductions can be used in the inner-loop, so the
632 current checks are too strict. */
634 if (loop->inner)
635 vect_analyze_scalar_cycles_1 (loop_vinfo, loop->inner);
638 /* Transfer group and reduction information from STMT_INFO to its
639 pattern stmt. */
641 static void
642 vect_fixup_reduc_chain (stmt_vec_info stmt_info)
644 stmt_vec_info firstp = STMT_VINFO_RELATED_STMT (stmt_info);
645 stmt_vec_info stmtp;
646 gcc_assert (!REDUC_GROUP_FIRST_ELEMENT (firstp)
647 && REDUC_GROUP_FIRST_ELEMENT (stmt_info));
648 REDUC_GROUP_SIZE (firstp) = REDUC_GROUP_SIZE (stmt_info);
651 stmtp = STMT_VINFO_RELATED_STMT (stmt_info);
652 gcc_checking_assert (STMT_VINFO_DEF_TYPE (stmtp)
653 == STMT_VINFO_DEF_TYPE (stmt_info));
654 REDUC_GROUP_FIRST_ELEMENT (stmtp) = firstp;
655 stmt_info = REDUC_GROUP_NEXT_ELEMENT (stmt_info);
656 if (stmt_info)
657 REDUC_GROUP_NEXT_ELEMENT (stmtp)
658 = STMT_VINFO_RELATED_STMT (stmt_info);
660 while (stmt_info);
663 /* Fixup scalar cycles that now have their stmts detected as patterns. */
665 static void
666 vect_fixup_scalar_cycles_with_patterns (loop_vec_info loop_vinfo)
668 stmt_vec_info first;
669 unsigned i;
671 FOR_EACH_VEC_ELT (LOOP_VINFO_REDUCTION_CHAINS (loop_vinfo), i, first)
673 stmt_vec_info next = REDUC_GROUP_NEXT_ELEMENT (first);
674 while (next)
676 if ((STMT_VINFO_IN_PATTERN_P (next)
677 != STMT_VINFO_IN_PATTERN_P (first))
678 || STMT_VINFO_REDUC_IDX (vect_stmt_to_vectorize (next)) == -1)
679 break;
680 next = REDUC_GROUP_NEXT_ELEMENT (next);
682 /* If all reduction chain members are well-formed patterns adjust
683 the group to group the pattern stmts instead. */
684 if (! next
685 && STMT_VINFO_REDUC_IDX (vect_stmt_to_vectorize (first)) != -1)
687 if (STMT_VINFO_IN_PATTERN_P (first))
689 vect_fixup_reduc_chain (first);
690 LOOP_VINFO_REDUCTION_CHAINS (loop_vinfo)[i]
691 = STMT_VINFO_RELATED_STMT (first);
694 /* If not all stmt in the chain are patterns or if we failed
695 to update STMT_VINFO_REDUC_IDX dissolve the chain and handle
696 it as regular reduction instead. */
697 else
699 stmt_vec_info vinfo = first;
700 stmt_vec_info last = NULL;
701 while (vinfo)
703 next = REDUC_GROUP_NEXT_ELEMENT (vinfo);
704 REDUC_GROUP_FIRST_ELEMENT (vinfo) = NULL;
705 REDUC_GROUP_NEXT_ELEMENT (vinfo) = NULL;
706 last = vinfo;
707 vinfo = next;
709 STMT_VINFO_DEF_TYPE (vect_stmt_to_vectorize (first))
710 = vect_internal_def;
711 loop_vinfo->reductions.safe_push (vect_stmt_to_vectorize (last));
712 LOOP_VINFO_REDUCTION_CHAINS (loop_vinfo).unordered_remove (i);
713 --i;
718 /* Function vect_get_loop_niters.
720 Determine how many iterations the loop is executed and place it
721 in NUMBER_OF_ITERATIONS. Place the number of latch iterations
722 in NUMBER_OF_ITERATIONSM1. Place the condition under which the
723 niter information holds in ASSUMPTIONS.
725 Return the loop exit condition. */
728 static gcond *
729 vect_get_loop_niters (class loop *loop, tree *assumptions,
730 tree *number_of_iterations, tree *number_of_iterationsm1)
732 edge exit = single_exit (loop);
733 class tree_niter_desc niter_desc;
734 tree niter_assumptions, niter, may_be_zero;
735 gcond *cond = get_loop_exit_condition (loop);
737 *assumptions = boolean_true_node;
738 *number_of_iterationsm1 = chrec_dont_know;
739 *number_of_iterations = chrec_dont_know;
740 DUMP_VECT_SCOPE ("get_loop_niters");
742 if (!exit)
743 return cond;
745 may_be_zero = NULL_TREE;
746 if (!number_of_iterations_exit_assumptions (loop, exit, &niter_desc, NULL)
747 || chrec_contains_undetermined (niter_desc.niter))
748 return cond;
750 niter_assumptions = niter_desc.assumptions;
751 may_be_zero = niter_desc.may_be_zero;
752 niter = niter_desc.niter;
754 if (may_be_zero && integer_zerop (may_be_zero))
755 may_be_zero = NULL_TREE;
757 if (may_be_zero)
759 if (COMPARISON_CLASS_P (may_be_zero))
761 /* Try to combine may_be_zero with assumptions, this can simplify
762 computation of niter expression. */
763 if (niter_assumptions && !integer_nonzerop (niter_assumptions))
764 niter_assumptions = fold_build2 (TRUTH_AND_EXPR, boolean_type_node,
765 niter_assumptions,
766 fold_build1 (TRUTH_NOT_EXPR,
767 boolean_type_node,
768 may_be_zero));
769 else
770 niter = fold_build3 (COND_EXPR, TREE_TYPE (niter), may_be_zero,
771 build_int_cst (TREE_TYPE (niter), 0),
772 rewrite_to_non_trapping_overflow (niter));
774 may_be_zero = NULL_TREE;
776 else if (integer_nonzerop (may_be_zero))
778 *number_of_iterationsm1 = build_int_cst (TREE_TYPE (niter), 0);
779 *number_of_iterations = build_int_cst (TREE_TYPE (niter), 1);
780 return cond;
782 else
783 return cond;
786 *assumptions = niter_assumptions;
787 *number_of_iterationsm1 = niter;
789 /* We want the number of loop header executions which is the number
790 of latch executions plus one.
791 ??? For UINT_MAX latch executions this number overflows to zero
792 for loops like do { n++; } while (n != 0); */
793 if (niter && !chrec_contains_undetermined (niter))
794 niter = fold_build2 (PLUS_EXPR, TREE_TYPE (niter), unshare_expr (niter),
795 build_int_cst (TREE_TYPE (niter), 1));
796 *number_of_iterations = niter;
798 return cond;
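/* Example of the convention above (illustrative): for a loop
   "for (i = 0; i < n; i++)" with n >= 1, the latch executes n - 1 times,
   so *NUMBER_OF_ITERATIONSM1 is n - 1 while *NUMBER_OF_ITERATIONS is n,
   the number of header executions.  As the ??? note says, if the latch
   ran UINT_MAX times the "+ 1" above would wrap the result to zero.  */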
801 /* Function bb_in_loop_p
803 Used as predicate for dfs order traversal of the loop bbs. */
805 static bool
806 bb_in_loop_p (const_basic_block bb, const void *data)
808 const class loop *const loop = (const class loop *)data;
809 if (flow_bb_inside_loop_p (loop, bb))
810 return true;
811 return false;
815 /* Create and initialize a new loop_vec_info struct for LOOP_IN, as well as
816 stmt_vec_info structs for all the stmts in LOOP_IN. */
818 _loop_vec_info::_loop_vec_info (class loop *loop_in, vec_info_shared *shared)
819 : vec_info (vec_info::loop, shared),
820 loop (loop_in),
821 bbs (XCNEWVEC (basic_block, loop->num_nodes)),
822 num_itersm1 (NULL_TREE),
823 num_iters (NULL_TREE),
824 num_iters_unchanged (NULL_TREE),
825 num_iters_assumptions (NULL_TREE),
826 vector_costs (nullptr),
827 scalar_costs (nullptr),
828 th (0),
829 versioning_threshold (0),
830 vectorization_factor (0),
831 main_loop_edge (nullptr),
832 skip_main_loop_edge (nullptr),
833 skip_this_loop_edge (nullptr),
834 reusable_accumulators (),
835 suggested_unroll_factor (1),
836 max_vectorization_factor (0),
837 mask_skip_niters (NULL_TREE),
838 rgroup_compare_type (NULL_TREE),
839 simd_if_cond (NULL_TREE),
840 unaligned_dr (NULL),
841 peeling_for_alignment (0),
842 ptr_mask (0),
843 ivexpr_map (NULL),
844 scan_map (NULL),
845 slp_unrolling_factor (1),
846 inner_loop_cost_factor (param_vect_inner_loop_cost_factor),
847 vectorizable (false),
848 can_use_partial_vectors_p (param_vect_partial_vector_usage != 0),
849 using_partial_vectors_p (false),
850 epil_using_partial_vectors_p (false),
851 partial_load_store_bias (0),
852 peeling_for_gaps (false),
853 peeling_for_niter (false),
854 no_data_dependencies (false),
855 has_mask_store (false),
856 scalar_loop_scaling (profile_probability::uninitialized ()),
857 scalar_loop (NULL),
858 orig_loop_info (NULL)
860 /* CHECKME: We want to visit all BBs before their successors (except for
861 latch blocks, for which this assertion wouldn't hold). In the simple
862 case of the loop forms we allow, a dfs order of the BBs would be the same
863 as reversed postorder traversal, so we are safe. */
865 unsigned int nbbs = dfs_enumerate_from (loop->header, 0, bb_in_loop_p,
866 bbs, loop->num_nodes, loop);
867 gcc_assert (nbbs == loop->num_nodes);
869 for (unsigned int i = 0; i < nbbs; i++)
871 basic_block bb = bbs[i];
872 gimple_stmt_iterator si;
874 for (si = gsi_start_phis (bb); !gsi_end_p (si); gsi_next (&si))
876 gimple *phi = gsi_stmt (si);
877 gimple_set_uid (phi, 0);
878 add_stmt (phi);
881 for (si = gsi_start_bb (bb); !gsi_end_p (si); gsi_next (&si))
883 gimple *stmt = gsi_stmt (si);
884 gimple_set_uid (stmt, 0);
885 if (is_gimple_debug (stmt))
886 continue;
887 add_stmt (stmt);
888 /* If the .GOMP_SIMD_LANE call for the current loop has 3 arguments, the
889 third argument is the #pragma omp simd if (x) condition: when it is 0,
890 the loop shouldn't be vectorized; when it is a non-zero constant, it
891 should be vectorized normally; otherwise the loop is versioned, with the
892 vectorized copy taken only if the condition is non-zero at runtime.  */
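/* For example (illustrative), given

       #pragma omp simd if (x)
       for (int i = 0; i < n; i++)
         a[i] += b[i];

   the third argument of the .GOMP_SIMD_LANE call carries "x"; below, a
   constant zero or an SSA_NAME is remembered in simd_if_cond, while a
   non-zero constant needs no versioning and is not recorded.  */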
893 if (loop_in->simduid
894 && is_gimple_call (stmt)
895 && gimple_call_internal_p (stmt)
896 && gimple_call_internal_fn (stmt) == IFN_GOMP_SIMD_LANE
897 && gimple_call_num_args (stmt) >= 3
898 && TREE_CODE (gimple_call_arg (stmt, 0)) == SSA_NAME
899 && (loop_in->simduid
900 == SSA_NAME_VAR (gimple_call_arg (stmt, 0))))
902 tree arg = gimple_call_arg (stmt, 2);
903 if (integer_zerop (arg) || TREE_CODE (arg) == SSA_NAME)
904 simd_if_cond = arg;
905 else
906 gcc_assert (integer_nonzerop (arg));
911 epilogue_vinfos.create (6);
914 /* Free all levels of rgroup CONTROLS. */
916 void
917 release_vec_loop_controls (vec<rgroup_controls> *controls)
919 rgroup_controls *rgc;
920 unsigned int i;
921 FOR_EACH_VEC_ELT (*controls, i, rgc)
922 rgc->controls.release ();
923 controls->release ();
926 /* Free all memory used by the _loop_vec_info, as well as all the
927 stmt_vec_info structs of all the stmts in the loop. */
929 _loop_vec_info::~_loop_vec_info ()
931 free (bbs);
933 release_vec_loop_controls (&masks);
934 release_vec_loop_controls (&lens);
935 delete ivexpr_map;
936 delete scan_map;
937 epilogue_vinfos.release ();
938 delete scalar_costs;
939 delete vector_costs;
941 /* When we release an epilogue vinfo that we do not intend to use
942 avoid clearing AUX of the main loop which should continue to
943 point to the main loop vinfo since otherwise we'll leak that. */
944 if (loop->aux == this)
945 loop->aux = NULL;
948 /* Return an invariant or register for EXPR and emit necessary
949 computations in the LOOP_VINFO loop preheader. */
951 tree
952 cse_and_gimplify_to_preheader (loop_vec_info loop_vinfo, tree expr)
954 if (is_gimple_reg (expr)
955 || is_gimple_min_invariant (expr))
956 return expr;
958 if (! loop_vinfo->ivexpr_map)
959 loop_vinfo->ivexpr_map = new hash_map<tree_operand_hash, tree>;
960 tree &cached = loop_vinfo->ivexpr_map->get_or_insert (expr);
961 if (! cached)
963 gimple_seq stmts = NULL;
964 cached = force_gimple_operand (unshare_expr (expr),
965 &stmts, true, NULL_TREE);
966 if (stmts)
968 edge e = loop_preheader_edge (LOOP_VINFO_LOOP (loop_vinfo));
969 gsi_insert_seq_on_edge_immediate (e, stmts);
972 return cached;
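/* Illustrative use of the function above: asked twice for the invariant
   expression "n_5 * 4" (a made-up SSA name), it gimplifies the
   multiplication once, inserts the resulting statement on the preheader
   edge, caches the new SSA name in ivexpr_map, and returns that same name
   on the second call.  */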
975 /* Return true if we can use CMP_TYPE as the comparison type to produce
976 all masks required to mask LOOP_VINFO. */
978 static bool
979 can_produce_all_loop_masks_p (loop_vec_info loop_vinfo, tree cmp_type)
981 rgroup_controls *rgm;
982 unsigned int i;
983 FOR_EACH_VEC_ELT (LOOP_VINFO_MASKS (loop_vinfo), i, rgm)
984 if (rgm->type != NULL_TREE
985 && !direct_internal_fn_supported_p (IFN_WHILE_ULT,
986 cmp_type, rgm->type,
987 OPTIMIZE_FOR_SPEED))
988 return false;
989 return true;
992 /* Calculate the maximum number of scalars per iteration for every
993 rgroup in LOOP_VINFO. */
995 static unsigned int
996 vect_get_max_nscalars_per_iter (loop_vec_info loop_vinfo)
998 unsigned int res = 1;
999 unsigned int i;
1000 rgroup_controls *rgm;
1001 FOR_EACH_VEC_ELT (LOOP_VINFO_MASKS (loop_vinfo), i, rgm)
1002 res = MAX (res, rgm->max_nscalars_per_iter);
1003 return res;
1006 /* Calculate the minimum precision necessary to represent:
1008 MAX_NITERS * FACTOR
1010 as an unsigned integer, where MAX_NITERS is the maximum number of
1011 loop header iterations for the original scalar form of LOOP_VINFO. */
1013 static unsigned
1014 vect_min_prec_for_max_niters (loop_vec_info loop_vinfo, unsigned int factor)
1016 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1018 /* Get the maximum number of iterations that is representable
1019 in the counter type. */
1020 tree ni_type = TREE_TYPE (LOOP_VINFO_NITERSM1 (loop_vinfo));
1021 widest_int max_ni = wi::to_widest (TYPE_MAX_VALUE (ni_type)) + 1;
1023 /* Get a more refined estimate for the number of iterations. */
1024 widest_int max_back_edges;
1025 if (max_loop_iterations (loop, &max_back_edges))
1026 max_ni = wi::smin (max_ni, max_back_edges + 1);
1028 /* Work out how many bits we need to represent the limit. */
1029 return wi::min_precision (max_ni * factor, UNSIGNED);
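/* Worked example (illustrative): if the scalar loop is known to execute at
   most 1000 header iterations and FACTOR is 4, the product is 4000 and
   wi::min_precision (4000, UNSIGNED) is 12, since 4000 fits in 12 bits
   (max 4095) but not in 11 (max 2047).  */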
1032 /* True if the loop needs peeling or partial vectors when vectorized. */
1034 static bool
1035 vect_need_peeling_or_partial_vectors_p (loop_vec_info loop_vinfo)
1037 unsigned HOST_WIDE_INT const_vf;
1038 HOST_WIDE_INT max_niter
1039 = likely_max_stmt_executions_int (LOOP_VINFO_LOOP (loop_vinfo));
1041 unsigned th = LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo);
1042 if (!th && LOOP_VINFO_ORIG_LOOP_INFO (loop_vinfo))
1043 th = LOOP_VINFO_COST_MODEL_THRESHOLD (LOOP_VINFO_ORIG_LOOP_INFO
1044 (loop_vinfo));
1046 if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
1047 && LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) >= 0)
1049 /* Work out the (constant) number of iterations that need to be
1050 peeled for reasons other than niters. */
1051 unsigned int peel_niter = LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo);
1052 if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo))
1053 peel_niter += 1;
1054 if (!multiple_p (LOOP_VINFO_INT_NITERS (loop_vinfo) - peel_niter,
1055 LOOP_VINFO_VECT_FACTOR (loop_vinfo)))
1056 return true;
1058 else if (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo)
1059 /* ??? When peeling for gaps but not alignment, we could
1060 try to check whether the (variable) niters is known to be
1061 VF * N + 1. That's something of a niche case though. */
1062 || LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo)
1063 || !LOOP_VINFO_VECT_FACTOR (loop_vinfo).is_constant (&const_vf)
1064 || ((tree_ctz (LOOP_VINFO_NITERS (loop_vinfo))
1065 < (unsigned) exact_log2 (const_vf))
1066 /* In case of versioning, check if the maximum number of
1067 iterations is greater than th. If they are identical,
1068 the epilogue is unnecessary. */
1069 && (!LOOP_REQUIRES_VERSIONING (loop_vinfo)
1070 || ((unsigned HOST_WIDE_INT) max_niter
1071 > (th / const_vf) * const_vf))))
1072 return true;
1074 return false;
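/* For instance (illustrative): with a known iteration count of 128, a VF
   of 4 and no peeling for alignment or gaps, 128 is a multiple of 4 and
   the function returns false; with 130 iterations the two leftover
   iterations mean peeling or partial vectors are required, so it returns
   true.  */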
1077 /* Each statement in LOOP_VINFO can be masked where necessary. Check
1078 whether we can actually generate the masks required. Return true if so,
1079 storing the type of the scalar IV in LOOP_VINFO_RGROUP_COMPARE_TYPE. */
1081 static bool
1082 vect_verify_full_masking (loop_vec_info loop_vinfo)
1084 unsigned int min_ni_width;
1085 unsigned int max_nscalars_per_iter
1086 = vect_get_max_nscalars_per_iter (loop_vinfo);
1088 /* Use a normal loop if there are no statements that need masking.
1089 This only happens in rare degenerate cases: it means that the loop
1090 has no loads, no stores, and no live-out values. */
1091 if (LOOP_VINFO_MASKS (loop_vinfo).is_empty ())
1092 return false;
1094 /* Work out how many bits we need to represent the limit. */
1095 min_ni_width
1096 = vect_min_prec_for_max_niters (loop_vinfo, max_nscalars_per_iter);
1098 /* Find a scalar mode for which WHILE_ULT is supported. */
1099 opt_scalar_int_mode cmp_mode_iter;
1100 tree cmp_type = NULL_TREE;
1101 tree iv_type = NULL_TREE;
1102 widest_int iv_limit = vect_iv_limit_for_partial_vectors (loop_vinfo);
1103 unsigned int iv_precision = UINT_MAX;
1105 if (iv_limit != -1)
1106 iv_precision = wi::min_precision (iv_limit * max_nscalars_per_iter,
1107 UNSIGNED);
1109 FOR_EACH_MODE_IN_CLASS (cmp_mode_iter, MODE_INT)
1111 unsigned int cmp_bits = GET_MODE_BITSIZE (cmp_mode_iter.require ());
1112 if (cmp_bits >= min_ni_width
1113 && targetm.scalar_mode_supported_p (cmp_mode_iter.require ()))
1115 tree this_type = build_nonstandard_integer_type (cmp_bits, true);
1116 if (this_type
1117 && can_produce_all_loop_masks_p (loop_vinfo, this_type))
1119 /* Although we could stop as soon as we find a valid mode,
1120 there are at least two reasons why that's not always the
1121 best choice:
1123 - An IV that's Pmode or wider is more likely to be reusable
1124 in address calculations than an IV that's narrower than
1125 Pmode.
1127 - Doing the comparison in IV_PRECISION or wider allows
1128 a natural 0-based IV, whereas using a narrower comparison
1129 type requires mitigations against wrap-around.
1131 Conversely, if the IV limit is variable, doing the comparison
1132 in a wider type than the original type can introduce
1133 unnecessary extensions, so picking the widest valid mode
1134 is not always a good choice either.
1136 Here we prefer the first IV type that's Pmode or wider,
1137 and the first comparison type that's IV_PRECISION or wider.
1138 (The comparison type must be no wider than the IV type,
1139 to avoid extensions in the vector loop.)
1141 ??? We might want to try continuing beyond Pmode for ILP32
1142 targets if CMP_BITS < IV_PRECISION. */
1143 iv_type = this_type;
1144 if (!cmp_type || iv_precision > TYPE_PRECISION (cmp_type))
1145 cmp_type = this_type;
1146 if (cmp_bits >= GET_MODE_BITSIZE (Pmode))
1147 break;
1152 if (!cmp_type)
1153 return false;
1155 LOOP_VINFO_RGROUP_COMPARE_TYPE (loop_vinfo) = cmp_type;
1156 LOOP_VINFO_RGROUP_IV_TYPE (loop_vinfo) = iv_type;
1157 return true;
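/* Rough illustration of what is being verified above: with a VF of 4,
   IFN_WHILE_ULT (i, niters) yields a mask whose lane j is active iff
   i + j < niters, so for niters = 10 the per-iteration masks are
   {1,1,1,1}, {1,1,1,1} and finally {1,1,0,0} at i = 8.  */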
1160 /* Check whether we can use vector access with length based on precision
1161 comparison. So far, to keep it simple, we only allow the case that the
1162 precision of the target-supported length is larger than the precision
1163 required by the loop niters.
1165 static bool
1166 vect_verify_loop_lens (loop_vec_info loop_vinfo)
1168 if (LOOP_VINFO_LENS (loop_vinfo).is_empty ())
1169 return false;
1171 machine_mode len_load_mode = get_len_load_store_mode
1172 (loop_vinfo->vector_mode, true).require ();
1173 machine_mode len_store_mode = get_len_load_store_mode
1174 (loop_vinfo->vector_mode, false).require ();
1176 signed char partial_load_bias = internal_len_load_store_bias
1177 (IFN_LEN_LOAD, len_load_mode);
1179 signed char partial_store_bias = internal_len_load_store_bias
1180 (IFN_LEN_STORE, len_store_mode);
1182 gcc_assert (partial_load_bias == partial_store_bias);
1184 if (partial_load_bias == VECT_PARTIAL_BIAS_UNSUPPORTED)
1185 return false;
1187 /* If the backend requires a bias of -1 for LEN_LOAD, we must not emit
1188 len_loads with a length of zero. In order to avoid that we prohibit
1189 more than one loop length here. */
1190 if (partial_load_bias == -1
1191 && LOOP_VINFO_LENS (loop_vinfo).length () > 1)
1192 return false;
1194 LOOP_VINFO_PARTIAL_LOAD_STORE_BIAS (loop_vinfo) = partial_load_bias;
1196 unsigned int max_nitems_per_iter = 1;
1197 unsigned int i;
1198 rgroup_controls *rgl;
1199 /* Find the maximum number of items per iteration for every rgroup. */
1200 FOR_EACH_VEC_ELT (LOOP_VINFO_LENS (loop_vinfo), i, rgl)
1202 unsigned nitems_per_iter = rgl->max_nscalars_per_iter * rgl->factor;
1203 max_nitems_per_iter = MAX (max_nitems_per_iter, nitems_per_iter);
1206 /* Work out how many bits we need to represent the length limit. */
1207 unsigned int min_ni_prec
1208 = vect_min_prec_for_max_niters (loop_vinfo, max_nitems_per_iter);
1210 /* Now use the maximum of the precisions below for one suitable IV type:
1211 - the IV's natural precision
1212 - the precision needed to hold: the maximum number of scalar
1213 iterations multiplied by the scale factor (min_ni_prec above)
1214 - the Pmode precision
1216 If min_ni_prec is less than the precision of the current niters,
1217 we prefer to still use the niters type. Prefer to use Pmode and a
1218 wider IV to avoid narrow conversions.
1220 unsigned int ni_prec
1221 = TYPE_PRECISION (TREE_TYPE (LOOP_VINFO_NITERS (loop_vinfo)));
1222 min_ni_prec = MAX (min_ni_prec, ni_prec);
1223 min_ni_prec = MAX (min_ni_prec, GET_MODE_BITSIZE (Pmode));
1225 tree iv_type = NULL_TREE;
1226 opt_scalar_int_mode tmode_iter;
1227 FOR_EACH_MODE_IN_CLASS (tmode_iter, MODE_INT)
1229 scalar_mode tmode = tmode_iter.require ();
1230 unsigned int tbits = GET_MODE_BITSIZE (tmode);
1232 /* ??? Do we really want to construct one IV whose precision exceeds
1233 BITS_PER_WORD? */
1234 if (tbits > BITS_PER_WORD)
1235 break;
1237 /* Find the first available standard integral type. */
1238 if (tbits >= min_ni_prec && targetm.scalar_mode_supported_p (tmode))
1240 iv_type = build_nonstandard_integer_type (tbits, true);
1241 break;
1245 if (!iv_type)
1247 if (dump_enabled_p ())
1248 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1249 "can't vectorize with length-based partial vectors"
1250 " because there is no suitable iv type.\n");
1251 return false;
1254 LOOP_VINFO_RGROUP_COMPARE_TYPE (loop_vinfo) = iv_type;
1255 LOOP_VINFO_RGROUP_IV_TYPE (loop_vinfo) = iv_type;
1257 return true;
1260 /* Calculate the cost of one scalar iteration of the loop. */
1261 static void
1262 vect_compute_single_scalar_iteration_cost (loop_vec_info loop_vinfo)
1264 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1265 basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
1266 int nbbs = loop->num_nodes, factor;
1267 int innerloop_iters, i;
1269 DUMP_VECT_SCOPE ("vect_compute_single_scalar_iteration_cost");
1271 /* Gather costs for statements in the scalar loop. */
1273 /* FORNOW. */
1274 innerloop_iters = 1;
1275 if (loop->inner)
1276 innerloop_iters = LOOP_VINFO_INNER_LOOP_COST_FACTOR (loop_vinfo);
1278 for (i = 0; i < nbbs; i++)
1280 gimple_stmt_iterator si;
1281 basic_block bb = bbs[i];
1283 if (bb->loop_father == loop->inner)
1284 factor = innerloop_iters;
1285 else
1286 factor = 1;
1288 for (si = gsi_start_bb (bb); !gsi_end_p (si); gsi_next (&si))
1290 gimple *stmt = gsi_stmt (si);
1291 stmt_vec_info stmt_info = loop_vinfo->lookup_stmt (stmt);
1293 if (!is_gimple_assign (stmt) && !is_gimple_call (stmt))
1294 continue;
1296 /* Skip stmts that are not vectorized inside the loop. */
1297 stmt_vec_info vstmt_info = vect_stmt_to_vectorize (stmt_info);
1298 if (!STMT_VINFO_RELEVANT_P (vstmt_info)
1299 && (!STMT_VINFO_LIVE_P (vstmt_info)
1300 || !VECTORIZABLE_CYCLE_DEF
1301 (STMT_VINFO_DEF_TYPE (vstmt_info))))
1302 continue;
1304 vect_cost_for_stmt kind;
1305 if (STMT_VINFO_DATA_REF (stmt_info))
1307 if (DR_IS_READ (STMT_VINFO_DATA_REF (stmt_info)))
1308 kind = scalar_load;
1309 else
1310 kind = scalar_store;
1312 else if (vect_nop_conversion_p (stmt_info))
1313 continue;
1314 else
1315 kind = scalar_stmt;
1317 /* We are using vect_prologue here to avoid scaling twice
1318 by the inner loop factor. */
1319 record_stmt_cost (&LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo),
1320 factor, kind, stmt_info, 0, vect_prologue);
1324 /* Now accumulate cost. */
1325 loop_vinfo->scalar_costs = init_cost (loop_vinfo, true);
1326 add_stmt_costs (loop_vinfo->scalar_costs,
1327 &LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo));
1328 loop_vinfo->scalar_costs->finish_cost (nullptr);
1332 /* Function vect_analyze_loop_form.
1334 Verify that certain CFG restrictions hold, including:
1335 - the loop has a pre-header
1336 - the loop has a single entry and exit
1337 - the loop exit condition is simple enough
1338 - the number of iterations can be analyzed, i.e, a countable loop. The
1339 niter could be analyzed under some assumptions. */
1341 opt_result
1342 vect_analyze_loop_form (class loop *loop, vect_loop_form_info *info)
1344 DUMP_VECT_SCOPE ("vect_analyze_loop_form");
1346 /* Different restrictions apply when we are considering an inner-most loop,
1347 vs. an outer (nested) loop.
1348 (FORNOW. May want to relax some of these restrictions in the future). */
1350 info->inner_loop_cond = NULL;
1351 if (!loop->inner)
1353 /* Inner-most loop. We currently require that the number of BBs is
1354 exactly 2 (the header and latch). Vectorizable inner-most loops
1355 look like this:
1357 (pre-header)
1359 header <--------+
1360 | | |
1361 | +--> latch --+
1363 (exit-bb) */
1365 if (loop->num_nodes != 2)
1366 return opt_result::failure_at (vect_location,
1367 "not vectorized:"
1368 " control flow in loop.\n");
1370 if (empty_block_p (loop->header))
1371 return opt_result::failure_at (vect_location,
1372 "not vectorized: empty loop.\n");
1374 else
1376 class loop *innerloop = loop->inner;
1377 edge entryedge;
1379 /* Nested loop. We currently require that the loop is doubly-nested,
1380 contains a single inner loop, and the number of BBs is exactly 5.
1381 Vectorizable outer-loops look like this:
1383 (pre-header)
1385 header <---+
1387 inner-loop |
1389 tail ------+
1391 (exit-bb)
1393 The inner-loop has the properties expected of inner-most loops
1394 as described above. */
1396 if ((loop->inner)->inner || (loop->inner)->next)
1397 return opt_result::failure_at (vect_location,
1398 "not vectorized:"
1399 " multiple nested loops.\n");
1401 if (loop->num_nodes != 5)
1402 return opt_result::failure_at (vect_location,
1403 "not vectorized:"
1404 " control flow in loop.\n");
1406 entryedge = loop_preheader_edge (innerloop);
1407 if (entryedge->src != loop->header
1408 || !single_exit (innerloop)
1409 || single_exit (innerloop)->dest != EDGE_PRED (loop->latch, 0)->src)
1410 return opt_result::failure_at (vect_location,
1411 "not vectorized:"
1412 " unsupported outerloop form.\n");
1414 /* Analyze the inner-loop. */
1415 vect_loop_form_info inner;
1416 opt_result res = vect_analyze_loop_form (loop->inner, &inner);
1417 if (!res)
1419 if (dump_enabled_p ())
1420 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1421 "not vectorized: Bad inner loop.\n");
1422 return res;
1425 /* Don't support analyzing niter under assumptions for inner
1426 loop. */
1427 if (!integer_onep (inner.assumptions))
1428 return opt_result::failure_at (vect_location,
1429 "not vectorized: Bad inner loop.\n");
1431 if (!expr_invariant_in_loop_p (loop, inner.number_of_iterations))
1432 return opt_result::failure_at (vect_location,
1433 "not vectorized: inner-loop count not"
1434 " invariant.\n");
1436 if (dump_enabled_p ())
1437 dump_printf_loc (MSG_NOTE, vect_location,
1438 "Considering outer-loop vectorization.\n");
1439 info->inner_loop_cond = inner.loop_cond;
1442 if (!single_exit (loop))
1443 return opt_result::failure_at (vect_location,
1444 "not vectorized: multiple exits.\n");
1445 if (EDGE_COUNT (loop->header->preds) != 2)
1446 return opt_result::failure_at (vect_location,
1447 "not vectorized:"
1448 " too many incoming edges.\n");
1450 /* We assume that the loop exit condition is at the end of the loop, i.e.,
1451 that the loop is represented as a do-while (with a proper if-guard
1452 before the loop if needed), where the loop header contains all the
1453 executable statements, and the latch is empty. */
1454 if (!empty_block_p (loop->latch)
1455 || !gimple_seq_empty_p (phi_nodes (loop->latch)))
1456 return opt_result::failure_at (vect_location,
1457 "not vectorized: latch block not empty.\n");
1459 /* Make sure the exit is not abnormal. */
1460 edge e = single_exit (loop);
1461 if (e->flags & EDGE_ABNORMAL)
1462 return opt_result::failure_at (vect_location,
1463 "not vectorized:"
1464 " abnormal loop exit edge.\n");
1466 info->loop_cond
1467 = vect_get_loop_niters (loop, &info->assumptions,
1468 &info->number_of_iterations,
1469 &info->number_of_iterationsm1);
1470 if (!info->loop_cond)
1471 return opt_result::failure_at
1472 (vect_location,
1473 "not vectorized: complicated exit condition.\n");
1475 if (integer_zerop (info->assumptions)
1476 || !info->number_of_iterations
1477 || chrec_contains_undetermined (info->number_of_iterations))
1478 return opt_result::failure_at
1479 (info->loop_cond,
1480 "not vectorized: number of iterations cannot be computed.\n");
1482 if (integer_zerop (info->number_of_iterations))
1483 return opt_result::failure_at
1484 (info->loop_cond,
1485 "not vectorized: number of iterations = 0.\n");
1487 if (!(tree_fits_shwi_p (info->number_of_iterations)
1488 && tree_to_shwi (info->number_of_iterations) > 0))
1490 if (dump_enabled_p ())
1492 dump_printf_loc (MSG_NOTE, vect_location,
1493 "Symbolic number of iterations is ");
1494 dump_generic_expr (MSG_NOTE, TDF_DETAILS, info->number_of_iterations);
1495 dump_printf (MSG_NOTE, "\n");
1499 return opt_result::success ();
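/* Two illustrative outcomes of the checks above: a counted loop such as
   "for (i = 0; i < n; i++) a[i] = b[i];" has, in do-while form, exactly a
   header and a latch, an empty latch block and a computable (possibly
   symbolic) iteration count, so it is accepted; a pointer-chasing loop
   like "while (p) p = p->next;" is rejected because its number of
   iterations cannot be computed.  */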
1502 /* Create a loop_vec_info for LOOP with SHARED and the
1503 vect_analyze_loop_form result. */
1505 loop_vec_info
1506 vect_create_loop_vinfo (class loop *loop, vec_info_shared *shared,
1507 const vect_loop_form_info *info,
1508 loop_vec_info main_loop_info)
1510 loop_vec_info loop_vinfo = new _loop_vec_info (loop, shared);
1511 LOOP_VINFO_NITERSM1 (loop_vinfo) = info->number_of_iterationsm1;
1512 LOOP_VINFO_NITERS (loop_vinfo) = info->number_of_iterations;
1513 LOOP_VINFO_NITERS_UNCHANGED (loop_vinfo) = info->number_of_iterations;
1514 LOOP_VINFO_ORIG_LOOP_INFO (loop_vinfo) = main_loop_info;
1515 /* Also record the assumptions for versioning. */
1516 if (!integer_onep (info->assumptions) && !main_loop_info)
1517 LOOP_VINFO_NITERS_ASSUMPTIONS (loop_vinfo) = info->assumptions;
1519 stmt_vec_info loop_cond_info = loop_vinfo->lookup_stmt (info->loop_cond);
1520 STMT_VINFO_TYPE (loop_cond_info) = loop_exit_ctrl_vec_info_type;
1521 if (info->inner_loop_cond)
1523 stmt_vec_info inner_loop_cond_info
1524 = loop_vinfo->lookup_stmt (info->inner_loop_cond);
1525 STMT_VINFO_TYPE (inner_loop_cond_info) = loop_exit_ctrl_vec_info_type;
1526 /* If we have an estimate on the number of iterations of the inner
1527 loop use that to limit the scale for costing, otherwise use
1528 --param vect-inner-loop-cost-factor literally. */
1529 widest_int nit;
1530 if (estimated_stmt_executions (loop->inner, &nit))
1531 LOOP_VINFO_INNER_LOOP_COST_FACTOR (loop_vinfo)
1532 = wi::smin (nit, param_vect_inner_loop_cost_factor).to_uhwi ();
1535 return loop_vinfo;
1540 /* Scan the loop stmts and, depending on whether there are any (non-)SLP
1541 statements, update the vectorization factor.
1543 static void
1544 vect_update_vf_for_slp (loop_vec_info loop_vinfo)
1546 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1547 basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
1548 int nbbs = loop->num_nodes;
1549 poly_uint64 vectorization_factor;
1550 int i;
1552 DUMP_VECT_SCOPE ("vect_update_vf_for_slp");
1554 vectorization_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
1555 gcc_assert (known_ne (vectorization_factor, 0U));
1557 /* If all the stmts in the loop can be SLPed, we perform only SLP, and
1558 vectorization factor of the loop is the unrolling factor required by
1559 the SLP instances. If that unrolling factor is 1, we say that we
1560 perform pure SLP on the loop - cross-iteration parallelism is not
1561 exploited.
1562 bool only_slp_in_loop = true;
1563 for (i = 0; i < nbbs; i++)
1565 basic_block bb = bbs[i];
1566 for (gphi_iterator si = gsi_start_phis (bb); !gsi_end_p (si);
1567 gsi_next (&si))
1569 stmt_vec_info stmt_info = loop_vinfo->lookup_stmt (si.phi ());
1570 if (!stmt_info)
1571 continue;
1572 if ((STMT_VINFO_RELEVANT_P (stmt_info)
1573 || VECTORIZABLE_CYCLE_DEF (STMT_VINFO_DEF_TYPE (stmt_info)))
1574 && !PURE_SLP_STMT (stmt_info))
1575 /* STMT needs both SLP and loop-based vectorization. */
1576 only_slp_in_loop = false;
1578 for (gimple_stmt_iterator si = gsi_start_bb (bb); !gsi_end_p (si);
1579 gsi_next (&si))
1581 if (is_gimple_debug (gsi_stmt (si)))
1582 continue;
1583 stmt_vec_info stmt_info = loop_vinfo->lookup_stmt (gsi_stmt (si));
1584 stmt_info = vect_stmt_to_vectorize (stmt_info);
1585 if ((STMT_VINFO_RELEVANT_P (stmt_info)
1586 || VECTORIZABLE_CYCLE_DEF (STMT_VINFO_DEF_TYPE (stmt_info)))
1587 && !PURE_SLP_STMT (stmt_info))
1588 /* STMT needs both SLP and loop-based vectorization. */
1589 only_slp_in_loop = false;
1593 if (only_slp_in_loop)
1595 if (dump_enabled_p ())
1596 dump_printf_loc (MSG_NOTE, vect_location,
1597 "Loop contains only SLP stmts\n");
1598 vectorization_factor = LOOP_VINFO_SLP_UNROLLING_FACTOR (loop_vinfo);
1600 else
1602 if (dump_enabled_p ())
1603 dump_printf_loc (MSG_NOTE, vect_location,
1604 "Loop contains SLP and non-SLP stmts\n");
1605 /* Both the vectorization factor and unroll factor have the form
1606 GET_MODE_SIZE (loop_vinfo->vector_mode) * X for some rational X,
1607 so they must have a common multiple. */
1608 vectorization_factor
1609 = force_common_multiple (vectorization_factor,
1610 LOOP_VINFO_SLP_UNROLLING_FACTOR (loop_vinfo));
1613 LOOP_VINFO_VECT_FACTOR (loop_vinfo) = vectorization_factor;
1614 if (dump_enabled_p ())
1616 dump_printf_loc (MSG_NOTE, vect_location,
1617 "Updating vectorization factor to ");
1618 dump_dec (MSG_NOTE, vectorization_factor);
1619 dump_printf (MSG_NOTE, ".\n");
1623 /* Return true if STMT_INFO describes a double reduction phi and if
1624 the other phi in the reduction is also relevant for vectorization.
1625 This rejects cases such as:
1627 outer1:
1628 x_1 = PHI <x_3(outer2), ...>;
1631 inner:
1632 x_2 = ...;
1635 outer2:
1636 x_3 = PHI <x_2(inner)>;
1638 if nothing in x_2 or elsewhere makes x_1 relevant. */
1640 static bool
1641 vect_active_double_reduction_p (stmt_vec_info stmt_info)
1643 if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_double_reduction_def)
1644 return false;
1646 return STMT_VINFO_RELEVANT_P (STMT_VINFO_REDUC_DEF (stmt_info));
1649 /* Function vect_analyze_loop_operations.
1651 Scan the loop stmts and make sure they are all vectorizable. */
1653 static opt_result
1654 vect_analyze_loop_operations (loop_vec_info loop_vinfo)
1656 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1657 basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
1658 int nbbs = loop->num_nodes;
1659 int i;
1660 stmt_vec_info stmt_info;
1661 bool need_to_vectorize = false;
1662 bool ok;
1664 DUMP_VECT_SCOPE ("vect_analyze_loop_operations");
1666 auto_vec<stmt_info_for_cost> cost_vec;
1668 for (i = 0; i < nbbs; i++)
1670 basic_block bb = bbs[i];
1672 for (gphi_iterator si = gsi_start_phis (bb); !gsi_end_p (si);
1673 gsi_next (&si))
1675 gphi *phi = si.phi ();
1676 ok = true;
1678 stmt_info = loop_vinfo->lookup_stmt (phi);
1679 if (dump_enabled_p ())
1680 dump_printf_loc (MSG_NOTE, vect_location, "examining phi: %G", phi);
1681 if (virtual_operand_p (gimple_phi_result (phi)))
1682 continue;
1684 /* Inner-loop loop-closed exit phi in outer-loop vectorization
1685 (i.e., a phi in the tail of the outer-loop). */
1686 if (! is_loop_header_bb_p (bb))
1688 /* FORNOW: we currently don't support the case that these phis
1689 are not used in the outerloop (unless it is double reduction,
1690 i.e., this phi is vect_reduction_def), because this case
1691 would require actually doing something here. */
1692 if (STMT_VINFO_LIVE_P (stmt_info)
1693 && !vect_active_double_reduction_p (stmt_info))
1694 return opt_result::failure_at (phi,
1695 "Unsupported loop-closed phi"
1696 " in outer-loop.\n");
1698 /* If PHI is used in the outer loop, we check that its operand
1699 is defined in the inner loop. */
1700 if (STMT_VINFO_RELEVANT_P (stmt_info))
1702 tree phi_op;
1704 if (gimple_phi_num_args (phi) != 1)
1705 return opt_result::failure_at (phi, "unsupported phi");
1707 phi_op = PHI_ARG_DEF (phi, 0);
1708 stmt_vec_info op_def_info = loop_vinfo->lookup_def (phi_op);
1709 if (!op_def_info)
1710 return opt_result::failure_at (phi, "unsupported phi\n");
1712 if (STMT_VINFO_RELEVANT (op_def_info) != vect_used_in_outer
1713 && (STMT_VINFO_RELEVANT (op_def_info)
1714 != vect_used_in_outer_by_reduction))
1715 return opt_result::failure_at (phi, "unsupported phi\n");
1717 if ((STMT_VINFO_DEF_TYPE (stmt_info) == vect_internal_def
1718 || (STMT_VINFO_DEF_TYPE (stmt_info)
1719 == vect_double_reduction_def))
1720 && !vectorizable_lc_phi (loop_vinfo,
1721 stmt_info, NULL, NULL))
1722 return opt_result::failure_at (phi, "unsupported phi\n");
1725 continue;
1728 gcc_assert (stmt_info);
1730 if ((STMT_VINFO_RELEVANT (stmt_info) == vect_used_in_scope
1731 || STMT_VINFO_LIVE_P (stmt_info))
1732 && STMT_VINFO_DEF_TYPE (stmt_info) != vect_induction_def)
1733 /* A scalar-dependence cycle that we don't support. */
1734 return opt_result::failure_at (phi,
1735 "not vectorized:"
1736 " scalar dependence cycle.\n");
1738 if (STMT_VINFO_RELEVANT_P (stmt_info))
1740 need_to_vectorize = true;
1741 if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_induction_def
1742 && ! PURE_SLP_STMT (stmt_info))
1743 ok = vectorizable_induction (loop_vinfo,
1744 stmt_info, NULL, NULL,
1745 &cost_vec);
1746 else if ((STMT_VINFO_DEF_TYPE (stmt_info) == vect_reduction_def
1747 || (STMT_VINFO_DEF_TYPE (stmt_info)
1748 == vect_double_reduction_def)
1749 || STMT_VINFO_DEF_TYPE (stmt_info) == vect_nested_cycle)
1750 && ! PURE_SLP_STMT (stmt_info))
1751 ok = vectorizable_reduction (loop_vinfo,
1752 stmt_info, NULL, NULL, &cost_vec);
1755 /* SLP PHIs are tested by vect_slp_analyze_node_operations. */
1756 if (ok
1757 && STMT_VINFO_LIVE_P (stmt_info)
1758 && !PURE_SLP_STMT (stmt_info))
1759 ok = vectorizable_live_operation (loop_vinfo,
1760 stmt_info, NULL, NULL, NULL,
1761 -1, false, &cost_vec);
1763 if (!ok)
1764 return opt_result::failure_at (phi,
1765 "not vectorized: relevant phi not "
1766 "supported: %G",
1767 static_cast <gimple *> (phi));
1770 for (gimple_stmt_iterator si = gsi_start_bb (bb); !gsi_end_p (si);
1771 gsi_next (&si))
1773 gimple *stmt = gsi_stmt (si);
1774 if (!gimple_clobber_p (stmt)
1775 && !is_gimple_debug (stmt))
1777 opt_result res
1778 = vect_analyze_stmt (loop_vinfo,
1779 loop_vinfo->lookup_stmt (stmt),
1780 &need_to_vectorize,
1781 NULL, NULL, &cost_vec);
1782 if (!res)
1783 return res;
1786 } /* bbs */
1788 add_stmt_costs (loop_vinfo->vector_costs, &cost_vec);
1790 /* All operations in the loop are either irrelevant (they deal with loop
1791 control, or are dead), or only used outside the loop and can be moved
1792 out of the loop (e.g. invariants, inductions). The loop can be
1793 optimized away by scalar optimizations. We're better off not
1794 touching this loop. */
1795 if (!need_to_vectorize)
1797 if (dump_enabled_p ())
1798 dump_printf_loc (MSG_NOTE, vect_location,
1799 "All the computation can be taken out of the loop.\n");
1800 return opt_result::failure_at
1801 (vect_location,
1802 "not vectorized: redundant loop. no profit to vectorize.\n");
1805 return opt_result::success ();
1808 /* Return true if we know that the iteration count is smaller than the
1809 vectorization factor. Return false if it isn't, or if we can't be sure
1810 either way. */
1812 static bool
1813 vect_known_niters_smaller_than_vf (loop_vec_info loop_vinfo)
1815 unsigned int assumed_vf = vect_vf_for_cost (loop_vinfo);
1817 HOST_WIDE_INT max_niter;
1818 if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
1819 max_niter = LOOP_VINFO_INT_NITERS (loop_vinfo);
1820 else
1821 max_niter = max_stmt_executions_int (LOOP_VINFO_LOOP (loop_vinfo));
1823 if (max_niter != -1 && (unsigned HOST_WIDE_INT) max_niter < assumed_vf)
1824 return true;
1826 return false;
1829 /* Analyze the cost of the loop described by LOOP_VINFO. Decide if it
1830 is worthwhile to vectorize. Return 1 if definitely yes, 0 if
1831 definitely no, or -1 if it's worth retrying. */
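/* (A -1 return makes vect_analyze_loop_2 take its "again" path, so the
   analysis may be retried, for instance with SLP disabled.)  */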
1833 static int
1834 vect_analyze_loop_costing (loop_vec_info loop_vinfo,
1835 unsigned *suggested_unroll_factor)
1837 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1838 unsigned int assumed_vf = vect_vf_for_cost (loop_vinfo);
1840 /* Only loops that can handle partially-populated vectors can have iteration
1841 counts less than the vectorization factor. */
1842 if (!LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))
1844 if (vect_known_niters_smaller_than_vf (loop_vinfo))
1846 if (dump_enabled_p ())
1847 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1848 "not vectorized: iteration count smaller than "
1849 "vectorization factor.\n");
1850 return 0;
1854 /* If using the "very cheap" model, reject cases in which we'd keep
1855 a copy of the scalar code (even if we might be able to vectorize it). */
1856 if (loop_cost_model (loop) == VECT_COST_MODEL_VERY_CHEAP
1857 && (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo)
1858 || LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo)
1859 || LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo)))
1861 if (dump_enabled_p ())
1862 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1863 "some scalar iterations would need to be peeled\n");
1864 return 0;
1867 int min_profitable_iters, min_profitable_estimate;
1868 vect_estimate_min_profitable_iters (loop_vinfo, &min_profitable_iters,
1869 &min_profitable_estimate,
1870 suggested_unroll_factor);
1872 if (min_profitable_iters < 0)
1874 if (dump_enabled_p ())
1875 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1876 "not vectorized: vectorization not profitable.\n");
1877 if (dump_enabled_p ())
1878 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1879 "not vectorized: vector version will never be "
1880 "profitable.\n");
1881 return -1;
1884 int min_scalar_loop_bound = (param_min_vect_loop_bound
1885 * assumed_vf);
1887 /* Use the cost model only if it is more conservative than user specified
1888 threshold. */
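/* For example, if --param min-vect-loop-bound were 2 with an assumed VF of 4,
   min_scalar_loop_bound would be 8, so a computed min_profitable_iters of 5
   would be raised to a threshold of 8.  */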
1889 unsigned int th = (unsigned) MAX (min_scalar_loop_bound,
1890 min_profitable_iters);
1892 LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo) = th;
1894 if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
1895 && LOOP_VINFO_INT_NITERS (loop_vinfo) < th)
1897 if (dump_enabled_p ())
1898 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1899 "not vectorized: vectorization not profitable.\n");
1900 if (dump_enabled_p ())
1901 dump_printf_loc (MSG_NOTE, vect_location,
1902 "not vectorized: iteration count smaller than user "
1903 "specified loop bound parameter or minimum profitable "
1904 "iterations (whichever is more conservative).\n");
1905 return 0;
1908 /* The static profitability threshold min_profitable_estimate includes
1909 the cost of having to check at runtime whether the scalar loop
1910 should be used instead. If it turns out that we don't need or want
1911 such a check, the threshold we should use for the static estimate
1912 is simply the point at which the vector loop becomes more profitable
1913 than the scalar loop. */
1914 if (min_profitable_estimate > min_profitable_iters
1915 && !LOOP_REQUIRES_VERSIONING (loop_vinfo)
1916 && !LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo)
1917 && !LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo)
1918 && !vect_apply_runtime_profitability_check_p (loop_vinfo))
1920 if (dump_enabled_p ())
1921 dump_printf_loc (MSG_NOTE, vect_location, "no need for a runtime"
1922 " choice between the scalar and vector loops\n");
1923 min_profitable_estimate = min_profitable_iters;
1926 /* If the vector loop needs multiple iterations to be beneficial then
1927 things are probably too close to call, and the conservative thing
1928 would be to stick with the scalar code. */
1929 if (loop_cost_model (loop) == VECT_COST_MODEL_VERY_CHEAP
1930 && min_profitable_estimate > (int) vect_vf_for_cost (loop_vinfo))
1932 if (dump_enabled_p ())
1933 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1934 "one iteration of the vector loop would be"
1935 " more expensive than the equivalent number of"
1936 " iterations of the scalar loop\n");
1937 return 0;
1940 HOST_WIDE_INT estimated_niter;
1942 /* If we are vectorizing an epilogue then we know the maximum number of
1943 scalar iterations it will cover is at least one lower than the
1944 vectorization factor of the main loop. */
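/* For example, if the main loop was vectorized with VF 8, its epilogue can
   cover at most 7 scalar iterations, so 7 is the estimate used here.  */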
1945 if (LOOP_VINFO_EPILOGUE_P (loop_vinfo))
1946 estimated_niter
1947 = vect_vf_for_cost (LOOP_VINFO_ORIG_LOOP_INFO (loop_vinfo)) - 1;
1948 else
1950 estimated_niter = estimated_stmt_executions_int (loop);
1951 if (estimated_niter == -1)
1952 estimated_niter = likely_max_stmt_executions_int (loop);
1954 if (estimated_niter != -1
1955 && ((unsigned HOST_WIDE_INT) estimated_niter
1956 < MAX (th, (unsigned) min_profitable_estimate)))
1958 if (dump_enabled_p ())
1959 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1960 "not vectorized: estimated iteration count too "
1961 "small.\n");
1962 if (dump_enabled_p ())
1963 dump_printf_loc (MSG_NOTE, vect_location,
1964 "not vectorized: estimated iteration count smaller "
1965 "than specified loop bound parameter or minimum "
1966 "profitable iterations (whichever is more "
1967 "conservative).\n");
1968 return -1;
1971 return 1;
1974 static opt_result
1975 vect_get_datarefs_in_loop (loop_p loop, basic_block *bbs,
1976 vec<data_reference_p> *datarefs,
1977 unsigned int *n_stmts)
1979 *n_stmts = 0;
1980 for (unsigned i = 0; i < loop->num_nodes; i++)
1981 for (gimple_stmt_iterator gsi = gsi_start_bb (bbs[i]);
1982 !gsi_end_p (gsi); gsi_next (&gsi))
1984 gimple *stmt = gsi_stmt (gsi);
1985 if (is_gimple_debug (stmt))
1986 continue;
1987 ++(*n_stmts);
1988 opt_result res = vect_find_stmt_data_reference (loop, stmt, datarefs,
1989 NULL, 0);
1990 if (!res)
1992 if (is_gimple_call (stmt) && loop->safelen)
1994 tree fndecl = gimple_call_fndecl (stmt), op;
1995 if (fndecl != NULL_TREE)
1997 cgraph_node *node = cgraph_node::get (fndecl);
1998 if (node != NULL && node->simd_clones != NULL)
2000 unsigned int j, n = gimple_call_num_args (stmt);
2001 for (j = 0; j < n; j++)
2003 op = gimple_call_arg (stmt, j);
2004 if (DECL_P (op)
2005 || (REFERENCE_CLASS_P (op)
2006 && get_base_address (op)))
2007 break;
2009 op = gimple_call_lhs (stmt);
2010 /* Ignore #pragma omp declare simd functions
2011 if they don't have data references in the
2012 call stmt itself. */
2013 if (j == n
2014 && !(op
2015 && (DECL_P (op)
2016 || (REFERENCE_CLASS_P (op)
2017 && get_base_address (op)))))
2018 continue;
2022 return res;
2024 /* If dependence analysis will give up due to the limit on the
2025 number of datarefs stop here and fail fatally. */
2026 if (datarefs->length ()
2027 > (unsigned)param_loop_max_datarefs_for_datadeps)
2028 return opt_result::failure_at (stmt, "exceeded param "
2029 "loop-max-datarefs-for-datadeps\n");
2031 return opt_result::success ();
2034 /* Look for SLP-only access groups and turn each individual access into its own
2035 group. */
2036 static void
2037 vect_dissolve_slp_only_groups (loop_vec_info loop_vinfo)
2039 unsigned int i;
2040 struct data_reference *dr;
2042 DUMP_VECT_SCOPE ("vect_dissolve_slp_only_groups");
2044 vec<data_reference_p> datarefs = LOOP_VINFO_DATAREFS (loop_vinfo);
2045 FOR_EACH_VEC_ELT (datarefs, i, dr)
2047 gcc_assert (DR_REF (dr));
2048 stmt_vec_info stmt_info = loop_vinfo->lookup_stmt (DR_STMT (dr));
2050 /* Check if the load is a part of an interleaving chain. */
2051 if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
2053 stmt_vec_info first_element = DR_GROUP_FIRST_ELEMENT (stmt_info);
2054 dr_vec_info *dr_info = STMT_VINFO_DR_INFO (first_element);
2055 unsigned int group_size = DR_GROUP_SIZE (first_element);
2057 /* Check if this is an SLP-only group. */
2058 if (!STMT_SLP_TYPE (stmt_info)
2059 && STMT_VINFO_SLP_VECT_ONLY (first_element))
2061 /* Dissolve the group. */
2062 STMT_VINFO_SLP_VECT_ONLY (first_element) = false;
2064 stmt_vec_info vinfo = first_element;
2065 while (vinfo)
2067 stmt_vec_info next = DR_GROUP_NEXT_ELEMENT (vinfo);
2068 DR_GROUP_FIRST_ELEMENT (vinfo) = vinfo;
2069 DR_GROUP_NEXT_ELEMENT (vinfo) = NULL;
2070 DR_GROUP_SIZE (vinfo) = 1;
2071 if (STMT_VINFO_STRIDED_P (first_element))
2072 DR_GROUP_GAP (vinfo) = 0;
2073 else
2074 DR_GROUP_GAP (vinfo) = group_size - 1;
2075 /* Duplicate and adjust alignment info, it needs to
2076 be present on each group leader, see dr_misalignment. */
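/* For example, if the old group leader had misalignment 0 with a target
   alignment of 16 and this element's DR_INIT is 4 bytes larger, the element
   becomes a leader with misalignment (0 + 4) % 16 == 4.  */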
2077 if (vinfo != first_element)
2079 dr_vec_info *dr_info2 = STMT_VINFO_DR_INFO (vinfo);
2080 dr_info2->target_alignment = dr_info->target_alignment;
2081 int misalignment = dr_info->misalignment;
2082 if (misalignment != DR_MISALIGNMENT_UNKNOWN)
2084 HOST_WIDE_INT diff
2085 = (TREE_INT_CST_LOW (DR_INIT (dr_info2->dr))
2086 - TREE_INT_CST_LOW (DR_INIT (dr_info->dr)));
2087 unsigned HOST_WIDE_INT align_c
2088 = dr_info->target_alignment.to_constant ();
2089 misalignment = (misalignment + diff) % align_c;
2091 dr_info2->misalignment = misalignment;
2093 vinfo = next;
2100 /* Determine if operating on full vectors for LOOP_VINFO might leave
2101 some scalar iterations still to do. If so, decide how we should
2102 handle those scalar iterations. The possibilities are:
2104 (1) Make LOOP_VINFO operate on partial vectors instead of full vectors.
2105 In this case:
2107 LOOP_VINFO_USING_PARTIAL_VECTORS_P == true
2108 LOOP_VINFO_EPIL_USING_PARTIAL_VECTORS_P == false
2109 LOOP_VINFO_PEELING_FOR_NITER == false
2111 (2) Make LOOP_VINFO operate on full vectors and use an epilogue loop
2112 to handle the remaining scalar iterations. In this case:
2114 LOOP_VINFO_USING_PARTIAL_VECTORS_P == false
2115 LOOP_VINFO_PEELING_FOR_NITER == true
2117 There are two choices:
2119 (2a) Consider vectorizing the epilogue loop at the same VF as the
2120 main loop, but using partial vectors instead of full vectors.
2121 In this case:
2123 LOOP_VINFO_EPIL_USING_PARTIAL_VECTORS_P == true
2125 (2b) Consider vectorizing the epilogue loop at lower VFs only.
2126 In this case:
2128 LOOP_VINFO_EPIL_USING_PARTIAL_VECTORS_P == false
2130 When FOR_EPILOGUE_P is true, make this determination based on the
2131 assumption that LOOP_VINFO is an epilogue loop, otherwise make it
2132 based on the assumption that LOOP_VINFO is the main loop. The caller
2133 has made sure that the number of iterations is set appropriately for
2134 this value of FOR_EPILOGUE_P. */
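/* As a concrete illustration, with a VF of 4 and 10 scalar iterations, two
   full vector iterations leave 2 scalar iterations to handle: under (1) they
   are covered by a final partial (masked or length-limited) vector
   iteration, while under (2) they run in the epilogue loop.  */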
2136 opt_result
2137 vect_determine_partial_vectors_and_peeling (loop_vec_info loop_vinfo,
2138 bool for_epilogue_p)
2140 /* Determine whether there would be any scalar iterations left over. */
2141 bool need_peeling_or_partial_vectors_p
2142 = vect_need_peeling_or_partial_vectors_p (loop_vinfo);
2144 /* Decide whether to vectorize the loop with partial vectors. */
2145 LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo) = false;
2146 LOOP_VINFO_EPIL_USING_PARTIAL_VECTORS_P (loop_vinfo) = false;
2147 if (LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo)
2148 && need_peeling_or_partial_vectors_p)
2150 /* For partial-vector-usage=1, try to push the handling of partial
2151 vectors to the epilogue, with the main loop continuing to operate
2152 on full vectors.
2154 If we are unrolling we also do not want to use partial vectors. This
2155 is to avoid the overhead of generating multiple masks and also to
2156 avoid having to execute entire iterations of FALSE masked instructions
2157 when dealing with one or fewer full iterations.
2159 ??? We could then end up failing to use partial vectors if we
2160 decide to peel iterations into a prologue, and if the main loop
2161 then ends up processing fewer than VF iterations. */
2162 if ((param_vect_partial_vector_usage == 1
2163 || loop_vinfo->suggested_unroll_factor > 1)
2164 && !LOOP_VINFO_EPILOGUE_P (loop_vinfo)
2165 && !vect_known_niters_smaller_than_vf (loop_vinfo))
2166 LOOP_VINFO_EPIL_USING_PARTIAL_VECTORS_P (loop_vinfo) = true;
2167 else
2168 LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo) = true;
2171 if (dump_enabled_p ())
2173 if (LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))
2174 dump_printf_loc (MSG_NOTE, vect_location,
2175 "operating on partial vectors%s.\n",
2176 for_epilogue_p ? " for epilogue loop" : "");
2177 else
2178 dump_printf_loc (MSG_NOTE, vect_location,
2179 "operating only on full vectors%s.\n",
2180 for_epilogue_p ? " for epilogue loop" : "");
2183 if (for_epilogue_p)
2185 loop_vec_info orig_loop_vinfo = LOOP_VINFO_ORIG_LOOP_INFO (loop_vinfo);
2186 gcc_assert (orig_loop_vinfo);
2187 if (!LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))
2188 gcc_assert (known_lt (LOOP_VINFO_VECT_FACTOR (loop_vinfo),
2189 LOOP_VINFO_VECT_FACTOR (orig_loop_vinfo)));
2192 if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
2193 && !LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))
2195 /* Check that the loop processes at least one full vector. */
2196 poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
2197 tree scalar_niters = LOOP_VINFO_NITERS (loop_vinfo);
2198 if (known_lt (wi::to_widest (scalar_niters), vf))
2199 return opt_result::failure_at (vect_location,
2200 "loop does not have enough iterations"
2201 " to support vectorization.\n");
2203 /* If we need to peel an extra epilogue iteration to handle data
2204 accesses with gaps, check that there are enough scalar iterations
2205 available.
2207 The check above is redundant with this one when peeling for gaps,
2208 but the distinction is useful for diagnostics. */
2209 tree scalar_nitersm1 = LOOP_VINFO_NITERSM1 (loop_vinfo);
2210 if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo)
2211 && known_lt (wi::to_widest (scalar_nitersm1), vf))
2212 return opt_result::failure_at (vect_location,
2213 "loop does not have enough iterations"
2214 " to support peeling for gaps.\n");
2217 LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo)
2218 = (!LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo)
2219 && need_peeling_or_partial_vectors_p);
2221 return opt_result::success ();
2224 /* Function vect_analyze_loop_2.
2226 Apply a set of analyses on LOOP, and create a loop_vec_info struct
2227 for it. The different analyses will record information in the
2228 loop_vec_info struct. */
2229 static opt_result
2230 vect_analyze_loop_2 (loop_vec_info loop_vinfo, bool &fatal,
2231 unsigned *suggested_unroll_factor)
2233 opt_result ok = opt_result::success ();
2234 int res;
2235 unsigned int max_vf = MAX_VECTORIZATION_FACTOR;
2236 poly_uint64 min_vf = 2;
2237 loop_vec_info orig_loop_vinfo = NULL;
2239 /* If we are dealing with an epilogue then orig_loop_vinfo points to the
2240 loop_vec_info of the first vectorized loop. */
2241 if (LOOP_VINFO_EPILOGUE_P (loop_vinfo))
2242 orig_loop_vinfo = LOOP_VINFO_ORIG_LOOP_INFO (loop_vinfo);
2243 else
2244 orig_loop_vinfo = loop_vinfo;
2245 gcc_assert (orig_loop_vinfo);
2247 /* The first group of checks is independent of the vector size. */
2248 fatal = true;
2250 if (LOOP_VINFO_SIMD_IF_COND (loop_vinfo)
2251 && integer_zerop (LOOP_VINFO_SIMD_IF_COND (loop_vinfo)))
2252 return opt_result::failure_at (vect_location,
2253 "not vectorized: simd if(0)\n");
2255 /* Find all data references in the loop (which correspond to vdefs/vuses)
2256 and analyze their evolution in the loop. */
2258 loop_p loop = LOOP_VINFO_LOOP (loop_vinfo);
2260 /* Gather the data references and count stmts in the loop. */
2261 if (!LOOP_VINFO_DATAREFS (loop_vinfo).exists ())
2263 opt_result res
2264 = vect_get_datarefs_in_loop (loop, LOOP_VINFO_BBS (loop_vinfo),
2265 &LOOP_VINFO_DATAREFS (loop_vinfo),
2266 &LOOP_VINFO_N_STMTS (loop_vinfo));
2267 if (!res)
2269 if (dump_enabled_p ())
2270 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2271 "not vectorized: loop contains function "
2272 "calls or data references that cannot "
2273 "be analyzed\n");
2274 return res;
2276 loop_vinfo->shared->save_datarefs ();
2278 else
2279 loop_vinfo->shared->check_datarefs ();
2281 /* Analyze the data references and also adjust the minimal
2282 vectorization factor according to the loads and stores. */
2284 ok = vect_analyze_data_refs (loop_vinfo, &min_vf, &fatal);
2285 if (!ok)
2287 if (dump_enabled_p ())
2288 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2289 "bad data references.\n");
2290 return ok;
2293 /* Classify all cross-iteration scalar data-flow cycles.
2294 Cross-iteration cycles caused by virtual phis are analyzed separately. */
2295 vect_analyze_scalar_cycles (loop_vinfo);
2297 vect_pattern_recog (loop_vinfo);
2299 vect_fixup_scalar_cycles_with_patterns (loop_vinfo);
2301 /* Analyze the access patterns of the data-refs in the loop (consecutive,
2302 complex, etc.). FORNOW: Only handle consecutive access pattern. */
2304 ok = vect_analyze_data_ref_accesses (loop_vinfo, NULL);
2305 if (!ok)
2307 if (dump_enabled_p ())
2308 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2309 "bad data access.\n");
2310 return ok;
2313 /* Data-flow analysis to detect stmts that do not need to be vectorized. */
2315 ok = vect_mark_stmts_to_be_vectorized (loop_vinfo, &fatal);
2316 if (!ok)
2318 if (dump_enabled_p ())
2319 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2320 "unexpected pattern.\n");
2321 return ok;
2324 /* The rest of the analysis below depends on the vector size in some way, so from here on failures are no longer treated as fatal. */
2325 fatal = false;
2327 /* Analyze data dependences between the data-refs in the loop
2328 and adjust the maximum vectorization factor according to
2329 the dependences.
2330 FORNOW: fail at the first data dependence that we encounter. */
2332 ok = vect_analyze_data_ref_dependences (loop_vinfo, &max_vf);
2333 if (!ok)
2335 if (dump_enabled_p ())
2336 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2337 "bad data dependence.\n");
2338 return ok;
2340 if (max_vf != MAX_VECTORIZATION_FACTOR
2341 && maybe_lt (max_vf, min_vf))
2342 return opt_result::failure_at (vect_location, "bad data dependence.\n");
2343 LOOP_VINFO_MAX_VECT_FACTOR (loop_vinfo) = max_vf;
2345 ok = vect_determine_vectorization_factor (loop_vinfo);
2346 if (!ok)
2348 if (dump_enabled_p ())
2349 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2350 "can't determine vectorization factor.\n");
2351 return ok;
2353 if (max_vf != MAX_VECTORIZATION_FACTOR
2354 && maybe_lt (max_vf, LOOP_VINFO_VECT_FACTOR (loop_vinfo)))
2355 return opt_result::failure_at (vect_location, "bad data dependence.\n");
2357 /* Compute the scalar iteration cost. */
2358 vect_compute_single_scalar_iteration_cost (loop_vinfo);
2360 poly_uint64 saved_vectorization_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
2362 /* Check the SLP opportunities in the loop, analyze and build SLP trees. */
2363 ok = vect_analyze_slp (loop_vinfo, LOOP_VINFO_N_STMTS (loop_vinfo));
2364 if (!ok)
2365 return ok;
2367 /* If there are any SLP instances mark them as pure_slp. */
2368 bool slp = vect_make_slp_decision (loop_vinfo);
2369 if (slp)
2371 /* Find stmts that need to be both vectorized and SLPed. */
2372 vect_detect_hybrid_slp (loop_vinfo);
2374 /* Update the vectorization factor based on the SLP decision. */
2375 vect_update_vf_for_slp (loop_vinfo);
2377 /* Optimize the SLP graph with the vectorization factor fixed. */
2378 vect_optimize_slp (loop_vinfo);
2380 /* Gather the loads reachable from the SLP graph entries. */
2381 vect_gather_slp_loads (loop_vinfo);
2384 bool saved_can_use_partial_vectors_p
2385 = LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo);
2387 /* We don't expect to have to roll back to anything other than an empty
2388 set of rgroups. */
2389 gcc_assert (LOOP_VINFO_MASKS (loop_vinfo).is_empty ());
2391 /* Apply the suggested unrolling factor; this was determined by the backend
2392 during finish_cost the first time we ran the analysis for this
2393 vector mode. */
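/* E.g. a suggested unroll factor of 2 turns a vectorization factor of 4
   into an effective vectorization factor of 8 below.  */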
2394 if (loop_vinfo->suggested_unroll_factor > 1)
2395 LOOP_VINFO_VECT_FACTOR (loop_vinfo) *= loop_vinfo->suggested_unroll_factor;
2397 /* This is the point where we can re-start analysis with SLP forced off. */
2398 start_over:
2400 /* Now the vectorization factor is final. */
2401 poly_uint64 vectorization_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
2402 gcc_assert (known_ne (vectorization_factor, 0U));
2404 if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo) && dump_enabled_p ())
2406 dump_printf_loc (MSG_NOTE, vect_location,
2407 "vectorization_factor = ");
2408 dump_dec (MSG_NOTE, vectorization_factor);
2409 dump_printf (MSG_NOTE, ", niters = %wd\n",
2410 LOOP_VINFO_INT_NITERS (loop_vinfo));
2413 loop_vinfo->vector_costs = init_cost (loop_vinfo, false);
2415 /* Analyze the alignment of the data-refs in the loop.
2416 Fail if a data reference is found that cannot be vectorized. */
2418 ok = vect_analyze_data_refs_alignment (loop_vinfo);
2419 if (!ok)
2421 if (dump_enabled_p ())
2422 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2423 "bad data alignment.\n");
2424 return ok;
2427 /* Prune the list of ddrs to be tested at run-time by versioning for alias.
2428 It is important to call pruning after vect_analyze_data_ref_accesses,
2429 since we use grouping information gathered by interleaving analysis. */
2430 ok = vect_prune_runtime_alias_test_list (loop_vinfo);
2431 if (!ok)
2432 return ok;
2434 /* Do not invoke vect_enhance_data_refs_alignment for epilogue
2435 vectorization, since we do not want to add extra peeling or
2436 versioning for alignment. */
2437 if (!LOOP_VINFO_EPILOGUE_P (loop_vinfo))
2438 /* This pass will decide on using loop versioning and/or loop peeling in
2439 order to enhance the alignment of data references in the loop. */
2440 ok = vect_enhance_data_refs_alignment (loop_vinfo);
2441 if (!ok)
2442 return ok;
2444 if (slp)
2446 /* Analyze operations in the SLP instances. Note this may
2447 remove unsupported SLP instances which makes the above
2448 SLP kind detection invalid. */
2449 unsigned old_size = LOOP_VINFO_SLP_INSTANCES (loop_vinfo).length ();
2450 vect_slp_analyze_operations (loop_vinfo);
2451 if (LOOP_VINFO_SLP_INSTANCES (loop_vinfo).length () != old_size)
2453 ok = opt_result::failure_at (vect_location,
2454 "unsupported SLP instances\n");
2455 goto again;
2458 /* Check whether any load in ALL SLP instances is possibly permuted. */
2459 slp_tree load_node, slp_root;
2460 unsigned i, x;
2461 slp_instance instance;
2462 bool can_use_lanes = true;
2463 FOR_EACH_VEC_ELT (LOOP_VINFO_SLP_INSTANCES (loop_vinfo), x, instance)
2465 slp_root = SLP_INSTANCE_TREE (instance);
2466 int group_size = SLP_TREE_LANES (slp_root);
2467 tree vectype = SLP_TREE_VECTYPE (slp_root);
2468 bool loads_permuted = false;
2469 FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (instance), i, load_node)
2471 if (!SLP_TREE_LOAD_PERMUTATION (load_node).exists ())
2472 continue;
2473 unsigned j;
2474 stmt_vec_info load_info;
2475 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (load_node), j, load_info)
2476 if (SLP_TREE_LOAD_PERMUTATION (load_node)[j] != j)
2478 loads_permuted = true;
2479 break;
2483 /* If the loads and stores can be handled with load/store-lane
2484 instructions record it and move on to the next instance. */
2485 if (loads_permuted
2486 && SLP_INSTANCE_KIND (instance) == slp_inst_kind_store
2487 && vect_store_lanes_supported (vectype, group_size, false))
2489 FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (instance), i, load_node)
2491 stmt_vec_info stmt_vinfo = DR_GROUP_FIRST_ELEMENT
2492 (SLP_TREE_SCALAR_STMTS (load_node)[0]);
2493 /* Use SLP for strided accesses (or if we can't use
2494 load-lanes). */
2495 if (STMT_VINFO_STRIDED_P (stmt_vinfo)
2496 || ! vect_load_lanes_supported
2497 (STMT_VINFO_VECTYPE (stmt_vinfo),
2498 DR_GROUP_SIZE (stmt_vinfo), false))
2499 break;
2502 can_use_lanes
2503 = can_use_lanes && i == SLP_INSTANCE_LOADS (instance).length ();
2505 if (can_use_lanes && dump_enabled_p ())
2506 dump_printf_loc (MSG_NOTE, vect_location,
2507 "SLP instance %p can use load/store-lanes\n",
2508 instance);
2510 else
2512 can_use_lanes = false;
2513 break;
2517 /* If all SLP instances can use load/store-lanes abort SLP and try again
2518 with SLP disabled. */
2519 if (can_use_lanes)
2521 ok = opt_result::failure_at (vect_location,
2522 "Built SLP cancelled: can use "
2523 "load/store-lanes\n");
2524 if (dump_enabled_p ())
2525 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2526 "Built SLP cancelled: all SLP instances support "
2527 "load/store-lanes\n");
2528 goto again;
2532 /* Dissolve SLP-only groups. */
2533 vect_dissolve_slp_only_groups (loop_vinfo);
2535 /* Scan all the remaining operations in the loop that are not subject
2536 to SLP and make sure they are vectorizable. */
2537 ok = vect_analyze_loop_operations (loop_vinfo);
2538 if (!ok)
2540 if (dump_enabled_p ())
2541 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2542 "bad operation or unsupported loop bound.\n");
2543 return ok;
2546 /* For now, we don't expect to mix both masking and length approaches for one
2547 loop, so disable the use of partial vectors if both are recorded. */
2548 if (LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo)
2549 && !LOOP_VINFO_MASKS (loop_vinfo).is_empty ()
2550 && !LOOP_VINFO_LENS (loop_vinfo).is_empty ())
2552 if (dump_enabled_p ())
2553 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2554 "can't vectorize a loop with partial vectors"
2555 " because we don't expect to mix different"
2556 " approaches with partial vectors for the"
2557 " same loop.\n");
2558 LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo) = false;
2561 /* If we still have the option of using partial vectors,
2562 check whether we can generate the necessary loop controls. */
2563 if (LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo)
2564 && !vect_verify_full_masking (loop_vinfo)
2565 && !vect_verify_loop_lens (loop_vinfo))
2566 LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo) = false;
2568 /* If we're vectorizing an epilogue loop, the vectorized loop either needs
2569 to be able to handle fewer than VF scalars, or needs to have a lower VF
2570 than the main loop. */
2571 if (LOOP_VINFO_EPILOGUE_P (loop_vinfo)
2572 && !LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo)
2573 && maybe_ge (LOOP_VINFO_VECT_FACTOR (loop_vinfo),
2574 LOOP_VINFO_VECT_FACTOR (orig_loop_vinfo)))
2575 return opt_result::failure_at (vect_location,
2576 "Vectorization factor too high for"
2577 " epilogue loop.\n");
2579 /* Decide whether this loop_vinfo should use partial vectors or peeling,
2580 assuming that the loop will be used as a main loop. We will redo
2581 this analysis later if we instead decide to use the loop as an
2582 epilogue loop. */
2583 ok = vect_determine_partial_vectors_and_peeling (loop_vinfo, false);
2584 if (!ok)
2585 return ok;
2587 /* Check that the costings of the loop make vectorizing worthwhile. */
2588 res = vect_analyze_loop_costing (loop_vinfo, suggested_unroll_factor);
2589 if (res < 0)
2591 ok = opt_result::failure_at (vect_location,
2592 "Loop costings may not be worthwhile.\n");
2593 goto again;
2595 if (!res)
2596 return opt_result::failure_at (vect_location,
2597 "Loop costings not worthwhile.\n");
2599 /* If an epilogue loop is required make sure we can create one. */
2600 if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo)
2601 || LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo))
2603 if (dump_enabled_p ())
2604 dump_printf_loc (MSG_NOTE, vect_location, "epilog loop required\n");
2605 if (!vect_can_advance_ivs_p (loop_vinfo)
2606 || !slpeel_can_duplicate_loop_p (LOOP_VINFO_LOOP (loop_vinfo),
2607 single_exit (LOOP_VINFO_LOOP
2608 (loop_vinfo))))
2610 ok = opt_result::failure_at (vect_location,
2611 "not vectorized: can't create required "
2612 "epilog loop\n");
2613 goto again;
2617 /* During peeling, we need to check if the number of loop iterations is
2618 enough for both the peeled prolog loop and the vector loop. This check
2619 can be merged with the threshold check of loop versioning, so
2620 increase the threshold for this case if necessary.
2622 If we are analyzing an epilogue we still want to check what its
2623 versioning threshold would be. If we decide to vectorize the epilogues we
2624 will want to use the lowest versioning threshold of all epilogues and main
2625 loop. This will enable us to enter a vectorized epilogue even when
2626 versioning the loop. We can't simply check whether the epilogue requires
2627 versioning though since we may have skipped some versioning checks when
2628 analyzing the epilogue. For instance, checks for alias versioning will be
2629 skipped when dealing with epilogues as we assume we already checked them
2630 for the main loop. So instead we always check the 'orig_loop_vinfo'. */
2631 if (LOOP_REQUIRES_VERSIONING (orig_loop_vinfo))
2633 poly_uint64 niters_th = 0;
2634 unsigned int th = LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo);
2636 if (!vect_use_loop_mask_for_alignment_p (loop_vinfo))
2638 /* Niters for peeled prolog loop. */
2639 if (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) < 0)
2641 dr_vec_info *dr_info = LOOP_VINFO_UNALIGNED_DR (loop_vinfo);
2642 tree vectype = STMT_VINFO_VECTYPE (dr_info->stmt);
2643 niters_th += TYPE_VECTOR_SUBPARTS (vectype) - 1;
2645 else
2646 niters_th += LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo);
2649 /* Niters for at least one iteration of vectorized loop. */
2650 if (!LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))
2651 niters_th += LOOP_VINFO_VECT_FACTOR (loop_vinfo);
2652 /* One additional iteration because of peeling for gap. */
2653 if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo))
2654 niters_th += 1;
2656 /* Use the same condition as vect_transform_loop to decide when to use
2657 the cost to determine a versioning threshold. */
2658 if (vect_apply_runtime_profitability_check_p (loop_vinfo)
2659 && ordered_p (th, niters_th))
2660 niters_th = ordered_max (poly_uint64 (th), niters_th);
2662 LOOP_VINFO_VERSIONING_THRESHOLD (loop_vinfo) = niters_th;
2665 gcc_assert (known_eq (vectorization_factor,
2666 LOOP_VINFO_VECT_FACTOR (loop_vinfo)));
2668 /* Ok to vectorize! */
2669 LOOP_VINFO_VECTORIZABLE_P (loop_vinfo) = 1;
2670 return opt_result::success ();
2672 again:
2673 /* Ensure that "ok" is false (with an opt_problem if dumping is enabled). */
2674 gcc_assert (!ok);
2676 /* Try again with SLP forced off but if we didn't do any SLP there is
2677 no point in re-trying. */
2678 if (!slp)
2679 return ok;
2681 /* If there are reduction chains re-trying will fail anyway. */
2682 if (! LOOP_VINFO_REDUCTION_CHAINS (loop_vinfo).is_empty ())
2683 return ok;
2685 /* Likewise if the grouped loads or stores in the SLP cannot be handled
2686 via interleaving or lane instructions. */
2687 slp_instance instance;
2688 slp_tree node;
2689 unsigned i, j;
2690 FOR_EACH_VEC_ELT (LOOP_VINFO_SLP_INSTANCES (loop_vinfo), i, instance)
2692 stmt_vec_info vinfo;
2693 vinfo = SLP_TREE_SCALAR_STMTS (SLP_INSTANCE_TREE (instance))[0];
2694 if (! STMT_VINFO_GROUPED_ACCESS (vinfo))
2695 continue;
2696 vinfo = DR_GROUP_FIRST_ELEMENT (vinfo);
2697 unsigned int size = DR_GROUP_SIZE (vinfo);
2698 tree vectype = STMT_VINFO_VECTYPE (vinfo);
2699 if (! vect_store_lanes_supported (vectype, size, false)
2700 && ! known_eq (TYPE_VECTOR_SUBPARTS (vectype), 1U)
2701 && ! vect_grouped_store_supported (vectype, size))
2702 return opt_result::failure_at (vinfo->stmt,
2703 "unsupported grouped store\n");
2704 FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (instance), j, node)
2706 vinfo = SLP_TREE_SCALAR_STMTS (node)[0];
2707 vinfo = DR_GROUP_FIRST_ELEMENT (vinfo);
2708 bool single_element_p = !DR_GROUP_NEXT_ELEMENT (vinfo);
2709 size = DR_GROUP_SIZE (vinfo);
2710 vectype = STMT_VINFO_VECTYPE (vinfo);
2711 if (! vect_load_lanes_supported (vectype, size, false)
2712 && ! vect_grouped_load_supported (vectype, single_element_p,
2713 size))
2714 return opt_result::failure_at (vinfo->stmt,
2715 "unsupported grouped load\n");
2719 if (dump_enabled_p ())
2720 dump_printf_loc (MSG_NOTE, vect_location,
2721 "re-trying with SLP disabled\n");
2723 /* Roll back state appropriately. No SLP this time. */
2724 slp = false;
2726 /* Restore the vectorization factor as it was without SLP. */
2726 LOOP_VINFO_VECT_FACTOR (loop_vinfo) = saved_vectorization_factor;
2727 /* Free the SLP instances. */
2728 FOR_EACH_VEC_ELT (LOOP_VINFO_SLP_INSTANCES (loop_vinfo), j, instance)
2729 vect_free_slp_instance (instance);
2730 LOOP_VINFO_SLP_INSTANCES (loop_vinfo).release ();
2731 /* Reset SLP type to loop_vect on all stmts. */
2732 for (i = 0; i < LOOP_VINFO_LOOP (loop_vinfo)->num_nodes; ++i)
2734 basic_block bb = LOOP_VINFO_BBS (loop_vinfo)[i];
2735 for (gimple_stmt_iterator si = gsi_start_phis (bb);
2736 !gsi_end_p (si); gsi_next (&si))
2738 stmt_vec_info stmt_info = loop_vinfo->lookup_stmt (gsi_stmt (si));
2739 STMT_SLP_TYPE (stmt_info) = loop_vect;
2740 if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_reduction_def
2741 || STMT_VINFO_DEF_TYPE (stmt_info) == vect_double_reduction_def)
2743 /* vectorizable_reduction adjusts reduction stmt def-types,
2744 restore them to that of the PHI. */
2745 STMT_VINFO_DEF_TYPE (STMT_VINFO_REDUC_DEF (stmt_info))
2746 = STMT_VINFO_DEF_TYPE (stmt_info);
2747 STMT_VINFO_DEF_TYPE (vect_stmt_to_vectorize
2748 (STMT_VINFO_REDUC_DEF (stmt_info)))
2749 = STMT_VINFO_DEF_TYPE (stmt_info);
2752 for (gimple_stmt_iterator si = gsi_start_bb (bb);
2753 !gsi_end_p (si); gsi_next (&si))
2755 if (is_gimple_debug (gsi_stmt (si)))
2756 continue;
2757 stmt_vec_info stmt_info = loop_vinfo->lookup_stmt (gsi_stmt (si));
2758 STMT_SLP_TYPE (stmt_info) = loop_vect;
2759 if (STMT_VINFO_IN_PATTERN_P (stmt_info))
2761 stmt_vec_info pattern_stmt_info
2762 = STMT_VINFO_RELATED_STMT (stmt_info);
2763 if (STMT_VINFO_SLP_VECT_ONLY_PATTERN (pattern_stmt_info))
2764 STMT_VINFO_IN_PATTERN_P (stmt_info) = false;
2766 gimple *pattern_def_seq = STMT_VINFO_PATTERN_DEF_SEQ (stmt_info);
2767 STMT_SLP_TYPE (pattern_stmt_info) = loop_vect;
2768 for (gimple_stmt_iterator pi = gsi_start (pattern_def_seq);
2769 !gsi_end_p (pi); gsi_next (&pi))
2770 STMT_SLP_TYPE (loop_vinfo->lookup_stmt (gsi_stmt (pi)))
2771 = loop_vect;
2775 /* Free optimized alias test DDRS. */
2776 LOOP_VINFO_LOWER_BOUNDS (loop_vinfo).truncate (0);
2777 LOOP_VINFO_COMP_ALIAS_DDRS (loop_vinfo).release ();
2778 LOOP_VINFO_CHECK_UNEQUAL_ADDRS (loop_vinfo).release ();
2779 /* Reset target cost data. */
2780 delete loop_vinfo->vector_costs;
2781 loop_vinfo->vector_costs = nullptr;
2782 /* Reset accumulated rgroup information. */
2783 release_vec_loop_controls (&LOOP_VINFO_MASKS (loop_vinfo));
2784 release_vec_loop_controls (&LOOP_VINFO_LENS (loop_vinfo));
2785 /* Reset assorted flags. */
2786 LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo) = false;
2787 LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo) = false;
2788 LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo) = 0;
2789 LOOP_VINFO_VERSIONING_THRESHOLD (loop_vinfo) = 0;
2790 LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo)
2791 = saved_can_use_partial_vectors_p;
2793 goto start_over;
2796 /* Return true if vectorizing a loop using NEW_LOOP_VINFO appears
2797 to be better than vectorizing it using OLD_LOOP_VINFO. Assume that
2798 OLD_LOOP_VINFO is better unless something specifically indicates
2799 otherwise.
2801 Note that this deliberately isn't a partial order. */
2803 static bool
2804 vect_better_loop_vinfo_p (loop_vec_info new_loop_vinfo,
2805 loop_vec_info old_loop_vinfo)
2807 struct loop *loop = LOOP_VINFO_LOOP (new_loop_vinfo);
2808 gcc_assert (LOOP_VINFO_LOOP (old_loop_vinfo) == loop);
2810 poly_int64 new_vf = LOOP_VINFO_VECT_FACTOR (new_loop_vinfo);
2811 poly_int64 old_vf = LOOP_VINFO_VECT_FACTOR (old_loop_vinfo);
2813 /* Always prefer a VF of loop->simdlen over any other VF. */
2814 if (loop->simdlen)
2816 bool new_simdlen_p = known_eq (new_vf, loop->simdlen);
2817 bool old_simdlen_p = known_eq (old_vf, loop->simdlen);
2818 if (new_simdlen_p != old_simdlen_p)
2819 return new_simdlen_p;
2822 const auto *old_costs = old_loop_vinfo->vector_costs;
2823 const auto *new_costs = new_loop_vinfo->vector_costs;
2824 if (loop_vec_info main_loop = LOOP_VINFO_ORIG_LOOP_INFO (old_loop_vinfo))
2825 return new_costs->better_epilogue_loop_than_p (old_costs, main_loop);
2827 return new_costs->better_main_loop_than_p (old_costs);
2830 /* Decide whether to replace OLD_LOOP_VINFO with NEW_LOOP_VINFO. Return
2831 true if we should. */
2833 static bool
2834 vect_joust_loop_vinfos (loop_vec_info new_loop_vinfo,
2835 loop_vec_info old_loop_vinfo)
2837 if (!vect_better_loop_vinfo_p (new_loop_vinfo, old_loop_vinfo))
2838 return false;
2840 if (dump_enabled_p ())
2841 dump_printf_loc (MSG_NOTE, vect_location,
2842 "***** Preferring vector mode %s to vector mode %s\n",
2843 GET_MODE_NAME (new_loop_vinfo->vector_mode),
2844 GET_MODE_NAME (old_loop_vinfo->vector_mode));
2845 return true;
2848 /* Analyze LOOP with VECTOR_MODES[MODE_I] and as epilogue if MAIN_LOOP_VINFO is
2849 not NULL. Set AUTODETECTED_VECTOR_MODE if VOIDmode and advance
2850 MODE_I to the next mode useful to analyze.
2851 Return the loop_vinfo on success and wrapped null on failure. */
2853 static opt_loop_vec_info
2854 vect_analyze_loop_1 (class loop *loop, vec_info_shared *shared,
2855 const vect_loop_form_info *loop_form_info,
2856 loop_vec_info main_loop_vinfo,
2857 const vector_modes &vector_modes, unsigned &mode_i,
2858 machine_mode &autodetected_vector_mode,
2859 bool &fatal)
2861 loop_vec_info loop_vinfo
2862 = vect_create_loop_vinfo (loop, shared, loop_form_info, main_loop_vinfo);
2864 machine_mode vector_mode = vector_modes[mode_i];
2865 loop_vinfo->vector_mode = vector_mode;
2866 unsigned int suggested_unroll_factor = 1;
2868 /* Run the main analysis. */
2869 opt_result res = vect_analyze_loop_2 (loop_vinfo, fatal,
2870 &suggested_unroll_factor);
2871 if (dump_enabled_p ())
2872 dump_printf_loc (MSG_NOTE, vect_location,
2873 "***** Analysis %s with vector mode %s\n",
2874 res ? "succeeded" : "failed",
2875 GET_MODE_NAME (loop_vinfo->vector_mode));
2877 if (!main_loop_vinfo && suggested_unroll_factor > 1)
2879 if (dump_enabled_p ())
2880 dump_printf_loc (MSG_NOTE, vect_location,
2881 "***** Re-trying analysis for unrolling"
2882 " with unroll factor %d.\n",
2883 suggested_unroll_factor);
2884 loop_vec_info unroll_vinfo
2885 = vect_create_loop_vinfo (loop, shared, loop_form_info, main_loop_vinfo);
2886 unroll_vinfo->vector_mode = vector_mode;
2887 unroll_vinfo->suggested_unroll_factor = suggested_unroll_factor;
2888 opt_result new_res = vect_analyze_loop_2 (unroll_vinfo, fatal, NULL);
2889 if (new_res)
2891 delete loop_vinfo;
2892 loop_vinfo = unroll_vinfo;
2894 else
2895 delete unroll_vinfo;
2898 /* Remember the autodetected vector mode. */
2899 if (vector_mode == VOIDmode)
2900 autodetected_vector_mode = loop_vinfo->vector_mode;
2902 /* Advance mode_i, first skipping modes that would result in the
2903 same analysis result. */
2904 while (mode_i + 1 < vector_modes.length ()
2905 && vect_chooses_same_modes_p (loop_vinfo,
2906 vector_modes[mode_i + 1]))
2908 if (dump_enabled_p ())
2909 dump_printf_loc (MSG_NOTE, vect_location,
2910 "***** The result for vector mode %s would"
2911 " be the same\n",
2912 GET_MODE_NAME (vector_modes[mode_i + 1]));
2913 mode_i += 1;
2915 if (mode_i + 1 < vector_modes.length ()
2916 && VECTOR_MODE_P (autodetected_vector_mode)
2917 && (related_vector_mode (vector_modes[mode_i + 1],
2918 GET_MODE_INNER (autodetected_vector_mode))
2919 == autodetected_vector_mode)
2920 && (related_vector_mode (autodetected_vector_mode,
2921 GET_MODE_INNER (vector_modes[mode_i + 1]))
2922 == vector_modes[mode_i + 1]))
2924 if (dump_enabled_p ())
2925 dump_printf_loc (MSG_NOTE, vect_location,
2926 "***** Skipping vector mode %s, which would"
2927 " repeat the analysis for %s\n",
2928 GET_MODE_NAME (vector_modes[mode_i + 1]),
2929 GET_MODE_NAME (autodetected_vector_mode));
2930 mode_i += 1;
2932 mode_i++;
2934 if (!res)
2936 delete loop_vinfo;
2937 if (fatal)
2938 gcc_checking_assert (main_loop_vinfo == NULL);
2939 return opt_loop_vec_info::propagate_failure (res);
2942 return opt_loop_vec_info::success (loop_vinfo);
2945 /* Function vect_analyze_loop.
2947 Apply a set of analyses on LOOP, and create a loop_vec_info struct
2948 for it. The different analyses will record information in the
2949 loop_vec_info struct. */
2950 opt_loop_vec_info
2951 vect_analyze_loop (class loop *loop, vec_info_shared *shared)
2953 DUMP_VECT_SCOPE ("analyze_loop_nest");
2955 if (loop_outer (loop)
2956 && loop_vec_info_for_loop (loop_outer (loop))
2957 && LOOP_VINFO_VECTORIZABLE_P (loop_vec_info_for_loop (loop_outer (loop))))
2958 return opt_loop_vec_info::failure_at (vect_location,
2959 "outer-loop already vectorized.\n");
2961 if (!find_loop_nest (loop, &shared->loop_nest))
2962 return opt_loop_vec_info::failure_at
2963 (vect_location,
2964 "not vectorized: loop nest containing two or more consecutive inner"
2965 " loops cannot be vectorized\n");
2967 /* Analyze the loop form. */
2968 vect_loop_form_info loop_form_info;
2969 opt_result res = vect_analyze_loop_form (loop, &loop_form_info);
2970 if (!res)
2972 if (dump_enabled_p ())
2973 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2974 "bad loop form.\n");
2975 return opt_loop_vec_info::propagate_failure (res);
2977 if (!integer_onep (loop_form_info.assumptions))
2979 /* We consider vectorizing this loop by versioning it under
2980 some assumptions. In order to do this, we need to clear
2981 existing information computed by scev and niter analyzer. */
2982 scev_reset_htab ();
2983 free_numbers_of_iterations_estimates (loop);
2984 /* Also set a flag for this loop so that the following scev and niter
2985 analyses are done under the assumptions. */
2986 loop_constraint_set (loop, LOOP_C_FINITE);
2989 auto_vector_modes vector_modes;
2990 /* Autodetect first vector size we try. */
2991 vector_modes.safe_push (VOIDmode);
2992 unsigned int autovec_flags
2993 = targetm.vectorize.autovectorize_vector_modes (&vector_modes,
2994 loop->simdlen != 0);
2995 bool pick_lowest_cost_p = ((autovec_flags & VECT_COMPARE_COSTS)
2996 && !unlimited_cost_model (loop));
2997 machine_mode autodetected_vector_mode = VOIDmode;
2998 opt_loop_vec_info first_loop_vinfo = opt_loop_vec_info::success (NULL);
2999 unsigned int mode_i = 0;
3000 unsigned HOST_WIDE_INT simdlen = loop->simdlen;
3002 /* Keep track of the VF for each mode. Initialize all to 0 which indicates
3003 a mode has not been analyzed. */
3004 auto_vec<poly_uint64, 8> cached_vf_per_mode;
3005 for (unsigned i = 0; i < vector_modes.length (); ++i)
3006 cached_vf_per_mode.safe_push (0);
3008 /* First determine the main loop vectorization mode, either the first
3009 one that works, starting with auto-detecting the vector mode and then
3010 following the targets order of preference, or the one with the
3011 lowest cost if pick_lowest_cost_p. */
3012 while (1)
3014 bool fatal;
3015 unsigned int last_mode_i = mode_i;
3016 /* Set cached VF to -1 prior to analysis, which indicates a mode has
3017 failed. */
3018 cached_vf_per_mode[last_mode_i] = -1;
3019 opt_loop_vec_info loop_vinfo
3020 = vect_analyze_loop_1 (loop, shared, &loop_form_info,
3021 NULL, vector_modes, mode_i,
3022 autodetected_vector_mode, fatal);
3023 if (fatal)
3024 break;
3026 if (loop_vinfo)
3028 /* Analysis has been successful so update the VF value. The
3029 VF should always be a multiple of unroll_factor and we want to
3030 capture the original VF here. */
3031 cached_vf_per_mode[last_mode_i]
3032 = exact_div (LOOP_VINFO_VECT_FACTOR (loop_vinfo),
3033 loop_vinfo->suggested_unroll_factor);
3034 /* Once we hit the desired simdlen for the first time,
3035 discard any previous attempts. */
3036 if (simdlen
3037 && known_eq (LOOP_VINFO_VECT_FACTOR (loop_vinfo), simdlen))
3039 delete first_loop_vinfo;
3040 first_loop_vinfo = opt_loop_vec_info::success (NULL);
3041 simdlen = 0;
3043 else if (pick_lowest_cost_p
3044 && first_loop_vinfo
3045 && vect_joust_loop_vinfos (loop_vinfo, first_loop_vinfo))
3047 /* Pick loop_vinfo over first_loop_vinfo. */
3048 delete first_loop_vinfo;
3049 first_loop_vinfo = opt_loop_vec_info::success (NULL);
3051 if (first_loop_vinfo == NULL)
3052 first_loop_vinfo = loop_vinfo;
3053 else
3055 delete loop_vinfo;
3056 loop_vinfo = opt_loop_vec_info::success (NULL);
3059 /* Commit to first_loop_vinfo if we have no reason to try
3060 alternatives. */
3061 if (!simdlen && !pick_lowest_cost_p)
3062 break;
3064 if (mode_i == vector_modes.length ()
3065 || autodetected_vector_mode == VOIDmode)
3066 break;
3068 /* Try the next biggest vector size. */
3069 if (dump_enabled_p ())
3070 dump_printf_loc (MSG_NOTE, vect_location,
3071 "***** Re-trying analysis with vector mode %s\n",
3072 GET_MODE_NAME (vector_modes[mode_i]));
3074 if (!first_loop_vinfo)
3075 return opt_loop_vec_info::propagate_failure (res);
3077 if (dump_enabled_p ())
3078 dump_printf_loc (MSG_NOTE, vect_location,
3079 "***** Choosing vector mode %s\n",
3080 GET_MODE_NAME (first_loop_vinfo->vector_mode));
3082 /* Only vectorize epilogues if PARAM_VECT_EPILOGUES_NOMASK is
3083 enabled, SIMDUID is not set, it is the innermost loop and we have
3084 either already found the loop's SIMDLEN or there was no SIMDLEN to
3085 begin with.
3086 TODO: Enable epilogue vectorization for loops with SIMDUID set. */
3087 bool vect_epilogues = (!simdlen
3088 && loop->inner == NULL
3089 && param_vect_epilogues_nomask
3090 && LOOP_VINFO_PEELING_FOR_NITER (first_loop_vinfo)
3091 && !loop->simduid);
3092 if (!vect_epilogues)
3093 return first_loop_vinfo;
3095 /* Now analyze first_loop_vinfo for epilogue vectorization. */
3096 poly_uint64 lowest_th = LOOP_VINFO_VERSIONING_THRESHOLD (first_loop_vinfo);
3098 /* For epilogues start the analysis from the first mode. The motivation
3099 behind starting from the beginning comes from cases where the VECTOR_MODES
3100 array may contain length-agnostic and length-specific modes. Their
3101 ordering is not guaranteed, so we could end up picking a mode for the main
3102 loop that is after the epilogue's optimal mode. */
3103 vector_modes[0] = autodetected_vector_mode;
3104 mode_i = 0;
3106 bool supports_partial_vectors =
3107 partial_vectors_supported_p () && param_vect_partial_vector_usage != 0;
3108 poly_uint64 first_vinfo_vf = LOOP_VINFO_VECT_FACTOR (first_loop_vinfo);
3110 while (1)
3112 /* If the target does not support partial vectors we can shorten the
3113 number of modes to analyze for the epilogue as we know we can't pick a
3114 mode that would lead to a VF at least as big as the
3115 FIRST_VINFO_VF. */
3116 if (!supports_partial_vectors
3117 && maybe_ge (cached_vf_per_mode[mode_i], first_vinfo_vf))
3119 mode_i++;
3120 if (mode_i == vector_modes.length ())
3121 break;
3122 continue;
3125 if (dump_enabled_p ())
3126 dump_printf_loc (MSG_NOTE, vect_location,
3127 "***** Re-trying epilogue analysis with vector "
3128 "mode %s\n", GET_MODE_NAME (vector_modes[mode_i]));
3130 bool fatal;
3131 opt_loop_vec_info loop_vinfo
3132 = vect_analyze_loop_1 (loop, shared, &loop_form_info,
3133 first_loop_vinfo,
3134 vector_modes, mode_i,
3135 autodetected_vector_mode, fatal);
3136 if (fatal)
3137 break;
3139 if (loop_vinfo)
3141 if (pick_lowest_cost_p)
3143 /* Keep trying to roll back vectorization attempts while the
3144 loop_vec_infos they produced were worse than this one. */
3145 vec<loop_vec_info> &vinfos = first_loop_vinfo->epilogue_vinfos;
3146 while (!vinfos.is_empty ()
3147 && vect_joust_loop_vinfos (loop_vinfo, vinfos.last ()))
3149 gcc_assert (vect_epilogues);
3150 delete vinfos.pop ();
3153 /* For now only allow one epilogue loop. */
3154 if (first_loop_vinfo->epilogue_vinfos.is_empty ())
3156 first_loop_vinfo->epilogue_vinfos.safe_push (loop_vinfo);
3157 poly_uint64 th = LOOP_VINFO_VERSIONING_THRESHOLD (loop_vinfo);
3158 gcc_assert (!LOOP_REQUIRES_VERSIONING (loop_vinfo)
3159 || maybe_ne (lowest_th, 0U));
3160 /* Keep track of the known smallest versioning
3161 threshold. */
3162 if (ordered_p (lowest_th, th))
3163 lowest_th = ordered_min (lowest_th, th);
3165 else
3167 delete loop_vinfo;
3168 loop_vinfo = opt_loop_vec_info::success (NULL);
3171 /* For now only allow one epilogue loop, but allow
3172 pick_lowest_cost_p to replace it, so commit to the
3173 first epilogue if we have no reason to try alternatives. */
3174 if (!pick_lowest_cost_p)
3175 break;
3178 if (mode_i == vector_modes.length ())
3179 break;
3183 if (!first_loop_vinfo->epilogue_vinfos.is_empty ())
3185 LOOP_VINFO_VERSIONING_THRESHOLD (first_loop_vinfo) = lowest_th;
3186 if (dump_enabled_p ())
3187 dump_printf_loc (MSG_NOTE, vect_location,
3188 "***** Choosing epilogue vector mode %s\n",
3189 GET_MODE_NAME
3190 (first_loop_vinfo->epilogue_vinfos[0]->vector_mode));
3193 return first_loop_vinfo;
3196 /* Return true if there is an in-order reduction function for CODE, storing
3197 it in *REDUC_FN if so. */
3199 static bool
3200 fold_left_reduction_fn (code_helper code, internal_fn *reduc_fn)
3202 if (code == PLUS_EXPR)
3204 *reduc_fn = IFN_FOLD_LEFT_PLUS;
3205 return true;
3207 return false;
3210 /* Function reduction_fn_for_scalar_code
3212 Input:
3213 CODE - tree_code of a reduction operation.
3215 Output:
3216 REDUC_FN - the corresponding internal function to be used to reduce the
3217 vector of partial results into a single scalar result, or IFN_LAST
3218 if the operation is a supported reduction operation, but does not have
3219 such an internal function.
3221 Return FALSE if CODE currently cannot be vectorized as a reduction. */
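/* For example, MAX_EXPR maps to IFN_REDUC_MAX, while MULT_EXPR is accepted
   but gets IFN_LAST because no direct internal reduction function exists,
   so the caller must reduce the vector of partial results some other way.  */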
3223 bool
3224 reduction_fn_for_scalar_code (code_helper code, internal_fn *reduc_fn)
3226 if (code.is_tree_code ())
3227 switch (tree_code (code))
3229 case MAX_EXPR:
3230 *reduc_fn = IFN_REDUC_MAX;
3231 return true;
3233 case MIN_EXPR:
3234 *reduc_fn = IFN_REDUC_MIN;
3235 return true;
3237 case PLUS_EXPR:
3238 *reduc_fn = IFN_REDUC_PLUS;
3239 return true;
3241 case BIT_AND_EXPR:
3242 *reduc_fn = IFN_REDUC_AND;
3243 return true;
3245 case BIT_IOR_EXPR:
3246 *reduc_fn = IFN_REDUC_IOR;
3247 return true;
3249 case BIT_XOR_EXPR:
3250 *reduc_fn = IFN_REDUC_XOR;
3251 return true;
3253 case MULT_EXPR:
3254 case MINUS_EXPR:
3255 *reduc_fn = IFN_LAST;
3256 return true;
3258 default:
3259 return false;
3261 else
3262 switch (combined_fn (code))
3264 CASE_CFN_FMAX:
3265 *reduc_fn = IFN_REDUC_FMAX;
3266 return true;
3268 CASE_CFN_FMIN:
3269 *reduc_fn = IFN_REDUC_FMIN;
3270 return true;
3272 default:
3273 return false;
3277 /* If there is a neutral value X such that a reduction would not be affected
3278 by the introduction of additional X elements, return that X, otherwise
3279 return null. CODE is the code of the reduction and SCALAR_TYPE is the type
3280 of the scalar elements. If the reduction has just a single initial value
3281 then INITIAL_VALUE is that value, otherwise it is null. */
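/* For example, 0 is the neutral value for PLUS_EXPR, 1 for MULT_EXPR and an
   all-ones constant for BIT_AND_EXPR, while MIN_EXPR and MAX_EXPR can only
   reuse the single initial value itself.  */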
3283 tree
3284 neutral_op_for_reduction (tree scalar_type, code_helper code,
3285 tree initial_value)
3287 if (code.is_tree_code ())
3288 switch (tree_code (code))
3290 case WIDEN_SUM_EXPR:
3291 case DOT_PROD_EXPR:
3292 case SAD_EXPR:
3293 case PLUS_EXPR:
3294 case MINUS_EXPR:
3295 case BIT_IOR_EXPR:
3296 case BIT_XOR_EXPR:
3297 return build_zero_cst (scalar_type);
3299 case MULT_EXPR:
3300 return build_one_cst (scalar_type);
3302 case BIT_AND_EXPR:
3303 return build_all_ones_cst (scalar_type);
3305 case MAX_EXPR:
3306 case MIN_EXPR:
3307 return initial_value;
3309 default:
3310 return NULL_TREE;
3312 else
3313 switch (combined_fn (code))
3315 CASE_CFN_FMIN:
3316 CASE_CFN_FMAX:
3317 return initial_value;
3319 default:
3320 return NULL_TREE;
3324 /* Error reporting helper for vect_is_simple_reduction below. GIMPLE statement
3325 STMT is printed with a message MSG. */
3327 static void
3328 report_vect_op (dump_flags_t msg_type, gimple *stmt, const char *msg)
3330 dump_printf_loc (msg_type, vect_location, "%s%G", msg, stmt);
3333 /* Return true if we need an in-order reduction for operation CODE
3334 on type TYPE. */
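/* For instance, a float summation like "for (i = 0; i < n; i++) s += a[i];"
   compiled without -fassociative-math must preserve the original evaluation
   order, so it needs a fold-left (in-order) reduction.  */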
3337 bool
3338 needs_fold_left_reduction_p (tree type, code_helper code)
3340 /* CHECKME: check for !flag_finite_math_only too? */
3341 if (SCALAR_FLOAT_TYPE_P (type))
3343 if (code.is_tree_code ())
3344 switch (tree_code (code))
3346 case MIN_EXPR:
3347 case MAX_EXPR:
3348 return false;
3350 default:
3351 return !flag_associative_math;
3353 else
3354 switch (combined_fn (code))
3356 CASE_CFN_FMIN:
3357 CASE_CFN_FMAX:
3358 return false;
3360 default:
3361 return !flag_associative_math;
3365 if (INTEGRAL_TYPE_P (type))
3366 return (!code.is_tree_code ()
3367 || !operation_no_trapping_overflow (type, tree_code (code)));
3369 if (SAT_FIXED_POINT_TYPE_P (type))
3370 return true;
3372 return false;
3375 /* Return true if the reduction PHI in LOOP with latch arg LOOP_ARG
3376 has a handled computation expression. Store the main reduction
3377 operation in *CODE. */
3379 static bool
3380 check_reduction_path (dump_user_location_t loc, loop_p loop, gphi *phi,
3381 tree loop_arg, code_helper *code,
3382 vec<std::pair<ssa_op_iter, use_operand_p> > &path)
3384 auto_bitmap visited;
3385 tree lookfor = PHI_RESULT (phi);
3386 ssa_op_iter curri;
3387 use_operand_p curr = op_iter_init_phiuse (&curri, phi, SSA_OP_USE);
3388 while (USE_FROM_PTR (curr) != loop_arg)
3389 curr = op_iter_next_use (&curri);
3390 curri.i = curri.numops;
3393 path.safe_push (std::make_pair (curri, curr));
3394 tree use = USE_FROM_PTR (curr);
3395 if (use == lookfor)
3396 break;
3397 gimple *def = SSA_NAME_DEF_STMT (use);
3398 if (gimple_nop_p (def)
3399 || ! flow_bb_inside_loop_p (loop, gimple_bb (def)))
3401 pop:
3404 std::pair<ssa_op_iter, use_operand_p> x = path.pop ();
3405 curri = x.first;
3406 curr = x.second;
3408 curr = op_iter_next_use (&curri);
3409 /* Skip already visited or non-SSA operands (from iterating
3410 over PHI args). */
3411 while (curr != NULL_USE_OPERAND_P
3412 && (TREE_CODE (USE_FROM_PTR (curr)) != SSA_NAME
3413 || ! bitmap_set_bit (visited,
3414 SSA_NAME_VERSION
3415 (USE_FROM_PTR (curr)))));
3417 while (curr == NULL_USE_OPERAND_P && ! path.is_empty ());
3418 if (curr == NULL_USE_OPERAND_P)
3419 break;
3421 else
3423 if (gimple_code (def) == GIMPLE_PHI)
3424 curr = op_iter_init_phiuse (&curri, as_a <gphi *>(def), SSA_OP_USE);
3425 else
3426 curr = op_iter_init_use (&curri, def, SSA_OP_USE);
3427 while (curr != NULL_USE_OPERAND_P
3428 && (TREE_CODE (USE_FROM_PTR (curr)) != SSA_NAME
3429 || ! bitmap_set_bit (visited,
3430 SSA_NAME_VERSION
3431 (USE_FROM_PTR (curr)))))
3432 curr = op_iter_next_use (&curri);
3433 if (curr == NULL_USE_OPERAND_P)
3434 goto pop;
3437 while (1);
3438 if (dump_file && (dump_flags & TDF_DETAILS))
3440 dump_printf_loc (MSG_NOTE, loc, "reduction path: ");
3441 unsigned i;
3442 std::pair<ssa_op_iter, use_operand_p> *x;
3443 FOR_EACH_VEC_ELT (path, i, x)
3444 dump_printf (MSG_NOTE, "%T ", USE_FROM_PTR (x->second));
3445 dump_printf (MSG_NOTE, "\n");
3448 /* Check whether the reduction path detected is valid. */
3449 bool fail = path.length () == 0;
3450 bool neg = false;
3451 int sign = -1;
3452 *code = ERROR_MARK;
3453 for (unsigned i = 1; i < path.length (); ++i)
3455 gimple *use_stmt = USE_STMT (path[i].second);
3456 gimple_match_op op;
3457 if (!gimple_extract_op (use_stmt, &op))
3459 fail = true;
3460 break;
3462 unsigned int opi = op.num_ops;
3463 if (gassign *assign = dyn_cast<gassign *> (use_stmt))
 3465 /* The following makes sure we can compute the operand index
 3466 easily plus it mostly disallows chaining via COND_EXPR condition
 3467 operands. */
3468 for (opi = 0; opi < op.num_ops; ++opi)
3469 if (gimple_assign_rhs1_ptr (assign) + opi == path[i].second->use)
3470 break;
3472 else if (gcall *call = dyn_cast<gcall *> (use_stmt))
3474 for (opi = 0; opi < op.num_ops; ++opi)
3475 if (gimple_call_arg_ptr (call, opi) == path[i].second->use)
3476 break;
3478 if (opi == op.num_ops)
3480 fail = true;
3481 break;
3483 op.code = canonicalize_code (op.code, op.type);
3484 if (op.code == MINUS_EXPR)
3486 op.code = PLUS_EXPR;
3487 /* Track whether we negate the reduction value each iteration. */
3488 if (op.ops[1] == op.ops[opi])
3489 neg = ! neg;
3491 if (CONVERT_EXPR_CODE_P (op.code)
3492 && tree_nop_conversion_p (op.type, TREE_TYPE (op.ops[0])))
3494 else if (*code == ERROR_MARK)
3496 *code = op.code;
3497 sign = TYPE_SIGN (op.type);
3499 else if (op.code != *code)
3501 fail = true;
3502 break;
3504 else if ((op.code == MIN_EXPR
3505 || op.code == MAX_EXPR)
3506 && sign != TYPE_SIGN (op.type))
3508 fail = true;
3509 break;
 3511 /* Check that the op is used in only a single stmt. For the
 3512 non-value-changing tail and the last stmt allow out-of-loop uses.
3513 ??? We could relax this and handle arbitrary live stmts by
3514 forcing a scalar epilogue for example. */
3515 imm_use_iterator imm_iter;
3516 gimple *op_use_stmt;
3517 unsigned cnt = 0;
3518 FOR_EACH_IMM_USE_STMT (op_use_stmt, imm_iter, op.ops[opi])
3519 if (!is_gimple_debug (op_use_stmt)
3520 && (*code != ERROR_MARK
3521 || flow_bb_inside_loop_p (loop, gimple_bb (op_use_stmt))))
3523 /* We want to allow x + x but not x < 1 ? x : 2. */
3524 if (is_gimple_assign (op_use_stmt)
3525 && gimple_assign_rhs_code (op_use_stmt) == COND_EXPR)
3527 use_operand_p use_p;
3528 FOR_EACH_IMM_USE_ON_STMT (use_p, imm_iter)
3529 cnt++;
3531 else
3532 cnt++;
3534 if (cnt != 1)
3536 fail = true;
3537 break;
3540 return ! fail && ! neg && *code != ERROR_MARK;
3543 bool
3544 check_reduction_path (dump_user_location_t loc, loop_p loop, gphi *phi,
3545 tree loop_arg, enum tree_code code)
3547 auto_vec<std::pair<ssa_op_iter, use_operand_p> > path;
3548 code_helper code_;
3549 return (check_reduction_path (loc, loop, phi, loop_arg, &code_, path)
3550 && code_ == code);
3555 /* Function vect_is_simple_reduction
3557 (1) Detect a cross-iteration def-use cycle that represents a simple
3558 reduction computation. We look for the following pattern:
3560 loop_header:
3561 a1 = phi < a0, a2 >
3562 a3 = ...
 3563 a2 = operation (a3, a1)

      or

 3567 a3 = ...
3568 loop_header:
3569 a1 = phi < a0, a2 >
3570 a2 = operation (a3, a1)
3572 such that:
3573 1. operation is commutative and associative and it is safe to
3574 change the order of the computation
3575 2. no uses for a2 in the loop (a2 is used out of the loop)
3576 3. no uses of a1 in the loop besides the reduction operation
3577 4. no uses of a1 outside the loop.
3579 Conditions 1,4 are tested here.
3580 Conditions 2,3 are tested in vect_mark_stmts_to_be_vectorized.
3582 (2) Detect a cross-iteration def-use cycle in nested loops, i.e.,
3583 nested cycles.
3585 (3) Detect cycles of phi nodes in outer-loop vectorization, i.e., double
3586 reductions:
3588 a1 = phi < a0, a2 >
3589 inner loop (def of a3)
3590 a2 = phi < a3 >
 3592 (4) Detect condition expressions, i.e.:
3593 for (int i = 0; i < N; i++)
3594 if (a[i] < val)
3595 ret_val = a[i];
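
      A source-level example of (3), added for illustration:

        int sum = 0;
        for (i = 0; i < N; i++)
          for (j = 0; j < M; j++)
            sum += a[i][j];

      when the outer loop is vectorized, the outer-loop PHI for sum and
      the inner-loop PHI feeding it form such a double reduction.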
3599 static stmt_vec_info
3600 vect_is_simple_reduction (loop_vec_info loop_info, stmt_vec_info phi_info,
3601 bool *double_reduc, bool *reduc_chain_p)
3603 gphi *phi = as_a <gphi *> (phi_info->stmt);
3604 gimple *phi_use_stmt = NULL;
3605 imm_use_iterator imm_iter;
3606 use_operand_p use_p;
3608 *double_reduc = false;
3609 *reduc_chain_p = false;
3610 STMT_VINFO_REDUC_TYPE (phi_info) = TREE_CODE_REDUCTION;
3612 tree phi_name = PHI_RESULT (phi);
3613 /* ??? If there are no uses of the PHI result the inner loop reduction
3614 won't be detected as possibly double-reduction by vectorizable_reduction
3615 because that tries to walk the PHI arg from the preheader edge which
3616 can be constant. See PR60382. */
3617 if (has_zero_uses (phi_name))
3618 return NULL;
3619 class loop *loop = (gimple_bb (phi))->loop_father;
3620 unsigned nphi_def_loop_uses = 0;
3621 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, phi_name)
3623 gimple *use_stmt = USE_STMT (use_p);
3624 if (is_gimple_debug (use_stmt))
3625 continue;
3627 if (!flow_bb_inside_loop_p (loop, gimple_bb (use_stmt)))
3629 if (dump_enabled_p ())
3630 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3631 "intermediate value used outside loop.\n");
3633 return NULL;
3636 nphi_def_loop_uses++;
3637 phi_use_stmt = use_stmt;
3640 tree latch_def = PHI_ARG_DEF_FROM_EDGE (phi, loop_latch_edge (loop));
3641 if (TREE_CODE (latch_def) != SSA_NAME)
3643 if (dump_enabled_p ())
3644 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3645 "reduction: not ssa_name: %T\n", latch_def);
3646 return NULL;
3649 stmt_vec_info def_stmt_info = loop_info->lookup_def (latch_def);
3650 if (!def_stmt_info
3651 || !flow_bb_inside_loop_p (loop, gimple_bb (def_stmt_info->stmt)))
3652 return NULL;
3654 bool nested_in_vect_loop
3655 = flow_loop_nested_p (LOOP_VINFO_LOOP (loop_info), loop);
3656 unsigned nlatch_def_loop_uses = 0;
3657 auto_vec<gphi *, 3> lcphis;
3658 bool inner_loop_of_double_reduc = false;
3659 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, latch_def)
3661 gimple *use_stmt = USE_STMT (use_p);
3662 if (is_gimple_debug (use_stmt))
3663 continue;
3664 if (flow_bb_inside_loop_p (loop, gimple_bb (use_stmt)))
3665 nlatch_def_loop_uses++;
3666 else
3668 /* We can have more than one loop-closed PHI. */
3669 lcphis.safe_push (as_a <gphi *> (use_stmt));
3670 if (nested_in_vect_loop
3671 && (STMT_VINFO_DEF_TYPE (loop_info->lookup_stmt (use_stmt))
3672 == vect_double_reduction_def))
3673 inner_loop_of_double_reduc = true;
 3677 /* If we are vectorizing an inner reduction we execute it in the
 3678 original order only when we are not dealing with a double
 3679 reduction. */
3680 if (nested_in_vect_loop && !inner_loop_of_double_reduc)
3682 if (dump_enabled_p ())
3683 report_vect_op (MSG_NOTE, def_stmt_info->stmt,
3684 "detected nested cycle: ");
3685 return def_stmt_info;
3688 /* When the inner loop of a double reduction ends up with more than
3689 one loop-closed PHI we have failed to classify alternate such
3690 PHIs as double reduction, leading to wrong code. See PR103237. */
3691 if (inner_loop_of_double_reduc && lcphis.length () != 1)
3693 if (dump_enabled_p ())
3694 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3695 "unhandle double reduction\n");
3696 return NULL;
 3699 /* If this isn't a nested cycle or if the nested cycle reduction value
 3700 is used outside of the inner loop we cannot handle uses of the reduction
 3701 value. */
3702 if (nlatch_def_loop_uses > 1 || nphi_def_loop_uses > 1)
3704 if (dump_enabled_p ())
3705 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3706 "reduction used in loop.\n");
3707 return NULL;
3710 /* If DEF_STMT is a phi node itself, we expect it to have a single argument
3711 defined in the inner loop. */
3712 if (gphi *def_stmt = dyn_cast <gphi *> (def_stmt_info->stmt))
3714 tree op1 = PHI_ARG_DEF (def_stmt, 0);
3715 if (gimple_phi_num_args (def_stmt) != 1
3716 || TREE_CODE (op1) != SSA_NAME)
3718 if (dump_enabled_p ())
3719 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3720 "unsupported phi node definition.\n");
3722 return NULL;
3725 gimple *def1 = SSA_NAME_DEF_STMT (op1);
3726 if (gimple_bb (def1)
3727 && flow_bb_inside_loop_p (loop, gimple_bb (def_stmt))
3728 && loop->inner
3729 && flow_bb_inside_loop_p (loop->inner, gimple_bb (def1))
3730 && (is_gimple_assign (def1) || is_gimple_call (def1))
3731 && is_a <gphi *> (phi_use_stmt)
3732 && flow_bb_inside_loop_p (loop->inner, gimple_bb (phi_use_stmt)))
3734 if (dump_enabled_p ())
3735 report_vect_op (MSG_NOTE, def_stmt,
3736 "detected double reduction: ");
3738 *double_reduc = true;
3739 return def_stmt_info;
3742 return NULL;
 3745 /* Look for the expression computing latch_def from the loop PHI result. */
3746 auto_vec<std::pair<ssa_op_iter, use_operand_p> > path;
3747 code_helper code;
3748 if (check_reduction_path (vect_location, loop, phi, latch_def, &code,
3749 path))
3751 STMT_VINFO_REDUC_CODE (phi_info) = code;
3752 if (code == COND_EXPR && !nested_in_vect_loop)
3753 STMT_VINFO_REDUC_TYPE (phi_info) = COND_REDUCTION;
3755 /* Fill in STMT_VINFO_REDUC_IDX and gather stmts for an SLP
3756 reduction chain for which the additional restriction is that
3757 all operations in the chain are the same. */
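      /* Illustrative example (not from the original source): for

	   s = s + a[2*i];
	   s = s + a[2*i+1];

	 the two PLUS_EXPR statements form a reduction chain; the chain
	 members are linked via REDUC_GROUP_FIRST/NEXT_ELEMENT below.  */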
3758 auto_vec<stmt_vec_info, 8> reduc_chain;
3759 unsigned i;
3760 bool is_slp_reduc = !nested_in_vect_loop && code != COND_EXPR;
3761 for (i = path.length () - 1; i >= 1; --i)
3763 gimple *stmt = USE_STMT (path[i].second);
3764 stmt_vec_info stmt_info = loop_info->lookup_stmt (stmt);
3765 gimple_match_op op;
3766 if (!gimple_extract_op (stmt, &op))
3767 gcc_unreachable ();
3768 if (gassign *assign = dyn_cast<gassign *> (stmt))
3769 STMT_VINFO_REDUC_IDX (stmt_info)
3770 = path[i].second->use - gimple_assign_rhs1_ptr (assign);
3771 else
3773 gcall *call = as_a<gcall *> (stmt);
3774 STMT_VINFO_REDUC_IDX (stmt_info)
3775 = path[i].second->use - gimple_call_arg_ptr (call, 0);
3777 bool leading_conversion = (CONVERT_EXPR_CODE_P (op.code)
3778 && (i == 1 || i == path.length () - 1));
3779 if ((op.code != code && !leading_conversion)
3780 /* We can only handle the final value in epilogue
3781 generation for reduction chains. */
3782 || (i != 1 && !has_single_use (gimple_get_lhs (stmt))))
3783 is_slp_reduc = false;
 3784 /* For reduction chains we support trailing/leading
3785 conversions. We do not store those in the actual chain. */
3786 if (leading_conversion)
3787 continue;
3788 reduc_chain.safe_push (stmt_info);
3790 if (is_slp_reduc && reduc_chain.length () > 1)
3792 for (unsigned i = 0; i < reduc_chain.length () - 1; ++i)
3794 REDUC_GROUP_FIRST_ELEMENT (reduc_chain[i]) = reduc_chain[0];
3795 REDUC_GROUP_NEXT_ELEMENT (reduc_chain[i]) = reduc_chain[i+1];
3797 REDUC_GROUP_FIRST_ELEMENT (reduc_chain.last ()) = reduc_chain[0];
3798 REDUC_GROUP_NEXT_ELEMENT (reduc_chain.last ()) = NULL;
3800 /* Save the chain for further analysis in SLP detection. */
3801 LOOP_VINFO_REDUCTION_CHAINS (loop_info).safe_push (reduc_chain[0]);
3802 REDUC_GROUP_SIZE (reduc_chain[0]) = reduc_chain.length ();
3804 *reduc_chain_p = true;
3805 if (dump_enabled_p ())
3806 dump_printf_loc (MSG_NOTE, vect_location,
3807 "reduction: detected reduction chain\n");
3809 else if (dump_enabled_p ())
3810 dump_printf_loc (MSG_NOTE, vect_location,
3811 "reduction: detected reduction\n");
3813 return def_stmt_info;
3816 if (dump_enabled_p ())
3817 dump_printf_loc (MSG_NOTE, vect_location,
3818 "reduction: unknown pattern\n");
3820 return NULL;
3823 /* Estimate the number of peeled epilogue iterations for LOOP_VINFO.
3824 PEEL_ITERS_PROLOGUE is the number of peeled prologue iterations,
3825 or -1 if not known. */
3827 static int
3828 vect_get_peel_iters_epilogue (loop_vec_info loop_vinfo, int peel_iters_prologue)
3830 int assumed_vf = vect_vf_for_cost (loop_vinfo);
3831 if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo) || peel_iters_prologue == -1)
3833 if (dump_enabled_p ())
3834 dump_printf_loc (MSG_NOTE, vect_location,
3835 "cost model: epilogue peel iters set to vf/2 "
3836 "because loop iterations are unknown .\n");
3837 return assumed_vf / 2;
3839 else
3841 int niters = LOOP_VINFO_INT_NITERS (loop_vinfo);
3842 peel_iters_prologue = MIN (niters, peel_iters_prologue);
3843 int peel_iters_epilogue = (niters - peel_iters_prologue) % assumed_vf;
 3844 /* If we need to peel for gaps but no epilogue peeling would otherwise
 3845 be required, we have to peel VF iterations. */
3846 if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo) && !peel_iters_epilogue)
3847 peel_iters_epilogue = assumed_vf;
3848 return peel_iters_epilogue;
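/* A worked example with made-up numbers: for NITERS = 100, an assumed
   VF of 8 and PEEL_ITERS_PROLOGUE = 3 this returns (100 - 3) % 8 = 1;
   with unknown NITERS it simply assumes VF / 2 = 4 epilogue iterations.  */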
3852 /* Calculate cost of peeling the loop PEEL_ITERS_PROLOGUE times. */
 3853 int
 3854 vect_get_known_peeling_cost (loop_vec_info loop_vinfo, int peel_iters_prologue,
3855 int *peel_iters_epilogue,
3856 stmt_vector_for_cost *scalar_cost_vec,
3857 stmt_vector_for_cost *prologue_cost_vec,
3858 stmt_vector_for_cost *epilogue_cost_vec)
3860 int retval = 0;
3862 *peel_iters_epilogue
3863 = vect_get_peel_iters_epilogue (loop_vinfo, peel_iters_prologue);
3865 if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
 3867 /* If peeled iterations are known but the number of scalar loop
 3868 iterations is unknown, count a taken branch per peeled loop. */
3869 if (peel_iters_prologue > 0)
3870 retval = record_stmt_cost (prologue_cost_vec, 1, cond_branch_taken,
3871 vect_prologue);
3872 if (*peel_iters_epilogue > 0)
3873 retval += record_stmt_cost (epilogue_cost_vec, 1, cond_branch_taken,
3874 vect_epilogue);
3877 stmt_info_for_cost *si;
3878 int j;
3879 if (peel_iters_prologue)
3880 FOR_EACH_VEC_ELT (*scalar_cost_vec, j, si)
3881 retval += record_stmt_cost (prologue_cost_vec,
3882 si->count * peel_iters_prologue,
3883 si->kind, si->stmt_info, si->misalign,
3884 vect_prologue);
3885 if (*peel_iters_epilogue)
3886 FOR_EACH_VEC_ELT (*scalar_cost_vec, j, si)
3887 retval += record_stmt_cost (epilogue_cost_vec,
3888 si->count * *peel_iters_epilogue,
3889 si->kind, si->stmt_info, si->misalign,
3890 vect_epilogue);
3892 return retval;
3895 /* Function vect_estimate_min_profitable_iters
3897 Return the number of iterations required for the vector version of the
3898 loop to be profitable relative to the cost of the scalar version of the
3899 loop.
 3901 *RET_MIN_PROFITABLE_NITERS is a cost model profitability threshold
 3902 of iterations for vectorization. A value of -1 means loop vectorization
 3903 is not profitable. This returned value may be used for a dynamic
 3904 profitability check.
3906 *RET_MIN_PROFITABLE_ESTIMATE is a profitability threshold to be used
3907 for static check against estimated number of iterations. */
3909 static void
3910 vect_estimate_min_profitable_iters (loop_vec_info loop_vinfo,
3911 int *ret_min_profitable_niters,
3912 int *ret_min_profitable_estimate,
3913 unsigned *suggested_unroll_factor)
3915 int min_profitable_iters;
3916 int min_profitable_estimate;
3917 int peel_iters_prologue;
3918 int peel_iters_epilogue;
3919 unsigned vec_inside_cost = 0;
3920 int vec_outside_cost = 0;
3921 unsigned vec_prologue_cost = 0;
3922 unsigned vec_epilogue_cost = 0;
3923 int scalar_single_iter_cost = 0;
3924 int scalar_outside_cost = 0;
3925 int assumed_vf = vect_vf_for_cost (loop_vinfo);
3926 int npeel = LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo);
3927 vector_costs *target_cost_data = loop_vinfo->vector_costs;
3929 /* Cost model disabled. */
3930 if (unlimited_cost_model (LOOP_VINFO_LOOP (loop_vinfo)))
3932 if (dump_enabled_p ())
3933 dump_printf_loc (MSG_NOTE, vect_location, "cost model disabled.\n");
3934 *ret_min_profitable_niters = 0;
3935 *ret_min_profitable_estimate = 0;
3936 return;
3939 /* Requires loop versioning tests to handle misalignment. */
3940 if (LOOP_REQUIRES_VERSIONING_FOR_ALIGNMENT (loop_vinfo))
3942 /* FIXME: Make cost depend on complexity of individual check. */
3943 unsigned len = LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).length ();
3944 (void) add_stmt_cost (target_cost_data, len, scalar_stmt, vect_prologue);
3945 if (dump_enabled_p ())
3946 dump_printf (MSG_NOTE,
3947 "cost model: Adding cost of checks for loop "
3948 "versioning to treat misalignment.\n");
3951 /* Requires loop versioning with alias checks. */
3952 if (LOOP_REQUIRES_VERSIONING_FOR_ALIAS (loop_vinfo))
3954 /* FIXME: Make cost depend on complexity of individual check. */
3955 unsigned len = LOOP_VINFO_COMP_ALIAS_DDRS (loop_vinfo).length ();
3956 (void) add_stmt_cost (target_cost_data, len, scalar_stmt, vect_prologue);
3957 len = LOOP_VINFO_CHECK_UNEQUAL_ADDRS (loop_vinfo).length ();
3958 if (len)
3959 /* Count LEN - 1 ANDs and LEN comparisons. */
3960 (void) add_stmt_cost (target_cost_data, len * 2 - 1,
3961 scalar_stmt, vect_prologue);
3962 len = LOOP_VINFO_LOWER_BOUNDS (loop_vinfo).length ();
3963 if (len)
3965 /* Count LEN - 1 ANDs and LEN comparisons. */
3966 unsigned int nstmts = len * 2 - 1;
3967 /* +1 for each bias that needs adding. */
3968 for (unsigned int i = 0; i < len; ++i)
3969 if (!LOOP_VINFO_LOWER_BOUNDS (loop_vinfo)[i].unsigned_p)
3970 nstmts += 1;
3971 (void) add_stmt_cost (target_cost_data, nstmts,
3972 scalar_stmt, vect_prologue);
3974 if (dump_enabled_p ())
3975 dump_printf (MSG_NOTE,
3976 "cost model: Adding cost of checks for loop "
3977 "versioning aliasing.\n");
3980 /* Requires loop versioning with niter checks. */
3981 if (LOOP_REQUIRES_VERSIONING_FOR_NITERS (loop_vinfo))
3983 /* FIXME: Make cost depend on complexity of individual check. */
3984 (void) add_stmt_cost (target_cost_data, 1, vector_stmt,
3985 NULL, NULL, NULL_TREE, 0, vect_prologue);
3986 if (dump_enabled_p ())
3987 dump_printf (MSG_NOTE,
3988 "cost model: Adding cost of checks for loop "
3989 "versioning niters.\n");
3992 if (LOOP_REQUIRES_VERSIONING (loop_vinfo))
3993 (void) add_stmt_cost (target_cost_data, 1, cond_branch_taken,
3994 vect_prologue);
3996 /* Count statements in scalar loop. Using this as scalar cost for a single
3997 iteration for now.
3999 TODO: Add outer loop support.
4001 TODO: Consider assigning different costs to different scalar
4002 statements. */
4004 scalar_single_iter_cost = loop_vinfo->scalar_costs->total_cost ();
4006 /* Add additional cost for the peeled instructions in prologue and epilogue
4007 loop. (For fully-masked loops there will be no peeling.)
4009 FORNOW: If we don't know the value of peel_iters for prologue or epilogue
4010 at compile-time - we assume it's vf/2 (the worst would be vf-1).
4012 TODO: Build an expression that represents peel_iters for prologue and
4013 epilogue to be used in a run-time test. */
4015 bool prologue_need_br_taken_cost = false;
4016 bool prologue_need_br_not_taken_cost = false;
4018 /* Calculate peel_iters_prologue. */
4019 if (vect_use_loop_mask_for_alignment_p (loop_vinfo))
4020 peel_iters_prologue = 0;
4021 else if (npeel < 0)
4023 peel_iters_prologue = assumed_vf / 2;
4024 if (dump_enabled_p ())
4025 dump_printf (MSG_NOTE, "cost model: "
4026 "prologue peel iters set to vf/2.\n");
4028 /* If peeled iterations are unknown, count a taken branch and a not taken
4029 branch per peeled loop. Even if scalar loop iterations are known,
4030 vector iterations are not known since peeled prologue iterations are
4031 not known. Hence guards remain the same. */
4032 prologue_need_br_taken_cost = true;
4033 prologue_need_br_not_taken_cost = true;
4035 else
4037 peel_iters_prologue = npeel;
4038 if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo) && peel_iters_prologue > 0)
 4039 /* If peeled iterations are known but the number of scalar loop
 4040 iterations is unknown, count a taken branch per peeled loop. */
4041 prologue_need_br_taken_cost = true;
4044 bool epilogue_need_br_taken_cost = false;
4045 bool epilogue_need_br_not_taken_cost = false;
4047 /* Calculate peel_iters_epilogue. */
4048 if (LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))
4049 /* We need to peel exactly one iteration for gaps. */
4050 peel_iters_epilogue = LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo) ? 1 : 0;
4051 else if (npeel < 0)
4053 /* If peeling for alignment is unknown, loop bound of main loop
4054 becomes unknown. */
4055 peel_iters_epilogue = assumed_vf / 2;
4056 if (dump_enabled_p ())
4057 dump_printf (MSG_NOTE, "cost model: "
4058 "epilogue peel iters set to vf/2 because "
4059 "peeling for alignment is unknown.\n");
4061 /* See the same reason above in peel_iters_prologue calculation. */
4062 epilogue_need_br_taken_cost = true;
4063 epilogue_need_br_not_taken_cost = true;
4065 else
4067 peel_iters_epilogue = vect_get_peel_iters_epilogue (loop_vinfo, npeel);
4068 if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo) && peel_iters_epilogue > 0)
 4069 /* If peeled iterations are known but the number of scalar loop
 4070 iterations is unknown, count a taken branch per peeled loop. */
4071 epilogue_need_br_taken_cost = true;
4074 stmt_info_for_cost *si;
4075 int j;
4076 /* Add costs associated with peel_iters_prologue. */
4077 if (peel_iters_prologue)
4078 FOR_EACH_VEC_ELT (LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo), j, si)
4080 (void) add_stmt_cost (target_cost_data,
4081 si->count * peel_iters_prologue, si->kind,
4082 si->stmt_info, si->node, si->vectype,
4083 si->misalign, vect_prologue);
4086 /* Add costs associated with peel_iters_epilogue. */
4087 if (peel_iters_epilogue)
4088 FOR_EACH_VEC_ELT (LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo), j, si)
4090 (void) add_stmt_cost (target_cost_data,
4091 si->count * peel_iters_epilogue, si->kind,
4092 si->stmt_info, si->node, si->vectype,
4093 si->misalign, vect_epilogue);
4096 /* Add possible cond_branch_taken/cond_branch_not_taken cost. */
4098 if (prologue_need_br_taken_cost)
4099 (void) add_stmt_cost (target_cost_data, 1, cond_branch_taken,
4100 vect_prologue);
4102 if (prologue_need_br_not_taken_cost)
4103 (void) add_stmt_cost (target_cost_data, 1,
4104 cond_branch_not_taken, vect_prologue);
4106 if (epilogue_need_br_taken_cost)
4107 (void) add_stmt_cost (target_cost_data, 1, cond_branch_taken,
4108 vect_epilogue);
4110 if (epilogue_need_br_not_taken_cost)
4111 (void) add_stmt_cost (target_cost_data, 1,
4112 cond_branch_not_taken, vect_epilogue);
4114 /* Take care of special costs for rgroup controls of partial vectors. */
4115 if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
4117 /* Calculate how many masks we need to generate. */
4118 unsigned int num_masks = 0;
4119 rgroup_controls *rgm;
4120 unsigned int num_vectors_m1;
4121 FOR_EACH_VEC_ELT (LOOP_VINFO_MASKS (loop_vinfo), num_vectors_m1, rgm)
4122 if (rgm->type)
4123 num_masks += num_vectors_m1 + 1;
4124 gcc_assert (num_masks > 0);
4126 /* In the worst case, we need to generate each mask in the prologue
4127 and in the loop body. One of the loop body mask instructions
4128 replaces the comparison in the scalar loop, and since we don't
4129 count the scalar comparison against the scalar body, we shouldn't
4130 count that vector instruction against the vector body either.
4132 Sometimes we can use unpacks instead of generating prologue
4133 masks and sometimes the prologue mask will fold to a constant,
4134 so the actual prologue cost might be smaller. However, it's
4135 simpler and safer to use the worst-case cost; if this ends up
4136 being the tie-breaker between vectorizing or not, then it's
4137 probably better not to vectorize. */
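      /* As a hypothetical illustration: with two rgroups needing one and
	 two mask vectors respectively, num_masks is 3, so three
	 mask-generating vector_stmts are costed in the prologue and two
	 in the body (one body mask stands in for the scalar comparison).  */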
4138 (void) add_stmt_cost (target_cost_data, num_masks,
4139 vector_stmt, NULL, NULL, NULL_TREE, 0,
4140 vect_prologue);
4141 (void) add_stmt_cost (target_cost_data, num_masks - 1,
4142 vector_stmt, NULL, NULL, NULL_TREE, 0,
4143 vect_body);
4145 else if (LOOP_VINFO_FULLY_WITH_LENGTH_P (loop_vinfo))
4147 /* Referring to the functions vect_set_loop_condition_partial_vectors
4148 and vect_set_loop_controls_directly, we need to generate each
4149 length in the prologue and in the loop body if required. Although
4150 there are some possible optimizations, we consider the worst case
4151 here. */
4153 bool niters_known_p = LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo);
4154 signed char partial_load_store_bias
4155 = LOOP_VINFO_PARTIAL_LOAD_STORE_BIAS (loop_vinfo);
4156 bool need_iterate_p
4157 = (!LOOP_VINFO_EPILOGUE_P (loop_vinfo)
4158 && !vect_known_niters_smaller_than_vf (loop_vinfo));
4160 /* Calculate how many statements to be added. */
4161 unsigned int prologue_stmts = 0;
4162 unsigned int body_stmts = 0;
4164 rgroup_controls *rgc;
4165 unsigned int num_vectors_m1;
4166 FOR_EACH_VEC_ELT (LOOP_VINFO_LENS (loop_vinfo), num_vectors_m1, rgc)
4167 if (rgc->type)
4169 /* May need one SHIFT for nitems_total computation. */
4170 unsigned nitems = rgc->max_nscalars_per_iter * rgc->factor;
4171 if (nitems != 1 && !niters_known_p)
4172 prologue_stmts += 1;
4174 /* May need one MAX and one MINUS for wrap around. */
4175 if (vect_rgroup_iv_might_wrap_p (loop_vinfo, rgc))
4176 prologue_stmts += 2;
 4178 /* Need one MAX and one MINUS for each batch limit except for
 4179 the first one. */
4180 prologue_stmts += num_vectors_m1 * 2;
4182 unsigned int num_vectors = num_vectors_m1 + 1;
4184 /* Need to set up lengths in prologue, only one MIN required
4185 for each since start index is zero. */
4186 prologue_stmts += num_vectors;
4188 /* If we have a non-zero partial load bias, we need one PLUS
4189 to adjust the load length. */
4190 if (partial_load_store_bias != 0)
4191 body_stmts += 1;
4193 /* Each may need two MINs and one MINUS to update lengths in body
4194 for next iteration. */
4195 if (need_iterate_p)
4196 body_stmts += 3 * num_vectors;
4199 (void) add_stmt_cost (target_cost_data, prologue_stmts,
4200 scalar_stmt, vect_prologue);
4201 (void) add_stmt_cost (target_cost_data, body_stmts,
4202 scalar_stmt, vect_body);
4205 /* FORNOW: The scalar outside cost is incremented in one of the
4206 following ways:
4208 1. The vectorizer checks for alignment and aliasing and generates
4209 a condition that allows dynamic vectorization. A cost model
4210 check is ANDED with the versioning condition. Hence scalar code
4211 path now has the added cost of the versioning check.
4213 if (cost > th & versioning_check)
4214 jmp to vector code
4216 Hence run-time scalar is incremented by not-taken branch cost.
4218 2. The vectorizer then checks if a prologue is required. If the
4219 cost model check was not done before during versioning, it has to
4220 be done before the prologue check.
4222 if (cost <= th)
4223 prologue = scalar_iters
4224 if (prologue == 0)
4225 jmp to vector code
4226 else
4227 execute prologue
4228 if (prologue == num_iters)
4229 go to exit
4231 Hence the run-time scalar cost is incremented by a taken branch,
4232 plus a not-taken branch, plus a taken branch cost.
4234 3. The vectorizer then checks if an epilogue is required. If the
4235 cost model check was not done before during prologue check, it
4236 has to be done with the epilogue check.
4238 if (prologue == 0)
4239 jmp to vector code
4240 else
4241 execute prologue
4242 if (prologue == num_iters)
4243 go to exit
4244 vector code:
4245 if ((cost <= th) | (scalar_iters-prologue-epilogue == 0))
4246 jmp to epilogue
4248 Hence the run-time scalar cost should be incremented by 2 taken
4249 branches.
 4251 TODO: The back end may reorder the BBs differently and reverse
4252 conditions/branch directions. Change the estimates below to
4253 something more reasonable. */
4255 /* If the number of iterations is known and we do not do versioning, we can
4256 decide whether to vectorize at compile time. Hence the scalar version
 4257 does not carry cost model guard costs. */
4258 if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
4259 || LOOP_REQUIRES_VERSIONING (loop_vinfo))
4261 /* Cost model check occurs at versioning. */
4262 if (LOOP_REQUIRES_VERSIONING (loop_vinfo))
4263 scalar_outside_cost += vect_get_stmt_cost (cond_branch_not_taken);
4264 else
4266 /* Cost model check occurs at prologue generation. */
4267 if (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) < 0)
4268 scalar_outside_cost += 2 * vect_get_stmt_cost (cond_branch_taken)
4269 + vect_get_stmt_cost (cond_branch_not_taken);
4270 /* Cost model check occurs at epilogue generation. */
4271 else
4272 scalar_outside_cost += 2 * vect_get_stmt_cost (cond_branch_taken);
4276 /* Complete the target-specific cost calculations. */
4277 finish_cost (loop_vinfo->vector_costs, loop_vinfo->scalar_costs,
4278 &vec_prologue_cost, &vec_inside_cost, &vec_epilogue_cost,
4279 suggested_unroll_factor);
4281 if (suggested_unroll_factor && *suggested_unroll_factor > 1
4282 && LOOP_VINFO_MAX_VECT_FACTOR (loop_vinfo) != MAX_VECTORIZATION_FACTOR
4283 && !known_le (LOOP_VINFO_VECT_FACTOR (loop_vinfo) *
4284 *suggested_unroll_factor,
4285 LOOP_VINFO_MAX_VECT_FACTOR (loop_vinfo)))
4287 if (dump_enabled_p ())
4288 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4289 "can't unroll as unrolled vectorization factor larger"
4290 " than maximum vectorization factor: %d\n",
4291 LOOP_VINFO_MAX_VECT_FACTOR (loop_vinfo));
4292 *suggested_unroll_factor = 1;
4295 vec_outside_cost = (int)(vec_prologue_cost + vec_epilogue_cost);
4297 if (dump_enabled_p ())
4299 dump_printf_loc (MSG_NOTE, vect_location, "Cost model analysis: \n");
4300 dump_printf (MSG_NOTE, " Vector inside of loop cost: %d\n",
4301 vec_inside_cost);
4302 dump_printf (MSG_NOTE, " Vector prologue cost: %d\n",
4303 vec_prologue_cost);
4304 dump_printf (MSG_NOTE, " Vector epilogue cost: %d\n",
4305 vec_epilogue_cost);
4306 dump_printf (MSG_NOTE, " Scalar iteration cost: %d\n",
4307 scalar_single_iter_cost);
4308 dump_printf (MSG_NOTE, " Scalar outside cost: %d\n",
4309 scalar_outside_cost);
4310 dump_printf (MSG_NOTE, " Vector outside cost: %d\n",
4311 vec_outside_cost);
4312 dump_printf (MSG_NOTE, " prologue iterations: %d\n",
4313 peel_iters_prologue);
4314 dump_printf (MSG_NOTE, " epilogue iterations: %d\n",
4315 peel_iters_epilogue);
4318 /* Calculate number of iterations required to make the vector version
4319 profitable, relative to the loop bodies only. The following condition
4320 must hold true:
4321 SIC * niters + SOC > VIC * ((niters - NPEEL) / VF) + VOC
4322 where
4323 SIC = scalar iteration cost, VIC = vector iteration cost,
4324 VOC = vector outside cost, VF = vectorization factor,
4325 NPEEL = prologue iterations + epilogue iterations,
4326 SOC = scalar outside cost for run time cost model check. */
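  /* A worked instance with made-up costs, ignoring rounding: for SIC = 4,
     VIC = 12, VF = 4, NPEEL = 0 and SOC = 0 the condition becomes
     4 * niters > 12 * niters / 4 + VOC, i.e. niters > VOC, so with a
     vector outside cost of 32 the loop must run more than 32 iterations
     for the vector version to win.  */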
4328 int saving_per_viter = (scalar_single_iter_cost * assumed_vf
4329 - vec_inside_cost);
4330 if (saving_per_viter <= 0)
4332 if (LOOP_VINFO_LOOP (loop_vinfo)->force_vectorize)
4333 warning_at (vect_location.get_location_t (), OPT_Wopenmp_simd,
4334 "vectorization did not happen for a simd loop");
4336 if (dump_enabled_p ())
4337 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4338 "cost model: the vector iteration cost = %d "
4339 "divided by the scalar iteration cost = %d "
4340 "is greater or equal to the vectorization factor = %d"
4341 ".\n",
4342 vec_inside_cost, scalar_single_iter_cost, assumed_vf);
4343 *ret_min_profitable_niters = -1;
4344 *ret_min_profitable_estimate = -1;
4345 return;
4348 /* ??? The "if" arm is written to handle all cases; see below for what
4349 we would do for !LOOP_VINFO_USING_PARTIAL_VECTORS_P. */
4350 if (LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))
4352 /* Rewriting the condition above in terms of the number of
4353 vector iterations (vniters) rather than the number of
4354 scalar iterations (niters) gives:
4356 SIC * (vniters * VF + NPEEL) + SOC > VIC * vniters + VOC
4358 <==> vniters * (SIC * VF - VIC) > VOC - SIC * NPEEL - SOC
4360 For integer N, X and Y when X > 0:
4362 N * X > Y <==> N >= (Y /[floor] X) + 1. */
4363 int outside_overhead = (vec_outside_cost
4364 - scalar_single_iter_cost * peel_iters_prologue
4365 - scalar_single_iter_cost * peel_iters_epilogue
4366 - scalar_outside_cost);
4367 /* We're only interested in cases that require at least one
4368 vector iteration. */
4369 int min_vec_niters = 1;
4370 if (outside_overhead > 0)
4371 min_vec_niters = outside_overhead / saving_per_viter + 1;
4373 if (dump_enabled_p ())
4374 dump_printf (MSG_NOTE, " Minimum number of vector iterations: %d\n",
4375 min_vec_niters);
4377 if (LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))
4379 /* Now that we know the minimum number of vector iterations,
4380 find the minimum niters for which the scalar cost is larger:
4382 SIC * niters > VIC * vniters + VOC - SOC
4384 We know that the minimum niters is no more than
4385 vniters * VF + NPEEL, but it might be (and often is) less
4386 than that if a partial vector iteration is cheaper than the
4387 equivalent scalar code. */
4388 int threshold = (vec_inside_cost * min_vec_niters
4389 + vec_outside_cost
4390 - scalar_outside_cost);
4391 if (threshold <= 0)
4392 min_profitable_iters = 1;
4393 else
4394 min_profitable_iters = threshold / scalar_single_iter_cost + 1;
4396 else
4397 /* Convert the number of vector iterations into a number of
4398 scalar iterations. */
4399 min_profitable_iters = (min_vec_niters * assumed_vf
4400 + peel_iters_prologue
4401 + peel_iters_epilogue);
4403 else
4405 min_profitable_iters = ((vec_outside_cost - scalar_outside_cost)
4406 * assumed_vf
4407 - vec_inside_cost * peel_iters_prologue
4408 - vec_inside_cost * peel_iters_epilogue);
4409 if (min_profitable_iters <= 0)
4410 min_profitable_iters = 0;
4411 else
4413 min_profitable_iters /= saving_per_viter;
4415 if ((scalar_single_iter_cost * assumed_vf * min_profitable_iters)
4416 <= (((int) vec_inside_cost * min_profitable_iters)
4417 + (((int) vec_outside_cost - scalar_outside_cost)
4418 * assumed_vf)))
4419 min_profitable_iters++;
4423 if (dump_enabled_p ())
4424 dump_printf (MSG_NOTE,
4425 " Calculated minimum iters for profitability: %d\n",
4426 min_profitable_iters);
4428 if (!LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo)
4429 && min_profitable_iters < (assumed_vf + peel_iters_prologue))
4430 /* We want the vectorized loop to execute at least once. */
4431 min_profitable_iters = assumed_vf + peel_iters_prologue;
4432 else if (min_profitable_iters < peel_iters_prologue)
4433 /* For LOOP_VINFO_USING_PARTIAL_VECTORS_P, we need to ensure the
4434 vectorized loop executes at least once. */
4435 min_profitable_iters = peel_iters_prologue;
4437 if (dump_enabled_p ())
4438 dump_printf_loc (MSG_NOTE, vect_location,
4439 " Runtime profitability threshold = %d\n",
4440 min_profitable_iters);
4442 *ret_min_profitable_niters = min_profitable_iters;
4444 /* Calculate number of iterations required to make the vector version
4445 profitable, relative to the loop bodies only.
 4447 The non-vectorized variant costs SIC * niters and it must win over the
 4448 vector variant on the expected loop trip count. The following condition must hold true:
4449 SIC * niters > VIC * ((niters - NPEEL) / VF) + VOC + SOC */
4451 if (vec_outside_cost <= 0)
4452 min_profitable_estimate = 0;
4453 /* ??? This "else if" arm is written to handle all cases; see below for
4454 what we would do for !LOOP_VINFO_USING_PARTIAL_VECTORS_P. */
4455 else if (LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))
4457 /* This is a repeat of the code above, but with + SOC rather
4458 than - SOC. */
4459 int outside_overhead = (vec_outside_cost
4460 - scalar_single_iter_cost * peel_iters_prologue
4461 - scalar_single_iter_cost * peel_iters_epilogue
4462 + scalar_outside_cost);
4463 int min_vec_niters = 1;
4464 if (outside_overhead > 0)
4465 min_vec_niters = outside_overhead / saving_per_viter + 1;
4467 if (LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))
4469 int threshold = (vec_inside_cost * min_vec_niters
4470 + vec_outside_cost
4471 + scalar_outside_cost);
4472 min_profitable_estimate = threshold / scalar_single_iter_cost + 1;
4474 else
4475 min_profitable_estimate = (min_vec_niters * assumed_vf
4476 + peel_iters_prologue
4477 + peel_iters_epilogue);
4479 else
4481 min_profitable_estimate = ((vec_outside_cost + scalar_outside_cost)
4482 * assumed_vf
4483 - vec_inside_cost * peel_iters_prologue
4484 - vec_inside_cost * peel_iters_epilogue)
4485 / ((scalar_single_iter_cost * assumed_vf)
4486 - vec_inside_cost);
4488 min_profitable_estimate = MAX (min_profitable_estimate, min_profitable_iters);
4489 if (dump_enabled_p ())
4490 dump_printf_loc (MSG_NOTE, vect_location,
4491 " Static estimate profitability threshold = %d\n",
4492 min_profitable_estimate);
4494 *ret_min_profitable_estimate = min_profitable_estimate;
4497 /* Writes into SEL a mask for a vec_perm, equivalent to a vec_shr by OFFSET
4498 vector elements (not bits) for a vector with NELT elements. */
4499 static void
4500 calc_vec_perm_mask_for_shift (unsigned int offset, unsigned int nelt,
4501 vec_perm_builder *sel)
4503 /* The encoding is a single stepped pattern. Any wrap-around is handled
4504 by vec_perm_indices. */
4505 sel->new_vector (nelt, 1, 3);
4506 for (unsigned int i = 0; i < 3; i++)
4507 sel->quick_push (i + offset);
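/* For example (illustrative values): OFFSET = 2 and NELT = 8 push the
   series 2, 3, 4, which vec_perm_indices extends stepwise to
   { 2, 3, 4, 5, 6, 7, 8, 9 }; with two input vectors, indices 8 and 9
   select from the second input, giving a shift right by two elements.  */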
4510 /* Checks whether the target supports whole-vector shifts for vectors of mode
4511 MODE. This is the case if _either_ the platform handles vec_shr_optab, _or_
4512 it supports vec_perm_const with masks for all necessary shift amounts. */
4513 static bool
4514 have_whole_vector_shift (machine_mode mode)
4516 if (optab_handler (vec_shr_optab, mode) != CODE_FOR_nothing)
4517 return true;
4519 /* Variable-length vectors should be handled via the optab. */
4520 unsigned int nelt;
4521 if (!GET_MODE_NUNITS (mode).is_constant (&nelt))
4522 return false;
4524 vec_perm_builder sel;
4525 vec_perm_indices indices;
4526 for (unsigned int i = nelt / 2; i >= 1; i /= 2)
4528 calc_vec_perm_mask_for_shift (i, nelt, &sel);
4529 indices.new_vector (sel, 2, nelt);
4530 if (!can_vec_perm_const_p (mode, indices, false))
4531 return false;
4533 return true;
4536 /* TODO: Close dependency between vect_model_*_cost and vectorizable_*
4537 functions. Design better to avoid maintenance issues. */
4539 /* Function vect_model_reduction_cost.
4541 Models cost for a reduction operation, including the vector ops
4542 generated within the strip-mine loop in some cases, the initial
4543 definition before the loop, and the epilogue code that must be generated. */
4545 static void
4546 vect_model_reduction_cost (loop_vec_info loop_vinfo,
4547 stmt_vec_info stmt_info, internal_fn reduc_fn,
4548 vect_reduction_type reduction_type,
4549 int ncopies, stmt_vector_for_cost *cost_vec)
4551 int prologue_cost = 0, epilogue_cost = 0, inside_cost = 0;
4552 tree vectype;
4553 machine_mode mode;
4554 class loop *loop = NULL;
4556 if (loop_vinfo)
4557 loop = LOOP_VINFO_LOOP (loop_vinfo);
4559 /* Condition reductions generate two reductions in the loop. */
4560 if (reduction_type == COND_REDUCTION)
4561 ncopies *= 2;
4563 vectype = STMT_VINFO_VECTYPE (stmt_info);
4564 mode = TYPE_MODE (vectype);
4565 stmt_vec_info orig_stmt_info = vect_orig_stmt (stmt_info);
4567 gimple_match_op op;
4568 if (!gimple_extract_op (orig_stmt_info->stmt, &op))
4569 gcc_unreachable ();
4571 if (reduction_type == EXTRACT_LAST_REDUCTION)
4572 /* No extra instructions are needed in the prologue. The loop body
4573 operations are costed in vectorizable_condition. */
4574 inside_cost = 0;
4575 else if (reduction_type == FOLD_LEFT_REDUCTION)
4577 /* No extra instructions needed in the prologue. */
4578 prologue_cost = 0;
4580 if (reduc_fn != IFN_LAST)
4581 /* Count one reduction-like operation per vector. */
4582 inside_cost = record_stmt_cost (cost_vec, ncopies, vec_to_scalar,
4583 stmt_info, 0, vect_body);
4584 else
4586 /* Use NELEMENTS extracts and NELEMENTS scalar ops. */
4587 unsigned int nelements = ncopies * vect_nunits_for_cost (vectype);
4588 inside_cost = record_stmt_cost (cost_vec, nelements,
4589 vec_to_scalar, stmt_info, 0,
4590 vect_body);
4591 inside_cost += record_stmt_cost (cost_vec, nelements,
4592 scalar_stmt, stmt_info, 0,
4593 vect_body);
4596 else
4598 /* Add in cost for initial definition.
4599 For cond reduction we have four vectors: initial index, step,
4600 initial result of the data reduction, initial value of the index
4601 reduction. */
4602 int prologue_stmts = reduction_type == COND_REDUCTION ? 4 : 1;
4603 prologue_cost += record_stmt_cost (cost_vec, prologue_stmts,
4604 scalar_to_vec, stmt_info, 0,
4605 vect_prologue);
4608 /* Determine cost of epilogue code.
4610 We have a reduction operator that will reduce the vector in one statement.
4611 Also requires scalar extract. */
4613 if (!loop || !nested_in_vect_loop_p (loop, orig_stmt_info))
4615 if (reduc_fn != IFN_LAST)
4617 if (reduction_type == COND_REDUCTION)
 4619 /* An EQ stmt and a COND_EXPR stmt. */
4620 epilogue_cost += record_stmt_cost (cost_vec, 2,
4621 vector_stmt, stmt_info, 0,
4622 vect_epilogue);
4623 /* Reduction of the max index and a reduction of the found
4624 values. */
4625 epilogue_cost += record_stmt_cost (cost_vec, 2,
4626 vec_to_scalar, stmt_info, 0,
4627 vect_epilogue);
4628 /* A broadcast of the max value. */
4629 epilogue_cost += record_stmt_cost (cost_vec, 1,
4630 scalar_to_vec, stmt_info, 0,
4631 vect_epilogue);
4633 else
4635 epilogue_cost += record_stmt_cost (cost_vec, 1, vector_stmt,
4636 stmt_info, 0, vect_epilogue);
4637 epilogue_cost += record_stmt_cost (cost_vec, 1,
4638 vec_to_scalar, stmt_info, 0,
4639 vect_epilogue);
4642 else if (reduction_type == COND_REDUCTION)
4644 unsigned estimated_nunits = vect_nunits_for_cost (vectype);
4645 /* Extraction of scalar elements. */
4646 epilogue_cost += record_stmt_cost (cost_vec,
4647 2 * estimated_nunits,
4648 vec_to_scalar, stmt_info, 0,
4649 vect_epilogue);
4650 /* Scalar max reductions via COND_EXPR / MAX_EXPR. */
4651 epilogue_cost += record_stmt_cost (cost_vec,
4652 2 * estimated_nunits - 3,
4653 scalar_stmt, stmt_info, 0,
4654 vect_epilogue);
4656 else if (reduction_type == EXTRACT_LAST_REDUCTION
4657 || reduction_type == FOLD_LEFT_REDUCTION)
 4658 /* No extra instructions needed in the epilogue. */
4660 else
4662 int vec_size_in_bits = tree_to_uhwi (TYPE_SIZE (vectype));
4663 tree bitsize = TYPE_SIZE (op.type);
4664 int element_bitsize = tree_to_uhwi (bitsize);
4665 int nelements = vec_size_in_bits / element_bitsize;
4667 if (op.code == COND_EXPR)
4668 op.code = MAX_EXPR;
4670 /* We have a whole vector shift available. */
4671 if (VECTOR_MODE_P (mode)
4672 && directly_supported_p (op.code, vectype)
4673 && have_whole_vector_shift (mode))
4675 /* Final reduction via vector shifts and the reduction operator.
4676 Also requires scalar extract. */
4677 epilogue_cost += record_stmt_cost (cost_vec,
4678 exact_log2 (nelements) * 2,
4679 vector_stmt, stmt_info, 0,
4680 vect_epilogue);
4681 epilogue_cost += record_stmt_cost (cost_vec, 1,
4682 vec_to_scalar, stmt_info, 0,
4683 vect_epilogue);
4685 else
4686 /* Use extracts and reduction op for final reduction. For N
4687 elements, we have N extracts and N-1 reduction ops. */
4688 epilogue_cost += record_stmt_cost (cost_vec,
4689 nelements + nelements - 1,
4690 vector_stmt, stmt_info, 0,
4691 vect_epilogue);
4695 if (dump_enabled_p ())
4696 dump_printf (MSG_NOTE,
4697 "vect_model_reduction_cost: inside_cost = %d, "
4698 "prologue_cost = %d, epilogue_cost = %d .\n", inside_cost,
4699 prologue_cost, epilogue_cost);
4702 /* SEQ is a sequence of instructions that initialize the reduction
4703 described by REDUC_INFO. Emit them in the appropriate place. */
4705 static void
4706 vect_emit_reduction_init_stmts (loop_vec_info loop_vinfo,
4707 stmt_vec_info reduc_info, gimple *seq)
4709 if (reduc_info->reused_accumulator)
4711 /* When reusing an accumulator from the main loop, we only need
4712 initialization instructions if the main loop can be skipped.
4713 In that case, emit the initialization instructions at the end
4714 of the guard block that does the skip. */
4715 edge skip_edge = loop_vinfo->skip_main_loop_edge;
4716 gcc_assert (skip_edge);
4717 gimple_stmt_iterator gsi = gsi_last_bb (skip_edge->src);
4718 gsi_insert_seq_before (&gsi, seq, GSI_SAME_STMT);
4720 else
4722 /* The normal case: emit the initialization instructions on the
4723 preheader edge. */
4724 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
4725 gsi_insert_seq_on_edge_immediate (loop_preheader_edge (loop), seq);
4729 /* Function get_initial_def_for_reduction
4731 Input:
4732 REDUC_INFO - the info_for_reduction
4733 INIT_VAL - the initial value of the reduction variable
4734 NEUTRAL_OP - a value that has no effect on the reduction, as per
4735 neutral_op_for_reduction
4737 Output:
 4738 Return a vector variable, initialized according to the operation
 4739 that REDUC_INFO describes. This vector will be used as the initial
 4740 value of the vector of partial results.
4742 The value we need is a vector in which element 0 has value INIT_VAL
4743 and every other element has value NEUTRAL_OP. */
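/* An illustrative case (not part of the original comment): for a sum
   reduction with INIT_VAL 5, NEUTRAL_OP 0 and a four-element integer
   vector type the initial def is { 5, 0, 0, 0 }; for a MIN or MAX
   reduction the neutral value is INIT_VAL itself, so the result is
   simply a splat of INIT_VAL.  */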
4745 static tree
4746 get_initial_def_for_reduction (loop_vec_info loop_vinfo,
4747 stmt_vec_info reduc_info,
4748 tree init_val, tree neutral_op)
4750 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
4751 tree scalar_type = TREE_TYPE (init_val);
4752 tree vectype = get_vectype_for_scalar_type (loop_vinfo, scalar_type);
4753 tree init_def;
4754 gimple_seq stmts = NULL;
4756 gcc_assert (vectype);
4758 gcc_assert (POINTER_TYPE_P (scalar_type) || INTEGRAL_TYPE_P (scalar_type)
4759 || SCALAR_FLOAT_TYPE_P (scalar_type));
4761 gcc_assert (nested_in_vect_loop_p (loop, reduc_info)
4762 || loop == (gimple_bb (reduc_info->stmt))->loop_father);
4764 if (operand_equal_p (init_val, neutral_op))
4766 /* If both elements are equal then the vector described above is
4767 just a splat. */
4768 neutral_op = gimple_convert (&stmts, TREE_TYPE (vectype), neutral_op);
4769 init_def = gimple_build_vector_from_val (&stmts, vectype, neutral_op);
4771 else
4773 neutral_op = gimple_convert (&stmts, TREE_TYPE (vectype), neutral_op);
4774 init_val = gimple_convert (&stmts, TREE_TYPE (vectype), init_val);
4775 if (!TYPE_VECTOR_SUBPARTS (vectype).is_constant ())
4777 /* Construct a splat of NEUTRAL_OP and insert INIT_VAL into
4778 element 0. */
4779 init_def = gimple_build_vector_from_val (&stmts, vectype,
4780 neutral_op);
4781 init_def = gimple_build (&stmts, CFN_VEC_SHL_INSERT,
4782 vectype, init_def, init_val);
4784 else
4786 /* Build {INIT_VAL, NEUTRAL_OP, NEUTRAL_OP, ...}. */
4787 tree_vector_builder elts (vectype, 1, 2);
4788 elts.quick_push (init_val);
4789 elts.quick_push (neutral_op);
4790 init_def = gimple_build_vector (&stmts, &elts);
4794 if (stmts)
4795 vect_emit_reduction_init_stmts (loop_vinfo, reduc_info, stmts);
4796 return init_def;
4799 /* Get at the initial defs for the reduction PHIs for REDUC_INFO,
4800 which performs a reduction involving GROUP_SIZE scalar statements.
4801 NUMBER_OF_VECTORS is the number of vector defs to create. If NEUTRAL_OP
4802 is nonnull, introducing extra elements of that value will not change the
4803 result. */
4805 static void
4806 get_initial_defs_for_reduction (loop_vec_info loop_vinfo,
4807 stmt_vec_info reduc_info,
4808 vec<tree> *vec_oprnds,
4809 unsigned int number_of_vectors,
4810 unsigned int group_size, tree neutral_op)
4812 vec<tree> &initial_values = reduc_info->reduc_initial_values;
4813 unsigned HOST_WIDE_INT nunits;
4814 unsigned j, number_of_places_left_in_vector;
4815 tree vector_type = STMT_VINFO_VECTYPE (reduc_info);
4816 unsigned int i;
4818 gcc_assert (group_size == initial_values.length () || neutral_op);
4820 /* NUMBER_OF_COPIES is the number of times we need to use the same values in
4821 created vectors. It is greater than 1 if unrolling is performed.
4823 For example, we have two scalar operands, s1 and s2 (e.g., group of
4824 strided accesses of size two), while NUNITS is four (i.e., four scalars
4825 of this type can be packed in a vector). The output vector will contain
4826 two copies of each scalar operand: {s1, s2, s1, s2}. (NUMBER_OF_COPIES
4827 will be 2).
4829 If REDUC_GROUP_SIZE > NUNITS, the scalars will be split into several
4830 vectors containing the operands.
4832 For example, NUNITS is four as before, and the group size is 8
4833 (s1, s2, ..., s8). We will create two vectors {s1, s2, s3, s4} and
4834 {s5, s6, s7, s8}. */
4836 if (!TYPE_VECTOR_SUBPARTS (vector_type).is_constant (&nunits))
4837 nunits = group_size;
4839 number_of_places_left_in_vector = nunits;
4840 bool constant_p = true;
4841 tree_vector_builder elts (vector_type, nunits, 1);
4842 elts.quick_grow (nunits);
4843 gimple_seq ctor_seq = NULL;
4844 for (j = 0; j < nunits * number_of_vectors; ++j)
4846 tree op;
4847 i = j % group_size;
 4849 /* Get the def before the loop. In a reduction chain we have only
 4850 one initial value; otherwise as many as there are PHIs in the group. */
4851 if (i >= initial_values.length () || (j > i && neutral_op))
4852 op = neutral_op;
4853 else
4854 op = initial_values[i];
4856 /* Create 'vect_ = {op0,op1,...,opn}'. */
4857 number_of_places_left_in_vector--;
4858 elts[nunits - number_of_places_left_in_vector - 1] = op;
4859 if (!CONSTANT_CLASS_P (op))
4860 constant_p = false;
4862 if (number_of_places_left_in_vector == 0)
4864 tree init;
4865 if (constant_p && !neutral_op
4866 ? multiple_p (TYPE_VECTOR_SUBPARTS (vector_type), nunits)
4867 : known_eq (TYPE_VECTOR_SUBPARTS (vector_type), nunits))
4868 /* Build the vector directly from ELTS. */
4869 init = gimple_build_vector (&ctor_seq, &elts);
4870 else if (neutral_op)
4872 /* Build a vector of the neutral value and shift the
4873 other elements into place. */
4874 init = gimple_build_vector_from_val (&ctor_seq, vector_type,
4875 neutral_op);
4876 int k = nunits;
4877 while (k > 0 && elts[k - 1] == neutral_op)
4878 k -= 1;
4879 while (k > 0)
4881 k -= 1;
4882 init = gimple_build (&ctor_seq, CFN_VEC_SHL_INSERT,
4883 vector_type, init, elts[k]);
4886 else
4888 /* First time round, duplicate ELTS to fill the
4889 required number of vectors. */
4890 duplicate_and_interleave (loop_vinfo, &ctor_seq, vector_type,
4891 elts, number_of_vectors, *vec_oprnds);
4892 break;
4894 vec_oprnds->quick_push (init);
4896 number_of_places_left_in_vector = nunits;
4897 elts.new_vector (vector_type, nunits, 1);
4898 elts.quick_grow (nunits);
4899 constant_p = true;
4902 if (ctor_seq != NULL)
4903 vect_emit_reduction_init_stmts (loop_vinfo, reduc_info, ctor_seq);
4906 /* For a statement STMT_INFO taking part in a reduction operation return
4907 the stmt_vec_info the meta information is stored on. */
4909 stmt_vec_info
4910 info_for_reduction (vec_info *vinfo, stmt_vec_info stmt_info)
4912 stmt_info = vect_orig_stmt (stmt_info);
4913 gcc_assert (STMT_VINFO_REDUC_DEF (stmt_info));
4914 if (!is_a <gphi *> (stmt_info->stmt)
4915 || !VECTORIZABLE_CYCLE_DEF (STMT_VINFO_DEF_TYPE (stmt_info)))
4916 stmt_info = STMT_VINFO_REDUC_DEF (stmt_info);
4917 gphi *phi = as_a <gphi *> (stmt_info->stmt);
4918 if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_double_reduction_def)
4920 if (gimple_phi_num_args (phi) == 1)
4921 stmt_info = STMT_VINFO_REDUC_DEF (stmt_info);
4923 else if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_nested_cycle)
4925 stmt_vec_info info = vinfo->lookup_def (vect_phi_initial_value (phi));
4926 if (info && STMT_VINFO_DEF_TYPE (info) == vect_double_reduction_def)
4927 stmt_info = info;
4929 return stmt_info;
4932 /* See if LOOP_VINFO is an epilogue loop whose main loop had a reduction that
4933 REDUC_INFO can build on. Adjust REDUC_INFO and return true if so, otherwise
4934 return false. */
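/* A sketch of the intended situation (illustrative only): if the main
   loop accumulated partial sums in an eight-element vector and the
   epilogue loop is vectorized with a four-element vector type, the
   checks below verify that the wider accumulator can be narrowed by
   repeated halving (constant_multiple_p / can_vec_extract) so the
   epilogue can continue from it instead of the scalar initial value.  */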
4936 static bool
4937 vect_find_reusable_accumulator (loop_vec_info loop_vinfo,
4938 stmt_vec_info reduc_info)
4940 loop_vec_info main_loop_vinfo = LOOP_VINFO_ORIG_LOOP_INFO (loop_vinfo);
4941 if (!main_loop_vinfo)
4942 return false;
4944 if (STMT_VINFO_REDUC_TYPE (reduc_info) != TREE_CODE_REDUCTION)
4945 return false;
4947 unsigned int num_phis = reduc_info->reduc_initial_values.length ();
4948 auto_vec<tree, 16> main_loop_results (num_phis);
4949 auto_vec<tree, 16> initial_values (num_phis);
4950 if (edge main_loop_edge = loop_vinfo->main_loop_edge)
4952 /* The epilogue loop can be entered either from the main loop or
4953 from an earlier guard block. */
4954 edge skip_edge = loop_vinfo->skip_main_loop_edge;
4955 for (tree incoming_value : reduc_info->reduc_initial_values)
4957 /* Look for:
4959 INCOMING_VALUE = phi<MAIN_LOOP_RESULT(main loop),
4960 INITIAL_VALUE(guard block)>. */
4961 gcc_assert (TREE_CODE (incoming_value) == SSA_NAME);
4963 gphi *phi = as_a <gphi *> (SSA_NAME_DEF_STMT (incoming_value));
4964 gcc_assert (gimple_bb (phi) == main_loop_edge->dest);
4966 tree from_main_loop = PHI_ARG_DEF_FROM_EDGE (phi, main_loop_edge);
4967 tree from_skip = PHI_ARG_DEF_FROM_EDGE (phi, skip_edge);
4969 main_loop_results.quick_push (from_main_loop);
4970 initial_values.quick_push (from_skip);
4973 else
4974 /* The main loop dominates the epilogue loop. */
4975 main_loop_results.splice (reduc_info->reduc_initial_values);
4977 /* See if the main loop has the kind of accumulator we need. */
4978 vect_reusable_accumulator *accumulator
4979 = main_loop_vinfo->reusable_accumulators.get (main_loop_results[0]);
4980 if (!accumulator
4981 || num_phis != accumulator->reduc_info->reduc_scalar_results.length ()
4982 || !std::equal (main_loop_results.begin (), main_loop_results.end (),
4983 accumulator->reduc_info->reduc_scalar_results.begin ()))
4984 return false;
4986 /* Handle the case where we can reduce wider vectors to narrower ones. */
4987 tree vectype = STMT_VINFO_VECTYPE (reduc_info);
4988 tree old_vectype = TREE_TYPE (accumulator->reduc_input);
4989 unsigned HOST_WIDE_INT m;
4990 if (!constant_multiple_p (TYPE_VECTOR_SUBPARTS (old_vectype),
4991 TYPE_VECTOR_SUBPARTS (vectype), &m))
4992 return false;
4993 /* Check the intermediate vector types and operations are available. */
4994 tree prev_vectype = old_vectype;
4995 poly_uint64 intermediate_nunits = TYPE_VECTOR_SUBPARTS (old_vectype);
4996 while (known_gt (intermediate_nunits, TYPE_VECTOR_SUBPARTS (vectype)))
4998 intermediate_nunits = exact_div (intermediate_nunits, 2);
4999 tree intermediate_vectype = get_related_vectype_for_scalar_type
5000 (TYPE_MODE (vectype), TREE_TYPE (vectype), intermediate_nunits);
5001 if (!intermediate_vectype
5002 || !directly_supported_p (STMT_VINFO_REDUC_CODE (reduc_info),
5003 intermediate_vectype)
5004 || !can_vec_extract (TYPE_MODE (prev_vectype),
5005 TYPE_MODE (intermediate_vectype)))
5006 return false;
5007 prev_vectype = intermediate_vectype;
5010 /* Non-SLP reductions might apply an adjustment after the reduction
5011 operation, in order to simplify the initialization of the accumulator.
5012 If the epilogue loop carries on from where the main loop left off,
5013 it should apply the same adjustment to the final reduction result.
5015 If the epilogue loop can also be entered directly (rather than via
5016 the main loop), we need to be able to handle that case in the same way,
5017 with the same adjustment. (In principle we could add a PHI node
5018 to select the correct adjustment, but in practice that shouldn't be
5019 necessary.) */
5020 tree main_adjustment
5021 = STMT_VINFO_REDUC_EPILOGUE_ADJUSTMENT (accumulator->reduc_info);
5022 if (loop_vinfo->main_loop_edge && main_adjustment)
5024 gcc_assert (num_phis == 1);
5025 tree initial_value = initial_values[0];
5026 /* Check that we can use INITIAL_VALUE as the adjustment and
5027 initialize the accumulator with a neutral value instead. */
5028 if (!operand_equal_p (initial_value, main_adjustment))
5029 return false;
5030 code_helper code = STMT_VINFO_REDUC_CODE (reduc_info);
5031 initial_values[0] = neutral_op_for_reduction (TREE_TYPE (initial_value),
5032 code, initial_value);
5034 STMT_VINFO_REDUC_EPILOGUE_ADJUSTMENT (reduc_info) = main_adjustment;
5035 reduc_info->reduc_initial_values.truncate (0);
5036 reduc_info->reduc_initial_values.splice (initial_values);
5037 reduc_info->reused_accumulator = accumulator;
5038 return true;
 5041 /* Reduce the vector VEC_DEF down to VECTYPE with reduction operation
 5042 CODE, emitting stmts into SEQ. Returns a vector def of VECTYPE. */
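/* For example (illustrative modes): reducing an eight-element VEC_DEF to
   a four-element VECTYPE extracts the low and high halves, either
   directly via vec_extract or by punning through an integer vector, and
   combines them with CODE; larger ratios repeat the halving step.  */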
5044 static tree
5045 vect_create_partial_epilog (tree vec_def, tree vectype, code_helper code,
5046 gimple_seq *seq)
5048 unsigned nunits = TYPE_VECTOR_SUBPARTS (TREE_TYPE (vec_def)).to_constant ();
5049 unsigned nunits1 = TYPE_VECTOR_SUBPARTS (vectype).to_constant ();
5050 tree stype = TREE_TYPE (vectype);
5051 tree new_temp = vec_def;
5052 while (nunits > nunits1)
5054 nunits /= 2;
5055 tree vectype1 = get_related_vectype_for_scalar_type (TYPE_MODE (vectype),
5056 stype, nunits);
5057 unsigned int bitsize = tree_to_uhwi (TYPE_SIZE (vectype1));
5059 /* The target has to make sure we support lowpart/highpart
5060 extraction, either via direct vector extract or through
5061 an integer mode punning. */
5062 tree dst1, dst2;
5063 gimple *epilog_stmt;
5064 if (convert_optab_handler (vec_extract_optab,
5065 TYPE_MODE (TREE_TYPE (new_temp)),
5066 TYPE_MODE (vectype1))
5067 != CODE_FOR_nothing)
5069 /* Extract sub-vectors directly once vec_extract becomes
5070 a conversion optab. */
5071 dst1 = make_ssa_name (vectype1);
5072 epilog_stmt
5073 = gimple_build_assign (dst1, BIT_FIELD_REF,
5074 build3 (BIT_FIELD_REF, vectype1,
5075 new_temp, TYPE_SIZE (vectype1),
5076 bitsize_int (0)));
5077 gimple_seq_add_stmt_without_update (seq, epilog_stmt);
5078 dst2 = make_ssa_name (vectype1);
5079 epilog_stmt
5080 = gimple_build_assign (dst2, BIT_FIELD_REF,
5081 build3 (BIT_FIELD_REF, vectype1,
5082 new_temp, TYPE_SIZE (vectype1),
5083 bitsize_int (bitsize)));
5084 gimple_seq_add_stmt_without_update (seq, epilog_stmt);
5086 else
5088 /* Extract via punning to appropriately sized integer mode
5089 vector. */
5090 tree eltype = build_nonstandard_integer_type (bitsize, 1);
5091 tree etype = build_vector_type (eltype, 2);
5092 gcc_assert (convert_optab_handler (vec_extract_optab,
5093 TYPE_MODE (etype),
5094 TYPE_MODE (eltype))
5095 != CODE_FOR_nothing);
5096 tree tem = make_ssa_name (etype);
5097 epilog_stmt = gimple_build_assign (tem, VIEW_CONVERT_EXPR,
5098 build1 (VIEW_CONVERT_EXPR,
5099 etype, new_temp));
5100 gimple_seq_add_stmt_without_update (seq, epilog_stmt);
5101 new_temp = tem;
5102 tem = make_ssa_name (eltype);
5103 epilog_stmt
5104 = gimple_build_assign (tem, BIT_FIELD_REF,
5105 build3 (BIT_FIELD_REF, eltype,
5106 new_temp, TYPE_SIZE (eltype),
5107 bitsize_int (0)));
5108 gimple_seq_add_stmt_without_update (seq, epilog_stmt);
5109 dst1 = make_ssa_name (vectype1);
5110 epilog_stmt = gimple_build_assign (dst1, VIEW_CONVERT_EXPR,
5111 build1 (VIEW_CONVERT_EXPR,
5112 vectype1, tem));
5113 gimple_seq_add_stmt_without_update (seq, epilog_stmt);
5114 tem = make_ssa_name (eltype);
5115 epilog_stmt
5116 = gimple_build_assign (tem, BIT_FIELD_REF,
5117 build3 (BIT_FIELD_REF, eltype,
5118 new_temp, TYPE_SIZE (eltype),
5119 bitsize_int (bitsize)));
5120 gimple_seq_add_stmt_without_update (seq, epilog_stmt);
5121 dst2 = make_ssa_name (vectype1);
5122 epilog_stmt = gimple_build_assign (dst2, VIEW_CONVERT_EXPR,
5123 build1 (VIEW_CONVERT_EXPR,
5124 vectype1, tem));
5125 gimple_seq_add_stmt_without_update (seq, epilog_stmt);
5128 new_temp = gimple_build (seq, code, vectype1, dst1, dst2);
5131 return new_temp;
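/* Illustrative effect of the halving above for a PLUS reduction of a
   V8SI value down to V4SI (lane values invented):

     x  = { 1, 2, 3, 4, 5, 6, 7, 8 }
     lo = { 1, 2, 3, 4 }              lowpart extract
     hi = { 5, 6, 7, 8 }              highpart extract
     r  = lo + hi = { 6, 8, 10, 12 }

   The loop repeats until the width of VECTYPE is reached; if the target
   lacks a direct sub-vector extract, the two halves are obtained by
   punning through a two-element integer-mode vector instead.  */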
5134 /* Function vect_create_epilog_for_reduction
5136 Create code at the loop-epilog to finalize the result of a reduction
5137 computation.
5139 STMT_INFO is the scalar reduction stmt that is being vectorized.
5140 SLP_NODE is an SLP node containing a group of reduction statements. The
5141 first one in this group is STMT_INFO.
5142 SLP_NODE_INSTANCE is the SLP node instance containing SLP_NODE
5143 REDUC_INDEX says which rhs operand of the STMT_INFO is the reduction phi
5144 (counting from 0)
5146 This function:
5147 1. Completes the reduction def-use cycles.
5148 2. "Reduces" each vector of partial results VECT_DEFS into a single result,
5149 by calling the function specified by REDUC_FN if available, or by
5150 other means (whole-vector shifts or a scalar loop).
5151 The function also creates a new phi node at the loop exit to preserve
5152 loop-closed form, as illustrated below.
5154 The flow at the entry to this function:
5156 loop:
5157 vec_def = phi <vec_init, null> # REDUCTION_PHI
5158 VECT_DEF = vector_stmt # vectorized form of STMT_INFO
5159 s_loop = scalar_stmt # (scalar) STMT_INFO
5160 loop_exit:
5161 s_out0 = phi <s_loop> # (scalar) EXIT_PHI
5162 use <s_out0>
5163 use <s_out0>
5165 The above is transformed by this function into:
5167 loop:
5168 vec_def = phi <vec_init, VECT_DEF> # REDUCTION_PHI
5169 VECT_DEF = vector_stmt # vectorized form of STMT_INFO
5170 s_loop = scalar_stmt # (scalar) STMT_INFO
5171 loop_exit:
5172 s_out0 = phi <s_loop> # (scalar) EXIT_PHI
5173 v_out1 = phi <VECT_DEF> # NEW_EXIT_PHI
5174 v_out2 = reduce <v_out1>
5175 s_out3 = extract_field <v_out2, 0>
5176 s_out4 = adjust_result <s_out3>
5177 use <s_out4>
5178 use <s_out4>
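/* As a concrete scalar illustration (types and names invented), for a
   sum reduction

     int s = 0;
     for (i = 0; i < n; i++)
       s += a[i];

   the vector loop accumulates one partial sum per lane and the epilogue
   built here performs, conceptually,

     s = vs[0] + vs[1] + vs[2] + vs[3];   // reduce + extract_field  */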
5181 static void
5182 vect_create_epilog_for_reduction (loop_vec_info loop_vinfo,
5183 stmt_vec_info stmt_info,
5184 slp_tree slp_node,
5185 slp_instance slp_node_instance)
5187 stmt_vec_info reduc_info = info_for_reduction (loop_vinfo, stmt_info);
5188 gcc_assert (reduc_info->is_reduc_info);
5189 /* For double reductions we need to get at the inner loop reduction
5190 stmt which has the meta info attached. Our stmt_info is that of the
5191 loop-closed PHI of the inner loop which we remember as
5192 def for the reduction PHI generation. */
5193 bool double_reduc = false;
5194 stmt_vec_info rdef_info = stmt_info;
5195 if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_double_reduction_def)
5197 gcc_assert (!slp_node);
5198 double_reduc = true;
5199 stmt_info = loop_vinfo->lookup_def (gimple_phi_arg_def
5200 (stmt_info->stmt, 0));
5201 stmt_info = vect_stmt_to_vectorize (stmt_info);
5203 gphi *reduc_def_stmt
5204 = as_a <gphi *> (STMT_VINFO_REDUC_DEF (vect_orig_stmt (stmt_info))->stmt);
5205 code_helper code = STMT_VINFO_REDUC_CODE (reduc_info);
5206 internal_fn reduc_fn = STMT_VINFO_REDUC_FN (reduc_info);
5207 tree vectype;
5208 machine_mode mode;
5209 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo), *outer_loop = NULL;
5210 basic_block exit_bb;
5211 tree scalar_dest;
5212 tree scalar_type;
5213 gimple *new_phi = NULL, *phi;
5214 gimple_stmt_iterator exit_gsi;
5215 tree new_temp = NULL_TREE, new_name, new_scalar_dest;
5216 gimple *epilog_stmt = NULL;
5217 gimple *exit_phi;
5218 tree bitsize;
5219 tree def;
5220 tree orig_name, scalar_result;
5221 imm_use_iterator imm_iter, phi_imm_iter;
5222 use_operand_p use_p, phi_use_p;
5223 gimple *use_stmt;
5224 auto_vec<tree> reduc_inputs;
5225 int j, i;
5226 vec<tree> &scalar_results = reduc_info->reduc_scalar_results;
5227 unsigned int group_size = 1, k;
5228 auto_vec<gimple *> phis;
5229 /* SLP reduction without reduction chain, e.g.,
5230 # a1 = phi <a2, a0>
5231 # b1 = phi <b2, b0>
5232 a2 = operation (a1)
5233 b2 = operation (b1) */
5234 bool slp_reduc = (slp_node && !REDUC_GROUP_FIRST_ELEMENT (stmt_info));
5235 bool direct_slp_reduc;
5236 tree induction_index = NULL_TREE;
5238 if (slp_node)
5239 group_size = SLP_TREE_LANES (slp_node);
5241 if (nested_in_vect_loop_p (loop, stmt_info))
5243 outer_loop = loop;
5244 loop = loop->inner;
5245 gcc_assert (!slp_node && double_reduc);
5248 vectype = STMT_VINFO_REDUC_VECTYPE (reduc_info);
5249 gcc_assert (vectype);
5250 mode = TYPE_MODE (vectype);
5252 tree induc_val = NULL_TREE;
5253 tree adjustment_def = NULL;
5254 if (slp_node)
5256 else
5258 /* Optimize: for induction condition reduction, if we can't use zero
5259 for induc_val, use initial_def. */
5260 if (STMT_VINFO_REDUC_TYPE (reduc_info) == INTEGER_INDUC_COND_REDUCTION)
5261 induc_val = STMT_VINFO_VEC_INDUC_COND_INITIAL_VAL (reduc_info);
5262 else if (double_reduc)
5264 else
5265 adjustment_def = STMT_VINFO_REDUC_EPILOGUE_ADJUSTMENT (reduc_info);
5268 stmt_vec_info single_live_out_stmt[] = { stmt_info };
5269 array_slice<const stmt_vec_info> live_out_stmts = single_live_out_stmt;
5270 if (slp_reduc)
5271 /* All statements produce live-out values. */
5272 live_out_stmts = SLP_TREE_SCALAR_STMTS (slp_node);
5273 else if (slp_node)
5275 /* The last statement in the reduction chain produces the live-out
5276 value. Note SLP optimization can shuffle scalar stmts to
5277 optimize permutations so we have to search for the last stmt. */
5278 for (k = 0; k < group_size; ++k)
5279 if (!REDUC_GROUP_NEXT_ELEMENT (SLP_TREE_SCALAR_STMTS (slp_node)[k]))
5281 single_live_out_stmt[0] = SLP_TREE_SCALAR_STMTS (slp_node)[k];
5282 break;
5286 unsigned vec_num;
5287 int ncopies;
5288 if (slp_node)
5290 vec_num = SLP_TREE_VEC_STMTS (slp_node_instance->reduc_phis).length ();
5291 ncopies = 1;
5293 else
5295 stmt_vec_info reduc_info = loop_vinfo->lookup_stmt (reduc_def_stmt);
5296 vec_num = 1;
5297 ncopies = STMT_VINFO_VEC_STMTS (reduc_info).length ();
5300 /* For cond reductions we want to create a new vector (INDEX_COND_EXPR)
5301 which is updated with the current index of the loop for every match of
5302 the original loop's cond_expr (VEC_STMT). This results in a vector
5303 containing the last time the condition passed for that vector lane.
5304 The first match will be a 1 to allow 0 to be used for non-matching
5305 indexes. If there are no matches at all then the vector will be all
5306 zeroes.
5308 PR92772: This algorithm is broken for architectures that support
5309 masked vectors, but do not provide fold_extract_last. */
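  /* Illustrative lane values for this scheme, with four lanes and
     matches in iterations 3 and 6 (all numbers invented):

       chunk 0: indexes {1,2,3,4}, cond {0,0,1,0} -> index vec {0,0,3,0}
       chunk 1: indexes {5,6,7,8}, cond {0,1,0,0} -> index vec {0,6,3,0}

     The largest lane value (6) identifies the last iteration whose
     condition held and is used later to pick the matching data lane.  */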
5310 if (STMT_VINFO_REDUC_TYPE (reduc_info) == COND_REDUCTION)
5312 auto_vec<std::pair<tree, bool>, 2> ccompares;
5313 stmt_vec_info cond_info = STMT_VINFO_REDUC_DEF (reduc_info);
5314 cond_info = vect_stmt_to_vectorize (cond_info);
5315 while (cond_info != reduc_info)
5317 if (gimple_assign_rhs_code (cond_info->stmt) == COND_EXPR)
5319 gimple *vec_stmt = STMT_VINFO_VEC_STMTS (cond_info)[0];
5320 gcc_assert (gimple_assign_rhs_code (vec_stmt) == VEC_COND_EXPR);
5321 ccompares.safe_push
5322 (std::make_pair (unshare_expr (gimple_assign_rhs1 (vec_stmt)),
5323 STMT_VINFO_REDUC_IDX (cond_info) == 2));
5325 cond_info
5326 = loop_vinfo->lookup_def (gimple_op (cond_info->stmt,
5327 1 + STMT_VINFO_REDUC_IDX
5328 (cond_info)));
5329 cond_info = vect_stmt_to_vectorize (cond_info);
5331 gcc_assert (ccompares.length () != 0);
5333 tree indx_before_incr, indx_after_incr;
5334 poly_uint64 nunits_out = TYPE_VECTOR_SUBPARTS (vectype);
5335 int scalar_precision
5336 = GET_MODE_PRECISION (SCALAR_TYPE_MODE (TREE_TYPE (vectype)));
5337 tree cr_index_scalar_type = make_unsigned_type (scalar_precision);
5338 tree cr_index_vector_type = get_related_vectype_for_scalar_type
5339 (TYPE_MODE (vectype), cr_index_scalar_type,
5340 TYPE_VECTOR_SUBPARTS (vectype));
5342 /* First we create a simple vector induction variable which starts
5343 with the values {1,2,3,...} (SERIES_VECT) and increments by the
5344 vector size (STEP). */
5346 /* Create a {1,2,3,...} vector. */
5347 tree series_vect = build_index_vector (cr_index_vector_type, 1, 1);
5349 /* Create a vector of the step value. */
5350 tree step = build_int_cst (cr_index_scalar_type, nunits_out);
5351 tree vec_step = build_vector_from_val (cr_index_vector_type, step);
5353 /* Create an induction variable. */
5354 gimple_stmt_iterator incr_gsi;
5355 bool insert_after;
5356 standard_iv_increment_position (loop, &incr_gsi, &insert_after);
5357 create_iv (series_vect, vec_step, NULL_TREE, loop, &incr_gsi,
5358 insert_after, &indx_before_incr, &indx_after_incr);
5360 /* Next create a new phi node vector (NEW_PHI_TREE) which starts
5361 filled with zeros (VEC_ZERO). */
5363 /* Create a vector of 0s. */
5364 tree zero = build_zero_cst (cr_index_scalar_type);
5365 tree vec_zero = build_vector_from_val (cr_index_vector_type, zero);
5367 /* Create a vector phi node. */
5368 tree new_phi_tree = make_ssa_name (cr_index_vector_type);
5369 new_phi = create_phi_node (new_phi_tree, loop->header);
5370 add_phi_arg (as_a <gphi *> (new_phi), vec_zero,
5371 loop_preheader_edge (loop), UNKNOWN_LOCATION);
5373 /* Now take the condition from the loop's original cond_exprs
5374 and produce a new cond_expr (INDEX_COND_EXPR) which for
5375 every match uses values from the induction variable
5376 (INDEX_BEFORE_INCR) otherwise uses values from the phi node
5377 (NEW_PHI_TREE).
5378 Finally, we update the phi (NEW_PHI_TREE) to take the value of
5379 the new cond_expr (INDEX_COND_EXPR). */
5380 gimple_seq stmts = NULL;
5381 for (int i = ccompares.length () - 1; i != -1; --i)
5383 tree ccompare = ccompares[i].first;
5384 if (ccompares[i].second)
5385 new_phi_tree = gimple_build (&stmts, VEC_COND_EXPR,
5386 cr_index_vector_type,
5387 ccompare,
5388 indx_before_incr, new_phi_tree);
5389 else
5390 new_phi_tree = gimple_build (&stmts, VEC_COND_EXPR,
5391 cr_index_vector_type,
5392 ccompare,
5393 new_phi_tree, indx_before_incr);
5395 gsi_insert_seq_before (&incr_gsi, stmts, GSI_SAME_STMT);
5397 /* Update the phi with the vec cond. */
5398 induction_index = new_phi_tree;
5399 add_phi_arg (as_a <gphi *> (new_phi), induction_index,
5400 loop_latch_edge (loop), UNKNOWN_LOCATION);
5403 /* 2. Create epilog code.
5404 The reduction epilog code operates across the elements of the vector
5405 of partial results computed by the vectorized loop.
5406 The reduction epilog code consists of:
5408 step 1: compute the scalar result in a vector (v_out2)
5409 step 2: extract the scalar result (s_out3) from the vector (v_out2)
5410 step 3: adjust the scalar result (s_out3) if needed.
5412 Step 1 can be accomplished using one of the following three schemes:
5413 (scheme 1) using reduc_fn, if available.
5414 (scheme 2) using whole-vector shifts, if available.
5415 (scheme 3) using a scalar loop. In this case steps 1+2 above are
5416 combined.
5418 The overall epilog code looks like this:
5420 s_out0 = phi <s_loop> # original EXIT_PHI
5421 v_out1 = phi <VECT_DEF> # NEW_EXIT_PHI
5422 v_out2 = reduce <v_out1> # step 1
5423 s_out3 = extract_field <v_out2, 0> # step 2
5424 s_out4 = adjust_result <s_out3> # step 3
5426 (step 3 is optional, and steps 1 and 2 may be combined).
5427 Lastly, the uses of s_out0 are replaced by s_out4. */
5430 /* 2.1 Create new loop-exit-phis to preserve loop-closed form:
5431 v_out1 = phi <VECT_DEF>
5432 Store them in NEW_PHIS. */
5433 if (double_reduc)
5434 loop = outer_loop;
5435 exit_bb = single_exit (loop)->dest;
5436 exit_gsi = gsi_after_labels (exit_bb);
5437 reduc_inputs.create (slp_node ? vec_num : ncopies);
5438 for (unsigned i = 0; i < vec_num; i++)
5440 gimple_seq stmts = NULL;
5441 if (slp_node)
5442 def = vect_get_slp_vect_def (slp_node, i);
5443 else
5444 def = gimple_get_lhs (STMT_VINFO_VEC_STMTS (rdef_info)[0]);
5445 for (j = 0; j < ncopies; j++)
5447 tree new_def = copy_ssa_name (def);
5448 phi = create_phi_node (new_def, exit_bb);
5449 if (j)
5450 def = gimple_get_lhs (STMT_VINFO_VEC_STMTS (rdef_info)[j]);
5451 SET_PHI_ARG_DEF (phi, single_exit (loop)->dest_idx, def);
5452 new_def = gimple_convert (&stmts, vectype, new_def);
5453 reduc_inputs.quick_push (new_def);
5455 gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
5458 /* 2.2 Get the relevant tree-code to use in the epilog for schemes 2,3
5459 (i.e. when reduc_fn is not available) and in the final adjustment
5460 code (if needed). Also get the original scalar reduction variable as
5461 defined in the loop. In case STMT is a "pattern-stmt" (i.e. - it
5462 represents a reduction pattern), the tree-code and scalar-def are
5463 taken from the original stmt that the pattern-stmt (STMT) replaces.
5464 Otherwise (it is a regular reduction) - the tree-code and scalar-def
5465 are taken from STMT. */
5467 stmt_vec_info orig_stmt_info = vect_orig_stmt (stmt_info);
5468 if (orig_stmt_info != stmt_info)
5470 /* Reduction pattern */
5471 gcc_assert (STMT_VINFO_IN_PATTERN_P (orig_stmt_info));
5472 gcc_assert (STMT_VINFO_RELATED_STMT (orig_stmt_info) == stmt_info);
5475 scalar_dest = gimple_get_lhs (orig_stmt_info->stmt);
5476 scalar_type = TREE_TYPE (scalar_dest);
5477 scalar_results.truncate (0);
5478 scalar_results.reserve_exact (group_size);
5479 new_scalar_dest = vect_create_destination_var (scalar_dest, NULL);
5480 bitsize = TYPE_SIZE (scalar_type);
5482 /* True if we should implement SLP_REDUC using native reduction operations
5483 instead of scalar operations. */
5484 direct_slp_reduc = (reduc_fn != IFN_LAST
5485 && slp_reduc
5486 && !TYPE_VECTOR_SUBPARTS (vectype).is_constant ());
5488 /* In case of reduction chain, e.g.,
5489 # a1 = phi <a3, a0>
5490 a2 = operation (a1)
5491 a3 = operation (a2),
5493 we may end up with more than one vector result. Here we reduce them
5494 to one vector.
5496 The same is true if we couldn't use a single def-use cycle.
5497 if (REDUC_GROUP_FIRST_ELEMENT (stmt_info)
5498 || direct_slp_reduc
5499 || ncopies > 1)
5501 gimple_seq stmts = NULL;
5502 tree single_input = reduc_inputs[0];
5503 for (k = 1; k < reduc_inputs.length (); k++)
5504 single_input = gimple_build (&stmts, code, vectype,
5505 single_input, reduc_inputs[k]);
5506 gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
5508 reduc_inputs.truncate (0);
5509 reduc_inputs.safe_push (single_input);
5512 tree orig_reduc_input = reduc_inputs[0];
5514 /* If this loop is an epilogue loop that can be skipped after the
5515 main loop, we can only share a reduction operation between the
5516 main loop and the epilogue if we put it at the target of the
5517 skip edge.
5519 We can still reuse accumulators if this check fails. Doing so has
5520 the minor(?) benefit of making the epilogue loop's scalar result
5521 independent of the main loop's scalar result. */
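  /* Sketch of the merge created below (block and value names invented):

       reduc_block:   # target of skip_this_loop_edge
         vec_acc = PHI <epilogue partial result (edge from exit_bb),
                        reused main-loop accumulator (skip edge)>
         ... reduction epilogue emitted from here ...

     so a single copy of the reduction code serves both the "epilogue
     executed" and the "epilogue skipped" paths.  */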
5522 bool unify_with_main_loop_p = false;
5523 if (reduc_info->reused_accumulator
5524 && loop_vinfo->skip_this_loop_edge
5525 && single_succ_p (exit_bb)
5526 && single_succ (exit_bb) == loop_vinfo->skip_this_loop_edge->dest)
5528 unify_with_main_loop_p = true;
5530 basic_block reduc_block = loop_vinfo->skip_this_loop_edge->dest;
5531 reduc_inputs[0] = make_ssa_name (vectype);
5532 gphi *new_phi = create_phi_node (reduc_inputs[0], reduc_block);
5533 add_phi_arg (new_phi, orig_reduc_input, single_succ_edge (exit_bb),
5534 UNKNOWN_LOCATION);
5535 add_phi_arg (new_phi, reduc_info->reused_accumulator->reduc_input,
5536 loop_vinfo->skip_this_loop_edge, UNKNOWN_LOCATION);
5537 exit_gsi = gsi_after_labels (reduc_block);
5540 /* Shouldn't be used beyond this point. */
5541 exit_bb = nullptr;
5543 if (STMT_VINFO_REDUC_TYPE (reduc_info) == COND_REDUCTION
5544 && reduc_fn != IFN_LAST)
5546 /* For condition reductions, we have a vector (REDUC_INPUTS 0) containing
5547 various data values where the condition matched and another vector
5548 (INDUCTION_INDEX) containing all the indexes of those matches. We
5549 need to extract the last matching index (which will be the index with
5550 highest value) and use this to index into the data vector.
5551 For the case where there were no matches, the data vector will contain
5552 all default values and the index vector will be all zeros. */
5554 /* Get various versions of the type of the vector of indexes. */
5555 tree index_vec_type = TREE_TYPE (induction_index);
5556 gcc_checking_assert (TYPE_UNSIGNED (index_vec_type));
5557 tree index_scalar_type = TREE_TYPE (index_vec_type);
5558 tree index_vec_cmp_type = truth_type_for (index_vec_type);
5560 /* Get an unsigned integer version of the type of the data vector. */
5561 int scalar_precision
5562 = GET_MODE_PRECISION (SCALAR_TYPE_MODE (scalar_type));
5563 tree scalar_type_unsigned = make_unsigned_type (scalar_precision);
5564 tree vectype_unsigned = get_same_sized_vectype (scalar_type_unsigned,
5565 vectype);
5567 /* First we need to create a vector (ZERO_VEC) of zeros and another
5568 vector (MAX_INDEX_VEC) filled with the last matching index, which we
5569 can create using a MAX reduction and then expanding.
5570 In the case where the loop never made any matches, the max index will
5571 be zero. */
5573 /* Vector of {0, 0, 0,...}. */
5574 tree zero_vec = build_zero_cst (vectype);
5576 /* Find maximum value from the vector of found indexes. */
5577 tree max_index = make_ssa_name (index_scalar_type);
5578 gcall *max_index_stmt = gimple_build_call_internal (IFN_REDUC_MAX,
5579 1, induction_index);
5580 gimple_call_set_lhs (max_index_stmt, max_index);
5581 gsi_insert_before (&exit_gsi, max_index_stmt, GSI_SAME_STMT);
5583 /* Vector of {max_index, max_index, max_index,...}. */
5584 tree max_index_vec = make_ssa_name (index_vec_type);
5585 tree max_index_vec_rhs = build_vector_from_val (index_vec_type,
5586 max_index);
5587 gimple *max_index_vec_stmt = gimple_build_assign (max_index_vec,
5588 max_index_vec_rhs);
5589 gsi_insert_before (&exit_gsi, max_index_vec_stmt, GSI_SAME_STMT);
5591 /* Next we compare the new vector (MAX_INDEX_VEC) full of max indexes
5592 with the vector (INDUCTION_INDEX) of found indexes, choosing values
5593 from the data vector (REDUC_INPUTS 0) for matches, 0 (ZERO_VEC)
5594 otherwise. Only one value should match, resulting in a vector
5595 (VEC_COND) with one data value and the rest zeros.
5596 In the case where the loop never made any matches, every index will
5597 match, resulting in a vector with all data values (which will all be
5598 the default value). */
5600 /* Compare the max index vector to the vector of found indexes to find
5601 the position of the max value. */
5602 tree vec_compare = make_ssa_name (index_vec_cmp_type);
5603 gimple *vec_compare_stmt = gimple_build_assign (vec_compare, EQ_EXPR,
5604 induction_index,
5605 max_index_vec);
5606 gsi_insert_before (&exit_gsi, vec_compare_stmt, GSI_SAME_STMT);
5608 /* Use the compare to choose either values from the data vector or
5609 zero. */
5610 tree vec_cond = make_ssa_name (vectype);
5611 gimple *vec_cond_stmt = gimple_build_assign (vec_cond, VEC_COND_EXPR,
5612 vec_compare,
5613 reduc_inputs[0],
5614 zero_vec);
5615 gsi_insert_before (&exit_gsi, vec_cond_stmt, GSI_SAME_STMT);
5617 /* Finally we need to extract the data value from the vector (VEC_COND)
5618 into a scalar (MATCHED_DATA_REDUC). Logically we want to do an OR
5619 reduction, but because this doesn't exist, we can use a MAX reduction
5620 instead. The data value might be signed or a float so we need to cast
5621 it first.
5622 In the case where the loop never made any matches, the data values are
5623 all identical, and so will reduce down correctly. */
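     /* Worked example with invented integer lanes:

          reduc_inputs[0] : {  7, 42, 13,  9 }
          induction_index : {  0,  6,  0,  0 }   REDUC_MAX -> 6
          max_index_vec   : {  6,  6,  6,  6 }
          vec_compare     : {  0,  1,  0,  0 }
          vec_cond        : {  0, 42,  0,  0 }   REDUC_MAX -> 42

        so 42, the value stored by the last matching iteration, becomes
        the scalar result.  */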
5625 /* Make the matched data values unsigned. */
5626 tree vec_cond_cast = make_ssa_name (vectype_unsigned);
5627 tree vec_cond_cast_rhs = build1 (VIEW_CONVERT_EXPR, vectype_unsigned,
5628 vec_cond);
5629 gimple *vec_cond_cast_stmt = gimple_build_assign (vec_cond_cast,
5630 VIEW_CONVERT_EXPR,
5631 vec_cond_cast_rhs);
5632 gsi_insert_before (&exit_gsi, vec_cond_cast_stmt, GSI_SAME_STMT);
5634 /* Reduce down to a scalar value. */
5635 tree data_reduc = make_ssa_name (scalar_type_unsigned);
5636 gcall *data_reduc_stmt = gimple_build_call_internal (IFN_REDUC_MAX,
5637 1, vec_cond_cast);
5638 gimple_call_set_lhs (data_reduc_stmt, data_reduc);
5639 gsi_insert_before (&exit_gsi, data_reduc_stmt, GSI_SAME_STMT);
5641 /* Convert the reduced value back to the result type and set as the
5642 result. */
5643 gimple_seq stmts = NULL;
5644 new_temp = gimple_build (&stmts, VIEW_CONVERT_EXPR, scalar_type,
5645 data_reduc);
5646 gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
5647 scalar_results.safe_push (new_temp);
5649 else if (STMT_VINFO_REDUC_TYPE (reduc_info) == COND_REDUCTION
5650 && reduc_fn == IFN_LAST)
5652 /* Condition reduction without supported IFN_REDUC_MAX. Generate
5653 idx = 0;
5654 idx_val = induction_index[0];
5655 val = data_reduc[0];
5656 for (idx = 0, val = init, i = 0; i < nelts; ++i)
5657 if (induction_index[i] > idx_val)
5658 val = data_reduc[i], idx_val = induction_index[i];
5659 return val; */
5661 tree data_eltype = TREE_TYPE (vectype);
5662 tree idx_eltype = TREE_TYPE (TREE_TYPE (induction_index));
5663 unsigned HOST_WIDE_INT el_size = tree_to_uhwi (TYPE_SIZE (idx_eltype));
5664 poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (TREE_TYPE (induction_index));
5665 /* Enforced by vectorizable_reduction, which ensures we have target
5666 support before allowing a conditional reduction on variable-length
5667 vectors. */
5668 unsigned HOST_WIDE_INT v_size = el_size * nunits.to_constant ();
5669 tree idx_val = NULL_TREE, val = NULL_TREE;
5670 for (unsigned HOST_WIDE_INT off = 0; off < v_size; off += el_size)
5672 tree old_idx_val = idx_val;
5673 tree old_val = val;
5674 idx_val = make_ssa_name (idx_eltype);
5675 epilog_stmt = gimple_build_assign (idx_val, BIT_FIELD_REF,
5676 build3 (BIT_FIELD_REF, idx_eltype,
5677 induction_index,
5678 bitsize_int (el_size),
5679 bitsize_int (off)));
5680 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5681 val = make_ssa_name (data_eltype);
5682 epilog_stmt = gimple_build_assign (val, BIT_FIELD_REF,
5683 build3 (BIT_FIELD_REF,
5684 data_eltype,
5685 reduc_inputs[0],
5686 bitsize_int (el_size),
5687 bitsize_int (off)));
5688 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5689 if (off != 0)
5691 tree new_idx_val = idx_val;
5692 if (off != v_size - el_size)
5694 new_idx_val = make_ssa_name (idx_eltype);
5695 epilog_stmt = gimple_build_assign (new_idx_val,
5696 MAX_EXPR, idx_val,
5697 old_idx_val);
5698 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5700 tree new_val = make_ssa_name (data_eltype);
5701 epilog_stmt = gimple_build_assign (new_val,
5702 COND_EXPR,
5703 build2 (GT_EXPR,
5704 boolean_type_node,
5705 idx_val,
5706 old_idx_val),
5707 val, old_val);
5708 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5709 idx_val = new_idx_val;
5710 val = new_val;
5713 /* Convert the reduced value back to the result type and set as the
5714 result. */
5715 gimple_seq stmts = NULL;
5716 val = gimple_convert (&stmts, scalar_type, val);
5717 gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
5718 scalar_results.safe_push (val);
5721 /* 2.3 Create the reduction code, using one of the three schemes described
5722 above. In SLP we simply need to extract all the elements from the
5723 vector (without reducing them), so we use scalar shifts. */
5724 else if (reduc_fn != IFN_LAST && !slp_reduc)
5726 tree tmp;
5727 tree vec_elem_type;
5729 /* Case 1: Create:
5730 v_out2 = reduc_expr <v_out1> */
5732 if (dump_enabled_p ())
5733 dump_printf_loc (MSG_NOTE, vect_location,
5734 "Reduce using direct vector reduction.\n");
5736 gimple_seq stmts = NULL;
5737 vec_elem_type = TREE_TYPE (vectype);
5738 new_temp = gimple_build (&stmts, as_combined_fn (reduc_fn),
5739 vec_elem_type, reduc_inputs[0]);
5740 new_temp = gimple_convert (&stmts, scalar_type, new_temp);
5741 gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
5743 if ((STMT_VINFO_REDUC_TYPE (reduc_info) == INTEGER_INDUC_COND_REDUCTION)
5744 && induc_val)
5746 /* Earlier we set the initial value to be a vector of induc_val
5747 values. Check the result and if it is induc_val then replace
5748 with the original initial value, unless induc_val is
5749 the same as initial_def already. */
5750 tree zcompare = build2 (EQ_EXPR, boolean_type_node, new_temp,
5751 induc_val);
5752 tree initial_def = reduc_info->reduc_initial_values[0];
5754 tmp = make_ssa_name (new_scalar_dest);
5755 epilog_stmt = gimple_build_assign (tmp, COND_EXPR, zcompare,
5756 initial_def, new_temp);
5757 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5758 new_temp = tmp;
5761 scalar_results.safe_push (new_temp);
5763 else if (direct_slp_reduc)
5765 /* Here we create one vector for each of the REDUC_GROUP_SIZE results,
5766 with the elements for other SLP statements replaced with the
5767 neutral value. We can then do a normal reduction on each vector. */
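     /* Sketch with an invented group size of 2 and eight lanes, where N
        stands for the neutral (or initial) value:

          index & 1         : { 0,  1,  0,  1,  0,  1,  0,  1 }
          vector for stmt 0 : { x0, N,  x2, N,  x4, N,  x6, N }
          vector for stmt 1 : { N,  x1, N,  x3, N,  x5, N,  x7 }

        Each masked vector is then reduced with REDUC_FN as usual.  */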
5769 /* Enforced by vectorizable_reduction. */
5770 gcc_assert (reduc_inputs.length () == 1);
5771 gcc_assert (pow2p_hwi (group_size));
5773 gimple_seq seq = NULL;
5775 /* Build a vector {0, 1, 2, ...}, with the same number of elements
5776 and the same element size as VECTYPE. */
5777 tree index = build_index_vector (vectype, 0, 1);
5778 tree index_type = TREE_TYPE (index);
5779 tree index_elt_type = TREE_TYPE (index_type);
5780 tree mask_type = truth_type_for (index_type);
5782 /* Create a vector that, for each element, identifies which of
5783 the REDUC_GROUP_SIZE results should use it. */
5784 tree index_mask = build_int_cst (index_elt_type, group_size - 1);
5785 index = gimple_build (&seq, BIT_AND_EXPR, index_type, index,
5786 build_vector_from_val (index_type, index_mask));
5788 /* Get a neutral vector value. This is simply a splat of the neutral
5789 scalar value if we have one, otherwise the initial scalar value
5790 is itself a neutral value. */
5791 tree vector_identity = NULL_TREE;
5792 tree neutral_op = NULL_TREE;
5793 if (slp_node)
5795 tree initial_value = NULL_TREE;
5796 if (REDUC_GROUP_FIRST_ELEMENT (stmt_info))
5797 initial_value = reduc_info->reduc_initial_values[0];
5798 neutral_op = neutral_op_for_reduction (TREE_TYPE (vectype), code,
5799 initial_value);
5801 if (neutral_op)
5802 vector_identity = gimple_build_vector_from_val (&seq, vectype,
5803 neutral_op);
5804 for (unsigned int i = 0; i < group_size; ++i)
5806 /* If there's no universal neutral value, we can use the
5807 initial scalar value from the original PHI. This is used
5808 for MIN and MAX reduction, for example. */
5809 if (!neutral_op)
5811 tree scalar_value = reduc_info->reduc_initial_values[i];
5812 scalar_value = gimple_convert (&seq, TREE_TYPE (vectype),
5813 scalar_value);
5814 vector_identity = gimple_build_vector_from_val (&seq, vectype,
5815 scalar_value);
5818 /* Calculate the equivalent of:
5820 sel[j] = (index[j] == i);
5822 which selects the elements of REDUC_INPUTS[0] that should
5823 be included in the result. */
5824 tree compare_val = build_int_cst (index_elt_type, i);
5825 compare_val = build_vector_from_val (index_type, compare_val);
5826 tree sel = gimple_build (&seq, EQ_EXPR, mask_type,
5827 index, compare_val);
5829 /* Calculate the equivalent of:
5831 vec = seq ? reduc_inputs[0] : vector_identity;
5833 VEC is now suitable for a full vector reduction. */
5834 tree vec = gimple_build (&seq, VEC_COND_EXPR, vectype,
5835 sel, reduc_inputs[0], vector_identity);
5837 /* Do the reduction and convert it to the appropriate type. */
5838 tree scalar = gimple_build (&seq, as_combined_fn (reduc_fn),
5839 TREE_TYPE (vectype), vec);
5840 scalar = gimple_convert (&seq, scalar_type, scalar);
5841 scalar_results.safe_push (scalar);
5843 gsi_insert_seq_before (&exit_gsi, seq, GSI_SAME_STMT);
5845 else
5847 bool reduce_with_shift;
5848 tree vec_temp;
5850 gcc_assert (slp_reduc || reduc_inputs.length () == 1);
5852 /* See if the target wants to do the final (shift) reduction
5853 in a vector mode of smaller size and first reduce upper/lower
5854 halves against each other. */
5855 enum machine_mode mode1 = mode;
5856 tree stype = TREE_TYPE (vectype);
5857 unsigned nunits = TYPE_VECTOR_SUBPARTS (vectype).to_constant ();
5858 unsigned nunits1 = nunits;
5859 if ((mode1 = targetm.vectorize.split_reduction (mode)) != mode
5860 && reduc_inputs.length () == 1)
5862 nunits1 = GET_MODE_NUNITS (mode1).to_constant ();
5863 /* For SLP reductions we have to make sure lanes match up, but
5864 since we're doing individual-element final reduction, reducing
5865 the vector width here is even more important.
5866 ??? We can also separate lanes with permutes, for the common
5867 case of power-of-two group-size odd/even extracts would work. */
5868 if (slp_reduc && nunits != nunits1)
5870 nunits1 = least_common_multiple (nunits1, group_size);
5871 gcc_assert (exact_log2 (nunits1) != -1 && nunits1 <= nunits);
5874 if (!slp_reduc
5875 && (mode1 = targetm.vectorize.split_reduction (mode)) != mode)
5876 nunits1 = GET_MODE_NUNITS (mode1).to_constant ();
5878 tree vectype1 = get_related_vectype_for_scalar_type (TYPE_MODE (vectype),
5879 stype, nunits1);
5880 reduce_with_shift = have_whole_vector_shift (mode1);
5881 if (!VECTOR_MODE_P (mode1)
5882 || !directly_supported_p (code, vectype1))
5883 reduce_with_shift = false;
5885 /* First reduce the vector to the size we want to do the shift
5886 reduction on, by combining upper and lower halves. */
5887 gimple_seq stmts = NULL;
5888 new_temp = vect_create_partial_epilog (reduc_inputs[0], vectype1,
5889 code, &stmts);
5890 gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
5891 reduc_inputs[0] = new_temp;
5893 if (reduce_with_shift && !slp_reduc)
5895 int element_bitsize = tree_to_uhwi (bitsize);
5896 /* Enforced by vectorizable_reduction, which disallows SLP reductions
5897 for variable-length vectors and also requires direct target support
5898 for loop reductions. */
5899 int vec_size_in_bits = tree_to_uhwi (TYPE_SIZE (vectype1));
5900 int nelements = vec_size_in_bits / element_bitsize;
5901 vec_perm_builder sel;
5902 vec_perm_indices indices;
5904 int elt_offset;
5906 tree zero_vec = build_zero_cst (vectype1);
5907 /* Case 2: Create:
5908 for (offset = nelements/2; offset >= 1; offset/=2)
5910 Create: va' = vec_shift <va, offset>
5911 Create: va = vop <va, va'>
5912 } */
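          /* Worked V4SI PLUS example (lane values invented):

               va             = {  1, 2, 3, 4 }
               va' = shift 2  = {  3, 4, 0, 0 },  va = {  4, 6, 3, 4 }
               va' = shift 1  = {  6, 3, 4, 0 },  va = { 10, 9, 7, 4 }

             element 0 now holds the full sum (10) and is extracted as
             the scalar result in step 2.4 below.  */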
5914 tree rhs;
5916 if (dump_enabled_p ())
5917 dump_printf_loc (MSG_NOTE, vect_location,
5918 "Reduce using vector shifts\n");
5920 gimple_seq stmts = NULL;
5921 new_temp = gimple_convert (&stmts, vectype1, new_temp);
5922 for (elt_offset = nelements / 2;
5923 elt_offset >= 1;
5924 elt_offset /= 2)
5926 calc_vec_perm_mask_for_shift (elt_offset, nelements, &sel);
5927 indices.new_vector (sel, 2, nelements);
5928 tree mask = vect_gen_perm_mask_any (vectype1, indices);
5929 new_name = gimple_build (&stmts, VEC_PERM_EXPR, vectype1,
5930 new_temp, zero_vec, mask);
5931 new_temp = gimple_build (&stmts, code,
5932 vectype1, new_name, new_temp);
5934 gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
5936 /* 2.4 Extract the final scalar result. Create:
5937 s_out3 = extract_field <v_out2, bitpos> */
5939 if (dump_enabled_p ())
5940 dump_printf_loc (MSG_NOTE, vect_location,
5941 "extract scalar result\n");
5943 rhs = build3 (BIT_FIELD_REF, scalar_type, new_temp,
5944 bitsize, bitsize_zero_node);
5945 epilog_stmt = gimple_build_assign (new_scalar_dest, rhs);
5946 new_temp = make_ssa_name (new_scalar_dest, epilog_stmt);
5947 gimple_assign_set_lhs (epilog_stmt, new_temp);
5948 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5949 scalar_results.safe_push (new_temp);
5951 else
5953 /* Case 3: Create:
5954 s = extract_field <v_out2, 0>
5955 for (offset = element_size;
5956 offset < vector_size;
5957 offset += element_size;)
5959 Create: s' = extract_field <v_out2, offset>
5960 Create: s = op <s, s'> // For non SLP cases
5961 } */
5963 if (dump_enabled_p ())
5964 dump_printf_loc (MSG_NOTE, vect_location,
5965 "Reduce using scalar code.\n");
5967 int vec_size_in_bits = tree_to_uhwi (TYPE_SIZE (vectype1));
5968 int element_bitsize = tree_to_uhwi (bitsize);
5969 tree compute_type = TREE_TYPE (vectype);
5970 gimple_seq stmts = NULL;
5971 FOR_EACH_VEC_ELT (reduc_inputs, i, vec_temp)
5973 int bit_offset;
5974 new_temp = gimple_build (&stmts, BIT_FIELD_REF, compute_type,
5975 vec_temp, bitsize, bitsize_zero_node);
5977 /* In SLP we don't need to apply the reduction operation, so we just
5978 collect s' values in SCALAR_RESULTS. */
5979 if (slp_reduc)
5980 scalar_results.safe_push (new_temp);
5982 for (bit_offset = element_bitsize;
5983 bit_offset < vec_size_in_bits;
5984 bit_offset += element_bitsize)
5986 tree bitpos = bitsize_int (bit_offset);
5987 new_name = gimple_build (&stmts, BIT_FIELD_REF,
5988 compute_type, vec_temp,
5989 bitsize, bitpos);
5990 if (slp_reduc)
5992 /* In SLP we don't need to apply the reduction operation, so
5993 we just collect s' values in SCALAR_RESULTS. */
5994 new_temp = new_name;
5995 scalar_results.safe_push (new_name);
5997 else
5998 new_temp = gimple_build (&stmts, code, compute_type,
5999 new_name, new_temp);
6003 /* The only case where we need to reduce scalar results in SLP is
6004 unrolling. If the size of SCALAR_RESULTS is greater than
6005 REDUC_GROUP_SIZE, we reduce them combining elements modulo
6006 REDUC_GROUP_SIZE. */
6007 if (slp_reduc)
6009 tree res, first_res, new_res;
6011 /* Reduce multiple scalar results in case of SLP unrolling. */
6012 for (j = group_size; scalar_results.iterate (j, &res);
6013 j++)
6015 first_res = scalar_results[j % group_size];
6016 new_res = gimple_build (&stmts, code, compute_type,
6017 first_res, res);
6018 scalar_results[j % group_size] = new_res;
6020 scalar_results.truncate (group_size);
6021 for (k = 0; k < group_size; k++)
6022 scalar_results[k] = gimple_convert (&stmts, scalar_type,
6023 scalar_results[k]);
6025 else
6027 /* Not SLP - we have one scalar to keep in SCALAR_RESULTS. */
6028 new_temp = gimple_convert (&stmts, scalar_type, new_temp);
6029 scalar_results.safe_push (new_temp);
6032 gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
6035 if ((STMT_VINFO_REDUC_TYPE (reduc_info) == INTEGER_INDUC_COND_REDUCTION)
6036 && induc_val)
6038 /* Earlier we set the initial value to be a vector of induc_val
6039 values. Check the result and if it is induc_val then replace
6040 with the original initial value, unless induc_val is
6041 the same as initial_def already. */
6042 tree zcompare = build2 (EQ_EXPR, boolean_type_node, new_temp,
6043 induc_val);
6044 tree initial_def = reduc_info->reduc_initial_values[0];
6046 tree tmp = make_ssa_name (new_scalar_dest);
6047 epilog_stmt = gimple_build_assign (tmp, COND_EXPR, zcompare,
6048 initial_def, new_temp);
6049 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
6050 scalar_results[0] = tmp;
6054 /* 2.5 Adjust the final result by the initial value of the reduction
6055 variable. (When such adjustment is not needed, then
6056 'adjustment_def' is zero). For example, if code is PLUS we create:
6057 new_temp = loop_exit_def + adjustment_def */
6059 if (adjustment_def)
6061 gcc_assert (!slp_reduc);
6062 gimple_seq stmts = NULL;
6063 if (double_reduc)
6065 gcc_assert (VECTOR_TYPE_P (TREE_TYPE (adjustment_def)));
6066 adjustment_def = gimple_convert (&stmts, vectype, adjustment_def);
6067 new_temp = gimple_build (&stmts, code, vectype,
6068 reduc_inputs[0], adjustment_def);
6070 else
6072 new_temp = scalar_results[0];
6073 gcc_assert (TREE_CODE (TREE_TYPE (adjustment_def)) != VECTOR_TYPE);
6074 adjustment_def = gimple_convert (&stmts, scalar_type, adjustment_def);
6075 new_temp = gimple_build (&stmts, code, scalar_type,
6076 new_temp, adjustment_def);
6079 epilog_stmt = gimple_seq_last_stmt (stmts);
6080 gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
6081 scalar_results[0] = new_temp;
6084 /* Record this operation if it could be reused by the epilogue loop. */
6085 if (STMT_VINFO_REDUC_TYPE (reduc_info) == TREE_CODE_REDUCTION)
6086 loop_vinfo->reusable_accumulators.put (scalar_results[0],
6087 { orig_reduc_input, reduc_info });
6089 if (double_reduc)
6090 loop = outer_loop;
6092 /* 2.6 Handle the loop-exit phis. Replace the uses of scalar loop-exit
6093 phis with new adjusted scalar results, i.e., replace use <s_out0>
6094 with use <s_out4>.
6096 Transform:
6097 loop_exit:
6098 s_out0 = phi <s_loop> # (scalar) EXIT_PHI
6099 v_out1 = phi <VECT_DEF> # NEW_EXIT_PHI
6100 v_out2 = reduce <v_out1>
6101 s_out3 = extract_field <v_out2, 0>
6102 s_out4 = adjust_result <s_out3>
6103 use <s_out0>
6104 use <s_out0>
6106 into:
6108 loop_exit:
6109 s_out0 = phi <s_loop> # (scalar) EXIT_PHI
6110 v_out1 = phi <VECT_DEF> # NEW_EXIT_PHI
6111 v_out2 = reduce <v_out1>
6112 s_out3 = extract_field <v_out2, 0>
6113 s_out4 = adjust_result <s_out3>
6114 use <s_out4>
6115 use <s_out4> */
6117 gcc_assert (live_out_stmts.size () == scalar_results.length ());
6118 for (k = 0; k < live_out_stmts.size (); k++)
6120 stmt_vec_info scalar_stmt_info = vect_orig_stmt (live_out_stmts[k]);
6121 scalar_dest = gimple_get_lhs (scalar_stmt_info->stmt);
6123 phis.create (3);
6124 /* Find the loop-closed-use at the loop exit of the original scalar
6125 result. (The reduction result is expected to have two immediate uses,
6126 one at the latch block, and one at the loop exit). For double
6127 reductions we are looking for exit phis of the outer loop. */
6128 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, scalar_dest)
6130 if (!flow_bb_inside_loop_p (loop, gimple_bb (USE_STMT (use_p))))
6132 if (!is_gimple_debug (USE_STMT (use_p)))
6133 phis.safe_push (USE_STMT (use_p));
6135 else
6137 if (double_reduc && gimple_code (USE_STMT (use_p)) == GIMPLE_PHI)
6139 tree phi_res = PHI_RESULT (USE_STMT (use_p));
6141 FOR_EACH_IMM_USE_FAST (phi_use_p, phi_imm_iter, phi_res)
6143 if (!flow_bb_inside_loop_p (loop,
6144 gimple_bb (USE_STMT (phi_use_p)))
6145 && !is_gimple_debug (USE_STMT (phi_use_p)))
6146 phis.safe_push (USE_STMT (phi_use_p));
6152 FOR_EACH_VEC_ELT (phis, i, exit_phi)
6154 /* Replace the uses: */
6155 orig_name = PHI_RESULT (exit_phi);
6157 /* Look for a single use at the target of the skip edge. */
6158 if (unify_with_main_loop_p)
6160 use_operand_p use_p;
6161 gimple *user;
6162 if (!single_imm_use (orig_name, &use_p, &user))
6163 gcc_unreachable ();
6164 orig_name = gimple_get_lhs (user);
6167 scalar_result = scalar_results[k];
6168 FOR_EACH_IMM_USE_STMT (use_stmt, imm_iter, orig_name)
6170 FOR_EACH_IMM_USE_ON_STMT (use_p, imm_iter)
6171 SET_USE (use_p, scalar_result);
6172 update_stmt (use_stmt);
6176 phis.release ();
6180 /* Return a vector of type VECTYPE that is equal to the vector select
6181 operation "MASK ? VEC : IDENTITY". Insert the select statements
6182 before GSI. */
6184 static tree
6185 merge_with_identity (gimple_stmt_iterator *gsi, tree mask, tree vectype,
6186 tree vec, tree identity)
6188 tree cond = make_temp_ssa_name (vectype, NULL, "cond");
6189 gimple *new_stmt = gimple_build_assign (cond, VEC_COND_EXPR,
6190 mask, vec, identity);
6191 gsi_insert_before (gsi, new_stmt, GSI_SAME_STMT);
6192 return cond;
6195 /* Successively apply CODE to each element of VECTOR_RHS, in left-to-right
6196 order, starting with LHS. Insert the extraction statements before GSI and
6197 associate the new scalar SSA names with variable SCALAR_DEST.
6198 Return the SSA name for the result. */
6200 static tree
6201 vect_expand_fold_left (gimple_stmt_iterator *gsi, tree scalar_dest,
6202 tree_code code, tree lhs, tree vector_rhs)
6204 tree vectype = TREE_TYPE (vector_rhs);
6205 tree scalar_type = TREE_TYPE (vectype);
6206 tree bitsize = TYPE_SIZE (scalar_type);
6207 unsigned HOST_WIDE_INT vec_size_in_bits = tree_to_uhwi (TYPE_SIZE (vectype));
6208 unsigned HOST_WIDE_INT element_bitsize = tree_to_uhwi (bitsize);
6210 for (unsigned HOST_WIDE_INT bit_offset = 0;
6211 bit_offset < vec_size_in_bits;
6212 bit_offset += element_bitsize)
6214 tree bitpos = bitsize_int (bit_offset);
6215 tree rhs = build3 (BIT_FIELD_REF, scalar_type, vector_rhs,
6216 bitsize, bitpos);
6218 gassign *stmt = gimple_build_assign (scalar_dest, rhs);
6219 rhs = make_ssa_name (scalar_dest, stmt);
6220 gimple_assign_set_lhs (stmt, rhs);
6221 gsi_insert_before (gsi, stmt, GSI_SAME_STMT);
6223 stmt = gimple_build_assign (scalar_dest, code, lhs, rhs);
6224 tree new_name = make_ssa_name (scalar_dest, stmt);
6225 gimple_assign_set_lhs (stmt, new_name);
6226 gsi_insert_before (gsi, stmt, GSI_SAME_STMT);
6227 lhs = new_name;
6229 return lhs;
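/* For a four-element vector and PLUS_EXPR the expansion above amounts
   to (SSA names invented):

     s1 = lhs + v[0];
     s2 = s1  + v[1];
     s3 = s2  + v[2];
     s4 = s3  + v[3];

   i.e. the elements are folded in strictly left-to-right order.  */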
6232 /* Get a masked internal function equivalent to REDUC_FN. VECTYPE_IN is the
6233 type of the vector input. */
6235 static internal_fn
6236 get_masked_reduction_fn (internal_fn reduc_fn, tree vectype_in)
6238 internal_fn mask_reduc_fn;
6240 switch (reduc_fn)
6242 case IFN_FOLD_LEFT_PLUS:
6243 mask_reduc_fn = IFN_MASK_FOLD_LEFT_PLUS;
6244 break;
6246 default:
6247 return IFN_LAST;
6250 if (direct_internal_fn_supported_p (mask_reduc_fn, vectype_in,
6251 OPTIMIZE_FOR_SPEED))
6252 return mask_reduc_fn;
6253 return IFN_LAST;
6256 /* Perform an in-order reduction (FOLD_LEFT_REDUCTION). STMT_INFO is the
6257 statement that sets the live-out value. REDUC_DEF_STMT is the phi
6258 statement. CODE is the operation performed by STMT_INFO and OPS are
6259 its scalar operands. REDUC_INDEX is the index of the operand in
6260 OPS that is set by REDUC_DEF_STMT. REDUC_FN is the function that
6261 implements in-order reduction, or IFN_LAST if we should open-code it.
6262 VECTYPE_IN is the type of the vector input. MASKS specifies the masks
6263 that should be used to control the operation in a fully-masked loop. */
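/* For example (types invented, no -ffast-math):

     float s = 0.0f;
     for (i = 0; i < n; i++)
       s += a[i];

   must be evaluated as ((((s + a[0]) + a[1]) + a[2]) + ...).  The code
   below preserves that order either through IFN_FOLD_LEFT_PLUS (or its
   masked variant) or by expanding per-element scalar operations with
   vect_expand_fold_left.  */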
6265 static bool
6266 vectorize_fold_left_reduction (loop_vec_info loop_vinfo,
6267 stmt_vec_info stmt_info,
6268 gimple_stmt_iterator *gsi,
6269 gimple **vec_stmt, slp_tree slp_node,
6270 gimple *reduc_def_stmt,
6271 tree_code code, internal_fn reduc_fn,
6272 tree ops[3], tree vectype_in,
6273 int reduc_index, vec_loop_masks *masks)
6275 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
6276 tree vectype_out = STMT_VINFO_VECTYPE (stmt_info);
6277 internal_fn mask_reduc_fn = get_masked_reduction_fn (reduc_fn, vectype_in);
6279 int ncopies;
6280 if (slp_node)
6281 ncopies = 1;
6282 else
6283 ncopies = vect_get_num_copies (loop_vinfo, vectype_in);
6285 gcc_assert (!nested_in_vect_loop_p (loop, stmt_info));
6286 gcc_assert (ncopies == 1);
6287 gcc_assert (TREE_CODE_LENGTH (code) == binary_op);
6289 if (slp_node)
6290 gcc_assert (known_eq (TYPE_VECTOR_SUBPARTS (vectype_out),
6291 TYPE_VECTOR_SUBPARTS (vectype_in)));
6293 tree op0 = ops[1 - reduc_index];
6295 int group_size = 1;
6296 stmt_vec_info scalar_dest_def_info;
6297 auto_vec<tree> vec_oprnds0;
6298 if (slp_node)
6300 auto_vec<vec<tree> > vec_defs (2);
6301 vect_get_slp_defs (loop_vinfo, slp_node, &vec_defs);
6302 vec_oprnds0.safe_splice (vec_defs[1 - reduc_index]);
6303 vec_defs[0].release ();
6304 vec_defs[1].release ();
6305 group_size = SLP_TREE_SCALAR_STMTS (slp_node).length ();
6306 scalar_dest_def_info = SLP_TREE_SCALAR_STMTS (slp_node)[group_size - 1];
6308 else
6310 vect_get_vec_defs_for_operand (loop_vinfo, stmt_info, 1,
6311 op0, &vec_oprnds0);
6312 scalar_dest_def_info = stmt_info;
6315 tree scalar_dest = gimple_assign_lhs (scalar_dest_def_info->stmt);
6316 tree scalar_type = TREE_TYPE (scalar_dest);
6317 tree reduc_var = gimple_phi_result (reduc_def_stmt);
6319 int vec_num = vec_oprnds0.length ();
6320 gcc_assert (vec_num == 1 || slp_node);
6321 tree vec_elem_type = TREE_TYPE (vectype_out);
6322 gcc_checking_assert (useless_type_conversion_p (scalar_type, vec_elem_type));
6324 tree vector_identity = NULL_TREE;
6325 if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
6326 vector_identity = build_zero_cst (vectype_out);
6328 tree scalar_dest_var = vect_create_destination_var (scalar_dest, NULL);
6329 int i;
6330 tree def0;
6331 FOR_EACH_VEC_ELT (vec_oprnds0, i, def0)
6333 gimple *new_stmt;
6334 tree mask = NULL_TREE;
6335 if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
6336 mask = vect_get_loop_mask (gsi, masks, vec_num, vectype_in, i);
6338 /* Handle MINUS by adding the negative. */
6339 if (reduc_fn != IFN_LAST && code == MINUS_EXPR)
6341 tree negated = make_ssa_name (vectype_out);
6342 new_stmt = gimple_build_assign (negated, NEGATE_EXPR, def0);
6343 gsi_insert_before (gsi, new_stmt, GSI_SAME_STMT);
6344 def0 = negated;
6347 if (mask && mask_reduc_fn == IFN_LAST)
6348 def0 = merge_with_identity (gsi, mask, vectype_out, def0,
6349 vector_identity);
6351 /* On the first iteration the input is simply the scalar phi
6352 result, and for subsequent iterations it is the output of
6353 the preceding operation. */
6354 if (reduc_fn != IFN_LAST || (mask && mask_reduc_fn != IFN_LAST))
6356 if (mask && mask_reduc_fn != IFN_LAST)
6357 new_stmt = gimple_build_call_internal (mask_reduc_fn, 3, reduc_var,
6358 def0, mask);
6359 else
6360 new_stmt = gimple_build_call_internal (reduc_fn, 2, reduc_var,
6361 def0);
6362 /* For chained SLP reductions the output of the previous reduction
6363 operation serves as the input of the next. For the final statement
6364 the output cannot be a temporary - we reuse the original
6365 scalar destination of the last statement. */
6366 if (i != vec_num - 1)
6368 gimple_set_lhs (new_stmt, scalar_dest_var);
6369 reduc_var = make_ssa_name (scalar_dest_var, new_stmt);
6370 gimple_set_lhs (new_stmt, reduc_var);
6373 else
6375 reduc_var = vect_expand_fold_left (gsi, scalar_dest_var, code,
6376 reduc_var, def0);
6377 new_stmt = SSA_NAME_DEF_STMT (reduc_var);
6378 /* Remove the statement, so that we can use the same code paths
6379 as for statements that we've just created. */
6380 gimple_stmt_iterator tmp_gsi = gsi_for_stmt (new_stmt);
6381 gsi_remove (&tmp_gsi, true);
6384 if (i == vec_num - 1)
6386 gimple_set_lhs (new_stmt, scalar_dest);
6387 vect_finish_replace_stmt (loop_vinfo,
6388 scalar_dest_def_info,
6389 new_stmt);
6391 else
6392 vect_finish_stmt_generation (loop_vinfo,
6393 scalar_dest_def_info,
6394 new_stmt, gsi);
6396 if (slp_node)
6397 SLP_TREE_VEC_STMTS (slp_node).quick_push (new_stmt);
6398 else
6400 STMT_VINFO_VEC_STMTS (stmt_info).safe_push (new_stmt);
6401 *vec_stmt = new_stmt;
6405 return true;
6408 /* Function is_nonwrapping_integer_induction.
6410 Check if STMT_VINFO (which is part of loop LOOP) is an induction that
6411 increments and does not cause overflow. */
6413 static bool
6414 is_nonwrapping_integer_induction (stmt_vec_info stmt_vinfo, class loop *loop)
6416 gphi *phi = as_a <gphi *> (stmt_vinfo->stmt);
6417 tree base = STMT_VINFO_LOOP_PHI_EVOLUTION_BASE_UNCHANGED (stmt_vinfo);
6418 tree step = STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_vinfo);
6419 tree lhs_type = TREE_TYPE (gimple_phi_result (phi));
6420 widest_int ni, max_loop_value, lhs_max;
6421 wi::overflow_type overflow = wi::OVF_NONE;
6423 /* Make sure the loop is integer based. */
6424 if (TREE_CODE (base) != INTEGER_CST
6425 || TREE_CODE (step) != INTEGER_CST)
6426 return false;
6428 /* Check that the max size of the loop will not wrap. */
6430 if (TYPE_OVERFLOW_UNDEFINED (lhs_type))
6431 return true;
6433 if (! max_stmt_executions (loop, &ni))
6434 return false;
6436 max_loop_value = wi::mul (wi::to_widest (step), ni, TYPE_SIGN (lhs_type),
6437 &overflow);
6438 if (overflow)
6439 return false;
6441 max_loop_value = wi::add (wi::to_widest (base), max_loop_value,
6442 TYPE_SIGN (lhs_type), &overflow);
6443 if (overflow)
6444 return false;
6446 return (wi::min_precision (max_loop_value, TYPE_SIGN (lhs_type))
6447 <= TYPE_PRECISION (lhs_type));
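/* For instance (values invented): with base = 0, step = 4, at most
   ni = 1000 iterations and a 16-bit LHS type, the code above computes
   max_loop_value = 0 + 4 * 1000 = 4000, which still fits in the 16-bit
   precision, so the induction is known not to wrap.  */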
6450 /* Check if masking can be supported by inserting a conditional expression.
6451 CODE is the code for the operation. COND_FN is the conditional internal
6452 function, if it exists. VECTYPE_IN is the type of the vector input. */
6453 static bool
6454 use_mask_by_cond_expr_p (code_helper code, internal_fn cond_fn,
6455 tree vectype_in)
6457 if (cond_fn != IFN_LAST
6458 && direct_internal_fn_supported_p (cond_fn, vectype_in,
6459 OPTIMIZE_FOR_SPEED))
6460 return false;
6462 if (code.is_tree_code ())
6463 switch (tree_code (code))
6465 case DOT_PROD_EXPR:
6466 case SAD_EXPR:
6467 return true;
6469 default:
6470 break;
6472 return false;
6475 /* Insert a conditional expression to enable masked vectorization. CODE is the
6476 code for the operation. VOP is the array of operands. MASK is the loop
6477 mask. GSI is a statement iterator used to place the new conditional
6478 expression. */
6479 static void
6480 build_vect_cond_expr (code_helper code, tree vop[3], tree mask,
6481 gimple_stmt_iterator *gsi)
6483 switch (tree_code (code))
6485 case DOT_PROD_EXPR:
6487 tree vectype = TREE_TYPE (vop[1]);
6488 tree zero = build_zero_cst (vectype);
6489 tree masked_op1 = make_temp_ssa_name (vectype, NULL, "masked_op1");
6490 gassign *select = gimple_build_assign (masked_op1, VEC_COND_EXPR,
6491 mask, vop[1], zero);
6492 gsi_insert_before (gsi, select, GSI_SAME_STMT);
6493 vop[1] = masked_op1;
6494 break;
6497 case SAD_EXPR:
6499 tree vectype = TREE_TYPE (vop[1]);
6500 tree masked_op1 = make_temp_ssa_name (vectype, NULL, "masked_op1");
6501 gassign *select = gimple_build_assign (masked_op1, VEC_COND_EXPR,
6502 mask, vop[1], vop[0]);
6503 gsi_insert_before (gsi, select, GSI_SAME_STMT);
6504 vop[1] = masked_op1;
6505 break;
6508 default:
6509 gcc_unreachable ();
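/* Why the selections above are safe (a sketch, per masked-off lane i):

     DOT_PROD:  vop0[i] * (mask[i] ? vop1[i] : 0)         contributes 0
     SAD:       |vop0[i] - (mask[i] ? vop1[i] : vop0[i])|  contributes 0

   so inactive lanes add nothing to the accumulator and no conditional
   internal function is needed for these codes.  */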
6513 /* Function vectorizable_reduction.
6515 Check if STMT_INFO performs a reduction operation that can be vectorized.
6516 If VEC_STMT is also passed, vectorize STMT_INFO: create a vectorized
6517 stmt to replace it, put it in VEC_STMT, and insert it at GSI.
6518 Return true if STMT_INFO is vectorizable in this way.
6520 This function also handles reduction idioms (patterns) that have been
6521 recognized in advance during vect_pattern_recog. In this case, STMT_INFO
6522 may be of this form:
6523 X = pattern_expr (arg0, arg1, ..., X)
6524 and its STMT_VINFO_RELATED_STMT points to the last stmt in the original
6525 sequence that had been detected and replaced by the pattern-stmt
6526 (STMT_INFO).
6528 This function also handles reduction of condition expressions, for example:
6529 for (int i = 0; i < N; i++)
6530 if (a[i] < value)
6531 last = a[i];
6532 This is handled by vectorizing the loop and creating an additional vector
6533 containing the loop indexes for which "a[i] < value" was true. In the
6534 function epilogue this is reduced to a single max value and then used to
6535 index into the vector of results.
6537 In some cases of reduction patterns, the type of the reduction variable X is
6538 different than the type of the other arguments of STMT_INFO.
6539 In such cases, the vectype that is used when transforming STMT_INFO into
6540 a vector stmt is different than the vectype that is used to determine the
6541 vectorization factor, because it consists of a different number of elements
6542 than the actual number of elements that are being operated upon in parallel.
6544 For example, consider an accumulation of shorts into an int accumulator.
6545 On some targets it's possible to vectorize this pattern operating on 8
6546 shorts at a time (hence, the vectype for purposes of determining the
6547 vectorization factor should be V8HI); on the other hand, the vectype that
6548 is used to create the vector form is actually V4SI (the type of the result).
6550 Upon entry to this function, STMT_VINFO_VECTYPE records the vectype that
6551 indicates what is the actual level of parallelism (V8HI in the example), so
6552 that the right vectorization factor would be derived. This vectype
6553 corresponds to the type of arguments to the reduction stmt, and should *NOT*
6554 be used to create the vectorized stmt. The right vectype for the vectorized
6555 stmt is obtained from the type of the result X:
6556 get_vectype_for_scalar_type (vinfo, TREE_TYPE (X))
6558 This means that, contrary to "regular" reductions (or "regular" stmts in
6559 general), the following equation:
6560 STMT_VINFO_VECTYPE == get_vectype_for_scalar_type (vinfo, TREE_TYPE (X))
6561 does *NOT* necessarily hold for reduction patterns. */
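/* As C source, the shorts-into-int accumulation discussed above is
   simply (array and bound invented):

     short a[N];
     int sum = 0;
     for (i = 0; i < N; i++)
       sum += a[i];

   where STMT_VINFO_VECTYPE records V8HI (eight shorts determine the
   vectorization factor) while the vectorized statement itself must be
   created with the V4SI result type, as described above.  */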
6563 bool
6564 vectorizable_reduction (loop_vec_info loop_vinfo,
6565 stmt_vec_info stmt_info, slp_tree slp_node,
6566 slp_instance slp_node_instance,
6567 stmt_vector_for_cost *cost_vec)
6569 tree vectype_in = NULL_TREE;
6570 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
6571 enum vect_def_type cond_reduc_dt = vect_unknown_def_type;
6572 stmt_vec_info cond_stmt_vinfo = NULL;
6573 int i;
6574 int ncopies;
6575 bool single_defuse_cycle = false;
6576 bool nested_cycle = false;
6577 bool double_reduc = false;
6578 int vec_num;
6579 tree tem;
6580 tree cr_index_scalar_type = NULL_TREE, cr_index_vector_type = NULL_TREE;
6581 tree cond_reduc_val = NULL_TREE;
6583 /* Make sure it was already recognized as a reduction computation. */
6584 if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_reduction_def
6585 && STMT_VINFO_DEF_TYPE (stmt_info) != vect_double_reduction_def
6586 && STMT_VINFO_DEF_TYPE (stmt_info) != vect_nested_cycle)
6587 return false;
6589 /* The stmt we store reduction analysis meta on. */
6590 stmt_vec_info reduc_info = info_for_reduction (loop_vinfo, stmt_info);
6591 reduc_info->is_reduc_info = true;
6593 if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_nested_cycle)
6595 if (is_a <gphi *> (stmt_info->stmt))
6597 if (slp_node)
6599 /* We eventually need to set a vector type on invariant
6600 arguments. */
6601 unsigned j;
6602 slp_tree child;
6603 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (slp_node), j, child)
6604 if (!vect_maybe_update_slp_op_vectype
6605 (child, SLP_TREE_VECTYPE (slp_node)))
6607 if (dump_enabled_p ())
6608 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6609 "incompatible vector types for "
6610 "invariants\n");
6611 return false;
6614 /* Analysis for double-reduction is done on the outer
6615 loop PHI, nested cycles have no further restrictions. */
6616 STMT_VINFO_TYPE (stmt_info) = cycle_phi_info_type;
6618 else
6619 STMT_VINFO_TYPE (stmt_info) = reduc_vec_info_type;
6620 return true;
6623 stmt_vec_info orig_stmt_of_analysis = stmt_info;
6624 stmt_vec_info phi_info = stmt_info;
6625 if (!is_a <gphi *> (stmt_info->stmt))
6627 STMT_VINFO_TYPE (stmt_info) = reduc_vec_info_type;
6628 return true;
6630 if (slp_node)
6632 slp_node_instance->reduc_phis = slp_node;
6633 /* ??? We're leaving slp_node to point to the PHIs; we only
6634 need it to get at the number of vector stmts which wasn't
6635 yet initialized for the instance root. */
6637 if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_double_reduction_def)
6639 use_operand_p use_p;
6640 gimple *use_stmt;
6641 bool res = single_imm_use (gimple_phi_result (stmt_info->stmt),
6642 &use_p, &use_stmt);
6643 gcc_assert (res);
6644 phi_info = loop_vinfo->lookup_stmt (use_stmt);
6647 /* PHIs should not participate in patterns. */
6648 gcc_assert (!STMT_VINFO_RELATED_STMT (phi_info));
6649 gphi *reduc_def_phi = as_a <gphi *> (phi_info->stmt);
6651 /* Verify following REDUC_IDX from the latch def leads us back to the PHI
6652 and compute the reduction chain length. Discover the real
6653 reduction operation stmt on the way (stmt_info and slp_for_stmt_info). */
6654 tree reduc_def
6655 = PHI_ARG_DEF_FROM_EDGE (reduc_def_phi,
6656 loop_latch_edge
6657 (gimple_bb (reduc_def_phi)->loop_father));
6658 unsigned reduc_chain_length = 0;
6659 bool only_slp_reduc_chain = true;
6660 stmt_info = NULL;
6661 slp_tree slp_for_stmt_info = slp_node ? slp_node_instance->root : NULL;
6662 while (reduc_def != PHI_RESULT (reduc_def_phi))
6664 stmt_vec_info def = loop_vinfo->lookup_def (reduc_def);
6665 stmt_vec_info vdef = vect_stmt_to_vectorize (def);
6666 if (STMT_VINFO_REDUC_IDX (vdef) == -1)
6668 if (dump_enabled_p ())
6669 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6670 "reduction chain broken by patterns.\n");
6671 return false;
6673 if (!REDUC_GROUP_FIRST_ELEMENT (vdef))
6674 only_slp_reduc_chain = false;
6675 /* ??? For epilogue generation live members of the chain need
6676 to point back to the PHI via their original stmt for
6677 info_for_reduction to work. */
6678 if (STMT_VINFO_LIVE_P (vdef))
6679 STMT_VINFO_REDUC_DEF (def) = phi_info;
6680 gimple_match_op op;
6681 if (!gimple_extract_op (vdef->stmt, &op))
6683 if (dump_enabled_p ())
6684 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6685 "reduction chain includes unsupported"
6686 " statement type.\n");
6687 return false;
6689 if (CONVERT_EXPR_CODE_P (op.code))
6691 if (!tree_nop_conversion_p (op.type, TREE_TYPE (op.ops[0])))
6693 if (dump_enabled_p ())
6694 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6695 "conversion in the reduction chain.\n");
6696 return false;
6699 else if (!stmt_info)
6700 /* First non-conversion stmt. */
6701 stmt_info = vdef;
6702 reduc_def = op.ops[STMT_VINFO_REDUC_IDX (vdef)];
6703 reduc_chain_length++;
6704 if (!stmt_info && slp_node)
6705 slp_for_stmt_info = SLP_TREE_CHILDREN (slp_for_stmt_info)[0];
6707 /* PHIs should not participate in patterns. */
6708 gcc_assert (!STMT_VINFO_RELATED_STMT (phi_info));
6710 if (nested_in_vect_loop_p (loop, stmt_info))
6712 loop = loop->inner;
6713 nested_cycle = true;
6716 /* STMT_VINFO_REDUC_DEF doesn't point to the first but the last
6717 element. */
6718 if (slp_node && REDUC_GROUP_FIRST_ELEMENT (stmt_info))
6720 gcc_assert (!REDUC_GROUP_NEXT_ELEMENT (stmt_info));
6721 stmt_info = REDUC_GROUP_FIRST_ELEMENT (stmt_info);
6723 if (REDUC_GROUP_FIRST_ELEMENT (stmt_info))
6724 gcc_assert (slp_node
6725 && REDUC_GROUP_FIRST_ELEMENT (stmt_info) == stmt_info);
6727 /* 1. Is vectorizable reduction? */
6728 /* Not supportable if the reduction variable is used in the loop, unless
6729 it's a reduction chain. */
6730 if (STMT_VINFO_RELEVANT (stmt_info) > vect_used_in_outer
6731 && !REDUC_GROUP_FIRST_ELEMENT (stmt_info))
6732 return false;
6734 /* Reductions that are not used even in an enclosing outer-loop,
6735 are expected to be "live" (used out of the loop). */
6736 if (STMT_VINFO_RELEVANT (stmt_info) == vect_unused_in_scope
6737 && !STMT_VINFO_LIVE_P (stmt_info))
6738 return false;
6740 /* 2. Has this been recognized as a reduction pattern?
6742 Check if STMT represents a pattern that has been recognized
6743 in earlier analysis stages. For stmts that represent a pattern,
6744 the STMT_VINFO_RELATED_STMT field records the last stmt in
6745 the original sequence that constitutes the pattern. */
6747 stmt_vec_info orig_stmt_info = STMT_VINFO_RELATED_STMT (stmt_info);
6748 if (orig_stmt_info)
6750 gcc_assert (STMT_VINFO_IN_PATTERN_P (orig_stmt_info));
6751 gcc_assert (!STMT_VINFO_IN_PATTERN_P (stmt_info));
6754 /* 3. Check the operands of the operation. The first operands are defined
6755 inside the loop body. The last operand is the reduction variable,
6756 which is defined by the loop-header-phi. */
6758 tree vectype_out = STMT_VINFO_VECTYPE (stmt_info);
6759 STMT_VINFO_REDUC_VECTYPE (reduc_info) = vectype_out;
6760 gimple_match_op op;
6761 if (!gimple_extract_op (stmt_info->stmt, &op))
6762 gcc_unreachable ();
6763 bool lane_reduc_code_p = (op.code == DOT_PROD_EXPR
6764 || op.code == WIDEN_SUM_EXPR
6765 || op.code == SAD_EXPR);
6766 enum optab_subtype optab_query_kind = optab_vector;
6767 if (op.code == DOT_PROD_EXPR
6768 && (TYPE_SIGN (TREE_TYPE (op.ops[0]))
6769 != TYPE_SIGN (TREE_TYPE (op.ops[1]))))
6770 optab_query_kind = optab_vector_mixed_sign;
6772 if (!POINTER_TYPE_P (op.type) && !INTEGRAL_TYPE_P (op.type)
6773 && !SCALAR_FLOAT_TYPE_P (op.type))
6774 return false;
6776 /* Do not try to vectorize bit-precision reductions. */
6777 if (!type_has_mode_precision_p (op.type))
6778 return false;
6780 /* For lane-reducing ops we're reducing the number of reduction PHIs,
6781 which means the only use of the reduction PHI may be in the lane-reducing operation. */
6782 if (lane_reduc_code_p
6783 && reduc_chain_length != 1
6784 && !only_slp_reduc_chain)
6786 if (dump_enabled_p ())
6787 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6788 "lane-reducing reduction with extra stmts.\n");
6789 return false;
6792 /* All uses but the last are expected to be defined in the loop.
6793 The last use is the reduction variable. In case of nested cycle this
6794 assumption is not true: we use reduc_index to record the index of the
6795 reduction variable. */
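/* For illustration (a minimal sketch): in a scalar reduction statement
   such as

     s_1 = a[i] + s_0;

   the operand s_0 flowing in from the loop-header PHI is the one whose
   index STMT_VINFO_REDUC_IDX records; all other operands are expected
   to be defined inside the loop.  */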
6796 slp_tree *slp_op = XALLOCAVEC (slp_tree, op.num_ops);
6797 /* We need to skip an extra operand for COND_EXPRs with embedded
6798 comparison. */
6799 unsigned opno_adjust = 0;
6800 if (op.code == COND_EXPR && COMPARISON_CLASS_P (op.ops[0]))
6801 opno_adjust = 1;
6802 for (i = 0; i < (int) op.num_ops; i++)
6804 /* The condition of COND_EXPR is checked in vectorizable_condition(). */
6805 if (i == 0 && op.code == COND_EXPR)
6806 continue;
6808 stmt_vec_info def_stmt_info;
6809 enum vect_def_type dt;
6810 if (!vect_is_simple_use (loop_vinfo, stmt_info, slp_for_stmt_info,
6811 i + opno_adjust, &op.ops[i], &slp_op[i], &dt,
6812 &tem, &def_stmt_info))
6814 if (dump_enabled_p ())
6815 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6816 "use not simple.\n");
6817 return false;
6819 if (i == STMT_VINFO_REDUC_IDX (stmt_info))
6820 continue;
6822 /* There should be only one cycle def in the stmt, the one
6823 leading to reduc_def. */
6824 if (VECTORIZABLE_CYCLE_DEF (dt))
6825 return false;
6827 /* To properly compute ncopies we are interested in the widest
6828 non-reduction input type in case we're looking at a widening
6829 accumulation that we later handle in vect_transform_reduction. */
6830 if (lane_reduc_code_p
6831 && tem
6832 && (!vectype_in
6833 || (GET_MODE_SIZE (SCALAR_TYPE_MODE (TREE_TYPE (vectype_in)))
6834 < GET_MODE_SIZE (SCALAR_TYPE_MODE (TREE_TYPE (tem))))))
6835 vectype_in = tem;
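/* E.g. (illustrative): for a DOT_PROD_EXPR accumulating products of
   chars into an int accumulator, the non-reduction inputs are the char
   operands, so vectype_in becomes the vector-of-chars type and ncopies
   is derived from it rather than from the int accumulator's vector
   type.  */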
6837 if (op.code == COND_EXPR)
6839 /* Record how the non-reduction-def value of COND_EXPR is defined. */
6840 if (dt == vect_constant_def)
6842 cond_reduc_dt = dt;
6843 cond_reduc_val = op.ops[i];
6845 if (dt == vect_induction_def
6846 && def_stmt_info
6847 && is_nonwrapping_integer_induction (def_stmt_info, loop))
6849 cond_reduc_dt = dt;
6850 cond_stmt_vinfo = def_stmt_info;
6854 if (!vectype_in)
6855 vectype_in = STMT_VINFO_VECTYPE (phi_info);
6856 STMT_VINFO_REDUC_VECTYPE_IN (reduc_info) = vectype_in;
6858 enum vect_reduction_type v_reduc_type = STMT_VINFO_REDUC_TYPE (phi_info);
6859 STMT_VINFO_REDUC_TYPE (reduc_info) = v_reduc_type;
6860 /* If we have a condition reduction, see if we can simplify it further. */
6861 if (v_reduc_type == COND_REDUCTION)
6863 if (slp_node)
6864 return false;
6866 /* When the reduction value is used in the condition itself, fail. */
6867 if (STMT_VINFO_REDUC_IDX (stmt_info) == 0)
6869 if (dump_enabled_p ())
6870 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6871 "condition depends on previous iteration\n");
6872 return false;
6875 if (reduc_chain_length == 1
6876 && direct_internal_fn_supported_p (IFN_FOLD_EXTRACT_LAST,
6877 vectype_in, OPTIMIZE_FOR_SPEED))
6879 if (dump_enabled_p ())
6880 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6881 "optimizing condition reduction with"
6882 " FOLD_EXTRACT_LAST.\n");
6883 STMT_VINFO_REDUC_TYPE (reduc_info) = EXTRACT_LAST_REDUCTION;
6885 else if (cond_reduc_dt == vect_induction_def)
6887 tree base
6888 = STMT_VINFO_LOOP_PHI_EVOLUTION_BASE_UNCHANGED (cond_stmt_vinfo);
6889 tree step = STMT_VINFO_LOOP_PHI_EVOLUTION_PART (cond_stmt_vinfo);
6891 gcc_assert (TREE_CODE (base) == INTEGER_CST
6892 && TREE_CODE (step) == INTEGER_CST);
6893 cond_reduc_val = NULL_TREE;
6894 enum tree_code cond_reduc_op_code = ERROR_MARK;
6895 tree res = PHI_RESULT (STMT_VINFO_STMT (cond_stmt_vinfo));
6896 if (!types_compatible_p (TREE_TYPE (res), TREE_TYPE (base)))
6898 /* Find a suitable value: below base for MAX_EXPR, above base for
6899 MIN_EXPR; for now punt if base is the minimum value of the type
6900 for MAX_EXPR or the maximum value of the type for MIN_EXPR. */
6901 else if (tree_int_cst_sgn (step) == -1)
6903 cond_reduc_op_code = MIN_EXPR;
6904 if (tree_int_cst_sgn (base) == -1)
6905 cond_reduc_val = build_int_cst (TREE_TYPE (base), 0);
6906 else if (tree_int_cst_lt (base,
6907 TYPE_MAX_VALUE (TREE_TYPE (base))))
6908 cond_reduc_val
6909 = int_const_binop (PLUS_EXPR, base, integer_one_node);
6911 else
6913 cond_reduc_op_code = MAX_EXPR;
6914 if (tree_int_cst_sgn (base) == 1)
6915 cond_reduc_val = build_int_cst (TREE_TYPE (base), 0);
6916 else if (tree_int_cst_lt (TYPE_MIN_VALUE (TREE_TYPE (base)),
6917 base))
6918 cond_reduc_val
6919 = int_const_binop (MINUS_EXPR, base, integer_one_node);
6921 if (cond_reduc_val)
6923 if (dump_enabled_p ())
6924 dump_printf_loc (MSG_NOTE, vect_location,
6925 "condition expression based on "
6926 "integer induction.\n");
6927 STMT_VINFO_REDUC_CODE (reduc_info) = cond_reduc_op_code;
6928 STMT_VINFO_VEC_INDUC_COND_INITIAL_VAL (reduc_info)
6929 = cond_reduc_val;
6930 STMT_VINFO_REDUC_TYPE (reduc_info) = INTEGER_INDUC_COND_REDUCTION;
6933 else if (cond_reduc_dt == vect_constant_def)
6935 enum vect_def_type cond_initial_dt;
6936 tree cond_initial_val = vect_phi_initial_value (reduc_def_phi);
6937 vect_is_simple_use (cond_initial_val, loop_vinfo, &cond_initial_dt);
6938 if (cond_initial_dt == vect_constant_def
6939 && types_compatible_p (TREE_TYPE (cond_initial_val),
6940 TREE_TYPE (cond_reduc_val)))
6942 tree e = fold_binary (LE_EXPR, boolean_type_node,
6943 cond_initial_val, cond_reduc_val);
6944 if (e && (integer_onep (e) || integer_zerop (e)))
6946 if (dump_enabled_p ())
6947 dump_printf_loc (MSG_NOTE, vect_location,
6948 "condition expression based on "
6949 "compile time constant.\n");
6950 /* Record reduction code at analysis stage. */
6951 STMT_VINFO_REDUC_CODE (reduc_info)
6952 = integer_onep (e) ? MAX_EXPR : MIN_EXPR;
6953 STMT_VINFO_REDUC_TYPE (reduc_info) = CONST_COND_REDUCTION;
6959 if (STMT_VINFO_LIVE_P (phi_info))
6960 return false;
6962 if (slp_node)
6963 ncopies = 1;
6964 else
6965 ncopies = vect_get_num_copies (loop_vinfo, vectype_in);
6967 gcc_assert (ncopies >= 1);
6969 poly_uint64 nunits_out = TYPE_VECTOR_SUBPARTS (vectype_out);
6971 if (nested_cycle)
6973 gcc_assert (STMT_VINFO_DEF_TYPE (reduc_info)
6974 == vect_double_reduction_def);
6975 double_reduc = true;
6978 /* 4.2. Check support for the epilog operation.
6980 If STMT represents a reduction pattern, then the type of the
6981 reduction variable may be different than the type of the rest
6982 of the arguments. For example, consider the case of accumulation
6983 of shorts into an int accumulator. The original code:
6984 S1: int_a = (int) short_a;
6985 orig_stmt-> S2: int_acc = plus <int_a ,int_acc>;
6987 was replaced with:
6988 STMT: int_acc = widen_sum <short_a, int_acc>
6990 This means that:
6991 1. The tree-code that is used to create the vector operation in the
6992 epilog code (that reduces the partial results) is not the
6993 tree-code of STMT, but is rather the tree-code of the original
6994 stmt from the pattern that STMT is replacing. I.e, in the example
6995 above we want to use 'widen_sum' in the loop, but 'plus' in the
6996 epilog.
6997 2. The type (mode) we use to check available target support
6998 for the vector operation to be created in the *epilog*, is
6999 determined by the type of the reduction variable (in the example
7000 above we'd check this: optab_handler (plus_optab, vect_int_mode)).
7001 However the type (mode) we use to check available target support
7002 for the vector operation to be created *inside the loop*, is
7003 determined by the type of the other arguments to STMT (in the
7004 example we'd check this: optab_handler (widen_sum_optab,
7005 vect_short_mode)).
7007 This is contrary to "regular" reductions, in which the types of all
7008 the arguments are the same as the type of the reduction variable.
7009 For "regular" reductions we can therefore use the same vector type
7010 (and also the same tree-code) when generating the epilog code and
7011 when generating the code inside the loop. */
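/* As a concrete source-level illustration of the situation described
   above (a minimal sketch, not the exact pattern-matched IL):

     short b[N];
     int sum = 0;
     for (i = 0; i < N; i++)
       sum += b[i];

   The loop body is vectorized with WIDEN_SUM on the vector of shorts,
   while the epilog that combines the partial sums uses a plain PLUS
   on the vector of ints, hence the two separate support checks.  */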
7013 code_helper orig_code = STMT_VINFO_REDUC_CODE (phi_info);
7014 STMT_VINFO_REDUC_CODE (reduc_info) = orig_code;
7016 vect_reduction_type reduction_type = STMT_VINFO_REDUC_TYPE (reduc_info);
7017 if (reduction_type == TREE_CODE_REDUCTION)
7019 /* Check whether it's ok to change the order of the computation.
7020 Generally, when vectorizing a reduction we change the order of the
7021 computation. This may change the behavior of the program in some
7022 cases, so we need to check that this is ok. One exception is when
7023 vectorizing an outer-loop: the inner-loop is executed sequentially,
7024 and therefore vectorizing reductions in the inner-loop during
7025 outer-loop vectorization is safe. Likewise when we are vectorizing
7026 a series of reductions using SLP and the VF is one, the reductions
7027 are performed in scalar order. */
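/* For illustration (a minimal sketch): without -fassociative-math a
   floating-point accumulation such as

     double s = 0.0;
     for (i = 0; i < n; i++)
       s += a[i];

   must preserve the scalar evaluation order and can only be handled
   as a FOLD_LEFT_REDUCTION below, whereas an integer sum may be
   freely reassociated into partial sums.  */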
7028 if (slp_node
7029 && !REDUC_GROUP_FIRST_ELEMENT (stmt_info)
7030 && known_eq (LOOP_VINFO_VECT_FACTOR (loop_vinfo), 1u))
7032 else if (needs_fold_left_reduction_p (op.type, orig_code))
7034 /* When vectorizing a reduction chain w/o SLP the reduction PHI
7035 is not directly used in stmt. */
7036 if (!only_slp_reduc_chain
7037 && reduc_chain_length != 1)
7039 if (dump_enabled_p ())
7040 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7041 "in-order reduction chain without SLP.\n");
7042 return false;
7044 STMT_VINFO_REDUC_TYPE (reduc_info)
7045 = reduction_type = FOLD_LEFT_REDUCTION;
7047 else if (!commutative_binary_op_p (orig_code, op.type)
7048 || !associative_binary_op_p (orig_code, op.type))
7050 if (dump_enabled_p ())
7051 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7052 "reduction: not commutative/associative");
7053 return false;
7057 if ((double_reduc || reduction_type != TREE_CODE_REDUCTION)
7058 && ncopies > 1)
7060 if (dump_enabled_p ())
7061 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7062 "multiple types in double reduction or condition "
7063 "reduction or fold-left reduction.\n");
7064 return false;
7067 internal_fn reduc_fn = IFN_LAST;
7068 if (reduction_type == TREE_CODE_REDUCTION
7069 || reduction_type == FOLD_LEFT_REDUCTION
7070 || reduction_type == INTEGER_INDUC_COND_REDUCTION
7071 || reduction_type == CONST_COND_REDUCTION)
7073 if (reduction_type == FOLD_LEFT_REDUCTION
7074 ? fold_left_reduction_fn (orig_code, &reduc_fn)
7075 : reduction_fn_for_scalar_code (orig_code, &reduc_fn))
7077 if (reduc_fn != IFN_LAST
7078 && !direct_internal_fn_supported_p (reduc_fn, vectype_out,
7079 OPTIMIZE_FOR_SPEED))
7081 if (dump_enabled_p ())
7082 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7083 "reduc op not supported by target.\n");
7085 reduc_fn = IFN_LAST;
7088 else
7090 if (!nested_cycle || double_reduc)
7092 if (dump_enabled_p ())
7093 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7094 "no reduc code for scalar code.\n");
7096 return false;
7100 else if (reduction_type == COND_REDUCTION)
7102 int scalar_precision
7103 = GET_MODE_PRECISION (SCALAR_TYPE_MODE (op.type));
7104 cr_index_scalar_type = make_unsigned_type (scalar_precision);
7105 cr_index_vector_type = get_same_sized_vectype (cr_index_scalar_type,
7106 vectype_out);
7108 if (direct_internal_fn_supported_p (IFN_REDUC_MAX, cr_index_vector_type,
7109 OPTIMIZE_FOR_SPEED))
7110 reduc_fn = IFN_REDUC_MAX;
7112 STMT_VINFO_REDUC_FN (reduc_info) = reduc_fn;
7114 if (reduction_type != EXTRACT_LAST_REDUCTION
7115 && (!nested_cycle || double_reduc)
7116 && reduc_fn == IFN_LAST
7117 && !nunits_out.is_constant ())
7119 if (dump_enabled_p ())
7120 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7121 "missing target support for reduction on"
7122 " variable-length vectors.\n");
7123 return false;
7126 /* For SLP reductions, see if there is a neutral value we can use. */
7127 tree neutral_op = NULL_TREE;
7128 if (slp_node)
7130 tree initial_value = NULL_TREE;
7131 if (REDUC_GROUP_FIRST_ELEMENT (stmt_info) != NULL)
7132 initial_value = vect_phi_initial_value (reduc_def_phi);
7133 neutral_op = neutral_op_for_reduction (TREE_TYPE (vectype_out),
7134 orig_code, initial_value);
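/* E.g. (a sketch of what neutral_op_for_reduction yields): zero for
   PLUS_EXPR, BIT_IOR_EXPR and BIT_XOR_EXPR, one for MULT_EXPR,
   all-ones for BIT_AND_EXPR, and the initial value itself for
   MIN_EXPR and MAX_EXPR.  */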
7137 if (double_reduc && reduction_type == FOLD_LEFT_REDUCTION)
7139 /* We can't support in-order reductions of code such as this:
7141 for (int i = 0; i < n1; ++i)
7142 for (int j = 0; j < n2; ++j)
7143 l += a[j];
7145 since GCC effectively transforms the loop when vectorizing:
7147 for (int i = 0; i < n1 / VF; ++i)
7148 for (int j = 0; j < n2; ++j)
7149 for (int k = 0; k < VF; ++k)
7150 l += a[j];
7152 which is a reassociation of the original operation. */
7153 if (dump_enabled_p ())
7154 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7155 "in-order double reduction not supported.\n");
7157 return false;
7160 if (reduction_type == FOLD_LEFT_REDUCTION
7161 && slp_node
7162 && !REDUC_GROUP_FIRST_ELEMENT (stmt_info))
7164 /* We cannot use in-order reductions in this case because there is
7165 an implicit reassociation of the operations involved. */
7166 if (dump_enabled_p ())
7167 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7168 "in-order unchained SLP reductions not supported.\n");
7169 return false;
7172 /* For double reductions, and for SLP reductions with a neutral value,
7173 we construct a variable-length initial vector by loading a vector
7174 full of the neutral value and then shift-and-inserting the start
7175 values into the low-numbered elements. */
7176 if ((double_reduc || neutral_op)
7177 && !nunits_out.is_constant ()
7178 && !direct_internal_fn_supported_p (IFN_VEC_SHL_INSERT,
7179 vectype_out, OPTIMIZE_FOR_SPEED))
7181 if (dump_enabled_p ())
7182 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7183 "reduction on variable-length vectors requires"
7184 " target support for a vector-shift-and-insert"
7185 " operation.\n");
7186 return false;
7189 /* Check extra constraints for variable-length unchained SLP reductions. */
7190 if (STMT_SLP_TYPE (stmt_info)
7191 && !REDUC_GROUP_FIRST_ELEMENT (stmt_info)
7192 && !nunits_out.is_constant ())
7194 /* We checked above that we could build the initial vector when
7195 there's a neutral element value. Check here for the case in
7196 which each SLP statement has its own initial value and in which
7197 that value needs to be repeated for every instance of the
7198 statement within the initial vector. */
7199 unsigned int group_size = SLP_TREE_LANES (slp_node);
7200 if (!neutral_op
7201 && !can_duplicate_and_interleave_p (loop_vinfo, group_size,
7202 TREE_TYPE (vectype_out)))
7204 if (dump_enabled_p ())
7205 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7206 "unsupported form of SLP reduction for"
7207 " variable-length vectors: cannot build"
7208 " initial vector.\n");
7209 return false;
7211 /* The epilogue code relies on the number of elements being a multiple
7212 of the group size. The duplicate-and-interleave approach to setting
7213 up the initial vector does too. */
7214 if (!multiple_p (nunits_out, group_size))
7216 if (dump_enabled_p ())
7217 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7218 "unsupported form of SLP reduction for"
7219 " variable-length vectors: the vector size"
7220 " is not a multiple of the number of results.\n");
7221 return false;
7225 if (reduction_type == COND_REDUCTION)
7227 widest_int ni;
7229 if (! max_loop_iterations (loop, &ni))
7231 if (dump_enabled_p ())
7232 dump_printf_loc (MSG_NOTE, vect_location,
7233 "loop count not known, cannot create cond "
7234 "reduction.\n");
7235 return false;
7237 /* Convert backedges to iterations. */
7238 ni += 1;
7240 /* The additional index will be the same type as the condition. Check
7241 that the loop count fits into this type less one (because we'll use
7242 up the zero slot for when there are no matches). */
7243 tree max_index = TYPE_MAX_VALUE (cr_index_scalar_type);
7244 if (wi::geu_p (ni, wi::to_widest (max_index)))
7246 if (dump_enabled_p ())
7247 dump_printf_loc (MSG_NOTE, vect_location,
7248 "loop size is greater than data size.\n");
7249 return false;
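/* Worked example (illustrative): if op.type has 8 bits the index
   vector uses an 8-bit unsigned type whose maximum value is 255;
   since index zero is reserved for "no match", a loop whose
   iteration count reaches that maximum is rejected above.  */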
7253 /* In case the vectorization factor (VF) is bigger than the number
7254 of elements that we can fit in a vectype (nunits), we have to generate
7255 more than one vector stmt - i.e - we need to "unroll" the
7256 vector stmt by a factor VF/nunits. For more details see documentation
7257 in vectorizable_operation. */
7259 /* If the reduction is used in an outer loop we need to generate
7260 VF intermediate results, like so (e.g. for ncopies=2):
7261 r0 = phi (init, r0)
7262 r1 = phi (init, r1)
7263 r0 = x0 + r0;
7264 r1 = x1 + r1;
7265 (i.e. we generate VF results in 2 registers).
7266 In this case we have a separate def-use cycle for each copy, and therefore
7267 for each copy we get the vector def for the reduction variable from the
7268 respective phi node created for this copy.
7270 Otherwise (the reduction is unused in the loop nest), we can combine
7271 together intermediate results, like so (e.g. for ncopies=2):
7272 r = phi (init, r)
7273 r = x0 + r;
7274 r = x1 + r;
7275 (i.e. we generate VF/2 results in a single register).
7276 In this case for each copy we get the vector def for the reduction variable
7277 from the vectorized reduction operation generated in the previous iteration.
7279 This only works when we see both the reduction PHI and its only consumer
7280 in vectorizable_reduction and there are no intermediate stmts
7281 participating. When unrolling we want each unrolled iteration to have its
7282 own reduction accumulator since one of the main goals of unrolling a
7283 reduction is to reduce the aggregate loop-carried latency. */
7284 if (ncopies > 1
7285 && (STMT_VINFO_RELEVANT (stmt_info) <= vect_used_only_live)
7286 && reduc_chain_length == 1
7287 && loop_vinfo->suggested_unroll_factor == 1)
7288 single_defuse_cycle = true;
7290 if (single_defuse_cycle || lane_reduc_code_p)
7292 gcc_assert (op.code != COND_EXPR);
7294 /* 4. Supportable by target? */
7295 bool ok = true;
7297 /* 4.1. check support for the operation in the loop */
7298 machine_mode vec_mode = TYPE_MODE (vectype_in);
7299 if (!directly_supported_p (op.code, vectype_in, optab_query_kind))
7301 if (dump_enabled_p ())
7302 dump_printf (MSG_NOTE, "op not supported by target.\n");
7303 if (maybe_ne (GET_MODE_SIZE (vec_mode), UNITS_PER_WORD)
7304 || !vect_can_vectorize_without_simd_p (op.code))
7305 ok = false;
7306 else
7307 if (dump_enabled_p ())
7308 dump_printf (MSG_NOTE, "proceeding using word mode.\n");
7311 if (vect_emulated_vector_p (vectype_in)
7312 && !vect_can_vectorize_without_simd_p (op.code))
7314 if (dump_enabled_p ())
7315 dump_printf (MSG_NOTE, "using word mode not possible.\n");
7316 return false;
7319 /* lane-reducing operations have to go through vect_transform_reduction.
7320 For the other cases try without the single cycle optimization. */
7321 if (!ok)
7323 if (lane_reduc_code_p)
7324 return false;
7325 else
7326 single_defuse_cycle = false;
7329 STMT_VINFO_FORCE_SINGLE_CYCLE (reduc_info) = single_defuse_cycle;
7331 /* If the reduction stmt is one of the patterns that have lane
7332 reduction embedded we cannot handle the case of ! single_defuse_cycle. */
7333 if ((ncopies > 1 && ! single_defuse_cycle)
7334 && lane_reduc_code_p)
7336 if (dump_enabled_p ())
7337 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7338 "multi def-use cycle not possible for lane-reducing "
7339 "reduction operation\n");
7340 return false;
7343 if (slp_node
7344 && !(!single_defuse_cycle
7345 && !lane_reduc_code_p
7346 && reduction_type != FOLD_LEFT_REDUCTION))
7347 for (i = 0; i < (int) op.num_ops; i++)
7348 if (!vect_maybe_update_slp_op_vectype (slp_op[i], vectype_in))
7350 if (dump_enabled_p ())
7351 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7352 "incompatible vector types for invariants\n");
7353 return false;
7356 if (slp_node)
7357 vec_num = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node);
7358 else
7359 vec_num = 1;
7361 vect_model_reduction_cost (loop_vinfo, stmt_info, reduc_fn,
7362 reduction_type, ncopies, cost_vec);
7363 /* Cost the reduction op inside the loop if transformed via
7364 vect_transform_reduction. Otherwise this is costed by the
7365 separate vectorizable_* routines. */
7366 if (single_defuse_cycle || lane_reduc_code_p)
7367 record_stmt_cost (cost_vec, ncopies, vector_stmt, stmt_info, 0, vect_body);
7369 if (dump_enabled_p ()
7370 && reduction_type == FOLD_LEFT_REDUCTION)
7371 dump_printf_loc (MSG_NOTE, vect_location,
7372 "using an in-order (fold-left) reduction.\n");
7373 STMT_VINFO_TYPE (orig_stmt_of_analysis) = cycle_phi_info_type;
7374 /* All but single defuse-cycle optimized, lane-reducing and fold-left
7375 reductions go through their own vectorizable_* routines. */
7376 if (!single_defuse_cycle
7377 && !lane_reduc_code_p
7378 && reduction_type != FOLD_LEFT_REDUCTION)
7380 stmt_vec_info tem
7381 = vect_stmt_to_vectorize (STMT_VINFO_REDUC_DEF (phi_info));
7382 if (slp_node && REDUC_GROUP_FIRST_ELEMENT (tem))
7384 gcc_assert (!REDUC_GROUP_NEXT_ELEMENT (tem));
7385 tem = REDUC_GROUP_FIRST_ELEMENT (tem);
7387 STMT_VINFO_DEF_TYPE (vect_orig_stmt (tem)) = vect_internal_def;
7388 STMT_VINFO_DEF_TYPE (tem) = vect_internal_def;
7390 else if (loop_vinfo && LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo))
7392 vec_loop_masks *masks = &LOOP_VINFO_MASKS (loop_vinfo);
7393 internal_fn cond_fn = get_conditional_internal_fn (op.code, op.type);
7395 if (reduction_type != FOLD_LEFT_REDUCTION
7396 && !use_mask_by_cond_expr_p (op.code, cond_fn, vectype_in)
7397 && (cond_fn == IFN_LAST
7398 || !direct_internal_fn_supported_p (cond_fn, vectype_in,
7399 OPTIMIZE_FOR_SPEED)))
7401 if (dump_enabled_p ())
7402 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7403 "can't operate on partial vectors because"
7404 " no conditional operation is available.\n");
7405 LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo) = false;
7407 else if (reduction_type == FOLD_LEFT_REDUCTION
7408 && reduc_fn == IFN_LAST
7409 && !expand_vec_cond_expr_p (vectype_in,
7410 truth_type_for (vectype_in),
7411 SSA_NAME))
7413 if (dump_enabled_p ())
7414 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7415 "can't operate on partial vectors because"
7416 " no conditional operation is available.\n");
7417 LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo) = false;
7419 else
7420 vect_record_loop_mask (loop_vinfo, masks, ncopies * vec_num,
7421 vectype_in, NULL);
7423 return true;
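/* When the loop does use partial vectors, the reduction update is later
   emitted via the conditional internal function recorded above; e.g.
   for a masked plus reduction the generated statement looks roughly
   like (an illustrative sketch of the gimple):

     vect_sum_new = .COND_ADD (loop_mask, vect_sum, vect_x, vect_sum);

   so inactive lanes simply pass the accumulator through unchanged.  */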
7426 /* Transform the definition stmt STMT_INFO of a reduction PHI backedge
7427 value. */
7429 bool
7430 vect_transform_reduction (loop_vec_info loop_vinfo,
7431 stmt_vec_info stmt_info, gimple_stmt_iterator *gsi,
7432 gimple **vec_stmt, slp_tree slp_node)
7434 tree vectype_out = STMT_VINFO_VECTYPE (stmt_info);
7435 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
7436 int i;
7437 int ncopies;
7438 int vec_num;
7440 stmt_vec_info reduc_info = info_for_reduction (loop_vinfo, stmt_info);
7441 gcc_assert (reduc_info->is_reduc_info);
7443 if (nested_in_vect_loop_p (loop, stmt_info))
7445 loop = loop->inner;
7446 gcc_assert (STMT_VINFO_DEF_TYPE (reduc_info) == vect_double_reduction_def);
7449 gimple_match_op op;
7450 if (!gimple_extract_op (stmt_info->stmt, &op))
7451 gcc_unreachable ();
7452 gcc_assert (op.code.is_tree_code ());
7453 auto code = tree_code (op.code);
7455 /* All uses but the last are expected to be defined in the loop.
7456 The last use is the reduction variable. In case of nested cycle this
7457 assumption is not true: we use reduc_index to record the index of the
7458 reduction variable. */
7459 stmt_vec_info phi_info = STMT_VINFO_REDUC_DEF (vect_orig_stmt (stmt_info));
7460 gphi *reduc_def_phi = as_a <gphi *> (phi_info->stmt);
7461 int reduc_index = STMT_VINFO_REDUC_IDX (stmt_info);
7462 tree vectype_in = STMT_VINFO_REDUC_VECTYPE_IN (reduc_info);
7464 if (slp_node)
7466 ncopies = 1;
7467 vec_num = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node);
7469 else
7471 ncopies = vect_get_num_copies (loop_vinfo, vectype_in);
7472 vec_num = 1;
7475 internal_fn cond_fn = get_conditional_internal_fn (code);
7476 vec_loop_masks *masks = &LOOP_VINFO_MASKS (loop_vinfo);
7477 bool mask_by_cond_expr = use_mask_by_cond_expr_p (code, cond_fn, vectype_in);
7479 /* Transform. */
7480 tree new_temp = NULL_TREE;
7481 auto_vec<tree> vec_oprnds0;
7482 auto_vec<tree> vec_oprnds1;
7483 auto_vec<tree> vec_oprnds2;
7484 tree def0;
7486 if (dump_enabled_p ())
7487 dump_printf_loc (MSG_NOTE, vect_location, "transform reduction.\n");
7489 /* FORNOW: Multiple types are not supported for condition. */
7490 if (code == COND_EXPR)
7491 gcc_assert (ncopies == 1);
7493 bool masked_loop_p = LOOP_VINFO_FULLY_MASKED_P (loop_vinfo);
7495 vect_reduction_type reduction_type = STMT_VINFO_REDUC_TYPE (reduc_info);
7496 if (reduction_type == FOLD_LEFT_REDUCTION)
7498 internal_fn reduc_fn = STMT_VINFO_REDUC_FN (reduc_info);
7499 return vectorize_fold_left_reduction
7500 (loop_vinfo, stmt_info, gsi, vec_stmt, slp_node, reduc_def_phi, code,
7501 reduc_fn, op.ops, vectype_in, reduc_index, masks);
7504 bool single_defuse_cycle = STMT_VINFO_FORCE_SINGLE_CYCLE (reduc_info);
7505 gcc_assert (single_defuse_cycle
7506 || code == DOT_PROD_EXPR
7507 || code == WIDEN_SUM_EXPR
7508 || code == SAD_EXPR);
7510 /* Create the destination vector */
7511 tree scalar_dest = gimple_assign_lhs (stmt_info->stmt);
7512 tree vec_dest = vect_create_destination_var (scalar_dest, vectype_out);
7514 vect_get_vec_defs (loop_vinfo, stmt_info, slp_node, ncopies,
7515 single_defuse_cycle && reduc_index == 0
7516 ? NULL_TREE : op.ops[0], &vec_oprnds0,
7517 single_defuse_cycle && reduc_index == 1
7518 ? NULL_TREE : op.ops[1], &vec_oprnds1,
7519 op.num_ops == 3
7520 && !(single_defuse_cycle && reduc_index == 2)
7521 ? op.ops[2] : NULL_TREE, &vec_oprnds2);
7522 if (single_defuse_cycle)
7524 gcc_assert (!slp_node);
7525 vect_get_vec_defs_for_operand (loop_vinfo, stmt_info, 1,
7526 op.ops[reduc_index],
7527 reduc_index == 0 ? &vec_oprnds0
7528 : (reduc_index == 1 ? &vec_oprnds1
7529 : &vec_oprnds2));
7532 FOR_EACH_VEC_ELT (vec_oprnds0, i, def0)
7534 gimple *new_stmt;
7535 tree vop[3] = { def0, vec_oprnds1[i], NULL_TREE };
7536 if (masked_loop_p && !mask_by_cond_expr)
7538 /* Make sure that the reduction accumulator is vop[0]. */
7539 if (reduc_index == 1)
7541 gcc_assert (commutative_tree_code (code));
7542 std::swap (vop[0], vop[1]);
7544 tree mask = vect_get_loop_mask (gsi, masks, vec_num * ncopies,
7545 vectype_in, i);
7546 gcall *call = gimple_build_call_internal (cond_fn, 4, mask,
7547 vop[0], vop[1], vop[0]);
7548 new_temp = make_ssa_name (vec_dest, call);
7549 gimple_call_set_lhs (call, new_temp);
7550 gimple_call_set_nothrow (call, true);
7551 vect_finish_stmt_generation (loop_vinfo, stmt_info, call, gsi);
7552 new_stmt = call;
7554 else
7556 if (op.num_ops == 3)
7557 vop[2] = vec_oprnds2[i];
7559 if (masked_loop_p && mask_by_cond_expr)
7561 tree mask = vect_get_loop_mask (gsi, masks, vec_num * ncopies,
7562 vectype_in, i);
7563 build_vect_cond_expr (code, vop, mask, gsi);
7566 new_stmt = gimple_build_assign (vec_dest, code,
7567 vop[0], vop[1], vop[2]);
7568 new_temp = make_ssa_name (vec_dest, new_stmt);
7569 gimple_assign_set_lhs (new_stmt, new_temp);
7570 vect_finish_stmt_generation (loop_vinfo, stmt_info, new_stmt, gsi);
7573 if (slp_node)
7574 SLP_TREE_VEC_STMTS (slp_node).quick_push (new_stmt);
7575 else if (single_defuse_cycle
7576 && i < ncopies - 1)
7578 if (reduc_index == 0)
7579 vec_oprnds0.safe_push (gimple_get_lhs (new_stmt));
7580 else if (reduc_index == 1)
7581 vec_oprnds1.safe_push (gimple_get_lhs (new_stmt));
7582 else if (reduc_index == 2)
7583 vec_oprnds2.safe_push (gimple_get_lhs (new_stmt));
7585 else
7586 STMT_VINFO_VEC_STMTS (stmt_info).safe_push (new_stmt);
7589 if (!slp_node)
7590 *vec_stmt = STMT_VINFO_VEC_STMTS (stmt_info)[0];
7592 return true;
7595 /* Transform phase of a cycle PHI. */
7597 bool
7598 vect_transform_cycle_phi (loop_vec_info loop_vinfo,
7599 stmt_vec_info stmt_info, gimple **vec_stmt,
7600 slp_tree slp_node, slp_instance slp_node_instance)
7602 tree vectype_out = STMT_VINFO_VECTYPE (stmt_info);
7603 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
7604 int i;
7605 int ncopies;
7606 int j;
7607 bool nested_cycle = false;
7608 int vec_num;
7610 if (nested_in_vect_loop_p (loop, stmt_info))
7612 loop = loop->inner;
7613 nested_cycle = true;
7616 stmt_vec_info reduc_stmt_info = STMT_VINFO_REDUC_DEF (stmt_info);
7617 reduc_stmt_info = vect_stmt_to_vectorize (reduc_stmt_info);
7618 stmt_vec_info reduc_info = info_for_reduction (loop_vinfo, stmt_info);
7619 gcc_assert (reduc_info->is_reduc_info);
7621 if (STMT_VINFO_REDUC_TYPE (reduc_info) == EXTRACT_LAST_REDUCTION
7622 || STMT_VINFO_REDUC_TYPE (reduc_info) == FOLD_LEFT_REDUCTION)
7623 /* Leave the scalar phi in place. */
7624 return true;
7626 tree vectype_in = STMT_VINFO_REDUC_VECTYPE_IN (reduc_info);
7627 /* For a nested cycle we do not fill the above. */
7628 if (!vectype_in)
7629 vectype_in = STMT_VINFO_VECTYPE (stmt_info);
7630 gcc_assert (vectype_in);
7632 if (slp_node)
7634 /* The size vect_schedule_slp_instance computes is off for us. */
7635 vec_num = vect_get_num_vectors (LOOP_VINFO_VECT_FACTOR (loop_vinfo)
7636 * SLP_TREE_LANES (slp_node), vectype_in);
7637 ncopies = 1;
7639 else
7641 vec_num = 1;
7642 ncopies = vect_get_num_copies (loop_vinfo, vectype_in);
7645 /* Check whether we should use a single PHI node and accumulate
7646 vectors to one before the backedge. */
7647 if (STMT_VINFO_FORCE_SINGLE_CYCLE (reduc_info))
7648 ncopies = 1;
7650 /* Create the destination vector */
7651 gphi *phi = as_a <gphi *> (stmt_info->stmt);
7652 tree vec_dest = vect_create_destination_var (gimple_phi_result (phi),
7653 vectype_out);
7655 /* Get the loop-entry arguments. */
7656 tree vec_initial_def = NULL_TREE;
7657 auto_vec<tree> vec_initial_defs;
7658 if (slp_node)
7660 vec_initial_defs.reserve (vec_num);
7661 if (nested_cycle)
7663 unsigned phi_idx = loop_preheader_edge (loop)->dest_idx;
7664 vect_get_slp_defs (SLP_TREE_CHILDREN (slp_node)[phi_idx],
7665 &vec_initial_defs);
7667 else
7669 gcc_assert (slp_node == slp_node_instance->reduc_phis);
7670 vec<tree> &initial_values = reduc_info->reduc_initial_values;
7671 vec<stmt_vec_info> &stmts = SLP_TREE_SCALAR_STMTS (slp_node);
7673 unsigned int num_phis = stmts.length ();
7674 if (REDUC_GROUP_FIRST_ELEMENT (reduc_stmt_info))
7675 num_phis = 1;
7676 initial_values.reserve (num_phis);
7677 for (unsigned int i = 0; i < num_phis; ++i)
7679 gphi *this_phi = as_a<gphi *> (stmts[i]->stmt);
7680 initial_values.quick_push (vect_phi_initial_value (this_phi));
7682 if (vec_num == 1)
7683 vect_find_reusable_accumulator (loop_vinfo, reduc_info);
7684 if (!initial_values.is_empty ())
7686 tree initial_value
7687 = (num_phis == 1 ? initial_values[0] : NULL_TREE);
7688 code_helper code = STMT_VINFO_REDUC_CODE (reduc_info);
7689 tree neutral_op
7690 = neutral_op_for_reduction (TREE_TYPE (vectype_out),
7691 code, initial_value);
7692 get_initial_defs_for_reduction (loop_vinfo, reduc_info,
7693 &vec_initial_defs, vec_num,
7694 stmts.length (), neutral_op);
7698 else
7700 /* Get at the scalar def before the loop, that defines the initial
7701 value of the reduction variable. */
7702 tree initial_def = vect_phi_initial_value (phi);
7703 reduc_info->reduc_initial_values.safe_push (initial_def);
7704 /* Optimize: if initial_def is for REDUC_MAX smaller than the base
7705 and we can't use zero for induc_val, use initial_def. Similarly
7706 for REDUC_MIN and initial_def larger than the base. */
7707 if (STMT_VINFO_REDUC_TYPE (reduc_info) == INTEGER_INDUC_COND_REDUCTION)
7709 tree induc_val = STMT_VINFO_VEC_INDUC_COND_INITIAL_VAL (reduc_info);
7710 if (TREE_CODE (initial_def) == INTEGER_CST
7711 && !integer_zerop (induc_val)
7712 && ((STMT_VINFO_REDUC_CODE (reduc_info) == MAX_EXPR
7713 && tree_int_cst_lt (initial_def, induc_val))
7714 || (STMT_VINFO_REDUC_CODE (reduc_info) == MIN_EXPR
7715 && tree_int_cst_lt (induc_val, initial_def))))
7717 induc_val = initial_def;
7718 /* Communicate we used the initial_def to epilogue
7719 generation. */
7720 STMT_VINFO_VEC_INDUC_COND_INITIAL_VAL (reduc_info) = NULL_TREE;
7722 vec_initial_def = build_vector_from_val (vectype_out, induc_val);
7724 else if (nested_cycle)
7726 /* Do not use an adjustment def as that case is not supported
7727 correctly if ncopies is not one. */
7728 vect_get_vec_defs_for_operand (loop_vinfo, reduc_stmt_info,
7729 ncopies, initial_def,
7730 &vec_initial_defs);
7732 else if (STMT_VINFO_REDUC_TYPE (reduc_info) == CONST_COND_REDUCTION
7733 || STMT_VINFO_REDUC_TYPE (reduc_info) == COND_REDUCTION)
7734 /* Fill the initial vector with the initial scalar value. */
7735 vec_initial_def
7736 = get_initial_def_for_reduction (loop_vinfo, reduc_stmt_info,
7737 initial_def, initial_def);
7738 else
7740 if (ncopies == 1)
7741 vect_find_reusable_accumulator (loop_vinfo, reduc_info);
7742 if (!reduc_info->reduc_initial_values.is_empty ())
7744 initial_def = reduc_info->reduc_initial_values[0];
7745 code_helper code = STMT_VINFO_REDUC_CODE (reduc_info);
7746 tree neutral_op
7747 = neutral_op_for_reduction (TREE_TYPE (initial_def),
7748 code, initial_def);
7749 gcc_assert (neutral_op);
7750 /* Try to simplify the vector initialization by applying an
7751 adjustment after the reduction has been performed. */
7752 if (!reduc_info->reused_accumulator
7753 && STMT_VINFO_DEF_TYPE (stmt_info) == vect_reduction_def
7754 && !operand_equal_p (neutral_op, initial_def))
7756 STMT_VINFO_REDUC_EPILOGUE_ADJUSTMENT (reduc_info)
7757 = initial_def;
7758 initial_def = neutral_op;
7760 vec_initial_def
7761 = get_initial_def_for_reduction (loop_vinfo, reduc_info,
7762 initial_def, neutral_op);
7767 if (vec_initial_def)
7769 vec_initial_defs.create (ncopies);
7770 for (i = 0; i < ncopies; ++i)
7771 vec_initial_defs.quick_push (vec_initial_def);
7774 if (auto *accumulator = reduc_info->reused_accumulator)
7776 tree def = accumulator->reduc_input;
7777 if (!useless_type_conversion_p (vectype_out, TREE_TYPE (def)))
7779 unsigned int nreduc;
7780 bool res = constant_multiple_p (TYPE_VECTOR_SUBPARTS
7781 (TREE_TYPE (def)),
7782 TYPE_VECTOR_SUBPARTS (vectype_out),
7783 &nreduc);
7784 gcc_assert (res);
7785 gimple_seq stmts = NULL;
7786 /* Reduce the single vector to a smaller one. */
7787 if (nreduc != 1)
7789 /* Perform the reduction in the appropriate type. */
7790 tree rvectype = vectype_out;
7791 if (!useless_type_conversion_p (TREE_TYPE (vectype_out),
7792 TREE_TYPE (TREE_TYPE (def))))
7793 rvectype = build_vector_type (TREE_TYPE (TREE_TYPE (def)),
7794 TYPE_VECTOR_SUBPARTS
7795 (vectype_out));
7796 def = vect_create_partial_epilog (def, rvectype,
7797 STMT_VINFO_REDUC_CODE
7798 (reduc_info),
7799 &stmts);
7801 /* The epilogue loop might use a different vector mode, like
7802 VNx2DI vs. V2DI. */
7803 if (TYPE_MODE (vectype_out) != TYPE_MODE (TREE_TYPE (def)))
7805 tree reduc_type = build_vector_type_for_mode
7806 (TREE_TYPE (TREE_TYPE (def)), TYPE_MODE (vectype_out));
7807 def = gimple_convert (&stmts, reduc_type, def);
7809 /* Adjust the input so we pick up the partially reduced value
7810 for the skip edge in vect_create_epilog_for_reduction. */
7811 accumulator->reduc_input = def;
7812 /* And the reduction could be carried out using a different sign. */
7813 if (!useless_type_conversion_p (vectype_out, TREE_TYPE (def)))
7814 def = gimple_convert (&stmts, vectype_out, def);
7815 if (loop_vinfo->main_loop_edge)
7817 /* While we'd like to insert on the edge this will split
7818 blocks and disturb bookkeeping, we also will eventually
7819 need this on the skip edge. Rely on sinking to
7820 fix up optimal placement and insert in the pred. */
7821 gimple_stmt_iterator gsi
7822 = gsi_last_bb (loop_vinfo->main_loop_edge->src);
7823 /* Insert before a cond that eventually skips the
7824 epilogue. */
7825 if (!gsi_end_p (gsi) && stmt_ends_bb_p (gsi_stmt (gsi)))
7826 gsi_prev (&gsi);
7827 gsi_insert_seq_after (&gsi, stmts, GSI_CONTINUE_LINKING);
7829 else
7830 gsi_insert_seq_on_edge_immediate (loop_preheader_edge (loop),
7831 stmts);
7833 if (loop_vinfo->main_loop_edge)
7834 vec_initial_defs[0]
7835 = vect_get_main_loop_result (loop_vinfo, def,
7836 vec_initial_defs[0]);
7837 else
7838 vec_initial_defs.safe_push (def);
7841 /* Generate the reduction PHIs upfront. */
7842 for (i = 0; i < vec_num; i++)
7844 tree vec_init_def = vec_initial_defs[i];
7845 for (j = 0; j < ncopies; j++)
7847 /* Create the reduction-phi that defines the reduction
7848 operand. */
7849 gphi *new_phi = create_phi_node (vec_dest, loop->header);
7851 /* Set the loop-entry arg of the reduction-phi. */
7852 if (j != 0 && nested_cycle)
7853 vec_init_def = vec_initial_defs[j];
7854 add_phi_arg (new_phi, vec_init_def, loop_preheader_edge (loop),
7855 UNKNOWN_LOCATION);
7857 /* The loop-latch arg is set in epilogue processing. */
7859 if (slp_node)
7860 SLP_TREE_VEC_STMTS (slp_node).quick_push (new_phi);
7861 else
7863 if (j == 0)
7864 *vec_stmt = new_phi;
7865 STMT_VINFO_VEC_STMTS (stmt_info).safe_push (new_phi);
7870 return true;
7873 /* Vectorizes LC PHIs. */
7875 bool
7876 vectorizable_lc_phi (loop_vec_info loop_vinfo,
7877 stmt_vec_info stmt_info, gimple **vec_stmt,
7878 slp_tree slp_node)
7880 if (!loop_vinfo
7881 || !is_a <gphi *> (stmt_info->stmt)
7882 || gimple_phi_num_args (stmt_info->stmt) != 1)
7883 return false;
7885 if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_internal_def
7886 && STMT_VINFO_DEF_TYPE (stmt_info) != vect_double_reduction_def)
7887 return false;
7889 if (!vec_stmt) /* transformation not required. */
7891 /* Deal with copies from externs or constants that disguise as
7892 loop-closed PHI nodes (PR97886). */
7893 if (slp_node
7894 && !vect_maybe_update_slp_op_vectype (SLP_TREE_CHILDREN (slp_node)[0],
7895 SLP_TREE_VECTYPE (slp_node)))
7897 if (dump_enabled_p ())
7898 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7899 "incompatible vector types for invariants\n");
7900 return false;
7902 STMT_VINFO_TYPE (stmt_info) = lc_phi_info_type;
7903 return true;
7906 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
7907 tree scalar_dest = gimple_phi_result (stmt_info->stmt);
7908 basic_block bb = gimple_bb (stmt_info->stmt);
7909 edge e = single_pred_edge (bb);
7910 tree vec_dest = vect_create_destination_var (scalar_dest, vectype);
7911 auto_vec<tree> vec_oprnds;
7912 vect_get_vec_defs (loop_vinfo, stmt_info, slp_node,
7913 !slp_node ? vect_get_num_copies (loop_vinfo, vectype) : 1,
7914 gimple_phi_arg_def (stmt_info->stmt, 0), &vec_oprnds);
7915 for (unsigned i = 0; i < vec_oprnds.length (); i++)
7917 /* Create the vectorized LC PHI node. */
7918 gphi *new_phi = create_phi_node (vec_dest, bb);
7919 add_phi_arg (new_phi, vec_oprnds[i], e, UNKNOWN_LOCATION);
7920 if (slp_node)
7921 SLP_TREE_VEC_STMTS (slp_node).quick_push (new_phi);
7922 else
7923 STMT_VINFO_VEC_STMTS (stmt_info).safe_push (new_phi);
7925 if (!slp_node)
7926 *vec_stmt = STMT_VINFO_VEC_STMTS (stmt_info)[0];
7928 return true;
7931 /* Vectorizes PHIs. */
7933 bool
7934 vectorizable_phi (vec_info *,
7935 stmt_vec_info stmt_info, gimple **vec_stmt,
7936 slp_tree slp_node, stmt_vector_for_cost *cost_vec)
7938 if (!is_a <gphi *> (stmt_info->stmt) || !slp_node)
7939 return false;
7941 if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_internal_def)
7942 return false;
7944 tree vectype = SLP_TREE_VECTYPE (slp_node);
7946 if (!vec_stmt) /* transformation not required. */
7948 slp_tree child;
7949 unsigned i;
7950 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (slp_node), i, child)
7951 if (!child)
7953 if (dump_enabled_p ())
7954 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7955 "PHI node with unvectorized backedge def\n");
7956 return false;
7958 else if (!vect_maybe_update_slp_op_vectype (child, vectype))
7960 if (dump_enabled_p ())
7961 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7962 "incompatible vector types for invariants\n");
7963 return false;
7965 else if (SLP_TREE_DEF_TYPE (child) == vect_internal_def
7966 && !useless_type_conversion_p (vectype,
7967 SLP_TREE_VECTYPE (child)))
7969 /* With bools we can have mask and non-mask precision vectors
7970 or different non-mask precisions. While pattern recognition is
7971 supposed to guarantee consistency here, bugs in it can cause
7972 mismatches (PR103489 and PR103800 for example).
7973 Deal with them here instead of ICEing later. */
7974 if (dump_enabled_p ())
7975 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7976 "incompatible vector type setup from "
7977 "bool pattern detection\n");
7978 return false;
7981 /* For single-argument PHIs assume coalescing which means zero cost
7982 for the scalar and the vector PHIs. This avoids artificially
7983 favoring the vector path (but may pessimize it in some cases). */
7984 if (gimple_phi_num_args (as_a <gphi *> (stmt_info->stmt)) > 1)
7985 record_stmt_cost (cost_vec, SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node),
7986 vector_stmt, stmt_info, vectype, 0, vect_body);
7987 STMT_VINFO_TYPE (stmt_info) = phi_info_type;
7988 return true;
7991 tree scalar_dest = gimple_phi_result (stmt_info->stmt);
7992 basic_block bb = gimple_bb (stmt_info->stmt);
7993 tree vec_dest = vect_create_destination_var (scalar_dest, vectype);
7994 auto_vec<gphi *> new_phis;
7995 for (unsigned i = 0; i < gimple_phi_num_args (stmt_info->stmt); ++i)
7997 slp_tree child = SLP_TREE_CHILDREN (slp_node)[i];
7999 /* Skip not yet vectorized defs. */
8000 if (SLP_TREE_DEF_TYPE (child) == vect_internal_def
8001 && SLP_TREE_VEC_STMTS (child).is_empty ())
8002 continue;
8004 auto_vec<tree> vec_oprnds;
8005 vect_get_slp_defs (SLP_TREE_CHILDREN (slp_node)[i], &vec_oprnds);
8006 if (!new_phis.exists ())
8008 new_phis.create (vec_oprnds.length ());
8009 for (unsigned j = 0; j < vec_oprnds.length (); j++)
8011 /* Create the vectorized LC PHI node. */
8012 new_phis.quick_push (create_phi_node (vec_dest, bb));
8013 SLP_TREE_VEC_STMTS (slp_node).quick_push (new_phis[j]);
8016 edge e = gimple_phi_arg_edge (as_a <gphi *> (stmt_info->stmt), i);
8017 for (unsigned j = 0; j < vec_oprnds.length (); j++)
8018 add_phi_arg (new_phis[j], vec_oprnds[j], e, UNKNOWN_LOCATION);
8020 /* We should have at least one already vectorized child. */
8021 gcc_assert (new_phis.exists ());
8023 return true;
8026 /* Return true if VECTYPE represents a vector that requires lowering
8027 by the vector lowering pass. */
8029 bool
8030 vect_emulated_vector_p (tree vectype)
8032 return (!VECTOR_MODE_P (TYPE_MODE (vectype))
8033 && (!VECTOR_BOOLEAN_TYPE_P (vectype)
8034 || TYPE_PRECISION (TREE_TYPE (vectype)) != 1));
8037 /* Return true if we can emulate CODE on an integer mode representation
8038 of a vector. */
8040 bool
8041 vect_can_vectorize_without_simd_p (tree_code code)
8043 switch (code)
8045 case PLUS_EXPR:
8046 case MINUS_EXPR:
8047 case NEGATE_EXPR:
8048 case BIT_AND_EXPR:
8049 case BIT_IOR_EXPR:
8050 case BIT_XOR_EXPR:
8051 case BIT_NOT_EXPR:
8052 return true;
8054 default:
8055 return false;
8059 /* Likewise, but taking a code_helper. */
8061 bool
8062 vect_can_vectorize_without_simd_p (code_helper code)
8064 return (code.is_tree_code ()
8065 && vect_can_vectorize_without_simd_p (tree_code (code)));
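/* For illustration (a minimal sketch with made-up names): a bitwise
   operation on an emulated vector can be carried out directly on the
   containing integer mode because it acts independently on each bit,
   e.g. four QImode lanes packed into one 32-bit word:

     typedef unsigned int v4qi_word;
     v4qi_word v4qi_and (v4qi_word a, v4qi_word b) { return a & b; }
*/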
8068 /* Function vectorizable_induction
8070 Check if STMT_INFO performs an induction computation that can be vectorized.
8071 If VEC_STMT is also passed, vectorize the induction PHI: create a vectorized
8072 phi to replace it, put it in VEC_STMT, and add it to the same basic block.
8073 Return true if STMT_INFO is vectorizable in this way. */
8075 bool
8076 vectorizable_induction (loop_vec_info loop_vinfo,
8077 stmt_vec_info stmt_info,
8078 gimple **vec_stmt, slp_tree slp_node,
8079 stmt_vector_for_cost *cost_vec)
8081 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
8082 unsigned ncopies;
8083 bool nested_in_vect_loop = false;
8084 class loop *iv_loop;
8085 tree vec_def;
8086 edge pe = loop_preheader_edge (loop);
8087 basic_block new_bb;
8088 tree new_vec, vec_init, vec_step, t;
8089 tree new_name;
8090 gimple *new_stmt;
8091 gphi *induction_phi;
8092 tree induc_def, vec_dest;
8093 tree init_expr, step_expr;
8094 poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
8095 unsigned i;
8096 tree expr;
8097 gimple_stmt_iterator si;
8099 gphi *phi = dyn_cast <gphi *> (stmt_info->stmt);
8100 if (!phi)
8101 return false;
8103 if (!STMT_VINFO_RELEVANT_P (stmt_info))
8104 return false;
8106 /* Make sure it was recognized as induction computation. */
8107 if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_induction_def)
8108 return false;
8110 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
8111 poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (vectype);
8113 if (slp_node)
8114 ncopies = 1;
8115 else
8116 ncopies = vect_get_num_copies (loop_vinfo, vectype);
8117 gcc_assert (ncopies >= 1);
8119 /* FORNOW. These restrictions should be relaxed. */
8120 if (nested_in_vect_loop_p (loop, stmt_info))
8122 imm_use_iterator imm_iter;
8123 use_operand_p use_p;
8124 gimple *exit_phi;
8125 edge latch_e;
8126 tree loop_arg;
8128 if (ncopies > 1)
8130 if (dump_enabled_p ())
8131 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
8132 "multiple types in nested loop.\n");
8133 return false;
8136 exit_phi = NULL;
8137 latch_e = loop_latch_edge (loop->inner);
8138 loop_arg = PHI_ARG_DEF_FROM_EDGE (phi, latch_e);
8139 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, loop_arg)
8141 gimple *use_stmt = USE_STMT (use_p);
8142 if (is_gimple_debug (use_stmt))
8143 continue;
8145 if (!flow_bb_inside_loop_p (loop->inner, gimple_bb (use_stmt)))
8147 exit_phi = use_stmt;
8148 break;
8151 if (exit_phi)
8153 stmt_vec_info exit_phi_vinfo = loop_vinfo->lookup_stmt (exit_phi);
8154 if (!(STMT_VINFO_RELEVANT_P (exit_phi_vinfo)
8155 && !STMT_VINFO_LIVE_P (exit_phi_vinfo)))
8157 if (dump_enabled_p ())
8158 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
8159 "inner-loop induction only used outside "
8160 "of the outer vectorized loop.\n");
8161 return false;
8165 nested_in_vect_loop = true;
8166 iv_loop = loop->inner;
8168 else
8169 iv_loop = loop;
8170 gcc_assert (iv_loop == (gimple_bb (phi))->loop_father);
8172 if (slp_node && !nunits.is_constant ())
8174 /* The current SLP code creates the step value element-by-element. */
8175 if (dump_enabled_p ())
8176 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
8177 "SLP induction not supported for variable-length"
8178 " vectors.\n");
8179 return false;
8182 if (FLOAT_TYPE_P (vectype) && !param_vect_induction_float)
8184 if (dump_enabled_p ())
8185 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
8186 "floating point induction vectorization disabled\n");
8187 return false;
8190 step_expr = STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_info);
8191 gcc_assert (step_expr != NULL_TREE);
8192 tree step_vectype = get_same_sized_vectype (TREE_TYPE (step_expr), vectype);
8194 /* Check for backend support of PLUS/MINUS_EXPR. */
8195 if (!directly_supported_p (PLUS_EXPR, step_vectype)
8196 || !directly_supported_p (MINUS_EXPR, step_vectype))
8197 return false;
8199 if (!vec_stmt) /* transformation not required. */
8201 unsigned inside_cost = 0, prologue_cost = 0;
8202 if (slp_node)
8204 /* We eventually need to set a vector type on invariant
8205 arguments. */
8206 unsigned j;
8207 slp_tree child;
8208 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (slp_node), j, child)
8209 if (!vect_maybe_update_slp_op_vectype
8210 (child, SLP_TREE_VECTYPE (slp_node)))
8212 if (dump_enabled_p ())
8213 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
8214 "incompatible vector types for "
8215 "invariants\n");
8216 return false;
8218 /* loop cost for vec_loop. */
8219 inside_cost
8220 = record_stmt_cost (cost_vec,
8221 SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node),
8222 vector_stmt, stmt_info, 0, vect_body);
8223 /* prologue cost for vec_init (if not nested) and step. */
8224 prologue_cost = record_stmt_cost (cost_vec, 1 + !nested_in_vect_loop,
8225 scalar_to_vec,
8226 stmt_info, 0, vect_prologue);
8228 else /* if (!slp_node) */
8230 /* loop cost for vec_loop. */
8231 inside_cost = record_stmt_cost (cost_vec, ncopies, vector_stmt,
8232 stmt_info, 0, vect_body);
8233 /* prologue cost for vec_init and vec_step. */
8234 prologue_cost = record_stmt_cost (cost_vec, 2, scalar_to_vec,
8235 stmt_info, 0, vect_prologue);
8237 if (dump_enabled_p ())
8238 dump_printf_loc (MSG_NOTE, vect_location,
8239 "vect_model_induction_cost: inside_cost = %d, "
8240 "prologue_cost = %d .\n", inside_cost,
8241 prologue_cost);
8243 STMT_VINFO_TYPE (stmt_info) = induc_vec_info_type;
8244 DUMP_VECT_SCOPE ("vectorizable_induction");
8245 return true;
8248 /* Transform. */
8250 /* Compute a vector variable, initialized with the first VF values of
8251 the induction variable. E.g., for an iv with IV_PHI='X' and
8252 evolution S, for a vector of 4 units, we want to compute:
8253 [X, X + S, X + 2*S, X + 3*S]. */
8255 if (dump_enabled_p ())
8256 dump_printf_loc (MSG_NOTE, vect_location, "transform induction phi.\n");
8258 pe = loop_preheader_edge (iv_loop);
8259 /* Find the first insertion point in the BB. */
8260 basic_block bb = gimple_bb (phi);
8261 si = gsi_after_labels (bb);
8263 /* For SLP induction we have to generate several IVs as for example
8264 with group size 3 we need
8265 [i0, i1, i2, i0 + S0] [i1 + S1, i2 + S2, i0 + 2*S0, i1 + 2*S1]
8266 [i2 + 2*S2, i0 + 3*S0, i1 + 3*S1, i2 + 3*S2]. */
8267 if (slp_node)
8269 /* Enforced above. */
8270 unsigned int const_nunits = nunits.to_constant ();
8272 /* The initial values are vectorized, but any lanes > group_size
8273 need adjustment. */
8274 slp_tree init_node
8275 = SLP_TREE_CHILDREN (slp_node)[pe->dest_idx];
8277 /* Gather steps. Since we do not vectorize inductions as
8278 cycles we have to reconstruct the step from SCEV data. */
8279 unsigned group_size = SLP_TREE_LANES (slp_node);
8280 tree *steps = XALLOCAVEC (tree, group_size);
8281 tree *inits = XALLOCAVEC (tree, group_size);
8282 stmt_vec_info phi_info;
8283 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (slp_node), i, phi_info)
8285 steps[i] = STMT_VINFO_LOOP_PHI_EVOLUTION_PART (phi_info);
8286 if (!init_node)
8287 inits[i] = gimple_phi_arg_def (as_a<gphi *> (phi_info->stmt),
8288 pe->dest_idx);
8291 /* Now generate the IVs. */
8292 unsigned nvects = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node);
8293 gcc_assert ((const_nunits * nvects) % group_size == 0);
8294 unsigned nivs;
8295 if (nested_in_vect_loop)
8296 nivs = nvects;
8297 else
8299 /* Compute the number of distinct IVs we need. First reduce
8300 group_size if it is a multiple of const_nunits so we get
8301 one IV for a group_size of 4 but const_nunits 2. */
8302 unsigned group_sizep = group_size;
8303 if (group_sizep % const_nunits == 0)
8304 group_sizep = group_sizep / const_nunits;
8305 nivs = least_common_multiple (group_sizep,
8306 const_nunits) / const_nunits;
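/* Worked example (illustrative): with group_size 3 and const_nunits 4,
   group_sizep stays 3 and nivs = least_common_multiple (3, 4) / 4 = 3,
   i.e. three distinct vector IVs as in the group-size-3 comment
   above.  */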
8308 tree stept = TREE_TYPE (step_vectype);
8309 tree lupdate_mul = NULL_TREE;
8310 if (!nested_in_vect_loop)
8312 /* The number of iterations covered in one vector iteration. */
8313 unsigned lup_mul = (nvects * const_nunits) / group_size;
8314 lupdate_mul
8315 = build_vector_from_val (step_vectype,
8316 SCALAR_FLOAT_TYPE_P (stept)
8317 ? build_real_from_wide (stept, lup_mul,
8318 UNSIGNED)
8319 : build_int_cstu (stept, lup_mul));
8321 tree peel_mul = NULL_TREE;
8322 gimple_seq init_stmts = NULL;
8323 if (LOOP_VINFO_MASK_SKIP_NITERS (loop_vinfo))
8325 if (SCALAR_FLOAT_TYPE_P (stept))
8326 peel_mul = gimple_build (&init_stmts, FLOAT_EXPR, stept,
8327 LOOP_VINFO_MASK_SKIP_NITERS (loop_vinfo));
8328 else
8329 peel_mul = gimple_convert (&init_stmts, stept,
8330 LOOP_VINFO_MASK_SKIP_NITERS (loop_vinfo));
8331 peel_mul = gimple_build_vector_from_val (&init_stmts,
8332 step_vectype, peel_mul);
8334 unsigned ivn;
8335 auto_vec<tree> vec_steps;
8336 for (ivn = 0; ivn < nivs; ++ivn)
8338 tree_vector_builder step_elts (step_vectype, const_nunits, 1);
8339 tree_vector_builder init_elts (vectype, const_nunits, 1);
8340 tree_vector_builder mul_elts (step_vectype, const_nunits, 1);
8341 for (unsigned eltn = 0; eltn < const_nunits; ++eltn)
8343 /* The scalar steps of the IVs. */
8344 tree elt = steps[(ivn*const_nunits + eltn) % group_size];
8345 elt = gimple_convert (&init_stmts, TREE_TYPE (step_vectype), elt);
8346 step_elts.quick_push (elt);
8347 if (!init_node)
8349 /* The scalar inits of the IVs if not vectorized. */
8350 elt = inits[(ivn*const_nunits + eltn) % group_size];
8351 if (!useless_type_conversion_p (TREE_TYPE (vectype),
8352 TREE_TYPE (elt)))
8353 elt = gimple_build (&init_stmts, VIEW_CONVERT_EXPR,
8354 TREE_TYPE (vectype), elt);
8355 init_elts.quick_push (elt);
8357 /* The number of steps to add to the initial values. */
8358 unsigned mul_elt = (ivn*const_nunits + eltn) / group_size;
8359 mul_elts.quick_push (SCALAR_FLOAT_TYPE_P (stept)
8360 ? build_real_from_wide (stept,
8361 mul_elt, UNSIGNED)
8362 : build_int_cstu (stept, mul_elt));
8364 vec_step = gimple_build_vector (&init_stmts, &step_elts);
8365 vec_steps.safe_push (vec_step);
8366 tree step_mul = gimple_build_vector (&init_stmts, &mul_elts);
8367 if (peel_mul)
8368 step_mul = gimple_build (&init_stmts, PLUS_EXPR, step_vectype,
8369 step_mul, peel_mul);
8370 if (!init_node)
8371 vec_init = gimple_build_vector (&init_stmts, &init_elts);
8373 /* Create the induction-phi that defines the induction-operand. */
8374 vec_dest = vect_get_new_vect_var (vectype, vect_simple_var,
8375 "vec_iv_");
8376 induction_phi = create_phi_node (vec_dest, iv_loop->header);
8377 induc_def = PHI_RESULT (induction_phi);
8379 /* Create the iv update inside the loop */
8380 tree up = vec_step;
8381 if (lupdate_mul)
8382 up = gimple_build (&init_stmts, MULT_EXPR, step_vectype,
8383 vec_step, lupdate_mul);
8384 gimple_seq stmts = NULL;
8385 vec_def = gimple_convert (&stmts, step_vectype, induc_def);
8386 vec_def = gimple_build (&stmts,
8387 PLUS_EXPR, step_vectype, vec_def, up);
8388 vec_def = gimple_convert (&stmts, vectype, vec_def);
8389 gsi_insert_seq_before (&si, stmts, GSI_SAME_STMT);
8390 add_phi_arg (induction_phi, vec_def, loop_latch_edge (iv_loop),
8391 UNKNOWN_LOCATION);
8393 if (init_node)
8394 vec_init = vect_get_slp_vect_def (init_node, ivn);
8395 if (!nested_in_vect_loop
8396 && !integer_zerop (step_mul))
8398 vec_def = gimple_convert (&init_stmts, step_vectype, vec_init);
8399 up = gimple_build (&init_stmts, MULT_EXPR, step_vectype,
8400 vec_step, step_mul);
8401 vec_def = gimple_build (&init_stmts, PLUS_EXPR, step_vectype,
8402 vec_def, up);
8403 vec_init = gimple_convert (&init_stmts, vectype, vec_def);
8406 /* Set the arguments of the phi node: */
8407 add_phi_arg (induction_phi, vec_init, pe, UNKNOWN_LOCATION);
8409 SLP_TREE_VEC_STMTS (slp_node).quick_push (induction_phi);
8411 if (!nested_in_vect_loop)
8413 /* Fill up to the number of vectors we need for the whole group. */
8414 nivs = least_common_multiple (group_size,
8415 const_nunits) / const_nunits;
8416 vec_steps.reserve (nivs-ivn);
8417 for (; ivn < nivs; ++ivn)
8419 SLP_TREE_VEC_STMTS (slp_node)
8420 .quick_push (SLP_TREE_VEC_STMTS (slp_node)[0]);
8421 vec_steps.quick_push (vec_steps[0]);
8425 /* Re-use IVs when we can. We are generating further vector
8426 stmts by adding VF' * stride to the IVs generated above. */
8427 if (ivn < nvects)
8429 unsigned vfp
8430 = least_common_multiple (group_size, const_nunits) / group_size;
8431 tree lupdate_mul
8432 = build_vector_from_val (step_vectype,
8433 SCALAR_FLOAT_TYPE_P (stept)
8434 ? build_real_from_wide (stept,
8435 vfp, UNSIGNED)
8436 : build_int_cstu (stept, vfp));
8437 for (; ivn < nvects; ++ivn)
8439 gimple *iv = SLP_TREE_VEC_STMTS (slp_node)[ivn - nivs];
8440 tree def = gimple_get_lhs (iv);
8441 if (ivn < 2*nivs)
8442 vec_steps[ivn - nivs]
8443 = gimple_build (&init_stmts, MULT_EXPR, step_vectype,
8444 vec_steps[ivn - nivs], lupdate_mul);
8445 gimple_seq stmts = NULL;
8446 def = gimple_convert (&stmts, step_vectype, def);
8447 def = gimple_build (&stmts, PLUS_EXPR, step_vectype,
8448 def, vec_steps[ivn % nivs]);
8449 def = gimple_convert (&stmts, vectype, def);
8450 if (gimple_code (iv) == GIMPLE_PHI)
8451 gsi_insert_seq_before (&si, stmts, GSI_SAME_STMT);
8452 else
8454 gimple_stmt_iterator tgsi = gsi_for_stmt (iv);
8455 gsi_insert_seq_after (&tgsi, stmts, GSI_CONTINUE_LINKING);
8457 SLP_TREE_VEC_STMTS (slp_node)
8458 .quick_push (SSA_NAME_DEF_STMT (def));
8462 new_bb = gsi_insert_seq_on_edge_immediate (pe, init_stmts);
8463 gcc_assert (!new_bb);
8465 return true;
8468 init_expr = vect_phi_initial_value (phi);
8470 gimple_seq stmts = NULL;
8471 if (!nested_in_vect_loop)
8473 /* Convert the initial value to the IV update type. */
8474 tree new_type = TREE_TYPE (step_expr);
8475 init_expr = gimple_convert (&stmts, new_type, init_expr);
8477 /* If we are using the loop mask to "peel" for alignment then we need
8478 to adjust the start value here. */
8479 tree skip_niters = LOOP_VINFO_MASK_SKIP_NITERS (loop_vinfo);
8480 if (skip_niters != NULL_TREE)
8482 if (FLOAT_TYPE_P (vectype))
8483 skip_niters = gimple_build (&stmts, FLOAT_EXPR, new_type,
8484 skip_niters);
8485 else
8486 skip_niters = gimple_convert (&stmts, new_type, skip_niters);
8487 tree skip_step = gimple_build (&stmts, MULT_EXPR, new_type,
8488 skip_niters, step_expr);
8489 init_expr = gimple_build (&stmts, MINUS_EXPR, new_type,
8490 init_expr, skip_step);
8494 if (stmts)
8496 new_bb = gsi_insert_seq_on_edge_immediate (pe, stmts);
8497 gcc_assert (!new_bb);
8500 /* Create the vector that holds the initial_value of the induction. */
8501 if (nested_in_vect_loop)
 8503 /* iv_loop is nested in the loop to be vectorized. init_expr has already
 8504 been created during vectorization of previous stmts. We obtain it
 8505 from the STMT_VINFO_VEC_STMT of the defining stmt. */
8506 auto_vec<tree> vec_inits;
8507 vect_get_vec_defs_for_operand (loop_vinfo, stmt_info, 1,
8508 init_expr, &vec_inits);
8509 vec_init = vec_inits[0];
8510 /* If the initial value is not of proper type, convert it. */
8511 if (!useless_type_conversion_p (vectype, TREE_TYPE (vec_init)))
8513 new_stmt
8514 = gimple_build_assign (vect_get_new_ssa_name (vectype,
8515 vect_simple_var,
8516 "vec_iv_"),
8517 VIEW_CONVERT_EXPR,
8518 build1 (VIEW_CONVERT_EXPR, vectype,
8519 vec_init));
8520 vec_init = gimple_assign_lhs (new_stmt);
8521 new_bb = gsi_insert_on_edge_immediate (loop_preheader_edge (iv_loop),
8522 new_stmt);
8523 gcc_assert (!new_bb);
8526 else
8528 /* iv_loop is the loop to be vectorized. Create:
8529 vec_init = [X, X+S, X+2*S, X+3*S] (S = step_expr, X = init_expr) */
8530 stmts = NULL;
8531 new_name = gimple_convert (&stmts, TREE_TYPE (step_expr), init_expr);
8533 unsigned HOST_WIDE_INT const_nunits;
8534 if (nunits.is_constant (&const_nunits))
8536 tree_vector_builder elts (step_vectype, const_nunits, 1);
8537 elts.quick_push (new_name);
8538 for (i = 1; i < const_nunits; i++)
8540 /* Create: new_name_i = new_name + step_expr */
8541 new_name = gimple_build (&stmts, PLUS_EXPR, TREE_TYPE (new_name),
8542 new_name, step_expr);
8543 elts.quick_push (new_name);
8545 /* Create a vector from [new_name_0, new_name_1, ...,
8546 new_name_nunits-1] */
8547 vec_init = gimple_build_vector (&stmts, &elts);
8549 else if (INTEGRAL_TYPE_P (TREE_TYPE (step_expr)))
8550 /* Build the initial value directly from a VEC_SERIES_EXPR. */
8551 vec_init = gimple_build (&stmts, VEC_SERIES_EXPR, step_vectype,
8552 new_name, step_expr);
8553 else
8555 /* Build:
8556 [base, base, base, ...]
8557 + (vectype) [0, 1, 2, ...] * [step, step, step, ...]. */
8558 gcc_assert (SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr)));
8559 gcc_assert (flag_associative_math);
8560 tree index = build_index_vector (step_vectype, 0, 1);
8561 tree base_vec = gimple_build_vector_from_val (&stmts, step_vectype,
8562 new_name);
8563 tree step_vec = gimple_build_vector_from_val (&stmts, step_vectype,
8564 step_expr);
8565 vec_init = gimple_build (&stmts, FLOAT_EXPR, step_vectype, index);
8566 vec_init = gimple_build (&stmts, MULT_EXPR, step_vectype,
8567 vec_init, step_vec);
8568 vec_init = gimple_build (&stmts, PLUS_EXPR, step_vectype,
8569 vec_init, base_vec);
8571 vec_init = gimple_convert (&stmts, vectype, vec_init);
8573 if (stmts)
8575 new_bb = gsi_insert_seq_on_edge_immediate (pe, stmts);
8576 gcc_assert (!new_bb);
8581 /* Create the vector that holds the step of the induction. */
8582 if (nested_in_vect_loop)
8583 /* iv_loop is nested in the loop to be vectorized. Generate:
8584 vec_step = [S, S, S, S] */
8585 new_name = step_expr;
8586 else
8588 /* iv_loop is the loop to be vectorized. Generate:
8589 vec_step = [VF*S, VF*S, VF*S, VF*S] */
8590 gimple_seq seq = NULL;
8591 if (SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr)))
8593 expr = build_int_cst (integer_type_node, vf);
8594 expr = gimple_build (&seq, FLOAT_EXPR, TREE_TYPE (step_expr), expr);
8596 else
8597 expr = build_int_cst (TREE_TYPE (step_expr), vf);
8598 new_name = gimple_build (&seq, MULT_EXPR, TREE_TYPE (step_expr),
8599 expr, step_expr);
8600 if (seq)
8602 new_bb = gsi_insert_seq_on_edge_immediate (pe, seq);
8603 gcc_assert (!new_bb);
8607 t = unshare_expr (new_name);
8608 gcc_assert (CONSTANT_CLASS_P (new_name)
8609 || TREE_CODE (new_name) == SSA_NAME);
8610 new_vec = build_vector_from_val (step_vectype, t);
8611 vec_step = vect_init_vector (loop_vinfo, stmt_info,
8612 new_vec, step_vectype, NULL);
8615 /* Create the following def-use cycle:
8616 loop prolog:
8617 vec_init = ...
8618 vec_step = ...
8619 loop:
8620 vec_iv = PHI <vec_init, vec_loop>
8622 STMT
8624 vec_loop = vec_iv + vec_step; */
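 /* For example, with init_expr X, step_expr S and VF = 4 the code above has
 materialized
 vec_init = { X, X+S, X+2*S, X+3*S }
 vec_step = { 4*S, 4*S, 4*S, 4*S }
 so each vector iteration advances every lane by VF * S. */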
8626 /* Create the induction-phi that defines the induction-operand. */
8627 vec_dest = vect_get_new_vect_var (vectype, vect_simple_var, "vec_iv_");
8628 induction_phi = create_phi_node (vec_dest, iv_loop->header);
8629 induc_def = PHI_RESULT (induction_phi);
8631 /* Create the iv update inside the loop */
8632 stmts = NULL;
8633 vec_def = gimple_convert (&stmts, step_vectype, induc_def);
8634 vec_def = gimple_build (&stmts, PLUS_EXPR, step_vectype, vec_def, vec_step);
8635 vec_def = gimple_convert (&stmts, vectype, vec_def);
8636 gsi_insert_seq_before (&si, stmts, GSI_SAME_STMT);
8637 new_stmt = SSA_NAME_DEF_STMT (vec_def);
8639 /* Set the arguments of the phi node: */
8640 add_phi_arg (induction_phi, vec_init, pe, UNKNOWN_LOCATION);
8641 add_phi_arg (induction_phi, vec_def, loop_latch_edge (iv_loop),
8642 UNKNOWN_LOCATION);
8644 STMT_VINFO_VEC_STMTS (stmt_info).safe_push (induction_phi);
8645 *vec_stmt = induction_phi;
 8647 /* In case the vectorization factor (VF) is bigger than the number
 8648 of elements that we can fit in a vectype (nunits), we have to generate
 8649 more than one vector stmt - i.e. we need to "unroll" the
 8650 vector stmt by a factor VF/nunits. For more details see the
 8651 documentation in vectorizable_operation. */
8653 if (ncopies > 1)
8655 gimple_seq seq = NULL;
8656 /* FORNOW. This restriction should be relaxed. */
8657 gcc_assert (!nested_in_vect_loop);
8659 /* Create the vector that holds the step of the induction. */
8660 if (SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr)))
8662 expr = build_int_cst (integer_type_node, nunits);
8663 expr = gimple_build (&seq, FLOAT_EXPR, TREE_TYPE (step_expr), expr);
8665 else
8666 expr = build_int_cst (TREE_TYPE (step_expr), nunits);
8667 new_name = gimple_build (&seq, MULT_EXPR, TREE_TYPE (step_expr),
8668 expr, step_expr);
8669 if (seq)
8671 new_bb = gsi_insert_seq_on_edge_immediate (pe, seq);
8672 gcc_assert (!new_bb);
8675 t = unshare_expr (new_name);
8676 gcc_assert (CONSTANT_CLASS_P (new_name)
8677 || TREE_CODE (new_name) == SSA_NAME);
8678 new_vec = build_vector_from_val (step_vectype, t);
8679 vec_step = vect_init_vector (loop_vinfo, stmt_info,
8680 new_vec, step_vectype, NULL);
8682 vec_def = induc_def;
8683 for (i = 1; i < ncopies; i++)
8685 /* vec_i = vec_prev + vec_step */
8686 gimple_seq stmts = NULL;
8687 vec_def = gimple_convert (&stmts, step_vectype, vec_def);
8688 vec_def = gimple_build (&stmts,
8689 PLUS_EXPR, step_vectype, vec_def, vec_step);
8690 vec_def = gimple_convert (&stmts, vectype, vec_def);
8692 gsi_insert_seq_before (&si, stmts, GSI_SAME_STMT);
8693 new_stmt = SSA_NAME_DEF_STMT (vec_def);
8694 STMT_VINFO_VEC_STMTS (stmt_info).safe_push (new_stmt);
8698 if (dump_enabled_p ())
8699 dump_printf_loc (MSG_NOTE, vect_location,
8700 "transform induction: created def-use cycle: %G%G",
8701 induction_phi, SSA_NAME_DEF_STMT (vec_def));
8703 return true;
8706 /* Function vectorizable_live_operation.
8708 STMT_INFO computes a value that is used outside the loop. Check if
8709 it can be supported. */
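 /* For example, in

 for (i = 0; i < n; i++)
 last = a[i];

 LAST is live: its final value is used after the loop and has to be
 extracted from the appropriate lane of the last vector computed inside
 the loop. */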
8711 bool
8712 vectorizable_live_operation (vec_info *vinfo,
8713 stmt_vec_info stmt_info,
8714 gimple_stmt_iterator *gsi,
8715 slp_tree slp_node, slp_instance slp_node_instance,
8716 int slp_index, bool vec_stmt_p,
8717 stmt_vector_for_cost *cost_vec)
8719 loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo);
8720 imm_use_iterator imm_iter;
8721 tree lhs, lhs_type, bitsize;
8722 tree vectype = (slp_node
8723 ? SLP_TREE_VECTYPE (slp_node)
8724 : STMT_VINFO_VECTYPE (stmt_info));
8725 poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (vectype);
8726 int ncopies;
8727 gimple *use_stmt;
8728 auto_vec<tree> vec_oprnds;
8729 int vec_entry = 0;
8730 poly_uint64 vec_index = 0;
8732 gcc_assert (STMT_VINFO_LIVE_P (stmt_info));
8734 /* If a stmt of a reduction is live, vectorize it via
8735 vect_create_epilog_for_reduction. vectorizable_reduction assessed
8736 validity so just trigger the transform here. */
8737 if (STMT_VINFO_REDUC_DEF (vect_orig_stmt (stmt_info)))
8739 if (!vec_stmt_p)
8740 return true;
8741 if (slp_node)
8743 /* For reduction chains the meta-info is attached to
8744 the group leader. */
8745 if (REDUC_GROUP_FIRST_ELEMENT (stmt_info))
8746 stmt_info = REDUC_GROUP_FIRST_ELEMENT (stmt_info);
8747 /* For SLP reductions we vectorize the epilogue for
8748 all involved stmts together. */
8749 else if (slp_index != 0)
8750 return true;
8751 else
8752 /* For SLP reductions the meta-info is attached to
8753 the representative. */
8754 stmt_info = SLP_TREE_REPRESENTATIVE (slp_node);
8756 stmt_vec_info reduc_info = info_for_reduction (loop_vinfo, stmt_info);
8757 gcc_assert (reduc_info->is_reduc_info);
8758 if (STMT_VINFO_REDUC_TYPE (reduc_info) == FOLD_LEFT_REDUCTION
8759 || STMT_VINFO_REDUC_TYPE (reduc_info) == EXTRACT_LAST_REDUCTION)
8760 return true;
8761 vect_create_epilog_for_reduction (loop_vinfo, stmt_info, slp_node,
8762 slp_node_instance);
8763 return true;
8766 /* If STMT is not relevant and it is a simple assignment and its inputs are
8767 invariant then it can remain in place, unvectorized. The original last
8768 scalar value that it computes will be used. */
8769 if (!STMT_VINFO_RELEVANT_P (stmt_info))
8771 gcc_assert (is_simple_and_all_uses_invariant (stmt_info, loop_vinfo));
8772 if (dump_enabled_p ())
8773 dump_printf_loc (MSG_NOTE, vect_location,
8774 "statement is simple and uses invariant. Leaving in "
8775 "place.\n");
8776 return true;
8779 if (slp_node)
8780 ncopies = 1;
8781 else
8782 ncopies = vect_get_num_copies (loop_vinfo, vectype);
8784 if (slp_node)
8786 gcc_assert (slp_index >= 0);
8788 /* Get the last occurrence of the scalar index from the concatenation of
8789 all the slp vectors. Calculate which slp vector it is and the index
8790 within. */
8791 int num_scalar = SLP_TREE_LANES (slp_node);
8792 int num_vec = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node);
8793 poly_uint64 pos = (num_vec * nunits) - num_scalar + slp_index;
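 /* For example, with two four-lane vectors and three scalar lanes
 (num_vec 2, nunits 4, num_scalar 3), slp_index 0 gives pos 5, i.e.
 vec_entry 1 and vec_index 1 below. */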
8795 /* Calculate which vector contains the result, and which lane of
8796 that vector we need. */
8797 if (!can_div_trunc_p (pos, nunits, &vec_entry, &vec_index))
8799 if (dump_enabled_p ())
8800 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
8801 "Cannot determine which vector holds the"
8802 " final result.\n");
8803 return false;
8807 if (!vec_stmt_p)
8809 /* No transformation required. */
8810 if (loop_vinfo && LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo))
8812 if (!direct_internal_fn_supported_p (IFN_EXTRACT_LAST, vectype,
8813 OPTIMIZE_FOR_SPEED))
8815 if (dump_enabled_p ())
8816 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
8817 "can't operate on partial vectors "
8818 "because the target doesn't support extract "
8819 "last reduction.\n");
8820 LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo) = false;
8822 else if (slp_node)
8824 if (dump_enabled_p ())
8825 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
8826 "can't operate on partial vectors "
8827 "because an SLP statement is live after "
8828 "the loop.\n");
8829 LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo) = false;
8831 else if (ncopies > 1)
8833 if (dump_enabled_p ())
8834 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
8835 "can't operate on partial vectors "
8836 "because ncopies is greater than 1.\n");
8837 LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo) = false;
8839 else
8841 gcc_assert (ncopies == 1 && !slp_node);
8842 vect_record_loop_mask (loop_vinfo,
8843 &LOOP_VINFO_MASKS (loop_vinfo),
8844 1, vectype, NULL);
8847 /* ??? Enable for loop costing as well. */
8848 if (!loop_vinfo)
8849 record_stmt_cost (cost_vec, 1, vec_to_scalar, stmt_info, NULL_TREE,
8850 0, vect_epilogue);
8851 return true;
8854 /* Use the lhs of the original scalar statement. */
8855 gimple *stmt = vect_orig_stmt (stmt_info)->stmt;
8856 if (dump_enabled_p ())
8857 dump_printf_loc (MSG_NOTE, vect_location, "extracting lane for live "
8858 "stmt %G", stmt);
8860 lhs = gimple_get_lhs (stmt);
8861 lhs_type = TREE_TYPE (lhs);
8863 bitsize = vector_element_bits_tree (vectype);
8865 /* Get the vectorized lhs of STMT and the lane to use (counted in bits). */
8866 tree vec_lhs, bitstart;
8867 gimple *vec_stmt;
8868 if (slp_node)
8870 gcc_assert (!loop_vinfo || !LOOP_VINFO_FULLY_MASKED_P (loop_vinfo));
8872 /* Get the correct slp vectorized stmt. */
8873 vec_stmt = SLP_TREE_VEC_STMTS (slp_node)[vec_entry];
8874 vec_lhs = gimple_get_lhs (vec_stmt);
8876 /* Get entry to use. */
8877 bitstart = bitsize_int (vec_index);
8878 bitstart = int_const_binop (MULT_EXPR, bitsize, bitstart);
8880 else
8882 /* For multiple copies, get the last copy. */
8883 vec_stmt = STMT_VINFO_VEC_STMTS (stmt_info).last ();
8884 vec_lhs = gimple_get_lhs (vec_stmt);
8886 /* Get the last lane in the vector. */
8887 bitstart = int_const_binop (MULT_EXPR, bitsize, bitsize_int (nunits - 1));
8890 if (loop_vinfo)
 8892 /* Ensure the VEC_LHS for lane extraction stmts satisfies the loop-closed
 8893 PHI requirement; insert one PHI node for it. It turns:
 8894 loop;
 8896 # lhs' = PHI <lhs>
 into:
 8898 loop;
 8900 # vec_lhs' = PHI <vec_lhs>
 8901 new_tree = lane_extract <vec_lhs', ...>;
 8902 lhs' = new_tree; */
8904 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
8905 basic_block exit_bb = single_exit (loop)->dest;
8906 gcc_assert (single_pred_p (exit_bb));
8908 tree vec_lhs_phi = copy_ssa_name (vec_lhs);
8909 gimple *phi = create_phi_node (vec_lhs_phi, exit_bb);
8910 SET_PHI_ARG_DEF (phi, single_exit (loop)->dest_idx, vec_lhs);
8912 gimple_seq stmts = NULL;
8913 tree new_tree;
8914 if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
8916 /* Emit:
8918 SCALAR_RES = EXTRACT_LAST <VEC_LHS, MASK>
8920 where VEC_LHS is the vectorized live-out result and MASK is
8921 the loop mask for the final iteration. */
8922 gcc_assert (ncopies == 1 && !slp_node);
8923 tree scalar_type = TREE_TYPE (STMT_VINFO_VECTYPE (stmt_info));
8924 tree mask = vect_get_loop_mask (gsi, &LOOP_VINFO_MASKS (loop_vinfo),
8925 1, vectype, 0);
8926 tree scalar_res = gimple_build (&stmts, CFN_EXTRACT_LAST, scalar_type,
8927 mask, vec_lhs_phi);
8929 /* Convert the extracted vector element to the scalar type. */
8930 new_tree = gimple_convert (&stmts, lhs_type, scalar_res);
8932 else
8934 tree bftype = TREE_TYPE (vectype);
8935 if (VECTOR_BOOLEAN_TYPE_P (vectype))
8936 bftype = build_nonstandard_integer_type (tree_to_uhwi (bitsize), 1);
8937 new_tree = build3 (BIT_FIELD_REF, bftype,
8938 vec_lhs_phi, bitsize, bitstart);
8939 new_tree = force_gimple_operand (fold_convert (lhs_type, new_tree),
8940 &stmts, true, NULL_TREE);
8943 if (stmts)
8945 gimple_stmt_iterator exit_gsi = gsi_after_labels (exit_bb);
8946 gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
8948 /* Remove existing phi from lhs and create one copy from new_tree. */
8949 tree lhs_phi = NULL_TREE;
8950 gimple_stmt_iterator gsi;
8951 for (gsi = gsi_start_phis (exit_bb);
8952 !gsi_end_p (gsi); gsi_next (&gsi))
8954 gimple *phi = gsi_stmt (gsi);
 8955 if (gimple_phi_arg_def (phi, 0) == lhs)
8957 remove_phi_node (&gsi, false);
8958 lhs_phi = gimple_phi_result (phi);
8959 gimple *copy = gimple_build_assign (lhs_phi, new_tree);
8960 gsi_insert_before (&exit_gsi, copy, GSI_SAME_STMT);
8961 break;
 8966 /* Replace use of lhs with newly computed result. If the use stmt is a
 8967 single arg PHI, just replace all uses of the PHI result. It's necessary
 8968 because the lcssa PHI defining lhs may appear before the newly inserted stmt. */
8969 use_operand_p use_p;
8970 FOR_EACH_IMM_USE_STMT (use_stmt, imm_iter, lhs)
8971 if (!flow_bb_inside_loop_p (loop, gimple_bb (use_stmt))
8972 && !is_gimple_debug (use_stmt))
8974 if (gimple_code (use_stmt) == GIMPLE_PHI
8975 && gimple_phi_num_args (use_stmt) == 1)
8977 replace_uses_by (gimple_phi_result (use_stmt), new_tree);
8979 else
8981 FOR_EACH_IMM_USE_ON_STMT (use_p, imm_iter)
8982 SET_USE (use_p, new_tree);
8984 update_stmt (use_stmt);
8987 else
8989 /* For basic-block vectorization simply insert the lane-extraction. */
8990 tree bftype = TREE_TYPE (vectype);
8991 if (VECTOR_BOOLEAN_TYPE_P (vectype))
8992 bftype = build_nonstandard_integer_type (tree_to_uhwi (bitsize), 1);
8993 tree new_tree = build3 (BIT_FIELD_REF, bftype,
8994 vec_lhs, bitsize, bitstart);
8995 gimple_seq stmts = NULL;
8996 new_tree = force_gimple_operand (fold_convert (lhs_type, new_tree),
8997 &stmts, true, NULL_TREE);
8998 if (TREE_CODE (new_tree) == SSA_NAME
8999 && SSA_NAME_OCCURS_IN_ABNORMAL_PHI (lhs))
9000 SSA_NAME_OCCURS_IN_ABNORMAL_PHI (new_tree) = 1;
9001 if (is_a <gphi *> (vec_stmt))
9003 gimple_stmt_iterator si = gsi_after_labels (gimple_bb (vec_stmt));
9004 gsi_insert_seq_before (&si, stmts, GSI_SAME_STMT);
9006 else
9008 gimple_stmt_iterator si = gsi_for_stmt (vec_stmt);
9009 gsi_insert_seq_after (&si, stmts, GSI_SAME_STMT);
 9012 /* Replace use of lhs with newly computed result. If the use stmt is a
 9013 single arg PHI, just replace all uses of the PHI result. It's necessary
 9014 because the lcssa PHI defining lhs may appear before the newly inserted stmt. */
9015 use_operand_p use_p;
9016 stmt_vec_info use_stmt_info;
9017 FOR_EACH_IMM_USE_STMT (use_stmt, imm_iter, lhs)
9018 if (!is_gimple_debug (use_stmt)
9019 && (!(use_stmt_info = vinfo->lookup_stmt (use_stmt))
9020 || !PURE_SLP_STMT (vect_stmt_to_vectorize (use_stmt_info))))
9022 /* ??? This can happen when the live lane ends up being
9023 used in a vector construction code-generated by an
9024 external SLP node (and code-generation for that already
9025 happened). See gcc.dg/vect/bb-slp-47.c.
9026 Doing this is what would happen if that vector CTOR
9027 were not code-generated yet so it is not too bad.
9028 ??? In fact we'd likely want to avoid this situation
9029 in the first place. */
9030 if (TREE_CODE (new_tree) == SSA_NAME
9031 && !SSA_NAME_IS_DEFAULT_DEF (new_tree)
9032 && gimple_code (use_stmt) != GIMPLE_PHI
9033 && !vect_stmt_dominates_stmt_p (SSA_NAME_DEF_STMT (new_tree),
9034 use_stmt))
9036 enum tree_code code = gimple_assign_rhs_code (use_stmt);
9037 gcc_assert (code == CONSTRUCTOR
9038 || code == VIEW_CONVERT_EXPR
9039 || CONVERT_EXPR_CODE_P (code));
9040 if (dump_enabled_p ())
9041 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
9042 "Using original scalar computation for "
9043 "live lane because use preceeds vector "
9044 "def\n");
9045 continue;
9047 /* ??? It can also happen that we end up pulling a def into
9048 a loop where replacing out-of-loop uses would require
9049 a new LC SSA PHI node. Retain the original scalar in
9050 those cases as well. PR98064. */
9051 if (TREE_CODE (new_tree) == SSA_NAME
9052 && !SSA_NAME_IS_DEFAULT_DEF (new_tree)
9053 && (gimple_bb (use_stmt)->loop_father
9054 != gimple_bb (vec_stmt)->loop_father)
9055 && !flow_loop_nested_p (gimple_bb (vec_stmt)->loop_father,
9056 gimple_bb (use_stmt)->loop_father))
9058 if (dump_enabled_p ())
9059 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
9060 "Using original scalar computation for "
9061 "live lane because there is an out-of-loop "
9062 "definition for it\n");
9063 continue;
9065 FOR_EACH_IMM_USE_ON_STMT (use_p, imm_iter)
9066 SET_USE (use_p, new_tree);
9067 update_stmt (use_stmt);
9071 return true;
9074 /* Kill any debug uses outside LOOP of SSA names defined in STMT_INFO. */
9076 static void
9077 vect_loop_kill_debug_uses (class loop *loop, stmt_vec_info stmt_info)
9079 ssa_op_iter op_iter;
9080 imm_use_iterator imm_iter;
9081 def_operand_p def_p;
9082 gimple *ustmt;
9084 FOR_EACH_PHI_OR_STMT_DEF (def_p, stmt_info->stmt, op_iter, SSA_OP_DEF)
9086 FOR_EACH_IMM_USE_STMT (ustmt, imm_iter, DEF_FROM_PTR (def_p))
9088 basic_block bb;
9090 if (!is_gimple_debug (ustmt))
9091 continue;
9093 bb = gimple_bb (ustmt);
9095 if (!flow_bb_inside_loop_p (loop, bb))
9097 if (gimple_debug_bind_p (ustmt))
9099 if (dump_enabled_p ())
9100 dump_printf_loc (MSG_NOTE, vect_location,
9101 "killing debug use\n");
9103 gimple_debug_bind_reset_value (ustmt);
9104 update_stmt (ustmt);
9106 else
9107 gcc_unreachable ();
9113 /* Given loop represented by LOOP_VINFO, return true if computation of
9114 LOOP_VINFO_NITERS (= LOOP_VINFO_NITERSM1 + 1) doesn't overflow, false
9115 otherwise. */
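 /* E.g. if NITERSM1 is the maximum value of its type then NITERS wraps
 around to zero; the constant check below then fails and we fall back to
 the loop's computed upper bound. */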
9117 static bool
9118 loop_niters_no_overflow (loop_vec_info loop_vinfo)
9120 /* Constant case. */
9121 if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
9123 tree cst_niters = LOOP_VINFO_NITERS (loop_vinfo);
9124 tree cst_nitersm1 = LOOP_VINFO_NITERSM1 (loop_vinfo);
9126 gcc_assert (TREE_CODE (cst_niters) == INTEGER_CST);
9127 gcc_assert (TREE_CODE (cst_nitersm1) == INTEGER_CST);
9128 if (wi::to_widest (cst_nitersm1) < wi::to_widest (cst_niters))
9129 return true;
9132 widest_int max;
9133 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
9134 /* Check the upper bound of loop niters. */
9135 if (get_max_loop_iterations (loop, &max))
9137 tree type = TREE_TYPE (LOOP_VINFO_NITERS (loop_vinfo));
9138 signop sgn = TYPE_SIGN (type);
9139 widest_int type_max = widest_int::from (wi::max_value (type), sgn);
9140 if (max < type_max)
9141 return true;
9143 return false;
9146 /* Return a mask type with half the number of elements as OLD_TYPE,
9147 given that it should have mode NEW_MODE. */
9149 tree
9150 vect_halve_mask_nunits (tree old_type, machine_mode new_mode)
9152 poly_uint64 nunits = exact_div (TYPE_VECTOR_SUBPARTS (old_type), 2);
9153 return build_truth_vector_type_for_mode (nunits, new_mode);
9156 /* Return a mask type with twice as many elements as OLD_TYPE,
9157 given that it should have mode NEW_MODE. */
9159 tree
9160 vect_double_mask_nunits (tree old_type, machine_mode new_mode)
9162 poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (old_type) * 2;
9163 return build_truth_vector_type_for_mode (nunits, new_mode);
9166 /* Record that a fully-masked version of LOOP_VINFO would need MASKS to
9167 contain a sequence of NVECTORS masks that each control a vector of type
9168 VECTYPE. If SCALAR_MASK is nonnull, the fully-masked loop would AND
9169 these vector masks with the vector version of SCALAR_MASK. */
9171 void
9172 vect_record_loop_mask (loop_vec_info loop_vinfo, vec_loop_masks *masks,
9173 unsigned int nvectors, tree vectype, tree scalar_mask)
9175 gcc_assert (nvectors != 0);
9176 if (masks->length () < nvectors)
9177 masks->safe_grow_cleared (nvectors, true);
9178 rgroup_controls *rgm = &(*masks)[nvectors - 1];
9179 /* The number of scalars per iteration and the number of vectors are
9180 both compile-time constants. */
9181 unsigned int nscalars_per_iter
9182 = exact_div (nvectors * TYPE_VECTOR_SUBPARTS (vectype),
9183 LOOP_VINFO_VECT_FACTOR (loop_vinfo)).to_constant ();
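 /* For example, two 4-lane mask vectors with a vectorization factor of 8
 give 2 * 4 / 8 = 1 scalar per iteration. */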
9185 if (scalar_mask)
9187 scalar_cond_masked_key cond (scalar_mask, nvectors);
9188 loop_vinfo->scalar_cond_masked_set.add (cond);
9191 if (rgm->max_nscalars_per_iter < nscalars_per_iter)
9193 rgm->max_nscalars_per_iter = nscalars_per_iter;
9194 rgm->type = truth_type_for (vectype);
9195 rgm->factor = 1;
9199 /* Given a complete set of masks MASKS, extract mask number INDEX
9200 for an rgroup that operates on NVECTORS vectors of type VECTYPE,
9201 where 0 <= INDEX < NVECTORS. Insert any set-up statements before GSI.
9203 See the comment above vec_loop_masks for more details about the mask
9204 arrangement. */
9206 tree
9207 vect_get_loop_mask (gimple_stmt_iterator *gsi, vec_loop_masks *masks,
9208 unsigned int nvectors, tree vectype, unsigned int index)
9210 rgroup_controls *rgm = &(*masks)[nvectors - 1];
9211 tree mask_type = rgm->type;
9213 /* Populate the rgroup's mask array, if this is the first time we've
9214 used it. */
9215 if (rgm->controls.is_empty ())
9217 rgm->controls.safe_grow_cleared (nvectors, true);
9218 for (unsigned int i = 0; i < nvectors; ++i)
9220 tree mask = make_temp_ssa_name (mask_type, NULL, "loop_mask");
9221 /* Provide a dummy definition until the real one is available. */
9222 SSA_NAME_DEF_STMT (mask) = gimple_build_nop ();
9223 rgm->controls[i] = mask;
9227 tree mask = rgm->controls[index];
9228 if (maybe_ne (TYPE_VECTOR_SUBPARTS (mask_type),
9229 TYPE_VECTOR_SUBPARTS (vectype)))
9231 /* A loop mask for data type X can be reused for data type Y
9232 if X has N times more elements than Y and if Y's elements
9233 are N times bigger than X's. In this case each sequence
9234 of N elements in the loop mask will be all-zero or all-one.
9235 We can then view-convert the mask so that each sequence of
9236 N elements is replaced by a single element. */
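 /* E.g. a mask with 16 lanes computed for QImode data can control a
 vector of 8 HImode elements: each pair of mask lanes is all-zero or
 all-one and becomes a single lane of the 8-lane mask after the
 view-convert. */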
9237 gcc_assert (multiple_p (TYPE_VECTOR_SUBPARTS (mask_type),
9238 TYPE_VECTOR_SUBPARTS (vectype)));
9239 gimple_seq seq = NULL;
9240 mask_type = truth_type_for (vectype);
9241 mask = gimple_build (&seq, VIEW_CONVERT_EXPR, mask_type, mask);
9242 if (seq)
9243 gsi_insert_seq_before (gsi, seq, GSI_SAME_STMT);
9245 return mask;
9248 /* Record that LOOP_VINFO would need LENS to contain a sequence of NVECTORS
9249 lengths for controlling an operation on VECTYPE. The operation splits
9250 each element of VECTYPE into FACTOR separate subelements, measuring the
9251 length as a number of these subelements. */
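 /* For example, a V4SI access that is carried out as a V16QI access uses
 FACTOR 4; a length of 8 subelements then covers the first two SI
 elements. */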
9253 void
9254 vect_record_loop_len (loop_vec_info loop_vinfo, vec_loop_lens *lens,
9255 unsigned int nvectors, tree vectype, unsigned int factor)
9257 gcc_assert (nvectors != 0);
9258 if (lens->length () < nvectors)
9259 lens->safe_grow_cleared (nvectors, true);
9260 rgroup_controls *rgl = &(*lens)[nvectors - 1];
 9262 /* The number of scalars per iteration, the number of bytes each scalar
 9263 occupies and the number of vectors are all compile-time constants. */
9264 unsigned int nscalars_per_iter
9265 = exact_div (nvectors * TYPE_VECTOR_SUBPARTS (vectype),
9266 LOOP_VINFO_VECT_FACTOR (loop_vinfo)).to_constant ();
9268 if (rgl->max_nscalars_per_iter < nscalars_per_iter)
9270 /* For now, we only support cases in which all loads and stores fall back
9271 to VnQI or none do. */
9272 gcc_assert (!rgl->max_nscalars_per_iter
9273 || (rgl->factor == 1 && factor == 1)
9274 || (rgl->max_nscalars_per_iter * rgl->factor
9275 == nscalars_per_iter * factor));
9276 rgl->max_nscalars_per_iter = nscalars_per_iter;
9277 rgl->type = vectype;
9278 rgl->factor = factor;
9282 /* Given a complete set of length LENS, extract length number INDEX for an
9283 rgroup that operates on NVECTORS vectors, where 0 <= INDEX < NVECTORS. */
9285 tree
9286 vect_get_loop_len (loop_vec_info loop_vinfo, vec_loop_lens *lens,
9287 unsigned int nvectors, unsigned int index)
9289 rgroup_controls *rgl = &(*lens)[nvectors - 1];
9290 bool use_bias_adjusted_len =
9291 LOOP_VINFO_PARTIAL_LOAD_STORE_BIAS (loop_vinfo) != 0;
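 /* Targets with a nonzero partial load/store bias are handed the
 bias-adjusted control below instead of the raw length. */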
9293 /* Populate the rgroup's len array, if this is the first time we've
9294 used it. */
9295 if (rgl->controls.is_empty ())
9297 rgl->controls.safe_grow_cleared (nvectors, true);
9298 for (unsigned int i = 0; i < nvectors; ++i)
9300 tree len_type = LOOP_VINFO_RGROUP_COMPARE_TYPE (loop_vinfo);
9301 gcc_assert (len_type != NULL_TREE);
9303 tree len = make_temp_ssa_name (len_type, NULL, "loop_len");
9305 /* Provide a dummy definition until the real one is available. */
9306 SSA_NAME_DEF_STMT (len) = gimple_build_nop ();
9307 rgl->controls[i] = len;
9309 if (use_bias_adjusted_len)
9311 gcc_assert (i == 0);
9312 tree adjusted_len =
9313 make_temp_ssa_name (len_type, NULL, "adjusted_loop_len");
9314 SSA_NAME_DEF_STMT (adjusted_len) = gimple_build_nop ();
9315 rgl->bias_adjusted_ctrl = adjusted_len;
9320 if (use_bias_adjusted_len)
9321 return rgl->bias_adjusted_ctrl;
9322 else
9323 return rgl->controls[index];
9326 /* Scale profiling counters by estimation for LOOP which is vectorized
9327 by factor VF. */
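 /* E.g. a loop body estimated to run 100 times that is vectorized by
 VF 4 should afterwards look like it runs roughly 25 times, with the
 exit probability scaled accordingly. */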
9329 static void
9330 scale_profile_for_vect_loop (class loop *loop, unsigned vf)
9332 edge preheader = loop_preheader_edge (loop);
9333 /* Reduce loop iterations by the vectorization factor. */
9334 gcov_type new_est_niter = niter_for_unrolled_loop (loop, vf);
9335 profile_count freq_h = loop->header->count, freq_e = preheader->count ();
9337 if (freq_h.nonzero_p ())
9339 profile_probability p;
 9341 /* Avoid dropping the loop body profile counter to 0 because of a zero
 9342 count in the loop's preheader. */
9343 if (!(freq_e == profile_count::zero ()))
9344 freq_e = freq_e.force_nonzero ();
9345 p = freq_e.apply_scale (new_est_niter + 1, 1).probability_in (freq_h);
9346 scale_loop_frequencies (loop, p);
9349 edge exit_e = single_exit (loop);
9350 exit_e->probability = profile_probability::always ()
9351 .apply_scale (1, new_est_niter + 1);
9353 edge exit_l = single_pred_edge (loop->latch);
9354 profile_probability prob = exit_l->probability;
9355 exit_l->probability = exit_e->probability.invert ();
9356 if (prob.initialized_p () && exit_l->probability.initialized_p ())
9357 scale_bbs_frequencies (&loop->latch, 1, exit_l->probability / prob);
9360 /* For a vectorized stmt DEF_STMT_INFO adjust all vectorized PHI
9361 latch edge values originally defined by it. */
9363 static void
9364 maybe_set_vectorized_backedge_value (loop_vec_info loop_vinfo,
9365 stmt_vec_info def_stmt_info)
9367 tree def = gimple_get_lhs (vect_orig_stmt (def_stmt_info)->stmt);
9368 if (!def || TREE_CODE (def) != SSA_NAME)
9369 return;
9370 stmt_vec_info phi_info;
9371 imm_use_iterator iter;
9372 use_operand_p use_p;
9373 FOR_EACH_IMM_USE_FAST (use_p, iter, def)
9374 if (gphi *phi = dyn_cast <gphi *> (USE_STMT (use_p)))
9375 if (gimple_bb (phi)->loop_father->header == gimple_bb (phi)
9376 && (phi_info = loop_vinfo->lookup_stmt (phi))
9377 && STMT_VINFO_RELEVANT_P (phi_info)
9378 && VECTORIZABLE_CYCLE_DEF (STMT_VINFO_DEF_TYPE (phi_info))
9379 && STMT_VINFO_REDUC_TYPE (phi_info) != FOLD_LEFT_REDUCTION
9380 && STMT_VINFO_REDUC_TYPE (phi_info) != EXTRACT_LAST_REDUCTION)
9382 loop_p loop = gimple_bb (phi)->loop_father;
9383 edge e = loop_latch_edge (loop);
9384 if (PHI_ARG_DEF_FROM_EDGE (phi, e) == def)
9386 vec<gimple *> &phi_defs = STMT_VINFO_VEC_STMTS (phi_info);
9387 vec<gimple *> &latch_defs = STMT_VINFO_VEC_STMTS (def_stmt_info);
9388 gcc_assert (phi_defs.length () == latch_defs.length ());
9389 for (unsigned i = 0; i < phi_defs.length (); ++i)
9390 add_phi_arg (as_a <gphi *> (phi_defs[i]),
9391 gimple_get_lhs (latch_defs[i]), e,
9392 gimple_phi_arg_location (phi, e->dest_idx));
9397 /* Vectorize STMT_INFO if relevant, inserting any new instructions before GSI.
9398 When vectorizing STMT_INFO as a store, set *SEEN_STORE to its
9399 stmt_vec_info. */
9401 static bool
9402 vect_transform_loop_stmt (loop_vec_info loop_vinfo, stmt_vec_info stmt_info,
9403 gimple_stmt_iterator *gsi, stmt_vec_info *seen_store)
9405 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
9406 poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
9408 if (dump_enabled_p ())
9409 dump_printf_loc (MSG_NOTE, vect_location,
9410 "------>vectorizing statement: %G", stmt_info->stmt);
9412 if (MAY_HAVE_DEBUG_BIND_STMTS && !STMT_VINFO_LIVE_P (stmt_info))
9413 vect_loop_kill_debug_uses (loop, stmt_info);
9415 if (!STMT_VINFO_RELEVANT_P (stmt_info)
9416 && !STMT_VINFO_LIVE_P (stmt_info))
9417 return false;
9419 if (STMT_VINFO_VECTYPE (stmt_info))
9421 poly_uint64 nunits
9422 = TYPE_VECTOR_SUBPARTS (STMT_VINFO_VECTYPE (stmt_info));
9423 if (!STMT_SLP_TYPE (stmt_info)
9424 && maybe_ne (nunits, vf)
9425 && dump_enabled_p ())
 9426 /* For SLP, VF is set according to the unrolling factor, and not
 9427 to the vector size, hence this note is not valid for SLP. */
9428 dump_printf_loc (MSG_NOTE, vect_location, "multiple-types.\n");
9431 /* Pure SLP statements have already been vectorized. We still need
9432 to apply loop vectorization to hybrid SLP statements. */
9433 if (PURE_SLP_STMT (stmt_info))
9434 return false;
9436 if (dump_enabled_p ())
9437 dump_printf_loc (MSG_NOTE, vect_location, "transform statement.\n");
9439 if (vect_transform_stmt (loop_vinfo, stmt_info, gsi, NULL, NULL))
9440 *seen_store = stmt_info;
9442 return true;
 9445 /* Helper function to pass to simplify_replace_tree to enable replacing trees
 9446 in the hash_map with their corresponding values. */
9448 static tree
9449 find_in_mapping (tree t, void *context)
9451 hash_map<tree,tree>* mapping = (hash_map<tree, tree>*) context;
9453 tree *value = mapping->get (t);
9454 return value ? *value : t;
9457 /* Update EPILOGUE's loop_vec_info. EPILOGUE was constructed as a copy of the
9458 original loop that has now been vectorized.
9460 The inits of the data_references need to be advanced with the number of
9461 iterations of the main loop. This has been computed in vect_do_peeling and
 9462 is stored in the parameter ADVANCE. We first restore the data_references'
 9463 initial offsets with the values recorded in ORIG_DRS_INIT.
9465 Since the loop_vec_info of this EPILOGUE was constructed for the original
9466 loop, its stmt_vec_infos all point to the original statements. These need
9467 to be updated to point to their corresponding copies as well as the SSA_NAMES
9468 in their PATTERN_DEF_SEQs and RELATED_STMTs.
 9470 The data_references' connections also need to be updated. Their
 9471 corresponding dr_vec_infos need to be reconnected to the EPILOGUE's
 9472 stmt_vec_infos, their statements need to point to their corresponding copy;
 9473 if they are gather loads or scatter stores then their reference needs to be
 9474 updated to point to its corresponding copy, and finally we set
9475 'base_misaligned' to false as we have already peeled for alignment in the
9476 prologue of the main loop. */
9478 static void
9479 update_epilogue_loop_vinfo (class loop *epilogue, tree advance)
9481 loop_vec_info epilogue_vinfo = loop_vec_info_for_loop (epilogue);
9482 auto_vec<gimple *> stmt_worklist;
9483 hash_map<tree,tree> mapping;
9484 gimple *orig_stmt, *new_stmt;
9485 gimple_stmt_iterator epilogue_gsi;
9486 gphi_iterator epilogue_phi_gsi;
9487 stmt_vec_info stmt_vinfo = NULL, related_vinfo;
9488 basic_block *epilogue_bbs = get_loop_body (epilogue);
9489 unsigned i;
9491 free (LOOP_VINFO_BBS (epilogue_vinfo));
9492 LOOP_VINFO_BBS (epilogue_vinfo) = epilogue_bbs;
9494 /* Advance data_reference's with the number of iterations of the previous
9495 loop and its prologue. */
9496 vect_update_inits_of_drs (epilogue_vinfo, advance, PLUS_EXPR);
9499 /* The EPILOGUE loop is a copy of the original loop so they share the same
9500 gimple UIDs. In this loop we update the loop_vec_info of the EPILOGUE to
 9501 point to the copied statements. We also create a mapping from each LHS in
 9502 the original loop to the corresponding LHS in the EPILOGUE and create
 9503 worklists to update the STMT_VINFO_PATTERN_DEF_SEQs and STMT_VINFO_RELATED_STMTs. */
9504 for (unsigned i = 0; i < epilogue->num_nodes; ++i)
9506 for (epilogue_phi_gsi = gsi_start_phis (epilogue_bbs[i]);
9507 !gsi_end_p (epilogue_phi_gsi); gsi_next (&epilogue_phi_gsi))
9509 new_stmt = epilogue_phi_gsi.phi ();
9511 gcc_assert (gimple_uid (new_stmt) > 0);
9512 stmt_vinfo
9513 = epilogue_vinfo->stmt_vec_infos[gimple_uid (new_stmt) - 1];
9515 orig_stmt = STMT_VINFO_STMT (stmt_vinfo);
9516 STMT_VINFO_STMT (stmt_vinfo) = new_stmt;
9518 mapping.put (gimple_phi_result (orig_stmt),
9519 gimple_phi_result (new_stmt));
 9520 /* PHI nodes cannot have patterns or related statements. */
9521 gcc_assert (STMT_VINFO_PATTERN_DEF_SEQ (stmt_vinfo) == NULL
9522 && STMT_VINFO_RELATED_STMT (stmt_vinfo) == NULL);
9525 for (epilogue_gsi = gsi_start_bb (epilogue_bbs[i]);
9526 !gsi_end_p (epilogue_gsi); gsi_next (&epilogue_gsi))
9528 new_stmt = gsi_stmt (epilogue_gsi);
9529 if (is_gimple_debug (new_stmt))
9530 continue;
9532 gcc_assert (gimple_uid (new_stmt) > 0);
9533 stmt_vinfo
9534 = epilogue_vinfo->stmt_vec_infos[gimple_uid (new_stmt) - 1];
9536 orig_stmt = STMT_VINFO_STMT (stmt_vinfo);
9537 STMT_VINFO_STMT (stmt_vinfo) = new_stmt;
9539 if (tree old_lhs = gimple_get_lhs (orig_stmt))
9540 mapping.put (old_lhs, gimple_get_lhs (new_stmt));
9542 if (STMT_VINFO_PATTERN_DEF_SEQ (stmt_vinfo))
9544 gimple_seq seq = STMT_VINFO_PATTERN_DEF_SEQ (stmt_vinfo);
9545 for (gimple_stmt_iterator gsi = gsi_start (seq);
9546 !gsi_end_p (gsi); gsi_next (&gsi))
9547 stmt_worklist.safe_push (gsi_stmt (gsi));
9550 related_vinfo = STMT_VINFO_RELATED_STMT (stmt_vinfo);
9551 if (related_vinfo != NULL && related_vinfo != stmt_vinfo)
9553 gimple *stmt = STMT_VINFO_STMT (related_vinfo);
9554 stmt_worklist.safe_push (stmt);
9555 /* Set BB such that the assert in
9556 'get_initial_def_for_reduction' is able to determine that
9557 the BB of the related stmt is inside this loop. */
9558 gimple_set_bb (stmt,
9559 gimple_bb (new_stmt));
9560 related_vinfo = STMT_VINFO_RELATED_STMT (related_vinfo);
9561 gcc_assert (related_vinfo == NULL
9562 || related_vinfo == stmt_vinfo);
9567 /* The PATTERN_DEF_SEQs and RELATED_STMTs in the epilogue were constructed
9568 using the original main loop and thus need to be updated to refer to the
9569 cloned variables used in the epilogue. */
9570 for (unsigned i = 0; i < stmt_worklist.length (); ++i)
9572 gimple *stmt = stmt_worklist[i];
9573 tree *new_op;
9575 for (unsigned j = 1; j < gimple_num_ops (stmt); ++j)
9577 tree op = gimple_op (stmt, j);
9578 if ((new_op = mapping.get(op)))
9579 gimple_set_op (stmt, j, *new_op);
9580 else
9582 /* PR92429: The last argument of simplify_replace_tree disables
9583 folding when replacing arguments. This is required as
9584 otherwise you might end up with different statements than the
9585 ones analyzed in vect_loop_analyze, leading to different
9586 vectorization. */
9587 op = simplify_replace_tree (op, NULL_TREE, NULL_TREE,
9588 &find_in_mapping, &mapping, false);
9589 gimple_set_op (stmt, j, op);
9594 struct data_reference *dr;
9595 vec<data_reference_p> datarefs = LOOP_VINFO_DATAREFS (epilogue_vinfo);
9596 FOR_EACH_VEC_ELT (datarefs, i, dr)
9598 orig_stmt = DR_STMT (dr);
9599 gcc_assert (gimple_uid (orig_stmt) > 0);
9600 stmt_vinfo = epilogue_vinfo->stmt_vec_infos[gimple_uid (orig_stmt) - 1];
9601 /* Data references for gather loads and scatter stores do not use the
 9602 updated offset we set using ADVANCE. Instead we have to make sure the
 9603 references in the data references point to the corresponding copies of
 9604 the originals in the epilogue. */
9605 if (STMT_VINFO_MEMORY_ACCESS_TYPE (vect_stmt_to_vectorize (stmt_vinfo))
9606 == VMAT_GATHER_SCATTER)
9608 DR_REF (dr)
9609 = simplify_replace_tree (DR_REF (dr), NULL_TREE, NULL_TREE,
9610 &find_in_mapping, &mapping);
9611 DR_BASE_ADDRESS (dr)
9612 = simplify_replace_tree (DR_BASE_ADDRESS (dr), NULL_TREE, NULL_TREE,
9613 &find_in_mapping, &mapping);
9615 DR_STMT (dr) = STMT_VINFO_STMT (stmt_vinfo);
9616 stmt_vinfo->dr_aux.stmt = stmt_vinfo;
 9617 /* The vector size of the epilogue is smaller than that of the main loop,
 9618 so the alignment requirement is either the same or lower. This means
 9619 the dr will by definition be aligned. */
9620 STMT_VINFO_DR_INFO (stmt_vinfo)->base_misaligned = false;
9623 epilogue_vinfo->shared->datarefs_copy.release ();
9624 epilogue_vinfo->shared->save_datarefs ();
9627 /* Function vect_transform_loop.
9629 The analysis phase has determined that the loop is vectorizable.
 9630 Vectorize the loop - create vectorized stmts to replace the scalar
 9631 stmts in the loop, and update the loop exit condition.
 9632 Returns the scalar epilogue loop if any. */
9634 class loop *
9635 vect_transform_loop (loop_vec_info loop_vinfo, gimple *loop_vectorized_call)
9637 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
9638 class loop *epilogue = NULL;
9639 basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
9640 int nbbs = loop->num_nodes;
9641 int i;
9642 tree niters_vector = NULL_TREE;
9643 tree step_vector = NULL_TREE;
9644 tree niters_vector_mult_vf = NULL_TREE;
9645 poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
9646 unsigned int lowest_vf = constant_lower_bound (vf);
9647 gimple *stmt;
9648 bool check_profitability = false;
9649 unsigned int th;
9651 DUMP_VECT_SCOPE ("vec_transform_loop");
9653 loop_vinfo->shared->check_datarefs ();
 9655 /* Use the more conservative vectorization threshold. If the number
 9656 of iterations is constant, assume the cost check has been performed
 9657 by our caller. If the threshold makes all loops profitable that
 9658 run at least the (estimated) vectorization factor number of times,
 9659 checking is pointless, too. */
9660 th = LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo);
9661 if (vect_apply_runtime_profitability_check_p (loop_vinfo))
9663 if (dump_enabled_p ())
9664 dump_printf_loc (MSG_NOTE, vect_location,
9665 "Profitability threshold is %d loop iterations.\n",
9666 th);
9667 check_profitability = true;
9670 /* Make sure there exists a single-predecessor exit bb. Do this before
9671 versioning. */
9672 edge e = single_exit (loop);
9673 if (! single_pred_p (e->dest))
9675 split_loop_exit_edge (e, true);
9676 if (dump_enabled_p ())
9677 dump_printf (MSG_NOTE, "split exit edge\n");
9680 /* Version the loop first, if required, so the profitability check
9681 comes first. */
9683 if (LOOP_REQUIRES_VERSIONING (loop_vinfo))
9685 class loop *sloop
9686 = vect_loop_versioning (loop_vinfo, loop_vectorized_call);
9687 sloop->force_vectorize = false;
9688 check_profitability = false;
 9691 /* Make sure there exists a single-predecessor exit bb also on the
 9692 scalar loop copy. Do this after versioning but before peeling
 9693 so the CFG structure is fine for both the scalar and the if-converted
 9694 loop and so that slpeel_duplicate_current_defs_from_edges faces matched
 9695 loop closed PHI nodes on the exit. */
9696 if (LOOP_VINFO_SCALAR_LOOP (loop_vinfo))
9698 e = single_exit (LOOP_VINFO_SCALAR_LOOP (loop_vinfo));
9699 if (! single_pred_p (e->dest))
9701 split_loop_exit_edge (e, true);
9702 if (dump_enabled_p ())
9703 dump_printf (MSG_NOTE, "split exit edge of scalar loop\n");
9707 tree niters = vect_build_loop_niters (loop_vinfo);
9708 LOOP_VINFO_NITERS_UNCHANGED (loop_vinfo) = niters;
9709 tree nitersm1 = unshare_expr (LOOP_VINFO_NITERSM1 (loop_vinfo));
9710 bool niters_no_overflow = loop_niters_no_overflow (loop_vinfo);
9711 tree advance;
9712 drs_init_vec orig_drs_init;
9714 epilogue = vect_do_peeling (loop_vinfo, niters, nitersm1, &niters_vector,
9715 &step_vector, &niters_vector_mult_vf, th,
9716 check_profitability, niters_no_overflow,
9717 &advance);
9719 if (LOOP_VINFO_SCALAR_LOOP (loop_vinfo)
9720 && LOOP_VINFO_SCALAR_LOOP_SCALING (loop_vinfo).initialized_p ())
9721 scale_loop_frequencies (LOOP_VINFO_SCALAR_LOOP (loop_vinfo),
9722 LOOP_VINFO_SCALAR_LOOP_SCALING (loop_vinfo));
9724 if (niters_vector == NULL_TREE)
9726 if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
9727 && !LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo)
9728 && known_eq (lowest_vf, vf))
9730 niters_vector
9731 = build_int_cst (TREE_TYPE (LOOP_VINFO_NITERS (loop_vinfo)),
9732 LOOP_VINFO_INT_NITERS (loop_vinfo) / lowest_vf);
9733 step_vector = build_one_cst (TREE_TYPE (niters));
9735 else if (vect_use_loop_mask_for_alignment_p (loop_vinfo))
9736 vect_gen_vector_loop_niters (loop_vinfo, niters, &niters_vector,
9737 &step_vector, niters_no_overflow);
9738 else
9739 /* vect_do_peeling subtracted the number of peeled prologue
9740 iterations from LOOP_VINFO_NITERS. */
9741 vect_gen_vector_loop_niters (loop_vinfo, LOOP_VINFO_NITERS (loop_vinfo),
9742 &niters_vector, &step_vector,
9743 niters_no_overflow);
9746 /* 1) Make sure the loop header has exactly two entries
9747 2) Make sure we have a preheader basic block. */
9749 gcc_assert (EDGE_COUNT (loop->header->preds) == 2);
9751 split_edge (loop_preheader_edge (loop));
9753 if (vect_use_loop_mask_for_alignment_p (loop_vinfo))
9754 /* This will deal with any possible peeling. */
9755 vect_prepare_for_masked_peels (loop_vinfo);
9757 /* Schedule the SLP instances first, then handle loop vectorization
9758 below. */
9759 if (!loop_vinfo->slp_instances.is_empty ())
9761 DUMP_VECT_SCOPE ("scheduling SLP instances");
9762 vect_schedule_slp (loop_vinfo, LOOP_VINFO_SLP_INSTANCES (loop_vinfo));
 9765 /* FORNOW: the vectorizer supports only loops whose body consists
 9766 of one basic block (header + empty latch). When the vectorizer
 9767 supports more involved loop forms, the order in which the BBs are
 9768 traversed needs to be reconsidered. */
9770 for (i = 0; i < nbbs; i++)
9772 basic_block bb = bbs[i];
9773 stmt_vec_info stmt_info;
9775 for (gphi_iterator si = gsi_start_phis (bb); !gsi_end_p (si);
9776 gsi_next (&si))
9778 gphi *phi = si.phi ();
9779 if (dump_enabled_p ())
9780 dump_printf_loc (MSG_NOTE, vect_location,
9781 "------>vectorizing phi: %G", phi);
9782 stmt_info = loop_vinfo->lookup_stmt (phi);
9783 if (!stmt_info)
9784 continue;
9786 if (MAY_HAVE_DEBUG_BIND_STMTS && !STMT_VINFO_LIVE_P (stmt_info))
9787 vect_loop_kill_debug_uses (loop, stmt_info);
9789 if (!STMT_VINFO_RELEVANT_P (stmt_info)
9790 && !STMT_VINFO_LIVE_P (stmt_info))
9791 continue;
9793 if (STMT_VINFO_VECTYPE (stmt_info)
9794 && (maybe_ne
9795 (TYPE_VECTOR_SUBPARTS (STMT_VINFO_VECTYPE (stmt_info)), vf))
9796 && dump_enabled_p ())
9797 dump_printf_loc (MSG_NOTE, vect_location, "multiple-types.\n");
9799 if ((STMT_VINFO_DEF_TYPE (stmt_info) == vect_induction_def
9800 || STMT_VINFO_DEF_TYPE (stmt_info) == vect_reduction_def
9801 || STMT_VINFO_DEF_TYPE (stmt_info) == vect_double_reduction_def
9802 || STMT_VINFO_DEF_TYPE (stmt_info) == vect_nested_cycle
9803 || STMT_VINFO_DEF_TYPE (stmt_info) == vect_internal_def)
9804 && ! PURE_SLP_STMT (stmt_info))
9806 if (dump_enabled_p ())
9807 dump_printf_loc (MSG_NOTE, vect_location, "transform phi.\n");
9808 vect_transform_stmt (loop_vinfo, stmt_info, NULL, NULL, NULL);
9812 for (gphi_iterator si = gsi_start_phis (bb); !gsi_end_p (si);
9813 gsi_next (&si))
9815 gphi *phi = si.phi ();
9816 stmt_info = loop_vinfo->lookup_stmt (phi);
9817 if (!stmt_info)
9818 continue;
9820 if (!STMT_VINFO_RELEVANT_P (stmt_info)
9821 && !STMT_VINFO_LIVE_P (stmt_info))
9822 continue;
9824 if ((STMT_VINFO_DEF_TYPE (stmt_info) == vect_induction_def
9825 || STMT_VINFO_DEF_TYPE (stmt_info) == vect_reduction_def
9826 || STMT_VINFO_DEF_TYPE (stmt_info) == vect_double_reduction_def
9827 || STMT_VINFO_DEF_TYPE (stmt_info) == vect_nested_cycle
9828 || STMT_VINFO_DEF_TYPE (stmt_info) == vect_internal_def)
9829 && ! PURE_SLP_STMT (stmt_info))
9830 maybe_set_vectorized_backedge_value (loop_vinfo, stmt_info);
9833 for (gimple_stmt_iterator si = gsi_start_bb (bb);
9834 !gsi_end_p (si);)
9836 stmt = gsi_stmt (si);
9837 /* During vectorization remove existing clobber stmts. */
9838 if (gimple_clobber_p (stmt))
9840 unlink_stmt_vdef (stmt);
9841 gsi_remove (&si, true);
9842 release_defs (stmt);
9844 else
9846 /* Ignore vector stmts created in the outer loop. */
9847 stmt_info = loop_vinfo->lookup_stmt (stmt);
9849 /* vector stmts created in the outer-loop during vectorization of
9850 stmts in an inner-loop may not have a stmt_info, and do not
9851 need to be vectorized. */
9852 stmt_vec_info seen_store = NULL;
9853 if (stmt_info)
9855 if (STMT_VINFO_IN_PATTERN_P (stmt_info))
9857 gimple *def_seq = STMT_VINFO_PATTERN_DEF_SEQ (stmt_info);
9858 for (gimple_stmt_iterator subsi = gsi_start (def_seq);
9859 !gsi_end_p (subsi); gsi_next (&subsi))
9861 stmt_vec_info pat_stmt_info
9862 = loop_vinfo->lookup_stmt (gsi_stmt (subsi));
9863 vect_transform_loop_stmt (loop_vinfo, pat_stmt_info,
9864 &si, &seen_store);
9866 stmt_vec_info pat_stmt_info
9867 = STMT_VINFO_RELATED_STMT (stmt_info);
9868 if (vect_transform_loop_stmt (loop_vinfo, pat_stmt_info,
9869 &si, &seen_store))
9870 maybe_set_vectorized_backedge_value (loop_vinfo,
9871 pat_stmt_info);
9873 else
9875 if (vect_transform_loop_stmt (loop_vinfo, stmt_info, &si,
9876 &seen_store))
9877 maybe_set_vectorized_backedge_value (loop_vinfo,
9878 stmt_info);
9881 gsi_next (&si);
9882 if (seen_store)
9884 if (STMT_VINFO_GROUPED_ACCESS (seen_store))
 9885 /* Interleaving. The vectorization of the
 9886 interleaving chain was completed - free
 9887 all the stores in the chain. */
9888 vect_remove_stores (loop_vinfo,
9889 DR_GROUP_FIRST_ELEMENT (seen_store));
9890 else
9891 /* Free the attached stmt_vec_info and remove the stmt. */
9892 loop_vinfo->remove_stmt (stmt_info);
9897 /* Stub out scalar statements that must not survive vectorization.
9898 Doing this here helps with grouped statements, or statements that
9899 are involved in patterns. */
9900 for (gimple_stmt_iterator gsi = gsi_start_bb (bb);
9901 !gsi_end_p (gsi); gsi_next (&gsi))
9903 gcall *call = dyn_cast <gcall *> (gsi_stmt (gsi));
9904 if (!call || !gimple_call_internal_p (call))
9905 continue;
9906 internal_fn ifn = gimple_call_internal_fn (call);
9907 if (ifn == IFN_MASK_LOAD)
9909 tree lhs = gimple_get_lhs (call);
9910 if (!VECTOR_TYPE_P (TREE_TYPE (lhs)))
9912 tree zero = build_zero_cst (TREE_TYPE (lhs));
9913 gimple *new_stmt = gimple_build_assign (lhs, zero);
9914 gsi_replace (&gsi, new_stmt, true);
9917 else if (conditional_internal_fn_code (ifn) != ERROR_MARK)
9919 tree lhs = gimple_get_lhs (call);
9920 if (!VECTOR_TYPE_P (TREE_TYPE (lhs)))
9922 tree else_arg
9923 = gimple_call_arg (call, gimple_call_num_args (call) - 1);
9924 gimple *new_stmt = gimple_build_assign (lhs, else_arg);
9925 gsi_replace (&gsi, new_stmt, true);
9929 } /* BBs in loop */
 9931 /* The vectorization factor is always > 1, so if we use an IV increment of 1,
 9932 a zero NITERS becomes a nonzero NITERS_VECTOR. */
9933 if (integer_onep (step_vector))
9934 niters_no_overflow = true;
9935 vect_set_loop_condition (loop, loop_vinfo, niters_vector, step_vector,
9936 niters_vector_mult_vf, !niters_no_overflow);
9938 unsigned int assumed_vf = vect_vf_for_cost (loop_vinfo);
9939 scale_profile_for_vect_loop (loop, assumed_vf);
9941 /* True if the final iteration might not handle a full vector's
9942 worth of scalar iterations. */
9943 bool final_iter_may_be_partial
9944 = LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo);
9945 /* The minimum number of iterations performed by the epilogue. This
9946 is 1 when peeling for gaps because we always need a final scalar
9947 iteration. */
9948 int min_epilogue_iters = LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo) ? 1 : 0;
9949 /* +1 to convert latch counts to loop iteration counts,
9950 -min_epilogue_iters to remove iterations that cannot be performed
9951 by the vector code. */
9952 int bias_for_lowest = 1 - min_epilogue_iters;
9953 int bias_for_assumed = bias_for_lowest;
9954 int alignment_npeels = LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo);
9955 if (alignment_npeels && LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))
9957 /* When the amount of peeling is known at compile time, the first
9958 iteration will have exactly alignment_npeels active elements.
9959 In the worst case it will have at least one. */
9960 int min_first_active = (alignment_npeels > 0 ? alignment_npeels : 1);
9961 bias_for_lowest += lowest_vf - min_first_active;
9962 bias_for_assumed += assumed_vf - min_first_active;
9964 /* In these calculations the "- 1" converts loop iteration counts
9965 back to latch counts. */
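 /* For example, an upper bound of 102 latch iterations (103 loop
 iterations) with lowest_vf 4, no peeling for gaps and no partial
 vectors becomes (102 + 1) / 4 - 1 = 24 latch iterations of the
 vector loop. */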
9966 if (loop->any_upper_bound)
9968 loop_vec_info main_vinfo = LOOP_VINFO_ORIG_LOOP_INFO (loop_vinfo);
9969 loop->nb_iterations_upper_bound
9970 = (final_iter_may_be_partial
9971 ? wi::udiv_ceil (loop->nb_iterations_upper_bound + bias_for_lowest,
9972 lowest_vf) - 1
9973 : wi::udiv_floor (loop->nb_iterations_upper_bound + bias_for_lowest,
9974 lowest_vf) - 1);
9975 if (main_vinfo
9976 /* Both peeling for alignment and peeling for gaps can end up
9977 with the scalar epilogue running for more than VF-1 iterations. */
9978 && !main_vinfo->peeling_for_alignment
9979 && !main_vinfo->peeling_for_gaps)
9981 unsigned int bound;
9982 poly_uint64 main_iters
9983 = upper_bound (LOOP_VINFO_VECT_FACTOR (main_vinfo),
9984 LOOP_VINFO_COST_MODEL_THRESHOLD (main_vinfo));
9985 main_iters
9986 = upper_bound (main_iters,
9987 LOOP_VINFO_VERSIONING_THRESHOLD (main_vinfo));
9988 if (can_div_away_from_zero_p (main_iters,
9989 LOOP_VINFO_VECT_FACTOR (loop_vinfo),
9990 &bound))
9991 loop->nb_iterations_upper_bound
9992 = wi::umin ((widest_int) (bound - 1),
9993 loop->nb_iterations_upper_bound);
9996 if (loop->any_likely_upper_bound)
9997 loop->nb_iterations_likely_upper_bound
9998 = (final_iter_may_be_partial
9999 ? wi::udiv_ceil (loop->nb_iterations_likely_upper_bound
10000 + bias_for_lowest, lowest_vf) - 1
10001 : wi::udiv_floor (loop->nb_iterations_likely_upper_bound
10002 + bias_for_lowest, lowest_vf) - 1);
10003 if (loop->any_estimate)
10004 loop->nb_iterations_estimate
10005 = (final_iter_may_be_partial
10006 ? wi::udiv_ceil (loop->nb_iterations_estimate + bias_for_assumed,
10007 assumed_vf) - 1
10008 : wi::udiv_floor (loop->nb_iterations_estimate + bias_for_assumed,
10009 assumed_vf) - 1);
10011 if (dump_enabled_p ())
10013 if (!LOOP_VINFO_EPILOGUE_P (loop_vinfo))
10015 dump_printf_loc (MSG_NOTE, vect_location,
10016 "LOOP VECTORIZED\n");
10017 if (loop->inner)
10018 dump_printf_loc (MSG_NOTE, vect_location,
10019 "OUTER LOOP VECTORIZED\n");
10020 dump_printf (MSG_NOTE, "\n");
10021 }
10022 else
10023 dump_printf_loc (MSG_NOTE, vect_location,
10024 "LOOP EPILOGUE VECTORIZED (MODE=%s)\n",
10025 GET_MODE_NAME (loop_vinfo->vector_mode));
10026 }
10028 /* Loops vectorized with a variable factor won't benefit from
10029 unrolling/peeling. */
10030 if (!vf.is_constant ())
10031 {
10032 loop->unroll = 1;
10033 if (dump_enabled_p ())
10034 dump_printf_loc (MSG_NOTE, vect_location, "Disabling unrolling due to"
10035 " variable-length vectorization factor\n");
10037 /* Free SLP instances here because otherwise stmt reference counting
10038 won't work. */
10039 slp_instance instance;
10040 FOR_EACH_VEC_ELT (LOOP_VINFO_SLP_INSTANCES (loop_vinfo), i, instance)
10041 vect_free_slp_instance (instance);
10042 LOOP_VINFO_SLP_INSTANCES (loop_vinfo).release ();
10043 /* Clear the safelen field, since its value is invalid after vectorization:
10044 the vectorized loop can now have loop-carried dependencies. */
10045 loop->safelen = 0;
10047 if (epilogue)
10048 {
10049 update_epilogue_loop_vinfo (epilogue, advance);
10051 epilogue->simduid = loop->simduid;
10052 epilogue->force_vectorize = loop->force_vectorize;
10053 epilogue->dont_vectorize = false;
10054 }
10056 return epilogue;
10057 }
10059 /* The code below performs a simple optimization: it reverts
10060 if-conversion for masked stores, i.e. if the mask of a store is
10061 all-zero, the store is not executed, and neither are the producers
10062 of the stored values, where possible.  For example, for
10063 for (i=0; i<n; i++)
10064 if (c[i])
10065 {
10066 p1[i] += 1;
10067 p2[i] = p3[i] + 2;
10068 }
10069 this transformation will produce the following semi-hammock:
10071 if (mask__ifc__42.18_165 != { 0, 0, 0, 0, 0, 0, 0, 0 })
10072 {
10073 vect__11.19_170 = MASK_LOAD (vectp_p1.20_168, 0B, mask__ifc__42.18_165);
10074 vect__12.22_172 = vect__11.19_170 + vect_cst__171;
10075 MASK_STORE (vectp_p1.23_175, 0B, mask__ifc__42.18_165, vect__12.22_172);
10076 vect__18.25_182 = MASK_LOAD (vectp_p3.26_180, 0B, mask__ifc__42.18_165);
10077 vect__19.28_184 = vect__18.25_182 + vect_cst__183;
10078 MASK_STORE (vectp_p2.29_187, 0B, mask__ifc__42.18_165, vect__19.28_184);
10079 }
10080 */
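/* As a sketch only: for each group of masked stores, the code below
   splits the block after the last store, adds a new STORE_BB guarded by
   an all-zero-mask test, and sinks the stores (plus, where possible, the
   producers of the stored values) into it:

       bb:  if (mask == { 0, ... })  -- true --> join_bb
                      |  false
                      v
       store_bb:  masked stores and sunk producers  --> join_bb  */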
10082 void
10083 optimize_mask_stores (class loop *loop)
10084 {
10085 basic_block *bbs = get_loop_body (loop);
10086 unsigned nbbs = loop->num_nodes;
10087 unsigned i;
10088 basic_block bb;
10089 class loop *bb_loop;
10090 gimple_stmt_iterator gsi;
10091 gimple *stmt;
10092 auto_vec<gimple *> worklist;
10093 auto_purge_vect_location sentinel;
10095 vect_location = find_loop_location (loop);
10096 /* Pick up all masked stores in loop if any. */
10097 for (i = 0; i < nbbs; i++)
10098 {
10099 bb = bbs[i];
10100 for (gsi = gsi_start_bb (bb); !gsi_end_p (gsi);
10101 gsi_next (&gsi))
10102 {
10103 stmt = gsi_stmt (gsi);
10104 if (gimple_call_internal_p (stmt, IFN_MASK_STORE))
10105 worklist.safe_push (stmt);
10106 }
10107 }
10109 free (bbs);
10110 if (worklist.is_empty ())
10111 return;
10113 /* Loop has masked stores. */
10114 while (!worklist.is_empty ())
10115 {
10116 gimple *last, *last_store;
10117 edge e, efalse;
10118 tree mask;
10119 basic_block store_bb, join_bb;
10120 gimple_stmt_iterator gsi_to;
10121 tree vdef, new_vdef;
10122 gphi *phi;
10123 tree vectype;
10124 tree zero;
10126 last = worklist.pop ();
10127 mask = gimple_call_arg (last, 2);
10128 bb = gimple_bb (last);
10129 /* Create STORE_BB and the if-then structure in the CFG; STORE_BB belongs
10130 to the same loop as BB.  It can differ from LOOP when a two-level
10131 loop nest is vectorized and the mask_store belongs to the inner
10132 loop. */
10133 e = split_block (bb, last);
10134 bb_loop = bb->loop_father;
10135 gcc_assert (loop == bb_loop || flow_loop_nested_p (loop, bb_loop));
10136 join_bb = e->dest;
10137 store_bb = create_empty_bb (bb);
10138 add_bb_to_loop (store_bb, bb_loop);
10139 e->flags = EDGE_TRUE_VALUE;
10140 efalse = make_edge (bb, store_bb, EDGE_FALSE_VALUE);
10141 /* Put STORE_BB on the likely path: the mask is expected to be non-zero
on most iterations. */
10142 efalse->probability = profile_probability::likely ();
e->probability = efalse->probability.invert ();
10143 store_bb->count = efalse->count ();
10144 make_single_succ_edge (store_bb, join_bb, EDGE_FALLTHRU);
10145 if (dom_info_available_p (CDI_DOMINATORS))
10146 set_immediate_dominator (CDI_DOMINATORS, store_bb, bb);
10147 if (dump_enabled_p ())
10148 dump_printf_loc (MSG_NOTE, vect_location,
10149 "Create new block %d to sink mask stores.",
10150 store_bb->index);
10151 /* Create vector comparison with boolean result. */
10152 vectype = TREE_TYPE (mask);
10153 zero = build_zero_cst (vectype);
10154 stmt = gimple_build_cond (EQ_EXPR, mask, zero, NULL_TREE, NULL_TREE);
10155 gsi = gsi_last_bb (bb);
10156 gsi_insert_after (&gsi, stmt, GSI_SAME_STMT);
10157 /* Create new PHI node for vdef of the last masked store:
10158 .MEM_2 = VDEF <.MEM_1>
10159 will be converted to
10160 .MEM_3 = VDEF <.MEM_1>
10161 and new PHI node will be created in join bb
10162 .MEM_2 = PHI <.MEM_1, .MEM_3>
10163 */
10164 vdef = gimple_vdef (last);
10165 new_vdef = make_ssa_name (gimple_vop (cfun), last);
10166 gimple_set_vdef (last, new_vdef);
10167 phi = create_phi_node (vdef, join_bb);
10168 add_phi_arg (phi, new_vdef, EDGE_SUCC (store_bb, 0), UNKNOWN_LOCATION);
10170 /* Put all masked stores with the same mask to STORE_BB if possible. */
10171 while (true)
10172 {
10173 gimple_stmt_iterator gsi_from;
10174 gimple *stmt1 = NULL;
10176 /* Move masked store to STORE_BB. */
10177 last_store = last;
10178 gsi = gsi_for_stmt (last);
10179 gsi_from = gsi;
10180 /* Shift GSI to the previous stmt for further traversal. */
10181 gsi_prev (&gsi);
10182 gsi_to = gsi_start_bb (store_bb);
10183 gsi_move_before (&gsi_from, &gsi_to);
10184 /* Set up GSI_TO at the start of the now non-empty STORE_BB. */
10185 gsi_to = gsi_start_bb (store_bb);
10186 if (dump_enabled_p ())
10187 dump_printf_loc (MSG_NOTE, vect_location,
10188 "Move stmt to created bb\n%G", last);
10189 /* Move all stored value producers if possible. */
10190 while (!gsi_end_p (gsi))
10191 {
10192 tree lhs;
10193 imm_use_iterator imm_iter;
10194 use_operand_p use_p;
10195 bool res;
10197 /* Skip debug statements. */
10198 if (is_gimple_debug (gsi_stmt (gsi)))
10199 {
10200 gsi_prev (&gsi);
10201 continue;
10202 }
10203 stmt1 = gsi_stmt (gsi);
10204 /* Do not consider statements that write to memory or have
10205 a volatile operand. */
10206 if (gimple_vdef (stmt1)
10207 || gimple_has_volatile_ops (stmt1))
10208 break;
10209 gsi_from = gsi;
10210 gsi_prev (&gsi);
10211 lhs = gimple_get_lhs (stmt1);
10212 if (!lhs)
10213 break;
10215 /* LHS of vectorized stmt must be SSA_NAME. */
10216 if (TREE_CODE (lhs) != SSA_NAME)
10217 break;
10219 if (!VECTOR_TYPE_P (TREE_TYPE (lhs)))
10220 {
10221 /* Remove a dead scalar statement. */
10222 if (has_zero_uses (lhs))
10223 {
10224 gsi_remove (&gsi_from, true);
10225 continue;
10226 }
10227 break;
10228 }
10229 /* Check that LHS does not have uses outside of STORE_BB. */
10230 res = true;
10231 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, lhs)
10232 {
10233 gimple *use_stmt;
10234 use_stmt = USE_STMT (use_p);
10235 if (is_gimple_debug (use_stmt))
10236 continue;
10237 if (gimple_bb (use_stmt) != store_bb)
10238 {
10239 res = false;
10240 break;
10241 }
10242 }
10243 if (!res)
10244 break;
10246 if (gimple_vuse (stmt1)
10247 && gimple_vuse (stmt1) != gimple_vuse (last_store))
10248 break;
10250 /* Can move STMT1 to STORE_BB. */
10251 if (dump_enabled_p ())
10252 dump_printf_loc (MSG_NOTE, vect_location,
10253 "Move stmt to created bb\n%G", stmt1);
10254 gsi_move_before (&gsi_from, &gsi_to);
10255 /* Shift GSI_TO for further insertion. */
10256 gsi_prev (&gsi_to);
10257 }
10258 /* Put other masked stores with the same mask to STORE_BB. */
10259 if (worklist.is_empty ()
10260 || gimple_call_arg (worklist.last (), 2) != mask
10261 || worklist.last () != stmt1)
10262 break;
10263 last = worklist.pop ();
10264 }
10265 add_phi_arg (phi, gimple_vuse (last_store), e, UNKNOWN_LOCATION);
10266 }
10267 }
10269 /* Decide whether it is possible to use a zero-based induction variable
10270 when vectorizing LOOP_VINFO with partial vectors. If it is, return
10271 the value that the induction variable must be able to hold in order
10272 to ensure that the rgroups eventually have no active vector elements.
10273 Return -1 otherwise. */
10275 widest_int
10276 vect_iv_limit_for_partial_vectors (loop_vec_info loop_vinfo)
10277 {
10278 tree niters_skip = LOOP_VINFO_MASK_SKIP_NITERS (loop_vinfo);
10279 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
10280 unsigned HOST_WIDE_INT max_vf = vect_max_vf (loop_vinfo);
10282 /* Calculate the value that the induction variable must be able
10283 to hit in order to ensure that we end the loop with an all-false mask.
10284 This involves adding the maximum number of inactive trailing scalar
10285 iterations. */
10286 widest_int iv_limit = -1;
10287 if (max_loop_iterations (loop, &iv_limit))
10288 {
10289 if (niters_skip)
10290 {
10291 /* Add the maximum number of skipped iterations to the
10292 maximum iteration count. */
10293 if (TREE_CODE (niters_skip) == INTEGER_CST)
10294 iv_limit += wi::to_widest (niters_skip);
10295 else
10296 iv_limit += max_vf - 1;
10297 }
10298 else if (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo))
10299 /* Make a conservatively-correct assumption. */
10300 iv_limit += max_vf - 1;
10302 /* IV_LIMIT is the maximum number of latch iterations, which is also
10303 the maximum in-range IV value. Round this value down to the previous
10304 vector alignment boundary and then add an extra full iteration. */
10305 poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
10306 iv_limit = (iv_limit & -(int) known_alignment (vf)) + max_vf;
10307 }
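/* Illustrative numbers only: with a VF of 4 (known_alignment (vf) == 4),
   max_vf == 4 and a maximum latch count of 9, the computation above
   yields (9 & -4) + 4 == 12, so the IV must be able to hold 12.  */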
10308 return iv_limit;
10309 }
10311 /* For the given rgroup_controls RGC, check whether an induction variable
10312 would ever hit a value that produces a set of all-false masks or zero
10313 lengths before wrapping around.  Return true if it is possible to wrap
10314 around before hitting the desired value, otherwise return false. */
10316 bool
10317 vect_rgroup_iv_might_wrap_p (loop_vec_info loop_vinfo, rgroup_controls *rgc)
10318 {
10319 widest_int iv_limit = vect_iv_limit_for_partial_vectors (loop_vinfo);
10321 if (iv_limit == -1)
10322 return true;
10324 tree compare_type = LOOP_VINFO_RGROUP_COMPARE_TYPE (loop_vinfo);
10325 unsigned int compare_precision = TYPE_PRECISION (compare_type);
10326 unsigned nitems = rgc->max_nscalars_per_iter * rgc->factor;
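/* A sketch of the check below, with made-up numbers: if iv_limit is 12,
   rgc->max_nscalars_per_iter is 2 and rgc->factor is 1, then NITEMS is 2
   and iv_limit * nitems is 24, which needs only 5 bits; any compare type
   with at least 5 bits of precision therefore cannot wrap before the
   all-false / zero-length value is reached.  */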
10328 if (wi::min_precision (iv_limit * nitems, UNSIGNED) > compare_precision)
10329 return true;
10331 return false;