gcc/tree-vect-loop.c
1 /* Loop Vectorization
2 Copyright (C) 2003-2021 Free Software Foundation, Inc.
3 Contributed by Dorit Naishlos <dorit@il.ibm.com> and
4 Ira Rosen <irar@il.ibm.com>
6 This file is part of GCC.
8 GCC is free software; you can redistribute it and/or modify it under
9 the terms of the GNU General Public License as published by the Free
10 Software Foundation; either version 3, or (at your option) any later
11 version.
13 GCC is distributed in the hope that it will be useful, but WITHOUT ANY
14 WARRANTY; without even the implied warranty of MERCHANTABILITY or
15 FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
16 for more details.
18 You should have received a copy of the GNU General Public License
19 along with GCC; see the file COPYING3. If not see
20 <http://www.gnu.org/licenses/>. */
22 #define INCLUDE_ALGORITHM
23 #include "config.h"
24 #include "system.h"
25 #include "coretypes.h"
26 #include "backend.h"
27 #include "target.h"
28 #include "rtl.h"
29 #include "tree.h"
30 #include "gimple.h"
31 #include "cfghooks.h"
32 #include "tree-pass.h"
33 #include "ssa.h"
34 #include "optabs-tree.h"
35 #include "diagnostic-core.h"
36 #include "fold-const.h"
37 #include "stor-layout.h"
38 #include "cfganal.h"
39 #include "gimplify.h"
40 #include "gimple-iterator.h"
41 #include "gimplify-me.h"
42 #include "tree-ssa-loop-ivopts.h"
43 #include "tree-ssa-loop-manip.h"
44 #include "tree-ssa-loop-niter.h"
45 #include "tree-ssa-loop.h"
46 #include "cfgloop.h"
47 #include "tree-scalar-evolution.h"
48 #include "tree-vectorizer.h"
49 #include "gimple-fold.h"
50 #include "cgraph.h"
51 #include "tree-cfg.h"
52 #include "tree-if-conv.h"
53 #include "internal-fn.h"
54 #include "tree-vector-builder.h"
55 #include "vec-perm-indices.h"
56 #include "tree-eh.h"
58 /* Loop Vectorization Pass.
60 This pass tries to vectorize loops.
62 For example, the vectorizer transforms the following simple loop:
64 short a[N]; short b[N]; short c[N]; int i;
66 for (i=0; i<N; i++){
67 a[i] = b[i] + c[i];
70 as if it was manually vectorized by rewriting the source code into:
72 typedef int __attribute__((mode(V8HI))) v8hi;
73 short a[N]; short b[N]; short c[N]; int i;
74 v8hi *pa = (v8hi*)a, *pb = (v8hi*)b, *pc = (v8hi*)c;
75 v8hi va, vb, vc;
77 for (i=0; i<N/8; i++){
78 vb = pb[i];
79 vc = pc[i];
80 va = vb + vc;
81 pa[i] = va;
84 The main entry to this pass is vectorize_loops(), in which
85 the vectorizer applies a set of analyses on a given set of loops,
86 followed by the actual vectorization transformation for the loops that
87 had successfully passed the analysis phase.
88 Throughout this pass we make a distinction between two types of
89 data: scalars (which are represented by SSA_NAMES), and memory references
90 ("data-refs"). These two types of data require different handling both
91 during analysis and transformation. The types of data-refs that the
92 vectorizer currently supports are ARRAY_REFS whose base is an array DECL
93 (not a pointer), and INDIRECT_REFS through pointers; both array and pointer
94 accesses are required to have a simple (consecutive) access pattern.
96 Analysis phase:
97 ===============
98 The driver for the analysis phase is vect_analyze_loop().
99 It applies a set of analyses, some of which rely on the scalar evolution
100 analyzer (scev) developed by Sebastian Pop.
102 During the analysis phase the vectorizer records some information
103 per stmt in a "stmt_vec_info" struct which is attached to each stmt in the
104 loop, as well as general information about the loop as a whole, which is
105 recorded in a "loop_vec_info" struct attached to each loop.
107 Transformation phase:
108 =====================
109 The loop transformation phase scans all the stmts in the loop, and
110 creates a vector stmt (or a sequence of stmts) for each scalar stmt S in
111 the loop that needs to be vectorized. It inserts the vector code sequence
112 just before the scalar stmt S, and records a pointer to the vector code
113 in STMT_VINFO_VEC_STMT (stmt_info) (stmt_info is the stmt_vec_info struct
114 attached to S). This pointer will be used for the vectorization of following
115 stmts which use the def of stmt S. Stmt S is removed if it writes to memory;
116 otherwise, we rely on dead code elimination for removing it.
118 For example, say stmt S1 was vectorized into stmt VS1:
120 VS1: vb = px[i];
121 S1: b = x[i]; STMT_VINFO_VEC_STMT (stmt_info (S1)) = VS1
122 S2: a = b;
124 To vectorize stmt S2, the vectorizer first finds the stmt that defines
125 the operand 'b' (S1), and gets the relevant vector def 'vb' from the
126 vector stmt VS1 pointed to by STMT_VINFO_VEC_STMT (stmt_info (S1)). The
127 resulting sequence would be:
129 VS1: vb = px[i];
130 S1: b = x[i]; STMT_VINFO_VEC_STMT (stmt_info (S1)) = VS1
131 VS2: va = vb;
132 S2: a = b; STMT_VINFO_VEC_STMT (stmt_info (S2)) = VS2
134 Operands that are not SSA_NAMEs are data-refs that appear in
135 load/store operations (like 'x[i]' in S1), and are handled differently.
137 Target modeling:
138 =================
139 Currently the only target specific information that is used is the
140 size of the vector (in bytes) - "TARGET_VECTORIZE_UNITS_PER_SIMD_WORD".
141 Targets that can support different sizes of vectors will, for now, need
142 to specify one value for "TARGET_VECTORIZE_UNITS_PER_SIMD_WORD". More
143 flexibility will be added in the future.
145 Since we only vectorize operations whose vector form can be
146 expressed using existing tree codes, to verify that an operation is
147 supported, the vectorizer checks the relevant optab at the relevant
148 machine_mode (e.g., optab_handler (add_optab, V8HImode)). If
149 the value found is CODE_FOR_nothing, then there's no target support, and
150 we can't vectorize the stmt.
152 For additional information on this project see:
153 http://gcc.gnu.org/projects/tree-ssa/vectorization.html
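/* As a minimal standalone sketch of the rewrite described above (the
   same V8HI example), the two functions below show the scalar form and
   the hand-vectorized form the pass aims to produce.  All example_*
   names and the fixed bound are illustrative only and are not part of
   the vectorizer.  */

#define EXAMPLE_N 64

typedef short example_v8hi __attribute__ ((vector_size (16)));

short example_a[EXAMPLE_N], example_b[EXAMPLE_N], example_c[EXAMPLE_N];

/* Scalar form, as written by the user.  */
static void
example_scalar_add (void)
{
  for (int i = 0; i < EXAMPLE_N; i++)
    example_a[i] = example_b[i] + example_c[i];
}

/* Vector form: eight short additions per iteration (EXAMPLE_N is a
   multiple of 8 here, so no epilogue loop is shown).  */
static void
example_vector_add (void)
{
  example_v8hi *pa = (example_v8hi *) example_a;
  example_v8hi *pb = (example_v8hi *) example_b;
  example_v8hi *pc = (example_v8hi *) example_c;
  for (int i = 0; i < EXAMPLE_N / 8; i++)
    pa[i] = pb[i] + pc[i];
}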
156 static void vect_estimate_min_profitable_iters (loop_vec_info, int *, int *);
157 static stmt_vec_info vect_is_simple_reduction (loop_vec_info, stmt_vec_info,
158 bool *, bool *);
160 /* Subroutine of vect_determine_vf_for_stmt that handles only one
161 statement. VECTYPE_MAYBE_SET_P is true if STMT_VINFO_VECTYPE
162 may already be set for general statements (not just data refs). */
164 static opt_result
165 vect_determine_vf_for_stmt_1 (vec_info *vinfo, stmt_vec_info stmt_info,
166 bool vectype_maybe_set_p,
167 poly_uint64 *vf)
169 gimple *stmt = stmt_info->stmt;
171 if ((!STMT_VINFO_RELEVANT_P (stmt_info)
172 && !STMT_VINFO_LIVE_P (stmt_info))
173 || gimple_clobber_p (stmt))
175 if (dump_enabled_p ())
176 dump_printf_loc (MSG_NOTE, vect_location, "skip.\n");
177 return opt_result::success ();
180 tree stmt_vectype, nunits_vectype;
181 opt_result res = vect_get_vector_types_for_stmt (vinfo, stmt_info,
182 &stmt_vectype,
183 &nunits_vectype);
184 if (!res)
185 return res;
187 if (stmt_vectype)
189 if (STMT_VINFO_VECTYPE (stmt_info))
190 /* The only case when a vectype had been already set is for stmts
191 that contain a data ref, or for "pattern-stmts" (stmts generated
192 by the vectorizer to represent/replace a certain idiom). */
193 gcc_assert ((STMT_VINFO_DATA_REF (stmt_info)
194 || vectype_maybe_set_p)
195 && STMT_VINFO_VECTYPE (stmt_info) == stmt_vectype);
196 else
197 STMT_VINFO_VECTYPE (stmt_info) = stmt_vectype;
200 if (nunits_vectype)
201 vect_update_max_nunits (vf, nunits_vectype);
203 return opt_result::success ();
206 /* Subroutine of vect_determine_vectorization_factor. Set the vector
207 types of STMT_INFO and all attached pattern statements and update
208 the vectorization factor VF accordingly. Return true on success
209 or false if something prevented vectorization. */
211 static opt_result
212 vect_determine_vf_for_stmt (vec_info *vinfo,
213 stmt_vec_info stmt_info, poly_uint64 *vf)
215 if (dump_enabled_p ())
216 dump_printf_loc (MSG_NOTE, vect_location, "==> examining statement: %G",
217 stmt_info->stmt);
218 opt_result res = vect_determine_vf_for_stmt_1 (vinfo, stmt_info, false, vf);
219 if (!res)
220 return res;
222 if (STMT_VINFO_IN_PATTERN_P (stmt_info)
223 && STMT_VINFO_RELATED_STMT (stmt_info))
225 gimple *pattern_def_seq = STMT_VINFO_PATTERN_DEF_SEQ (stmt_info);
226 stmt_info = STMT_VINFO_RELATED_STMT (stmt_info);
228 /* If a pattern statement has def stmts, analyze them too. */
229 for (gimple_stmt_iterator si = gsi_start (pattern_def_seq);
230 !gsi_end_p (si); gsi_next (&si))
232 stmt_vec_info def_stmt_info = vinfo->lookup_stmt (gsi_stmt (si));
233 if (dump_enabled_p ())
234 dump_printf_loc (MSG_NOTE, vect_location,
235 "==> examining pattern def stmt: %G",
236 def_stmt_info->stmt);
237 res = vect_determine_vf_for_stmt_1 (vinfo, def_stmt_info, true, vf);
238 if (!res)
239 return res;
242 if (dump_enabled_p ())
243 dump_printf_loc (MSG_NOTE, vect_location,
244 "==> examining pattern statement: %G",
245 stmt_info->stmt);
246 res = vect_determine_vf_for_stmt_1 (vinfo, stmt_info, true, vf);
247 if (!res)
248 return res;
251 return opt_result::success ();
254 /* Function vect_determine_vectorization_factor
256 Determine the vectorization factor (VF). VF is the number of data elements
257 that are operated upon in parallel in a single iteration of the vectorized
258 loop. For example, when vectorizing a loop that operates on 4-byte elements,
259 on a target with a vector size (VS) of 16 bytes, the VF is set to 4, since 4
260 elements can fit in a single vector register.
262 We currently support vectorization of loops in which all types operated upon
263 are of the same size. Therefore this function currently sets VF according to
264 the size of the types operated upon, and fails if there are multiple sizes
265 in the loop.
267 VF is also the factor by which the loop iterations are strip-mined, e.g.:
268 original loop:
269 for (i=0; i<N; i++){
270 a[i] = b[i] + c[i];
273 vectorized loop:
274 for (i=0; i<N; i+=VF){
275 a[i:VF] = b[i:VF] + c[i:VF];
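/* Illustrative arithmetic for the comment above: with 4-byte elements
   and 16-byte vectors, VF is 16 / 4 = 4, so a 103-iteration scalar
   loop runs 25 full vector iterations and leaves 3 scalar iterations
   for an epilogue.  The helper below is only a sketch; it is not how
   the vectorizer itself computes VF.  */

static inline unsigned
example_vectorization_factor (unsigned vector_size_in_bytes,
                              unsigned element_size_in_bytes)
{
  return vector_size_in_bytes / element_size_in_bytes;
}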
279 static opt_result
280 vect_determine_vectorization_factor (loop_vec_info loop_vinfo)
282 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
283 basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
284 unsigned nbbs = loop->num_nodes;
285 poly_uint64 vectorization_factor = 1;
286 tree scalar_type = NULL_TREE;
287 gphi *phi;
288 tree vectype;
289 stmt_vec_info stmt_info;
290 unsigned i;
292 DUMP_VECT_SCOPE ("vect_determine_vectorization_factor");
294 for (i = 0; i < nbbs; i++)
296 basic_block bb = bbs[i];
298 for (gphi_iterator si = gsi_start_phis (bb); !gsi_end_p (si);
299 gsi_next (&si))
301 phi = si.phi ();
302 stmt_info = loop_vinfo->lookup_stmt (phi);
303 if (dump_enabled_p ())
304 dump_printf_loc (MSG_NOTE, vect_location, "==> examining phi: %G",
305 phi);
307 gcc_assert (stmt_info);
309 if (STMT_VINFO_RELEVANT_P (stmt_info)
310 || STMT_VINFO_LIVE_P (stmt_info))
312 gcc_assert (!STMT_VINFO_VECTYPE (stmt_info));
313 scalar_type = TREE_TYPE (PHI_RESULT (phi));
315 if (dump_enabled_p ())
316 dump_printf_loc (MSG_NOTE, vect_location,
317 "get vectype for scalar type: %T\n",
318 scalar_type);
320 vectype = get_vectype_for_scalar_type (loop_vinfo, scalar_type);
321 if (!vectype)
322 return opt_result::failure_at (phi,
323 "not vectorized: unsupported "
324 "data-type %T\n",
325 scalar_type);
326 STMT_VINFO_VECTYPE (stmt_info) = vectype;
328 if (dump_enabled_p ())
329 dump_printf_loc (MSG_NOTE, vect_location, "vectype: %T\n",
330 vectype);
332 if (dump_enabled_p ())
334 dump_printf_loc (MSG_NOTE, vect_location, "nunits = ");
335 dump_dec (MSG_NOTE, TYPE_VECTOR_SUBPARTS (vectype));
336 dump_printf (MSG_NOTE, "\n");
339 vect_update_max_nunits (&vectorization_factor, vectype);
343 for (gimple_stmt_iterator si = gsi_start_bb (bb); !gsi_end_p (si);
344 gsi_next (&si))
346 if (is_gimple_debug (gsi_stmt (si)))
347 continue;
348 stmt_info = loop_vinfo->lookup_stmt (gsi_stmt (si));
349 opt_result res
350 = vect_determine_vf_for_stmt (loop_vinfo,
351 stmt_info, &vectorization_factor);
352 if (!res)
353 return res;
357 /* TODO: Analyze cost. Decide if worth while to vectorize. */
358 if (dump_enabled_p ())
360 dump_printf_loc (MSG_NOTE, vect_location, "vectorization factor = ");
361 dump_dec (MSG_NOTE, vectorization_factor);
362 dump_printf (MSG_NOTE, "\n");
365 if (known_le (vectorization_factor, 1U))
366 return opt_result::failure_at (vect_location,
367 "not vectorized: unsupported data-type\n");
368 LOOP_VINFO_VECT_FACTOR (loop_vinfo) = vectorization_factor;
369 return opt_result::success ();
373 /* Function vect_is_simple_iv_evolution.
375 FORNOW: A simple evolution of an induction variable in the loop is
376 considered a polynomial evolution. */
378 static bool
379 vect_is_simple_iv_evolution (unsigned loop_nb, tree access_fn, tree * init,
380 tree * step)
382 tree init_expr;
383 tree step_expr;
384 tree evolution_part = evolution_part_in_loop_num (access_fn, loop_nb);
385 basic_block bb;
387 /* When there is no evolution in this loop, the evolution function
388 is not "simple". */
389 if (evolution_part == NULL_TREE)
390 return false;
392 /* When the evolution is a polynomial of degree >= 2
393 the evolution function is not "simple". */
394 if (tree_is_chrec (evolution_part))
395 return false;
397 step_expr = evolution_part;
398 init_expr = unshare_expr (initial_condition_in_loop_num (access_fn, loop_nb));
400 if (dump_enabled_p ())
401 dump_printf_loc (MSG_NOTE, vect_location, "step: %T, init: %T\n",
402 step_expr, init_expr);
404 *init = init_expr;
405 *step = step_expr;
407 if (TREE_CODE (step_expr) != INTEGER_CST
408 && (TREE_CODE (step_expr) != SSA_NAME
409 || ((bb = gimple_bb (SSA_NAME_DEF_STMT (step_expr)))
410 && flow_bb_inside_loop_p (get_loop (cfun, loop_nb), bb))
411 || (!INTEGRAL_TYPE_P (TREE_TYPE (step_expr))
412 && (!SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr))
413 || !flag_associative_math)))
414 && (TREE_CODE (step_expr) != REAL_CST
415 || !flag_associative_math))
417 if (dump_enabled_p ())
418 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
419 "step unknown.\n");
420 return false;
423 return true;
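/* Worked example of the "simple" evolutions this predicate accepts:
   for a loop like

     for (i = 7; i < n; i += 3)

   scev describes i as the chrec {7, +, 3}_loop, so *INIT is 7 and
   *STEP is 3.  A step that is itself a chrec (e.g. i += j; j += 1)
   is rejected.  The helper below is an illustrative sketch of how
   such an affine IV evaluates at a given iteration; it is not part
   of the vectorizer.  */

static inline long
example_affine_iv_value (long init, long step, long iteration)
{
  /* Value of {init, +, step} after ITERATION back edges.  */
  return init + iteration * step;
}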
426 /* Return true if PHI, described by STMT_INFO, is the inner PHI in
427 what we are assuming is a double reduction. For example, given
428 a structure like this:
430 outer1:
431 x_1 = PHI <x_4(outer2), ...>;
434 inner:
435 x_2 = PHI <x_1(outer1), ...>;
437 x_3 = ...;
440 outer2:
441 x_4 = PHI <x_3(inner)>;
444 outer loop analysis would treat x_1 as a double reduction phi and
445 this function would then return true for x_2. */
447 static bool
448 vect_inner_phi_in_double_reduction_p (loop_vec_info loop_vinfo, gphi *phi)
450 use_operand_p use_p;
451 ssa_op_iter op_iter;
452 FOR_EACH_PHI_ARG (use_p, phi, op_iter, SSA_OP_USE)
453 if (stmt_vec_info def_info = loop_vinfo->lookup_def (USE_FROM_PTR (use_p)))
454 if (STMT_VINFO_DEF_TYPE (def_info) == vect_double_reduction_def)
455 return true;
456 return false;
459 /* Function vect_analyze_scalar_cycles_1.
461 Examine the cross iteration def-use cycles of scalar variables
462 in LOOP. LOOP_VINFO represents the loop that is now being
463 considered for vectorization (can be LOOP, or an outer-loop
464 enclosing LOOP). */
466 static void
467 vect_analyze_scalar_cycles_1 (loop_vec_info loop_vinfo, class loop *loop)
469 basic_block bb = loop->header;
470 tree init, step;
471 auto_vec<stmt_vec_info, 64> worklist;
472 gphi_iterator gsi;
473 bool double_reduc, reduc_chain;
475 DUMP_VECT_SCOPE ("vect_analyze_scalar_cycles");
477 /* First - identify all inductions. Reduction detection assumes that all the
478 inductions have been identified, therefore, this order must not be
479 changed. */
480 for (gsi = gsi_start_phis (bb); !gsi_end_p (gsi); gsi_next (&gsi))
482 gphi *phi = gsi.phi ();
483 tree access_fn = NULL;
484 tree def = PHI_RESULT (phi);
485 stmt_vec_info stmt_vinfo = loop_vinfo->lookup_stmt (phi);
487 if (dump_enabled_p ())
488 dump_printf_loc (MSG_NOTE, vect_location, "Analyze phi: %G", phi);
490 /* Skip virtual phi's. The data dependences that are associated with
491 virtual defs/uses (i.e., memory accesses) are analyzed elsewhere. */
492 if (virtual_operand_p (def))
493 continue;
495 STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_unknown_def_type;
497 /* Analyze the evolution function. */
498 access_fn = analyze_scalar_evolution (loop, def);
499 if (access_fn)
501 STRIP_NOPS (access_fn);
502 if (dump_enabled_p ())
503 dump_printf_loc (MSG_NOTE, vect_location,
504 "Access function of PHI: %T\n", access_fn);
505 STMT_VINFO_LOOP_PHI_EVOLUTION_BASE_UNCHANGED (stmt_vinfo)
506 = initial_condition_in_loop_num (access_fn, loop->num);
507 STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_vinfo)
508 = evolution_part_in_loop_num (access_fn, loop->num);
511 if (!access_fn
512 || vect_inner_phi_in_double_reduction_p (loop_vinfo, phi)
513 || !vect_is_simple_iv_evolution (loop->num, access_fn, &init, &step)
514 || (LOOP_VINFO_LOOP (loop_vinfo) != loop
515 && TREE_CODE (step) != INTEGER_CST))
517 worklist.safe_push (stmt_vinfo);
518 continue;
521 gcc_assert (STMT_VINFO_LOOP_PHI_EVOLUTION_BASE_UNCHANGED (stmt_vinfo)
522 != NULL_TREE);
523 gcc_assert (STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_vinfo) != NULL_TREE);
525 if (dump_enabled_p ())
526 dump_printf_loc (MSG_NOTE, vect_location, "Detected induction.\n");
527 STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_induction_def;
531 /* Second - identify all reductions and nested cycles. */
532 while (worklist.length () > 0)
534 stmt_vec_info stmt_vinfo = worklist.pop ();
535 gphi *phi = as_a <gphi *> (stmt_vinfo->stmt);
536 tree def = PHI_RESULT (phi);
538 if (dump_enabled_p ())
539 dump_printf_loc (MSG_NOTE, vect_location, "Analyze phi: %G", phi);
541 gcc_assert (!virtual_operand_p (def)
542 && STMT_VINFO_DEF_TYPE (stmt_vinfo) == vect_unknown_def_type);
544 stmt_vec_info reduc_stmt_info
545 = vect_is_simple_reduction (loop_vinfo, stmt_vinfo, &double_reduc,
546 &reduc_chain);
547 if (reduc_stmt_info)
549 STMT_VINFO_REDUC_DEF (stmt_vinfo) = reduc_stmt_info;
550 STMT_VINFO_REDUC_DEF (reduc_stmt_info) = stmt_vinfo;
551 if (double_reduc)
553 if (dump_enabled_p ())
554 dump_printf_loc (MSG_NOTE, vect_location,
555 "Detected double reduction.\n");
557 STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_double_reduction_def;
558 STMT_VINFO_DEF_TYPE (reduc_stmt_info) = vect_double_reduction_def;
560 else
562 if (loop != LOOP_VINFO_LOOP (loop_vinfo))
564 if (dump_enabled_p ())
565 dump_printf_loc (MSG_NOTE, vect_location,
566 "Detected vectorizable nested cycle.\n");
568 STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_nested_cycle;
570 else
572 if (dump_enabled_p ())
573 dump_printf_loc (MSG_NOTE, vect_location,
574 "Detected reduction.\n");
576 STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_reduction_def;
577 STMT_VINFO_DEF_TYPE (reduc_stmt_info) = vect_reduction_def;
578 /* Store the reduction cycles for possible vectorization in
579 loop-aware SLP if it was not detected as reduction
580 chain. */
581 if (! reduc_chain)
582 LOOP_VINFO_REDUCTIONS (loop_vinfo).safe_push
583 (reduc_stmt_info);
587 else
588 if (dump_enabled_p ())
589 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
590 "Unknown def-use cycle pattern.\n");
595 /* Function vect_analyze_scalar_cycles.
597 Examine the cross iteration def-use cycles of scalar variables, by
598 analyzing the loop-header PHIs of scalar variables. Classify each
599 cycle as one of the following: invariant, induction, reduction, unknown.
600 We do that for the loop represented by LOOP_VINFO, and also for its
601 inner-loop, if it exists.
602 Examples for scalar cycles:
604 Example1: reduction:
606 loop1:
607 for (i=0; i<N; i++)
608 sum += a[i];
610 Example2: induction:
612 loop2:
613 for (i=0; i<N; i++)
614 a[i] = i; */
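/* A loop mixing both cycle kinds from the examples above; the comments
   mark how the corresponding loop-header PHIs are classified.  The
   function is illustrative only.  */

static int
example_scalar_cycles (const int *a, int n)
{
  int sum = 0;                  /* PHI for sum: vect_reduction_def.  */
  for (int i = 0; i < n; i++)   /* PHI for i: vect_induction_def.  */
    sum += a[i];
  return sum;
}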
616 static void
617 vect_analyze_scalar_cycles (loop_vec_info loop_vinfo)
619 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
621 vect_analyze_scalar_cycles_1 (loop_vinfo, loop);
623 /* When vectorizing an outer-loop, the inner-loop is executed sequentially.
624 Reductions in such inner-loop therefore have different properties than
625 the reductions in the nest that gets vectorized:
626 1. When vectorized, they are executed in the same order as in the original
627 scalar loop, so we can't change the order of computation when
628 vectorizing them.
629 2. FIXME: Inner-loop reductions can be used in the inner-loop, so the
630 current checks are too strict. */
632 if (loop->inner)
633 vect_analyze_scalar_cycles_1 (loop_vinfo, loop->inner);
636 /* Transfer group and reduction information from STMT_INFO to its
637 pattern stmt. */
639 static void
640 vect_fixup_reduc_chain (stmt_vec_info stmt_info)
642 stmt_vec_info firstp = STMT_VINFO_RELATED_STMT (stmt_info);
643 stmt_vec_info stmtp;
644 gcc_assert (!REDUC_GROUP_FIRST_ELEMENT (firstp)
645 && REDUC_GROUP_FIRST_ELEMENT (stmt_info));
646 REDUC_GROUP_SIZE (firstp) = REDUC_GROUP_SIZE (stmt_info);
649 stmtp = STMT_VINFO_RELATED_STMT (stmt_info);
650 gcc_checking_assert (STMT_VINFO_DEF_TYPE (stmtp)
651 == STMT_VINFO_DEF_TYPE (stmt_info));
652 REDUC_GROUP_FIRST_ELEMENT (stmtp) = firstp;
653 stmt_info = REDUC_GROUP_NEXT_ELEMENT (stmt_info);
654 if (stmt_info)
655 REDUC_GROUP_NEXT_ELEMENT (stmtp)
656 = STMT_VINFO_RELATED_STMT (stmt_info);
658 while (stmt_info);
661 /* Fixup scalar cycles that now have their stmts detected as patterns. */
663 static void
664 vect_fixup_scalar_cycles_with_patterns (loop_vec_info loop_vinfo)
666 stmt_vec_info first;
667 unsigned i;
669 FOR_EACH_VEC_ELT (LOOP_VINFO_REDUCTION_CHAINS (loop_vinfo), i, first)
671 stmt_vec_info next = REDUC_GROUP_NEXT_ELEMENT (first);
672 while (next)
674 if ((STMT_VINFO_IN_PATTERN_P (next)
675 != STMT_VINFO_IN_PATTERN_P (first))
676 || STMT_VINFO_REDUC_IDX (vect_stmt_to_vectorize (next)) == -1)
677 break;
678 next = REDUC_GROUP_NEXT_ELEMENT (next);
680 /* If all reduction chain members are well-formed patterns, adjust
681 the group to group the pattern stmts instead. */
682 if (! next
683 && STMT_VINFO_REDUC_IDX (vect_stmt_to_vectorize (first)) != -1)
685 if (STMT_VINFO_IN_PATTERN_P (first))
687 vect_fixup_reduc_chain (first);
688 LOOP_VINFO_REDUCTION_CHAINS (loop_vinfo)[i]
689 = STMT_VINFO_RELATED_STMT (first);
692 /* If not all stmts in the chain are patterns, or if we failed
693 to update STMT_VINFO_REDUC_IDX, dissolve the chain and handle
694 it as a regular reduction instead. */
695 else
697 stmt_vec_info vinfo = first;
698 stmt_vec_info last = NULL;
699 while (vinfo)
701 next = REDUC_GROUP_NEXT_ELEMENT (vinfo);
702 REDUC_GROUP_FIRST_ELEMENT (vinfo) = NULL;
703 REDUC_GROUP_NEXT_ELEMENT (vinfo) = NULL;
704 last = vinfo;
705 vinfo = next;
707 STMT_VINFO_DEF_TYPE (vect_stmt_to_vectorize (first))
708 = vect_internal_def;
709 loop_vinfo->reductions.safe_push (vect_stmt_to_vectorize (last));
710 LOOP_VINFO_REDUCTION_CHAINS (loop_vinfo).unordered_remove (i);
711 --i;
716 /* Function vect_get_loop_niters.
718 Determine how many iterations the loop is executed and place it
719 in NUMBER_OF_ITERATIONS. Place the number of latch iterations
720 in NUMBER_OF_ITERATIONSM1. Place the condition under which the
721 niter information holds in ASSUMPTIONS.
723 Return the loop exit condition. */
726 static gcond *
727 vect_get_loop_niters (class loop *loop, tree *assumptions,
728 tree *number_of_iterations, tree *number_of_iterationsm1)
730 edge exit = single_exit (loop);
731 class tree_niter_desc niter_desc;
732 tree niter_assumptions, niter, may_be_zero;
733 gcond *cond = get_loop_exit_condition (loop);
735 *assumptions = boolean_true_node;
736 *number_of_iterationsm1 = chrec_dont_know;
737 *number_of_iterations = chrec_dont_know;
738 DUMP_VECT_SCOPE ("get_loop_niters");
740 if (!exit)
741 return cond;
743 may_be_zero = NULL_TREE;
744 if (!number_of_iterations_exit_assumptions (loop, exit, &niter_desc, NULL)
745 || chrec_contains_undetermined (niter_desc.niter))
746 return cond;
748 niter_assumptions = niter_desc.assumptions;
749 may_be_zero = niter_desc.may_be_zero;
750 niter = niter_desc.niter;
752 if (may_be_zero && integer_zerop (may_be_zero))
753 may_be_zero = NULL_TREE;
755 if (may_be_zero)
757 if (COMPARISON_CLASS_P (may_be_zero))
759 /* Try to combine may_be_zero with assumptions, this can simplify
760 computation of niter expression. */
761 if (niter_assumptions && !integer_nonzerop (niter_assumptions))
762 niter_assumptions = fold_build2 (TRUTH_AND_EXPR, boolean_type_node,
763 niter_assumptions,
764 fold_build1 (TRUTH_NOT_EXPR,
765 boolean_type_node,
766 may_be_zero));
767 else
768 niter = fold_build3 (COND_EXPR, TREE_TYPE (niter), may_be_zero,
769 build_int_cst (TREE_TYPE (niter), 0),
770 rewrite_to_non_trapping_overflow (niter));
772 may_be_zero = NULL_TREE;
774 else if (integer_nonzerop (may_be_zero))
776 *number_of_iterationsm1 = build_int_cst (TREE_TYPE (niter), 0);
777 *number_of_iterations = build_int_cst (TREE_TYPE (niter), 1);
778 return cond;
780 else
781 return cond;
784 *assumptions = niter_assumptions;
785 *number_of_iterationsm1 = niter;
787 /* We want the number of loop header executions which is the number
788 of latch executions plus one.
789 ??? For UINT_MAX latch executions this number overflows to zero
790 for loops like do { n++; } while (n != 0); */
791 if (niter && !chrec_contains_undetermined (niter))
792 niter = fold_build2 (PLUS_EXPR, TREE_TYPE (niter), unshare_expr (niter),
793 build_int_cst (TREE_TYPE (niter), 1));
794 *number_of_iterations = niter;
796 return cond;
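/* Worked example of the two counts computed above, for a loop already
   in the do-while shape the vectorizer expects; purely illustrative.
   For n == 5 the latch (back edge) is taken 4 times, so
   NUMBER_OF_ITERATIONSM1 is 4 and NUMBER_OF_ITERATIONS (header
   executions) is 5.  For n == 0 the header still executes once.  */

static unsigned
example_count_header_executions (unsigned n)
{
  unsigned i = 0, header_execs = 0;
  do
    {
      header_execs++;   /* NUMBER_OF_ITERATIONS counts these.  */
      i++;
    }
  while (i < n);        /* Taken NUMBER_OF_ITERATIONSM1 times.  */
  return header_execs;
}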
799 /* Function bb_in_loop_p
801 Used as predicate for dfs order traversal of the loop bbs. */
803 static bool
804 bb_in_loop_p (const_basic_block bb, const void *data)
806 const class loop *const loop = (const class loop *)data;
807 if (flow_bb_inside_loop_p (loop, bb))
808 return true;
809 return false;
813 /* Create and initialize a new loop_vec_info struct for LOOP_IN, as well as
814 stmt_vec_info structs for all the stmts in LOOP_IN. */
816 _loop_vec_info::_loop_vec_info (class loop *loop_in, vec_info_shared *shared)
817 : vec_info (vec_info::loop, init_cost (loop_in, false), shared),
818 loop (loop_in),
819 bbs (XCNEWVEC (basic_block, loop->num_nodes)),
820 num_itersm1 (NULL_TREE),
821 num_iters (NULL_TREE),
822 num_iters_unchanged (NULL_TREE),
823 num_iters_assumptions (NULL_TREE),
824 th (0),
825 versioning_threshold (0),
826 vectorization_factor (0),
827 main_loop_edge (nullptr),
828 skip_main_loop_edge (nullptr),
829 skip_this_loop_edge (nullptr),
830 reusable_accumulators (),
831 max_vectorization_factor (0),
832 mask_skip_niters (NULL_TREE),
833 rgroup_compare_type (NULL_TREE),
834 simd_if_cond (NULL_TREE),
835 unaligned_dr (NULL),
836 peeling_for_alignment (0),
837 ptr_mask (0),
838 ivexpr_map (NULL),
839 scan_map (NULL),
840 slp_unrolling_factor (1),
841 single_scalar_iteration_cost (0),
842 vec_outside_cost (0),
843 vec_inside_cost (0),
844 inner_loop_cost_factor (param_vect_inner_loop_cost_factor),
845 vectorizable (false),
846 can_use_partial_vectors_p (param_vect_partial_vector_usage != 0),
847 using_partial_vectors_p (false),
848 epil_using_partial_vectors_p (false),
849 peeling_for_gaps (false),
850 peeling_for_niter (false),
851 no_data_dependencies (false),
852 has_mask_store (false),
853 scalar_loop_scaling (profile_probability::uninitialized ()),
854 scalar_loop (NULL),
855 orig_loop_info (NULL)
857 /* CHECKME: We want to visit all BBs before their successors (except for
858 latch blocks, for which this assertion wouldn't hold). In the simple
859 case of the loop forms we allow, a dfs order of the BBs would be the same
860 as reversed postorder traversal, so we are safe. */
862 unsigned int nbbs = dfs_enumerate_from (loop->header, 0, bb_in_loop_p,
863 bbs, loop->num_nodes, loop);
864 gcc_assert (nbbs == loop->num_nodes);
866 for (unsigned int i = 0; i < nbbs; i++)
868 basic_block bb = bbs[i];
869 gimple_stmt_iterator si;
871 for (si = gsi_start_phis (bb); !gsi_end_p (si); gsi_next (&si))
873 gimple *phi = gsi_stmt (si);
874 gimple_set_uid (phi, 0);
875 add_stmt (phi);
878 for (si = gsi_start_bb (bb); !gsi_end_p (si); gsi_next (&si))
880 gimple *stmt = gsi_stmt (si);
881 gimple_set_uid (stmt, 0);
882 if (is_gimple_debug (stmt))
883 continue;
884 add_stmt (stmt);
885 /* If the .GOMP_SIMD_LANE call for the current loop has 3 arguments, the
886 third argument is the #pragma omp simd if (x) condition.  When it is 0,
887 the loop shouldn't be vectorized; when it is a non-zero constant, it
888 should be vectorized normally; otherwise the loop is versioned, with the
889 vectorized copy executed if the condition is non-zero at runtime. */
890 if (loop_in->simduid
891 && is_gimple_call (stmt)
892 && gimple_call_internal_p (stmt)
893 && gimple_call_internal_fn (stmt) == IFN_GOMP_SIMD_LANE
894 && gimple_call_num_args (stmt) >= 3
895 && TREE_CODE (gimple_call_arg (stmt, 0)) == SSA_NAME
896 && (loop_in->simduid
897 == SSA_NAME_VAR (gimple_call_arg (stmt, 0))))
899 tree arg = gimple_call_arg (stmt, 2);
900 if (integer_zerop (arg) || TREE_CODE (arg) == SSA_NAME)
901 simd_if_cond = arg;
902 else
903 gcc_assert (integer_nonzerop (arg));
908 epilogue_vinfos.create (6);
911 /* Free all levels of rgroup CONTROLS. */
913 void
914 release_vec_loop_controls (vec<rgroup_controls> *controls)
916 rgroup_controls *rgc;
917 unsigned int i;
918 FOR_EACH_VEC_ELT (*controls, i, rgc)
919 rgc->controls.release ();
920 controls->release ();
923 /* Free all memory used by the _loop_vec_info, as well as all the
924 stmt_vec_info structs of all the stmts in the loop. */
926 _loop_vec_info::~_loop_vec_info ()
928 free (bbs);
930 release_vec_loop_controls (&masks);
931 release_vec_loop_controls (&lens);
932 delete ivexpr_map;
933 delete scan_map;
934 epilogue_vinfos.release ();
936 /* When we release an epilogue vinfo that we do not intend to use
937 avoid clearing AUX of the main loop which should continue to
938 point to the main loop vinfo since otherwise we'll leak that. */
939 if (loop->aux == this)
940 loop->aux = NULL;
943 /* Return an invariant or register for EXPR and emit necessary
944 computations in the LOOP_VINFO loop preheader. */
946 tree
947 cse_and_gimplify_to_preheader (loop_vec_info loop_vinfo, tree expr)
949 if (is_gimple_reg (expr)
950 || is_gimple_min_invariant (expr))
951 return expr;
953 if (! loop_vinfo->ivexpr_map)
954 loop_vinfo->ivexpr_map = new hash_map<tree_operand_hash, tree>;
955 tree &cached = loop_vinfo->ivexpr_map->get_or_insert (expr);
956 if (! cached)
958 gimple_seq stmts = NULL;
959 cached = force_gimple_operand (unshare_expr (expr),
960 &stmts, true, NULL_TREE);
961 if (stmts)
963 edge e = loop_preheader_edge (LOOP_VINFO_LOOP (loop_vinfo));
964 gsi_insert_seq_on_edge_immediate (e, stmts);
967 return cached;
970 /* Return true if we can use CMP_TYPE as the comparison type to produce
971 all masks required to mask LOOP_VINFO. */
973 static bool
974 can_produce_all_loop_masks_p (loop_vec_info loop_vinfo, tree cmp_type)
976 rgroup_controls *rgm;
977 unsigned int i;
978 FOR_EACH_VEC_ELT (LOOP_VINFO_MASKS (loop_vinfo), i, rgm)
979 if (rgm->type != NULL_TREE
980 && !direct_internal_fn_supported_p (IFN_WHILE_ULT,
981 cmp_type, rgm->type,
982 OPTIMIZE_FOR_SPEED))
983 return false;
984 return true;
987 /* Calculate the maximum number of scalars per iteration for every
988 rgroup in LOOP_VINFO. */
990 static unsigned int
991 vect_get_max_nscalars_per_iter (loop_vec_info loop_vinfo)
993 unsigned int res = 1;
994 unsigned int i;
995 rgroup_controls *rgm;
996 FOR_EACH_VEC_ELT (LOOP_VINFO_MASKS (loop_vinfo), i, rgm)
997 res = MAX (res, rgm->max_nscalars_per_iter);
998 return res;
1001 /* Calculate the minimum precision necessary to represent:
1003 MAX_NITERS * FACTOR
1005 as an unsigned integer, where MAX_NITERS is the maximum number of
1006 loop header iterations for the original scalar form of LOOP_VINFO. */
1008 static unsigned
1009 vect_min_prec_for_max_niters (loop_vec_info loop_vinfo, unsigned int factor)
1011 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1013 /* Get the maximum number of iterations that is representable
1014 in the counter type. */
1015 tree ni_type = TREE_TYPE (LOOP_VINFO_NITERSM1 (loop_vinfo));
1016 widest_int max_ni = wi::to_widest (TYPE_MAX_VALUE (ni_type)) + 1;
1018 /* Get a more refined estimate for the number of iterations. */
1019 widest_int max_back_edges;
1020 if (max_loop_iterations (loop, &max_back_edges))
1021 max_ni = wi::smin (max_ni, max_back_edges + 1);
1023 /* Work out how many bits we need to represent the limit. */
1024 return wi::min_precision (max_ni * factor, UNSIGNED);
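/* Illustrative arithmetic: if the scalar loop runs at most 1000 header
   iterations and FACTOR is 4, the product 4000 needs 12 bits.  The
   helper below is a plain-C sketch of the same bit count (the real
   code uses wide-int min_precision); it is not part of this file.  */

static inline unsigned
example_min_unsigned_precision (unsigned long long value)
{
  /* Number of bits needed to represent VALUE as an unsigned integer.  */
  unsigned prec = 0;
  while (value)
    {
      prec++;
      value >>= 1;
    }
  return prec;
}

/* e.g. example_min_unsigned_precision (1000ULL * 4) == 12.  */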
1027 /* True if the loop needs peeling or partial vectors when vectorized. */
1029 static bool
1030 vect_need_peeling_or_partial_vectors_p (loop_vec_info loop_vinfo)
1032 unsigned HOST_WIDE_INT const_vf;
1033 HOST_WIDE_INT max_niter
1034 = likely_max_stmt_executions_int (LOOP_VINFO_LOOP (loop_vinfo));
1036 unsigned th = LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo);
1037 if (!th && LOOP_VINFO_ORIG_LOOP_INFO (loop_vinfo))
1038 th = LOOP_VINFO_COST_MODEL_THRESHOLD (LOOP_VINFO_ORIG_LOOP_INFO
1039 (loop_vinfo));
1041 if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
1042 && LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) >= 0)
1044 /* Work out the (constant) number of iterations that need to be
1045 peeled for reasons other than niters. */
1046 unsigned int peel_niter = LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo);
1047 if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo))
1048 peel_niter += 1;
1049 if (!multiple_p (LOOP_VINFO_INT_NITERS (loop_vinfo) - peel_niter,
1050 LOOP_VINFO_VECT_FACTOR (loop_vinfo)))
1051 return true;
1053 else if (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo)
1054 /* ??? When peeling for gaps but not alignment, we could
1055 try to check whether the (variable) niters is known to be
1056 VF * N + 1. That's something of a niche case though. */
1057 || LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo)
1058 || !LOOP_VINFO_VECT_FACTOR (loop_vinfo).is_constant (&const_vf)
1059 || ((tree_ctz (LOOP_VINFO_NITERS (loop_vinfo))
1060 < (unsigned) exact_log2 (const_vf))
1061 /* In case of versioning, check if the maximum number of
1062 iterations is greater than th. If they are identical,
1063 the epilogue is unnecessary. */
1064 && (!LOOP_REQUIRES_VERSIONING (loop_vinfo)
1065 || ((unsigned HOST_WIDE_INT) max_niter
1066 > (th / const_vf) * const_vf))))
1067 return true;
1069 return false;
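/* Illustrative arithmetic for the constant-niters case above: with
   NITERS known to be 100, one iteration peeled for gaps, and VF = 8,
   99 is not a multiple of 8, so an epilogue (or partial vectors) is
   needed; with NITERS = 97 the remaining 96 iterations divide evenly
   and no epilogue is required.  The helper is a sketch only.  */

static inline int
example_needs_epilogue_p (unsigned long long niters, unsigned peel_niter,
                          unsigned long long vf)
{
  /* Whatever is left after peeling must be a whole number of vectors.  */
  return (niters - peel_niter) % vf != 0;
}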
1072 /* Each statement in LOOP_VINFO can be masked where necessary. Check
1073 whether we can actually generate the masks required. Return true if so,
1074 storing the type of the scalar IV in LOOP_VINFO_RGROUP_COMPARE_TYPE. */
1076 static bool
1077 vect_verify_full_masking (loop_vec_info loop_vinfo)
1079 unsigned int min_ni_width;
1080 unsigned int max_nscalars_per_iter
1081 = vect_get_max_nscalars_per_iter (loop_vinfo);
1083 /* Use a normal loop if there are no statements that need masking.
1084 This only happens in rare degenerate cases: it means that the loop
1085 has no loads, no stores, and no live-out values. */
1086 if (LOOP_VINFO_MASKS (loop_vinfo).is_empty ())
1087 return false;
1089 /* Work out how many bits we need to represent the limit. */
1090 min_ni_width
1091 = vect_min_prec_for_max_niters (loop_vinfo, max_nscalars_per_iter);
1093 /* Find a scalar mode for which WHILE_ULT is supported. */
1094 opt_scalar_int_mode cmp_mode_iter;
1095 tree cmp_type = NULL_TREE;
1096 tree iv_type = NULL_TREE;
1097 widest_int iv_limit = vect_iv_limit_for_partial_vectors (loop_vinfo);
1098 unsigned int iv_precision = UINT_MAX;
1100 if (iv_limit != -1)
1101 iv_precision = wi::min_precision (iv_limit * max_nscalars_per_iter,
1102 UNSIGNED);
1104 FOR_EACH_MODE_IN_CLASS (cmp_mode_iter, MODE_INT)
1106 unsigned int cmp_bits = GET_MODE_BITSIZE (cmp_mode_iter.require ());
1107 if (cmp_bits >= min_ni_width
1108 && targetm.scalar_mode_supported_p (cmp_mode_iter.require ()))
1110 tree this_type = build_nonstandard_integer_type (cmp_bits, true);
1111 if (this_type
1112 && can_produce_all_loop_masks_p (loop_vinfo, this_type))
1114 /* Although we could stop as soon as we find a valid mode,
1115 there are at least two reasons why that's not always the
1116 best choice:
1118 - An IV that's Pmode or wider is more likely to be reusable
1119 in address calculations than an IV that's narrower than
1120 Pmode.
1122 - Doing the comparison in IV_PRECISION or wider allows
1123 a natural 0-based IV, whereas using a narrower comparison
1124 type requires mitigations against wrap-around.
1126 Conversely, if the IV limit is variable, doing the comparison
1127 in a wider type than the original type can introduce
1128 unnecessary extensions, so picking the widest valid mode
1129 is not always a good choice either.
1131 Here we prefer the first IV type that's Pmode or wider,
1132 and the first comparison type that's IV_PRECISION or wider.
1133 (The comparison type must be no wider than the IV type,
1134 to avoid extensions in the vector loop.)
1136 ??? We might want to try continuing beyond Pmode for ILP32
1137 targets if CMP_BITS < IV_PRECISION. */
1138 iv_type = this_type;
1139 if (!cmp_type || iv_precision > TYPE_PRECISION (cmp_type))
1140 cmp_type = this_type;
1141 if (cmp_bits >= GET_MODE_BITSIZE (Pmode))
1142 break;
1147 if (!cmp_type)
1148 return false;
1150 LOOP_VINFO_RGROUP_COMPARE_TYPE (loop_vinfo) = cmp_type;
1151 LOOP_VINFO_RGROUP_IV_TYPE (loop_vinfo) = iv_type;
1152 return true;
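/* Sketch of the predicate an IFN_WHILE_ULT comparison computes for one
   mask: lane I is active iff INDEX + I is still below the scalar
   iteration limit.  Illustrative only; the vectorizer emits this as a
   single vector operation rather than a scalar loop.  */

static void
example_while_ult (unsigned long long index, unsigned long long limit,
                   unsigned char *mask, unsigned nlanes)
{
  for (unsigned i = 0; i < nlanes; i++)
    mask[i] = (index + i < limit);
}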
1155 /* Check whether we can use vector access with length based on precision
1156 comparison. So far, to keep it simple, we only allow the case that the
1157 precision of the target supported length is larger than the precision
1158 required by loop niters. */
1160 static bool
1161 vect_verify_loop_lens (loop_vec_info loop_vinfo)
1163 if (LOOP_VINFO_LENS (loop_vinfo).is_empty ())
1164 return false;
1166 unsigned int max_nitems_per_iter = 1;
1167 unsigned int i;
1168 rgroup_controls *rgl;
1169 /* Find the maximum number of items per iteration for every rgroup. */
1170 FOR_EACH_VEC_ELT (LOOP_VINFO_LENS (loop_vinfo), i, rgl)
1172 unsigned nitems_per_iter = rgl->max_nscalars_per_iter * rgl->factor;
1173 max_nitems_per_iter = MAX (max_nitems_per_iter, nitems_per_iter);
1176 /* Work out how many bits we need to represent the length limit. */
1177 unsigned int min_ni_prec
1178 = vect_min_prec_for_max_niters (loop_vinfo, max_nitems_per_iter);
1180 /* Now use the maximum of the precisions below for one suitable IV type:
1181 - the IV's natural precision
1182 - the precision needed to hold: the maximum number of scalar
1183 iterations multiplied by the scale factor (min_ni_prec above)
1184 - the Pmode precision
1186 If min_ni_prec is less than the precision of the current niters,
1187 we prefer to still use the niters type.  Prefer Pmode or a wider IV
1188 to avoid narrow conversions. */
1190 unsigned int ni_prec
1191 = TYPE_PRECISION (TREE_TYPE (LOOP_VINFO_NITERS (loop_vinfo)));
1192 min_ni_prec = MAX (min_ni_prec, ni_prec);
1193 min_ni_prec = MAX (min_ni_prec, GET_MODE_BITSIZE (Pmode));
1195 tree iv_type = NULL_TREE;
1196 opt_scalar_int_mode tmode_iter;
1197 FOR_EACH_MODE_IN_CLASS (tmode_iter, MODE_INT)
1199 scalar_mode tmode = tmode_iter.require ();
1200 unsigned int tbits = GET_MODE_BITSIZE (tmode);
1202 /* ??? Do we really want to construct one IV whose precision exceeds
1203 BITS_PER_WORD? */
1204 if (tbits > BITS_PER_WORD)
1205 break;
1207 /* Find the first available standard integral type. */
1208 if (tbits >= min_ni_prec && targetm.scalar_mode_supported_p (tmode))
1210 iv_type = build_nonstandard_integer_type (tbits, true);
1211 break;
1215 if (!iv_type)
1217 if (dump_enabled_p ())
1218 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1219 "can't vectorize with length-based partial vectors"
1220 " because there is no suitable iv type.\n");
1221 return false;
1224 LOOP_VINFO_RGROUP_COMPARE_TYPE (loop_vinfo) = iv_type;
1225 LOOP_VINFO_RGROUP_IV_TYPE (loop_vinfo) = iv_type;
1227 return true;
1230 /* Calculate the cost of one scalar iteration of the loop. */
1231 static void
1232 vect_compute_single_scalar_iteration_cost (loop_vec_info loop_vinfo)
1234 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1235 basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
1236 int nbbs = loop->num_nodes, factor;
1237 int innerloop_iters, i;
1239 DUMP_VECT_SCOPE ("vect_compute_single_scalar_iteration_cost");
1241 /* Gather costs for statements in the scalar loop. */
1243 /* FORNOW. */
1244 innerloop_iters = 1;
1245 if (loop->inner)
1246 innerloop_iters = LOOP_VINFO_INNER_LOOP_COST_FACTOR (loop_vinfo);
1248 for (i = 0; i < nbbs; i++)
1250 gimple_stmt_iterator si;
1251 basic_block bb = bbs[i];
1253 if (bb->loop_father == loop->inner)
1254 factor = innerloop_iters;
1255 else
1256 factor = 1;
1258 for (si = gsi_start_bb (bb); !gsi_end_p (si); gsi_next (&si))
1260 gimple *stmt = gsi_stmt (si);
1261 stmt_vec_info stmt_info = loop_vinfo->lookup_stmt (stmt);
1263 if (!is_gimple_assign (stmt) && !is_gimple_call (stmt))
1264 continue;
1266 /* Skip stmts that are not vectorized inside the loop. */
1267 stmt_vec_info vstmt_info = vect_stmt_to_vectorize (stmt_info);
1268 if (!STMT_VINFO_RELEVANT_P (vstmt_info)
1269 && (!STMT_VINFO_LIVE_P (vstmt_info)
1270 || !VECTORIZABLE_CYCLE_DEF
1271 (STMT_VINFO_DEF_TYPE (vstmt_info))))
1272 continue;
1274 vect_cost_for_stmt kind;
1275 if (STMT_VINFO_DATA_REF (stmt_info))
1277 if (DR_IS_READ (STMT_VINFO_DATA_REF (stmt_info)))
1278 kind = scalar_load;
1279 else
1280 kind = scalar_store;
1282 else if (vect_nop_conversion_p (stmt_info))
1283 continue;
1284 else
1285 kind = scalar_stmt;
1287 /* We are using vect_prologue here to avoid scaling twice
1288 by the inner loop factor. */
1289 record_stmt_cost (&LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo),
1290 factor, kind, stmt_info, 0, vect_prologue);
1294 /* Now accumulate cost. */
1295 void *target_cost_data = init_cost (loop, true);
1296 stmt_info_for_cost *si;
1297 int j;
1298 FOR_EACH_VEC_ELT (LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo),
1299 j, si)
1300 (void) add_stmt_cost (loop_vinfo, target_cost_data, si->count,
1301 si->kind, si->stmt_info, si->vectype,
1302 si->misalign, si->where);
1303 unsigned prologue_cost = 0, body_cost = 0, epilogue_cost = 0;
1304 finish_cost (target_cost_data, &prologue_cost, &body_cost,
1305 &epilogue_cost);
1306 destroy_cost_data (target_cost_data);
1307 LOOP_VINFO_SINGLE_SCALAR_ITERATION_COST (loop_vinfo)
1308 = prologue_cost + body_cost + epilogue_cost;
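/* Illustrative arithmetic for the scaling above, under the assumption
   that every statement has unit cost: with an inner-loop cost factor
   of 50, three outer-loop statements and two inner-loop statements
   give 3 * 1 + 2 * 50 = 103 as the single scalar iteration cost.  The
   helper is a sketch, not the cost model itself.  */

static inline unsigned
example_scalar_iteration_cost (unsigned outer_stmts, unsigned inner_stmts,
                               unsigned inner_loop_cost_factor)
{
  return outer_stmts + inner_stmts * inner_loop_cost_factor;
}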
1312 /* Function vect_analyze_loop_form_1.
1314 Verify that certain CFG restrictions hold, including:
1315 - the loop has a pre-header
1316 - the loop has a single entry and exit
1317 - the loop exit condition is simple enough
1318 - the number of iterations can be analyzed, i.e., a countable loop. The
1319 niter could be analyzed under some assumptions. */
1321 opt_result
1322 vect_analyze_loop_form_1 (class loop *loop, gcond **loop_cond,
1323 tree *assumptions, tree *number_of_iterationsm1,
1324 tree *number_of_iterations, gcond **inner_loop_cond)
1326 DUMP_VECT_SCOPE ("vect_analyze_loop_form");
1328 /* Different restrictions apply when we are considering an inner-most loop,
1329 vs. an outer (nested) loop.
1330 (FORNOW. May want to relax some of these restrictions in the future). */
1332 if (!loop->inner)
1334 /* Inner-most loop. We currently require that the number of BBs is
1335 exactly 2 (the header and latch). Vectorizable inner-most loops
1336 look like this:
1338 (pre-header)
1340 header <--------+
1341 | | |
1342 | +--> latch --+
1344 (exit-bb) */
1346 if (loop->num_nodes != 2)
1347 return opt_result::failure_at (vect_location,
1348 "not vectorized:"
1349 " control flow in loop.\n");
1351 if (empty_block_p (loop->header))
1352 return opt_result::failure_at (vect_location,
1353 "not vectorized: empty loop.\n");
1355 else
1357 class loop *innerloop = loop->inner;
1358 edge entryedge;
1360 /* Nested loop. We currently require that the loop is doubly-nested,
1361 contains a single inner loop, and the number of BBs is exactly 5.
1362 Vectorizable outer-loops look like this:
1364 (pre-header)
1366 header <---+
1368 inner-loop |
1370 tail ------+
1372 (exit-bb)
1374 The inner-loop has the properties expected of inner-most loops
1375 as described above. */
1377 if ((loop->inner)->inner || (loop->inner)->next)
1378 return opt_result::failure_at (vect_location,
1379 "not vectorized:"
1380 " multiple nested loops.\n");
1382 if (loop->num_nodes != 5)
1383 return opt_result::failure_at (vect_location,
1384 "not vectorized:"
1385 " control flow in loop.\n");
1387 entryedge = loop_preheader_edge (innerloop);
1388 if (entryedge->src != loop->header
1389 || !single_exit (innerloop)
1390 || single_exit (innerloop)->dest != EDGE_PRED (loop->latch, 0)->src)
1391 return opt_result::failure_at (vect_location,
1392 "not vectorized:"
1393 " unsupported outerloop form.\n");
1395 /* Analyze the inner-loop. */
1396 tree inner_niterm1, inner_niter, inner_assumptions;
1397 opt_result res
1398 = vect_analyze_loop_form_1 (loop->inner, inner_loop_cond,
1399 &inner_assumptions, &inner_niterm1,
1400 &inner_niter, NULL);
1401 if (!res)
1403 if (dump_enabled_p ())
1404 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1405 "not vectorized: Bad inner loop.\n");
1406 return res;
1409 /* Don't support analyzing niter under assumptions for inner
1410 loop. */
1411 if (!integer_onep (inner_assumptions))
1412 return opt_result::failure_at (vect_location,
1413 "not vectorized: Bad inner loop.\n");
1415 if (!expr_invariant_in_loop_p (loop, inner_niter))
1416 return opt_result::failure_at (vect_location,
1417 "not vectorized: inner-loop count not"
1418 " invariant.\n");
1420 if (dump_enabled_p ())
1421 dump_printf_loc (MSG_NOTE, vect_location,
1422 "Considering outer-loop vectorization.\n");
1425 if (!single_exit (loop))
1426 return opt_result::failure_at (vect_location,
1427 "not vectorized: multiple exits.\n");
1428 if (EDGE_COUNT (loop->header->preds) != 2)
1429 return opt_result::failure_at (vect_location,
1430 "not vectorized:"
1431 " too many incoming edges.\n");
1433 /* We assume that the loop exit condition is at the end of the loop, i.e.,
1434 that the loop is represented as a do-while (with a proper if-guard
1435 before the loop if needed), where the loop header contains all the
1436 executable statements, and the latch is empty. */
1437 if (!empty_block_p (loop->latch)
1438 || !gimple_seq_empty_p (phi_nodes (loop->latch)))
1439 return opt_result::failure_at (vect_location,
1440 "not vectorized: latch block not empty.\n");
1442 /* Make sure the exit is not abnormal. */
1443 edge e = single_exit (loop);
1444 if (e->flags & EDGE_ABNORMAL)
1445 return opt_result::failure_at (vect_location,
1446 "not vectorized:"
1447 " abnormal loop exit edge.\n");
1449 *loop_cond = vect_get_loop_niters (loop, assumptions, number_of_iterations,
1450 number_of_iterationsm1);
1451 if (!*loop_cond)
1452 return opt_result::failure_at
1453 (vect_location,
1454 "not vectorized: complicated exit condition.\n");
1456 if (integer_zerop (*assumptions)
1457 || !*number_of_iterations
1458 || chrec_contains_undetermined (*number_of_iterations))
1459 return opt_result::failure_at
1460 (*loop_cond,
1461 "not vectorized: number of iterations cannot be computed.\n");
1463 if (integer_zerop (*number_of_iterations))
1464 return opt_result::failure_at
1465 (*loop_cond,
1466 "not vectorized: number of iterations = 0.\n");
1468 return opt_result::success ();
1471 /* Analyze LOOP form and return a loop_vec_info if it is of suitable form. */
1473 opt_loop_vec_info
1474 vect_analyze_loop_form (class loop *loop, vec_info_shared *shared)
1476 tree assumptions, number_of_iterations, number_of_iterationsm1;
1477 gcond *loop_cond, *inner_loop_cond = NULL;
1479 opt_result res
1480 = vect_analyze_loop_form_1 (loop, &loop_cond,
1481 &assumptions, &number_of_iterationsm1,
1482 &number_of_iterations, &inner_loop_cond);
1483 if (!res)
1484 return opt_loop_vec_info::propagate_failure (res);
1486 loop_vec_info loop_vinfo = new _loop_vec_info (loop, shared);
1487 LOOP_VINFO_NITERSM1 (loop_vinfo) = number_of_iterationsm1;
1488 LOOP_VINFO_NITERS (loop_vinfo) = number_of_iterations;
1489 LOOP_VINFO_NITERS_UNCHANGED (loop_vinfo) = number_of_iterations;
1490 if (!integer_onep (assumptions))
1492 /* We consider to vectorize this loop by versioning it under
1493 some assumptions. In order to do this, we need to clear
1494 existing information computed by scev and niter analyzer. */
1495 scev_reset_htab ();
1496 free_numbers_of_iterations_estimates (loop);
1497 /* Also set flag for this loop so that following scev and niter
1498 analysis are done under the assumptions. */
1499 loop_constraint_set (loop, LOOP_C_FINITE);
1500 /* Also record the assumptions for versioning. */
1501 LOOP_VINFO_NITERS_ASSUMPTIONS (loop_vinfo) = assumptions;
1504 if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
1506 if (dump_enabled_p ())
1508 dump_printf_loc (MSG_NOTE, vect_location,
1509 "Symbolic number of iterations is ");
1510 dump_generic_expr (MSG_NOTE, TDF_DETAILS, number_of_iterations);
1511 dump_printf (MSG_NOTE, "\n");
1515 stmt_vec_info loop_cond_info = loop_vinfo->lookup_stmt (loop_cond);
1516 STMT_VINFO_TYPE (loop_cond_info) = loop_exit_ctrl_vec_info_type;
1517 if (inner_loop_cond)
1519 stmt_vec_info inner_loop_cond_info
1520 = loop_vinfo->lookup_stmt (inner_loop_cond);
1521 STMT_VINFO_TYPE (inner_loop_cond_info) = loop_exit_ctrl_vec_info_type;
1522 /* If we have an estimate on the number of iterations of the inner
1523 loop use that to limit the scale for costing, otherwise use
1524 --param vect-inner-loop-cost-factor literally. */
1525 widest_int nit;
1526 if (estimated_stmt_executions (loop->inner, &nit))
1527 LOOP_VINFO_INNER_LOOP_COST_FACTOR (loop_vinfo)
1528 = wi::smin (nit, param_vect_inner_loop_cost_factor).to_uhwi ();
1531 gcc_assert (!loop->aux);
1532 loop->aux = loop_vinfo;
1533 return opt_loop_vec_info::success (loop_vinfo);
1538 /* Scan the loop stmts and dependent on whether there are any (non-)SLP
1539 statements update the vectorization factor. */
1541 static void
1542 vect_update_vf_for_slp (loop_vec_info loop_vinfo)
1544 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1545 basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
1546 int nbbs = loop->num_nodes;
1547 poly_uint64 vectorization_factor;
1548 int i;
1550 DUMP_VECT_SCOPE ("vect_update_vf_for_slp");
1552 vectorization_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
1553 gcc_assert (known_ne (vectorization_factor, 0U));
1555 /* If all the stmts in the loop can be SLPed, we perform only SLP, and
1556 the vectorization factor of the loop is the unrolling factor required by
1557 the SLP instances.  If that unrolling factor is 1, we say that we
1558 perform pure SLP on the loop - cross-iteration parallelism is not
1559 exploited. */
1560 bool only_slp_in_loop = true;
1561 for (i = 0; i < nbbs; i++)
1563 basic_block bb = bbs[i];
1564 for (gphi_iterator si = gsi_start_phis (bb); !gsi_end_p (si);
1565 gsi_next (&si))
1567 stmt_vec_info stmt_info = loop_vinfo->lookup_stmt (si.phi ());
1568 if (!stmt_info)
1569 continue;
1570 if ((STMT_VINFO_RELEVANT_P (stmt_info)
1571 || VECTORIZABLE_CYCLE_DEF (STMT_VINFO_DEF_TYPE (stmt_info)))
1572 && !PURE_SLP_STMT (stmt_info))
1573 /* STMT needs both SLP and loop-based vectorization. */
1574 only_slp_in_loop = false;
1576 for (gimple_stmt_iterator si = gsi_start_bb (bb); !gsi_end_p (si);
1577 gsi_next (&si))
1579 if (is_gimple_debug (gsi_stmt (si)))
1580 continue;
1581 stmt_vec_info stmt_info = loop_vinfo->lookup_stmt (gsi_stmt (si));
1582 stmt_info = vect_stmt_to_vectorize (stmt_info);
1583 if ((STMT_VINFO_RELEVANT_P (stmt_info)
1584 || VECTORIZABLE_CYCLE_DEF (STMT_VINFO_DEF_TYPE (stmt_info)))
1585 && !PURE_SLP_STMT (stmt_info))
1586 /* STMT needs both SLP and loop-based vectorization. */
1587 only_slp_in_loop = false;
1591 if (only_slp_in_loop)
1593 if (dump_enabled_p ())
1594 dump_printf_loc (MSG_NOTE, vect_location,
1595 "Loop contains only SLP stmts\n");
1596 vectorization_factor = LOOP_VINFO_SLP_UNROLLING_FACTOR (loop_vinfo);
1598 else
1600 if (dump_enabled_p ())
1601 dump_printf_loc (MSG_NOTE, vect_location,
1602 "Loop contains SLP and non-SLP stmts\n");
1603 /* Both the vectorization factor and unroll factor have the form
1604 GET_MODE_SIZE (loop_vinfo->vector_mode) * X for some rational X,
1605 so they must have a common multiple. */
1606 vectorization_factor
1607 = force_common_multiple (vectorization_factor,
1608 LOOP_VINFO_SLP_UNROLLING_FACTOR (loop_vinfo));
1611 LOOP_VINFO_VECT_FACTOR (loop_vinfo) = vectorization_factor;
1612 if (dump_enabled_p ())
1614 dump_printf_loc (MSG_NOTE, vect_location,
1615 "Updating vectorization factor to ");
1616 dump_dec (MSG_NOTE, vectorization_factor);
1617 dump_printf (MSG_NOTE, ".\n");
1621 /* Return true if STMT_INFO describes a double reduction phi and if
1622 the other phi in the reduction is also relevant for vectorization.
1623 This rejects cases such as:
1625 outer1:
1626 x_1 = PHI <x_3(outer2), ...>;
1629 inner:
1630 x_2 = ...;
1633 outer2:
1634 x_3 = PHI <x_2(inner)>;
1636 if nothing in x_2 or elsewhere makes x_1 relevant. */
1638 static bool
1639 vect_active_double_reduction_p (stmt_vec_info stmt_info)
1641 if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_double_reduction_def)
1642 return false;
1644 return STMT_VINFO_RELEVANT_P (STMT_VINFO_REDUC_DEF (stmt_info));
1647 /* Function vect_analyze_loop_operations.
1649 Scan the loop stmts and make sure they are all vectorizable. */
1651 static opt_result
1652 vect_analyze_loop_operations (loop_vec_info loop_vinfo)
1654 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1655 basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
1656 int nbbs = loop->num_nodes;
1657 int i;
1658 stmt_vec_info stmt_info;
1659 bool need_to_vectorize = false;
1660 bool ok;
1662 DUMP_VECT_SCOPE ("vect_analyze_loop_operations");
1664 auto_vec<stmt_info_for_cost> cost_vec;
1666 for (i = 0; i < nbbs; i++)
1668 basic_block bb = bbs[i];
1670 for (gphi_iterator si = gsi_start_phis (bb); !gsi_end_p (si);
1671 gsi_next (&si))
1673 gphi *phi = si.phi ();
1674 ok = true;
1676 stmt_info = loop_vinfo->lookup_stmt (phi);
1677 if (dump_enabled_p ())
1678 dump_printf_loc (MSG_NOTE, vect_location, "examining phi: %G", phi);
1679 if (virtual_operand_p (gimple_phi_result (phi)))
1680 continue;
1682 /* Inner-loop loop-closed exit phi in outer-loop vectorization
1683 (i.e., a phi in the tail of the outer-loop). */
1684 if (! is_loop_header_bb_p (bb))
1686 /* FORNOW: we currently don't support the case that these phis
1687 are not used in the outer loop (unless it is a double reduction,
1688 i.e., this phi is vect_reduction_def), because this case
1689 would require us to actually do something here. */
1690 if (STMT_VINFO_LIVE_P (stmt_info)
1691 && !vect_active_double_reduction_p (stmt_info))
1692 return opt_result::failure_at (phi,
1693 "Unsupported loop-closed phi"
1694 " in outer-loop.\n");
1696 /* If PHI is used in the outer loop, we check that its operand
1697 is defined in the inner loop. */
1698 if (STMT_VINFO_RELEVANT_P (stmt_info))
1700 tree phi_op;
1702 if (gimple_phi_num_args (phi) != 1)
1703 return opt_result::failure_at (phi, "unsupported phi");
1705 phi_op = PHI_ARG_DEF (phi, 0);
1706 stmt_vec_info op_def_info = loop_vinfo->lookup_def (phi_op);
1707 if (!op_def_info)
1708 return opt_result::failure_at (phi, "unsupported phi\n");
1710 if (STMT_VINFO_RELEVANT (op_def_info) != vect_used_in_outer
1711 && (STMT_VINFO_RELEVANT (op_def_info)
1712 != vect_used_in_outer_by_reduction))
1713 return opt_result::failure_at (phi, "unsupported phi\n");
1715 if ((STMT_VINFO_DEF_TYPE (stmt_info) == vect_internal_def
1716 || (STMT_VINFO_DEF_TYPE (stmt_info)
1717 == vect_double_reduction_def))
1718 && !vectorizable_lc_phi (loop_vinfo,
1719 stmt_info, NULL, NULL))
1720 return opt_result::failure_at (phi, "unsupported phi\n");
1723 continue;
1726 gcc_assert (stmt_info);
1728 if ((STMT_VINFO_RELEVANT (stmt_info) == vect_used_in_scope
1729 || STMT_VINFO_LIVE_P (stmt_info))
1730 && STMT_VINFO_DEF_TYPE (stmt_info) != vect_induction_def)
1731 /* A scalar-dependence cycle that we don't support. */
1732 return opt_result::failure_at (phi,
1733 "not vectorized:"
1734 " scalar dependence cycle.\n");
1736 if (STMT_VINFO_RELEVANT_P (stmt_info))
1738 need_to_vectorize = true;
1739 if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_induction_def
1740 && ! PURE_SLP_STMT (stmt_info))
1741 ok = vectorizable_induction (loop_vinfo,
1742 stmt_info, NULL, NULL,
1743 &cost_vec);
1744 else if ((STMT_VINFO_DEF_TYPE (stmt_info) == vect_reduction_def
1745 || (STMT_VINFO_DEF_TYPE (stmt_info)
1746 == vect_double_reduction_def)
1747 || STMT_VINFO_DEF_TYPE (stmt_info) == vect_nested_cycle)
1748 && ! PURE_SLP_STMT (stmt_info))
1749 ok = vectorizable_reduction (loop_vinfo,
1750 stmt_info, NULL, NULL, &cost_vec);
1753 /* SLP PHIs are tested by vect_slp_analyze_node_operations. */
1754 if (ok
1755 && STMT_VINFO_LIVE_P (stmt_info)
1756 && !PURE_SLP_STMT (stmt_info))
1757 ok = vectorizable_live_operation (loop_vinfo,
1758 stmt_info, NULL, NULL, NULL,
1759 -1, false, &cost_vec);
1761 if (!ok)
1762 return opt_result::failure_at (phi,
1763 "not vectorized: relevant phi not "
1764 "supported: %G",
1765 static_cast <gimple *> (phi));
1768 for (gimple_stmt_iterator si = gsi_start_bb (bb); !gsi_end_p (si);
1769 gsi_next (&si))
1771 gimple *stmt = gsi_stmt (si);
1772 if (!gimple_clobber_p (stmt)
1773 && !is_gimple_debug (stmt))
1775 opt_result res
1776 = vect_analyze_stmt (loop_vinfo,
1777 loop_vinfo->lookup_stmt (stmt),
1778 &need_to_vectorize,
1779 NULL, NULL, &cost_vec);
1780 if (!res)
1781 return res;
1784 } /* bbs */
1786 add_stmt_costs (loop_vinfo, loop_vinfo->target_cost_data, &cost_vec);
1788 /* All operations in the loop are either irrelevant (deal with loop
1789 control, or dead), or only used outside the loop and can be moved
1790 out of the loop (e.g. invariants, inductions). The loop can be
1791 optimized away by scalar optimizations. We're better off not
1792 touching this loop. */
1793 if (!need_to_vectorize)
1795 if (dump_enabled_p ())
1796 dump_printf_loc (MSG_NOTE, vect_location,
1797 "All the computation can be taken out of the loop.\n");
1798 return opt_result::failure_at
1799 (vect_location,
1800 "not vectorized: redundant loop. no profit to vectorize.\n");
1803 return opt_result::success ();
1806 /* Return true if we know that the iteration count is smaller than the
1807 vectorization factor. Return false if it isn't, or if we can't be sure
1808 either way. */
1810 static bool
1811 vect_known_niters_smaller_than_vf (loop_vec_info loop_vinfo)
1813 unsigned int assumed_vf = vect_vf_for_cost (loop_vinfo);
1815 HOST_WIDE_INT max_niter;
1816 if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
1817 max_niter = LOOP_VINFO_INT_NITERS (loop_vinfo);
1818 else
1819 max_niter = max_stmt_executions_int (LOOP_VINFO_LOOP (loop_vinfo));
1821 if (max_niter != -1 && (unsigned HOST_WIDE_INT) max_niter < assumed_vf)
1822 return true;
1824 return false;
1827 /* Analyze the cost of the loop described by LOOP_VINFO. Decide if it
1828 is worthwhile to vectorize. Return 1 if definitely yes, 0 if
1829 definitely no, or -1 if it's worth retrying. */
1831 static int
1832 vect_analyze_loop_costing (loop_vec_info loop_vinfo)
1834 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1835 unsigned int assumed_vf = vect_vf_for_cost (loop_vinfo);
1837 /* Only loops that can handle partially-populated vectors can have iteration
1838 counts less than the vectorization factor. */
1839 if (!LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))
1841 if (vect_known_niters_smaller_than_vf (loop_vinfo))
1843 if (dump_enabled_p ())
1844 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1845 "not vectorized: iteration count smaller than "
1846 "vectorization factor.\n");
1847 return 0;
1851 /* If using the "very cheap" model, reject cases in which we'd keep
1852 a copy of the scalar code (even if we might be able to vectorize it). */
1853 if (flag_vect_cost_model == VECT_COST_MODEL_VERY_CHEAP
1854 && (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo)
1855 || LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo)
1856 || LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo)))
1858 if (dump_enabled_p ())
1859 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1860 "some scalar iterations would need to be peeled\n");
1861 return 0;
1864 int min_profitable_iters, min_profitable_estimate;
1865 vect_estimate_min_profitable_iters (loop_vinfo, &min_profitable_iters,
1866 &min_profitable_estimate);
1868 if (min_profitable_iters < 0)
1870 if (dump_enabled_p ())
1871 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1872 "not vectorized: vectorization not profitable.\n");
1873 if (dump_enabled_p ())
1874 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1875 "not vectorized: vector version will never be "
1876 "profitable.\n");
1877 return -1;
1880 int min_scalar_loop_bound = (param_min_vect_loop_bound
1881 * assumed_vf);
1883 /* Use the cost model only if it is more conservative than user specified
1884 threshold. */
1885 unsigned int th = (unsigned) MAX (min_scalar_loop_bound,
1886 min_profitable_iters);
1888 LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo) = th;
1890 if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
1891 && LOOP_VINFO_INT_NITERS (loop_vinfo) < th)
1893 if (dump_enabled_p ())
1894 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1895 "not vectorized: vectorization not profitable.\n");
1896 if (dump_enabled_p ())
1897 dump_printf_loc (MSG_NOTE, vect_location,
1898 "not vectorized: iteration count smaller than user "
1899 "specified loop bound parameter or minimum profitable "
1900 "iterations (whichever is more conservative).\n");
1901 return 0;
1904 /* The static profitability threshold min_profitable_estimate includes
1905 the cost of having to check at runtime whether the scalar loop
1906 should be used instead. If it turns out that we don't need or want
1907 such a check, the threshold we should use for the static estimate
1908 is simply the point at which the vector loop becomes more profitable
1909 than the scalar loop. */
1910 if (min_profitable_estimate > min_profitable_iters
1911 && !LOOP_REQUIRES_VERSIONING (loop_vinfo)
1912 && !LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo)
1913 && !LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo)
1914 && !vect_apply_runtime_profitability_check_p (loop_vinfo))
1916 if (dump_enabled_p ())
1917 dump_printf_loc (MSG_NOTE, vect_location, "no need for a runtime"
1918 " choice between the scalar and vector loops\n");
1919 min_profitable_estimate = min_profitable_iters;
1922 /* If the vector loop needs multiple iterations to be beneficial then
1923 things are probably too close to call, and the conservative thing
1924 would be to stick with the scalar code. */
1925 if (flag_vect_cost_model == VECT_COST_MODEL_VERY_CHEAP
1926 && min_profitable_estimate > (int) vect_vf_for_cost (loop_vinfo))
1928 if (dump_enabled_p ())
1929 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1930 "one iteration of the vector loop would be"
1931 " more expensive than the equivalent number of"
1932 " iterations of the scalar loop\n");
1933 return 0;
1936 HOST_WIDE_INT estimated_niter;
1938 /* If we are vectorizing an epilogue then we know the maximum number of
1939 scalar iterations it will cover is at least one lower than the
1940 vectorization factor of the main loop. */
1941 if (LOOP_VINFO_EPILOGUE_P (loop_vinfo))
1942 estimated_niter
1943 = vect_vf_for_cost (LOOP_VINFO_ORIG_LOOP_INFO (loop_vinfo)) - 1;
1944 else
1946 estimated_niter = estimated_stmt_executions_int (loop);
1947 if (estimated_niter == -1)
1948 estimated_niter = likely_max_stmt_executions_int (loop);
1950 if (estimated_niter != -1
1951 && ((unsigned HOST_WIDE_INT) estimated_niter
1952 < MAX (th, (unsigned) min_profitable_estimate)))
1954 if (dump_enabled_p ())
1955 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1956 "not vectorized: estimated iteration count too "
1957 "small.\n");
1958 if (dump_enabled_p ())
1959 dump_printf_loc (MSG_NOTE, vect_location,
1960 "not vectorized: estimated iteration count smaller "
1961 "than specified loop bound parameter or minimum "
1962 "profitable iterations (whichever is more "
1963 "conservative).\n");
1964 return -1;
1967 return 1;
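/* Illustrative sketch (not part of the vectorizer): the threshold
   computed above is the more conservative of the user-tunable
   param_min_vect_loop_bound scaled by the assumed VF and the cost
   model's min_profitable_iters.  The standalone helper below mirrors
   that arithmetic with plain integers; the parameter names are
   hypothetical stand-ins for the GCC values.  */

static inline int
example_cost_model_threshold (int min_vect_loop_bound, int assumed_vf,
                              int min_profitable_iters)
{
  int min_scalar_loop_bound = min_vect_loop_bound * assumed_vf;
  return (min_scalar_loop_bound > min_profitable_iters
          ? min_scalar_loop_bound : min_profitable_iters);
}

/* E.g. with min_vect_loop_bound == 0, assumed_vf == 4 and
   min_profitable_iters == 12 the threshold is 12, so a loop known to
   run fewer than 12 iterations is rejected as not profitable.  */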
1970 static opt_result
1971 vect_get_datarefs_in_loop (loop_p loop, basic_block *bbs,
1972 vec<data_reference_p> *datarefs,
1973 unsigned int *n_stmts)
1975 *n_stmts = 0;
1976 for (unsigned i = 0; i < loop->num_nodes; i++)
1977 for (gimple_stmt_iterator gsi = gsi_start_bb (bbs[i]);
1978 !gsi_end_p (gsi); gsi_next (&gsi))
1980 gimple *stmt = gsi_stmt (gsi);
1981 if (is_gimple_debug (stmt))
1982 continue;
1983 ++(*n_stmts);
1984 opt_result res = vect_find_stmt_data_reference (loop, stmt, datarefs,
1985 NULL, 0);
1986 if (!res)
1988 if (is_gimple_call (stmt) && loop->safelen)
1990 tree fndecl = gimple_call_fndecl (stmt), op;
1991 if (fndecl != NULL_TREE)
1993 cgraph_node *node = cgraph_node::get (fndecl);
1994 if (node != NULL && node->simd_clones != NULL)
1996 unsigned int j, n = gimple_call_num_args (stmt);
1997 for (j = 0; j < n; j++)
1999 op = gimple_call_arg (stmt, j);
2000 if (DECL_P (op)
2001 || (REFERENCE_CLASS_P (op)
2002 && get_base_address (op)))
2003 break;
2005 op = gimple_call_lhs (stmt);
2006 /* Ignore #pragma omp declare simd functions
2007 if they don't have data references in the
2008 call stmt itself. */
2009 if (j == n
2010 && !(op
2011 && (DECL_P (op)
2012 || (REFERENCE_CLASS_P (op)
2013 && get_base_address (op)))))
2014 continue;
2018 return res;
2020 /* If dependence analysis will give up due to the limit on the
2021 number of datarefs, stop here and fail fatally. */
2022 if (datarefs->length ()
2023 > (unsigned)param_loop_max_datarefs_for_datadeps)
2024 return opt_result::failure_at (stmt, "exceeded param "
2025 "loop-max-datarefs-for-datadeps\n");
2027 return opt_result::success ();
2030 /* Look for SLP-only access groups and turn each individual access into its own
2031 group. */
2032 static void
2033 vect_dissolve_slp_only_groups (loop_vec_info loop_vinfo)
2035 unsigned int i;
2036 struct data_reference *dr;
2038 DUMP_VECT_SCOPE ("vect_dissolve_slp_only_groups");
2040 vec<data_reference_p> datarefs = LOOP_VINFO_DATAREFS (loop_vinfo);
2041 FOR_EACH_VEC_ELT (datarefs, i, dr)
2043 gcc_assert (DR_REF (dr));
2044 stmt_vec_info stmt_info = loop_vinfo->lookup_stmt (DR_STMT (dr));
2046 /* Check if the access is part of an interleaving chain. */
2047 if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
2049 stmt_vec_info first_element = DR_GROUP_FIRST_ELEMENT (stmt_info);
2050 dr_vec_info *dr_info = STMT_VINFO_DR_INFO (first_element);
2051 unsigned int group_size = DR_GROUP_SIZE (first_element);
2053 /* Check if this is an SLP-only group. */
2054 if (!STMT_SLP_TYPE (stmt_info)
2055 && STMT_VINFO_SLP_VECT_ONLY (first_element))
2057 /* Dissolve the group. */
2058 STMT_VINFO_SLP_VECT_ONLY (first_element) = false;
2060 stmt_vec_info vinfo = first_element;
2061 while (vinfo)
2063 stmt_vec_info next = DR_GROUP_NEXT_ELEMENT (vinfo);
2064 DR_GROUP_FIRST_ELEMENT (vinfo) = vinfo;
2065 DR_GROUP_NEXT_ELEMENT (vinfo) = NULL;
2066 DR_GROUP_SIZE (vinfo) = 1;
2067 if (STMT_VINFO_STRIDED_P (first_element))
2068 DR_GROUP_GAP (vinfo) = 0;
2069 else
2070 DR_GROUP_GAP (vinfo) = group_size - 1;
2071 /* Duplicate and adjust the alignment info; it needs to
2072 be present on each group leader, see dr_misalignment. */
2073 if (vinfo != first_element)
2075 dr_vec_info *dr_info2 = STMT_VINFO_DR_INFO (vinfo);
2076 dr_info2->target_alignment = dr_info->target_alignment;
2077 int misalignment = dr_info->misalignment;
2078 if (misalignment != DR_MISALIGNMENT_UNKNOWN)
2080 HOST_WIDE_INT diff
2081 = (TREE_INT_CST_LOW (DR_INIT (dr_info2->dr))
2082 - TREE_INT_CST_LOW (DR_INIT (dr_info->dr)));
2083 unsigned HOST_WIDE_INT align_c
2084 = dr_info->target_alignment.to_constant ();
2085 misalignment = (misalignment + diff) % align_c;
2087 dr_info2->misalignment = misalignment;
2089 vinfo = next;
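/* Illustrative sketch (not part of the vectorizer): when a group is
   dissolved each element becomes its own group leader and inherits
   alignment info derived from the old leader.  If the old leader is
   misaligned by M bytes with respect to a target alignment of A bytes,
   an element starting DIFF bytes later is misaligned by (M + DIFF) % A,
   which is what the adjustment above computes.  A standalone version
   with hypothetical byte offsets:  */

static inline int
example_adjust_misalignment (int leader_misalignment, int byte_diff,
                             int target_alignment)
{
  return (leader_misalignment + byte_diff) % target_alignment;
}

/* E.g. a leader misaligned by 4 bytes against a 16-byte target
   alignment gives an element 8 bytes further on a misalignment of
   (4 + 8) % 16 == 12.  */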
2096 /* Determine if operating on full vectors for LOOP_VINFO might leave
2097 some scalar iterations still to do. If so, decide how we should
2098 handle those scalar iterations. The possibilities are:
2100 (1) Make LOOP_VINFO operate on partial vectors instead of full vectors.
2101 In this case:
2103 LOOP_VINFO_USING_PARTIAL_VECTORS_P == true
2104 LOOP_VINFO_EPIL_USING_PARTIAL_VECTORS_P == false
2105 LOOP_VINFO_PEELING_FOR_NITER == false
2107 (2) Make LOOP_VINFO operate on full vectors and use an epilogue loop
2108 to handle the remaining scalar iterations. In this case:
2110 LOOP_VINFO_USING_PARTIAL_VECTORS_P == false
2111 LOOP_VINFO_PEELING_FOR_NITER == true
2113 There are two choices:
2115 (2a) Consider vectorizing the epilogue loop at the same VF as the
2116 main loop, but using partial vectors instead of full vectors.
2117 In this case:
2119 LOOP_VINFO_EPIL_USING_PARTIAL_VECTORS_P == true
2121 (2b) Consider vectorizing the epilogue loop at lower VFs only.
2122 In this case:
2124 LOOP_VINFO_EPIL_USING_PARTIAL_VECTORS_P == false
2126 When FOR_EPILOGUE_P is true, make this determination based on the
2127 assumption that LOOP_VINFO is an epilogue loop, otherwise make it
2128 based on the assumption that LOOP_VINFO is the main loop. The caller
2129 has made sure that the number of iterations is set appropriately for
2130 this value of FOR_EPILOGUE_P. */
2132 opt_result
2133 vect_determine_partial_vectors_and_peeling (loop_vec_info loop_vinfo,
2134 bool for_epilogue_p)
2136 /* Determine whether there would be any scalar iterations left over. */
2137 bool need_peeling_or_partial_vectors_p
2138 = vect_need_peeling_or_partial_vectors_p (loop_vinfo);
2140 /* Decide whether to vectorize the loop with partial vectors. */
2141 LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo) = false;
2142 LOOP_VINFO_EPIL_USING_PARTIAL_VECTORS_P (loop_vinfo) = false;
2143 if (LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo)
2144 && need_peeling_or_partial_vectors_p)
2146 /* For partial-vector-usage=1, try to push the handling of partial
2147 vectors to the epilogue, with the main loop continuing to operate
2148 on full vectors.
2150 ??? We could then end up failing to use partial vectors if we
2151 decide to peel iterations into a prologue, and if the main loop
2152 then ends up processing fewer than VF iterations. */
2153 if (param_vect_partial_vector_usage == 1
2154 && !LOOP_VINFO_EPILOGUE_P (loop_vinfo)
2155 && !vect_known_niters_smaller_than_vf (loop_vinfo))
2156 LOOP_VINFO_EPIL_USING_PARTIAL_VECTORS_P (loop_vinfo) = true;
2157 else
2158 LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo) = true;
2161 if (dump_enabled_p ())
2163 if (LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))
2164 dump_printf_loc (MSG_NOTE, vect_location,
2165 "operating on partial vectors%s.\n",
2166 for_epilogue_p ? " for epilogue loop" : "");
2167 else
2168 dump_printf_loc (MSG_NOTE, vect_location,
2169 "operating only on full vectors%s.\n",
2170 for_epilogue_p ? " for epilogue loop" : "");
2173 if (for_epilogue_p)
2175 loop_vec_info orig_loop_vinfo = LOOP_VINFO_ORIG_LOOP_INFO (loop_vinfo);
2176 gcc_assert (orig_loop_vinfo);
2177 if (!LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))
2178 gcc_assert (known_lt (LOOP_VINFO_VECT_FACTOR (loop_vinfo),
2179 LOOP_VINFO_VECT_FACTOR (orig_loop_vinfo)));
2182 if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
2183 && !LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))
2185 /* Check that the loop processes at least one full vector. */
2186 poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
2187 tree scalar_niters = LOOP_VINFO_NITERS (loop_vinfo);
2188 if (known_lt (wi::to_widest (scalar_niters), vf))
2189 return opt_result::failure_at (vect_location,
2190 "loop does not have enough iterations"
2191 " to support vectorization.\n");
2193 /* If we need to peel an extra epilogue iteration to handle data
2194 accesses with gaps, check that there are enough scalar iterations
2195 available.
2197 The check above is redundant with this one when peeling for gaps,
2198 but the distinction is useful for diagnostics. */
2199 tree scalar_nitersm1 = LOOP_VINFO_NITERSM1 (loop_vinfo);
2200 if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo)
2201 && known_lt (wi::to_widest (scalar_nitersm1), vf))
2202 return opt_result::failure_at (vect_location,
2203 "loop does not have enough iterations"
2204 " to support peeling for gaps.\n");
2207 LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo)
2208 = (!LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo)
2209 && need_peeling_or_partial_vectors_p);
2211 return opt_result::success ();
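/* Illustrative sketch (not part of the vectorizer): ignoring the
   epilogue-loop and known-niters conditions checked above, the choice
   between the cases (1), (2a) and (2b) described before the function
   boils down to a small decision over three booleans.  All names below
   are hypothetical stand-ins for the LOOP_VINFO_* flags.  */

enum example_tail_strategy
{
  EXAMPLE_FULL_VECTORS_ONLY,         /* no leftover scalar iterations */
  EXAMPLE_PARTIAL_VECTORS,           /* case (1) */
  EXAMPLE_EPILOGUE_PARTIAL_VECTORS,  /* case (2a) */
  EXAMPLE_EPILOGUE_LOWER_VF          /* case (2b) */
};

static inline enum example_tail_strategy
example_choose_tail_strategy (int can_use_partial_vectors_p,
                              int need_peeling_or_partial_vectors_p,
                              int push_partial_vectors_to_epilogue_p)
{
  if (!need_peeling_or_partial_vectors_p)
    return EXAMPLE_FULL_VECTORS_ONLY;
  if (!can_use_partial_vectors_p)
    return EXAMPLE_EPILOGUE_LOWER_VF;
  if (push_partial_vectors_to_epilogue_p)
    return EXAMPLE_EPILOGUE_PARTIAL_VECTORS;
  return EXAMPLE_PARTIAL_VECTORS;
}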
2214 /* Function vect_analyze_loop_2.
2216 Apply a set of analyses on LOOP, and create a loop_vec_info struct
2217 for it. The different analyses will record information in the
2218 loop_vec_info struct. */
2219 static opt_result
2220 vect_analyze_loop_2 (loop_vec_info loop_vinfo, bool &fatal, unsigned *n_stmts)
2222 opt_result ok = opt_result::success ();
2223 int res;
2224 unsigned int max_vf = MAX_VECTORIZATION_FACTOR;
2225 poly_uint64 min_vf = 2;
2226 loop_vec_info orig_loop_vinfo = NULL;
2228 /* If we are dealing with an epilogue then orig_loop_vinfo points to the
2229 loop_vec_info of the first vectorized loop. */
2230 if (LOOP_VINFO_EPILOGUE_P (loop_vinfo))
2231 orig_loop_vinfo = LOOP_VINFO_ORIG_LOOP_INFO (loop_vinfo);
2232 else
2233 orig_loop_vinfo = loop_vinfo;
2234 gcc_assert (orig_loop_vinfo);
2236 /* The first group of checks is independent of the vector size. */
2237 fatal = true;
2239 if (LOOP_VINFO_SIMD_IF_COND (loop_vinfo)
2240 && integer_zerop (LOOP_VINFO_SIMD_IF_COND (loop_vinfo)))
2241 return opt_result::failure_at (vect_location,
2242 "not vectorized: simd if(0)\n");
2244 /* Find all data references in the loop (which correspond to vdefs/vuses)
2245 and analyze their evolution in the loop. */
2247 loop_p loop = LOOP_VINFO_LOOP (loop_vinfo);
2249 /* Gather the data references and count stmts in the loop. */
2250 if (!LOOP_VINFO_DATAREFS (loop_vinfo).exists ())
2252 opt_result res
2253 = vect_get_datarefs_in_loop (loop, LOOP_VINFO_BBS (loop_vinfo),
2254 &LOOP_VINFO_DATAREFS (loop_vinfo),
2255 n_stmts);
2256 if (!res)
2258 if (dump_enabled_p ())
2259 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2260 "not vectorized: loop contains function "
2261 "calls or data references that cannot "
2262 "be analyzed\n");
2263 return res;
2265 loop_vinfo->shared->save_datarefs ();
2267 else
2268 loop_vinfo->shared->check_datarefs ();
2270 /* Analyze the data references and also adjust the minimal
2271 vectorization factor according to the loads and stores. */
2273 ok = vect_analyze_data_refs (loop_vinfo, &min_vf, &fatal);
2274 if (!ok)
2276 if (dump_enabled_p ())
2277 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2278 "bad data references.\n");
2279 return ok;
2282 /* Classify all cross-iteration scalar data-flow cycles.
2283 Cross-iteration cycles caused by virtual phis are analyzed separately. */
2284 vect_analyze_scalar_cycles (loop_vinfo);
2286 vect_pattern_recog (loop_vinfo);
2288 vect_fixup_scalar_cycles_with_patterns (loop_vinfo);
2290 /* Analyze the access patterns of the data-refs in the loop (consecutive,
2291 complex, etc.). FORNOW: Only handle consecutive access patterns. */
2293 ok = vect_analyze_data_ref_accesses (loop_vinfo, NULL);
2294 if (!ok)
2296 if (dump_enabled_p ())
2297 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2298 "bad data access.\n");
2299 return ok;
2302 /* Data-flow analysis to detect stmts that do not need to be vectorized. */
2304 ok = vect_mark_stmts_to_be_vectorized (loop_vinfo, &fatal);
2305 if (!ok)
2307 if (dump_enabled_p ())
2308 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2309 "unexpected pattern.\n");
2310 return ok;
2313 /* The rest of the analysis below depends on the vector size in some way, so failures from here on are no longer fatal. */
2314 fatal = false;
2316 /* Analyze data dependences between the data-refs in the loop
2317 and adjust the maximum vectorization factor according to
2318 the dependences.
2319 FORNOW: fail at the first data dependence that we encounter. */
2321 ok = vect_analyze_data_ref_dependences (loop_vinfo, &max_vf);
2322 if (!ok)
2324 if (dump_enabled_p ())
2325 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2326 "bad data dependence.\n");
2327 return ok;
2329 if (max_vf != MAX_VECTORIZATION_FACTOR
2330 && maybe_lt (max_vf, min_vf))
2331 return opt_result::failure_at (vect_location, "bad data dependence.\n");
2332 LOOP_VINFO_MAX_VECT_FACTOR (loop_vinfo) = max_vf;
2334 ok = vect_determine_vectorization_factor (loop_vinfo);
2335 if (!ok)
2337 if (dump_enabled_p ())
2338 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2339 "can't determine vectorization factor.\n");
2340 return ok;
2342 if (max_vf != MAX_VECTORIZATION_FACTOR
2343 && maybe_lt (max_vf, LOOP_VINFO_VECT_FACTOR (loop_vinfo)))
2344 return opt_result::failure_at (vect_location, "bad data dependence.\n");
2346 /* Compute the scalar iteration cost. */
2347 vect_compute_single_scalar_iteration_cost (loop_vinfo);
2349 poly_uint64 saved_vectorization_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
2351 /* Check the SLP opportunities in the loop, analyze and build SLP trees. */
2352 ok = vect_analyze_slp (loop_vinfo, *n_stmts);
2353 if (!ok)
2354 return ok;
2356 /* If there are any SLP instances mark them as pure_slp. */
2357 bool slp = vect_make_slp_decision (loop_vinfo);
2358 if (slp)
2360 /* Find stmts that need to be both vectorized and SLPed. */
2361 vect_detect_hybrid_slp (loop_vinfo);
2363 /* Update the vectorization factor based on the SLP decision. */
2364 vect_update_vf_for_slp (loop_vinfo);
2366 /* Optimize the SLP graph with the vectorization factor fixed. */
2367 vect_optimize_slp (loop_vinfo);
2369 /* Gather the loads reachable from the SLP graph entries. */
2370 vect_gather_slp_loads (loop_vinfo);
2373 bool saved_can_use_partial_vectors_p
2374 = LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo);
2376 /* We don't expect to have to roll back to anything other than an empty
2377 set of rgroups. */
2378 gcc_assert (LOOP_VINFO_MASKS (loop_vinfo).is_empty ());
2380 /* This is the point where we can re-start analysis with SLP forced off. */
2381 start_over:
2383 /* Now the vectorization factor is final. */
2384 poly_uint64 vectorization_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
2385 gcc_assert (known_ne (vectorization_factor, 0U));
2387 if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo) && dump_enabled_p ())
2389 dump_printf_loc (MSG_NOTE, vect_location,
2390 "vectorization_factor = ");
2391 dump_dec (MSG_NOTE, vectorization_factor);
2392 dump_printf (MSG_NOTE, ", niters = %wd\n",
2393 LOOP_VINFO_INT_NITERS (loop_vinfo));
2396 /* Analyze the alignment of the data-refs in the loop.
2397 Fail if a data reference is found that cannot be vectorized. */
2399 ok = vect_analyze_data_refs_alignment (loop_vinfo);
2400 if (!ok)
2402 if (dump_enabled_p ())
2403 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2404 "bad data alignment.\n");
2405 return ok;
2408 /* Prune the list of ddrs to be tested at run-time by versioning for alias.
2409 It is important to call pruning after vect_analyze_data_ref_accesses,
2410 since we use grouping information gathered by interleaving analysis. */
2411 ok = vect_prune_runtime_alias_test_list (loop_vinfo);
2412 if (!ok)
2413 return ok;
2415 /* Do not invoke vect_enhance_data_refs_alignment for epilogue
2416 vectorization, since we do not want to add extra peeling or
2417 add versioning for alignment. */
2418 if (!LOOP_VINFO_EPILOGUE_P (loop_vinfo))
2419 /* This pass will decide on using loop versioning and/or loop peeling in
2420 order to enhance the alignment of data references in the loop. */
2421 ok = vect_enhance_data_refs_alignment (loop_vinfo);
2422 if (!ok)
2423 return ok;
2425 if (slp)
2427 /* Analyze operations in the SLP instances. Note this may
2428 remove unsupported SLP instances, which makes the above
2429 SLP kind detection invalid. */
2430 unsigned old_size = LOOP_VINFO_SLP_INSTANCES (loop_vinfo).length ();
2431 vect_slp_analyze_operations (loop_vinfo);
2432 if (LOOP_VINFO_SLP_INSTANCES (loop_vinfo).length () != old_size)
2434 ok = opt_result::failure_at (vect_location,
2435 "unsupported SLP instances\n");
2436 goto again;
2439 /* Check whether any load in ALL SLP instances is possibly permuted. */
2440 slp_tree load_node, slp_root;
2441 unsigned i, x;
2442 slp_instance instance;
2443 bool can_use_lanes = true;
2444 FOR_EACH_VEC_ELT (LOOP_VINFO_SLP_INSTANCES (loop_vinfo), x, instance)
2446 slp_root = SLP_INSTANCE_TREE (instance);
2447 int group_size = SLP_TREE_LANES (slp_root);
2448 tree vectype = SLP_TREE_VECTYPE (slp_root);
2449 bool loads_permuted = false;
2450 FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (instance), i, load_node)
2452 if (!SLP_TREE_LOAD_PERMUTATION (load_node).exists ())
2453 continue;
2454 unsigned j;
2455 stmt_vec_info load_info;
2456 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (load_node), j, load_info)
2457 if (SLP_TREE_LOAD_PERMUTATION (load_node)[j] != j)
2459 loads_permuted = true;
2460 break;
2464 /* If the loads and stores can be handled with load/store-lane
2465 instructions record it and move on to the next instance. */
2466 if (loads_permuted
2467 && SLP_INSTANCE_KIND (instance) == slp_inst_kind_store
2468 && vect_store_lanes_supported (vectype, group_size, false))
2470 FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (instance), i, load_node)
2472 stmt_vec_info stmt_vinfo = DR_GROUP_FIRST_ELEMENT
2473 (SLP_TREE_SCALAR_STMTS (load_node)[0]);
2474 /* Use SLP for strided accesses (or if we can't use
2475 load-lanes). */
2476 if (STMT_VINFO_STRIDED_P (stmt_vinfo)
2477 || ! vect_load_lanes_supported
2478 (STMT_VINFO_VECTYPE (stmt_vinfo),
2479 DR_GROUP_SIZE (stmt_vinfo), false))
2480 break;
2483 can_use_lanes
2484 = can_use_lanes && i == SLP_INSTANCE_LOADS (instance).length ();
2486 if (can_use_lanes && dump_enabled_p ())
2487 dump_printf_loc (MSG_NOTE, vect_location,
2488 "SLP instance %p can use load/store-lanes\n",
2489 instance);
2491 else
2493 can_use_lanes = false;
2494 break;
2498 /* If all SLP instances can use load/store-lanes, abort SLP and try again
2499 with SLP disabled. */
2500 if (can_use_lanes)
2502 ok = opt_result::failure_at (vect_location,
2503 "Built SLP cancelled: can use "
2504 "load/store-lanes\n");
2505 if (dump_enabled_p ())
2506 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2507 "Built SLP cancelled: all SLP instances support "
2508 "load/store-lanes\n");
2509 goto again;
2513 /* Dissolve SLP-only groups. */
2514 vect_dissolve_slp_only_groups (loop_vinfo);
2516 /* Scan all the remaining operations in the loop that are not subject
2517 to SLP and make sure they are vectorizable. */
2518 ok = vect_analyze_loop_operations (loop_vinfo);
2519 if (!ok)
2521 if (dump_enabled_p ())
2522 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2523 "bad operation or unsupported loop bound.\n");
2524 return ok;
2527 /* For now, we don't expect to mix both masking and length approaches for one
2528 loop; disable it if both are recorded. */
2529 if (LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo)
2530 && !LOOP_VINFO_MASKS (loop_vinfo).is_empty ()
2531 && !LOOP_VINFO_LENS (loop_vinfo).is_empty ())
2533 if (dump_enabled_p ())
2534 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2535 "can't vectorize a loop with partial vectors"
2536 " because we don't expect to mix different"
2537 " approaches with partial vectors for the"
2538 " same loop.\n");
2539 LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo) = false;
2542 /* If we still have the option of using partial vectors,
2543 check whether we can generate the necessary loop controls. */
2544 if (LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo)
2545 && !vect_verify_full_masking (loop_vinfo)
2546 && !vect_verify_loop_lens (loop_vinfo))
2547 LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo) = false;
2549 /* If we're vectorizing an epilogue loop, the vectorized loop either needs
2550 to be able to handle fewer than VF scalars, or needs to have a lower VF
2551 than the main loop. */
2552 if (LOOP_VINFO_EPILOGUE_P (loop_vinfo)
2553 && !LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo)
2554 && maybe_ge (LOOP_VINFO_VECT_FACTOR (loop_vinfo),
2555 LOOP_VINFO_VECT_FACTOR (orig_loop_vinfo)))
2556 return opt_result::failure_at (vect_location,
2557 "Vectorization factor too high for"
2558 " epilogue loop.\n");
2560 /* Decide whether this loop_vinfo should use partial vectors or peeling,
2561 assuming that the loop will be used as a main loop. We will redo
2562 this analysis later if we instead decide to use the loop as an
2563 epilogue loop. */
2564 ok = vect_determine_partial_vectors_and_peeling (loop_vinfo, false);
2565 if (!ok)
2566 return ok;
2568 /* Check the costings of the loop make vectorizing worthwhile. */
2569 res = vect_analyze_loop_costing (loop_vinfo);
2570 if (res < 0)
2572 ok = opt_result::failure_at (vect_location,
2573 "Loop costings may not be worthwhile.\n");
2574 goto again;
2576 if (!res)
2577 return opt_result::failure_at (vect_location,
2578 "Loop costings not worthwhile.\n");
2580 /* If an epilogue loop is required make sure we can create one. */
2581 if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo)
2582 || LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo))
2584 if (dump_enabled_p ())
2585 dump_printf_loc (MSG_NOTE, vect_location, "epilog loop required\n");
2586 if (!vect_can_advance_ivs_p (loop_vinfo)
2587 || !slpeel_can_duplicate_loop_p (LOOP_VINFO_LOOP (loop_vinfo),
2588 single_exit (LOOP_VINFO_LOOP
2589 (loop_vinfo))))
2591 ok = opt_result::failure_at (vect_location,
2592 "not vectorized: can't create required "
2593 "epilog loop\n");
2594 goto again;
2598 /* During peeling, we need to check if the number of loop iterations is
2599 enough for both the peeled prolog loop and the vector loop. This check
2600 can be merged with the threshold check of loop versioning, so
2601 increase the threshold for this case if necessary.
2603 If we are analyzing an epilogue we still want to check what its
2604 versioning threshold would be. If we decide to vectorize the epilogues we
2605 will want to use the lowest versioning threshold of all epilogues and main
2606 loop. This will enable us to enter a vectorized epilogue even when
2607 versioning the loop. We can't simply check whether the epilogue requires
2608 versioning though since we may have skipped some versioning checks when
2609 analyzing the epilogue. For instance, checks for alias versioning will be
2610 skipped when dealing with epilogues as we assume we already checked them
2611 for the main loop. So instead we always check the 'orig_loop_vinfo'. */
2612 if (LOOP_REQUIRES_VERSIONING (orig_loop_vinfo))
2614 poly_uint64 niters_th = 0;
2615 unsigned int th = LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo);
2617 if (!vect_use_loop_mask_for_alignment_p (loop_vinfo))
2619 /* Niters for peeled prolog loop. */
2620 if (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) < 0)
2622 dr_vec_info *dr_info = LOOP_VINFO_UNALIGNED_DR (loop_vinfo);
2623 tree vectype = STMT_VINFO_VECTYPE (dr_info->stmt);
2624 niters_th += TYPE_VECTOR_SUBPARTS (vectype) - 1;
2626 else
2627 niters_th += LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo);
2630 /* Niters for at least one iteration of vectorized loop. */
2631 if (!LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))
2632 niters_th += LOOP_VINFO_VECT_FACTOR (loop_vinfo);
2633 /* One additional iteration because of peeling for gaps. */
2634 if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo))
2635 niters_th += 1;
2637 /* Use the same condition as vect_transform_loop to decide when to use
2638 the cost to determine a versioning threshold. */
2639 if (vect_apply_runtime_profitability_check_p (loop_vinfo)
2640 && ordered_p (th, niters_th))
2641 niters_th = ordered_max (poly_uint64 (th), niters_th);
2643 LOOP_VINFO_VERSIONING_THRESHOLD (loop_vinfo) = niters_th;
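/* Worked example with hypothetical numbers: with VF = 4, an unknown
   peeling amount for a four-element vector type (contributing
   TYPE_VECTOR_SUBPARTS - 1 = 3 iterations), a full-vector main loop
   (contributing VF = 4) and peeling for gaps (contributing 1), the
   versioning threshold starts at 3 + 4 + 1 = 8 and is then raised to
   the cost-model threshold TH if the runtime profitability check
   applies and TH is larger.  */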
2646 gcc_assert (known_eq (vectorization_factor,
2647 LOOP_VINFO_VECT_FACTOR (loop_vinfo)));
2649 /* Ok to vectorize! */
2650 return opt_result::success ();
2652 again:
2653 /* Ensure that "ok" is false (with an opt_problem if dumping is enabled). */
2654 gcc_assert (!ok);
2656 /* Try again with SLP forced off, but if we didn't do any SLP there is
2657 no point in re-trying. */
2658 if (!slp)
2659 return ok;
2661 /* If there are reduction chains re-trying will fail anyway. */
2662 if (! LOOP_VINFO_REDUCTION_CHAINS (loop_vinfo).is_empty ())
2663 return ok;
2665 /* Likewise if the grouped loads or stores in the SLP cannot be handled
2666 via interleaving or lane instructions. */
2667 slp_instance instance;
2668 slp_tree node;
2669 unsigned i, j;
2670 FOR_EACH_VEC_ELT (LOOP_VINFO_SLP_INSTANCES (loop_vinfo), i, instance)
2672 stmt_vec_info vinfo;
2673 vinfo = SLP_TREE_SCALAR_STMTS (SLP_INSTANCE_TREE (instance))[0];
2674 if (! STMT_VINFO_GROUPED_ACCESS (vinfo))
2675 continue;
2676 vinfo = DR_GROUP_FIRST_ELEMENT (vinfo);
2677 unsigned int size = DR_GROUP_SIZE (vinfo);
2678 tree vectype = STMT_VINFO_VECTYPE (vinfo);
2679 if (! vect_store_lanes_supported (vectype, size, false)
2680 && ! known_eq (TYPE_VECTOR_SUBPARTS (vectype), 1U)
2681 && ! vect_grouped_store_supported (vectype, size))
2682 return opt_result::failure_at (vinfo->stmt,
2683 "unsupported grouped store\n");
2684 FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (instance), j, node)
2686 vinfo = SLP_TREE_SCALAR_STMTS (node)[0];
2687 vinfo = DR_GROUP_FIRST_ELEMENT (vinfo);
2688 bool single_element_p = !DR_GROUP_NEXT_ELEMENT (vinfo);
2689 size = DR_GROUP_SIZE (vinfo);
2690 vectype = STMT_VINFO_VECTYPE (vinfo);
2691 if (! vect_load_lanes_supported (vectype, size, false)
2692 && ! vect_grouped_load_supported (vectype, single_element_p,
2693 size))
2694 return opt_result::failure_at (vinfo->stmt,
2695 "unsupported grouped load\n");
2699 if (dump_enabled_p ())
2700 dump_printf_loc (MSG_NOTE, vect_location,
2701 "re-trying with SLP disabled\n");
2703 /* Roll back state appropriately. No SLP this time. */
2704 slp = false;
2706 /* Restore the vectorization factor as it was without SLP. */
2706 LOOP_VINFO_VECT_FACTOR (loop_vinfo) = saved_vectorization_factor;
2707 /* Free the SLP instances. */
2708 FOR_EACH_VEC_ELT (LOOP_VINFO_SLP_INSTANCES (loop_vinfo), j, instance)
2709 vect_free_slp_instance (instance);
2710 LOOP_VINFO_SLP_INSTANCES (loop_vinfo).release ();
2711 /* Reset SLP type to loop_vect on all stmts. */
2712 for (i = 0; i < LOOP_VINFO_LOOP (loop_vinfo)->num_nodes; ++i)
2714 basic_block bb = LOOP_VINFO_BBS (loop_vinfo)[i];
2715 for (gimple_stmt_iterator si = gsi_start_phis (bb);
2716 !gsi_end_p (si); gsi_next (&si))
2718 stmt_vec_info stmt_info = loop_vinfo->lookup_stmt (gsi_stmt (si));
2719 STMT_SLP_TYPE (stmt_info) = loop_vect;
2720 if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_reduction_def
2721 || STMT_VINFO_DEF_TYPE (stmt_info) == vect_double_reduction_def)
2723 /* vectorizable_reduction adjusts reduction stmt def-types;
2724 restore them to that of the PHI. */
2725 STMT_VINFO_DEF_TYPE (STMT_VINFO_REDUC_DEF (stmt_info))
2726 = STMT_VINFO_DEF_TYPE (stmt_info);
2727 STMT_VINFO_DEF_TYPE (vect_stmt_to_vectorize
2728 (STMT_VINFO_REDUC_DEF (stmt_info)))
2729 = STMT_VINFO_DEF_TYPE (stmt_info);
2732 for (gimple_stmt_iterator si = gsi_start_bb (bb);
2733 !gsi_end_p (si); gsi_next (&si))
2735 if (is_gimple_debug (gsi_stmt (si)))
2736 continue;
2737 stmt_vec_info stmt_info = loop_vinfo->lookup_stmt (gsi_stmt (si));
2738 STMT_SLP_TYPE (stmt_info) = loop_vect;
2739 if (STMT_VINFO_IN_PATTERN_P (stmt_info))
2741 stmt_vec_info pattern_stmt_info
2742 = STMT_VINFO_RELATED_STMT (stmt_info);
2743 if (STMT_VINFO_SLP_VECT_ONLY_PATTERN (pattern_stmt_info))
2744 STMT_VINFO_IN_PATTERN_P (stmt_info) = false;
2746 gimple *pattern_def_seq = STMT_VINFO_PATTERN_DEF_SEQ (stmt_info);
2747 STMT_SLP_TYPE (pattern_stmt_info) = loop_vect;
2748 for (gimple_stmt_iterator pi = gsi_start (pattern_def_seq);
2749 !gsi_end_p (pi); gsi_next (&pi))
2750 STMT_SLP_TYPE (loop_vinfo->lookup_stmt (gsi_stmt (pi)))
2751 = loop_vect;
2755 /* Free optimized alias test DDRS. */
2756 LOOP_VINFO_LOWER_BOUNDS (loop_vinfo).truncate (0);
2757 LOOP_VINFO_COMP_ALIAS_DDRS (loop_vinfo).release ();
2758 LOOP_VINFO_CHECK_UNEQUAL_ADDRS (loop_vinfo).release ();
2759 /* Reset target cost data. */
2760 destroy_cost_data (LOOP_VINFO_TARGET_COST_DATA (loop_vinfo));
2761 LOOP_VINFO_TARGET_COST_DATA (loop_vinfo)
2762 = init_cost (LOOP_VINFO_LOOP (loop_vinfo), false);
2763 /* Reset accumulated rgroup information. */
2764 release_vec_loop_controls (&LOOP_VINFO_MASKS (loop_vinfo));
2765 release_vec_loop_controls (&LOOP_VINFO_LENS (loop_vinfo));
2766 /* Reset assorted flags. */
2767 LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo) = false;
2768 LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo) = false;
2769 LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo) = 0;
2770 LOOP_VINFO_VERSIONING_THRESHOLD (loop_vinfo) = 0;
2771 LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo)
2772 = saved_can_use_partial_vectors_p;
2774 goto start_over;
2777 /* Return true if vectorizing a loop using NEW_LOOP_VINFO appears
2778 to be better than vectorizing it using OLD_LOOP_VINFO. Assume that
2779 OLD_LOOP_VINFO is better unless something specifically indicates
2780 otherwise.
2782 Note that this deliberately isn't a partial order. */
2784 static bool
2785 vect_better_loop_vinfo_p (loop_vec_info new_loop_vinfo,
2786 loop_vec_info old_loop_vinfo)
2788 struct loop *loop = LOOP_VINFO_LOOP (new_loop_vinfo);
2789 gcc_assert (LOOP_VINFO_LOOP (old_loop_vinfo) == loop);
2791 poly_int64 new_vf = LOOP_VINFO_VECT_FACTOR (new_loop_vinfo);
2792 poly_int64 old_vf = LOOP_VINFO_VECT_FACTOR (old_loop_vinfo);
2794 /* Always prefer a VF of loop->simdlen over any other VF. */
2795 if (loop->simdlen)
2797 bool new_simdlen_p = known_eq (new_vf, loop->simdlen);
2798 bool old_simdlen_p = known_eq (old_vf, loop->simdlen);
2799 if (new_simdlen_p != old_simdlen_p)
2800 return new_simdlen_p;
2803 /* Limit the VFs to what is likely to be the maximum number of iterations,
2804 to handle cases in which at least one loop_vinfo is fully-masked. */
2805 HOST_WIDE_INT estimated_max_niter;
2806 loop_vec_info main_loop = LOOP_VINFO_ORIG_LOOP_INFO (old_loop_vinfo);
2807 unsigned HOST_WIDE_INT main_vf;
2808 if (main_loop
2809 && LOOP_VINFO_NITERS_KNOWN_P (main_loop)
2810 && LOOP_VINFO_VECT_FACTOR (main_loop).is_constant (&main_vf))
2811 estimated_max_niter = LOOP_VINFO_INT_NITERS (main_loop) % main_vf;
2812 else
2813 estimated_max_niter = likely_max_stmt_executions_int (loop);
2814 if (estimated_max_niter != -1)
2816 if (known_le (estimated_max_niter, new_vf))
2817 new_vf = estimated_max_niter;
2818 if (known_le (estimated_max_niter, old_vf))
2819 old_vf = estimated_max_niter;
2822 /* Check whether the (fractional) cost per scalar iteration is lower
2823 or higher: new_inside_cost / new_vf vs. old_inside_cost / old_vf. */
2824 poly_int64 rel_new = new_loop_vinfo->vec_inside_cost * old_vf;
2825 poly_int64 rel_old = old_loop_vinfo->vec_inside_cost * new_vf;
2827 HOST_WIDE_INT est_rel_new_min
2828 = estimated_poly_value (rel_new, POLY_VALUE_MIN);
2829 HOST_WIDE_INT est_rel_new_max
2830 = estimated_poly_value (rel_new, POLY_VALUE_MAX);
2832 HOST_WIDE_INT est_rel_old_min
2833 = estimated_poly_value (rel_old, POLY_VALUE_MIN);
2834 HOST_WIDE_INT est_rel_old_max
2835 = estimated_poly_value (rel_old, POLY_VALUE_MAX);
2837 /* Check first if we can make out an unambiguous total order from the minimum
2838 and maximum estimates. */
2839 if (est_rel_new_min < est_rel_old_min
2840 && est_rel_new_max < est_rel_old_max)
2841 return true;
2842 else if (est_rel_old_min < est_rel_new_min
2843 && est_rel_old_max < est_rel_new_max)
2844 return false;
2845 /* When old_loop_vinfo uses a variable vectorization factor,
2846 we know that it has a lower cost for at least one runtime VF.
2847 However, we don't know how likely that VF is.
2849 One option would be to compare the costs for the estimated VFs.
2850 The problem is that that can put too much pressure on the cost
2851 model. E.g. if the estimated VF is also the lowest possible VF,
2852 and if old_loop_vinfo is 1 unit worse than new_loop_vinfo
2853 for the estimated VF, we'd then choose new_loop_vinfo even
2854 though (a) new_loop_vinfo might not actually be better than
2855 old_loop_vinfo for that VF and (b) it would be significantly
2856 worse at larger VFs.
2858 Here we go for a hacky compromise: pick new_loop_vinfo if it is
2859 no more expensive than old_loop_vinfo even after doubling the
2860 estimated old_loop_vinfo VF. For all but trivial loops, this
2861 ensures that we only pick new_loop_vinfo if it is significantly
2862 better than old_loop_vinfo at the estimated VF. */
2864 if (est_rel_old_min != est_rel_new_min
2865 || est_rel_old_max != est_rel_new_max)
2867 HOST_WIDE_INT est_rel_new_likely
2868 = estimated_poly_value (rel_new, POLY_VALUE_LIKELY);
2869 HOST_WIDE_INT est_rel_old_likely
2870 = estimated_poly_value (rel_old, POLY_VALUE_LIKELY);
2872 return est_rel_new_likely * 2 <= est_rel_old_likely;
2875 /* If there's nothing to choose between the loop bodies, see whether
2876 there's a difference in the prologue and epilogue costs. */
2877 if (new_loop_vinfo->vec_outside_cost != old_loop_vinfo->vec_outside_cost)
2878 return new_loop_vinfo->vec_outside_cost < old_loop_vinfo->vec_outside_cost;
2880 return false;
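/* Illustrative sketch (not part of the vectorizer): comparing cost per
   scalar iteration without dividing relies on

     new_inside_cost / new_vf < old_inside_cost / old_vf
       <=> new_inside_cost * old_vf < old_inside_cost * new_vf

   which stays exact for integer (and poly-int) costs.  The standalone
   helper below shows the comparison with plain integers; the real code
   additionally handles variable VFs and falls back to comparing
   prologue/epilogue costs on a tie.  */

static inline int
example_new_candidate_cheaper_p (int new_inside_cost, int new_vf,
                                 int old_inside_cost, int old_vf)
{
  return ((long) new_inside_cost * old_vf
          < (long) old_inside_cost * new_vf);
}

/* E.g. a body costing 20 at VF 8 (2.5 per scalar iteration) beats one
   costing 12 at VF 4 (3 per scalar iteration): 20 * 4 == 80 is less
   than 12 * 8 == 96.  */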
2883 /* Decide whether to replace OLD_LOOP_VINFO with NEW_LOOP_VINFO. Return
2884 true if we should. */
2886 static bool
2887 vect_joust_loop_vinfos (loop_vec_info new_loop_vinfo,
2888 loop_vec_info old_loop_vinfo)
2890 if (!vect_better_loop_vinfo_p (new_loop_vinfo, old_loop_vinfo))
2891 return false;
2893 if (dump_enabled_p ())
2894 dump_printf_loc (MSG_NOTE, vect_location,
2895 "***** Preferring vector mode %s to vector mode %s\n",
2896 GET_MODE_NAME (new_loop_vinfo->vector_mode),
2897 GET_MODE_NAME (old_loop_vinfo->vector_mode));
2898 return true;
2901 /* If LOOP_VINFO is already a main loop, return it unmodified. Otherwise
2902 try to reanalyze it as a main loop. Return the loop_vinfo on success
2903 and null on failure. */
2905 static loop_vec_info
2906 vect_reanalyze_as_main_loop (loop_vec_info loop_vinfo, unsigned int *n_stmts)
2908 if (!LOOP_VINFO_EPILOGUE_P (loop_vinfo))
2909 return loop_vinfo;
2911 if (dump_enabled_p ())
2912 dump_printf_loc (MSG_NOTE, vect_location,
2913 "***** Reanalyzing as a main loop with vector mode %s\n",
2914 GET_MODE_NAME (loop_vinfo->vector_mode));
2916 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
2917 vec_info_shared *shared = loop_vinfo->shared;
2918 opt_loop_vec_info main_loop_vinfo = vect_analyze_loop_form (loop, shared);
2919 gcc_assert (main_loop_vinfo);
2921 main_loop_vinfo->vector_mode = loop_vinfo->vector_mode;
2923 bool fatal = false;
2924 bool res = vect_analyze_loop_2 (main_loop_vinfo, fatal, n_stmts);
2925 loop->aux = NULL;
2926 if (!res)
2928 if (dump_enabled_p ())
2929 dump_printf_loc (MSG_NOTE, vect_location,
2930 "***** Failed to analyze main loop with vector"
2931 " mode %s\n",
2932 GET_MODE_NAME (loop_vinfo->vector_mode));
2933 delete main_loop_vinfo;
2934 return NULL;
2936 LOOP_VINFO_VECTORIZABLE_P (main_loop_vinfo) = 1;
2937 return main_loop_vinfo;
2940 /* Function vect_analyze_loop.
2942 Apply a set of analyses on LOOP, and create a loop_vec_info struct
2943 for it. The different analyses will record information in the
2944 loop_vec_info struct. */
2945 opt_loop_vec_info
2946 vect_analyze_loop (class loop *loop, vec_info_shared *shared)
2948 auto_vector_modes vector_modes;
2950 /* Autodetect first vector size we try. */
2951 unsigned int autovec_flags
2952 = targetm.vectorize.autovectorize_vector_modes (&vector_modes,
2953 loop->simdlen != 0);
2954 unsigned int mode_i = 0;
2956 DUMP_VECT_SCOPE ("analyze_loop_nest");
2958 if (loop_outer (loop)
2959 && loop_vec_info_for_loop (loop_outer (loop))
2960 && LOOP_VINFO_VECTORIZABLE_P (loop_vec_info_for_loop (loop_outer (loop))))
2961 return opt_loop_vec_info::failure_at (vect_location,
2962 "outer-loop already vectorized.\n");
2964 if (!find_loop_nest (loop, &shared->loop_nest))
2965 return opt_loop_vec_info::failure_at
2966 (vect_location,
2967 "not vectorized: loop nest containing two or more consecutive inner"
2968 " loops cannot be vectorized\n");
2970 unsigned n_stmts = 0;
2971 machine_mode autodetected_vector_mode = VOIDmode;
2972 opt_loop_vec_info first_loop_vinfo = opt_loop_vec_info::success (NULL);
2973 machine_mode next_vector_mode = VOIDmode;
2974 poly_uint64 lowest_th = 0;
2975 unsigned vectorized_loops = 0;
2976 bool pick_lowest_cost_p = ((autovec_flags & VECT_COMPARE_COSTS)
2977 && !unlimited_cost_model (loop));
2979 bool vect_epilogues = false;
2980 opt_result res = opt_result::success ();
2981 unsigned HOST_WIDE_INT simdlen = loop->simdlen;
2982 while (1)
2984 /* Check the CFG characteristics of the loop (nesting, entry/exit). */
2985 opt_loop_vec_info loop_vinfo = vect_analyze_loop_form (loop, shared);
2986 if (!loop_vinfo)
2988 if (dump_enabled_p ())
2989 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2990 "bad loop form.\n");
2991 gcc_checking_assert (first_loop_vinfo == NULL);
2992 return loop_vinfo;
2994 loop_vinfo->vector_mode = next_vector_mode;
2996 bool fatal = false;
2998 /* When pick_lowest_cost_p is true, we should in principle iterate
2999 over all the loop_vec_infos that LOOP_VINFO could replace and
3000 try to vectorize LOOP_VINFO under the same conditions.
3001 E.g. when trying to replace an epilogue loop, we should vectorize
3002 LOOP_VINFO as an epilogue loop with the same VF limit. When trying
3003 to replace the main loop, we should vectorize LOOP_VINFO as a main
3004 loop too.
3006 However, autovectorize_vector_modes is usually sorted as follows:
3008 - Modes that naturally produce lower VFs usually follow modes that
3009 naturally produce higher VFs.
3011 - When modes naturally produce the same VF, maskable modes
3012 usually follow unmaskable ones, so that the maskable mode
3013 can be used to vectorize the epilogue of the unmaskable mode.
3015 This order is preferred because it leads to the maximum
3016 epilogue vectorization opportunities. Targets should only use
3017 a different order if they want to make wide modes available while
3018 disparaging them relative to earlier, smaller modes. The assumption
3019 in that case is that the wider modes are more expensive in some
3020 way that isn't reflected directly in the costs.
3022 There should therefore be few interesting cases in which
3023 LOOP_VINFO fails when treated as an epilogue loop, succeeds when
3024 treated as a standalone loop, and ends up being genuinely cheaper
3025 than FIRST_LOOP_VINFO. */
3026 if (vect_epilogues)
3027 LOOP_VINFO_ORIG_LOOP_INFO (loop_vinfo) = first_loop_vinfo;
3029 res = vect_analyze_loop_2 (loop_vinfo, fatal, &n_stmts);
3030 if (mode_i == 0)
3031 autodetected_vector_mode = loop_vinfo->vector_mode;
3032 if (dump_enabled_p ())
3034 if (res)
3035 dump_printf_loc (MSG_NOTE, vect_location,
3036 "***** Analysis succeeded with vector mode %s\n",
3037 GET_MODE_NAME (loop_vinfo->vector_mode));
3038 else
3039 dump_printf_loc (MSG_NOTE, vect_location,
3040 "***** Analysis failed with vector mode %s\n",
3041 GET_MODE_NAME (loop_vinfo->vector_mode));
3044 loop->aux = NULL;
3046 if (!fatal)
3047 while (mode_i < vector_modes.length ()
3048 && vect_chooses_same_modes_p (loop_vinfo, vector_modes[mode_i]))
3050 if (dump_enabled_p ())
3051 dump_printf_loc (MSG_NOTE, vect_location,
3052 "***** The result for vector mode %s would"
3053 " be the same\n",
3054 GET_MODE_NAME (vector_modes[mode_i]));
3055 mode_i += 1;
3058 if (res)
3060 LOOP_VINFO_VECTORIZABLE_P (loop_vinfo) = 1;
3061 vectorized_loops++;
3063 /* Once we hit the desired simdlen for the first time,
3064 discard any previous attempts. */
3065 if (simdlen
3066 && known_eq (LOOP_VINFO_VECT_FACTOR (loop_vinfo), simdlen))
3068 delete first_loop_vinfo;
3069 first_loop_vinfo = opt_loop_vec_info::success (NULL);
3070 LOOP_VINFO_ORIG_LOOP_INFO (loop_vinfo) = NULL;
3071 simdlen = 0;
3073 else if (pick_lowest_cost_p && first_loop_vinfo)
3075 /* Keep trying to roll back vectorization attempts while the
3076 loop_vec_infos they produced were worse than this one. */
3077 vec<loop_vec_info> &vinfos = first_loop_vinfo->epilogue_vinfos;
3078 while (!vinfos.is_empty ()
3079 && vect_joust_loop_vinfos (loop_vinfo, vinfos.last ()))
3081 gcc_assert (vect_epilogues);
3082 delete vinfos.pop ();
3084 if (vinfos.is_empty ()
3085 && vect_joust_loop_vinfos (loop_vinfo, first_loop_vinfo))
3087 loop_vec_info main_loop_vinfo
3088 = vect_reanalyze_as_main_loop (loop_vinfo, &n_stmts);
3089 if (main_loop_vinfo == loop_vinfo)
3091 delete first_loop_vinfo;
3092 first_loop_vinfo = opt_loop_vec_info::success (NULL);
3094 else if (main_loop_vinfo
3095 && vect_joust_loop_vinfos (main_loop_vinfo,
3096 first_loop_vinfo))
3098 delete first_loop_vinfo;
3099 first_loop_vinfo = opt_loop_vec_info::success (NULL);
3100 delete loop_vinfo;
3101 loop_vinfo
3102 = opt_loop_vec_info::success (main_loop_vinfo);
3104 else
3106 if (dump_enabled_p ())
3107 dump_printf_loc (MSG_NOTE, vect_location,
3108 "***** No longer preferring vector"
3109 " mode %s after reanalyzing the loop"
3110 " as a main loop\n",
3111 GET_MODE_NAME
3112 (main_loop_vinfo->vector_mode));
3113 delete main_loop_vinfo;
3118 if (first_loop_vinfo == NULL)
3120 first_loop_vinfo = loop_vinfo;
3121 lowest_th = LOOP_VINFO_VERSIONING_THRESHOLD (first_loop_vinfo);
3123 else if (vect_epilogues
3124 /* For now only allow one epilogue loop. */
3125 && first_loop_vinfo->epilogue_vinfos.is_empty ())
3127 first_loop_vinfo->epilogue_vinfos.safe_push (loop_vinfo);
3128 poly_uint64 th = LOOP_VINFO_VERSIONING_THRESHOLD (loop_vinfo);
3129 gcc_assert (!LOOP_REQUIRES_VERSIONING (loop_vinfo)
3130 || maybe_ne (lowest_th, 0U));
3131 /* Keep track of the known smallest versioning
3132 threshold. */
3133 if (ordered_p (lowest_th, th))
3134 lowest_th = ordered_min (lowest_th, th);
3136 else
3138 delete loop_vinfo;
3139 loop_vinfo = opt_loop_vec_info::success (NULL);
3142 /* Only vectorize epilogues if PARAM_VECT_EPILOGUES_NOMASK is
3143 enabled, SIMDUID is not set, it is the innermost loop and we have
3144 either already found the loop's SIMDLEN or there was no SIMDLEN to
3145 begin with.
3146 TODO: Enable epilogue vectorization for loops with SIMDUID set. */
3147 vect_epilogues = (!simdlen
3148 && loop->inner == NULL
3149 && param_vect_epilogues_nomask
3150 && LOOP_VINFO_PEELING_FOR_NITER (first_loop_vinfo)
3151 && !loop->simduid
3152 /* For now only allow one epilogue loop, but allow
3153 pick_lowest_cost_p to replace it. */
3154 && (first_loop_vinfo->epilogue_vinfos.is_empty ()
3155 || pick_lowest_cost_p));
3157 /* Commit to first_loop_vinfo if we have no reason to try
3158 alternatives. */
3159 if (!simdlen && !vect_epilogues && !pick_lowest_cost_p)
3160 break;
3162 else
3164 delete loop_vinfo;
3165 loop_vinfo = opt_loop_vec_info::success (NULL);
3166 if (fatal)
3168 gcc_checking_assert (first_loop_vinfo == NULL);
3169 break;
3173 /* Handle the case where the original loop can use partial
3174 vectorization but we only want to adopt it for the epilogue.
3175 The retry should be in the same mode as the original. */
3176 if (vect_epilogues
3177 && loop_vinfo
3178 && LOOP_VINFO_EPIL_USING_PARTIAL_VECTORS_P (loop_vinfo))
3180 gcc_assert (LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo)
3181 && !LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo));
3182 if (dump_enabled_p ())
3183 dump_printf_loc (MSG_NOTE, vect_location,
3184 "***** Re-trying analysis with same vector mode"
3185 " %s for epilogue with partial vectors.\n",
3186 GET_MODE_NAME (loop_vinfo->vector_mode));
3187 continue;
3190 if (mode_i < vector_modes.length ()
3191 && VECTOR_MODE_P (autodetected_vector_mode)
3192 && (related_vector_mode (vector_modes[mode_i],
3193 GET_MODE_INNER (autodetected_vector_mode))
3194 == autodetected_vector_mode)
3195 && (related_vector_mode (autodetected_vector_mode,
3196 GET_MODE_INNER (vector_modes[mode_i]))
3197 == vector_modes[mode_i]))
3199 if (dump_enabled_p ())
3200 dump_printf_loc (MSG_NOTE, vect_location,
3201 "***** Skipping vector mode %s, which would"
3202 " repeat the analysis for %s\n",
3203 GET_MODE_NAME (vector_modes[mode_i]),
3204 GET_MODE_NAME (autodetected_vector_mode));
3205 mode_i += 1;
3208 if (mode_i == vector_modes.length ()
3209 || autodetected_vector_mode == VOIDmode)
3210 break;
3212 /* Try the next biggest vector size. */
3213 next_vector_mode = vector_modes[mode_i++];
3214 if (dump_enabled_p ())
3215 dump_printf_loc (MSG_NOTE, vect_location,
3216 "***** Re-trying analysis with vector mode %s\n",
3217 GET_MODE_NAME (next_vector_mode));
3220 if (first_loop_vinfo)
3222 loop->aux = (loop_vec_info) first_loop_vinfo;
3223 if (dump_enabled_p ())
3224 dump_printf_loc (MSG_NOTE, vect_location,
3225 "***** Choosing vector mode %s\n",
3226 GET_MODE_NAME (first_loop_vinfo->vector_mode));
3227 LOOP_VINFO_VERSIONING_THRESHOLD (first_loop_vinfo) = lowest_th;
3228 return first_loop_vinfo;
3231 return opt_loop_vec_info::propagate_failure (res);
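/* Illustrative sketch (not part of the vectorizer): stripped of the
   simdlen and epilogue handling, the driver above analyzes the loop
   once per candidate vector mode, in the target's preferred order, and
   keeps either the first success or, when the target enables cost
   comparison, the cheapest success.  A standalone outline with
   hypothetical analysis results:  */

struct example_mode_analysis { int succeeded; int cost; };

static inline int
example_pick_vector_mode (const struct example_mode_analysis *modes,
                          int n_modes, int compare_costs_p)
{
  int best = -1;
  for (int i = 0; i < n_modes; i++)
    {
      if (!modes[i].succeeded)
        continue;
      if (best == -1
          || (compare_costs_p && modes[i].cost < modes[best].cost))
        best = i;
      if (!compare_costs_p)
        break;
    }
  return best;  /* index of the chosen mode, or -1 if none worked */
}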
3234 /* Return true if there is an in-order reduction function for CODE, storing
3235 it in *REDUC_FN if so. */
3237 static bool
3238 fold_left_reduction_fn (tree_code code, internal_fn *reduc_fn)
3240 switch (code)
3242 case PLUS_EXPR:
3243 *reduc_fn = IFN_FOLD_LEFT_PLUS;
3244 return true;
3246 default:
3247 return false;
3251 /* Function reduction_fn_for_scalar_code
3253 Input:
3254 CODE - tree_code of a reduction operation.
3256 Output:
3257 REDUC_FN - the corresponding internal function to be used to reduce the
3258 vector of partial results into a single scalar result, or IFN_LAST
3259 if the operation is a supported reduction operation, but does not have
3260 such an internal function.
3262 Return FALSE if CODE currently cannot be vectorized as a reduction. */
3264 bool
3265 reduction_fn_for_scalar_code (enum tree_code code, internal_fn *reduc_fn)
3267 switch (code)
3269 case MAX_EXPR:
3270 *reduc_fn = IFN_REDUC_MAX;
3271 return true;
3273 case MIN_EXPR:
3274 *reduc_fn = IFN_REDUC_MIN;
3275 return true;
3277 case PLUS_EXPR:
3278 *reduc_fn = IFN_REDUC_PLUS;
3279 return true;
3281 case BIT_AND_EXPR:
3282 *reduc_fn = IFN_REDUC_AND;
3283 return true;
3285 case BIT_IOR_EXPR:
3286 *reduc_fn = IFN_REDUC_IOR;
3287 return true;
3289 case BIT_XOR_EXPR:
3290 *reduc_fn = IFN_REDUC_XOR;
3291 return true;
3293 case MULT_EXPR:
3294 case MINUS_EXPR:
3295 *reduc_fn = IFN_LAST;
3296 return true;
3298 default:
3299 return false;
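/* Illustrative sketch (not part of the vectorizer): the internal
   functions selected above reduce a vector of per-lane partial results
   to a single scalar.  For PLUS_EXPR on four lanes the effect is the
   log2-depth tree below, shown with plain scalars; for codes that get
   IFN_LAST (MULT_EXPR, MINUS_EXPR) the epilogue has to open-code a
   comparable sequence out of vector shifts and element extracts.  */

static inline int
example_reduc_plus_v4 (const int lane[4])
{
  int lo = lane[0] + lane[2];   /* combine the two halves */
  int hi = lane[1] + lane[3];
  return lo + hi;               /* final scalar result */
}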
3303 /* If there is a neutral value X such that a reduction would not be affected
3304 by the introduction of additional X elements, return that X, otherwise
3305 return null. CODE is the code of the reduction and SCALAR_TYPE is type
3306 of the scalar elements. If the reduction has just a single initial value
3307 then INITIAL_VALUE is that value, otherwise it is null. */
3309 static tree
3310 neutral_op_for_reduction (tree scalar_type, tree_code code, tree initial_value)
3312 switch (code)
3314 case WIDEN_SUM_EXPR:
3315 case DOT_PROD_EXPR:
3316 case SAD_EXPR:
3317 case PLUS_EXPR:
3318 case MINUS_EXPR:
3319 case BIT_IOR_EXPR:
3320 case BIT_XOR_EXPR:
3321 return build_zero_cst (scalar_type);
3323 case MULT_EXPR:
3324 return build_one_cst (scalar_type);
3326 case BIT_AND_EXPR:
3327 return build_all_ones_cst (scalar_type);
3329 case MAX_EXPR:
3330 case MIN_EXPR:
3331 return initial_value;
3333 default:
3334 return NULL_TREE;
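/* Illustrative sketch (not part of the vectorizer): the neutral value
   is what inactive or padding lanes can be filled with without changing
   the reduction result.  The checks below use plain integers to show
   this for a few of the codes handled above; the function returns 1 in
   every case.  */

static inline int
example_neutral_values_hold_p (void)
{
  int ok = 1;
  ok &= ((5 + 7) + 0) == (5 + 7);     /* PLUS_EXPR: neutral 0 */
  ok &= ((5 * 7) * 1) == (5 * 7);     /* MULT_EXPR: neutral 1 */
  ok &= ((5 & 7) & -1) == (5 & 7);    /* BIT_AND_EXPR: neutral all-ones */
  ok &= ((5 | 7) | 0) == (5 | 7);     /* BIT_IOR_EXPR: neutral 0 */
  return ok;
}

/* MIN_EXPR and MAX_EXPR have no universal neutral value, which is why
   the function above falls back to the reduction's initial value for
   them.  */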
3338 /* Error reporting helper for vect_is_simple_reduction below. GIMPLE statement
3339 STMT is printed with a message MSG. */
3341 static void
3342 report_vect_op (dump_flags_t msg_type, gimple *stmt, const char *msg)
3344 dump_printf_loc (msg_type, vect_location, "%s%G", msg, stmt);
3347 /* Return true if we need an in-order reduction for operation CODE
3348 on type TYPE, i.e. when the operation cannot safely be reassociated
3349 into per-lane partial results. */
3351 bool
3352 needs_fold_left_reduction_p (tree type, tree_code code)
3354 /* CHECKME: check for !flag_finite_math_only too? */
3355 if (SCALAR_FLOAT_TYPE_P (type))
3356 switch (code)
3358 case MIN_EXPR:
3359 case MAX_EXPR:
3360 return false;
3362 default:
3363 return !flag_associative_math;
3366 if (INTEGRAL_TYPE_P (type))
3368 if (!operation_no_trapping_overflow (type, code))
3369 return true;
3370 return false;
3373 if (SAT_FIXED_POINT_TYPE_P (type))
3374 return true;
3376 return false;
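/* Illustrative sketch (not part of the vectorizer): floating-point
   addition needs an in-order (fold-left) reduction unless
   -fassociative-math is in effect because FP addition is not
   associative, so reassociating the sum into per-lane partial sums can
   change the rounded result.  The standalone program below exhibits one
   such case with doubles.  */

#include <stdio.h>

int
main (void)
{
  double big = 1e16, small = 1.0;
  double in_order = (big + small) + small;  /* each add rounds the 1.0 away */
  double reassoc = big + (small + small);   /* the 2.0 survives rounding */
  printf ("in-order: %.1f\nreassociated: %.1f\n", in_order, reassoc);
  return in_order == reassoc;               /* 0: the two results differ */
}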
3379 /* Return true if the reduction PHI in LOOP with latch arg LOOP_ARG
3380 has a handled computation expression. Store the main reduction
3381 operation in *CODE. */
3383 static bool
3384 check_reduction_path (dump_user_location_t loc, loop_p loop, gphi *phi,
3385 tree loop_arg, enum tree_code *code,
3386 vec<std::pair<ssa_op_iter, use_operand_p> > &path)
3388 auto_bitmap visited;
3389 tree lookfor = PHI_RESULT (phi);
3390 ssa_op_iter curri;
3391 use_operand_p curr = op_iter_init_phiuse (&curri, phi, SSA_OP_USE);
3392 while (USE_FROM_PTR (curr) != loop_arg)
3393 curr = op_iter_next_use (&curri);
3394 curri.i = curri.numops;
3397 path.safe_push (std::make_pair (curri, curr));
3398 tree use = USE_FROM_PTR (curr);
3399 if (use == lookfor)
3400 break;
3401 gimple *def = SSA_NAME_DEF_STMT (use);
3402 if (gimple_nop_p (def)
3403 || ! flow_bb_inside_loop_p (loop, gimple_bb (def)))
3405 pop:
3408 std::pair<ssa_op_iter, use_operand_p> x = path.pop ();
3409 curri = x.first;
3410 curr = x.second;
3412 curr = op_iter_next_use (&curri);
3413 /* Skip already visited or non-SSA operands (from iterating
3414 over PHI args). */
3415 while (curr != NULL_USE_OPERAND_P
3416 && (TREE_CODE (USE_FROM_PTR (curr)) != SSA_NAME
3417 || ! bitmap_set_bit (visited,
3418 SSA_NAME_VERSION
3419 (USE_FROM_PTR (curr)))));
3421 while (curr == NULL_USE_OPERAND_P && ! path.is_empty ());
3422 if (curr == NULL_USE_OPERAND_P)
3423 break;
3425 else
3427 if (gimple_code (def) == GIMPLE_PHI)
3428 curr = op_iter_init_phiuse (&curri, as_a <gphi *>(def), SSA_OP_USE);
3429 else
3430 curr = op_iter_init_use (&curri, def, SSA_OP_USE);
3431 while (curr != NULL_USE_OPERAND_P
3432 && (TREE_CODE (USE_FROM_PTR (curr)) != SSA_NAME
3433 || ! bitmap_set_bit (visited,
3434 SSA_NAME_VERSION
3435 (USE_FROM_PTR (curr)))))
3436 curr = op_iter_next_use (&curri);
3437 if (curr == NULL_USE_OPERAND_P)
3438 goto pop;
3441 while (1);
3442 if (dump_file && (dump_flags & TDF_DETAILS))
3444 dump_printf_loc (MSG_NOTE, loc, "reduction path: ");
3445 unsigned i;
3446 std::pair<ssa_op_iter, use_operand_p> *x;
3447 FOR_EACH_VEC_ELT (path, i, x)
3448 dump_printf (MSG_NOTE, "%T ", USE_FROM_PTR (x->second));
3449 dump_printf (MSG_NOTE, "\n");
3452 /* Check whether the reduction path detected is valid. */
3453 bool fail = path.length () == 0;
3454 bool neg = false;
3455 int sign = -1;
3456 *code = ERROR_MARK;
3457 for (unsigned i = 1; i < path.length (); ++i)
3459 gimple *use_stmt = USE_STMT (path[i].second);
3460 tree op = USE_FROM_PTR (path[i].second);
3461 if (! is_gimple_assign (use_stmt)
3462 /* The following makes sure we can compute the operand index
3463 easily, and it mostly disallows chaining via COND_EXPR condition
3464 operands. */
3465 || (gimple_assign_rhs1_ptr (use_stmt) != path[i].second->use
3466 && (gimple_num_ops (use_stmt) <= 2
3467 || gimple_assign_rhs2_ptr (use_stmt) != path[i].second->use)
3468 && (gimple_num_ops (use_stmt) <= 3
3469 || gimple_assign_rhs3_ptr (use_stmt) != path[i].second->use)))
3471 fail = true;
3472 break;
3474 tree_code use_code = gimple_assign_rhs_code (use_stmt);
3475 if (use_code == MINUS_EXPR)
3477 use_code = PLUS_EXPR;
3478 /* Track whether we negate the reduction value each iteration. */
3479 if (gimple_assign_rhs2 (use_stmt) == op)
3480 neg = ! neg;
3482 if (CONVERT_EXPR_CODE_P (use_code)
3483 && tree_nop_conversion_p (TREE_TYPE (gimple_assign_lhs (use_stmt)),
3484 TREE_TYPE (gimple_assign_rhs1 (use_stmt))))
3486 else if (*code == ERROR_MARK)
3488 *code = use_code;
3489 sign = TYPE_SIGN (TREE_TYPE (gimple_assign_lhs (use_stmt)));
3491 else if (use_code != *code)
3493 fail = true;
3494 break;
3496 else if ((use_code == MIN_EXPR
3497 || use_code == MAX_EXPR)
3498 && sign != TYPE_SIGN (TREE_TYPE (gimple_assign_lhs (use_stmt))))
3500 fail = true;
3501 break;
3503 /* Check that the op is used in only a single stmt. For the
3504 non-value-changing tail and the last stmt allow out-of-loop uses.
3505 ??? We could relax this and handle arbitrary live stmts by
3506 forcing a scalar epilogue for example. */
3507 imm_use_iterator imm_iter;
3508 gimple *op_use_stmt;
3509 unsigned cnt = 0;
3510 FOR_EACH_IMM_USE_STMT (op_use_stmt, imm_iter, op)
3511 if (!is_gimple_debug (op_use_stmt)
3512 && (*code != ERROR_MARK
3513 || flow_bb_inside_loop_p (loop, gimple_bb (op_use_stmt))))
3515 /* We want to allow x + x but not x < 1 ? x : 2. */
3516 if (is_gimple_assign (op_use_stmt)
3517 && gimple_assign_rhs_code (op_use_stmt) == COND_EXPR)
3519 use_operand_p use_p;
3520 FOR_EACH_IMM_USE_ON_STMT (use_p, imm_iter)
3521 cnt++;
3523 else
3524 cnt++;
3526 if (cnt != 1)
3528 fail = true;
3529 break;
3532 return ! fail && ! neg && *code != ERROR_MARK;
3535 bool
3536 check_reduction_path (dump_user_location_t loc, loop_p loop, gphi *phi,
3537 tree loop_arg, enum tree_code code)
3539 auto_vec<std::pair<ssa_op_iter, use_operand_p> > path;
3540 enum tree_code code_;
3541 return (check_reduction_path (loc, loop, phi, loop_arg, &code_, path)
3542 && code_ == code);
3547 /* Function vect_is_simple_reduction
3549 (1) Detect a cross-iteration def-use cycle that represents a simple
3550 reduction computation. We look for the following pattern:
3552 loop_header:
3553 a1 = phi < a0, a2 >
3554 a3 = ...
3555 a2 = operation (a3, a1)
3559 a3 = ...
3560 loop_header:
3561 a1 = phi < a0, a2 >
3562 a2 = operation (a3, a1)
3564 such that:
3565 1. operation is commutative and associative and it is safe to
3566 change the order of the computation
3567 2. no uses for a2 in the loop (a2 is used out of the loop)
3568 3. no uses of a1 in the loop besides the reduction operation
3569 4. no uses of a1 outside the loop.
3571 Conditions 1,4 are tested here.
3572 Conditions 2,3 are tested in vect_mark_stmts_to_be_vectorized.
3574 (2) Detect a cross-iteration def-use cycle in nested loops, i.e.,
3575 nested cycles.
3577 (3) Detect cycles of phi nodes in outer-loop vectorization, i.e., double
3578 reductions:
3580 a1 = phi < a0, a2 >
3581 inner loop (def of a3)
3582 a2 = phi < a3 >
3584 (4) Detect condition expressions, i.e.:
3585 for (int i = 0; i < N; i++)
3586 if (a[i] < val)
3587 ret_val = a[i];
*/
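/* As a concrete instance of pattern (1) above, the scalar loop

     s = s0;
     for (i = 0; i < n; i++)
       s += a[i];

   has a1 as the loop-header PHI for s, a3 as the load of a[i] and
   a2 as the PLUS_EXPR combining the two.  */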
3591 static stmt_vec_info
3592 vect_is_simple_reduction (loop_vec_info loop_info, stmt_vec_info phi_info,
3593 bool *double_reduc, bool *reduc_chain_p)
3595 gphi *phi = as_a <gphi *> (phi_info->stmt);
3596 gimple *phi_use_stmt = NULL;
3597 imm_use_iterator imm_iter;
3598 use_operand_p use_p;
3600 *double_reduc = false;
3601 *reduc_chain_p = false;
3602 STMT_VINFO_REDUC_TYPE (phi_info) = TREE_CODE_REDUCTION;
3604 tree phi_name = PHI_RESULT (phi);
3605 /* ??? If there are no uses of the PHI result the inner loop reduction
3606 won't be detected as possibly double-reduction by vectorizable_reduction
3607 because that tries to walk the PHI arg from the preheader edge which
3608 can be constant. See PR60382. */
3609 if (has_zero_uses (phi_name))
3610 return NULL;
3611 class loop *loop = (gimple_bb (phi))->loop_father;
3612 unsigned nphi_def_loop_uses = 0;
3613 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, phi_name)
3615 gimple *use_stmt = USE_STMT (use_p);
3616 if (is_gimple_debug (use_stmt))
3617 continue;
3619 if (!flow_bb_inside_loop_p (loop, gimple_bb (use_stmt)))
3621 if (dump_enabled_p ())
3622 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3623 "intermediate value used outside loop.\n");
3625 return NULL;
3628 nphi_def_loop_uses++;
3629 phi_use_stmt = use_stmt;
3632 tree latch_def = PHI_ARG_DEF_FROM_EDGE (phi, loop_latch_edge (loop));
3633 if (TREE_CODE (latch_def) != SSA_NAME)
3635 if (dump_enabled_p ())
3636 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3637 "reduction: not ssa_name: %T\n", latch_def);
3638 return NULL;
3641 stmt_vec_info def_stmt_info = loop_info->lookup_def (latch_def);
3642 if (!def_stmt_info
3643 || !flow_bb_inside_loop_p (loop, gimple_bb (def_stmt_info->stmt)))
3644 return NULL;
3646 bool nested_in_vect_loop
3647 = flow_loop_nested_p (LOOP_VINFO_LOOP (loop_info), loop);
3648 unsigned nlatch_def_loop_uses = 0;
3649 auto_vec<gphi *, 3> lcphis;
3650 bool inner_loop_of_double_reduc = false;
3651 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, latch_def)
3653 gimple *use_stmt = USE_STMT (use_p);
3654 if (is_gimple_debug (use_stmt))
3655 continue;
3656 if (flow_bb_inside_loop_p (loop, gimple_bb (use_stmt)))
3657 nlatch_def_loop_uses++;
3658 else
3660 /* We can have more than one loop-closed PHI. */
3661 lcphis.safe_push (as_a <gphi *> (use_stmt));
3662 if (nested_in_vect_loop
3663 && (STMT_VINFO_DEF_TYPE (loop_info->lookup_stmt (use_stmt))
3664 == vect_double_reduction_def))
3665 inner_loop_of_double_reduc = true;
3669 /* If we are vectorizing an inner reduction, we execute it in the
3670 original order only when it is not part of a double reduction. */
3672 if (nested_in_vect_loop && !inner_loop_of_double_reduc)
3674 if (dump_enabled_p ())
3675 report_vect_op (MSG_NOTE, def_stmt_info->stmt,
3676 "detected nested cycle: ");
3677 return def_stmt_info;
3680 /* If this isn't a nested cycle or if the nested cycle reduction value
3681 is used outside of the inner loop we cannot handle uses of the reduction
3682 value. */
3683 if (nlatch_def_loop_uses > 1 || nphi_def_loop_uses > 1)
3685 if (dump_enabled_p ())
3686 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3687 "reduction used in loop.\n");
3688 return NULL;
3691 /* If DEF_STMT is a phi node itself, we expect it to have a single argument
3692 defined in the inner loop. */
3693 if (gphi *def_stmt = dyn_cast <gphi *> (def_stmt_info->stmt))
3695 tree op1 = PHI_ARG_DEF (def_stmt, 0);
3696 if (gimple_phi_num_args (def_stmt) != 1
3697 || TREE_CODE (op1) != SSA_NAME)
3699 if (dump_enabled_p ())
3700 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3701 "unsupported phi node definition.\n");
3703 return NULL;
3706 gimple *def1 = SSA_NAME_DEF_STMT (op1);
3707 if (gimple_bb (def1)
3708 && flow_bb_inside_loop_p (loop, gimple_bb (def_stmt))
3709 && loop->inner
3710 && flow_bb_inside_loop_p (loop->inner, gimple_bb (def1))
3711 && is_gimple_assign (def1)
3712 && is_a <gphi *> (phi_use_stmt)
3713 && flow_bb_inside_loop_p (loop->inner, gimple_bb (phi_use_stmt)))
3715 if (dump_enabled_p ())
3716 report_vect_op (MSG_NOTE, def_stmt,
3717 "detected double reduction: ");
3719 *double_reduc = true;
3720 return def_stmt_info;
3723 return NULL;
3726 /* Look for the expression computing latch_def from the loop PHI result. */
3727 auto_vec<std::pair<ssa_op_iter, use_operand_p> > path;
3728 enum tree_code code;
3729 if (check_reduction_path (vect_location, loop, phi, latch_def, &code,
3730 path))
3732 STMT_VINFO_REDUC_CODE (phi_info) = code;
3733 if (code == COND_EXPR && !nested_in_vect_loop)
3734 STMT_VINFO_REDUC_TYPE (phi_info) = COND_REDUCTION;
3736 /* Fill in STMT_VINFO_REDUC_IDX and gather stmts for an SLP
3737 reduction chain for which the additional restriction is that
3738 all operations in the chain are the same. */
3739 auto_vec<stmt_vec_info, 8> reduc_chain;
3740 unsigned i;
3741 bool is_slp_reduc = !nested_in_vect_loop && code != COND_EXPR;
3742 for (i = path.length () - 1; i >= 1; --i)
3744 gimple *stmt = USE_STMT (path[i].second);
3745 stmt_vec_info stmt_info = loop_info->lookup_stmt (stmt);
3746 STMT_VINFO_REDUC_IDX (stmt_info)
3747 = path[i].second->use - gimple_assign_rhs1_ptr (stmt);
3748 enum tree_code stmt_code = gimple_assign_rhs_code (stmt);
3749 bool leading_conversion = (CONVERT_EXPR_CODE_P (stmt_code)
3750 && (i == 1 || i == path.length () - 1));
3751 if ((stmt_code != code && !leading_conversion)
3752 /* We can only handle the final value in epilogue
3753 generation for reduction chains. */
3754 || (i != 1 && !has_single_use (gimple_assign_lhs (stmt))))
3755 is_slp_reduc = false;
3756 /* For reduction chains we support trailing/leading
3757 conversions. We do not store those in the actual chain. */
3758 if (leading_conversion)
3759 continue;
3760 reduc_chain.safe_push (stmt_info);
3762 if (is_slp_reduc && reduc_chain.length () > 1)
3764 for (unsigned i = 0; i < reduc_chain.length () - 1; ++i)
3766 REDUC_GROUP_FIRST_ELEMENT (reduc_chain[i]) = reduc_chain[0];
3767 REDUC_GROUP_NEXT_ELEMENT (reduc_chain[i]) = reduc_chain[i+1];
3769 REDUC_GROUP_FIRST_ELEMENT (reduc_chain.last ()) = reduc_chain[0];
3770 REDUC_GROUP_NEXT_ELEMENT (reduc_chain.last ()) = NULL;
3772 /* Save the chain for further analysis in SLP detection. */
3773 LOOP_VINFO_REDUCTION_CHAINS (loop_info).safe_push (reduc_chain[0]);
3774 REDUC_GROUP_SIZE (reduc_chain[0]) = reduc_chain.length ();
3776 *reduc_chain_p = true;
3777 if (dump_enabled_p ())
3778 dump_printf_loc (MSG_NOTE, vect_location,
3779 "reduction: detected reduction chain\n");
3781 else if (dump_enabled_p ())
3782 dump_printf_loc (MSG_NOTE, vect_location,
3783 "reduction: detected reduction\n");
3785 return def_stmt_info;
3788 if (dump_enabled_p ())
3789 dump_printf_loc (MSG_NOTE, vect_location,
3790 "reduction: unknown pattern\n");
3792 return NULL;
3795 /* Estimate the number of peeled epilogue iterations for LOOP_VINFO.
3796 PEEL_ITERS_PROLOGUE is the number of peeled prologue iterations,
3797 or -1 if not known. */
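/* For example (illustrative numbers): with 100 known iterations, an
   assumed VF of 8 and 3 prologue iterations the epilogue peels
   (100 - 3) % 8 = 1 iteration, while an unknown iteration count falls
   back to the VF/2 = 4 estimate.  */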
3799 static int
3800 vect_get_peel_iters_epilogue (loop_vec_info loop_vinfo, int peel_iters_prologue)
3802 int assumed_vf = vect_vf_for_cost (loop_vinfo);
3803 if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo) || peel_iters_prologue == -1)
3805 if (dump_enabled_p ())
3806 dump_printf_loc (MSG_NOTE, vect_location,
3807 "cost model: epilogue peel iters set to vf/2 "
3808 "because loop iterations are unknown.\n");
3809 return assumed_vf / 2;
3811 else
3813 int niters = LOOP_VINFO_INT_NITERS (loop_vinfo);
3814 peel_iters_prologue = MIN (niters, peel_iters_prologue);
3815 int peel_iters_epilogue = (niters - peel_iters_prologue) % assumed_vf;
3816 /* If we need to peel for gaps but the computed epilogue peel count
3817 is zero, we have to peel VF iterations instead. */
3818 if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo) && !peel_iters_epilogue)
3819 peel_iters_epilogue = assumed_vf;
3820 return peel_iters_epilogue;
3824 /* Calculate cost of peeling the loop PEEL_ITERS_PROLOGUE times. */
int
3826 vect_get_known_peeling_cost (loop_vec_info loop_vinfo, int peel_iters_prologue,
3827 int *peel_iters_epilogue,
3828 stmt_vector_for_cost *scalar_cost_vec,
3829 stmt_vector_for_cost *prologue_cost_vec,
3830 stmt_vector_for_cost *epilogue_cost_vec)
3832 int retval = 0;
3834 *peel_iters_epilogue
3835 = vect_get_peel_iters_epilogue (loop_vinfo, peel_iters_prologue);
3837 if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
3839 /* If peeled iterations are known but the number of scalar loop
3840 iterations is unknown, count a taken branch per peeled loop. */
3841 if (peel_iters_prologue > 0)
3842 retval = record_stmt_cost (prologue_cost_vec, 1, cond_branch_taken,
3843 NULL, NULL_TREE, 0, vect_prologue);
3844 if (*peel_iters_epilogue > 0)
3845 retval += record_stmt_cost (epilogue_cost_vec, 1, cond_branch_taken,
3846 NULL, NULL_TREE, 0, vect_epilogue);
3849 stmt_info_for_cost *si;
3850 int j;
3851 if (peel_iters_prologue)
3852 FOR_EACH_VEC_ELT (*scalar_cost_vec, j, si)
3853 retval += record_stmt_cost (prologue_cost_vec,
3854 si->count * peel_iters_prologue,
3855 si->kind, si->stmt_info, si->misalign,
3856 vect_prologue);
3857 if (*peel_iters_epilogue)
3858 FOR_EACH_VEC_ELT (*scalar_cost_vec, j, si)
3859 retval += record_stmt_cost (epilogue_cost_vec,
3860 si->count * *peel_iters_epilogue,
3861 si->kind, si->stmt_info, si->misalign,
3862 vect_epilogue);
3864 return retval;
3867 /* Function vect_estimate_min_profitable_iters
3869 Return the number of iterations required for the vector version of the
3870 loop to be profitable relative to the cost of the scalar version of the
3871 loop.
3873 *RET_MIN_PROFITABLE_NITERS is a cost model profitability threshold
3874 of iterations for vectorization. A value of -1 means loop vectorization
3875 is not profitable. This returned value may be used for a dynamic
3876 profitability check.
3878 *RET_MIN_PROFITABLE_ESTIMATE is a profitability threshold to be used
3879 for static check against estimated number of iterations. */
3881 static void
3882 vect_estimate_min_profitable_iters (loop_vec_info loop_vinfo,
3883 int *ret_min_profitable_niters,
3884 int *ret_min_profitable_estimate)
3886 int min_profitable_iters;
3887 int min_profitable_estimate;
3888 int peel_iters_prologue;
3889 int peel_iters_epilogue;
3890 unsigned vec_inside_cost = 0;
3891 int vec_outside_cost = 0;
3892 unsigned vec_prologue_cost = 0;
3893 unsigned vec_epilogue_cost = 0;
3894 int scalar_single_iter_cost = 0;
3895 int scalar_outside_cost = 0;
3896 int assumed_vf = vect_vf_for_cost (loop_vinfo);
3897 int npeel = LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo);
3898 void *target_cost_data = LOOP_VINFO_TARGET_COST_DATA (loop_vinfo);
3900 /* Cost model disabled. */
3901 if (unlimited_cost_model (LOOP_VINFO_LOOP (loop_vinfo)))
3903 if (dump_enabled_p ())
3904 dump_printf_loc (MSG_NOTE, vect_location, "cost model disabled.\n");
3905 *ret_min_profitable_niters = 0;
3906 *ret_min_profitable_estimate = 0;
3907 return;
3910 /* Requires loop versioning tests to handle misalignment. */
3911 if (LOOP_REQUIRES_VERSIONING_FOR_ALIGNMENT (loop_vinfo))
3913 /* FIXME: Make cost depend on complexity of individual check. */
3914 unsigned len = LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).length ();
3915 (void) add_stmt_cost (loop_vinfo, target_cost_data, len, vector_stmt,
3916 NULL, NULL_TREE, 0, vect_prologue);
3917 if (dump_enabled_p ())
3918 dump_printf (MSG_NOTE,
3919 "cost model: Adding cost of checks for loop "
3920 "versioning to treat misalignment.\n");
3923 /* Requires loop versioning with alias checks. */
3924 if (LOOP_REQUIRES_VERSIONING_FOR_ALIAS (loop_vinfo))
3926 /* FIXME: Make cost depend on complexity of individual check. */
3927 unsigned len = LOOP_VINFO_COMP_ALIAS_DDRS (loop_vinfo).length ();
3928 (void) add_stmt_cost (loop_vinfo, target_cost_data, len, vector_stmt,
3929 NULL, NULL_TREE, 0, vect_prologue);
3930 len = LOOP_VINFO_CHECK_UNEQUAL_ADDRS (loop_vinfo).length ();
3931 if (len)
3932 /* Count LEN - 1 ANDs and LEN comparisons. */
3933 (void) add_stmt_cost (loop_vinfo, target_cost_data, len * 2 - 1,
3934 scalar_stmt, NULL, NULL_TREE, 0, vect_prologue);
3935 len = LOOP_VINFO_LOWER_BOUNDS (loop_vinfo).length ();
3936 if (len)
3938 /* Count LEN - 1 ANDs and LEN comparisons. */
3939 unsigned int nstmts = len * 2 - 1;
3940 /* +1 for each bias that needs adding. */
3941 for (unsigned int i = 0; i < len; ++i)
3942 if (!LOOP_VINFO_LOWER_BOUNDS (loop_vinfo)[i].unsigned_p)
3943 nstmts += 1;
3944 (void) add_stmt_cost (loop_vinfo, target_cost_data, nstmts,
3945 scalar_stmt, NULL, NULL_TREE, 0, vect_prologue);
3947 if (dump_enabled_p ())
3948 dump_printf (MSG_NOTE,
3949 "cost model: Adding cost of checks for loop "
3950 "versioning aliasing.\n");
3953 /* Requires loop versioning with niter checks. */
3954 if (LOOP_REQUIRES_VERSIONING_FOR_NITERS (loop_vinfo))
3956 /* FIXME: Make cost depend on complexity of individual check. */
3957 (void) add_stmt_cost (loop_vinfo, target_cost_data, 1, vector_stmt,
3958 NULL, NULL_TREE, 0, vect_prologue);
3959 if (dump_enabled_p ())
3960 dump_printf (MSG_NOTE,
3961 "cost model: Adding cost of checks for loop "
3962 "versioning niters.\n");
3965 if (LOOP_REQUIRES_VERSIONING (loop_vinfo))
3966 (void) add_stmt_cost (loop_vinfo, target_cost_data, 1, cond_branch_taken,
3967 NULL, NULL_TREE, 0, vect_prologue);
3969 /* Count statements in scalar loop. Using this as scalar cost for a single
3970 iteration for now.
3972 TODO: Add outer loop support.
3974 TODO: Consider assigning different costs to different scalar
3975 statements. */
3977 scalar_single_iter_cost
3978 = LOOP_VINFO_SINGLE_SCALAR_ITERATION_COST (loop_vinfo);
3980 /* Add additional cost for the peeled instructions in prologue and epilogue
3981 loop. (For fully-masked loops there will be no peeling.)
3983 FORNOW: If we don't know the value of peel_iters for prologue or epilogue
3984 at compile time, we assume it's vf/2 (the worst case would be vf-1).
3986 TODO: Build an expression that represents peel_iters for prologue and
3987 epilogue to be used in a run-time test. */
3989 bool prologue_need_br_taken_cost = false;
3990 bool prologue_need_br_not_taken_cost = false;
3992 /* Calculate peel_iters_prologue. */
3993 if (vect_use_loop_mask_for_alignment_p (loop_vinfo))
3994 peel_iters_prologue = 0;
3995 else if (npeel < 0)
3997 peel_iters_prologue = assumed_vf / 2;
3998 if (dump_enabled_p ())
3999 dump_printf (MSG_NOTE, "cost model: "
4000 "prologue peel iters set to vf/2.\n");
4002 /* If peeled iterations are unknown, count a taken branch and a not taken
4003 branch per peeled loop. Even if scalar loop iterations are known,
4004 vector iterations are not known since peeled prologue iterations are
4005 not known. Hence guards remain the same. */
4006 prologue_need_br_taken_cost = true;
4007 prologue_need_br_not_taken_cost = true;
4009 else
4011 peel_iters_prologue = npeel;
4012 if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo) && peel_iters_prologue > 0)
4013 /* If peeled iterations are known but the number of scalar loop
4014 iterations is unknown, count a taken branch per peeled loop. */
4015 prologue_need_br_taken_cost = true;
4018 bool epilogue_need_br_taken_cost = false;
4019 bool epilogue_need_br_not_taken_cost = false;
4021 /* Calculate peel_iters_epilogue. */
4022 if (LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))
4023 /* We need to peel exactly one iteration for gaps. */
4024 peel_iters_epilogue = LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo) ? 1 : 0;
4025 else if (npeel < 0)
4027 /* If peeling for alignment is unknown, the loop bound of the main
4028 loop becomes unknown. */
4029 peel_iters_epilogue = assumed_vf / 2;
4030 if (dump_enabled_p ())
4031 dump_printf (MSG_NOTE, "cost model: "
4032 "epilogue peel iters set to vf/2 because "
4033 "peeling for alignment is unknown.\n");
4035 /* See the same reason above in peel_iters_prologue calculation. */
4036 epilogue_need_br_taken_cost = true;
4037 epilogue_need_br_not_taken_cost = true;
4039 else
4041 peel_iters_epilogue = vect_get_peel_iters_epilogue (loop_vinfo, npeel);
4042 if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo) && peel_iters_epilogue > 0)
4043 /* If peeled iterations are known but the number of scalar loop
4044 iterations is unknown, count a taken branch per peeled loop. */
4045 epilogue_need_br_taken_cost = true;
4048 stmt_info_for_cost *si;
4049 int j;
4050 /* Add costs associated with peel_iters_prologue. */
4051 if (peel_iters_prologue)
4052 FOR_EACH_VEC_ELT (LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo), j, si)
4054 (void) add_stmt_cost (loop_vinfo, target_cost_data,
4055 si->count * peel_iters_prologue, si->kind,
4056 si->stmt_info, si->vectype, si->misalign,
4057 vect_prologue);
4060 /* Add costs associated with peel_iters_epilogue. */
4061 if (peel_iters_epilogue)
4062 FOR_EACH_VEC_ELT (LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo), j, si)
4064 (void) add_stmt_cost (loop_vinfo, target_cost_data,
4065 si->count * peel_iters_epilogue, si->kind,
4066 si->stmt_info, si->vectype, si->misalign,
4067 vect_epilogue);
4070 /* Add possible cond_branch_taken/cond_branch_not_taken cost. */
4072 if (prologue_need_br_taken_cost)
4073 (void) add_stmt_cost (loop_vinfo, target_cost_data, 1, cond_branch_taken,
4074 NULL, NULL_TREE, 0, vect_prologue);
4076 if (prologue_need_br_not_taken_cost)
4077 (void) add_stmt_cost (loop_vinfo, target_cost_data, 1,
4078 cond_branch_not_taken, NULL, NULL_TREE, 0,
4079 vect_prologue);
4081 if (epilogue_need_br_taken_cost)
4082 (void) add_stmt_cost (loop_vinfo, target_cost_data, 1, cond_branch_taken,
4083 NULL, NULL_TREE, 0, vect_epilogue);
4085 if (epilogue_need_br_not_taken_cost)
4086 (void) add_stmt_cost (loop_vinfo, target_cost_data, 1,
4087 cond_branch_not_taken, NULL, NULL_TREE, 0,
4088 vect_epilogue);
4090 /* Take care of special costs for rgroup controls of partial vectors. */
4091 if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
4093 /* Calculate how many masks we need to generate. */
4094 unsigned int num_masks = 0;
4095 rgroup_controls *rgm;
4096 unsigned int num_vectors_m1;
4097 FOR_EACH_VEC_ELT (LOOP_VINFO_MASKS (loop_vinfo), num_vectors_m1, rgm)
4098 if (rgm->type)
4099 num_masks += num_vectors_m1 + 1;
4100 gcc_assert (num_masks > 0);
4102 /* In the worst case, we need to generate each mask in the prologue
4103 and in the loop body. One of the loop body mask instructions
4104 replaces the comparison in the scalar loop, and since we don't
4105 count the scalar comparison against the scalar body, we shouldn't
4106 count that vector instruction against the vector body either.
4108 Sometimes we can use unpacks instead of generating prologue
4109 masks and sometimes the prologue mask will fold to a constant,
4110 so the actual prologue cost might be smaller. However, it's
4111 simpler and safer to use the worst-case cost; if this ends up
4112 being the tie-breaker between vectorizing or not, then it's
4113 probably better not to vectorize. */
4114 (void) add_stmt_cost (loop_vinfo, target_cost_data, num_masks,
4115 vector_stmt, NULL, NULL_TREE, 0, vect_prologue);
4116 (void) add_stmt_cost (loop_vinfo, target_cost_data, num_masks - 1,
4117 vector_stmt, NULL, NULL_TREE, 0, vect_body);
4119 else if (LOOP_VINFO_FULLY_WITH_LENGTH_P (loop_vinfo))
4121 /* Referring to the functions vect_set_loop_condition_partial_vectors
4122 and vect_set_loop_controls_directly, we need to generate each
4123 length in the prologue and in the loop body if required. Although
4124 there are some possible optimizations, we consider the worst case
4125 here. */
4127 bool niters_known_p = LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo);
4128 bool need_iterate_p
4129 = (!LOOP_VINFO_EPILOGUE_P (loop_vinfo)
4130 && !vect_known_niters_smaller_than_vf (loop_vinfo));
4132 /* Calculate how many statements to be added. */
4133 unsigned int prologue_stmts = 0;
4134 unsigned int body_stmts = 0;
4136 rgroup_controls *rgc;
4137 unsigned int num_vectors_m1;
4138 FOR_EACH_VEC_ELT (LOOP_VINFO_LENS (loop_vinfo), num_vectors_m1, rgc)
4139 if (rgc->type)
4141 /* May need one SHIFT for nitems_total computation. */
4142 unsigned nitems = rgc->max_nscalars_per_iter * rgc->factor;
4143 if (nitems != 1 && !niters_known_p)
4144 prologue_stmts += 1;
4146 /* May need one MAX and one MINUS for wrap around. */
4147 if (vect_rgroup_iv_might_wrap_p (loop_vinfo, rgc))
4148 prologue_stmts += 2;
4150 /* Need one MAX and one MINUS for each batch limit except for
4151 the first one. */
4152 prologue_stmts += num_vectors_m1 * 2;
4154 unsigned int num_vectors = num_vectors_m1 + 1;
4156 /* Need to set up lengths in prologue, only one MIN required
4157 for each since start index is zero. */
4158 prologue_stmts += num_vectors;
4160 /* Each may need two MINs and one MINUS to update lengths in body
4161 for next iteration. */
4162 if (need_iterate_p)
4163 body_stmts += 3 * num_vectors;
4166 (void) add_stmt_cost (loop_vinfo, target_cost_data, prologue_stmts,
4167 scalar_stmt, NULL, NULL_TREE, 0, vect_prologue);
4168 (void) add_stmt_cost (loop_vinfo, target_cost_data, body_stmts,
4169 scalar_stmt, NULL, NULL_TREE, 0, vect_body);
4172 /* FORNOW: The scalar outside cost is incremented in one of the
4173 following ways:
4175 1. The vectorizer checks for alignment and aliasing and generates
4176 a condition that allows dynamic vectorization. A cost model
4177 check is ANDED with the versioning condition. Hence scalar code
4178 path now has the added cost of the versioning check.
4180 if (cost > th & versioning_check)
4181 jmp to vector code
4183 Hence run-time scalar is incremented by not-taken branch cost.
4185 2. The vectorizer then checks if a prologue is required. If the
4186 cost model check was not done before during versioning, it has to
4187 be done before the prologue check.
4189 if (cost <= th)
4190 prologue = scalar_iters
4191 if (prologue == 0)
4192 jmp to vector code
4193 else
4194 execute prologue
4195 if (prologue == num_iters)
4196 go to exit
4198 Hence the run-time scalar cost is incremented by a taken branch,
4199 plus a not-taken branch, plus a taken branch cost.
4201 3. The vectorizer then checks if an epilogue is required. If the
4202 cost model check was not done before during prologue check, it
4203 has to be done with the epilogue check.
4205 if (prologue == 0)
4206 jmp to vector code
4207 else
4208 execute prologue
4209 if (prologue == num_iters)
4210 go to exit
4211 vector code:
4212 if ((cost <= th) | (scalar_iters-prologue-epilogue == 0))
4213 jmp to epilogue
4215 Hence the run-time scalar cost should be incremented by 2 taken
4216 branches.
4218 TODO: The back end may order the BBs differently and reverse
4219 conditions/branch directions. Change the estimates below to
4220 something more reasonable. */
4222 /* If the number of iterations is known and we do not do versioning, we can
4223 decide whether to vectorize at compile time. Hence the scalar version
4224 does not carry cost model guard costs. */
4225 if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
4226 || LOOP_REQUIRES_VERSIONING (loop_vinfo))
4228 /* Cost model check occurs at versioning. */
4229 if (LOOP_REQUIRES_VERSIONING (loop_vinfo))
4230 scalar_outside_cost += vect_get_stmt_cost (cond_branch_not_taken);
4231 else
4233 /* Cost model check occurs at prologue generation. */
4234 if (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) < 0)
4235 scalar_outside_cost += 2 * vect_get_stmt_cost (cond_branch_taken)
4236 + vect_get_stmt_cost (cond_branch_not_taken);
4237 /* Cost model check occurs at epilogue generation. */
4238 else
4239 scalar_outside_cost += 2 * vect_get_stmt_cost (cond_branch_taken);
4243 /* Complete the target-specific cost calculations. */
4244 finish_cost (LOOP_VINFO_TARGET_COST_DATA (loop_vinfo), &vec_prologue_cost,
4245 &vec_inside_cost, &vec_epilogue_cost);
4247 vec_outside_cost = (int)(vec_prologue_cost + vec_epilogue_cost);
4249 /* Stash the costs so that we can compare two loop_vec_infos. */
4250 loop_vinfo->vec_inside_cost = vec_inside_cost;
4251 loop_vinfo->vec_outside_cost = vec_outside_cost;
4253 if (dump_enabled_p ())
4255 dump_printf_loc (MSG_NOTE, vect_location, "Cost model analysis: \n");
4256 dump_printf (MSG_NOTE, " Vector inside of loop cost: %d\n",
4257 vec_inside_cost);
4258 dump_printf (MSG_NOTE, " Vector prologue cost: %d\n",
4259 vec_prologue_cost);
4260 dump_printf (MSG_NOTE, " Vector epilogue cost: %d\n",
4261 vec_epilogue_cost);
4262 dump_printf (MSG_NOTE, " Scalar iteration cost: %d\n",
4263 scalar_single_iter_cost);
4264 dump_printf (MSG_NOTE, " Scalar outside cost: %d\n",
4265 scalar_outside_cost);
4266 dump_printf (MSG_NOTE, " Vector outside cost: %d\n",
4267 vec_outside_cost);
4268 dump_printf (MSG_NOTE, " prologue iterations: %d\n",
4269 peel_iters_prologue);
4270 dump_printf (MSG_NOTE, " epilogue iterations: %d\n",
4271 peel_iters_epilogue);
4274 /* Calculate number of iterations required to make the vector version
4275 profitable, relative to the loop bodies only. The following condition
4276 must hold true:
4277 SIC * niters + SOC > VIC * ((niters - NPEEL) / VF) + VOC
4278 where
4279 SIC = scalar iteration cost, VIC = vector iteration cost,
4280 VOC = vector outside cost, VF = vectorization factor,
4281 NPEEL = prologue iterations + epilogue iterations,
4282 SOC = scalar outside cost for run time cost model check. */
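/* As a worked example with purely illustrative costs SIC = 4, VIC = 8,
   VOC = 20, SOC = 6, VF = 4 and NPEEL = 0: each vector iteration saves
   SIC * VF - VIC = 8, and the computation below arrives at a runtime
   threshold of 8 scalar iterations, where the scalar cost
   4 * 8 + 6 = 38 exceeds the vector cost 8 * (8 / 4) + 20 = 36.  */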
4284 int saving_per_viter = (scalar_single_iter_cost * assumed_vf
4285 - vec_inside_cost);
4286 if (saving_per_viter <= 0)
4288 if (LOOP_VINFO_LOOP (loop_vinfo)->force_vectorize)
4289 warning_at (vect_location.get_location_t (), OPT_Wopenmp_simd,
4290 "vectorization did not happen for a simd loop");
4292 if (dump_enabled_p ())
4293 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4294 "cost model: the vector iteration cost = %d "
4295 "divided by the scalar iteration cost = %d "
4296 "is greater or equal to the vectorization factor = %d"
4297 ".\n",
4298 vec_inside_cost, scalar_single_iter_cost, assumed_vf);
4299 *ret_min_profitable_niters = -1;
4300 *ret_min_profitable_estimate = -1;
4301 return;
4304 /* ??? The "if" arm is written to handle all cases; see below for what
4305 we would do for !LOOP_VINFO_USING_PARTIAL_VECTORS_P. */
4306 if (LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))
4308 /* Rewriting the condition above in terms of the number of
4309 vector iterations (vniters) rather than the number of
4310 scalar iterations (niters) gives:
4312 SIC * (vniters * VF + NPEEL) + SOC > VIC * vniters + VOC
4314 <==> vniters * (SIC * VF - VIC) > VOC - SIC * NPEEL - SOC
4316 For integer N, X and Y when X > 0:
4318 N * X > Y <==> N >= (Y /[floor] X) + 1. */
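/* E.g. for Y = 10 and X = 4, N * 4 > 10 holds exactly for
   N >= 10 /[floor] 4 + 1 = 3.  */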
4319 int outside_overhead = (vec_outside_cost
4320 - scalar_single_iter_cost * peel_iters_prologue
4321 - scalar_single_iter_cost * peel_iters_epilogue
4322 - scalar_outside_cost);
4323 /* We're only interested in cases that require at least one
4324 vector iteration. */
4325 int min_vec_niters = 1;
4326 if (outside_overhead > 0)
4327 min_vec_niters = outside_overhead / saving_per_viter + 1;
4329 if (dump_enabled_p ())
4330 dump_printf (MSG_NOTE, " Minimum number of vector iterations: %d\n",
4331 min_vec_niters);
4333 if (LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))
4335 /* Now that we know the minimum number of vector iterations,
4336 find the minimum niters for which the scalar cost is larger:
4338 SIC * niters > VIC * vniters + VOC - SOC
4340 We know that the minimum niters is no more than
4341 vniters * VF + NPEEL, but it might be (and often is) less
4342 than that if a partial vector iteration is cheaper than the
4343 equivalent scalar code. */
4344 int threshold = (vec_inside_cost * min_vec_niters
4345 + vec_outside_cost
4346 - scalar_outside_cost);
4347 if (threshold <= 0)
4348 min_profitable_iters = 1;
4349 else
4350 min_profitable_iters = threshold / scalar_single_iter_cost + 1;
4352 else
4353 /* Convert the number of vector iterations into a number of
4354 scalar iterations. */
4355 min_profitable_iters = (min_vec_niters * assumed_vf
4356 + peel_iters_prologue
4357 + peel_iters_epilogue);
4359 else
4361 min_profitable_iters = ((vec_outside_cost - scalar_outside_cost)
4362 * assumed_vf
4363 - vec_inside_cost * peel_iters_prologue
4364 - vec_inside_cost * peel_iters_epilogue);
4365 if (min_profitable_iters <= 0)
4366 min_profitable_iters = 0;
4367 else
4369 min_profitable_iters /= saving_per_viter;
4371 if ((scalar_single_iter_cost * assumed_vf * min_profitable_iters)
4372 <= (((int) vec_inside_cost * min_profitable_iters)
4373 + (((int) vec_outside_cost - scalar_outside_cost)
4374 * assumed_vf)))
4375 min_profitable_iters++;
4379 if (dump_enabled_p ())
4380 dump_printf (MSG_NOTE,
4381 " Calculated minimum iters for profitability: %d\n",
4382 min_profitable_iters);
4384 if (!LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo)
4385 && min_profitable_iters < (assumed_vf + peel_iters_prologue))
4386 /* We want the vectorized loop to execute at least once. */
4387 min_profitable_iters = assumed_vf + peel_iters_prologue;
4388 else if (min_profitable_iters < peel_iters_prologue)
4389 /* For LOOP_VINFO_USING_PARTIAL_VECTORS_P, we need to ensure the
4390 vectorized loop executes at least once. */
4391 min_profitable_iters = peel_iters_prologue;
4393 if (dump_enabled_p ())
4394 dump_printf_loc (MSG_NOTE, vect_location,
4395 " Runtime profitability threshold = %d\n",
4396 min_profitable_iters);
4398 *ret_min_profitable_niters = min_profitable_iters;
4400 /* Calculate number of iterations required to make the vector version
4401 profitable, relative to the loop bodies only.
4403 The non-vectorized variant costs SIC * niters and must win over the vector
4404 variant on the expected loop trip count. The following condition must hold true:
4405 SIC * niters > VIC * ((niters - NPEEL) / VF) + VOC + SOC */
4407 if (vec_outside_cost <= 0)
4408 min_profitable_estimate = 0;
4409 /* ??? This "else if" arm is written to handle all cases; see below for
4410 what we would do for !LOOP_VINFO_USING_PARTIAL_VECTORS_P. */
4411 else if (LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))
4413 /* This is a repeat of the code above, but with + SOC rather
4414 than - SOC. */
4415 int outside_overhead = (vec_outside_cost
4416 - scalar_single_iter_cost * peel_iters_prologue
4417 - scalar_single_iter_cost * peel_iters_epilogue
4418 + scalar_outside_cost);
4419 int min_vec_niters = 1;
4420 if (outside_overhead > 0)
4421 min_vec_niters = outside_overhead / saving_per_viter + 1;
4423 if (LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))
4425 int threshold = (vec_inside_cost * min_vec_niters
4426 + vec_outside_cost
4427 + scalar_outside_cost);
4428 min_profitable_estimate = threshold / scalar_single_iter_cost + 1;
4430 else
4431 min_profitable_estimate = (min_vec_niters * assumed_vf
4432 + peel_iters_prologue
4433 + peel_iters_epilogue);
4435 else
4437 min_profitable_estimate = ((vec_outside_cost + scalar_outside_cost)
4438 * assumed_vf
4439 - vec_inside_cost * peel_iters_prologue
4440 - vec_inside_cost * peel_iters_epilogue)
4441 / ((scalar_single_iter_cost * assumed_vf)
4442 - vec_inside_cost);
4444 min_profitable_estimate = MAX (min_profitable_estimate, min_profitable_iters);
4445 if (dump_enabled_p ())
4446 dump_printf_loc (MSG_NOTE, vect_location,
4447 " Static estimate profitability threshold = %d\n",
4448 min_profitable_estimate);
4450 *ret_min_profitable_estimate = min_profitable_estimate;
4453 /* Writes into SEL a mask for a vec_perm, equivalent to a vec_shr by OFFSET
4454 vector elements (not bits) for a vector with NELT elements. */
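/* For instance, OFFSET = 2 pushes the stepped pattern {2, 3, 4}, which
   vec_perm_indices extends to {2, 3, ..., NELT + 1}; indices of NELT or
   more select from the second permute operand, giving the effect of
   shifting the first operand down by two elements.  */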
4455 static void
4456 calc_vec_perm_mask_for_shift (unsigned int offset, unsigned int nelt,
4457 vec_perm_builder *sel)
4459 /* The encoding is a single stepped pattern. Any wrap-around is handled
4460 by vec_perm_indices. */
4461 sel->new_vector (nelt, 1, 3);
4462 for (unsigned int i = 0; i < 3; i++)
4463 sel->quick_push (i + offset);
4466 /* Checks whether the target supports whole-vector shifts for vectors of mode
4467 MODE. This is the case if _either_ the platform handles vec_shr_optab, _or_
4468 it supports vec_perm_const with masks for all necessary shift amounts. */
4469 static bool
4470 have_whole_vector_shift (machine_mode mode)
4472 if (optab_handler (vec_shr_optab, mode) != CODE_FOR_nothing)
4473 return true;
4475 /* Variable-length vectors should be handled via the optab. */
4476 unsigned int nelt;
4477 if (!GET_MODE_NUNITS (mode).is_constant (&nelt))
4478 return false;
4480 vec_perm_builder sel;
4481 vec_perm_indices indices;
4482 for (unsigned int i = nelt / 2; i >= 1; i /= 2)
4484 calc_vec_perm_mask_for_shift (i, nelt, &sel);
4485 indices.new_vector (sel, 2, nelt);
4486 if (!can_vec_perm_const_p (mode, indices, false))
4487 return false;
4489 return true;
4492 /* TODO: There is a close dependency between the vect_model_*_cost and
4493 vectorizable_* functions. Improve the design to avoid maintenance issues. */
4495 /* Function vect_model_reduction_cost.
4497 Models cost for a reduction operation, including the vector ops
4498 generated within the strip-mine loop in some cases, the initial
4499 definition before the loop, and the epilogue code that must be generated. */
4501 static void
4502 vect_model_reduction_cost (loop_vec_info loop_vinfo,
4503 stmt_vec_info stmt_info, internal_fn reduc_fn,
4504 vect_reduction_type reduction_type,
4505 int ncopies, stmt_vector_for_cost *cost_vec)
4507 int prologue_cost = 0, epilogue_cost = 0, inside_cost = 0;
4508 enum tree_code code;
4509 optab optab;
4510 tree vectype;
4511 machine_mode mode;
4512 class loop *loop = NULL;
4514 if (loop_vinfo)
4515 loop = LOOP_VINFO_LOOP (loop_vinfo);
4517 /* Condition reductions generate two reductions in the loop. */
4518 if (reduction_type == COND_REDUCTION)
4519 ncopies *= 2;
4521 vectype = STMT_VINFO_VECTYPE (stmt_info);
4522 mode = TYPE_MODE (vectype);
4523 stmt_vec_info orig_stmt_info = vect_orig_stmt (stmt_info);
4525 code = gimple_assign_rhs_code (orig_stmt_info->stmt);
4527 if (reduction_type == EXTRACT_LAST_REDUCTION)
4528 /* No extra instructions are needed in the prologue. The loop body
4529 operations are costed in vectorizable_condition. */
4530 inside_cost = 0;
4531 else if (reduction_type == FOLD_LEFT_REDUCTION)
4533 /* No extra instructions needed in the prologue. */
4534 prologue_cost = 0;
4536 if (reduc_fn != IFN_LAST)
4537 /* Count one reduction-like operation per vector. */
4538 inside_cost = record_stmt_cost (cost_vec, ncopies, vec_to_scalar,
4539 stmt_info, 0, vect_body);
4540 else
4542 /* Use NELEMENTS extracts and NELEMENTS scalar ops. */
4543 unsigned int nelements = ncopies * vect_nunits_for_cost (vectype);
4544 inside_cost = record_stmt_cost (cost_vec, nelements,
4545 vec_to_scalar, stmt_info, 0,
4546 vect_body);
4547 inside_cost += record_stmt_cost (cost_vec, nelements,
4548 scalar_stmt, stmt_info, 0,
4549 vect_body);
4552 else
4554 /* Add in cost for initial definition.
4555 For cond reduction we have four vectors: initial index, step,
4556 initial result of the data reduction, initial value of the index
4557 reduction. */
4558 int prologue_stmts = reduction_type == COND_REDUCTION ? 4 : 1;
4559 prologue_cost += record_stmt_cost (cost_vec, prologue_stmts,
4560 scalar_to_vec, stmt_info, 0,
4561 vect_prologue);
4564 /* Determine cost of epilogue code.
4566 We have a reduction operator that will reduce the vector in one statement.
4567 Also requires scalar extract. */
4569 if (!loop || !nested_in_vect_loop_p (loop, orig_stmt_info))
4571 if (reduc_fn != IFN_LAST)
4573 if (reduction_type == COND_REDUCTION)
4575 /* An EQ stmt and a COND_EXPR stmt. */
4576 epilogue_cost += record_stmt_cost (cost_vec, 2,
4577 vector_stmt, stmt_info, 0,
4578 vect_epilogue);
4579 /* Reduction of the max index and a reduction of the found
4580 values. */
4581 epilogue_cost += record_stmt_cost (cost_vec, 2,
4582 vec_to_scalar, stmt_info, 0,
4583 vect_epilogue);
4584 /* A broadcast of the max value. */
4585 epilogue_cost += record_stmt_cost (cost_vec, 1,
4586 scalar_to_vec, stmt_info, 0,
4587 vect_epilogue);
4589 else
4591 epilogue_cost += record_stmt_cost (cost_vec, 1, vector_stmt,
4592 stmt_info, 0, vect_epilogue);
4593 epilogue_cost += record_stmt_cost (cost_vec, 1,
4594 vec_to_scalar, stmt_info, 0,
4595 vect_epilogue);
4598 else if (reduction_type == COND_REDUCTION)
4600 unsigned estimated_nunits = vect_nunits_for_cost (vectype);
4601 /* Extraction of scalar elements. */
4602 epilogue_cost += record_stmt_cost (cost_vec,
4603 2 * estimated_nunits,
4604 vec_to_scalar, stmt_info, 0,
4605 vect_epilogue);
4606 /* Scalar max reductions via COND_EXPR / MAX_EXPR. */
4607 epilogue_cost += record_stmt_cost (cost_vec,
4608 2 * estimated_nunits - 3,
4609 scalar_stmt, stmt_info, 0,
4610 vect_epilogue);
4612 else if (reduction_type == EXTRACT_LAST_REDUCTION
4613 || reduction_type == FOLD_LEFT_REDUCTION)
4614 /* No extra instructions are needed in the epilogue. */
4616 else
4618 int vec_size_in_bits = tree_to_uhwi (TYPE_SIZE (vectype));
4619 tree bitsize =
4620 TYPE_SIZE (TREE_TYPE (gimple_assign_lhs (orig_stmt_info->stmt)));
4621 int element_bitsize = tree_to_uhwi (bitsize);
4622 int nelements = vec_size_in_bits / element_bitsize;
4624 if (code == COND_EXPR)
4625 code = MAX_EXPR;
4627 optab = optab_for_tree_code (code, vectype, optab_default);
4629 /* We have a whole vector shift available. */
4630 if (optab != unknown_optab
4631 && VECTOR_MODE_P (mode)
4632 && optab_handler (optab, mode) != CODE_FOR_nothing
4633 && have_whole_vector_shift (mode))
4635 /* Final reduction via vector shifts and the reduction operator.
4636 Also requires scalar extract. */
4637 epilogue_cost += record_stmt_cost (cost_vec,
4638 exact_log2 (nelements) * 2,
4639 vector_stmt, stmt_info, 0,
4640 vect_epilogue);
4641 epilogue_cost += record_stmt_cost (cost_vec, 1,
4642 vec_to_scalar, stmt_info, 0,
4643 vect_epilogue);
4645 else
4646 /* Use extracts and reduction op for final reduction. For N
4647 elements, we have N extracts and N-1 reduction ops. */
4648 epilogue_cost += record_stmt_cost (cost_vec,
4649 nelements + nelements - 1,
4650 vector_stmt, stmt_info, 0,
4651 vect_epilogue);
4655 if (dump_enabled_p ())
4656 dump_printf (MSG_NOTE,
4657 "vect_model_reduction_cost: inside_cost = %d, "
4658 "prologue_cost = %d, epilogue_cost = %d .\n", inside_cost,
4659 prologue_cost, epilogue_cost);
4662 /* SEQ is a sequence of instructions that initialize the reduction
4663 described by REDUC_INFO. Emit them in the appropriate place. */
4665 static void
4666 vect_emit_reduction_init_stmts (loop_vec_info loop_vinfo,
4667 stmt_vec_info reduc_info, gimple *seq)
4669 if (reduc_info->reused_accumulator)
4671 /* When reusing an accumulator from the main loop, we only need
4672 initialization instructions if the main loop can be skipped.
4673 In that case, emit the initialization instructions at the end
4674 of the guard block that does the skip. */
4675 edge skip_edge = loop_vinfo->skip_main_loop_edge;
4676 gcc_assert (skip_edge);
4677 gimple_stmt_iterator gsi = gsi_last_bb (skip_edge->src);
4678 gsi_insert_seq_before (&gsi, seq, GSI_SAME_STMT);
4680 else
4682 /* The normal case: emit the initialization instructions on the
4683 preheader edge. */
4684 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
4685 gsi_insert_seq_on_edge_immediate (loop_preheader_edge (loop), seq);
4689 /* Function get_initial_def_for_reduction
4691 Input:
4692 REDUC_INFO - the info_for_reduction
4693 INIT_VAL - the initial value of the reduction variable
4694 NEUTRAL_OP - a value that has no effect on the reduction, as per
4695 neutral_op_for_reduction
4697 Output:
4698 Return a vector variable, initialized according to the operation that
4699 STMT_VINFO performs. This vector will be used as the initial value
4700 of the vector of partial results.
4702 The value we need is a vector in which element 0 has value INIT_VAL
4703 and every other element has value NEUTRAL_OP. */
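/* For instance, a summation with INIT_VAL 5, NEUTRAL_OP 0 and a
   four-element vector type yields {5, 0, 0, 0}; if INIT_VAL equals
   NEUTRAL_OP the result degenerates to a simple splat.  */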
4705 static tree
4706 get_initial_def_for_reduction (loop_vec_info loop_vinfo,
4707 stmt_vec_info reduc_info,
4708 tree init_val, tree neutral_op)
4710 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
4711 tree scalar_type = TREE_TYPE (init_val);
4712 tree vectype = get_vectype_for_scalar_type (loop_vinfo, scalar_type);
4713 tree init_def;
4714 gimple_seq stmts = NULL;
4716 gcc_assert (vectype);
4718 gcc_assert (POINTER_TYPE_P (scalar_type) || INTEGRAL_TYPE_P (scalar_type)
4719 || SCALAR_FLOAT_TYPE_P (scalar_type));
4721 gcc_assert (nested_in_vect_loop_p (loop, reduc_info)
4722 || loop == (gimple_bb (reduc_info->stmt))->loop_father);
4724 if (operand_equal_p (init_val, neutral_op))
4726 /* If both elements are equal then the vector described above is
4727 just a splat. */
4728 neutral_op = gimple_convert (&stmts, TREE_TYPE (vectype), neutral_op);
4729 init_def = gimple_build_vector_from_val (&stmts, vectype, neutral_op);
4731 else
4733 neutral_op = gimple_convert (&stmts, TREE_TYPE (vectype), neutral_op);
4734 init_val = gimple_convert (&stmts, TREE_TYPE (vectype), init_val);
4735 if (!TYPE_VECTOR_SUBPARTS (vectype).is_constant ())
4737 /* Construct a splat of NEUTRAL_OP and insert INIT_VAL into
4738 element 0. */
4739 init_def = gimple_build_vector_from_val (&stmts, vectype,
4740 neutral_op);
4741 init_def = gimple_build (&stmts, CFN_VEC_SHL_INSERT,
4742 vectype, init_def, init_val);
4744 else
4746 /* Build {INIT_VAL, NEUTRAL_OP, NEUTRAL_OP, ...}. */
4747 tree_vector_builder elts (vectype, 1, 2);
4748 elts.quick_push (init_val);
4749 elts.quick_push (neutral_op);
4750 init_def = gimple_build_vector (&stmts, &elts);
4754 if (stmts)
4755 vect_emit_reduction_init_stmts (loop_vinfo, reduc_info, stmts);
4756 return init_def;
4759 /* Get at the initial defs for the reduction PHIs for REDUC_INFO,
4760 which performs a reduction involving GROUP_SIZE scalar statements.
4761 NUMBER_OF_VECTORS is the number of vector defs to create. If NEUTRAL_OP
4762 is nonnull, introducing extra elements of that value will not change the
4763 result. */
4765 static void
4766 get_initial_defs_for_reduction (loop_vec_info loop_vinfo,
4767 stmt_vec_info reduc_info,
4768 vec<tree> *vec_oprnds,
4769 unsigned int number_of_vectors,
4770 unsigned int group_size, tree neutral_op)
4772 vec<tree> &initial_values = reduc_info->reduc_initial_values;
4773 unsigned HOST_WIDE_INT nunits;
4774 unsigned j, number_of_places_left_in_vector;
4775 tree vector_type = STMT_VINFO_VECTYPE (reduc_info);
4776 unsigned int i;
4778 gcc_assert (group_size == initial_values.length () || neutral_op);
4780 /* NUMBER_OF_COPIES is the number of times we need to use the same values in
4781 created vectors. It is greater than 1 if unrolling is performed.
4783 For example, we have two scalar operands, s1 and s2 (e.g., group of
4784 strided accesses of size two), while NUNITS is four (i.e., four scalars
4785 of this type can be packed in a vector). The output vector will contain
4786 two copies of each scalar operand: {s1, s2, s1, s2}. (NUMBER_OF_COPIES
4787 will be 2).
4789 If REDUC_GROUP_SIZE > NUNITS, the scalars will be split into several
4790 vectors containing the operands.
4792 For example, NUNITS is four as before, and the group size is 8
4793 (s1, s2, ..., s8). We will create two vectors {s1, s2, s3, s4} and
4794 {s5, s6, s7, s8}. */
4796 if (!TYPE_VECTOR_SUBPARTS (vector_type).is_constant (&nunits))
4797 nunits = group_size;
4799 number_of_places_left_in_vector = nunits;
4800 bool constant_p = true;
4801 tree_vector_builder elts (vector_type, nunits, 1);
4802 elts.quick_grow (nunits);
4803 gimple_seq ctor_seq = NULL;
4804 for (j = 0; j < nunits * number_of_vectors; ++j)
4806 tree op;
4807 i = j % group_size;
4809 /* Get the def before the loop. In a reduction chain we have only
4810 one initial value; otherwise we have as many as there are PHIs in the group. */
4811 if (i >= initial_values.length () || (j > i && neutral_op))
4812 op = neutral_op;
4813 else
4814 op = initial_values[i];
4816 /* Create 'vect_ = {op0,op1,...,opn}'. */
4817 number_of_places_left_in_vector--;
4818 elts[nunits - number_of_places_left_in_vector - 1] = op;
4819 if (!CONSTANT_CLASS_P (op))
4820 constant_p = false;
4822 if (number_of_places_left_in_vector == 0)
4824 tree init;
4825 if (constant_p && !neutral_op
4826 ? multiple_p (TYPE_VECTOR_SUBPARTS (vector_type), nunits)
4827 : known_eq (TYPE_VECTOR_SUBPARTS (vector_type), nunits))
4828 /* Build the vector directly from ELTS. */
4829 init = gimple_build_vector (&ctor_seq, &elts);
4830 else if (neutral_op)
4832 /* Build a vector of the neutral value and shift the
4833 other elements into place. */
4834 init = gimple_build_vector_from_val (&ctor_seq, vector_type,
4835 neutral_op);
4836 int k = nunits;
4837 while (k > 0 && elts[k - 1] == neutral_op)
4838 k -= 1;
4839 while (k > 0)
4841 k -= 1;
4842 init = gimple_build (&ctor_seq, CFN_VEC_SHL_INSERT,
4843 vector_type, init, elts[k]);
4846 else
4848 /* First time round, duplicate ELTS to fill the
4849 required number of vectors. */
4850 duplicate_and_interleave (loop_vinfo, &ctor_seq, vector_type,
4851 elts, number_of_vectors, *vec_oprnds);
4852 break;
4854 vec_oprnds->quick_push (init);
4856 number_of_places_left_in_vector = nunits;
4857 elts.new_vector (vector_type, nunits, 1);
4858 elts.quick_grow (nunits);
4859 constant_p = true;
4862 if (ctor_seq != NULL)
4863 vect_emit_reduction_init_stmts (loop_vinfo, reduc_info, ctor_seq);
4866 /* For a statement STMT_INFO taking part in a reduction operation return
4867 the stmt_vec_info the meta information is stored on. */
4869 stmt_vec_info
4870 info_for_reduction (vec_info *vinfo, stmt_vec_info stmt_info)
4872 stmt_info = vect_orig_stmt (stmt_info);
4873 gcc_assert (STMT_VINFO_REDUC_DEF (stmt_info));
4874 if (!is_a <gphi *> (stmt_info->stmt)
4875 || !VECTORIZABLE_CYCLE_DEF (STMT_VINFO_DEF_TYPE (stmt_info)))
4876 stmt_info = STMT_VINFO_REDUC_DEF (stmt_info);
4877 gphi *phi = as_a <gphi *> (stmt_info->stmt);
4878 if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_double_reduction_def)
4880 if (gimple_phi_num_args (phi) == 1)
4881 stmt_info = STMT_VINFO_REDUC_DEF (stmt_info);
4883 else if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_nested_cycle)
4885 stmt_vec_info info = vinfo->lookup_def (vect_phi_initial_value (phi));
4886 if (info && STMT_VINFO_DEF_TYPE (info) == vect_double_reduction_def)
4887 stmt_info = info;
4889 return stmt_info;
4892 /* See if LOOP_VINFO is an epilogue loop whose main loop had a reduction that
4893 REDUC_INFO can build on. Adjust REDUC_INFO and return true if so, otherwise
4894 return false. */
4896 static bool
4897 vect_find_reusable_accumulator (loop_vec_info loop_vinfo,
4898 stmt_vec_info reduc_info)
4900 loop_vec_info main_loop_vinfo = LOOP_VINFO_ORIG_LOOP_INFO (loop_vinfo);
4901 if (!main_loop_vinfo)
4902 return false;
4904 if (STMT_VINFO_REDUC_TYPE (reduc_info) != TREE_CODE_REDUCTION)
4905 return false;
4907 unsigned int num_phis = reduc_info->reduc_initial_values.length ();
4908 auto_vec<tree, 16> main_loop_results (num_phis);
4909 auto_vec<tree, 16> initial_values (num_phis);
4910 if (edge main_loop_edge = loop_vinfo->main_loop_edge)
4912 /* The epilogue loop can be entered either from the main loop or
4913 from an earlier guard block. */
4914 edge skip_edge = loop_vinfo->skip_main_loop_edge;
4915 for (tree incoming_value : reduc_info->reduc_initial_values)
4917 /* Look for:
4919 INCOMING_VALUE = phi<MAIN_LOOP_RESULT(main loop),
4920 INITIAL_VALUE(guard block)>. */
4921 gcc_assert (TREE_CODE (incoming_value) == SSA_NAME);
4923 gphi *phi = as_a <gphi *> (SSA_NAME_DEF_STMT (incoming_value));
4924 gcc_assert (gimple_bb (phi) == main_loop_edge->dest);
4926 tree from_main_loop = PHI_ARG_DEF_FROM_EDGE (phi, main_loop_edge);
4927 tree from_skip = PHI_ARG_DEF_FROM_EDGE (phi, skip_edge);
4929 main_loop_results.quick_push (from_main_loop);
4930 initial_values.quick_push (from_skip);
4933 else
4934 /* The main loop dominates the epilogue loop. */
4935 main_loop_results.splice (reduc_info->reduc_initial_values);
4937 /* See if the main loop has the kind of accumulator we need. */
4938 vect_reusable_accumulator *accumulator
4939 = main_loop_vinfo->reusable_accumulators.get (main_loop_results[0]);
4940 if (!accumulator
4941 || num_phis != accumulator->reduc_info->reduc_scalar_results.length ()
4942 || !std::equal (main_loop_results.begin (), main_loop_results.end (),
4943 accumulator->reduc_info->reduc_scalar_results.begin ()))
4944 return false;
4946 /* Handle the case where we can reduce wider vectors to narrower ones. */
4947 tree vectype = STMT_VINFO_VECTYPE (reduc_info);
4948 tree old_vectype = TREE_TYPE (accumulator->reduc_input);
4949 if (!constant_multiple_p (TYPE_VECTOR_SUBPARTS (old_vectype),
4950 TYPE_VECTOR_SUBPARTS (vectype)))
4951 return false;
4953 /* Non-SLP reductions might apply an adjustment after the reduction
4954 operation, in order to simplify the initialization of the accumulator.
4955 If the epilogue loop carries on from where the main loop left off,
4956 it should apply the same adjustment to the final reduction result.
4958 If the epilogue loop can also be entered directly (rather than via
4959 the main loop), we need to be able to handle that case in the same way,
4960 with the same adjustment. (In principle we could add a PHI node
4961 to select the correct adjustment, but in practice that shouldn't be
4962 necessary.) */
4963 tree main_adjustment
4964 = STMT_VINFO_REDUC_EPILOGUE_ADJUSTMENT (accumulator->reduc_info);
4965 if (loop_vinfo->main_loop_edge && main_adjustment)
4967 gcc_assert (num_phis == 1);
4968 tree initial_value = initial_values[0];
4969 /* Check that we can use INITIAL_VALUE as the adjustment and
4970 initialize the accumulator with a neutral value instead. */
4971 if (!operand_equal_p (initial_value, main_adjustment))
4972 return false;
4973 tree_code code = STMT_VINFO_REDUC_CODE (reduc_info);
4974 initial_values[0] = neutral_op_for_reduction (TREE_TYPE (initial_value),
4975 code, initial_value);
4977 STMT_VINFO_REDUC_EPILOGUE_ADJUSTMENT (reduc_info) = main_adjustment;
4978 reduc_info->reduc_initial_values.truncate (0);
4979 reduc_info->reduc_initial_values.splice (initial_values);
4980 reduc_info->reused_accumulator = accumulator;
4981 return true;
4984 /* Reduce the vector VEC_DEF down to VECTYPE with reduction operation
4985 CODE, appending the stmts to SEQ. Returns a vector def of VECTYPE. */
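/* For example, reducing a V8SI VEC_DEF to a V4SI VECTYPE with PLUS_EXPR
   extracts the low and high V4SI halves and adds them; the halving
   repeats until the requested number of elements is reached.  */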
4987 static tree
4988 vect_create_partial_epilog (tree vec_def, tree vectype, enum tree_code code,
4989 gimple_seq *seq)
4991 unsigned nunits = TYPE_VECTOR_SUBPARTS (TREE_TYPE (vec_def)).to_constant ();
4992 unsigned nunits1 = TYPE_VECTOR_SUBPARTS (vectype).to_constant ();
4993 tree stype = TREE_TYPE (vectype);
4994 tree new_temp = vec_def;
4995 while (nunits > nunits1)
4997 nunits /= 2;
4998 tree vectype1 = get_related_vectype_for_scalar_type (TYPE_MODE (vectype),
4999 stype, nunits);
5000 unsigned int bitsize = tree_to_uhwi (TYPE_SIZE (vectype1));
5002 /* The target has to make sure we support lowpart/highpart
5003 extraction, either via direct vector extract or through
5004 an integer mode punning. */
5005 tree dst1, dst2;
5006 gimple *epilog_stmt;
5007 if (convert_optab_handler (vec_extract_optab,
5008 TYPE_MODE (TREE_TYPE (new_temp)),
5009 TYPE_MODE (vectype1))
5010 != CODE_FOR_nothing)
5012 /* Extract sub-vectors directly once vec_extract becomes
5013 a conversion optab. */
5014 dst1 = make_ssa_name (vectype1);
5015 epilog_stmt
5016 = gimple_build_assign (dst1, BIT_FIELD_REF,
5017 build3 (BIT_FIELD_REF, vectype1,
5018 new_temp, TYPE_SIZE (vectype1),
5019 bitsize_int (0)));
5020 gimple_seq_add_stmt_without_update (seq, epilog_stmt);
5021 dst2 = make_ssa_name (vectype1);
5022 epilog_stmt
5023 = gimple_build_assign (dst2, BIT_FIELD_REF,
5024 build3 (BIT_FIELD_REF, vectype1,
5025 new_temp, TYPE_SIZE (vectype1),
5026 bitsize_int (bitsize)));
5027 gimple_seq_add_stmt_without_update (seq, epilog_stmt);
5029 else
5031 /* Extract via punning to appropriately sized integer mode
5032 vector. */
5033 tree eltype = build_nonstandard_integer_type (bitsize, 1);
5034 tree etype = build_vector_type (eltype, 2);
5035 gcc_assert (convert_optab_handler (vec_extract_optab,
5036 TYPE_MODE (etype),
5037 TYPE_MODE (eltype))
5038 != CODE_FOR_nothing);
5039 tree tem = make_ssa_name (etype);
5040 epilog_stmt = gimple_build_assign (tem, VIEW_CONVERT_EXPR,
5041 build1 (VIEW_CONVERT_EXPR,
5042 etype, new_temp));
5043 gimple_seq_add_stmt_without_update (seq, epilog_stmt);
5044 new_temp = tem;
5045 tem = make_ssa_name (eltype);
5046 epilog_stmt
5047 = gimple_build_assign (tem, BIT_FIELD_REF,
5048 build3 (BIT_FIELD_REF, eltype,
5049 new_temp, TYPE_SIZE (eltype),
5050 bitsize_int (0)));
5051 gimple_seq_add_stmt_without_update (seq, epilog_stmt);
5052 dst1 = make_ssa_name (vectype1);
5053 epilog_stmt = gimple_build_assign (dst1, VIEW_CONVERT_EXPR,
5054 build1 (VIEW_CONVERT_EXPR,
5055 vectype1, tem));
5056 gimple_seq_add_stmt_without_update (seq, epilog_stmt);
5057 tem = make_ssa_name (eltype);
5058 epilog_stmt
5059 = gimple_build_assign (tem, BIT_FIELD_REF,
5060 build3 (BIT_FIELD_REF, eltype,
5061 new_temp, TYPE_SIZE (eltype),
5062 bitsize_int (bitsize)));
5063 gimple_seq_add_stmt_without_update (seq, epilog_stmt);
5064 dst2 = make_ssa_name (vectype1);
5065 epilog_stmt = gimple_build_assign (dst2, VIEW_CONVERT_EXPR,
5066 build1 (VIEW_CONVERT_EXPR,
5067 vectype1, tem));
5068 gimple_seq_add_stmt_without_update (seq, epilog_stmt);
5071 new_temp = make_ssa_name (vectype1);
5072 epilog_stmt = gimple_build_assign (new_temp, code, dst1, dst2);
5073 gimple_seq_add_stmt_without_update (seq, epilog_stmt);
5076 return new_temp;
5079 /* Function vect_create_epilog_for_reduction
5081 Create code at the loop-epilog to finalize the result of a reduction
5082 computation.
5084 STMT_INFO is the scalar reduction stmt that is being vectorized.
5085 SLP_NODE is an SLP node containing a group of reduction statements. The
5086 first one in this group is STMT_INFO.
5087 SLP_NODE_INSTANCE is the SLP node instance containing SLP_NODE.
5088 REDUC_INDEX says which rhs operand of STMT_INFO is the reduction phi
5089 (counting from 0).
5091 This function:
5092 1. Completes the reduction def-use cycles.
5093 2. "Reduces" each vector of partial results VECT_DEFS into a single result,
5094 by calling the function specified by REDUC_FN if available, or by
5095 other means (whole-vector shifts or a scalar loop).
5096 The function also creates a new phi node at the loop exit to preserve
5097 loop-closed form, as illustrated below.
5099 The flow at the entry to this function:
5101 loop:
5102 vec_def = phi <vec_init, null> # REDUCTION_PHI
5103 VECT_DEF = vector_stmt # vectorized form of STMT_INFO
5104 s_loop = scalar_stmt # (scalar) STMT_INFO
5105 loop_exit:
5106 s_out0 = phi <s_loop> # (scalar) EXIT_PHI
5107 use <s_out0>
5108 use <s_out0>
5110 The above is transformed by this function into:
5112 loop:
5113 vec_def = phi <vec_init, VECT_DEF> # REDUCTION_PHI
5114 VECT_DEF = vector_stmt # vectorized form of STMT_INFO
5115 s_loop = scalar_stmt # (scalar) STMT_INFO
5116 loop_exit:
5117 s_out0 = phi <s_loop> # (scalar) EXIT_PHI
5118 v_out1 = phi <VECT_DEF> # NEW_EXIT_PHI
5119 v_out2 = reduce <v_out1>
5120 s_out3 = extract_field <v_out2, 0>
5121 s_out4 = adjust_result <s_out3>
5122 use <s_out4>
5123 use <s_out4>
5126 static void
5127 vect_create_epilog_for_reduction (loop_vec_info loop_vinfo,
5128 stmt_vec_info stmt_info,
5129 slp_tree slp_node,
5130 slp_instance slp_node_instance)
5132 stmt_vec_info reduc_info = info_for_reduction (loop_vinfo, stmt_info);
5133 gcc_assert (reduc_info->is_reduc_info);
5134 /* For double reductions we need to get at the inner loop reduction
5135 stmt, which has the meta info attached. Our stmt_info is that of the
5136 loop-closed PHI of the inner loop, which we remember as the
5137 def for the reduction PHI generation. */
5138 bool double_reduc = false;
5139 stmt_vec_info rdef_info = stmt_info;
5140 if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_double_reduction_def)
5142 gcc_assert (!slp_node);
5143 double_reduc = true;
5144 stmt_info = loop_vinfo->lookup_def (gimple_phi_arg_def
5145 (stmt_info->stmt, 0));
5146 stmt_info = vect_stmt_to_vectorize (stmt_info);
5148 gphi *reduc_def_stmt
5149 = as_a <gphi *> (STMT_VINFO_REDUC_DEF (vect_orig_stmt (stmt_info))->stmt);
5150 enum tree_code code = STMT_VINFO_REDUC_CODE (reduc_info);
5151 internal_fn reduc_fn = STMT_VINFO_REDUC_FN (reduc_info);
5152 tree vectype;
5153 machine_mode mode;
5154 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo), *outer_loop = NULL;
5155 basic_block exit_bb;
5156 tree scalar_dest;
5157 tree scalar_type;
5158 gimple *new_phi = NULL, *phi;
5159 gimple_stmt_iterator exit_gsi;
5160 tree new_temp = NULL_TREE, new_name, new_scalar_dest;
5161 gimple *epilog_stmt = NULL;
5162 gimple *exit_phi;
5163 tree bitsize;
5164 tree def;
5165 tree orig_name, scalar_result;
5166 imm_use_iterator imm_iter, phi_imm_iter;
5167 use_operand_p use_p, phi_use_p;
5168 gimple *use_stmt;
5169 auto_vec<tree> reduc_inputs;
5170 int j, i;
5171 vec<tree> &scalar_results = reduc_info->reduc_scalar_results;
5172 unsigned int group_size = 1, k;
5173 auto_vec<gimple *> phis;
5174 /* SLP reduction without reduction chain, e.g.,
5175 # a1 = phi <a2, a0>
5176 # b1 = phi <b2, b0>
5177 a2 = operation (a1)
5178 b2 = operation (b1) */
5179 bool slp_reduc = (slp_node && !REDUC_GROUP_FIRST_ELEMENT (stmt_info));
5180 bool direct_slp_reduc;
5181 tree induction_index = NULL_TREE;
5183 if (slp_node)
5184 group_size = SLP_TREE_LANES (slp_node);
5186 if (nested_in_vect_loop_p (loop, stmt_info))
5188 outer_loop = loop;
5189 loop = loop->inner;
5190 gcc_assert (!slp_node && double_reduc);
5193 vectype = STMT_VINFO_REDUC_VECTYPE (reduc_info);
5194 gcc_assert (vectype);
5195 mode = TYPE_MODE (vectype);
5197 tree induc_val = NULL_TREE;
5198 tree adjustment_def = NULL;
5199 if (slp_node)
5201 else
5203 /* Optimize: for induction condition reduction, if we can't use zero
5204 for induc_val, use initial_def. */
5205 if (STMT_VINFO_REDUC_TYPE (reduc_info) == INTEGER_INDUC_COND_REDUCTION)
5206 induc_val = STMT_VINFO_VEC_INDUC_COND_INITIAL_VAL (reduc_info);
5207 else if (double_reduc)
5209 else
5210 adjustment_def = STMT_VINFO_REDUC_EPILOGUE_ADJUSTMENT (reduc_info);
5213 stmt_vec_info single_live_out_stmt[] = { stmt_info };
5214 array_slice<const stmt_vec_info> live_out_stmts = single_live_out_stmt;
5215 if (slp_reduc)
5216 /* All statements produce live-out values. */
5217 live_out_stmts = SLP_TREE_SCALAR_STMTS (slp_node);
5218 else if (slp_node)
5219 /* The last statement in the reduction chain produces the live-out
5220 value. */
5221 single_live_out_stmt[0] = SLP_TREE_SCALAR_STMTS (slp_node)[group_size - 1];
5223 unsigned vec_num;
5224 int ncopies;
5225 if (slp_node)
5227 vec_num = SLP_TREE_VEC_STMTS (slp_node_instance->reduc_phis).length ();
5228 ncopies = 1;
5230 else
5232 stmt_vec_info reduc_info = loop_vinfo->lookup_stmt (reduc_def_stmt);
5233 vec_num = 1;
5234 ncopies = STMT_VINFO_VEC_STMTS (reduc_info).length ();
5237 /* For cond reductions we want to create a new vector (INDEX_COND_EXPR)
5238 which is updated with the current index of the loop for every match of
5239 the original loop's cond_expr (VEC_STMT). This results in a vector
5240 that records, for each vector lane, the last time the condition passed.
5241 The first match will be a 1 to allow 0 to be used for non-matching
5242 indexes. If there are no matches at all then the vector will be all
5243 zeroes.
5245 PR92772: This algorithm is broken for architectures that support
5246 masked vectors, but do not provide fold_extract_last. */
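   /* A worked example with assumed values, for a 4-lane vector: the IV
      created below starts at {1, 2, 3, 4} and steps by 4 each iteration.
      If the condition matches in lanes 1 and 3 on the first iteration and
      only in lane 1 on the second, the phi evolves as

        start              {0, 0, 0, 0}
        after iteration 0  {0, 2, 0, 4}
        after iteration 1  {0, 6, 0, 4}

      so taking the maximum (6) in the epilogue identifies the lane and
      iteration of the last match.  */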
5247 if (STMT_VINFO_REDUC_TYPE (reduc_info) == COND_REDUCTION)
5249 auto_vec<std::pair<tree, bool>, 2> ccompares;
5250 stmt_vec_info cond_info = STMT_VINFO_REDUC_DEF (reduc_info);
5251 cond_info = vect_stmt_to_vectorize (cond_info);
5252 while (cond_info != reduc_info)
5254 if (gimple_assign_rhs_code (cond_info->stmt) == COND_EXPR)
5256 gimple *vec_stmt = STMT_VINFO_VEC_STMTS (cond_info)[0];
5257 gcc_assert (gimple_assign_rhs_code (vec_stmt) == VEC_COND_EXPR);
5258 ccompares.safe_push
5259 (std::make_pair (unshare_expr (gimple_assign_rhs1 (vec_stmt)),
5260 STMT_VINFO_REDUC_IDX (cond_info) == 2));
5262 cond_info
5263 = loop_vinfo->lookup_def (gimple_op (cond_info->stmt,
5264 1 + STMT_VINFO_REDUC_IDX
5265 (cond_info)));
5266 cond_info = vect_stmt_to_vectorize (cond_info);
5268 gcc_assert (ccompares.length () != 0);
5270 tree indx_before_incr, indx_after_incr;
5271 poly_uint64 nunits_out = TYPE_VECTOR_SUBPARTS (vectype);
5272 int scalar_precision
5273 = GET_MODE_PRECISION (SCALAR_TYPE_MODE (TREE_TYPE (vectype)));
5274 tree cr_index_scalar_type = make_unsigned_type (scalar_precision);
5275 tree cr_index_vector_type = get_related_vectype_for_scalar_type
5276 (TYPE_MODE (vectype), cr_index_scalar_type,
5277 TYPE_VECTOR_SUBPARTS (vectype));
5279 /* First we create a simple vector induction variable which starts
5280 with the values {1,2,3,...} (SERIES_VECT) and increments by the
5281 vector size (STEP). */
5283 /* Create a {1,2,3,...} vector. */
5284 tree series_vect = build_index_vector (cr_index_vector_type, 1, 1);
5286 /* Create a vector of the step value. */
5287 tree step = build_int_cst (cr_index_scalar_type, nunits_out);
5288 tree vec_step = build_vector_from_val (cr_index_vector_type, step);
5290 /* Create an induction variable. */
5291 gimple_stmt_iterator incr_gsi;
5292 bool insert_after;
5293 standard_iv_increment_position (loop, &incr_gsi, &insert_after);
5294 create_iv (series_vect, vec_step, NULL_TREE, loop, &incr_gsi,
5295 insert_after, &indx_before_incr, &indx_after_incr);
5297 /* Next create a new phi node vector (NEW_PHI_TREE) which starts
5298 filled with zeros (VEC_ZERO). */
5300 /* Create a vector of 0s. */
5301 tree zero = build_zero_cst (cr_index_scalar_type);
5302 tree vec_zero = build_vector_from_val (cr_index_vector_type, zero);
5304 /* Create a vector phi node. */
5305 tree new_phi_tree = make_ssa_name (cr_index_vector_type);
5306 new_phi = create_phi_node (new_phi_tree, loop->header);
5307 add_phi_arg (as_a <gphi *> (new_phi), vec_zero,
5308 loop_preheader_edge (loop), UNKNOWN_LOCATION);
5310 /* Now take the condition from the loop's original cond_exprs
5311 and produce a new cond_expr (INDEX_COND_EXPR) which for
5312 every match uses values from the induction variable
5313 (INDEX_BEFORE_INCR) and otherwise uses values from the phi node
5314 (NEW_PHI_TREE).
5315 Finally, we update the phi (NEW_PHI_TREE) to take the value of
5316 the new cond_expr (INDEX_COND_EXPR). */
5317 gimple_seq stmts = NULL;
5318 for (int i = ccompares.length () - 1; i != -1; --i)
5320 tree ccompare = ccompares[i].first;
5321 if (ccompares[i].second)
5322 new_phi_tree = gimple_build (&stmts, VEC_COND_EXPR,
5323 cr_index_vector_type,
5324 ccompare,
5325 indx_before_incr, new_phi_tree);
5326 else
5327 new_phi_tree = gimple_build (&stmts, VEC_COND_EXPR,
5328 cr_index_vector_type,
5329 ccompare,
5330 new_phi_tree, indx_before_incr);
5332 gsi_insert_seq_before (&incr_gsi, stmts, GSI_SAME_STMT);
5334 /* Update the phi with the vec cond. */
5335 induction_index = new_phi_tree;
5336 add_phi_arg (as_a <gphi *> (new_phi), induction_index,
5337 loop_latch_edge (loop), UNKNOWN_LOCATION);
5340 /* 2. Create epilog code.
5341 The reduction epilog code operates across the elements of the vector
5342 of partial results computed by the vectorized loop.
5343 The reduction epilog code consists of:
5345 step 1: compute the scalar result in a vector (v_out2)
5346 step 2: extract the scalar result (s_out3) from the vector (v_out2)
5347 step 3: adjust the scalar result (s_out3) if needed.
5349 Step 1 can be accomplished using one of the following three schemes
5350 (scheme 1) using reduc_fn, if available.
5351 (scheme 2) using whole-vector shifts, if available.
5352 (scheme 3) using a scalar loop. In this case steps 1+2 above are
5353 combined.
5355 The overall epilog code looks like this:
5357 s_out0 = phi <s_loop> # original EXIT_PHI
5358 v_out1 = phi <VECT_DEF> # NEW_EXIT_PHI
5359 v_out2 = reduce <v_out1> # step 1
5360 s_out3 = extract_field <v_out2, 0> # step 2
5361 s_out4 = adjust_result <s_out3> # step 3
5363 (step 3 is optional, and steps 1 and 2 may be combined).
5364 Lastly, the uses of s_out0 are replaced by s_out4. */
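   /* Purely as an illustration (assuming a 4-lane PLUS reduction):
      scheme 1 emits a single internal call such as
        v_out2 = .REDUC_PLUS (v_out1)
      scheme 2 emits log2(4) shift/op pairs
        t = vec_shift <v_out1, 2>;  v = v_out1 + t;
        t = vec_shift <v, 1>;       v = v + t;
      and scheme 3 extracts the four elements and adds them with scalar
      code, combining steps 1 and 2.  */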
5367 /* 2.1 Create new loop-exit-phis to preserve loop-closed form:
5368 v_out1 = phi <VECT_DEF>
5369 Store them in NEW_PHIS. */
5370 if (double_reduc)
5371 loop = outer_loop;
5372 exit_bb = single_exit (loop)->dest;
5373 exit_gsi = gsi_after_labels (exit_bb);
5374 reduc_inputs.create (slp_node ? vec_num : ncopies);
5375 for (unsigned i = 0; i < vec_num; i++)
5377 gimple_seq stmts = NULL;
5378 if (slp_node)
5379 def = vect_get_slp_vect_def (slp_node, i);
5380 else
5381 def = gimple_get_lhs (STMT_VINFO_VEC_STMTS (rdef_info)[0]);
5382 for (j = 0; j < ncopies; j++)
5384 tree new_def = copy_ssa_name (def);
5385 phi = create_phi_node (new_def, exit_bb);
5386 if (j)
5387 def = gimple_get_lhs (STMT_VINFO_VEC_STMTS (rdef_info)[j]);
5388 SET_PHI_ARG_DEF (phi, single_exit (loop)->dest_idx, def);
5389 new_def = gimple_convert (&stmts, vectype, new_def);
5390 reduc_inputs.quick_push (new_def);
5392 gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
5395 /* 2.2 Get the relevant tree-code to use in the epilog for schemes 2,3
5396 (i.e. when reduc_fn is not available) and in the final adjustment
5397 code (if needed). Also get the original scalar reduction variable as
5398 defined in the loop. In case STMT is a "pattern-stmt" (i.e. it
5399 represents a reduction pattern), the tree-code and scalar-def are
5400 taken from the original stmt that the pattern-stmt (STMT) replaces.
5401 Otherwise (it is a regular reduction) - the tree-code and scalar-def
5402 are taken from STMT. */
5404 stmt_vec_info orig_stmt_info = vect_orig_stmt (stmt_info);
5405 if (orig_stmt_info != stmt_info)
5407 /* Reduction pattern */
5408 gcc_assert (STMT_VINFO_IN_PATTERN_P (orig_stmt_info));
5409 gcc_assert (STMT_VINFO_RELATED_STMT (orig_stmt_info) == stmt_info);
5412 scalar_dest = gimple_get_lhs (orig_stmt_info->stmt);
5413 scalar_type = TREE_TYPE (scalar_dest);
5414 scalar_results.create (group_size);
5415 new_scalar_dest = vect_create_destination_var (scalar_dest, NULL);
5416 bitsize = TYPE_SIZE (scalar_type);
5418 /* True if we should implement SLP_REDUC using native reduction operations
5419 instead of scalar operations. */
5420 direct_slp_reduc = (reduc_fn != IFN_LAST
5421 && slp_reduc
5422 && !TYPE_VECTOR_SUBPARTS (vectype).is_constant ());
5424 /* In case of reduction chain, e.g.,
5425 # a1 = phi <a3, a0>
5426 a2 = operation (a1)
5427 a3 = operation (a2),
5429 we may end up with more than one vector result. Here we reduce them
5430 to one vector.
5432 The same is true if we couldn't use a single def-use cycle. */
5433 if (REDUC_GROUP_FIRST_ELEMENT (stmt_info)
5434 || direct_slp_reduc
5435 || ncopies > 1)
5437 gimple_seq stmts = NULL;
5438 tree single_input = reduc_inputs[0];
5439 for (k = 1; k < reduc_inputs.length (); k++)
5440 single_input = gimple_build (&stmts, code, vectype,
5441 single_input, reduc_inputs[k]);
5442 gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
5444 reduc_inputs.truncate (0);
5445 reduc_inputs.safe_push (single_input);
5448 tree orig_reduc_input = reduc_inputs[0];
5450 /* If this loop is an epilogue loop that can be skipped after the
5451 main loop, we can only share a reduction operation between the
5452 main loop and the epilogue if we put it at the target of the
5453 skip edge.
5455 We can still reuse accumulators if this check fails. Doing so has
5456 the minor(?) benefit of making the epilogue loop's scalar result
5457 independent of the main loop's scalar result. */
5458 bool unify_with_main_loop_p = false;
5459 if (reduc_info->reused_accumulator
5460 && loop_vinfo->skip_this_loop_edge
5461 && single_succ_p (exit_bb)
5462 && single_succ (exit_bb) == loop_vinfo->skip_this_loop_edge->dest)
5464 unify_with_main_loop_p = true;
5466 basic_block reduc_block = loop_vinfo->skip_this_loop_edge->dest;
5467 reduc_inputs[0] = make_ssa_name (vectype);
5468 gphi *new_phi = create_phi_node (reduc_inputs[0], reduc_block);
5469 add_phi_arg (new_phi, orig_reduc_input, single_succ_edge (exit_bb),
5470 UNKNOWN_LOCATION);
5471 add_phi_arg (new_phi, reduc_info->reused_accumulator->reduc_input,
5472 loop_vinfo->skip_this_loop_edge, UNKNOWN_LOCATION);
5473 exit_gsi = gsi_after_labels (reduc_block);
5476 /* Shouldn't be used beyond this point. */
5477 exit_bb = nullptr;
5479 if (STMT_VINFO_REDUC_TYPE (reduc_info) == COND_REDUCTION
5480 && reduc_fn != IFN_LAST)
5482 /* For condition reductions, we have a vector (REDUC_INPUTS 0) containing
5483 various data values where the condition matched and another vector
5484 (INDUCTION_INDEX) containing all the indexes of those matches. We
5485 need to extract the last matching index (which will be the index with
5486 the highest value) and use this to index into the data vector.
5487 For the case where there were no matches, the data vector will contain
5488 all default values and the index vector will be all zeros. */
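   /* A small worked example with assumed contents: if
        reduc_inputs[0] = {d0, d1, d2, d3}
        induction_index = {0, 6, 0, 4}
      the code below computes max_index = 6, the EQ compare selects
      {0, d1, 0, 0}, and the final MAX reduction over the unsigned view
      of that vector yields d1, the data value stored by the last
      match.  */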
5490 /* Get various versions of the type of the vector of indexes. */
5491 tree index_vec_type = TREE_TYPE (induction_index);
5492 gcc_checking_assert (TYPE_UNSIGNED (index_vec_type));
5493 tree index_scalar_type = TREE_TYPE (index_vec_type);
5494 tree index_vec_cmp_type = truth_type_for (index_vec_type);
5496 /* Get an unsigned integer version of the type of the data vector. */
5497 int scalar_precision
5498 = GET_MODE_PRECISION (SCALAR_TYPE_MODE (scalar_type));
5499 tree scalar_type_unsigned = make_unsigned_type (scalar_precision);
5500 tree vectype_unsigned = get_same_sized_vectype (scalar_type_unsigned,
5501 vectype);
5503 /* First we need to create a vector (ZERO_VEC) of zeros and another
5504 vector (MAX_INDEX_VEC) filled with the last matching index, which we
5505 can create using a MAX reduction and then expanding.
5506 In the case where the loop never made any matches, the max index will
5507 be zero. */
5509 /* Vector of {0, 0, 0,...}. */
5510 tree zero_vec = build_zero_cst (vectype);
5512 /* Find maximum value from the vector of found indexes. */
5513 tree max_index = make_ssa_name (index_scalar_type);
5514 gcall *max_index_stmt = gimple_build_call_internal (IFN_REDUC_MAX,
5515 1, induction_index);
5516 gimple_call_set_lhs (max_index_stmt, max_index);
5517 gsi_insert_before (&exit_gsi, max_index_stmt, GSI_SAME_STMT);
5519 /* Vector of {max_index, max_index, max_index,...}. */
5520 tree max_index_vec = make_ssa_name (index_vec_type);
5521 tree max_index_vec_rhs = build_vector_from_val (index_vec_type,
5522 max_index);
5523 gimple *max_index_vec_stmt = gimple_build_assign (max_index_vec,
5524 max_index_vec_rhs);
5525 gsi_insert_before (&exit_gsi, max_index_vec_stmt, GSI_SAME_STMT);
5527 /* Next we compare the new vector (MAX_INDEX_VEC) full of max indexes
5528 with the vector (INDUCTION_INDEX) of found indexes, choosing values
5529 from the data vector (REDUC_INPUTS 0) for matches, 0 (ZERO_VEC)
5530 otherwise. Only one value should match, resulting in a vector
5531 (VEC_COND) with one data value and the rest zeros.
5532 In the case where the loop never made any matches, every index will
5533 match, resulting in a vector with all data values (which will all be
5534 the default value). */
5536 /* Compare the max index vector to the vector of found indexes to find
5537 the position of the max value. */
5538 tree vec_compare = make_ssa_name (index_vec_cmp_type);
5539 gimple *vec_compare_stmt = gimple_build_assign (vec_compare, EQ_EXPR,
5540 induction_index,
5541 max_index_vec);
5542 gsi_insert_before (&exit_gsi, vec_compare_stmt, GSI_SAME_STMT);
5544 /* Use the compare to choose either values from the data vector or
5545 zero. */
5546 tree vec_cond = make_ssa_name (vectype);
5547 gimple *vec_cond_stmt = gimple_build_assign (vec_cond, VEC_COND_EXPR,
5548 vec_compare,
5549 reduc_inputs[0],
5550 zero_vec);
5551 gsi_insert_before (&exit_gsi, vec_cond_stmt, GSI_SAME_STMT);
5553 /* Finally we need to extract the data value from the vector (VEC_COND)
5554 into a scalar (MATCHED_DATA_REDUC). Logically we want to do an OR
5555 reduction, but because this doesn't exist, we can use a MAX reduction
5556 instead. The data value might be signed or a float so we need to cast
5557 it first.
5558 In the case where the loop never made any matches, the data values are
5559 all identical, and so will reduce down correctly. */
5561 /* Make the matched data values unsigned. */
5562 tree vec_cond_cast = make_ssa_name (vectype_unsigned);
5563 tree vec_cond_cast_rhs = build1 (VIEW_CONVERT_EXPR, vectype_unsigned,
5564 vec_cond);
5565 gimple *vec_cond_cast_stmt = gimple_build_assign (vec_cond_cast,
5566 VIEW_CONVERT_EXPR,
5567 vec_cond_cast_rhs);
5568 gsi_insert_before (&exit_gsi, vec_cond_cast_stmt, GSI_SAME_STMT);
5570 /* Reduce down to a scalar value. */
5571 tree data_reduc = make_ssa_name (scalar_type_unsigned);
5572 gcall *data_reduc_stmt = gimple_build_call_internal (IFN_REDUC_MAX,
5573 1, vec_cond_cast);
5574 gimple_call_set_lhs (data_reduc_stmt, data_reduc);
5575 gsi_insert_before (&exit_gsi, data_reduc_stmt, GSI_SAME_STMT);
5577 /* Convert the reduced value back to the result type and set as the
5578 result. */
5579 gimple_seq stmts = NULL;
5580 new_temp = gimple_build (&stmts, VIEW_CONVERT_EXPR, scalar_type,
5581 data_reduc);
5582 gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
5583 scalar_results.safe_push (new_temp);
5585 else if (STMT_VINFO_REDUC_TYPE (reduc_info) == COND_REDUCTION
5586 && reduc_fn == IFN_LAST)
5588 /* Condition reduction without supported IFN_REDUC_MAX. Generate
5589 idx = 0;
5590 idx_val = induction_index[0];
5591 val = data_reduc[0];
5592 for (idx = 0, val = init, i = 0; i < nelts; ++i)
5593 if (induction_index[i] > idx_val)
5594 val = data_reduc[i], idx_val = induction_index[i];
5595 return val; */
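   /* Unrolled for an assumed two-element vector this amounts to
        val = induction_index[1] > induction_index[0]
              ? data_reduc[1] : data_reduc[0];
      with the MAX of the indexes skipped for the last element, where it
      is no longer needed.  */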
5597 tree data_eltype = TREE_TYPE (vectype);
5598 tree idx_eltype = TREE_TYPE (TREE_TYPE (induction_index));
5599 unsigned HOST_WIDE_INT el_size = tree_to_uhwi (TYPE_SIZE (idx_eltype));
5600 poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (TREE_TYPE (induction_index));
5601 /* Enforced by vectorizable_reduction, which ensures we have target
5602 support before allowing a conditional reduction on variable-length
5603 vectors. */
5604 unsigned HOST_WIDE_INT v_size = el_size * nunits.to_constant ();
5605 tree idx_val = NULL_TREE, val = NULL_TREE;
5606 for (unsigned HOST_WIDE_INT off = 0; off < v_size; off += el_size)
5608 tree old_idx_val = idx_val;
5609 tree old_val = val;
5610 idx_val = make_ssa_name (idx_eltype);
5611 epilog_stmt = gimple_build_assign (idx_val, BIT_FIELD_REF,
5612 build3 (BIT_FIELD_REF, idx_eltype,
5613 induction_index,
5614 bitsize_int (el_size),
5615 bitsize_int (off)));
5616 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5617 val = make_ssa_name (data_eltype);
5618 epilog_stmt = gimple_build_assign (val, BIT_FIELD_REF,
5619 build3 (BIT_FIELD_REF,
5620 data_eltype,
5621 reduc_inputs[0],
5622 bitsize_int (el_size),
5623 bitsize_int (off)));
5624 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5625 if (off != 0)
5627 tree new_idx_val = idx_val;
5628 if (off != v_size - el_size)
5630 new_idx_val = make_ssa_name (idx_eltype);
5631 epilog_stmt = gimple_build_assign (new_idx_val,
5632 MAX_EXPR, idx_val,
5633 old_idx_val);
5634 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5636 tree new_val = make_ssa_name (data_eltype);
5637 epilog_stmt = gimple_build_assign (new_val,
5638 COND_EXPR,
5639 build2 (GT_EXPR,
5640 boolean_type_node,
5641 idx_val,
5642 old_idx_val),
5643 val, old_val);
5644 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5645 idx_val = new_idx_val;
5646 val = new_val;
5649 /* Convert the reduced value back to the result type and set as the
5650 result. */
5651 gimple_seq stmts = NULL;
5652 val = gimple_convert (&stmts, scalar_type, val);
5653 gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
5654 scalar_results.safe_push (val);
5657 /* 2.3 Create the reduction code, using one of the three schemes described
5658 above. In SLP we simply need to extract all the elements from the
5659 vector (without reducing them), so we use scalar shifts. */
5660 else if (reduc_fn != IFN_LAST && !slp_reduc)
5662 tree tmp;
5663 tree vec_elem_type;
5665 /* Case 1: Create:
5666 v_out2 = reduc_expr <v_out1> */
5668 if (dump_enabled_p ())
5669 dump_printf_loc (MSG_NOTE, vect_location,
5670 "Reduce using direct vector reduction.\n");
5672 gimple_seq stmts = NULL;
5673 vec_elem_type = TREE_TYPE (vectype);
5674 new_temp = gimple_build (&stmts, as_combined_fn (reduc_fn),
5675 vec_elem_type, reduc_inputs[0]);
5676 new_temp = gimple_convert (&stmts, scalar_type, new_temp);
5677 gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
5679 if ((STMT_VINFO_REDUC_TYPE (reduc_info) == INTEGER_INDUC_COND_REDUCTION)
5680 && induc_val)
5682 /* Earlier we set the initial value to be a vector of induc_val
5683 values. Check the result and if it is induc_val then replace
5684 it with the original initial value, unless induc_val is
5685 already the same as initial_def.
5686 tree zcompare = build2 (EQ_EXPR, boolean_type_node, new_temp,
5687 induc_val);
5688 tree initial_def = reduc_info->reduc_initial_values[0];
5690 tmp = make_ssa_name (new_scalar_dest);
5691 epilog_stmt = gimple_build_assign (tmp, COND_EXPR, zcompare,
5692 initial_def, new_temp);
5693 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5694 new_temp = tmp;
5697 scalar_results.safe_push (new_temp);
5699 else if (direct_slp_reduc)
5701 /* Here we create one vector for each of the REDUC_GROUP_SIZE results,
5702 with the elements for other SLP statements replaced with the
5703 neutral value. We can then do a normal reduction on each vector. */
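   /* For example (layout assumed): with REDUC_GROUP_SIZE == 2 and
      reduc_inputs[0] = {a0, b0, a1, b1} the masked index vector built
      below is {0, 1, 0, 1}; for i == 0 the select keeps the "a" lanes and
      replaces the "b" lanes with the neutral value, so the full-vector
      reduction of the result gives the scalar value for the first SLP
      statement, and similarly for i == 1.  */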
5705 /* Enforced by vectorizable_reduction. */
5706 gcc_assert (reduc_inputs.length () == 1);
5707 gcc_assert (pow2p_hwi (group_size));
5709 gimple_seq seq = NULL;
5711 /* Build a vector {0, 1, 2, ...}, with the same number of elements
5712 and the same element size as VECTYPE. */
5713 tree index = build_index_vector (vectype, 0, 1);
5714 tree index_type = TREE_TYPE (index);
5715 tree index_elt_type = TREE_TYPE (index_type);
5716 tree mask_type = truth_type_for (index_type);
5718 /* Create a vector that, for each element, identifies which of
5719 the REDUC_GROUP_SIZE results should use it. */
5720 tree index_mask = build_int_cst (index_elt_type, group_size - 1);
5721 index = gimple_build (&seq, BIT_AND_EXPR, index_type, index,
5722 build_vector_from_val (index_type, index_mask));
5724 /* Get a neutral vector value. This is simply a splat of the neutral
5725 scalar value if we have one, otherwise the initial scalar value
5726 is itself a neutral value. */
5727 tree vector_identity = NULL_TREE;
5728 tree neutral_op = NULL_TREE;
5729 if (slp_node)
5731 tree initial_value = NULL_TREE;
5732 if (REDUC_GROUP_FIRST_ELEMENT (stmt_info))
5733 initial_value = reduc_info->reduc_initial_values[0];
5734 neutral_op = neutral_op_for_reduction (TREE_TYPE (vectype), code,
5735 initial_value);
5737 if (neutral_op)
5738 vector_identity = gimple_build_vector_from_val (&seq, vectype,
5739 neutral_op);
5740 for (unsigned int i = 0; i < group_size; ++i)
5742 /* If there's no universal neutral value, we can use the
5743 initial scalar value from the original PHI. This is used
5744 for MIN and MAX reduction, for example. */
5745 if (!neutral_op)
5747 tree scalar_value = reduc_info->reduc_initial_values[i];
5748 scalar_value = gimple_convert (&seq, TREE_TYPE (vectype),
5749 scalar_value);
5750 vector_identity = gimple_build_vector_from_val (&seq, vectype,
5751 scalar_value);
5754 /* Calculate the equivalent of:
5756 sel[j] = (index[j] == i);
5758 which selects the elements of REDUC_INPUTS[0] that should
5759 be included in the result. */
5760 tree compare_val = build_int_cst (index_elt_type, i);
5761 compare_val = build_vector_from_val (index_type, compare_val);
5762 tree sel = gimple_build (&seq, EQ_EXPR, mask_type,
5763 index, compare_val);
5765 /* Calculate the equivalent of:
5767 vec = sel ? reduc_inputs[0] : vector_identity;
5769 VEC is now suitable for a full vector reduction. */
5770 tree vec = gimple_build (&seq, VEC_COND_EXPR, vectype,
5771 sel, reduc_inputs[0], vector_identity);
5773 /* Do the reduction and convert it to the appropriate type. */
5774 tree scalar = gimple_build (&seq, as_combined_fn (reduc_fn),
5775 TREE_TYPE (vectype), vec);
5776 scalar = gimple_convert (&seq, scalar_type, scalar);
5777 scalar_results.safe_push (scalar);
5779 gsi_insert_seq_before (&exit_gsi, seq, GSI_SAME_STMT);
5781 else
5783 bool reduce_with_shift;
5784 tree vec_temp;
5786 gcc_assert (slp_reduc || reduc_inputs.length () == 1);
5788 /* See if the target wants to do the final (shift) reduction
5789 in a vector mode of smaller size and first reduce upper/lower
5790 halves against each other. */
5791 enum machine_mode mode1 = mode;
5792 tree stype = TREE_TYPE (vectype);
5793 unsigned nunits = TYPE_VECTOR_SUBPARTS (vectype).to_constant ();
5794 unsigned nunits1 = nunits;
5795 if ((mode1 = targetm.vectorize.split_reduction (mode)) != mode
5796 && reduc_inputs.length () == 1)
5798 nunits1 = GET_MODE_NUNITS (mode1).to_constant ();
5799 /* For SLP reductions we have to make sure lanes match up, but
5800 since we're doing individual element final reduction, reducing
5801 the vector width here is even more important.
5802 ??? We can also separate lanes with permutes; for the common
5803 case of a power-of-two group size, odd/even extracts would work.
5804 if (slp_reduc && nunits != nunits1)
5806 nunits1 = least_common_multiple (nunits1, group_size);
5807 gcc_assert (exact_log2 (nunits1) != -1 && nunits1 <= nunits);
5810 if (!slp_reduc
5811 && (mode1 = targetm.vectorize.split_reduction (mode)) != mode)
5812 nunits1 = GET_MODE_NUNITS (mode1).to_constant ();
5814 tree vectype1 = get_related_vectype_for_scalar_type (TYPE_MODE (vectype),
5815 stype, nunits1);
5816 reduce_with_shift = have_whole_vector_shift (mode1);
5817 if (!VECTOR_MODE_P (mode1))
5818 reduce_with_shift = false;
5819 else
5821 optab optab = optab_for_tree_code (code, vectype1, optab_default);
5822 if (optab_handler (optab, mode1) == CODE_FOR_nothing)
5823 reduce_with_shift = false;
5826 /* First reduce the vector to the desired vector size that we should
5827 do the shift reduction on, by combining upper and lower halves.
5828 gimple_seq stmts = NULL;
5829 new_temp = vect_create_partial_epilog (reduc_inputs[0], vectype1,
5830 code, &stmts);
5831 gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
5832 reduc_inputs[0] = new_temp;
5834 if (reduce_with_shift && !slp_reduc)
5836 int element_bitsize = tree_to_uhwi (bitsize);
5837 /* Enforced by vectorizable_reduction, which disallows SLP reductions
5838 for variable-length vectors and also requires direct target support
5839 for loop reductions. */
5840 int vec_size_in_bits = tree_to_uhwi (TYPE_SIZE (vectype1));
5841 int nelements = vec_size_in_bits / element_bitsize;
5842 vec_perm_builder sel;
5843 vec_perm_indices indices;
5845 int elt_offset;
5847 tree zero_vec = build_zero_cst (vectype1);
5848 /* Case 2: Create:
5849 for (offset = nelements/2; offset >= 1; offset/=2)
5851 Create: va' = vec_shift <va, offset>
5852 Create: va = vop <va, va'>
5853 } */
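   /* Concretely (illustrative only), for an assumed 8-element vector
      and PLUS this expands to three shift/add pairs
        va' = vec_shift <va, 4>;  va = va + va';
        va' = vec_shift <va, 2>;  va = va + va';
        va' = vec_shift <va, 1>;  va = va + va';
      after which element 0 of VA holds the scalar sum.  */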
5855 tree rhs;
5857 if (dump_enabled_p ())
5858 dump_printf_loc (MSG_NOTE, vect_location,
5859 "Reduce using vector shifts\n");
5861 gimple_seq stmts = NULL;
5862 new_temp = gimple_convert (&stmts, vectype1, new_temp);
5863 for (elt_offset = nelements / 2;
5864 elt_offset >= 1;
5865 elt_offset /= 2)
5867 calc_vec_perm_mask_for_shift (elt_offset, nelements, &sel);
5868 indices.new_vector (sel, 2, nelements);
5869 tree mask = vect_gen_perm_mask_any (vectype1, indices);
5870 new_name = gimple_build (&stmts, VEC_PERM_EXPR, vectype1,
5871 new_temp, zero_vec, mask);
5872 new_temp = gimple_build (&stmts, code,
5873 vectype1, new_name, new_temp);
5875 gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
5877 /* 2.4 Extract the final scalar result. Create:
5878 s_out3 = extract_field <v_out2, bitpos> */
5880 if (dump_enabled_p ())
5881 dump_printf_loc (MSG_NOTE, vect_location,
5882 "extract scalar result\n");
5884 rhs = build3 (BIT_FIELD_REF, scalar_type, new_temp,
5885 bitsize, bitsize_zero_node);
5886 epilog_stmt = gimple_build_assign (new_scalar_dest, rhs);
5887 new_temp = make_ssa_name (new_scalar_dest, epilog_stmt);
5888 gimple_assign_set_lhs (epilog_stmt, new_temp);
5889 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5890 scalar_results.safe_push (new_temp);
5892 else
5894 /* Case 3: Create:
5895 s = extract_field <v_out2, 0>
5896 for (offset = element_size;
5897 offset < vector_size;
5898 offset += element_size)
5900 Create: s' = extract_field <v_out2, offset>
5901 Create: s = op <s, s'> // For non SLP cases
5902 } */
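   /* For an assumed 4-element vector and PLUS this is simply
        s = v_out2[0] + v_out2[1] + v_out2[2] + v_out2[3];
      whereas in the SLP case the extracted elements are collected
      individually in SCALAR_RESULTS instead of being combined.  */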
5904 if (dump_enabled_p ())
5905 dump_printf_loc (MSG_NOTE, vect_location,
5906 "Reduce using scalar code.\n");
5908 int vec_size_in_bits = tree_to_uhwi (TYPE_SIZE (vectype1));
5909 int element_bitsize = tree_to_uhwi (bitsize);
5910 tree compute_type = TREE_TYPE (vectype);
5911 gimple_seq stmts = NULL;
5912 FOR_EACH_VEC_ELT (reduc_inputs, i, vec_temp)
5914 int bit_offset;
5915 new_temp = gimple_build (&stmts, BIT_FIELD_REF, compute_type,
5916 vec_temp, bitsize, bitsize_zero_node);
5918 /* In SLP we don't need to apply the reduction operation, so we just
5919 collect s' values in SCALAR_RESULTS. */
5920 if (slp_reduc)
5921 scalar_results.safe_push (new_temp);
5923 for (bit_offset = element_bitsize;
5924 bit_offset < vec_size_in_bits;
5925 bit_offset += element_bitsize)
5927 tree bitpos = bitsize_int (bit_offset);
5928 new_name = gimple_build (&stmts, BIT_FIELD_REF,
5929 compute_type, vec_temp,
5930 bitsize, bitpos);
5931 if (slp_reduc)
5933 /* In SLP we don't need to apply the reduction operation, so
5934 we just collect s' values in SCALAR_RESULTS. */
5935 new_temp = new_name;
5936 scalar_results.safe_push (new_name);
5938 else
5939 new_temp = gimple_build (&stmts, code, compute_type,
5940 new_name, new_temp);
5944 /* The only case where we need to reduce scalar results in SLP is
5945 unrolling. If the size of SCALAR_RESULTS is greater than
5946 REDUC_GROUP_SIZE, we reduce them by combining elements modulo
5947 REDUC_GROUP_SIZE. */
5948 if (slp_reduc)
5950 tree res, first_res, new_res;
5952 /* Reduce multiple scalar results in case of SLP unrolling. */
5953 for (j = group_size; scalar_results.iterate (j, &res);
5954 j++)
5956 first_res = scalar_results[j % group_size];
5957 new_res = gimple_build (&stmts, code, compute_type,
5958 first_res, res);
5959 scalar_results[j % group_size] = new_res;
5961 scalar_results.truncate (group_size);
5962 for (k = 0; k < group_size; k++)
5963 scalar_results[k] = gimple_convert (&stmts, scalar_type,
5964 scalar_results[k]);
5966 else
5968 /* Not SLP - we have one scalar to keep in SCALAR_RESULTS. */
5969 new_temp = gimple_convert (&stmts, scalar_type, new_temp);
5970 scalar_results.safe_push (new_temp);
5973 gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
5976 if ((STMT_VINFO_REDUC_TYPE (reduc_info) == INTEGER_INDUC_COND_REDUCTION)
5977 && induc_val)
5979 /* Earlier we set the initial value to be a vector of induc_val
5980 values. Check the result and if it is induc_val then replace
5981 it with the original initial value, unless induc_val is
5982 already the same as initial_def.
5983 tree zcompare = build2 (EQ_EXPR, boolean_type_node, new_temp,
5984 induc_val);
5985 tree initial_def = reduc_info->reduc_initial_values[0];
5987 tree tmp = make_ssa_name (new_scalar_dest);
5988 epilog_stmt = gimple_build_assign (tmp, COND_EXPR, zcompare,
5989 initial_def, new_temp);
5990 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5991 scalar_results[0] = tmp;
5995 /* 2.5 Adjust the final result by the initial value of the reduction
5996 variable. (When such adjustment is not needed, then
5997 'adjustment_def' is zero). For example, if code is PLUS we create:
5998 new_temp = loop_exit_def + adjustment_def */
6000 if (adjustment_def)
6002 gcc_assert (!slp_reduc);
6003 gimple_seq stmts = NULL;
6004 if (double_reduc)
6006 gcc_assert (VECTOR_TYPE_P (TREE_TYPE (adjustment_def)));
6007 adjustment_def = gimple_convert (&stmts, vectype, adjustment_def);
6008 new_temp = gimple_build (&stmts, code, vectype,
6009 reduc_inputs[0], adjustment_def);
6011 else
6013 new_temp = scalar_results[0];
6014 gcc_assert (TREE_CODE (TREE_TYPE (adjustment_def)) != VECTOR_TYPE);
6015 adjustment_def = gimple_convert (&stmts, scalar_type, adjustment_def);
6016 new_temp = gimple_build (&stmts, code, scalar_type,
6017 new_temp, adjustment_def);
6020 epilog_stmt = gimple_seq_last_stmt (stmts);
6021 gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
6022 scalar_results[0] = new_temp;
6025 /* Record this operation if it could be reused by the epilogue loop. */
6026 if (STMT_VINFO_REDUC_TYPE (reduc_info) == TREE_CODE_REDUCTION)
6027 loop_vinfo->reusable_accumulators.put (scalar_results[0],
6028 { orig_reduc_input, reduc_info });
6030 if (double_reduc)
6031 loop = outer_loop;
6033 /* 2.6 Handle the loop-exit phis. Replace the uses of scalar loop-exit
6034 phis with new adjusted scalar results, i.e., replace use <s_out0>
6035 with use <s_out4>.
6037 Transform:
6038 loop_exit:
6039 s_out0 = phi <s_loop> # (scalar) EXIT_PHI
6040 v_out1 = phi <VECT_DEF> # NEW_EXIT_PHI
6041 v_out2 = reduce <v_out1>
6042 s_out3 = extract_field <v_out2, 0>
6043 s_out4 = adjust_result <s_out3>
6044 use <s_out0>
6045 use <s_out0>
6047 into:
6049 loop_exit:
6050 s_out0 = phi <s_loop> # (scalar) EXIT_PHI
6051 v_out1 = phi <VECT_DEF> # NEW_EXIT_PHI
6052 v_out2 = reduce <v_out1>
6053 s_out3 = extract_field <v_out2, 0>
6054 s_out4 = adjust_result <s_out3>
6055 use <s_out4>
6056 use <s_out4> */
6058 gcc_assert (live_out_stmts.size () == scalar_results.length ());
6059 for (k = 0; k < live_out_stmts.size (); k++)
6061 stmt_vec_info scalar_stmt_info = vect_orig_stmt (live_out_stmts[k]);
6062 scalar_dest = gimple_assign_lhs (scalar_stmt_info->stmt);
6064 phis.create (3);
6065 /* Find the loop-closed-use at the loop exit of the original scalar
6066 result. (The reduction result is expected to have two immediate uses,
6067 one at the latch block, and one at the loop exit). For double
6068 reductions we are looking for exit phis of the outer loop. */
6069 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, scalar_dest)
6071 if (!flow_bb_inside_loop_p (loop, gimple_bb (USE_STMT (use_p))))
6073 if (!is_gimple_debug (USE_STMT (use_p)))
6074 phis.safe_push (USE_STMT (use_p));
6076 else
6078 if (double_reduc && gimple_code (USE_STMT (use_p)) == GIMPLE_PHI)
6080 tree phi_res = PHI_RESULT (USE_STMT (use_p));
6082 FOR_EACH_IMM_USE_FAST (phi_use_p, phi_imm_iter, phi_res)
6084 if (!flow_bb_inside_loop_p (loop,
6085 gimple_bb (USE_STMT (phi_use_p)))
6086 && !is_gimple_debug (USE_STMT (phi_use_p)))
6087 phis.safe_push (USE_STMT (phi_use_p));
6093 FOR_EACH_VEC_ELT (phis, i, exit_phi)
6095 /* Replace the uses: */
6096 orig_name = PHI_RESULT (exit_phi);
6098 /* Look for a single use at the target of the skip edge. */
6099 if (unify_with_main_loop_p)
6101 use_operand_p use_p;
6102 gimple *user;
6103 if (!single_imm_use (orig_name, &use_p, &user))
6104 gcc_unreachable ();
6105 orig_name = gimple_get_lhs (user);
6108 scalar_result = scalar_results[k];
6109 FOR_EACH_IMM_USE_STMT (use_stmt, imm_iter, orig_name)
6111 FOR_EACH_IMM_USE_ON_STMT (use_p, imm_iter)
6112 SET_USE (use_p, scalar_result);
6113 update_stmt (use_stmt);
6117 phis.release ();
6121 /* Return a vector of type VECTYPE that is equal to the vector select
6122 operation "MASK ? VEC : IDENTITY". Insert the select statements
6123 before GSI. */
6125 static tree
6126 merge_with_identity (gimple_stmt_iterator *gsi, tree mask, tree vectype,
6127 tree vec, tree identity)
6129 tree cond = make_temp_ssa_name (vectype, NULL, "cond");
6130 gimple *new_stmt = gimple_build_assign (cond, VEC_COND_EXPR,
6131 mask, vec, identity);
6132 gsi_insert_before (gsi, new_stmt, GSI_SAME_STMT);
6133 return cond;
6136 /* Successively apply CODE to each element of VECTOR_RHS, in left-to-right
6137 order, starting with LHS. Insert the extraction statements before GSI and
6138 associate the new scalar SSA names with variable SCALAR_DEST.
6139 Return the SSA name for the result. */
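/* For an assumed four-element VECTOR_RHS this produces the left-to-right
   chain

     lhs = (((lhs CODE v[0]) CODE v[1]) CODE v[2]) CODE v[3]

   which keeps the scalar evaluation order that in-order (fold-left)
   reductions require.  */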
6141 static tree
6142 vect_expand_fold_left (gimple_stmt_iterator *gsi, tree scalar_dest,
6143 tree_code code, tree lhs, tree vector_rhs)
6145 tree vectype = TREE_TYPE (vector_rhs);
6146 tree scalar_type = TREE_TYPE (vectype);
6147 tree bitsize = TYPE_SIZE (scalar_type);
6148 unsigned HOST_WIDE_INT vec_size_in_bits = tree_to_uhwi (TYPE_SIZE (vectype));
6149 unsigned HOST_WIDE_INT element_bitsize = tree_to_uhwi (bitsize);
6151 for (unsigned HOST_WIDE_INT bit_offset = 0;
6152 bit_offset < vec_size_in_bits;
6153 bit_offset += element_bitsize)
6155 tree bitpos = bitsize_int (bit_offset);
6156 tree rhs = build3 (BIT_FIELD_REF, scalar_type, vector_rhs,
6157 bitsize, bitpos);
6159 gassign *stmt = gimple_build_assign (scalar_dest, rhs);
6160 rhs = make_ssa_name (scalar_dest, stmt);
6161 gimple_assign_set_lhs (stmt, rhs);
6162 gsi_insert_before (gsi, stmt, GSI_SAME_STMT);
6164 stmt = gimple_build_assign (scalar_dest, code, lhs, rhs);
6165 tree new_name = make_ssa_name (scalar_dest, stmt);
6166 gimple_assign_set_lhs (stmt, new_name);
6167 gsi_insert_before (gsi, stmt, GSI_SAME_STMT);
6168 lhs = new_name;
6170 return lhs;
6173 /* Get a masked internal function equivalent to REDUC_FN. VECTYPE_IN is the
6174 type of the vector input. */
6176 static internal_fn
6177 get_masked_reduction_fn (internal_fn reduc_fn, tree vectype_in)
6179 internal_fn mask_reduc_fn;
6181 switch (reduc_fn)
6183 case IFN_FOLD_LEFT_PLUS:
6184 mask_reduc_fn = IFN_MASK_FOLD_LEFT_PLUS;
6185 break;
6187 default:
6188 return IFN_LAST;
6191 if (direct_internal_fn_supported_p (mask_reduc_fn, vectype_in,
6192 OPTIMIZE_FOR_SPEED))
6193 return mask_reduc_fn;
6194 return IFN_LAST;
6197 /* Perform an in-order reduction (FOLD_LEFT_REDUCTION). STMT_INFO is the
6198 statement that sets the live-out value. REDUC_DEF_STMT is the phi
6199 statement. CODE is the operation performed by STMT_INFO and OPS are
6200 its scalar operands. REDUC_INDEX is the index of the operand in
6201 OPS that is set by REDUC_DEF_STMT. REDUC_FN is the function that
6202 implements in-order reduction, or IFN_LAST if we should open-code it.
6203 VECTYPE_IN is the type of the vector input. MASKS specifies the masks
6204 that should be used to control the operation in a fully-masked loop. */
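/* As an illustration (operation assumed): for an in-order float addition
   with target support each vector input turns into
     reduc_var = .FOLD_LEFT_PLUS (reduc_var, def0);
   or, in a fully-masked loop with IFN_MASK_FOLD_LEFT_PLUS available,
     reduc_var = .MASK_FOLD_LEFT_PLUS (reduc_var, def0, mask);
   while without target support the vector is expanded element by element
   via vect_expand_fold_left above.  */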
6206 static bool
6207 vectorize_fold_left_reduction (loop_vec_info loop_vinfo,
6208 stmt_vec_info stmt_info,
6209 gimple_stmt_iterator *gsi,
6210 gimple **vec_stmt, slp_tree slp_node,
6211 gimple *reduc_def_stmt,
6212 tree_code code, internal_fn reduc_fn,
6213 tree ops[3], tree vectype_in,
6214 int reduc_index, vec_loop_masks *masks)
6216 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
6217 tree vectype_out = STMT_VINFO_VECTYPE (stmt_info);
6218 internal_fn mask_reduc_fn = get_masked_reduction_fn (reduc_fn, vectype_in);
6220 int ncopies;
6221 if (slp_node)
6222 ncopies = 1;
6223 else
6224 ncopies = vect_get_num_copies (loop_vinfo, vectype_in);
6226 gcc_assert (!nested_in_vect_loop_p (loop, stmt_info));
6227 gcc_assert (ncopies == 1);
6228 gcc_assert (TREE_CODE_LENGTH (code) == binary_op);
6230 if (slp_node)
6231 gcc_assert (known_eq (TYPE_VECTOR_SUBPARTS (vectype_out),
6232 TYPE_VECTOR_SUBPARTS (vectype_in)));
6234 tree op0 = ops[1 - reduc_index];
6236 int group_size = 1;
6237 stmt_vec_info scalar_dest_def_info;
6238 auto_vec<tree> vec_oprnds0;
6239 if (slp_node)
6241 auto_vec<vec<tree> > vec_defs (2);
6242 vect_get_slp_defs (loop_vinfo, slp_node, &vec_defs);
6243 vec_oprnds0.safe_splice (vec_defs[1 - reduc_index]);
6244 vec_defs[0].release ();
6245 vec_defs[1].release ();
6246 group_size = SLP_TREE_SCALAR_STMTS (slp_node).length ();
6247 scalar_dest_def_info = SLP_TREE_SCALAR_STMTS (slp_node)[group_size - 1];
6249 else
6251 vect_get_vec_defs_for_operand (loop_vinfo, stmt_info, 1,
6252 op0, &vec_oprnds0);
6253 scalar_dest_def_info = stmt_info;
6256 tree scalar_dest = gimple_assign_lhs (scalar_dest_def_info->stmt);
6257 tree scalar_type = TREE_TYPE (scalar_dest);
6258 tree reduc_var = gimple_phi_result (reduc_def_stmt);
6260 int vec_num = vec_oprnds0.length ();
6261 gcc_assert (vec_num == 1 || slp_node);
6262 tree vec_elem_type = TREE_TYPE (vectype_out);
6263 gcc_checking_assert (useless_type_conversion_p (scalar_type, vec_elem_type));
6265 tree vector_identity = NULL_TREE;
6266 if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
6267 vector_identity = build_zero_cst (vectype_out);
6269 tree scalar_dest_var = vect_create_destination_var (scalar_dest, NULL);
6270 int i;
6271 tree def0;
6272 FOR_EACH_VEC_ELT (vec_oprnds0, i, def0)
6274 gimple *new_stmt;
6275 tree mask = NULL_TREE;
6276 if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
6277 mask = vect_get_loop_mask (gsi, masks, vec_num, vectype_in, i);
6279 /* Handle MINUS by adding the negative. */
6280 if (reduc_fn != IFN_LAST && code == MINUS_EXPR)
6282 tree negated = make_ssa_name (vectype_out);
6283 new_stmt = gimple_build_assign (negated, NEGATE_EXPR, def0);
6284 gsi_insert_before (gsi, new_stmt, GSI_SAME_STMT);
6285 def0 = negated;
6288 if (mask && mask_reduc_fn == IFN_LAST)
6289 def0 = merge_with_identity (gsi, mask, vectype_out, def0,
6290 vector_identity);
6292 /* On the first iteration the input is simply the scalar phi
6293 result, and for subsequent iterations it is the output of
6294 the preceding operation. */
6295 if (reduc_fn != IFN_LAST || (mask && mask_reduc_fn != IFN_LAST))
6297 if (mask && mask_reduc_fn != IFN_LAST)
6298 new_stmt = gimple_build_call_internal (mask_reduc_fn, 3, reduc_var,
6299 def0, mask);
6300 else
6301 new_stmt = gimple_build_call_internal (reduc_fn, 2, reduc_var,
6302 def0);
6303 /* For chained SLP reductions the output of the previous reduction
6304 operation serves as the input of the next. For the final statement
6305 the output cannot be a temporary - we reuse the original
6306 scalar destination of the last statement. */
6307 if (i != vec_num - 1)
6309 gimple_set_lhs (new_stmt, scalar_dest_var);
6310 reduc_var = make_ssa_name (scalar_dest_var, new_stmt);
6311 gimple_set_lhs (new_stmt, reduc_var);
6314 else
6316 reduc_var = vect_expand_fold_left (gsi, scalar_dest_var, code,
6317 reduc_var, def0);
6318 new_stmt = SSA_NAME_DEF_STMT (reduc_var);
6319 /* Remove the statement, so that we can use the same code paths
6320 as for statements that we've just created. */
6321 gimple_stmt_iterator tmp_gsi = gsi_for_stmt (new_stmt);
6322 gsi_remove (&tmp_gsi, true);
6325 if (i == vec_num - 1)
6327 gimple_set_lhs (new_stmt, scalar_dest);
6328 vect_finish_replace_stmt (loop_vinfo,
6329 scalar_dest_def_info,
6330 new_stmt);
6332 else
6333 vect_finish_stmt_generation (loop_vinfo,
6334 scalar_dest_def_info,
6335 new_stmt, gsi);
6337 if (slp_node)
6338 SLP_TREE_VEC_STMTS (slp_node).quick_push (new_stmt);
6339 else
6341 STMT_VINFO_VEC_STMTS (stmt_info).safe_push (new_stmt);
6342 *vec_stmt = new_stmt;
6346 return true;
6349 /* Function is_nonwrapping_integer_induction.
6351 Check if STMT_VINFO (which is part of loop LOOP) both increments and
6352 does not cause overflow. */
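/* For instance, with assumed values BASE == 0, STEP == 4 and at most
   1000 iterations, the check below requires that 0 + 4 * 1000 still fits
   in the precision of the induction's type (unless overflow is undefined
   for that type, in which case the check is not needed).  */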
6354 static bool
6355 is_nonwrapping_integer_induction (stmt_vec_info stmt_vinfo, class loop *loop)
6357 gphi *phi = as_a <gphi *> (stmt_vinfo->stmt);
6358 tree base = STMT_VINFO_LOOP_PHI_EVOLUTION_BASE_UNCHANGED (stmt_vinfo);
6359 tree step = STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_vinfo);
6360 tree lhs_type = TREE_TYPE (gimple_phi_result (phi));
6361 widest_int ni, max_loop_value, lhs_max;
6362 wi::overflow_type overflow = wi::OVF_NONE;
6364 /* Make sure the loop is integer based. */
6365 if (TREE_CODE (base) != INTEGER_CST
6366 || TREE_CODE (step) != INTEGER_CST)
6367 return false;
6369 /* Check that the max size of the loop will not wrap. */
6371 if (TYPE_OVERFLOW_UNDEFINED (lhs_type))
6372 return true;
6374 if (! max_stmt_executions (loop, &ni))
6375 return false;
6377 max_loop_value = wi::mul (wi::to_widest (step), ni, TYPE_SIGN (lhs_type),
6378 &overflow);
6379 if (overflow)
6380 return false;
6382 max_loop_value = wi::add (wi::to_widest (base), max_loop_value,
6383 TYPE_SIGN (lhs_type), &overflow);
6384 if (overflow)
6385 return false;
6387 return (wi::min_precision (max_loop_value, TYPE_SIGN (lhs_type))
6388 <= TYPE_PRECISION (lhs_type));
6391 /* Check if masking can be supported by inserting a conditional expression.
6392 CODE is the code for the operation. COND_FN is the conditional internal
6393 function, if it exists. VECTYPE_IN is the type of the vector input. */
6394 static bool
6395 use_mask_by_cond_expr_p (enum tree_code code, internal_fn cond_fn,
6396 tree vectype_in)
6398 if (cond_fn != IFN_LAST
6399 && direct_internal_fn_supported_p (cond_fn, vectype_in,
6400 OPTIMIZE_FOR_SPEED))
6401 return false;
6403 switch (code)
6405 case DOT_PROD_EXPR:
6406 case SAD_EXPR:
6407 return true;
6409 default:
6410 return false;
6414 /* Insert a conditional expression to enable masked vectorization. CODE is the
6415 code for the operation. VOP is the array of operands. MASK is the loop
6416 mask. GSI is a statement iterator used to place the new conditional
6417 expression. */
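/* Sketched for the two supported codes: for DOT_PROD_EXPR the inactive
   lanes of operand 1 are replaced by zero, so they contribute nothing to
   the dot product; for SAD_EXPR they are replaced by the corresponding
   lanes of operand 0, making the absolute difference of inactive lanes
   zero as well.  */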
6418 static void
6419 build_vect_cond_expr (enum tree_code code, tree vop[3], tree mask,
6420 gimple_stmt_iterator *gsi)
6422 switch (code)
6424 case DOT_PROD_EXPR:
6426 tree vectype = TREE_TYPE (vop[1]);
6427 tree zero = build_zero_cst (vectype);
6428 tree masked_op1 = make_temp_ssa_name (vectype, NULL, "masked_op1");
6429 gassign *select = gimple_build_assign (masked_op1, VEC_COND_EXPR,
6430 mask, vop[1], zero);
6431 gsi_insert_before (gsi, select, GSI_SAME_STMT);
6432 vop[1] = masked_op1;
6433 break;
6436 case SAD_EXPR:
6438 tree vectype = TREE_TYPE (vop[1]);
6439 tree masked_op1 = make_temp_ssa_name (vectype, NULL, "masked_op1");
6440 gassign *select = gimple_build_assign (masked_op1, VEC_COND_EXPR,
6441 mask, vop[1], vop[0]);
6442 gsi_insert_before (gsi, select, GSI_SAME_STMT);
6443 vop[1] = masked_op1;
6444 break;
6447 default:
6448 gcc_unreachable ();
6452 /* Function vectorizable_reduction.
6454 Check if STMT_INFO performs a reduction operation that can be vectorized.
6455 If VEC_STMT is also passed, vectorize STMT_INFO: create a vectorized
6456 stmt to replace it, put it in VEC_STMT, and insert it at GSI.
6457 Return true if STMT_INFO is vectorizable in this way.
6459 This function also handles reduction idioms (patterns) that have been
6460 recognized in advance during vect_pattern_recog. In this case, STMT_INFO
6461 may be of this form:
6462 X = pattern_expr (arg0, arg1, ..., X)
6463 and its STMT_VINFO_RELATED_STMT points to the last stmt in the original
6464 sequence that had been detected and replaced by the pattern-stmt
6465 (STMT_INFO).
6467 This function also handles reduction of condition expressions, for example:
6468 for (int i = 0; i < N; i++)
6469 if (a[i] < value)
6470 last = a[i];
6471 This is handled by vectorising the loop and creating an additional vector
6472 containing the loop indexes for which "a[i] < value" was true. In the
6473 function epilogue this is reduced to a single max value and then used to
6474 index into the vector of results.
6476 In some cases of reduction patterns, the type of the reduction variable X is
6477 different than the type of the other arguments of STMT_INFO.
6478 In such cases, the vectype that is used when transforming STMT_INFO into
6479 a vector stmt is different than the vectype that is used to determine the
6480 vectorization factor, because it consists of a different number of elements
6481 than the actual number of elements that are being operated upon in parallel.
6483 For example, consider an accumulation of shorts into an int accumulator.
6484 On some targets it's possible to vectorize this pattern operating on 8
6485 shorts at a time (hence, the vectype for purposes of determining the
6486 vectorization factor should be V8HI); on the other hand, the vectype that
6487 is used to create the vector form is actually V4SI (the type of the result).
6489 Upon entry to this function, STMT_VINFO_VECTYPE records the vectype that
6490 indicates the actual level of parallelism (V8HI in the example), so
6491 that the right vectorization factor is derived. This vectype
6492 corresponds to the type of arguments to the reduction stmt, and should *NOT*
6493 be used to create the vectorized stmt. The right vectype for the vectorized
6494 stmt is obtained from the type of the result X:
6495 get_vectype_for_scalar_type (vinfo, TREE_TYPE (X))
6497 This means that, contrary to "regular" reductions (or "regular" stmts in
6498 general), the following equation:
6499 STMT_VINFO_VECTYPE == get_vectype_for_scalar_type (vinfo, TREE_TYPE (X))
6500 does *NOT* necessarily hold for reduction patterns. */
6502 bool
6503 vectorizable_reduction (loop_vec_info loop_vinfo,
6504 stmt_vec_info stmt_info, slp_tree slp_node,
6505 slp_instance slp_node_instance,
6506 stmt_vector_for_cost *cost_vec)
6508 tree scalar_dest;
6509 tree vectype_in = NULL_TREE;
6510 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
6511 enum vect_def_type cond_reduc_dt = vect_unknown_def_type;
6512 stmt_vec_info cond_stmt_vinfo = NULL;
6513 tree scalar_type;
6514 int i;
6515 int ncopies;
6516 bool single_defuse_cycle = false;
6517 bool nested_cycle = false;
6518 bool double_reduc = false;
6519 int vec_num;
6520 tree tem;
6521 tree cr_index_scalar_type = NULL_TREE, cr_index_vector_type = NULL_TREE;
6522 tree cond_reduc_val = NULL_TREE;
6524 /* Make sure it was already recognized as a reduction computation. */
6525 if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_reduction_def
6526 && STMT_VINFO_DEF_TYPE (stmt_info) != vect_double_reduction_def
6527 && STMT_VINFO_DEF_TYPE (stmt_info) != vect_nested_cycle)
6528 return false;
6530 /* The stmt we store reduction analysis meta on. */
6531 stmt_vec_info reduc_info = info_for_reduction (loop_vinfo, stmt_info);
6532 reduc_info->is_reduc_info = true;
6534 if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_nested_cycle)
6536 if (is_a <gphi *> (stmt_info->stmt))
6538 if (slp_node)
6540 /* We eventually need to set a vector type on invariant
6541 arguments. */
6542 unsigned j;
6543 slp_tree child;
6544 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (slp_node), j, child)
6545 if (!vect_maybe_update_slp_op_vectype
6546 (child, SLP_TREE_VECTYPE (slp_node)))
6548 if (dump_enabled_p ())
6549 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6550 "incompatible vector types for "
6551 "invariants\n");
6552 return false;
6555 /* Analysis for double-reduction is done on the outer
6556 loop PHI, nested cycles have no further restrictions. */
6557 STMT_VINFO_TYPE (stmt_info) = cycle_phi_info_type;
6559 else
6560 STMT_VINFO_TYPE (stmt_info) = reduc_vec_info_type;
6561 return true;
6564 stmt_vec_info orig_stmt_of_analysis = stmt_info;
6565 stmt_vec_info phi_info = stmt_info;
6566 if (!is_a <gphi *> (stmt_info->stmt))
6568 STMT_VINFO_TYPE (stmt_info) = reduc_vec_info_type;
6569 return true;
6571 if (slp_node)
6573 slp_node_instance->reduc_phis = slp_node;
6574 /* ??? We're leaving slp_node to point to the PHIs; we only
6575 need it to get at the number of vector stmts, which wasn't
6576 yet initialized for the instance root. */
6578 if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_reduction_def)
6579 stmt_info = vect_stmt_to_vectorize (STMT_VINFO_REDUC_DEF (stmt_info));
6580 else
6582 gcc_assert (STMT_VINFO_DEF_TYPE (stmt_info)
6583 == vect_double_reduction_def);
6584 use_operand_p use_p;
6585 gimple *use_stmt;
6586 bool res = single_imm_use (gimple_phi_result (stmt_info->stmt),
6587 &use_p, &use_stmt);
6588 gcc_assert (res);
6589 phi_info = loop_vinfo->lookup_stmt (use_stmt);
6590 stmt_info = vect_stmt_to_vectorize (STMT_VINFO_REDUC_DEF (phi_info));
6593 /* PHIs should not participate in patterns. */
6594 gcc_assert (!STMT_VINFO_RELATED_STMT (phi_info));
6595 gphi *reduc_def_phi = as_a <gphi *> (phi_info->stmt);
6597 /* Verify that following REDUC_IDX from the latch def leads us back to
6598 the PHI and compute the reduction chain length. Discover the real
6599 reduction operation stmt on the way (stmt_info and slp_for_stmt_info). */
6600 tree reduc_def
6601 = PHI_ARG_DEF_FROM_EDGE (reduc_def_phi,
6602 loop_latch_edge
6603 (gimple_bb (reduc_def_phi)->loop_father));
6604 unsigned reduc_chain_length = 0;
6605 bool only_slp_reduc_chain = true;
6606 stmt_info = NULL;
6607 slp_tree slp_for_stmt_info = slp_node ? slp_node_instance->root : NULL;
6608 while (reduc_def != PHI_RESULT (reduc_def_phi))
6610 stmt_vec_info def = loop_vinfo->lookup_def (reduc_def);
6611 stmt_vec_info vdef = vect_stmt_to_vectorize (def);
6612 if (STMT_VINFO_REDUC_IDX (vdef) == -1)
6614 if (dump_enabled_p ())
6615 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6616 "reduction chain broken by patterns.\n");
6617 return false;
6619 if (!REDUC_GROUP_FIRST_ELEMENT (vdef))
6620 only_slp_reduc_chain = false;
6621 /* ??? For epilogue generation live members of the chain need
6622 to point back to the PHI via their original stmt for
6623 info_for_reduction to work. */
6624 if (STMT_VINFO_LIVE_P (vdef))
6625 STMT_VINFO_REDUC_DEF (def) = phi_info;
6626 gassign *assign = dyn_cast <gassign *> (vdef->stmt);
6627 if (!assign)
6629 if (dump_enabled_p ())
6630 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6631 "reduction chain includes calls.\n");
6632 return false;
6634 if (CONVERT_EXPR_CODE_P (gimple_assign_rhs_code (assign)))
6636 if (!tree_nop_conversion_p (TREE_TYPE (gimple_assign_lhs (assign)),
6637 TREE_TYPE (gimple_assign_rhs1 (assign))))
6639 if (dump_enabled_p ())
6640 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6641 "conversion in the reduction chain.\n");
6642 return false;
6645 else if (!stmt_info)
6646 /* First non-conversion stmt. */
6647 stmt_info = vdef;
6648 reduc_def = gimple_op (vdef->stmt, 1 + STMT_VINFO_REDUC_IDX (vdef));
6649 reduc_chain_length++;
6650 if (!stmt_info && slp_node)
6651 slp_for_stmt_info = SLP_TREE_CHILDREN (slp_for_stmt_info)[0];
6653 /* PHIs should not participate in patterns. */
6654 gcc_assert (!STMT_VINFO_RELATED_STMT (phi_info));
6656 if (nested_in_vect_loop_p (loop, stmt_info))
6658 loop = loop->inner;
6659 nested_cycle = true;
6662 /* STMT_VINFO_REDUC_DEF doesn't point to the first but the last
6663 element. */
6664 if (slp_node && REDUC_GROUP_FIRST_ELEMENT (stmt_info))
6666 gcc_assert (!REDUC_GROUP_NEXT_ELEMENT (stmt_info));
6667 stmt_info = REDUC_GROUP_FIRST_ELEMENT (stmt_info);
6669 if (REDUC_GROUP_FIRST_ELEMENT (stmt_info))
6670 gcc_assert (slp_node
6671 && REDUC_GROUP_FIRST_ELEMENT (stmt_info) == stmt_info);
6673 /* 1. Is vectorizable reduction? */
6674 /* Not supportable if the reduction variable is used in the loop, unless
6675 it's a reduction chain. */
6676 if (STMT_VINFO_RELEVANT (stmt_info) > vect_used_in_outer
6677 && !REDUC_GROUP_FIRST_ELEMENT (stmt_info))
6678 return false;
6680 /* Reductions that are not used even in an enclosing outer-loop,
6681 are expected to be "live" (used out of the loop). */
6682 if (STMT_VINFO_RELEVANT (stmt_info) == vect_unused_in_scope
6683 && !STMT_VINFO_LIVE_P (stmt_info))
6684 return false;
6686 /* 2. Has this been recognized as a reduction pattern?
6688 Check if STMT represents a pattern that has been recognized
6689 in earlier analysis stages. For stmts that represent a pattern,
6690 the STMT_VINFO_RELATED_STMT field records the last stmt in
6691 the original sequence that constitutes the pattern. */
6693 stmt_vec_info orig_stmt_info = STMT_VINFO_RELATED_STMT (stmt_info);
6694 if (orig_stmt_info)
6696 gcc_assert (STMT_VINFO_IN_PATTERN_P (orig_stmt_info));
6697 gcc_assert (!STMT_VINFO_IN_PATTERN_P (stmt_info));
6700 /* 3. Check the operands of the operation. The first operands are defined
6701 inside the loop body. The last operand is the reduction variable,
6702 which is defined by the loop-header-phi. */
6704 tree vectype_out = STMT_VINFO_VECTYPE (stmt_info);
6705 STMT_VINFO_REDUC_VECTYPE (reduc_info) = vectype_out;
6706 gassign *stmt = as_a <gassign *> (stmt_info->stmt);
6707 enum tree_code code = gimple_assign_rhs_code (stmt);
6708 bool lane_reduc_code_p
6709 = (code == DOT_PROD_EXPR || code == WIDEN_SUM_EXPR || code == SAD_EXPR);
6710 int op_type = TREE_CODE_LENGTH (code);
6711 enum optab_subtype optab_query_kind = optab_vector;
6712 if (code == DOT_PROD_EXPR
6713 && TYPE_SIGN (TREE_TYPE (gimple_assign_rhs1 (stmt)))
6714 != TYPE_SIGN (TREE_TYPE (gimple_assign_rhs2 (stmt))))
6715 optab_query_kind = optab_vector_mixed_sign;
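/* Illustration only (hypothetical user code, not part of this file):
   DOT_PROD_EXPR typically results from a widening dot-product loop such as

     short a[N], b[N];
     int sum = 0;
     for (int i = 0; i < N; i++)
       sum += a[i] * b[i];

   where each vector statement folds several narrow lanes into the wider
   accumulator; WIDEN_SUM_EXPR and SAD_EXPR cover similar lane-reducing
   patterns.  */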
6718 scalar_dest = gimple_assign_lhs (stmt);
6719 scalar_type = TREE_TYPE (scalar_dest);
6720 if (!POINTER_TYPE_P (scalar_type) && !INTEGRAL_TYPE_P (scalar_type)
6721 && !SCALAR_FLOAT_TYPE_P (scalar_type))
6722 return false;
6724 /* Do not try to vectorize bit-precision reductions. */
6725 if (!type_has_mode_precision_p (scalar_type))
6726 return false;
6728 /* For lane-reducing ops we're reducing the number of reduction PHIs,
6729 which means the only use of the reduction PHI may be in the lane-reducing operation. */
6730 if (lane_reduc_code_p
6731 && reduc_chain_length != 1
6732 && !only_slp_reduc_chain)
6734 if (dump_enabled_p ())
6735 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6736 "lane-reducing reduction with extra stmts.\n");
6737 return false;
6740 /* All uses but the last are expected to be defined in the loop.
6741 The last use is the reduction variable. In case of nested cycle this
6742 assumption is not true: we use reduc_index to record the index of the
6743 reduction variable. */
6744 slp_tree *slp_op = XALLOCAVEC (slp_tree, op_type);
6745 /* We need to skip an extra operand for COND_EXPRs with embedded
6746 comparison. */
6747 unsigned opno_adjust = 0;
6748 if (code == COND_EXPR
6749 && COMPARISON_CLASS_P (gimple_assign_rhs1 (stmt)))
6750 opno_adjust = 1;
6751 for (i = 0; i < op_type; i++)
6753 /* The condition of COND_EXPR is checked in vectorizable_condition(). */
6754 if (i == 0 && code == COND_EXPR)
6755 continue;
6757 stmt_vec_info def_stmt_info;
6758 enum vect_def_type dt;
6759 tree op;
6760 if (!vect_is_simple_use (loop_vinfo, stmt_info, slp_for_stmt_info,
6761 i + opno_adjust, &op, &slp_op[i], &dt, &tem,
6762 &def_stmt_info))
6764 if (dump_enabled_p ())
6765 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6766 "use not simple.\n");
6767 return false;
6769 if (i == STMT_VINFO_REDUC_IDX (stmt_info))
6770 continue;
6772 /* There should be only one cycle def in the stmt, the one
6773 leading to reduc_def. */
6774 if (VECTORIZABLE_CYCLE_DEF (dt))
6775 return false;
6777 /* To properly compute ncopies we are interested in the widest
6778 non-reduction input type in case we're looking at a widening
6779 accumulation that we later handle in vect_transform_reduction. */
6780 if (lane_reduc_code_p
6781 && tem
6782 && (!vectype_in
6783 || (GET_MODE_SIZE (SCALAR_TYPE_MODE (TREE_TYPE (vectype_in)))
6784 < GET_MODE_SIZE (SCALAR_TYPE_MODE (TREE_TYPE (tem))))))
6785 vectype_in = tem;
6787 if (code == COND_EXPR)
6789 /* Record how the non-reduction-def value of COND_EXPR is defined. */
6790 if (dt == vect_constant_def)
6792 cond_reduc_dt = dt;
6793 cond_reduc_val = op;
6795 if (dt == vect_induction_def
6796 && def_stmt_info
6797 && is_nonwrapping_integer_induction (def_stmt_info, loop))
6799 cond_reduc_dt = dt;
6800 cond_stmt_vinfo = def_stmt_info;
6804 if (!vectype_in)
6805 vectype_in = STMT_VINFO_VECTYPE (phi_info);
6806 STMT_VINFO_REDUC_VECTYPE_IN (reduc_info) = vectype_in;
6808 enum vect_reduction_type v_reduc_type = STMT_VINFO_REDUC_TYPE (phi_info);
6809 STMT_VINFO_REDUC_TYPE (reduc_info) = v_reduc_type;
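/* Illustrative sketch of a COND_REDUCTION (hypothetical source, not part
   of this file): the reduction value is only updated under a condition,
   for example

     int last = -1;
     for (int i = 0; i < n; i++)
       if (a[i] < b[i])
         last = i;

   so the vectorized form has to remember which lane matched last.  */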
6810 /* If we have a condition reduction, see if we can simplify it further. */
6811 if (v_reduc_type == COND_REDUCTION)
6813 if (slp_node)
6814 return false;
6816 /* If the condition itself uses the reduction value, fail. */
6817 if (STMT_VINFO_REDUC_IDX (stmt_info) == 0)
6819 if (dump_enabled_p ())
6820 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6821 "condition depends on previous iteration\n");
6822 return false;
6825 if (reduc_chain_length == 1
6826 && direct_internal_fn_supported_p (IFN_FOLD_EXTRACT_LAST,
6827 vectype_in, OPTIMIZE_FOR_SPEED))
6829 if (dump_enabled_p ())
6830 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6831 "optimizing condition reduction with"
6832 " FOLD_EXTRACT_LAST.\n");
6833 STMT_VINFO_REDUC_TYPE (reduc_info) = EXTRACT_LAST_REDUCTION;
6835 else if (cond_reduc_dt == vect_induction_def)
6837 tree base
6838 = STMT_VINFO_LOOP_PHI_EVOLUTION_BASE_UNCHANGED (cond_stmt_vinfo);
6839 tree step = STMT_VINFO_LOOP_PHI_EVOLUTION_PART (cond_stmt_vinfo);
6841 gcc_assert (TREE_CODE (base) == INTEGER_CST
6842 && TREE_CODE (step) == INTEGER_CST);
6843 cond_reduc_val = NULL_TREE;
6844 enum tree_code cond_reduc_op_code = ERROR_MARK;
6845 tree res = PHI_RESULT (STMT_VINFO_STMT (cond_stmt_vinfo));
6846 if (!types_compatible_p (TREE_TYPE (res), TREE_TYPE (base)))
6848 /* Find a suitable value: below base for MAX_EXPR, above base for
6849 MIN_EXPR; for now punt if base is the minimum value of the type for
6850 MAX_EXPR or the maximum value of the type for MIN_EXPR. */
6851 else if (tree_int_cst_sgn (step) == -1)
6853 cond_reduc_op_code = MIN_EXPR;
6854 if (tree_int_cst_sgn (base) == -1)
6855 cond_reduc_val = build_int_cst (TREE_TYPE (base), 0);
6856 else if (tree_int_cst_lt (base,
6857 TYPE_MAX_VALUE (TREE_TYPE (base))))
6858 cond_reduc_val
6859 = int_const_binop (PLUS_EXPR, base, integer_one_node);
6861 else
6863 cond_reduc_op_code = MAX_EXPR;
6864 if (tree_int_cst_sgn (base) == 1)
6865 cond_reduc_val = build_int_cst (TREE_TYPE (base), 0);
6866 else if (tree_int_cst_lt (TYPE_MIN_VALUE (TREE_TYPE (base)),
6867 base))
6868 cond_reduc_val
6869 = int_const_binop (MINUS_EXPR, base, integer_one_node);
6871 if (cond_reduc_val)
6873 if (dump_enabled_p ())
6874 dump_printf_loc (MSG_NOTE, vect_location,
6875 "condition expression based on "
6876 "integer induction.\n");
6877 STMT_VINFO_REDUC_CODE (reduc_info) = cond_reduc_op_code;
6878 STMT_VINFO_VEC_INDUC_COND_INITIAL_VAL (reduc_info)
6879 = cond_reduc_val;
6880 STMT_VINFO_REDUC_TYPE (reduc_info) = INTEGER_INDUC_COND_REDUCTION;
6883 else if (cond_reduc_dt == vect_constant_def)
6885 enum vect_def_type cond_initial_dt;
6886 tree cond_initial_val = vect_phi_initial_value (reduc_def_phi);
6887 vect_is_simple_use (cond_initial_val, loop_vinfo, &cond_initial_dt);
6888 if (cond_initial_dt == vect_constant_def
6889 && types_compatible_p (TREE_TYPE (cond_initial_val),
6890 TREE_TYPE (cond_reduc_val)))
6892 tree e = fold_binary (LE_EXPR, boolean_type_node,
6893 cond_initial_val, cond_reduc_val);
6894 if (e && (integer_onep (e) || integer_zerop (e)))
6896 if (dump_enabled_p ())
6897 dump_printf_loc (MSG_NOTE, vect_location,
6898 "condition expression based on "
6899 "compile time constant.\n");
6900 /* Record reduction code at analysis stage. */
6901 STMT_VINFO_REDUC_CODE (reduc_info)
6902 = integer_onep (e) ? MAX_EXPR : MIN_EXPR;
6903 STMT_VINFO_REDUC_TYPE (reduc_info) = CONST_COND_REDUCTION;
6909 if (STMT_VINFO_LIVE_P (phi_info))
6910 return false;
6912 if (slp_node)
6913 ncopies = 1;
6914 else
6915 ncopies = vect_get_num_copies (loop_vinfo, vectype_in);
6917 gcc_assert (ncopies >= 1);
6919 poly_uint64 nunits_out = TYPE_VECTOR_SUBPARTS (vectype_out);
6921 if (nested_cycle)
6923 gcc_assert (STMT_VINFO_DEF_TYPE (reduc_info)
6924 == vect_double_reduction_def);
6925 double_reduc = true;
6928 /* 4.2. Check support for the epilog operation.
6930 If STMT represents a reduction pattern, then the type of the
6931 reduction variable may be different than the type of the rest
6932 of the arguments. For example, consider the case of accumulation
6933 of shorts into an int accumulator. The original code:
6934 S1: int_a = (int) short_a;
6935 orig_stmt-> S2: int_acc = plus <int_a ,int_acc>;
6937 was replaced with:
6938 STMT: int_acc = widen_sum <short_a, int_acc>
6940 This means that:
6941 1. The tree-code that is used to create the vector operation in the
6942 epilog code (that reduces the partial results) is not the
6943 tree-code of STMT, but is rather the tree-code of the original
6944 stmt from the pattern that STMT is replacing. I.e, in the example
6945 above we want to use 'widen_sum' in the loop, but 'plus' in the
6946 epilog.
6947 2. The type (mode) we use to check available target support
6948 for the vector operation to be created in the *epilog*, is
6949 determined by the type of the reduction variable (in the example
6950 above we'd check this: optab_handler (plus_optab, vect_int_mode)).
6951 However the type (mode) we use to check available target support
6952 for the vector operation to be created *inside the loop*, is
6953 determined by the type of the other arguments to STMT (in the
6954 example we'd check this: optab_handler (widen_sum_optab,
6955 vect_short_mode)).
6957 This is contrary to "regular" reductions, in which the types of all
6958 the arguments are the same as the type of the reduction variable.
6959 For "regular" reductions we can therefore use the same vector type
6960 (and also the same tree-code) when generating the epilog code and
6961 when generating the code inside the loop. */
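/* Illustrative user-level sketch of the widening accumulation described
   above (hypothetical source, not part of this file):

     short s[N];
     int acc = 0;
     for (int i = 0; i < N; i++)
       acc += s[i];

   Inside the loop the target must support the widening operation on the
   narrow (short) vector type, while the epilog that folds the partial sums
   only needs a plain addition on the wide (int) vector type.  */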
6963 enum tree_code orig_code = STMT_VINFO_REDUC_CODE (phi_info);
6964 STMT_VINFO_REDUC_CODE (reduc_info) = orig_code;
6966 vect_reduction_type reduction_type = STMT_VINFO_REDUC_TYPE (reduc_info);
6967 if (reduction_type == TREE_CODE_REDUCTION)
6969 /* Check whether it's ok to change the order of the computation.
6970 Generally, when vectorizing a reduction we change the order of the
6971 computation. This may change the behavior of the program in some
6972 cases, so we need to check that this is ok. One exception is when
6973 vectorizing an outer-loop: the inner-loop is executed sequentially,
6974 and therefore vectorizing reductions in the inner-loop during
6975 outer-loop vectorization is safe. Likewise when we are vectorizing
6976 a series of reductions using SLP and the VF is one the reductions
6977 are performed in scalar order. */
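/* Illustrative sketch (hypothetical source, not from this file): a
   floating-point reduction such as

     float sum = 0.0f;
     for (int i = 0; i < n; i++)
       sum += a[i];

   is vectorized using per-lane partial sums, which reassociates the
   additions and may change the rounded result; unless reassociation is
   permitted, such reductions take the fold-left path below.  */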
6978 if (slp_node
6979 && !REDUC_GROUP_FIRST_ELEMENT (stmt_info)
6980 && known_eq (LOOP_VINFO_VECT_FACTOR (loop_vinfo), 1u))
6982 else if (needs_fold_left_reduction_p (scalar_type, orig_code))
6984 /* When vectorizing a reduction chain w/o SLP the reduction PHI
6985 is not directly used in stmt. */
6986 if (!only_slp_reduc_chain
6987 && reduc_chain_length != 1)
6989 if (dump_enabled_p ())
6990 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6991 "in-order reduction chain without SLP.\n");
6992 return false;
6994 STMT_VINFO_REDUC_TYPE (reduc_info)
6995 = reduction_type = FOLD_LEFT_REDUCTION;
6997 else if (!commutative_tree_code (orig_code)
6998 || !associative_tree_code (orig_code))
7000 if (dump_enabled_p ())
7001 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7002 "reduction: not commutative/associative");
7003 return false;
7007 if ((double_reduc || reduction_type != TREE_CODE_REDUCTION)
7008 && ncopies > 1)
7010 if (dump_enabled_p ())
7011 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7012 "multiple types in double reduction or condition "
7013 "reduction or fold-left reduction.\n");
7014 return false;
7017 internal_fn reduc_fn = IFN_LAST;
7018 if (reduction_type == TREE_CODE_REDUCTION
7019 || reduction_type == FOLD_LEFT_REDUCTION
7020 || reduction_type == INTEGER_INDUC_COND_REDUCTION
7021 || reduction_type == CONST_COND_REDUCTION)
7023 if (reduction_type == FOLD_LEFT_REDUCTION
7024 ? fold_left_reduction_fn (orig_code, &reduc_fn)
7025 : reduction_fn_for_scalar_code (orig_code, &reduc_fn))
7027 if (reduc_fn != IFN_LAST
7028 && !direct_internal_fn_supported_p (reduc_fn, vectype_out,
7029 OPTIMIZE_FOR_SPEED))
7031 if (dump_enabled_p ())
7032 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7033 "reduc op not supported by target.\n");
7035 reduc_fn = IFN_LAST;
7038 else
7040 if (!nested_cycle || double_reduc)
7042 if (dump_enabled_p ())
7043 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7044 "no reduc code for scalar code.\n");
7046 return false;
7050 else if (reduction_type == COND_REDUCTION)
7052 int scalar_precision
7053 = GET_MODE_PRECISION (SCALAR_TYPE_MODE (scalar_type));
7054 cr_index_scalar_type = make_unsigned_type (scalar_precision);
7055 cr_index_vector_type = get_same_sized_vectype (cr_index_scalar_type,
7056 vectype_out);
7058 if (direct_internal_fn_supported_p (IFN_REDUC_MAX, cr_index_vector_type,
7059 OPTIMIZE_FOR_SPEED))
7060 reduc_fn = IFN_REDUC_MAX;
7062 STMT_VINFO_REDUC_FN (reduc_info) = reduc_fn;
7064 if (reduction_type != EXTRACT_LAST_REDUCTION
7065 && (!nested_cycle || double_reduc)
7066 && reduc_fn == IFN_LAST
7067 && !nunits_out.is_constant ())
7069 if (dump_enabled_p ())
7070 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7071 "missing target support for reduction on"
7072 " variable-length vectors.\n");
7073 return false;
7076 /* For SLP reductions, see if there is a neutral value we can use. */
7077 tree neutral_op = NULL_TREE;
7078 if (slp_node)
7080 tree initial_value = NULL_TREE;
7081 if (REDUC_GROUP_FIRST_ELEMENT (stmt_info) != NULL)
7082 initial_value = vect_phi_initial_value (reduc_def_phi);
7083 neutral_op = neutral_op_for_reduction (TREE_TYPE (vectype_out),
7084 orig_code, initial_value);
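/* For illustration (values as returned by neutral_op_for_reduction, listed
   here as a reminder rather than a specification): the neutral value is 0
   for PLUS, BIT_IOR and BIT_XOR, 1 for MULT, all-ones for BIT_AND, and for
   MIN/MAX the initial value itself is used.  */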
7087 if (double_reduc && reduction_type == FOLD_LEFT_REDUCTION)
7089 /* We can't support in-order reductions of code such as this:
7091 for (int i = 0; i < n1; ++i)
7092 for (int j = 0; j < n2; ++j)
7093 l += a[j];
7095 since GCC effectively transforms the loop when vectorizing:
7097 for (int i = 0; i < n1 / VF; ++i)
7098 for (int j = 0; j < n2; ++j)
7099 for (int k = 0; k < VF; ++k)
7100 l += a[j];
7102 which is a reassociation of the original operation. */
7103 if (dump_enabled_p ())
7104 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7105 "in-order double reduction not supported.\n");
7107 return false;
7110 if (reduction_type == FOLD_LEFT_REDUCTION
7111 && slp_node
7112 && !REDUC_GROUP_FIRST_ELEMENT (stmt_info))
7114 /* We cannot use in-order reductions in this case because there is
7115 an implicit reassociation of the operations involved. */
7116 if (dump_enabled_p ())
7117 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7118 "in-order unchained SLP reductions not supported.\n");
7119 return false;
7122 /* For double reductions, and for SLP reductions with a neutral value,
7123 we construct a variable-length initial vector by loading a vector
7124 full of the neutral value and then shift-and-inserting the start
7125 values into the low-numbered elements. */
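/* Illustrative sketch of that construction (values hypothetical): with
   neutral value 0 and start value S the initial accumulator is built as

     acc = { 0, 0, ..., 0 };              splat of the neutral value
     acc = VEC_SHL_INSERT (acc, S);       yields { S, 0, ..., 0 }

   so only the low-numbered element carries the start value.  */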
7126 if ((double_reduc || neutral_op)
7127 && !nunits_out.is_constant ()
7128 && !direct_internal_fn_supported_p (IFN_VEC_SHL_INSERT,
7129 vectype_out, OPTIMIZE_FOR_SPEED))
7131 if (dump_enabled_p ())
7132 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7133 "reduction on variable-length vectors requires"
7134 " target support for a vector-shift-and-insert"
7135 " operation.\n");
7136 return false;
7139 /* Check extra constraints for variable-length unchained SLP reductions. */
7140 if (STMT_SLP_TYPE (stmt_info)
7141 && !REDUC_GROUP_FIRST_ELEMENT (stmt_info)
7142 && !nunits_out.is_constant ())
7144 /* We checked above that we could build the initial vector when
7145 there's a neutral element value. Check here for the case in
7146 which each SLP statement has its own initial value and in which
7147 that value needs to be repeated for every instance of the
7148 statement within the initial vector. */
7149 unsigned int group_size = SLP_TREE_LANES (slp_node);
7150 if (!neutral_op
7151 && !can_duplicate_and_interleave_p (loop_vinfo, group_size,
7152 TREE_TYPE (vectype_out)))
7154 if (dump_enabled_p ())
7155 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7156 "unsupported form of SLP reduction for"
7157 " variable-length vectors: cannot build"
7158 " initial vector.\n");
7159 return false;
7161 /* The epilogue code relies on the number of elements being a multiple
7162 of the group size. The duplicate-and-interleave approach to setting
7163 up the initial vector does too. */
7164 if (!multiple_p (nunits_out, group_size))
7166 if (dump_enabled_p ())
7167 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7168 "unsupported form of SLP reduction for"
7169 " variable-length vectors: the vector size"
7170 " is not a multiple of the number of results.\n");
7171 return false;
7175 if (reduction_type == COND_REDUCTION)
7177 widest_int ni;
7179 if (! max_loop_iterations (loop, &ni))
7181 if (dump_enabled_p ())
7182 dump_printf_loc (MSG_NOTE, vect_location,
7183 "loop count not known, cannot create cond "
7184 "reduction.\n");
7185 return false;
7187 /* Convert backedges to iterations. */
7188 ni += 1;
7190 /* The additional index will have the same type as the condition. Check
7191 that the loop iteration count fits into this type less one (because
7192 we use up the zero slot for when there are no matches). */
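/* Worked example of the check below (numbers illustrative): for an 8-bit
   reduction value, cr_index_scalar_type is an unsigned 8-bit type with
   maximum 255.  Index 0 is reserved for "no match", so the iteration count
   ni must stay strictly below 255, i.e. at most 254 iterations.  */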
7193 tree max_index = TYPE_MAX_VALUE (cr_index_scalar_type);
7194 if (wi::geu_p (ni, wi::to_widest (max_index)))
7196 if (dump_enabled_p ())
7197 dump_printf_loc (MSG_NOTE, vect_location,
7198 "loop size is greater than data size.\n");
7199 return false;
7203 /* In case the vectorization factor (VF) is bigger than the number
7204 of elements that we can fit in a vectype (nunits), we have to generate
7205 more than one vector stmt - i.e. - we need to "unroll" the
7206 vector stmt by a factor VF/nunits. For more details see documentation
7207 in vectorizable_operation. */
7209 /* If the reduction is used in an outer loop we need to generate
7210 VF intermediate results, like so (e.g. for ncopies=2):
7211 r0 = phi (init, r0)
7212 r1 = phi (init, r1)
7213 r0 = x0 + r0;
7214 r1 = x1 + r1;
7215 (i.e. we generate VF results in 2 registers).
7216 In this case we have a separate def-use cycle for each copy, and therefore
7217 for each copy we get the vector def for the reduction variable from the
7218 respective phi node created for this copy.
7220 Otherwise (the reduction is unused in the loop nest), we can combine
7221 together intermediate results, like so (e.g. for ncopies=2):
7222 r = phi (init, r)
7223 r = x0 + r;
7224 r = x1 + r;
7225 (i.e. we generate VF/2 results in a single register).
7226 In this case for each copy we get the vector def for the reduction variable
7227 from the vectorized reduction operation generated in the previous iteration.
7229 This only works when we see both the reduction PHI and its only consumer
7230 in vectorizable_reduction and there are no intermediate stmts
7231 participating. */
7232 if (ncopies > 1
7233 && (STMT_VINFO_RELEVANT (stmt_info) <= vect_used_only_live)
7234 && reduc_chain_length == 1)
7235 single_defuse_cycle = true;
7237 if (single_defuse_cycle || lane_reduc_code_p)
7239 gcc_assert (code != COND_EXPR);
7241 /* 4. Supportable by target? */
7242 bool ok = true;
7244 /* 4.1. check support for the operation in the loop */
7245 optab optab = optab_for_tree_code (code, vectype_in, optab_query_kind);
7246 if (!optab)
7248 if (dump_enabled_p ())
7249 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7250 "no optab.\n");
7251 ok = false;
7254 machine_mode vec_mode = TYPE_MODE (vectype_in);
7255 if (ok && optab_handler (optab, vec_mode) == CODE_FOR_nothing)
7257 if (dump_enabled_p ())
7258 dump_printf (MSG_NOTE, "op not supported by target.\n");
7259 if (maybe_ne (GET_MODE_SIZE (vec_mode), UNITS_PER_WORD)
7260 || !vect_can_vectorize_without_simd_p (code))
7261 ok = false;
7262 else
7263 if (dump_enabled_p ())
7264 dump_printf (MSG_NOTE, "proceeding using word mode.\n");
7267 if (vect_emulated_vector_p (vectype_in)
7268 && !vect_can_vectorize_without_simd_p (code))
7270 if (dump_enabled_p ())
7271 dump_printf (MSG_NOTE, "using word mode not possible.\n");
7272 return false;
7275 /* lane-reducing operations have to go through vect_transform_reduction.
7276 For the other cases try without the single cycle optimization. */
7277 if (!ok)
7279 if (lane_reduc_code_p)
7280 return false;
7281 else
7282 single_defuse_cycle = false;
7285 STMT_VINFO_FORCE_SINGLE_CYCLE (reduc_info) = single_defuse_cycle;
7287 /* If the reduction stmt is one of the patterns that have lane
7288 reduction embedded we cannot handle the case of ! single_defuse_cycle. */
7289 if ((ncopies > 1 && ! single_defuse_cycle)
7290 && lane_reduc_code_p)
7292 if (dump_enabled_p ())
7293 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7294 "multi def-use cycle not possible for lane-reducing "
7295 "reduction operation\n");
7296 return false;
7299 if (slp_node
7300 && !(!single_defuse_cycle
7301 && code != DOT_PROD_EXPR
7302 && code != WIDEN_SUM_EXPR
7303 && code != SAD_EXPR
7304 && reduction_type != FOLD_LEFT_REDUCTION))
7305 for (i = 0; i < op_type; i++)
7306 if (!vect_maybe_update_slp_op_vectype (slp_op[i], vectype_in))
7308 if (dump_enabled_p ())
7309 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7310 "incompatible vector types for invariants\n");
7311 return false;
7314 if (slp_node)
7315 vec_num = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node);
7316 else
7317 vec_num = 1;
7319 vect_model_reduction_cost (loop_vinfo, stmt_info, reduc_fn,
7320 reduction_type, ncopies, cost_vec);
7321 /* Cost the reduction op inside the loop if transformed via
7322 vect_transform_reduction. Otherwise this is costed by the
7323 separate vectorizable_* routines. */
7324 if (single_defuse_cycle
7325 || code == DOT_PROD_EXPR
7326 || code == WIDEN_SUM_EXPR
7327 || code == SAD_EXPR)
7328 record_stmt_cost (cost_vec, ncopies, vector_stmt, stmt_info, 0, vect_body);
7330 if (dump_enabled_p ()
7331 && reduction_type == FOLD_LEFT_REDUCTION)
7332 dump_printf_loc (MSG_NOTE, vect_location,
7333 "using an in-order (fold-left) reduction.\n");
7334 STMT_VINFO_TYPE (orig_stmt_of_analysis) = cycle_phi_info_type;
7335 /* All reductions except single def-use-cycle optimized, lane-reducing and
7336 fold-left ones go through their own vectorizable_* routines. */
7337 if (!single_defuse_cycle
7338 && code != DOT_PROD_EXPR
7339 && code != WIDEN_SUM_EXPR
7340 && code != SAD_EXPR
7341 && reduction_type != FOLD_LEFT_REDUCTION)
7343 stmt_vec_info tem
7344 = vect_stmt_to_vectorize (STMT_VINFO_REDUC_DEF (phi_info));
7345 if (slp_node && REDUC_GROUP_FIRST_ELEMENT (tem))
7347 gcc_assert (!REDUC_GROUP_NEXT_ELEMENT (tem));
7348 tem = REDUC_GROUP_FIRST_ELEMENT (tem);
7350 STMT_VINFO_DEF_TYPE (vect_orig_stmt (tem)) = vect_internal_def;
7351 STMT_VINFO_DEF_TYPE (tem) = vect_internal_def;
7353 else if (loop_vinfo && LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo))
7355 vec_loop_masks *masks = &LOOP_VINFO_MASKS (loop_vinfo);
7356 internal_fn cond_fn = get_conditional_internal_fn (code);
7358 if (reduction_type != FOLD_LEFT_REDUCTION
7359 && !use_mask_by_cond_expr_p (code, cond_fn, vectype_in)
7360 && (cond_fn == IFN_LAST
7361 || !direct_internal_fn_supported_p (cond_fn, vectype_in,
7362 OPTIMIZE_FOR_SPEED)))
7364 if (dump_enabled_p ())
7365 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7366 "can't operate on partial vectors because"
7367 " no conditional operation is available.\n");
7368 LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo) = false;
7370 else if (reduction_type == FOLD_LEFT_REDUCTION
7371 && reduc_fn == IFN_LAST
7372 && !expand_vec_cond_expr_p (vectype_in,
7373 truth_type_for (vectype_in),
7374 SSA_NAME))
7376 if (dump_enabled_p ())
7377 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7378 "can't operate on partial vectors because"
7379 " no conditional operation is available.\n");
7380 LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo) = false;
7382 else
7383 vect_record_loop_mask (loop_vinfo, masks, ncopies * vec_num,
7384 vectype_in, NULL);
7386 return true;
7389 /* Transform the definition stmt STMT_INFO of a reduction PHI backedge
7390 value. */
7392 bool
7393 vect_transform_reduction (loop_vec_info loop_vinfo,
7394 stmt_vec_info stmt_info, gimple_stmt_iterator *gsi,
7395 gimple **vec_stmt, slp_tree slp_node)
7397 tree vectype_out = STMT_VINFO_VECTYPE (stmt_info);
7398 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
7399 int i;
7400 int ncopies;
7401 int vec_num;
7403 stmt_vec_info reduc_info = info_for_reduction (loop_vinfo, stmt_info);
7404 gcc_assert (reduc_info->is_reduc_info);
7406 if (nested_in_vect_loop_p (loop, stmt_info))
7408 loop = loop->inner;
7409 gcc_assert (STMT_VINFO_DEF_TYPE (reduc_info) == vect_double_reduction_def);
7412 gassign *stmt = as_a <gassign *> (stmt_info->stmt);
7413 enum tree_code code = gimple_assign_rhs_code (stmt);
7414 int op_type = TREE_CODE_LENGTH (code);
7416 /* Flatten RHS. */
7417 tree ops[3];
7418 switch (get_gimple_rhs_class (code))
7420 case GIMPLE_TERNARY_RHS:
7421 ops[2] = gimple_assign_rhs3 (stmt);
7422 /* Fall thru. */
7423 case GIMPLE_BINARY_RHS:
7424 ops[0] = gimple_assign_rhs1 (stmt);
7425 ops[1] = gimple_assign_rhs2 (stmt);
7426 break;
7427 default:
7428 gcc_unreachable ();
7431 /* All uses but the last are expected to be defined in the loop.
7432 The last use is the reduction variable. In case of nested cycle this
7433 assumption is not true: we use reduc_index to record the index of the
7434 reduction variable. */
7435 stmt_vec_info phi_info = STMT_VINFO_REDUC_DEF (vect_orig_stmt (stmt_info));
7436 gphi *reduc_def_phi = as_a <gphi *> (phi_info->stmt);
7437 int reduc_index = STMT_VINFO_REDUC_IDX (stmt_info);
7438 tree vectype_in = STMT_VINFO_REDUC_VECTYPE_IN (reduc_info);
7440 if (slp_node)
7442 ncopies = 1;
7443 vec_num = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node);
7445 else
7447 ncopies = vect_get_num_copies (loop_vinfo, vectype_in);
7448 vec_num = 1;
7451 internal_fn cond_fn = get_conditional_internal_fn (code);
7452 vec_loop_masks *masks = &LOOP_VINFO_MASKS (loop_vinfo);
7453 bool mask_by_cond_expr = use_mask_by_cond_expr_p (code, cond_fn, vectype_in);
7455 /* Transform. */
7456 tree new_temp = NULL_TREE;
7457 auto_vec<tree> vec_oprnds0;
7458 auto_vec<tree> vec_oprnds1;
7459 auto_vec<tree> vec_oprnds2;
7460 tree def0;
7462 if (dump_enabled_p ())
7463 dump_printf_loc (MSG_NOTE, vect_location, "transform reduction.\n");
7465 /* FORNOW: Multiple types are not supported for condition. */
7466 if (code == COND_EXPR)
7467 gcc_assert (ncopies == 1);
7469 bool masked_loop_p = LOOP_VINFO_FULLY_MASKED_P (loop_vinfo);
7471 vect_reduction_type reduction_type = STMT_VINFO_REDUC_TYPE (reduc_info);
7472 if (reduction_type == FOLD_LEFT_REDUCTION)
7474 internal_fn reduc_fn = STMT_VINFO_REDUC_FN (reduc_info);
7475 return vectorize_fold_left_reduction
7476 (loop_vinfo, stmt_info, gsi, vec_stmt, slp_node, reduc_def_phi, code,
7477 reduc_fn, ops, vectype_in, reduc_index, masks);
7480 bool single_defuse_cycle = STMT_VINFO_FORCE_SINGLE_CYCLE (reduc_info);
7481 gcc_assert (single_defuse_cycle
7482 || code == DOT_PROD_EXPR
7483 || code == WIDEN_SUM_EXPR
7484 || code == SAD_EXPR);
7486 /* Create the destination vector */
7487 tree scalar_dest = gimple_assign_lhs (stmt);
7488 tree vec_dest = vect_create_destination_var (scalar_dest, vectype_out);
7490 vect_get_vec_defs (loop_vinfo, stmt_info, slp_node, ncopies,
7491 single_defuse_cycle && reduc_index == 0
7492 ? NULL_TREE : ops[0], &vec_oprnds0,
7493 single_defuse_cycle && reduc_index == 1
7494 ? NULL_TREE : ops[1], &vec_oprnds1,
7495 op_type == ternary_op
7496 && !(single_defuse_cycle && reduc_index == 2)
7497 ? ops[2] : NULL_TREE, &vec_oprnds2);
7498 if (single_defuse_cycle)
7500 gcc_assert (!slp_node);
7501 vect_get_vec_defs_for_operand (loop_vinfo, stmt_info, 1,
7502 ops[reduc_index],
7503 reduc_index == 0 ? &vec_oprnds0
7504 : (reduc_index == 1 ? &vec_oprnds1
7505 : &vec_oprnds2));
7508 FOR_EACH_VEC_ELT (vec_oprnds0, i, def0)
7510 gimple *new_stmt;
7511 tree vop[3] = { def0, vec_oprnds1[i], NULL_TREE };
7512 if (masked_loop_p && !mask_by_cond_expr)
7514 /* Make sure that the reduction accumulator is vop[0]. */
7515 if (reduc_index == 1)
7517 gcc_assert (commutative_tree_code (code));
7518 std::swap (vop[0], vop[1]);
7520 tree mask = vect_get_loop_mask (gsi, masks, vec_num * ncopies,
7521 vectype_in, i);
7522 gcall *call = gimple_build_call_internal (cond_fn, 4, mask,
7523 vop[0], vop[1], vop[0]);
7524 new_temp = make_ssa_name (vec_dest, call);
7525 gimple_call_set_lhs (call, new_temp);
7526 gimple_call_set_nothrow (call, true);
7527 vect_finish_stmt_generation (loop_vinfo, stmt_info, call, gsi);
7528 new_stmt = call;
7530 else
7532 if (op_type == ternary_op)
7533 vop[2] = vec_oprnds2[i];
7535 if (masked_loop_p && mask_by_cond_expr)
7537 tree mask = vect_get_loop_mask (gsi, masks, vec_num * ncopies,
7538 vectype_in, i);
7539 build_vect_cond_expr (code, vop, mask, gsi);
7542 new_stmt = gimple_build_assign (vec_dest, code,
7543 vop[0], vop[1], vop[2]);
7544 new_temp = make_ssa_name (vec_dest, new_stmt);
7545 gimple_assign_set_lhs (new_stmt, new_temp);
7546 vect_finish_stmt_generation (loop_vinfo, stmt_info, new_stmt, gsi);
7549 if (slp_node)
7550 SLP_TREE_VEC_STMTS (slp_node).quick_push (new_stmt);
7551 else if (single_defuse_cycle
7552 && i < ncopies - 1)
7554 if (reduc_index == 0)
7555 vec_oprnds0.safe_push (gimple_get_lhs (new_stmt));
7556 else if (reduc_index == 1)
7557 vec_oprnds1.safe_push (gimple_get_lhs (new_stmt));
7558 else if (reduc_index == 2)
7559 vec_oprnds2.safe_push (gimple_get_lhs (new_stmt));
7561 else
7562 STMT_VINFO_VEC_STMTS (stmt_info).safe_push (new_stmt);
7565 if (!slp_node)
7566 *vec_stmt = STMT_VINFO_VEC_STMTS (stmt_info)[0];
7568 return true;
7571 /* Transform phase of a cycle PHI. */
7573 bool
7574 vect_transform_cycle_phi (loop_vec_info loop_vinfo,
7575 stmt_vec_info stmt_info, gimple **vec_stmt,
7576 slp_tree slp_node, slp_instance slp_node_instance)
7578 tree vectype_out = STMT_VINFO_VECTYPE (stmt_info);
7579 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
7580 int i;
7581 int ncopies;
7582 int j;
7583 bool nested_cycle = false;
7584 int vec_num;
7586 if (nested_in_vect_loop_p (loop, stmt_info))
7588 loop = loop->inner;
7589 nested_cycle = true;
7592 stmt_vec_info reduc_stmt_info = STMT_VINFO_REDUC_DEF (stmt_info);
7593 reduc_stmt_info = vect_stmt_to_vectorize (reduc_stmt_info);
7594 stmt_vec_info reduc_info = info_for_reduction (loop_vinfo, stmt_info);
7595 gcc_assert (reduc_info->is_reduc_info);
7597 if (STMT_VINFO_REDUC_TYPE (reduc_info) == EXTRACT_LAST_REDUCTION
7598 || STMT_VINFO_REDUC_TYPE (reduc_info) == FOLD_LEFT_REDUCTION)
7599 /* Leave the scalar phi in place. */
7600 return true;
7602 tree vectype_in = STMT_VINFO_REDUC_VECTYPE_IN (reduc_info);
7603 /* For a nested cycle we do not fill the above. */
7604 if (!vectype_in)
7605 vectype_in = STMT_VINFO_VECTYPE (stmt_info);
7606 gcc_assert (vectype_in);
7608 if (slp_node)
7610 /* The size vect_schedule_slp_instance computes is off for us. */
7611 vec_num = vect_get_num_vectors (LOOP_VINFO_VECT_FACTOR (loop_vinfo)
7612 * SLP_TREE_LANES (slp_node), vectype_in);
7613 ncopies = 1;
7615 else
7617 vec_num = 1;
7618 ncopies = vect_get_num_copies (loop_vinfo, vectype_in);
7621 /* Check whether we should use a single PHI node and accumulate
7622 vectors to one before the backedge. */
7623 if (STMT_VINFO_FORCE_SINGLE_CYCLE (reduc_info))
7624 ncopies = 1;
7626 /* Create the destination vector */
7627 gphi *phi = as_a <gphi *> (stmt_info->stmt);
7628 tree vec_dest = vect_create_destination_var (gimple_phi_result (phi),
7629 vectype_out);
7631 /* Get the loop-entry arguments. */
7632 tree vec_initial_def = NULL_TREE;
7633 auto_vec<tree> vec_initial_defs;
7634 if (slp_node)
7636 vec_initial_defs.reserve (vec_num);
7637 if (nested_cycle)
7639 unsigned phi_idx = loop_preheader_edge (loop)->dest_idx;
7640 vect_get_slp_defs (SLP_TREE_CHILDREN (slp_node)[phi_idx],
7641 &vec_initial_defs);
7643 else
7645 gcc_assert (slp_node == slp_node_instance->reduc_phis);
7646 vec<tree> &initial_values = reduc_info->reduc_initial_values;
7647 vec<stmt_vec_info> &stmts = SLP_TREE_SCALAR_STMTS (slp_node);
7649 unsigned int num_phis = stmts.length ();
7650 if (REDUC_GROUP_FIRST_ELEMENT (reduc_stmt_info))
7651 num_phis = 1;
7652 initial_values.reserve (num_phis);
7653 for (unsigned int i = 0; i < num_phis; ++i)
7655 gphi *this_phi = as_a<gphi *> (stmts[i]->stmt);
7656 initial_values.quick_push (vect_phi_initial_value (this_phi));
7658 if (vec_num == 1)
7659 vect_find_reusable_accumulator (loop_vinfo, reduc_info);
7660 if (!initial_values.is_empty ())
7662 tree initial_value
7663 = (num_phis == 1 ? initial_values[0] : NULL_TREE);
7664 tree_code code = STMT_VINFO_REDUC_CODE (reduc_info);
7665 tree neutral_op
7666 = neutral_op_for_reduction (TREE_TYPE (vectype_out),
7667 code, initial_value);
7668 get_initial_defs_for_reduction (loop_vinfo, reduc_info,
7669 &vec_initial_defs, vec_num,
7670 stmts.length (), neutral_op);
7674 else
7676 /* Get at the scalar def before the loop, that defines the initial
7677 value of the reduction variable. */
7678 tree initial_def = vect_phi_initial_value (phi);
7679 reduc_info->reduc_initial_values.safe_push (initial_def);
7680 /* Optimize: if, for REDUC_MAX, initial_def is smaller than the base
7681 and we can't use zero for induc_val, use initial_def; similarly
7682 for REDUC_MIN when initial_def is larger than the base. */
7683 if (STMT_VINFO_REDUC_TYPE (reduc_info) == INTEGER_INDUC_COND_REDUCTION)
7685 tree induc_val = STMT_VINFO_VEC_INDUC_COND_INITIAL_VAL (reduc_info);
7686 if (TREE_CODE (initial_def) == INTEGER_CST
7687 && !integer_zerop (induc_val)
7688 && ((STMT_VINFO_REDUC_CODE (reduc_info) == MAX_EXPR
7689 && tree_int_cst_lt (initial_def, induc_val))
7690 || (STMT_VINFO_REDUC_CODE (reduc_info) == MIN_EXPR
7691 && tree_int_cst_lt (induc_val, initial_def))))
7693 induc_val = initial_def;
7694 /* Communicate to epilogue generation that we used
7695 the initial_def. */
7696 STMT_VINFO_VEC_INDUC_COND_INITIAL_VAL (reduc_info) = NULL_TREE;
7698 vec_initial_def = build_vector_from_val (vectype_out, induc_val);
7700 else if (nested_cycle)
7702 /* Do not use an adjustment def as that case is not supported
7703 correctly if ncopies is not one. */
7704 vect_get_vec_defs_for_operand (loop_vinfo, reduc_stmt_info,
7705 ncopies, initial_def,
7706 &vec_initial_defs);
7708 else if (STMT_VINFO_REDUC_TYPE (reduc_info) == CONST_COND_REDUCTION
7709 || STMT_VINFO_REDUC_TYPE (reduc_info) == COND_REDUCTION)
7710 /* Fill the initial vector with the initial scalar value. */
7711 vec_initial_def
7712 = get_initial_def_for_reduction (loop_vinfo, reduc_stmt_info,
7713 initial_def, initial_def);
7714 else
7716 if (ncopies == 1)
7717 vect_find_reusable_accumulator (loop_vinfo, reduc_info);
7718 if (!reduc_info->reduc_initial_values.is_empty ())
7720 initial_def = reduc_info->reduc_initial_values[0];
7721 enum tree_code code = STMT_VINFO_REDUC_CODE (reduc_info);
7722 tree neutral_op
7723 = neutral_op_for_reduction (TREE_TYPE (initial_def),
7724 code, initial_def);
7725 gcc_assert (neutral_op);
7726 /* Try to simplify the vector initialization by applying an
7727 adjustment after the reduction has been performed. */
7728 if (!reduc_info->reused_accumulator
7729 && STMT_VINFO_DEF_TYPE (stmt_info) == vect_reduction_def
7730 && !operand_equal_p (neutral_op, initial_def))
7732 STMT_VINFO_REDUC_EPILOGUE_ADJUSTMENT (reduc_info)
7733 = initial_def;
7734 initial_def = neutral_op;
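/* Illustrative example of this adjustment (values hypothetical): for
     int sum = 10;  for (...) sum += a[i];
   the vector accumulator starts from the neutral value { 0, 0, 0, 0 } and
   the original start value 10 is added back once in the epilogue via
   STMT_VINFO_REDUC_EPILOGUE_ADJUSTMENT.  */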
7736 vec_initial_def
7737 = get_initial_def_for_reduction (loop_vinfo, reduc_info,
7738 initial_def, neutral_op);
7743 if (vec_initial_def)
7745 vec_initial_defs.create (ncopies);
7746 for (i = 0; i < ncopies; ++i)
7747 vec_initial_defs.quick_push (vec_initial_def);
7750 if (auto *accumulator = reduc_info->reused_accumulator)
7752 tree def = accumulator->reduc_input;
7753 if (!useless_type_conversion_p (vectype_out, TREE_TYPE (def)))
7755 unsigned int nreduc;
7756 bool res = constant_multiple_p (TYPE_VECTOR_SUBPARTS
7757 (TREE_TYPE (def)),
7758 TYPE_VECTOR_SUBPARTS (vectype_out),
7759 &nreduc);
7760 gcc_assert (res);
7761 gimple_seq stmts = NULL;
7762 /* Reduce the single vector to a smaller one. */
7763 if (nreduc != 1)
7765 /* Perform the reduction in the appropriate type. */
7766 tree rvectype = vectype_out;
7767 if (!useless_type_conversion_p (TREE_TYPE (vectype_out),
7768 TREE_TYPE (TREE_TYPE (def))))
7769 rvectype = build_vector_type (TREE_TYPE (TREE_TYPE (def)),
7770 TYPE_VECTOR_SUBPARTS
7771 (vectype_out));
7772 def = vect_create_partial_epilog (def, rvectype,
7773 STMT_VINFO_REDUC_CODE
7774 (reduc_info),
7775 &stmts);
7777 /* The epilogue loop might use a different vector mode, like
7778 VNx2DI vs. V2DI. */
7779 if (TYPE_MODE (vectype_out) != TYPE_MODE (TREE_TYPE (def)))
7781 tree reduc_type = build_vector_type_for_mode
7782 (TREE_TYPE (TREE_TYPE (def)), TYPE_MODE (vectype_out));
7783 def = gimple_convert (&stmts, reduc_type, def);
7785 /* Adjust the input so we pick up the partially reduced value
7786 for the skip edge in vect_create_epilog_for_reduction. */
7787 accumulator->reduc_input = def;
7788 /* And the reduction could be carried out using a different sign. */
7789 if (!useless_type_conversion_p (vectype_out, TREE_TYPE (def)))
7790 def = gimple_convert (&stmts, vectype_out, def);
7791 if (loop_vinfo->main_loop_edge)
7793 /* While we'd like to insert on the edge, this would split
7794 blocks and disturb bookkeeping, and we will eventually
7795 need this on the skip edge as well. Rely on sinking to
7796 fix up optimal placement and insert in the predecessor. */
7797 gimple_stmt_iterator gsi
7798 = gsi_last_bb (loop_vinfo->main_loop_edge->src);
7799 /* Insert before a cond that eventually skips the
7800 epilogue. */
7801 if (!gsi_end_p (gsi) && stmt_ends_bb_p (gsi_stmt (gsi)))
7802 gsi_prev (&gsi);
7803 gsi_insert_seq_after (&gsi, stmts, GSI_CONTINUE_LINKING);
7805 else
7806 gsi_insert_seq_on_edge_immediate (loop_preheader_edge (loop),
7807 stmts);
7809 if (loop_vinfo->main_loop_edge)
7810 vec_initial_defs[0]
7811 = vect_get_main_loop_result (loop_vinfo, def,
7812 vec_initial_defs[0]);
7813 else
7814 vec_initial_defs.safe_push (def);
7817 /* Generate the reduction PHIs upfront. */
7818 for (i = 0; i < vec_num; i++)
7820 tree vec_init_def = vec_initial_defs[i];
7821 for (j = 0; j < ncopies; j++)
7823 /* Create the reduction-phi that defines the reduction
7824 operand. */
7825 gphi *new_phi = create_phi_node (vec_dest, loop->header);
7827 /* Set the loop-entry arg of the reduction-phi. */
7828 if (j != 0 && nested_cycle)
7829 vec_init_def = vec_initial_defs[j];
7830 add_phi_arg (new_phi, vec_init_def, loop_preheader_edge (loop),
7831 UNKNOWN_LOCATION);
7833 /* The loop-latch arg is set in epilogue processing. */
7835 if (slp_node)
7836 SLP_TREE_VEC_STMTS (slp_node).quick_push (new_phi);
7837 else
7839 if (j == 0)
7840 *vec_stmt = new_phi;
7841 STMT_VINFO_VEC_STMTS (stmt_info).safe_push (new_phi);
7846 return true;
7849 /* Vectorizes LC PHIs. */
7851 bool
7852 vectorizable_lc_phi (loop_vec_info loop_vinfo,
7853 stmt_vec_info stmt_info, gimple **vec_stmt,
7854 slp_tree slp_node)
7856 if (!loop_vinfo
7857 || !is_a <gphi *> (stmt_info->stmt)
7858 || gimple_phi_num_args (stmt_info->stmt) != 1)
7859 return false;
7861 if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_internal_def
7862 && STMT_VINFO_DEF_TYPE (stmt_info) != vect_double_reduction_def)
7863 return false;
7865 if (!vec_stmt) /* transformation not required. */
7867 /* Deal with copies from externs or constants that disguise as
7868 loop-closed PHI nodes (PR97886). */
7869 if (slp_node
7870 && !vect_maybe_update_slp_op_vectype (SLP_TREE_CHILDREN (slp_node)[0],
7871 SLP_TREE_VECTYPE (slp_node)))
7873 if (dump_enabled_p ())
7874 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7875 "incompatible vector types for invariants\n");
7876 return false;
7878 STMT_VINFO_TYPE (stmt_info) = lc_phi_info_type;
7879 return true;
7882 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
7883 tree scalar_dest = gimple_phi_result (stmt_info->stmt);
7884 basic_block bb = gimple_bb (stmt_info->stmt);
7885 edge e = single_pred_edge (bb);
7886 tree vec_dest = vect_create_destination_var (scalar_dest, vectype);
7887 auto_vec<tree> vec_oprnds;
7888 vect_get_vec_defs (loop_vinfo, stmt_info, slp_node,
7889 !slp_node ? vect_get_num_copies (loop_vinfo, vectype) : 1,
7890 gimple_phi_arg_def (stmt_info->stmt, 0), &vec_oprnds);
7891 for (unsigned i = 0; i < vec_oprnds.length (); i++)
7893 /* Create the vectorized LC PHI node. */
7894 gphi *new_phi = create_phi_node (vec_dest, bb);
7895 add_phi_arg (new_phi, vec_oprnds[i], e, UNKNOWN_LOCATION);
7896 if (slp_node)
7897 SLP_TREE_VEC_STMTS (slp_node).quick_push (new_phi);
7898 else
7899 STMT_VINFO_VEC_STMTS (stmt_info).safe_push (new_phi);
7901 if (!slp_node)
7902 *vec_stmt = STMT_VINFO_VEC_STMTS (stmt_info)[0];
7904 return true;
7907 /* Vectorizes PHIs. */
7909 bool
7910 vectorizable_phi (vec_info *,
7911 stmt_vec_info stmt_info, gimple **vec_stmt,
7912 slp_tree slp_node, stmt_vector_for_cost *cost_vec)
7914 if (!is_a <gphi *> (stmt_info->stmt) || !slp_node)
7915 return false;
7917 if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_internal_def)
7918 return false;
7920 tree vectype = SLP_TREE_VECTYPE (slp_node);
7922 if (!vec_stmt) /* transformation not required. */
7924 slp_tree child;
7925 unsigned i;
7926 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (slp_node), i, child)
7927 if (!child)
7929 if (dump_enabled_p ())
7930 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7931 "PHI node with unvectorized backedge def\n");
7932 return false;
7934 else if (!vect_maybe_update_slp_op_vectype (child, vectype))
7936 if (dump_enabled_p ())
7937 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7938 "incompatible vector types for invariants\n");
7939 return false;
7941 /* For single-argument PHIs assume coalescing which means zero cost
7942 for the scalar and the vector PHIs. This avoids artificially
7943 favoring the vector path (but may pessimize it in some cases). */
7944 if (gimple_phi_num_args (as_a <gphi *> (stmt_info->stmt)) > 1)
7945 record_stmt_cost (cost_vec, SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node),
7946 vector_stmt, stmt_info, vectype, 0, vect_body);
7947 STMT_VINFO_TYPE (stmt_info) = phi_info_type;
7948 return true;
7951 tree scalar_dest = gimple_phi_result (stmt_info->stmt);
7952 basic_block bb = gimple_bb (stmt_info->stmt);
7953 tree vec_dest = vect_create_destination_var (scalar_dest, vectype);
7954 auto_vec<gphi *> new_phis;
7955 for (unsigned i = 0; i < gimple_phi_num_args (stmt_info->stmt); ++i)
7957 slp_tree child = SLP_TREE_CHILDREN (slp_node)[i];
7959 /* Skip not yet vectorized defs. */
7960 if (SLP_TREE_DEF_TYPE (child) == vect_internal_def
7961 && SLP_TREE_VEC_STMTS (child).is_empty ())
7962 continue;
7964 auto_vec<tree> vec_oprnds;
7965 vect_get_slp_defs (SLP_TREE_CHILDREN (slp_node)[i], &vec_oprnds);
7966 if (!new_phis.exists ())
7968 new_phis.create (vec_oprnds.length ());
7969 for (unsigned j = 0; j < vec_oprnds.length (); j++)
7971 /* Create the vectorized LC PHI node. */
7972 new_phis.quick_push (create_phi_node (vec_dest, bb));
7973 SLP_TREE_VEC_STMTS (slp_node).quick_push (new_phis[j]);
7976 edge e = gimple_phi_arg_edge (as_a <gphi *> (stmt_info->stmt), i);
7977 for (unsigned j = 0; j < vec_oprnds.length (); j++)
7978 add_phi_arg (new_phis[j], vec_oprnds[j], e, UNKNOWN_LOCATION);
7980 /* We should have at least one already vectorized child. */
7981 gcc_assert (new_phis.exists ());
7983 return true;
7986 /* Return true if VECTYPE represents a vector that requires lowering
7987 by the vector lowering pass. */
7989 bool
7990 vect_emulated_vector_p (tree vectype)
7992 return (!VECTOR_MODE_P (TYPE_MODE (vectype))
7993 && (!VECTOR_BOOLEAN_TYPE_P (vectype)
7994 || TYPE_PRECISION (TREE_TYPE (vectype)) != 1));
7997 /* Return true if we can emulate CODE on an integer mode representation
7998 of a vector. */
8000 bool
8001 vect_can_vectorize_without_simd_p (tree_code code)
8003 switch (code)
8005 case PLUS_EXPR:
8006 case MINUS_EXPR:
8007 case NEGATE_EXPR:
8008 case BIT_AND_EXPR:
8009 case BIT_IOR_EXPR:
8010 case BIT_XOR_EXPR:
8011 case BIT_NOT_EXPR:
8012 return true;
8014 default:
8015 return false;
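/* As an illustration of such emulation (hypothetical example, not part of
   this file): a bitwise IOR of two "vectors" of four chars can be carried
   out on a plain 32-bit integer,

     uint32_t a, b;       four packed char lanes each
     uint32_t r = a | b;  per-lane result without any SIMD support

   which is why the bitwise codes above are always safe; the additive codes
   listed are likewise handled by the generic vector lowering pass.  */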
8019 /* Function vectorizable_induction
8021 Check if STMT_INFO performs an induction computation that can be vectorized.
8022 If VEC_STMT is also passed, vectorize the induction PHI: create a vectorized
8023 phi to replace it, put it in VEC_STMT, and add it to the same basic block.
8024 Return true if STMT_INFO is vectorizable in this way. */
8026 bool
8027 vectorizable_induction (loop_vec_info loop_vinfo,
8028 stmt_vec_info stmt_info,
8029 gimple **vec_stmt, slp_tree slp_node,
8030 stmt_vector_for_cost *cost_vec)
8032 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
8033 unsigned ncopies;
8034 bool nested_in_vect_loop = false;
8035 class loop *iv_loop;
8036 tree vec_def;
8037 edge pe = loop_preheader_edge (loop);
8038 basic_block new_bb;
8039 tree new_vec, vec_init, vec_step, t;
8040 tree new_name;
8041 gimple *new_stmt;
8042 gphi *induction_phi;
8043 tree induc_def, vec_dest;
8044 tree init_expr, step_expr;
8045 poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
8046 unsigned i;
8047 tree expr;
8048 gimple_stmt_iterator si;
8050 gphi *phi = dyn_cast <gphi *> (stmt_info->stmt);
8051 if (!phi)
8052 return false;
8054 if (!STMT_VINFO_RELEVANT_P (stmt_info))
8055 return false;
8057 /* Make sure it was recognized as induction computation. */
8058 if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_induction_def)
8059 return false;
8061 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
8062 poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (vectype);
8064 if (slp_node)
8065 ncopies = 1;
8066 else
8067 ncopies = vect_get_num_copies (loop_vinfo, vectype);
8068 gcc_assert (ncopies >= 1);
8070 /* FORNOW. These restrictions should be relaxed. */
8071 if (nested_in_vect_loop_p (loop, stmt_info))
8073 imm_use_iterator imm_iter;
8074 use_operand_p use_p;
8075 gimple *exit_phi;
8076 edge latch_e;
8077 tree loop_arg;
8079 if (ncopies > 1)
8081 if (dump_enabled_p ())
8082 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
8083 "multiple types in nested loop.\n");
8084 return false;
8087 exit_phi = NULL;
8088 latch_e = loop_latch_edge (loop->inner);
8089 loop_arg = PHI_ARG_DEF_FROM_EDGE (phi, latch_e);
8090 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, loop_arg)
8092 gimple *use_stmt = USE_STMT (use_p);
8093 if (is_gimple_debug (use_stmt))
8094 continue;
8096 if (!flow_bb_inside_loop_p (loop->inner, gimple_bb (use_stmt)))
8098 exit_phi = use_stmt;
8099 break;
8102 if (exit_phi)
8104 stmt_vec_info exit_phi_vinfo = loop_vinfo->lookup_stmt (exit_phi);
8105 if (!(STMT_VINFO_RELEVANT_P (exit_phi_vinfo)
8106 && !STMT_VINFO_LIVE_P (exit_phi_vinfo)))
8108 if (dump_enabled_p ())
8109 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
8110 "inner-loop induction only used outside "
8111 "of the outer vectorized loop.\n");
8112 return false;
8116 nested_in_vect_loop = true;
8117 iv_loop = loop->inner;
8119 else
8120 iv_loop = loop;
8121 gcc_assert (iv_loop == (gimple_bb (phi))->loop_father);
8123 if (slp_node && !nunits.is_constant ())
8125 /* The current SLP code creates the step value element-by-element. */
8126 if (dump_enabled_p ())
8127 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
8128 "SLP induction not supported for variable-length"
8129 " vectors.\n");
8130 return false;
8133 if (!vec_stmt) /* transformation not required. */
8135 unsigned inside_cost = 0, prologue_cost = 0;
8136 if (slp_node)
8138 /* We eventually need to set a vector type on invariant
8139 arguments. */
8140 unsigned j;
8141 slp_tree child;
8142 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (slp_node), j, child)
8143 if (!vect_maybe_update_slp_op_vectype
8144 (child, SLP_TREE_VECTYPE (slp_node)))
8146 if (dump_enabled_p ())
8147 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
8148 "incompatible vector types for "
8149 "invariants\n");
8150 return false;
8152 /* loop cost for vec_loop. */
8153 inside_cost
8154 = record_stmt_cost (cost_vec,
8155 SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node),
8156 vector_stmt, stmt_info, 0, vect_body);
8157 /* prologue cost for vec_init (if not nested) and step. */
8158 prologue_cost = record_stmt_cost (cost_vec, 1 + !nested_in_vect_loop,
8159 scalar_to_vec,
8160 stmt_info, 0, vect_prologue);
8162 else /* if (!slp_node) */
8164 /* loop cost for vec_loop. */
8165 inside_cost = record_stmt_cost (cost_vec, ncopies, vector_stmt,
8166 stmt_info, 0, vect_body);
8167 /* prologue cost for vec_init and vec_step. */
8168 prologue_cost = record_stmt_cost (cost_vec, 2, scalar_to_vec,
8169 stmt_info, 0, vect_prologue);
8171 if (dump_enabled_p ())
8172 dump_printf_loc (MSG_NOTE, vect_location,
8173 "vect_model_induction_cost: inside_cost = %d, "
8174 "prologue_cost = %d .\n", inside_cost,
8175 prologue_cost);
8177 STMT_VINFO_TYPE (stmt_info) = induc_vec_info_type;
8178 DUMP_VECT_SCOPE ("vectorizable_induction");
8179 return true;
8182 /* Transform. */
8184 /* Compute a vector variable, initialized with the first VF values of
8185 the induction variable. E.g., for an iv with IV_PHI='X' and
8186 evolution S, for a vector of 4 units, we want to compute:
8187 [X, X + S, X + 2*S, X + 3*S]. */
8189 if (dump_enabled_p ())
8190 dump_printf_loc (MSG_NOTE, vect_location, "transform induction phi.\n");
8192 step_expr = STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_info);
8193 gcc_assert (step_expr != NULL_TREE);
8194 tree step_vectype = get_same_sized_vectype (TREE_TYPE (step_expr), vectype);
8196 pe = loop_preheader_edge (iv_loop);
8197 /* Find the first insertion point in the BB. */
8198 basic_block bb = gimple_bb (phi);
8199 si = gsi_after_labels (bb);
8201 /* For SLP induction we have to generate several IVs as for example
8202 with group size 3 we need
8203 [i0, i1, i2, i0 + S0] [i1 + S1, i2 + S2, i0 + 2*S0, i1 + 2*S1]
8204 [i2 + 2*S2, i0 + 3*S0, i1 + 3*S1, i2 + 3*S2]. */
8205 if (slp_node)
8207 /* Enforced above. */
8208 unsigned int const_nunits = nunits.to_constant ();
8210 /* The initial values are vectorized, but any lanes > group_size
8211 need adjustment. */
8212 slp_tree init_node
8213 = SLP_TREE_CHILDREN (slp_node)[pe->dest_idx];
8215 /* Gather steps. Since we do not vectorize inductions as
8216 cycles we have to reconstruct the step from SCEV data. */
8217 unsigned group_size = SLP_TREE_LANES (slp_node);
8218 tree *steps = XALLOCAVEC (tree, group_size);
8219 tree *inits = XALLOCAVEC (tree, group_size);
8220 stmt_vec_info phi_info;
8221 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (slp_node), i, phi_info)
8223 steps[i] = STMT_VINFO_LOOP_PHI_EVOLUTION_PART (phi_info);
8224 if (!init_node)
8225 inits[i] = gimple_phi_arg_def (as_a<gphi *> (phi_info->stmt),
8226 pe->dest_idx);
8229 /* Now generate the IVs. */
8230 unsigned nvects = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node);
8231 gcc_assert ((const_nunits * nvects) % group_size == 0);
8232 unsigned nivs;
8233 if (nested_in_vect_loop)
8234 nivs = nvects;
8235 else
8237 /* Compute the number of distinct IVs we need. First reduce
8238 group_size if it is a multiple of const_nunits so we get
8239 one IV for a group_size of 4 but const_nunits 2. */
8240 unsigned group_sizep = group_size;
8241 if (group_sizep % const_nunits == 0)
8242 group_sizep = group_sizep / const_nunits;
8243 nivs = least_common_multiple (group_sizep,
8244 const_nunits) / const_nunits;
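/* Worked example (numbers illustrative): for group_size 3 and const_nunits 4,
   group_sizep stays 3 and nivs = lcm (3, 4) / 4 = 3, matching the three
   interleaved IVs in the comment further above; for group_size 4 and
   const_nunits 2, group_sizep becomes 2 and nivs = lcm (2, 2) / 2 = 1.  */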
8246 tree stept = TREE_TYPE (step_vectype);
8247 tree lupdate_mul = NULL_TREE;
8248 if (!nested_in_vect_loop)
8250 /* The number of iterations covered in one vector iteration. */
8251 unsigned lup_mul = (nvects * const_nunits) / group_size;
8252 lupdate_mul
8253 = build_vector_from_val (step_vectype,
8254 SCALAR_FLOAT_TYPE_P (stept)
8255 ? build_real_from_wide (stept, lup_mul,
8256 UNSIGNED)
8257 : build_int_cstu (stept, lup_mul));
8259 tree peel_mul = NULL_TREE;
8260 gimple_seq init_stmts = NULL;
8261 if (LOOP_VINFO_MASK_SKIP_NITERS (loop_vinfo))
8263 if (SCALAR_FLOAT_TYPE_P (stept))
8264 peel_mul = gimple_build (&init_stmts, FLOAT_EXPR, stept,
8265 LOOP_VINFO_MASK_SKIP_NITERS (loop_vinfo));
8266 else
8267 peel_mul = gimple_convert (&init_stmts, stept,
8268 LOOP_VINFO_MASK_SKIP_NITERS (loop_vinfo));
8269 peel_mul = gimple_build_vector_from_val (&init_stmts,
8270 step_vectype, peel_mul);
8272 unsigned ivn;
8273 auto_vec<tree> vec_steps;
8274 for (ivn = 0; ivn < nivs; ++ivn)
8276 tree_vector_builder step_elts (step_vectype, const_nunits, 1);
8277 tree_vector_builder init_elts (vectype, const_nunits, 1);
8278 tree_vector_builder mul_elts (step_vectype, const_nunits, 1);
8279 for (unsigned eltn = 0; eltn < const_nunits; ++eltn)
8281 /* The scalar steps of the IVs. */
8282 tree elt = steps[(ivn*const_nunits + eltn) % group_size];
8283 elt = gimple_convert (&init_stmts, TREE_TYPE (step_vectype), elt);
8284 step_elts.quick_push (elt);
8285 if (!init_node)
8287 /* The scalar inits of the IVs if not vectorized. */
8288 elt = inits[(ivn*const_nunits + eltn) % group_size];
8289 if (!useless_type_conversion_p (TREE_TYPE (vectype),
8290 TREE_TYPE (elt)))
8291 elt = gimple_build (&init_stmts, VIEW_CONVERT_EXPR,
8292 TREE_TYPE (vectype), elt);
8293 init_elts.quick_push (elt);
8295 /* The number of steps to add to the initial values. */
8296 unsigned mul_elt = (ivn*const_nunits + eltn) / group_size;
8297 mul_elts.quick_push (SCALAR_FLOAT_TYPE_P (stept)
8298 ? build_real_from_wide (stept,
8299 mul_elt, UNSIGNED)
8300 : build_int_cstu (stept, mul_elt));
8302 vec_step = gimple_build_vector (&init_stmts, &step_elts);
8303 vec_steps.safe_push (vec_step);
8304 tree step_mul = gimple_build_vector (&init_stmts, &mul_elts);
8305 if (peel_mul)
8306 step_mul = gimple_build (&init_stmts, PLUS_EXPR, step_vectype,
8307 step_mul, peel_mul);
8308 if (!init_node)
8309 vec_init = gimple_build_vector (&init_stmts, &init_elts);
8311 /* Create the induction-phi that defines the induction-operand. */
8312 vec_dest = vect_get_new_vect_var (vectype, vect_simple_var,
8313 "vec_iv_");
8314 induction_phi = create_phi_node (vec_dest, iv_loop->header);
8315 induc_def = PHI_RESULT (induction_phi);
8317 /* Create the iv update inside the loop */
8318 tree up = vec_step;
8319 if (lupdate_mul)
8320 up = gimple_build (&init_stmts, MULT_EXPR, step_vectype,
8321 vec_step, lupdate_mul);
8322 gimple_seq stmts = NULL;
8323 vec_def = gimple_convert (&stmts, step_vectype, induc_def);
8324 vec_def = gimple_build (&stmts,
8325 PLUS_EXPR, step_vectype, vec_def, up);
8326 vec_def = gimple_convert (&stmts, vectype, vec_def);
8327 gsi_insert_seq_before (&si, stmts, GSI_SAME_STMT);
8328 add_phi_arg (induction_phi, vec_def, loop_latch_edge (iv_loop),
8329 UNKNOWN_LOCATION);
8331 if (init_node)
8332 vec_init = vect_get_slp_vect_def (init_node, ivn);
8333 if (!nested_in_vect_loop
8334 && !integer_zerop (step_mul))
8336 vec_def = gimple_convert (&init_stmts, step_vectype, vec_init);
8337 up = gimple_build (&init_stmts, MULT_EXPR, step_vectype,
8338 vec_step, step_mul);
8339 vec_def = gimple_build (&init_stmts, PLUS_EXPR, step_vectype,
8340 vec_def, up);
8341 vec_init = gimple_convert (&init_stmts, vectype, vec_def);
8344 /* Set the arguments of the phi node: */
8345 add_phi_arg (induction_phi, vec_init, pe, UNKNOWN_LOCATION);
8347 SLP_TREE_VEC_STMTS (slp_node).quick_push (induction_phi);
8349 if (!nested_in_vect_loop)
8351 /* Fill up to the number of vectors we need for the whole group. */
8352 nivs = least_common_multiple (group_size,
8353 const_nunits) / const_nunits;
8354 vec_steps.reserve (nivs-ivn);
8355 for (; ivn < nivs; ++ivn)
8357 SLP_TREE_VEC_STMTS (slp_node)
8358 .quick_push (SLP_TREE_VEC_STMTS (slp_node)[0]);
8359 vec_steps.quick_push (vec_steps[0]);
8363 /* Re-use IVs when we can. We are generating further vector
8364 stmts by adding VF' * stride to the IVs generated above. */
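/* For instance (illustrative numbers): with nivs == 2 and nvects == 4,
   vector stmts 2 and 3 are generated below as IV0 + VF' * step0 and
   IV1 + VF' * step1, where VF' (vfp) is the number of scalar iterations
   covered by the first nivs vectors.  */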
8365 if (ivn < nvects)
8367 unsigned vfp
8368 = least_common_multiple (group_size, const_nunits) / group_size;
8369 tree lupdate_mul
8370 = build_vector_from_val (step_vectype,
8371 SCALAR_FLOAT_TYPE_P (stept)
8372 ? build_real_from_wide (stept,
8373 vfp, UNSIGNED)
8374 : build_int_cstu (stept, vfp));
8375 for (; ivn < nvects; ++ivn)
8377 gimple *iv = SLP_TREE_VEC_STMTS (slp_node)[ivn - nivs];
8378 tree def = gimple_get_lhs (iv);
8379 if (ivn < 2*nivs)
8380 vec_steps[ivn - nivs]
8381 = gimple_build (&init_stmts, MULT_EXPR, step_vectype,
8382 vec_steps[ivn - nivs], lupdate_mul);
8383 gimple_seq stmts = NULL;
8384 def = gimple_convert (&stmts, step_vectype, def);
8385 def = gimple_build (&stmts, PLUS_EXPR, step_vectype,
8386 def, vec_steps[ivn % nivs]);
8387 def = gimple_convert (&stmts, vectype, def);
8388 if (gimple_code (iv) == GIMPLE_PHI)
8389 gsi_insert_seq_before (&si, stmts, GSI_SAME_STMT);
8390 else
8392 gimple_stmt_iterator tgsi = gsi_for_stmt (iv);
8393 gsi_insert_seq_after (&tgsi, stmts, GSI_CONTINUE_LINKING);
8395 SLP_TREE_VEC_STMTS (slp_node)
8396 .quick_push (SSA_NAME_DEF_STMT (def));
8400 new_bb = gsi_insert_seq_on_edge_immediate (pe, init_stmts);
8401 gcc_assert (!new_bb);
8403 return true;
8406 init_expr = vect_phi_initial_value (phi);
8408 gimple_seq stmts = NULL;
8409 if (!nested_in_vect_loop)
8411 /* Convert the initial value to the IV update type. */
8412 tree new_type = TREE_TYPE (step_expr);
8413 init_expr = gimple_convert (&stmts, new_type, init_expr);
8415 /* If we are using the loop mask to "peel" for alignment then we need
8416 to adjust the start value here. */
8417 tree skip_niters = LOOP_VINFO_MASK_SKIP_NITERS (loop_vinfo);
8418 if (skip_niters != NULL_TREE)
8420 if (FLOAT_TYPE_P (vectype))
8421 skip_niters = gimple_build (&stmts, FLOAT_EXPR, new_type,
8422 skip_niters);
8423 else
8424 skip_niters = gimple_convert (&stmts, new_type, skip_niters);
8425 tree skip_step = gimple_build (&stmts, MULT_EXPR, new_type,
8426 skip_niters, step_expr);
8427 init_expr = gimple_build (&stmts, MINUS_EXPR, new_type,
8428 init_expr, skip_step);
8432 if (stmts)
8434 new_bb = gsi_insert_seq_on_edge_immediate (pe, stmts);
8435 gcc_assert (!new_bb);
8438 /* Create the vector that holds the initial_value of the induction. */
8439 if (nested_in_vect_loop)
8441 /* iv_loop is nested in the loop to be vectorized. init_expr has already
8442 been created during vectorization of previous stmts. We obtain it
8443 from the STMT_VINFO_VEC_STMT of the defining stmt. */
8444 auto_vec<tree> vec_inits;
8445 vect_get_vec_defs_for_operand (loop_vinfo, stmt_info, 1,
8446 init_expr, &vec_inits);
8447 vec_init = vec_inits[0];
8448 /* If the initial value is not of proper type, convert it. */
8449 if (!useless_type_conversion_p (vectype, TREE_TYPE (vec_init)))
8451 new_stmt
8452 = gimple_build_assign (vect_get_new_ssa_name (vectype,
8453 vect_simple_var,
8454 "vec_iv_"),
8455 VIEW_CONVERT_EXPR,
8456 build1 (VIEW_CONVERT_EXPR, vectype,
8457 vec_init));
8458 vec_init = gimple_assign_lhs (new_stmt);
8459 new_bb = gsi_insert_on_edge_immediate (loop_preheader_edge (iv_loop),
8460 new_stmt);
8461 gcc_assert (!new_bb);
8464 else
8466 /* iv_loop is the loop to be vectorized. Create:
8467 vec_init = [X, X+S, X+2*S, X+3*S] (S = step_expr, X = init_expr) */
8468 stmts = NULL;
8469 new_name = gimple_convert (&stmts, TREE_TYPE (step_expr), init_expr);
8471 unsigned HOST_WIDE_INT const_nunits;
8472 if (nunits.is_constant (&const_nunits))
8474 tree_vector_builder elts (step_vectype, const_nunits, 1);
8475 elts.quick_push (new_name);
8476 for (i = 1; i < const_nunits; i++)
8478 /* Create: new_name_i = new_name + step_expr */
8479 new_name = gimple_build (&stmts, PLUS_EXPR, TREE_TYPE (new_name),
8480 new_name, step_expr);
8481 elts.quick_push (new_name);
8483 /* Create a vector from [new_name_0, new_name_1, ...,
8484 new_name_nunits-1] */
8485 vec_init = gimple_build_vector (&stmts, &elts);
8487 else if (INTEGRAL_TYPE_P (TREE_TYPE (step_expr)))
8488 /* Build the initial value directly from a VEC_SERIES_EXPR. */
8489 vec_init = gimple_build (&stmts, VEC_SERIES_EXPR, step_vectype,
8490 new_name, step_expr);
8491 else
8493 /* Build:
8494 [base, base, base, ...]
8495 + (vectype) [0, 1, 2, ...] * [step, step, step, ...]. */
8496 gcc_assert (SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr)));
8497 gcc_assert (flag_associative_math);
8498 tree index = build_index_vector (step_vectype, 0, 1);
8499 tree base_vec = gimple_build_vector_from_val (&stmts, step_vectype,
8500 new_name);
8501 tree step_vec = gimple_build_vector_from_val (&stmts, step_vectype,
8502 step_expr);
8503 vec_init = gimple_build (&stmts, FLOAT_EXPR, step_vectype, index);
8504 vec_init = gimple_build (&stmts, MULT_EXPR, step_vectype,
8505 vec_init, step_vec);
8506 vec_init = gimple_build (&stmts, PLUS_EXPR, step_vectype,
8507 vec_init, base_vec);
8509 vec_init = gimple_convert (&stmts, vectype, vec_init);
8511 if (stmts)
8513 new_bb = gsi_insert_seq_on_edge_immediate (pe, stmts);
8514 gcc_assert (!new_bb);
8519 /* Create the vector that holds the step of the induction. */
8520 if (nested_in_vect_loop)
8521 /* iv_loop is nested in the loop to be vectorized. Generate:
8522 vec_step = [S, S, S, S] */
8523 new_name = step_expr;
8524 else
8526 /* iv_loop is the loop to be vectorized. Generate:
8527 vec_step = [VF*S, VF*S, VF*S, VF*S] */
8528 gimple_seq seq = NULL;
8529 if (SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr)))
8531 expr = build_int_cst (integer_type_node, vf);
8532 expr = gimple_build (&seq, FLOAT_EXPR, TREE_TYPE (step_expr), expr);
8534 else
8535 expr = build_int_cst (TREE_TYPE (step_expr), vf);
8536 new_name = gimple_build (&seq, MULT_EXPR, TREE_TYPE (step_expr),
8537 expr, step_expr);
8538 if (seq)
8540 new_bb = gsi_insert_seq_on_edge_immediate (pe, seq);
8541 gcc_assert (!new_bb);
8545 t = unshare_expr (new_name);
8546 gcc_assert (CONSTANT_CLASS_P (new_name)
8547 || TREE_CODE (new_name) == SSA_NAME);
8548 new_vec = build_vector_from_val (step_vectype, t);
8549 vec_step = vect_init_vector (loop_vinfo, stmt_info,
8550 new_vec, step_vectype, NULL);
8553 /* Create the following def-use cycle:
8554 loop prolog:
8555 vec_init = ...
8556 vec_step = ...
8557 loop:
8558 vec_iv = PHI <vec_init, vec_loop>
8560 STMT
8562 vec_loop = vec_iv + vec_step; */
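/* For a single scalar IV "i = init; ... i += S" and VF == nunits == 4 this
   is roughly (an illustrative sketch, not literal output):
     loop prolog:
       vec_init = { init, init+S, init+2*S, init+3*S }
       vec_step = { 4*S, 4*S, 4*S, 4*S }
     loop:
       vec_iv = PHI <vec_init (preheader), vec_loop (latch)>
       ...
       vec_loop = vec_iv + vec_step;  */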
8564 /* Create the induction-phi that defines the induction-operand. */
8565 vec_dest = vect_get_new_vect_var (vectype, vect_simple_var, "vec_iv_");
8566 induction_phi = create_phi_node (vec_dest, iv_loop->header);
8567 induc_def = PHI_RESULT (induction_phi);
8569 /* Create the iv update inside the loop */
8570 stmts = NULL;
8571 vec_def = gimple_convert (&stmts, step_vectype, induc_def);
8572 vec_def = gimple_build (&stmts, PLUS_EXPR, step_vectype, vec_def, vec_step);
8573 vec_def = gimple_convert (&stmts, vectype, vec_def);
8574 gsi_insert_seq_before (&si, stmts, GSI_SAME_STMT);
8575 new_stmt = SSA_NAME_DEF_STMT (vec_def);
8577 /* Set the arguments of the phi node: */
8578 add_phi_arg (induction_phi, vec_init, pe, UNKNOWN_LOCATION);
8579 add_phi_arg (induction_phi, vec_def, loop_latch_edge (iv_loop),
8580 UNKNOWN_LOCATION);
8582 STMT_VINFO_VEC_STMTS (stmt_info).safe_push (induction_phi);
8583 *vec_stmt = induction_phi;
8585 /* In case the vectorization factor (VF) is bigger than the number
8586 of elements that we can fit in a vectype (nunits), we have to generate
8587 more than one vector stmt - i.e. we need to "unroll" the
8588 vector stmt by a factor of VF/nunits. For more details see the
8589 documentation in vectorizable_operation. */
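/* For example (illustrative numbers): with VF == 8 and nunits == 4 we get
   ncopies == 2, and the second copy is generated below as
     vec_1 = vec_iv + { 4*S, 4*S, 4*S, 4*S }
   i.e. the first copy advanced by nunits scalar steps.  */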
8591 if (ncopies > 1)
8593 gimple_seq seq = NULL;
8594 /* FORNOW. This restriction should be relaxed. */
8595 gcc_assert (!nested_in_vect_loop);
8597 /* Create the vector that holds the step of the induction. */
8598 if (SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr)))
8600 expr = build_int_cst (integer_type_node, nunits);
8601 expr = gimple_build (&seq, FLOAT_EXPR, TREE_TYPE (step_expr), expr);
8603 else
8604 expr = build_int_cst (TREE_TYPE (step_expr), nunits);
8605 new_name = gimple_build (&seq, MULT_EXPR, TREE_TYPE (step_expr),
8606 expr, step_expr);
8607 if (seq)
8609 new_bb = gsi_insert_seq_on_edge_immediate (pe, seq);
8610 gcc_assert (!new_bb);
8613 t = unshare_expr (new_name);
8614 gcc_assert (CONSTANT_CLASS_P (new_name)
8615 || TREE_CODE (new_name) == SSA_NAME);
8616 new_vec = build_vector_from_val (step_vectype, t);
8617 vec_step = vect_init_vector (loop_vinfo, stmt_info,
8618 new_vec, step_vectype, NULL);
8620 vec_def = induc_def;
8621 for (i = 1; i < ncopies; i++)
8623 /* vec_i = vec_prev + vec_step */
8624 gimple_seq stmts = NULL;
8625 vec_def = gimple_convert (&stmts, step_vectype, vec_def);
8626 vec_def = gimple_build (&stmts,
8627 PLUS_EXPR, step_vectype, vec_def, vec_step);
8628 vec_def = gimple_convert (&stmts, vectype, vec_def);
8630 gsi_insert_seq_before (&si, stmts, GSI_SAME_STMT);
8631 new_stmt = SSA_NAME_DEF_STMT (vec_def);
8632 STMT_VINFO_VEC_STMTS (stmt_info).safe_push (new_stmt);
8636 if (dump_enabled_p ())
8637 dump_printf_loc (MSG_NOTE, vect_location,
8638 "transform induction: created def-use cycle: %G%G",
8639 induction_phi, SSA_NAME_DEF_STMT (vec_def));
8641 return true;
8644 /* Function vectorizable_live_operation.
8646 STMT_INFO computes a value that is used outside the loop. Check if
8647 it can be supported. */
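/* A typical case (illustrative source, not taken from this file):

     int last = 0;
     for (int i = 0; i < n; ++i)
       {
         last = a[i] + 1;
         b[i] = last;
       }
     return last;

   The final value of "last" is live after the loop and has to be extracted
   from the last lane of the last vector computed for it.  */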
8649 bool
8650 vectorizable_live_operation (vec_info *vinfo,
8651 stmt_vec_info stmt_info,
8652 gimple_stmt_iterator *gsi,
8653 slp_tree slp_node, slp_instance slp_node_instance,
8654 int slp_index, bool vec_stmt_p,
8655 stmt_vector_for_cost *cost_vec)
8657 loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo);
8658 imm_use_iterator imm_iter;
8659 tree lhs, lhs_type, bitsize;
8660 tree vectype = (slp_node
8661 ? SLP_TREE_VECTYPE (slp_node)
8662 : STMT_VINFO_VECTYPE (stmt_info));
8663 poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (vectype);
8664 int ncopies;
8665 gimple *use_stmt;
8666 auto_vec<tree> vec_oprnds;
8667 int vec_entry = 0;
8668 poly_uint64 vec_index = 0;
8670 gcc_assert (STMT_VINFO_LIVE_P (stmt_info));
8672 /* If a stmt of a reduction is live, vectorize it via
8673 vect_create_epilog_for_reduction. vectorizable_reduction assessed
8674 validity so just trigger the transform here. */
8675 if (STMT_VINFO_REDUC_DEF (vect_orig_stmt (stmt_info)))
8677 if (!vec_stmt_p)
8678 return true;
8679 if (slp_node)
8681 /* For reduction chains the meta-info is attached to
8682 the group leader. */
8683 if (REDUC_GROUP_FIRST_ELEMENT (stmt_info))
8684 stmt_info = REDUC_GROUP_FIRST_ELEMENT (stmt_info);
8685 /* For SLP reductions we vectorize the epilogue for
8686 all involved stmts together. */
8687 else if (slp_index != 0)
8688 return true;
8689 else
8690 /* For SLP reductions the meta-info is attached to
8691 the representative. */
8692 stmt_info = SLP_TREE_REPRESENTATIVE (slp_node);
8694 stmt_vec_info reduc_info = info_for_reduction (loop_vinfo, stmt_info);
8695 gcc_assert (reduc_info->is_reduc_info);
8696 if (STMT_VINFO_REDUC_TYPE (reduc_info) == FOLD_LEFT_REDUCTION
8697 || STMT_VINFO_REDUC_TYPE (reduc_info) == EXTRACT_LAST_REDUCTION)
8698 return true;
8699 vect_create_epilog_for_reduction (loop_vinfo, stmt_info, slp_node,
8700 slp_node_instance);
8701 return true;
8704 /* If STMT is not relevant and it is a simple assignment and its inputs are
8705 invariant then it can remain in place, unvectorized. The original last
8706 scalar value that it computes will be used. */
8707 if (!STMT_VINFO_RELEVANT_P (stmt_info))
8709 gcc_assert (is_simple_and_all_uses_invariant (stmt_info, loop_vinfo));
8710 if (dump_enabled_p ())
8711 dump_printf_loc (MSG_NOTE, vect_location,
8712 "statement is simple and its uses are invariant. Leaving in "
8713 "place.\n");
8714 return true;
8717 if (slp_node)
8718 ncopies = 1;
8719 else
8720 ncopies = vect_get_num_copies (loop_vinfo, vectype);
8722 if (slp_node)
8724 gcc_assert (slp_index >= 0);
8726 /* Get the last occurrence of the scalar index from the concatenation of
8727 all the slp vectors. Calculate which slp vector it is and the index
8728 within. */
8729 int num_scalar = SLP_TREE_LANES (slp_node);
8730 int num_vec = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node);
8731 poly_uint64 pos = (num_vec * nunits) - num_scalar + slp_index;
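/* Worked example (illustrative numbers): with num_scalar == 3, num_vec == 2,
   nunits == 4 and slp_index == 1, pos == 2 * 4 - 3 + 1 == 6, which gives
   vec_entry == 1 and vec_index == 2, i.e. lane 2 of the second vector.  */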
8733 /* Calculate which vector contains the result, and which lane of
8734 that vector we need. */
8735 if (!can_div_trunc_p (pos, nunits, &vec_entry, &vec_index))
8737 if (dump_enabled_p ())
8738 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
8739 "Cannot determine which vector holds the"
8740 " final result.\n");
8741 return false;
8745 if (!vec_stmt_p)
8747 /* No transformation required. */
8748 if (loop_vinfo && LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo))
8750 if (!direct_internal_fn_supported_p (IFN_EXTRACT_LAST, vectype,
8751 OPTIMIZE_FOR_SPEED))
8753 if (dump_enabled_p ())
8754 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
8755 "can't operate on partial vectors "
8756 "because the target doesn't support extract "
8757 "last reduction.\n");
8758 LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo) = false;
8760 else if (slp_node)
8762 if (dump_enabled_p ())
8763 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
8764 "can't operate on partial vectors "
8765 "because an SLP statement is live after "
8766 "the loop.\n");
8767 LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo) = false;
8769 else if (ncopies > 1)
8771 if (dump_enabled_p ())
8772 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
8773 "can't operate on partial vectors "
8774 "because ncopies is greater than 1.\n");
8775 LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo) = false;
8777 else
8779 gcc_assert (ncopies == 1 && !slp_node);
8780 vect_record_loop_mask (loop_vinfo,
8781 &LOOP_VINFO_MASKS (loop_vinfo),
8782 1, vectype, NULL);
8785 /* ??? Enable for loop costing as well. */
8786 if (!loop_vinfo)
8787 record_stmt_cost (cost_vec, 1, vec_to_scalar, stmt_info, NULL_TREE,
8788 0, vect_epilogue);
8789 return true;
8792 /* Use the lhs of the original scalar statement. */
8793 gimple *stmt = vect_orig_stmt (stmt_info)->stmt;
8794 if (dump_enabled_p ())
8795 dump_printf_loc (MSG_NOTE, vect_location, "extracting lane for live "
8796 "stmt %G", stmt);
8798 lhs = gimple_get_lhs (stmt);
8799 lhs_type = TREE_TYPE (lhs);
8801 bitsize = vector_element_bits_tree (vectype);
8803 /* Get the vectorized lhs of STMT and the lane to use (counted in bits). */
8804 tree vec_lhs, bitstart;
8805 gimple *vec_stmt;
8806 if (slp_node)
8808 gcc_assert (!loop_vinfo || !LOOP_VINFO_FULLY_MASKED_P (loop_vinfo));
8810 /* Get the correct slp vectorized stmt. */
8811 vec_stmt = SLP_TREE_VEC_STMTS (slp_node)[vec_entry];
8812 vec_lhs = gimple_get_lhs (vec_stmt);
8814 /* Get entry to use. */
8815 bitstart = bitsize_int (vec_index);
8816 bitstart = int_const_binop (MULT_EXPR, bitsize, bitstart);
8818 else
8820 /* For multiple copies, get the last copy. */
8821 vec_stmt = STMT_VINFO_VEC_STMTS (stmt_info).last ();
8822 vec_lhs = gimple_get_lhs (vec_stmt);
8824 /* Get the last lane in the vector. */
8825 bitstart = int_const_binop (MULT_EXPR, bitsize, bitsize_int (nunits - 1));
8828 if (loop_vinfo)
8830 /* To ensure the VEC_LHS for the lane-extraction stmts satisfies the
8831 loop-closed-PHI requirement, insert one PHI node for it. It looks like:
8832 loop;
8834 # lhs' = PHI <lhs>
8836 loop;
8838 # vec_lhs' = PHI <vec_lhs>
8839 new_tree = lane_extract <vec_lhs', ...>;
8840 lhs' = new_tree; */
8842 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
8843 basic_block exit_bb = single_exit (loop)->dest;
8844 gcc_assert (single_pred_p (exit_bb));
8846 tree vec_lhs_phi = copy_ssa_name (vec_lhs);
8847 gimple *phi = create_phi_node (vec_lhs_phi, exit_bb);
8848 SET_PHI_ARG_DEF (phi, single_exit (loop)->dest_idx, vec_lhs);
8850 gimple_seq stmts = NULL;
8851 tree new_tree;
8852 if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
8854 /* Emit:
8856 SCALAR_RES = EXTRACT_LAST <VEC_LHS, MASK>
8858 where VEC_LHS is the vectorized live-out result and MASK is
8859 the loop mask for the final iteration. */
8860 gcc_assert (ncopies == 1 && !slp_node);
8861 tree scalar_type = TREE_TYPE (STMT_VINFO_VECTYPE (stmt_info));
8862 tree mask = vect_get_loop_mask (gsi, &LOOP_VINFO_MASKS (loop_vinfo),
8863 1, vectype, 0);
8864 tree scalar_res = gimple_build (&stmts, CFN_EXTRACT_LAST, scalar_type,
8865 mask, vec_lhs_phi);
8867 /* Convert the extracted vector element to the scalar type. */
8868 new_tree = gimple_convert (&stmts, lhs_type, scalar_res);
8870 else
8872 tree bftype = TREE_TYPE (vectype);
8873 if (VECTOR_BOOLEAN_TYPE_P (vectype))
8874 bftype = build_nonstandard_integer_type (tree_to_uhwi (bitsize), 1);
8875 new_tree = build3 (BIT_FIELD_REF, bftype,
8876 vec_lhs_phi, bitsize, bitstart);
8877 new_tree = force_gimple_operand (fold_convert (lhs_type, new_tree),
8878 &stmts, true, NULL_TREE);
8881 if (stmts)
8883 gimple_stmt_iterator exit_gsi = gsi_after_labels (exit_bb);
8884 gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
8886 /* Remove existing phi from lhs and create one copy from new_tree. */
8887 tree lhs_phi = NULL_TREE;
8888 gimple_stmt_iterator gsi;
8889 for (gsi = gsi_start_phis (exit_bb);
8890 !gsi_end_p (gsi); gsi_next (&gsi))
8892 gimple *phi = gsi_stmt (gsi);
8893 if ((gimple_phi_arg_def (phi, 0) == lhs))
8895 remove_phi_node (&gsi, false);
8896 lhs_phi = gimple_phi_result (phi);
8897 gimple *copy = gimple_build_assign (lhs_phi, new_tree);
8898 gsi_insert_before (&exit_gsi, copy, GSI_SAME_STMT);
8899 break;
8904 /* Replace uses of lhs with the newly computed result. If the use stmt is a
8905 single-arg PHI, just replace all uses of the PHI result. This is necessary
8906 because the lcssa PHI defining lhs may come before the newly inserted stmt. */
8907 use_operand_p use_p;
8908 FOR_EACH_IMM_USE_STMT (use_stmt, imm_iter, lhs)
8909 if (!flow_bb_inside_loop_p (loop, gimple_bb (use_stmt))
8910 && !is_gimple_debug (use_stmt))
8912 if (gimple_code (use_stmt) == GIMPLE_PHI
8913 && gimple_phi_num_args (use_stmt) == 1)
8915 replace_uses_by (gimple_phi_result (use_stmt), new_tree);
8917 else
8919 FOR_EACH_IMM_USE_ON_STMT (use_p, imm_iter)
8920 SET_USE (use_p, new_tree);
8922 update_stmt (use_stmt);
8925 else
8927 /* For basic-block vectorization simply insert the lane-extraction. */
8928 tree bftype = TREE_TYPE (vectype);
8929 if (VECTOR_BOOLEAN_TYPE_P (vectype))
8930 bftype = build_nonstandard_integer_type (tree_to_uhwi (bitsize), 1);
8931 tree new_tree = build3 (BIT_FIELD_REF, bftype,
8932 vec_lhs, bitsize, bitstart);
8933 gimple_seq stmts = NULL;
8934 new_tree = force_gimple_operand (fold_convert (lhs_type, new_tree),
8935 &stmts, true, NULL_TREE);
8936 if (TREE_CODE (new_tree) == SSA_NAME
8937 && SSA_NAME_OCCURS_IN_ABNORMAL_PHI (lhs))
8938 SSA_NAME_OCCURS_IN_ABNORMAL_PHI (new_tree) = 1;
8939 if (is_a <gphi *> (vec_stmt))
8941 gimple_stmt_iterator si = gsi_after_labels (gimple_bb (vec_stmt));
8942 gsi_insert_seq_before (&si, stmts, GSI_SAME_STMT);
8944 else
8946 gimple_stmt_iterator si = gsi_for_stmt (vec_stmt);
8947 gsi_insert_seq_after (&si, stmts, GSI_SAME_STMT);
8950 /* Replace uses of lhs with the newly computed result. If the use stmt is a
8951 single-arg PHI, just replace all uses of the PHI result. This is necessary
8952 because the lcssa PHI defining lhs may come before the newly inserted stmt. */
8953 use_operand_p use_p;
8954 stmt_vec_info use_stmt_info;
8955 FOR_EACH_IMM_USE_STMT (use_stmt, imm_iter, lhs)
8956 if (!is_gimple_debug (use_stmt)
8957 && (!(use_stmt_info = vinfo->lookup_stmt (use_stmt))
8958 || !PURE_SLP_STMT (vect_stmt_to_vectorize (use_stmt_info))))
8960 /* ??? This can happen when the live lane ends up being
8961 used in a vector construction code-generated by an
8962 external SLP node (and code-generation for that already
8963 happened). See gcc.dg/vect/bb-slp-47.c.
8964 Doing this is what would happen if that vector CTOR
8965 were not code-generated yet so it is not too bad.
8966 ??? In fact we'd likely want to avoid this situation
8967 in the first place. */
8968 if (TREE_CODE (new_tree) == SSA_NAME
8969 && !SSA_NAME_IS_DEFAULT_DEF (new_tree)
8970 && gimple_code (use_stmt) != GIMPLE_PHI
8971 && !vect_stmt_dominates_stmt_p (SSA_NAME_DEF_STMT (new_tree),
8972 use_stmt))
8974 enum tree_code code = gimple_assign_rhs_code (use_stmt);
8975 gcc_assert (code == CONSTRUCTOR
8976 || code == VIEW_CONVERT_EXPR
8977 || CONVERT_EXPR_CODE_P (code));
8978 if (dump_enabled_p ())
8979 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
8980 "Using original scalar computation for "
8981 "live lane because use precedes vector "
8982 "def\n");
8983 continue;
8985 /* ??? It can also happen that we end up pulling a def into
8986 a loop where replacing out-of-loop uses would require
8987 a new LC SSA PHI node. Retain the original scalar in
8988 those cases as well. PR98064. */
8989 if (TREE_CODE (new_tree) == SSA_NAME
8990 && !SSA_NAME_IS_DEFAULT_DEF (new_tree)
8991 && (gimple_bb (use_stmt)->loop_father
8992 != gimple_bb (vec_stmt)->loop_father)
8993 && !flow_loop_nested_p (gimple_bb (vec_stmt)->loop_father,
8994 gimple_bb (use_stmt)->loop_father))
8996 if (dump_enabled_p ())
8997 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
8998 "Using original scalar computation for "
8999 "live lane because there is an out-of-loop "
9000 "definition for it\n");
9001 continue;
9003 FOR_EACH_IMM_USE_ON_STMT (use_p, imm_iter)
9004 SET_USE (use_p, new_tree);
9005 update_stmt (use_stmt);
9009 return true;
9012 /* Kill any debug uses outside LOOP of SSA names defined in STMT_INFO. */
9014 static void
9015 vect_loop_kill_debug_uses (class loop *loop, stmt_vec_info stmt_info)
9017 ssa_op_iter op_iter;
9018 imm_use_iterator imm_iter;
9019 def_operand_p def_p;
9020 gimple *ustmt;
9022 FOR_EACH_PHI_OR_STMT_DEF (def_p, stmt_info->stmt, op_iter, SSA_OP_DEF)
9024 FOR_EACH_IMM_USE_STMT (ustmt, imm_iter, DEF_FROM_PTR (def_p))
9026 basic_block bb;
9028 if (!is_gimple_debug (ustmt))
9029 continue;
9031 bb = gimple_bb (ustmt);
9033 if (!flow_bb_inside_loop_p (loop, bb))
9035 if (gimple_debug_bind_p (ustmt))
9037 if (dump_enabled_p ())
9038 dump_printf_loc (MSG_NOTE, vect_location,
9039 "killing debug use\n");
9041 gimple_debug_bind_reset_value (ustmt);
9042 update_stmt (ustmt);
9044 else
9045 gcc_unreachable ();
9051 /* Given the loop represented by LOOP_VINFO, return true if computation of
9052 LOOP_VINFO_NITERS (= LOOP_VINFO_NITERSM1 + 1) doesn't overflow, false
9053 otherwise. */
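/* For instance, if NITERS has a 32-bit unsigned type and the latch can run
   0xffffffff times (NITERSM1 == 0xffffffff), then NITERSM1 + 1 wraps to zero
   and we must return false; if the known upper bound on the latch count is
   smaller than the type's maximum value, no wrap can occur.  */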
9055 static bool
9056 loop_niters_no_overflow (loop_vec_info loop_vinfo)
9058 /* Constant case. */
9059 if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
9061 tree cst_niters = LOOP_VINFO_NITERS (loop_vinfo);
9062 tree cst_nitersm1 = LOOP_VINFO_NITERSM1 (loop_vinfo);
9064 gcc_assert (TREE_CODE (cst_niters) == INTEGER_CST);
9065 gcc_assert (TREE_CODE (cst_nitersm1) == INTEGER_CST);
9066 if (wi::to_widest (cst_nitersm1) < wi::to_widest (cst_niters))
9067 return true;
9070 widest_int max;
9071 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
9072 /* Check the upper bound of loop niters. */
9073 if (get_max_loop_iterations (loop, &max))
9075 tree type = TREE_TYPE (LOOP_VINFO_NITERS (loop_vinfo));
9076 signop sgn = TYPE_SIGN (type);
9077 widest_int type_max = widest_int::from (wi::max_value (type), sgn);
9078 if (max < type_max)
9079 return true;
9081 return false;
9084 /* Return a mask type with half the number of elements as OLD_TYPE,
9085 given that it should have mode NEW_MODE. */
9087 tree
9088 vect_halve_mask_nunits (tree old_type, machine_mode new_mode)
9090 poly_uint64 nunits = exact_div (TYPE_VECTOR_SUBPARTS (old_type), 2);
9091 return build_truth_vector_type_for_mode (nunits, new_mode);
9094 /* Return a mask type with twice as many elements as OLD_TYPE,
9095 given that it should have mode NEW_MODE. */
9097 tree
9098 vect_double_mask_nunits (tree old_type, machine_mode new_mode)
9100 poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (old_type) * 2;
9101 return build_truth_vector_type_for_mode (nunits, new_mode);
9104 /* Record that a fully-masked version of LOOP_VINFO would need MASKS to
9105 contain a sequence of NVECTORS masks that each control a vector of type
9106 VECTYPE. If SCALAR_MASK is nonnull, the fully-masked loop would AND
9107 these vector masks with the vector version of SCALAR_MASK. */
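/* A typical caller records the requirement during analysis and fetches the
   actual mask during the transform, along the lines of (a sketch only, with
   ncopies, scalar_mask and i standing for the caller's own values):

     vect_record_loop_mask (loop_vinfo, &LOOP_VINFO_MASKS (loop_vinfo),
                            ncopies, vectype, scalar_mask);
     ...
     tree mask = vect_get_loop_mask (gsi, &LOOP_VINFO_MASKS (loop_vinfo),
                                     ncopies, vectype, i);  */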
9109 void
9110 vect_record_loop_mask (loop_vec_info loop_vinfo, vec_loop_masks *masks,
9111 unsigned int nvectors, tree vectype, tree scalar_mask)
9113 gcc_assert (nvectors != 0);
9114 if (masks->length () < nvectors)
9115 masks->safe_grow_cleared (nvectors, true);
9116 rgroup_controls *rgm = &(*masks)[nvectors - 1];
9117 /* The number of scalars per iteration and the number of vectors are
9118 both compile-time constants. */
9119 unsigned int nscalars_per_iter
9120 = exact_div (nvectors * TYPE_VECTOR_SUBPARTS (vectype),
9121 LOOP_VINFO_VECT_FACTOR (loop_vinfo)).to_constant ();
9123 if (scalar_mask)
9125 scalar_cond_masked_key cond (scalar_mask, nvectors);
9126 loop_vinfo->scalar_cond_masked_set.add (cond);
9129 if (rgm->max_nscalars_per_iter < nscalars_per_iter)
9131 rgm->max_nscalars_per_iter = nscalars_per_iter;
9132 rgm->type = truth_type_for (vectype);
9133 rgm->factor = 1;
9137 /* Given a complete set of masks MASKS, extract mask number INDEX
9138 for an rgroup that operates on NVECTORS vectors of type VECTYPE,
9139 where 0 <= INDEX < NVECTORS. Insert any set-up statements before GSI.
9141 See the comment above vec_loop_masks for more details about the mask
9142 arrangement. */
9144 tree
9145 vect_get_loop_mask (gimple_stmt_iterator *gsi, vec_loop_masks *masks,
9146 unsigned int nvectors, tree vectype, unsigned int index)
9148 rgroup_controls *rgm = &(*masks)[nvectors - 1];
9149 tree mask_type = rgm->type;
9151 /* Populate the rgroup's mask array, if this is the first time we've
9152 used it. */
9153 if (rgm->controls.is_empty ())
9155 rgm->controls.safe_grow_cleared (nvectors, true);
9156 for (unsigned int i = 0; i < nvectors; ++i)
9158 tree mask = make_temp_ssa_name (mask_type, NULL, "loop_mask");
9159 /* Provide a dummy definition until the real one is available. */
9160 SSA_NAME_DEF_STMT (mask) = gimple_build_nop ();
9161 rgm->controls[i] = mask;
9165 tree mask = rgm->controls[index];
9166 if (maybe_ne (TYPE_VECTOR_SUBPARTS (mask_type),
9167 TYPE_VECTOR_SUBPARTS (vectype)))
9169 /* A loop mask for data type X can be reused for data type Y
9170 if X has N times more elements than Y and if Y's elements
9171 are N times bigger than X's. In this case each sequence
9172 of N elements in the loop mask will be all-zero or all-one.
9173 We can then view-convert the mask so that each sequence of
9174 N elements is replaced by a single element. */
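/* Concretely (illustrative types): a mask recorded for 16 x QI data can be
   reused for 4 x SI data; each aligned group of four mask elements is known
   to be all-zero or all-one, so the VIEW_CONVERT_EXPR below collapses every
   such group into a single element of the SI mask type.  */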
9175 gcc_assert (multiple_p (TYPE_VECTOR_SUBPARTS (mask_type),
9176 TYPE_VECTOR_SUBPARTS (vectype)));
9177 gimple_seq seq = NULL;
9178 mask_type = truth_type_for (vectype);
9179 mask = gimple_build (&seq, VIEW_CONVERT_EXPR, mask_type, mask);
9180 if (seq)
9181 gsi_insert_seq_before (gsi, seq, GSI_SAME_STMT);
9183 return mask;
9186 /* Record that LOOP_VINFO would need LENS to contain a sequence of NVECTORS
9187 lengths for controlling an operation on VECTYPE. The operation splits
9188 each element of VECTYPE into FACTOR separate subelements, measuring the
9189 length as a number of these subelements. */
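/* For example (an illustrative case): a 4 x SI access that has to fall back
   to operating on 16 x QI would be recorded with FACTOR == 4, so its length
   is measured in bytes (QI subelements) rather than in SI elements.  */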
9191 void
9192 vect_record_loop_len (loop_vec_info loop_vinfo, vec_loop_lens *lens,
9193 unsigned int nvectors, tree vectype, unsigned int factor)
9195 gcc_assert (nvectors != 0);
9196 if (lens->length () < nvectors)
9197 lens->safe_grow_cleared (nvectors, true);
9198 rgroup_controls *rgl = &(*lens)[nvectors - 1];
9200 /* The number of scalars per iteration, the number of bytes each scalar
9201 occupies and the number of vectors are all compile-time constants. */
9202 unsigned int nscalars_per_iter
9203 = exact_div (nvectors * TYPE_VECTOR_SUBPARTS (vectype),
9204 LOOP_VINFO_VECT_FACTOR (loop_vinfo)).to_constant ();
9206 if (rgl->max_nscalars_per_iter < nscalars_per_iter)
9208 /* For now, we only support cases in which all loads and stores fall back
9209 to VnQI or none do. */
9210 gcc_assert (!rgl->max_nscalars_per_iter
9211 || (rgl->factor == 1 && factor == 1)
9212 || (rgl->max_nscalars_per_iter * rgl->factor
9213 == nscalars_per_iter * factor));
9214 rgl->max_nscalars_per_iter = nscalars_per_iter;
9215 rgl->type = vectype;
9216 rgl->factor = factor;
9220 /* Given a complete set of lengths LENS, extract length number INDEX for an
9221 rgroup that operates on NVECTORS vectors, where 0 <= INDEX < NVECTORS. */
9223 tree
9224 vect_get_loop_len (loop_vec_info loop_vinfo, vec_loop_lens *lens,
9225 unsigned int nvectors, unsigned int index)
9227 rgroup_controls *rgl = &(*lens)[nvectors - 1];
9229 /* Populate the rgroup's len array, if this is the first time we've
9230 used it. */
9231 if (rgl->controls.is_empty ())
9233 rgl->controls.safe_grow_cleared (nvectors, true);
9234 for (unsigned int i = 0; i < nvectors; ++i)
9236 tree len_type = LOOP_VINFO_RGROUP_COMPARE_TYPE (loop_vinfo);
9237 gcc_assert (len_type != NULL_TREE);
9238 tree len = make_temp_ssa_name (len_type, NULL, "loop_len");
9240 /* Provide a dummy definition until the real one is available. */
9241 SSA_NAME_DEF_STMT (len) = gimple_build_nop ();
9242 rgl->controls[i] = len;
9246 return rgl->controls[index];
9249 /* Scale profiling counters by estimation for LOOP which is vectorized
9250 by factor VF. */
9252 static void
9253 scale_profile_for_vect_loop (class loop *loop, unsigned vf)
9255 edge preheader = loop_preheader_edge (loop);
9256 /* Reduce loop iterations by the vectorization factor. */
9257 gcov_type new_est_niter = niter_for_unrolled_loop (loop, vf);
9258 profile_count freq_h = loop->header->count, freq_e = preheader->count ();
9260 if (freq_h.nonzero_p ())
9262 profile_probability p;
9264 /* Avoid dropping loop body profile counter to 0 because of zero count
9265 in loop's preheader. */
9266 if (!(freq_e == profile_count::zero ()))
9267 freq_e = freq_e.force_nonzero ();
9268 p = freq_e.apply_scale (new_est_niter + 1, 1).probability_in (freq_h);
9269 scale_loop_frequencies (loop, p);
9272 edge exit_e = single_exit (loop);
9273 exit_e->probability = profile_probability::always ()
9274 .apply_scale (1, new_est_niter + 1);
9276 edge exit_l = single_pred_edge (loop->latch);
9277 profile_probability prob = exit_l->probability;
9278 exit_l->probability = exit_e->probability.invert ();
9279 if (prob.initialized_p () && exit_l->probability.initialized_p ())
9280 scale_bbs_frequencies (&loop->latch, 1, exit_l->probability / prob);
9283 /* For a vectorized stmt DEF_STMT_INFO adjust all vectorized PHI
9284 latch edge values originally defined by it. */
9286 static void
9287 maybe_set_vectorized_backedge_value (loop_vec_info loop_vinfo,
9288 stmt_vec_info def_stmt_info)
9290 tree def = gimple_get_lhs (vect_orig_stmt (def_stmt_info)->stmt);
9291 if (!def || TREE_CODE (def) != SSA_NAME)
9292 return;
9293 stmt_vec_info phi_info;
9294 imm_use_iterator iter;
9295 use_operand_p use_p;
9296 FOR_EACH_IMM_USE_FAST (use_p, iter, def)
9297 if (gphi *phi = dyn_cast <gphi *> (USE_STMT (use_p)))
9298 if (gimple_bb (phi)->loop_father->header == gimple_bb (phi)
9299 && (phi_info = loop_vinfo->lookup_stmt (phi))
9300 && STMT_VINFO_RELEVANT_P (phi_info)
9301 && VECTORIZABLE_CYCLE_DEF (STMT_VINFO_DEF_TYPE (phi_info))
9302 && STMT_VINFO_REDUC_TYPE (phi_info) != FOLD_LEFT_REDUCTION
9303 && STMT_VINFO_REDUC_TYPE (phi_info) != EXTRACT_LAST_REDUCTION)
9305 loop_p loop = gimple_bb (phi)->loop_father;
9306 edge e = loop_latch_edge (loop);
9307 if (PHI_ARG_DEF_FROM_EDGE (phi, e) == def)
9309 vec<gimple *> &phi_defs = STMT_VINFO_VEC_STMTS (phi_info);
9310 vec<gimple *> &latch_defs = STMT_VINFO_VEC_STMTS (def_stmt_info);
9311 gcc_assert (phi_defs.length () == latch_defs.length ());
9312 for (unsigned i = 0; i < phi_defs.length (); ++i)
9313 add_phi_arg (as_a <gphi *> (phi_defs[i]),
9314 gimple_get_lhs (latch_defs[i]), e,
9315 gimple_phi_arg_location (phi, e->dest_idx));
9320 /* Vectorize STMT_INFO if relevant, inserting any new instructions before GSI.
9321 When vectorizing STMT_INFO as a store, set *SEEN_STORE to its
9322 stmt_vec_info. */
9324 static bool
9325 vect_transform_loop_stmt (loop_vec_info loop_vinfo, stmt_vec_info stmt_info,
9326 gimple_stmt_iterator *gsi, stmt_vec_info *seen_store)
9328 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
9329 poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
9331 if (dump_enabled_p ())
9332 dump_printf_loc (MSG_NOTE, vect_location,
9333 "------>vectorizing statement: %G", stmt_info->stmt);
9335 if (MAY_HAVE_DEBUG_BIND_STMTS && !STMT_VINFO_LIVE_P (stmt_info))
9336 vect_loop_kill_debug_uses (loop, stmt_info);
9338 if (!STMT_VINFO_RELEVANT_P (stmt_info)
9339 && !STMT_VINFO_LIVE_P (stmt_info))
9340 return false;
9342 if (STMT_VINFO_VECTYPE (stmt_info))
9344 poly_uint64 nunits
9345 = TYPE_VECTOR_SUBPARTS (STMT_VINFO_VECTYPE (stmt_info));
9346 if (!STMT_SLP_TYPE (stmt_info)
9347 && maybe_ne (nunits, vf)
9348 && dump_enabled_p ())
9349 /* For SLP, VF is set according to the unrolling factor, and not
9350 to the vector size, hence for SLP this print is not valid. */
9351 dump_printf_loc (MSG_NOTE, vect_location, "multiple-types.\n");
9354 /* Pure SLP statements have already been vectorized. We still need
9355 to apply loop vectorization to hybrid SLP statements. */
9356 if (PURE_SLP_STMT (stmt_info))
9357 return false;
9359 if (dump_enabled_p ())
9360 dump_printf_loc (MSG_NOTE, vect_location, "transform statement.\n");
9362 if (vect_transform_stmt (loop_vinfo, stmt_info, gsi, NULL, NULL))
9363 *seen_store = stmt_info;
9365 return true;
9368 /* Helper function to pass to simplify_replace_tree to enable replacing trees
9369 in the hash_map with their corresponding values. */
9371 static tree
9372 find_in_mapping (tree t, void *context)
9374 hash_map<tree,tree>* mapping = (hash_map<tree, tree>*) context;
9376 tree *value = mapping->get (t);
9377 return value ? *value : t;
9380 /* Update EPILOGUE's loop_vec_info. EPILOGUE was constructed as a copy of the
9381 original loop that has now been vectorized.
9383 The inits of the data_references need to be advanced with the number of
9384 iterations of the main loop. This has been computed in vect_do_peeling and
9385 is stored in parameter ADVANCE. We first restore the data_references'
9386 initial offsets with the values recorded in ORIG_DRS_INIT.
9388 Since the loop_vec_info of this EPILOGUE was constructed for the original
9389 loop, its stmt_vec_infos all point to the original statements. These need
9390 to be updated to point to their corresponding copies as well as the SSA_NAMES
9391 in their PATTERN_DEF_SEQs and RELATED_STMTs.
9393 The data_references' connections also need to be updated. Their
9394 corresponding dr_vec_infos need to be reconnected to the EPILOGUE's
9395 stmt_vec_infos, their statements need to point to their corresponding copies,
9396 if they are gather loads or scatter stores then their references need to be
9397 updated to point to their corresponding copies, and finally we set
9398 'base_misaligned' to false as we have already peeled for alignment in the
9399 prologue of the main loop. */
9401 static void
9402 update_epilogue_loop_vinfo (class loop *epilogue, tree advance)
9404 loop_vec_info epilogue_vinfo = loop_vec_info_for_loop (epilogue);
9405 auto_vec<gimple *> stmt_worklist;
9406 hash_map<tree,tree> mapping;
9407 gimple *orig_stmt, *new_stmt;
9408 gimple_stmt_iterator epilogue_gsi;
9409 gphi_iterator epilogue_phi_gsi;
9410 stmt_vec_info stmt_vinfo = NULL, related_vinfo;
9411 basic_block *epilogue_bbs = get_loop_body (epilogue);
9412 unsigned i;
9414 free (LOOP_VINFO_BBS (epilogue_vinfo));
9415 LOOP_VINFO_BBS (epilogue_vinfo) = epilogue_bbs;
9417 /* Advance data_reference's with the number of iterations of the previous
9418 loop and its prologue. */
9419 vect_update_inits_of_drs (epilogue_vinfo, advance, PLUS_EXPR);
9422 /* The EPILOGUE loop is a copy of the original loop so they share the same
9423 gimple UIDs. In this loop we update the loop_vec_info of the EPILOGUE to
9424 point to the copied statements. We also create a mapping of all LHS' in
9425 the original loop and all the LHS' in the EPILOGUE and create worklists to
9426 update the STMT_VINFO_PATTERN_DEF_SEQs and STMT_VINFO_RELATED_STMTs. */
9427 for (unsigned i = 0; i < epilogue->num_nodes; ++i)
9429 for (epilogue_phi_gsi = gsi_start_phis (epilogue_bbs[i]);
9430 !gsi_end_p (epilogue_phi_gsi); gsi_next (&epilogue_phi_gsi))
9432 new_stmt = epilogue_phi_gsi.phi ();
9434 gcc_assert (gimple_uid (new_stmt) > 0);
9435 stmt_vinfo
9436 = epilogue_vinfo->stmt_vec_infos[gimple_uid (new_stmt) - 1];
9438 orig_stmt = STMT_VINFO_STMT (stmt_vinfo);
9439 STMT_VINFO_STMT (stmt_vinfo) = new_stmt;
9441 mapping.put (gimple_phi_result (orig_stmt),
9442 gimple_phi_result (new_stmt));
9443 /* PHI nodes cannot have patterns or related statements. */
9444 gcc_assert (STMT_VINFO_PATTERN_DEF_SEQ (stmt_vinfo) == NULL
9445 && STMT_VINFO_RELATED_STMT (stmt_vinfo) == NULL);
9448 for (epilogue_gsi = gsi_start_bb (epilogue_bbs[i]);
9449 !gsi_end_p (epilogue_gsi); gsi_next (&epilogue_gsi))
9451 new_stmt = gsi_stmt (epilogue_gsi);
9452 if (is_gimple_debug (new_stmt))
9453 continue;
9455 gcc_assert (gimple_uid (new_stmt) > 0);
9456 stmt_vinfo
9457 = epilogue_vinfo->stmt_vec_infos[gimple_uid (new_stmt) - 1];
9459 orig_stmt = STMT_VINFO_STMT (stmt_vinfo);
9460 STMT_VINFO_STMT (stmt_vinfo) = new_stmt;
9462 if (tree old_lhs = gimple_get_lhs (orig_stmt))
9463 mapping.put (old_lhs, gimple_get_lhs (new_stmt));
9465 if (STMT_VINFO_PATTERN_DEF_SEQ (stmt_vinfo))
9467 gimple_seq seq = STMT_VINFO_PATTERN_DEF_SEQ (stmt_vinfo);
9468 for (gimple_stmt_iterator gsi = gsi_start (seq);
9469 !gsi_end_p (gsi); gsi_next (&gsi))
9470 stmt_worklist.safe_push (gsi_stmt (gsi));
9473 related_vinfo = STMT_VINFO_RELATED_STMT (stmt_vinfo);
9474 if (related_vinfo != NULL && related_vinfo != stmt_vinfo)
9476 gimple *stmt = STMT_VINFO_STMT (related_vinfo);
9477 stmt_worklist.safe_push (stmt);
9478 /* Set BB such that the assert in
9479 'get_initial_def_for_reduction' is able to determine that
9480 the BB of the related stmt is inside this loop. */
9481 gimple_set_bb (stmt,
9482 gimple_bb (new_stmt));
9483 related_vinfo = STMT_VINFO_RELATED_STMT (related_vinfo);
9484 gcc_assert (related_vinfo == NULL
9485 || related_vinfo == stmt_vinfo);
9490 /* The PATTERN_DEF_SEQs and RELATED_STMTs in the epilogue were constructed
9491 using the original main loop and thus need to be updated to refer to the
9492 cloned variables used in the epilogue. */
9493 for (unsigned i = 0; i < stmt_worklist.length (); ++i)
9495 gimple *stmt = stmt_worklist[i];
9496 tree *new_op;
9498 for (unsigned j = 1; j < gimple_num_ops (stmt); ++j)
9500 tree op = gimple_op (stmt, j);
9501 if ((new_op = mapping.get(op)))
9502 gimple_set_op (stmt, j, *new_op);
9503 else
9505 /* PR92429: The last argument of simplify_replace_tree disables
9506 folding when replacing arguments. This is required as
9507 otherwise you might end up with different statements than the
9508 ones analyzed in vect_loop_analyze, leading to different
9509 vectorization. */
9510 op = simplify_replace_tree (op, NULL_TREE, NULL_TREE,
9511 &find_in_mapping, &mapping, false);
9512 gimple_set_op (stmt, j, op);
9517 struct data_reference *dr;
9518 vec<data_reference_p> datarefs = LOOP_VINFO_DATAREFS (epilogue_vinfo);
9519 FOR_EACH_VEC_ELT (datarefs, i, dr)
9521 orig_stmt = DR_STMT (dr);
9522 gcc_assert (gimple_uid (orig_stmt) > 0);
9523 stmt_vinfo = epilogue_vinfo->stmt_vec_infos[gimple_uid (orig_stmt) - 1];
9524 /* Data references for gather loads and scatter stores do not use the
9525 updated offset we set using ADVANCE. Instead we have to make sure the
9526 reference in each data reference points to the corresponding copy of
9527 the original in the epilogue. */
9528 if (STMT_VINFO_MEMORY_ACCESS_TYPE (vect_stmt_to_vectorize (stmt_vinfo))
9529 == VMAT_GATHER_SCATTER)
9531 DR_REF (dr)
9532 = simplify_replace_tree (DR_REF (dr), NULL_TREE, NULL_TREE,
9533 &find_in_mapping, &mapping);
9534 DR_BASE_ADDRESS (dr)
9535 = simplify_replace_tree (DR_BASE_ADDRESS (dr), NULL_TREE, NULL_TREE,
9536 &find_in_mapping, &mapping);
9538 DR_STMT (dr) = STMT_VINFO_STMT (stmt_vinfo);
9539 stmt_vinfo->dr_aux.stmt = stmt_vinfo;
9540 /* The vector size of the epilogue is smaller than that of the main loop,
9541 so the alignment is either the same or lower. This means the dr will
9542 by definition be aligned. */
9543 STMT_VINFO_DR_INFO (stmt_vinfo)->base_misaligned = false;
9546 epilogue_vinfo->shared->datarefs_copy.release ();
9547 epilogue_vinfo->shared->save_datarefs ();
9550 /* Function vect_transform_loop.
9552 The analysis phase has determined that the loop is vectorizable.
9553 Vectorize the loop - create vectorized stmts to replace the scalar
9554 stmts in the loop, and update the loop exit condition.
9555 Returns the scalar epilogue loop if any. */
9557 class loop *
9558 vect_transform_loop (loop_vec_info loop_vinfo, gimple *loop_vectorized_call)
9560 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
9561 class loop *epilogue = NULL;
9562 basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
9563 int nbbs = loop->num_nodes;
9564 int i;
9565 tree niters_vector = NULL_TREE;
9566 tree step_vector = NULL_TREE;
9567 tree niters_vector_mult_vf = NULL_TREE;
9568 poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
9569 unsigned int lowest_vf = constant_lower_bound (vf);
9570 gimple *stmt;
9571 bool check_profitability = false;
9572 unsigned int th;
9574 DUMP_VECT_SCOPE ("vec_transform_loop");
9576 loop_vinfo->shared->check_datarefs ();
9578 /* Use the more conservative vectorization threshold. If the number
9579 of iterations is constant assume the cost check has been performed
9580 by our caller. If the threshold makes all loops profitable that
9581 run at least the (estimated) vectorization factor number of times
9582 checking is pointless, too. */
9583 th = LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo);
9584 if (vect_apply_runtime_profitability_check_p (loop_vinfo))
9586 if (dump_enabled_p ())
9587 dump_printf_loc (MSG_NOTE, vect_location,
9588 "Profitability threshold is %d loop iterations.\n",
9589 th);
9590 check_profitability = true;
9593 /* Make sure there exists a single-predecessor exit bb. Do this before
9594 versioning. */
9595 edge e = single_exit (loop);
9596 if (! single_pred_p (e->dest))
9598 split_loop_exit_edge (e, true);
9599 if (dump_enabled_p ())
9600 dump_printf (MSG_NOTE, "split exit edge\n");
9603 /* Version the loop first, if required, so the profitability check
9604 comes first. */
9606 if (LOOP_REQUIRES_VERSIONING (loop_vinfo))
9608 class loop *sloop
9609 = vect_loop_versioning (loop_vinfo, loop_vectorized_call);
9610 sloop->force_vectorize = false;
9611 check_profitability = false;
9614 /* Make sure there exists a single-predecessor exit bb also on the
9615 scalar loop copy. Do this after versioning but before peeling
9616 so CFG structure is fine for both scalar and if-converted loop
9617 to make slpeel_duplicate_current_defs_from_edges face matched
9618 loop closed PHI nodes on the exit. */
9619 if (LOOP_VINFO_SCALAR_LOOP (loop_vinfo))
9621 e = single_exit (LOOP_VINFO_SCALAR_LOOP (loop_vinfo));
9622 if (! single_pred_p (e->dest))
9624 split_loop_exit_edge (e, true);
9625 if (dump_enabled_p ())
9626 dump_printf (MSG_NOTE, "split exit edge of scalar loop\n");
9630 tree niters = vect_build_loop_niters (loop_vinfo);
9631 LOOP_VINFO_NITERS_UNCHANGED (loop_vinfo) = niters;
9632 tree nitersm1 = unshare_expr (LOOP_VINFO_NITERSM1 (loop_vinfo));
9633 bool niters_no_overflow = loop_niters_no_overflow (loop_vinfo);
9634 tree advance;
9635 drs_init_vec orig_drs_init;
9637 epilogue = vect_do_peeling (loop_vinfo, niters, nitersm1, &niters_vector,
9638 &step_vector, &niters_vector_mult_vf, th,
9639 check_profitability, niters_no_overflow,
9640 &advance);
9642 if (LOOP_VINFO_SCALAR_LOOP (loop_vinfo)
9643 && LOOP_VINFO_SCALAR_LOOP_SCALING (loop_vinfo).initialized_p ())
9644 scale_loop_frequencies (LOOP_VINFO_SCALAR_LOOP (loop_vinfo),
9645 LOOP_VINFO_SCALAR_LOOP_SCALING (loop_vinfo));
9647 if (niters_vector == NULL_TREE)
9649 if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
9650 && !LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo)
9651 && known_eq (lowest_vf, vf))
9653 niters_vector
9654 = build_int_cst (TREE_TYPE (LOOP_VINFO_NITERS (loop_vinfo)),
9655 LOOP_VINFO_INT_NITERS (loop_vinfo) / lowest_vf);
9656 step_vector = build_one_cst (TREE_TYPE (niters));
9658 else if (vect_use_loop_mask_for_alignment_p (loop_vinfo))
9659 vect_gen_vector_loop_niters (loop_vinfo, niters, &niters_vector,
9660 &step_vector, niters_no_overflow);
9661 else
9662 /* vect_do_peeling subtracted the number of peeled prologue
9663 iterations from LOOP_VINFO_NITERS. */
9664 vect_gen_vector_loop_niters (loop_vinfo, LOOP_VINFO_NITERS (loop_vinfo),
9665 &niters_vector, &step_vector,
9666 niters_no_overflow);
9669 /* 1) Make sure the loop header has exactly two entries
9670 2) Make sure we have a preheader basic block. */
9672 gcc_assert (EDGE_COUNT (loop->header->preds) == 2);
9674 split_edge (loop_preheader_edge (loop));
9676 if (vect_use_loop_mask_for_alignment_p (loop_vinfo))
9677 /* This will deal with any possible peeling. */
9678 vect_prepare_for_masked_peels (loop_vinfo);
9680 /* Schedule the SLP instances first, then handle loop vectorization
9681 below. */
9682 if (!loop_vinfo->slp_instances.is_empty ())
9684 DUMP_VECT_SCOPE ("scheduling SLP instances");
9685 vect_schedule_slp (loop_vinfo, LOOP_VINFO_SLP_INSTANCES (loop_vinfo));
9688 /* FORNOW: the vectorizer supports only loops whose body consists
9689 of one basic block (header + empty latch). When the vectorizer
9690 supports more involved loop forms, the order in which the BBs are
9691 traversed needs to be reconsidered. */
9693 for (i = 0; i < nbbs; i++)
9695 basic_block bb = bbs[i];
9696 stmt_vec_info stmt_info;
9698 for (gphi_iterator si = gsi_start_phis (bb); !gsi_end_p (si);
9699 gsi_next (&si))
9701 gphi *phi = si.phi ();
9702 if (dump_enabled_p ())
9703 dump_printf_loc (MSG_NOTE, vect_location,
9704 "------>vectorizing phi: %G", phi);
9705 stmt_info = loop_vinfo->lookup_stmt (phi);
9706 if (!stmt_info)
9707 continue;
9709 if (MAY_HAVE_DEBUG_BIND_STMTS && !STMT_VINFO_LIVE_P (stmt_info))
9710 vect_loop_kill_debug_uses (loop, stmt_info);
9712 if (!STMT_VINFO_RELEVANT_P (stmt_info)
9713 && !STMT_VINFO_LIVE_P (stmt_info))
9714 continue;
9716 if (STMT_VINFO_VECTYPE (stmt_info)
9717 && (maybe_ne
9718 (TYPE_VECTOR_SUBPARTS (STMT_VINFO_VECTYPE (stmt_info)), vf))
9719 && dump_enabled_p ())
9720 dump_printf_loc (MSG_NOTE, vect_location, "multiple-types.\n");
9722 if ((STMT_VINFO_DEF_TYPE (stmt_info) == vect_induction_def
9723 || STMT_VINFO_DEF_TYPE (stmt_info) == vect_reduction_def
9724 || STMT_VINFO_DEF_TYPE (stmt_info) == vect_double_reduction_def
9725 || STMT_VINFO_DEF_TYPE (stmt_info) == vect_nested_cycle
9726 || STMT_VINFO_DEF_TYPE (stmt_info) == vect_internal_def)
9727 && ! PURE_SLP_STMT (stmt_info))
9729 if (dump_enabled_p ())
9730 dump_printf_loc (MSG_NOTE, vect_location, "transform phi.\n");
9731 vect_transform_stmt (loop_vinfo, stmt_info, NULL, NULL, NULL);
9735 for (gphi_iterator si = gsi_start_phis (bb); !gsi_end_p (si);
9736 gsi_next (&si))
9738 gphi *phi = si.phi ();
9739 stmt_info = loop_vinfo->lookup_stmt (phi);
9740 if (!stmt_info)
9741 continue;
9743 if (!STMT_VINFO_RELEVANT_P (stmt_info)
9744 && !STMT_VINFO_LIVE_P (stmt_info))
9745 continue;
9747 if ((STMT_VINFO_DEF_TYPE (stmt_info) == vect_induction_def
9748 || STMT_VINFO_DEF_TYPE (stmt_info) == vect_reduction_def
9749 || STMT_VINFO_DEF_TYPE (stmt_info) == vect_double_reduction_def
9750 || STMT_VINFO_DEF_TYPE (stmt_info) == vect_nested_cycle
9751 || STMT_VINFO_DEF_TYPE (stmt_info) == vect_internal_def)
9752 && ! PURE_SLP_STMT (stmt_info))
9753 maybe_set_vectorized_backedge_value (loop_vinfo, stmt_info);
9756 for (gimple_stmt_iterator si = gsi_start_bb (bb);
9757 !gsi_end_p (si);)
9759 stmt = gsi_stmt (si);
9760 /* During vectorization remove existing clobber stmts. */
9761 if (gimple_clobber_p (stmt))
9763 unlink_stmt_vdef (stmt);
9764 gsi_remove (&si, true);
9765 release_defs (stmt);
9767 else
9769 /* Ignore vector stmts created in the outer loop. */
9770 stmt_info = loop_vinfo->lookup_stmt (stmt);
9772 /* vector stmts created in the outer-loop during vectorization of
9773 stmts in an inner-loop may not have a stmt_info, and do not
9774 need to be vectorized. */
9775 stmt_vec_info seen_store = NULL;
9776 if (stmt_info)
9778 if (STMT_VINFO_IN_PATTERN_P (stmt_info))
9780 gimple *def_seq = STMT_VINFO_PATTERN_DEF_SEQ (stmt_info);
9781 for (gimple_stmt_iterator subsi = gsi_start (def_seq);
9782 !gsi_end_p (subsi); gsi_next (&subsi))
9784 stmt_vec_info pat_stmt_info
9785 = loop_vinfo->lookup_stmt (gsi_stmt (subsi));
9786 vect_transform_loop_stmt (loop_vinfo, pat_stmt_info,
9787 &si, &seen_store);
9789 stmt_vec_info pat_stmt_info
9790 = STMT_VINFO_RELATED_STMT (stmt_info);
9791 if (vect_transform_loop_stmt (loop_vinfo, pat_stmt_info,
9792 &si, &seen_store))
9793 maybe_set_vectorized_backedge_value (loop_vinfo,
9794 pat_stmt_info);
9796 else
9798 if (vect_transform_loop_stmt (loop_vinfo, stmt_info, &si,
9799 &seen_store))
9800 maybe_set_vectorized_backedge_value (loop_vinfo,
9801 stmt_info);
9804 gsi_next (&si);
9805 if (seen_store)
9807 if (STMT_VINFO_GROUPED_ACCESS (seen_store))
9808 /* Interleaving. The vectorization of the
9809 interleaving chain was completed -
9810 free all the stores in the chain. */
9811 vect_remove_stores (loop_vinfo,
9812 DR_GROUP_FIRST_ELEMENT (seen_store));
9813 else
9814 /* Free the attached stmt_vec_info and remove the stmt. */
9815 loop_vinfo->remove_stmt (stmt_info);
9820 /* Stub out scalar statements that must not survive vectorization.
9821 Doing this here helps with grouped statements, or statements that
9822 are involved in patterns. */
9823 for (gimple_stmt_iterator gsi = gsi_start_bb (bb);
9824 !gsi_end_p (gsi); gsi_next (&gsi))
9826 gcall *call = dyn_cast <gcall *> (gsi_stmt (gsi));
9827 if (!call || !gimple_call_internal_p (call))
9828 continue;
9829 internal_fn ifn = gimple_call_internal_fn (call);
9830 if (ifn == IFN_MASK_LOAD)
9832 tree lhs = gimple_get_lhs (call);
9833 if (!VECTOR_TYPE_P (TREE_TYPE (lhs)))
9835 tree zero = build_zero_cst (TREE_TYPE (lhs));
9836 gimple *new_stmt = gimple_build_assign (lhs, zero);
9837 gsi_replace (&gsi, new_stmt, true);
9840 else if (conditional_internal_fn_code (ifn) != ERROR_MARK)
9842 tree lhs = gimple_get_lhs (call);
9843 if (!VECTOR_TYPE_P (TREE_TYPE (lhs)))
9845 tree else_arg
9846 = gimple_call_arg (call, gimple_call_num_args (call) - 1);
9847 gimple *new_stmt = gimple_build_assign (lhs, else_arg);
9848 gsi_replace (&gsi, new_stmt, true);
9852 } /* BBs in loop */
9854 /* The vectorization factor is always > 1, so if we use an IV increment of 1,
9855 a zero NITERS becomes a nonzero NITERS_VECTOR. */
9856 if (integer_onep (step_vector))
9857 niters_no_overflow = true;
9858 vect_set_loop_condition (loop, loop_vinfo, niters_vector, step_vector,
9859 niters_vector_mult_vf, !niters_no_overflow);
9861 unsigned int assumed_vf = vect_vf_for_cost (loop_vinfo);
9862 scale_profile_for_vect_loop (loop, assumed_vf);
9864 /* True if the final iteration might not handle a full vector's
9865 worth of scalar iterations. */
9866 bool final_iter_may_be_partial
9867 = LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo);
9868 /* The minimum number of iterations performed by the epilogue. This
9869 is 1 when peeling for gaps because we always need a final scalar
9870 iteration. */
9871 int min_epilogue_iters = LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo) ? 1 : 0;
9872 /* +1 to convert latch counts to loop iteration counts,
9873 -min_epilogue_iters to remove iterations that cannot be performed
9874 by the vector code. */
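/* Worked example (illustrative numbers): with a scalar latch bound of 100
   (at most 101 iterations), lowest_vf == 4, no peeling for gaps
   (min_epilogue_iters == 0, bias_for_lowest == 1) and no mask-based peeling
   for alignment, the computation below yields
   floor ((100 + 1) / 4) - 1 == 24, i.e. the vector loop runs at most
   25 times.  */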
9875 int bias_for_lowest = 1 - min_epilogue_iters;
9876 int bias_for_assumed = bias_for_lowest;
9877 int alignment_npeels = LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo);
9878 if (alignment_npeels && LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))
9880 /* When the amount of peeling is known at compile time, the first
9881 iteration will have exactly alignment_npeels active elements.
9882 In the worst case it will have at least one. */
9883 int min_first_active = (alignment_npeels > 0 ? alignment_npeels : 1);
9884 bias_for_lowest += lowest_vf - min_first_active;
9885 bias_for_assumed += assumed_vf - min_first_active;
9887 /* In these calculations the "- 1" converts loop iteration counts
9888 back to latch counts. */
9889 if (loop->any_upper_bound)
9891 loop_vec_info main_vinfo = LOOP_VINFO_ORIG_LOOP_INFO (loop_vinfo);
9892 loop->nb_iterations_upper_bound
9893 = (final_iter_may_be_partial
9894 ? wi::udiv_ceil (loop->nb_iterations_upper_bound + bias_for_lowest,
9895 lowest_vf) - 1
9896 : wi::udiv_floor (loop->nb_iterations_upper_bound + bias_for_lowest,
9897 lowest_vf) - 1);
9898 if (main_vinfo)
9900 unsigned int bound;
9901 poly_uint64 main_iters
9902 = upper_bound (LOOP_VINFO_VECT_FACTOR (main_vinfo),
9903 LOOP_VINFO_COST_MODEL_THRESHOLD (main_vinfo));
9904 main_iters
9905 = upper_bound (main_iters,
9906 LOOP_VINFO_VERSIONING_THRESHOLD (main_vinfo));
9907 if (can_div_away_from_zero_p (main_iters,
9908 LOOP_VINFO_VECT_FACTOR (loop_vinfo),
9909 &bound))
9910 loop->nb_iterations_upper_bound
9911 = wi::umin ((widest_int) (bound - 1),
9912 loop->nb_iterations_upper_bound);
9915 if (loop->any_likely_upper_bound)
9916 loop->nb_iterations_likely_upper_bound
9917 = (final_iter_may_be_partial
9918 ? wi::udiv_ceil (loop->nb_iterations_likely_upper_bound
9919 + bias_for_lowest, lowest_vf) - 1
9920 : wi::udiv_floor (loop->nb_iterations_likely_upper_bound
9921 + bias_for_lowest, lowest_vf) - 1);
9922 if (loop->any_estimate)
9923 loop->nb_iterations_estimate
9924 = (final_iter_may_be_partial
9925 ? wi::udiv_ceil (loop->nb_iterations_estimate + bias_for_assumed,
9926 assumed_vf) - 1
9927 : wi::udiv_floor (loop->nb_iterations_estimate + bias_for_assumed,
9928 assumed_vf) - 1);
9930 if (dump_enabled_p ())
9932 if (!LOOP_VINFO_EPILOGUE_P (loop_vinfo))
9934 dump_printf_loc (MSG_NOTE, vect_location,
9935 "LOOP VECTORIZED\n");
9936 if (loop->inner)
9937 dump_printf_loc (MSG_NOTE, vect_location,
9938 "OUTER LOOP VECTORIZED\n");
9939 dump_printf (MSG_NOTE, "\n");
9941 else
9942 dump_printf_loc (MSG_NOTE, vect_location,
9943 "LOOP EPILOGUE VECTORIZED (MODE=%s)\n",
9944 GET_MODE_NAME (loop_vinfo->vector_mode));
9947 /* Loops vectorized with a variable factor won't benefit from
9948 unrolling/peeling. */
9949 if (!vf.is_constant ())
9951 loop->unroll = 1;
9952 if (dump_enabled_p ())
9953 dump_printf_loc (MSG_NOTE, vect_location, "Disabling unrolling due to"
9954 " variable-length vectorization factor\n");
9956 /* Free SLP instances here because otherwise stmt reference counting
9957 won't work. */
9958 slp_instance instance;
9959 FOR_EACH_VEC_ELT (LOOP_VINFO_SLP_INSTANCES (loop_vinfo), i, instance)
9960 vect_free_slp_instance (instance);
9961 LOOP_VINFO_SLP_INSTANCES (loop_vinfo).release ();
9962 /* Clear the safelen field since its value is invalid after vectorization,
9963 as the vectorized loop can have loop-carried dependencies. */
9964 loop->safelen = 0;
9966 if (epilogue)
9968 update_epilogue_loop_vinfo (epilogue, advance);
9970 epilogue->simduid = loop->simduid;
9971 epilogue->force_vectorize = loop->force_vectorize;
9972 epilogue->dont_vectorize = false;
9975 return epilogue;
9978 /* The code below is trying to perform a simple optimization - revert
9979 if-conversion for masked stores, i.e. if the mask of a store is zero,
9980 do not perform the store and, if possible, also skip the producers of the stored values.
9981 For example,
9982 for (i=0; i<n; i++)
9983 if (c[i])
9985 p1[i] += 1;
9986 p2[i] = p3[i] +2;
9988 this transformation will produce the following semi-hammock:
9990 if (!mask__ifc__42.18_165 == { 0, 0, 0, 0, 0, 0, 0, 0 })
9992 vect__11.19_170 = MASK_LOAD (vectp_p1.20_168, 0B, mask__ifc__42.18_165);
9993 vect__12.22_172 = vect__11.19_170 + vect_cst__171;
9994 MASK_STORE (vectp_p1.23_175, 0B, mask__ifc__42.18_165, vect__12.22_172);
9995 vect__18.25_182 = MASK_LOAD (vectp_p3.26_180, 0B, mask__ifc__42.18_165);
9996 vect__19.28_184 = vect__18.25_182 + vect_cst__183;
9997 MASK_STORE (vectp_p2.29_187, 0B, mask__ifc__42.18_165, vect__19.28_184);

void
optimize_mask_stores (class loop *loop)
{
  basic_block *bbs = get_loop_body (loop);
  unsigned nbbs = loop->num_nodes;
  unsigned i;
  basic_block bb;
  class loop *bb_loop;
  gimple_stmt_iterator gsi;
  gimple *stmt;
  auto_vec<gimple *> worklist;
  auto_purge_vect_location sentinel;

  vect_location = find_loop_location (loop);
  /* Pick up all masked stores in loop if any.  */
  for (i = 0; i < nbbs; i++)
    {
      bb = bbs[i];
      for (gsi = gsi_start_bb (bb); !gsi_end_p (gsi);
	   gsi_next (&gsi))
	{
	  stmt = gsi_stmt (gsi);
	  if (gimple_call_internal_p (stmt, IFN_MASK_STORE))
	    worklist.safe_push (stmt);
	}
    }

  free (bbs);
  if (worklist.is_empty ())
    return;

  /* Loop has masked stores.  */
  while (!worklist.is_empty ())
    {
      gimple *last, *last_store;
      edge e, efalse;
      tree mask;
      basic_block store_bb, join_bb;
      gimple_stmt_iterator gsi_to;
      tree vdef, new_vdef;
      gphi *phi;
      tree vectype;
      tree zero;

      last = worklist.pop ();
      mask = gimple_call_arg (last, 2);
      bb = gimple_bb (last);
      /* Create then_bb and an if-then structure in the CFG; then_bb belongs
	 to the same loop as if_bb.  It may be different from LOOP when a
	 two-level loop nest is vectorized and the mask_store belongs to the
	 inner loop.  */
      e = split_block (bb, last);
      bb_loop = bb->loop_father;
      gcc_assert (loop == bb_loop || flow_loop_nested_p (loop, bb_loop));
      join_bb = e->dest;
      store_bb = create_empty_bb (bb);
      add_bb_to_loop (store_bb, bb_loop);
      e->flags = EDGE_TRUE_VALUE;
      efalse = make_edge (bb, store_bb, EDGE_FALSE_VALUE);
      /* Mark the edge into STORE_BB as unlikely.  */
      efalse->probability = profile_probability::unlikely ();
      store_bb->count = efalse->count ();
      make_single_succ_edge (store_bb, join_bb, EDGE_FALLTHRU);
      if (dom_info_available_p (CDI_DOMINATORS))
	set_immediate_dominator (CDI_DOMINATORS, store_bb, bb);
      if (dump_enabled_p ())
	dump_printf_loc (MSG_NOTE, vect_location,
			 "Create new block %d to sink mask stores.",
			 store_bb->index);
      /* Create vector comparison with boolean result.  */
      vectype = TREE_TYPE (mask);
      zero = build_zero_cst (vectype);
      stmt = gimple_build_cond (EQ_EXPR, mask, zero, NULL_TREE, NULL_TREE);
      gsi = gsi_last_bb (bb);
      gsi_insert_after (&gsi, stmt, GSI_SAME_STMT);
      /* Create new PHI node for vdef of the last masked store:
	 .MEM_2 = VDEF <.MEM_1>
	 will be converted to
	 .MEM.3 = VDEF <.MEM_1>
	 and new PHI node will be created in join bb
	 .MEM_2 = PHI <.MEM_1, .MEM_3>
      */
      vdef = gimple_vdef (last);
      new_vdef = make_ssa_name (gimple_vop (cfun), last);
      gimple_set_vdef (last, new_vdef);
      phi = create_phi_node (vdef, join_bb);
      add_phi_arg (phi, new_vdef, EDGE_SUCC (store_bb, 0), UNKNOWN_LOCATION);

      /* Put all masked stores with the same mask to STORE_BB if possible.  */
      while (true)
	{
	  gimple_stmt_iterator gsi_from;
	  gimple *stmt1 = NULL;

	  /* Move masked store to STORE_BB.  */
	  last_store = last;
	  gsi = gsi_for_stmt (last);
	  gsi_from = gsi;
	  /* Shift GSI to the previous stmt for further traversal.  */
	  gsi_prev (&gsi);
	  gsi_to = gsi_start_bb (store_bb);
	  gsi_move_before (&gsi_from, &gsi_to);
	  /* Setup GSI_TO to the non-empty block start.  */
	  gsi_to = gsi_start_bb (store_bb);
	  if (dump_enabled_p ())
	    dump_printf_loc (MSG_NOTE, vect_location,
			     "Move stmt to created bb\n%G", last);
	  /* Move all stored value producers if possible.  */
	  while (!gsi_end_p (gsi))
	    {
	      tree lhs;
	      imm_use_iterator imm_iter;
	      use_operand_p use_p;
	      bool res;

	      /* Skip debug statements.  */
	      if (is_gimple_debug (gsi_stmt (gsi)))
		{
		  gsi_prev (&gsi);
		  continue;
		}
	      stmt1 = gsi_stmt (gsi);
	      /* Do not consider statements that write to memory or have
		 a volatile operand.  */
	      if (gimple_vdef (stmt1)
		  || gimple_has_volatile_ops (stmt1))
		break;
	      gsi_from = gsi;
	      gsi_prev (&gsi);
	      lhs = gimple_get_lhs (stmt1);
	      if (!lhs)
		break;

	      /* LHS of vectorized stmt must be SSA_NAME.  */
	      if (TREE_CODE (lhs) != SSA_NAME)
		break;

	      if (!VECTOR_TYPE_P (TREE_TYPE (lhs)))
		{
		  /* Remove dead scalar statement.  */
		  if (has_zero_uses (lhs))
		    {
		      gsi_remove (&gsi_from, true);
		      continue;
		    }
		  break;
		}
	      /* Check that LHS does not have uses outside of STORE_BB.  */
	      res = true;
	      FOR_EACH_IMM_USE_FAST (use_p, imm_iter, lhs)
		{
		  gimple *use_stmt;
		  use_stmt = USE_STMT (use_p);
		  if (is_gimple_debug (use_stmt))
		    continue;
		  if (gimple_bb (use_stmt) != store_bb)
		    {
		      res = false;
		      break;
		    }
		}
	      if (!res)
		break;

	      if (gimple_vuse (stmt1)
		  && gimple_vuse (stmt1) != gimple_vuse (last_store))
		break;

	      /* Can move STMT1 to STORE_BB.  */
	      if (dump_enabled_p ())
		dump_printf_loc (MSG_NOTE, vect_location,
				 "Move stmt to created bb\n%G", stmt1);
	      gsi_move_before (&gsi_from, &gsi_to);
	      /* Shift GSI_TO for further insertion.  */
	      gsi_prev (&gsi_to);
	    }
	  /* Put other masked stores with the same mask to STORE_BB.  */
	  if (worklist.is_empty ()
	      || gimple_call_arg (worklist.last (), 2) != mask
	      || worklist.last () != stmt1)
	    break;
	  last = worklist.pop ();
	}
      add_phi_arg (phi, gimple_vuse (last_store), e, UNKNOWN_LOCATION);
    }
}
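
/* Illustrative sketch: after the sinking above, each group of masked stores
   guarded by the same mask ends up in a CFG shape roughly like the following,
   where the block names follow the local variables in optimize_mask_stores:

     bb:
       ...
       if (mask == { 0, ... })      <- all-false mask
	 goto join_bb;              <- EDGE_TRUE_VALUE, stores skipped
       else
	 goto store_bb;             <- EDGE_FALSE_VALUE

     store_bb:
       <sunk MASK_STOREs and the producers of their stored values>
       goto join_bb;

     join_bb:
       .MEM_2 = PHI <.MEM_1 (bb), .MEM_3 (store_bb)>
       ...  */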

/* Decide whether it is possible to use a zero-based induction variable
   when vectorizing LOOP_VINFO with partial vectors.  If it is, return
   the value that the induction variable must be able to hold in order
   to ensure that the rgroups eventually have no active vector elements.
   Return -1 otherwise.  */

widest_int
vect_iv_limit_for_partial_vectors (loop_vec_info loop_vinfo)
{
  tree niters_skip = LOOP_VINFO_MASK_SKIP_NITERS (loop_vinfo);
  class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
  unsigned HOST_WIDE_INT max_vf = vect_max_vf (loop_vinfo);

  /* Calculate the value that the induction variable must be able
     to hit in order to ensure that we end the loop with an all-false mask.
     This involves adding the maximum number of inactive trailing scalar
     iterations.  */
  widest_int iv_limit = -1;
  if (max_loop_iterations (loop, &iv_limit))
    {
      if (niters_skip)
	{
	  /* Add the maximum number of skipped iterations to the
	     maximum iteration count.  */
	  if (TREE_CODE (niters_skip) == INTEGER_CST)
	    iv_limit += wi::to_widest (niters_skip);
	  else
	    iv_limit += max_vf - 1;
	}
      else if (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo))
	/* Make a conservatively-correct assumption.  */
	iv_limit += max_vf - 1;

      /* IV_LIMIT is the maximum number of latch iterations, which is also
	 the maximum in-range IV value.  Round this value down to the previous
	 vector alignment boundary and then add an extra full iteration.  */
      poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
      iv_limit = (iv_limit & -(int) known_alignment (vf)) + max_vf;
    }
  return iv_limit;
}
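
/* Worked example for the limit above (illustrative only): for a loop whose
   maximum latch count is 1001, a constant vectorization factor of 4 (so
   max_vf == 4 and known_alignment (vf) == 4) and no skipped or peeled
   iterations, the function returns
     (1001 & -4) + 4 == 1000 + 4 == 1004,
   i.e. the IV must be able to represent 1004 without wrapping.  */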

/* For the given rgroup_controls RGC, check whether an induction variable
   would ever hit a value that produces a set of all-false masks or zero
   lengths before wrapping around.  Return true if it's possible to wrap
   around before hitting the desirable value, otherwise return false.  */

bool
vect_rgroup_iv_might_wrap_p (loop_vec_info loop_vinfo, rgroup_controls *rgc)
{
  widest_int iv_limit = vect_iv_limit_for_partial_vectors (loop_vinfo);

  if (iv_limit == -1)
    return true;

  tree compare_type = LOOP_VINFO_RGROUP_COMPARE_TYPE (loop_vinfo);
  unsigned int compare_precision = TYPE_PRECISION (compare_type);
  unsigned nitems = rgc->max_nscalars_per_iter * rgc->factor;

  if (wi::min_precision (iv_limit * nitems, UNSIGNED) > compare_precision)
    return true;

  return false;
}
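
/* Worked example (illustrative only): with iv_limit == 1004 as in the sketch
   above, max_nscalars_per_iter == 2 and factor == 2 give nitems == 4, so the
   IV must be able to count up to 1004 * 4 == 4016, which needs 12 bits as an
   unsigned value.  A 16-bit compare type therefore cannot wrap (12 <= 16),
   whereas an 8-bit compare type would make this function return true.  */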