1 /* Loop Vectorization
2 Copyright (C) 2003-2021 Free Software Foundation, Inc.
3 Contributed by Dorit Naishlos <dorit@il.ibm.com> and
4 Ira Rosen <irar@il.ibm.com>
6 This file is part of GCC.
8 GCC is free software; you can redistribute it and/or modify it under
9 the terms of the GNU General Public License as published by the Free
10 Software Foundation; either version 3, or (at your option) any later
11 version.
13 GCC is distributed in the hope that it will be useful, but WITHOUT ANY
14 WARRANTY; without even the implied warranty of MERCHANTABILITY or
15 FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
16 for more details.
18 You should have received a copy of the GNU General Public License
19 along with GCC; see the file COPYING3. If not see
20 <http://www.gnu.org/licenses/>. */
22 #define INCLUDE_ALGORITHM
23 #include "config.h"
24 #include "system.h"
25 #include "coretypes.h"
26 #include "backend.h"
27 #include "target.h"
28 #include "rtl.h"
29 #include "tree.h"
30 #include "gimple.h"
31 #include "cfghooks.h"
32 #include "tree-pass.h"
33 #include "ssa.h"
34 #include "optabs-tree.h"
35 #include "diagnostic-core.h"
36 #include "fold-const.h"
37 #include "stor-layout.h"
38 #include "cfganal.h"
39 #include "gimplify.h"
40 #include "gimple-iterator.h"
41 #include "gimplify-me.h"
42 #include "tree-ssa-loop-ivopts.h"
43 #include "tree-ssa-loop-manip.h"
44 #include "tree-ssa-loop-niter.h"
45 #include "tree-ssa-loop.h"
46 #include "cfgloop.h"
47 #include "tree-scalar-evolution.h"
48 #include "tree-vectorizer.h"
49 #include "gimple-fold.h"
50 #include "cgraph.h"
51 #include "tree-cfg.h"
52 #include "tree-if-conv.h"
53 #include "internal-fn.h"
54 #include "tree-vector-builder.h"
55 #include "vec-perm-indices.h"
56 #include "tree-eh.h"
57 #include "case-cfn-macros.h"
59 /* Loop Vectorization Pass.
61 This pass tries to vectorize loops.
63 For example, the vectorizer transforms the following simple loop:
65 short a[N]; short b[N]; short c[N]; int i;
67 for (i=0; i<N; i++){
68 a[i] = b[i] + c[i];
71 as if it was manually vectorized by rewriting the source code into:
73 typedef int __attribute__((mode(V8HI))) v8hi;
74 short a[N]; short b[N]; short c[N]; int i;
75 v8hi *pa = (v8hi*)a, *pb = (v8hi*)b, *pc = (v8hi*)c;
76 v8hi va, vb, vc;
78 for (i=0; i<N/8; i++){
79 vb = pb[i];
80 vc = pc[i];
81 va = vb + vc;
82 pa[i] = va;
85 The main entry to this pass is vectorize_loops(), in which
86 the vectorizer applies a set of analyses on a given set of loops,
87 followed by the actual vectorization transformation for the loops that
88 had successfully passed the analysis phase.
89 Throughout this pass we make a distinction between two types of
90 data: scalars (which are represented by SSA_NAMES), and memory references
91 ("data-refs"). These two types of data require different handling both
92 during analysis and transformation. The types of data-refs that the
93 vectorizer currently supports are ARRAY_REFS whose base is an array DECL
94 (not a pointer), and INDIRECT_REFS through pointers; both array and pointer
95 accesses are required to have a simple (consecutive) access pattern.
97 Analysis phase:
98 ===============
99 The driver for the analysis phase is vect_analyze_loop().
100 It applies a set of analyses, some of which rely on the scalar evolution
101 analyzer (scev) developed by Sebastian Pop.
103 During the analysis phase the vectorizer records some information
104 per stmt in a "stmt_vec_info" struct which is attached to each stmt in the
105 loop, as well as general information about the loop as a whole, which is
106 recorded in a "loop_vec_info" struct attached to each loop.
108 Transformation phase:
109 =====================
110 The loop transformation phase scans all the stmts in the loop, and
111 creates a vector stmt (or a sequence of stmts) for each scalar stmt S in
112 the loop that needs to be vectorized. It inserts the vector code sequence
113 just before the scalar stmt S, and records a pointer to the vector code
114 in STMT_VINFO_VEC_STMT (stmt_info) (stmt_info is the stmt_vec_info struct
115 attached to S). This pointer will be used for the vectorization of following
116 stmts which use the def of stmt S. Stmt S is removed if it writes to memory;
117 otherwise, we rely on dead code elimination for removing it.
119 For example, say stmt S1 was vectorized into stmt VS1:
121 VS1: vb = px[i];
122 S1: b = x[i]; STMT_VINFO_VEC_STMT (stmt_info (S1)) = VS1
123 S2: a = b;
125 To vectorize stmt S2, the vectorizer first finds the stmt that defines
126 the operand 'b' (S1), and gets the relevant vector def 'vb' from the
127 vector stmt VS1 pointed to by STMT_VINFO_VEC_STMT (stmt_info (S1)). The
128 resulting sequence would be:
130 VS1: vb = px[i];
131 S1: b = x[i]; STMT_VINFO_VEC_STMT (stmt_info (S1)) = VS1
132 VS2: va = vb;
133 S2: a = b; STMT_VINFO_VEC_STMT (stmt_info (S2)) = VS2
135 Operands that are not SSA_NAMEs are data-refs that appear in
136 load/store operations (like 'x[i]' in S1), and are handled differently.
138 Target modeling:
139 =================
140 Currently the only target specific information that is used is the
141 size of the vector (in bytes) - "TARGET_VECTORIZE_UNITS_PER_SIMD_WORD".
142 Targets that can support different sizes of vectors, for now will need
143 to specify one value for "TARGET_VECTORIZE_UNITS_PER_SIMD_WORD". More
144 flexibility will be added in the future.
146 Since we only vectorize operations whose vector form can be
147 expressed using existing tree codes, to verify that an operation is
148 supported, the vectorizer checks the relevant optab at the relevant
149 machine_mode (e.g., optab_handler (add_optab, V8HImode)). If
150 the value found is CODE_FOR_nothing, then there's no target support, and
151 we can't vectorize the stmt.
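     As an illustrative sketch (an assumed shape only, not a quote of the
     vectorizer sources), that check amounts to:

       machine_mode vmode = TYPE_MODE (vectype);   // e.g. V8HImode
       if (optab_handler (add_optab, vmode) == CODE_FOR_nothing)
         return false;   // no target support - the stmt cannot be vectorized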
153 For additional information on this project see:
154 http://gcc.gnu.org/projects/tree-ssa/vectorization.html
157 static void vect_estimate_min_profitable_iters (loop_vec_info, int *, int *);
158 static stmt_vec_info vect_is_simple_reduction (loop_vec_info, stmt_vec_info,
159 bool *, bool *);
161 /* Subroutine of vect_determine_vf_for_stmt that handles only one
162 statement. VECTYPE_MAYBE_SET_P is true if STMT_VINFO_VECTYPE
163 may already be set for general statements (not just data refs). */
165 static opt_result
166 vect_determine_vf_for_stmt_1 (vec_info *vinfo, stmt_vec_info stmt_info,
167 bool vectype_maybe_set_p,
168 poly_uint64 *vf)
170 gimple *stmt = stmt_info->stmt;
172 if ((!STMT_VINFO_RELEVANT_P (stmt_info)
173 && !STMT_VINFO_LIVE_P (stmt_info))
174 || gimple_clobber_p (stmt))
176 if (dump_enabled_p ())
177 dump_printf_loc (MSG_NOTE, vect_location, "skip.\n");
178 return opt_result::success ();
181 tree stmt_vectype, nunits_vectype;
182 opt_result res = vect_get_vector_types_for_stmt (vinfo, stmt_info,
183 &stmt_vectype,
184 &nunits_vectype);
185 if (!res)
186 return res;
188 if (stmt_vectype)
190 if (STMT_VINFO_VECTYPE (stmt_info))
191 /* The only case when a vectype has already been set is for stmts
192 that contain a data ref, or for "pattern-stmts" (stmts generated
193 by the vectorizer to represent/replace a certain idiom). */
194 gcc_assert ((STMT_VINFO_DATA_REF (stmt_info)
195 || vectype_maybe_set_p)
196 && STMT_VINFO_VECTYPE (stmt_info) == stmt_vectype);
197 else
198 STMT_VINFO_VECTYPE (stmt_info) = stmt_vectype;
201 if (nunits_vectype)
202 vect_update_max_nunits (vf, nunits_vectype);
204 return opt_result::success ();
207 /* Subroutine of vect_determine_vectorization_factor. Set the vector
208 types of STMT_INFO and all attached pattern statements and update
209 the vectorization factor VF accordingly. Return true on success
210 or false if something prevented vectorization. */
212 static opt_result
213 vect_determine_vf_for_stmt (vec_info *vinfo,
214 stmt_vec_info stmt_info, poly_uint64 *vf)
216 if (dump_enabled_p ())
217 dump_printf_loc (MSG_NOTE, vect_location, "==> examining statement: %G",
218 stmt_info->stmt);
219 opt_result res = vect_determine_vf_for_stmt_1 (vinfo, stmt_info, false, vf);
220 if (!res)
221 return res;
223 if (STMT_VINFO_IN_PATTERN_P (stmt_info)
224 && STMT_VINFO_RELATED_STMT (stmt_info))
226 gimple *pattern_def_seq = STMT_VINFO_PATTERN_DEF_SEQ (stmt_info);
227 stmt_info = STMT_VINFO_RELATED_STMT (stmt_info);
229 /* If a pattern statement has def stmts, analyze them too. */
230 for (gimple_stmt_iterator si = gsi_start (pattern_def_seq);
231 !gsi_end_p (si); gsi_next (&si))
233 stmt_vec_info def_stmt_info = vinfo->lookup_stmt (gsi_stmt (si));
234 if (dump_enabled_p ())
235 dump_printf_loc (MSG_NOTE, vect_location,
236 "==> examining pattern def stmt: %G",
237 def_stmt_info->stmt);
238 res = vect_determine_vf_for_stmt_1 (vinfo, def_stmt_info, true, vf);
239 if (!res)
240 return res;
243 if (dump_enabled_p ())
244 dump_printf_loc (MSG_NOTE, vect_location,
245 "==> examining pattern statement: %G",
246 stmt_info->stmt);
247 res = vect_determine_vf_for_stmt_1 (vinfo, stmt_info, true, vf);
248 if (!res)
249 return res;
252 return opt_result::success ();
255 /* Function vect_determine_vectorization_factor
257 Determine the vectorization factor (VF). VF is the number of data elements
258 that are operated upon in parallel in a single iteration of the vectorized
259 loop. For example, when vectorizing a loop that operates on 4-byte elements,
260 on a target with a 16-byte vector size (VS), the VF is set to 4, since 4
261 elements can fit in a single vector register.
263 We currently support vectorization of loops in which all types operated upon
264 are of the same size. Therefore this function currently sets VF according to
265 the size of the types operated upon, and fails if there are multiple sizes
266 in the loop.
268 VF is also the factor by which the loop iterations are strip-mined, e.g.:
269 original loop:
270 for (i=0; i<N; i++){
271 a[i] = b[i] + c[i];
274 vectorized loop:
275 for (i=0; i<N; i+=VF){
276 a[i:VF] = b[i:VF] + c[i:VF];
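   A worked example (illustrative): with 4-byte elements and 16-byte vectors,
   VF = 4; for N = 10 the vectorized loop covers i = 0..7 in two iterations,
   and the remaining 2 iterations are handled separately (by an epilogue
   loop, peeling or partial vectors).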
280 static opt_result
281 vect_determine_vectorization_factor (loop_vec_info loop_vinfo)
283 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
284 basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
285 unsigned nbbs = loop->num_nodes;
286 poly_uint64 vectorization_factor = 1;
287 tree scalar_type = NULL_TREE;
288 gphi *phi;
289 tree vectype;
290 stmt_vec_info stmt_info;
291 unsigned i;
293 DUMP_VECT_SCOPE ("vect_determine_vectorization_factor");
295 for (i = 0; i < nbbs; i++)
297 basic_block bb = bbs[i];
299 for (gphi_iterator si = gsi_start_phis (bb); !gsi_end_p (si);
300 gsi_next (&si))
302 phi = si.phi ();
303 stmt_info = loop_vinfo->lookup_stmt (phi);
304 if (dump_enabled_p ())
305 dump_printf_loc (MSG_NOTE, vect_location, "==> examining phi: %G",
306 phi);
308 gcc_assert (stmt_info);
310 if (STMT_VINFO_RELEVANT_P (stmt_info)
311 || STMT_VINFO_LIVE_P (stmt_info))
313 gcc_assert (!STMT_VINFO_VECTYPE (stmt_info));
314 scalar_type = TREE_TYPE (PHI_RESULT (phi));
316 if (dump_enabled_p ())
317 dump_printf_loc (MSG_NOTE, vect_location,
318 "get vectype for scalar type: %T\n",
319 scalar_type);
321 vectype = get_vectype_for_scalar_type (loop_vinfo, scalar_type);
322 if (!vectype)
323 return opt_result::failure_at (phi,
324 "not vectorized: unsupported "
325 "data-type %T\n",
326 scalar_type);
327 STMT_VINFO_VECTYPE (stmt_info) = vectype;
329 if (dump_enabled_p ())
330 dump_printf_loc (MSG_NOTE, vect_location, "vectype: %T\n",
331 vectype);
333 if (dump_enabled_p ())
335 dump_printf_loc (MSG_NOTE, vect_location, "nunits = ");
336 dump_dec (MSG_NOTE, TYPE_VECTOR_SUBPARTS (vectype));
337 dump_printf (MSG_NOTE, "\n");
340 vect_update_max_nunits (&vectorization_factor, vectype);
344 for (gimple_stmt_iterator si = gsi_start_bb (bb); !gsi_end_p (si);
345 gsi_next (&si))
347 if (is_gimple_debug (gsi_stmt (si)))
348 continue;
349 stmt_info = loop_vinfo->lookup_stmt (gsi_stmt (si));
350 opt_result res
351 = vect_determine_vf_for_stmt (loop_vinfo,
352 stmt_info, &vectorization_factor);
353 if (!res)
354 return res;
358 /* TODO: Analyze cost. Decide if worth while to vectorize. */
359 if (dump_enabled_p ())
361 dump_printf_loc (MSG_NOTE, vect_location, "vectorization factor = ");
362 dump_dec (MSG_NOTE, vectorization_factor);
363 dump_printf (MSG_NOTE, "\n");
366 if (known_le (vectorization_factor, 1U))
367 return opt_result::failure_at (vect_location,
368 "not vectorized: unsupported data-type\n");
369 LOOP_VINFO_VECT_FACTOR (loop_vinfo) = vectorization_factor;
370 return opt_result::success ();
374 /* Function vect_is_simple_iv_evolution.
376 FORNOW: A simple evolution of an induction variable in the loop is
377 considered a polynomial evolution. */
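/* For example (illustrative): an IV whose scalar evolution is the chrec
   {0, +, 4}_1 is "simple": *INIT is set to 0 and *STEP to 4.  */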
379 static bool
380 vect_is_simple_iv_evolution (unsigned loop_nb, tree access_fn, tree * init,
381 tree * step)
383 tree init_expr;
384 tree step_expr;
385 tree evolution_part = evolution_part_in_loop_num (access_fn, loop_nb);
386 basic_block bb;
388 /* When there is no evolution in this loop, the evolution function
389 is not "simple". */
390 if (evolution_part == NULL_TREE)
391 return false;
393 /* When the evolution is a polynomial of degree >= 2
394 the evolution function is not "simple". */
395 if (tree_is_chrec (evolution_part))
396 return false;
398 step_expr = evolution_part;
399 init_expr = unshare_expr (initial_condition_in_loop_num (access_fn, loop_nb));
401 if (dump_enabled_p ())
402 dump_printf_loc (MSG_NOTE, vect_location, "step: %T, init: %T\n",
403 step_expr, init_expr);
405 *init = init_expr;
406 *step = step_expr;
408 if (TREE_CODE (step_expr) != INTEGER_CST
409 && (TREE_CODE (step_expr) != SSA_NAME
410 || ((bb = gimple_bb (SSA_NAME_DEF_STMT (step_expr)))
411 && flow_bb_inside_loop_p (get_loop (cfun, loop_nb), bb))
412 || (!INTEGRAL_TYPE_P (TREE_TYPE (step_expr))
413 && (!SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr))
414 || !flag_associative_math)))
415 && (TREE_CODE (step_expr) != REAL_CST
416 || !flag_associative_math))
418 if (dump_enabled_p ())
419 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
420 "step unknown.\n");
421 return false;
424 return true;
427 /* Return true if PHI, described by STMT_INFO, is the inner PHI in
428 what we are assuming is a double reduction. For example, given
429 a structure like this:
431 outer1:
432 x_1 = PHI <x_4(outer2), ...>;
435 inner:
436 x_2 = PHI <x_1(outer1), ...>;
438 x_3 = ...;
441 outer2:
442 x_4 = PHI <x_3(inner)>;
445 outer loop analysis would treat x_1 as a double reduction phi and
446 this function would then return true for x_2. */
448 static bool
449 vect_inner_phi_in_double_reduction_p (loop_vec_info loop_vinfo, gphi *phi)
451 use_operand_p use_p;
452 ssa_op_iter op_iter;
453 FOR_EACH_PHI_ARG (use_p, phi, op_iter, SSA_OP_USE)
454 if (stmt_vec_info def_info = loop_vinfo->lookup_def (USE_FROM_PTR (use_p)))
455 if (STMT_VINFO_DEF_TYPE (def_info) == vect_double_reduction_def)
456 return true;
457 return false;
460 /* Function vect_analyze_scalar_cycles_1.
462 Examine the cross iteration def-use cycles of scalar variables
463 in LOOP. LOOP_VINFO represents the loop that is now being
464 considered for vectorization (can be LOOP, or an outer-loop
465 enclosing LOOP). */
467 static void
468 vect_analyze_scalar_cycles_1 (loop_vec_info loop_vinfo, class loop *loop)
470 basic_block bb = loop->header;
471 tree init, step;
472 auto_vec<stmt_vec_info, 64> worklist;
473 gphi_iterator gsi;
474 bool double_reduc, reduc_chain;
476 DUMP_VECT_SCOPE ("vect_analyze_scalar_cycles");
478 /* First - identify all inductions. Reduction detection assumes that all the
479 inductions have been identified, therefore, this order must not be
480 changed. */
481 for (gsi = gsi_start_phis (bb); !gsi_end_p (gsi); gsi_next (&gsi))
483 gphi *phi = gsi.phi ();
484 tree access_fn = NULL;
485 tree def = PHI_RESULT (phi);
486 stmt_vec_info stmt_vinfo = loop_vinfo->lookup_stmt (phi);
488 if (dump_enabled_p ())
489 dump_printf_loc (MSG_NOTE, vect_location, "Analyze phi: %G", phi);
491 /* Skip virtual phi's. The data dependences that are associated with
492 virtual defs/uses (i.e., memory accesses) are analyzed elsewhere. */
493 if (virtual_operand_p (def))
494 continue;
496 STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_unknown_def_type;
498 /* Analyze the evolution function. */
499 access_fn = analyze_scalar_evolution (loop, def);
500 if (access_fn)
502 STRIP_NOPS (access_fn);
503 if (dump_enabled_p ())
504 dump_printf_loc (MSG_NOTE, vect_location,
505 "Access function of PHI: %T\n", access_fn);
506 STMT_VINFO_LOOP_PHI_EVOLUTION_BASE_UNCHANGED (stmt_vinfo)
507 = initial_condition_in_loop_num (access_fn, loop->num);
508 STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_vinfo)
509 = evolution_part_in_loop_num (access_fn, loop->num);
512 if (!access_fn
513 || vect_inner_phi_in_double_reduction_p (loop_vinfo, phi)
514 || !vect_is_simple_iv_evolution (loop->num, access_fn, &init, &step)
515 || (LOOP_VINFO_LOOP (loop_vinfo) != loop
516 && TREE_CODE (step) != INTEGER_CST))
518 worklist.safe_push (stmt_vinfo);
519 continue;
522 gcc_assert (STMT_VINFO_LOOP_PHI_EVOLUTION_BASE_UNCHANGED (stmt_vinfo)
523 != NULL_TREE);
524 gcc_assert (STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_vinfo) != NULL_TREE);
526 if (dump_enabled_p ())
527 dump_printf_loc (MSG_NOTE, vect_location, "Detected induction.\n");
528 STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_induction_def;
532 /* Second - identify all reductions and nested cycles. */
533 while (worklist.length () > 0)
535 stmt_vec_info stmt_vinfo = worklist.pop ();
536 gphi *phi = as_a <gphi *> (stmt_vinfo->stmt);
537 tree def = PHI_RESULT (phi);
539 if (dump_enabled_p ())
540 dump_printf_loc (MSG_NOTE, vect_location, "Analyze phi: %G", phi);
542 gcc_assert (!virtual_operand_p (def)
543 && STMT_VINFO_DEF_TYPE (stmt_vinfo) == vect_unknown_def_type);
545 stmt_vec_info reduc_stmt_info
546 = vect_is_simple_reduction (loop_vinfo, stmt_vinfo, &double_reduc,
547 &reduc_chain);
548 if (reduc_stmt_info)
550 STMT_VINFO_REDUC_DEF (stmt_vinfo) = reduc_stmt_info;
551 STMT_VINFO_REDUC_DEF (reduc_stmt_info) = stmt_vinfo;
552 if (double_reduc)
554 if (dump_enabled_p ())
555 dump_printf_loc (MSG_NOTE, vect_location,
556 "Detected double reduction.\n");
558 STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_double_reduction_def;
559 STMT_VINFO_DEF_TYPE (reduc_stmt_info) = vect_double_reduction_def;
561 else
563 if (loop != LOOP_VINFO_LOOP (loop_vinfo))
565 if (dump_enabled_p ())
566 dump_printf_loc (MSG_NOTE, vect_location,
567 "Detected vectorizable nested cycle.\n");
569 STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_nested_cycle;
571 else
573 if (dump_enabled_p ())
574 dump_printf_loc (MSG_NOTE, vect_location,
575 "Detected reduction.\n");
577 STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_reduction_def;
578 STMT_VINFO_DEF_TYPE (reduc_stmt_info) = vect_reduction_def;
579 /* Store the reduction cycles for possible vectorization in
580 loop-aware SLP if it was not detected as reduction
581 chain. */
582 if (! reduc_chain)
583 LOOP_VINFO_REDUCTIONS (loop_vinfo).safe_push
584 (reduc_stmt_info);
588 else
589 if (dump_enabled_p ())
590 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
591 "Unknown def-use cycle pattern.\n");
596 /* Function vect_analyze_scalar_cycles.
598 Examine the cross iteration def-use cycles of scalar variables, by
599 analyzing the loop-header PHIs of scalar variables. Classify each
600 cycle as one of the following: invariant, induction, reduction, unknown.
601 We do that for the loop represented by LOOP_VINFO, and also for its
602 inner-loop, if it exists.
603 Examples for scalar cycles:
605 Example1: reduction:
607 loop1:
608 for (i=0; i<N; i++)
609 sum += a[i];
611 Example2: induction:
613 loop2:
614 for (i=0; i<N; i++)
615 a[i] = i; */
617 static void
618 vect_analyze_scalar_cycles (loop_vec_info loop_vinfo)
620 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
622 vect_analyze_scalar_cycles_1 (loop_vinfo, loop);
624 /* When vectorizing an outer-loop, the inner-loop is executed sequentially.
625 Reductions in such an inner-loop therefore have different properties than
626 the reductions in the nest that gets vectorized:
627 1. When vectorized, they are executed in the same order as in the original
628 scalar loop, so we can't change the order of computation when
629 vectorizing them.
630 2. FIXME: Inner-loop reductions can be used in the inner-loop, so the
631 current checks are too strict. */
633 if (loop->inner)
634 vect_analyze_scalar_cycles_1 (loop_vinfo, loop->inner);
637 /* Transfer group and reduction information from STMT_INFO to its
638 pattern stmt. */
640 static void
641 vect_fixup_reduc_chain (stmt_vec_info stmt_info)
643 stmt_vec_info firstp = STMT_VINFO_RELATED_STMT (stmt_info);
644 stmt_vec_info stmtp;
645 gcc_assert (!REDUC_GROUP_FIRST_ELEMENT (firstp)
646 && REDUC_GROUP_FIRST_ELEMENT (stmt_info));
647 REDUC_GROUP_SIZE (firstp) = REDUC_GROUP_SIZE (stmt_info);
650 stmtp = STMT_VINFO_RELATED_STMT (stmt_info);
651 gcc_checking_assert (STMT_VINFO_DEF_TYPE (stmtp)
652 == STMT_VINFO_DEF_TYPE (stmt_info));
653 REDUC_GROUP_FIRST_ELEMENT (stmtp) = firstp;
654 stmt_info = REDUC_GROUP_NEXT_ELEMENT (stmt_info);
655 if (stmt_info)
656 REDUC_GROUP_NEXT_ELEMENT (stmtp)
657 = STMT_VINFO_RELATED_STMT (stmt_info);
659 while (stmt_info);
662 /* Fixup scalar cycles that now have their stmts detected as patterns. */
664 static void
665 vect_fixup_scalar_cycles_with_patterns (loop_vec_info loop_vinfo)
667 stmt_vec_info first;
668 unsigned i;
670 FOR_EACH_VEC_ELT (LOOP_VINFO_REDUCTION_CHAINS (loop_vinfo), i, first)
672 stmt_vec_info next = REDUC_GROUP_NEXT_ELEMENT (first);
673 while (next)
675 if ((STMT_VINFO_IN_PATTERN_P (next)
676 != STMT_VINFO_IN_PATTERN_P (first))
677 || STMT_VINFO_REDUC_IDX (vect_stmt_to_vectorize (next)) == -1)
678 break;
679 next = REDUC_GROUP_NEXT_ELEMENT (next);
681 /* If all reduction chain members are well-formed patterns adjust
682 the group to group the pattern stmts instead. */
683 if (! next
684 && STMT_VINFO_REDUC_IDX (vect_stmt_to_vectorize (first)) != -1)
686 if (STMT_VINFO_IN_PATTERN_P (first))
688 vect_fixup_reduc_chain (first);
689 LOOP_VINFO_REDUCTION_CHAINS (loop_vinfo)[i]
690 = STMT_VINFO_RELATED_STMT (first);
693 /* If not all stmts in the chain are patterns or if we failed
694 to update STMT_VINFO_REDUC_IDX then dissolve the chain and handle
695 it as a regular reduction instead. */
696 else
698 stmt_vec_info vinfo = first;
699 stmt_vec_info last = NULL;
700 while (vinfo)
702 next = REDUC_GROUP_NEXT_ELEMENT (vinfo);
703 REDUC_GROUP_FIRST_ELEMENT (vinfo) = NULL;
704 REDUC_GROUP_NEXT_ELEMENT (vinfo) = NULL;
705 last = vinfo;
706 vinfo = next;
708 STMT_VINFO_DEF_TYPE (vect_stmt_to_vectorize (first))
709 = vect_internal_def;
710 loop_vinfo->reductions.safe_push (vect_stmt_to_vectorize (last));
711 LOOP_VINFO_REDUCTION_CHAINS (loop_vinfo).unordered_remove (i);
712 --i;
717 /* Function vect_get_loop_niters.
719 Determine how many iterations the loop is executed and place it
720 in NUMBER_OF_ITERATIONS. Place the number of latch iterations
721 in NUMBER_OF_ITERATIONSM1. Place the condition under which the
722 niter information holds in ASSUMPTIONS.
724 Return the loop exit condition. */
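/* For example (illustrative): for a loop whose header executes N times, the
   latch executes N - 1 times, so NUMBER_OF_ITERATIONSM1 is N - 1 and
   NUMBER_OF_ITERATIONS is N (hence the "+ 1" adjustment near the end).  */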
727 static gcond *
728 vect_get_loop_niters (class loop *loop, tree *assumptions,
729 tree *number_of_iterations, tree *number_of_iterationsm1)
731 edge exit = single_exit (loop);
732 class tree_niter_desc niter_desc;
733 tree niter_assumptions, niter, may_be_zero;
734 gcond *cond = get_loop_exit_condition (loop);
736 *assumptions = boolean_true_node;
737 *number_of_iterationsm1 = chrec_dont_know;
738 *number_of_iterations = chrec_dont_know;
739 DUMP_VECT_SCOPE ("get_loop_niters");
741 if (!exit)
742 return cond;
744 may_be_zero = NULL_TREE;
745 if (!number_of_iterations_exit_assumptions (loop, exit, &niter_desc, NULL)
746 || chrec_contains_undetermined (niter_desc.niter))
747 return cond;
749 niter_assumptions = niter_desc.assumptions;
750 may_be_zero = niter_desc.may_be_zero;
751 niter = niter_desc.niter;
753 if (may_be_zero && integer_zerop (may_be_zero))
754 may_be_zero = NULL_TREE;
756 if (may_be_zero)
758 if (COMPARISON_CLASS_P (may_be_zero))
760 /* Try to combine may_be_zero with assumptions; this can simplify
761 the computation of the niter expression. */
762 if (niter_assumptions && !integer_nonzerop (niter_assumptions))
763 niter_assumptions = fold_build2 (TRUTH_AND_EXPR, boolean_type_node,
764 niter_assumptions,
765 fold_build1 (TRUTH_NOT_EXPR,
766 boolean_type_node,
767 may_be_zero));
768 else
769 niter = fold_build3 (COND_EXPR, TREE_TYPE (niter), may_be_zero,
770 build_int_cst (TREE_TYPE (niter), 0),
771 rewrite_to_non_trapping_overflow (niter));
773 may_be_zero = NULL_TREE;
775 else if (integer_nonzerop (may_be_zero))
777 *number_of_iterationsm1 = build_int_cst (TREE_TYPE (niter), 0);
778 *number_of_iterations = build_int_cst (TREE_TYPE (niter), 1);
779 return cond;
781 else
782 return cond;
785 *assumptions = niter_assumptions;
786 *number_of_iterationsm1 = niter;
788 /* We want the number of loop header executions which is the number
789 of latch executions plus one.
790 ??? For UINT_MAX latch executions this number overflows to zero
791 for loops like do { n++; } while (n != 0); */
792 if (niter && !chrec_contains_undetermined (niter))
793 niter = fold_build2 (PLUS_EXPR, TREE_TYPE (niter), unshare_expr (niter),
794 build_int_cst (TREE_TYPE (niter), 1));
795 *number_of_iterations = niter;
797 return cond;
800 /* Function bb_in_loop_p
802 Used as predicate for dfs order traversal of the loop bbs. */
804 static bool
805 bb_in_loop_p (const_basic_block bb, const void *data)
807 const class loop *const loop = (const class loop *)data;
808 if (flow_bb_inside_loop_p (loop, bb))
809 return true;
810 return false;
814 /* Create and initialize a new loop_vec_info struct for LOOP_IN, as well as
815 stmt_vec_info structs for all the stmts in LOOP_IN. */
817 _loop_vec_info::_loop_vec_info (class loop *loop_in, vec_info_shared *shared)
818 : vec_info (vec_info::loop, shared),
819 loop (loop_in),
820 bbs (XCNEWVEC (basic_block, loop->num_nodes)),
821 num_itersm1 (NULL_TREE),
822 num_iters (NULL_TREE),
823 num_iters_unchanged (NULL_TREE),
824 num_iters_assumptions (NULL_TREE),
825 vector_costs (nullptr),
826 scalar_costs (nullptr),
827 th (0),
828 versioning_threshold (0),
829 vectorization_factor (0),
830 main_loop_edge (nullptr),
831 skip_main_loop_edge (nullptr),
832 skip_this_loop_edge (nullptr),
833 reusable_accumulators (),
834 max_vectorization_factor (0),
835 mask_skip_niters (NULL_TREE),
836 rgroup_compare_type (NULL_TREE),
837 simd_if_cond (NULL_TREE),
838 unaligned_dr (NULL),
839 peeling_for_alignment (0),
840 ptr_mask (0),
841 ivexpr_map (NULL),
842 scan_map (NULL),
843 slp_unrolling_factor (1),
844 inner_loop_cost_factor (param_vect_inner_loop_cost_factor),
845 vectorizable (false),
846 can_use_partial_vectors_p (param_vect_partial_vector_usage != 0),
847 using_partial_vectors_p (false),
848 epil_using_partial_vectors_p (false),
849 peeling_for_gaps (false),
850 peeling_for_niter (false),
851 no_data_dependencies (false),
852 has_mask_store (false),
853 scalar_loop_scaling (profile_probability::uninitialized ()),
854 scalar_loop (NULL),
855 orig_loop_info (NULL)
857 /* CHECKME: We want to visit all BBs before their successors (except for
858 latch blocks, for which this assertion wouldn't hold). In the simple
859 case of the loop forms we allow, a dfs order of the BBs would be the same
860 as reversed postorder traversal, so we are safe. */
862 unsigned int nbbs = dfs_enumerate_from (loop->header, 0, bb_in_loop_p,
863 bbs, loop->num_nodes, loop);
864 gcc_assert (nbbs == loop->num_nodes);
866 for (unsigned int i = 0; i < nbbs; i++)
868 basic_block bb = bbs[i];
869 gimple_stmt_iterator si;
871 for (si = gsi_start_phis (bb); !gsi_end_p (si); gsi_next (&si))
873 gimple *phi = gsi_stmt (si);
874 gimple_set_uid (phi, 0);
875 add_stmt (phi);
878 for (si = gsi_start_bb (bb); !gsi_end_p (si); gsi_next (&si))
880 gimple *stmt = gsi_stmt (si);
881 gimple_set_uid (stmt, 0);
882 if (is_gimple_debug (stmt))
883 continue;
884 add_stmt (stmt);
885 /* If the .GOMP_SIMD_LANE call for the current loop has 3 arguments, the
886 third argument is the #pragma omp simd if (x) condition: when it is 0,
887 the loop shouldn't be vectorized; when it is a non-zero constant, it
888 should be vectorized normally; otherwise the loop is versioned, with the
889 vectorized copy taken if the condition is non-zero at runtime. */
890 if (loop_in->simduid
891 && is_gimple_call (stmt)
892 && gimple_call_internal_p (stmt)
893 && gimple_call_internal_fn (stmt) == IFN_GOMP_SIMD_LANE
894 && gimple_call_num_args (stmt) >= 3
895 && TREE_CODE (gimple_call_arg (stmt, 0)) == SSA_NAME
896 && (loop_in->simduid
897 == SSA_NAME_VAR (gimple_call_arg (stmt, 0))))
899 tree arg = gimple_call_arg (stmt, 2);
900 if (integer_zerop (arg) || TREE_CODE (arg) == SSA_NAME)
901 simd_if_cond = arg;
902 else
903 gcc_assert (integer_nonzerop (arg));
908 epilogue_vinfos.create (6);
911 /* Free all levels of rgroup CONTROLS. */
913 void
914 release_vec_loop_controls (vec<rgroup_controls> *controls)
916 rgroup_controls *rgc;
917 unsigned int i;
918 FOR_EACH_VEC_ELT (*controls, i, rgc)
919 rgc->controls.release ();
920 controls->release ();
923 /* Free all memory used by the _loop_vec_info, as well as all the
924 stmt_vec_info structs of all the stmts in the loop. */
926 _loop_vec_info::~_loop_vec_info ()
928 free (bbs);
930 release_vec_loop_controls (&masks);
931 release_vec_loop_controls (&lens);
932 delete ivexpr_map;
933 delete scan_map;
934 epilogue_vinfos.release ();
935 delete scalar_costs;
936 delete vector_costs;
938 /* When we release an epilogue vinfo that we do not intend to use,
939 avoid clearing AUX of the main loop, which should continue to
940 point to the main loop vinfo since otherwise we'll leak that. */
941 if (loop->aux == this)
942 loop->aux = NULL;
945 /* Return an invariant or register for EXPR and emit necessary
946 computations in the LOOP_VINFO loop preheader. */
948 tree
949 cse_and_gimplify_to_preheader (loop_vec_info loop_vinfo, tree expr)
951 if (is_gimple_reg (expr)
952 || is_gimple_min_invariant (expr))
953 return expr;
955 if (! loop_vinfo->ivexpr_map)
956 loop_vinfo->ivexpr_map = new hash_map<tree_operand_hash, tree>;
957 tree &cached = loop_vinfo->ivexpr_map->get_or_insert (expr);
958 if (! cached)
960 gimple_seq stmts = NULL;
961 cached = force_gimple_operand (unshare_expr (expr),
962 &stmts, true, NULL_TREE);
963 if (stmts)
965 edge e = loop_preheader_edge (LOOP_VINFO_LOOP (loop_vinfo));
966 gsi_insert_seq_on_edge_immediate (e, stmts);
969 return cached;
972 /* Return true if we can use CMP_TYPE as the comparison type to produce
973 all masks required to mask LOOP_VINFO. */
975 static bool
976 can_produce_all_loop_masks_p (loop_vec_info loop_vinfo, tree cmp_type)
978 rgroup_controls *rgm;
979 unsigned int i;
980 FOR_EACH_VEC_ELT (LOOP_VINFO_MASKS (loop_vinfo), i, rgm)
981 if (rgm->type != NULL_TREE
982 && !direct_internal_fn_supported_p (IFN_WHILE_ULT,
983 cmp_type, rgm->type,
984 OPTIMIZE_FOR_SPEED))
985 return false;
986 return true;
989 /* Calculate the maximum number of scalars per iteration for every
990 rgroup in LOOP_VINFO. */
992 static unsigned int
993 vect_get_max_nscalars_per_iter (loop_vec_info loop_vinfo)
995 unsigned int res = 1;
996 unsigned int i;
997 rgroup_controls *rgm;
998 FOR_EACH_VEC_ELT (LOOP_VINFO_MASKS (loop_vinfo), i, rgm)
999 res = MAX (res, rgm->max_nscalars_per_iter);
1000 return res;
1003 /* Calculate the minimum precision necessary to represent:
1005 MAX_NITERS * FACTOR
1007 as an unsigned integer, where MAX_NITERS is the maximum number of
1008 loop header iterations for the original scalar form of LOOP_VINFO. */
1010 static unsigned
1011 vect_min_prec_for_max_niters (loop_vec_info loop_vinfo, unsigned int factor)
1013 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1015 /* Get the maximum number of iterations that is representable
1016 in the counter type. */
1017 tree ni_type = TREE_TYPE (LOOP_VINFO_NITERSM1 (loop_vinfo));
1018 widest_int max_ni = wi::to_widest (TYPE_MAX_VALUE (ni_type)) + 1;
1020 /* Get a more refined estimate for the number of iterations. */
1021 widest_int max_back_edges;
1022 if (max_loop_iterations (loop, &max_back_edges))
1023 max_ni = wi::smin (max_ni, max_back_edges + 1);
1025 /* Work out how many bits we need to represent the limit. */
1026 return wi::min_precision (max_ni * factor, UNSIGNED);
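/* For example (illustrative): if the niters counter type is 32 bits wide and
   no better bound is known, max_ni is 2^32; with FACTOR == 2 the product is
   2^33, and wi::min_precision returns 34 bits.  */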
1029 /* True if the loop needs peeling or partial vectors when vectorized. */
1031 static bool
1032 vect_need_peeling_or_partial_vectors_p (loop_vec_info loop_vinfo)
1034 unsigned HOST_WIDE_INT const_vf;
1035 HOST_WIDE_INT max_niter
1036 = likely_max_stmt_executions_int (LOOP_VINFO_LOOP (loop_vinfo));
1038 unsigned th = LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo);
1039 if (!th && LOOP_VINFO_ORIG_LOOP_INFO (loop_vinfo))
1040 th = LOOP_VINFO_COST_MODEL_THRESHOLD (LOOP_VINFO_ORIG_LOOP_INFO
1041 (loop_vinfo));
1043 if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
1044 && LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) >= 0)
1046 /* Work out the (constant) number of iterations that need to be
1047 peeled for reasons other than niters. */
1048 unsigned int peel_niter = LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo);
1049 if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo))
1050 peel_niter += 1;
1051 if (!multiple_p (LOOP_VINFO_INT_NITERS (loop_vinfo) - peel_niter,
1052 LOOP_VINFO_VECT_FACTOR (loop_vinfo)))
1053 return true;
1055 else if (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo)
1056 /* ??? When peeling for gaps but not alignment, we could
1057 try to check whether the (variable) niters is known to be
1058 VF * N + 1. That's something of a niche case though. */
1059 || LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo)
1060 || !LOOP_VINFO_VECT_FACTOR (loop_vinfo).is_constant (&const_vf)
1061 || ((tree_ctz (LOOP_VINFO_NITERS (loop_vinfo))
1062 < (unsigned) exact_log2 (const_vf))
1063 /* In case of versioning, check if the maximum number of
1064 iterations is greater than th. If they are identical,
1065 the epilogue is unnecessary. */
1066 && (!LOOP_REQUIRES_VERSIONING (loop_vinfo)
1067 || ((unsigned HOST_WIDE_INT) max_niter
1068 > (th / const_vf) * const_vf))))
1069 return true;
1071 return false;
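/* For example (illustrative): with a known iteration count of 100, a VF of 8
   and one iteration peeled for gaps, 100 - 1 = 99 is not a multiple of 8, so
   the function above returns true.  */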
1074 /* Each statement in LOOP_VINFO can be masked where necessary. Check
1075 whether we can actually generate the masks required. Return true if so,
1076 storing the type of the scalar IV in LOOP_VINFO_RGROUP_COMPARE_TYPE. */
1078 static bool
1079 vect_verify_full_masking (loop_vec_info loop_vinfo)
1081 unsigned int min_ni_width;
1082 unsigned int max_nscalars_per_iter
1083 = vect_get_max_nscalars_per_iter (loop_vinfo);
1085 /* Use a normal loop if there are no statements that need masking.
1086 This only happens in rare degenerate cases: it means that the loop
1087 has no loads, no stores, and no live-out values. */
1088 if (LOOP_VINFO_MASKS (loop_vinfo).is_empty ())
1089 return false;
1091 /* Work out how many bits we need to represent the limit. */
1092 min_ni_width
1093 = vect_min_prec_for_max_niters (loop_vinfo, max_nscalars_per_iter);
1095 /* Find a scalar mode for which WHILE_ULT is supported. */
1096 opt_scalar_int_mode cmp_mode_iter;
1097 tree cmp_type = NULL_TREE;
1098 tree iv_type = NULL_TREE;
1099 widest_int iv_limit = vect_iv_limit_for_partial_vectors (loop_vinfo);
1100 unsigned int iv_precision = UINT_MAX;
1102 if (iv_limit != -1)
1103 iv_precision = wi::min_precision (iv_limit * max_nscalars_per_iter,
1104 UNSIGNED);
1106 FOR_EACH_MODE_IN_CLASS (cmp_mode_iter, MODE_INT)
1108 unsigned int cmp_bits = GET_MODE_BITSIZE (cmp_mode_iter.require ());
1109 if (cmp_bits >= min_ni_width
1110 && targetm.scalar_mode_supported_p (cmp_mode_iter.require ()))
1112 tree this_type = build_nonstandard_integer_type (cmp_bits, true);
1113 if (this_type
1114 && can_produce_all_loop_masks_p (loop_vinfo, this_type))
1116 /* Although we could stop as soon as we find a valid mode,
1117 there are at least two reasons why that's not always the
1118 best choice:
1120 - An IV that's Pmode or wider is more likely to be reusable
1121 in address calculations than an IV that's narrower than
1122 Pmode.
1124 - Doing the comparison in IV_PRECISION or wider allows
1125 a natural 0-based IV, whereas using a narrower comparison
1126 type requires mitigations against wrap-around.
1128 Conversely, if the IV limit is variable, doing the comparison
1129 in a wider type than the original type can introduce
1130 unnecessary extensions, so picking the widest valid mode
1131 is not always a good choice either.
1133 Here we prefer the first IV type that's Pmode or wider,
1134 and the first comparison type that's IV_PRECISION or wider.
1135 (The comparison type must be no wider than the IV type,
1136 to avoid extensions in the vector loop.)
1138 ??? We might want to try continuing beyond Pmode for ILP32
1139 targets if CMP_BITS < IV_PRECISION. */
1140 iv_type = this_type;
1141 if (!cmp_type || iv_precision > TYPE_PRECISION (cmp_type))
1142 cmp_type = this_type;
1143 if (cmp_bits >= GET_MODE_BITSIZE (Pmode))
1144 break;
1149 if (!cmp_type)
1150 return false;
1152 LOOP_VINFO_RGROUP_COMPARE_TYPE (loop_vinfo) = cmp_type;
1153 LOOP_VINFO_RGROUP_IV_TYPE (loop_vinfo) = iv_type;
1154 return true;
1157 /* Check whether we can use vector access with length based on a precision
1158 comparison. So far, to keep it simple, we only allow the case where the
1159 precision of the target-supported length is larger than the precision
1160 required by the loop niters. */
1162 static bool
1163 vect_verify_loop_lens (loop_vec_info loop_vinfo)
1165 if (LOOP_VINFO_LENS (loop_vinfo).is_empty ())
1166 return false;
1168 unsigned int max_nitems_per_iter = 1;
1169 unsigned int i;
1170 rgroup_controls *rgl;
1171 /* Find the maximum number of items per iteration for every rgroup. */
1172 FOR_EACH_VEC_ELT (LOOP_VINFO_LENS (loop_vinfo), i, rgl)
1174 unsigned nitems_per_iter = rgl->max_nscalars_per_iter * rgl->factor;
1175 max_nitems_per_iter = MAX (max_nitems_per_iter, nitems_per_iter);
1178 /* Work out how many bits we need to represent the length limit. */
1179 unsigned int min_ni_prec
1180 = vect_min_prec_for_max_niters (loop_vinfo, max_nitems_per_iter);
1182 /* Now use the maximum of the precisions below for one suitable IV type:
1183 - the IV's natural precision
1184 - the precision needed to hold: the maximum number of scalar
1185 iterations multiplied by the scale factor (min_ni_prec above)
1186 - the Pmode precision
1188 If min_ni_prec is less than the precision of the current niters,
1189 we prefer to still use the niters type. Prefer to use Pmode and
1190 wider IV to avoid narrow conversions. */
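/* For example (illustrative): with 32-bit niters (so ni_prec == 32),
   min_ni_prec == 20 from above and a 64-bit Pmode, the two MAX computations
   below yield 64, and on a 64-bit target the mode walk then picks a 64-bit
   unsigned IV type.  */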
1192 unsigned int ni_prec
1193 = TYPE_PRECISION (TREE_TYPE (LOOP_VINFO_NITERS (loop_vinfo)));
1194 min_ni_prec = MAX (min_ni_prec, ni_prec);
1195 min_ni_prec = MAX (min_ni_prec, GET_MODE_BITSIZE (Pmode));
1197 tree iv_type = NULL_TREE;
1198 opt_scalar_int_mode tmode_iter;
1199 FOR_EACH_MODE_IN_CLASS (tmode_iter, MODE_INT)
1201 scalar_mode tmode = tmode_iter.require ();
1202 unsigned int tbits = GET_MODE_BITSIZE (tmode);
1204 /* ??? Do we really want to construct one IV whose precision exceeds
1205 BITS_PER_WORD? */
1206 if (tbits > BITS_PER_WORD)
1207 break;
1209 /* Find the first available standard integral type. */
1210 if (tbits >= min_ni_prec && targetm.scalar_mode_supported_p (tmode))
1212 iv_type = build_nonstandard_integer_type (tbits, true);
1213 break;
1217 if (!iv_type)
1219 if (dump_enabled_p ())
1220 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1221 "can't vectorize with length-based partial vectors"
1222 " because there is no suitable iv type.\n");
1223 return false;
1226 LOOP_VINFO_RGROUP_COMPARE_TYPE (loop_vinfo) = iv_type;
1227 LOOP_VINFO_RGROUP_IV_TYPE (loop_vinfo) = iv_type;
1229 return true;
1232 /* Calculate the cost of one scalar iteration of the loop. */
1233 static void
1234 vect_compute_single_scalar_iteration_cost (loop_vec_info loop_vinfo)
1236 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1237 basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
1238 int nbbs = loop->num_nodes, factor;
1239 int innerloop_iters, i;
1241 DUMP_VECT_SCOPE ("vect_compute_single_scalar_iteration_cost");
1243 /* Gather costs for statements in the scalar loop. */
1245 /* FORNOW. */
1246 innerloop_iters = 1;
1247 if (loop->inner)
1248 innerloop_iters = LOOP_VINFO_INNER_LOOP_COST_FACTOR (loop_vinfo);
1250 for (i = 0; i < nbbs; i++)
1252 gimple_stmt_iterator si;
1253 basic_block bb = bbs[i];
1255 if (bb->loop_father == loop->inner)
1256 factor = innerloop_iters;
1257 else
1258 factor = 1;
1260 for (si = gsi_start_bb (bb); !gsi_end_p (si); gsi_next (&si))
1262 gimple *stmt = gsi_stmt (si);
1263 stmt_vec_info stmt_info = loop_vinfo->lookup_stmt (stmt);
1265 if (!is_gimple_assign (stmt) && !is_gimple_call (stmt))
1266 continue;
1268 /* Skip stmts that are not vectorized inside the loop. */
1269 stmt_vec_info vstmt_info = vect_stmt_to_vectorize (stmt_info);
1270 if (!STMT_VINFO_RELEVANT_P (vstmt_info)
1271 && (!STMT_VINFO_LIVE_P (vstmt_info)
1272 || !VECTORIZABLE_CYCLE_DEF
1273 (STMT_VINFO_DEF_TYPE (vstmt_info))))
1274 continue;
1276 vect_cost_for_stmt kind;
1277 if (STMT_VINFO_DATA_REF (stmt_info))
1279 if (DR_IS_READ (STMT_VINFO_DATA_REF (stmt_info)))
1280 kind = scalar_load;
1281 else
1282 kind = scalar_store;
1284 else if (vect_nop_conversion_p (stmt_info))
1285 continue;
1286 else
1287 kind = scalar_stmt;
1289 /* We are using vect_prologue here to avoid scaling twice
1290 by the inner loop factor. */
1291 record_stmt_cost (&LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo),
1292 factor, kind, stmt_info, 0, vect_prologue);
1296 /* Now accumulate cost. */
1297 loop_vinfo->scalar_costs = init_cost (loop_vinfo, true);
1298 stmt_info_for_cost *si;
1299 int j;
1300 FOR_EACH_VEC_ELT (LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo),
1301 j, si)
1302 (void) add_stmt_cost (loop_vinfo->scalar_costs, si->count,
1303 si->kind, si->stmt_info, si->vectype,
1304 si->misalign, si->where);
1305 loop_vinfo->scalar_costs->finish_cost (nullptr);
1309 /* Function vect_analyze_loop_form.
1311 Verify that certain CFG restrictions hold, including:
1312 - the loop has a pre-header
1313 - the loop has a single entry and exit
1314 - the loop exit condition is simple enough
1315 - the number of iterations can be analyzed, i.e., a countable loop. The
1316 niter could be analyzed under some assumptions. */
1318 opt_result
1319 vect_analyze_loop_form (class loop *loop, vect_loop_form_info *info)
1321 DUMP_VECT_SCOPE ("vect_analyze_loop_form");
1323 /* Different restrictions apply when we are considering an inner-most loop,
1324 vs. an outer (nested) loop.
1325 (FORNOW. May want to relax some of these restrictions in the future). */
1327 info->inner_loop_cond = NULL;
1328 if (!loop->inner)
1330 /* Inner-most loop. We currently require that the number of BBs is
1331 exactly 2 (the header and latch). Vectorizable inner-most loops
1332 look like this:
1334 (pre-header)
1336 header <--------+
1337 | | |
1338 | +--> latch --+
1340 (exit-bb) */
1342 if (loop->num_nodes != 2)
1343 return opt_result::failure_at (vect_location,
1344 "not vectorized:"
1345 " control flow in loop.\n");
1347 if (empty_block_p (loop->header))
1348 return opt_result::failure_at (vect_location,
1349 "not vectorized: empty loop.\n");
1351 else
1353 class loop *innerloop = loop->inner;
1354 edge entryedge;
1356 /* Nested loop. We currently require that the loop is doubly-nested,
1357 contains a single inner loop, and the number of BBs is exactly 5.
1358 Vectorizable outer-loops look like this:
1360 (pre-header)
1362 header <---+
1364 inner-loop |
1366 tail ------+
1368 (exit-bb)
1370 The inner-loop has the properties expected of inner-most loops
1371 as described above. */
1373 if ((loop->inner)->inner || (loop->inner)->next)
1374 return opt_result::failure_at (vect_location,
1375 "not vectorized:"
1376 " multiple nested loops.\n");
1378 if (loop->num_nodes != 5)
1379 return opt_result::failure_at (vect_location,
1380 "not vectorized:"
1381 " control flow in loop.\n");
1383 entryedge = loop_preheader_edge (innerloop);
1384 if (entryedge->src != loop->header
1385 || !single_exit (innerloop)
1386 || single_exit (innerloop)->dest != EDGE_PRED (loop->latch, 0)->src)
1387 return opt_result::failure_at (vect_location,
1388 "not vectorized:"
1389 " unsupported outerloop form.\n");
1391 /* Analyze the inner-loop. */
1392 vect_loop_form_info inner;
1393 opt_result res = vect_analyze_loop_form (loop->inner, &inner);
1394 if (!res)
1396 if (dump_enabled_p ())
1397 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1398 "not vectorized: Bad inner loop.\n");
1399 return res;
1402 /* Don't support analyzing niter under assumptions for inner
1403 loop. */
1404 if (!integer_onep (inner.assumptions))
1405 return opt_result::failure_at (vect_location,
1406 "not vectorized: Bad inner loop.\n");
1408 if (!expr_invariant_in_loop_p (loop, inner.number_of_iterations))
1409 return opt_result::failure_at (vect_location,
1410 "not vectorized: inner-loop count not"
1411 " invariant.\n");
1413 if (dump_enabled_p ())
1414 dump_printf_loc (MSG_NOTE, vect_location,
1415 "Considering outer-loop vectorization.\n");
1416 info->inner_loop_cond = inner.loop_cond;
1419 if (!single_exit (loop))
1420 return opt_result::failure_at (vect_location,
1421 "not vectorized: multiple exits.\n");
1422 if (EDGE_COUNT (loop->header->preds) != 2)
1423 return opt_result::failure_at (vect_location,
1424 "not vectorized:"
1425 " too many incoming edges.\n");
1427 /* We assume that the loop exit condition is at the end of the loop, i.e.,
1428 that the loop is represented as a do-while (with a proper if-guard
1429 before the loop if needed), where the loop header contains all the
1430 executable statements, and the latch is empty. */
1431 if (!empty_block_p (loop->latch)
1432 || !gimple_seq_empty_p (phi_nodes (loop->latch)))
1433 return opt_result::failure_at (vect_location,
1434 "not vectorized: latch block not empty.\n");
1436 /* Make sure the exit is not abnormal. */
1437 edge e = single_exit (loop);
1438 if (e->flags & EDGE_ABNORMAL)
1439 return opt_result::failure_at (vect_location,
1440 "not vectorized:"
1441 " abnormal loop exit edge.\n");
1443 info->loop_cond
1444 = vect_get_loop_niters (loop, &info->assumptions,
1445 &info->number_of_iterations,
1446 &info->number_of_iterationsm1);
1447 if (!info->loop_cond)
1448 return opt_result::failure_at
1449 (vect_location,
1450 "not vectorized: complicated exit condition.\n");
1452 if (integer_zerop (info->assumptions)
1453 || !info->number_of_iterations
1454 || chrec_contains_undetermined (info->number_of_iterations))
1455 return opt_result::failure_at
1456 (info->loop_cond,
1457 "not vectorized: number of iterations cannot be computed.\n");
1459 if (integer_zerop (info->number_of_iterations))
1460 return opt_result::failure_at
1461 (info->loop_cond,
1462 "not vectorized: number of iterations = 0.\n");
1464 if (!(tree_fits_shwi_p (info->number_of_iterations)
1465 && tree_to_shwi (info->number_of_iterations) > 0))
1467 if (dump_enabled_p ())
1469 dump_printf_loc (MSG_NOTE, vect_location,
1470 "Symbolic number of iterations is ");
1471 dump_generic_expr (MSG_NOTE, TDF_DETAILS, info->number_of_iterations);
1472 dump_printf (MSG_NOTE, "\n");
1476 return opt_result::success ();
1479 /* Create a loop_vec_info for LOOP with SHARED and the
1480 vect_analyze_loop_form result. */
1482 loop_vec_info
1483 vect_create_loop_vinfo (class loop *loop, vec_info_shared *shared,
1484 const vect_loop_form_info *info,
1485 loop_vec_info main_loop_info)
1487 loop_vec_info loop_vinfo = new _loop_vec_info (loop, shared);
1488 LOOP_VINFO_NITERSM1 (loop_vinfo) = info->number_of_iterationsm1;
1489 LOOP_VINFO_NITERS (loop_vinfo) = info->number_of_iterations;
1490 LOOP_VINFO_NITERS_UNCHANGED (loop_vinfo) = info->number_of_iterations;
1491 LOOP_VINFO_ORIG_LOOP_INFO (loop_vinfo) = main_loop_info;
1492 /* Also record the assumptions for versioning. */
1493 if (!integer_onep (info->assumptions) && !main_loop_info)
1494 LOOP_VINFO_NITERS_ASSUMPTIONS (loop_vinfo) = info->assumptions;
1496 stmt_vec_info loop_cond_info = loop_vinfo->lookup_stmt (info->loop_cond);
1497 STMT_VINFO_TYPE (loop_cond_info) = loop_exit_ctrl_vec_info_type;
1498 if (info->inner_loop_cond)
1500 stmt_vec_info inner_loop_cond_info
1501 = loop_vinfo->lookup_stmt (info->inner_loop_cond);
1502 STMT_VINFO_TYPE (inner_loop_cond_info) = loop_exit_ctrl_vec_info_type;
1503 /* If we have an estimate on the number of iterations of the inner
1504 loop, use that to limit the scale for costing, otherwise use
1505 --param vect-inner-loop-cost-factor literally. */
1506 widest_int nit;
1507 if (estimated_stmt_executions (loop->inner, &nit))
1508 LOOP_VINFO_INNER_LOOP_COST_FACTOR (loop_vinfo)
1509 = wi::smin (nit, param_vect_inner_loop_cost_factor).to_uhwi ();
1512 return loop_vinfo;
1517 /* Scan the loop stmts and dependent on whether there are any (non-)SLP
1518 statements update the vectorization factor. */
1520 static void
1521 vect_update_vf_for_slp (loop_vec_info loop_vinfo)
1523 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1524 basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
1525 int nbbs = loop->num_nodes;
1526 poly_uint64 vectorization_factor;
1527 int i;
1529 DUMP_VECT_SCOPE ("vect_update_vf_for_slp");
1531 vectorization_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
1532 gcc_assert (known_ne (vectorization_factor, 0U));
1534 /* If all the stmts in the loop can be SLPed, we perform only SLP, and
1535 vectorization factor of the loop is the unrolling factor required by
1536 the SLP instances. If that unrolling factor is 1, we say that we
1537 perform pure SLP on the loop - cross-iteration parallelism is not
1538 exploited. */
1539 bool only_slp_in_loop = true;
1540 for (i = 0; i < nbbs; i++)
1542 basic_block bb = bbs[i];
1543 for (gphi_iterator si = gsi_start_phis (bb); !gsi_end_p (si);
1544 gsi_next (&si))
1546 stmt_vec_info stmt_info = loop_vinfo->lookup_stmt (si.phi ());
1547 if (!stmt_info)
1548 continue;
1549 if ((STMT_VINFO_RELEVANT_P (stmt_info)
1550 || VECTORIZABLE_CYCLE_DEF (STMT_VINFO_DEF_TYPE (stmt_info)))
1551 && !PURE_SLP_STMT (stmt_info))
1552 /* STMT needs both SLP and loop-based vectorization. */
1553 only_slp_in_loop = false;
1555 for (gimple_stmt_iterator si = gsi_start_bb (bb); !gsi_end_p (si);
1556 gsi_next (&si))
1558 if (is_gimple_debug (gsi_stmt (si)))
1559 continue;
1560 stmt_vec_info stmt_info = loop_vinfo->lookup_stmt (gsi_stmt (si));
1561 stmt_info = vect_stmt_to_vectorize (stmt_info);
1562 if ((STMT_VINFO_RELEVANT_P (stmt_info)
1563 || VECTORIZABLE_CYCLE_DEF (STMT_VINFO_DEF_TYPE (stmt_info)))
1564 && !PURE_SLP_STMT (stmt_info))
1565 /* STMT needs both SLP and loop-based vectorization. */
1566 only_slp_in_loop = false;
1570 if (only_slp_in_loop)
1572 if (dump_enabled_p ())
1573 dump_printf_loc (MSG_NOTE, vect_location,
1574 "Loop contains only SLP stmts\n");
1575 vectorization_factor = LOOP_VINFO_SLP_UNROLLING_FACTOR (loop_vinfo);
1577 else
1579 if (dump_enabled_p ())
1580 dump_printf_loc (MSG_NOTE, vect_location,
1581 "Loop contains SLP and non-SLP stmts\n");
1582 /* Both the vectorization factor and unroll factor have the form
1583 GET_MODE_SIZE (loop_vinfo->vector_mode) * X for some rational X,
1584 so they must have a common multiple. */
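      /* For example (illustrative): a loop-based VF of 4 combined with an SLP
	 unrolling factor of 6 yields a final VF of 12, their least common
	 multiple (as computed by force_common_multiple below).  */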
1585 vectorization_factor
1586 = force_common_multiple (vectorization_factor,
1587 LOOP_VINFO_SLP_UNROLLING_FACTOR (loop_vinfo));
1590 LOOP_VINFO_VECT_FACTOR (loop_vinfo) = vectorization_factor;
1591 if (dump_enabled_p ())
1593 dump_printf_loc (MSG_NOTE, vect_location,
1594 "Updating vectorization factor to ");
1595 dump_dec (MSG_NOTE, vectorization_factor);
1596 dump_printf (MSG_NOTE, ".\n");
1600 /* Return true if STMT_INFO describes a double reduction phi and if
1601 the other phi in the reduction is also relevant for vectorization.
1602 This rejects cases such as:
1604 outer1:
1605 x_1 = PHI <x_3(outer2), ...>;
1608 inner:
1609 x_2 = ...;
1612 outer2:
1613 x_3 = PHI <x_2(inner)>;
1615 if nothing in x_2 or elsewhere makes x_1 relevant. */
1617 static bool
1618 vect_active_double_reduction_p (stmt_vec_info stmt_info)
1620 if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_double_reduction_def)
1621 return false;
1623 return STMT_VINFO_RELEVANT_P (STMT_VINFO_REDUC_DEF (stmt_info));
1626 /* Function vect_analyze_loop_operations.
1628 Scan the loop stmts and make sure they are all vectorizable. */
1630 static opt_result
1631 vect_analyze_loop_operations (loop_vec_info loop_vinfo)
1633 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1634 basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
1635 int nbbs = loop->num_nodes;
1636 int i;
1637 stmt_vec_info stmt_info;
1638 bool need_to_vectorize = false;
1639 bool ok;
1641 DUMP_VECT_SCOPE ("vect_analyze_loop_operations");
1643 auto_vec<stmt_info_for_cost> cost_vec;
1645 for (i = 0; i < nbbs; i++)
1647 basic_block bb = bbs[i];
1649 for (gphi_iterator si = gsi_start_phis (bb); !gsi_end_p (si);
1650 gsi_next (&si))
1652 gphi *phi = si.phi ();
1653 ok = true;
1655 stmt_info = loop_vinfo->lookup_stmt (phi);
1656 if (dump_enabled_p ())
1657 dump_printf_loc (MSG_NOTE, vect_location, "examining phi: %G", phi);
1658 if (virtual_operand_p (gimple_phi_result (phi)))
1659 continue;
1661 /* Inner-loop loop-closed exit phi in outer-loop vectorization
1662 (i.e., a phi in the tail of the outer-loop). */
1663 if (! is_loop_header_bb_p (bb))
1665 /* FORNOW: we currently don't support the case that these phis
1666 are not used in the outerloop (unless it is double reduction,
1667 i.e., this phi is vect_reduction_def), because this case
1668 requires us to actually do something here. */
1669 if (STMT_VINFO_LIVE_P (stmt_info)
1670 && !vect_active_double_reduction_p (stmt_info))
1671 return opt_result::failure_at (phi,
1672 "Unsupported loop-closed phi"
1673 " in outer-loop.\n");
1675 /* If PHI is used in the outer loop, we check that its operand
1676 is defined in the inner loop. */
1677 if (STMT_VINFO_RELEVANT_P (stmt_info))
1679 tree phi_op;
1681 if (gimple_phi_num_args (phi) != 1)
1682 return opt_result::failure_at (phi, "unsupported phi");
1684 phi_op = PHI_ARG_DEF (phi, 0);
1685 stmt_vec_info op_def_info = loop_vinfo->lookup_def (phi_op);
1686 if (!op_def_info)
1687 return opt_result::failure_at (phi, "unsupported phi\n");
1689 if (STMT_VINFO_RELEVANT (op_def_info) != vect_used_in_outer
1690 && (STMT_VINFO_RELEVANT (op_def_info)
1691 != vect_used_in_outer_by_reduction))
1692 return opt_result::failure_at (phi, "unsupported phi\n");
1694 if ((STMT_VINFO_DEF_TYPE (stmt_info) == vect_internal_def
1695 || (STMT_VINFO_DEF_TYPE (stmt_info)
1696 == vect_double_reduction_def))
1697 && !vectorizable_lc_phi (loop_vinfo,
1698 stmt_info, NULL, NULL))
1699 return opt_result::failure_at (phi, "unsupported phi\n");
1702 continue;
1705 gcc_assert (stmt_info);
1707 if ((STMT_VINFO_RELEVANT (stmt_info) == vect_used_in_scope
1708 || STMT_VINFO_LIVE_P (stmt_info))
1709 && STMT_VINFO_DEF_TYPE (stmt_info) != vect_induction_def)
1710 /* A scalar-dependence cycle that we don't support. */
1711 return opt_result::failure_at (phi,
1712 "not vectorized:"
1713 " scalar dependence cycle.\n");
1715 if (STMT_VINFO_RELEVANT_P (stmt_info))
1717 need_to_vectorize = true;
1718 if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_induction_def
1719 && ! PURE_SLP_STMT (stmt_info))
1720 ok = vectorizable_induction (loop_vinfo,
1721 stmt_info, NULL, NULL,
1722 &cost_vec);
1723 else if ((STMT_VINFO_DEF_TYPE (stmt_info) == vect_reduction_def
1724 || (STMT_VINFO_DEF_TYPE (stmt_info)
1725 == vect_double_reduction_def)
1726 || STMT_VINFO_DEF_TYPE (stmt_info) == vect_nested_cycle)
1727 && ! PURE_SLP_STMT (stmt_info))
1728 ok = vectorizable_reduction (loop_vinfo,
1729 stmt_info, NULL, NULL, &cost_vec);
1732 /* SLP PHIs are tested by vect_slp_analyze_node_operations. */
1733 if (ok
1734 && STMT_VINFO_LIVE_P (stmt_info)
1735 && !PURE_SLP_STMT (stmt_info))
1736 ok = vectorizable_live_operation (loop_vinfo,
1737 stmt_info, NULL, NULL, NULL,
1738 -1, false, &cost_vec);
1740 if (!ok)
1741 return opt_result::failure_at (phi,
1742 "not vectorized: relevant phi not "
1743 "supported: %G",
1744 static_cast <gimple *> (phi));
1747 for (gimple_stmt_iterator si = gsi_start_bb (bb); !gsi_end_p (si);
1748 gsi_next (&si))
1750 gimple *stmt = gsi_stmt (si);
1751 if (!gimple_clobber_p (stmt)
1752 && !is_gimple_debug (stmt))
1754 opt_result res
1755 = vect_analyze_stmt (loop_vinfo,
1756 loop_vinfo->lookup_stmt (stmt),
1757 &need_to_vectorize,
1758 NULL, NULL, &cost_vec);
1759 if (!res)
1760 return res;
1763 } /* bbs */
1765 add_stmt_costs (loop_vinfo->vector_costs, &cost_vec);
1767 /* All operations in the loop are either irrelevant (they deal with loop
1768 control, or are dead), or only used outside the loop and can be moved
1769 out of the loop (e.g. invariants, inductions). The loop can be
1770 optimized away by scalar optimizations. We're better off not
1771 touching this loop. */
1772 if (!need_to_vectorize)
1774 if (dump_enabled_p ())
1775 dump_printf_loc (MSG_NOTE, vect_location,
1776 "All the computation can be taken out of the loop.\n");
1777 return opt_result::failure_at
1778 (vect_location,
1779 "not vectorized: redundant loop. no profit to vectorize.\n");
1782 return opt_result::success ();
1785 /* Return true if we know that the iteration count is smaller than the
1786 vectorization factor. Return false if it isn't, or if we can't be sure
1787 either way. */
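/* For instance (numbers purely illustrative): with an assumed
   vectorization factor of 8, a loop known to execute at most 5
   iterations can never fill a full vector, so this returns true.  */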
1789 static bool
1790 vect_known_niters_smaller_than_vf (loop_vec_info loop_vinfo)
1792 unsigned int assumed_vf = vect_vf_for_cost (loop_vinfo);
1794 HOST_WIDE_INT max_niter;
1795 if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
1796 max_niter = LOOP_VINFO_INT_NITERS (loop_vinfo);
1797 else
1798 max_niter = max_stmt_executions_int (LOOP_VINFO_LOOP (loop_vinfo));
1800 if (max_niter != -1 && (unsigned HOST_WIDE_INT) max_niter < assumed_vf)
1801 return true;
1803 return false;
1806 /* Analyze the cost of the loop described by LOOP_VINFO. Decide if it
1807 is worthwhile to vectorize. Return 1 if definitely yes, 0 if
1808 definitely no, or -1 if it's worth retrying. */
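/* A rough, purely illustrative sketch of the threshold logic used
   below: with --param min-vect-loop-bound=2, an assumed VF of 4 and a
   computed min_profitable_iters of 12, the threshold is
   MAX (2 * 4, 12) = 12, so a loop known to execute only 10 iterations
   would be rejected as not profitable.  */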
1810 static int
1811 vect_analyze_loop_costing (loop_vec_info loop_vinfo)
1813 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1814 unsigned int assumed_vf = vect_vf_for_cost (loop_vinfo);
1816 /* Only loops that can handle partially-populated vectors can have iteration
1817 counts less than the vectorization factor. */
1818 if (!LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))
1820 if (vect_known_niters_smaller_than_vf (loop_vinfo))
1822 if (dump_enabled_p ())
1823 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1824 "not vectorized: iteration count smaller than "
1825 "vectorization factor.\n");
1826 return 0;
1830 /* If using the "very cheap" model, reject cases in which we'd keep
1831 a copy of the scalar code (even if we might be able to vectorize it). */
1832 if (loop_cost_model (loop) == VECT_COST_MODEL_VERY_CHEAP
1833 && (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo)
1834 || LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo)
1835 || LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo)))
1837 if (dump_enabled_p ())
1838 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1839 "some scalar iterations would need to be peeled\n");
1840 return 0;
1843 int min_profitable_iters, min_profitable_estimate;
1844 vect_estimate_min_profitable_iters (loop_vinfo, &min_profitable_iters,
1845 &min_profitable_estimate);
1847 if (min_profitable_iters < 0)
1849 if (dump_enabled_p ())
1850 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1851 "not vectorized: vectorization not profitable.\n");
1852 if (dump_enabled_p ())
1853 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1854 "not vectorized: vector version will never be "
1855 "profitable.\n");
1856 return -1;
1859 int min_scalar_loop_bound = (param_min_vect_loop_bound
1860 * assumed_vf);
1862 /* Use the cost model only if it is more conservative than the user-specified
1863 threshold. */
1864 unsigned int th = (unsigned) MAX (min_scalar_loop_bound,
1865 min_profitable_iters);
1867 LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo) = th;
1869 if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
1870 && LOOP_VINFO_INT_NITERS (loop_vinfo) < th)
1872 if (dump_enabled_p ())
1873 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1874 "not vectorized: vectorization not profitable.\n");
1875 if (dump_enabled_p ())
1876 dump_printf_loc (MSG_NOTE, vect_location,
1877 "not vectorized: iteration count smaller than user "
1878 "specified loop bound parameter or minimum profitable "
1879 "iterations (whichever is more conservative).\n");
1880 return 0;
1883 /* The static profitability threshold min_profitable_estimate includes
1884 the cost of having to check at runtime whether the scalar loop
1885 should be used instead. If it turns out that we don't need or want
1886 such a check, the threshold we should use for the static estimate
1887 is simply the point at which the vector loop becomes more profitable
1888 than the scalar loop. */
1889 if (min_profitable_estimate > min_profitable_iters
1890 && !LOOP_REQUIRES_VERSIONING (loop_vinfo)
1891 && !LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo)
1892 && !LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo)
1893 && !vect_apply_runtime_profitability_check_p (loop_vinfo))
1895 if (dump_enabled_p ())
1896 dump_printf_loc (MSG_NOTE, vect_location, "no need for a runtime"
1897 " choice between the scalar and vector loops\n");
1898 min_profitable_estimate = min_profitable_iters;
1901 /* If the vector loop needs multiple iterations to be beneficial then
1902 things are probably too close to call, and the conservative thing
1903 would be to stick with the scalar code. */
1904 if (loop_cost_model (loop) == VECT_COST_MODEL_VERY_CHEAP
1905 && min_profitable_estimate > (int) vect_vf_for_cost (loop_vinfo))
1907 if (dump_enabled_p ())
1908 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1909 "one iteration of the vector loop would be"
1910 " more expensive than the equivalent number of"
1911 " iterations of the scalar loop\n");
1912 return 0;
1915 HOST_WIDE_INT estimated_niter;
1917 /* If we are vectorizing an epilogue then we know the maximum number of
1918 scalar iterations it will cover is at least one lower than the
1919 vectorization factor of the main loop. */
1920 if (LOOP_VINFO_EPILOGUE_P (loop_vinfo))
1921 estimated_niter
1922 = vect_vf_for_cost (LOOP_VINFO_ORIG_LOOP_INFO (loop_vinfo)) - 1;
1923 else
1925 estimated_niter = estimated_stmt_executions_int (loop);
1926 if (estimated_niter == -1)
1927 estimated_niter = likely_max_stmt_executions_int (loop);
1929 if (estimated_niter != -1
1930 && ((unsigned HOST_WIDE_INT) estimated_niter
1931 < MAX (th, (unsigned) min_profitable_estimate)))
1933 if (dump_enabled_p ())
1934 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1935 "not vectorized: estimated iteration count too "
1936 "small.\n");
1937 if (dump_enabled_p ())
1938 dump_printf_loc (MSG_NOTE, vect_location,
1939 "not vectorized: estimated iteration count smaller "
1940 "than specified loop bound parameter or minimum "
1941 "profitable iterations (whichever is more "
1942 "conservative).\n");
1943 return -1;
1946 return 1;
1949 static opt_result
1950 vect_get_datarefs_in_loop (loop_p loop, basic_block *bbs,
1951 vec<data_reference_p> *datarefs,
1952 unsigned int *n_stmts)
1954 *n_stmts = 0;
1955 for (unsigned i = 0; i < loop->num_nodes; i++)
1956 for (gimple_stmt_iterator gsi = gsi_start_bb (bbs[i]);
1957 !gsi_end_p (gsi); gsi_next (&gsi))
1959 gimple *stmt = gsi_stmt (gsi);
1960 if (is_gimple_debug (stmt))
1961 continue;
1962 ++(*n_stmts);
1963 opt_result res = vect_find_stmt_data_reference (loop, stmt, datarefs,
1964 NULL, 0);
1965 if (!res)
1967 if (is_gimple_call (stmt) && loop->safelen)
1969 tree fndecl = gimple_call_fndecl (stmt), op;
1970 if (fndecl != NULL_TREE)
1972 cgraph_node *node = cgraph_node::get (fndecl);
1973 if (node != NULL && node->simd_clones != NULL)
1975 unsigned int j, n = gimple_call_num_args (stmt);
1976 for (j = 0; j < n; j++)
1978 op = gimple_call_arg (stmt, j);
1979 if (DECL_P (op)
1980 || (REFERENCE_CLASS_P (op)
1981 && get_base_address (op)))
1982 break;
1984 op = gimple_call_lhs (stmt);
1985 /* Ignore #pragma omp declare simd functions
1986 if they don't have data references in the
1987 call stmt itself. */
1988 if (j == n
1989 && !(op
1990 && (DECL_P (op)
1991 || (REFERENCE_CLASS_P (op)
1992 && get_base_address (op)))))
1993 continue;
1997 return res;
1999 /* If dependence analysis will give up due to the limit on the
2000 number of datarefs stop here and fail fatally. */
2001 if (datarefs->length ()
2002 > (unsigned)param_loop_max_datarefs_for_datadeps)
2003 return opt_result::failure_at (stmt, "exceeded param "
2004 "loop-max-datarefs-for-datadeps\n");
2006 return opt_result::success ();
2009 /* Look for SLP-only access groups and turn each individual access into its own
2010 group. */
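/* As an illustration (example loop not from any particular benchmark),
   an interleaved pair of accesses such as

     for (int i = 0; i < n; i++)
       {
         sum0 += a[2*i];
         sum1 += a[2*i+1];
       }

   may have been detected as one access group of size 2 that is only
   usable with SLP.  If SLP ends up not being used, the walk below turns
   each access into its own single-element group, making each statement
   its own group leader with an adjusted gap and alignment.  */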
2011 static void
2012 vect_dissolve_slp_only_groups (loop_vec_info loop_vinfo)
2014 unsigned int i;
2015 struct data_reference *dr;
2017 DUMP_VECT_SCOPE ("vect_dissolve_slp_only_groups");
2019 vec<data_reference_p> datarefs = LOOP_VINFO_DATAREFS (loop_vinfo);
2020 FOR_EACH_VEC_ELT (datarefs, i, dr)
2022 gcc_assert (DR_REF (dr));
2023 stmt_vec_info stmt_info = loop_vinfo->lookup_stmt (DR_STMT (dr));
2025 /* Check if the load is a part of an interleaving chain. */
2026 if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
2028 stmt_vec_info first_element = DR_GROUP_FIRST_ELEMENT (stmt_info);
2029 dr_vec_info *dr_info = STMT_VINFO_DR_INFO (first_element);
2030 unsigned int group_size = DR_GROUP_SIZE (first_element);
2032 /* Check whether this is an SLP-only group. */
2033 if (!STMT_SLP_TYPE (stmt_info)
2034 && STMT_VINFO_SLP_VECT_ONLY (first_element))
2036 /* Dissolve the group. */
2037 STMT_VINFO_SLP_VECT_ONLY (first_element) = false;
2039 stmt_vec_info vinfo = first_element;
2040 while (vinfo)
2042 stmt_vec_info next = DR_GROUP_NEXT_ELEMENT (vinfo);
2043 DR_GROUP_FIRST_ELEMENT (vinfo) = vinfo;
2044 DR_GROUP_NEXT_ELEMENT (vinfo) = NULL;
2045 DR_GROUP_SIZE (vinfo) = 1;
2046 if (STMT_VINFO_STRIDED_P (first_element))
2047 DR_GROUP_GAP (vinfo) = 0;
2048 else
2049 DR_GROUP_GAP (vinfo) = group_size - 1;
2050 /* Duplicate and adjust alignment info, it needs to
2051 be present on each group leader, see dr_misalignment. */
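/* For example (values purely illustrative): if the original group
   leader has a target alignment of 16 bytes and a known misalignment
   of 0, and the DR_INITs of the new leader and the original leader
   differ by 4 bytes, the new leader's misalignment becomes
   (0 + 4) % 16 = 4.  */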
2052 if (vinfo != first_element)
2054 dr_vec_info *dr_info2 = STMT_VINFO_DR_INFO (vinfo);
2055 dr_info2->target_alignment = dr_info->target_alignment;
2056 int misalignment = dr_info->misalignment;
2057 if (misalignment != DR_MISALIGNMENT_UNKNOWN)
2059 HOST_WIDE_INT diff
2060 = (TREE_INT_CST_LOW (DR_INIT (dr_info2->dr))
2061 - TREE_INT_CST_LOW (DR_INIT (dr_info->dr)));
2062 unsigned HOST_WIDE_INT align_c
2063 = dr_info->target_alignment.to_constant ();
2064 misalignment = (misalignment + diff) % align_c;
2066 dr_info2->misalignment = misalignment;
2068 vinfo = next;
2075 /* Determine if operating on full vectors for LOOP_VINFO might leave
2076 some scalar iterations still to do. If so, decide how we should
2077 handle those scalar iterations. The possibilities are:
2079 (1) Make LOOP_VINFO operate on partial vectors instead of full vectors.
2080 In this case:
2082 LOOP_VINFO_USING_PARTIAL_VECTORS_P == true
2083 LOOP_VINFO_EPIL_USING_PARTIAL_VECTORS_P == false
2084 LOOP_VINFO_PEELING_FOR_NITER == false
2086 (2) Make LOOP_VINFO operate on full vectors and use an epilogue loop
2087 to handle the remaining scalar iterations. In this case:
2089 LOOP_VINFO_USING_PARTIAL_VECTORS_P == false
2090 LOOP_VINFO_PEELING_FOR_NITER == true
2092 There are two choices:
2094 (2a) Consider vectorizing the epilogue loop at the same VF as the
2095 main loop, but using partial vectors instead of full vectors.
2096 In this case:
2098 LOOP_VINFO_EPIL_USING_PARTIAL_VECTORS_P == true
2100 (2b) Consider vectorizing the epilogue loop at lower VFs only.
2101 In this case:
2103 LOOP_VINFO_EPIL_USING_PARTIAL_VECTORS_P == false
2105 When FOR_EPILOGUE_P is true, make this determination based on the
2106 assumption that LOOP_VINFO is an epilogue loop, otherwise make it
2107 based on the assumption that LOOP_VINFO is the main loop. The caller
2108 has made sure that the number of iterations is set appropriately for
2109 this value of FOR_EPILOGUE_P. */
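/* As a purely illustrative example: with a vectorization factor of 8
   and 100 scalar iterations, case (1) runs 13 vector iterations with
   the last one only partially populated, whereas case (2) runs 12 full
   vector iterations and leaves 4 scalar iterations to an epilogue,
   which may itself be vectorized using partial vectors (2a) or a lower
   VF (2b).  */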
2111 opt_result
2112 vect_determine_partial_vectors_and_peeling (loop_vec_info loop_vinfo,
2113 bool for_epilogue_p)
2115 /* Determine whether there would be any scalar iterations left over. */
2116 bool need_peeling_or_partial_vectors_p
2117 = vect_need_peeling_or_partial_vectors_p (loop_vinfo);
2119 /* Decide whether to vectorize the loop with partial vectors. */
2120 LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo) = false;
2121 LOOP_VINFO_EPIL_USING_PARTIAL_VECTORS_P (loop_vinfo) = false;
2122 if (LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo)
2123 && need_peeling_or_partial_vectors_p)
2125 /* For partial-vector-usage=1, try to push the handling of partial
2126 vectors to the epilogue, with the main loop continuing to operate
2127 on full vectors.
2129 ??? We could then end up failing to use partial vectors if we
2130 decide to peel iterations into a prologue, and if the main loop
2131 then ends up processing fewer than VF iterations. */
2132 if (param_vect_partial_vector_usage == 1
2133 && !LOOP_VINFO_EPILOGUE_P (loop_vinfo)
2134 && !vect_known_niters_smaller_than_vf (loop_vinfo))
2135 LOOP_VINFO_EPIL_USING_PARTIAL_VECTORS_P (loop_vinfo) = true;
2136 else
2137 LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo) = true;
2140 if (dump_enabled_p ())
2142 if (LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))
2143 dump_printf_loc (MSG_NOTE, vect_location,
2144 "operating on partial vectors%s.\n",
2145 for_epilogue_p ? " for epilogue loop" : "");
2146 else
2147 dump_printf_loc (MSG_NOTE, vect_location,
2148 "operating only on full vectors%s.\n",
2149 for_epilogue_p ? " for epilogue loop" : "");
2152 if (for_epilogue_p)
2154 loop_vec_info orig_loop_vinfo = LOOP_VINFO_ORIG_LOOP_INFO (loop_vinfo);
2155 gcc_assert (orig_loop_vinfo);
2156 if (!LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))
2157 gcc_assert (known_lt (LOOP_VINFO_VECT_FACTOR (loop_vinfo),
2158 LOOP_VINFO_VECT_FACTOR (orig_loop_vinfo)));
2161 if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
2162 && !LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))
2164 /* Check that the loop processes at least one full vector. */
2165 poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
2166 tree scalar_niters = LOOP_VINFO_NITERS (loop_vinfo);
2167 if (known_lt (wi::to_widest (scalar_niters), vf))
2168 return opt_result::failure_at (vect_location,
2169 "loop does not have enough iterations"
2170 " to support vectorization.\n");
2172 /* If we need to peel an extra epilogue iteration to handle data
2173 accesses with gaps, check that there are enough scalar iterations
2174 available.
2176 The check above is redundant with this one when peeling for gaps,
2177 but the distinction is useful for diagnostics. */
2178 tree scalar_nitersm1 = LOOP_VINFO_NITERSM1 (loop_vinfo);
2179 if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo)
2180 && known_lt (wi::to_widest (scalar_nitersm1), vf))
2181 return opt_result::failure_at (vect_location,
2182 "loop does not have enough iterations"
2183 " to support peeling for gaps.\n");
2186 LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo)
2187 = (!LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo)
2188 && need_peeling_or_partial_vectors_p);
2190 return opt_result::success ();
2193 /* Function vect_analyze_loop_2.
2195 Apply a set of analyses on LOOP, and create a loop_vec_info struct
2196 for it. The different analyses will record information in the
2197 loop_vec_info struct. */
2198 static opt_result
2199 vect_analyze_loop_2 (loop_vec_info loop_vinfo, bool &fatal)
2201 opt_result ok = opt_result::success ();
2202 int res;
2203 unsigned int max_vf = MAX_VECTORIZATION_FACTOR;
2204 poly_uint64 min_vf = 2;
2205 loop_vec_info orig_loop_vinfo = NULL;
2207 /* If we are dealing with an epilogue then orig_loop_vinfo points to the
2208 loop_vec_info of the first vectorized loop. */
2209 if (LOOP_VINFO_EPILOGUE_P (loop_vinfo))
2210 orig_loop_vinfo = LOOP_VINFO_ORIG_LOOP_INFO (loop_vinfo);
2211 else
2212 orig_loop_vinfo = loop_vinfo;
2213 gcc_assert (orig_loop_vinfo);
2215 /* The first group of checks is independent of the vector size. */
2216 fatal = true;
2218 if (LOOP_VINFO_SIMD_IF_COND (loop_vinfo)
2219 && integer_zerop (LOOP_VINFO_SIMD_IF_COND (loop_vinfo)))
2220 return opt_result::failure_at (vect_location,
2221 "not vectorized: simd if(0)\n");
2223 /* Find all data references in the loop (which correspond to vdefs/vuses)
2224 and analyze their evolution in the loop. */
2226 loop_p loop = LOOP_VINFO_LOOP (loop_vinfo);
2228 /* Gather the data references and count stmts in the loop. */
2229 if (!LOOP_VINFO_DATAREFS (loop_vinfo).exists ())
2231 opt_result res
2232 = vect_get_datarefs_in_loop (loop, LOOP_VINFO_BBS (loop_vinfo),
2233 &LOOP_VINFO_DATAREFS (loop_vinfo),
2234 &LOOP_VINFO_N_STMTS (loop_vinfo));
2235 if (!res)
2237 if (dump_enabled_p ())
2238 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2239 "not vectorized: loop contains function "
2240 "calls or data references that cannot "
2241 "be analyzed\n");
2242 return res;
2244 loop_vinfo->shared->save_datarefs ();
2246 else
2247 loop_vinfo->shared->check_datarefs ();
2249 /* Analyze the data references and also adjust the minimal
2250 vectorization factor according to the loads and stores. */
2252 ok = vect_analyze_data_refs (loop_vinfo, &min_vf, &fatal);
2253 if (!ok)
2255 if (dump_enabled_p ())
2256 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2257 "bad data references.\n");
2258 return ok;
2261 /* Classify all cross-iteration scalar data-flow cycles.
2262 Cross-iteration cycles caused by virtual phis are analyzed separately. */
2263 vect_analyze_scalar_cycles (loop_vinfo);
2265 vect_pattern_recog (loop_vinfo);
2267 vect_fixup_scalar_cycles_with_patterns (loop_vinfo);
2269 /* Analyze the access patterns of the data-refs in the loop (consecutive,
2270 complex, etc.). FORNOW: Only handle consecutive access pattern. */
2272 ok = vect_analyze_data_ref_accesses (loop_vinfo, NULL);
2273 if (!ok)
2275 if (dump_enabled_p ())
2276 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2277 "bad data access.\n");
2278 return ok;
2281 /* Data-flow analysis to detect stmts that do not need to be vectorized. */
2283 ok = vect_mark_stmts_to_be_vectorized (loop_vinfo, &fatal);
2284 if (!ok)
2286 if (dump_enabled_p ())
2287 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2288 "unexpected pattern.\n");
2289 return ok;
2292 /* The rest of the analysis below depends on the vector size in some way, so failures are no longer fatal. */
2293 fatal = false;
2295 /* Analyze data dependences between the data-refs in the loop
2296 and adjust the maximum vectorization factor according to
2297 the dependences.
2298 FORNOW: fail at the first data dependence that we encounter. */
2300 ok = vect_analyze_data_ref_dependences (loop_vinfo, &max_vf);
2301 if (!ok)
2303 if (dump_enabled_p ())
2304 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2305 "bad data dependence.\n");
2306 return ok;
2308 if (max_vf != MAX_VECTORIZATION_FACTOR
2309 && maybe_lt (max_vf, min_vf))
2310 return opt_result::failure_at (vect_location, "bad data dependence.\n");
2311 LOOP_VINFO_MAX_VECT_FACTOR (loop_vinfo) = max_vf;
2313 ok = vect_determine_vectorization_factor (loop_vinfo);
2314 if (!ok)
2316 if (dump_enabled_p ())
2317 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2318 "can't determine vectorization factor.\n");
2319 return ok;
2321 if (max_vf != MAX_VECTORIZATION_FACTOR
2322 && maybe_lt (max_vf, LOOP_VINFO_VECT_FACTOR (loop_vinfo)))
2323 return opt_result::failure_at (vect_location, "bad data dependence.\n");
2325 /* Compute the scalar iteration cost. */
2326 vect_compute_single_scalar_iteration_cost (loop_vinfo);
2328 poly_uint64 saved_vectorization_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
2330 /* Check the SLP opportunities in the loop, analyze and build SLP trees. */
2331 ok = vect_analyze_slp (loop_vinfo, LOOP_VINFO_N_STMTS (loop_vinfo));
2332 if (!ok)
2333 return ok;
2335 /* If there are any SLP instances mark them as pure_slp. */
2336 bool slp = vect_make_slp_decision (loop_vinfo);
2337 if (slp)
2339 /* Find stmts that need to be both vectorized and SLPed. */
2340 vect_detect_hybrid_slp (loop_vinfo);
2342 /* Update the vectorization factor based on the SLP decision. */
2343 vect_update_vf_for_slp (loop_vinfo);
2345 /* Optimize the SLP graph with the vectorization factor fixed. */
2346 vect_optimize_slp (loop_vinfo);
2348 /* Gather the loads reachable from the SLP graph entries. */
2349 vect_gather_slp_loads (loop_vinfo);
2352 bool saved_can_use_partial_vectors_p
2353 = LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo);
2355 /* We don't expect to have to roll back to anything other than an empty
2356 set of rgroups. */
2357 gcc_assert (LOOP_VINFO_MASKS (loop_vinfo).is_empty ());
2359 /* This is the point where we can re-start analysis with SLP forced off. */
2360 start_over:
2362 /* Now the vectorization factor is final. */
2363 poly_uint64 vectorization_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
2364 gcc_assert (known_ne (vectorization_factor, 0U));
2366 if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo) && dump_enabled_p ())
2368 dump_printf_loc (MSG_NOTE, vect_location,
2369 "vectorization_factor = ");
2370 dump_dec (MSG_NOTE, vectorization_factor);
2371 dump_printf (MSG_NOTE, ", niters = %wd\n",
2372 LOOP_VINFO_INT_NITERS (loop_vinfo));
2375 loop_vinfo->vector_costs = init_cost (loop_vinfo, false);
2377 /* Analyze the alignment of the data-refs in the loop.
2378 Fail if a data reference is found that cannot be vectorized. */
2380 ok = vect_analyze_data_refs_alignment (loop_vinfo);
2381 if (!ok)
2383 if (dump_enabled_p ())
2384 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2385 "bad data alignment.\n");
2386 return ok;
2389 /* Prune the list of ddrs to be tested at run-time by versioning for alias.
2390 It is important to call pruning after vect_analyze_data_ref_accesses,
2391 since we use grouping information gathered by interleaving analysis. */
2392 ok = vect_prune_runtime_alias_test_list (loop_vinfo);
2393 if (!ok)
2394 return ok;
2396 /* Do not invoke vect_enhance_data_refs_alignment for epilogue
2397 vectorization, since we do not want to add extra peeling or
2398 add versioning for alignment. */
2399 if (!LOOP_VINFO_EPILOGUE_P (loop_vinfo))
2400 /* This pass will decide on using loop versioning and/or loop peeling in
2401 order to enhance the alignment of data references in the loop. */
2402 ok = vect_enhance_data_refs_alignment (loop_vinfo);
2403 if (!ok)
2404 return ok;
2406 if (slp)
2408 /* Analyze operations in the SLP instances. Note this may
2409 remove unsupported SLP instances which makes the above
2410 SLP kind detection invalid. */
2411 unsigned old_size = LOOP_VINFO_SLP_INSTANCES (loop_vinfo).length ();
2412 vect_slp_analyze_operations (loop_vinfo);
2413 if (LOOP_VINFO_SLP_INSTANCES (loop_vinfo).length () != old_size)
2415 ok = opt_result::failure_at (vect_location,
2416 "unsupported SLP instances\n");
2417 goto again;
2420 /* Check whether any load in ALL SLP instances is possibly permuted. */
2421 slp_tree load_node, slp_root;
2422 unsigned i, x;
2423 slp_instance instance;
2424 bool can_use_lanes = true;
2425 FOR_EACH_VEC_ELT (LOOP_VINFO_SLP_INSTANCES (loop_vinfo), x, instance)
2427 slp_root = SLP_INSTANCE_TREE (instance);
2428 int group_size = SLP_TREE_LANES (slp_root);
2429 tree vectype = SLP_TREE_VECTYPE (slp_root);
2430 bool loads_permuted = false;
2431 FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (instance), i, load_node)
2433 if (!SLP_TREE_LOAD_PERMUTATION (load_node).exists ())
2434 continue;
2435 unsigned j;
2436 stmt_vec_info load_info;
2437 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (load_node), j, load_info)
2438 if (SLP_TREE_LOAD_PERMUTATION (load_node)[j] != j)
2440 loads_permuted = true;
2441 break;
2445 /* If the loads and stores can be handled with load/store-lane
2446 instructions record it and move on to the next instance. */
2447 if (loads_permuted
2448 && SLP_INSTANCE_KIND (instance) == slp_inst_kind_store
2449 && vect_store_lanes_supported (vectype, group_size, false))
2451 FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (instance), i, load_node)
2453 stmt_vec_info stmt_vinfo = DR_GROUP_FIRST_ELEMENT
2454 (SLP_TREE_SCALAR_STMTS (load_node)[0]);
2455 /* Use SLP for strided accesses (or if we can't use
2456 load-lanes). */
2457 if (STMT_VINFO_STRIDED_P (stmt_vinfo)
2458 || ! vect_load_lanes_supported
2459 (STMT_VINFO_VECTYPE (stmt_vinfo),
2460 DR_GROUP_SIZE (stmt_vinfo), false))
2461 break;
2464 can_use_lanes
2465 = can_use_lanes && i == SLP_INSTANCE_LOADS (instance).length ();
2467 if (can_use_lanes && dump_enabled_p ())
2468 dump_printf_loc (MSG_NOTE, vect_location,
2469 "SLP instance %p can use load/store-lanes\n",
2470 instance);
2472 else
2474 can_use_lanes = false;
2475 break;
2479 /* If all SLP instances can use load/store-lanes abort SLP and try again
2480 with SLP disabled. */
2481 if (can_use_lanes)
2483 ok = opt_result::failure_at (vect_location,
2484 "Built SLP cancelled: can use "
2485 "load/store-lanes\n");
2486 if (dump_enabled_p ())
2487 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2488 "Built SLP cancelled: all SLP instances support "
2489 "load/store-lanes\n");
2490 goto again;
2494 /* Dissolve SLP-only groups. */
2495 vect_dissolve_slp_only_groups (loop_vinfo);
2497 /* Scan all the remaining operations in the loop that are not subject
2498 to SLP and make sure they are vectorizable. */
2499 ok = vect_analyze_loop_operations (loop_vinfo);
2500 if (!ok)
2502 if (dump_enabled_p ())
2503 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2504 "bad operation or unsupported loop bound.\n");
2505 return ok;
2508 /* For now, we don't expect to mix both masking and length approaches for one
2509 loop, so disable it if both are recorded. */
2510 if (LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo)
2511 && !LOOP_VINFO_MASKS (loop_vinfo).is_empty ()
2512 && !LOOP_VINFO_LENS (loop_vinfo).is_empty ())
2514 if (dump_enabled_p ())
2515 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2516 "can't vectorize a loop with partial vectors"
2517 " because we don't expect to mix different"
2518 " approaches with partial vectors for the"
2519 " same loop.\n");
2520 LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo) = false;
2523 /* If we still have the option of using partial vectors,
2524 check whether we can generate the necessary loop controls. */
2525 if (LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo)
2526 && !vect_verify_full_masking (loop_vinfo)
2527 && !vect_verify_loop_lens (loop_vinfo))
2528 LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo) = false;
2530 /* If we're vectorizing an epilogue loop, the vectorized loop either needs
2531 to be able to handle fewer than VF scalars, or needs to have a lower VF
2532 than the main loop. */
2533 if (LOOP_VINFO_EPILOGUE_P (loop_vinfo)
2534 && !LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo)
2535 && maybe_ge (LOOP_VINFO_VECT_FACTOR (loop_vinfo),
2536 LOOP_VINFO_VECT_FACTOR (orig_loop_vinfo)))
2537 return opt_result::failure_at (vect_location,
2538 "Vectorization factor too high for"
2539 " epilogue loop.\n");
2541 /* Decide whether this loop_vinfo should use partial vectors or peeling,
2542 assuming that the loop will be used as a main loop. We will redo
2543 this analysis later if we instead decide to use the loop as an
2544 epilogue loop. */
2545 ok = vect_determine_partial_vectors_and_peeling (loop_vinfo, false);
2546 if (!ok)
2547 return ok;
2549 /* Check the costings of the loop make vectorizing worthwhile. */
2550 res = vect_analyze_loop_costing (loop_vinfo);
2551 if (res < 0)
2553 ok = opt_result::failure_at (vect_location,
2554 "Loop costings may not be worthwhile.\n");
2555 goto again;
2557 if (!res)
2558 return opt_result::failure_at (vect_location,
2559 "Loop costings not worthwhile.\n");
2561 /* If an epilogue loop is required make sure we can create one. */
2562 if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo)
2563 || LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo))
2565 if (dump_enabled_p ())
2566 dump_printf_loc (MSG_NOTE, vect_location, "epilog loop required\n");
2567 if (!vect_can_advance_ivs_p (loop_vinfo)
2568 || !slpeel_can_duplicate_loop_p (LOOP_VINFO_LOOP (loop_vinfo),
2569 single_exit (LOOP_VINFO_LOOP
2570 (loop_vinfo))))
2572 ok = opt_result::failure_at (vect_location,
2573 "not vectorized: can't create required "
2574 "epilog loop\n");
2575 goto again;
2579 /* During peeling, we need to check if the number of loop iterations is
2580 enough for both the peeled prolog loop and the vector loop. This check
2581 can be merged with the threshold check of loop versioning, so
2582 increase the threshold for this case if necessary.
2584 If we are analyzing an epilogue we still want to check what its
2585 versioning threshold would be. If we decide to vectorize the epilogues we
2586 will want to use the lowest versioning threshold of all epilogues and main
2587 loop. This will enable us to enter a vectorized epilogue even when
2588 versioning the loop. We can't simply check whether the epilogue requires
2589 versioning though since we may have skipped some versioning checks when
2590 analyzing the epilogue. For instance, checks for alias versioning will be
2591 skipped when dealing with epilogues as we assume we already checked them
2592 for the main loop. So instead we always check the 'orig_loop_vinfo'. */
2593 if (LOOP_REQUIRES_VERSIONING (orig_loop_vinfo))
2595 poly_uint64 niters_th = 0;
2596 unsigned int th = LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo);
2598 if (!vect_use_loop_mask_for_alignment_p (loop_vinfo))
2600 /* Niters for peeled prolog loop. */
2601 if (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) < 0)
2603 dr_vec_info *dr_info = LOOP_VINFO_UNALIGNED_DR (loop_vinfo);
2604 tree vectype = STMT_VINFO_VECTYPE (dr_info->stmt);
2605 niters_th += TYPE_VECTOR_SUBPARTS (vectype) - 1;
2607 else
2608 niters_th += LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo);
2611 /* Niters for at least one iteration of vectorized loop. */
2612 if (!LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))
2613 niters_th += LOOP_VINFO_VECT_FACTOR (loop_vinfo);
2614 /* One additional iteration because of peeling for gap. */
2615 if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo))
2616 niters_th += 1;
2618 /* Use the same condition as vect_transform_loop to decide when to use
2619 the cost to determine a versioning threshold. */
2620 if (vect_apply_runtime_profitability_check_p (loop_vinfo)
2621 && ordered_p (th, niters_th))
2622 niters_th = ordered_max (poly_uint64 (th), niters_th);
2624 LOOP_VINFO_VERSIONING_THRESHOLD (loop_vinfo) = niters_th;
2627 gcc_assert (known_eq (vectorization_factor,
2628 LOOP_VINFO_VECT_FACTOR (loop_vinfo)));
2630 /* Ok to vectorize! */
2631 LOOP_VINFO_VECTORIZABLE_P (loop_vinfo) = 1;
2632 return opt_result::success ();
2634 again:
2635 /* Ensure that "ok" is false (with an opt_problem if dumping is enabled). */
2636 gcc_assert (!ok);
2638 /* Try again with SLP forced off, but if we didn't do any SLP there is
2639 no point in re-trying. */
2640 if (!slp)
2641 return ok;
2643 /* If there are reduction chains re-trying will fail anyway. */
2644 if (! LOOP_VINFO_REDUCTION_CHAINS (loop_vinfo).is_empty ())
2645 return ok;
2647 /* Likewise if the grouped loads or stores in the SLP cannot be handled
2648 via interleaving or lane instructions. */
2649 slp_instance instance;
2650 slp_tree node;
2651 unsigned i, j;
2652 FOR_EACH_VEC_ELT (LOOP_VINFO_SLP_INSTANCES (loop_vinfo), i, instance)
2654 stmt_vec_info vinfo;
2655 vinfo = SLP_TREE_SCALAR_STMTS (SLP_INSTANCE_TREE (instance))[0];
2656 if (! STMT_VINFO_GROUPED_ACCESS (vinfo))
2657 continue;
2658 vinfo = DR_GROUP_FIRST_ELEMENT (vinfo);
2659 unsigned int size = DR_GROUP_SIZE (vinfo);
2660 tree vectype = STMT_VINFO_VECTYPE (vinfo);
2661 if (! vect_store_lanes_supported (vectype, size, false)
2662 && ! known_eq (TYPE_VECTOR_SUBPARTS (vectype), 1U)
2663 && ! vect_grouped_store_supported (vectype, size))
2664 return opt_result::failure_at (vinfo->stmt,
2665 "unsupported grouped store\n");
2666 FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (instance), j, node)
2668 vinfo = SLP_TREE_SCALAR_STMTS (node)[0];
2669 vinfo = DR_GROUP_FIRST_ELEMENT (vinfo);
2670 bool single_element_p = !DR_GROUP_NEXT_ELEMENT (vinfo);
2671 size = DR_GROUP_SIZE (vinfo);
2672 vectype = STMT_VINFO_VECTYPE (vinfo);
2673 if (! vect_load_lanes_supported (vectype, size, false)
2674 && ! vect_grouped_load_supported (vectype, single_element_p,
2675 size))
2676 return opt_result::failure_at (vinfo->stmt,
2677 "unsupported grouped load\n");
2681 if (dump_enabled_p ())
2682 dump_printf_loc (MSG_NOTE, vect_location,
2683 "re-trying with SLP disabled\n");
2685 /* Roll back state appropriately. No SLP this time. */
2686 slp = false;
2687 /* Restore vectorization factor as it were without SLP. */
2688 LOOP_VINFO_VECT_FACTOR (loop_vinfo) = saved_vectorization_factor;
2689 /* Free the SLP instances. */
2690 FOR_EACH_VEC_ELT (LOOP_VINFO_SLP_INSTANCES (loop_vinfo), j, instance)
2691 vect_free_slp_instance (instance);
2692 LOOP_VINFO_SLP_INSTANCES (loop_vinfo).release ();
2693 /* Reset SLP type to loop_vect on all stmts. */
2694 for (i = 0; i < LOOP_VINFO_LOOP (loop_vinfo)->num_nodes; ++i)
2696 basic_block bb = LOOP_VINFO_BBS (loop_vinfo)[i];
2697 for (gimple_stmt_iterator si = gsi_start_phis (bb);
2698 !gsi_end_p (si); gsi_next (&si))
2700 stmt_vec_info stmt_info = loop_vinfo->lookup_stmt (gsi_stmt (si));
2701 STMT_SLP_TYPE (stmt_info) = loop_vect;
2702 if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_reduction_def
2703 || STMT_VINFO_DEF_TYPE (stmt_info) == vect_double_reduction_def)
2705 /* vectorizable_reduction adjusts reduction stmt def-types,
2706 restore them to that of the PHI. */
2707 STMT_VINFO_DEF_TYPE (STMT_VINFO_REDUC_DEF (stmt_info))
2708 = STMT_VINFO_DEF_TYPE (stmt_info);
2709 STMT_VINFO_DEF_TYPE (vect_stmt_to_vectorize
2710 (STMT_VINFO_REDUC_DEF (stmt_info)))
2711 = STMT_VINFO_DEF_TYPE (stmt_info);
2714 for (gimple_stmt_iterator si = gsi_start_bb (bb);
2715 !gsi_end_p (si); gsi_next (&si))
2717 if (is_gimple_debug (gsi_stmt (si)))
2718 continue;
2719 stmt_vec_info stmt_info = loop_vinfo->lookup_stmt (gsi_stmt (si));
2720 STMT_SLP_TYPE (stmt_info) = loop_vect;
2721 if (STMT_VINFO_IN_PATTERN_P (stmt_info))
2723 stmt_vec_info pattern_stmt_info
2724 = STMT_VINFO_RELATED_STMT (stmt_info);
2725 if (STMT_VINFO_SLP_VECT_ONLY_PATTERN (pattern_stmt_info))
2726 STMT_VINFO_IN_PATTERN_P (stmt_info) = false;
2728 gimple *pattern_def_seq = STMT_VINFO_PATTERN_DEF_SEQ (stmt_info);
2729 STMT_SLP_TYPE (pattern_stmt_info) = loop_vect;
2730 for (gimple_stmt_iterator pi = gsi_start (pattern_def_seq);
2731 !gsi_end_p (pi); gsi_next (&pi))
2732 STMT_SLP_TYPE (loop_vinfo->lookup_stmt (gsi_stmt (pi)))
2733 = loop_vect;
2737 /* Free optimized alias test DDRS. */
2738 LOOP_VINFO_LOWER_BOUNDS (loop_vinfo).truncate (0);
2739 LOOP_VINFO_COMP_ALIAS_DDRS (loop_vinfo).release ();
2740 LOOP_VINFO_CHECK_UNEQUAL_ADDRS (loop_vinfo).release ();
2741 /* Reset target cost data. */
2742 delete loop_vinfo->vector_costs;
2743 loop_vinfo->vector_costs = nullptr;
2744 /* Reset accumulated rgroup information. */
2745 release_vec_loop_controls (&LOOP_VINFO_MASKS (loop_vinfo));
2746 release_vec_loop_controls (&LOOP_VINFO_LENS (loop_vinfo));
2747 /* Reset assorted flags. */
2748 LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo) = false;
2749 LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo) = false;
2750 LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo) = 0;
2751 LOOP_VINFO_VERSIONING_THRESHOLD (loop_vinfo) = 0;
2752 LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo)
2753 = saved_can_use_partial_vectors_p;
2755 goto start_over;
2758 /* Return true if vectorizing a loop using NEW_LOOP_VINFO appears
2759 to be better than vectorizing it using OLD_LOOP_VINFO. Assume that
2760 OLD_LOOP_VINFO is better unless something specifically indicates
2761 otherwise.
2763 Note that this deliberately isn't a partial order. */
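/* For example, if the user requested simdlen 4 (e.g. via an OpenMP
   simdlen clause), a candidate whose vectorization factor is 4 is
   preferred over one whose factor is 8 regardless of their costs; only
   when neither or both candidates match simdlen do the cost hooks
   below decide.  */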
2765 static bool
2766 vect_better_loop_vinfo_p (loop_vec_info new_loop_vinfo,
2767 loop_vec_info old_loop_vinfo)
2769 struct loop *loop = LOOP_VINFO_LOOP (new_loop_vinfo);
2770 gcc_assert (LOOP_VINFO_LOOP (old_loop_vinfo) == loop);
2772 poly_int64 new_vf = LOOP_VINFO_VECT_FACTOR (new_loop_vinfo);
2773 poly_int64 old_vf = LOOP_VINFO_VECT_FACTOR (old_loop_vinfo);
2775 /* Always prefer a VF of loop->simdlen over any other VF. */
2776 if (loop->simdlen)
2778 bool new_simdlen_p = known_eq (new_vf, loop->simdlen);
2779 bool old_simdlen_p = known_eq (old_vf, loop->simdlen);
2780 if (new_simdlen_p != old_simdlen_p)
2781 return new_simdlen_p;
2784 const auto *old_costs = old_loop_vinfo->vector_costs;
2785 const auto *new_costs = new_loop_vinfo->vector_costs;
2786 if (loop_vec_info main_loop = LOOP_VINFO_ORIG_LOOP_INFO (old_loop_vinfo))
2787 return new_costs->better_epilogue_loop_than_p (old_costs, main_loop);
2789 return new_costs->better_main_loop_than_p (old_costs);
2792 /* Decide whether to replace OLD_LOOP_VINFO with NEW_LOOP_VINFO. Return
2793 true if we should. */
2795 static bool
2796 vect_joust_loop_vinfos (loop_vec_info new_loop_vinfo,
2797 loop_vec_info old_loop_vinfo)
2799 if (!vect_better_loop_vinfo_p (new_loop_vinfo, old_loop_vinfo))
2800 return false;
2802 if (dump_enabled_p ())
2803 dump_printf_loc (MSG_NOTE, vect_location,
2804 "***** Preferring vector mode %s to vector mode %s\n",
2805 GET_MODE_NAME (new_loop_vinfo->vector_mode),
2806 GET_MODE_NAME (old_loop_vinfo->vector_mode));
2807 return true;
2810 /* Analyze LOOP with VECTOR_MODES[MODE_I] and as epilogue if MAIN_LOOP_VINFO is
2811 not NULL. Set AUTODETECTED_VECTOR_MODE if VOIDmode and advance
2812 MODE_I to the next mode useful to analyze.
2813 Return the loop_vinfo on success and wrapped null on failure. */
2815 static opt_loop_vec_info
2816 vect_analyze_loop_1 (class loop *loop, vec_info_shared *shared,
2817 const vect_loop_form_info *loop_form_info,
2818 loop_vec_info main_loop_vinfo,
2819 const vector_modes &vector_modes, unsigned &mode_i,
2820 machine_mode &autodetected_vector_mode,
2821 bool &fatal)
2823 loop_vec_info loop_vinfo
2824 = vect_create_loop_vinfo (loop, shared, loop_form_info, main_loop_vinfo);
2826 machine_mode vector_mode = vector_modes[mode_i];
2827 loop_vinfo->vector_mode = vector_mode;
2829 /* Run the main analysis. */
2830 opt_result res = vect_analyze_loop_2 (loop_vinfo, fatal);
2831 if (dump_enabled_p ())
2832 dump_printf_loc (MSG_NOTE, vect_location,
2833 "***** Analysis %s with vector mode %s\n",
2834 res ? "succeeded" : " failed",
2835 GET_MODE_NAME (loop_vinfo->vector_mode));
2837 /* Remember the autodetected vector mode. */
2838 if (vector_mode == VOIDmode)
2839 autodetected_vector_mode = loop_vinfo->vector_mode;
2841 /* Advance mode_i, first skipping modes that would result in the
2842 same analysis result. */
2843 while (mode_i + 1 < vector_modes.length ()
2844 && vect_chooses_same_modes_p (loop_vinfo,
2845 vector_modes[mode_i + 1]))
2847 if (dump_enabled_p ())
2848 dump_printf_loc (MSG_NOTE, vect_location,
2849 "***** The result for vector mode %s would"
2850 " be the same\n",
2851 GET_MODE_NAME (vector_modes[mode_i + 1]));
2852 mode_i += 1;
2854 if (mode_i + 1 < vector_modes.length ()
2855 && VECTOR_MODE_P (autodetected_vector_mode)
2856 && (related_vector_mode (vector_modes[mode_i + 1],
2857 GET_MODE_INNER (autodetected_vector_mode))
2858 == autodetected_vector_mode)
2859 && (related_vector_mode (autodetected_vector_mode,
2860 GET_MODE_INNER (vector_modes[mode_i + 1]))
2861 == vector_modes[mode_i + 1]))
2863 if (dump_enabled_p ())
2864 dump_printf_loc (MSG_NOTE, vect_location,
2865 "***** Skipping vector mode %s, which would"
2866 " repeat the analysis for %s\n",
2867 GET_MODE_NAME (vector_modes[mode_i + 1]),
2868 GET_MODE_NAME (autodetected_vector_mode));
2869 mode_i += 1;
2871 mode_i++;
2873 if (!res)
2875 delete loop_vinfo;
2876 if (fatal)
2877 gcc_checking_assert (main_loop_vinfo == NULL);
2878 return opt_loop_vec_info::propagate_failure (res);
2881 return opt_loop_vec_info::success (loop_vinfo);
2884 /* Function vect_analyze_loop.
2886 Apply a set of analyses on LOOP, and create a loop_vec_info struct
2887 for it. The different analyses will record information in the
2888 loop_vec_info struct. */
2889 opt_loop_vec_info
2890 vect_analyze_loop (class loop *loop, vec_info_shared *shared)
2892 DUMP_VECT_SCOPE ("analyze_loop_nest");
2894 if (loop_outer (loop)
2895 && loop_vec_info_for_loop (loop_outer (loop))
2896 && LOOP_VINFO_VECTORIZABLE_P (loop_vec_info_for_loop (loop_outer (loop))))
2897 return opt_loop_vec_info::failure_at (vect_location,
2898 "outer-loop already vectorized.\n");
2900 if (!find_loop_nest (loop, &shared->loop_nest))
2901 return opt_loop_vec_info::failure_at
2902 (vect_location,
2903 "not vectorized: loop nest containing two or more consecutive inner"
2904 " loops cannot be vectorized\n");
2906 /* Analyze the loop form. */
2907 vect_loop_form_info loop_form_info;
2908 opt_result res = vect_analyze_loop_form (loop, &loop_form_info);
2909 if (!res)
2911 if (dump_enabled_p ())
2912 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2913 "bad loop form.\n");
2914 return opt_loop_vec_info::propagate_failure (res);
2916 if (!integer_onep (loop_form_info.assumptions))
2918 /* We consider vectorizing this loop by versioning it under
2919 some assumptions. In order to do this, we need to clear
2920 existing information computed by the scev and niter analyzers. */
2921 scev_reset_htab ();
2922 free_numbers_of_iterations_estimates (loop);
2923 /* Also set flag for this loop so that following scev and niter
2924 analysis are done under the assumptions. */
2925 loop_constraint_set (loop, LOOP_C_FINITE);
2928 auto_vector_modes vector_modes;
2929 /* Autodetect first vector size we try. */
2930 vector_modes.safe_push (VOIDmode);
2931 unsigned int autovec_flags
2932 = targetm.vectorize.autovectorize_vector_modes (&vector_modes,
2933 loop->simdlen != 0);
2934 bool pick_lowest_cost_p = ((autovec_flags & VECT_COMPARE_COSTS)
2935 && !unlimited_cost_model (loop));
2936 machine_mode autodetected_vector_mode = VOIDmode;
2937 opt_loop_vec_info first_loop_vinfo = opt_loop_vec_info::success (NULL);
2938 unsigned int mode_i = 0;
2939 unsigned int first_loop_i = 0;
2940 unsigned int first_loop_next_i = 0;
2941 unsigned HOST_WIDE_INT simdlen = loop->simdlen;
2943 /* First determine the main loop vectorization mode, either the first
2944 one that works, starting with auto-detecting the vector mode and then
2945 following the targets order of preference, or the one with the
2946 lowest cost if pick_lowest_cost_p. */
2947 while (1)
2949 unsigned int loop_vinfo_i = mode_i;
2950 bool fatal;
2951 opt_loop_vec_info loop_vinfo
2952 = vect_analyze_loop_1 (loop, shared, &loop_form_info,
2953 NULL, vector_modes, mode_i,
2954 autodetected_vector_mode, fatal);
2955 if (fatal)
2956 break;
2958 if (loop_vinfo)
2960 /* Once we hit the desired simdlen for the first time,
2961 discard any previous attempts. */
2962 if (simdlen
2963 && known_eq (LOOP_VINFO_VECT_FACTOR (loop_vinfo), simdlen))
2965 delete first_loop_vinfo;
2966 first_loop_vinfo = opt_loop_vec_info::success (NULL);
2967 simdlen = 0;
2969 else if (pick_lowest_cost_p
2970 && first_loop_vinfo
2971 && vect_joust_loop_vinfos (loop_vinfo, first_loop_vinfo))
2973 /* Pick loop_vinfo over first_loop_vinfo. */
2974 delete first_loop_vinfo;
2975 first_loop_vinfo = opt_loop_vec_info::success (NULL);
2977 if (first_loop_vinfo == NULL)
2979 first_loop_vinfo = loop_vinfo;
2980 first_loop_i = loop_vinfo_i;
2981 first_loop_next_i = mode_i;
2983 else
2985 delete loop_vinfo;
2986 loop_vinfo = opt_loop_vec_info::success (NULL);
2989 /* Commit to first_loop_vinfo if we have no reason to try
2990 alternatives. */
2991 if (!simdlen && !pick_lowest_cost_p)
2992 break;
2994 if (mode_i == vector_modes.length ()
2995 || autodetected_vector_mode == VOIDmode)
2996 break;
2998 /* Try the next biggest vector size. */
2999 if (dump_enabled_p ())
3000 dump_printf_loc (MSG_NOTE, vect_location,
3001 "***** Re-trying analysis with vector mode %s\n",
3002 GET_MODE_NAME (vector_modes[mode_i]));
3004 if (!first_loop_vinfo)
3005 return opt_loop_vec_info::propagate_failure (res);
3007 if (dump_enabled_p ())
3008 dump_printf_loc (MSG_NOTE, vect_location,
3009 "***** Choosing vector mode %s\n",
3010 GET_MODE_NAME (first_loop_vinfo->vector_mode));
3012 /* Only vectorize epilogues if PARAM_VECT_EPILOGUES_NOMASK is
3013 enabled, SIMDUID is not set, it is the innermost loop and we have
3014 either already found the loop's SIMDLEN or there was no SIMDLEN to
3015 begin with.
3016 TODO: Enable epilogue vectorization for loops with SIMDUID set. */
3017 bool vect_epilogues = (!simdlen
3018 && loop->inner == NULL
3019 && param_vect_epilogues_nomask
3020 && LOOP_VINFO_PEELING_FOR_NITER (first_loop_vinfo)
3021 && !loop->simduid);
3022 if (!vect_epilogues)
3023 return first_loop_vinfo;
3025 /* Now analyze first_loop_vinfo for epilogue vectorization. */
3026 poly_uint64 lowest_th = LOOP_VINFO_VERSIONING_THRESHOLD (first_loop_vinfo);
3028 /* Handle the case where the original loop can use partial
3029 vectorization, but we only want to adopt it for the epilogue.
3030 The retry should be in the same mode as the original. */
3031 if (LOOP_VINFO_EPIL_USING_PARTIAL_VECTORS_P (first_loop_vinfo))
3033 gcc_assert (LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (first_loop_vinfo)
3034 && !LOOP_VINFO_USING_PARTIAL_VECTORS_P (first_loop_vinfo));
3035 if (dump_enabled_p ())
3036 dump_printf_loc (MSG_NOTE, vect_location,
3037 "***** Re-trying analysis with same vector mode"
3038 " %s for epilogue with partial vectors.\n",
3039 GET_MODE_NAME (first_loop_vinfo->vector_mode));
3040 mode_i = first_loop_i;
3042 else
3044 mode_i = first_loop_next_i;
3045 if (mode_i == vector_modes.length ())
3046 return first_loop_vinfo;
3049 /* ??? If first_loop_vinfo was using VOIDmode then we probably
3050 want to instead search for the corresponding mode in vector_modes[]. */
3052 while (1)
3054 bool fatal;
3055 opt_loop_vec_info loop_vinfo
3056 = vect_analyze_loop_1 (loop, shared, &loop_form_info,
3057 first_loop_vinfo,
3058 vector_modes, mode_i,
3059 autodetected_vector_mode, fatal);
3060 if (fatal)
3061 break;
3063 if (loop_vinfo)
3065 if (pick_lowest_cost_p)
3067 /* Keep trying to roll back vectorization attempts while the
3068 loop_vec_infos they produced were worse than this one. */
3069 vec<loop_vec_info> &vinfos = first_loop_vinfo->epilogue_vinfos;
3070 while (!vinfos.is_empty ()
3071 && vect_joust_loop_vinfos (loop_vinfo, vinfos.last ()))
3073 gcc_assert (vect_epilogues);
3074 delete vinfos.pop ();
3077 /* For now only allow one epilogue loop. */
3078 if (first_loop_vinfo->epilogue_vinfos.is_empty ())
3080 first_loop_vinfo->epilogue_vinfos.safe_push (loop_vinfo);
3081 poly_uint64 th = LOOP_VINFO_VERSIONING_THRESHOLD (loop_vinfo);
3082 gcc_assert (!LOOP_REQUIRES_VERSIONING (loop_vinfo)
3083 || maybe_ne (lowest_th, 0U));
3084 /* Keep track of the known smallest versioning
3085 threshold. */
3086 if (ordered_p (lowest_th, th))
3087 lowest_th = ordered_min (lowest_th, th);
3089 else
3091 delete loop_vinfo;
3092 loop_vinfo = opt_loop_vec_info::success (NULL);
3095 /* For now only allow one epilogue loop, but allow
3096 pick_lowest_cost_p to replace it, so commit to the
3097 first epilogue if we have no reason to try alternatives. */
3098 if (!pick_lowest_cost_p)
3099 break;
3102 if (mode_i == vector_modes.length ())
3103 break;
3105 /* Try the next biggest vector size. */
3106 if (dump_enabled_p ())
3107 dump_printf_loc (MSG_NOTE, vect_location,
3108 "***** Re-trying epilogue analysis with vector "
3109 "mode %s\n", GET_MODE_NAME (vector_modes[mode_i]));
3112 if (!first_loop_vinfo->epilogue_vinfos.is_empty ())
3114 LOOP_VINFO_VERSIONING_THRESHOLD (first_loop_vinfo) = lowest_th;
3115 if (dump_enabled_p ())
3116 dump_printf_loc (MSG_NOTE, vect_location,
3117 "***** Choosing epilogue vector mode %s\n",
3118 GET_MODE_NAME
3119 (first_loop_vinfo->epilogue_vinfos[0]->vector_mode));
3122 return first_loop_vinfo;
3125 /* Return true if there is an in-order reduction function for CODE, storing
3126 it in *REDUC_FN if so. */
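/* A fold-left reduction keeps the scalar evaluation order, e.g. for

     double s = 0.0;
     for (int i = 0; i < n; i++)
       s += a[i];

   the additions are performed as ((s + a[0]) + a[1]) + ..., which
   IFN_FOLD_LEFT_PLUS implements by folding a whole vector into the
   scalar accumulator without reassociating the additions.  */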
3128 static bool
3129 fold_left_reduction_fn (code_helper code, internal_fn *reduc_fn)
3131 if (code == PLUS_EXPR)
3133 *reduc_fn = IFN_FOLD_LEFT_PLUS;
3134 return true;
3136 return false;
3139 /* Function reduction_fn_for_scalar_code
3141 Input:
3142 CODE - tree_code of a reduction operation.
3144 Output:
3145 REDUC_FN - the corresponding internal function to be used to reduce the
3146 vector of partial results into a single scalar result, or IFN_LAST
3147 if the operation is a supported reduction operation, but does not have
3148 such an internal function.
3150 Return FALSE if CODE currently cannot be vectorized as a reduction. */
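/* For instance, a maximum reduction such as

     int m = INT_MIN;
     for (int i = 0; i < n; i++)
       m = m < a[i] ? a[i] : m;

   is recognized via MAX_EXPR, and IFN_REDUC_MAX is the internal
   function that collapses the vector of partial maxima into the final
   scalar result (loop purely illustrative).  */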
3152 bool
3153 reduction_fn_for_scalar_code (code_helper code, internal_fn *reduc_fn)
3155 if (code.is_tree_code ())
3156 switch (tree_code (code))
3158 case MAX_EXPR:
3159 *reduc_fn = IFN_REDUC_MAX;
3160 return true;
3162 case MIN_EXPR:
3163 *reduc_fn = IFN_REDUC_MIN;
3164 return true;
3166 case PLUS_EXPR:
3167 *reduc_fn = IFN_REDUC_PLUS;
3168 return true;
3170 case BIT_AND_EXPR:
3171 *reduc_fn = IFN_REDUC_AND;
3172 return true;
3174 case BIT_IOR_EXPR:
3175 *reduc_fn = IFN_REDUC_IOR;
3176 return true;
3178 case BIT_XOR_EXPR:
3179 *reduc_fn = IFN_REDUC_XOR;
3180 return true;
3182 case MULT_EXPR:
3183 case MINUS_EXPR:
3184 *reduc_fn = IFN_LAST;
3185 return true;
3187 default:
3188 return false;
3190 else
3191 switch (combined_fn (code))
3193 CASE_CFN_FMAX:
3194 *reduc_fn = IFN_REDUC_FMAX;
3195 return true;
3197 CASE_CFN_FMIN:
3198 *reduc_fn = IFN_REDUC_FMIN;
3199 return true;
3201 default:
3202 return false;
3206 /* If there is a neutral value X such that a reduction would not be affected
3207 by the introduction of additional X elements, return that X, otherwise
3208 return null. CODE is the code of the reduction and SCALAR_TYPE is the type
3209 of the scalar elements. If the reduction has just a single initial value
3210 then INITIAL_VALUE is that value, otherwise it is null. */
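/* For example, 0 is neutral for a sum and 1 for a product: padding the
   vector { a0, a1, a2 } out to { a0, a1, a2, 0 } does not change the
   sum, just as padding with 1 does not change the product.  For
   BIT_AND_EXPR the all-ones value plays that role, while MIN and MAX
   have no universal neutral element and fall back to the single
   initial value, if there is one.  */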
3212 tree
3213 neutral_op_for_reduction (tree scalar_type, code_helper code,
3214 tree initial_value)
3216 if (code.is_tree_code ())
3217 switch (tree_code (code))
3219 case WIDEN_SUM_EXPR:
3220 case DOT_PROD_EXPR:
3221 case SAD_EXPR:
3222 case PLUS_EXPR:
3223 case MINUS_EXPR:
3224 case BIT_IOR_EXPR:
3225 case BIT_XOR_EXPR:
3226 return build_zero_cst (scalar_type);
3228 case MULT_EXPR:
3229 return build_one_cst (scalar_type);
3231 case BIT_AND_EXPR:
3232 return build_all_ones_cst (scalar_type);
3234 case MAX_EXPR:
3235 case MIN_EXPR:
3236 return initial_value;
3238 default:
3239 return NULL_TREE;
3241 else
3242 switch (combined_fn (code))
3244 CASE_CFN_FMIN:
3245 CASE_CFN_FMAX:
3246 return initial_value;
3248 default:
3249 return NULL_TREE;
3253 /* Error reporting helper for vect_is_simple_reduction below. GIMPLE statement
3254 STMT is printed with a message MSG. */
3256 static void
3257 report_vect_op (dump_flags_t msg_type, gimple *stmt, const char *msg)
3259 dump_printf_loc (msg_type, vect_location, "%s%G", msg, stmt);
3262 /* Return true if we need an in-order reduction for operation CODE
3263 on type TYPE, i.e. if the operations cannot safely be
3264 reassociated. */
3266 bool
3267 needs_fold_left_reduction_p (tree type, code_helper code)
3269 /* CHECKME: check for !flag_finite_math_only too? */
3270 if (SCALAR_FLOAT_TYPE_P (type))
3272 if (code.is_tree_code ())
3273 switch (tree_code (code))
3275 case MIN_EXPR:
3276 case MAX_EXPR:
3277 return false;
3279 default:
3280 return !flag_associative_math;
3282 else
3283 switch (combined_fn (code))
3285 CASE_CFN_FMIN:
3286 CASE_CFN_FMAX:
3287 return false;
3289 default:
3290 return !flag_associative_math;
3294 if (INTEGRAL_TYPE_P (type))
3295 return (!code.is_tree_code ()
3296 || !operation_no_trapping_overflow (type, tree_code (code)));
3298 if (SAT_FIXED_POINT_TYPE_P (type))
3299 return true;
3301 return false;
3304 /* Return true if the reduction PHI in LOOP with latch arg LOOP_ARG
3305 has a handled computation expression. Store the main reduction
3306 operation in *CODE. */
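/* As an illustration, for

     a1 = PHI <a0(preheader), a4(latch)>
     a2 = a1 + x;
     a3 = a2 + y;
     a4 = a3 + z;

   walking from the latch argument a4 back to the PHI result a1 visits
   a4, a3 and a2; every statement on the path uses PLUS_EXPR and each
   intermediate value has a single use, so the cycle is accepted with
   *CODE set to PLUS_EXPR (names purely illustrative).  */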
3308 static bool
3309 check_reduction_path (dump_user_location_t loc, loop_p loop, gphi *phi,
3310 tree loop_arg, code_helper *code,
3311 vec<std::pair<ssa_op_iter, use_operand_p> > &path)
3313 auto_bitmap visited;
3314 tree lookfor = PHI_RESULT (phi);
3315 ssa_op_iter curri;
3316 use_operand_p curr = op_iter_init_phiuse (&curri, phi, SSA_OP_USE);
3317 while (USE_FROM_PTR (curr) != loop_arg)
3318 curr = op_iter_next_use (&curri);
3319 curri.i = curri.numops;
3322 path.safe_push (std::make_pair (curri, curr));
3323 tree use = USE_FROM_PTR (curr);
3324 if (use == lookfor)
3325 break;
3326 gimple *def = SSA_NAME_DEF_STMT (use);
3327 if (gimple_nop_p (def)
3328 || ! flow_bb_inside_loop_p (loop, gimple_bb (def)))
3330 pop:
3333 std::pair<ssa_op_iter, use_operand_p> x = path.pop ();
3334 curri = x.first;
3335 curr = x.second;
3337 curr = op_iter_next_use (&curri);
3338 /* Skip already visited or non-SSA operands (from iterating
3339 over PHI args). */
3340 while (curr != NULL_USE_OPERAND_P
3341 && (TREE_CODE (USE_FROM_PTR (curr)) != SSA_NAME
3342 || ! bitmap_set_bit (visited,
3343 SSA_NAME_VERSION
3344 (USE_FROM_PTR (curr)))));
3346 while (curr == NULL_USE_OPERAND_P && ! path.is_empty ());
3347 if (curr == NULL_USE_OPERAND_P)
3348 break;
3350 else
3352 if (gimple_code (def) == GIMPLE_PHI)
3353 curr = op_iter_init_phiuse (&curri, as_a <gphi *>(def), SSA_OP_USE);
3354 else
3355 curr = op_iter_init_use (&curri, def, SSA_OP_USE);
3356 while (curr != NULL_USE_OPERAND_P
3357 && (TREE_CODE (USE_FROM_PTR (curr)) != SSA_NAME
3358 || ! bitmap_set_bit (visited,
3359 SSA_NAME_VERSION
3360 (USE_FROM_PTR (curr)))))
3361 curr = op_iter_next_use (&curri);
3362 if (curr == NULL_USE_OPERAND_P)
3363 goto pop;
3366 while (1);
3367 if (dump_file && (dump_flags & TDF_DETAILS))
3369 dump_printf_loc (MSG_NOTE, loc, "reduction path: ");
3370 unsigned i;
3371 std::pair<ssa_op_iter, use_operand_p> *x;
3372 FOR_EACH_VEC_ELT (path, i, x)
3373 dump_printf (MSG_NOTE, "%T ", USE_FROM_PTR (x->second));
3374 dump_printf (MSG_NOTE, "\n");
3377 /* Check whether the reduction path detected is valid. */
3378 bool fail = path.length () == 0;
3379 bool neg = false;
3380 int sign = -1;
3381 *code = ERROR_MARK;
3382 for (unsigned i = 1; i < path.length (); ++i)
3384 gimple *use_stmt = USE_STMT (path[i].second);
3385 gimple_match_op op;
3386 if (!gimple_extract_op (use_stmt, &op))
3388 fail = true;
3389 break;
3391 unsigned int opi = op.num_ops;
3392 if (gassign *assign = dyn_cast<gassign *> (use_stmt))
3394 /* The following makes sure we can compute the operand index
3395 easily, and it mostly disallows chaining via COND_EXPR condition
3396 operands. */
3397 for (opi = 0; opi < op.num_ops; ++opi)
3398 if (gimple_assign_rhs1_ptr (assign) + opi == path[i].second->use)
3399 break;
3401 else if (gcall *call = dyn_cast<gcall *> (use_stmt))
3403 for (opi = 0; opi < op.num_ops; ++opi)
3404 if (gimple_call_arg_ptr (call, opi) == path[i].second->use)
3405 break;
3407 if (opi == op.num_ops)
3409 fail = true;
3410 break;
3412 op.code = canonicalize_code (op.code, op.type);
3413 if (op.code == MINUS_EXPR)
3415 op.code = PLUS_EXPR;
3416 /* Track whether we negate the reduction value each iteration. */
3417 if (op.ops[1] == op.ops[opi])
3418 neg = ! neg;
3420 if (CONVERT_EXPR_CODE_P (op.code)
3421 && tree_nop_conversion_p (op.type, TREE_TYPE (op.ops[0])))
3423 else if (*code == ERROR_MARK)
3425 *code = op.code;
3426 sign = TYPE_SIGN (op.type);
3428 else if (op.code != *code)
3430 fail = true;
3431 break;
3433 else if ((op.code == MIN_EXPR
3434 || op.code == MAX_EXPR)
3435 && sign != TYPE_SIGN (op.type))
3437 fail = true;
3438 break;
3440 /* Check that the op is used in only a single stmt. For the
3441 non-value-changing tail and the last stmt, allow out-of-loop uses.
3442 ??? We could relax this and handle arbitrary live stmts by
3443 forcing a scalar epilogue for example. */
3444 imm_use_iterator imm_iter;
3445 gimple *op_use_stmt;
3446 unsigned cnt = 0;
3447 FOR_EACH_IMM_USE_STMT (op_use_stmt, imm_iter, op.ops[opi])
3448 if (!is_gimple_debug (op_use_stmt)
3449 && (*code != ERROR_MARK
3450 || flow_bb_inside_loop_p (loop, gimple_bb (op_use_stmt))))
3452 /* We want to allow x + x but not x < 1 ? x : 2. */
3453 if (is_gimple_assign (op_use_stmt)
3454 && gimple_assign_rhs_code (op_use_stmt) == COND_EXPR)
3456 use_operand_p use_p;
3457 FOR_EACH_IMM_USE_ON_STMT (use_p, imm_iter)
3458 cnt++;
3460 else
3461 cnt++;
3463 if (cnt != 1)
3465 fail = true;
3466 break;
3469 return ! fail && ! neg && *code != ERROR_MARK;
3472 bool
3473 check_reduction_path (dump_user_location_t loc, loop_p loop, gphi *phi,
3474 tree loop_arg, enum tree_code code)
3476 auto_vec<std::pair<ssa_op_iter, use_operand_p> > path;
3477 code_helper code_;
3478 return (check_reduction_path (loc, loop, phi, loop_arg, &code_, path)
3479 && code_ == code);
3484 /* Function vect_is_simple_reduction
3486 (1) Detect a cross-iteration def-use cycle that represents a simple
3487 reduction computation. We look for the following pattern:
3489 loop_header:
3490 a1 = phi < a0, a2 >
3491 a3 = ...
3492 a2 = operation (a3, a1)
3496 a3 = ...
3497 loop_header:
3498 a1 = phi < a0, a2 >
3499 a2 = operation (a3, a1)
3501 such that:
3502 1. operation is commutative and associative and it is safe to
3503 change the order of the computation
3504 2. no uses for a2 in the loop (a2 is used out of the loop)
3505 3. no uses of a1 in the loop besides the reduction operation
3506 4. no uses of a1 outside the loop.
3508 Conditions 1,4 are tested here.
3509 Conditions 2,3 are tested in vect_mark_stmts_to_be_vectorized.
3511 (2) Detect a cross-iteration def-use cycle in nested loops, i.e.,
3512 nested cycles.
3514 (3) Detect cycles of phi nodes in outer-loop vectorization, i.e., double
3515 reductions:
3517 a1 = phi < a0, a2 >
3518 inner loop (def of a3)
3519 a2 = phi < a3 >
3521 (4) Detect condition expressions, i.e.:
3522 for (int i = 0; i < N; i++)
3523 if (a[i] < val)
3524 ret_val = a[i];
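     For illustration, pattern (1) typically arises from source code such
     as the following sketch (any reduction operation may take the place
     of the addition):

       sum = init;
       for (i = 0; i < N; i++)
         sum = sum + a[i];

     where a1 is the loop PHI of sum, a3 is a[i] and a2 is the updated sum.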
3528 static stmt_vec_info
3529 vect_is_simple_reduction (loop_vec_info loop_info, stmt_vec_info phi_info,
3530 bool *double_reduc, bool *reduc_chain_p)
3532 gphi *phi = as_a <gphi *> (phi_info->stmt);
3533 gimple *phi_use_stmt = NULL;
3534 imm_use_iterator imm_iter;
3535 use_operand_p use_p;
3537 *double_reduc = false;
3538 *reduc_chain_p = false;
3539 STMT_VINFO_REDUC_TYPE (phi_info) = TREE_CODE_REDUCTION;
3541 tree phi_name = PHI_RESULT (phi);
3542 /* ??? If there are no uses of the PHI result the inner loop reduction
3543 won't be detected as possibly double-reduction by vectorizable_reduction
3544 because that tries to walk the PHI arg from the preheader edge which
3545 can be constant. See PR60382. */
3546 if (has_zero_uses (phi_name))
3547 return NULL;
3548 class loop *loop = (gimple_bb (phi))->loop_father;
3549 unsigned nphi_def_loop_uses = 0;
3550 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, phi_name)
3552 gimple *use_stmt = USE_STMT (use_p);
3553 if (is_gimple_debug (use_stmt))
3554 continue;
3556 if (!flow_bb_inside_loop_p (loop, gimple_bb (use_stmt)))
3558 if (dump_enabled_p ())
3559 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3560 "intermediate value used outside loop.\n");
3562 return NULL;
3565 nphi_def_loop_uses++;
3566 phi_use_stmt = use_stmt;
3569 tree latch_def = PHI_ARG_DEF_FROM_EDGE (phi, loop_latch_edge (loop));
3570 if (TREE_CODE (latch_def) != SSA_NAME)
3572 if (dump_enabled_p ())
3573 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3574 "reduction: not ssa_name: %T\n", latch_def);
3575 return NULL;
3578 stmt_vec_info def_stmt_info = loop_info->lookup_def (latch_def);
3579 if (!def_stmt_info
3580 || !flow_bb_inside_loop_p (loop, gimple_bb (def_stmt_info->stmt)))
3581 return NULL;
3583 bool nested_in_vect_loop
3584 = flow_loop_nested_p (LOOP_VINFO_LOOP (loop_info), loop);
3585 unsigned nlatch_def_loop_uses = 0;
3586 auto_vec<gphi *, 3> lcphis;
3587 bool inner_loop_of_double_reduc = false;
3588 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, latch_def)
3590 gimple *use_stmt = USE_STMT (use_p);
3591 if (is_gimple_debug (use_stmt))
3592 continue;
3593 if (flow_bb_inside_loop_p (loop, gimple_bb (use_stmt)))
3594 nlatch_def_loop_uses++;
3595 else
3597 /* We can have more than one loop-closed PHI. */
3598 lcphis.safe_push (as_a <gphi *> (use_stmt));
3599 if (nested_in_vect_loop
3600 && (STMT_VINFO_DEF_TYPE (loop_info->lookup_stmt (use_stmt))
3601 == vect_double_reduction_def))
3602 inner_loop_of_double_reduc = true;
3606 /* If we are vectorizing an inner reduction, we execute it
3607 in the original order only when we are not dealing with a
3608 double reduction. */
3609 if (nested_in_vect_loop && !inner_loop_of_double_reduc)
3611 if (dump_enabled_p ())
3612 report_vect_op (MSG_NOTE, def_stmt_info->stmt,
3613 "detected nested cycle: ");
3614 return def_stmt_info;
3617 /* When the inner loop of a double reduction ends up with more than
3618 one loop-closed PHI we have failed to classify alternate such
3619 PHIs as double reduction, leading to wrong code. See PR103237. */
3620 if (inner_loop_of_double_reduc && lcphis.length () != 1)
3622 if (dump_enabled_p ())
3623 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3624 "unhandle double reduction\n");
3625 return NULL;
3628 /* If this isn't a nested cycle or if the nested cycle reduction value
3629 is used outside of the inner loop we cannot handle uses of the reduction
3630 value. */
3631 if (nlatch_def_loop_uses > 1 || nphi_def_loop_uses > 1)
3633 if (dump_enabled_p ())
3634 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3635 "reduction used in loop.\n");
3636 return NULL;
3639 /* If DEF_STMT is a phi node itself, we expect it to have a single argument
3640 defined in the inner loop. */
3641 if (gphi *def_stmt = dyn_cast <gphi *> (def_stmt_info->stmt))
3643 tree op1 = PHI_ARG_DEF (def_stmt, 0);
3644 if (gimple_phi_num_args (def_stmt) != 1
3645 || TREE_CODE (op1) != SSA_NAME)
3647 if (dump_enabled_p ())
3648 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3649 "unsupported phi node definition.\n");
3651 return NULL;
3654 gimple *def1 = SSA_NAME_DEF_STMT (op1);
3655 if (gimple_bb (def1)
3656 && flow_bb_inside_loop_p (loop, gimple_bb (def_stmt))
3657 && loop->inner
3658 && flow_bb_inside_loop_p (loop->inner, gimple_bb (def1))
3659 && (is_gimple_assign (def1) || is_gimple_call (def1))
3660 && is_a <gphi *> (phi_use_stmt)
3661 && flow_bb_inside_loop_p (loop->inner, gimple_bb (phi_use_stmt)))
3663 if (dump_enabled_p ())
3664 report_vect_op (MSG_NOTE, def_stmt,
3665 "detected double reduction: ");
3667 *double_reduc = true;
3668 return def_stmt_info;
3671 return NULL;
3674 /* Look for the expression computing latch_def from the loop PHI result. */
3675 auto_vec<std::pair<ssa_op_iter, use_operand_p> > path;
3676 code_helper code;
3677 if (check_reduction_path (vect_location, loop, phi, latch_def, &code,
3678 path))
3680 STMT_VINFO_REDUC_CODE (phi_info) = code;
3681 if (code == COND_EXPR && !nested_in_vect_loop)
3682 STMT_VINFO_REDUC_TYPE (phi_info) = COND_REDUCTION;
3684 /* Fill in STMT_VINFO_REDUC_IDX and gather stmts for an SLP
3685 reduction chain for which the additional restriction is that
3686 all operations in the chain are the same. */
3687 auto_vec<stmt_vec_info, 8> reduc_chain;
3688 unsigned i;
3689 bool is_slp_reduc = !nested_in_vect_loop && code != COND_EXPR;
3690 for (i = path.length () - 1; i >= 1; --i)
3692 gimple *stmt = USE_STMT (path[i].second);
3693 stmt_vec_info stmt_info = loop_info->lookup_stmt (stmt);
3694 gimple_match_op op;
3695 if (!gimple_extract_op (stmt, &op))
3696 gcc_unreachable ();
3697 if (gassign *assign = dyn_cast<gassign *> (stmt))
3698 STMT_VINFO_REDUC_IDX (stmt_info)
3699 = path[i].second->use - gimple_assign_rhs1_ptr (assign);
3700 else
3702 gcall *call = as_a<gcall *> (stmt);
3703 STMT_VINFO_REDUC_IDX (stmt_info)
3704 = path[i].second->use - gimple_call_arg_ptr (call, 0);
3706 bool leading_conversion = (CONVERT_EXPR_CODE_P (op.code)
3707 && (i == 1 || i == path.length () - 1));
3708 if ((op.code != code && !leading_conversion)
3709 /* We can only handle the final value in epilogue
3710 generation for reduction chains. */
3711 || (i != 1 && !has_single_use (gimple_get_lhs (stmt))))
3712 is_slp_reduc = false;
3713 /* For reduction chains we support trailing/leading
3714 conversions. We do not store those in the actual chain. */
3715 if (leading_conversion)
3716 continue;
3717 reduc_chain.safe_push (stmt_info);
3719 if (is_slp_reduc && reduc_chain.length () > 1)
3721 for (unsigned i = 0; i < reduc_chain.length () - 1; ++i)
3723 REDUC_GROUP_FIRST_ELEMENT (reduc_chain[i]) = reduc_chain[0];
3724 REDUC_GROUP_NEXT_ELEMENT (reduc_chain[i]) = reduc_chain[i+1];
3726 REDUC_GROUP_FIRST_ELEMENT (reduc_chain.last ()) = reduc_chain[0];
3727 REDUC_GROUP_NEXT_ELEMENT (reduc_chain.last ()) = NULL;
3729 /* Save the chain for further analysis in SLP detection. */
3730 LOOP_VINFO_REDUCTION_CHAINS (loop_info).safe_push (reduc_chain[0]);
3731 REDUC_GROUP_SIZE (reduc_chain[0]) = reduc_chain.length ();
3733 *reduc_chain_p = true;
3734 if (dump_enabled_p ())
3735 dump_printf_loc (MSG_NOTE, vect_location,
3736 "reduction: detected reduction chain\n");
3738 else if (dump_enabled_p ())
3739 dump_printf_loc (MSG_NOTE, vect_location,
3740 "reduction: detected reduction\n");
3742 return def_stmt_info;
3745 if (dump_enabled_p ())
3746 dump_printf_loc (MSG_NOTE, vect_location,
3747 "reduction: unknown pattern\n");
3749 return NULL;
3752 /* Estimate the number of peeled epilogue iterations for LOOP_VINFO.
3753 PEEL_ITERS_PROLOGUE is the number of peeled prologue iterations,
3754 or -1 if not known. */
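/* For example, with an assumed VF of 8, a known NITERS of 100 and 3 peeled
   prologue iterations this returns (100 - 3) % 8 = 1 (bumped to a full 8 if
   peeling for gaps is required and the remainder would otherwise be zero);
   if NITERS is unknown the estimate falls back to VF / 2 = 4.  */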
3756 static int
3757 vect_get_peel_iters_epilogue (loop_vec_info loop_vinfo, int peel_iters_prologue)
3759 int assumed_vf = vect_vf_for_cost (loop_vinfo);
3760 if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo) || peel_iters_prologue == -1)
3762 if (dump_enabled_p ())
3763 dump_printf_loc (MSG_NOTE, vect_location,
3764 "cost model: epilogue peel iters set to vf/2 "
3765 "because loop iterations are unknown .\n");
3766 return assumed_vf / 2;
3768 else
3770 int niters = LOOP_VINFO_INT_NITERS (loop_vinfo);
3771 peel_iters_prologue = MIN (niters, peel_iters_prologue);
3772 int peel_iters_epilogue = (niters - peel_iters_prologue) % assumed_vf;
3773 /* If we need to peel for gaps but no epilogue peeling would otherwise be
3774 required, we have to peel VF iterations. */
3775 if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo) && !peel_iters_epilogue)
3776 peel_iters_epilogue = assumed_vf;
3777 return peel_iters_epilogue;
3781 /* Calculate cost of peeling the loop PEEL_ITERS_PROLOGUE times. */
3782 int
3783 vect_get_known_peeling_cost (loop_vec_info loop_vinfo, int peel_iters_prologue,
3784 int *peel_iters_epilogue,
3785 stmt_vector_for_cost *scalar_cost_vec,
3786 stmt_vector_for_cost *prologue_cost_vec,
3787 stmt_vector_for_cost *epilogue_cost_vec)
3789 int retval = 0;
3791 *peel_iters_epilogue
3792 = vect_get_peel_iters_epilogue (loop_vinfo, peel_iters_prologue);
3794 if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
3796 /* If peeled iterations are known but the number of scalar loop
3797 iterations is unknown, count a taken branch per peeled loop. */
3798 if (peel_iters_prologue > 0)
3799 retval = record_stmt_cost (prologue_cost_vec, 1, cond_branch_taken,
3800 NULL, NULL_TREE, 0, vect_prologue);
3801 if (*peel_iters_epilogue > 0)
3802 retval += record_stmt_cost (epilogue_cost_vec, 1, cond_branch_taken,
3803 NULL, NULL_TREE, 0, vect_epilogue);
3806 stmt_info_for_cost *si;
3807 int j;
3808 if (peel_iters_prologue)
3809 FOR_EACH_VEC_ELT (*scalar_cost_vec, j, si)
3810 retval += record_stmt_cost (prologue_cost_vec,
3811 si->count * peel_iters_prologue,
3812 si->kind, si->stmt_info, si->misalign,
3813 vect_prologue);
3814 if (*peel_iters_epilogue)
3815 FOR_EACH_VEC_ELT (*scalar_cost_vec, j, si)
3816 retval += record_stmt_cost (epilogue_cost_vec,
3817 si->count * *peel_iters_epilogue,
3818 si->kind, si->stmt_info, si->misalign,
3819 vect_epilogue);
3821 return retval;
3824 /* Function vect_estimate_min_profitable_iters
3826 Return the number of iterations required for the vector version of the
3827 loop to be profitable relative to the cost of the scalar version of the
3828 loop.
3830 *RET_MIN_PROFITABLE_NITERS is a cost model profitability threshold
3831 of iterations for vectorization. A value of -1 means loop vectorization
3832 is not profitable. This returned value may be used for a dynamic
3833 profitability check.
3835 *RET_MIN_PROFITABLE_ESTIMATE is a profitability threshold to be used
3836 for static check against estimated number of iterations. */
3838 static void
3839 vect_estimate_min_profitable_iters (loop_vec_info loop_vinfo,
3840 int *ret_min_profitable_niters,
3841 int *ret_min_profitable_estimate)
3843 int min_profitable_iters;
3844 int min_profitable_estimate;
3845 int peel_iters_prologue;
3846 int peel_iters_epilogue;
3847 unsigned vec_inside_cost = 0;
3848 int vec_outside_cost = 0;
3849 unsigned vec_prologue_cost = 0;
3850 unsigned vec_epilogue_cost = 0;
3851 int scalar_single_iter_cost = 0;
3852 int scalar_outside_cost = 0;
3853 int assumed_vf = vect_vf_for_cost (loop_vinfo);
3854 int npeel = LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo);
3855 vector_costs *target_cost_data = loop_vinfo->vector_costs;
3857 /* Cost model disabled. */
3858 if (unlimited_cost_model (LOOP_VINFO_LOOP (loop_vinfo)))
3860 if (dump_enabled_p ())
3861 dump_printf_loc (MSG_NOTE, vect_location, "cost model disabled.\n");
3862 *ret_min_profitable_niters = 0;
3863 *ret_min_profitable_estimate = 0;
3864 return;
3867 /* Requires loop versioning tests to handle misalignment. */
3868 if (LOOP_REQUIRES_VERSIONING_FOR_ALIGNMENT (loop_vinfo))
3870 /* FIXME: Make cost depend on complexity of individual check. */
3871 unsigned len = LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).length ();
3872 (void) add_stmt_cost (target_cost_data, len, vector_stmt,
3873 NULL, NULL_TREE, 0, vect_prologue);
3874 if (dump_enabled_p ())
3875 dump_printf (MSG_NOTE,
3876 "cost model: Adding cost of checks for loop "
3877 "versioning to treat misalignment.\n");
3880 /* Requires loop versioning with alias checks. */
3881 if (LOOP_REQUIRES_VERSIONING_FOR_ALIAS (loop_vinfo))
3883 /* FIXME: Make cost depend on complexity of individual check. */
3884 unsigned len = LOOP_VINFO_COMP_ALIAS_DDRS (loop_vinfo).length ();
3885 (void) add_stmt_cost (target_cost_data, len, vector_stmt,
3886 NULL, NULL_TREE, 0, vect_prologue);
3887 len = LOOP_VINFO_CHECK_UNEQUAL_ADDRS (loop_vinfo).length ();
3888 if (len)
3889 /* Count LEN - 1 ANDs and LEN comparisons. */
3890 (void) add_stmt_cost (target_cost_data, len * 2 - 1,
3891 scalar_stmt, NULL, NULL_TREE, 0, vect_prologue);
3892 len = LOOP_VINFO_LOWER_BOUNDS (loop_vinfo).length ();
3893 if (len)
3895 /* Count LEN - 1 ANDs and LEN comparisons. */
3896 unsigned int nstmts = len * 2 - 1;
3897 /* +1 for each bias that needs adding. */
3898 for (unsigned int i = 0; i < len; ++i)
3899 if (!LOOP_VINFO_LOWER_BOUNDS (loop_vinfo)[i].unsigned_p)
3900 nstmts += 1;
3901 (void) add_stmt_cost (target_cost_data, nstmts,
3902 scalar_stmt, NULL, NULL_TREE, 0, vect_prologue);
3904 if (dump_enabled_p ())
3905 dump_printf (MSG_NOTE,
3906 "cost model: Adding cost of checks for loop "
3907 "versioning aliasing.\n");
3910 /* Requires loop versioning with niter checks. */
3911 if (LOOP_REQUIRES_VERSIONING_FOR_NITERS (loop_vinfo))
3913 /* FIXME: Make cost depend on complexity of individual check. */
3914 (void) add_stmt_cost (target_cost_data, 1, vector_stmt,
3915 NULL, NULL_TREE, 0, vect_prologue);
3916 if (dump_enabled_p ())
3917 dump_printf (MSG_NOTE,
3918 "cost model: Adding cost of checks for loop "
3919 "versioning niters.\n");
3922 if (LOOP_REQUIRES_VERSIONING (loop_vinfo))
3923 (void) add_stmt_cost (target_cost_data, 1, cond_branch_taken,
3924 NULL, NULL_TREE, 0, vect_prologue);
3926 /* Count statements in scalar loop. Using this as scalar cost for a single
3927 iteration for now.
3929 TODO: Add outer loop support.
3931 TODO: Consider assigning different costs to different scalar
3932 statements. */
3934 scalar_single_iter_cost = loop_vinfo->scalar_costs->total_cost ();
3936 /* Add additional cost for the peeled instructions in prologue and epilogue
3937 loop. (For fully-masked loops there will be no peeling.)
3939 FORNOW: If we don't know the value of peel_iters for prologue or epilogue
3940 at compile time, we assume it's vf/2 (the worst would be vf-1).
3942 TODO: Build an expression that represents peel_iters for prologue and
3943 epilogue to be used in a run-time test. */
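/* As an illustration: with an assumed VF of 8 and unknown alignment peeling
   (npeel < 0), both peel_iters_prologue and peel_iters_epilogue default to
   8 / 2 = 4 below, and each such peeled iteration is charged at the scalar
   single-iteration cost.  */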
3945 bool prologue_need_br_taken_cost = false;
3946 bool prologue_need_br_not_taken_cost = false;
3948 /* Calculate peel_iters_prologue. */
3949 if (vect_use_loop_mask_for_alignment_p (loop_vinfo))
3950 peel_iters_prologue = 0;
3951 else if (npeel < 0)
3953 peel_iters_prologue = assumed_vf / 2;
3954 if (dump_enabled_p ())
3955 dump_printf (MSG_NOTE, "cost model: "
3956 "prologue peel iters set to vf/2.\n");
3958 /* If peeled iterations are unknown, count a taken branch and a not taken
3959 branch per peeled loop. Even if scalar loop iterations are known,
3960 vector iterations are not known since peeled prologue iterations are
3961 not known. Hence guards remain the same. */
3962 prologue_need_br_taken_cost = true;
3963 prologue_need_br_not_taken_cost = true;
3965 else
3967 peel_iters_prologue = npeel;
3968 if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo) && peel_iters_prologue > 0)
3969 /* If peeled iterations are known but the number of scalar loop
3970 iterations is unknown, count a taken branch per peeled loop. */
3971 prologue_need_br_taken_cost = true;
3974 bool epilogue_need_br_taken_cost = false;
3975 bool epilogue_need_br_not_taken_cost = false;
3977 /* Calculate peel_iters_epilogue. */
3978 if (LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))
3979 /* We need to peel exactly one iteration for gaps. */
3980 peel_iters_epilogue = LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo) ? 1 : 0;
3981 else if (npeel < 0)
3983 /* If peeling for alignment is unknown, the loop bound of the main
3984 loop becomes unknown.
3985 peel_iters_epilogue = assumed_vf / 2;
3986 if (dump_enabled_p ())
3987 dump_printf (MSG_NOTE, "cost model: "
3988 "epilogue peel iters set to vf/2 because "
3989 "peeling for alignment is unknown.\n");
3991 /* See the same reasoning above in the peel_iters_prologue calculation. */
3992 epilogue_need_br_taken_cost = true;
3993 epilogue_need_br_not_taken_cost = true;
3995 else
3997 peel_iters_epilogue = vect_get_peel_iters_epilogue (loop_vinfo, npeel);
3998 if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo) && peel_iters_epilogue > 0)
3999 /* If peeled iterations are known but the number of scalar loop
4000 iterations is unknown, count a taken branch per peeled loop. */
4001 epilogue_need_br_taken_cost = true;
4004 stmt_info_for_cost *si;
4005 int j;
4006 /* Add costs associated with peel_iters_prologue. */
4007 if (peel_iters_prologue)
4008 FOR_EACH_VEC_ELT (LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo), j, si)
4010 (void) add_stmt_cost (target_cost_data,
4011 si->count * peel_iters_prologue, si->kind,
4012 si->stmt_info, si->vectype, si->misalign,
4013 vect_prologue);
4016 /* Add costs associated with peel_iters_epilogue. */
4017 if (peel_iters_epilogue)
4018 FOR_EACH_VEC_ELT (LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo), j, si)
4020 (void) add_stmt_cost (target_cost_data,
4021 si->count * peel_iters_epilogue, si->kind,
4022 si->stmt_info, si->vectype, si->misalign,
4023 vect_epilogue);
4026 /* Add possible cond_branch_taken/cond_branch_not_taken cost. */
4028 if (prologue_need_br_taken_cost)
4029 (void) add_stmt_cost (target_cost_data, 1, cond_branch_taken,
4030 NULL, NULL_TREE, 0, vect_prologue);
4032 if (prologue_need_br_not_taken_cost)
4033 (void) add_stmt_cost (target_cost_data, 1,
4034 cond_branch_not_taken, NULL, NULL_TREE, 0,
4035 vect_prologue);
4037 if (epilogue_need_br_taken_cost)
4038 (void) add_stmt_cost (target_cost_data, 1, cond_branch_taken,
4039 NULL, NULL_TREE, 0, vect_epilogue);
4041 if (epilogue_need_br_not_taken_cost)
4042 (void) add_stmt_cost (target_cost_data, 1,
4043 cond_branch_not_taken, NULL, NULL_TREE, 0,
4044 vect_epilogue);
4046 /* Take care of special costs for rgroup controls of partial vectors. */
4047 if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
4049 /* Calculate how many masks we need to generate. */
4050 unsigned int num_masks = 0;
4051 rgroup_controls *rgm;
4052 unsigned int num_vectors_m1;
4053 FOR_EACH_VEC_ELT (LOOP_VINFO_MASKS (loop_vinfo), num_vectors_m1, rgm)
4054 if (rgm->type)
4055 num_masks += num_vectors_m1 + 1;
4056 gcc_assert (num_masks > 0);
4058 /* In the worst case, we need to generate each mask in the prologue
4059 and in the loop body. One of the loop body mask instructions
4060 replaces the comparison in the scalar loop, and since we don't
4061 count the scalar comparison against the scalar body, we shouldn't
4062 count that vector instruction against the vector body either.
4064 Sometimes we can use unpacks instead of generating prologue
4065 masks and sometimes the prologue mask will fold to a constant,
4066 so the actual prologue cost might be smaller. However, it's
4067 simpler and safer to use the worst-case cost; if this ends up
4068 being the tie-breaker between vectorizing or not, then it's
4069 probably better not to vectorize. */
4070 (void) add_stmt_cost (target_cost_data, num_masks,
4071 vector_stmt, NULL, NULL_TREE, 0, vect_prologue);
4072 (void) add_stmt_cost (target_cost_data, num_masks - 1,
4073 vector_stmt, NULL, NULL_TREE, 0, vect_body);
4075 else if (LOOP_VINFO_FULLY_WITH_LENGTH_P (loop_vinfo))
4077 /* Referring to the functions vect_set_loop_condition_partial_vectors
4078 and vect_set_loop_controls_directly, we need to generate each
4079 length in the prologue and in the loop body if required. Although
4080 there are some possible optimizations, we consider the worst case
4081 here. */
4083 bool niters_known_p = LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo);
4084 bool need_iterate_p
4085 = (!LOOP_VINFO_EPILOGUE_P (loop_vinfo)
4086 && !vect_known_niters_smaller_than_vf (loop_vinfo));
4088 /* Calculate how many statements to be added. */
4089 unsigned int prologue_stmts = 0;
4090 unsigned int body_stmts = 0;
4092 rgroup_controls *rgc;
4093 unsigned int num_vectors_m1;
4094 FOR_EACH_VEC_ELT (LOOP_VINFO_LENS (loop_vinfo), num_vectors_m1, rgc)
4095 if (rgc->type)
4097 /* May need one SHIFT for nitems_total computation. */
4098 unsigned nitems = rgc->max_nscalars_per_iter * rgc->factor;
4099 if (nitems != 1 && !niters_known_p)
4100 prologue_stmts += 1;
4102 /* May need one MAX and one MINUS for wrap around. */
4103 if (vect_rgroup_iv_might_wrap_p (loop_vinfo, rgc))
4104 prologue_stmts += 2;
4106 /* Need one MAX and one MINUS for each batch limit except for
4107 the first one. */
4108 prologue_stmts += num_vectors_m1 * 2;
4110 unsigned int num_vectors = num_vectors_m1 + 1;
4112 /* Need to set up lengths in prologue, only one MIN required
4113 for each since start index is zero. */
4114 prologue_stmts += num_vectors;
4116 /* Each may need two MINs and one MINUS to update lengths in body
4117 for next iteration. */
4118 if (need_iterate_p)
4119 body_stmts += 3 * num_vectors;
4122 (void) add_stmt_cost (target_cost_data, prologue_stmts,
4123 scalar_stmt, NULL, NULL_TREE, 0, vect_prologue);
4124 (void) add_stmt_cost (target_cost_data, body_stmts,
4125 scalar_stmt, NULL, NULL_TREE, 0, vect_body);
4128 /* FORNOW: The scalar outside cost is incremented in one of the
4129 following ways:
4131 1. The vectorizer checks for alignment and aliasing and generates
4132 a condition that allows dynamic vectorization. A cost model
4133 check is ANDed with the versioning condition. Hence the scalar code
4134 path now has the added cost of the versioning check.
4136 if (cost > th & versioning_check)
4137 jmp to vector code
4139 Hence run-time scalar is incremented by not-taken branch cost.
4141 2. The vectorizer then checks if a prologue is required. If the
4142 cost model check was not done before during versioning, it has to
4143 be done before the prologue check.
4145 if (cost <= th)
4146 prologue = scalar_iters
4147 if (prologue == 0)
4148 jmp to vector code
4149 else
4150 execute prologue
4151 if (prologue == num_iters)
4152 go to exit
4154 Hence the run-time scalar cost is incremented by a taken branch,
4155 plus a not-taken branch, plus a taken branch cost.
4157 3. The vectorizer then checks if an epilogue is required. If the
4158 cost model check was not done before during prologue check, it
4159 has to be done with the epilogue check.
4161 if (prologue == 0)
4162 jmp to vector code
4163 else
4164 execute prologue
4165 if (prologue == num_iters)
4166 go to exit
4167 vector code:
4168 if ((cost <= th) | (scalar_iters-prologue-epilogue == 0))
4169 jmp to epilogue
4171 Hence the run-time scalar cost should be incremented by 2 taken
4172 branches.
4174 TODO: The back end may reorder the BBs differently and reverse
4175 conditions/branch directions. Change the estimates below to
4176 something more reasonable. */
4178 /* If the number of iterations is known and we do not do versioning, we can
4179 decide whether to vectorize at compile time. Hence the scalar version
4180 does not carry cost model guard costs. */
4181 if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
4182 || LOOP_REQUIRES_VERSIONING (loop_vinfo))
4184 /* Cost model check occurs at versioning. */
4185 if (LOOP_REQUIRES_VERSIONING (loop_vinfo))
4186 scalar_outside_cost += vect_get_stmt_cost (cond_branch_not_taken);
4187 else
4189 /* Cost model check occurs at prologue generation. */
4190 if (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) < 0)
4191 scalar_outside_cost += 2 * vect_get_stmt_cost (cond_branch_taken)
4192 + vect_get_stmt_cost (cond_branch_not_taken);
4193 /* Cost model check occurs at epilogue generation. */
4194 else
4195 scalar_outside_cost += 2 * vect_get_stmt_cost (cond_branch_taken);
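/* As a numeric illustration (costs are hypothetical): if a taken branch
   costs 3 and a not-taken branch costs 1, versioned loops add 1 to the
   scalar outside cost above, unversioned loops with unknown prologue
   peeling add 2 * 3 + 1 = 7, and the remaining case adds 2 * 3 = 6.  */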
4199 /* Complete the target-specific cost calculations. */
4200 finish_cost (loop_vinfo->vector_costs, loop_vinfo->scalar_costs,
4201 &vec_prologue_cost, &vec_inside_cost, &vec_epilogue_cost);
4203 vec_outside_cost = (int)(vec_prologue_cost + vec_epilogue_cost);
4205 if (dump_enabled_p ())
4207 dump_printf_loc (MSG_NOTE, vect_location, "Cost model analysis: \n");
4208 dump_printf (MSG_NOTE, " Vector inside of loop cost: %d\n",
4209 vec_inside_cost);
4210 dump_printf (MSG_NOTE, " Vector prologue cost: %d\n",
4211 vec_prologue_cost);
4212 dump_printf (MSG_NOTE, " Vector epilogue cost: %d\n",
4213 vec_epilogue_cost);
4214 dump_printf (MSG_NOTE, " Scalar iteration cost: %d\n",
4215 scalar_single_iter_cost);
4216 dump_printf (MSG_NOTE, " Scalar outside cost: %d\n",
4217 scalar_outside_cost);
4218 dump_printf (MSG_NOTE, " Vector outside cost: %d\n",
4219 vec_outside_cost);
4220 dump_printf (MSG_NOTE, " prologue iterations: %d\n",
4221 peel_iters_prologue);
4222 dump_printf (MSG_NOTE, " epilogue iterations: %d\n",
4223 peel_iters_epilogue);
4226 /* Calculate number of iterations required to make the vector version
4227 profitable, relative to the loop bodies only. The following condition
4228 must hold true:
4229 SIC * niters + SOC > VIC * ((niters - NPEEL) / VF) + VOC
4230 where
4231 SIC = scalar iteration cost, VIC = vector iteration cost,
4232 VOC = vector outside cost, VF = vectorization factor,
4233 NPEEL = prologue iterations + epilogue iterations,
4234 SOC = scalar outside cost for run time cost model check. */
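/* As a worked example with illustrative costs: SIC = 4, VIC = 8, VF = 4,
   NPEEL = 0, VOC = 40 and SOC = 0. The scalar loop then costs 4 * niters
   while the vector loop costs 8 * (niters / 4) + 40 = 2 * niters + 40, so
   vectorization pays off for niters > 20; saving_per_viter below is
   SIC * VF - VIC = 8.  */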
4236 int saving_per_viter = (scalar_single_iter_cost * assumed_vf
4237 - vec_inside_cost);
4238 if (saving_per_viter <= 0)
4240 if (LOOP_VINFO_LOOP (loop_vinfo)->force_vectorize)
4241 warning_at (vect_location.get_location_t (), OPT_Wopenmp_simd,
4242 "vectorization did not happen for a simd loop");
4244 if (dump_enabled_p ())
4245 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4246 "cost model: the vector iteration cost = %d "
4247 "divided by the scalar iteration cost = %d "
4248 "is greater or equal to the vectorization factor = %d"
4249 ".\n",
4250 vec_inside_cost, scalar_single_iter_cost, assumed_vf);
4251 *ret_min_profitable_niters = -1;
4252 *ret_min_profitable_estimate = -1;
4253 return;
4256 /* ??? The "if" arm is written to handle all cases; see below for what
4257 we would do for !LOOP_VINFO_USING_PARTIAL_VECTORS_P. */
4258 if (LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))
4260 /* Rewriting the condition above in terms of the number of
4261 vector iterations (vniters) rather than the number of
4262 scalar iterations (niters) gives:
4264 SIC * (vniters * VF + NPEEL) + SOC > VIC * vniters + VOC
4266 <==> vniters * (SIC * VF - VIC) > VOC - SIC * NPEEL - SOC
4268 For integer N, X and Y when X > 0:
4270 N * X > Y <==> N >= (Y /[floor] X) + 1. */
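/* E.g. for X = 8 and Y = 37 the smallest such N is 37 / 8 + 1 = 5,
   since 5 * 8 = 40 > 37 while 4 * 8 = 32 is not.  */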
4271 int outside_overhead = (vec_outside_cost
4272 - scalar_single_iter_cost * peel_iters_prologue
4273 - scalar_single_iter_cost * peel_iters_epilogue
4274 - scalar_outside_cost);
4275 /* We're only interested in cases that require at least one
4276 vector iteration. */
4277 int min_vec_niters = 1;
4278 if (outside_overhead > 0)
4279 min_vec_niters = outside_overhead / saving_per_viter + 1;
4281 if (dump_enabled_p ())
4282 dump_printf (MSG_NOTE, " Minimum number of vector iterations: %d\n",
4283 min_vec_niters);
4285 if (LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))
4287 /* Now that we know the minimum number of vector iterations,
4288 find the minimum niters for which the scalar cost is larger:
4290 SIC * niters > VIC * vniters + VOC - SOC
4292 We know that the minimum niters is no more than
4293 vniters * VF + NPEEL, but it might be (and often is) less
4294 than that if a partial vector iteration is cheaper than the
4295 equivalent scalar code. */
4296 int threshold = (vec_inside_cost * min_vec_niters
4297 + vec_outside_cost
4298 - scalar_outside_cost);
4299 if (threshold <= 0)
4300 min_profitable_iters = 1;
4301 else
4302 min_profitable_iters = threshold / scalar_single_iter_cost + 1;
4304 else
4305 /* Convert the number of vector iterations into a number of
4306 scalar iterations. */
4307 min_profitable_iters = (min_vec_niters * assumed_vf
4308 + peel_iters_prologue
4309 + peel_iters_epilogue);
4311 else
4313 min_profitable_iters = ((vec_outside_cost - scalar_outside_cost)
4314 * assumed_vf
4315 - vec_inside_cost * peel_iters_prologue
4316 - vec_inside_cost * peel_iters_epilogue);
4317 if (min_profitable_iters <= 0)
4318 min_profitable_iters = 0;
4319 else
4321 min_profitable_iters /= saving_per_viter;
4323 if ((scalar_single_iter_cost * assumed_vf * min_profitable_iters)
4324 <= (((int) vec_inside_cost * min_profitable_iters)
4325 + (((int) vec_outside_cost - scalar_outside_cost)
4326 * assumed_vf)))
4327 min_profitable_iters++;
4331 if (dump_enabled_p ())
4332 dump_printf (MSG_NOTE,
4333 " Calculated minimum iters for profitability: %d\n",
4334 min_profitable_iters);
4336 if (!LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo)
4337 && min_profitable_iters < (assumed_vf + peel_iters_prologue))
4338 /* We want the vectorized loop to execute at least once. */
4339 min_profitable_iters = assumed_vf + peel_iters_prologue;
4340 else if (min_profitable_iters < peel_iters_prologue)
4341 /* For LOOP_VINFO_USING_PARTIAL_VECTORS_P, we need to ensure the
4342 vectorized loop executes at least once. */
4343 min_profitable_iters = peel_iters_prologue;
4345 if (dump_enabled_p ())
4346 dump_printf_loc (MSG_NOTE, vect_location,
4347 " Runtime profitability threshold = %d\n",
4348 min_profitable_iters);
4350 *ret_min_profitable_niters = min_profitable_iters;
4352 /* Calculate number of iterations required to make the vector version
4353 profitable, relative to the loop bodies only.
4355 The non-vectorized variant costs SIC * niters and it must win over the vector
4356 variant on the expected loop trip count. The following condition must hold true:
4357 SIC * niters > VIC * ((niters - NPEEL) / VF) + VOC + SOC */
4359 if (vec_outside_cost <= 0)
4360 min_profitable_estimate = 0;
4361 /* ??? This "else if" arm is written to handle all cases; see below for
4362 what we would do for !LOOP_VINFO_USING_PARTIAL_VECTORS_P. */
4363 else if (LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))
4365 /* This is a repeat of the code above, but with + SOC rather
4366 than - SOC. */
4367 int outside_overhead = (vec_outside_cost
4368 - scalar_single_iter_cost * peel_iters_prologue
4369 - scalar_single_iter_cost * peel_iters_epilogue
4370 + scalar_outside_cost);
4371 int min_vec_niters = 1;
4372 if (outside_overhead > 0)
4373 min_vec_niters = outside_overhead / saving_per_viter + 1;
4375 if (LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))
4377 int threshold = (vec_inside_cost * min_vec_niters
4378 + vec_outside_cost
4379 + scalar_outside_cost);
4380 min_profitable_estimate = threshold / scalar_single_iter_cost + 1;
4382 else
4383 min_profitable_estimate = (min_vec_niters * assumed_vf
4384 + peel_iters_prologue
4385 + peel_iters_epilogue);
4387 else
4389 min_profitable_estimate = ((vec_outside_cost + scalar_outside_cost)
4390 * assumed_vf
4391 - vec_inside_cost * peel_iters_prologue
4392 - vec_inside_cost * peel_iters_epilogue)
4393 / ((scalar_single_iter_cost * assumed_vf)
4394 - vec_inside_cost);
4396 min_profitable_estimate = MAX (min_profitable_estimate, min_profitable_iters);
4397 if (dump_enabled_p ())
4398 dump_printf_loc (MSG_NOTE, vect_location,
4399 " Static estimate profitability threshold = %d\n",
4400 min_profitable_estimate);
4402 *ret_min_profitable_estimate = min_profitable_estimate;
4405 /* Writes into SEL a mask for a vec_perm, equivalent to a vec_shr by OFFSET
4406 vector elements (not bits) for a vector with NELT elements. */
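/* For example, OFFSET = 2 and NELT = 8 encode the stepped selector
   {2, 3, 4}, which expands to the full mask {2, 3, 4, 5, 6, 7, 8, 9};
   indices 8 and 9 run past the first input and are resolved by
   vec_perm_indices against the second permute input, which is how the
   wrap-around mentioned below is represented.  */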
4407 static void
4408 calc_vec_perm_mask_for_shift (unsigned int offset, unsigned int nelt,
4409 vec_perm_builder *sel)
4411 /* The encoding is a single stepped pattern. Any wrap-around is handled
4412 by vec_perm_indices. */
4413 sel->new_vector (nelt, 1, 3);
4414 for (unsigned int i = 0; i < 3; i++)
4415 sel->quick_push (i + offset);
4418 /* Checks whether the target supports whole-vector shifts for vectors of mode
4419 MODE. This is the case if _either_ the platform handles vec_shr_optab, _or_
4420 it supports vec_perm_const with masks for all necessary shift amounts. */
4421 static bool
4422 have_whole_vector_shift (machine_mode mode)
4424 if (optab_handler (vec_shr_optab, mode) != CODE_FOR_nothing)
4425 return true;
4427 /* Variable-length vectors should be handled via the optab. */
4428 unsigned int nelt;
4429 if (!GET_MODE_NUNITS (mode).is_constant (&nelt))
4430 return false;
4432 vec_perm_builder sel;
4433 vec_perm_indices indices;
4434 for (unsigned int i = nelt / 2; i >= 1; i /= 2)
4436 calc_vec_perm_mask_for_shift (i, nelt, &sel);
4437 indices.new_vector (sel, 2, nelt);
4438 if (!can_vec_perm_const_p (mode, indices, false))
4439 return false;
4441 return true;
4444 /* TODO: There is a close dependency between vect_model_*_cost and
4445 vectorizable_* functions; redesign to avoid maintenance issues. */
4447 /* Function vect_model_reduction_cost.
4449 Models cost for a reduction operation, including the vector ops
4450 generated within the strip-mine loop in some cases, the initial
4451 definition before the loop, and the epilogue code that must be generated. */
4453 static void
4454 vect_model_reduction_cost (loop_vec_info loop_vinfo,
4455 stmt_vec_info stmt_info, internal_fn reduc_fn,
4456 vect_reduction_type reduction_type,
4457 int ncopies, stmt_vector_for_cost *cost_vec)
4459 int prologue_cost = 0, epilogue_cost = 0, inside_cost = 0;
4460 tree vectype;
4461 machine_mode mode;
4462 class loop *loop = NULL;
4464 if (loop_vinfo)
4465 loop = LOOP_VINFO_LOOP (loop_vinfo);
4467 /* Condition reductions generate two reductions in the loop. */
4468 if (reduction_type == COND_REDUCTION)
4469 ncopies *= 2;
4471 vectype = STMT_VINFO_VECTYPE (stmt_info);
4472 mode = TYPE_MODE (vectype);
4473 stmt_vec_info orig_stmt_info = vect_orig_stmt (stmt_info);
4475 gimple_match_op op;
4476 if (!gimple_extract_op (orig_stmt_info->stmt, &op))
4477 gcc_unreachable ();
4479 if (reduction_type == EXTRACT_LAST_REDUCTION)
4480 /* No extra instructions are needed in the prologue. The loop body
4481 operations are costed in vectorizable_condition. */
4482 inside_cost = 0;
4483 else if (reduction_type == FOLD_LEFT_REDUCTION)
4485 /* No extra instructions needed in the prologue. */
4486 prologue_cost = 0;
4488 if (reduc_fn != IFN_LAST)
4489 /* Count one reduction-like operation per vector. */
4490 inside_cost = record_stmt_cost (cost_vec, ncopies, vec_to_scalar,
4491 stmt_info, 0, vect_body);
4492 else
4494 /* Use NELEMENTS extracts and NELEMENTS scalar ops. */
4495 unsigned int nelements = ncopies * vect_nunits_for_cost (vectype);
4496 inside_cost = record_stmt_cost (cost_vec, nelements,
4497 vec_to_scalar, stmt_info, 0,
4498 vect_body);
4499 inside_cost += record_stmt_cost (cost_vec, nelements,
4500 scalar_stmt, stmt_info, 0,
4501 vect_body);
4504 else
4506 /* Add in cost for initial definition.
4507 For cond reduction we have four vectors: initial index, step,
4508 initial result of the data reduction, initial value of the index
4509 reduction. */
4510 int prologue_stmts = reduction_type == COND_REDUCTION ? 4 : 1;
4511 prologue_cost += record_stmt_cost (cost_vec, prologue_stmts,
4512 scalar_to_vec, stmt_info, 0,
4513 vect_prologue);
4516 /* Determine cost of epilogue code.
4518 We have a reduction operator that will reduce the vector in one statement.
4519 Also requires scalar extract. */
4521 if (!loop || !nested_in_vect_loop_p (loop, orig_stmt_info))
4523 if (reduc_fn != IFN_LAST)
4525 if (reduction_type == COND_REDUCTION)
4527 /* An EQ stmt and a COND_EXPR stmt. */
4528 epilogue_cost += record_stmt_cost (cost_vec, 2,
4529 vector_stmt, stmt_info, 0,
4530 vect_epilogue);
4531 /* Reduction of the max index and a reduction of the found
4532 values. */
4533 epilogue_cost += record_stmt_cost (cost_vec, 2,
4534 vec_to_scalar, stmt_info, 0,
4535 vect_epilogue);
4536 /* A broadcast of the max value. */
4537 epilogue_cost += record_stmt_cost (cost_vec, 1,
4538 scalar_to_vec, stmt_info, 0,
4539 vect_epilogue);
4541 else
4543 epilogue_cost += record_stmt_cost (cost_vec, 1, vector_stmt,
4544 stmt_info, 0, vect_epilogue);
4545 epilogue_cost += record_stmt_cost (cost_vec, 1,
4546 vec_to_scalar, stmt_info, 0,
4547 vect_epilogue);
4550 else if (reduction_type == COND_REDUCTION)
4552 unsigned estimated_nunits = vect_nunits_for_cost (vectype);
4553 /* Extraction of scalar elements. */
4554 epilogue_cost += record_stmt_cost (cost_vec,
4555 2 * estimated_nunits,
4556 vec_to_scalar, stmt_info, 0,
4557 vect_epilogue);
4558 /* Scalar max reductions via COND_EXPR / MAX_EXPR. */
4559 epilogue_cost += record_stmt_cost (cost_vec,
4560 2 * estimated_nunits - 3,
4561 scalar_stmt, stmt_info, 0,
4562 vect_epilogue);
4564 else if (reduction_type == EXTRACT_LAST_REDUCTION
4565 || reduction_type == FOLD_LEFT_REDUCTION)
4566 /* No extra instructions needed in the epilogue. */
4568 else
4570 int vec_size_in_bits = tree_to_uhwi (TYPE_SIZE (vectype));
4571 tree bitsize = TYPE_SIZE (op.type);
4572 int element_bitsize = tree_to_uhwi (bitsize);
4573 int nelements = vec_size_in_bits / element_bitsize;
4575 if (op.code == COND_EXPR)
4576 op.code = MAX_EXPR;
4578 /* We have a whole vector shift available. */
4579 if (VECTOR_MODE_P (mode)
4580 && directly_supported_p (op.code, vectype)
4581 && have_whole_vector_shift (mode))
4583 /* Final reduction via vector shifts and the reduction operator.
4584 Also requires scalar extract. */
4585 epilogue_cost += record_stmt_cost (cost_vec,
4586 exact_log2 (nelements) * 2,
4587 vector_stmt, stmt_info, 0,
4588 vect_epilogue);
4589 epilogue_cost += record_stmt_cost (cost_vec, 1,
4590 vec_to_scalar, stmt_info, 0,
4591 vect_epilogue);
4593 else
4594 /* Use extracts and reduction op for final reduction. For N
4595 elements, we have N extracts and N-1 reduction ops. */
4596 epilogue_cost += record_stmt_cost (cost_vec,
4597 nelements + nelements - 1,
4598 vector_stmt, stmt_info, 0,
4599 vect_epilogue);
4603 if (dump_enabled_p ())
4604 dump_printf (MSG_NOTE,
4605 "vect_model_reduction_cost: inside_cost = %d, "
4606 "prologue_cost = %d, epilogue_cost = %d .\n", inside_cost,
4607 prologue_cost, epilogue_cost);
4610 /* SEQ is a sequence of instructions that initialize the reduction
4611 described by REDUC_INFO. Emit them in the appropriate place. */
4613 static void
4614 vect_emit_reduction_init_stmts (loop_vec_info loop_vinfo,
4615 stmt_vec_info reduc_info, gimple *seq)
4617 if (reduc_info->reused_accumulator)
4619 /* When reusing an accumulator from the main loop, we only need
4620 initialization instructions if the main loop can be skipped.
4621 In that case, emit the initialization instructions at the end
4622 of the guard block that does the skip. */
4623 edge skip_edge = loop_vinfo->skip_main_loop_edge;
4624 gcc_assert (skip_edge);
4625 gimple_stmt_iterator gsi = gsi_last_bb (skip_edge->src);
4626 gsi_insert_seq_before (&gsi, seq, GSI_SAME_STMT);
4628 else
4630 /* The normal case: emit the initialization instructions on the
4631 preheader edge. */
4632 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
4633 gsi_insert_seq_on_edge_immediate (loop_preheader_edge (loop), seq);
4637 /* Function get_initial_def_for_reduction
4639 Input:
4640 REDUC_INFO - the info_for_reduction
4641 INIT_VAL - the initial value of the reduction variable
4642 NEUTRAL_OP - a value that has no effect on the reduction, as per
4643 neutral_op_for_reduction
4645 Output:
4646 Return a vector variable, initialized according to the operation that
4647 STMT_VINFO performs. This vector will be used as the initial value
4648 of the vector of partial results.
4650 The value we need is a vector in which element 0 has value INIT_VAL
4651 and every other element has value NEUTRAL_OP. */
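/* For example, for an addition reduction of int with INIT_VAL 5 and a
   V4SI vector type, the vector built here is { 5, 0, 0, 0 }: 0 is the
   neutral value for addition, so summing the lanes of the final partial
   result re-incorporates INIT_VAL exactly once.  */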
4653 static tree
4654 get_initial_def_for_reduction (loop_vec_info loop_vinfo,
4655 stmt_vec_info reduc_info,
4656 tree init_val, tree neutral_op)
4658 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
4659 tree scalar_type = TREE_TYPE (init_val);
4660 tree vectype = get_vectype_for_scalar_type (loop_vinfo, scalar_type);
4661 tree init_def;
4662 gimple_seq stmts = NULL;
4664 gcc_assert (vectype);
4666 gcc_assert (POINTER_TYPE_P (scalar_type) || INTEGRAL_TYPE_P (scalar_type)
4667 || SCALAR_FLOAT_TYPE_P (scalar_type));
4669 gcc_assert (nested_in_vect_loop_p (loop, reduc_info)
4670 || loop == (gimple_bb (reduc_info->stmt))->loop_father);
4672 if (operand_equal_p (init_val, neutral_op))
4674 /* If both elements are equal then the vector described above is
4675 just a splat. */
4676 neutral_op = gimple_convert (&stmts, TREE_TYPE (vectype), neutral_op);
4677 init_def = gimple_build_vector_from_val (&stmts, vectype, neutral_op);
4679 else
4681 neutral_op = gimple_convert (&stmts, TREE_TYPE (vectype), neutral_op);
4682 init_val = gimple_convert (&stmts, TREE_TYPE (vectype), init_val);
4683 if (!TYPE_VECTOR_SUBPARTS (vectype).is_constant ())
4685 /* Construct a splat of NEUTRAL_OP and insert INIT_VAL into
4686 element 0. */
4687 init_def = gimple_build_vector_from_val (&stmts, vectype,
4688 neutral_op);
4689 init_def = gimple_build (&stmts, CFN_VEC_SHL_INSERT,
4690 vectype, init_def, init_val);
4692 else
4694 /* Build {INIT_VAL, NEUTRAL_OP, NEUTRAL_OP, ...}. */
4695 tree_vector_builder elts (vectype, 1, 2);
4696 elts.quick_push (init_val);
4697 elts.quick_push (neutral_op);
4698 init_def = gimple_build_vector (&stmts, &elts);
4702 if (stmts)
4703 vect_emit_reduction_init_stmts (loop_vinfo, reduc_info, stmts);
4704 return init_def;
4707 /* Get at the initial defs for the reduction PHIs for REDUC_INFO,
4708 which performs a reduction involving GROUP_SIZE scalar statements.
4709 NUMBER_OF_VECTORS is the number of vector defs to create. If NEUTRAL_OP
4710 is nonnull, introducing extra elements of that value will not change the
4711 result. */
4713 static void
4714 get_initial_defs_for_reduction (loop_vec_info loop_vinfo,
4715 stmt_vec_info reduc_info,
4716 vec<tree> *vec_oprnds,
4717 unsigned int number_of_vectors,
4718 unsigned int group_size, tree neutral_op)
4720 vec<tree> &initial_values = reduc_info->reduc_initial_values;
4721 unsigned HOST_WIDE_INT nunits;
4722 unsigned j, number_of_places_left_in_vector;
4723 tree vector_type = STMT_VINFO_VECTYPE (reduc_info);
4724 unsigned int i;
4726 gcc_assert (group_size == initial_values.length () || neutral_op);
4728 /* NUMBER_OF_COPIES is the number of times we need to use the same values in
4729 created vectors. It is greater than 1 if unrolling is performed.
4731 For example, we have two scalar operands, s1 and s2 (e.g., group of
4732 strided accesses of size two), while NUNITS is four (i.e., four scalars
4733 of this type can be packed in a vector). The output vector will contain
4734 two copies of each scalar operand: {s1, s2, s1, s2}. (NUMBER_OF_COPIES
4735 will be 2).
4737 If REDUC_GROUP_SIZE > NUNITS, the scalars will be split into several
4738 vectors containing the operands.
4740 For example, NUNITS is four as before, and the group size is 8
4741 (s1, s2, ..., s8). We will create two vectors {s1, s2, s3, s4} and
4742 {s5, s6, s7, s8}. */
4744 if (!TYPE_VECTOR_SUBPARTS (vector_type).is_constant (&nunits))
4745 nunits = group_size;
4747 number_of_places_left_in_vector = nunits;
4748 bool constant_p = true;
4749 tree_vector_builder elts (vector_type, nunits, 1);
4750 elts.quick_grow (nunits);
4751 gimple_seq ctor_seq = NULL;
4752 for (j = 0; j < nunits * number_of_vectors; ++j)
4754 tree op;
4755 i = j % group_size;
4757 /* Get the def before the loop. In a reduction chain we have only
4758 one initial value. Else we have as many as there are PHIs in the group. */
4759 if (i >= initial_values.length () || (j > i && neutral_op))
4760 op = neutral_op;
4761 else
4762 op = initial_values[i];
4764 /* Create 'vect_ = {op0,op1,...,opn}'. */
4765 number_of_places_left_in_vector--;
4766 elts[nunits - number_of_places_left_in_vector - 1] = op;
4767 if (!CONSTANT_CLASS_P (op))
4768 constant_p = false;
4770 if (number_of_places_left_in_vector == 0)
4772 tree init;
4773 if (constant_p && !neutral_op
4774 ? multiple_p (TYPE_VECTOR_SUBPARTS (vector_type), nunits)
4775 : known_eq (TYPE_VECTOR_SUBPARTS (vector_type), nunits))
4776 /* Build the vector directly from ELTS. */
4777 init = gimple_build_vector (&ctor_seq, &elts);
4778 else if (neutral_op)
4780 /* Build a vector of the neutral value and shift the
4781 other elements into place. */
4782 init = gimple_build_vector_from_val (&ctor_seq, vector_type,
4783 neutral_op);
4784 int k = nunits;
4785 while (k > 0 && elts[k - 1] == neutral_op)
4786 k -= 1;
4787 while (k > 0)
4789 k -= 1;
4790 init = gimple_build (&ctor_seq, CFN_VEC_SHL_INSERT,
4791 vector_type, init, elts[k]);
4794 else
4796 /* First time round, duplicate ELTS to fill the
4797 required number of vectors. */
4798 duplicate_and_interleave (loop_vinfo, &ctor_seq, vector_type,
4799 elts, number_of_vectors, *vec_oprnds);
4800 break;
4802 vec_oprnds->quick_push (init);
4804 number_of_places_left_in_vector = nunits;
4805 elts.new_vector (vector_type, nunits, 1);
4806 elts.quick_grow (nunits);
4807 constant_p = true;
4810 if (ctor_seq != NULL)
4811 vect_emit_reduction_init_stmts (loop_vinfo, reduc_info, ctor_seq);
4814 /* For a statement STMT_INFO taking part in a reduction operation return
4815 the stmt_vec_info the meta information is stored on. */
4817 stmt_vec_info
4818 info_for_reduction (vec_info *vinfo, stmt_vec_info stmt_info)
4820 stmt_info = vect_orig_stmt (stmt_info);
4821 gcc_assert (STMT_VINFO_REDUC_DEF (stmt_info));
4822 if (!is_a <gphi *> (stmt_info->stmt)
4823 || !VECTORIZABLE_CYCLE_DEF (STMT_VINFO_DEF_TYPE (stmt_info)))
4824 stmt_info = STMT_VINFO_REDUC_DEF (stmt_info);
4825 gphi *phi = as_a <gphi *> (stmt_info->stmt);
4826 if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_double_reduction_def)
4828 if (gimple_phi_num_args (phi) == 1)
4829 stmt_info = STMT_VINFO_REDUC_DEF (stmt_info);
4831 else if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_nested_cycle)
4833 stmt_vec_info info = vinfo->lookup_def (vect_phi_initial_value (phi));
4834 if (info && STMT_VINFO_DEF_TYPE (info) == vect_double_reduction_def)
4835 stmt_info = info;
4837 return stmt_info;
4840 /* See if LOOP_VINFO is an epilogue loop whose main loop had a reduction that
4841 REDUC_INFO can build on. Adjust REDUC_INFO and return true if so, otherwise
4842 return false. */
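/* For example, a main loop accumulating into a V8SI vector can seed a V4SI
   epilogue loop, since 8 is a multiple of 4; the wider accumulator is then
   folded down to V4SI (see vect_create_partial_epilog) rather than first
   being reduced all the way to a scalar.  */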
4844 static bool
4845 vect_find_reusable_accumulator (loop_vec_info loop_vinfo,
4846 stmt_vec_info reduc_info)
4848 loop_vec_info main_loop_vinfo = LOOP_VINFO_ORIG_LOOP_INFO (loop_vinfo);
4849 if (!main_loop_vinfo)
4850 return false;
4852 if (STMT_VINFO_REDUC_TYPE (reduc_info) != TREE_CODE_REDUCTION)
4853 return false;
4855 unsigned int num_phis = reduc_info->reduc_initial_values.length ();
4856 auto_vec<tree, 16> main_loop_results (num_phis);
4857 auto_vec<tree, 16> initial_values (num_phis);
4858 if (edge main_loop_edge = loop_vinfo->main_loop_edge)
4860 /* The epilogue loop can be entered either from the main loop or
4861 from an earlier guard block. */
4862 edge skip_edge = loop_vinfo->skip_main_loop_edge;
4863 for (tree incoming_value : reduc_info->reduc_initial_values)
4865 /* Look for:
4867 INCOMING_VALUE = phi<MAIN_LOOP_RESULT(main loop),
4868 INITIAL_VALUE(guard block)>. */
4869 gcc_assert (TREE_CODE (incoming_value) == SSA_NAME);
4871 gphi *phi = as_a <gphi *> (SSA_NAME_DEF_STMT (incoming_value));
4872 gcc_assert (gimple_bb (phi) == main_loop_edge->dest);
4874 tree from_main_loop = PHI_ARG_DEF_FROM_EDGE (phi, main_loop_edge);
4875 tree from_skip = PHI_ARG_DEF_FROM_EDGE (phi, skip_edge);
4877 main_loop_results.quick_push (from_main_loop);
4878 initial_values.quick_push (from_skip);
4881 else
4882 /* The main loop dominates the epilogue loop. */
4883 main_loop_results.splice (reduc_info->reduc_initial_values);
4885 /* See if the main loop has the kind of accumulator we need. */
4886 vect_reusable_accumulator *accumulator
4887 = main_loop_vinfo->reusable_accumulators.get (main_loop_results[0]);
4888 if (!accumulator
4889 || num_phis != accumulator->reduc_info->reduc_scalar_results.length ()
4890 || !std::equal (main_loop_results.begin (), main_loop_results.end (),
4891 accumulator->reduc_info->reduc_scalar_results.begin ()))
4892 return false;
4894 /* Handle the case where we can reduce wider vectors to narrower ones. */
4895 tree vectype = STMT_VINFO_VECTYPE (reduc_info);
4896 tree old_vectype = TREE_TYPE (accumulator->reduc_input);
4897 if (!constant_multiple_p (TYPE_VECTOR_SUBPARTS (old_vectype),
4898 TYPE_VECTOR_SUBPARTS (vectype)))
4899 return false;
4901 /* Non-SLP reductions might apply an adjustment after the reduction
4902 operation, in order to simplify the initialization of the accumulator.
4903 If the epilogue loop carries on from where the main loop left off,
4904 it should apply the same adjustment to the final reduction result.
4906 If the epilogue loop can also be entered directly (rather than via
4907 the main loop), we need to be able to handle that case in the same way,
4908 with the same adjustment. (In principle we could add a PHI node
4909 to select the correct adjustment, but in practice that shouldn't be
4910 necessary.) */
4911 tree main_adjustment
4912 = STMT_VINFO_REDUC_EPILOGUE_ADJUSTMENT (accumulator->reduc_info);
4913 if (loop_vinfo->main_loop_edge && main_adjustment)
4915 gcc_assert (num_phis == 1);
4916 tree initial_value = initial_values[0];
4917 /* Check that we can use INITIAL_VALUE as the adjustment and
4918 initialize the accumulator with a neutral value instead. */
4919 if (!operand_equal_p (initial_value, main_adjustment))
4920 return false;
4921 code_helper code = STMT_VINFO_REDUC_CODE (reduc_info);
4922 initial_values[0] = neutral_op_for_reduction (TREE_TYPE (initial_value),
4923 code, initial_value);
4925 STMT_VINFO_REDUC_EPILOGUE_ADJUSTMENT (reduc_info) = main_adjustment;
4926 reduc_info->reduc_initial_values.truncate (0);
4927 reduc_info->reduc_initial_values.splice (initial_values);
4928 reduc_info->reused_accumulator = accumulator;
4929 return true;
4932 /* Reduce the vector VEC_DEF down to VECTYPE with reduction operation
4933 CODE, emitting stmts into SEQ. Returns a vector def of VECTYPE. */
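/* For example, reducing a V8SI VEC_DEF to a V4SI VECTYPE with a PLUS code
   takes one iteration of the loop below: the low and high V4SI halves are
   extracted (directly, or via a punning view in an integer mode) and added,
   halving NUNITS from 8 to 4.  */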
4935 static tree
4936 vect_create_partial_epilog (tree vec_def, tree vectype, code_helper code,
4937 gimple_seq *seq)
4939 unsigned nunits = TYPE_VECTOR_SUBPARTS (TREE_TYPE (vec_def)).to_constant ();
4940 unsigned nunits1 = TYPE_VECTOR_SUBPARTS (vectype).to_constant ();
4941 tree stype = TREE_TYPE (vectype);
4942 tree new_temp = vec_def;
4943 while (nunits > nunits1)
4945 nunits /= 2;
4946 tree vectype1 = get_related_vectype_for_scalar_type (TYPE_MODE (vectype),
4947 stype, nunits);
4948 unsigned int bitsize = tree_to_uhwi (TYPE_SIZE (vectype1));
4950 /* The target has to make sure we support lowpart/highpart
4951 extraction, either via direct vector extract or through
4952 integer mode punning. */
4953 tree dst1, dst2;
4954 gimple *epilog_stmt;
4955 if (convert_optab_handler (vec_extract_optab,
4956 TYPE_MODE (TREE_TYPE (new_temp)),
4957 TYPE_MODE (vectype1))
4958 != CODE_FOR_nothing)
4960 /* Extract sub-vectors directly once vec_extract becomes
4961 a conversion optab. */
4962 dst1 = make_ssa_name (vectype1);
4963 epilog_stmt
4964 = gimple_build_assign (dst1, BIT_FIELD_REF,
4965 build3 (BIT_FIELD_REF, vectype1,
4966 new_temp, TYPE_SIZE (vectype1),
4967 bitsize_int (0)));
4968 gimple_seq_add_stmt_without_update (seq, epilog_stmt);
4969 dst2 = make_ssa_name (vectype1);
4970 epilog_stmt
4971 = gimple_build_assign (dst2, BIT_FIELD_REF,
4972 build3 (BIT_FIELD_REF, vectype1,
4973 new_temp, TYPE_SIZE (vectype1),
4974 bitsize_int (bitsize)));
4975 gimple_seq_add_stmt_without_update (seq, epilog_stmt);
4977 else
4979 /* Extract via punning to appropriately sized integer mode
4980 vector. */
4981 tree eltype = build_nonstandard_integer_type (bitsize, 1);
4982 tree etype = build_vector_type (eltype, 2);
4983 gcc_assert (convert_optab_handler (vec_extract_optab,
4984 TYPE_MODE (etype),
4985 TYPE_MODE (eltype))
4986 != CODE_FOR_nothing);
4987 tree tem = make_ssa_name (etype);
4988 epilog_stmt = gimple_build_assign (tem, VIEW_CONVERT_EXPR,
4989 build1 (VIEW_CONVERT_EXPR,
4990 etype, new_temp));
4991 gimple_seq_add_stmt_without_update (seq, epilog_stmt);
4992 new_temp = tem;
4993 tem = make_ssa_name (eltype);
4994 epilog_stmt
4995 = gimple_build_assign (tem, BIT_FIELD_REF,
4996 build3 (BIT_FIELD_REF, eltype,
4997 new_temp, TYPE_SIZE (eltype),
4998 bitsize_int (0)));
4999 gimple_seq_add_stmt_without_update (seq, epilog_stmt);
5000 dst1 = make_ssa_name (vectype1);
5001 epilog_stmt = gimple_build_assign (dst1, VIEW_CONVERT_EXPR,
5002 build1 (VIEW_CONVERT_EXPR,
5003 vectype1, tem));
5004 gimple_seq_add_stmt_without_update (seq, epilog_stmt);
5005 tem = make_ssa_name (eltype);
5006 epilog_stmt
5007 = gimple_build_assign (tem, BIT_FIELD_REF,
5008 build3 (BIT_FIELD_REF, eltype,
5009 new_temp, TYPE_SIZE (eltype),
5010 bitsize_int (bitsize)));
5011 gimple_seq_add_stmt_without_update (seq, epilog_stmt);
5012 dst2 = make_ssa_name (vectype1);
5013 epilog_stmt = gimple_build_assign (dst2, VIEW_CONVERT_EXPR,
5014 build1 (VIEW_CONVERT_EXPR,
5015 vectype1, tem));
5016 gimple_seq_add_stmt_without_update (seq, epilog_stmt);
5019 new_temp = gimple_build (seq, code, vectype1, dst1, dst2);
5022 return new_temp;
5025 /* Function vect_create_epilog_for_reduction
5027 Create code at the loop-epilog to finalize the result of a reduction
5028 computation.
5030 STMT_INFO is the scalar reduction stmt that is being vectorized.
5031 SLP_NODE is an SLP node containing a group of reduction statements. The
5032 first one in this group is STMT_INFO.
5033 SLP_NODE_INSTANCE is the SLP node instance containing SLP_NODE
5034 REDUC_INDEX says which rhs operand of STMT_INFO is the reduction phi
5035 (counting from 0).
5037 This function:
5038 1. Completes the reduction def-use cycles.
5039 2. "Reduces" each vector of partial results VECT_DEFS into a single result,
5040 by calling the function specified by REDUC_FN if available, or by
5041 other means (whole-vector shifts or a scalar loop).
5042 The function also creates a new phi node at the loop exit to preserve
5043 loop-closed form, as illustrated below.
5045 The flow at the entry to this function:
5047 loop:
5048 vec_def = phi <vec_init, null> # REDUCTION_PHI
5049 VECT_DEF = vector_stmt # vectorized form of STMT_INFO
5050 s_loop = scalar_stmt # (scalar) STMT_INFO
5051 loop_exit:
5052 s_out0 = phi <s_loop> # (scalar) EXIT_PHI
5053 use <s_out0>
5054 use <s_out0>
5056 The above is transformed by this function into:
5058 loop:
5059 vec_def = phi <vec_init, VECT_DEF> # REDUCTION_PHI
5060 VECT_DEF = vector_stmt # vectorized form of STMT_INFO
5061 s_loop = scalar_stmt # (scalar) STMT_INFO
5062 loop_exit:
5063 s_out0 = phi <s_loop> # (scalar) EXIT_PHI
5064 v_out1 = phi <VECT_DEF> # NEW_EXIT_PHI
5065 v_out2 = reduce <v_out1>
5066 s_out3 = extract_field <v_out2, 0>
5067 s_out4 = adjust_result <s_out3>
5068 use <s_out4>
5069 use <s_out4>
5072 static void
5073 vect_create_epilog_for_reduction (loop_vec_info loop_vinfo,
5074 stmt_vec_info stmt_info,
5075 slp_tree slp_node,
5076 slp_instance slp_node_instance)
5078 stmt_vec_info reduc_info = info_for_reduction (loop_vinfo, stmt_info);
5079 gcc_assert (reduc_info->is_reduc_info);
5080 /* For double reductions we need to get at the inner loop reduction
5081 stmt which has the meta info attached. Our stmt_info is that of the
5082 loop-closed PHI of the inner loop which we remember as
5083 def for the reduction PHI generation. */
5084 bool double_reduc = false;
5085 stmt_vec_info rdef_info = stmt_info;
5086 if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_double_reduction_def)
5088 gcc_assert (!slp_node);
5089 double_reduc = true;
5090 stmt_info = loop_vinfo->lookup_def (gimple_phi_arg_def
5091 (stmt_info->stmt, 0));
5092 stmt_info = vect_stmt_to_vectorize (stmt_info);
5094 gphi *reduc_def_stmt
5095 = as_a <gphi *> (STMT_VINFO_REDUC_DEF (vect_orig_stmt (stmt_info))->stmt);
5096 code_helper code = STMT_VINFO_REDUC_CODE (reduc_info);
5097 internal_fn reduc_fn = STMT_VINFO_REDUC_FN (reduc_info);
5098 tree vectype;
5099 machine_mode mode;
5100 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo), *outer_loop = NULL;
5101 basic_block exit_bb;
5102 tree scalar_dest;
5103 tree scalar_type;
5104 gimple *new_phi = NULL, *phi;
5105 gimple_stmt_iterator exit_gsi;
5106 tree new_temp = NULL_TREE, new_name, new_scalar_dest;
5107 gimple *epilog_stmt = NULL;
5108 gimple *exit_phi;
5109 tree bitsize;
5110 tree def;
5111 tree orig_name, scalar_result;
5112 imm_use_iterator imm_iter, phi_imm_iter;
5113 use_operand_p use_p, phi_use_p;
5114 gimple *use_stmt;
5115 auto_vec<tree> reduc_inputs;
5116 int j, i;
5117 vec<tree> &scalar_results = reduc_info->reduc_scalar_results;
5118 unsigned int group_size = 1, k;
5119 auto_vec<gimple *> phis;
5120 /* SLP reduction without reduction chain, e.g.,
5121 # a1 = phi <a2, a0>
5122 # b1 = phi <b2, b0>
5123 a2 = operation (a1)
5124 b2 = operation (b1) */
5125 bool slp_reduc = (slp_node && !REDUC_GROUP_FIRST_ELEMENT (stmt_info));
5126 bool direct_slp_reduc;
5127 tree induction_index = NULL_TREE;
5129 if (slp_node)
5130 group_size = SLP_TREE_LANES (slp_node);
5132 if (nested_in_vect_loop_p (loop, stmt_info))
5134 outer_loop = loop;
5135 loop = loop->inner;
5136 gcc_assert (!slp_node && double_reduc);
5139 vectype = STMT_VINFO_REDUC_VECTYPE (reduc_info);
5140 gcc_assert (vectype);
5141 mode = TYPE_MODE (vectype);
5143 tree induc_val = NULL_TREE;
5144 tree adjustment_def = NULL;
5145 if (slp_node)
5147 else
5149 /* Optimize: for induction condition reduction, if we can't use zero
5150 for induc_val, use initial_def. */
5151 if (STMT_VINFO_REDUC_TYPE (reduc_info) == INTEGER_INDUC_COND_REDUCTION)
5152 induc_val = STMT_VINFO_VEC_INDUC_COND_INITIAL_VAL (reduc_info);
5153 else if (double_reduc)
5155 else
5156 adjustment_def = STMT_VINFO_REDUC_EPILOGUE_ADJUSTMENT (reduc_info);
5159 stmt_vec_info single_live_out_stmt[] = { stmt_info };
5160 array_slice<const stmt_vec_info> live_out_stmts = single_live_out_stmt;
5161 if (slp_reduc)
5162 /* All statements produce live-out values. */
5163 live_out_stmts = SLP_TREE_SCALAR_STMTS (slp_node);
5164 else if (slp_node)
5165 /* The last statement in the reduction chain produces the live-out
5166 value. */
5167 single_live_out_stmt[0] = SLP_TREE_SCALAR_STMTS (slp_node)[group_size - 1];
5169 unsigned vec_num;
5170 int ncopies;
5171 if (slp_node)
5173 vec_num = SLP_TREE_VEC_STMTS (slp_node_instance->reduc_phis).length ();
5174 ncopies = 1;
5176 else
5178 stmt_vec_info reduc_info = loop_vinfo->lookup_stmt (reduc_def_stmt);
5179 vec_num = 1;
5180 ncopies = STMT_VINFO_VEC_STMTS (reduc_info).length ();
5183 /* For cond reductions we want to create a new vector (INDEX_COND_EXPR)
5184 which is updated with the current index of the loop for every match of
5185 the original loop's cond_expr (VEC_STMT). This results in a vector
5186 containing the last time the condition passed for that vector lane.
5187 The first match will be a 1 to allow 0 to be used for non-matching
5188 indexes. If there are no matches at all then the vector will be all
5189 zeroes.
5191 PR92772: This algorithm is broken for architectures that support
5192 masked vectors, but do not provide fold_extract_last. */
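  /* Illustrative sketch (the source loop here is an example, not taken
     from this function): a COND_REDUCTION typically comes from

       for (i = 0; i < N; i++)
	 if (a[i] < limit)
	   last = a[i];

     The vector built below gives every lane the (1-based) loop index of
     the last iteration in which its condition held, or 0 if it never
     did; the epilogue then uses that index vector to pick the data lane
     that was written last.  */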
5193 if (STMT_VINFO_REDUC_TYPE (reduc_info) == COND_REDUCTION)
5195 auto_vec<std::pair<tree, bool>, 2> ccompares;
5196 stmt_vec_info cond_info = STMT_VINFO_REDUC_DEF (reduc_info);
5197 cond_info = vect_stmt_to_vectorize (cond_info);
5198 while (cond_info != reduc_info)
5200 if (gimple_assign_rhs_code (cond_info->stmt) == COND_EXPR)
5202 gimple *vec_stmt = STMT_VINFO_VEC_STMTS (cond_info)[0];
5203 gcc_assert (gimple_assign_rhs_code (vec_stmt) == VEC_COND_EXPR);
5204 ccompares.safe_push
5205 (std::make_pair (unshare_expr (gimple_assign_rhs1 (vec_stmt)),
5206 STMT_VINFO_REDUC_IDX (cond_info) == 2));
5208 cond_info
5209 = loop_vinfo->lookup_def (gimple_op (cond_info->stmt,
5210 1 + STMT_VINFO_REDUC_IDX
5211 (cond_info)));
5212 cond_info = vect_stmt_to_vectorize (cond_info);
5214 gcc_assert (ccompares.length () != 0);
5216 tree indx_before_incr, indx_after_incr;
5217 poly_uint64 nunits_out = TYPE_VECTOR_SUBPARTS (vectype);
5218 int scalar_precision
5219 = GET_MODE_PRECISION (SCALAR_TYPE_MODE (TREE_TYPE (vectype)));
5220 tree cr_index_scalar_type = make_unsigned_type (scalar_precision);
5221 tree cr_index_vector_type = get_related_vectype_for_scalar_type
5222 (TYPE_MODE (vectype), cr_index_scalar_type,
5223 TYPE_VECTOR_SUBPARTS (vectype));
5225 /* First we create a simple vector induction variable which starts
5226 with the values {1,2,3,...} (SERIES_VECT) and increments by the
5227 vector size (STEP). */
5229 /* Create a {1,2,3,...} vector. */
5230 tree series_vect = build_index_vector (cr_index_vector_type, 1, 1);
5232 /* Create a vector of the step value. */
5233 tree step = build_int_cst (cr_index_scalar_type, nunits_out);
5234 tree vec_step = build_vector_from_val (cr_index_vector_type, step);
5236 /* Create an induction variable. */
5237 gimple_stmt_iterator incr_gsi;
5238 bool insert_after;
5239 standard_iv_increment_position (loop, &incr_gsi, &insert_after);
5240 create_iv (series_vect, vec_step, NULL_TREE, loop, &incr_gsi,
5241 insert_after, &indx_before_incr, &indx_after_incr);
5243 /* Next create a new phi node vector (NEW_PHI_TREE) which starts
5244 filled with zeros (VEC_ZERO). */
5246 /* Create a vector of 0s. */
5247 tree zero = build_zero_cst (cr_index_scalar_type);
5248 tree vec_zero = build_vector_from_val (cr_index_vector_type, zero);
5250 /* Create a vector phi node. */
5251 tree new_phi_tree = make_ssa_name (cr_index_vector_type);
5252 new_phi = create_phi_node (new_phi_tree, loop->header);
5253 add_phi_arg (as_a <gphi *> (new_phi), vec_zero,
5254 loop_preheader_edge (loop), UNKNOWN_LOCATION);
5256 /* Now take the condition from the loop's original cond_exprs
5257 and produce a new cond_expr (INDEX_COND_EXPR) which for
5258 every match uses values from the induction variable
5259 (INDEX_BEFORE_INCR) otherwise uses values from the phi node
5260 (NEW_PHI_TREE).
5261 Finally, we update the phi (NEW_PHI_TREE) to take the value of
5262 the new cond_expr (INDEX_COND_EXPR). */
5263 gimple_seq stmts = NULL;
5264 for (int i = ccompares.length () - 1; i != -1; --i)
5266 tree ccompare = ccompares[i].first;
5267 if (ccompares[i].second)
5268 new_phi_tree = gimple_build (&stmts, VEC_COND_EXPR,
5269 cr_index_vector_type,
5270 ccompare,
5271 indx_before_incr, new_phi_tree);
5272 else
5273 new_phi_tree = gimple_build (&stmts, VEC_COND_EXPR,
5274 cr_index_vector_type,
5275 ccompare,
5276 new_phi_tree, indx_before_incr);
5278 gsi_insert_seq_before (&incr_gsi, stmts, GSI_SAME_STMT);
5280 /* Update the phi with the vec cond. */
5281 induction_index = new_phi_tree;
5282 add_phi_arg (as_a <gphi *> (new_phi), induction_index,
5283 loop_latch_edge (loop), UNKNOWN_LOCATION);
5286 /* 2. Create epilog code.
5287 The reduction epilog code operates across the elements of the vector
5288 of partial results computed by the vectorized loop.
5289 The reduction epilog code consists of:
5291 step 1: compute the scalar result in a vector (v_out2)
5292 step 2: extract the scalar result (s_out3) from the vector (v_out2)
5293 step 3: adjust the scalar result (s_out3) if needed.
5295 Step 1 can be accomplished using one of the following three schemes:
5296 (scheme 1) using reduc_fn, if available.
5297 (scheme 2) using whole-vector shifts, if available.
5298 (scheme 3) using a scalar loop. In this case steps 1+2 above are
5299 combined.
5301 The overall epilog code looks like this:
5303 s_out0 = phi <s_loop> # original EXIT_PHI
5304 v_out1 = phi <VECT_DEF> # NEW_EXIT_PHI
5305 v_out2 = reduce <v_out1> # step 1
5306 s_out3 = extract_field <v_out2, 0> # step 2
5307 s_out4 = adjust_result <s_out3> # step 3
5309 (step 3 is optional, and steps 1 and 2 may be combined).
5310 Lastly, the uses of s_out0 are replaced by s_out4. */
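  /* Illustrative example (values are made up): with a 4-lane addition
     and v_out1 = {a, b, c, d}, scheme 1 produces a+b+c+d with a single
     direct reduction operation, scheme 2 adds shifted copies of the
     vector to itself and then extracts lane 0, and scheme 3 extracts
     all four lanes and adds them with scalar statements.  */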
5313 /* 2.1 Create new loop-exit-phis to preserve loop-closed form:
5314 v_out1 = phi <VECT_DEF>
5315 Store the (converted) vector results in REDUC_INPUTS. */
5316 if (double_reduc)
5317 loop = outer_loop;
5318 exit_bb = single_exit (loop)->dest;
5319 exit_gsi = gsi_after_labels (exit_bb);
5320 reduc_inputs.create (slp_node ? vec_num : ncopies);
5321 for (unsigned i = 0; i < vec_num; i++)
5323 gimple_seq stmts = NULL;
5324 if (slp_node)
5325 def = vect_get_slp_vect_def (slp_node, i);
5326 else
5327 def = gimple_get_lhs (STMT_VINFO_VEC_STMTS (rdef_info)[0]);
5328 for (j = 0; j < ncopies; j++)
5330 tree new_def = copy_ssa_name (def);
5331 phi = create_phi_node (new_def, exit_bb);
5332 if (j)
5333 def = gimple_get_lhs (STMT_VINFO_VEC_STMTS (rdef_info)[j]);
5334 SET_PHI_ARG_DEF (phi, single_exit (loop)->dest_idx, def);
5335 new_def = gimple_convert (&stmts, vectype, new_def);
5336 reduc_inputs.quick_push (new_def);
5338 gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
5341 /* 2.2 Get the relevant tree-code to use in the epilog for schemes 2,3
5342 (i.e. when reduc_fn is not available) and in the final adjustment
5343 code (if needed). Also get the original scalar reduction variable as
5344 defined in the loop. In case STMT is a "pattern-stmt" (i.e. - it
5345 represents a reduction pattern), the tree-code and scalar-def are
5346 taken from the original stmt that the pattern-stmt (STMT) replaces.
5347 Otherwise (it is a regular reduction) - the tree-code and scalar-def
5348 are taken from STMT. */
5350 stmt_vec_info orig_stmt_info = vect_orig_stmt (stmt_info);
5351 if (orig_stmt_info != stmt_info)
5353 /* Reduction pattern */
5354 gcc_assert (STMT_VINFO_IN_PATTERN_P (orig_stmt_info));
5355 gcc_assert (STMT_VINFO_RELATED_STMT (orig_stmt_info) == stmt_info);
5358 scalar_dest = gimple_get_lhs (orig_stmt_info->stmt);
5359 scalar_type = TREE_TYPE (scalar_dest);
5360 scalar_results.create (group_size);
5361 new_scalar_dest = vect_create_destination_var (scalar_dest, NULL);
5362 bitsize = TYPE_SIZE (scalar_type);
5364 /* True if we should implement SLP_REDUC using native reduction operations
5365 instead of scalar operations. */
5366 direct_slp_reduc = (reduc_fn != IFN_LAST
5367 && slp_reduc
5368 && !TYPE_VECTOR_SUBPARTS (vectype).is_constant ());
5370 /* In case of reduction chain, e.g.,
5371 # a1 = phi <a3, a0>
5372 a2 = operation (a1)
5373 a3 = operation (a2),
5375 we may end up with more than one vector result. Here we reduce them
5376 to one vector.
5378 The same is true if we couldn't use a single def-use cycle. */
5379 if (REDUC_GROUP_FIRST_ELEMENT (stmt_info)
5380 || direct_slp_reduc
5381 || ncopies > 1)
5383 gimple_seq stmts = NULL;
5384 tree single_input = reduc_inputs[0];
5385 for (k = 1; k < reduc_inputs.length (); k++)
5386 single_input = gimple_build (&stmts, code, vectype,
5387 single_input, reduc_inputs[k]);
5388 gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
5390 reduc_inputs.truncate (0);
5391 reduc_inputs.safe_push (single_input);
5394 tree orig_reduc_input = reduc_inputs[0];
5396 /* If this loop is an epilogue loop that can be skipped after the
5397 main loop, we can only share a reduction operation between the
5398 main loop and the epilogue if we put it at the target of the
5399 skip edge.
5401 We can still reuse accumulators if this check fails. Doing so has
5402 the minor(?) benefit of making the epilogue loop's scalar result
5403 independent of the main loop's scalar result. */
5404 bool unify_with_main_loop_p = false;
5405 if (reduc_info->reused_accumulator
5406 && loop_vinfo->skip_this_loop_edge
5407 && single_succ_p (exit_bb)
5408 && single_succ (exit_bb) == loop_vinfo->skip_this_loop_edge->dest)
5410 unify_with_main_loop_p = true;
5412 basic_block reduc_block = loop_vinfo->skip_this_loop_edge->dest;
5413 reduc_inputs[0] = make_ssa_name (vectype);
5414 gphi *new_phi = create_phi_node (reduc_inputs[0], reduc_block);
5415 add_phi_arg (new_phi, orig_reduc_input, single_succ_edge (exit_bb),
5416 UNKNOWN_LOCATION);
5417 add_phi_arg (new_phi, reduc_info->reused_accumulator->reduc_input,
5418 loop_vinfo->skip_this_loop_edge, UNKNOWN_LOCATION);
5419 exit_gsi = gsi_after_labels (reduc_block);
5422 /* Shouldn't be used beyond this point. */
5423 exit_bb = nullptr;
5425 if (STMT_VINFO_REDUC_TYPE (reduc_info) == COND_REDUCTION
5426 && reduc_fn != IFN_LAST)
5428 /* For condition reductions, we have a vector (REDUC_INPUTS 0) containing
5429 various data values where the condition matched and another vector
5430 (INDUCTION_INDEX) containing all the indexes of those matches. We
5431 need to extract the last matching index (which will be the index with
5432 highest value) and use this to index into the data vector.
5433 For the case where there were no matches, the data vector will contain
5434 all default values and the index vector will be all zeros. */
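      /* Worked example, purely illustrative: with REDUC_INPUTS[0] =
	 {d0, d1, d2, d3} and INDUCTION_INDEX = {0, 2, 0, 3}, the maximum
	 index is 3, the comparison selects {0, 0, 0, d3}, and the final
	 (max) reduction of that vector yields d3, the value stored by the
	 last matching iteration.  */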
5436 /* Get various versions of the type of the vector of indexes. */
5437 tree index_vec_type = TREE_TYPE (induction_index);
5438 gcc_checking_assert (TYPE_UNSIGNED (index_vec_type));
5439 tree index_scalar_type = TREE_TYPE (index_vec_type);
5440 tree index_vec_cmp_type = truth_type_for (index_vec_type);
5442 /* Get an unsigned integer version of the type of the data vector. */
5443 int scalar_precision
5444 = GET_MODE_PRECISION (SCALAR_TYPE_MODE (scalar_type));
5445 tree scalar_type_unsigned = make_unsigned_type (scalar_precision);
5446 tree vectype_unsigned = get_same_sized_vectype (scalar_type_unsigned,
5447 vectype);
5449 /* First we need to create a vector (ZERO_VEC) of zeros and another
5450 vector (MAX_INDEX_VEC) filled with the last matching index, which we
5451 can create using a MAX reduction and then expanding.
5452 In the case where the loop never made any matches, the max index will
5453 be zero. */
5455 /* Vector of {0, 0, 0,...}. */
5456 tree zero_vec = build_zero_cst (vectype);
5458 /* Find maximum value from the vector of found indexes. */
5459 tree max_index = make_ssa_name (index_scalar_type);
5460 gcall *max_index_stmt = gimple_build_call_internal (IFN_REDUC_MAX,
5461 1, induction_index);
5462 gimple_call_set_lhs (max_index_stmt, max_index);
5463 gsi_insert_before (&exit_gsi, max_index_stmt, GSI_SAME_STMT);
5465 /* Vector of {max_index, max_index, max_index,...}. */
5466 tree max_index_vec = make_ssa_name (index_vec_type);
5467 tree max_index_vec_rhs = build_vector_from_val (index_vec_type,
5468 max_index);
5469 gimple *max_index_vec_stmt = gimple_build_assign (max_index_vec,
5470 max_index_vec_rhs);
5471 gsi_insert_before (&exit_gsi, max_index_vec_stmt, GSI_SAME_STMT);
5473 /* Next we compare the new vector (MAX_INDEX_VEC) full of max indexes
5474 with the vector (INDUCTION_INDEX) of found indexes, choosing values
5475 from the data vector (REDUC_INPUTS 0) for matches, 0 (ZERO_VEC)
5476 otherwise. Only one value should match, resulting in a vector
5477 (VEC_COND) with one data value and the rest zeros.
5478 In the case where the loop never made any matches, every index will
5479 match, resulting in a vector with all data values (which will all be
5480 the default value). */
5482 /* Compare the max index vector to the vector of found indexes to find
5483 the position of the max value. */
5484 tree vec_compare = make_ssa_name (index_vec_cmp_type);
5485 gimple *vec_compare_stmt = gimple_build_assign (vec_compare, EQ_EXPR,
5486 induction_index,
5487 max_index_vec);
5488 gsi_insert_before (&exit_gsi, vec_compare_stmt, GSI_SAME_STMT);
5490 /* Use the compare to choose either values from the data vector or
5491 zero. */
5492 tree vec_cond = make_ssa_name (vectype);
5493 gimple *vec_cond_stmt = gimple_build_assign (vec_cond, VEC_COND_EXPR,
5494 vec_compare,
5495 reduc_inputs[0],
5496 zero_vec);
5497 gsi_insert_before (&exit_gsi, vec_cond_stmt, GSI_SAME_STMT);
5499 /* Finally we need to extract the data value from the vector (VEC_COND)
5500 into a scalar (MATCHED_DATA_REDUC). Logically we want to do an OR
5501 reduction, but because this doesn't exist, we can use a MAX reduction
5502 instead. The data value might be signed or a float so we need to cast
5503 it first.
5504 In the case where the loop never made any matches, the data values are
5505 all identical, and so will reduce down correctly. */
5507 /* Make the matched data values unsigned. */
5508 tree vec_cond_cast = make_ssa_name (vectype_unsigned);
5509 tree vec_cond_cast_rhs = build1 (VIEW_CONVERT_EXPR, vectype_unsigned,
5510 vec_cond);
5511 gimple *vec_cond_cast_stmt = gimple_build_assign (vec_cond_cast,
5512 VIEW_CONVERT_EXPR,
5513 vec_cond_cast_rhs);
5514 gsi_insert_before (&exit_gsi, vec_cond_cast_stmt, GSI_SAME_STMT);
5516 /* Reduce down to a scalar value. */
5517 tree data_reduc = make_ssa_name (scalar_type_unsigned);
5518 gcall *data_reduc_stmt = gimple_build_call_internal (IFN_REDUC_MAX,
5519 1, vec_cond_cast);
5520 gimple_call_set_lhs (data_reduc_stmt, data_reduc);
5521 gsi_insert_before (&exit_gsi, data_reduc_stmt, GSI_SAME_STMT);
5523 /* Convert the reduced value back to the result type and set as the
5524 result. */
5525 gimple_seq stmts = NULL;
5526 new_temp = gimple_build (&stmts, VIEW_CONVERT_EXPR, scalar_type,
5527 data_reduc);
5528 gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
5529 scalar_results.safe_push (new_temp);
5531 else if (STMT_VINFO_REDUC_TYPE (reduc_info) == COND_REDUCTION
5532 && reduc_fn == IFN_LAST)
5534 /* Condition reduction without supported IFN_REDUC_MAX. Generate
5536 idx_val = induction_index[0];
5537 val = data_reduc[0];
5538 for (i = 1; i < nelts; ++i)
5539 if (induction_index[i] > idx_val)
5540 val = data_reduc[i], idx_val = induction_index[i];
5541 return val; */
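      /* Rough sketch of what the open-coded loop below emits for four
	 elements (illustrative only, the names are invented):

	   idx_0 = induction_index[0];  val_0 = data[0];
	   idx_1 = induction_index[1];  val_1 = data[1];
	   idx_1' = MAX (idx_1, idx_0);
	   val_1' = idx_1 > idx_0 ? val_1 : val_0;
	   ... and likewise for elements 2 and 3, keeping the value whose
	   index is largest.  */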
5543 tree data_eltype = TREE_TYPE (vectype);
5544 tree idx_eltype = TREE_TYPE (TREE_TYPE (induction_index));
5545 unsigned HOST_WIDE_INT el_size = tree_to_uhwi (TYPE_SIZE (idx_eltype));
5546 poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (TREE_TYPE (induction_index));
5547 /* Enforced by vectorizable_reduction, which ensures we have target
5548 support before allowing a conditional reduction on variable-length
5549 vectors. */
5550 unsigned HOST_WIDE_INT v_size = el_size * nunits.to_constant ();
5551 tree idx_val = NULL_TREE, val = NULL_TREE;
5552 for (unsigned HOST_WIDE_INT off = 0; off < v_size; off += el_size)
5554 tree old_idx_val = idx_val;
5555 tree old_val = val;
5556 idx_val = make_ssa_name (idx_eltype);
5557 epilog_stmt = gimple_build_assign (idx_val, BIT_FIELD_REF,
5558 build3 (BIT_FIELD_REF, idx_eltype,
5559 induction_index,
5560 bitsize_int (el_size),
5561 bitsize_int (off)));
5562 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5563 val = make_ssa_name (data_eltype);
5564 epilog_stmt = gimple_build_assign (val, BIT_FIELD_REF,
5565 build3 (BIT_FIELD_REF,
5566 data_eltype,
5567 reduc_inputs[0],
5568 bitsize_int (el_size),
5569 bitsize_int (off)));
5570 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5571 if (off != 0)
5573 tree new_idx_val = idx_val;
5574 if (off != v_size - el_size)
5576 new_idx_val = make_ssa_name (idx_eltype);
5577 epilog_stmt = gimple_build_assign (new_idx_val,
5578 MAX_EXPR, idx_val,
5579 old_idx_val);
5580 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5582 tree new_val = make_ssa_name (data_eltype);
5583 epilog_stmt = gimple_build_assign (new_val,
5584 COND_EXPR,
5585 build2 (GT_EXPR,
5586 boolean_type_node,
5587 idx_val,
5588 old_idx_val),
5589 val, old_val);
5590 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5591 idx_val = new_idx_val;
5592 val = new_val;
5595 /* Convert the reduced value back to the result type and set as the
5596 result. */
5597 gimple_seq stmts = NULL;
5598 val = gimple_convert (&stmts, scalar_type, val);
5599 gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
5600 scalar_results.safe_push (val);
5603 /* 2.3 Create the reduction code, using one of the three schemes described
5604 above. In SLP we simply need to extract all the elements from the
5605 vector (without reducing them), so we use scalar shifts. */
5606 else if (reduc_fn != IFN_LAST && !slp_reduc)
5608 tree tmp;
5609 tree vec_elem_type;
5611 /* Case 1: Create:
5612 v_out2 = reduc_expr <v_out1> */
5614 if (dump_enabled_p ())
5615 dump_printf_loc (MSG_NOTE, vect_location,
5616 "Reduce using direct vector reduction.\n");
5618 gimple_seq stmts = NULL;
5619 vec_elem_type = TREE_TYPE (vectype);
5620 new_temp = gimple_build (&stmts, as_combined_fn (reduc_fn),
5621 vec_elem_type, reduc_inputs[0]);
5622 new_temp = gimple_convert (&stmts, scalar_type, new_temp);
5623 gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
5625 if ((STMT_VINFO_REDUC_TYPE (reduc_info) == INTEGER_INDUC_COND_REDUCTION)
5626 && induc_val)
5628 /* Earlier we set the initial value to be a vector of induc_val
5629 values. Check the result and if it is induc_val then replace
5630 with the original initial value, unless induc_val is
5631 the same as initial_def already. */
5632 tree zcompare = build2 (EQ_EXPR, boolean_type_node, new_temp,
5633 induc_val);
5634 tree initial_def = reduc_info->reduc_initial_values[0];
5636 tmp = make_ssa_name (new_scalar_dest);
5637 epilog_stmt = gimple_build_assign (tmp, COND_EXPR, zcompare,
5638 initial_def, new_temp);
5639 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5640 new_temp = tmp;
5643 scalar_results.safe_push (new_temp);
5645 else if (direct_slp_reduc)
5647 /* Here we create one vector for each of the REDUC_GROUP_SIZE results,
5648 with the elements for other SLP statements replaced with the
5649 neutral value. We can then do a normal reduction on each vector. */
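      /* Illustration only (not taken from the algorithm description):
	 for an SLP group of two PLUS reductions and REDUC_INPUTS[0] =
	 {a0, b0, a1, b1}, the index vector masked with group_size - 1 is
	 {0, 1, 0, 1}; result 0 then reduces {a0, 0, a1, 0} and result 1
	 reduces {0, b0, 0, b1}, with 0 used as the neutral value for
	 PLUS.  */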
5651 /* Enforced by vectorizable_reduction. */
5652 gcc_assert (reduc_inputs.length () == 1);
5653 gcc_assert (pow2p_hwi (group_size));
5655 gimple_seq seq = NULL;
5657 /* Build a vector {0, 1, 2, ...}, with the same number of elements
5658 and the same element size as VECTYPE. */
5659 tree index = build_index_vector (vectype, 0, 1);
5660 tree index_type = TREE_TYPE (index);
5661 tree index_elt_type = TREE_TYPE (index_type);
5662 tree mask_type = truth_type_for (index_type);
5664 /* Create a vector that, for each element, identifies which of
5665 the REDUC_GROUP_SIZE results should use it. */
5666 tree index_mask = build_int_cst (index_elt_type, group_size - 1);
5667 index = gimple_build (&seq, BIT_AND_EXPR, index_type, index,
5668 build_vector_from_val (index_type, index_mask));
5670 /* Get a neutral vector value. This is simply a splat of the neutral
5671 scalar value if we have one, otherwise the initial scalar value
5672 is itself a neutral value. */
5673 tree vector_identity = NULL_TREE;
5674 tree neutral_op = NULL_TREE;
5675 if (slp_node)
5677 tree initial_value = NULL_TREE;
5678 if (REDUC_GROUP_FIRST_ELEMENT (stmt_info))
5679 initial_value = reduc_info->reduc_initial_values[0];
5680 neutral_op = neutral_op_for_reduction (TREE_TYPE (vectype), code,
5681 initial_value);
5683 if (neutral_op)
5684 vector_identity = gimple_build_vector_from_val (&seq, vectype,
5685 neutral_op);
5686 for (unsigned int i = 0; i < group_size; ++i)
5688 /* If there's no universal neutral value, we can use the
5689 initial scalar value from the original PHI. This is used
5690 for MIN and MAX reduction, for example. */
5691 if (!neutral_op)
5693 tree scalar_value = reduc_info->reduc_initial_values[i];
5694 scalar_value = gimple_convert (&seq, TREE_TYPE (vectype),
5695 scalar_value);
5696 vector_identity = gimple_build_vector_from_val (&seq, vectype,
5697 scalar_value);
5700 /* Calculate the equivalent of:
5702 sel[j] = (index[j] == i);
5704 which selects the elements of REDUC_INPUTS[0] that should
5705 be included in the result. */
5706 tree compare_val = build_int_cst (index_elt_type, i);
5707 compare_val = build_vector_from_val (index_type, compare_val);
5708 tree sel = gimple_build (&seq, EQ_EXPR, mask_type,
5709 index, compare_val);
5711 /* Calculate the equivalent of:
5713 vec = sel ? reduc_inputs[0] : vector_identity;
5715 VEC is now suitable for a full vector reduction. */
5716 tree vec = gimple_build (&seq, VEC_COND_EXPR, vectype,
5717 sel, reduc_inputs[0], vector_identity);
5719 /* Do the reduction and convert it to the appropriate type. */
5720 tree scalar = gimple_build (&seq, as_combined_fn (reduc_fn),
5721 TREE_TYPE (vectype), vec);
5722 scalar = gimple_convert (&seq, scalar_type, scalar);
5723 scalar_results.safe_push (scalar);
5725 gsi_insert_seq_before (&exit_gsi, seq, GSI_SAME_STMT);
5727 else
5729 bool reduce_with_shift;
5730 tree vec_temp;
5732 gcc_assert (slp_reduc || reduc_inputs.length () == 1);
5734 /* See if the target wants to do the final (shift) reduction
5735 in a vector mode of smaller size and first reduce upper/lower
5736 halves against each other. */
5737 enum machine_mode mode1 = mode;
5738 tree stype = TREE_TYPE (vectype);
5739 unsigned nunits = TYPE_VECTOR_SUBPARTS (vectype).to_constant ();
5740 unsigned nunits1 = nunits;
5741 if ((mode1 = targetm.vectorize.split_reduction (mode)) != mode
5742 && reduc_inputs.length () == 1)
5744 nunits1 = GET_MODE_NUNITS (mode1).to_constant ();
5745 /* For SLP reductions we have to make sure lanes match up, but
5746 since we're doing individual element final reduction, reducing
5747 vector width here is even more important.
5748 ??? We can also separate lanes with permutes, for the common
5749 case of power-of-two group-size odd/even extracts would work. */
5750 if (slp_reduc && nunits != nunits1)
5752 nunits1 = least_common_multiple (nunits1, group_size);
5753 gcc_assert (exact_log2 (nunits1) != -1 && nunits1 <= nunits);
5756 if (!slp_reduc
5757 && (mode1 = targetm.vectorize.split_reduction (mode)) != mode)
5758 nunits1 = GET_MODE_NUNITS (mode1).to_constant ();
5760 tree vectype1 = get_related_vectype_for_scalar_type (TYPE_MODE (vectype),
5761 stype, nunits1);
5762 reduce_with_shift = have_whole_vector_shift (mode1);
5763 if (!VECTOR_MODE_P (mode1)
5764 || !directly_supported_p (code, vectype1))
5765 reduce_with_shift = false;
5767 /* First reduce the vector to the desired vector size we should
5768 do shift reduction on by combining upper and lower halves. */
5769 gimple_seq stmts = NULL;
5770 new_temp = vect_create_partial_epilog (reduc_inputs[0], vectype1,
5771 code, &stmts);
5772 gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
5773 reduc_inputs[0] = new_temp;
5775 if (reduce_with_shift && !slp_reduc)
5777 int element_bitsize = tree_to_uhwi (bitsize);
5778 /* Enforced by vectorizable_reduction, which disallows SLP reductions
5779 for variable-length vectors and also requires direct target support
5780 for loop reductions. */
5781 int vec_size_in_bits = tree_to_uhwi (TYPE_SIZE (vectype1));
5782 int nelements = vec_size_in_bits / element_bitsize;
5783 vec_perm_builder sel;
5784 vec_perm_indices indices;
5786 int elt_offset;
5788 tree zero_vec = build_zero_cst (vectype1);
5789 /* Case 2: Create:
5790 for (offset = nelements/2; offset >= 1; offset/=2)
5792 Create: va' = vec_shift <va, offset>
5793 Create: va = vop <va, va'>
5794 } */
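	  /* Illustrative trace (values invented): for va = {a, b, c, d}
	     and addition, the first step shifts by two lanes giving
	     {c, d, 0, 0} and adds, producing {a+c, b+d, c, d}; the second
	     shifts by one lane and adds, leaving the full sum a+b+c+d in
	     lane 0, which step 2.4 below then extracts.  */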
5796 tree rhs;
5798 if (dump_enabled_p ())
5799 dump_printf_loc (MSG_NOTE, vect_location,
5800 "Reduce using vector shifts\n");
5802 gimple_seq stmts = NULL;
5803 new_temp = gimple_convert (&stmts, vectype1, new_temp);
5804 for (elt_offset = nelements / 2;
5805 elt_offset >= 1;
5806 elt_offset /= 2)
5808 calc_vec_perm_mask_for_shift (elt_offset, nelements, &sel);
5809 indices.new_vector (sel, 2, nelements);
5810 tree mask = vect_gen_perm_mask_any (vectype1, indices);
5811 new_name = gimple_build (&stmts, VEC_PERM_EXPR, vectype1,
5812 new_temp, zero_vec, mask);
5813 new_temp = gimple_build (&stmts, code,
5814 vectype1, new_name, new_temp);
5816 gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
5818 /* 2.4 Extract the final scalar result. Create:
5819 s_out3 = extract_field <v_out2, bitpos> */
5821 if (dump_enabled_p ())
5822 dump_printf_loc (MSG_NOTE, vect_location,
5823 "extract scalar result\n");
5825 rhs = build3 (BIT_FIELD_REF, scalar_type, new_temp,
5826 bitsize, bitsize_zero_node);
5827 epilog_stmt = gimple_build_assign (new_scalar_dest, rhs);
5828 new_temp = make_ssa_name (new_scalar_dest, epilog_stmt);
5829 gimple_assign_set_lhs (epilog_stmt, new_temp);
5830 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5831 scalar_results.safe_push (new_temp);
5833 else
5835 /* Case 3: Create:
5836 s = extract_field <v_out2, 0>
5837 for (offset = element_size;
5838 offset < vector_size;
5839 offset += element_size;)
5841 Create: s' = extract_field <v_out2, offset>
5842 Create: s = op <s, s'> // For non SLP cases
5843 } */
5845 if (dump_enabled_p ())
5846 dump_printf_loc (MSG_NOTE, vect_location,
5847 "Reduce using scalar code.\n");
5849 int vec_size_in_bits = tree_to_uhwi (TYPE_SIZE (vectype1));
5850 int element_bitsize = tree_to_uhwi (bitsize);
5851 tree compute_type = TREE_TYPE (vectype);
5852 gimple_seq stmts = NULL;
5853 FOR_EACH_VEC_ELT (reduc_inputs, i, vec_temp)
5855 int bit_offset;
5856 new_temp = gimple_build (&stmts, BIT_FIELD_REF, compute_type,
5857 vec_temp, bitsize, bitsize_zero_node);
5859 /* In SLP we don't need to apply the reduction operation, so we just
5860 collect s' values in SCALAR_RESULTS. */
5861 if (slp_reduc)
5862 scalar_results.safe_push (new_temp);
5864 for (bit_offset = element_bitsize;
5865 bit_offset < vec_size_in_bits;
5866 bit_offset += element_bitsize)
5868 tree bitpos = bitsize_int (bit_offset);
5869 new_name = gimple_build (&stmts, BIT_FIELD_REF,
5870 compute_type, vec_temp,
5871 bitsize, bitpos);
5872 if (slp_reduc)
5874 /* In SLP we don't need to apply the reduction operation, so
5875 we just collect s' values in SCALAR_RESULTS. */
5876 new_temp = new_name;
5877 scalar_results.safe_push (new_name);
5879 else
5880 new_temp = gimple_build (&stmts, code, compute_type,
5881 new_name, new_temp);
5885 /* The only case where we need to reduce scalar results in SLP is
5886 unrolling. If the size of SCALAR_RESULTS is greater than
5887 REDUC_GROUP_SIZE, we reduce them combining elements modulo
5888 REDUC_GROUP_SIZE. */
5889 if (slp_reduc)
5891 tree res, first_res, new_res;
5893 /* Reduce multiple scalar results in case of SLP unrolling. */
5894 for (j = group_size; scalar_results.iterate (j, &res);
5895 j++)
5897 first_res = scalar_results[j % group_size];
5898 new_res = gimple_build (&stmts, code, compute_type,
5899 first_res, res);
5900 scalar_results[j % group_size] = new_res;
5902 scalar_results.truncate (group_size);
5903 for (k = 0; k < group_size; k++)
5904 scalar_results[k] = gimple_convert (&stmts, scalar_type,
5905 scalar_results[k]);
5907 else
5909 /* Not SLP - we have one scalar to keep in SCALAR_RESULTS. */
5910 new_temp = gimple_convert (&stmts, scalar_type, new_temp);
5911 scalar_results.safe_push (new_temp);
5914 gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
5917 if ((STMT_VINFO_REDUC_TYPE (reduc_info) == INTEGER_INDUC_COND_REDUCTION)
5918 && induc_val)
5920 /* Earlier we set the initial value to be a vector of induc_val
5921 values. Check the result and if it is induc_val then replace
5922 with the original initial value, unless induc_val is
5923 the same as initial_def already. */
5924 tree zcompare = build2 (EQ_EXPR, boolean_type_node, new_temp,
5925 induc_val);
5926 tree initial_def = reduc_info->reduc_initial_values[0];
5928 tree tmp = make_ssa_name (new_scalar_dest);
5929 epilog_stmt = gimple_build_assign (tmp, COND_EXPR, zcompare,
5930 initial_def, new_temp);
5931 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5932 scalar_results[0] = tmp;
5936 /* 2.5 Adjust the final result by the initial value of the reduction
5937 variable. (When such adjustment is not needed, then
5938 'adjustment_def' is zero). For example, if code is PLUS we create:
5939 new_temp = loop_exit_def + adjustment_def */
5941 if (adjustment_def)
5943 gcc_assert (!slp_reduc);
5944 gimple_seq stmts = NULL;
5945 if (double_reduc)
5947 gcc_assert (VECTOR_TYPE_P (TREE_TYPE (adjustment_def)));
5948 adjustment_def = gimple_convert (&stmts, vectype, adjustment_def);
5949 new_temp = gimple_build (&stmts, code, vectype,
5950 reduc_inputs[0], adjustment_def);
5952 else
5954 new_temp = scalar_results[0];
5955 gcc_assert (TREE_CODE (TREE_TYPE (adjustment_def)) != VECTOR_TYPE);
5956 adjustment_def = gimple_convert (&stmts, scalar_type, adjustment_def);
5957 new_temp = gimple_build (&stmts, code, scalar_type,
5958 new_temp, adjustment_def);
5961 epilog_stmt = gimple_seq_last_stmt (stmts);
5962 gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
5963 scalar_results[0] = new_temp;
5966 /* Record this operation if it could be reused by the epilogue loop. */
5967 if (STMT_VINFO_REDUC_TYPE (reduc_info) == TREE_CODE_REDUCTION)
5968 loop_vinfo->reusable_accumulators.put (scalar_results[0],
5969 { orig_reduc_input, reduc_info });
5971 if (double_reduc)
5972 loop = outer_loop;
5974 /* 2.6 Handle the loop-exit phis. Replace the uses of scalar loop-exit
5975 phis with new adjusted scalar results, i.e., replace use <s_out0>
5976 with use <s_out4>.
5978 Transform:
5979 loop_exit:
5980 s_out0 = phi <s_loop> # (scalar) EXIT_PHI
5981 v_out1 = phi <VECT_DEF> # NEW_EXIT_PHI
5982 v_out2 = reduce <v_out1>
5983 s_out3 = extract_field <v_out2, 0>
5984 s_out4 = adjust_result <s_out3>
5985 use <s_out0>
5986 use <s_out0>
5988 into:
5990 loop_exit:
5991 s_out0 = phi <s_loop> # (scalar) EXIT_PHI
5992 v_out1 = phi <VECT_DEF> # NEW_EXIT_PHI
5993 v_out2 = reduce <v_out1>
5994 s_out3 = extract_field <v_out2, 0>
5995 s_out4 = adjust_result <s_out3>
5996 use <s_out4>
5997 use <s_out4> */
5999 gcc_assert (live_out_stmts.size () == scalar_results.length ());
6000 for (k = 0; k < live_out_stmts.size (); k++)
6002 stmt_vec_info scalar_stmt_info = vect_orig_stmt (live_out_stmts[k]);
6003 scalar_dest = gimple_get_lhs (scalar_stmt_info->stmt);
6005 phis.create (3);
6006 /* Find the loop-closed-use at the loop exit of the original scalar
6007 result. (The reduction result is expected to have two immediate uses,
6008 one at the latch block, and one at the loop exit). For double
6009 reductions we are looking for exit phis of the outer loop. */
6010 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, scalar_dest)
6012 if (!flow_bb_inside_loop_p (loop, gimple_bb (USE_STMT (use_p))))
6014 if (!is_gimple_debug (USE_STMT (use_p)))
6015 phis.safe_push (USE_STMT (use_p));
6017 else
6019 if (double_reduc && gimple_code (USE_STMT (use_p)) == GIMPLE_PHI)
6021 tree phi_res = PHI_RESULT (USE_STMT (use_p));
6023 FOR_EACH_IMM_USE_FAST (phi_use_p, phi_imm_iter, phi_res)
6025 if (!flow_bb_inside_loop_p (loop,
6026 gimple_bb (USE_STMT (phi_use_p)))
6027 && !is_gimple_debug (USE_STMT (phi_use_p)))
6028 phis.safe_push (USE_STMT (phi_use_p));
6034 FOR_EACH_VEC_ELT (phis, i, exit_phi)
6036 /* Replace the uses: */
6037 orig_name = PHI_RESULT (exit_phi);
6039 /* Look for a single use at the target of the skip edge. */
6040 if (unify_with_main_loop_p)
6042 use_operand_p use_p;
6043 gimple *user;
6044 if (!single_imm_use (orig_name, &use_p, &user))
6045 gcc_unreachable ();
6046 orig_name = gimple_get_lhs (user);
6049 scalar_result = scalar_results[k];
6050 FOR_EACH_IMM_USE_STMT (use_stmt, imm_iter, orig_name)
6052 FOR_EACH_IMM_USE_ON_STMT (use_p, imm_iter)
6053 SET_USE (use_p, scalar_result);
6054 update_stmt (use_stmt);
6058 phis.release ();
6062 /* Return a vector of type VECTYPE that is equal to the vector select
6063 operation "MASK ? VEC : IDENTITY". Insert the select statements
6064 before GSI. */
6066 static tree
6067 merge_with_identity (gimple_stmt_iterator *gsi, tree mask, tree vectype,
6068 tree vec, tree identity)
6070 tree cond = make_temp_ssa_name (vectype, NULL, "cond");
6071 gimple *new_stmt = gimple_build_assign (cond, VEC_COND_EXPR,
6072 mask, vec, identity);
6073 gsi_insert_before (gsi, new_stmt, GSI_SAME_STMT);
6074 return cond;
6077 /* Successively apply CODE to each element of VECTOR_RHS, in left-to-right
6078 order, starting with LHS. Insert the extraction statements before GSI and
6079 associate the new scalar SSA names with variable SCALAR_DEST.
6080 Return the SSA name for the result. */
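/* For example (an illustrative sketch, not part of the function
   documentation): with a four-element VECTOR_RHS and PLUS_EXPR this
   emits

     s0 = lhs + v[0];
     s1 = s0 + v[1];
     s2 = s1 + v[2];
     s3 = s2 + v[3];

   and returns s3, preserving the strict left-to-right evaluation order
   that an in-order (fold-left) reduction requires.  */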
6082 static tree
6083 vect_expand_fold_left (gimple_stmt_iterator *gsi, tree scalar_dest,
6084 tree_code code, tree lhs, tree vector_rhs)
6086 tree vectype = TREE_TYPE (vector_rhs);
6087 tree scalar_type = TREE_TYPE (vectype);
6088 tree bitsize = TYPE_SIZE (scalar_type);
6089 unsigned HOST_WIDE_INT vec_size_in_bits = tree_to_uhwi (TYPE_SIZE (vectype));
6090 unsigned HOST_WIDE_INT element_bitsize = tree_to_uhwi (bitsize);
6092 for (unsigned HOST_WIDE_INT bit_offset = 0;
6093 bit_offset < vec_size_in_bits;
6094 bit_offset += element_bitsize)
6096 tree bitpos = bitsize_int (bit_offset);
6097 tree rhs = build3 (BIT_FIELD_REF, scalar_type, vector_rhs,
6098 bitsize, bitpos);
6100 gassign *stmt = gimple_build_assign (scalar_dest, rhs);
6101 rhs = make_ssa_name (scalar_dest, stmt);
6102 gimple_assign_set_lhs (stmt, rhs);
6103 gsi_insert_before (gsi, stmt, GSI_SAME_STMT);
6105 stmt = gimple_build_assign (scalar_dest, code, lhs, rhs);
6106 tree new_name = make_ssa_name (scalar_dest, stmt);
6107 gimple_assign_set_lhs (stmt, new_name);
6108 gsi_insert_before (gsi, stmt, GSI_SAME_STMT);
6109 lhs = new_name;
6111 return lhs;
6114 /* Get a masked internal function equivalent to REDUC_FN. VECTYPE_IN is the
6115 type of the vector input. */
6117 static internal_fn
6118 get_masked_reduction_fn (internal_fn reduc_fn, tree vectype_in)
6120 internal_fn mask_reduc_fn;
6122 switch (reduc_fn)
6124 case IFN_FOLD_LEFT_PLUS:
6125 mask_reduc_fn = IFN_MASK_FOLD_LEFT_PLUS;
6126 break;
6128 default:
6129 return IFN_LAST;
6132 if (direct_internal_fn_supported_p (mask_reduc_fn, vectype_in,
6133 OPTIMIZE_FOR_SPEED))
6134 return mask_reduc_fn;
6135 return IFN_LAST;
6138 /* Perform an in-order reduction (FOLD_LEFT_REDUCTION). STMT_INFO is the
6139 statement that sets the live-out value. REDUC_DEF_STMT is the phi
6140 statement. CODE is the operation performed by STMT_INFO and OPS are
6141 its scalar operands. REDUC_INDEX is the index of the operand in
6142 OPS that is set by REDUC_DEF_STMT. REDUC_FN is the function that
6143 implements in-order reduction, or IFN_LAST if we should open-code it.
6144 VECTYPE_IN is the type of the vector input. MASKS specifies the masks
6145 that should be used to control the operation in a fully-masked loop. */
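/* Illustrative sketch (the example loop is not from the comment above):
   for an in-order floating-point sum such as

     double s = init;
     for (i = 0; i < N; i++)
       s += a[i];

   each vector iteration either emits a single FOLD_LEFT_PLUS call (or
   its masked variant) taking the running scalar and the next vector of
   elements, or, when no such internal function is available, the
   element-by-element chain produced by vect_expand_fold_left above.  */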
6147 static bool
6148 vectorize_fold_left_reduction (loop_vec_info loop_vinfo,
6149 stmt_vec_info stmt_info,
6150 gimple_stmt_iterator *gsi,
6151 gimple **vec_stmt, slp_tree slp_node,
6152 gimple *reduc_def_stmt,
6153 tree_code code, internal_fn reduc_fn,
6154 tree ops[3], tree vectype_in,
6155 int reduc_index, vec_loop_masks *masks)
6157 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
6158 tree vectype_out = STMT_VINFO_VECTYPE (stmt_info);
6159 internal_fn mask_reduc_fn = get_masked_reduction_fn (reduc_fn, vectype_in);
6161 int ncopies;
6162 if (slp_node)
6163 ncopies = 1;
6164 else
6165 ncopies = vect_get_num_copies (loop_vinfo, vectype_in);
6167 gcc_assert (!nested_in_vect_loop_p (loop, stmt_info));
6168 gcc_assert (ncopies == 1);
6169 gcc_assert (TREE_CODE_LENGTH (code) == binary_op);
6171 if (slp_node)
6172 gcc_assert (known_eq (TYPE_VECTOR_SUBPARTS (vectype_out),
6173 TYPE_VECTOR_SUBPARTS (vectype_in)));
6175 tree op0 = ops[1 - reduc_index];
6177 int group_size = 1;
6178 stmt_vec_info scalar_dest_def_info;
6179 auto_vec<tree> vec_oprnds0;
6180 if (slp_node)
6182 auto_vec<vec<tree> > vec_defs (2);
6183 vect_get_slp_defs (loop_vinfo, slp_node, &vec_defs);
6184 vec_oprnds0.safe_splice (vec_defs[1 - reduc_index]);
6185 vec_defs[0].release ();
6186 vec_defs[1].release ();
6187 group_size = SLP_TREE_SCALAR_STMTS (slp_node).length ();
6188 scalar_dest_def_info = SLP_TREE_SCALAR_STMTS (slp_node)[group_size - 1];
6190 else
6192 vect_get_vec_defs_for_operand (loop_vinfo, stmt_info, 1,
6193 op0, &vec_oprnds0);
6194 scalar_dest_def_info = stmt_info;
6197 tree scalar_dest = gimple_assign_lhs (scalar_dest_def_info->stmt);
6198 tree scalar_type = TREE_TYPE (scalar_dest);
6199 tree reduc_var = gimple_phi_result (reduc_def_stmt);
6201 int vec_num = vec_oprnds0.length ();
6202 gcc_assert (vec_num == 1 || slp_node);
6203 tree vec_elem_type = TREE_TYPE (vectype_out);
6204 gcc_checking_assert (useless_type_conversion_p (scalar_type, vec_elem_type));
6206 tree vector_identity = NULL_TREE;
6207 if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
6208 vector_identity = build_zero_cst (vectype_out);
6210 tree scalar_dest_var = vect_create_destination_var (scalar_dest, NULL);
6211 int i;
6212 tree def0;
6213 FOR_EACH_VEC_ELT (vec_oprnds0, i, def0)
6215 gimple *new_stmt;
6216 tree mask = NULL_TREE;
6217 if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
6218 mask = vect_get_loop_mask (gsi, masks, vec_num, vectype_in, i);
6220 /* Handle MINUS by adding the negative. */
6221 if (reduc_fn != IFN_LAST && code == MINUS_EXPR)
6223 tree negated = make_ssa_name (vectype_out);
6224 new_stmt = gimple_build_assign (negated, NEGATE_EXPR, def0);
6225 gsi_insert_before (gsi, new_stmt, GSI_SAME_STMT);
6226 def0 = negated;
6229 if (mask && mask_reduc_fn == IFN_LAST)
6230 def0 = merge_with_identity (gsi, mask, vectype_out, def0,
6231 vector_identity);
6233 /* On the first iteration the input is simply the scalar phi
6234 result, and for subsequent iterations it is the output of
6235 the preceding operation. */
6236 if (reduc_fn != IFN_LAST || (mask && mask_reduc_fn != IFN_LAST))
6238 if (mask && mask_reduc_fn != IFN_LAST)
6239 new_stmt = gimple_build_call_internal (mask_reduc_fn, 3, reduc_var,
6240 def0, mask);
6241 else
6242 new_stmt = gimple_build_call_internal (reduc_fn, 2, reduc_var,
6243 def0);
6244 /* For chained SLP reductions the output of the previous reduction
6245 operation serves as the input of the next. For the final statement
6246 the output cannot be a temporary - we reuse the original
6247 scalar destination of the last statement. */
6248 if (i != vec_num - 1)
6250 gimple_set_lhs (new_stmt, scalar_dest_var);
6251 reduc_var = make_ssa_name (scalar_dest_var, new_stmt);
6252 gimple_set_lhs (new_stmt, reduc_var);
6255 else
6257 reduc_var = vect_expand_fold_left (gsi, scalar_dest_var, code,
6258 reduc_var, def0);
6259 new_stmt = SSA_NAME_DEF_STMT (reduc_var);
6260 /* Remove the statement, so that we can use the same code paths
6261 as for statements that we've just created. */
6262 gimple_stmt_iterator tmp_gsi = gsi_for_stmt (new_stmt);
6263 gsi_remove (&tmp_gsi, true);
6266 if (i == vec_num - 1)
6268 gimple_set_lhs (new_stmt, scalar_dest);
6269 vect_finish_replace_stmt (loop_vinfo,
6270 scalar_dest_def_info,
6271 new_stmt);
6273 else
6274 vect_finish_stmt_generation (loop_vinfo,
6275 scalar_dest_def_info,
6276 new_stmt, gsi);
6278 if (slp_node)
6279 SLP_TREE_VEC_STMTS (slp_node).quick_push (new_stmt);
6280 else
6282 STMT_VINFO_VEC_STMTS (stmt_info).safe_push (new_stmt);
6283 *vec_stmt = new_stmt;
6287 return true;
6290 /* Function is_nonwrapping_integer_induction.
6292 Check if STMT_VINFO (which is part of loop LOOP) both increments and
6293 does not cause overflow. */
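/* A numeric illustration (values chosen here, not from the comment
   above): for a 16-bit unsigned IV with base 0 and step 3 in a loop
   that can execute up to 30000 times, the largest value is
   3 * 30000 = 90000, which needs 17 bits and therefore wraps, so the
   check fails; with step 2 the maximum is 60000, which fits in 16 bits
   and is accepted.  */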
6295 static bool
6296 is_nonwrapping_integer_induction (stmt_vec_info stmt_vinfo, class loop *loop)
6298 gphi *phi = as_a <gphi *> (stmt_vinfo->stmt);
6299 tree base = STMT_VINFO_LOOP_PHI_EVOLUTION_BASE_UNCHANGED (stmt_vinfo);
6300 tree step = STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_vinfo);
6301 tree lhs_type = TREE_TYPE (gimple_phi_result (phi));
6302 widest_int ni, max_loop_value, lhs_max;
6303 wi::overflow_type overflow = wi::OVF_NONE;
6305 /* Make sure the loop is integer based. */
6306 if (TREE_CODE (base) != INTEGER_CST
6307 || TREE_CODE (step) != INTEGER_CST)
6308 return false;
6310 /* Check that the max size of the loop will not wrap. */
6312 if (TYPE_OVERFLOW_UNDEFINED (lhs_type))
6313 return true;
6315 if (! max_stmt_executions (loop, &ni))
6316 return false;
6318 max_loop_value = wi::mul (wi::to_widest (step), ni, TYPE_SIGN (lhs_type),
6319 &overflow);
6320 if (overflow)
6321 return false;
6323 max_loop_value = wi::add (wi::to_widest (base), max_loop_value,
6324 TYPE_SIGN (lhs_type), &overflow);
6325 if (overflow)
6326 return false;
6328 return (wi::min_precision (max_loop_value, TYPE_SIGN (lhs_type))
6329 <= TYPE_PRECISION (lhs_type));
6332 /* Check if masking can be supported by inserting a conditional expression.
6333 CODE is the code for the operation. COND_FN is the conditional internal
6334 function, if it exists. VECTYPE_IN is the type of the vector input. */
6335 static bool
6336 use_mask_by_cond_expr_p (code_helper code, internal_fn cond_fn,
6337 tree vectype_in)
6339 if (cond_fn != IFN_LAST
6340 && direct_internal_fn_supported_p (cond_fn, vectype_in,
6341 OPTIMIZE_FOR_SPEED))
6342 return false;
6344 if (code.is_tree_code ())
6345 switch (tree_code (code))
6347 case DOT_PROD_EXPR:
6348 case SAD_EXPR:
6349 return true;
6351 default:
6352 break;
6354 return false;
6357 /* Insert a conditional expression to enable masked vectorization. CODE is the
6358 code for the operation. VOP is the array of operands. MASK is the loop
6359 mask. GSI is a statement iterator used to place the new conditional
6360 expression. */
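/* For illustration (a sketch of the effect, not additional
   documentation): for a masked DOT_PROD_EXPR the second operand becomes
   MASK ? op1 : 0, so inactive lanes contribute nothing to the
   accumulator; for SAD_EXPR it becomes MASK ? op1 : op0, so the
   absolute difference in an inactive lane is zero.  */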
6361 static void
6362 build_vect_cond_expr (code_helper code, tree vop[3], tree mask,
6363 gimple_stmt_iterator *gsi)
6365 switch (tree_code (code))
6367 case DOT_PROD_EXPR:
6369 tree vectype = TREE_TYPE (vop[1]);
6370 tree zero = build_zero_cst (vectype);
6371 tree masked_op1 = make_temp_ssa_name (vectype, NULL, "masked_op1");
6372 gassign *select = gimple_build_assign (masked_op1, VEC_COND_EXPR,
6373 mask, vop[1], zero);
6374 gsi_insert_before (gsi, select, GSI_SAME_STMT);
6375 vop[1] = masked_op1;
6376 break;
6379 case SAD_EXPR:
6381 tree vectype = TREE_TYPE (vop[1]);
6382 tree masked_op1 = make_temp_ssa_name (vectype, NULL, "masked_op1");
6383 gassign *select = gimple_build_assign (masked_op1, VEC_COND_EXPR,
6384 mask, vop[1], vop[0]);
6385 gsi_insert_before (gsi, select, GSI_SAME_STMT);
6386 vop[1] = masked_op1;
6387 break;
6390 default:
6391 gcc_unreachable ();
6395 /* Function vectorizable_reduction.
6397 Check if STMT_INFO performs a reduction operation that can be vectorized.
6398 If VEC_STMT is also passed, vectorize STMT_INFO: create a vectorized
6399 stmt to replace it, put it in VEC_STMT, and insert it at GSI.
6400 Return true if STMT_INFO is vectorizable in this way.
6402 This function also handles reduction idioms (patterns) that have been
6403 recognized in advance during vect_pattern_recog. In this case, STMT_INFO
6404 may be of this form:
6405 X = pattern_expr (arg0, arg1, ..., X)
6406 and its STMT_VINFO_RELATED_STMT points to the last stmt in the original
6407 sequence that had been detected and replaced by the pattern-stmt
6408 (STMT_INFO).
6410 This function also handles reduction of condition expressions, for example:
6411 for (int i = 0; i < N; i++)
6412 if (a[i] < value)
6413 last = a[i];
6414 This is handled by vectorising the loop and creating an additional vector
6415 containing the loop indexes for which "a[i] < value" was true. In the
6416 function epilogue this is reduced to a single max value and then used to
6417 index into the vector of results.
6419 In some cases of reduction patterns, the type of the reduction variable X is
6420 different than the type of the other arguments of STMT_INFO.
6421 In such cases, the vectype that is used when transforming STMT_INFO into
6422 a vector stmt is different than the vectype that is used to determine the
6423 vectorization factor, because it consists of a different number of elements
6424 than the actual number of elements that are being operated upon in parallel.
6426 For example, consider an accumulation of shorts into an int accumulator.
6427 On some targets it's possible to vectorize this pattern operating on 8
6428 shorts at a time (hence, the vectype for purposes of determining the
6429 vectorization factor should be V8HI); on the other hand, the vectype that
6430 is used to create the vector form is actually V4SI (the type of the result).
6432 Upon entry to this function, STMT_VINFO_VECTYPE records the vectype that
6433 indicates what is the actual level of parallelism (V8HI in the example), so
6434 that the right vectorization factor would be derived. This vectype
6435 corresponds to the type of arguments to the reduction stmt, and should *NOT*
6436 be used to create the vectorized stmt. The right vectype for the vectorized
6437 stmt is obtained from the type of the result X:
6438 get_vectype_for_scalar_type (vinfo, TREE_TYPE (X))
6440 This means that, contrary to "regular" reductions (or "regular" stmts in
6441 general), the following equation:
6442 STMT_VINFO_VECTYPE == get_vectype_for_scalar_type (vinfo, TREE_TYPE (X))
6443 does *NOT* necessarily hold for reduction patterns. */
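/* A short illustration of the mixed-vectype case above (a sketch; the
   exact modes depend on the target):

     short a[N], b[N];
     int sum = 0;
     for (i = 0; i < N; i++)
       sum += a[i] * b[i];

   may be recognized as a DOT_PROD_EXPR pattern; STMT_VINFO_VECTYPE can
   then be V8HI (eight shorts determine the vectorization factor) while
   the vectorized statement itself produces a V4SI accumulator.  */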
6445 bool
6446 vectorizable_reduction (loop_vec_info loop_vinfo,
6447 stmt_vec_info stmt_info, slp_tree slp_node,
6448 slp_instance slp_node_instance,
6449 stmt_vector_for_cost *cost_vec)
6451 tree vectype_in = NULL_TREE;
6452 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
6453 enum vect_def_type cond_reduc_dt = vect_unknown_def_type;
6454 stmt_vec_info cond_stmt_vinfo = NULL;
6455 int i;
6456 int ncopies;
6457 bool single_defuse_cycle = false;
6458 bool nested_cycle = false;
6459 bool double_reduc = false;
6460 int vec_num;
6461 tree tem;
6462 tree cr_index_scalar_type = NULL_TREE, cr_index_vector_type = NULL_TREE;
6463 tree cond_reduc_val = NULL_TREE;
6465 /* Make sure it was already recognized as a reduction computation. */
6466 if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_reduction_def
6467 && STMT_VINFO_DEF_TYPE (stmt_info) != vect_double_reduction_def
6468 && STMT_VINFO_DEF_TYPE (stmt_info) != vect_nested_cycle)
6469 return false;
6471 /* The stmt we store reduction analysis meta on. */
6472 stmt_vec_info reduc_info = info_for_reduction (loop_vinfo, stmt_info);
6473 reduc_info->is_reduc_info = true;
6475 if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_nested_cycle)
6477 if (is_a <gphi *> (stmt_info->stmt))
6479 if (slp_node)
6481 /* We eventually need to set a vector type on invariant
6482 arguments. */
6483 unsigned j;
6484 slp_tree child;
6485 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (slp_node), j, child)
6486 if (!vect_maybe_update_slp_op_vectype
6487 (child, SLP_TREE_VECTYPE (slp_node)))
6489 if (dump_enabled_p ())
6490 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6491 "incompatible vector types for "
6492 "invariants\n");
6493 return false;
6496 /* Analysis for double-reduction is done on the outer
6497 loop PHI, nested cycles have no further restrictions. */
6498 STMT_VINFO_TYPE (stmt_info) = cycle_phi_info_type;
6500 else
6501 STMT_VINFO_TYPE (stmt_info) = reduc_vec_info_type;
6502 return true;
6505 stmt_vec_info orig_stmt_of_analysis = stmt_info;
6506 stmt_vec_info phi_info = stmt_info;
6507 if (!is_a <gphi *> (stmt_info->stmt))
6509 STMT_VINFO_TYPE (stmt_info) = reduc_vec_info_type;
6510 return true;
6512 if (slp_node)
6514 slp_node_instance->reduc_phis = slp_node;
6515 /* ??? We're leaving slp_node to point to the PHIs, we only
6516 need it to get at the number of vector stmts which wasn't
6517 yet initialized for the instance root. */
6519 if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_reduction_def)
6520 stmt_info = vect_stmt_to_vectorize (STMT_VINFO_REDUC_DEF (stmt_info));
6521 else
6523 gcc_assert (STMT_VINFO_DEF_TYPE (stmt_info)
6524 == vect_double_reduction_def);
6525 use_operand_p use_p;
6526 gimple *use_stmt;
6527 bool res = single_imm_use (gimple_phi_result (stmt_info->stmt),
6528 &use_p, &use_stmt);
6529 gcc_assert (res);
6530 phi_info = loop_vinfo->lookup_stmt (use_stmt);
6531 stmt_info = vect_stmt_to_vectorize (STMT_VINFO_REDUC_DEF (phi_info));
6534 /* PHIs should not participate in patterns. */
6535 gcc_assert (!STMT_VINFO_RELATED_STMT (phi_info));
6536 gphi *reduc_def_phi = as_a <gphi *> (phi_info->stmt);
6538 /* Verify following REDUC_IDX from the latch def leads us back to the PHI
6539 and compute the reduction chain length. Discover the real
6540 reduction operation stmt on the way (stmt_info and slp_for_stmt_info). */
6541 tree reduc_def
6542 = PHI_ARG_DEF_FROM_EDGE (reduc_def_phi,
6543 loop_latch_edge
6544 (gimple_bb (reduc_def_phi)->loop_father));
6545 unsigned reduc_chain_length = 0;
6546 bool only_slp_reduc_chain = true;
6547 stmt_info = NULL;
6548 slp_tree slp_for_stmt_info = slp_node ? slp_node_instance->root : NULL;
6549 while (reduc_def != PHI_RESULT (reduc_def_phi))
6551 stmt_vec_info def = loop_vinfo->lookup_def (reduc_def);
6552 stmt_vec_info vdef = vect_stmt_to_vectorize (def);
6553 if (STMT_VINFO_REDUC_IDX (vdef) == -1)
6555 if (dump_enabled_p ())
6556 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6557 "reduction chain broken by patterns.\n");
6558 return false;
6560 if (!REDUC_GROUP_FIRST_ELEMENT (vdef))
6561 only_slp_reduc_chain = false;
6562 /* ??? For epilogue generation live members of the chain need
6563 to point back to the PHI via their original stmt for
6564 info_for_reduction to work. */
6565 if (STMT_VINFO_LIVE_P (vdef))
6566 STMT_VINFO_REDUC_DEF (def) = phi_info;
6567 gimple_match_op op;
6568 if (!gimple_extract_op (vdef->stmt, &op))
6570 if (dump_enabled_p ())
6571 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6572 "reduction chain includes unsupported"
6573 " statement type.\n");
6574 return false;
6576 if (CONVERT_EXPR_CODE_P (op.code))
6578 if (!tree_nop_conversion_p (op.type, TREE_TYPE (op.ops[0])))
6580 if (dump_enabled_p ())
6581 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6582 "conversion in the reduction chain.\n");
6583 return false;
6586 else if (!stmt_info)
6587 /* First non-conversion stmt. */
6588 stmt_info = vdef;
6589 reduc_def = op.ops[STMT_VINFO_REDUC_IDX (vdef)];
6590 reduc_chain_length++;
6591 if (!stmt_info && slp_node)
6592 slp_for_stmt_info = SLP_TREE_CHILDREN (slp_for_stmt_info)[0];
6594 /* PHIs should not participate in patterns. */
6595 gcc_assert (!STMT_VINFO_RELATED_STMT (phi_info));
6597 if (nested_in_vect_loop_p (loop, stmt_info))
6599 loop = loop->inner;
6600 nested_cycle = true;
6603 /* STMT_VINFO_REDUC_DEF doesn't point to the first but the last
6604 element. */
6605 if (slp_node && REDUC_GROUP_FIRST_ELEMENT (stmt_info))
6607 gcc_assert (!REDUC_GROUP_NEXT_ELEMENT (stmt_info));
6608 stmt_info = REDUC_GROUP_FIRST_ELEMENT (stmt_info);
6610 if (REDUC_GROUP_FIRST_ELEMENT (stmt_info))
6611 gcc_assert (slp_node
6612 && REDUC_GROUP_FIRST_ELEMENT (stmt_info) == stmt_info);
6614 /* 1. Is vectorizable reduction? */
6615 /* Not supportable if the reduction variable is used in the loop, unless
6616 it's a reduction chain. */
6617 if (STMT_VINFO_RELEVANT (stmt_info) > vect_used_in_outer
6618 && !REDUC_GROUP_FIRST_ELEMENT (stmt_info))
6619 return false;
6621 /* Reductions that are not used even in an enclosing outer-loop,
6622 are expected to be "live" (used out of the loop). */
6623 if (STMT_VINFO_RELEVANT (stmt_info) == vect_unused_in_scope
6624 && !STMT_VINFO_LIVE_P (stmt_info))
6625 return false;
6627 /* 2. Has this been recognized as a reduction pattern?
6629 Check if STMT represents a pattern that has been recognized
6630 in earlier analysis stages. For stmts that represent a pattern,
6631 the STMT_VINFO_RELATED_STMT field records the last stmt in
6632 the original sequence that constitutes the pattern. */
6634 stmt_vec_info orig_stmt_info = STMT_VINFO_RELATED_STMT (stmt_info);
6635 if (orig_stmt_info)
6637 gcc_assert (STMT_VINFO_IN_PATTERN_P (orig_stmt_info));
6638 gcc_assert (!STMT_VINFO_IN_PATTERN_P (stmt_info));
6641 /* 3. Check the operands of the operation. The first operands are defined
6642 inside the loop body. The last operand is the reduction variable,
6643 which is defined by the loop-header-phi. */
6645 tree vectype_out = STMT_VINFO_VECTYPE (stmt_info);
6646 STMT_VINFO_REDUC_VECTYPE (reduc_info) = vectype_out;
6647 gimple_match_op op;
6648 if (!gimple_extract_op (stmt_info->stmt, &op))
6649 gcc_unreachable ();
6650 bool lane_reduc_code_p = (op.code == DOT_PROD_EXPR
6651 || op.code == WIDEN_SUM_EXPR
6652 || op.code == SAD_EXPR);
6653 enum optab_subtype optab_query_kind = optab_vector;
6654 if (op.code == DOT_PROD_EXPR
6655 && (TYPE_SIGN (TREE_TYPE (op.ops[0]))
6656 != TYPE_SIGN (TREE_TYPE (op.ops[1]))))
6657 optab_query_kind = optab_vector_mixed_sign;
6659 if (!POINTER_TYPE_P (op.type) && !INTEGRAL_TYPE_P (op.type)
6660 && !SCALAR_FLOAT_TYPE_P (op.type))
6661 return false;
6663 /* Do not try to vectorize bit-precision reductions. */
6664 if (!type_has_mode_precision_p (op.type))
6665 return false;
6667 /* For lane-reducing ops we're reducing the number of reduction PHIs
6668 which means the only use of that may be in the lane-reducing operation. */
6669 if (lane_reduc_code_p
6670 && reduc_chain_length != 1
6671 && !only_slp_reduc_chain)
6673 if (dump_enabled_p ())
6674 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6675 "lane-reducing reduction with extra stmts.\n");
6676 return false;
6679 /* All uses but the last are expected to be defined in the loop.
6680 The last use is the reduction variable. In case of nested cycle this
6681 assumption is not true: we use reduc_index to record the index of the
6682 reduction variable. */
6683 slp_tree *slp_op = XALLOCAVEC (slp_tree, op.num_ops);
6684 /* We need to skip an extra operand for COND_EXPRs with embedded
6685 comparison. */
6686 unsigned opno_adjust = 0;
6687 if (op.code == COND_EXPR && COMPARISON_CLASS_P (op.ops[0]))
6688 opno_adjust = 1;
6689 for (i = 0; i < (int) op.num_ops; i++)
6691 /* The condition of COND_EXPR is checked in vectorizable_condition(). */
6692 if (i == 0 && op.code == COND_EXPR)
6693 continue;
6695 stmt_vec_info def_stmt_info;
6696 enum vect_def_type dt;
6697 if (!vect_is_simple_use (loop_vinfo, stmt_info, slp_for_stmt_info,
6698 i + opno_adjust, &op.ops[i], &slp_op[i], &dt,
6699 &tem, &def_stmt_info))
6701 if (dump_enabled_p ())
6702 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6703 "use not simple.\n");
6704 return false;
6706 if (i == STMT_VINFO_REDUC_IDX (stmt_info))
6707 continue;
6709 /* There should be only one cycle def in the stmt, the one
6710 leading to reduc_def. */
6711 if (VECTORIZABLE_CYCLE_DEF (dt))
6712 return false;
6714 /* To properly compute ncopies we are interested in the widest
6715 non-reduction input type in case we're looking at a widening
6716 accumulation that we later handle in vect_transform_reduction. */
6717 if (lane_reduc_code_p
6718 && tem
6719 && (!vectype_in
6720 || (GET_MODE_SIZE (SCALAR_TYPE_MODE (TREE_TYPE (vectype_in)))
6721 < GET_MODE_SIZE (SCALAR_TYPE_MODE (TREE_TYPE (tem))))))
6722 vectype_in = tem;
6724 if (op.code == COND_EXPR)
6726 /* Record how the non-reduction-def value of COND_EXPR is defined. */
6727 if (dt == vect_constant_def)
6729 cond_reduc_dt = dt;
6730 cond_reduc_val = op.ops[i];
6732 if (dt == vect_induction_def
6733 && def_stmt_info
6734 && is_nonwrapping_integer_induction (def_stmt_info, loop))
6736 cond_reduc_dt = dt;
6737 cond_stmt_vinfo = def_stmt_info;
6741 if (!vectype_in)
6742 vectype_in = STMT_VINFO_VECTYPE (phi_info);
6743 STMT_VINFO_REDUC_VECTYPE_IN (reduc_info) = vectype_in;
6745 enum vect_reduction_type v_reduc_type = STMT_VINFO_REDUC_TYPE (phi_info);
6746 STMT_VINFO_REDUC_TYPE (reduc_info) = v_reduc_type;
6747 /* If we have a condition reduction, see if we can simplify it further. */
6748 if (v_reduc_type == COND_REDUCTION)
6750 if (slp_node)
6751 return false;
6753 /* When the reduction value is used in the condition itself, fail. */
6754 if (STMT_VINFO_REDUC_IDX (stmt_info) == 0)
6756 if (dump_enabled_p ())
6757 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6758 "condition depends on previous iteration\n");
6759 return false;
6762 if (reduc_chain_length == 1
6763 && direct_internal_fn_supported_p (IFN_FOLD_EXTRACT_LAST,
6764 vectype_in, OPTIMIZE_FOR_SPEED))
6766 if (dump_enabled_p ())
6767 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6768 "optimizing condition reduction with"
6769 " FOLD_EXTRACT_LAST.\n");
6770 STMT_VINFO_REDUC_TYPE (reduc_info) = EXTRACT_LAST_REDUCTION;
6772 else if (cond_reduc_dt == vect_induction_def)
6774 tree base
6775 = STMT_VINFO_LOOP_PHI_EVOLUTION_BASE_UNCHANGED (cond_stmt_vinfo);
6776 tree step = STMT_VINFO_LOOP_PHI_EVOLUTION_PART (cond_stmt_vinfo);
6778 gcc_assert (TREE_CODE (base) == INTEGER_CST
6779 && TREE_CODE (step) == INTEGER_CST);
6780 cond_reduc_val = NULL_TREE;
6781 enum tree_code cond_reduc_op_code = ERROR_MARK;
6782 tree res = PHI_RESULT (STMT_VINFO_STMT (cond_stmt_vinfo));
6783 if (!types_compatible_p (TREE_TYPE (res), TREE_TYPE (base)))
6785 /* Find a suitable value: below base for MAX_EXPR, above base for
6786 MIN_EXPR; for now punt if base is the minimum value of the type for
6787 MAX_EXPR or the maximum value of the type for MIN_EXPR. */
6788 else if (tree_int_cst_sgn (step) == -1)
6790 cond_reduc_op_code = MIN_EXPR;
6791 if (tree_int_cst_sgn (base) == -1)
6792 cond_reduc_val = build_int_cst (TREE_TYPE (base), 0);
6793 else if (tree_int_cst_lt (base,
6794 TYPE_MAX_VALUE (TREE_TYPE (base))))
6795 cond_reduc_val
6796 = int_const_binop (PLUS_EXPR, base, integer_one_node);
6798 else
6800 cond_reduc_op_code = MAX_EXPR;
6801 if (tree_int_cst_sgn (base) == 1)
6802 cond_reduc_val = build_int_cst (TREE_TYPE (base), 0);
6803 else if (tree_int_cst_lt (TYPE_MIN_VALUE (TREE_TYPE (base)),
6804 base))
6805 cond_reduc_val
6806 = int_const_binop (MINUS_EXPR, base, integer_one_node);
6808 if (cond_reduc_val)
6810 if (dump_enabled_p ())
6811 dump_printf_loc (MSG_NOTE, vect_location,
6812 "condition expression based on "
6813 "integer induction.\n");
6814 STMT_VINFO_REDUC_CODE (reduc_info) = cond_reduc_op_code;
6815 STMT_VINFO_VEC_INDUC_COND_INITIAL_VAL (reduc_info)
6816 = cond_reduc_val;
6817 STMT_VINFO_REDUC_TYPE (reduc_info) = INTEGER_INDUC_COND_REDUCTION;
6820 else if (cond_reduc_dt == vect_constant_def)
6822 enum vect_def_type cond_initial_dt;
6823 tree cond_initial_val = vect_phi_initial_value (reduc_def_phi);
6824 vect_is_simple_use (cond_initial_val, loop_vinfo, &cond_initial_dt);
6825 if (cond_initial_dt == vect_constant_def
6826 && types_compatible_p (TREE_TYPE (cond_initial_val),
6827 TREE_TYPE (cond_reduc_val)))
6829 tree e = fold_binary (LE_EXPR, boolean_type_node,
6830 cond_initial_val, cond_reduc_val);
6831 if (e && (integer_onep (e) || integer_zerop (e)))
6833 if (dump_enabled_p ())
6834 dump_printf_loc (MSG_NOTE, vect_location,
6835 "condition expression based on "
6836 "compile time constant.\n");
6837 /* Record reduction code at analysis stage. */
6838 STMT_VINFO_REDUC_CODE (reduc_info)
6839 = integer_onep (e) ? MAX_EXPR : MIN_EXPR;
6840 STMT_VINFO_REDUC_TYPE (reduc_info) = CONST_COND_REDUCTION;
6846 if (STMT_VINFO_LIVE_P (phi_info))
6847 return false;
6849 if (slp_node)
6850 ncopies = 1;
6851 else
6852 ncopies = vect_get_num_copies (loop_vinfo, vectype_in);
6854 gcc_assert (ncopies >= 1);
6856 poly_uint64 nunits_out = TYPE_VECTOR_SUBPARTS (vectype_out);
6858 if (nested_cycle)
6860 gcc_assert (STMT_VINFO_DEF_TYPE (reduc_info)
6861 == vect_double_reduction_def);
6862 double_reduc = true;
6865 /* 4.2. Check support for the epilog operation.
6867 If STMT represents a reduction pattern, then the type of the
6868 reduction variable may be different than the type of the rest
6869 of the arguments. For example, consider the case of accumulation
6870 of shorts into an int accumulator; The original code:
6871 S1: int_a = (int) short_a;
6872 orig_stmt-> S2: int_acc = plus <int_a ,int_acc>;
6874 was replaced with:
6875 STMT: int_acc = widen_sum <short_a, int_acc>
6877 This means that:
6878 1. The tree-code that is used to create the vector operation in the
6879 epilog code (that reduces the partial results) is not the
6880 tree-code of STMT, but is rather the tree-code of the original
6881 stmt from the pattern that STMT is replacing. I.e., in the example
6882 above we want to use 'widen_sum' in the loop, but 'plus' in the
6883 epilog.
6884 2. The type (mode) we use to check available target support
6885 for the vector operation to be created in the *epilog*, is
6886 determined by the type of the reduction variable (in the example
6887 above we'd check this: optab_handler (plus_optab, vect_int_mode)).
6888 However the type (mode) we use to check available target support
6889 for the vector operation to be created *inside the loop*, is
6890 determined by the type of the other arguments to STMT (in the
6891 example we'd check this: optab_handler (widen_sum_optab,
6892 vect_short_mode)).
6894 This is contrary to "regular" reductions, in which the types of all
6895 the arguments are the same as the type of the reduction variable.
6896 For "regular" reductions we can therefore use the same vector type
6897 (and also the same tree-code) when generating the epilog code and
6898 when generating the code inside the loop. */
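/* A concrete (illustrative) instance of the above: with 128-bit vectors the
   short inputs live in a V8HI vector and the int accumulator in a V4SI
   vector.  Inside the loop we need target support for WIDEN_SUM on the
   V8HI input mode, while the epilog only needs an ordinary V4SI addition /
   reduction to fold the four partial int sums into the scalar result.  */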
6900 code_helper orig_code = STMT_VINFO_REDUC_CODE (phi_info);
6901 STMT_VINFO_REDUC_CODE (reduc_info) = orig_code;
6903 vect_reduction_type reduction_type = STMT_VINFO_REDUC_TYPE (reduc_info);
6904 if (reduction_type == TREE_CODE_REDUCTION)
6906 /* Check whether it's ok to change the order of the computation.
6907 Generally, when vectorizing a reduction we change the order of the
6908 computation. This may change the behavior of the program in some
6909 cases, so we need to check that this is ok. One exception is when
6910 vectorizing an outer-loop: the inner-loop is executed sequentially,
6911 and therefore vectorizing reductions in the inner-loop during
6912 outer-loop vectorization is safe. Likewise when we are vectorizing
6913 a series of reductions using SLP and the VF is one, the reductions
6914 are performed in scalar order. */
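/* For example, a float accumulation evaluates as
     ((((s + a[0]) + a[1]) + a[2]) + a[3])
   in the scalar loop but as something like
     (s + a[0] + a[2]) + (a[1] + a[3])
   once vectorized with two lanes, which may round differently; that is why
   needs_fold_left_reduction_p is consulted just below before the
   reordering is allowed.  */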
6915 if (slp_node
6916 && !REDUC_GROUP_FIRST_ELEMENT (stmt_info)
6917 && known_eq (LOOP_VINFO_VECT_FACTOR (loop_vinfo), 1u))
6919 else if (needs_fold_left_reduction_p (op.type, orig_code))
6921 /* When vectorizing a reduction chain w/o SLP the reduction PHI
6922 is not directly used in stmt. */
6923 if (!only_slp_reduc_chain
6924 && reduc_chain_length != 1)
6926 if (dump_enabled_p ())
6927 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6928 "in-order reduction chain without SLP.\n");
6929 return false;
6931 STMT_VINFO_REDUC_TYPE (reduc_info)
6932 = reduction_type = FOLD_LEFT_REDUCTION;
6934 else if (!commutative_binary_op_p (orig_code, op.type)
6935 || !associative_binary_op_p (orig_code, op.type))
6937 if (dump_enabled_p ())
6938 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6939 "reduction: not commutative/associative");
6940 return false;
6944 if ((double_reduc || reduction_type != TREE_CODE_REDUCTION)
6945 && ncopies > 1)
6947 if (dump_enabled_p ())
6948 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6949 "multiple types in double reduction or condition "
6950 "reduction or fold-left reduction.\n");
6951 return false;
6954 internal_fn reduc_fn = IFN_LAST;
6955 if (reduction_type == TREE_CODE_REDUCTION
6956 || reduction_type == FOLD_LEFT_REDUCTION
6957 || reduction_type == INTEGER_INDUC_COND_REDUCTION
6958 || reduction_type == CONST_COND_REDUCTION)
6960 if (reduction_type == FOLD_LEFT_REDUCTION
6961 ? fold_left_reduction_fn (orig_code, &reduc_fn)
6962 : reduction_fn_for_scalar_code (orig_code, &reduc_fn))
6964 if (reduc_fn != IFN_LAST
6965 && !direct_internal_fn_supported_p (reduc_fn, vectype_out,
6966 OPTIMIZE_FOR_SPEED))
6968 if (dump_enabled_p ())
6969 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6970 "reduc op not supported by target.\n");
6972 reduc_fn = IFN_LAST;
6975 else
6977 if (!nested_cycle || double_reduc)
6979 if (dump_enabled_p ())
6980 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6981 "no reduc code for scalar code.\n");
6983 return false;
6987 else if (reduction_type == COND_REDUCTION)
6989 int scalar_precision
6990 = GET_MODE_PRECISION (SCALAR_TYPE_MODE (op.type));
6991 cr_index_scalar_type = make_unsigned_type (scalar_precision);
6992 cr_index_vector_type = get_same_sized_vectype (cr_index_scalar_type,
6993 vectype_out);
6995 if (direct_internal_fn_supported_p (IFN_REDUC_MAX, cr_index_vector_type,
6996 OPTIMIZE_FOR_SPEED))
6997 reduc_fn = IFN_REDUC_MAX;
6999 STMT_VINFO_REDUC_FN (reduc_info) = reduc_fn;
7001 if (reduction_type != EXTRACT_LAST_REDUCTION
7002 && (!nested_cycle || double_reduc)
7003 && reduc_fn == IFN_LAST
7004 && !nunits_out.is_constant ())
7006 if (dump_enabled_p ())
7007 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7008 "missing target support for reduction on"
7009 " variable-length vectors.\n");
7010 return false;
7013 /* For SLP reductions, see if there is a neutral value we can use. */
7014 tree neutral_op = NULL_TREE;
7015 if (slp_node)
7017 tree initial_value = NULL_TREE;
7018 if (REDUC_GROUP_FIRST_ELEMENT (stmt_info) != NULL)
7019 initial_value = vect_phi_initial_value (reduc_def_phi);
7020 neutral_op = neutral_op_for_reduction (TREE_TYPE (vectype_out),
7021 orig_code, initial_value);
7024 if (double_reduc && reduction_type == FOLD_LEFT_REDUCTION)
7026 /* We can't support in-order reductions of code such as this:
7028 for (int i = 0; i < n1; ++i)
7029 for (int j = 0; j < n2; ++j)
7030 l += a[j];
7032 since GCC effectively transforms the loop when vectorizing:
7034 for (int i = 0; i < n1 / VF; ++i)
7035 for (int j = 0; j < n2; ++j)
7036 for (int k = 0; k < VF; ++k)
7037 l += a[j];
7039 which is a reassociation of the original operation. */
7040 if (dump_enabled_p ())
7041 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7042 "in-order double reduction not supported.\n");
7044 return false;
7047 if (reduction_type == FOLD_LEFT_REDUCTION
7048 && slp_node
7049 && !REDUC_GROUP_FIRST_ELEMENT (stmt_info))
7051 /* We cannot use in-order reductions in this case because there is
7052 an implicit reassociation of the operations involved. */
7053 if (dump_enabled_p ())
7054 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7055 "in-order unchained SLP reductions not supported.\n");
7056 return false;
7059 /* For double reductions, and for SLP reductions with a neutral value,
7060 we construct a variable-length initial vector by loading a vector
7061 full of the neutral value and then shift-and-inserting the start
7062 values into the low-numbered elements. */
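/* Sketch of the idea (values purely illustrative): for a PLUS reduction
   with start value s the neutral value is 0, so for a length-agnostic
   vector we build { 0, 0, ..., 0 } and then shift-insert s to obtain
   { s, 0, ..., 0 }, which is why IFN_VEC_SHL_INSERT support is required
   below.  */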
7063 if ((double_reduc || neutral_op)
7064 && !nunits_out.is_constant ()
7065 && !direct_internal_fn_supported_p (IFN_VEC_SHL_INSERT,
7066 vectype_out, OPTIMIZE_FOR_SPEED))
7068 if (dump_enabled_p ())
7069 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7070 "reduction on variable-length vectors requires"
7071 " target support for a vector-shift-and-insert"
7072 " operation.\n");
7073 return false;
7076 /* Check extra constraints for variable-length unchained SLP reductions. */
7077 if (STMT_SLP_TYPE (stmt_info)
7078 && !REDUC_GROUP_FIRST_ELEMENT (stmt_info)
7079 && !nunits_out.is_constant ())
7081 /* We checked above that we could build the initial vector when
7082 there's a neutral element value. Check here for the case in
7083 which each SLP statement has its own initial value and in which
7084 that value needs to be repeated for every instance of the
7085 statement within the initial vector. */
7086 unsigned int group_size = SLP_TREE_LANES (slp_node);
7087 if (!neutral_op
7088 && !can_duplicate_and_interleave_p (loop_vinfo, group_size,
7089 TREE_TYPE (vectype_out)))
7091 if (dump_enabled_p ())
7092 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7093 "unsupported form of SLP reduction for"
7094 " variable-length vectors: cannot build"
7095 " initial vector.\n");
7096 return false;
7098 /* The epilogue code relies on the number of elements being a multiple
7099 of the group size. The duplicate-and-interleave approach to setting
7100 up the initial vector does too. */
7101 if (!multiple_p (nunits_out, group_size))
7103 if (dump_enabled_p ())
7104 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7105 "unsupported form of SLP reduction for"
7106 " variable-length vectors: the vector size"
7107 " is not a multiple of the number of results.\n");
7108 return false;
7112 if (reduction_type == COND_REDUCTION)
7114 widest_int ni;
7116 if (! max_loop_iterations (loop, &ni))
7118 if (dump_enabled_p ())
7119 dump_printf_loc (MSG_NOTE, vect_location,
7120 "loop count not known, cannot create cond "
7121 "reduction.\n");
7122 return false;
7124 /* Convert backedges to iterations. */
7125 ni += 1;
7127 /* The additional index will be the same type as the condition. Check
7128 that the loop iteration count fits into this type less one (because
7129 we'll use up the zero slot for when there are no matches). */
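/* Worked example (an 8-bit unsigned index type, chosen only for
   illustration): max_index is 255, so the check below requires ni < 255,
   i.e. at most 254 iterations; indices 1..254 then identify the matching
   iteration and index 0 is reserved for "no match".  */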
7130 tree max_index = TYPE_MAX_VALUE (cr_index_scalar_type);
7131 if (wi::geu_p (ni, wi::to_widest (max_index)))
7133 if (dump_enabled_p ())
7134 dump_printf_loc (MSG_NOTE, vect_location,
7135 "loop size is greater than data size.\n");
7136 return false;
7140 /* In case the vectorization factor (VF) is bigger than the number
7141 of elements that we can fit in a vectype (nunits), we have to generate
7142 more than one vector stmt - i.e. - we need to "unroll" the
7143 vector stmt by a factor VF/nunits. For more details see documentation
7144 in vectorizable_operation. */
7146 /* If the reduction is used in an outer loop we need to generate
7147 VF intermediate results, like so (e.g. for ncopies=2):
7148 r0 = phi (init, r0)
7149 r1 = phi (init, r1)
7150 r0 = x0 + r0;
7151 r1 = x1 + r1;
7152 (i.e. we generate VF results in 2 registers).
7153 In this case we have a separate def-use cycle for each copy, and therefore
7154 for each copy we get the vector def for the reduction variable from the
7155 respective phi node created for this copy.
7157 Otherwise (the reduction is unused in the loop nest), we can combine
7158 together intermediate results, like so (e.g. for ncopies=2):
7159 r = phi (init, r)
7160 r = x0 + r;
7161 r = x1 + r;
7162 (i.e. we generate VF/2 results in a single register).
7163 In this case for each copy we get the vector def for the reduction variable
7164 from the vectorized reduction operation generated in the previous iteration.
7166 This only works when we see both the reduction PHI and its only consumer
7167 in vectorizable_reduction and there are no intermediate stmts
7168 participating. */
7169 if (ncopies > 1
7170 && (STMT_VINFO_RELEVANT (stmt_info) <= vect_used_only_live)
7171 && reduc_chain_length == 1)
7172 single_defuse_cycle = true;
7174 if (single_defuse_cycle || lane_reduc_code_p)
7176 gcc_assert (op.code != COND_EXPR);
7178 /* 4. Supportable by target? */
7179 bool ok = true;
7181 /* 4.1. check support for the operation in the loop */
7182 machine_mode vec_mode = TYPE_MODE (vectype_in);
7183 if (!directly_supported_p (op.code, vectype_in, optab_query_kind))
7185 if (dump_enabled_p ())
7186 dump_printf (MSG_NOTE, "op not supported by target.\n");
7187 if (maybe_ne (GET_MODE_SIZE (vec_mode), UNITS_PER_WORD)
7188 || !vect_can_vectorize_without_simd_p (op.code))
7189 ok = false;
7190 else
7191 if (dump_enabled_p ())
7192 dump_printf (MSG_NOTE, "proceeding using word mode.\n");
7195 if (vect_emulated_vector_p (vectype_in)
7196 && !vect_can_vectorize_without_simd_p (op.code))
7198 if (dump_enabled_p ())
7199 dump_printf (MSG_NOTE, "using word mode not possible.\n");
7200 return false;
7203 /* lane-reducing operations have to go through vect_transform_reduction.
7204 For the other cases try without the single cycle optimization. */
7205 if (!ok)
7207 if (lane_reduc_code_p)
7208 return false;
7209 else
7210 single_defuse_cycle = false;
7213 STMT_VINFO_FORCE_SINGLE_CYCLE (reduc_info) = single_defuse_cycle;
7215 /* If the reduction stmt is one of the patterns that have lane
7216 reduction embedded, we cannot handle the case of ! single_defuse_cycle. */
7217 if ((ncopies > 1 && ! single_defuse_cycle)
7218 && lane_reduc_code_p)
7220 if (dump_enabled_p ())
7221 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7222 "multi def-use cycle not possible for lane-reducing "
7223 "reduction operation\n");
7224 return false;
7227 if (slp_node
7228 && !(!single_defuse_cycle
7229 && !lane_reduc_code_p
7230 && reduction_type != FOLD_LEFT_REDUCTION))
7231 for (i = 0; i < (int) op.num_ops; i++)
7232 if (!vect_maybe_update_slp_op_vectype (slp_op[i], vectype_in))
7234 if (dump_enabled_p ())
7235 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7236 "incompatible vector types for invariants\n");
7237 return false;
7240 if (slp_node)
7241 vec_num = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node);
7242 else
7243 vec_num = 1;
7245 vect_model_reduction_cost (loop_vinfo, stmt_info, reduc_fn,
7246 reduction_type, ncopies, cost_vec);
7247 /* Cost the reduction op inside the loop if transformed via
7248 vect_transform_reduction. Otherwise this is costed by the
7249 separate vectorizable_* routines. */
7250 if (single_defuse_cycle || lane_reduc_code_p)
7251 record_stmt_cost (cost_vec, ncopies, vector_stmt, stmt_info, 0, vect_body);
7253 if (dump_enabled_p ()
7254 && reduction_type == FOLD_LEFT_REDUCTION)
7255 dump_printf_loc (MSG_NOTE, vect_location,
7256 "using an in-order (fold-left) reduction.\n");
7257 STMT_VINFO_TYPE (orig_stmt_of_analysis) = cycle_phi_info_type;
7258 /* All reductions except single defuse-cycle optimized, lane-reducing
7259 and fold-left ones go through their own vectorizable_* routines. */
7260 if (!single_defuse_cycle
7261 && !lane_reduc_code_p
7262 && reduction_type != FOLD_LEFT_REDUCTION)
7264 stmt_vec_info tem
7265 = vect_stmt_to_vectorize (STMT_VINFO_REDUC_DEF (phi_info));
7266 if (slp_node && REDUC_GROUP_FIRST_ELEMENT (tem))
7268 gcc_assert (!REDUC_GROUP_NEXT_ELEMENT (tem));
7269 tem = REDUC_GROUP_FIRST_ELEMENT (tem);
7271 STMT_VINFO_DEF_TYPE (vect_orig_stmt (tem)) = vect_internal_def;
7272 STMT_VINFO_DEF_TYPE (tem) = vect_internal_def;
7274 else if (loop_vinfo && LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo))
7276 vec_loop_masks *masks = &LOOP_VINFO_MASKS (loop_vinfo);
7277 internal_fn cond_fn = get_conditional_internal_fn (op.code, op.type);
7279 if (reduction_type != FOLD_LEFT_REDUCTION
7280 && !use_mask_by_cond_expr_p (op.code, cond_fn, vectype_in)
7281 && (cond_fn == IFN_LAST
7282 || !direct_internal_fn_supported_p (cond_fn, vectype_in,
7283 OPTIMIZE_FOR_SPEED)))
7285 if (dump_enabled_p ())
7286 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7287 "can't operate on partial vectors because"
7288 " no conditional operation is available.\n");
7289 LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo) = false;
7291 else if (reduction_type == FOLD_LEFT_REDUCTION
7292 && reduc_fn == IFN_LAST
7293 && !expand_vec_cond_expr_p (vectype_in,
7294 truth_type_for (vectype_in),
7295 SSA_NAME))
7297 if (dump_enabled_p ())
7298 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7299 "can't operate on partial vectors because"
7300 " no conditional operation is available.\n");
7301 LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo) = false;
7303 else
7304 vect_record_loop_mask (loop_vinfo, masks, ncopies * vec_num,
7305 vectype_in, NULL);
7307 return true;
7310 /* Transform the definition stmt STMT_INFO of a reduction PHI backedge
7311 value. */
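/* Rough sketch of what follows (informal, not exhaustive): for a reduction
   like sum += a[i] the stmt transformed here is the addition feeding the
   reduction PHI's backedge; for each vector copy we emit the vector
   operation, either as a plain assignment or, in a fully-masked loop, via
   a conditional internal function or a VEC_COND_EXPR so that inactive
   lanes keep the accumulator value.  In-order (fold-left) reductions are
   dispatched to vectorize_fold_left_reduction instead.  */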
7313 bool
7314 vect_transform_reduction (loop_vec_info loop_vinfo,
7315 stmt_vec_info stmt_info, gimple_stmt_iterator *gsi,
7316 gimple **vec_stmt, slp_tree slp_node)
7318 tree vectype_out = STMT_VINFO_VECTYPE (stmt_info);
7319 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
7320 int i;
7321 int ncopies;
7322 int vec_num;
7324 stmt_vec_info reduc_info = info_for_reduction (loop_vinfo, stmt_info);
7325 gcc_assert (reduc_info->is_reduc_info);
7327 if (nested_in_vect_loop_p (loop, stmt_info))
7329 loop = loop->inner;
7330 gcc_assert (STMT_VINFO_DEF_TYPE (reduc_info) == vect_double_reduction_def);
7333 gimple_match_op op;
7334 if (!gimple_extract_op (stmt_info->stmt, &op))
7335 gcc_unreachable ();
7336 gcc_assert (op.code.is_tree_code ());
7337 auto code = tree_code (op.code);
7339 /* All uses but the last are expected to be defined in the loop.
7340 The last use is the reduction variable. In case of nested cycle this
7341 assumption is not true: we use reduc_index to record the index of the
7342 reduction variable. */
7343 stmt_vec_info phi_info = STMT_VINFO_REDUC_DEF (vect_orig_stmt (stmt_info));
7344 gphi *reduc_def_phi = as_a <gphi *> (phi_info->stmt);
7345 int reduc_index = STMT_VINFO_REDUC_IDX (stmt_info);
7346 tree vectype_in = STMT_VINFO_REDUC_VECTYPE_IN (reduc_info);
7348 if (slp_node)
7350 ncopies = 1;
7351 vec_num = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node);
7353 else
7355 ncopies = vect_get_num_copies (loop_vinfo, vectype_in);
7356 vec_num = 1;
7359 internal_fn cond_fn = get_conditional_internal_fn (code);
7360 vec_loop_masks *masks = &LOOP_VINFO_MASKS (loop_vinfo);
7361 bool mask_by_cond_expr = use_mask_by_cond_expr_p (code, cond_fn, vectype_in);
7363 /* Transform. */
7364 tree new_temp = NULL_TREE;
7365 auto_vec<tree> vec_oprnds0;
7366 auto_vec<tree> vec_oprnds1;
7367 auto_vec<tree> vec_oprnds2;
7368 tree def0;
7370 if (dump_enabled_p ())
7371 dump_printf_loc (MSG_NOTE, vect_location, "transform reduction.\n");
7373 /* FORNOW: Multiple types are not supported for condition. */
7374 if (code == COND_EXPR)
7375 gcc_assert (ncopies == 1);
7377 bool masked_loop_p = LOOP_VINFO_FULLY_MASKED_P (loop_vinfo);
7379 vect_reduction_type reduction_type = STMT_VINFO_REDUC_TYPE (reduc_info);
7380 if (reduction_type == FOLD_LEFT_REDUCTION)
7382 internal_fn reduc_fn = STMT_VINFO_REDUC_FN (reduc_info);
7383 return vectorize_fold_left_reduction
7384 (loop_vinfo, stmt_info, gsi, vec_stmt, slp_node, reduc_def_phi, code,
7385 reduc_fn, op.ops, vectype_in, reduc_index, masks);
7388 bool single_defuse_cycle = STMT_VINFO_FORCE_SINGLE_CYCLE (reduc_info);
7389 gcc_assert (single_defuse_cycle
7390 || code == DOT_PROD_EXPR
7391 || code == WIDEN_SUM_EXPR
7392 || code == SAD_EXPR);
7394 /* Create the destination vector */
7395 tree scalar_dest = gimple_assign_lhs (stmt_info->stmt);
7396 tree vec_dest = vect_create_destination_var (scalar_dest, vectype_out);
7398 vect_get_vec_defs (loop_vinfo, stmt_info, slp_node, ncopies,
7399 single_defuse_cycle && reduc_index == 0
7400 ? NULL_TREE : op.ops[0], &vec_oprnds0,
7401 single_defuse_cycle && reduc_index == 1
7402 ? NULL_TREE : op.ops[1], &vec_oprnds1,
7403 op.num_ops == 3
7404 && !(single_defuse_cycle && reduc_index == 2)
7405 ? op.ops[2] : NULL_TREE, &vec_oprnds2);
7406 if (single_defuse_cycle)
7408 gcc_assert (!slp_node);
7409 vect_get_vec_defs_for_operand (loop_vinfo, stmt_info, 1,
7410 op.ops[reduc_index],
7411 reduc_index == 0 ? &vec_oprnds0
7412 : (reduc_index == 1 ? &vec_oprnds1
7413 : &vec_oprnds2));
7416 FOR_EACH_VEC_ELT (vec_oprnds0, i, def0)
7418 gimple *new_stmt;
7419 tree vop[3] = { def0, vec_oprnds1[i], NULL_TREE };
7420 if (masked_loop_p && !mask_by_cond_expr)
7422 /* Make sure that the reduction accumulator is vop[0]. */
7423 if (reduc_index == 1)
7425 gcc_assert (commutative_tree_code (code));
7426 std::swap (vop[0], vop[1]);
7428 tree mask = vect_get_loop_mask (gsi, masks, vec_num * ncopies,
7429 vectype_in, i);
7430 gcall *call = gimple_build_call_internal (cond_fn, 4, mask,
7431 vop[0], vop[1], vop[0]);
7432 new_temp = make_ssa_name (vec_dest, call);
7433 gimple_call_set_lhs (call, new_temp);
7434 gimple_call_set_nothrow (call, true);
7435 vect_finish_stmt_generation (loop_vinfo, stmt_info, call, gsi);
7436 new_stmt = call;
7438 else
7440 if (op.num_ops == 3)
7441 vop[2] = vec_oprnds2[i];
7443 if (masked_loop_p && mask_by_cond_expr)
7445 tree mask = vect_get_loop_mask (gsi, masks, vec_num * ncopies,
7446 vectype_in, i);
7447 build_vect_cond_expr (code, vop, mask, gsi);
7450 new_stmt = gimple_build_assign (vec_dest, code,
7451 vop[0], vop[1], vop[2]);
7452 new_temp = make_ssa_name (vec_dest, new_stmt);
7453 gimple_assign_set_lhs (new_stmt, new_temp);
7454 vect_finish_stmt_generation (loop_vinfo, stmt_info, new_stmt, gsi);
7457 if (slp_node)
7458 SLP_TREE_VEC_STMTS (slp_node).quick_push (new_stmt);
7459 else if (single_defuse_cycle
7460 && i < ncopies - 1)
7462 if (reduc_index == 0)
7463 vec_oprnds0.safe_push (gimple_get_lhs (new_stmt));
7464 else if (reduc_index == 1)
7465 vec_oprnds1.safe_push (gimple_get_lhs (new_stmt));
7466 else if (reduc_index == 2)
7467 vec_oprnds2.safe_push (gimple_get_lhs (new_stmt));
7469 else
7470 STMT_VINFO_VEC_STMTS (stmt_info).safe_push (new_stmt);
7473 if (!slp_node)
7474 *vec_stmt = STMT_VINFO_VEC_STMTS (stmt_info)[0];
7476 return true;
7479 /* Transform phase of a cycle PHI. */
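/* In outline (informal summary): this creates the vector reduction PHIs in
   the loop header and sets only their loop-entry arguments, built from the
   scalar initial values, a neutral value, or a reused accumulator of a
   main loop; the loop-latch arguments are filled in later during epilogue
   generation.  */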
7481 bool
7482 vect_transform_cycle_phi (loop_vec_info loop_vinfo,
7483 stmt_vec_info stmt_info, gimple **vec_stmt,
7484 slp_tree slp_node, slp_instance slp_node_instance)
7486 tree vectype_out = STMT_VINFO_VECTYPE (stmt_info);
7487 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
7488 int i;
7489 int ncopies;
7490 int j;
7491 bool nested_cycle = false;
7492 int vec_num;
7494 if (nested_in_vect_loop_p (loop, stmt_info))
7496 loop = loop->inner;
7497 nested_cycle = true;
7500 stmt_vec_info reduc_stmt_info = STMT_VINFO_REDUC_DEF (stmt_info);
7501 reduc_stmt_info = vect_stmt_to_vectorize (reduc_stmt_info);
7502 stmt_vec_info reduc_info = info_for_reduction (loop_vinfo, stmt_info);
7503 gcc_assert (reduc_info->is_reduc_info);
7505 if (STMT_VINFO_REDUC_TYPE (reduc_info) == EXTRACT_LAST_REDUCTION
7506 || STMT_VINFO_REDUC_TYPE (reduc_info) == FOLD_LEFT_REDUCTION)
7507 /* Leave the scalar phi in place. */
7508 return true;
7510 tree vectype_in = STMT_VINFO_REDUC_VECTYPE_IN (reduc_info);
7511 /* For a nested cycle we do not fill the above. */
7512 if (!vectype_in)
7513 vectype_in = STMT_VINFO_VECTYPE (stmt_info);
7514 gcc_assert (vectype_in);
7516 if (slp_node)
7518 /* The size vect_schedule_slp_instance computes is off for us. */
7519 vec_num = vect_get_num_vectors (LOOP_VINFO_VECT_FACTOR (loop_vinfo)
7520 * SLP_TREE_LANES (slp_node), vectype_in);
7521 ncopies = 1;
7523 else
7525 vec_num = 1;
7526 ncopies = vect_get_num_copies (loop_vinfo, vectype_in);
7529 /* Check whether we should use a single PHI node and accumulate
7530 vectors to one before the backedge. */
7531 if (STMT_VINFO_FORCE_SINGLE_CYCLE (reduc_info))
7532 ncopies = 1;
7534 /* Create the destination vector */
7535 gphi *phi = as_a <gphi *> (stmt_info->stmt);
7536 tree vec_dest = vect_create_destination_var (gimple_phi_result (phi),
7537 vectype_out);
7539 /* Get the loop-entry arguments. */
7540 tree vec_initial_def = NULL_TREE;
7541 auto_vec<tree> vec_initial_defs;
7542 if (slp_node)
7544 vec_initial_defs.reserve (vec_num);
7545 if (nested_cycle)
7547 unsigned phi_idx = loop_preheader_edge (loop)->dest_idx;
7548 vect_get_slp_defs (SLP_TREE_CHILDREN (slp_node)[phi_idx],
7549 &vec_initial_defs);
7551 else
7553 gcc_assert (slp_node == slp_node_instance->reduc_phis);
7554 vec<tree> &initial_values = reduc_info->reduc_initial_values;
7555 vec<stmt_vec_info> &stmts = SLP_TREE_SCALAR_STMTS (slp_node);
7557 unsigned int num_phis = stmts.length ();
7558 if (REDUC_GROUP_FIRST_ELEMENT (reduc_stmt_info))
7559 num_phis = 1;
7560 initial_values.reserve (num_phis);
7561 for (unsigned int i = 0; i < num_phis; ++i)
7563 gphi *this_phi = as_a<gphi *> (stmts[i]->stmt);
7564 initial_values.quick_push (vect_phi_initial_value (this_phi));
7566 if (vec_num == 1)
7567 vect_find_reusable_accumulator (loop_vinfo, reduc_info);
7568 if (!initial_values.is_empty ())
7570 tree initial_value
7571 = (num_phis == 1 ? initial_values[0] : NULL_TREE);
7572 code_helper code = STMT_VINFO_REDUC_CODE (reduc_info);
7573 tree neutral_op
7574 = neutral_op_for_reduction (TREE_TYPE (vectype_out),
7575 code, initial_value);
7576 get_initial_defs_for_reduction (loop_vinfo, reduc_info,
7577 &vec_initial_defs, vec_num,
7578 stmts.length (), neutral_op);
7582 else
7584 /* Get at the scalar def before the loop, that defines the initial
7585 value of the reduction variable. */
7586 tree initial_def = vect_phi_initial_value (phi);
7587 reduc_info->reduc_initial_values.safe_push (initial_def);
7588 /* Optimize: if, for REDUC_MAX, initial_def is smaller than the base
7589 and we can't use zero for induc_val, use initial_def. Similarly
7590 for REDUC_MIN when initial_def is larger than the base. */
7591 if (STMT_VINFO_REDUC_TYPE (reduc_info) == INTEGER_INDUC_COND_REDUCTION)
7593 tree induc_val = STMT_VINFO_VEC_INDUC_COND_INITIAL_VAL (reduc_info);
7594 if (TREE_CODE (initial_def) == INTEGER_CST
7595 && !integer_zerop (induc_val)
7596 && ((STMT_VINFO_REDUC_CODE (reduc_info) == MAX_EXPR
7597 && tree_int_cst_lt (initial_def, induc_val))
7598 || (STMT_VINFO_REDUC_CODE (reduc_info) == MIN_EXPR
7599 && tree_int_cst_lt (induc_val, initial_def))))
7601 induc_val = initial_def;
7602 /* Communicate that we used the initial_def to epilogue
7603 generation. */
7604 STMT_VINFO_VEC_INDUC_COND_INITIAL_VAL (reduc_info) = NULL_TREE;
7606 vec_initial_def = build_vector_from_val (vectype_out, induc_val);
7608 else if (nested_cycle)
7610 /* Do not use an adjustment def as that case is not supported
7611 correctly if ncopies is not one. */
7612 vect_get_vec_defs_for_operand (loop_vinfo, reduc_stmt_info,
7613 ncopies, initial_def,
7614 &vec_initial_defs);
7616 else if (STMT_VINFO_REDUC_TYPE (reduc_info) == CONST_COND_REDUCTION
7617 || STMT_VINFO_REDUC_TYPE (reduc_info) == COND_REDUCTION)
7618 /* Fill the initial vector with the initial scalar value. */
7619 vec_initial_def
7620 = get_initial_def_for_reduction (loop_vinfo, reduc_stmt_info,
7621 initial_def, initial_def);
7622 else
7624 if (ncopies == 1)
7625 vect_find_reusable_accumulator (loop_vinfo, reduc_info);
7626 if (!reduc_info->reduc_initial_values.is_empty ())
7628 initial_def = reduc_info->reduc_initial_values[0];
7629 code_helper code = STMT_VINFO_REDUC_CODE (reduc_info);
7630 tree neutral_op
7631 = neutral_op_for_reduction (TREE_TYPE (initial_def),
7632 code, initial_def);
7633 gcc_assert (neutral_op);
7634 /* Try to simplify the vector initialization by applying an
7635 adjustment after the reduction has been performed. */
7636 if (!reduc_info->reused_accumulator
7637 && STMT_VINFO_DEF_TYPE (stmt_info) == vect_reduction_def
7638 && !operand_equal_p (neutral_op, initial_def))
7640 STMT_VINFO_REDUC_EPILOGUE_ADJUSTMENT (reduc_info)
7641 = initial_def;
7642 initial_def = neutral_op;
7644 vec_initial_def
7645 = get_initial_def_for_reduction (loop_vinfo, reduc_info,
7646 initial_def, neutral_op);
7651 if (vec_initial_def)
7653 vec_initial_defs.create (ncopies);
7654 for (i = 0; i < ncopies; ++i)
7655 vec_initial_defs.quick_push (vec_initial_def);
7658 if (auto *accumulator = reduc_info->reused_accumulator)
7660 tree def = accumulator->reduc_input;
7661 if (!useless_type_conversion_p (vectype_out, TREE_TYPE (def)))
7663 unsigned int nreduc;
7664 bool res = constant_multiple_p (TYPE_VECTOR_SUBPARTS
7665 (TREE_TYPE (def)),
7666 TYPE_VECTOR_SUBPARTS (vectype_out),
7667 &nreduc);
7668 gcc_assert (res);
7669 gimple_seq stmts = NULL;
7670 /* Reduce the single vector to a smaller one. */
7671 if (nreduc != 1)
7673 /* Perform the reduction in the appropriate type. */
7674 tree rvectype = vectype_out;
7675 if (!useless_type_conversion_p (TREE_TYPE (vectype_out),
7676 TREE_TYPE (TREE_TYPE (def))))
7677 rvectype = build_vector_type (TREE_TYPE (TREE_TYPE (def)),
7678 TYPE_VECTOR_SUBPARTS
7679 (vectype_out));
7680 def = vect_create_partial_epilog (def, rvectype,
7681 STMT_VINFO_REDUC_CODE
7682 (reduc_info),
7683 &stmts);
7685 /* The epilogue loop might use a different vector mode, like
7686 VNx2DI vs. V2DI. */
7687 if (TYPE_MODE (vectype_out) != TYPE_MODE (TREE_TYPE (def)))
7689 tree reduc_type = build_vector_type_for_mode
7690 (TREE_TYPE (TREE_TYPE (def)), TYPE_MODE (vectype_out));
7691 def = gimple_convert (&stmts, reduc_type, def);
7693 /* Adjust the input so we pick up the partially reduced value
7694 for the skip edge in vect_create_epilog_for_reduction. */
7695 accumulator->reduc_input = def;
7696 /* And the reduction could be carried out using a different sign. */
7697 if (!useless_type_conversion_p (vectype_out, TREE_TYPE (def)))
7698 def = gimple_convert (&stmts, vectype_out, def);
7699 if (loop_vinfo->main_loop_edge)
7701 /* While we'd like to insert on the edge, this would split
7702 blocks and disturb bookkeeping, and we will also eventually
7703 need this on the skip edge. Rely on sinking to fix up
7704 optimal placement and insert in the pred. */
7705 gimple_stmt_iterator gsi
7706 = gsi_last_bb (loop_vinfo->main_loop_edge->src);
7707 /* Insert before a cond that eventually skips the
7708 epilogue. */
7709 if (!gsi_end_p (gsi) && stmt_ends_bb_p (gsi_stmt (gsi)))
7710 gsi_prev (&gsi);
7711 gsi_insert_seq_after (&gsi, stmts, GSI_CONTINUE_LINKING);
7713 else
7714 gsi_insert_seq_on_edge_immediate (loop_preheader_edge (loop),
7715 stmts);
7717 if (loop_vinfo->main_loop_edge)
7718 vec_initial_defs[0]
7719 = vect_get_main_loop_result (loop_vinfo, def,
7720 vec_initial_defs[0]);
7721 else
7722 vec_initial_defs.safe_push (def);
7725 /* Generate the reduction PHIs upfront. */
7726 for (i = 0; i < vec_num; i++)
7728 tree vec_init_def = vec_initial_defs[i];
7729 for (j = 0; j < ncopies; j++)
7731 /* Create the reduction-phi that defines the reduction
7732 operand. */
7733 gphi *new_phi = create_phi_node (vec_dest, loop->header);
7735 /* Set the loop-entry arg of the reduction-phi. */
7736 if (j != 0 && nested_cycle)
7737 vec_init_def = vec_initial_defs[j];
7738 add_phi_arg (new_phi, vec_init_def, loop_preheader_edge (loop),
7739 UNKNOWN_LOCATION);
7741 /* The loop-latch arg is set in epilogue processing. */
7743 if (slp_node)
7744 SLP_TREE_VEC_STMTS (slp_node).quick_push (new_phi);
7745 else
7747 if (j == 0)
7748 *vec_stmt = new_phi;
7749 STMT_VINFO_VEC_STMTS (stmt_info).safe_push (new_phi);
7754 return true;
7757 /* Vectorizes LC PHIs. */
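/* A loop-closed ("LC") PHI here is a single-argument PHI such as

     x_1 = PHI <x_2(exit_edge)>

   (example for illustration only); vectorizing it simply forwards the
   vectorized definition of the argument across that edge through a new
   single-argument PHI.  */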
7759 bool
7760 vectorizable_lc_phi (loop_vec_info loop_vinfo,
7761 stmt_vec_info stmt_info, gimple **vec_stmt,
7762 slp_tree slp_node)
7764 if (!loop_vinfo
7765 || !is_a <gphi *> (stmt_info->stmt)
7766 || gimple_phi_num_args (stmt_info->stmt) != 1)
7767 return false;
7769 if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_internal_def
7770 && STMT_VINFO_DEF_TYPE (stmt_info) != vect_double_reduction_def)
7771 return false;
7773 if (!vec_stmt) /* transformation not required. */
7775 /* Deal with copies from externs or constants that are disguised as
7776 loop-closed PHI nodes (PR97886). */
7777 if (slp_node
7778 && !vect_maybe_update_slp_op_vectype (SLP_TREE_CHILDREN (slp_node)[0],
7779 SLP_TREE_VECTYPE (slp_node)))
7781 if (dump_enabled_p ())
7782 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7783 "incompatible vector types for invariants\n");
7784 return false;
7786 STMT_VINFO_TYPE (stmt_info) = lc_phi_info_type;
7787 return true;
7790 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
7791 tree scalar_dest = gimple_phi_result (stmt_info->stmt);
7792 basic_block bb = gimple_bb (stmt_info->stmt);
7793 edge e = single_pred_edge (bb);
7794 tree vec_dest = vect_create_destination_var (scalar_dest, vectype);
7795 auto_vec<tree> vec_oprnds;
7796 vect_get_vec_defs (loop_vinfo, stmt_info, slp_node,
7797 !slp_node ? vect_get_num_copies (loop_vinfo, vectype) : 1,
7798 gimple_phi_arg_def (stmt_info->stmt, 0), &vec_oprnds);
7799 for (unsigned i = 0; i < vec_oprnds.length (); i++)
7801 /* Create the vectorized LC PHI node. */
7802 gphi *new_phi = create_phi_node (vec_dest, bb);
7803 add_phi_arg (new_phi, vec_oprnds[i], e, UNKNOWN_LOCATION);
7804 if (slp_node)
7805 SLP_TREE_VEC_STMTS (slp_node).quick_push (new_phi);
7806 else
7807 STMT_VINFO_VEC_STMTS (stmt_info).safe_push (new_phi);
7809 if (!slp_node)
7810 *vec_stmt = STMT_VINFO_VEC_STMTS (stmt_info)[0];
7812 return true;
7815 /* Vectorizes PHIs. */
7817 bool
7818 vectorizable_phi (vec_info *,
7819 stmt_vec_info stmt_info, gimple **vec_stmt,
7820 slp_tree slp_node, stmt_vector_for_cost *cost_vec)
7822 if (!is_a <gphi *> (stmt_info->stmt) || !slp_node)
7823 return false;
7825 if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_internal_def)
7826 return false;
7828 tree vectype = SLP_TREE_VECTYPE (slp_node);
7830 if (!vec_stmt) /* transformation not required. */
7832 slp_tree child;
7833 unsigned i;
7834 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (slp_node), i, child)
7835 if (!child)
7837 if (dump_enabled_p ())
7838 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7839 "PHI node with unvectorized backedge def\n");
7840 return false;
7842 else if (!vect_maybe_update_slp_op_vectype (child, vectype))
7844 if (dump_enabled_p ())
7845 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7846 "incompatible vector types for invariants\n");
7847 return false;
7849 else if (SLP_TREE_DEF_TYPE (child) == vect_internal_def
7850 && !useless_type_conversion_p (vectype,
7851 SLP_TREE_VECTYPE (child)))
7853 /* With bools we can have mask and non-mask precision vectors;
7854 while pattern recog is supposed to guarantee consistency here,
7855 bugs in it can cause mismatches (PR103489 for example).
7856 Deal with them here instead of ICEing later. */
7857 if (dump_enabled_p ())
7858 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7859 "incompatible vector type setup from "
7860 "bool pattern detection\n");
7861 gcc_checking_assert
7862 (VECTOR_BOOLEAN_TYPE_P (SLP_TREE_VECTYPE (child))
7863 != VECTOR_BOOLEAN_TYPE_P (vectype));
7864 return false;
7867 /* For single-argument PHIs assume coalescing which means zero cost
7868 for the scalar and the vector PHIs. This avoids artificially
7869 favoring the vector path (but may pessimize it in some cases). */
7870 if (gimple_phi_num_args (as_a <gphi *> (stmt_info->stmt)) > 1)
7871 record_stmt_cost (cost_vec, SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node),
7872 vector_stmt, stmt_info, vectype, 0, vect_body);
7873 STMT_VINFO_TYPE (stmt_info) = phi_info_type;
7874 return true;
7877 tree scalar_dest = gimple_phi_result (stmt_info->stmt);
7878 basic_block bb = gimple_bb (stmt_info->stmt);
7879 tree vec_dest = vect_create_destination_var (scalar_dest, vectype);
7880 auto_vec<gphi *> new_phis;
7881 for (unsigned i = 0; i < gimple_phi_num_args (stmt_info->stmt); ++i)
7883 slp_tree child = SLP_TREE_CHILDREN (slp_node)[i];
7885 /* Skip not yet vectorized defs. */
7886 if (SLP_TREE_DEF_TYPE (child) == vect_internal_def
7887 && SLP_TREE_VEC_STMTS (child).is_empty ())
7888 continue;
7890 auto_vec<tree> vec_oprnds;
7891 vect_get_slp_defs (SLP_TREE_CHILDREN (slp_node)[i], &vec_oprnds);
7892 if (!new_phis.exists ())
7894 new_phis.create (vec_oprnds.length ());
7895 for (unsigned j = 0; j < vec_oprnds.length (); j++)
7897 /* Create the vectorized PHI node. */
7898 new_phis.quick_push (create_phi_node (vec_dest, bb));
7899 SLP_TREE_VEC_STMTS (slp_node).quick_push (new_phis[j]);
7902 edge e = gimple_phi_arg_edge (as_a <gphi *> (stmt_info->stmt), i);
7903 for (unsigned j = 0; j < vec_oprnds.length (); j++)
7904 add_phi_arg (new_phis[j], vec_oprnds[j], e, UNKNOWN_LOCATION);
7906 /* We should have at least one already vectorized child. */
7907 gcc_assert (new_phis.exists ());
7909 return true;
7912 /* Return true if VECTYPE represents a vector that requires lowering
7913 by the vector lowering pass. */
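/* For example (target-dependent, so only illustrative): a vector of four
   chars whose TYPE_MODE ends up being the integer mode SImode rather than
   a vector mode counts as emulated and must be lowered, whereas a
   single-bit-precision boolean mask vector does not.  */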
7915 bool
7916 vect_emulated_vector_p (tree vectype)
7918 return (!VECTOR_MODE_P (TYPE_MODE (vectype))
7919 && (!VECTOR_BOOLEAN_TYPE_P (vectype)
7920 || TYPE_PRECISION (TREE_TYPE (vectype)) != 1));
7923 /* Return true if we can emulate CODE on an integer mode representation
7924 of a vector. */
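/* Informally: bitwise AND/IOR/XOR/NOT act bit-parallel, so they work
   directly on the word-mode integer image of the vector, and the generic
   vector lowering can emulate PLUS/MINUS/NEGATE there with carry-masking
   bit tricks; an operation like MULT_EXPR cannot be handled that way,
   hence it is rejected below.  */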
7926 bool
7927 vect_can_vectorize_without_simd_p (tree_code code)
7929 switch (code)
7931 case PLUS_EXPR:
7932 case MINUS_EXPR:
7933 case NEGATE_EXPR:
7934 case BIT_AND_EXPR:
7935 case BIT_IOR_EXPR:
7936 case BIT_XOR_EXPR:
7937 case BIT_NOT_EXPR:
7938 return true;
7940 default:
7941 return false;
7945 /* Likewise, but taking a code_helper. */
7947 bool
7948 vect_can_vectorize_without_simd_p (code_helper code)
7950 return (code.is_tree_code ()
7951 && vect_can_vectorize_without_simd_p (tree_code (code)));
7954 /* Function vectorizable_induction
7956 Check if STMT_INFO performs an induction computation that can be vectorized.
7957 If VEC_STMT is also passed, vectorize the induction PHI: create a vectorized
7958 phi to replace it, put it in VEC_STMT, and add it to the same basic block.
7959 Return true if STMT_INFO is vectorizable in this way. */
7961 bool
7962 vectorizable_induction (loop_vec_info loop_vinfo,
7963 stmt_vec_info stmt_info,
7964 gimple **vec_stmt, slp_tree slp_node,
7965 stmt_vector_for_cost *cost_vec)
7967 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
7968 unsigned ncopies;
7969 bool nested_in_vect_loop = false;
7970 class loop *iv_loop;
7971 tree vec_def;
7972 edge pe = loop_preheader_edge (loop);
7973 basic_block new_bb;
7974 tree new_vec, vec_init, vec_step, t;
7975 tree new_name;
7976 gimple *new_stmt;
7977 gphi *induction_phi;
7978 tree induc_def, vec_dest;
7979 tree init_expr, step_expr;
7980 poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
7981 unsigned i;
7982 tree expr;
7983 gimple_stmt_iterator si;
7985 gphi *phi = dyn_cast <gphi *> (stmt_info->stmt);
7986 if (!phi)
7987 return false;
7989 if (!STMT_VINFO_RELEVANT_P (stmt_info))
7990 return false;
7992 /* Make sure it was recognized as induction computation. */
7993 if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_induction_def)
7994 return false;
7996 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
7997 poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (vectype);
7999 if (slp_node)
8000 ncopies = 1;
8001 else
8002 ncopies = vect_get_num_copies (loop_vinfo, vectype);
8003 gcc_assert (ncopies >= 1);
8005 /* FORNOW. These restrictions should be relaxed. */
8006 if (nested_in_vect_loop_p (loop, stmt_info))
8008 imm_use_iterator imm_iter;
8009 use_operand_p use_p;
8010 gimple *exit_phi;
8011 edge latch_e;
8012 tree loop_arg;
8014 if (ncopies > 1)
8016 if (dump_enabled_p ())
8017 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
8018 "multiple types in nested loop.\n");
8019 return false;
8022 exit_phi = NULL;
8023 latch_e = loop_latch_edge (loop->inner);
8024 loop_arg = PHI_ARG_DEF_FROM_EDGE (phi, latch_e);
8025 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, loop_arg)
8027 gimple *use_stmt = USE_STMT (use_p);
8028 if (is_gimple_debug (use_stmt))
8029 continue;
8031 if (!flow_bb_inside_loop_p (loop->inner, gimple_bb (use_stmt)))
8033 exit_phi = use_stmt;
8034 break;
8037 if (exit_phi)
8039 stmt_vec_info exit_phi_vinfo = loop_vinfo->lookup_stmt (exit_phi);
8040 if (!(STMT_VINFO_RELEVANT_P (exit_phi_vinfo)
8041 && !STMT_VINFO_LIVE_P (exit_phi_vinfo)))
8043 if (dump_enabled_p ())
8044 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
8045 "inner-loop induction only used outside "
8046 "of the outer vectorized loop.\n");
8047 return false;
8051 nested_in_vect_loop = true;
8052 iv_loop = loop->inner;
8054 else
8055 iv_loop = loop;
8056 gcc_assert (iv_loop == (gimple_bb (phi))->loop_father);
8058 if (slp_node && !nunits.is_constant ())
8060 /* The current SLP code creates the step value element-by-element. */
8061 if (dump_enabled_p ())
8062 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
8063 "SLP induction not supported for variable-length"
8064 " vectors.\n");
8065 return false;
8068 step_expr = STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_info);
8069 gcc_assert (step_expr != NULL_TREE);
8070 tree step_vectype = get_same_sized_vectype (TREE_TYPE (step_expr), vectype);
8072 /* Check for backend support of PLUS/MINUS_EXPR. */
8073 if (!directly_supported_p (PLUS_EXPR, step_vectype)
8074 || !directly_supported_p (MINUS_EXPR, step_vectype))
8075 return false;
8077 if (!vec_stmt) /* transformation not required. */
8079 unsigned inside_cost = 0, prologue_cost = 0;
8080 if (slp_node)
8082 /* We eventually need to set a vector type on invariant
8083 arguments. */
8084 unsigned j;
8085 slp_tree child;
8086 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (slp_node), j, child)
8087 if (!vect_maybe_update_slp_op_vectype
8088 (child, SLP_TREE_VECTYPE (slp_node)))
8090 if (dump_enabled_p ())
8091 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
8092 "incompatible vector types for "
8093 "invariants\n");
8094 return false;
8096 /* loop cost for vec_loop. */
8097 inside_cost
8098 = record_stmt_cost (cost_vec,
8099 SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node),
8100 vector_stmt, stmt_info, 0, vect_body);
8101 /* prologue cost for vec_init (if not nested) and step. */
8102 prologue_cost = record_stmt_cost (cost_vec, 1 + !nested_in_vect_loop,
8103 scalar_to_vec,
8104 stmt_info, 0, vect_prologue);
8106 else /* if (!slp_node) */
8108 /* loop cost for vec_loop. */
8109 inside_cost = record_stmt_cost (cost_vec, ncopies, vector_stmt,
8110 stmt_info, 0, vect_body);
8111 /* prologue cost for vec_init and vec_step. */
8112 prologue_cost = record_stmt_cost (cost_vec, 2, scalar_to_vec,
8113 stmt_info, 0, vect_prologue);
8115 if (dump_enabled_p ())
8116 dump_printf_loc (MSG_NOTE, vect_location,
8117 "vect_model_induction_cost: inside_cost = %d, "
8118 "prologue_cost = %d .\n", inside_cost,
8119 prologue_cost);
8121 STMT_VINFO_TYPE (stmt_info) = induc_vec_info_type;
8122 DUMP_VECT_SCOPE ("vectorizable_induction");
8123 return true;
8126 /* Transform. */
8128 /* Compute a vector variable, initialized with the first VF values of
8129 the induction variable. E.g., for an iv with IV_PHI='X' and
8130 evolution S, for a vector of 4 units, we want to compute:
8131 [X, X + S, X + 2*S, X + 3*S]. */
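/* Small worked example (numbers chosen only for illustration, simple
   non-nested case): with IV_PHI = 10, step S = 3 and 4 units per vector
   this computes vec_init = { 10, 13, 16, 19 } before the loop plus a
   loop-invariant vec_step = { 12, 12, 12, 12 } (VF * S), so each vector
   iteration advances every lane by a whole vector's worth of scalar
   iterations.  */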
8133 if (dump_enabled_p ())
8134 dump_printf_loc (MSG_NOTE, vect_location, "transform induction phi.\n");
8136 pe = loop_preheader_edge (iv_loop);
8137 /* Find the first insertion point in the BB. */
8138 basic_block bb = gimple_bb (phi);
8139 si = gsi_after_labels (bb);
8141 /* For SLP induction we have to generate several IVs as for example
8142 with group size 3 we need
8143 [i0, i1, i2, i0 + S0] [i1 + S1, i2 + S2, i0 + 2*S0, i1 + 2*S1]
8144 [i2 + 2*S2, i0 + 3*S0, i1 + 3*S1, i2 + 3*S2]. */
8145 if (slp_node)
8147 /* Enforced above. */
8148 unsigned int const_nunits = nunits.to_constant ();
8150 /* The initial values are vectorized, but any lanes > group_size
8151 need adjustment. */
8152 slp_tree init_node
8153 = SLP_TREE_CHILDREN (slp_node)[pe->dest_idx];
8155 /* Gather steps. Since we do not vectorize inductions as
8156 cycles we have to reconstruct the step from SCEV data. */
8157 unsigned group_size = SLP_TREE_LANES (slp_node);
8158 tree *steps = XALLOCAVEC (tree, group_size);
8159 tree *inits = XALLOCAVEC (tree, group_size);
8160 stmt_vec_info phi_info;
8161 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (slp_node), i, phi_info)
8163 steps[i] = STMT_VINFO_LOOP_PHI_EVOLUTION_PART (phi_info);
8164 if (!init_node)
8165 inits[i] = gimple_phi_arg_def (as_a<gphi *> (phi_info->stmt),
8166 pe->dest_idx);
8169 /* Now generate the IVs. */
8170 unsigned nvects = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node);
8171 gcc_assert ((const_nunits * nvects) % group_size == 0);
8172 unsigned nivs;
8173 if (nested_in_vect_loop)
8174 nivs = nvects;
8175 else
8177 /* Compute the number of distinct IVs we need. First reduce
8178 group_size if it is a multiple of const_nunits so we get
8179 one IV for a group_size of 4 but const_nunits 2. */
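/* E.g. (illustrative): group_size 3 with const_nunits 4 keeps
   group_sizep = 3 and gives nivs = lcm (3, 4) / 4 = 3, while group_size 4
   with const_nunits 2 reduces group_sizep to 2 and gives
   nivs = lcm (2, 2) / 2 = 1, matching the comment above.  */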
8180 unsigned group_sizep = group_size;
8181 if (group_sizep % const_nunits == 0)
8182 group_sizep = group_sizep / const_nunits;
8183 nivs = least_common_multiple (group_sizep,
8184 const_nunits) / const_nunits;
8186 tree stept = TREE_TYPE (step_vectype);
8187 tree lupdate_mul = NULL_TREE;
8188 if (!nested_in_vect_loop)
8190 /* The number of iterations covered in one vector iteration. */
8191 unsigned lup_mul = (nvects * const_nunits) / group_size;
8192 lupdate_mul
8193 = build_vector_from_val (step_vectype,
8194 SCALAR_FLOAT_TYPE_P (stept)
8195 ? build_real_from_wide (stept, lup_mul,
8196 UNSIGNED)
8197 : build_int_cstu (stept, lup_mul));
8199 tree peel_mul = NULL_TREE;
8200 gimple_seq init_stmts = NULL;
8201 if (LOOP_VINFO_MASK_SKIP_NITERS (loop_vinfo))
8203 if (SCALAR_FLOAT_TYPE_P (stept))
8204 peel_mul = gimple_build (&init_stmts, FLOAT_EXPR, stept,
8205 LOOP_VINFO_MASK_SKIP_NITERS (loop_vinfo));
8206 else
8207 peel_mul = gimple_convert (&init_stmts, stept,
8208 LOOP_VINFO_MASK_SKIP_NITERS (loop_vinfo));
8209 peel_mul = gimple_build_vector_from_val (&init_stmts,
8210 step_vectype, peel_mul);
8212 unsigned ivn;
8213 auto_vec<tree> vec_steps;
8214 for (ivn = 0; ivn < nivs; ++ivn)
8216 tree_vector_builder step_elts (step_vectype, const_nunits, 1);
8217 tree_vector_builder init_elts (vectype, const_nunits, 1);
8218 tree_vector_builder mul_elts (step_vectype, const_nunits, 1);
8219 for (unsigned eltn = 0; eltn < const_nunits; ++eltn)
8221 /* The scalar steps of the IVs. */
8222 tree elt = steps[(ivn*const_nunits + eltn) % group_size];
8223 elt = gimple_convert (&init_stmts, TREE_TYPE (step_vectype), elt);
8224 step_elts.quick_push (elt);
8225 if (!init_node)
8227 /* The scalar inits of the IVs if not vectorized. */
8228 elt = inits[(ivn*const_nunits + eltn) % group_size];
8229 if (!useless_type_conversion_p (TREE_TYPE (vectype),
8230 TREE_TYPE (elt)))
8231 elt = gimple_build (&init_stmts, VIEW_CONVERT_EXPR,
8232 TREE_TYPE (vectype), elt);
8233 init_elts.quick_push (elt);
8235 /* The number of steps to add to the initial values. */
8236 unsigned mul_elt = (ivn*const_nunits + eltn) / group_size;
8237 mul_elts.quick_push (SCALAR_FLOAT_TYPE_P (stept)
8238 ? build_real_from_wide (stept,
8239 mul_elt, UNSIGNED)
8240 : build_int_cstu (stept, mul_elt));
8242 vec_step = gimple_build_vector (&init_stmts, &step_elts);
8243 vec_steps.safe_push (vec_step);
8244 tree step_mul = gimple_build_vector (&init_stmts, &mul_elts);
8245 if (peel_mul)
8246 step_mul = gimple_build (&init_stmts, PLUS_EXPR, step_vectype,
8247 step_mul, peel_mul);
8248 if (!init_node)
8249 vec_init = gimple_build_vector (&init_stmts, &init_elts);
8251 /* Create the induction-phi that defines the induction-operand. */
8252 vec_dest = vect_get_new_vect_var (vectype, vect_simple_var,
8253 "vec_iv_");
8254 induction_phi = create_phi_node (vec_dest, iv_loop->header);
8255 induc_def = PHI_RESULT (induction_phi);
8257 /* Create the iv update inside the loop */
8258 tree up = vec_step;
8259 if (lupdate_mul)
8260 up = gimple_build (&init_stmts, MULT_EXPR, step_vectype,
8261 vec_step, lupdate_mul);
8262 gimple_seq stmts = NULL;
8263 vec_def = gimple_convert (&stmts, step_vectype, induc_def);
8264 vec_def = gimple_build (&stmts,
8265 PLUS_EXPR, step_vectype, vec_def, up);
8266 vec_def = gimple_convert (&stmts, vectype, vec_def);
8267 gsi_insert_seq_before (&si, stmts, GSI_SAME_STMT);
8268 add_phi_arg (induction_phi, vec_def, loop_latch_edge (iv_loop),
8269 UNKNOWN_LOCATION);
8271 if (init_node)
8272 vec_init = vect_get_slp_vect_def (init_node, ivn);
8273 if (!nested_in_vect_loop
8274 && !integer_zerop (step_mul))
8276 vec_def = gimple_convert (&init_stmts, step_vectype, vec_init);
8277 up = gimple_build (&init_stmts, MULT_EXPR, step_vectype,
8278 vec_step, step_mul);
8279 vec_def = gimple_build (&init_stmts, PLUS_EXPR, step_vectype,
8280 vec_def, up);
8281 vec_init = gimple_convert (&init_stmts, vectype, vec_def);
8284 /* Set the arguments of the phi node: */
8285 add_phi_arg (induction_phi, vec_init, pe, UNKNOWN_LOCATION);
8287 SLP_TREE_VEC_STMTS (slp_node).quick_push (induction_phi);
8289 if (!nested_in_vect_loop)
8291 /* Fill up to the number of vectors we need for the whole group. */
8292 nivs = least_common_multiple (group_size,
8293 const_nunits) / const_nunits;
8294 vec_steps.reserve (nivs-ivn);
8295 for (; ivn < nivs; ++ivn)
8297 SLP_TREE_VEC_STMTS (slp_node)
8298 .quick_push (SLP_TREE_VEC_STMTS (slp_node)[0]);
8299 vec_steps.quick_push (vec_steps[0]);
8303 /* Re-use IVs when we can. We are generating further vector
8304 stmts by adding VF' * stride to the IVs generated above. */
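/* Continuing the illustrative group_size 3 / const_nunits 4 case:
   vfp = lcm (3, 4) / 3 = 4, so each additional vector is obtained from the
   vector generated nivs positions earlier by adding 4 times the per-lane
   step.  */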
8305 if (ivn < nvects)
8307 unsigned vfp
8308 = least_common_multiple (group_size, const_nunits) / group_size;
8309 tree lupdate_mul
8310 = build_vector_from_val (step_vectype,
8311 SCALAR_FLOAT_TYPE_P (stept)
8312 ? build_real_from_wide (stept,
8313 vfp, UNSIGNED)
8314 : build_int_cstu (stept, vfp));
8315 for (; ivn < nvects; ++ivn)
8317 gimple *iv = SLP_TREE_VEC_STMTS (slp_node)[ivn - nivs];
8318 tree def = gimple_get_lhs (iv);
8319 if (ivn < 2*nivs)
8320 vec_steps[ivn - nivs]
8321 = gimple_build (&init_stmts, MULT_EXPR, step_vectype,
8322 vec_steps[ivn - nivs], lupdate_mul);
8323 gimple_seq stmts = NULL;
8324 def = gimple_convert (&stmts, step_vectype, def);
8325 def = gimple_build (&stmts, PLUS_EXPR, step_vectype,
8326 def, vec_steps[ivn % nivs]);
8327 def = gimple_convert (&stmts, vectype, def);
8328 if (gimple_code (iv) == GIMPLE_PHI)
8329 gsi_insert_seq_before (&si, stmts, GSI_SAME_STMT);
8330 else
8332 gimple_stmt_iterator tgsi = gsi_for_stmt (iv);
8333 gsi_insert_seq_after (&tgsi, stmts, GSI_CONTINUE_LINKING);
8335 SLP_TREE_VEC_STMTS (slp_node)
8336 .quick_push (SSA_NAME_DEF_STMT (def));
8340 new_bb = gsi_insert_seq_on_edge_immediate (pe, init_stmts);
8341 gcc_assert (!new_bb);
8343 return true;
8346 init_expr = vect_phi_initial_value (phi);
8348 gimple_seq stmts = NULL;
8349 if (!nested_in_vect_loop)
8351 /* Convert the initial value to the IV update type. */
8352 tree new_type = TREE_TYPE (step_expr);
8353 init_expr = gimple_convert (&stmts, new_type, init_expr);
8355 /* If we are using the loop mask to "peel" for alignment then we need
8356 to adjust the start value here. */
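/* In effect (sketch): init' = init - skip_niters * step, so that once the
   first skip_niters lanes of the first vector iteration are masked off,
   the first active lane still starts with the original initial value.  */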
8357 tree skip_niters = LOOP_VINFO_MASK_SKIP_NITERS (loop_vinfo);
8358 if (skip_niters != NULL_TREE)
8360 if (FLOAT_TYPE_P (vectype))
8361 skip_niters = gimple_build (&stmts, FLOAT_EXPR, new_type,
8362 skip_niters);
8363 else
8364 skip_niters = gimple_convert (&stmts, new_type, skip_niters);
8365 tree skip_step = gimple_build (&stmts, MULT_EXPR, new_type,
8366 skip_niters, step_expr);
8367 init_expr = gimple_build (&stmts, MINUS_EXPR, new_type,
8368 init_expr, skip_step);
8372 if (stmts)
8374 new_bb = gsi_insert_seq_on_edge_immediate (pe, stmts);
8375 gcc_assert (!new_bb);
8378 /* Create the vector that holds the initial_value of the induction. */
8379 if (nested_in_vect_loop)
8381 /* iv_loop is nested in the loop to be vectorized. init_expr had already
8382 been created during vectorization of previous stmts. We obtain it
8383 from the STMT_VINFO_VEC_STMT of the defining stmt. */
8384 auto_vec<tree> vec_inits;
8385 vect_get_vec_defs_for_operand (loop_vinfo, stmt_info, 1,
8386 init_expr, &vec_inits);
8387 vec_init = vec_inits[0];
8388 /* If the initial value is not of proper type, convert it. */
8389 if (!useless_type_conversion_p (vectype, TREE_TYPE (vec_init)))
8391 new_stmt
8392 = gimple_build_assign (vect_get_new_ssa_name (vectype,
8393 vect_simple_var,
8394 "vec_iv_"),
8395 VIEW_CONVERT_EXPR,
8396 build1 (VIEW_CONVERT_EXPR, vectype,
8397 vec_init));
8398 vec_init = gimple_assign_lhs (new_stmt);
8399 new_bb = gsi_insert_on_edge_immediate (loop_preheader_edge (iv_loop),
8400 new_stmt);
8401 gcc_assert (!new_bb);
8404 else
8406 /* iv_loop is the loop to be vectorized. Create:
8407 vec_init = [X, X+S, X+2*S, X+3*S] (S = step_expr, X = init_expr) */
8408 stmts = NULL;
8409 new_name = gimple_convert (&stmts, TREE_TYPE (step_expr), init_expr);
8411 unsigned HOST_WIDE_INT const_nunits;
8412 if (nunits.is_constant (&const_nunits))
8414 tree_vector_builder elts (step_vectype, const_nunits, 1);
8415 elts.quick_push (new_name);
8416 for (i = 1; i < const_nunits; i++)
8418 /* Create: new_name_i = new_name + step_expr */
8419 new_name = gimple_build (&stmts, PLUS_EXPR, TREE_TYPE (new_name),
8420 new_name, step_expr);
8421 elts.quick_push (new_name);
8423 /* Create a vector from [new_name_0, new_name_1, ...,
8424 new_name_nunits-1] */
8425 vec_init = gimple_build_vector (&stmts, &elts);
8427 else if (INTEGRAL_TYPE_P (TREE_TYPE (step_expr)))
8428 /* Build the initial value directly from a VEC_SERIES_EXPR. */
8429 vec_init = gimple_build (&stmts, VEC_SERIES_EXPR, step_vectype,
8430 new_name, step_expr);
8431 else
8433 /* Build:
8434 [base, base, base, ...]
8435 + (vectype) [0, 1, 2, ...] * [step, step, step, ...]. */
8436 gcc_assert (SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr)));
8437 gcc_assert (flag_associative_math);
8438 tree index = build_index_vector (step_vectype, 0, 1);
8439 tree base_vec = gimple_build_vector_from_val (&stmts, step_vectype,
8440 new_name);
8441 tree step_vec = gimple_build_vector_from_val (&stmts, step_vectype,
8442 step_expr);
8443 vec_init = gimple_build (&stmts, FLOAT_EXPR, step_vectype, index);
8444 vec_init = gimple_build (&stmts, MULT_EXPR, step_vectype,
8445 vec_init, step_vec);
8446 vec_init = gimple_build (&stmts, PLUS_EXPR, step_vectype,
8447 vec_init, base_vec);
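/* Purely illustrative numbers (an assumption, not part of the code): for a
   4-lane float IV with init X == 1.0 and step S == 0.5 the sequence built
   above is [1.0, 1.0, 1.0, 1.0] + [0, 1, 2, 3] * [0.5, 0.5, 0.5, 0.5],
   i.e. vec_init == [1.0, 1.5, 2.0, 2.5].  */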
8449 vec_init = gimple_convert (&stmts, vectype, vec_init);
8451 if (stmts)
8453 new_bb = gsi_insert_seq_on_edge_immediate (pe, stmts);
8454 gcc_assert (!new_bb);
8459 /* Create the vector that holds the step of the induction. */
8460 if (nested_in_vect_loop)
8461 /* iv_loop is nested in the loop to be vectorized. Generate:
8462 vec_step = [S, S, S, S] */
8463 new_name = step_expr;
8464 else
8466 /* iv_loop is the loop to be vectorized. Generate:
8467 vec_step = [VF*S, VF*S, VF*S, VF*S] */
8468 gimple_seq seq = NULL;
8469 if (SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr)))
8471 expr = build_int_cst (integer_type_node, vf);
8472 expr = gimple_build (&seq, FLOAT_EXPR, TREE_TYPE (step_expr), expr);
8474 else
8475 expr = build_int_cst (TREE_TYPE (step_expr), vf);
8476 new_name = gimple_build (&seq, MULT_EXPR, TREE_TYPE (step_expr),
8477 expr, step_expr);
8478 if (seq)
8480 new_bb = gsi_insert_seq_on_edge_immediate (pe, seq);
8481 gcc_assert (!new_bb);
8485 t = unshare_expr (new_name);
8486 gcc_assert (CONSTANT_CLASS_P (new_name)
8487 || TREE_CODE (new_name) == SSA_NAME);
8488 new_vec = build_vector_from_val (step_vectype, t);
8489 vec_step = vect_init_vector (loop_vinfo, stmt_info,
8490 new_vec, step_vectype, NULL);
8493 /* Create the following def-use cycle:
8494 loop prolog:
8495 vec_init = ...
8496 vec_step = ...
8497 loop:
8498 vec_iv = PHI <vec_init, vec_loop>
8500 STMT
8502 vec_loop = vec_iv + vec_step; */
8504 /* Create the induction-phi that defines the induction-operand. */
8505 vec_dest = vect_get_new_vect_var (vectype, vect_simple_var, "vec_iv_");
8506 induction_phi = create_phi_node (vec_dest, iv_loop->header);
8507 induc_def = PHI_RESULT (induction_phi);
8509 /* Create the iv update inside the loop */
8510 stmts = NULL;
8511 vec_def = gimple_convert (&stmts, step_vectype, induc_def);
8512 vec_def = gimple_build (&stmts, PLUS_EXPR, step_vectype, vec_def, vec_step);
8513 vec_def = gimple_convert (&stmts, vectype, vec_def);
8514 gsi_insert_seq_before (&si, stmts, GSI_SAME_STMT);
8515 new_stmt = SSA_NAME_DEF_STMT (vec_def);
8517 /* Set the arguments of the phi node: */
8518 add_phi_arg (induction_phi, vec_init, pe, UNKNOWN_LOCATION);
8519 add_phi_arg (induction_phi, vec_def, loop_latch_edge (iv_loop),
8520 UNKNOWN_LOCATION);
8522 STMT_VINFO_VEC_STMTS (stmt_info).safe_push (induction_phi);
8523 *vec_stmt = induction_phi;
8525 /* If the vectorization factor (VF) is bigger than the number
8526 of elements that we can fit in a vectype (nunits), we have to generate
8527 more than one vector stmt, i.e. we need to "unroll" the
8528 vector stmt by a factor of VF/nunits. For more details see the
8529 documentation in vectorizable_operation. */
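/* A sketch of the unrolling with assumed numbers (V4SI vectype, VF == 16,
   hence ncopies == 4, non-SLP and not nested): the code below builds
   vec_step == [4*S, 4*S, 4*S, 4*S] and emits three further copies, each
   adding vec_step to the previous copy, so together the copies cover
   [X, X+S, ..., X+15*S] while the PHI update above advances by VF*S.  */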
8531 if (ncopies > 1)
8533 gimple_seq seq = NULL;
8534 /* FORNOW. This restriction should be relaxed. */
8535 gcc_assert (!nested_in_vect_loop);
8537 /* Create the vector that holds the step of the induction. */
8538 if (SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr)))
8540 expr = build_int_cst (integer_type_node, nunits);
8541 expr = gimple_build (&seq, FLOAT_EXPR, TREE_TYPE (step_expr), expr);
8543 else
8544 expr = build_int_cst (TREE_TYPE (step_expr), nunits);
8545 new_name = gimple_build (&seq, MULT_EXPR, TREE_TYPE (step_expr),
8546 expr, step_expr);
8547 if (seq)
8549 new_bb = gsi_insert_seq_on_edge_immediate (pe, seq);
8550 gcc_assert (!new_bb);
8553 t = unshare_expr (new_name);
8554 gcc_assert (CONSTANT_CLASS_P (new_name)
8555 || TREE_CODE (new_name) == SSA_NAME);
8556 new_vec = build_vector_from_val (step_vectype, t);
8557 vec_step = vect_init_vector (loop_vinfo, stmt_info,
8558 new_vec, step_vectype, NULL);
8560 vec_def = induc_def;
8561 for (i = 1; i < ncopies; i++)
8563 /* vec_i = vec_prev + vec_step */
8564 gimple_seq stmts = NULL;
8565 vec_def = gimple_convert (&stmts, step_vectype, vec_def);
8566 vec_def = gimple_build (&stmts,
8567 PLUS_EXPR, step_vectype, vec_def, vec_step);
8568 vec_def = gimple_convert (&stmts, vectype, vec_def);
8570 gsi_insert_seq_before (&si, stmts, GSI_SAME_STMT);
8571 new_stmt = SSA_NAME_DEF_STMT (vec_def);
8572 STMT_VINFO_VEC_STMTS (stmt_info).safe_push (new_stmt);
8576 if (dump_enabled_p ())
8577 dump_printf_loc (MSG_NOTE, vect_location,
8578 "transform induction: created def-use cycle: %G%G",
8579 induction_phi, SSA_NAME_DEF_STMT (vec_def));
8581 return true;
8584 /* Function vectorizable_live_operation.
8586 STMT_INFO computes a value that is used outside the loop. Check if
8587 it can be supported. */
8589 bool
8590 vectorizable_live_operation (vec_info *vinfo,
8591 stmt_vec_info stmt_info,
8592 gimple_stmt_iterator *gsi,
8593 slp_tree slp_node, slp_instance slp_node_instance,
8594 int slp_index, bool vec_stmt_p,
8595 stmt_vector_for_cost *cost_vec)
8597 loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo);
8598 imm_use_iterator imm_iter;
8599 tree lhs, lhs_type, bitsize;
8600 tree vectype = (slp_node
8601 ? SLP_TREE_VECTYPE (slp_node)
8602 : STMT_VINFO_VECTYPE (stmt_info));
8603 poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (vectype);
8604 int ncopies;
8605 gimple *use_stmt;
8606 auto_vec<tree> vec_oprnds;
8607 int vec_entry = 0;
8608 poly_uint64 vec_index = 0;
8610 gcc_assert (STMT_VINFO_LIVE_P (stmt_info));
8612 /* If a stmt of a reduction is live, vectorize it via
8613 vect_create_epilog_for_reduction. vectorizable_reduction assessed
8614 validity so just trigger the transform here. */
8615 if (STMT_VINFO_REDUC_DEF (vect_orig_stmt (stmt_info)))
8617 if (!vec_stmt_p)
8618 return true;
8619 if (slp_node)
8621 /* For reduction chains the meta-info is attached to
8622 the group leader. */
8623 if (REDUC_GROUP_FIRST_ELEMENT (stmt_info))
8624 stmt_info = REDUC_GROUP_FIRST_ELEMENT (stmt_info);
8625 /* For SLP reductions we vectorize the epilogue for
8626 all involved stmts together. */
8627 else if (slp_index != 0)
8628 return true;
8629 else
8630 /* For SLP reductions the meta-info is attached to
8631 the representative. */
8632 stmt_info = SLP_TREE_REPRESENTATIVE (slp_node);
8634 stmt_vec_info reduc_info = info_for_reduction (loop_vinfo, stmt_info);
8635 gcc_assert (reduc_info->is_reduc_info);
8636 if (STMT_VINFO_REDUC_TYPE (reduc_info) == FOLD_LEFT_REDUCTION
8637 || STMT_VINFO_REDUC_TYPE (reduc_info) == EXTRACT_LAST_REDUCTION)
8638 return true;
8639 vect_create_epilog_for_reduction (loop_vinfo, stmt_info, slp_node,
8640 slp_node_instance);
8641 return true;
8644 /* If STMT is not relevant and it is a simple assignment and its inputs are
8645 invariant then it can remain in place, unvectorized. The original last
8646 scalar value that it computes will be used. */
8647 if (!STMT_VINFO_RELEVANT_P (stmt_info))
8649 gcc_assert (is_simple_and_all_uses_invariant (stmt_info, loop_vinfo));
8650 if (dump_enabled_p ())
8651 dump_printf_loc (MSG_NOTE, vect_location,
8652 "statement is simple and uses invariant. Leaving in "
8653 "place.\n");
8654 return true;
8657 if (slp_node)
8658 ncopies = 1;
8659 else
8660 ncopies = vect_get_num_copies (loop_vinfo, vectype);
8662 if (slp_node)
8664 gcc_assert (slp_index >= 0);
8666 /* Get the last occurrence of the scalar index from the concatenation of
8667 all the slp vectors. Calculate which slp vector it is and the index
8668 within. */
8669 int num_scalar = SLP_TREE_LANES (slp_node);
8670 int num_vec = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node);
8671 poly_uint64 pos = (num_vec * nunits) - num_scalar + slp_index;
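/* Worked example with made-up values: for num_vec == 2, nunits == 4,
   num_scalar == 3 and slp_index == 2, pos == 2*4 - 3 + 2 == 7; the
   division below then gives vec_entry == 1 and vec_index == 3, i.e. the
   last lane of the second vector.  */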
8673 /* Calculate which vector contains the result, and which lane of
8674 that vector we need. */
8675 if (!can_div_trunc_p (pos, nunits, &vec_entry, &vec_index))
8677 if (dump_enabled_p ())
8678 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
8679 "Cannot determine which vector holds the"
8680 " final result.\n");
8681 return false;
8685 if (!vec_stmt_p)
8687 /* No transformation required. */
8688 if (loop_vinfo && LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo))
8690 if (!direct_internal_fn_supported_p (IFN_EXTRACT_LAST, vectype,
8691 OPTIMIZE_FOR_SPEED))
8693 if (dump_enabled_p ())
8694 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
8695 "can't operate on partial vectors "
8696 "because the target doesn't support extract "
8697 "last reduction.\n");
8698 LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo) = false;
8700 else if (slp_node)
8702 if (dump_enabled_p ())
8703 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
8704 "can't operate on partial vectors "
8705 "because an SLP statement is live after "
8706 "the loop.\n");
8707 LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo) = false;
8709 else if (ncopies > 1)
8711 if (dump_enabled_p ())
8712 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
8713 "can't operate on partial vectors "
8714 "because ncopies is greater than 1.\n");
8715 LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo) = false;
8717 else
8719 gcc_assert (ncopies == 1 && !slp_node);
8720 vect_record_loop_mask (loop_vinfo,
8721 &LOOP_VINFO_MASKS (loop_vinfo),
8722 1, vectype, NULL);
8725 /* ??? Enable for loop costing as well. */
8726 if (!loop_vinfo)
8727 record_stmt_cost (cost_vec, 1, vec_to_scalar, stmt_info, NULL_TREE,
8728 0, vect_epilogue);
8729 return true;
8732 /* Use the lhs of the original scalar statement. */
8733 gimple *stmt = vect_orig_stmt (stmt_info)->stmt;
8734 if (dump_enabled_p ())
8735 dump_printf_loc (MSG_NOTE, vect_location, "extracting lane for live "
8736 "stmt %G", stmt);
8738 lhs = gimple_get_lhs (stmt);
8739 lhs_type = TREE_TYPE (lhs);
8741 bitsize = vector_element_bits_tree (vectype);
8743 /* Get the vectorized lhs of STMT and the lane to use (counted in bits). */
8744 tree vec_lhs, bitstart;
8745 gimple *vec_stmt;
8746 if (slp_node)
8748 gcc_assert (!loop_vinfo || !LOOP_VINFO_FULLY_MASKED_P (loop_vinfo));
8750 /* Get the correct slp vectorized stmt. */
8751 vec_stmt = SLP_TREE_VEC_STMTS (slp_node)[vec_entry];
8752 vec_lhs = gimple_get_lhs (vec_stmt);
8754 /* Get entry to use. */
8755 bitstart = bitsize_int (vec_index);
8756 bitstart = int_const_binop (MULT_EXPR, bitsize, bitstart);
8758 else
8760 /* For multiple copies, get the last copy. */
8761 vec_stmt = STMT_VINFO_VEC_STMTS (stmt_info).last ();
8762 vec_lhs = gimple_get_lhs (vec_stmt);
8764 /* Get the last lane in the vector. */
8765 bitstart = int_const_binop (MULT_EXPR, bitsize, bitsize_int (nunits - 1));
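/* For instance, with a hypothetical V4SI vector, bitsize is 32 and nunits
   is 4, so bitstart == 32 * 3 == 96, the bit offset of the last lane.  */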
8768 if (loop_vinfo)
8770 /* To ensure that the VEC_LHS for lane-extraction stmts satisfies the
8771 loop-closed PHI requirement, insert one PHI node for it. It looks like:
8772 loop;
8773 BB:
8774 # lhs' = PHI <lhs>
8775 ==>
8776 loop;
8777 BB:
8778 # vec_lhs' = PHI <vec_lhs>
8779 new_tree = lane_extract <vec_lhs', ...>;
8780 lhs' = new_tree; */
8782 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
8783 basic_block exit_bb = single_exit (loop)->dest;
8784 gcc_assert (single_pred_p (exit_bb));
8786 tree vec_lhs_phi = copy_ssa_name (vec_lhs);
8787 gimple *phi = create_phi_node (vec_lhs_phi, exit_bb);
8788 SET_PHI_ARG_DEF (phi, single_exit (loop)->dest_idx, vec_lhs);
8790 gimple_seq stmts = NULL;
8791 tree new_tree;
8792 if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
8794 /* Emit:
8796 SCALAR_RES = EXTRACT_LAST <VEC_LHS, MASK>
8798 where VEC_LHS is the vectorized live-out result and MASK is
8799 the loop mask for the final iteration. */
8800 gcc_assert (ncopies == 1 && !slp_node);
8801 tree scalar_type = TREE_TYPE (STMT_VINFO_VECTYPE (stmt_info));
8802 tree mask = vect_get_loop_mask (gsi, &LOOP_VINFO_MASKS (loop_vinfo),
8803 1, vectype, 0);
8804 tree scalar_res = gimple_build (&stmts, CFN_EXTRACT_LAST, scalar_type,
8805 mask, vec_lhs_phi);
8807 /* Convert the extracted vector element to the scalar type. */
8808 new_tree = gimple_convert (&stmts, lhs_type, scalar_res);
8810 else
8812 tree bftype = TREE_TYPE (vectype);
8813 if (VECTOR_BOOLEAN_TYPE_P (vectype))
8814 bftype = build_nonstandard_integer_type (tree_to_uhwi (bitsize), 1);
8815 new_tree = build3 (BIT_FIELD_REF, bftype,
8816 vec_lhs_phi, bitsize, bitstart);
8817 new_tree = force_gimple_operand (fold_convert (lhs_type, new_tree),
8818 &stmts, true, NULL_TREE);
8821 if (stmts)
8823 gimple_stmt_iterator exit_gsi = gsi_after_labels (exit_bb);
8824 gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
8826 /* Remove existing phi from lhs and create one copy from new_tree. */
8827 tree lhs_phi = NULL_TREE;
8828 gimple_stmt_iterator gsi;
8829 for (gsi = gsi_start_phis (exit_bb);
8830 !gsi_end_p (gsi); gsi_next (&gsi))
8832 gimple *phi = gsi_stmt (gsi);
8833 if ((gimple_phi_arg_def (phi, 0) == lhs))
8835 remove_phi_node (&gsi, false);
8836 lhs_phi = gimple_phi_result (phi);
8837 gimple *copy = gimple_build_assign (lhs_phi, new_tree);
8838 gsi_insert_before (&exit_gsi, copy, GSI_SAME_STMT);
8839 break;
8844 /* Replace use of lhs with newly computed result. If the use stmt is a
8845 single arg PHI, just replace all uses of PHI result. It's necessary
8846 because lcssa PHI defining lhs may be before newly inserted stmt. */
8847 use_operand_p use_p;
8848 FOR_EACH_IMM_USE_STMT (use_stmt, imm_iter, lhs)
8849 if (!flow_bb_inside_loop_p (loop, gimple_bb (use_stmt))
8850 && !is_gimple_debug (use_stmt))
8852 if (gimple_code (use_stmt) == GIMPLE_PHI
8853 && gimple_phi_num_args (use_stmt) == 1)
8855 replace_uses_by (gimple_phi_result (use_stmt), new_tree);
8857 else
8859 FOR_EACH_IMM_USE_ON_STMT (use_p, imm_iter)
8860 SET_USE (use_p, new_tree);
8862 update_stmt (use_stmt);
8865 else
8867 /* For basic-block vectorization simply insert the lane-extraction. */
8868 tree bftype = TREE_TYPE (vectype);
8869 if (VECTOR_BOOLEAN_TYPE_P (vectype))
8870 bftype = build_nonstandard_integer_type (tree_to_uhwi (bitsize), 1);
8871 tree new_tree = build3 (BIT_FIELD_REF, bftype,
8872 vec_lhs, bitsize, bitstart);
8873 gimple_seq stmts = NULL;
8874 new_tree = force_gimple_operand (fold_convert (lhs_type, new_tree),
8875 &stmts, true, NULL_TREE);
8876 if (TREE_CODE (new_tree) == SSA_NAME
8877 && SSA_NAME_OCCURS_IN_ABNORMAL_PHI (lhs))
8878 SSA_NAME_OCCURS_IN_ABNORMAL_PHI (new_tree) = 1;
8879 if (is_a <gphi *> (vec_stmt))
8881 gimple_stmt_iterator si = gsi_after_labels (gimple_bb (vec_stmt));
8882 gsi_insert_seq_before (&si, stmts, GSI_SAME_STMT);
8884 else
8886 gimple_stmt_iterator si = gsi_for_stmt (vec_stmt);
8887 gsi_insert_seq_after (&si, stmts, GSI_SAME_STMT);
8890 /* Replace use of lhs with newly computed result. If the use stmt is a
8891 single arg PHI, just replace all uses of PHI result. It's necessary
8892 because lcssa PHI defining lhs may be before newly inserted stmt. */
8893 use_operand_p use_p;
8894 stmt_vec_info use_stmt_info;
8895 FOR_EACH_IMM_USE_STMT (use_stmt, imm_iter, lhs)
8896 if (!is_gimple_debug (use_stmt)
8897 && (!(use_stmt_info = vinfo->lookup_stmt (use_stmt))
8898 || !PURE_SLP_STMT (vect_stmt_to_vectorize (use_stmt_info))))
8900 /* ??? This can happen when the live lane ends up being
8901 used in a vector construction code-generated by an
8902 external SLP node (and code-generation for that already
8903 happened). See gcc.dg/vect/bb-slp-47.c.
8904 Doing this is what would happen if that vector CTOR
8905 were not code-generated yet so it is not too bad.
8906 ??? In fact we'd likely want to avoid this situation
8907 in the first place. */
8908 if (TREE_CODE (new_tree) == SSA_NAME
8909 && !SSA_NAME_IS_DEFAULT_DEF (new_tree)
8910 && gimple_code (use_stmt) != GIMPLE_PHI
8911 && !vect_stmt_dominates_stmt_p (SSA_NAME_DEF_STMT (new_tree),
8912 use_stmt))
8914 enum tree_code code = gimple_assign_rhs_code (use_stmt);
8915 gcc_assert (code == CONSTRUCTOR
8916 || code == VIEW_CONVERT_EXPR
8917 || CONVERT_EXPR_CODE_P (code));
8918 if (dump_enabled_p ())
8919 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
8920 "Using original scalar computation for "
8921 "live lane because use preceeds vector "
8922 "def\n");
8923 continue;
8925 /* ??? It can also happen that we end up pulling a def into
8926 a loop where replacing out-of-loop uses would require
8927 a new LC SSA PHI node. Retain the original scalar in
8928 those cases as well. PR98064. */
8929 if (TREE_CODE (new_tree) == SSA_NAME
8930 && !SSA_NAME_IS_DEFAULT_DEF (new_tree)
8931 && (gimple_bb (use_stmt)->loop_father
8932 != gimple_bb (vec_stmt)->loop_father)
8933 && !flow_loop_nested_p (gimple_bb (vec_stmt)->loop_father,
8934 gimple_bb (use_stmt)->loop_father))
8936 if (dump_enabled_p ())
8937 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
8938 "Using original scalar computation for "
8939 "live lane because there is an out-of-loop "
8940 "definition for it\n");
8941 continue;
8943 FOR_EACH_IMM_USE_ON_STMT (use_p, imm_iter)
8944 SET_USE (use_p, new_tree);
8945 update_stmt (use_stmt);
8949 return true;
8952 /* Kill any debug uses outside LOOP of SSA names defined in STMT_INFO. */
8954 static void
8955 vect_loop_kill_debug_uses (class loop *loop, stmt_vec_info stmt_info)
8957 ssa_op_iter op_iter;
8958 imm_use_iterator imm_iter;
8959 def_operand_p def_p;
8960 gimple *ustmt;
8962 FOR_EACH_PHI_OR_STMT_DEF (def_p, stmt_info->stmt, op_iter, SSA_OP_DEF)
8964 FOR_EACH_IMM_USE_STMT (ustmt, imm_iter, DEF_FROM_PTR (def_p))
8966 basic_block bb;
8968 if (!is_gimple_debug (ustmt))
8969 continue;
8971 bb = gimple_bb (ustmt);
8973 if (!flow_bb_inside_loop_p (loop, bb))
8975 if (gimple_debug_bind_p (ustmt))
8977 if (dump_enabled_p ())
8978 dump_printf_loc (MSG_NOTE, vect_location,
8979 "killing debug use\n");
8981 gimple_debug_bind_reset_value (ustmt);
8982 update_stmt (ustmt);
8984 else
8985 gcc_unreachable ();
8991 /* Given loop represented by LOOP_VINFO, return true if computation of
8992 LOOP_VINFO_NITERS (= LOOP_VINFO_NITERSM1 + 1) doesn't overflow, false
8993 otherwise. */
8995 static bool
8996 loop_niters_no_overflow (loop_vec_info loop_vinfo)
8998 /* Constant case. */
8999 if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
9001 tree cst_niters = LOOP_VINFO_NITERS (loop_vinfo);
9002 tree cst_nitersm1 = LOOP_VINFO_NITERSM1 (loop_vinfo);
9004 gcc_assert (TREE_CODE (cst_niters) == INTEGER_CST);
9005 gcc_assert (TREE_CODE (cst_nitersm1) == INTEGER_CST);
9006 if (wi::to_widest (cst_nitersm1) < wi::to_widest (cst_niters))
9007 return true;
9010 widest_int max;
9011 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
9012 /* Check the upper bound of loop niters. */
9013 if (get_max_loop_iterations (loop, &max))
9015 tree type = TREE_TYPE (LOOP_VINFO_NITERS (loop_vinfo));
9016 signop sgn = TYPE_SIGN (type);
9017 widest_int type_max = widest_int::from (wi::max_value (type), sgn);
9018 if (max < type_max)
9019 return true;
9021 return false;
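/* A worked example of the constant check above, with assumed values: if
   NITERSM1 is the 32-bit unsigned value 0xffffffff, then NITERS wraps to 0
   in that type, the comparison nitersm1 < niters does not hold, and we fall
   through to the upper-bound check (ultimately returning false).  */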
9024 /* Return a mask type with half as many elements as OLD_TYPE,
9025 given that it should have mode NEW_MODE. */
9027 tree
9028 vect_halve_mask_nunits (tree old_type, machine_mode new_mode)
9030 poly_uint64 nunits = exact_div (TYPE_VECTOR_SUBPARTS (old_type), 2);
9031 return build_truth_vector_type_for_mode (nunits, new_mode);
9034 /* Return a mask type with twice as many elements as OLD_TYPE,
9035 given that it should have mode NEW_MODE. */
9037 tree
9038 vect_double_mask_nunits (tree old_type, machine_mode new_mode)
9040 poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (old_type) * 2;
9041 return build_truth_vector_type_for_mode (nunits, new_mode);
9044 /* Record that a fully-masked version of LOOP_VINFO would need MASKS to
9045 contain a sequence of NVECTORS masks that each control a vector of type
9046 VECTYPE. If SCALAR_MASK is nonnull, the fully-masked loop would AND
9047 these vector masks with the vector version of SCALAR_MASK. */
9049 void
9050 vect_record_loop_mask (loop_vec_info loop_vinfo, vec_loop_masks *masks,
9051 unsigned int nvectors, tree vectype, tree scalar_mask)
9053 gcc_assert (nvectors != 0);
9054 if (masks->length () < nvectors)
9055 masks->safe_grow_cleared (nvectors, true);
9056 rgroup_controls *rgm = &(*masks)[nvectors - 1];
9057 /* The number of scalars per iteration and the number of vectors are
9058 both compile-time constants. */
9059 unsigned int nscalars_per_iter
9060 = exact_div (nvectors * TYPE_VECTOR_SUBPARTS (vectype),
9061 LOOP_VINFO_VECT_FACTOR (loop_vinfo)).to_constant ();
9063 if (scalar_mask)
9065 scalar_cond_masked_key cond (scalar_mask, nvectors);
9066 loop_vinfo->scalar_cond_masked_set.add (cond);
9069 if (rgm->max_nscalars_per_iter < nscalars_per_iter)
9071 rgm->max_nscalars_per_iter = nscalars_per_iter;
9072 rgm->type = truth_type_for (vectype);
9073 rgm->factor = 1;
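/* Illustrative numbers (assumptions, not implied by the code): with
   VECTYPE V8HI, NVECTORS == 2 and a vectorization factor of 8,
   nscalars_per_iter == 2 * 8 / 8 == 2, so the rgroup in (*masks)[1]
   records a requirement of two scalars per iteration.  */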
9077 /* Given a complete set of masks MASKS, extract mask number INDEX
9078 for an rgroup that operates on NVECTORS vectors of type VECTYPE,
9079 where 0 <= INDEX < NVECTORS. Insert any set-up statements before GSI.
9081 See the comment above vec_loop_masks for more details about the mask
9082 arrangement. */
9084 tree
9085 vect_get_loop_mask (gimple_stmt_iterator *gsi, vec_loop_masks *masks,
9086 unsigned int nvectors, tree vectype, unsigned int index)
9088 rgroup_controls *rgm = &(*masks)[nvectors - 1];
9089 tree mask_type = rgm->type;
9091 /* Populate the rgroup's mask array, if this is the first time we've
9092 used it. */
9093 if (rgm->controls.is_empty ())
9095 rgm->controls.safe_grow_cleared (nvectors, true);
9096 for (unsigned int i = 0; i < nvectors; ++i)
9098 tree mask = make_temp_ssa_name (mask_type, NULL, "loop_mask");
9099 /* Provide a dummy definition until the real one is available. */
9100 SSA_NAME_DEF_STMT (mask) = gimple_build_nop ();
9101 rgm->controls[i] = mask;
9105 tree mask = rgm->controls[index];
9106 if (maybe_ne (TYPE_VECTOR_SUBPARTS (mask_type),
9107 TYPE_VECTOR_SUBPARTS (vectype)))
9109 /* A loop mask for data type X can be reused for data type Y
9110 if X has N times more elements than Y and if Y's elements
9111 are N times bigger than X's. In this case each sequence
9112 of N elements in the loop mask will be all-zero or all-one.
9113 We can then view-convert the mask so that each sequence of
9114 N elements is replaced by a single element. */
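/* A hypothetical instance of the reuse described above: a mask built for
   16 QImode elements can serve a vector of 8 HImode elements, because each
   pair of mask elements is known to be all-zero or all-one and the
   VIEW_CONVERT_EXPR below collapses every such pair into one wider mask
   element.  */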
9115 gcc_assert (multiple_p (TYPE_VECTOR_SUBPARTS (mask_type),
9116 TYPE_VECTOR_SUBPARTS (vectype)));
9117 gimple_seq seq = NULL;
9118 mask_type = truth_type_for (vectype);
9119 mask = gimple_build (&seq, VIEW_CONVERT_EXPR, mask_type, mask);
9120 if (seq)
9121 gsi_insert_seq_before (gsi, seq, GSI_SAME_STMT);
9123 return mask;
9126 /* Record that LOOP_VINFO would need LENS to contain a sequence of NVECTORS
9127 lengths for controlling an operation on VECTYPE. The operation splits
9128 each element of VECTYPE into FACTOR separate subelements, measuring the
9129 length as a number of these subelements. */
9131 void
9132 vect_record_loop_len (loop_vec_info loop_vinfo, vec_loop_lens *lens,
9133 unsigned int nvectors, tree vectype, unsigned int factor)
9135 gcc_assert (nvectors != 0);
9136 if (lens->length () < nvectors)
9137 lens->safe_grow_cleared (nvectors, true);
9138 rgroup_controls *rgl = &(*lens)[nvectors - 1];
9140 /* The number of scalars per iteration, the bytes occupied per scalar and
9141 the number of vectors are all compile-time constants. */
9142 unsigned int nscalars_per_iter
9143 = exact_div (nvectors * TYPE_VECTOR_SUBPARTS (vectype),
9144 LOOP_VINFO_VECT_FACTOR (loop_vinfo)).to_constant ();
9146 if (rgl->max_nscalars_per_iter < nscalars_per_iter)
9148 /* For now, we only support cases in which all loads and stores fall back
9149 to VnQI or none do. */
9150 gcc_assert (!rgl->max_nscalars_per_iter
9151 || (rgl->factor == 1 && factor == 1)
9152 || (rgl->max_nscalars_per_iter * rgl->factor
9153 == nscalars_per_iter * factor));
9154 rgl->max_nscalars_per_iter = nscalars_per_iter;
9155 rgl->type = vectype;
9156 rgl->factor = factor;
9160 /* Given a complete set of length LENS, extract length number INDEX for an
9161 rgroup that operates on NVECTORS vectors, where 0 <= INDEX < NVECTORS. */
9163 tree
9164 vect_get_loop_len (loop_vec_info loop_vinfo, vec_loop_lens *lens,
9165 unsigned int nvectors, unsigned int index)
9167 rgroup_controls *rgl = &(*lens)[nvectors - 1];
9169 /* Populate the rgroup's len array, if this is the first time we've
9170 used it. */
9171 if (rgl->controls.is_empty ())
9173 rgl->controls.safe_grow_cleared (nvectors, true);
9174 for (unsigned int i = 0; i < nvectors; ++i)
9176 tree len_type = LOOP_VINFO_RGROUP_COMPARE_TYPE (loop_vinfo);
9177 gcc_assert (len_type != NULL_TREE);
9178 tree len = make_temp_ssa_name (len_type, NULL, "loop_len");
9180 /* Provide a dummy definition until the real one is available. */
9181 SSA_NAME_DEF_STMT (len) = gimple_build_nop ();
9182 rgl->controls[i] = len;
9186 return rgl->controls[index];
9189 /* Scale the profile counters of LOOP to account for it being vectorized
9190 by factor VF. */
9192 static void
9193 scale_profile_for_vect_loop (class loop *loop, unsigned vf)
9195 edge preheader = loop_preheader_edge (loop);
9196 /* Reduce loop iterations by the vectorization factor. */
9197 gcov_type new_est_niter = niter_for_unrolled_loop (loop, vf);
9198 profile_count freq_h = loop->header->count, freq_e = preheader->count ();
9200 if (freq_h.nonzero_p ())
9202 profile_probability p;
9204 /* Avoid dropping loop body profile counter to 0 because of zero count
9205 in loop's preheader. */
9206 if (!(freq_e == profile_count::zero ()))
9207 freq_e = freq_e.force_nonzero ();
9208 p = freq_e.apply_scale (new_est_niter + 1, 1).probability_in (freq_h);
9209 scale_loop_frequencies (loop, p);
9212 edge exit_e = single_exit (loop);
9213 exit_e->probability = profile_probability::always ()
9214 .apply_scale (1, new_est_niter + 1);
9216 edge exit_l = single_pred_edge (loop->latch);
9217 profile_probability prob = exit_l->probability;
9218 exit_l->probability = exit_e->probability.invert ();
9219 if (prob.initialized_p () && exit_l->probability.initialized_p ())
9220 scale_bbs_frequencies (&loop->latch, 1, exit_l->probability / prob);
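/* To illustrate the scaling above with assumed numbers: if new_est_niter
   is 24 (say the scalar loop was estimated to iterate about 100 times and
   vf == 4), the exit edge probability becomes 1/(24 + 1) == 4% and the
   body frequencies are scaled so that the header count corresponds to
   roughly 25 executions per entry.  */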
9223 /* For a vectorized stmt DEF_STMT_INFO adjust all vectorized PHI
9224 latch edge values originally defined by it. */
9226 static void
9227 maybe_set_vectorized_backedge_value (loop_vec_info loop_vinfo,
9228 stmt_vec_info def_stmt_info)
9230 tree def = gimple_get_lhs (vect_orig_stmt (def_stmt_info)->stmt);
9231 if (!def || TREE_CODE (def) != SSA_NAME)
9232 return;
9233 stmt_vec_info phi_info;
9234 imm_use_iterator iter;
9235 use_operand_p use_p;
9236 FOR_EACH_IMM_USE_FAST (use_p, iter, def)
9237 if (gphi *phi = dyn_cast <gphi *> (USE_STMT (use_p)))
9238 if (gimple_bb (phi)->loop_father->header == gimple_bb (phi)
9239 && (phi_info = loop_vinfo->lookup_stmt (phi))
9240 && STMT_VINFO_RELEVANT_P (phi_info)
9241 && VECTORIZABLE_CYCLE_DEF (STMT_VINFO_DEF_TYPE (phi_info))
9242 && STMT_VINFO_REDUC_TYPE (phi_info) != FOLD_LEFT_REDUCTION
9243 && STMT_VINFO_REDUC_TYPE (phi_info) != EXTRACT_LAST_REDUCTION)
9245 loop_p loop = gimple_bb (phi)->loop_father;
9246 edge e = loop_latch_edge (loop);
9247 if (PHI_ARG_DEF_FROM_EDGE (phi, e) == def)
9249 vec<gimple *> &phi_defs = STMT_VINFO_VEC_STMTS (phi_info);
9250 vec<gimple *> &latch_defs = STMT_VINFO_VEC_STMTS (def_stmt_info);
9251 gcc_assert (phi_defs.length () == latch_defs.length ());
9252 for (unsigned i = 0; i < phi_defs.length (); ++i)
9253 add_phi_arg (as_a <gphi *> (phi_defs[i]),
9254 gimple_get_lhs (latch_defs[i]), e,
9255 gimple_phi_arg_location (phi, e->dest_idx));
9260 /* Vectorize STMT_INFO if relevant, inserting any new instructions before GSI.
9261 When vectorizing STMT_INFO as a store, set *SEEN_STORE to its
9262 stmt_vec_info. */
9264 static bool
9265 vect_transform_loop_stmt (loop_vec_info loop_vinfo, stmt_vec_info stmt_info,
9266 gimple_stmt_iterator *gsi, stmt_vec_info *seen_store)
9268 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
9269 poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
9271 if (dump_enabled_p ())
9272 dump_printf_loc (MSG_NOTE, vect_location,
9273 "------>vectorizing statement: %G", stmt_info->stmt);
9275 if (MAY_HAVE_DEBUG_BIND_STMTS && !STMT_VINFO_LIVE_P (stmt_info))
9276 vect_loop_kill_debug_uses (loop, stmt_info);
9278 if (!STMT_VINFO_RELEVANT_P (stmt_info)
9279 && !STMT_VINFO_LIVE_P (stmt_info))
9280 return false;
9282 if (STMT_VINFO_VECTYPE (stmt_info))
9284 poly_uint64 nunits
9285 = TYPE_VECTOR_SUBPARTS (STMT_VINFO_VECTYPE (stmt_info));
9286 if (!STMT_SLP_TYPE (stmt_info)
9287 && maybe_ne (nunits, vf)
9288 && dump_enabled_p ())
9289 /* For SLP, VF is set according to the unrolling factor rather than
9290 the vector size, hence this message is not valid for SLP. */
9291 dump_printf_loc (MSG_NOTE, vect_location, "multiple-types.\n");
9294 /* Pure SLP statements have already been vectorized. We still need
9295 to apply loop vectorization to hybrid SLP statements. */
9296 if (PURE_SLP_STMT (stmt_info))
9297 return false;
9299 if (dump_enabled_p ())
9300 dump_printf_loc (MSG_NOTE, vect_location, "transform statement.\n");
9302 if (vect_transform_stmt (loop_vinfo, stmt_info, gsi, NULL, NULL))
9303 *seen_store = stmt_info;
9305 return true;
9308 /* Helper function to pass to simplify_replace_tree to enable replacing trees
9309 that occur in the hash_map with their corresponding values. */
9311 static tree
9312 find_in_mapping (tree t, void *context)
9314 hash_map<tree,tree>* mapping = (hash_map<tree, tree>*) context;
9316 tree *value = mapping->get (t);
9317 return value ? *value : t;
9320 /* Update EPILOGUE's loop_vec_info. EPILOGUE was constructed as a copy of the
9321 original loop that has now been vectorized.
9323 The inits of the data_references need to be advanced with the number of
9324 iterations of the main loop. This has been computed in vect_do_peeling and
9325 is stored in parameter ADVANCE. We first restore the data_references'
9326 initial offsets with the values recorded in ORIG_DRS_INIT.
9328 Since the loop_vec_info of this EPILOGUE was constructed for the original
9329 loop, its stmt_vec_infos all point to the original statements. These need
9330 to be updated to point to their corresponding copies as well as the SSA_NAMES
9331 in their PATTERN_DEF_SEQs and RELATED_STMTs.
9333 The data_references' connections also need to be updated: their
9334 corresponding dr_vec_infos need to be reconnected to the EPILOGUE's
9335 stmt_vec_infos and their statements need to point to their corresponding
9336 copies. If they are gather loads or scatter stores, their references need
9337 to be updated to point to the corresponding copies as well. Finally we set
9338 'base_misaligned' to false, as we have already peeled for alignment in the
9339 prologue of the main loop. */
9341 static void
9342 update_epilogue_loop_vinfo (class loop *epilogue, tree advance)
9344 loop_vec_info epilogue_vinfo = loop_vec_info_for_loop (epilogue);
9345 auto_vec<gimple *> stmt_worklist;
9346 hash_map<tree,tree> mapping;
9347 gimple *orig_stmt, *new_stmt;
9348 gimple_stmt_iterator epilogue_gsi;
9349 gphi_iterator epilogue_phi_gsi;
9350 stmt_vec_info stmt_vinfo = NULL, related_vinfo;
9351 basic_block *epilogue_bbs = get_loop_body (epilogue);
9352 unsigned i;
9354 free (LOOP_VINFO_BBS (epilogue_vinfo));
9355 LOOP_VINFO_BBS (epilogue_vinfo) = epilogue_bbs;
9357 /* Advance data_reference's with the number of iterations of the previous
9358 loop and its prologue. */
9359 vect_update_inits_of_drs (epilogue_vinfo, advance, PLUS_EXPR);
9362 /* The EPILOGUE loop is a copy of the original loop so they share the same
9363 gimple UIDs. In this loop we update the loop_vec_info of the EPILOGUE to
9364 point to the copied statements. We also create a mapping from the LHS' in
9365 the original loop to the LHS' in the EPILOGUE and create worklists to
9366 update the STMT_VINFO_PATTERN_DEF_SEQs and STMT_VINFO_RELATED_STMTs. */
9367 for (unsigned i = 0; i < epilogue->num_nodes; ++i)
9369 for (epilogue_phi_gsi = gsi_start_phis (epilogue_bbs[i]);
9370 !gsi_end_p (epilogue_phi_gsi); gsi_next (&epilogue_phi_gsi))
9372 new_stmt = epilogue_phi_gsi.phi ();
9374 gcc_assert (gimple_uid (new_stmt) > 0);
9375 stmt_vinfo
9376 = epilogue_vinfo->stmt_vec_infos[gimple_uid (new_stmt) - 1];
9378 orig_stmt = STMT_VINFO_STMT (stmt_vinfo);
9379 STMT_VINFO_STMT (stmt_vinfo) = new_stmt;
9381 mapping.put (gimple_phi_result (orig_stmt),
9382 gimple_phi_result (new_stmt));
9383 /* PHI nodes can not have patterns or related statements. */
9384 gcc_assert (STMT_VINFO_PATTERN_DEF_SEQ (stmt_vinfo) == NULL
9385 && STMT_VINFO_RELATED_STMT (stmt_vinfo) == NULL);
9388 for (epilogue_gsi = gsi_start_bb (epilogue_bbs[i]);
9389 !gsi_end_p (epilogue_gsi); gsi_next (&epilogue_gsi))
9391 new_stmt = gsi_stmt (epilogue_gsi);
9392 if (is_gimple_debug (new_stmt))
9393 continue;
9395 gcc_assert (gimple_uid (new_stmt) > 0);
9396 stmt_vinfo
9397 = epilogue_vinfo->stmt_vec_infos[gimple_uid (new_stmt) - 1];
9399 orig_stmt = STMT_VINFO_STMT (stmt_vinfo);
9400 STMT_VINFO_STMT (stmt_vinfo) = new_stmt;
9402 if (tree old_lhs = gimple_get_lhs (orig_stmt))
9403 mapping.put (old_lhs, gimple_get_lhs (new_stmt));
9405 if (STMT_VINFO_PATTERN_DEF_SEQ (stmt_vinfo))
9407 gimple_seq seq = STMT_VINFO_PATTERN_DEF_SEQ (stmt_vinfo);
9408 for (gimple_stmt_iterator gsi = gsi_start (seq);
9409 !gsi_end_p (gsi); gsi_next (&gsi))
9410 stmt_worklist.safe_push (gsi_stmt (gsi));
9413 related_vinfo = STMT_VINFO_RELATED_STMT (stmt_vinfo);
9414 if (related_vinfo != NULL && related_vinfo != stmt_vinfo)
9416 gimple *stmt = STMT_VINFO_STMT (related_vinfo);
9417 stmt_worklist.safe_push (stmt);
9418 /* Set BB such that the assert in
9419 'get_initial_def_for_reduction' is able to determine that
9420 the BB of the related stmt is inside this loop. */
9421 gimple_set_bb (stmt,
9422 gimple_bb (new_stmt));
9423 related_vinfo = STMT_VINFO_RELATED_STMT (related_vinfo);
9424 gcc_assert (related_vinfo == NULL
9425 || related_vinfo == stmt_vinfo);
9430 /* The PATTERN_DEF_SEQs and RELATED_STMTs in the epilogue were constructed
9431 using the original main loop and thus need to be updated to refer to the
9432 cloned variables used in the epilogue. */
9433 for (unsigned i = 0; i < stmt_worklist.length (); ++i)
9435 gimple *stmt = stmt_worklist[i];
9436 tree *new_op;
9438 for (unsigned j = 1; j < gimple_num_ops (stmt); ++j)
9440 tree op = gimple_op (stmt, j);
9441 if ((new_op = mapping.get(op)))
9442 gimple_set_op (stmt, j, *new_op);
9443 else
9445 /* PR92429: The last argument of simplify_replace_tree disables
9446 folding when replacing arguments. This is required as
9447 otherwise you might end up with different statements than the
9448 ones analyzed in vect_loop_analyze, leading to different
9449 vectorization. */
9450 op = simplify_replace_tree (op, NULL_TREE, NULL_TREE,
9451 &find_in_mapping, &mapping, false);
9452 gimple_set_op (stmt, j, op);
9457 struct data_reference *dr;
9458 vec<data_reference_p> datarefs = LOOP_VINFO_DATAREFS (epilogue_vinfo);
9459 FOR_EACH_VEC_ELT (datarefs, i, dr)
9461 orig_stmt = DR_STMT (dr);
9462 gcc_assert (gimple_uid (orig_stmt) > 0);
9463 stmt_vinfo = epilogue_vinfo->stmt_vec_infos[gimple_uid (orig_stmt) - 1];
9464 /* Data references for gather loads and scatter stores do not use the
9465 updated offset we set using ADVANCE. Instead we have to make sure the
9466 references in the data references point to the corresponding copies of
9467 the originals in the epilogue. */
9468 if (STMT_VINFO_MEMORY_ACCESS_TYPE (vect_stmt_to_vectorize (stmt_vinfo))
9469 == VMAT_GATHER_SCATTER)
9471 DR_REF (dr)
9472 = simplify_replace_tree (DR_REF (dr), NULL_TREE, NULL_TREE,
9473 &find_in_mapping, &mapping);
9474 DR_BASE_ADDRESS (dr)
9475 = simplify_replace_tree (DR_BASE_ADDRESS (dr), NULL_TREE, NULL_TREE,
9476 &find_in_mapping, &mapping);
9478 DR_STMT (dr) = STMT_VINFO_STMT (stmt_vinfo);
9479 stmt_vinfo->dr_aux.stmt = stmt_vinfo;
9480 /* The vector size of the epilogue is smaller than that of the main loop,
9481 so its alignment requirement is either the same or lower. This means
9482 the DR will by definition be aligned. */
9483 STMT_VINFO_DR_INFO (stmt_vinfo)->base_misaligned = false;
9486 epilogue_vinfo->shared->datarefs_copy.release ();
9487 epilogue_vinfo->shared->save_datarefs ();
9490 /* Function vect_transform_loop.
9492 The analysis phase has determined that the loop is vectorizable.
9493 Vectorize the loop - created vectorized stmts to replace the scalar
9494 stmts in the loop, and update the loop exit condition.
9495 Returns scalar epilogue loop if any. */
9497 class loop *
9498 vect_transform_loop (loop_vec_info loop_vinfo, gimple *loop_vectorized_call)
9500 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
9501 class loop *epilogue = NULL;
9502 basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
9503 int nbbs = loop->num_nodes;
9504 int i;
9505 tree niters_vector = NULL_TREE;
9506 tree step_vector = NULL_TREE;
9507 tree niters_vector_mult_vf = NULL_TREE;
9508 poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
9509 unsigned int lowest_vf = constant_lower_bound (vf);
9510 gimple *stmt;
9511 bool check_profitability = false;
9512 unsigned int th;
9514 DUMP_VECT_SCOPE ("vec_transform_loop");
9516 loop_vinfo->shared->check_datarefs ();
9518 /* Use the more conservative vectorization threshold. If the number
9519 of iterations is constant, assume the cost check has been performed
9520 by our caller. If the threshold makes all loops profitable that
9521 run at least the (estimated) vectorization factor number of times,
9522 checking is pointless, too. */
9523 th = LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo);
9524 if (vect_apply_runtime_profitability_check_p (loop_vinfo))
9526 if (dump_enabled_p ())
9527 dump_printf_loc (MSG_NOTE, vect_location,
9528 "Profitability threshold is %d loop iterations.\n",
9529 th);
9530 check_profitability = true;
9533 /* Make sure there exists a single-predecessor exit bb. Do this before
9534 versioning. */
9535 edge e = single_exit (loop);
9536 if (! single_pred_p (e->dest))
9538 split_loop_exit_edge (e, true);
9539 if (dump_enabled_p ())
9540 dump_printf (MSG_NOTE, "split exit edge\n");
9543 /* Version the loop first, if required, so the profitability check
9544 comes first. */
9546 if (LOOP_REQUIRES_VERSIONING (loop_vinfo))
9548 class loop *sloop
9549 = vect_loop_versioning (loop_vinfo, loop_vectorized_call);
9550 sloop->force_vectorize = false;
9551 check_profitability = false;
9554 /* Make sure there exists a single-predecessor exit bb also on the
9555 scalar loop copy. Do this after versioning but before peeling
9556 so CFG structure is fine for both scalar and if-converted loop
9557 to make slpeel_duplicate_current_defs_from_edges face matched
9558 loop closed PHI nodes on the exit. */
9559 if (LOOP_VINFO_SCALAR_LOOP (loop_vinfo))
9561 e = single_exit (LOOP_VINFO_SCALAR_LOOP (loop_vinfo));
9562 if (! single_pred_p (e->dest))
9564 split_loop_exit_edge (e, true);
9565 if (dump_enabled_p ())
9566 dump_printf (MSG_NOTE, "split exit edge of scalar loop\n");
9570 tree niters = vect_build_loop_niters (loop_vinfo);
9571 LOOP_VINFO_NITERS_UNCHANGED (loop_vinfo) = niters;
9572 tree nitersm1 = unshare_expr (LOOP_VINFO_NITERSM1 (loop_vinfo));
9573 bool niters_no_overflow = loop_niters_no_overflow (loop_vinfo);
9574 tree advance;
9575 drs_init_vec orig_drs_init;
9577 epilogue = vect_do_peeling (loop_vinfo, niters, nitersm1, &niters_vector,
9578 &step_vector, &niters_vector_mult_vf, th,
9579 check_profitability, niters_no_overflow,
9580 &advance);
9582 if (LOOP_VINFO_SCALAR_LOOP (loop_vinfo)
9583 && LOOP_VINFO_SCALAR_LOOP_SCALING (loop_vinfo).initialized_p ())
9584 scale_loop_frequencies (LOOP_VINFO_SCALAR_LOOP (loop_vinfo),
9585 LOOP_VINFO_SCALAR_LOOP_SCALING (loop_vinfo));
9587 if (niters_vector == NULL_TREE)
9589 if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
9590 && !LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo)
9591 && known_eq (lowest_vf, vf))
9593 niters_vector
9594 = build_int_cst (TREE_TYPE (LOOP_VINFO_NITERS (loop_vinfo)),
9595 LOOP_VINFO_INT_NITERS (loop_vinfo) / lowest_vf);
9596 step_vector = build_one_cst (TREE_TYPE (niters));
9598 else if (vect_use_loop_mask_for_alignment_p (loop_vinfo))
9599 vect_gen_vector_loop_niters (loop_vinfo, niters, &niters_vector,
9600 &step_vector, niters_no_overflow);
9601 else
9602 /* vect_do_peeling subtracted the number of peeled prologue
9603 iterations from LOOP_VINFO_NITERS. */
9604 vect_gen_vector_loop_niters (loop_vinfo, LOOP_VINFO_NITERS (loop_vinfo),
9605 &niters_vector, &step_vector,
9606 niters_no_overflow);
9609 /* 1) Make sure the loop header has exactly two entries
9610 2) Make sure we have a preheader basic block. */
9612 gcc_assert (EDGE_COUNT (loop->header->preds) == 2);
9614 split_edge (loop_preheader_edge (loop));
9616 if (vect_use_loop_mask_for_alignment_p (loop_vinfo))
9617 /* This will deal with any possible peeling. */
9618 vect_prepare_for_masked_peels (loop_vinfo);
9620 /* Schedule the SLP instances first, then handle loop vectorization
9621 below. */
9622 if (!loop_vinfo->slp_instances.is_empty ())
9624 DUMP_VECT_SCOPE ("scheduling SLP instances");
9625 vect_schedule_slp (loop_vinfo, LOOP_VINFO_SLP_INSTANCES (loop_vinfo));
9628 /* FORNOW: the vectorizer supports only loops whose body consists
9629 of one basic block (header + empty latch). When the vectorizer
9630 supports more involved loop forms, the order in which the BBs are
9631 traversed needs to be reconsidered. */
9633 for (i = 0; i < nbbs; i++)
9635 basic_block bb = bbs[i];
9636 stmt_vec_info stmt_info;
9638 for (gphi_iterator si = gsi_start_phis (bb); !gsi_end_p (si);
9639 gsi_next (&si))
9641 gphi *phi = si.phi ();
9642 if (dump_enabled_p ())
9643 dump_printf_loc (MSG_NOTE, vect_location,
9644 "------>vectorizing phi: %G", phi);
9645 stmt_info = loop_vinfo->lookup_stmt (phi);
9646 if (!stmt_info)
9647 continue;
9649 if (MAY_HAVE_DEBUG_BIND_STMTS && !STMT_VINFO_LIVE_P (stmt_info))
9650 vect_loop_kill_debug_uses (loop, stmt_info);
9652 if (!STMT_VINFO_RELEVANT_P (stmt_info)
9653 && !STMT_VINFO_LIVE_P (stmt_info))
9654 continue;
9656 if (STMT_VINFO_VECTYPE (stmt_info)
9657 && (maybe_ne
9658 (TYPE_VECTOR_SUBPARTS (STMT_VINFO_VECTYPE (stmt_info)), vf))
9659 && dump_enabled_p ())
9660 dump_printf_loc (MSG_NOTE, vect_location, "multiple-types.\n");
9662 if ((STMT_VINFO_DEF_TYPE (stmt_info) == vect_induction_def
9663 || STMT_VINFO_DEF_TYPE (stmt_info) == vect_reduction_def
9664 || STMT_VINFO_DEF_TYPE (stmt_info) == vect_double_reduction_def
9665 || STMT_VINFO_DEF_TYPE (stmt_info) == vect_nested_cycle
9666 || STMT_VINFO_DEF_TYPE (stmt_info) == vect_internal_def)
9667 && ! PURE_SLP_STMT (stmt_info))
9669 if (dump_enabled_p ())
9670 dump_printf_loc (MSG_NOTE, vect_location, "transform phi.\n");
9671 vect_transform_stmt (loop_vinfo, stmt_info, NULL, NULL, NULL);
9675 for (gphi_iterator si = gsi_start_phis (bb); !gsi_end_p (si);
9676 gsi_next (&si))
9678 gphi *phi = si.phi ();
9679 stmt_info = loop_vinfo->lookup_stmt (phi);
9680 if (!stmt_info)
9681 continue;
9683 if (!STMT_VINFO_RELEVANT_P (stmt_info)
9684 && !STMT_VINFO_LIVE_P (stmt_info))
9685 continue;
9687 if ((STMT_VINFO_DEF_TYPE (stmt_info) == vect_induction_def
9688 || STMT_VINFO_DEF_TYPE (stmt_info) == vect_reduction_def
9689 || STMT_VINFO_DEF_TYPE (stmt_info) == vect_double_reduction_def
9690 || STMT_VINFO_DEF_TYPE (stmt_info) == vect_nested_cycle
9691 || STMT_VINFO_DEF_TYPE (stmt_info) == vect_internal_def)
9692 && ! PURE_SLP_STMT (stmt_info))
9693 maybe_set_vectorized_backedge_value (loop_vinfo, stmt_info);
9696 for (gimple_stmt_iterator si = gsi_start_bb (bb);
9697 !gsi_end_p (si);)
9699 stmt = gsi_stmt (si);
9700 /* During vectorization remove existing clobber stmts. */
9701 if (gimple_clobber_p (stmt))
9703 unlink_stmt_vdef (stmt);
9704 gsi_remove (&si, true);
9705 release_defs (stmt);
9707 else
9709 /* Ignore vector stmts created in the outer loop. */
9710 stmt_info = loop_vinfo->lookup_stmt (stmt);
9712 /* vector stmts created in the outer-loop during vectorization of
9713 stmts in an inner-loop may not have a stmt_info, and do not
9714 need to be vectorized. */
9715 stmt_vec_info seen_store = NULL;
9716 if (stmt_info)
9718 if (STMT_VINFO_IN_PATTERN_P (stmt_info))
9720 gimple *def_seq = STMT_VINFO_PATTERN_DEF_SEQ (stmt_info);
9721 for (gimple_stmt_iterator subsi = gsi_start (def_seq);
9722 !gsi_end_p (subsi); gsi_next (&subsi))
9724 stmt_vec_info pat_stmt_info
9725 = loop_vinfo->lookup_stmt (gsi_stmt (subsi));
9726 vect_transform_loop_stmt (loop_vinfo, pat_stmt_info,
9727 &si, &seen_store);
9729 stmt_vec_info pat_stmt_info
9730 = STMT_VINFO_RELATED_STMT (stmt_info);
9731 if (vect_transform_loop_stmt (loop_vinfo, pat_stmt_info,
9732 &si, &seen_store))
9733 maybe_set_vectorized_backedge_value (loop_vinfo,
9734 pat_stmt_info);
9736 else
9738 if (vect_transform_loop_stmt (loop_vinfo, stmt_info, &si,
9739 &seen_store))
9740 maybe_set_vectorized_backedge_value (loop_vinfo,
9741 stmt_info);
9744 gsi_next (&si);
9745 if (seen_store)
9747 if (STMT_VINFO_GROUPED_ACCESS (seen_store))
9748 /* Interleaving. The vectorization of the
9749 interleaving chain has been completed -
9750 free all the stores in the chain. */
9751 vect_remove_stores (loop_vinfo,
9752 DR_GROUP_FIRST_ELEMENT (seen_store));
9753 else
9754 /* Free the attached stmt_vec_info and remove the stmt. */
9755 loop_vinfo->remove_stmt (stmt_info);
9760 /* Stub out scalar statements that must not survive vectorization.
9761 Doing this here helps with grouped statements, or statements that
9762 are involved in patterns. */
9763 for (gimple_stmt_iterator gsi = gsi_start_bb (bb);
9764 !gsi_end_p (gsi); gsi_next (&gsi))
9766 gcall *call = dyn_cast <gcall *> (gsi_stmt (gsi));
9767 if (!call || !gimple_call_internal_p (call))
9768 continue;
9769 internal_fn ifn = gimple_call_internal_fn (call);
9770 if (ifn == IFN_MASK_LOAD)
9772 tree lhs = gimple_get_lhs (call);
9773 if (!VECTOR_TYPE_P (TREE_TYPE (lhs)))
9775 tree zero = build_zero_cst (TREE_TYPE (lhs));
9776 gimple *new_stmt = gimple_build_assign (lhs, zero);
9777 gsi_replace (&gsi, new_stmt, true);
9780 else if (conditional_internal_fn_code (ifn) != ERROR_MARK)
9782 tree lhs = gimple_get_lhs (call);
9783 if (!VECTOR_TYPE_P (TREE_TYPE (lhs)))
9785 tree else_arg
9786 = gimple_call_arg (call, gimple_call_num_args (call) - 1);
9787 gimple *new_stmt = gimple_build_assign (lhs, else_arg);
9788 gsi_replace (&gsi, new_stmt, true);
9792 } /* BBs in loop */
9794 /* The vectorization factor is always > 1, so if we use an IV increment of 1,
9795 a zero NITERS becomes a nonzero NITERS_VECTOR. */
9796 if (integer_onep (step_vector))
9797 niters_no_overflow = true;
9798 vect_set_loop_condition (loop, loop_vinfo, niters_vector, step_vector,
9799 niters_vector_mult_vf, !niters_no_overflow);
9801 unsigned int assumed_vf = vect_vf_for_cost (loop_vinfo);
9802 scale_profile_for_vect_loop (loop, assumed_vf);
9804 /* True if the final iteration might not handle a full vector's
9805 worth of scalar iterations. */
9806 bool final_iter_may_be_partial
9807 = LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo);
9808 /* The minimum number of iterations performed by the epilogue. This
9809 is 1 when peeling for gaps because we always need a final scalar
9810 iteration. */
9811 int min_epilogue_iters = LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo) ? 1 : 0;
9812 /* +1 to convert latch counts to loop iteration counts,
9813 -min_epilogue_iters to remove iterations that cannot be performed
9814 by the vector code. */
9815 int bias_for_lowest = 1 - min_epilogue_iters;
9816 int bias_for_assumed = bias_for_lowest;
9817 int alignment_npeels = LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo);
9818 if (alignment_npeels && LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))
9820 /* When the amount of peeling is known at compile time, the first
9821 iteration will have exactly alignment_npeels active elements.
9822 In the worst case it will have at least one. */
9823 int min_first_active = (alignment_npeels > 0 ? alignment_npeels : 1);
9824 bias_for_lowest += lowest_vf - min_first_active;
9825 bias_for_assumed += assumed_vf - min_first_active;
9827 /* In these calculations the "- 1" converts loop iteration counts
9828 back to latch counts. */
9829 if (loop->any_upper_bound)
9831 loop_vec_info main_vinfo = LOOP_VINFO_ORIG_LOOP_INFO (loop_vinfo);
9832 loop->nb_iterations_upper_bound
9833 = (final_iter_may_be_partial
9834 ? wi::udiv_ceil (loop->nb_iterations_upper_bound + bias_for_lowest,
9835 lowest_vf) - 1
9836 : wi::udiv_floor (loop->nb_iterations_upper_bound + bias_for_lowest,
9837 lowest_vf) - 1);
9838 if (main_vinfo)
9840 unsigned int bound;
9841 poly_uint64 main_iters
9842 = upper_bound (LOOP_VINFO_VECT_FACTOR (main_vinfo),
9843 LOOP_VINFO_COST_MODEL_THRESHOLD (main_vinfo));
9844 main_iters
9845 = upper_bound (main_iters,
9846 LOOP_VINFO_VERSIONING_THRESHOLD (main_vinfo));
9847 if (can_div_away_from_zero_p (main_iters,
9848 LOOP_VINFO_VECT_FACTOR (loop_vinfo),
9849 &bound))
9850 loop->nb_iterations_upper_bound
9851 = wi::umin ((widest_int) (bound - 1),
9852 loop->nb_iterations_upper_bound);
9855 if (loop->any_likely_upper_bound)
9856 loop->nb_iterations_likely_upper_bound
9857 = (final_iter_may_be_partial
9858 ? wi::udiv_ceil (loop->nb_iterations_likely_upper_bound
9859 + bias_for_lowest, lowest_vf) - 1
9860 : wi::udiv_floor (loop->nb_iterations_likely_upper_bound
9861 + bias_for_lowest, lowest_vf) - 1);
9862 if (loop->any_estimate)
9863 loop->nb_iterations_estimate
9864 = (final_iter_may_be_partial
9865 ? wi::udiv_ceil (loop->nb_iterations_estimate + bias_for_assumed,
9866 assumed_vf) - 1
9867 : wi::udiv_floor (loop->nb_iterations_estimate + bias_for_assumed,
9868 assumed_vf) - 1);
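/* A worked instance of the bound adjustment above, with made-up numbers:
   assume lowest_vf == 4, min_epilogue_iters == 0 (so bias_for_lowest == 1),
   no partial vectors and a known latch bound of 99 (100 iterations).  Then
   udiv_floor (99 + 1, 4) - 1 == 24, i.e. the vector loop's latch executes
   at most 24 times (25 vector iterations).  */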
9870 if (dump_enabled_p ())
9872 if (!LOOP_VINFO_EPILOGUE_P (loop_vinfo))
9874 dump_printf_loc (MSG_NOTE, vect_location,
9875 "LOOP VECTORIZED\n");
9876 if (loop->inner)
9877 dump_printf_loc (MSG_NOTE, vect_location,
9878 "OUTER LOOP VECTORIZED\n");
9879 dump_printf (MSG_NOTE, "\n");
9881 else
9882 dump_printf_loc (MSG_NOTE, vect_location,
9883 "LOOP EPILOGUE VECTORIZED (MODE=%s)\n",
9884 GET_MODE_NAME (loop_vinfo->vector_mode));
9887 /* Loops vectorized with a variable factor won't benefit from
9888 unrolling/peeling. */
9889 if (!vf.is_constant ())
9891 loop->unroll = 1;
9892 if (dump_enabled_p ())
9893 dump_printf_loc (MSG_NOTE, vect_location, "Disabling unrolling due to"
9894 " variable-length vectorization factor\n");
9896 /* Free SLP instances here because otherwise stmt reference counting
9897 won't work. */
9898 slp_instance instance;
9899 FOR_EACH_VEC_ELT (LOOP_VINFO_SLP_INSTANCES (loop_vinfo), i, instance)
9900 vect_free_slp_instance (instance);
9901 LOOP_VINFO_SLP_INSTANCES (loop_vinfo).release ();
9902 /* Clear the safelen field since its value is invalid after vectorization,
9903 as the vectorized loop can have loop-carried dependencies. */
9904 loop->safelen = 0;
9906 if (epilogue)
9908 update_epilogue_loop_vinfo (epilogue, advance);
9910 epilogue->simduid = loop->simduid;
9911 epilogue->force_vectorize = loop->force_vectorize;
9912 epilogue->dont_vectorize = false;
9915 return epilogue;
9918 /* The code below performs a simple optimization - it reverts
9919 if-conversion for masked stores, i.e. if the mask of a store is zero it
9920 skips the store and, if possible, the producers of the stored values too.
9921 For example,
9922 for (i=0; i<n; i++)
9923 if (c[i])
9924 {
9925 p1[i] += 1;
9926 p2[i] = p3[i] +2;
9927 }
9928 this transformation will produce the following semi-hammock:
9930 if (!mask__ifc__42.18_165 == { 0, 0, 0, 0, 0, 0, 0, 0 })
9931 {
9932 vect__11.19_170 = MASK_LOAD (vectp_p1.20_168, 0B, mask__ifc__42.18_165);
9933 vect__12.22_172 = vect__11.19_170 + vect_cst__171;
9934 MASK_STORE (vectp_p1.23_175, 0B, mask__ifc__42.18_165, vect__12.22_172);
9935 vect__18.25_182 = MASK_LOAD (vectp_p3.26_180, 0B, mask__ifc__42.18_165);
9936 vect__19.28_184 = vect__18.25_182 + vect_cst__183;
9937 MASK_STORE (vectp_p2.29_187, 0B, mask__ifc__42.18_165, vect__19.28_184);
9938 }
9939 */
9941 void
9942 optimize_mask_stores (class loop *loop)
9944 basic_block *bbs = get_loop_body (loop);
9945 unsigned nbbs = loop->num_nodes;
9946 unsigned i;
9947 basic_block bb;
9948 class loop *bb_loop;
9949 gimple_stmt_iterator gsi;
9950 gimple *stmt;
9951 auto_vec<gimple *> worklist;
9952 auto_purge_vect_location sentinel;
9954 vect_location = find_loop_location (loop);
9955 /* Pick up all masked stores in loop if any. */
9956 for (i = 0; i < nbbs; i++)
9958 bb = bbs[i];
9959 for (gsi = gsi_start_bb (bb); !gsi_end_p (gsi);
9960 gsi_next (&gsi))
9962 stmt = gsi_stmt (gsi);
9963 if (gimple_call_internal_p (stmt, IFN_MASK_STORE))
9964 worklist.safe_push (stmt);
9968 free (bbs);
9969 if (worklist.is_empty ())
9970 return;
9972 /* Loop has masked stores. */
9973 while (!worklist.is_empty ())
9975 gimple *last, *last_store;
9976 edge e, efalse;
9977 tree mask;
9978 basic_block store_bb, join_bb;
9979 gimple_stmt_iterator gsi_to;
9980 tree vdef, new_vdef;
9981 gphi *phi;
9982 tree vectype;
9983 tree zero;
9985 last = worklist.pop ();
9986 mask = gimple_call_arg (last, 2);
9987 bb = gimple_bb (last);
9988 /* Create then_bb and the if-then structure in the CFG; then_bb belongs to
9989 the same loop as if_bb. It could be different from LOOP when a
9990 two-level loop nest is vectorized and the mask_store belongs to the inner
9991 one. */
9992 e = split_block (bb, last);
9993 bb_loop = bb->loop_father;
9994 gcc_assert (loop == bb_loop || flow_loop_nested_p (loop, bb_loop));
9995 join_bb = e->dest;
9996 store_bb = create_empty_bb (bb);
9997 add_bb_to_loop (store_bb, bb_loop);
9998 e->flags = EDGE_TRUE_VALUE;
9999 efalse = make_edge (bb, store_bb, EDGE_FALSE_VALUE);
10000 /* Put STORE_BB to likely part. */
10001 efalse->probability = profile_probability::unlikely ();
10002 store_bb->count = efalse->count ();
10003 make_single_succ_edge (store_bb, join_bb, EDGE_FALLTHRU);
10004 if (dom_info_available_p (CDI_DOMINATORS))
10005 set_immediate_dominator (CDI_DOMINATORS, store_bb, bb);
10006 if (dump_enabled_p ())
10007 dump_printf_loc (MSG_NOTE, vect_location,
10008 "Create new block %d to sink mask stores.",
10009 store_bb->index);
10010 /* Create vector comparison with boolean result. */
10011 vectype = TREE_TYPE (mask);
10012 zero = build_zero_cst (vectype);
10013 stmt = gimple_build_cond (EQ_EXPR, mask, zero, NULL_TREE, NULL_TREE);
10014 gsi = gsi_last_bb (bb);
10015 gsi_insert_after (&gsi, stmt, GSI_SAME_STMT);
10016 /* Create new PHI node for vdef of the last masked store:
10017 .MEM_2 = VDEF <.MEM_1>
10018 will be converted to
10019 .MEM.3 = VDEF <.MEM_1>
10020 and new PHI node will be created in join bb
10021 .MEM_2 = PHI <.MEM_1, .MEM_3>
10022 */
10023 vdef = gimple_vdef (last);
10024 new_vdef = make_ssa_name (gimple_vop (cfun), last);
10025 gimple_set_vdef (last, new_vdef);
10026 phi = create_phi_node (vdef, join_bb);
10027 add_phi_arg (phi, new_vdef, EDGE_SUCC (store_bb, 0), UNKNOWN_LOCATION);
10029 /* Put all masked stores with the same mask to STORE_BB if possible. */
10030 while (true)
10032 gimple_stmt_iterator gsi_from;
10033 gimple *stmt1 = NULL;
10035 /* Move masked store to STORE_BB. */
10036 last_store = last;
10037 gsi = gsi_for_stmt (last);
10038 gsi_from = gsi;
10039 /* Shift GSI to the previous stmt for further traversal. */
10040 gsi_prev (&gsi);
10041 gsi_to = gsi_start_bb (store_bb);
10042 gsi_move_before (&gsi_from, &gsi_to);
10043 /* Setup GSI_TO to the non-empty block start. */
10044 gsi_to = gsi_start_bb (store_bb);
10045 if (dump_enabled_p ())
10046 dump_printf_loc (MSG_NOTE, vect_location,
10047 "Move stmt to created bb\n%G", last);
10048 /* Move all stored value producers if possible. */
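	  /* Summary of the checks applied by the scan below (comment added
	     for exposition): debug statements are skipped; the scan stops at
	     any statement that writes memory or has volatile operands; a
	     movable statement must define an SSA_NAME of vector type (a dead
	     non-vector definition is simply deleted), all its non-debug uses
	     must already be in STORE_BB, and its virtual use, if any, must
	     match that of the masked store being sunk.  */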
10049 while (!gsi_end_p (gsi))
10051 tree lhs;
10052 imm_use_iterator imm_iter;
10053 use_operand_p use_p;
10054 bool res;
10056 /* Skip debug statements. */
10057 if (is_gimple_debug (gsi_stmt (gsi)))
10059 gsi_prev (&gsi);
10060 continue;
10062 stmt1 = gsi_stmt (gsi);
10063 /* Do not consider statements that write to memory or have a
10064    volatile operand.  */
10065 if (gimple_vdef (stmt1)
10066 || gimple_has_volatile_ops (stmt1))
10067 break;
10068 gsi_from = gsi;
10069 gsi_prev (&gsi);
10070 lhs = gimple_get_lhs (stmt1);
10071 if (!lhs)
10072 break;
10074 /* The LHS of a vectorized stmt must be an SSA_NAME.  */
10075 if (TREE_CODE (lhs) != SSA_NAME)
10076 break;
10078 if (!VECTOR_TYPE_P (TREE_TYPE (lhs)))
10080 /* Remove dead scalar statement. */
10081 if (has_zero_uses (lhs))
10083 gsi_remove (&gsi_from, true);
10084 continue;
10088 /* Check that LHS does not have uses outside of STORE_BB. */
10089 res = true;
10090 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, lhs)
10092 gimple *use_stmt;
10093 use_stmt = USE_STMT (use_p);
10094 if (is_gimple_debug (use_stmt))
10095 continue;
10096 if (gimple_bb (use_stmt) != store_bb)
10098 res = false;
10099 break;
10102 if (!res)
10103 break;
10105 if (gimple_vuse (stmt1)
10106 && gimple_vuse (stmt1) != gimple_vuse (last_store))
10107 break;
10109 /* Can move STMT1 to STORE_BB. */
10110 if (dump_enabled_p ())
10111 dump_printf_loc (MSG_NOTE, vect_location,
10112 "Move stmt to created bb\n%G", stmt1);
10113 gsi_move_before (&gsi_from, &gsi_to);
10114 /* Shift GSI_TO for further insertion. */
10115 gsi_prev (&gsi_to);
10117 /* Put other masked stores with the same mask to STORE_BB. */
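	  /* A further masked store from the worklist is merged only when it
	     uses the same MASK and is exactly the statement at which the
	     backwards scan above stopped, i.e. the stores are separated only
	     by statements that were themselves sunk (comment added for
	     exposition).  */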
10118 if (worklist.is_empty ()
10119 || gimple_call_arg (worklist.last (), 2) != mask
10120 || worklist.last () != stmt1)
10121 break;
10122 last = worklist.pop ();
10124 add_phi_arg (phi, gimple_vuse (last_store), e, UNKNOWN_LOCATION);
10128 /* Decide whether it is possible to use a zero-based induction variable
10129 when vectorizing LOOP_VINFO with partial vectors. If it is, return
10130 the value that the induction variable must be able to hold in order
10131 to ensure that the rgroups eventually have no active vector elements.
10132 Return -1 otherwise. */
10134 widest_int
10135 vect_iv_limit_for_partial_vectors (loop_vec_info loop_vinfo)
10137 tree niters_skip = LOOP_VINFO_MASK_SKIP_NITERS (loop_vinfo);
10138 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
10139 unsigned HOST_WIDE_INT max_vf = vect_max_vf (loop_vinfo);
10141 /* Calculate the value that the induction variable must be able
10142 to hit in order to ensure that we end the loop with an all-false mask.
10143 This involves adding the maximum number of inactive trailing scalar
10144 iterations. */
10145 widest_int iv_limit = -1;
10146 if (max_loop_iterations (loop, &iv_limit))
10148 if (niters_skip)
10150 /* Add the maximum number of skipped iterations to the
10151 maximum iteration count. */
10152 if (TREE_CODE (niters_skip) == INTEGER_CST)
10153 iv_limit += wi::to_widest (niters_skip);
10154 else
10155 iv_limit += max_vf - 1;
10157 else if (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo))
10158 /* Make a conservatively-correct assumption. */
10159 iv_limit += max_vf - 1;
10161 /* IV_LIMIT is the maximum number of latch iterations, which is also
10162 the maximum in-range IV value. Round this value down to the previous
10163 vector alignment boundary and then add an extra full iteration. */
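      /* Illustrative example (not from the original source): with a maximum
	 of 11 latch iterations and a constant VF of 4 (so known_alignment
	 (vf) == 4 and MAX_VF == 4), this computes (11 & -4) + 4 = 8 + 4
	 = 12.  */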
10164 poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
10165 iv_limit = (iv_limit & -(int) known_alignment (vf)) + max_vf;
10167 return iv_limit;
10170 /* For the given rgroup_controls RGC, check whether an induction variable
10171 would ever hit a value that produces a set of all-false masks or zero
10172 lengths before wrapping around. Return true if it's possible to wrap
10173 around before hitting the desired value, otherwise return false.  */
10175 bool
10176 vect_rgroup_iv_might_wrap_p (loop_vec_info loop_vinfo, rgroup_controls *rgc)
10178 widest_int iv_limit = vect_iv_limit_for_partial_vectors (loop_vinfo);
10180 if (iv_limit == -1)
10181 return true;
10183 tree compare_type = LOOP_VINFO_RGROUP_COMPARE_TYPE (loop_vinfo);
10184 unsigned int compare_precision = TYPE_PRECISION (compare_type);
10185 unsigned nitems = rgc->max_nscalars_per_iter * rgc->factor;
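  /* The largest value the rgroup's counter must represent is
     IV_LIMIT * NITEMS; if that needs more bits than COMPARE_TYPE provides,
     the counter could wrap before reaching it.  Illustrative example (not
     from the original source): IV_LIMIT = 1000 and NITEMS = 4 give 4000,
     which needs 12 bits, so an 8-bit compare type could wrap while a
     16-bit one cannot (comment added for exposition).  */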
10187 if (wi::min_precision (iv_limit * nitems, UNSIGNED) > compare_precision)
10188 return true;
10190 return false;