gcc/tree-vect-loop.c
1 /* Loop Vectorization
2 Copyright (C) 2003-2020 Free Software Foundation, Inc.
3 Contributed by Dorit Naishlos <dorit@il.ibm.com> and
4 Ira Rosen <irar@il.ibm.com>
6 This file is part of GCC.
8 GCC is free software; you can redistribute it and/or modify it under
9 the terms of the GNU General Public License as published by the Free
10 Software Foundation; either version 3, or (at your option) any later
11 version.
13 GCC is distributed in the hope that it will be useful, but WITHOUT ANY
14 WARRANTY; without even the implied warranty of MERCHANTABILITY or
15 FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
16 for more details.
18 You should have received a copy of the GNU General Public License
19 along with GCC; see the file COPYING3. If not see
20 <http://www.gnu.org/licenses/>. */
22 #include "config.h"
23 #include "system.h"
24 #include "coretypes.h"
25 #include "backend.h"
26 #include "target.h"
27 #include "rtl.h"
28 #include "tree.h"
29 #include "gimple.h"
30 #include "cfghooks.h"
31 #include "tree-pass.h"
32 #include "ssa.h"
33 #include "optabs-tree.h"
34 #include "diagnostic-core.h"
35 #include "fold-const.h"
36 #include "stor-layout.h"
37 #include "cfganal.h"
38 #include "gimplify.h"
39 #include "gimple-iterator.h"
40 #include "gimplify-me.h"
41 #include "tree-ssa-loop-ivopts.h"
42 #include "tree-ssa-loop-manip.h"
43 #include "tree-ssa-loop-niter.h"
44 #include "tree-ssa-loop.h"
45 #include "cfgloop.h"
46 #include "tree-scalar-evolution.h"
47 #include "tree-vectorizer.h"
48 #include "gimple-fold.h"
49 #include "cgraph.h"
50 #include "tree-cfg.h"
51 #include "tree-if-conv.h"
52 #include "internal-fn.h"
53 #include "tree-vector-builder.h"
54 #include "vec-perm-indices.h"
55 #include "tree-eh.h"
57 /* Loop Vectorization Pass.
59 This pass tries to vectorize loops.
61 For example, the vectorizer transforms the following simple loop:
63 short a[N]; short b[N]; short c[N]; int i;
65 for (i=0; i<N; i++){
66 a[i] = b[i] + c[i];
69    as if it were manually vectorized by rewriting the source code into:
71 typedef int __attribute__((mode(V8HI))) v8hi;
72 short a[N]; short b[N]; short c[N]; int i;
73 v8hi *pa = (v8hi*)a, *pb = (v8hi*)b, *pc = (v8hi*)c;
74 v8hi va, vb, vc;
76 for (i=0; i<N/8; i++){
77 vb = pb[i];
78 vc = pc[i];
79 va = vb + vc;
80 pa[i] = va;
83 The main entry to this pass is vectorize_loops(), in which
84 the vectorizer applies a set of analyses on a given set of loops,
85 followed by the actual vectorization transformation for the loops that
86 had successfully passed the analysis phase.
87 Throughout this pass we make a distinction between two types of
88 data: scalars (which are represented by SSA_NAMES), and memory references
89 ("data-refs"). These two types of data require different handling both
90 during analysis and transformation. The types of data-refs that the
91    vectorizer currently supports are ARRAY_REFS whose base is an array DECL
92 (not a pointer), and INDIRECT_REFS through pointers; both array and pointer
93 accesses are required to have a simple (consecutive) access pattern.
95 Analysis phase:
96 ===============
97 The driver for the analysis phase is vect_analyze_loop().
98 It applies a set of analyses, some of which rely on the scalar evolution
99 analyzer (scev) developed by Sebastian Pop.
101 During the analysis phase the vectorizer records some information
102 per stmt in a "stmt_vec_info" struct which is attached to each stmt in the
103 loop, as well as general information about the loop as a whole, which is
104 recorded in a "loop_vec_info" struct attached to each loop.
106 Transformation phase:
107 =====================
108 The loop transformation phase scans all the stmts in the loop, and
109 creates a vector stmt (or a sequence of stmts) for each scalar stmt S in
110 the loop that needs to be vectorized. It inserts the vector code sequence
111 just before the scalar stmt S, and records a pointer to the vector code
112 in STMT_VINFO_VEC_STMT (stmt_info) (stmt_info is the stmt_vec_info struct
113 attached to S). This pointer will be used for the vectorization of following
114 stmts which use the def of stmt S. Stmt S is removed if it writes to memory;
115 otherwise, we rely on dead code elimination for removing it.
117 For example, say stmt S1 was vectorized into stmt VS1:
119 VS1: vb = px[i];
120 S1: b = x[i]; STMT_VINFO_VEC_STMT (stmt_info (S1)) = VS1
121 S2: a = b;
123 To vectorize stmt S2, the vectorizer first finds the stmt that defines
124 the operand 'b' (S1), and gets the relevant vector def 'vb' from the
125 vector stmt VS1 pointed to by STMT_VINFO_VEC_STMT (stmt_info (S1)). The
126 resulting sequence would be:
128 VS1: vb = px[i];
129 S1: b = x[i]; STMT_VINFO_VEC_STMT (stmt_info (S1)) = VS1
130 VS2: va = vb;
131 S2: a = b; STMT_VINFO_VEC_STMT (stmt_info (S2)) = VS2
133 Operands that are not SSA_NAMEs, are data-refs that appear in
134 load/store operations (like 'x[i]' in S1), and are handled differently.
136 Target modeling:
137 =================
138 Currently the only target specific information that is used is the
139 size of the vector (in bytes) - "TARGET_VECTORIZE_UNITS_PER_SIMD_WORD".
140    Targets that can support different sizes of vectors will, for now, need
141 to specify one value for "TARGET_VECTORIZE_UNITS_PER_SIMD_WORD". More
142 flexibility will be added in the future.
144    Since we only vectorize operations whose vector form can be
145 expressed using existing tree codes, to verify that an operation is
146 supported, the vectorizer checks the relevant optab at the relevant
147    machine_mode (e.g., optab_handler (add_optab, V8HImode)). If
148 the value found is CODE_FOR_nothing, then there's no target support, and
149 we can't vectorize the stmt.
151 For additional information on this project see:
152 http://gcc.gnu.org/projects/tree-ssa/vectorization.html
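/* Purely as an illustration (not code used by this pass), the rewrite
   sketched above can be expressed as self-contained C using GCC's vector
   extensions; the trailing scalar loop handles any remainder iterations
   when N is not a multiple of 8.  The function and array names here are
   made up for the example:

     typedef short v8hi __attribute__ ((vector_size (16)));

     #define N 1024
     short a[N], b[N], c[N];

     void
     add_arrays (void)
     {
       int i;
       for (i = 0; i + 8 <= N; i += 8)
         {
           v8hi va, vb, vc;
           __builtin_memcpy (&vb, &b[i], sizeof vb);
           __builtin_memcpy (&vc, &c[i], sizeof vc);
           va = vb + vc;
           __builtin_memcpy (&a[i], &va, sizeof va);
         }
       for (; i < N; i++)
         a[i] = b[i] + c[i];
     }  */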
155 static void vect_estimate_min_profitable_iters (loop_vec_info, int *, int *);
156 static stmt_vec_info vect_is_simple_reduction (loop_vec_info, stmt_vec_info,
157 bool *, bool *);
159 /* Subroutine of vect_determine_vf_for_stmt that handles only one
160 statement. VECTYPE_MAYBE_SET_P is true if STMT_VINFO_VECTYPE
161 may already be set for general statements (not just data refs). */
163 static opt_result
164 vect_determine_vf_for_stmt_1 (vec_info *vinfo, stmt_vec_info stmt_info,
165 bool vectype_maybe_set_p,
166 poly_uint64 *vf)
168 gimple *stmt = stmt_info->stmt;
170 if ((!STMT_VINFO_RELEVANT_P (stmt_info)
171 && !STMT_VINFO_LIVE_P (stmt_info))
172 || gimple_clobber_p (stmt))
174 if (dump_enabled_p ())
175 dump_printf_loc (MSG_NOTE, vect_location, "skip.\n");
176 return opt_result::success ();
179 tree stmt_vectype, nunits_vectype;
180 opt_result res = vect_get_vector_types_for_stmt (vinfo, stmt_info,
181 &stmt_vectype,
182 &nunits_vectype);
183 if (!res)
184 return res;
186 if (stmt_vectype)
188 if (STMT_VINFO_VECTYPE (stmt_info))
189 /* The only case when a vectype had been already set is for stmts
190 that contain a data ref, or for "pattern-stmts" (stmts generated
191 by the vectorizer to represent/replace a certain idiom). */
192 gcc_assert ((STMT_VINFO_DATA_REF (stmt_info)
193 || vectype_maybe_set_p)
194 && STMT_VINFO_VECTYPE (stmt_info) == stmt_vectype);
195 else
196 STMT_VINFO_VECTYPE (stmt_info) = stmt_vectype;
199 if (nunits_vectype)
200 vect_update_max_nunits (vf, nunits_vectype);
202 return opt_result::success ();
205 /* Subroutine of vect_determine_vectorization_factor. Set the vector
206 types of STMT_INFO and all attached pattern statements and update
207 the vectorization factor VF accordingly. Return true on success
208 or false if something prevented vectorization. */
210 static opt_result
211 vect_determine_vf_for_stmt (vec_info *vinfo,
212 stmt_vec_info stmt_info, poly_uint64 *vf)
214 if (dump_enabled_p ())
215 dump_printf_loc (MSG_NOTE, vect_location, "==> examining statement: %G",
216 stmt_info->stmt);
217 opt_result res = vect_determine_vf_for_stmt_1 (vinfo, stmt_info, false, vf);
218 if (!res)
219 return res;
221 if (STMT_VINFO_IN_PATTERN_P (stmt_info)
222 && STMT_VINFO_RELATED_STMT (stmt_info))
224 gimple *pattern_def_seq = STMT_VINFO_PATTERN_DEF_SEQ (stmt_info);
225 stmt_info = STMT_VINFO_RELATED_STMT (stmt_info);
227 /* If a pattern statement has def stmts, analyze them too. */
228 for (gimple_stmt_iterator si = gsi_start (pattern_def_seq);
229 !gsi_end_p (si); gsi_next (&si))
231 stmt_vec_info def_stmt_info = vinfo->lookup_stmt (gsi_stmt (si));
232 if (dump_enabled_p ())
233 dump_printf_loc (MSG_NOTE, vect_location,
234 "==> examining pattern def stmt: %G",
235 def_stmt_info->stmt);
236 res = vect_determine_vf_for_stmt_1 (vinfo, def_stmt_info, true, vf);
237 if (!res)
238 return res;
241 if (dump_enabled_p ())
242 dump_printf_loc (MSG_NOTE, vect_location,
243 "==> examining pattern statement: %G",
244 stmt_info->stmt);
245 res = vect_determine_vf_for_stmt_1 (vinfo, stmt_info, true, vf);
246 if (!res)
247 return res;
250 return opt_result::success ();
253 /* Function vect_determine_vectorization_factor
255 Determine the vectorization factor (VF). VF is the number of data elements
256 that are operated upon in parallel in a single iteration of the vectorized
257    loop. For example, when vectorizing a loop that operates on 4-byte elements,
258    on a target with a vector size (VS) of 16 bytes, the VF is set to 4, since 4
259 elements can fit in a single vector register.
261 We currently support vectorization of loops in which all types operated upon
262 are of the same size. Therefore this function currently sets VF according to
263 the size of the types operated upon, and fails if there are multiple sizes
264 in the loop.
266 VF is also the factor by which the loop iterations are strip-mined, e.g.:
267 original loop:
268 for (i=0; i<N; i++){
269 a[i] = b[i] + c[i];
272 vectorized loop:
273 for (i=0; i<N; i+=VF){
274 a[i:VF] = b[i:VF] + c[i:VF];
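/* A worked instance of the above, assuming a target with 16-byte vectors:
   4-byte "int" elements give VF = 16 / 4 = 4, so the loop is strip-mined
   by 4 and up to VF - 1 = 3 leftover scalar iterations are handled by an
   epilogue loop (see PEELING_FOR_NITER below); 2-byte "short" elements
   would give VF = 8.  */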
278 static opt_result
279 vect_determine_vectorization_factor (loop_vec_info loop_vinfo)
281 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
282 basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
283 unsigned nbbs = loop->num_nodes;
284 poly_uint64 vectorization_factor = 1;
285 tree scalar_type = NULL_TREE;
286 gphi *phi;
287 tree vectype;
288 stmt_vec_info stmt_info;
289 unsigned i;
291 DUMP_VECT_SCOPE ("vect_determine_vectorization_factor");
293 for (i = 0; i < nbbs; i++)
295 basic_block bb = bbs[i];
297 for (gphi_iterator si = gsi_start_phis (bb); !gsi_end_p (si);
298 gsi_next (&si))
300 phi = si.phi ();
301 stmt_info = loop_vinfo->lookup_stmt (phi);
302 if (dump_enabled_p ())
303 dump_printf_loc (MSG_NOTE, vect_location, "==> examining phi: %G",
304 phi);
306 gcc_assert (stmt_info);
308 if (STMT_VINFO_RELEVANT_P (stmt_info)
309 || STMT_VINFO_LIVE_P (stmt_info))
311 gcc_assert (!STMT_VINFO_VECTYPE (stmt_info));
312 scalar_type = TREE_TYPE (PHI_RESULT (phi));
314 if (dump_enabled_p ())
315 dump_printf_loc (MSG_NOTE, vect_location,
316 "get vectype for scalar type: %T\n",
317 scalar_type);
319 vectype = get_vectype_for_scalar_type (loop_vinfo, scalar_type);
320 if (!vectype)
321 return opt_result::failure_at (phi,
322 "not vectorized: unsupported "
323 "data-type %T\n",
324 scalar_type);
325 STMT_VINFO_VECTYPE (stmt_info) = vectype;
327 if (dump_enabled_p ())
328 dump_printf_loc (MSG_NOTE, vect_location, "vectype: %T\n",
329 vectype);
331 if (dump_enabled_p ())
333 dump_printf_loc (MSG_NOTE, vect_location, "nunits = ");
334 dump_dec (MSG_NOTE, TYPE_VECTOR_SUBPARTS (vectype));
335 dump_printf (MSG_NOTE, "\n");
338 vect_update_max_nunits (&vectorization_factor, vectype);
342 for (gimple_stmt_iterator si = gsi_start_bb (bb); !gsi_end_p (si);
343 gsi_next (&si))
345 stmt_info = loop_vinfo->lookup_stmt (gsi_stmt (si));
346 opt_result res
347 = vect_determine_vf_for_stmt (loop_vinfo,
348 stmt_info, &vectorization_factor);
349 if (!res)
350 return res;
354 /* TODO: Analyze cost. Decide if worth while to vectorize. */
355 if (dump_enabled_p ())
357 dump_printf_loc (MSG_NOTE, vect_location, "vectorization factor = ");
358 dump_dec (MSG_NOTE, vectorization_factor);
359 dump_printf (MSG_NOTE, "\n");
362 if (known_le (vectorization_factor, 1U))
363 return opt_result::failure_at (vect_location,
364 "not vectorized: unsupported data-type\n");
365 LOOP_VINFO_VECT_FACTOR (loop_vinfo) = vectorization_factor;
366 return opt_result::success ();
370 /* Function vect_is_simple_iv_evolution.
372    FORNOW: A simple evolution of an induction variable in the loop is
373 considered a polynomial evolution. */
375 static bool
376 vect_is_simple_iv_evolution (unsigned loop_nb, tree access_fn, tree * init,
377 tree * step)
379 tree init_expr;
380 tree step_expr;
381 tree evolution_part = evolution_part_in_loop_num (access_fn, loop_nb);
382 basic_block bb;
384 /* When there is no evolution in this loop, the evolution function
385 is not "simple". */
386 if (evolution_part == NULL_TREE)
387 return false;
389 /* When the evolution is a polynomial of degree >= 2
390 the evolution function is not "simple". */
391 if (tree_is_chrec (evolution_part))
392 return false;
394 step_expr = evolution_part;
395 init_expr = unshare_expr (initial_condition_in_loop_num (access_fn, loop_nb));
397 if (dump_enabled_p ())
398 dump_printf_loc (MSG_NOTE, vect_location, "step: %T, init: %T\n",
399 step_expr, init_expr);
401 *init = init_expr;
402 *step = step_expr;
404 if (TREE_CODE (step_expr) != INTEGER_CST
405 && (TREE_CODE (step_expr) != SSA_NAME
406 || ((bb = gimple_bb (SSA_NAME_DEF_STMT (step_expr)))
407 && flow_bb_inside_loop_p (get_loop (cfun, loop_nb), bb))
408 || (!INTEGRAL_TYPE_P (TREE_TYPE (step_expr))
409 && (!SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr))
410 || !flag_associative_math)))
411 && (TREE_CODE (step_expr) != REAL_CST
412 || !flag_associative_math))
414 if (dump_enabled_p ())
415 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
416 "step unknown.\n");
417 return false;
420 return true;
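/* An illustrative note on the check above: for a counted loop such as

     for (i = 0; i < n; i++)

   scev describes "i" by the chrec {0, +, 1}_loop, whose evolution part
   (the step) is the constant 1 and whose initial condition is 0, so the
   evolution is "simple".  If the step itself varied inside the loop
   (i.e. the evolution part were again a chrec), the evolution would be a
   polynomial of degree >= 2 and vect_is_simple_iv_evolution would
   return false.  */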
423 /* Return true if PHI, described by STMT_INFO, is the inner PHI in
424 what we are assuming is a double reduction. For example, given
425 a structure like this:
427 outer1:
428 x_1 = PHI <x_4(outer2), ...>;
431 inner:
432 x_2 = PHI <x_1(outer1), ...>;
434 x_3 = ...;
437 outer2:
438 x_4 = PHI <x_3(inner)>;
441 outer loop analysis would treat x_1 as a double reduction phi and
442 this function would then return true for x_2. */
444 static bool
445 vect_inner_phi_in_double_reduction_p (loop_vec_info loop_vinfo, gphi *phi)
447 use_operand_p use_p;
448 ssa_op_iter op_iter;
449 FOR_EACH_PHI_ARG (use_p, phi, op_iter, SSA_OP_USE)
450 if (stmt_vec_info def_info = loop_vinfo->lookup_def (USE_FROM_PTR (use_p)))
451 if (STMT_VINFO_DEF_TYPE (def_info) == vect_double_reduction_def)
452 return true;
453 return false;
456 /* Function vect_analyze_scalar_cycles_1.
458 Examine the cross iteration def-use cycles of scalar variables
459 in LOOP. LOOP_VINFO represents the loop that is now being
460 considered for vectorization (can be LOOP, or an outer-loop
461 enclosing LOOP). */
463 static void
464 vect_analyze_scalar_cycles_1 (loop_vec_info loop_vinfo, class loop *loop)
466 basic_block bb = loop->header;
467 tree init, step;
468 auto_vec<stmt_vec_info, 64> worklist;
469 gphi_iterator gsi;
470 bool double_reduc, reduc_chain;
472 DUMP_VECT_SCOPE ("vect_analyze_scalar_cycles");
474 /* First - identify all inductions. Reduction detection assumes that all the
475 inductions have been identified, therefore, this order must not be
476 changed. */
477 for (gsi = gsi_start_phis (bb); !gsi_end_p (gsi); gsi_next (&gsi))
479 gphi *phi = gsi.phi ();
480 tree access_fn = NULL;
481 tree def = PHI_RESULT (phi);
482 stmt_vec_info stmt_vinfo = loop_vinfo->lookup_stmt (phi);
484 if (dump_enabled_p ())
485 dump_printf_loc (MSG_NOTE, vect_location, "Analyze phi: %G", phi);
487 /* Skip virtual phi's. The data dependences that are associated with
488 virtual defs/uses (i.e., memory accesses) are analyzed elsewhere. */
489 if (virtual_operand_p (def))
490 continue;
492 STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_unknown_def_type;
494 /* Analyze the evolution function. */
495 access_fn = analyze_scalar_evolution (loop, def);
496 if (access_fn)
498 STRIP_NOPS (access_fn);
499 if (dump_enabled_p ())
500 dump_printf_loc (MSG_NOTE, vect_location,
501 "Access function of PHI: %T\n", access_fn);
502 STMT_VINFO_LOOP_PHI_EVOLUTION_BASE_UNCHANGED (stmt_vinfo)
503 = initial_condition_in_loop_num (access_fn, loop->num);
504 STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_vinfo)
505 = evolution_part_in_loop_num (access_fn, loop->num);
508 if (!access_fn
509 || vect_inner_phi_in_double_reduction_p (loop_vinfo, phi)
510 || !vect_is_simple_iv_evolution (loop->num, access_fn, &init, &step)
511 || (LOOP_VINFO_LOOP (loop_vinfo) != loop
512 && TREE_CODE (step) != INTEGER_CST))
514 worklist.safe_push (stmt_vinfo);
515 continue;
518 gcc_assert (STMT_VINFO_LOOP_PHI_EVOLUTION_BASE_UNCHANGED (stmt_vinfo)
519 != NULL_TREE);
520 gcc_assert (STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_vinfo) != NULL_TREE);
522 if (dump_enabled_p ())
523 dump_printf_loc (MSG_NOTE, vect_location, "Detected induction.\n");
524 STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_induction_def;
528 /* Second - identify all reductions and nested cycles. */
529 while (worklist.length () > 0)
531 stmt_vec_info stmt_vinfo = worklist.pop ();
532 gphi *phi = as_a <gphi *> (stmt_vinfo->stmt);
533 tree def = PHI_RESULT (phi);
535 if (dump_enabled_p ())
536 dump_printf_loc (MSG_NOTE, vect_location, "Analyze phi: %G", phi);
538 gcc_assert (!virtual_operand_p (def)
539 && STMT_VINFO_DEF_TYPE (stmt_vinfo) == vect_unknown_def_type);
541 stmt_vec_info reduc_stmt_info
542 = vect_is_simple_reduction (loop_vinfo, stmt_vinfo, &double_reduc,
543 &reduc_chain);
544 if (reduc_stmt_info)
546 STMT_VINFO_REDUC_DEF (stmt_vinfo) = reduc_stmt_info;
547 STMT_VINFO_REDUC_DEF (reduc_stmt_info) = stmt_vinfo;
548 if (double_reduc)
550 if (dump_enabled_p ())
551 dump_printf_loc (MSG_NOTE, vect_location,
552 "Detected double reduction.\n");
554 STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_double_reduction_def;
555 STMT_VINFO_DEF_TYPE (reduc_stmt_info) = vect_double_reduction_def;
557 else
559 if (loop != LOOP_VINFO_LOOP (loop_vinfo))
561 if (dump_enabled_p ())
562 dump_printf_loc (MSG_NOTE, vect_location,
563 "Detected vectorizable nested cycle.\n");
565 STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_nested_cycle;
567 else
569 if (dump_enabled_p ())
570 dump_printf_loc (MSG_NOTE, vect_location,
571 "Detected reduction.\n");
573 STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_reduction_def;
574 STMT_VINFO_DEF_TYPE (reduc_stmt_info) = vect_reduction_def;
575 /* Store the reduction cycles for possible vectorization in
576 loop-aware SLP if it was not detected as reduction
577 chain. */
578 if (! reduc_chain)
579 LOOP_VINFO_REDUCTIONS (loop_vinfo).safe_push
580 (reduc_stmt_info);
584 else
585 if (dump_enabled_p ())
586 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
587 "Unknown def-use cycle pattern.\n");
592 /* Function vect_analyze_scalar_cycles.
594 Examine the cross iteration def-use cycles of scalar variables, by
595 analyzing the loop-header PHIs of scalar variables. Classify each
596 cycle as one of the following: invariant, induction, reduction, unknown.
597    We do that for the loop represented by LOOP_VINFO, and also for its
598    inner-loop, if it exists.
599 Examples for scalar cycles:
601 Example1: reduction:
603 loop1:
604 for (i=0; i<N; i++)
605 sum += a[i];
607 Example2: induction:
609 loop2:
610 for (i=0; i<N; i++)
611 a[i] = i; */
613 static void
614 vect_analyze_scalar_cycles (loop_vec_info loop_vinfo)
616 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
618 vect_analyze_scalar_cycles_1 (loop_vinfo, loop);
620 /* When vectorizing an outer-loop, the inner-loop is executed sequentially.
621    Reductions in such an inner-loop therefore have different properties than
622 the reductions in the nest that gets vectorized:
623 1. When vectorized, they are executed in the same order as in the original
624 scalar loop, so we can't change the order of computation when
625 vectorizing them.
626 2. FIXME: Inner-loop reductions can be used in the inner-loop, so the
627 current checks are too strict. */
629 if (loop->inner)
630 vect_analyze_scalar_cycles_1 (loop_vinfo, loop->inner);
633 /* Transfer group and reduction information from STMT_INFO to its
634 pattern stmt. */
636 static void
637 vect_fixup_reduc_chain (stmt_vec_info stmt_info)
639 stmt_vec_info firstp = STMT_VINFO_RELATED_STMT (stmt_info);
640 stmt_vec_info stmtp;
641 gcc_assert (!REDUC_GROUP_FIRST_ELEMENT (firstp)
642 && REDUC_GROUP_FIRST_ELEMENT (stmt_info));
643 REDUC_GROUP_SIZE (firstp) = REDUC_GROUP_SIZE (stmt_info);
646 stmtp = STMT_VINFO_RELATED_STMT (stmt_info);
647 gcc_checking_assert (STMT_VINFO_DEF_TYPE (stmtp)
648 == STMT_VINFO_DEF_TYPE (stmt_info));
649 REDUC_GROUP_FIRST_ELEMENT (stmtp) = firstp;
650 stmt_info = REDUC_GROUP_NEXT_ELEMENT (stmt_info);
651 if (stmt_info)
652 REDUC_GROUP_NEXT_ELEMENT (stmtp)
653 = STMT_VINFO_RELATED_STMT (stmt_info);
655 while (stmt_info);
658 /* Fixup scalar cycles that now have their stmts detected as patterns. */
660 static void
661 vect_fixup_scalar_cycles_with_patterns (loop_vec_info loop_vinfo)
663 stmt_vec_info first;
664 unsigned i;
666 FOR_EACH_VEC_ELT (LOOP_VINFO_REDUCTION_CHAINS (loop_vinfo), i, first)
667 if (STMT_VINFO_IN_PATTERN_P (first))
669 stmt_vec_info next = REDUC_GROUP_NEXT_ELEMENT (first);
670 while (next)
672 if (! STMT_VINFO_IN_PATTERN_P (next)
673 || STMT_VINFO_REDUC_IDX (STMT_VINFO_RELATED_STMT (next)) == -1)
674 break;
675 next = REDUC_GROUP_NEXT_ELEMENT (next);
677    /* If not all stmts in the chain are patterns or if we failed
678    to update STMT_VINFO_REDUC_IDX, try to handle the chain
679 without patterns. */
680 if (! next
681 && STMT_VINFO_REDUC_IDX (STMT_VINFO_RELATED_STMT (first)) != -1)
683 vect_fixup_reduc_chain (first);
684 LOOP_VINFO_REDUCTION_CHAINS (loop_vinfo)[i]
685 = STMT_VINFO_RELATED_STMT (first);
690 /* Function vect_get_loop_niters.
692 Determine how many iterations the loop is executed and place it
693 in NUMBER_OF_ITERATIONS. Place the number of latch iterations
694 in NUMBER_OF_ITERATIONSM1. Place the condition under which the
695 niter information holds in ASSUMPTIONS.
697 Return the loop exit condition. */
700 static gcond *
701 vect_get_loop_niters (class loop *loop, tree *assumptions,
702 tree *number_of_iterations, tree *number_of_iterationsm1)
704 edge exit = single_exit (loop);
705 class tree_niter_desc niter_desc;
706 tree niter_assumptions, niter, may_be_zero;
707 gcond *cond = get_loop_exit_condition (loop);
709 *assumptions = boolean_true_node;
710 *number_of_iterationsm1 = chrec_dont_know;
711 *number_of_iterations = chrec_dont_know;
712 DUMP_VECT_SCOPE ("get_loop_niters");
714 if (!exit)
715 return cond;
717 may_be_zero = NULL_TREE;
718 if (!number_of_iterations_exit_assumptions (loop, exit, &niter_desc, NULL)
719 || chrec_contains_undetermined (niter_desc.niter))
720 return cond;
722 niter_assumptions = niter_desc.assumptions;
723 may_be_zero = niter_desc.may_be_zero;
724 niter = niter_desc.niter;
726 if (may_be_zero && integer_zerop (may_be_zero))
727 may_be_zero = NULL_TREE;
729 if (may_be_zero)
731 if (COMPARISON_CLASS_P (may_be_zero))
733 /* Try to combine may_be_zero with assumptions, this can simplify
734 computation of niter expression. */
735 if (niter_assumptions && !integer_nonzerop (niter_assumptions))
736 niter_assumptions = fold_build2 (TRUTH_AND_EXPR, boolean_type_node,
737 niter_assumptions,
738 fold_build1 (TRUTH_NOT_EXPR,
739 boolean_type_node,
740 may_be_zero));
741 else
742 niter = fold_build3 (COND_EXPR, TREE_TYPE (niter), may_be_zero,
743 build_int_cst (TREE_TYPE (niter), 0),
744 rewrite_to_non_trapping_overflow (niter));
746 may_be_zero = NULL_TREE;
748 else if (integer_nonzerop (may_be_zero))
750 *number_of_iterationsm1 = build_int_cst (TREE_TYPE (niter), 0);
751 *number_of_iterations = build_int_cst (TREE_TYPE (niter), 1);
752 return cond;
754 else
755 return cond;
758 *assumptions = niter_assumptions;
759 *number_of_iterationsm1 = niter;
761 /* We want the number of loop header executions which is the number
762 of latch executions plus one.
763 ??? For UINT_MAX latch executions this number overflows to zero
764 for loops like do { n++; } while (n != 0); */
765 if (niter && !chrec_contains_undetermined (niter))
766 niter = fold_build2 (PLUS_EXPR, TREE_TYPE (niter), unshare_expr (niter),
767 build_int_cst (TREE_TYPE (niter), 1));
768 *number_of_iterations = niter;
770 return cond;
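/* A small worked example of the conventions used above: for a simple
   counted loop

     for (i = 0; i < n; i++)

   with n > 0, the latch executes n - 1 times, so *NUMBER_OF_ITERATIONSM1
   is n - 1 and *NUMBER_OF_ITERATIONS (the number of header executions,
   i.e. latch executions plus one) is n.  */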
773 /* Function bb_in_loop_p
775 Used as predicate for dfs order traversal of the loop bbs. */
777 static bool
778 bb_in_loop_p (const_basic_block bb, const void *data)
780 const class loop *const loop = (const class loop *)data;
781 if (flow_bb_inside_loop_p (loop, bb))
782 return true;
783 return false;
787 /* Create and initialize a new loop_vec_info struct for LOOP_IN, as well as
788 stmt_vec_info structs for all the stmts in LOOP_IN. */
790 _loop_vec_info::_loop_vec_info (class loop *loop_in, vec_info_shared *shared)
791 : vec_info (vec_info::loop, init_cost (loop_in), shared),
792 loop (loop_in),
793 bbs (XCNEWVEC (basic_block, loop->num_nodes)),
794 num_itersm1 (NULL_TREE),
795 num_iters (NULL_TREE),
796 num_iters_unchanged (NULL_TREE),
797 num_iters_assumptions (NULL_TREE),
798 th (0),
799 versioning_threshold (0),
800 vectorization_factor (0),
801 max_vectorization_factor (0),
802 mask_skip_niters (NULL_TREE),
803 mask_compare_type (NULL_TREE),
804 simd_if_cond (NULL_TREE),
805 unaligned_dr (NULL),
806 peeling_for_alignment (0),
807 ptr_mask (0),
808 ivexpr_map (NULL),
809 scan_map (NULL),
810 slp_unrolling_factor (1),
811 single_scalar_iteration_cost (0),
812 vec_outside_cost (0),
813 vec_inside_cost (0),
814 vectorizable (false),
815 can_fully_mask_p (true),
816 fully_masked_p (false),
817 peeling_for_gaps (false),
818 peeling_for_niter (false),
819 no_data_dependencies (false),
820 has_mask_store (false),
821 scalar_loop_scaling (profile_probability::uninitialized ()),
822 scalar_loop (NULL),
823 orig_loop_info (NULL)
825 /* CHECKME: We want to visit all BBs before their successors (except for
826 latch blocks, for which this assertion wouldn't hold). In the simple
827    case of the loop forms we allow, a dfs order of the BBs would be the same
828 as reversed postorder traversal, so we are safe. */
830 unsigned int nbbs = dfs_enumerate_from (loop->header, 0, bb_in_loop_p,
831 bbs, loop->num_nodes, loop);
832 gcc_assert (nbbs == loop->num_nodes);
834 for (unsigned int i = 0; i < nbbs; i++)
836 basic_block bb = bbs[i];
837 gimple_stmt_iterator si;
839 for (si = gsi_start_phis (bb); !gsi_end_p (si); gsi_next (&si))
841 gimple *phi = gsi_stmt (si);
842 gimple_set_uid (phi, 0);
843 add_stmt (phi);
846 for (si = gsi_start_bb (bb); !gsi_end_p (si); gsi_next (&si))
848 gimple *stmt = gsi_stmt (si);
849 gimple_set_uid (stmt, 0);
850 add_stmt (stmt);
851    /* If the .GOMP_SIMD_LANE call for the current loop has 3 arguments, the
852    third argument is the #pragma omp simd if (x) condition: when it is 0,
853    the loop shouldn't be vectorized; when it is a non-zero constant, it
854    should be vectorized normally; otherwise the loop is versioned, with the
855    vectorized loop executed if the condition is non-zero at runtime. */
856 if (loop_in->simduid
857 && is_gimple_call (stmt)
858 && gimple_call_internal_p (stmt)
859 && gimple_call_internal_fn (stmt) == IFN_GOMP_SIMD_LANE
860 && gimple_call_num_args (stmt) >= 3
861 && TREE_CODE (gimple_call_arg (stmt, 0)) == SSA_NAME
862 && (loop_in->simduid
863 == SSA_NAME_VAR (gimple_call_arg (stmt, 0))))
865 tree arg = gimple_call_arg (stmt, 2);
866 if (integer_zerop (arg) || TREE_CODE (arg) == SSA_NAME)
867 simd_if_cond = arg;
868 else
869 gcc_assert (integer_nonzerop (arg));
874 epilogue_vinfos.create (6);
877 /* Free all levels of MASKS. */
879 void
880 release_vec_loop_masks (vec_loop_masks *masks)
882 rgroup_masks *rgm;
883 unsigned int i;
884 FOR_EACH_VEC_ELT (*masks, i, rgm)
885 rgm->masks.release ();
886 masks->release ();
889 /* Free all memory used by the _loop_vec_info, as well as all the
890 stmt_vec_info structs of all the stmts in the loop. */
892 _loop_vec_info::~_loop_vec_info ()
894 free (bbs);
896 release_vec_loop_masks (&masks);
897 delete ivexpr_map;
898 delete scan_map;
899 epilogue_vinfos.release ();
901 loop->aux = NULL;
904 /* Return an invariant or register for EXPR and emit necessary
905 computations in the LOOP_VINFO loop preheader. */
907 tree
908 cse_and_gimplify_to_preheader (loop_vec_info loop_vinfo, tree expr)
910 if (is_gimple_reg (expr)
911 || is_gimple_min_invariant (expr))
912 return expr;
914 if (! loop_vinfo->ivexpr_map)
915 loop_vinfo->ivexpr_map = new hash_map<tree_operand_hash, tree>;
916 tree &cached = loop_vinfo->ivexpr_map->get_or_insert (expr);
917 if (! cached)
919 gimple_seq stmts = NULL;
920 cached = force_gimple_operand (unshare_expr (expr),
921 &stmts, true, NULL_TREE);
922 if (stmts)
924 edge e = loop_preheader_edge (LOOP_VINFO_LOOP (loop_vinfo));
925 gsi_insert_seq_on_edge_immediate (e, stmts);
928 return cached;
931 /* Return true if we can use CMP_TYPE as the comparison type to produce
932 all masks required to mask LOOP_VINFO. */
934 static bool
935 can_produce_all_loop_masks_p (loop_vec_info loop_vinfo, tree cmp_type)
937 rgroup_masks *rgm;
938 unsigned int i;
939 FOR_EACH_VEC_ELT (LOOP_VINFO_MASKS (loop_vinfo), i, rgm)
940 if (rgm->mask_type != NULL_TREE
941 && !direct_internal_fn_supported_p (IFN_WHILE_ULT,
942 cmp_type, rgm->mask_type,
943 OPTIMIZE_FOR_SPEED))
944 return false;
945 return true;
948 /* Calculate the maximum number of scalars per iteration for every
949 rgroup in LOOP_VINFO. */
951 static unsigned int
952 vect_get_max_nscalars_per_iter (loop_vec_info loop_vinfo)
954 unsigned int res = 1;
955 unsigned int i;
956 rgroup_masks *rgm;
957 FOR_EACH_VEC_ELT (LOOP_VINFO_MASKS (loop_vinfo), i, rgm)
958 res = MAX (res, rgm->max_nscalars_per_iter);
959 return res;
962 /* Each statement in LOOP_VINFO can be masked where necessary. Check
963 whether we can actually generate the masks required. Return true if so,
964 storing the type of the scalar IV in LOOP_VINFO_MASK_COMPARE_TYPE. */
966 static bool
967 vect_verify_full_masking (loop_vec_info loop_vinfo)
969 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
970 unsigned int min_ni_width;
971 unsigned int max_nscalars_per_iter
972 = vect_get_max_nscalars_per_iter (loop_vinfo);
974 /* Use a normal loop if there are no statements that need masking.
975 This only happens in rare degenerate cases: it means that the loop
976 has no loads, no stores, and no live-out values. */
977 if (LOOP_VINFO_MASKS (loop_vinfo).is_empty ())
978 return false;
980 /* Get the maximum number of iterations that is representable
981 in the counter type. */
982 tree ni_type = TREE_TYPE (LOOP_VINFO_NITERSM1 (loop_vinfo));
983 widest_int max_ni = wi::to_widest (TYPE_MAX_VALUE (ni_type)) + 1;
985 /* Get a more refined estimate for the number of iterations. */
986 widest_int max_back_edges;
987 if (max_loop_iterations (loop, &max_back_edges))
988 max_ni = wi::smin (max_ni, max_back_edges + 1);
990 /* Account for rgroup masks, in which each bit is replicated N times. */
991 max_ni *= max_nscalars_per_iter;
993 /* Work out how many bits we need to represent the limit. */
994 min_ni_width = wi::min_precision (max_ni, UNSIGNED);
996 /* Find a scalar mode for which WHILE_ULT is supported. */
997 opt_scalar_int_mode cmp_mode_iter;
998 tree cmp_type = NULL_TREE;
999 tree iv_type = NULL_TREE;
1000 widest_int iv_limit = vect_iv_limit_for_full_masking (loop_vinfo);
1001 unsigned int iv_precision = UINT_MAX;
1003 if (iv_limit != -1)
1004 iv_precision = wi::min_precision (iv_limit * max_nscalars_per_iter,
1005 UNSIGNED);
1007 FOR_EACH_MODE_IN_CLASS (cmp_mode_iter, MODE_INT)
1009 unsigned int cmp_bits = GET_MODE_BITSIZE (cmp_mode_iter.require ());
1010 if (cmp_bits >= min_ni_width
1011 && targetm.scalar_mode_supported_p (cmp_mode_iter.require ()))
1013 tree this_type = build_nonstandard_integer_type (cmp_bits, true);
1014 if (this_type
1015 && can_produce_all_loop_masks_p (loop_vinfo, this_type))
1017 /* Although we could stop as soon as we find a valid mode,
1018 there are at least two reasons why that's not always the
1019 best choice:
1021 - An IV that's Pmode or wider is more likely to be reusable
1022 in address calculations than an IV that's narrower than
1023 Pmode.
1025 - Doing the comparison in IV_PRECISION or wider allows
1026 a natural 0-based IV, whereas using a narrower comparison
1027 type requires mitigations against wrap-around.
1029 Conversely, if the IV limit is variable, doing the comparison
1030 in a wider type than the original type can introduce
1031 unnecessary extensions, so picking the widest valid mode
1032 is not always a good choice either.
1034 Here we prefer the first IV type that's Pmode or wider,
1035 and the first comparison type that's IV_PRECISION or wider.
1036 (The comparison type must be no wider than the IV type,
1037 to avoid extensions in the vector loop.)
1039 ??? We might want to try continuing beyond Pmode for ILP32
1040 targets if CMP_BITS < IV_PRECISION. */
1041 iv_type = this_type;
1042 if (!cmp_type || iv_precision > TYPE_PRECISION (cmp_type))
1043 cmp_type = this_type;
1044 if (cmp_bits >= GET_MODE_BITSIZE (Pmode))
1045 break;
1050 if (!cmp_type)
1051 return false;
1053 LOOP_VINFO_MASK_COMPARE_TYPE (loop_vinfo) = cmp_type;
1054 LOOP_VINFO_MASK_IV_TYPE (loop_vinfo) = iv_type;
1055 return true;
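/* For illustration only (a sketch, not the code this pass emits): a
   fully-masked loop conceptually turns

     for (i = 0; i < n; i++)
       a[i] = b[i] + c[i];

   into a loop in which each vector iteration runs under a mask obtained
   by comparing lane indices against n (what IFN_WHILE_ULT computes), so
   no scalar epilogue is needed.  A scalar stand-in for the masked form:

     for (i = 0; i < n; i += VF)
       for (lane = 0; lane < VF; lane++)
         if (i + lane < n)
           a[i + lane] = b[i + lane] + c[i + lane];  */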
1058 /* Calculate the cost of one scalar iteration of the loop. */
1059 static void
1060 vect_compute_single_scalar_iteration_cost (loop_vec_info loop_vinfo)
1062 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1063 basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
1064 int nbbs = loop->num_nodes, factor;
1065 int innerloop_iters, i;
1067 DUMP_VECT_SCOPE ("vect_compute_single_scalar_iteration_cost");
1069 /* Gather costs for statements in the scalar loop. */
1071 /* FORNOW. */
1072 innerloop_iters = 1;
1073 if (loop->inner)
1074 innerloop_iters = 50; /* FIXME */
1076 for (i = 0; i < nbbs; i++)
1078 gimple_stmt_iterator si;
1079 basic_block bb = bbs[i];
1081 if (bb->loop_father == loop->inner)
1082 factor = innerloop_iters;
1083 else
1084 factor = 1;
1086 for (si = gsi_start_bb (bb); !gsi_end_p (si); gsi_next (&si))
1088 gimple *stmt = gsi_stmt (si);
1089 stmt_vec_info stmt_info = loop_vinfo->lookup_stmt (stmt);
1091 if (!is_gimple_assign (stmt) && !is_gimple_call (stmt))
1092 continue;
1094 /* Skip stmts that are not vectorized inside the loop. */
1095 stmt_vec_info vstmt_info = vect_stmt_to_vectorize (stmt_info);
1096 if (!STMT_VINFO_RELEVANT_P (vstmt_info)
1097 && (!STMT_VINFO_LIVE_P (vstmt_info)
1098 || !VECTORIZABLE_CYCLE_DEF
1099 (STMT_VINFO_DEF_TYPE (vstmt_info))))
1100 continue;
1102 vect_cost_for_stmt kind;
1103 if (STMT_VINFO_DATA_REF (stmt_info))
1105 if (DR_IS_READ (STMT_VINFO_DATA_REF (stmt_info)))
1106 kind = scalar_load;
1107 else
1108 kind = scalar_store;
1110 else if (vect_nop_conversion_p (stmt_info))
1111 continue;
1112 else
1113 kind = scalar_stmt;
1115 record_stmt_cost (&LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo),
1116 factor, kind, stmt_info, 0, vect_prologue);
1120 /* Now accumulate cost. */
1121 void *target_cost_data = init_cost (loop);
1122 stmt_info_for_cost *si;
1123 int j;
1124 FOR_EACH_VEC_ELT (LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo),
1125 j, si)
1126 (void) add_stmt_cost (loop_vinfo, target_cost_data, si->count,
1127 si->kind, si->stmt_info, si->misalign,
1128 vect_body);
1129 unsigned dummy, body_cost = 0;
1130 finish_cost (target_cost_data, &dummy, &body_cost, &dummy);
1131 destroy_cost_data (target_cost_data);
1132 LOOP_VINFO_SINGLE_SCALAR_ITERATION_COST (loop_vinfo) = body_cost;
1136 /* Function vect_analyze_loop_form_1.
1138 Verify that certain CFG restrictions hold, including:
1139 - the loop has a pre-header
1140 - the loop has a single entry and exit
1141 - the loop exit condition is simple enough
1142    - the number of iterations can be analyzed, i.e., a countable loop. The
1143 niter could be analyzed under some assumptions. */
1145 opt_result
1146 vect_analyze_loop_form_1 (class loop *loop, gcond **loop_cond,
1147 tree *assumptions, tree *number_of_iterationsm1,
1148 tree *number_of_iterations, gcond **inner_loop_cond)
1150 DUMP_VECT_SCOPE ("vect_analyze_loop_form");
1152 /* Different restrictions apply when we are considering an inner-most loop,
1153 vs. an outer (nested) loop.
1154 (FORNOW. May want to relax some of these restrictions in the future). */
1156 if (!loop->inner)
1158 /* Inner-most loop. We currently require that the number of BBs is
1159 exactly 2 (the header and latch). Vectorizable inner-most loops
1160 look like this:
1162 (pre-header)
1164 header <--------+
1165 | | |
1166 | +--> latch --+
1168 (exit-bb) */
1170 if (loop->num_nodes != 2)
1171 return opt_result::failure_at (vect_location,
1172 "not vectorized:"
1173 " control flow in loop.\n");
1175 if (empty_block_p (loop->header))
1176 return opt_result::failure_at (vect_location,
1177 "not vectorized: empty loop.\n");
1179 else
1181 class loop *innerloop = loop->inner;
1182 edge entryedge;
1184 /* Nested loop. We currently require that the loop is doubly-nested,
1185 contains a single inner loop, and the number of BBs is exactly 5.
1186 Vectorizable outer-loops look like this:
1188 (pre-header)
1190 header <---+
1192 inner-loop |
1194 tail ------+
1196 (exit-bb)
1198 The inner-loop has the properties expected of inner-most loops
1199 as described above. */
1201 if ((loop->inner)->inner || (loop->inner)->next)
1202 return opt_result::failure_at (vect_location,
1203 "not vectorized:"
1204 " multiple nested loops.\n");
1206 if (loop->num_nodes != 5)
1207 return opt_result::failure_at (vect_location,
1208 "not vectorized:"
1209 " control flow in loop.\n");
1211 entryedge = loop_preheader_edge (innerloop);
1212 if (entryedge->src != loop->header
1213 || !single_exit (innerloop)
1214 || single_exit (innerloop)->dest != EDGE_PRED (loop->latch, 0)->src)
1215 return opt_result::failure_at (vect_location,
1216 "not vectorized:"
1217 " unsupported outerloop form.\n");
1219 /* Analyze the inner-loop. */
1220 tree inner_niterm1, inner_niter, inner_assumptions;
1221 opt_result res
1222 = vect_analyze_loop_form_1 (loop->inner, inner_loop_cond,
1223 &inner_assumptions, &inner_niterm1,
1224 &inner_niter, NULL);
1225 if (!res)
1227 if (dump_enabled_p ())
1228 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1229 "not vectorized: Bad inner loop.\n");
1230 return res;
1233 /* Don't support analyzing niter under assumptions for inner
1234 loop. */
1235 if (!integer_onep (inner_assumptions))
1236 return opt_result::failure_at (vect_location,
1237 "not vectorized: Bad inner loop.\n");
1239 if (!expr_invariant_in_loop_p (loop, inner_niter))
1240 return opt_result::failure_at (vect_location,
1241 "not vectorized: inner-loop count not"
1242 " invariant.\n");
1244 if (dump_enabled_p ())
1245 dump_printf_loc (MSG_NOTE, vect_location,
1246 "Considering outer-loop vectorization.\n");
1249 if (!single_exit (loop))
1250 return opt_result::failure_at (vect_location,
1251 "not vectorized: multiple exits.\n");
1252 if (EDGE_COUNT (loop->header->preds) != 2)
1253 return opt_result::failure_at (vect_location,
1254 "not vectorized:"
1255 " too many incoming edges.\n");
1257    /* We assume that the loop exit condition is at the end of the loop, i.e.,
1258 that the loop is represented as a do-while (with a proper if-guard
1259 before the loop if needed), where the loop header contains all the
1260 executable statements, and the latch is empty. */
1261 if (!empty_block_p (loop->latch)
1262 || !gimple_seq_empty_p (phi_nodes (loop->latch)))
1263 return opt_result::failure_at (vect_location,
1264 "not vectorized: latch block not empty.\n");
1266 /* Make sure the exit is not abnormal. */
1267 edge e = single_exit (loop);
1268 if (e->flags & EDGE_ABNORMAL)
1269 return opt_result::failure_at (vect_location,
1270 "not vectorized:"
1271 " abnormal loop exit edge.\n");
1273 *loop_cond = vect_get_loop_niters (loop, assumptions, number_of_iterations,
1274 number_of_iterationsm1);
1275 if (!*loop_cond)
1276 return opt_result::failure_at
1277 (vect_location,
1278 "not vectorized: complicated exit condition.\n");
1280 if (integer_zerop (*assumptions)
1281 || !*number_of_iterations
1282 || chrec_contains_undetermined (*number_of_iterations))
1283 return opt_result::failure_at
1284 (*loop_cond,
1285 "not vectorized: number of iterations cannot be computed.\n");
1287 if (integer_zerop (*number_of_iterations))
1288 return opt_result::failure_at
1289 (*loop_cond,
1290 "not vectorized: number of iterations = 0.\n");
1292 return opt_result::success ();
1295 /* Analyze LOOP form and return a loop_vec_info if it is of suitable form. */
1297 opt_loop_vec_info
1298 vect_analyze_loop_form (class loop *loop, vec_info_shared *shared)
1300 tree assumptions, number_of_iterations, number_of_iterationsm1;
1301 gcond *loop_cond, *inner_loop_cond = NULL;
1303 opt_result res
1304 = vect_analyze_loop_form_1 (loop, &loop_cond,
1305 &assumptions, &number_of_iterationsm1,
1306 &number_of_iterations, &inner_loop_cond);
1307 if (!res)
1308 return opt_loop_vec_info::propagate_failure (res);
1310 loop_vec_info loop_vinfo = new _loop_vec_info (loop, shared);
1311 LOOP_VINFO_NITERSM1 (loop_vinfo) = number_of_iterationsm1;
1312 LOOP_VINFO_NITERS (loop_vinfo) = number_of_iterations;
1313 LOOP_VINFO_NITERS_UNCHANGED (loop_vinfo) = number_of_iterations;
1314 if (!integer_onep (assumptions))
1316    /* We consider vectorizing this loop by versioning it under
1317 some assumptions. In order to do this, we need to clear
1318 existing information computed by scev and niter analyzer. */
1319 scev_reset_htab ();
1320 free_numbers_of_iterations_estimates (loop);
1321 /* Also set flag for this loop so that following scev and niter
1322 analysis are done under the assumptions. */
1323 loop_constraint_set (loop, LOOP_C_FINITE);
1324 /* Also record the assumptions for versioning. */
1325 LOOP_VINFO_NITERS_ASSUMPTIONS (loop_vinfo) = assumptions;
1328 if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
1330 if (dump_enabled_p ())
1332 dump_printf_loc (MSG_NOTE, vect_location,
1333 "Symbolic number of iterations is ");
1334 dump_generic_expr (MSG_NOTE, TDF_DETAILS, number_of_iterations);
1335 dump_printf (MSG_NOTE, "\n");
1339 stmt_vec_info loop_cond_info = loop_vinfo->lookup_stmt (loop_cond);
1340 STMT_VINFO_TYPE (loop_cond_info) = loop_exit_ctrl_vec_info_type;
1341 if (inner_loop_cond)
1343 stmt_vec_info inner_loop_cond_info
1344 = loop_vinfo->lookup_stmt (inner_loop_cond);
1345 STMT_VINFO_TYPE (inner_loop_cond_info) = loop_exit_ctrl_vec_info_type;
1348 gcc_assert (!loop->aux);
1349 loop->aux = loop_vinfo;
1350 return opt_loop_vec_info::success (loop_vinfo);
1355 /* Scan the loop stmts and dependent on whether there are any (non-)SLP
1356 statements update the vectorization factor. */
1358 static void
1359 vect_update_vf_for_slp (loop_vec_info loop_vinfo)
1361 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1362 basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
1363 int nbbs = loop->num_nodes;
1364 poly_uint64 vectorization_factor;
1365 int i;
1367 DUMP_VECT_SCOPE ("vect_update_vf_for_slp");
1369 vectorization_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
1370 gcc_assert (known_ne (vectorization_factor, 0U));
1372 /* If all the stmts in the loop can be SLPed, we perform only SLP, and
1373 vectorization factor of the loop is the unrolling factor required by
1374    the SLP instances. If that unrolling factor is 1, we say that we
1375    perform pure SLP on the loop; cross-iteration parallelism is not
1376    exploited.
1377 bool only_slp_in_loop = true;
1378 for (i = 0; i < nbbs; i++)
1380 basic_block bb = bbs[i];
1381 for (gphi_iterator si = gsi_start_phis (bb); !gsi_end_p (si);
1382 gsi_next (&si))
1384 stmt_vec_info stmt_info = loop_vinfo->lookup_stmt (si.phi ());
1385 if (!stmt_info)
1386 continue;
1387 if ((STMT_VINFO_RELEVANT_P (stmt_info)
1388 || VECTORIZABLE_CYCLE_DEF (STMT_VINFO_DEF_TYPE (stmt_info)))
1389 && !PURE_SLP_STMT (stmt_info))
1390 /* STMT needs both SLP and loop-based vectorization. */
1391 only_slp_in_loop = false;
1393 for (gimple_stmt_iterator si = gsi_start_bb (bb); !gsi_end_p (si);
1394 gsi_next (&si))
1396 stmt_vec_info stmt_info = loop_vinfo->lookup_stmt (gsi_stmt (si));
1397 stmt_info = vect_stmt_to_vectorize (stmt_info);
1398 if ((STMT_VINFO_RELEVANT_P (stmt_info)
1399 || VECTORIZABLE_CYCLE_DEF (STMT_VINFO_DEF_TYPE (stmt_info)))
1400 && !PURE_SLP_STMT (stmt_info))
1401 /* STMT needs both SLP and loop-based vectorization. */
1402 only_slp_in_loop = false;
1406 if (only_slp_in_loop)
1408 if (dump_enabled_p ())
1409 dump_printf_loc (MSG_NOTE, vect_location,
1410 "Loop contains only SLP stmts\n");
1411 vectorization_factor = LOOP_VINFO_SLP_UNROLLING_FACTOR (loop_vinfo);
1413 else
1415 if (dump_enabled_p ())
1416 dump_printf_loc (MSG_NOTE, vect_location,
1417 "Loop contains SLP and non-SLP stmts\n");
1418 /* Both the vectorization factor and unroll factor have the form
1419 GET_MODE_SIZE (loop_vinfo->vector_mode) * X for some rational X,
1420 so they must have a common multiple. */
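      /* For example (assuming the constant case): a vectorization factor
	 of 4 and an SLP unrolling factor of 2 give 4, whereas 4 and 3
	 would give 12, i.e. the least common multiple.  */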
1421 vectorization_factor
1422 = force_common_multiple (vectorization_factor,
1423 LOOP_VINFO_SLP_UNROLLING_FACTOR (loop_vinfo));
1426 LOOP_VINFO_VECT_FACTOR (loop_vinfo) = vectorization_factor;
1427 if (dump_enabled_p ())
1429 dump_printf_loc (MSG_NOTE, vect_location,
1430 "Updating vectorization factor to ");
1431 dump_dec (MSG_NOTE, vectorization_factor);
1432 dump_printf (MSG_NOTE, ".\n");
1436 /* Return true if STMT_INFO describes a double reduction phi and if
1437 the other phi in the reduction is also relevant for vectorization.
1438 This rejects cases such as:
1440 outer1:
1441 x_1 = PHI <x_3(outer2), ...>;
1444 inner:
1445 x_2 = ...;
1448 outer2:
1449 x_3 = PHI <x_2(inner)>;
1451 if nothing in x_2 or elsewhere makes x_1 relevant. */
1453 static bool
1454 vect_active_double_reduction_p (stmt_vec_info stmt_info)
1456 if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_double_reduction_def)
1457 return false;
1459 return STMT_VINFO_RELEVANT_P (STMT_VINFO_REDUC_DEF (stmt_info));
1462 /* Function vect_analyze_loop_operations.
1464 Scan the loop stmts and make sure they are all vectorizable. */
1466 static opt_result
1467 vect_analyze_loop_operations (loop_vec_info loop_vinfo)
1469 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1470 basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
1471 int nbbs = loop->num_nodes;
1472 int i;
1473 stmt_vec_info stmt_info;
1474 bool need_to_vectorize = false;
1475 bool ok;
1477 DUMP_VECT_SCOPE ("vect_analyze_loop_operations");
1479 auto_vec<stmt_info_for_cost> cost_vec;
1481 for (i = 0; i < nbbs; i++)
1483 basic_block bb = bbs[i];
1485 for (gphi_iterator si = gsi_start_phis (bb); !gsi_end_p (si);
1486 gsi_next (&si))
1488 gphi *phi = si.phi ();
1489 ok = true;
1491 stmt_info = loop_vinfo->lookup_stmt (phi);
1492 if (dump_enabled_p ())
1493 dump_printf_loc (MSG_NOTE, vect_location, "examining phi: %G", phi);
1494 if (virtual_operand_p (gimple_phi_result (phi)))
1495 continue;
1497 /* Inner-loop loop-closed exit phi in outer-loop vectorization
1498 (i.e., a phi in the tail of the outer-loop). */
1499 if (! is_loop_header_bb_p (bb))
1501 /* FORNOW: we currently don't support the case that these phis
1502 are not used in the outerloop (unless it is double reduction,
1503 i.e., this phi is vect_reduction_def), cause this case
1504 requires to actually do something here. */
1505 if (STMT_VINFO_LIVE_P (stmt_info)
1506 && !vect_active_double_reduction_p (stmt_info))
1507 return opt_result::failure_at (phi,
1508 "Unsupported loop-closed phi"
1509 " in outer-loop.\n");
1511 /* If PHI is used in the outer loop, we check that its operand
1512 is defined in the inner loop. */
1513 if (STMT_VINFO_RELEVANT_P (stmt_info))
1515 tree phi_op;
1517 if (gimple_phi_num_args (phi) != 1)
1518 return opt_result::failure_at (phi, "unsupported phi");
1520 phi_op = PHI_ARG_DEF (phi, 0);
1521 stmt_vec_info op_def_info = loop_vinfo->lookup_def (phi_op);
1522 if (!op_def_info)
1523 return opt_result::failure_at (phi, "unsupported phi\n");
1525 if (STMT_VINFO_RELEVANT (op_def_info) != vect_used_in_outer
1526 && (STMT_VINFO_RELEVANT (op_def_info)
1527 != vect_used_in_outer_by_reduction))
1528 return opt_result::failure_at (phi, "unsupported phi\n");
1530 if ((STMT_VINFO_DEF_TYPE (stmt_info) == vect_internal_def
1531 || (STMT_VINFO_DEF_TYPE (stmt_info)
1532 == vect_double_reduction_def))
1533 && !vectorizable_lc_phi (loop_vinfo,
1534 stmt_info, NULL, NULL))
1535 return opt_result::failure_at (phi, "unsupported phi\n");
1538 continue;
1541 gcc_assert (stmt_info);
1543 if ((STMT_VINFO_RELEVANT (stmt_info) == vect_used_in_scope
1544 || STMT_VINFO_LIVE_P (stmt_info))
1545 && STMT_VINFO_DEF_TYPE (stmt_info) != vect_induction_def)
1546 /* A scalar-dependence cycle that we don't support. */
1547 return opt_result::failure_at (phi,
1548 "not vectorized:"
1549 " scalar dependence cycle.\n");
1551 if (STMT_VINFO_RELEVANT_P (stmt_info))
1553 need_to_vectorize = true;
1554 if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_induction_def
1555 && ! PURE_SLP_STMT (stmt_info))
1556 ok = vectorizable_induction (loop_vinfo,
1557 stmt_info, NULL, NULL, NULL,
1558 &cost_vec);
1559 else if ((STMT_VINFO_DEF_TYPE (stmt_info) == vect_reduction_def
1560 || (STMT_VINFO_DEF_TYPE (stmt_info)
1561 == vect_double_reduction_def)
1562 || STMT_VINFO_DEF_TYPE (stmt_info) == vect_nested_cycle)
1563 && ! PURE_SLP_STMT (stmt_info))
1564 ok = vectorizable_reduction (loop_vinfo,
1565 stmt_info, NULL, NULL, &cost_vec);
1568 /* SLP PHIs are tested by vect_slp_analyze_node_operations. */
1569 if (ok
1570 && STMT_VINFO_LIVE_P (stmt_info)
1571 && !PURE_SLP_STMT (stmt_info))
1572 ok = vectorizable_live_operation (loop_vinfo,
1573 stmt_info, NULL, NULL, NULL,
1574 -1, false, &cost_vec);
1576 if (!ok)
1577 return opt_result::failure_at (phi,
1578 "not vectorized: relevant phi not "
1579 "supported: %G",
1580 static_cast <gimple *> (phi));
1583 for (gimple_stmt_iterator si = gsi_start_bb (bb); !gsi_end_p (si);
1584 gsi_next (&si))
1586 gimple *stmt = gsi_stmt (si);
1587 if (!gimple_clobber_p (stmt))
1589 opt_result res
1590 = vect_analyze_stmt (loop_vinfo,
1591 loop_vinfo->lookup_stmt (stmt),
1592 &need_to_vectorize,
1593 NULL, NULL, &cost_vec);
1594 if (!res)
1595 return res;
1598 } /* bbs */
1600 add_stmt_costs (loop_vinfo, loop_vinfo->target_cost_data, &cost_vec);
1602 /* All operations in the loop are either irrelevant (deal with loop
1603 control, or dead), or only used outside the loop and can be moved
1604 out of the loop (e.g. invariants, inductions). The loop can be
1605 optimized away by scalar optimizations. We're better off not
1606 touching this loop. */
1607 if (!need_to_vectorize)
1609 if (dump_enabled_p ())
1610 dump_printf_loc (MSG_NOTE, vect_location,
1611 "All the computation can be taken out of the loop.\n");
1612 return opt_result::failure_at
1613 (vect_location,
1614 "not vectorized: redundant loop. no profit to vectorize.\n");
1617 return opt_result::success ();
1620 /* Analyze the cost of the loop described by LOOP_VINFO. Decide if it
1621 is worthwhile to vectorize. Return 1 if definitely yes, 0 if
1622 definitely no, or -1 if it's worth retrying. */
1624 static int
1625 vect_analyze_loop_costing (loop_vec_info loop_vinfo)
1627 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1628 unsigned int assumed_vf = vect_vf_for_cost (loop_vinfo);
1630 /* Only fully-masked loops can have iteration counts less than the
1631 vectorization factor. */
1632 if (!LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
1634 HOST_WIDE_INT max_niter;
1636 if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
1637 max_niter = LOOP_VINFO_INT_NITERS (loop_vinfo);
1638 else
1639 max_niter = max_stmt_executions_int (loop);
1641 if (max_niter != -1
1642 && (unsigned HOST_WIDE_INT) max_niter < assumed_vf)
1644 if (dump_enabled_p ())
1645 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1646 "not vectorized: iteration count smaller than "
1647 "vectorization factor.\n");
1648 return 0;
1652 int min_profitable_iters, min_profitable_estimate;
1653 vect_estimate_min_profitable_iters (loop_vinfo, &min_profitable_iters,
1654 &min_profitable_estimate);
1656 if (min_profitable_iters < 0)
1658 if (dump_enabled_p ())
1659 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1660 "not vectorized: vectorization not profitable.\n");
1661 if (dump_enabled_p ())
1662 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1663 "not vectorized: vector version will never be "
1664 "profitable.\n");
1665 return -1;
1668 int min_scalar_loop_bound = (param_min_vect_loop_bound
1669 * assumed_vf);
1671 /* Use the cost model only if it is more conservative than user specified
1672 threshold. */
1673 unsigned int th = (unsigned) MAX (min_scalar_loop_bound,
1674 min_profitable_iters);
1676 LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo) = th;
1678 if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
1679 && LOOP_VINFO_INT_NITERS (loop_vinfo) < th)
1681 if (dump_enabled_p ())
1682 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1683 "not vectorized: vectorization not profitable.\n");
1684 if (dump_enabled_p ())
1685 dump_printf_loc (MSG_NOTE, vect_location,
1686 "not vectorized: iteration count smaller than user "
1687 "specified loop bound parameter or minimum profitable "
1688 "iterations (whichever is more conservative).\n");
1689 return 0;
1692    /* The static profitability threshold min_profitable_estimate includes
1693 the cost of having to check at runtime whether the scalar loop
1694 should be used instead. If it turns out that we don't need or want
1695 such a check, the threshold we should use for the static estimate
1696 is simply the point at which the vector loop becomes more profitable
1697 than the scalar loop. */
1698 if (min_profitable_estimate > min_profitable_iters
1699 && !LOOP_REQUIRES_VERSIONING (loop_vinfo)
1700 && !LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo)
1701 && !LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo)
1702 && !vect_apply_runtime_profitability_check_p (loop_vinfo))
1704 if (dump_enabled_p ())
1705 dump_printf_loc (MSG_NOTE, vect_location, "no need for a runtime"
1706 " choice between the scalar and vector loops\n");
1707 min_profitable_estimate = min_profitable_iters;
1710 HOST_WIDE_INT estimated_niter;
1712 /* If we are vectorizing an epilogue then we know the maximum number of
1713 scalar iterations it will cover is at least one lower than the
1714 vectorization factor of the main loop. */
1715 if (LOOP_VINFO_EPILOGUE_P (loop_vinfo))
1716 estimated_niter
1717 = vect_vf_for_cost (LOOP_VINFO_ORIG_LOOP_INFO (loop_vinfo)) - 1;
1718 else
1720 estimated_niter = estimated_stmt_executions_int (loop);
1721 if (estimated_niter == -1)
1722 estimated_niter = likely_max_stmt_executions_int (loop);
1724 if (estimated_niter != -1
1725 && ((unsigned HOST_WIDE_INT) estimated_niter
1726 < MAX (th, (unsigned) min_profitable_estimate)))
1728 if (dump_enabled_p ())
1729 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1730 "not vectorized: estimated iteration count too "
1731 "small.\n");
1732 if (dump_enabled_p ())
1733 dump_printf_loc (MSG_NOTE, vect_location,
1734 "not vectorized: estimated iteration count smaller "
1735 "than specified loop bound parameter or minimum "
1736 "profitable iterations (whichever is more "
1737 "conservative).\n");
1738 return -1;
1741 return 1;
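/* An illustrative reading of the threshold logic above (the numbers are
   made up): with an assumed VF of 4, the (default) param_min_vect_loop_bound
   of 0 and a cost model answer of min_profitable_iters = 12, the threshold
   th is MAX (0 * 4, 12) = 12, so a loop whose iteration count is known to
   be 10 is rejected as not profitable, while a loop with an unknown count
   is judged by its estimated iteration count against
   min_profitable_estimate.  */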
1744 static opt_result
1745 vect_get_datarefs_in_loop (loop_p loop, basic_block *bbs,
1746 vec<data_reference_p> *datarefs,
1747 unsigned int *n_stmts)
1749 *n_stmts = 0;
1750 for (unsigned i = 0; i < loop->num_nodes; i++)
1751 for (gimple_stmt_iterator gsi = gsi_start_bb (bbs[i]);
1752 !gsi_end_p (gsi); gsi_next (&gsi))
1754 gimple *stmt = gsi_stmt (gsi);
1755 if (is_gimple_debug (stmt))
1756 continue;
1757 ++(*n_stmts);
1758 opt_result res = vect_find_stmt_data_reference (loop, stmt, datarefs);
1759 if (!res)
1761 if (is_gimple_call (stmt) && loop->safelen)
1763 tree fndecl = gimple_call_fndecl (stmt), op;
1764 if (fndecl != NULL_TREE)
1766 cgraph_node *node = cgraph_node::get (fndecl);
1767 if (node != NULL && node->simd_clones != NULL)
1769 unsigned int j, n = gimple_call_num_args (stmt);
1770 for (j = 0; j < n; j++)
1772 op = gimple_call_arg (stmt, j);
1773 if (DECL_P (op)
1774 || (REFERENCE_CLASS_P (op)
1775 && get_base_address (op)))
1776 break;
1778 op = gimple_call_lhs (stmt);
1779 /* Ignore #pragma omp declare simd functions
1780 if they don't have data references in the
1781 call stmt itself. */
1782 if (j == n
1783 && !(op
1784 && (DECL_P (op)
1785 || (REFERENCE_CLASS_P (op)
1786 && get_base_address (op)))))
1787 continue;
1791 return res;
1793 /* If dependence analysis will give up due to the limit on the
1794 number of datarefs stop here and fail fatally. */
1795 if (datarefs->length ()
1796 > (unsigned)param_loop_max_datarefs_for_datadeps)
1797 return opt_result::failure_at (stmt, "exceeded param "
1798 "loop-max-datarefs-for-datadeps\n");
1800 return opt_result::success ();
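/* Illustrative sketch of the simd-clone special case above (the pragma,
   function and loop below are hypothetical, not from this file): given

     #pragma omp declare simd
     extern int f (int);

     #pragma omp simd
     for (int i = 0; i < n; i++)
       a[i] = f (b[i]);

   dataref analysis fails on the call to f, but because the loop carries a
   safelen and f has SIMD clones while neither the call's arguments nor its
   lhs are memory references, the failure is ignored and only the accesses
   to a[] and b[] are recorded as data references.  */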
1803 /* Look for SLP-only access groups and turn each individual access into its own
1804 group. */
1805 static void
1806 vect_dissolve_slp_only_groups (loop_vec_info loop_vinfo)
1808 unsigned int i;
1809 struct data_reference *dr;
1811 DUMP_VECT_SCOPE ("vect_dissolve_slp_only_groups");
1813 vec<data_reference_p> datarefs = loop_vinfo->shared->datarefs;
1814 FOR_EACH_VEC_ELT (datarefs, i, dr)
1816 gcc_assert (DR_REF (dr));
1817 stmt_vec_info stmt_info = loop_vinfo->lookup_stmt (DR_STMT (dr));
1819 /* Check if the access is part of an interleaving chain. */
1820 if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
1822 stmt_vec_info first_element = DR_GROUP_FIRST_ELEMENT (stmt_info);
1823 unsigned int group_size = DR_GROUP_SIZE (first_element);
1825 /* Check for an SLP-only group. */
1826 if (!STMT_SLP_TYPE (stmt_info)
1827 && STMT_VINFO_SLP_VECT_ONLY (first_element))
1829 /* Dissolve the group. */
1830 STMT_VINFO_SLP_VECT_ONLY (first_element) = false;
1832 stmt_vec_info vinfo = first_element;
1833 while (vinfo)
1835 stmt_vec_info next = DR_GROUP_NEXT_ELEMENT (vinfo);
1836 DR_GROUP_FIRST_ELEMENT (vinfo) = vinfo;
1837 DR_GROUP_NEXT_ELEMENT (vinfo) = NULL;
1838 DR_GROUP_SIZE (vinfo) = 1;
1839 if (STMT_VINFO_STRIDED_P (first_element))
1840 DR_GROUP_GAP (vinfo) = 0;
1841 else
1842 DR_GROUP_GAP (vinfo) = group_size - 1;
1843 vinfo = next;
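/* Illustrative sketch of the dissolution above (hypothetical group): a
   two-element interleaved load group for

     x = a[2*i];
     y = a[2*i+1];

   that was marked SLP-only but is not being SLP-vectorized is split into
   two singleton groups: each statement becomes its own
   DR_GROUP_FIRST_ELEMENT with DR_GROUP_SIZE 1, and since the access is not
   strided each gets DR_GROUP_GAP 1 (group_size - 1) to account for the
   element skipped between consecutive iterations.  */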
1851 /* Decides whether we need to create an epilogue loop to handle
1852 remaining scalar iterations and sets PEELING_FOR_NITERS accordingly. */
1854 void
1855 determine_peel_for_niter (loop_vec_info loop_vinfo)
1857 LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo) = false;
1859 unsigned HOST_WIDE_INT const_vf;
1860 HOST_WIDE_INT max_niter
1861 = likely_max_stmt_executions_int (LOOP_VINFO_LOOP (loop_vinfo));
1863 unsigned th = LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo);
1864 if (!th && LOOP_VINFO_ORIG_LOOP_INFO (loop_vinfo))
1865 th = LOOP_VINFO_COST_MODEL_THRESHOLD (LOOP_VINFO_ORIG_LOOP_INFO
1866 (loop_vinfo));
1868 if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
1869 /* The main loop handles all iterations. */
1870 LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo) = false;
1871 else if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
1872 && LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) >= 0)
1874 /* Work out the (constant) number of iterations that need to be
1875 peeled for reasons other than niters. */
1876 unsigned int peel_niter = LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo);
1877 if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo))
1878 peel_niter += 1;
1879 if (!multiple_p (LOOP_VINFO_INT_NITERS (loop_vinfo) - peel_niter,
1880 LOOP_VINFO_VECT_FACTOR (loop_vinfo)))
1881 LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo) = true;
1883 else if (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo)
1884 /* ??? When peeling for gaps but not alignment, we could
1885 try to check whether the (variable) niters is known to be
1886 VF * N + 1. That's something of a niche case though. */
1887 || LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo)
1888 || !LOOP_VINFO_VECT_FACTOR (loop_vinfo).is_constant (&const_vf)
1889 || ((tree_ctz (LOOP_VINFO_NITERS (loop_vinfo))
1890 < (unsigned) exact_log2 (const_vf))
1891 /* In case of versioning, check if the maximum number of
1892 iterations is greater than th. If they are identical,
1893 the epilogue is unnecessary. */
1894 && (!LOOP_REQUIRES_VERSIONING (loop_vinfo)
1895 || ((unsigned HOST_WIDE_INT) max_niter
1896 > (th / const_vf) * const_vf))))
1897 LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo) = true;
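/* Worked example with hypothetical numbers: for known niters = 100,
   VF = 8, no peeling for alignment and peeling for gaps requested,
   peel_niter = 1 and (100 - 1) % 8 != 0, so PEELING_FOR_NITER is set and
   an epilogue loop will handle the remainder.  With niters = 97 instead,
   (97 - 1) % 8 == 0 and no epilogue is needed for the iteration count.  */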
1901 /* Function vect_analyze_loop_2.
1903 Apply a set of analyses on LOOP, and create a loop_vec_info struct
1904 for it. The different analyses will record information in the
1905 loop_vec_info struct. */
1906 static opt_result
1907 vect_analyze_loop_2 (loop_vec_info loop_vinfo, bool &fatal, unsigned *n_stmts)
1909 opt_result ok = opt_result::success ();
1910 int res;
1911 unsigned int max_vf = MAX_VECTORIZATION_FACTOR;
1912 poly_uint64 min_vf = 2;
1913 loop_vec_info orig_loop_vinfo = NULL;
1915 /* If we are dealing with an epilogue then orig_loop_vinfo points to the
1916 loop_vec_info of the first vectorized loop. */
1917 if (LOOP_VINFO_EPILOGUE_P (loop_vinfo))
1918 orig_loop_vinfo = LOOP_VINFO_ORIG_LOOP_INFO (loop_vinfo);
1919 else
1920 orig_loop_vinfo = loop_vinfo;
1921 gcc_assert (orig_loop_vinfo);
1923 /* The first group of checks is independent of the vector size. */
1924 fatal = true;
1926 if (LOOP_VINFO_SIMD_IF_COND (loop_vinfo)
1927 && integer_zerop (LOOP_VINFO_SIMD_IF_COND (loop_vinfo)))
1928 return opt_result::failure_at (vect_location,
1929 "not vectorized: simd if(0)\n");
1931 /* Find all data references in the loop (which correspond to vdefs/vuses)
1932 and analyze their evolution in the loop. */
1934 loop_p loop = LOOP_VINFO_LOOP (loop_vinfo);
1936 /* Gather the data references and count stmts in the loop. */
1937 if (!LOOP_VINFO_DATAREFS (loop_vinfo).exists ())
1939 opt_result res
1940 = vect_get_datarefs_in_loop (loop, LOOP_VINFO_BBS (loop_vinfo),
1941 &LOOP_VINFO_DATAREFS (loop_vinfo),
1942 n_stmts);
1943 if (!res)
1945 if (dump_enabled_p ())
1946 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1947 "not vectorized: loop contains function "
1948 "calls or data references that cannot "
1949 "be analyzed\n");
1950 return res;
1952 loop_vinfo->shared->save_datarefs ();
1954 else
1955 loop_vinfo->shared->check_datarefs ();
1957 /* Analyze the data references and also adjust the minimal
1958 vectorization factor according to the loads and stores. */
1960 ok = vect_analyze_data_refs (loop_vinfo, &min_vf, &fatal);
1961 if (!ok)
1963 if (dump_enabled_p ())
1964 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1965 "bad data references.\n");
1966 return ok;
1969 /* Classify all cross-iteration scalar data-flow cycles.
1970 Cross-iteration cycles caused by virtual phis are analyzed separately. */
1971 vect_analyze_scalar_cycles (loop_vinfo);
1973 vect_pattern_recog (loop_vinfo);
1975 vect_fixup_scalar_cycles_with_patterns (loop_vinfo);
1977 /* Analyze the access patterns of the data-refs in the loop (consecutive,
1978 complex, etc.). FORNOW: Only handle consecutive access pattern. */
1980 ok = vect_analyze_data_ref_accesses (loop_vinfo);
1981 if (!ok)
1983 if (dump_enabled_p ())
1984 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1985 "bad data access.\n");
1986 return ok;
1989 /* Data-flow analysis to detect stmts that do not need to be vectorized. */
1991 ok = vect_mark_stmts_to_be_vectorized (loop_vinfo, &fatal);
1992 if (!ok)
1994 if (dump_enabled_p ())
1995 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1996 "unexpected pattern.\n");
1997 return ok;
2000 /* The rest of the analysis below depends on the vector size in some way, so failures from here on are no longer fatal. */
2001 fatal = false;
2003 /* Analyze data dependences between the data-refs in the loop
2004 and adjust the maximum vectorization factor according to
2005 the dependences.
2006 FORNOW: fail at the first data dependence that we encounter. */
2008 ok = vect_analyze_data_ref_dependences (loop_vinfo, &max_vf);
2009 if (!ok)
2011 if (dump_enabled_p ())
2012 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2013 "bad data dependence.\n");
2014 return ok;
2016 if (max_vf != MAX_VECTORIZATION_FACTOR
2017 && maybe_lt (max_vf, min_vf))
2018 return opt_result::failure_at (vect_location, "bad data dependence.\n");
2019 LOOP_VINFO_MAX_VECT_FACTOR (loop_vinfo) = max_vf;
2021 ok = vect_determine_vectorization_factor (loop_vinfo);
2022 if (!ok)
2024 if (dump_enabled_p ())
2025 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2026 "can't determine vectorization factor.\n");
2027 return ok;
2029 if (max_vf != MAX_VECTORIZATION_FACTOR
2030 && maybe_lt (max_vf, LOOP_VINFO_VECT_FACTOR (loop_vinfo)))
2031 return opt_result::failure_at (vect_location, "bad data dependence.\n");
2033 /* Compute the scalar iteration cost. */
2034 vect_compute_single_scalar_iteration_cost (loop_vinfo);
2036 poly_uint64 saved_vectorization_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
2038 /* Check the SLP opportunities in the loop, analyze and build SLP trees. */
2039 ok = vect_analyze_slp (loop_vinfo, *n_stmts);
2040 if (!ok)
2041 return ok;
2043 /* If there are any SLP instances mark them as pure_slp. */
2044 bool slp = vect_make_slp_decision (loop_vinfo);
2045 if (slp)
2047 /* Find stmts that need to be both vectorized and SLPed. */
2048 vect_detect_hybrid_slp (loop_vinfo);
2050 /* Update the vectorization factor based on the SLP decision. */
2051 vect_update_vf_for_slp (loop_vinfo);
2054 bool saved_can_fully_mask_p = LOOP_VINFO_CAN_FULLY_MASK_P (loop_vinfo);
2056 /* We don't expect to have to roll back to anything other than an empty
2057 set of rgroups. */
2058 gcc_assert (LOOP_VINFO_MASKS (loop_vinfo).is_empty ());
2060 /* This is the point where we can re-start analysis with SLP forced off. */
2061 start_over:
2063 /* Now the vectorization factor is final. */
2064 poly_uint64 vectorization_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
2065 gcc_assert (known_ne (vectorization_factor, 0U));
2067 if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo) && dump_enabled_p ())
2069 dump_printf_loc (MSG_NOTE, vect_location,
2070 "vectorization_factor = ");
2071 dump_dec (MSG_NOTE, vectorization_factor);
2072 dump_printf (MSG_NOTE, ", niters = %wd\n",
2073 LOOP_VINFO_INT_NITERS (loop_vinfo));
2076 /* Analyze the alignment of the data-refs in the loop.
2077 Fail if a data reference is found that cannot be vectorized. */
2079 ok = vect_analyze_data_refs_alignment (loop_vinfo);
2080 if (!ok)
2082 if (dump_enabled_p ())
2083 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2084 "bad data alignment.\n");
2085 return ok;
2088 /* Prune the list of ddrs to be tested at run-time by versioning for alias.
2089 It is important to call pruning after vect_analyze_data_ref_accesses,
2090 since we use grouping information gathered by interleaving analysis. */
2091 ok = vect_prune_runtime_alias_test_list (loop_vinfo);
2092 if (!ok)
2093 return ok;
2095 /* Do not invoke vect_enhance_data_refs_alignment for epilogue
2096 vectorization, since we do not want to add extra peeling or
2097 add versioning for alignment. */
2098 if (!LOOP_VINFO_EPILOGUE_P (loop_vinfo))
2099 /* This pass will decide on using loop versioning and/or loop peeling in
2100 order to enhance the alignment of data references in the loop. */
2101 ok = vect_enhance_data_refs_alignment (loop_vinfo);
2102 else
2103 ok = vect_verify_datarefs_alignment (loop_vinfo);
2104 if (!ok)
2105 return ok;
2107 if (slp)
2109 /* Analyze operations in the SLP instances. Note this may
2110 remove unsupported SLP instances which makes the above
2111 SLP kind detection invalid. */
2112 unsigned old_size = LOOP_VINFO_SLP_INSTANCES (loop_vinfo).length ();
2113 vect_slp_analyze_operations (loop_vinfo);
2114 if (LOOP_VINFO_SLP_INSTANCES (loop_vinfo).length () != old_size)
2116 ok = opt_result::failure_at (vect_location,
2117 "unsupported SLP instances\n");
2118 goto again;
2122 /* Dissolve SLP-only groups. */
2123 vect_dissolve_slp_only_groups (loop_vinfo);
2125 /* Scan all the remaining operations in the loop that are not subject
2126 to SLP and make sure they are vectorizable. */
2127 ok = vect_analyze_loop_operations (loop_vinfo);
2128 if (!ok)
2130 if (dump_enabled_p ())
2131 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2132 "bad operation or unsupported loop bound.\n");
2133 return ok;
2136 /* Decide whether to use a fully-masked loop for this vectorization
2137 factor. */
2138 LOOP_VINFO_FULLY_MASKED_P (loop_vinfo)
2139 = (LOOP_VINFO_CAN_FULLY_MASK_P (loop_vinfo)
2140 && vect_verify_full_masking (loop_vinfo));
2141 if (dump_enabled_p ())
2143 if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
2144 dump_printf_loc (MSG_NOTE, vect_location,
2145 "using a fully-masked loop.\n");
2146 else
2147 dump_printf_loc (MSG_NOTE, vect_location,
2148 "not using a fully-masked loop.\n");
2151 /* If epilog loop is required because of data accesses with gaps,
2152 one additional iteration needs to be peeled. Check if there are
2153 enough iterations for vectorization. */
2154 if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo)
2155 && LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
2156 && !LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
2158 poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
2159 tree scalar_niters = LOOP_VINFO_NITERSM1 (loop_vinfo);
2161 if (known_lt (wi::to_widest (scalar_niters), vf))
2162 return opt_result::failure_at (vect_location,
2163 "loop has no enough iterations to"
2164 " support peeling for gaps.\n");
2167 /* If we're vectorizing an epilogue loop, we either need a fully-masked
2168 loop or a loop that has a lower VF than the main loop. */
2169 if (LOOP_VINFO_EPILOGUE_P (loop_vinfo)
2170 && !LOOP_VINFO_FULLY_MASKED_P (loop_vinfo)
2171 && maybe_ge (LOOP_VINFO_VECT_FACTOR (loop_vinfo),
2172 LOOP_VINFO_VECT_FACTOR (orig_loop_vinfo)))
2173 return opt_result::failure_at (vect_location,
2174 "Vectorization factor too high for"
2175 " epilogue loop.\n");
2177 /* Check that the costings of the loop make vectorizing worthwhile. */
2178 res = vect_analyze_loop_costing (loop_vinfo);
2179 if (res < 0)
2181 ok = opt_result::failure_at (vect_location,
2182 "Loop costings may not be worthwhile.\n");
2183 goto again;
2185 if (!res)
2186 return opt_result::failure_at (vect_location,
2187 "Loop costings not worthwhile.\n");
2189 determine_peel_for_niter (loop_vinfo);
2190 /* If an epilogue loop is required make sure we can create one. */
2191 if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo)
2192 || LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo))
2194 if (dump_enabled_p ())
2195 dump_printf_loc (MSG_NOTE, vect_location, "epilog loop required\n");
2196 if (!vect_can_advance_ivs_p (loop_vinfo)
2197 || !slpeel_can_duplicate_loop_p (LOOP_VINFO_LOOP (loop_vinfo),
2198 single_exit (LOOP_VINFO_LOOP
2199 (loop_vinfo))))
2201 ok = opt_result::failure_at (vect_location,
2202 "not vectorized: can't create required "
2203 "epilog loop\n");
2204 goto again;
2208 /* During peeling, we need to check if number of loop iterations is
2209 enough for both peeled prolog loop and vector loop. This check
2210 can be merged with the threshold check of loop versioning, so
2211 increase threshold for this case if necessary.
2213 If we are analyzing an epilogue we still want to check what its
2214 versioning threshold would be. If we decide to vectorize the epilogues we
2215 will want to use the lowest versioning threshold of all epilogues and main
2216 loop. This will enable us to enter a vectorized epilogue even when
2217 versioning the loop. We can't simply check whether the epilogue requires
2218 versioning though since we may have skipped some versioning checks when
2219 analyzing the epilogue. For instance, checks for alias versioning will be
2220 skipped when dealing with epilogues as we assume we already checked them
2221 for the main loop. So instead we always check the 'orig_loop_vinfo'. */
2222 if (LOOP_REQUIRES_VERSIONING (orig_loop_vinfo))
2224 poly_uint64 niters_th = 0;
2225 unsigned int th = LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo);
2227 if (!vect_use_loop_mask_for_alignment_p (loop_vinfo))
2229 /* Niters for peeled prolog loop. */
2230 if (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) < 0)
2232 dr_vec_info *dr_info = LOOP_VINFO_UNALIGNED_DR (loop_vinfo);
2233 tree vectype = STMT_VINFO_VECTYPE (dr_info->stmt);
2234 niters_th += TYPE_VECTOR_SUBPARTS (vectype) - 1;
2236 else
2237 niters_th += LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo);
2240 /* Niters for at least one iteration of vectorized loop. */
2241 if (!LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
2242 niters_th += LOOP_VINFO_VECT_FACTOR (loop_vinfo);
2243 /* One additional iteration because of peeling for gap. */
2244 if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo))
2245 niters_th += 1;
2247 /* Use the same condition as vect_transform_loop to decide when to use
2248 the cost to determine a versioning threshold. */
2249 if (vect_apply_runtime_profitability_check_p (loop_vinfo)
2250 && ordered_p (th, niters_th))
2251 niters_th = ordered_max (poly_uint64 (th), niters_th);
2253 LOOP_VINFO_VERSIONING_THRESHOLD (loop_vinfo) = niters_th;
2256 gcc_assert (known_eq (vectorization_factor,
2257 LOOP_VINFO_VECT_FACTOR (loop_vinfo)));
2259 /* Ok to vectorize! */
2260 return opt_result::success ();
2262 again:
2263 /* Ensure that "ok" is false (with an opt_problem if dumping is enabled). */
2264 gcc_assert (!ok);
2266 /* Try again with SLP forced off but if we didn't do any SLP there is
2267 no point in re-trying. */
2268 if (!slp)
2269 return ok;
2271 /* If there are reduction chains re-trying will fail anyway. */
2272 if (! LOOP_VINFO_REDUCTION_CHAINS (loop_vinfo).is_empty ())
2273 return ok;
2275 /* Likewise if the grouped loads or stores in the SLP cannot be handled
2276 via interleaving or lane instructions. */
2277 slp_instance instance;
2278 slp_tree node;
2279 unsigned i, j;
2280 FOR_EACH_VEC_ELT (LOOP_VINFO_SLP_INSTANCES (loop_vinfo), i, instance)
2282 stmt_vec_info vinfo;
2283 vinfo = SLP_TREE_SCALAR_STMTS (SLP_INSTANCE_TREE (instance))[0];
2284 if (! STMT_VINFO_GROUPED_ACCESS (vinfo))
2285 continue;
2286 vinfo = DR_GROUP_FIRST_ELEMENT (vinfo);
2287 unsigned int size = DR_GROUP_SIZE (vinfo);
2288 tree vectype = STMT_VINFO_VECTYPE (vinfo);
2289 if (! vect_store_lanes_supported (vectype, size, false)
2290 && ! known_eq (TYPE_VECTOR_SUBPARTS (vectype), 1U)
2291 && ! vect_grouped_store_supported (vectype, size))
2292 return opt_result::failure_at (vinfo->stmt,
2293 "unsupported grouped store\n");
2294 FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (instance), j, node)
2296 vinfo = SLP_TREE_SCALAR_STMTS (node)[0];
2297 vinfo = DR_GROUP_FIRST_ELEMENT (vinfo);
2298 bool single_element_p = !DR_GROUP_NEXT_ELEMENT (vinfo);
2299 size = DR_GROUP_SIZE (vinfo);
2300 vectype = STMT_VINFO_VECTYPE (vinfo);
2301 if (! vect_load_lanes_supported (vectype, size, false)
2302 && ! vect_grouped_load_supported (vectype, single_element_p,
2303 size))
2304 return opt_result::failure_at (vinfo->stmt,
2305 "unsupported grouped load\n");
2309 if (dump_enabled_p ())
2310 dump_printf_loc (MSG_NOTE, vect_location,
2311 "re-trying with SLP disabled\n");
2313 /* Roll back state appropriately. No SLP this time. */
2314 slp = false;
2315 /* Restore the vectorization factor as it was without SLP. */
2316 LOOP_VINFO_VECT_FACTOR (loop_vinfo) = saved_vectorization_factor;
2317 /* Free the SLP instances. */
2318 FOR_EACH_VEC_ELT (LOOP_VINFO_SLP_INSTANCES (loop_vinfo), j, instance)
2319 vect_free_slp_instance (instance, false);
2320 LOOP_VINFO_SLP_INSTANCES (loop_vinfo).release ();
2321 /* Reset SLP type to loop_vect on all stmts. */
2322 for (i = 0; i < LOOP_VINFO_LOOP (loop_vinfo)->num_nodes; ++i)
2324 basic_block bb = LOOP_VINFO_BBS (loop_vinfo)[i];
2325 for (gimple_stmt_iterator si = gsi_start_phis (bb);
2326 !gsi_end_p (si); gsi_next (&si))
2328 stmt_vec_info stmt_info = loop_vinfo->lookup_stmt (gsi_stmt (si));
2329 STMT_SLP_TYPE (stmt_info) = loop_vect;
2330 if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_reduction_def
2331 || STMT_VINFO_DEF_TYPE (stmt_info) == vect_double_reduction_def)
2333 /* vectorizable_reduction adjusts reduction stmt def-types,
2334 restore them to that of the PHI. */
2335 STMT_VINFO_DEF_TYPE (STMT_VINFO_REDUC_DEF (stmt_info))
2336 = STMT_VINFO_DEF_TYPE (stmt_info);
2337 STMT_VINFO_DEF_TYPE (vect_stmt_to_vectorize
2338 (STMT_VINFO_REDUC_DEF (stmt_info)))
2339 = STMT_VINFO_DEF_TYPE (stmt_info);
2342 for (gimple_stmt_iterator si = gsi_start_bb (bb);
2343 !gsi_end_p (si); gsi_next (&si))
2345 stmt_vec_info stmt_info = loop_vinfo->lookup_stmt (gsi_stmt (si));
2346 STMT_SLP_TYPE (stmt_info) = loop_vect;
2347 if (STMT_VINFO_IN_PATTERN_P (stmt_info))
2349 gimple *pattern_def_seq = STMT_VINFO_PATTERN_DEF_SEQ (stmt_info);
2350 stmt_info = STMT_VINFO_RELATED_STMT (stmt_info);
2351 STMT_SLP_TYPE (stmt_info) = loop_vect;
2352 for (gimple_stmt_iterator pi = gsi_start (pattern_def_seq);
2353 !gsi_end_p (pi); gsi_next (&pi))
2354 STMT_SLP_TYPE (loop_vinfo->lookup_stmt (gsi_stmt (pi)))
2355 = loop_vect;
2359 /* Free optimized alias test DDRS. */
2360 LOOP_VINFO_LOWER_BOUNDS (loop_vinfo).truncate (0);
2361 LOOP_VINFO_COMP_ALIAS_DDRS (loop_vinfo).release ();
2362 LOOP_VINFO_CHECK_UNEQUAL_ADDRS (loop_vinfo).release ();
2363 /* Reset target cost data. */
2364 destroy_cost_data (LOOP_VINFO_TARGET_COST_DATA (loop_vinfo));
2365 LOOP_VINFO_TARGET_COST_DATA (loop_vinfo)
2366 = init_cost (LOOP_VINFO_LOOP (loop_vinfo));
2367 /* Reset accumulated rgroup information. */
2368 release_vec_loop_masks (&LOOP_VINFO_MASKS (loop_vinfo));
2369 /* Reset assorted flags. */
2370 LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo) = false;
2371 LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo) = false;
2372 LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo) = 0;
2373 LOOP_VINFO_VERSIONING_THRESHOLD (loop_vinfo) = 0;
2374 LOOP_VINFO_CAN_FULLY_MASK_P (loop_vinfo) = saved_can_fully_mask_p;
2376 goto start_over;
2379 /* Return true if vectorizing a loop using NEW_LOOP_VINFO appears
2380 to be better than vectorizing it using OLD_LOOP_VINFO. Assume that
2381 OLD_LOOP_VINFO is better unless something specifically indicates
2382 otherwise.
2384 Note that this deliberately isn't a partial order. */
2386 static bool
2387 vect_better_loop_vinfo_p (loop_vec_info new_loop_vinfo,
2388 loop_vec_info old_loop_vinfo)
2390 struct loop *loop = LOOP_VINFO_LOOP (new_loop_vinfo);
2391 gcc_assert (LOOP_VINFO_LOOP (old_loop_vinfo) == loop);
2393 poly_int64 new_vf = LOOP_VINFO_VECT_FACTOR (new_loop_vinfo);
2394 poly_int64 old_vf = LOOP_VINFO_VECT_FACTOR (old_loop_vinfo);
2396 /* Always prefer a VF of loop->simdlen over any other VF. */
2397 if (loop->simdlen)
2399 bool new_simdlen_p = known_eq (new_vf, loop->simdlen);
2400 bool old_simdlen_p = known_eq (old_vf, loop->simdlen);
2401 if (new_simdlen_p != old_simdlen_p)
2402 return new_simdlen_p;
2405 /* Limit the VFs to what is likely to be the maximum number of iterations,
2406 to handle cases in which at least one loop_vinfo is fully-masked. */
2407 HOST_WIDE_INT estimated_max_niter = likely_max_stmt_executions_int (loop);
2408 if (estimated_max_niter != -1)
2410 if (known_le (estimated_max_niter, new_vf))
2411 new_vf = estimated_max_niter;
2412 if (known_le (estimated_max_niter, old_vf))
2413 old_vf = estimated_max_niter;
2416 /* Check whether the (fractional) cost per scalar iteration is lower
2417 or higher: new_inside_cost / new_vf vs. old_inside_cost / old_vf. */
2418 poly_widest_int rel_new = (new_loop_vinfo->vec_inside_cost
2419 * poly_widest_int (old_vf));
2420 poly_widest_int rel_old = (old_loop_vinfo->vec_inside_cost
2421 * poly_widest_int (new_vf));
2422 if (maybe_lt (rel_old, rel_new))
2424 /* When old_loop_vinfo uses a variable vectorization factor,
2425 we know that it has a lower cost for at least one runtime VF.
2426 However, we don't know how likely that VF is.
2428 One option would be to compare the costs for the estimated VFs.
2429 The problem is that that can put too much pressure on the cost
2430 model. E.g. if the estimated VF is also the lowest possible VF,
2431 and if old_loop_vinfo is 1 unit worse than new_loop_vinfo
2432 for the estimated VF, we'd then choose new_loop_vinfo even
2433 though (a) new_loop_vinfo might not actually be better than
2434 old_loop_vinfo for that VF and (b) it would be significantly
2435 worse at larger VFs.
2437 Here we go for a hacky compromise: pick new_loop_vinfo if it is
2438 no more expensive than old_loop_vinfo even after doubling the
2439 estimated old_loop_vinfo VF. For all but trivial loops, this
2440 ensures that we only pick new_loop_vinfo if it is significantly
2441 better than old_loop_vinfo at the estimated VF. */
2442 if (rel_new.is_constant ())
2443 return false;
2445 HOST_WIDE_INT new_estimated_vf = estimated_poly_value (new_vf);
2446 HOST_WIDE_INT old_estimated_vf = estimated_poly_value (old_vf);
2447 widest_int estimated_rel_new = (new_loop_vinfo->vec_inside_cost
2448 * widest_int (old_estimated_vf));
2449 widest_int estimated_rel_old = (old_loop_vinfo->vec_inside_cost
2450 * widest_int (new_estimated_vf));
2451 return estimated_rel_new * 2 <= estimated_rel_old;
2453 if (known_lt (rel_new, rel_old))
2454 return true;
2456 /* If there's nothing to choose between the loop bodies, see whether
2457 there's a difference in the prologue and epilogue costs. */
2458 if (new_loop_vinfo->vec_outside_cost != old_loop_vinfo->vec_outside_cost)
2459 return new_loop_vinfo->vec_outside_cost < old_loop_vinfo->vec_outside_cost;
2461 return false;
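/* Worked example of the cost comparison above, with hypothetical costs:
   if new_loop_vinfo has vec_inside_cost = 20 at new_vf = 8 and
   old_loop_vinfo has vec_inside_cost = 12 at old_vf = 4, then
   rel_new = 20 * 4 = 80 and rel_old = 12 * 8 = 96, so the new candidate
   is cheaper per scalar iteration and the function returns true.  The
   outside (prologue/epilogue) costs only break ties when the body costs
   compare equal.  */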
2464 /* Decide whether to replace OLD_LOOP_VINFO with NEW_LOOP_VINFO. Return
2465 true if we should. */
2467 static bool
2468 vect_joust_loop_vinfos (loop_vec_info new_loop_vinfo,
2469 loop_vec_info old_loop_vinfo)
2471 if (!vect_better_loop_vinfo_p (new_loop_vinfo, old_loop_vinfo))
2472 return false;
2474 if (dump_enabled_p ())
2475 dump_printf_loc (MSG_NOTE, vect_location,
2476 "***** Preferring vector mode %s to vector mode %s\n",
2477 GET_MODE_NAME (new_loop_vinfo->vector_mode),
2478 GET_MODE_NAME (old_loop_vinfo->vector_mode));
2479 return true;
2482 /* Function vect_analyze_loop.
2484 Apply a set of analyses on LOOP, and create a loop_vec_info struct
2485 for it. The different analyses will record information in the
2486 loop_vec_info struct. */
2487 opt_loop_vec_info
2488 vect_analyze_loop (class loop *loop, vec_info_shared *shared)
2490 auto_vector_modes vector_modes;
2492 /* Autodetect first vector size we try. */
2493 unsigned int autovec_flags
2494 = targetm.vectorize.autovectorize_vector_modes (&vector_modes,
2495 loop->simdlen != 0);
2496 unsigned int mode_i = 0;
2498 DUMP_VECT_SCOPE ("analyze_loop_nest");
2500 if (loop_outer (loop)
2501 && loop_vec_info_for_loop (loop_outer (loop))
2502 && LOOP_VINFO_VECTORIZABLE_P (loop_vec_info_for_loop (loop_outer (loop))))
2503 return opt_loop_vec_info::failure_at (vect_location,
2504 "outer-loop already vectorized.\n");
2506 if (!find_loop_nest (loop, &shared->loop_nest))
2507 return opt_loop_vec_info::failure_at
2508 (vect_location,
2509 "not vectorized: loop nest containing two or more consecutive inner"
2510 " loops cannot be vectorized\n");
2512 unsigned n_stmts = 0;
2513 machine_mode autodetected_vector_mode = VOIDmode;
2514 opt_loop_vec_info first_loop_vinfo = opt_loop_vec_info::success (NULL);
2515 machine_mode next_vector_mode = VOIDmode;
2516 poly_uint64 lowest_th = 0;
2517 unsigned vectorized_loops = 0;
2518 bool pick_lowest_cost_p = ((autovec_flags & VECT_COMPARE_COSTS)
2519 && !unlimited_cost_model (loop));
2521 bool vect_epilogues = false;
2522 opt_result res = opt_result::success ();
2523 unsigned HOST_WIDE_INT simdlen = loop->simdlen;
2524 while (1)
2526 /* Check the CFG characteristics of the loop (nesting, entry/exit). */
2527 opt_loop_vec_info loop_vinfo = vect_analyze_loop_form (loop, shared);
2528 if (!loop_vinfo)
2530 if (dump_enabled_p ())
2531 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2532 "bad loop form.\n");
2533 gcc_checking_assert (first_loop_vinfo == NULL);
2534 return loop_vinfo;
2536 loop_vinfo->vector_mode = next_vector_mode;
2538 bool fatal = false;
2540 /* When pick_lowest_cost_p is true, we should in principle iterate
2541 over all the loop_vec_infos that LOOP_VINFO could replace and
2542 try to vectorize LOOP_VINFO under the same conditions.
2543 E.g. when trying to replace an epilogue loop, we should vectorize
2544 LOOP_VINFO as an epilogue loop with the same VF limit. When trying
2545 to replace the main loop, we should vectorize LOOP_VINFO as a main
2546 loop too.
2548 However, autovectorize_vector_modes is usually sorted as follows:
2550 - Modes that naturally produce lower VFs usually follow modes that
2551 naturally produce higher VFs.
2553 - When modes naturally produce the same VF, maskable modes
2554 usually follow unmaskable ones, so that the maskable mode
2555 can be used to vectorize the epilogue of the unmaskable mode.
2557 This order is preferred because it leads to the maximum
2558 epilogue vectorization opportunities. Targets should only use
2559 a different order if they want to make wide modes available while
2560 disparaging them relative to earlier, smaller modes. The assumption
2561 in that case is that the wider modes are more expensive in some
2562 way that isn't reflected directly in the costs.
2564 There should therefore be few interesting cases in which
2565 LOOP_VINFO fails when treated as an epilogue loop, succeeds when
2566 treated as a standalone loop, and ends up being genuinely cheaper
2567 than FIRST_LOOP_VINFO. */
2568 if (vect_epilogues)
2569 LOOP_VINFO_ORIG_LOOP_INFO (loop_vinfo) = first_loop_vinfo;
2571 res = vect_analyze_loop_2 (loop_vinfo, fatal, &n_stmts);
2572 if (mode_i == 0)
2573 autodetected_vector_mode = loop_vinfo->vector_mode;
2574 if (dump_enabled_p ())
2576 if (res)
2577 dump_printf_loc (MSG_NOTE, vect_location,
2578 "***** Analysis succeeded with vector mode %s\n",
2579 GET_MODE_NAME (loop_vinfo->vector_mode));
2580 else
2581 dump_printf_loc (MSG_NOTE, vect_location,
2582 "***** Analysis failed with vector mode %s\n",
2583 GET_MODE_NAME (loop_vinfo->vector_mode));
2586 loop->aux = NULL;
2588 if (!fatal)
2589 while (mode_i < vector_modes.length ()
2590 && vect_chooses_same_modes_p (loop_vinfo, vector_modes[mode_i]))
2592 if (dump_enabled_p ())
2593 dump_printf_loc (MSG_NOTE, vect_location,
2594 "***** The result for vector mode %s would"
2595 " be the same\n",
2596 GET_MODE_NAME (vector_modes[mode_i]));
2597 mode_i += 1;
2600 if (res)
2602 LOOP_VINFO_VECTORIZABLE_P (loop_vinfo) = 1;
2603 vectorized_loops++;
2605 /* Once we hit the desired simdlen for the first time,
2606 discard any previous attempts. */
2607 if (simdlen
2608 && known_eq (LOOP_VINFO_VECT_FACTOR (loop_vinfo), simdlen))
2610 delete first_loop_vinfo;
2611 first_loop_vinfo = opt_loop_vec_info::success (NULL);
2612 LOOP_VINFO_ORIG_LOOP_INFO (loop_vinfo) = NULL;
2613 simdlen = 0;
2615 else if (pick_lowest_cost_p && first_loop_vinfo)
2617 /* Keep trying to roll back vectorization attempts while the
2618 loop_vec_infos they produced were worse than this one. */
2619 vec<loop_vec_info> &vinfos = first_loop_vinfo->epilogue_vinfos;
2620 while (!vinfos.is_empty ()
2621 && vect_joust_loop_vinfos (loop_vinfo, vinfos.last ()))
2623 gcc_assert (vect_epilogues);
2624 delete vinfos.pop ();
2626 if (vinfos.is_empty ()
2627 && vect_joust_loop_vinfos (loop_vinfo, first_loop_vinfo))
2629 delete first_loop_vinfo;
2630 first_loop_vinfo = opt_loop_vec_info::success (NULL);
2631 LOOP_VINFO_ORIG_LOOP_INFO (loop_vinfo) = NULL;
2635 if (first_loop_vinfo == NULL)
2637 first_loop_vinfo = loop_vinfo;
2638 lowest_th = LOOP_VINFO_VERSIONING_THRESHOLD (first_loop_vinfo);
2640 else if (vect_epilogues
2641 /* For now only allow one epilogue loop. */
2642 && first_loop_vinfo->epilogue_vinfos.is_empty ())
2644 first_loop_vinfo->epilogue_vinfos.safe_push (loop_vinfo);
2645 poly_uint64 th = LOOP_VINFO_VERSIONING_THRESHOLD (loop_vinfo);
2646 gcc_assert (!LOOP_REQUIRES_VERSIONING (loop_vinfo)
2647 || maybe_ne (lowest_th, 0U));
2648 /* Keep track of the known smallest versioning
2649 threshold. */
2650 if (ordered_p (lowest_th, th))
2651 lowest_th = ordered_min (lowest_th, th);
2653 else
2654 delete loop_vinfo;
2656 /* Only vectorize epilogues if PARAM_VECT_EPILOGUES_NOMASK is
2657 enabled, SIMDUID is not set, it is the innermost loop and we have
2658 either already found the loop's SIMDLEN or there was no SIMDLEN to
2659 begin with.
2660 TODO: Enable epilogue vectorization for loops with SIMDUID set. */
2661 vect_epilogues = (!simdlen
2662 && loop->inner == NULL
2663 && param_vect_epilogues_nomask
2664 && LOOP_VINFO_PEELING_FOR_NITER (first_loop_vinfo)
2665 && !loop->simduid
2666 /* For now only allow one epilogue loop, but allow
2667 pick_lowest_cost_p to replace it. */
2668 && (first_loop_vinfo->epilogue_vinfos.is_empty ()
2669 || pick_lowest_cost_p));
2671 /* Commit to first_loop_vinfo if we have no reason to try
2672 alternatives. */
2673 if (!simdlen && !vect_epilogues && !pick_lowest_cost_p)
2674 break;
2676 else
2678 delete loop_vinfo;
2679 if (fatal)
2681 gcc_checking_assert (first_loop_vinfo == NULL);
2682 break;
2686 if (mode_i < vector_modes.length ()
2687 && VECTOR_MODE_P (autodetected_vector_mode)
2688 && (related_vector_mode (vector_modes[mode_i],
2689 GET_MODE_INNER (autodetected_vector_mode))
2690 == autodetected_vector_mode)
2691 && (related_vector_mode (autodetected_vector_mode,
2692 GET_MODE_INNER (vector_modes[mode_i]))
2693 == vector_modes[mode_i]))
2695 if (dump_enabled_p ())
2696 dump_printf_loc (MSG_NOTE, vect_location,
2697 "***** Skipping vector mode %s, which would"
2698 " repeat the analysis for %s\n",
2699 GET_MODE_NAME (vector_modes[mode_i]),
2700 GET_MODE_NAME (autodetected_vector_mode));
2701 mode_i += 1;
2704 if (mode_i == vector_modes.length ()
2705 || autodetected_vector_mode == VOIDmode)
2706 break;
2708 /* Try the next biggest vector size. */
2709 next_vector_mode = vector_modes[mode_i++];
2710 if (dump_enabled_p ())
2711 dump_printf_loc (MSG_NOTE, vect_location,
2712 "***** Re-trying analysis with vector mode %s\n",
2713 GET_MODE_NAME (next_vector_mode));
2716 if (first_loop_vinfo)
2718 loop->aux = (loop_vec_info) first_loop_vinfo;
2719 if (dump_enabled_p ())
2720 dump_printf_loc (MSG_NOTE, vect_location,
2721 "***** Choosing vector mode %s\n",
2722 GET_MODE_NAME (first_loop_vinfo->vector_mode));
2723 LOOP_VINFO_VERSIONING_THRESHOLD (first_loop_vinfo) = lowest_th;
2724 return first_loop_vinfo;
2727 return opt_loop_vec_info::propagate_failure (res);
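/* Illustrative walk-through with a hypothetical mode list: if
   autovectorize_vector_modes returned { V4SI, V8HI, V16QI } and analysis
   with the autodetected mode succeeded, that loop_vinfo becomes
   first_loop_vinfo.  The remaining modes are then only tried to honour a
   requested simdlen, to look for a cheaper variant when VECT_COMPARE_COSTS
   is enabled, or to vectorize the epilogue, and modes whose analysis would
   provably repeat an earlier result are skipped.  */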
2730 /* Return true if there is an in-order reduction function for CODE, storing
2731 it in *REDUC_FN if so. */
2733 static bool
2734 fold_left_reduction_fn (tree_code code, internal_fn *reduc_fn)
2736 switch (code)
2738 case PLUS_EXPR:
2739 *reduc_fn = IFN_FOLD_LEFT_PLUS;
2740 return true;
2742 default:
2743 return false;
2747 /* Function reduction_fn_for_scalar_code
2749 Input:
2750 CODE - tree_code of a reduction operation.
2752 Output:
2753 REDUC_FN - the corresponding internal function to be used to reduce the
2754 vector of partial results into a single scalar result, or IFN_LAST
2755 if the operation is a supported reduction operation, but does not have
2756 such an internal function.
2758 Return FALSE if CODE currently cannot be vectorized as a reduction. */
2760 static bool
2761 reduction_fn_for_scalar_code (enum tree_code code, internal_fn *reduc_fn)
2763 switch (code)
2765 case MAX_EXPR:
2766 *reduc_fn = IFN_REDUC_MAX;
2767 return true;
2769 case MIN_EXPR:
2770 *reduc_fn = IFN_REDUC_MIN;
2771 return true;
2773 case PLUS_EXPR:
2774 *reduc_fn = IFN_REDUC_PLUS;
2775 return true;
2777 case BIT_AND_EXPR:
2778 *reduc_fn = IFN_REDUC_AND;
2779 return true;
2781 case BIT_IOR_EXPR:
2782 *reduc_fn = IFN_REDUC_IOR;
2783 return true;
2785 case BIT_XOR_EXPR:
2786 *reduc_fn = IFN_REDUC_XOR;
2787 return true;
2789 case MULT_EXPR:
2790 case MINUS_EXPR:
2791 *reduc_fn = IFN_LAST;
2792 return true;
2794 default:
2795 return false;
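/* Illustrative example (hypothetical loop): for

     int m = INT_MIN;
     for (i = 0; i < n; i++)
       m = a[i] > m ? a[i] : m;

   the reduction code is MAX_EXPR and the function above reports
   IFN_REDUC_MAX, so the epilogue can reduce the vector of partial maxima
   with one internal-function call when the target supports it.  For
   MULT_EXPR it still returns true but with IFN_LAST, meaning the final
   reduction has to be open-coded.  */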
2799 /* If there is a neutral value X such that SLP reduction NODE would not
2800 be affected by the introduction of additional X elements, return that X,
2801 otherwise return null. CODE is the code of the reduction and VECTOR_TYPE
2802 is the vector type that would hold element X. REDUC_CHAIN is true if
2803 the SLP statements perform a single reduction, false if each statement
2804 performs an independent reduction. */
2806 static tree
2807 neutral_op_for_slp_reduction (slp_tree slp_node, tree vector_type,
2808 tree_code code, bool reduc_chain)
2810 vec<stmt_vec_info> stmts = SLP_TREE_SCALAR_STMTS (slp_node);
2811 stmt_vec_info stmt_vinfo = stmts[0];
2812 tree scalar_type = TREE_TYPE (vector_type);
2813 class loop *loop = gimple_bb (stmt_vinfo->stmt)->loop_father;
2814 gcc_assert (loop);
2816 switch (code)
2818 case WIDEN_SUM_EXPR:
2819 case DOT_PROD_EXPR:
2820 case SAD_EXPR:
2821 case PLUS_EXPR:
2822 case MINUS_EXPR:
2823 case BIT_IOR_EXPR:
2824 case BIT_XOR_EXPR:
2825 return build_zero_cst (scalar_type);
2827 case MULT_EXPR:
2828 return build_one_cst (scalar_type);
2830 case BIT_AND_EXPR:
2831 return build_all_ones_cst (scalar_type);
2833 case MAX_EXPR:
2834 case MIN_EXPR:
2835 /* For MIN/MAX the initial values are neutral. A reduction chain
2836 has only a single initial value, so that value is neutral for
2837 all statements. */
2838 if (reduc_chain)
2839 return PHI_ARG_DEF_FROM_EDGE (stmt_vinfo->stmt,
2840 loop_preheader_edge (loop));
2841 return NULL_TREE;
2843 default:
2844 return NULL_TREE;
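/* Illustrative example (hypothetical values): padding a PLUS_EXPR
   reduction of { a, b, c } with the neutral element 0 to fill a
   four-element vector computes a + b + c + 0, which leaves the result
   unchanged; likewise 1 for MULT_EXPR and all-ones for BIT_AND_EXPR.  No
   such constant exists for MIN/MAX in general, which is why those cases
   fall back to the single initial value of a reduction chain or to
   NULL_TREE.  */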
2848 /* Error reporting helper for vect_is_simple_reduction below. GIMPLE statement
2849 STMT is printed with a message MSG. */
2851 static void
2852 report_vect_op (dump_flags_t msg_type, gimple *stmt, const char *msg)
2854 dump_printf_loc (msg_type, vect_location, "%s%G", msg, stmt);
2857 /* Return true if we need an in-order reduction for operation CODE
2858 on type TYPE, i.e. if reassociating the operation could change the
2859 result or trap (non-associative FP, trapping overflow, saturation). */
2861 bool
2862 needs_fold_left_reduction_p (tree type, tree_code code)
2864 /* CHECKME: check for !flag_finite_math_only too? */
2865 if (SCALAR_FLOAT_TYPE_P (type))
2866 switch (code)
2868 case MIN_EXPR:
2869 case MAX_EXPR:
2870 return false;
2872 default:
2873 return !flag_associative_math;
2876 if (INTEGRAL_TYPE_P (type))
2878 if (!operation_no_trapping_overflow (type, code))
2879 return true;
2880 return false;
2883 if (SAT_FIXED_POINT_TYPE_P (type))
2884 return true;
2886 return false;
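/* Illustrative example of why the floating-point case needs an in-order
   reduction unless -fassociative-math is in effect (hypothetical values):
   in single precision (1e20f + -1e20f) + 1.0f == 1.0f whereas
   1e20f + (-1e20f + 1.0f) == 0.0f, so distributing the additions across
   vector lanes and reducing at the end can change the result.  MIN and
   MAX are insensitive to the evaluation order and are excluded above.  */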
2889 /* Return true if the reduction PHI in LOOP with latch arg LOOP_ARG
2890 has a handled computation expression. Store the main reduction
2891 operation in *CODE. */
2893 static bool
2894 check_reduction_path (dump_user_location_t loc, loop_p loop, gphi *phi,
2895 tree loop_arg, enum tree_code *code,
2896 vec<std::pair<ssa_op_iter, use_operand_p> > &path)
2898 auto_bitmap visited;
2899 tree lookfor = PHI_RESULT (phi);
2900 ssa_op_iter curri;
2901 use_operand_p curr = op_iter_init_phiuse (&curri, phi, SSA_OP_USE);
2902 while (USE_FROM_PTR (curr) != loop_arg)
2903 curr = op_iter_next_use (&curri);
2904 curri.i = curri.numops;
2907 path.safe_push (std::make_pair (curri, curr));
2908 tree use = USE_FROM_PTR (curr);
2909 if (use == lookfor)
2910 break;
2911 gimple *def = SSA_NAME_DEF_STMT (use);
2912 if (gimple_nop_p (def)
2913 || ! flow_bb_inside_loop_p (loop, gimple_bb (def)))
2915 pop:
2918 std::pair<ssa_op_iter, use_operand_p> x = path.pop ();
2919 curri = x.first;
2920 curr = x.second;
2922 curr = op_iter_next_use (&curri);
2923 /* Skip already visited or non-SSA operands (from iterating
2924 over PHI args). */
2925 while (curr != NULL_USE_OPERAND_P
2926 && (TREE_CODE (USE_FROM_PTR (curr)) != SSA_NAME
2927 || ! bitmap_set_bit (visited,
2928 SSA_NAME_VERSION
2929 (USE_FROM_PTR (curr)))));
2931 while (curr == NULL_USE_OPERAND_P && ! path.is_empty ());
2932 if (curr == NULL_USE_OPERAND_P)
2933 break;
2935 else
2937 if (gimple_code (def) == GIMPLE_PHI)
2938 curr = op_iter_init_phiuse (&curri, as_a <gphi *>(def), SSA_OP_USE);
2939 else
2940 curr = op_iter_init_use (&curri, def, SSA_OP_USE);
2941 while (curr != NULL_USE_OPERAND_P
2942 && (TREE_CODE (USE_FROM_PTR (curr)) != SSA_NAME
2943 || ! bitmap_set_bit (visited,
2944 SSA_NAME_VERSION
2945 (USE_FROM_PTR (curr)))))
2946 curr = op_iter_next_use (&curri);
2947 if (curr == NULL_USE_OPERAND_P)
2948 goto pop;
2951 while (1);
2952 if (dump_file && (dump_flags & TDF_DETAILS))
2954 dump_printf_loc (MSG_NOTE, loc, "reduction path: ");
2955 unsigned i;
2956 std::pair<ssa_op_iter, use_operand_p> *x;
2957 FOR_EACH_VEC_ELT (path, i, x)
2958 dump_printf (MSG_NOTE, "%T ", USE_FROM_PTR (x->second));
2959 dump_printf (MSG_NOTE, "\n");
2962 /* Check whether the reduction path detected is valid. */
2963 bool fail = path.length () == 0;
2964 bool neg = false;
2965 int sign = -1;
2966 *code = ERROR_MARK;
2967 for (unsigned i = 1; i < path.length (); ++i)
2969 gimple *use_stmt = USE_STMT (path[i].second);
2970 tree op = USE_FROM_PTR (path[i].second);
2971 if (! is_gimple_assign (use_stmt)
2972 /* The following makes sure we can compute the operand index
2973 easily, plus it mostly disallows chaining via COND_EXPR condition
2974 operands. */
2975 || (gimple_assign_rhs1_ptr (use_stmt) != path[i].second->use
2976 && (gimple_num_ops (use_stmt) <= 2
2977 || gimple_assign_rhs2_ptr (use_stmt) != path[i].second->use)
2978 && (gimple_num_ops (use_stmt) <= 3
2979 || gimple_assign_rhs3_ptr (use_stmt) != path[i].second->use)))
2981 fail = true;
2982 break;
2984 /* Check that the op is used by only a single stmt inside
2985 the loop. */
2986 imm_use_iterator imm_iter;
2987 gimple *op_use_stmt;
2988 unsigned cnt = 0;
2989 FOR_EACH_IMM_USE_STMT (op_use_stmt, imm_iter, op)
2990 if (!is_gimple_debug (op_use_stmt)
2991 && flow_bb_inside_loop_p (loop, gimple_bb (op_use_stmt)))
2993 /* We want to allow x + x but not x < 1 ? x : 2. */
2994 if (is_gimple_assign (op_use_stmt)
2995 && gimple_assign_rhs_code (op_use_stmt) == COND_EXPR)
2997 use_operand_p use_p;
2998 FOR_EACH_IMM_USE_ON_STMT (use_p, imm_iter)
2999 cnt++;
3001 else
3002 cnt++;
3004 if (cnt != 1)
3006 fail = true;
3007 break;
3009 tree_code use_code = gimple_assign_rhs_code (use_stmt);
3010 if (use_code == MINUS_EXPR)
3012 use_code = PLUS_EXPR;
3013 /* Track whether we negate the reduction value each iteration. */
3014 if (gimple_assign_rhs2 (use_stmt) == op)
3015 neg = ! neg;
3017 if (CONVERT_EXPR_CODE_P (use_code)
3018 && tree_nop_conversion_p (TREE_TYPE (gimple_assign_lhs (use_stmt)),
3019 TREE_TYPE (gimple_assign_rhs1 (use_stmt))))
3021 else if (*code == ERROR_MARK)
3023 *code = use_code;
3024 sign = TYPE_SIGN (TREE_TYPE (gimple_assign_lhs (use_stmt)));
3026 else if (use_code != *code)
3028 fail = true;
3029 break;
3031 else if ((use_code == MIN_EXPR
3032 || use_code == MAX_EXPR)
3033 && sign != TYPE_SIGN (TREE_TYPE (gimple_assign_lhs (use_stmt))))
3035 fail = true;
3036 break;
3039 return ! fail && ! neg && *code != ERROR_MARK;
3042 bool
3043 check_reduction_path (dump_user_location_t loc, loop_p loop, gphi *phi,
3044 tree loop_arg, enum tree_code code)
3046 auto_vec<std::pair<ssa_op_iter, use_operand_p> > path;
3047 enum tree_code code_;
3048 return (check_reduction_path (loc, loop, phi, loop_arg, &code_, path)
3049 && code_ == code);
3054 /* Function vect_is_simple_reduction
3056 (1) Detect a cross-iteration def-use cycle that represents a simple
3057 reduction computation. We look for the following pattern:
3059 loop_header:
3060 a1 = phi < a0, a2 >
3061 a3 = ...
3062 a2 = operation (a3, a1)
3064 or
3066 a3 = ...
3067 loop_header:
3068 a1 = phi < a0, a2 >
3069 a2 = operation (a3, a1)
3071 such that:
3072 1. operation is commutative and associative and it is safe to
3073 change the order of the computation
3074 2. no uses for a2 in the loop (a2 is used out of the loop)
3075 3. no uses of a1 in the loop besides the reduction operation
3076 4. no uses of a1 outside the loop.
3078 Conditions 1,4 are tested here.
3079 Conditions 2,3 are tested in vect_mark_stmts_to_be_vectorized.
3081 (2) Detect a cross-iteration def-use cycle in nested loops, i.e.,
3082 nested cycles.
3084 (3) Detect cycles of phi nodes in outer-loop vectorization, i.e., double
3085 reductions:
3087 a1 = phi < a0, a2 >
3088 inner loop (def of a3)
3089 a2 = phi < a3 >
3091 (4) Detect condition expressions, ie:
3092 for (int i = 0; i < N; i++)
3093 if (a[i] < val)
3094 ret_val = a[i];
3098 static stmt_vec_info
3099 vect_is_simple_reduction (loop_vec_info loop_info, stmt_vec_info phi_info,
3100 bool *double_reduc, bool *reduc_chain_p)
3102 gphi *phi = as_a <gphi *> (phi_info->stmt);
3103 gimple *phi_use_stmt = NULL;
3104 imm_use_iterator imm_iter;
3105 use_operand_p use_p;
3107 *double_reduc = false;
3108 *reduc_chain_p = false;
3109 STMT_VINFO_REDUC_TYPE (phi_info) = TREE_CODE_REDUCTION;
3111 tree phi_name = PHI_RESULT (phi);
3112 /* ??? If there are no uses of the PHI result the inner loop reduction
3113 won't be detected as possibly double-reduction by vectorizable_reduction
3114 because that tries to walk the PHI arg from the preheader edge which
3115 can be constant. See PR60382. */
3116 if (has_zero_uses (phi_name))
3117 return NULL;
3118 class loop *loop = (gimple_bb (phi))->loop_father;
3119 unsigned nphi_def_loop_uses = 0;
3120 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, phi_name)
3122 gimple *use_stmt = USE_STMT (use_p);
3123 if (is_gimple_debug (use_stmt))
3124 continue;
3126 if (!flow_bb_inside_loop_p (loop, gimple_bb (use_stmt)))
3128 if (dump_enabled_p ())
3129 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3130 "intermediate value used outside loop.\n");
3132 return NULL;
3135 nphi_def_loop_uses++;
3136 phi_use_stmt = use_stmt;
3139 tree latch_def = PHI_ARG_DEF_FROM_EDGE (phi, loop_latch_edge (loop));
3140 if (TREE_CODE (latch_def) != SSA_NAME)
3142 if (dump_enabled_p ())
3143 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3144 "reduction: not ssa_name: %T\n", latch_def);
3145 return NULL;
3148 stmt_vec_info def_stmt_info = loop_info->lookup_def (latch_def);
3149 if (!def_stmt_info
3150 || !flow_bb_inside_loop_p (loop, gimple_bb (def_stmt_info->stmt)))
3151 return NULL;
3153 bool nested_in_vect_loop
3154 = flow_loop_nested_p (LOOP_VINFO_LOOP (loop_info), loop);
3155 unsigned nlatch_def_loop_uses = 0;
3156 auto_vec<gphi *, 3> lcphis;
3157 bool inner_loop_of_double_reduc = false;
3158 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, latch_def)
3160 gimple *use_stmt = USE_STMT (use_p);
3161 if (is_gimple_debug (use_stmt))
3162 continue;
3163 if (flow_bb_inside_loop_p (loop, gimple_bb (use_stmt)))
3164 nlatch_def_loop_uses++;
3165 else
3167 /* We can have more than one loop-closed PHI. */
3168 lcphis.safe_push (as_a <gphi *> (use_stmt));
3169 if (nested_in_vect_loop
3170 && (STMT_VINFO_DEF_TYPE (loop_info->lookup_stmt (use_stmt))
3171 == vect_double_reduction_def))
3172 inner_loop_of_double_reduc = true;
3176 /* If we are vectorizing an inner reduction, it is executed in the
3177 original order only when we are not dealing with a double
3178 reduction. */
3179 if (nested_in_vect_loop && !inner_loop_of_double_reduc)
3181 if (dump_enabled_p ())
3182 report_vect_op (MSG_NOTE, def_stmt_info->stmt,
3183 "detected nested cycle: ");
3184 return def_stmt_info;
3187 /* If this isn't a nested cycle or if the nested cycle reduction value
3188 is used outside of the inner loop we cannot handle uses of the reduction
3189 value. */
3190 if (nlatch_def_loop_uses > 1 || nphi_def_loop_uses > 1)
3192 if (dump_enabled_p ())
3193 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3194 "reduction used in loop.\n");
3195 return NULL;
3198 /* If DEF_STMT is a phi node itself, we expect it to have a single argument
3199 defined in the inner loop. */
3200 if (gphi *def_stmt = dyn_cast <gphi *> (def_stmt_info->stmt))
3202 tree op1 = PHI_ARG_DEF (def_stmt, 0);
3203 if (gimple_phi_num_args (def_stmt) != 1
3204 || TREE_CODE (op1) != SSA_NAME)
3206 if (dump_enabled_p ())
3207 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3208 "unsupported phi node definition.\n");
3210 return NULL;
3213 gimple *def1 = SSA_NAME_DEF_STMT (op1);
3214 if (gimple_bb (def1)
3215 && flow_bb_inside_loop_p (loop, gimple_bb (def_stmt))
3216 && loop->inner
3217 && flow_bb_inside_loop_p (loop->inner, gimple_bb (def1))
3218 && is_gimple_assign (def1)
3219 && is_a <gphi *> (phi_use_stmt)
3220 && flow_bb_inside_loop_p (loop->inner, gimple_bb (phi_use_stmt)))
3222 if (dump_enabled_p ())
3223 report_vect_op (MSG_NOTE, def_stmt,
3224 "detected double reduction: ");
3226 *double_reduc = true;
3227 return def_stmt_info;
3230 return NULL;
3233 /* Look for the expression computing latch_def from the loop PHI result. */
3234 auto_vec<std::pair<ssa_op_iter, use_operand_p> > path;
3235 enum tree_code code;
3236 if (check_reduction_path (vect_location, loop, phi, latch_def, &code,
3237 path))
3239 STMT_VINFO_REDUC_CODE (phi_info) = code;
3240 if (code == COND_EXPR && !nested_in_vect_loop)
3241 STMT_VINFO_REDUC_TYPE (phi_info) = COND_REDUCTION;
3243 /* Fill in STMT_VINFO_REDUC_IDX and gather stmts for an SLP
3244 reduction chain for which the additional restriction is that
3245 all operations in the chain are the same. */
3246 auto_vec<stmt_vec_info, 8> reduc_chain;
3247 unsigned i;
3248 bool is_slp_reduc = !nested_in_vect_loop && code != COND_EXPR;
3249 for (i = path.length () - 1; i >= 1; --i)
3251 gimple *stmt = USE_STMT (path[i].second);
3252 stmt_vec_info stmt_info = loop_info->lookup_stmt (stmt);
3253 STMT_VINFO_REDUC_IDX (stmt_info)
3254 = path[i].second->use - gimple_assign_rhs1_ptr (stmt);
3255 enum tree_code stmt_code = gimple_assign_rhs_code (stmt);
3256 bool leading_conversion = (CONVERT_EXPR_CODE_P (stmt_code)
3257 && (i == 1 || i == path.length () - 1));
3258 if ((stmt_code != code && !leading_conversion)
3259 /* We can only handle the final value in epilogue
3260 generation for reduction chains. */
3261 || (i != 1 && !has_single_use (gimple_assign_lhs (stmt))))
3262 is_slp_reduc = false;
3263 /* For reduction chains we support trailing/leading
3264 conversions. We do not store those in the actual chain. */
3265 if (leading_conversion)
3266 continue;
3267 reduc_chain.safe_push (stmt_info);
3269 if (is_slp_reduc && reduc_chain.length () > 1)
3271 for (unsigned i = 0; i < reduc_chain.length () - 1; ++i)
3273 REDUC_GROUP_FIRST_ELEMENT (reduc_chain[i]) = reduc_chain[0];
3274 REDUC_GROUP_NEXT_ELEMENT (reduc_chain[i]) = reduc_chain[i+1];
3276 REDUC_GROUP_FIRST_ELEMENT (reduc_chain.last ()) = reduc_chain[0];
3277 REDUC_GROUP_NEXT_ELEMENT (reduc_chain.last ()) = NULL;
3279 /* Save the chain for further analysis in SLP detection. */
3280 LOOP_VINFO_REDUCTION_CHAINS (loop_info).safe_push (reduc_chain[0]);
3281 REDUC_GROUP_SIZE (reduc_chain[0]) = reduc_chain.length ();
3283 *reduc_chain_p = true;
3284 if (dump_enabled_p ())
3285 dump_printf_loc (MSG_NOTE, vect_location,
3286 "reduction: detected reduction chain\n");
3288 else if (dump_enabled_p ())
3289 dump_printf_loc (MSG_NOTE, vect_location,
3290 "reduction: detected reduction\n");
3292 return def_stmt_info;
3295 if (dump_enabled_p ())
3296 dump_printf_loc (MSG_NOTE, vect_location,
3297 "reduction: unknown pattern\n");
3299 return NULL;
3302 /* Calculate cost of peeling the loop PEEL_ITERS_PROLOGUE times. */
3303 int
3304 vect_get_known_peeling_cost (loop_vec_info loop_vinfo, int peel_iters_prologue,
3305 int *peel_iters_epilogue,
3306 stmt_vector_for_cost *scalar_cost_vec,
3307 stmt_vector_for_cost *prologue_cost_vec,
3308 stmt_vector_for_cost *epilogue_cost_vec)
3310 int retval = 0;
3311 int assumed_vf = vect_vf_for_cost (loop_vinfo);
3313 if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
3315 *peel_iters_epilogue = assumed_vf / 2;
3316 if (dump_enabled_p ())
3317 dump_printf_loc (MSG_NOTE, vect_location,
3318 "cost model: epilogue peel iters set to vf/2 "
3319 "because loop iterations are unknown .\n");
3321 /* If peeled iterations are known but the number of scalar loop
3322 iterations is unknown, count a taken branch per peeled loop. */
3323 retval = record_stmt_cost (prologue_cost_vec, 1, cond_branch_taken,
3324 NULL, 0, vect_prologue);
3325 retval += record_stmt_cost (epilogue_cost_vec, 1, cond_branch_taken,
3326 NULL, 0, vect_epilogue);
3328 else
3330 int niters = LOOP_VINFO_INT_NITERS (loop_vinfo);
3331 peel_iters_prologue = niters < peel_iters_prologue ?
3332 niters : peel_iters_prologue;
3333 *peel_iters_epilogue = (niters - peel_iters_prologue) % assumed_vf;
3334 /* If we need to peel for gaps but the computed epilogue peel count
3335 is zero, we still have to peel VF iterations. */
3336 if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo) && !*peel_iters_epilogue)
3337 *peel_iters_epilogue = assumed_vf;
3340 stmt_info_for_cost *si;
3341 int j;
3342 if (peel_iters_prologue)
3343 FOR_EACH_VEC_ELT (*scalar_cost_vec, j, si)
3344 retval += record_stmt_cost (prologue_cost_vec,
3345 si->count * peel_iters_prologue,
3346 si->kind, si->stmt_info, si->misalign,
3347 vect_prologue);
3348 if (*peel_iters_epilogue)
3349 FOR_EACH_VEC_ELT (*scalar_cost_vec, j, si)
3350 retval += record_stmt_cost (epilogue_cost_vec,
3351 si->count * *peel_iters_epilogue,
3352 si->kind, si->stmt_info, si->misalign,
3353 vect_epilogue);
3355 return retval;
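/* Worked example with hypothetical numbers: with known niters = 100,
   assumed_vf = 4 and peel_iters_prologue = 3 for alignment, the epilogue
   peels (100 - 3) % 4 = 1 iteration, and the scalar single-iteration cost
   vector is charged three times to the prologue and once to the epilogue.
   Had the remainder been 0 while peeling for gaps was required, a full
   assumed_vf of 4 iterations would have been charged to the epilogue.  */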
3358 /* Function vect_estimate_min_profitable_iters
3360 Return the number of iterations required for the vector version of the
3361 loop to be profitable relative to the cost of the scalar version of the
3362 loop.
3364 *RET_MIN_PROFITABLE_NITERS is a cost model profitability threshold
3365 of iterations for vectorization. -1 value means loop vectorization
3366 is not profitable. This returned value may be used for dynamic
3367 profitability check.
3369 *RET_MIN_PROFITABLE_ESTIMATE is a profitability threshold to be used
3370 for static check against estimated number of iterations. */
3372 static void
3373 vect_estimate_min_profitable_iters (loop_vec_info loop_vinfo,
3374 int *ret_min_profitable_niters,
3375 int *ret_min_profitable_estimate)
3377 int min_profitable_iters;
3378 int min_profitable_estimate;
3379 int peel_iters_prologue;
3380 int peel_iters_epilogue;
3381 unsigned vec_inside_cost = 0;
3382 int vec_outside_cost = 0;
3383 unsigned vec_prologue_cost = 0;
3384 unsigned vec_epilogue_cost = 0;
3385 int scalar_single_iter_cost = 0;
3386 int scalar_outside_cost = 0;
3387 int assumed_vf = vect_vf_for_cost (loop_vinfo);
3388 int npeel = LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo);
3389 void *target_cost_data = LOOP_VINFO_TARGET_COST_DATA (loop_vinfo);
3391 /* Cost model disabled. */
3392 if (unlimited_cost_model (LOOP_VINFO_LOOP (loop_vinfo)))
3394 if (dump_enabled_p ())
3395 dump_printf_loc (MSG_NOTE, vect_location, "cost model disabled.\n");
3396 *ret_min_profitable_niters = 0;
3397 *ret_min_profitable_estimate = 0;
3398 return;
3401 /* Requires loop versioning tests to handle misalignment. */
3402 if (LOOP_REQUIRES_VERSIONING_FOR_ALIGNMENT (loop_vinfo))
3404 /* FIXME: Make cost depend on complexity of individual check. */
3405 unsigned len = LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).length ();
3406 (void) add_stmt_cost (loop_vinfo, target_cost_data, len, vector_stmt,
3407 NULL, 0, vect_prologue);
3408 if (dump_enabled_p ())
3409 dump_printf (MSG_NOTE,
3410 "cost model: Adding cost of checks for loop "
3411 "versioning to treat misalignment.\n");
3414 /* Requires loop versioning with alias checks. */
3415 if (LOOP_REQUIRES_VERSIONING_FOR_ALIAS (loop_vinfo))
3417 /* FIXME: Make cost depend on complexity of individual check. */
3418 unsigned len = LOOP_VINFO_COMP_ALIAS_DDRS (loop_vinfo).length ();
3419 (void) add_stmt_cost (loop_vinfo, target_cost_data, len, vector_stmt,
3420 NULL, 0, vect_prologue);
3421 len = LOOP_VINFO_CHECK_UNEQUAL_ADDRS (loop_vinfo).length ();
3422 if (len)
3423 /* Count LEN - 1 ANDs and LEN comparisons. */
3424 (void) add_stmt_cost (loop_vinfo, target_cost_data, len * 2 - 1,
3425 scalar_stmt, NULL, 0, vect_prologue);
3426 len = LOOP_VINFO_LOWER_BOUNDS (loop_vinfo).length ();
3427 if (len)
3429 /* Count LEN - 1 ANDs and LEN comparisons. */
3430 unsigned int nstmts = len * 2 - 1;
3431 /* +1 for each bias that needs adding. */
3432 for (unsigned int i = 0; i < len; ++i)
3433 if (!LOOP_VINFO_LOWER_BOUNDS (loop_vinfo)[i].unsigned_p)
3434 nstmts += 1;
3435 (void) add_stmt_cost (loop_vinfo, target_cost_data, nstmts,
3436 scalar_stmt, NULL, 0, vect_prologue);
3438 if (dump_enabled_p ())
3439 dump_printf (MSG_NOTE,
3440 "cost model: Adding cost of checks for loop "
3441 "versioning aliasing.\n");
3444 /* Requires loop versioning with niter checks. */
3445 if (LOOP_REQUIRES_VERSIONING_FOR_NITERS (loop_vinfo))
3447 /* FIXME: Make cost depend on complexity of individual check. */
3448 (void) add_stmt_cost (loop_vinfo, target_cost_data, 1, vector_stmt,
3449 NULL, 0, vect_prologue);
3450 if (dump_enabled_p ())
3451 dump_printf (MSG_NOTE,
3452 "cost model: Adding cost of checks for loop "
3453 "versioning niters.\n");
3456 if (LOOP_REQUIRES_VERSIONING (loop_vinfo))
3457 (void) add_stmt_cost (loop_vinfo, target_cost_data, 1, cond_branch_taken,
3458 NULL, 0, vect_prologue);
3460 /* Count statements in scalar loop. Using this as scalar cost for a single
3461 iteration for now.
3463 TODO: Add outer loop support.
3465 TODO: Consider assigning different costs to different scalar
3466 statements. */
3468 scalar_single_iter_cost
3469 = LOOP_VINFO_SINGLE_SCALAR_ITERATION_COST (loop_vinfo);
3471 /* Add additional cost for the peeled instructions in prologue and epilogue
3472 loop. (For fully-masked loops there will be no peeling.)
3474 FORNOW: If we don't know the value of peel_iters for prologue or epilogue
3475 at compile-time - we assume it's vf/2 (the worst would be vf-1).
3477 TODO: Build an expression that represents peel_iters for prologue and
3478 epilogue to be used in a run-time test. */
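/* For illustration only, with made-up numbers: with an assumed VF of 8 and
unknown peel counts (the npeel < 0 case below), we charge 4 prologue and
4 epilogue scalar iterations plus the branch guards costed below. */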
3480 if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
3482 peel_iters_prologue = 0;
3483 peel_iters_epilogue = 0;
3485 if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo))
3487 /* We need to peel exactly one iteration. */
3488 peel_iters_epilogue += 1;
3489 stmt_info_for_cost *si;
3490 int j;
3491 FOR_EACH_VEC_ELT (LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo),
3492 j, si)
3493 (void) add_stmt_cost (loop_vinfo, target_cost_data, si->count,
3494 si->kind, si->stmt_info, si->misalign,
3495 vect_epilogue);
3498 /* Calculate how many masks we need to generate. */
3499 unsigned int num_masks = 0;
3500 rgroup_masks *rgm;
3501 unsigned int num_vectors_m1;
3502 FOR_EACH_VEC_ELT (LOOP_VINFO_MASKS (loop_vinfo), num_vectors_m1, rgm)
3503 if (rgm->mask_type)
3504 num_masks += num_vectors_m1 + 1;
3505 gcc_assert (num_masks > 0);
3507 /* In the worst case, we need to generate each mask in the prologue
3508 and in the loop body. One of the loop body mask instructions
3509 replaces the comparison in the scalar loop, and since we don't
3510 count the scalar comparison against the scalar body, we shouldn't
3511 count that vector instruction against the vector body either.
3513 Sometimes we can use unpacks instead of generating prologue
3514 masks and sometimes the prologue mask will fold to a constant,
3515 so the actual prologue cost might be smaller. However, it's
3516 simpler and safer to use the worst-case cost; if this ends up
3517 being the tie-breaker between vectorizing or not, then it's
3518 probably better not to vectorize. */
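/* For instance, two mask rgroups needing one and two vectors respectively
give num_masks = 1 + 2 = 3, so the calls below charge three mask
computations to the prologue and two to the loop body. */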
3519 (void) add_stmt_cost (loop_vinfo,
3520 target_cost_data, num_masks, vector_stmt,
3521 NULL, 0, vect_prologue);
3522 (void) add_stmt_cost (loop_vinfo,
3523 target_cost_data, num_masks - 1, vector_stmt,
3524 NULL, 0, vect_body);
3526 else if (npeel < 0)
3528 peel_iters_prologue = assumed_vf / 2;
3529 if (dump_enabled_p ())
3530 dump_printf (MSG_NOTE, "cost model: "
3531 "prologue peel iters set to vf/2.\n");
3533 /* If peeling for alignment is unknown, the loop bound of the main loop
3534 becomes unknown. */
3535 peel_iters_epilogue = assumed_vf / 2;
3536 if (dump_enabled_p ())
3537 dump_printf (MSG_NOTE, "cost model: "
3538 "epilogue peel iters set to vf/2 because "
3539 "peeling for alignment is unknown.\n");
3541 /* If peeled iterations are unknown, count a taken branch and a not taken
3542 branch per peeled loop. Even if scalar loop iterations are known,
3543 vector iterations are not known since peeled prologue iterations are
3544 not known. Hence guards remain the same. */
3545 (void) add_stmt_cost (loop_vinfo, target_cost_data, 1, cond_branch_taken,
3546 NULL, 0, vect_prologue);
3547 (void) add_stmt_cost (loop_vinfo,
3548 target_cost_data, 1, cond_branch_not_taken,
3549 NULL, 0, vect_prologue);
3550 (void) add_stmt_cost (loop_vinfo, target_cost_data, 1, cond_branch_taken,
3551 NULL, 0, vect_epilogue);
3552 (void) add_stmt_cost (loop_vinfo,
3553 target_cost_data, 1, cond_branch_not_taken,
3554 NULL, 0, vect_epilogue);
3555 stmt_info_for_cost *si;
3556 int j;
3557 FOR_EACH_VEC_ELT (LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo), j, si)
3559 (void) add_stmt_cost (loop_vinfo, target_cost_data,
3560 si->count * peel_iters_prologue,
3561 si->kind, si->stmt_info, si->misalign,
3562 vect_prologue);
3563 (void) add_stmt_cost (loop_vinfo, target_cost_data,
3564 si->count * peel_iters_epilogue,
3565 si->kind, si->stmt_info, si->misalign,
3566 vect_epilogue);
3569 else
3571 stmt_vector_for_cost prologue_cost_vec, epilogue_cost_vec;
3572 stmt_info_for_cost *si;
3573 int j;
3574 void *data = LOOP_VINFO_TARGET_COST_DATA (loop_vinfo);
3576 prologue_cost_vec.create (2);
3577 epilogue_cost_vec.create (2);
3578 peel_iters_prologue = npeel;
3580 (void) vect_get_known_peeling_cost (loop_vinfo, peel_iters_prologue,
3581 &peel_iters_epilogue,
3582 &LOOP_VINFO_SCALAR_ITERATION_COST
3583 (loop_vinfo),
3584 &prologue_cost_vec,
3585 &epilogue_cost_vec);
3587 FOR_EACH_VEC_ELT (prologue_cost_vec, j, si)
3588 (void) add_stmt_cost (loop_vinfo,
3589 data, si->count, si->kind, si->stmt_info,
3590 si->misalign, vect_prologue);
3592 FOR_EACH_VEC_ELT (epilogue_cost_vec, j, si)
3593 (void) add_stmt_cost (loop_vinfo,
3594 data, si->count, si->kind, si->stmt_info,
3595 si->misalign, vect_epilogue);
3597 prologue_cost_vec.release ();
3598 epilogue_cost_vec.release ();
3601 /* FORNOW: The scalar outside cost is incremented in one of the
3602 following ways:
3604 1. The vectorizer checks for alignment and aliasing and generates
3605 a condition that allows dynamic vectorization. A cost model
3606 check is ANDED with the versioning condition. Hence scalar code
3607 path now has the added cost of the versioning check.
3609 if (cost > th & versioning_check)
3610 jmp to vector code
3612 Hence run-time scalar is incremented by not-taken branch cost.
3614 2. The vectorizer then checks if a prologue is required. If the
3615 cost model check was not done before during versioning, it has to
3616 be done before the prologue check.
3618 if (cost <= th)
3619 prologue = scalar_iters
3620 if (prologue == 0)
3621 jmp to vector code
3622 else
3623 execute prologue
3624 if (prologue == num_iters)
3625 go to exit
3627 Hence the run-time scalar cost is incremented by a taken branch,
3628 plus a not-taken branch, plus a taken branch cost.
3630 3. The vectorizer then checks if an epilogue is required. If the
3631 cost model check was not done before during prologue check, it
3632 has to be done with the epilogue check.
3634 if (prologue == 0)
3635 jmp to vector code
3636 else
3637 execute prologue
3638 if (prologue == num_iters)
3639 go to exit
3640 vector code:
3641 if ((cost <= th) | (scalar_iters-prologue-epilogue == 0))
3642 jmp to epilogue
3644 Hence the run-time scalar cost should be incremented by 2 taken
3645 branches.
3647 TODO: The back end may reorder the BBs differently and reverse
3648 conditions/branch directions. Change the estimates below to
3649 something more reasonable. */
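/* As a purely illustrative example, if a taken branch costs 3 units and a
not-taken branch costs 1, the versioning case below adds 1, the
prologue-check case adds 2 * 3 + 1 = 7, and the epilogue-check case adds
2 * 3 = 6 to scalar_outside_cost. */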
3651 /* If the number of iterations is known and we do not do versioning, we can
3652 decide whether to vectorize at compile time. Hence the scalar version
3653 does not carry cost model guard costs. */
3654 if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
3655 || LOOP_REQUIRES_VERSIONING (loop_vinfo))
3657 /* Cost model check occurs at versioning. */
3658 if (LOOP_REQUIRES_VERSIONING (loop_vinfo))
3659 scalar_outside_cost += vect_get_stmt_cost (cond_branch_not_taken);
3660 else
3662 /* Cost model check occurs at prologue generation. */
3663 if (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) < 0)
3664 scalar_outside_cost += 2 * vect_get_stmt_cost (cond_branch_taken)
3665 + vect_get_stmt_cost (cond_branch_not_taken);
3666 /* Cost model check occurs at epilogue generation. */
3667 else
3668 scalar_outside_cost += 2 * vect_get_stmt_cost (cond_branch_taken);
3672 /* Complete the target-specific cost calculations. */
3673 finish_cost (LOOP_VINFO_TARGET_COST_DATA (loop_vinfo), &vec_prologue_cost,
3674 &vec_inside_cost, &vec_epilogue_cost);
3676 vec_outside_cost = (int)(vec_prologue_cost + vec_epilogue_cost);
3678 /* Stash the costs so that we can compare two loop_vec_infos. */
3679 loop_vinfo->vec_inside_cost = vec_inside_cost;
3680 loop_vinfo->vec_outside_cost = vec_outside_cost;
3682 if (dump_enabled_p ())
3684 dump_printf_loc (MSG_NOTE, vect_location, "Cost model analysis: \n");
3685 dump_printf (MSG_NOTE, " Vector inside of loop cost: %d\n",
3686 vec_inside_cost);
3687 dump_printf (MSG_NOTE, " Vector prologue cost: %d\n",
3688 vec_prologue_cost);
3689 dump_printf (MSG_NOTE, " Vector epilogue cost: %d\n",
3690 vec_epilogue_cost);
3691 dump_printf (MSG_NOTE, " Scalar iteration cost: %d\n",
3692 scalar_single_iter_cost);
3693 dump_printf (MSG_NOTE, " Scalar outside cost: %d\n",
3694 scalar_outside_cost);
3695 dump_printf (MSG_NOTE, " Vector outside cost: %d\n",
3696 vec_outside_cost);
3697 dump_printf (MSG_NOTE, " prologue iterations: %d\n",
3698 peel_iters_prologue);
3699 dump_printf (MSG_NOTE, " epilogue iterations: %d\n",
3700 peel_iters_epilogue);
3703 /* Calculate number of iterations required to make the vector version
3704 profitable, relative to the loop bodies only. The following condition
3705 must hold true:
3706 SIC * niters + SOC > VIC * ((niters - NPEEL) / VF) + VOC
3707 where
3708 SIC = scalar iteration cost, VIC = vector iteration cost,
3709 VOC = vector outside cost, VF = vectorization factor,
3710 NPEEL = prologue iterations + epilogue iterations,
3711 SOC = scalar outside cost for run time cost model check. */
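/* Worked example with made-up costs: SIC = 4, VIC = 6, VF = 4, NPEEL = 0,
VOC = 20, SOC = 0. Each vector iteration then saves 4 * 4 - 6 = 10 units,
and both variants cost 32 units at niters = 8, so the break-even point is
just above 8 scalar iterations. */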
3713 int saving_per_viter = (scalar_single_iter_cost * assumed_vf
3714 - vec_inside_cost);
3715 if (saving_per_viter <= 0)
3717 if (LOOP_VINFO_LOOP (loop_vinfo)->force_vectorize)
3718 warning_at (vect_location.get_location_t (), OPT_Wopenmp_simd,
3719 "vectorization did not happen for a simd loop");
3721 if (dump_enabled_p ())
3722 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3723 "cost model: the vector iteration cost = %d "
3724 "divided by the scalar iteration cost = %d "
3725 "is greater or equal to the vectorization factor = %d"
3726 ".\n",
3727 vec_inside_cost, scalar_single_iter_cost, assumed_vf);
3728 *ret_min_profitable_niters = -1;
3729 *ret_min_profitable_estimate = -1;
3730 return;
3733 /* ??? The "if" arm is written to handle all cases; see below for what
3734 we would do for !LOOP_VINFO_FULLY_MASKED_P. */
3735 if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
3737 /* Rewriting the condition above in terms of the number of
3738 vector iterations (vniters) rather than the number of
3739 scalar iterations (niters) gives:
3741 SIC * (vniters * VF + NPEEL) + SOC > VIC * vniters + VOC
3743 <==> vniters * (SIC * VF - VIC) > VOC - SIC * NPEEL - SOC
3745 For integer N, X and Y when X > 0:
3747 N * X > Y <==> N >= (Y /[floor] X) + 1. */
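/* E.g. Y = 17 and X = 5 give N >= 17 / 5 + 1 = 4; indeed 4 * 5 = 20 > 17,
while 3 * 5 = 15 is not. */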
3748 int outside_overhead = (vec_outside_cost
3749 - scalar_single_iter_cost * peel_iters_prologue
3750 - scalar_single_iter_cost * peel_iters_epilogue
3751 - scalar_outside_cost);
3752 /* We're only interested in cases that require at least one
3753 vector iteration. */
3754 int min_vec_niters = 1;
3755 if (outside_overhead > 0)
3756 min_vec_niters = outside_overhead / saving_per_viter + 1;
3758 if (dump_enabled_p ())
3759 dump_printf (MSG_NOTE, " Minimum number of vector iterations: %d\n",
3760 min_vec_niters);
3762 if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
3764 /* Now that we know the minimum number of vector iterations,
3765 find the minimum niters for which the scalar cost is larger:
3767 SIC * niters > VIC * vniters + VOC - SOC
3769 We know that the minimum niters is no more than
3770 vniters * VF + NPEEL, but it might be (and often is) less
3771 than that if a partial vector iteration is cheaper than the
3772 equivalent scalar code. */
3773 int threshold = (vec_inside_cost * min_vec_niters
3774 + vec_outside_cost
3775 - scalar_outside_cost);
3776 if (threshold <= 0)
3777 min_profitable_iters = 1;
3778 else
3779 min_profitable_iters = threshold / scalar_single_iter_cost + 1;
3781 else
3782 /* Convert the number of vector iterations into a number of
3783 scalar iterations. */
3784 min_profitable_iters = (min_vec_niters * assumed_vf
3785 + peel_iters_prologue
3786 + peel_iters_epilogue);
3788 else
3790 min_profitable_iters = ((vec_outside_cost - scalar_outside_cost)
3791 * assumed_vf
3792 - vec_inside_cost * peel_iters_prologue
3793 - vec_inside_cost * peel_iters_epilogue);
3794 if (min_profitable_iters <= 0)
3795 min_profitable_iters = 0;
3796 else
3798 min_profitable_iters /= saving_per_viter;
3800 if ((scalar_single_iter_cost * assumed_vf * min_profitable_iters)
3801 <= (((int) vec_inside_cost * min_profitable_iters)
3802 + (((int) vec_outside_cost - scalar_outside_cost)
3803 * assumed_vf)))
3804 min_profitable_iters++;
3808 if (dump_enabled_p ())
3809 dump_printf (MSG_NOTE,
3810 " Calculated minimum iters for profitability: %d\n",
3811 min_profitable_iters);
3813 if (!LOOP_VINFO_FULLY_MASKED_P (loop_vinfo)
3814 && min_profitable_iters < (assumed_vf + peel_iters_prologue))
3815 /* We want the vectorized loop to execute at least once. */
3816 min_profitable_iters = assumed_vf + peel_iters_prologue;
3818 if (dump_enabled_p ())
3819 dump_printf_loc (MSG_NOTE, vect_location,
3820 " Runtime profitability threshold = %d\n",
3821 min_profitable_iters);
3823 *ret_min_profitable_niters = min_profitable_iters;
3825 /* Calculate number of iterations required to make the vector version
3826 profitable, relative to the loop bodies only.
3828 The non-vectorized variant costs SIC * niters; the vector variant must win
3829 over it at the expected loop trip count, i.e. the following condition must hold:
3830 SIC * niters > VIC * ((niters - NPEEL) / VF) + VOC + SOC */
3832 if (vec_outside_cost <= 0)
3833 min_profitable_estimate = 0;
3834 else if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
3836 /* This is a repeat of the code above, but with + SOC rather
3837 than - SOC. */
3838 int outside_overhead = (vec_outside_cost
3839 - scalar_single_iter_cost * peel_iters_prologue
3840 - scalar_single_iter_cost * peel_iters_epilogue
3841 + scalar_outside_cost);
3842 int min_vec_niters = 1;
3843 if (outside_overhead > 0)
3844 min_vec_niters = outside_overhead / saving_per_viter + 1;
3846 if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
3848 int threshold = (vec_inside_cost * min_vec_niters
3849 + vec_outside_cost
3850 + scalar_outside_cost);
3851 min_profitable_estimate = threshold / scalar_single_iter_cost + 1;
3853 else
3854 min_profitable_estimate = (min_vec_niters * assumed_vf
3855 + peel_iters_prologue
3856 + peel_iters_epilogue);
3858 else
3860 min_profitable_estimate = ((vec_outside_cost + scalar_outside_cost)
3861 * assumed_vf
3862 - vec_inside_cost * peel_iters_prologue
3863 - vec_inside_cost * peel_iters_epilogue)
3864 / ((scalar_single_iter_cost * assumed_vf)
3865 - vec_inside_cost);
3867 min_profitable_estimate = MAX (min_profitable_estimate, min_profitable_iters);
3868 if (dump_enabled_p ())
3869 dump_printf_loc (MSG_NOTE, vect_location,
3870 " Static estimate profitability threshold = %d\n",
3871 min_profitable_estimate);
3873 *ret_min_profitable_estimate = min_profitable_estimate;
3876 /* Writes into SEL a mask for a vec_perm, equivalent to a vec_shr by OFFSET
3877 vector elements (not bits) for a vector with NELT elements. */
3878 static void
3879 calc_vec_perm_mask_for_shift (unsigned int offset, unsigned int nelt,
3880 vec_perm_builder *sel)
3882 /* The encoding is a single stepped pattern. Any wrap-around is handled
3883 by vec_perm_indices. */
3884 sel->new_vector (nelt, 1, 3);
3885 for (unsigned int i = 0; i < 3; i++)
3886 sel->quick_push (i + offset);
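/* For example, OFFSET == 2 and NELT == 8 encode the series 2, 3, 4, which
vec_perm_indices extends to { 2, 3, ..., 9 }; in a two-input permute this
selects the last six elements of the first vector followed by the first
two of the second, i.e. a shift by two elements. */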
3889 /* Checks whether the target supports whole-vector shifts for vectors of mode
3890 MODE. This is the case if _either_ the platform handles vec_shr_optab, _or_
3891 it supports vec_perm_const with masks for all necessary shift amounts. */
3892 static bool
3893 have_whole_vector_shift (machine_mode mode)
3895 if (optab_handler (vec_shr_optab, mode) != CODE_FOR_nothing)
3896 return true;
3898 /* Variable-length vectors should be handled via the optab. */
3899 unsigned int nelt;
3900 if (!GET_MODE_NUNITS (mode).is_constant (&nelt))
3901 return false;
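/* E.g. for NELT == 8 the loop below checks shift amounts of 4, 2 and 1
elements, which are the amounts a log2-style final reduction uses. */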
3903 vec_perm_builder sel;
3904 vec_perm_indices indices;
3905 for (unsigned int i = nelt / 2; i >= 1; i /= 2)
3907 calc_vec_perm_mask_for_shift (i, nelt, &sel);
3908 indices.new_vector (sel, 2, nelt);
3909 if (!can_vec_perm_const_p (mode, indices, false))
3910 return false;
3912 return true;
3915 /* TODO: Close dependency between vect_model_*_cost and vectorizable_*
3916 functions. Design better to avoid maintenance issues. */
3918 /* Function vect_model_reduction_cost.
3920 Models cost for a reduction operation, including the vector ops
3921 generated within the strip-mine loop, the initial definition before
3922 the loop, and the epilogue code that must be generated. */
3924 static void
3925 vect_model_reduction_cost (loop_vec_info loop_vinfo,
3926 stmt_vec_info stmt_info, internal_fn reduc_fn,
3927 vect_reduction_type reduction_type,
3928 int ncopies, stmt_vector_for_cost *cost_vec)
3930 int prologue_cost = 0, epilogue_cost = 0, inside_cost;
3931 enum tree_code code;
3932 optab optab;
3933 tree vectype;
3934 machine_mode mode;
3935 class loop *loop = NULL;
3937 if (loop_vinfo)
3938 loop = LOOP_VINFO_LOOP (loop_vinfo);
3940 /* Condition reductions generate two reductions in the loop. */
3941 if (reduction_type == COND_REDUCTION)
3942 ncopies *= 2;
3944 vectype = STMT_VINFO_VECTYPE (stmt_info);
3945 mode = TYPE_MODE (vectype);
3946 stmt_vec_info orig_stmt_info = vect_orig_stmt (stmt_info);
3948 code = gimple_assign_rhs_code (orig_stmt_info->stmt);
3950 if (reduction_type == EXTRACT_LAST_REDUCTION)
3951 /* No extra instructions are needed in the prologue. The loop body
3952 operations are costed in vectorizable_condition. */
3953 inside_cost = 0;
3954 else if (reduction_type == FOLD_LEFT_REDUCTION)
3956 /* No extra instructions needed in the prologue. */
3957 prologue_cost = 0;
3959 if (reduc_fn != IFN_LAST)
3960 /* Count one reduction-like operation per vector. */
3961 inside_cost = record_stmt_cost (cost_vec, ncopies, vec_to_scalar,
3962 stmt_info, 0, vect_body);
3963 else
3965 /* Use NELEMENTS extracts and NELEMENTS scalar ops. */
3966 unsigned int nelements = ncopies * vect_nunits_for_cost (vectype);
3967 inside_cost = record_stmt_cost (cost_vec, nelements,
3968 vec_to_scalar, stmt_info, 0,
3969 vect_body);
3970 inside_cost += record_stmt_cost (cost_vec, nelements,
3971 scalar_stmt, stmt_info, 0,
3972 vect_body);
3975 else
3977 /* Add in cost for initial definition.
3978 For cond reduction we have four vectors: initial index, step,
3979 initial result of the data reduction, initial value of the index
3980 reduction. */
3981 int prologue_stmts = reduction_type == COND_REDUCTION ? 4 : 1;
3982 prologue_cost += record_stmt_cost (cost_vec, prologue_stmts,
3983 scalar_to_vec, stmt_info, 0,
3984 vect_prologue);
3986 /* Cost of reduction op inside loop. */
3987 inside_cost = record_stmt_cost (cost_vec, ncopies, vector_stmt,
3988 stmt_info, 0, vect_body);
3991 /* Determine cost of epilogue code.
3993 We have a reduction operator that will reduce the vector in one statement.
3994 Also requires scalar extract. */
3996 if (!loop || !nested_in_vect_loop_p (loop, orig_stmt_info))
3998 if (reduc_fn != IFN_LAST)
4000 if (reduction_type == COND_REDUCTION)
4002 /* An EQ stmt and a COND_EXPR stmt. */
4003 epilogue_cost += record_stmt_cost (cost_vec, 2,
4004 vector_stmt, stmt_info, 0,
4005 vect_epilogue);
4006 /* Reduction of the max index and a reduction of the found
4007 values. */
4008 epilogue_cost += record_stmt_cost (cost_vec, 2,
4009 vec_to_scalar, stmt_info, 0,
4010 vect_epilogue);
4011 /* A broadcast of the max value. */
4012 epilogue_cost += record_stmt_cost (cost_vec, 1,
4013 scalar_to_vec, stmt_info, 0,
4014 vect_epilogue);
4016 else
4018 epilogue_cost += record_stmt_cost (cost_vec, 1, vector_stmt,
4019 stmt_info, 0, vect_epilogue);
4020 epilogue_cost += record_stmt_cost (cost_vec, 1,
4021 vec_to_scalar, stmt_info, 0,
4022 vect_epilogue);
4025 else if (reduction_type == COND_REDUCTION)
4027 unsigned estimated_nunits = vect_nunits_for_cost (vectype);
4028 /* Extraction of scalar elements. */
4029 epilogue_cost += record_stmt_cost (cost_vec,
4030 2 * estimated_nunits,
4031 vec_to_scalar, stmt_info, 0,
4032 vect_epilogue);
4033 /* Scalar max reductions via COND_EXPR / MAX_EXPR. */
4034 epilogue_cost += record_stmt_cost (cost_vec,
4035 2 * estimated_nunits - 3,
4036 scalar_stmt, stmt_info, 0,
4037 vect_epilogue);
4039 else if (reduction_type == EXTRACT_LAST_REDUCTION
4040 || reduction_type == FOLD_LEFT_REDUCTION)
4041 /* No extra instructions needed in the epilogue. */
4043 else
4045 int vec_size_in_bits = tree_to_uhwi (TYPE_SIZE (vectype));
4046 tree bitsize =
4047 TYPE_SIZE (TREE_TYPE (gimple_assign_lhs (orig_stmt_info->stmt)));
4048 int element_bitsize = tree_to_uhwi (bitsize);
4049 int nelements = vec_size_in_bits / element_bitsize;
4051 if (code == COND_EXPR)
4052 code = MAX_EXPR;
4054 optab = optab_for_tree_code (code, vectype, optab_default);
4056 /* We have a whole vector shift available. */
4057 if (optab != unknown_optab
4058 && VECTOR_MODE_P (mode)
4059 && optab_handler (optab, mode) != CODE_FOR_nothing
4060 && have_whole_vector_shift (mode))
4062 /* Final reduction via vector shifts and the reduction operator.
4063 Also requires scalar extract. */
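/* E.g. 8 elements cost exact_log2 (8) * 2 = 6 vector statements (three
shift/op pairs) plus the single extract charged below. */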
4064 epilogue_cost += record_stmt_cost (cost_vec,
4065 exact_log2 (nelements) * 2,
4066 vector_stmt, stmt_info, 0,
4067 vect_epilogue);
4068 epilogue_cost += record_stmt_cost (cost_vec, 1,
4069 vec_to_scalar, stmt_info, 0,
4070 vect_epilogue);
4072 else
4073 /* Use extracts and reduction op for final reduction. For N
4074 elements, we have N extracts and N-1 reduction ops. */
4075 epilogue_cost += record_stmt_cost (cost_vec,
4076 nelements + nelements - 1,
4077 vector_stmt, stmt_info, 0,
4078 vect_epilogue);
4082 if (dump_enabled_p ())
4083 dump_printf (MSG_NOTE,
4084 "vect_model_reduction_cost: inside_cost = %d, "
4085 "prologue_cost = %d, epilogue_cost = %d .\n", inside_cost,
4086 prologue_cost, epilogue_cost);
4090 /* Function vect_model_induction_cost.
4092 Models cost for induction operations. */
4094 static void
4095 vect_model_induction_cost (stmt_vec_info stmt_info, int ncopies,
4096 stmt_vector_for_cost *cost_vec)
4098 unsigned inside_cost, prologue_cost;
4100 if (PURE_SLP_STMT (stmt_info))
4101 return;
4103 /* loop cost for vec_loop. */
4104 inside_cost = record_stmt_cost (cost_vec, ncopies, vector_stmt,
4105 stmt_info, 0, vect_body);
4107 /* prologue cost for vec_init and vec_step. */
4108 prologue_cost = record_stmt_cost (cost_vec, 2, scalar_to_vec,
4109 stmt_info, 0, vect_prologue);
4111 if (dump_enabled_p ())
4112 dump_printf_loc (MSG_NOTE, vect_location,
4113 "vect_model_induction_cost: inside_cost = %d, "
4114 "prologue_cost = %d .\n", inside_cost, prologue_cost);
4119 /* Function get_initial_def_for_reduction
4121 Input:
4122 STMT_VINFO - a stmt that performs a reduction operation in the loop.
4123 INIT_VAL - the initial value of the reduction variable
4125 Output:
4126 ADJUSTMENT_DEF - a tree that holds a value to be added to the final result
4127 of the reduction (used for adjusting the epilog - see below).
4128 Return a vector variable, initialized according to the operation that
4129 STMT_VINFO performs. This vector will be used as the initial value
4130 of the vector of partial results.
4132 Option1 (adjust in epilog): Initialize the vector as follows:
4133 add/bit or/xor: [0,0,...,0,0]
4134 mult/bit and: [1,1,...,1,1]
4135 min/max/cond_expr: [init_val,init_val,..,init_val,init_val]
4136 and when necessary (e.g. add/mult case) let the caller know
4137 that it needs to adjust the result by init_val.
4139 Option2: Initialize the vector as follows:
4140 add/bit or/xor: [init_val,0,0,...,0]
4141 mult/bit and: [init_val,1,1,...,1]
4142 min/max/cond_expr: [init_val,init_val,...,init_val]
4143 and no adjustments are needed.
4145 For example, for the following code:
4147 s = init_val;
4148 for (i=0;i<n;i++)
4149 s = s + a[i];
4151 STMT_VINFO is 's = s + a[i]', and the reduction variable is 's'.
4152 For a vector of 4 units, we want to return either [0,0,0,init_val],
4153 or [0,0,0,0] and let the caller know that it needs to adjust
4154 the result at the end by 'init_val'.
4156 FORNOW, we use the 'adjust in epilog' scheme (Option1) when ADJUSTMENT_DEF
4157 is not NULL, because the initialization vector is then simpler (the same
4158 element in all entries); otherwise we use Option2.
4160 A cost model should help decide between these two schemes. */
4162 static tree
4163 get_initial_def_for_reduction (loop_vec_info loop_vinfo,
4164 stmt_vec_info stmt_vinfo,
4165 enum tree_code code, tree init_val,
4166 tree *adjustment_def)
4168 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
4169 tree scalar_type = TREE_TYPE (init_val);
4170 tree vectype = get_vectype_for_scalar_type (loop_vinfo, scalar_type);
4171 tree def_for_init;
4172 tree init_def;
4173 REAL_VALUE_TYPE real_init_val = dconst0;
4174 int int_init_val = 0;
4175 gimple_seq stmts = NULL;
4177 gcc_assert (vectype);
4179 gcc_assert (POINTER_TYPE_P (scalar_type) || INTEGRAL_TYPE_P (scalar_type)
4180 || SCALAR_FLOAT_TYPE_P (scalar_type));
4182 gcc_assert (nested_in_vect_loop_p (loop, stmt_vinfo)
4183 || loop == (gimple_bb (stmt_vinfo->stmt))->loop_father);
4185 /* ADJUSTMENT_DEF is NULL when called from
4186 vect_create_epilog_for_reduction to vectorize double reduction. */
4187 if (adjustment_def)
4188 *adjustment_def = NULL;
4190 switch (code)
4192 case WIDEN_SUM_EXPR:
4193 case DOT_PROD_EXPR:
4194 case SAD_EXPR:
4195 case PLUS_EXPR:
4196 case MINUS_EXPR:
4197 case BIT_IOR_EXPR:
4198 case BIT_XOR_EXPR:
4199 case MULT_EXPR:
4200 case BIT_AND_EXPR:
4202 if (code == MULT_EXPR)
4204 real_init_val = dconst1;
4205 int_init_val = 1;
4208 if (code == BIT_AND_EXPR)
4209 int_init_val = -1;
4211 if (SCALAR_FLOAT_TYPE_P (scalar_type))
4212 def_for_init = build_real (scalar_type, real_init_val);
4213 else
4214 def_for_init = build_int_cst (scalar_type, int_init_val);
4216 if (adjustment_def || operand_equal_p (def_for_init, init_val, 0))
4218 /* Option1: the first element is '0' or '1' as well. */
4219 if (!operand_equal_p (def_for_init, init_val, 0))
4220 *adjustment_def = init_val;
4221 init_def = gimple_build_vector_from_val (&stmts, vectype,
4222 def_for_init);
4224 else if (!TYPE_VECTOR_SUBPARTS (vectype).is_constant ())
4226 /* Option2 (variable length): the first element is INIT_VAL. */
4227 init_def = gimple_build_vector_from_val (&stmts, vectype,
4228 def_for_init);
4229 init_def = gimple_build (&stmts, CFN_VEC_SHL_INSERT,
4230 vectype, init_def, init_val);
4232 else
4234 /* Option2: the first element is INIT_VAL. */
4235 tree_vector_builder elts (vectype, 1, 2);
4236 elts.quick_push (init_val);
4237 elts.quick_push (def_for_init);
4238 init_def = gimple_build_vector (&stmts, &elts);
4241 break;
4243 case MIN_EXPR:
4244 case MAX_EXPR:
4245 case COND_EXPR:
4247 init_val = gimple_convert (&stmts, TREE_TYPE (vectype), init_val);
4248 init_def = gimple_build_vector_from_val (&stmts, vectype, init_val);
4250 break;
4252 default:
4253 gcc_unreachable ();
4256 if (stmts)
4257 gsi_insert_seq_on_edge_immediate (loop_preheader_edge (loop), stmts);
4258 return init_def;
4261 /* Get at the initial defs for the reduction PHIs in SLP_NODE.
4262 NUMBER_OF_VECTORS is the number of vector defs to create.
4263 If NEUTRAL_OP is nonnull, introducing extra elements of that
4264 value will not change the result. */
4266 static void
4267 get_initial_defs_for_reduction (vec_info *vinfo,
4268 slp_tree slp_node,
4269 vec<tree> *vec_oprnds,
4270 unsigned int number_of_vectors,
4271 bool reduc_chain, tree neutral_op)
4273 vec<stmt_vec_info> stmts = SLP_TREE_SCALAR_STMTS (slp_node);
4274 stmt_vec_info stmt_vinfo = stmts[0];
4275 unsigned HOST_WIDE_INT nunits;
4276 unsigned j, number_of_places_left_in_vector;
4277 tree vector_type;
4278 unsigned int group_size = stmts.length ();
4279 unsigned int i;
4280 class loop *loop;
4282 vector_type = STMT_VINFO_VECTYPE (stmt_vinfo);
4284 gcc_assert (STMT_VINFO_DEF_TYPE (stmt_vinfo) == vect_reduction_def);
4286 loop = (gimple_bb (stmt_vinfo->stmt))->loop_father;
4287 gcc_assert (loop);
4288 edge pe = loop_preheader_edge (loop);
4290 gcc_assert (!reduc_chain || neutral_op);
4292 /* NUMBER_OF_COPIES is the number of times we need to use the same values in
4293 created vectors. It is greater than 1 if unrolling is performed.
4295 For example, we have two scalar operands, s1 and s2 (e.g., group of
4296 strided accesses of size two), while NUNITS is four (i.e., four scalars
4297 of this type can be packed in a vector). The output vector will contain
4298 two copies of each scalar operand: {s1, s2, s1, s2}. (NUMBER_OF_COPIES
4299 will be 2).
4301 If REDUC_GROUP_SIZE > NUNITS, the scalars will be split into several
4302 vectors containing the operands.
4304 For example, NUNITS is four as before, and the group size is 8
4305 (s1, s2, ..., s8). We will create two vectors {s1, s2, s3, s4} and
4306 {s5, s6, s7, s8}. */
4308 if (!TYPE_VECTOR_SUBPARTS (vector_type).is_constant (&nunits))
4309 nunits = group_size;
4311 number_of_places_left_in_vector = nunits;
4312 bool constant_p = true;
4313 tree_vector_builder elts (vector_type, nunits, 1);
4314 elts.quick_grow (nunits);
4315 gimple_seq ctor_seq = NULL;
4316 for (j = 0; j < nunits * number_of_vectors; ++j)
4318 tree op;
4319 i = j % group_size;
4320 stmt_vinfo = stmts[i];
4322 /* Get the def before the loop. In a reduction chain we have only one
4323 initial value; otherwise we have as many as there are PHIs in the group. */
4324 if (reduc_chain)
4325 op = j != 0 ? neutral_op : PHI_ARG_DEF_FROM_EDGE (stmt_vinfo->stmt, pe);
4326 else if (((vec_oprnds->length () + 1) * nunits
4327 - number_of_places_left_in_vector >= group_size)
4328 && neutral_op)
4329 op = neutral_op;
4330 else
4331 op = PHI_ARG_DEF_FROM_EDGE (stmt_vinfo->stmt, pe);
4333 /* Create 'vect_ = {op0,op1,...,opn}'. */
4334 number_of_places_left_in_vector--;
4335 elts[nunits - number_of_places_left_in_vector - 1] = op;
4336 if (!CONSTANT_CLASS_P (op))
4337 constant_p = false;
4339 if (number_of_places_left_in_vector == 0)
4341 tree init;
4342 if (constant_p && !neutral_op
4343 ? multiple_p (TYPE_VECTOR_SUBPARTS (vector_type), nunits)
4344 : known_eq (TYPE_VECTOR_SUBPARTS (vector_type), nunits))
4345 /* Build the vector directly from ELTS. */
4346 init = gimple_build_vector (&ctor_seq, &elts);
4347 else if (neutral_op)
4349 /* Build a vector of the neutral value and shift the
4350 other elements into place. */
4351 init = gimple_build_vector_from_val (&ctor_seq, vector_type,
4352 neutral_op);
4353 int k = nunits;
4354 while (k > 0 && elts[k - 1] == neutral_op)
4355 k -= 1;
4356 while (k > 0)
4358 k -= 1;
4359 init = gimple_build (&ctor_seq, CFN_VEC_SHL_INSERT,
4360 vector_type, init, elts[k]);
4363 else
4365 /* First time round, duplicate ELTS to fill the
4366 required number of vectors. */
4367 duplicate_and_interleave (vinfo, &ctor_seq, vector_type, elts,
4368 number_of_vectors, *vec_oprnds);
4369 break;
4371 vec_oprnds->quick_push (init);
4373 number_of_places_left_in_vector = nunits;
4374 elts.new_vector (vector_type, nunits, 1);
4375 elts.quick_grow (nunits);
4376 constant_p = true;
4379 if (ctor_seq != NULL)
4380 gsi_insert_seq_on_edge_immediate (pe, ctor_seq);
4383 /* For a statement STMT_INFO taking part in a reduction operation return
4384 the stmt_vec_info on which the meta information is stored. */
4386 stmt_vec_info
4387 info_for_reduction (vec_info *vinfo, stmt_vec_info stmt_info)
4389 stmt_info = vect_orig_stmt (stmt_info);
4390 gcc_assert (STMT_VINFO_REDUC_DEF (stmt_info));
4391 if (!is_a <gphi *> (stmt_info->stmt))
4392 stmt_info = STMT_VINFO_REDUC_DEF (stmt_info);
4393 gphi *phi = as_a <gphi *> (stmt_info->stmt);
4394 if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_double_reduction_def)
4396 if (gimple_phi_num_args (phi) == 1)
4397 stmt_info = STMT_VINFO_REDUC_DEF (stmt_info);
4399 else if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_nested_cycle)
4401 edge pe = loop_preheader_edge (gimple_bb (phi)->loop_father);
4402 stmt_vec_info info
4403 = vinfo->lookup_def (PHI_ARG_DEF_FROM_EDGE (phi, pe));
4404 if (info && STMT_VINFO_DEF_TYPE (info) == vect_double_reduction_def)
4405 stmt_info = info;
4407 return stmt_info;
4410 /* Function vect_create_epilog_for_reduction
4412 Create code at the loop-epilog to finalize the result of a reduction
4413 computation.
4415 STMT_INFO is the scalar reduction stmt that is being vectorized.
4416 SLP_NODE is an SLP node containing a group of reduction statements. The
4417 first one in this group is STMT_INFO.
4418 SLP_NODE_INSTANCE is the SLP node instance containing SLP_NODE.
4419 REDUC_INDEX says which rhs operand of STMT_INFO is the reduction phi
4420 (counting from 0).
4422 This function:
4423 1. Completes the reduction def-use cycles.
4424 2. "Reduces" each vector of partial results VECT_DEFS into a single result,
4425 by calling the function specified by REDUC_FN if available, or by
4426 other means (whole-vector shifts or a scalar loop).
4427 The function also creates a new phi node at the loop exit to preserve
4428 loop-closed form, as illustrated below.
4430 The flow at the entry to this function:
4432 loop:
4433 vec_def = phi <vec_init, null> # REDUCTION_PHI
4434 VECT_DEF = vector_stmt # vectorized form of STMT_INFO
4435 s_loop = scalar_stmt # (scalar) STMT_INFO
4436 loop_exit:
4437 s_out0 = phi <s_loop> # (scalar) EXIT_PHI
4438 use <s_out0>
4439 use <s_out0>
4441 The above is transformed by this function into:
4443 loop:
4444 vec_def = phi <vec_init, VECT_DEF> # REDUCTION_PHI
4445 VECT_DEF = vector_stmt # vectorized form of STMT_INFO
4446 s_loop = scalar_stmt # (scalar) STMT_INFO
4447 loop_exit:
4448 s_out0 = phi <s_loop> # (scalar) EXIT_PHI
4449 v_out1 = phi <VECT_DEF> # NEW_EXIT_PHI
4450 v_out2 = reduce <v_out1>
4451 s_out3 = extract_field <v_out2, 0>
4452 s_out4 = adjust_result <s_out3>
4453 use <s_out4>
4454 use <s_out4>
4457 static void
4458 vect_create_epilog_for_reduction (loop_vec_info loop_vinfo,
4459 stmt_vec_info stmt_info,
4460 slp_tree slp_node,
4461 slp_instance slp_node_instance)
4463 stmt_vec_info reduc_info = info_for_reduction (loop_vinfo, stmt_info);
4464 gcc_assert (reduc_info->is_reduc_info);
4465 /* For double reductions we need to get at the inner loop reduction
4466 stmt which has the meta info attached. Our stmt_info is that of the
4467 loop-closed PHI of the inner loop which we remember as
4468 def for the reduction PHI generation. */
4469 bool double_reduc = false;
4470 stmt_vec_info rdef_info = stmt_info;
4471 if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_double_reduction_def)
4473 gcc_assert (!slp_node);
4474 double_reduc = true;
4475 stmt_info = loop_vinfo->lookup_def (gimple_phi_arg_def
4476 (stmt_info->stmt, 0));
4477 stmt_info = vect_stmt_to_vectorize (stmt_info);
4479 gphi *reduc_def_stmt
4480 = as_a <gphi *> (STMT_VINFO_REDUC_DEF (vect_orig_stmt (stmt_info))->stmt);
4481 enum tree_code code = STMT_VINFO_REDUC_CODE (reduc_info);
4482 internal_fn reduc_fn = STMT_VINFO_REDUC_FN (reduc_info);
4483 stmt_vec_info prev_phi_info;
4484 tree vectype;
4485 machine_mode mode;
4486 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo), *outer_loop = NULL;
4487 basic_block exit_bb;
4488 tree scalar_dest;
4489 tree scalar_type;
4490 gimple *new_phi = NULL, *phi;
4491 stmt_vec_info phi_info;
4492 gimple_stmt_iterator exit_gsi;
4493 tree new_temp = NULL_TREE, new_name, new_scalar_dest;
4494 gimple *epilog_stmt = NULL;
4495 gimple *exit_phi;
4496 tree bitsize;
4497 tree def;
4498 tree orig_name, scalar_result;
4499 imm_use_iterator imm_iter, phi_imm_iter;
4500 use_operand_p use_p, phi_use_p;
4501 gimple *use_stmt;
4502 bool nested_in_vect_loop = false;
4503 auto_vec<gimple *> new_phis;
4504 int j, i;
4505 auto_vec<tree> scalar_results;
4506 unsigned int group_size = 1, k;
4507 auto_vec<gimple *> phis;
4508 bool slp_reduc = false;
4509 bool direct_slp_reduc;
4510 tree new_phi_result;
4511 tree induction_index = NULL_TREE;
4513 if (slp_node)
4514 group_size = SLP_TREE_SCALAR_STMTS (slp_node).length ();
4516 if (nested_in_vect_loop_p (loop, stmt_info))
4518 outer_loop = loop;
4519 loop = loop->inner;
4520 nested_in_vect_loop = true;
4521 gcc_assert (!slp_node);
4523 gcc_assert (!nested_in_vect_loop || double_reduc);
4525 vectype = STMT_VINFO_REDUC_VECTYPE (reduc_info);
4526 gcc_assert (vectype);
4527 mode = TYPE_MODE (vectype);
4529 tree initial_def = NULL;
4530 tree induc_val = NULL_TREE;
4531 tree adjustment_def = NULL;
4532 if (slp_node)
4534 else
4536 /* Get at the scalar def before the loop, that defines the initial value
4537 of the reduction variable. */
4538 initial_def = PHI_ARG_DEF_FROM_EDGE (reduc_def_stmt,
4539 loop_preheader_edge (loop));
4540 /* Optimize: for induction condition reduction, if we can't use zero
4541 for induc_val, use initial_def. */
4542 if (STMT_VINFO_REDUC_TYPE (reduc_info) == INTEGER_INDUC_COND_REDUCTION)
4543 induc_val = STMT_VINFO_VEC_INDUC_COND_INITIAL_VAL (reduc_info);
4544 else if (double_reduc)
4546 else if (nested_in_vect_loop)
4548 else
4549 adjustment_def = STMT_VINFO_REDUC_EPILOGUE_ADJUSTMENT (reduc_info);
4552 unsigned vec_num;
4553 int ncopies;
4554 if (slp_node)
4556 vec_num = SLP_TREE_VEC_STMTS (slp_node_instance->reduc_phis).length ();
4557 ncopies = 1;
4559 else
4561 vec_num = 1;
4562 ncopies = 0;
4563 phi_info = STMT_VINFO_VEC_STMT (loop_vinfo->lookup_stmt (reduc_def_stmt));
4566 ncopies++;
4567 phi_info = STMT_VINFO_RELATED_STMT (phi_info);
4569 while (phi_info);
4572 /* For cond reductions we want to create a new vector (INDEX_COND_EXPR)
4573 which is updated with the current index of the loop for every match of
4574 the original loop's cond_expr (VEC_STMT). This results in a vector
4575 containing the last time the condition passed for that vector lane.
4576 The first match will be a 1 to allow 0 to be used for non-matching
4577 indexes. If there are no matches at all then the vector will be all
4578 zeroes.
4580 PR92772: This algorithm is broken for architectures that support
4581 masked vectors, but do not provide fold_extract_last. */
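/* As an illustration, with four lanes the induction variable below holds
{ 1, 2, 3, 4 } in the first vector iteration and { 5, 6, 7, 8 } in the
second; a lane whose condition last matched in the second iteration thus
ends up holding a value in the range 5..8, lanes that never matched stay
0, and the maximum over all lanes identifies the latest match. */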
4582 if (STMT_VINFO_REDUC_TYPE (reduc_info) == COND_REDUCTION)
4584 auto_vec<std::pair<tree, bool>, 2> ccompares;
4585 stmt_vec_info cond_info = STMT_VINFO_REDUC_DEF (reduc_info);
4586 cond_info = vect_stmt_to_vectorize (cond_info);
4587 while (cond_info != reduc_info)
4589 if (gimple_assign_rhs_code (cond_info->stmt) == COND_EXPR)
4591 gimple *vec_stmt = STMT_VINFO_VEC_STMT (cond_info)->stmt;
4592 gcc_assert (gimple_assign_rhs_code (vec_stmt) == VEC_COND_EXPR);
4593 ccompares.safe_push
4594 (std::make_pair (unshare_expr (gimple_assign_rhs1 (vec_stmt)),
4595 STMT_VINFO_REDUC_IDX (cond_info) == 2));
4597 cond_info
4598 = loop_vinfo->lookup_def (gimple_op (cond_info->stmt,
4599 1 + STMT_VINFO_REDUC_IDX
4600 (cond_info)));
4601 cond_info = vect_stmt_to_vectorize (cond_info);
4603 gcc_assert (ccompares.length () != 0);
4605 tree indx_before_incr, indx_after_incr;
4606 poly_uint64 nunits_out = TYPE_VECTOR_SUBPARTS (vectype);
4607 int scalar_precision
4608 = GET_MODE_PRECISION (SCALAR_TYPE_MODE (TREE_TYPE (vectype)));
4609 tree cr_index_scalar_type = make_unsigned_type (scalar_precision);
4610 tree cr_index_vector_type = get_related_vectype_for_scalar_type
4611 (TYPE_MODE (vectype), cr_index_scalar_type,
4612 TYPE_VECTOR_SUBPARTS (vectype));
4614 /* First we create a simple vector induction variable which starts
4615 with the values {1,2,3,...} (SERIES_VECT) and increments by the
4616 vector size (STEP). */
4618 /* Create a {1,2,3,...} vector. */
4619 tree series_vect = build_index_vector (cr_index_vector_type, 1, 1);
4621 /* Create a vector of the step value. */
4622 tree step = build_int_cst (cr_index_scalar_type, nunits_out);
4623 tree vec_step = build_vector_from_val (cr_index_vector_type, step);
4625 /* Create an induction variable. */
4626 gimple_stmt_iterator incr_gsi;
4627 bool insert_after;
4628 standard_iv_increment_position (loop, &incr_gsi, &insert_after);
4629 create_iv (series_vect, vec_step, NULL_TREE, loop, &incr_gsi,
4630 insert_after, &indx_before_incr, &indx_after_incr);
4632 /* Next create a new phi node vector (NEW_PHI_TREE) which starts
4633 filled with zeros (VEC_ZERO). */
4635 /* Create a vector of 0s. */
4636 tree zero = build_zero_cst (cr_index_scalar_type);
4637 tree vec_zero = build_vector_from_val (cr_index_vector_type, zero);
4639 /* Create a vector phi node. */
4640 tree new_phi_tree = make_ssa_name (cr_index_vector_type);
4641 new_phi = create_phi_node (new_phi_tree, loop->header);
4642 loop_vinfo->add_stmt (new_phi);
4643 add_phi_arg (as_a <gphi *> (new_phi), vec_zero,
4644 loop_preheader_edge (loop), UNKNOWN_LOCATION);
4646 /* Now take the condition from the loop's original cond_exprs
4647 and produce a new cond_expr (INDEX_COND_EXPR) which for
4648 every match uses values from the induction variable
4649 (INDEX_BEFORE_INCR) otherwise uses values from the phi node
4650 (NEW_PHI_TREE).
4651 Finally, we update the phi (NEW_PHI_TREE) to take the value of
4652 the new cond_expr (INDEX_COND_EXPR). */
4653 gimple_seq stmts = NULL;
4654 for (int i = ccompares.length () - 1; i != -1; --i)
4656 tree ccompare = ccompares[i].first;
4657 if (ccompares[i].second)
4658 new_phi_tree = gimple_build (&stmts, VEC_COND_EXPR,
4659 cr_index_vector_type,
4660 ccompare,
4661 indx_before_incr, new_phi_tree);
4662 else
4663 new_phi_tree = gimple_build (&stmts, VEC_COND_EXPR,
4664 cr_index_vector_type,
4665 ccompare,
4666 new_phi_tree, indx_before_incr);
4668 gsi_insert_seq_before (&incr_gsi, stmts, GSI_SAME_STMT);
4669 stmt_vec_info index_vec_info
4670 = loop_vinfo->add_stmt (SSA_NAME_DEF_STMT (new_phi_tree));
4671 STMT_VINFO_VECTYPE (index_vec_info) = cr_index_vector_type;
4673 /* Update the phi with the vec cond. */
4674 induction_index = new_phi_tree;
4675 add_phi_arg (as_a <gphi *> (new_phi), induction_index,
4676 loop_latch_edge (loop), UNKNOWN_LOCATION);
4679 /* 2. Create epilog code.
4680 The reduction epilog code operates across the elements of the vector
4681 of partial results computed by the vectorized loop.
4682 The reduction epilog code consists of:
4684 step 1: compute the scalar result in a vector (v_out2)
4685 step 2: extract the scalar result (s_out3) from the vector (v_out2)
4686 step 3: adjust the scalar result (s_out3) if needed.
4688 Step 1 can be accomplished using one of the following three schemes:
4689 (scheme 1) using reduc_fn, if available.
4690 (scheme 2) using whole-vector shifts, if available.
4691 (scheme 3) using a scalar loop. In this case steps 1+2 above are
4692 combined.
4694 The overall epilog code looks like this:
4696 s_out0 = phi <s_loop> # original EXIT_PHI
4697 v_out1 = phi <VECT_DEF> # NEW_EXIT_PHI
4698 v_out2 = reduce <v_out1> # step 1
4699 s_out3 = extract_field <v_out2, 0> # step 2
4700 s_out4 = adjust_result <s_out3> # step 3
4702 (step 3 is optional, and steps 1 and 2 may be combined).
4703 Lastly, the uses of s_out0 are replaced by s_out4. */
4706 /* 2.1 Create new loop-exit-phis to preserve loop-closed form:
4707 v_out1 = phi <VECT_DEF>
4708 Store them in NEW_PHIS. */
4709 if (double_reduc)
4710 loop = outer_loop;
4711 exit_bb = single_exit (loop)->dest;
4712 prev_phi_info = NULL;
4713 new_phis.create (slp_node ? vec_num : ncopies);
4714 for (unsigned i = 0; i < vec_num; i++)
4716 if (slp_node)
4717 def = gimple_get_lhs (SLP_TREE_VEC_STMTS (slp_node)[i]->stmt);
4718 else
4719 def = gimple_get_lhs (STMT_VINFO_VEC_STMT (rdef_info)->stmt);
4720 for (j = 0; j < ncopies; j++)
4722 tree new_def = copy_ssa_name (def);
4723 phi = create_phi_node (new_def, exit_bb);
4724 stmt_vec_info phi_info = loop_vinfo->add_stmt (phi);
4725 if (j == 0)
4726 new_phis.quick_push (phi);
4727 else
4729 def = vect_get_vec_def_for_stmt_copy (loop_vinfo, def);
4730 STMT_VINFO_RELATED_STMT (prev_phi_info) = phi_info;
4733 SET_PHI_ARG_DEF (phi, single_exit (loop)->dest_idx, def);
4734 prev_phi_info = phi_info;
4738 exit_gsi = gsi_after_labels (exit_bb);
4740 /* 2.2 Get the relevant tree-code to use in the epilog for schemes 2,3
4741 (i.e. when reduc_fn is not available) and in the final adjustment
4742 code (if needed). Also get the original scalar reduction variable as
4743 defined in the loop. In case STMT is a "pattern-stmt" (i.e. - it
4744 represents a reduction pattern), the tree-code and scalar-def are
4745 taken from the original stmt that the pattern-stmt (STMT) replaces.
4746 Otherwise (it is a regular reduction) - the tree-code and scalar-def
4747 are taken from STMT. */
4749 stmt_vec_info orig_stmt_info = vect_orig_stmt (stmt_info);
4750 if (orig_stmt_info != stmt_info)
4752 /* Reduction pattern */
4753 gcc_assert (STMT_VINFO_IN_PATTERN_P (orig_stmt_info));
4754 gcc_assert (STMT_VINFO_RELATED_STMT (orig_stmt_info) == stmt_info);
4757 scalar_dest = gimple_assign_lhs (orig_stmt_info->stmt);
4758 scalar_type = TREE_TYPE (scalar_dest);
4759 scalar_results.create (group_size);
4760 new_scalar_dest = vect_create_destination_var (scalar_dest, NULL);
4761 bitsize = TYPE_SIZE (scalar_type);
4763 /* SLP reduction without reduction chain, e.g.,
4764 # a1 = phi <a2, a0>
4765 # b1 = phi <b2, b0>
4766 a2 = operation (a1)
4767 b2 = operation (b1) */
4768 slp_reduc = (slp_node && !REDUC_GROUP_FIRST_ELEMENT (stmt_info));
4770 /* True if we should implement SLP_REDUC using native reduction operations
4771 instead of scalar operations. */
4772 direct_slp_reduc = (reduc_fn != IFN_LAST
4773 && slp_reduc
4774 && !TYPE_VECTOR_SUBPARTS (vectype).is_constant ());
4776 /* In case of reduction chain, e.g.,
4777 # a1 = phi <a3, a0>
4778 a2 = operation (a1)
4779 a3 = operation (a2),
4781 we may end up with more than one vector result. Here we reduce them to
4782 one vector. */
4783 if (REDUC_GROUP_FIRST_ELEMENT (stmt_info) || direct_slp_reduc)
4785 gimple_seq stmts = NULL;
4786 tree first_vect = PHI_RESULT (new_phis[0]);
4787 first_vect = gimple_convert (&stmts, vectype, first_vect);
4788 for (k = 1; k < new_phis.length (); k++)
4790 gimple *next_phi = new_phis[k];
4791 tree second_vect = PHI_RESULT (next_phi);
4792 second_vect = gimple_convert (&stmts, vectype, second_vect);
4793 first_vect = gimple_build (&stmts, code, vectype,
4794 first_vect, second_vect);
4796 gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
4798 new_phi_result = first_vect;
4799 new_phis.truncate (0);
4800 new_phis.safe_push (SSA_NAME_DEF_STMT (first_vect));
4802 /* Likewise if we couldn't use a single defuse cycle. */
4803 else if (ncopies > 1)
4805 gcc_assert (new_phis.length () == 1);
4806 gimple_seq stmts = NULL;
4807 tree first_vect = PHI_RESULT (new_phis[0]);
4808 first_vect = gimple_convert (&stmts, vectype, first_vect);
4809 stmt_vec_info next_phi_info = loop_vinfo->lookup_stmt (new_phis[0]);
4810 for (int k = 1; k < ncopies; ++k)
4812 next_phi_info = STMT_VINFO_RELATED_STMT (next_phi_info);
4813 tree second_vect = PHI_RESULT (next_phi_info->stmt);
4814 second_vect = gimple_convert (&stmts, vectype, second_vect);
4815 first_vect = gimple_build (&stmts, code, vectype,
4816 first_vect, second_vect);
4818 gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
4819 new_phi_result = first_vect;
4820 new_phis.truncate (0);
4821 new_phis.safe_push (SSA_NAME_DEF_STMT (first_vect));
4823 else
4824 new_phi_result = PHI_RESULT (new_phis[0]);
4826 if (STMT_VINFO_REDUC_TYPE (reduc_info) == COND_REDUCTION
4827 && reduc_fn != IFN_LAST)
4829 /* For condition reductions, we have a vector (NEW_PHI_RESULT) containing
4830 various data values where the condition matched and another vector
4831 (INDUCTION_INDEX) containing all the indexes of those matches. We
4832 need to extract the last matching index (which will be the index with
4833 highest value) and use this to index into the data vector.
4834 For the case where there were no matches, the data vector will contain
4835 all default values and the index vector will be all zeros. */
4837 /* Get various versions of the type of the vector of indexes. */
4838 tree index_vec_type = TREE_TYPE (induction_index);
4839 gcc_checking_assert (TYPE_UNSIGNED (index_vec_type));
4840 tree index_scalar_type = TREE_TYPE (index_vec_type);
4841 tree index_vec_cmp_type = truth_type_for (index_vec_type);
4843 /* Get an unsigned integer version of the type of the data vector. */
4844 int scalar_precision
4845 = GET_MODE_PRECISION (SCALAR_TYPE_MODE (scalar_type));
4846 tree scalar_type_unsigned = make_unsigned_type (scalar_precision);
4847 tree vectype_unsigned = build_vector_type
4848 (scalar_type_unsigned, TYPE_VECTOR_SUBPARTS (vectype));
4850 /* First we need to create a vector (ZERO_VEC) of zeros and another
4851 vector (MAX_INDEX_VEC) filled with the last matching index, which we
4852 can create using a MAX reduction and then expanding.
4853 In the case where the loop never made any matches, the max index will
4854 be zero. */
4856 /* Vector of {0, 0, 0,...}. */
4857 tree zero_vec = build_zero_cst (vectype);
4859 gimple_seq stmts = NULL;
4860 new_phi_result = gimple_convert (&stmts, vectype, new_phi_result);
4861 gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
4863 /* Find maximum value from the vector of found indexes. */
4864 tree max_index = make_ssa_name (index_scalar_type);
4865 gcall *max_index_stmt = gimple_build_call_internal (IFN_REDUC_MAX,
4866 1, induction_index);
4867 gimple_call_set_lhs (max_index_stmt, max_index);
4868 gsi_insert_before (&exit_gsi, max_index_stmt, GSI_SAME_STMT);
4870 /* Vector of {max_index, max_index, max_index,...}. */
4871 tree max_index_vec = make_ssa_name (index_vec_type);
4872 tree max_index_vec_rhs = build_vector_from_val (index_vec_type,
4873 max_index);
4874 gimple *max_index_vec_stmt = gimple_build_assign (max_index_vec,
4875 max_index_vec_rhs);
4876 gsi_insert_before (&exit_gsi, max_index_vec_stmt, GSI_SAME_STMT);
4878 /* Next we compare the new vector (MAX_INDEX_VEC) full of max indexes
4879 with the vector (INDUCTION_INDEX) of found indexes, choosing values
4880 from the data vector (NEW_PHI_RESULT) for matches, 0 (ZERO_VEC)
4881 otherwise. Only one value should match, resulting in a vector
4882 (VEC_COND) with one data value and the rest zeros.
4883 In the case where the loop never made any matches, every index will
4884 match, resulting in a vector with all data values (which will all be
4885 the default value). */
4887 /* Compare the max index vector to the vector of found indexes to find
4888 the position of the max value. */
4889 tree vec_compare = make_ssa_name (index_vec_cmp_type);
4890 gimple *vec_compare_stmt = gimple_build_assign (vec_compare, EQ_EXPR,
4891 induction_index,
4892 max_index_vec);
4893 gsi_insert_before (&exit_gsi, vec_compare_stmt, GSI_SAME_STMT);
4895 /* Use the compare to choose either values from the data vector or
4896 zero. */
4897 tree vec_cond = make_ssa_name (vectype);
4898 gimple *vec_cond_stmt = gimple_build_assign (vec_cond, VEC_COND_EXPR,
4899 vec_compare, new_phi_result,
4900 zero_vec);
4901 gsi_insert_before (&exit_gsi, vec_cond_stmt, GSI_SAME_STMT);
4903 /* Finally we need to extract the data value from the vector (VEC_COND)
4904 into a scalar (MATCHED_DATA_REDUC). Logically we want to do an OR
4905 reduction, but because this doesn't exist, we can use a MAX reduction
4906 instead. The data value might be signed or a float so we need to cast
4907 it first.
4908 In the case where the loop never made any matches, the data values are
4909 all identical, and so will reduce down correctly. */
4911 /* Make the matched data values unsigned. */
4912 tree vec_cond_cast = make_ssa_name (vectype_unsigned);
4913 tree vec_cond_cast_rhs = build1 (VIEW_CONVERT_EXPR, vectype_unsigned,
4914 vec_cond);
4915 gimple *vec_cond_cast_stmt = gimple_build_assign (vec_cond_cast,
4916 VIEW_CONVERT_EXPR,
4917 vec_cond_cast_rhs);
4918 gsi_insert_before (&exit_gsi, vec_cond_cast_stmt, GSI_SAME_STMT);
4920 /* Reduce down to a scalar value. */
4921 tree data_reduc = make_ssa_name (scalar_type_unsigned);
4922 gcall *data_reduc_stmt = gimple_build_call_internal (IFN_REDUC_MAX,
4923 1, vec_cond_cast);
4924 gimple_call_set_lhs (data_reduc_stmt, data_reduc);
4925 gsi_insert_before (&exit_gsi, data_reduc_stmt, GSI_SAME_STMT);
4927 /* Convert the reduced value back to the result type and set as the
4928 result. */
4929 stmts = NULL;
4930 new_temp = gimple_build (&stmts, VIEW_CONVERT_EXPR, scalar_type,
4931 data_reduc);
4932 gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
4933 scalar_results.safe_push (new_temp);
4935 else if (STMT_VINFO_REDUC_TYPE (reduc_info) == COND_REDUCTION
4936 && reduc_fn == IFN_LAST)
4938 /* Condition reduction without supported IFN_REDUC_MAX. Generate
4939 idx = 0;
4940 idx_val = induction_index[0];
4941 val = data_reduc[0];
4942 for (idx = 0, val = init, i = 0; i < nelts; ++i)
4943 if (induction_index[i] > idx_val)
4944 val = data_reduc[i], idx_val = induction_index[i];
4945 return val; */
4947 tree data_eltype = TREE_TYPE (TREE_TYPE (new_phi_result));
4948 tree idx_eltype = TREE_TYPE (TREE_TYPE (induction_index));
4949 unsigned HOST_WIDE_INT el_size = tree_to_uhwi (TYPE_SIZE (idx_eltype));
4950 poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (TREE_TYPE (induction_index));
4951 /* Enforced by vectorizable_reduction, which ensures we have target
4952 support before allowing a conditional reduction on variable-length
4953 vectors. */
4954 unsigned HOST_WIDE_INT v_size = el_size * nunits.to_constant ();
4955 tree idx_val = NULL_TREE, val = NULL_TREE;
4956 for (unsigned HOST_WIDE_INT off = 0; off < v_size; off += el_size)
4958 tree old_idx_val = idx_val;
4959 tree old_val = val;
4960 idx_val = make_ssa_name (idx_eltype);
4961 epilog_stmt = gimple_build_assign (idx_val, BIT_FIELD_REF,
4962 build3 (BIT_FIELD_REF, idx_eltype,
4963 induction_index,
4964 bitsize_int (el_size),
4965 bitsize_int (off)));
4966 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
4967 val = make_ssa_name (data_eltype);
4968 epilog_stmt = gimple_build_assign (val, BIT_FIELD_REF,
4969 build3 (BIT_FIELD_REF,
4970 data_eltype,
4971 new_phi_result,
4972 bitsize_int (el_size),
4973 bitsize_int (off)));
4974 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
4975 if (off != 0)
4977 tree new_idx_val = idx_val;
4978 if (off != v_size - el_size)
4980 new_idx_val = make_ssa_name (idx_eltype);
4981 epilog_stmt = gimple_build_assign (new_idx_val,
4982 MAX_EXPR, idx_val,
4983 old_idx_val);
4984 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
4986 tree new_val = make_ssa_name (data_eltype);
4987 epilog_stmt = gimple_build_assign (new_val,
4988 COND_EXPR,
4989 build2 (GT_EXPR,
4990 boolean_type_node,
4991 idx_val,
4992 old_idx_val),
4993 val, old_val);
4994 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
4995 idx_val = new_idx_val;
4996 val = new_val;
4999 /* Convert the reduced value back to the result type and set as the
5000 result. */
5001 gimple_seq stmts = NULL;
5002 val = gimple_convert (&stmts, scalar_type, val);
5003 gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
5004 scalar_results.safe_push (val);
5007 /* 2.3 Create the reduction code, using one of the three schemes described
5008 above. In SLP we simply need to extract all the elements from the
5009 vector (without reducing them), so we use scalar shifts. */
5010 else if (reduc_fn != IFN_LAST && !slp_reduc)
5012 tree tmp;
5013 tree vec_elem_type;
5015 /* Case 1: Create:
5016 v_out2 = reduc_expr <v_out1> */
5018 if (dump_enabled_p ())
5019 dump_printf_loc (MSG_NOTE, vect_location,
5020 "Reduce using direct vector reduction.\n");
5022 gimple_seq stmts = NULL;
5023 new_phi_result = gimple_convert (&stmts, vectype, new_phi_result);
5024 vec_elem_type = TREE_TYPE (TREE_TYPE (new_phi_result));
5025 new_temp = gimple_build (&stmts, as_combined_fn (reduc_fn),
5026 vec_elem_type, new_phi_result);
5027 new_temp = gimple_convert (&stmts, scalar_type, new_temp);
5028 gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
5030 if ((STMT_VINFO_REDUC_TYPE (reduc_info) == INTEGER_INDUC_COND_REDUCTION)
5031 && induc_val)
5033 /* Earlier we set the initial value to be a vector of induc_val
5034 values. Check the result and if it is induc_val then replace
5035 with the original initial value, unless induc_val is
5036 the same as initial_def already. */
5037 tree zcompare = build2 (EQ_EXPR, boolean_type_node, new_temp,
5038 induc_val);
5040 tmp = make_ssa_name (new_scalar_dest);
5041 epilog_stmt = gimple_build_assign (tmp, COND_EXPR, zcompare,
5042 initial_def, new_temp);
5043 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5044 new_temp = tmp;
5047 scalar_results.safe_push (new_temp);
5049 else if (direct_slp_reduc)
5051 /* Here we create one vector for each of the REDUC_GROUP_SIZE results,
5052 with the elements for other SLP statements replaced with the
5053 neutral value. We can then do a normal reduction on each vector. */
5055 /* Enforced by vectorizable_reduction. */
5056 gcc_assert (new_phis.length () == 1);
5057 gcc_assert (pow2p_hwi (group_size));
5059 slp_tree orig_phis_slp_node = slp_node_instance->reduc_phis;
5060 vec<stmt_vec_info> orig_phis
5061 = SLP_TREE_SCALAR_STMTS (orig_phis_slp_node);
5062 gimple_seq seq = NULL;
5064 /* Build a vector {0, 1, 2, ...}, with the same number of elements
5065 and the same element size as VECTYPE. */
5066 tree index = build_index_vector (vectype, 0, 1);
5067 tree index_type = TREE_TYPE (index);
5068 tree index_elt_type = TREE_TYPE (index_type);
5069 tree mask_type = truth_type_for (index_type);
5071 /* Create a vector that, for each element, identifies which of
5072 the REDUC_GROUP_SIZE results should use it. */
5073 tree index_mask = build_int_cst (index_elt_type, group_size - 1);
5074 index = gimple_build (&seq, BIT_AND_EXPR, index_type, index,
5075 build_vector_from_val (index_type, index_mask));
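/* For instance, with an illustrative group_size of 2 and four elements,
   index = { 0, 1, 2, 3 } & { 1, 1, 1, 1 } = { 0, 1, 0, 1 },
   so even-numbered elements belong to the first SLP statement and
   odd-numbered elements to the second. */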
5077 /* Get a neutral vector value. This is simply a splat of the neutral
5078 scalar value if we have one, otherwise the initial scalar value
5079 is itself a neutral value. */
5080 tree vector_identity = NULL_TREE;
5081 tree neutral_op = NULL_TREE;
5082 if (slp_node)
5084 stmt_vec_info first = REDUC_GROUP_FIRST_ELEMENT (stmt_info);
5085 neutral_op
5086 = neutral_op_for_slp_reduction (slp_node_instance->reduc_phis,
5087 vectype, code, first != NULL);
5089 if (neutral_op)
5090 vector_identity = gimple_build_vector_from_val (&seq, vectype,
5091 neutral_op);
5092 for (unsigned int i = 0; i < group_size; ++i)
5094 /* If there's no universal neutral value, we can use the
5095 initial scalar value from the original PHI. This is used
5096 for MIN and MAX reduction, for example. */
5097 if (!neutral_op)
5099 tree scalar_value
5100 = PHI_ARG_DEF_FROM_EDGE (orig_phis[i]->stmt,
5101 loop_preheader_edge (loop));
5102 scalar_value = gimple_convert (&seq, TREE_TYPE (vectype),
5103 scalar_value);
5104 vector_identity = gimple_build_vector_from_val (&seq, vectype,
5105 scalar_value);
5108 /* Calculate the equivalent of:
5110 sel[j] = (index[j] == i);
5112 which selects the elements of NEW_PHI_RESULT that should
5113 be included in the result. */
5114 tree compare_val = build_int_cst (index_elt_type, i);
5115 compare_val = build_vector_from_val (index_type, compare_val);
5116 tree sel = gimple_build (&seq, EQ_EXPR, mask_type,
5117 index, compare_val);
5119 /* Calculate the equivalent of:
5121 vec = sel ? new_phi_result : vector_identity;
5123 VEC is now suitable for a full vector reduction. */
5124 tree vec = gimple_build (&seq, VEC_COND_EXPR, vectype,
5125 sel, new_phi_result, vector_identity);
5127 /* Do the reduction and convert it to the appropriate type. */
5128 tree scalar = gimple_build (&seq, as_combined_fn (reduc_fn),
5129 TREE_TYPE (vectype), vec);
5130 scalar = gimple_convert (&seq, scalar_type, scalar);
5131 scalar_results.safe_push (scalar);
5133 gsi_insert_seq_before (&exit_gsi, seq, GSI_SAME_STMT);
5135 else
5137 bool reduce_with_shift;
5138 tree vec_temp;
5140 gcc_assert (slp_reduc || new_phis.length () == 1);
5142 /* See if the target wants to do the final (shift) reduction
5143 in a vector mode of smaller size and first reduce upper/lower
5144 halves against each other. */
5145 enum machine_mode mode1 = mode;
5146 tree stype = TREE_TYPE (vectype);
5147 unsigned nunits = TYPE_VECTOR_SUBPARTS (vectype).to_constant ();
5148 unsigned nunits1 = nunits;
5149 if ((mode1 = targetm.vectorize.split_reduction (mode)) != mode
5150 && new_phis.length () == 1)
5152 nunits1 = GET_MODE_NUNITS (mode1).to_constant ();
5153 /* For SLP reductions we have to make sure lanes match up, but
5154 since we're doing individual element final reduction reducing
5155 vector width here is even more important.
5156 ??? We can also separate lanes with permutes; for the common
5157 case of a power-of-two group-size, odd/even extracts would work. */
5158 if (slp_reduc && nunits != nunits1)
5160 nunits1 = least_common_multiple (nunits1, group_size);
5161 gcc_assert (exact_log2 (nunits1) != -1 && nunits1 <= nunits);
5164 if (!slp_reduc
5165 && (mode1 = targetm.vectorize.split_reduction (mode)) != mode)
5166 nunits1 = GET_MODE_NUNITS (mode1).to_constant ();
5168 tree vectype1 = get_related_vectype_for_scalar_type (TYPE_MODE (vectype),
5169 stype, nunits1);
5170 reduce_with_shift = have_whole_vector_shift (mode1);
5171 if (!VECTOR_MODE_P (mode1))
5172 reduce_with_shift = false;
5173 else
5175 optab optab = optab_for_tree_code (code, vectype1, optab_default);
5176 if (optab_handler (optab, mode1) == CODE_FOR_nothing)
5177 reduce_with_shift = false;
5180 /* First reduce the vector to the desired vector size we should
5181 do shift reduction on by combining upper and lower halves. */
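/* For example (illustrative), a V8SI accumulator on a target whose
   preferred reduction mode is V4SI is split into its low and high V4SI
   halves, which are combined with CODE into a single V4SI vector
   before the final shift or scalar reduction below. */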
5182 new_temp = new_phi_result;
5183 while (nunits > nunits1)
5185 nunits /= 2;
5186 vectype1 = get_related_vectype_for_scalar_type (TYPE_MODE (vectype),
5187 stype, nunits);
5188 unsigned int bitsize = tree_to_uhwi (TYPE_SIZE (vectype1));
5190 /* The target has to make sure we support lowpart/highpart
5191 extraction, either via direct vector extract or through
5192 an integer mode punning. */
5193 tree dst1, dst2;
5194 if (convert_optab_handler (vec_extract_optab,
5195 TYPE_MODE (TREE_TYPE (new_temp)),
5196 TYPE_MODE (vectype1))
5197 != CODE_FOR_nothing)
5199 /* Extract sub-vectors directly once vec_extract becomes
5200 a conversion optab. */
5201 dst1 = make_ssa_name (vectype1);
5202 epilog_stmt
5203 = gimple_build_assign (dst1, BIT_FIELD_REF,
5204 build3 (BIT_FIELD_REF, vectype1,
5205 new_temp, TYPE_SIZE (vectype1),
5206 bitsize_int (0)));
5207 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5208 dst2 = make_ssa_name (vectype1);
5209 epilog_stmt
5210 = gimple_build_assign (dst2, BIT_FIELD_REF,
5211 build3 (BIT_FIELD_REF, vectype1,
5212 new_temp, TYPE_SIZE (vectype1),
5213 bitsize_int (bitsize)));
5214 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5216 else
5218 /* Extract via punning to appropriately sized integer mode
5219 vector. */
5220 tree eltype = build_nonstandard_integer_type (bitsize, 1);
5221 tree etype = build_vector_type (eltype, 2);
5222 gcc_assert (convert_optab_handler (vec_extract_optab,
5223 TYPE_MODE (etype),
5224 TYPE_MODE (eltype))
5225 != CODE_FOR_nothing);
5226 tree tem = make_ssa_name (etype);
5227 epilog_stmt = gimple_build_assign (tem, VIEW_CONVERT_EXPR,
5228 build1 (VIEW_CONVERT_EXPR,
5229 etype, new_temp));
5230 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5231 new_temp = tem;
5232 tem = make_ssa_name (eltype);
5233 epilog_stmt
5234 = gimple_build_assign (tem, BIT_FIELD_REF,
5235 build3 (BIT_FIELD_REF, eltype,
5236 new_temp, TYPE_SIZE (eltype),
5237 bitsize_int (0)));
5238 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5239 dst1 = make_ssa_name (vectype1);
5240 epilog_stmt = gimple_build_assign (dst1, VIEW_CONVERT_EXPR,
5241 build1 (VIEW_CONVERT_EXPR,
5242 vectype1, tem));
5243 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5244 tem = make_ssa_name (eltype);
5245 epilog_stmt
5246 = gimple_build_assign (tem, BIT_FIELD_REF,
5247 build3 (BIT_FIELD_REF, eltype,
5248 new_temp, TYPE_SIZE (eltype),
5249 bitsize_int (bitsize)));
5250 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5251 dst2 = make_ssa_name (vectype1);
5252 epilog_stmt = gimple_build_assign (dst2, VIEW_CONVERT_EXPR,
5253 build1 (VIEW_CONVERT_EXPR,
5254 vectype1, tem));
5255 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5258 new_temp = make_ssa_name (vectype1);
5259 epilog_stmt = gimple_build_assign (new_temp, code, dst1, dst2);
5260 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5261 new_phis[0] = epilog_stmt;
5264 if (reduce_with_shift && !slp_reduc)
5266 int element_bitsize = tree_to_uhwi (bitsize);
5267 /* Enforced by vectorizable_reduction, which disallows SLP reductions
5268 for variable-length vectors and also requires direct target support
5269 for loop reductions. */
5270 int vec_size_in_bits = tree_to_uhwi (TYPE_SIZE (vectype1));
5271 int nelements = vec_size_in_bits / element_bitsize;
5272 vec_perm_builder sel;
5273 vec_perm_indices indices;
5275 int elt_offset;
5277 tree zero_vec = build_zero_cst (vectype1);
5278 /* Case 2: Create:
5279 for (offset = nelements/2; offset >= 1; offset/=2)
5281 Create: va' = vec_shift <va, offset>
5282 Create: va = vop <va, va'>
5283 } */
5285 tree rhs;
5287 if (dump_enabled_p ())
5288 dump_printf_loc (MSG_NOTE, vect_location,
5289 "Reduce using vector shifts\n");
5291 gimple_seq stmts = NULL;
5292 new_temp = gimple_convert (&stmts, vectype1, new_temp);
5293 for (elt_offset = nelements / 2;
5294 elt_offset >= 1;
5295 elt_offset /= 2)
5297 calc_vec_perm_mask_for_shift (elt_offset, nelements, &sel);
5298 indices.new_vector (sel, 2, nelements);
5299 tree mask = vect_gen_perm_mask_any (vectype1, indices);
5300 new_name = gimple_build (&stmts, VEC_PERM_EXPR, vectype1,
5301 new_temp, zero_vec, mask);
5302 new_temp = gimple_build (&stmts, code,
5303 vectype1, new_name, new_temp);
5305 gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
5307 /* 2.4 Extract the final scalar result. Create:
5308 s_out3 = extract_field <v_out2, bitpos> */
5310 if (dump_enabled_p ())
5311 dump_printf_loc (MSG_NOTE, vect_location,
5312 "extract scalar result\n");
5314 rhs = build3 (BIT_FIELD_REF, scalar_type, new_temp,
5315 bitsize, bitsize_zero_node);
5316 epilog_stmt = gimple_build_assign (new_scalar_dest, rhs);
5317 new_temp = make_ssa_name (new_scalar_dest, epilog_stmt);
5318 gimple_assign_set_lhs (epilog_stmt, new_temp);
5319 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5320 scalar_results.safe_push (new_temp);
5322 else
5324 /* Case 3: Create:
5325 s = extract_field <v_out2, 0>
5326 for (offset = element_size;
5327 offset < vector_size;
5328 offset += element_size)
5330 Create: s' = extract_field <v_out2, offset>
5331 Create: s = op <s, s'> // For non SLP cases
5332 } */
5334 if (dump_enabled_p ())
5335 dump_printf_loc (MSG_NOTE, vect_location,
5336 "Reduce using scalar code.\n");
5338 int vec_size_in_bits = tree_to_uhwi (TYPE_SIZE (vectype1));
5339 int element_bitsize = tree_to_uhwi (bitsize);
5340 tree compute_type = TREE_TYPE (vectype);
5341 gimple_seq stmts = NULL;
5342 FOR_EACH_VEC_ELT (new_phis, i, new_phi)
5344 int bit_offset;
5345 if (gimple_code (new_phi) == GIMPLE_PHI)
5346 vec_temp = PHI_RESULT (new_phi);
5347 else
5348 vec_temp = gimple_assign_lhs (new_phi);
5349 new_temp = gimple_build (&stmts, BIT_FIELD_REF, compute_type,
5350 vec_temp, bitsize, bitsize_zero_node);
5352 /* In SLP we don't need to apply reduction operation, so we just
5353 collect s' values in SCALAR_RESULTS. */
5354 if (slp_reduc)
5355 scalar_results.safe_push (new_temp);
5357 for (bit_offset = element_bitsize;
5358 bit_offset < vec_size_in_bits;
5359 bit_offset += element_bitsize)
5361 tree bitpos = bitsize_int (bit_offset);
5362 new_name = gimple_build (&stmts, BIT_FIELD_REF,
5363 compute_type, vec_temp,
5364 bitsize, bitpos);
5365 if (slp_reduc)
5367 /* In SLP we don't need to apply reduction operation, so
5368 we just collect s' values in SCALAR_RESULTS. */
5369 new_temp = new_name;
5370 scalar_results.safe_push (new_name);
5372 else
5373 new_temp = gimple_build (&stmts, code, compute_type,
5374 new_name, new_temp);
5378 /* The only case where we need to reduce scalar results in SLP, is
5379 unrolling. If the size of SCALAR_RESULTS is greater than
5380 REDUC_GROUP_SIZE, we reduce them combining elements modulo
5381 REDUC_GROUP_SIZE. */
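/* For example (illustrative), with REDUC_GROUP_SIZE == 2 and four
   collected scalars s0, s1, s2, s3 this computes
   scalar_results[0] = s0 CODE s2
   scalar_results[1] = s1 CODE s3. */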
5382 if (slp_reduc)
5384 tree res, first_res, new_res;
5386 /* Reduce multiple scalar results in case of SLP unrolling. */
5387 for (j = group_size; scalar_results.iterate (j, &res);
5388 j++)
5390 first_res = scalar_results[j % group_size];
5391 new_res = gimple_build (&stmts, code, compute_type,
5392 first_res, res);
5393 scalar_results[j % group_size] = new_res;
5395 for (k = 0; k < group_size; k++)
5396 scalar_results[k] = gimple_convert (&stmts, scalar_type,
5397 scalar_results[k]);
5399 else
5401 /* Not SLP - we have one scalar to keep in SCALAR_RESULTS. */
5402 new_temp = gimple_convert (&stmts, scalar_type, new_temp);
5403 scalar_results.safe_push (new_temp);
5406 gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
5409 if ((STMT_VINFO_REDUC_TYPE (reduc_info) == INTEGER_INDUC_COND_REDUCTION)
5410 && induc_val)
5412 /* Earlier we set the initial value to be a vector of induc_val
5413 values. Check the result and if it is induc_val then replace
5414 it with the original initial value, unless induc_val is
5415 the same as initial_def already. */
5416 tree zcompare = build2 (EQ_EXPR, boolean_type_node, new_temp,
5417 induc_val);
5419 tree tmp = make_ssa_name (new_scalar_dest);
5420 epilog_stmt = gimple_build_assign (tmp, COND_EXPR, zcompare,
5421 initial_def, new_temp);
5422 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5423 scalar_results[0] = tmp;
5427 /* 2.5 Adjust the final result by the initial value of the reduction
5428 variable. (When such adjustment is not needed, then
5429 'adjustment_def' is zero). For example, if code is PLUS we create:
5430 new_temp = loop_exit_def + adjustment_def */
5432 if (adjustment_def)
5434 gcc_assert (!slp_reduc);
5435 gimple_seq stmts = NULL;
5436 if (nested_in_vect_loop)
5438 new_phi = new_phis[0];
5439 gcc_assert (VECTOR_TYPE_P (TREE_TYPE (adjustment_def)));
5440 adjustment_def = gimple_convert (&stmts, vectype, adjustment_def);
5441 new_temp = gimple_build (&stmts, code, vectype,
5442 PHI_RESULT (new_phi), adjustment_def);
5444 else
5446 new_temp = scalar_results[0];
5447 gcc_assert (TREE_CODE (TREE_TYPE (adjustment_def)) != VECTOR_TYPE);
5448 adjustment_def = gimple_convert (&stmts, scalar_type, adjustment_def);
5449 new_temp = gimple_build (&stmts, code, scalar_type,
5450 new_temp, adjustment_def);
5453 epilog_stmt = gimple_seq_last_stmt (stmts);
5454 gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
5455 if (nested_in_vect_loop)
5457 stmt_vec_info epilog_stmt_info = loop_vinfo->add_stmt (epilog_stmt);
5458 STMT_VINFO_RELATED_STMT (epilog_stmt_info)
5459 = STMT_VINFO_RELATED_STMT (loop_vinfo->lookup_stmt (new_phi));
5461 if (!double_reduc)
5462 scalar_results.quick_push (new_temp);
5463 else
5464 scalar_results[0] = new_temp;
5466 else
5467 scalar_results[0] = new_temp;
5469 new_phis[0] = epilog_stmt;
5472 if (double_reduc)
5473 loop = loop->inner;
5475 /* 2.6 Handle the loop-exit phis. Replace the uses of scalar loop-exit
5476 phis with new adjusted scalar results, i.e., replace use <s_out0>
5477 with use <s_out4>.
5479 Transform:
5480 loop_exit:
5481 s_out0 = phi <s_loop> # (scalar) EXIT_PHI
5482 v_out1 = phi <VECT_DEF> # NEW_EXIT_PHI
5483 v_out2 = reduce <v_out1>
5484 s_out3 = extract_field <v_out2, 0>
5485 s_out4 = adjust_result <s_out3>
5486 use <s_out0>
5487 use <s_out0>
5489 into:
5491 loop_exit:
5492 s_out0 = phi <s_loop> # (scalar) EXIT_PHI
5493 v_out1 = phi <VECT_DEF> # NEW_EXIT_PHI
5494 v_out2 = reduce <v_out1>
5495 s_out3 = extract_field <v_out2, 0>
5496 s_out4 = adjust_result <s_out3>
5497 use <s_out4>
5498 use <s_out4> */
5501 /* In SLP reduction chain we reduce vector results into one vector if
5502 necessary, hence we set here REDUC_GROUP_SIZE to 1. SCALAR_DEST is the
5503 LHS of the last stmt in the reduction chain, since we are looking for
5504 the loop exit phi node. */
5505 if (REDUC_GROUP_FIRST_ELEMENT (stmt_info))
5507 stmt_vec_info dest_stmt_info
5508 = vect_orig_stmt (SLP_TREE_SCALAR_STMTS (slp_node)[group_size - 1]);
5509 scalar_dest = gimple_assign_lhs (dest_stmt_info->stmt);
5510 group_size = 1;
5513 /* In SLP we may have several statements in NEW_PHIS and REDUCTION_PHIS (in
5514 case that REDUC_GROUP_SIZE is greater than vectorization factor).
5515 Therefore, we need to match SCALAR_RESULTS with corresponding statements.
5516 The first (REDUC_GROUP_SIZE / number of new vector stmts) scalar results
5517 correspond to the first vector stmt, etc.
5518 (RATIO is equal to (REDUC_GROUP_SIZE / number of new vector stmts)). */
5519 if (group_size > new_phis.length ())
5520 gcc_assert (!(group_size % new_phis.length ()));
5522 for (k = 0; k < group_size; k++)
5524 if (slp_reduc)
5526 stmt_vec_info scalar_stmt_info = SLP_TREE_SCALAR_STMTS (slp_node)[k];
5528 orig_stmt_info = STMT_VINFO_RELATED_STMT (scalar_stmt_info);
5529 /* SLP statements can't participate in patterns. */
5530 gcc_assert (!orig_stmt_info);
5531 scalar_dest = gimple_assign_lhs (scalar_stmt_info->stmt);
5534 if (nested_in_vect_loop)
5536 if (double_reduc)
5537 loop = outer_loop;
5538 else
5539 gcc_unreachable ();
5542 phis.create (3);
5543 /* Find the loop-closed-use at the loop exit of the original scalar
5544 result. (The reduction result is expected to have two immediate uses,
5545 one at the latch block, and one at the loop exit). For double
5546 reductions we are looking for exit phis of the outer loop. */
5547 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, scalar_dest)
5549 if (!flow_bb_inside_loop_p (loop, gimple_bb (USE_STMT (use_p))))
5551 if (!is_gimple_debug (USE_STMT (use_p)))
5552 phis.safe_push (USE_STMT (use_p));
5554 else
5556 if (double_reduc && gimple_code (USE_STMT (use_p)) == GIMPLE_PHI)
5558 tree phi_res = PHI_RESULT (USE_STMT (use_p));
5560 FOR_EACH_IMM_USE_FAST (phi_use_p, phi_imm_iter, phi_res)
5562 if (!flow_bb_inside_loop_p (loop,
5563 gimple_bb (USE_STMT (phi_use_p)))
5564 && !is_gimple_debug (USE_STMT (phi_use_p)))
5565 phis.safe_push (USE_STMT (phi_use_p));
5571 FOR_EACH_VEC_ELT (phis, i, exit_phi)
5573 /* Replace the uses: */
5574 orig_name = PHI_RESULT (exit_phi);
5575 scalar_result = scalar_results[k];
5576 FOR_EACH_IMM_USE_STMT (use_stmt, imm_iter, orig_name)
5578 FOR_EACH_IMM_USE_ON_STMT (use_p, imm_iter)
5579 SET_USE (use_p, scalar_result);
5580 update_stmt (use_stmt);
5584 phis.release ();
5588 /* Return a vector of type VECTYPE that is equal to the vector select
5589 operation "MASK ? VEC : IDENTITY". Insert the select statements
5590 before GSI. */
5592 static tree
5593 merge_with_identity (gimple_stmt_iterator *gsi, tree mask, tree vectype,
5594 tree vec, tree identity)
5596 tree cond = make_temp_ssa_name (vectype, NULL, "cond");
5597 gimple *new_stmt = gimple_build_assign (cond, VEC_COND_EXPR,
5598 mask, vec, identity);
5599 gsi_insert_before (gsi, new_stmt, GSI_SAME_STMT);
5600 return cond;
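/* For a loop mask MASK this emits, roughly,
   cond_N = VEC_COND_EXPR <MASK, VEC, IDENTITY>;
   so that masked-off lanes contribute only IDENTITY to the
   reduction. */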
5603 /* Successively apply CODE to each element of VECTOR_RHS, in left-to-right
5604 order, starting with LHS. Insert the extraction statements before GSI and
5605 associate the new scalar SSA names with variable SCALAR_DEST.
5606 Return the SSA name for the result. */
5608 static tree
5609 vect_expand_fold_left (gimple_stmt_iterator *gsi, tree scalar_dest,
5610 tree_code code, tree lhs, tree vector_rhs)
5612 tree vectype = TREE_TYPE (vector_rhs);
5613 tree scalar_type = TREE_TYPE (vectype);
5614 tree bitsize = TYPE_SIZE (scalar_type);
5615 unsigned HOST_WIDE_INT vec_size_in_bits = tree_to_uhwi (TYPE_SIZE (vectype));
5616 unsigned HOST_WIDE_INT element_bitsize = tree_to_uhwi (bitsize);
5618 for (unsigned HOST_WIDE_INT bit_offset = 0;
5619 bit_offset < vec_size_in_bits;
5620 bit_offset += element_bitsize)
5622 tree bitpos = bitsize_int (bit_offset);
5623 tree rhs = build3 (BIT_FIELD_REF, scalar_type, vector_rhs,
5624 bitsize, bitpos);
5626 gassign *stmt = gimple_build_assign (scalar_dest, rhs);
5627 rhs = make_ssa_name (scalar_dest, stmt);
5628 gimple_assign_set_lhs (stmt, rhs);
5629 gsi_insert_before (gsi, stmt, GSI_SAME_STMT);
5631 stmt = gimple_build_assign (scalar_dest, code, lhs, rhs);
5632 tree new_name = make_ssa_name (scalar_dest, stmt);
5633 gimple_assign_set_lhs (stmt, new_name);
5634 gsi_insert_before (gsi, stmt, GSI_SAME_STMT);
5635 lhs = new_name;
5637 return lhs;
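/* As an illustrative sketch for a four-element VECTOR_RHS v with
   element size SZ bits, this emits
   s0 = BIT_FIELD_REF <v, SZ, 0 * SZ>; lhs = lhs CODE s0;
   s1 = BIT_FIELD_REF <v, SZ, 1 * SZ>; lhs = lhs CODE s1;
   and so on for the remaining elements, returning the final LHS. */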
5640 /* Get a masked internal function equivalent to REDUC_FN. VECTYPE_IN is the
5641 type of the vector input. */
5643 static internal_fn
5644 get_masked_reduction_fn (internal_fn reduc_fn, tree vectype_in)
5646 internal_fn mask_reduc_fn;
5648 switch (reduc_fn)
5650 case IFN_FOLD_LEFT_PLUS:
5651 mask_reduc_fn = IFN_MASK_FOLD_LEFT_PLUS;
5652 break;
5654 default:
5655 return IFN_LAST;
5658 if (direct_internal_fn_supported_p (mask_reduc_fn, vectype_in,
5659 OPTIMIZE_FOR_SPEED))
5660 return mask_reduc_fn;
5661 return IFN_LAST;
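/* For example, IFN_FOLD_LEFT_PLUS maps to IFN_MASK_FOLD_LEFT_PLUS when
   the target supports the masked variant for VECTYPE_IN; otherwise
   IFN_LAST is returned and the caller falls back to merging the input
   with the identity vector via a VEC_COND_EXPR. */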
5664 /* Perform an in-order reduction (FOLD_LEFT_REDUCTION). STMT_INFO is the
5665 statement that sets the live-out value. REDUC_DEF_STMT is the phi
5666 statement. CODE is the operation performed by STMT_INFO and OPS are
5667 its scalar operands. REDUC_INDEX is the index of the operand in
5668 OPS that is set by REDUC_DEF_STMT. REDUC_FN is the function that
5669 implements in-order reduction, or IFN_LAST if we should open-code it.
5670 VECTYPE_IN is the type of the vector input. MASKS specifies the masks
5671 that should be used to control the operation in a fully-masked loop. */
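/* As an illustrative sketch, a single-vector in-order reduction of
   res += a[i] with REDUC_FN == IFN_FOLD_LEFT_PLUS becomes roughly
   res_1 = .FOLD_LEFT_PLUS (res_0, vec_a);
   (or .MASK_FOLD_LEFT_PLUS with the loop mask in a fully-masked loop),
   while with REDUC_FN == IFN_LAST it is open-coded by
   vect_expand_fold_left as a chain of scalar extracts and CODE
   operations in element order. */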
5673 static bool
5674 vectorize_fold_left_reduction (loop_vec_info loop_vinfo,
5675 stmt_vec_info stmt_info,
5676 gimple_stmt_iterator *gsi,
5677 stmt_vec_info *vec_stmt, slp_tree slp_node,
5678 gimple *reduc_def_stmt,
5679 tree_code code, internal_fn reduc_fn,
5680 tree ops[3], tree vectype_in,
5681 int reduc_index, vec_loop_masks *masks)
5683 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
5684 tree vectype_out = STMT_VINFO_VECTYPE (stmt_info);
5685 stmt_vec_info new_stmt_info = NULL;
5686 internal_fn mask_reduc_fn = get_masked_reduction_fn (reduc_fn, vectype_in);
5688 int ncopies;
5689 if (slp_node)
5690 ncopies = 1;
5691 else
5692 ncopies = vect_get_num_copies (loop_vinfo, vectype_in);
5694 gcc_assert (!nested_in_vect_loop_p (loop, stmt_info));
5695 gcc_assert (ncopies == 1);
5696 gcc_assert (TREE_CODE_LENGTH (code) == binary_op);
5698 if (slp_node)
5699 gcc_assert (known_eq (TYPE_VECTOR_SUBPARTS (vectype_out),
5700 TYPE_VECTOR_SUBPARTS (vectype_in)));
5702 tree op0 = ops[1 - reduc_index];
5704 int group_size = 1;
5705 stmt_vec_info scalar_dest_def_info;
5706 auto_vec<tree> vec_oprnds0;
5707 if (slp_node)
5709 auto_vec<vec<tree> > vec_defs (2);
5710 vect_get_slp_defs (loop_vinfo, slp_node, &vec_defs);
5711 vec_oprnds0.safe_splice (vec_defs[1 - reduc_index]);
5712 vec_defs[0].release ();
5713 vec_defs[1].release ();
5714 group_size = SLP_TREE_SCALAR_STMTS (slp_node).length ();
5715 scalar_dest_def_info = SLP_TREE_SCALAR_STMTS (slp_node)[group_size - 1];
5717 else
5719 tree loop_vec_def0 = vect_get_vec_def_for_operand (loop_vinfo,
5720 op0, stmt_info);
5721 vec_oprnds0.create (1);
5722 vec_oprnds0.quick_push (loop_vec_def0);
5723 scalar_dest_def_info = stmt_info;
5726 tree scalar_dest = gimple_assign_lhs (scalar_dest_def_info->stmt);
5727 tree scalar_type = TREE_TYPE (scalar_dest);
5728 tree reduc_var = gimple_phi_result (reduc_def_stmt);
5730 int vec_num = vec_oprnds0.length ();
5731 gcc_assert (vec_num == 1 || slp_node);
5732 tree vec_elem_type = TREE_TYPE (vectype_out);
5733 gcc_checking_assert (useless_type_conversion_p (scalar_type, vec_elem_type));
5735 tree vector_identity = NULL_TREE;
5736 if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
5737 vector_identity = build_zero_cst (vectype_out);
5739 tree scalar_dest_var = vect_create_destination_var (scalar_dest, NULL);
5740 int i;
5741 tree def0;
5742 FOR_EACH_VEC_ELT (vec_oprnds0, i, def0)
5744 gimple *new_stmt;
5745 tree mask = NULL_TREE;
5746 if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
5747 mask = vect_get_loop_mask (gsi, masks, vec_num, vectype_in, i);
5749 /* Handle MINUS by adding the negative. */
5750 if (reduc_fn != IFN_LAST && code == MINUS_EXPR)
5752 tree negated = make_ssa_name (vectype_out);
5753 new_stmt = gimple_build_assign (negated, NEGATE_EXPR, def0);
5754 gsi_insert_before (gsi, new_stmt, GSI_SAME_STMT);
5755 def0 = negated;
5758 if (mask && mask_reduc_fn == IFN_LAST)
5759 def0 = merge_with_identity (gsi, mask, vectype_out, def0,
5760 vector_identity);
5762 /* On the first iteration the input is simply the scalar phi
5763 result, and for subsequent iterations it is the output of
5764 the preceding operation. */
5765 if (reduc_fn != IFN_LAST || (mask && mask_reduc_fn != IFN_LAST))
5767 if (mask && mask_reduc_fn != IFN_LAST)
5768 new_stmt = gimple_build_call_internal (mask_reduc_fn, 3, reduc_var,
5769 def0, mask);
5770 else
5771 new_stmt = gimple_build_call_internal (reduc_fn, 2, reduc_var,
5772 def0);
5773 /* For chained SLP reductions the output of the previous reduction
5774 operation serves as the input of the next. For the final statement
5775 the output cannot be a temporary - we reuse the original
5776 scalar destination of the last statement. */
5777 if (i != vec_num - 1)
5779 gimple_set_lhs (new_stmt, scalar_dest_var);
5780 reduc_var = make_ssa_name (scalar_dest_var, new_stmt);
5781 gimple_set_lhs (new_stmt, reduc_var);
5784 else
5786 reduc_var = vect_expand_fold_left (gsi, scalar_dest_var, code,
5787 reduc_var, def0);
5788 new_stmt = SSA_NAME_DEF_STMT (reduc_var);
5789 /* Remove the statement, so that we can use the same code paths
5790 as for statements that we've just created. */
5791 gimple_stmt_iterator tmp_gsi = gsi_for_stmt (new_stmt);
5792 gsi_remove (&tmp_gsi, true);
5795 if (i == vec_num - 1)
5797 gimple_set_lhs (new_stmt, scalar_dest);
5798 new_stmt_info = vect_finish_replace_stmt (loop_vinfo,
5799 scalar_dest_def_info,
5800 new_stmt);
5802 else
5803 new_stmt_info = vect_finish_stmt_generation (loop_vinfo,
5804 scalar_dest_def_info,
5805 new_stmt, gsi);
5807 if (slp_node)
5808 SLP_TREE_VEC_STMTS (slp_node).quick_push (new_stmt_info);
5811 if (!slp_node)
5812 STMT_VINFO_VEC_STMT (stmt_info) = *vec_stmt = new_stmt_info;
5814 return true;
5817 /* Function is_nonwrapping_integer_induction.
5819 Check if STMT_VINFO (which is part of loop LOOP) is an integer
5820 induction that increments and does not overflow. */
5822 static bool
5823 is_nonwrapping_integer_induction (stmt_vec_info stmt_vinfo, class loop *loop)
5825 gphi *phi = as_a <gphi *> (stmt_vinfo->stmt);
5826 tree base = STMT_VINFO_LOOP_PHI_EVOLUTION_BASE_UNCHANGED (stmt_vinfo);
5827 tree step = STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_vinfo);
5828 tree lhs_type = TREE_TYPE (gimple_phi_result (phi));
5829 widest_int ni, max_loop_value, lhs_max;
5830 wi::overflow_type overflow = wi::OVF_NONE;
5832 /* Make sure the loop is integer based. */
5833 if (TREE_CODE (base) != INTEGER_CST
5834 || TREE_CODE (step) != INTEGER_CST)
5835 return false;
5837 /* Check that the max size of the loop will not wrap. */
5839 if (TYPE_OVERFLOW_UNDEFINED (lhs_type))
5840 return true;
5842 if (! max_stmt_executions (loop, &ni))
5843 return false;
5845 max_loop_value = wi::mul (wi::to_widest (step), ni, TYPE_SIGN (lhs_type),
5846 &overflow);
5847 if (overflow)
5848 return false;
5850 max_loop_value = wi::add (wi::to_widest (base), max_loop_value,
5851 TYPE_SIGN (lhs_type), &overflow);
5852 if (overflow)
5853 return false;
5855 return (wi::min_precision (max_loop_value, TYPE_SIGN (lhs_type))
5856 <= TYPE_PRECISION (lhs_type));
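/* For example (illustrative), an 8-bit unsigned induction with base 0
   and step 4 in a loop executing at most 50 times reaches at most
   0 + 4 * 50 = 200, which fits in 8 bits, so it is considered
   non-wrapping. */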
5859 /* Check if masking can be supported by inserting a conditional expression.
5860 CODE is the code for the operation. COND_FN is the conditional internal
5861 function, if it exists. VECTYPE_IN is the type of the vector input. */
5862 static bool
5863 use_mask_by_cond_expr_p (enum tree_code code, internal_fn cond_fn,
5864 tree vectype_in)
5866 if (cond_fn != IFN_LAST
5867 && direct_internal_fn_supported_p (cond_fn, vectype_in,
5868 OPTIMIZE_FOR_SPEED))
5869 return false;
5871 switch (code)
5873 case DOT_PROD_EXPR:
5874 case SAD_EXPR:
5875 return true;
5877 default:
5878 return false;
5882 /* Insert a conditional expression to enable masked vectorization. CODE is the
5883 code for the operation. VOP is the array of operands. MASK is the loop
5884 mask. GSI is a statement iterator used to place the new conditional
5885 expression. */
5886 static void
5887 build_vect_cond_expr (enum tree_code code, tree vop[3], tree mask,
5888 gimple_stmt_iterator *gsi)
5890 switch (code)
5892 case DOT_PROD_EXPR:
5894 tree vectype = TREE_TYPE (vop[1]);
5895 tree zero = build_zero_cst (vectype);
5896 tree masked_op1 = make_temp_ssa_name (vectype, NULL, "masked_op1");
5897 gassign *select = gimple_build_assign (masked_op1, VEC_COND_EXPR,
5898 mask, vop[1], zero);
5899 gsi_insert_before (gsi, select, GSI_SAME_STMT);
5900 vop[1] = masked_op1;
5901 break;
5904 case SAD_EXPR:
5906 tree vectype = TREE_TYPE (vop[1]);
5907 tree masked_op1 = make_temp_ssa_name (vectype, NULL, "masked_op1");
5908 gassign *select = gimple_build_assign (masked_op1, VEC_COND_EXPR,
5909 mask, vop[1], vop[0]);
5910 gsi_insert_before (gsi, select, GSI_SAME_STMT);
5911 vop[1] = masked_op1;
5912 break;
5915 default:
5916 gcc_unreachable ();
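/* For example (illustrative), for DOT_PROD_EXPR this emits
   masked_op1 = VEC_COND_EXPR <MASK, vop[1], { 0, ... }>;
   so inactive lanes add 0 to the accumulator, whereas for SAD_EXPR the
   else value is vop[0] so the absolute difference of inactive lanes
   is 0. */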
5920 /* Function vectorizable_reduction.
5922 Check if STMT_INFO performs a reduction operation that can be vectorized.
5923 If VEC_STMT is also passed, vectorize STMT_INFO: create a vectorized
5924 stmt to replace it, put it in VEC_STMT, and insert it at GSI.
5925 Return true if STMT_INFO is vectorizable in this way.
5927 This function also handles reduction idioms (patterns) that have been
5928 recognized in advance during vect_pattern_recog. In this case, STMT_INFO
5929 may be of this form:
5930 X = pattern_expr (arg0, arg1, ..., X)
5931 and its STMT_VINFO_RELATED_STMT points to the last stmt in the original
5932 sequence that had been detected and replaced by the pattern-stmt
5933 (STMT_INFO).
5935 This function also handles reduction of condition expressions, for example:
5936 for (int i = 0; i < N; i++)
5937 if (a[i] < value)
5938 last = a[i];
5939 This is handled by vectorizing the loop and creating an additional vector
5940 containing the loop indexes for which "a[i] < value" was true. In the
5941 function epilogue this is reduced to a single max value and then used to
5942 index into the vector of results.
5944 In some cases of reduction patterns, the type of the reduction variable X is
5945 different than the type of the other arguments of STMT_INFO.
5946 In such cases, the vectype that is used when transforming STMT_INFO into
5947 a vector stmt is different than the vectype that is used to determine the
5948 vectorization factor, because it consists of a different number of elements
5949 than the actual number of elements that are being operated upon in parallel.
5951 For example, consider an accumulation of shorts into an int accumulator.
5952 On some targets it's possible to vectorize this pattern operating on 8
5953 shorts at a time (hence, the vectype for purposes of determining the
5954 vectorization factor should be V8HI); on the other hand, the vectype that
5955 is used to create the vector form is actually V4SI (the type of the result).
5957 Upon entry to this function, STMT_VINFO_VECTYPE records the vectype that
5958 indicates what is the actual level of parallelism (V8HI in the example), so
5959 that the right vectorization factor would be derived. This vectype
5960 corresponds to the type of arguments to the reduction stmt, and should *NOT*
5961 be used to create the vectorized stmt. The right vectype for the vectorized
5962 stmt is obtained from the type of the result X:
5963 get_vectype_for_scalar_type (vinfo, TREE_TYPE (X))
5965 This means that, contrary to "regular" reductions (or "regular" stmts in
5966 general), the following equation:
5967 STMT_VINFO_VECTYPE == get_vectype_for_scalar_type (vinfo, TREE_TYPE (X))
5968 does *NOT* necessarily hold for reduction patterns. */
5970 bool
5971 vectorizable_reduction (loop_vec_info loop_vinfo,
5972 stmt_vec_info stmt_info, slp_tree slp_node,
5973 slp_instance slp_node_instance,
5974 stmt_vector_for_cost *cost_vec)
5976 tree scalar_dest;
5977 tree vectype_in = NULL_TREE;
5978 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
5979 enum vect_def_type cond_reduc_dt = vect_unknown_def_type;
5980 stmt_vec_info cond_stmt_vinfo = NULL;
5981 tree scalar_type;
5982 int i;
5983 int ncopies;
5984 bool single_defuse_cycle = false;
5985 bool nested_cycle = false;
5986 bool double_reduc = false;
5987 int vec_num;
5988 tree tem;
5989 tree cr_index_scalar_type = NULL_TREE, cr_index_vector_type = NULL_TREE;
5990 tree cond_reduc_val = NULL_TREE;
5992 /* Make sure it was already recognized as a reduction computation. */
5993 if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_reduction_def
5994 && STMT_VINFO_DEF_TYPE (stmt_info) != vect_double_reduction_def
5995 && STMT_VINFO_DEF_TYPE (stmt_info) != vect_nested_cycle)
5996 return false;
5998 /* The stmt we store reduction analysis meta on. */
5999 stmt_vec_info reduc_info = info_for_reduction (loop_vinfo, stmt_info);
6000 reduc_info->is_reduc_info = true;
6002 if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_nested_cycle)
6004 if (is_a <gphi *> (stmt_info->stmt))
6005 /* Analysis for double-reduction is done on the outer
6006 loop PHI, nested cycles have no further restrictions. */
6007 STMT_VINFO_TYPE (stmt_info) = cycle_phi_info_type;
6008 else
6009 STMT_VINFO_TYPE (stmt_info) = reduc_vec_info_type;
6010 return true;
6013 stmt_vec_info orig_stmt_of_analysis = stmt_info;
6014 stmt_vec_info phi_info = stmt_info;
6015 if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_reduction_def
6016 || STMT_VINFO_DEF_TYPE (stmt_info) == vect_double_reduction_def)
6018 if (!is_a <gphi *> (stmt_info->stmt))
6020 STMT_VINFO_TYPE (stmt_info) = reduc_vec_info_type;
6021 return true;
6023 if (slp_node)
6025 slp_node_instance->reduc_phis = slp_node;
6026 /* ??? We're leaving slp_node to point to the PHIs, we only
6027 need it to get at the number of vector stmts which wasn't
6028 yet initialized for the instance root. */
6030 if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_reduction_def)
6031 stmt_info = vect_stmt_to_vectorize (STMT_VINFO_REDUC_DEF (stmt_info));
6032 else /* STMT_VINFO_DEF_TYPE (stmt_info) == vect_double_reduction_def */
6034 use_operand_p use_p;
6035 gimple *use_stmt;
6036 bool res = single_imm_use (gimple_phi_result (stmt_info->stmt),
6037 &use_p, &use_stmt);
6038 gcc_assert (res);
6039 phi_info = loop_vinfo->lookup_stmt (use_stmt);
6040 stmt_info = vect_stmt_to_vectorize (STMT_VINFO_REDUC_DEF (phi_info));
6044 /* PHIs should not participate in patterns. */
6045 gcc_assert (!STMT_VINFO_RELATED_STMT (phi_info));
6046 gphi *reduc_def_phi = as_a <gphi *> (phi_info->stmt);
6048 /* Verify following REDUC_IDX from the latch def leads us back to the PHI
6049 and compute the reduction chain length. */
6050 tree reduc_def = PHI_ARG_DEF_FROM_EDGE (reduc_def_phi,
6051 loop_latch_edge (loop));
6052 unsigned reduc_chain_length = 0;
6053 bool only_slp_reduc_chain = true;
6054 stmt_info = NULL;
6055 while (reduc_def != PHI_RESULT (reduc_def_phi))
6057 stmt_vec_info def = loop_vinfo->lookup_def (reduc_def);
6058 stmt_vec_info vdef = vect_stmt_to_vectorize (def);
6059 if (STMT_VINFO_REDUC_IDX (vdef) == -1)
6061 if (dump_enabled_p ())
6062 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6063 "reduction chain broken by patterns.\n");
6064 return false;
6066 if (!REDUC_GROUP_FIRST_ELEMENT (vdef))
6067 only_slp_reduc_chain = false;
6068 /* ??? For epilogue generation live members of the chain need
6069 to point back to the PHI via their original stmt for
6070 info_for_reduction to work. */
6071 if (STMT_VINFO_LIVE_P (vdef))
6072 STMT_VINFO_REDUC_DEF (def) = phi_info;
6073 gassign *assign = dyn_cast <gassign *> (vdef->stmt);
6074 if (!assign)
6076 if (dump_enabled_p ())
6077 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6078 "reduction chain includes calls.\n");
6079 return false;
6081 if (CONVERT_EXPR_CODE_P (gimple_assign_rhs_code (assign)))
6083 if (!tree_nop_conversion_p (TREE_TYPE (gimple_assign_lhs (assign)),
6084 TREE_TYPE (gimple_assign_rhs1 (assign))))
6086 if (dump_enabled_p ())
6087 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6088 "conversion in the reduction chain.\n");
6089 return false;
6092 else if (!stmt_info)
6093 /* First non-conversion stmt. */
6094 stmt_info = vdef;
6095 reduc_def = gimple_op (vdef->stmt, 1 + STMT_VINFO_REDUC_IDX (vdef));
6096 reduc_chain_length++;
6098 /* PHIs should not participate in patterns. */
6099 gcc_assert (!STMT_VINFO_RELATED_STMT (phi_info));
6101 if (nested_in_vect_loop_p (loop, stmt_info))
6103 loop = loop->inner;
6104 nested_cycle = true;
6107 /* STMT_VINFO_REDUC_DEF doesn't point to the first but the last
6108 element. */
6109 if (slp_node && REDUC_GROUP_FIRST_ELEMENT (stmt_info))
6111 gcc_assert (!REDUC_GROUP_NEXT_ELEMENT (stmt_info));
6112 stmt_info = REDUC_GROUP_FIRST_ELEMENT (stmt_info);
6114 if (REDUC_GROUP_FIRST_ELEMENT (stmt_info))
6115 gcc_assert (slp_node
6116 && REDUC_GROUP_FIRST_ELEMENT (stmt_info) == stmt_info);
6118 /* 1. Is vectorizable reduction? */
6119 /* Not supportable if the reduction variable is used in the loop, unless
6120 it's a reduction chain. */
6121 if (STMT_VINFO_RELEVANT (stmt_info) > vect_used_in_outer
6122 && !REDUC_GROUP_FIRST_ELEMENT (stmt_info))
6123 return false;
6125 /* Reductions that are not used even in an enclosing outer-loop,
6126 are expected to be "live" (used out of the loop). */
6127 if (STMT_VINFO_RELEVANT (stmt_info) == vect_unused_in_scope
6128 && !STMT_VINFO_LIVE_P (stmt_info))
6129 return false;
6131 /* 2. Has this been recognized as a reduction pattern?
6133 Check if STMT represents a pattern that has been recognized
6134 in earlier analysis stages. For stmts that represent a pattern,
6135 the STMT_VINFO_RELATED_STMT field records the last stmt in
6136 the original sequence that constitutes the pattern. */
6138 stmt_vec_info orig_stmt_info = STMT_VINFO_RELATED_STMT (stmt_info);
6139 if (orig_stmt_info)
6141 gcc_assert (STMT_VINFO_IN_PATTERN_P (orig_stmt_info));
6142 gcc_assert (!STMT_VINFO_IN_PATTERN_P (stmt_info));
6145 /* 3. Check the operands of the operation. The first operands are defined
6146 inside the loop body. The last operand is the reduction variable,
6147 which is defined by the loop-header-phi. */
6149 tree vectype_out = STMT_VINFO_VECTYPE (stmt_info);
6150 STMT_VINFO_REDUC_VECTYPE (reduc_info) = vectype_out;
6151 gassign *stmt = as_a <gassign *> (stmt_info->stmt);
6152 enum tree_code code = gimple_assign_rhs_code (stmt);
6153 bool lane_reduc_code_p
6154 = (code == DOT_PROD_EXPR || code == WIDEN_SUM_EXPR || code == SAD_EXPR);
6155 int op_type = TREE_CODE_LENGTH (code);
6157 scalar_dest = gimple_assign_lhs (stmt);
6158 scalar_type = TREE_TYPE (scalar_dest);
6159 if (!POINTER_TYPE_P (scalar_type) && !INTEGRAL_TYPE_P (scalar_type)
6160 && !SCALAR_FLOAT_TYPE_P (scalar_type))
6161 return false;
6163 /* Do not try to vectorize bit-precision reductions. */
6164 if (!type_has_mode_precision_p (scalar_type))
6165 return false;
6167 /* For lane-reducing ops we're reducing the number of reduction PHIs,
6168 which means the only use of such a PHI may be in the lane-reducing operation. */
6169 if (lane_reduc_code_p
6170 && reduc_chain_length != 1
6171 && !only_slp_reduc_chain)
6173 if (dump_enabled_p ())
6174 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6175 "lane-reducing reduction with extra stmts.\n");
6176 return false;
6179 /* All uses but the last are expected to be defined in the loop.
6180 The last use is the reduction variable. In case of nested cycle this
6181 assumption is not true: we use reduc_index to record the index of the
6182 reduction variable. */
6183 reduc_def = PHI_RESULT (reduc_def_phi);
6184 for (i = 0; i < op_type; i++)
6186 tree op = gimple_op (stmt, i + 1);
6187 /* The condition of COND_EXPR is checked in vectorizable_condition(). */
6188 if (i == 0 && code == COND_EXPR)
6189 continue;
6191 stmt_vec_info def_stmt_info;
6192 enum vect_def_type dt;
6193 if (!vect_is_simple_use (op, loop_vinfo, &dt, &tem,
6194 &def_stmt_info))
6196 if (dump_enabled_p ())
6197 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6198 "use not simple.\n");
6199 return false;
6201 if (i == STMT_VINFO_REDUC_IDX (stmt_info))
6202 continue;
6204 /* There should be only one cycle def in the stmt, the one
6205 leading to reduc_def. */
6206 if (VECTORIZABLE_CYCLE_DEF (dt))
6207 return false;
6209 /* To properly compute ncopies we are interested in the widest
6210 non-reduction input type in case we're looking at a widening
6211 accumulation that we later handle in vect_transform_reduction. */
6212 if (lane_reduc_code_p
6213 && tem
6214 && (!vectype_in
6215 || (GET_MODE_SIZE (SCALAR_TYPE_MODE (TREE_TYPE (vectype_in)))
6216 < GET_MODE_SIZE (SCALAR_TYPE_MODE (TREE_TYPE (tem))))))
6217 vectype_in = tem;
6219 if (code == COND_EXPR)
6221 /* Record how the non-reduction-def value of COND_EXPR is defined. */
6222 if (dt == vect_constant_def)
6224 cond_reduc_dt = dt;
6225 cond_reduc_val = op;
6227 if (dt == vect_induction_def
6228 && def_stmt_info
6229 && is_nonwrapping_integer_induction (def_stmt_info, loop))
6231 cond_reduc_dt = dt;
6232 cond_stmt_vinfo = def_stmt_info;
6236 if (!vectype_in)
6237 vectype_in = STMT_VINFO_VECTYPE (phi_info);
6238 STMT_VINFO_REDUC_VECTYPE_IN (reduc_info) = vectype_in;
6240 enum vect_reduction_type v_reduc_type = STMT_VINFO_REDUC_TYPE (phi_info);
6241 STMT_VINFO_REDUC_TYPE (reduc_info) = v_reduc_type;
6242 /* If we have a condition reduction, see if we can simplify it further. */
6243 if (v_reduc_type == COND_REDUCTION)
6245 if (slp_node)
6246 return false;
6248 /* When the condition uses the reduction value in the condition, fail. */
6249 if (STMT_VINFO_REDUC_IDX (stmt_info) == 0)
6251 if (dump_enabled_p ())
6252 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6253 "condition depends on previous iteration\n");
6254 return false;
6257 if (reduc_chain_length == 1
6258 && direct_internal_fn_supported_p (IFN_FOLD_EXTRACT_LAST,
6259 vectype_in, OPTIMIZE_FOR_SPEED))
6261 if (dump_enabled_p ())
6262 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6263 "optimizing condition reduction with"
6264 " FOLD_EXTRACT_LAST.\n");
6265 STMT_VINFO_REDUC_TYPE (reduc_info) = EXTRACT_LAST_REDUCTION;
6267 else if (cond_reduc_dt == vect_induction_def)
6269 tree base
6270 = STMT_VINFO_LOOP_PHI_EVOLUTION_BASE_UNCHANGED (cond_stmt_vinfo);
6271 tree step = STMT_VINFO_LOOP_PHI_EVOLUTION_PART (cond_stmt_vinfo);
6273 gcc_assert (TREE_CODE (base) == INTEGER_CST
6274 && TREE_CODE (step) == INTEGER_CST);
6275 cond_reduc_val = NULL_TREE;
6276 enum tree_code cond_reduc_op_code = ERROR_MARK;
6277 tree res = PHI_RESULT (STMT_VINFO_STMT (cond_stmt_vinfo));
6278 if (!types_compatible_p (TREE_TYPE (res), TREE_TYPE (base)))
6280 /* Find a suitable value, for MAX_EXPR below base, for MIN_EXPR
6281 above base; punt if base is the minimum value of the type for
6282 MAX_EXPR or maximum value of the type for MIN_EXPR for now. */
6283 else if (tree_int_cst_sgn (step) == -1)
6285 cond_reduc_op_code = MIN_EXPR;
6286 if (tree_int_cst_sgn (base) == -1)
6287 cond_reduc_val = build_int_cst (TREE_TYPE (base), 0);
6288 else if (tree_int_cst_lt (base,
6289 TYPE_MAX_VALUE (TREE_TYPE (base))))
6290 cond_reduc_val
6291 = int_const_binop (PLUS_EXPR, base, integer_one_node);
6293 else
6295 cond_reduc_op_code = MAX_EXPR;
6296 if (tree_int_cst_sgn (base) == 1)
6297 cond_reduc_val = build_int_cst (TREE_TYPE (base), 0);
6298 else if (tree_int_cst_lt (TYPE_MIN_VALUE (TREE_TYPE (base)),
6299 base))
6300 cond_reduc_val
6301 = int_const_binop (MINUS_EXPR, base, integer_one_node);
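/* For example (illustrative), a decreasing induction with base 10 and
   step -1 selects MIN_EXPR with cond_reduc_val 11, a value the
   induction never reaches. */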
6303 if (cond_reduc_val)
6305 if (dump_enabled_p ())
6306 dump_printf_loc (MSG_NOTE, vect_location,
6307 "condition expression based on "
6308 "integer induction.\n");
6309 STMT_VINFO_REDUC_CODE (reduc_info) = cond_reduc_op_code;
6310 STMT_VINFO_VEC_INDUC_COND_INITIAL_VAL (reduc_info)
6311 = cond_reduc_val;
6312 STMT_VINFO_REDUC_TYPE (reduc_info) = INTEGER_INDUC_COND_REDUCTION;
6315 else if (cond_reduc_dt == vect_constant_def)
6317 enum vect_def_type cond_initial_dt;
6318 tree cond_initial_val
6319 = PHI_ARG_DEF_FROM_EDGE (reduc_def_phi, loop_preheader_edge (loop));
6321 gcc_assert (cond_reduc_val != NULL_TREE);
6322 vect_is_simple_use (cond_initial_val, loop_vinfo, &cond_initial_dt);
6323 if (cond_initial_dt == vect_constant_def
6324 && types_compatible_p (TREE_TYPE (cond_initial_val),
6325 TREE_TYPE (cond_reduc_val)))
6327 tree e = fold_binary (LE_EXPR, boolean_type_node,
6328 cond_initial_val, cond_reduc_val);
6329 if (e && (integer_onep (e) || integer_zerop (e)))
6331 if (dump_enabled_p ())
6332 dump_printf_loc (MSG_NOTE, vect_location,
6333 "condition expression based on "
6334 "compile time constant.\n");
6335 /* Record reduction code at analysis stage. */
6336 STMT_VINFO_REDUC_CODE (reduc_info)
6337 = integer_onep (e) ? MAX_EXPR : MIN_EXPR;
6338 STMT_VINFO_REDUC_TYPE (reduc_info) = CONST_COND_REDUCTION;
6344 if (STMT_VINFO_LIVE_P (phi_info))
6345 return false;
6347 if (slp_node)
6348 ncopies = 1;
6349 else
6350 ncopies = vect_get_num_copies (loop_vinfo, vectype_in);
6352 gcc_assert (ncopies >= 1);
6354 poly_uint64 nunits_out = TYPE_VECTOR_SUBPARTS (vectype_out);
6356 if (nested_cycle)
6358 gcc_assert (STMT_VINFO_DEF_TYPE (reduc_info)
6359 == vect_double_reduction_def);
6360 double_reduc = true;
6363 /* 4.2. Check support for the epilog operation.
6365 If STMT represents a reduction pattern, then the type of the
6366 reduction variable may be different than the type of the rest
6367 of the arguments. For example, consider the case of accumulation
6368 of shorts into an int accumulator; The original code:
6369 S1: int_a = (int) short_a;
6370 orig_stmt-> S2: int_acc = plus <int_a ,int_acc>;
6372 was replaced with:
6373 STMT: int_acc = widen_sum <short_a, int_acc>
6375 This means that:
6376 1. The tree-code that is used to create the vector operation in the
6377 epilog code (that reduces the partial results) is not the
6378 tree-code of STMT, but is rather the tree-code of the original
6379 stmt from the pattern that STMT is replacing. I.e, in the example
6380 above we want to use 'widen_sum' in the loop, but 'plus' in the
6381 epilog.
6382 2. The type (mode) we use to check available target support
6383 for the vector operation to be created in the *epilog*, is
6384 determined by the type of the reduction variable (in the example
6385 above we'd check this: optab_handler (plus_optab, vect_int_mode])).
6386 However the type (mode) we use to check available target support
6387 for the vector operation to be created *inside the loop*, is
6388 determined by the type of the other arguments to STMT (in the
6389 example we'd check this: optab_handler (widen_sum_optab,
6390 vect_short_mode)).
6392 This is contrary to "regular" reductions, in which the types of all
6393 the arguments are the same as the type of the reduction variable.
6394 For "regular" reductions we can therefore use the same vector type
6395 (and also the same tree-code) when generating the epilog code and
6396 when generating the code inside the loop. */
6398 enum tree_code orig_code = STMT_VINFO_REDUC_CODE (phi_info);
6399 STMT_VINFO_REDUC_CODE (reduc_info) = orig_code;
6401 vect_reduction_type reduction_type = STMT_VINFO_REDUC_TYPE (reduc_info);
6402 if (reduction_type == TREE_CODE_REDUCTION)
6404 /* Check whether it's ok to change the order of the computation.
6405 Generally, when vectorizing a reduction we change the order of the
6406 computation. This may change the behavior of the program in some
6407 cases, so we need to check that this is ok. One exception is when
6408 vectorizing an outer-loop: the inner-loop is executed sequentially,
6409 and therefore vectorizing reductions in the inner-loop during
6410 outer-loop vectorization is safe. */
6411 if (needs_fold_left_reduction_p (scalar_type, orig_code))
6413 /* When vectorizing a reduction chain w/o SLP the reduction PHI
6414 is not directly used in stmt. */
6415 if (!only_slp_reduc_chain
6416 && reduc_chain_length != 1)
6418 if (dump_enabled_p ())
6419 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6420 "in-order reduction chain without SLP.\n");
6421 return false;
6423 STMT_VINFO_REDUC_TYPE (reduc_info)
6424 = reduction_type = FOLD_LEFT_REDUCTION;
6426 else if (!commutative_tree_code (orig_code)
6427 || !associative_tree_code (orig_code))
6429 if (dump_enabled_p ())
6430 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6431 "reduction: not commutative/associative");
6432 return false;
6436 if ((double_reduc || reduction_type != TREE_CODE_REDUCTION)
6437 && ncopies > 1)
6439 if (dump_enabled_p ())
6440 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6441 "multiple types in double reduction or condition "
6442 "reduction or fold-left reduction.\n");
6443 return false;
6446 internal_fn reduc_fn = IFN_LAST;
6447 if (reduction_type == TREE_CODE_REDUCTION
6448 || reduction_type == FOLD_LEFT_REDUCTION
6449 || reduction_type == INTEGER_INDUC_COND_REDUCTION
6450 || reduction_type == CONST_COND_REDUCTION)
6452 if (reduction_type == FOLD_LEFT_REDUCTION
6453 ? fold_left_reduction_fn (orig_code, &reduc_fn)
6454 : reduction_fn_for_scalar_code (orig_code, &reduc_fn))
6456 if (reduc_fn != IFN_LAST
6457 && !direct_internal_fn_supported_p (reduc_fn, vectype_out,
6458 OPTIMIZE_FOR_SPEED))
6460 if (dump_enabled_p ())
6461 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6462 "reduc op not supported by target.\n");
6464 reduc_fn = IFN_LAST;
6467 else
6469 if (!nested_cycle || double_reduc)
6471 if (dump_enabled_p ())
6472 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6473 "no reduc code for scalar code.\n");
6475 return false;
6479 else if (reduction_type == COND_REDUCTION)
6481 int scalar_precision
6482 = GET_MODE_PRECISION (SCALAR_TYPE_MODE (scalar_type));
6483 cr_index_scalar_type = make_unsigned_type (scalar_precision);
6484 cr_index_vector_type = build_vector_type (cr_index_scalar_type,
6485 nunits_out);
6487 if (direct_internal_fn_supported_p (IFN_REDUC_MAX, cr_index_vector_type,
6488 OPTIMIZE_FOR_SPEED))
6489 reduc_fn = IFN_REDUC_MAX;
6491 STMT_VINFO_REDUC_FN (reduc_info) = reduc_fn;
6493 if (reduction_type != EXTRACT_LAST_REDUCTION
6494 && (!nested_cycle || double_reduc)
6495 && reduc_fn == IFN_LAST
6496 && !nunits_out.is_constant ())
6498 if (dump_enabled_p ())
6499 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6500 "missing target support for reduction on"
6501 " variable-length vectors.\n");
6502 return false;
6505 /* For SLP reductions, see if there is a neutral value we can use. */
6506 tree neutral_op = NULL_TREE;
6507 if (slp_node)
6508 neutral_op = neutral_op_for_slp_reduction
6509 (slp_node_instance->reduc_phis, vectype_out, orig_code,
6510 REDUC_GROUP_FIRST_ELEMENT (stmt_info) != NULL);
6512 if (double_reduc && reduction_type == FOLD_LEFT_REDUCTION)
6514 /* We can't support in-order reductions of code such as this:
6516 for (int i = 0; i < n1; ++i)
6517 for (int j = 0; j < n2; ++j)
6518 l += a[j];
6520 since GCC effectively transforms the loop when vectorizing:
6522 for (int i = 0; i < n1 / VF; ++i)
6523 for (int j = 0; j < n2; ++j)
6524 for (int k = 0; k < VF; ++k)
6525 l += a[j];
6527 which is a reassociation of the original operation. */
6528 if (dump_enabled_p ())
6529 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6530 "in-order double reduction not supported.\n");
6532 return false;
6535 if (reduction_type == FOLD_LEFT_REDUCTION
6536 && slp_node
6537 && !REDUC_GROUP_FIRST_ELEMENT (stmt_info))
6539 /* We cannot use in-order reductions in this case because there is
6540 an implicit reassociation of the operations involved. */
6541 if (dump_enabled_p ())
6542 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6543 "in-order unchained SLP reductions not supported.\n");
6544 return false;
6547 /* For double reductions, and for SLP reductions with a neutral value,
6548 we construct a variable-length initial vector by loading a vector
6549 full of the neutral value and then shift-and-inserting the start
6550 values into the low-numbered elements. */
6551 if ((double_reduc || neutral_op)
6552 && !nunits_out.is_constant ()
6553 && !direct_internal_fn_supported_p (IFN_VEC_SHL_INSERT,
6554 vectype_out, OPTIMIZE_FOR_SPEED))
6556 if (dump_enabled_p ())
6557 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6558 "reduction on variable-length vectors requires"
6559 " target support for a vector-shift-and-insert"
6560 " operation.\n");
6561 return false;
6564 /* Check extra constraints for variable-length unchained SLP reductions. */
6565 if (STMT_SLP_TYPE (stmt_info)
6566 && !REDUC_GROUP_FIRST_ELEMENT (stmt_info)
6567 && !nunits_out.is_constant ())
6569 /* We checked above that we could build the initial vector when
6570 there's a neutral element value. Check here for the case in
6571 which each SLP statement has its own initial value and in which
6572 that value needs to be repeated for every instance of the
6573 statement within the initial vector. */
6574 unsigned int group_size = SLP_INSTANCE_GROUP_SIZE (slp_node_instance);
6575 if (!neutral_op
6576 && !can_duplicate_and_interleave_p (loop_vinfo, group_size,
6577 TREE_TYPE (vectype_out)))
6579 if (dump_enabled_p ())
6580 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6581 "unsupported form of SLP reduction for"
6582 " variable-length vectors: cannot build"
6583 " initial vector.\n");
6584 return false;
6586 /* The epilogue code relies on the number of elements being a multiple
6587 of the group size. The duplicate-and-interleave approach to setting
6588 up the initial vector does too. */
6589 if (!multiple_p (nunits_out, group_size))
6591 if (dump_enabled_p ())
6592 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6593 "unsupported form of SLP reduction for"
6594 " variable-length vectors: the vector size"
6595 " is not a multiple of the number of results.\n");
6596 return false;
6600 if (reduction_type == COND_REDUCTION)
6602 widest_int ni;
6604 if (! max_loop_iterations (loop, &ni))
6606 if (dump_enabled_p ())
6607 dump_printf_loc (MSG_NOTE, vect_location,
6608 "loop count not known, cannot create cond "
6609 "reduction.\n");
6610 return false;
6612 /* Convert backedges to iterations. */
6613 ni += 1;
6615 /* The additional index will be the same type as the condition. Check
6616 that the loop can fit into this less one (because we'll use up the
6617 zero slot for when there are no matches). */
6618 tree max_index = TYPE_MAX_VALUE (cr_index_scalar_type);
6619 if (wi::geu_p (ni, wi::to_widest (max_index)))
6621 if (dump_enabled_p ())
6622 dump_printf_loc (MSG_NOTE, vect_location,
6623 "loop size is greater than data size.\n");
6624 return false;
6628 /* In case the vectorization factor (VF) is bigger than the number
6629 of elements that we can fit in a vectype (nunits), we have to generate
6630 more than one vector stmt - i.e., we need to "unroll" the
6631 vector stmt by a factor VF/nunits. For more details see documentation
6632 in vectorizable_operation. */
6634 /* If the reduction is used in an outer loop we need to generate
6635 VF intermediate results, like so (e.g. for ncopies=2):
6636 r0 = phi (init, r0)
6637 r1 = phi (init, r1)
6638 r0 = x0 + r0;
6639 r1 = x1 + r1;
6640 (i.e. we generate VF results in 2 registers).
6641 In this case we have a separate def-use cycle for each copy, and therefore
6642 for each copy we get the vector def for the reduction variable from the
6643 respective phi node created for this copy.
6645 Otherwise (the reduction is unused in the loop nest), we can combine
6646 together intermediate results, like so (e.g. for ncopies=2):
6647 r = phi (init, r)
6648 r = x0 + r;
6649 r = x1 + r;
6650 (i.e. we generate VF/2 results in a single register).
6651 In this case for each copy we get the vector def for the reduction variable
6652 from the vectorized reduction operation generated in the previous iteration.
6654 This only works when we see both the reduction PHI and its only consumer
6655 in vectorizable_reduction and there are no intermediate stmts
6656 participating. */
6657 if (ncopies > 1
6658 && (STMT_VINFO_RELEVANT (stmt_info) <= vect_used_only_live)
6659 && reduc_chain_length == 1)
6660 single_defuse_cycle = true;
6662 if (single_defuse_cycle || lane_reduc_code_p)
6664 gcc_assert (code != COND_EXPR);
6666 /* 4. Supportable by target? */
6667 bool ok = true;
6669 /* 4.1. check support for the operation in the loop */
6670 optab optab = optab_for_tree_code (code, vectype_in, optab_vector);
6671 if (!optab)
6673 if (dump_enabled_p ())
6674 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6675 "no optab.\n");
6676 ok = false;
6679 machine_mode vec_mode = TYPE_MODE (vectype_in);
6680 if (ok && optab_handler (optab, vec_mode) == CODE_FOR_nothing)
6682 if (dump_enabled_p ())
6683 dump_printf (MSG_NOTE, "op not supported by target.\n");
6684 if (maybe_ne (GET_MODE_SIZE (vec_mode), UNITS_PER_WORD)
6685 || !vect_worthwhile_without_simd_p (loop_vinfo, code))
6686 ok = false;
6687 else
6688 if (dump_enabled_p ())
6689 dump_printf (MSG_NOTE, "proceeding using word mode.\n");
6692 /* Worthwhile without SIMD support? */
6693 if (ok
6694 && !VECTOR_MODE_P (TYPE_MODE (vectype_in))
6695 && !vect_worthwhile_without_simd_p (loop_vinfo, code))
6697 if (dump_enabled_p ())
6698 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6699 "not worthwhile without SIMD support.\n");
6700 ok = false;
6703 /* lane-reducing operations have to go through vect_transform_reduction.
6704 For the other cases try without the single cycle optimization. */
6705 if (!ok)
6707 if (lane_reduc_code_p)
6708 return false;
6709 else
6710 single_defuse_cycle = false;
6713 STMT_VINFO_FORCE_SINGLE_CYCLE (reduc_info) = single_defuse_cycle;
6715 /* If the reduction stmt is one of the patterns that have lane
6716 reduction embedded we cannot handle the case of ! single_defuse_cycle. */
6717 if ((ncopies > 1 && ! single_defuse_cycle)
6718 && lane_reduc_code_p)
6720 if (dump_enabled_p ())
6721 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6722 "multi def-use cycle not possible for lane-reducing "
6723 "reduction operation\n");
6724 return false;
6727 if (slp_node)
6728 vec_num = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node);
6729 else
6730 vec_num = 1;
6732 vect_model_reduction_cost (loop_vinfo, stmt_info, reduc_fn,
6733 reduction_type, ncopies, cost_vec);
6734 if (dump_enabled_p ()
6735 && reduction_type == FOLD_LEFT_REDUCTION)
6736 dump_printf_loc (MSG_NOTE, vect_location,
6737 "using an in-order (fold-left) reduction.\n");
6738 STMT_VINFO_TYPE (orig_stmt_of_analysis) = cycle_phi_info_type;
6739 /* All but single defuse-cycle optimized, lane-reducing and fold-left
6740 reductions go through their own vectorizable_* routines. */
6741 if (!single_defuse_cycle
6742 && code != DOT_PROD_EXPR
6743 && code != WIDEN_SUM_EXPR
6744 && code != SAD_EXPR
6745 && reduction_type != FOLD_LEFT_REDUCTION)
6747 stmt_vec_info tem
6748 = vect_stmt_to_vectorize (STMT_VINFO_REDUC_DEF (phi_info));
6749 if (slp_node && REDUC_GROUP_FIRST_ELEMENT (tem))
6751 gcc_assert (!REDUC_GROUP_NEXT_ELEMENT (tem));
6752 tem = REDUC_GROUP_FIRST_ELEMENT (tem);
6754 STMT_VINFO_DEF_TYPE (vect_orig_stmt (tem)) = vect_internal_def;
6755 STMT_VINFO_DEF_TYPE (tem) = vect_internal_def;
6757 else if (loop_vinfo && LOOP_VINFO_CAN_FULLY_MASK_P (loop_vinfo))
6759 vec_loop_masks *masks = &LOOP_VINFO_MASKS (loop_vinfo);
6760 internal_fn cond_fn = get_conditional_internal_fn (code);
6762 if (reduction_type != FOLD_LEFT_REDUCTION
6763 && !use_mask_by_cond_expr_p (code, cond_fn, vectype_in)
6764 && (cond_fn == IFN_LAST
6765 || !direct_internal_fn_supported_p (cond_fn, vectype_in,
6766 OPTIMIZE_FOR_SPEED)))
6768 if (dump_enabled_p ())
6769 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6770 "can't use a fully-masked loop because no"
6771 " conditional operation is available.\n");
6772 LOOP_VINFO_CAN_FULLY_MASK_P (loop_vinfo) = false;
6774 else if (reduction_type == FOLD_LEFT_REDUCTION
6775 && reduc_fn == IFN_LAST
6776 && !expand_vec_cond_expr_p (vectype_in,
6777 truth_type_for (vectype_in),
6778 SSA_NAME))
6780 if (dump_enabled_p ())
6781 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6782 "can't use a fully-masked loop because no"
6783 " conditional operation is available.\n");
6784 LOOP_VINFO_CAN_FULLY_MASK_P (loop_vinfo) = false;
6786 else
6787 vect_record_loop_mask (loop_vinfo, masks, ncopies * vec_num,
6788 vectype_in, NULL);
6790 return true;
6793 /* Transform the definition stmt STMT_INFO of a reduction PHI backedge
6794 value. */
6796 bool
6797 vect_transform_reduction (loop_vec_info loop_vinfo,
6798 stmt_vec_info stmt_info, gimple_stmt_iterator *gsi,
6799 stmt_vec_info *vec_stmt, slp_tree slp_node)
6801 tree vectype_out = STMT_VINFO_VECTYPE (stmt_info);
6802 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
6803 int i;
6804 int ncopies;
6805 int j;
6806 int vec_num;
6808 stmt_vec_info reduc_info = info_for_reduction (loop_vinfo, stmt_info);
6809 gcc_assert (reduc_info->is_reduc_info);
6811 if (nested_in_vect_loop_p (loop, stmt_info))
6813 loop = loop->inner;
6814 gcc_assert (STMT_VINFO_DEF_TYPE (reduc_info) == vect_double_reduction_def);
6817 gassign *stmt = as_a <gassign *> (stmt_info->stmt);
6818 enum tree_code code = gimple_assign_rhs_code (stmt);
6819 int op_type = TREE_CODE_LENGTH (code);
6821 /* Flatten RHS. */
6822 tree ops[3];
6823 switch (get_gimple_rhs_class (code))
6825 case GIMPLE_TERNARY_RHS:
6826 ops[2] = gimple_assign_rhs3 (stmt);
6827 /* Fall thru. */
6828 case GIMPLE_BINARY_RHS:
6829 ops[0] = gimple_assign_rhs1 (stmt);
6830 ops[1] = gimple_assign_rhs2 (stmt);
6831 break;
6832 default:
6833 gcc_unreachable ();
6836 /* All uses but the last are expected to be defined in the loop.
6837 The last use is the reduction variable. In case of nested cycle this
6838 assumption is not true: we use reduc_index to record the index of the
6839 reduction variable. */
6840 stmt_vec_info phi_info = STMT_VINFO_REDUC_DEF (vect_orig_stmt (stmt_info));
6841 gphi *reduc_def_phi = as_a <gphi *> (phi_info->stmt);
6842 int reduc_index = STMT_VINFO_REDUC_IDX (stmt_info);
6843 tree vectype_in = STMT_VINFO_REDUC_VECTYPE_IN (reduc_info);
6845 if (slp_node)
6847 ncopies = 1;
6848 vec_num = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node);
6850 else
6852 ncopies = vect_get_num_copies (loop_vinfo, vectype_in);
6853 vec_num = 1;
6856 internal_fn cond_fn = get_conditional_internal_fn (code);
6857 vec_loop_masks *masks = &LOOP_VINFO_MASKS (loop_vinfo);
6858 bool mask_by_cond_expr = use_mask_by_cond_expr_p (code, cond_fn, vectype_in);
6860 /* Transform. */
6861 stmt_vec_info new_stmt_info = NULL;
6862 stmt_vec_info prev_stmt_info;
6863 tree new_temp = NULL_TREE;
6864 auto_vec<tree> vec_oprnds0;
6865 auto_vec<tree> vec_oprnds1;
6866 auto_vec<tree> vec_oprnds2;
6867 tree def0;
6869 if (dump_enabled_p ())
6870 dump_printf_loc (MSG_NOTE, vect_location, "transform reduction.\n");
6872 /* FORNOW: Multiple types are not supported for condition. */
6873 if (code == COND_EXPR)
6874 gcc_assert (ncopies == 1);
6876 bool masked_loop_p = LOOP_VINFO_FULLY_MASKED_P (loop_vinfo);
6878 vect_reduction_type reduction_type = STMT_VINFO_REDUC_TYPE (reduc_info);
6879 if (reduction_type == FOLD_LEFT_REDUCTION)
6881 internal_fn reduc_fn = STMT_VINFO_REDUC_FN (reduc_info);
6882 return vectorize_fold_left_reduction
6883 (loop_vinfo, stmt_info, gsi, vec_stmt, slp_node, reduc_def_phi, code,
6884 reduc_fn, ops, vectype_in, reduc_index, masks);
6887 bool single_defuse_cycle = STMT_VINFO_FORCE_SINGLE_CYCLE (reduc_info);
6888 gcc_assert (single_defuse_cycle
6889 || code == DOT_PROD_EXPR
6890 || code == WIDEN_SUM_EXPR
6891 || code == SAD_EXPR);
6893 /* Create the destination vector */
6894 tree scalar_dest = gimple_assign_lhs (stmt);
6895 tree vec_dest = vect_create_destination_var (scalar_dest, vectype_out);
6897 prev_stmt_info = NULL;
6898 if (!slp_node)
6900 vec_oprnds0.create (1);
6901 vec_oprnds1.create (1);
6902 if (op_type == ternary_op)
6903 vec_oprnds2.create (1);
6906 for (j = 0; j < ncopies; j++)
6908 /* Handle uses. */
6909 if (j == 0)
6911 if (slp_node)
6913 /* Get vec defs for all the operands except the reduction index,
6914 ensuring the ordering of the ops in the vector is kept. */
6915 auto_vec<vec<tree>, 3> vec_defs;
6916 vect_get_slp_defs (loop_vinfo, slp_node, &vec_defs);
6917 vec_oprnds0.safe_splice (vec_defs[0]);
6918 vec_defs[0].release ();
6919 vec_oprnds1.safe_splice (vec_defs[1]);
6920 vec_defs[1].release ();
6921 if (op_type == ternary_op)
6923 vec_oprnds2.safe_splice (vec_defs[2]);
6924 vec_defs[2].release ();
6927 else
6929 vec_oprnds0.quick_push
6930 (vect_get_vec_def_for_operand (loop_vinfo, ops[0], stmt_info));
6931 vec_oprnds1.quick_push
6932 (vect_get_vec_def_for_operand (loop_vinfo, ops[1], stmt_info));
6933 if (op_type == ternary_op)
6934 vec_oprnds2.quick_push
6935 (vect_get_vec_def_for_operand (loop_vinfo, ops[2], stmt_info));
6938 else
6940 if (!slp_node)
6942 gcc_assert (reduc_index != -1 || ! single_defuse_cycle);
6944 if (single_defuse_cycle && reduc_index == 0)
6945 vec_oprnds0[0] = gimple_get_lhs (new_stmt_info->stmt);
6946 else
6947 vec_oprnds0[0]
6948 = vect_get_vec_def_for_stmt_copy (loop_vinfo,
6949 vec_oprnds0[0]);
6950 if (single_defuse_cycle && reduc_index == 1)
6951 vec_oprnds1[0] = gimple_get_lhs (new_stmt_info->stmt);
6952 else
6953 vec_oprnds1[0]
6954 = vect_get_vec_def_for_stmt_copy (loop_vinfo,
6955 vec_oprnds1[0]);
6956 if (op_type == ternary_op)
6958 if (single_defuse_cycle && reduc_index == 2)
6959 vec_oprnds2[0] = gimple_get_lhs (new_stmt_info->stmt);
6960 else
6961 vec_oprnds2[0]
6962 = vect_get_vec_def_for_stmt_copy (loop_vinfo,
6963 vec_oprnds2[0]);
6968 FOR_EACH_VEC_ELT (vec_oprnds0, i, def0)
6970 tree vop[3] = { def0, vec_oprnds1[i], NULL_TREE };
6971 if (masked_loop_p && !mask_by_cond_expr)
6973 /* Make sure that the reduction accumulator is vop[0]. */
6974 if (reduc_index == 1)
6976 gcc_assert (commutative_tree_code (code));
6977 std::swap (vop[0], vop[1]);
6979 tree mask = vect_get_loop_mask (gsi, masks, vec_num * ncopies,
6980 vectype_in, i * ncopies + j);
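/* Sketch of the conditional reduction step built just below, assuming the
   usual IFN_COND_* operand order (mask, op0, op1, else-value): per lane,
     new_acc[i] = mask[i] ? vop[0][i] OP vop[1][i] : vop[0][i];
   so lanes that are masked off simply keep the accumulator value.  */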
6981 gcall *call = gimple_build_call_internal (cond_fn, 4, mask,
6982 vop[0], vop[1],
6983 vop[0]);
6984 new_temp = make_ssa_name (vec_dest, call);
6985 gimple_call_set_lhs (call, new_temp);
6986 gimple_call_set_nothrow (call, true);
6987 new_stmt_info
6988 = vect_finish_stmt_generation (loop_vinfo,
6989 stmt_info, call, gsi);
6991 else
6993 if (op_type == ternary_op)
6994 vop[2] = vec_oprnds2[i];
6996 if (masked_loop_p && mask_by_cond_expr)
6998 tree mask = vect_get_loop_mask (gsi, masks,
6999 vec_num * ncopies,
7000 vectype_in, i * ncopies + j);
7001 build_vect_cond_expr (code, vop, mask, gsi);
7004 gassign *new_stmt = gimple_build_assign (vec_dest, code,
7005 vop[0], vop[1], vop[2]);
7006 new_temp = make_ssa_name (vec_dest, new_stmt);
7007 gimple_assign_set_lhs (new_stmt, new_temp);
7008 new_stmt_info
7009 = vect_finish_stmt_generation (loop_vinfo,
7010 stmt_info, new_stmt, gsi);
7013 if (slp_node)
7014 SLP_TREE_VEC_STMTS (slp_node).quick_push (new_stmt_info);
7017 if (slp_node || single_defuse_cycle)
7018 continue;
7020 if (j == 0)
7021 STMT_VINFO_VEC_STMT (stmt_info) = *vec_stmt = new_stmt_info;
7022 else
7023 STMT_VINFO_RELATED_STMT (prev_stmt_info) = new_stmt_info;
7025 prev_stmt_info = new_stmt_info;
7028 if (single_defuse_cycle && !slp_node)
7029 STMT_VINFO_VEC_STMT (stmt_info) = *vec_stmt = new_stmt_info;
7031 return true;
7034 /* Transform phase of a cycle PHI. */
7036 bool
7037 vect_transform_cycle_phi (loop_vec_info loop_vinfo,
7038 stmt_vec_info stmt_info, stmt_vec_info *vec_stmt,
7039 slp_tree slp_node, slp_instance slp_node_instance)
7041 tree vectype_out = STMT_VINFO_VECTYPE (stmt_info);
7042 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
7043 int i;
7044 int ncopies;
7045 stmt_vec_info prev_phi_info;
7046 int j;
7047 bool nested_cycle = false;
7048 int vec_num;
7050 if (nested_in_vect_loop_p (loop, stmt_info))
7052 loop = loop->inner;
7053 nested_cycle = true;
7056 stmt_vec_info reduc_stmt_info = STMT_VINFO_REDUC_DEF (stmt_info);
7057 reduc_stmt_info = vect_stmt_to_vectorize (reduc_stmt_info);
7058 stmt_vec_info reduc_info = info_for_reduction (loop_vinfo, stmt_info);
7059 gcc_assert (reduc_info->is_reduc_info);
7061 if (STMT_VINFO_REDUC_TYPE (reduc_info) == EXTRACT_LAST_REDUCTION
7062 || STMT_VINFO_REDUC_TYPE (reduc_info) == FOLD_LEFT_REDUCTION)
7063 /* Leave the scalar phi in place. */
7064 return true;
7066 tree vectype_in = STMT_VINFO_REDUC_VECTYPE_IN (reduc_info);
7067 /* For a nested cycle we do not fill the above. */
7068 if (!vectype_in)
7069 vectype_in = STMT_VINFO_VECTYPE (stmt_info);
7070 gcc_assert (vectype_in);
7072 if (slp_node)
7074 /* The size vect_schedule_slp_instance computes is off for us. */
7075 vec_num = vect_get_num_vectors
7076 (LOOP_VINFO_VECT_FACTOR (loop_vinfo)
7077 * SLP_TREE_SCALAR_STMTS (slp_node).length (), vectype_in);
7078 ncopies = 1;
7080 else
7082 vec_num = 1;
7083 ncopies = vect_get_num_copies (loop_vinfo, vectype_in);
7086 /* Check whether we should use a single PHI node and accumulate
7087 vectors to one before the backedge. */
7088 if (STMT_VINFO_FORCE_SINGLE_CYCLE (reduc_info))
7089 ncopies = 1;
7091 /* Create the destination vector */
7092 gphi *phi = as_a <gphi *> (stmt_info->stmt);
7093 tree vec_dest = vect_create_destination_var (gimple_phi_result (phi),
7094 vectype_out);
7096 /* Get the loop-entry arguments. */
7097 tree vec_initial_def;
7098 auto_vec<tree> vec_initial_defs;
7099 if (slp_node)
7101 vec_initial_defs.reserve (vec_num);
7102 gcc_assert (slp_node == slp_node_instance->reduc_phis);
7103 stmt_vec_info first = REDUC_GROUP_FIRST_ELEMENT (reduc_stmt_info);
7104 tree neutral_op
7105 = neutral_op_for_slp_reduction (slp_node, vectype_out,
7106 STMT_VINFO_REDUC_CODE (reduc_info),
7107 first != NULL);
7108 get_initial_defs_for_reduction (loop_vinfo, slp_node_instance->reduc_phis,
7109 &vec_initial_defs, vec_num,
7110 first != NULL, neutral_op);
7112 else
7114 /* Get at the scalar def before the loop, that defines the initial
7115 value of the reduction variable. */
7116 tree initial_def = PHI_ARG_DEF_FROM_EDGE (phi,
7117 loop_preheader_edge (loop));
7118 /* Optimize: if initial_def is for REDUC_MAX smaller than the base
7119 and we can't use zero for induc_val, use initial_def. Similarly
7120 for REDUC_MIN and initial_def larger than the base. */
7121 if (STMT_VINFO_REDUC_TYPE (reduc_info) == INTEGER_INDUC_COND_REDUCTION)
7123 tree induc_val = STMT_VINFO_VEC_INDUC_COND_INITIAL_VAL (reduc_info);
7124 if (TREE_CODE (initial_def) == INTEGER_CST
7125 && !integer_zerop (induc_val)
7126 && ((STMT_VINFO_REDUC_CODE (reduc_info) == MAX_EXPR
7127 && tree_int_cst_lt (initial_def, induc_val))
7128 || (STMT_VINFO_REDUC_CODE (reduc_info) == MIN_EXPR
7129 && tree_int_cst_lt (induc_val, initial_def))))
7131 induc_val = initial_def;
7132 /* Communicate that we used the initial_def to epilogue
7133 generation. */
7134 STMT_VINFO_VEC_INDUC_COND_INITIAL_VAL (reduc_info) = NULL_TREE;
7136 vec_initial_def = build_vector_from_val (vectype_out, induc_val);
7138 else if (nested_cycle)
7140 /* Do not use an adjustment def as that case is not supported
7141 correctly if ncopies is not one. */
7142 vec_initial_def = vect_get_vec_def_for_operand (loop_vinfo,
7143 initial_def,
7144 reduc_stmt_info);
7146 else
7148 tree adjustment_def = NULL_TREE;
7149 tree *adjustment_defp = &adjustment_def;
7150 enum tree_code code = STMT_VINFO_REDUC_CODE (reduc_info);
7151 if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_double_reduction_def)
7152 adjustment_defp = NULL;
7153 vec_initial_def
7154 = get_initial_def_for_reduction (loop_vinfo, reduc_stmt_info, code,
7155 initial_def, adjustment_defp);
7156 STMT_VINFO_REDUC_EPILOGUE_ADJUSTMENT (reduc_info) = adjustment_def;
7158 vec_initial_defs.create (1);
7159 vec_initial_defs.quick_push (vec_initial_def);
7162 /* Generate the reduction PHIs upfront. */
7163 prev_phi_info = NULL;
7164 for (i = 0; i < vec_num; i++)
7166 tree vec_init_def = vec_initial_defs[i];
7167 for (j = 0; j < ncopies; j++)
7169 /* Create the reduction-phi that defines the reduction
7170 operand. */
7171 gphi *new_phi = create_phi_node (vec_dest, loop->header);
7172 stmt_vec_info new_phi_info = loop_vinfo->add_stmt (new_phi);
7174 /* Set the loop-entry arg of the reduction-phi. */
7175 if (j != 0 && nested_cycle)
7176 vec_init_def = vect_get_vec_def_for_stmt_copy (loop_vinfo,
7177 vec_init_def);
7178 add_phi_arg (new_phi, vec_init_def, loop_preheader_edge (loop),
7179 UNKNOWN_LOCATION);
7181 /* The loop-latch arg is set in epilogue processing. */
7183 if (slp_node)
7184 SLP_TREE_VEC_STMTS (slp_node).quick_push (new_phi_info);
7185 else
7187 if (j == 0)
7188 STMT_VINFO_VEC_STMT (stmt_info) = *vec_stmt = new_phi_info;
7189 else
7190 STMT_VINFO_RELATED_STMT (prev_phi_info) = new_phi_info;
7191 prev_phi_info = new_phi_info;
7196 return true;
7199 /* Vectorizes LC PHIs. */
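/* Illustrative shape (assumed, not taken from a testcase): a loop-closed
   PHI is a single-argument PHI in the block following the loop exit, e.g.
     x_4 = PHI <x_3(E)>
   where E is the single edge leaving the loop; it is replaced below by an
   equivalent single-argument PHI over the vectorized definition.  */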
7201 bool
7202 vectorizable_lc_phi (loop_vec_info loop_vinfo,
7203 stmt_vec_info stmt_info, stmt_vec_info *vec_stmt,
7204 slp_tree slp_node)
7206 if (!loop_vinfo
7207 || !is_a <gphi *> (stmt_info->stmt)
7208 || gimple_phi_num_args (stmt_info->stmt) != 1)
7209 return false;
7211 if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_internal_def
7212 && STMT_VINFO_DEF_TYPE (stmt_info) != vect_double_reduction_def)
7213 return false;
7215 if (!vec_stmt) /* transformation not required. */
7217 STMT_VINFO_TYPE (stmt_info) = lc_phi_info_type;
7218 return true;
7221 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
7222 tree scalar_dest = gimple_phi_result (stmt_info->stmt);
7223 basic_block bb = gimple_bb (stmt_info->stmt);
7224 edge e = single_pred_edge (bb);
7225 tree vec_dest = vect_create_destination_var (scalar_dest, vectype);
7226 vec<tree> vec_oprnds = vNULL;
7227 vect_get_vec_defs (loop_vinfo,
7228 gimple_phi_arg_def (stmt_info->stmt, 0), NULL_TREE,
7229 stmt_info, &vec_oprnds, NULL, slp_node);
7230 if (slp_node)
7232 unsigned vec_num = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node);
7233 gcc_assert (vec_oprnds.length () == vec_num);
7234 for (unsigned i = 0; i < vec_num; i++)
7236 /* Create the vectorized LC PHI node. */
7237 gphi *new_phi = create_phi_node (vec_dest, bb);
7238 add_phi_arg (new_phi, vec_oprnds[i], e, UNKNOWN_LOCATION);
7239 stmt_vec_info new_phi_info = loop_vinfo->add_stmt (new_phi);
7240 SLP_TREE_VEC_STMTS (slp_node).quick_push (new_phi_info);
7243 else
7245 unsigned ncopies = vect_get_num_copies (loop_vinfo, vectype);
7246 stmt_vec_info prev_phi_info = NULL;
7247 for (unsigned i = 0; i < ncopies; i++)
7249 if (i != 0)
7250 vect_get_vec_defs_for_stmt_copy (loop_vinfo, &vec_oprnds, NULL);
7251 /* Create the vectorized LC PHI node. */
7252 gphi *new_phi = create_phi_node (vec_dest, bb);
7253 add_phi_arg (new_phi, vec_oprnds[0], e, UNKNOWN_LOCATION);
7254 stmt_vec_info new_phi_info = loop_vinfo->add_stmt (new_phi);
7255 if (i == 0)
7256 STMT_VINFO_VEC_STMT (stmt_info) = *vec_stmt = new_phi_info;
7257 else
7258 STMT_VINFO_RELATED_STMT (prev_phi_info) = new_phi_info;
7259 prev_phi_info = new_phi_info;
7262 vec_oprnds.release ();
7264 return true;
7268 /* Function vect_min_worthwhile_factor.
7270 For a loop where we could vectorize the operation indicated by CODE,
7271 return the minimum vectorization factor that makes it worthwhile
7272 to use generic vectors. */
7273 static unsigned int
7274 vect_min_worthwhile_factor (enum tree_code code)
7276 switch (code)
7278 case PLUS_EXPR:
7279 case MINUS_EXPR:
7280 case NEGATE_EXPR:
7281 return 4;
7283 case BIT_AND_EXPR:
7284 case BIT_IOR_EXPR:
7285 case BIT_XOR_EXPR:
7286 case BIT_NOT_EXPR:
7287 return 2;
7289 default:
7290 return INT_MAX;
7294 /* Return true if VINFO indicates we are doing loop vectorization and if
7295 it is worth decomposing CODE operations into scalar operations for
7296 that loop's vectorization factor. */
7298 bool
7299 vect_worthwhile_without_simd_p (vec_info *vinfo, tree_code code)
7301 loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo);
7302 unsigned HOST_WIDE_INT value;
7303 return (loop_vinfo
7304 && LOOP_VINFO_VECT_FACTOR (loop_vinfo).is_constant (&value)
7305 && value >= vect_min_worthwhile_factor (code));
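/* Worked example (illustrative only): with a compile-time vectorization
   factor of 4, PLUS_EXPR is considered worthwhile (4 >= 4) whereas with a
   factor of 2 it is not (2 < 4); bitwise codes such as BIT_AND_EXPR are
   already worthwhile at factor 2.  */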
7308 /* Function vectorizable_induction
7310 Check if STMT_INFO performs an induction computation that can be vectorized.
7311 If VEC_STMT is also passed, vectorize the induction PHI: create a vectorized
7312 phi to replace it, put it in VEC_STMT, and add it to the same basic block.
7313 Return true if STMT_INFO is vectorizable in this way. */
7315 bool
7316 vectorizable_induction (loop_vec_info loop_vinfo,
7317 stmt_vec_info stmt_info,
7318 gimple_stmt_iterator *gsi ATTRIBUTE_UNUSED,
7319 stmt_vec_info *vec_stmt, slp_tree slp_node,
7320 stmt_vector_for_cost *cost_vec)
7322 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
7323 unsigned ncopies;
7324 bool nested_in_vect_loop = false;
7325 class loop *iv_loop;
7326 tree vec_def;
7327 edge pe = loop_preheader_edge (loop);
7328 basic_block new_bb;
7329 tree new_vec, vec_init, vec_step, t;
7330 tree new_name;
7331 gimple *new_stmt;
7332 gphi *induction_phi;
7333 tree induc_def, vec_dest;
7334 tree init_expr, step_expr;
7335 poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
7336 unsigned i;
7337 tree expr;
7338 gimple_seq stmts;
7339 imm_use_iterator imm_iter;
7340 use_operand_p use_p;
7341 gimple *exit_phi;
7342 edge latch_e;
7343 tree loop_arg;
7344 gimple_stmt_iterator si;
7346 gphi *phi = dyn_cast <gphi *> (stmt_info->stmt);
7347 if (!phi)
7348 return false;
7350 if (!STMT_VINFO_RELEVANT_P (stmt_info))
7351 return false;
7353 /* Make sure it was recognized as induction computation. */
7354 if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_induction_def)
7355 return false;
7357 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
7358 poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (vectype);
7360 if (slp_node)
7361 ncopies = 1;
7362 else
7363 ncopies = vect_get_num_copies (loop_vinfo, vectype);
7364 gcc_assert (ncopies >= 1);
7366 /* FORNOW. These restrictions should be relaxed. */
7367 if (nested_in_vect_loop_p (loop, stmt_info))
7369 imm_use_iterator imm_iter;
7370 use_operand_p use_p;
7371 gimple *exit_phi;
7372 edge latch_e;
7373 tree loop_arg;
7375 if (ncopies > 1)
7377 if (dump_enabled_p ())
7378 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7379 "multiple types in nested loop.\n");
7380 return false;
7383 /* FORNOW: outer loop induction with SLP not supported. */
7384 if (STMT_SLP_TYPE (stmt_info))
7385 return false;
7387 exit_phi = NULL;
7388 latch_e = loop_latch_edge (loop->inner);
7389 loop_arg = PHI_ARG_DEF_FROM_EDGE (phi, latch_e);
7390 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, loop_arg)
7392 gimple *use_stmt = USE_STMT (use_p);
7393 if (is_gimple_debug (use_stmt))
7394 continue;
7396 if (!flow_bb_inside_loop_p (loop->inner, gimple_bb (use_stmt)))
7398 exit_phi = use_stmt;
7399 break;
7402 if (exit_phi)
7404 stmt_vec_info exit_phi_vinfo = loop_vinfo->lookup_stmt (exit_phi);
7405 if (!(STMT_VINFO_RELEVANT_P (exit_phi_vinfo)
7406 && !STMT_VINFO_LIVE_P (exit_phi_vinfo)))
7408 if (dump_enabled_p ())
7409 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7410 "inner-loop induction only used outside "
7411 "of the outer vectorized loop.\n");
7412 return false;
7416 nested_in_vect_loop = true;
7417 iv_loop = loop->inner;
7419 else
7420 iv_loop = loop;
7421 gcc_assert (iv_loop == (gimple_bb (phi))->loop_father);
7423 if (slp_node && !nunits.is_constant ())
7425 /* The current SLP code creates the initial value element-by-element. */
7426 if (dump_enabled_p ())
7427 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7428 "SLP induction not supported for variable-length"
7429 " vectors.\n");
7430 return false;
7433 if (!vec_stmt) /* transformation not required. */
7435 STMT_VINFO_TYPE (stmt_info) = induc_vec_info_type;
7436 DUMP_VECT_SCOPE ("vectorizable_induction");
7437 vect_model_induction_cost (stmt_info, ncopies, cost_vec);
7438 return true;
7441 /* Transform. */
7443 /* Compute a vector variable, initialized with the first VF values of
7444 the induction variable. E.g., for an iv with IV_PHI='X' and
7445 evolution S, for a vector of 4 units, we want to compute:
7446 [X, X + S, X + 2*S, X + 3*S]. */
7448 if (dump_enabled_p ())
7449 dump_printf_loc (MSG_NOTE, vect_location, "transform induction phi.\n");
7451 latch_e = loop_latch_edge (iv_loop);
7452 loop_arg = PHI_ARG_DEF_FROM_EDGE (phi, latch_e);
7454 step_expr = STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_info);
7455 gcc_assert (step_expr != NULL_TREE);
7456 tree step_vectype = get_same_sized_vectype (TREE_TYPE (step_expr), vectype);
7458 pe = loop_preheader_edge (iv_loop);
7459 init_expr = PHI_ARG_DEF_FROM_EDGE (phi,
7460 loop_preheader_edge (iv_loop));
7462 stmts = NULL;
7463 if (!nested_in_vect_loop)
7465 /* Convert the initial value to the IV update type. */
7466 tree new_type = TREE_TYPE (step_expr);
7467 init_expr = gimple_convert (&stmts, new_type, init_expr);
7469 /* If we are using the loop mask to "peel" for alignment then we need
7470 to adjust the start value here. */
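/* For instance (illustrative), if the first 3 iterations are "peeled" via
   the loop mask and the IV has start value X and step S, the vector IV must
   be built from X - 3*S so that its first active lane (lane 3) still
   evaluates to X.  */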
7471 tree skip_niters = LOOP_VINFO_MASK_SKIP_NITERS (loop_vinfo);
7472 if (skip_niters != NULL_TREE)
7474 if (FLOAT_TYPE_P (vectype))
7475 skip_niters = gimple_build (&stmts, FLOAT_EXPR, new_type,
7476 skip_niters);
7477 else
7478 skip_niters = gimple_convert (&stmts, new_type, skip_niters);
7479 tree skip_step = gimple_build (&stmts, MULT_EXPR, new_type,
7480 skip_niters, step_expr);
7481 init_expr = gimple_build (&stmts, MINUS_EXPR, new_type,
7482 init_expr, skip_step);
7486 if (stmts)
7488 new_bb = gsi_insert_seq_on_edge_immediate (pe, stmts);
7489 gcc_assert (!new_bb);
7492 /* Find the first insertion point in the BB. */
7493 basic_block bb = gimple_bb (phi);
7494 si = gsi_after_labels (bb);
7496 /* For SLP induction we have to generate several IVs as for example
7497 with group size 3 we need [i, i, i, i + S] [i + S, i + S, i + 2*S, i + 2*S]
7498 [i + 2*S, i + 3*S, i + 3*S, i + 3*S]. The step is the same uniform
7499 [VF*S, VF*S, VF*S, VF*S] for all. */
7500 if (slp_node)
7502 /* Enforced above. */
7503 unsigned int const_nunits = nunits.to_constant ();
7505 /* Generate [VF*S, VF*S, ... ]. */
7506 if (SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr)))
7508 expr = build_int_cst (integer_type_node, vf);
7509 expr = fold_convert (TREE_TYPE (step_expr), expr);
7511 else
7512 expr = build_int_cst (TREE_TYPE (step_expr), vf);
7513 new_name = fold_build2 (MULT_EXPR, TREE_TYPE (step_expr),
7514 expr, step_expr);
7515 if (! CONSTANT_CLASS_P (new_name))
7516 new_name = vect_init_vector (loop_vinfo, stmt_info, new_name,
7517 TREE_TYPE (step_expr), NULL);
7518 new_vec = build_vector_from_val (step_vectype, new_name);
7519 vec_step = vect_init_vector (loop_vinfo, stmt_info,
7520 new_vec, step_vectype, NULL);
7522 /* Now generate the IVs. */
7523 unsigned group_size = SLP_TREE_SCALAR_STMTS (slp_node).length ();
7524 unsigned nvects = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node);
7525 unsigned elts = const_nunits * nvects;
7526 unsigned nivs = least_common_multiple (group_size,
7527 const_nunits) / const_nunits;
7528 gcc_assert (elts % group_size == 0);
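/* Continuing the group-size-3 example above with const_nunits == 4 and
   nvects == 6 (numbers assumed for illustration): elts = 24,
   nivs = lcm (3, 4) / 4 = 3, so three distinct IV vectors are built and
   24 % 3 == 0 as asserted.  */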
7529 tree elt = init_expr;
7530 unsigned ivn;
7531 for (ivn = 0; ivn < nivs; ++ivn)
7533 tree_vector_builder elts (step_vectype, const_nunits, 1);
7534 stmts = NULL;
7535 for (unsigned eltn = 0; eltn < const_nunits; ++eltn)
7537 if (ivn*const_nunits + eltn >= group_size
7538 && (ivn * const_nunits + eltn) % group_size == 0)
7539 elt = gimple_build (&stmts, PLUS_EXPR, TREE_TYPE (elt),
7540 elt, step_expr);
7541 elts.quick_push (elt);
7543 vec_init = gimple_build_vector (&stmts, &elts);
7544 vec_init = gimple_convert (&stmts, vectype, vec_init);
7545 if (stmts)
7547 new_bb = gsi_insert_seq_on_edge_immediate (pe, stmts);
7548 gcc_assert (!new_bb);
7551 /* Create the induction-phi that defines the induction-operand. */
7552 vec_dest = vect_get_new_vect_var (vectype, vect_simple_var, "vec_iv_");
7553 induction_phi = create_phi_node (vec_dest, iv_loop->header);
7554 stmt_vec_info induction_phi_info
7555 = loop_vinfo->add_stmt (induction_phi);
7556 induc_def = PHI_RESULT (induction_phi);
7558 /* Create the iv update inside the loop */
7559 gimple_seq stmts = NULL;
7560 vec_def = gimple_convert (&stmts, step_vectype, induc_def);
7561 vec_def = gimple_build (&stmts,
7562 PLUS_EXPR, step_vectype, vec_def, vec_step);
7563 vec_def = gimple_convert (&stmts, vectype, vec_def);
7564 loop_vinfo->add_stmt (SSA_NAME_DEF_STMT (vec_def));
7565 gsi_insert_seq_before (&si, stmts, GSI_SAME_STMT);
7567 /* Set the arguments of the phi node: */
7568 add_phi_arg (induction_phi, vec_init, pe, UNKNOWN_LOCATION);
7569 add_phi_arg (induction_phi, vec_def, loop_latch_edge (iv_loop),
7570 UNKNOWN_LOCATION);
7572 SLP_TREE_VEC_STMTS (slp_node).quick_push (induction_phi_info);
7575 /* Re-use IVs when we can. */
7576 if (ivn < nvects)
7578 unsigned vfp
7579 = least_common_multiple (group_size, const_nunits) / group_size;
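/* In the same illustrative example (group_size == 3, const_nunits == 4):
   vfp = lcm (3, 4) / 3 = 4, i.e. each re-used IV is the IV created nivs
   positions earlier advanced by 4*S.  */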
7580 /* Generate [VF'*S, VF'*S, ... ]. */
7581 if (SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr)))
7583 expr = build_int_cst (integer_type_node, vfp);
7584 expr = fold_convert (TREE_TYPE (step_expr), expr);
7586 else
7587 expr = build_int_cst (TREE_TYPE (step_expr), vfp);
7588 new_name = fold_build2 (MULT_EXPR, TREE_TYPE (step_expr),
7589 expr, step_expr);
7590 if (! CONSTANT_CLASS_P (new_name))
7591 new_name = vect_init_vector (loop_vinfo, stmt_info, new_name,
7592 TREE_TYPE (step_expr), NULL);
7593 new_vec = build_vector_from_val (step_vectype, new_name);
7594 vec_step = vect_init_vector (loop_vinfo, stmt_info, new_vec,
7595 step_vectype, NULL);
7596 for (; ivn < nvects; ++ivn)
7598 gimple *iv = SLP_TREE_VEC_STMTS (slp_node)[ivn - nivs]->stmt;
7599 tree def;
7600 if (gimple_code (iv) == GIMPLE_PHI)
7601 def = gimple_phi_result (iv);
7602 else
7603 def = gimple_assign_lhs (iv);
7604 gimple_seq stmts = NULL;
7605 def = gimple_convert (&stmts, step_vectype, def);
7606 def = gimple_build (&stmts,
7607 PLUS_EXPR, step_vectype, def, vec_step);
7608 def = gimple_convert (&stmts, vectype, def);
7609 if (gimple_code (iv) == GIMPLE_PHI)
7610 gsi_insert_seq_before (&si, stmts, GSI_SAME_STMT);
7611 else
7613 gimple_stmt_iterator tgsi = gsi_for_stmt (iv);
7614 gsi_insert_seq_after (&tgsi, stmts, GSI_CONTINUE_LINKING);
7616 SLP_TREE_VEC_STMTS (slp_node).quick_push
7617 (loop_vinfo->add_stmt (SSA_NAME_DEF_STMT (def)));
7621 return true;
7624 /* Create the vector that holds the initial_value of the induction. */
7625 if (nested_in_vect_loop)
7627 /* iv_loop is nested in the loop to be vectorized. init_expr had already
7628 been created during vectorization of previous stmts. We obtain it
7629 from the STMT_VINFO_VEC_STMT of the defining stmt. */
7630 vec_init = vect_get_vec_def_for_operand (loop_vinfo,
7631 init_expr, stmt_info);
7632 /* If the initial value is not of proper type, convert it. */
7633 if (!useless_type_conversion_p (vectype, TREE_TYPE (vec_init)))
7635 new_stmt
7636 = gimple_build_assign (vect_get_new_ssa_name (vectype,
7637 vect_simple_var,
7638 "vec_iv_"),
7639 VIEW_CONVERT_EXPR,
7640 build1 (VIEW_CONVERT_EXPR, vectype,
7641 vec_init));
7642 vec_init = gimple_assign_lhs (new_stmt);
7643 new_bb = gsi_insert_on_edge_immediate (loop_preheader_edge (iv_loop),
7644 new_stmt);
7645 gcc_assert (!new_bb);
7646 loop_vinfo->add_stmt (new_stmt);
7649 else
7651 /* iv_loop is the loop to be vectorized. Create:
7652 vec_init = [X, X+S, X+2*S, X+3*S] (S = step_expr, X = init_expr) */
7653 stmts = NULL;
7654 new_name = gimple_convert (&stmts, TREE_TYPE (step_expr), init_expr);
7656 unsigned HOST_WIDE_INT const_nunits;
7657 if (nunits.is_constant (&const_nunits))
7659 tree_vector_builder elts (step_vectype, const_nunits, 1);
7660 elts.quick_push (new_name);
7661 for (i = 1; i < const_nunits; i++)
7663 /* Create: new_name_i = new_name + step_expr */
7664 new_name = gimple_build (&stmts, PLUS_EXPR, TREE_TYPE (new_name),
7665 new_name, step_expr);
7666 elts.quick_push (new_name);
7668 /* Create a vector from [new_name_0, new_name_1, ...,
7669 new_name_nunits-1] */
7670 vec_init = gimple_build_vector (&stmts, &elts);
7672 else if (INTEGRAL_TYPE_P (TREE_TYPE (step_expr)))
7673 /* Build the initial value directly from a VEC_SERIES_EXPR. */
7674 vec_init = gimple_build (&stmts, VEC_SERIES_EXPR, step_vectype,
7675 new_name, step_expr);
7676 else
7678 /* Build:
7679 [base, base, base, ...]
7680 + (vectype) [0, 1, 2, ...] * [step, step, step, ...]. */
7681 gcc_assert (SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr)));
7682 gcc_assert (flag_associative_math);
7683 tree index = build_index_vector (step_vectype, 0, 1);
7684 tree base_vec = gimple_build_vector_from_val (&stmts, step_vectype,
7685 new_name);
7686 tree step_vec = gimple_build_vector_from_val (&stmts, step_vectype,
7687 step_expr);
7688 vec_init = gimple_build (&stmts, FLOAT_EXPR, step_vectype, index);
7689 vec_init = gimple_build (&stmts, MULT_EXPR, step_vectype,
7690 vec_init, step_vec);
7691 vec_init = gimple_build (&stmts, PLUS_EXPR, step_vectype,
7692 vec_init, base_vec);
7694 vec_init = gimple_convert (&stmts, vectype, vec_init);
7696 if (stmts)
7698 new_bb = gsi_insert_seq_on_edge_immediate (pe, stmts);
7699 gcc_assert (!new_bb);
7704 /* Create the vector that holds the step of the induction. */
7705 if (nested_in_vect_loop)
7706 /* iv_loop is nested in the loop to be vectorized. Generate:
7707 vec_step = [S, S, S, S] */
7708 new_name = step_expr;
7709 else
7711 /* iv_loop is the loop to be vectorized. Generate:
7712 vec_step = [VF*S, VF*S, VF*S, VF*S] */
7713 gimple_seq seq = NULL;
7714 if (SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr)))
7716 expr = build_int_cst (integer_type_node, vf);
7717 expr = gimple_build (&seq, FLOAT_EXPR, TREE_TYPE (step_expr), expr);
7719 else
7720 expr = build_int_cst (TREE_TYPE (step_expr), vf);
7721 new_name = gimple_build (&seq, MULT_EXPR, TREE_TYPE (step_expr),
7722 expr, step_expr);
7723 if (seq)
7725 new_bb = gsi_insert_seq_on_edge_immediate (pe, seq);
7726 gcc_assert (!new_bb);
7730 t = unshare_expr (new_name);
7731 gcc_assert (CONSTANT_CLASS_P (new_name)
7732 || TREE_CODE (new_name) == SSA_NAME);
7733 new_vec = build_vector_from_val (step_vectype, t);
7734 vec_step = vect_init_vector (loop_vinfo, stmt_info,
7735 new_vec, step_vectype, NULL);
7738 /* Create the following def-use cycle:
7739 loop prolog:
7740 vec_init = ...
7741 vec_step = ...
7742 loop:
7743 vec_iv = PHI <vec_init, vec_loop>
7745 STMT
7747 vec_loop = vec_iv + vec_step; */
7749 /* Create the induction-phi that defines the induction-operand. */
7750 vec_dest = vect_get_new_vect_var (vectype, vect_simple_var, "vec_iv_");
7751 induction_phi = create_phi_node (vec_dest, iv_loop->header);
7752 stmt_vec_info induction_phi_info = loop_vinfo->add_stmt (induction_phi);
7753 induc_def = PHI_RESULT (induction_phi);
7755 /* Create the iv update inside the loop */
7756 stmts = NULL;
7757 vec_def = gimple_convert (&stmts, step_vectype, induc_def);
7758 vec_def = gimple_build (&stmts, PLUS_EXPR, step_vectype, vec_def, vec_step);
7759 vec_def = gimple_convert (&stmts, vectype, vec_def);
7760 gsi_insert_seq_before (&si, stmts, GSI_SAME_STMT);
7761 new_stmt = SSA_NAME_DEF_STMT (vec_def);
7762 stmt_vec_info new_stmt_info = loop_vinfo->add_stmt (new_stmt);
7764 /* Set the arguments of the phi node: */
7765 add_phi_arg (induction_phi, vec_init, pe, UNKNOWN_LOCATION);
7766 add_phi_arg (induction_phi, vec_def, loop_latch_edge (iv_loop),
7767 UNKNOWN_LOCATION);
7769 STMT_VINFO_VEC_STMT (stmt_info) = *vec_stmt = induction_phi_info;
7771 /* In case the vectorization factor (VF) is bigger than the number
7772 of elements that we can fit in a vectype (nunits), we have to generate
7773 more than one vector stmt, i.e., we need to "unroll" the
7774 vector stmt by a factor VF/nunits. For more details see documentation
7775 in vectorizable_operation. */
7777 if (ncopies > 1)
7779 gimple_seq seq = NULL;
7780 stmt_vec_info prev_stmt_vinfo;
7781 /* FORNOW. This restriction should be relaxed. */
7782 gcc_assert (!nested_in_vect_loop);
7784 /* Create the vector that holds the step of the induction. */
7785 if (SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr)))
7787 expr = build_int_cst (integer_type_node, nunits);
7788 expr = gimple_build (&seq, FLOAT_EXPR, TREE_TYPE (step_expr), expr);
7790 else
7791 expr = build_int_cst (TREE_TYPE (step_expr), nunits);
7792 new_name = gimple_build (&seq, MULT_EXPR, TREE_TYPE (step_expr),
7793 expr, step_expr);
7794 if (seq)
7796 new_bb = gsi_insert_seq_on_edge_immediate (pe, seq);
7797 gcc_assert (!new_bb);
7800 t = unshare_expr (new_name);
7801 gcc_assert (CONSTANT_CLASS_P (new_name)
7802 || TREE_CODE (new_name) == SSA_NAME);
7803 new_vec = build_vector_from_val (step_vectype, t);
7804 vec_step = vect_init_vector (loop_vinfo, stmt_info,
7805 new_vec, step_vectype, NULL);
7807 vec_def = induc_def;
7808 prev_stmt_vinfo = induction_phi_info;
7809 for (i = 1; i < ncopies; i++)
7811 /* vec_i = vec_prev + vec_step */
7812 gimple_seq stmts = NULL;
7813 vec_def = gimple_convert (&stmts, step_vectype, vec_def);
7814 vec_def = gimple_build (&stmts,
7815 PLUS_EXPR, step_vectype, vec_def, vec_step);
7816 vec_def = gimple_convert (&stmts, vectype, vec_def);
7818 gsi_insert_seq_before (&si, stmts, GSI_SAME_STMT);
7819 new_stmt = SSA_NAME_DEF_STMT (vec_def);
7820 new_stmt_info = loop_vinfo->add_stmt (new_stmt);
7821 STMT_VINFO_RELATED_STMT (prev_stmt_vinfo) = new_stmt_info;
7822 prev_stmt_vinfo = new_stmt_info;
7826 if (nested_in_vect_loop)
7828 /* Find the loop-closed exit-phi of the induction, and record
7829 the final vector of induction results: */
7830 exit_phi = NULL;
7831 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, loop_arg)
7833 gimple *use_stmt = USE_STMT (use_p);
7834 if (is_gimple_debug (use_stmt))
7835 continue;
7837 if (!flow_bb_inside_loop_p (iv_loop, gimple_bb (use_stmt)))
7839 exit_phi = use_stmt;
7840 break;
7843 if (exit_phi)
7845 stmt_vec_info stmt_vinfo = loop_vinfo->lookup_stmt (exit_phi);
7846 /* FORNOW. Currently not supporting the case that an inner-loop induction
7847 is not used in the outer-loop (i.e. only outside the outer-loop). */
7848 gcc_assert (STMT_VINFO_RELEVANT_P (stmt_vinfo)
7849 && !STMT_VINFO_LIVE_P (stmt_vinfo));
7851 STMT_VINFO_VEC_STMT (stmt_vinfo) = new_stmt_info;
7852 if (dump_enabled_p ())
7853 dump_printf_loc (MSG_NOTE, vect_location,
7854 "vector of inductions after inner-loop:%G",
7855 new_stmt);
7860 if (dump_enabled_p ())
7861 dump_printf_loc (MSG_NOTE, vect_location,
7862 "transform induction: created def-use cycle: %G%G",
7863 induction_phi, SSA_NAME_DEF_STMT (vec_def));
7865 return true;
7868 /* Function vectorizable_live_operation.
7870 STMT_INFO computes a value that is used outside the loop. Check if
7871 it can be supported. */
7873 bool
7874 vectorizable_live_operation (loop_vec_info loop_vinfo,
7875 stmt_vec_info stmt_info,
7876 gimple_stmt_iterator *gsi,
7877 slp_tree slp_node, slp_instance slp_node_instance,
7878 int slp_index, bool vec_stmt_p,
7879 stmt_vector_for_cost *)
7881 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
7882 imm_use_iterator imm_iter;
7883 tree lhs, lhs_type, bitsize, vec_bitsize;
7884 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
7885 poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (vectype);
7886 int ncopies;
7887 gimple *use_stmt;
7888 auto_vec<tree> vec_oprnds;
7889 int vec_entry = 0;
7890 poly_uint64 vec_index = 0;
7892 gcc_assert (STMT_VINFO_LIVE_P (stmt_info));
7894 /* If a stmt of a reduction is live, vectorize it via
7895 vect_create_epilog_for_reduction. vectorizable_reduction assessed
7896 validity so just trigger the transform here. */
7897 if (STMT_VINFO_REDUC_DEF (vect_orig_stmt (stmt_info)))
7899 if (!vec_stmt_p)
7900 return true;
7901 if (slp_node)
7903 /* For reduction chains the meta-info is attached to
7904 the group leader. */
7905 if (REDUC_GROUP_FIRST_ELEMENT (stmt_info))
7906 stmt_info = REDUC_GROUP_FIRST_ELEMENT (stmt_info);
7907 /* For SLP reductions we vectorize the epilogue for
7908 all involved stmts together. */
7909 else if (slp_index != 0)
7910 return true;
7912 stmt_vec_info reduc_info = info_for_reduction (loop_vinfo, stmt_info);
7913 gcc_assert (reduc_info->is_reduc_info);
7914 if (STMT_VINFO_REDUC_TYPE (reduc_info) == FOLD_LEFT_REDUCTION
7915 || STMT_VINFO_REDUC_TYPE (reduc_info) == EXTRACT_LAST_REDUCTION)
7916 return true;
7917 vect_create_epilog_for_reduction (loop_vinfo, stmt_info, slp_node,
7918 slp_node_instance);
7919 return true;
7922 /* FORNOW. CHECKME. */
7923 if (nested_in_vect_loop_p (loop, stmt_info))
7924 return false;
7926 /* If STMT is not relevant and it is a simple assignment and its inputs are
7927 invariant then it can remain in place, unvectorized. The original last
7928 scalar value that it computes will be used. */
7929 if (!STMT_VINFO_RELEVANT_P (stmt_info))
7931 gcc_assert (is_simple_and_all_uses_invariant (stmt_info, loop_vinfo));
7932 if (dump_enabled_p ())
7933 dump_printf_loc (MSG_NOTE, vect_location,
7934 "statement is simple and uses invariant. Leaving in "
7935 "place.\n");
7936 return true;
7939 if (slp_node)
7940 ncopies = 1;
7941 else
7942 ncopies = vect_get_num_copies (loop_vinfo, vectype);
7944 if (slp_node)
7946 gcc_assert (slp_index >= 0);
7948 int num_scalar = SLP_TREE_SCALAR_STMTS (slp_node).length ();
7949 int num_vec = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node);
7951 /* Get the last occurrence of the scalar index from the concatenation of
7952 all the slp vectors. Calculate which slp vector it is and the index
7953 within. */
7954 poly_uint64 pos = (num_vec * nunits) - num_scalar + slp_index;
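/* Illustrative numbers: with num_vec == 2, nunits == 4, num_scalar == 6 and
   slp_index == 5, pos = 2*4 - 6 + 5 = 7, which below yields vec_entry == 1
   and vec_index == 3 (the last lane of the second vector).  */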
7956 /* Calculate which vector contains the result, and which lane of
7957 that vector we need. */
7958 if (!can_div_trunc_p (pos, nunits, &vec_entry, &vec_index))
7960 if (dump_enabled_p ())
7961 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7962 "Cannot determine which vector holds the"
7963 " final result.\n");
7964 return false;
7968 if (!vec_stmt_p)
7970 /* No transformation required. */
7971 if (LOOP_VINFO_CAN_FULLY_MASK_P (loop_vinfo))
7973 if (!direct_internal_fn_supported_p (IFN_EXTRACT_LAST, vectype,
7974 OPTIMIZE_FOR_SPEED))
7976 if (dump_enabled_p ())
7977 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7978 "can't use a fully-masked loop because "
7979 "the target doesn't support extract last "
7980 "reduction.\n");
7981 LOOP_VINFO_CAN_FULLY_MASK_P (loop_vinfo) = false;
7983 else if (slp_node)
7985 if (dump_enabled_p ())
7986 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7987 "can't use a fully-masked loop because an "
7988 "SLP statement is live after the loop.\n");
7989 LOOP_VINFO_CAN_FULLY_MASK_P (loop_vinfo) = false;
7991 else if (ncopies > 1)
7993 if (dump_enabled_p ())
7994 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7995 "can't use a fully-masked loop because"
7996 " ncopies is greater than 1.\n");
7997 LOOP_VINFO_CAN_FULLY_MASK_P (loop_vinfo) = false;
7999 else
8001 gcc_assert (ncopies == 1 && !slp_node);
8002 vect_record_loop_mask (loop_vinfo,
8003 &LOOP_VINFO_MASKS (loop_vinfo),
8004 1, vectype, NULL);
8007 return true;
8010 /* Use the lhs of the original scalar statement. */
8011 gimple *stmt = vect_orig_stmt (stmt_info)->stmt;
8013 lhs = (is_a <gphi *> (stmt)) ? gimple_phi_result (stmt)
8014 : gimple_get_lhs (stmt);
8015 lhs_type = TREE_TYPE (lhs);
8017 bitsize = (VECTOR_BOOLEAN_TYPE_P (vectype)
8018 ? bitsize_int (TYPE_PRECISION (TREE_TYPE (vectype)))
8019 : TYPE_SIZE (TREE_TYPE (vectype)));
8020 vec_bitsize = TYPE_SIZE (vectype);
8022 /* Get the vectorized lhs of STMT and the lane to use (counted in bits). */
8023 tree vec_lhs, bitstart;
8024 if (slp_node)
8026 gcc_assert (!LOOP_VINFO_FULLY_MASKED_P (loop_vinfo));
8028 /* Get the correct slp vectorized stmt. */
8029 gimple *vec_stmt = SLP_TREE_VEC_STMTS (slp_node)[vec_entry]->stmt;
8030 if (gphi *phi = dyn_cast <gphi *> (vec_stmt))
8031 vec_lhs = gimple_phi_result (phi);
8032 else
8033 vec_lhs = gimple_get_lhs (vec_stmt);
8035 /* Get entry to use. */
8036 bitstart = bitsize_int (vec_index);
8037 bitstart = int_const_binop (MULT_EXPR, bitsize, bitstart);
8039 else
8041 enum vect_def_type dt = STMT_VINFO_DEF_TYPE (stmt_info);
8042 vec_lhs = vect_get_vec_def_for_operand_1 (stmt_info, dt);
8043 gcc_checking_assert (ncopies == 1
8044 || !LOOP_VINFO_FULLY_MASKED_P (loop_vinfo));
8046 /* For multiple copies, get the last copy. */
8047 for (int i = 1; i < ncopies; ++i)
8048 vec_lhs = vect_get_vec_def_for_stmt_copy (loop_vinfo, vec_lhs);
8050 /* Get the last lane in the vector. */
8051 bitstart = int_const_binop (MINUS_EXPR, vec_bitsize, bitsize);
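/* E.g. (illustrative) for a 128-bit vector of four 32-bit elements,
   bitstart = 128 - 32 = 96, i.e. the BIT_FIELD_REF built below reads the
   last element.  */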
8054 /* To ensure that the VEC_LHS used by the lane-extraction stmts satisfies
8055 the loop-closed PHI requirement, insert one phi node for it. It looks like:
8056 loop;
8058 # lhs' = PHI <lhs>
8060 loop;
8062 # vec_lhs' = PHI <vec_lhs>
8063 new_tree = lane_extract <vec_lhs', ...>;
8064 lhs' = new_tree; */
8066 basic_block exit_bb = single_exit (loop)->dest;
8067 gcc_assert (single_pred_p (exit_bb));
8069 tree vec_lhs_phi = copy_ssa_name (vec_lhs);
8070 gimple *phi = create_phi_node (vec_lhs_phi, exit_bb);
8071 SET_PHI_ARG_DEF (phi, single_exit (loop)->dest_idx, vec_lhs);
8073 gimple_seq stmts = NULL;
8074 tree new_tree;
8075 if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
8077 /* Emit:
8079 SCALAR_RES = EXTRACT_LAST <VEC_LHS, MASK>
8081 where VEC_LHS is the vectorized live-out result and MASK is
8082 the loop mask for the final iteration. */
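/* EXTRACT_LAST is assumed here to return the element of VEC_LHS in the last
   lane for which MASK is set, i.e. the value produced by the final active
   scalar iteration.  */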
8083 gcc_assert (ncopies == 1 && !slp_node);
8084 tree scalar_type = TREE_TYPE (STMT_VINFO_VECTYPE (stmt_info));
8085 tree mask = vect_get_loop_mask (gsi, &LOOP_VINFO_MASKS (loop_vinfo), 1,
8086 vectype, 0);
8087 tree scalar_res = gimple_build (&stmts, CFN_EXTRACT_LAST, scalar_type,
8088 mask, vec_lhs_phi);
8090 /* Convert the extracted vector element to the required scalar type. */
8091 new_tree = gimple_convert (&stmts, lhs_type, scalar_res);
8093 else
8095 tree bftype = TREE_TYPE (vectype);
8096 if (VECTOR_BOOLEAN_TYPE_P (vectype))
8097 bftype = build_nonstandard_integer_type (tree_to_uhwi (bitsize), 1);
8098 new_tree = build3 (BIT_FIELD_REF, bftype, vec_lhs_phi, bitsize, bitstart);
8099 new_tree = force_gimple_operand (fold_convert (lhs_type, new_tree),
8100 &stmts, true, NULL_TREE);
8103 if (stmts)
8105 gimple_stmt_iterator exit_gsi = gsi_after_labels (exit_bb);
8106 gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
8108 /* Remove existing phi from lhs and create one copy from new_tree. */
8109 tree lhs_phi = NULL_TREE;
8110 gimple_stmt_iterator gsi;
8111 for (gsi = gsi_start_phis (exit_bb); !gsi_end_p (gsi); gsi_next (&gsi))
8113 gimple *phi = gsi_stmt (gsi);
8114 if ((gimple_phi_arg_def (phi, 0) == lhs))
8116 remove_phi_node (&gsi, false);
8117 lhs_phi = gimple_phi_result (phi);
8118 gimple *copy = gimple_build_assign (lhs_phi, new_tree);
8119 gsi_insert_before (&exit_gsi, copy, GSI_SAME_STMT);
8120 break;
8125 /* Replace use of lhs with newly computed result. If the use stmt is a
8126 single arg PHI, just replace all uses of PHI result. It's necessary
8127 because the lcssa PHI defining lhs may appear before the newly inserted stmt. */
8128 use_operand_p use_p;
8129 FOR_EACH_IMM_USE_STMT (use_stmt, imm_iter, lhs)
8130 if (!flow_bb_inside_loop_p (loop, gimple_bb (use_stmt))
8131 && !is_gimple_debug (use_stmt))
8133 if (gimple_code (use_stmt) == GIMPLE_PHI
8134 && gimple_phi_num_args (use_stmt) == 1)
8136 replace_uses_by (gimple_phi_result (use_stmt), new_tree);
8138 else
8140 FOR_EACH_IMM_USE_ON_STMT (use_p, imm_iter)
8141 SET_USE (use_p, new_tree);
8143 update_stmt (use_stmt);
8146 return true;
8149 /* Kill any debug uses outside LOOP of SSA names defined in STMT_INFO. */
8151 static void
8152 vect_loop_kill_debug_uses (class loop *loop, stmt_vec_info stmt_info)
8154 ssa_op_iter op_iter;
8155 imm_use_iterator imm_iter;
8156 def_operand_p def_p;
8157 gimple *ustmt;
8159 FOR_EACH_PHI_OR_STMT_DEF (def_p, stmt_info->stmt, op_iter, SSA_OP_DEF)
8161 FOR_EACH_IMM_USE_STMT (ustmt, imm_iter, DEF_FROM_PTR (def_p))
8163 basic_block bb;
8165 if (!is_gimple_debug (ustmt))
8166 continue;
8168 bb = gimple_bb (ustmt);
8170 if (!flow_bb_inside_loop_p (loop, bb))
8172 if (gimple_debug_bind_p (ustmt))
8174 if (dump_enabled_p ())
8175 dump_printf_loc (MSG_NOTE, vect_location,
8176 "killing debug use\n");
8178 gimple_debug_bind_reset_value (ustmt);
8179 update_stmt (ustmt);
8181 else
8182 gcc_unreachable ();
8188 /* Given loop represented by LOOP_VINFO, return true if computation of
8189 LOOP_VINFO_NITERS (= LOOP_VINFO_NITERSM1 + 1) doesn't overflow, false
8190 otherwise. */
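/* For example (illustrative): if the niters type is a 32-bit unsigned type
   and the loop runs 2^32 times, NITERSM1 is 0xffffffff while NITERS wraps to
   0, so the constant check below (NITERSM1 < NITERS) correctly fails; in the
   symbolic case, a maximum iteration bound strictly below the type's maximum
   value guarantees that NITERSM1 + 1 cannot wrap.  */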
8192 static bool
8193 loop_niters_no_overflow (loop_vec_info loop_vinfo)
8195 /* Constant case. */
8196 if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
8198 tree cst_niters = LOOP_VINFO_NITERS (loop_vinfo);
8199 tree cst_nitersm1 = LOOP_VINFO_NITERSM1 (loop_vinfo);
8201 gcc_assert (TREE_CODE (cst_niters) == INTEGER_CST);
8202 gcc_assert (TREE_CODE (cst_nitersm1) == INTEGER_CST);
8203 if (wi::to_widest (cst_nitersm1) < wi::to_widest (cst_niters))
8204 return true;
8207 widest_int max;
8208 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
8209 /* Check the upper bound of loop niters. */
8210 if (get_max_loop_iterations (loop, &max))
8212 tree type = TREE_TYPE (LOOP_VINFO_NITERS (loop_vinfo));
8213 signop sgn = TYPE_SIGN (type);
8214 widest_int type_max = widest_int::from (wi::max_value (type), sgn);
8215 if (max < type_max)
8216 return true;
8218 return false;
8221 /* Return a mask type with half the number of elements as OLD_TYPE,
8222 given that it should have mode NEW_MODE. */
8224 tree
8225 vect_halve_mask_nunits (tree old_type, machine_mode new_mode)
8227 poly_uint64 nunits = exact_div (TYPE_VECTOR_SUBPARTS (old_type), 2);
8228 return build_truth_vector_type_for_mode (nunits, new_mode);
8231 /* Return a mask type with twice as many elements as OLD_TYPE,
8232 given that it should have mode NEW_MODE. */
8234 tree
8235 vect_double_mask_nunits (tree old_type, machine_mode new_mode)
8237 poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (old_type) * 2;
8238 return build_truth_vector_type_for_mode (nunits, new_mode);
8241 /* Record that a fully-masked version of LOOP_VINFO would need MASKS to
8242 contain a sequence of NVECTORS masks that each control a vector of type
8243 VECTYPE. If SCALAR_MASK is nonnull, the fully-masked loop would AND
8244 these vector masks with the vector version of SCALAR_MASK. */
8246 void
8247 vect_record_loop_mask (loop_vec_info loop_vinfo, vec_loop_masks *masks,
8248 unsigned int nvectors, tree vectype, tree scalar_mask)
8250 gcc_assert (nvectors != 0);
8251 if (masks->length () < nvectors)
8252 masks->safe_grow_cleared (nvectors);
8253 rgroup_masks *rgm = &(*masks)[nvectors - 1];
8254 /* The number of scalars per iteration and the number of vectors are
8255 both compile-time constants. */
8256 unsigned int nscalars_per_iter
8257 = exact_div (nvectors * TYPE_VECTOR_SUBPARTS (vectype),
8258 LOOP_VINFO_VECT_FACTOR (loop_vinfo)).to_constant ();
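/* Illustrative instance: with a vectorization factor of 8, a vector type of
   4 elements and nvectors == 2, nscalars_per_iter = 2*4 / 8 = 1, i.e. this
   rgroup controls one scalar of this kind per scalar iteration.  */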
8260 if (scalar_mask)
8262 scalar_cond_masked_key cond (scalar_mask, nvectors);
8263 loop_vinfo->scalar_cond_masked_set.add (cond);
8266 if (rgm->max_nscalars_per_iter < nscalars_per_iter)
8268 rgm->max_nscalars_per_iter = nscalars_per_iter;
8269 rgm->mask_type = truth_type_for (vectype);
8273 /* Given a complete set of masks MASKS, extract mask number INDEX
8274 for an rgroup that operates on NVECTORS vectors of type VECTYPE,
8275 where 0 <= INDEX < NVECTORS. Insert any set-up statements before GSI.
8277 See the comment above vec_loop_masks for more details about the mask
8278 arrangement. */
8280 tree
8281 vect_get_loop_mask (gimple_stmt_iterator *gsi, vec_loop_masks *masks,
8282 unsigned int nvectors, tree vectype, unsigned int index)
8284 rgroup_masks *rgm = &(*masks)[nvectors - 1];
8285 tree mask_type = rgm->mask_type;
8287 /* Populate the rgroup's mask array, if this is the first time we've
8288 used it. */
8289 if (rgm->masks.is_empty ())
8291 rgm->masks.safe_grow_cleared (nvectors);
8292 for (unsigned int i = 0; i < nvectors; ++i)
8294 tree mask = make_temp_ssa_name (mask_type, NULL, "loop_mask");
8295 /* Provide a dummy definition until the real one is available. */
8296 SSA_NAME_DEF_STMT (mask) = gimple_build_nop ();
8297 rgm->masks[i] = mask;
8301 tree mask = rgm->masks[index];
8302 if (maybe_ne (TYPE_VECTOR_SUBPARTS (mask_type),
8303 TYPE_VECTOR_SUBPARTS (vectype)))
8305 /* A loop mask for data type X can be reused for data type Y
8306 if X has N times more elements than Y and if Y's elements
8307 are N times bigger than X's. In this case each sequence
8308 of N elements in the loop mask will be all-zero or all-one.
8309 We can then view-convert the mask so that each sequence of
8310 N elements is replaced by a single element. */
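/* For instance (illustrative): a mask created for 8 16-bit elements can
   control a vector of 4 32-bit elements; each pair of mask elements is known
   to be all-zero or all-one, so the VIEW_CONVERT_EXPR built below folds every
   such pair into a single wider mask element.  */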
8311 gcc_assert (multiple_p (TYPE_VECTOR_SUBPARTS (mask_type),
8312 TYPE_VECTOR_SUBPARTS (vectype)));
8313 gimple_seq seq = NULL;
8314 mask_type = truth_type_for (vectype);
8315 mask = gimple_build (&seq, VIEW_CONVERT_EXPR, mask_type, mask);
8316 if (seq)
8317 gsi_insert_seq_before (gsi, seq, GSI_SAME_STMT);
8319 return mask;
8322 /* Scale profiling counters by estimation for LOOP which is vectorized
8323 by factor VF. */
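/* Rough illustration (assuming the unrolled-niter estimate behaves like
   simple division): with VF == 4 and a scalar loop expected to iterate about
   100 times, the vector loop is expected to iterate about 25 times, so the
   exit edge probability computed below becomes roughly 1/(25 + 1).  */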
8325 static void
8326 scale_profile_for_vect_loop (class loop *loop, unsigned vf)
8328 edge preheader = loop_preheader_edge (loop);
8329 /* Reduce loop iterations by the vectorization factor. */
8330 gcov_type new_est_niter = niter_for_unrolled_loop (loop, vf);
8331 profile_count freq_h = loop->header->count, freq_e = preheader->count ();
8333 if (freq_h.nonzero_p ())
8335 profile_probability p;
8337 /* Avoid dropping loop body profile counter to 0 because of zero count
8338 in loop's preheader. */
8339 if (!(freq_e == profile_count::zero ()))
8340 freq_e = freq_e.force_nonzero ();
8341 p = freq_e.apply_scale (new_est_niter + 1, 1).probability_in (freq_h);
8342 scale_loop_frequencies (loop, p);
8345 edge exit_e = single_exit (loop);
8346 exit_e->probability = profile_probability::always ()
8347 .apply_scale (1, new_est_niter + 1);
8349 edge exit_l = single_pred_edge (loop->latch);
8350 profile_probability prob = exit_l->probability;
8351 exit_l->probability = exit_e->probability.invert ();
8352 if (prob.initialized_p () && exit_l->probability.initialized_p ())
8353 scale_bbs_frequencies (&loop->latch, 1, exit_l->probability / prob);
8356 /* Vectorize STMT_INFO if relevant, inserting any new instructions before GSI.
8357 When vectorizing STMT_INFO as a store, set *SEEN_STORE to its
8358 stmt_vec_info. */
8360 static void
8361 vect_transform_loop_stmt (loop_vec_info loop_vinfo, stmt_vec_info stmt_info,
8362 gimple_stmt_iterator *gsi, stmt_vec_info *seen_store)
8364 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
8365 poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
8367 if (dump_enabled_p ())
8368 dump_printf_loc (MSG_NOTE, vect_location,
8369 "------>vectorizing statement: %G", stmt_info->stmt);
8371 if (MAY_HAVE_DEBUG_BIND_STMTS && !STMT_VINFO_LIVE_P (stmt_info))
8372 vect_loop_kill_debug_uses (loop, stmt_info);
8374 if (!STMT_VINFO_RELEVANT_P (stmt_info)
8375 && !STMT_VINFO_LIVE_P (stmt_info))
8376 return;
8378 if (STMT_VINFO_VECTYPE (stmt_info))
8380 poly_uint64 nunits
8381 = TYPE_VECTOR_SUBPARTS (STMT_VINFO_VECTYPE (stmt_info));
8382 if (!STMT_SLP_TYPE (stmt_info)
8383 && maybe_ne (nunits, vf)
8384 && dump_enabled_p ())
8385 /* For SLP, VF is set according to the unrolling factor, not
8386 to the vector size, hence this dump message is not meaningful for SLP. */
8387 dump_printf_loc (MSG_NOTE, vect_location, "multiple-types.\n");
8390 /* Pure SLP statements have already been vectorized. We still need
8391 to apply loop vectorization to hybrid SLP statements. */
8392 if (PURE_SLP_STMT (stmt_info))
8393 return;
8395 if (dump_enabled_p ())
8396 dump_printf_loc (MSG_NOTE, vect_location, "transform statement.\n");
8398 if (vect_transform_stmt (loop_vinfo, stmt_info, gsi, NULL, NULL))
8399 *seen_store = stmt_info;
8402 /* Helper function to pass to simplify_replace_tree to enable replacing tree's
8403 in the hash_map with its corresponding values. */
8405 static tree
8406 find_in_mapping (tree t, void *context)
8408 hash_map<tree,tree>* mapping = (hash_map<tree, tree>*) context;
8410 tree *value = mapping->get (t);
8411 return value ? *value : t;
8414 /* Update EPILOGUE's loop_vec_info. EPILOGUE was constructed as a copy of the
8415 original loop that has now been vectorized.
8417 The inits of the data_references need to be advanced with the number of
8418 iterations of the main loop. This has been computed in vect_do_peeling and
8419 is stored in parameter ADVANCE. We first restore the data_references
8420 initial offset with the values recorded in ORIG_DRS_INIT.
8422 Since the loop_vec_info of this EPILOGUE was constructed for the original
8423 loop, its stmt_vec_infos all point to the original statements. These need
8424 to be updated to point to their corresponding copies as well as the SSA_NAMES
8425 in their PATTERN_DEF_SEQs and RELATED_STMTs.
8427 The data_reference's connections also need to be updated. Their
8428 corresponding dr_vec_infos need to be reconnected to the EPILOGUE's
8429 stmt_vec_infos, their statements need to point to their corresponding copy,
8430 if they are gather loads or scatter stores then their reference needs to be
8431 updated to point to its corresponding copy and finally we set
8432 'base_misaligned' to false as we have already peeled for alignment in the
8433 prologue of the main loop. */
8435 static void
8436 update_epilogue_loop_vinfo (class loop *epilogue, tree advance)
8438 loop_vec_info epilogue_vinfo = loop_vec_info_for_loop (epilogue);
8439 auto_vec<gimple *> stmt_worklist;
8440 hash_map<tree,tree> mapping;
8441 gimple *orig_stmt, *new_stmt;
8442 gimple_stmt_iterator epilogue_gsi;
8443 gphi_iterator epilogue_phi_gsi;
8444 stmt_vec_info stmt_vinfo = NULL, related_vinfo;
8445 basic_block *epilogue_bbs = get_loop_body (epilogue);
8446 unsigned i;
8448 LOOP_VINFO_BBS (epilogue_vinfo) = epilogue_bbs;
8450 /* Advance the data_references by the number of iterations of the previous
8451 loop and its prologue. */
8452 vect_update_inits_of_drs (epilogue_vinfo, advance, PLUS_EXPR);
8455 /* The EPILOGUE loop is a copy of the original loop so they share the same
8456 gimple UIDs. In this loop we update the loop_vec_info of the EPILOGUE to
8457 point to the copied statements. We also create a mapping of all LHS' in
8458 the original loop and all the LHS' in the EPILOGUE and create worklists to
8459 update the STMT_VINFO_PATTERN_DEF_SEQs and STMT_VINFO_RELATED_STMTs. */
8460 for (unsigned i = 0; i < epilogue->num_nodes; ++i)
8462 for (epilogue_phi_gsi = gsi_start_phis (epilogue_bbs[i]);
8463 !gsi_end_p (epilogue_phi_gsi); gsi_next (&epilogue_phi_gsi))
8465 new_stmt = epilogue_phi_gsi.phi ();
8467 gcc_assert (gimple_uid (new_stmt) > 0);
8468 stmt_vinfo
8469 = epilogue_vinfo->stmt_vec_infos[gimple_uid (new_stmt) - 1];
8471 orig_stmt = STMT_VINFO_STMT (stmt_vinfo);
8472 STMT_VINFO_STMT (stmt_vinfo) = new_stmt;
8474 mapping.put (gimple_phi_result (orig_stmt),
8475 gimple_phi_result (new_stmt));
8476 /* PHI nodes cannot have patterns or related statements. */
8477 gcc_assert (STMT_VINFO_PATTERN_DEF_SEQ (stmt_vinfo) == NULL
8478 && STMT_VINFO_RELATED_STMT (stmt_vinfo) == NULL);
8481 for (epilogue_gsi = gsi_start_bb (epilogue_bbs[i]);
8482 !gsi_end_p (epilogue_gsi); gsi_next (&epilogue_gsi))
8484 new_stmt = gsi_stmt (epilogue_gsi);
8486 gcc_assert (gimple_uid (new_stmt) > 0);
8487 stmt_vinfo
8488 = epilogue_vinfo->stmt_vec_infos[gimple_uid (new_stmt) - 1];
8490 orig_stmt = STMT_VINFO_STMT (stmt_vinfo);
8491 STMT_VINFO_STMT (stmt_vinfo) = new_stmt;
8493 if (tree old_lhs = gimple_get_lhs (orig_stmt))
8494 mapping.put (old_lhs, gimple_get_lhs (new_stmt));
8496 if (STMT_VINFO_PATTERN_DEF_SEQ (stmt_vinfo))
8498 gimple_seq seq = STMT_VINFO_PATTERN_DEF_SEQ (stmt_vinfo);
8499 for (gimple_stmt_iterator gsi = gsi_start (seq);
8500 !gsi_end_p (gsi); gsi_next (&gsi))
8501 stmt_worklist.safe_push (gsi_stmt (gsi));
8504 related_vinfo = STMT_VINFO_RELATED_STMT (stmt_vinfo);
8505 if (related_vinfo != NULL && related_vinfo != stmt_vinfo)
8507 gimple *stmt = STMT_VINFO_STMT (related_vinfo);
8508 stmt_worklist.safe_push (stmt);
8509 /* Set BB such that the assert in
8510 'get_initial_def_for_reduction' is able to determine that
8511 the BB of the related stmt is inside this loop. */
8512 gimple_set_bb (stmt,
8513 gimple_bb (new_stmt));
8514 related_vinfo = STMT_VINFO_RELATED_STMT (related_vinfo);
8515 gcc_assert (related_vinfo == NULL
8516 || related_vinfo == stmt_vinfo);
8521 /* The PATTERN_DEF_SEQs and RELATED_STMTs in the epilogue were constructed
8522 using the original main loop and thus need to be updated to refer to the
8523 cloned variables used in the epilogue. */
8524 for (unsigned i = 0; i < stmt_worklist.length (); ++i)
8526 gimple *stmt = stmt_worklist[i];
8527 tree *new_op;
8529 for (unsigned j = 1; j < gimple_num_ops (stmt); ++j)
8531 tree op = gimple_op (stmt, j);
8532 if ((new_op = mapping.get(op)))
8533 gimple_set_op (stmt, j, *new_op);
8534 else
8536 /* PR92429: The last argument of simplify_replace_tree disables
8537 folding when replacing arguments. This is required as
8538 otherwise we might end up with different statements from the
8539 ones analyzed in vect_analyze_loop, leading to different
8540 vectorization. */
8541 op = simplify_replace_tree (op, NULL_TREE, NULL_TREE,
8542 &find_in_mapping, &mapping, false);
8543 gimple_set_op (stmt, j, op);
8548 struct data_reference *dr;
8549 vec<data_reference_p> datarefs = epilogue_vinfo->shared->datarefs;
8550 FOR_EACH_VEC_ELT (datarefs, i, dr)
8552 orig_stmt = DR_STMT (dr);
8553 gcc_assert (gimple_uid (orig_stmt) > 0);
8554 stmt_vinfo = epilogue_vinfo->stmt_vec_infos[gimple_uid (orig_stmt) - 1];
8555 /* Data references for gather loads and scatter stores do not use the
8556 updated offset we set using ADVANCE. Instead we have to make sure the
8557 references in the data references point to the corresponding copies of
8558 the originals in the epilogue. */
8559 if (STMT_VINFO_MEMORY_ACCESS_TYPE (vect_stmt_to_vectorize (stmt_vinfo))
8560 == VMAT_GATHER_SCATTER)
8562 DR_REF (dr)
8563 = simplify_replace_tree (DR_REF (dr), NULL_TREE, NULL_TREE,
8564 &find_in_mapping, &mapping);
8565 DR_BASE_ADDRESS (dr)
8566 = simplify_replace_tree (DR_BASE_ADDRESS (dr), NULL_TREE, NULL_TREE,
8567 &find_in_mapping, &mapping);
8569 DR_STMT (dr) = STMT_VINFO_STMT (stmt_vinfo);
8570 stmt_vinfo->dr_aux.stmt = stmt_vinfo;
8571 /* The vector size of the epilogue is smaller than that of the main loop,
8572 so its alignment requirement is the same or lower. This means the dr
8573 is by definition aligned. */
8574 STMT_VINFO_DR_INFO (stmt_vinfo)->base_misaligned = false;
8577 epilogue_vinfo->shared->datarefs_copy.release ();
8578 epilogue_vinfo->shared->save_datarefs ();
8581 /* Function vect_transform_loop.
8583 The analysis phase has determined that the loop is vectorizable.
8584 Vectorize the loop - create vectorized stmts to replace the scalar
8585 stmts in the loop, and update the loop exit condition.
8586 Returns the scalar epilogue loop, if any. */
8588 class loop *
8589 vect_transform_loop (loop_vec_info loop_vinfo, gimple *loop_vectorized_call)
8591 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
8592 class loop *epilogue = NULL;
8593 basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
8594 int nbbs = loop->num_nodes;
8595 int i;
8596 tree niters_vector = NULL_TREE;
8597 tree step_vector = NULL_TREE;
8598 tree niters_vector_mult_vf = NULL_TREE;
8599 poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
8600 unsigned int lowest_vf = constant_lower_bound (vf);
8601 gimple *stmt;
8602 bool check_profitability = false;
8603 unsigned int th;
8605 DUMP_VECT_SCOPE ("vec_transform_loop");
8607 loop_vinfo->shared->check_datarefs ();
8609 /* Use the more conservative vectorization threshold. If the number
8610 of iterations is constant, assume the cost check has been performed
8611 by our caller. If the threshold makes all loops profitable that
8612 run at least the (estimated) vectorization factor number of times,
8613 checking is pointless, too. */
8614 th = LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo);
8615 if (vect_apply_runtime_profitability_check_p (loop_vinfo))
8617 if (dump_enabled_p ())
8618 dump_printf_loc (MSG_NOTE, vect_location,
8619 "Profitability threshold is %d loop iterations.\n",
8620 th);
8621 check_profitability = true;
8624 /* Make sure there exists a single-predecessor exit bb. Do this before
8625 versioning. */
8626 edge e = single_exit (loop);
8627 if (! single_pred_p (e->dest))
8629 split_loop_exit_edge (e, true);
8630 if (dump_enabled_p ())
8631 dump_printf (MSG_NOTE, "split exit edge\n");
8634 /* Version the loop first, if required, so the profitability check
8635 comes first. */
8637 if (LOOP_REQUIRES_VERSIONING (loop_vinfo))
8639 class loop *sloop
8640 = vect_loop_versioning (loop_vinfo, loop_vectorized_call);
8641 sloop->force_vectorize = false;
8642 check_profitability = false;
8645 /* Make sure there exists a single-predecessor exit bb also on the
8646 scalar loop copy. Do this after versioning but before peeling
8647 so the CFG structure is fine for both the scalar and the if-converted
8648 loop, and slpeel_duplicate_current_defs_from_edges sees matched
8649 loop-closed PHI nodes on the exit. */
8650 if (LOOP_VINFO_SCALAR_LOOP (loop_vinfo))
8652 e = single_exit (LOOP_VINFO_SCALAR_LOOP (loop_vinfo));
8653 if (! single_pred_p (e->dest))
8655 split_loop_exit_edge (e, true);
8656 if (dump_enabled_p ())
8657 dump_printf (MSG_NOTE, "split exit edge of scalar loop\n");
8661 tree niters = vect_build_loop_niters (loop_vinfo);
8662 LOOP_VINFO_NITERS_UNCHANGED (loop_vinfo) = niters;
8663 tree nitersm1 = unshare_expr (LOOP_VINFO_NITERSM1 (loop_vinfo));
8664 bool niters_no_overflow = loop_niters_no_overflow (loop_vinfo);
8665 tree advance;
8666 drs_init_vec orig_drs_init;
8668 epilogue = vect_do_peeling (loop_vinfo, niters, nitersm1, &niters_vector,
8669 &step_vector, &niters_vector_mult_vf, th,
8670 check_profitability, niters_no_overflow,
8671 &advance);
8673 if (LOOP_VINFO_SCALAR_LOOP (loop_vinfo)
8674 && LOOP_VINFO_SCALAR_LOOP_SCALING (loop_vinfo).initialized_p ())
8675 scale_loop_frequencies (LOOP_VINFO_SCALAR_LOOP (loop_vinfo),
8676 LOOP_VINFO_SCALAR_LOOP_SCALING (loop_vinfo));
8678 if (niters_vector == NULL_TREE)
8680 if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
8681 && !LOOP_VINFO_FULLY_MASKED_P (loop_vinfo)
8682 && known_eq (lowest_vf, vf))
8684 niters_vector
8685 = build_int_cst (TREE_TYPE (LOOP_VINFO_NITERS (loop_vinfo)),
8686 LOOP_VINFO_INT_NITERS (loop_vinfo) / lowest_vf);
8687 step_vector = build_one_cst (TREE_TYPE (niters));
8689 else
8690 vect_gen_vector_loop_niters (loop_vinfo, niters, &niters_vector,
8691 &step_vector, niters_no_overflow);
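/* For example (purely illustrative numbers): with a compile-time niter
   count of 16, a constant VF of 4 and no masking, NITERS_VECTOR is the
   constant 4 and STEP_VECTOR is 1, so the vector loop executes its body
   four times.  */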
8694 /* 1) Make sure the loop header has exactly two entries
8695 2) Make sure we have a preheader basic block. */
8697 gcc_assert (EDGE_COUNT (loop->header->preds) == 2);
8699 split_edge (loop_preheader_edge (loop));
8701 if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo)
8702 && vect_use_loop_mask_for_alignment_p (loop_vinfo))
8703 /* This will deal with any possible peeling. */
8704 vect_prepare_for_masked_peels (loop_vinfo);
8706 /* Schedule the SLP instances first, then handle loop vectorization
8707 below. */
8708 if (!loop_vinfo->slp_instances.is_empty ())
8710 DUMP_VECT_SCOPE ("scheduling SLP instances");
8711 vect_schedule_slp (loop_vinfo);
8714 /* FORNOW: the vectorizer supports only loops whose body consists
8715 of one basic block (header + empty latch). When the vectorizer
8716 supports more involved loop forms, the order in which the BBs are
8717 traversed will need to be reconsidered. */
8719 for (i = 0; i < nbbs; i++)
8721 basic_block bb = bbs[i];
8722 stmt_vec_info stmt_info;
8724 for (gphi_iterator si = gsi_start_phis (bb); !gsi_end_p (si);
8725 gsi_next (&si))
8727 gphi *phi = si.phi ();
8728 if (dump_enabled_p ())
8729 dump_printf_loc (MSG_NOTE, vect_location,
8730 "------>vectorizing phi: %G", phi);
8731 stmt_info = loop_vinfo->lookup_stmt (phi);
8732 if (!stmt_info)
8733 continue;
8735 if (MAY_HAVE_DEBUG_BIND_STMTS && !STMT_VINFO_LIVE_P (stmt_info))
8736 vect_loop_kill_debug_uses (loop, stmt_info);
8738 if (!STMT_VINFO_RELEVANT_P (stmt_info)
8739 && !STMT_VINFO_LIVE_P (stmt_info))
8740 continue;
8742 if (STMT_VINFO_VECTYPE (stmt_info)
8743 && (maybe_ne
8744 (TYPE_VECTOR_SUBPARTS (STMT_VINFO_VECTYPE (stmt_info)), vf))
8745 && dump_enabled_p ())
8746 dump_printf_loc (MSG_NOTE, vect_location, "multiple-types.\n");
8748 if ((STMT_VINFO_DEF_TYPE (stmt_info) == vect_induction_def
8749 || STMT_VINFO_DEF_TYPE (stmt_info) == vect_reduction_def
8750 || STMT_VINFO_DEF_TYPE (stmt_info) == vect_double_reduction_def
8751 || STMT_VINFO_DEF_TYPE (stmt_info) == vect_nested_cycle
8752 || STMT_VINFO_DEF_TYPE (stmt_info) == vect_internal_def)
8753 && ! PURE_SLP_STMT (stmt_info))
8755 if (dump_enabled_p ())
8756 dump_printf_loc (MSG_NOTE, vect_location, "transform phi.\n");
8757 vect_transform_stmt (loop_vinfo, stmt_info, NULL, NULL, NULL);
8761 for (gimple_stmt_iterator si = gsi_start_bb (bb);
8762 !gsi_end_p (si);)
8764 stmt = gsi_stmt (si);
8765 /* During vectorization remove existing clobber stmts. */
8766 if (gimple_clobber_p (stmt))
8768 unlink_stmt_vdef (stmt);
8769 gsi_remove (&si, true);
8770 release_defs (stmt);
8772 else
8774 stmt_info = loop_vinfo->lookup_stmt (stmt);
8776 /* vector stmts created in the outer-loop during vectorization of
8777 stmts in an inner-loop may not have a stmt_info, and do not
8778 need to be vectorized. */
8779 stmt_vec_info seen_store = NULL;
8780 if (stmt_info)
8782 if (STMT_VINFO_IN_PATTERN_P (stmt_info))
8784 gimple *def_seq = STMT_VINFO_PATTERN_DEF_SEQ (stmt_info);
8785 for (gimple_stmt_iterator subsi = gsi_start (def_seq);
8786 !gsi_end_p (subsi); gsi_next (&subsi))
8788 stmt_vec_info pat_stmt_info
8789 = loop_vinfo->lookup_stmt (gsi_stmt (subsi));
8790 vect_transform_loop_stmt (loop_vinfo, pat_stmt_info,
8791 &si, &seen_store);
8793 stmt_vec_info pat_stmt_info
8794 = STMT_VINFO_RELATED_STMT (stmt_info);
8795 vect_transform_loop_stmt (loop_vinfo, pat_stmt_info, &si,
8796 &seen_store);
8798 vect_transform_loop_stmt (loop_vinfo, stmt_info, &si,
8799 &seen_store);
8801 gsi_next (&si);
8802 if (seen_store)
8804 if (STMT_VINFO_GROUPED_ACCESS (seen_store))
8805 /* Interleaving: the vectorization of the
8806 interleaving chain has been completed -
8807 free all the stores in the chain. */
8808 vect_remove_stores (loop_vinfo,
8809 DR_GROUP_FIRST_ELEMENT (seen_store));
8810 else
8811 /* Free the attached stmt_vec_info and remove the stmt. */
8812 loop_vinfo->remove_stmt (stmt_info);
8817 /* Stub out scalar statements that must not survive vectorization.
8818 Doing this here helps with grouped statements, or statements that
8819 are involved in patterns. */
8820 for (gimple_stmt_iterator gsi = gsi_start_bb (bb);
8821 !gsi_end_p (gsi); gsi_next (&gsi))
8823 gcall *call = dyn_cast <gcall *> (gsi_stmt (gsi));
8824 if (call && gimple_call_internal_p (call, IFN_MASK_LOAD))
8826 tree lhs = gimple_get_lhs (call);
8827 if (!VECTOR_TYPE_P (TREE_TYPE (lhs)))
8829 tree zero = build_zero_cst (TREE_TYPE (lhs));
8830 gimple *new_stmt = gimple_build_assign (lhs, zero);
8831 gsi_replace (&gsi, new_stmt, true);
8835 } /* BBs in loop */
8837 /* The vectorization factor is always > 1, so if we use an IV increment of 1,
8838 a zero NITERS becomes a nonzero NITERS_VECTOR. */
8839 if (integer_onep (step_vector))
8840 niters_no_overflow = true;
8841 vect_set_loop_condition (loop, loop_vinfo, niters_vector, step_vector,
8842 niters_vector_mult_vf, !niters_no_overflow);
8844 unsigned int assumed_vf = vect_vf_for_cost (loop_vinfo);
8845 scale_profile_for_vect_loop (loop, assumed_vf);
8847 /* True if the final iteration might not handle a full vector's
8848 worth of scalar iterations. */
8849 bool final_iter_may_be_partial = LOOP_VINFO_FULLY_MASKED_P (loop_vinfo);
8850 /* The minimum number of iterations performed by the epilogue. This
8851 is 1 when peeling for gaps because we always need a final scalar
8852 iteration. */
8853 int min_epilogue_iters = LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo) ? 1 : 0;
8854 /* +1 to convert latch counts to loop iteration counts,
8855 -min_epilogue_iters to remove iterations that cannot be performed
8856 by the vector code. */
8857 int bias_for_lowest = 1 - min_epilogue_iters;
8858 int bias_for_assumed = bias_for_lowest;
8859 int alignment_npeels = LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo);
8860 if (alignment_npeels && LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
8862 /* When the amount of peeling is known at compile time, the first
8863 iteration will have exactly alignment_npeels active elements.
8864 In the worst case it will have at least one. */
8865 int min_first_active = (alignment_npeels > 0 ? alignment_npeels : 1);
8866 bias_for_lowest += lowest_vf - min_first_active;
8867 bias_for_assumed += assumed_vf - min_first_active;
8869 /* In these calculations the "- 1" converts loop iteration counts
8870 back to latch counts. */
8871 if (loop->any_upper_bound)
8872 loop->nb_iterations_upper_bound
8873 = (final_iter_may_be_partial
8874 ? wi::udiv_ceil (loop->nb_iterations_upper_bound + bias_for_lowest,
8875 lowest_vf) - 1
8876 : wi::udiv_floor (loop->nb_iterations_upper_bound + bias_for_lowest,
8877 lowest_vf) - 1);
8878 if (loop->any_likely_upper_bound)
8879 loop->nb_iterations_likely_upper_bound
8880 = (final_iter_may_be_partial
8881 ? wi::udiv_ceil (loop->nb_iterations_likely_upper_bound
8882 + bias_for_lowest, lowest_vf) - 1
8883 : wi::udiv_floor (loop->nb_iterations_likely_upper_bound
8884 + bias_for_lowest, lowest_vf) - 1);
8885 if (loop->any_estimate)
8886 loop->nb_iterations_estimate
8887 = (final_iter_may_be_partial
8888 ? wi::udiv_ceil (loop->nb_iterations_estimate + bias_for_assumed,
8889 assumed_vf) - 1
8890 : wi::udiv_floor (loop->nb_iterations_estimate + bias_for_assumed,
8891 assumed_vf) - 1);
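/* Worked example (illustrative numbers): with a constant VF of 4, no
   peeling for gaps and no full masking, BIAS_FOR_LOWEST is 1.  A scalar
   upper bound of 11 latch iterations (at most 12 loop iterations) then
   becomes udiv_floor (11 + 1, 4) - 1 = 2 latch iterations of the vector
   loop, i.e. its body runs at most 3 times.  */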
8893 if (dump_enabled_p ())
8895 if (!LOOP_VINFO_EPILOGUE_P (loop_vinfo))
8897 dump_printf_loc (MSG_NOTE, vect_location,
8898 "LOOP VECTORIZED\n");
8899 if (loop->inner)
8900 dump_printf_loc (MSG_NOTE, vect_location,
8901 "OUTER LOOP VECTORIZED\n");
8902 dump_printf (MSG_NOTE, "\n");
8904 else
8905 dump_printf_loc (MSG_NOTE, vect_location,
8906 "LOOP EPILOGUE VECTORIZED (MODE=%s)\n",
8907 GET_MODE_NAME (loop_vinfo->vector_mode));
8910 /* Loops vectorized with a variable factor won't benefit from
8911 unrolling/peeling. */
8912 if (!vf.is_constant ())
8914 loop->unroll = 1;
8915 if (dump_enabled_p ())
8916 dump_printf_loc (MSG_NOTE, vect_location, "Disabling unrolling due to"
8917 " variable-length vectorization factor\n");
8919 /* Free SLP instances here because otherwise stmt reference counting
8920 won't work. */
8921 slp_instance instance;
8922 FOR_EACH_VEC_ELT (LOOP_VINFO_SLP_INSTANCES (loop_vinfo), i, instance)
8923 vect_free_slp_instance (instance, true);
8924 LOOP_VINFO_SLP_INSTANCES (loop_vinfo).release ();
8925 /* Clear the safelen field since its value is invalid after vectorization,
8926 as the vectorized loop can have loop-carried dependencies. */
8927 loop->safelen = 0;
8929 if (epilogue)
8931 update_epilogue_loop_vinfo (epilogue, advance);
8933 epilogue->simduid = loop->simduid;
8934 epilogue->force_vectorize = loop->force_vectorize;
8935 epilogue->dont_vectorize = false;
8938 return epilogue;
8941 /* The code below tries to perform a simple optimization - revert
8942 if-conversion for masked stores, i.e. if the mask of a store is zero,
8943 do not perform the store and, if possible, skip all stored-value producers too.
8944 For example,
8945 for (i=0; i<n; i++)
8946 if (c[i])
8948 p1[i] += 1;
8949 p2[i] = p3[i] + 2;
8951 this transformation will produce the following semi-hammock:
8953 if (!mask__ifc__42.18_165 == { 0, 0, 0, 0, 0, 0, 0, 0 })
8955 vect__11.19_170 = MASK_LOAD (vectp_p1.20_168, 0B, mask__ifc__42.18_165);
8956 vect__12.22_172 = vect__11.19_170 + vect_cst__171;
8957 MASK_STORE (vectp_p1.23_175, 0B, mask__ifc__42.18_165, vect__12.22_172);
8958 vect__18.25_182 = MASK_LOAD (vectp_p3.26_180, 0B, mask__ifc__42.18_165);
8959 vect__19.28_184 = vect__18.25_182 + vect_cst__183;
8960 MASK_STORE (vectp_p2.29_187, 0B, mask__ifc__42.18_165, vect__19.28_184);
8964 void
8965 optimize_mask_stores (class loop *loop)
8967 basic_block *bbs = get_loop_body (loop);
8968 unsigned nbbs = loop->num_nodes;
8969 unsigned i;
8970 basic_block bb;
8971 class loop *bb_loop;
8972 gimple_stmt_iterator gsi;
8973 gimple *stmt;
8974 auto_vec<gimple *> worklist;
8975 auto_purge_vect_location sentinel;
8977 vect_location = find_loop_location (loop);
8978 /* Pick up all masked stores in loop if any. */
8979 for (i = 0; i < nbbs; i++)
8981 bb = bbs[i];
8982 for (gsi = gsi_start_bb (bb); !gsi_end_p (gsi);
8983 gsi_next (&gsi))
8985 stmt = gsi_stmt (gsi);
8986 if (gimple_call_internal_p (stmt, IFN_MASK_STORE))
8987 worklist.safe_push (stmt);
8991 free (bbs);
8992 if (worklist.is_empty ())
8993 return;
8995 /* Loop has masked stores. */
8996 while (!worklist.is_empty ())
8998 gimple *last, *last_store;
8999 edge e, efalse;
9000 tree mask;
9001 basic_block store_bb, join_bb;
9002 gimple_stmt_iterator gsi_to;
9003 tree vdef, new_vdef;
9004 gphi *phi;
9005 tree vectype;
9006 tree zero;
9008 last = worklist.pop ();
9009 mask = gimple_call_arg (last, 2);
9010 bb = gimple_bb (last);
9011 /* Create then_bb and the if-then structure in the CFG; then_bb belongs
9012 to the same loop as if_bb. It can be different from LOOP when a
9013 two-level loop nest is vectorized and the mask_store belongs to the
9014 inner one. */
9015 e = split_block (bb, last);
9016 bb_loop = bb->loop_father;
9017 gcc_assert (loop == bb_loop || flow_loop_nested_p (loop, bb_loop));
9018 join_bb = e->dest;
9019 store_bb = create_empty_bb (bb);
9020 add_bb_to_loop (store_bb, bb_loop);
9021 e->flags = EDGE_TRUE_VALUE;
9022 efalse = make_edge (bb, store_bb, EDGE_FALSE_VALUE);
9023 /* Put STORE_BB on the unlikely path. */
9024 efalse->probability = profile_probability::unlikely ();
9025 store_bb->count = efalse->count ();
9026 make_single_succ_edge (store_bb, join_bb, EDGE_FALLTHRU);
9027 if (dom_info_available_p (CDI_DOMINATORS))
9028 set_immediate_dominator (CDI_DOMINATORS, store_bb, bb);
9029 if (dump_enabled_p ())
9030 dump_printf_loc (MSG_NOTE, vect_location,
9031 "Create new block %d to sink mask stores.",
9032 store_bb->index);
9033 /* Create vector comparison with boolean result. */
9034 vectype = TREE_TYPE (mask);
9035 zero = build_zero_cst (vectype);
9036 stmt = gimple_build_cond (EQ_EXPR, mask, zero, NULL_TREE, NULL_TREE);
9037 gsi = gsi_last_bb (bb);
9038 gsi_insert_after (&gsi, stmt, GSI_SAME_STMT);
9039 /* Create new PHI node for vdef of the last masked store:
9040 .MEM_2 = VDEF <.MEM_1>
9041 will be converted to
9042 .MEM.3 = VDEF <.MEM_1>
9043 and new PHI node will be created in join bb
9044 .MEM_2 = PHI <.MEM_1, .MEM_3>
9046 vdef = gimple_vdef (last);
9047 new_vdef = make_ssa_name (gimple_vop (cfun), last);
9048 gimple_set_vdef (last, new_vdef);
9049 phi = create_phi_node (vdef, join_bb);
9050 add_phi_arg (phi, new_vdef, EDGE_SUCC (store_bb, 0), UNKNOWN_LOCATION);
9052 /* Put all masked stores with the same mask to STORE_BB if possible. */
9053 while (true)
9055 gimple_stmt_iterator gsi_from;
9056 gimple *stmt1 = NULL;
9058 /* Move masked store to STORE_BB. */
9059 last_store = last;
9060 gsi = gsi_for_stmt (last);
9061 gsi_from = gsi;
9062 /* Shift GSI to the previous stmt for further traversal. */
9063 gsi_prev (&gsi);
9064 gsi_to = gsi_start_bb (store_bb);
9065 gsi_move_before (&gsi_from, &gsi_to);
9066 /* Set GSI_TO to the start of the now non-empty block. */
9067 gsi_to = gsi_start_bb (store_bb);
9068 if (dump_enabled_p ())
9069 dump_printf_loc (MSG_NOTE, vect_location,
9070 "Move stmt to created bb\n%G", last);
9071 /* Move all stored value producers if possible. */
9072 while (!gsi_end_p (gsi))
9074 tree lhs;
9075 imm_use_iterator imm_iter;
9076 use_operand_p use_p;
9077 bool res;
9079 /* Skip debug statements. */
9080 if (is_gimple_debug (gsi_stmt (gsi)))
9082 gsi_prev (&gsi);
9083 continue;
9085 stmt1 = gsi_stmt (gsi);
9086 /* Do not consider statements writing to memory or having
9087 a volatile operand. */
9088 if (gimple_vdef (stmt1)
9089 || gimple_has_volatile_ops (stmt1))
9090 break;
9091 gsi_from = gsi;
9092 gsi_prev (&gsi);
9093 lhs = gimple_get_lhs (stmt1);
9094 if (!lhs)
9095 break;
9097 /* LHS of vectorized stmt must be SSA_NAME. */
9098 if (TREE_CODE (lhs) != SSA_NAME)
9099 break;
9101 if (!VECTOR_TYPE_P (TREE_TYPE (lhs)))
9103 /* Remove dead scalar statement. */
9104 if (has_zero_uses (lhs))
9106 gsi_remove (&gsi_from, true);
9107 continue;
9111 /* Check that LHS does not have uses outside of STORE_BB. */
9112 res = true;
9113 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, lhs)
9115 gimple *use_stmt;
9116 use_stmt = USE_STMT (use_p);
9117 if (is_gimple_debug (use_stmt))
9118 continue;
9119 if (gimple_bb (use_stmt) != store_bb)
9121 res = false;
9122 break;
9125 if (!res)
9126 break;
9128 if (gimple_vuse (stmt1)
9129 && gimple_vuse (stmt1) != gimple_vuse (last_store))
9130 break;
9132 /* Can move STMT1 to STORE_BB. */
9133 if (dump_enabled_p ())
9134 dump_printf_loc (MSG_NOTE, vect_location,
9135 "Move stmt to created bb\n%G", stmt1);
9136 gsi_move_before (&gsi_from, &gsi_to);
9137 /* Shift GSI_TO for further insertion. */
9138 gsi_prev (&gsi_to);
9140 /* Put other masked stores with the same mask to STORE_BB. */
9141 if (worklist.is_empty ()
9142 || gimple_call_arg (worklist.last (), 2) != mask
9143 || worklist.last () != stmt1)
9144 break;
9145 last = worklist.pop ();
9147 add_phi_arg (phi, gimple_vuse (last_store), e, UNKNOWN_LOCATION);
9151 /* Decide whether it is possible to use a zero-based induction variable
9152 when vectorizing LOOP_VINFO with a fully-masked loop. If it is,
9153 return the value that the induction variable must be able to hold
9154 in order to ensure that the loop ends with an all-false mask.
9155 Return -1 otherwise. */
9156 widest_int
9157 vect_iv_limit_for_full_masking (loop_vec_info loop_vinfo)
9159 tree niters_skip = LOOP_VINFO_MASK_SKIP_NITERS (loop_vinfo);
9160 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
9161 unsigned HOST_WIDE_INT max_vf = vect_max_vf (loop_vinfo);
9163 /* Calculate the value that the induction variable must be able
9164 to hit in order to ensure that we end the loop with an all-false mask.
9165 This involves adding the maximum number of inactive trailing scalar
9166 iterations. */
9167 widest_int iv_limit = -1;
9168 if (max_loop_iterations (loop, &iv_limit))
9170 if (niters_skip)
9172 /* Add the maximum number of skipped iterations to the
9173 maximum iteration count. */
9174 if (TREE_CODE (niters_skip) == INTEGER_CST)
9175 iv_limit += wi::to_widest (niters_skip);
9176 else
9177 iv_limit += max_vf - 1;
9179 else if (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo))
9180 /* Make a conservatively-correct assumption. */
9181 iv_limit += max_vf - 1;
9183 /* IV_LIMIT is the maximum number of latch iterations, which is also
9184 the maximum in-range IV value. Round this value down to the previous
9185 vector alignment boundary and then add an extra full iteration. */
9186 poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
9187 iv_limit = (iv_limit & -(int) known_alignment (vf)) + max_vf;
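/* Worked example (illustrative numbers): with VF = MAX_VF = 4, no
   skipped iterations and no peeling for alignment, a maximum latch
   count of 10 rounds down to the alignment boundary 8, and one extra
   full iteration is added, giving IV_LIMIT = (10 & -4) + 4 = 12.  */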
9189 return iv_limit;