gcc/tree-vect-loop.c
1 /* Loop Vectorization
2 Copyright (C) 2003-2020 Free Software Foundation, Inc.
3 Contributed by Dorit Naishlos <dorit@il.ibm.com> and
4 Ira Rosen <irar@il.ibm.com>
6 This file is part of GCC.
8 GCC is free software; you can redistribute it and/or modify it under
9 the terms of the GNU General Public License as published by the Free
10 Software Foundation; either version 3, or (at your option) any later
11 version.
13 GCC is distributed in the hope that it will be useful, but WITHOUT ANY
14 WARRANTY; without even the implied warranty of MERCHANTABILITY or
15 FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
16 for more details.
18 You should have received a copy of the GNU General Public License
19 along with GCC; see the file COPYING3. If not see
20 <http://www.gnu.org/licenses/>. */
22 #include "config.h"
23 #include "system.h"
24 #include "coretypes.h"
25 #include "backend.h"
26 #include "target.h"
27 #include "rtl.h"
28 #include "tree.h"
29 #include "gimple.h"
30 #include "cfghooks.h"
31 #include "tree-pass.h"
32 #include "ssa.h"
33 #include "optabs-tree.h"
34 #include "diagnostic-core.h"
35 #include "fold-const.h"
36 #include "stor-layout.h"
37 #include "cfganal.h"
38 #include "gimplify.h"
39 #include "gimple-iterator.h"
40 #include "gimplify-me.h"
41 #include "tree-ssa-loop-ivopts.h"
42 #include "tree-ssa-loop-manip.h"
43 #include "tree-ssa-loop-niter.h"
44 #include "tree-ssa-loop.h"
45 #include "cfgloop.h"
46 #include "tree-scalar-evolution.h"
47 #include "tree-vectorizer.h"
48 #include "gimple-fold.h"
49 #include "cgraph.h"
50 #include "tree-cfg.h"
51 #include "tree-if-conv.h"
52 #include "internal-fn.h"
53 #include "tree-vector-builder.h"
54 #include "vec-perm-indices.h"
55 #include "tree-eh.h"
57 /* Loop Vectorization Pass.
59 This pass tries to vectorize loops.
61 For example, the vectorizer transforms the following simple loop:
63 short a[N]; short b[N]; short c[N]; int i;
65 for (i=0; i<N; i++){
66 a[i] = b[i] + c[i];
 69   as if it were manually vectorized by rewriting the source code into:
71 typedef int __attribute__((mode(V8HI))) v8hi;
72 short a[N]; short b[N]; short c[N]; int i;
73 v8hi *pa = (v8hi*)a, *pb = (v8hi*)b, *pc = (v8hi*)c;
74 v8hi va, vb, vc;
76 for (i=0; i<N/8; i++){
77 vb = pb[i];
78 vc = pc[i];
79 va = vb + vc;
80 pa[i] = va;
83 The main entry to this pass is vectorize_loops(), in which
84 the vectorizer applies a set of analyses on a given set of loops,
85 followed by the actual vectorization transformation for the loops that
86 had successfully passed the analysis phase.
87 Throughout this pass we make a distinction between two types of
88 data: scalars (which are represented by SSA_NAMES), and memory references
89 ("data-refs"). These two types of data require different handling both
90 during analysis and transformation. The types of data-refs that the
 91   vectorizer currently supports are ARRAY_REFS whose base is an array DECL
92 (not a pointer), and INDIRECT_REFS through pointers; both array and pointer
93 accesses are required to have a simple (consecutive) access pattern.
95 Analysis phase:
96 ===============
97 The driver for the analysis phase is vect_analyze_loop().
98 It applies a set of analyses, some of which rely on the scalar evolution
99 analyzer (scev) developed by Sebastian Pop.
101 During the analysis phase the vectorizer records some information
102 per stmt in a "stmt_vec_info" struct which is attached to each stmt in the
103 loop, as well as general information about the loop as a whole, which is
104 recorded in a "loop_vec_info" struct attached to each loop.
106 Transformation phase:
107 =====================
108 The loop transformation phase scans all the stmts in the loop, and
109 creates a vector stmt (or a sequence of stmts) for each scalar stmt S in
110 the loop that needs to be vectorized. It inserts the vector code sequence
111 just before the scalar stmt S, and records a pointer to the vector code
112 in STMT_VINFO_VEC_STMT (stmt_info) (stmt_info is the stmt_vec_info struct
113 attached to S). This pointer will be used for the vectorization of following
114 stmts which use the def of stmt S. Stmt S is removed if it writes to memory;
115 otherwise, we rely on dead code elimination for removing it.
117 For example, say stmt S1 was vectorized into stmt VS1:
119 VS1: vb = px[i];
120 S1: b = x[i]; STMT_VINFO_VEC_STMT (stmt_info (S1)) = VS1
121 S2: a = b;
123 To vectorize stmt S2, the vectorizer first finds the stmt that defines
124 the operand 'b' (S1), and gets the relevant vector def 'vb' from the
125 vector stmt VS1 pointed to by STMT_VINFO_VEC_STMT (stmt_info (S1)). The
126 resulting sequence would be:
128 VS1: vb = px[i];
129 S1: b = x[i]; STMT_VINFO_VEC_STMT (stmt_info (S1)) = VS1
130 VS2: va = vb;
131 S2: a = b; STMT_VINFO_VEC_STMT (stmt_info (S2)) = VS2
133 Operands that are not SSA_NAMEs, are data-refs that appear in
134 load/store operations (like 'x[i]' in S1), and are handled differently.
136 Target modeling:
137 =================
138 Currently the only target specific information that is used is the
139 size of the vector (in bytes) - "TARGET_VECTORIZE_UNITS_PER_SIMD_WORD".
140 Targets that can support different sizes of vectors, for now will need
141 to specify one value for "TARGET_VECTORIZE_UNITS_PER_SIMD_WORD". More
142 flexibility will be added in the future.
 144   Since we only vectorize operations whose vector form can be
145 expressed using existing tree codes, to verify that an operation is
146 supported, the vectorizer checks the relevant optab at the relevant
 147   machine_mode (e.g., optab_handler (add_optab, V8HImode)). If
148 the value found is CODE_FOR_nothing, then there's no target support, and
149 we can't vectorize the stmt.
151 For additional information on this project see:
152 http://gcc.gnu.org/projects/tree-ssa/vectorization.html
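   As a rough sketch of the target-support check described above (an
   illustration only, not the exact code in this file), deciding whether the
   target can add two V8HI vectors boils down to:

     if (optab_handler (add_optab, V8HImode) == CODE_FOR_nothing)
       ...  the target has no V8HI addition, so the stmt is not vectorized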
155 static void vect_estimate_min_profitable_iters (loop_vec_info, int *, int *);
156 static stmt_vec_info vect_is_simple_reduction (loop_vec_info, stmt_vec_info,
157 bool *, bool *);
159 /* Subroutine of vect_determine_vf_for_stmt that handles only one
160 statement. VECTYPE_MAYBE_SET_P is true if STMT_VINFO_VECTYPE
161 may already be set for general statements (not just data refs). */
163 static opt_result
164 vect_determine_vf_for_stmt_1 (vec_info *vinfo, stmt_vec_info stmt_info,
165 bool vectype_maybe_set_p,
166 poly_uint64 *vf)
168 gimple *stmt = stmt_info->stmt;
170 if ((!STMT_VINFO_RELEVANT_P (stmt_info)
171 && !STMT_VINFO_LIVE_P (stmt_info))
172 || gimple_clobber_p (stmt))
174 if (dump_enabled_p ())
175 dump_printf_loc (MSG_NOTE, vect_location, "skip.\n");
176 return opt_result::success ();
179 tree stmt_vectype, nunits_vectype;
180 opt_result res = vect_get_vector_types_for_stmt (vinfo, stmt_info,
181 &stmt_vectype,
182 &nunits_vectype);
183 if (!res)
184 return res;
186 if (stmt_vectype)
188 if (STMT_VINFO_VECTYPE (stmt_info))
189 /* The only case when a vectype had been already set is for stmts
190 that contain a data ref, or for "pattern-stmts" (stmts generated
191 by the vectorizer to represent/replace a certain idiom). */
192 gcc_assert ((STMT_VINFO_DATA_REF (stmt_info)
193 || vectype_maybe_set_p)
194 && STMT_VINFO_VECTYPE (stmt_info) == stmt_vectype);
195 else
196 STMT_VINFO_VECTYPE (stmt_info) = stmt_vectype;
199 if (nunits_vectype)
200 vect_update_max_nunits (vf, nunits_vectype);
202 return opt_result::success ();
205 /* Subroutine of vect_determine_vectorization_factor. Set the vector
206 types of STMT_INFO and all attached pattern statements and update
207 the vectorization factor VF accordingly. Return true on success
208 or false if something prevented vectorization. */
210 static opt_result
211 vect_determine_vf_for_stmt (vec_info *vinfo,
212 stmt_vec_info stmt_info, poly_uint64 *vf)
214 if (dump_enabled_p ())
215 dump_printf_loc (MSG_NOTE, vect_location, "==> examining statement: %G",
216 stmt_info->stmt);
217 opt_result res = vect_determine_vf_for_stmt_1 (vinfo, stmt_info, false, vf);
218 if (!res)
219 return res;
221 if (STMT_VINFO_IN_PATTERN_P (stmt_info)
222 && STMT_VINFO_RELATED_STMT (stmt_info))
224 gimple *pattern_def_seq = STMT_VINFO_PATTERN_DEF_SEQ (stmt_info);
225 stmt_info = STMT_VINFO_RELATED_STMT (stmt_info);
227 /* If a pattern statement has def stmts, analyze them too. */
228 for (gimple_stmt_iterator si = gsi_start (pattern_def_seq);
229 !gsi_end_p (si); gsi_next (&si))
231 stmt_vec_info def_stmt_info = vinfo->lookup_stmt (gsi_stmt (si));
232 if (dump_enabled_p ())
233 dump_printf_loc (MSG_NOTE, vect_location,
234 "==> examining pattern def stmt: %G",
235 def_stmt_info->stmt);
236 res = vect_determine_vf_for_stmt_1 (vinfo, def_stmt_info, true, vf);
237 if (!res)
238 return res;
241 if (dump_enabled_p ())
242 dump_printf_loc (MSG_NOTE, vect_location,
243 "==> examining pattern statement: %G",
244 stmt_info->stmt);
245 res = vect_determine_vf_for_stmt_1 (vinfo, stmt_info, true, vf);
246 if (!res)
247 return res;
250 return opt_result::success ();
253 /* Function vect_determine_vectorization_factor
255 Determine the vectorization factor (VF). VF is the number of data elements
256 that are operated upon in parallel in a single iteration of the vectorized
 257   loop. For example, when vectorizing a loop that operates on 4-byte elements,
 258   on a target with a vector size (VS) of 16 bytes, the VF is set to 4, since 4
259 elements can fit in a single vector register.
261 We currently support vectorization of loops in which all types operated upon
262 are of the same size. Therefore this function currently sets VF according to
263 the size of the types operated upon, and fails if there are multiple sizes
264 in the loop.
266 VF is also the factor by which the loop iterations are strip-mined, e.g.:
267 original loop:
268 for (i=0; i<N; i++){
269 a[i] = b[i] + c[i];
272 vectorized loop:
273 for (i=0; i<N; i+=VF){
274 a[i:VF] = b[i:VF] + c[i:VF];
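   As a concrete illustration (not taken from the code): with 4-byte ints and
   a 16-byte vector register, VF is 4, so the strip-mined loop above processes
   a[i:4] per iteration and any remaining N % 4 iterations are left to a
   scalar epilogue loop.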
278 static opt_result
279 vect_determine_vectorization_factor (loop_vec_info loop_vinfo)
281 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
282 basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
283 unsigned nbbs = loop->num_nodes;
284 poly_uint64 vectorization_factor = 1;
285 tree scalar_type = NULL_TREE;
286 gphi *phi;
287 tree vectype;
288 stmt_vec_info stmt_info;
289 unsigned i;
291 DUMP_VECT_SCOPE ("vect_determine_vectorization_factor");
293 for (i = 0; i < nbbs; i++)
295 basic_block bb = bbs[i];
297 for (gphi_iterator si = gsi_start_phis (bb); !gsi_end_p (si);
298 gsi_next (&si))
300 phi = si.phi ();
301 stmt_info = loop_vinfo->lookup_stmt (phi);
302 if (dump_enabled_p ())
303 dump_printf_loc (MSG_NOTE, vect_location, "==> examining phi: %G",
304 phi);
306 gcc_assert (stmt_info);
308 if (STMT_VINFO_RELEVANT_P (stmt_info)
309 || STMT_VINFO_LIVE_P (stmt_info))
311 gcc_assert (!STMT_VINFO_VECTYPE (stmt_info));
312 scalar_type = TREE_TYPE (PHI_RESULT (phi));
314 if (dump_enabled_p ())
315 dump_printf_loc (MSG_NOTE, vect_location,
316 "get vectype for scalar type: %T\n",
317 scalar_type);
319 vectype = get_vectype_for_scalar_type (loop_vinfo, scalar_type);
320 if (!vectype)
321 return opt_result::failure_at (phi,
322 "not vectorized: unsupported "
323 "data-type %T\n",
324 scalar_type);
325 STMT_VINFO_VECTYPE (stmt_info) = vectype;
327 if (dump_enabled_p ())
328 dump_printf_loc (MSG_NOTE, vect_location, "vectype: %T\n",
329 vectype);
331 if (dump_enabled_p ())
333 dump_printf_loc (MSG_NOTE, vect_location, "nunits = ");
334 dump_dec (MSG_NOTE, TYPE_VECTOR_SUBPARTS (vectype));
335 dump_printf (MSG_NOTE, "\n");
338 vect_update_max_nunits (&vectorization_factor, vectype);
342 for (gimple_stmt_iterator si = gsi_start_bb (bb); !gsi_end_p (si);
343 gsi_next (&si))
345 if (is_gimple_debug (gsi_stmt (si)))
346 continue;
347 stmt_info = loop_vinfo->lookup_stmt (gsi_stmt (si));
348 opt_result res
349 = vect_determine_vf_for_stmt (loop_vinfo,
350 stmt_info, &vectorization_factor);
351 if (!res)
352 return res;
 356   /* TODO: Analyze cost. Decide if it is worthwhile to vectorize. */
357 if (dump_enabled_p ())
359 dump_printf_loc (MSG_NOTE, vect_location, "vectorization factor = ");
360 dump_dec (MSG_NOTE, vectorization_factor);
361 dump_printf (MSG_NOTE, "\n");
364 if (known_le (vectorization_factor, 1U))
365 return opt_result::failure_at (vect_location,
366 "not vectorized: unsupported data-type\n");
367 LOOP_VINFO_VECT_FACTOR (loop_vinfo) = vectorization_factor;
368 return opt_result::success ();
372 /* Function vect_is_simple_iv_evolution.
 374   FORNOW: A simple evolution of an induction variable in the loop is
375 considered a polynomial evolution. */
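/* For example (illustrative only): for an IV defined as

     i_1 = PHI <0 (preheader), i_2 (latch)>
     i_2 = i_1 + 4;

   scev computes the access function {0, +, 4}_1, so *INIT is 0 and *STEP
   is 4.  A chrec of degree >= 2 such as {0, +, {1, +, 1}_1}_1 is rejected
   by the tree_is_chrec check on the evolution part below.  */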
377 static bool
378 vect_is_simple_iv_evolution (unsigned loop_nb, tree access_fn, tree * init,
379 tree * step)
381 tree init_expr;
382 tree step_expr;
383 tree evolution_part = evolution_part_in_loop_num (access_fn, loop_nb);
384 basic_block bb;
386 /* When there is no evolution in this loop, the evolution function
387 is not "simple". */
388 if (evolution_part == NULL_TREE)
389 return false;
391 /* When the evolution is a polynomial of degree >= 2
392 the evolution function is not "simple". */
393 if (tree_is_chrec (evolution_part))
394 return false;
396 step_expr = evolution_part;
397 init_expr = unshare_expr (initial_condition_in_loop_num (access_fn, loop_nb));
399 if (dump_enabled_p ())
400 dump_printf_loc (MSG_NOTE, vect_location, "step: %T, init: %T\n",
401 step_expr, init_expr);
403 *init = init_expr;
404 *step = step_expr;
406 if (TREE_CODE (step_expr) != INTEGER_CST
407 && (TREE_CODE (step_expr) != SSA_NAME
408 || ((bb = gimple_bb (SSA_NAME_DEF_STMT (step_expr)))
409 && flow_bb_inside_loop_p (get_loop (cfun, loop_nb), bb))
410 || (!INTEGRAL_TYPE_P (TREE_TYPE (step_expr))
411 && (!SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr))
412 || !flag_associative_math)))
413 && (TREE_CODE (step_expr) != REAL_CST
414 || !flag_associative_math))
416 if (dump_enabled_p ())
417 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
418 "step unknown.\n");
419 return false;
422 return true;
425 /* Return true if PHI, described by STMT_INFO, is the inner PHI in
426 what we are assuming is a double reduction. For example, given
427 a structure like this:
429 outer1:
430 x_1 = PHI <x_4(outer2), ...>;
433 inner:
434 x_2 = PHI <x_1(outer1), ...>;
436 x_3 = ...;
439 outer2:
440 x_4 = PHI <x_3(inner)>;
443 outer loop analysis would treat x_1 as a double reduction phi and
444 this function would then return true for x_2. */
446 static bool
447 vect_inner_phi_in_double_reduction_p (loop_vec_info loop_vinfo, gphi *phi)
449 use_operand_p use_p;
450 ssa_op_iter op_iter;
451 FOR_EACH_PHI_ARG (use_p, phi, op_iter, SSA_OP_USE)
452 if (stmt_vec_info def_info = loop_vinfo->lookup_def (USE_FROM_PTR (use_p)))
453 if (STMT_VINFO_DEF_TYPE (def_info) == vect_double_reduction_def)
454 return true;
455 return false;
458 /* Function vect_analyze_scalar_cycles_1.
460 Examine the cross iteration def-use cycles of scalar variables
461 in LOOP. LOOP_VINFO represents the loop that is now being
462 considered for vectorization (can be LOOP, or an outer-loop
463 enclosing LOOP). */
465 static void
466 vect_analyze_scalar_cycles_1 (loop_vec_info loop_vinfo, class loop *loop)
468 basic_block bb = loop->header;
469 tree init, step;
470 auto_vec<stmt_vec_info, 64> worklist;
471 gphi_iterator gsi;
472 bool double_reduc, reduc_chain;
474 DUMP_VECT_SCOPE ("vect_analyze_scalar_cycles");
476 /* First - identify all inductions. Reduction detection assumes that all the
477 inductions have been identified, therefore, this order must not be
478 changed. */
479 for (gsi = gsi_start_phis (bb); !gsi_end_p (gsi); gsi_next (&gsi))
481 gphi *phi = gsi.phi ();
482 tree access_fn = NULL;
483 tree def = PHI_RESULT (phi);
484 stmt_vec_info stmt_vinfo = loop_vinfo->lookup_stmt (phi);
486 if (dump_enabled_p ())
487 dump_printf_loc (MSG_NOTE, vect_location, "Analyze phi: %G", phi);
489 /* Skip virtual phi's. The data dependences that are associated with
490 virtual defs/uses (i.e., memory accesses) are analyzed elsewhere. */
491 if (virtual_operand_p (def))
492 continue;
494 STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_unknown_def_type;
496 /* Analyze the evolution function. */
497 access_fn = analyze_scalar_evolution (loop, def);
498 if (access_fn)
500 STRIP_NOPS (access_fn);
501 if (dump_enabled_p ())
502 dump_printf_loc (MSG_NOTE, vect_location,
503 "Access function of PHI: %T\n", access_fn);
504 STMT_VINFO_LOOP_PHI_EVOLUTION_BASE_UNCHANGED (stmt_vinfo)
505 = initial_condition_in_loop_num (access_fn, loop->num);
506 STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_vinfo)
507 = evolution_part_in_loop_num (access_fn, loop->num);
510 if (!access_fn
511 || vect_inner_phi_in_double_reduction_p (loop_vinfo, phi)
512 || !vect_is_simple_iv_evolution (loop->num, access_fn, &init, &step)
513 || (LOOP_VINFO_LOOP (loop_vinfo) != loop
514 && TREE_CODE (step) != INTEGER_CST))
516 worklist.safe_push (stmt_vinfo);
517 continue;
520 gcc_assert (STMT_VINFO_LOOP_PHI_EVOLUTION_BASE_UNCHANGED (stmt_vinfo)
521 != NULL_TREE);
522 gcc_assert (STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_vinfo) != NULL_TREE);
524 if (dump_enabled_p ())
525 dump_printf_loc (MSG_NOTE, vect_location, "Detected induction.\n");
526 STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_induction_def;
530 /* Second - identify all reductions and nested cycles. */
531 while (worklist.length () > 0)
533 stmt_vec_info stmt_vinfo = worklist.pop ();
534 gphi *phi = as_a <gphi *> (stmt_vinfo->stmt);
535 tree def = PHI_RESULT (phi);
537 if (dump_enabled_p ())
538 dump_printf_loc (MSG_NOTE, vect_location, "Analyze phi: %G", phi);
540 gcc_assert (!virtual_operand_p (def)
541 && STMT_VINFO_DEF_TYPE (stmt_vinfo) == vect_unknown_def_type);
543 stmt_vec_info reduc_stmt_info
544 = vect_is_simple_reduction (loop_vinfo, stmt_vinfo, &double_reduc,
545 &reduc_chain);
546 if (reduc_stmt_info)
548 STMT_VINFO_REDUC_DEF (stmt_vinfo) = reduc_stmt_info;
549 STMT_VINFO_REDUC_DEF (reduc_stmt_info) = stmt_vinfo;
550 if (double_reduc)
552 if (dump_enabled_p ())
553 dump_printf_loc (MSG_NOTE, vect_location,
554 "Detected double reduction.\n");
556 STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_double_reduction_def;
557 STMT_VINFO_DEF_TYPE (reduc_stmt_info) = vect_double_reduction_def;
559 else
561 if (loop != LOOP_VINFO_LOOP (loop_vinfo))
563 if (dump_enabled_p ())
564 dump_printf_loc (MSG_NOTE, vect_location,
565 "Detected vectorizable nested cycle.\n");
567 STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_nested_cycle;
569 else
571 if (dump_enabled_p ())
572 dump_printf_loc (MSG_NOTE, vect_location,
573 "Detected reduction.\n");
575 STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_reduction_def;
576 STMT_VINFO_DEF_TYPE (reduc_stmt_info) = vect_reduction_def;
577 /* Store the reduction cycles for possible vectorization in
 578   loop-aware SLP if it was not detected as a reduction
 579   chain. */
580 if (! reduc_chain)
581 LOOP_VINFO_REDUCTIONS (loop_vinfo).safe_push
582 (reduc_stmt_info);
586 else
587 if (dump_enabled_p ())
588 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
589 "Unknown def-use cycle pattern.\n");
594 /* Function vect_analyze_scalar_cycles.
596 Examine the cross iteration def-use cycles of scalar variables, by
597 analyzing the loop-header PHIs of scalar variables. Classify each
598 cycle as one of the following: invariant, induction, reduction, unknown.
 599   We do that for the loop represented by LOOP_VINFO, and also for its
 600   inner loop, if it exists.
601 Examples for scalar cycles:
603 Example1: reduction:
605 loop1:
606 for (i=0; i<N; i++)
607 sum += a[i];
609 Example2: induction:
611 loop2:
612 for (i=0; i<N; i++)
613 a[i] = i; */
615 static void
616 vect_analyze_scalar_cycles (loop_vec_info loop_vinfo)
618 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
620 vect_analyze_scalar_cycles_1 (loop_vinfo, loop);
622 /* When vectorizing an outer-loop, the inner-loop is executed sequentially.
 623   Reductions in such an inner loop therefore have different properties than
624 the reductions in the nest that gets vectorized:
625 1. When vectorized, they are executed in the same order as in the original
626 scalar loop, so we can't change the order of computation when
627 vectorizing them.
628 2. FIXME: Inner-loop reductions can be used in the inner-loop, so the
629 current checks are too strict. */
631 if (loop->inner)
632 vect_analyze_scalar_cycles_1 (loop_vinfo, loop->inner);
635 /* Transfer group and reduction information from STMT_INFO to its
636 pattern stmt. */
638 static void
639 vect_fixup_reduc_chain (stmt_vec_info stmt_info)
641 stmt_vec_info firstp = STMT_VINFO_RELATED_STMT (stmt_info);
642 stmt_vec_info stmtp;
643 gcc_assert (!REDUC_GROUP_FIRST_ELEMENT (firstp)
644 && REDUC_GROUP_FIRST_ELEMENT (stmt_info));
645 REDUC_GROUP_SIZE (firstp) = REDUC_GROUP_SIZE (stmt_info);
648 stmtp = STMT_VINFO_RELATED_STMT (stmt_info);
649 gcc_checking_assert (STMT_VINFO_DEF_TYPE (stmtp)
650 == STMT_VINFO_DEF_TYPE (stmt_info));
651 REDUC_GROUP_FIRST_ELEMENT (stmtp) = firstp;
652 stmt_info = REDUC_GROUP_NEXT_ELEMENT (stmt_info);
653 if (stmt_info)
654 REDUC_GROUP_NEXT_ELEMENT (stmtp)
655 = STMT_VINFO_RELATED_STMT (stmt_info);
657 while (stmt_info);
660 /* Fixup scalar cycles that now have their stmts detected as patterns. */
662 static void
663 vect_fixup_scalar_cycles_with_patterns (loop_vec_info loop_vinfo)
665 stmt_vec_info first;
666 unsigned i;
668 FOR_EACH_VEC_ELT (LOOP_VINFO_REDUCTION_CHAINS (loop_vinfo), i, first)
669 if (STMT_VINFO_IN_PATTERN_P (first))
671 stmt_vec_info next = REDUC_GROUP_NEXT_ELEMENT (first);
672 while (next)
674 if (! STMT_VINFO_IN_PATTERN_P (next)
675 || STMT_VINFO_REDUC_IDX (STMT_VINFO_RELATED_STMT (next)) == -1)
676 break;
677 next = REDUC_GROUP_NEXT_ELEMENT (next);
 679   /* If not all stmts in the chain are patterns or if we failed
680 to update STMT_VINFO_REDUC_IDX try to handle the chain
681 without patterns. */
682 if (! next
683 && STMT_VINFO_REDUC_IDX (STMT_VINFO_RELATED_STMT (first)) != -1)
685 vect_fixup_reduc_chain (first);
686 LOOP_VINFO_REDUCTION_CHAINS (loop_vinfo)[i]
687 = STMT_VINFO_RELATED_STMT (first);
692 /* Function vect_get_loop_niters.
694 Determine how many iterations the loop is executed and place it
695 in NUMBER_OF_ITERATIONS. Place the number of latch iterations
696 in NUMBER_OF_ITERATIONSM1. Place the condition under which the
697 niter information holds in ASSUMPTIONS.
699 Return the loop exit condition. */
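/* For instance (an illustration, not from the code): for a loop
   "for (i = 0; i < n; i++)" with n > 0, the latch executes n - 1 times,
   so NUMBER_OF_ITERATIONSM1 is n - 1 and NUMBER_OF_ITERATIONS, the number
   of header executions, is n.  */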
702 static gcond *
703 vect_get_loop_niters (class loop *loop, tree *assumptions,
704 tree *number_of_iterations, tree *number_of_iterationsm1)
706 edge exit = single_exit (loop);
707 class tree_niter_desc niter_desc;
708 tree niter_assumptions, niter, may_be_zero;
709 gcond *cond = get_loop_exit_condition (loop);
711 *assumptions = boolean_true_node;
712 *number_of_iterationsm1 = chrec_dont_know;
713 *number_of_iterations = chrec_dont_know;
714 DUMP_VECT_SCOPE ("get_loop_niters");
716 if (!exit)
717 return cond;
719 may_be_zero = NULL_TREE;
720 if (!number_of_iterations_exit_assumptions (loop, exit, &niter_desc, NULL)
721 || chrec_contains_undetermined (niter_desc.niter))
722 return cond;
724 niter_assumptions = niter_desc.assumptions;
725 may_be_zero = niter_desc.may_be_zero;
726 niter = niter_desc.niter;
728 if (may_be_zero && integer_zerop (may_be_zero))
729 may_be_zero = NULL_TREE;
731 if (may_be_zero)
733 if (COMPARISON_CLASS_P (may_be_zero))
735 /* Try to combine may_be_zero with assumptions, this can simplify
736 computation of niter expression. */
737 if (niter_assumptions && !integer_nonzerop (niter_assumptions))
738 niter_assumptions = fold_build2 (TRUTH_AND_EXPR, boolean_type_node,
739 niter_assumptions,
740 fold_build1 (TRUTH_NOT_EXPR,
741 boolean_type_node,
742 may_be_zero));
743 else
744 niter = fold_build3 (COND_EXPR, TREE_TYPE (niter), may_be_zero,
745 build_int_cst (TREE_TYPE (niter), 0),
746 rewrite_to_non_trapping_overflow (niter));
748 may_be_zero = NULL_TREE;
750 else if (integer_nonzerop (may_be_zero))
752 *number_of_iterationsm1 = build_int_cst (TREE_TYPE (niter), 0);
753 *number_of_iterations = build_int_cst (TREE_TYPE (niter), 1);
754 return cond;
756 else
757 return cond;
760 *assumptions = niter_assumptions;
761 *number_of_iterationsm1 = niter;
763 /* We want the number of loop header executions which is the number
764 of latch executions plus one.
765 ??? For UINT_MAX latch executions this number overflows to zero
766 for loops like do { n++; } while (n != 0); */
767 if (niter && !chrec_contains_undetermined (niter))
768 niter = fold_build2 (PLUS_EXPR, TREE_TYPE (niter), unshare_expr (niter),
769 build_int_cst (TREE_TYPE (niter), 1));
770 *number_of_iterations = niter;
772 return cond;
775 /* Function bb_in_loop_p
777 Used as predicate for dfs order traversal of the loop bbs. */
779 static bool
780 bb_in_loop_p (const_basic_block bb, const void *data)
782 const class loop *const loop = (const class loop *)data;
783 if (flow_bb_inside_loop_p (loop, bb))
784 return true;
785 return false;
789 /* Create and initialize a new loop_vec_info struct for LOOP_IN, as well as
790 stmt_vec_info structs for all the stmts in LOOP_IN. */
792 _loop_vec_info::_loop_vec_info (class loop *loop_in, vec_info_shared *shared)
793 : vec_info (vec_info::loop, init_cost (loop_in), shared),
794 loop (loop_in),
795 bbs (XCNEWVEC (basic_block, loop->num_nodes)),
796 num_itersm1 (NULL_TREE),
797 num_iters (NULL_TREE),
798 num_iters_unchanged (NULL_TREE),
799 num_iters_assumptions (NULL_TREE),
800 th (0),
801 versioning_threshold (0),
802 vectorization_factor (0),
803 max_vectorization_factor (0),
804 mask_skip_niters (NULL_TREE),
805 rgroup_compare_type (NULL_TREE),
806 simd_if_cond (NULL_TREE),
807 unaligned_dr (NULL),
808 peeling_for_alignment (0),
809 ptr_mask (0),
810 ivexpr_map (NULL),
811 scan_map (NULL),
812 slp_unrolling_factor (1),
813 single_scalar_iteration_cost (0),
814 vec_outside_cost (0),
815 vec_inside_cost (0),
816 vectorizable (false),
817 can_use_partial_vectors_p (param_vect_partial_vector_usage != 0),
818 using_partial_vectors_p (false),
819 epil_using_partial_vectors_p (false),
820 peeling_for_gaps (false),
821 peeling_for_niter (false),
822 no_data_dependencies (false),
823 has_mask_store (false),
824 scalar_loop_scaling (profile_probability::uninitialized ()),
825 scalar_loop (NULL),
826 orig_loop_info (NULL)
828 /* CHECKME: We want to visit all BBs before their successors (except for
829 latch blocks, for which this assertion wouldn't hold). In the simple
 830   case of the loop forms we allow, a dfs order of the BBs would be the same
831 as reversed postorder traversal, so we are safe. */
833 unsigned int nbbs = dfs_enumerate_from (loop->header, 0, bb_in_loop_p,
834 bbs, loop->num_nodes, loop);
835 gcc_assert (nbbs == loop->num_nodes);
837 for (unsigned int i = 0; i < nbbs; i++)
839 basic_block bb = bbs[i];
840 gimple_stmt_iterator si;
842 for (si = gsi_start_phis (bb); !gsi_end_p (si); gsi_next (&si))
844 gimple *phi = gsi_stmt (si);
845 gimple_set_uid (phi, 0);
846 add_stmt (phi);
849 for (si = gsi_start_bb (bb); !gsi_end_p (si); gsi_next (&si))
851 gimple *stmt = gsi_stmt (si);
852 gimple_set_uid (stmt, 0);
853 if (is_gimple_debug (stmt))
854 continue;
855 add_stmt (stmt);
 856   /* If the .GOMP_SIMD_LANE call for the current loop has 3 arguments, the
 857   third argument is the #pragma omp simd if (x) condition: when it is 0,
 858   the loop shouldn't be vectorized; when it is a non-zero constant, it
 859   should be vectorized normally; otherwise the loop is versioned, with the
 860   vectorized copy taken if the condition is non-zero at runtime. */
861 if (loop_in->simduid
862 && is_gimple_call (stmt)
863 && gimple_call_internal_p (stmt)
864 && gimple_call_internal_fn (stmt) == IFN_GOMP_SIMD_LANE
865 && gimple_call_num_args (stmt) >= 3
866 && TREE_CODE (gimple_call_arg (stmt, 0)) == SSA_NAME
867 && (loop_in->simduid
868 == SSA_NAME_VAR (gimple_call_arg (stmt, 0))))
870 tree arg = gimple_call_arg (stmt, 2);
871 if (integer_zerop (arg) || TREE_CODE (arg) == SSA_NAME)
872 simd_if_cond = arg;
873 else
874 gcc_assert (integer_nonzerop (arg));
879 epilogue_vinfos.create (6);
882 /* Free all levels of rgroup CONTROLS. */
884 void
885 release_vec_loop_controls (vec<rgroup_controls> *controls)
887 rgroup_controls *rgc;
888 unsigned int i;
889 FOR_EACH_VEC_ELT (*controls, i, rgc)
890 rgc->controls.release ();
891 controls->release ();
894 /* Free all memory used by the _loop_vec_info, as well as all the
895 stmt_vec_info structs of all the stmts in the loop. */
897 _loop_vec_info::~_loop_vec_info ()
899 free (bbs);
901 release_vec_loop_controls (&masks);
902 release_vec_loop_controls (&lens);
903 delete ivexpr_map;
904 delete scan_map;
905 epilogue_vinfos.release ();
907 loop->aux = NULL;
910 /* Return an invariant or register for EXPR and emit necessary
911 computations in the LOOP_VINFO loop preheader. */
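/* Example of the intended use (illustrative; the expression and SSA name are
   hypothetical): asked twice for the same non-invariant expression, say
   n_5 + 7, the first call gimplifies it to a fresh SSA name on the preheader
   edge and the second call returns the cached name, so the addition is
   emitted only once.  */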
913 tree
914 cse_and_gimplify_to_preheader (loop_vec_info loop_vinfo, tree expr)
916 if (is_gimple_reg (expr)
917 || is_gimple_min_invariant (expr))
918 return expr;
920 if (! loop_vinfo->ivexpr_map)
921 loop_vinfo->ivexpr_map = new hash_map<tree_operand_hash, tree>;
922 tree &cached = loop_vinfo->ivexpr_map->get_or_insert (expr);
923 if (! cached)
925 gimple_seq stmts = NULL;
926 cached = force_gimple_operand (unshare_expr (expr),
927 &stmts, true, NULL_TREE);
928 if (stmts)
930 edge e = loop_preheader_edge (LOOP_VINFO_LOOP (loop_vinfo));
931 gsi_insert_seq_on_edge_immediate (e, stmts);
934 return cached;
937 /* Return true if we can use CMP_TYPE as the comparison type to produce
938 all masks required to mask LOOP_VINFO. */
940 static bool
941 can_produce_all_loop_masks_p (loop_vec_info loop_vinfo, tree cmp_type)
943 rgroup_controls *rgm;
944 unsigned int i;
945 FOR_EACH_VEC_ELT (LOOP_VINFO_MASKS (loop_vinfo), i, rgm)
946 if (rgm->type != NULL_TREE
947 && !direct_internal_fn_supported_p (IFN_WHILE_ULT,
948 cmp_type, rgm->type,
949 OPTIMIZE_FOR_SPEED))
950 return false;
951 return true;
954 /* Calculate the maximum number of scalars per iteration for every
955 rgroup in LOOP_VINFO. */
957 static unsigned int
958 vect_get_max_nscalars_per_iter (loop_vec_info loop_vinfo)
960 unsigned int res = 1;
961 unsigned int i;
962 rgroup_controls *rgm;
963 FOR_EACH_VEC_ELT (LOOP_VINFO_MASKS (loop_vinfo), i, rgm)
964 res = MAX (res, rgm->max_nscalars_per_iter);
965 return res;
968 /* Calculate the minimum precision necessary to represent:
970 MAX_NITERS * FACTOR
972 as an unsigned integer, where MAX_NITERS is the maximum number of
973 loop header iterations for the original scalar form of LOOP_VINFO. */
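/* A worked example (illustrative): with MAX_NITERS = 1000 and FACTOR = 2 the
   product is 2000, and since 1024 = 2^10 <= 2000 < 2^11 = 2048 the function
   returns a minimum precision of 11 bits.  */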
975 static unsigned
976 vect_min_prec_for_max_niters (loop_vec_info loop_vinfo, unsigned int factor)
978 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
980 /* Get the maximum number of iterations that is representable
981 in the counter type. */
982 tree ni_type = TREE_TYPE (LOOP_VINFO_NITERSM1 (loop_vinfo));
983 widest_int max_ni = wi::to_widest (TYPE_MAX_VALUE (ni_type)) + 1;
985 /* Get a more refined estimate for the number of iterations. */
986 widest_int max_back_edges;
987 if (max_loop_iterations (loop, &max_back_edges))
988 max_ni = wi::smin (max_ni, max_back_edges + 1);
990 /* Work out how many bits we need to represent the limit. */
991 return wi::min_precision (max_ni * factor, UNSIGNED);
994 /* True if the loop needs peeling or partial vectors when vectorized. */
996 static bool
997 vect_need_peeling_or_partial_vectors_p (loop_vec_info loop_vinfo)
999 unsigned HOST_WIDE_INT const_vf;
1000 HOST_WIDE_INT max_niter
1001 = likely_max_stmt_executions_int (LOOP_VINFO_LOOP (loop_vinfo));
1003 unsigned th = LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo);
1004 if (!th && LOOP_VINFO_ORIG_LOOP_INFO (loop_vinfo))
1005 th = LOOP_VINFO_COST_MODEL_THRESHOLD (LOOP_VINFO_ORIG_LOOP_INFO
1006 (loop_vinfo));
1008 if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
1009 && LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) >= 0)
1011 /* Work out the (constant) number of iterations that need to be
1012 peeled for reasons other than niters. */
1013 unsigned int peel_niter = LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo);
1014 if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo))
1015 peel_niter += 1;
1016 if (!multiple_p (LOOP_VINFO_INT_NITERS (loop_vinfo) - peel_niter,
1017 LOOP_VINFO_VECT_FACTOR (loop_vinfo)))
1018 return true;
1020 else if (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo)
1021 /* ??? When peeling for gaps but not alignment, we could
1022 try to check whether the (variable) niters is known to be
1023 VF * N + 1. That's something of a niche case though. */
1024 || LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo)
1025 || !LOOP_VINFO_VECT_FACTOR (loop_vinfo).is_constant (&const_vf)
1026 || ((tree_ctz (LOOP_VINFO_NITERS (loop_vinfo))
1027 < (unsigned) exact_log2 (const_vf))
1028 /* In case of versioning, check if the maximum number of
1029 iterations is greater than th. If they are identical,
1030 the epilogue is unnecessary. */
1031 && (!LOOP_REQUIRES_VERSIONING (loop_vinfo)
1032 || ((unsigned HOST_WIDE_INT) max_niter
1033 > (th / const_vf) * const_vf))))
1034 return true;
1036 return false;
1039 /* Each statement in LOOP_VINFO can be masked where necessary. Check
1040 whether we can actually generate the masks required. Return true if so,
1041 storing the type of the scalar IV in LOOP_VINFO_RGROUP_COMPARE_TYPE. */
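/* As a rough illustration of the masks involved (assumed IFN_WHILE_ULT
   semantics, as used for fully-masked targets such as SVE): WHILE_ULT (i, n)
   yields a mask whose element J is (i + J < n), so with n = 10 and an
   8-element mask the values for i = 0 and i = 8 are all-ones and
   {1,1,0,0,0,0,0,0} respectively.  */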
1043 static bool
1044 vect_verify_full_masking (loop_vec_info loop_vinfo)
1046 unsigned int min_ni_width;
1047 unsigned int max_nscalars_per_iter
1048 = vect_get_max_nscalars_per_iter (loop_vinfo);
1050 /* Use a normal loop if there are no statements that need masking.
1051 This only happens in rare degenerate cases: it means that the loop
1052 has no loads, no stores, and no live-out values. */
1053 if (LOOP_VINFO_MASKS (loop_vinfo).is_empty ())
1054 return false;
1056 /* Work out how many bits we need to represent the limit. */
1057 min_ni_width
1058 = vect_min_prec_for_max_niters (loop_vinfo, max_nscalars_per_iter);
1060 /* Find a scalar mode for which WHILE_ULT is supported. */
1061 opt_scalar_int_mode cmp_mode_iter;
1062 tree cmp_type = NULL_TREE;
1063 tree iv_type = NULL_TREE;
1064 widest_int iv_limit = vect_iv_limit_for_partial_vectors (loop_vinfo);
1065 unsigned int iv_precision = UINT_MAX;
1067 if (iv_limit != -1)
1068 iv_precision = wi::min_precision (iv_limit * max_nscalars_per_iter,
1069 UNSIGNED);
1071 FOR_EACH_MODE_IN_CLASS (cmp_mode_iter, MODE_INT)
1073 unsigned int cmp_bits = GET_MODE_BITSIZE (cmp_mode_iter.require ());
1074 if (cmp_bits >= min_ni_width
1075 && targetm.scalar_mode_supported_p (cmp_mode_iter.require ()))
1077 tree this_type = build_nonstandard_integer_type (cmp_bits, true);
1078 if (this_type
1079 && can_produce_all_loop_masks_p (loop_vinfo, this_type))
1081 /* Although we could stop as soon as we find a valid mode,
1082 there are at least two reasons why that's not always the
1083 best choice:
1085 - An IV that's Pmode or wider is more likely to be reusable
1086 in address calculations than an IV that's narrower than
1087 Pmode.
1089 - Doing the comparison in IV_PRECISION or wider allows
1090 a natural 0-based IV, whereas using a narrower comparison
1091 type requires mitigations against wrap-around.
1093 Conversely, if the IV limit is variable, doing the comparison
1094 in a wider type than the original type can introduce
1095 unnecessary extensions, so picking the widest valid mode
1096 is not always a good choice either.
1098 Here we prefer the first IV type that's Pmode or wider,
1099 and the first comparison type that's IV_PRECISION or wider.
1100 (The comparison type must be no wider than the IV type,
1101 to avoid extensions in the vector loop.)
1103 ??? We might want to try continuing beyond Pmode for ILP32
1104 targets if CMP_BITS < IV_PRECISION. */
1105 iv_type = this_type;
1106 if (!cmp_type || iv_precision > TYPE_PRECISION (cmp_type))
1107 cmp_type = this_type;
1108 if (cmp_bits >= GET_MODE_BITSIZE (Pmode))
1109 break;
1114 if (!cmp_type)
1115 return false;
1117 LOOP_VINFO_RGROUP_COMPARE_TYPE (loop_vinfo) = cmp_type;
1118 LOOP_VINFO_RGROUP_IV_TYPE (loop_vinfo) = iv_type;
1119 return true;
 1122 /* Check whether we can use length-based vector accesses, based on a
 1123   precision comparison. So far, to keep it simple, we only allow the case
 1124   where the precision of the target-supported length is larger than the
 1125   precision required by the loop niters.
1127 static bool
1128 vect_verify_loop_lens (loop_vec_info loop_vinfo)
1130 if (LOOP_VINFO_LENS (loop_vinfo).is_empty ())
1131 return false;
1133 unsigned int max_nitems_per_iter = 1;
1134 unsigned int i;
1135 rgroup_controls *rgl;
1136 /* Find the maximum number of items per iteration for every rgroup. */
1137 FOR_EACH_VEC_ELT (LOOP_VINFO_LENS (loop_vinfo), i, rgl)
1139 unsigned nitems_per_iter = rgl->max_nscalars_per_iter * rgl->factor;
1140 max_nitems_per_iter = MAX (max_nitems_per_iter, nitems_per_iter);
1143 /* Work out how many bits we need to represent the length limit. */
1144 unsigned int min_ni_prec
1145 = vect_min_prec_for_max_niters (loop_vinfo, max_nitems_per_iter);
1147 /* Now use the maximum of below precisions for one suitable IV type:
1148 - the IV's natural precision
1149 - the precision needed to hold: the maximum number of scalar
1150 iterations multiplied by the scale factor (min_ni_prec above)
1151 - the Pmode precision
1153 If min_ni_prec is less than the precision of the current niters,
 1154   we prefer to still use the niters type. Prefer a Pmode or wider IV
 1155   to avoid narrow conversions. */
1157 unsigned int ni_prec
1158 = TYPE_PRECISION (TREE_TYPE (LOOP_VINFO_NITERS (loop_vinfo)));
1159 min_ni_prec = MAX (min_ni_prec, ni_prec);
1160 min_ni_prec = MAX (min_ni_prec, GET_MODE_BITSIZE (Pmode));
1162 tree iv_type = NULL_TREE;
1163 opt_scalar_int_mode tmode_iter;
1164 FOR_EACH_MODE_IN_CLASS (tmode_iter, MODE_INT)
1166 scalar_mode tmode = tmode_iter.require ();
1167 unsigned int tbits = GET_MODE_BITSIZE (tmode);
1169 /* ??? Do we really want to construct one IV whose precision exceeds
1170 BITS_PER_WORD? */
1171 if (tbits > BITS_PER_WORD)
1172 break;
1174 /* Find the first available standard integral type. */
1175 if (tbits >= min_ni_prec && targetm.scalar_mode_supported_p (tmode))
1177 iv_type = build_nonstandard_integer_type (tbits, true);
1178 break;
1182 if (!iv_type)
1184 if (dump_enabled_p ())
1185 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1186 "can't vectorize with length-based partial vectors"
1187 " because there is no suitable iv type.\n");
1188 return false;
1191 LOOP_VINFO_RGROUP_COMPARE_TYPE (loop_vinfo) = iv_type;
1192 LOOP_VINFO_RGROUP_IV_TYPE (loop_vinfo) = iv_type;
1194 return true;
1197 /* Calculate the cost of one scalar iteration of the loop. */
1198 static void
1199 vect_compute_single_scalar_iteration_cost (loop_vec_info loop_vinfo)
1201 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1202 basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
1203 int nbbs = loop->num_nodes, factor;
1204 int innerloop_iters, i;
1206 DUMP_VECT_SCOPE ("vect_compute_single_scalar_iteration_cost");
1208 /* Gather costs for statements in the scalar loop. */
1210 /* FORNOW. */
1211 innerloop_iters = 1;
1212 if (loop->inner)
1213 innerloop_iters = 50; /* FIXME */
1215 for (i = 0; i < nbbs; i++)
1217 gimple_stmt_iterator si;
1218 basic_block bb = bbs[i];
1220 if (bb->loop_father == loop->inner)
1221 factor = innerloop_iters;
1222 else
1223 factor = 1;
1225 for (si = gsi_start_bb (bb); !gsi_end_p (si); gsi_next (&si))
1227 gimple *stmt = gsi_stmt (si);
1228 stmt_vec_info stmt_info = loop_vinfo->lookup_stmt (stmt);
1230 if (!is_gimple_assign (stmt) && !is_gimple_call (stmt))
1231 continue;
1233 /* Skip stmts that are not vectorized inside the loop. */
1234 stmt_vec_info vstmt_info = vect_stmt_to_vectorize (stmt_info);
1235 if (!STMT_VINFO_RELEVANT_P (vstmt_info)
1236 && (!STMT_VINFO_LIVE_P (vstmt_info)
1237 || !VECTORIZABLE_CYCLE_DEF
1238 (STMT_VINFO_DEF_TYPE (vstmt_info))))
1239 continue;
1241 vect_cost_for_stmt kind;
1242 if (STMT_VINFO_DATA_REF (stmt_info))
1244 if (DR_IS_READ (STMT_VINFO_DATA_REF (stmt_info)))
1245 kind = scalar_load;
1246 else
1247 kind = scalar_store;
1249 else if (vect_nop_conversion_p (stmt_info))
1250 continue;
1251 else
1252 kind = scalar_stmt;
1254 record_stmt_cost (&LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo),
1255 factor, kind, stmt_info, 0, vect_prologue);
1259 /* Now accumulate cost. */
1260 void *target_cost_data = init_cost (loop);
1261 stmt_info_for_cost *si;
1262 int j;
1263 FOR_EACH_VEC_ELT (LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo),
1264 j, si)
1265 (void) add_stmt_cost (loop_vinfo, target_cost_data, si->count,
1266 si->kind, si->stmt_info, si->vectype,
1267 si->misalign, vect_body);
1268 unsigned dummy, body_cost = 0;
1269 finish_cost (target_cost_data, &dummy, &body_cost, &dummy);
1270 destroy_cost_data (target_cost_data);
1271 LOOP_VINFO_SINGLE_SCALAR_ITERATION_COST (loop_vinfo) = body_cost;
1275 /* Function vect_analyze_loop_form_1.
1277 Verify that certain CFG restrictions hold, including:
1278 - the loop has a pre-header
1279 - the loop has a single entry and exit
1280 - the loop exit condition is simple enough
 1281   - the number of iterations can be analyzed, i.e., a countable loop. The
1282 niter could be analyzed under some assumptions. */
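/* For instance (illustrative): an innermost loop whose body still contains
   an if/else has more than the two basic blocks required below and is
   rejected with "control flow in loop" unless if-conversion has first
   turned the branch into straight-line (conditional) code.  */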
1284 opt_result
1285 vect_analyze_loop_form_1 (class loop *loop, gcond **loop_cond,
1286 tree *assumptions, tree *number_of_iterationsm1,
1287 tree *number_of_iterations, gcond **inner_loop_cond)
1289 DUMP_VECT_SCOPE ("vect_analyze_loop_form");
1291 /* Different restrictions apply when we are considering an inner-most loop,
1292 vs. an outer (nested) loop.
1293 (FORNOW. May want to relax some of these restrictions in the future). */
1295 if (!loop->inner)
1297 /* Inner-most loop. We currently require that the number of BBs is
1298 exactly 2 (the header and latch). Vectorizable inner-most loops
1299 look like this:
1301 (pre-header)
1303 header <--------+
1304 | | |
1305 | +--> latch --+
1307 (exit-bb) */
1309 if (loop->num_nodes != 2)
1310 return opt_result::failure_at (vect_location,
1311 "not vectorized:"
1312 " control flow in loop.\n");
1314 if (empty_block_p (loop->header))
1315 return opt_result::failure_at (vect_location,
1316 "not vectorized: empty loop.\n");
1318 else
1320 class loop *innerloop = loop->inner;
1321 edge entryedge;
1323 /* Nested loop. We currently require that the loop is doubly-nested,
1324 contains a single inner loop, and the number of BBs is exactly 5.
1325 Vectorizable outer-loops look like this:
1327 (pre-header)
1329 header <---+
1331 inner-loop |
1333 tail ------+
1335 (exit-bb)
1337 The inner-loop has the properties expected of inner-most loops
1338 as described above. */
1340 if ((loop->inner)->inner || (loop->inner)->next)
1341 return opt_result::failure_at (vect_location,
1342 "not vectorized:"
1343 " multiple nested loops.\n");
1345 if (loop->num_nodes != 5)
1346 return opt_result::failure_at (vect_location,
1347 "not vectorized:"
1348 " control flow in loop.\n");
1350 entryedge = loop_preheader_edge (innerloop);
1351 if (entryedge->src != loop->header
1352 || !single_exit (innerloop)
1353 || single_exit (innerloop)->dest != EDGE_PRED (loop->latch, 0)->src)
1354 return opt_result::failure_at (vect_location,
1355 "not vectorized:"
1356 " unsupported outerloop form.\n");
1358 /* Analyze the inner-loop. */
1359 tree inner_niterm1, inner_niter, inner_assumptions;
1360 opt_result res
1361 = vect_analyze_loop_form_1 (loop->inner, inner_loop_cond,
1362 &inner_assumptions, &inner_niterm1,
1363 &inner_niter, NULL);
1364 if (!res)
1366 if (dump_enabled_p ())
1367 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1368 "not vectorized: Bad inner loop.\n");
1369 return res;
1372 /* Don't support analyzing niter under assumptions for inner
1373 loop. */
1374 if (!integer_onep (inner_assumptions))
1375 return opt_result::failure_at (vect_location,
1376 "not vectorized: Bad inner loop.\n");
1378 if (!expr_invariant_in_loop_p (loop, inner_niter))
1379 return opt_result::failure_at (vect_location,
1380 "not vectorized: inner-loop count not"
1381 " invariant.\n");
1383 if (dump_enabled_p ())
1384 dump_printf_loc (MSG_NOTE, vect_location,
1385 "Considering outer-loop vectorization.\n");
1388 if (!single_exit (loop))
1389 return opt_result::failure_at (vect_location,
1390 "not vectorized: multiple exits.\n");
1391 if (EDGE_COUNT (loop->header->preds) != 2)
1392 return opt_result::failure_at (vect_location,
1393 "not vectorized:"
1394 " too many incoming edges.\n");
 1396   /* We assume that the loop exit condition is at the end of the loop, i.e.,
1397 that the loop is represented as a do-while (with a proper if-guard
1398 before the loop if needed), where the loop header contains all the
1399 executable statements, and the latch is empty. */
1400 if (!empty_block_p (loop->latch)
1401 || !gimple_seq_empty_p (phi_nodes (loop->latch)))
1402 return opt_result::failure_at (vect_location,
1403 "not vectorized: latch block not empty.\n");
1405 /* Make sure the exit is not abnormal. */
1406 edge e = single_exit (loop);
1407 if (e->flags & EDGE_ABNORMAL)
1408 return opt_result::failure_at (vect_location,
1409 "not vectorized:"
1410 " abnormal loop exit edge.\n");
1412 *loop_cond = vect_get_loop_niters (loop, assumptions, number_of_iterations,
1413 number_of_iterationsm1);
1414 if (!*loop_cond)
1415 return opt_result::failure_at
1416 (vect_location,
1417 "not vectorized: complicated exit condition.\n");
1419 if (integer_zerop (*assumptions)
1420 || !*number_of_iterations
1421 || chrec_contains_undetermined (*number_of_iterations))
1422 return opt_result::failure_at
1423 (*loop_cond,
1424 "not vectorized: number of iterations cannot be computed.\n");
1426 if (integer_zerop (*number_of_iterations))
1427 return opt_result::failure_at
1428 (*loop_cond,
1429 "not vectorized: number of iterations = 0.\n");
1431 return opt_result::success ();
1434 /* Analyze LOOP form and return a loop_vec_info if it is of suitable form. */
1436 opt_loop_vec_info
1437 vect_analyze_loop_form (class loop *loop, vec_info_shared *shared)
1439 tree assumptions, number_of_iterations, number_of_iterationsm1;
1440 gcond *loop_cond, *inner_loop_cond = NULL;
1442 opt_result res
1443 = vect_analyze_loop_form_1 (loop, &loop_cond,
1444 &assumptions, &number_of_iterationsm1,
1445 &number_of_iterations, &inner_loop_cond);
1446 if (!res)
1447 return opt_loop_vec_info::propagate_failure (res);
1449 loop_vec_info loop_vinfo = new _loop_vec_info (loop, shared);
1450 LOOP_VINFO_NITERSM1 (loop_vinfo) = number_of_iterationsm1;
1451 LOOP_VINFO_NITERS (loop_vinfo) = number_of_iterations;
1452 LOOP_VINFO_NITERS_UNCHANGED (loop_vinfo) = number_of_iterations;
1453 if (!integer_onep (assumptions))
1455 /* We consider to vectorize this loop by versioning it under
1456 some assumptions. In order to do this, we need to clear
1457 existing information computed by scev and niter analyzer. */
1458 scev_reset_htab ();
1459 free_numbers_of_iterations_estimates (loop);
1460 /* Also set flag for this loop so that following scev and niter
1461 analysis are done under the assumptions. */
1462 loop_constraint_set (loop, LOOP_C_FINITE);
1463 /* Also record the assumptions for versioning. */
1464 LOOP_VINFO_NITERS_ASSUMPTIONS (loop_vinfo) = assumptions;
1467 if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
1469 if (dump_enabled_p ())
1471 dump_printf_loc (MSG_NOTE, vect_location,
1472 "Symbolic number of iterations is ");
1473 dump_generic_expr (MSG_NOTE, TDF_DETAILS, number_of_iterations);
1474 dump_printf (MSG_NOTE, "\n");
1478 stmt_vec_info loop_cond_info = loop_vinfo->lookup_stmt (loop_cond);
1479 STMT_VINFO_TYPE (loop_cond_info) = loop_exit_ctrl_vec_info_type;
1480 if (inner_loop_cond)
1482 stmt_vec_info inner_loop_cond_info
1483 = loop_vinfo->lookup_stmt (inner_loop_cond);
1484 STMT_VINFO_TYPE (inner_loop_cond_info) = loop_exit_ctrl_vec_info_type;
1487 gcc_assert (!loop->aux);
1488 loop->aux = loop_vinfo;
1489 return opt_loop_vec_info::success (loop_vinfo);
 1494 /* Scan the loop stmts and, depending on whether there are any (non-)SLP
 1495   statements, update the vectorization factor. */
1497 static void
1498 vect_update_vf_for_slp (loop_vec_info loop_vinfo)
1500 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1501 basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
1502 int nbbs = loop->num_nodes;
1503 poly_uint64 vectorization_factor;
1504 int i;
1506 DUMP_VECT_SCOPE ("vect_update_vf_for_slp");
1508 vectorization_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
1509 gcc_assert (known_ne (vectorization_factor, 0U));
 1511   /* If all the stmts in the loop can be SLPed, we perform only SLP, and
 1512   the vectorization factor of the loop is the unrolling factor required by
 1513   the SLP instances. If that unrolling factor is 1, we say that we
 1514   perform pure SLP on the loop - cross-iteration parallelism is not
 1515   exploited.
1516 bool only_slp_in_loop = true;
1517 for (i = 0; i < nbbs; i++)
1519 basic_block bb = bbs[i];
1520 for (gphi_iterator si = gsi_start_phis (bb); !gsi_end_p (si);
1521 gsi_next (&si))
1523 stmt_vec_info stmt_info = loop_vinfo->lookup_stmt (si.phi ());
1524 if (!stmt_info)
1525 continue;
1526 if ((STMT_VINFO_RELEVANT_P (stmt_info)
1527 || VECTORIZABLE_CYCLE_DEF (STMT_VINFO_DEF_TYPE (stmt_info)))
1528 && !PURE_SLP_STMT (stmt_info))
1529 /* STMT needs both SLP and loop-based vectorization. */
1530 only_slp_in_loop = false;
1532 for (gimple_stmt_iterator si = gsi_start_bb (bb); !gsi_end_p (si);
1533 gsi_next (&si))
1535 if (is_gimple_debug (gsi_stmt (si)))
1536 continue;
1537 stmt_vec_info stmt_info = loop_vinfo->lookup_stmt (gsi_stmt (si));
1538 stmt_info = vect_stmt_to_vectorize (stmt_info);
1539 if ((STMT_VINFO_RELEVANT_P (stmt_info)
1540 || VECTORIZABLE_CYCLE_DEF (STMT_VINFO_DEF_TYPE (stmt_info)))
1541 && !PURE_SLP_STMT (stmt_info))
1542 /* STMT needs both SLP and loop-based vectorization. */
1543 only_slp_in_loop = false;
1547 if (only_slp_in_loop)
1549 if (dump_enabled_p ())
1550 dump_printf_loc (MSG_NOTE, vect_location,
1551 "Loop contains only SLP stmts\n");
1552 vectorization_factor = LOOP_VINFO_SLP_UNROLLING_FACTOR (loop_vinfo);
1554 else
1556 if (dump_enabled_p ())
1557 dump_printf_loc (MSG_NOTE, vect_location,
1558 "Loop contains SLP and non-SLP stmts\n");
1559 /* Both the vectorization factor and unroll factor have the form
1560 GET_MODE_SIZE (loop_vinfo->vector_mode) * X for some rational X,
1561 so they must have a common multiple. */
1562 vectorization_factor
1563 = force_common_multiple (vectorization_factor,
1564 LOOP_VINFO_SLP_UNROLLING_FACTOR (loop_vinfo));
1567 LOOP_VINFO_VECT_FACTOR (loop_vinfo) = vectorization_factor;
1568 if (dump_enabled_p ())
1570 dump_printf_loc (MSG_NOTE, vect_location,
1571 "Updating vectorization factor to ");
1572 dump_dec (MSG_NOTE, vectorization_factor);
1573 dump_printf (MSG_NOTE, ".\n");
1577 /* Return true if STMT_INFO describes a double reduction phi and if
1578 the other phi in the reduction is also relevant for vectorization.
1579 This rejects cases such as:
1581 outer1:
1582 x_1 = PHI <x_3(outer2), ...>;
1585 inner:
1586 x_2 = ...;
1589 outer2:
1590 x_3 = PHI <x_2(inner)>;
1592 if nothing in x_2 or elsewhere makes x_1 relevant. */
1594 static bool
1595 vect_active_double_reduction_p (stmt_vec_info stmt_info)
1597 if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_double_reduction_def)
1598 return false;
1600 return STMT_VINFO_RELEVANT_P (STMT_VINFO_REDUC_DEF (stmt_info));
1603 /* Function vect_analyze_loop_operations.
1605 Scan the loop stmts and make sure they are all vectorizable. */
1607 static opt_result
1608 vect_analyze_loop_operations (loop_vec_info loop_vinfo)
1610 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1611 basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
1612 int nbbs = loop->num_nodes;
1613 int i;
1614 stmt_vec_info stmt_info;
1615 bool need_to_vectorize = false;
1616 bool ok;
1618 DUMP_VECT_SCOPE ("vect_analyze_loop_operations");
1620 auto_vec<stmt_info_for_cost> cost_vec;
1622 for (i = 0; i < nbbs; i++)
1624 basic_block bb = bbs[i];
1626 for (gphi_iterator si = gsi_start_phis (bb); !gsi_end_p (si);
1627 gsi_next (&si))
1629 gphi *phi = si.phi ();
1630 ok = true;
1632 stmt_info = loop_vinfo->lookup_stmt (phi);
1633 if (dump_enabled_p ())
1634 dump_printf_loc (MSG_NOTE, vect_location, "examining phi: %G", phi);
1635 if (virtual_operand_p (gimple_phi_result (phi)))
1636 continue;
1638 /* Inner-loop loop-closed exit phi in outer-loop vectorization
1639 (i.e., a phi in the tail of the outer-loop). */
1640 if (! is_loop_header_bb_p (bb))
1642 /* FORNOW: we currently don't support the case that these phis
1643 are not used in the outerloop (unless it is double reduction,
 1644   i.e., this phi is vect_reduction_def), because this case
 1645   requires actually doing something here.
1646 if (STMT_VINFO_LIVE_P (stmt_info)
1647 && !vect_active_double_reduction_p (stmt_info))
1648 return opt_result::failure_at (phi,
1649 "Unsupported loop-closed phi"
1650 " in outer-loop.\n");
1652 /* If PHI is used in the outer loop, we check that its operand
1653 is defined in the inner loop. */
1654 if (STMT_VINFO_RELEVANT_P (stmt_info))
1656 tree phi_op;
1658 if (gimple_phi_num_args (phi) != 1)
1659 return opt_result::failure_at (phi, "unsupported phi");
1661 phi_op = PHI_ARG_DEF (phi, 0);
1662 stmt_vec_info op_def_info = loop_vinfo->lookup_def (phi_op);
1663 if (!op_def_info)
1664 return opt_result::failure_at (phi, "unsupported phi\n");
1666 if (STMT_VINFO_RELEVANT (op_def_info) != vect_used_in_outer
1667 && (STMT_VINFO_RELEVANT (op_def_info)
1668 != vect_used_in_outer_by_reduction))
1669 return opt_result::failure_at (phi, "unsupported phi\n");
1671 if ((STMT_VINFO_DEF_TYPE (stmt_info) == vect_internal_def
1672 || (STMT_VINFO_DEF_TYPE (stmt_info)
1673 == vect_double_reduction_def))
1674 && !vectorizable_lc_phi (loop_vinfo,
1675 stmt_info, NULL, NULL))
1676 return opt_result::failure_at (phi, "unsupported phi\n");
1679 continue;
1682 gcc_assert (stmt_info);
1684 if ((STMT_VINFO_RELEVANT (stmt_info) == vect_used_in_scope
1685 || STMT_VINFO_LIVE_P (stmt_info))
1686 && STMT_VINFO_DEF_TYPE (stmt_info) != vect_induction_def)
1687 /* A scalar-dependence cycle that we don't support. */
1688 return opt_result::failure_at (phi,
1689 "not vectorized:"
1690 " scalar dependence cycle.\n");
1692 if (STMT_VINFO_RELEVANT_P (stmt_info))
1694 need_to_vectorize = true;
1695 if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_induction_def
1696 && ! PURE_SLP_STMT (stmt_info))
1697 ok = vectorizable_induction (loop_vinfo,
1698 stmt_info, NULL, NULL,
1699 &cost_vec);
1700 else if ((STMT_VINFO_DEF_TYPE (stmt_info) == vect_reduction_def
1701 || (STMT_VINFO_DEF_TYPE (stmt_info)
1702 == vect_double_reduction_def)
1703 || STMT_VINFO_DEF_TYPE (stmt_info) == vect_nested_cycle)
1704 && ! PURE_SLP_STMT (stmt_info))
1705 ok = vectorizable_reduction (loop_vinfo,
1706 stmt_info, NULL, NULL, &cost_vec);
1709 /* SLP PHIs are tested by vect_slp_analyze_node_operations. */
1710 if (ok
1711 && STMT_VINFO_LIVE_P (stmt_info)
1712 && !PURE_SLP_STMT (stmt_info))
1713 ok = vectorizable_live_operation (loop_vinfo,
1714 stmt_info, NULL, NULL, NULL,
1715 -1, false, &cost_vec);
1717 if (!ok)
1718 return opt_result::failure_at (phi,
1719 "not vectorized: relevant phi not "
1720 "supported: %G",
1721 static_cast <gimple *> (phi));
1724 for (gimple_stmt_iterator si = gsi_start_bb (bb); !gsi_end_p (si);
1725 gsi_next (&si))
1727 gimple *stmt = gsi_stmt (si);
1728 if (!gimple_clobber_p (stmt)
1729 && !is_gimple_debug (stmt))
1731 opt_result res
1732 = vect_analyze_stmt (loop_vinfo,
1733 loop_vinfo->lookup_stmt (stmt),
1734 &need_to_vectorize,
1735 NULL, NULL, &cost_vec);
1736 if (!res)
1737 return res;
1740 } /* bbs */
1742 add_stmt_costs (loop_vinfo, loop_vinfo->target_cost_data, &cost_vec);
1744 /* All operations in the loop are either irrelevant (deal with loop
1745 control, or dead), or only used outside the loop and can be moved
1746 out of the loop (e.g. invariants, inductions). The loop can be
1747 optimized away by scalar optimizations. We're better off not
1748 touching this loop. */
1749 if (!need_to_vectorize)
1751 if (dump_enabled_p ())
1752 dump_printf_loc (MSG_NOTE, vect_location,
1753 "All the computation can be taken out of the loop.\n");
1754 return opt_result::failure_at
1755 (vect_location,
1756 "not vectorized: redundant loop. no profit to vectorize.\n");
1759 return opt_result::success ();
1762 /* Return true if we know that the iteration count is smaller than the
1763 vectorization factor. Return false if it isn't, or if we can't be sure
1764 either way. */
1766 static bool
1767 vect_known_niters_smaller_than_vf (loop_vec_info loop_vinfo)
1769 unsigned int assumed_vf = vect_vf_for_cost (loop_vinfo);
1771 HOST_WIDE_INT max_niter;
1772 if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
1773 max_niter = LOOP_VINFO_INT_NITERS (loop_vinfo);
1774 else
1775 max_niter = max_stmt_executions_int (LOOP_VINFO_LOOP (loop_vinfo));
1777 if (max_niter != -1 && (unsigned HOST_WIDE_INT) max_niter < assumed_vf)
1778 return true;
1780 return false;
1783 /* Analyze the cost of the loop described by LOOP_VINFO. Decide if it
1784 is worthwhile to vectorize. Return 1 if definitely yes, 0 if
1785 definitely no, or -1 if it's worth retrying. */
1787 static int
1788 vect_analyze_loop_costing (loop_vec_info loop_vinfo)
1790 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1791 unsigned int assumed_vf = vect_vf_for_cost (loop_vinfo);
1793 /* Only loops that can handle partially-populated vectors can have iteration
1794 counts less than the vectorization factor. */
1795 if (!LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))
1797 if (vect_known_niters_smaller_than_vf (loop_vinfo))
1799 if (dump_enabled_p ())
1800 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1801 "not vectorized: iteration count smaller than "
1802 "vectorization factor.\n");
1803 return 0;
1807 int min_profitable_iters, min_profitable_estimate;
1808 vect_estimate_min_profitable_iters (loop_vinfo, &min_profitable_iters,
1809 &min_profitable_estimate);
1811 if (min_profitable_iters < 0)
1813 if (dump_enabled_p ())
1814 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1815 "not vectorized: vectorization not profitable.\n");
1816 if (dump_enabled_p ())
1817 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1818 "not vectorized: vector version will never be "
1819 "profitable.\n");
1820 return -1;
1823 int min_scalar_loop_bound = (param_min_vect_loop_bound
1824 * assumed_vf);
1826 /* Use the cost model only if it is more conservative than the
1827 user-specified threshold. */
1828 unsigned int th = (unsigned) MAX (min_scalar_loop_bound,
1829 min_profitable_iters);
1831 LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo) = th;
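  /* Purely illustrative numbers: with param_min_vect_loop_bound == 2 and
     assumed_vf == 4, min_scalar_loop_bound is 8; if min_profitable_iters
     is 12, the threshold TH becomes MAX (8, 12) == 12, so a loop with
     fewer than 12 known iterations is rejected below.  */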
1833 if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
1834 && LOOP_VINFO_INT_NITERS (loop_vinfo) < th)
1836 if (dump_enabled_p ())
1837 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1838 "not vectorized: vectorization not profitable.\n");
1839 if (dump_enabled_p ())
1840 dump_printf_loc (MSG_NOTE, vect_location,
1841 "not vectorized: iteration count smaller than user "
1842 "specified loop bound parameter or minimum profitable "
1843 "iterations (whichever is more conservative).\n");
1844 return 0;
1847 /* The static profitability threshold min_profitable_estimate includes
1848 the cost of having to check at runtime whether the scalar loop
1849 should be used instead. If it turns out that we don't need or want
1850 such a check, the threshold we should use for the static estimate
1851 is simply the point at which the vector loop becomes more profitable
1852 than the scalar loop. */
1853 if (min_profitable_estimate > min_profitable_iters
1854 && !LOOP_REQUIRES_VERSIONING (loop_vinfo)
1855 && !LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo)
1856 && !LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo)
1857 && !vect_apply_runtime_profitability_check_p (loop_vinfo))
1859 if (dump_enabled_p ())
1860 dump_printf_loc (MSG_NOTE, vect_location, "no need for a runtime"
1861 " choice between the scalar and vector loops\n");
1862 min_profitable_estimate = min_profitable_iters;
1865 HOST_WIDE_INT estimated_niter;
1867 /* If we are vectorizing an epilogue then we know the maximum number of
1868 scalar iterations it will cover is at least one lower than the
1869 vectorization factor of the main loop. */
1870 if (LOOP_VINFO_EPILOGUE_P (loop_vinfo))
1871 estimated_niter
1872 = vect_vf_for_cost (LOOP_VINFO_ORIG_LOOP_INFO (loop_vinfo)) - 1;
1873 else
1875 estimated_niter = estimated_stmt_executions_int (loop);
1876 if (estimated_niter == -1)
1877 estimated_niter = likely_max_stmt_executions_int (loop);
1879 if (estimated_niter != -1
1880 && ((unsigned HOST_WIDE_INT) estimated_niter
1881 < MAX (th, (unsigned) min_profitable_estimate)))
1883 if (dump_enabled_p ())
1884 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1885 "not vectorized: estimated iteration count too "
1886 "small.\n");
1887 if (dump_enabled_p ())
1888 dump_printf_loc (MSG_NOTE, vect_location,
1889 "not vectorized: estimated iteration count smaller "
1890 "than specified loop bound parameter or minimum "
1891 "profitable iterations (whichever is more "
1892 "conservative).\n");
1893 return -1;
1896 return 1;
1899 static opt_result
1900 vect_get_datarefs_in_loop (loop_p loop, basic_block *bbs,
1901 vec<data_reference_p> *datarefs,
1902 unsigned int *n_stmts)
1904 *n_stmts = 0;
1905 for (unsigned i = 0; i < loop->num_nodes; i++)
1906 for (gimple_stmt_iterator gsi = gsi_start_bb (bbs[i]);
1907 !gsi_end_p (gsi); gsi_next (&gsi))
1909 gimple *stmt = gsi_stmt (gsi);
1910 if (is_gimple_debug (stmt))
1911 continue;
1912 ++(*n_stmts);
1913 opt_result res = vect_find_stmt_data_reference (loop, stmt, datarefs,
1914 NULL, 0);
1915 if (!res)
1917 if (is_gimple_call (stmt) && loop->safelen)
1919 tree fndecl = gimple_call_fndecl (stmt), op;
1920 if (fndecl != NULL_TREE)
1922 cgraph_node *node = cgraph_node::get (fndecl);
1923 if (node != NULL && node->simd_clones != NULL)
1925 unsigned int j, n = gimple_call_num_args (stmt);
1926 for (j = 0; j < n; j++)
1928 op = gimple_call_arg (stmt, j);
1929 if (DECL_P (op)
1930 || (REFERENCE_CLASS_P (op)
1931 && get_base_address (op)))
1932 break;
1934 op = gimple_call_lhs (stmt);
1935 /* Ignore #pragma omp declare simd functions
1936 if they don't have data references in the
1937 call stmt itself. */
1938 if (j == n
1939 && !(op
1940 && (DECL_P (op)
1941 || (REFERENCE_CLASS_P (op)
1942 && get_base_address (op)))))
1943 continue;
1947 return res;
1949 /* If dependence analysis will give up due to the limit on the
1950 number of datarefs, stop here and fail fatally. */
1951 if (datarefs->length ()
1952 > (unsigned)param_loop_max_datarefs_for_datadeps)
1953 return opt_result::failure_at (stmt, "exceeded param "
1954 "loop-max-datarefs-for-datadeps\n");
1956 return opt_result::success ();
1959 /* Look for SLP-only access groups and turn each individual access into its own
1960 group. */
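/* Illustrative example: a grouped access of size 4 that can only be
   vectorized with SLP but ended up not being SLPed is split below into
   four single-element groups; each element becomes its own
   DR_GROUP_FIRST_ELEMENT with DR_GROUP_SIZE 1 and, unless the access is
   strided, DR_GROUP_GAP 3 so that the three other elements of the
   original group are skipped.  */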
1961 static void
1962 vect_dissolve_slp_only_groups (loop_vec_info loop_vinfo)
1964 unsigned int i;
1965 struct data_reference *dr;
1967 DUMP_VECT_SCOPE ("vect_dissolve_slp_only_groups");
1969 vec<data_reference_p> datarefs = LOOP_VINFO_DATAREFS (loop_vinfo);
1970 FOR_EACH_VEC_ELT (datarefs, i, dr)
1972 gcc_assert (DR_REF (dr));
1973 stmt_vec_info stmt_info = loop_vinfo->lookup_stmt (DR_STMT (dr));
1975 /* Check if the load is a part of an interleaving chain. */
1976 if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
1978 stmt_vec_info first_element = DR_GROUP_FIRST_ELEMENT (stmt_info);
1979 unsigned int group_size = DR_GROUP_SIZE (first_element);
1981 /* Check whether this is an SLP-only group. */
1982 if (!STMT_SLP_TYPE (stmt_info)
1983 && STMT_VINFO_SLP_VECT_ONLY (first_element))
1985 /* Dissolve the group. */
1986 STMT_VINFO_SLP_VECT_ONLY (first_element) = false;
1988 stmt_vec_info vinfo = first_element;
1989 while (vinfo)
1991 stmt_vec_info next = DR_GROUP_NEXT_ELEMENT (vinfo);
1992 DR_GROUP_FIRST_ELEMENT (vinfo) = vinfo;
1993 DR_GROUP_NEXT_ELEMENT (vinfo) = NULL;
1994 DR_GROUP_SIZE (vinfo) = 1;
1995 if (STMT_VINFO_STRIDED_P (first_element))
1996 DR_GROUP_GAP (vinfo) = 0;
1997 else
1998 DR_GROUP_GAP (vinfo) = group_size - 1;
1999 vinfo = next;
2006 /* Determine if operating on full vectors for LOOP_VINFO might leave
2007 some scalar iterations still to do. If so, decide how we should
2008 handle those scalar iterations. The possibilities are:
2010 (1) Make LOOP_VINFO operate on partial vectors instead of full vectors.
2011 In this case:
2013 LOOP_VINFO_USING_PARTIAL_VECTORS_P == true
2014 LOOP_VINFO_EPIL_USING_PARTIAL_VECTORS_P == false
2015 LOOP_VINFO_PEELING_FOR_NITER == false
2017 (2) Make LOOP_VINFO operate on full vectors and use an epilogue loop
2018 to handle the remaining scalar iterations. In this case:
2020 LOOP_VINFO_USING_PARTIAL_VECTORS_P == false
2021 LOOP_VINFO_PEELING_FOR_NITER == true
2023 There are two choices:
2025 (2a) Consider vectorizing the epilogue loop at the same VF as the
2026 main loop, but using partial vectors instead of full vectors.
2027 In this case:
2029 LOOP_VINFO_EPIL_USING_PARTIAL_VECTORS_P == true
2031 (2b) Consider vectorizing the epilogue loop at lower VFs only.
2032 In this case:
2034 LOOP_VINFO_EPIL_USING_PARTIAL_VECTORS_P == false
2036 When FOR_EPILOGUE_P is true, make this determination based on the
2037 assumption that LOOP_VINFO is an epilogue loop, otherwise make it
2038 based on the assumption that LOOP_VINFO is the main loop. The caller
2039 has made sure that the number of iterations is set appropriately for
2040 this value of FOR_EPILOGUE_P. */
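/* Illustrative example: with a vectorization factor of 4 and 10 scalar
   iterations, two iterations remain after two full vectors.  Under (1)
   the loop executes a final, partially-populated vector iteration that
   covers those two iterations; under (2) the main loop executes two
   full-vector iterations and the remaining two scalar iterations are
   peeled into an epilogue loop, which may itself be vectorized as in
   (2a) or (2b).  */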
2042 opt_result
2043 vect_determine_partial_vectors_and_peeling (loop_vec_info loop_vinfo,
2044 bool for_epilogue_p)
2046 /* Determine whether there would be any scalar iterations left over. */
2047 bool need_peeling_or_partial_vectors_p
2048 = vect_need_peeling_or_partial_vectors_p (loop_vinfo);
2050 /* Decide whether to vectorize the loop with partial vectors. */
2051 LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo) = false;
2052 LOOP_VINFO_EPIL_USING_PARTIAL_VECTORS_P (loop_vinfo) = false;
2053 if (LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo)
2054 && need_peeling_or_partial_vectors_p)
2056 /* For partial-vector-usage=1, try to push the handling of partial
2057 vectors to the epilogue, with the main loop continuing to operate
2058 on full vectors.
2060 ??? We could then end up failing to use partial vectors if we
2061 decide to peel iterations into a prologue, and if the main loop
2062 then ends up processing fewer than VF iterations. */
2063 if (param_vect_partial_vector_usage == 1
2064 && !LOOP_VINFO_EPILOGUE_P (loop_vinfo)
2065 && !vect_known_niters_smaller_than_vf (loop_vinfo))
2066 LOOP_VINFO_EPIL_USING_PARTIAL_VECTORS_P (loop_vinfo) = true;
2067 else
2068 LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo) = true;
2071 if (dump_enabled_p ())
2073 if (LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))
2074 dump_printf_loc (MSG_NOTE, vect_location,
2075 "operating on partial vectors%s.\n",
2076 for_epilogue_p ? " for epilogue loop" : "");
2077 else
2078 dump_printf_loc (MSG_NOTE, vect_location,
2079 "operating only on full vectors%s.\n",
2080 for_epilogue_p ? " for epilogue loop" : "");
2083 if (for_epilogue_p)
2085 loop_vec_info orig_loop_vinfo = LOOP_VINFO_ORIG_LOOP_INFO (loop_vinfo);
2086 gcc_assert (orig_loop_vinfo);
2087 if (!LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))
2088 gcc_assert (known_lt (LOOP_VINFO_VECT_FACTOR (loop_vinfo),
2089 LOOP_VINFO_VECT_FACTOR (orig_loop_vinfo)));
2092 if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
2093 && !LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))
2095 /* Check that the loop processes at least one full vector. */
2096 poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
2097 tree scalar_niters = LOOP_VINFO_NITERS (loop_vinfo);
2098 if (known_lt (wi::to_widest (scalar_niters), vf))
2099 return opt_result::failure_at (vect_location,
2100 "loop does not have enough iterations"
2101 " to support vectorization.\n");
2103 /* If we need to peel an extra epilogue iteration to handle data
2104 accesses with gaps, check that there are enough scalar iterations
2105 available.
2107 The check above is redundant with this one when peeling for gaps,
2108 but the distinction is useful for diagnostics. */
2109 tree scalar_nitersm1 = LOOP_VINFO_NITERSM1 (loop_vinfo);
2110 if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo)
2111 && known_lt (wi::to_widest (scalar_nitersm1), vf))
2112 return opt_result::failure_at (vect_location,
2113 "loop does not have enough iterations"
2114 " to support peeling for gaps.\n");
2117 LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo)
2118 = (!LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo)
2119 && need_peeling_or_partial_vectors_p);
2121 return opt_result::success ();
2124 /* Function vect_analyze_loop_2.
2126 Apply a set of analyses on LOOP, and create a loop_vec_info struct
2127 for it. The different analyses will record information in the
2128 loop_vec_info struct. */
2129 static opt_result
2130 vect_analyze_loop_2 (loop_vec_info loop_vinfo, bool &fatal, unsigned *n_stmts)
2132 opt_result ok = opt_result::success ();
2133 int res;
2134 unsigned int max_vf = MAX_VECTORIZATION_FACTOR;
2135 poly_uint64 min_vf = 2;
2136 loop_vec_info orig_loop_vinfo = NULL;
2138 /* If we are dealing with an epilogue then orig_loop_vinfo points to the
2139 loop_vec_info of the first vectorized loop. */
2140 if (LOOP_VINFO_EPILOGUE_P (loop_vinfo))
2141 orig_loop_vinfo = LOOP_VINFO_ORIG_LOOP_INFO (loop_vinfo);
2142 else
2143 orig_loop_vinfo = loop_vinfo;
2144 gcc_assert (orig_loop_vinfo);
2146 /* The first group of checks is independent of the vector size. */
2147 fatal = true;
2149 if (LOOP_VINFO_SIMD_IF_COND (loop_vinfo)
2150 && integer_zerop (LOOP_VINFO_SIMD_IF_COND (loop_vinfo)))
2151 return opt_result::failure_at (vect_location,
2152 "not vectorized: simd if(0)\n");
2154 /* Find all data references in the loop (which correspond to vdefs/vuses)
2155 and analyze their evolution in the loop. */
2157 loop_p loop = LOOP_VINFO_LOOP (loop_vinfo);
2159 /* Gather the data references and count stmts in the loop. */
2160 if (!LOOP_VINFO_DATAREFS (loop_vinfo).exists ())
2162 opt_result res
2163 = vect_get_datarefs_in_loop (loop, LOOP_VINFO_BBS (loop_vinfo),
2164 &LOOP_VINFO_DATAREFS (loop_vinfo),
2165 n_stmts);
2166 if (!res)
2168 if (dump_enabled_p ())
2169 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2170 "not vectorized: loop contains function "
2171 "calls or data references that cannot "
2172 "be analyzed\n");
2173 return res;
2175 loop_vinfo->shared->save_datarefs ();
2177 else
2178 loop_vinfo->shared->check_datarefs ();
2180 /* Analyze the data references and also adjust the minimal
2181 vectorization factor according to the loads and stores. */
2183 ok = vect_analyze_data_refs (loop_vinfo, &min_vf, &fatal);
2184 if (!ok)
2186 if (dump_enabled_p ())
2187 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2188 "bad data references.\n");
2189 return ok;
2192 /* Classify all cross-iteration scalar data-flow cycles.
2193 Cross-iteration cycles caused by virtual phis are analyzed separately. */
2194 vect_analyze_scalar_cycles (loop_vinfo);
2196 vect_pattern_recog (loop_vinfo);
2198 vect_fixup_scalar_cycles_with_patterns (loop_vinfo);
2200 /* Analyze the access patterns of the data-refs in the loop (consecutive,
2201 complex, etc.). FORNOW: Only handle consecutive access pattern. */
2203 ok = vect_analyze_data_ref_accesses (loop_vinfo, NULL);
2204 if (!ok)
2206 if (dump_enabled_p ())
2207 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2208 "bad data access.\n");
2209 return ok;
2212 /* Data-flow analysis to detect stmts that do not need to be vectorized. */
2214 ok = vect_mark_stmts_to_be_vectorized (loop_vinfo, &fatal);
2215 if (!ok)
2217 if (dump_enabled_p ())
2218 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2219 "unexpected pattern.\n");
2220 return ok;
2223 /* The rest of the analysis below depends on the vector size in some way, so from here on failures are not treated as fatal. */
2224 fatal = false;
2226 /* Analyze data dependences between the data-refs in the loop
2227 and adjust the maximum vectorization factor according to
2228 the dependences.
2229 FORNOW: fail at the first data dependence that we encounter. */
2231 ok = vect_analyze_data_ref_dependences (loop_vinfo, &max_vf);
2232 if (!ok)
2234 if (dump_enabled_p ())
2235 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2236 "bad data dependence.\n");
2237 return ok;
2239 if (max_vf != MAX_VECTORIZATION_FACTOR
2240 && maybe_lt (max_vf, min_vf))
2241 return opt_result::failure_at (vect_location, "bad data dependence.\n");
2242 LOOP_VINFO_MAX_VECT_FACTOR (loop_vinfo) = max_vf;
2244 ok = vect_determine_vectorization_factor (loop_vinfo);
2245 if (!ok)
2247 if (dump_enabled_p ())
2248 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2249 "can't determine vectorization factor.\n");
2250 return ok;
2252 if (max_vf != MAX_VECTORIZATION_FACTOR
2253 && maybe_lt (max_vf, LOOP_VINFO_VECT_FACTOR (loop_vinfo)))
2254 return opt_result::failure_at (vect_location, "bad data dependence.\n");
2256 /* Compute the scalar iteration cost. */
2257 vect_compute_single_scalar_iteration_cost (loop_vinfo);
2259 poly_uint64 saved_vectorization_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
2261 /* Check the SLP opportunities in the loop, analyze and build SLP trees. */
2262 ok = vect_analyze_slp (loop_vinfo, *n_stmts);
2263 if (!ok)
2264 return ok;
2266 /* If there are any SLP instances mark them as pure_slp. */
2267 bool slp = vect_make_slp_decision (loop_vinfo);
2268 if (slp)
2270 /* Find stmts that need to be both vectorized and SLPed. */
2271 vect_detect_hybrid_slp (loop_vinfo);
2273 /* Update the vectorization factor based on the SLP decision. */
2274 vect_update_vf_for_slp (loop_vinfo);
2276 /* Optimize the SLP graph with the vectorization factor fixed. */
2277 vect_optimize_slp (loop_vinfo);
2280 bool saved_can_use_partial_vectors_p
2281 = LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo);
2283 /* We don't expect to have to roll back to anything other than an empty
2284 set of rgroups. */
2285 gcc_assert (LOOP_VINFO_MASKS (loop_vinfo).is_empty ());
2287 /* This is the point where we can re-start analysis with SLP forced off. */
2288 start_over:
2290 /* Now the vectorization factor is final. */
2291 poly_uint64 vectorization_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
2292 gcc_assert (known_ne (vectorization_factor, 0U));
2294 if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo) && dump_enabled_p ())
2296 dump_printf_loc (MSG_NOTE, vect_location,
2297 "vectorization_factor = ");
2298 dump_dec (MSG_NOTE, vectorization_factor);
2299 dump_printf (MSG_NOTE, ", niters = %wd\n",
2300 LOOP_VINFO_INT_NITERS (loop_vinfo));
2303 /* Analyze the alignment of the data-refs in the loop.
2304 Fail if a data reference is found that cannot be vectorized. */
2306 ok = vect_analyze_data_refs_alignment (loop_vinfo);
2307 if (!ok)
2309 if (dump_enabled_p ())
2310 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2311 "bad data alignment.\n");
2312 return ok;
2315 /* Prune the list of ddrs to be tested at run-time by versioning for alias.
2316 It is important to call pruning after vect_analyze_data_ref_accesses,
2317 since we use grouping information gathered by interleaving analysis. */
2318 ok = vect_prune_runtime_alias_test_list (loop_vinfo);
2319 if (!ok)
2320 return ok;
2322 /* Do not invoke vect_enhance_data_refs_alignment for epilogue
2323 vectorization, since we do not want to add extra peeling or
2324 add versioning for alignment. */
2325 if (!LOOP_VINFO_EPILOGUE_P (loop_vinfo))
2326 /* This pass will decide on using loop versioning and/or loop peeling in
2327 order to enhance the alignment of data references in the loop. */
2328 ok = vect_enhance_data_refs_alignment (loop_vinfo);
2329 if (!ok)
2330 return ok;
2332 if (slp)
2334 /* Analyze operations in the SLP instances. Note this may
2335 remove unsupported SLP instances which makes the above
2336 SLP kind detection invalid. */
2337 unsigned old_size = LOOP_VINFO_SLP_INSTANCES (loop_vinfo).length ();
2338 vect_slp_analyze_operations (loop_vinfo);
2339 if (LOOP_VINFO_SLP_INSTANCES (loop_vinfo).length () != old_size)
2341 ok = opt_result::failure_at (vect_location,
2342 "unsupported SLP instances\n");
2343 goto again;
2347 /* Dissolve SLP-only groups. */
2348 vect_dissolve_slp_only_groups (loop_vinfo);
2350 /* Scan all the remaining operations in the loop that are not subject
2351 to SLP and make sure they are vectorizable. */
2352 ok = vect_analyze_loop_operations (loop_vinfo);
2353 if (!ok)
2355 if (dump_enabled_p ())
2356 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2357 "bad operation or unsupported loop bound.\n");
2358 return ok;
2361 /* For now we don't expect to mix masking and length approaches in one
2362 loop; disable partial vectors if both are recorded. */
2363 if (LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo)
2364 && !LOOP_VINFO_MASKS (loop_vinfo).is_empty ()
2365 && !LOOP_VINFO_LENS (loop_vinfo).is_empty ())
2367 if (dump_enabled_p ())
2368 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2369 "can't vectorize a loop with partial vectors"
2370 " because we don't expect to mix different"
2371 " approaches with partial vectors for the"
2372 " same loop.\n");
2373 LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo) = false;
2376 /* If we still have the option of using partial vectors,
2377 check whether we can generate the necessary loop controls. */
2378 if (LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo)
2379 && !vect_verify_full_masking (loop_vinfo)
2380 && !vect_verify_loop_lens (loop_vinfo))
2381 LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo) = false;
2383 /* If we're vectorizing an epilogue loop, the vectorized loop either needs
2384 to be able to handle fewer than VF scalars, or needs to have a lower VF
2385 than the main loop. */
2386 if (LOOP_VINFO_EPILOGUE_P (loop_vinfo)
2387 && !LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo)
2388 && maybe_ge (LOOP_VINFO_VECT_FACTOR (loop_vinfo),
2389 LOOP_VINFO_VECT_FACTOR (orig_loop_vinfo)))
2390 return opt_result::failure_at (vect_location,
2391 "Vectorization factor too high for"
2392 " epilogue loop.\n");
2394 /* Decide whether this loop_vinfo should use partial vectors or peeling,
2395 assuming that the loop will be used as a main loop. We will redo
2396 this analysis later if we instead decide to use the loop as an
2397 epilogue loop. */
2398 ok = vect_determine_partial_vectors_and_peeling (loop_vinfo, false);
2399 if (!ok)
2400 return ok;
2402 /* Check the costings of the loop make vectorizing worthwhile. */
2403 res = vect_analyze_loop_costing (loop_vinfo);
2404 if (res < 0)
2406 ok = opt_result::failure_at (vect_location,
2407 "Loop costings may not be worthwhile.\n");
2408 goto again;
2410 if (!res)
2411 return opt_result::failure_at (vect_location,
2412 "Loop costings not worthwhile.\n");
2414 /* If an epilogue loop is required make sure we can create one. */
2415 if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo)
2416 || LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo))
2418 if (dump_enabled_p ())
2419 dump_printf_loc (MSG_NOTE, vect_location, "epilog loop required\n");
2420 if (!vect_can_advance_ivs_p (loop_vinfo)
2421 || !slpeel_can_duplicate_loop_p (LOOP_VINFO_LOOP (loop_vinfo),
2422 single_exit (LOOP_VINFO_LOOP
2423 (loop_vinfo))))
2425 ok = opt_result::failure_at (vect_location,
2426 "not vectorized: can't create required "
2427 "epilog loop\n");
2428 goto again;
2432 /* During peeling, we need to check whether the number of loop iterations is
2433 enough for both the peeled prolog loop and the vector loop. This check
2434 can be merged with the threshold check for loop versioning, so
2435 increase the threshold for this case if necessary.
2437 If we are analyzing an epilogue we still want to check what its
2438 versioning threshold would be. If we decide to vectorize the epilogues we
2439 will want to use the lowest versioning threshold of all epilogues and main
2440 loop. This will enable us to enter a vectorized epilogue even when
2441 versioning the loop. We can't simply check whether the epilogue requires
2442 versioning though since we may have skipped some versioning checks when
2443 analyzing the epilogue. For instance, checks for alias versioning will be
2444 skipped when dealing with epilogues as we assume we already checked them
2445 for the main loop. So instead we always check the 'orig_loop_vinfo'. */
2446 if (LOOP_REQUIRES_VERSIONING (orig_loop_vinfo))
2448 poly_uint64 niters_th = 0;
2449 unsigned int th = LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo);
2451 if (!vect_use_loop_mask_for_alignment_p (loop_vinfo))
2453 /* Niters for peeled prolog loop. */
2454 if (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) < 0)
2456 dr_vec_info *dr_info = LOOP_VINFO_UNALIGNED_DR (loop_vinfo);
2457 tree vectype = STMT_VINFO_VECTYPE (dr_info->stmt);
2458 niters_th += TYPE_VECTOR_SUBPARTS (vectype) - 1;
2460 else
2461 niters_th += LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo);
2464 /* Niters for at least one iteration of vectorized loop. */
2465 if (!LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))
2466 niters_th += LOOP_VINFO_VECT_FACTOR (loop_vinfo);
2467 /* One additional iteration because of peeling for gap. */
2468 if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo))
2469 niters_th += 1;
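      /* Illustrative example, assuming a vectorization factor of 4, a vector
	 type with 4 elements, an unknown peel amount for alignment, no
	 partial vectors and peeling for gaps: niters_th is
	 (4 - 1) + 4 + 1 == 8 at this point.  */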
2471 /* Use the same condition as vect_transform_loop to decide when to use
2472 the cost to determine a versioning threshold. */
2473 if (vect_apply_runtime_profitability_check_p (loop_vinfo)
2474 && ordered_p (th, niters_th))
2475 niters_th = ordered_max (poly_uint64 (th), niters_th);
2477 LOOP_VINFO_VERSIONING_THRESHOLD (loop_vinfo) = niters_th;
2480 gcc_assert (known_eq (vectorization_factor,
2481 LOOP_VINFO_VECT_FACTOR (loop_vinfo)));
2483 /* Ok to vectorize! */
2484 return opt_result::success ();
2486 again:
2487 /* Ensure that "ok" is false (with an opt_problem if dumping is enabled). */
2488 gcc_assert (!ok);
2490 /* Try again with SLP forced off but if we didn't do any SLP there is
2491 no point in re-trying. */
2492 if (!slp)
2493 return ok;
2495 /* If there are reduction chains re-trying will fail anyway. */
2496 if (! LOOP_VINFO_REDUCTION_CHAINS (loop_vinfo).is_empty ())
2497 return ok;
2499 /* Likewise if the grouped loads or stores in the SLP cannot be handled
2500 via interleaving or lane instructions. */
2501 slp_instance instance;
2502 slp_tree node;
2503 unsigned i, j;
2504 FOR_EACH_VEC_ELT (LOOP_VINFO_SLP_INSTANCES (loop_vinfo), i, instance)
2506 stmt_vec_info vinfo;
2507 vinfo = SLP_TREE_SCALAR_STMTS (SLP_INSTANCE_TREE (instance))[0];
2508 if (! STMT_VINFO_GROUPED_ACCESS (vinfo))
2509 continue;
2510 vinfo = DR_GROUP_FIRST_ELEMENT (vinfo);
2511 unsigned int size = DR_GROUP_SIZE (vinfo);
2512 tree vectype = STMT_VINFO_VECTYPE (vinfo);
2513 if (! vect_store_lanes_supported (vectype, size, false)
2514 && ! known_eq (TYPE_VECTOR_SUBPARTS (vectype), 1U)
2515 && ! vect_grouped_store_supported (vectype, size))
2516 return opt_result::failure_at (vinfo->stmt,
2517 "unsupported grouped store\n");
2518 FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (instance), j, node)
2520 vinfo = SLP_TREE_SCALAR_STMTS (node)[0];
2521 vinfo = DR_GROUP_FIRST_ELEMENT (vinfo);
2522 bool single_element_p = !DR_GROUP_NEXT_ELEMENT (vinfo);
2523 size = DR_GROUP_SIZE (vinfo);
2524 vectype = STMT_VINFO_VECTYPE (vinfo);
2525 if (! vect_load_lanes_supported (vectype, size, false)
2526 && ! vect_grouped_load_supported (vectype, single_element_p,
2527 size))
2528 return opt_result::failure_at (vinfo->stmt,
2529 "unsupported grouped load\n");
2533 if (dump_enabled_p ())
2534 dump_printf_loc (MSG_NOTE, vect_location,
2535 "re-trying with SLP disabled\n");
2537 /* Roll back state appropriately. No SLP this time. */
2538 slp = false;
2539 /* Restore the vectorization factor as it was without SLP. */
2540 LOOP_VINFO_VECT_FACTOR (loop_vinfo) = saved_vectorization_factor;
2541 /* Free the SLP instances. */
2542 FOR_EACH_VEC_ELT (LOOP_VINFO_SLP_INSTANCES (loop_vinfo), j, instance)
2543 vect_free_slp_instance (instance);
2544 LOOP_VINFO_SLP_INSTANCES (loop_vinfo).release ();
2545 /* Reset SLP type to loop_vect on all stmts. */
2546 for (i = 0; i < LOOP_VINFO_LOOP (loop_vinfo)->num_nodes; ++i)
2548 basic_block bb = LOOP_VINFO_BBS (loop_vinfo)[i];
2549 for (gimple_stmt_iterator si = gsi_start_phis (bb);
2550 !gsi_end_p (si); gsi_next (&si))
2552 stmt_vec_info stmt_info = loop_vinfo->lookup_stmt (gsi_stmt (si));
2553 STMT_SLP_TYPE (stmt_info) = loop_vect;
2554 if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_reduction_def
2555 || STMT_VINFO_DEF_TYPE (stmt_info) == vect_double_reduction_def)
2557 /* vectorizable_reduction adjusts reduction stmt def-types,
2558 restore them to that of the PHI. */
2559 STMT_VINFO_DEF_TYPE (STMT_VINFO_REDUC_DEF (stmt_info))
2560 = STMT_VINFO_DEF_TYPE (stmt_info);
2561 STMT_VINFO_DEF_TYPE (vect_stmt_to_vectorize
2562 (STMT_VINFO_REDUC_DEF (stmt_info)))
2563 = STMT_VINFO_DEF_TYPE (stmt_info);
2566 for (gimple_stmt_iterator si = gsi_start_bb (bb);
2567 !gsi_end_p (si); gsi_next (&si))
2569 if (is_gimple_debug (gsi_stmt (si)))
2570 continue;
2571 stmt_vec_info stmt_info = loop_vinfo->lookup_stmt (gsi_stmt (si));
2572 STMT_SLP_TYPE (stmt_info) = loop_vect;
2573 if (STMT_VINFO_IN_PATTERN_P (stmt_info))
2575 gimple *pattern_def_seq = STMT_VINFO_PATTERN_DEF_SEQ (stmt_info);
2576 stmt_info = STMT_VINFO_RELATED_STMT (stmt_info);
2577 STMT_SLP_TYPE (stmt_info) = loop_vect;
2578 for (gimple_stmt_iterator pi = gsi_start (pattern_def_seq);
2579 !gsi_end_p (pi); gsi_next (&pi))
2580 STMT_SLP_TYPE (loop_vinfo->lookup_stmt (gsi_stmt (pi)))
2581 = loop_vect;
2585 /* Free optimized alias test DDRS. */
2586 LOOP_VINFO_LOWER_BOUNDS (loop_vinfo).truncate (0);
2587 LOOP_VINFO_COMP_ALIAS_DDRS (loop_vinfo).release ();
2588 LOOP_VINFO_CHECK_UNEQUAL_ADDRS (loop_vinfo).release ();
2589 /* Reset target cost data. */
2590 destroy_cost_data (LOOP_VINFO_TARGET_COST_DATA (loop_vinfo));
2591 LOOP_VINFO_TARGET_COST_DATA (loop_vinfo)
2592 = init_cost (LOOP_VINFO_LOOP (loop_vinfo));
2593 /* Reset accumulated rgroup information. */
2594 release_vec_loop_controls (&LOOP_VINFO_MASKS (loop_vinfo));
2595 release_vec_loop_controls (&LOOP_VINFO_LENS (loop_vinfo));
2596 /* Reset assorted flags. */
2597 LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo) = false;
2598 LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo) = false;
2599 LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo) = 0;
2600 LOOP_VINFO_VERSIONING_THRESHOLD (loop_vinfo) = 0;
2601 LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo)
2602 = saved_can_use_partial_vectors_p;
2604 goto start_over;
2607 /* Return true if vectorizing a loop using NEW_LOOP_VINFO appears
2608 to be better than vectorizing it using OLD_LOOP_VINFO. Assume that
2609 OLD_LOOP_VINFO is better unless something specifically indicates
2610 otherwise.
2612 Note that this deliberately isn't a partial order. */
2614 static bool
2615 vect_better_loop_vinfo_p (loop_vec_info new_loop_vinfo,
2616 loop_vec_info old_loop_vinfo)
2618 struct loop *loop = LOOP_VINFO_LOOP (new_loop_vinfo);
2619 gcc_assert (LOOP_VINFO_LOOP (old_loop_vinfo) == loop);
2621 poly_int64 new_vf = LOOP_VINFO_VECT_FACTOR (new_loop_vinfo);
2622 poly_int64 old_vf = LOOP_VINFO_VECT_FACTOR (old_loop_vinfo);
2624 /* Always prefer a VF of loop->simdlen over any other VF. */
2625 if (loop->simdlen)
2627 bool new_simdlen_p = known_eq (new_vf, loop->simdlen);
2628 bool old_simdlen_p = known_eq (old_vf, loop->simdlen);
2629 if (new_simdlen_p != old_simdlen_p)
2630 return new_simdlen_p;
2633 /* Limit the VFs to what is likely to be the maximum number of iterations,
2634 to handle cases in which at least one loop_vinfo is fully-masked. */
2635 HOST_WIDE_INT estimated_max_niter = likely_max_stmt_executions_int (loop);
2636 if (estimated_max_niter != -1)
2638 if (known_le (estimated_max_niter, new_vf))
2639 new_vf = estimated_max_niter;
2640 if (known_le (estimated_max_niter, old_vf))
2641 old_vf = estimated_max_niter;
2644 /* Check whether the (fractional) cost per scalar iteration is lower
2645 or higher: new_inside_cost / new_vf vs. old_inside_cost / old_vf. */
2646 poly_widest_int rel_new = (new_loop_vinfo->vec_inside_cost
2647 * poly_widest_int (old_vf));
2648 poly_widest_int rel_old = (old_loop_vinfo->vec_inside_cost
2649 * poly_widest_int (new_vf));
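  /* The cross-multiplied products compare the per-scalar-iteration costs
     without dividing (possibly non-constant) poly_int values.  E.g. an
     inside cost of 20 at VF 4 versus 30 at VF 8 gives rel_new == 20 * 8
     == 160 and rel_old == 30 * 4 == 120, so the old loop_vinfo is
     cheaper per scalar iteration.  */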
2650 if (maybe_lt (rel_old, rel_new))
2652 /* When old_loop_vinfo uses a variable vectorization factor,
2653 we know that it has a lower cost for at least one runtime VF.
2654 However, we don't know how likely that VF is.
2656 One option would be to compare the costs for the estimated VFs.
2657 The problem is that that can put too much pressure on the cost
2658 model. E.g. if the estimated VF is also the lowest possible VF,
2659 and if old_loop_vinfo is 1 unit worse than new_loop_vinfo
2660 for the estimated VF, we'd then choose new_loop_vinfo even
2661 though (a) new_loop_vinfo might not actually be better than
2662 old_loop_vinfo for that VF and (b) it would be significantly
2663 worse at larger VFs.
2665 Here we go for a hacky compromise: pick new_loop_vinfo if it is
2666 no more expensive than old_loop_vinfo even after doubling the
2667 estimated old_loop_vinfo VF. For all but trivial loops, this
2668 ensures that we only pick new_loop_vinfo if it is significantly
2669 better than old_loop_vinfo at the estimated VF. */
2670 if (rel_new.is_constant ())
2671 return false;
2673 HOST_WIDE_INT new_estimated_vf = estimated_poly_value (new_vf);
2674 HOST_WIDE_INT old_estimated_vf = estimated_poly_value (old_vf);
2675 widest_int estimated_rel_new = (new_loop_vinfo->vec_inside_cost
2676 * widest_int (old_estimated_vf));
2677 widest_int estimated_rel_old = (old_loop_vinfo->vec_inside_cost
2678 * widest_int (new_estimated_vf));
2679 return estimated_rel_new * 2 <= estimated_rel_old;
2681 if (known_lt (rel_new, rel_old))
2682 return true;
2684 /* If there's nothing to choose between the loop bodies, see whether
2685 there's a difference in the prologue and epilogue costs. */
2686 if (new_loop_vinfo->vec_outside_cost != old_loop_vinfo->vec_outside_cost)
2687 return new_loop_vinfo->vec_outside_cost < old_loop_vinfo->vec_outside_cost;
2689 return false;
2692 /* Decide whether to replace OLD_LOOP_VINFO with NEW_LOOP_VINFO. Return
2693 true if we should. */
2695 static bool
2696 vect_joust_loop_vinfos (loop_vec_info new_loop_vinfo,
2697 loop_vec_info old_loop_vinfo)
2699 if (!vect_better_loop_vinfo_p (new_loop_vinfo, old_loop_vinfo))
2700 return false;
2702 if (dump_enabled_p ())
2703 dump_printf_loc (MSG_NOTE, vect_location,
2704 "***** Preferring vector mode %s to vector mode %s\n",
2705 GET_MODE_NAME (new_loop_vinfo->vector_mode),
2706 GET_MODE_NAME (old_loop_vinfo->vector_mode));
2707 return true;
2710 /* Function vect_analyze_loop.
2712 Apply a set of analyses on LOOP, and create a loop_vec_info struct
2713 for it. The different analyses will record information in the
2714 loop_vec_info struct. */
2715 opt_loop_vec_info
2716 vect_analyze_loop (class loop *loop, vec_info_shared *shared)
2718 auto_vector_modes vector_modes;
2720 /* Autodetect first vector size we try. */
2721 unsigned int autovec_flags
2722 = targetm.vectorize.autovectorize_vector_modes (&vector_modes,
2723 loop->simdlen != 0);
2724 unsigned int mode_i = 0;
2726 DUMP_VECT_SCOPE ("analyze_loop_nest");
2728 if (loop_outer (loop)
2729 && loop_vec_info_for_loop (loop_outer (loop))
2730 && LOOP_VINFO_VECTORIZABLE_P (loop_vec_info_for_loop (loop_outer (loop))))
2731 return opt_loop_vec_info::failure_at (vect_location,
2732 "outer-loop already vectorized.\n");
2734 if (!find_loop_nest (loop, &shared->loop_nest))
2735 return opt_loop_vec_info::failure_at
2736 (vect_location,
2737 "not vectorized: loop nest containing two or more consecutive inner"
2738 " loops cannot be vectorized\n");
2740 unsigned n_stmts = 0;
2741 machine_mode autodetected_vector_mode = VOIDmode;
2742 opt_loop_vec_info first_loop_vinfo = opt_loop_vec_info::success (NULL);
2743 machine_mode next_vector_mode = VOIDmode;
2744 poly_uint64 lowest_th = 0;
2745 unsigned vectorized_loops = 0;
2746 bool pick_lowest_cost_p = ((autovec_flags & VECT_COMPARE_COSTS)
2747 && !unlimited_cost_model (loop));
2749 bool vect_epilogues = false;
2750 opt_result res = opt_result::success ();
2751 unsigned HOST_WIDE_INT simdlen = loop->simdlen;
2752 while (1)
2754 /* Check the CFG characteristics of the loop (nesting, entry/exit). */
2755 opt_loop_vec_info loop_vinfo = vect_analyze_loop_form (loop, shared);
2756 if (!loop_vinfo)
2758 if (dump_enabled_p ())
2759 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2760 "bad loop form.\n");
2761 gcc_checking_assert (first_loop_vinfo == NULL);
2762 return loop_vinfo;
2764 loop_vinfo->vector_mode = next_vector_mode;
2766 bool fatal = false;
2768 /* When pick_lowest_cost_p is true, we should in principle iterate
2769 over all the loop_vec_infos that LOOP_VINFO could replace and
2770 try to vectorize LOOP_VINFO under the same conditions.
2771 E.g. when trying to replace an epilogue loop, we should vectorize
2772 LOOP_VINFO as an epilogue loop with the same VF limit. When trying
2773 to replace the main loop, we should vectorize LOOP_VINFO as a main
2774 loop too.
2776 However, autovectorize_vector_modes is usually sorted as follows:
2778 - Modes that naturally produce lower VFs usually follow modes that
2779 naturally produce higher VFs.
2781 - When modes naturally produce the same VF, maskable modes
2782 usually follow unmaskable ones, so that the maskable mode
2783 can be used to vectorize the epilogue of the unmaskable mode.
2785 This order is preferred because it leads to the maximum
2786 epilogue vectorization opportunities. Targets should only use
2787 a different order if they want to make wide modes available while
2788 disparaging them relative to earlier, smaller modes. The assumption
2789 in that case is that the wider modes are more expensive in some
2790 way that isn't reflected directly in the costs.
2792 There should therefore be few interesting cases in which
2793 LOOP_VINFO fails when treated as an epilogue loop, succeeds when
2794 treated as a standalone loop, and ends up being genuinely cheaper
2795 than FIRST_LOOP_VINFO. */
2796 if (vect_epilogues)
2797 LOOP_VINFO_ORIG_LOOP_INFO (loop_vinfo) = first_loop_vinfo;
2799 res = vect_analyze_loop_2 (loop_vinfo, fatal, &n_stmts);
2800 if (mode_i == 0)
2801 autodetected_vector_mode = loop_vinfo->vector_mode;
2802 if (dump_enabled_p ())
2804 if (res)
2805 dump_printf_loc (MSG_NOTE, vect_location,
2806 "***** Analysis succeeded with vector mode %s\n",
2807 GET_MODE_NAME (loop_vinfo->vector_mode));
2808 else
2809 dump_printf_loc (MSG_NOTE, vect_location,
2810 "***** Analysis failed with vector mode %s\n",
2811 GET_MODE_NAME (loop_vinfo->vector_mode));
2814 loop->aux = NULL;
2816 if (!fatal)
2817 while (mode_i < vector_modes.length ()
2818 && vect_chooses_same_modes_p (loop_vinfo, vector_modes[mode_i]))
2820 if (dump_enabled_p ())
2821 dump_printf_loc (MSG_NOTE, vect_location,
2822 "***** The result for vector mode %s would"
2823 " be the same\n",
2824 GET_MODE_NAME (vector_modes[mode_i]));
2825 mode_i += 1;
2828 if (res)
2830 LOOP_VINFO_VECTORIZABLE_P (loop_vinfo) = 1;
2831 vectorized_loops++;
2833 /* Once we hit the desired simdlen for the first time,
2834 discard any previous attempts. */
2835 if (simdlen
2836 && known_eq (LOOP_VINFO_VECT_FACTOR (loop_vinfo), simdlen))
2838 delete first_loop_vinfo;
2839 first_loop_vinfo = opt_loop_vec_info::success (NULL);
2840 LOOP_VINFO_ORIG_LOOP_INFO (loop_vinfo) = NULL;
2841 simdlen = 0;
2843 else if (pick_lowest_cost_p && first_loop_vinfo)
2845 /* Keep trying to roll back vectorization attempts while the
2846 loop_vec_infos they produced were worse than this one. */
2847 vec<loop_vec_info> &vinfos = first_loop_vinfo->epilogue_vinfos;
2848 while (!vinfos.is_empty ()
2849 && vect_joust_loop_vinfos (loop_vinfo, vinfos.last ()))
2851 gcc_assert (vect_epilogues);
2852 delete vinfos.pop ();
2854 if (vinfos.is_empty ()
2855 && vect_joust_loop_vinfos (loop_vinfo, first_loop_vinfo))
2857 delete first_loop_vinfo;
2858 first_loop_vinfo = opt_loop_vec_info::success (NULL);
2859 LOOP_VINFO_ORIG_LOOP_INFO (loop_vinfo) = NULL;
2863 if (first_loop_vinfo == NULL)
2865 first_loop_vinfo = loop_vinfo;
2866 lowest_th = LOOP_VINFO_VERSIONING_THRESHOLD (first_loop_vinfo);
2868 else if (vect_epilogues
2869 /* For now only allow one epilogue loop. */
2870 && first_loop_vinfo->epilogue_vinfos.is_empty ())
2872 first_loop_vinfo->epilogue_vinfos.safe_push (loop_vinfo);
2873 poly_uint64 th = LOOP_VINFO_VERSIONING_THRESHOLD (loop_vinfo);
2874 gcc_assert (!LOOP_REQUIRES_VERSIONING (loop_vinfo)
2875 || maybe_ne (lowest_th, 0U));
2876 /* Keep track of the known smallest versioning
2877 threshold. */
2878 if (ordered_p (lowest_th, th))
2879 lowest_th = ordered_min (lowest_th, th);
2881 else
2883 delete loop_vinfo;
2884 loop_vinfo = opt_loop_vec_info::success (NULL);
2887 /* Only vectorize epilogues if PARAM_VECT_EPILOGUES_NOMASK is
2888 enabled, SIMDUID is not set, it is the innermost loop and we have
2889 either already found the loop's SIMDLEN or there was no SIMDLEN to
2890 begin with.
2891 TODO: Enable epilogue vectorization for loops with SIMDUID set. */
2892 vect_epilogues = (!simdlen
2893 && loop->inner == NULL
2894 && param_vect_epilogues_nomask
2895 && LOOP_VINFO_PEELING_FOR_NITER (first_loop_vinfo)
2896 && !loop->simduid
2897 /* For now only allow one epilogue loop, but allow
2898 pick_lowest_cost_p to replace it. */
2899 && (first_loop_vinfo->epilogue_vinfos.is_empty ()
2900 || pick_lowest_cost_p));
2902 /* Commit to first_loop_vinfo if we have no reason to try
2903 alternatives. */
2904 if (!simdlen && !vect_epilogues && !pick_lowest_cost_p)
2905 break;
2907 else
2909 delete loop_vinfo;
2910 loop_vinfo = opt_loop_vec_info::success (NULL);
2911 if (fatal)
2913 gcc_checking_assert (first_loop_vinfo == NULL);
2914 break;
2918 /* Handle the case where the original loop can use partial
2919 vectorization, but we want to adopt it only for the epilogue.
2920 The retry should use the same mode as the original. */
2921 if (vect_epilogues
2922 && loop_vinfo
2923 && LOOP_VINFO_EPIL_USING_PARTIAL_VECTORS_P (loop_vinfo))
2925 gcc_assert (LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo)
2926 && !LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo));
2927 if (dump_enabled_p ())
2928 dump_printf_loc (MSG_NOTE, vect_location,
2929 "***** Re-trying analysis with same vector mode"
2930 " %s for epilogue with partial vectors.\n",
2931 GET_MODE_NAME (loop_vinfo->vector_mode));
2932 continue;
2935 if (mode_i < vector_modes.length ()
2936 && VECTOR_MODE_P (autodetected_vector_mode)
2937 && (related_vector_mode (vector_modes[mode_i],
2938 GET_MODE_INNER (autodetected_vector_mode))
2939 == autodetected_vector_mode)
2940 && (related_vector_mode (autodetected_vector_mode,
2941 GET_MODE_INNER (vector_modes[mode_i]))
2942 == vector_modes[mode_i]))
2944 if (dump_enabled_p ())
2945 dump_printf_loc (MSG_NOTE, vect_location,
2946 "***** Skipping vector mode %s, which would"
2947 " repeat the analysis for %s\n",
2948 GET_MODE_NAME (vector_modes[mode_i]),
2949 GET_MODE_NAME (autodetected_vector_mode));
2950 mode_i += 1;
2953 if (mode_i == vector_modes.length ()
2954 || autodetected_vector_mode == VOIDmode)
2955 break;
2957 /* Try the next biggest vector size. */
2958 next_vector_mode = vector_modes[mode_i++];
2959 if (dump_enabled_p ())
2960 dump_printf_loc (MSG_NOTE, vect_location,
2961 "***** Re-trying analysis with vector mode %s\n",
2962 GET_MODE_NAME (next_vector_mode));
2965 if (first_loop_vinfo)
2967 loop->aux = (loop_vec_info) first_loop_vinfo;
2968 if (dump_enabled_p ())
2969 dump_printf_loc (MSG_NOTE, vect_location,
2970 "***** Choosing vector mode %s\n",
2971 GET_MODE_NAME (first_loop_vinfo->vector_mode));
2972 LOOP_VINFO_VERSIONING_THRESHOLD (first_loop_vinfo) = lowest_th;
2973 return first_loop_vinfo;
2976 return opt_loop_vec_info::propagate_failure (res);
2979 /* Return true if there is an in-order reduction function for CODE, storing
2980 it in *REDUC_FN if so. */
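/* For example, an ordered floating-point sum reduction maps PLUS_EXPR to
   IFN_FOLD_LEFT_PLUS, which accumulates the vector elements into the
   scalar result strictly in the original (left-to-right) order.  */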
2982 static bool
2983 fold_left_reduction_fn (tree_code code, internal_fn *reduc_fn)
2985 switch (code)
2987 case PLUS_EXPR:
2988 *reduc_fn = IFN_FOLD_LEFT_PLUS;
2989 return true;
2991 default:
2992 return false;
2996 /* Function reduction_fn_for_scalar_code
2998 Input:
2999 CODE - tree_code of a reduction operation.
3001 Output:
3002 REDUC_FN - the corresponding internal function to be used to reduce the
3003 vector of partial results into a single scalar result, or IFN_LAST
3004 if the operation is a supported reduction operation, but does not have
3005 such an internal function.
3007 Return FALSE if CODE currently cannot be vectorized as a reduction. */
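/* For example, a plain sum reduction maps PLUS_EXPR to IFN_REDUC_PLUS,
   which collapses the vector of partial sums into the final scalar after
   the loop.  MULT_EXPR is accepted as a reduction but has no such
   internal function, so *REDUC_FN is set to IFN_LAST for it.  */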
3009 static bool
3010 reduction_fn_for_scalar_code (enum tree_code code, internal_fn *reduc_fn)
3012 switch (code)
3014 case MAX_EXPR:
3015 *reduc_fn = IFN_REDUC_MAX;
3016 return true;
3018 case MIN_EXPR:
3019 *reduc_fn = IFN_REDUC_MIN;
3020 return true;
3022 case PLUS_EXPR:
3023 *reduc_fn = IFN_REDUC_PLUS;
3024 return true;
3026 case BIT_AND_EXPR:
3027 *reduc_fn = IFN_REDUC_AND;
3028 return true;
3030 case BIT_IOR_EXPR:
3031 *reduc_fn = IFN_REDUC_IOR;
3032 return true;
3034 case BIT_XOR_EXPR:
3035 *reduc_fn = IFN_REDUC_XOR;
3036 return true;
3038 case MULT_EXPR:
3039 case MINUS_EXPR:
3040 *reduc_fn = IFN_LAST;
3041 return true;
3043 default:
3044 return false;
3048 /* If there is a neutral value X such that SLP reduction NODE would not
3049 be affected by the introduction of additional X elements, return that X,
3050 otherwise return null. CODE is the code of the reduction and VECTOR_TYPE
3051 is the vector type that would hold element X. REDUC_CHAIN is true if
3052 the SLP statements perform a single reduction, false if each statement
3053 performs an independent reduction. */
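/* For instance, zero is neutral for a sum (extra zero elements do not
   change the result) and one is neutral for a product; MIN/MAX have no
   universal neutral value, but for a reduction chain the single initial
   value can safely be replicated.  */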
3055 static tree
3056 neutral_op_for_slp_reduction (slp_tree slp_node, tree vector_type,
3057 tree_code code, bool reduc_chain)
3059 vec<stmt_vec_info> stmts = SLP_TREE_SCALAR_STMTS (slp_node);
3060 stmt_vec_info stmt_vinfo = stmts[0];
3061 tree scalar_type = TREE_TYPE (vector_type);
3062 class loop *loop = gimple_bb (stmt_vinfo->stmt)->loop_father;
3063 gcc_assert (loop);
3065 switch (code)
3067 case WIDEN_SUM_EXPR:
3068 case DOT_PROD_EXPR:
3069 case SAD_EXPR:
3070 case PLUS_EXPR:
3071 case MINUS_EXPR:
3072 case BIT_IOR_EXPR:
3073 case BIT_XOR_EXPR:
3074 return build_zero_cst (scalar_type);
3076 case MULT_EXPR:
3077 return build_one_cst (scalar_type);
3079 case BIT_AND_EXPR:
3080 return build_all_ones_cst (scalar_type);
3082 case MAX_EXPR:
3083 case MIN_EXPR:
3084 /* For MIN/MAX the initial values are neutral. A reduction chain
3085 has only a single initial value, so that value is neutral for
3086 all statements. */
3087 if (reduc_chain)
3088 return PHI_ARG_DEF_FROM_EDGE (stmt_vinfo->stmt,
3089 loop_preheader_edge (loop));
3090 return NULL_TREE;
3092 default:
3093 return NULL_TREE;
3097 /* Error reporting helper for vect_is_simple_reduction below. GIMPLE statement
3098 STMT is printed with a message MSG. */
3100 static void
3101 report_vect_op (dump_flags_t msg_type, gimple *stmt, const char *msg)
3103 dump_printf_loc (msg_type, vect_location, "%s%G", msg, stmt);
3106 /* Return true if we need an in-order reduction for operation CODE
3107 on type TYPE, i.e. if the reduction must preserve the original
3108 evaluation order (a fold-left reduction). */
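/* For example, a floating-point sum must normally be computed in the
   original order unless -fassociative-math is in effect, whereas
   floating-point MIN/MAX reductions never need this treatment.  */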
3110 bool
3111 needs_fold_left_reduction_p (tree type, tree_code code)
3113 /* CHECKME: check for !flag_finite_math_only too? */
3114 if (SCALAR_FLOAT_TYPE_P (type))
3115 switch (code)
3117 case MIN_EXPR:
3118 case MAX_EXPR:
3119 return false;
3121 default:
3122 return !flag_associative_math;
3125 if (INTEGRAL_TYPE_P (type))
3127 if (!operation_no_trapping_overflow (type, code))
3128 return true;
3129 return false;
3132 if (SAT_FIXED_POINT_TYPE_P (type))
3133 return true;
3135 return false;
3138 /* Return true if the reduction PHI in LOOP with latch arg LOOP_ARG
3139 has a handled computation expression. Store the main reduction
3140 operation in *CODE. */
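/* Illustrative example:

     sum_1 = PHI <sum_0 (preheader), sum_3 (latch)>
     sum_2 = sum_1 + a_5;
     sum_3 = sum_2 + b_6;

   Starting from the latch argument sum_3, the walk below follows the
   defining statements back to the PHI result sum_1; the recorded path is
   roughly sum_3, sum_2, sum_1 and *CODE becomes PLUS_EXPR.  */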
3142 static bool
3143 check_reduction_path (dump_user_location_t loc, loop_p loop, gphi *phi,
3144 tree loop_arg, enum tree_code *code,
3145 vec<std::pair<ssa_op_iter, use_operand_p> > &path)
3147 auto_bitmap visited;
3148 tree lookfor = PHI_RESULT (phi);
3149 ssa_op_iter curri;
3150 use_operand_p curr = op_iter_init_phiuse (&curri, phi, SSA_OP_USE);
3151 while (USE_FROM_PTR (curr) != loop_arg)
3152 curr = op_iter_next_use (&curri);
3153 curri.i = curri.numops;
3156 path.safe_push (std::make_pair (curri, curr));
3157 tree use = USE_FROM_PTR (curr);
3158 if (use == lookfor)
3159 break;
3160 gimple *def = SSA_NAME_DEF_STMT (use);
3161 if (gimple_nop_p (def)
3162 || ! flow_bb_inside_loop_p (loop, gimple_bb (def)))
3164 pop:
3167 std::pair<ssa_op_iter, use_operand_p> x = path.pop ();
3168 curri = x.first;
3169 curr = x.second;
3171 curr = op_iter_next_use (&curri);
3172 /* Skip already visited or non-SSA operands (from iterating
3173 over PHI args). */
3174 while (curr != NULL_USE_OPERAND_P
3175 && (TREE_CODE (USE_FROM_PTR (curr)) != SSA_NAME
3176 || ! bitmap_set_bit (visited,
3177 SSA_NAME_VERSION
3178 (USE_FROM_PTR (curr)))));
3180 while (curr == NULL_USE_OPERAND_P && ! path.is_empty ());
3181 if (curr == NULL_USE_OPERAND_P)
3182 break;
3184 else
3186 if (gimple_code (def) == GIMPLE_PHI)
3187 curr = op_iter_init_phiuse (&curri, as_a <gphi *>(def), SSA_OP_USE);
3188 else
3189 curr = op_iter_init_use (&curri, def, SSA_OP_USE);
3190 while (curr != NULL_USE_OPERAND_P
3191 && (TREE_CODE (USE_FROM_PTR (curr)) != SSA_NAME
3192 || ! bitmap_set_bit (visited,
3193 SSA_NAME_VERSION
3194 (USE_FROM_PTR (curr)))))
3195 curr = op_iter_next_use (&curri);
3196 if (curr == NULL_USE_OPERAND_P)
3197 goto pop;
3200 while (1);
3201 if (dump_file && (dump_flags & TDF_DETAILS))
3203 dump_printf_loc (MSG_NOTE, loc, "reduction path: ");
3204 unsigned i;
3205 std::pair<ssa_op_iter, use_operand_p> *x;
3206 FOR_EACH_VEC_ELT (path, i, x)
3207 dump_printf (MSG_NOTE, "%T ", USE_FROM_PTR (x->second));
3208 dump_printf (MSG_NOTE, "\n");
3211 /* Check whether the reduction path detected is valid. */
3212 bool fail = path.length () == 0;
3213 bool neg = false;
3214 int sign = -1;
3215 *code = ERROR_MARK;
3216 for (unsigned i = 1; i < path.length (); ++i)
3218 gimple *use_stmt = USE_STMT (path[i].second);
3219 tree op = USE_FROM_PTR (path[i].second);
3220 if (! is_gimple_assign (use_stmt)
3221 /* The following makes sure we can compute the operand index
3222 easily, and it mostly disallows chaining via COND_EXPR condition
3223 operands. */
3224 || (gimple_assign_rhs1_ptr (use_stmt) != path[i].second->use
3225 && (gimple_num_ops (use_stmt) <= 2
3226 || gimple_assign_rhs2_ptr (use_stmt) != path[i].second->use)
3227 && (gimple_num_ops (use_stmt) <= 3
3228 || gimple_assign_rhs3_ptr (use_stmt) != path[i].second->use)))
3230 fail = true;
3231 break;
3233 /* Check that the op is used in only a single stmt inside
3234 the loop. */
3235 imm_use_iterator imm_iter;
3236 gimple *op_use_stmt;
3237 unsigned cnt = 0;
3238 FOR_EACH_IMM_USE_STMT (op_use_stmt, imm_iter, op)
3239 if (!is_gimple_debug (op_use_stmt)
3240 && flow_bb_inside_loop_p (loop, gimple_bb (op_use_stmt)))
3242 /* We want to allow x + x but not x < 1 ? x : 2. */
3243 if (is_gimple_assign (op_use_stmt)
3244 && gimple_assign_rhs_code (op_use_stmt) == COND_EXPR)
3246 use_operand_p use_p;
3247 FOR_EACH_IMM_USE_ON_STMT (use_p, imm_iter)
3248 cnt++;
3250 else
3251 cnt++;
3253 if (cnt != 1)
3255 fail = true;
3256 break;
3258 tree_code use_code = gimple_assign_rhs_code (use_stmt);
3259 if (use_code == MINUS_EXPR)
3261 use_code = PLUS_EXPR;
3262 /* Track whether we negate the reduction value each iteration. */
3263 if (gimple_assign_rhs2 (use_stmt) == op)
3264 neg = ! neg;
3266 if (CONVERT_EXPR_CODE_P (use_code)
3267 && tree_nop_conversion_p (TREE_TYPE (gimple_assign_lhs (use_stmt)),
3268 TREE_TYPE (gimple_assign_rhs1 (use_stmt))))
3270 else if (*code == ERROR_MARK)
3272 *code = use_code;
3273 sign = TYPE_SIGN (TREE_TYPE (gimple_assign_lhs (use_stmt)));
3275 else if (use_code != *code)
3277 fail = true;
3278 break;
3280 else if ((use_code == MIN_EXPR
3281 || use_code == MAX_EXPR)
3282 && sign != TYPE_SIGN (TREE_TYPE (gimple_assign_lhs (use_stmt))))
3284 fail = true;
3285 break;
3288 return ! fail && ! neg && *code != ERROR_MARK;
3291 bool
3292 check_reduction_path (dump_user_location_t loc, loop_p loop, gphi *phi,
3293 tree loop_arg, enum tree_code code)
3295 auto_vec<std::pair<ssa_op_iter, use_operand_p> > path;
3296 enum tree_code code_;
3297 return (check_reduction_path (loc, loop, phi, loop_arg, &code_, path)
3298 && code_ == code);
3303 /* Function vect_is_simple_reduction
3305 (1) Detect a cross-iteration def-use cycle that represents a simple
3306 reduction computation. We look for the following pattern:
3308 loop_header:
3309 a1 = phi < a0, a2 >
3310 a3 = ...
3311 a2 = operation (a3, a1)
3315 a3 = ...
3316 loop_header:
3317 a1 = phi < a0, a2 >
3318 a2 = operation (a3, a1)
3320 such that:
3321 1. operation is commutative and associative and it is safe to
3322 change the order of the computation
3323 2. no uses for a2 in the loop (a2 is used out of the loop)
3324 3. no uses of a1 in the loop besides the reduction operation
3325 4. no uses of a1 outside the loop.
3327 Conditions 1,4 are tested here.
3328 Conditions 2,3 are tested in vect_mark_stmts_to_be_vectorized.
3330 (2) Detect a cross-iteration def-use cycle in nested loops, i.e.,
3331 nested cycles.
3333 (3) Detect cycles of phi nodes in outer-loop vectorization, i.e., double
3334 reductions:
3336 a1 = phi < a0, a2 >
3337 inner loop (def of a3)
3338 a2 = phi < a3 >
3340 (4) Detect condition expressions, i.e.:
3341 for (int i = 0; i < N; i++)
3342 if (a[i] < val)
3343 ret_val = a[i];
3347 static stmt_vec_info
3348 vect_is_simple_reduction (loop_vec_info loop_info, stmt_vec_info phi_info,
3349 bool *double_reduc, bool *reduc_chain_p)
3351 gphi *phi = as_a <gphi *> (phi_info->stmt);
3352 gimple *phi_use_stmt = NULL;
3353 imm_use_iterator imm_iter;
3354 use_operand_p use_p;
3356 *double_reduc = false;
3357 *reduc_chain_p = false;
3358 STMT_VINFO_REDUC_TYPE (phi_info) = TREE_CODE_REDUCTION;
3360 tree phi_name = PHI_RESULT (phi);
3361 /* ??? If there are no uses of the PHI result the inner loop reduction
3362 won't be detected as possibly double-reduction by vectorizable_reduction
3363 because that tries to walk the PHI arg from the preheader edge which
3364 can be constant. See PR60382. */
3365 if (has_zero_uses (phi_name))
3366 return NULL;
3367 class loop *loop = (gimple_bb (phi))->loop_father;
3368 unsigned nphi_def_loop_uses = 0;
3369 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, phi_name)
3371 gimple *use_stmt = USE_STMT (use_p);
3372 if (is_gimple_debug (use_stmt))
3373 continue;
3375 if (!flow_bb_inside_loop_p (loop, gimple_bb (use_stmt)))
3377 if (dump_enabled_p ())
3378 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3379 "intermediate value used outside loop.\n");
3381 return NULL;
3384 nphi_def_loop_uses++;
3385 phi_use_stmt = use_stmt;
3388 tree latch_def = PHI_ARG_DEF_FROM_EDGE (phi, loop_latch_edge (loop));
3389 if (TREE_CODE (latch_def) != SSA_NAME)
3391 if (dump_enabled_p ())
3392 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3393 "reduction: not ssa_name: %T\n", latch_def);
3394 return NULL;
3397 stmt_vec_info def_stmt_info = loop_info->lookup_def (latch_def);
3398 if (!def_stmt_info
3399 || !flow_bb_inside_loop_p (loop, gimple_bb (def_stmt_info->stmt)))
3400 return NULL;
3402 bool nested_in_vect_loop
3403 = flow_loop_nested_p (LOOP_VINFO_LOOP (loop_info), loop);
3404 unsigned nlatch_def_loop_uses = 0;
3405 auto_vec<gphi *, 3> lcphis;
3406 bool inner_loop_of_double_reduc = false;
3407 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, latch_def)
3409 gimple *use_stmt = USE_STMT (use_p);
3410 if (is_gimple_debug (use_stmt))
3411 continue;
3412 if (flow_bb_inside_loop_p (loop, gimple_bb (use_stmt)))
3413 nlatch_def_loop_uses++;
3414 else
3416 /* We can have more than one loop-closed PHI. */
3417 lcphis.safe_push (as_a <gphi *> (use_stmt));
3418 if (nested_in_vect_loop
3419 && (STMT_VINFO_DEF_TYPE (loop_info->lookup_stmt (use_stmt))
3420 == vect_double_reduction_def))
3421 inner_loop_of_double_reduc = true;
3425 /* If we are vectorizing an inner reduction, we execute it
3426 in the original order only if we are not dealing with a
3427 double reduction. */
3428 if (nested_in_vect_loop && !inner_loop_of_double_reduc)
3430 if (dump_enabled_p ())
3431 report_vect_op (MSG_NOTE, def_stmt_info->stmt,
3432 "detected nested cycle: ");
3433 return def_stmt_info;
3436 /* If this isn't a nested cycle, or if the nested cycle reduction value
3437 is used outside of the inner loop, we cannot handle uses of the reduction
3438 value. */
3439 if (nlatch_def_loop_uses > 1 || nphi_def_loop_uses > 1)
3441 if (dump_enabled_p ())
3442 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3443 "reduction used in loop.\n");
3444 return NULL;
3447 /* If DEF_STMT is a phi node itself, we expect it to have a single argument
3448 defined in the inner loop. */
3449 if (gphi *def_stmt = dyn_cast <gphi *> (def_stmt_info->stmt))
3451 tree op1 = PHI_ARG_DEF (def_stmt, 0);
3452 if (gimple_phi_num_args (def_stmt) != 1
3453 || TREE_CODE (op1) != SSA_NAME)
3455 if (dump_enabled_p ())
3456 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3457 "unsupported phi node definition.\n");
3459 return NULL;
3462 gimple *def1 = SSA_NAME_DEF_STMT (op1);
3463 if (gimple_bb (def1)
3464 && flow_bb_inside_loop_p (loop, gimple_bb (def_stmt))
3465 && loop->inner
3466 && flow_bb_inside_loop_p (loop->inner, gimple_bb (def1))
3467 && is_gimple_assign (def1)
3468 && is_a <gphi *> (phi_use_stmt)
3469 && flow_bb_inside_loop_p (loop->inner, gimple_bb (phi_use_stmt)))
3471 if (dump_enabled_p ())
3472 report_vect_op (MSG_NOTE, def_stmt,
3473 "detected double reduction: ");
3475 *double_reduc = true;
3476 return def_stmt_info;
3479 return NULL;
3482 /* Look for the expression computing latch_def from the loop PHI result. */
3483 auto_vec<std::pair<ssa_op_iter, use_operand_p> > path;
3484 enum tree_code code;
3485 if (check_reduction_path (vect_location, loop, phi, latch_def, &code,
3486 path))
3488 STMT_VINFO_REDUC_CODE (phi_info) = code;
3489 if (code == COND_EXPR && !nested_in_vect_loop)
3490 STMT_VINFO_REDUC_TYPE (phi_info) = COND_REDUCTION;
3492 /* Fill in STMT_VINFO_REDUC_IDX and gather stmts for an SLP
3493 reduction chain for which the additional restriction is that
3494 all operations in the chain are the same. */
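/* As an illustration (the names below are made up for this comment only):
     x_1 = phi <x_0, x_4>
     x_2 = x_1 + a[i];
     x_3 = x_2 + b[i];
     x_4 = x_3 + c[i];
   forms a chain of three PLUS_EXPR statements; for each statement
   STMT_VINFO_REDUC_IDX records which operand continues the chain.  */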
3495 auto_vec<stmt_vec_info, 8> reduc_chain;
3496 unsigned i;
3497 bool is_slp_reduc = !nested_in_vect_loop && code != COND_EXPR;
3498 for (i = path.length () - 1; i >= 1; --i)
3500 gimple *stmt = USE_STMT (path[i].second);
3501 stmt_vec_info stmt_info = loop_info->lookup_stmt (stmt);
3502 STMT_VINFO_REDUC_IDX (stmt_info)
3503 = path[i].second->use - gimple_assign_rhs1_ptr (stmt);
3504 enum tree_code stmt_code = gimple_assign_rhs_code (stmt);
3505 bool leading_conversion = (CONVERT_EXPR_CODE_P (stmt_code)
3506 && (i == 1 || i == path.length () - 1));
3507 if ((stmt_code != code && !leading_conversion)
3508 /* We can only handle the final value in epilogue
3509 generation for reduction chains. */
3510 || (i != 1 && !has_single_use (gimple_assign_lhs (stmt))))
3511 is_slp_reduc = false;
3512 /* For reduction chains we support trailing/leading
3513 conversions. We do not store those in the actual chain. */
3514 if (leading_conversion)
3515 continue;
3516 reduc_chain.safe_push (stmt_info);
3518 if (is_slp_reduc && reduc_chain.length () > 1)
3520 for (unsigned i = 0; i < reduc_chain.length () - 1; ++i)
3522 REDUC_GROUP_FIRST_ELEMENT (reduc_chain[i]) = reduc_chain[0];
3523 REDUC_GROUP_NEXT_ELEMENT (reduc_chain[i]) = reduc_chain[i+1];
3525 REDUC_GROUP_FIRST_ELEMENT (reduc_chain.last ()) = reduc_chain[0];
3526 REDUC_GROUP_NEXT_ELEMENT (reduc_chain.last ()) = NULL;
3528 /* Save the chain for further analysis in SLP detection. */
3529 LOOP_VINFO_REDUCTION_CHAINS (loop_info).safe_push (reduc_chain[0]);
3530 REDUC_GROUP_SIZE (reduc_chain[0]) = reduc_chain.length ();
3532 *reduc_chain_p = true;
3533 if (dump_enabled_p ())
3534 dump_printf_loc (MSG_NOTE, vect_location,
3535 "reduction: detected reduction chain\n");
3537 else if (dump_enabled_p ())
3538 dump_printf_loc (MSG_NOTE, vect_location,
3539 "reduction: detected reduction\n");
3541 return def_stmt_info;
3544 if (dump_enabled_p ())
3545 dump_printf_loc (MSG_NOTE, vect_location,
3546 "reduction: unknown pattern\n");
3548 return NULL;
3551 /* Estimate the number of peeled epilogue iterations for LOOP_VINFO.
3552 PEEL_ITERS_PROLOGUE is the number of peeled prologue iterations,
3553 or -1 if not known. */
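/* As an illustration (arbitrary numbers): with NITERS = 100, an assumed
   VF of 8 and PEEL_ITERS_PROLOGUE = 3, the epilogue peels
   (100 - 3) % 8 = 1 iteration; if peeling for gaps is required and that
   remainder were 0, a full VF (8) iterations would be peeled instead.  */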
3555 static int
3556 vect_get_peel_iters_epilogue (loop_vec_info loop_vinfo, int peel_iters_prologue)
3558 int assumed_vf = vect_vf_for_cost (loop_vinfo);
3559 if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo) || peel_iters_prologue == -1)
3561 if (dump_enabled_p ())
3562 dump_printf_loc (MSG_NOTE, vect_location,
3563 "cost model: epilogue peel iters set to vf/2 "
3564 "because loop iterations are unknown .\n");
3565 return assumed_vf / 2;
3567 else
3569 int niters = LOOP_VINFO_INT_NITERS (loop_vinfo);
3570 peel_iters_prologue = MIN (niters, peel_iters_prologue);
3571 int peel_iters_epilogue = (niters - peel_iters_prologue) % assumed_vf;
3572 /* If we need to peel for gaps but the computed epilogue peel count is
3573 zero, we have to peel VF iterations instead. */
3574 if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo) && !peel_iters_epilogue)
3575 peel_iters_epilogue = assumed_vf;
3576 return peel_iters_epilogue;
3580 /* Calculate cost of peeling the loop PEEL_ITERS_PROLOGUE times. */
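/* For example (illustrative numbers): if the scalar body's cost vector has
   four entries of count 1 and PEEL_ITERS_PROLOGUE is 3, four entries of
   count 3 are recorded against the prologue; a taken branch per peeled
   loop is added only when the scalar iteration count is not known.  */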
3581 int
3582 vect_get_known_peeling_cost (loop_vec_info loop_vinfo, int peel_iters_prologue,
3583 int *peel_iters_epilogue,
3584 stmt_vector_for_cost *scalar_cost_vec,
3585 stmt_vector_for_cost *prologue_cost_vec,
3586 stmt_vector_for_cost *epilogue_cost_vec)
3588 int retval = 0;
3590 *peel_iters_epilogue
3591 = vect_get_peel_iters_epilogue (loop_vinfo, peel_iters_prologue);
3593 if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
3595 /* If peeled iterations are known but the number of scalar loop
3596 iterations is unknown, count a taken branch per peeled loop. */
3597 if (peel_iters_prologue > 0)
3598 retval = record_stmt_cost (prologue_cost_vec, 1, cond_branch_taken,
3599 NULL, NULL_TREE, 0, vect_prologue);
3600 if (*peel_iters_epilogue > 0)
3601 retval += record_stmt_cost (epilogue_cost_vec, 1, cond_branch_taken,
3602 NULL, NULL_TREE, 0, vect_epilogue);
3605 stmt_info_for_cost *si;
3606 int j;
3607 if (peel_iters_prologue)
3608 FOR_EACH_VEC_ELT (*scalar_cost_vec, j, si)
3609 retval += record_stmt_cost (prologue_cost_vec,
3610 si->count * peel_iters_prologue,
3611 si->kind, si->stmt_info, si->misalign,
3612 vect_prologue);
3613 if (*peel_iters_epilogue)
3614 FOR_EACH_VEC_ELT (*scalar_cost_vec, j, si)
3615 retval += record_stmt_cost (epilogue_cost_vec,
3616 si->count * *peel_iters_epilogue,
3617 si->kind, si->stmt_info, si->misalign,
3618 vect_epilogue);
3620 return retval;
3623 /* Function vect_estimate_min_profitable_iters
3625 Return the number of iterations required for the vector version of the
3626 loop to be profitable relative to the cost of the scalar version of the
3627 loop.
3629 *RET_MIN_PROFITABLE_NITERS is a cost model profitability threshold
4630 of iterations for vectorization. A value of -1 means loop vectorization
4631 is not profitable. This returned value may be used for a dynamic
3632 profitability check.
3634 *RET_MIN_PROFITABLE_ESTIMATE is a profitability threshold to be used
3635 for static check against estimated number of iterations. */
3637 static void
3638 vect_estimate_min_profitable_iters (loop_vec_info loop_vinfo,
3639 int *ret_min_profitable_niters,
3640 int *ret_min_profitable_estimate)
3642 int min_profitable_iters;
3643 int min_profitable_estimate;
3644 int peel_iters_prologue;
3645 int peel_iters_epilogue;
3646 unsigned vec_inside_cost = 0;
3647 int vec_outside_cost = 0;
3648 unsigned vec_prologue_cost = 0;
3649 unsigned vec_epilogue_cost = 0;
3650 int scalar_single_iter_cost = 0;
3651 int scalar_outside_cost = 0;
3652 int assumed_vf = vect_vf_for_cost (loop_vinfo);
3653 int npeel = LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo);
3654 void *target_cost_data = LOOP_VINFO_TARGET_COST_DATA (loop_vinfo);
3656 /* Cost model disabled. */
3657 if (unlimited_cost_model (LOOP_VINFO_LOOP (loop_vinfo)))
3659 if (dump_enabled_p ())
3660 dump_printf_loc (MSG_NOTE, vect_location, "cost model disabled.\n");
3661 *ret_min_profitable_niters = 0;
3662 *ret_min_profitable_estimate = 0;
3663 return;
3666 /* Requires loop versioning tests to handle misalignment. */
3667 if (LOOP_REQUIRES_VERSIONING_FOR_ALIGNMENT (loop_vinfo))
3669 /* FIXME: Make cost depend on complexity of individual check. */
3670 unsigned len = LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).length ();
3671 (void) add_stmt_cost (loop_vinfo, target_cost_data, len, vector_stmt,
3672 NULL, NULL_TREE, 0, vect_prologue);
3673 if (dump_enabled_p ())
3674 dump_printf (MSG_NOTE,
3675 "cost model: Adding cost of checks for loop "
3676 "versioning to treat misalignment.\n");
3679 /* Requires loop versioning with alias checks. */
3680 if (LOOP_REQUIRES_VERSIONING_FOR_ALIAS (loop_vinfo))
3682 /* FIXME: Make cost depend on complexity of individual check. */
3683 unsigned len = LOOP_VINFO_COMP_ALIAS_DDRS (loop_vinfo).length ();
3684 (void) add_stmt_cost (loop_vinfo, target_cost_data, len, vector_stmt,
3685 NULL, NULL_TREE, 0, vect_prologue);
3686 len = LOOP_VINFO_CHECK_UNEQUAL_ADDRS (loop_vinfo).length ();
3687 if (len)
3688 /* Count LEN - 1 ANDs and LEN comparisons. */
3689 (void) add_stmt_cost (loop_vinfo, target_cost_data, len * 2 - 1,
3690 scalar_stmt, NULL, NULL_TREE, 0, vect_prologue);
3691 len = LOOP_VINFO_LOWER_BOUNDS (loop_vinfo).length ();
3692 if (len)
3694 /* Count LEN - 1 ANDs and LEN comparisons. */
3695 unsigned int nstmts = len * 2 - 1;
3696 /* +1 for each bias that needs adding. */
3697 for (unsigned int i = 0; i < len; ++i)
3698 if (!LOOP_VINFO_LOWER_BOUNDS (loop_vinfo)[i].unsigned_p)
3699 nstmts += 1;
3700 (void) add_stmt_cost (loop_vinfo, target_cost_data, nstmts,
3701 scalar_stmt, NULL, NULL_TREE, 0, vect_prologue);
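/* For instance (illustrative only): two lower-bound checks, one of them
   requiring a bias because its bound is not unsigned, give
   2 * 2 - 1 + 1 = 4 scalar statements.  */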
3703 if (dump_enabled_p ())
3704 dump_printf (MSG_NOTE,
3705 "cost model: Adding cost of checks for loop "
3706 "versioning aliasing.\n");
3709 /* Requires loop versioning with niter checks. */
3710 if (LOOP_REQUIRES_VERSIONING_FOR_NITERS (loop_vinfo))
3712 /* FIXME: Make cost depend on complexity of individual check. */
3713 (void) add_stmt_cost (loop_vinfo, target_cost_data, 1, vector_stmt,
3714 NULL, NULL_TREE, 0, vect_prologue);
3715 if (dump_enabled_p ())
3716 dump_printf (MSG_NOTE,
3717 "cost model: Adding cost of checks for loop "
3718 "versioning niters.\n");
3721 if (LOOP_REQUIRES_VERSIONING (loop_vinfo))
3722 (void) add_stmt_cost (loop_vinfo, target_cost_data, 1, cond_branch_taken,
3723 NULL, NULL_TREE, 0, vect_prologue);
3725 /* Count statements in scalar loop. Using this as scalar cost for a single
3726 iteration for now.
3728 TODO: Add outer loop support.
3730 TODO: Consider assigning different costs to different scalar
3731 statements. */
3733 scalar_single_iter_cost
3734 = LOOP_VINFO_SINGLE_SCALAR_ITERATION_COST (loop_vinfo);
3736 /* Add additional cost for the peeled instructions in prologue and epilogue
3737 loop. (For fully-masked loops there will be no peeling.)
3739 FORNOW: If we don't know the value of peel_iters for prologue or epilogue
3740 at compile time, we assume it's vf/2 (the worst would be vf-1).
3742 TODO: Build an expression that represents peel_iters for prologue and
3743 epilogue to be used in a run-time test. */
3745 bool prologue_need_br_taken_cost = false;
3746 bool prologue_need_br_not_taken_cost = false;
3748 /* Calculate peel_iters_prologue. */
3749 if (vect_use_loop_mask_for_alignment_p (loop_vinfo))
3750 peel_iters_prologue = 0;
3751 else if (npeel < 0)
3753 peel_iters_prologue = assumed_vf / 2;
3754 if (dump_enabled_p ())
3755 dump_printf (MSG_NOTE, "cost model: "
3756 "prologue peel iters set to vf/2.\n");
3758 /* If peeled iterations are unknown, count a taken branch and a not taken
3759 branch per peeled loop. Even if scalar loop iterations are known,
3760 vector iterations are not known since peeled prologue iterations are
3761 not known. Hence guards remain the same. */
3762 prologue_need_br_taken_cost = true;
3763 prologue_need_br_not_taken_cost = true;
3765 else
3767 peel_iters_prologue = npeel;
3768 if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo) && peel_iters_prologue > 0)
3769 /* If peeled iterations are known but the number of scalar loop
3770 iterations is unknown, count a taken branch per peeled loop. */
3771 prologue_need_br_taken_cost = true;
3774 bool epilogue_need_br_taken_cost = false;
3775 bool epilogue_need_br_not_taken_cost = false;
3777 /* Calculate peel_iters_epilogue. */
3778 if (LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))
3779 /* We need to peel exactly one iteration for gaps. */
3780 peel_iters_epilogue = LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo) ? 1 : 0;
3781 else if (npeel < 0)
3783 /* If peeling for alignment is unknown, loop bound of main loop
3784 becomes unknown. */
3785 peel_iters_epilogue = assumed_vf / 2;
3786 if (dump_enabled_p ())
3787 dump_printf (MSG_NOTE, "cost model: "
3788 "epilogue peel iters set to vf/2 because "
3789 "peeling for alignment is unknown.\n");
3791 /* See the same reason above in peel_iters_prologue calculation. */
3792 epilogue_need_br_taken_cost = true;
3793 epilogue_need_br_not_taken_cost = true;
3795 else
3797 peel_iters_epilogue = vect_get_peel_iters_epilogue (loop_vinfo, npeel);
3798 if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo) && peel_iters_epilogue > 0)
3799 /* If peeled iterations are known but the number of scalar loop
3800 iterations is unknown, count a taken branch per peeled loop. */
3801 epilogue_need_br_taken_cost = true;
3804 stmt_info_for_cost *si;
3805 int j;
3806 /* Add costs associated with peel_iters_prologue. */
3807 if (peel_iters_prologue)
3808 FOR_EACH_VEC_ELT (LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo), j, si)
3810 (void) add_stmt_cost (loop_vinfo, target_cost_data,
3811 si->count * peel_iters_prologue, si->kind,
3812 si->stmt_info, si->vectype, si->misalign,
3813 vect_prologue);
3816 /* Add costs associated with peel_iters_epilogue. */
3817 if (peel_iters_epilogue)
3818 FOR_EACH_VEC_ELT (LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo), j, si)
3820 (void) add_stmt_cost (loop_vinfo, target_cost_data,
3821 si->count * peel_iters_epilogue, si->kind,
3822 si->stmt_info, si->vectype, si->misalign,
3823 vect_epilogue);
3826 /* Add possible cond_branch_taken/cond_branch_not_taken cost. */
3828 if (prologue_need_br_taken_cost)
3829 (void) add_stmt_cost (loop_vinfo, target_cost_data, 1, cond_branch_taken,
3830 NULL, NULL_TREE, 0, vect_prologue);
3832 if (prologue_need_br_not_taken_cost)
3833 (void) add_stmt_cost (loop_vinfo, target_cost_data, 1,
3834 cond_branch_not_taken, NULL, NULL_TREE, 0,
3835 vect_prologue);
3837 if (epilogue_need_br_taken_cost)
3838 (void) add_stmt_cost (loop_vinfo, target_cost_data, 1, cond_branch_taken,
3839 NULL, NULL_TREE, 0, vect_epilogue);
3841 if (epilogue_need_br_not_taken_cost)
3842 (void) add_stmt_cost (loop_vinfo, target_cost_data, 1,
3843 cond_branch_not_taken, NULL, NULL_TREE, 0,
3844 vect_epilogue);
3846 /* Take care of special costs for rgroup controls of partial vectors. */
3847 if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
3849 /* Calculate how many masks we need to generate. */
3850 unsigned int num_masks = 0;
3851 rgroup_controls *rgm;
3852 unsigned int num_vectors_m1;
3853 FOR_EACH_VEC_ELT (LOOP_VINFO_MASKS (loop_vinfo), num_vectors_m1, rgm)
3854 if (rgm->type)
3855 num_masks += num_vectors_m1 + 1;
3856 gcc_assert (num_masks > 0);
3858 /* In the worst case, we need to generate each mask in the prologue
3859 and in the loop body. One of the loop body mask instructions
3860 replaces the comparison in the scalar loop, and since we don't
3861 count the scalar comparison against the scalar body, we shouldn't
3862 count that vector instruction against the vector body either.
3864 Sometimes we can use unpacks instead of generating prologue
3865 masks and sometimes the prologue mask will fold to a constant,
3866 so the actual prologue cost might be smaller. However, it's
3867 simpler and safer to use the worst-case cost; if this ends up
3868 being the tie-breaker between vectorizing or not, then it's
3869 probably better not to vectorize. */
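/* As an illustration (arbitrary rgroup shapes): two mask rgroups needing
   2 and 1 vectors respectively give num_masks = 2 + 1 = 3, so three mask
   statements are costed against the prologue and two against the loop
   body.  */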
3870 (void) add_stmt_cost (loop_vinfo, target_cost_data, num_masks,
3871 vector_stmt, NULL, NULL_TREE, 0, vect_prologue);
3872 (void) add_stmt_cost (loop_vinfo, target_cost_data, num_masks - 1,
3873 vector_stmt, NULL, NULL_TREE, 0, vect_body);
3875 else if (LOOP_VINFO_FULLY_WITH_LENGTH_P (loop_vinfo))
3877 /* Referring to the functions vect_set_loop_condition_partial_vectors
3878 and vect_set_loop_controls_directly, we need to generate each
3879 length in the prologue and in the loop body if required. Although
3880 there are some possible optimizations, we consider the worst case
3881 here. */
3883 bool niters_known_p = LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo);
3884 bool need_iterate_p
3885 = (!LOOP_VINFO_EPILOGUE_P (loop_vinfo)
3886 && !vect_known_niters_smaller_than_vf (loop_vinfo));
3888 /* Calculate how many statements need to be added. */
3889 unsigned int prologue_stmts = 0;
3890 unsigned int body_stmts = 0;
3892 rgroup_controls *rgc;
3893 unsigned int num_vectors_m1;
3894 FOR_EACH_VEC_ELT (LOOP_VINFO_LENS (loop_vinfo), num_vectors_m1, rgc)
3895 if (rgc->type)
3897 /* May need one SHIFT for nitems_total computation. */
3898 unsigned nitems = rgc->max_nscalars_per_iter * rgc->factor;
3899 if (nitems != 1 && !niters_known_p)
3900 prologue_stmts += 1;
3902 /* May need one MAX and one MINUS for wrap around. */
3903 if (vect_rgroup_iv_might_wrap_p (loop_vinfo, rgc))
3904 prologue_stmts += 2;
3906 /* Need one MAX and one MINUS for each batch limit except for
3907 the first one. */
3908 prologue_stmts += num_vectors_m1 * 2;
3910 unsigned int num_vectors = num_vectors_m1 + 1;
3912 /* Need to set up lengths in prologue, only one MIN required
3913 for each since start index is zero. */
3914 prologue_stmts += num_vectors;
3916 /* Each may need two MINs and one MINUS to update lengths in body
3917 for next iteration. */
3918 if (need_iterate_p)
3919 body_stmts += 3 * num_vectors;
3922 (void) add_stmt_cost (loop_vinfo, target_cost_data, prologue_stmts,
3923 scalar_stmt, NULL, NULL_TREE, 0, vect_prologue);
3924 (void) add_stmt_cost (loop_vinfo, target_cost_data, body_stmts,
3925 scalar_stmt, NULL, NULL_TREE, 0, vect_body);
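/* As an illustration (one possible configuration): a single length rgroup
   with one vector, nitems > 1, an unknown iteration count, no possible IV
   wrap-around and a loop that still needs an iteration-advancing update
   gives prologue_stmts = 1 + 1 = 2 and body_stmts = 3.  */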
3928 /* FORNOW: The scalar outside cost is incremented in one of the
3929 following ways:
3931 1. The vectorizer checks for alignment and aliasing and generates
3932 a condition that allows dynamic vectorization. A cost model
3933 check is ANDed with the versioning condition. Hence the scalar code
3934 path now has the added cost of the versioning check.
3936 if (cost > th & versioning_check)
3937 jmp to vector code
3939 Hence the run-time scalar cost is incremented by the not-taken branch cost.
3941 2. The vectorizer then checks if a prologue is required. If the
3942 cost model check was not done before during versioning, it has to
3943 be done before the prologue check.
3945 if (cost <= th)
3946 prologue = scalar_iters
3947 if (prologue == 0)
3948 jmp to vector code
3949 else
3950 execute prologue
3951 if (prologue == num_iters)
3952 go to exit
3954 Hence the run-time scalar cost is incremented by a taken branch,
3955 plus a not-taken branch, plus a taken branch cost.
3957 3. The vectorizer then checks if an epilogue is required. If the
3958 cost model check was not done before during prologue check, it
3959 has to be done with the epilogue check.
3961 if (prologue == 0)
3962 jmp to vector code
3963 else
3964 execute prologue
3965 if (prologue == num_iters)
3966 go to exit
3967 vector code:
3968 if ((cost <= th) | (scalar_iters-prologue-epilogue == 0))
3969 jmp to epilogue
3971 Hence the run-time scalar cost should be incremented by 2 taken
3972 branches.
3974 TODO: The back end may reorder the BBs differently and reverse
3975 conditions/branch directions. Change the estimates below to
3976 something more reasonable. */
3978 /* If the number of iterations is known and we do not do versioning, we can
3979 decide whether to vectorize at compile time. Hence the scalar version
3980 does not carry cost model guard costs. */
3981 if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
3982 || LOOP_REQUIRES_VERSIONING (loop_vinfo))
3984 /* Cost model check occurs at versioning. */
3985 if (LOOP_REQUIRES_VERSIONING (loop_vinfo))
3986 scalar_outside_cost += vect_get_stmt_cost (cond_branch_not_taken);
3987 else
3989 /* Cost model check occurs at prologue generation. */
3990 if (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) < 0)
3991 scalar_outside_cost += 2 * vect_get_stmt_cost (cond_branch_taken)
3992 + vect_get_stmt_cost (cond_branch_not_taken);
3993 /* Cost model check occurs at epilogue generation. */
3994 else
3995 scalar_outside_cost += 2 * vect_get_stmt_cost (cond_branch_taken);
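/* For example (illustrative target costs): with cond_branch_taken = 3 and
   cond_branch_not_taken = 1, an unversioned loop with an unknown iteration
   count and unknown alignment peeling accumulates
   scalar_outside_cost = 2 * 3 + 1 = 7, while a versioned loop adds just the
   not-taken branch cost of 1.  */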
3999 /* Complete the target-specific cost calculations. */
4000 finish_cost (LOOP_VINFO_TARGET_COST_DATA (loop_vinfo), &vec_prologue_cost,
4001 &vec_inside_cost, &vec_epilogue_cost);
4003 vec_outside_cost = (int)(vec_prologue_cost + vec_epilogue_cost);
4005 /* Stash the costs so that we can compare two loop_vec_infos. */
4006 loop_vinfo->vec_inside_cost = vec_inside_cost;
4007 loop_vinfo->vec_outside_cost = vec_outside_cost;
4009 if (dump_enabled_p ())
4011 dump_printf_loc (MSG_NOTE, vect_location, "Cost model analysis: \n");
4012 dump_printf (MSG_NOTE, " Vector inside of loop cost: %d\n",
4013 vec_inside_cost);
4014 dump_printf (MSG_NOTE, " Vector prologue cost: %d\n",
4015 vec_prologue_cost);
4016 dump_printf (MSG_NOTE, " Vector epilogue cost: %d\n",
4017 vec_epilogue_cost);
4018 dump_printf (MSG_NOTE, " Scalar iteration cost: %d\n",
4019 scalar_single_iter_cost);
4020 dump_printf (MSG_NOTE, " Scalar outside cost: %d\n",
4021 scalar_outside_cost);
4022 dump_printf (MSG_NOTE, " Vector outside cost: %d\n",
4023 vec_outside_cost);
4024 dump_printf (MSG_NOTE, " prologue iterations: %d\n",
4025 peel_iters_prologue);
4026 dump_printf (MSG_NOTE, " epilogue iterations: %d\n",
4027 peel_iters_epilogue);
4030 /* Calculate number of iterations required to make the vector version
4031 profitable, relative to the loop bodies only. The following condition
4032 must hold true:
4033 SIC * niters + SOC > VIC * ((niters - NPEEL) / VF) + VOC
4034 where
4035 SIC = scalar iteration cost, VIC = vector iteration cost,
4036 VOC = vector outside cost, VF = vectorization factor,
4037 NPEEL = prologue iterations + epilogue iterations,
4038 SOC = scalar outside cost for run time cost model check. */
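/* As an illustration (arbitrary costs): SIC = 4, VF = 4 and VIC = 12 give a
   saving of 4 * 4 - 12 = 4 cost units per vector iteration; if this saving
   is not positive, vectorization cannot be profitable and we give up
   below.  */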
4040 int saving_per_viter = (scalar_single_iter_cost * assumed_vf
4041 - vec_inside_cost);
4042 if (saving_per_viter <= 0)
4044 if (LOOP_VINFO_LOOP (loop_vinfo)->force_vectorize)
4045 warning_at (vect_location.get_location_t (), OPT_Wopenmp_simd,
4046 "vectorization did not happen for a simd loop");
4048 if (dump_enabled_p ())
4049 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4050 "cost model: the vector iteration cost = %d "
4051 "divided by the scalar iteration cost = %d "
4052 "is greater or equal to the vectorization factor = %d"
4053 ".\n",
4054 vec_inside_cost, scalar_single_iter_cost, assumed_vf);
4055 *ret_min_profitable_niters = -1;
4056 *ret_min_profitable_estimate = -1;
4057 return;
4060 /* ??? The "if" arm is written to handle all cases; see below for what
4061 we would do for !LOOP_VINFO_USING_PARTIAL_VECTORS_P. */
4062 if (LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))
4064 /* Rewriting the condition above in terms of the number of
4065 vector iterations (vniters) rather than the number of
4066 scalar iterations (niters) gives:
4068 SIC * (vniters * VF + NPEEL) + SOC > VIC * vniters + VOC
4070 <==> vniters * (SIC * VF - VIC) > VOC - SIC * NPEEL - SOC
4072 For integer N, X and Y when X > 0:
4074 N * X > Y <==> N >= (Y /[floor] X) + 1. */
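/* For example, Y = 10 and X = 4: N * 4 > 10 first holds for
   N = (10 / 4) + 1 = 3.  */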
4075 int outside_overhead = (vec_outside_cost
4076 - scalar_single_iter_cost * peel_iters_prologue
4077 - scalar_single_iter_cost * peel_iters_epilogue
4078 - scalar_outside_cost);
4079 /* We're only interested in cases that require at least one
4080 vector iteration. */
4081 int min_vec_niters = 1;
4082 if (outside_overhead > 0)
4083 min_vec_niters = outside_overhead / saving_per_viter + 1;
4085 if (dump_enabled_p ())
4086 dump_printf (MSG_NOTE, " Minimum number of vector iterations: %d\n",
4087 min_vec_niters);
4089 if (LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))
4091 /* Now that we know the minimum number of vector iterations,
4092 find the minimum niters for which the scalar cost is larger:
4094 SIC * niters > VIC * vniters + VOC - SOC
4096 We know that the minimum niters is no more than
4097 vniters * VF + NPEEL, but it might be (and often is) less
4098 than that if a partial vector iteration is cheaper than the
4099 equivalent scalar code. */
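/* Continuing with illustrative numbers: VIC = 12, min_vec_niters = 3,
   VOC = 30 and SOC = 6 give threshold = 12 * 3 + 30 - 6 = 60; with
   SIC = 4 that yields min_profitable_iters = 60 / 4 + 1 = 16.  */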
4100 int threshold = (vec_inside_cost * min_vec_niters
4101 + vec_outside_cost
4102 - scalar_outside_cost);
4103 if (threshold <= 0)
4104 min_profitable_iters = 1;
4105 else
4106 min_profitable_iters = threshold / scalar_single_iter_cost + 1;
4108 else
4109 /* Convert the number of vector iterations into a number of
4110 scalar iterations. */
4111 min_profitable_iters = (min_vec_niters * assumed_vf
4112 + peel_iters_prologue
4113 + peel_iters_epilogue);
4115 else
4117 min_profitable_iters = ((vec_outside_cost - scalar_outside_cost)
4118 * assumed_vf
4119 - vec_inside_cost * peel_iters_prologue
4120 - vec_inside_cost * peel_iters_epilogue);
4121 if (min_profitable_iters <= 0)
4122 min_profitable_iters = 0;
4123 else
4125 min_profitable_iters /= saving_per_viter;
4127 if ((scalar_single_iter_cost * assumed_vf * min_profitable_iters)
4128 <= (((int) vec_inside_cost * min_profitable_iters)
4129 + (((int) vec_outside_cost - scalar_outside_cost)
4130 * assumed_vf)))
4131 min_profitable_iters++;
4135 if (dump_enabled_p ())
4136 dump_printf (MSG_NOTE,
4137 " Calculated minimum iters for profitability: %d\n",
4138 min_profitable_iters);
4140 if (!LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo)
4141 && min_profitable_iters < (assumed_vf + peel_iters_prologue))
4142 /* We want the vectorized loop to execute at least once. */
4143 min_profitable_iters = assumed_vf + peel_iters_prologue;
4144 else if (min_profitable_iters < peel_iters_prologue)
4145 /* For LOOP_VINFO_USING_PARTIAL_VECTORS_P, we need to ensure the
4146 vectorized loop executes at least once. */
4147 min_profitable_iters = peel_iters_prologue;
4149 if (dump_enabled_p ())
4150 dump_printf_loc (MSG_NOTE, vect_location,
4151 " Runtime profitability threshold = %d\n",
4152 min_profitable_iters);
4154 *ret_min_profitable_niters = min_profitable_iters;
4156 /* Calculate number of iterations required to make the vector version
4157 profitable, relative to the loop bodies only.
4159 The non-vectorized variant costs SIC * niters and it must win over the vector
4160 variant on the expected loop trip count. The following condition must hold true:
4161 SIC * niters > VIC * ((niters - NPEEL) / VF) + VOC + SOC */
4163 if (vec_outside_cost <= 0)
4164 min_profitable_estimate = 0;
4165 /* ??? This "else if" arm is written to handle all cases; see below for
4166 what we would do for !LOOP_VINFO_USING_PARTIAL_VECTORS_P. */
4167 else if (LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))
4169 /* This is a repeat of the code above, but with + SOC rather
4170 than - SOC. */
4171 int outside_overhead = (vec_outside_cost
4172 - scalar_single_iter_cost * peel_iters_prologue
4173 - scalar_single_iter_cost * peel_iters_epilogue
4174 + scalar_outside_cost);
4175 int min_vec_niters = 1;
4176 if (outside_overhead > 0)
4177 min_vec_niters = outside_overhead / saving_per_viter + 1;
4179 if (LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))
4181 int threshold = (vec_inside_cost * min_vec_niters
4182 + vec_outside_cost
4183 + scalar_outside_cost);
4184 min_profitable_estimate = threshold / scalar_single_iter_cost + 1;
4186 else
4187 min_profitable_estimate = (min_vec_niters * assumed_vf
4188 + peel_iters_prologue
4189 + peel_iters_epilogue);
4191 else
4193 min_profitable_estimate = ((vec_outside_cost + scalar_outside_cost)
4194 * assumed_vf
4195 - vec_inside_cost * peel_iters_prologue
4196 - vec_inside_cost * peel_iters_epilogue)
4197 / ((scalar_single_iter_cost * assumed_vf)
4198 - vec_inside_cost);
4200 min_profitable_estimate = MAX (min_profitable_estimate, min_profitable_iters);
4201 if (dump_enabled_p ())
4202 dump_printf_loc (MSG_NOTE, vect_location,
4203 " Static estimate profitability threshold = %d\n",
4204 min_profitable_estimate);
4206 *ret_min_profitable_estimate = min_profitable_estimate;
4209 /* Writes into SEL a mask for a vec_perm, equivalent to a vec_shr by OFFSET
4210 vector elements (not bits) for a vector with NELT elements. */
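/* For example, OFFSET = 2 and NELT = 8 encode the stepped selection
   { 2, 3, 4, ... }, i.e. { 2, 3, 4, 5, 6, 7, 8, 9 } once extended: lanes
   2..7 are taken from the first input and lanes 8..9 from the second,
   which acts as a shift by two elements when the second input is zero.  */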
4211 static void
4212 calc_vec_perm_mask_for_shift (unsigned int offset, unsigned int nelt,
4213 vec_perm_builder *sel)
4215 /* The encoding is a single stepped pattern. Any wrap-around is handled
4216 by vec_perm_indices. */
4217 sel->new_vector (nelt, 1, 3);
4218 for (unsigned int i = 0; i < 3; i++)
4219 sel->quick_push (i + offset);
4222 /* Checks whether the target supports whole-vector shifts for vectors of mode
4223 MODE. This is the case if _either_ the platform handles vec_shr_optab, _or_
4224 it supports vec_perm_const with masks for all necessary shift amounts. */
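/* For example, with 8-element vectors only shifts by 4, 2 and 1 elements
   are checked, matching the halving steps used by the shift-based
   reduction epilogue.  */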
4225 static bool
4226 have_whole_vector_shift (machine_mode mode)
4228 if (optab_handler (vec_shr_optab, mode) != CODE_FOR_nothing)
4229 return true;
4231 /* Variable-length vectors should be handled via the optab. */
4232 unsigned int nelt;
4233 if (!GET_MODE_NUNITS (mode).is_constant (&nelt))
4234 return false;
4236 vec_perm_builder sel;
4237 vec_perm_indices indices;
4238 for (unsigned int i = nelt / 2; i >= 1; i /= 2)
4240 calc_vec_perm_mask_for_shift (i, nelt, &sel);
4241 indices.new_vector (sel, 2, nelt);
4242 if (!can_vec_perm_const_p (mode, indices, false))
4243 return false;
4245 return true;
4248 /* TODO: Close dependency between vect_model_*_cost and vectorizable_*
4249 functions. Design better to avoid maintenance issues. */
4251 /* Function vect_model_reduction_cost.
4253 Models cost for a reduction operation, including the vector ops
4254 generated within the strip-mine loop, the initial definition before
4255 the loop, and the epilogue code that must be generated. */
4257 static void
4258 vect_model_reduction_cost (loop_vec_info loop_vinfo,
4259 stmt_vec_info stmt_info, internal_fn reduc_fn,
4260 vect_reduction_type reduction_type,
4261 int ncopies, stmt_vector_for_cost *cost_vec)
4263 int prologue_cost = 0, epilogue_cost = 0, inside_cost;
4264 enum tree_code code;
4265 optab optab;
4266 tree vectype;
4267 machine_mode mode;
4268 class loop *loop = NULL;
4270 if (loop_vinfo)
4271 loop = LOOP_VINFO_LOOP (loop_vinfo);
4273 /* Condition reductions generate two reductions in the loop. */
4274 if (reduction_type == COND_REDUCTION)
4275 ncopies *= 2;
4277 vectype = STMT_VINFO_VECTYPE (stmt_info);
4278 mode = TYPE_MODE (vectype);
4279 stmt_vec_info orig_stmt_info = vect_orig_stmt (stmt_info);
4281 code = gimple_assign_rhs_code (orig_stmt_info->stmt);
4283 if (reduction_type == EXTRACT_LAST_REDUCTION)
4284 /* No extra instructions are needed in the prologue. The loop body
4285 operations are costed in vectorizable_condition. */
4286 inside_cost = 0;
4287 else if (reduction_type == FOLD_LEFT_REDUCTION)
4289 /* No extra instructions needed in the prologue. */
4290 prologue_cost = 0;
4292 if (reduc_fn != IFN_LAST)
4293 /* Count one reduction-like operation per vector. */
4294 inside_cost = record_stmt_cost (cost_vec, ncopies, vec_to_scalar,
4295 stmt_info, 0, vect_body);
4296 else
4298 /* Use NELEMENTS extracts and NELEMENTS scalar ops. */
4299 unsigned int nelements = ncopies * vect_nunits_for_cost (vectype);
4300 inside_cost = record_stmt_cost (cost_vec, nelements,
4301 vec_to_scalar, stmt_info, 0,
4302 vect_body);
4303 inside_cost += record_stmt_cost (cost_vec, nelements,
4304 scalar_stmt, stmt_info, 0,
4305 vect_body);
4308 else
4310 /* Add in cost for initial definition.
4311 For cond reduction we have four vectors: initial index, step,
4312 initial result of the data reduction, initial value of the index
4313 reduction. */
4314 int prologue_stmts = reduction_type == COND_REDUCTION ? 4 : 1;
4315 prologue_cost += record_stmt_cost (cost_vec, prologue_stmts,
4316 scalar_to_vec, stmt_info, 0,
4317 vect_prologue);
4319 /* Cost of reduction op inside loop. */
4320 inside_cost = record_stmt_cost (cost_vec, ncopies, vector_stmt,
4321 stmt_info, 0, vect_body);
4324 /* Determine cost of epilogue code.
4326 We have a reduction operator that will reduce the vector in one statement.
4327 Also requires scalar extract. */
4329 if (!loop || !nested_in_vect_loop_p (loop, orig_stmt_info))
4331 if (reduc_fn != IFN_LAST)
4333 if (reduction_type == COND_REDUCTION)
4335 /* An EQ stmt and a COND_EXPR stmt. */
4336 epilogue_cost += record_stmt_cost (cost_vec, 2,
4337 vector_stmt, stmt_info, 0,
4338 vect_epilogue);
4339 /* Reduction of the max index and a reduction of the found
4340 values. */
4341 epilogue_cost += record_stmt_cost (cost_vec, 2,
4342 vec_to_scalar, stmt_info, 0,
4343 vect_epilogue);
4344 /* A broadcast of the max value. */
4345 epilogue_cost += record_stmt_cost (cost_vec, 1,
4346 scalar_to_vec, stmt_info, 0,
4347 vect_epilogue);
4349 else
4351 epilogue_cost += record_stmt_cost (cost_vec, 1, vector_stmt,
4352 stmt_info, 0, vect_epilogue);
4353 epilogue_cost += record_stmt_cost (cost_vec, 1,
4354 vec_to_scalar, stmt_info, 0,
4355 vect_epilogue);
4358 else if (reduction_type == COND_REDUCTION)
4360 unsigned estimated_nunits = vect_nunits_for_cost (vectype);
4361 /* Extraction of scalar elements. */
4362 epilogue_cost += record_stmt_cost (cost_vec,
4363 2 * estimated_nunits,
4364 vec_to_scalar, stmt_info, 0,
4365 vect_epilogue);
4366 /* Scalar max reductions via COND_EXPR / MAX_EXPR. */
4367 epilogue_cost += record_stmt_cost (cost_vec,
4368 2 * estimated_nunits - 3,
4369 scalar_stmt, stmt_info, 0,
4370 vect_epilogue);
4372 else if (reduction_type == EXTRACT_LAST_REDUCTION
4373 || reduction_type == FOLD_LEFT_REDUCTION)
4374 /* No extra instructions are needed in the epilogue. */
4375 ;
4376 else
4378 int vec_size_in_bits = tree_to_uhwi (TYPE_SIZE (vectype));
4379 tree bitsize =
4380 TYPE_SIZE (TREE_TYPE (gimple_assign_lhs (orig_stmt_info->stmt)));
4381 int element_bitsize = tree_to_uhwi (bitsize);
4382 int nelements = vec_size_in_bits / element_bitsize;
4384 if (code == COND_EXPR)
4385 code = MAX_EXPR;
4387 optab = optab_for_tree_code (code, vectype, optab_default);
4389 /* We have a whole vector shift available. */
4390 if (optab != unknown_optab
4391 && VECTOR_MODE_P (mode)
4392 && optab_handler (optab, mode) != CODE_FOR_nothing
4393 && have_whole_vector_shift (mode))
4395 /* Final reduction via vector shifts and the reduction operator.
4396 Also requires scalar extract. */
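/* E.g. for an 8-element vector this is exact_log2 (8) * 2 = 6 vector
   statements (three shifts and three reduction ops) plus one extract.  */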
4397 epilogue_cost += record_stmt_cost (cost_vec,
4398 exact_log2 (nelements) * 2,
4399 vector_stmt, stmt_info, 0,
4400 vect_epilogue);
4401 epilogue_cost += record_stmt_cost (cost_vec, 1,
4402 vec_to_scalar, stmt_info, 0,
4403 vect_epilogue);
4405 else
4406 /* Use extracts and reduction op for final reduction. For N
4407 elements, we have N extracts and N-1 reduction ops. */
4408 epilogue_cost += record_stmt_cost (cost_vec,
4409 nelements + nelements - 1,
4410 vector_stmt, stmt_info, 0,
4411 vect_epilogue);
4415 if (dump_enabled_p ())
4416 dump_printf (MSG_NOTE,
4417 "vect_model_reduction_cost: inside_cost = %d, "
4418 "prologue_cost = %d, epilogue_cost = %d .\n", inside_cost,
4419 prologue_cost, epilogue_cost);
4423 /* Function vect_model_induction_cost.
4425 Models cost for induction operations. */
4427 static void
4428 vect_model_induction_cost (stmt_vec_info stmt_info, int ncopies,
4429 stmt_vector_for_cost *cost_vec)
4431 unsigned inside_cost, prologue_cost;
4433 if (PURE_SLP_STMT (stmt_info))
4434 return;
4436 /* loop cost for vec_loop. */
4437 inside_cost = record_stmt_cost (cost_vec, ncopies, vector_stmt,
4438 stmt_info, 0, vect_body);
4440 /* prologue cost for vec_init and vec_step. */
4441 prologue_cost = record_stmt_cost (cost_vec, 2, scalar_to_vec,
4442 stmt_info, 0, vect_prologue);
4444 if (dump_enabled_p ())
4445 dump_printf_loc (MSG_NOTE, vect_location,
4446 "vect_model_induction_cost: inside_cost = %d, "
4447 "prologue_cost = %d .\n", inside_cost, prologue_cost);
4452 /* Function get_initial_def_for_reduction
4454 Input:
4455 STMT_VINFO - a stmt that performs a reduction operation in the loop.
4456 INIT_VAL - the initial value of the reduction variable
4458 Output:
4459 ADJUSTMENT_DEF - a tree that holds a value to be added to the final result
4460 of the reduction (used for adjusting the epilog - see below).
4461 Return a vector variable, initialized according to the operation that
4462 STMT_VINFO performs. This vector will be used as the initial value
4463 of the vector of partial results.
4465 Option1 (adjust in epilog): Initialize the vector as follows:
4466 add/bit or/xor: [0,0,...,0,0]
4467 mult/bit and: [1,1,...,1,1]
4468 min/max/cond_expr: [init_val,init_val,..,init_val,init_val]
4469 and when necessary (e.g. add/mult case) let the caller know
4470 that it needs to adjust the result by init_val.
4472 Option2: Initialize the vector as follows:
4473 add/bit or/xor: [init_val,0,0,...,0]
4474 mult/bit and: [init_val,1,1,...,1]
4475 min/max/cond_expr: [init_val,init_val,...,init_val]
4476 and no adjustments are needed.
4478 For example, for the following code:
4480 s = init_val;
4481 for (i=0;i<n;i++)
4482 s = s + a[i];
4484 STMT_VINFO is 's = s + a[i]', and the reduction variable is 's'.
4485 For a vector of 4 units, we want to return either [0,0,0,init_val],
4486 or [0,0,0,0] and let the caller know that it needs to adjust
4487 the result at the end by 'init_val'.
4489 FORNOW, we use the 'adjust in epilog' scheme (Option1) if ADJUSTMENT_DEF
4490 is not NULL, because this way the initialization vector is simpler (same
4491 element in all entries), and Option2 otherwise.
4493 A cost model should help decide between these two schemes. */
4495 static tree
4496 get_initial_def_for_reduction (loop_vec_info loop_vinfo,
4497 stmt_vec_info stmt_vinfo,
4498 enum tree_code code, tree init_val,
4499 tree *adjustment_def)
4501 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
4502 tree scalar_type = TREE_TYPE (init_val);
4503 tree vectype = get_vectype_for_scalar_type (loop_vinfo, scalar_type);
4504 tree def_for_init;
4505 tree init_def;
4506 REAL_VALUE_TYPE real_init_val = dconst0;
4507 int int_init_val = 0;
4508 gimple_seq stmts = NULL;
4510 gcc_assert (vectype);
4512 gcc_assert (POINTER_TYPE_P (scalar_type) || INTEGRAL_TYPE_P (scalar_type)
4513 || SCALAR_FLOAT_TYPE_P (scalar_type));
4515 gcc_assert (nested_in_vect_loop_p (loop, stmt_vinfo)
4516 || loop == (gimple_bb (stmt_vinfo->stmt))->loop_father);
4518 /* ADJUSTMENT_DEF is NULL when called from
4519 vect_create_epilog_for_reduction to vectorize double reduction. */
4520 if (adjustment_def)
4521 *adjustment_def = NULL;
4523 switch (code)
4525 case WIDEN_SUM_EXPR:
4526 case DOT_PROD_EXPR:
4527 case SAD_EXPR:
4528 case PLUS_EXPR:
4529 case MINUS_EXPR:
4530 case BIT_IOR_EXPR:
4531 case BIT_XOR_EXPR:
4532 case MULT_EXPR:
4533 case BIT_AND_EXPR:
4535 if (code == MULT_EXPR)
4537 real_init_val = dconst1;
4538 int_init_val = 1;
4541 if (code == BIT_AND_EXPR)
4542 int_init_val = -1;
4544 if (SCALAR_FLOAT_TYPE_P (scalar_type))
4545 def_for_init = build_real (scalar_type, real_init_val);
4546 else
4547 def_for_init = build_int_cst (scalar_type, int_init_val);
4549 if (adjustment_def || operand_equal_p (def_for_init, init_val, 0))
4551 /* Option1: the first element is '0' or '1' as well. */
4552 if (!operand_equal_p (def_for_init, init_val, 0))
4553 *adjustment_def = init_val;
4554 init_def = gimple_build_vector_from_val (&stmts, vectype,
4555 def_for_init);
4557 else if (!TYPE_VECTOR_SUBPARTS (vectype).is_constant ())
4559 /* Option2 (variable length): the first element is INIT_VAL. */
4560 init_def = gimple_build_vector_from_val (&stmts, vectype,
4561 def_for_init);
4562 init_def = gimple_build (&stmts, CFN_VEC_SHL_INSERT,
4563 vectype, init_def, init_val);
4565 else
4567 /* Option2: the first element is INIT_VAL. */
4568 tree_vector_builder elts (vectype, 1, 2);
4569 elts.quick_push (init_val);
4570 elts.quick_push (def_for_init);
4571 init_def = gimple_build_vector (&stmts, &elts);
4574 break;
4576 case MIN_EXPR:
4577 case MAX_EXPR:
4578 case COND_EXPR:
4580 init_val = gimple_convert (&stmts, TREE_TYPE (vectype), init_val);
4581 init_def = gimple_build_vector_from_val (&stmts, vectype, init_val);
4583 break;
4585 default:
4586 gcc_unreachable ();
4589 if (stmts)
4590 gsi_insert_seq_on_edge_immediate (loop_preheader_edge (loop), stmts);
4591 return init_def;
4594 /* Get at the initial defs for the reduction PHIs in SLP_NODE.
4595 NUMBER_OF_VECTORS is the number of vector defs to create.
4596 If NEUTRAL_OP is nonnull, introducing extra elements of that
4597 value will not change the result. */
4599 static void
4600 get_initial_defs_for_reduction (vec_info *vinfo,
4601 slp_tree slp_node,
4602 vec<tree> *vec_oprnds,
4603 unsigned int number_of_vectors,
4604 bool reduc_chain, tree neutral_op)
4606 vec<stmt_vec_info> stmts = SLP_TREE_SCALAR_STMTS (slp_node);
4607 stmt_vec_info stmt_vinfo = stmts[0];
4608 unsigned HOST_WIDE_INT nunits;
4609 unsigned j, number_of_places_left_in_vector;
4610 tree vector_type;
4611 unsigned int group_size = stmts.length ();
4612 unsigned int i;
4613 class loop *loop;
4615 vector_type = STMT_VINFO_VECTYPE (stmt_vinfo);
4617 gcc_assert (STMT_VINFO_DEF_TYPE (stmt_vinfo) == vect_reduction_def);
4619 loop = (gimple_bb (stmt_vinfo->stmt))->loop_father;
4620 gcc_assert (loop);
4621 edge pe = loop_preheader_edge (loop);
4623 gcc_assert (!reduc_chain || neutral_op);
4625 /* NUMBER_OF_COPIES is the number of times we need to use the same values in
4626 created vectors. It is greater than 1 if unrolling is performed.
4628 For example, we have two scalar operands, s1 and s2 (e.g., group of
4629 strided accesses of size two), while NUNITS is four (i.e., four scalars
4630 of this type can be packed in a vector). The output vector will contain
4631 two copies of each scalar operand: {s1, s2, s1, s2}. (NUMBER_OF_COPIES
4632 will be 2).
4634 If REDUC_GROUP_SIZE > NUNITS, the scalars will be split into several
4635 vectors containing the operands.
4637 For example, NUNITS is four as before, and the group size is 8
4638 (s1, s2, ..., s8). We will create two vectors {s1, s2, s3, s4} and
4639 {s5, s6, s7, s8}. */
4641 if (!TYPE_VECTOR_SUBPARTS (vector_type).is_constant (&nunits))
4642 nunits = group_size;
4644 number_of_places_left_in_vector = nunits;
4645 bool constant_p = true;
4646 tree_vector_builder elts (vector_type, nunits, 1);
4647 elts.quick_grow (nunits);
4648 gimple_seq ctor_seq = NULL;
4649 for (j = 0; j < nunits * number_of_vectors; ++j)
4651 tree op;
4652 i = j % group_size;
4653 stmt_vinfo = stmts[i];
4655 /* Get the def before the loop. In a reduction chain we have only
4656 one initial value; otherwise we have as many as there are PHIs in the group. */
4657 if (reduc_chain)
4658 op = j != 0 ? neutral_op : PHI_ARG_DEF_FROM_EDGE (stmt_vinfo->stmt, pe);
4659 else if (((vec_oprnds->length () + 1) * nunits
4660 - number_of_places_left_in_vector >= group_size)
4661 && neutral_op)
4662 op = neutral_op;
4663 else
4664 op = PHI_ARG_DEF_FROM_EDGE (stmt_vinfo->stmt, pe);
4666 /* Create 'vect_ = {op0,op1,...,opn}'. */
4667 number_of_places_left_in_vector--;
4668 elts[nunits - number_of_places_left_in_vector - 1] = op;
4669 if (!CONSTANT_CLASS_P (op))
4670 constant_p = false;
4672 if (number_of_places_left_in_vector == 0)
4674 tree init;
4675 if (constant_p && !neutral_op
4676 ? multiple_p (TYPE_VECTOR_SUBPARTS (vector_type), nunits)
4677 : known_eq (TYPE_VECTOR_SUBPARTS (vector_type), nunits))
4678 /* Build the vector directly from ELTS. */
4679 init = gimple_build_vector (&ctor_seq, &elts);
4680 else if (neutral_op)
4682 /* Build a vector of the neutral value and shift the
4683 other elements into place. */
4684 init = gimple_build_vector_from_val (&ctor_seq, vector_type,
4685 neutral_op);
4686 int k = nunits;
4687 while (k > 0 && elts[k - 1] == neutral_op)
4688 k -= 1;
4689 while (k > 0)
4691 k -= 1;
4692 init = gimple_build (&ctor_seq, CFN_VEC_SHL_INSERT,
4693 vector_type, init, elts[k]);
4696 else
4698 /* First time round, duplicate ELTS to fill the
4699 required number of vectors. */
4700 duplicate_and_interleave (vinfo, &ctor_seq, vector_type, elts,
4701 number_of_vectors, *vec_oprnds);
4702 break;
4704 vec_oprnds->quick_push (init);
4706 number_of_places_left_in_vector = nunits;
4707 elts.new_vector (vector_type, nunits, 1);
4708 elts.quick_grow (nunits);
4709 constant_p = true;
4712 if (ctor_seq != NULL)
4713 gsi_insert_seq_on_edge_immediate (pe, ctor_seq);
4716 /* For a statement STMT_INFO taking part in a reduction operation return
4717 the stmt_vec_info the meta information is stored on. */
4719 stmt_vec_info
4720 info_for_reduction (vec_info *vinfo, stmt_vec_info stmt_info)
4722 stmt_info = vect_orig_stmt (stmt_info);
4723 gcc_assert (STMT_VINFO_REDUC_DEF (stmt_info));
4724 if (!is_a <gphi *> (stmt_info->stmt)
4725 || !VECTORIZABLE_CYCLE_DEF (STMT_VINFO_DEF_TYPE (stmt_info)))
4726 stmt_info = STMT_VINFO_REDUC_DEF (stmt_info);
4727 gphi *phi = as_a <gphi *> (stmt_info->stmt);
4728 if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_double_reduction_def)
4730 if (gimple_phi_num_args (phi) == 1)
4731 stmt_info = STMT_VINFO_REDUC_DEF (stmt_info);
4733 else if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_nested_cycle)
4735 edge pe = loop_preheader_edge (gimple_bb (phi)->loop_father);
4736 stmt_vec_info info
4737 = vinfo->lookup_def (PHI_ARG_DEF_FROM_EDGE (phi, pe));
4738 if (info && STMT_VINFO_DEF_TYPE (info) == vect_double_reduction_def)
4739 stmt_info = info;
4741 return stmt_info;
4744 /* Function vect_create_epilog_for_reduction
4746 Create code at the loop-epilog to finalize the result of a reduction
4747 computation.
4749 STMT_INFO is the scalar reduction stmt that is being vectorized.
4750 SLP_NODE is an SLP node containing a group of reduction statements. The
4751 first one in this group is STMT_INFO.
4752 SLP_NODE_INSTANCE is the SLP node instance containing SLP_NODE
4753 REDUC_INDEX says which rhs operand of the STMT_INFO is the reduction phi
4754 (counting from 0)
4756 This function:
4757 1. Completes the reduction def-use cycles.
4758 2. "Reduces" each vector of partial results VECT_DEFS into a single result,
4759 by calling the function specified by REDUC_FN if available, or by
4760 other means (whole-vector shifts or a scalar loop).
4761 The function also creates a new phi node at the loop exit to preserve
4762 loop-closed form, as illustrated below.
4764 The flow at the entry to this function:
4766 loop:
4767 vec_def = phi <vec_init, null> # REDUCTION_PHI
4768 VECT_DEF = vector_stmt # vectorized form of STMT_INFO
4769 s_loop = scalar_stmt # (scalar) STMT_INFO
4770 loop_exit:
4771 s_out0 = phi <s_loop> # (scalar) EXIT_PHI
4772 use <s_out0>
4773 use <s_out0>
4775 The above is transformed by this function into:
4777 loop:
4778 vec_def = phi <vec_init, VECT_DEF> # REDUCTION_PHI
4779 VECT_DEF = vector_stmt # vectorized form of STMT_INFO
4780 s_loop = scalar_stmt # (scalar) STMT_INFO
4781 loop_exit:
4782 s_out0 = phi <s_loop> # (scalar) EXIT_PHI
4783 v_out1 = phi <VECT_DEF> # NEW_EXIT_PHI
4784 v_out2 = reduce <v_out1>
4785 s_out3 = extract_field <v_out2, 0>
4786 s_out4 = adjust_result <s_out3>
4787 use <s_out4>
4788 use <s_out4>
4791 static void
4792 vect_create_epilog_for_reduction (loop_vec_info loop_vinfo,
4793 stmt_vec_info stmt_info,
4794 slp_tree slp_node,
4795 slp_instance slp_node_instance)
4797 stmt_vec_info reduc_info = info_for_reduction (loop_vinfo, stmt_info);
4798 gcc_assert (reduc_info->is_reduc_info);
4799 /* For double reductions we need to get at the inner loop reduction
4800 stmt which has the meta info attached. Our stmt_info is that of the
4801 loop-closed PHI of the inner loop which we remember as
4802 def for the reduction PHI generation. */
4803 bool double_reduc = false;
4804 stmt_vec_info rdef_info = stmt_info;
4805 if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_double_reduction_def)
4807 gcc_assert (!slp_node);
4808 double_reduc = true;
4809 stmt_info = loop_vinfo->lookup_def (gimple_phi_arg_def
4810 (stmt_info->stmt, 0));
4811 stmt_info = vect_stmt_to_vectorize (stmt_info);
4813 gphi *reduc_def_stmt
4814 = as_a <gphi *> (STMT_VINFO_REDUC_DEF (vect_orig_stmt (stmt_info))->stmt);
4815 enum tree_code code = STMT_VINFO_REDUC_CODE (reduc_info);
4816 internal_fn reduc_fn = STMT_VINFO_REDUC_FN (reduc_info);
4817 tree vectype;
4818 machine_mode mode;
4819 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo), *outer_loop = NULL;
4820 basic_block exit_bb;
4821 tree scalar_dest;
4822 tree scalar_type;
4823 gimple *new_phi = NULL, *phi;
4824 gimple_stmt_iterator exit_gsi;
4825 tree new_temp = NULL_TREE, new_name, new_scalar_dest;
4826 gimple *epilog_stmt = NULL;
4827 gimple *exit_phi;
4828 tree bitsize;
4829 tree def;
4830 tree orig_name, scalar_result;
4831 imm_use_iterator imm_iter, phi_imm_iter;
4832 use_operand_p use_p, phi_use_p;
4833 gimple *use_stmt;
4834 bool nested_in_vect_loop = false;
4835 auto_vec<gimple *> new_phis;
4836 int j, i;
4837 auto_vec<tree> scalar_results;
4838 unsigned int group_size = 1, k;
4839 auto_vec<gimple *> phis;
4840 bool slp_reduc = false;
4841 bool direct_slp_reduc;
4842 tree new_phi_result;
4843 tree induction_index = NULL_TREE;
4845 if (slp_node)
4846 group_size = SLP_TREE_LANES (slp_node);
4848 if (nested_in_vect_loop_p (loop, stmt_info))
4850 outer_loop = loop;
4851 loop = loop->inner;
4852 nested_in_vect_loop = true;
4853 gcc_assert (!slp_node);
4855 gcc_assert (!nested_in_vect_loop || double_reduc);
4857 vectype = STMT_VINFO_REDUC_VECTYPE (reduc_info);
4858 gcc_assert (vectype);
4859 mode = TYPE_MODE (vectype);
4861 tree initial_def = NULL;
4862 tree induc_val = NULL_TREE;
4863 tree adjustment_def = NULL;
4864 if (slp_node)
4866 else
4868 /* Get at the scalar def before the loop, that defines the initial value
4869 of the reduction variable. */
4870 initial_def = PHI_ARG_DEF_FROM_EDGE (reduc_def_stmt,
4871 loop_preheader_edge (loop));
4872 /* Optimize: for induction condition reduction, if we can't use zero
4873 for induc_val, use initial_def. */
4874 if (STMT_VINFO_REDUC_TYPE (reduc_info) == INTEGER_INDUC_COND_REDUCTION)
4875 induc_val = STMT_VINFO_VEC_INDUC_COND_INITIAL_VAL (reduc_info);
4876 else if (double_reduc)
4878 else if (nested_in_vect_loop)
4880 else
4881 adjustment_def = STMT_VINFO_REDUC_EPILOGUE_ADJUSTMENT (reduc_info);
4884 unsigned vec_num;
4885 int ncopies;
4886 if (slp_node)
4888 vec_num = SLP_TREE_VEC_STMTS (slp_node_instance->reduc_phis).length ();
4889 ncopies = 1;
4891 else
4893 stmt_vec_info reduc_info = loop_vinfo->lookup_stmt (reduc_def_stmt);
4894 vec_num = 1;
4895 ncopies = STMT_VINFO_VEC_STMTS (reduc_info).length ();
4898 /* For cond reductions we want to create a new vector (INDEX_COND_EXPR)
4899 which is updated with the current index of the loop for every match of
4900 the original loop's cond_expr (VEC_STMT). This results in a vector
4901 containing the last time the condition passed for that vector lane.
4902 The first match will be a 1 to allow 0 to be used for non-matching
4903 indexes. If there are no matches at all then the vector will be all
4904 zeroes.
4906 PR92772: This algorithm is broken for architectures that support
4907 masked vectors, but do not provide fold_extract_last. */
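/* As an illustration (a made-up 4-lane example): if the condition matches
   in lanes 1 and 3 of the first vector iteration (index values
   { 1, 2, 3, 4 }) and in lane 0 of the second (index values { 5, 6, 7, 8 }),
   the index vector ends up as { 5, 2, 0, 4 }; its maximum, 5, identifies
   the lane holding the last match.  */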
4908 if (STMT_VINFO_REDUC_TYPE (reduc_info) == COND_REDUCTION)
4910 auto_vec<std::pair<tree, bool>, 2> ccompares;
4911 stmt_vec_info cond_info = STMT_VINFO_REDUC_DEF (reduc_info);
4912 cond_info = vect_stmt_to_vectorize (cond_info);
4913 while (cond_info != reduc_info)
4915 if (gimple_assign_rhs_code (cond_info->stmt) == COND_EXPR)
4917 gimple *vec_stmt = STMT_VINFO_VEC_STMTS (cond_info)[0];
4918 gcc_assert (gimple_assign_rhs_code (vec_stmt) == VEC_COND_EXPR);
4919 ccompares.safe_push
4920 (std::make_pair (unshare_expr (gimple_assign_rhs1 (vec_stmt)),
4921 STMT_VINFO_REDUC_IDX (cond_info) == 2));
4923 cond_info
4924 = loop_vinfo->lookup_def (gimple_op (cond_info->stmt,
4925 1 + STMT_VINFO_REDUC_IDX
4926 (cond_info)));
4927 cond_info = vect_stmt_to_vectorize (cond_info);
4929 gcc_assert (ccompares.length () != 0);
4931 tree indx_before_incr, indx_after_incr;
4932 poly_uint64 nunits_out = TYPE_VECTOR_SUBPARTS (vectype);
4933 int scalar_precision
4934 = GET_MODE_PRECISION (SCALAR_TYPE_MODE (TREE_TYPE (vectype)));
4935 tree cr_index_scalar_type = make_unsigned_type (scalar_precision);
4936 tree cr_index_vector_type = get_related_vectype_for_scalar_type
4937 (TYPE_MODE (vectype), cr_index_scalar_type,
4938 TYPE_VECTOR_SUBPARTS (vectype));
4940 /* First we create a simple vector induction variable which starts
4941 with the values {1,2,3,...} (SERIES_VECT) and increments by the
4942 vector size (STEP). */
4944 /* Create a {1,2,3,...} vector. */
4945 tree series_vect = build_index_vector (cr_index_vector_type, 1, 1);
4947 /* Create a vector of the step value. */
4948 tree step = build_int_cst (cr_index_scalar_type, nunits_out);
4949 tree vec_step = build_vector_from_val (cr_index_vector_type, step);
4951 /* Create an induction variable. */
4952 gimple_stmt_iterator incr_gsi;
4953 bool insert_after;
4954 standard_iv_increment_position (loop, &incr_gsi, &insert_after);
4955 create_iv (series_vect, vec_step, NULL_TREE, loop, &incr_gsi,
4956 insert_after, &indx_before_incr, &indx_after_incr);
4958 /* Next create a new phi node vector (NEW_PHI_TREE) which starts
4959 filled with zeros (VEC_ZERO). */
4961 /* Create a vector of 0s. */
4962 tree zero = build_zero_cst (cr_index_scalar_type);
4963 tree vec_zero = build_vector_from_val (cr_index_vector_type, zero);
4965 /* Create a vector phi node. */
4966 tree new_phi_tree = make_ssa_name (cr_index_vector_type);
4967 new_phi = create_phi_node (new_phi_tree, loop->header);
4968 add_phi_arg (as_a <gphi *> (new_phi), vec_zero,
4969 loop_preheader_edge (loop), UNKNOWN_LOCATION);
4971 /* Now take the condition from the loop's original cond_exprs
4972 and produce a new cond_exprs (INDEX_COND_EXPR) which for
4973 every match uses values from the induction variable
4974 (INDEX_BEFORE_INCR) otherwise uses values from the phi node
4975 (NEW_PHI_TREE).
4976 Finally, we update the phi (NEW_PHI_TREE) to take the value of
4977 the new cond_expr (INDEX_COND_EXPR). */
4978 gimple_seq stmts = NULL;
4979 for (int i = ccompares.length () - 1; i != -1; --i)
4981 tree ccompare = ccompares[i].first;
4982 if (ccompares[i].second)
4983 new_phi_tree = gimple_build (&stmts, VEC_COND_EXPR,
4984 cr_index_vector_type,
4985 ccompare,
4986 indx_before_incr, new_phi_tree);
4987 else
4988 new_phi_tree = gimple_build (&stmts, VEC_COND_EXPR,
4989 cr_index_vector_type,
4990 ccompare,
4991 new_phi_tree, indx_before_incr);
4993 gsi_insert_seq_before (&incr_gsi, stmts, GSI_SAME_STMT);
4995 /* Update the phi with the vec cond. */
4996 induction_index = new_phi_tree;
4997 add_phi_arg (as_a <gphi *> (new_phi), induction_index,
4998 loop_latch_edge (loop), UNKNOWN_LOCATION);
5001 /* 2. Create epilog code.
5002 The reduction epilog code operates across the elements of the vector
5003 of partial results computed by the vectorized loop.
5004 The reduction epilog code consists of:
5006 step 1: compute the scalar result in a vector (v_out2)
5007 step 2: extract the scalar result (s_out3) from the vector (v_out2)
5008 step 3: adjust the scalar result (s_out3) if needed.
5010 Step 1 can be accomplished using one of the following three schemes:
5011 (scheme 1) using reduc_fn, if available.
5012 (scheme 2) using whole-vector shifts, if available.
5013 (scheme 3) using a scalar loop. In this case steps 1+2 above are
5014 combined.
5016 The overall epilog code looks like this:
5018 s_out0 = phi <s_loop> # original EXIT_PHI
5019 v_out1 = phi <VECT_DEF> # NEW_EXIT_PHI
5020 v_out2 = reduce <v_out1> # step 1
5021 s_out3 = extract_field <v_out2, 0> # step 2
5022 s_out4 = adjust_result <s_out3> # step 3
5024 (step 3 is optional, and steps 1 and 2 may be combined).
5025 Lastly, the uses of s_out0 are replaced by s_out4. */
5028 /* 2.1 Create new loop-exit-phis to preserve loop-closed form:
5029 v_out1 = phi <VECT_DEF>
5030 Store them in NEW_PHIS. */
5031 if (double_reduc)
5032 loop = outer_loop;
5033 exit_bb = single_exit (loop)->dest;
5034 new_phis.create (slp_node ? vec_num : ncopies);
5035 for (unsigned i = 0; i < vec_num; i++)
5037 if (slp_node)
5038 def = vect_get_slp_vect_def (slp_node, i);
5039 else
5040 def = gimple_get_lhs (STMT_VINFO_VEC_STMTS (rdef_info)[0]);
5041 for (j = 0; j < ncopies; j++)
5043 tree new_def = copy_ssa_name (def);
5044 phi = create_phi_node (new_def, exit_bb);
5045 if (j == 0)
5046 new_phis.quick_push (phi);
5047 else
5049 def = gimple_get_lhs (STMT_VINFO_VEC_STMTS (rdef_info)[j]);
5050 new_phis.quick_push (phi);
5053 SET_PHI_ARG_DEF (phi, single_exit (loop)->dest_idx, def);
5057 exit_gsi = gsi_after_labels (exit_bb);
5059 /* 2.2 Get the relevant tree-code to use in the epilog for schemes 2,3
5060 (i.e. when reduc_fn is not available) and in the final adjustment
5061 code (if needed). Also get the original scalar reduction variable as
5062 defined in the loop. In case STMT is a "pattern-stmt" (i.e. - it
5063 represents a reduction pattern), the tree-code and scalar-def are
5064 taken from the original stmt that the pattern-stmt (STMT) replaces.
5065 Otherwise (it is a regular reduction) - the tree-code and scalar-def
5066 are taken from STMT. */
5068 stmt_vec_info orig_stmt_info = vect_orig_stmt (stmt_info);
5069 if (orig_stmt_info != stmt_info)
5071 /* Reduction pattern */
5072 gcc_assert (STMT_VINFO_IN_PATTERN_P (orig_stmt_info));
5073 gcc_assert (STMT_VINFO_RELATED_STMT (orig_stmt_info) == stmt_info);
5076 scalar_dest = gimple_assign_lhs (orig_stmt_info->stmt);
5077 scalar_type = TREE_TYPE (scalar_dest);
5078 scalar_results.create (group_size);
5079 new_scalar_dest = vect_create_destination_var (scalar_dest, NULL);
5080 bitsize = TYPE_SIZE (scalar_type);
5082 /* SLP reduction without reduction chain, e.g.,
5083 # a1 = phi <a2, a0>
5084 # b1 = phi <b2, b0>
5085 a2 = operation (a1)
5086 b2 = operation (b1) */
5087 slp_reduc = (slp_node && !REDUC_GROUP_FIRST_ELEMENT (stmt_info));
5089 /* True if we should implement SLP_REDUC using native reduction operations
5090 instead of scalar operations. */
5091 direct_slp_reduc = (reduc_fn != IFN_LAST
5092 && slp_reduc
5093 && !TYPE_VECTOR_SUBPARTS (vectype).is_constant ());
5095 /* In case of reduction chain, e.g.,
5096 # a1 = phi <a3, a0>
5097 a2 = operation (a1)
5098 a3 = operation (a2),
5100 we may end up with more than one vector result. Here we reduce them to
5101 one vector. */
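     /* For instance (illustrative sketch), with two partial result vectors
        v.0 and v.1 in NEW_PHIS this emits v = v.0 CODE v.1 and only that
        combined vector is fed to the scalar epilog reduction below.  */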
5102 if (REDUC_GROUP_FIRST_ELEMENT (stmt_info) || direct_slp_reduc)
5104 gimple_seq stmts = NULL;
5105 tree first_vect = PHI_RESULT (new_phis[0]);
5106 first_vect = gimple_convert (&stmts, vectype, first_vect);
5107 for (k = 1; k < new_phis.length (); k++)
5109 gimple *next_phi = new_phis[k];
5110 tree second_vect = PHI_RESULT (next_phi);
5111 second_vect = gimple_convert (&stmts, vectype, second_vect);
5112 first_vect = gimple_build (&stmts, code, vectype,
5113 first_vect, second_vect);
5115 gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
5117 new_phi_result = first_vect;
5118 new_phis.truncate (0);
5119 new_phis.safe_push (SSA_NAME_DEF_STMT (first_vect));
5121 /* Likewise if we couldn't use a single defuse cycle. */
5122 else if (ncopies > 1)
5124 gimple_seq stmts = NULL;
5125 tree first_vect = PHI_RESULT (new_phis[0]);
5126 first_vect = gimple_convert (&stmts, vectype, first_vect);
5127 for (int k = 1; k < ncopies; ++k)
5129 tree second_vect = PHI_RESULT (new_phis[k]);
5130 second_vect = gimple_convert (&stmts, vectype, second_vect);
5131 first_vect = gimple_build (&stmts, code, vectype,
5132 first_vect, second_vect);
5134 gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
5135 new_phi_result = first_vect;
5136 new_phis.truncate (0);
5137 new_phis.safe_push (SSA_NAME_DEF_STMT (first_vect));
5139 else
5140 new_phi_result = PHI_RESULT (new_phis[0]);
5142 if (STMT_VINFO_REDUC_TYPE (reduc_info) == COND_REDUCTION
5143 && reduc_fn != IFN_LAST)
5145 /* For condition reductions, we have a vector (NEW_PHI_RESULT) containing
5146 various data values where the condition matched and another vector
5147 (INDUCTION_INDEX) containing all the indexes of those matches. We
5148 need to extract the last matching index (which will be the index with
5149 highest value) and use this to index into the data vector.
5150 For the case where there were no matches, the data vector will contain
5151 all default values and the index vector will be all zeros. */
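     /* A small worked example with illustrative values: for a data vector
        {9, 0, 7, 0} and an index vector {3, 0, 5, 0}, the MAX reduction of
        the indexes yields 5, the comparison against {5, 5, 5, 5} selects
        lane 2, the VEC_COND produces {0, 0, 7, 0}, and the final unsigned
        MAX reduction extracts 7.  */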
5153 /* Get various versions of the type of the vector of indexes. */
5154 tree index_vec_type = TREE_TYPE (induction_index);
5155 gcc_checking_assert (TYPE_UNSIGNED (index_vec_type));
5156 tree index_scalar_type = TREE_TYPE (index_vec_type);
5157 tree index_vec_cmp_type = truth_type_for (index_vec_type);
5159 /* Get an unsigned integer version of the type of the data vector. */
5160 int scalar_precision
5161 = GET_MODE_PRECISION (SCALAR_TYPE_MODE (scalar_type));
5162 tree scalar_type_unsigned = make_unsigned_type (scalar_precision);
5163 tree vectype_unsigned = build_vector_type
5164 (scalar_type_unsigned, TYPE_VECTOR_SUBPARTS (vectype));
5166 /* First we need to create a vector (ZERO_VEC) of zeros and another
5167 vector (MAX_INDEX_VEC) filled with the last matching index, which we
5168 can create using a MAX reduction and then expanding.
5169 In the case where the loop never made any matches, the max index will
5170 be zero. */
5172 /* Vector of {0, 0, 0,...}. */
5173 tree zero_vec = build_zero_cst (vectype);
5175 gimple_seq stmts = NULL;
5176 new_phi_result = gimple_convert (&stmts, vectype, new_phi_result);
5177 gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
5179 /* Find maximum value from the vector of found indexes. */
5180 tree max_index = make_ssa_name (index_scalar_type);
5181 gcall *max_index_stmt = gimple_build_call_internal (IFN_REDUC_MAX,
5182 1, induction_index);
5183 gimple_call_set_lhs (max_index_stmt, max_index);
5184 gsi_insert_before (&exit_gsi, max_index_stmt, GSI_SAME_STMT);
5186 /* Vector of {max_index, max_index, max_index,...}. */
5187 tree max_index_vec = make_ssa_name (index_vec_type);
5188 tree max_index_vec_rhs = build_vector_from_val (index_vec_type,
5189 max_index);
5190 gimple *max_index_vec_stmt = gimple_build_assign (max_index_vec,
5191 max_index_vec_rhs);
5192 gsi_insert_before (&exit_gsi, max_index_vec_stmt, GSI_SAME_STMT);
5194 /* Next we compare the new vector (MAX_INDEX_VEC) full of max indexes
5195 with the vector (INDUCTION_INDEX) of found indexes, choosing values
5196 from the data vector (NEW_PHI_RESULT) for matches, 0 (ZERO_VEC)
5197 otherwise. Only one value should match, resulting in a vector
5198 (VEC_COND) with one data value and the rest zeros.
5199 In the case where the loop never made any matches, every index will
5200 match, resulting in a vector with all data values (which will all be
5201 the default value). */
5203 /* Compare the max index vector to the vector of found indexes to find
5204 the position of the max value. */
5205 tree vec_compare = make_ssa_name (index_vec_cmp_type);
5206 gimple *vec_compare_stmt = gimple_build_assign (vec_compare, EQ_EXPR,
5207 induction_index,
5208 max_index_vec);
5209 gsi_insert_before (&exit_gsi, vec_compare_stmt, GSI_SAME_STMT);
5211 /* Use the compare to choose either values from the data vector or
5212 zero. */
5213 tree vec_cond = make_ssa_name (vectype);
5214 gimple *vec_cond_stmt = gimple_build_assign (vec_cond, VEC_COND_EXPR,
5215 vec_compare, new_phi_result,
5216 zero_vec);
5217 gsi_insert_before (&exit_gsi, vec_cond_stmt, GSI_SAME_STMT);
5219 /* Finally we need to extract the data value from the vector (VEC_COND)
5220 into a scalar (MATCHED_DATA_REDUC). Logically we want to do a OR
5221 reduction, but because this doesn't exist, we can use a MAX reduction
5222 instead. The data value might be signed or a float so we need to cast
5223 it first.
5224 In the case where the loop never made any matches, the data values are
5225 all identical, and so will reduce down correctly. */
5227 /* Make the matched data values unsigned. */
5228 tree vec_cond_cast = make_ssa_name (vectype_unsigned);
5229 tree vec_cond_cast_rhs = build1 (VIEW_CONVERT_EXPR, vectype_unsigned,
5230 vec_cond);
5231 gimple *vec_cond_cast_stmt = gimple_build_assign (vec_cond_cast,
5232 VIEW_CONVERT_EXPR,
5233 vec_cond_cast_rhs);
5234 gsi_insert_before (&exit_gsi, vec_cond_cast_stmt, GSI_SAME_STMT);
5236 /* Reduce down to a scalar value. */
5237 tree data_reduc = make_ssa_name (scalar_type_unsigned);
5238 gcall *data_reduc_stmt = gimple_build_call_internal (IFN_REDUC_MAX,
5239 1, vec_cond_cast);
5240 gimple_call_set_lhs (data_reduc_stmt, data_reduc);
5241 gsi_insert_before (&exit_gsi, data_reduc_stmt, GSI_SAME_STMT);
5243 /* Convert the reduced value back to the result type and set as the
5244 result. */
5245 stmts = NULL;
5246 new_temp = gimple_build (&stmts, VIEW_CONVERT_EXPR, scalar_type,
5247 data_reduc);
5248 gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
5249 scalar_results.safe_push (new_temp);
5251 else if (STMT_VINFO_REDUC_TYPE (reduc_info) == COND_REDUCTION
5252 && reduc_fn == IFN_LAST)
5254 /* Condition reduction without supported IFN_REDUC_MAX. Generate
5255 idx = 0;
5256 idx_val = induction_index[0];
5257 val = data_reduc[0];
5258 for (idx = 0, val = init, i = 0; i < nelts; ++i)
5259 if (induction_index[i] > idx_val)
5260 val = data_reduc[i], idx_val = induction_index[i];
5261 return val; */
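     /* For example (illustrative values), with data {9, 0, 7, 0} and
        indexes {3, 0, 5, 0} the open-coded sequence keeps 9 after lane 0,
        still 9 after lane 1 (0 > 3 is false), switches to 7 after lane 2
        (5 > 3), and keeps 7 after lane 3 (0 > 5 is false), so VAL ends
        up as 7.  */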
5263 tree data_eltype = TREE_TYPE (TREE_TYPE (new_phi_result));
5264 tree idx_eltype = TREE_TYPE (TREE_TYPE (induction_index));
5265 unsigned HOST_WIDE_INT el_size = tree_to_uhwi (TYPE_SIZE (idx_eltype));
5266 poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (TREE_TYPE (induction_index));
5267 /* Enforced by vectorizable_reduction, which ensures we have target
5268 support before allowing a conditional reduction on variable-length
5269 vectors. */
5270 unsigned HOST_WIDE_INT v_size = el_size * nunits.to_constant ();
5271 tree idx_val = NULL_TREE, val = NULL_TREE;
5272 for (unsigned HOST_WIDE_INT off = 0; off < v_size; off += el_size)
5274 tree old_idx_val = idx_val;
5275 tree old_val = val;
5276 idx_val = make_ssa_name (idx_eltype);
5277 epilog_stmt = gimple_build_assign (idx_val, BIT_FIELD_REF,
5278 build3 (BIT_FIELD_REF, idx_eltype,
5279 induction_index,
5280 bitsize_int (el_size),
5281 bitsize_int (off)));
5282 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5283 val = make_ssa_name (data_eltype);
5284 epilog_stmt = gimple_build_assign (val, BIT_FIELD_REF,
5285 build3 (BIT_FIELD_REF,
5286 data_eltype,
5287 new_phi_result,
5288 bitsize_int (el_size),
5289 bitsize_int (off)));
5290 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5291 if (off != 0)
5293 tree new_idx_val = idx_val;
5294 if (off != v_size - el_size)
5296 new_idx_val = make_ssa_name (idx_eltype);
5297 epilog_stmt = gimple_build_assign (new_idx_val,
5298 MAX_EXPR, idx_val,
5299 old_idx_val);
5300 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5302 tree new_val = make_ssa_name (data_eltype);
5303 epilog_stmt = gimple_build_assign (new_val,
5304 COND_EXPR,
5305 build2 (GT_EXPR,
5306 boolean_type_node,
5307 idx_val,
5308 old_idx_val),
5309 val, old_val);
5310 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5311 idx_val = new_idx_val;
5312 val = new_val;
5315 /* Convert the reduced value back to the result type and set as the
5316 result. */
5317 gimple_seq stmts = NULL;
5318 val = gimple_convert (&stmts, scalar_type, val);
5319 gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
5320 scalar_results.safe_push (val);
5323 /* 2.3 Create the reduction code, using one of the three schemes described
5324 above. In SLP we simply need to extract all the elements from the
5325 vector (without reducing them), so we use scalar shifts. */
5326 else if (reduc_fn != IFN_LAST && !slp_reduc)
5328 tree tmp;
5329 tree vec_elem_type;
5331 /* Case 1: Create:
5332 v_out2 = reduc_expr <v_out1> */
5334 if (dump_enabled_p ())
5335 dump_printf_loc (MSG_NOTE, vect_location,
5336 "Reduce using direct vector reduction.\n");
5338 gimple_seq stmts = NULL;
5339 new_phi_result = gimple_convert (&stmts, vectype, new_phi_result);
5340 vec_elem_type = TREE_TYPE (TREE_TYPE (new_phi_result));
5341 new_temp = gimple_build (&stmts, as_combined_fn (reduc_fn),
5342 vec_elem_type, new_phi_result);
5343 new_temp = gimple_convert (&stmts, scalar_type, new_temp);
5344 gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
5346 if ((STMT_VINFO_REDUC_TYPE (reduc_info) == INTEGER_INDUC_COND_REDUCTION)
5347 && induc_val)
5349 /* Earlier we set the initial value to be a vector of induc_val
5350 values. Check the result and if it is induc_val then replace
5351 with the original initial value, unless induc_val is
5352 the same as initial_def already. */
5353 tree zcompare = build2 (EQ_EXPR, boolean_type_node, new_temp,
5354 induc_val);
5356 tmp = make_ssa_name (new_scalar_dest);
5357 epilog_stmt = gimple_build_assign (tmp, COND_EXPR, zcompare,
5358 initial_def, new_temp);
5359 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5360 new_temp = tmp;
5363 scalar_results.safe_push (new_temp);
5365 else if (direct_slp_reduc)
5367 /* Here we create one vector for each of the REDUC_GROUP_SIZE results,
5368 with the elements for other SLP statements replaced with the
5369 neutral value. We can then do a normal reduction on each vector. */
5371 /* Enforced by vectorizable_reduction. */
5372 gcc_assert (new_phis.length () == 1);
5373 gcc_assert (pow2p_hwi (group_size));
5375 slp_tree orig_phis_slp_node = slp_node_instance->reduc_phis;
5376 vec<stmt_vec_info> orig_phis
5377 = SLP_TREE_SCALAR_STMTS (orig_phis_slp_node);
5378 gimple_seq seq = NULL;
5380 /* Build a vector {0, 1, 2, ...}, with the same number of elements
5381 and the same element size as VECTYPE. */
5382 tree index = build_index_vector (vectype, 0, 1);
5383 tree index_type = TREE_TYPE (index);
5384 tree index_elt_type = TREE_TYPE (index_type);
5385 tree mask_type = truth_type_for (index_type);
5387 /* Create a vector that, for each element, identifies which of
5388 the REDUC_GROUP_SIZE results should use it. */
5389 tree index_mask = build_int_cst (index_elt_type, group_size - 1);
5390 index = gimple_build (&seq, BIT_AND_EXPR, index_type, index,
5391 build_vector_from_val (index_type, index_mask));
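       /* E.g. for GROUP_SIZE 2 and an 8-element vector this computes
          {0, 1, 2, 3, 4, 5, 6, 7} & {1, 1, 1, 1, 1, 1, 1, 1}
          = {0, 1, 0, 1, 0, 1, 0, 1}, i.e. element J belongs to SLP
          statement J % GROUP_SIZE (illustrative values; GROUP_SIZE is a
          power of two, as asserted above).  */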
5393 /* Get a neutral vector value. This is simply a splat of the neutral
5394 scalar value if we have one, otherwise the initial scalar value
5395 is itself a neutral value. */
5396 tree vector_identity = NULL_TREE;
5397 tree neutral_op = NULL_TREE;
5398 if (slp_node)
5400 stmt_vec_info first = REDUC_GROUP_FIRST_ELEMENT (stmt_info);
5401 neutral_op
5402 = neutral_op_for_slp_reduction (slp_node_instance->reduc_phis,
5403 vectype, code, first != NULL);
5405 if (neutral_op)
5406 vector_identity = gimple_build_vector_from_val (&seq, vectype,
5407 neutral_op);
5408 for (unsigned int i = 0; i < group_size; ++i)
5410 /* If there's no universal neutral value, we can use the
5411 initial scalar value from the original PHI. This is used
5412 for MIN and MAX reduction, for example. */
5413 if (!neutral_op)
5415 tree scalar_value
5416 = PHI_ARG_DEF_FROM_EDGE (orig_phis[i]->stmt,
5417 loop_preheader_edge (loop));
5418 scalar_value = gimple_convert (&seq, TREE_TYPE (vectype),
5419 scalar_value);
5420 vector_identity = gimple_build_vector_from_val (&seq, vectype,
5421 scalar_value);
5424 /* Calculate the equivalent of:
5426 sel[j] = (index[j] == i);
5428 which selects the elements of NEW_PHI_RESULT that should
5429 be included in the result. */
5430 tree compare_val = build_int_cst (index_elt_type, i);
5431 compare_val = build_vector_from_val (index_type, compare_val);
5432 tree sel = gimple_build (&seq, EQ_EXPR, mask_type,
5433 index, compare_val);
5435 /* Calculate the equivalent of:
5437 vec = sel ? new_phi_result : vector_identity;
5439 VEC is now suitable for a full vector reduction. */
5440 tree vec = gimple_build (&seq, VEC_COND_EXPR, vectype,
5441 sel, new_phi_result, vector_identity);
5443 /* Do the reduction and convert it to the appropriate type. */
5444 tree scalar = gimple_build (&seq, as_combined_fn (reduc_fn),
5445 TREE_TYPE (vectype), vec);
5446 scalar = gimple_convert (&seq, scalar_type, scalar);
5447 scalar_results.safe_push (scalar);
5449 gsi_insert_seq_before (&exit_gsi, seq, GSI_SAME_STMT);
5451 else
5453 bool reduce_with_shift;
5454 tree vec_temp;
5456 gcc_assert (slp_reduc || new_phis.length () == 1);
5458 /* See if the target wants to do the final (shift) reduction
5459 in a vector mode of smaller size and first reduce upper/lower
5460 halves against each other. */
5461 enum machine_mode mode1 = mode;
5462 tree stype = TREE_TYPE (vectype);
5463 unsigned nunits = TYPE_VECTOR_SUBPARTS (vectype).to_constant ();
5464 unsigned nunits1 = nunits;
5465 if ((mode1 = targetm.vectorize.split_reduction (mode)) != mode
5466 && new_phis.length () == 1)
5468 nunits1 = GET_MODE_NUNITS (mode1).to_constant ();
5469 /* For SLP reductions we have to make sure lanes match up, but
5470 since we're doing an individual-element final reduction, reducing
5471 the vector width here is even more important.
5472 ??? We can also separate lanes with permutes, for the common
5473 case of power-of-two group-size odd/even extracts would work. */
5474 if (slp_reduc && nunits != nunits1)
5476 nunits1 = least_common_multiple (nunits1, group_size);
5477 gcc_assert (exact_log2 (nunits1) != -1 && nunits1 <= nunits);
5480 if (!slp_reduc
5481 && (mode1 = targetm.vectorize.split_reduction (mode)) != mode)
5482 nunits1 = GET_MODE_NUNITS (mode1).to_constant ();
5484 tree vectype1 = get_related_vectype_for_scalar_type (TYPE_MODE (vectype),
5485 stype, nunits1);
5486 reduce_with_shift = have_whole_vector_shift (mode1);
5487 if (!VECTOR_MODE_P (mode1))
5488 reduce_with_shift = false;
5489 else
5491 optab optab = optab_for_tree_code (code, vectype1, optab_default);
5492 if (optab_handler (optab, mode1) == CODE_FOR_nothing)
5493 reduce_with_shift = false;
5496 /* First reduce the vector to the vector size we should do the
5497 shift reduction on, by combining upper and lower halves. */
5498 new_temp = new_phi_result;
5499 while (nunits > nunits1)
5501 nunits /= 2;
5502 vectype1 = get_related_vectype_for_scalar_type (TYPE_MODE (vectype),
5503 stype, nunits);
5504 unsigned int bitsize = tree_to_uhwi (TYPE_SIZE (vectype1));
5506 /* The target has to make sure we support lowpart/highpart
5507 extraction, either via direct vector extract or through
5508 an integer mode punning. */
5509 tree dst1, dst2;
5510 if (convert_optab_handler (vec_extract_optab,
5511 TYPE_MODE (TREE_TYPE (new_temp)),
5512 TYPE_MODE (vectype1))
5513 != CODE_FOR_nothing)
5515 /* Extract sub-vectors directly once vec_extract becomes
5516 a conversion optab. */
5517 dst1 = make_ssa_name (vectype1);
5518 epilog_stmt
5519 = gimple_build_assign (dst1, BIT_FIELD_REF,
5520 build3 (BIT_FIELD_REF, vectype1,
5521 new_temp, TYPE_SIZE (vectype1),
5522 bitsize_int (0)));
5523 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5524 dst2 = make_ssa_name (vectype1);
5525 epilog_stmt
5526 = gimple_build_assign (dst2, BIT_FIELD_REF,
5527 build3 (BIT_FIELD_REF, vectype1,
5528 new_temp, TYPE_SIZE (vectype1),
5529 bitsize_int (bitsize)));
5530 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5532 else
5534 /* Extract via punning to appropriately sized integer mode
5535 vector. */
5536 tree eltype = build_nonstandard_integer_type (bitsize, 1);
5537 tree etype = build_vector_type (eltype, 2);
5538 gcc_assert (convert_optab_handler (vec_extract_optab,
5539 TYPE_MODE (etype),
5540 TYPE_MODE (eltype))
5541 != CODE_FOR_nothing);
5542 tree tem = make_ssa_name (etype);
5543 epilog_stmt = gimple_build_assign (tem, VIEW_CONVERT_EXPR,
5544 build1 (VIEW_CONVERT_EXPR,
5545 etype, new_temp));
5546 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5547 new_temp = tem;
5548 tem = make_ssa_name (eltype);
5549 epilog_stmt
5550 = gimple_build_assign (tem, BIT_FIELD_REF,
5551 build3 (BIT_FIELD_REF, eltype,
5552 new_temp, TYPE_SIZE (eltype),
5553 bitsize_int (0)));
5554 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5555 dst1 = make_ssa_name (vectype1);
5556 epilog_stmt = gimple_build_assign (dst1, VIEW_CONVERT_EXPR,
5557 build1 (VIEW_CONVERT_EXPR,
5558 vectype1, tem));
5559 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5560 tem = make_ssa_name (eltype);
5561 epilog_stmt
5562 = gimple_build_assign (tem, BIT_FIELD_REF,
5563 build3 (BIT_FIELD_REF, eltype,
5564 new_temp, TYPE_SIZE (eltype),
5565 bitsize_int (bitsize)));
5566 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5567 dst2 = make_ssa_name (vectype1);
5568 epilog_stmt = gimple_build_assign (dst2, VIEW_CONVERT_EXPR,
5569 build1 (VIEW_CONVERT_EXPR,
5570 vectype1, tem));
5571 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5574 new_temp = make_ssa_name (vectype1);
5575 epilog_stmt = gimple_build_assign (new_temp, code, dst1, dst2);
5576 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5577 new_phis[0] = epilog_stmt;
5580 if (reduce_with_shift && !slp_reduc)
5582 int element_bitsize = tree_to_uhwi (bitsize);
5583 /* Enforced by vectorizable_reduction, which disallows SLP reductions
5584 for variable-length vectors and also requires direct target support
5585 for loop reductions. */
5586 int vec_size_in_bits = tree_to_uhwi (TYPE_SIZE (vectype1));
5587 int nelements = vec_size_in_bits / element_bitsize;
5588 vec_perm_builder sel;
5589 vec_perm_indices indices;
5591 int elt_offset;
5593 tree zero_vec = build_zero_cst (vectype1);
5594 /* Case 2: Create:
5595 for (offset = nelements/2; offset >= 1; offset/=2)
5597 Create: va' = vec_shift <va, offset>
5598 Create: va = vop <va, va'>
5599 } */
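       /* E.g. for a 4-lane PLUS reduction of {a0, a1, a2, a3} this emits
          (sketch): shift by 2 elements -> {a2, a3, 0, 0}, add ->
          {a0+a2, a1+a3, ., .}; shift by 1 -> {a1+a3, ., 0, 0}, add;
          the full sum then sits in element 0, from where step 2.4
          below extracts it.  */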
5601 tree rhs;
5603 if (dump_enabled_p ())
5604 dump_printf_loc (MSG_NOTE, vect_location,
5605 "Reduce using vector shifts\n");
5607 gimple_seq stmts = NULL;
5608 new_temp = gimple_convert (&stmts, vectype1, new_temp);
5609 for (elt_offset = nelements / 2;
5610 elt_offset >= 1;
5611 elt_offset /= 2)
5613 calc_vec_perm_mask_for_shift (elt_offset, nelements, &sel);
5614 indices.new_vector (sel, 2, nelements);
5615 tree mask = vect_gen_perm_mask_any (vectype1, indices);
5616 new_name = gimple_build (&stmts, VEC_PERM_EXPR, vectype1,
5617 new_temp, zero_vec, mask);
5618 new_temp = gimple_build (&stmts, code,
5619 vectype1, new_name, new_temp);
5621 gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
5623 /* 2.4 Extract the final scalar result. Create:
5624 s_out3 = extract_field <v_out2, bitpos> */
5626 if (dump_enabled_p ())
5627 dump_printf_loc (MSG_NOTE, vect_location,
5628 "extract scalar result\n");
5630 rhs = build3 (BIT_FIELD_REF, scalar_type, new_temp,
5631 bitsize, bitsize_zero_node);
5632 epilog_stmt = gimple_build_assign (new_scalar_dest, rhs);
5633 new_temp = make_ssa_name (new_scalar_dest, epilog_stmt);
5634 gimple_assign_set_lhs (epilog_stmt, new_temp);
5635 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5636 scalar_results.safe_push (new_temp);
5638 else
5640 /* Case 3: Create:
5641 s = extract_field <v_out2, 0>
5642 for (offset = element_size;
5643 offset < vector_size;
5644 offset += element_size;)
5646 Create: s' = extract_field <v_out2, offset>
5647 Create: s = op <s, s'> // For non SLP cases
5648 } */
5650 if (dump_enabled_p ())
5651 dump_printf_loc (MSG_NOTE, vect_location,
5652 "Reduce using scalar code.\n");
5654 int vec_size_in_bits = tree_to_uhwi (TYPE_SIZE (vectype1));
5655 int element_bitsize = tree_to_uhwi (bitsize);
5656 tree compute_type = TREE_TYPE (vectype);
5657 gimple_seq stmts = NULL;
5658 FOR_EACH_VEC_ELT (new_phis, i, new_phi)
5660 int bit_offset;
5661 if (gimple_code (new_phi) == GIMPLE_PHI)
5662 vec_temp = PHI_RESULT (new_phi);
5663 else
5664 vec_temp = gimple_assign_lhs (new_phi);
5665 new_temp = gimple_build (&stmts, BIT_FIELD_REF, compute_type,
5666 vec_temp, bitsize, bitsize_zero_node);
5668 /* In SLP we don't need to apply the reduction operation, so we just
5669 collect s' values in SCALAR_RESULTS. */
5670 if (slp_reduc)
5671 scalar_results.safe_push (new_temp);
5673 for (bit_offset = element_bitsize;
5674 bit_offset < vec_size_in_bits;
5675 bit_offset += element_bitsize)
5677 tree bitpos = bitsize_int (bit_offset);
5678 new_name = gimple_build (&stmts, BIT_FIELD_REF,
5679 compute_type, vec_temp,
5680 bitsize, bitpos);
5681 if (slp_reduc)
5683 /* In SLP we don't need to apply the reduction operation, so
5684 we just collect s' values in SCALAR_RESULTS. */
5685 new_temp = new_name;
5686 scalar_results.safe_push (new_name);
5688 else
5689 new_temp = gimple_build (&stmts, code, compute_type,
5690 new_name, new_temp);
5694 /* The only case where we need to reduce scalar results in SLP is
5695 unrolling. If the size of SCALAR_RESULTS is greater than
5696 REDUC_GROUP_SIZE, we reduce them by combining elements modulo
5697 REDUC_GROUP_SIZE. */
5698 if (slp_reduc)
5700 tree res, first_res, new_res;
5702 /* Reduce multiple scalar results in case of SLP unrolling. */
5703 for (j = group_size; scalar_results.iterate (j, &res);
5704 j++)
5706 first_res = scalar_results[j % group_size];
5707 new_res = gimple_build (&stmts, code, compute_type,
5708 first_res, res);
5709 scalar_results[j % group_size] = new_res;
5711 for (k = 0; k < group_size; k++)
5712 scalar_results[k] = gimple_convert (&stmts, scalar_type,
5713 scalar_results[k]);
5715 else
5717 /* Not SLP - we have one scalar to keep in SCALAR_RESULTS. */
5718 new_temp = gimple_convert (&stmts, scalar_type, new_temp);
5719 scalar_results.safe_push (new_temp);
5722 gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
5725 if ((STMT_VINFO_REDUC_TYPE (reduc_info) == INTEGER_INDUC_COND_REDUCTION)
5726 && induc_val)
5728 /* Earlier we set the initial value to be a vector of induc_val
5729 values. Check the result and if it is induc_val then replace
5730 with the original initial value, unless induc_val is
5731 the same as initial_def already. */
5732 tree zcompare = build2 (EQ_EXPR, boolean_type_node, new_temp,
5733 induc_val);
5735 tree tmp = make_ssa_name (new_scalar_dest);
5736 epilog_stmt = gimple_build_assign (tmp, COND_EXPR, zcompare,
5737 initial_def, new_temp);
5738 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5739 scalar_results[0] = tmp;
5743 /* 2.5 Adjust the final result by the initial value of the reduction
5744 variable. (When such adjustment is not needed, then
5745 'adjustment_def' is zero). For example, if code is PLUS we create:
5746 new_temp = loop_exit_def + adjustment_def */
5748 if (adjustment_def)
5750 gcc_assert (!slp_reduc);
5751 gimple_seq stmts = NULL;
5752 if (nested_in_vect_loop)
5754 new_phi = new_phis[0];
5755 gcc_assert (VECTOR_TYPE_P (TREE_TYPE (adjustment_def)));
5756 adjustment_def = gimple_convert (&stmts, vectype, adjustment_def);
5757 new_temp = gimple_build (&stmts, code, vectype,
5758 PHI_RESULT (new_phi), adjustment_def);
5760 else
5762 new_temp = scalar_results[0];
5763 gcc_assert (TREE_CODE (TREE_TYPE (adjustment_def)) != VECTOR_TYPE);
5764 adjustment_def = gimple_convert (&stmts, scalar_type, adjustment_def);
5765 new_temp = gimple_build (&stmts, code, scalar_type,
5766 new_temp, adjustment_def);
5769 epilog_stmt = gimple_seq_last_stmt (stmts);
5770 gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
5771 if (nested_in_vect_loop)
5773 if (!double_reduc)
5774 scalar_results.quick_push (new_temp);
5775 else
5776 scalar_results[0] = new_temp;
5778 else
5779 scalar_results[0] = new_temp;
5781 new_phis[0] = epilog_stmt;
5784 if (double_reduc)
5785 loop = loop->inner;
5787 /* 2.6 Handle the loop-exit phis. Replace the uses of scalar loop-exit
5788 phis with new adjusted scalar results, i.e., replace use <s_out0>
5789 with use <s_out4>.
5791 Transform:
5792 loop_exit:
5793 s_out0 = phi <s_loop> # (scalar) EXIT_PHI
5794 v_out1 = phi <VECT_DEF> # NEW_EXIT_PHI
5795 v_out2 = reduce <v_out1>
5796 s_out3 = extract_field <v_out2, 0>
5797 s_out4 = adjust_result <s_out3>
5798 use <s_out0>
5799 use <s_out0>
5801 into:
5803 loop_exit:
5804 s_out0 = phi <s_loop> # (scalar) EXIT_PHI
5805 v_out1 = phi <VECT_DEF> # NEW_EXIT_PHI
5806 v_out2 = reduce <v_out1>
5807 s_out3 = extract_field <v_out2, 0>
5808 s_out4 = adjust_result <s_out3>
5809 use <s_out4>
5810 use <s_out4> */
5813 /* In an SLP reduction chain we reduce the vector results into one vector
5814 if necessary, hence we set REDUC_GROUP_SIZE to 1 here. SCALAR_DEST is the
5815 LHS of the last stmt in the reduction chain, since we are looking for
5816 the loop exit phi node. */
5817 if (REDUC_GROUP_FIRST_ELEMENT (stmt_info))
5819 stmt_vec_info dest_stmt_info
5820 = vect_orig_stmt (SLP_TREE_SCALAR_STMTS (slp_node)[group_size - 1]);
5821 scalar_dest = gimple_assign_lhs (dest_stmt_info->stmt);
5822 group_size = 1;
5825 /* In SLP we may have several statements in NEW_PHIS and REDUCTION_PHIS (in
5826 case that REDUC_GROUP_SIZE is greater than vectorization factor).
5827 Therefore, we need to match SCALAR_RESULTS with corresponding statements.
5828 The first (REDUC_GROUP_SIZE / number of new vector stmts) scalar results
5829 correspond to the first vector stmt, etc.
5830 (RATIO is equal to (REDUC_GROUP_SIZE / number of new vector stmts)). */
5831 if (group_size > new_phis.length ())
5832 gcc_assert (!(group_size % new_phis.length ()));
5834 for (k = 0; k < group_size; k++)
5836 if (slp_reduc)
5838 stmt_vec_info scalar_stmt_info = SLP_TREE_SCALAR_STMTS (slp_node)[k];
5840 orig_stmt_info = STMT_VINFO_RELATED_STMT (scalar_stmt_info);
5841 /* SLP statements can't participate in patterns. */
5842 gcc_assert (!orig_stmt_info);
5843 scalar_dest = gimple_assign_lhs (scalar_stmt_info->stmt);
5846 if (nested_in_vect_loop)
5848 if (double_reduc)
5849 loop = outer_loop;
5850 else
5851 gcc_unreachable ();
5854 phis.create (3);
5855 /* Find the loop-closed-use at the loop exit of the original scalar
5856 result. (The reduction result is expected to have two immediate uses,
5857 one at the latch block, and one at the loop exit). For double
5858 reductions we are looking for exit phis of the outer loop. */
5859 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, scalar_dest)
5861 if (!flow_bb_inside_loop_p (loop, gimple_bb (USE_STMT (use_p))))
5863 if (!is_gimple_debug (USE_STMT (use_p)))
5864 phis.safe_push (USE_STMT (use_p));
5866 else
5868 if (double_reduc && gimple_code (USE_STMT (use_p)) == GIMPLE_PHI)
5870 tree phi_res = PHI_RESULT (USE_STMT (use_p));
5872 FOR_EACH_IMM_USE_FAST (phi_use_p, phi_imm_iter, phi_res)
5874 if (!flow_bb_inside_loop_p (loop,
5875 gimple_bb (USE_STMT (phi_use_p)))
5876 && !is_gimple_debug (USE_STMT (phi_use_p)))
5877 phis.safe_push (USE_STMT (phi_use_p));
5883 FOR_EACH_VEC_ELT (phis, i, exit_phi)
5885 /* Replace the uses: */
5886 orig_name = PHI_RESULT (exit_phi);
5887 scalar_result = scalar_results[k];
5888 FOR_EACH_IMM_USE_STMT (use_stmt, imm_iter, orig_name)
5890 FOR_EACH_IMM_USE_ON_STMT (use_p, imm_iter)
5891 SET_USE (use_p, scalar_result);
5892 update_stmt (use_stmt);
5896 phis.release ();
5900 /* Return a vector of type VECTYPE that is equal to the vector select
5901 operation "MASK ? VEC : IDENTITY". Insert the select statements
5902 before GSI. */
5904 static tree
5905 merge_with_identity (gimple_stmt_iterator *gsi, tree mask, tree vectype,
5906 tree vec, tree identity)
5908 tree cond = make_temp_ssa_name (vectype, NULL, "cond");
5909 gimple *new_stmt = gimple_build_assign (cond, VEC_COND_EXPR,
5910 mask, vec, identity);
5911 gsi_insert_before (gsi, new_stmt, GSI_SAME_STMT);
5912 return cond;
5915 /* Successively apply CODE to each element of VECTOR_RHS, in left-to-right
5916 order, starting with LHS. Insert the extraction statements before GSI and
5917 associate the new scalar SSA names with variable SCALAR_DEST.
5918 Return the SSA name for the result. */
5920 static tree
5921 vect_expand_fold_left (gimple_stmt_iterator *gsi, tree scalar_dest,
5922 tree_code code, tree lhs, tree vector_rhs)
5924 tree vectype = TREE_TYPE (vector_rhs);
5925 tree scalar_type = TREE_TYPE (vectype);
5926 tree bitsize = TYPE_SIZE (scalar_type);
5927 unsigned HOST_WIDE_INT vec_size_in_bits = tree_to_uhwi (TYPE_SIZE (vectype));
5928 unsigned HOST_WIDE_INT element_bitsize = tree_to_uhwi (bitsize);
5930 for (unsigned HOST_WIDE_INT bit_offset = 0;
5931 bit_offset < vec_size_in_bits;
5932 bit_offset += element_bitsize)
5934 tree bitpos = bitsize_int (bit_offset);
5935 tree rhs = build3 (BIT_FIELD_REF, scalar_type, vector_rhs,
5936 bitsize, bitpos);
5938 gassign *stmt = gimple_build_assign (scalar_dest, rhs);
5939 rhs = make_ssa_name (scalar_dest, stmt);
5940 gimple_assign_set_lhs (stmt, rhs);
5941 gsi_insert_before (gsi, stmt, GSI_SAME_STMT);
5943 stmt = gimple_build_assign (scalar_dest, code, lhs, rhs);
5944 tree new_name = make_ssa_name (scalar_dest, stmt);
5945 gimple_assign_set_lhs (stmt, new_name);
5946 gsi_insert_before (gsi, stmt, GSI_SAME_STMT);
5947 lhs = new_name;
5949 return lhs;
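   /* For example, for a 4-element vector V the expansion is the strictly
      in-order scalar sequence (sketch, in element notation):
        s_0 = V[0];  lhs = lhs CODE s_0;
        s_1 = V[1];  lhs = lhs CODE s_1;
        s_2 = V[2];  lhs = lhs CODE s_2;
        s_3 = V[3];  lhs = lhs CODE s_3;
      and the final LHS is returned.  */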
5952 /* Get a masked internal function equivalent to REDUC_FN. VECTYPE_IN is the
5953 type of the vector input. */
5955 static internal_fn
5956 get_masked_reduction_fn (internal_fn reduc_fn, tree vectype_in)
5958 internal_fn mask_reduc_fn;
5960 switch (reduc_fn)
5962 case IFN_FOLD_LEFT_PLUS:
5963 mask_reduc_fn = IFN_MASK_FOLD_LEFT_PLUS;
5964 break;
5966 default:
5967 return IFN_LAST;
5970 if (direct_internal_fn_supported_p (mask_reduc_fn, vectype_in,
5971 OPTIMIZE_FOR_SPEED))
5972 return mask_reduc_fn;
5973 return IFN_LAST;
5976 /* Perform an in-order reduction (FOLD_LEFT_REDUCTION). STMT_INFO is the
5977 statement that sets the live-out value. REDUC_DEF_STMT is the phi
5978 statement. CODE is the operation performed by STMT_INFO and OPS are
5979 its scalar operands. REDUC_INDEX is the index of the operand in
5980 OPS that is set by REDUC_DEF_STMT. REDUC_FN is the function that
5981 implements in-order reduction, or IFN_LAST if we should open-code it.
5982 VECTYPE_IN is the type of the vector input. MASKS specifies the masks
5983 that should be used to control the operation in a fully-masked loop. */
5985 static bool
5986 vectorize_fold_left_reduction (loop_vec_info loop_vinfo,
5987 stmt_vec_info stmt_info,
5988 gimple_stmt_iterator *gsi,
5989 gimple **vec_stmt, slp_tree slp_node,
5990 gimple *reduc_def_stmt,
5991 tree_code code, internal_fn reduc_fn,
5992 tree ops[3], tree vectype_in,
5993 int reduc_index, vec_loop_masks *masks)
5995 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
5996 tree vectype_out = STMT_VINFO_VECTYPE (stmt_info);
5997 internal_fn mask_reduc_fn = get_masked_reduction_fn (reduc_fn, vectype_in);
5999 int ncopies;
6000 if (slp_node)
6001 ncopies = 1;
6002 else
6003 ncopies = vect_get_num_copies (loop_vinfo, vectype_in);
6005 gcc_assert (!nested_in_vect_loop_p (loop, stmt_info));
6006 gcc_assert (ncopies == 1);
6007 gcc_assert (TREE_CODE_LENGTH (code) == binary_op);
6009 if (slp_node)
6010 gcc_assert (known_eq (TYPE_VECTOR_SUBPARTS (vectype_out),
6011 TYPE_VECTOR_SUBPARTS (vectype_in)));
6013 tree op0 = ops[1 - reduc_index];
6015 int group_size = 1;
6016 stmt_vec_info scalar_dest_def_info;
6017 auto_vec<tree> vec_oprnds0;
6018 if (slp_node)
6020 auto_vec<vec<tree> > vec_defs (2);
6021 vect_get_slp_defs (loop_vinfo, slp_node, &vec_defs);
6022 vec_oprnds0.safe_splice (vec_defs[1 - reduc_index]);
6023 vec_defs[0].release ();
6024 vec_defs[1].release ();
6025 group_size = SLP_TREE_SCALAR_STMTS (slp_node).length ();
6026 scalar_dest_def_info = SLP_TREE_SCALAR_STMTS (slp_node)[group_size - 1];
6028 else
6030 vect_get_vec_defs_for_operand (loop_vinfo, stmt_info, 1,
6031 op0, &vec_oprnds0);
6032 scalar_dest_def_info = stmt_info;
6035 tree scalar_dest = gimple_assign_lhs (scalar_dest_def_info->stmt);
6036 tree scalar_type = TREE_TYPE (scalar_dest);
6037 tree reduc_var = gimple_phi_result (reduc_def_stmt);
6039 int vec_num = vec_oprnds0.length ();
6040 gcc_assert (vec_num == 1 || slp_node);
6041 tree vec_elem_type = TREE_TYPE (vectype_out);
6042 gcc_checking_assert (useless_type_conversion_p (scalar_type, vec_elem_type));
6044 tree vector_identity = NULL_TREE;
6045 if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
6046 vector_identity = build_zero_cst (vectype_out);
6048 tree scalar_dest_var = vect_create_destination_var (scalar_dest, NULL);
6049 int i;
6050 tree def0;
6051 FOR_EACH_VEC_ELT (vec_oprnds0, i, def0)
6053 gimple *new_stmt;
6054 tree mask = NULL_TREE;
6055 if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
6056 mask = vect_get_loop_mask (gsi, masks, vec_num, vectype_in, i);
6058 /* Handle MINUS by adding the negative. */
6059 if (reduc_fn != IFN_LAST && code == MINUS_EXPR)
6061 tree negated = make_ssa_name (vectype_out);
6062 new_stmt = gimple_build_assign (negated, NEGATE_EXPR, def0);
6063 gsi_insert_before (gsi, new_stmt, GSI_SAME_STMT);
6064 def0 = negated;
6067 if (mask && mask_reduc_fn == IFN_LAST)
6068 def0 = merge_with_identity (gsi, mask, vectype_out, def0,
6069 vector_identity);
6071 /* On the first iteration the input is simply the scalar phi
6072 result, and for subsequent iterations it is the output of
6073 the preceding operation. */
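      /* For example, with IFN_FOLD_LEFT_PLUS and two vector defs the calls
         emitted are (sketch, names illustrative):
           reduc_1 = .FOLD_LEFT_PLUS (reduc_var, def0);
           lhs     = .FOLD_LEFT_PLUS (reduc_1, def1);
         preserving the original left-to-right order of the scalar
         additions; .MASK_FOLD_LEFT_PLUS additionally takes the loop mask
         so that inactive lanes do not participate.  */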
6074 if (reduc_fn != IFN_LAST || (mask && mask_reduc_fn != IFN_LAST))
6076 if (mask && mask_reduc_fn != IFN_LAST)
6077 new_stmt = gimple_build_call_internal (mask_reduc_fn, 3, reduc_var,
6078 def0, mask);
6079 else
6080 new_stmt = gimple_build_call_internal (reduc_fn, 2, reduc_var,
6081 def0);
6082 /* For chained SLP reductions the output of the previous reduction
6083 operation serves as the input of the next. For the final statement
6084 the output cannot be a temporary - we reuse the original
6085 scalar destination of the last statement. */
6086 if (i != vec_num - 1)
6088 gimple_set_lhs (new_stmt, scalar_dest_var);
6089 reduc_var = make_ssa_name (scalar_dest_var, new_stmt);
6090 gimple_set_lhs (new_stmt, reduc_var);
6093 else
6095 reduc_var = vect_expand_fold_left (gsi, scalar_dest_var, code,
6096 reduc_var, def0);
6097 new_stmt = SSA_NAME_DEF_STMT (reduc_var);
6098 /* Remove the statement, so that we can use the same code paths
6099 as for statements that we've just created. */
6100 gimple_stmt_iterator tmp_gsi = gsi_for_stmt (new_stmt);
6101 gsi_remove (&tmp_gsi, true);
6104 if (i == vec_num - 1)
6106 gimple_set_lhs (new_stmt, scalar_dest);
6107 vect_finish_replace_stmt (loop_vinfo,
6108 scalar_dest_def_info,
6109 new_stmt);
6111 else
6112 vect_finish_stmt_generation (loop_vinfo,
6113 scalar_dest_def_info,
6114 new_stmt, gsi);
6116 if (slp_node)
6117 SLP_TREE_VEC_STMTS (slp_node).quick_push (new_stmt);
6118 else
6120 STMT_VINFO_VEC_STMTS (stmt_info).safe_push (new_stmt);
6121 *vec_stmt = new_stmt;
6125 return true;
6128 /* Function is_nonwrapping_integer_induction.
6130 Check if STMT_VINFO (which is part of loop LOOP) both increments and
6131 does not cause overflow. */
6133 static bool
6134 is_nonwrapping_integer_induction (stmt_vec_info stmt_vinfo, class loop *loop)
6136 gphi *phi = as_a <gphi *> (stmt_vinfo->stmt);
6137 tree base = STMT_VINFO_LOOP_PHI_EVOLUTION_BASE_UNCHANGED (stmt_vinfo);
6138 tree step = STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_vinfo);
6139 tree lhs_type = TREE_TYPE (gimple_phi_result (phi));
6140 widest_int ni, max_loop_value, lhs_max;
6141 wi::overflow_type overflow = wi::OVF_NONE;
6143 /* Make sure the loop is integer based. */
6144 if (TREE_CODE (base) != INTEGER_CST
6145 || TREE_CODE (step) != INTEGER_CST)
6146 return false;
6148 /* Check that the max size of the loop will not wrap. */
6150 if (TYPE_OVERFLOW_UNDEFINED (lhs_type))
6151 return true;
6153 if (! max_stmt_executions (loop, &ni))
6154 return false;
6156 max_loop_value = wi::mul (wi::to_widest (step), ni, TYPE_SIGN (lhs_type),
6157 &overflow);
6158 if (overflow)
6159 return false;
6161 max_loop_value = wi::add (wi::to_widest (base), max_loop_value,
6162 TYPE_SIGN (lhs_type), &overflow);
6163 if (overflow)
6164 return false;
6166 return (wi::min_precision (max_loop_value, TYPE_SIGN (lhs_type))
6167 <= TYPE_PRECISION (lhs_type));
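   /* As a worked example with illustrative numbers: for a 16-bit unsigned
      IV with base 0 and step 4 in a loop executing at most 1000 times,
      max_loop_value = 0 + 4 * 1000 = 4000, which needs 12 bits of
      precision; 12 <= 16, so the induction cannot wrap.  */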
6170 /* Check if masking can be supported by inserting a conditional expression.
6171 CODE is the code for the operation. COND_FN is the conditional internal
6172 function, if it exists. VECTYPE_IN is the type of the vector input. */
6173 static bool
6174 use_mask_by_cond_expr_p (enum tree_code code, internal_fn cond_fn,
6175 tree vectype_in)
6177 if (cond_fn != IFN_LAST
6178 && direct_internal_fn_supported_p (cond_fn, vectype_in,
6179 OPTIMIZE_FOR_SPEED))
6180 return false;
6182 switch (code)
6184 case DOT_PROD_EXPR:
6185 case SAD_EXPR:
6186 return true;
6188 default:
6189 return false;
6193 /* Insert a conditional expression to enable masked vectorization. CODE is the
6194 code for the operation. VOP is the array of operands. MASK is the loop
6195 mask. GSI is a statement iterator used to place the new conditional
6196 expression. */
6197 static void
6198 build_vect_cond_expr (enum tree_code code, tree vop[3], tree mask,
6199 gimple_stmt_iterator *gsi)
6201 switch (code)
6203 case DOT_PROD_EXPR:
6205 tree vectype = TREE_TYPE (vop[1]);
6206 tree zero = build_zero_cst (vectype);
6207 tree masked_op1 = make_temp_ssa_name (vectype, NULL, "masked_op1");
6208 gassign *select = gimple_build_assign (masked_op1, VEC_COND_EXPR,
6209 mask, vop[1], zero);
6210 gsi_insert_before (gsi, select, GSI_SAME_STMT);
6211 vop[1] = masked_op1;
6212 break;
6215 case SAD_EXPR:
6217 tree vectype = TREE_TYPE (vop[1]);
6218 tree masked_op1 = make_temp_ssa_name (vectype, NULL, "masked_op1");
6219 gassign *select = gimple_build_assign (masked_op1, VEC_COND_EXPR,
6220 mask, vop[1], vop[0]);
6221 gsi_insert_before (gsi, select, GSI_SAME_STMT);
6222 vop[1] = masked_op1;
6223 break;
6226 default:
6227 gcc_unreachable ();
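   /* A small worked note on why the selects above preserve the result:
      for DOT_PROD_EXPR, replacing the inactive lanes of vop[1] with zero
      makes each such lane contribute vop[0][i] * 0 = 0 to the accumulator;
      for SAD_EXPR, replacing them with the corresponding lanes of vop[0]
      makes the contribution |vop[0][i] - vop[0][i]| = 0.  Masked-off
      elements therefore leave the reduction value unchanged.  */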
6231 /* Function vectorizable_reduction.
6233 Check if STMT_INFO performs a reduction operation that can be vectorized.
6234 If VEC_STMT is also passed, vectorize STMT_INFO: create a vectorized
6235 stmt to replace it, put it in VEC_STMT, and insert it at GSI.
6236 Return true if STMT_INFO is vectorizable in this way.
6238 This function also handles reduction idioms (patterns) that have been
6239 recognized in advance during vect_pattern_recog. In this case, STMT_INFO
6240 may be of this form:
6241 X = pattern_expr (arg0, arg1, ..., X)
6242 and its STMT_VINFO_RELATED_STMT points to the last stmt in the original
6243 sequence that had been detected and replaced by the pattern-stmt
6244 (STMT_INFO).
6246 This function also handles reduction of condition expressions, for example:
6247 for (int i = 0; i < N; i++)
6248 if (a[i] < value)
6249 last = a[i];
6250 This is handled by vectorising the loop and creating an additional vector
6251 containing the loop indexes for which "a[i] < value" was true. In the
6252 function epilogue this is reduced to a single max value and then used to
6253 index into the vector of results.
6255 In some cases of reduction patterns, the type of the reduction variable X is
6256 different than the type of the other arguments of STMT_INFO.
6257 In such cases, the vectype that is used when transforming STMT_INFO into
6258 a vector stmt is different than the vectype that is used to determine the
6259 vectorization factor, because it consists of a different number of elements
6260 than the actual number of elements that are being operated upon in parallel.
6262 For example, consider an accumulation of shorts into an int accumulator.
6263 On some targets it's possible to vectorize this pattern operating on 8
6264 shorts at a time (hence, the vectype for purposes of determining the
6265 vectorization factor should be V8HI); on the other hand, the vectype that
6266 is used to create the vector form is actually V4SI (the type of the result).
6268 Upon entry to this function, STMT_VINFO_VECTYPE records the vectype that
6269 indicates what is the actual level of parallelism (V8HI in the example), so
6270 that the right vectorization factor would be derived. This vectype
6271 corresponds to the type of arguments to the reduction stmt, and should *NOT*
6272 be used to create the vectorized stmt. The right vectype for the vectorized
6273 stmt is obtained from the type of the result X:
6274 get_vectype_for_scalar_type (vinfo, TREE_TYPE (X))
6276 This means that, contrary to "regular" reductions (or "regular" stmts in
6277 general), the following equation:
6278 STMT_VINFO_VECTYPE == get_vectype_for_scalar_type (vinfo, TREE_TYPE (X))
6279 does *NOT* necessarily hold for reduction patterns. */
6281 bool
6282 vectorizable_reduction (loop_vec_info loop_vinfo,
6283 stmt_vec_info stmt_info, slp_tree slp_node,
6284 slp_instance slp_node_instance,
6285 stmt_vector_for_cost *cost_vec)
6287 tree scalar_dest;
6288 tree vectype_in = NULL_TREE;
6289 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
6290 enum vect_def_type cond_reduc_dt = vect_unknown_def_type;
6291 stmt_vec_info cond_stmt_vinfo = NULL;
6292 tree scalar_type;
6293 int i;
6294 int ncopies;
6295 bool single_defuse_cycle = false;
6296 bool nested_cycle = false;
6297 bool double_reduc = false;
6298 int vec_num;
6299 tree tem;
6300 tree cr_index_scalar_type = NULL_TREE, cr_index_vector_type = NULL_TREE;
6301 tree cond_reduc_val = NULL_TREE;
6303 /* Make sure it was already recognized as a reduction computation. */
6304 if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_reduction_def
6305 && STMT_VINFO_DEF_TYPE (stmt_info) != vect_double_reduction_def
6306 && STMT_VINFO_DEF_TYPE (stmt_info) != vect_nested_cycle)
6307 return false;
6309 /* The stmt we store reduction analysis meta on. */
6310 stmt_vec_info reduc_info = info_for_reduction (loop_vinfo, stmt_info);
6311 reduc_info->is_reduc_info = true;
6313 if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_nested_cycle)
6315 if (is_a <gphi *> (stmt_info->stmt))
6316 /* Analysis for double-reduction is done on the outer
6317 loop PHI; nested cycles have no further restrictions. */
6318 STMT_VINFO_TYPE (stmt_info) = cycle_phi_info_type;
6319 else
6320 STMT_VINFO_TYPE (stmt_info) = reduc_vec_info_type;
6321 return true;
6324 stmt_vec_info orig_stmt_of_analysis = stmt_info;
6325 stmt_vec_info phi_info = stmt_info;
6326 if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_reduction_def
6327 || STMT_VINFO_DEF_TYPE (stmt_info) == vect_double_reduction_def)
6329 if (!is_a <gphi *> (stmt_info->stmt))
6331 STMT_VINFO_TYPE (stmt_info) = reduc_vec_info_type;
6332 return true;
6334 if (slp_node)
6336 slp_node_instance->reduc_phis = slp_node;
6337 /* ??? We're leaving slp_node to point to the PHIs, we only
6338 need it to get at the number of vector stmts which wasn't
6339 yet initialized for the instance root. */
6341 if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_reduction_def)
6342 stmt_info = vect_stmt_to_vectorize (STMT_VINFO_REDUC_DEF (stmt_info));
6343 else /* STMT_VINFO_DEF_TYPE (stmt_info) == vect_double_reduction_def */
6345 use_operand_p use_p;
6346 gimple *use_stmt;
6347 bool res = single_imm_use (gimple_phi_result (stmt_info->stmt),
6348 &use_p, &use_stmt);
6349 gcc_assert (res);
6350 phi_info = loop_vinfo->lookup_stmt (use_stmt);
6351 stmt_info = vect_stmt_to_vectorize (STMT_VINFO_REDUC_DEF (phi_info));
6355 /* PHIs should not participate in patterns. */
6356 gcc_assert (!STMT_VINFO_RELATED_STMT (phi_info));
6357 gphi *reduc_def_phi = as_a <gphi *> (phi_info->stmt);
6359 /* Verify following REDUC_IDX from the latch def leads us back to the PHI
6360 and compute the reduction chain length. Discover the real
6361 reduction operation stmt on the way (stmt_info and slp_for_stmt_info). */
6362 tree reduc_def
6363 = PHI_ARG_DEF_FROM_EDGE (reduc_def_phi,
6364 loop_latch_edge
6365 (gimple_bb (reduc_def_phi)->loop_father));
6366 unsigned reduc_chain_length = 0;
6367 bool only_slp_reduc_chain = true;
6368 stmt_info = NULL;
6369 slp_tree slp_for_stmt_info = slp_node ? slp_node_instance->root : NULL;
6370 while (reduc_def != PHI_RESULT (reduc_def_phi))
6372 stmt_vec_info def = loop_vinfo->lookup_def (reduc_def);
6373 stmt_vec_info vdef = vect_stmt_to_vectorize (def);
6374 if (STMT_VINFO_REDUC_IDX (vdef) == -1)
6376 if (dump_enabled_p ())
6377 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6378 "reduction chain broken by patterns.\n");
6379 return false;
6381 if (!REDUC_GROUP_FIRST_ELEMENT (vdef))
6382 only_slp_reduc_chain = false;
6383 /* ??? For epilogue generation live members of the chain need
6384 to point back to the PHI via their original stmt for
6385 info_for_reduction to work. */
6386 if (STMT_VINFO_LIVE_P (vdef))
6387 STMT_VINFO_REDUC_DEF (def) = phi_info;
6388 gassign *assign = dyn_cast <gassign *> (vdef->stmt);
6389 if (!assign)
6391 if (dump_enabled_p ())
6392 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6393 "reduction chain includes calls.\n");
6394 return false;
6396 if (CONVERT_EXPR_CODE_P (gimple_assign_rhs_code (assign)))
6398 if (!tree_nop_conversion_p (TREE_TYPE (gimple_assign_lhs (assign)),
6399 TREE_TYPE (gimple_assign_rhs1 (assign))))
6401 if (dump_enabled_p ())
6402 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6403 "conversion in the reduction chain.\n");
6404 return false;
6407 else if (!stmt_info)
6408 /* First non-conversion stmt. */
6409 stmt_info = vdef;
6410 reduc_def = gimple_op (vdef->stmt, 1 + STMT_VINFO_REDUC_IDX (vdef));
6411 reduc_chain_length++;
6412 if (!stmt_info && slp_node)
6413 slp_for_stmt_info = SLP_TREE_CHILDREN (slp_for_stmt_info)[0];
6415 /* PHIs should not participate in patterns. */
6416 gcc_assert (!STMT_VINFO_RELATED_STMT (phi_info));
6418 if (nested_in_vect_loop_p (loop, stmt_info))
6420 loop = loop->inner;
6421 nested_cycle = true;
6424 /* STMT_VINFO_REDUC_DEF doesn't point to the first but the last
6425 element. */
6426 if (slp_node && REDUC_GROUP_FIRST_ELEMENT (stmt_info))
6428 gcc_assert (!REDUC_GROUP_NEXT_ELEMENT (stmt_info));
6429 stmt_info = REDUC_GROUP_FIRST_ELEMENT (stmt_info);
6431 if (REDUC_GROUP_FIRST_ELEMENT (stmt_info))
6432 gcc_assert (slp_node
6433 && REDUC_GROUP_FIRST_ELEMENT (stmt_info) == stmt_info);
6435 /* 1. Is vectorizable reduction? */
6436 /* Not supportable if the reduction variable is used in the loop, unless
6437 it's a reduction chain. */
6438 if (STMT_VINFO_RELEVANT (stmt_info) > vect_used_in_outer
6439 && !REDUC_GROUP_FIRST_ELEMENT (stmt_info))
6440 return false;
6442 /* Reductions that are not used even in an enclosing outer-loop
6443 are expected to be "live" (used out of the loop). */
6444 if (STMT_VINFO_RELEVANT (stmt_info) == vect_unused_in_scope
6445 && !STMT_VINFO_LIVE_P (stmt_info))
6446 return false;
6448 /* 2. Has this been recognized as a reduction pattern?
6450 Check if STMT represents a pattern that has been recognized
6451 in earlier analysis stages. For stmts that represent a pattern,
6452 the STMT_VINFO_RELATED_STMT field records the last stmt in
6453 the original sequence that constitutes the pattern. */
6455 stmt_vec_info orig_stmt_info = STMT_VINFO_RELATED_STMT (stmt_info);
6456 if (orig_stmt_info)
6458 gcc_assert (STMT_VINFO_IN_PATTERN_P (orig_stmt_info));
6459 gcc_assert (!STMT_VINFO_IN_PATTERN_P (stmt_info));
6462 /* 3. Check the operands of the operation. The first operands are defined
6463 inside the loop body. The last operand is the reduction variable,
6464 which is defined by the loop-header-phi. */
6466 tree vectype_out = STMT_VINFO_VECTYPE (stmt_info);
6467 STMT_VINFO_REDUC_VECTYPE (reduc_info) = vectype_out;
6468 gassign *stmt = as_a <gassign *> (stmt_info->stmt);
6469 enum tree_code code = gimple_assign_rhs_code (stmt);
6470 bool lane_reduc_code_p
6471 = (code == DOT_PROD_EXPR || code == WIDEN_SUM_EXPR || code == SAD_EXPR);
6472 int op_type = TREE_CODE_LENGTH (code);
6474 scalar_dest = gimple_assign_lhs (stmt);
6475 scalar_type = TREE_TYPE (scalar_dest);
6476 if (!POINTER_TYPE_P (scalar_type) && !INTEGRAL_TYPE_P (scalar_type)
6477 && !SCALAR_FLOAT_TYPE_P (scalar_type))
6478 return false;
6480 /* Do not try to vectorize bit-precision reductions. */
6481 if (!type_has_mode_precision_p (scalar_type))
6482 return false;
6484 /* For lane-reducing ops we're reducing the number of reduction PHIs,
6485 which means the only use of the PHI may be in the lane-reducing operation. */
6486 if (lane_reduc_code_p
6487 && reduc_chain_length != 1
6488 && !only_slp_reduc_chain)
6490 if (dump_enabled_p ())
6491 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6492 "lane-reducing reduction with extra stmts.\n");
6493 return false;
6496 /* All uses but the last are expected to be defined in the loop.
6497 The last use is the reduction variable. In case of nested cycle this
6498 assumption is not true: we use reduc_index to record the index of the
6499 reduction variable. */
6500 slp_tree *slp_op = XALLOCAVEC (slp_tree, op_type);
6501 /* We need to skip an extra operand for COND_EXPRs with embedded
6502 comparison. */
6503 unsigned opno_adjust = 0;
6504 if (code == COND_EXPR
6505 && COMPARISON_CLASS_P (gimple_assign_rhs1 (stmt)))
6506 opno_adjust = 1;
6507 for (i = 0; i < op_type; i++)
6509 /* The condition of COND_EXPR is checked in vectorizable_condition(). */
6510 if (i == 0 && code == COND_EXPR)
6511 continue;
6513 stmt_vec_info def_stmt_info;
6514 enum vect_def_type dt;
6515 tree op;
6516 if (!vect_is_simple_use (loop_vinfo, stmt_info, slp_for_stmt_info,
6517 i + opno_adjust, &op, &slp_op[i], &dt, &tem,
6518 &def_stmt_info))
6520 if (dump_enabled_p ())
6521 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6522 "use not simple.\n");
6523 return false;
6525 if (i == STMT_VINFO_REDUC_IDX (stmt_info))
6526 continue;
6528 /* There should be only one cycle def in the stmt, the one
6529 leading to reduc_def. */
6530 if (VECTORIZABLE_CYCLE_DEF (dt))
6531 return false;
6533 /* To properly compute ncopies we are interested in the widest
6534 non-reduction input type in case we're looking at a widening
6535 accumulation that we later handle in vect_transform_reduction. */
6536 if (lane_reduc_code_p
6537 && tem
6538 && (!vectype_in
6539 || (GET_MODE_SIZE (SCALAR_TYPE_MODE (TREE_TYPE (vectype_in)))
6540 < GET_MODE_SIZE (SCALAR_TYPE_MODE (TREE_TYPE (tem))))))
6541 vectype_in = tem;
6543 if (code == COND_EXPR)
6545 /* Record how the non-reduction-def value of COND_EXPR is defined. */
6546 if (dt == vect_constant_def)
6548 cond_reduc_dt = dt;
6549 cond_reduc_val = op;
6551 if (dt == vect_induction_def
6552 && def_stmt_info
6553 && is_nonwrapping_integer_induction (def_stmt_info, loop))
6555 cond_reduc_dt = dt;
6556 cond_stmt_vinfo = def_stmt_info;
6560 if (!vectype_in)
6561 vectype_in = STMT_VINFO_VECTYPE (phi_info);
6562 STMT_VINFO_REDUC_VECTYPE_IN (reduc_info) = vectype_in;
6564 enum vect_reduction_type v_reduc_type = STMT_VINFO_REDUC_TYPE (phi_info);
6565 STMT_VINFO_REDUC_TYPE (reduc_info) = v_reduc_type;
6566 /* If we have a condition reduction, see if we can simplify it further. */
6567 if (v_reduc_type == COND_REDUCTION)
6569 if (slp_node)
6570 return false;
6572 /* When the reduction value is used in the condition itself, fail. */
6573 if (STMT_VINFO_REDUC_IDX (stmt_info) == 0)
6575 if (dump_enabled_p ())
6576 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6577 "condition depends on previous iteration\n");
6578 return false;
6581 if (reduc_chain_length == 1
6582 && direct_internal_fn_supported_p (IFN_FOLD_EXTRACT_LAST,
6583 vectype_in, OPTIMIZE_FOR_SPEED))
6585 if (dump_enabled_p ())
6586 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6587 "optimizing condition reduction with"
6588 " FOLD_EXTRACT_LAST.\n");
6589 STMT_VINFO_REDUC_TYPE (reduc_info) = EXTRACT_LAST_REDUCTION;
6591 else if (cond_reduc_dt == vect_induction_def)
6593 tree base
6594 = STMT_VINFO_LOOP_PHI_EVOLUTION_BASE_UNCHANGED (cond_stmt_vinfo);
6595 tree step = STMT_VINFO_LOOP_PHI_EVOLUTION_PART (cond_stmt_vinfo);
6597 gcc_assert (TREE_CODE (base) == INTEGER_CST
6598 && TREE_CODE (step) == INTEGER_CST);
6599 cond_reduc_val = NULL_TREE;
6600 enum tree_code cond_reduc_op_code = ERROR_MARK;
6601 tree res = PHI_RESULT (STMT_VINFO_STMT (cond_stmt_vinfo));
6602 if (!types_compatible_p (TREE_TYPE (res), TREE_TYPE (base)))
6604 /* Find a suitable value: below base for MAX_EXPR, above base for
6605 MIN_EXPR; punt for now if base is the minimum value of the type
6606 for MAX_EXPR or the maximum value of the type for MIN_EXPR. */
6607 else if (tree_int_cst_sgn (step) == -1)
6609 cond_reduc_op_code = MIN_EXPR;
6610 if (tree_int_cst_sgn (base) == -1)
6611 cond_reduc_val = build_int_cst (TREE_TYPE (base), 0);
6612 else if (tree_int_cst_lt (base,
6613 TYPE_MAX_VALUE (TREE_TYPE (base))))
6614 cond_reduc_val
6615 = int_const_binop (PLUS_EXPR, base, integer_one_node);
6617 else
6619 cond_reduc_op_code = MAX_EXPR;
6620 if (tree_int_cst_sgn (base) == 1)
6621 cond_reduc_val = build_int_cst (TREE_TYPE (base), 0);
6622 else if (tree_int_cst_lt (TYPE_MIN_VALUE (TREE_TYPE (base)),
6623 base))
6624 cond_reduc_val
6625 = int_const_binop (MINUS_EXPR, base, integer_one_node);
6627 if (cond_reduc_val)
6629 if (dump_enabled_p ())
6630 dump_printf_loc (MSG_NOTE, vect_location,
6631 "condition expression based on "
6632 "integer induction.\n");
6633 STMT_VINFO_REDUC_CODE (reduc_info) = cond_reduc_op_code;
6634 STMT_VINFO_VEC_INDUC_COND_INITIAL_VAL (reduc_info)
6635 = cond_reduc_val;
6636 STMT_VINFO_REDUC_TYPE (reduc_info) = INTEGER_INDUC_COND_REDUCTION;
6639 else if (cond_reduc_dt == vect_constant_def)
6641 enum vect_def_type cond_initial_dt;
6642 tree cond_initial_val
6643 = PHI_ARG_DEF_FROM_EDGE (reduc_def_phi, loop_preheader_edge (loop));
6645 gcc_assert (cond_reduc_val != NULL_TREE);
6646 vect_is_simple_use (cond_initial_val, loop_vinfo, &cond_initial_dt);
6647 if (cond_initial_dt == vect_constant_def
6648 && types_compatible_p (TREE_TYPE (cond_initial_val),
6649 TREE_TYPE (cond_reduc_val)))
6651 tree e = fold_binary (LE_EXPR, boolean_type_node,
6652 cond_initial_val, cond_reduc_val);
6653 if (e && (integer_onep (e) || integer_zerop (e)))
6655 if (dump_enabled_p ())
6656 dump_printf_loc (MSG_NOTE, vect_location,
6657 "condition expression based on "
6658 "compile time constant.\n");
6659 /* Record reduction code at analysis stage. */
6660 STMT_VINFO_REDUC_CODE (reduc_info)
6661 = integer_onep (e) ? MAX_EXPR : MIN_EXPR;
6662 STMT_VINFO_REDUC_TYPE (reduc_info) = CONST_COND_REDUCTION;
6668 if (STMT_VINFO_LIVE_P (phi_info))
6669 return false;
6671 if (slp_node)
6672 ncopies = 1;
6673 else
6674 ncopies = vect_get_num_copies (loop_vinfo, vectype_in);
6676 gcc_assert (ncopies >= 1);
6678 poly_uint64 nunits_out = TYPE_VECTOR_SUBPARTS (vectype_out);
6680 if (nested_cycle)
6682 gcc_assert (STMT_VINFO_DEF_TYPE (reduc_info)
6683 == vect_double_reduction_def);
6684 double_reduc = true;
6687 /* 4.2. Check support for the epilog operation.
6689 If STMT represents a reduction pattern, then the type of the
6690 reduction variable may be different than the type of the rest
6691 of the arguments. For example, consider the case of accumulation
6692 of shorts into an int accumulator; The original code:
6693 S1: int_a = (int) short_a;
6694 orig_stmt-> S2: int_acc = plus <int_a ,int_acc>;
6696 was replaced with:
6697 STMT: int_acc = widen_sum <short_a, int_acc>
6699 This means that:
6700 1. The tree-code that is used to create the vector operation in the
6701 epilog code (that reduces the partial results) is not the
6702 tree-code of STMT, but is rather the tree-code of the original
6703 stmt from the pattern that STMT is replacing. I.e., in the example
6704 above we want to use 'widen_sum' in the loop, but 'plus' in the
6705 epilog.
6706 2. The type (mode) we use to check available target support
6707 for the vector operation to be created in the *epilog*, is
6708 determined by the type of the reduction variable (in the example
6709 above we'd check this: optab_handler (plus_optab, vect_int_mode)).
6710 However the type (mode) we use to check available target support
6711 for the vector operation to be created *inside the loop*, is
6712 determined by the type of the other arguments to STMT (in the
6713 example we'd check this: optab_handler (widen_sum_optab,
6714 vect_short_mode)).
6716 This is in contrast to "regular" reductions, in which the types of all
6717 the arguments are the same as the type of the reduction variable.
6718 For "regular" reductions we can therefore use the same vector type
6719 (and also the same tree-code) when generating the epilog code and
6720 when generating the code inside the loop. */
6722 enum tree_code orig_code = STMT_VINFO_REDUC_CODE (phi_info);
6723 STMT_VINFO_REDUC_CODE (reduc_info) = orig_code;
6725 vect_reduction_type reduction_type = STMT_VINFO_REDUC_TYPE (reduc_info);
6726 if (reduction_type == TREE_CODE_REDUCTION)
6728 /* Check whether it's ok to change the order of the computation.
6729 Generally, when vectorizing a reduction we change the order of the
6730 computation. This may change the behavior of the program in some
6731 cases, so we need to check that this is ok. One exception is when
6732 vectorizing an outer-loop: the inner-loop is executed sequentially,
6733 and therefore vectorizing reductions in the inner-loop during
6734 outer-loop vectorization is safe. */
6735 if (needs_fold_left_reduction_p (scalar_type, orig_code))
6737 /* When vectorizing a reduction chain w/o SLP the reduction PHI
6738 is not directly used in stmt. */
6739 if (!only_slp_reduc_chain
6740 && reduc_chain_length != 1)
6742 if (dump_enabled_p ())
6743 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6744 "in-order reduction chain without SLP.\n");
6745 return false;
6747 STMT_VINFO_REDUC_TYPE (reduc_info)
6748 = reduction_type = FOLD_LEFT_REDUCTION;
6750 else if (!commutative_tree_code (orig_code)
6751 || !associative_tree_code (orig_code))
6753 if (dump_enabled_p ())
6754 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6755 "reduction: not commutative/associative");
6756 return false;
6760 if ((double_reduc || reduction_type != TREE_CODE_REDUCTION)
6761 && ncopies > 1)
6763 if (dump_enabled_p ())
6764 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6765 "multiple types in double reduction or condition "
6766 "reduction or fold-left reduction.\n");
6767 return false;
6770 internal_fn reduc_fn = IFN_LAST;
6771 if (reduction_type == TREE_CODE_REDUCTION
6772 || reduction_type == FOLD_LEFT_REDUCTION
6773 || reduction_type == INTEGER_INDUC_COND_REDUCTION
6774 || reduction_type == CONST_COND_REDUCTION)
6776 if (reduction_type == FOLD_LEFT_REDUCTION
6777 ? fold_left_reduction_fn (orig_code, &reduc_fn)
6778 : reduction_fn_for_scalar_code (orig_code, &reduc_fn))
6780 if (reduc_fn != IFN_LAST
6781 && !direct_internal_fn_supported_p (reduc_fn, vectype_out,
6782 OPTIMIZE_FOR_SPEED))
6784 if (dump_enabled_p ())
6785 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6786 "reduc op not supported by target.\n");
6788 reduc_fn = IFN_LAST;
6791 else
6793 if (!nested_cycle || double_reduc)
6795 if (dump_enabled_p ())
6796 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6797 "no reduc code for scalar code.\n");
6799 return false;
6803 else if (reduction_type == COND_REDUCTION)
6805 int scalar_precision
6806 = GET_MODE_PRECISION (SCALAR_TYPE_MODE (scalar_type));
6807 cr_index_scalar_type = make_unsigned_type (scalar_precision);
6808 cr_index_vector_type = build_vector_type (cr_index_scalar_type,
6809 nunits_out);
6811 if (direct_internal_fn_supported_p (IFN_REDUC_MAX, cr_index_vector_type,
6812 OPTIMIZE_FOR_SPEED))
6813 reduc_fn = IFN_REDUC_MAX;
6815 STMT_VINFO_REDUC_FN (reduc_info) = reduc_fn;
6817 if (reduction_type != EXTRACT_LAST_REDUCTION
6818 && (!nested_cycle || double_reduc)
6819 && reduc_fn == IFN_LAST
6820 && !nunits_out.is_constant ())
6822 if (dump_enabled_p ())
6823 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6824 "missing target support for reduction on"
6825 " variable-length vectors.\n");
6826 return false;
6829 /* For SLP reductions, see if there is a neutral value we can use. */
6830 tree neutral_op = NULL_TREE;
6831 if (slp_node)
6832 neutral_op = neutral_op_for_slp_reduction
6833 (slp_node_instance->reduc_phis, vectype_out, orig_code,
6834 REDUC_GROUP_FIRST_ELEMENT (stmt_info) != NULL);
6836 if (double_reduc && reduction_type == FOLD_LEFT_REDUCTION)
6838 /* We can't support in-order reductions of code such as this:
6840 for (int i = 0; i < n1; ++i)
6841 for (int j = 0; j < n2; ++j)
6842 l += a[j];
6844 since GCC effectively transforms the loop when vectorizing:
6846 for (int i = 0; i < n1 / VF; ++i)
6847 for (int j = 0; j < n2; ++j)
6848 for (int k = 0; k < VF; ++k)
6849 l += a[j];
6851 which is a reassociation of the original operation. */
6852 if (dump_enabled_p ())
6853 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6854 "in-order double reduction not supported.\n");
6856 return false;
6859 if (reduction_type == FOLD_LEFT_REDUCTION
6860 && slp_node
6861 && !REDUC_GROUP_FIRST_ELEMENT (stmt_info))
6863 /* We cannot use in-order reductions in this case because there is
6864 an implicit reassociation of the operations involved. */
6865 if (dump_enabled_p ())
6866 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6867 "in-order unchained SLP reductions not supported.\n");
6868 return false;
6871 /* For double reductions, and for SLP reductions with a neutral value,
6872 we construct a variable-length initial vector by loading a vector
6873 full of the neutral value and then shift-and-inserting the start
6874 values into the low-numbered elements. */
6875 if ((double_reduc || neutral_op)
6876 && !nunits_out.is_constant ()
6877 && !direct_internal_fn_supported_p (IFN_VEC_SHL_INSERT,
6878 vectype_out, OPTIMIZE_FOR_SPEED))
6880 if (dump_enabled_p ())
6881 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6882 "reduction on variable-length vectors requires"
6883 " target support for a vector-shift-and-insert"
6884 " operation.\n");
6885 return false;
6888 /* Check extra constraints for variable-length unchained SLP reductions. */
6889 if (STMT_SLP_TYPE (stmt_info)
6890 && !REDUC_GROUP_FIRST_ELEMENT (stmt_info)
6891 && !nunits_out.is_constant ())
6893 /* We checked above that we could build the initial vector when
6894 there's a neutral element value. Check here for the case in
6895 which each SLP statement has its own initial value and in which
6896 that value needs to be repeated for every instance of the
6897 statement within the initial vector. */
6898 unsigned int group_size = SLP_TREE_LANES (slp_node);
6899 if (!neutral_op
6900 && !can_duplicate_and_interleave_p (loop_vinfo, group_size,
6901 TREE_TYPE (vectype_out)))
6903 if (dump_enabled_p ())
6904 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6905 "unsupported form of SLP reduction for"
6906 " variable-length vectors: cannot build"
6907 " initial vector.\n");
6908 return false;
6910 /* The epilogue code relies on the number of elements being a multiple
6911 of the group size. The duplicate-and-interleave approach to setting
6912 up the initial vector does too. */
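/* For example (illustrative numbers only): with variable-length vectors
   of 4 + 4*x int elements, a group of 2 reduction results is fine
   because the element count is always even, but a group of 3 is
   rejected here since 4 + 4*x is not provably a multiple of 3. */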
6913 if (!multiple_p (nunits_out, group_size))
6915 if (dump_enabled_p ())
6916 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6917 "unsupported form of SLP reduction for"
6918 " variable-length vectors: the vector size"
6919 " is not a multiple of the number of results.\n");
6920 return false;
6924 if (reduction_type == COND_REDUCTION)
6926 widest_int ni;
6928 if (! max_loop_iterations (loop, &ni))
6930 if (dump_enabled_p ())
6931 dump_printf_loc (MSG_NOTE, vect_location,
6932 "loop count not known, cannot create cond "
6933 "reduction.\n");
6934 return false;
6936 /* Convert backedges to iterations. */
6937 ni += 1;
6939 /* The additional index will be the same type as the condition. Check
6940 that the loop iteration count fits into this type less one (the
6941 zero slot is reserved for iterations with no match). */
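/* For instance, with a 16-bit unsigned index type the check below
   allows at most 65534 iterations, i.e. one less than TYPE_MAX_VALUE,
   keeping the zero slot free for iterations with no match
   (numbers illustrative). */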
6942 tree max_index = TYPE_MAX_VALUE (cr_index_scalar_type);
6943 if (wi::geu_p (ni, wi::to_widest (max_index)))
6945 if (dump_enabled_p ())
6946 dump_printf_loc (MSG_NOTE, vect_location,
6947 "loop size is greater than data size.\n");
6948 return false;
6952 /* In case the vectorization factor (VF) is bigger than the number
6953 of elements that we can fit in a vectype (nunits), we have to generate
6954 more than one vector stmt - i.e - we need to "unroll" the
6955 vector stmt by a factor VF/nunits. For more details see documentation
6956 in vectorizable_operation. */
6958 /* If the reduction is used in an outer loop we need to generate
6959 VF intermediate results, like so (e.g. for ncopies=2):
6960 r0 = phi (init, r0)
6961 r1 = phi (init, r1)
6962 r0 = x0 + r0;
6963 r1 = x1 + r1;
6964 (i.e. we generate VF results in 2 registers).
6965 In this case we have a separate def-use cycle for each copy, and therefore
6966 for each copy we get the vector def for the reduction variable from the
6967 respective phi node created for this copy.
6969 Otherwise (the reduction is unused in the loop nest), we can combine
6970 together intermediate results, like so (e.g. for ncopies=2):
6971 r = phi (init, r)
6972 r = x0 + r;
6973 r = x1 + r;
6974 (i.e. we generate VF/2 results in a single register).
6975 In this case for each copy we get the vector def for the reduction variable
6976 from the vectorized reduction operation generated in the previous iteration.
6978 This only works when we see both the reduction PHI and its only consumer
6979 in vectorizable_reduction and there are no intermediate stmts
6980 participating. */
6981 if (ncopies > 1
6982 && (STMT_VINFO_RELEVANT (stmt_info) <= vect_used_only_live)
6983 && reduc_chain_length == 1)
6984 single_defuse_cycle = true;
6986 if (single_defuse_cycle || lane_reduc_code_p)
6988 gcc_assert (code != COND_EXPR);
6990 /* 4. Supportable by target? */
6991 bool ok = true;
6993 /* 4.1. check support for the operation in the loop */
6994 optab optab = optab_for_tree_code (code, vectype_in, optab_vector);
6995 if (!optab)
6997 if (dump_enabled_p ())
6998 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6999 "no optab.\n");
7000 ok = false;
7003 machine_mode vec_mode = TYPE_MODE (vectype_in);
7004 if (ok && optab_handler (optab, vec_mode) == CODE_FOR_nothing)
7006 if (dump_enabled_p ())
7007 dump_printf (MSG_NOTE, "op not supported by target.\n");
7008 if (maybe_ne (GET_MODE_SIZE (vec_mode), UNITS_PER_WORD)
7009 || !vect_worthwhile_without_simd_p (loop_vinfo, code))
7010 ok = false;
7011 else
7012 if (dump_enabled_p ())
7013 dump_printf (MSG_NOTE, "proceeding using word mode.\n");
7016 /* Worthwhile without SIMD support? */
7017 if (ok
7018 && !VECTOR_MODE_P (TYPE_MODE (vectype_in))
7019 && !vect_worthwhile_without_simd_p (loop_vinfo, code))
7021 if (dump_enabled_p ())
7022 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7023 "not worthwhile without SIMD support.\n");
7024 ok = false;
7027 /* lane-reducing operations have to go through vect_transform_reduction.
7028 For the other cases try without the single cycle optimization. */
7029 if (!ok)
7031 if (lane_reduc_code_p)
7032 return false;
7033 else
7034 single_defuse_cycle = false;
7037 STMT_VINFO_FORCE_SINGLE_CYCLE (reduc_info) = single_defuse_cycle;
7039 /* If the reduction stmt is one of the patterns that have lane
7040 reduction embedded we cannot handle the case of ! single_defuse_cycle. */
7041 if ((ncopies > 1 && ! single_defuse_cycle)
7042 && lane_reduc_code_p)
7044 if (dump_enabled_p ())
7045 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7046 "multi def-use cycle not possible for lane-reducing "
7047 "reduction operation\n");
7048 return false;
7051 if (slp_node
7052 && !(!single_defuse_cycle
7053 && code != DOT_PROD_EXPR
7054 && code != WIDEN_SUM_EXPR
7055 && code != SAD_EXPR
7056 && reduction_type != FOLD_LEFT_REDUCTION))
7057 for (i = 0; i < op_type; i++)
7058 if (!vect_maybe_update_slp_op_vectype (slp_op[i], vectype_in))
7060 if (dump_enabled_p ())
7061 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7062 "incompatible vector types for invariants\n");
7063 return false;
7066 if (slp_node)
7067 vec_num = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node);
7068 else
7069 vec_num = 1;
7071 vect_model_reduction_cost (loop_vinfo, stmt_info, reduc_fn,
7072 reduction_type, ncopies, cost_vec);
7073 if (dump_enabled_p ()
7074 && reduction_type == FOLD_LEFT_REDUCTION)
7075 dump_printf_loc (MSG_NOTE, vect_location,
7076 "using an in-order (fold-left) reduction.\n");
7077 STMT_VINFO_TYPE (orig_stmt_of_analysis) = cycle_phi_info_type;
7078 /* All reductions except single-defuse-cycle-optimized, lane-reducing
7079 and fold-left ones go through the regular vectorizable_* routines. */
7080 if (!single_defuse_cycle
7081 && code != DOT_PROD_EXPR
7082 && code != WIDEN_SUM_EXPR
7083 && code != SAD_EXPR
7084 && reduction_type != FOLD_LEFT_REDUCTION)
7086 stmt_vec_info tem
7087 = vect_stmt_to_vectorize (STMT_VINFO_REDUC_DEF (phi_info));
7088 if (slp_node && REDUC_GROUP_FIRST_ELEMENT (tem))
7090 gcc_assert (!REDUC_GROUP_NEXT_ELEMENT (tem));
7091 tem = REDUC_GROUP_FIRST_ELEMENT (tem);
7093 STMT_VINFO_DEF_TYPE (vect_orig_stmt (tem)) = vect_internal_def;
7094 STMT_VINFO_DEF_TYPE (tem) = vect_internal_def;
7096 else if (loop_vinfo && LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo))
7098 vec_loop_masks *masks = &LOOP_VINFO_MASKS (loop_vinfo);
7099 internal_fn cond_fn = get_conditional_internal_fn (code);
7101 if (reduction_type != FOLD_LEFT_REDUCTION
7102 && !use_mask_by_cond_expr_p (code, cond_fn, vectype_in)
7103 && (cond_fn == IFN_LAST
7104 || !direct_internal_fn_supported_p (cond_fn, vectype_in,
7105 OPTIMIZE_FOR_SPEED)))
7107 if (dump_enabled_p ())
7108 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7109 "can't operate on partial vectors because"
7110 " no conditional operation is available.\n");
7111 LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo) = false;
7113 else if (reduction_type == FOLD_LEFT_REDUCTION
7114 && reduc_fn == IFN_LAST
7115 && !expand_vec_cond_expr_p (vectype_in,
7116 truth_type_for (vectype_in),
7117 SSA_NAME))
7119 if (dump_enabled_p ())
7120 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7121 "can't operate on partial vectors because"
7122 " no conditional operation is available.\n");
7123 LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo) = false;
7125 else
7126 vect_record_loop_mask (loop_vinfo, masks, ncopies * vec_num,
7127 vectype_in, NULL);
7129 return true;
7132 /* Transform the definition stmt STMT_INFO of a reduction PHI backedge
7133 value. */
7135 bool
7136 vect_transform_reduction (loop_vec_info loop_vinfo,
7137 stmt_vec_info stmt_info, gimple_stmt_iterator *gsi,
7138 gimple **vec_stmt, slp_tree slp_node)
7140 tree vectype_out = STMT_VINFO_VECTYPE (stmt_info);
7141 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
7142 int i;
7143 int ncopies;
7144 int vec_num;
7146 stmt_vec_info reduc_info = info_for_reduction (loop_vinfo, stmt_info);
7147 gcc_assert (reduc_info->is_reduc_info);
7149 if (nested_in_vect_loop_p (loop, stmt_info))
7151 loop = loop->inner;
7152 gcc_assert (STMT_VINFO_DEF_TYPE (reduc_info) == vect_double_reduction_def);
7155 gassign *stmt = as_a <gassign *> (stmt_info->stmt);
7156 enum tree_code code = gimple_assign_rhs_code (stmt);
7157 int op_type = TREE_CODE_LENGTH (code);
7159 /* Flatten RHS. */
7160 tree ops[3];
7161 switch (get_gimple_rhs_class (code))
7163 case GIMPLE_TERNARY_RHS:
7164 ops[2] = gimple_assign_rhs3 (stmt);
7165 /* Fall thru. */
7166 case GIMPLE_BINARY_RHS:
7167 ops[0] = gimple_assign_rhs1 (stmt);
7168 ops[1] = gimple_assign_rhs2 (stmt);
7169 break;
7170 default:
7171 gcc_unreachable ();
7174 /* All uses but the last are expected to be defined in the loop.
7175 The last use is the reduction variable. In case of nested cycle this
7176 assumption is not true: we use reduc_index to record the index of the
7177 reduction variable. */
7178 stmt_vec_info phi_info = STMT_VINFO_REDUC_DEF (vect_orig_stmt (stmt_info));
7179 gphi *reduc_def_phi = as_a <gphi *> (phi_info->stmt);
7180 int reduc_index = STMT_VINFO_REDUC_IDX (stmt_info);
7181 tree vectype_in = STMT_VINFO_REDUC_VECTYPE_IN (reduc_info);
7183 if (slp_node)
7185 ncopies = 1;
7186 vec_num = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node);
7188 else
7190 ncopies = vect_get_num_copies (loop_vinfo, vectype_in);
7191 vec_num = 1;
7194 internal_fn cond_fn = get_conditional_internal_fn (code);
7195 vec_loop_masks *masks = &LOOP_VINFO_MASKS (loop_vinfo);
7196 bool mask_by_cond_expr = use_mask_by_cond_expr_p (code, cond_fn, vectype_in);
7198 /* Transform. */
7199 tree new_temp = NULL_TREE;
7200 auto_vec<tree> vec_oprnds0;
7201 auto_vec<tree> vec_oprnds1;
7202 auto_vec<tree> vec_oprnds2;
7203 tree def0;
7205 if (dump_enabled_p ())
7206 dump_printf_loc (MSG_NOTE, vect_location, "transform reduction.\n");
7208 /* FORNOW: Multiple types are not supported for condition. */
7209 if (code == COND_EXPR)
7210 gcc_assert (ncopies == 1);
7212 bool masked_loop_p = LOOP_VINFO_FULLY_MASKED_P (loop_vinfo);
7214 vect_reduction_type reduction_type = STMT_VINFO_REDUC_TYPE (reduc_info);
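/* In-order reductions keep a scalar accumulator and fold each vector
   into it in element order; e.g. for a PLUS_EXPR reduction on a target
   with .FOLD_LEFT_PLUS support the generated statement is roughly
     x_1 = .FOLD_LEFT_PLUS (x_0, vect_a);
   (SSA names here are illustrative only). */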
7215 if (reduction_type == FOLD_LEFT_REDUCTION)
7217 internal_fn reduc_fn = STMT_VINFO_REDUC_FN (reduc_info);
7218 return vectorize_fold_left_reduction
7219 (loop_vinfo, stmt_info, gsi, vec_stmt, slp_node, reduc_def_phi, code,
7220 reduc_fn, ops, vectype_in, reduc_index, masks);
7223 bool single_defuse_cycle = STMT_VINFO_FORCE_SINGLE_CYCLE (reduc_info);
7224 gcc_assert (single_defuse_cycle
7225 || code == DOT_PROD_EXPR
7226 || code == WIDEN_SUM_EXPR
7227 || code == SAD_EXPR);
7229 /* Create the destination vector */
7230 tree scalar_dest = gimple_assign_lhs (stmt);
7231 tree vec_dest = vect_create_destination_var (scalar_dest, vectype_out);
7233 vect_get_vec_defs (loop_vinfo, stmt_info, slp_node, ncopies,
7234 single_defuse_cycle && reduc_index == 0
7235 ? NULL_TREE : ops[0], &vec_oprnds0,
7236 single_defuse_cycle && reduc_index == 1
7237 ? NULL_TREE : ops[1], &vec_oprnds1,
7238 op_type == ternary_op
7239 && !(single_defuse_cycle && reduc_index == 2)
7240 ? ops[2] : NULL_TREE, &vec_oprnds2);
7241 if (single_defuse_cycle)
7243 gcc_assert (!slp_node);
7244 vect_get_vec_defs_for_operand (loop_vinfo, stmt_info, 1,
7245 ops[reduc_index],
7246 reduc_index == 0 ? &vec_oprnds0
7247 : (reduc_index == 1 ? &vec_oprnds1
7248 : &vec_oprnds2));
7251 FOR_EACH_VEC_ELT (vec_oprnds0, i, def0)
7253 gimple *new_stmt;
7254 tree vop[3] = { def0, vec_oprnds1[i], NULL_TREE };
7255 if (masked_loop_p && !mask_by_cond_expr)
7257 /* Make sure that the reduction accumulator is vop[0]. */
7258 if (reduc_index == 1)
7260 gcc_assert (commutative_tree_code (code));
7261 std::swap (vop[0], vop[1]);
7263 tree mask = vect_get_loop_mask (gsi, masks, vec_num * ncopies,
7264 vectype_in, i);
7265 gcall *call = gimple_build_call_internal (cond_fn, 4, mask,
7266 vop[0], vop[1], vop[0]);
7267 new_temp = make_ssa_name (vec_dest, call);
7268 gimple_call_set_lhs (call, new_temp);
7269 gimple_call_set_nothrow (call, true);
7270 vect_finish_stmt_generation (loop_vinfo, stmt_info, call, gsi);
7271 new_stmt = call;
7273 else
7275 if (op_type == ternary_op)
7276 vop[2] = vec_oprnds2[i];
7278 if (masked_loop_p && mask_by_cond_expr)
7280 tree mask = vect_get_loop_mask (gsi, masks, vec_num * ncopies,
7281 vectype_in, i);
7282 build_vect_cond_expr (code, vop, mask, gsi);
7285 new_stmt = gimple_build_assign (vec_dest, code,
7286 vop[0], vop[1], vop[2]);
7287 new_temp = make_ssa_name (vec_dest, new_stmt);
7288 gimple_assign_set_lhs (new_stmt, new_temp);
7289 vect_finish_stmt_generation (loop_vinfo, stmt_info, new_stmt, gsi);
7292 if (slp_node)
7293 SLP_TREE_VEC_STMTS (slp_node).quick_push (new_stmt);
7294 else if (single_defuse_cycle
7295 && i < ncopies - 1)
7297 if (reduc_index == 0)
7298 vec_oprnds0.safe_push (gimple_get_lhs (new_stmt));
7299 else if (reduc_index == 1)
7300 vec_oprnds1.safe_push (gimple_get_lhs (new_stmt));
7301 else if (reduc_index == 2)
7302 vec_oprnds2.safe_push (gimple_get_lhs (new_stmt));
7304 else
7305 STMT_VINFO_VEC_STMTS (stmt_info).safe_push (new_stmt);
7308 if (!slp_node)
7309 *vec_stmt = STMT_VINFO_VEC_STMTS (stmt_info)[0];
7311 return true;
7314 /* Transform phase of a cycle PHI. */
7316 bool
7317 vect_transform_cycle_phi (loop_vec_info loop_vinfo,
7318 stmt_vec_info stmt_info, gimple **vec_stmt,
7319 slp_tree slp_node, slp_instance slp_node_instance)
7321 tree vectype_out = STMT_VINFO_VECTYPE (stmt_info);
7322 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
7323 int i;
7324 int ncopies;
7325 int j;
7326 bool nested_cycle = false;
7327 int vec_num;
7329 if (nested_in_vect_loop_p (loop, stmt_info))
7331 loop = loop->inner;
7332 nested_cycle = true;
7335 stmt_vec_info reduc_stmt_info = STMT_VINFO_REDUC_DEF (stmt_info);
7336 reduc_stmt_info = vect_stmt_to_vectorize (reduc_stmt_info);
7337 stmt_vec_info reduc_info = info_for_reduction (loop_vinfo, stmt_info);
7338 gcc_assert (reduc_info->is_reduc_info);
7340 if (STMT_VINFO_REDUC_TYPE (reduc_info) == EXTRACT_LAST_REDUCTION
7341 || STMT_VINFO_REDUC_TYPE (reduc_info) == FOLD_LEFT_REDUCTION)
7342 /* Leave the scalar phi in place. */
7343 return true;
7345 tree vectype_in = STMT_VINFO_REDUC_VECTYPE_IN (reduc_info);
7346 /* For a nested cycle we do not fill the above. */
7347 if (!vectype_in)
7348 vectype_in = STMT_VINFO_VECTYPE (stmt_info);
7349 gcc_assert (vectype_in);
7351 if (slp_node)
7353 /* The size vect_schedule_slp_instance computes is off for us. */
7354 vec_num = vect_get_num_vectors (LOOP_VINFO_VECT_FACTOR (loop_vinfo)
7355 * SLP_TREE_LANES (slp_node), vectype_in);
7356 ncopies = 1;
7358 else
7360 vec_num = 1;
7361 ncopies = vect_get_num_copies (loop_vinfo, vectype_in);
7364 /* Check whether we should use a single PHI node and accumulate
7365 vectors to one before the backedge. */
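/* I.e. in the forced single-cycle case the copies share one
   accumulator, roughly:
     r = phi (init, r)
     r = x0 + r;
     r = x1 + r;
   so only a single reduction PHI is created below. */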
7366 if (STMT_VINFO_FORCE_SINGLE_CYCLE (reduc_info))
7367 ncopies = 1;
7369 /* Create the destination vector */
7370 gphi *phi = as_a <gphi *> (stmt_info->stmt);
7371 tree vec_dest = vect_create_destination_var (gimple_phi_result (phi),
7372 vectype_out);
7374 /* Get the loop-entry arguments. */
7375 tree vec_initial_def;
7376 auto_vec<tree> vec_initial_defs;
7377 if (slp_node)
7379 vec_initial_defs.reserve (vec_num);
7380 if (nested_cycle)
7382 unsigned phi_idx = loop_preheader_edge (loop)->dest_idx;
7383 vect_get_slp_defs (SLP_TREE_CHILDREN (slp_node)[phi_idx],
7384 &vec_initial_defs);
7386 else
7388 gcc_assert (slp_node == slp_node_instance->reduc_phis);
7389 stmt_vec_info first = REDUC_GROUP_FIRST_ELEMENT (reduc_stmt_info);
7390 tree neutral_op
7391 = neutral_op_for_slp_reduction (slp_node, vectype_out,
7392 STMT_VINFO_REDUC_CODE (reduc_info),
7393 first != NULL);
7394 get_initial_defs_for_reduction (loop_vinfo, slp_node_instance->reduc_phis,
7395 &vec_initial_defs, vec_num,
7396 first != NULL, neutral_op);
7399 else
7401 /* Get at the scalar def before the loop that defines the initial
7402 value of the reduction variable. */
7403 tree initial_def = PHI_ARG_DEF_FROM_EDGE (phi,
7404 loop_preheader_edge (loop));
7405 /* Optimize: if initial_def is for REDUC_MAX smaller than the base
7406 and we can't use zero for induc_val, use initial_def. Similarly
7407 for REDUC_MIN and initial_def larger than the base. */
7408 if (STMT_VINFO_REDUC_TYPE (reduc_info) == INTEGER_INDUC_COND_REDUCTION)
7410 tree induc_val = STMT_VINFO_VEC_INDUC_COND_INITIAL_VAL (reduc_info);
7411 if (TREE_CODE (initial_def) == INTEGER_CST
7412 && !integer_zerop (induc_val)
7413 && ((STMT_VINFO_REDUC_CODE (reduc_info) == MAX_EXPR
7414 && tree_int_cst_lt (initial_def, induc_val))
7415 || (STMT_VINFO_REDUC_CODE (reduc_info) == MIN_EXPR
7416 && tree_int_cst_lt (induc_val, initial_def))))
7418 induc_val = initial_def;
7419 /* Communicate we used the initial_def to epilogue
7420 generation. */
7421 STMT_VINFO_VEC_INDUC_COND_INITIAL_VAL (reduc_info) = NULL_TREE;
7423 vec_initial_def = build_vector_from_val (vectype_out, induc_val);
7424 vec_initial_defs.create (ncopies);
7425 for (i = 0; i < ncopies; ++i)
7426 vec_initial_defs.quick_push (vec_initial_def);
7428 else if (nested_cycle)
7430 /* Do not use an adjustment def as that case is not supported
7431 correctly if ncopies is not one. */
7432 vect_get_vec_defs_for_operand (loop_vinfo, reduc_stmt_info,
7433 ncopies, initial_def,
7434 &vec_initial_defs);
7436 else
7438 tree adjustment_def = NULL_TREE;
7439 tree *adjustment_defp = &adjustment_def;
7440 enum tree_code code = STMT_VINFO_REDUC_CODE (reduc_info);
7441 if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_double_reduction_def)
7442 adjustment_defp = NULL;
7443 vec_initial_def
7444 = get_initial_def_for_reduction (loop_vinfo, reduc_stmt_info, code,
7445 initial_def, adjustment_defp);
7446 STMT_VINFO_REDUC_EPILOGUE_ADJUSTMENT (reduc_info) = adjustment_def;
7447 vec_initial_defs.create (ncopies);
7448 for (i = 0; i < ncopies; ++i)
7449 vec_initial_defs.quick_push (vec_initial_def);
7453 /* Generate the reduction PHIs upfront. */
7454 for (i = 0; i < vec_num; i++)
7456 tree vec_init_def = vec_initial_defs[i];
7457 for (j = 0; j < ncopies; j++)
7459 /* Create the reduction-phi that defines the reduction
7460 operand. */
7461 gphi *new_phi = create_phi_node (vec_dest, loop->header);
7463 /* Set the loop-entry arg of the reduction-phi. */
7464 if (j != 0 && nested_cycle)
7465 vec_init_def = vec_initial_defs[j];
7466 add_phi_arg (new_phi, vec_init_def, loop_preheader_edge (loop),
7467 UNKNOWN_LOCATION);
7469 /* The loop-latch arg is set in epilogue processing. */
7471 if (slp_node)
7472 SLP_TREE_VEC_STMTS (slp_node).quick_push (new_phi);
7473 else
7475 if (j == 0)
7476 *vec_stmt = new_phi;
7477 STMT_VINFO_VEC_STMTS (stmt_info).safe_push (new_phi);
7482 return true;
7485 /* Vectorizes LC PHIs. */
7487 bool
7488 vectorizable_lc_phi (loop_vec_info loop_vinfo,
7489 stmt_vec_info stmt_info, gimple **vec_stmt,
7490 slp_tree slp_node)
7492 if (!loop_vinfo
7493 || !is_a <gphi *> (stmt_info->stmt)
7494 || gimple_phi_num_args (stmt_info->stmt) != 1)
7495 return false;
7497 if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_internal_def
7498 && STMT_VINFO_DEF_TYPE (stmt_info) != vect_double_reduction_def)
7499 return false;
7501 if (!vec_stmt) /* transformation not required. */
7503 STMT_VINFO_TYPE (stmt_info) = lc_phi_info_type;
7504 return true;
7507 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
7508 tree scalar_dest = gimple_phi_result (stmt_info->stmt);
7509 basic_block bb = gimple_bb (stmt_info->stmt);
7510 edge e = single_pred_edge (bb);
7511 tree vec_dest = vect_create_destination_var (scalar_dest, vectype);
7512 auto_vec<tree> vec_oprnds;
7513 vect_get_vec_defs (loop_vinfo, stmt_info, slp_node,
7514 !slp_node ? vect_get_num_copies (loop_vinfo, vectype) : 1,
7515 gimple_phi_arg_def (stmt_info->stmt, 0), &vec_oprnds);
7516 for (unsigned i = 0; i < vec_oprnds.length (); i++)
7518 /* Create the vectorized LC PHI node. */
7519 gphi *new_phi = create_phi_node (vec_dest, bb);
7520 add_phi_arg (new_phi, vec_oprnds[i], e, UNKNOWN_LOCATION);
7521 if (slp_node)
7522 SLP_TREE_VEC_STMTS (slp_node).quick_push (new_phi);
7523 else
7524 STMT_VINFO_VEC_STMTS (stmt_info).safe_push (new_phi);
7526 if (!slp_node)
7527 *vec_stmt = STMT_VINFO_VEC_STMTS (stmt_info)[0];
7529 return true;
7532 /* Vectorizes PHIs. */
7534 bool
7535 vectorizable_phi (vec_info *,
7536 stmt_vec_info stmt_info, gimple **vec_stmt,
7537 slp_tree slp_node)
7539 if (!is_a <gphi *> (stmt_info->stmt) || !slp_node)
7540 return false;
7542 if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_internal_def)
7543 return false;
7545 tree vectype = SLP_TREE_VECTYPE (slp_node);
7547 if (!vec_stmt) /* transformation not required. */
7549 slp_tree child;
7550 unsigned i;
7551 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (slp_node), i, child)
7552 if (!child)
7554 if (dump_enabled_p ())
7555 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7556 "PHI node with unvectorized backedge def\n");
7557 return false;
7559 else if (!vect_maybe_update_slp_op_vectype (child, vectype))
7561 if (dump_enabled_p ())
7562 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7563 "incompatible vector types for invariants\n");
7564 return false;
7566 STMT_VINFO_TYPE (stmt_info) = phi_info_type;
7567 return true;
7570 tree scalar_dest = gimple_phi_result (stmt_info->stmt);
7571 basic_block bb = gimple_bb (stmt_info->stmt);
7572 tree vec_dest = vect_create_destination_var (scalar_dest, vectype);
7573 auto_vec<tree> vec_oprnds;
7574 auto_vec<gphi *> new_phis;
7575 for (unsigned i = 0; i < gimple_phi_num_args (stmt_info->stmt); ++i)
7577 slp_tree child = SLP_TREE_CHILDREN (slp_node)[i];
7579 /* Skip not yet vectorized defs. */
7580 if (SLP_TREE_DEF_TYPE (child) == vect_internal_def
7581 && SLP_TREE_VEC_STMTS (child).is_empty ())
7582 continue;
7584 vect_get_slp_defs (SLP_TREE_CHILDREN (slp_node)[i], &vec_oprnds);
7585 if (!new_phis.exists ())
7587 new_phis.create (vec_oprnds.length ());
7588 for (unsigned j = 0; j < vec_oprnds.length (); j++)
7590 /* Create the vectorized LC PHI node. */
7591 new_phis.quick_push (create_phi_node (vec_dest, bb));
7592 SLP_TREE_VEC_STMTS (slp_node).quick_push (new_phis[j]);
7595 edge e = gimple_phi_arg_edge (as_a <gphi *> (stmt_info->stmt), i);
7596 for (unsigned j = 0; j < vec_oprnds.length (); j++)
7597 add_phi_arg (new_phis[j], vec_oprnds[j], e, UNKNOWN_LOCATION);
7599 /* We should have at least one already vectorized child. */
7600 gcc_assert (new_phis.exists ());
7602 return true;
7606 /* Function vect_min_worthwhile_factor.
7608 For a loop where we could vectorize the operation indicated by CODE,
7609 return the minimum vectorization factor that makes it worthwhile
7610 to use generic vectors. */
7611 static unsigned int
7612 vect_min_worthwhile_factor (enum tree_code code)
7614 switch (code)
7616 case PLUS_EXPR:
7617 case MINUS_EXPR:
7618 case NEGATE_EXPR:
7619 return 4;
7621 case BIT_AND_EXPR:
7622 case BIT_IOR_EXPR:
7623 case BIT_XOR_EXPR:
7624 case BIT_NOT_EXPR:
7625 return 2;
7627 default:
7628 return INT_MAX;
7632 /* Return true if VINFO indicates we are doing loop vectorization and if
7633 it is worth decomposing CODE operations into scalar operations for
7634 that loop's vectorization factor. */
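/* For example, with the factors returned by vect_min_worthwhile_factor,
   a PLUS_EXPR is only considered worth decomposing when the constant
   vectorization factor is at least 4, and a BIT_AND_EXPR when it is at
   least 2. */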
7636 bool
7637 vect_worthwhile_without_simd_p (vec_info *vinfo, tree_code code)
7639 loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo);
7640 unsigned HOST_WIDE_INT value;
7641 return (loop_vinfo
7642 && LOOP_VINFO_VECT_FACTOR (loop_vinfo).is_constant (&value)
7643 && value >= vect_min_worthwhile_factor (code));
7646 /* Function vectorizable_induction
7648 Check if STMT_INFO performs an induction computation that can be vectorized.
7649 If VEC_STMT is also passed, vectorize the induction PHI: create a vectorized
7650 phi to replace it, put it in VEC_STMT, and add it to the same basic block.
7651 Return true if STMT_INFO is vectorizable in this way. */
7653 bool
7654 vectorizable_induction (loop_vec_info loop_vinfo,
7655 stmt_vec_info stmt_info,
7656 gimple **vec_stmt, slp_tree slp_node,
7657 stmt_vector_for_cost *cost_vec)
7659 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
7660 unsigned ncopies;
7661 bool nested_in_vect_loop = false;
7662 class loop *iv_loop;
7663 tree vec_def;
7664 edge pe = loop_preheader_edge (loop);
7665 basic_block new_bb;
7666 tree new_vec, vec_init, vec_step, t;
7667 tree new_name;
7668 gimple *new_stmt;
7669 gphi *induction_phi;
7670 tree induc_def, vec_dest;
7671 tree init_expr, step_expr;
7672 poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
7673 unsigned i;
7674 tree expr;
7675 gimple_seq stmts;
7676 gimple_stmt_iterator si;
7678 gphi *phi = dyn_cast <gphi *> (stmt_info->stmt);
7679 if (!phi)
7680 return false;
7682 if (!STMT_VINFO_RELEVANT_P (stmt_info))
7683 return false;
7685 /* Make sure it was recognized as induction computation. */
7686 if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_induction_def)
7687 return false;
7689 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
7690 poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (vectype);
7692 if (slp_node)
7693 ncopies = 1;
7694 else
7695 ncopies = vect_get_num_copies (loop_vinfo, vectype);
7696 gcc_assert (ncopies >= 1);
7698 /* FORNOW. These restrictions should be relaxed. */
7699 if (nested_in_vect_loop_p (loop, stmt_info))
7701 imm_use_iterator imm_iter;
7702 use_operand_p use_p;
7703 gimple *exit_phi;
7704 edge latch_e;
7705 tree loop_arg;
7707 if (ncopies > 1)
7709 if (dump_enabled_p ())
7710 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7711 "multiple types in nested loop.\n");
7712 return false;
7715 /* FORNOW: outer loop induction with SLP not supported. */
7716 if (STMT_SLP_TYPE (stmt_info))
7717 return false;
7719 exit_phi = NULL;
7720 latch_e = loop_latch_edge (loop->inner);
7721 loop_arg = PHI_ARG_DEF_FROM_EDGE (phi, latch_e);
7722 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, loop_arg)
7724 gimple *use_stmt = USE_STMT (use_p);
7725 if (is_gimple_debug (use_stmt))
7726 continue;
7728 if (!flow_bb_inside_loop_p (loop->inner, gimple_bb (use_stmt)))
7730 exit_phi = use_stmt;
7731 break;
7734 if (exit_phi)
7736 stmt_vec_info exit_phi_vinfo = loop_vinfo->lookup_stmt (exit_phi);
7737 if (!(STMT_VINFO_RELEVANT_P (exit_phi_vinfo)
7738 && !STMT_VINFO_LIVE_P (exit_phi_vinfo)))
7740 if (dump_enabled_p ())
7741 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7742 "inner-loop induction only used outside "
7743 "of the outer vectorized loop.\n");
7744 return false;
7748 nested_in_vect_loop = true;
7749 iv_loop = loop->inner;
7751 else
7752 iv_loop = loop;
7753 gcc_assert (iv_loop == (gimple_bb (phi))->loop_father);
7755 if (slp_node && !nunits.is_constant ())
7757 /* The current SLP code creates the initial value element-by-element. */
7758 if (dump_enabled_p ())
7759 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7760 "SLP induction not supported for variable-length"
7761 " vectors.\n");
7762 return false;
7765 if (!vec_stmt) /* transformation not required. */
7767 STMT_VINFO_TYPE (stmt_info) = induc_vec_info_type;
7768 DUMP_VECT_SCOPE ("vectorizable_induction");
7769 vect_model_induction_cost (stmt_info, ncopies, cost_vec);
7770 return true;
7773 /* Transform. */
7775 /* Compute a vector variable, initialized with the first VF values of
7776 the induction variable. E.g., for an iv with IV_PHI='X' and
7777 evolution S, for a vector of 4 units, we want to compute:
7778 [X, X + S, X + 2*S, X + 3*S]. */
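/* For instance (illustrative numbers), with X = 0, S = 3 and VF = 4
   the initial vector is [0, 3, 6, 9] and, in the single-copy non-nested
   case, the step vector added on each iteration is [12, 12, 12, 12],
   i.e. VF * S per lane. */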
7780 if (dump_enabled_p ())
7781 dump_printf_loc (MSG_NOTE, vect_location, "transform induction phi.\n");
7783 step_expr = STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_info);
7784 gcc_assert (step_expr != NULL_TREE);
7785 tree step_vectype = get_same_sized_vectype (TREE_TYPE (step_expr), vectype);
7787 pe = loop_preheader_edge (iv_loop);
7788 init_expr = PHI_ARG_DEF_FROM_EDGE (phi,
7789 loop_preheader_edge (iv_loop));
7791 stmts = NULL;
7792 if (!nested_in_vect_loop)
7794 /* Convert the initial value to the IV update type. */
7795 tree new_type = TREE_TYPE (step_expr);
7796 init_expr = gimple_convert (&stmts, new_type, init_expr);
7798 /* If we are using the loop mask to "peel" for alignment then we need
7799 to adjust the start value here. */
7800 tree skip_niters = LOOP_VINFO_MASK_SKIP_NITERS (loop_vinfo);
7801 if (skip_niters != NULL_TREE)
7803 if (FLOAT_TYPE_P (vectype))
7804 skip_niters = gimple_build (&stmts, FLOAT_EXPR, new_type,
7805 skip_niters);
7806 else
7807 skip_niters = gimple_convert (&stmts, new_type, skip_niters);
7808 tree skip_step = gimple_build (&stmts, MULT_EXPR, new_type,
7809 skip_niters, step_expr);
7810 init_expr = gimple_build (&stmts, MINUS_EXPR, new_type,
7811 init_expr, skip_step);
7815 if (stmts)
7817 new_bb = gsi_insert_seq_on_edge_immediate (pe, stmts);
7818 gcc_assert (!new_bb);
7821 /* Find the first insertion point in the BB. */
7822 basic_block bb = gimple_bb (phi);
7823 si = gsi_after_labels (bb);
7825 /* For SLP induction we have to generate several IVs; for example,
7826 with group size 3 we need [i, i, i, i + S] [i + S, i + S, i + 2*S, i + 2*S]
7827 [i + 2*S, i + 3*S, i + 3*S, i + 3*S]. The step is the same uniform
7828 [VF*S, VF*S, VF*S, VF*S] for all of them. */
7829 if (slp_node)
7831 /* Enforced above. */
7832 unsigned int const_nunits = nunits.to_constant ();
7834 /* Generate [VF*S, VF*S, ... ]. */
7835 if (SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr)))
7837 expr = build_int_cst (integer_type_node, vf);
7838 expr = fold_convert (TREE_TYPE (step_expr), expr);
7840 else
7841 expr = build_int_cst (TREE_TYPE (step_expr), vf);
7842 new_name = fold_build2 (MULT_EXPR, TREE_TYPE (step_expr),
7843 expr, step_expr);
7844 if (! CONSTANT_CLASS_P (new_name))
7845 new_name = vect_init_vector (loop_vinfo, stmt_info, new_name,
7846 TREE_TYPE (step_expr), NULL);
7847 new_vec = build_vector_from_val (step_vectype, new_name);
7848 vec_step = vect_init_vector (loop_vinfo, stmt_info,
7849 new_vec, step_vectype, NULL);
7851 /* Now generate the IVs. */
7852 unsigned group_size = SLP_TREE_LANES (slp_node);
7853 unsigned nvects = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node);
7854 unsigned elts = const_nunits * nvects;
7855 /* Compute the number of distinct IVs we need. First reduce
7856 group_size if it is a multiple of const_nunits so we get
7857 one IV for a group_size of 4 but const_nunits 2. */
7858 unsigned group_sizep = group_size;
7859 if (group_sizep % const_nunits == 0)
7860 group_sizep = group_sizep / const_nunits;
7861 unsigned nivs = least_common_multiple (group_sizep,
7862 const_nunits) / const_nunits;
7863 gcc_assert (elts % group_size == 0);
7864 tree elt = init_expr;
7865 unsigned ivn;
7866 for (ivn = 0; ivn < nivs; ++ivn)
7868 tree_vector_builder elts (step_vectype, const_nunits, 1);
7869 stmts = NULL;
7870 for (unsigned eltn = 0; eltn < const_nunits; ++eltn)
7872 if (ivn*const_nunits + eltn >= group_size
7873 && (ivn * const_nunits + eltn) % group_size == 0)
7874 elt = gimple_build (&stmts, PLUS_EXPR, TREE_TYPE (elt),
7875 elt, step_expr);
7876 elts.quick_push (elt);
7878 vec_init = gimple_build_vector (&stmts, &elts);
7879 vec_init = gimple_convert (&stmts, vectype, vec_init);
7880 if (stmts)
7882 new_bb = gsi_insert_seq_on_edge_immediate (pe, stmts);
7883 gcc_assert (!new_bb);
7886 /* Create the induction-phi that defines the induction-operand. */
7887 vec_dest = vect_get_new_vect_var (vectype, vect_simple_var, "vec_iv_");
7888 induction_phi = create_phi_node (vec_dest, iv_loop->header);
7889 induc_def = PHI_RESULT (induction_phi);
7891 /* Create the iv update inside the loop */
7892 gimple_seq stmts = NULL;
7893 vec_def = gimple_convert (&stmts, step_vectype, induc_def);
7894 vec_def = gimple_build (&stmts,
7895 PLUS_EXPR, step_vectype, vec_def, vec_step);
7896 vec_def = gimple_convert (&stmts, vectype, vec_def);
7897 gsi_insert_seq_before (&si, stmts, GSI_SAME_STMT);
7899 /* Set the arguments of the phi node: */
7900 add_phi_arg (induction_phi, vec_init, pe, UNKNOWN_LOCATION);
7901 add_phi_arg (induction_phi, vec_def, loop_latch_edge (iv_loop),
7902 UNKNOWN_LOCATION);
7904 SLP_TREE_VEC_STMTS (slp_node).quick_push (induction_phi);
7906 /* Fill up to the number of vectors we need for the whole group. */
7907 nivs = least_common_multiple (group_size,
7908 const_nunits) / const_nunits;
7909 for (; ivn < nivs; ++ivn)
7910 SLP_TREE_VEC_STMTS (slp_node)
7911 .quick_push (SLP_TREE_VEC_STMTS (slp_node)[0]);
7913 /* Re-use IVs when we can. */
7914 if (ivn < nvects)
7916 unsigned vfp
7917 = least_common_multiple (group_size, const_nunits) / group_size;
7918 /* Generate [VF'*S, VF'*S, ... ]. */
7919 if (SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr)))
7921 expr = build_int_cst (integer_type_node, vfp);
7922 expr = fold_convert (TREE_TYPE (step_expr), expr);
7924 else
7925 expr = build_int_cst (TREE_TYPE (step_expr), vfp);
7926 new_name = fold_build2 (MULT_EXPR, TREE_TYPE (step_expr),
7927 expr, step_expr);
7928 if (! CONSTANT_CLASS_P (new_name))
7929 new_name = vect_init_vector (loop_vinfo, stmt_info, new_name,
7930 TREE_TYPE (step_expr), NULL);
7931 new_vec = build_vector_from_val (step_vectype, new_name);
7932 vec_step = vect_init_vector (loop_vinfo, stmt_info, new_vec,
7933 step_vectype, NULL);
7934 for (; ivn < nvects; ++ivn)
7936 gimple *iv = SLP_TREE_VEC_STMTS (slp_node)[ivn - nivs];
7937 tree def;
7938 if (gimple_code (iv) == GIMPLE_PHI)
7939 def = gimple_phi_result (iv);
7940 else
7941 def = gimple_assign_lhs (iv);
7942 gimple_seq stmts = NULL;
7943 def = gimple_convert (&stmts, step_vectype, def);
7944 def = gimple_build (&stmts,
7945 PLUS_EXPR, step_vectype, def, vec_step);
7946 def = gimple_convert (&stmts, vectype, def);
7947 if (gimple_code (iv) == GIMPLE_PHI)
7948 gsi_insert_seq_before (&si, stmts, GSI_SAME_STMT);
7949 else
7951 gimple_stmt_iterator tgsi = gsi_for_stmt (iv);
7952 gsi_insert_seq_after (&tgsi, stmts, GSI_CONTINUE_LINKING);
7954 SLP_TREE_VEC_STMTS (slp_node)
7955 .quick_push (SSA_NAME_DEF_STMT (def));
7959 return true;
7962 /* Create the vector that holds the initial_value of the induction. */
7963 if (nested_in_vect_loop)
7965 /* iv_loop is nested in the loop to be vectorized. init_expr had already
7966 been created during vectorization of previous stmts. We obtain it
7967 from the STMT_VINFO_VEC_STMT of the defining stmt. */
7968 auto_vec<tree> vec_inits;
7969 vect_get_vec_defs_for_operand (loop_vinfo, stmt_info, 1,
7970 init_expr, &vec_inits);
7971 vec_init = vec_inits[0];
7972 /* If the initial value is not of proper type, convert it. */
7973 if (!useless_type_conversion_p (vectype, TREE_TYPE (vec_init)))
7975 new_stmt
7976 = gimple_build_assign (vect_get_new_ssa_name (vectype,
7977 vect_simple_var,
7978 "vec_iv_"),
7979 VIEW_CONVERT_EXPR,
7980 build1 (VIEW_CONVERT_EXPR, vectype,
7981 vec_init));
7982 vec_init = gimple_assign_lhs (new_stmt);
7983 new_bb = gsi_insert_on_edge_immediate (loop_preheader_edge (iv_loop),
7984 new_stmt);
7985 gcc_assert (!new_bb);
7988 else
7990 /* iv_loop is the loop to be vectorized. Create:
7991 vec_init = [X, X+S, X+2*S, X+3*S] (S = step_expr, X = init_expr) */
7992 stmts = NULL;
7993 new_name = gimple_convert (&stmts, TREE_TYPE (step_expr), init_expr);
7995 unsigned HOST_WIDE_INT const_nunits;
7996 if (nunits.is_constant (&const_nunits))
7998 tree_vector_builder elts (step_vectype, const_nunits, 1);
7999 elts.quick_push (new_name);
8000 for (i = 1; i < const_nunits; i++)
8002 /* Create: new_name_i = new_name + step_expr */
8003 new_name = gimple_build (&stmts, PLUS_EXPR, TREE_TYPE (new_name),
8004 new_name, step_expr);
8005 elts.quick_push (new_name);
8007 /* Create a vector from [new_name_0, new_name_1, ...,
8008 new_name_nunits-1] */
8009 vec_init = gimple_build_vector (&stmts, &elts);
8011 else if (INTEGRAL_TYPE_P (TREE_TYPE (step_expr)))
8012 /* Build the initial value directly from a VEC_SERIES_EXPR. */
8013 vec_init = gimple_build (&stmts, VEC_SERIES_EXPR, step_vectype,
8014 new_name, step_expr);
8015 else
8017 /* Build:
8018 [base, base, base, ...]
8019 + (vectype) [0, 1, 2, ...] * [step, step, step, ...]. */
8020 gcc_assert (SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr)));
8021 gcc_assert (flag_associative_math);
8022 tree index = build_index_vector (step_vectype, 0, 1);
8023 tree base_vec = gimple_build_vector_from_val (&stmts, step_vectype,
8024 new_name);
8025 tree step_vec = gimple_build_vector_from_val (&stmts, step_vectype,
8026 step_expr);
8027 vec_init = gimple_build (&stmts, FLOAT_EXPR, step_vectype, index);
8028 vec_init = gimple_build (&stmts, MULT_EXPR, step_vectype,
8029 vec_init, step_vec);
8030 vec_init = gimple_build (&stmts, PLUS_EXPR, step_vectype,
8031 vec_init, base_vec);
8033 vec_init = gimple_convert (&stmts, vectype, vec_init);
8035 if (stmts)
8037 new_bb = gsi_insert_seq_on_edge_immediate (pe, stmts);
8038 gcc_assert (!new_bb);
8043 /* Create the vector that holds the step of the induction. */
8044 if (nested_in_vect_loop)
8045 /* iv_loop is nested in the loop to be vectorized. Generate:
8046 vec_step = [S, S, S, S] */
8047 new_name = step_expr;
8048 else
8050 /* iv_loop is the loop to be vectorized. Generate:
8051 vec_step = [VF*S, VF*S, VF*S, VF*S] */
8052 gimple_seq seq = NULL;
8053 if (SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr)))
8055 expr = build_int_cst (integer_type_node, vf);
8056 expr = gimple_build (&seq, FLOAT_EXPR, TREE_TYPE (step_expr), expr);
8058 else
8059 expr = build_int_cst (TREE_TYPE (step_expr), vf);
8060 new_name = gimple_build (&seq, MULT_EXPR, TREE_TYPE (step_expr),
8061 expr, step_expr);
8062 if (seq)
8064 new_bb = gsi_insert_seq_on_edge_immediate (pe, seq);
8065 gcc_assert (!new_bb);
8069 t = unshare_expr (new_name);
8070 gcc_assert (CONSTANT_CLASS_P (new_name)
8071 || TREE_CODE (new_name) == SSA_NAME);
8072 new_vec = build_vector_from_val (step_vectype, t);
8073 vec_step = vect_init_vector (loop_vinfo, stmt_info,
8074 new_vec, step_vectype, NULL);
8077 /* Create the following def-use cycle:
8078 loop prolog:
8079 vec_init = ...
8080 vec_step = ...
8081 loop:
8082 vec_iv = PHI <vec_init, vec_loop>
8084 STMT
8086 vec_loop = vec_iv + vec_step; */
8088 /* Create the induction-phi that defines the induction-operand. */
8089 vec_dest = vect_get_new_vect_var (vectype, vect_simple_var, "vec_iv_");
8090 induction_phi = create_phi_node (vec_dest, iv_loop->header);
8091 induc_def = PHI_RESULT (induction_phi);
8093 /* Create the iv update inside the loop */
8094 stmts = NULL;
8095 vec_def = gimple_convert (&stmts, step_vectype, induc_def);
8096 vec_def = gimple_build (&stmts, PLUS_EXPR, step_vectype, vec_def, vec_step);
8097 vec_def = gimple_convert (&stmts, vectype, vec_def);
8098 gsi_insert_seq_before (&si, stmts, GSI_SAME_STMT);
8099 new_stmt = SSA_NAME_DEF_STMT (vec_def);
8101 /* Set the arguments of the phi node: */
8102 add_phi_arg (induction_phi, vec_init, pe, UNKNOWN_LOCATION);
8103 add_phi_arg (induction_phi, vec_def, loop_latch_edge (iv_loop),
8104 UNKNOWN_LOCATION);
8106 STMT_VINFO_VEC_STMTS (stmt_info).safe_push (induction_phi);
8107 *vec_stmt = induction_phi;
8109 /* In case the vectorization factor (VF) is bigger than the number
8110 of elements that we can fit in a vectype (nunits), we have to generate
8111 more than one vector stmt - i.e - we need to "unroll" the
8112 vector stmt by a factor VF/nunits. For more details see documentation
8113 in vectorizable_operation. */
8115 if (ncopies > 1)
8117 gimple_seq seq = NULL;
8118 /* FORNOW. This restriction should be relaxed. */
8119 gcc_assert (!nested_in_vect_loop);
8121 /* Create the vector that holds the step of the induction. */
8122 if (SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr)))
8124 expr = build_int_cst (integer_type_node, nunits);
8125 expr = gimple_build (&seq, FLOAT_EXPR, TREE_TYPE (step_expr), expr);
8127 else
8128 expr = build_int_cst (TREE_TYPE (step_expr), nunits);
8129 new_name = gimple_build (&seq, MULT_EXPR, TREE_TYPE (step_expr),
8130 expr, step_expr);
8131 if (seq)
8133 new_bb = gsi_insert_seq_on_edge_immediate (pe, seq);
8134 gcc_assert (!new_bb);
8137 t = unshare_expr (new_name);
8138 gcc_assert (CONSTANT_CLASS_P (new_name)
8139 || TREE_CODE (new_name) == SSA_NAME);
8140 new_vec = build_vector_from_val (step_vectype, t);
8141 vec_step = vect_init_vector (loop_vinfo, stmt_info,
8142 new_vec, step_vectype, NULL);
8144 vec_def = induc_def;
8145 for (i = 1; i < ncopies; i++)
8147 /* vec_i = vec_prev + vec_step */
8148 gimple_seq stmts = NULL;
8149 vec_def = gimple_convert (&stmts, step_vectype, vec_def);
8150 vec_def = gimple_build (&stmts,
8151 PLUS_EXPR, step_vectype, vec_def, vec_step);
8152 vec_def = gimple_convert (&stmts, vectype, vec_def);
8154 gsi_insert_seq_before (&si, stmts, GSI_SAME_STMT);
8155 new_stmt = SSA_NAME_DEF_STMT (vec_def);
8156 STMT_VINFO_VEC_STMTS (stmt_info).safe_push (new_stmt);
8160 if (dump_enabled_p ())
8161 dump_printf_loc (MSG_NOTE, vect_location,
8162 "transform induction: created def-use cycle: %G%G",
8163 induction_phi, SSA_NAME_DEF_STMT (vec_def));
8165 return true;
8168 /* Function vectorizable_live_operation.
8170 STMT_INFO computes a value that is used outside the loop. Check if
8171 it can be supported. */
8173 bool
8174 vectorizable_live_operation (vec_info *vinfo,
8175 stmt_vec_info stmt_info,
8176 gimple_stmt_iterator *gsi,
8177 slp_tree slp_node, slp_instance slp_node_instance,
8178 int slp_index, bool vec_stmt_p,
8179 stmt_vector_for_cost *cost_vec)
8181 loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo);
8182 imm_use_iterator imm_iter;
8183 tree lhs, lhs_type, bitsize, vec_bitsize;
8184 tree vectype = (slp_node
8185 ? SLP_TREE_VECTYPE (slp_node)
8186 : STMT_VINFO_VECTYPE (stmt_info));
8187 poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (vectype);
8188 int ncopies;
8189 gimple *use_stmt;
8190 auto_vec<tree> vec_oprnds;
8191 int vec_entry = 0;
8192 poly_uint64 vec_index = 0;
8194 gcc_assert (STMT_VINFO_LIVE_P (stmt_info));
8196 /* If a stmt of a reduction is live, vectorize it via
8197 vect_create_epilog_for_reduction. vectorizable_reduction assessed
8198 validity so just trigger the transform here. */
8199 if (STMT_VINFO_REDUC_DEF (vect_orig_stmt (stmt_info)))
8201 if (!vec_stmt_p)
8202 return true;
8203 if (slp_node)
8205 /* For reduction chains the meta-info is attached to
8206 the group leader. */
8207 if (REDUC_GROUP_FIRST_ELEMENT (stmt_info))
8208 stmt_info = REDUC_GROUP_FIRST_ELEMENT (stmt_info);
8209 /* For SLP reductions we vectorize the epilogue for
8210 all involved stmts together. */
8211 else if (slp_index != 0)
8212 return true;
8213 else
8214 /* For SLP reductions the meta-info is attached to
8215 the representative. */
8216 stmt_info = SLP_TREE_REPRESENTATIVE (slp_node);
8218 stmt_vec_info reduc_info = info_for_reduction (loop_vinfo, stmt_info);
8219 gcc_assert (reduc_info->is_reduc_info);
8220 if (STMT_VINFO_REDUC_TYPE (reduc_info) == FOLD_LEFT_REDUCTION
8221 || STMT_VINFO_REDUC_TYPE (reduc_info) == EXTRACT_LAST_REDUCTION)
8222 return true;
8223 vect_create_epilog_for_reduction (loop_vinfo, stmt_info, slp_node,
8224 slp_node_instance);
8225 return true;
8228 /* If STMT is not relevant and it is a simple assignment and its inputs are
8229 invariant then it can remain in place, unvectorized. The original last
8230 scalar value that it computes will be used. */
8231 if (!STMT_VINFO_RELEVANT_P (stmt_info))
8233 gcc_assert (is_simple_and_all_uses_invariant (stmt_info, loop_vinfo));
8234 if (dump_enabled_p ())
8235 dump_printf_loc (MSG_NOTE, vect_location,
8236 "statement is simple and uses invariant. Leaving in "
8237 "place.\n");
8238 return true;
8241 if (slp_node)
8242 ncopies = 1;
8243 else
8244 ncopies = vect_get_num_copies (loop_vinfo, vectype);
8246 if (slp_node)
8248 gcc_assert (slp_index >= 0);
8250 /* Get the last occurrence of the scalar index from the concatenation of
8251 all the slp vectors. Calculate which slp vector it is and the index
8252 within. */
8253 int num_scalar = SLP_TREE_LANES (slp_node);
8254 int num_vec = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node);
8255 poly_uint64 pos = (num_vec * nunits) - num_scalar + slp_index;
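/* E.g. (illustrative numbers) with 3 scalar lanes, 2 vector stmts of
   4 elements each and slp_index 0, pos is 2*4 - 3 + 0 = 5, i.e. the
   live value is lane 1 of vector stmt 1. */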
8257 /* Calculate which vector contains the result, and which lane of
8258 that vector we need. */
8259 if (!can_div_trunc_p (pos, nunits, &vec_entry, &vec_index))
8261 if (dump_enabled_p ())
8262 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
8263 "Cannot determine which vector holds the"
8264 " final result.\n");
8265 return false;
8269 if (!vec_stmt_p)
8271 /* No transformation required. */
8272 if (loop_vinfo && LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo))
8274 if (!direct_internal_fn_supported_p (IFN_EXTRACT_LAST, vectype,
8275 OPTIMIZE_FOR_SPEED))
8277 if (dump_enabled_p ())
8278 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
8279 "can't operate on partial vectors "
8280 "because the target doesn't support extract "
8281 "last reduction.\n");
8282 LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo) = false;
8284 else if (slp_node)
8286 if (dump_enabled_p ())
8287 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
8288 "can't operate on partial vectors "
8289 "because an SLP statement is live after "
8290 "the loop.\n");
8291 LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo) = false;
8293 else if (ncopies > 1)
8295 if (dump_enabled_p ())
8296 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
8297 "can't operate on partial vectors "
8298 "because ncopies is greater than 1.\n");
8299 LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo) = false;
8301 else
8303 gcc_assert (ncopies == 1 && !slp_node);
8304 vect_record_loop_mask (loop_vinfo,
8305 &LOOP_VINFO_MASKS (loop_vinfo),
8306 1, vectype, NULL);
8309 /* ??? Enable for loop costing as well. */
8310 if (!loop_vinfo)
8311 record_stmt_cost (cost_vec, 1, vec_to_scalar, stmt_info, NULL_TREE,
8312 0, vect_epilogue);
8313 return true;
8316 /* Use the lhs of the original scalar statement. */
8317 gimple *stmt = vect_orig_stmt (stmt_info)->stmt;
8318 if (dump_enabled_p ())
8319 dump_printf_loc (MSG_NOTE, vect_location, "extracting lane for live "
8320 "stmt %G", stmt);
8322 lhs = gimple_get_lhs (stmt);
8323 lhs_type = TREE_TYPE (lhs);
8325 bitsize = vector_element_bits_tree (vectype);
8326 vec_bitsize = TYPE_SIZE (vectype);
8328 /* Get the vectorized lhs of STMT and the lane to use (counted in bits). */
8329 tree vec_lhs, bitstart;
8330 gimple *vec_stmt;
8331 if (slp_node)
8333 gcc_assert (!loop_vinfo || !LOOP_VINFO_FULLY_MASKED_P (loop_vinfo));
8335 /* Get the correct slp vectorized stmt. */
8336 vec_stmt = SLP_TREE_VEC_STMTS (slp_node)[vec_entry];
8337 vec_lhs = gimple_get_lhs (vec_stmt);
8339 /* Get entry to use. */
8340 bitstart = bitsize_int (vec_index);
8341 bitstart = int_const_binop (MULT_EXPR, bitsize, bitstart);
8343 else
8345 /* For multiple copies, get the last copy. */
8346 vec_stmt = STMT_VINFO_VEC_STMTS (stmt_info).last ();
8347 vec_lhs = gimple_get_lhs (vec_stmt);
8349 /* Get the last lane in the vector. */
8350 bitstart = int_const_binop (MINUS_EXPR, vec_bitsize, bitsize);
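/* E.g. for a 4 x 32-bit vector this selects bits [96, 128), i.e. the
   last lane of the final copy (numbers illustrative). */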
8353 if (loop_vinfo)
8355 /* To ensure the VEC_LHS of lane-extraction stmts satisfies the loop-closed
8356 PHI requirement, insert one phi node for it. It looks like:
8357 loop;
8359 # lhs' = PHI <lhs>
8361 loop;
8363 # vec_lhs' = PHI <vec_lhs>
8364 new_tree = lane_extract <vec_lhs', ...>;
8365 lhs' = new_tree; */
8367 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
8368 basic_block exit_bb = single_exit (loop)->dest;
8369 gcc_assert (single_pred_p (exit_bb));
8371 tree vec_lhs_phi = copy_ssa_name (vec_lhs);
8372 gimple *phi = create_phi_node (vec_lhs_phi, exit_bb);
8373 SET_PHI_ARG_DEF (phi, single_exit (loop)->dest_idx, vec_lhs);
8375 gimple_seq stmts = NULL;
8376 tree new_tree;
8377 if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
8379 /* Emit:
8381 SCALAR_RES = EXTRACT_LAST <VEC_LHS, MASK>
8383 where VEC_LHS is the vectorized live-out result and MASK is
8384 the loop mask for the final iteration. */
8385 gcc_assert (ncopies == 1 && !slp_node);
8386 tree scalar_type = TREE_TYPE (STMT_VINFO_VECTYPE (stmt_info));
8387 tree mask = vect_get_loop_mask (gsi, &LOOP_VINFO_MASKS (loop_vinfo),
8388 1, vectype, 0);
8389 tree scalar_res = gimple_build (&stmts, CFN_EXTRACT_LAST, scalar_type,
8390 mask, vec_lhs_phi);
8392 /* Convert the extracted vector element to the scalar type. */
8393 new_tree = gimple_convert (&stmts, lhs_type, scalar_res);
8395 else
8397 tree bftype = TREE_TYPE (vectype);
8398 if (VECTOR_BOOLEAN_TYPE_P (vectype))
8399 bftype = build_nonstandard_integer_type (tree_to_uhwi (bitsize), 1);
8400 new_tree = build3 (BIT_FIELD_REF, bftype,
8401 vec_lhs_phi, bitsize, bitstart);
8402 new_tree = force_gimple_operand (fold_convert (lhs_type, new_tree),
8403 &stmts, true, NULL_TREE);
8406 if (stmts)
8408 gimple_stmt_iterator exit_gsi = gsi_after_labels (exit_bb);
8409 gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
8411 /* Remove the existing phi node for lhs and replace it with an assignment of new_tree to the phi result. */
8412 tree lhs_phi = NULL_TREE;
8413 gimple_stmt_iterator gsi;
8414 for (gsi = gsi_start_phis (exit_bb);
8415 !gsi_end_p (gsi); gsi_next (&gsi))
8417 gimple *phi = gsi_stmt (gsi);
8418 if ((gimple_phi_arg_def (phi, 0) == lhs))
8420 remove_phi_node (&gsi, false);
8421 lhs_phi = gimple_phi_result (phi);
8422 gimple *copy = gimple_build_assign (lhs_phi, new_tree);
8423 gsi_insert_before (&exit_gsi, copy, GSI_SAME_STMT);
8424 break;
8429 /* Replace uses of lhs with the newly computed result. If the use stmt is a
8430 single-argument PHI, just replace all uses of the PHI result. This is
8431 necessary because the lcssa PHI defining lhs may appear before the newly inserted stmt. */
8432 use_operand_p use_p;
8433 FOR_EACH_IMM_USE_STMT (use_stmt, imm_iter, lhs)
8434 if (!flow_bb_inside_loop_p (loop, gimple_bb (use_stmt))
8435 && !is_gimple_debug (use_stmt))
8437 if (gimple_code (use_stmt) == GIMPLE_PHI
8438 && gimple_phi_num_args (use_stmt) == 1)
8440 replace_uses_by (gimple_phi_result (use_stmt), new_tree);
8442 else
8444 FOR_EACH_IMM_USE_ON_STMT (use_p, imm_iter)
8445 SET_USE (use_p, new_tree);
8447 update_stmt (use_stmt);
8450 else
8452 /* For basic-block vectorization simply insert the lane-extraction. */
8453 tree bftype = TREE_TYPE (vectype);
8454 if (VECTOR_BOOLEAN_TYPE_P (vectype))
8455 bftype = build_nonstandard_integer_type (tree_to_uhwi (bitsize), 1);
8456 tree new_tree = build3 (BIT_FIELD_REF, bftype,
8457 vec_lhs, bitsize, bitstart);
8458 gimple_seq stmts = NULL;
8459 new_tree = force_gimple_operand (fold_convert (lhs_type, new_tree),
8460 &stmts, true, NULL_TREE);
8461 if (is_a <gphi *> (vec_stmt))
8463 gimple_stmt_iterator si = gsi_after_labels (gimple_bb (vec_stmt));
8464 gsi_insert_seq_before (&si, stmts, GSI_SAME_STMT);
8466 else
8468 gimple_stmt_iterator si = gsi_for_stmt (vec_stmt);
8469 gsi_insert_seq_after (&si, stmts, GSI_SAME_STMT);
8472 /* Replace uses of lhs with the newly computed result. If the use stmt is a
8473 single-argument PHI, just replace all uses of the PHI result. This is
8474 necessary because the lcssa PHI defining lhs may appear before the newly inserted stmt. */
8475 use_operand_p use_p;
8476 stmt_vec_info use_stmt_info;
8477 FOR_EACH_IMM_USE_STMT (use_stmt, imm_iter, lhs)
8478 if (!is_gimple_debug (use_stmt)
8479 && (!(use_stmt_info = vinfo->lookup_stmt (use_stmt))
8480 || !PURE_SLP_STMT (vect_stmt_to_vectorize (use_stmt_info))))
8482 /* ??? This can happen when the live lane ends up being
8483 used in a vector construction code-generated by an
8484 external SLP node (and code-generation for that already
8485 happened). See gcc.dg/vect/bb-slp-47.c.
8486 Doing this is what would happen if that vector CTOR
8487 were not code-generated yet so it is not too bad.
8488 ??? In fact we'd likely want to avoid this situation
8489 in the first place. */
8490 if (TREE_CODE (new_tree) == SSA_NAME
8491 && !SSA_NAME_IS_DEFAULT_DEF (new_tree)
8492 && gimple_code (use_stmt) != GIMPLE_PHI
8493 && !vect_stmt_dominates_stmt_p (SSA_NAME_DEF_STMT (new_tree),
8494 use_stmt))
8496 enum tree_code code = gimple_assign_rhs_code (use_stmt);
8497 gcc_assert (code == CONSTRUCTOR
8498 || code == VIEW_CONVERT_EXPR
8499 || CONVERT_EXPR_CODE_P (code));
8500 if (dump_enabled_p ())
8501 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
8502 "Using original scalar computation for "
8503 "live lane because use preceeds vector "
8504 "def\n");
8505 continue;
8507 FOR_EACH_IMM_USE_ON_STMT (use_p, imm_iter)
8508 SET_USE (use_p, new_tree);
8509 update_stmt (use_stmt);
8513 return true;
8516 /* Kill any debug uses outside LOOP of SSA names defined in STMT_INFO. */
8518 static void
8519 vect_loop_kill_debug_uses (class loop *loop, stmt_vec_info stmt_info)
8521 ssa_op_iter op_iter;
8522 imm_use_iterator imm_iter;
8523 def_operand_p def_p;
8524 gimple *ustmt;
8526 FOR_EACH_PHI_OR_STMT_DEF (def_p, stmt_info->stmt, op_iter, SSA_OP_DEF)
8528 FOR_EACH_IMM_USE_STMT (ustmt, imm_iter, DEF_FROM_PTR (def_p))
8530 basic_block bb;
8532 if (!is_gimple_debug (ustmt))
8533 continue;
8535 bb = gimple_bb (ustmt);
8537 if (!flow_bb_inside_loop_p (loop, bb))
8539 if (gimple_debug_bind_p (ustmt))
8541 if (dump_enabled_p ())
8542 dump_printf_loc (MSG_NOTE, vect_location,
8543 "killing debug use\n");
8545 gimple_debug_bind_reset_value (ustmt);
8546 update_stmt (ustmt);
8548 else
8549 gcc_unreachable ();
8555 /* Given loop represented by LOOP_VINFO, return true if computation of
8556 LOOP_VINFO_NITERS (= LOOP_VINFO_NITERSM1 + 1) doesn't overflow, false
8557 otherwise. */
8559 static bool
8560 loop_niters_no_overflow (loop_vec_info loop_vinfo)
8562 /* Constant case. */
8563 if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
8565 tree cst_niters = LOOP_VINFO_NITERS (loop_vinfo);
8566 tree cst_nitersm1 = LOOP_VINFO_NITERSM1 (loop_vinfo);
8568 gcc_assert (TREE_CODE (cst_niters) == INTEGER_CST);
8569 gcc_assert (TREE_CODE (cst_nitersm1) == INTEGER_CST);
8570 if (wi::to_widest (cst_nitersm1) < wi::to_widest (cst_niters))
8571 return true;
8574 widest_int max;
8575 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
8576 /* Check the upper bound of loop niters. */
8577 if (get_max_loop_iterations (loop, &max))
8579 tree type = TREE_TYPE (LOOP_VINFO_NITERS (loop_vinfo));
8580 signop sgn = TYPE_SIGN (type);
8581 widest_int type_max = widest_int::from (wi::max_value (type), sgn);
8582 if (max < type_max)
8583 return true;
8585 return false;
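/* A minimal sketch of the constant case above, assuming a 32-bit unsigned
   IV type: if the latch count is NITERSM1 == 0xffffffff, then

     niters = nitersm1 + 1;   which wraps around to 0

   so the check NITERSM1 < NITERS fails and we return false.  In the
   symbolic case, a maximum latch count strictly below the type's maximum
   value guarantees that the "+ 1" cannot wrap.  */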
8588 /* Return a mask type with half the number of elements as OLD_TYPE,
8589 given that it should have mode NEW_MODE. */
8591 tree
8592 vect_halve_mask_nunits (tree old_type, machine_mode new_mode)
8594 poly_uint64 nunits = exact_div (TYPE_VECTOR_SUBPARTS (old_type), 2);
8595 return build_truth_vector_type_for_mode (nunits, new_mode);
8598 /* Return a mask type with twice as many elements as OLD_TYPE,
8599 given that it should have mode NEW_MODE. */
8601 tree
8602 vect_double_mask_nunits (tree old_type, machine_mode new_mode)
8604 poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (old_type) * 2;
8605 return build_truth_vector_type_for_mode (nunits, new_mode);
8608 /* Record that a fully-masked version of LOOP_VINFO would need MASKS to
8609 contain a sequence of NVECTORS masks that each control a vector of type
8610 VECTYPE. If SCALAR_MASK is nonnull, the fully-masked loop would AND
8611 these vector masks with the vector version of SCALAR_MASK. */
8613 void
8614 vect_record_loop_mask (loop_vec_info loop_vinfo, vec_loop_masks *masks,
8615 unsigned int nvectors, tree vectype, tree scalar_mask)
8617 gcc_assert (nvectors != 0);
8618 if (masks->length () < nvectors)
8619 masks->safe_grow_cleared (nvectors, true);
8620 rgroup_controls *rgm = &(*masks)[nvectors - 1];
8621 /* The number of scalars per iteration and the number of vectors are
8622 both compile-time constants. */
8623 unsigned int nscalars_per_iter
8624 = exact_div (nvectors * TYPE_VECTOR_SUBPARTS (vectype),
8625 LOOP_VINFO_VECT_FACTOR (loop_vinfo)).to_constant ();
8627 if (scalar_mask)
8629 scalar_cond_masked_key cond (scalar_mask, nvectors);
8630 loop_vinfo->scalar_cond_masked_set.add (cond);
8633 if (rgm->max_nscalars_per_iter < nscalars_per_iter)
8635 rgm->max_nscalars_per_iter = nscalars_per_iter;
8636 rgm->type = truth_type_for (vectype);
8637 rgm->factor = 1;
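/* A worked example, assuming a loop with VF == 16 that accesses both
   chars and shorts: the char accesses use one V16QI vector per iteration
   (nvectors == 1, nscalars_per_iter == 1 * 16 / 16 == 1), while the
   short accesses use two V8HI vectors per iteration (nvectors == 2,
   nscalars_per_iter == 2 * 8 / 16 == 1).  Each combination is recorded
   in the rgroup at index nvectors - 1, so the two cases end up in
   separate rgroups even though they cover the same scalars.  */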
8641 /* Given a complete set of masks MASKS, extract mask number INDEX
8642 for an rgroup that operates on NVECTORS vectors of type VECTYPE,
8643 where 0 <= INDEX < NVECTORS. Insert any set-up statements before GSI.
8645 See the comment above vec_loop_masks for more details about the mask
8646 arrangement. */
8648 tree
8649 vect_get_loop_mask (gimple_stmt_iterator *gsi, vec_loop_masks *masks,
8650 unsigned int nvectors, tree vectype, unsigned int index)
8652 rgroup_controls *rgm = &(*masks)[nvectors - 1];
8653 tree mask_type = rgm->type;
8655 /* Populate the rgroup's mask array, if this is the first time we've
8656 used it. */
8657 if (rgm->controls.is_empty ())
8659 rgm->controls.safe_grow_cleared (nvectors, true);
8660 for (unsigned int i = 0; i < nvectors; ++i)
8662 tree mask = make_temp_ssa_name (mask_type, NULL, "loop_mask");
8663 /* Provide a dummy definition until the real one is available. */
8664 SSA_NAME_DEF_STMT (mask) = gimple_build_nop ();
8665 rgm->controls[i] = mask;
8669 tree mask = rgm->controls[index];
8670 if (maybe_ne (TYPE_VECTOR_SUBPARTS (mask_type),
8671 TYPE_VECTOR_SUBPARTS (vectype)))
8673 /* A loop mask for data type X can be reused for data type Y
8674 if X has N times more elements than Y and if Y's elements
8675 are N times bigger than X's. In this case each sequence
8676 of N elements in the loop mask will be all-zero or all-one.
8677 We can then view-convert the mask so that each sequence of
8678 N elements is replaced by a single element. */
8679 gcc_assert (multiple_p (TYPE_VECTOR_SUBPARTS (mask_type),
8680 TYPE_VECTOR_SUBPARTS (vectype)));
8681 gimple_seq seq = NULL;
8682 mask_type = truth_type_for (vectype);
8683 mask = gimple_build (&seq, VIEW_CONVERT_EXPR, mask_type, mask);
8684 if (seq)
8685 gsi_insert_seq_before (gsi, seq, GSI_SAME_STMT);
8687 return mask;
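/* An illustrative sketch of the reuse case above, assuming the recorded
   rgroup mask controls 16 chars per vector (16 mask lanes) while the
   current access uses V8HI (8 lanes, each element twice as big): lanes
   2*i and 2*i + 1 of the 16-lane mask are always equal, so

     mask_8 = VIEW_CONVERT_EXPR <8-lane mask type> (mask_16);

   yields a valid 8-lane mask without recomputing anything.  */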
8690 /* Record that LOOP_VINFO would need LENS to contain a sequence of NVECTORS
8691 lengths for controlling an operation on VECTYPE. The operation splits
8692 each element of VECTYPE into FACTOR separate subelements, measuring the
8693 length as a number of these subelements. */
8695 void
8696 vect_record_loop_len (loop_vec_info loop_vinfo, vec_loop_lens *lens,
8697 unsigned int nvectors, tree vectype, unsigned int factor)
8699 gcc_assert (nvectors != 0);
8700 if (lens->length () < nvectors)
8701 lens->safe_grow_cleared (nvectors, true);
8702 rgroup_controls *rgl = &(*lens)[nvectors - 1];
8704 /* The number of scalars per iteration, the bytes occupied by a scalar and
8705 the number of vectors are all compile-time constants. */
8706 unsigned int nscalars_per_iter
8707 = exact_div (nvectors * TYPE_VECTOR_SUBPARTS (vectype),
8708 LOOP_VINFO_VECT_FACTOR (loop_vinfo)).to_constant ();
8710 if (rgl->max_nscalars_per_iter < nscalars_per_iter)
8712 /* For now, we only support cases in which all loads and stores fall back
8713 to VnQI or none do. */
8714 gcc_assert (!rgl->max_nscalars_per_iter
8715 || (rgl->factor == 1 && factor == 1)
8716 || (rgl->max_nscalars_per_iter * rgl->factor
8717 == nscalars_per_iter * factor));
8718 rgl->max_nscalars_per_iter = nscalars_per_iter;
8719 rgl->type = vectype;
8720 rgl->factor = factor;
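/* A worked example, assuming a length-controlled loop in which some
   accesses fall back to byte-sized (VnQI) operations: an access whose
   elements are halfwords records FACTOR == 2, so its length is measured
   in bytes, whereas an access operating on its natural element size
   records FACTOR == 1 and measures the length in elements.  The assert
   above only accepts a mix when nscalars_per_iter * factor agrees with
   what was already recorded for the rgroup.  */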
8724 /* Given a complete set of lengths LENS, extract length number INDEX for an
8725 rgroup that operates on NVECTORS vectors, where 0 <= INDEX < NVECTORS. */
8727 tree
8728 vect_get_loop_len (loop_vec_info loop_vinfo, vec_loop_lens *lens,
8729 unsigned int nvectors, unsigned int index)
8731 rgroup_controls *rgl = &(*lens)[nvectors - 1];
8733 /* Populate the rgroup's len array, if this is the first time we've
8734 used it. */
8735 if (rgl->controls.is_empty ())
8737 rgl->controls.safe_grow_cleared (nvectors, true);
8738 for (unsigned int i = 0; i < nvectors; ++i)
8740 tree len_type = LOOP_VINFO_RGROUP_COMPARE_TYPE (loop_vinfo);
8741 gcc_assert (len_type != NULL_TREE);
8742 tree len = make_temp_ssa_name (len_type, NULL, "loop_len");
8744 /* Provide a dummy definition until the real one is available. */
8745 SSA_NAME_DEF_STMT (len) = gimple_build_nop ();
8746 rgl->controls[i] = len;
8750 return rgl->controls[index];
8753 /* Scale the profiling counters of LOOP, which has been vectorized
8754 by factor VF, according to the estimated iteration count. */
8756 static void
8757 scale_profile_for_vect_loop (class loop *loop, unsigned vf)
8759 edge preheader = loop_preheader_edge (loop);
8760 /* Reduce loop iterations by the vectorization factor. */
8761 gcov_type new_est_niter = niter_for_unrolled_loop (loop, vf);
8762 profile_count freq_h = loop->header->count, freq_e = preheader->count ();
8764 if (freq_h.nonzero_p ())
8766 profile_probability p;
8768 /* Avoid dropping loop body profile counter to 0 because of zero count
8769 in loop's preheader. */
8770 if (!(freq_e == profile_count::zero ()))
8771 freq_e = freq_e.force_nonzero ();
8772 p = freq_e.apply_scale (new_est_niter + 1, 1).probability_in (freq_h);
8773 scale_loop_frequencies (loop, p);
8776 edge exit_e = single_exit (loop);
8777 exit_e->probability = profile_probability::always ()
8778 .apply_scale (1, new_est_niter + 1);
8780 edge exit_l = single_pred_edge (loop->latch);
8781 profile_probability prob = exit_l->probability;
8782 exit_l->probability = exit_e->probability.invert ();
8783 if (prob.initialized_p () && exit_l->probability.initialized_p ())
8784 scale_bbs_frequencies (&loop->latch, 1, exit_l->probability / prob);
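/* A worked example, assuming the loop header count corresponds to about
   100 iterations per entry and VF == 4: niter_for_unrolled_loop returns
   roughly 25, the body frequencies are scaled so that the header count is
   about (25 + 1) times the preheader count, and the exit probability
   becomes 1 / (25 + 1), i.e. the vector loop is expected to iterate about
   a quarter as often as the scalar loop did.  */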
8787 /* For a vectorized stmt DEF_STMT_INFO, adjust all vectorized PHI
8788 latch edge values that were originally defined by it. */
8790 static void
8791 maybe_set_vectorized_backedge_value (loop_vec_info loop_vinfo,
8792 stmt_vec_info def_stmt_info)
8794 tree def = gimple_get_lhs (vect_orig_stmt (def_stmt_info)->stmt);
8795 if (!def || TREE_CODE (def) != SSA_NAME)
8796 return;
8797 stmt_vec_info phi_info;
8798 imm_use_iterator iter;
8799 use_operand_p use_p;
8800 FOR_EACH_IMM_USE_FAST (use_p, iter, def)
8801 if (gphi *phi = dyn_cast <gphi *> (USE_STMT (use_p)))
8802 if (gimple_bb (phi)->loop_father->header == gimple_bb (phi)
8803 && (phi_info = loop_vinfo->lookup_stmt (phi))
8804 && VECTORIZABLE_CYCLE_DEF (STMT_VINFO_DEF_TYPE (phi_info))
8805 && STMT_VINFO_REDUC_TYPE (phi_info) != FOLD_LEFT_REDUCTION
8806 && STMT_VINFO_REDUC_TYPE (phi_info) != EXTRACT_LAST_REDUCTION)
8808 loop_p loop = gimple_bb (phi)->loop_father;
8809 edge e = loop_latch_edge (loop);
8810 if (PHI_ARG_DEF_FROM_EDGE (phi, e) == def)
8812 vec<gimple *> &phi_defs = STMT_VINFO_VEC_STMTS (phi_info);
8813 vec<gimple *> &latch_defs = STMT_VINFO_VEC_STMTS (def_stmt_info);
8814 gcc_assert (phi_defs.length () == latch_defs.length ());
8815 for (unsigned i = 0; i < phi_defs.length (); ++i)
8816 add_phi_arg (as_a <gphi *> (phi_defs[i]),
8817 gimple_get_lhs (latch_defs[i]), e,
8818 gimple_phi_arg_location (phi, e->dest_idx));
8823 /* Vectorize STMT_INFO if relevant, inserting any new instructions before GSI.
8824 When vectorizing STMT_INFO as a store, set *SEEN_STORE to its
8825 stmt_vec_info. */
8827 static void
8828 vect_transform_loop_stmt (loop_vec_info loop_vinfo, stmt_vec_info stmt_info,
8829 gimple_stmt_iterator *gsi, stmt_vec_info *seen_store)
8831 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
8832 poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
8834 if (dump_enabled_p ())
8835 dump_printf_loc (MSG_NOTE, vect_location,
8836 "------>vectorizing statement: %G", stmt_info->stmt);
8838 if (MAY_HAVE_DEBUG_BIND_STMTS && !STMT_VINFO_LIVE_P (stmt_info))
8839 vect_loop_kill_debug_uses (loop, stmt_info);
8841 if (!STMT_VINFO_RELEVANT_P (stmt_info)
8842 && !STMT_VINFO_LIVE_P (stmt_info))
8843 return;
8845 if (STMT_VINFO_VECTYPE (stmt_info))
8847 poly_uint64 nunits
8848 = TYPE_VECTOR_SUBPARTS (STMT_VINFO_VECTYPE (stmt_info));
8849 if (!STMT_SLP_TYPE (stmt_info)
8850 && maybe_ne (nunits, vf)
8851 && dump_enabled_p ())
8852 /* For SLP, VF is set according to the unrolling factor and not
8853 to the vector size, hence for SLP this print is not valid. */
8854 dump_printf_loc (MSG_NOTE, vect_location, "multiple-types.\n");
8857 /* Pure SLP statements have already been vectorized. We still need
8858 to apply loop vectorization to hybrid SLP statements. */
8859 if (PURE_SLP_STMT (stmt_info))
8860 return;
8862 if (dump_enabled_p ())
8863 dump_printf_loc (MSG_NOTE, vect_location, "transform statement.\n");
8865 if (vect_transform_stmt (loop_vinfo, stmt_info, gsi, NULL, NULL))
8866 *seen_store = stmt_info;
8869 /* Helper function to pass to simplify_replace_tree to enable replacing trees
8870 in the hash_map with their corresponding values. */
8872 static tree
8873 find_in_mapping (tree t, void *context)
8875 hash_map<tree,tree>* mapping = (hash_map<tree, tree>*) context;
8877 tree *value = mapping->get (t);
8878 return value ? *value : t;
8881 /* Update EPILOGUE's loop_vec_info. EPILOGUE was constructed as a copy of the
8882 original loop that has now been vectorized.
8884 The inits of the data_references need to be advanced with the number of
8885 iterations of the main loop. This has been computed in vect_do_peeling and
8886 is stored in parameter ADVANCE. We first restore the data_references'
8887 initial offsets with the values recorded in ORIG_DRS_INIT.
8889 Since the loop_vec_info of this EPILOGUE was constructed for the original
8890 loop, its stmt_vec_infos all point to the original statements. These need
8891 to be updated to point to their corresponding copies as well as the SSA_NAMES
8892 in their PATTERN_DEF_SEQs and RELATED_STMTs.
8894 The data_references' connections also need to be updated: their
8895 corresponding dr_vec_infos need to be reconnected to the EPILOGUE's
8896 stmt_vec_infos, and their statements need to point to their corresponding
8897 copies. If they are gather loads or scatter stores, their references need
8898 to be updated to point to the corresponding copies as well. Finally we set
8899 'base_misaligned' to false, as we have already peeled for alignment in the
8900 prologue of the main loop. */
8902 static void
8903 update_epilogue_loop_vinfo (class loop *epilogue, tree advance)
8905 loop_vec_info epilogue_vinfo = loop_vec_info_for_loop (epilogue);
8906 auto_vec<gimple *> stmt_worklist;
8907 hash_map<tree,tree> mapping;
8908 gimple *orig_stmt, *new_stmt;
8909 gimple_stmt_iterator epilogue_gsi;
8910 gphi_iterator epilogue_phi_gsi;
8911 stmt_vec_info stmt_vinfo = NULL, related_vinfo;
8912 basic_block *epilogue_bbs = get_loop_body (epilogue);
8913 unsigned i;
8915 free (LOOP_VINFO_BBS (epilogue_vinfo));
8916 LOOP_VINFO_BBS (epilogue_vinfo) = epilogue_bbs;
8918 /* Advance the data_references with the number of iterations of the previous
8919 loop and its prologue. */
8920 vect_update_inits_of_drs (epilogue_vinfo, advance, PLUS_EXPR);
8923 /* The EPILOGUE loop is a copy of the original loop so they share the same
8924 gimple UIDs. In this loop we update the loop_vec_info of the EPILOGUE to
8925 point to the copied statements. We also create a mapping from each LHS in
8926 the original loop to the corresponding LHS in the EPILOGUE and build worklists to
8927 update the STMT_VINFO_PATTERN_DEF_SEQs and STMT_VINFO_RELATED_STMTs. */
8928 for (unsigned i = 0; i < epilogue->num_nodes; ++i)
8930 for (epilogue_phi_gsi = gsi_start_phis (epilogue_bbs[i]);
8931 !gsi_end_p (epilogue_phi_gsi); gsi_next (&epilogue_phi_gsi))
8933 new_stmt = epilogue_phi_gsi.phi ();
8935 gcc_assert (gimple_uid (new_stmt) > 0);
8936 stmt_vinfo
8937 = epilogue_vinfo->stmt_vec_infos[gimple_uid (new_stmt) - 1];
8939 orig_stmt = STMT_VINFO_STMT (stmt_vinfo);
8940 STMT_VINFO_STMT (stmt_vinfo) = new_stmt;
8942 mapping.put (gimple_phi_result (orig_stmt),
8943 gimple_phi_result (new_stmt));
8944 /* PHI nodes cannot have patterns or related statements. */
8945 gcc_assert (STMT_VINFO_PATTERN_DEF_SEQ (stmt_vinfo) == NULL
8946 && STMT_VINFO_RELATED_STMT (stmt_vinfo) == NULL);
8949 for (epilogue_gsi = gsi_start_bb (epilogue_bbs[i]);
8950 !gsi_end_p (epilogue_gsi); gsi_next (&epilogue_gsi))
8952 new_stmt = gsi_stmt (epilogue_gsi);
8953 if (is_gimple_debug (new_stmt))
8954 continue;
8956 gcc_assert (gimple_uid (new_stmt) > 0);
8957 stmt_vinfo
8958 = epilogue_vinfo->stmt_vec_infos[gimple_uid (new_stmt) - 1];
8960 orig_stmt = STMT_VINFO_STMT (stmt_vinfo);
8961 STMT_VINFO_STMT (stmt_vinfo) = new_stmt;
8963 if (tree old_lhs = gimple_get_lhs (orig_stmt))
8964 mapping.put (old_lhs, gimple_get_lhs (new_stmt));
8966 if (STMT_VINFO_PATTERN_DEF_SEQ (stmt_vinfo))
8968 gimple_seq seq = STMT_VINFO_PATTERN_DEF_SEQ (stmt_vinfo);
8969 for (gimple_stmt_iterator gsi = gsi_start (seq);
8970 !gsi_end_p (gsi); gsi_next (&gsi))
8971 stmt_worklist.safe_push (gsi_stmt (gsi));
8974 related_vinfo = STMT_VINFO_RELATED_STMT (stmt_vinfo);
8975 if (related_vinfo != NULL && related_vinfo != stmt_vinfo)
8977 gimple *stmt = STMT_VINFO_STMT (related_vinfo);
8978 stmt_worklist.safe_push (stmt);
8979 /* Set BB such that the assert in
8980 'get_initial_def_for_reduction' is able to determine that
8981 the BB of the related stmt is inside this loop. */
8982 gimple_set_bb (stmt,
8983 gimple_bb (new_stmt));
8984 related_vinfo = STMT_VINFO_RELATED_STMT (related_vinfo);
8985 gcc_assert (related_vinfo == NULL
8986 || related_vinfo == stmt_vinfo);
8991 /* The PATTERN_DEF_SEQs and RELATED_STMTs in the epilogue were constructed
8992 using the original main loop and thus need to be updated to refer to the
8993 cloned variables used in the epilogue. */
8994 for (unsigned i = 0; i < stmt_worklist.length (); ++i)
8996 gimple *stmt = stmt_worklist[i];
8997 tree *new_op;
8999 for (unsigned j = 1; j < gimple_num_ops (stmt); ++j)
9001 tree op = gimple_op (stmt, j);
9002 if ((new_op = mapping.get(op)))
9003 gimple_set_op (stmt, j, *new_op);
9004 else
9006 /* PR92429: The last argument of simplify_replace_tree disables
9007 folding when replacing arguments. This is required as
9008 otherwise you might end up with different statements than the
9009 ones analyzed in vect_loop_analyze, leading to different
9010 vectorization. */
9011 op = simplify_replace_tree (op, NULL_TREE, NULL_TREE,
9012 &find_in_mapping, &mapping, false);
9013 gimple_set_op (stmt, j, op);
9018 struct data_reference *dr;
9019 vec<data_reference_p> datarefs = LOOP_VINFO_DATAREFS (epilogue_vinfo);
9020 FOR_EACH_VEC_ELT (datarefs, i, dr)
9022 orig_stmt = DR_STMT (dr);
9023 gcc_assert (gimple_uid (orig_stmt) > 0);
9024 stmt_vinfo = epilogue_vinfo->stmt_vec_infos[gimple_uid (orig_stmt) - 1];
9025 /* Data references for gather loads and scatter stores do not use the
9026 updated offset we set using ADVANCE. Instead we have to make sure the
9027 reference in each data reference points to the corresponding copy of
9028 the original in the epilogue. */
9029 if (STMT_VINFO_MEMORY_ACCESS_TYPE (vect_stmt_to_vectorize (stmt_vinfo))
9030 == VMAT_GATHER_SCATTER)
9032 DR_REF (dr)
9033 = simplify_replace_tree (DR_REF (dr), NULL_TREE, NULL_TREE,
9034 &find_in_mapping, &mapping);
9035 DR_BASE_ADDRESS (dr)
9036 = simplify_replace_tree (DR_BASE_ADDRESS (dr), NULL_TREE, NULL_TREE,
9037 &find_in_mapping, &mapping);
9039 DR_STMT (dr) = STMT_VINFO_STMT (stmt_vinfo);
9040 stmt_vinfo->dr_aux.stmt = stmt_vinfo;
9041 /* The vector size of the epilogue is smaller than that of the main loop,
9042 so the required alignment is either the same or lower. This means the dr
9043 will by definition be aligned. */
9044 STMT_VINFO_DR_INFO (stmt_vinfo)->base_misaligned = false;
9047 epilogue_vinfo->shared->datarefs_copy.release ();
9048 epilogue_vinfo->shared->save_datarefs ();
9051 /* Function vect_transform_loop.
9053 The analysis phase has determined that the loop is vectorizable.
9054 Vectorize the loop - create vectorized stmts to replace the scalar
9055 stmts in the loop, and update the loop exit condition.
9056 Returns scalar epilogue loop if any. */
9058 class loop *
9059 vect_transform_loop (loop_vec_info loop_vinfo, gimple *loop_vectorized_call)
9061 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
9062 class loop *epilogue = NULL;
9063 basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
9064 int nbbs = loop->num_nodes;
9065 int i;
9066 tree niters_vector = NULL_TREE;
9067 tree step_vector = NULL_TREE;
9068 tree niters_vector_mult_vf = NULL_TREE;
9069 poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
9070 unsigned int lowest_vf = constant_lower_bound (vf);
9071 gimple *stmt;
9072 bool check_profitability = false;
9073 unsigned int th;
9075 DUMP_VECT_SCOPE ("vec_transform_loop");
9077 loop_vinfo->shared->check_datarefs ();
9079 /* Use the more conservative vectorization threshold. If the number
9080 of iterations is constant, assume the cost check has been performed
9081 by our caller. If the threshold makes all loops profitable that
9082 run at least the (estimated) vectorization factor number of times,
9083 checking is pointless, too. */
9084 th = LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo);
9085 if (vect_apply_runtime_profitability_check_p (loop_vinfo))
9087 if (dump_enabled_p ())
9088 dump_printf_loc (MSG_NOTE, vect_location,
9089 "Profitability threshold is %d loop iterations.\n",
9090 th);
9091 check_profitability = true;
9094 /* Make sure there exists a single-predecessor exit bb. Do this before
9095 versioning. */
9096 edge e = single_exit (loop);
9097 if (! single_pred_p (e->dest))
9099 split_loop_exit_edge (e, true);
9100 if (dump_enabled_p ())
9101 dump_printf (MSG_NOTE, "split exit edge\n");
9104 /* Version the loop first, if required, so the profitability check
9105 comes first. */
9107 if (LOOP_REQUIRES_VERSIONING (loop_vinfo))
9109 class loop *sloop
9110 = vect_loop_versioning (loop_vinfo, loop_vectorized_call);
9111 sloop->force_vectorize = false;
9112 check_profitability = false;
9115 /* Make sure there exists a single-predecessor exit bb also on the
9116 scalar loop copy. Do this after versioning but before peeling
9117 so the CFG structure is fine for both the scalar and the if-converted
9118 loop, and slpeel_duplicate_current_defs_from_edges faces matched
9119 loop-closed PHI nodes on the exit. */
9120 if (LOOP_VINFO_SCALAR_LOOP (loop_vinfo))
9122 e = single_exit (LOOP_VINFO_SCALAR_LOOP (loop_vinfo));
9123 if (! single_pred_p (e->dest))
9125 split_loop_exit_edge (e, true);
9126 if (dump_enabled_p ())
9127 dump_printf (MSG_NOTE, "split exit edge of scalar loop\n");
9131 tree niters = vect_build_loop_niters (loop_vinfo);
9132 LOOP_VINFO_NITERS_UNCHANGED (loop_vinfo) = niters;
9133 tree nitersm1 = unshare_expr (LOOP_VINFO_NITERSM1 (loop_vinfo));
9134 bool niters_no_overflow = loop_niters_no_overflow (loop_vinfo);
9135 tree advance;
9136 drs_init_vec orig_drs_init;
9138 epilogue = vect_do_peeling (loop_vinfo, niters, nitersm1, &niters_vector,
9139 &step_vector, &niters_vector_mult_vf, th,
9140 check_profitability, niters_no_overflow,
9141 &advance);
9143 if (LOOP_VINFO_SCALAR_LOOP (loop_vinfo)
9144 && LOOP_VINFO_SCALAR_LOOP_SCALING (loop_vinfo).initialized_p ())
9145 scale_loop_frequencies (LOOP_VINFO_SCALAR_LOOP (loop_vinfo),
9146 LOOP_VINFO_SCALAR_LOOP_SCALING (loop_vinfo));
9148 if (niters_vector == NULL_TREE)
9150 if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
9151 && !LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo)
9152 && known_eq (lowest_vf, vf))
9154 niters_vector
9155 = build_int_cst (TREE_TYPE (LOOP_VINFO_NITERS (loop_vinfo)),
9156 LOOP_VINFO_INT_NITERS (loop_vinfo) / lowest_vf);
9157 step_vector = build_one_cst (TREE_TYPE (niters));
9159 else if (vect_use_loop_mask_for_alignment_p (loop_vinfo))
9160 vect_gen_vector_loop_niters (loop_vinfo, niters, &niters_vector,
9161 &step_vector, niters_no_overflow);
9162 else
9163 /* vect_do_peeling subtracted the number of peeled prologue
9164 iterations from LOOP_VINFO_NITERS. */
9165 vect_gen_vector_loop_niters (loop_vinfo, LOOP_VINFO_NITERS (loop_vinfo),
9166 &niters_vector, &step_vector,
9167 niters_no_overflow);
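/* A short worked example of the constant case above, assuming
   LOOP_VINFO_INT_NITERS == 100, a constant VF == 4 and no partial
   vectors: NITERS_VECTOR becomes 100 / 4 == 25 with STEP_VECTOR == 1,
   so the vector loop runs 25 iterations; a non-divisible remainder
   would be handled by the epilogue loop created by vect_do_peeling.  */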
9170 /* 1) Make sure the loop header has exactly two entries
9171 2) Make sure we have a preheader basic block. */
9173 gcc_assert (EDGE_COUNT (loop->header->preds) == 2);
9175 split_edge (loop_preheader_edge (loop));
9177 if (vect_use_loop_mask_for_alignment_p (loop_vinfo))
9178 /* This will deal with any possible peeling. */
9179 vect_prepare_for_masked_peels (loop_vinfo);
9181 /* Schedule the SLP instances first, then handle loop vectorization
9182 below. */
9183 if (!loop_vinfo->slp_instances.is_empty ())
9185 DUMP_VECT_SCOPE ("scheduling SLP instances");
9186 vect_schedule_slp (loop_vinfo, LOOP_VINFO_SLP_INSTANCES (loop_vinfo));
9189 /* FORNOW: the vectorizer supports only loops whose body consists
9190 of one basic block (header + empty latch). When the vectorizer
9191 supports more involved loop forms, the order in which the BBs are
9192 traversed needs to be reconsidered. */
9194 for (i = 0; i < nbbs; i++)
9196 basic_block bb = bbs[i];
9197 stmt_vec_info stmt_info;
9199 for (gphi_iterator si = gsi_start_phis (bb); !gsi_end_p (si);
9200 gsi_next (&si))
9202 gphi *phi = si.phi ();
9203 if (dump_enabled_p ())
9204 dump_printf_loc (MSG_NOTE, vect_location,
9205 "------>vectorizing phi: %G", phi);
9206 stmt_info = loop_vinfo->lookup_stmt (phi);
9207 if (!stmt_info)
9208 continue;
9210 if (MAY_HAVE_DEBUG_BIND_STMTS && !STMT_VINFO_LIVE_P (stmt_info))
9211 vect_loop_kill_debug_uses (loop, stmt_info);
9213 if (!STMT_VINFO_RELEVANT_P (stmt_info)
9214 && !STMT_VINFO_LIVE_P (stmt_info))
9215 continue;
9217 if (STMT_VINFO_VECTYPE (stmt_info)
9218 && (maybe_ne
9219 (TYPE_VECTOR_SUBPARTS (STMT_VINFO_VECTYPE (stmt_info)), vf))
9220 && dump_enabled_p ())
9221 dump_printf_loc (MSG_NOTE, vect_location, "multiple-types.\n");
9223 if ((STMT_VINFO_DEF_TYPE (stmt_info) == vect_induction_def
9224 || STMT_VINFO_DEF_TYPE (stmt_info) == vect_reduction_def
9225 || STMT_VINFO_DEF_TYPE (stmt_info) == vect_double_reduction_def
9226 || STMT_VINFO_DEF_TYPE (stmt_info) == vect_nested_cycle
9227 || STMT_VINFO_DEF_TYPE (stmt_info) == vect_internal_def)
9228 && ! PURE_SLP_STMT (stmt_info))
9230 if (dump_enabled_p ())
9231 dump_printf_loc (MSG_NOTE, vect_location, "transform phi.\n");
9232 vect_transform_stmt (loop_vinfo, stmt_info, NULL, NULL, NULL);
9236 for (gphi_iterator si = gsi_start_phis (bb); !gsi_end_p (si);
9237 gsi_next (&si))
9239 gphi *phi = si.phi ();
9240 stmt_info = loop_vinfo->lookup_stmt (phi);
9241 if (!stmt_info)
9242 continue;
9244 if (!STMT_VINFO_RELEVANT_P (stmt_info)
9245 && !STMT_VINFO_LIVE_P (stmt_info))
9246 continue;
9248 if ((STMT_VINFO_DEF_TYPE (stmt_info) == vect_induction_def
9249 || STMT_VINFO_DEF_TYPE (stmt_info) == vect_reduction_def
9250 || STMT_VINFO_DEF_TYPE (stmt_info) == vect_double_reduction_def
9251 || STMT_VINFO_DEF_TYPE (stmt_info) == vect_nested_cycle
9252 || STMT_VINFO_DEF_TYPE (stmt_info) == vect_internal_def)
9253 && ! PURE_SLP_STMT (stmt_info))
9254 maybe_set_vectorized_backedge_value (loop_vinfo, stmt_info);
9257 for (gimple_stmt_iterator si = gsi_start_bb (bb);
9258 !gsi_end_p (si);)
9260 stmt = gsi_stmt (si);
9261 /* During vectorization remove existing clobber stmts. */
9262 if (gimple_clobber_p (stmt))
9264 unlink_stmt_vdef (stmt);
9265 gsi_remove (&si, true);
9266 release_defs (stmt);
9268 else
9270 /* Ignore vector stmts created in the outer loop. */
9271 stmt_info = loop_vinfo->lookup_stmt (stmt);
9273 /* vector stmts created in the outer-loop during vectorization of
9274 stmts in an inner-loop may not have a stmt_info, and do not
9275 need to be vectorized. */
9276 stmt_vec_info seen_store = NULL;
9277 if (stmt_info)
9279 if (STMT_VINFO_IN_PATTERN_P (stmt_info))
9281 gimple *def_seq = STMT_VINFO_PATTERN_DEF_SEQ (stmt_info);
9282 for (gimple_stmt_iterator subsi = gsi_start (def_seq);
9283 !gsi_end_p (subsi); gsi_next (&subsi))
9285 stmt_vec_info pat_stmt_info
9286 = loop_vinfo->lookup_stmt (gsi_stmt (subsi));
9287 vect_transform_loop_stmt (loop_vinfo, pat_stmt_info,
9288 &si, &seen_store);
9290 stmt_vec_info pat_stmt_info
9291 = STMT_VINFO_RELATED_STMT (stmt_info);
9292 vect_transform_loop_stmt (loop_vinfo, pat_stmt_info, &si,
9293 &seen_store);
9294 maybe_set_vectorized_backedge_value (loop_vinfo,
9295 pat_stmt_info);
9297 else
9299 vect_transform_loop_stmt (loop_vinfo, stmt_info, &si,
9300 &seen_store);
9301 maybe_set_vectorized_backedge_value (loop_vinfo,
9302 stmt_info);
9305 gsi_next (&si);
9306 if (seen_store)
9308 if (STMT_VINFO_GROUPED_ACCESS (seen_store))
9309 /* Interleaving. The vectorization of the
9310 interleaving chain was completed - free
9311 all the stores in the chain. */
9312 vect_remove_stores (loop_vinfo,
9313 DR_GROUP_FIRST_ELEMENT (seen_store));
9314 else
9315 /* Free the attached stmt_vec_info and remove the stmt. */
9316 loop_vinfo->remove_stmt (stmt_info);
9321 /* Stub out scalar statements that must not survive vectorization.
9322 Doing this here helps with grouped statements, or statements that
9323 are involved in patterns. */
9324 for (gimple_stmt_iterator gsi = gsi_start_bb (bb);
9325 !gsi_end_p (gsi); gsi_next (&gsi))
9327 gcall *call = dyn_cast <gcall *> (gsi_stmt (gsi));
9328 if (call && gimple_call_internal_p (call, IFN_MASK_LOAD))
9330 tree lhs = gimple_get_lhs (call);
9331 if (!VECTOR_TYPE_P (TREE_TYPE (lhs)))
9333 tree zero = build_zero_cst (TREE_TYPE (lhs));
9334 gimple *new_stmt = gimple_build_assign (lhs, zero);
9335 gsi_replace (&gsi, new_stmt, true);
9339 } /* BBs in loop */
9341 /* The vectorization factor is always > 1, so if we use an IV increment of 1
9342 a zero NITERS becomes a nonzero NITERS_VECTOR. */
9343 if (integer_onep (step_vector))
9344 niters_no_overflow = true;
9345 vect_set_loop_condition (loop, loop_vinfo, niters_vector, step_vector,
9346 niters_vector_mult_vf, !niters_no_overflow);
9348 unsigned int assumed_vf = vect_vf_for_cost (loop_vinfo);
9349 scale_profile_for_vect_loop (loop, assumed_vf);
9351 /* True if the final iteration might not handle a full vector's
9352 worth of scalar iterations. */
9353 bool final_iter_may_be_partial
9354 = LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo);
9355 /* The minimum number of iterations performed by the epilogue. This
9356 is 1 when peeling for gaps because we always need a final scalar
9357 iteration. */
9358 int min_epilogue_iters = LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo) ? 1 : 0;
9359 /* +1 to convert latch counts to loop iteration counts,
9360 -min_epilogue_iters to remove iterations that cannot be performed
9361 by the vector code. */
9362 int bias_for_lowest = 1 - min_epilogue_iters;
9363 int bias_for_assumed = bias_for_lowest;
9364 int alignment_npeels = LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo);
9365 if (alignment_npeels && LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))
9367 /* When the amount of peeling is known at compile time, the first
9368 iteration will have exactly alignment_npeels active elements.
9369 In the worst case it will have at least one. */
9370 int min_first_active = (alignment_npeels > 0 ? alignment_npeels : 1);
9371 bias_for_lowest += lowest_vf - min_first_active;
9372 bias_for_assumed += assumed_vf - min_first_active;
9374 /* In these calculations the "- 1" converts loop iteration counts
9375 back to latch counts. */
9376 if (loop->any_upper_bound)
9377 loop->nb_iterations_upper_bound
9378 = (final_iter_may_be_partial
9379 ? wi::udiv_ceil (loop->nb_iterations_upper_bound + bias_for_lowest,
9380 lowest_vf) - 1
9381 : wi::udiv_floor (loop->nb_iterations_upper_bound + bias_for_lowest,
9382 lowest_vf) - 1);
9383 if (loop->any_likely_upper_bound)
9384 loop->nb_iterations_likely_upper_bound
9385 = (final_iter_may_be_partial
9386 ? wi::udiv_ceil (loop->nb_iterations_likely_upper_bound
9387 + bias_for_lowest, lowest_vf) - 1
9388 : wi::udiv_floor (loop->nb_iterations_likely_upper_bound
9389 + bias_for_lowest, lowest_vf) - 1);
9390 if (loop->any_estimate)
9391 loop->nb_iterations_estimate
9392 = (final_iter_may_be_partial
9393 ? wi::udiv_ceil (loop->nb_iterations_estimate + bias_for_assumed,
9394 assumed_vf) - 1
9395 : wi::udiv_floor (loop->nb_iterations_estimate + bias_for_assumed,
9396 assumed_vf) - 1);
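/* A worked example of the bound adjustment above, assuming a known latch
   bound of 99 (100 iterations), lowest_vf == assumed_vf == 4, no peeling
   for gaps and no partial vectors: bias_for_lowest == 1, so the new upper
   bound is floor ((99 + 1) / 4) - 1 == 24 latch iterations, i.e. 25
   vector iterations covering the original 100 scalar iterations.  */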
9398 if (dump_enabled_p ())
9400 if (!LOOP_VINFO_EPILOGUE_P (loop_vinfo))
9402 dump_printf_loc (MSG_NOTE, vect_location,
9403 "LOOP VECTORIZED\n");
9404 if (loop->inner)
9405 dump_printf_loc (MSG_NOTE, vect_location,
9406 "OUTER LOOP VECTORIZED\n");
9407 dump_printf (MSG_NOTE, "\n");
9409 else
9410 dump_printf_loc (MSG_NOTE, vect_location,
9411 "LOOP EPILOGUE VECTORIZED (MODE=%s)\n",
9412 GET_MODE_NAME (loop_vinfo->vector_mode));
9415 /* Loops vectorized with a variable factor won't benefit from
9416 unrolling/peeling. */
9417 if (!vf.is_constant ())
9419 loop->unroll = 1;
9420 if (dump_enabled_p ())
9421 dump_printf_loc (MSG_NOTE, vect_location, "Disabling unrolling due to"
9422 " variable-length vectorization factor\n");
9424 /* Free SLP instances here because otherwise stmt reference counting
9425 won't work. */
9426 slp_instance instance;
9427 FOR_EACH_VEC_ELT (LOOP_VINFO_SLP_INSTANCES (loop_vinfo), i, instance)
9428 vect_free_slp_instance (instance);
9429 LOOP_VINFO_SLP_INSTANCES (loop_vinfo).release ();
9430 /* Clear the safelen field since its value is invalid after vectorization,
9431 as the vectorized loop can have loop-carried dependencies. */
9432 loop->safelen = 0;
9434 if (epilogue)
9436 update_epilogue_loop_vinfo (epilogue, advance);
9438 epilogue->simduid = loop->simduid;
9439 epilogue->force_vectorize = loop->force_vectorize;
9440 epilogue->dont_vectorize = false;
9443 return epilogue;
9446 /* The code below tries to perform a simple optimization - revert
9447 if-conversion for masked stores, i.e. if the mask of a store is zero,
9448 do not perform the store and, if possible, skip the producers of the stored values as well.
9449 For example,
9450 for (i=0; i<n; i++)
9451 if (c[i])
9453 p1[i] += 1;
9454 p2[i] = p3[i] +2;
9456 this transformation will produce the following semi-hammock:
9458 if (!mask__ifc__42.18_165 == { 0, 0, 0, 0, 0, 0, 0, 0 })
9460 vect__11.19_170 = MASK_LOAD (vectp_p1.20_168, 0B, mask__ifc__42.18_165);
9461 vect__12.22_172 = vect__11.19_170 + vect_cst__171;
9462 MASK_STORE (vectp_p1.23_175, 0B, mask__ifc__42.18_165, vect__12.22_172);
9463 vect__18.25_182 = MASK_LOAD (vectp_p3.26_180, 0B, mask__ifc__42.18_165);
9464 vect__19.28_184 = vect__18.25_182 + vect_cst__183;
9465 MASK_STORE (vectp_p2.29_187, 0B, mask__ifc__42.18_165, vect__19.28_184);
9469 void
9470 optimize_mask_stores (class loop *loop)
9472 basic_block *bbs = get_loop_body (loop);
9473 unsigned nbbs = loop->num_nodes;
9474 unsigned i;
9475 basic_block bb;
9476 class loop *bb_loop;
9477 gimple_stmt_iterator gsi;
9478 gimple *stmt;
9479 auto_vec<gimple *> worklist;
9480 auto_purge_vect_location sentinel;
9482 vect_location = find_loop_location (loop);
9483 /* Pick up all masked stores in loop if any. */
9484 for (i = 0; i < nbbs; i++)
9486 bb = bbs[i];
9487 for (gsi = gsi_start_bb (bb); !gsi_end_p (gsi);
9488 gsi_next (&gsi))
9490 stmt = gsi_stmt (gsi);
9491 if (gimple_call_internal_p (stmt, IFN_MASK_STORE))
9492 worklist.safe_push (stmt);
9496 free (bbs);
9497 if (worklist.is_empty ())
9498 return;
9500 /* Loop has masked stores. */
9501 while (!worklist.is_empty ())
9503 gimple *last, *last_store;
9504 edge e, efalse;
9505 tree mask;
9506 basic_block store_bb, join_bb;
9507 gimple_stmt_iterator gsi_to;
9508 tree vdef, new_vdef;
9509 gphi *phi;
9510 tree vectype;
9511 tree zero;
9513 last = worklist.pop ();
9514 mask = gimple_call_arg (last, 2);
9515 bb = gimple_bb (last);
9516 /* Create then_bb and the if-then structure in the CFG; then_bb belongs to
9517 the same loop as if_bb. It can differ from LOOP when a two-level
9518 loop nest is vectorized and the mask_store belongs to the inner
9519 one. */
9520 e = split_block (bb, last);
9521 bb_loop = bb->loop_father;
9522 gcc_assert (loop == bb_loop || flow_loop_nested_p (loop, bb_loop));
9523 join_bb = e->dest;
9524 store_bb = create_empty_bb (bb);
9525 add_bb_to_loop (store_bb, bb_loop);
9526 e->flags = EDGE_TRUE_VALUE;
9527 efalse = make_edge (bb, store_bb, EDGE_FALSE_VALUE);
9528 /* Make the edge to STORE_BB unlikely. */
9529 efalse->probability = profile_probability::unlikely ();
9530 store_bb->count = efalse->count ();
9531 make_single_succ_edge (store_bb, join_bb, EDGE_FALLTHRU);
9532 if (dom_info_available_p (CDI_DOMINATORS))
9533 set_immediate_dominator (CDI_DOMINATORS, store_bb, bb);
9534 if (dump_enabled_p ())
9535 dump_printf_loc (MSG_NOTE, vect_location,
9536 "Create new block %d to sink mask stores.",
9537 store_bb->index);
9538 /* Create vector comparison with boolean result. */
9539 vectype = TREE_TYPE (mask);
9540 zero = build_zero_cst (vectype);
9541 stmt = gimple_build_cond (EQ_EXPR, mask, zero, NULL_TREE, NULL_TREE);
9542 gsi = gsi_last_bb (bb);
9543 gsi_insert_after (&gsi, stmt, GSI_SAME_STMT);
9544 /* Create new PHI node for vdef of the last masked store:
9545 .MEM_2 = VDEF <.MEM_1>
9546 will be converted to
9547 .MEM_3 = VDEF <.MEM_1>
9548 and new PHI node will be created in join bb
9549 .MEM_2 = PHI <.MEM_1, .MEM_3>
9551 vdef = gimple_vdef (last);
9552 new_vdef = make_ssa_name (gimple_vop (cfun), last);
9553 gimple_set_vdef (last, new_vdef);
9554 phi = create_phi_node (vdef, join_bb);
9555 add_phi_arg (phi, new_vdef, EDGE_SUCC (store_bb, 0), UNKNOWN_LOCATION);
9557 /* Put all masked stores with the same mask to STORE_BB if possible. */
9558 while (true)
9560 gimple_stmt_iterator gsi_from;
9561 gimple *stmt1 = NULL;
9563 /* Move masked store to STORE_BB. */
9564 last_store = last;
9565 gsi = gsi_for_stmt (last);
9566 gsi_from = gsi;
9567 /* Shift GSI to the previous stmt for further traversal. */
9568 gsi_prev (&gsi);
9569 gsi_to = gsi_start_bb (store_bb);
9570 gsi_move_before (&gsi_from, &gsi_to);
9571 /* Set GSI_TO to the start of the now non-empty block. */
9572 gsi_to = gsi_start_bb (store_bb);
9573 if (dump_enabled_p ())
9574 dump_printf_loc (MSG_NOTE, vect_location,
9575 "Move stmt to created bb\n%G", last);
9576 /* Move all stored value producers if possible. */
9577 while (!gsi_end_p (gsi))
9579 tree lhs;
9580 imm_use_iterator imm_iter;
9581 use_operand_p use_p;
9582 bool res;
9584 /* Skip debug statements. */
9585 if (is_gimple_debug (gsi_stmt (gsi)))
9587 gsi_prev (&gsi);
9588 continue;
9590 stmt1 = gsi_stmt (gsi);
9591 /* Do not consider statements writing to memory or having
9592 a volatile operand. */
9593 if (gimple_vdef (stmt1)
9594 || gimple_has_volatile_ops (stmt1))
9595 break;
9596 gsi_from = gsi;
9597 gsi_prev (&gsi);
9598 lhs = gimple_get_lhs (stmt1);
9599 if (!lhs)
9600 break;
9602 /* LHS of vectorized stmt must be SSA_NAME. */
9603 if (TREE_CODE (lhs) != SSA_NAME)
9604 break;
9606 if (!VECTOR_TYPE_P (TREE_TYPE (lhs)))
9608 /* Remove dead scalar statement. */
9609 if (has_zero_uses (lhs))
9611 gsi_remove (&gsi_from, true);
9612 continue;
9616 /* Check that LHS does not have uses outside of STORE_BB. */
9617 res = true;
9618 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, lhs)
9620 gimple *use_stmt;
9621 use_stmt = USE_STMT (use_p);
9622 if (is_gimple_debug (use_stmt))
9623 continue;
9624 if (gimple_bb (use_stmt) != store_bb)
9626 res = false;
9627 break;
9630 if (!res)
9631 break;
9633 if (gimple_vuse (stmt1)
9634 && gimple_vuse (stmt1) != gimple_vuse (last_store))
9635 break;
9637 /* Can move STMT1 to STORE_BB. */
9638 if (dump_enabled_p ())
9639 dump_printf_loc (MSG_NOTE, vect_location,
9640 "Move stmt to created bb\n%G", stmt1);
9641 gsi_move_before (&gsi_from, &gsi_to);
9642 /* Shift GSI_TO for further insertion. */
9643 gsi_prev (&gsi_to);
9645 /* Put other masked stores with the same mask to STORE_BB. */
9646 if (worklist.is_empty ()
9647 || gimple_call_arg (worklist.last (), 2) != mask
9648 || worklist.last () != stmt1)
9649 break;
9650 last = worklist.pop ();
9652 add_phi_arg (phi, gimple_vuse (last_store), e, UNKNOWN_LOCATION);
9656 /* Decide whether it is possible to use a zero-based induction variable
9657 when vectorizing LOOP_VINFO with partial vectors. If it is, return
9658 the value that the induction variable must be able to hold in order
9659 to ensure that the rgroups eventually have no active vector elements.
9660 Return -1 otherwise. */
9662 widest_int
9663 vect_iv_limit_for_partial_vectors (loop_vec_info loop_vinfo)
9665 tree niters_skip = LOOP_VINFO_MASK_SKIP_NITERS (loop_vinfo);
9666 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
9667 unsigned HOST_WIDE_INT max_vf = vect_max_vf (loop_vinfo);
9669 /* Calculate the value that the induction variable must be able
9670 to hit in order to ensure that we end the loop with an all-false mask.
9671 This involves adding the maximum number of inactive trailing scalar
9672 iterations. */
9673 widest_int iv_limit = -1;
9674 if (max_loop_iterations (loop, &iv_limit))
9676 if (niters_skip)
9678 /* Add the maximum number of skipped iterations to the
9679 maximum iteration count. */
9680 if (TREE_CODE (niters_skip) == INTEGER_CST)
9681 iv_limit += wi::to_widest (niters_skip);
9682 else
9683 iv_limit += max_vf - 1;
9685 else if (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo))
9686 /* Make a conservatively-correct assumption. */
9687 iv_limit += max_vf - 1;
9689 /* IV_LIMIT is the maximum number of latch iterations, which is also
9690 the maximum in-range IV value. Round this value down to the previous
9691 vector alignment boundary and then add an extra full iteration. */
9692 poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
9693 iv_limit = (iv_limit & -(int) known_alignment (vf)) + max_vf;
9695 return iv_limit;
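/* A worked example, assuming a constant VF == 4 (so max_vf == 4), a
   maximum latch count of 1002 and neither skipped first iterations nor
   peeling for alignment: rounding 1002 down to a multiple of 4 gives
   1000, and adding max_vf gives an IV limit of 1004 - the value a
   zero-based IV counting scalar iterations must be able to reach before
   all masks or lengths become inactive.  */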
9698 /* For the given rgroup_controls RGC, check whether an induction variable
9699 would ever hit a value that produces a set of all-false masks or zero
9700 lengths before wrapping around. Return true if it's possible to wrap
9701 around before hitting the desirable value, otherwise return false. */
9703 bool
9704 vect_rgroup_iv_might_wrap_p (loop_vec_info loop_vinfo, rgroup_controls *rgc)
9706 widest_int iv_limit = vect_iv_limit_for_partial_vectors (loop_vinfo);
9708 if (iv_limit == -1)
9709 return true;
9711 tree compare_type = LOOP_VINFO_RGROUP_COMPARE_TYPE (loop_vinfo);
9712 unsigned int compare_precision = TYPE_PRECISION (compare_type);
9713 unsigned nitems = rgc->max_nscalars_per_iter * rgc->factor;
9715 if (wi::min_precision (iv_limit * nitems, UNSIGNED) > compare_precision)
9716 return true;
9718 return false;
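/* A worked example of the precision check above, assuming iv_limit ==
   1004 and an rgroup with max_nscalars_per_iter == 2 and factor == 1
   (nitems == 2): the IV must be able to represent 1004 * 2 == 2008,
   which needs 11 bits.  With a 16-bit compare type this fits and the
   function returns false; with an 8-bit compare type it would not, and
   the IV might wrap before producing an all-false mask.  */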