1 /* Loop Vectorization
2 Copyright (C) 2003-2020 Free Software Foundation, Inc.
3 Contributed by Dorit Naishlos <dorit@il.ibm.com> and
4 Ira Rosen <irar@il.ibm.com>
6 This file is part of GCC.
8 GCC is free software; you can redistribute it and/or modify it under
9 the terms of the GNU General Public License as published by the Free
10 Software Foundation; either version 3, or (at your option) any later
11 version.
13 GCC is distributed in the hope that it will be useful, but WITHOUT ANY
14 WARRANTY; without even the implied warranty of MERCHANTABILITY or
15 FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
16 for more details.
18 You should have received a copy of the GNU General Public License
19 along with GCC; see the file COPYING3. If not see
20 <http://www.gnu.org/licenses/>. */
22 #include "config.h"
23 #include "system.h"
24 #include "coretypes.h"
25 #include "backend.h"
26 #include "target.h"
27 #include "rtl.h"
28 #include "tree.h"
29 #include "gimple.h"
30 #include "cfghooks.h"
31 #include "tree-pass.h"
32 #include "ssa.h"
33 #include "optabs-tree.h"
34 #include "diagnostic-core.h"
35 #include "fold-const.h"
36 #include "stor-layout.h"
37 #include "cfganal.h"
38 #include "gimplify.h"
39 #include "gimple-iterator.h"
40 #include "gimplify-me.h"
41 #include "tree-ssa-loop-ivopts.h"
42 #include "tree-ssa-loop-manip.h"
43 #include "tree-ssa-loop-niter.h"
44 #include "tree-ssa-loop.h"
45 #include "cfgloop.h"
46 #include "tree-scalar-evolution.h"
47 #include "tree-vectorizer.h"
48 #include "gimple-fold.h"
49 #include "cgraph.h"
50 #include "tree-cfg.h"
51 #include "tree-if-conv.h"
52 #include "internal-fn.h"
53 #include "tree-vector-builder.h"
54 #include "vec-perm-indices.h"
55 #include "tree-eh.h"
57 /* Loop Vectorization Pass.
59 This pass tries to vectorize loops.
61 For example, the vectorizer transforms the following simple loop:
63 short a[N]; short b[N]; short c[N]; int i;
65 for (i=0; i<N; i++){
66 a[i] = b[i] + c[i];
69 as if it were manually vectorized by rewriting the source code into:
71 typedef int __attribute__((mode(V8HI))) v8hi;
72 short a[N]; short b[N]; short c[N]; int i;
73 v8hi *pa = (v8hi*)a, *pb = (v8hi*)b, *pc = (v8hi*)c;
74 v8hi va, vb, vc;
76 for (i=0; i<N/8; i++){
77 vb = pb[i];
78 vc = pc[i];
79 va = vb + vc;
80 pa[i] = va;
83 The main entry to this pass is vectorize_loops(), in which
84 the vectorizer applies a set of analyses on a given set of loops,
85 followed by the actual vectorization transformation for the loops that
86 had successfully passed the analysis phase.
87 Throughout this pass we make a distinction between two types of
88 data: scalars (which are represented by SSA_NAMES), and memory references
89 ("data-refs"). These two types of data require different handling both
90 during analysis and transformation. The types of data-refs that the
91 vectorizer currently supports are ARRAY_REFS whose base is an array DECL
92 (not a pointer), and INDIRECT_REFS through pointers; both array and pointer
93 accesses are required to have a simple (consecutive) access pattern.
95 Analysis phase:
96 ===============
97 The driver for the analysis phase is vect_analyze_loop().
98 It applies a set of analyses, some of which rely on the scalar evolution
99 analyzer (scev) developed by Sebastian Pop.
101 During the analysis phase the vectorizer records some information
102 per stmt in a "stmt_vec_info" struct which is attached to each stmt in the
103 loop, as well as general information about the loop as a whole, which is
104 recorded in a "loop_vec_info" struct attached to each loop.
106 Transformation phase:
107 =====================
108 The loop transformation phase scans all the stmts in the loop, and
109 creates a vector stmt (or a sequence of stmts) for each scalar stmt S in
110 the loop that needs to be vectorized. It inserts the vector code sequence
111 just before the scalar stmt S, and records a pointer to the vector code
112 in STMT_VINFO_VEC_STMT (stmt_info) (stmt_info is the stmt_vec_info struct
113 attached to S). This pointer will be used for the vectorization of following
114 stmts which use the def of stmt S. Stmt S is removed if it writes to memory;
115 otherwise, we rely on dead code elimination for removing it.
117 For example, say stmt S1 was vectorized into stmt VS1:
119 VS1: vb = px[i];
120 S1: b = x[i]; STMT_VINFO_VEC_STMT (stmt_info (S1)) = VS1
121 S2: a = b;
123 To vectorize stmt S2, the vectorizer first finds the stmt that defines
124 the operand 'b' (S1), and gets the relevant vector def 'vb' from the
125 vector stmt VS1 pointed to by STMT_VINFO_VEC_STMT (stmt_info (S1)). The
126 resulting sequence would be:
128 VS1: vb = px[i];
129 S1: b = x[i]; STMT_VINFO_VEC_STMT (stmt_info (S1)) = VS1
130 VS2: va = vb;
131 S2: a = b; STMT_VINFO_VEC_STMT (stmt_info (S2)) = VS2
133 Operands that are not SSA_NAMEs are data-refs that appear in
134 load/store operations (like 'x[i]' in S1), and are handled differently.
136 Target modeling:
137 =================
138 Currently the only target specific information that is used is the
139 size of the vector (in bytes) - "TARGET_VECTORIZE_UNITS_PER_SIMD_WORD".
140 Targets that can support different sizes of vectors will, for now, need
141 to specify one value for "TARGET_VECTORIZE_UNITS_PER_SIMD_WORD". More
142 flexibility will be added in the future.
144 Since we only vectorize operations whose vector form can be
145 expressed using existing tree codes, to verify that an operation is
146 supported, the vectorizer checks the relevant optab at the relevant
147 machine_mode (e.g., optab_handler (add_optab, V8HImode)). If
148 the value found is CODE_FOR_nothing, then there's no target support, and
149 we can't vectorize the stmt.
151 For additional information on this project see:
152 http://gcc.gnu.org/projects/tree-ssa/vectorization.html
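/* A minimal sketch of the optab query described above; the names used
   (add_optab, V8HImode, optab_handler, CODE_FOR_nothing) are the ones
   already mentioned in this comment, and the snippet is illustrative
   only, not code used by the pass:

     if (optab_handler (add_optab, V8HImode) == CODE_FOR_nothing)
       return false;

   i.e. if there is no insn for a V8HI addition, the addition in the
   example loop above is not vectorizable for this target.  */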
155 static void vect_estimate_min_profitable_iters (loop_vec_info, int *, int *);
156 static stmt_vec_info vect_is_simple_reduction (loop_vec_info, stmt_vec_info,
157 bool *, bool *);
159 /* Subroutine of vect_determine_vf_for_stmt that handles only one
160 statement. VECTYPE_MAYBE_SET_P is true if STMT_VINFO_VECTYPE
161 may already be set for general statements (not just data refs). */
163 static opt_result
164 vect_determine_vf_for_stmt_1 (vec_info *vinfo, stmt_vec_info stmt_info,
165 bool vectype_maybe_set_p,
166 poly_uint64 *vf)
168 gimple *stmt = stmt_info->stmt;
170 if ((!STMT_VINFO_RELEVANT_P (stmt_info)
171 && !STMT_VINFO_LIVE_P (stmt_info))
172 || gimple_clobber_p (stmt))
174 if (dump_enabled_p ())
175 dump_printf_loc (MSG_NOTE, vect_location, "skip.\n");
176 return opt_result::success ();
179 tree stmt_vectype, nunits_vectype;
180 opt_result res = vect_get_vector_types_for_stmt (vinfo, stmt_info,
181 &stmt_vectype,
182 &nunits_vectype);
183 if (!res)
184 return res;
186 if (stmt_vectype)
188 if (STMT_VINFO_VECTYPE (stmt_info))
189 /* The only case when a vectype had been already set is for stmts
190 that contain a data ref, or for "pattern-stmts" (stmts generated
191 by the vectorizer to represent/replace a certain idiom). */
192 gcc_assert ((STMT_VINFO_DATA_REF (stmt_info)
193 || vectype_maybe_set_p)
194 && STMT_VINFO_VECTYPE (stmt_info) == stmt_vectype);
195 else
196 STMT_VINFO_VECTYPE (stmt_info) = stmt_vectype;
199 if (nunits_vectype)
200 vect_update_max_nunits (vf, nunits_vectype);
202 return opt_result::success ();
205 /* Subroutine of vect_determine_vectorization_factor. Set the vector
206 types of STMT_INFO and all attached pattern statements and update
207 the vectorization factor VF accordingly. Return true on success
208 or false if something prevented vectorization. */
210 static opt_result
211 vect_determine_vf_for_stmt (vec_info *vinfo,
212 stmt_vec_info stmt_info, poly_uint64 *vf)
214 if (dump_enabled_p ())
215 dump_printf_loc (MSG_NOTE, vect_location, "==> examining statement: %G",
216 stmt_info->stmt);
217 opt_result res = vect_determine_vf_for_stmt_1 (vinfo, stmt_info, false, vf);
218 if (!res)
219 return res;
221 if (STMT_VINFO_IN_PATTERN_P (stmt_info)
222 && STMT_VINFO_RELATED_STMT (stmt_info))
224 gimple *pattern_def_seq = STMT_VINFO_PATTERN_DEF_SEQ (stmt_info);
225 stmt_info = STMT_VINFO_RELATED_STMT (stmt_info);
227 /* If a pattern statement has def stmts, analyze them too. */
228 for (gimple_stmt_iterator si = gsi_start (pattern_def_seq);
229 !gsi_end_p (si); gsi_next (&si))
231 stmt_vec_info def_stmt_info = vinfo->lookup_stmt (gsi_stmt (si));
232 if (dump_enabled_p ())
233 dump_printf_loc (MSG_NOTE, vect_location,
234 "==> examining pattern def stmt: %G",
235 def_stmt_info->stmt);
236 res = vect_determine_vf_for_stmt_1 (vinfo, def_stmt_info, true, vf);
237 if (!res)
238 return res;
241 if (dump_enabled_p ())
242 dump_printf_loc (MSG_NOTE, vect_location,
243 "==> examining pattern statement: %G",
244 stmt_info->stmt);
245 res = vect_determine_vf_for_stmt_1 (vinfo, stmt_info, true, vf);
246 if (!res)
247 return res;
250 return opt_result::success ();
253 /* Function vect_determine_vectorization_factor
255 Determine the vectorization factor (VF). VF is the number of data elements
256 that are operated upon in parallel in a single iteration of the vectorized
257 loop. For example, when vectorizing a loop that operates on 4-byte elements,
258 on a target with a vector size (VS) of 16 bytes, the VF is set to 4, since 4
259 elements can fit in a single vector register.
261 We currently support vectorization of loops in which all types operated upon
262 are of the same size. Therefore this function currently sets VF according to
263 the size of the types operated upon, and fails if there are multiple sizes
264 in the loop.
266 VF is also the factor by which the loop iterations are strip-mined, e.g.:
267 original loop:
268 for (i=0; i<N; i++){
269 a[i] = b[i] + c[i];
272 vectorized loop:
273 for (i=0; i<N; i+=VF){
274 a[i:VF] = b[i:VF] + c[i:VF];
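/* A concrete instance of the strip-mining above (a sketch with assumed
   numbers): with 16-byte vectors and 4-byte 'int' elements the VF is 4,
   so

     int a[1000], b[1000], c[1000];
     for (i = 0; i < 1000; i++)
       a[i] = b[i] + c[i];

   becomes 250 vector iterations:

     for (i = 0; i < 1000; i += 4)
       a[i:4] = b[i:4] + c[i:4];  */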
278 static opt_result
279 vect_determine_vectorization_factor (loop_vec_info loop_vinfo)
281 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
282 basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
283 unsigned nbbs = loop->num_nodes;
284 poly_uint64 vectorization_factor = 1;
285 tree scalar_type = NULL_TREE;
286 gphi *phi;
287 tree vectype;
288 stmt_vec_info stmt_info;
289 unsigned i;
291 DUMP_VECT_SCOPE ("vect_determine_vectorization_factor");
293 for (i = 0; i < nbbs; i++)
295 basic_block bb = bbs[i];
297 for (gphi_iterator si = gsi_start_phis (bb); !gsi_end_p (si);
298 gsi_next (&si))
300 phi = si.phi ();
301 stmt_info = loop_vinfo->lookup_stmt (phi);
302 if (dump_enabled_p ())
303 dump_printf_loc (MSG_NOTE, vect_location, "==> examining phi: %G",
304 phi);
306 gcc_assert (stmt_info);
308 if (STMT_VINFO_RELEVANT_P (stmt_info)
309 || STMT_VINFO_LIVE_P (stmt_info))
311 gcc_assert (!STMT_VINFO_VECTYPE (stmt_info));
312 scalar_type = TREE_TYPE (PHI_RESULT (phi));
314 if (dump_enabled_p ())
315 dump_printf_loc (MSG_NOTE, vect_location,
316 "get vectype for scalar type: %T\n",
317 scalar_type);
319 vectype = get_vectype_for_scalar_type (loop_vinfo, scalar_type);
320 if (!vectype)
321 return opt_result::failure_at (phi,
322 "not vectorized: unsupported "
323 "data-type %T\n",
324 scalar_type);
325 STMT_VINFO_VECTYPE (stmt_info) = vectype;
327 if (dump_enabled_p ())
328 dump_printf_loc (MSG_NOTE, vect_location, "vectype: %T\n",
329 vectype);
331 if (dump_enabled_p ())
333 dump_printf_loc (MSG_NOTE, vect_location, "nunits = ");
334 dump_dec (MSG_NOTE, TYPE_VECTOR_SUBPARTS (vectype));
335 dump_printf (MSG_NOTE, "\n");
338 vect_update_max_nunits (&vectorization_factor, vectype);
342 for (gimple_stmt_iterator si = gsi_start_bb (bb); !gsi_end_p (si);
343 gsi_next (&si))
345 if (is_gimple_debug (gsi_stmt (si)))
346 continue;
347 stmt_info = loop_vinfo->lookup_stmt (gsi_stmt (si));
348 opt_result res
349 = vect_determine_vf_for_stmt (loop_vinfo,
350 stmt_info, &vectorization_factor);
351 if (!res)
352 return res;
356 /* TODO: Analyze cost. Decide if worth while to vectorize. */
357 if (dump_enabled_p ())
359 dump_printf_loc (MSG_NOTE, vect_location, "vectorization factor = ");
360 dump_dec (MSG_NOTE, vectorization_factor);
361 dump_printf (MSG_NOTE, "\n");
364 if (known_le (vectorization_factor, 1U))
365 return opt_result::failure_at (vect_location,
366 "not vectorized: unsupported data-type\n");
367 LOOP_VINFO_VECT_FACTOR (loop_vinfo) = vectorization_factor;
368 return opt_result::success ();
372 /* Function vect_is_simple_iv_evolution.
374 FORNOW: A simple evolution of an induction variable in the loop is
375 considered a polynomial evolution. */
377 static bool
378 vect_is_simple_iv_evolution (unsigned loop_nb, tree access_fn, tree * init,
379 tree * step)
381 tree init_expr;
382 tree step_expr;
383 tree evolution_part = evolution_part_in_loop_num (access_fn, loop_nb);
384 basic_block bb;
386 /* When there is no evolution in this loop, the evolution function
387 is not "simple". */
388 if (evolution_part == NULL_TREE)
389 return false;
391 /* When the evolution is a polynomial of degree >= 2
392 the evolution function is not "simple". */
393 if (tree_is_chrec (evolution_part))
394 return false;
396 step_expr = evolution_part;
397 init_expr = unshare_expr (initial_condition_in_loop_num (access_fn, loop_nb));
399 if (dump_enabled_p ())
400 dump_printf_loc (MSG_NOTE, vect_location, "step: %T, init: %T\n",
401 step_expr, init_expr);
403 *init = init_expr;
404 *step = step_expr;
406 if (TREE_CODE (step_expr) != INTEGER_CST
407 && (TREE_CODE (step_expr) != SSA_NAME
408 || ((bb = gimple_bb (SSA_NAME_DEF_STMT (step_expr)))
409 && flow_bb_inside_loop_p (get_loop (cfun, loop_nb), bb))
410 || (!INTEGRAL_TYPE_P (TREE_TYPE (step_expr))
411 && (!SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr))
412 || !flag_associative_math)))
413 && (TREE_CODE (step_expr) != REAL_CST
414 || !flag_associative_math))
416 if (dump_enabled_p ())
417 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
418 "step unknown.\n");
419 return false;
422 return true;
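/* For illustration (a sketch using the usual scev chrec notation and an
   assumed loop number of 1): in

     for (i = 0; i < n; i++)
       sum = sum + a[i];

   the access function of 'i' is {0, +, 1}_1, a simple evolution with
   init 0 and step 1.  An evolution whose step is itself a chrec, for
   example {0, +, {1, +, 1}_1}_1, has degree >= 2 and is rejected by the
   tree_is_chrec check above.  */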
425 /* Return true if PHI, described by STMT_INFO, is the inner PHI in
426 what we are assuming is a double reduction. For example, given
427 a structure like this:
429 outer1:
430 x_1 = PHI <x_4(outer2), ...>;
433 inner:
434 x_2 = PHI <x_1(outer1), ...>;
436 x_3 = ...;
439 outer2:
440 x_4 = PHI <x_3(inner)>;
443 outer loop analysis would treat x_1 as a double reduction phi and
444 this function would then return true for x_2. */
446 static bool
447 vect_inner_phi_in_double_reduction_p (loop_vec_info loop_vinfo, gphi *phi)
449 use_operand_p use_p;
450 ssa_op_iter op_iter;
451 FOR_EACH_PHI_ARG (use_p, phi, op_iter, SSA_OP_USE)
452 if (stmt_vec_info def_info = loop_vinfo->lookup_def (USE_FROM_PTR (use_p)))
453 if (STMT_VINFO_DEF_TYPE (def_info) == vect_double_reduction_def)
454 return true;
455 return false;
458 /* Function vect_analyze_scalar_cycles_1.
460 Examine the cross iteration def-use cycles of scalar variables
461 in LOOP. LOOP_VINFO represents the loop that is now being
462 considered for vectorization (can be LOOP, or an outer-loop
463 enclosing LOOP). */
465 static void
466 vect_analyze_scalar_cycles_1 (loop_vec_info loop_vinfo, class loop *loop)
468 basic_block bb = loop->header;
469 tree init, step;
470 auto_vec<stmt_vec_info, 64> worklist;
471 gphi_iterator gsi;
472 bool double_reduc, reduc_chain;
474 DUMP_VECT_SCOPE ("vect_analyze_scalar_cycles");
476 /* First - identify all inductions. Reduction detection assumes that all the
477 inductions have been identified; therefore, this order must not be
478 changed. */
479 for (gsi = gsi_start_phis (bb); !gsi_end_p (gsi); gsi_next (&gsi))
481 gphi *phi = gsi.phi ();
482 tree access_fn = NULL;
483 tree def = PHI_RESULT (phi);
484 stmt_vec_info stmt_vinfo = loop_vinfo->lookup_stmt (phi);
486 if (dump_enabled_p ())
487 dump_printf_loc (MSG_NOTE, vect_location, "Analyze phi: %G", phi);
489 /* Skip virtual phi's. The data dependences that are associated with
490 virtual defs/uses (i.e., memory accesses) are analyzed elsewhere. */
491 if (virtual_operand_p (def))
492 continue;
494 STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_unknown_def_type;
496 /* Analyze the evolution function. */
497 access_fn = analyze_scalar_evolution (loop, def);
498 if (access_fn)
500 STRIP_NOPS (access_fn);
501 if (dump_enabled_p ())
502 dump_printf_loc (MSG_NOTE, vect_location,
503 "Access function of PHI: %T\n", access_fn);
504 STMT_VINFO_LOOP_PHI_EVOLUTION_BASE_UNCHANGED (stmt_vinfo)
505 = initial_condition_in_loop_num (access_fn, loop->num);
506 STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_vinfo)
507 = evolution_part_in_loop_num (access_fn, loop->num);
510 if (!access_fn
511 || vect_inner_phi_in_double_reduction_p (loop_vinfo, phi)
512 || !vect_is_simple_iv_evolution (loop->num, access_fn, &init, &step)
513 || (LOOP_VINFO_LOOP (loop_vinfo) != loop
514 && TREE_CODE (step) != INTEGER_CST))
516 worklist.safe_push (stmt_vinfo);
517 continue;
520 gcc_assert (STMT_VINFO_LOOP_PHI_EVOLUTION_BASE_UNCHANGED (stmt_vinfo)
521 != NULL_TREE);
522 gcc_assert (STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_vinfo) != NULL_TREE);
524 if (dump_enabled_p ())
525 dump_printf_loc (MSG_NOTE, vect_location, "Detected induction.\n");
526 STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_induction_def;
530 /* Second - identify all reductions and nested cycles. */
531 while (worklist.length () > 0)
533 stmt_vec_info stmt_vinfo = worklist.pop ();
534 gphi *phi = as_a <gphi *> (stmt_vinfo->stmt);
535 tree def = PHI_RESULT (phi);
537 if (dump_enabled_p ())
538 dump_printf_loc (MSG_NOTE, vect_location, "Analyze phi: %G", phi);
540 gcc_assert (!virtual_operand_p (def)
541 && STMT_VINFO_DEF_TYPE (stmt_vinfo) == vect_unknown_def_type);
543 stmt_vec_info reduc_stmt_info
544 = vect_is_simple_reduction (loop_vinfo, stmt_vinfo, &double_reduc,
545 &reduc_chain);
546 if (reduc_stmt_info)
548 STMT_VINFO_REDUC_DEF (stmt_vinfo) = reduc_stmt_info;
549 STMT_VINFO_REDUC_DEF (reduc_stmt_info) = stmt_vinfo;
550 if (double_reduc)
552 if (dump_enabled_p ())
553 dump_printf_loc (MSG_NOTE, vect_location,
554 "Detected double reduction.\n");
556 STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_double_reduction_def;
557 STMT_VINFO_DEF_TYPE (reduc_stmt_info) = vect_double_reduction_def;
559 else
561 if (loop != LOOP_VINFO_LOOP (loop_vinfo))
563 if (dump_enabled_p ())
564 dump_printf_loc (MSG_NOTE, vect_location,
565 "Detected vectorizable nested cycle.\n");
567 STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_nested_cycle;
569 else
571 if (dump_enabled_p ())
572 dump_printf_loc (MSG_NOTE, vect_location,
573 "Detected reduction.\n");
575 STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_reduction_def;
576 STMT_VINFO_DEF_TYPE (reduc_stmt_info) = vect_reduction_def;
577 /* Store the reduction cycles for possible vectorization in
578 loop-aware SLP if it was not detected as reduction
579 chain. */
580 if (! reduc_chain)
581 LOOP_VINFO_REDUCTIONS (loop_vinfo).safe_push
582 (reduc_stmt_info);
586 else
587 if (dump_enabled_p ())
588 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
589 "Unknown def-use cycle pattern.\n");
594 /* Function vect_analyze_scalar_cycles.
596 Examine the cross iteration def-use cycles of scalar variables, by
597 analyzing the loop-header PHIs of scalar variables. Classify each
598 cycle as one of the following: invariant, induction, reduction, unknown.
599 We do that for the loop represented by LOOP_VINFO, and also for its
600 inner loop, if it exists.
601 Examples for scalar cycles:
603 Example1: reduction:
605 loop1:
606 for (i=0; i<N; i++)
607 sum += a[i];
609 Example2: induction:
611 loop2:
612 for (i=0; i<N; i++)
613 a[i] = i; */
615 static void
616 vect_analyze_scalar_cycles (loop_vec_info loop_vinfo)
618 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
620 vect_analyze_scalar_cycles_1 (loop_vinfo, loop);
622 /* When vectorizing an outer-loop, the inner-loop is executed sequentially.
623 Reductions in such inner-loop therefore have different properties than
624 the reductions in the nest that gets vectorized:
625 1. When vectorized, they are executed in the same order as in the original
626 scalar loop, so we can't change the order of computation when
627 vectorizing them.
628 2. FIXME: Inner-loop reductions can be used in the inner-loop, so the
629 current checks are too strict. */
631 if (loop->inner)
632 vect_analyze_scalar_cycles_1 (loop_vinfo, loop->inner);
635 /* Transfer group and reduction information from STMT_INFO to its
636 pattern stmt. */
638 static void
639 vect_fixup_reduc_chain (stmt_vec_info stmt_info)
641 stmt_vec_info firstp = STMT_VINFO_RELATED_STMT (stmt_info);
642 stmt_vec_info stmtp;
643 gcc_assert (!REDUC_GROUP_FIRST_ELEMENT (firstp)
644 && REDUC_GROUP_FIRST_ELEMENT (stmt_info));
645 REDUC_GROUP_SIZE (firstp) = REDUC_GROUP_SIZE (stmt_info);
648 stmtp = STMT_VINFO_RELATED_STMT (stmt_info);
649 gcc_checking_assert (STMT_VINFO_DEF_TYPE (stmtp)
650 == STMT_VINFO_DEF_TYPE (stmt_info));
651 REDUC_GROUP_FIRST_ELEMENT (stmtp) = firstp;
652 stmt_info = REDUC_GROUP_NEXT_ELEMENT (stmt_info);
653 if (stmt_info)
654 REDUC_GROUP_NEXT_ELEMENT (stmtp)
655 = STMT_VINFO_RELATED_STMT (stmt_info);
657 while (stmt_info);
660 /* Fixup scalar cycles that now have their stmts detected as patterns. */
662 static void
663 vect_fixup_scalar_cycles_with_patterns (loop_vec_info loop_vinfo)
665 stmt_vec_info first;
666 unsigned i;
668 FOR_EACH_VEC_ELT (LOOP_VINFO_REDUCTION_CHAINS (loop_vinfo), i, first)
669 if (STMT_VINFO_IN_PATTERN_P (first))
671 stmt_vec_info next = REDUC_GROUP_NEXT_ELEMENT (first);
672 while (next)
674 if (! STMT_VINFO_IN_PATTERN_P (next)
675 || STMT_VINFO_REDUC_IDX (STMT_VINFO_RELATED_STMT (next)) == -1)
676 break;
677 next = REDUC_GROUP_NEXT_ELEMENT (next);
679 /* If not all stmts in the chain are patterns, or if we failed
680 to update STMT_VINFO_REDUC_IDX, try to handle the chain
681 without patterns. */
682 if (! next
683 && STMT_VINFO_REDUC_IDX (STMT_VINFO_RELATED_STMT (first)) != -1)
685 vect_fixup_reduc_chain (first);
686 LOOP_VINFO_REDUCTION_CHAINS (loop_vinfo)[i]
687 = STMT_VINFO_RELATED_STMT (first);
692 /* Function vect_get_loop_niters.
694 Determine how many iterations the loop is executed and place it
695 in NUMBER_OF_ITERATIONS. Place the number of latch iterations
696 in NUMBER_OF_ITERATIONSM1. Place the condition under which the
697 niter information holds in ASSUMPTIONS.
699 Return the loop exit condition. */
702 static gcond *
703 vect_get_loop_niters (class loop *loop, tree *assumptions,
704 tree *number_of_iterations, tree *number_of_iterationsm1)
706 edge exit = single_exit (loop);
707 class tree_niter_desc niter_desc;
708 tree niter_assumptions, niter, may_be_zero;
709 gcond *cond = get_loop_exit_condition (loop);
711 *assumptions = boolean_true_node;
712 *number_of_iterationsm1 = chrec_dont_know;
713 *number_of_iterations = chrec_dont_know;
714 DUMP_VECT_SCOPE ("get_loop_niters");
716 if (!exit)
717 return cond;
719 may_be_zero = NULL_TREE;
720 if (!number_of_iterations_exit_assumptions (loop, exit, &niter_desc, NULL)
721 || chrec_contains_undetermined (niter_desc.niter))
722 return cond;
724 niter_assumptions = niter_desc.assumptions;
725 may_be_zero = niter_desc.may_be_zero;
726 niter = niter_desc.niter;
728 if (may_be_zero && integer_zerop (may_be_zero))
729 may_be_zero = NULL_TREE;
731 if (may_be_zero)
733 if (COMPARISON_CLASS_P (may_be_zero))
735 /* Try to combine may_be_zero with assumptions; this can simplify
736 computation of niter expression. */
737 if (niter_assumptions && !integer_nonzerop (niter_assumptions))
738 niter_assumptions = fold_build2 (TRUTH_AND_EXPR, boolean_type_node,
739 niter_assumptions,
740 fold_build1 (TRUTH_NOT_EXPR,
741 boolean_type_node,
742 may_be_zero));
743 else
744 niter = fold_build3 (COND_EXPR, TREE_TYPE (niter), may_be_zero,
745 build_int_cst (TREE_TYPE (niter), 0),
746 rewrite_to_non_trapping_overflow (niter));
748 may_be_zero = NULL_TREE;
750 else if (integer_nonzerop (may_be_zero))
752 *number_of_iterationsm1 = build_int_cst (TREE_TYPE (niter), 0);
753 *number_of_iterations = build_int_cst (TREE_TYPE (niter), 1);
754 return cond;
756 else
757 return cond;
760 *assumptions = niter_assumptions;
761 *number_of_iterationsm1 = niter;
763 /* We want the number of loop header executions which is the number
764 of latch executions plus one.
765 ??? For UINT_MAX latch executions this number overflows to zero
766 for loops like do { n++; } while (n != 0); */
767 if (niter && !chrec_contains_undetermined (niter))
768 niter = fold_build2 (PLUS_EXPR, TREE_TYPE (niter), unshare_expr (niter),
769 build_int_cst (TREE_TYPE (niter), 1));
770 *number_of_iterations = niter;
772 return cond;
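/* Worked example (a sketch with assumed values): for

     i = 0;
     do { a[i] = 0; i++; } while (i < n);

   with n known to be 100, the latch runs 99 times, so
   *NUMBER_OF_ITERATIONSM1 is 99 while *NUMBER_OF_ITERATIONS, the number
   of header executions, is 100.  As the comment above notes, for
   UINT_MAX latch executions the final + 1 wraps to zero.  */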
775 /* Function bb_in_loop_p
777 Used as predicate for dfs order traversal of the loop bbs. */
779 static bool
780 bb_in_loop_p (const_basic_block bb, const void *data)
782 const class loop *const loop = (const class loop *)data;
783 if (flow_bb_inside_loop_p (loop, bb))
784 return true;
785 return false;
789 /* Create and initialize a new loop_vec_info struct for LOOP_IN, as well as
790 stmt_vec_info structs for all the stmts in LOOP_IN. */
792 _loop_vec_info::_loop_vec_info (class loop *loop_in, vec_info_shared *shared)
793 : vec_info (vec_info::loop, init_cost (loop_in), shared),
794 loop (loop_in),
795 bbs (XCNEWVEC (basic_block, loop->num_nodes)),
796 num_itersm1 (NULL_TREE),
797 num_iters (NULL_TREE),
798 num_iters_unchanged (NULL_TREE),
799 num_iters_assumptions (NULL_TREE),
800 th (0),
801 versioning_threshold (0),
802 vectorization_factor (0),
803 max_vectorization_factor (0),
804 mask_skip_niters (NULL_TREE),
805 rgroup_compare_type (NULL_TREE),
806 simd_if_cond (NULL_TREE),
807 unaligned_dr (NULL),
808 peeling_for_alignment (0),
809 ptr_mask (0),
810 ivexpr_map (NULL),
811 scan_map (NULL),
812 slp_unrolling_factor (1),
813 single_scalar_iteration_cost (0),
814 vec_outside_cost (0),
815 vec_inside_cost (0),
816 vectorizable (false),
817 can_use_partial_vectors_p (param_vect_partial_vector_usage != 0),
818 using_partial_vectors_p (false),
819 epil_using_partial_vectors_p (false),
820 peeling_for_gaps (false),
821 peeling_for_niter (false),
822 no_data_dependencies (false),
823 has_mask_store (false),
824 scalar_loop_scaling (profile_probability::uninitialized ()),
825 scalar_loop (NULL),
826 orig_loop_info (NULL)
828 /* CHECKME: We want to visit all BBs before their successors (except for
829 latch blocks, for which this assertion wouldn't hold). In the simple
830 case of the loop forms we allow, a dfs order of the BBs would be the same
831 as reversed postorder traversal, so we are safe. */
833 unsigned int nbbs = dfs_enumerate_from (loop->header, 0, bb_in_loop_p,
834 bbs, loop->num_nodes, loop);
835 gcc_assert (nbbs == loop->num_nodes);
837 for (unsigned int i = 0; i < nbbs; i++)
839 basic_block bb = bbs[i];
840 gimple_stmt_iterator si;
842 for (si = gsi_start_phis (bb); !gsi_end_p (si); gsi_next (&si))
844 gimple *phi = gsi_stmt (si);
845 gimple_set_uid (phi, 0);
846 add_stmt (phi);
849 for (si = gsi_start_bb (bb); !gsi_end_p (si); gsi_next (&si))
851 gimple *stmt = gsi_stmt (si);
852 gimple_set_uid (stmt, 0);
853 if (is_gimple_debug (stmt))
854 continue;
855 add_stmt (stmt);
856 /* If the .GOMP_SIMD_LANE call for the current loop has 3 arguments, the
857 third argument is the #pragma omp simd if (x) condition. When it is 0,
858 the loop shouldn't be vectorized; when it is a non-zero constant, the
859 loop should be vectorized normally; otherwise the loop is versioned, with
860 the vectorized copy used if the condition is non-zero at runtime. */
861 if (loop_in->simduid
862 && is_gimple_call (stmt)
863 && gimple_call_internal_p (stmt)
864 && gimple_call_internal_fn (stmt) == IFN_GOMP_SIMD_LANE
865 && gimple_call_num_args (stmt) >= 3
866 && TREE_CODE (gimple_call_arg (stmt, 0)) == SSA_NAME
867 && (loop_in->simduid
868 == SSA_NAME_VAR (gimple_call_arg (stmt, 0))))
870 tree arg = gimple_call_arg (stmt, 2);
871 if (integer_zerop (arg) || TREE_CODE (arg) == SSA_NAME)
872 simd_if_cond = arg;
873 else
874 gcc_assert (integer_nonzerop (arg));
879 epilogue_vinfos.create (6);
882 /* Free all levels of rgroup CONTROLS. */
884 void
885 release_vec_loop_controls (vec<rgroup_controls> *controls)
887 rgroup_controls *rgc;
888 unsigned int i;
889 FOR_EACH_VEC_ELT (*controls, i, rgc)
890 rgc->controls.release ();
891 controls->release ();
894 /* Free all memory used by the _loop_vec_info, as well as all the
895 stmt_vec_info structs of all the stmts in the loop. */
897 _loop_vec_info::~_loop_vec_info ()
899 free (bbs);
901 release_vec_loop_controls (&masks);
902 release_vec_loop_controls (&lens);
903 delete ivexpr_map;
904 delete scan_map;
905 epilogue_vinfos.release ();
907 loop->aux = NULL;
910 /* Return an invariant or register for EXPR and emit necessary
911 computations in the LOOP_VINFO loop preheader. */
913 tree
914 cse_and_gimplify_to_preheader (loop_vec_info loop_vinfo, tree expr)
916 if (is_gimple_reg (expr)
917 || is_gimple_min_invariant (expr))
918 return expr;
920 if (! loop_vinfo->ivexpr_map)
921 loop_vinfo->ivexpr_map = new hash_map<tree_operand_hash, tree>;
922 tree &cached = loop_vinfo->ivexpr_map->get_or_insert (expr);
923 if (! cached)
925 gimple_seq stmts = NULL;
926 cached = force_gimple_operand (unshare_expr (expr),
927 &stmts, true, NULL_TREE);
928 if (stmts)
930 edge e = loop_preheader_edge (LOOP_VINFO_LOOP (loop_vinfo));
931 gsi_insert_seq_on_edge_immediate (e, stmts);
934 return cached;
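/* Usage sketch (hypothetical caller and expression, for illustration
   only): a caller that needs an invariant such as 'start + step * vf'
   available as a register in the loop could do

     tree val = cse_and_gimplify_to_preheader (loop_vinfo, expr);

   Any statements needed to compute EXPR are inserted on the preheader
   edge, and a later call with an equal expression returns the same
   cached SSA name instead of emitting the computation again.  */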
937 /* Return true if we can use CMP_TYPE as the comparison type to produce
938 all masks required to mask LOOP_VINFO. */
940 static bool
941 can_produce_all_loop_masks_p (loop_vec_info loop_vinfo, tree cmp_type)
943 rgroup_controls *rgm;
944 unsigned int i;
945 FOR_EACH_VEC_ELT (LOOP_VINFO_MASKS (loop_vinfo), i, rgm)
946 if (rgm->type != NULL_TREE
947 && !direct_internal_fn_supported_p (IFN_WHILE_ULT,
948 cmp_type, rgm->type,
949 OPTIMIZE_FOR_SPEED))
950 return false;
951 return true;
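/* Illustration of the check above (a sketch with assumed types): with
   CMP_TYPE a 64-bit unsigned type and an rgroup whose mask type has 8
   boolean lanes, the target must support IFN_WHILE_ULT for that pair,
   i.e. an operation that computes a mask whose lane L is set iff
   INDEX + L < LIMIT for scalar operands of CMP_TYPE.  If any rgroup
   lacks such support, this function returns false and full masking is
   not used.  */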
954 /* Calculate the maximum number of scalars per iteration for every
955 rgroup in LOOP_VINFO. */
957 static unsigned int
958 vect_get_max_nscalars_per_iter (loop_vec_info loop_vinfo)
960 unsigned int res = 1;
961 unsigned int i;
962 rgroup_controls *rgm;
963 FOR_EACH_VEC_ELT (LOOP_VINFO_MASKS (loop_vinfo), i, rgm)
964 res = MAX (res, rgm->max_nscalars_per_iter);
965 return res;
968 /* Calculate the minimum precision necessary to represent:
970 MAX_NITERS * FACTOR
972 as an unsigned integer, where MAX_NITERS is the maximum number of
973 loop header iterations for the original scalar form of LOOP_VINFO. */
975 static unsigned
976 vect_min_prec_for_max_niters (loop_vec_info loop_vinfo, unsigned int factor)
978 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
980 /* Get the maximum number of iterations that is representable
981 in the counter type. */
982 tree ni_type = TREE_TYPE (LOOP_VINFO_NITERSM1 (loop_vinfo));
983 widest_int max_ni = wi::to_widest (TYPE_MAX_VALUE (ni_type)) + 1;
985 /* Get a more refined estimate for the number of iterations. */
986 widest_int max_back_edges;
987 if (max_loop_iterations (loop, &max_back_edges))
988 max_ni = wi::smin (max_ni, max_back_edges + 1);
990 /* Work out how many bits we need to represent the limit. */
991 return wi::min_precision (max_ni * factor, UNSIGNED);
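/* Worked example (assumed numbers): if the niter type is 32 bits wide
   but max_loop_iterations bounds the number of back edges by 999, then
   MAX_NI is refined to 1000; with FACTOR 2 the limit is 2000, and
   wi::min_precision (2000, UNSIGNED) is 11, so 11 bits suffice.  */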
994 /* True if the loop needs peeling or partial vectors when vectorized. */
996 static bool
997 vect_need_peeling_or_partial_vectors_p (loop_vec_info loop_vinfo)
999 unsigned HOST_WIDE_INT const_vf;
1000 HOST_WIDE_INT max_niter
1001 = likely_max_stmt_executions_int (LOOP_VINFO_LOOP (loop_vinfo));
1003 unsigned th = LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo);
1004 if (!th && LOOP_VINFO_ORIG_LOOP_INFO (loop_vinfo))
1005 th = LOOP_VINFO_COST_MODEL_THRESHOLD (LOOP_VINFO_ORIG_LOOP_INFO
1006 (loop_vinfo));
1008 if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
1009 && LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) >= 0)
1011 /* Work out the (constant) number of iterations that need to be
1012 peeled for reasons other than niters. */
1013 unsigned int peel_niter = LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo);
1014 if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo))
1015 peel_niter += 1;
1016 if (!multiple_p (LOOP_VINFO_INT_NITERS (loop_vinfo) - peel_niter,
1017 LOOP_VINFO_VECT_FACTOR (loop_vinfo)))
1018 return true;
1020 else if (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo)
1021 /* ??? When peeling for gaps but not alignment, we could
1022 try to check whether the (variable) niters is known to be
1023 VF * N + 1. That's something of a niche case though. */
1024 || LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo)
1025 || !LOOP_VINFO_VECT_FACTOR (loop_vinfo).is_constant (&const_vf)
1026 || ((tree_ctz (LOOP_VINFO_NITERS (loop_vinfo))
1027 < (unsigned) exact_log2 (const_vf))
1028 /* In case of versioning, check if the maximum number of
1029 iterations is greater than th. If they are identical,
1030 the epilogue is unnecessary. */
1031 && (!LOOP_REQUIRES_VERSIONING (loop_vinfo)
1032 || ((unsigned HOST_WIDE_INT) max_niter
1033 > (th / const_vf) * const_vf))))
1034 return true;
1036 return false;
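/* Worked example (assumed numbers): with a known iteration count of 10,
   no peeling for alignment or gaps, and a vectorization factor of 4,
   the multiple_p check above fails (10 is not a multiple of 4), so
   peeling or partial vectors are needed for the 2 leftover iterations;
   with a count of 12 the function returns false.  */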
1039 /* Each statement in LOOP_VINFO can be masked where necessary. Check
1040 whether we can actually generate the masks required. Return true if so,
1041 storing the type of the scalar IV in LOOP_VINFO_RGROUP_COMPARE_TYPE. */
1043 static bool
1044 vect_verify_full_masking (loop_vec_info loop_vinfo)
1046 unsigned int min_ni_width;
1047 unsigned int max_nscalars_per_iter
1048 = vect_get_max_nscalars_per_iter (loop_vinfo);
1050 /* Use a normal loop if there are no statements that need masking.
1051 This only happens in rare degenerate cases: it means that the loop
1052 has no loads, no stores, and no live-out values. */
1053 if (LOOP_VINFO_MASKS (loop_vinfo).is_empty ())
1054 return false;
1056 /* Work out how many bits we need to represent the limit. */
1057 min_ni_width
1058 = vect_min_prec_for_max_niters (loop_vinfo, max_nscalars_per_iter);
1060 /* Find a scalar mode for which WHILE_ULT is supported. */
1061 opt_scalar_int_mode cmp_mode_iter;
1062 tree cmp_type = NULL_TREE;
1063 tree iv_type = NULL_TREE;
1064 widest_int iv_limit = vect_iv_limit_for_partial_vectors (loop_vinfo);
1065 unsigned int iv_precision = UINT_MAX;
1067 if (iv_limit != -1)
1068 iv_precision = wi::min_precision (iv_limit * max_nscalars_per_iter,
1069 UNSIGNED);
1071 FOR_EACH_MODE_IN_CLASS (cmp_mode_iter, MODE_INT)
1073 unsigned int cmp_bits = GET_MODE_BITSIZE (cmp_mode_iter.require ());
1074 if (cmp_bits >= min_ni_width
1075 && targetm.scalar_mode_supported_p (cmp_mode_iter.require ()))
1077 tree this_type = build_nonstandard_integer_type (cmp_bits, true);
1078 if (this_type
1079 && can_produce_all_loop_masks_p (loop_vinfo, this_type))
1081 /* Although we could stop as soon as we find a valid mode,
1082 there are at least two reasons why that's not always the
1083 best choice:
1085 - An IV that's Pmode or wider is more likely to be reusable
1086 in address calculations than an IV that's narrower than
1087 Pmode.
1089 - Doing the comparison in IV_PRECISION or wider allows
1090 a natural 0-based IV, whereas using a narrower comparison
1091 type requires mitigations against wrap-around.
1093 Conversely, if the IV limit is variable, doing the comparison
1094 in a wider type than the original type can introduce
1095 unnecessary extensions, so picking the widest valid mode
1096 is not always a good choice either.
1098 Here we prefer the first IV type that's Pmode or wider,
1099 and the first comparison type that's IV_PRECISION or wider.
1100 (The comparison type must be no wider than the IV type,
1101 to avoid extensions in the vector loop.)
1103 ??? We might want to try continuing beyond Pmode for ILP32
1104 targets if CMP_BITS < IV_PRECISION. */
1105 iv_type = this_type;
1106 if (!cmp_type || iv_precision > TYPE_PRECISION (cmp_type))
1107 cmp_type = this_type;
1108 if (cmp_bits >= GET_MODE_BITSIZE (Pmode))
1109 break;
1114 if (!cmp_type)
1115 return false;
1117 LOOP_VINFO_RGROUP_COMPARE_TYPE (loop_vinfo) = cmp_type;
1118 LOOP_VINFO_RGROUP_IV_TYPE (loop_vinfo) = iv_type;
1119 return true;
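/* Example outcome (a sketch with assumed target properties): on a
   target with 64-bit Pmode where the required masks can be produced for
   16-, 32- and 64-bit comparison types and IV_PRECISION is 32, the loop
   above ends with CMP_TYPE a 32-bit unsigned type (the first one at
   least IV_PRECISION wide) and IV_TYPE a 64-bit unsigned type (the
   first one at least Pmode wide).  */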
1122 /* Check whether we can use vector access with length based on precision
1123 comparison. So far, to keep it simple, we only allow the case that the
1124 precision of the target supported length is larger than the precision
1125 required by loop niters. */
1127 static bool
1128 vect_verify_loop_lens (loop_vec_info loop_vinfo)
1130 if (LOOP_VINFO_LENS (loop_vinfo).is_empty ())
1131 return false;
1133 unsigned int max_nitems_per_iter = 1;
1134 unsigned int i;
1135 rgroup_controls *rgl;
1136 /* Find the maximum number of items per iteration for every rgroup. */
1137 FOR_EACH_VEC_ELT (LOOP_VINFO_LENS (loop_vinfo), i, rgl)
1139 unsigned nitems_per_iter = rgl->max_nscalars_per_iter * rgl->factor;
1140 max_nitems_per_iter = MAX (max_nitems_per_iter, nitems_per_iter);
1143 /* Work out how many bits we need to represent the length limit. */
1144 unsigned int min_ni_prec
1145 = vect_min_prec_for_max_niters (loop_vinfo, max_nitems_per_iter);
1147 /* Now use the maximum of the precisions below for one suitable IV type:
1148 - the IV's natural precision
1149 - the precision needed to hold: the maximum number of scalar
1150 iterations multiplied by the scale factor (min_ni_prec above)
1151 - the Pmode precision
1153 If min_ni_prec is less than the precision of the current niters,
1154 we prefer to still use the niters type. Prefer to use a Pmode or
1155 wider IV to avoid narrow conversions. */
1157 unsigned int ni_prec
1158 = TYPE_PRECISION (TREE_TYPE (LOOP_VINFO_NITERS (loop_vinfo)));
1159 min_ni_prec = MAX (min_ni_prec, ni_prec);
1160 min_ni_prec = MAX (min_ni_prec, GET_MODE_BITSIZE (Pmode));
1162 tree iv_type = NULL_TREE;
1163 opt_scalar_int_mode tmode_iter;
1164 FOR_EACH_MODE_IN_CLASS (tmode_iter, MODE_INT)
1166 scalar_mode tmode = tmode_iter.require ();
1167 unsigned int tbits = GET_MODE_BITSIZE (tmode);
1169 /* ??? Do we really want to construct one IV whose precision exceeds
1170 BITS_PER_WORD? */
1171 if (tbits > BITS_PER_WORD)
1172 break;
1174 /* Find the first available standard integral type. */
1175 if (tbits >= min_ni_prec && targetm.scalar_mode_supported_p (tmode))
1177 iv_type = build_nonstandard_integer_type (tbits, true);
1178 break;
1182 if (!iv_type)
1184 if (dump_enabled_p ())
1185 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1186 "can't vectorize with length-based partial vectors"
1187 " because there is no suitable iv type.\n");
1188 return false;
1191 LOOP_VINFO_RGROUP_COMPARE_TYPE (loop_vinfo) = iv_type;
1192 LOOP_VINFO_RGROUP_IV_TYPE (loop_vinfo) = iv_type;
1194 return true;
1197 /* Calculate the cost of one scalar iteration of the loop. */
1198 static void
1199 vect_compute_single_scalar_iteration_cost (loop_vec_info loop_vinfo)
1201 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1202 basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
1203 int nbbs = loop->num_nodes, factor;
1204 int innerloop_iters, i;
1206 DUMP_VECT_SCOPE ("vect_compute_single_scalar_iteration_cost");
1208 /* Gather costs for statements in the scalar loop. */
1210 /* FORNOW. */
1211 innerloop_iters = 1;
1212 if (loop->inner)
1213 innerloop_iters = 50; /* FIXME */
1215 for (i = 0; i < nbbs; i++)
1217 gimple_stmt_iterator si;
1218 basic_block bb = bbs[i];
1220 if (bb->loop_father == loop->inner)
1221 factor = innerloop_iters;
1222 else
1223 factor = 1;
1225 for (si = gsi_start_bb (bb); !gsi_end_p (si); gsi_next (&si))
1227 gimple *stmt = gsi_stmt (si);
1228 stmt_vec_info stmt_info = loop_vinfo->lookup_stmt (stmt);
1230 if (!is_gimple_assign (stmt) && !is_gimple_call (stmt))
1231 continue;
1233 /* Skip stmts that are not vectorized inside the loop. */
1234 stmt_vec_info vstmt_info = vect_stmt_to_vectorize (stmt_info);
1235 if (!STMT_VINFO_RELEVANT_P (vstmt_info)
1236 && (!STMT_VINFO_LIVE_P (vstmt_info)
1237 || !VECTORIZABLE_CYCLE_DEF
1238 (STMT_VINFO_DEF_TYPE (vstmt_info))))
1239 continue;
1241 vect_cost_for_stmt kind;
1242 if (STMT_VINFO_DATA_REF (stmt_info))
1244 if (DR_IS_READ (STMT_VINFO_DATA_REF (stmt_info)))
1245 kind = scalar_load;
1246 else
1247 kind = scalar_store;
1249 else if (vect_nop_conversion_p (stmt_info))
1250 continue;
1251 else
1252 kind = scalar_stmt;
1254 record_stmt_cost (&LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo),
1255 factor, kind, stmt_info, 0, vect_prologue);
1259 /* Now accumulate cost. */
1260 void *target_cost_data = init_cost (loop);
1261 stmt_info_for_cost *si;
1262 int j;
1263 FOR_EACH_VEC_ELT (LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo),
1264 j, si)
1265 (void) add_stmt_cost (loop_vinfo, target_cost_data, si->count,
1266 si->kind, si->stmt_info, si->vectype,
1267 si->misalign, vect_body);
1268 unsigned dummy, body_cost = 0;
1269 finish_cost (target_cost_data, &dummy, &body_cost, &dummy);
1270 destroy_cost_data (target_cost_data);
1271 LOOP_VINFO_SINGLE_SCALAR_ITERATION_COST (loop_vinfo) = body_cost;
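/* Illustration (a sketch with an assumed loop body): for a scalar body
   consisting of a load from b[i], an add, and a store to a[i], the loop
   above records one scalar_load, one scalar_stmt and one scalar_store,
   each with factor 1; the target cost hooks then turn those counts into
   the single scalar iteration cost used by the profitability checks
   later on.  */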
1275 /* Function vect_analyze_loop_form_1.
1277 Verify that certain CFG restrictions hold, including:
1278 - the loop has a pre-header
1279 - the loop has a single entry and exit
1280 - the loop exit condition is simple enough
1281 - the number of iterations can be analyzed, i.e., a countable loop. The
1282 niter could be analyzed under some assumptions. */
1284 opt_result
1285 vect_analyze_loop_form_1 (class loop *loop, gcond **loop_cond,
1286 tree *assumptions, tree *number_of_iterationsm1,
1287 tree *number_of_iterations, gcond **inner_loop_cond)
1289 DUMP_VECT_SCOPE ("vect_analyze_loop_form");
1291 /* Different restrictions apply when we are considering an inner-most loop,
1292 vs. an outer (nested) loop.
1293 (FORNOW. May want to relax some of these restrictions in the future). */
1295 if (!loop->inner)
1297 /* Inner-most loop. We currently require that the number of BBs is
1298 exactly 2 (the header and latch). Vectorizable inner-most loops
1299 look like this:
1301 (pre-header)
1303 header <--------+
1304 | | |
1305 | +--> latch --+
1307 (exit-bb) */
1309 if (loop->num_nodes != 2)
1310 return opt_result::failure_at (vect_location,
1311 "not vectorized:"
1312 " control flow in loop.\n");
1314 if (empty_block_p (loop->header))
1315 return opt_result::failure_at (vect_location,
1316 "not vectorized: empty loop.\n");
1318 else
1320 class loop *innerloop = loop->inner;
1321 edge entryedge;
1323 /* Nested loop. We currently require that the loop is doubly-nested,
1324 contains a single inner loop, and the number of BBs is exactly 5.
1325 Vectorizable outer-loops look like this:
1327 (pre-header)
1329 header <---+
1331 inner-loop |
1333 tail ------+
1335 (exit-bb)
1337 The inner-loop has the properties expected of inner-most loops
1338 as described above. */
1340 if ((loop->inner)->inner || (loop->inner)->next)
1341 return opt_result::failure_at (vect_location,
1342 "not vectorized:"
1343 " multiple nested loops.\n");
1345 if (loop->num_nodes != 5)
1346 return opt_result::failure_at (vect_location,
1347 "not vectorized:"
1348 " control flow in loop.\n");
1350 entryedge = loop_preheader_edge (innerloop);
1351 if (entryedge->src != loop->header
1352 || !single_exit (innerloop)
1353 || single_exit (innerloop)->dest != EDGE_PRED (loop->latch, 0)->src)
1354 return opt_result::failure_at (vect_location,
1355 "not vectorized:"
1356 " unsupported outerloop form.\n");
1358 /* Analyze the inner-loop. */
1359 tree inner_niterm1, inner_niter, inner_assumptions;
1360 opt_result res
1361 = vect_analyze_loop_form_1 (loop->inner, inner_loop_cond,
1362 &inner_assumptions, &inner_niterm1,
1363 &inner_niter, NULL);
1364 if (!res)
1366 if (dump_enabled_p ())
1367 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1368 "not vectorized: Bad inner loop.\n");
1369 return res;
1372 /* Don't support analyzing niter under assumptions for inner
1373 loop. */
1374 if (!integer_onep (inner_assumptions))
1375 return opt_result::failure_at (vect_location,
1376 "not vectorized: Bad inner loop.\n");
1378 if (!expr_invariant_in_loop_p (loop, inner_niter))
1379 return opt_result::failure_at (vect_location,
1380 "not vectorized: inner-loop count not"
1381 " invariant.\n");
1383 if (dump_enabled_p ())
1384 dump_printf_loc (MSG_NOTE, vect_location,
1385 "Considering outer-loop vectorization.\n");
1388 if (!single_exit (loop))
1389 return opt_result::failure_at (vect_location,
1390 "not vectorized: multiple exits.\n");
1391 if (EDGE_COUNT (loop->header->preds) != 2)
1392 return opt_result::failure_at (vect_location,
1393 "not vectorized:"
1394 " too many incoming edges.\n");
1396 /* We assume that the loop exit condition is at the end of the loop, i.e.,
1397 that the loop is represented as a do-while (with a proper if-guard
1398 before the loop if needed), where the loop header contains all the
1399 executable statements, and the latch is empty. */
1400 if (!empty_block_p (loop->latch)
1401 || !gimple_seq_empty_p (phi_nodes (loop->latch)))
1402 return opt_result::failure_at (vect_location,
1403 "not vectorized: latch block not empty.\n");
1405 /* Make sure the exit is not abnormal. */
1406 edge e = single_exit (loop);
1407 if (e->flags & EDGE_ABNORMAL)
1408 return opt_result::failure_at (vect_location,
1409 "not vectorized:"
1410 " abnormal loop exit edge.\n");
1412 *loop_cond = vect_get_loop_niters (loop, assumptions, number_of_iterations,
1413 number_of_iterationsm1);
1414 if (!*loop_cond)
1415 return opt_result::failure_at
1416 (vect_location,
1417 "not vectorized: complicated exit condition.\n");
1419 if (integer_zerop (*assumptions)
1420 || !*number_of_iterations
1421 || chrec_contains_undetermined (*number_of_iterations))
1422 return opt_result::failure_at
1423 (*loop_cond,
1424 "not vectorized: number of iterations cannot be computed.\n");
1426 if (integer_zerop (*number_of_iterations))
1427 return opt_result::failure_at
1428 (*loop_cond,
1429 "not vectorized: number of iterations = 0.\n");
1431 return opt_result::success ();
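/* Example of a rejected form (a sketch): an inner-most loop with an
   early exit, e.g.

     for (i = 0; i < n; i++)
       {
         if (a[i] == key)
           break;
         b[i] = 0;
       }

   has more than two basic blocks and more than one exit, so it fails
   the checks above; similarly, a loop whose latch block contains
   statements fails the empty-latch check.  */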
1434 /* Analyze LOOP form and return a loop_vec_info if it is of suitable form. */
1436 opt_loop_vec_info
1437 vect_analyze_loop_form (class loop *loop, vec_info_shared *shared)
1439 tree assumptions, number_of_iterations, number_of_iterationsm1;
1440 gcond *loop_cond, *inner_loop_cond = NULL;
1442 opt_result res
1443 = vect_analyze_loop_form_1 (loop, &loop_cond,
1444 &assumptions, &number_of_iterationsm1,
1445 &number_of_iterations, &inner_loop_cond);
1446 if (!res)
1447 return opt_loop_vec_info::propagate_failure (res);
1449 loop_vec_info loop_vinfo = new _loop_vec_info (loop, shared);
1450 LOOP_VINFO_NITERSM1 (loop_vinfo) = number_of_iterationsm1;
1451 LOOP_VINFO_NITERS (loop_vinfo) = number_of_iterations;
1452 LOOP_VINFO_NITERS_UNCHANGED (loop_vinfo) = number_of_iterations;
1453 if (!integer_onep (assumptions))
1455 /* We consider vectorizing this loop by versioning it under
1456 some assumptions. In order to do this, we need to clear
1457 existing information computed by scev and niter analyzer. */
1458 scev_reset_htab ();
1459 free_numbers_of_iterations_estimates (loop);
1460 /* Also set flag for this loop so that following scev and niter
1461 analysis are done under the assumptions. */
1462 loop_constraint_set (loop, LOOP_C_FINITE);
1463 /* Also record the assumptions for versioning. */
1464 LOOP_VINFO_NITERS_ASSUMPTIONS (loop_vinfo) = assumptions;
1467 if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
1469 if (dump_enabled_p ())
1471 dump_printf_loc (MSG_NOTE, vect_location,
1472 "Symbolic number of iterations is ");
1473 dump_generic_expr (MSG_NOTE, TDF_DETAILS, number_of_iterations);
1474 dump_printf (MSG_NOTE, "\n");
1478 stmt_vec_info loop_cond_info = loop_vinfo->lookup_stmt (loop_cond);
1479 STMT_VINFO_TYPE (loop_cond_info) = loop_exit_ctrl_vec_info_type;
1480 if (inner_loop_cond)
1482 stmt_vec_info inner_loop_cond_info
1483 = loop_vinfo->lookup_stmt (inner_loop_cond);
1484 STMT_VINFO_TYPE (inner_loop_cond_info) = loop_exit_ctrl_vec_info_type;
1487 gcc_assert (!loop->aux);
1488 loop->aux = loop_vinfo;
1489 return opt_loop_vec_info::success (loop_vinfo);
1494 /* Scan the loop stmts and dependent on whether there are any (non-)SLP
1495 statements update the vectorization factor. */
1497 static void
1498 vect_update_vf_for_slp (loop_vec_info loop_vinfo)
1500 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1501 basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
1502 int nbbs = loop->num_nodes;
1503 poly_uint64 vectorization_factor;
1504 int i;
1506 DUMP_VECT_SCOPE ("vect_update_vf_for_slp");
1508 vectorization_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
1509 gcc_assert (known_ne (vectorization_factor, 0U));
1511 /* If all the stmts in the loop can be SLPed, we perform only SLP, and
1512 vectorization factor of the loop is the unrolling factor required by
1513 the SLP instances. If that unrolling factor is 1, we say that we
1514 perform pure SLP on the loop; cross-iteration parallelism is not
1515 exploited. */
1516 bool only_slp_in_loop = true;
1517 for (i = 0; i < nbbs; i++)
1519 basic_block bb = bbs[i];
1520 for (gphi_iterator si = gsi_start_phis (bb); !gsi_end_p (si);
1521 gsi_next (&si))
1523 stmt_vec_info stmt_info = loop_vinfo->lookup_stmt (si.phi ());
1524 if (!stmt_info)
1525 continue;
1526 if ((STMT_VINFO_RELEVANT_P (stmt_info)
1527 || VECTORIZABLE_CYCLE_DEF (STMT_VINFO_DEF_TYPE (stmt_info)))
1528 && !PURE_SLP_STMT (stmt_info))
1529 /* STMT needs both SLP and loop-based vectorization. */
1530 only_slp_in_loop = false;
1532 for (gimple_stmt_iterator si = gsi_start_bb (bb); !gsi_end_p (si);
1533 gsi_next (&si))
1535 if (is_gimple_debug (gsi_stmt (si)))
1536 continue;
1537 stmt_vec_info stmt_info = loop_vinfo->lookup_stmt (gsi_stmt (si));
1538 stmt_info = vect_stmt_to_vectorize (stmt_info);
1539 if ((STMT_VINFO_RELEVANT_P (stmt_info)
1540 || VECTORIZABLE_CYCLE_DEF (STMT_VINFO_DEF_TYPE (stmt_info)))
1541 && !PURE_SLP_STMT (stmt_info))
1542 /* STMT needs both SLP and loop-based vectorization. */
1543 only_slp_in_loop = false;
1547 if (only_slp_in_loop)
1549 if (dump_enabled_p ())
1550 dump_printf_loc (MSG_NOTE, vect_location,
1551 "Loop contains only SLP stmts\n");
1552 vectorization_factor = LOOP_VINFO_SLP_UNROLLING_FACTOR (loop_vinfo);
1554 else
1556 if (dump_enabled_p ())
1557 dump_printf_loc (MSG_NOTE, vect_location,
1558 "Loop contains SLP and non-SLP stmts\n");
1559 /* Both the vectorization factor and unroll factor have the form
1560 GET_MODE_SIZE (loop_vinfo->vector_mode) * X for some rational X,
1561 so they must have a common multiple. */
1562 vectorization_factor
1563 = force_common_multiple (vectorization_factor,
1564 LOOP_VINFO_SLP_UNROLLING_FACTOR (loop_vinfo));
1567 LOOP_VINFO_VECT_FACTOR (loop_vinfo) = vectorization_factor;
1568 if (dump_enabled_p ())
1570 dump_printf_loc (MSG_NOTE, vect_location,
1571 "Updating vectorization factor to ");
1572 dump_dec (MSG_NOTE, vectorization_factor);
1573 dump_printf (MSG_NOTE, ".\n");
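/* Worked example (assumed numbers): if loop-based analysis chose a
   vectorization factor of 4 but an SLP instance needs an unrolling
   factor of 6, force_common_multiple above yields 12, which satisfies
   both the non-SLP statements and the SLP instance.  */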
1577 /* Return true if STMT_INFO describes a double reduction phi and if
1578 the other phi in the reduction is also relevant for vectorization.
1579 This rejects cases such as:
1581 outer1:
1582 x_1 = PHI <x_3(outer2), ...>;
1585 inner:
1586 x_2 = ...;
1589 outer2:
1590 x_3 = PHI <x_2(inner)>;
1592 if nothing in x_2 or elsewhere makes x_1 relevant. */
1594 static bool
1595 vect_active_double_reduction_p (stmt_vec_info stmt_info)
1597 if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_double_reduction_def)
1598 return false;
1600 return STMT_VINFO_RELEVANT_P (STMT_VINFO_REDUC_DEF (stmt_info));
1603 /* Function vect_analyze_loop_operations.
1605 Scan the loop stmts and make sure they are all vectorizable. */
1607 static opt_result
1608 vect_analyze_loop_operations (loop_vec_info loop_vinfo)
1610 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1611 basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
1612 int nbbs = loop->num_nodes;
1613 int i;
1614 stmt_vec_info stmt_info;
1615 bool need_to_vectorize = false;
1616 bool ok;
1618 DUMP_VECT_SCOPE ("vect_analyze_loop_operations");
1620 auto_vec<stmt_info_for_cost> cost_vec;
1622 for (i = 0; i < nbbs; i++)
1624 basic_block bb = bbs[i];
1626 for (gphi_iterator si = gsi_start_phis (bb); !gsi_end_p (si);
1627 gsi_next (&si))
1629 gphi *phi = si.phi ();
1630 ok = true;
1632 stmt_info = loop_vinfo->lookup_stmt (phi);
1633 if (dump_enabled_p ())
1634 dump_printf_loc (MSG_NOTE, vect_location, "examining phi: %G", phi);
1635 if (virtual_operand_p (gimple_phi_result (phi)))
1636 continue;
1638 /* Inner-loop loop-closed exit phi in outer-loop vectorization
1639 (i.e., a phi in the tail of the outer-loop). */
1640 if (! is_loop_header_bb_p (bb))
1642 /* FORNOW: we currently don't support the case that these phis
1643 are not used in the outerloop (unless it is double reduction,
1644 i.e., this phi is vect_reduction_def), because this case
1645 requires us to actually do something here. */
1646 if (STMT_VINFO_LIVE_P (stmt_info)
1647 && !vect_active_double_reduction_p (stmt_info))
1648 return opt_result::failure_at (phi,
1649 "Unsupported loop-closed phi"
1650 " in outer-loop.\n");
1652 /* If PHI is used in the outer loop, we check that its operand
1653 is defined in the inner loop. */
1654 if (STMT_VINFO_RELEVANT_P (stmt_info))
1656 tree phi_op;
1658 if (gimple_phi_num_args (phi) != 1)
1659 return opt_result::failure_at (phi, "unsupported phi");
1661 phi_op = PHI_ARG_DEF (phi, 0);
1662 stmt_vec_info op_def_info = loop_vinfo->lookup_def (phi_op);
1663 if (!op_def_info)
1664 return opt_result::failure_at (phi, "unsupported phi\n");
1666 if (STMT_VINFO_RELEVANT (op_def_info) != vect_used_in_outer
1667 && (STMT_VINFO_RELEVANT (op_def_info)
1668 != vect_used_in_outer_by_reduction))
1669 return opt_result::failure_at (phi, "unsupported phi\n");
1671 if ((STMT_VINFO_DEF_TYPE (stmt_info) == vect_internal_def
1672 || (STMT_VINFO_DEF_TYPE (stmt_info)
1673 == vect_double_reduction_def))
1674 && !vectorizable_lc_phi (loop_vinfo,
1675 stmt_info, NULL, NULL))
1676 return opt_result::failure_at (phi, "unsupported phi\n");
1679 continue;
1682 gcc_assert (stmt_info);
1684 if ((STMT_VINFO_RELEVANT (stmt_info) == vect_used_in_scope
1685 || STMT_VINFO_LIVE_P (stmt_info))
1686 && STMT_VINFO_DEF_TYPE (stmt_info) != vect_induction_def)
1687 /* A scalar-dependence cycle that we don't support. */
1688 return opt_result::failure_at (phi,
1689 "not vectorized:"
1690 " scalar dependence cycle.\n");
1692 if (STMT_VINFO_RELEVANT_P (stmt_info))
1694 need_to_vectorize = true;
1695 if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_induction_def
1696 && ! PURE_SLP_STMT (stmt_info))
1697 ok = vectorizable_induction (loop_vinfo,
1698 stmt_info, NULL, NULL,
1699 &cost_vec);
1700 else if ((STMT_VINFO_DEF_TYPE (stmt_info) == vect_reduction_def
1701 || (STMT_VINFO_DEF_TYPE (stmt_info)
1702 == vect_double_reduction_def)
1703 || STMT_VINFO_DEF_TYPE (stmt_info) == vect_nested_cycle)
1704 && ! PURE_SLP_STMT (stmt_info))
1705 ok = vectorizable_reduction (loop_vinfo,
1706 stmt_info, NULL, NULL, &cost_vec);
1709 /* SLP PHIs are tested by vect_slp_analyze_node_operations. */
1710 if (ok
1711 && STMT_VINFO_LIVE_P (stmt_info)
1712 && !PURE_SLP_STMT (stmt_info))
1713 ok = vectorizable_live_operation (loop_vinfo,
1714 stmt_info, NULL, NULL, NULL,
1715 -1, false, &cost_vec);
1717 if (!ok)
1718 return opt_result::failure_at (phi,
1719 "not vectorized: relevant phi not "
1720 "supported: %G",
1721 static_cast <gimple *> (phi));
1724 for (gimple_stmt_iterator si = gsi_start_bb (bb); !gsi_end_p (si);
1725 gsi_next (&si))
1727 gimple *stmt = gsi_stmt (si);
1728 if (!gimple_clobber_p (stmt)
1729 && !is_gimple_debug (stmt))
1731 opt_result res
1732 = vect_analyze_stmt (loop_vinfo,
1733 loop_vinfo->lookup_stmt (stmt),
1734 &need_to_vectorize,
1735 NULL, NULL, &cost_vec);
1736 if (!res)
1737 return res;
1740 } /* bbs */
1742 add_stmt_costs (loop_vinfo, loop_vinfo->target_cost_data, &cost_vec);
1744 /* All operations in the loop are either irrelevant (deal with loop
1745 control, or dead), or only used outside the loop and can be moved
1746 out of the loop (e.g. invariants, inductions). The loop can be
1747 optimized away by scalar optimizations. We're better off not
1748 touching this loop. */
1749 if (!need_to_vectorize)
1751 if (dump_enabled_p ())
1752 dump_printf_loc (MSG_NOTE, vect_location,
1753 "All the computation can be taken out of the loop.\n");
1754 return opt_result::failure_at
1755 (vect_location,
1756 "not vectorized: redundant loop. no profit to vectorize.\n");
1759 return opt_result::success ();
1762 /* Return true if we know that the iteration count is smaller than the
1763 vectorization factor. Return false if it isn't, or if we can't be sure
1764 either way. */
1766 static bool
1767 vect_known_niters_smaller_than_vf (loop_vec_info loop_vinfo)
1769 unsigned int assumed_vf = vect_vf_for_cost (loop_vinfo);
1771 HOST_WIDE_INT max_niter;
1772 if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
1773 max_niter = LOOP_VINFO_INT_NITERS (loop_vinfo);
1774 else
1775 max_niter = max_stmt_executions_int (LOOP_VINFO_LOOP (loop_vinfo));
1777 if (max_niter != -1 && (unsigned HOST_WIDE_INT) max_niter < assumed_vf)
1778 return true;
1780 return false;
1783 /* Analyze the cost of the loop described by LOOP_VINFO. Decide if it
1784 is worthwhile to vectorize. Return 1 if definitely yes, 0 if
1785 definitely no, or -1 if it's worth retrying. */
1787 static int
1788 vect_analyze_loop_costing (loop_vec_info loop_vinfo)
1790 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1791 unsigned int assumed_vf = vect_vf_for_cost (loop_vinfo);
1793 /* Only loops that can handle partially-populated vectors can have iteration
1794 counts less than the vectorization factor. */
1795 if (!LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))
1797 if (vect_known_niters_smaller_than_vf (loop_vinfo))
1799 if (dump_enabled_p ())
1800 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1801 "not vectorized: iteration count smaller than "
1802 "vectorization factor.\n");
1803 return 0;
1807 int min_profitable_iters, min_profitable_estimate;
1808 vect_estimate_min_profitable_iters (loop_vinfo, &min_profitable_iters,
1809 &min_profitable_estimate);
1811 if (min_profitable_iters < 0)
1813 if (dump_enabled_p ())
1814 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1815 "not vectorized: vectorization not profitable.\n");
1816 if (dump_enabled_p ())
1817 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1818 "not vectorized: vector version will never be "
1819 "profitable.\n");
1820 return -1;
1823 int min_scalar_loop_bound = (param_min_vect_loop_bound
1824 * assumed_vf);
1826 /* Use the cost model only if it is more conservative than user specified
1827 threshold. */
1828 unsigned int th = (unsigned) MAX (min_scalar_loop_bound,
1829 min_profitable_iters);
1831 LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo) = th;
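  /* A hypothetical worked example of the threshold above: with
     param_min_vect_loop_bound == 2 and assumed_vf == 4 the user-specified
     bound contributes 2 * 4 = 8 iterations, so if min_profitable_iters
     were 6 the resulting threshold would be th = MAX (8, 6) = 8.  */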
1833 if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
1834 && LOOP_VINFO_INT_NITERS (loop_vinfo) < th)
1836 if (dump_enabled_p ())
1837 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1838 "not vectorized: vectorization not profitable.\n");
1839 if (dump_enabled_p ())
1840 dump_printf_loc (MSG_NOTE, vect_location,
1841 "not vectorized: iteration count smaller than user "
1842 "specified loop bound parameter or minimum profitable "
1843 "iterations (whichever is more conservative).\n");
1844 return 0;
1847 /* The static profitability threshold min_profitable_estimate includes
1848 the cost of having to check at runtime whether the scalar loop
1849 should be used instead. If it turns out that we don't need or want
1850 such a check, the threshold we should use for the static estimate
1851 is simply the point at which the vector loop becomes more profitable
1852 than the scalar loop. */
1853 if (min_profitable_estimate > min_profitable_iters
1854 && !LOOP_REQUIRES_VERSIONING (loop_vinfo)
1855 && !LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo)
1856 && !LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo)
1857 && !vect_apply_runtime_profitability_check_p (loop_vinfo))
1859 if (dump_enabled_p ())
1860 dump_printf_loc (MSG_NOTE, vect_location, "no need for a runtime"
1861 " choice between the scalar and vector loops\n");
1862 min_profitable_estimate = min_profitable_iters;
1865 HOST_WIDE_INT estimated_niter;
1867 /* If we are vectorizing an epilogue then we know the maximum number of
1868 scalar iterations it will cover is at least one lower than the
1869 vectorization factor of the main loop. */
1870 if (LOOP_VINFO_EPILOGUE_P (loop_vinfo))
1871 estimated_niter
1872 = vect_vf_for_cost (LOOP_VINFO_ORIG_LOOP_INFO (loop_vinfo)) - 1;
1873 else
1875 estimated_niter = estimated_stmt_executions_int (loop);
1876 if (estimated_niter == -1)
1877 estimated_niter = likely_max_stmt_executions_int (loop);
1879 if (estimated_niter != -1
1880 && ((unsigned HOST_WIDE_INT) estimated_niter
1881 < MAX (th, (unsigned) min_profitable_estimate)))
1883 if (dump_enabled_p ())
1884 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1885 "not vectorized: estimated iteration count too "
1886 "small.\n");
1887 if (dump_enabled_p ())
1888 dump_printf_loc (MSG_NOTE, vect_location,
1889 "not vectorized: estimated iteration count smaller "
1890 "than specified loop bound parameter or minimum "
1891 "profitable iterations (whichever is more "
1892 "conservative).\n");
1893 return -1;
1896 return 1;
1899 static opt_result
1900 vect_get_datarefs_in_loop (loop_p loop, basic_block *bbs,
1901 vec<data_reference_p> *datarefs,
1902 unsigned int *n_stmts)
1904 *n_stmts = 0;
1905 for (unsigned i = 0; i < loop->num_nodes; i++)
1906 for (gimple_stmt_iterator gsi = gsi_start_bb (bbs[i]);
1907 !gsi_end_p (gsi); gsi_next (&gsi))
1909 gimple *stmt = gsi_stmt (gsi);
1910 if (is_gimple_debug (stmt))
1911 continue;
1912 ++(*n_stmts);
1913 opt_result res = vect_find_stmt_data_reference (loop, stmt, datarefs,
1914 NULL, 0);
1915 if (!res)
1917 if (is_gimple_call (stmt) && loop->safelen)
1919 tree fndecl = gimple_call_fndecl (stmt), op;
1920 if (fndecl != NULL_TREE)
1922 cgraph_node *node = cgraph_node::get (fndecl);
1923 if (node != NULL && node->simd_clones != NULL)
1925 unsigned int j, n = gimple_call_num_args (stmt);
1926 for (j = 0; j < n; j++)
1928 op = gimple_call_arg (stmt, j);
1929 if (DECL_P (op)
1930 || (REFERENCE_CLASS_P (op)
1931 && get_base_address (op)))
1932 break;
1934 op = gimple_call_lhs (stmt);
1935 /* Ignore #pragma omp declare simd functions
1936 if they don't have data references in the
1937 call stmt itself. */
1938 if (j == n
1939 && !(op
1940 && (DECL_P (op)
1941 || (REFERENCE_CLASS_P (op)
1942 && get_base_address (op)))))
1943 continue;
1947 return res;
1949 /* If dependence analysis will give up due to the limit on the
1950 number of datarefs, stop here and fail fatally. */
1951 if (datarefs->length ()
1952 > (unsigned)param_loop_max_datarefs_for_datadeps)
1953 return opt_result::failure_at (stmt, "exceeded param "
1954 "loop-max-datarefs-for-datadeps\n");
1956 return opt_result::success ();
1959 /* Look for SLP-only access groups and turn each individual access into its own
1960 group. */
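/* An illustrative example (not taken from any particular testcase): a group
   of four interleaved loads a[4*i], a[4*i+1], a[4*i+2], a[4*i+3] that turned
   out to be usable only under SLP is split below into four independent
   single-element accesses, each recording the remaining three elements of
   the old group as its gap (unless the access is strided).  */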
1961 static void
1962 vect_dissolve_slp_only_groups (loop_vec_info loop_vinfo)
1964 unsigned int i;
1965 struct data_reference *dr;
1967 DUMP_VECT_SCOPE ("vect_dissolve_slp_only_groups");
1969 vec<data_reference_p> datarefs = LOOP_VINFO_DATAREFS (loop_vinfo);
1970 FOR_EACH_VEC_ELT (datarefs, i, dr)
1972 gcc_assert (DR_REF (dr));
1973 stmt_vec_info stmt_info = loop_vinfo->lookup_stmt (DR_STMT (dr));
1975 /* Check if the load is a part of an interleaving chain. */
1976 if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
1978 stmt_vec_info first_element = DR_GROUP_FIRST_ELEMENT (stmt_info);
1979 unsigned int group_size = DR_GROUP_SIZE (first_element);
1981 /* Check whether this is an SLP-only group. */
1982 if (!STMT_SLP_TYPE (stmt_info)
1983 && STMT_VINFO_SLP_VECT_ONLY (first_element))
1985 /* Dissolve the group. */
1986 STMT_VINFO_SLP_VECT_ONLY (first_element) = false;
1988 stmt_vec_info vinfo = first_element;
1989 while (vinfo)
1991 stmt_vec_info next = DR_GROUP_NEXT_ELEMENT (vinfo);
1992 DR_GROUP_FIRST_ELEMENT (vinfo) = vinfo;
1993 DR_GROUP_NEXT_ELEMENT (vinfo) = NULL;
1994 DR_GROUP_SIZE (vinfo) = 1;
1995 if (STMT_VINFO_STRIDED_P (first_element))
1996 DR_GROUP_GAP (vinfo) = 0;
1997 else
1998 DR_GROUP_GAP (vinfo) = group_size - 1;
1999 vinfo = next;
2006 /* Determine if operating on full vectors for LOOP_VINFO might leave
2007 some scalar iterations still to do. If so, decide how we should
2008 handle those scalar iterations. The possibilities are:
2010 (1) Make LOOP_VINFO operate on partial vectors instead of full vectors.
2011 In this case:
2013 LOOP_VINFO_USING_PARTIAL_VECTORS_P == true
2014 LOOP_VINFO_EPIL_USING_PARTIAL_VECTORS_P == false
2015 LOOP_VINFO_PEELING_FOR_NITER == false
2017 (2) Make LOOP_VINFO operate on full vectors and use an epilogue loop
2018 to handle the remaining scalar iterations. In this case:
2020 LOOP_VINFO_USING_PARTIAL_VECTORS_P == false
2021 LOOP_VINFO_PEELING_FOR_NITER == true
2023 There are two choices:
2025 (2a) Consider vectorizing the epilogue loop at the same VF as the
2026 main loop, but using partial vectors instead of full vectors.
2027 In this case:
2029 LOOP_VINFO_EPIL_USING_PARTIAL_VECTORS_P == true
2031 (2b) Consider vectorizing the epilogue loop at lower VFs only.
2032 In this case:
2034 LOOP_VINFO_EPIL_USING_PARTIAL_VECTORS_P == false
2036 When FOR_EPILOGUE_P is true, make this determination based on the
2037 assumption that LOOP_VINFO is an epilogue loop; otherwise make it
2038 based on the assumption that LOOP_VINFO is the main loop. The caller
2039 has made sure that the number of iterations is set appropriately for
2040 this value of FOR_EPILOGUE_P. */
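/* A hypothetical illustration of the cases above: a loop of 1003 iterations
   with a vectorization factor of 8 on a target without masking or length
   control would take option (2), running 125 full-vector iterations and
   leaving 3 scalar iterations to an epilogue loop, whereas a target that
   supports fully-masked loops could take option (1) and absorb those 3
   iterations into a final partial vector iteration.  */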
2042 opt_result
2043 vect_determine_partial_vectors_and_peeling (loop_vec_info loop_vinfo,
2044 bool for_epilogue_p)
2046 /* Determine whether there would be any scalar iterations left over. */
2047 bool need_peeling_or_partial_vectors_p
2048 = vect_need_peeling_or_partial_vectors_p (loop_vinfo);
2050 /* Decide whether to vectorize the loop with partial vectors. */
2051 LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo) = false;
2052 LOOP_VINFO_EPIL_USING_PARTIAL_VECTORS_P (loop_vinfo) = false;
2053 if (LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo)
2054 && need_peeling_or_partial_vectors_p)
2056 /* For partial-vector-usage=1, try to push the handling of partial
2057 vectors to the epilogue, with the main loop continuing to operate
2058 on full vectors.
2060 ??? We could then end up failing to use partial vectors if we
2061 decide to peel iterations into a prologue, and if the main loop
2062 then ends up processing fewer than VF iterations. */
2063 if (param_vect_partial_vector_usage == 1
2064 && !LOOP_VINFO_EPILOGUE_P (loop_vinfo)
2065 && !vect_known_niters_smaller_than_vf (loop_vinfo))
2066 LOOP_VINFO_EPIL_USING_PARTIAL_VECTORS_P (loop_vinfo) = true;
2067 else
2068 LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo) = true;
2071 if (dump_enabled_p ())
2073 if (LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))
2074 dump_printf_loc (MSG_NOTE, vect_location,
2075 "operating on partial vectors%s.\n",
2076 for_epilogue_p ? " for epilogue loop" : "");
2077 else
2078 dump_printf_loc (MSG_NOTE, vect_location,
2079 "operating only on full vectors%s.\n",
2080 for_epilogue_p ? " for epilogue loop" : "");
2083 if (for_epilogue_p)
2085 loop_vec_info orig_loop_vinfo = LOOP_VINFO_ORIG_LOOP_INFO (loop_vinfo);
2086 gcc_assert (orig_loop_vinfo);
2087 if (!LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))
2088 gcc_assert (known_lt (LOOP_VINFO_VECT_FACTOR (loop_vinfo),
2089 LOOP_VINFO_VECT_FACTOR (orig_loop_vinfo)));
2092 if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
2093 && !LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))
2095 /* Check that the loop processes at least one full vector. */
2096 poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
2097 tree scalar_niters = LOOP_VINFO_NITERS (loop_vinfo);
2098 if (known_lt (wi::to_widest (scalar_niters), vf))
2099 return opt_result::failure_at (vect_location,
2100 "loop does not have enough iterations"
2101 " to support vectorization.\n");
2103 /* If we need to peel an extra epilogue iteration to handle data
2104 accesses with gaps, check that there are enough scalar iterations
2105 available.
2107 The check above is redundant with this one when peeling for gaps,
2108 but the distinction is useful for diagnostics. */
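      /* As a hypothetical example: with a vectorization factor of 4 and
	 exactly 4 known iterations, peeling one iteration for gaps would
	 leave only NITERS - 1 = 3 scalar iterations, too few for a single
	 full vector, so the check below rejects the loop.  */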
2109 tree scalar_nitersm1 = LOOP_VINFO_NITERSM1 (loop_vinfo);
2110 if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo)
2111 && known_lt (wi::to_widest (scalar_nitersm1), vf))
2112 return opt_result::failure_at (vect_location,
2113 "loop does not have enough iterations"
2114 " to support peeling for gaps.\n");
2117 LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo)
2118 = (!LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo)
2119 && need_peeling_or_partial_vectors_p);
2121 return opt_result::success ();
2124 /* Function vect_analyze_loop_2.
2126 Apply a set of analyses on LOOP, and create a loop_vec_info struct
2127 for it. The different analyses will record information in the
2128 loop_vec_info struct. */
2129 static opt_result
2130 vect_analyze_loop_2 (loop_vec_info loop_vinfo, bool &fatal, unsigned *n_stmts)
2132 opt_result ok = opt_result::success ();
2133 int res;
2134 unsigned int max_vf = MAX_VECTORIZATION_FACTOR;
2135 poly_uint64 min_vf = 2;
2136 loop_vec_info orig_loop_vinfo = NULL;
2138 /* If we are dealing with an epilogue then orig_loop_vinfo points to the
2139 loop_vec_info of the first vectorized loop. */
2140 if (LOOP_VINFO_EPILOGUE_P (loop_vinfo))
2141 orig_loop_vinfo = LOOP_VINFO_ORIG_LOOP_INFO (loop_vinfo);
2142 else
2143 orig_loop_vinfo = loop_vinfo;
2144 gcc_assert (orig_loop_vinfo);
2146 /* The first group of checks is independent of the vector size. */
2147 fatal = true;
2149 if (LOOP_VINFO_SIMD_IF_COND (loop_vinfo)
2150 && integer_zerop (LOOP_VINFO_SIMD_IF_COND (loop_vinfo)))
2151 return opt_result::failure_at (vect_location,
2152 "not vectorized: simd if(0)\n");
2154 /* Find all data references in the loop (which correspond to vdefs/vuses)
2155 and analyze their evolution in the loop. */
2157 loop_p loop = LOOP_VINFO_LOOP (loop_vinfo);
2159 /* Gather the data references and count stmts in the loop. */
2160 if (!LOOP_VINFO_DATAREFS (loop_vinfo).exists ())
2162 opt_result res
2163 = vect_get_datarefs_in_loop (loop, LOOP_VINFO_BBS (loop_vinfo),
2164 &LOOP_VINFO_DATAREFS (loop_vinfo),
2165 n_stmts);
2166 if (!res)
2168 if (dump_enabled_p ())
2169 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2170 "not vectorized: loop contains function "
2171 "calls or data references that cannot "
2172 "be analyzed\n");
2173 return res;
2175 loop_vinfo->shared->save_datarefs ();
2177 else
2178 loop_vinfo->shared->check_datarefs ();
2180 /* Analyze the data references and also adjust the minimal
2181 vectorization factor according to the loads and stores. */
2183 ok = vect_analyze_data_refs (loop_vinfo, &min_vf, &fatal);
2184 if (!ok)
2186 if (dump_enabled_p ())
2187 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2188 "bad data references.\n");
2189 return ok;
2192 /* Classify all cross-iteration scalar data-flow cycles.
2193 Cross-iteration cycles caused by virtual phis are analyzed separately. */
2194 vect_analyze_scalar_cycles (loop_vinfo);
2196 vect_pattern_recog (loop_vinfo);
2198 vect_fixup_scalar_cycles_with_patterns (loop_vinfo);
2200 /* Analyze the access patterns of the data-refs in the loop (consecutive,
2201 complex, etc.). FORNOW: Only handle consecutive access pattern. */
2203 ok = vect_analyze_data_ref_accesses (loop_vinfo, NULL);
2204 if (!ok)
2206 if (dump_enabled_p ())
2207 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2208 "bad data access.\n");
2209 return ok;
2212 /* Data-flow analysis to detect stmts that do not need to be vectorized. */
2214 ok = vect_mark_stmts_to_be_vectorized (loop_vinfo, &fatal);
2215 if (!ok)
2217 if (dump_enabled_p ())
2218 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2219 "unexpected pattern.\n");
2220 return ok;
2223 /* The rest of the analysis below depends on the vector size in some way, so failures from here on are no longer treated as fatal. */
2224 fatal = false;
2226 /* Analyze data dependences between the data-refs in the loop
2227 and adjust the maximum vectorization factor according to
2228 the dependences.
2229 FORNOW: fail at the first data dependence that we encounter. */
2231 ok = vect_analyze_data_ref_dependences (loop_vinfo, &max_vf);
2232 if (!ok)
2234 if (dump_enabled_p ())
2235 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2236 "bad data dependence.\n");
2237 return ok;
2239 if (max_vf != MAX_VECTORIZATION_FACTOR
2240 && maybe_lt (max_vf, min_vf))
2241 return opt_result::failure_at (vect_location, "bad data dependence.\n");
2242 LOOP_VINFO_MAX_VECT_FACTOR (loop_vinfo) = max_vf;
2244 ok = vect_determine_vectorization_factor (loop_vinfo);
2245 if (!ok)
2247 if (dump_enabled_p ())
2248 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2249 "can't determine vectorization factor.\n");
2250 return ok;
2252 if (max_vf != MAX_VECTORIZATION_FACTOR
2253 && maybe_lt (max_vf, LOOP_VINFO_VECT_FACTOR (loop_vinfo)))
2254 return opt_result::failure_at (vect_location, "bad data dependence.\n");
2256 /* Compute the scalar iteration cost. */
2257 vect_compute_single_scalar_iteration_cost (loop_vinfo);
2259 poly_uint64 saved_vectorization_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
2261 /* Check the SLP opportunities in the loop, analyze and build SLP trees. */
2262 ok = vect_analyze_slp (loop_vinfo, *n_stmts);
2263 if (!ok)
2264 return ok;
2266 /* If there are any SLP instances mark them as pure_slp. */
2267 bool slp = vect_make_slp_decision (loop_vinfo);
2268 if (slp)
2270 /* Find stmts that need to be both vectorized and SLPed. */
2271 vect_detect_hybrid_slp (loop_vinfo);
2273 /* Update the vectorization factor based on the SLP decision. */
2274 vect_update_vf_for_slp (loop_vinfo);
2276 /* Optimize the SLP graph with the vectorization factor fixed. */
2277 vect_optimize_slp (loop_vinfo);
2280 bool saved_can_use_partial_vectors_p
2281 = LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo);
2283 /* We don't expect to have to roll back to anything other than an empty
2284 set of rgroups. */
2285 gcc_assert (LOOP_VINFO_MASKS (loop_vinfo).is_empty ());
2287 /* This is the point where we can re-start analysis with SLP forced off. */
2288 start_over:
2290 /* Now the vectorization factor is final. */
2291 poly_uint64 vectorization_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
2292 gcc_assert (known_ne (vectorization_factor, 0U));
2294 if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo) && dump_enabled_p ())
2296 dump_printf_loc (MSG_NOTE, vect_location,
2297 "vectorization_factor = ");
2298 dump_dec (MSG_NOTE, vectorization_factor);
2299 dump_printf (MSG_NOTE, ", niters = %wd\n",
2300 LOOP_VINFO_INT_NITERS (loop_vinfo));
2303 /* Analyze the alignment of the data-refs in the loop.
2304 Fail if a data reference is found that cannot be vectorized. */
2306 ok = vect_analyze_data_refs_alignment (loop_vinfo);
2307 if (!ok)
2309 if (dump_enabled_p ())
2310 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2311 "bad data alignment.\n");
2312 return ok;
2315 /* Prune the list of ddrs to be tested at run-time by versioning for alias.
2316 It is important to call pruning after vect_analyze_data_ref_accesses,
2317 since we use grouping information gathered by interleaving analysis. */
2318 ok = vect_prune_runtime_alias_test_list (loop_vinfo);
2319 if (!ok)
2320 return ok;
2322 /* Do not invoke vect_enhance_data_refs_alignment for epilogue
2323 vectorization, since we do not want to add extra peeling or
2324 add versioning for alignment. */
2325 if (!LOOP_VINFO_EPILOGUE_P (loop_vinfo))
2326 /* This pass will decide on using loop versioning and/or loop peeling in
2327 order to enhance the alignment of data references in the loop. */
2328 ok = vect_enhance_data_refs_alignment (loop_vinfo);
2329 if (!ok)
2330 return ok;
2332 if (slp)
2334 /* Analyze operations in the SLP instances. Note this may
2335 remove unsupported SLP instances which makes the above
2336 SLP kind detection invalid. */
2337 unsigned old_size = LOOP_VINFO_SLP_INSTANCES (loop_vinfo).length ();
2338 vect_slp_analyze_operations (loop_vinfo);
2339 if (LOOP_VINFO_SLP_INSTANCES (loop_vinfo).length () != old_size)
2341 ok = opt_result::failure_at (vect_location,
2342 "unsupported SLP instances\n");
2343 goto again;
2347 /* Dissolve SLP-only groups. */
2348 vect_dissolve_slp_only_groups (loop_vinfo);
2350 /* Scan all the remaining operations in the loop that are not subject
2351 to SLP and make sure they are vectorizable. */
2352 ok = vect_analyze_loop_operations (loop_vinfo);
2353 if (!ok)
2355 if (dump_enabled_p ())
2356 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2357 "bad operation or unsupported loop bound.\n");
2358 return ok;
2361 /* For now, we don't expect to mix both masking and length approaches for one
2362 loop, so disable the use of partial vectors if both are recorded. */
2363 if (LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo)
2364 && !LOOP_VINFO_MASKS (loop_vinfo).is_empty ()
2365 && !LOOP_VINFO_LENS (loop_vinfo).is_empty ())
2367 if (dump_enabled_p ())
2368 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2369 "can't vectorize a loop with partial vectors"
2370 " because we don't expect to mix different"
2371 " approaches with partial vectors for the"
2372 " same loop.\n");
2373 LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo) = false;
2376 /* If we still have the option of using partial vectors,
2377 check whether we can generate the necessary loop controls. */
2378 if (LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo)
2379 && !vect_verify_full_masking (loop_vinfo)
2380 && !vect_verify_loop_lens (loop_vinfo))
2381 LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo) = false;
2383 /* If we're vectorizing an epilogue loop, the vectorized loop either needs
2384 to be able to handle fewer than VF scalars, or needs to have a lower VF
2385 than the main loop. */
2386 if (LOOP_VINFO_EPILOGUE_P (loop_vinfo)
2387 && !LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo)
2388 && maybe_ge (LOOP_VINFO_VECT_FACTOR (loop_vinfo),
2389 LOOP_VINFO_VECT_FACTOR (orig_loop_vinfo)))
2390 return opt_result::failure_at (vect_location,
2391 "Vectorization factor too high for"
2392 " epilogue loop.\n");
2394 /* Decide whether this loop_vinfo should use partial vectors or peeling,
2395 assuming that the loop will be used as a main loop. We will redo
2396 this analysis later if we instead decide to use the loop as an
2397 epilogue loop. */
2398 ok = vect_determine_partial_vectors_and_peeling (loop_vinfo, false);
2399 if (!ok)
2400 return ok;
2402 /* Check the costings of the loop make vectorizing worthwhile. */
2403 res = vect_analyze_loop_costing (loop_vinfo);
2404 if (res < 0)
2406 ok = opt_result::failure_at (vect_location,
2407 "Loop costings may not be worthwhile.\n");
2408 goto again;
2410 if (!res)
2411 return opt_result::failure_at (vect_location,
2412 "Loop costings not worthwhile.\n");
2414 /* If an epilogue loop is required make sure we can create one. */
2415 if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo)
2416 || LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo))
2418 if (dump_enabled_p ())
2419 dump_printf_loc (MSG_NOTE, vect_location, "epilog loop required\n");
2420 if (!vect_can_advance_ivs_p (loop_vinfo)
2421 || !slpeel_can_duplicate_loop_p (LOOP_VINFO_LOOP (loop_vinfo),
2422 single_exit (LOOP_VINFO_LOOP
2423 (loop_vinfo))))
2425 ok = opt_result::failure_at (vect_location,
2426 "not vectorized: can't create required "
2427 "epilog loop\n");
2428 goto again;
2432 /* During peeling, we need to check whether the number of loop iterations
2433 is enough for both the peeled prolog loop and the vector loop. This check
2434 can be merged along with threshold check of loop versioning, so
2435 increase threshold for this case if necessary.
2437 If we are analyzing an epilogue we still want to check what its
2438 versioning threshold would be. If we decide to vectorize the epilogues we
2439 will want to use the lowest versioning threshold of all epilogues and main
2440 loop. This will enable us to enter a vectorized epilogue even when
2441 versioning the loop. We can't simply check whether the epilogue requires
2442 versioning though since we may have skipped some versioning checks when
2443 analyzing the epilogue. For instance, checks for alias versioning will be
2444 skipped when dealing with epilogues as we assume we already checked them
2445 for the main loop. So instead we always check the 'orig_loop_vinfo'. */
2446 if (LOOP_REQUIRES_VERSIONING (orig_loop_vinfo))
2448 poly_uint64 niters_th = 0;
2449 unsigned int th = LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo);
2451 if (!vect_use_loop_mask_for_alignment_p (loop_vinfo))
2453 /* Niters for peeled prolog loop. */
2454 if (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) < 0)
2456 dr_vec_info *dr_info = LOOP_VINFO_UNALIGNED_DR (loop_vinfo);
2457 tree vectype = STMT_VINFO_VECTYPE (dr_info->stmt);
2458 niters_th += TYPE_VECTOR_SUBPARTS (vectype) - 1;
2460 else
2461 niters_th += LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo);
2464 /* Niters for at least one iteration of vectorized loop. */
2465 if (!LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))
2466 niters_th += LOOP_VINFO_VECT_FACTOR (loop_vinfo);
2467 /* One additional iteration because of peeling for gap. */
2468 if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo))
2469 niters_th += 1;
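      /* Hypothetical running total: with 2 prologue iterations peeled for
	 alignment, a vectorization factor of 4, no partial vectors and
	 peeling for gaps, niters_th at this point is 2 + 4 + 1 = 7.  */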
2471 /* Use the same condition as vect_transform_loop to decide when to use
2472 the cost to determine a versioning threshold. */
2473 if (vect_apply_runtime_profitability_check_p (loop_vinfo)
2474 && ordered_p (th, niters_th))
2475 niters_th = ordered_max (poly_uint64 (th), niters_th);
2477 LOOP_VINFO_VERSIONING_THRESHOLD (loop_vinfo) = niters_th;
2480 gcc_assert (known_eq (vectorization_factor,
2481 LOOP_VINFO_VECT_FACTOR (loop_vinfo)));
2483 /* Ok to vectorize! */
2484 return opt_result::success ();
2486 again:
2487 /* Ensure that "ok" is false (with an opt_problem if dumping is enabled). */
2488 gcc_assert (!ok);
2490 /* Try again with SLP forced off but if we didn't do any SLP there is
2491 no point in re-trying. */
2492 if (!slp)
2493 return ok;
2495 /* If there are reduction chains re-trying will fail anyway. */
2496 if (! LOOP_VINFO_REDUCTION_CHAINS (loop_vinfo).is_empty ())
2497 return ok;
2499 /* Likewise if the grouped loads or stores in the SLP cannot be handled
2500 via interleaving or lane instructions. */
2501 slp_instance instance;
2502 slp_tree node;
2503 unsigned i, j;
2504 FOR_EACH_VEC_ELT (LOOP_VINFO_SLP_INSTANCES (loop_vinfo), i, instance)
2506 stmt_vec_info vinfo;
2507 vinfo = SLP_TREE_SCALAR_STMTS (SLP_INSTANCE_TREE (instance))[0];
2508 if (! STMT_VINFO_GROUPED_ACCESS (vinfo))
2509 continue;
2510 vinfo = DR_GROUP_FIRST_ELEMENT (vinfo);
2511 unsigned int size = DR_GROUP_SIZE (vinfo);
2512 tree vectype = STMT_VINFO_VECTYPE (vinfo);
2513 if (! vect_store_lanes_supported (vectype, size, false)
2514 && ! known_eq (TYPE_VECTOR_SUBPARTS (vectype), 1U)
2515 && ! vect_grouped_store_supported (vectype, size))
2516 return opt_result::failure_at (vinfo->stmt,
2517 "unsupported grouped store\n");
2518 FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (instance), j, node)
2520 vinfo = SLP_TREE_SCALAR_STMTS (node)[0];
2521 vinfo = DR_GROUP_FIRST_ELEMENT (vinfo);
2522 bool single_element_p = !DR_GROUP_NEXT_ELEMENT (vinfo);
2523 size = DR_GROUP_SIZE (vinfo);
2524 vectype = STMT_VINFO_VECTYPE (vinfo);
2525 if (! vect_load_lanes_supported (vectype, size, false)
2526 && ! vect_grouped_load_supported (vectype, single_element_p,
2527 size))
2528 return opt_result::failure_at (vinfo->stmt,
2529 "unsupported grouped load\n");
2533 if (dump_enabled_p ())
2534 dump_printf_loc (MSG_NOTE, vect_location,
2535 "re-trying with SLP disabled\n");
2537 /* Roll back state appropriately. No SLP this time. */
2538 slp = false;
2539 /* Restore the vectorization factor as it would be without SLP. */
2540 LOOP_VINFO_VECT_FACTOR (loop_vinfo) = saved_vectorization_factor;
2541 /* Free the SLP instances. */
2542 FOR_EACH_VEC_ELT (LOOP_VINFO_SLP_INSTANCES (loop_vinfo), j, instance)
2543 vect_free_slp_instance (instance);
2544 LOOP_VINFO_SLP_INSTANCES (loop_vinfo).release ();
2545 /* Reset SLP type to loop_vect on all stmts. */
2546 for (i = 0; i < LOOP_VINFO_LOOP (loop_vinfo)->num_nodes; ++i)
2548 basic_block bb = LOOP_VINFO_BBS (loop_vinfo)[i];
2549 for (gimple_stmt_iterator si = gsi_start_phis (bb);
2550 !gsi_end_p (si); gsi_next (&si))
2552 stmt_vec_info stmt_info = loop_vinfo->lookup_stmt (gsi_stmt (si));
2553 STMT_SLP_TYPE (stmt_info) = loop_vect;
2554 if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_reduction_def
2555 || STMT_VINFO_DEF_TYPE (stmt_info) == vect_double_reduction_def)
2557 /* vectorizable_reduction adjusts reduction stmt def-types,
2558 restore them to that of the PHI. */
2559 STMT_VINFO_DEF_TYPE (STMT_VINFO_REDUC_DEF (stmt_info))
2560 = STMT_VINFO_DEF_TYPE (stmt_info);
2561 STMT_VINFO_DEF_TYPE (vect_stmt_to_vectorize
2562 (STMT_VINFO_REDUC_DEF (stmt_info)))
2563 = STMT_VINFO_DEF_TYPE (stmt_info);
2566 for (gimple_stmt_iterator si = gsi_start_bb (bb);
2567 !gsi_end_p (si); gsi_next (&si))
2569 if (is_gimple_debug (gsi_stmt (si)))
2570 continue;
2571 stmt_vec_info stmt_info = loop_vinfo->lookup_stmt (gsi_stmt (si));
2572 STMT_SLP_TYPE (stmt_info) = loop_vect;
2573 if (STMT_VINFO_IN_PATTERN_P (stmt_info))
2575 gimple *pattern_def_seq = STMT_VINFO_PATTERN_DEF_SEQ (stmt_info);
2576 stmt_info = STMT_VINFO_RELATED_STMT (stmt_info);
2577 STMT_SLP_TYPE (stmt_info) = loop_vect;
2578 for (gimple_stmt_iterator pi = gsi_start (pattern_def_seq);
2579 !gsi_end_p (pi); gsi_next (&pi))
2580 STMT_SLP_TYPE (loop_vinfo->lookup_stmt (gsi_stmt (pi)))
2581 = loop_vect;
2585 /* Free optimized alias test DDRS. */
2586 LOOP_VINFO_LOWER_BOUNDS (loop_vinfo).truncate (0);
2587 LOOP_VINFO_COMP_ALIAS_DDRS (loop_vinfo).release ();
2588 LOOP_VINFO_CHECK_UNEQUAL_ADDRS (loop_vinfo).release ();
2589 /* Reset target cost data. */
2590 destroy_cost_data (LOOP_VINFO_TARGET_COST_DATA (loop_vinfo));
2591 LOOP_VINFO_TARGET_COST_DATA (loop_vinfo)
2592 = init_cost (LOOP_VINFO_LOOP (loop_vinfo));
2593 /* Reset accumulated rgroup information. */
2594 release_vec_loop_controls (&LOOP_VINFO_MASKS (loop_vinfo));
2595 release_vec_loop_controls (&LOOP_VINFO_LENS (loop_vinfo));
2596 /* Reset assorted flags. */
2597 LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo) = false;
2598 LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo) = false;
2599 LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo) = 0;
2600 LOOP_VINFO_VERSIONING_THRESHOLD (loop_vinfo) = 0;
2601 LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo)
2602 = saved_can_use_partial_vectors_p;
2604 goto start_over;
2607 /* Return true if vectorizing a loop using NEW_LOOP_VINFO appears
2608 to be better than vectorizing it using OLD_LOOP_VINFO. Assume that
2609 OLD_LOOP_VINFO is better unless something specifically indicates
2610 otherwise.
2612 Note that this deliberately isn't a partial order. */
2614 static bool
2615 vect_better_loop_vinfo_p (loop_vec_info new_loop_vinfo,
2616 loop_vec_info old_loop_vinfo)
2618 struct loop *loop = LOOP_VINFO_LOOP (new_loop_vinfo);
2619 gcc_assert (LOOP_VINFO_LOOP (old_loop_vinfo) == loop);
2621 poly_int64 new_vf = LOOP_VINFO_VECT_FACTOR (new_loop_vinfo);
2622 poly_int64 old_vf = LOOP_VINFO_VECT_FACTOR (old_loop_vinfo);
2624 /* Always prefer a VF of loop->simdlen over any other VF. */
2625 if (loop->simdlen)
2627 bool new_simdlen_p = known_eq (new_vf, loop->simdlen);
2628 bool old_simdlen_p = known_eq (old_vf, loop->simdlen);
2629 if (new_simdlen_p != old_simdlen_p)
2630 return new_simdlen_p;
2633 /* Limit the VFs to what is likely to be the maximum number of iterations,
2634 to handle cases in which at least one loop_vinfo is fully-masked. */
2635 HOST_WIDE_INT estimated_max_niter = likely_max_stmt_executions_int (loop);
2636 if (estimated_max_niter != -1)
2638 if (known_le (estimated_max_niter, new_vf))
2639 new_vf = estimated_max_niter;
2640 if (known_le (estimated_max_niter, old_vf))
2641 old_vf = estimated_max_niter;
2644 /* Check whether the (fractional) cost per scalar iteration is lower
2645 or higher: new_inside_cost / new_vf vs. old_inside_cost / old_vf. */
2646 poly_widest_int rel_new = (new_loop_vinfo->vec_inside_cost
2647 * poly_widest_int (old_vf));
2648 poly_widest_int rel_old = (old_loop_vinfo->vec_inside_cost
2649 * poly_widest_int (new_vf));
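  /* A hypothetical comparison: a new loop body with vec_inside_cost 20 at
     VF 8 (2.5 per scalar iteration) beats an old body with vec_inside_cost
     12 at VF 4 (3 per scalar iteration), since rel_new = 20 * 4 = 80 is
     less than rel_old = 12 * 8 = 96.  */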
2650 if (maybe_lt (rel_old, rel_new))
2652 /* When old_loop_vinfo uses a variable vectorization factor,
2653 we know that it has a lower cost for at least one runtime VF.
2654 However, we don't know how likely that VF is.
2656 One option would be to compare the costs for the estimated VFs.
2657 The problem is that that can put too much pressure on the cost
2658 model. E.g. if the estimated VF is also the lowest possible VF,
2659 and if old_loop_vinfo is 1 unit worse than new_loop_vinfo
2660 for the estimated VF, we'd then choose new_loop_vinfo even
2661 though (a) new_loop_vinfo might not actually be better than
2662 old_loop_vinfo for that VF and (b) it would be significantly
2663 worse at larger VFs.
2665 Here we go for a hacky compromise: pick new_loop_vinfo if it is
2666 no more expensive than old_loop_vinfo even after doubling the
2667 estimated old_loop_vinfo VF. For all but trivial loops, this
2668 ensures that we only pick new_loop_vinfo if it is significantly
2669 better than old_loop_vinfo at the estimated VF. */
2670 if (rel_new.is_constant ())
2671 return false;
2673 HOST_WIDE_INT new_estimated_vf = estimated_poly_value (new_vf);
2674 HOST_WIDE_INT old_estimated_vf = estimated_poly_value (old_vf);
2675 widest_int estimated_rel_new = (new_loop_vinfo->vec_inside_cost
2676 * widest_int (old_estimated_vf));
2677 widest_int estimated_rel_old = (old_loop_vinfo->vec_inside_cost
2678 * widest_int (new_estimated_vf));
2679 return estimated_rel_new * 2 <= estimated_rel_old;
2681 if (known_lt (rel_new, rel_old))
2682 return true;
2684 /* If there's nothing to choose between the loop bodies, see whether
2685 there's a difference in the prologue and epilogue costs. */
2686 if (new_loop_vinfo->vec_outside_cost != old_loop_vinfo->vec_outside_cost)
2687 return new_loop_vinfo->vec_outside_cost < old_loop_vinfo->vec_outside_cost;
2689 return false;
2692 /* Decide whether to replace OLD_LOOP_VINFO with NEW_LOOP_VINFO. Return
2693 true if we should. */
2695 static bool
2696 vect_joust_loop_vinfos (loop_vec_info new_loop_vinfo,
2697 loop_vec_info old_loop_vinfo)
2699 if (!vect_better_loop_vinfo_p (new_loop_vinfo, old_loop_vinfo))
2700 return false;
2702 if (dump_enabled_p ())
2703 dump_printf_loc (MSG_NOTE, vect_location,
2704 "***** Preferring vector mode %s to vector mode %s\n",
2705 GET_MODE_NAME (new_loop_vinfo->vector_mode),
2706 GET_MODE_NAME (old_loop_vinfo->vector_mode));
2707 return true;
2710 /* Function vect_analyze_loop.
2712 Apply a set of analyses on LOOP, and create a loop_vec_info struct
2713 for it. The different analyses will record information in the
2714 loop_vec_info struct. */
2715 opt_loop_vec_info
2716 vect_analyze_loop (class loop *loop, vec_info_shared *shared)
2718 auto_vector_modes vector_modes;
2720 /* Autodetect the first vector size we try. */
2721 unsigned int autovec_flags
2722 = targetm.vectorize.autovectorize_vector_modes (&vector_modes,
2723 loop->simdlen != 0);
2724 unsigned int mode_i = 0;
2726 DUMP_VECT_SCOPE ("analyze_loop_nest");
2728 if (loop_outer (loop)
2729 && loop_vec_info_for_loop (loop_outer (loop))
2730 && LOOP_VINFO_VECTORIZABLE_P (loop_vec_info_for_loop (loop_outer (loop))))
2731 return opt_loop_vec_info::failure_at (vect_location,
2732 "outer-loop already vectorized.\n");
2734 if (!find_loop_nest (loop, &shared->loop_nest))
2735 return opt_loop_vec_info::failure_at
2736 (vect_location,
2737 "not vectorized: loop nest containing two or more consecutive inner"
2738 " loops cannot be vectorized\n");
2740 unsigned n_stmts = 0;
2741 machine_mode autodetected_vector_mode = VOIDmode;
2742 opt_loop_vec_info first_loop_vinfo = opt_loop_vec_info::success (NULL);
2743 machine_mode next_vector_mode = VOIDmode;
2744 poly_uint64 lowest_th = 0;
2745 unsigned vectorized_loops = 0;
2746 bool pick_lowest_cost_p = ((autovec_flags & VECT_COMPARE_COSTS)
2747 && !unlimited_cost_model (loop));
2749 bool vect_epilogues = false;
2750 opt_result res = opt_result::success ();
2751 unsigned HOST_WIDE_INT simdlen = loop->simdlen;
2752 while (1)
2754 /* Check the CFG characteristics of the loop (nesting, entry/exit). */
2755 opt_loop_vec_info loop_vinfo = vect_analyze_loop_form (loop, shared);
2756 if (!loop_vinfo)
2758 if (dump_enabled_p ())
2759 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2760 "bad loop form.\n");
2761 gcc_checking_assert (first_loop_vinfo == NULL);
2762 return loop_vinfo;
2764 loop_vinfo->vector_mode = next_vector_mode;
2766 bool fatal = false;
2768 /* When pick_lowest_cost_p is true, we should in principle iterate
2769 over all the loop_vec_infos that LOOP_VINFO could replace and
2770 try to vectorize LOOP_VINFO under the same conditions.
2771 E.g. when trying to replace an epilogue loop, we should vectorize
2772 LOOP_VINFO as an epilogue loop with the same VF limit. When trying
2773 to replace the main loop, we should vectorize LOOP_VINFO as a main
2774 loop too.
2776 However, autovectorize_vector_modes is usually sorted as follows:
2778 - Modes that naturally produce lower VFs usually follow modes that
2779 naturally produce higher VFs.
2781 - When modes naturally produce the same VF, maskable modes
2782 usually follow unmaskable ones, so that the maskable mode
2783 can be used to vectorize the epilogue of the unmaskable mode.
2785 This order is preferred because it leads to the maximum
2786 epilogue vectorization opportunities. Targets should only use
2787 a different order if they want to make wide modes available while
2788 disparaging them relative to earlier, smaller modes. The assumption
2789 in that case is that the wider modes are more expensive in some
2790 way that isn't reflected directly in the costs.
2792 There should therefore be few interesting cases in which
2793 LOOP_VINFO fails when treated as an epilogue loop, succeeds when
2794 treated as a standalone loop, and ends up being genuinely cheaper
2795 than FIRST_LOOP_VINFO. */
2796 if (vect_epilogues)
2797 LOOP_VINFO_ORIG_LOOP_INFO (loop_vinfo) = first_loop_vinfo;
2799 res = vect_analyze_loop_2 (loop_vinfo, fatal, &n_stmts);
2800 if (mode_i == 0)
2801 autodetected_vector_mode = loop_vinfo->vector_mode;
2802 if (dump_enabled_p ())
2804 if (res)
2805 dump_printf_loc (MSG_NOTE, vect_location,
2806 "***** Analysis succeeded with vector mode %s\n",
2807 GET_MODE_NAME (loop_vinfo->vector_mode));
2808 else
2809 dump_printf_loc (MSG_NOTE, vect_location,
2810 "***** Analysis failed with vector mode %s\n",
2811 GET_MODE_NAME (loop_vinfo->vector_mode));
2814 loop->aux = NULL;
2816 if (!fatal)
2817 while (mode_i < vector_modes.length ()
2818 && vect_chooses_same_modes_p (loop_vinfo, vector_modes[mode_i]))
2820 if (dump_enabled_p ())
2821 dump_printf_loc (MSG_NOTE, vect_location,
2822 "***** The result for vector mode %s would"
2823 " be the same\n",
2824 GET_MODE_NAME (vector_modes[mode_i]));
2825 mode_i += 1;
2828 if (res)
2830 LOOP_VINFO_VECTORIZABLE_P (loop_vinfo) = 1;
2831 vectorized_loops++;
2833 /* Once we hit the desired simdlen for the first time,
2834 discard any previous attempts. */
2835 if (simdlen
2836 && known_eq (LOOP_VINFO_VECT_FACTOR (loop_vinfo), simdlen))
2838 delete first_loop_vinfo;
2839 first_loop_vinfo = opt_loop_vec_info::success (NULL);
2840 LOOP_VINFO_ORIG_LOOP_INFO (loop_vinfo) = NULL;
2841 simdlen = 0;
2843 else if (pick_lowest_cost_p && first_loop_vinfo)
2845 /* Keep trying to roll back vectorization attempts while the
2846 loop_vec_infos they produced were worse than this one. */
2847 vec<loop_vec_info> &vinfos = first_loop_vinfo->epilogue_vinfos;
2848 while (!vinfos.is_empty ()
2849 && vect_joust_loop_vinfos (loop_vinfo, vinfos.last ()))
2851 gcc_assert (vect_epilogues);
2852 delete vinfos.pop ();
2854 if (vinfos.is_empty ()
2855 && vect_joust_loop_vinfos (loop_vinfo, first_loop_vinfo))
2857 delete first_loop_vinfo;
2858 first_loop_vinfo = opt_loop_vec_info::success (NULL);
2859 LOOP_VINFO_ORIG_LOOP_INFO (loop_vinfo) = NULL;
2863 if (first_loop_vinfo == NULL)
2865 first_loop_vinfo = loop_vinfo;
2866 lowest_th = LOOP_VINFO_VERSIONING_THRESHOLD (first_loop_vinfo);
2868 else if (vect_epilogues
2869 /* For now only allow one epilogue loop. */
2870 && first_loop_vinfo->epilogue_vinfos.is_empty ())
2872 first_loop_vinfo->epilogue_vinfos.safe_push (loop_vinfo);
2873 poly_uint64 th = LOOP_VINFO_VERSIONING_THRESHOLD (loop_vinfo);
2874 gcc_assert (!LOOP_REQUIRES_VERSIONING (loop_vinfo)
2875 || maybe_ne (lowest_th, 0U));
2876 /* Keep track of the known smallest versioning
2877 threshold. */
2878 if (ordered_p (lowest_th, th))
2879 lowest_th = ordered_min (lowest_th, th);
2881 else
2883 delete loop_vinfo;
2884 loop_vinfo = opt_loop_vec_info::success (NULL);
2887 /* Only vectorize epilogues if PARAM_VECT_EPILOGUES_NOMASK is
2888 enabled, SIMDUID is not set, it is the innermost loop and we have
2889 either already found the loop's SIMDLEN or there was no SIMDLEN to
2890 begin with.
2891 TODO: Enable epilogue vectorization for loops with SIMDUID set. */
2892 vect_epilogues = (!simdlen
2893 && loop->inner == NULL
2894 && param_vect_epilogues_nomask
2895 && LOOP_VINFO_PEELING_FOR_NITER (first_loop_vinfo)
2896 && !loop->simduid
2897 /* For now only allow one epilogue loop, but allow
2898 pick_lowest_cost_p to replace it. */
2899 && (first_loop_vinfo->epilogue_vinfos.is_empty ()
2900 || pick_lowest_cost_p));
2902 /* Commit to first_loop_vinfo if we have no reason to try
2903 alternatives. */
2904 if (!simdlen && !vect_epilogues && !pick_lowest_cost_p)
2905 break;
2907 else
2909 delete loop_vinfo;
2910 loop_vinfo = opt_loop_vec_info::success (NULL);
2911 if (fatal)
2913 gcc_checking_assert (first_loop_vinfo == NULL);
2914 break;
2918 /* Handle the case where the original loop can use partial
2919 vectorization, but we only want to adopt it for the epilogue.
2920 The retry should use the same vector mode as the original loop. */
2921 if (vect_epilogues
2922 && loop_vinfo
2923 && LOOP_VINFO_EPIL_USING_PARTIAL_VECTORS_P (loop_vinfo))
2925 gcc_assert (LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo)
2926 && !LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo));
2927 if (dump_enabled_p ())
2928 dump_printf_loc (MSG_NOTE, vect_location,
2929 "***** Re-trying analysis with same vector mode"
2930 " %s for epilogue with partial vectors.\n",
2931 GET_MODE_NAME (loop_vinfo->vector_mode));
2932 continue;
2935 if (mode_i < vector_modes.length ()
2936 && VECTOR_MODE_P (autodetected_vector_mode)
2937 && (related_vector_mode (vector_modes[mode_i],
2938 GET_MODE_INNER (autodetected_vector_mode))
2939 == autodetected_vector_mode)
2940 && (related_vector_mode (autodetected_vector_mode,
2941 GET_MODE_INNER (vector_modes[mode_i]))
2942 == vector_modes[mode_i]))
2944 if (dump_enabled_p ())
2945 dump_printf_loc (MSG_NOTE, vect_location,
2946 "***** Skipping vector mode %s, which would"
2947 " repeat the analysis for %s\n",
2948 GET_MODE_NAME (vector_modes[mode_i]),
2949 GET_MODE_NAME (autodetected_vector_mode));
2950 mode_i += 1;
2953 if (mode_i == vector_modes.length ()
2954 || autodetected_vector_mode == VOIDmode)
2955 break;
2957 /* Try the next biggest vector size. */
2958 next_vector_mode = vector_modes[mode_i++];
2959 if (dump_enabled_p ())
2960 dump_printf_loc (MSG_NOTE, vect_location,
2961 "***** Re-trying analysis with vector mode %s\n",
2962 GET_MODE_NAME (next_vector_mode));
2965 if (first_loop_vinfo)
2967 loop->aux = (loop_vec_info) first_loop_vinfo;
2968 if (dump_enabled_p ())
2969 dump_printf_loc (MSG_NOTE, vect_location,
2970 "***** Choosing vector mode %s\n",
2971 GET_MODE_NAME (first_loop_vinfo->vector_mode));
2972 LOOP_VINFO_VERSIONING_THRESHOLD (first_loop_vinfo) = lowest_th;
2973 return first_loop_vinfo;
2976 return opt_loop_vec_info::propagate_failure (res);
2979 /* Return true if there is an in-order reduction function for CODE, storing
2980 it in *REDUC_FN if so. */
2982 static bool
2983 fold_left_reduction_fn (tree_code code, internal_fn *reduc_fn)
2985 switch (code)
2987 case PLUS_EXPR:
2988 *reduc_fn = IFN_FOLD_LEFT_PLUS;
2989 return true;
2991 default:
2992 return false;
2996 /* Function reduction_fn_for_scalar_code
2998 Input:
2999 CODE - tree_code of a reduction operation.
3001 Output:
3002 REDUC_FN - the corresponding internal function to be used to reduce the
3003 vector of partial results into a single scalar result, or IFN_LAST
3004 if the operation is a supported reduction operation, but does not have
3005 such an internal function.
3007 Return FALSE if CODE currently cannot be vectorized as a reduction. */
3009 static bool
3010 reduction_fn_for_scalar_code (enum tree_code code, internal_fn *reduc_fn)
3012 switch (code)
3014 case MAX_EXPR:
3015 *reduc_fn = IFN_REDUC_MAX;
3016 return true;
3018 case MIN_EXPR:
3019 *reduc_fn = IFN_REDUC_MIN;
3020 return true;
3022 case PLUS_EXPR:
3023 *reduc_fn = IFN_REDUC_PLUS;
3024 return true;
3026 case BIT_AND_EXPR:
3027 *reduc_fn = IFN_REDUC_AND;
3028 return true;
3030 case BIT_IOR_EXPR:
3031 *reduc_fn = IFN_REDUC_IOR;
3032 return true;
3034 case BIT_XOR_EXPR:
3035 *reduc_fn = IFN_REDUC_XOR;
3036 return true;
3038 case MULT_EXPR:
3039 case MINUS_EXPR:
3040 *reduc_fn = IFN_LAST;
3041 return true;
3043 default:
3044 return false;
3048 /* If there is a neutral value X such that SLP reduction NODE would not
3049 be affected by the introduction of additional X elements, return that X,
3050 otherwise return null. CODE is the code of the reduction and VECTOR_TYPE
3051 is the vector type that would hold element X. REDUC_CHAIN is true if
3052 the SLP statements perform a single reduction, false if each statement
3053 performs an independent reduction. */
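/* For example, 0 is neutral for a PLUS_EXPR or BIT_IOR_EXPR reduction,
   1 for a MULT_EXPR reduction and an all-ones value for a BIT_AND_EXPR
   reduction: padding the vectors with extra neutral elements leaves the
   final reduced value unchanged.  */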
3055 static tree
3056 neutral_op_for_slp_reduction (slp_tree slp_node, tree vector_type,
3057 tree_code code, bool reduc_chain)
3059 vec<stmt_vec_info> stmts = SLP_TREE_SCALAR_STMTS (slp_node);
3060 stmt_vec_info stmt_vinfo = stmts[0];
3061 tree scalar_type = TREE_TYPE (vector_type);
3062 class loop *loop = gimple_bb (stmt_vinfo->stmt)->loop_father;
3063 gcc_assert (loop);
3065 switch (code)
3067 case WIDEN_SUM_EXPR:
3068 case DOT_PROD_EXPR:
3069 case SAD_EXPR:
3070 case PLUS_EXPR:
3071 case MINUS_EXPR:
3072 case BIT_IOR_EXPR:
3073 case BIT_XOR_EXPR:
3074 return build_zero_cst (scalar_type);
3076 case MULT_EXPR:
3077 return build_one_cst (scalar_type);
3079 case BIT_AND_EXPR:
3080 return build_all_ones_cst (scalar_type);
3082 case MAX_EXPR:
3083 case MIN_EXPR:
3084 /* For MIN/MAX the initial values are neutral. A reduction chain
3085 has only a single initial value, so that value is neutral for
3086 all statements. */
3087 if (reduc_chain)
3088 return PHI_ARG_DEF_FROM_EDGE (stmt_vinfo->stmt,
3089 loop_preheader_edge (loop));
3090 return NULL_TREE;
3092 default:
3093 return NULL_TREE;
3097 /* Error reporting helper for vect_is_simple_reduction below. GIMPLE statement
3098 STMT is printed with a message MSG. */
3100 static void
3101 report_vect_op (dump_flags_t msg_type, gimple *stmt, const char *msg)
3103 dump_printf_loc (msg_type, vect_location, "%s%G", msg, stmt);
3106 /* Return true if we need an in-order reduction for operation CODE
3107 on type TYPE, i.e. whether the reduction has to be computed strictly
3108 in the original (left-to-right) order. */
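/* For instance, a float PLUS_EXPR reduction compiled without
   -fassociative-math must be computed as an in-order (fold-left) reduction,
   whereas float MIN_EXPR/MAX_EXPR reductions never need to be.  */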
3110 bool
3111 needs_fold_left_reduction_p (tree type, tree_code code)
3113 /* CHECKME: check for !flag_finite_math_only too? */
3114 if (SCALAR_FLOAT_TYPE_P (type))
3115 switch (code)
3117 case MIN_EXPR:
3118 case MAX_EXPR:
3119 return false;
3121 default:
3122 return !flag_associative_math;
3125 if (INTEGRAL_TYPE_P (type))
3127 if (!operation_no_trapping_overflow (type, code))
3128 return true;
3129 return false;
3132 if (SAT_FIXED_POINT_TYPE_P (type))
3133 return true;
3135 return false;
3138 /* Return true if the reduction PHI in LOOP with latch arg LOOP_ARG
3139 has a handled computation expression. Store the main reduction
3140 operation in *CODE. */
3142 static bool
3143 check_reduction_path (dump_user_location_t loc, loop_p loop, gphi *phi,
3144 tree loop_arg, enum tree_code *code,
3145 vec<std::pair<ssa_op_iter, use_operand_p> > &path)
3147 auto_bitmap visited;
3148 tree lookfor = PHI_RESULT (phi);
3149 ssa_op_iter curri;
3150 use_operand_p curr = op_iter_init_phiuse (&curri, phi, SSA_OP_USE);
3151 while (USE_FROM_PTR (curr) != loop_arg)
3152 curr = op_iter_next_use (&curri);
3153 curri.i = curri.numops;
3156 path.safe_push (std::make_pair (curri, curr));
3157 tree use = USE_FROM_PTR (curr);
3158 if (use == lookfor)
3159 break;
3160 gimple *def = SSA_NAME_DEF_STMT (use);
3161 if (gimple_nop_p (def)
3162 || ! flow_bb_inside_loop_p (loop, gimple_bb (def)))
3164 pop:
3167 std::pair<ssa_op_iter, use_operand_p> x = path.pop ();
3168 curri = x.first;
3169 curr = x.second;
3171 curr = op_iter_next_use (&curri);
3172 /* Skip already visited or non-SSA operands (from iterating
3173 over PHI args). */
3174 while (curr != NULL_USE_OPERAND_P
3175 && (TREE_CODE (USE_FROM_PTR (curr)) != SSA_NAME
3176 || ! bitmap_set_bit (visited,
3177 SSA_NAME_VERSION
3178 (USE_FROM_PTR (curr)))));
3180 while (curr == NULL_USE_OPERAND_P && ! path.is_empty ());
3181 if (curr == NULL_USE_OPERAND_P)
3182 break;
3184 else
3186 if (gimple_code (def) == GIMPLE_PHI)
3187 curr = op_iter_init_phiuse (&curri, as_a <gphi *>(def), SSA_OP_USE);
3188 else
3189 curr = op_iter_init_use (&curri, def, SSA_OP_USE);
3190 while (curr != NULL_USE_OPERAND_P
3191 && (TREE_CODE (USE_FROM_PTR (curr)) != SSA_NAME
3192 || ! bitmap_set_bit (visited,
3193 SSA_NAME_VERSION
3194 (USE_FROM_PTR (curr)))))
3195 curr = op_iter_next_use (&curri);
3196 if (curr == NULL_USE_OPERAND_P)
3197 goto pop;
3200 while (1);
3201 if (dump_file && (dump_flags & TDF_DETAILS))
3203 dump_printf_loc (MSG_NOTE, loc, "reduction path: ");
3204 unsigned i;
3205 std::pair<ssa_op_iter, use_operand_p> *x;
3206 FOR_EACH_VEC_ELT (path, i, x)
3207 dump_printf (MSG_NOTE, "%T ", USE_FROM_PTR (x->second));
3208 dump_printf (MSG_NOTE, "\n");
3211 /* Check whether the reduction path detected is valid. */
3212 bool fail = path.length () == 0;
3213 bool neg = false;
3214 int sign = -1;
3215 *code = ERROR_MARK;
3216 for (unsigned i = 1; i < path.length (); ++i)
3218 gimple *use_stmt = USE_STMT (path[i].second);
3219 tree op = USE_FROM_PTR (path[i].second);
3220 if (! is_gimple_assign (use_stmt)
3221 /* The following makes sure we can compute the operand index
3222 easily, plus it mostly disallows chaining via COND_EXPR condition
3223 operands. */
3224 || (gimple_assign_rhs1_ptr (use_stmt) != path[i].second->use
3225 && (gimple_num_ops (use_stmt) <= 2
3226 || gimple_assign_rhs2_ptr (use_stmt) != path[i].second->use)
3227 && (gimple_num_ops (use_stmt) <= 3
3228 || gimple_assign_rhs3_ptr (use_stmt) != path[i].second->use)))
3230 fail = true;
3231 break;
3233 /* Check that the op is used in only a single stmt inside
3234 the loop. */
3235 imm_use_iterator imm_iter;
3236 gimple *op_use_stmt;
3237 unsigned cnt = 0;
3238 FOR_EACH_IMM_USE_STMT (op_use_stmt, imm_iter, op)
3239 if (!is_gimple_debug (op_use_stmt)
3240 && flow_bb_inside_loop_p (loop, gimple_bb (op_use_stmt)))
3242 /* We want to allow x + x but not x < 1 ? x : 2. */
3243 if (is_gimple_assign (op_use_stmt)
3244 && gimple_assign_rhs_code (op_use_stmt) == COND_EXPR)
3246 use_operand_p use_p;
3247 FOR_EACH_IMM_USE_ON_STMT (use_p, imm_iter)
3248 cnt++;
3250 else
3251 cnt++;
3253 if (cnt != 1)
3255 fail = true;
3256 break;
3258 tree_code use_code = gimple_assign_rhs_code (use_stmt);
3259 if (use_code == MINUS_EXPR)
3261 use_code = PLUS_EXPR;
3262 /* Track whether we negate the reduction value each iteration. */
3263 if (gimple_assign_rhs2 (use_stmt) == op)
3264 neg = ! neg;
3266 if (CONVERT_EXPR_CODE_P (use_code)
3267 && tree_nop_conversion_p (TREE_TYPE (gimple_assign_lhs (use_stmt)),
3268 TREE_TYPE (gimple_assign_rhs1 (use_stmt))))
3270 else if (*code == ERROR_MARK)
3272 *code = use_code;
3273 sign = TYPE_SIGN (TREE_TYPE (gimple_assign_lhs (use_stmt)));
3275 else if (use_code != *code)
3277 fail = true;
3278 break;
3280 else if ((use_code == MIN_EXPR
3281 || use_code == MAX_EXPR)
3282 && sign != TYPE_SIGN (TREE_TYPE (gimple_assign_lhs (use_stmt))))
3284 fail = true;
3285 break;
3288 return ! fail && ! neg && *code != ERROR_MARK;
3291 bool
3292 check_reduction_path (dump_user_location_t loc, loop_p loop, gphi *phi,
3293 tree loop_arg, enum tree_code code)
3295 auto_vec<std::pair<ssa_op_iter, use_operand_p> > path;
3296 enum tree_code code_;
3297 return (check_reduction_path (loc, loop, phi, loop_arg, &code_, path)
3298 && code_ == code);
3303 /* Function vect_is_simple_reduction
3305 (1) Detect a cross-iteration def-use cycle that represents a simple
3306 reduction computation. We look for the following pattern:
3308 loop_header:
3309 a1 = phi < a0, a2 >
3310 a3 = ...
3311 a2 = operation (a3, a1)
3313 or
3315 a3 = ...
3316 loop_header:
3317 a1 = phi < a0, a2 >
3318 a2 = operation (a3, a1)
3320 such that:
3321 1. operation is commutative and associative and it is safe to
3322 change the order of the computation
3323 2. no uses for a2 in the loop (a2 is used out of the loop)
3324 3. no uses of a1 in the loop besides the reduction operation
3325 4. no uses of a1 outside the loop.
3327 Conditions 1,4 are tested here.
3328 Conditions 2,3 are tested in vect_mark_stmts_to_be_vectorized.
3330 (2) Detect a cross-iteration def-use cycle in nested loops, i.e.,
3331 nested cycles.
3333 (3) Detect cycles of phi nodes in outer-loop vectorization, i.e., double
3334 reductions:
3336 a1 = phi < a0, a2 >
3337 inner loop (def of a3)
3338 a2 = phi < a3 >
3340 (4) Detect condition expressions, i.e.:
3341 for (int i = 0; i < N; i++)
3342 if (a[i] < val)
3343 ret_val = a[i];
3347 static stmt_vec_info
3348 vect_is_simple_reduction (loop_vec_info loop_info, stmt_vec_info phi_info,
3349 bool *double_reduc, bool *reduc_chain_p)
3351 gphi *phi = as_a <gphi *> (phi_info->stmt);
3352 gimple *phi_use_stmt = NULL;
3353 imm_use_iterator imm_iter;
3354 use_operand_p use_p;
3356 *double_reduc = false;
3357 *reduc_chain_p = false;
3358 STMT_VINFO_REDUC_TYPE (phi_info) = TREE_CODE_REDUCTION;
3360 tree phi_name = PHI_RESULT (phi);
3361 /* ??? If there are no uses of the PHI result the inner loop reduction
3362 won't be detected as possibly double-reduction by vectorizable_reduction
3363 because that tries to walk the PHI arg from the preheader edge which
3364 can be constant. See PR60382. */
3365 if (has_zero_uses (phi_name))
3366 return NULL;
3367 class loop *loop = (gimple_bb (phi))->loop_father;
3368 unsigned nphi_def_loop_uses = 0;
3369 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, phi_name)
3371 gimple *use_stmt = USE_STMT (use_p);
3372 if (is_gimple_debug (use_stmt))
3373 continue;
3375 if (!flow_bb_inside_loop_p (loop, gimple_bb (use_stmt)))
3377 if (dump_enabled_p ())
3378 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3379 "intermediate value used outside loop.\n");
3381 return NULL;
3384 nphi_def_loop_uses++;
3385 phi_use_stmt = use_stmt;
3388 tree latch_def = PHI_ARG_DEF_FROM_EDGE (phi, loop_latch_edge (loop));
3389 if (TREE_CODE (latch_def) != SSA_NAME)
3391 if (dump_enabled_p ())
3392 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3393 "reduction: not ssa_name: %T\n", latch_def);
3394 return NULL;
3397 stmt_vec_info def_stmt_info = loop_info->lookup_def (latch_def);
3398 if (!def_stmt_info
3399 || !flow_bb_inside_loop_p (loop, gimple_bb (def_stmt_info->stmt)))
3400 return NULL;
3402 bool nested_in_vect_loop
3403 = flow_loop_nested_p (LOOP_VINFO_LOOP (loop_info), loop);
3404 unsigned nlatch_def_loop_uses = 0;
3405 auto_vec<gphi *, 3> lcphis;
3406 bool inner_loop_of_double_reduc = false;
3407 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, latch_def)
3409 gimple *use_stmt = USE_STMT (use_p);
3410 if (is_gimple_debug (use_stmt))
3411 continue;
3412 if (flow_bb_inside_loop_p (loop, gimple_bb (use_stmt)))
3413 nlatch_def_loop_uses++;
3414 else
3416 /* We can have more than one loop-closed PHI. */
3417 lcphis.safe_push (as_a <gphi *> (use_stmt));
3418 if (nested_in_vect_loop
3419 && (STMT_VINFO_DEF_TYPE (loop_info->lookup_stmt (use_stmt))
3420 == vect_double_reduction_def))
3421 inner_loop_of_double_reduc = true;
3425 /* If we are vectorizing an inner reduction, we execute it
3426 in the original order only if we are not dealing with a
3427 double reduction. */
3428 if (nested_in_vect_loop && !inner_loop_of_double_reduc)
3430 if (dump_enabled_p ())
3431 report_vect_op (MSG_NOTE, def_stmt_info->stmt,
3432 "detected nested cycle: ");
3433 return def_stmt_info;
3436 /* If this isn't a nested cycle or if the nested cycle reduction value
3437 is used outside of the inner loop, we cannot handle uses of the reduction
3438 value. */
3439 if (nlatch_def_loop_uses > 1 || nphi_def_loop_uses > 1)
3441 if (dump_enabled_p ())
3442 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3443 "reduction used in loop.\n");
3444 return NULL;
3447 /* If DEF_STMT is a phi node itself, we expect it to have a single argument
3448 defined in the inner loop. */
3449 if (gphi *def_stmt = dyn_cast <gphi *> (def_stmt_info->stmt))
3451 tree op1 = PHI_ARG_DEF (def_stmt, 0);
3452 if (gimple_phi_num_args (def_stmt) != 1
3453 || TREE_CODE (op1) != SSA_NAME)
3455 if (dump_enabled_p ())
3456 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3457 "unsupported phi node definition.\n");
3459 return NULL;
3462 gimple *def1 = SSA_NAME_DEF_STMT (op1);
3463 if (gimple_bb (def1)
3464 && flow_bb_inside_loop_p (loop, gimple_bb (def_stmt))
3465 && loop->inner
3466 && flow_bb_inside_loop_p (loop->inner, gimple_bb (def1))
3467 && is_gimple_assign (def1)
3468 && is_a <gphi *> (phi_use_stmt)
3469 && flow_bb_inside_loop_p (loop->inner, gimple_bb (phi_use_stmt)))
3471 if (dump_enabled_p ())
3472 report_vect_op (MSG_NOTE, def_stmt,
3473 "detected double reduction: ");
3475 *double_reduc = true;
3476 return def_stmt_info;
3479 return NULL;
 3482   /* Look for the expression computing latch_def from the loop PHI result.  */
3483 auto_vec<std::pair<ssa_op_iter, use_operand_p> > path;
3484 enum tree_code code;
3485 if (check_reduction_path (vect_location, loop, phi, latch_def, &code,
3486 path))
3488 STMT_VINFO_REDUC_CODE (phi_info) = code;
3489 if (code == COND_EXPR && !nested_in_vect_loop)
3490 STMT_VINFO_REDUC_TYPE (phi_info) = COND_REDUCTION;
3492 /* Fill in STMT_VINFO_REDUC_IDX and gather stmts for an SLP
3493 reduction chain for which the additional restriction is that
3494 all operations in the chain are the same. */
3495 auto_vec<stmt_vec_info, 8> reduc_chain;
3496 unsigned i;
3497 bool is_slp_reduc = !nested_in_vect_loop && code != COND_EXPR;
3498 for (i = path.length () - 1; i >= 1; --i)
3500 gimple *stmt = USE_STMT (path[i].second);
3501 stmt_vec_info stmt_info = loop_info->lookup_stmt (stmt);
3502 STMT_VINFO_REDUC_IDX (stmt_info)
3503 = path[i].second->use - gimple_assign_rhs1_ptr (stmt);
3504 enum tree_code stmt_code = gimple_assign_rhs_code (stmt);
3505 bool leading_conversion = (CONVERT_EXPR_CODE_P (stmt_code)
3506 && (i == 1 || i == path.length () - 1));
3507 if ((stmt_code != code && !leading_conversion)
3508 /* We can only handle the final value in epilogue
3509 generation for reduction chains. */
3510 || (i != 1 && !has_single_use (gimple_assign_lhs (stmt))))
3511 is_slp_reduc = false;
 3512 	  /* For reduction chains we support trailing/leading
 3513 	     conversions.  We do not store those in the actual chain.  */
3514 if (leading_conversion)
3515 continue;
3516 reduc_chain.safe_push (stmt_info);
3518 if (is_slp_reduc && reduc_chain.length () > 1)
3520 for (unsigned i = 0; i < reduc_chain.length () - 1; ++i)
3522 REDUC_GROUP_FIRST_ELEMENT (reduc_chain[i]) = reduc_chain[0];
3523 REDUC_GROUP_NEXT_ELEMENT (reduc_chain[i]) = reduc_chain[i+1];
3525 REDUC_GROUP_FIRST_ELEMENT (reduc_chain.last ()) = reduc_chain[0];
3526 REDUC_GROUP_NEXT_ELEMENT (reduc_chain.last ()) = NULL;
3528 /* Save the chain for further analysis in SLP detection. */
3529 LOOP_VINFO_REDUCTION_CHAINS (loop_info).safe_push (reduc_chain[0]);
3530 REDUC_GROUP_SIZE (reduc_chain[0]) = reduc_chain.length ();
3532 *reduc_chain_p = true;
3533 if (dump_enabled_p ())
3534 dump_printf_loc (MSG_NOTE, vect_location,
3535 "reduction: detected reduction chain\n");
3537 else if (dump_enabled_p ())
3538 dump_printf_loc (MSG_NOTE, vect_location,
3539 "reduction: detected reduction\n");
3541 return def_stmt_info;
3544 if (dump_enabled_p ())
3545 dump_printf_loc (MSG_NOTE, vect_location,
3546 "reduction: unknown pattern\n");
3548 return NULL;
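/* Illustrative example of the chain detection above (hypothetical source):
   for
     s = s + a[i];
   a single PLUS_EXPR statement closes the cycle and a plain reduction is
   recorded, whereas for
     s = s + a[i];
     s = s + b[i];
   both statements use PLUS_EXPR and each intermediate result has a single
   use, so they are linked through REDUC_GROUP_FIRST_ELEMENT and
   REDUC_GROUP_NEXT_ELEMENT and pushed as a reduction chain for later SLP
   analysis.  */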
3551 /* Estimate the number of peeled epilogue iterations for LOOP_VINFO.
3552 PEEL_ITERS_PROLOGUE is the number of peeled prologue iterations,
3553 or -1 if not known. */
3555 static int
3556 vect_get_peel_iters_epilogue (loop_vec_info loop_vinfo, int peel_iters_prologue)
3558 int assumed_vf = vect_vf_for_cost (loop_vinfo);
3559 if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo) || peel_iters_prologue == -1)
3561 if (dump_enabled_p ())
3562 dump_printf_loc (MSG_NOTE, vect_location,
3563 "cost model: epilogue peel iters set to vf/2 "
 3564 		     "because loop iterations are unknown.\n");
3565 return assumed_vf / 2;
3567 else
3569 int niters = LOOP_VINFO_INT_NITERS (loop_vinfo);
3570 peel_iters_prologue = MIN (niters, peel_iters_prologue);
3571 int peel_iters_epilogue = (niters - peel_iters_prologue) % assumed_vf;
3572 /* If we need to peel for gaps, but no peeling is required, we have to
3573 peel VF iterations. */
3574 if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo) && !peel_iters_epilogue)
3575 peel_iters_epilogue = assumed_vf;
3576 return peel_iters_epilogue;
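/* For illustration (hypothetical numbers): with LOOP_VINFO_INT_NITERS = 100,
   an assumed vectorization factor of 8 and PEEL_ITERS_PROLOGUE = 3, the
   code above computes (100 - 3) % 8 = 1 epilogue iteration; if peeling for
   gaps were required and that remainder were 0, the result would instead be
   the full vectorization factor, 8.  */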
3580 /* Calculate cost of peeling the loop PEEL_ITERS_PROLOGUE times. */
 3581 int
 3582 vect_get_known_peeling_cost (loop_vec_info loop_vinfo, int peel_iters_prologue,
3583 int *peel_iters_epilogue,
3584 stmt_vector_for_cost *scalar_cost_vec,
3585 stmt_vector_for_cost *prologue_cost_vec,
3586 stmt_vector_for_cost *epilogue_cost_vec)
3588 int retval = 0;
3590 *peel_iters_epilogue
3591 = vect_get_peel_iters_epilogue (loop_vinfo, peel_iters_prologue);
3593 if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
 3595       /* If peeled iterations are known but the number of scalar loop
 3596 	 iterations is unknown, count a taken branch per peeled loop.  */
3597 if (peel_iters_prologue > 0)
3598 retval = record_stmt_cost (prologue_cost_vec, 1, cond_branch_taken,
3599 NULL, NULL_TREE, 0, vect_prologue);
3600 if (*peel_iters_epilogue > 0)
3601 retval += record_stmt_cost (epilogue_cost_vec, 1, cond_branch_taken,
3602 NULL, NULL_TREE, 0, vect_epilogue);
3605 stmt_info_for_cost *si;
3606 int j;
3607 if (peel_iters_prologue)
3608 FOR_EACH_VEC_ELT (*scalar_cost_vec, j, si)
3609 retval += record_stmt_cost (prologue_cost_vec,
3610 si->count * peel_iters_prologue,
3611 si->kind, si->stmt_info, si->misalign,
3612 vect_prologue);
3613 if (*peel_iters_epilogue)
3614 FOR_EACH_VEC_ELT (*scalar_cost_vec, j, si)
3615 retval += record_stmt_cost (epilogue_cost_vec,
3616 si->count * *peel_iters_epilogue,
3617 si->kind, si->stmt_info, si->misalign,
3618 vect_epilogue);
3620 return retval;
3623 /* Function vect_estimate_min_profitable_iters
3625 Return the number of iterations required for the vector version of the
3626 loop to be profitable relative to the cost of the scalar version of the
3627 loop.
3629 *RET_MIN_PROFITABLE_NITERS is a cost model profitability threshold
3630 of iterations for vectorization. -1 value means loop vectorization
3631 is not profitable. This returned value may be used for dynamic
3632 profitability check.
3634 *RET_MIN_PROFITABLE_ESTIMATE is a profitability threshold to be used
3635 for static check against estimated number of iterations. */
3637 static void
3638 vect_estimate_min_profitable_iters (loop_vec_info loop_vinfo,
3639 int *ret_min_profitable_niters,
3640 int *ret_min_profitable_estimate)
3642 int min_profitable_iters;
3643 int min_profitable_estimate;
3644 int peel_iters_prologue;
3645 int peel_iters_epilogue;
3646 unsigned vec_inside_cost = 0;
3647 int vec_outside_cost = 0;
3648 unsigned vec_prologue_cost = 0;
3649 unsigned vec_epilogue_cost = 0;
3650 int scalar_single_iter_cost = 0;
3651 int scalar_outside_cost = 0;
3652 int assumed_vf = vect_vf_for_cost (loop_vinfo);
3653 int npeel = LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo);
3654 void *target_cost_data = LOOP_VINFO_TARGET_COST_DATA (loop_vinfo);
3656 /* Cost model disabled. */
3657 if (unlimited_cost_model (LOOP_VINFO_LOOP (loop_vinfo)))
3659 if (dump_enabled_p ())
3660 dump_printf_loc (MSG_NOTE, vect_location, "cost model disabled.\n");
3661 *ret_min_profitable_niters = 0;
3662 *ret_min_profitable_estimate = 0;
3663 return;
3666 /* Requires loop versioning tests to handle misalignment. */
3667 if (LOOP_REQUIRES_VERSIONING_FOR_ALIGNMENT (loop_vinfo))
3669 /* FIXME: Make cost depend on complexity of individual check. */
3670 unsigned len = LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).length ();
3671 (void) add_stmt_cost (loop_vinfo, target_cost_data, len, vector_stmt,
3672 NULL, NULL_TREE, 0, vect_prologue);
3673 if (dump_enabled_p ())
3674 dump_printf (MSG_NOTE,
3675 "cost model: Adding cost of checks for loop "
3676 "versioning to treat misalignment.\n");
3679 /* Requires loop versioning with alias checks. */
3680 if (LOOP_REQUIRES_VERSIONING_FOR_ALIAS (loop_vinfo))
3682 /* FIXME: Make cost depend on complexity of individual check. */
3683 unsigned len = LOOP_VINFO_COMP_ALIAS_DDRS (loop_vinfo).length ();
3684 (void) add_stmt_cost (loop_vinfo, target_cost_data, len, vector_stmt,
3685 NULL, NULL_TREE, 0, vect_prologue);
3686 len = LOOP_VINFO_CHECK_UNEQUAL_ADDRS (loop_vinfo).length ();
3687 if (len)
3688 /* Count LEN - 1 ANDs and LEN comparisons. */
3689 (void) add_stmt_cost (loop_vinfo, target_cost_data, len * 2 - 1,
3690 scalar_stmt, NULL, NULL_TREE, 0, vect_prologue);
3691 len = LOOP_VINFO_LOWER_BOUNDS (loop_vinfo).length ();
3692 if (len)
3694 /* Count LEN - 1 ANDs and LEN comparisons. */
3695 unsigned int nstmts = len * 2 - 1;
3696 /* +1 for each bias that needs adding. */
3697 for (unsigned int i = 0; i < len; ++i)
3698 if (!LOOP_VINFO_LOWER_BOUNDS (loop_vinfo)[i].unsigned_p)
3699 nstmts += 1;
3700 (void) add_stmt_cost (loop_vinfo, target_cost_data, nstmts,
3701 scalar_stmt, NULL, NULL_TREE, 0, vect_prologue);
3703 if (dump_enabled_p ())
3704 dump_printf (MSG_NOTE,
3705 "cost model: Adding cost of checks for loop "
3706 "versioning aliasing.\n");
3709 /* Requires loop versioning with niter checks. */
3710 if (LOOP_REQUIRES_VERSIONING_FOR_NITERS (loop_vinfo))
3712 /* FIXME: Make cost depend on complexity of individual check. */
3713 (void) add_stmt_cost (loop_vinfo, target_cost_data, 1, vector_stmt,
3714 NULL, NULL_TREE, 0, vect_prologue);
3715 if (dump_enabled_p ())
3716 dump_printf (MSG_NOTE,
3717 "cost model: Adding cost of checks for loop "
3718 "versioning niters.\n");
3721 if (LOOP_REQUIRES_VERSIONING (loop_vinfo))
3722 (void) add_stmt_cost (loop_vinfo, target_cost_data, 1, cond_branch_taken,
3723 NULL, NULL_TREE, 0, vect_prologue);
3725 /* Count statements in scalar loop. Using this as scalar cost for a single
3726 iteration for now.
3728 TODO: Add outer loop support.
3730 TODO: Consider assigning different costs to different scalar
3731 statements. */
3733 scalar_single_iter_cost
3734 = LOOP_VINFO_SINGLE_SCALAR_ITERATION_COST (loop_vinfo);
3736 /* Add additional cost for the peeled instructions in prologue and epilogue
3737 loop. (For fully-masked loops there will be no peeling.)
3739 FORNOW: If we don't know the value of peel_iters for prologue or epilogue
 3740      at compile time, we assume it's vf/2 (the worst would be vf-1).
3742 TODO: Build an expression that represents peel_iters for prologue and
3743 epilogue to be used in a run-time test. */
3745 bool prologue_need_br_taken_cost = false;
3746 bool prologue_need_br_not_taken_cost = false;
3748 /* Calculate peel_iters_prologue. */
3749 if (vect_use_loop_mask_for_alignment_p (loop_vinfo))
3750 peel_iters_prologue = 0;
3751 else if (npeel < 0)
3753 peel_iters_prologue = assumed_vf / 2;
3754 if (dump_enabled_p ())
3755 dump_printf (MSG_NOTE, "cost model: "
3756 "prologue peel iters set to vf/2.\n");
3758 /* If peeled iterations are unknown, count a taken branch and a not taken
3759 branch per peeled loop. Even if scalar loop iterations are known,
3760 vector iterations are not known since peeled prologue iterations are
3761 not known. Hence guards remain the same. */
3762 prologue_need_br_taken_cost = true;
3763 prologue_need_br_not_taken_cost = true;
3765 else
3767 peel_iters_prologue = npeel;
3768 if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo) && peel_iters_prologue > 0)
 3769 	/* If peeled iterations are known but the number of scalar loop
 3770 	   iterations is unknown, count a taken branch per peeled loop.  */
3771 prologue_need_br_taken_cost = true;
3774 bool epilogue_need_br_taken_cost = false;
3775 bool epilogue_need_br_not_taken_cost = false;
3777 /* Calculate peel_iters_epilogue. */
3778 if (LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))
3779 /* We need to peel exactly one iteration for gaps. */
3780 peel_iters_epilogue = LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo) ? 1 : 0;
3781 else if (npeel < 0)
3783 /* If peeling for alignment is unknown, loop bound of main loop
3784 becomes unknown. */
3785 peel_iters_epilogue = assumed_vf / 2;
3786 if (dump_enabled_p ())
3787 dump_printf (MSG_NOTE, "cost model: "
3788 "epilogue peel iters set to vf/2 because "
3789 "peeling for alignment is unknown.\n");
3791 /* See the same reason above in peel_iters_prologue calculation. */
3792 epilogue_need_br_taken_cost = true;
3793 epilogue_need_br_not_taken_cost = true;
3795 else
3797 peel_iters_epilogue = vect_get_peel_iters_epilogue (loop_vinfo, npeel);
3798 if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo) && peel_iters_epilogue > 0)
 3799 	/* If peeled iterations are known but the number of scalar loop
 3800 	   iterations is unknown, count a taken branch per peeled loop.  */
3801 epilogue_need_br_taken_cost = true;
3804 stmt_info_for_cost *si;
3805 int j;
3806 /* Add costs associated with peel_iters_prologue. */
3807 if (peel_iters_prologue)
3808 FOR_EACH_VEC_ELT (LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo), j, si)
3810 (void) add_stmt_cost (loop_vinfo, target_cost_data,
3811 si->count * peel_iters_prologue, si->kind,
3812 si->stmt_info, si->vectype, si->misalign,
3813 vect_prologue);
3816 /* Add costs associated with peel_iters_epilogue. */
3817 if (peel_iters_epilogue)
3818 FOR_EACH_VEC_ELT (LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo), j, si)
3820 (void) add_stmt_cost (loop_vinfo, target_cost_data,
3821 si->count * peel_iters_epilogue, si->kind,
3822 si->stmt_info, si->vectype, si->misalign,
3823 vect_epilogue);
3826 /* Add possible cond_branch_taken/cond_branch_not_taken cost. */
3828 if (prologue_need_br_taken_cost)
3829 (void) add_stmt_cost (loop_vinfo, target_cost_data, 1, cond_branch_taken,
3830 NULL, NULL_TREE, 0, vect_prologue);
3832 if (prologue_need_br_not_taken_cost)
3833 (void) add_stmt_cost (loop_vinfo, target_cost_data, 1,
3834 cond_branch_not_taken, NULL, NULL_TREE, 0,
3835 vect_prologue);
3837 if (epilogue_need_br_taken_cost)
3838 (void) add_stmt_cost (loop_vinfo, target_cost_data, 1, cond_branch_taken,
3839 NULL, NULL_TREE, 0, vect_epilogue);
3841 if (epilogue_need_br_not_taken_cost)
3842 (void) add_stmt_cost (loop_vinfo, target_cost_data, 1,
3843 cond_branch_not_taken, NULL, NULL_TREE, 0,
3844 vect_epilogue);
3846 /* Take care of special costs for rgroup controls of partial vectors. */
3847 if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
3849 /* Calculate how many masks we need to generate. */
3850 unsigned int num_masks = 0;
3851 rgroup_controls *rgm;
3852 unsigned int num_vectors_m1;
3853 FOR_EACH_VEC_ELT (LOOP_VINFO_MASKS (loop_vinfo), num_vectors_m1, rgm)
3854 if (rgm->type)
3855 num_masks += num_vectors_m1 + 1;
3856 gcc_assert (num_masks > 0);
3858 /* In the worst case, we need to generate each mask in the prologue
3859 and in the loop body. One of the loop body mask instructions
3860 replaces the comparison in the scalar loop, and since we don't
3861 count the scalar comparison against the scalar body, we shouldn't
3862 count that vector instruction against the vector body either.
3864 Sometimes we can use unpacks instead of generating prologue
3865 masks and sometimes the prologue mask will fold to a constant,
3866 so the actual prologue cost might be smaller. However, it's
3867 simpler and safer to use the worst-case cost; if this ends up
3868 being the tie-breaker between vectorizing or not, then it's
3869 probably better not to vectorize. */
3870 (void) add_stmt_cost (loop_vinfo, target_cost_data, num_masks,
3871 vector_stmt, NULL, NULL_TREE, 0, vect_prologue);
3872 (void) add_stmt_cost (loop_vinfo, target_cost_data, num_masks - 1,
3873 vector_stmt, NULL, NULL_TREE, 0, vect_body);
3875 else if (LOOP_VINFO_FULLY_WITH_LENGTH_P (loop_vinfo))
3877 /* Referring to the functions vect_set_loop_condition_partial_vectors
3878 and vect_set_loop_controls_directly, we need to generate each
3879 length in the prologue and in the loop body if required. Although
3880 there are some possible optimizations, we consider the worst case
3881 here. */
3883 bool niters_known_p = LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo);
3884 bool need_iterate_p
3885 = (!LOOP_VINFO_EPILOGUE_P (loop_vinfo)
3886 && !vect_known_niters_smaller_than_vf (loop_vinfo));
3888 /* Calculate how many statements to be added. */
3889 unsigned int prologue_stmts = 0;
3890 unsigned int body_stmts = 0;
3892 rgroup_controls *rgc;
3893 unsigned int num_vectors_m1;
3894 FOR_EACH_VEC_ELT (LOOP_VINFO_LENS (loop_vinfo), num_vectors_m1, rgc)
3895 if (rgc->type)
3897 /* May need one SHIFT for nitems_total computation. */
3898 unsigned nitems = rgc->max_nscalars_per_iter * rgc->factor;
3899 if (nitems != 1 && !niters_known_p)
3900 prologue_stmts += 1;
3902 /* May need one MAX and one MINUS for wrap around. */
3903 if (vect_rgroup_iv_might_wrap_p (loop_vinfo, rgc))
3904 prologue_stmts += 2;
 3906 	    /* Need one MAX and one MINUS for each batch limit except for
 3907 	       the first one.  */
3908 prologue_stmts += num_vectors_m1 * 2;
3910 unsigned int num_vectors = num_vectors_m1 + 1;
3912 /* Need to set up lengths in prologue, only one MIN required
3913 for each since start index is zero. */
3914 prologue_stmts += num_vectors;
3916 /* Each may need two MINs and one MINUS to update lengths in body
3917 for next iteration. */
3918 if (need_iterate_p)
3919 body_stmts += 3 * num_vectors;
3922 (void) add_stmt_cost (loop_vinfo, target_cost_data, prologue_stmts,
3923 scalar_stmt, NULL, NULL_TREE, 0, vect_prologue);
3924 (void) add_stmt_cost (loop_vinfo, target_cost_data, body_stmts,
3925 scalar_stmt, NULL, NULL_TREE, 0, vect_body);
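/* For illustration (hypothetical rgroup): a single length-based rgroup with
   two vectors (num_vectors_m1 == 1), nitems > 1, an unknown iteration
   count, a possibly wrapping IV and need_iterate_p true accumulates
     prologue_stmts = 1 + 2 + 1 * 2 + 2 = 7
     body_stmts     = 3 * 2             = 6
   which the two add_stmt_cost calls above record as scalar_stmt costs.  */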
3928 /* FORNOW: The scalar outside cost is incremented in one of the
3929 following ways:
3931 1. The vectorizer checks for alignment and aliasing and generates
3932 a condition that allows dynamic vectorization. A cost model
3933 check is ANDED with the versioning condition. Hence scalar code
3934 path now has the added cost of the versioning check.
3936 if (cost > th & versioning_check)
3937 jmp to vector code
3939 Hence run-time scalar is incremented by not-taken branch cost.
3941 2. The vectorizer then checks if a prologue is required. If the
3942 cost model check was not done before during versioning, it has to
3943 be done before the prologue check.
3945 if (cost <= th)
3946 prologue = scalar_iters
3947 if (prologue == 0)
3948 jmp to vector code
3949 else
3950 execute prologue
3951 if (prologue == num_iters)
3952 go to exit
3954 Hence the run-time scalar cost is incremented by a taken branch,
3955 plus a not-taken branch, plus a taken branch cost.
3957 3. The vectorizer then checks if an epilogue is required. If the
3958 cost model check was not done before during prologue check, it
3959 has to be done with the epilogue check.
3961 if (prologue == 0)
3962 jmp to vector code
3963 else
3964 execute prologue
3965 if (prologue == num_iters)
3966 go to exit
3967 vector code:
3968 if ((cost <= th) | (scalar_iters-prologue-epilogue == 0))
3969 jmp to epilogue
3971 Hence the run-time scalar cost should be incremented by 2 taken
3972 branches.
 3974      TODO: The back end may reorder the BBs differently and reverse
3975 conditions/branch directions. Change the estimates below to
3976 something more reasonable. */
3978 /* If the number of iterations is known and we do not do versioning, we can
3979 decide whether to vectorize at compile time. Hence the scalar version
 3980      does not carry cost model guard costs.  */
3981 if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
3982 || LOOP_REQUIRES_VERSIONING (loop_vinfo))
3984 /* Cost model check occurs at versioning. */
3985 if (LOOP_REQUIRES_VERSIONING (loop_vinfo))
3986 scalar_outside_cost += vect_get_stmt_cost (cond_branch_not_taken);
3987 else
3989 /* Cost model check occurs at prologue generation. */
3990 if (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) < 0)
3991 scalar_outside_cost += 2 * vect_get_stmt_cost (cond_branch_taken)
3992 + vect_get_stmt_cost (cond_branch_not_taken);
3993 /* Cost model check occurs at epilogue generation. */
3994 else
3995 scalar_outside_cost += 2 * vect_get_stmt_cost (cond_branch_taken);
3999 /* Complete the target-specific cost calculations. */
4000 finish_cost (LOOP_VINFO_TARGET_COST_DATA (loop_vinfo), &vec_prologue_cost,
4001 &vec_inside_cost, &vec_epilogue_cost);
4003 vec_outside_cost = (int)(vec_prologue_cost + vec_epilogue_cost);
4005 /* Stash the costs so that we can compare two loop_vec_infos. */
4006 loop_vinfo->vec_inside_cost = vec_inside_cost;
4007 loop_vinfo->vec_outside_cost = vec_outside_cost;
4009 if (dump_enabled_p ())
4011 dump_printf_loc (MSG_NOTE, vect_location, "Cost model analysis: \n");
4012 dump_printf (MSG_NOTE, " Vector inside of loop cost: %d\n",
4013 vec_inside_cost);
4014 dump_printf (MSG_NOTE, " Vector prologue cost: %d\n",
4015 vec_prologue_cost);
4016 dump_printf (MSG_NOTE, " Vector epilogue cost: %d\n",
4017 vec_epilogue_cost);
4018 dump_printf (MSG_NOTE, " Scalar iteration cost: %d\n",
4019 scalar_single_iter_cost);
4020 dump_printf (MSG_NOTE, " Scalar outside cost: %d\n",
4021 scalar_outside_cost);
4022 dump_printf (MSG_NOTE, " Vector outside cost: %d\n",
4023 vec_outside_cost);
4024 dump_printf (MSG_NOTE, " prologue iterations: %d\n",
4025 peel_iters_prologue);
4026 dump_printf (MSG_NOTE, " epilogue iterations: %d\n",
4027 peel_iters_epilogue);
4030 /* Calculate number of iterations required to make the vector version
4031 profitable, relative to the loop bodies only. The following condition
4032 must hold true:
4033 SIC * niters + SOC > VIC * ((niters - NPEEL) / VF) + VOC
4034 where
4035 SIC = scalar iteration cost, VIC = vector iteration cost,
4036 VOC = vector outside cost, VF = vectorization factor,
4037 NPEEL = prologue iterations + epilogue iterations,
4038 SOC = scalar outside cost for run time cost model check. */
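/* A worked instance of this condition (hypothetical numbers): with SIC = 4,
   VIC = 10, VF = 4, VOC = 30, SOC = 0 and NPEEL = 0 the inequality
     4 * niters > 10 * (niters / 4) + 30
   first holds at niters = 21, so vectorization would only pay off for trip
   counts of at least 21.  */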
4040 int saving_per_viter = (scalar_single_iter_cost * assumed_vf
4041 - vec_inside_cost);
4042 if (saving_per_viter <= 0)
4044 if (LOOP_VINFO_LOOP (loop_vinfo)->force_vectorize)
4045 warning_at (vect_location.get_location_t (), OPT_Wopenmp_simd,
4046 "vectorization did not happen for a simd loop");
4048 if (dump_enabled_p ())
4049 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4050 "cost model: the vector iteration cost = %d "
4051 "divided by the scalar iteration cost = %d "
4052 "is greater or equal to the vectorization factor = %d"
4053 ".\n",
4054 vec_inside_cost, scalar_single_iter_cost, assumed_vf);
4055 *ret_min_profitable_niters = -1;
4056 *ret_min_profitable_estimate = -1;
4057 return;
4060 /* ??? The "if" arm is written to handle all cases; see below for what
4061 we would do for !LOOP_VINFO_USING_PARTIAL_VECTORS_P. */
4062 if (LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))
4064 /* Rewriting the condition above in terms of the number of
4065 vector iterations (vniters) rather than the number of
4066 scalar iterations (niters) gives:
4068 SIC * (vniters * VF + NPEEL) + SOC > VIC * vniters + VOC
4070 <==> vniters * (SIC * VF - VIC) > VOC - SIC * NPEEL - SOC
4072 For integer N, X and Y when X > 0:
4074 N * X > Y <==> N >= (Y /[floor] X) + 1. */
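/* E.g. (hypothetical values) Y = 17, X = 5: N * 5 > 17 exactly when
   N >= 17 / 5 + 1 = 4, which is how min_vec_niters is computed below.  */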
4075 int outside_overhead = (vec_outside_cost
4076 - scalar_single_iter_cost * peel_iters_prologue
4077 - scalar_single_iter_cost * peel_iters_epilogue
4078 - scalar_outside_cost);
4079 /* We're only interested in cases that require at least one
4080 vector iteration. */
4081 int min_vec_niters = 1;
4082 if (outside_overhead > 0)
4083 min_vec_niters = outside_overhead / saving_per_viter + 1;
4085 if (dump_enabled_p ())
4086 dump_printf (MSG_NOTE, " Minimum number of vector iterations: %d\n",
4087 min_vec_niters);
4089 if (LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))
4091 /* Now that we know the minimum number of vector iterations,
4092 find the minimum niters for which the scalar cost is larger:
4094 SIC * niters > VIC * vniters + VOC - SOC
4096 We know that the minimum niters is no more than
4097 vniters * VF + NPEEL, but it might be (and often is) less
4098 than that if a partial vector iteration is cheaper than the
4099 equivalent scalar code. */
4100 int threshold = (vec_inside_cost * min_vec_niters
4101 + vec_outside_cost
4102 - scalar_outside_cost);
4103 if (threshold <= 0)
4104 min_profitable_iters = 1;
4105 else
4106 min_profitable_iters = threshold / scalar_single_iter_cost + 1;
4108 else
4109 /* Convert the number of vector iterations into a number of
4110 scalar iterations. */
4111 min_profitable_iters = (min_vec_niters * assumed_vf
4112 + peel_iters_prologue
4113 + peel_iters_epilogue);
4115 else
4117 min_profitable_iters = ((vec_outside_cost - scalar_outside_cost)
4118 * assumed_vf
4119 - vec_inside_cost * peel_iters_prologue
4120 - vec_inside_cost * peel_iters_epilogue);
4121 if (min_profitable_iters <= 0)
4122 min_profitable_iters = 0;
4123 else
4125 min_profitable_iters /= saving_per_viter;
4127 if ((scalar_single_iter_cost * assumed_vf * min_profitable_iters)
4128 <= (((int) vec_inside_cost * min_profitable_iters)
4129 + (((int) vec_outside_cost - scalar_outside_cost)
4130 * assumed_vf)))
4131 min_profitable_iters++;
4135 if (dump_enabled_p ())
4136 dump_printf (MSG_NOTE,
4137 " Calculated minimum iters for profitability: %d\n",
4138 min_profitable_iters);
4140 if (!LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo)
4141 && min_profitable_iters < (assumed_vf + peel_iters_prologue))
4142 /* We want the vectorized loop to execute at least once. */
4143 min_profitable_iters = assumed_vf + peel_iters_prologue;
4144 else if (min_profitable_iters < peel_iters_prologue)
4145 /* For LOOP_VINFO_USING_PARTIAL_VECTORS_P, we need to ensure the
4146 vectorized loop executes at least once. */
4147 min_profitable_iters = peel_iters_prologue;
4149 if (dump_enabled_p ())
4150 dump_printf_loc (MSG_NOTE, vect_location,
4151 " Runtime profitability threshold = %d\n",
4152 min_profitable_iters);
4154 *ret_min_profitable_niters = min_profitable_iters;
4156 /* Calculate number of iterations required to make the vector version
4157 profitable, relative to the loop bodies only.
 4159    The non-vectorized variant costs SIC * niters and it must win over the
 4160    vector variant on the expected loop trip count.  The following condition must hold true:
4161 SIC * niters > VIC * ((niters - NPEEL) / VF) + VOC + SOC */
4163 if (vec_outside_cost <= 0)
4164 min_profitable_estimate = 0;
4165 /* ??? This "else if" arm is written to handle all cases; see below for
4166 what we would do for !LOOP_VINFO_USING_PARTIAL_VECTORS_P. */
4167 else if (LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))
4169 /* This is a repeat of the code above, but with + SOC rather
4170 than - SOC. */
4171 int outside_overhead = (vec_outside_cost
4172 - scalar_single_iter_cost * peel_iters_prologue
4173 - scalar_single_iter_cost * peel_iters_epilogue
4174 + scalar_outside_cost);
4175 int min_vec_niters = 1;
4176 if (outside_overhead > 0)
4177 min_vec_niters = outside_overhead / saving_per_viter + 1;
4179 if (LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))
4181 int threshold = (vec_inside_cost * min_vec_niters
4182 + vec_outside_cost
4183 + scalar_outside_cost);
4184 min_profitable_estimate = threshold / scalar_single_iter_cost + 1;
4186 else
4187 min_profitable_estimate = (min_vec_niters * assumed_vf
4188 + peel_iters_prologue
4189 + peel_iters_epilogue);
4191 else
4193 min_profitable_estimate = ((vec_outside_cost + scalar_outside_cost)
4194 * assumed_vf
4195 - vec_inside_cost * peel_iters_prologue
4196 - vec_inside_cost * peel_iters_epilogue)
4197 / ((scalar_single_iter_cost * assumed_vf)
4198 - vec_inside_cost);
4200 min_profitable_estimate = MAX (min_profitable_estimate, min_profitable_iters);
4201 if (dump_enabled_p ())
4202 dump_printf_loc (MSG_NOTE, vect_location,
4203 " Static estimate profitability threshold = %d\n",
4204 min_profitable_estimate);
4206 *ret_min_profitable_estimate = min_profitable_estimate;
4209 /* Writes into SEL a mask for a vec_perm, equivalent to a vec_shr by OFFSET
4210 vector elements (not bits) for a vector with NELT elements. */
4211 static void
4212 calc_vec_perm_mask_for_shift (unsigned int offset, unsigned int nelt,
4213 vec_perm_builder *sel)
4215 /* The encoding is a single stepped pattern. Any wrap-around is handled
4216 by vec_perm_indices. */
4217 sel->new_vector (nelt, 1, 3);
4218 for (unsigned int i = 0; i < 3; i++)
4219 sel->quick_push (i + offset);
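/* As an example (hypothetical values): for OFFSET = 2 and NELT = 8 the
   three elements pushed above are {2, 3, 4}; the stepped encoding expands
   to the selector {2, 3, 4, 5, 6, 7, 8, 9}, i.e. a whole-vector shift by
   two elements, with the indexes past the first input handled by
   vec_perm_indices as noted above.  */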
4222 /* Checks whether the target supports whole-vector shifts for vectors of mode
4223 MODE. This is the case if _either_ the platform handles vec_shr_optab, _or_
4224 it supports vec_perm_const with masks for all necessary shift amounts. */
4225 static bool
4226 have_whole_vector_shift (machine_mode mode)
4228 if (optab_handler (vec_shr_optab, mode) != CODE_FOR_nothing)
4229 return true;
4231 /* Variable-length vectors should be handled via the optab. */
4232 unsigned int nelt;
4233 if (!GET_MODE_NUNITS (mode).is_constant (&nelt))
4234 return false;
4236 vec_perm_builder sel;
4237 vec_perm_indices indices;
4238 for (unsigned int i = nelt / 2; i >= 1; i /= 2)
4240 calc_vec_perm_mask_for_shift (i, nelt, &sel);
4241 indices.new_vector (sel, 2, nelt);
4242 if (!can_vec_perm_const_p (mode, indices, false))
4243 return false;
4245 return true;
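/* For a fixed-width mode with 8 elements (hypothetical), the loop above
   queries vec_perm_const support for element shifts of 4, 2 and 1, the
   offsets needed by a log2-style shift reduction sequence.  */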
4248 /* TODO: Close dependency between vect_model_*_cost and vectorizable_*
4249 functions. Design better to avoid maintenance issues. */
4251 /* Function vect_model_reduction_cost.
4253 Models cost for a reduction operation, including the vector ops
4254 generated within the strip-mine loop, the initial definition before
4255 the loop, and the epilogue code that must be generated. */
4257 static void
4258 vect_model_reduction_cost (loop_vec_info loop_vinfo,
4259 stmt_vec_info stmt_info, internal_fn reduc_fn,
4260 vect_reduction_type reduction_type,
4261 int ncopies, stmt_vector_for_cost *cost_vec)
4263 int prologue_cost = 0, epilogue_cost = 0, inside_cost;
4264 enum tree_code code;
4265 optab optab;
4266 tree vectype;
4267 machine_mode mode;
4268 class loop *loop = NULL;
4270 if (loop_vinfo)
4271 loop = LOOP_VINFO_LOOP (loop_vinfo);
4273 /* Condition reductions generate two reductions in the loop. */
4274 if (reduction_type == COND_REDUCTION)
4275 ncopies *= 2;
4277 vectype = STMT_VINFO_VECTYPE (stmt_info);
4278 mode = TYPE_MODE (vectype);
4279 stmt_vec_info orig_stmt_info = vect_orig_stmt (stmt_info);
4281 code = gimple_assign_rhs_code (orig_stmt_info->stmt);
4283 if (reduction_type == EXTRACT_LAST_REDUCTION)
4284 /* No extra instructions are needed in the prologue. The loop body
4285 operations are costed in vectorizable_condition. */
4286 inside_cost = 0;
4287 else if (reduction_type == FOLD_LEFT_REDUCTION)
4289 /* No extra instructions needed in the prologue. */
4290 prologue_cost = 0;
4292 if (reduc_fn != IFN_LAST)
4293 /* Count one reduction-like operation per vector. */
4294 inside_cost = record_stmt_cost (cost_vec, ncopies, vec_to_scalar,
4295 stmt_info, 0, vect_body);
4296 else
4298 /* Use NELEMENTS extracts and NELEMENTS scalar ops. */
4299 unsigned int nelements = ncopies * vect_nunits_for_cost (vectype);
4300 inside_cost = record_stmt_cost (cost_vec, nelements,
4301 vec_to_scalar, stmt_info, 0,
4302 vect_body);
4303 inside_cost += record_stmt_cost (cost_vec, nelements,
4304 scalar_stmt, stmt_info, 0,
4305 vect_body);
4308 else
4310 /* Add in cost for initial definition.
4311 For cond reduction we have four vectors: initial index, step,
4312 initial result of the data reduction, initial value of the index
4313 reduction. */
4314 int prologue_stmts = reduction_type == COND_REDUCTION ? 4 : 1;
4315 prologue_cost += record_stmt_cost (cost_vec, prologue_stmts,
4316 scalar_to_vec, stmt_info, 0,
4317 vect_prologue);
4319 /* Cost of reduction op inside loop. */
4320 inside_cost = record_stmt_cost (cost_vec, ncopies, vector_stmt,
4321 stmt_info, 0, vect_body);
4324 /* Determine cost of epilogue code.
4326 We have a reduction operator that will reduce the vector in one statement.
4327 Also requires scalar extract. */
4329 if (!loop || !nested_in_vect_loop_p (loop, orig_stmt_info))
4331 if (reduc_fn != IFN_LAST)
4333 if (reduction_type == COND_REDUCTION)
 4335 	  /* An EQ stmt and a COND_EXPR stmt.  */
4336 epilogue_cost += record_stmt_cost (cost_vec, 2,
4337 vector_stmt, stmt_info, 0,
4338 vect_epilogue);
4339 /* Reduction of the max index and a reduction of the found
4340 values. */
4341 epilogue_cost += record_stmt_cost (cost_vec, 2,
4342 vec_to_scalar, stmt_info, 0,
4343 vect_epilogue);
4344 /* A broadcast of the max value. */
4345 epilogue_cost += record_stmt_cost (cost_vec, 1,
4346 scalar_to_vec, stmt_info, 0,
4347 vect_epilogue);
4349 else
4351 epilogue_cost += record_stmt_cost (cost_vec, 1, vector_stmt,
4352 stmt_info, 0, vect_epilogue);
4353 epilogue_cost += record_stmt_cost (cost_vec, 1,
4354 vec_to_scalar, stmt_info, 0,
4355 vect_epilogue);
4358 else if (reduction_type == COND_REDUCTION)
4360 unsigned estimated_nunits = vect_nunits_for_cost (vectype);
4361 /* Extraction of scalar elements. */
4362 epilogue_cost += record_stmt_cost (cost_vec,
4363 2 * estimated_nunits,
4364 vec_to_scalar, stmt_info, 0,
4365 vect_epilogue);
4366 /* Scalar max reductions via COND_EXPR / MAX_EXPR. */
4367 epilogue_cost += record_stmt_cost (cost_vec,
4368 2 * estimated_nunits - 3,
4369 scalar_stmt, stmt_info, 0,
4370 vect_epilogue);
4372 else if (reduction_type == EXTRACT_LAST_REDUCTION
4373 || reduction_type == FOLD_LEFT_REDUCTION)
 4374     /* No extra instructions needed in the epilogue.  */
4376 else
4378 int vec_size_in_bits = tree_to_uhwi (TYPE_SIZE (vectype));
4379 tree bitsize =
4380 TYPE_SIZE (TREE_TYPE (gimple_assign_lhs (orig_stmt_info->stmt)));
4381 int element_bitsize = tree_to_uhwi (bitsize);
4382 int nelements = vec_size_in_bits / element_bitsize;
4384 if (code == COND_EXPR)
4385 code = MAX_EXPR;
4387 optab = optab_for_tree_code (code, vectype, optab_default);
4389 /* We have a whole vector shift available. */
4390 if (optab != unknown_optab
4391 && VECTOR_MODE_P (mode)
4392 && optab_handler (optab, mode) != CODE_FOR_nothing
4393 && have_whole_vector_shift (mode))
4395 /* Final reduction via vector shifts and the reduction operator.
4396 Also requires scalar extract. */
4397 epilogue_cost += record_stmt_cost (cost_vec,
4398 exact_log2 (nelements) * 2,
4399 vector_stmt, stmt_info, 0,
4400 vect_epilogue);
4401 epilogue_cost += record_stmt_cost (cost_vec, 1,
4402 vec_to_scalar, stmt_info, 0,
4403 vect_epilogue);
4405 else
4406 /* Use extracts and reduction op for final reduction. For N
4407 elements, we have N extracts and N-1 reduction ops. */
4408 epilogue_cost += record_stmt_cost (cost_vec,
4409 nelements + nelements - 1,
4410 vector_stmt, stmt_info, 0,
4411 vect_epilogue);
4415 if (dump_enabled_p ())
4416 dump_printf (MSG_NOTE,
4417 "vect_model_reduction_cost: inside_cost = %d, "
4418 "prologue_cost = %d, epilogue_cost = %d .\n", inside_cost,
4419 prologue_cost, epilogue_cost);
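/* To illustrate the epilogue alternatives costed above (hypothetical
   numbers): for a reduction with nelements = 4, the whole-vector shift
   scheme records exact_log2 (4) * 2 = 4 vector_stmt costs plus one
   vec_to_scalar extract, whereas the extract-based fallback records
   4 + 4 - 1 = 7 vector_stmt costs.  */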
4423 /* Function vect_model_induction_cost.
4425 Models cost for induction operations. */
4427 static void
4428 vect_model_induction_cost (stmt_vec_info stmt_info, int ncopies,
4429 stmt_vector_for_cost *cost_vec)
4431 unsigned inside_cost, prologue_cost;
4433 if (PURE_SLP_STMT (stmt_info))
4434 return;
4436 /* loop cost for vec_loop. */
4437 inside_cost = record_stmt_cost (cost_vec, ncopies, vector_stmt,
4438 stmt_info, 0, vect_body);
4440 /* prologue cost for vec_init and vec_step. */
4441 prologue_cost = record_stmt_cost (cost_vec, 2, scalar_to_vec,
4442 stmt_info, 0, vect_prologue);
4444 if (dump_enabled_p ())
4445 dump_printf_loc (MSG_NOTE, vect_location,
4446 "vect_model_induction_cost: inside_cost = %d, "
4447 "prologue_cost = %d .\n", inside_cost, prologue_cost);
4452 /* Function get_initial_def_for_reduction
4454 Input:
4455 STMT_VINFO - a stmt that performs a reduction operation in the loop.
4456 INIT_VAL - the initial value of the reduction variable
4458 Output:
4459 ADJUSTMENT_DEF - a tree that holds a value to be added to the final result
4460 of the reduction (used for adjusting the epilog - see below).
4461 Return a vector variable, initialized according to the operation that
4462 STMT_VINFO performs. This vector will be used as the initial value
4463 of the vector of partial results.
4465 Option1 (adjust in epilog): Initialize the vector as follows:
4466 add/bit or/xor: [0,0,...,0,0]
4467 mult/bit and: [1,1,...,1,1]
4468 min/max/cond_expr: [init_val,init_val,..,init_val,init_val]
4469 and when necessary (e.g. add/mult case) let the caller know
4470 that it needs to adjust the result by init_val.
4472 Option2: Initialize the vector as follows:
4473 add/bit or/xor: [init_val,0,0,...,0]
4474 mult/bit and: [init_val,1,1,...,1]
4475 min/max/cond_expr: [init_val,init_val,...,init_val]
4476 and no adjustments are needed.
4478 For example, for the following code:
4480 s = init_val;
4481 for (i=0;i<n;i++)
4482 s = s + a[i];
4484 STMT_VINFO is 's = s + a[i]', and the reduction variable is 's'.
4485 For a vector of 4 units, we want to return either [0,0,0,init_val],
4486 or [0,0,0,0] and let the caller know that it needs to adjust
4487 the result at the end by 'init_val'.
 4489    FORNOW, we use the 'adjust in epilog' scheme (Option1) when ADJUSTMENT_DEF
 4490    is not NULL, because then the initialization vector is simpler (same
 4491    element in all entries); otherwise we use Option2.
4493 A cost model should help decide between these two schemes. */
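/* A concrete instance of the two options (hypothetical values): for the
   summation example above with init_val = 10, Option1 initializes the
   vector of partial results with zeros and asks the caller to add 10 to
   the final reduced value, while Option2 folds 10 into one lane of the
   initial vector and needs no adjustment; both yield init_val plus the
   sum of the array elements.  */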
4495 static tree
4496 get_initial_def_for_reduction (loop_vec_info loop_vinfo,
4497 stmt_vec_info stmt_vinfo,
4498 enum tree_code code, tree init_val,
4499 tree *adjustment_def)
4501 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
4502 tree scalar_type = TREE_TYPE (init_val);
4503 tree vectype = get_vectype_for_scalar_type (loop_vinfo, scalar_type);
4504 tree def_for_init;
4505 tree init_def;
4506 REAL_VALUE_TYPE real_init_val = dconst0;
4507 int int_init_val = 0;
4508 gimple_seq stmts = NULL;
4510 gcc_assert (vectype);
4512 gcc_assert (POINTER_TYPE_P (scalar_type) || INTEGRAL_TYPE_P (scalar_type)
4513 || SCALAR_FLOAT_TYPE_P (scalar_type));
4515 gcc_assert (nested_in_vect_loop_p (loop, stmt_vinfo)
4516 || loop == (gimple_bb (stmt_vinfo->stmt))->loop_father);
4518 /* ADJUSTMENT_DEF is NULL when called from
4519 vect_create_epilog_for_reduction to vectorize double reduction. */
4520 if (adjustment_def)
4521 *adjustment_def = NULL;
4523 switch (code)
4525 case WIDEN_SUM_EXPR:
4526 case DOT_PROD_EXPR:
4527 case SAD_EXPR:
4528 case PLUS_EXPR:
4529 case MINUS_EXPR:
4530 case BIT_IOR_EXPR:
4531 case BIT_XOR_EXPR:
4532 case MULT_EXPR:
4533 case BIT_AND_EXPR:
4535 if (code == MULT_EXPR)
4537 real_init_val = dconst1;
4538 int_init_val = 1;
4541 if (code == BIT_AND_EXPR)
4542 int_init_val = -1;
4544 if (SCALAR_FLOAT_TYPE_P (scalar_type))
4545 def_for_init = build_real (scalar_type, real_init_val);
4546 else
4547 def_for_init = build_int_cst (scalar_type, int_init_val);
4549 if (adjustment_def || operand_equal_p (def_for_init, init_val, 0))
4551 /* Option1: the first element is '0' or '1' as well. */
4552 if (!operand_equal_p (def_for_init, init_val, 0))
4553 *adjustment_def = init_val;
4554 init_def = gimple_build_vector_from_val (&stmts, vectype,
4555 def_for_init);
4557 else if (!TYPE_VECTOR_SUBPARTS (vectype).is_constant ())
4559 /* Option2 (variable length): the first element is INIT_VAL. */
4560 init_def = gimple_build_vector_from_val (&stmts, vectype,
4561 def_for_init);
4562 init_def = gimple_build (&stmts, CFN_VEC_SHL_INSERT,
4563 vectype, init_def, init_val);
4565 else
4567 /* Option2: the first element is INIT_VAL. */
4568 tree_vector_builder elts (vectype, 1, 2);
4569 elts.quick_push (init_val);
4570 elts.quick_push (def_for_init);
4571 init_def = gimple_build_vector (&stmts, &elts);
4574 break;
4576 case MIN_EXPR:
4577 case MAX_EXPR:
4578 case COND_EXPR:
4580 init_val = gimple_convert (&stmts, TREE_TYPE (vectype), init_val);
4581 init_def = gimple_build_vector_from_val (&stmts, vectype, init_val);
4583 break;
4585 default:
4586 gcc_unreachable ();
4589 if (stmts)
4590 gsi_insert_seq_on_edge_immediate (loop_preheader_edge (loop), stmts);
4591 return init_def;
4594 /* Get at the initial defs for the reduction PHIs in SLP_NODE.
4595 NUMBER_OF_VECTORS is the number of vector defs to create.
4596 If NEUTRAL_OP is nonnull, introducing extra elements of that
4597 value will not change the result. */
4599 static void
4600 get_initial_defs_for_reduction (vec_info *vinfo,
4601 slp_tree slp_node,
4602 vec<tree> *vec_oprnds,
4603 unsigned int number_of_vectors,
4604 bool reduc_chain, tree neutral_op)
4606 vec<stmt_vec_info> stmts = SLP_TREE_SCALAR_STMTS (slp_node);
4607 stmt_vec_info stmt_vinfo = stmts[0];
4608 unsigned HOST_WIDE_INT nunits;
4609 unsigned j, number_of_places_left_in_vector;
4610 tree vector_type;
4611 unsigned int group_size = stmts.length ();
4612 unsigned int i;
4613 class loop *loop;
4615 vector_type = STMT_VINFO_VECTYPE (stmt_vinfo);
4617 gcc_assert (STMT_VINFO_DEF_TYPE (stmt_vinfo) == vect_reduction_def);
4619 loop = (gimple_bb (stmt_vinfo->stmt))->loop_father;
4620 gcc_assert (loop);
4621 edge pe = loop_preheader_edge (loop);
4623 gcc_assert (!reduc_chain || neutral_op);
4625 /* NUMBER_OF_COPIES is the number of times we need to use the same values in
4626 created vectors. It is greater than 1 if unrolling is performed.
4628 For example, we have two scalar operands, s1 and s2 (e.g., group of
4629 strided accesses of size two), while NUNITS is four (i.e., four scalars
4630 of this type can be packed in a vector). The output vector will contain
4631 two copies of each scalar operand: {s1, s2, s1, s2}. (NUMBER_OF_COPIES
4632 will be 2).
4634 If REDUC_GROUP_SIZE > NUNITS, the scalars will be split into several
4635 vectors containing the operands.
4637 For example, NUNITS is four as before, and the group size is 8
4638 (s1, s2, ..., s8). We will create two vectors {s1, s2, s3, s4} and
4639 {s5, s6, s7, s8}. */
4641 if (!TYPE_VECTOR_SUBPARTS (vector_type).is_constant (&nunits))
4642 nunits = group_size;
4644 number_of_places_left_in_vector = nunits;
4645 bool constant_p = true;
4646 tree_vector_builder elts (vector_type, nunits, 1);
4647 elts.quick_grow (nunits);
4648 gimple_seq ctor_seq = NULL;
4649 for (j = 0; j < nunits * number_of_vectors; ++j)
4651 tree op;
4652 i = j % group_size;
4653 stmt_vinfo = stmts[i];
 4655       /* Get the def before the loop.  In a reduction chain we have only
 4656 	 one initial value.  Otherwise we have as many as there are PHIs in the group.  */
4657 if (reduc_chain)
4658 op = j != 0 ? neutral_op : PHI_ARG_DEF_FROM_EDGE (stmt_vinfo->stmt, pe);
4659 else if (((vec_oprnds->length () + 1) * nunits
4660 - number_of_places_left_in_vector >= group_size)
4661 && neutral_op)
4662 op = neutral_op;
4663 else
4664 op = PHI_ARG_DEF_FROM_EDGE (stmt_vinfo->stmt, pe);
4666 /* Create 'vect_ = {op0,op1,...,opn}'. */
4667 number_of_places_left_in_vector--;
4668 elts[nunits - number_of_places_left_in_vector - 1] = op;
4669 if (!CONSTANT_CLASS_P (op))
4670 constant_p = false;
4672 if (number_of_places_left_in_vector == 0)
4674 tree init;
4675 if (constant_p && !neutral_op
4676 ? multiple_p (TYPE_VECTOR_SUBPARTS (vector_type), nunits)
4677 : known_eq (TYPE_VECTOR_SUBPARTS (vector_type), nunits))
4678 /* Build the vector directly from ELTS. */
4679 init = gimple_build_vector (&ctor_seq, &elts);
4680 else if (neutral_op)
4682 /* Build a vector of the neutral value and shift the
4683 other elements into place. */
4684 init = gimple_build_vector_from_val (&ctor_seq, vector_type,
4685 neutral_op);
4686 int k = nunits;
4687 while (k > 0 && elts[k - 1] == neutral_op)
4688 k -= 1;
4689 while (k > 0)
4691 k -= 1;
4692 init = gimple_build (&ctor_seq, CFN_VEC_SHL_INSERT,
4693 vector_type, init, elts[k]);
4696 else
4698 /* First time round, duplicate ELTS to fill the
4699 required number of vectors. */
4700 duplicate_and_interleave (vinfo, &ctor_seq, vector_type, elts,
4701 number_of_vectors, *vec_oprnds);
4702 break;
4704 vec_oprnds->quick_push (init);
4706 number_of_places_left_in_vector = nunits;
4707 elts.new_vector (vector_type, nunits, 1);
4708 elts.quick_grow (nunits);
4709 constant_p = true;
4712 if (ctor_seq != NULL)
4713 gsi_insert_seq_on_edge_immediate (pe, ctor_seq);
 4716 /* For a statement STMT_INFO taking part in a reduction operation return
 4717    the stmt_vec_info that the meta information is stored on.  */
4719 stmt_vec_info
4720 info_for_reduction (vec_info *vinfo, stmt_vec_info stmt_info)
4722 stmt_info = vect_orig_stmt (stmt_info);
4723 gcc_assert (STMT_VINFO_REDUC_DEF (stmt_info));
4724 if (!is_a <gphi *> (stmt_info->stmt)
4725 || !VECTORIZABLE_CYCLE_DEF (STMT_VINFO_DEF_TYPE (stmt_info)))
4726 stmt_info = STMT_VINFO_REDUC_DEF (stmt_info);
4727 gphi *phi = as_a <gphi *> (stmt_info->stmt);
4728 if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_double_reduction_def)
4730 if (gimple_phi_num_args (phi) == 1)
4731 stmt_info = STMT_VINFO_REDUC_DEF (stmt_info);
4733 else if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_nested_cycle)
4735 edge pe = loop_preheader_edge (gimple_bb (phi)->loop_father);
4736 stmt_vec_info info
4737 = vinfo->lookup_def (PHI_ARG_DEF_FROM_EDGE (phi, pe));
4738 if (info && STMT_VINFO_DEF_TYPE (info) == vect_double_reduction_def)
4739 stmt_info = info;
4741 return stmt_info;
4744 /* Function vect_create_epilog_for_reduction
4746 Create code at the loop-epilog to finalize the result of a reduction
4747 computation.
4749 STMT_INFO is the scalar reduction stmt that is being vectorized.
4750 SLP_NODE is an SLP node containing a group of reduction statements. The
4751 first one in this group is STMT_INFO.
4752 SLP_NODE_INSTANCE is the SLP node instance containing SLP_NODE
4753 REDUC_INDEX says which rhs operand of the STMT_INFO is the reduction phi
4754 (counting from 0)
4756 This function:
4757 1. Completes the reduction def-use cycles.
4758 2. "Reduces" each vector of partial results VECT_DEFS into a single result,
4759 by calling the function specified by REDUC_FN if available, or by
4760 other means (whole-vector shifts or a scalar loop).
4761 The function also creates a new phi node at the loop exit to preserve
4762 loop-closed form, as illustrated below.
4764 The flow at the entry to this function:
4766 loop:
4767 vec_def = phi <vec_init, null> # REDUCTION_PHI
4768 VECT_DEF = vector_stmt # vectorized form of STMT_INFO
4769 s_loop = scalar_stmt # (scalar) STMT_INFO
4770 loop_exit:
4771 s_out0 = phi <s_loop> # (scalar) EXIT_PHI
4772 use <s_out0>
4773 use <s_out0>
4775 The above is transformed by this function into:
4777 loop:
4778 vec_def = phi <vec_init, VECT_DEF> # REDUCTION_PHI
4779 VECT_DEF = vector_stmt # vectorized form of STMT_INFO
4780 s_loop = scalar_stmt # (scalar) STMT_INFO
4781 loop_exit:
4782 s_out0 = phi <s_loop> # (scalar) EXIT_PHI
4783 v_out1 = phi <VECT_DEF> # NEW_EXIT_PHI
4784 v_out2 = reduce <v_out1>
4785 s_out3 = extract_field <v_out2, 0>
4786 s_out4 = adjust_result <s_out3>
4787 use <s_out4>
4788 use <s_out4>
4791 static void
4792 vect_create_epilog_for_reduction (loop_vec_info loop_vinfo,
4793 stmt_vec_info stmt_info,
4794 slp_tree slp_node,
4795 slp_instance slp_node_instance)
4797 stmt_vec_info reduc_info = info_for_reduction (loop_vinfo, stmt_info);
4798 gcc_assert (reduc_info->is_reduc_info);
4799 /* For double reductions we need to get at the inner loop reduction
4800 stmt which has the meta info attached. Our stmt_info is that of the
4801 loop-closed PHI of the inner loop which we remember as
4802 def for the reduction PHI generation. */
4803 bool double_reduc = false;
4804 stmt_vec_info rdef_info = stmt_info;
4805 if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_double_reduction_def)
4807 gcc_assert (!slp_node);
4808 double_reduc = true;
4809 stmt_info = loop_vinfo->lookup_def (gimple_phi_arg_def
4810 (stmt_info->stmt, 0));
4811 stmt_info = vect_stmt_to_vectorize (stmt_info);
4813 gphi *reduc_def_stmt
4814 = as_a <gphi *> (STMT_VINFO_REDUC_DEF (vect_orig_stmt (stmt_info))->stmt);
4815 enum tree_code code = STMT_VINFO_REDUC_CODE (reduc_info);
4816 internal_fn reduc_fn = STMT_VINFO_REDUC_FN (reduc_info);
4817 tree vectype;
4818 machine_mode mode;
4819 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo), *outer_loop = NULL;
4820 basic_block exit_bb;
4821 tree scalar_dest;
4822 tree scalar_type;
4823 gimple *new_phi = NULL, *phi;
4824 gimple_stmt_iterator exit_gsi;
4825 tree new_temp = NULL_TREE, new_name, new_scalar_dest;
4826 gimple *epilog_stmt = NULL;
4827 gimple *exit_phi;
4828 tree bitsize;
4829 tree def;
4830 tree orig_name, scalar_result;
4831 imm_use_iterator imm_iter, phi_imm_iter;
4832 use_operand_p use_p, phi_use_p;
4833 gimple *use_stmt;
4834 bool nested_in_vect_loop = false;
4835 auto_vec<gimple *> new_phis;
4836 int j, i;
4837 auto_vec<tree> scalar_results;
4838 unsigned int group_size = 1, k;
4839 auto_vec<gimple *> phis;
4840 bool slp_reduc = false;
4841 bool direct_slp_reduc;
4842 tree new_phi_result;
4843 tree induction_index = NULL_TREE;
4845 if (slp_node)
4846 group_size = SLP_TREE_LANES (slp_node);
4848 if (nested_in_vect_loop_p (loop, stmt_info))
4850 outer_loop = loop;
4851 loop = loop->inner;
4852 nested_in_vect_loop = true;
4853 gcc_assert (!slp_node);
4855 gcc_assert (!nested_in_vect_loop || double_reduc);
4857 vectype = STMT_VINFO_REDUC_VECTYPE (reduc_info);
4858 gcc_assert (vectype);
4859 mode = TYPE_MODE (vectype);
4861 tree initial_def = NULL;
4862 tree induc_val = NULL_TREE;
4863 tree adjustment_def = NULL;
4864 if (slp_node)
4866 else
4868 /* Get at the scalar def before the loop, that defines the initial value
4869 of the reduction variable. */
4870 initial_def = PHI_ARG_DEF_FROM_EDGE (reduc_def_stmt,
4871 loop_preheader_edge (loop));
4872 /* Optimize: for induction condition reduction, if we can't use zero
4873 for induc_val, use initial_def. */
4874 if (STMT_VINFO_REDUC_TYPE (reduc_info) == INTEGER_INDUC_COND_REDUCTION)
4875 induc_val = STMT_VINFO_VEC_INDUC_COND_INITIAL_VAL (reduc_info);
4876 else if (double_reduc)
4878 else if (nested_in_vect_loop)
4880 else
4881 adjustment_def = STMT_VINFO_REDUC_EPILOGUE_ADJUSTMENT (reduc_info);
4884 unsigned vec_num;
4885 int ncopies;
4886 if (slp_node)
4888 vec_num = SLP_TREE_VEC_STMTS (slp_node_instance->reduc_phis).length ();
4889 ncopies = 1;
4891 else
4893 stmt_vec_info reduc_info = loop_vinfo->lookup_stmt (reduc_def_stmt);
4894 vec_num = 1;
4895 ncopies = STMT_VINFO_VEC_STMTS (reduc_info).length ();
4898 /* For cond reductions we want to create a new vector (INDEX_COND_EXPR)
4899 which is updated with the current index of the loop for every match of
4900 the original loop's cond_expr (VEC_STMT). This results in a vector
4901 containing the last time the condition passed for that vector lane.
4902 The first match will be a 1 to allow 0 to be used for non-matching
4903 indexes. If there are no matches at all then the vector will be all
4904 zeroes.
4906 PR92772: This algorithm is broken for architectures that support
4907 masked vectors, but do not provide fold_extract_last. */
4908 if (STMT_VINFO_REDUC_TYPE (reduc_info) == COND_REDUCTION)
4910 auto_vec<std::pair<tree, bool>, 2> ccompares;
4911 stmt_vec_info cond_info = STMT_VINFO_REDUC_DEF (reduc_info);
4912 cond_info = vect_stmt_to_vectorize (cond_info);
4913 while (cond_info != reduc_info)
4915 if (gimple_assign_rhs_code (cond_info->stmt) == COND_EXPR)
4917 gimple *vec_stmt = STMT_VINFO_VEC_STMTS (cond_info)[0];
4918 gcc_assert (gimple_assign_rhs_code (vec_stmt) == VEC_COND_EXPR);
4919 ccompares.safe_push
4920 (std::make_pair (unshare_expr (gimple_assign_rhs1 (vec_stmt)),
4921 STMT_VINFO_REDUC_IDX (cond_info) == 2));
4923 cond_info
4924 = loop_vinfo->lookup_def (gimple_op (cond_info->stmt,
4925 1 + STMT_VINFO_REDUC_IDX
4926 (cond_info)));
4927 cond_info = vect_stmt_to_vectorize (cond_info);
4929 gcc_assert (ccompares.length () != 0);
4931 tree indx_before_incr, indx_after_incr;
4932 poly_uint64 nunits_out = TYPE_VECTOR_SUBPARTS (vectype);
4933 int scalar_precision
4934 = GET_MODE_PRECISION (SCALAR_TYPE_MODE (TREE_TYPE (vectype)));
4935 tree cr_index_scalar_type = make_unsigned_type (scalar_precision);
4936 tree cr_index_vector_type = get_related_vectype_for_scalar_type
4937 (TYPE_MODE (vectype), cr_index_scalar_type,
4938 TYPE_VECTOR_SUBPARTS (vectype));
4940 /* First we create a simple vector induction variable which starts
4941 with the values {1,2,3,...} (SERIES_VECT) and increments by the
4942 vector size (STEP). */
4944 /* Create a {1,2,3,...} vector. */
4945 tree series_vect = build_index_vector (cr_index_vector_type, 1, 1);
4947 /* Create a vector of the step value. */
4948 tree step = build_int_cst (cr_index_scalar_type, nunits_out);
4949 tree vec_step = build_vector_from_val (cr_index_vector_type, step);
4951 /* Create an induction variable. */
4952 gimple_stmt_iterator incr_gsi;
4953 bool insert_after;
4954 standard_iv_increment_position (loop, &incr_gsi, &insert_after);
4955 create_iv (series_vect, vec_step, NULL_TREE, loop, &incr_gsi,
4956 insert_after, &indx_before_incr, &indx_after_incr);
4958 /* Next create a new phi node vector (NEW_PHI_TREE) which starts
4959 filled with zeros (VEC_ZERO). */
4961 /* Create a vector of 0s. */
4962 tree zero = build_zero_cst (cr_index_scalar_type);
4963 tree vec_zero = build_vector_from_val (cr_index_vector_type, zero);
4965 /* Create a vector phi node. */
4966 tree new_phi_tree = make_ssa_name (cr_index_vector_type);
4967 new_phi = create_phi_node (new_phi_tree, loop->header);
4968 add_phi_arg (as_a <gphi *> (new_phi), vec_zero,
4969 loop_preheader_edge (loop), UNKNOWN_LOCATION);
 4971       /* Now take the condition from the loop's original cond_exprs
 4972 	 and produce a new cond_expr (INDEX_COND_EXPR) which for
4973 every match uses values from the induction variable
4974 (INDEX_BEFORE_INCR) otherwise uses values from the phi node
4975 (NEW_PHI_TREE).
4976 Finally, we update the phi (NEW_PHI_TREE) to take the value of
4977 the new cond_expr (INDEX_COND_EXPR). */
4978 gimple_seq stmts = NULL;
4979 for (int i = ccompares.length () - 1; i != -1; --i)
4981 tree ccompare = ccompares[i].first;
4982 if (ccompares[i].second)
4983 new_phi_tree = gimple_build (&stmts, VEC_COND_EXPR,
4984 cr_index_vector_type,
4985 ccompare,
4986 indx_before_incr, new_phi_tree);
4987 else
4988 new_phi_tree = gimple_build (&stmts, VEC_COND_EXPR,
4989 cr_index_vector_type,
4990 ccompare,
4991 new_phi_tree, indx_before_incr);
4993 gsi_insert_seq_before (&incr_gsi, stmts, GSI_SAME_STMT);
4995 /* Update the phi with the vec cond. */
4996 induction_index = new_phi_tree;
4997 add_phi_arg (as_a <gphi *> (new_phi), induction_index,
4998 loop_latch_edge (loop), UNKNOWN_LOCATION);
5001 /* 2. Create epilog code.
5002 The reduction epilog code operates across the elements of the vector
5003 of partial results computed by the vectorized loop.
5004 The reduction epilog code consists of:
5006 step 1: compute the scalar result in a vector (v_out2)
5007 step 2: extract the scalar result (s_out3) from the vector (v_out2)
5008 step 3: adjust the scalar result (s_out3) if needed.
 5010      Step 1 can be accomplished using one of the following three schemes:
5011 (scheme 1) using reduc_fn, if available.
5012 (scheme 2) using whole-vector shifts, if available.
5013 (scheme 3) using a scalar loop. In this case steps 1+2 above are
5014 combined.
5016 The overall epilog code looks like this:
5018 s_out0 = phi <s_loop> # original EXIT_PHI
5019 v_out1 = phi <VECT_DEF> # NEW_EXIT_PHI
5020 v_out2 = reduce <v_out1> # step 1
5021 s_out3 = extract_field <v_out2, 0> # step 2
5022 s_out4 = adjust_result <s_out3> # step 3
5024 (step 3 is optional, and steps 1 and 2 may be combined).
5025 Lastly, the uses of s_out0 are replaced by s_out4. */
5028 /* 2.1 Create new loop-exit-phis to preserve loop-closed form:
5029 v_out1 = phi <VECT_DEF>
5030 Store them in NEW_PHIS. */
5031 if (double_reduc)
5032 loop = outer_loop;
5033 exit_bb = single_exit (loop)->dest;
5034 new_phis.create (slp_node ? vec_num : ncopies);
5035 for (unsigned i = 0; i < vec_num; i++)
5037 if (slp_node)
5038 def = vect_get_slp_vect_def (slp_node, i);
5039 else
5040 def = gimple_get_lhs (STMT_VINFO_VEC_STMTS (rdef_info)[0]);
5041 for (j = 0; j < ncopies; j++)
5043 tree new_def = copy_ssa_name (def);
5044 phi = create_phi_node (new_def, exit_bb);
5045 if (j == 0)
5046 new_phis.quick_push (phi);
5047 else
5049 def = gimple_get_lhs (STMT_VINFO_VEC_STMTS (rdef_info)[j]);
5050 new_phis.quick_push (phi);
5053 SET_PHI_ARG_DEF (phi, single_exit (loop)->dest_idx, def);
5057 exit_gsi = gsi_after_labels (exit_bb);
5059 /* 2.2 Get the relevant tree-code to use in the epilog for schemes 2,3
5060 (i.e. when reduc_fn is not available) and in the final adjustment
5061 code (if needed). Also get the original scalar reduction variable as
5062 defined in the loop. In case STMT is a "pattern-stmt" (i.e. - it
5063 represents a reduction pattern), the tree-code and scalar-def are
5064 taken from the original stmt that the pattern-stmt (STMT) replaces.
5065 Otherwise (it is a regular reduction) - the tree-code and scalar-def
5066 are taken from STMT. */
5068 stmt_vec_info orig_stmt_info = vect_orig_stmt (stmt_info);
5069 if (orig_stmt_info != stmt_info)
5071 /* Reduction pattern */
5072 gcc_assert (STMT_VINFO_IN_PATTERN_P (orig_stmt_info));
5073 gcc_assert (STMT_VINFO_RELATED_STMT (orig_stmt_info) == stmt_info);
5076 scalar_dest = gimple_assign_lhs (orig_stmt_info->stmt);
5077 scalar_type = TREE_TYPE (scalar_dest);
5078 scalar_results.create (group_size);
5079 new_scalar_dest = vect_create_destination_var (scalar_dest, NULL);
5080 bitsize = TYPE_SIZE (scalar_type);
5082 /* SLP reduction without reduction chain, e.g.,
5083 # a1 = phi <a2, a0>
5084 # b1 = phi <b2, b0>
5085 a2 = operation (a1)
5086 b2 = operation (b1) */
5087 slp_reduc = (slp_node && !REDUC_GROUP_FIRST_ELEMENT (stmt_info));
5089 /* True if we should implement SLP_REDUC using native reduction operations
5090 instead of scalar operations. */
5091 direct_slp_reduc = (reduc_fn != IFN_LAST
5092 && slp_reduc
5093 && !TYPE_VECTOR_SUBPARTS (vectype).is_constant ());
5095 /* In case of reduction chain, e.g.,
5096 # a1 = phi <a3, a0>
5097 a2 = operation (a1)
5098 a3 = operation (a2),
5100 we may end up with more than one vector result. Here we reduce them to
5101 one vector. */
5102 if (REDUC_GROUP_FIRST_ELEMENT (stmt_info) || direct_slp_reduc)
5104 gimple_seq stmts = NULL;
5105 tree first_vect = PHI_RESULT (new_phis[0]);
5106 first_vect = gimple_convert (&stmts, vectype, first_vect);
5107 for (k = 1; k < new_phis.length (); k++)
5109 gimple *next_phi = new_phis[k];
5110 tree second_vect = PHI_RESULT (next_phi);
5111 second_vect = gimple_convert (&stmts, vectype, second_vect);
5112 first_vect = gimple_build (&stmts, code, vectype,
5113 first_vect, second_vect);
5115 gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
5117 new_phi_result = first_vect;
5118 new_phis.truncate (0);
5119 new_phis.safe_push (SSA_NAME_DEF_STMT (first_vect));
5121 /* Likewise if we couldn't use a single defuse cycle. */
5122 else if (ncopies > 1)
5124 gimple_seq stmts = NULL;
5125 tree first_vect = PHI_RESULT (new_phis[0]);
5126 first_vect = gimple_convert (&stmts, vectype, first_vect);
5127 for (int k = 1; k < ncopies; ++k)
5129 tree second_vect = PHI_RESULT (new_phis[k]);
5130 second_vect = gimple_convert (&stmts, vectype, second_vect);
5131 first_vect = gimple_build (&stmts, code, vectype,
5132 first_vect, second_vect);
5134 gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
5135 new_phi_result = first_vect;
5136 new_phis.truncate (0);
5137 new_phis.safe_push (SSA_NAME_DEF_STMT (first_vect));
5139 else
5140 new_phi_result = PHI_RESULT (new_phis[0]);
5142 if (STMT_VINFO_REDUC_TYPE (reduc_info) == COND_REDUCTION
5143 && reduc_fn != IFN_LAST)
5145 /* For condition reductions, we have a vector (NEW_PHI_RESULT) containing
5146 various data values where the condition matched and another vector
5147 (INDUCTION_INDEX) containing all the indexes of those matches. We
5148 need to extract the last matching index (which will be the index with
5149 highest value) and use this to index into the data vector.
5150 For the case where there were no matches, the data vector will contain
5151 all default values and the index vector will be all zeros. */
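/* A hypothetical illustration: with NEW_PHI_RESULT = {d, a, d, b} and
   INDUCTION_INDEX = {0, 2, 0, 4}, lanes 1 and 3 last matched on
   iterations 2 and 4 respectively, so the highest index is 4 and the
   extracted result is b.  Had nothing matched, INDUCTION_INDEX would be
   all zeros and every lane of NEW_PHI_RESULT would hold the default
   value.  */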
5153 /* Get various versions of the type of the vector of indexes. */
5154 tree index_vec_type = TREE_TYPE (induction_index);
5155 gcc_checking_assert (TYPE_UNSIGNED (index_vec_type));
5156 tree index_scalar_type = TREE_TYPE (index_vec_type);
5157 tree index_vec_cmp_type = truth_type_for (index_vec_type);
5159 /* Get an unsigned integer version of the type of the data vector. */
5160 int scalar_precision
5161 = GET_MODE_PRECISION (SCALAR_TYPE_MODE (scalar_type));
5162 tree scalar_type_unsigned = make_unsigned_type (scalar_precision);
5163 tree vectype_unsigned = build_vector_type
5164 (scalar_type_unsigned, TYPE_VECTOR_SUBPARTS (vectype));
5166 /* First we need to create a vector (ZERO_VEC) of zeros and another
5167 vector (MAX_INDEX_VEC) filled with the last matching index, which we
5168 can create using a MAX reduction and then expanding.
5169 In the case where the loop never made any matches, the max index will
5170 be zero. */
5172 /* Vector of {0, 0, 0,...}. */
5173 tree zero_vec = build_zero_cst (vectype);
5175 gimple_seq stmts = NULL;
5176 new_phi_result = gimple_convert (&stmts, vectype, new_phi_result);
5177 gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
5179 /* Find maximum value from the vector of found indexes. */
5180 tree max_index = make_ssa_name (index_scalar_type);
5181 gcall *max_index_stmt = gimple_build_call_internal (IFN_REDUC_MAX,
5182 1, induction_index);
5183 gimple_call_set_lhs (max_index_stmt, max_index);
5184 gsi_insert_before (&exit_gsi, max_index_stmt, GSI_SAME_STMT);
5186 /* Vector of {max_index, max_index, max_index,...}. */
5187 tree max_index_vec = make_ssa_name (index_vec_type);
5188 tree max_index_vec_rhs = build_vector_from_val (index_vec_type,
5189 max_index);
5190 gimple *max_index_vec_stmt = gimple_build_assign (max_index_vec,
5191 max_index_vec_rhs);
5192 gsi_insert_before (&exit_gsi, max_index_vec_stmt, GSI_SAME_STMT);
5194 /* Next we compare the new vector (MAX_INDEX_VEC) full of max indexes
5195 with the vector (INDUCTION_INDEX) of found indexes, choosing values
5196 from the data vector (NEW_PHI_RESULT) for matches, 0 (ZERO_VEC)
5197 otherwise. Only one value should match, resulting in a vector
5198 (VEC_COND) with one data value and the rest zeros.
5199 In the case where the loop never made any matches, every index will
5200 match, resulting in a vector with all data values (which will all be
5201 the default value). */
5203 /* Compare the max index vector to the vector of found indexes to find
5204 the position of the max value. */
5205 tree vec_compare = make_ssa_name (index_vec_cmp_type);
5206 gimple *vec_compare_stmt = gimple_build_assign (vec_compare, EQ_EXPR,
5207 induction_index,
5208 max_index_vec);
5209 gsi_insert_before (&exit_gsi, vec_compare_stmt, GSI_SAME_STMT);
5211 /* Use the compare to choose either values from the data vector or
5212 zero. */
5213 tree vec_cond = make_ssa_name (vectype);
5214 gimple *vec_cond_stmt = gimple_build_assign (vec_cond, VEC_COND_EXPR,
5215 vec_compare, new_phi_result,
5216 zero_vec);
5217 gsi_insert_before (&exit_gsi, vec_cond_stmt, GSI_SAME_STMT);
5219 /* Finally we need to extract the data value from the vector (VEC_COND)
5220 into a scalar (MATCHED_DATA_REDUC). Logically we want to do an OR
5221 reduction, but because this doesn't exist, we can use a MAX reduction
5222 instead. The data value might be signed or a float, so we need to cast
5223 it first.
5224 In the case where the loop never made any matches, the data values are
5225 all identical, and so will reduce down correctly. */
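/* Purely as an illustration: if VEC_COND ends up as {0, 0, 42, 0},
   viewing it as unsigned and taking IFN_REDUC_MAX yields 42, which is
   exactly what the nonexistent "OR" reduction would have produced.  The
   unsigned view makes the zero lanes the smallest possible values, so
   MAX returns the bit pattern of the single selected lane (or the common
   default value when nothing matched).  */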
5227 /* Make the matched data values unsigned. */
5228 tree vec_cond_cast = make_ssa_name (vectype_unsigned);
5229 tree vec_cond_cast_rhs = build1 (VIEW_CONVERT_EXPR, vectype_unsigned,
5230 vec_cond);
5231 gimple *vec_cond_cast_stmt = gimple_build_assign (vec_cond_cast,
5232 VIEW_CONVERT_EXPR,
5233 vec_cond_cast_rhs);
5234 gsi_insert_before (&exit_gsi, vec_cond_cast_stmt, GSI_SAME_STMT);
5236 /* Reduce down to a scalar value. */
5237 tree data_reduc = make_ssa_name (scalar_type_unsigned);
5238 gcall *data_reduc_stmt = gimple_build_call_internal (IFN_REDUC_MAX,
5239 1, vec_cond_cast);
5240 gimple_call_set_lhs (data_reduc_stmt, data_reduc);
5241 gsi_insert_before (&exit_gsi, data_reduc_stmt, GSI_SAME_STMT);
5243 /* Convert the reduced value back to the result type and set as the
5244 result. */
5245 stmts = NULL;
5246 new_temp = gimple_build (&stmts, VIEW_CONVERT_EXPR, scalar_type,
5247 data_reduc);
5248 gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
5249 scalar_results.safe_push (new_temp);
5251 else if (STMT_VINFO_REDUC_TYPE (reduc_info) == COND_REDUCTION
5252 && reduc_fn == IFN_LAST)
5254 /* Condition reduction without supported IFN_REDUC_MAX. Generate
5255 idx_val = induction_index[0];
5256 val = data_reduc[0];
5257 for (i = 1; i < nelts; ++i)
5258 if (induction_index[i] > idx_val)
5259 val = data_reduc[i],
5260 idx_val = induction_index[i];
5261 return val; */
5263 tree data_eltype = TREE_TYPE (TREE_TYPE (new_phi_result));
5264 tree idx_eltype = TREE_TYPE (TREE_TYPE (induction_index));
5265 unsigned HOST_WIDE_INT el_size = tree_to_uhwi (TYPE_SIZE (idx_eltype));
5266 poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (TREE_TYPE (induction_index));
5267 /* Enforced by vectorizable_reduction, which ensures we have target
5268 support before allowing a conditional reduction on variable-length
5269 vectors. */
5270 unsigned HOST_WIDE_INT v_size = el_size * nunits.to_constant ();
5271 tree idx_val = NULL_TREE, val = NULL_TREE;
5272 for (unsigned HOST_WIDE_INT off = 0; off < v_size; off += el_size)
5274 tree old_idx_val = idx_val;
5275 tree old_val = val;
5276 idx_val = make_ssa_name (idx_eltype);
5277 epilog_stmt = gimple_build_assign (idx_val, BIT_FIELD_REF,
5278 build3 (BIT_FIELD_REF, idx_eltype,
5279 induction_index,
5280 bitsize_int (el_size),
5281 bitsize_int (off)));
5282 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5283 val = make_ssa_name (data_eltype);
5284 epilog_stmt = gimple_build_assign (val, BIT_FIELD_REF,
5285 build3 (BIT_FIELD_REF,
5286 data_eltype,
5287 new_phi_result,
5288 bitsize_int (el_size),
5289 bitsize_int (off)));
5290 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5291 if (off != 0)
5293 tree new_idx_val = idx_val;
5294 if (off != v_size - el_size)
5296 new_idx_val = make_ssa_name (idx_eltype);
5297 epilog_stmt = gimple_build_assign (new_idx_val,
5298 MAX_EXPR, idx_val,
5299 old_idx_val);
5300 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5302 tree new_val = make_ssa_name (data_eltype);
5303 epilog_stmt = gimple_build_assign (new_val,
5304 COND_EXPR,
5305 build2 (GT_EXPR,
5306 boolean_type_node,
5307 idx_val,
5308 old_idx_val),
5309 val, old_val);
5310 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5311 idx_val = new_idx_val;
5312 val = new_val;
5315 /* Convert the reduced value back to the result type and set as the
5316 result. */
5317 gimple_seq stmts = NULL;
5318 val = gimple_convert (&stmts, scalar_type, val);
5319 gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
5320 scalar_results.safe_push (val);
5323 /* 2.3 Create the reduction code, using one of the three schemes described
5324 above. In SLP we simply need to extract all the elements from the
5325 vector (without reducing them), so we use scalar shifts. */
5326 else if (reduc_fn != IFN_LAST && !slp_reduc)
5328 tree tmp;
5329 tree vec_elem_type;
5331 /* Case 1: Create:
5332 v_out2 = reduc_expr <v_out1> */
5334 if (dump_enabled_p ())
5335 dump_printf_loc (MSG_NOTE, vect_location,
5336 "Reduce using direct vector reduction.\n");
5338 gimple_seq stmts = NULL;
5339 new_phi_result = gimple_convert (&stmts, vectype, new_phi_result);
5340 vec_elem_type = TREE_TYPE (TREE_TYPE (new_phi_result));
5341 new_temp = gimple_build (&stmts, as_combined_fn (reduc_fn),
5342 vec_elem_type, new_phi_result);
5343 new_temp = gimple_convert (&stmts, scalar_type, new_temp);
5344 gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
5346 if ((STMT_VINFO_REDUC_TYPE (reduc_info) == INTEGER_INDUC_COND_REDUCTION)
5347 && induc_val)
5349 /* Earlier we set the initial value to be a vector of induc_val
5350 values. Check the result and if it is induc_val then replace
5351 it with the original initial value, unless induc_val is
5352 the same as initial_def already. */
5353 tree zcompare = build2 (EQ_EXPR, boolean_type_node, new_temp,
5354 induc_val);
5356 tmp = make_ssa_name (new_scalar_dest);
5357 epilog_stmt = gimple_build_assign (tmp, COND_EXPR, zcompare,
5358 initial_def, new_temp);
5359 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5360 new_temp = tmp;
5363 scalar_results.safe_push (new_temp);
5365 else if (direct_slp_reduc)
5367 /* Here we create one vector for each of the REDUC_GROUP_SIZE results,
5368 with the elements for other SLP statements replaced with the
5369 neutral value. We can then do a normal reduction on each vector. */
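/* A fixed-length sketch for intuition only (this path is really used
   for variable-length vectors): with GROUP_SIZE == 2, a PLUS reduction
   and an accumulator vector {a0, b0, a1, b1} whose lanes interleave two
   SLP results, INDEX & 1 is {0, 1, 0, 1}; for i == 0 the select keeps
   {a0, 0, a1, 0} (neutral value elsewhere) and the vector reduction
   gives a0 + a1, while i == 1 gives b0 + b1.  */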
5371 /* Enforced by vectorizable_reduction. */
5372 gcc_assert (new_phis.length () == 1);
5373 gcc_assert (pow2p_hwi (group_size));
5375 slp_tree orig_phis_slp_node = slp_node_instance->reduc_phis;
5376 vec<stmt_vec_info> orig_phis
5377 = SLP_TREE_SCALAR_STMTS (orig_phis_slp_node);
5378 gimple_seq seq = NULL;
5380 /* Build a vector {0, 1, 2, ...}, with the same number of elements
5381 and the same element size as VECTYPE. */
5382 tree index = build_index_vector (vectype, 0, 1);
5383 tree index_type = TREE_TYPE (index);
5384 tree index_elt_type = TREE_TYPE (index_type);
5385 tree mask_type = truth_type_for (index_type);
5387 /* Create a vector that, for each element, identifies which of
5388 the REDUC_GROUP_SIZE results should use it. */
5389 tree index_mask = build_int_cst (index_elt_type, group_size - 1);
5390 index = gimple_build (&seq, BIT_AND_EXPR, index_type, index,
5391 build_vector_from_val (index_type, index_mask));
5393 /* Get a neutral vector value. This is simply a splat of the neutral
5394 scalar value if we have one, otherwise the initial scalar value
5395 is itself a neutral value. */
5396 tree vector_identity = NULL_TREE;
5397 tree neutral_op = NULL_TREE;
5398 if (slp_node)
5400 stmt_vec_info first = REDUC_GROUP_FIRST_ELEMENT (stmt_info);
5401 neutral_op
5402 = neutral_op_for_slp_reduction (slp_node_instance->reduc_phis,
5403 vectype, code, first != NULL);
5405 if (neutral_op)
5406 vector_identity = gimple_build_vector_from_val (&seq, vectype,
5407 neutral_op);
5408 for (unsigned int i = 0; i < group_size; ++i)
5410 /* If there's no universal neutral value, we can use the
5411 initial scalar value from the original PHI. This is used
5412 for MIN and MAX reduction, for example. */
5413 if (!neutral_op)
5415 tree scalar_value
5416 = PHI_ARG_DEF_FROM_EDGE (orig_phis[i]->stmt,
5417 loop_preheader_edge (loop));
5418 scalar_value = gimple_convert (&seq, TREE_TYPE (vectype),
5419 scalar_value);
5420 vector_identity = gimple_build_vector_from_val (&seq, vectype,
5421 scalar_value);
5424 /* Calculate the equivalent of:
5426 sel[j] = (index[j] == i);
5428 which selects the elements of NEW_PHI_RESULT that should
5429 be included in the result. */
5430 tree compare_val = build_int_cst (index_elt_type, i);
5431 compare_val = build_vector_from_val (index_type, compare_val);
5432 tree sel = gimple_build (&seq, EQ_EXPR, mask_type,
5433 index, compare_val);
5435 /* Calculate the equivalent of:
5437 vec = sel ? new_phi_result : vector_identity;
5439 VEC is now suitable for a full vector reduction. */
5440 tree vec = gimple_build (&seq, VEC_COND_EXPR, vectype,
5441 sel, new_phi_result, vector_identity);
5443 /* Do the reduction and convert it to the appropriate type. */
5444 tree scalar = gimple_build (&seq, as_combined_fn (reduc_fn),
5445 TREE_TYPE (vectype), vec);
5446 scalar = gimple_convert (&seq, scalar_type, scalar);
5447 scalar_results.safe_push (scalar);
5449 gsi_insert_seq_before (&exit_gsi, seq, GSI_SAME_STMT);
5451 else
5453 bool reduce_with_shift;
5454 tree vec_temp;
5456 gcc_assert (slp_reduc || new_phis.length () == 1);
5458 /* See if the target wants to do the final (shift) reduction
5459 in a vector mode of smaller size and first reduce upper/lower
5460 halves against each other. */
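/* For example (hypothetical target): if the partial result is a V8HI
   vector and the split_reduction hook returns the V4HI mode, the loop
   below first combines the low and high V4HI halves with CODE, so the
   shift-based reduction that follows only has to handle four elements
   instead of eight.  */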
5461 enum machine_mode mode1 = mode;
5462 tree stype = TREE_TYPE (vectype);
5463 unsigned nunits = TYPE_VECTOR_SUBPARTS (vectype).to_constant ();
5464 unsigned nunits1 = nunits;
5465 if ((mode1 = targetm.vectorize.split_reduction (mode)) != mode
5466 && new_phis.length () == 1)
5468 nunits1 = GET_MODE_NUNITS (mode1).to_constant ();
5469 /* For SLP reductions we have to make sure lanes match up, but
5470 since we're doing individual element final reduction, reducing
5471 the vector width here is even more important.
5472 ??? We can also separate lanes with permutes; for the common
5473 case of power-of-two group-size, odd/even extracts would work. */
5474 if (slp_reduc && nunits != nunits1)
5476 nunits1 = least_common_multiple (nunits1, group_size);
5477 gcc_assert (exact_log2 (nunits1) != -1 && nunits1 <= nunits);
5480 if (!slp_reduc
5481 && (mode1 = targetm.vectorize.split_reduction (mode)) != mode)
5482 nunits1 = GET_MODE_NUNITS (mode1).to_constant ();
5484 tree vectype1 = get_related_vectype_for_scalar_type (TYPE_MODE (vectype),
5485 stype, nunits1);
5486 reduce_with_shift = have_whole_vector_shift (mode1);
5487 if (!VECTOR_MODE_P (mode1))
5488 reduce_with_shift = false;
5489 else
5491 optab optab = optab_for_tree_code (code, vectype1, optab_default);
5492 if (optab_handler (optab, mode1) == CODE_FOR_nothing)
5493 reduce_with_shift = false;
5496 /* First reduce the vector to the desired vector size on which we
5497 should do the shift reduction, by combining upper and lower halves. */
5498 new_temp = new_phi_result;
5499 while (nunits > nunits1)
5501 nunits /= 2;
5502 vectype1 = get_related_vectype_for_scalar_type (TYPE_MODE (vectype),
5503 stype, nunits);
5504 unsigned int bitsize = tree_to_uhwi (TYPE_SIZE (vectype1));
5506 /* The target has to make sure we support lowpart/highpart
5507 extraction, either via direct vector extract or through
5508 integer mode punning. */
5509 tree dst1, dst2;
5510 if (convert_optab_handler (vec_extract_optab,
5511 TYPE_MODE (TREE_TYPE (new_temp)),
5512 TYPE_MODE (vectype1))
5513 != CODE_FOR_nothing)
5515 /* Extract sub-vectors directly once vec_extract becomes
5516 a conversion optab. */
5517 dst1 = make_ssa_name (vectype1);
5518 epilog_stmt
5519 = gimple_build_assign (dst1, BIT_FIELD_REF,
5520 build3 (BIT_FIELD_REF, vectype1,
5521 new_temp, TYPE_SIZE (vectype1),
5522 bitsize_int (0)));
5523 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5524 dst2 = make_ssa_name (vectype1);
5525 epilog_stmt
5526 = gimple_build_assign (dst2, BIT_FIELD_REF,
5527 build3 (BIT_FIELD_REF, vectype1,
5528 new_temp, TYPE_SIZE (vectype1),
5529 bitsize_int (bitsize)));
5530 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5532 else
5534 /* Extract via punning to appropriately sized integer mode
5535 vector. */
5536 tree eltype = build_nonstandard_integer_type (bitsize, 1);
5537 tree etype = build_vector_type (eltype, 2);
5538 gcc_assert (convert_optab_handler (vec_extract_optab,
5539 TYPE_MODE (etype),
5540 TYPE_MODE (eltype))
5541 != CODE_FOR_nothing);
5542 tree tem = make_ssa_name (etype);
5543 epilog_stmt = gimple_build_assign (tem, VIEW_CONVERT_EXPR,
5544 build1 (VIEW_CONVERT_EXPR,
5545 etype, new_temp));
5546 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5547 new_temp = tem;
5548 tem = make_ssa_name (eltype);
5549 epilog_stmt
5550 = gimple_build_assign (tem, BIT_FIELD_REF,
5551 build3 (BIT_FIELD_REF, eltype,
5552 new_temp, TYPE_SIZE (eltype),
5553 bitsize_int (0)));
5554 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5555 dst1 = make_ssa_name (vectype1);
5556 epilog_stmt = gimple_build_assign (dst1, VIEW_CONVERT_EXPR,
5557 build1 (VIEW_CONVERT_EXPR,
5558 vectype1, tem));
5559 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5560 tem = make_ssa_name (eltype);
5561 epilog_stmt
5562 = gimple_build_assign (tem, BIT_FIELD_REF,
5563 build3 (BIT_FIELD_REF, eltype,
5564 new_temp, TYPE_SIZE (eltype),
5565 bitsize_int (bitsize)));
5566 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5567 dst2 = make_ssa_name (vectype1);
5568 epilog_stmt = gimple_build_assign (dst2, VIEW_CONVERT_EXPR,
5569 build1 (VIEW_CONVERT_EXPR,
5570 vectype1, tem));
5571 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5574 new_temp = make_ssa_name (vectype1);
5575 epilog_stmt = gimple_build_assign (new_temp, code, dst1, dst2);
5576 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5577 new_phis[0] = epilog_stmt;
5580 if (reduce_with_shift && !slp_reduc)
5582 int element_bitsize = tree_to_uhwi (bitsize);
5583 /* Enforced by vectorizable_reduction, which disallows SLP reductions
5584 for variable-length vectors and also requires direct target support
5585 for loop reductions. */
5586 int vec_size_in_bits = tree_to_uhwi (TYPE_SIZE (vectype1));
5587 int nelements = vec_size_in_bits / element_bitsize;
5588 vec_perm_builder sel;
5589 vec_perm_indices indices;
5591 int elt_offset;
5593 tree zero_vec = build_zero_cst (vectype1);
5594 /* Case 2: Create:
5595 for (offset = nelements/2; offset >= 1; offset/=2)
5597 Create: va' = vec_shift <va, offset>
5598 Create: va = vop <va, va'>
5599 } */
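/* A worked instance with invented values, nelements == 4 and a PLUS
   reduction on va = {a0, a1, a2, a3}; shifted-in lanes come from
   ZERO_VEC:
     offset 2:  va' = {a2, a3, 0, 0}      va = {a0+a2, a1+a3, a2, a3}
     offset 1:  va' = {a1+a3, a2, a3, 0}  va = {a0+a1+a2+a3, ...}
   Only element 0 of the final VA matters; it is extracted below by the
   BIT_FIELD_REF at offset 0.  */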
5601 tree rhs;
5603 if (dump_enabled_p ())
5604 dump_printf_loc (MSG_NOTE, vect_location,
5605 "Reduce using vector shifts\n");
5607 gimple_seq stmts = NULL;
5608 new_temp = gimple_convert (&stmts, vectype1, new_temp);
5609 for (elt_offset = nelements / 2;
5610 elt_offset >= 1;
5611 elt_offset /= 2)
5613 calc_vec_perm_mask_for_shift (elt_offset, nelements, &sel);
5614 indices.new_vector (sel, 2, nelements);
5615 tree mask = vect_gen_perm_mask_any (vectype1, indices);
5616 new_name = gimple_build (&stmts, VEC_PERM_EXPR, vectype1,
5617 new_temp, zero_vec, mask);
5618 new_temp = gimple_build (&stmts, code,
5619 vectype1, new_name, new_temp);
5621 gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
5623 /* 2.4 Extract the final scalar result. Create:
5624 s_out3 = extract_field <v_out2, bitpos> */
5626 if (dump_enabled_p ())
5627 dump_printf_loc (MSG_NOTE, vect_location,
5628 "extract scalar result\n");
5630 rhs = build3 (BIT_FIELD_REF, scalar_type, new_temp,
5631 bitsize, bitsize_zero_node);
5632 epilog_stmt = gimple_build_assign (new_scalar_dest, rhs);
5633 new_temp = make_ssa_name (new_scalar_dest, epilog_stmt);
5634 gimple_assign_set_lhs (epilog_stmt, new_temp);
5635 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5636 scalar_results.safe_push (new_temp);
5638 else
5640 /* Case 3: Create:
5641 s = extract_field <v_out2, 0>
5642 for (offset = element_size;
5643 offset < vector_size;
5644 offset += element_size;)
5646 Create: s' = extract_field <v_out2, offset>
5647 Create: s = op <s, s'> // For non SLP cases
5648 } */
5650 if (dump_enabled_p ())
5651 dump_printf_loc (MSG_NOTE, vect_location,
5652 "Reduce using scalar code.\n");
5654 int vec_size_in_bits = tree_to_uhwi (TYPE_SIZE (vectype1));
5655 int element_bitsize = tree_to_uhwi (bitsize);
5656 tree compute_type = TREE_TYPE (vectype);
5657 gimple_seq stmts = NULL;
5658 FOR_EACH_VEC_ELT (new_phis, i, new_phi)
5660 int bit_offset;
5661 if (gimple_code (new_phi) == GIMPLE_PHI)
5662 vec_temp = PHI_RESULT (new_phi);
5663 else
5664 vec_temp = gimple_assign_lhs (new_phi);
5665 new_temp = gimple_build (&stmts, BIT_FIELD_REF, compute_type,
5666 vec_temp, bitsize, bitsize_zero_node);
5668 /* In SLP we don't need to apply the reduction operation, so we just
5669 collect the s' values in SCALAR_RESULTS. */
5670 if (slp_reduc)
5671 scalar_results.safe_push (new_temp);
5673 for (bit_offset = element_bitsize;
5674 bit_offset < vec_size_in_bits;
5675 bit_offset += element_bitsize)
5677 tree bitpos = bitsize_int (bit_offset);
5678 new_name = gimple_build (&stmts, BIT_FIELD_REF,
5679 compute_type, vec_temp,
5680 bitsize, bitpos);
5681 if (slp_reduc)
5683 /* In SLP we don't need to apply the reduction operation, so
5684 we just collect the s' values in SCALAR_RESULTS. */
5685 new_temp = new_name;
5686 scalar_results.safe_push (new_name);
5688 else
5689 new_temp = gimple_build (&stmts, code, compute_type,
5690 new_name, new_temp);
5694 /* The only case where we need to reduce scalar results in SLP is
5695 unrolling. If the size of SCALAR_RESULTS is greater than
5696 REDUC_GROUP_SIZE, we reduce them by combining elements modulo
5697 REDUC_GROUP_SIZE. */
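/* For instance (illustrative only): with REDUC_GROUP_SIZE == 2 and an
   unrolled SLP reduction that produced SCALAR_RESULTS = {a0, b0, a1, b1},
   the loop below folds a1 into slot 0 and b1 into slot 1, leaving the
   two final results {a0 op a1, b0 op b1}.  */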
5698 if (slp_reduc)
5700 tree res, first_res, new_res;
5702 /* Reduce multiple scalar results in case of SLP unrolling. */
5703 for (j = group_size; scalar_results.iterate (j, &res);
5704 j++)
5706 first_res = scalar_results[j % group_size];
5707 new_res = gimple_build (&stmts, code, compute_type,
5708 first_res, res);
5709 scalar_results[j % group_size] = new_res;
5711 for (k = 0; k < group_size; k++)
5712 scalar_results[k] = gimple_convert (&stmts, scalar_type,
5713 scalar_results[k]);
5715 else
5717 /* Not SLP - we have one scalar to keep in SCALAR_RESULTS. */
5718 new_temp = gimple_convert (&stmts, scalar_type, new_temp);
5719 scalar_results.safe_push (new_temp);
5722 gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
5725 if ((STMT_VINFO_REDUC_TYPE (reduc_info) == INTEGER_INDUC_COND_REDUCTION)
5726 && induc_val)
5728 /* Earlier we set the initial value to be a vector of induc_val
5729 values. Check the result and if it is induc_val then replace
5730 it with the original initial value, unless induc_val is
5731 the same as initial_def already. */
5732 tree zcompare = build2 (EQ_EXPR, boolean_type_node, new_temp,
5733 induc_val);
5735 tree tmp = make_ssa_name (new_scalar_dest);
5736 epilog_stmt = gimple_build_assign (tmp, COND_EXPR, zcompare,
5737 initial_def, new_temp);
5738 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5739 scalar_results[0] = tmp;
5743 /* 2.5 Adjust the final result by the initial value of the reduction
5744 variable. (When such adjustment is not needed, then
5745 'adjustment_def' is zero). For example, if code is PLUS we create:
5746 new_temp = loop_exit_def + adjustment_def */
5748 if (adjustment_def)
5750 gcc_assert (!slp_reduc);
5751 gimple_seq stmts = NULL;
5752 if (nested_in_vect_loop)
5754 new_phi = new_phis[0];
5755 gcc_assert (VECTOR_TYPE_P (TREE_TYPE (adjustment_def)));
5756 adjustment_def = gimple_convert (&stmts, vectype, adjustment_def);
5757 new_temp = gimple_build (&stmts, code, vectype,
5758 PHI_RESULT (new_phi), adjustment_def);
5760 else
5762 new_temp = scalar_results[0];
5763 gcc_assert (TREE_CODE (TREE_TYPE (adjustment_def)) != VECTOR_TYPE);
5764 adjustment_def = gimple_convert (&stmts, scalar_type, adjustment_def);
5765 new_temp = gimple_build (&stmts, code, scalar_type,
5766 new_temp, adjustment_def);
5769 epilog_stmt = gimple_seq_last_stmt (stmts);
5770 gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
5771 if (nested_in_vect_loop)
5773 if (!double_reduc)
5774 scalar_results.quick_push (new_temp);
5775 else
5776 scalar_results[0] = new_temp;
5778 else
5779 scalar_results[0] = new_temp;
5781 new_phis[0] = epilog_stmt;
5784 if (double_reduc)
5785 loop = loop->inner;
5787 /* 2.6 Handle the loop-exit phis. Replace the uses of scalar loop-exit
5788 phis with new adjusted scalar results, i.e., replace use <s_out0>
5789 with use <s_out4>.
5791 Transform:
5792 loop_exit:
5793 s_out0 = phi <s_loop> # (scalar) EXIT_PHI
5794 v_out1 = phi <VECT_DEF> # NEW_EXIT_PHI
5795 v_out2 = reduce <v_out1>
5796 s_out3 = extract_field <v_out2, 0>
5797 s_out4 = adjust_result <s_out3>
5798 use <s_out0>
5799 use <s_out0>
5801 into:
5803 loop_exit:
5804 s_out0 = phi <s_loop> # (scalar) EXIT_PHI
5805 v_out1 = phi <VECT_DEF> # NEW_EXIT_PHI
5806 v_out2 = reduce <v_out1>
5807 s_out3 = extract_field <v_out2, 0>
5808 s_out4 = adjust_result <s_out3>
5809 use <s_out4>
5810 use <s_out4> */
5813 /* In an SLP reduction chain we reduce the vector results into one vector
5814 if necessary, hence we set REDUC_GROUP_SIZE to 1 here. SCALAR_DEST is the
5815 LHS of the last stmt in the reduction chain, since we are looking for
5816 the loop exit phi node. */
5817 if (REDUC_GROUP_FIRST_ELEMENT (stmt_info))
5819 stmt_vec_info dest_stmt_info
5820 = vect_orig_stmt (SLP_TREE_SCALAR_STMTS (slp_node)[group_size - 1]);
5821 scalar_dest = gimple_assign_lhs (dest_stmt_info->stmt);
5822 group_size = 1;
5825 /* In SLP we may have several statements in NEW_PHIS and REDUCTION_PHIS (in
5826 case REDUC_GROUP_SIZE is greater than the vectorization factor).
5827 Therefore, we need to match SCALAR_RESULTS with corresponding statements.
5828 The first (REDUC_GROUP_SIZE / number of new vector stmts) scalar results
5829 correspond to the first vector stmt, etc.
5830 (RATIO is equal to (REDUC_GROUP_SIZE / number of new vector stmts)). */
5831 if (group_size > new_phis.length ())
5832 gcc_assert (!(group_size % new_phis.length ()));
5834 for (k = 0; k < group_size; k++)
5836 if (slp_reduc)
5838 stmt_vec_info scalar_stmt_info = SLP_TREE_SCALAR_STMTS (slp_node)[k];
5840 orig_stmt_info = STMT_VINFO_RELATED_STMT (scalar_stmt_info);
5841 /* SLP statements can't participate in patterns. */
5842 gcc_assert (!orig_stmt_info);
5843 scalar_dest = gimple_assign_lhs (scalar_stmt_info->stmt);
5846 if (nested_in_vect_loop)
5848 if (double_reduc)
5849 loop = outer_loop;
5850 else
5851 gcc_unreachable ();
5854 phis.create (3);
5855 /* Find the loop-closed-use at the loop exit of the original scalar
5856 result. (The reduction result is expected to have two immediate uses,
5857 one at the latch block, and one at the loop exit). For double
5858 reductions we are looking for exit phis of the outer loop. */
5859 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, scalar_dest)
5861 if (!flow_bb_inside_loop_p (loop, gimple_bb (USE_STMT (use_p))))
5863 if (!is_gimple_debug (USE_STMT (use_p)))
5864 phis.safe_push (USE_STMT (use_p));
5866 else
5868 if (double_reduc && gimple_code (USE_STMT (use_p)) == GIMPLE_PHI)
5870 tree phi_res = PHI_RESULT (USE_STMT (use_p));
5872 FOR_EACH_IMM_USE_FAST (phi_use_p, phi_imm_iter, phi_res)
5874 if (!flow_bb_inside_loop_p (loop,
5875 gimple_bb (USE_STMT (phi_use_p)))
5876 && !is_gimple_debug (USE_STMT (phi_use_p)))
5877 phis.safe_push (USE_STMT (phi_use_p));
5883 FOR_EACH_VEC_ELT (phis, i, exit_phi)
5885 /* Replace the uses: */
5886 orig_name = PHI_RESULT (exit_phi);
5887 scalar_result = scalar_results[k];
5888 FOR_EACH_IMM_USE_STMT (use_stmt, imm_iter, orig_name)
5890 FOR_EACH_IMM_USE_ON_STMT (use_p, imm_iter)
5891 SET_USE (use_p, scalar_result);
5892 update_stmt (use_stmt);
5896 phis.release ();
5900 /* Return a vector of type VECTYPE that is equal to the vector select
5901 operation "MASK ? VEC : IDENTITY". Insert the select statements
5902 before GSI. */
5904 static tree
5905 merge_with_identity (gimple_stmt_iterator *gsi, tree mask, tree vectype,
5906 tree vec, tree identity)
5908 tree cond = make_temp_ssa_name (vectype, NULL, "cond");
5909 gimple *new_stmt = gimple_build_assign (cond, VEC_COND_EXPR,
5910 mask, vec, identity);
5911 gsi_insert_before (gsi, new_stmt, GSI_SAME_STMT);
5912 return cond;
5915 /* Successively apply CODE to each element of VECTOR_RHS, in left-to-right
5916 order, starting with LHS. Insert the extraction statements before GSI and
5917 associate the new scalar SSA names with variable SCALAR_DEST.
5918 Return the SSA name for the result. */
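/* As an illustrative sketch: for a hypothetical V4SF VECTOR_RHS
   {x0, x1, x2, x3} and initial LHS acc, the expansion is the strictly
   ordered sequence (((acc CODE x0) CODE x1) CODE x2) CODE x3, which is
   what an in-order (fold-left) reduction requires, e.g. floating-point
   addition without reassociation.  */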
5920 static tree
5921 vect_expand_fold_left (gimple_stmt_iterator *gsi, tree scalar_dest,
5922 tree_code code, tree lhs, tree vector_rhs)
5924 tree vectype = TREE_TYPE (vector_rhs);
5925 tree scalar_type = TREE_TYPE (vectype);
5926 tree bitsize = TYPE_SIZE (scalar_type);
5927 unsigned HOST_WIDE_INT vec_size_in_bits = tree_to_uhwi (TYPE_SIZE (vectype));
5928 unsigned HOST_WIDE_INT element_bitsize = tree_to_uhwi (bitsize);
5930 for (unsigned HOST_WIDE_INT bit_offset = 0;
5931 bit_offset < vec_size_in_bits;
5932 bit_offset += element_bitsize)
5934 tree bitpos = bitsize_int (bit_offset);
5935 tree rhs = build3 (BIT_FIELD_REF, scalar_type, vector_rhs,
5936 bitsize, bitpos);
5938 gassign *stmt = gimple_build_assign (scalar_dest, rhs);
5939 rhs = make_ssa_name (scalar_dest, stmt);
5940 gimple_assign_set_lhs (stmt, rhs);
5941 gsi_insert_before (gsi, stmt, GSI_SAME_STMT);
5943 stmt = gimple_build_assign (scalar_dest, code, lhs, rhs);
5944 tree new_name = make_ssa_name (scalar_dest, stmt);
5945 gimple_assign_set_lhs (stmt, new_name);
5946 gsi_insert_before (gsi, stmt, GSI_SAME_STMT);
5947 lhs = new_name;
5949 return lhs;
5952 /* Get a masked internal function equivalent to REDUC_FN. VECTYPE_IN is the
5953 type of the vector input. */
5955 static internal_fn
5956 get_masked_reduction_fn (internal_fn reduc_fn, tree vectype_in)
5958 internal_fn mask_reduc_fn;
5960 switch (reduc_fn)
5962 case IFN_FOLD_LEFT_PLUS:
5963 mask_reduc_fn = IFN_MASK_FOLD_LEFT_PLUS;
5964 break;
5966 default:
5967 return IFN_LAST;
5970 if (direct_internal_fn_supported_p (mask_reduc_fn, vectype_in,
5971 OPTIMIZE_FOR_SPEED))
5972 return mask_reduc_fn;
5973 return IFN_LAST;
5976 /* Perform an in-order reduction (FOLD_LEFT_REDUCTION). STMT_INFO is the
5977 statement that sets the live-out value. REDUC_DEF_STMT is the phi
5978 statement. CODE is the operation performed by STMT_INFO and OPS are
5979 its scalar operands. REDUC_INDEX is the index of the operand in
5980 OPS that is set by REDUC_DEF_STMT. REDUC_FN is the function that
5981 implements in-order reduction, or IFN_LAST if we should open-code it.
5982 VECTYPE_IN is the type of the vector input. MASKS specifies the masks
5983 that should be used to control the operation in a fully-masked loop. */
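/* Sketch of the fully-masked fallback (assumed here: the masked internal
   function is not available): each vector operand DEF0 is first rewritten
   by merge_with_identity as MASK ? DEF0 : {0, ..., 0}, so inactive lanes
   contribute the identity of the PLUS/MINUS operations handled here and
   the unmasked fold-left sequence can then be reused unchanged.  */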
5985 static bool
5986 vectorize_fold_left_reduction (loop_vec_info loop_vinfo,
5987 stmt_vec_info stmt_info,
5988 gimple_stmt_iterator *gsi,
5989 gimple **vec_stmt, slp_tree slp_node,
5990 gimple *reduc_def_stmt,
5991 tree_code code, internal_fn reduc_fn,
5992 tree ops[3], tree vectype_in,
5993 int reduc_index, vec_loop_masks *masks)
5995 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
5996 tree vectype_out = STMT_VINFO_VECTYPE (stmt_info);
5997 internal_fn mask_reduc_fn = get_masked_reduction_fn (reduc_fn, vectype_in);
5999 int ncopies;
6000 if (slp_node)
6001 ncopies = 1;
6002 else
6003 ncopies = vect_get_num_copies (loop_vinfo, vectype_in);
6005 gcc_assert (!nested_in_vect_loop_p (loop, stmt_info));
6006 gcc_assert (ncopies == 1);
6007 gcc_assert (TREE_CODE_LENGTH (code) == binary_op);
6009 if (slp_node)
6010 gcc_assert (known_eq (TYPE_VECTOR_SUBPARTS (vectype_out),
6011 TYPE_VECTOR_SUBPARTS (vectype_in)));
6013 tree op0 = ops[1 - reduc_index];
6015 int group_size = 1;
6016 stmt_vec_info scalar_dest_def_info;
6017 auto_vec<tree> vec_oprnds0;
6018 if (slp_node)
6020 auto_vec<vec<tree> > vec_defs (2);
6021 vect_get_slp_defs (loop_vinfo, slp_node, &vec_defs);
6022 vec_oprnds0.safe_splice (vec_defs[1 - reduc_index]);
6023 vec_defs[0].release ();
6024 vec_defs[1].release ();
6025 group_size = SLP_TREE_SCALAR_STMTS (slp_node).length ();
6026 scalar_dest_def_info = SLP_TREE_SCALAR_STMTS (slp_node)[group_size - 1];
6028 else
6030 vect_get_vec_defs_for_operand (loop_vinfo, stmt_info, 1,
6031 op0, &vec_oprnds0);
6032 scalar_dest_def_info = stmt_info;
6035 tree scalar_dest = gimple_assign_lhs (scalar_dest_def_info->stmt);
6036 tree scalar_type = TREE_TYPE (scalar_dest);
6037 tree reduc_var = gimple_phi_result (reduc_def_stmt);
6039 int vec_num = vec_oprnds0.length ();
6040 gcc_assert (vec_num == 1 || slp_node);
6041 tree vec_elem_type = TREE_TYPE (vectype_out);
6042 gcc_checking_assert (useless_type_conversion_p (scalar_type, vec_elem_type));
6044 tree vector_identity = NULL_TREE;
6045 if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
6046 vector_identity = build_zero_cst (vectype_out);
6048 tree scalar_dest_var = vect_create_destination_var (scalar_dest, NULL);
6049 int i;
6050 tree def0;
6051 FOR_EACH_VEC_ELT (vec_oprnds0, i, def0)
6053 gimple *new_stmt;
6054 tree mask = NULL_TREE;
6055 if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
6056 mask = vect_get_loop_mask (gsi, masks, vec_num, vectype_in, i);
6058 /* Handle MINUS by adding the negative. */
6059 if (reduc_fn != IFN_LAST && code == MINUS_EXPR)
6061 tree negated = make_ssa_name (vectype_out);
6062 new_stmt = gimple_build_assign (negated, NEGATE_EXPR, def0);
6063 gsi_insert_before (gsi, new_stmt, GSI_SAME_STMT);
6064 def0 = negated;
6067 if (mask && mask_reduc_fn == IFN_LAST)
6068 def0 = merge_with_identity (gsi, mask, vectype_out, def0,
6069 vector_identity);
6071 /* On the first iteration the input is simply the scalar phi
6072 result, and for subsequent iterations it is the output of
6073 the preceding operation. */
6074 if (reduc_fn != IFN_LAST || (mask && mask_reduc_fn != IFN_LAST))
6076 if (mask && mask_reduc_fn != IFN_LAST)
6077 new_stmt = gimple_build_call_internal (mask_reduc_fn, 3, reduc_var,
6078 def0, mask);
6079 else
6080 new_stmt = gimple_build_call_internal (reduc_fn, 2, reduc_var,
6081 def0);
6082 /* For chained SLP reductions the output of the previous reduction
6083 operation serves as the input of the next. For the final statement
6084 the output cannot be a temporary - we reuse the original
6085 scalar destination of the last statement. */
6086 if (i != vec_num - 1)
6088 gimple_set_lhs (new_stmt, scalar_dest_var);
6089 reduc_var = make_ssa_name (scalar_dest_var, new_stmt);
6090 gimple_set_lhs (new_stmt, reduc_var);
6093 else
6095 reduc_var = vect_expand_fold_left (gsi, scalar_dest_var, code,
6096 reduc_var, def0);
6097 new_stmt = SSA_NAME_DEF_STMT (reduc_var);
6098 /* Remove the statement, so that we can use the same code paths
6099 as for statements that we've just created. */
6100 gimple_stmt_iterator tmp_gsi = gsi_for_stmt (new_stmt);
6101 gsi_remove (&tmp_gsi, true);
6104 if (i == vec_num - 1)
6106 gimple_set_lhs (new_stmt, scalar_dest);
6107 vect_finish_replace_stmt (loop_vinfo,
6108 scalar_dest_def_info,
6109 new_stmt);
6111 else
6112 vect_finish_stmt_generation (loop_vinfo,
6113 scalar_dest_def_info,
6114 new_stmt, gsi);
6116 if (slp_node)
6117 SLP_TREE_VEC_STMTS (slp_node).quick_push (new_stmt);
6118 else
6120 STMT_VINFO_VEC_STMTS (stmt_info).safe_push (new_stmt);
6121 *vec_stmt = new_stmt;
6125 return true;
6128 /* Function is_nonwrapping_integer_induction.
6130 Check if STMT_VINFO (which is part of loop LOOP) is an induction that
6131 both increments and does not cause overflow. */
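/* A numeric sketch of the check below (values invented): for an unsigned
   short IV with base 0 and step 4 in a loop executing at most 20000
   iterations, the maximum value is 0 + 4 * 20000 = 80000, which needs 17
   bits and so exceeds the 16-bit precision of the type; the induction is
   then rejected as possibly wrapping.  */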
6133 static bool
6134 is_nonwrapping_integer_induction (stmt_vec_info stmt_vinfo, class loop *loop)
6136 gphi *phi = as_a <gphi *> (stmt_vinfo->stmt);
6137 tree base = STMT_VINFO_LOOP_PHI_EVOLUTION_BASE_UNCHANGED (stmt_vinfo);
6138 tree step = STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_vinfo);
6139 tree lhs_type = TREE_TYPE (gimple_phi_result (phi));
6140 widest_int ni, max_loop_value, lhs_max;
6141 wi::overflow_type overflow = wi::OVF_NONE;
6143 /* Make sure the loop is integer based. */
6144 if (TREE_CODE (base) != INTEGER_CST
6145 || TREE_CODE (step) != INTEGER_CST)
6146 return false;
6148 /* Check that the max size of the loop will not wrap. */
6150 if (TYPE_OVERFLOW_UNDEFINED (lhs_type))
6151 return true;
6153 if (! max_stmt_executions (loop, &ni))
6154 return false;
6156 max_loop_value = wi::mul (wi::to_widest (step), ni, TYPE_SIGN (lhs_type),
6157 &overflow);
6158 if (overflow)
6159 return false;
6161 max_loop_value = wi::add (wi::to_widest (base), max_loop_value,
6162 TYPE_SIGN (lhs_type), &overflow);
6163 if (overflow)
6164 return false;
6166 return (wi::min_precision (max_loop_value, TYPE_SIGN (lhs_type))
6167 <= TYPE_PRECISION (lhs_type));
6170 /* Check if masking can be supported by inserting a conditional expression.
6171 CODE is the code for the operation. COND_FN is the conditional internal
6172 function, if it exists. VECTYPE_IN is the type of the vector input. */
6173 static bool
6174 use_mask_by_cond_expr_p (enum tree_code code, internal_fn cond_fn,
6175 tree vectype_in)
6177 if (cond_fn != IFN_LAST
6178 && direct_internal_fn_supported_p (cond_fn, vectype_in,
6179 OPTIMIZE_FOR_SPEED))
6180 return false;
6182 switch (code)
6184 case DOT_PROD_EXPR:
6185 case SAD_EXPR:
6186 return true;
6188 default:
6189 return false;
6193 /* Insert a conditional expression to enable masked vectorization. CODE is the
6194 code for the operation. VOP is the array of operands. MASK is the loop
6195 mask. GSI is a statement iterator used to place the new conditional
6196 expression. */
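/* The value selected for the masked-off lanes acts as a per-lane identity:
   for DOT_PROD_EXPR those lanes of op1 become 0, so their products add
   nothing to the accumulator, and for SAD_EXPR they are replaced by the
   corresponding lanes of op0, making the absolute difference for those
   lanes 0.  */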
6197 static void
6198 build_vect_cond_expr (enum tree_code code, tree vop[3], tree mask,
6199 gimple_stmt_iterator *gsi)
6201 switch (code)
6203 case DOT_PROD_EXPR:
6205 tree vectype = TREE_TYPE (vop[1]);
6206 tree zero = build_zero_cst (vectype);
6207 tree masked_op1 = make_temp_ssa_name (vectype, NULL, "masked_op1");
6208 gassign *select = gimple_build_assign (masked_op1, VEC_COND_EXPR,
6209 mask, vop[1], zero);
6210 gsi_insert_before (gsi, select, GSI_SAME_STMT);
6211 vop[1] = masked_op1;
6212 break;
6215 case SAD_EXPR:
6217 tree vectype = TREE_TYPE (vop[1]);
6218 tree masked_op1 = make_temp_ssa_name (vectype, NULL, "masked_op1");
6219 gassign *select = gimple_build_assign (masked_op1, VEC_COND_EXPR,
6220 mask, vop[1], vop[0]);
6221 gsi_insert_before (gsi, select, GSI_SAME_STMT);
6222 vop[1] = masked_op1;
6223 break;
6226 default:
6227 gcc_unreachable ();
6231 /* Function vectorizable_reduction.
6233 Check if STMT_INFO performs a reduction operation that can be vectorized.
6234 If VEC_STMT is also passed, vectorize STMT_INFO: create a vectorized
6235 stmt to replace it, put it in VEC_STMT, and insert it at GSI.
6236 Return true if STMT_INFO is vectorizable in this way.
6238 This function also handles reduction idioms (patterns) that have been
6239 recognized in advance during vect_pattern_recog. In this case, STMT_INFO
6240 may be of this form:
6241 X = pattern_expr (arg0, arg1, ..., X)
6242 and its STMT_VINFO_RELATED_STMT points to the last stmt in the original
6243 sequence that had been detected and replaced by the pattern-stmt
6244 (STMT_INFO).
6246 This function also handles reduction of condition expressions, for example:
6247 for (int i = 0; i < N; i++)
6248 if (a[i] < value)
6249 last = a[i];
6250 This is handled by vectorizing the loop and creating an additional vector
6251 containing the loop indexes for which "a[i] < value" was true. In the
6252 function epilogue this is reduced to a single max value and then used to
6253 index into the vector of results.
6255 In some cases of reduction patterns, the type of the reduction variable X is
6256 different than the type of the other arguments of STMT_INFO.
6257 In such cases, the vectype that is used when transforming STMT_INFO into
6258 a vector stmt is different than the vectype that is used to determine the
6259 vectorization factor, because it consists of a different number of elements
6260 than the actual number of elements that are being operated upon in parallel.
6262 For example, consider an accumulation of shorts into an int accumulator.
6263 On some targets it's possible to vectorize this pattern operating on 8
6264 shorts at a time (hence, the vectype for purposes of determining the
6265 vectorization factor should be V8HI); on the other hand, the vectype that
6266 is used to create the vector form is actually V4SI (the type of the result).
6268 Upon entry to this function, STMT_VINFO_VECTYPE records the vectype that
6269 indicates what is the actual level of parallelism (V8HI in the example), so
6270 that the right vectorization factor would be derived. This vectype
6271 corresponds to the type of arguments to the reduction stmt, and should *NOT*
6272 be used to create the vectorized stmt. The right vectype for the vectorized
6273 stmt is obtained from the type of the result X:
6274 get_vectype_for_scalar_type (vinfo, TREE_TYPE (X))
6276 This means that, contrary to "regular" reductions (or "regular" stmts in
6277 general), the following equation:
6278 STMT_VINFO_VECTYPE == get_vectype_for_scalar_type (vinfo, TREE_TYPE (X))
6279 does *NOT* necessarily hold for reduction patterns. */
6281 bool
6282 vectorizable_reduction (loop_vec_info loop_vinfo,
6283 stmt_vec_info stmt_info, slp_tree slp_node,
6284 slp_instance slp_node_instance,
6285 stmt_vector_for_cost *cost_vec)
6287 tree scalar_dest;
6288 tree vectype_in = NULL_TREE;
6289 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
6290 enum vect_def_type cond_reduc_dt = vect_unknown_def_type;
6291 stmt_vec_info cond_stmt_vinfo = NULL;
6292 tree scalar_type;
6293 int i;
6294 int ncopies;
6295 bool single_defuse_cycle = false;
6296 bool nested_cycle = false;
6297 bool double_reduc = false;
6298 int vec_num;
6299 tree tem;
6300 tree cr_index_scalar_type = NULL_TREE, cr_index_vector_type = NULL_TREE;
6301 tree cond_reduc_val = NULL_TREE;
6303 /* Make sure it was already recognized as a reduction computation. */
6304 if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_reduction_def
6305 && STMT_VINFO_DEF_TYPE (stmt_info) != vect_double_reduction_def
6306 && STMT_VINFO_DEF_TYPE (stmt_info) != vect_nested_cycle)
6307 return false;
6309 /* The stmt we store reduction analysis meta on. */
6310 stmt_vec_info reduc_info = info_for_reduction (loop_vinfo, stmt_info);
6311 reduc_info->is_reduc_info = true;
6313 if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_nested_cycle)
6315 if (is_a <gphi *> (stmt_info->stmt))
6316 /* Analysis for double-reduction is done on the outer
6317 loop PHI; nested cycles have no further restrictions. */
6318 STMT_VINFO_TYPE (stmt_info) = cycle_phi_info_type;
6319 else
6320 STMT_VINFO_TYPE (stmt_info) = reduc_vec_info_type;
6321 return true;
6324 stmt_vec_info orig_stmt_of_analysis = stmt_info;
6325 stmt_vec_info phi_info = stmt_info;
6326 if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_reduction_def
6327 || STMT_VINFO_DEF_TYPE (stmt_info) == vect_double_reduction_def)
6329 if (!is_a <gphi *> (stmt_info->stmt))
6331 STMT_VINFO_TYPE (stmt_info) = reduc_vec_info_type;
6332 return true;
6334 if (slp_node)
6336 slp_node_instance->reduc_phis = slp_node;
6337 /* ??? We're leaving slp_node to point to the PHIs, we only
6338 need it to get at the number of vector stmts which wasn't
6339 yet initialized for the instance root. */
6341 if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_reduction_def)
6342 stmt_info = vect_stmt_to_vectorize (STMT_VINFO_REDUC_DEF (stmt_info));
6343 else /* STMT_VINFO_DEF_TYPE (stmt_info) == vect_double_reduction_def */
6345 use_operand_p use_p;
6346 gimple *use_stmt;
6347 bool res = single_imm_use (gimple_phi_result (stmt_info->stmt),
6348 &use_p, &use_stmt);
6349 gcc_assert (res);
6350 phi_info = loop_vinfo->lookup_stmt (use_stmt);
6351 stmt_info = vect_stmt_to_vectorize (STMT_VINFO_REDUC_DEF (phi_info));
6355 /* PHIs should not participate in patterns. */
6356 gcc_assert (!STMT_VINFO_RELATED_STMT (phi_info));
6357 gphi *reduc_def_phi = as_a <gphi *> (phi_info->stmt);
6359 /* Verify following REDUC_IDX from the latch def leads us back to the PHI
6360 and compute the reduction chain length. Discover the real
6361 reduction operation stmt on the way (stmt_info and slp_for_stmt_info). */
6362 tree reduc_def
6363 = PHI_ARG_DEF_FROM_EDGE (reduc_def_phi,
6364 loop_latch_edge
6365 (gimple_bb (reduc_def_phi)->loop_father));
6366 unsigned reduc_chain_length = 0;
6367 bool only_slp_reduc_chain = true;
6368 stmt_info = NULL;
6369 slp_tree slp_for_stmt_info = slp_node ? slp_node_instance->root : NULL;
6370 while (reduc_def != PHI_RESULT (reduc_def_phi))
6372 stmt_vec_info def = loop_vinfo->lookup_def (reduc_def);
6373 stmt_vec_info vdef = vect_stmt_to_vectorize (def);
6374 if (STMT_VINFO_REDUC_IDX (vdef) == -1)
6376 if (dump_enabled_p ())
6377 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6378 "reduction chain broken by patterns.\n");
6379 return false;
6381 if (!REDUC_GROUP_FIRST_ELEMENT (vdef))
6382 only_slp_reduc_chain = false;
6383 /* ??? For epilogue generation live members of the chain need
6384 to point back to the PHI via their original stmt for
6385 info_for_reduction to work. */
6386 if (STMT_VINFO_LIVE_P (vdef))
6387 STMT_VINFO_REDUC_DEF (def) = phi_info;
6388 gassign *assign = dyn_cast <gassign *> (vdef->stmt);
6389 if (!assign)
6391 if (dump_enabled_p ())
6392 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6393 "reduction chain includes calls.\n");
6394 return false;
6396 if (CONVERT_EXPR_CODE_P (gimple_assign_rhs_code (assign)))
6398 if (!tree_nop_conversion_p (TREE_TYPE (gimple_assign_lhs (assign)),
6399 TREE_TYPE (gimple_assign_rhs1 (assign))))
6401 if (dump_enabled_p ())
6402 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6403 "conversion in the reduction chain.\n");
6404 return false;
6407 else if (!stmt_info)
6408 /* First non-conversion stmt. */
6409 stmt_info = vdef;
6410 reduc_def = gimple_op (vdef->stmt, 1 + STMT_VINFO_REDUC_IDX (vdef));
6411 reduc_chain_length++;
6412 if (!stmt_info && slp_node)
6413 slp_for_stmt_info = SLP_TREE_CHILDREN (slp_for_stmt_info)[0];
6415 /* PHIs should not participate in patterns. */
6416 gcc_assert (!STMT_VINFO_RELATED_STMT (phi_info));
6418 if (nested_in_vect_loop_p (loop, stmt_info))
6420 loop = loop->inner;
6421 nested_cycle = true;
6424 /* STMT_VINFO_REDUC_DEF doesn't point to the first but the last
6425 element. */
6426 if (slp_node && REDUC_GROUP_FIRST_ELEMENT (stmt_info))
6428 gcc_assert (!REDUC_GROUP_NEXT_ELEMENT (stmt_info));
6429 stmt_info = REDUC_GROUP_FIRST_ELEMENT (stmt_info);
6431 if (REDUC_GROUP_FIRST_ELEMENT (stmt_info))
6432 gcc_assert (slp_node
6433 && REDUC_GROUP_FIRST_ELEMENT (stmt_info) == stmt_info);
6435 /* 1. Is vectorizable reduction? */
6436 /* Not supportable if the reduction variable is used in the loop, unless
6437 it's a reduction chain. */
6438 if (STMT_VINFO_RELEVANT (stmt_info) > vect_used_in_outer
6439 && !REDUC_GROUP_FIRST_ELEMENT (stmt_info))
6440 return false;
6442 /* Reductions that are not used even in an enclosing outer-loop
6443 are expected to be "live" (used out of the loop). */
6444 if (STMT_VINFO_RELEVANT (stmt_info) == vect_unused_in_scope
6445 && !STMT_VINFO_LIVE_P (stmt_info))
6446 return false;
6448 /* 2. Has this been recognized as a reduction pattern?
6450 Check if STMT represents a pattern that has been recognized
6451 in earlier analysis stages. For stmts that represent a pattern,
6452 the STMT_VINFO_RELATED_STMT field records the last stmt in
6453 the original sequence that constitutes the pattern. */
6455 stmt_vec_info orig_stmt_info = STMT_VINFO_RELATED_STMT (stmt_info);
6456 if (orig_stmt_info)
6458 gcc_assert (STMT_VINFO_IN_PATTERN_P (orig_stmt_info));
6459 gcc_assert (!STMT_VINFO_IN_PATTERN_P (stmt_info));
6462 /* 3. Check the operands of the operation. The first operands are defined
6463 inside the loop body. The last operand is the reduction variable,
6464 which is defined by the loop-header-phi. */
6466 tree vectype_out = STMT_VINFO_VECTYPE (stmt_info);
6467 STMT_VINFO_REDUC_VECTYPE (reduc_info) = vectype_out;
6468 gassign *stmt = as_a <gassign *> (stmt_info->stmt);
6469 enum tree_code code = gimple_assign_rhs_code (stmt);
6470 bool lane_reduc_code_p
6471 = (code == DOT_PROD_EXPR || code == WIDEN_SUM_EXPR || code == SAD_EXPR);
6472 int op_type = TREE_CODE_LENGTH (code);
6474 scalar_dest = gimple_assign_lhs (stmt);
6475 scalar_type = TREE_TYPE (scalar_dest);
6476 if (!POINTER_TYPE_P (scalar_type) && !INTEGRAL_TYPE_P (scalar_type)
6477 && !SCALAR_FLOAT_TYPE_P (scalar_type))
6478 return false;
6480 /* Do not try to vectorize bit-precision reductions. */
6481 if (!type_has_mode_precision_p (scalar_type))
6482 return false;
6484 /* For lane-reducing ops we're reducing the number of reduction PHIs
6485 which means the only use of that may be in the lane-reducing operation. */
6486 if (lane_reduc_code_p
6487 && reduc_chain_length != 1
6488 && !only_slp_reduc_chain)
6490 if (dump_enabled_p ())
6491 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6492 "lane-reducing reduction with extra stmts.\n");
6493 return false;
6496 /* All uses but the last are expected to be defined in the loop.
6497 The last use is the reduction variable. In case of nested cycle this
6498 assumption is not true: we use reduc_index to record the index of the
6499 reduction variable. */
6500 slp_tree *slp_op = XALLOCAVEC (slp_tree, op_type);
6501 /* We need to skip an extra operand for COND_EXPRs with embedded
6502 comparison. */
6503 unsigned opno_adjust = 0;
6504 if (code == COND_EXPR
6505 && COMPARISON_CLASS_P (gimple_assign_rhs1 (stmt)))
6506 opno_adjust = 1;
6507 for (i = 0; i < op_type; i++)
6509 /* The condition of COND_EXPR is checked in vectorizable_condition(). */
6510 if (i == 0 && code == COND_EXPR)
6511 continue;
6513 stmt_vec_info def_stmt_info;
6514 enum vect_def_type dt;
6515 tree op;
6516 if (!vect_is_simple_use (loop_vinfo, stmt_info, slp_for_stmt_info,
6517 i + opno_adjust, &op, &slp_op[i], &dt, &tem,
6518 &def_stmt_info))
6520 if (dump_enabled_p ())
6521 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6522 "use not simple.\n");
6523 return false;
6525 if (i == STMT_VINFO_REDUC_IDX (stmt_info))
6526 continue;
6528 /* There should be only one cycle def in the stmt, the one
6529 leading to reduc_def. */
6530 if (VECTORIZABLE_CYCLE_DEF (dt))
6531 return false;
6533 /* To properly compute ncopies we are interested in the widest
6534 non-reduction input type in case we're looking at a widening
6535 accumulation that we later handle in vect_transform_reduction. */
6536 if (lane_reduc_code_p
6537 && tem
6538 && (!vectype_in
6539 || (GET_MODE_SIZE (SCALAR_TYPE_MODE (TREE_TYPE (vectype_in)))
6540 < GET_MODE_SIZE (SCALAR_TYPE_MODE (TREE_TYPE (tem))))))
6541 vectype_in = tem;
6543 if (code == COND_EXPR)
6545 /* Record how the non-reduction-def value of COND_EXPR is defined. */
6546 if (dt == vect_constant_def)
6548 cond_reduc_dt = dt;
6549 cond_reduc_val = op;
6551 if (dt == vect_induction_def
6552 && def_stmt_info
6553 && is_nonwrapping_integer_induction (def_stmt_info, loop))
6555 cond_reduc_dt = dt;
6556 cond_stmt_vinfo = def_stmt_info;
6560 if (!vectype_in)
6561 vectype_in = STMT_VINFO_VECTYPE (phi_info);
6562 STMT_VINFO_REDUC_VECTYPE_IN (reduc_info) = vectype_in;
6564 enum vect_reduction_type v_reduc_type = STMT_VINFO_REDUC_TYPE (phi_info);
6565 STMT_VINFO_REDUC_TYPE (reduc_info) = v_reduc_type;
6566 /* If we have a condition reduction, see if we can simplify it further. */
6567 if (v_reduc_type == COND_REDUCTION)
6569 if (slp_node)
6570 return false;
6572 /* If the condition uses the reduction value, fail. */
6573 if (STMT_VINFO_REDUC_IDX (stmt_info) == 0)
6575 if (dump_enabled_p ())
6576 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6577 "condition depends on previous iteration\n");
6578 return false;
6581 if (reduc_chain_length == 1
6582 && direct_internal_fn_supported_p (IFN_FOLD_EXTRACT_LAST,
6583 vectype_in, OPTIMIZE_FOR_SPEED))
6585 if (dump_enabled_p ())
6586 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6587 "optimizing condition reduction with"
6588 " FOLD_EXTRACT_LAST.\n");
6589 STMT_VINFO_REDUC_TYPE (reduc_info) = EXTRACT_LAST_REDUCTION;
6591 else if (cond_reduc_dt == vect_induction_def)
6593 tree base
6594 = STMT_VINFO_LOOP_PHI_EVOLUTION_BASE_UNCHANGED (cond_stmt_vinfo);
6595 tree step = STMT_VINFO_LOOP_PHI_EVOLUTION_PART (cond_stmt_vinfo);
6597 gcc_assert (TREE_CODE (base) == INTEGER_CST
6598 && TREE_CODE (step) == INTEGER_CST);
6599 cond_reduc_val = NULL_TREE;
6600 enum tree_code cond_reduc_op_code = ERROR_MARK;
6601 tree res = PHI_RESULT (STMT_VINFO_STMT (cond_stmt_vinfo));
6602 if (!types_compatible_p (TREE_TYPE (res), TREE_TYPE (base)))
6604 /* Find a suitable value: below base for MAX_EXPR, above base for
6605 MIN_EXPR; for now punt if base is the minimum value of the type for
6606 MAX_EXPR or the maximum value of the type for MIN_EXPR. */
6607 else if (tree_int_cst_sgn (step) == -1)
6609 cond_reduc_op_code = MIN_EXPR;
6610 if (tree_int_cst_sgn (base) == -1)
6611 cond_reduc_val = build_int_cst (TREE_TYPE (base), 0);
6612 else if (tree_int_cst_lt (base,
6613 TYPE_MAX_VALUE (TREE_TYPE (base))))
6614 cond_reduc_val
6615 = int_const_binop (PLUS_EXPR, base, integer_one_node);
6617 else
6619 cond_reduc_op_code = MAX_EXPR;
6620 if (tree_int_cst_sgn (base) == 1)
6621 cond_reduc_val = build_int_cst (TREE_TYPE (base), 0);
6622 else if (tree_int_cst_lt (TYPE_MIN_VALUE (TREE_TYPE (base)),
6623 base))
6624 cond_reduc_val
6625 = int_const_binop (MINUS_EXPR, base, integer_one_node);
6627 if (cond_reduc_val)
6629 if (dump_enabled_p ())
6630 dump_printf_loc (MSG_NOTE, vect_location,
6631 "condition expression based on "
6632 "integer induction.\n");
6633 STMT_VINFO_REDUC_CODE (reduc_info) = cond_reduc_op_code;
6634 STMT_VINFO_VEC_INDUC_COND_INITIAL_VAL (reduc_info)
6635 = cond_reduc_val;
6636 STMT_VINFO_REDUC_TYPE (reduc_info) = INTEGER_INDUC_COND_REDUCTION;
6639 else if (cond_reduc_dt == vect_constant_def)
6641 enum vect_def_type cond_initial_dt;
6642 tree cond_initial_val
6643 = PHI_ARG_DEF_FROM_EDGE (reduc_def_phi, loop_preheader_edge (loop));
6645 gcc_assert (cond_reduc_val != NULL_TREE);
6646 vect_is_simple_use (cond_initial_val, loop_vinfo, &cond_initial_dt);
6647 if (cond_initial_dt == vect_constant_def
6648 && types_compatible_p (TREE_TYPE (cond_initial_val),
6649 TREE_TYPE (cond_reduc_val)))
6651 tree e = fold_binary (LE_EXPR, boolean_type_node,
6652 cond_initial_val, cond_reduc_val);
6653 if (e && (integer_onep (e) || integer_zerop (e)))
6655 if (dump_enabled_p ())
6656 dump_printf_loc (MSG_NOTE, vect_location,
6657 "condition expression based on "
6658 "compile time constant.\n");
6659 /* Record reduction code at analysis stage. */
6660 STMT_VINFO_REDUC_CODE (reduc_info)
6661 = integer_onep (e) ? MAX_EXPR : MIN_EXPR;
6662 STMT_VINFO_REDUC_TYPE (reduc_info) = CONST_COND_REDUCTION;
6668 if (STMT_VINFO_LIVE_P (phi_info))
6669 return false;
6671 if (slp_node)
6672 ncopies = 1;
6673 else
6674 ncopies = vect_get_num_copies (loop_vinfo, vectype_in);
6676 gcc_assert (ncopies >= 1);
6678 poly_uint64 nunits_out = TYPE_VECTOR_SUBPARTS (vectype_out);
6680 if (nested_cycle)
6682 gcc_assert (STMT_VINFO_DEF_TYPE (reduc_info)
6683 == vect_double_reduction_def);
6684 double_reduc = true;
6687 /* 4.2. Check support for the epilog operation.
6689 If STMT represents a reduction pattern, then the type of the
6690 reduction variable may be different than the type of the rest
6691 of the arguments. For example, consider the case of accumulation
6692 of shorts into an int accumulator. The original code:
6693 S1: int_a = (int) short_a;
6694 orig_stmt-> S2: int_acc = plus <int_a ,int_acc>;
6696 was replaced with:
6697 STMT: int_acc = widen_sum <short_a, int_acc>
6699 This means that:
6700 1. The tree-code that is used to create the vector operation in the
6701 epilog code (that reduces the partial results) is not the
6702 tree-code of STMT, but is rather the tree-code of the original
6703 stmt from the pattern that STMT is replacing. I.e., in the example
6704 above we want to use 'widen_sum' in the loop, but 'plus' in the
6705 epilog.
6706 2. The type (mode) we use to check available target support
6707 for the vector operation to be created in the *epilog*, is
6708 determined by the type of the reduction variable (in the example
6709 above we'd check this: optab_handler (plus_optab, vect_int_mode)).
6710 However the type (mode) we use to check available target support
6711 for the vector operation to be created *inside the loop*, is
6712 determined by the type of the other arguments to STMT (in the
6713 example we'd check this: optab_handler (widen_sum_optab,
6714 vect_short_mode)).
6716 This is contrary to "regular" reductions, in which the types of all
6717 the arguments are the same as the type of the reduction variable.
6718 For "regular" reductions we can therefore use the same vector type
6719 (and also the same tree-code) when generating the epilog code and
6720 when generating the code inside the loop. */
6722 enum tree_code orig_code = STMT_VINFO_REDUC_CODE (phi_info);
6723 STMT_VINFO_REDUC_CODE (reduc_info) = orig_code;
6725 vect_reduction_type reduction_type = STMT_VINFO_REDUC_TYPE (reduc_info);
6726 if (reduction_type == TREE_CODE_REDUCTION)
6728 /* Check whether it's ok to change the order of the computation.
6729 Generally, when vectorizing a reduction we change the order of the
6730 computation. This may change the behavior of the program in some
6731 cases, so we need to check that this is ok. One exception is when
6732 vectorizing an outer-loop: the inner-loop is executed sequentially,
6733 and therefore vectorizing reductions in the inner-loop during
6734 outer-loop vectorization is safe. */
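/* Illustrative example (not from the original source): a floating-point
   accumulation such as

     double s = 0.0;
     for (i = 0; i < N; i++)
       s += a[i];

   compiled without -ffast-math/-fassociative-math may not be reassociated,
   which is what selects the FOLD_LEFT_REDUCTION path below.  */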
6735 if (needs_fold_left_reduction_p (scalar_type, orig_code))
6737 /* When vectorizing a reduction chain w/o SLP the reduction PHI
6738 is not directly used in stmt. */
6739 if (!only_slp_reduc_chain
6740 && reduc_chain_length != 1)
6742 if (dump_enabled_p ())
6743 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6744 "in-order reduction chain without SLP.\n");
6745 return false;
6747 STMT_VINFO_REDUC_TYPE (reduc_info)
6748 = reduction_type = FOLD_LEFT_REDUCTION;
6750 else if (!commutative_tree_code (orig_code)
6751 || !associative_tree_code (orig_code))
6753 if (dump_enabled_p ())
6754 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6755 "reduction: not commutative/associative");
6756 return false;
6760 if ((double_reduc || reduction_type != TREE_CODE_REDUCTION)
6761 && ncopies > 1)
6763 if (dump_enabled_p ())
6764 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6765 "multiple types in double reduction or condition "
6766 "reduction or fold-left reduction.\n");
6767 return false;
6770 internal_fn reduc_fn = IFN_LAST;
6771 if (reduction_type == TREE_CODE_REDUCTION
6772 || reduction_type == FOLD_LEFT_REDUCTION
6773 || reduction_type == INTEGER_INDUC_COND_REDUCTION
6774 || reduction_type == CONST_COND_REDUCTION)
6776 if (reduction_type == FOLD_LEFT_REDUCTION
6777 ? fold_left_reduction_fn (orig_code, &reduc_fn)
6778 : reduction_fn_for_scalar_code (orig_code, &reduc_fn))
6780 if (reduc_fn != IFN_LAST
6781 && !direct_internal_fn_supported_p (reduc_fn, vectype_out,
6782 OPTIMIZE_FOR_SPEED))
6784 if (dump_enabled_p ())
6785 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6786 "reduc op not supported by target.\n");
6788 reduc_fn = IFN_LAST;
6791 else
6793 if (!nested_cycle || double_reduc)
6795 if (dump_enabled_p ())
6796 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6797 "no reduc code for scalar code.\n");
6799 return false;
6803 else if (reduction_type == COND_REDUCTION)
6805 int scalar_precision
6806 = GET_MODE_PRECISION (SCALAR_TYPE_MODE (scalar_type));
6807 cr_index_scalar_type = make_unsigned_type (scalar_precision);
6808 cr_index_vector_type = build_vector_type (cr_index_scalar_type,
6809 nunits_out);
6811 if (direct_internal_fn_supported_p (IFN_REDUC_MAX, cr_index_vector_type,
6812 OPTIMIZE_FOR_SPEED))
6813 reduc_fn = IFN_REDUC_MAX;
6815 STMT_VINFO_REDUC_FN (reduc_info) = reduc_fn;
6817 if (reduction_type != EXTRACT_LAST_REDUCTION
6818 && (!nested_cycle || double_reduc)
6819 && reduc_fn == IFN_LAST
6820 && !nunits_out.is_constant ())
6822 if (dump_enabled_p ())
6823 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6824 "missing target support for reduction on"
6825 " variable-length vectors.\n");
6826 return false;
6829 /* For SLP reductions, see if there is a neutral value we can use. */
6830 tree neutral_op = NULL_TREE;
6831 if (slp_node)
6832 neutral_op = neutral_op_for_slp_reduction
6833 (slp_node_instance->reduc_phis, vectype_out, orig_code,
6834 REDUC_GROUP_FIRST_ELEMENT (stmt_info) != NULL);
6836 if (double_reduc && reduction_type == FOLD_LEFT_REDUCTION)
6838 /* We can't support in-order reductions of code such as this:
6840 for (int i = 0; i < n1; ++i)
6841 for (int j = 0; j < n2; ++j)
6842 l += a[j];
6844 since GCC effectively transforms the loop when vectorizing:
6846 for (int i = 0; i < n1 / VF; ++i)
6847 for (int j = 0; j < n2; ++j)
6848 for (int k = 0; k < VF; ++k)
6849 l += a[j];
6851 which is a reassociation of the original operation. */
6852 if (dump_enabled_p ())
6853 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6854 "in-order double reduction not supported.\n");
6856 return false;
6859 if (reduction_type == FOLD_LEFT_REDUCTION
6860 && slp_node
6861 && !REDUC_GROUP_FIRST_ELEMENT (stmt_info))
6863 /* We cannot use in-order reductions in this case because there is
6864 an implicit reassociation of the operations involved. */
6865 if (dump_enabled_p ())
6866 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6867 "in-order unchained SLP reductions not supported.\n");
6868 return false;
6871 /* For double reductions, and for SLP reductions with a neutral value,
6872 we construct a variable-length initial vector by loading a vector
6873 full of the neutral value and then shift-and-inserting the start
6874 values into the low-numbered elements. */
6875 if ((double_reduc || neutral_op)
6876 && !nunits_out.is_constant ()
6877 && !direct_internal_fn_supported_p (IFN_VEC_SHL_INSERT,
6878 vectype_out, OPTIMIZE_FOR_SPEED))
6880 if (dump_enabled_p ())
6881 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6882 "reduction on variable-length vectors requires"
6883 " target support for a vector-shift-and-insert"
6884 " operation.\n");
6885 return false;
6888 /* Check extra constraints for variable-length unchained SLP reductions. */
6889 if (STMT_SLP_TYPE (stmt_info)
6890 && !REDUC_GROUP_FIRST_ELEMENT (stmt_info)
6891 && !nunits_out.is_constant ())
6893 /* We checked above that we could build the initial vector when
6894 there's a neutral element value. Check here for the case in
6895 which each SLP statement has its own initial value and in which
6896 that value needs to be repeated for every instance of the
6897 statement within the initial vector. */
6898 unsigned int group_size = SLP_TREE_LANES (slp_node);
6899 if (!neutral_op
6900 && !can_duplicate_and_interleave_p (loop_vinfo, group_size,
6901 TREE_TYPE (vectype_out)))
6903 if (dump_enabled_p ())
6904 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6905 "unsupported form of SLP reduction for"
6906 " variable-length vectors: cannot build"
6907 " initial vector.\n");
6908 return false;
6910 /* The epilogue code relies on the number of elements being a multiple
6911 of the group size. The duplicate-and-interleave approach to setting
6912 up the initial vector does too. */
6913 if (!multiple_p (nunits_out, group_size))
6915 if (dump_enabled_p ())
6916 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6917 "unsupported form of SLP reduction for"
6918 " variable-length vectors: the vector size"
6919 " is not a multiple of the number of results.\n");
6920 return false;
6924 if (reduction_type == COND_REDUCTION)
6926 widest_int ni;
6928 if (! max_loop_iterations (loop, &ni))
6930 if (dump_enabled_p ())
6931 dump_printf_loc (MSG_NOTE, vect_location,
6932 "loop count not known, cannot create cond "
6933 "reduction.\n");
6934 return false;
6936 /* Convert backedges to iterations. */
6937 ni += 1;
6939 /* The additional index will be the same type as the condition. Check
6940 that the loop iteration count fits into this type less one (because
6941 we use up the zero slot for when there are no matches). */
6942 tree max_index = TYPE_MAX_VALUE (cr_index_scalar_type);
6943 if (wi::geu_p (ni, wi::to_widest (max_index)))
6945 if (dump_enabled_p ())
6946 dump_printf_loc (MSG_NOTE, vect_location,
6947 "loop size is greater than data size.\n");
6948 return false;
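/* Illustrative example (not from the original source): for a COND_REDUCTION
   over a 1-byte scalar type the index vector also uses 8-bit elements, so
   with the zero slot reserved for "no match" at most 254 iterations can be
   tracked; longer loops are rejected here.  */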
6952 /* In case the vectorization factor (VF) is bigger than the number
6953 of elements that we can fit in a vectype (nunits), we have to generate
6954 more than one vector stmt - i.e - we need to "unroll" the
6955 vector stmt by a factor VF/nunits. For more details see documentation
6956 in vectorizable_operation. */
6958 /* If the reduction is used in an outer loop we need to generate
6959 VF intermediate results, like so (e.g. for ncopies=2):
6960 r0 = phi (init, r0)
6961 r1 = phi (init, r1)
6962 r0 = x0 + r0;
6963 r1 = x1 + r1;
6964 (i.e. we generate VF results in 2 registers).
6965 In this case we have a separate def-use cycle for each copy, and therefore
6966 for each copy we get the vector def for the reduction variable from the
6967 respective phi node created for this copy.
6969 Otherwise (the reduction is unused in the loop nest), we can combine
6970 together intermediate results, like so (e.g. for ncopies=2):
6971 r = phi (init, r)
6972 r = x0 + r;
6973 r = x1 + r;
6974 (i.e. we generate VF/2 results in a single register).
6975 In this case for each copy we get the vector def for the reduction variable
6976 from the vectorized reduction operation generated in the previous iteration.
6978 This only works when we see both the reduction PHI and its only consumer
6979 in vectorizable_reduction and there are no intermediate stmts
6980 participating. */
6981 if (ncopies > 1
6982 && (STMT_VINFO_RELEVANT (stmt_info) <= vect_used_only_live)
6983 && reduc_chain_length == 1)
6984 single_defuse_cycle = true;
6986 if (single_defuse_cycle || lane_reduc_code_p)
6988 gcc_assert (code != COND_EXPR);
6990 /* 4. Supportable by target? */
6991 bool ok = true;
6993 /* 4.1. check support for the operation in the loop */
6994 optab optab = optab_for_tree_code (code, vectype_in, optab_vector);
6995 if (!optab)
6997 if (dump_enabled_p ())
6998 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6999 "no optab.\n");
7000 ok = false;
7003 machine_mode vec_mode = TYPE_MODE (vectype_in);
7004 if (ok && optab_handler (optab, vec_mode) == CODE_FOR_nothing)
7006 if (dump_enabled_p ())
7007 dump_printf (MSG_NOTE, "op not supported by target.\n");
7008 if (maybe_ne (GET_MODE_SIZE (vec_mode), UNITS_PER_WORD)
7009 || !vect_worthwhile_without_simd_p (loop_vinfo, code))
7010 ok = false;
7011 else
7012 if (dump_enabled_p ())
7013 dump_printf (MSG_NOTE, "proceeding using word mode.\n");
7016 /* Worthwhile without SIMD support? */
7017 if (ok
7018 && !VECTOR_MODE_P (TYPE_MODE (vectype_in))
7019 && !vect_worthwhile_without_simd_p (loop_vinfo, code))
7021 if (dump_enabled_p ())
7022 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7023 "not worthwhile without SIMD support.\n");
7024 ok = false;
7027 /* lane-reducing operations have to go through vect_transform_reduction.
7028 For the other cases try without the single cycle optimization. */
7029 if (!ok)
7031 if (lane_reduc_code_p)
7032 return false;
7033 else
7034 single_defuse_cycle = false;
7037 STMT_VINFO_FORCE_SINGLE_CYCLE (reduc_info) = single_defuse_cycle;
7039 /* If the reduction stmt is one of the patterns that have lane
7040 reduction embedded we cannot handle the case of ! single_defuse_cycle. */
7041 if ((ncopies > 1 && ! single_defuse_cycle)
7042 && lane_reduc_code_p)
7044 if (dump_enabled_p ())
7045 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7046 "multi def-use cycle not possible for lane-reducing "
7047 "reduction operation\n");
7048 return false;
7051 if (slp_node
7052 && !(!single_defuse_cycle
7053 && code != DOT_PROD_EXPR
7054 && code != WIDEN_SUM_EXPR
7055 && code != SAD_EXPR
7056 && reduction_type != FOLD_LEFT_REDUCTION))
7057 for (i = 0; i < op_type; i++)
7058 if (!vect_maybe_update_slp_op_vectype (slp_op[i], vectype_in))
7060 if (dump_enabled_p ())
7061 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7062 "incompatible vector types for invariants\n");
7063 return false;
7066 if (slp_node)
7067 vec_num = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node);
7068 else
7069 vec_num = 1;
7071 vect_model_reduction_cost (loop_vinfo, stmt_info, reduc_fn,
7072 reduction_type, ncopies, cost_vec);
7073 if (dump_enabled_p ()
7074 && reduction_type == FOLD_LEFT_REDUCTION)
7075 dump_printf_loc (MSG_NOTE, vect_location,
7076 "using an in-order (fold-left) reduction.\n");
7077 STMT_VINFO_TYPE (orig_stmt_of_analysis) = cycle_phi_info_type;
7078 /* All but single defuse-cycle optimized, lane-reducing and fold-left
7079 reductions go through their own vectorizable_* routines. */
7080 if (!single_defuse_cycle
7081 && code != DOT_PROD_EXPR
7082 && code != WIDEN_SUM_EXPR
7083 && code != SAD_EXPR
7084 && reduction_type != FOLD_LEFT_REDUCTION)
7086 stmt_vec_info tem
7087 = vect_stmt_to_vectorize (STMT_VINFO_REDUC_DEF (phi_info));
7088 if (slp_node && REDUC_GROUP_FIRST_ELEMENT (tem))
7090 gcc_assert (!REDUC_GROUP_NEXT_ELEMENT (tem));
7091 tem = REDUC_GROUP_FIRST_ELEMENT (tem);
7093 STMT_VINFO_DEF_TYPE (vect_orig_stmt (tem)) = vect_internal_def;
7094 STMT_VINFO_DEF_TYPE (tem) = vect_internal_def;
7096 else if (loop_vinfo && LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo))
7098 vec_loop_masks *masks = &LOOP_VINFO_MASKS (loop_vinfo);
7099 internal_fn cond_fn = get_conditional_internal_fn (code);
7101 if (reduction_type != FOLD_LEFT_REDUCTION
7102 && !use_mask_by_cond_expr_p (code, cond_fn, vectype_in)
7103 && (cond_fn == IFN_LAST
7104 || !direct_internal_fn_supported_p (cond_fn, vectype_in,
7105 OPTIMIZE_FOR_SPEED)))
7107 if (dump_enabled_p ())
7108 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7109 "can't operate on partial vectors because"
7110 " no conditional operation is available.\n");
7111 LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo) = false;
7113 else if (reduction_type == FOLD_LEFT_REDUCTION
7114 && reduc_fn == IFN_LAST
7115 && !expand_vec_cond_expr_p (vectype_in,
7116 truth_type_for (vectype_in),
7117 SSA_NAME))
7119 if (dump_enabled_p ())
7120 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7121 "can't operate on partial vectors because"
7122 " no conditional operation is available.\n");
7123 LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo) = false;
7125 else
7126 vect_record_loop_mask (loop_vinfo, masks, ncopies * vec_num,
7127 vectype_in, NULL);
7129 return true;
7132 /* Transform the definition stmt STMT_INFO of a reduction PHI backedge
7133 value. */
7135 bool
7136 vect_transform_reduction (loop_vec_info loop_vinfo,
7137 stmt_vec_info stmt_info, gimple_stmt_iterator *gsi,
7138 gimple **vec_stmt, slp_tree slp_node)
7140 tree vectype_out = STMT_VINFO_VECTYPE (stmt_info);
7141 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
7142 int i;
7143 int ncopies;
7144 int vec_num;
7146 stmt_vec_info reduc_info = info_for_reduction (loop_vinfo, stmt_info);
7147 gcc_assert (reduc_info->is_reduc_info);
7149 if (nested_in_vect_loop_p (loop, stmt_info))
7151 loop = loop->inner;
7152 gcc_assert (STMT_VINFO_DEF_TYPE (reduc_info) == vect_double_reduction_def);
7155 gassign *stmt = as_a <gassign *> (stmt_info->stmt);
7156 enum tree_code code = gimple_assign_rhs_code (stmt);
7157 int op_type = TREE_CODE_LENGTH (code);
7159 /* Flatten RHS. */
7160 tree ops[3];
7161 switch (get_gimple_rhs_class (code))
7163 case GIMPLE_TERNARY_RHS:
7164 ops[2] = gimple_assign_rhs3 (stmt);
7165 /* Fall thru. */
7166 case GIMPLE_BINARY_RHS:
7167 ops[0] = gimple_assign_rhs1 (stmt);
7168 ops[1] = gimple_assign_rhs2 (stmt);
7169 break;
7170 default:
7171 gcc_unreachable ();
7174 /* All uses but the last are expected to be defined in the loop.
7175 The last use is the reduction variable. In case of nested cycle this
7176 assumption is not true: we use reduc_index to record the index of the
7177 reduction variable. */
7178 stmt_vec_info phi_info = STMT_VINFO_REDUC_DEF (vect_orig_stmt (stmt_info));
7179 gphi *reduc_def_phi = as_a <gphi *> (phi_info->stmt);
7180 int reduc_index = STMT_VINFO_REDUC_IDX (stmt_info);
7181 tree vectype_in = STMT_VINFO_REDUC_VECTYPE_IN (reduc_info);
7183 if (slp_node)
7185 ncopies = 1;
7186 vec_num = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node);
7188 else
7190 ncopies = vect_get_num_copies (loop_vinfo, vectype_in);
7191 vec_num = 1;
7194 internal_fn cond_fn = get_conditional_internal_fn (code);
7195 vec_loop_masks *masks = &LOOP_VINFO_MASKS (loop_vinfo);
7196 bool mask_by_cond_expr = use_mask_by_cond_expr_p (code, cond_fn, vectype_in);
7198 /* Transform. */
7199 tree new_temp = NULL_TREE;
7200 auto_vec<tree> vec_oprnds0;
7201 auto_vec<tree> vec_oprnds1;
7202 auto_vec<tree> vec_oprnds2;
7203 tree def0;
7205 if (dump_enabled_p ())
7206 dump_printf_loc (MSG_NOTE, vect_location, "transform reduction.\n");
7208 /* FORNOW: Multiple types are not supported for condition. */
7209 if (code == COND_EXPR)
7210 gcc_assert (ncopies == 1);
7212 bool masked_loop_p = LOOP_VINFO_FULLY_MASKED_P (loop_vinfo);
7214 vect_reduction_type reduction_type = STMT_VINFO_REDUC_TYPE (reduc_info);
7215 if (reduction_type == FOLD_LEFT_REDUCTION)
7217 internal_fn reduc_fn = STMT_VINFO_REDUC_FN (reduc_info);
7218 return vectorize_fold_left_reduction
7219 (loop_vinfo, stmt_info, gsi, vec_stmt, slp_node, reduc_def_phi, code,
7220 reduc_fn, ops, vectype_in, reduc_index, masks);
7223 bool single_defuse_cycle = STMT_VINFO_FORCE_SINGLE_CYCLE (reduc_info);
7224 gcc_assert (single_defuse_cycle
7225 || code == DOT_PROD_EXPR
7226 || code == WIDEN_SUM_EXPR
7227 || code == SAD_EXPR);
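/* Illustrative note (not from the original source): DOT_PROD_EXPR,
   WIDEN_SUM_EXPR and SAD_EXPR are the lane-reducing codes; e.g. a sum of
   absolute differences

     for (i = 0; i < N; i++)
       s += abs (a[i] - b[i]);

   can be pattern-matched to SAD_EXPR.  They combine several narrow input
   lanes into one wider accumulator lane, which is why they are always
   transformed here rather than by the generic vectorizable_* routines.  */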
7229 /* Create the destination vector */
7230 tree scalar_dest = gimple_assign_lhs (stmt);
7231 tree vec_dest = vect_create_destination_var (scalar_dest, vectype_out);
7233 vect_get_vec_defs (loop_vinfo, stmt_info, slp_node, ncopies,
7234 single_defuse_cycle && reduc_index == 0
7235 ? NULL_TREE : ops[0], &vec_oprnds0,
7236 single_defuse_cycle && reduc_index == 1
7237 ? NULL_TREE : ops[1], &vec_oprnds1,
7238 op_type == ternary_op
7239 && !(single_defuse_cycle && reduc_index == 2)
7240 ? ops[2] : NULL_TREE, &vec_oprnds2);
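/* Descriptive note (not from the original source): when a single def-use
   cycle is used, the operand at reduc_index was passed as NULL_TREE above
   and is instead fetched just once below; the defs for the remaining copies
   are pushed onto the corresponding vec_oprnds vector at the end of each
   iteration of the generation loop further down.  */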
7241 if (single_defuse_cycle)
7243 gcc_assert (!slp_node);
7244 vect_get_vec_defs_for_operand (loop_vinfo, stmt_info, 1,
7245 ops[reduc_index],
7246 reduc_index == 0 ? &vec_oprnds0
7247 : (reduc_index == 1 ? &vec_oprnds1
7248 : &vec_oprnds2));
7251 FOR_EACH_VEC_ELT (vec_oprnds0, i, def0)
7253 gimple *new_stmt;
7254 tree vop[3] = { def0, vec_oprnds1[i], NULL_TREE };
7255 if (masked_loop_p && !mask_by_cond_expr)
7257 /* Make sure that the reduction accumulator is vop[0]. */
7258 if (reduc_index == 1)
7260 gcc_assert (commutative_tree_code (code));
7261 std::swap (vop[0], vop[1]);
7263 tree mask = vect_get_loop_mask (gsi, masks, vec_num * ncopies,
7264 vectype_in, i);
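/* Descriptive note (not from the original source): this builds e.g.
   IFN_COND_ADD (MASK, ACC, X, ACC) for a PLUS_EXPR reduction, so that
   inactive lanes simply keep the accumulator value.  */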
7265 gcall *call = gimple_build_call_internal (cond_fn, 4, mask,
7266 vop[0], vop[1], vop[0]);
7267 new_temp = make_ssa_name (vec_dest, call);
7268 gimple_call_set_lhs (call, new_temp);
7269 gimple_call_set_nothrow (call, true);
7270 vect_finish_stmt_generation (loop_vinfo, stmt_info, call, gsi);
7271 new_stmt = call;
7273 else
7275 if (op_type == ternary_op)
7276 vop[2] = vec_oprnds2[i];
7278 if (masked_loop_p && mask_by_cond_expr)
7280 tree mask = vect_get_loop_mask (gsi, masks, vec_num * ncopies,
7281 vectype_in, i);
7282 build_vect_cond_expr (code, vop, mask, gsi);
7285 new_stmt = gimple_build_assign (vec_dest, code,
7286 vop[0], vop[1], vop[2]);
7287 new_temp = make_ssa_name (vec_dest, new_stmt);
7288 gimple_assign_set_lhs (new_stmt, new_temp);
7289 vect_finish_stmt_generation (loop_vinfo, stmt_info, new_stmt, gsi);
7292 if (slp_node)
7293 SLP_TREE_VEC_STMTS (slp_node).quick_push (new_stmt);
7294 else if (single_defuse_cycle
7295 && i < ncopies - 1)
7297 if (reduc_index == 0)
7298 vec_oprnds0.safe_push (gimple_get_lhs (new_stmt));
7299 else if (reduc_index == 1)
7300 vec_oprnds1.safe_push (gimple_get_lhs (new_stmt));
7301 else if (reduc_index == 2)
7302 vec_oprnds2.safe_push (gimple_get_lhs (new_stmt));
7304 else
7305 STMT_VINFO_VEC_STMTS (stmt_info).safe_push (new_stmt);
7308 if (!slp_node)
7309 *vec_stmt = STMT_VINFO_VEC_STMTS (stmt_info)[0];
7311 return true;
7314 /* Transform phase of a cycle PHI. */
7316 bool
7317 vect_transform_cycle_phi (loop_vec_info loop_vinfo,
7318 stmt_vec_info stmt_info, gimple **vec_stmt,
7319 slp_tree slp_node, slp_instance slp_node_instance)
7321 tree vectype_out = STMT_VINFO_VECTYPE (stmt_info);
7322 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
7323 int i;
7324 int ncopies;
7325 int j;
7326 bool nested_cycle = false;
7327 int vec_num;
7329 if (nested_in_vect_loop_p (loop, stmt_info))
7331 loop = loop->inner;
7332 nested_cycle = true;
7335 stmt_vec_info reduc_stmt_info = STMT_VINFO_REDUC_DEF (stmt_info);
7336 reduc_stmt_info = vect_stmt_to_vectorize (reduc_stmt_info);
7337 stmt_vec_info reduc_info = info_for_reduction (loop_vinfo, stmt_info);
7338 gcc_assert (reduc_info->is_reduc_info);
7340 if (STMT_VINFO_REDUC_TYPE (reduc_info) == EXTRACT_LAST_REDUCTION
7341 || STMT_VINFO_REDUC_TYPE (reduc_info) == FOLD_LEFT_REDUCTION)
7342 /* Leave the scalar phi in place. */
7343 return true;
7345 tree vectype_in = STMT_VINFO_REDUC_VECTYPE_IN (reduc_info);
7346 /* For a nested cycle we do not fill the above. */
7347 if (!vectype_in)
7348 vectype_in = STMT_VINFO_VECTYPE (stmt_info);
7349 gcc_assert (vectype_in);
7351 if (slp_node)
7353 /* The size vect_schedule_slp_instance computes is off for us. */
7354 vec_num = vect_get_num_vectors (LOOP_VINFO_VECT_FACTOR (loop_vinfo)
7355 * SLP_TREE_LANES (slp_node), vectype_in);
7356 ncopies = 1;
7358 else
7360 vec_num = 1;
7361 ncopies = vect_get_num_copies (loop_vinfo, vectype_in);
7364 /* Check whether we should use a single PHI node and accumulate
7365 vectors to one before the backedge. */
7366 if (STMT_VINFO_FORCE_SINGLE_CYCLE (reduc_info))
7367 ncopies = 1;
7369 /* Create the destination vector */
7370 gphi *phi = as_a <gphi *> (stmt_info->stmt);
7371 tree vec_dest = vect_create_destination_var (gimple_phi_result (phi),
7372 vectype_out);
7374 /* Get the loop-entry arguments. */
7375 tree vec_initial_def;
7376 auto_vec<tree> vec_initial_defs;
7377 if (slp_node)
7379 vec_initial_defs.reserve (vec_num);
7380 gcc_assert (slp_node == slp_node_instance->reduc_phis);
7381 stmt_vec_info first = REDUC_GROUP_FIRST_ELEMENT (reduc_stmt_info);
7382 tree neutral_op
7383 = neutral_op_for_slp_reduction (slp_node, vectype_out,
7384 STMT_VINFO_REDUC_CODE (reduc_info),
7385 first != NULL);
7386 get_initial_defs_for_reduction (loop_vinfo, slp_node_instance->reduc_phis,
7387 &vec_initial_defs, vec_num,
7388 first != NULL, neutral_op);
7390 else
7392 /* Get at the scalar def before the loop that defines the initial
7393 value of the reduction variable. */
7394 tree initial_def = PHI_ARG_DEF_FROM_EDGE (phi,
7395 loop_preheader_edge (loop));
7396 /* Optimize: if initial_def is for REDUC_MAX smaller than the base
7397 and we can't use zero for induc_val, use initial_def. Similarly
7398 for REDUC_MIN and initial_def larger than the base. */
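/* Illustrative example (not from the original source): for a MAX_EXPR
   integer-induction condition reduction with induc_val 1 and a constant
   initial value of -5, the initial value is already below induc_val, so it
   can be used directly as the vector initial value and the epilogue needs
   no separate "no match" handling.  */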
7399 if (STMT_VINFO_REDUC_TYPE (reduc_info) == INTEGER_INDUC_COND_REDUCTION)
7401 tree induc_val = STMT_VINFO_VEC_INDUC_COND_INITIAL_VAL (reduc_info);
7402 if (TREE_CODE (initial_def) == INTEGER_CST
7403 && !integer_zerop (induc_val)
7404 && ((STMT_VINFO_REDUC_CODE (reduc_info) == MAX_EXPR
7405 && tree_int_cst_lt (initial_def, induc_val))
7406 || (STMT_VINFO_REDUC_CODE (reduc_info) == MIN_EXPR
7407 && tree_int_cst_lt (induc_val, initial_def))))
7409 induc_val = initial_def;
7410 /* Communicate to epilogue generation that we used
7411 the initial_def. */
7412 STMT_VINFO_VEC_INDUC_COND_INITIAL_VAL (reduc_info) = NULL_TREE;
7414 vec_initial_def = build_vector_from_val (vectype_out, induc_val);
7415 vec_initial_defs.create (ncopies);
7416 for (i = 0; i < ncopies; ++i)
7417 vec_initial_defs.quick_push (vec_initial_def);
7419 else if (nested_cycle)
7421 /* Do not use an adjustment def as that case is not supported
7422 correctly if ncopies is not one. */
7423 vect_get_vec_defs_for_operand (loop_vinfo, reduc_stmt_info,
7424 ncopies, initial_def,
7425 &vec_initial_defs);
7427 else
7429 tree adjustment_def = NULL_TREE;
7430 tree *adjustment_defp = &adjustment_def;
7431 enum tree_code code = STMT_VINFO_REDUC_CODE (reduc_info);
7432 if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_double_reduction_def)
7433 adjustment_defp = NULL;
7434 vec_initial_def
7435 = get_initial_def_for_reduction (loop_vinfo, reduc_stmt_info, code,
7436 initial_def, adjustment_defp);
7437 STMT_VINFO_REDUC_EPILOGUE_ADJUSTMENT (reduc_info) = adjustment_def;
7438 vec_initial_defs.create (ncopies);
7439 for (i = 0; i < ncopies; ++i)
7440 vec_initial_defs.quick_push (vec_initial_def);
7444 /* Generate the reduction PHIs upfront. */
7445 for (i = 0; i < vec_num; i++)
7447 tree vec_init_def = vec_initial_defs[i];
7448 for (j = 0; j < ncopies; j++)
7450 /* Create the reduction-phi that defines the reduction
7451 operand. */
7452 gphi *new_phi = create_phi_node (vec_dest, loop->header);
7454 /* Set the loop-entry arg of the reduction-phi. */
7455 if (j != 0 && nested_cycle)
7456 vec_init_def = vec_initial_defs[j];
7457 add_phi_arg (new_phi, vec_init_def, loop_preheader_edge (loop),
7458 UNKNOWN_LOCATION);
7460 /* The loop-latch arg is set in epilogue processing. */
7462 if (slp_node)
7463 SLP_TREE_VEC_STMTS (slp_node).quick_push (new_phi);
7464 else
7466 if (j == 0)
7467 *vec_stmt = new_phi;
7468 STMT_VINFO_VEC_STMTS (stmt_info).safe_push (new_phi);
7473 return true;
7476 /* Vectorizes LC PHIs. */
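/* Illustrative note (not from the original source): an LC (loop-closed)
   PHI is a single-argument PHI created by loop-closed SSA form on a loop
   exit edge, e.g.

     x_2 = PHI <x_1(exit_edge)>

   and vectorizing it amounts to creating the analogous single-argument
   PHI for each vector def of its argument.  */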
7478 bool
7479 vectorizable_lc_phi (loop_vec_info loop_vinfo,
7480 stmt_vec_info stmt_info, gimple **vec_stmt,
7481 slp_tree slp_node)
7483 if (!loop_vinfo
7484 || !is_a <gphi *> (stmt_info->stmt)
7485 || gimple_phi_num_args (stmt_info->stmt) != 1)
7486 return false;
7488 if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_internal_def
7489 && STMT_VINFO_DEF_TYPE (stmt_info) != vect_double_reduction_def)
7490 return false;
7492 if (!vec_stmt) /* transformation not required. */
7494 STMT_VINFO_TYPE (stmt_info) = lc_phi_info_type;
7495 return true;
7498 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
7499 tree scalar_dest = gimple_phi_result (stmt_info->stmt);
7500 basic_block bb = gimple_bb (stmt_info->stmt);
7501 edge e = single_pred_edge (bb);
7502 tree vec_dest = vect_create_destination_var (scalar_dest, vectype);
7503 auto_vec<tree> vec_oprnds;
7504 vect_get_vec_defs (loop_vinfo, stmt_info, slp_node,
7505 !slp_node ? vect_get_num_copies (loop_vinfo, vectype) : 1,
7506 gimple_phi_arg_def (stmt_info->stmt, 0), &vec_oprnds);
7507 for (unsigned i = 0; i < vec_oprnds.length (); i++)
7509 /* Create the vectorized LC PHI node. */
7510 gphi *new_phi = create_phi_node (vec_dest, bb);
7511 add_phi_arg (new_phi, vec_oprnds[i], e, UNKNOWN_LOCATION);
7512 if (slp_node)
7513 SLP_TREE_VEC_STMTS (slp_node).quick_push (new_phi);
7514 else
7515 STMT_VINFO_VEC_STMTS (stmt_info).safe_push (new_phi);
7517 if (!slp_node)
7518 *vec_stmt = STMT_VINFO_VEC_STMTS (stmt_info)[0];
7520 return true;
7524 /* Function vect_min_worthwhile_factor.
7526 For a loop where we could vectorize the operation indicated by CODE,
7527 return the minimum vectorization factor that makes it worthwhile
7528 to use generic vectors. */
7529 static unsigned int
7530 vect_min_worthwhile_factor (enum tree_code code)
7532 switch (code)
7534 case PLUS_EXPR:
7535 case MINUS_EXPR:
7536 case NEGATE_EXPR:
7537 return 4;
7539 case BIT_AND_EXPR:
7540 case BIT_IOR_EXPR:
7541 case BIT_XOR_EXPR:
7542 case BIT_NOT_EXPR:
7543 return 2;
7545 default:
7546 return INT_MAX;
7550 /* Return true if VINFO indicates we are doing loop vectorization and if
7551 it is worth decomposing CODE operations into scalar operations for
7552 that loop's vectorization factor. */
7554 bool
7555 vect_worthwhile_without_simd_p (vec_info *vinfo, tree_code code)
7557 loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo);
7558 unsigned HOST_WIDE_INT value;
7559 return (loop_vinfo
7560 && LOOP_VINFO_VECT_FACTOR (loop_vinfo).is_constant (&value)
7561 && value >= vect_min_worthwhile_factor (code));
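/* Illustrative example (not from the original source): with a constant
   vectorization factor of 4 this allows emulating PLUS_EXPR (threshold 4)
   as well as the bitwise codes (threshold 2) in word mode, whereas a factor
   of 2 would only allow the bitwise codes.  */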
7564 /* Function vectorizable_induction
7566 Check if STMT_INFO performs an induction computation that can be vectorized.
7567 If VEC_STMT is also passed, vectorize the induction PHI: create a vectorized
7568 phi to replace it, put it in VEC_STMT, and add it to the same basic block.
7569 Return true if STMT_INFO is vectorizable in this way. */
7571 bool
7572 vectorizable_induction (loop_vec_info loop_vinfo,
7573 stmt_vec_info stmt_info,
7574 gimple **vec_stmt, slp_tree slp_node,
7575 stmt_vector_for_cost *cost_vec)
7577 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
7578 unsigned ncopies;
7579 bool nested_in_vect_loop = false;
7580 class loop *iv_loop;
7581 tree vec_def;
7582 edge pe = loop_preheader_edge (loop);
7583 basic_block new_bb;
7584 tree new_vec, vec_init, vec_step, t;
7585 tree new_name;
7586 gimple *new_stmt;
7587 gphi *induction_phi;
7588 tree induc_def, vec_dest;
7589 tree init_expr, step_expr;
7590 poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
7591 unsigned i;
7592 tree expr;
7593 gimple_seq stmts;
7594 gimple_stmt_iterator si;
7596 gphi *phi = dyn_cast <gphi *> (stmt_info->stmt);
7597 if (!phi)
7598 return false;
7600 if (!STMT_VINFO_RELEVANT_P (stmt_info))
7601 return false;
7603 /* Make sure it was recognized as induction computation. */
7604 if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_induction_def)
7605 return false;
7607 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
7608 poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (vectype);
7610 if (slp_node)
7611 ncopies = 1;
7612 else
7613 ncopies = vect_get_num_copies (loop_vinfo, vectype);
7614 gcc_assert (ncopies >= 1);
7616 /* FORNOW. These restrictions should be relaxed. */
7617 if (nested_in_vect_loop_p (loop, stmt_info))
7619 imm_use_iterator imm_iter;
7620 use_operand_p use_p;
7621 gimple *exit_phi;
7622 edge latch_e;
7623 tree loop_arg;
7625 if (ncopies > 1)
7627 if (dump_enabled_p ())
7628 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7629 "multiple types in nested loop.\n");
7630 return false;
7633 /* FORNOW: outer loop induction with SLP not supported. */
7634 if (STMT_SLP_TYPE (stmt_info))
7635 return false;
7637 exit_phi = NULL;
7638 latch_e = loop_latch_edge (loop->inner);
7639 loop_arg = PHI_ARG_DEF_FROM_EDGE (phi, latch_e);
7640 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, loop_arg)
7642 gimple *use_stmt = USE_STMT (use_p);
7643 if (is_gimple_debug (use_stmt))
7644 continue;
7646 if (!flow_bb_inside_loop_p (loop->inner, gimple_bb (use_stmt)))
7648 exit_phi = use_stmt;
7649 break;
7652 if (exit_phi)
7654 stmt_vec_info exit_phi_vinfo = loop_vinfo->lookup_stmt (exit_phi);
7655 if (!(STMT_VINFO_RELEVANT_P (exit_phi_vinfo)
7656 && !STMT_VINFO_LIVE_P (exit_phi_vinfo)))
7658 if (dump_enabled_p ())
7659 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7660 "inner-loop induction only used outside "
7661 "of the outer vectorized loop.\n");
7662 return false;
7666 nested_in_vect_loop = true;
7667 iv_loop = loop->inner;
7669 else
7670 iv_loop = loop;
7671 gcc_assert (iv_loop == (gimple_bb (phi))->loop_father);
7673 if (slp_node && !nunits.is_constant ())
7675 /* The current SLP code creates the initial value element-by-element. */
7676 if (dump_enabled_p ())
7677 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7678 "SLP induction not supported for variable-length"
7679 " vectors.\n");
7680 return false;
7683 if (!vec_stmt) /* transformation not required. */
7685 STMT_VINFO_TYPE (stmt_info) = induc_vec_info_type;
7686 DUMP_VECT_SCOPE ("vectorizable_induction");
7687 vect_model_induction_cost (stmt_info, ncopies, cost_vec);
7688 return true;
7691 /* Transform. */
7693 /* Compute a vector variable, initialized with the first VF values of
7694 the induction variable. E.g., for an iv with IV_PHI='X' and
7695 evolution S, for a vector of 4 units, we want to compute:
7696 [X, X + S, X + 2*S, X + 3*S]. */
7698 if (dump_enabled_p ())
7699 dump_printf_loc (MSG_NOTE, vect_location, "transform induction phi.\n");
7701 step_expr = STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_info);
7702 gcc_assert (step_expr != NULL_TREE);
7703 tree step_vectype = get_same_sized_vectype (TREE_TYPE (step_expr), vectype);
7705 pe = loop_preheader_edge (iv_loop);
7706 init_expr = PHI_ARG_DEF_FROM_EDGE (phi,
7707 loop_preheader_edge (iv_loop));
7709 stmts = NULL;
7710 if (!nested_in_vect_loop)
7712 /* Convert the initial value to the IV update type. */
7713 tree new_type = TREE_TYPE (step_expr);
7714 init_expr = gimple_convert (&stmts, new_type, init_expr);
7716 /* If we are using the loop mask to "peel" for alignment then we need
7717 to adjust the start value here. */
7718 tree skip_niters = LOOP_VINFO_MASK_SKIP_NITERS (loop_vinfo);
7719 if (skip_niters != NULL_TREE)
7721 if (FLOAT_TYPE_P (vectype))
7722 skip_niters = gimple_build (&stmts, FLOAT_EXPR, new_type,
7723 skip_niters);
7724 else
7725 skip_niters = gimple_convert (&stmts, new_type, skip_niters);
7726 tree skip_step = gimple_build (&stmts, MULT_EXPR, new_type,
7727 skip_niters, step_expr);
7728 init_expr = gimple_build (&stmts, MINUS_EXPR, new_type,
7729 init_expr, skip_step);
7733 if (stmts)
7735 new_bb = gsi_insert_seq_on_edge_immediate (pe, stmts);
7736 gcc_assert (!new_bb);
7739 /* Find the first insertion point in the BB. */
7740 basic_block bb = gimple_bb (phi);
7741 si = gsi_after_labels (bb);
7743 /* For SLP induction we have to generate several IVs as for example
7744 with group size 3 we need [i, i, i, i + S] [i + S, i + S, i + 2*S, i + 2*S]
7745 [i + 2*S, i + 3*S, i + 3*S, i + 3*S]. The step is the same uniform
7746 [VF*S, VF*S, VF*S, VF*S] for all. */
7747 if (slp_node)
7749 /* Enforced above. */
7750 unsigned int const_nunits = nunits.to_constant ();
7752 /* Generate [VF*S, VF*S, ... ]. */
7753 if (SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr)))
7755 expr = build_int_cst (integer_type_node, vf);
7756 expr = fold_convert (TREE_TYPE (step_expr), expr);
7758 else
7759 expr = build_int_cst (TREE_TYPE (step_expr), vf);
7760 new_name = fold_build2 (MULT_EXPR, TREE_TYPE (step_expr),
7761 expr, step_expr);
7762 if (! CONSTANT_CLASS_P (new_name))
7763 new_name = vect_init_vector (loop_vinfo, stmt_info, new_name,
7764 TREE_TYPE (step_expr), NULL);
7765 new_vec = build_vector_from_val (step_vectype, new_name);
7766 vec_step = vect_init_vector (loop_vinfo, stmt_info,
7767 new_vec, step_vectype, NULL);
7769 /* Now generate the IVs. */
7770 unsigned group_size = SLP_TREE_LANES (slp_node);
7771 unsigned nvects = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node);
7772 unsigned elts = const_nunits * nvects;
7773 /* Compute the number of distinct IVs we need. First reduce
7774 group_size if it is a multiple of const_nunits so we get
7775 one IV for a group_size of 4 but const_nunits 2. */
7776 unsigned group_sizep = group_size;
7777 if (group_sizep % const_nunits == 0)
7778 group_sizep = group_sizep / const_nunits;
7779 unsigned nivs = least_common_multiple (group_sizep,
7780 const_nunits) / const_nunits;
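/* Worked example (not from the original source): with group_size 3 and
   const_nunits 4, group_sizep stays 3 and nivs = lcm (3, 4) / 4 = 3,
   matching the three distinct vectors in the comment above; with
   group_size 4 and const_nunits 2, group_sizep becomes 2 and
   nivs = lcm (2, 2) / 2 = 1.  */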
7781 gcc_assert (elts % group_size == 0);
7782 tree elt = init_expr;
7783 unsigned ivn;
7784 for (ivn = 0; ivn < nivs; ++ivn)
7786 tree_vector_builder elts (step_vectype, const_nunits, 1);
7787 stmts = NULL;
7788 for (unsigned eltn = 0; eltn < const_nunits; ++eltn)
7790 if (ivn*const_nunits + eltn >= group_size
7791 && (ivn * const_nunits + eltn) % group_size == 0)
7792 elt = gimple_build (&stmts, PLUS_EXPR, TREE_TYPE (elt),
7793 elt, step_expr);
7794 elts.quick_push (elt);
7796 vec_init = gimple_build_vector (&stmts, &elts);
7797 vec_init = gimple_convert (&stmts, vectype, vec_init);
7798 if (stmts)
7800 new_bb = gsi_insert_seq_on_edge_immediate (pe, stmts);
7801 gcc_assert (!new_bb);
7804 /* Create the induction-phi that defines the induction-operand. */
7805 vec_dest = vect_get_new_vect_var (vectype, vect_simple_var, "vec_iv_");
7806 induction_phi = create_phi_node (vec_dest, iv_loop->header);
7807 induc_def = PHI_RESULT (induction_phi);
7809 /* Create the iv update inside the loop */
7810 gimple_seq stmts = NULL;
7811 vec_def = gimple_convert (&stmts, step_vectype, induc_def);
7812 vec_def = gimple_build (&stmts,
7813 PLUS_EXPR, step_vectype, vec_def, vec_step);
7814 vec_def = gimple_convert (&stmts, vectype, vec_def);
7815 gsi_insert_seq_before (&si, stmts, GSI_SAME_STMT);
7817 /* Set the arguments of the phi node: */
7818 add_phi_arg (induction_phi, vec_init, pe, UNKNOWN_LOCATION);
7819 add_phi_arg (induction_phi, vec_def, loop_latch_edge (iv_loop),
7820 UNKNOWN_LOCATION);
7822 SLP_TREE_VEC_STMTS (slp_node).quick_push (induction_phi);
7824 /* Fill up to the number of vectors we need for the whole group. */
7825 nivs = least_common_multiple (group_size,
7826 const_nunits) / const_nunits;
7827 for (; ivn < nivs; ++ivn)
7828 SLP_TREE_VEC_STMTS (slp_node)
7829 .quick_push (SLP_TREE_VEC_STMTS (slp_node)[0]);
7831 /* Re-use IVs when we can. */
7832 if (ivn < nvects)
7834 unsigned vfp
7835 = least_common_multiple (group_size, const_nunits) / group_size;
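/* Worked example (not from the original source): with group_size 3 and
   const_nunits 4, vfp = lcm (3, 4) / 3 = 4, i.e. each reused IV is
   advanced by four scalar steps relative to the IV it is derived from.  */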
7836 /* Generate [VF'*S, VF'*S, ... ]. */
7837 if (SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr)))
7839 expr = build_int_cst (integer_type_node, vfp);
7840 expr = fold_convert (TREE_TYPE (step_expr), expr);
7842 else
7843 expr = build_int_cst (TREE_TYPE (step_expr), vfp);
7844 new_name = fold_build2 (MULT_EXPR, TREE_TYPE (step_expr),
7845 expr, step_expr);
7846 if (! CONSTANT_CLASS_P (new_name))
7847 new_name = vect_init_vector (loop_vinfo, stmt_info, new_name,
7848 TREE_TYPE (step_expr), NULL);
7849 new_vec = build_vector_from_val (step_vectype, new_name);
7850 vec_step = vect_init_vector (loop_vinfo, stmt_info, new_vec,
7851 step_vectype, NULL);
7852 for (; ivn < nvects; ++ivn)
7854 gimple *iv = SLP_TREE_VEC_STMTS (slp_node)[ivn - nivs];
7855 tree def;
7856 if (gimple_code (iv) == GIMPLE_PHI)
7857 def = gimple_phi_result (iv);
7858 else
7859 def = gimple_assign_lhs (iv);
7860 gimple_seq stmts = NULL;
7861 def = gimple_convert (&stmts, step_vectype, def);
7862 def = gimple_build (&stmts,
7863 PLUS_EXPR, step_vectype, def, vec_step);
7864 def = gimple_convert (&stmts, vectype, def);
7865 if (gimple_code (iv) == GIMPLE_PHI)
7866 gsi_insert_seq_before (&si, stmts, GSI_SAME_STMT);
7867 else
7869 gimple_stmt_iterator tgsi = gsi_for_stmt (iv);
7870 gsi_insert_seq_after (&tgsi, stmts, GSI_CONTINUE_LINKING);
7872 SLP_TREE_VEC_STMTS (slp_node)
7873 .quick_push (SSA_NAME_DEF_STMT (def));
7877 return true;
7880 /* Create the vector that holds the initial_value of the induction. */
7881 if (nested_in_vect_loop)
7883 /* iv_loop is nested in the loop to be vectorized. init_expr had already
7884 been created during vectorization of previous stmts. We obtain it
7885 from the STMT_VINFO_VEC_STMT of the defining stmt. */
7886 auto_vec<tree> vec_inits;
7887 vect_get_vec_defs_for_operand (loop_vinfo, stmt_info, 1,
7888 init_expr, &vec_inits);
7889 vec_init = vec_inits[0];
7890 /* If the initial value is not of proper type, convert it. */
7891 if (!useless_type_conversion_p (vectype, TREE_TYPE (vec_init)))
7893 new_stmt
7894 = gimple_build_assign (vect_get_new_ssa_name (vectype,
7895 vect_simple_var,
7896 "vec_iv_"),
7897 VIEW_CONVERT_EXPR,
7898 build1 (VIEW_CONVERT_EXPR, vectype,
7899 vec_init));
7900 vec_init = gimple_assign_lhs (new_stmt);
7901 new_bb = gsi_insert_on_edge_immediate (loop_preheader_edge (iv_loop),
7902 new_stmt);
7903 gcc_assert (!new_bb);
7906 else
7908 /* iv_loop is the loop to be vectorized. Create:
7909 vec_init = [X, X+S, X+2*S, X+3*S] (S = step_expr, X = init_expr) */
7910 stmts = NULL;
7911 new_name = gimple_convert (&stmts, TREE_TYPE (step_expr), init_expr);
7913 unsigned HOST_WIDE_INT const_nunits;
7914 if (nunits.is_constant (&const_nunits))
7916 tree_vector_builder elts (step_vectype, const_nunits, 1);
7917 elts.quick_push (new_name);
7918 for (i = 1; i < const_nunits; i++)
7920 /* Create: new_name_i = new_name + step_expr */
7921 new_name = gimple_build (&stmts, PLUS_EXPR, TREE_TYPE (new_name),
7922 new_name, step_expr);
7923 elts.quick_push (new_name);
7925 /* Create a vector from [new_name_0, new_name_1, ...,
7926 new_name_nunits-1] */
7927 vec_init = gimple_build_vector (&stmts, &elts);
7929 else if (INTEGRAL_TYPE_P (TREE_TYPE (step_expr)))
7930 /* Build the initial value directly from a VEC_SERIES_EXPR. */
7931 vec_init = gimple_build (&stmts, VEC_SERIES_EXPR, step_vectype,
7932 new_name, step_expr);
7933 else
7935 /* Build:
7936 [base, base, base, ...]
7937 + (vectype) [0, 1, 2, ...] * [step, step, step, ...]. */
7938 gcc_assert (SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr)));
7939 gcc_assert (flag_associative_math);
7940 tree index = build_index_vector (step_vectype, 0, 1);
7941 tree base_vec = gimple_build_vector_from_val (&stmts, step_vectype,
7942 new_name);
7943 tree step_vec = gimple_build_vector_from_val (&stmts, step_vectype,
7944 step_expr);
7945 vec_init = gimple_build (&stmts, FLOAT_EXPR, step_vectype, index);
7946 vec_init = gimple_build (&stmts, MULT_EXPR, step_vectype,
7947 vec_init, step_vec);
7948 vec_init = gimple_build (&stmts, PLUS_EXPR, step_vectype,
7949 vec_init, base_vec);
7951 vec_init = gimple_convert (&stmts, vectype, vec_init);
7953 if (stmts)
7955 new_bb = gsi_insert_seq_on_edge_immediate (pe, stmts);
7956 gcc_assert (!new_bb);
7961 /* Create the vector that holds the step of the induction. */
7962 if (nested_in_vect_loop)
7963 /* iv_loop is nested in the loop to be vectorized. Generate:
7964 vec_step = [S, S, S, S] */
7965 new_name = step_expr;
7966 else
7968 /* iv_loop is the loop to be vectorized. Generate:
7969 vec_step = [VF*S, VF*S, VF*S, VF*S] */
7970 gimple_seq seq = NULL;
7971 if (SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr)))
7973 expr = build_int_cst (integer_type_node, vf);
7974 expr = gimple_build (&seq, FLOAT_EXPR, TREE_TYPE (step_expr), expr);
7976 else
7977 expr = build_int_cst (TREE_TYPE (step_expr), vf);
7978 new_name = gimple_build (&seq, MULT_EXPR, TREE_TYPE (step_expr),
7979 expr, step_expr);
7980 if (seq)
7982 new_bb = gsi_insert_seq_on_edge_immediate (pe, seq);
7983 gcc_assert (!new_bb);
7987 t = unshare_expr (new_name);
7988 gcc_assert (CONSTANT_CLASS_P (new_name)
7989 || TREE_CODE (new_name) == SSA_NAME);
7990 new_vec = build_vector_from_val (step_vectype, t);
7991 vec_step = vect_init_vector (loop_vinfo, stmt_info,
7992 new_vec, step_vectype, NULL);
7995 /* Create the following def-use cycle:
7996 loop prolog:
7997 vec_init = ...
7998 vec_step = ...
7999 loop:
8000 vec_iv = PHI <vec_init, vec_loop>
8002 STMT
8004 vec_loop = vec_iv + vec_step; */
8006 /* Create the induction-phi that defines the induction-operand. */
8007 vec_dest = vect_get_new_vect_var (vectype, vect_simple_var, "vec_iv_");
8008 induction_phi = create_phi_node (vec_dest, iv_loop->header);
8009 induc_def = PHI_RESULT (induction_phi);
8011 /* Create the iv update inside the loop */
8012 stmts = NULL;
8013 vec_def = gimple_convert (&stmts, step_vectype, induc_def);
8014 vec_def = gimple_build (&stmts, PLUS_EXPR, step_vectype, vec_def, vec_step);
8015 vec_def = gimple_convert (&stmts, vectype, vec_def);
8016 gsi_insert_seq_before (&si, stmts, GSI_SAME_STMT);
8017 new_stmt = SSA_NAME_DEF_STMT (vec_def);
8019 /* Set the arguments of the phi node: */
8020 add_phi_arg (induction_phi, vec_init, pe, UNKNOWN_LOCATION);
8021 add_phi_arg (induction_phi, vec_def, loop_latch_edge (iv_loop),
8022 UNKNOWN_LOCATION);
8024 STMT_VINFO_VEC_STMTS (stmt_info).safe_push (induction_phi);
8025 *vec_stmt = induction_phi;
8027 /* In case that vectorization factor (VF) is bigger than the number
8028 of elements that we can fit in a vectype (nunits), we have to generate
8029 more than one vector stmt - i.e - we need to "unroll" the
8030 vector stmt by a factor VF/nunits. For more details see documentation
8031 in vectorizable_operation. */
8033 if (ncopies > 1)
8035 gimple_seq seq = NULL;
8036 /* FORNOW. This restriction should be relaxed. */
8037 gcc_assert (!nested_in_vect_loop);
8039 /* Create the vector that holds the step of the induction. */
8040 if (SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr)))
8042 expr = build_int_cst (integer_type_node, nunits);
8043 expr = gimple_build (&seq, FLOAT_EXPR, TREE_TYPE (step_expr), expr);
8045 else
8046 expr = build_int_cst (TREE_TYPE (step_expr), nunits);
8047 new_name = gimple_build (&seq, MULT_EXPR, TREE_TYPE (step_expr),
8048 expr, step_expr);
8049 if (seq)
8051 new_bb = gsi_insert_seq_on_edge_immediate (pe, seq);
8052 gcc_assert (!new_bb);
8055 t = unshare_expr (new_name);
8056 gcc_assert (CONSTANT_CLASS_P (new_name)
8057 || TREE_CODE (new_name) == SSA_NAME);
8058 new_vec = build_vector_from_val (step_vectype, t);
8059 vec_step = vect_init_vector (loop_vinfo, stmt_info,
8060 new_vec, step_vectype, NULL);
8062 vec_def = induc_def;
8063 for (i = 1; i < ncopies; i++)
8065 /* vec_i = vec_prev + vec_step */
8066 gimple_seq stmts = NULL;
8067 vec_def = gimple_convert (&stmts, step_vectype, vec_def);
8068 vec_def = gimple_build (&stmts,
8069 PLUS_EXPR, step_vectype, vec_def, vec_step);
8070 vec_def = gimple_convert (&stmts, vectype, vec_def);
8072 gsi_insert_seq_before (&si, stmts, GSI_SAME_STMT);
8073 new_stmt = SSA_NAME_DEF_STMT (vec_def);
8074 STMT_VINFO_VEC_STMTS (stmt_info).safe_push (new_stmt);
8078 if (dump_enabled_p ())
8079 dump_printf_loc (MSG_NOTE, vect_location,
8080 "transform induction: created def-use cycle: %G%G",
8081 induction_phi, SSA_NAME_DEF_STMT (vec_def));
8083 return true;
8086 /* Function vectorizable_live_operation.
8088 STMT_INFO computes a value that is used outside the loop. Check if
8089 it can be supported. */
8091 bool
8092 vectorizable_live_operation (vec_info *vinfo,
8093 stmt_vec_info stmt_info,
8094 gimple_stmt_iterator *gsi,
8095 slp_tree slp_node, slp_instance slp_node_instance,
8096 int slp_index, bool vec_stmt_p,
8097 stmt_vector_for_cost *cost_vec)
8099 loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo);
8100 imm_use_iterator imm_iter;
8101 tree lhs, lhs_type, bitsize, vec_bitsize;
8102 tree vectype = (slp_node
8103 ? SLP_TREE_VECTYPE (slp_node)
8104 : STMT_VINFO_VECTYPE (stmt_info));
8105 poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (vectype);
8106 int ncopies;
8107 gimple *use_stmt;
8108 auto_vec<tree> vec_oprnds;
8109 int vec_entry = 0;
8110 poly_uint64 vec_index = 0;
8112 gcc_assert (STMT_VINFO_LIVE_P (stmt_info));
8114 /* If a stmt of a reduction is live, vectorize it via
8115 vect_create_epilog_for_reduction. vectorizable_reduction assessed
8116 validity so just trigger the transform here. */
8117 if (STMT_VINFO_REDUC_DEF (vect_orig_stmt (stmt_info)))
8119 if (!vec_stmt_p)
8120 return true;
8121 if (slp_node)
8123 /* For reduction chains the meta-info is attached to
8124 the group leader. */
8125 if (REDUC_GROUP_FIRST_ELEMENT (stmt_info))
8126 stmt_info = REDUC_GROUP_FIRST_ELEMENT (stmt_info);
8127 /* For SLP reductions we vectorize the epilogue for
8128 all involved stmts together. */
8129 else if (slp_index != 0)
8130 return true;
8131 else
8132 /* For SLP reductions the meta-info is attached to
8133 the representative. */
8134 stmt_info = SLP_TREE_REPRESENTATIVE (slp_node);
8136 stmt_vec_info reduc_info = info_for_reduction (loop_vinfo, stmt_info);
8137 gcc_assert (reduc_info->is_reduc_info);
8138 if (STMT_VINFO_REDUC_TYPE (reduc_info) == FOLD_LEFT_REDUCTION
8139 || STMT_VINFO_REDUC_TYPE (reduc_info) == EXTRACT_LAST_REDUCTION)
8140 return true;
8141 vect_create_epilog_for_reduction (loop_vinfo, stmt_info, slp_node,
8142 slp_node_instance);
8143 return true;
8146 /* If STMT is not relevant and it is a simple assignment and its inputs are
8147 invariant then it can remain in place, unvectorized. The original last
8148 scalar value that it computes will be used. */
8149 if (!STMT_VINFO_RELEVANT_P (stmt_info))
8151 gcc_assert (is_simple_and_all_uses_invariant (stmt_info, loop_vinfo));
8152 if (dump_enabled_p ())
8153 dump_printf_loc (MSG_NOTE, vect_location,
8154 "statement is simple and uses invariant. Leaving in "
8155 "place.\n");
8156 return true;
8159 if (slp_node)
8160 ncopies = 1;
8161 else
8162 ncopies = vect_get_num_copies (loop_vinfo, vectype);
8164 if (slp_node)
8166 gcc_assert (slp_index >= 0);
8168 /* Get the last occurrence of the scalar index from the concatenation of
8169 all the slp vectors. Calculate which slp vector it is and the index
8170 within. */
8171 int num_scalar = SLP_TREE_LANES (slp_node);
8172 int num_vec = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node);
8173 poly_uint64 pos = (num_vec * nunits) - num_scalar + slp_index;
8175 /* Calculate which vector contains the result, and which lane of
8176 that vector we need. */
8177 if (!can_div_trunc_p (pos, nunits, &vec_entry, &vec_index))
8179 if (dump_enabled_p ())
8180 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
8181 "Cannot determine which vector holds the"
8182 " final result.\n");
8183 return false;
8187 if (!vec_stmt_p)
8189 /* No transformation required. */
8190 if (loop_vinfo && LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo))
8192 if (!direct_internal_fn_supported_p (IFN_EXTRACT_LAST, vectype,
8193 OPTIMIZE_FOR_SPEED))
8195 if (dump_enabled_p ())
8196 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
8197 "can't operate on partial vectors "
8198 "because the target doesn't support extract "
8199 "last reduction.\n");
8200 LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo) = false;
8202 else if (slp_node)
8204 if (dump_enabled_p ())
8205 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
8206 "can't operate on partial vectors "
8207 "because an SLP statement is live after "
8208 "the loop.\n");
8209 LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo) = false;
8211 else if (ncopies > 1)
8213 if (dump_enabled_p ())
8214 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
8215 "can't operate on partial vectors "
8216 "because ncopies is greater than 1.\n");
8217 LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo) = false;
8219 else
8221 gcc_assert (ncopies == 1 && !slp_node);
8222 vect_record_loop_mask (loop_vinfo,
8223 &LOOP_VINFO_MASKS (loop_vinfo),
8224 1, vectype, NULL);
8227 /* ??? Enable for loop costing as well. */
8228 if (!loop_vinfo)
8229 record_stmt_cost (cost_vec, 1, vec_to_scalar, stmt_info, NULL_TREE,
8230 0, vect_epilogue);
8231 return true;
8234 /* Use the lhs of the original scalar statement. */
8235 gimple *stmt = vect_orig_stmt (stmt_info)->stmt;
8236 if (dump_enabled_p ())
8237 dump_printf_loc (MSG_NOTE, vect_location, "extracting lane for live "
8238 "stmt %G", stmt);
8240 lhs = gimple_get_lhs (stmt);
8241 lhs_type = TREE_TYPE (lhs);
8243 bitsize = vector_element_bits_tree (vectype);
8244 vec_bitsize = TYPE_SIZE (vectype);
8246 /* Get the vectorized lhs of STMT and the lane to use (counted in bits). */
8247 tree vec_lhs, bitstart;
8248 gimple *vec_stmt;
8249 if (slp_node)
8251 gcc_assert (!loop_vinfo || !LOOP_VINFO_FULLY_MASKED_P (loop_vinfo));
8253 /* Get the correct slp vectorized stmt. */
8254 vec_stmt = SLP_TREE_VEC_STMTS (slp_node)[vec_entry];
8255 vec_lhs = gimple_get_lhs (vec_stmt);
8257 /* Get entry to use. */
8258 bitstart = bitsize_int (vec_index);
8259 bitstart = int_const_binop (MULT_EXPR, bitsize, bitstart);
8261 else
8263 /* For multiple copies, get the last copy. */
8264 vec_stmt = STMT_VINFO_VEC_STMTS (stmt_info).last ();
8265 vec_lhs = gimple_get_lhs (vec_stmt);
8267 /* Get the last lane in the vector. */
8268 bitstart = int_const_binop (MINUS_EXPR, vec_bitsize, bitsize);
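/* Worked example (not from the original source): for a V4SI vector,
   bitsize is 32 and vec_bitsize is 128, so bitstart = 96 selects the
   last element.  */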
8271 if (loop_vinfo)
8273 /* Ensure the VEC_LHS for lane extraction stmts satisfies the loop-closed
8274 PHI requirement by inserting one PHI node for it. It looks like:
8275 loop;
8277 # lhs' = PHI <lhs>
8278 ==>
8279 loop;
8281 # vec_lhs' = PHI <vec_lhs>
8282 new_tree = lane_extract <vec_lhs', ...>;
8283 lhs' = new_tree; */
8285 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
8286 basic_block exit_bb = single_exit (loop)->dest;
8287 gcc_assert (single_pred_p (exit_bb));
8289 tree vec_lhs_phi = copy_ssa_name (vec_lhs);
8290 gimple *phi = create_phi_node (vec_lhs_phi, exit_bb);
8291 SET_PHI_ARG_DEF (phi, single_exit (loop)->dest_idx, vec_lhs);
8293 gimple_seq stmts = NULL;
8294 tree new_tree;
8295 if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
8297 /* Emit:
8299 SCALAR_RES = EXTRACT_LAST <VEC_LHS, MASK>
8301 where VEC_LHS is the vectorized live-out result and MASK is
8302 the loop mask for the final iteration. */
8303 gcc_assert (ncopies == 1 && !slp_node);
8304 tree scalar_type = TREE_TYPE (STMT_VINFO_VECTYPE (stmt_info));
8305 tree mask = vect_get_loop_mask (gsi, &LOOP_VINFO_MASKS (loop_vinfo),
8306 1, vectype, 0);
8307 tree scalar_res = gimple_build (&stmts, CFN_EXTRACT_LAST, scalar_type,
8308 mask, vec_lhs_phi);
8310 /* Convert the extracted vector element to the scalar type. */
8311 new_tree = gimple_convert (&stmts, lhs_type, scalar_res);
8313 else
8315 tree bftype = TREE_TYPE (vectype);
8316 if (VECTOR_BOOLEAN_TYPE_P (vectype))
8317 bftype = build_nonstandard_integer_type (tree_to_uhwi (bitsize), 1);
8318 new_tree = build3 (BIT_FIELD_REF, bftype,
8319 vec_lhs_phi, bitsize, bitstart);
8320 new_tree = force_gimple_operand (fold_convert (lhs_type, new_tree),
8321 &stmts, true, NULL_TREE);
8324 if (stmts)
8326 gimple_stmt_iterator exit_gsi = gsi_after_labels (exit_bb);
8327 gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
8329 /* Remove the existing exit PHI for lhs and replace it with a copy assignment from new_tree. */
8330 tree lhs_phi = NULL_TREE;
8331 gimple_stmt_iterator gsi;
8332 for (gsi = gsi_start_phis (exit_bb);
8333 !gsi_end_p (gsi); gsi_next (&gsi))
8335 gimple *phi = gsi_stmt (gsi);
8336 if ((gimple_phi_arg_def (phi, 0) == lhs))
8338 remove_phi_node (&gsi, false);
8339 lhs_phi = gimple_phi_result (phi);
8340 gimple *copy = gimple_build_assign (lhs_phi, new_tree);
8341 gsi_insert_before (&exit_gsi, copy, GSI_SAME_STMT);
8342 break;
8347 /* Replace uses of lhs with the newly computed result. If the use stmt is
8348 a single-argument PHI, just replace all uses of the PHI result. This is
8349 necessary because the LC SSA PHI defining lhs may appear before the newly inserted stmt. */
8350 use_operand_p use_p;
8351 FOR_EACH_IMM_USE_STMT (use_stmt, imm_iter, lhs)
8352 if (!flow_bb_inside_loop_p (loop, gimple_bb (use_stmt))
8353 && !is_gimple_debug (use_stmt))
8355 if (gimple_code (use_stmt) == GIMPLE_PHI
8356 && gimple_phi_num_args (use_stmt) == 1)
8358 replace_uses_by (gimple_phi_result (use_stmt), new_tree);
8360 else
8362 FOR_EACH_IMM_USE_ON_STMT (use_p, imm_iter)
8363 SET_USE (use_p, new_tree);
8365 update_stmt (use_stmt);
8368 else
8370 /* For basic-block vectorization simply insert the lane-extraction. */
8371 tree bftype = TREE_TYPE (vectype);
8372 if (VECTOR_BOOLEAN_TYPE_P (vectype))
8373 bftype = build_nonstandard_integer_type (tree_to_uhwi (bitsize), 1);
8374 tree new_tree = build3 (BIT_FIELD_REF, bftype,
8375 vec_lhs, bitsize, bitstart);
8376 gimple_seq stmts = NULL;
8377 new_tree = force_gimple_operand (fold_convert (lhs_type, new_tree),
8378 &stmts, true, NULL_TREE);
8380 gsi_insert_seq_before (gsi, stmts, GSI_SAME_STMT);
8382 /* Replace uses of lhs with the newly computed result. If the use stmt is
8383 a single-argument PHI, just replace all uses of the PHI result. This is
8384 necessary because the LC SSA PHI defining lhs may appear before the newly inserted stmt. */
8385 use_operand_p use_p;
8386 stmt_vec_info use_stmt_info;
8387 FOR_EACH_IMM_USE_STMT (use_stmt, imm_iter, lhs)
8388 if (!is_gimple_debug (use_stmt)
8389 && (!(use_stmt_info = vinfo->lookup_stmt (use_stmt))
8390 || !PURE_SLP_STMT (vect_stmt_to_vectorize (use_stmt_info))))
8392 /* ??? This can happen when the live lane ends up being
8393 used in a vector construction code-generated by an
8394 external SLP node (and code-generation for that already
8395 happened). See gcc.dg/vect/bb-slp-47.c.
8396 Doing this is what would happen if that vector CTOR
8397 were not code-generated yet so it is not too bad.
8398 ??? In fact we'd likely want to avoid this situation
8399 in the first place. */
8400 if (TREE_CODE (new_tree) == SSA_NAME
8401 && !SSA_NAME_IS_DEFAULT_DEF (new_tree)
8402 && gimple_code (use_stmt) != GIMPLE_PHI
8403 && !vect_stmt_dominates_stmt_p (SSA_NAME_DEF_STMT (new_tree),
8404 use_stmt))
8406 enum tree_code code = gimple_assign_rhs_code (use_stmt);
8407 gcc_assert (code == CONSTRUCTOR
8408 || code == VIEW_CONVERT_EXPR
8409 || CONVERT_EXPR_CODE_P (code));
8410 if (dump_enabled_p ())
8411 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
8412 "Using original scalar computation for "
8413 "live lane because use preceeds vector "
8414 "def\n");
8415 continue;
8417 FOR_EACH_IMM_USE_ON_STMT (use_p, imm_iter)
8418 SET_USE (use_p, new_tree);
8419 update_stmt (use_stmt);
8423 return true;
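/* As a hypothetical illustration of the lane extraction above: for a
   V4SI vector VEC_LHS with BITSIZE 32 and BITSTART 96, the code builds
   BIT_FIELD_REF <VEC_LHS, 32, 96>, i.e. it reads the 32-bit lane at bit
   offset 96, and then converts that value to LHS_TYPE before replacing
   the remaining scalar uses.  */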
8426 /* Kill any debug uses outside LOOP of SSA names defined in STMT_INFO. */
8428 static void
8429 vect_loop_kill_debug_uses (class loop *loop, stmt_vec_info stmt_info)
8431 ssa_op_iter op_iter;
8432 imm_use_iterator imm_iter;
8433 def_operand_p def_p;
8434 gimple *ustmt;
8436 FOR_EACH_PHI_OR_STMT_DEF (def_p, stmt_info->stmt, op_iter, SSA_OP_DEF)
8438 FOR_EACH_IMM_USE_STMT (ustmt, imm_iter, DEF_FROM_PTR (def_p))
8440 basic_block bb;
8442 if (!is_gimple_debug (ustmt))
8443 continue;
8445 bb = gimple_bb (ustmt);
8447 if (!flow_bb_inside_loop_p (loop, bb))
8449 if (gimple_debug_bind_p (ustmt))
8451 if (dump_enabled_p ())
8452 dump_printf_loc (MSG_NOTE, vect_location,
8453 "killing debug use\n");
8455 gimple_debug_bind_reset_value (ustmt);
8456 update_stmt (ustmt);
8458 else
8459 gcc_unreachable ();
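/* For example, a debug bind outside LOOP such as
     # DEBUG x => i_5
   where i_5 is defined by a vectorized statement inside LOOP has its
   value reset above, so the debug info simply reports the variable as
   optimized away rather than referring to a stale scalar value.  */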
8465 /* Given loop represented by LOOP_VINFO, return true if computation of
8466 LOOP_VINFO_NITERS (= LOOP_VINFO_NITERSM1 + 1) doesn't overflow, false
8467 otherwise. */
8469 static bool
8470 loop_niters_no_overflow (loop_vec_info loop_vinfo)
8472 /* Constant case. */
8473 if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
8475 tree cst_niters = LOOP_VINFO_NITERS (loop_vinfo);
8476 tree cst_nitersm1 = LOOP_VINFO_NITERSM1 (loop_vinfo);
8478 gcc_assert (TREE_CODE (cst_niters) == INTEGER_CST);
8479 gcc_assert (TREE_CODE (cst_nitersm1) == INTEGER_CST);
8480 if (wi::to_widest (cst_nitersm1) < wi::to_widest (cst_niters))
8481 return true;
8484 widest_int max;
8485 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
8486 /* Check the upper bound of loop niters. */
8487 if (get_max_loop_iterations (loop, &max))
8489 tree type = TREE_TYPE (LOOP_VINFO_NITERS (loop_vinfo));
8490 signop sgn = TYPE_SIGN (type);
8491 widest_int type_max = widest_int::from (wi::max_value (type), sgn);
8492 if (max < type_max)
8493 return true;
8495 return false;
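/* Illustrative cases for the check above: if NITERSM1 is the constant 99
   then NITERS is 100 and 99 < 100, so there is no overflow.  If instead
   the loop's latch count can reach the maximum value of the IV type
   (say 0xffffffff for a 32-bit unsigned type), NITERS = NITERSM1 + 1 can
   wrap to zero, and the function conservatively returns false.  */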
8498 /* Return a mask type with half as many elements as OLD_TYPE,
8499 given that it should have mode NEW_MODE. */
8501 tree
8502 vect_halve_mask_nunits (tree old_type, machine_mode new_mode)
8504 poly_uint64 nunits = exact_div (TYPE_VECTOR_SUBPARTS (old_type), 2);
8505 return build_truth_vector_type_for_mode (nunits, new_mode);
8508 /* Return a mask type with twice as many elements as OLD_TYPE,
8509 given that it should have mode NEW_MODE. */
8511 tree
8512 vect_double_mask_nunits (tree old_type, machine_mode new_mode)
8514 poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (old_type) * 2;
8515 return build_truth_vector_type_for_mode (nunits, new_mode);
8518 /* Record that a fully-masked version of LOOP_VINFO would need MASKS to
8519 contain a sequence of NVECTORS masks that each control a vector of type
8520 VECTYPE. If SCALAR_MASK is nonnull, the fully-masked loop would AND
8521 these vector masks with the vector version of SCALAR_MASK. */
8523 void
8524 vect_record_loop_mask (loop_vec_info loop_vinfo, vec_loop_masks *masks,
8525 unsigned int nvectors, tree vectype, tree scalar_mask)
8527 gcc_assert (nvectors != 0);
8528 if (masks->length () < nvectors)
8529 masks->safe_grow_cleared (nvectors, true);
8530 rgroup_controls *rgm = &(*masks)[nvectors - 1];
8531 /* The number of scalars per iteration and the number of vectors are
8532 both compile-time constants. */
8533 unsigned int nscalars_per_iter
8534 = exact_div (nvectors * TYPE_VECTOR_SUBPARTS (vectype),
8535 LOOP_VINFO_VECT_FACTOR (loop_vinfo)).to_constant ();
8537 if (scalar_mask)
8539 scalar_cond_masked_key cond (scalar_mask, nvectors);
8540 loop_vinfo->scalar_cond_masked_set.add (cond);
8543 if (rgm->max_nscalars_per_iter < nscalars_per_iter)
8545 rgm->max_nscalars_per_iter = nscalars_per_iter;
8546 rgm->type = truth_type_for (vectype);
8547 rgm->factor = 1;
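/* A hypothetical example of the bookkeeping above: with a vectorization
   factor of 16 and V8HI vectors, an rgroup that accesses one short per
   scalar iteration records NVECTORS = 2 and
   NSCALARS_PER_ITER = 2 * 8 / 16 = 1, while a grouped access loading two
   shorts per scalar iteration records NVECTORS = 4 and
   NSCALARS_PER_ITER = 4 * 8 / 16 = 2.  Entry MASKS[NVECTORS - 1] keeps
   the largest NSCALARS_PER_ITER seen for that vector count.  */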
8551 /* Given a complete set of masks MASKS, extract mask number INDEX
8552 for an rgroup that operates on NVECTORS vectors of type VECTYPE,
8553 where 0 <= INDEX < NVECTORS. Insert any set-up statements before GSI.
8555 See the comment above vec_loop_masks for more details about the mask
8556 arrangement. */
8558 tree
8559 vect_get_loop_mask (gimple_stmt_iterator *gsi, vec_loop_masks *masks,
8560 unsigned int nvectors, tree vectype, unsigned int index)
8562 rgroup_controls *rgm = &(*masks)[nvectors - 1];
8563 tree mask_type = rgm->type;
8565 /* Populate the rgroup's mask array, if this is the first time we've
8566 used it. */
8567 if (rgm->controls.is_empty ())
8569 rgm->controls.safe_grow_cleared (nvectors, true);
8570 for (unsigned int i = 0; i < nvectors; ++i)
8572 tree mask = make_temp_ssa_name (mask_type, NULL, "loop_mask");
8573 /* Provide a dummy definition until the real one is available. */
8574 SSA_NAME_DEF_STMT (mask) = gimple_build_nop ();
8575 rgm->controls[i] = mask;
8579 tree mask = rgm->controls[index];
8580 if (maybe_ne (TYPE_VECTOR_SUBPARTS (mask_type),
8581 TYPE_VECTOR_SUBPARTS (vectype)))
8583 /* A loop mask for data type X can be reused for data type Y
8584 if X has N times more elements than Y and if Y's elements
8585 are N times bigger than X's. In this case each sequence
8586 of N elements in the loop mask will be all-zero or all-one.
8587 We can then view-convert the mask so that each sequence of
8588 N elements is replaced by a single element. */
8589 gcc_assert (multiple_p (TYPE_VECTOR_SUBPARTS (mask_type),
8590 TYPE_VECTOR_SUBPARTS (vectype)));
8591 gimple_seq seq = NULL;
8592 mask_type = truth_type_for (vectype);
8593 mask = gimple_build (&seq, VIEW_CONVERT_EXPR, mask_type, mask);
8594 if (seq)
8595 gsi_insert_seq_before (gsi, seq, GSI_SAME_STMT);
8597 return mask;
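/* A concrete (hypothetical) instance of the reuse case above: a mask
   recorded for sixteen QImode elements can control a V8HImode vector,
   because each adjacent pair of mask elements is known to be all-zero or
   all-one; the VIEW_CONVERT_EXPR simply reinterprets the 16-element mask
   as the 8-element mask type returned by truth_type_for.  */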
8600 /* Record that LOOP_VINFO would need LENS to contain a sequence of NVECTORS
8601 lengths for controlling an operation on VECTYPE. The operation splits
8602 each element of VECTYPE into FACTOR separate subelements, measuring the
8603 length as a number of these subelements. */
8605 void
8606 vect_record_loop_len (loop_vec_info loop_vinfo, vec_loop_lens *lens,
8607 unsigned int nvectors, tree vectype, unsigned int factor)
8609 gcc_assert (nvectors != 0);
8610 if (lens->length () < nvectors)
8611 lens->safe_grow_cleared (nvectors, true);
8612 rgroup_controls *rgl = &(*lens)[nvectors - 1];
8614 /* The number of scalars per iteration, the bytes each scalar occupies
8615 and the number of vectors are all compile-time constants. */
8616 unsigned int nscalars_per_iter
8617 = exact_div (nvectors * TYPE_VECTOR_SUBPARTS (vectype),
8618 LOOP_VINFO_VECT_FACTOR (loop_vinfo)).to_constant ();
8620 if (rgl->max_nscalars_per_iter < nscalars_per_iter)
8622 /* For now, we only support cases in which all loads and stores fall back
8623 to VnQI or none do. */
8624 gcc_assert (!rgl->max_nscalars_per_iter
8625 || (rgl->factor == 1 && factor == 1)
8626 || (rgl->max_nscalars_per_iter * rgl->factor
8627 == nscalars_per_iter * factor));
8628 rgl->max_nscalars_per_iter = nscalars_per_iter;
8629 rgl->type = vectype;
8630 rgl->factor = factor;
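/* For instance (hypothetically), a V4SI operation that has to fall back
   to byte-granular VnQI lengths would be recorded with FACTOR = 4, so the
   length counts QI subelements rather than SI elements; the assert above
   enforces that, for a given NVECTORS, either every operation uses
   FACTOR = 1 or the products NSCALARS_PER_ITER * FACTOR all agree.  */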
8634 /* Given a complete set of lengths LENS, extract length number INDEX for an
8635 rgroup that operates on NVECTORS vectors, where 0 <= INDEX < NVECTORS. */
8637 tree
8638 vect_get_loop_len (loop_vec_info loop_vinfo, vec_loop_lens *lens,
8639 unsigned int nvectors, unsigned int index)
8641 rgroup_controls *rgl = &(*lens)[nvectors - 1];
8643 /* Populate the rgroup's len array, if this is the first time we've
8644 used it. */
8645 if (rgl->controls.is_empty ())
8647 rgl->controls.safe_grow_cleared (nvectors, true);
8648 for (unsigned int i = 0; i < nvectors; ++i)
8650 tree len_type = LOOP_VINFO_RGROUP_COMPARE_TYPE (loop_vinfo);
8651 gcc_assert (len_type != NULL_TREE);
8652 tree len = make_temp_ssa_name (len_type, NULL, "loop_len");
8654 /* Provide a dummy definition until the real one is available. */
8655 SSA_NAME_DEF_STMT (len) = gimple_build_nop ();
8656 rgl->controls[i] = len;
8660 return rgl->controls[index];
8663 /* Scale profiling counters by estimation for LOOP which is vectorized
8664 by factor VF. */
8666 static void
8667 scale_profile_for_vect_loop (class loop *loop, unsigned vf)
8669 edge preheader = loop_preheader_edge (loop);
8670 /* Reduce loop iterations by the vectorization factor. */
8671 gcov_type new_est_niter = niter_for_unrolled_loop (loop, vf);
8672 profile_count freq_h = loop->header->count, freq_e = preheader->count ();
8674 if (freq_h.nonzero_p ())
8676 profile_probability p;
8678 /* Avoid dropping loop body profile counter to 0 because of zero count
8679 in loop's preheader. */
8680 if (!(freq_e == profile_count::zero ()))
8681 freq_e = freq_e.force_nonzero ();
8682 p = freq_e.apply_scale (new_est_niter + 1, 1).probability_in (freq_h);
8683 scale_loop_frequencies (loop, p);
8686 edge exit_e = single_exit (loop);
8687 exit_e->probability = profile_probability::always ()
8688 .apply_scale (1, new_est_niter + 1);
8690 edge exit_l = single_pred_edge (loop->latch);
8691 profile_probability prob = exit_l->probability;
8692 exit_l->probability = exit_e->probability.invert ();
8693 if (prob.initialized_p () && exit_l->probability.initialized_p ())
8694 scale_bbs_frequencies (&loop->latch, 1, exit_l->probability / prob);
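/* As a rough illustration: if the profile estimates about 100 loop
   iterations and VF is 4, NEW_EST_NITER is about 25, the exit edge
   probability becomes 1/(25 + 1) and the latch edge probability its
   inverse, with the body frequencies scaled to match the (roughly four
   times smaller) expected trip count.  */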
8697 /* For a vectorized stmt DEF_STMT_INFO adjust all vectorized PHI
8698 latch edge values originally defined by it. */
8700 static void
8701 maybe_set_vectorized_backedge_value (loop_vec_info loop_vinfo,
8702 stmt_vec_info def_stmt_info)
8704 tree def = gimple_get_lhs (vect_orig_stmt (def_stmt_info)->stmt);
8705 if (!def || TREE_CODE (def) != SSA_NAME)
8706 return;
8707 stmt_vec_info phi_info;
8708 imm_use_iterator iter;
8709 use_operand_p use_p;
8710 FOR_EACH_IMM_USE_FAST (use_p, iter, def)
8711 if (gphi *phi = dyn_cast <gphi *> (USE_STMT (use_p)))
8712 if (gimple_bb (phi)->loop_father->header == gimple_bb (phi)
8713 && (phi_info = loop_vinfo->lookup_stmt (phi))
8714 && VECTORIZABLE_CYCLE_DEF (STMT_VINFO_DEF_TYPE (phi_info))
8715 && STMT_VINFO_REDUC_TYPE (phi_info) != FOLD_LEFT_REDUCTION
8716 && STMT_VINFO_REDUC_TYPE (phi_info) != EXTRACT_LAST_REDUCTION)
8718 loop_p loop = gimple_bb (phi)->loop_father;
8719 edge e = loop_latch_edge (loop);
8720 if (PHI_ARG_DEF_FROM_EDGE (phi, e) == def)
8722 vec<gimple *> &phi_defs = STMT_VINFO_VEC_STMTS (phi_info);
8723 vec<gimple *> &latch_defs = STMT_VINFO_VEC_STMTS (def_stmt_info);
8724 gcc_assert (phi_defs.length () == latch_defs.length ());
8725 for (unsigned i = 0; i < phi_defs.length (); ++i)
8726 add_phi_arg (as_a <gphi *> (phi_defs[i]),
8727 gimple_get_lhs (latch_defs[i]), e,
8728 gimple_phi_arg_location (phi, e->dest_idx));
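/* For example, if DEF_STMT_INFO was vectorized into two copies for a
   reduction whose PHI was likewise vectorized into two PHIs, the loop
   above adds the i-th vectorized latch definition as the latch argument
   of the i-th vectorized PHI, reusing the location of the original
   scalar latch argument.  */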
8733 /* Vectorize STMT_INFO if relevant, inserting any new instructions before GSI.
8734 When vectorizing STMT_INFO as a store, set *SEEN_STORE to its
8735 stmt_vec_info. */
8737 static void
8738 vect_transform_loop_stmt (loop_vec_info loop_vinfo, stmt_vec_info stmt_info,
8739 gimple_stmt_iterator *gsi, stmt_vec_info *seen_store)
8741 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
8742 poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
8744 if (dump_enabled_p ())
8745 dump_printf_loc (MSG_NOTE, vect_location,
8746 "------>vectorizing statement: %G", stmt_info->stmt);
8748 if (MAY_HAVE_DEBUG_BIND_STMTS && !STMT_VINFO_LIVE_P (stmt_info))
8749 vect_loop_kill_debug_uses (loop, stmt_info);
8751 if (!STMT_VINFO_RELEVANT_P (stmt_info)
8752 && !STMT_VINFO_LIVE_P (stmt_info))
8753 return;
8755 if (STMT_VINFO_VECTYPE (stmt_info))
8757 poly_uint64 nunits
8758 = TYPE_VECTOR_SUBPARTS (STMT_VINFO_VECTYPE (stmt_info));
8759 if (!STMT_SLP_TYPE (stmt_info)
8760 && maybe_ne (nunits, vf)
8761 && dump_enabled_p ())
8762 /* For SLP, VF is set according to the unrolling factor, and not
8763 to the vector size, hence for SLP this print is not valid. */
8764 dump_printf_loc (MSG_NOTE, vect_location, "multiple-types.\n");
8767 /* Pure SLP statements have already been vectorized. We still need
8768 to apply loop vectorization to hybrid SLP statements. */
8769 if (PURE_SLP_STMT (stmt_info))
8770 return;
8772 if (dump_enabled_p ())
8773 dump_printf_loc (MSG_NOTE, vect_location, "transform statement.\n");
8775 if (vect_transform_stmt (loop_vinfo, stmt_info, gsi, NULL, NULL))
8776 *seen_store = stmt_info;
8779 /* Helper function to pass to simplify_replace_tree to enable replacing tree's
8780 in the hash_map with its corresponding values. */
8782 static tree
8783 find_in_mapping (tree t, void *context)
8785 hash_map<tree,tree>* mapping = (hash_map<tree, tree>*) context;
8787 tree *value = mapping->get (t);
8788 return value ? *value : t;
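/* A sketch of how this callback is used below: simplify_replace_tree
   walks an operand such as a DR_REF and calls find_in_mapping on each
   subtree, so an SSA name recorded in MAPPING is replaced by its
   epilogue copy while unmapped trees are returned unchanged.  */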
8791 /* Update EPILOGUE's loop_vec_info. EPILOGUE was constructed as a copy of the
8792 original loop that has now been vectorized.
8794 The inits of the data_references need to be advanced with the number of
8795 iterations of the main loop. This has been computed in vect_do_peeling and
8796 is stored in parameter ADVANCE. We first restore the data_references
8797 initial offset with the values recorded in ORIG_DRS_INIT.
8799 Since the loop_vec_info of this EPILOGUE was constructed for the original
8800 loop, its stmt_vec_infos all point to the original statements. These need
8801 to be updated to point to their corresponding copies as well as the SSA_NAMES
8802 in their PATTERN_DEF_SEQs and RELATED_STMTs.
8804 The data_reference's connections also need to be updated. Their
8805 corresponding dr_vec_info need to be reconnected to the EPILOGUE's
8806 stmt_vec_infos, their statements need to point to their corresponding copy,
8807 if they are gather loads or scatter stores then their reference needs to be
8808 updated to point to its corresponding copy and finally we set
8809 'base_misaligned' to false as we have already peeled for alignment in the
8810 prologue of the main loop. */
8812 static void
8813 update_epilogue_loop_vinfo (class loop *epilogue, tree advance)
8815 loop_vec_info epilogue_vinfo = loop_vec_info_for_loop (epilogue);
8816 auto_vec<gimple *> stmt_worklist;
8817 hash_map<tree,tree> mapping;
8818 gimple *orig_stmt, *new_stmt;
8819 gimple_stmt_iterator epilogue_gsi;
8820 gphi_iterator epilogue_phi_gsi;
8821 stmt_vec_info stmt_vinfo = NULL, related_vinfo;
8822 basic_block *epilogue_bbs = get_loop_body (epilogue);
8823 unsigned i;
8825 free (LOOP_VINFO_BBS (epilogue_vinfo));
8826 LOOP_VINFO_BBS (epilogue_vinfo) = epilogue_bbs;
8828 /* Advance data_reference's with the number of iterations of the previous
8829 loop and its prologue. */
8830 vect_update_inits_of_drs (epilogue_vinfo, advance, PLUS_EXPR);
8833 /* The EPILOGUE loop is a copy of the original loop so they share the same
8834 gimple UIDs. In this loop we update the loop_vec_info of the EPILOGUE to
8835 point to the copied statements. We also create a mapping of all LHS' in
8836 the original loop and all the LHS' in the EPILOGUE and create worklists to
8837 update the STMT_VINFO_PATTERN_DEF_SEQs and STMT_VINFO_RELATED_STMTs. */
8838 for (unsigned i = 0; i < epilogue->num_nodes; ++i)
8840 for (epilogue_phi_gsi = gsi_start_phis (epilogue_bbs[i]);
8841 !gsi_end_p (epilogue_phi_gsi); gsi_next (&epilogue_phi_gsi))
8843 new_stmt = epilogue_phi_gsi.phi ();
8845 gcc_assert (gimple_uid (new_stmt) > 0);
8846 stmt_vinfo
8847 = epilogue_vinfo->stmt_vec_infos[gimple_uid (new_stmt) - 1];
8849 orig_stmt = STMT_VINFO_STMT (stmt_vinfo);
8850 STMT_VINFO_STMT (stmt_vinfo) = new_stmt;
8852 mapping.put (gimple_phi_result (orig_stmt),
8853 gimple_phi_result (new_stmt));
8854 /* PHI nodes cannot have patterns or related statements. */
8855 gcc_assert (STMT_VINFO_PATTERN_DEF_SEQ (stmt_vinfo) == NULL
8856 && STMT_VINFO_RELATED_STMT (stmt_vinfo) == NULL);
8859 for (epilogue_gsi = gsi_start_bb (epilogue_bbs[i]);
8860 !gsi_end_p (epilogue_gsi); gsi_next (&epilogue_gsi))
8862 new_stmt = gsi_stmt (epilogue_gsi);
8863 if (is_gimple_debug (new_stmt))
8864 continue;
8866 gcc_assert (gimple_uid (new_stmt) > 0);
8867 stmt_vinfo
8868 = epilogue_vinfo->stmt_vec_infos[gimple_uid (new_stmt) - 1];
8870 orig_stmt = STMT_VINFO_STMT (stmt_vinfo);
8871 STMT_VINFO_STMT (stmt_vinfo) = new_stmt;
8873 if (tree old_lhs = gimple_get_lhs (orig_stmt))
8874 mapping.put (old_lhs, gimple_get_lhs (new_stmt));
8876 if (STMT_VINFO_PATTERN_DEF_SEQ (stmt_vinfo))
8878 gimple_seq seq = STMT_VINFO_PATTERN_DEF_SEQ (stmt_vinfo);
8879 for (gimple_stmt_iterator gsi = gsi_start (seq);
8880 !gsi_end_p (gsi); gsi_next (&gsi))
8881 stmt_worklist.safe_push (gsi_stmt (gsi));
8884 related_vinfo = STMT_VINFO_RELATED_STMT (stmt_vinfo);
8885 if (related_vinfo != NULL && related_vinfo != stmt_vinfo)
8887 gimple *stmt = STMT_VINFO_STMT (related_vinfo);
8888 stmt_worklist.safe_push (stmt);
8889 /* Set BB such that the assert in
8890 'get_initial_def_for_reduction' is able to determine that
8891 the BB of the related stmt is inside this loop. */
8892 gimple_set_bb (stmt,
8893 gimple_bb (new_stmt));
8894 related_vinfo = STMT_VINFO_RELATED_STMT (related_vinfo);
8895 gcc_assert (related_vinfo == NULL
8896 || related_vinfo == stmt_vinfo);
8901 /* The PATTERN_DEF_SEQs and RELATED_STMTs in the epilogue were constructed
8902 using the original main loop and thus need to be updated to refer to the
8903 cloned variables used in the epilogue. */
8904 for (unsigned i = 0; i < stmt_worklist.length (); ++i)
8906 gimple *stmt = stmt_worklist[i];
8907 tree *new_op;
8909 for (unsigned j = 1; j < gimple_num_ops (stmt); ++j)
8911 tree op = gimple_op (stmt, j);
8912 if ((new_op = mapping.get(op)))
8913 gimple_set_op (stmt, j, *new_op);
8914 else
8916 /* PR92429: The last argument of simplify_replace_tree disables
8917 folding when replacing arguments. This is required as
8918 otherwise you might end up with different statements than the
8919 ones analyzed in vect_loop_analyze, leading to different
8920 vectorization. */
8921 op = simplify_replace_tree (op, NULL_TREE, NULL_TREE,
8922 &find_in_mapping, &mapping, false);
8923 gimple_set_op (stmt, j, op);
8928 struct data_reference *dr;
8929 vec<data_reference_p> datarefs = LOOP_VINFO_DATAREFS (epilogue_vinfo);
8930 FOR_EACH_VEC_ELT (datarefs, i, dr)
8932 orig_stmt = DR_STMT (dr);
8933 gcc_assert (gimple_uid (orig_stmt) > 0);
8934 stmt_vinfo = epilogue_vinfo->stmt_vec_infos[gimple_uid (orig_stmt) - 1];
8935 /* Data references for gather loads and scatter stores do not use the
8936 updated offset we set using ADVANCE. Instead we have to make sure the
8937 reference in each data reference points to the corresponding copy of
8938 the original in the epilogue. */
8939 if (STMT_VINFO_MEMORY_ACCESS_TYPE (vect_stmt_to_vectorize (stmt_vinfo))
8940 == VMAT_GATHER_SCATTER)
8942 DR_REF (dr)
8943 = simplify_replace_tree (DR_REF (dr), NULL_TREE, NULL_TREE,
8944 &find_in_mapping, &mapping);
8945 DR_BASE_ADDRESS (dr)
8946 = simplify_replace_tree (DR_BASE_ADDRESS (dr), NULL_TREE, NULL_TREE,
8947 &find_in_mapping, &mapping);
8949 DR_STMT (dr) = STMT_VINFO_STMT (stmt_vinfo);
8950 stmt_vinfo->dr_aux.stmt = stmt_vinfo;
8951 /* The vector size of the epilogue is smaller than that of the main loop,
8952 so the alignment is either the same or lower. This means the dr is
8953 by definition aligned. */
8954 STMT_VINFO_DR_INFO (stmt_vinfo)->base_misaligned = false;
8957 epilogue_vinfo->shared->datarefs_copy.release ();
8958 epilogue_vinfo->shared->save_datarefs ();
8961 /* Function vect_transform_loop.
8963 The analysis phase has determined that the loop is vectorizable.
8964 Vectorize the loop - create vectorized stmts to replace the scalar
8965 stmts in the loop, and update the loop exit condition.
8966 Returns scalar epilogue loop if any. */
8968 class loop *
8969 vect_transform_loop (loop_vec_info loop_vinfo, gimple *loop_vectorized_call)
8971 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
8972 class loop *epilogue = NULL;
8973 basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
8974 int nbbs = loop->num_nodes;
8975 int i;
8976 tree niters_vector = NULL_TREE;
8977 tree step_vector = NULL_TREE;
8978 tree niters_vector_mult_vf = NULL_TREE;
8979 poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
8980 unsigned int lowest_vf = constant_lower_bound (vf);
8981 gimple *stmt;
8982 bool check_profitability = false;
8983 unsigned int th;
8985 DUMP_VECT_SCOPE ("vec_transform_loop");
8987 loop_vinfo->shared->check_datarefs ();
8989 /* Use the more conservative vectorization threshold. If the number
8990 of iterations is constant, assume the cost check has been performed
8991 by our caller. If the threshold makes all loops profitable that
8992 run at least the (estimated) vectorization factor number of times,
8993 checking is pointless, too. */
8994 th = LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo);
8995 if (vect_apply_runtime_profitability_check_p (loop_vinfo))
8997 if (dump_enabled_p ())
8998 dump_printf_loc (MSG_NOTE, vect_location,
8999 "Profitability threshold is %d loop iterations.\n",
9000 th);
9001 check_profitability = true;
9004 /* Make sure there exists a single-predecessor exit bb. Do this before
9005 versioning. */
9006 edge e = single_exit (loop);
9007 if (! single_pred_p (e->dest))
9009 split_loop_exit_edge (e, true);
9010 if (dump_enabled_p ())
9011 dump_printf (MSG_NOTE, "split exit edge\n");
9014 /* Version the loop first, if required, so the profitability check
9015 comes first. */
9017 if (LOOP_REQUIRES_VERSIONING (loop_vinfo))
9019 class loop *sloop
9020 = vect_loop_versioning (loop_vinfo, loop_vectorized_call);
9021 sloop->force_vectorize = false;
9022 check_profitability = false;
9025 /* Make sure there exists a single-predecessor exit bb also on the
9026 scalar loop copy. Do this after versioning but before peeling
9027 so the CFG structure is fine for both the scalar and the if-converted
9028 loop, and slpeel_duplicate_current_defs_from_edges sees matching
9029 loop-closed PHI nodes on the exit. */
9030 if (LOOP_VINFO_SCALAR_LOOP (loop_vinfo))
9032 e = single_exit (LOOP_VINFO_SCALAR_LOOP (loop_vinfo));
9033 if (! single_pred_p (e->dest))
9035 split_loop_exit_edge (e, true);
9036 if (dump_enabled_p ())
9037 dump_printf (MSG_NOTE, "split exit edge of scalar loop\n");
9041 tree niters = vect_build_loop_niters (loop_vinfo);
9042 LOOP_VINFO_NITERS_UNCHANGED (loop_vinfo) = niters;
9043 tree nitersm1 = unshare_expr (LOOP_VINFO_NITERSM1 (loop_vinfo));
9044 bool niters_no_overflow = loop_niters_no_overflow (loop_vinfo);
9045 tree advance;
9046 drs_init_vec orig_drs_init;
9048 epilogue = vect_do_peeling (loop_vinfo, niters, nitersm1, &niters_vector,
9049 &step_vector, &niters_vector_mult_vf, th,
9050 check_profitability, niters_no_overflow,
9051 &advance);
9053 if (LOOP_VINFO_SCALAR_LOOP (loop_vinfo)
9054 && LOOP_VINFO_SCALAR_LOOP_SCALING (loop_vinfo).initialized_p ())
9055 scale_loop_frequencies (LOOP_VINFO_SCALAR_LOOP (loop_vinfo),
9056 LOOP_VINFO_SCALAR_LOOP_SCALING (loop_vinfo));
9058 if (niters_vector == NULL_TREE)
9060 if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
9061 && !LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo)
9062 && known_eq (lowest_vf, vf))
9064 niters_vector
9065 = build_int_cst (TREE_TYPE (LOOP_VINFO_NITERS (loop_vinfo)),
9066 LOOP_VINFO_INT_NITERS (loop_vinfo) / lowest_vf);
9067 step_vector = build_one_cst (TREE_TYPE (niters));
9069 else if (vect_use_loop_mask_for_alignment_p (loop_vinfo))
9070 vect_gen_vector_loop_niters (loop_vinfo, niters, &niters_vector,
9071 &step_vector, niters_no_overflow);
9072 else
9073 /* vect_do_peeling subtracted the number of peeled prologue
9074 iterations from LOOP_VINFO_NITERS. */
9075 vect_gen_vector_loop_niters (loop_vinfo, LOOP_VINFO_NITERS (loop_vinfo),
9076 &niters_vector, &step_vector,
9077 niters_no_overflow);
9080 /* 1) Make sure the loop header has exactly two entries
9081 2) Make sure we have a preheader basic block. */
9083 gcc_assert (EDGE_COUNT (loop->header->preds) == 2);
9085 split_edge (loop_preheader_edge (loop));
9087 if (vect_use_loop_mask_for_alignment_p (loop_vinfo))
9088 /* This will deal with any possible peeling. */
9089 vect_prepare_for_masked_peels (loop_vinfo);
9091 /* Schedule the SLP instances first, then handle loop vectorization
9092 below. */
9093 if (!loop_vinfo->slp_instances.is_empty ())
9095 DUMP_VECT_SCOPE ("scheduling SLP instances");
9096 vect_schedule_slp (loop_vinfo, LOOP_VINFO_SLP_INSTANCES (loop_vinfo));
9099 /* FORNOW: the vectorizer supports only loops whose body consists
9100 of one basic block (header + empty latch). When the vectorizer
9101 supports more involved loop forms, the order in which the BBs are
9102 traversed will need to be reconsidered. */
9104 for (i = 0; i < nbbs; i++)
9106 basic_block bb = bbs[i];
9107 stmt_vec_info stmt_info;
9109 for (gphi_iterator si = gsi_start_phis (bb); !gsi_end_p (si);
9110 gsi_next (&si))
9112 gphi *phi = si.phi ();
9113 if (dump_enabled_p ())
9114 dump_printf_loc (MSG_NOTE, vect_location,
9115 "------>vectorizing phi: %G", phi);
9116 stmt_info = loop_vinfo->lookup_stmt (phi);
9117 if (!stmt_info)
9118 continue;
9120 if (MAY_HAVE_DEBUG_BIND_STMTS && !STMT_VINFO_LIVE_P (stmt_info))
9121 vect_loop_kill_debug_uses (loop, stmt_info);
9123 if (!STMT_VINFO_RELEVANT_P (stmt_info)
9124 && !STMT_VINFO_LIVE_P (stmt_info))
9125 continue;
9127 if (STMT_VINFO_VECTYPE (stmt_info)
9128 && (maybe_ne
9129 (TYPE_VECTOR_SUBPARTS (STMT_VINFO_VECTYPE (stmt_info)), vf))
9130 && dump_enabled_p ())
9131 dump_printf_loc (MSG_NOTE, vect_location, "multiple-types.\n");
9133 if ((STMT_VINFO_DEF_TYPE (stmt_info) == vect_induction_def
9134 || STMT_VINFO_DEF_TYPE (stmt_info) == vect_reduction_def
9135 || STMT_VINFO_DEF_TYPE (stmt_info) == vect_double_reduction_def
9136 || STMT_VINFO_DEF_TYPE (stmt_info) == vect_nested_cycle
9137 || STMT_VINFO_DEF_TYPE (stmt_info) == vect_internal_def)
9138 && ! PURE_SLP_STMT (stmt_info))
9140 if (dump_enabled_p ())
9141 dump_printf_loc (MSG_NOTE, vect_location, "transform phi.\n");
9142 vect_transform_stmt (loop_vinfo, stmt_info, NULL, NULL, NULL);
9146 for (gphi_iterator si = gsi_start_phis (bb); !gsi_end_p (si);
9147 gsi_next (&si))
9149 gphi *phi = si.phi ();
9150 stmt_info = loop_vinfo->lookup_stmt (phi);
9151 if (!stmt_info)
9152 continue;
9154 if (!STMT_VINFO_RELEVANT_P (stmt_info)
9155 && !STMT_VINFO_LIVE_P (stmt_info))
9156 continue;
9158 if ((STMT_VINFO_DEF_TYPE (stmt_info) == vect_induction_def
9159 || STMT_VINFO_DEF_TYPE (stmt_info) == vect_reduction_def
9160 || STMT_VINFO_DEF_TYPE (stmt_info) == vect_double_reduction_def
9161 || STMT_VINFO_DEF_TYPE (stmt_info) == vect_nested_cycle
9162 || STMT_VINFO_DEF_TYPE (stmt_info) == vect_internal_def)
9163 && ! PURE_SLP_STMT (stmt_info))
9164 maybe_set_vectorized_backedge_value (loop_vinfo, stmt_info);
9167 for (gimple_stmt_iterator si = gsi_start_bb (bb);
9168 !gsi_end_p (si);)
9170 stmt = gsi_stmt (si);
9171 /* During vectorization remove existing clobber stmts. */
9172 if (gimple_clobber_p (stmt))
9174 unlink_stmt_vdef (stmt);
9175 gsi_remove (&si, true);
9176 release_defs (stmt);
9178 else
9180 /* Ignore vector stmts created in the outer loop. */
9181 stmt_info = loop_vinfo->lookup_stmt (stmt);
9183 /* vector stmts created in the outer-loop during vectorization of
9184 stmts in an inner-loop may not have a stmt_info, and do not
9185 need to be vectorized. */
9186 stmt_vec_info seen_store = NULL;
9187 if (stmt_info)
9189 if (STMT_VINFO_IN_PATTERN_P (stmt_info))
9191 gimple *def_seq = STMT_VINFO_PATTERN_DEF_SEQ (stmt_info);
9192 for (gimple_stmt_iterator subsi = gsi_start (def_seq);
9193 !gsi_end_p (subsi); gsi_next (&subsi))
9195 stmt_vec_info pat_stmt_info
9196 = loop_vinfo->lookup_stmt (gsi_stmt (subsi));
9197 vect_transform_loop_stmt (loop_vinfo, pat_stmt_info,
9198 &si, &seen_store);
9200 stmt_vec_info pat_stmt_info
9201 = STMT_VINFO_RELATED_STMT (stmt_info);
9202 vect_transform_loop_stmt (loop_vinfo, pat_stmt_info, &si,
9203 &seen_store);
9204 maybe_set_vectorized_backedge_value (loop_vinfo,
9205 pat_stmt_info);
9207 else
9209 vect_transform_loop_stmt (loop_vinfo, stmt_info, &si,
9210 &seen_store);
9211 maybe_set_vectorized_backedge_value (loop_vinfo,
9212 stmt_info);
9215 gsi_next (&si);
9216 if (seen_store)
9218 if (STMT_VINFO_GROUPED_ACCESS (seen_store))
9219 /* Interleaving. The vectorization of the
9220 interleaving chain was completed - free
9221 all the stores in the chain. */
9222 vect_remove_stores (loop_vinfo,
9223 DR_GROUP_FIRST_ELEMENT (seen_store));
9224 else
9225 /* Free the attached stmt_vec_info and remove the stmt. */
9226 loop_vinfo->remove_stmt (stmt_info);
9231 /* Stub out scalar statements that must not survive vectorization.
9232 Doing this here helps with grouped statements, or statements that
9233 are involved in patterns. */
9234 for (gimple_stmt_iterator gsi = gsi_start_bb (bb);
9235 !gsi_end_p (gsi); gsi_next (&gsi))
9237 gcall *call = dyn_cast <gcall *> (gsi_stmt (gsi));
9238 if (call && gimple_call_internal_p (call, IFN_MASK_LOAD))
9240 tree lhs = gimple_get_lhs (call);
9241 if (!VECTOR_TYPE_P (TREE_TYPE (lhs)))
9243 tree zero = build_zero_cst (TREE_TYPE (lhs));
9244 gimple *new_stmt = gimple_build_assign (lhs, zero);
9245 gsi_replace (&gsi, new_stmt, true);
9249 } /* BBs in loop */
9251 /* The vectorization factor is always > 1, so if we use an IV increment of 1,
9252 a zero NITERS becomes a nonzero NITERS_VECTOR. */
9253 if (integer_onep (step_vector))
9254 niters_no_overflow = true;
9255 vect_set_loop_condition (loop, loop_vinfo, niters_vector, step_vector,
9256 niters_vector_mult_vf, !niters_no_overflow);
9258 unsigned int assumed_vf = vect_vf_for_cost (loop_vinfo);
9259 scale_profile_for_vect_loop (loop, assumed_vf);
9261 /* True if the final iteration might not handle a full vector's
9262 worth of scalar iterations. */
9263 bool final_iter_may_be_partial
9264 = LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo);
9265 /* The minimum number of iterations performed by the epilogue. This
9266 is 1 when peeling for gaps because we always need a final scalar
9267 iteration. */
9268 int min_epilogue_iters = LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo) ? 1 : 0;
9269 /* +1 to convert latch counts to loop iteration counts,
9270 -min_epilogue_iters to remove iterations that cannot be performed
9271 by the vector code. */
9272 int bias_for_lowest = 1 - min_epilogue_iters;
9273 int bias_for_assumed = bias_for_lowest;
9274 int alignment_npeels = LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo);
9275 if (alignment_npeels && LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))
9277 /* When the amount of peeling is known at compile time, the first
9278 iteration will have exactly alignment_npeels active elements.
9279 In the worst case it will have at least one. */
9280 int min_first_active = (alignment_npeels > 0 ? alignment_npeels : 1);
9281 bias_for_lowest += lowest_vf - min_first_active;
9282 bias_for_assumed += assumed_vf - min_first_active;
9284 /* In these calculations the "- 1" converts loop iteration counts
9285 back to latch counts. */
9286 if (loop->any_upper_bound)
9287 loop->nb_iterations_upper_bound
9288 = (final_iter_may_be_partial
9289 ? wi::udiv_ceil (loop->nb_iterations_upper_bound + bias_for_lowest,
9290 lowest_vf) - 1
9291 : wi::udiv_floor (loop->nb_iterations_upper_bound + bias_for_lowest,
9292 lowest_vf) - 1);
9293 if (loop->any_likely_upper_bound)
9294 loop->nb_iterations_likely_upper_bound
9295 = (final_iter_may_be_partial
9296 ? wi::udiv_ceil (loop->nb_iterations_likely_upper_bound
9297 + bias_for_lowest, lowest_vf) - 1
9298 : wi::udiv_floor (loop->nb_iterations_likely_upper_bound
9299 + bias_for_lowest, lowest_vf) - 1);
9300 if (loop->any_estimate)
9301 loop->nb_iterations_estimate
9302 = (final_iter_may_be_partial
9303 ? wi::udiv_ceil (loop->nb_iterations_estimate + bias_for_assumed,
9304 assumed_vf) - 1
9305 : wi::udiv_floor (loop->nb_iterations_estimate + bias_for_assumed,
9306 assumed_vf) - 1);
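  /* A worked (hypothetical) example of the adjustment above: with
     LOWEST_VF = 4, no peeling for gaps (BIAS_FOR_LOWEST = 1) and a
     recorded upper bound of 102 latch iterations (103 iterations), the
     non-partial-vector case gives udiv_floor (102 + 1, 4) - 1 = 24,
     i.e. at most 25 iterations of the transformed vector loop.  */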
9308 if (dump_enabled_p ())
9310 if (!LOOP_VINFO_EPILOGUE_P (loop_vinfo))
9312 dump_printf_loc (MSG_NOTE, vect_location,
9313 "LOOP VECTORIZED\n");
9314 if (loop->inner)
9315 dump_printf_loc (MSG_NOTE, vect_location,
9316 "OUTER LOOP VECTORIZED\n");
9317 dump_printf (MSG_NOTE, "\n");
9319 else
9320 dump_printf_loc (MSG_NOTE, vect_location,
9321 "LOOP EPILOGUE VECTORIZED (MODE=%s)\n",
9322 GET_MODE_NAME (loop_vinfo->vector_mode));
9325 /* Loops vectorized with a variable factor won't benefit from
9326 unrolling/peeling. */
9327 if (!vf.is_constant ())
9329 loop->unroll = 1;
9330 if (dump_enabled_p ())
9331 dump_printf_loc (MSG_NOTE, vect_location, "Disabling unrolling due to"
9332 " variable-length vectorization factor\n");
9334 /* Free SLP instances here because otherwise stmt reference counting
9335 won't work. */
9336 slp_instance instance;
9337 FOR_EACH_VEC_ELT (LOOP_VINFO_SLP_INSTANCES (loop_vinfo), i, instance)
9338 vect_free_slp_instance (instance);
9339 LOOP_VINFO_SLP_INSTANCES (loop_vinfo).release ();
9340 /* Clear the safelen field since its value is invalid after vectorization,
9341 as the vectorized loop can have loop-carried dependencies. */
9342 loop->safelen = 0;
9344 if (epilogue)
9346 update_epilogue_loop_vinfo (epilogue, advance);
9348 epilogue->simduid = loop->simduid;
9349 epilogue->force_vectorize = loop->force_vectorize;
9350 epilogue->dont_vectorize = false;
9353 return epilogue;
9356 /* The code below tries to perform a simple optimization - revert
9357 if-conversion for masked stores, i.e. if the mask of a store is zero,
9358 do not perform it nor, if possible, the producers of the stored values.
9359 For example,
9360 for (i=0; i<n; i++)
9361 if (c[i])
9363 p1[i] += 1;
9364 p2[i] = p3[i] +2;
9366 this transformation will produce the following semi-hammock:
9368 if (!mask__ifc__42.18_165 == { 0, 0, 0, 0, 0, 0, 0, 0 })
9370 vect__11.19_170 = MASK_LOAD (vectp_p1.20_168, 0B, mask__ifc__42.18_165);
9371 vect__12.22_172 = vect__11.19_170 + vect_cst__171;
9372 MASK_STORE (vectp_p1.23_175, 0B, mask__ifc__42.18_165, vect__12.22_172);
9373 vect__18.25_182 = MASK_LOAD (vectp_p3.26_180, 0B, mask__ifc__42.18_165);
9374 vect__19.28_184 = vect__18.25_182 + vect_cst__183;
9375 MASK_STORE (vectp_p2.29_187, 0B, mask__ifc__42.18_165, vect__19.28_184);
9379 void
9380 optimize_mask_stores (class loop *loop)
9382 basic_block *bbs = get_loop_body (loop);
9383 unsigned nbbs = loop->num_nodes;
9384 unsigned i;
9385 basic_block bb;
9386 class loop *bb_loop;
9387 gimple_stmt_iterator gsi;
9388 gimple *stmt;
9389 auto_vec<gimple *> worklist;
9390 auto_purge_vect_location sentinel;
9392 vect_location = find_loop_location (loop);
9393 /* Pick up all masked stores in loop if any. */
9394 for (i = 0; i < nbbs; i++)
9396 bb = bbs[i];
9397 for (gsi = gsi_start_bb (bb); !gsi_end_p (gsi);
9398 gsi_next (&gsi))
9400 stmt = gsi_stmt (gsi);
9401 if (gimple_call_internal_p (stmt, IFN_MASK_STORE))
9402 worklist.safe_push (stmt);
9406 free (bbs);
9407 if (worklist.is_empty ())
9408 return;
9410 /* Loop has masked stores. */
9411 while (!worklist.is_empty ())
9413 gimple *last, *last_store;
9414 edge e, efalse;
9415 tree mask;
9416 basic_block store_bb, join_bb;
9417 gimple_stmt_iterator gsi_to;
9418 tree vdef, new_vdef;
9419 gphi *phi;
9420 tree vectype;
9421 tree zero;
9423 last = worklist.pop ();
9424 mask = gimple_call_arg (last, 2);
9425 bb = gimple_bb (last);
9426 /* Create then_bb and an if-then structure in the CFG; then_bb belongs
9427 to the same loop as if_bb. That loop could differ from LOOP when a
9428 two-level loop nest is vectorized and the mask_store belongs to the
9429 inner one. */
9430 e = split_block (bb, last);
9431 bb_loop = bb->loop_father;
9432 gcc_assert (loop == bb_loop || flow_loop_nested_p (loop, bb_loop));
9433 join_bb = e->dest;
9434 store_bb = create_empty_bb (bb);
9435 add_bb_to_loop (store_bb, bb_loop);
9436 e->flags = EDGE_TRUE_VALUE;
9437 efalse = make_edge (bb, store_bb, EDGE_FALSE_VALUE);
9438 /* Put STORE_BB on the unlikely path. */
9439 efalse->probability = profile_probability::unlikely ();
9440 store_bb->count = efalse->count ();
9441 make_single_succ_edge (store_bb, join_bb, EDGE_FALLTHRU);
9442 if (dom_info_available_p (CDI_DOMINATORS))
9443 set_immediate_dominator (CDI_DOMINATORS, store_bb, bb);
9444 if (dump_enabled_p ())
9445 dump_printf_loc (MSG_NOTE, vect_location,
9446 "Create new block %d to sink mask stores.",
9447 store_bb->index);
9448 /* Create vector comparison with boolean result. */
9449 vectype = TREE_TYPE (mask);
9450 zero = build_zero_cst (vectype);
9451 stmt = gimple_build_cond (EQ_EXPR, mask, zero, NULL_TREE, NULL_TREE);
9452 gsi = gsi_last_bb (bb);
9453 gsi_insert_after (&gsi, stmt, GSI_SAME_STMT);
9454 /* Create new PHI node for vdef of the last masked store:
9455 .MEM_2 = VDEF <.MEM_1>
9456 will be converted to
9457 .MEM.3 = VDEF <.MEM_1>
9458 and new PHI node will be created in join bb
9459 .MEM_2 = PHI <.MEM_1, .MEM_3>
9461 vdef = gimple_vdef (last);
9462 new_vdef = make_ssa_name (gimple_vop (cfun), last);
9463 gimple_set_vdef (last, new_vdef);
9464 phi = create_phi_node (vdef, join_bb);
9465 add_phi_arg (phi, new_vdef, EDGE_SUCC (store_bb, 0), UNKNOWN_LOCATION);
9467 /* Put all masked stores with the same mask to STORE_BB if possible. */
9468 while (true)
9470 gimple_stmt_iterator gsi_from;
9471 gimple *stmt1 = NULL;
9473 /* Move masked store to STORE_BB. */
9474 last_store = last;
9475 gsi = gsi_for_stmt (last);
9476 gsi_from = gsi;
9477 /* Shift GSI to the previous stmt for further traversal. */
9478 gsi_prev (&gsi);
9479 gsi_to = gsi_start_bb (store_bb);
9480 gsi_move_before (&gsi_from, &gsi_to);
9481 /* Set up GSI_TO at the start of the now non-empty block. */
9482 gsi_to = gsi_start_bb (store_bb);
9483 if (dump_enabled_p ())
9484 dump_printf_loc (MSG_NOTE, vect_location,
9485 "Move stmt to created bb\n%G", last);
9486 /* Move all stored value producers if possible. */
9487 while (!gsi_end_p (gsi))
9489 tree lhs;
9490 imm_use_iterator imm_iter;
9491 use_operand_p use_p;
9492 bool res;
9494 /* Skip debug statements. */
9495 if (is_gimple_debug (gsi_stmt (gsi)))
9497 gsi_prev (&gsi);
9498 continue;
9500 stmt1 = gsi_stmt (gsi);
9501 /* Do not consider statements writing to memory or having
9502 a volatile operand. */
9503 if (gimple_vdef (stmt1)
9504 || gimple_has_volatile_ops (stmt1))
9505 break;
9506 gsi_from = gsi;
9507 gsi_prev (&gsi);
9508 lhs = gimple_get_lhs (stmt1);
9509 if (!lhs)
9510 break;
9512 /* LHS of vectorized stmt must be SSA_NAME. */
9513 if (TREE_CODE (lhs) != SSA_NAME)
9514 break;
9516 if (!VECTOR_TYPE_P (TREE_TYPE (lhs)))
9518 /* Remove dead scalar statement. */
9519 if (has_zero_uses (lhs))
9521 gsi_remove (&gsi_from, true);
9522 continue;
9526 /* Check that LHS does not have uses outside of STORE_BB. */
9527 res = true;
9528 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, lhs)
9530 gimple *use_stmt;
9531 use_stmt = USE_STMT (use_p);
9532 if (is_gimple_debug (use_stmt))
9533 continue;
9534 if (gimple_bb (use_stmt) != store_bb)
9536 res = false;
9537 break;
9540 if (!res)
9541 break;
9543 if (gimple_vuse (stmt1)
9544 && gimple_vuse (stmt1) != gimple_vuse (last_store))
9545 break;
9547 /* Can move STMT1 to STORE_BB. */
9548 if (dump_enabled_p ())
9549 dump_printf_loc (MSG_NOTE, vect_location,
9550 "Move stmt to created bb\n%G", stmt1);
9551 gsi_move_before (&gsi_from, &gsi_to);
9552 /* Shift GSI_TO for further insertion. */
9553 gsi_prev (&gsi_to);
9555 /* Put other masked stores with the same mask to STORE_BB. */
9556 if (worklist.is_empty ()
9557 || gimple_call_arg (worklist.last (), 2) != mask
9558 || worklist.last () != stmt1)
9559 break;
9560 last = worklist.pop ();
9562 add_phi_arg (phi, gimple_vuse (last_store), e, UNKNOWN_LOCATION);
9566 /* Decide whether it is possible to use a zero-based induction variable
9567 when vectorizing LOOP_VINFO with partial vectors. If it is, return
9568 the value that the induction variable must be able to hold in order
9569 to ensure that the rgroups eventually have no active vector elements.
9570 Return -1 otherwise. */
9572 widest_int
9573 vect_iv_limit_for_partial_vectors (loop_vec_info loop_vinfo)
9575 tree niters_skip = LOOP_VINFO_MASK_SKIP_NITERS (loop_vinfo);
9576 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
9577 unsigned HOST_WIDE_INT max_vf = vect_max_vf (loop_vinfo);
9579 /* Calculate the value that the induction variable must be able
9580 to hit in order to ensure that we end the loop with an all-false mask.
9581 This involves adding the maximum number of inactive trailing scalar
9582 iterations. */
9583 widest_int iv_limit = -1;
9584 if (max_loop_iterations (loop, &iv_limit))
9586 if (niters_skip)
9588 /* Add the maximum number of skipped iterations to the
9589 maximum iteration count. */
9590 if (TREE_CODE (niters_skip) == INTEGER_CST)
9591 iv_limit += wi::to_widest (niters_skip);
9592 else
9593 iv_limit += max_vf - 1;
9595 else if (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo))
9596 /* Make a conservatively-correct assumption. */
9597 iv_limit += max_vf - 1;
9599 /* IV_LIMIT is the maximum number of latch iterations, which is also
9600 the maximum in-range IV value. Round this value down to the previous
9601 vector alignment boundary and then add an extra full iteration. */
9602 poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
9603 iv_limit = (iv_limit & -(int) known_alignment (vf)) + max_vf;
9605 return iv_limit;
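/* A hypothetical example of the final rounding step: with a maximum
   latch count of 1001, VF = 4 (and MAX_VF = 4) and no skipped or peeled
   iterations, IV_LIMIT becomes (1001 & -4) + 4 = 1004, i.e. the limit is
   rounded down to the previous multiple of the vector alignment and then
   given one extra full vector iteration of headroom.  */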
9608 /* For the given rgroup_controls RGC, check whether an induction variable
9609 would ever hit a value that produces a set of all-false masks or zero
9610 lengths before wrapping around. Return true if it's possible to wrap
9611 around before hitting the desirable value, otherwise return false. */
9613 bool
9614 vect_rgroup_iv_might_wrap_p (loop_vec_info loop_vinfo, rgroup_controls *rgc)
9616 widest_int iv_limit = vect_iv_limit_for_partial_vectors (loop_vinfo);
9618 if (iv_limit == -1)
9619 return true;
9621 tree compare_type = LOOP_VINFO_RGROUP_COMPARE_TYPE (loop_vinfo);
9622 unsigned int compare_precision = TYPE_PRECISION (compare_type);
9623 unsigned nitems = rgc->max_nscalars_per_iter * rgc->factor;
9625 if (wi::min_precision (iv_limit * nitems, UNSIGNED) > compare_precision)
9626 return true;
9628 return false;
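/* For instance (hypothetically), with a 32-bit RGROUP_COMPARE_TYPE, an
   IV_LIMIT of 1 << 30 and an rgroup where MAX_NSCALARS_PER_ITER * FACTOR
   is 8, the product needs 34 bits of unsigned precision, which exceeds
   the 32-bit compare precision, so the function returns true (the IV
   might wrap before producing an all-false mask or zero length).  */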