/* Loop Vectorization
   Copyright (C) 2003-2020 Free Software Foundation, Inc.
   Contributed by Dorit Naishlos <dorit@il.ibm.com> and
   Ira Rosen <irar@il.ibm.com>

This file is part of GCC.

GCC is free software; you can redistribute it and/or modify it under
the terms of the GNU General Public License as published by the Free
Software Foundation; either version 3, or (at your option) any later
version.

GCC is distributed in the hope that it will be useful, but WITHOUT ANY
WARRANTY; without even the implied warranty of MERCHANTABILITY or
FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
for more details.

You should have received a copy of the GNU General Public License
along with GCC; see the file COPYING3.  If not see
<http://www.gnu.org/licenses/>.  */
#include "config.h"
#include "system.h"
#include "coretypes.h"
#include "backend.h"
#include "target.h"
#include "rtl.h"
#include "tree.h"
#include "gimple.h"
#include "cfghooks.h"
#include "tree-pass.h"
#include "ssa.h"
#include "optabs-tree.h"
#include "diagnostic-core.h"
#include "fold-const.h"
#include "stor-layout.h"
#include "cfganal.h"
#include "gimplify.h"
#include "gimple-iterator.h"
#include "gimplify-me.h"
#include "tree-ssa-loop-ivopts.h"
#include "tree-ssa-loop-manip.h"
#include "tree-ssa-loop-niter.h"
#include "tree-ssa-loop.h"
#include "cfgloop.h"
#include "tree-scalar-evolution.h"
#include "tree-vectorizer.h"
#include "gimple-fold.h"
#include "cgraph.h"
#include "tree-cfg.h"
#include "tree-if-conv.h"
#include "internal-fn.h"
#include "tree-vector-builder.h"
#include "vec-perm-indices.h"
#include "tree-eh.h"
/* Loop Vectorization Pass.

   This pass tries to vectorize loops.

   For example, the vectorizer transforms the following simple loop:

        short a[N]; short b[N]; short c[N]; int i;

        for (i=0; i<N; i++){
          a[i] = b[i] + c[i];
        }

   as if it was manually vectorized by rewriting the source code into:

        typedef int __attribute__((mode(V8HI))) v8hi;
        short a[N]; short b[N]; short c[N]; int i;
        v8hi *pa = (v8hi*)a, *pb = (v8hi*)b, *pc = (v8hi*)c;
        v8hi va, vb, vc;

        for (i=0; i<N/8; i++){
          vb = pb[i];
          vc = pc[i];
          va = vb + vc;
          pa[i] = va;
        }

   The main entry to this pass is vectorize_loops(), in which
   the vectorizer applies a set of analyses on a given set of loops,
   followed by the actual vectorization transformation for the loops that
   had successfully passed the analysis phase.
   Throughout this pass we make a distinction between two types of
   data: scalars (which are represented by SSA_NAMES), and memory references
   ("data-refs").  These two types of data require different handling both
   during analysis and transformation.  The types of data-refs that the
   vectorizer currently supports are ARRAY_REFS whose base is an array DECL
   (not a pointer), and INDIRECT_REFS through pointers; both array and pointer
   accesses are required to have a simple (consecutive) access pattern.

   Analysis phase:
   ===============
   The driver for the analysis phase is vect_analyze_loop().
   It applies a set of analyses, some of which rely on the scalar evolution
   analyzer (scev) developed by Sebastian Pop.

   During the analysis phase the vectorizer records some information
   per stmt in a "stmt_vec_info" struct which is attached to each stmt in the
   loop, as well as general information about the loop as a whole, which is
   recorded in a "loop_vec_info" struct attached to each loop.

   Transformation phase:
   =====================
   The loop transformation phase scans all the stmts in the loop, and
   creates a vector stmt (or a sequence of stmts) for each scalar stmt S in
   the loop that needs to be vectorized.  It inserts the vector code sequence
   just before the scalar stmt S, and records a pointer to the vector code
   in STMT_VINFO_VEC_STMT (stmt_info) (stmt_info is the stmt_vec_info struct
   attached to S).  This pointer will be used for the vectorization of
   following stmts which use the def of stmt S.  Stmt S is removed if it
   writes to memory; otherwise, we rely on dead code elimination for
   removing it.

   For example, say stmt S1 was vectorized into stmt VS1:

   VS1: vb = px[i];
   S1:  b = x[i];    STMT_VINFO_VEC_STMT (stmt_info (S1)) = VS1
   S2:  a = b;

   To vectorize stmt S2, the vectorizer first finds the stmt that defines
   the operand 'b' (S1), and gets the relevant vector def 'vb' from the
   vector stmt VS1 pointed to by STMT_VINFO_VEC_STMT (stmt_info (S1)).  The
   resulting sequence would be:

   VS1: vb = px[i];
   S1:  b = x[i];       STMT_VINFO_VEC_STMT (stmt_info (S1)) = VS1
   VS2: va = vb;
   S2:  a = b;          STMT_VINFO_VEC_STMT (stmt_info (S2)) = VS2

   Operands that are not SSA_NAMEs, are data-refs that appear in
   load/store operations (like 'x[i]' in S1), and are handled differently.

   Target modeling:
   =================
   Currently the only target specific information that is used is the
   size of the vector (in bytes) - "TARGET_VECTORIZE_UNITS_PER_SIMD_WORD".
   Targets that can support different sizes of vectors, for now will need
   to specify one value for "TARGET_VECTORIZE_UNITS_PER_SIMD_WORD".  More
   flexibility will be added in the future.

   Since we only vectorize operations whose vector form can be
   expressed using existing tree codes, to verify that an operation is
   supported, the vectorizer checks the relevant optab at the relevant
   machine_mode (e.g, optab_handler (add_optab, V8HImode)).  If
   the value found is CODE_FOR_nothing, then there's no target support, and
   we can't vectorize the stmt.

   For additional information on this project see:
   http://gcc.gnu.org/projects/tree-ssa/vectorization.html
*/
static void vect_estimate_min_profitable_iters (loop_vec_info, int *, int *);
static stmt_vec_info vect_is_simple_reduction (loop_vec_info, stmt_vec_info,
					       bool *, bool *);
/* Subroutine of vect_determine_vf_for_stmt that handles only one
   statement.  VECTYPE_MAYBE_SET_P is true if STMT_VINFO_VECTYPE
   may already be set for general statements (not just data refs).  */

static opt_result
vect_determine_vf_for_stmt_1 (stmt_vec_info stmt_info,
			      bool vectype_maybe_set_p,
			      poly_uint64 *vf)
{
  gimple *stmt = stmt_info->stmt;

  if ((!STMT_VINFO_RELEVANT_P (stmt_info)
       && !STMT_VINFO_LIVE_P (stmt_info))
      || gimple_clobber_p (stmt))
    {
      if (dump_enabled_p ())
	dump_printf_loc (MSG_NOTE, vect_location, "skip.\n");
      return opt_result::success ();
    }

  tree stmt_vectype, nunits_vectype;
  opt_result res = vect_get_vector_types_for_stmt (stmt_info, &stmt_vectype,
						   &nunits_vectype);
  if (!res)
    return res;

  if (stmt_vectype)
    {
      if (STMT_VINFO_VECTYPE (stmt_info))
	/* The only case when a vectype had been already set is for stmts
	   that contain a data ref, or for "pattern-stmts" (stmts generated
	   by the vectorizer to represent/replace a certain idiom).  */
	gcc_assert ((STMT_VINFO_DATA_REF (stmt_info)
		     || vectype_maybe_set_p)
		    && STMT_VINFO_VECTYPE (stmt_info) == stmt_vectype);
      else
	STMT_VINFO_VECTYPE (stmt_info) = stmt_vectype;
    }

  if (nunits_vectype)
    vect_update_max_nunits (vf, nunits_vectype);

  return opt_result::success ();
}
/* Subroutine of vect_determine_vectorization_factor.  Set the vector
   types of STMT_INFO and all attached pattern statements and update
   the vectorization factor VF accordingly.  Return true on success
   or false if something prevented vectorization.  */

static opt_result
vect_determine_vf_for_stmt (stmt_vec_info stmt_info, poly_uint64 *vf)
{
  vec_info *vinfo = stmt_info->vinfo;
  if (dump_enabled_p ())
    dump_printf_loc (MSG_NOTE, vect_location, "==> examining statement: %G",
		     stmt_info->stmt);
  opt_result res = vect_determine_vf_for_stmt_1 (stmt_info, false, vf);
  if (!res)
    return res;

  if (STMT_VINFO_IN_PATTERN_P (stmt_info)
      && STMT_VINFO_RELATED_STMT (stmt_info))
    {
      gimple *pattern_def_seq = STMT_VINFO_PATTERN_DEF_SEQ (stmt_info);
      stmt_info = STMT_VINFO_RELATED_STMT (stmt_info);

      /* If a pattern statement has def stmts, analyze them too.  */
      for (gimple_stmt_iterator si = gsi_start (pattern_def_seq);
	   !gsi_end_p (si); gsi_next (&si))
	{
	  stmt_vec_info def_stmt_info = vinfo->lookup_stmt (gsi_stmt (si));
	  if (dump_enabled_p ())
	    dump_printf_loc (MSG_NOTE, vect_location,
			     "==> examining pattern def stmt: %G",
			     def_stmt_info->stmt);
	  res = vect_determine_vf_for_stmt_1 (def_stmt_info, true, vf);
	  if (!res)
	    return res;
	}

      if (dump_enabled_p ())
	dump_printf_loc (MSG_NOTE, vect_location,
			 "==> examining pattern statement: %G",
			 stmt_info->stmt);
      res = vect_determine_vf_for_stmt_1 (stmt_info, true, vf);
      if (!res)
	return res;
    }

  return opt_result::success ();
}
/* Function vect_determine_vectorization_factor

   Determine the vectorization factor (VF).  VF is the number of data elements
   that are operated upon in parallel in a single iteration of the vectorized
   loop.  For example, when vectorizing a loop that operates on 4-byte
   elements, on a target with a vector size (VS) of 16 bytes, the VF is set
   to 4, since 4 elements can fit in a single vector register.

   We currently support vectorization of loops in which all types operated
   upon are of the same size.  Therefore this function currently sets VF
   according to the size of the types operated upon, and fails if there are
   multiple sizes in the loop.

   VF is also the factor by which the loop iterations are strip-mined, e.g.:
   original loop:
        for (i=0; i<N; i++){
          a[i] = b[i] + c[i];
        }

   vectorized loop:
        for (i=0; i<N; i+=VF){
          a[i:VF] = b[i:VF] + c[i:VF];
        }
*/
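/* Illustrative sketch, not part of the vectorizer itself; the function and
   variable names below are made up for exposition.  Assuming 32-bit ints and
   16-byte vector registers (so VF = 4), the strip-mined loop above plus a
   scalar epilogue for the N % VF leftover iterations could be written by
   hand roughly as follows (alignment concerns ignored):

	typedef int v4si __attribute__ ((vector_size (16)));

	void
	add_arrays (int *a, int *b, int *c, int n)
	{
	  int i;
	  for (i = 0; i + 4 <= n; i += 4)
	    *(v4si *) &a[i] = *(v4si *) &b[i] + *(v4si *) &c[i];
	  for (; i < n; i++)
	    a[i] = b[i] + c[i];
	}

   The first loop is the vector body (4 elements per iteration); the second
   is the scalar epilogue.  The vectorizer arranges the equivalent of this
   epilogue handling itself - see the PEELING_FOR_NITER logic later in this
   file.  */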
static opt_result
vect_determine_vectorization_factor (loop_vec_info loop_vinfo)
{
  class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
  basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
  unsigned nbbs = loop->num_nodes;
  poly_uint64 vectorization_factor = 1;
  tree scalar_type = NULL_TREE;
  gphi *phi;
  tree vectype;
  stmt_vec_info stmt_info;
  unsigned i;

  DUMP_VECT_SCOPE ("vect_determine_vectorization_factor");

  for (i = 0; i < nbbs; i++)
    {
      basic_block bb = bbs[i];

      for (gphi_iterator si = gsi_start_phis (bb); !gsi_end_p (si);
	   gsi_next (&si))
	{
	  phi = si.phi ();
	  stmt_info = loop_vinfo->lookup_stmt (phi);
	  if (dump_enabled_p ())
	    dump_printf_loc (MSG_NOTE, vect_location, "==> examining phi: %G",
			     phi);

	  gcc_assert (stmt_info);

	  if (STMT_VINFO_RELEVANT_P (stmt_info)
	      || STMT_VINFO_LIVE_P (stmt_info))
	    {
	      gcc_assert (!STMT_VINFO_VECTYPE (stmt_info));
	      scalar_type = TREE_TYPE (PHI_RESULT (phi));

	      if (dump_enabled_p ())
		dump_printf_loc (MSG_NOTE, vect_location,
				 "get vectype for scalar type: %T\n",
				 scalar_type);

	      vectype = get_vectype_for_scalar_type (loop_vinfo, scalar_type);
	      if (!vectype)
		return opt_result::failure_at (phi,
					       "not vectorized: unsupported "
					       "data-type %T\n",
					       scalar_type);
	      STMT_VINFO_VECTYPE (stmt_info) = vectype;

	      if (dump_enabled_p ())
		dump_printf_loc (MSG_NOTE, vect_location, "vectype: %T\n",
				 vectype);

	      if (dump_enabled_p ())
		{
		  dump_printf_loc (MSG_NOTE, vect_location, "nunits = ");
		  dump_dec (MSG_NOTE, TYPE_VECTOR_SUBPARTS (vectype));
		  dump_printf (MSG_NOTE, "\n");
		}

	      vect_update_max_nunits (&vectorization_factor, vectype);
	    }
	}

      for (gimple_stmt_iterator si = gsi_start_bb (bb); !gsi_end_p (si);
	   gsi_next (&si))
	{
	  stmt_info = loop_vinfo->lookup_stmt (gsi_stmt (si));
	  opt_result res
	    = vect_determine_vf_for_stmt (stmt_info, &vectorization_factor);
	  if (!res)
	    return res;
	}
    }

  /* TODO: Analyze cost.  Decide if worth while to vectorize.  */
  if (dump_enabled_p ())
    {
      dump_printf_loc (MSG_NOTE, vect_location, "vectorization factor = ");
      dump_dec (MSG_NOTE, vectorization_factor);
      dump_printf (MSG_NOTE, "\n");
    }

  if (known_le (vectorization_factor, 1U))
    return opt_result::failure_at (vect_location,
				   "not vectorized: unsupported data-type\n");
  LOOP_VINFO_VECT_FACTOR (loop_vinfo) = vectorization_factor;
  return opt_result::success ();
}
/* Function vect_is_simple_iv_evolution.

   FORNOW: A simple evolution of an induction variable in the loop is
   considered a polynomial evolution.  */

static bool
vect_is_simple_iv_evolution (unsigned loop_nb, tree access_fn, tree * init,
			     tree * step)
{
  tree init_expr;
  tree step_expr;
  tree evolution_part = evolution_part_in_loop_num (access_fn, loop_nb);
  basic_block bb;

  /* When there is no evolution in this loop, the evolution function
     is not "simple".  */
  if (evolution_part == NULL_TREE)
    return false;

  /* When the evolution is a polynomial of degree >= 2
     the evolution function is not "simple".  */
  if (tree_is_chrec (evolution_part))
    return false;

  step_expr = evolution_part;
  init_expr = unshare_expr (initial_condition_in_loop_num (access_fn, loop_nb));

  if (dump_enabled_p ())
    dump_printf_loc (MSG_NOTE, vect_location, "step: %T,  init: %T\n",
		     step_expr, init_expr);

  *init = init_expr;
  *step = step_expr;

  if (TREE_CODE (step_expr) != INTEGER_CST
      && (TREE_CODE (step_expr) != SSA_NAME
	  || ((bb = gimple_bb (SSA_NAME_DEF_STMT (step_expr)))
	      && flow_bb_inside_loop_p (get_loop (cfun, loop_nb), bb))
	  || (!INTEGRAL_TYPE_P (TREE_TYPE (step_expr))
	      && (!SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr))
		  || !flag_associative_math)))
      && (TREE_CODE (step_expr) != REAL_CST
	  || !flag_associative_math))
    {
      if (dump_enabled_p ())
	dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
			 "step unknown.\n");
      return false;
    }

  return true;
}
/* Return true if PHI, described by STMT_INFO, is the inner PHI in
   what we are assuming is a double reduction.  For example, given
   a structure like this:

      outer1:
	x_1 = PHI <x_4(outer2), ...>;
	...

      inner:
	x_2 = PHI <x_1(outer1), ...>;
	...
	x_3 = ...;
	...

      outer2:
	x_4 = PHI <x_3(inner)>;
	...

   outer loop analysis would treat x_1 as a double reduction phi and
   this function would then return true for x_2.  */
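/* Illustrative sketch for exposition only (the function name is made up):
   at the source level, a double reduction typically comes from a scalar
   accumulated across a whole loop nest, e.g.

	int
	sum_all (int n, int m, int a[n][m])
	{
	  int sum = 0;
	  for (int i = 0; i < n; i++)
	    for (int j = 0; j < m; j++)
	      sum += a[i][j];
	  return sum;
	}

   Here x_1/x_4 correspond to the values of "sum" carried around the outer
   loop and x_2/x_3 to its values inside the inner loop.  */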
static bool
vect_inner_phi_in_double_reduction_p (stmt_vec_info stmt_info, gphi *phi)
{
  loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
  use_operand_p use_p;
  ssa_op_iter op_iter;
  FOR_EACH_PHI_ARG (use_p, phi, op_iter, SSA_OP_USE)
    if (stmt_vec_info def_info = loop_vinfo->lookup_def (USE_FROM_PTR (use_p)))
      if (STMT_VINFO_DEF_TYPE (def_info) == vect_double_reduction_def)
	return true;
  return false;
}
/* Function vect_analyze_scalar_cycles_1.

   Examine the cross iteration def-use cycles of scalar variables
   in LOOP.  LOOP_VINFO represents the loop that is now being
   considered for vectorization (can be LOOP, or an outer-loop
   enclosing LOOP).  */

static void
vect_analyze_scalar_cycles_1 (loop_vec_info loop_vinfo, class loop *loop)
{
  basic_block bb = loop->header;
  tree init, step;
  auto_vec<stmt_vec_info, 64> worklist;
  gphi_iterator gsi;
  bool double_reduc, reduc_chain;

  DUMP_VECT_SCOPE ("vect_analyze_scalar_cycles");

  /* First - identify all inductions.  Reduction detection assumes that all the
     inductions have been identified, therefore, this order must not be
     changed.  */
  for (gsi = gsi_start_phis (bb); !gsi_end_p (gsi); gsi_next (&gsi))
    {
      gphi *phi = gsi.phi ();
      tree access_fn = NULL;
      tree def = PHI_RESULT (phi);
      stmt_vec_info stmt_vinfo = loop_vinfo->lookup_stmt (phi);

      if (dump_enabled_p ())
	dump_printf_loc (MSG_NOTE, vect_location, "Analyze phi: %G", phi);

      /* Skip virtual phi's.  The data dependences that are associated with
	 virtual defs/uses (i.e., memory accesses) are analyzed elsewhere.  */
      if (virtual_operand_p (def))
	continue;

      STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_unknown_def_type;

      /* Analyze the evolution function.  */
      access_fn = analyze_scalar_evolution (loop, def);
      if (access_fn)
	{
	  STRIP_NOPS (access_fn);
	  if (dump_enabled_p ())
	    dump_printf_loc (MSG_NOTE, vect_location,
			     "Access function of PHI: %T\n", access_fn);
	  STMT_VINFO_LOOP_PHI_EVOLUTION_BASE_UNCHANGED (stmt_vinfo)
	    = initial_condition_in_loop_num (access_fn, loop->num);
	  STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_vinfo)
	    = evolution_part_in_loop_num (access_fn, loop->num);
	}

      if (!access_fn
	  || vect_inner_phi_in_double_reduction_p (stmt_vinfo, phi)
	  || !vect_is_simple_iv_evolution (loop->num, access_fn, &init, &step)
	  || (LOOP_VINFO_LOOP (loop_vinfo) != loop
	      && TREE_CODE (step) != INTEGER_CST))
	{
	  worklist.safe_push (stmt_vinfo);
	  continue;
	}

      gcc_assert (STMT_VINFO_LOOP_PHI_EVOLUTION_BASE_UNCHANGED (stmt_vinfo)
		  != NULL_TREE);
      gcc_assert (STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_vinfo) != NULL_TREE);

      if (dump_enabled_p ())
	dump_printf_loc (MSG_NOTE, vect_location, "Detected induction.\n");
      STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_induction_def;
    }


  /* Second - identify all reductions and nested cycles.  */
  while (worklist.length () > 0)
    {
      stmt_vec_info stmt_vinfo = worklist.pop ();
      gphi *phi = as_a <gphi *> (stmt_vinfo->stmt);
      tree def = PHI_RESULT (phi);

      if (dump_enabled_p ())
	dump_printf_loc (MSG_NOTE, vect_location, "Analyze phi: %G", phi);

      gcc_assert (!virtual_operand_p (def)
		  && STMT_VINFO_DEF_TYPE (stmt_vinfo) == vect_unknown_def_type);

      stmt_vec_info reduc_stmt_info
	= vect_is_simple_reduction (loop_vinfo, stmt_vinfo, &double_reduc,
				    &reduc_chain);
      if (reduc_stmt_info)
	{
	  STMT_VINFO_REDUC_DEF (stmt_vinfo) = reduc_stmt_info;
	  STMT_VINFO_REDUC_DEF (reduc_stmt_info) = stmt_vinfo;
	  if (double_reduc)
	    {
	      if (dump_enabled_p ())
		dump_printf_loc (MSG_NOTE, vect_location,
				 "Detected double reduction.\n");

	      STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_double_reduction_def;
	      STMT_VINFO_DEF_TYPE (reduc_stmt_info)
		= vect_double_reduction_def;
	    }
	  else
	    {
	      if (loop != LOOP_VINFO_LOOP (loop_vinfo))
		{
		  if (dump_enabled_p ())
		    dump_printf_loc (MSG_NOTE, vect_location,
				     "Detected vectorizable nested cycle.\n");

		  STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_nested_cycle;
		}
	      else
		{
		  if (dump_enabled_p ())
		    dump_printf_loc (MSG_NOTE, vect_location,
				     "Detected reduction.\n");

		  STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_reduction_def;
		  STMT_VINFO_DEF_TYPE (reduc_stmt_info) = vect_reduction_def;
		  /* Store the reduction cycles for possible vectorization in
		     loop-aware SLP if it was not detected as reduction
		     chain.  */
		  if (! reduc_chain)
		    LOOP_VINFO_REDUCTIONS (loop_vinfo).safe_push
		      (reduc_stmt_info);
		}
	    }
	}
      else
	if (dump_enabled_p ())
	  dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
			   "Unknown def-use cycle pattern.\n");
    }
}
/* Function vect_analyze_scalar_cycles.

   Examine the cross iteration def-use cycles of scalar variables, by
   analyzing the loop-header PHIs of scalar variables.  Classify each
   cycle as one of the following: invariant, induction, reduction, unknown.
   We do that for the loop represented by LOOP_VINFO, and also for its
   inner loop, if it exists.

   Examples for scalar cycles:

   Example1: reduction:

              loop1:
              for (i=0; i<N; i++)
                 sum += a[i];

   Example2: induction:

              loop2:
              for (i=0; i<N; i++)
                 a[i] = i;  */
static void
vect_analyze_scalar_cycles (loop_vec_info loop_vinfo)
{
  class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);

  vect_analyze_scalar_cycles_1 (loop_vinfo, loop);

  /* When vectorizing an outer-loop, the inner-loop is executed sequentially.
     Reductions in such inner-loop therefore have different properties than
     the reductions in the nest that gets vectorized:
     1. When vectorized, they are executed in the same order as in the original
	scalar loop, so we can't change the order of computation when
	vectorizing them.
     2. FIXME: Inner-loop reductions can be used in the inner-loop, so the
	current checks are too strict.  */

  if (loop->inner)
    vect_analyze_scalar_cycles_1 (loop_vinfo, loop->inner);
}
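/* Illustrative sketch for exposition only (the function name is made up):
   when the outer loop below is the one being vectorized, the accumulation
   of "s" still runs sequentially within each outer iteration, so its PHI is
   classified as a nested cycle rather than as an ordinary reduction:

	void
	row_sums (int n, int m, int a[n][m], int *b)
	{
	  for (int i = 0; i < n; i++)
	    {
	      int s = 0;
	      for (int j = 0; j < m; j++)
		s += a[i][j];
	      b[i] = s;
	    }
	}

   Contrast this with the double-reduction example earlier, where the
   accumulator is not reset and its value is also carried around the outer
   loop.  */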
632 /* Transfer group and reduction information from STMT_INFO to its
633 pattern stmt. */
635 static void
636 vect_fixup_reduc_chain (stmt_vec_info stmt_info)
638 stmt_vec_info firstp = STMT_VINFO_RELATED_STMT (stmt_info);
639 stmt_vec_info stmtp;
640 gcc_assert (!REDUC_GROUP_FIRST_ELEMENT (firstp)
641 && REDUC_GROUP_FIRST_ELEMENT (stmt_info));
642 REDUC_GROUP_SIZE (firstp) = REDUC_GROUP_SIZE (stmt_info);
645 stmtp = STMT_VINFO_RELATED_STMT (stmt_info);
646 gcc_checking_assert (STMT_VINFO_DEF_TYPE (stmtp)
647 == STMT_VINFO_DEF_TYPE (stmt_info));
648 REDUC_GROUP_FIRST_ELEMENT (stmtp) = firstp;
649 stmt_info = REDUC_GROUP_NEXT_ELEMENT (stmt_info);
650 if (stmt_info)
651 REDUC_GROUP_NEXT_ELEMENT (stmtp)
652 = STMT_VINFO_RELATED_STMT (stmt_info);
654 while (stmt_info);
657 /* Fixup scalar cycles that now have their stmts detected as patterns. */
659 static void
660 vect_fixup_scalar_cycles_with_patterns (loop_vec_info loop_vinfo)
662 stmt_vec_info first;
663 unsigned i;
665 FOR_EACH_VEC_ELT (LOOP_VINFO_REDUCTION_CHAINS (loop_vinfo), i, first)
666 if (STMT_VINFO_IN_PATTERN_P (first))
668 stmt_vec_info next = REDUC_GROUP_NEXT_ELEMENT (first);
669 while (next)
671 if (! STMT_VINFO_IN_PATTERN_P (next)
672 || STMT_VINFO_REDUC_IDX (STMT_VINFO_RELATED_STMT (next)) == -1)
673 break;
674 next = REDUC_GROUP_NEXT_ELEMENT (next);
676 /* If not all stmt in the chain are patterns or if we failed
677 to update STMT_VINFO_REDUC_IDX try to handle the chain
678 without patterns. */
679 if (! next
680 && STMT_VINFO_REDUC_IDX (STMT_VINFO_RELATED_STMT (first)) != -1)
682 vect_fixup_reduc_chain (first);
683 LOOP_VINFO_REDUCTION_CHAINS (loop_vinfo)[i]
684 = STMT_VINFO_RELATED_STMT (first);
689 /* Function vect_get_loop_niters.
691 Determine how many iterations the loop is executed and place it
692 in NUMBER_OF_ITERATIONS. Place the number of latch iterations
693 in NUMBER_OF_ITERATIONSM1. Place the condition under which the
694 niter information holds in ASSUMPTIONS.
696 Return the loop exit condition. */
699 static gcond *
700 vect_get_loop_niters (class loop *loop, tree *assumptions,
701 tree *number_of_iterations, tree *number_of_iterationsm1)
703 edge exit = single_exit (loop);
704 class tree_niter_desc niter_desc;
705 tree niter_assumptions, niter, may_be_zero;
706 gcond *cond = get_loop_exit_condition (loop);
708 *assumptions = boolean_true_node;
709 *number_of_iterationsm1 = chrec_dont_know;
710 *number_of_iterations = chrec_dont_know;
711 DUMP_VECT_SCOPE ("get_loop_niters");
713 if (!exit)
714 return cond;
716 may_be_zero = NULL_TREE;
717 if (!number_of_iterations_exit_assumptions (loop, exit, &niter_desc, NULL)
718 || chrec_contains_undetermined (niter_desc.niter))
719 return cond;
721 niter_assumptions = niter_desc.assumptions;
722 may_be_zero = niter_desc.may_be_zero;
723 niter = niter_desc.niter;
725 if (may_be_zero && integer_zerop (may_be_zero))
726 may_be_zero = NULL_TREE;
728 if (may_be_zero)
730 if (COMPARISON_CLASS_P (may_be_zero))
732 /* Try to combine may_be_zero with assumptions, this can simplify
733 computation of niter expression. */
734 if (niter_assumptions && !integer_nonzerop (niter_assumptions))
735 niter_assumptions = fold_build2 (TRUTH_AND_EXPR, boolean_type_node,
736 niter_assumptions,
737 fold_build1 (TRUTH_NOT_EXPR,
738 boolean_type_node,
739 may_be_zero));
740 else
741 niter = fold_build3 (COND_EXPR, TREE_TYPE (niter), may_be_zero,
742 build_int_cst (TREE_TYPE (niter), 0),
743 rewrite_to_non_trapping_overflow (niter));
745 may_be_zero = NULL_TREE;
747 else if (integer_nonzerop (may_be_zero))
749 *number_of_iterationsm1 = build_int_cst (TREE_TYPE (niter), 0);
750 *number_of_iterations = build_int_cst (TREE_TYPE (niter), 1);
751 return cond;
753 else
754 return cond;
757 *assumptions = niter_assumptions;
758 *number_of_iterationsm1 = niter;
760 /* We want the number of loop header executions which is the number
761 of latch executions plus one.
762 ??? For UINT_MAX latch executions this number overflows to zero
763 for loops like do { n++; } while (n != 0); */
764 if (niter && !chrec_contains_undetermined (niter))
765 niter = fold_build2 (PLUS_EXPR, TREE_TYPE (niter), unshare_expr (niter),
766 build_int_cst (TREE_TYPE (niter), 1));
767 *number_of_iterations = niter;
769 return cond;
772 /* Function bb_in_loop_p
774 Used as predicate for dfs order traversal of the loop bbs. */
776 static bool
777 bb_in_loop_p (const_basic_block bb, const void *data)
779 const class loop *const loop = (const class loop *)data;
780 if (flow_bb_inside_loop_p (loop, bb))
781 return true;
782 return false;
786 /* Create and initialize a new loop_vec_info struct for LOOP_IN, as well as
787 stmt_vec_info structs for all the stmts in LOOP_IN. */
789 _loop_vec_info::_loop_vec_info (class loop *loop_in, vec_info_shared *shared)
790 : vec_info (vec_info::loop, init_cost (loop_in), shared),
791 loop (loop_in),
792 bbs (XCNEWVEC (basic_block, loop->num_nodes)),
793 num_itersm1 (NULL_TREE),
794 num_iters (NULL_TREE),
795 num_iters_unchanged (NULL_TREE),
796 num_iters_assumptions (NULL_TREE),
797 th (0),
798 versioning_threshold (0),
799 vectorization_factor (0),
800 max_vectorization_factor (0),
801 mask_skip_niters (NULL_TREE),
802 mask_compare_type (NULL_TREE),
803 simd_if_cond (NULL_TREE),
804 unaligned_dr (NULL),
805 peeling_for_alignment (0),
806 ptr_mask (0),
807 ivexpr_map (NULL),
808 scan_map (NULL),
809 slp_unrolling_factor (1),
810 single_scalar_iteration_cost (0),
811 vec_outside_cost (0),
812 vec_inside_cost (0),
813 vectorizable (false),
814 can_fully_mask_p (true),
815 fully_masked_p (false),
816 peeling_for_gaps (false),
817 peeling_for_niter (false),
818 no_data_dependencies (false),
819 has_mask_store (false),
820 scalar_loop_scaling (profile_probability::uninitialized ()),
821 scalar_loop (NULL),
822 orig_loop_info (NULL)
  /* CHECKME: We want to visit all BBs before their successors (except for
     latch blocks, for which this assertion wouldn't hold).  In the simple
     case of the loop forms we allow, a dfs order of the BBs would be the
     same as reversed postorder traversal, so we are safe.  */
829 unsigned int nbbs = dfs_enumerate_from (loop->header, 0, bb_in_loop_p,
830 bbs, loop->num_nodes, loop);
831 gcc_assert (nbbs == loop->num_nodes);
833 for (unsigned int i = 0; i < nbbs; i++)
835 basic_block bb = bbs[i];
836 gimple_stmt_iterator si;
838 for (si = gsi_start_phis (bb); !gsi_end_p (si); gsi_next (&si))
840 gimple *phi = gsi_stmt (si);
841 gimple_set_uid (phi, 0);
842 add_stmt (phi);
845 for (si = gsi_start_bb (bb); !gsi_end_p (si); gsi_next (&si))
847 gimple *stmt = gsi_stmt (si);
848 gimple_set_uid (stmt, 0);
849 add_stmt (stmt);
	  /* If the .GOMP_SIMD_LANE call for the current loop has 3 arguments,
	     the third argument is the #pragma omp simd if (x) condition: when
	     it is 0, the loop shouldn't be vectorized; when it is a non-zero
	     constant, it should be vectorized normally; otherwise the loop is
	     versioned, with the vectorized copy used if the condition is
	     non-zero at runtime.  (See the illustrative example after this
	     constructor.)  */
855 if (loop_in->simduid
856 && is_gimple_call (stmt)
857 && gimple_call_internal_p (stmt)
858 && gimple_call_internal_fn (stmt) == IFN_GOMP_SIMD_LANE
859 && gimple_call_num_args (stmt) >= 3
860 && TREE_CODE (gimple_call_arg (stmt, 0)) == SSA_NAME
861 && (loop_in->simduid
862 == SSA_NAME_VAR (gimple_call_arg (stmt, 0))))
864 tree arg = gimple_call_arg (stmt, 2);
865 if (integer_zerop (arg) || TREE_CODE (arg) == SSA_NAME)
866 simd_if_cond = arg;
867 else
868 gcc_assert (integer_nonzerop (arg));
873 epilogue_vinfos.create (6);
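/* Illustrative sketch for exposition only (function and variable names are
   made up): the simd_if_cond handling above corresponds to user code such as

	void
	scale (int *a, int n, int use_simd)
	{
	#pragma omp simd if (use_simd)
	  for (int i = 0; i < n; i++)
	    a[i] *= 2;
	}

   compiled with -fopenmp or -fopenmp-simd.  "use_simd" is the condition that
   reaches the vectorizer as the third argument of the IFN_GOMP_SIMD_LANE
   call, and the loop is versioned so that the vectorized copy is used only
   when the value is non-zero at runtime.  */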
876 /* Free all levels of MASKS. */
878 void
879 release_vec_loop_masks (vec_loop_masks *masks)
881 rgroup_masks *rgm;
882 unsigned int i;
883 FOR_EACH_VEC_ELT (*masks, i, rgm)
884 rgm->masks.release ();
885 masks->release ();
888 /* Free all memory used by the _loop_vec_info, as well as all the
889 stmt_vec_info structs of all the stmts in the loop. */
891 _loop_vec_info::~_loop_vec_info ()
893 free (bbs);
895 release_vec_loop_masks (&masks);
896 delete ivexpr_map;
897 delete scan_map;
898 epilogue_vinfos.release ();
900 loop->aux = NULL;
903 /* Return an invariant or register for EXPR and emit necessary
904 computations in the LOOP_VINFO loop preheader. */
906 tree
907 cse_and_gimplify_to_preheader (loop_vec_info loop_vinfo, tree expr)
909 if (is_gimple_reg (expr)
910 || is_gimple_min_invariant (expr))
911 return expr;
913 if (! loop_vinfo->ivexpr_map)
914 loop_vinfo->ivexpr_map = new hash_map<tree_operand_hash, tree>;
915 tree &cached = loop_vinfo->ivexpr_map->get_or_insert (expr);
916 if (! cached)
918 gimple_seq stmts = NULL;
919 cached = force_gimple_operand (unshare_expr (expr),
920 &stmts, true, NULL_TREE);
921 if (stmts)
923 edge e = loop_preheader_edge (LOOP_VINFO_LOOP (loop_vinfo));
924 gsi_insert_seq_on_edge_immediate (e, stmts);
927 return cached;
930 /* Return true if we can use CMP_TYPE as the comparison type to produce
931 all masks required to mask LOOP_VINFO. */
933 static bool
934 can_produce_all_loop_masks_p (loop_vec_info loop_vinfo, tree cmp_type)
936 rgroup_masks *rgm;
937 unsigned int i;
938 FOR_EACH_VEC_ELT (LOOP_VINFO_MASKS (loop_vinfo), i, rgm)
939 if (rgm->mask_type != NULL_TREE
940 && !direct_internal_fn_supported_p (IFN_WHILE_ULT,
941 cmp_type, rgm->mask_type,
942 OPTIMIZE_FOR_SPEED))
943 return false;
944 return true;
947 /* Calculate the maximum number of scalars per iteration for every
948 rgroup in LOOP_VINFO. */
950 static unsigned int
951 vect_get_max_nscalars_per_iter (loop_vec_info loop_vinfo)
953 unsigned int res = 1;
954 unsigned int i;
955 rgroup_masks *rgm;
956 FOR_EACH_VEC_ELT (LOOP_VINFO_MASKS (loop_vinfo), i, rgm)
957 res = MAX (res, rgm->max_nscalars_per_iter);
958 return res;
961 /* Each statement in LOOP_VINFO can be masked where necessary. Check
962 whether we can actually generate the masks required. Return true if so,
963 storing the type of the scalar IV in LOOP_VINFO_MASK_COMPARE_TYPE. */
965 static bool
966 vect_verify_full_masking (loop_vec_info loop_vinfo)
968 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
969 unsigned int min_ni_width;
970 unsigned int max_nscalars_per_iter
971 = vect_get_max_nscalars_per_iter (loop_vinfo);
973 /* Use a normal loop if there are no statements that need masking.
974 This only happens in rare degenerate cases: it means that the loop
975 has no loads, no stores, and no live-out values. */
976 if (LOOP_VINFO_MASKS (loop_vinfo).is_empty ())
977 return false;
979 /* Get the maximum number of iterations that is representable
980 in the counter type. */
981 tree ni_type = TREE_TYPE (LOOP_VINFO_NITERSM1 (loop_vinfo));
982 widest_int max_ni = wi::to_widest (TYPE_MAX_VALUE (ni_type)) + 1;
984 /* Get a more refined estimate for the number of iterations. */
985 widest_int max_back_edges;
986 if (max_loop_iterations (loop, &max_back_edges))
987 max_ni = wi::smin (max_ni, max_back_edges + 1);
989 /* Account for rgroup masks, in which each bit is replicated N times. */
990 max_ni *= max_nscalars_per_iter;
992 /* Work out how many bits we need to represent the limit. */
993 min_ni_width = wi::min_precision (max_ni, UNSIGNED);
995 /* Find a scalar mode for which WHILE_ULT is supported. */
996 opt_scalar_int_mode cmp_mode_iter;
997 tree cmp_type = NULL_TREE;
998 tree iv_type = NULL_TREE;
999 widest_int iv_limit = vect_iv_limit_for_full_masking (loop_vinfo);
1000 unsigned int iv_precision = UINT_MAX;
1002 if (iv_limit != -1)
1003 iv_precision = wi::min_precision (iv_limit * max_nscalars_per_iter,
1004 UNSIGNED);
1006 FOR_EACH_MODE_IN_CLASS (cmp_mode_iter, MODE_INT)
1008 unsigned int cmp_bits = GET_MODE_BITSIZE (cmp_mode_iter.require ());
1009 if (cmp_bits >= min_ni_width
1010 && targetm.scalar_mode_supported_p (cmp_mode_iter.require ()))
1012 tree this_type = build_nonstandard_integer_type (cmp_bits, true);
1013 if (this_type
1014 && can_produce_all_loop_masks_p (loop_vinfo, this_type))
1016 /* Although we could stop as soon as we find a valid mode,
1017 there are at least two reasons why that's not always the
1018 best choice:
1020 - An IV that's Pmode or wider is more likely to be reusable
1021 in address calculations than an IV that's narrower than
1022 Pmode.
1024 - Doing the comparison in IV_PRECISION or wider allows
1025 a natural 0-based IV, whereas using a narrower comparison
1026 type requires mitigations against wrap-around.
1028 Conversely, if the IV limit is variable, doing the comparison
1029 in a wider type than the original type can introduce
1030 unnecessary extensions, so picking the widest valid mode
1031 is not always a good choice either.
1033 Here we prefer the first IV type that's Pmode or wider,
1034 and the first comparison type that's IV_PRECISION or wider.
1035 (The comparison type must be no wider than the IV type,
1036 to avoid extensions in the vector loop.)
1038 ??? We might want to try continuing beyond Pmode for ILP32
1039 targets if CMP_BITS < IV_PRECISION. */
1040 iv_type = this_type;
1041 if (!cmp_type || iv_precision > TYPE_PRECISION (cmp_type))
1042 cmp_type = this_type;
1043 if (cmp_bits >= GET_MODE_BITSIZE (Pmode))
1044 break;
1049 if (!cmp_type)
1050 return false;
1052 LOOP_VINFO_MASK_COMPARE_TYPE (loop_vinfo) = cmp_type;
1053 LOOP_VINFO_MASK_IV_TYPE (loop_vinfo) = iv_type;
1054 return true;
1057 /* Calculate the cost of one scalar iteration of the loop. */
1058 static void
1059 vect_compute_single_scalar_iteration_cost (loop_vec_info loop_vinfo)
1061 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1062 basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
1063 int nbbs = loop->num_nodes, factor;
1064 int innerloop_iters, i;
1066 DUMP_VECT_SCOPE ("vect_compute_single_scalar_iteration_cost");
1068 /* Gather costs for statements in the scalar loop. */
1070 /* FORNOW. */
1071 innerloop_iters = 1;
1072 if (loop->inner)
1073 innerloop_iters = 50; /* FIXME */
1075 for (i = 0; i < nbbs; i++)
1077 gimple_stmt_iterator si;
1078 basic_block bb = bbs[i];
1080 if (bb->loop_father == loop->inner)
1081 factor = innerloop_iters;
1082 else
1083 factor = 1;
1085 for (si = gsi_start_bb (bb); !gsi_end_p (si); gsi_next (&si))
1087 gimple *stmt = gsi_stmt (si);
1088 stmt_vec_info stmt_info = loop_vinfo->lookup_stmt (stmt);
1090 if (!is_gimple_assign (stmt) && !is_gimple_call (stmt))
1091 continue;
1093 /* Skip stmts that are not vectorized inside the loop. */
1094 stmt_vec_info vstmt_info = vect_stmt_to_vectorize (stmt_info);
1095 if (!STMT_VINFO_RELEVANT_P (vstmt_info)
1096 && (!STMT_VINFO_LIVE_P (vstmt_info)
1097 || !VECTORIZABLE_CYCLE_DEF
1098 (STMT_VINFO_DEF_TYPE (vstmt_info))))
1099 continue;
1101 vect_cost_for_stmt kind;
1102 if (STMT_VINFO_DATA_REF (stmt_info))
1104 if (DR_IS_READ (STMT_VINFO_DATA_REF (stmt_info)))
1105 kind = scalar_load;
1106 else
1107 kind = scalar_store;
1109 else if (vect_nop_conversion_p (stmt_info))
1110 continue;
1111 else
1112 kind = scalar_stmt;
1114 record_stmt_cost (&LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo),
1115 factor, kind, stmt_info, 0, vect_prologue);
1119 /* Now accumulate cost. */
1120 void *target_cost_data = init_cost (loop);
1121 stmt_info_for_cost *si;
1122 int j;
1123 FOR_EACH_VEC_ELT (LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo),
1124 j, si)
1125 (void) add_stmt_cost (target_cost_data, si->count,
1126 si->kind, si->stmt_info, si->misalign,
1127 vect_body);
1128 unsigned dummy, body_cost = 0;
1129 finish_cost (target_cost_data, &dummy, &body_cost, &dummy);
1130 destroy_cost_data (target_cost_data);
1131 LOOP_VINFO_SINGLE_SCALAR_ITERATION_COST (loop_vinfo) = body_cost;
/* Function vect_analyze_loop_form_1.

   Verify that certain CFG restrictions hold, including:
   - the loop has a pre-header
   - the loop has a single entry and exit
   - the loop exit condition is simple enough
   - the number of iterations can be analyzed, i.e., it is a countable loop.
     The niter may be analyzed under some assumptions.  */
1144 opt_result
1145 vect_analyze_loop_form_1 (class loop *loop, gcond **loop_cond,
1146 tree *assumptions, tree *number_of_iterationsm1,
1147 tree *number_of_iterations, gcond **inner_loop_cond)
1149 DUMP_VECT_SCOPE ("vect_analyze_loop_form");
1151 /* Different restrictions apply when we are considering an inner-most loop,
1152 vs. an outer (nested) loop.
1153 (FORNOW. May want to relax some of these restrictions in the future). */
1155 if (!loop->inner)
1157 /* Inner-most loop. We currently require that the number of BBs is
1158 exactly 2 (the header and latch). Vectorizable inner-most loops
1159 look like this:
1161 (pre-header)
1163 header <--------+
1164 | | |
1165 | +--> latch --+
1167 (exit-bb) */
1169 if (loop->num_nodes != 2)
1170 return opt_result::failure_at (vect_location,
1171 "not vectorized:"
1172 " control flow in loop.\n");
1174 if (empty_block_p (loop->header))
1175 return opt_result::failure_at (vect_location,
1176 "not vectorized: empty loop.\n");
1178 else
1180 class loop *innerloop = loop->inner;
1181 edge entryedge;
1183 /* Nested loop. We currently require that the loop is doubly-nested,
1184 contains a single inner loop, and the number of BBs is exactly 5.
1185 Vectorizable outer-loops look like this:
1187 (pre-header)
1189 header <---+
1191 inner-loop |
1193 tail ------+
1195 (exit-bb)
1197 The inner-loop has the properties expected of inner-most loops
1198 as described above. */
1200 if ((loop->inner)->inner || (loop->inner)->next)
1201 return opt_result::failure_at (vect_location,
1202 "not vectorized:"
1203 " multiple nested loops.\n");
1205 if (loop->num_nodes != 5)
1206 return opt_result::failure_at (vect_location,
1207 "not vectorized:"
1208 " control flow in loop.\n");
1210 entryedge = loop_preheader_edge (innerloop);
1211 if (entryedge->src != loop->header
1212 || !single_exit (innerloop)
1213 || single_exit (innerloop)->dest != EDGE_PRED (loop->latch, 0)->src)
1214 return opt_result::failure_at (vect_location,
1215 "not vectorized:"
1216 " unsupported outerloop form.\n");
1218 /* Analyze the inner-loop. */
1219 tree inner_niterm1, inner_niter, inner_assumptions;
1220 opt_result res
1221 = vect_analyze_loop_form_1 (loop->inner, inner_loop_cond,
1222 &inner_assumptions, &inner_niterm1,
1223 &inner_niter, NULL);
1224 if (!res)
1226 if (dump_enabled_p ())
1227 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1228 "not vectorized: Bad inner loop.\n");
1229 return res;
1232 /* Don't support analyzing niter under assumptions for inner
1233 loop. */
1234 if (!integer_onep (inner_assumptions))
1235 return opt_result::failure_at (vect_location,
1236 "not vectorized: Bad inner loop.\n");
1238 if (!expr_invariant_in_loop_p (loop, inner_niter))
1239 return opt_result::failure_at (vect_location,
1240 "not vectorized: inner-loop count not"
1241 " invariant.\n");
1243 if (dump_enabled_p ())
1244 dump_printf_loc (MSG_NOTE, vect_location,
1245 "Considering outer-loop vectorization.\n");
1248 if (!single_exit (loop))
1249 return opt_result::failure_at (vect_location,
1250 "not vectorized: multiple exits.\n");
1251 if (EDGE_COUNT (loop->header->preds) != 2)
1252 return opt_result::failure_at (vect_location,
1253 "not vectorized:"
1254 " too many incoming edges.\n");
1256 /* We assume that the loop exit condition is at the end of the loop. i.e,
1257 that the loop is represented as a do-while (with a proper if-guard
1258 before the loop if needed), where the loop header contains all the
1259 executable statements, and the latch is empty. */
1260 if (!empty_block_p (loop->latch)
1261 || !gimple_seq_empty_p (phi_nodes (loop->latch)))
1262 return opt_result::failure_at (vect_location,
1263 "not vectorized: latch block not empty.\n");
1265 /* Make sure the exit is not abnormal. */
1266 edge e = single_exit (loop);
1267 if (e->flags & EDGE_ABNORMAL)
1268 return opt_result::failure_at (vect_location,
1269 "not vectorized:"
1270 " abnormal loop exit edge.\n");
1272 *loop_cond = vect_get_loop_niters (loop, assumptions, number_of_iterations,
1273 number_of_iterationsm1);
1274 if (!*loop_cond)
1275 return opt_result::failure_at
1276 (vect_location,
1277 "not vectorized: complicated exit condition.\n");
1279 if (integer_zerop (*assumptions)
1280 || !*number_of_iterations
1281 || chrec_contains_undetermined (*number_of_iterations))
1282 return opt_result::failure_at
1283 (*loop_cond,
1284 "not vectorized: number of iterations cannot be computed.\n");
1286 if (integer_zerop (*number_of_iterations))
1287 return opt_result::failure_at
1288 (*loop_cond,
1289 "not vectorized: number of iterations = 0.\n");
1291 return opt_result::success ();
1294 /* Analyze LOOP form and return a loop_vec_info if it is of suitable form. */
1296 opt_loop_vec_info
1297 vect_analyze_loop_form (class loop *loop, vec_info_shared *shared)
1299 tree assumptions, number_of_iterations, number_of_iterationsm1;
1300 gcond *loop_cond, *inner_loop_cond = NULL;
1302 opt_result res
1303 = vect_analyze_loop_form_1 (loop, &loop_cond,
1304 &assumptions, &number_of_iterationsm1,
1305 &number_of_iterations, &inner_loop_cond);
1306 if (!res)
1307 return opt_loop_vec_info::propagate_failure (res);
1309 loop_vec_info loop_vinfo = new _loop_vec_info (loop, shared);
1310 LOOP_VINFO_NITERSM1 (loop_vinfo) = number_of_iterationsm1;
1311 LOOP_VINFO_NITERS (loop_vinfo) = number_of_iterations;
1312 LOOP_VINFO_NITERS_UNCHANGED (loop_vinfo) = number_of_iterations;
1313 if (!integer_onep (assumptions))
1315 /* We consider to vectorize this loop by versioning it under
1316 some assumptions. In order to do this, we need to clear
1317 existing information computed by scev and niter analyzer. */
1318 scev_reset_htab ();
1319 free_numbers_of_iterations_estimates (loop);
1320 /* Also set flag for this loop so that following scev and niter
1321 analysis are done under the assumptions. */
1322 loop_constraint_set (loop, LOOP_C_FINITE);
1323 /* Also record the assumptions for versioning. */
1324 LOOP_VINFO_NITERS_ASSUMPTIONS (loop_vinfo) = assumptions;
1327 if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
1329 if (dump_enabled_p ())
1331 dump_printf_loc (MSG_NOTE, vect_location,
1332 "Symbolic number of iterations is ");
1333 dump_generic_expr (MSG_NOTE, TDF_DETAILS, number_of_iterations);
1334 dump_printf (MSG_NOTE, "\n");
1338 stmt_vec_info loop_cond_info = loop_vinfo->lookup_stmt (loop_cond);
1339 STMT_VINFO_TYPE (loop_cond_info) = loop_exit_ctrl_vec_info_type;
1340 if (inner_loop_cond)
1342 stmt_vec_info inner_loop_cond_info
1343 = loop_vinfo->lookup_stmt (inner_loop_cond);
1344 STMT_VINFO_TYPE (inner_loop_cond_info) = loop_exit_ctrl_vec_info_type;
1347 gcc_assert (!loop->aux);
1348 loop->aux = loop_vinfo;
1349 return opt_loop_vec_info::success (loop_vinfo);
1354 /* Scan the loop stmts and dependent on whether there are any (non-)SLP
1355 statements update the vectorization factor. */
1357 static void
1358 vect_update_vf_for_slp (loop_vec_info loop_vinfo)
1360 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1361 basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
1362 int nbbs = loop->num_nodes;
1363 poly_uint64 vectorization_factor;
1364 int i;
1366 DUMP_VECT_SCOPE ("vect_update_vf_for_slp");
1368 vectorization_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
1369 gcc_assert (known_ne (vectorization_factor, 0U));
  /* If all the stmts in the loop can be SLPed, we perform only SLP, and
     the vectorization factor of the loop is the unrolling factor required
     by the SLP instances.  If that unrolling factor is 1, we say that we
     perform pure SLP on the loop - cross-iteration parallelism is not
     exploited.  */
1376 bool only_slp_in_loop = true;
1377 for (i = 0; i < nbbs; i++)
1379 basic_block bb = bbs[i];
1380 for (gphi_iterator si = gsi_start_phis (bb); !gsi_end_p (si);
1381 gsi_next (&si))
1383 stmt_vec_info stmt_info = loop_vinfo->lookup_stmt (si.phi ());
1384 if (!stmt_info)
1385 continue;
1386 if ((STMT_VINFO_RELEVANT_P (stmt_info)
1387 || VECTORIZABLE_CYCLE_DEF (STMT_VINFO_DEF_TYPE (stmt_info)))
1388 && !PURE_SLP_STMT (stmt_info))
1389 /* STMT needs both SLP and loop-based vectorization. */
1390 only_slp_in_loop = false;
1392 for (gimple_stmt_iterator si = gsi_start_bb (bb); !gsi_end_p (si);
1393 gsi_next (&si))
1395 stmt_vec_info stmt_info = loop_vinfo->lookup_stmt (gsi_stmt (si));
1396 stmt_info = vect_stmt_to_vectorize (stmt_info);
1397 if ((STMT_VINFO_RELEVANT_P (stmt_info)
1398 || VECTORIZABLE_CYCLE_DEF (STMT_VINFO_DEF_TYPE (stmt_info)))
1399 && !PURE_SLP_STMT (stmt_info))
1400 /* STMT needs both SLP and loop-based vectorization. */
1401 only_slp_in_loop = false;
1405 if (only_slp_in_loop)
1407 if (dump_enabled_p ())
1408 dump_printf_loc (MSG_NOTE, vect_location,
1409 "Loop contains only SLP stmts\n");
1410 vectorization_factor = LOOP_VINFO_SLP_UNROLLING_FACTOR (loop_vinfo);
1412 else
1414 if (dump_enabled_p ())
1415 dump_printf_loc (MSG_NOTE, vect_location,
1416 "Loop contains SLP and non-SLP stmts\n");
1417 /* Both the vectorization factor and unroll factor have the form
1418 GET_MODE_SIZE (loop_vinfo->vector_mode) * X for some rational X,
1419 so they must have a common multiple. */
1420 vectorization_factor
1421 = force_common_multiple (vectorization_factor,
1422 LOOP_VINFO_SLP_UNROLLING_FACTOR (loop_vinfo));
1425 LOOP_VINFO_VECT_FACTOR (loop_vinfo) = vectorization_factor;
1426 if (dump_enabled_p ())
1428 dump_printf_loc (MSG_NOTE, vect_location,
1429 "Updating vectorization factor to ");
1430 dump_dec (MSG_NOTE, vectorization_factor);
1431 dump_printf (MSG_NOTE, ".\n");
1435 /* Return true if STMT_INFO describes a double reduction phi and if
1436 the other phi in the reduction is also relevant for vectorization.
1437 This rejects cases such as:
1439 outer1:
1440 x_1 = PHI <x_3(outer2), ...>;
1443 inner:
1444 x_2 = ...;
1447 outer2:
1448 x_3 = PHI <x_2(inner)>;
1450 if nothing in x_2 or elsewhere makes x_1 relevant. */
1452 static bool
1453 vect_active_double_reduction_p (stmt_vec_info stmt_info)
1455 if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_double_reduction_def)
1456 return false;
1458 return STMT_VINFO_RELEVANT_P (STMT_VINFO_REDUC_DEF (stmt_info));
1461 /* Function vect_analyze_loop_operations.
1463 Scan the loop stmts and make sure they are all vectorizable. */
1465 static opt_result
1466 vect_analyze_loop_operations (loop_vec_info loop_vinfo)
1468 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1469 basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
1470 int nbbs = loop->num_nodes;
1471 int i;
1472 stmt_vec_info stmt_info;
1473 bool need_to_vectorize = false;
1474 bool ok;
1476 DUMP_VECT_SCOPE ("vect_analyze_loop_operations");
1478 auto_vec<stmt_info_for_cost> cost_vec;
1480 for (i = 0; i < nbbs; i++)
1482 basic_block bb = bbs[i];
1484 for (gphi_iterator si = gsi_start_phis (bb); !gsi_end_p (si);
1485 gsi_next (&si))
1487 gphi *phi = si.phi ();
1488 ok = true;
1490 stmt_info = loop_vinfo->lookup_stmt (phi);
1491 if (dump_enabled_p ())
1492 dump_printf_loc (MSG_NOTE, vect_location, "examining phi: %G", phi);
1493 if (virtual_operand_p (gimple_phi_result (phi)))
1494 continue;
1496 /* Inner-loop loop-closed exit phi in outer-loop vectorization
1497 (i.e., a phi in the tail of the outer-loop). */
1498 if (! is_loop_header_bb_p (bb))
	      /* FORNOW: we currently don't support the case that these phis
		 are not used in the outerloop (unless it is double reduction,
		 i.e., this phi is vect_reduction_def), because this case
		 would require us to actually do something here.  */
1504 if (STMT_VINFO_LIVE_P (stmt_info)
1505 && !vect_active_double_reduction_p (stmt_info))
1506 return opt_result::failure_at (phi,
1507 "Unsupported loop-closed phi"
1508 " in outer-loop.\n");
1510 /* If PHI is used in the outer loop, we check that its operand
1511 is defined in the inner loop. */
1512 if (STMT_VINFO_RELEVANT_P (stmt_info))
1514 tree phi_op;
1516 if (gimple_phi_num_args (phi) != 1)
1517 return opt_result::failure_at (phi, "unsupported phi");
1519 phi_op = PHI_ARG_DEF (phi, 0);
1520 stmt_vec_info op_def_info = loop_vinfo->lookup_def (phi_op);
1521 if (!op_def_info)
1522 return opt_result::failure_at (phi, "unsupported phi\n");
1524 if (STMT_VINFO_RELEVANT (op_def_info) != vect_used_in_outer
1525 && (STMT_VINFO_RELEVANT (op_def_info)
1526 != vect_used_in_outer_by_reduction))
1527 return opt_result::failure_at (phi, "unsupported phi\n");
1529 if ((STMT_VINFO_DEF_TYPE (stmt_info) == vect_internal_def
1530 || (STMT_VINFO_DEF_TYPE (stmt_info)
1531 == vect_double_reduction_def))
1532 && !vectorizable_lc_phi (stmt_info, NULL, NULL))
1533 return opt_result::failure_at (phi, "unsupported phi\n");
1536 continue;
1539 gcc_assert (stmt_info);
1541 if ((STMT_VINFO_RELEVANT (stmt_info) == vect_used_in_scope
1542 || STMT_VINFO_LIVE_P (stmt_info))
1543 && STMT_VINFO_DEF_TYPE (stmt_info) != vect_induction_def)
1544 /* A scalar-dependence cycle that we don't support. */
1545 return opt_result::failure_at (phi,
1546 "not vectorized:"
1547 " scalar dependence cycle.\n");
1549 if (STMT_VINFO_RELEVANT_P (stmt_info))
1551 need_to_vectorize = true;
1552 if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_induction_def
1553 && ! PURE_SLP_STMT (stmt_info))
1554 ok = vectorizable_induction (stmt_info, NULL, NULL, NULL,
1555 &cost_vec);
1556 else if ((STMT_VINFO_DEF_TYPE (stmt_info) == vect_reduction_def
1557 || (STMT_VINFO_DEF_TYPE (stmt_info)
1558 == vect_double_reduction_def)
1559 || STMT_VINFO_DEF_TYPE (stmt_info) == vect_nested_cycle)
1560 && ! PURE_SLP_STMT (stmt_info))
1561 ok = vectorizable_reduction (stmt_info, NULL, NULL, &cost_vec);
1564 /* SLP PHIs are tested by vect_slp_analyze_node_operations. */
1565 if (ok
1566 && STMT_VINFO_LIVE_P (stmt_info)
1567 && !PURE_SLP_STMT (stmt_info))
1568 ok = vectorizable_live_operation (stmt_info, NULL, NULL, NULL,
1569 -1, false, &cost_vec);
1571 if (!ok)
1572 return opt_result::failure_at (phi,
1573 "not vectorized: relevant phi not "
1574 "supported: %G",
1575 static_cast <gimple *> (phi));
1578 for (gimple_stmt_iterator si = gsi_start_bb (bb); !gsi_end_p (si);
1579 gsi_next (&si))
1581 gimple *stmt = gsi_stmt (si);
1582 if (!gimple_clobber_p (stmt))
1584 opt_result res
1585 = vect_analyze_stmt (loop_vinfo->lookup_stmt (stmt),
1586 &need_to_vectorize,
1587 NULL, NULL, &cost_vec);
1588 if (!res)
1589 return res;
1592 } /* bbs */
1594 add_stmt_costs (loop_vinfo->target_cost_data, &cost_vec);
1596 /* All operations in the loop are either irrelevant (deal with loop
1597 control, or dead), or only used outside the loop and can be moved
1598 out of the loop (e.g. invariants, inductions). The loop can be
1599 optimized away by scalar optimizations. We're better off not
1600 touching this loop. */
1601 if (!need_to_vectorize)
1603 if (dump_enabled_p ())
1604 dump_printf_loc (MSG_NOTE, vect_location,
1605 "All the computation can be taken out of the loop.\n");
1606 return opt_result::failure_at
1607 (vect_location,
1608 "not vectorized: redundant loop. no profit to vectorize.\n");
1611 return opt_result::success ();
1614 /* Analyze the cost of the loop described by LOOP_VINFO. Decide if it
1615 is worthwhile to vectorize. Return 1 if definitely yes, 0 if
1616 definitely no, or -1 if it's worth retrying. */
1618 static int
1619 vect_analyze_loop_costing (loop_vec_info loop_vinfo)
1621 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1622 unsigned int assumed_vf = vect_vf_for_cost (loop_vinfo);
1624 /* Only fully-masked loops can have iteration counts less than the
1625 vectorization factor. */
1626 if (!LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
1628 HOST_WIDE_INT max_niter;
1630 if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
1631 max_niter = LOOP_VINFO_INT_NITERS (loop_vinfo);
1632 else
1633 max_niter = max_stmt_executions_int (loop);
1635 if (max_niter != -1
1636 && (unsigned HOST_WIDE_INT) max_niter < assumed_vf)
1638 if (dump_enabled_p ())
1639 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1640 "not vectorized: iteration count smaller than "
1641 "vectorization factor.\n");
1642 return 0;
1646 int min_profitable_iters, min_profitable_estimate;
1647 vect_estimate_min_profitable_iters (loop_vinfo, &min_profitable_iters,
1648 &min_profitable_estimate);
1650 if (min_profitable_iters < 0)
1652 if (dump_enabled_p ())
1653 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1654 "not vectorized: vectorization not profitable.\n");
1655 if (dump_enabled_p ())
1656 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1657 "not vectorized: vector version will never be "
1658 "profitable.\n");
1659 return -1;
1662 int min_scalar_loop_bound = (param_min_vect_loop_bound
1663 * assumed_vf);
1665 /* Use the cost model only if it is more conservative than user specified
1666 threshold. */
1667 unsigned int th = (unsigned) MAX (min_scalar_loop_bound,
1668 min_profitable_iters);
1670 LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo) = th;
1672 if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
1673 && LOOP_VINFO_INT_NITERS (loop_vinfo) < th)
1675 if (dump_enabled_p ())
1676 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1677 "not vectorized: vectorization not profitable.\n");
1678 if (dump_enabled_p ())
1679 dump_printf_loc (MSG_NOTE, vect_location,
1680 "not vectorized: iteration count smaller than user "
1681 "specified loop bound parameter or minimum profitable "
1682 "iterations (whichever is more conservative).\n");
1683 return 0;
  /* The static profitability threshold min_profitable_estimate includes
     the cost of having to check at runtime whether the scalar loop
     should be used instead.  If it turns out that we don't need or want
     such a check, the threshold we should use for the static estimate
     is simply the point at which the vector loop becomes more profitable
     than the scalar loop.  */
1692 if (min_profitable_estimate > min_profitable_iters
1693 && !LOOP_REQUIRES_VERSIONING (loop_vinfo)
1694 && !LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo)
1695 && !LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo)
1696 && !vect_apply_runtime_profitability_check_p (loop_vinfo))
1698 if (dump_enabled_p ())
1699 dump_printf_loc (MSG_NOTE, vect_location, "no need for a runtime"
1700 " choice between the scalar and vector loops\n");
1701 min_profitable_estimate = min_profitable_iters;
1704 HOST_WIDE_INT estimated_niter;
1706 /* If we are vectorizing an epilogue then we know the maximum number of
1707 scalar iterations it will cover is at least one lower than the
1708 vectorization factor of the main loop. */
1709 if (LOOP_VINFO_EPILOGUE_P (loop_vinfo))
1710 estimated_niter
1711 = vect_vf_for_cost (LOOP_VINFO_ORIG_LOOP_INFO (loop_vinfo)) - 1;
1712 else
1714 estimated_niter = estimated_stmt_executions_int (loop);
1715 if (estimated_niter == -1)
1716 estimated_niter = likely_max_stmt_executions_int (loop);
1718 if (estimated_niter != -1
1719 && ((unsigned HOST_WIDE_INT) estimated_niter
1720 < MAX (th, (unsigned) min_profitable_estimate)))
1722 if (dump_enabled_p ())
1723 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1724 "not vectorized: estimated iteration count too "
1725 "small.\n");
1726 if (dump_enabled_p ())
1727 dump_printf_loc (MSG_NOTE, vect_location,
1728 "not vectorized: estimated iteration count smaller "
1729 "than specified loop bound parameter or minimum "
1730 "profitable iterations (whichever is more "
1731 "conservative).\n");
1732 return -1;
1735 return 1;
1738 static opt_result
1739 vect_get_datarefs_in_loop (loop_p loop, basic_block *bbs,
1740 vec<data_reference_p> *datarefs,
1741 unsigned int *n_stmts)
1743 *n_stmts = 0;
1744 for (unsigned i = 0; i < loop->num_nodes; i++)
1745 for (gimple_stmt_iterator gsi = gsi_start_bb (bbs[i]);
1746 !gsi_end_p (gsi); gsi_next (&gsi))
1748 gimple *stmt = gsi_stmt (gsi);
1749 if (is_gimple_debug (stmt))
1750 continue;
1751 ++(*n_stmts);
1752 opt_result res = vect_find_stmt_data_reference (loop, stmt, datarefs);
1753 if (!res)
1755 if (is_gimple_call (stmt) && loop->safelen)
1757 tree fndecl = gimple_call_fndecl (stmt), op;
1758 if (fndecl != NULL_TREE)
1760 cgraph_node *node = cgraph_node::get (fndecl);
1761 if (node != NULL && node->simd_clones != NULL)
1763 unsigned int j, n = gimple_call_num_args (stmt);
1764 for (j = 0; j < n; j++)
1766 op = gimple_call_arg (stmt, j);
1767 if (DECL_P (op)
1768 || (REFERENCE_CLASS_P (op)
1769 && get_base_address (op)))
1770 break;
1772 op = gimple_call_lhs (stmt);
1773 /* Ignore #pragma omp declare simd functions
1774 if they don't have data references in the
1775 call stmt itself. */
1776 if (j == n
1777 && !(op
1778 && (DECL_P (op)
1779 || (REFERENCE_CLASS_P (op)
1780 && get_base_address (op)))))
1781 continue;
1785 return res;
1787 /* If dependence analysis will give up due to the limit on the
1788 number of datarefs, stop here and fail fatally. */
1789 if (datarefs->length ()
1790 > (unsigned)param_loop_max_datarefs_for_datadeps)
1791 return opt_result::failure_at (stmt, "exceeded param "
1792 "loop-max-datarefs-for-datadeps\n");
1794 return opt_result::success ();
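/* A sketch of the kind of call the safelen/simd-clone exception above
   is meant for (the names below are hypothetical):

     #pragma omp declare simd
     extern int f (int);

     void g (int *a, int n)
     {
     #pragma omp simd
       for (int i = 0; i < n; i++)
	 a[i] = f (a[i]);
     }

   The call to f has no DECL or memory-reference operands in the call
   statement itself (the load and store of a[i] are separate stmts),
   so failing to find a data reference for the call is not fatal here.  */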
1797 /* Look for SLP-only access groups and turn each individual access into its own
1798 group. */
1799 static void
1800 vect_dissolve_slp_only_groups (loop_vec_info loop_vinfo)
1802 unsigned int i;
1803 struct data_reference *dr;
1805 DUMP_VECT_SCOPE ("vect_dissolve_slp_only_groups");
1807 vec<data_reference_p> datarefs = loop_vinfo->shared->datarefs;
1808 FOR_EACH_VEC_ELT (datarefs, i, dr)
1810 gcc_assert (DR_REF (dr));
1811 stmt_vec_info stmt_info = loop_vinfo->lookup_stmt (DR_STMT (dr));
1813 /* Check if the load is a part of an interleaving chain. */
1814 if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
1816 stmt_vec_info first_element = DR_GROUP_FIRST_ELEMENT (stmt_info);
1817 unsigned int group_size = DR_GROUP_SIZE (first_element);
1819 /* Check if this is an SLP-only group. */
1820 if (!STMT_SLP_TYPE (stmt_info)
1821 && STMT_VINFO_SLP_VECT_ONLY (first_element))
1823 /* Dissolve the group. */
1824 STMT_VINFO_SLP_VECT_ONLY (first_element) = false;
1826 stmt_vec_info vinfo = first_element;
1827 while (vinfo)
1829 stmt_vec_info next = DR_GROUP_NEXT_ELEMENT (vinfo);
1830 DR_GROUP_FIRST_ELEMENT (vinfo) = vinfo;
1831 DR_GROUP_NEXT_ELEMENT (vinfo) = NULL;
1832 DR_GROUP_SIZE (vinfo) = 1;
1833 if (STMT_VINFO_STRIDED_P (first_element))
1834 DR_GROUP_GAP (vinfo) = 0;
1835 else
1836 DR_GROUP_GAP (vinfo) = group_size - 1;
1837 vinfo = next;
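/* For example: if a load group of size 4 was created only for the sake
   of SLP (STMT_VINFO_SLP_VECT_ONLY) but its stmts did not end up being
   SLPed, the loop above splits it into four single-element groups, each
   its own DR_GROUP_FIRST_ELEMENT with DR_GROUP_SIZE 1 and, for
   non-strided accesses, DR_GROUP_GAP 3 so that the stride of the
   original interleaved access is still accounted for.  */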
1845 /* Decides whether we need to create an epilogue loop to handle
1846 remaining scalar iterations and sets PEELING_FOR_NITERS accordingly. */
1848 void
1849 determine_peel_for_niter (loop_vec_info loop_vinfo)
1851 LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo) = false;
1853 unsigned HOST_WIDE_INT const_vf;
1854 HOST_WIDE_INT max_niter
1855 = likely_max_stmt_executions_int (LOOP_VINFO_LOOP (loop_vinfo));
1857 unsigned th = LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo);
1858 if (!th && LOOP_VINFO_ORIG_LOOP_INFO (loop_vinfo))
1859 th = LOOP_VINFO_COST_MODEL_THRESHOLD (LOOP_VINFO_ORIG_LOOP_INFO
1860 (loop_vinfo));
1862 if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
1863 /* The main loop handles all iterations. */
1864 LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo) = false;
1865 else if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
1866 && LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) >= 0)
1868 /* Work out the (constant) number of iterations that need to be
1869 peeled for reasons other than niters. */
1870 unsigned int peel_niter = LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo);
1871 if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo))
1872 peel_niter += 1;
1873 if (!multiple_p (LOOP_VINFO_INT_NITERS (loop_vinfo) - peel_niter,
1874 LOOP_VINFO_VECT_FACTOR (loop_vinfo)))
1875 LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo) = true;
1877 else if (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo)
1878 /* ??? When peeling for gaps but not alignment, we could
1879 try to check whether the (variable) niters is known to be
1880 VF * N + 1. That's something of a niche case though. */
1881 || LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo)
1882 || !LOOP_VINFO_VECT_FACTOR (loop_vinfo).is_constant (&const_vf)
1883 || ((tree_ctz (LOOP_VINFO_NITERS (loop_vinfo))
1884 < (unsigned) exact_log2 (const_vf))
1885 /* In case of versioning, check if the maximum number of
1886 iterations is greater than th. If they are identical,
1887 the epilogue is unnecessary. */
1888 && (!LOOP_REQUIRES_VERSIONING (loop_vinfo)
1889 || ((unsigned HOST_WIDE_INT) max_niter
1890 > (th / const_vf) * const_vf))))
1891 LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo) = true;
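/* A worked example of the constant-niters case above, with illustrative
   numbers: for LOOP_VINFO_INT_NITERS == 100, no peeling for alignment,
   peeling for gaps (so peel_niter == 1) and a vectorization factor of 8,
   100 - 1 == 99 is not a multiple of 8, so PEELING_FOR_NITER is set and
   an epilogue loop will handle the leftover scalar iterations.  */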
1895 /* Function vect_analyze_loop_2.
1897 Apply a set of analyses on LOOP, and create a loop_vec_info struct
1898 for it. The different analyses will record information in the
1899 loop_vec_info struct. */
1900 static opt_result
1901 vect_analyze_loop_2 (loop_vec_info loop_vinfo, bool &fatal, unsigned *n_stmts)
1903 opt_result ok = opt_result::success ();
1904 int res;
1905 unsigned int max_vf = MAX_VECTORIZATION_FACTOR;
1906 poly_uint64 min_vf = 2;
1907 loop_vec_info orig_loop_vinfo = NULL;
1909 /* If we are dealing with an epilogue then orig_loop_vinfo points to the
1910 loop_vec_info of the first vectorized loop. */
1911 if (LOOP_VINFO_EPILOGUE_P (loop_vinfo))
1912 orig_loop_vinfo = LOOP_VINFO_ORIG_LOOP_INFO (loop_vinfo);
1913 else
1914 orig_loop_vinfo = loop_vinfo;
1915 gcc_assert (orig_loop_vinfo);
1917 /* The first group of checks is independent of the vector size. */
1918 fatal = true;
1920 if (LOOP_VINFO_SIMD_IF_COND (loop_vinfo)
1921 && integer_zerop (LOOP_VINFO_SIMD_IF_COND (loop_vinfo)))
1922 return opt_result::failure_at (vect_location,
1923 "not vectorized: simd if(0)\n");
1925 /* Find all data references in the loop (which correspond to vdefs/vuses)
1926 and analyze their evolution in the loop. */
1928 loop_p loop = LOOP_VINFO_LOOP (loop_vinfo);
1930 /* Gather the data references and count stmts in the loop. */
1931 if (!LOOP_VINFO_DATAREFS (loop_vinfo).exists ())
1933 opt_result res
1934 = vect_get_datarefs_in_loop (loop, LOOP_VINFO_BBS (loop_vinfo),
1935 &LOOP_VINFO_DATAREFS (loop_vinfo),
1936 n_stmts);
1937 if (!res)
1939 if (dump_enabled_p ())
1940 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1941 "not vectorized: loop contains function "
1942 "calls or data references that cannot "
1943 "be analyzed\n");
1944 return res;
1946 loop_vinfo->shared->save_datarefs ();
1948 else
1949 loop_vinfo->shared->check_datarefs ();
1951 /* Analyze the data references and also adjust the minimal
1952 vectorization factor according to the loads and stores. */
1954 ok = vect_analyze_data_refs (loop_vinfo, &min_vf, &fatal);
1955 if (!ok)
1957 if (dump_enabled_p ())
1958 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1959 "bad data references.\n");
1960 return ok;
1963 /* Classify all cross-iteration scalar data-flow cycles.
1964 Cross-iteration cycles caused by virtual phis are analyzed separately. */
1965 vect_analyze_scalar_cycles (loop_vinfo);
1967 vect_pattern_recog (loop_vinfo);
1969 vect_fixup_scalar_cycles_with_patterns (loop_vinfo);
1971 /* Analyze the access patterns of the data-refs in the loop (consecutive,
1972 complex, etc.). FORNOW: Only handle consecutive access pattern. */
1974 ok = vect_analyze_data_ref_accesses (loop_vinfo);
1975 if (!ok)
1977 if (dump_enabled_p ())
1978 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1979 "bad data access.\n");
1980 return ok;
1983 /* Data-flow analysis to detect stmts that do not need to be vectorized. */
1985 ok = vect_mark_stmts_to_be_vectorized (loop_vinfo, &fatal);
1986 if (!ok)
1988 if (dump_enabled_p ())
1989 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1990 "unexpected pattern.\n");
1991 return ok;
1994 /* From here on failures are not fatal, as the rest of the analysis below depends on the vector size in some way. */
1995 fatal = false;
1997 /* Analyze data dependences between the data-refs in the loop
1998 and adjust the maximum vectorization factor according to
1999 the dependences.
2000 FORNOW: fail at the first data dependence that we encounter. */
2002 ok = vect_analyze_data_ref_dependences (loop_vinfo, &max_vf);
2003 if (!ok)
2005 if (dump_enabled_p ())
2006 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2007 "bad data dependence.\n");
2008 return ok;
2010 if (max_vf != MAX_VECTORIZATION_FACTOR
2011 && maybe_lt (max_vf, min_vf))
2012 return opt_result::failure_at (vect_location, "bad data dependence.\n");
2013 LOOP_VINFO_MAX_VECT_FACTOR (loop_vinfo) = max_vf;
2015 ok = vect_determine_vectorization_factor (loop_vinfo);
2016 if (!ok)
2018 if (dump_enabled_p ())
2019 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2020 "can't determine vectorization factor.\n");
2021 return ok;
2023 if (max_vf != MAX_VECTORIZATION_FACTOR
2024 && maybe_lt (max_vf, LOOP_VINFO_VECT_FACTOR (loop_vinfo)))
2025 return opt_result::failure_at (vect_location, "bad data dependence.\n");
2027 /* Compute the scalar iteration cost. */
2028 vect_compute_single_scalar_iteration_cost (loop_vinfo);
2030 poly_uint64 saved_vectorization_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
2032 /* Check the SLP opportunities in the loop, analyze and build SLP trees. */
2033 ok = vect_analyze_slp (loop_vinfo, *n_stmts);
2034 if (!ok)
2035 return ok;
2037 /* If there are any SLP instances mark them as pure_slp. */
2038 bool slp = vect_make_slp_decision (loop_vinfo);
2039 if (slp)
2041 /* Find stmts that need to be both vectorized and SLPed. */
2042 vect_detect_hybrid_slp (loop_vinfo);
2044 /* Update the vectorization factor based on the SLP decision. */
2045 vect_update_vf_for_slp (loop_vinfo);
2048 bool saved_can_fully_mask_p = LOOP_VINFO_CAN_FULLY_MASK_P (loop_vinfo);
2050 /* We don't expect to have to roll back to anything other than an empty
2051 set of rgroups. */
2052 gcc_assert (LOOP_VINFO_MASKS (loop_vinfo).is_empty ());
2054 /* This is the point where we can re-start analysis with SLP forced off. */
2055 start_over:
2057 /* Now the vectorization factor is final. */
2058 poly_uint64 vectorization_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
2059 gcc_assert (known_ne (vectorization_factor, 0U));
2061 if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo) && dump_enabled_p ())
2063 dump_printf_loc (MSG_NOTE, vect_location,
2064 "vectorization_factor = ");
2065 dump_dec (MSG_NOTE, vectorization_factor);
2066 dump_printf (MSG_NOTE, ", niters = %wd\n",
2067 LOOP_VINFO_INT_NITERS (loop_vinfo));
2070 /* Analyze the alignment of the data-refs in the loop.
2071 Fail if a data reference is found that cannot be vectorized. */
2073 ok = vect_analyze_data_refs_alignment (loop_vinfo);
2074 if (!ok)
2076 if (dump_enabled_p ())
2077 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2078 "bad data alignment.\n");
2079 return ok;
2082 /* Prune the list of ddrs to be tested at run-time by versioning for alias.
2083 It is important to call pruning after vect_analyze_data_ref_accesses,
2084 since we use grouping information gathered by interleaving analysis. */
2085 ok = vect_prune_runtime_alias_test_list (loop_vinfo);
2086 if (!ok)
2087 return ok;
2089 /* Do not invoke vect_enhance_data_refs_alignment for epilogue
2090 vectorization, since we do not want to add extra peeling or
2091 add versioning for alignment. */
2092 if (!LOOP_VINFO_EPILOGUE_P (loop_vinfo))
2093 /* This pass will decide on using loop versioning and/or loop peeling in
2094 order to enhance the alignment of data references in the loop. */
2095 ok = vect_enhance_data_refs_alignment (loop_vinfo);
2096 else
2097 ok = vect_verify_datarefs_alignment (loop_vinfo);
2098 if (!ok)
2099 return ok;
2101 if (slp)
2103 /* Analyze operations in the SLP instances. Note this may
2104 remove unsupported SLP instances which makes the above
2105 SLP kind detection invalid. */
2106 unsigned old_size = LOOP_VINFO_SLP_INSTANCES (loop_vinfo).length ();
2107 vect_slp_analyze_operations (loop_vinfo);
2108 if (LOOP_VINFO_SLP_INSTANCES (loop_vinfo).length () != old_size)
2110 ok = opt_result::failure_at (vect_location,
2111 "unsupported SLP instances\n");
2112 goto again;
2116 /* Dissolve SLP-only groups. */
2117 vect_dissolve_slp_only_groups (loop_vinfo);
2119 /* Scan all the remaining operations in the loop that are not subject
2120 to SLP and make sure they are vectorizable. */
2121 ok = vect_analyze_loop_operations (loop_vinfo);
2122 if (!ok)
2124 if (dump_enabled_p ())
2125 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2126 "bad operation or unsupported loop bound.\n");
2127 return ok;
2130 /* Decide whether to use a fully-masked loop for this vectorization
2131 factor. */
2132 LOOP_VINFO_FULLY_MASKED_P (loop_vinfo)
2133 = (LOOP_VINFO_CAN_FULLY_MASK_P (loop_vinfo)
2134 && vect_verify_full_masking (loop_vinfo));
2135 if (dump_enabled_p ())
2137 if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
2138 dump_printf_loc (MSG_NOTE, vect_location,
2139 "using a fully-masked loop.\n");
2140 else
2141 dump_printf_loc (MSG_NOTE, vect_location,
2142 "not using a fully-masked loop.\n");
2145 /* If epilog loop is required because of data accesses with gaps,
2146 one additional iteration needs to be peeled. Check if there is
2147 enough iterations for vectorization. */
2148 if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo)
2149 && LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
2150 && !LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
2152 poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
2153 tree scalar_niters = LOOP_VINFO_NITERSM1 (loop_vinfo);
2155 if (known_lt (wi::to_widest (scalar_niters), vf))
2156 return opt_result::failure_at (vect_location,
2157 "loop has no enough iterations to"
2158 " support peeling for gaps.\n");
2161 /* If we're vectorizing an epilogue loop, we either need a fully-masked
2162 loop or a loop that has a lower VF than the main loop. */
2163 if (LOOP_VINFO_EPILOGUE_P (loop_vinfo)
2164 && !LOOP_VINFO_FULLY_MASKED_P (loop_vinfo)
2165 && maybe_ge (LOOP_VINFO_VECT_FACTOR (loop_vinfo),
2166 LOOP_VINFO_VECT_FACTOR (orig_loop_vinfo)))
2167 return opt_result::failure_at (vect_location,
2168 "Vectorization factor too high for"
2169 " epilogue loop.\n");
2171 /* Check the costings of the loop make vectorizing worthwhile. */
2172 res = vect_analyze_loop_costing (loop_vinfo);
2173 if (res < 0)
2175 ok = opt_result::failure_at (vect_location,
2176 "Loop costings may not be worthwhile.\n");
2177 goto again;
2179 if (!res)
2180 return opt_result::failure_at (vect_location,
2181 "Loop costings not worthwhile.\n");
2183 determine_peel_for_niter (loop_vinfo);
2184 /* If an epilogue loop is required make sure we can create one. */
2185 if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo)
2186 || LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo))
2188 if (dump_enabled_p ())
2189 dump_printf_loc (MSG_NOTE, vect_location, "epilog loop required\n");
2190 if (!vect_can_advance_ivs_p (loop_vinfo)
2191 || !slpeel_can_duplicate_loop_p (LOOP_VINFO_LOOP (loop_vinfo),
2192 single_exit (LOOP_VINFO_LOOP
2193 (loop_vinfo))))
2195 ok = opt_result::failure_at (vect_location,
2196 "not vectorized: can't create required "
2197 "epilog loop\n");
2198 goto again;
2202 /* During peeling, we need to check if the number of loop iterations is
2203 enough for both the peeled prolog loop and the vector loop. This check
2204 can be merged with the threshold check of loop versioning, so
2205 increase the threshold for this case if necessary.
2207 If we are analyzing an epilogue we still want to check what its
2208 versioning threshold would be. If we decide to vectorize the epilogues we
2209 will want to use the lowest versioning threshold of all epilogues and main
2210 loop. This will enable us to enter a vectorized epilogue even when
2211 versioning the loop. We can't simply check whether the epilogue requires
2212 versioning though since we may have skipped some versioning checks when
2213 analyzing the epilogue. For instance, checks for alias versioning will be
2214 skipped when dealing with epilogues as we assume we already checked them
2215 for the main loop. So instead we always check the 'orig_loop_vinfo'. */
2216 if (LOOP_REQUIRES_VERSIONING (orig_loop_vinfo))
2218 poly_uint64 niters_th = 0;
2219 unsigned int th = LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo);
2221 if (!vect_use_loop_mask_for_alignment_p (loop_vinfo))
2223 /* Niters for peeled prolog loop. */
2224 if (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) < 0)
2226 dr_vec_info *dr_info = LOOP_VINFO_UNALIGNED_DR (loop_vinfo);
2227 tree vectype = STMT_VINFO_VECTYPE (dr_info->stmt);
2228 niters_th += TYPE_VECTOR_SUBPARTS (vectype) - 1;
2230 else
2231 niters_th += LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo);
2234 /* Niters for at least one iteration of vectorized loop. */
2235 if (!LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
2236 niters_th += LOOP_VINFO_VECT_FACTOR (loop_vinfo);
2237 /* One additional iteration because of peeling for gap. */
2238 if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo))
2239 niters_th += 1;
2241 /* Use the same condition as vect_transform_loop to decide when to use
2242 the cost to determine a versioning threshold. */
2243 if (vect_apply_runtime_profitability_check_p (loop_vinfo)
2244 && ordered_p (th, niters_th))
2245 niters_th = ordered_max (poly_uint64 (th), niters_th);
2247 LOOP_VINFO_VERSIONING_THRESHOLD (loop_vinfo) = niters_th;
2250 gcc_assert (known_eq (vectorization_factor,
2251 LOOP_VINFO_VECT_FACTOR (loop_vinfo)));
2253 /* Ok to vectorize! */
2254 return opt_result::success ();
2256 again:
2257 /* Ensure that "ok" is false (with an opt_problem if dumping is enabled). */
2258 gcc_assert (!ok);
2260 /* Try again with SLP forced off but if we didn't do any SLP there is
2261 no point in re-trying. */
2262 if (!slp)
2263 return ok;
2265 /* If there are reduction chains re-trying will fail anyway. */
2266 if (! LOOP_VINFO_REDUCTION_CHAINS (loop_vinfo).is_empty ())
2267 return ok;
2269 /* Likewise if the grouped loads or stores in the SLP cannot be handled
2270 via interleaving or lane instructions. */
2271 slp_instance instance;
2272 slp_tree node;
2273 unsigned i, j;
2274 FOR_EACH_VEC_ELT (LOOP_VINFO_SLP_INSTANCES (loop_vinfo), i, instance)
2276 stmt_vec_info vinfo;
2277 vinfo = SLP_TREE_SCALAR_STMTS (SLP_INSTANCE_TREE (instance))[0];
2278 if (! STMT_VINFO_GROUPED_ACCESS (vinfo))
2279 continue;
2280 vinfo = DR_GROUP_FIRST_ELEMENT (vinfo);
2281 unsigned int size = DR_GROUP_SIZE (vinfo);
2282 tree vectype = STMT_VINFO_VECTYPE (vinfo);
2283 if (! vect_store_lanes_supported (vectype, size, false)
2284 && ! known_eq (TYPE_VECTOR_SUBPARTS (vectype), 1U)
2285 && ! vect_grouped_store_supported (vectype, size))
2286 return opt_result::failure_at (vinfo->stmt,
2287 "unsupported grouped store\n");
2288 FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (instance), j, node)
2290 vinfo = SLP_TREE_SCALAR_STMTS (node)[0];
2291 vinfo = DR_GROUP_FIRST_ELEMENT (vinfo);
2292 bool single_element_p = !DR_GROUP_NEXT_ELEMENT (vinfo);
2293 size = DR_GROUP_SIZE (vinfo);
2294 vectype = STMT_VINFO_VECTYPE (vinfo);
2295 if (! vect_load_lanes_supported (vectype, size, false)
2296 && ! vect_grouped_load_supported (vectype, single_element_p,
2297 size))
2298 return opt_result::failure_at (vinfo->stmt,
2299 "unsupported grouped load\n");
2303 if (dump_enabled_p ())
2304 dump_printf_loc (MSG_NOTE, vect_location,
2305 "re-trying with SLP disabled\n");
2307 /* Roll back state appropriately. No SLP this time. */
2308 slp = false;
2309 /* Restore vectorization factor as it were without SLP. */
2310 LOOP_VINFO_VECT_FACTOR (loop_vinfo) = saved_vectorization_factor;
2311 /* Free the SLP instances. */
2312 FOR_EACH_VEC_ELT (LOOP_VINFO_SLP_INSTANCES (loop_vinfo), j, instance)
2313 vect_free_slp_instance (instance, false);
2314 LOOP_VINFO_SLP_INSTANCES (loop_vinfo).release ();
2315 /* Reset SLP type to loop_vect on all stmts. */
2316 for (i = 0; i < LOOP_VINFO_LOOP (loop_vinfo)->num_nodes; ++i)
2318 basic_block bb = LOOP_VINFO_BBS (loop_vinfo)[i];
2319 for (gimple_stmt_iterator si = gsi_start_phis (bb);
2320 !gsi_end_p (si); gsi_next (&si))
2322 stmt_vec_info stmt_info = loop_vinfo->lookup_stmt (gsi_stmt (si));
2323 STMT_SLP_TYPE (stmt_info) = loop_vect;
2324 if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_reduction_def
2325 || STMT_VINFO_DEF_TYPE (stmt_info) == vect_double_reduction_def)
2327 /* vectorizable_reduction adjusts reduction stmt def-types,
2328 restore them to that of the PHI. */
2329 STMT_VINFO_DEF_TYPE (STMT_VINFO_REDUC_DEF (stmt_info))
2330 = STMT_VINFO_DEF_TYPE (stmt_info);
2331 STMT_VINFO_DEF_TYPE (vect_stmt_to_vectorize
2332 (STMT_VINFO_REDUC_DEF (stmt_info)))
2333 = STMT_VINFO_DEF_TYPE (stmt_info);
2336 for (gimple_stmt_iterator si = gsi_start_bb (bb);
2337 !gsi_end_p (si); gsi_next (&si))
2339 stmt_vec_info stmt_info = loop_vinfo->lookup_stmt (gsi_stmt (si));
2340 STMT_SLP_TYPE (stmt_info) = loop_vect;
2341 if (STMT_VINFO_IN_PATTERN_P (stmt_info))
2343 gimple *pattern_def_seq = STMT_VINFO_PATTERN_DEF_SEQ (stmt_info);
2344 stmt_info = STMT_VINFO_RELATED_STMT (stmt_info);
2345 STMT_SLP_TYPE (stmt_info) = loop_vect;
2346 for (gimple_stmt_iterator pi = gsi_start (pattern_def_seq);
2347 !gsi_end_p (pi); gsi_next (&pi))
2348 STMT_SLP_TYPE (loop_vinfo->lookup_stmt (gsi_stmt (pi)))
2349 = loop_vect;
2353 /* Free optimized alias test DDRS. */
2354 LOOP_VINFO_LOWER_BOUNDS (loop_vinfo).truncate (0);
2355 LOOP_VINFO_COMP_ALIAS_DDRS (loop_vinfo).release ();
2356 LOOP_VINFO_CHECK_UNEQUAL_ADDRS (loop_vinfo).release ();
2357 /* Reset target cost data. */
2358 destroy_cost_data (LOOP_VINFO_TARGET_COST_DATA (loop_vinfo));
2359 LOOP_VINFO_TARGET_COST_DATA (loop_vinfo)
2360 = init_cost (LOOP_VINFO_LOOP (loop_vinfo));
2361 /* Reset accumulated rgroup information. */
2362 release_vec_loop_masks (&LOOP_VINFO_MASKS (loop_vinfo));
2363 /* Reset assorted flags. */
2364 LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo) = false;
2365 LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo) = false;
2366 LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo) = 0;
2367 LOOP_VINFO_VERSIONING_THRESHOLD (loop_vinfo) = 0;
2368 LOOP_VINFO_CAN_FULLY_MASK_P (loop_vinfo) = saved_can_fully_mask_p;
2370 goto start_over;
2373 /* Return true if vectorizing a loop using NEW_LOOP_VINFO appears
2374 to be better than vectorizing it using OLD_LOOP_VINFO. Assume that
2375 OLD_LOOP_VINFO is better unless something specifically indicates
2376 otherwise.
2378 Note that this deliberately isn't a partial order. */
2380 static bool
2381 vect_better_loop_vinfo_p (loop_vec_info new_loop_vinfo,
2382 loop_vec_info old_loop_vinfo)
2384 struct loop *loop = LOOP_VINFO_LOOP (new_loop_vinfo);
2385 gcc_assert (LOOP_VINFO_LOOP (old_loop_vinfo) == loop);
2387 poly_int64 new_vf = LOOP_VINFO_VECT_FACTOR (new_loop_vinfo);
2388 poly_int64 old_vf = LOOP_VINFO_VECT_FACTOR (old_loop_vinfo);
2390 /* Always prefer a VF of loop->simdlen over any other VF. */
2391 if (loop->simdlen)
2393 bool new_simdlen_p = known_eq (new_vf, loop->simdlen);
2394 bool old_simdlen_p = known_eq (old_vf, loop->simdlen);
2395 if (new_simdlen_p != old_simdlen_p)
2396 return new_simdlen_p;
2399 /* Limit the VFs to what is likely to be the maximum number of iterations,
2400 to handle cases in which at least one loop_vinfo is fully-masked. */
2401 HOST_WIDE_INT estimated_max_niter = likely_max_stmt_executions_int (loop);
2402 if (estimated_max_niter != -1)
2404 if (known_le (estimated_max_niter, new_vf))
2405 new_vf = estimated_max_niter;
2406 if (known_le (estimated_max_niter, old_vf))
2407 old_vf = estimated_max_niter;
2410 /* Check whether the (fractional) cost per scalar iteration is lower
2411 or higher: new_inside_cost / new_vf vs. old_inside_cost / old_vf. */
2412 poly_widest_int rel_new = (new_loop_vinfo->vec_inside_cost
2413 * poly_widest_int (old_vf));
2414 poly_widest_int rel_old = (old_loop_vinfo->vec_inside_cost
2415 * poly_widest_int (new_vf));
2416 if (maybe_lt (rel_old, rel_new))
2417 return false;
2418 if (known_lt (rel_new, rel_old))
2419 return true;
2421 /* If there's nothing to choose between the loop bodies, see whether
2422 there's a difference in the prologue and epilogue costs. */
2423 if (new_loop_vinfo->vec_outside_cost != old_loop_vinfo->vec_outside_cost)
2424 return new_loop_vinfo->vec_outside_cost < old_loop_vinfo->vec_outside_cost;
2426 return false;
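/* A worked example of the cross-multiplied comparison above, with
   made-up costs: for new_inside_cost == 10 at new_vf == 8 versus
   old_inside_cost == 6 at old_vf == 4, rel_new == 10 * 4 == 40 and
   rel_old == 6 * 8 == 48.  Since 40 < 48 the new loop body is cheaper
   per scalar iteration (10/8 < 6/4), so the new loop_vinfo is
   preferred.  */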
2429 /* Decide whether to replace OLD_LOOP_VINFO with NEW_LOOP_VINFO. Return
2430 true if we should. */
2432 static bool
2433 vect_joust_loop_vinfos (loop_vec_info new_loop_vinfo,
2434 loop_vec_info old_loop_vinfo)
2436 if (!vect_better_loop_vinfo_p (new_loop_vinfo, old_loop_vinfo))
2437 return false;
2439 if (dump_enabled_p ())
2440 dump_printf_loc (MSG_NOTE, vect_location,
2441 "***** Preferring vector mode %s to vector mode %s\n",
2442 GET_MODE_NAME (new_loop_vinfo->vector_mode),
2443 GET_MODE_NAME (old_loop_vinfo->vector_mode));
2444 return true;
2447 /* Function vect_analyze_loop.
2449 Apply a set of analyses on LOOP, and create a loop_vec_info struct
2450 for it. The different analyses will record information in the
2451 loop_vec_info struct. */
2452 opt_loop_vec_info
2453 vect_analyze_loop (class loop *loop, vec_info_shared *shared)
2455 auto_vector_modes vector_modes;
2457 /* Autodetect first vector size we try. */
2458 unsigned int autovec_flags
2459 = targetm.vectorize.autovectorize_vector_modes (&vector_modes,
2460 loop->simdlen != 0);
2461 unsigned int mode_i = 0;
2463 DUMP_VECT_SCOPE ("analyze_loop_nest");
2465 if (loop_outer (loop)
2466 && loop_vec_info_for_loop (loop_outer (loop))
2467 && LOOP_VINFO_VECTORIZABLE_P (loop_vec_info_for_loop (loop_outer (loop))))
2468 return opt_loop_vec_info::failure_at (vect_location,
2469 "outer-loop already vectorized.\n");
2471 if (!find_loop_nest (loop, &shared->loop_nest))
2472 return opt_loop_vec_info::failure_at
2473 (vect_location,
2474 "not vectorized: loop nest containing two or more consecutive inner"
2475 " loops cannot be vectorized\n");
2477 unsigned n_stmts = 0;
2478 machine_mode autodetected_vector_mode = VOIDmode;
2479 opt_loop_vec_info first_loop_vinfo = opt_loop_vec_info::success (NULL);
2480 machine_mode next_vector_mode = VOIDmode;
2481 poly_uint64 lowest_th = 0;
2482 unsigned vectorized_loops = 0;
2483 bool pick_lowest_cost_p = ((autovec_flags & VECT_COMPARE_COSTS)
2484 && !unlimited_cost_model (loop));
2486 bool vect_epilogues = false;
2487 opt_result res = opt_result::success ();
2488 unsigned HOST_WIDE_INT simdlen = loop->simdlen;
2489 while (1)
2491 /* Check the CFG characteristics of the loop (nesting, entry/exit). */
2492 opt_loop_vec_info loop_vinfo = vect_analyze_loop_form (loop, shared);
2493 if (!loop_vinfo)
2495 if (dump_enabled_p ())
2496 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2497 "bad loop form.\n");
2498 gcc_checking_assert (first_loop_vinfo == NULL);
2499 return loop_vinfo;
2501 loop_vinfo->vector_mode = next_vector_mode;
2503 bool fatal = false;
2505 /* When pick_lowest_cost_p is true, we should in principle iterate
2506 over all the loop_vec_infos that LOOP_VINFO could replace and
2507 try to vectorize LOOP_VINFO under the same conditions.
2508 E.g. when trying to replace an epilogue loop, we should vectorize
2509 LOOP_VINFO as an epilogue loop with the same VF limit. When trying
2510 to replace the main loop, we should vectorize LOOP_VINFO as a main
2511 loop too.
2513 However, autovectorize_vector_modes is usually sorted as follows:
2515 - Modes that naturally produce lower VFs usually follow modes that
2516 naturally produce higher VFs.
2518 - When modes naturally produce the same VF, maskable modes
2519 usually follow unmaskable ones, so that the maskable mode
2520 can be used to vectorize the epilogue of the unmaskable mode.
2522 This order is preferred because it leads to the maximum
2523 epilogue vectorization opportunities. Targets should only use
2524 a different order if they want to make wide modes available while
2525 disparaging them relative to earlier, smaller modes. The assumption
2526 in that case is that the wider modes are more expensive in some
2527 way that isn't reflected directly in the costs.
2529 There should therefore be few interesting cases in which
2530 LOOP_VINFO fails when treated as an epilogue loop, succeeds when
2531 treated as a standalone loop, and ends up being genuinely cheaper
2532 than FIRST_LOOP_VINFO. */
2533 if (vect_epilogues)
2534 LOOP_VINFO_ORIG_LOOP_INFO (loop_vinfo) = first_loop_vinfo;
2536 res = vect_analyze_loop_2 (loop_vinfo, fatal, &n_stmts);
2537 if (mode_i == 0)
2538 autodetected_vector_mode = loop_vinfo->vector_mode;
2539 if (dump_enabled_p ())
2541 if (res)
2542 dump_printf_loc (MSG_NOTE, vect_location,
2543 "***** Analysis succeeded with vector mode %s\n",
2544 GET_MODE_NAME (loop_vinfo->vector_mode));
2545 else
2546 dump_printf_loc (MSG_NOTE, vect_location,
2547 "***** Analysis failed with vector mode %s\n",
2548 GET_MODE_NAME (loop_vinfo->vector_mode));
2551 loop->aux = NULL;
2553 if (!fatal)
2554 while (mode_i < vector_modes.length ()
2555 && vect_chooses_same_modes_p (loop_vinfo, vector_modes[mode_i]))
2557 if (dump_enabled_p ())
2558 dump_printf_loc (MSG_NOTE, vect_location,
2559 "***** The result for vector mode %s would"
2560 " be the same\n",
2561 GET_MODE_NAME (vector_modes[mode_i]));
2562 mode_i += 1;
2565 if (res)
2567 LOOP_VINFO_VECTORIZABLE_P (loop_vinfo) = 1;
2568 vectorized_loops++;
2570 /* Once we hit the desired simdlen for the first time,
2571 discard any previous attempts. */
2572 if (simdlen
2573 && known_eq (LOOP_VINFO_VECT_FACTOR (loop_vinfo), simdlen))
2575 delete first_loop_vinfo;
2576 first_loop_vinfo = opt_loop_vec_info::success (NULL);
2577 LOOP_VINFO_ORIG_LOOP_INFO (loop_vinfo) = NULL;
2578 simdlen = 0;
2580 else if (pick_lowest_cost_p && first_loop_vinfo)
2582 /* Keep trying to roll back vectorization attempts while the
2583 loop_vec_infos they produced were worse than this one. */
2584 vec<loop_vec_info> &vinfos = first_loop_vinfo->epilogue_vinfos;
2585 while (!vinfos.is_empty ()
2586 && vect_joust_loop_vinfos (loop_vinfo, vinfos.last ()))
2588 gcc_assert (vect_epilogues);
2589 delete vinfos.pop ();
2591 if (vinfos.is_empty ()
2592 && vect_joust_loop_vinfos (loop_vinfo, first_loop_vinfo))
2594 delete first_loop_vinfo;
2595 first_loop_vinfo = opt_loop_vec_info::success (NULL);
2596 LOOP_VINFO_ORIG_LOOP_INFO (loop_vinfo) = NULL;
2600 if (first_loop_vinfo == NULL)
2602 first_loop_vinfo = loop_vinfo;
2603 lowest_th = LOOP_VINFO_VERSIONING_THRESHOLD (first_loop_vinfo);
2605 else if (vect_epilogues
2606 /* For now only allow one epilogue loop. */
2607 && first_loop_vinfo->epilogue_vinfos.is_empty ())
2609 first_loop_vinfo->epilogue_vinfos.safe_push (loop_vinfo);
2610 poly_uint64 th = LOOP_VINFO_VERSIONING_THRESHOLD (loop_vinfo);
2611 gcc_assert (!LOOP_REQUIRES_VERSIONING (loop_vinfo)
2612 || maybe_ne (lowest_th, 0U));
2613 /* Keep track of the known smallest versioning
2614 threshold. */
2615 if (ordered_p (lowest_th, th))
2616 lowest_th = ordered_min (lowest_th, th);
2618 else
2619 delete loop_vinfo;
2621 /* Only vectorize epilogues if PARAM_VECT_EPILOGUES_NOMASK is
2622 enabled, SIMDUID is not set, it is the innermost loop and we have
2623 either already found the loop's SIMDLEN or there was no SIMDLEN to
2624 begin with.
2625 TODO: Enable epilogue vectorization for loops with SIMDUID set. */
2626 vect_epilogues = (!simdlen
2627 && loop->inner == NULL
2628 && param_vect_epilogues_nomask
2629 && LOOP_VINFO_PEELING_FOR_NITER (first_loop_vinfo)
2630 && !loop->simduid
2631 /* For now only allow one epilogue loop, but allow
2632 pick_lowest_cost_p to replace it. */
2633 && (first_loop_vinfo->epilogue_vinfos.is_empty ()
2634 || pick_lowest_cost_p));
2636 /* Commit to first_loop_vinfo if we have no reason to try
2637 alternatives. */
2638 if (!simdlen && !vect_epilogues && !pick_lowest_cost_p)
2639 break;
2641 else
2643 delete loop_vinfo;
2644 if (fatal)
2646 gcc_checking_assert (first_loop_vinfo == NULL);
2647 break;
2651 if (mode_i < vector_modes.length ()
2652 && VECTOR_MODE_P (autodetected_vector_mode)
2653 && (related_vector_mode (vector_modes[mode_i],
2654 GET_MODE_INNER (autodetected_vector_mode))
2655 == autodetected_vector_mode)
2656 && (related_vector_mode (autodetected_vector_mode,
2657 GET_MODE_INNER (vector_modes[mode_i]))
2658 == vector_modes[mode_i]))
2660 if (dump_enabled_p ())
2661 dump_printf_loc (MSG_NOTE, vect_location,
2662 "***** Skipping vector mode %s, which would"
2663 " repeat the analysis for %s\n",
2664 GET_MODE_NAME (vector_modes[mode_i]),
2665 GET_MODE_NAME (autodetected_vector_mode));
2666 mode_i += 1;
2669 if (mode_i == vector_modes.length ()
2670 || autodetected_vector_mode == VOIDmode)
2671 break;
2673 /* Try the next biggest vector size. */
2674 next_vector_mode = vector_modes[mode_i++];
2675 if (dump_enabled_p ())
2676 dump_printf_loc (MSG_NOTE, vect_location,
2677 "***** Re-trying analysis with vector mode %s\n",
2678 GET_MODE_NAME (next_vector_mode));
2681 if (first_loop_vinfo)
2683 loop->aux = (loop_vec_info) first_loop_vinfo;
2684 if (dump_enabled_p ())
2685 dump_printf_loc (MSG_NOTE, vect_location,
2686 "***** Choosing vector mode %s\n",
2687 GET_MODE_NAME (first_loop_vinfo->vector_mode));
2688 LOOP_VINFO_VERSIONING_THRESHOLD (first_loop_vinfo) = lowest_th;
2689 return first_loop_vinfo;
2692 return opt_loop_vec_info::propagate_failure (res);
2695 /* Return true if there is an in-order reduction function for CODE, storing
2696 it in *REDUC_FN if so. */
2698 static bool
2699 fold_left_reduction_fn (tree_code code, internal_fn *reduc_fn)
2701 switch (code)
2703 case PLUS_EXPR:
2704 *reduc_fn = IFN_FOLD_LEFT_PLUS;
2705 return true;
2707 default:
2708 return false;
2712 /* Function reduction_fn_for_scalar_code
2714 Input:
2715 CODE - tree_code of a reduction operation.
2717 Output:
2718 REDUC_FN - the corresponding internal function to be used to reduce the
2719 vector of partial results into a single scalar result, or IFN_LAST
2720 if the operation is a supported reduction operation, but does not have
2721 such an internal function.
2723 Return FALSE if CODE currently cannot be vectorized as a reduction. */
2725 static bool
2726 reduction_fn_for_scalar_code (enum tree_code code, internal_fn *reduc_fn)
2728 switch (code)
2730 case MAX_EXPR:
2731 *reduc_fn = IFN_REDUC_MAX;
2732 return true;
2734 case MIN_EXPR:
2735 *reduc_fn = IFN_REDUC_MIN;
2736 return true;
2738 case PLUS_EXPR:
2739 *reduc_fn = IFN_REDUC_PLUS;
2740 return true;
2742 case BIT_AND_EXPR:
2743 *reduc_fn = IFN_REDUC_AND;
2744 return true;
2746 case BIT_IOR_EXPR:
2747 *reduc_fn = IFN_REDUC_IOR;
2748 return true;
2750 case BIT_XOR_EXPR:
2751 *reduc_fn = IFN_REDUC_XOR;
2752 return true;
2754 case MULT_EXPR:
2755 case MINUS_EXPR:
2756 *reduc_fn = IFN_LAST;
2757 return true;
2759 default:
2760 return false;
2764 /* If there is a neutral value X such that SLP reduction NODE would not
2765 be affected by the introduction of additional X elements, return that X,
2766 otherwise return null. CODE is the code of the reduction and VECTOR_TYPE
2767 is the vector type that would hold element X. REDUC_CHAIN is true if
2768 the SLP statements perform a single reduction, false if each statement
2769 performs an independent reduction. */
2771 static tree
2772 neutral_op_for_slp_reduction (slp_tree slp_node, tree vector_type,
2773 tree_code code, bool reduc_chain)
2775 vec<stmt_vec_info> stmts = SLP_TREE_SCALAR_STMTS (slp_node);
2776 stmt_vec_info stmt_vinfo = stmts[0];
2777 tree scalar_type = TREE_TYPE (vector_type);
2778 class loop *loop = gimple_bb (stmt_vinfo->stmt)->loop_father;
2779 gcc_assert (loop);
2781 switch (code)
2783 case WIDEN_SUM_EXPR:
2784 case DOT_PROD_EXPR:
2785 case SAD_EXPR:
2786 case PLUS_EXPR:
2787 case MINUS_EXPR:
2788 case BIT_IOR_EXPR:
2789 case BIT_XOR_EXPR:
2790 return build_zero_cst (scalar_type);
2792 case MULT_EXPR:
2793 return build_one_cst (scalar_type);
2795 case BIT_AND_EXPR:
2796 return build_all_ones_cst (scalar_type);
2798 case MAX_EXPR:
2799 case MIN_EXPR:
2800 /* For MIN/MAX the initial values are neutral. A reduction chain
2801 has only a single initial value, so that value is neutral for
2802 all statements. */
2803 if (reduc_chain)
2804 return PHI_ARG_DEF_FROM_EDGE (stmt_vinfo->stmt,
2805 loop_preheader_edge (loop));
2806 return NULL_TREE;
2808 default:
2809 return NULL_TREE;
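/* For instance: for a sum reduction the neutral value is 0, so padding
   a partially filled vector as { a, b, 0, 0 } leaves the reduction
   result a + b unchanged; for MULT_EXPR it is 1 and for BIT_AND_EXPR
   an all-ones value, since x & ~0 == x.  MIN and MAX have no such
   universal value, which is why the initial value itself is reused
   for reduction chains above.  */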
2813 /* Error reporting helper for vect_is_simple_reduction below. GIMPLE statement
2814 STMT is printed with a message MSG. */
2816 static void
2817 report_vect_op (dump_flags_t msg_type, gimple *stmt, const char *msg)
2819 dump_printf_loc (msg_type, vect_location, "%s%G", msg, stmt);
2822 /* Return true if we need an in-order reduction for operation CODE
2823 on type TYPE. */
2826 bool
2827 needs_fold_left_reduction_p (tree type, tree_code code)
2829 /* CHECKME: check for !flag_finite_math_only too? */
2830 if (SCALAR_FLOAT_TYPE_P (type))
2831 switch (code)
2833 case MIN_EXPR:
2834 case MAX_EXPR:
2835 return false;
2837 default:
2838 return !flag_associative_math;
2841 if (INTEGRAL_TYPE_P (type))
2843 if (!operation_no_trapping_overflow (type, code))
2844 return true;
2845 return false;
2848 if (SAT_FIXED_POINT_TYPE_P (type))
2849 return true;
2851 return false;
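/* As a concrete example of why the floating-point case above requires
   an in-order reduction by default: with doubles,
   (0.5 + 1e18) - 1e18 evaluates to 0.0 whereas 0.5 + (1e18 - 1e18)
   evaluates to 0.5, so reassociating an FP sum can change the result.
   Only under -fassociative-math (implied by -ffast-math) may an
   out-of-order reduction be used for such operations.  */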
2854 /* Return true if the reduction PHI in LOOP with latch arg LOOP_ARG
2855 has a handled computation expression. Store the main reduction
2856 operation in *CODE. */
2858 static bool
2859 check_reduction_path (dump_user_location_t loc, loop_p loop, gphi *phi,
2860 tree loop_arg, enum tree_code *code,
2861 vec<std::pair<ssa_op_iter, use_operand_p> > &path)
2863 auto_bitmap visited;
2864 tree lookfor = PHI_RESULT (phi);
2865 ssa_op_iter curri;
2866 use_operand_p curr = op_iter_init_phiuse (&curri, phi, SSA_OP_USE);
2867 while (USE_FROM_PTR (curr) != loop_arg)
2868 curr = op_iter_next_use (&curri);
2869 curri.i = curri.numops;
2872 path.safe_push (std::make_pair (curri, curr));
2873 tree use = USE_FROM_PTR (curr);
2874 if (use == lookfor)
2875 break;
2876 gimple *def = SSA_NAME_DEF_STMT (use);
2877 if (gimple_nop_p (def)
2878 || ! flow_bb_inside_loop_p (loop, gimple_bb (def)))
2880 pop:
2883 std::pair<ssa_op_iter, use_operand_p> x = path.pop ();
2884 curri = x.first;
2885 curr = x.second;
2887 curr = op_iter_next_use (&curri);
2888 /* Skip already visited or non-SSA operands (from iterating
2889 over PHI args). */
2890 while (curr != NULL_USE_OPERAND_P
2891 && (TREE_CODE (USE_FROM_PTR (curr)) != SSA_NAME
2892 || ! bitmap_set_bit (visited,
2893 SSA_NAME_VERSION
2894 (USE_FROM_PTR (curr)))));
2896 while (curr == NULL_USE_OPERAND_P && ! path.is_empty ());
2897 if (curr == NULL_USE_OPERAND_P)
2898 break;
2900 else
2902 if (gimple_code (def) == GIMPLE_PHI)
2903 curr = op_iter_init_phiuse (&curri, as_a <gphi *>(def), SSA_OP_USE);
2904 else
2905 curr = op_iter_init_use (&curri, def, SSA_OP_USE);
2906 while (curr != NULL_USE_OPERAND_P
2907 && (TREE_CODE (USE_FROM_PTR (curr)) != SSA_NAME
2908 || ! bitmap_set_bit (visited,
2909 SSA_NAME_VERSION
2910 (USE_FROM_PTR (curr)))))
2911 curr = op_iter_next_use (&curri);
2912 if (curr == NULL_USE_OPERAND_P)
2913 goto pop;
2916 while (1);
2917 if (dump_file && (dump_flags & TDF_DETAILS))
2919 dump_printf_loc (MSG_NOTE, loc, "reduction path: ");
2920 unsigned i;
2921 std::pair<ssa_op_iter, use_operand_p> *x;
2922 FOR_EACH_VEC_ELT (path, i, x)
2923 dump_printf (MSG_NOTE, "%T ", USE_FROM_PTR (x->second));
2924 dump_printf (MSG_NOTE, "\n");
2927 /* Check whether the reduction path detected is valid. */
2928 bool fail = path.length () == 0;
2929 bool neg = false;
2930 int sign = -1;
2931 *code = ERROR_MARK;
2932 for (unsigned i = 1; i < path.length (); ++i)
2934 gimple *use_stmt = USE_STMT (path[i].second);
2935 tree op = USE_FROM_PTR (path[i].second);
2936 if (! is_gimple_assign (use_stmt)
2937 /* The following makes sure we can compute the operand index
2938 easily, plus it mostly disallows chaining via COND_EXPR condition
2939 operands. */
2940 || (gimple_assign_rhs1_ptr (use_stmt) != path[i].second->use
2941 && (gimple_num_ops (use_stmt) <= 2
2942 || gimple_assign_rhs2_ptr (use_stmt) != path[i].second->use)
2943 && (gimple_num_ops (use_stmt) <= 3
2944 || gimple_assign_rhs3_ptr (use_stmt) != path[i].second->use)))
2946 fail = true;
2947 break;
2949 /* Check that the op is used in only a single stmt inside
2950 the loop. */
2951 imm_use_iterator imm_iter;
2952 gimple *op_use_stmt;
2953 unsigned cnt = 0;
2954 FOR_EACH_IMM_USE_STMT (op_use_stmt, imm_iter, op)
2955 if (!is_gimple_debug (op_use_stmt)
2956 && flow_bb_inside_loop_p (loop, gimple_bb (op_use_stmt)))
2958 /* We want to allow x + x but not x < 1 ? x : 2. */
2959 if (is_gimple_assign (op_use_stmt)
2960 && gimple_assign_rhs_code (op_use_stmt) == COND_EXPR)
2962 use_operand_p use_p;
2963 FOR_EACH_IMM_USE_ON_STMT (use_p, imm_iter)
2964 cnt++;
2966 else
2967 cnt++;
2969 if (cnt != 1)
2971 fail = true;
2972 break;
2974 tree_code use_code = gimple_assign_rhs_code (use_stmt);
2975 if (use_code == MINUS_EXPR)
2977 use_code = PLUS_EXPR;
2978 /* Track whether we negate the reduction value each iteration. */
2979 if (gimple_assign_rhs2 (use_stmt) == op)
2980 neg = ! neg;
2982 if (CONVERT_EXPR_CODE_P (use_code)
2983 && tree_nop_conversion_p (TREE_TYPE (gimple_assign_lhs (use_stmt)),
2984 TREE_TYPE (gimple_assign_rhs1 (use_stmt))))
2986 else if (*code == ERROR_MARK)
2988 *code = use_code;
2989 sign = TYPE_SIGN (TREE_TYPE (gimple_assign_lhs (use_stmt)));
2991 else if (use_code != *code)
2993 fail = true;
2994 break;
2996 else if ((use_code == MIN_EXPR
2997 || use_code == MAX_EXPR)
2998 && sign != TYPE_SIGN (TREE_TYPE (gimple_assign_lhs (use_stmt))))
3000 fail = true;
3001 break;
3004 return ! fail && ! neg && *code != ERROR_MARK;
3007 bool
3008 check_reduction_path (dump_user_location_t loc, loop_p loop, gphi *phi,
3009 tree loop_arg, enum tree_code code)
3011 auto_vec<std::pair<ssa_op_iter, use_operand_p> > path;
3012 enum tree_code code_;
3013 return (check_reduction_path (loc, loop, phi, loop_arg, &code_, path)
3014 && code_ == code);
3019 /* Function vect_is_simple_reduction
3021 (1) Detect a cross-iteration def-use cycle that represents a simple
3022 reduction computation. We look for the following pattern:
3024 loop_header:
3025 a1 = phi < a0, a2 >
3026 a3 = ...
3027 a2 = operation (a3, a1)
3031 a3 = ...
3032 loop_header:
3033 a1 = phi < a0, a2 >
3034 a2 = operation (a3, a1)
3036 such that:
3037 1. operation is commutative and associative and it is safe to
3038 change the order of the computation
3039 2. no uses for a2 in the loop (a2 is used out of the loop)
3040 3. no uses of a1 in the loop besides the reduction operation
3041 4. no uses of a1 outside the loop.
3043 Conditions 1,4 are tested here.
3044 Conditions 2,3 are tested in vect_mark_stmts_to_be_vectorized.
3046 (2) Detect a cross-iteration def-use cycle in nested loops, i.e.,
3047 nested cycles.
3049 (3) Detect cycles of phi nodes in outer-loop vectorization, i.e., double
3050 reductions:
3052 a1 = phi < a0, a2 >
3053 inner loop (def of a3)
3054 a2 = phi < a3 >
3056 (4) Detect condition expressions, i.e.:
3057 for (int i = 0; i < N; i++)
3058 if (a[i] < val)
3059 ret_val = a[i];
3063 static stmt_vec_info
3064 vect_is_simple_reduction (loop_vec_info loop_info, stmt_vec_info phi_info,
3065 bool *double_reduc, bool *reduc_chain_p)
3067 gphi *phi = as_a <gphi *> (phi_info->stmt);
3068 gimple *phi_use_stmt = NULL;
3069 imm_use_iterator imm_iter;
3070 use_operand_p use_p;
3072 *double_reduc = false;
3073 *reduc_chain_p = false;
3074 STMT_VINFO_REDUC_TYPE (phi_info) = TREE_CODE_REDUCTION;
3076 tree phi_name = PHI_RESULT (phi);
3077 /* ??? If there are no uses of the PHI result the inner loop reduction
3078 won't be detected as possibly double-reduction by vectorizable_reduction
3079 because that tries to walk the PHI arg from the preheader edge which
3080 can be constant. See PR60382. */
3081 if (has_zero_uses (phi_name))
3082 return NULL;
3083 class loop *loop = (gimple_bb (phi))->loop_father;
3084 unsigned nphi_def_loop_uses = 0;
3085 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, phi_name)
3087 gimple *use_stmt = USE_STMT (use_p);
3088 if (is_gimple_debug (use_stmt))
3089 continue;
3091 if (!flow_bb_inside_loop_p (loop, gimple_bb (use_stmt)))
3093 if (dump_enabled_p ())
3094 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3095 "intermediate value used outside loop.\n");
3097 return NULL;
3100 nphi_def_loop_uses++;
3101 phi_use_stmt = use_stmt;
3104 tree latch_def = PHI_ARG_DEF_FROM_EDGE (phi, loop_latch_edge (loop));
3105 if (TREE_CODE (latch_def) != SSA_NAME)
3107 if (dump_enabled_p ())
3108 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3109 "reduction: not ssa_name: %T\n", latch_def);
3110 return NULL;
3113 stmt_vec_info def_stmt_info = loop_info->lookup_def (latch_def);
3114 if (!def_stmt_info
3115 || !flow_bb_inside_loop_p (loop, gimple_bb (def_stmt_info->stmt)))
3116 return NULL;
3118 bool nested_in_vect_loop
3119 = flow_loop_nested_p (LOOP_VINFO_LOOP (loop_info), loop);
3120 unsigned nlatch_def_loop_uses = 0;
3121 auto_vec<gphi *, 3> lcphis;
3122 bool inner_loop_of_double_reduc = false;
3123 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, latch_def)
3125 gimple *use_stmt = USE_STMT (use_p);
3126 if (is_gimple_debug (use_stmt))
3127 continue;
3128 if (flow_bb_inside_loop_p (loop, gimple_bb (use_stmt)))
3129 nlatch_def_loop_uses++;
3130 else
3132 /* We can have more than one loop-closed PHI. */
3133 lcphis.safe_push (as_a <gphi *> (use_stmt));
3134 if (nested_in_vect_loop
3135 && (STMT_VINFO_DEF_TYPE (loop_info->lookup_stmt (use_stmt))
3136 == vect_double_reduction_def))
3137 inner_loop_of_double_reduc = true;
3141 /* If we are vectorizing an inner reduction we are executing that
3142 in the original order only if we are not dealing with a
3143 double reduction. */
3144 if (nested_in_vect_loop && !inner_loop_of_double_reduc)
3146 if (dump_enabled_p ())
3147 report_vect_op (MSG_NOTE, def_stmt_info->stmt,
3148 "detected nested cycle: ");
3149 return def_stmt_info;
3152 /* If this isn't a nested cycle or if the nested cycle reduction value
3153 is used outside of the inner loop we cannot handle uses of the reduction
3154 value. */
3155 if (nlatch_def_loop_uses > 1 || nphi_def_loop_uses > 1)
3157 if (dump_enabled_p ())
3158 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3159 "reduction used in loop.\n");
3160 return NULL;
3163 /* If DEF_STMT is a phi node itself, we expect it to have a single argument
3164 defined in the inner loop. */
3165 if (gphi *def_stmt = dyn_cast <gphi *> (def_stmt_info->stmt))
3167 tree op1 = PHI_ARG_DEF (def_stmt, 0);
3168 if (gimple_phi_num_args (def_stmt) != 1
3169 || TREE_CODE (op1) != SSA_NAME)
3171 if (dump_enabled_p ())
3172 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3173 "unsupported phi node definition.\n");
3175 return NULL;
3178 gimple *def1 = SSA_NAME_DEF_STMT (op1);
3179 if (gimple_bb (def1)
3180 && flow_bb_inside_loop_p (loop, gimple_bb (def_stmt))
3181 && loop->inner
3182 && flow_bb_inside_loop_p (loop->inner, gimple_bb (def1))
3183 && is_gimple_assign (def1)
3184 && is_a <gphi *> (phi_use_stmt)
3185 && flow_bb_inside_loop_p (loop->inner, gimple_bb (phi_use_stmt)))
3187 if (dump_enabled_p ())
3188 report_vect_op (MSG_NOTE, def_stmt,
3189 "detected double reduction: ");
3191 *double_reduc = true;
3192 return def_stmt_info;
3195 return NULL;
3198 /* Look for the expression computing latch_def from the loop PHI result. */
3199 auto_vec<std::pair<ssa_op_iter, use_operand_p> > path;
3200 enum tree_code code;
3201 if (check_reduction_path (vect_location, loop, phi, latch_def, &code,
3202 path))
3204 STMT_VINFO_REDUC_CODE (phi_info) = code;
3205 if (code == COND_EXPR && !nested_in_vect_loop)
3206 STMT_VINFO_REDUC_TYPE (phi_info) = COND_REDUCTION;
3208 /* Fill in STMT_VINFO_REDUC_IDX and gather stmts for an SLP
3209 reduction chain for which the additional restriction is that
3210 all operations in the chain are the same. */
3211 auto_vec<stmt_vec_info, 8> reduc_chain;
3212 unsigned i;
3213 bool is_slp_reduc = !nested_in_vect_loop && code != COND_EXPR;
3214 for (i = path.length () - 1; i >= 1; --i)
3216 gimple *stmt = USE_STMT (path[i].second);
3217 stmt_vec_info stmt_info = loop_info->lookup_stmt (stmt);
3218 STMT_VINFO_REDUC_IDX (stmt_info)
3219 = path[i].second->use - gimple_assign_rhs1_ptr (stmt);
3220 enum tree_code stmt_code = gimple_assign_rhs_code (stmt);
3221 bool leading_conversion = (CONVERT_EXPR_CODE_P (stmt_code)
3222 && (i == 1 || i == path.length () - 1));
3223 if ((stmt_code != code && !leading_conversion)
3224 /* We can only handle the final value in epilogue
3225 generation for reduction chains. */
3226 || (i != 1 && !has_single_use (gimple_assign_lhs (stmt))))
3227 is_slp_reduc = false;
3228 /* For reduction chains we support trailing/leading
3229 conversions. We do not store those in the actual chain. */
3230 if (leading_conversion)
3231 continue;
3232 reduc_chain.safe_push (stmt_info);
3234 if (is_slp_reduc && reduc_chain.length () > 1)
3236 for (unsigned i = 0; i < reduc_chain.length () - 1; ++i)
3238 REDUC_GROUP_FIRST_ELEMENT (reduc_chain[i]) = reduc_chain[0];
3239 REDUC_GROUP_NEXT_ELEMENT (reduc_chain[i]) = reduc_chain[i+1];
3241 REDUC_GROUP_FIRST_ELEMENT (reduc_chain.last ()) = reduc_chain[0];
3242 REDUC_GROUP_NEXT_ELEMENT (reduc_chain.last ()) = NULL;
3244 /* Save the chain for further analysis in SLP detection. */
3245 LOOP_VINFO_REDUCTION_CHAINS (loop_info).safe_push (reduc_chain[0]);
3246 REDUC_GROUP_SIZE (reduc_chain[0]) = reduc_chain.length ();
3248 *reduc_chain_p = true;
3249 if (dump_enabled_p ())
3250 dump_printf_loc (MSG_NOTE, vect_location,
3251 "reduction: detected reduction chain\n");
3253 else if (dump_enabled_p ())
3254 dump_printf_loc (MSG_NOTE, vect_location,
3255 "reduction: detected reduction\n");
3257 return def_stmt_info;
3260 if (dump_enabled_p ())
3261 dump_printf_loc (MSG_NOTE, vect_location,
3262 "reduction: unknown pattern\n");
3264 return NULL;
3267 /* Calculate cost of peeling the loop PEEL_ITERS_PROLOGUE times. */
3268 int
3269 vect_get_known_peeling_cost (loop_vec_info loop_vinfo, int peel_iters_prologue,
3270 int *peel_iters_epilogue,
3271 stmt_vector_for_cost *scalar_cost_vec,
3272 stmt_vector_for_cost *prologue_cost_vec,
3273 stmt_vector_for_cost *epilogue_cost_vec)
3275 int retval = 0;
3276 int assumed_vf = vect_vf_for_cost (loop_vinfo);
3278 if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
3280 *peel_iters_epilogue = assumed_vf / 2;
3281 if (dump_enabled_p ())
3282 dump_printf_loc (MSG_NOTE, vect_location,
3283 "cost model: epilogue peel iters set to vf/2 "
3284 "because loop iterations are unknown .\n");
3286 /* If peeled iterations are known but the number of scalar loop
3287 iterations is unknown, count a taken branch per peeled loop. */
3288 retval = record_stmt_cost (prologue_cost_vec, 1, cond_branch_taken,
3289 NULL, 0, vect_prologue);
3290 retval += record_stmt_cost (epilogue_cost_vec, 1, cond_branch_taken,
3291 NULL, 0, vect_epilogue);
3293 else
3295 int niters = LOOP_VINFO_INT_NITERS (loop_vinfo);
3296 peel_iters_prologue = niters < peel_iters_prologue ?
3297 niters : peel_iters_prologue;
3298 *peel_iters_epilogue = (niters - peel_iters_prologue) % assumed_vf;
3299 /* If we need to peel for gaps but the epilogue otherwise requires no
3300 peeling, we have to peel VF iterations. */
3301 if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo) && !*peel_iters_epilogue)
3302 *peel_iters_epilogue = assumed_vf;
3305 stmt_info_for_cost *si;
3306 int j;
3307 if (peel_iters_prologue)
3308 FOR_EACH_VEC_ELT (*scalar_cost_vec, j, si)
3309 retval += record_stmt_cost (prologue_cost_vec,
3310 si->count * peel_iters_prologue,
3311 si->kind, si->stmt_info, si->misalign,
3312 vect_prologue);
3313 if (*peel_iters_epilogue)
3314 FOR_EACH_VEC_ELT (*scalar_cost_vec, j, si)
3315 retval += record_stmt_cost (epilogue_cost_vec,
3316 si->count * *peel_iters_epilogue,
3317 si->kind, si->stmt_info, si->misalign,
3318 vect_epilogue);
3320 return retval;
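/* A worked example of the known-niters branch above, with illustrative
   numbers: for LOOP_VINFO_INT_NITERS == 100, an assumed VF of 4 and
   peel_iters_prologue == 3, the epilogue is costed at
   (100 - 3) % 4 == 1 iteration; if peeling for gaps were required and
   that remainder were 0, the epilogue would instead be costed at a
   full VF of 4 iterations.  The scalar single-iteration costs are then
   accumulated that many times into the two cost vectors.  */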
3323 /* Function vect_estimate_min_profitable_iters
3325 Return the number of iterations required for the vector version of the
3326 loop to be profitable relative to the cost of the scalar version of the
3327 loop.
3329 *RET_MIN_PROFITABLE_NITERS is a cost model profitability threshold
3330 of iterations for vectorization. -1 value means loop vectorization
3331 is not profitable. This returned value may be used for dynamic
3332 profitability check.
3334 *RET_MIN_PROFITABLE_ESTIMATE is a profitability threshold to be used
3335 for static check against estimated number of iterations. */
3337 static void
3338 vect_estimate_min_profitable_iters (loop_vec_info loop_vinfo,
3339 int *ret_min_profitable_niters,
3340 int *ret_min_profitable_estimate)
3342 int min_profitable_iters;
3343 int min_profitable_estimate;
3344 int peel_iters_prologue;
3345 int peel_iters_epilogue;
3346 unsigned vec_inside_cost = 0;
3347 int vec_outside_cost = 0;
3348 unsigned vec_prologue_cost = 0;
3349 unsigned vec_epilogue_cost = 0;
3350 int scalar_single_iter_cost = 0;
3351 int scalar_outside_cost = 0;
3352 int assumed_vf = vect_vf_for_cost (loop_vinfo);
3353 int npeel = LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo);
3354 void *target_cost_data = LOOP_VINFO_TARGET_COST_DATA (loop_vinfo);
3356 /* Cost model disabled. */
3357 if (unlimited_cost_model (LOOP_VINFO_LOOP (loop_vinfo)))
3359 if (dump_enabled_p ())
3360 dump_printf_loc (MSG_NOTE, vect_location, "cost model disabled.\n");
3361 *ret_min_profitable_niters = 0;
3362 *ret_min_profitable_estimate = 0;
3363 return;
3366 /* Requires loop versioning tests to handle misalignment. */
3367 if (LOOP_REQUIRES_VERSIONING_FOR_ALIGNMENT (loop_vinfo))
3369 /* FIXME: Make cost depend on complexity of individual check. */
3370 unsigned len = LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).length ();
3371 (void) add_stmt_cost (target_cost_data, len, vector_stmt, NULL, 0,
3372 vect_prologue);
3373 if (dump_enabled_p ())
3374 dump_printf (MSG_NOTE,
3375 "cost model: Adding cost of checks for loop "
3376 "versioning to treat misalignment.\n");
3379 /* Requires loop versioning with alias checks. */
3380 if (LOOP_REQUIRES_VERSIONING_FOR_ALIAS (loop_vinfo))
3382 /* FIXME: Make cost depend on complexity of individual check. */
3383 unsigned len = LOOP_VINFO_COMP_ALIAS_DDRS (loop_vinfo).length ();
3384 (void) add_stmt_cost (target_cost_data, len, vector_stmt, NULL, 0,
3385 vect_prologue);
3386 len = LOOP_VINFO_CHECK_UNEQUAL_ADDRS (loop_vinfo).length ();
3387 if (len)
3388 /* Count LEN - 1 ANDs and LEN comparisons. */
3389 (void) add_stmt_cost (target_cost_data, len * 2 - 1, scalar_stmt,
3390 NULL, 0, vect_prologue);
3391 len = LOOP_VINFO_LOWER_BOUNDS (loop_vinfo).length ();
3392 if (len)
3394 /* Count LEN - 1 ANDs and LEN comparisons. */
3395 unsigned int nstmts = len * 2 - 1;
3396 /* +1 for each bias that needs adding. */
3397 for (unsigned int i = 0; i < len; ++i)
3398 if (!LOOP_VINFO_LOWER_BOUNDS (loop_vinfo)[i].unsigned_p)
3399 nstmts += 1;
3400 (void) add_stmt_cost (target_cost_data, nstmts, scalar_stmt,
3401 NULL, 0, vect_prologue);
3403 if (dump_enabled_p ())
3404 dump_printf (MSG_NOTE,
3405 "cost model: Adding cost of checks for loop "
3406 "versioning aliasing.\n");
3409 /* Requires loop versioning with niter checks. */
3410 if (LOOP_REQUIRES_VERSIONING_FOR_NITERS (loop_vinfo))
3412 /* FIXME: Make cost depend on complexity of individual check. */
3413 (void) add_stmt_cost (target_cost_data, 1, vector_stmt, NULL, 0,
3414 vect_prologue);
3415 if (dump_enabled_p ())
3416 dump_printf (MSG_NOTE,
3417 "cost model: Adding cost of checks for loop "
3418 "versioning niters.\n");
3421 if (LOOP_REQUIRES_VERSIONING (loop_vinfo))
3422 (void) add_stmt_cost (target_cost_data, 1, cond_branch_taken, NULL, 0,
3423 vect_prologue);
3425 /* Count statements in scalar loop. Using this as scalar cost for a single
3426 iteration for now.
3428 TODO: Add outer loop support.
3430 TODO: Consider assigning different costs to different scalar
3431 statements. */
3433 scalar_single_iter_cost
3434 = LOOP_VINFO_SINGLE_SCALAR_ITERATION_COST (loop_vinfo);
3436 /* Add additional cost for the peeled instructions in prologue and epilogue
3437 loop. (For fully-masked loops there will be no peeling.)
3439 FORNOW: If we don't know the value of peel_iters for prologue or epilogue
3440 at compile time, we assume it's vf/2 (the worst case would be vf-1).
3442 TODO: Build an expression that represents peel_iters for prologue and
3443 epilogue to be used in a run-time test. */
3445 if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
3447 peel_iters_prologue = 0;
3448 peel_iters_epilogue = 0;
3450 if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo))
3452 /* We need to peel exactly one iteration. */
3453 peel_iters_epilogue += 1;
3454 stmt_info_for_cost *si;
3455 int j;
3456 FOR_EACH_VEC_ELT (LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo),
3457 j, si)
3458 (void) add_stmt_cost (target_cost_data, si->count,
3459 si->kind, si->stmt_info, si->misalign,
3460 vect_epilogue);
3463 /* Calculate how many masks we need to generate. */
3464 unsigned int num_masks = 0;
3465 rgroup_masks *rgm;
3466 unsigned int num_vectors_m1;
3467 FOR_EACH_VEC_ELT (LOOP_VINFO_MASKS (loop_vinfo), num_vectors_m1, rgm)
3468 if (rgm->mask_type)
3469 num_masks += num_vectors_m1 + 1;
3470 gcc_assert (num_masks > 0);
3472 /* In the worst case, we need to generate each mask in the prologue
3473 and in the loop body. One of the loop body mask instructions
3474 replaces the comparison in the scalar loop, and since we don't
3475 count the scalar comparison against the scalar body, we shouldn't
3476 count that vector instruction against the vector body either.
3478 Sometimes we can use unpacks instead of generating prologue
3479 masks and sometimes the prologue mask will fold to a constant,
3480 so the actual prologue cost might be smaller. However, it's
3481 simpler and safer to use the worst-case cost; if this ends up
3482 being the tie-breaker between vectorizing or not, then it's
3483 probably better not to vectorize. */
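      /* Illustrative example (editorial note, hypothetical numbers): with
	 two rgroups that have a mask type, needing 1 and 2 mask vectors
	 respectively, NUM_MASKS = 1 + 2 = 3.  The statements below then
	 charge 3 vector stmts to the prologue and NUM_MASKS - 1 = 2 to the
	 body, reflecting the worst case described above.  */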
3484 (void) add_stmt_cost (target_cost_data, num_masks, vector_stmt,
3485 NULL, 0, vect_prologue);
3486 (void) add_stmt_cost (target_cost_data, num_masks - 1, vector_stmt,
3487 NULL, 0, vect_body);
3489 else if (npeel < 0)
3491 peel_iters_prologue = assumed_vf / 2;
3492 if (dump_enabled_p ())
3493 dump_printf (MSG_NOTE, "cost model: "
3494 "prologue peel iters set to vf/2.\n");
 3496       /* If peeling for alignment is unknown, the loop bound of the main loop
 3497          becomes unknown.  */
3498 peel_iters_epilogue = assumed_vf / 2;
3499 if (dump_enabled_p ())
3500 dump_printf (MSG_NOTE, "cost model: "
3501 "epilogue peel iters set to vf/2 because "
3502 "peeling for alignment is unknown.\n");
3504 /* If peeled iterations are unknown, count a taken branch and a not taken
3505 branch per peeled loop. Even if scalar loop iterations are known,
3506 vector iterations are not known since peeled prologue iterations are
3507 not known. Hence guards remain the same. */
3508 (void) add_stmt_cost (target_cost_data, 1, cond_branch_taken,
3509 NULL, 0, vect_prologue);
3510 (void) add_stmt_cost (target_cost_data, 1, cond_branch_not_taken,
3511 NULL, 0, vect_prologue);
3512 (void) add_stmt_cost (target_cost_data, 1, cond_branch_taken,
3513 NULL, 0, vect_epilogue);
3514 (void) add_stmt_cost (target_cost_data, 1, cond_branch_not_taken,
3515 NULL, 0, vect_epilogue);
3516 stmt_info_for_cost *si;
3517 int j;
3518 FOR_EACH_VEC_ELT (LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo), j, si)
3520 (void) add_stmt_cost (target_cost_data,
3521 si->count * peel_iters_prologue,
3522 si->kind, si->stmt_info, si->misalign,
3523 vect_prologue);
3524 (void) add_stmt_cost (target_cost_data,
3525 si->count * peel_iters_epilogue,
3526 si->kind, si->stmt_info, si->misalign,
3527 vect_epilogue);
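      /* Illustrative example (editorial note, hypothetical numbers): for
	 this NPEEL < 0 case with ASSUMED_VF = 8, both peel_iters_prologue
	 and peel_iters_epilogue are set to 8 / 2 = 4, so every scalar
	 statement is charged 4 times to the prologue and 4 times to the
	 epilogue, on top of one taken and one not-taken branch for each of
	 the two peel loops.  */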
3530 else
3532 stmt_vector_for_cost prologue_cost_vec, epilogue_cost_vec;
3533 stmt_info_for_cost *si;
3534 int j;
3535 void *data = LOOP_VINFO_TARGET_COST_DATA (loop_vinfo);
3537 prologue_cost_vec.create (2);
3538 epilogue_cost_vec.create (2);
3539 peel_iters_prologue = npeel;
3541 (void) vect_get_known_peeling_cost (loop_vinfo, peel_iters_prologue,
3542 &peel_iters_epilogue,
3543 &LOOP_VINFO_SCALAR_ITERATION_COST
3544 (loop_vinfo),
3545 &prologue_cost_vec,
3546 &epilogue_cost_vec);
3548 FOR_EACH_VEC_ELT (prologue_cost_vec, j, si)
3549 (void) add_stmt_cost (data, si->count, si->kind, si->stmt_info,
3550 si->misalign, vect_prologue);
3552 FOR_EACH_VEC_ELT (epilogue_cost_vec, j, si)
3553 (void) add_stmt_cost (data, si->count, si->kind, si->stmt_info,
3554 si->misalign, vect_epilogue);
3556 prologue_cost_vec.release ();
 3557       epilogue_cost_vec.release ();
3560 /* FORNOW: The scalar outside cost is incremented in one of the
3561 following ways:
3563 1. The vectorizer checks for alignment and aliasing and generates
3564 a condition that allows dynamic vectorization. A cost model
3565 check is ANDED with the versioning condition. Hence scalar code
3566 path now has the added cost of the versioning check.
3568 if (cost > th & versioning_check)
3569 jmp to vector code
3571 Hence run-time scalar is incremented by not-taken branch cost.
3573 2. The vectorizer then checks if a prologue is required. If the
3574 cost model check was not done before during versioning, it has to
3575 be done before the prologue check.
3577 if (cost <= th)
3578 prologue = scalar_iters
3579 if (prologue == 0)
3580 jmp to vector code
3581 else
3582 execute prologue
3583 if (prologue == num_iters)
3584 go to exit
3586 Hence the run-time scalar cost is incremented by a taken branch,
3587 plus a not-taken branch, plus a taken branch cost.
3589 3. The vectorizer then checks if an epilogue is required. If the
3590 cost model check was not done before during prologue check, it
3591 has to be done with the epilogue check.
3593 if (prologue == 0)
3594 jmp to vector code
3595 else
3596 execute prologue
3597 if (prologue == num_iters)
3598 go to exit
3599 vector code:
3600 if ((cost <= th) | (scalar_iters-prologue-epilogue == 0))
3601 jmp to epilogue
3603 Hence the run-time scalar cost should be incremented by 2 taken
3604 branches.
3606 TODO: The back end may reorder the BBS's differently and reverse
3607 conditions/branch directions. Change the estimates below to
3608 something more reasonable. */
3610 /* If the number of iterations is known and we do not do versioning, we can
3611 decide whether to vectorize at compile time. Hence the scalar version
 3612      does not carry cost model guard costs.  */
3613 if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
3614 || LOOP_REQUIRES_VERSIONING (loop_vinfo))
3616 /* Cost model check occurs at versioning. */
3617 if (LOOP_REQUIRES_VERSIONING (loop_vinfo))
3618 scalar_outside_cost += vect_get_stmt_cost (cond_branch_not_taken);
3619 else
3621 /* Cost model check occurs at prologue generation. */
3622 if (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) < 0)
3623 scalar_outside_cost += 2 * vect_get_stmt_cost (cond_branch_taken)
3624 + vect_get_stmt_cost (cond_branch_not_taken);
3625 /* Cost model check occurs at epilogue generation. */
3626 else
3627 scalar_outside_cost += 2 * vect_get_stmt_cost (cond_branch_taken);
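      /* Illustrative example (editorial note, hypothetical target costs):
	 with cond_branch_taken = 3 and cond_branch_not_taken = 1, the three
	 cases above add to SCALAR_OUTSIDE_COST:

	   versioning check          -> 1
	   unknown prologue peeling  -> 2 * 3 + 1 = 7
	   epilogue-only check       -> 2 * 3 = 6  */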
3631 /* Complete the target-specific cost calculations. */
3632 finish_cost (LOOP_VINFO_TARGET_COST_DATA (loop_vinfo), &vec_prologue_cost,
3633 &vec_inside_cost, &vec_epilogue_cost);
3635 vec_outside_cost = (int)(vec_prologue_cost + vec_epilogue_cost);
3637 /* Stash the costs so that we can compare two loop_vec_infos. */
3638 loop_vinfo->vec_inside_cost = vec_inside_cost;
3639 loop_vinfo->vec_outside_cost = vec_outside_cost;
3641 if (dump_enabled_p ())
3643 dump_printf_loc (MSG_NOTE, vect_location, "Cost model analysis: \n");
3644 dump_printf (MSG_NOTE, " Vector inside of loop cost: %d\n",
3645 vec_inside_cost);
3646 dump_printf (MSG_NOTE, " Vector prologue cost: %d\n",
3647 vec_prologue_cost);
3648 dump_printf (MSG_NOTE, " Vector epilogue cost: %d\n",
3649 vec_epilogue_cost);
3650 dump_printf (MSG_NOTE, " Scalar iteration cost: %d\n",
3651 scalar_single_iter_cost);
3652 dump_printf (MSG_NOTE, " Scalar outside cost: %d\n",
3653 scalar_outside_cost);
3654 dump_printf (MSG_NOTE, " Vector outside cost: %d\n",
3655 vec_outside_cost);
3656 dump_printf (MSG_NOTE, " prologue iterations: %d\n",
3657 peel_iters_prologue);
3658 dump_printf (MSG_NOTE, " epilogue iterations: %d\n",
3659 peel_iters_epilogue);
3662 /* Calculate number of iterations required to make the vector version
3663 profitable, relative to the loop bodies only. The following condition
3664 must hold true:
3665 SIC * niters + SOC > VIC * ((niters - NPEEL) / VF) + VOC
3666 where
3667 SIC = scalar iteration cost, VIC = vector iteration cost,
3668 VOC = vector outside cost, VF = vectorization factor,
3669 NPEEL = prologue iterations + epilogue iterations,
3670 SOC = scalar outside cost for run time cost model check. */
3672 int saving_per_viter = (scalar_single_iter_cost * assumed_vf
3673 - vec_inside_cost);
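  /* Illustrative example (editorial note, hypothetical numbers): with
     SIC = 4, ASSUMED_VF = 4 and VIC = 10, saving_per_viter
     = 4 * 4 - 10 = 6, i.e. each vector iteration is 6 units cheaper than
     the 4 scalar iterations it replaces.  */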
3674 if (saving_per_viter <= 0)
3676 if (LOOP_VINFO_LOOP (loop_vinfo)->force_vectorize)
3677 warning_at (vect_location.get_location_t (), OPT_Wopenmp_simd,
3678 "vectorization did not happen for a simd loop");
3680 if (dump_enabled_p ())
3681 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3682 "cost model: the vector iteration cost = %d "
3683 "divided by the scalar iteration cost = %d "
3684 "is greater or equal to the vectorization factor = %d"
3685 ".\n",
3686 vec_inside_cost, scalar_single_iter_cost, assumed_vf);
3687 *ret_min_profitable_niters = -1;
3688 *ret_min_profitable_estimate = -1;
3689 return;
3692 /* ??? The "if" arm is written to handle all cases; see below for what
3693 we would do for !LOOP_VINFO_FULLY_MASKED_P. */
3694 if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
3696 /* Rewriting the condition above in terms of the number of
3697 vector iterations (vniters) rather than the number of
3698 scalar iterations (niters) gives:
3700 SIC * (vniters * VF + NPEEL) + SOC > VIC * vniters + VOC
3702 <==> vniters * (SIC * VF - VIC) > VOC - SIC * NPEEL - SOC
3704 For integer N, X and Y when X > 0:
3706 N * X > Y <==> N >= (Y /[floor] X) + 1. */
3707 int outside_overhead = (vec_outside_cost
3708 - scalar_single_iter_cost * peel_iters_prologue
3709 - scalar_single_iter_cost * peel_iters_epilogue
3710 - scalar_outside_cost);
3711 /* We're only interested in cases that require at least one
3712 vector iteration. */
3713 int min_vec_niters = 1;
3714 if (outside_overhead > 0)
3715 min_vec_niters = outside_overhead / saving_per_viter + 1;
3717 if (dump_enabled_p ())
3718 dump_printf (MSG_NOTE, " Minimum number of vector iterations: %d\n",
3719 min_vec_niters);
3721 if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
3723 /* Now that we know the minimum number of vector iterations,
3724 find the minimum niters for which the scalar cost is larger:
3726 SIC * niters > VIC * vniters + VOC - SOC
3728 We know that the minimum niters is no more than
3729 vniters * VF + NPEEL, but it might be (and often is) less
3730 than that if a partial vector iteration is cheaper than the
3731 equivalent scalar code. */
3732 int threshold = (vec_inside_cost * min_vec_niters
3733 + vec_outside_cost
3734 - scalar_outside_cost);
3735 if (threshold <= 0)
3736 min_profitable_iters = 1;
3737 else
3738 min_profitable_iters = threshold / scalar_single_iter_cost + 1;
3740 else
3741 /* Convert the number of vector iterations into a number of
3742 scalar iterations. */
3743 min_profitable_iters = (min_vec_niters * assumed_vf
3744 + peel_iters_prologue
3745 + peel_iters_epilogue);
3747 else
3749 min_profitable_iters = ((vec_outside_cost - scalar_outside_cost)
3750 * assumed_vf
3751 - vec_inside_cost * peel_iters_prologue
3752 - vec_inside_cost * peel_iters_epilogue);
3753 if (min_profitable_iters <= 0)
3754 min_profitable_iters = 0;
3755 else
3757 min_profitable_iters /= saving_per_viter;
3759 if ((scalar_single_iter_cost * assumed_vf * min_profitable_iters)
3760 <= (((int) vec_inside_cost * min_profitable_iters)
3761 + (((int) vec_outside_cost - scalar_outside_cost)
3762 * assumed_vf)))
3763 min_profitable_iters++;
3767 if (dump_enabled_p ())
3768 dump_printf (MSG_NOTE,
3769 " Calculated minimum iters for profitability: %d\n",
3770 min_profitable_iters);
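  /* Illustrative example (editorial note, hypothetical numbers for the
     fully-masked case with NPEEL = 0): with SIC = 4, VF = 4, VIC = 10,
     VOC = 30, SOC = 6 and saving_per_viter = 6:

       outside_overhead = 30 - 6 = 24, so min_vec_niters = 24/6 + 1 = 5;
       threshold = 10 * 5 + 30 - 6 = 74, so
       min_profitable_iters = 74/4 + 1 = 19.

     Indeed 4 * 19 = 76 > 74, while 4 * 18 = 72 is not.  */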
3772 if (!LOOP_VINFO_FULLY_MASKED_P (loop_vinfo)
3773 && min_profitable_iters < (assumed_vf + peel_iters_prologue))
3774 /* We want the vectorized loop to execute at least once. */
3775 min_profitable_iters = assumed_vf + peel_iters_prologue;
3777 if (dump_enabled_p ())
3778 dump_printf_loc (MSG_NOTE, vect_location,
3779 " Runtime profitability threshold = %d\n",
3780 min_profitable_iters);
3782 *ret_min_profitable_niters = min_profitable_iters;
3784 /* Calculate number of iterations required to make the vector version
3785 profitable, relative to the loop bodies only.
3787 Non-vectorized variant is SIC * niters and it must win over vector
3788 variant on the expected loop trip count. The following condition must hold true:
3789 SIC * niters > VIC * ((niters - NPEEL) / VF) + VOC + SOC */
3791 if (vec_outside_cost <= 0)
3792 min_profitable_estimate = 0;
3793 else if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
3795 /* This is a repeat of the code above, but with + SOC rather
3796 than - SOC. */
3797 int outside_overhead = (vec_outside_cost
3798 - scalar_single_iter_cost * peel_iters_prologue
3799 - scalar_single_iter_cost * peel_iters_epilogue
3800 + scalar_outside_cost);
3801 int min_vec_niters = 1;
3802 if (outside_overhead > 0)
3803 min_vec_niters = outside_overhead / saving_per_viter + 1;
3805 if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
3807 int threshold = (vec_inside_cost * min_vec_niters
3808 + vec_outside_cost
3809 + scalar_outside_cost);
3810 min_profitable_estimate = threshold / scalar_single_iter_cost + 1;
3812 else
3813 min_profitable_estimate = (min_vec_niters * assumed_vf
3814 + peel_iters_prologue
3815 + peel_iters_epilogue);
3817 else
3819 min_profitable_estimate = ((vec_outside_cost + scalar_outside_cost)
3820 * assumed_vf
3821 - vec_inside_cost * peel_iters_prologue
3822 - vec_inside_cost * peel_iters_epilogue)
3823 / ((scalar_single_iter_cost * assumed_vf)
3824 - vec_inside_cost);
3826 min_profitable_estimate = MAX (min_profitable_estimate, min_profitable_iters);
3827 if (dump_enabled_p ())
3828 dump_printf_loc (MSG_NOTE, vect_location,
3829 " Static estimate profitability threshold = %d\n",
3830 min_profitable_estimate);
3832 *ret_min_profitable_estimate = min_profitable_estimate;
3835 /* Writes into SEL a mask for a vec_perm, equivalent to a vec_shr by OFFSET
3836 vector elements (not bits) for a vector with NELT elements. */
3837 static void
3838 calc_vec_perm_mask_for_shift (unsigned int offset, unsigned int nelt,
3839 vec_perm_builder *sel)
3841 /* The encoding is a single stepped pattern. Any wrap-around is handled
3842 by vec_perm_indices. */
3843 sel->new_vector (nelt, 1, 3);
3844 for (unsigned int i = 0; i < 3; i++)
3845 sel->quick_push (i + offset);
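/* Illustrative usage sketch (editorial note; the values are hypothetical):

     vec_perm_builder sel;
     calc_vec_perm_mask_for_shift (2, 8, &sel);   // encodes {2, 3, 4}

   vec_perm_indices then extends the stepped pattern to
   {2, 3, 4, 5, 6, 7, 8, 9}; indices 8 and 9 select from the second input
   vector, so the permute acts as a whole-vector shift down by two
   elements.  */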
3848 /* Checks whether the target supports whole-vector shifts for vectors of mode
3849 MODE. This is the case if _either_ the platform handles vec_shr_optab, _or_
3850 it supports vec_perm_const with masks for all necessary shift amounts. */
3851 static bool
3852 have_whole_vector_shift (machine_mode mode)
3854 if (optab_handler (vec_shr_optab, mode) != CODE_FOR_nothing)
3855 return true;
3857 /* Variable-length vectors should be handled via the optab. */
3858 unsigned int nelt;
3859 if (!GET_MODE_NUNITS (mode).is_constant (&nelt))
3860 return false;
3862 vec_perm_builder sel;
3863 vec_perm_indices indices;
3864 for (unsigned int i = nelt / 2; i >= 1; i /= 2)
3866 calc_vec_perm_mask_for_shift (i, nelt, &sel);
3867 indices.new_vector (sel, 2, nelt);
3868 if (!can_vec_perm_const_p (mode, indices, false))
3869 return false;
3871 return true;
3874 /* TODO: Close dependency between vect_model_*_cost and vectorizable_*
3875 functions. Design better to avoid maintenance issues. */
3877 /* Function vect_model_reduction_cost.
3879 Models cost for a reduction operation, including the vector ops
3880 generated within the strip-mine loop, the initial definition before
3881 the loop, and the epilogue code that must be generated. */
3883 static void
3884 vect_model_reduction_cost (stmt_vec_info stmt_info, internal_fn reduc_fn,
3885 vect_reduction_type reduction_type,
3886 int ncopies, stmt_vector_for_cost *cost_vec)
3888 int prologue_cost = 0, epilogue_cost = 0, inside_cost;
3889 enum tree_code code;
3890 optab optab;
3891 tree vectype;
3892 machine_mode mode;
3893 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
3894 class loop *loop = NULL;
3896 if (loop_vinfo)
3897 loop = LOOP_VINFO_LOOP (loop_vinfo);
3899 /* Condition reductions generate two reductions in the loop. */
3900 if (reduction_type == COND_REDUCTION)
3901 ncopies *= 2;
3903 vectype = STMT_VINFO_VECTYPE (stmt_info);
3904 mode = TYPE_MODE (vectype);
3905 stmt_vec_info orig_stmt_info = vect_orig_stmt (stmt_info);
3907 code = gimple_assign_rhs_code (orig_stmt_info->stmt);
3909 if (reduction_type == EXTRACT_LAST_REDUCTION)
3910 /* No extra instructions are needed in the prologue. The loop body
3911 operations are costed in vectorizable_condition. */
3912 inside_cost = 0;
3913 else if (reduction_type == FOLD_LEFT_REDUCTION)
3915 /* No extra instructions needed in the prologue. */
3916 prologue_cost = 0;
3918 if (reduc_fn != IFN_LAST)
3919 /* Count one reduction-like operation per vector. */
3920 inside_cost = record_stmt_cost (cost_vec, ncopies, vec_to_scalar,
3921 stmt_info, 0, vect_body);
3922 else
3924 /* Use NELEMENTS extracts and NELEMENTS scalar ops. */
3925 unsigned int nelements = ncopies * vect_nunits_for_cost (vectype);
3926 inside_cost = record_stmt_cost (cost_vec, nelements,
3927 vec_to_scalar, stmt_info, 0,
3928 vect_body);
3929 inside_cost += record_stmt_cost (cost_vec, nelements,
3930 scalar_stmt, stmt_info, 0,
3931 vect_body);
3934 else
3936 /* Add in cost for initial definition.
3937 For cond reduction we have four vectors: initial index, step,
3938 initial result of the data reduction, initial value of the index
3939 reduction. */
3940 int prologue_stmts = reduction_type == COND_REDUCTION ? 4 : 1;
3941 prologue_cost += record_stmt_cost (cost_vec, prologue_stmts,
3942 scalar_to_vec, stmt_info, 0,
3943 vect_prologue);
3945 /* Cost of reduction op inside loop. */
3946 inside_cost = record_stmt_cost (cost_vec, ncopies, vector_stmt,
3947 stmt_info, 0, vect_body);
3950 /* Determine cost of epilogue code.
3952 We have a reduction operator that will reduce the vector in one statement.
3953 Also requires scalar extract. */
3955 if (!loop || !nested_in_vect_loop_p (loop, orig_stmt_info))
3957 if (reduc_fn != IFN_LAST)
3959 if (reduction_type == COND_REDUCTION)
 3961           /* An EQ stmt and a COND_EXPR stmt.  */
3962 epilogue_cost += record_stmt_cost (cost_vec, 2,
3963 vector_stmt, stmt_info, 0,
3964 vect_epilogue);
3965 /* Reduction of the max index and a reduction of the found
3966 values. */
3967 epilogue_cost += record_stmt_cost (cost_vec, 2,
3968 vec_to_scalar, stmt_info, 0,
3969 vect_epilogue);
3970 /* A broadcast of the max value. */
3971 epilogue_cost += record_stmt_cost (cost_vec, 1,
3972 scalar_to_vec, stmt_info, 0,
3973 vect_epilogue);
3975 else
3977 epilogue_cost += record_stmt_cost (cost_vec, 1, vector_stmt,
3978 stmt_info, 0, vect_epilogue);
3979 epilogue_cost += record_stmt_cost (cost_vec, 1,
3980 vec_to_scalar, stmt_info, 0,
3981 vect_epilogue);
3984 else if (reduction_type == COND_REDUCTION)
3986 unsigned estimated_nunits = vect_nunits_for_cost (vectype);
3987 /* Extraction of scalar elements. */
3988 epilogue_cost += record_stmt_cost (cost_vec,
3989 2 * estimated_nunits,
3990 vec_to_scalar, stmt_info, 0,
3991 vect_epilogue);
3992 /* Scalar max reductions via COND_EXPR / MAX_EXPR. */
3993 epilogue_cost += record_stmt_cost (cost_vec,
3994 2 * estimated_nunits - 3,
3995 scalar_stmt, stmt_info, 0,
3996 vect_epilogue);
3998 else if (reduction_type == EXTRACT_LAST_REDUCTION
3999 || reduction_type == FOLD_LEFT_REDUCTION)
 4000     /* No extra instructions are needed in the epilogue.  */
4002 else
4004 int vec_size_in_bits = tree_to_uhwi (TYPE_SIZE (vectype));
4005 tree bitsize =
4006 TYPE_SIZE (TREE_TYPE (gimple_assign_lhs (orig_stmt_info->stmt)));
4007 int element_bitsize = tree_to_uhwi (bitsize);
4008 int nelements = vec_size_in_bits / element_bitsize;
4010 if (code == COND_EXPR)
4011 code = MAX_EXPR;
4013 optab = optab_for_tree_code (code, vectype, optab_default);
4015 /* We have a whole vector shift available. */
4016 if (optab != unknown_optab
4017 && VECTOR_MODE_P (mode)
4018 && optab_handler (optab, mode) != CODE_FOR_nothing
4019 && have_whole_vector_shift (mode))
4021 /* Final reduction via vector shifts and the reduction operator.
4022 Also requires scalar extract. */
4023 epilogue_cost += record_stmt_cost (cost_vec,
4024 exact_log2 (nelements) * 2,
4025 vector_stmt, stmt_info, 0,
4026 vect_epilogue);
4027 epilogue_cost += record_stmt_cost (cost_vec, 1,
4028 vec_to_scalar, stmt_info, 0,
4029 vect_epilogue);
4031 else
4032 /* Use extracts and reduction op for final reduction. For N
4033 elements, we have N extracts and N-1 reduction ops. */
4034 epilogue_cost += record_stmt_cost (cost_vec,
4035 nelements + nelements - 1,
4036 vector_stmt, stmt_info, 0,
4037 vect_epilogue);
4041 if (dump_enabled_p ())
4042 dump_printf (MSG_NOTE,
4043 "vect_model_reduction_cost: inside_cost = %d, "
4044 "prologue_cost = %d, epilogue_cost = %d .\n", inside_cost,
4045 prologue_cost, epilogue_cost);
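/* Illustrative example (editorial note, hypothetical counts): for a plain
   add reduction over 8 elements (e.g. V8HI with a short result):

     reduc_fn available       -> epilogue: 1 vector_stmt + 1 vec_to_scalar
     whole-vector shifts only -> epilogue: log2(8) * 2 = 6 vector stmts
                                 + 1 vec_to_scalar
     neither                  -> epilogue: 8 + 7 = 15 stmts
                                 (8 extracts and 7 reduction ops)  */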
4049 /* Function vect_model_induction_cost.
4051 Models cost for induction operations. */
4053 static void
4054 vect_model_induction_cost (stmt_vec_info stmt_info, int ncopies,
4055 stmt_vector_for_cost *cost_vec)
4057 unsigned inside_cost, prologue_cost;
4059 if (PURE_SLP_STMT (stmt_info))
4060 return;
4062 /* loop cost for vec_loop. */
4063 inside_cost = record_stmt_cost (cost_vec, ncopies, vector_stmt,
4064 stmt_info, 0, vect_body);
4066 /* prologue cost for vec_init and vec_step. */
4067 prologue_cost = record_stmt_cost (cost_vec, 2, scalar_to_vec,
4068 stmt_info, 0, vect_prologue);
4070 if (dump_enabled_p ())
4071 dump_printf_loc (MSG_NOTE, vect_location,
4072 "vect_model_induction_cost: inside_cost = %d, "
4073 "prologue_cost = %d .\n", inside_cost, prologue_cost);
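/* Illustrative example (editorial note): with NCOPIES = 2 the loop body is
   charged 2 vector stmts, while the prologue is always charged
   2 scalar_to_vec stmts (one for vec_init, one for vec_step).  */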
4078 /* Function get_initial_def_for_reduction
4080 Input:
4081 STMT_VINFO - a stmt that performs a reduction operation in the loop.
4082 INIT_VAL - the initial value of the reduction variable
4084 Output:
4085 ADJUSTMENT_DEF - a tree that holds a value to be added to the final result
4086 of the reduction (used for adjusting the epilog - see below).
4087 Return a vector variable, initialized according to the operation that
4088 STMT_VINFO performs. This vector will be used as the initial value
4089 of the vector of partial results.
4091 Option1 (adjust in epilog): Initialize the vector as follows:
4092 add/bit or/xor: [0,0,...,0,0]
4093 mult/bit and: [1,1,...,1,1]
4094 min/max/cond_expr: [init_val,init_val,..,init_val,init_val]
4095 and when necessary (e.g. add/mult case) let the caller know
4096 that it needs to adjust the result by init_val.
4098 Option2: Initialize the vector as follows:
4099 add/bit or/xor: [init_val,0,0,...,0]
4100 mult/bit and: [init_val,1,1,...,1]
4101 min/max/cond_expr: [init_val,init_val,...,init_val]
4102 and no adjustments are needed.
4104 For example, for the following code:
4106 s = init_val;
4107 for (i=0;i<n;i++)
4108 s = s + a[i];
4110 STMT_VINFO is 's = s + a[i]', and the reduction variable is 's'.
4111 For a vector of 4 units, we want to return either [0,0,0,init_val],
4112 or [0,0,0,0] and let the caller know that it needs to adjust
4113 the result at the end by 'init_val'.
 4115    FORNOW, we use Option1 (the 'adjust in epilog' scheme) when ADJUSTMENT_DEF
 4116    is not NULL, because its initialization vector is simpler (the same element
 4117    in all entries), and Option2 otherwise.
4119 A cost model should help decide between these two schemes. */
4121 static tree
4122 get_initial_def_for_reduction (stmt_vec_info stmt_vinfo,
4123 enum tree_code code, tree init_val,
4124 tree *adjustment_def)
4126 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_vinfo);
4127 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
4128 tree scalar_type = TREE_TYPE (init_val);
4129 tree vectype = get_vectype_for_scalar_type (loop_vinfo, scalar_type);
4130 tree def_for_init;
4131 tree init_def;
4132 REAL_VALUE_TYPE real_init_val = dconst0;
4133 int int_init_val = 0;
4134 gimple_seq stmts = NULL;
4136 gcc_assert (vectype);
4138 gcc_assert (POINTER_TYPE_P (scalar_type) || INTEGRAL_TYPE_P (scalar_type)
4139 || SCALAR_FLOAT_TYPE_P (scalar_type));
4141 gcc_assert (nested_in_vect_loop_p (loop, stmt_vinfo)
4142 || loop == (gimple_bb (stmt_vinfo->stmt))->loop_father);
4144 /* ADJUSTMENT_DEF is NULL when called from
4145 vect_create_epilog_for_reduction to vectorize double reduction. */
4146 if (adjustment_def)
4147 *adjustment_def = NULL;
4149 switch (code)
4151 case WIDEN_SUM_EXPR:
4152 case DOT_PROD_EXPR:
4153 case SAD_EXPR:
4154 case PLUS_EXPR:
4155 case MINUS_EXPR:
4156 case BIT_IOR_EXPR:
4157 case BIT_XOR_EXPR:
4158 case MULT_EXPR:
4159 case BIT_AND_EXPR:
4161 if (code == MULT_EXPR)
4163 real_init_val = dconst1;
4164 int_init_val = 1;
4167 if (code == BIT_AND_EXPR)
4168 int_init_val = -1;
4170 if (SCALAR_FLOAT_TYPE_P (scalar_type))
4171 def_for_init = build_real (scalar_type, real_init_val);
4172 else
4173 def_for_init = build_int_cst (scalar_type, int_init_val);
4175 if (adjustment_def || operand_equal_p (def_for_init, init_val, 0))
4177 /* Option1: the first element is '0' or '1' as well. */
4178 if (!operand_equal_p (def_for_init, init_val, 0))
4179 *adjustment_def = init_val;
4180 init_def = gimple_build_vector_from_val (&stmts, vectype,
4181 def_for_init);
4183 else if (!TYPE_VECTOR_SUBPARTS (vectype).is_constant ())
4185 /* Option2 (variable length): the first element is INIT_VAL. */
4186 init_def = gimple_build_vector_from_val (&stmts, vectype,
4187 def_for_init);
4188 init_def = gimple_build (&stmts, CFN_VEC_SHL_INSERT,
4189 vectype, init_def, init_val);
4191 else
4193 /* Option2: the first element is INIT_VAL. */
4194 tree_vector_builder elts (vectype, 1, 2);
4195 elts.quick_push (init_val);
4196 elts.quick_push (def_for_init);
4197 init_def = gimple_build_vector (&stmts, &elts);
4200 break;
4202 case MIN_EXPR:
4203 case MAX_EXPR:
4204 case COND_EXPR:
4206 init_val = gimple_convert (&stmts, TREE_TYPE (vectype), init_val);
4207 init_def = gimple_build_vector_from_val (&stmts, vectype, init_val);
4209 break;
4211 default:
4212 gcc_unreachable ();
4215 if (stmts)
4216 gsi_insert_seq_on_edge_immediate (loop_preheader_edge (loop), stmts);
4217 return init_def;
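/* Illustrative example (editorial note): for s += a[i] with init_val = 5
   and a 4-element vector:

     Option1 (ADJUSTMENT_DEF requested): return {0, 0, 0, 0} and set
       *adjustment_def = 5, so the caller adds 5 back in the epilogue;
     Option2 (fixed length): return {5, 0, 0, 0};
     Option2 (variable length): splat the neutral 0 and insert 5 via
       CFN_VEC_SHL_INSERT.

   For a MIN/MAX/COND_EXPR reduction the vector is simply a splat of
   init_val in either scheme.  */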
4220 /* Get at the initial defs for the reduction PHIs in SLP_NODE.
4221 NUMBER_OF_VECTORS is the number of vector defs to create.
4222 If NEUTRAL_OP is nonnull, introducing extra elements of that
4223 value will not change the result. */
4225 static void
4226 get_initial_defs_for_reduction (slp_tree slp_node,
4227 vec<tree> *vec_oprnds,
4228 unsigned int number_of_vectors,
4229 bool reduc_chain, tree neutral_op)
4231 vec<stmt_vec_info> stmts = SLP_TREE_SCALAR_STMTS (slp_node);
4232 stmt_vec_info stmt_vinfo = stmts[0];
4233 vec_info *vinfo = stmt_vinfo->vinfo;
4234 unsigned HOST_WIDE_INT nunits;
4235 unsigned j, number_of_places_left_in_vector;
4236 tree vector_type;
4237 unsigned int group_size = stmts.length ();
4238 unsigned int i;
4239 class loop *loop;
4241 vector_type = STMT_VINFO_VECTYPE (stmt_vinfo);
4243 gcc_assert (STMT_VINFO_DEF_TYPE (stmt_vinfo) == vect_reduction_def);
4245 loop = (gimple_bb (stmt_vinfo->stmt))->loop_father;
4246 gcc_assert (loop);
4247 edge pe = loop_preheader_edge (loop);
4249 gcc_assert (!reduc_chain || neutral_op);
4251 /* NUMBER_OF_COPIES is the number of times we need to use the same values in
4252 created vectors. It is greater than 1 if unrolling is performed.
4254 For example, we have two scalar operands, s1 and s2 (e.g., group of
4255 strided accesses of size two), while NUNITS is four (i.e., four scalars
4256 of this type can be packed in a vector). The output vector will contain
4257 two copies of each scalar operand: {s1, s2, s1, s2}. (NUMBER_OF_COPIES
4258 will be 2).
4260 If REDUC_GROUP_SIZE > NUNITS, the scalars will be split into several
4261 vectors containing the operands.
4263 For example, NUNITS is four as before, and the group size is 8
4264 (s1, s2, ..., s8). We will create two vectors {s1, s2, s3, s4} and
4265 {s5, s6, s7, s8}. */
4267 if (!TYPE_VECTOR_SUBPARTS (vector_type).is_constant (&nunits))
4268 nunits = group_size;
4270 number_of_places_left_in_vector = nunits;
4271 bool constant_p = true;
4272 tree_vector_builder elts (vector_type, nunits, 1);
4273 elts.quick_grow (nunits);
4274 gimple_seq ctor_seq = NULL;
4275 for (j = 0; j < nunits * number_of_vectors; ++j)
4277 tree op;
4278 i = j % group_size;
4279 stmt_vinfo = stmts[i];
4281 /* Get the def before the loop. In reduction chain we have only
4282 one initial value. Else we have as many as PHIs in the group. */
4283 if (reduc_chain)
4284 op = j != 0 ? neutral_op : PHI_ARG_DEF_FROM_EDGE (stmt_vinfo->stmt, pe);
4285 else if (((vec_oprnds->length () + 1) * nunits
4286 - number_of_places_left_in_vector >= group_size)
4287 && neutral_op)
4288 op = neutral_op;
4289 else
4290 op = PHI_ARG_DEF_FROM_EDGE (stmt_vinfo->stmt, pe);
4292 /* Create 'vect_ = {op0,op1,...,opn}'. */
4293 number_of_places_left_in_vector--;
4294 elts[nunits - number_of_places_left_in_vector - 1] = op;
4295 if (!CONSTANT_CLASS_P (op))
4296 constant_p = false;
4298 if (number_of_places_left_in_vector == 0)
4300 tree init;
4301 if (constant_p && !neutral_op
4302 ? multiple_p (TYPE_VECTOR_SUBPARTS (vector_type), nunits)
4303 : known_eq (TYPE_VECTOR_SUBPARTS (vector_type), nunits))
4304 /* Build the vector directly from ELTS. */
4305 init = gimple_build_vector (&ctor_seq, &elts);
4306 else if (neutral_op)
4308 /* Build a vector of the neutral value and shift the
4309 other elements into place. */
4310 init = gimple_build_vector_from_val (&ctor_seq, vector_type,
4311 neutral_op);
4312 int k = nunits;
4313 while (k > 0 && elts[k - 1] == neutral_op)
4314 k -= 1;
4315 while (k > 0)
4317 k -= 1;
4318 init = gimple_build (&ctor_seq, CFN_VEC_SHL_INSERT,
4319 vector_type, init, elts[k]);
4322 else
4324 /* First time round, duplicate ELTS to fill the
4325 required number of vectors. */
4326 duplicate_and_interleave (vinfo, &ctor_seq, vector_type, elts,
4327 number_of_vectors, *vec_oprnds);
4328 break;
4330 vec_oprnds->quick_push (init);
4332 number_of_places_left_in_vector = nunits;
4333 elts.new_vector (vector_type, nunits, 1);
4334 elts.quick_grow (nunits);
4335 constant_p = true;
4338 if (ctor_seq != NULL)
4339 gsi_insert_seq_on_edge_immediate (pe, ctor_seq);
4342 /* For a statement STMT_INFO taking part in a reduction operation return
4343 the stmt_vec_info the meta information is stored on. */
4345 stmt_vec_info
4346 info_for_reduction (stmt_vec_info stmt_info)
4348 stmt_info = vect_orig_stmt (stmt_info);
4349 gcc_assert (STMT_VINFO_REDUC_DEF (stmt_info));
4350 if (!is_a <gphi *> (stmt_info->stmt))
4351 stmt_info = STMT_VINFO_REDUC_DEF (stmt_info);
4352 gphi *phi = as_a <gphi *> (stmt_info->stmt);
4353 if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_double_reduction_def)
4355 if (gimple_phi_num_args (phi) == 1)
4356 stmt_info = STMT_VINFO_REDUC_DEF (stmt_info);
4358 else if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_nested_cycle)
4360 edge pe = loop_preheader_edge (gimple_bb (phi)->loop_father);
4361 stmt_vec_info info
4362 = stmt_info->vinfo->lookup_def (PHI_ARG_DEF_FROM_EDGE (phi, pe));
4363 if (info && STMT_VINFO_DEF_TYPE (info) == vect_double_reduction_def)
4364 stmt_info = info;
4366 return stmt_info;
4369 /* Function vect_create_epilog_for_reduction
4371 Create code at the loop-epilog to finalize the result of a reduction
4372 computation.
4374 STMT_INFO is the scalar reduction stmt that is being vectorized.
4375 SLP_NODE is an SLP node containing a group of reduction statements. The
4376 first one in this group is STMT_INFO.
4377 SLP_NODE_INSTANCE is the SLP node instance containing SLP_NODE
4378 REDUC_INDEX says which rhs operand of the STMT_INFO is the reduction phi
4379 (counting from 0)
4381 This function:
4382 1. Completes the reduction def-use cycles.
4383 2. "Reduces" each vector of partial results VECT_DEFS into a single result,
4384 by calling the function specified by REDUC_FN if available, or by
4385 other means (whole-vector shifts or a scalar loop).
4386 The function also creates a new phi node at the loop exit to preserve
4387 loop-closed form, as illustrated below.
4389 The flow at the entry to this function:
4391 loop:
4392 vec_def = phi <vec_init, null> # REDUCTION_PHI
4393 VECT_DEF = vector_stmt # vectorized form of STMT_INFO
4394 s_loop = scalar_stmt # (scalar) STMT_INFO
4395 loop_exit:
4396 s_out0 = phi <s_loop> # (scalar) EXIT_PHI
4397 use <s_out0>
4398 use <s_out0>
4400 The above is transformed by this function into:
4402 loop:
4403 vec_def = phi <vec_init, VECT_DEF> # REDUCTION_PHI
4404 VECT_DEF = vector_stmt # vectorized form of STMT_INFO
4405 s_loop = scalar_stmt # (scalar) STMT_INFO
4406 loop_exit:
4407 s_out0 = phi <s_loop> # (scalar) EXIT_PHI
4408 v_out1 = phi <VECT_DEF> # NEW_EXIT_PHI
4409 v_out2 = reduce <v_out1>
4410 s_out3 = extract_field <v_out2, 0>
4411 s_out4 = adjust_result <s_out3>
4412 use <s_out4>
4413 use <s_out4>
4416 static void
4417 vect_create_epilog_for_reduction (stmt_vec_info stmt_info,
4418 slp_tree slp_node,
4419 slp_instance slp_node_instance)
4421 stmt_vec_info reduc_info = info_for_reduction (stmt_info);
4422 gcc_assert (reduc_info->is_reduc_info);
4423 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
4424 /* For double reductions we need to get at the inner loop reduction
4425 stmt which has the meta info attached. Our stmt_info is that of the
4426 loop-closed PHI of the inner loop which we remember as
4427 def for the reduction PHI generation. */
4428 bool double_reduc = false;
4429 stmt_vec_info rdef_info = stmt_info;
4430 if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_double_reduction_def)
4432 gcc_assert (!slp_node);
4433 double_reduc = true;
4434 stmt_info = loop_vinfo->lookup_def (gimple_phi_arg_def
4435 (stmt_info->stmt, 0));
4436 stmt_info = vect_stmt_to_vectorize (stmt_info);
4438 gphi *reduc_def_stmt
4439 = as_a <gphi *> (STMT_VINFO_REDUC_DEF (vect_orig_stmt (stmt_info))->stmt);
4440 enum tree_code code = STMT_VINFO_REDUC_CODE (reduc_info);
4441 internal_fn reduc_fn = STMT_VINFO_REDUC_FN (reduc_info);
4442 stmt_vec_info prev_phi_info;
4443 tree vectype;
4444 machine_mode mode;
4445 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo), *outer_loop = NULL;
4446 basic_block exit_bb;
4447 tree scalar_dest;
4448 tree scalar_type;
4449 gimple *new_phi = NULL, *phi;
4450 stmt_vec_info phi_info;
4451 gimple_stmt_iterator exit_gsi;
4452 tree new_temp = NULL_TREE, new_name, new_scalar_dest;
4453 gimple *epilog_stmt = NULL;
4454 gimple *exit_phi;
4455 tree bitsize;
4456 tree def;
4457 tree orig_name, scalar_result;
4458 imm_use_iterator imm_iter, phi_imm_iter;
4459 use_operand_p use_p, phi_use_p;
4460 gimple *use_stmt;
4461 bool nested_in_vect_loop = false;
4462 auto_vec<gimple *> new_phis;
4463 int j, i;
4464 auto_vec<tree> scalar_results;
4465 unsigned int group_size = 1, k;
4466 auto_vec<gimple *> phis;
4467 bool slp_reduc = false;
4468 bool direct_slp_reduc;
4469 tree new_phi_result;
4470 tree induction_index = NULL_TREE;
4472 if (slp_node)
4473 group_size = SLP_TREE_SCALAR_STMTS (slp_node).length ();
4475 if (nested_in_vect_loop_p (loop, stmt_info))
4477 outer_loop = loop;
4478 loop = loop->inner;
4479 nested_in_vect_loop = true;
4480 gcc_assert (!slp_node);
4482 gcc_assert (!nested_in_vect_loop || double_reduc);
4484 vectype = STMT_VINFO_REDUC_VECTYPE (reduc_info);
4485 gcc_assert (vectype);
4486 mode = TYPE_MODE (vectype);
4488 tree initial_def = NULL;
4489 tree induc_val = NULL_TREE;
4490 tree adjustment_def = NULL;
4491 if (slp_node)
4493 else
4495 /* Get at the scalar def before the loop, that defines the initial value
4496 of the reduction variable. */
4497 initial_def = PHI_ARG_DEF_FROM_EDGE (reduc_def_stmt,
4498 loop_preheader_edge (loop));
4499 /* Optimize: for induction condition reduction, if we can't use zero
4500 for induc_val, use initial_def. */
4501 if (STMT_VINFO_REDUC_TYPE (reduc_info) == INTEGER_INDUC_COND_REDUCTION)
4502 induc_val = STMT_VINFO_VEC_INDUC_COND_INITIAL_VAL (reduc_info);
4503 else if (double_reduc)
4505 else if (nested_in_vect_loop)
4507 else
4508 adjustment_def = STMT_VINFO_REDUC_EPILOGUE_ADJUSTMENT (reduc_info);
4511 unsigned vec_num;
4512 int ncopies;
4513 if (slp_node)
4515 vec_num = SLP_TREE_VEC_STMTS (slp_node_instance->reduc_phis).length ();
4516 ncopies = 1;
4518 else
4520 vec_num = 1;
4521 ncopies = 0;
4522 phi_info = STMT_VINFO_VEC_STMT (loop_vinfo->lookup_stmt (reduc_def_stmt));
4525 ncopies++;
4526 phi_info = STMT_VINFO_RELATED_STMT (phi_info);
4528 while (phi_info);
4531 /* For cond reductions we want to create a new vector (INDEX_COND_EXPR)
4532 which is updated with the current index of the loop for every match of
4533 the original loop's cond_expr (VEC_STMT). This results in a vector
4534 containing the last time the condition passed for that vector lane.
4535 The first match will be a 1 to allow 0 to be used for non-matching
4536 indexes. If there are no matches at all then the vector will be all
4537 zeroes.
4539 PR92772: This algorithm is broken for architectures that support
4540 masked vectors, but do not provide fold_extract_last. */
4541 if (STMT_VINFO_REDUC_TYPE (reduc_info) == COND_REDUCTION)
4543 auto_vec<std::pair<tree, bool>, 2> ccompares;
4544 stmt_vec_info cond_info = STMT_VINFO_REDUC_DEF (reduc_info);
4545 cond_info = vect_stmt_to_vectorize (cond_info);
4546 while (cond_info != reduc_info)
4548 if (gimple_assign_rhs_code (cond_info->stmt) == COND_EXPR)
4550 gimple *vec_stmt = STMT_VINFO_VEC_STMT (cond_info)->stmt;
4551 gcc_assert (gimple_assign_rhs_code (vec_stmt) == VEC_COND_EXPR);
4552 ccompares.safe_push
4553 (std::make_pair (unshare_expr (gimple_assign_rhs1 (vec_stmt)),
4554 STMT_VINFO_REDUC_IDX (cond_info) == 2));
4556 cond_info
4557 = loop_vinfo->lookup_def (gimple_op (cond_info->stmt,
4558 1 + STMT_VINFO_REDUC_IDX
4559 (cond_info)));
4560 cond_info = vect_stmt_to_vectorize (cond_info);
4562 gcc_assert (ccompares.length () != 0);
4564 tree indx_before_incr, indx_after_incr;
4565 poly_uint64 nunits_out = TYPE_VECTOR_SUBPARTS (vectype);
4566 int scalar_precision
4567 = GET_MODE_PRECISION (SCALAR_TYPE_MODE (TREE_TYPE (vectype)));
4568 tree cr_index_scalar_type = make_unsigned_type (scalar_precision);
4569 tree cr_index_vector_type = get_related_vectype_for_scalar_type
4570 (TYPE_MODE (vectype), cr_index_scalar_type,
4571 TYPE_VECTOR_SUBPARTS (vectype));
4573 /* First we create a simple vector induction variable which starts
4574 with the values {1,2,3,...} (SERIES_VECT) and increments by the
4575 vector size (STEP). */
4577 /* Create a {1,2,3,...} vector. */
4578 tree series_vect = build_index_vector (cr_index_vector_type, 1, 1);
4580 /* Create a vector of the step value. */
4581 tree step = build_int_cst (cr_index_scalar_type, nunits_out);
4582 tree vec_step = build_vector_from_val (cr_index_vector_type, step);
4584 /* Create an induction variable. */
4585 gimple_stmt_iterator incr_gsi;
4586 bool insert_after;
4587 standard_iv_increment_position (loop, &incr_gsi, &insert_after);
4588 create_iv (series_vect, vec_step, NULL_TREE, loop, &incr_gsi,
4589 insert_after, &indx_before_incr, &indx_after_incr);
4591 /* Next create a new phi node vector (NEW_PHI_TREE) which starts
4592 filled with zeros (VEC_ZERO). */
4594 /* Create a vector of 0s. */
4595 tree zero = build_zero_cst (cr_index_scalar_type);
4596 tree vec_zero = build_vector_from_val (cr_index_vector_type, zero);
4598 /* Create a vector phi node. */
4599 tree new_phi_tree = make_ssa_name (cr_index_vector_type);
4600 new_phi = create_phi_node (new_phi_tree, loop->header);
4601 loop_vinfo->add_stmt (new_phi);
4602 add_phi_arg (as_a <gphi *> (new_phi), vec_zero,
4603 loop_preheader_edge (loop), UNKNOWN_LOCATION);
 4605       /* Now take the condition from the loop's original cond_exprs
4606 and produce a new cond_exprs (INDEX_COND_EXPR) which for
4607 every match uses values from the induction variable
4608 (INDEX_BEFORE_INCR) otherwise uses values from the phi node
4609 (NEW_PHI_TREE).
4610 Finally, we update the phi (NEW_PHI_TREE) to take the value of
4611 the new cond_expr (INDEX_COND_EXPR). */
4612 gimple_seq stmts = NULL;
4613 for (int i = ccompares.length () - 1; i != -1; --i)
4615 tree ccompare = ccompares[i].first;
4616 if (ccompares[i].second)
4617 new_phi_tree = gimple_build (&stmts, VEC_COND_EXPR,
4618 cr_index_vector_type,
4619 ccompare,
4620 indx_before_incr, new_phi_tree);
4621 else
4622 new_phi_tree = gimple_build (&stmts, VEC_COND_EXPR,
4623 cr_index_vector_type,
4624 ccompare,
4625 new_phi_tree, indx_before_incr);
4627 gsi_insert_seq_before (&incr_gsi, stmts, GSI_SAME_STMT);
4628 stmt_vec_info index_vec_info
4629 = loop_vinfo->add_stmt (SSA_NAME_DEF_STMT (new_phi_tree));
4630 STMT_VINFO_VECTYPE (index_vec_info) = cr_index_vector_type;
4632 /* Update the phi with the vec cond. */
4633 induction_index = new_phi_tree;
4634 add_phi_arg (as_a <gphi *> (new_phi), induction_index,
4635 loop_latch_edge (loop), UNKNOWN_LOCATION);
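      /* Illustrative example (editorial note, hypothetical lanes): with a
	 4-element index vector, SERIES_VECT = {1, 2, 3, 4} and STEP = 4.
	 Starting from {0, 0, 0, 0}, if the condition matches in lanes 1 and
	 3 of the first vector iteration the index vector becomes
	 {0, 2, 0, 4}; if only lane 0 matches in the second iteration
	 (IV = {5, 6, 7, 8}) it becomes {5, 2, 0, 4}.  The maximum element,
	 5, later identifies the lane that matched last.  */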
4638 /* 2. Create epilog code.
4639 The reduction epilog code operates across the elements of the vector
4640 of partial results computed by the vectorized loop.
4641 The reduction epilog code consists of:
4643 step 1: compute the scalar result in a vector (v_out2)
4644 step 2: extract the scalar result (s_out3) from the vector (v_out2)
4645 step 3: adjust the scalar result (s_out3) if needed.
 4647         Step 1 can be accomplished using one of the following three schemes:
4648 (scheme 1) using reduc_fn, if available.
4649 (scheme 2) using whole-vector shifts, if available.
4650 (scheme 3) using a scalar loop. In this case steps 1+2 above are
4651 combined.
4653 The overall epilog code looks like this:
4655 s_out0 = phi <s_loop> # original EXIT_PHI
4656 v_out1 = phi <VECT_DEF> # NEW_EXIT_PHI
4657 v_out2 = reduce <v_out1> # step 1
4658 s_out3 = extract_field <v_out2, 0> # step 2
4659 s_out4 = adjust_result <s_out3> # step 3
4661 (step 3 is optional, and steps 1 and 2 may be combined).
4662 Lastly, the uses of s_out0 are replaced by s_out4. */
4665 /* 2.1 Create new loop-exit-phis to preserve loop-closed form:
4666 v_out1 = phi <VECT_DEF>
4667 Store them in NEW_PHIS. */
4668 if (double_reduc)
4669 loop = outer_loop;
4670 exit_bb = single_exit (loop)->dest;
4671 prev_phi_info = NULL;
4672 new_phis.create (slp_node ? vec_num : ncopies);
4673 for (unsigned i = 0; i < vec_num; i++)
4675 if (slp_node)
4676 def = gimple_get_lhs (SLP_TREE_VEC_STMTS (slp_node)[i]->stmt);
4677 else
4678 def = gimple_get_lhs (STMT_VINFO_VEC_STMT (rdef_info)->stmt);
4679 for (j = 0; j < ncopies; j++)
4681 tree new_def = copy_ssa_name (def);
4682 phi = create_phi_node (new_def, exit_bb);
4683 stmt_vec_info phi_info = loop_vinfo->add_stmt (phi);
4684 if (j == 0)
4685 new_phis.quick_push (phi);
4686 else
4688 def = vect_get_vec_def_for_stmt_copy (loop_vinfo, def);
4689 STMT_VINFO_RELATED_STMT (prev_phi_info) = phi_info;
4692 SET_PHI_ARG_DEF (phi, single_exit (loop)->dest_idx, def);
4693 prev_phi_info = phi_info;
4697 exit_gsi = gsi_after_labels (exit_bb);
4699 /* 2.2 Get the relevant tree-code to use in the epilog for schemes 2,3
4700 (i.e. when reduc_fn is not available) and in the final adjustment
4701 code (if needed). Also get the original scalar reduction variable as
4702 defined in the loop. In case STMT is a "pattern-stmt" (i.e. - it
4703 represents a reduction pattern), the tree-code and scalar-def are
4704 taken from the original stmt that the pattern-stmt (STMT) replaces.
4705 Otherwise (it is a regular reduction) - the tree-code and scalar-def
4706 are taken from STMT. */
4708 stmt_vec_info orig_stmt_info = vect_orig_stmt (stmt_info);
4709 if (orig_stmt_info != stmt_info)
4711 /* Reduction pattern */
4712 gcc_assert (STMT_VINFO_IN_PATTERN_P (orig_stmt_info));
4713 gcc_assert (STMT_VINFO_RELATED_STMT (orig_stmt_info) == stmt_info);
4716 scalar_dest = gimple_assign_lhs (orig_stmt_info->stmt);
4717 scalar_type = TREE_TYPE (scalar_dest);
4718 scalar_results.create (group_size);
4719 new_scalar_dest = vect_create_destination_var (scalar_dest, NULL);
4720 bitsize = TYPE_SIZE (scalar_type);
4722 /* SLP reduction without reduction chain, e.g.,
4723 # a1 = phi <a2, a0>
4724 # b1 = phi <b2, b0>
4725 a2 = operation (a1)
4726 b2 = operation (b1) */
4727 slp_reduc = (slp_node && !REDUC_GROUP_FIRST_ELEMENT (stmt_info));
4729 /* True if we should implement SLP_REDUC using native reduction operations
4730 instead of scalar operations. */
4731 direct_slp_reduc = (reduc_fn != IFN_LAST
4732 && slp_reduc
4733 && !TYPE_VECTOR_SUBPARTS (vectype).is_constant ());
4735 /* In case of reduction chain, e.g.,
4736 # a1 = phi <a3, a0>
4737 a2 = operation (a1)
4738 a3 = operation (a2),
4740 we may end up with more than one vector result. Here we reduce them to
4741 one vector. */
4742 if (REDUC_GROUP_FIRST_ELEMENT (stmt_info) || direct_slp_reduc)
4744 gimple_seq stmts = NULL;
4745 tree first_vect = PHI_RESULT (new_phis[0]);
4746 first_vect = gimple_convert (&stmts, vectype, first_vect);
4747 for (k = 1; k < new_phis.length (); k++)
4749 gimple *next_phi = new_phis[k];
4750 tree second_vect = PHI_RESULT (next_phi);
4751 second_vect = gimple_convert (&stmts, vectype, second_vect);
4752 first_vect = gimple_build (&stmts, code, vectype,
4753 first_vect, second_vect);
4755 gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
4757 new_phi_result = first_vect;
4758 new_phis.truncate (0);
4759 new_phis.safe_push (SSA_NAME_DEF_STMT (first_vect));
 4761   /* Likewise if we couldn't use a single def-use cycle.  */
4762 else if (ncopies > 1)
4764 gcc_assert (new_phis.length () == 1);
4765 gimple_seq stmts = NULL;
4766 tree first_vect = PHI_RESULT (new_phis[0]);
4767 first_vect = gimple_convert (&stmts, vectype, first_vect);
4768 stmt_vec_info next_phi_info = loop_vinfo->lookup_stmt (new_phis[0]);
4769 for (int k = 1; k < ncopies; ++k)
4771 next_phi_info = STMT_VINFO_RELATED_STMT (next_phi_info);
4772 tree second_vect = PHI_RESULT (next_phi_info->stmt);
4773 second_vect = gimple_convert (&stmts, vectype, second_vect);
4774 first_vect = gimple_build (&stmts, code, vectype,
4775 first_vect, second_vect);
4777 gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
4778 new_phi_result = first_vect;
4779 new_phis.truncate (0);
4780 new_phis.safe_push (SSA_NAME_DEF_STMT (first_vect));
4782 else
4783 new_phi_result = PHI_RESULT (new_phis[0]);
4785 if (STMT_VINFO_REDUC_TYPE (reduc_info) == COND_REDUCTION
4786 && reduc_fn != IFN_LAST)
4788 /* For condition reductions, we have a vector (NEW_PHI_RESULT) containing
4789 various data values where the condition matched and another vector
4790 (INDUCTION_INDEX) containing all the indexes of those matches. We
4791 need to extract the last matching index (which will be the index with
4792 highest value) and use this to index into the data vector.
4793 For the case where there were no matches, the data vector will contain
4794 all default values and the index vector will be all zeros. */
4796 /* Get various versions of the type of the vector of indexes. */
4797 tree index_vec_type = TREE_TYPE (induction_index);
4798 gcc_checking_assert (TYPE_UNSIGNED (index_vec_type));
4799 tree index_scalar_type = TREE_TYPE (index_vec_type);
4800 tree index_vec_cmp_type = truth_type_for (index_vec_type);
4802 /* Get an unsigned integer version of the type of the data vector. */
4803 int scalar_precision
4804 = GET_MODE_PRECISION (SCALAR_TYPE_MODE (scalar_type));
4805 tree scalar_type_unsigned = make_unsigned_type (scalar_precision);
4806 tree vectype_unsigned = build_vector_type
4807 (scalar_type_unsigned, TYPE_VECTOR_SUBPARTS (vectype));
4809 /* First we need to create a vector (ZERO_VEC) of zeros and another
4810 vector (MAX_INDEX_VEC) filled with the last matching index, which we
4811 can create using a MAX reduction and then expanding.
4812 In the case where the loop never made any matches, the max index will
4813 be zero. */
4815 /* Vector of {0, 0, 0,...}. */
4816 tree zero_vec = build_zero_cst (vectype);
4818 gimple_seq stmts = NULL;
4819 new_phi_result = gimple_convert (&stmts, vectype, new_phi_result);
4820 gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
4822 /* Find maximum value from the vector of found indexes. */
4823 tree max_index = make_ssa_name (index_scalar_type);
4824 gcall *max_index_stmt = gimple_build_call_internal (IFN_REDUC_MAX,
4825 1, induction_index);
4826 gimple_call_set_lhs (max_index_stmt, max_index);
4827 gsi_insert_before (&exit_gsi, max_index_stmt, GSI_SAME_STMT);
4829 /* Vector of {max_index, max_index, max_index,...}. */
4830 tree max_index_vec = make_ssa_name (index_vec_type);
4831 tree max_index_vec_rhs = build_vector_from_val (index_vec_type,
4832 max_index);
4833 gimple *max_index_vec_stmt = gimple_build_assign (max_index_vec,
4834 max_index_vec_rhs);
4835 gsi_insert_before (&exit_gsi, max_index_vec_stmt, GSI_SAME_STMT);
4837 /* Next we compare the new vector (MAX_INDEX_VEC) full of max indexes
4838 with the vector (INDUCTION_INDEX) of found indexes, choosing values
4839 from the data vector (NEW_PHI_RESULT) for matches, 0 (ZERO_VEC)
4840 otherwise. Only one value should match, resulting in a vector
4841 (VEC_COND) with one data value and the rest zeros.
4842 In the case where the loop never made any matches, every index will
4843 match, resulting in a vector with all data values (which will all be
4844 the default value). */
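      /* Illustrative example (editorial note, continuing hypothetical
	 values): with INDUCTION_INDEX = {5, 2, 0, 4} and data vector
	 {a, b, c, d}, IFN_REDUC_MAX gives max_index = 5, the comparison
	 below yields {1, 0, 0, 0}, the VEC_COND selects {a, 0, 0, 0}, and
	 the final unsigned MAX reduction extracts a, the value stored by
	 the last matching iteration.  */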
4846 /* Compare the max index vector to the vector of found indexes to find
4847 the position of the max value. */
4848 tree vec_compare = make_ssa_name (index_vec_cmp_type);
4849 gimple *vec_compare_stmt = gimple_build_assign (vec_compare, EQ_EXPR,
4850 induction_index,
4851 max_index_vec);
4852 gsi_insert_before (&exit_gsi, vec_compare_stmt, GSI_SAME_STMT);
4854 /* Use the compare to choose either values from the data vector or
4855 zero. */
4856 tree vec_cond = make_ssa_name (vectype);
4857 gimple *vec_cond_stmt = gimple_build_assign (vec_cond, VEC_COND_EXPR,
4858 vec_compare, new_phi_result,
4859 zero_vec);
4860 gsi_insert_before (&exit_gsi, vec_cond_stmt, GSI_SAME_STMT);
4862 /* Finally we need to extract the data value from the vector (VEC_COND)
 4863         into a scalar (MATCHED_DATA_REDUC).  Logically we want to do an OR
4864 reduction, but because this doesn't exist, we can use a MAX reduction
4865 instead. The data value might be signed or a float so we need to cast
4866 it first.
4867 In the case where the loop never made any matches, the data values are
4868 all identical, and so will reduce down correctly. */
4870 /* Make the matched data values unsigned. */
4871 tree vec_cond_cast = make_ssa_name (vectype_unsigned);
4872 tree vec_cond_cast_rhs = build1 (VIEW_CONVERT_EXPR, vectype_unsigned,
4873 vec_cond);
4874 gimple *vec_cond_cast_stmt = gimple_build_assign (vec_cond_cast,
4875 VIEW_CONVERT_EXPR,
4876 vec_cond_cast_rhs);
4877 gsi_insert_before (&exit_gsi, vec_cond_cast_stmt, GSI_SAME_STMT);
4879 /* Reduce down to a scalar value. */
4880 tree data_reduc = make_ssa_name (scalar_type_unsigned);
4881 gcall *data_reduc_stmt = gimple_build_call_internal (IFN_REDUC_MAX,
4882 1, vec_cond_cast);
4883 gimple_call_set_lhs (data_reduc_stmt, data_reduc);
4884 gsi_insert_before (&exit_gsi, data_reduc_stmt, GSI_SAME_STMT);
4886 /* Convert the reduced value back to the result type and set as the
4887 result. */
4888 stmts = NULL;
4889 new_temp = gimple_build (&stmts, VIEW_CONVERT_EXPR, scalar_type,
4890 data_reduc);
4891 gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
4892 scalar_results.safe_push (new_temp);
4894 else if (STMT_VINFO_REDUC_TYPE (reduc_info) == COND_REDUCTION
4895 && reduc_fn == IFN_LAST)
4897 /* Condition reduction without supported IFN_REDUC_MAX. Generate
4898 idx = 0;
4899 idx_val = induction_index[0];
4900 val = data_reduc[0];
4901 for (idx = 0, val = init, i = 0; i < nelts; ++i)
4902 if (induction_index[i] > idx_val)
4903 val = data_reduc[i], idx_val = induction_index[i];
4904 return val; */
4906 tree data_eltype = TREE_TYPE (TREE_TYPE (new_phi_result));
4907 tree idx_eltype = TREE_TYPE (TREE_TYPE (induction_index));
4908 unsigned HOST_WIDE_INT el_size = tree_to_uhwi (TYPE_SIZE (idx_eltype));
4909 poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (TREE_TYPE (induction_index));
4910 /* Enforced by vectorizable_reduction, which ensures we have target
4911 support before allowing a conditional reduction on variable-length
4912 vectors. */
4913 unsigned HOST_WIDE_INT v_size = el_size * nunits.to_constant ();
4914 tree idx_val = NULL_TREE, val = NULL_TREE;
4915 for (unsigned HOST_WIDE_INT off = 0; off < v_size; off += el_size)
4917 tree old_idx_val = idx_val;
4918 tree old_val = val;
4919 idx_val = make_ssa_name (idx_eltype);
4920 epilog_stmt = gimple_build_assign (idx_val, BIT_FIELD_REF,
4921 build3 (BIT_FIELD_REF, idx_eltype,
4922 induction_index,
4923 bitsize_int (el_size),
4924 bitsize_int (off)));
4925 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
4926 val = make_ssa_name (data_eltype);
4927 epilog_stmt = gimple_build_assign (val, BIT_FIELD_REF,
4928 build3 (BIT_FIELD_REF,
4929 data_eltype,
4930 new_phi_result,
4931 bitsize_int (el_size),
4932 bitsize_int (off)));
4933 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
4934 if (off != 0)
4936 tree new_idx_val = idx_val;
4937 if (off != v_size - el_size)
4939 new_idx_val = make_ssa_name (idx_eltype);
4940 epilog_stmt = gimple_build_assign (new_idx_val,
4941 MAX_EXPR, idx_val,
4942 old_idx_val);
4943 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
4945 tree new_val = make_ssa_name (data_eltype);
4946 epilog_stmt = gimple_build_assign (new_val,
4947 COND_EXPR,
4948 build2 (GT_EXPR,
4949 boolean_type_node,
4950 idx_val,
4951 old_idx_val),
4952 val, old_val);
4953 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
4954 idx_val = new_idx_val;
4955 val = new_val;
4958 /* Convert the reduced value back to the result type and set as the
4959 result. */
4960 gimple_seq stmts = NULL;
4961 val = gimple_convert (&stmts, scalar_type, val);
4962 gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
4963 scalar_results.safe_push (val);
4966 /* 2.3 Create the reduction code, using one of the three schemes described
4967 above. In SLP we simply need to extract all the elements from the
4968 vector (without reducing them), so we use scalar shifts. */
4969 else if (reduc_fn != IFN_LAST && !slp_reduc)
4971 tree tmp;
4972 tree vec_elem_type;
4974 /* Case 1: Create:
4975 v_out2 = reduc_expr <v_out1> */
4977 if (dump_enabled_p ())
4978 dump_printf_loc (MSG_NOTE, vect_location,
4979 "Reduce using direct vector reduction.\n");
4981 gimple_seq stmts = NULL;
4982 new_phi_result = gimple_convert (&stmts, vectype, new_phi_result);
4983 vec_elem_type = TREE_TYPE (TREE_TYPE (new_phi_result));
4984 new_temp = gimple_build (&stmts, as_combined_fn (reduc_fn),
4985 vec_elem_type, new_phi_result);
4986 new_temp = gimple_convert (&stmts, scalar_type, new_temp);
4987 gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
4989 if ((STMT_VINFO_REDUC_TYPE (reduc_info) == INTEGER_INDUC_COND_REDUCTION)
4990 && induc_val)
 4992         /* Earlier we set the initial value to be a vector of induc_val
 4993            values.  Check the result and, if it is induc_val, replace it
 4994            with the original initial value, unless induc_val is
 4995            the same as initial_def already.  */
4996 tree zcompare = build2 (EQ_EXPR, boolean_type_node, new_temp,
4997 induc_val);
4999 tmp = make_ssa_name (new_scalar_dest);
5000 epilog_stmt = gimple_build_assign (tmp, COND_EXPR, zcompare,
5001 initial_def, new_temp);
5002 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5003 new_temp = tmp;
5006 scalar_results.safe_push (new_temp);
5008 else if (direct_slp_reduc)
5010 /* Here we create one vector for each of the REDUC_GROUP_SIZE results,
5011 with the elements for other SLP statements replaced with the
5012 neutral value. We can then do a normal reduction on each vector. */
5014 /* Enforced by vectorizable_reduction. */
5015 gcc_assert (new_phis.length () == 1);
5016 gcc_assert (pow2p_hwi (group_size));
5018 slp_tree orig_phis_slp_node = slp_node_instance->reduc_phis;
5019 vec<stmt_vec_info> orig_phis
5020 = SLP_TREE_SCALAR_STMTS (orig_phis_slp_node);
5021 gimple_seq seq = NULL;
5023 /* Build a vector {0, 1, 2, ...}, with the same number of elements
5024 and the same element size as VECTYPE. */
5025 tree index = build_index_vector (vectype, 0, 1);
5026 tree index_type = TREE_TYPE (index);
5027 tree index_elt_type = TREE_TYPE (index_type);
5028 tree mask_type = truth_type_for (index_type);
5030 /* Create a vector that, for each element, identifies which of
5031 the REDUC_GROUP_SIZE results should use it. */
5032 tree index_mask = build_int_cst (index_elt_type, group_size - 1);
5033 index = gimple_build (&seq, BIT_AND_EXPR, index_type, index,
5034 build_vector_from_val (index_type, index_mask));
5036 /* Get a neutral vector value. This is simply a splat of the neutral
5037 scalar value if we have one, otherwise the initial scalar value
5038 is itself a neutral value. */
5039 tree vector_identity = NULL_TREE;
5040 tree neutral_op = NULL_TREE;
5041 if (slp_node)
5043 stmt_vec_info first = REDUC_GROUP_FIRST_ELEMENT (stmt_info);
5044 neutral_op
5045 = neutral_op_for_slp_reduction (slp_node_instance->reduc_phis,
5046 vectype, code, first != NULL);
5048 if (neutral_op)
5049 vector_identity = gimple_build_vector_from_val (&seq, vectype,
5050 neutral_op);
5051 for (unsigned int i = 0; i < group_size; ++i)
 5053         /* If there's no universal neutral value, we can use the
5054 initial scalar value from the original PHI. This is used
5055 for MIN and MAX reduction, for example. */
5056 if (!neutral_op)
5058 tree scalar_value
5059 = PHI_ARG_DEF_FROM_EDGE (orig_phis[i]->stmt,
5060 loop_preheader_edge (loop));
5061 scalar_value = gimple_convert (&seq, TREE_TYPE (vectype),
5062 scalar_value);
5063 vector_identity = gimple_build_vector_from_val (&seq, vectype,
5064 scalar_value);
5067 /* Calculate the equivalent of:
5069 sel[j] = (index[j] == i);
5071 which selects the elements of NEW_PHI_RESULT that should
5072 be included in the result. */
5073 tree compare_val = build_int_cst (index_elt_type, i);
5074 compare_val = build_vector_from_val (index_type, compare_val);
5075 tree sel = gimple_build (&seq, EQ_EXPR, mask_type,
5076 index, compare_val);
5078 /* Calculate the equivalent of:
5080 vec = sel ? new_phi_result : vector_identity;
5082 VEC is now suitable for a full vector reduction. */
5083 tree vec = gimple_build (&seq, VEC_COND_EXPR, vectype,
5084 sel, new_phi_result, vector_identity);
5086 /* Do the reduction and convert it to the appropriate type. */
5087 tree scalar = gimple_build (&seq, as_combined_fn (reduc_fn),
5088 TREE_TYPE (vectype), vec);
5089 scalar = gimple_convert (&seq, scalar_type, scalar);
5090 scalar_results.safe_push (scalar);
5092 gsi_insert_seq_before (&exit_gsi, seq, GSI_SAME_STMT);
5094 else
5096 bool reduce_with_shift;
5097 tree vec_temp;
5099 gcc_assert (slp_reduc || new_phis.length () == 1);
5101 /* See if the target wants to do the final (shift) reduction
5102 in a vector mode of smaller size and first reduce upper/lower
5103 halves against each other. */
5104 enum machine_mode mode1 = mode;
5105 tree stype = TREE_TYPE (vectype);
5106 unsigned nunits = TYPE_VECTOR_SUBPARTS (vectype).to_constant ();
5107 unsigned nunits1 = nunits;
5108 if ((mode1 = targetm.vectorize.split_reduction (mode)) != mode
5109 && new_phis.length () == 1)
5111 nunits1 = GET_MODE_NUNITS (mode1).to_constant ();
5112 /* For SLP reductions we have to make sure lanes match up, but
5113 since we're doing an individual-element final reduction, reducing
5114 the vector width here is even more important.
5115 ??? We can also separate lanes with permutes; for the common
5116 case of a power-of-two group size, odd/even extracts would work. */
5117 if (slp_reduc && nunits != nunits1)
5119 nunits1 = least_common_multiple (nunits1, group_size);
5120 gcc_assert (exact_log2 (nunits1) != -1 && nunits1 <= nunits);
5123 if (!slp_reduc
5124 && (mode1 = targetm.vectorize.split_reduction (mode)) != mode)
5125 nunits1 = GET_MODE_NUNITS (mode1).to_constant ();
5127 tree vectype1 = get_related_vectype_for_scalar_type (TYPE_MODE (vectype),
5128 stype, nunits1);
5129 reduce_with_shift = have_whole_vector_shift (mode1);
5130 if (!VECTOR_MODE_P (mode1))
5131 reduce_with_shift = false;
5132 else
5134 optab optab = optab_for_tree_code (code, vectype1, optab_default);
5135 if (optab_handler (optab, mode1) == CODE_FOR_nothing)
5136 reduce_with_shift = false;
5139 /* First reduce the vector to the desired vector size on which we
5140 should do the shift reduction, by combining upper and lower halves. */
5141 new_temp = new_phi_result;
5142 while (nunits > nunits1)
5144 nunits /= 2;
5145 vectype1 = get_related_vectype_for_scalar_type (TYPE_MODE (vectype),
5146 stype, nunits);
5147 unsigned int bitsize = tree_to_uhwi (TYPE_SIZE (vectype1));
5149 /* The target has to make sure we support lowpart/highpart
5150 extraction, either via direct vector extract or through
5151 integer mode punning. */
5152 tree dst1, dst2;
5153 if (convert_optab_handler (vec_extract_optab,
5154 TYPE_MODE (TREE_TYPE (new_temp)),
5155 TYPE_MODE (vectype1))
5156 != CODE_FOR_nothing)
5158 /* Extract sub-vectors directly once vec_extract becomes
5159 a conversion optab. */
5160 dst1 = make_ssa_name (vectype1);
5161 epilog_stmt
5162 = gimple_build_assign (dst1, BIT_FIELD_REF,
5163 build3 (BIT_FIELD_REF, vectype1,
5164 new_temp, TYPE_SIZE (vectype1),
5165 bitsize_int (0)));
5166 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5167 dst2 = make_ssa_name (vectype1);
5168 epilog_stmt
5169 = gimple_build_assign (dst2, BIT_FIELD_REF,
5170 build3 (BIT_FIELD_REF, vectype1,
5171 new_temp, TYPE_SIZE (vectype1),
5172 bitsize_int (bitsize)));
5173 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5175 else
5177 /* Extract via punning to appropriately sized integer mode
5178 vector. */
5179 tree eltype = build_nonstandard_integer_type (bitsize, 1);
5180 tree etype = build_vector_type (eltype, 2);
5181 gcc_assert (convert_optab_handler (vec_extract_optab,
5182 TYPE_MODE (etype),
5183 TYPE_MODE (eltype))
5184 != CODE_FOR_nothing);
5185 tree tem = make_ssa_name (etype);
5186 epilog_stmt = gimple_build_assign (tem, VIEW_CONVERT_EXPR,
5187 build1 (VIEW_CONVERT_EXPR,
5188 etype, new_temp));
5189 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5190 new_temp = tem;
5191 tem = make_ssa_name (eltype);
5192 epilog_stmt
5193 = gimple_build_assign (tem, BIT_FIELD_REF,
5194 build3 (BIT_FIELD_REF, eltype,
5195 new_temp, TYPE_SIZE (eltype),
5196 bitsize_int (0)));
5197 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5198 dst1 = make_ssa_name (vectype1);
5199 epilog_stmt = gimple_build_assign (dst1, VIEW_CONVERT_EXPR,
5200 build1 (VIEW_CONVERT_EXPR,
5201 vectype1, tem));
5202 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5203 tem = make_ssa_name (eltype);
5204 epilog_stmt
5205 = gimple_build_assign (tem, BIT_FIELD_REF,
5206 build3 (BIT_FIELD_REF, eltype,
5207 new_temp, TYPE_SIZE (eltype),
5208 bitsize_int (bitsize)));
5209 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5210 dst2 = make_ssa_name (vectype1);
5211 epilog_stmt = gimple_build_assign (dst2, VIEW_CONVERT_EXPR,
5212 build1 (VIEW_CONVERT_EXPR,
5213 vectype1, tem));
5214 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5217 new_temp = make_ssa_name (vectype1);
5218 epilog_stmt = gimple_build_assign (new_temp, code, dst1, dst2);
5219 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5220 new_phis[0] = epilog_stmt;
5223 if (reduce_with_shift && !slp_reduc)
5225 int element_bitsize = tree_to_uhwi (bitsize);
5226 /* Enforced by vectorizable_reduction, which disallows SLP reductions
5227 for variable-length vectors and also requires direct target support
5228 for loop reductions. */
5229 int vec_size_in_bits = tree_to_uhwi (TYPE_SIZE (vectype1));
5230 int nelements = vec_size_in_bits / element_bitsize;
5231 vec_perm_builder sel;
5232 vec_perm_indices indices;
5234 int elt_offset;
5236 tree zero_vec = build_zero_cst (vectype1);
5237 /* Case 2: Create:
5238 for (offset = nelements/2; offset >= 1; offset/=2)
5240 Create: va' = vec_shift <va, offset>
5241 Create: va = vop <va, va'>
5242 } */
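/* Illustrative sketch for a PLUS reduction of a 4-element vector
   va = { a0, a1, a2, a3 } (values are assumed, not taken from the code):

     offset 2:  va' = { a2, a3, 0, 0 }        va = { a0+a2, a1+a3, a2, a3 }
     offset 1:  va' = { a1+a3, a2, a3, 0 }    va = { a0+a1+a2+a3, ... }

   after which element 0 of VA holds the final value and is extracted
   as the scalar result below.  */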
5244 tree rhs;
5246 if (dump_enabled_p ())
5247 dump_printf_loc (MSG_NOTE, vect_location,
5248 "Reduce using vector shifts\n");
5250 gimple_seq stmts = NULL;
5251 new_temp = gimple_convert (&stmts, vectype1, new_temp);
5252 for (elt_offset = nelements / 2;
5253 elt_offset >= 1;
5254 elt_offset /= 2)
5256 calc_vec_perm_mask_for_shift (elt_offset, nelements, &sel);
5257 indices.new_vector (sel, 2, nelements);
5258 tree mask = vect_gen_perm_mask_any (vectype1, indices);
5259 new_name = gimple_build (&stmts, VEC_PERM_EXPR, vectype1,
5260 new_temp, zero_vec, mask);
5261 new_temp = gimple_build (&stmts, code,
5262 vectype1, new_name, new_temp);
5264 gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
5266 /* 2.4 Extract the final scalar result. Create:
5267 s_out3 = extract_field <v_out2, bitpos> */
5269 if (dump_enabled_p ())
5270 dump_printf_loc (MSG_NOTE, vect_location,
5271 "extract scalar result\n");
5273 rhs = build3 (BIT_FIELD_REF, scalar_type, new_temp,
5274 bitsize, bitsize_zero_node);
5275 epilog_stmt = gimple_build_assign (new_scalar_dest, rhs);
5276 new_temp = make_ssa_name (new_scalar_dest, epilog_stmt);
5277 gimple_assign_set_lhs (epilog_stmt, new_temp);
5278 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5279 scalar_results.safe_push (new_temp);
5281 else
5283 /* Case 3: Create:
5284 s = extract_field <v_out2, 0>
5285 for (offset = element_size;
5286 offset < vector_size;
5287 offset += element_size;)
5289 Create: s' = extract_field <v_out2, offset>
5290 Create: s = op <s, s'> // For non SLP cases
5291 } */
5293 if (dump_enabled_p ())
5294 dump_printf_loc (MSG_NOTE, vect_location,
5295 "Reduce using scalar code.\n");
5297 int vec_size_in_bits = tree_to_uhwi (TYPE_SIZE (vectype1));
5298 int element_bitsize = tree_to_uhwi (bitsize);
5299 tree compute_type = TREE_TYPE (vectype);
5300 gimple_seq stmts = NULL;
5301 FOR_EACH_VEC_ELT (new_phis, i, new_phi)
5303 int bit_offset;
5304 if (gimple_code (new_phi) == GIMPLE_PHI)
5305 vec_temp = PHI_RESULT (new_phi);
5306 else
5307 vec_temp = gimple_assign_lhs (new_phi);
5308 new_temp = gimple_build (&stmts, BIT_FIELD_REF, compute_type,
5309 vec_temp, bitsize, bitsize_zero_node);
5311 /* In SLP we don't need to apply the reduction operation, so we just
5312 collect s' values in SCALAR_RESULTS. */
5313 if (slp_reduc)
5314 scalar_results.safe_push (new_temp);
5316 for (bit_offset = element_bitsize;
5317 bit_offset < vec_size_in_bits;
5318 bit_offset += element_bitsize)
5320 tree bitpos = bitsize_int (bit_offset);
5321 new_name = gimple_build (&stmts, BIT_FIELD_REF,
5322 compute_type, vec_temp,
5323 bitsize, bitpos);
5324 if (slp_reduc)
5326 /* In SLP we don't need to apply the reduction operation, so
5327 we just collect s' values in SCALAR_RESULTS. */
5328 new_temp = new_name;
5329 scalar_results.safe_push (new_name);
5331 else
5332 new_temp = gimple_build (&stmts, code, compute_type,
5333 new_name, new_temp);
5337 /* The only case where we need to reduce scalar results in SLP is
5338 unrolling. If the size of SCALAR_RESULTS is greater than
5339 REDUC_GROUP_SIZE, we reduce them by combining elements modulo
5340 REDUC_GROUP_SIZE. */
5341 if (slp_reduc)
5343 tree res, first_res, new_res;
5345 /* Reduce multiple scalar results in case of SLP unrolling. */
5346 for (j = group_size; scalar_results.iterate (j, &res);
5347 j++)
5349 first_res = scalar_results[j % group_size];
5350 new_res = gimple_build (&stmts, code, compute_type,
5351 first_res, res);
5352 scalar_results[j % group_size] = new_res;
5354 for (k = 0; k < group_size; k++)
5355 scalar_results[k] = gimple_convert (&stmts, scalar_type,
5356 scalar_results[k]);
5358 else
5360 /* Not SLP - we have one scalar to keep in SCALAR_RESULTS. */
5361 new_temp = gimple_convert (&stmts, scalar_type, new_temp);
5362 scalar_results.safe_push (new_temp);
5365 gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
5368 if ((STMT_VINFO_REDUC_TYPE (reduc_info) == INTEGER_INDUC_COND_REDUCTION)
5369 && induc_val)
5371 /* Earlier we set the initial value to be a vector of induc_val
5372 values. Check the result and if it is induc_val then replace
5373 it with the original initial value, unless induc_val is
5374 the same as initial_def already. */
5375 tree zcompare = build2 (EQ_EXPR, boolean_type_node, new_temp,
5376 induc_val);
5378 tree tmp = make_ssa_name (new_scalar_dest);
5379 epilog_stmt = gimple_build_assign (tmp, COND_EXPR, zcompare,
5380 initial_def, new_temp);
5381 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5382 scalar_results[0] = tmp;
5386 /* 2.5 Adjust the final result by the initial value of the reduction
5387 variable. (When such adjustment is not needed, then
5388 'adjustment_def' is zero). For example, if code is PLUS we create:
5389 new_temp = loop_exit_def + adjustment_def */
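/* Illustrative sketch (assumed values): for  s = 5;  loop { s += a[i]; }
   the vector PHI is typically seeded with the neutral vector
   { 0, 0, 0, 0 } and ADJUSTMENT_DEF carries the original initial
   value 5, so the code below emits the equivalent of
   new_temp = loop_exit_def + 5.  */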
5391 if (adjustment_def)
5393 gcc_assert (!slp_reduc);
5394 gimple_seq stmts = NULL;
5395 if (nested_in_vect_loop)
5397 new_phi = new_phis[0];
5398 gcc_assert (VECTOR_TYPE_P (TREE_TYPE (adjustment_def)));
5399 adjustment_def = gimple_convert (&stmts, vectype, adjustment_def);
5400 new_temp = gimple_build (&stmts, code, vectype,
5401 PHI_RESULT (new_phi), adjustment_def);
5403 else
5405 new_temp = scalar_results[0];
5406 gcc_assert (TREE_CODE (TREE_TYPE (adjustment_def)) != VECTOR_TYPE);
5407 adjustment_def = gimple_convert (&stmts, scalar_type, adjustment_def);
5408 new_temp = gimple_build (&stmts, code, scalar_type,
5409 new_temp, adjustment_def);
5412 epilog_stmt = gimple_seq_last_stmt (stmts);
5413 gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
5414 if (nested_in_vect_loop)
5416 stmt_vec_info epilog_stmt_info = loop_vinfo->add_stmt (epilog_stmt);
5417 STMT_VINFO_RELATED_STMT (epilog_stmt_info)
5418 = STMT_VINFO_RELATED_STMT (loop_vinfo->lookup_stmt (new_phi));
5420 if (!double_reduc)
5421 scalar_results.quick_push (new_temp);
5422 else
5423 scalar_results[0] = new_temp;
5425 else
5426 scalar_results[0] = new_temp;
5428 new_phis[0] = epilog_stmt;
5431 if (double_reduc)
5432 loop = loop->inner;
5434 /* 2.6 Handle the loop-exit phis. Replace the uses of scalar loop-exit
5435 phis with new adjusted scalar results, i.e., replace use <s_out0>
5436 with use <s_out4>.
5438 Transform:
5439 loop_exit:
5440 s_out0 = phi <s_loop> # (scalar) EXIT_PHI
5441 v_out1 = phi <VECT_DEF> # NEW_EXIT_PHI
5442 v_out2 = reduce <v_out1>
5443 s_out3 = extract_field <v_out2, 0>
5444 s_out4 = adjust_result <s_out3>
5445 use <s_out0>
5446 use <s_out0>
5448 into:
5450 loop_exit:
5451 s_out0 = phi <s_loop> # (scalar) EXIT_PHI
5452 v_out1 = phi <VECT_DEF> # NEW_EXIT_PHI
5453 v_out2 = reduce <v_out1>
5454 s_out3 = extract_field <v_out2, 0>
5455 s_out4 = adjust_result <s_out3>
5456 use <s_out4>
5457 use <s_out4> */
5460 /* In an SLP reduction chain we reduce vector results into one vector if
5461 necessary, hence we set REDUC_GROUP_SIZE to 1 here. SCALAR_DEST is the
5462 LHS of the last stmt in the reduction chain, since we are looking for
5463 the loop exit phi node. */
5464 if (REDUC_GROUP_FIRST_ELEMENT (stmt_info))
5466 stmt_vec_info dest_stmt_info
5467 = vect_orig_stmt (SLP_TREE_SCALAR_STMTS (slp_node)[group_size - 1]);
5468 scalar_dest = gimple_assign_lhs (dest_stmt_info->stmt);
5469 group_size = 1;
5472 /* In SLP we may have several statements in NEW_PHIS and REDUCTION_PHIS (in
5473 case REDUC_GROUP_SIZE is greater than the vectorization factor).
5474 Therefore, we need to match SCALAR_RESULTS with the corresponding statements.
5475 The first (REDUC_GROUP_SIZE / number of new vector stmts) scalar results
5476 correspond to the first vector stmt, etc.
5477 (RATIO is equal to (REDUC_GROUP_SIZE / number of new vector stmts)). */
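/* For example (sketch with assumed numbers): with REDUC_GROUP_SIZE == 4
   and two new vector stmts, RATIO is 2, so scalar_results[0..1] belong
   to the first vector stmt and scalar_results[2..3] to the second.  */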
5478 if (group_size > new_phis.length ())
5479 gcc_assert (!(group_size % new_phis.length ()));
5481 for (k = 0; k < group_size; k++)
5483 if (slp_reduc)
5485 stmt_vec_info scalar_stmt_info = SLP_TREE_SCALAR_STMTS (slp_node)[k];
5487 orig_stmt_info = STMT_VINFO_RELATED_STMT (scalar_stmt_info);
5488 /* SLP statements can't participate in patterns. */
5489 gcc_assert (!orig_stmt_info);
5490 scalar_dest = gimple_assign_lhs (scalar_stmt_info->stmt);
5493 if (nested_in_vect_loop)
5495 if (double_reduc)
5496 loop = outer_loop;
5497 else
5498 gcc_unreachable ();
5501 phis.create (3);
5502 /* Find the loop-closed-use at the loop exit of the original scalar
5503 result. (The reduction result is expected to have two immediate uses,
5504 one at the latch block, and one at the loop exit). For double
5505 reductions we are looking for exit phis of the outer loop. */
5506 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, scalar_dest)
5508 if (!flow_bb_inside_loop_p (loop, gimple_bb (USE_STMT (use_p))))
5510 if (!is_gimple_debug (USE_STMT (use_p)))
5511 phis.safe_push (USE_STMT (use_p));
5513 else
5515 if (double_reduc && gimple_code (USE_STMT (use_p)) == GIMPLE_PHI)
5517 tree phi_res = PHI_RESULT (USE_STMT (use_p));
5519 FOR_EACH_IMM_USE_FAST (phi_use_p, phi_imm_iter, phi_res)
5521 if (!flow_bb_inside_loop_p (loop,
5522 gimple_bb (USE_STMT (phi_use_p)))
5523 && !is_gimple_debug (USE_STMT (phi_use_p)))
5524 phis.safe_push (USE_STMT (phi_use_p));
5530 FOR_EACH_VEC_ELT (phis, i, exit_phi)
5532 /* Replace the uses: */
5533 orig_name = PHI_RESULT (exit_phi);
5534 scalar_result = scalar_results[k];
5535 FOR_EACH_IMM_USE_STMT (use_stmt, imm_iter, orig_name)
5537 FOR_EACH_IMM_USE_ON_STMT (use_p, imm_iter)
5538 SET_USE (use_p, scalar_result);
5539 update_stmt (use_stmt);
5543 phis.release ();
5547 /* Return a vector of type VECTYPE that is equal to the vector select
5548 operation "MASK ? VEC : IDENTITY". Insert the select statements
5549 before GSI. */
5551 static tree
5552 merge_with_identity (gimple_stmt_iterator *gsi, tree mask, tree vectype,
5553 tree vec, tree identity)
5555 tree cond = make_temp_ssa_name (vectype, NULL, "cond");
5556 gimple *new_stmt = gimple_build_assign (cond, VEC_COND_EXPR,
5557 mask, vec, identity);
5558 gsi_insert_before (gsi, new_stmt, GSI_SAME_STMT);
5559 return cond;
5562 /* Successively apply CODE to each element of VECTOR_RHS, in left-to-right
5563 order, starting with LHS. Insert the extraction statements before GSI and
5564 associate the new scalar SSA names with variable SCALAR_DEST.
5565 Return the SSA name for the result. */
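/* Illustrative sketch of the expansion for a 4-element VECTOR_RHS v
   and incoming LHS acc (the names are assumptions, not taken from the code):

     s0 = <element 0 of v>;   acc0 = acc  CODE s0;
     s1 = <element 1 of v>;   acc1 = acc0 CODE s1;
     s2 = <element 2 of v>;   acc2 = acc1 CODE s2;
     s3 = <element 3 of v>;   acc3 = acc2 CODE s3;

   and acc3 is the returned SSA name.  */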
5567 static tree
5568 vect_expand_fold_left (gimple_stmt_iterator *gsi, tree scalar_dest,
5569 tree_code code, tree lhs, tree vector_rhs)
5571 tree vectype = TREE_TYPE (vector_rhs);
5572 tree scalar_type = TREE_TYPE (vectype);
5573 tree bitsize = TYPE_SIZE (scalar_type);
5574 unsigned HOST_WIDE_INT vec_size_in_bits = tree_to_uhwi (TYPE_SIZE (vectype));
5575 unsigned HOST_WIDE_INT element_bitsize = tree_to_uhwi (bitsize);
5577 for (unsigned HOST_WIDE_INT bit_offset = 0;
5578 bit_offset < vec_size_in_bits;
5579 bit_offset += element_bitsize)
5581 tree bitpos = bitsize_int (bit_offset);
5582 tree rhs = build3 (BIT_FIELD_REF, scalar_type, vector_rhs,
5583 bitsize, bitpos);
5585 gassign *stmt = gimple_build_assign (scalar_dest, rhs);
5586 rhs = make_ssa_name (scalar_dest, stmt);
5587 gimple_assign_set_lhs (stmt, rhs);
5588 gsi_insert_before (gsi, stmt, GSI_SAME_STMT);
5590 stmt = gimple_build_assign (scalar_dest, code, lhs, rhs);
5591 tree new_name = make_ssa_name (scalar_dest, stmt);
5592 gimple_assign_set_lhs (stmt, new_name);
5593 gsi_insert_before (gsi, stmt, GSI_SAME_STMT);
5594 lhs = new_name;
5596 return lhs;
5599 /* Get a masked internal function equivalent to REDUC_FN. VECTYPE_IN is the
5600 type of the vector input. */
5602 static internal_fn
5603 get_masked_reduction_fn (internal_fn reduc_fn, tree vectype_in)
5605 internal_fn mask_reduc_fn;
5607 switch (reduc_fn)
5609 case IFN_FOLD_LEFT_PLUS:
5610 mask_reduc_fn = IFN_MASK_FOLD_LEFT_PLUS;
5611 break;
5613 default:
5614 return IFN_LAST;
5617 if (direct_internal_fn_supported_p (mask_reduc_fn, vectype_in,
5618 OPTIMIZE_FOR_SPEED))
5619 return mask_reduc_fn;
5620 return IFN_LAST;
5623 /* Perform an in-order reduction (FOLD_LEFT_REDUCTION). STMT_INFO is the
5624 statement that sets the live-out value. REDUC_DEF_STMT is the phi
5625 statement. CODE is the operation performed by STMT_INFO and OPS are
5626 its scalar operands. REDUC_INDEX is the index of the operand in
5627 OPS that is set by REDUC_DEF_STMT. REDUC_FN is the function that
5628 implements in-order reduction, or IFN_LAST if we should open-code it.
5629 VECTYPE_IN is the type of the vector input. MASKS specifies the masks
5630 that should be used to control the operation in a fully-masked loop. */
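/* Illustrative sketch: for an in-order floating-point sum

     for (i = 0; i < n; i++)
       res += a[i];

   each vector of loaded elements is folded into RES strictly from left
   to right, either with one IFN_FOLD_LEFT_PLUS (or its masked variant)
   call per vector, or, if REDUC_FN is IFN_LAST, by open-coding the
   element extractions via vect_expand_fold_left.  No reassociation
   takes place, so the result matches the scalar loop when reassociation
   is not permitted.  */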
5632 static bool
5633 vectorize_fold_left_reduction (stmt_vec_info stmt_info,
5634 gimple_stmt_iterator *gsi,
5635 stmt_vec_info *vec_stmt, slp_tree slp_node,
5636 gimple *reduc_def_stmt,
5637 tree_code code, internal_fn reduc_fn,
5638 tree ops[3], tree vectype_in,
5639 int reduc_index, vec_loop_masks *masks)
5641 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
5642 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
5643 tree vectype_out = STMT_VINFO_VECTYPE (stmt_info);
5644 stmt_vec_info new_stmt_info = NULL;
5645 internal_fn mask_reduc_fn = get_masked_reduction_fn (reduc_fn, vectype_in);
5647 int ncopies;
5648 if (slp_node)
5649 ncopies = 1;
5650 else
5651 ncopies = vect_get_num_copies (loop_vinfo, vectype_in);
5653 gcc_assert (!nested_in_vect_loop_p (loop, stmt_info));
5654 gcc_assert (ncopies == 1);
5655 gcc_assert (TREE_CODE_LENGTH (code) == binary_op);
5657 if (slp_node)
5658 gcc_assert (known_eq (TYPE_VECTOR_SUBPARTS (vectype_out),
5659 TYPE_VECTOR_SUBPARTS (vectype_in)));
5661 tree op0 = ops[1 - reduc_index];
5663 int group_size = 1;
5664 stmt_vec_info scalar_dest_def_info;
5665 auto_vec<tree> vec_oprnds0;
5666 if (slp_node)
5668 auto_vec<vec<tree> > vec_defs (2);
5669 vect_get_slp_defs (slp_node, &vec_defs);
5670 vec_oprnds0.safe_splice (vec_defs[1 - reduc_index]);
5671 vec_defs[0].release ();
5672 vec_defs[1].release ();
5673 group_size = SLP_TREE_SCALAR_STMTS (slp_node).length ();
5674 scalar_dest_def_info = SLP_TREE_SCALAR_STMTS (slp_node)[group_size - 1];
5676 else
5678 tree loop_vec_def0 = vect_get_vec_def_for_operand (op0, stmt_info);
5679 vec_oprnds0.create (1);
5680 vec_oprnds0.quick_push (loop_vec_def0);
5681 scalar_dest_def_info = stmt_info;
5684 tree scalar_dest = gimple_assign_lhs (scalar_dest_def_info->stmt);
5685 tree scalar_type = TREE_TYPE (scalar_dest);
5686 tree reduc_var = gimple_phi_result (reduc_def_stmt);
5688 int vec_num = vec_oprnds0.length ();
5689 gcc_assert (vec_num == 1 || slp_node);
5690 tree vec_elem_type = TREE_TYPE (vectype_out);
5691 gcc_checking_assert (useless_type_conversion_p (scalar_type, vec_elem_type));
5693 tree vector_identity = NULL_TREE;
5694 if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
5695 vector_identity = build_zero_cst (vectype_out);
5697 tree scalar_dest_var = vect_create_destination_var (scalar_dest, NULL);
5698 int i;
5699 tree def0;
5700 FOR_EACH_VEC_ELT (vec_oprnds0, i, def0)
5702 gimple *new_stmt;
5703 tree mask = NULL_TREE;
5704 if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
5705 mask = vect_get_loop_mask (gsi, masks, vec_num, vectype_in, i);
5707 /* Handle MINUS by adding the negative. */
5708 if (reduc_fn != IFN_LAST && code == MINUS_EXPR)
5710 tree negated = make_ssa_name (vectype_out);
5711 new_stmt = gimple_build_assign (negated, NEGATE_EXPR, def0);
5712 gsi_insert_before (gsi, new_stmt, GSI_SAME_STMT);
5713 def0 = negated;
5716 if (mask && mask_reduc_fn == IFN_LAST)
5717 def0 = merge_with_identity (gsi, mask, vectype_out, def0,
5718 vector_identity);
5720 /* On the first iteration the input is simply the scalar phi
5721 result, and for subsequent iterations it is the output of
5722 the preceding operation. */
5723 if (reduc_fn != IFN_LAST || (mask && mask_reduc_fn != IFN_LAST))
5725 if (mask && mask_reduc_fn != IFN_LAST)
5726 new_stmt = gimple_build_call_internal (mask_reduc_fn, 3, reduc_var,
5727 def0, mask);
5728 else
5729 new_stmt = gimple_build_call_internal (reduc_fn, 2, reduc_var,
5730 def0);
5731 /* For chained SLP reductions the output of the previous reduction
5732 operation serves as the input of the next. For the final statement
5733 the output cannot be a temporary - we reuse the original
5734 scalar destination of the last statement. */
5735 if (i != vec_num - 1)
5737 gimple_set_lhs (new_stmt, scalar_dest_var);
5738 reduc_var = make_ssa_name (scalar_dest_var, new_stmt);
5739 gimple_set_lhs (new_stmt, reduc_var);
5742 else
5744 reduc_var = vect_expand_fold_left (gsi, scalar_dest_var, code,
5745 reduc_var, def0);
5746 new_stmt = SSA_NAME_DEF_STMT (reduc_var);
5747 /* Remove the statement, so that we can use the same code paths
5748 as for statements that we've just created. */
5749 gimple_stmt_iterator tmp_gsi = gsi_for_stmt (new_stmt);
5750 gsi_remove (&tmp_gsi, true);
5753 if (i == vec_num - 1)
5755 gimple_set_lhs (new_stmt, scalar_dest);
5756 new_stmt_info = vect_finish_replace_stmt (scalar_dest_def_info,
5757 new_stmt);
5759 else
5760 new_stmt_info = vect_finish_stmt_generation (scalar_dest_def_info,
5761 new_stmt, gsi);
5763 if (slp_node)
5764 SLP_TREE_VEC_STMTS (slp_node).quick_push (new_stmt_info);
5767 if (!slp_node)
5768 STMT_VINFO_VEC_STMT (stmt_info) = *vec_stmt = new_stmt_info;
5770 return true;
5773 /* Function is_nonwrapping_integer_induction.
5775 Check if STMT_VINFO (which is part of loop LOOP) both increments and
5776 does not cause overflow. */
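/* Worked example (assumed numbers): for base 0, step 4 and at most
   100 loop iterations the code below computes 0 + 4 * 100 = 400 in
   wide arithmetic and accepts the induction only if 400 still fits in
   the precision of the PHI result type.  */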
5778 static bool
5779 is_nonwrapping_integer_induction (stmt_vec_info stmt_vinfo, class loop *loop)
5781 gphi *phi = as_a <gphi *> (stmt_vinfo->stmt);
5782 tree base = STMT_VINFO_LOOP_PHI_EVOLUTION_BASE_UNCHANGED (stmt_vinfo);
5783 tree step = STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_vinfo);
5784 tree lhs_type = TREE_TYPE (gimple_phi_result (phi));
5785 widest_int ni, max_loop_value, lhs_max;
5786 wi::overflow_type overflow = wi::OVF_NONE;
5788 /* Make sure the loop is integer based. */
5789 if (TREE_CODE (base) != INTEGER_CST
5790 || TREE_CODE (step) != INTEGER_CST)
5791 return false;
5793 /* Check that the max size of the loop will not wrap. */
5795 if (TYPE_OVERFLOW_UNDEFINED (lhs_type))
5796 return true;
5798 if (! max_stmt_executions (loop, &ni))
5799 return false;
5801 max_loop_value = wi::mul (wi::to_widest (step), ni, TYPE_SIGN (lhs_type),
5802 &overflow);
5803 if (overflow)
5804 return false;
5806 max_loop_value = wi::add (wi::to_widest (base), max_loop_value,
5807 TYPE_SIGN (lhs_type), &overflow);
5808 if (overflow)
5809 return false;
5811 return (wi::min_precision (max_loop_value, TYPE_SIGN (lhs_type))
5812 <= TYPE_PRECISION (lhs_type));
5815 /* Check if masking can be supported by inserting a conditional expression.
5816 CODE is the code for the operation. COND_FN is the conditional internal
5817 function, if it exists. VECTYPE_IN is the type of the vector input. */
5818 static bool
5819 use_mask_by_cond_expr_p (enum tree_code code, internal_fn cond_fn,
5820 tree vectype_in)
5822 if (cond_fn != IFN_LAST
5823 && direct_internal_fn_supported_p (cond_fn, vectype_in,
5824 OPTIMIZE_FOR_SPEED))
5825 return false;
5827 switch (code)
5829 case DOT_PROD_EXPR:
5830 case SAD_EXPR:
5831 return true;
5833 default:
5834 return false;
5838 /* Insert a conditional expression to enable masked vectorization. CODE is the
5839 code for the operation. VOP is the array of operands. MASK is the loop
5840 mask. GSI is a statement iterator used to place the new conditional
5841 expression. */
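/* Illustrative sketch of the selects emitted below:

     DOT_PROD_EXPR:  op1' = mask ? op1 : 0     (inactive lanes add 0)
     SAD_EXPR:       op1' = mask ? op1 : op0   (|op0 - op1'| is then 0)

   so lanes that are masked off do not change the accumulated result.  */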
5842 static void
5843 build_vect_cond_expr (enum tree_code code, tree vop[3], tree mask,
5844 gimple_stmt_iterator *gsi)
5846 switch (code)
5848 case DOT_PROD_EXPR:
5850 tree vectype = TREE_TYPE (vop[1]);
5851 tree zero = build_zero_cst (vectype);
5852 tree masked_op1 = make_temp_ssa_name (vectype, NULL, "masked_op1");
5853 gassign *select = gimple_build_assign (masked_op1, VEC_COND_EXPR,
5854 mask, vop[1], zero);
5855 gsi_insert_before (gsi, select, GSI_SAME_STMT);
5856 vop[1] = masked_op1;
5857 break;
5860 case SAD_EXPR:
5862 tree vectype = TREE_TYPE (vop[1]);
5863 tree masked_op1 = make_temp_ssa_name (vectype, NULL, "masked_op1");
5864 gassign *select = gimple_build_assign (masked_op1, VEC_COND_EXPR,
5865 mask, vop[1], vop[0]);
5866 gsi_insert_before (gsi, select, GSI_SAME_STMT);
5867 vop[1] = masked_op1;
5868 break;
5871 default:
5872 gcc_unreachable ();
5876 /* Function vectorizable_reduction.
5878 Check if STMT_INFO performs a reduction operation that can be vectorized.
5879 If VEC_STMT is also passed, vectorize STMT_INFO: create a vectorized
5880 stmt to replace it, put it in VEC_STMT, and insert it at GSI.
5881 Return true if STMT_INFO is vectorizable in this way.
5883 This function also handles reduction idioms (patterns) that have been
5884 recognized in advance during vect_pattern_recog. In this case, STMT_INFO
5885 may be of this form:
5886 X = pattern_expr (arg0, arg1, ..., X)
5887 and its STMT_VINFO_RELATED_STMT points to the last stmt in the original
5888 sequence that had been detected and replaced by the pattern-stmt
5889 (STMT_INFO).
5891 This function also handles reduction of condition expressions, for example:
5892 for (int i = 0; i < N; i++)
5893 if (a[i] < value)
5894 last = a[i];
5895 This is handled by vectorising the loop and creating an additional vector
5896 containing the loop indexes for which "a[i] < value" was true. In the
5897 function epilogue this is reduced to a single max value and then used to
5898 index into the vector of results.
5900 In some cases of reduction patterns, the type of the reduction variable X is
5901 different than the type of the other arguments of STMT_INFO.
5902 In such cases, the vectype that is used when transforming STMT_INFO into
5903 a vector stmt is different than the vectype that is used to determine the
5904 vectorization factor, because it consists of a different number of elements
5905 than the actual number of elements that are being operated upon in parallel.
5907 For example, consider an accumulation of shorts into an int accumulator.
5908 On some targets it's possible to vectorize this pattern operating on 8
5909 shorts at a time (hence, the vectype for purposes of determining the
5910 vectorization factor should be V8HI); on the other hand, the vectype that
5911 is used to create the vector form is actually V4SI (the type of the result).
5913 Upon entry to this function, STMT_VINFO_VECTYPE records the vectype that
5914 indicates what is the actual level of parallelism (V8HI in the example), so
5915 that the right vectorization factor would be derived. This vectype
5916 corresponds to the type of arguments to the reduction stmt, and should *NOT*
5917 be used to create the vectorized stmt. The right vectype for the vectorized
5918 stmt is obtained from the type of the result X:
5919 get_vectype_for_scalar_type (vinfo, TREE_TYPE (X))
5921 This means that, contrary to "regular" reductions (or "regular" stmts in
5922 general), the following equation:
5923 STMT_VINFO_VECTYPE == get_vectype_for_scalar_type (vinfo, TREE_TYPE (X))
5924 does *NOT* necessarily hold for reduction patterns. */
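/* Illustrative sketch of the condition reduction described above,
   assuming value == 5, a = { 10, 3, 8, 1 } and a single vector
   iteration of four lanes:

     per-lane result vector      = { last, 3, last, 1 }   (last = old value)
     1-based index where matched = {    0, 2,    0, 4 }   (0 = no match)

   the epilogue reduces the index vector with a MAX, giving 4, and the
   corresponding lane of the result vector yields the value the scalar
   loop would have stored last, here 1.  */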
5926 bool
5927 vectorizable_reduction (stmt_vec_info stmt_info, slp_tree slp_node,
5928 slp_instance slp_node_instance,
5929 stmt_vector_for_cost *cost_vec)
5931 tree scalar_dest;
5932 tree vectype_in = NULL_TREE;
5933 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
5934 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
5935 enum vect_def_type cond_reduc_dt = vect_unknown_def_type;
5936 stmt_vec_info cond_stmt_vinfo = NULL;
5937 tree scalar_type;
5938 int i;
5939 int ncopies;
5940 bool single_defuse_cycle = false;
5941 bool nested_cycle = false;
5942 bool double_reduc = false;
5943 int vec_num;
5944 tree tem;
5945 tree cr_index_scalar_type = NULL_TREE, cr_index_vector_type = NULL_TREE;
5946 tree cond_reduc_val = NULL_TREE;
5948 /* Make sure it was already recognized as a reduction computation. */
5949 if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_reduction_def
5950 && STMT_VINFO_DEF_TYPE (stmt_info) != vect_double_reduction_def
5951 && STMT_VINFO_DEF_TYPE (stmt_info) != vect_nested_cycle)
5952 return false;
5954 /* The stmt we store reduction analysis meta on. */
5955 stmt_vec_info reduc_info = info_for_reduction (stmt_info);
5956 reduc_info->is_reduc_info = true;
5958 if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_nested_cycle)
5960 if (is_a <gphi *> (stmt_info->stmt))
5961 /* Analysis for double-reduction is done on the outer
5962 loop PHI; nested cycles have no further restrictions. */
5963 STMT_VINFO_TYPE (stmt_info) = cycle_phi_info_type;
5964 else
5965 STMT_VINFO_TYPE (stmt_info) = reduc_vec_info_type;
5966 return true;
5969 stmt_vec_info orig_stmt_of_analysis = stmt_info;
5970 stmt_vec_info phi_info = stmt_info;
5971 if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_reduction_def
5972 || STMT_VINFO_DEF_TYPE (stmt_info) == vect_double_reduction_def)
5974 if (!is_a <gphi *> (stmt_info->stmt))
5976 STMT_VINFO_TYPE (stmt_info) = reduc_vec_info_type;
5977 return true;
5979 if (slp_node)
5981 slp_node_instance->reduc_phis = slp_node;
5982 /* ??? We're leaving slp_node to point to the PHIs; we only
5983 need it to get at the number of vector stmts, which wasn't
5984 yet initialized for the instance root. */
5986 if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_reduction_def)
5987 stmt_info = vect_stmt_to_vectorize (STMT_VINFO_REDUC_DEF (stmt_info));
5988 else /* STMT_VINFO_DEF_TYPE (stmt_info) == vect_double_reduction_def */
5990 use_operand_p use_p;
5991 gimple *use_stmt;
5992 bool res = single_imm_use (gimple_phi_result (stmt_info->stmt),
5993 &use_p, &use_stmt);
5994 gcc_assert (res);
5995 phi_info = loop_vinfo->lookup_stmt (use_stmt);
5996 stmt_info = vect_stmt_to_vectorize (STMT_VINFO_REDUC_DEF (phi_info));
6000 /* PHIs should not participate in patterns. */
6001 gcc_assert (!STMT_VINFO_RELATED_STMT (phi_info));
6002 gphi *reduc_def_phi = as_a <gphi *> (phi_info->stmt);
6004 /* Verify following REDUC_IDX from the latch def leads us back to the PHI
6005 and compute the reduction chain length. */
6006 tree reduc_def = PHI_ARG_DEF_FROM_EDGE (reduc_def_phi,
6007 loop_latch_edge (loop));
6008 unsigned reduc_chain_length = 0;
6009 bool only_slp_reduc_chain = true;
6010 stmt_info = NULL;
6011 while (reduc_def != PHI_RESULT (reduc_def_phi))
6013 stmt_vec_info def = loop_vinfo->lookup_def (reduc_def);
6014 stmt_vec_info vdef = vect_stmt_to_vectorize (def);
6015 if (STMT_VINFO_REDUC_IDX (vdef) == -1)
6017 if (dump_enabled_p ())
6018 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6019 "reduction chain broken by patterns.\n");
6020 return false;
6022 if (!REDUC_GROUP_FIRST_ELEMENT (vdef))
6023 only_slp_reduc_chain = false;
6024 /* ??? For epilogue generation live members of the chain need
6025 to point back to the PHI via their original stmt for
6026 info_for_reduction to work. */
6027 if (STMT_VINFO_LIVE_P (vdef))
6028 STMT_VINFO_REDUC_DEF (def) = phi_info;
6029 if (CONVERT_EXPR_CODE_P (gimple_assign_rhs_code (vdef->stmt)))
6031 if (!tree_nop_conversion_p (TREE_TYPE (gimple_assign_lhs (vdef->stmt)),
6032 TREE_TYPE (gimple_assign_rhs1 (vdef->stmt))))
6034 if (dump_enabled_p ())
6035 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6036 "conversion in the reduction chain.\n");
6037 return false;
6040 else if (!stmt_info)
6041 /* First non-conversion stmt. */
6042 stmt_info = vdef;
6043 reduc_def = gimple_op (vdef->stmt, 1 + STMT_VINFO_REDUC_IDX (vdef));
6044 reduc_chain_length++;
6046 /* PHIs should not participate in patterns. */
6047 gcc_assert (!STMT_VINFO_RELATED_STMT (phi_info));
6049 if (nested_in_vect_loop_p (loop, stmt_info))
6051 loop = loop->inner;
6052 nested_cycle = true;
6055 /* STMT_VINFO_REDUC_DEF doesn't point to the first but the last
6056 element. */
6057 if (slp_node && REDUC_GROUP_FIRST_ELEMENT (stmt_info))
6059 gcc_assert (!REDUC_GROUP_NEXT_ELEMENT (stmt_info));
6060 stmt_info = REDUC_GROUP_FIRST_ELEMENT (stmt_info);
6062 if (REDUC_GROUP_FIRST_ELEMENT (stmt_info))
6063 gcc_assert (slp_node
6064 && REDUC_GROUP_FIRST_ELEMENT (stmt_info) == stmt_info);
6066 /* 1. Is vectorizable reduction? */
6067 /* Not supportable if the reduction variable is used in the loop, unless
6068 it's a reduction chain. */
6069 if (STMT_VINFO_RELEVANT (stmt_info) > vect_used_in_outer
6070 && !REDUC_GROUP_FIRST_ELEMENT (stmt_info))
6071 return false;
6073 /* Reductions that are not used even in an enclosing outer-loop
6074 are expected to be "live" (used out of the loop). */
6075 if (STMT_VINFO_RELEVANT (stmt_info) == vect_unused_in_scope
6076 && !STMT_VINFO_LIVE_P (stmt_info))
6077 return false;
6079 /* 2. Has this been recognized as a reduction pattern?
6081 Check if STMT represents a pattern that has been recognized
6082 in earlier analysis stages. For stmts that represent a pattern,
6083 the STMT_VINFO_RELATED_STMT field records the last stmt in
6084 the original sequence that constitutes the pattern. */
6086 stmt_vec_info orig_stmt_info = STMT_VINFO_RELATED_STMT (stmt_info);
6087 if (orig_stmt_info)
6089 gcc_assert (STMT_VINFO_IN_PATTERN_P (orig_stmt_info));
6090 gcc_assert (!STMT_VINFO_IN_PATTERN_P (stmt_info));
6093 /* 3. Check the operands of the operation. The first operands are defined
6094 inside the loop body. The last operand is the reduction variable,
6095 which is defined by the loop-header-phi. */
6097 tree vectype_out = STMT_VINFO_VECTYPE (stmt_info);
6098 STMT_VINFO_REDUC_VECTYPE (reduc_info) = vectype_out;
6099 gassign *stmt = as_a <gassign *> (stmt_info->stmt);
6100 enum tree_code code = gimple_assign_rhs_code (stmt);
6101 bool lane_reduc_code_p
6102 = (code == DOT_PROD_EXPR || code == WIDEN_SUM_EXPR || code == SAD_EXPR);
6103 int op_type = TREE_CODE_LENGTH (code);
6105 scalar_dest = gimple_assign_lhs (stmt);
6106 scalar_type = TREE_TYPE (scalar_dest);
6107 if (!POINTER_TYPE_P (scalar_type) && !INTEGRAL_TYPE_P (scalar_type)
6108 && !SCALAR_FLOAT_TYPE_P (scalar_type))
6109 return false;
6111 /* Do not try to vectorize bit-precision reductions. */
6112 if (!type_has_mode_precision_p (scalar_type))
6113 return false;
6115 /* For lane-reducing ops we're reducing the number of reduction PHIs,
6116 which means the only use of the PHI result may be in the lane-reducing operation. */
6117 if (lane_reduc_code_p
6118 && reduc_chain_length != 1
6119 && !only_slp_reduc_chain)
6121 if (dump_enabled_p ())
6122 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6123 "lane-reducing reduction with extra stmts.\n");
6124 return false;
6127 /* All uses but the last are expected to be defined in the loop.
6128 The last use is the reduction variable. In case of nested cycle this
6129 assumption is not true: we use reduc_index to record the index of the
6130 reduction variable. */
6131 reduc_def = PHI_RESULT (reduc_def_phi);
6132 for (i = 0; i < op_type; i++)
6134 tree op = gimple_op (stmt, i + 1);
6135 /* The condition of COND_EXPR is checked in vectorizable_condition(). */
6136 if (i == 0 && code == COND_EXPR)
6137 continue;
6139 stmt_vec_info def_stmt_info;
6140 enum vect_def_type dt;
6141 if (!vect_is_simple_use (op, loop_vinfo, &dt, &tem,
6142 &def_stmt_info))
6144 if (dump_enabled_p ())
6145 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6146 "use not simple.\n");
6147 return false;
6149 if (i == STMT_VINFO_REDUC_IDX (stmt_info))
6150 continue;
6152 /* There should be only one cycle def in the stmt, the one
6153 leading to reduc_def. */
6154 if (VECTORIZABLE_CYCLE_DEF (dt))
6155 return false;
6157 /* To properly compute ncopies we are interested in the widest
6158 non-reduction input type in case we're looking at a widening
6159 accumulation that we later handle in vect_transform_reduction. */
6160 if (lane_reduc_code_p
6161 && tem
6162 && (!vectype_in
6163 || (GET_MODE_SIZE (SCALAR_TYPE_MODE (TREE_TYPE (vectype_in)))
6164 < GET_MODE_SIZE (SCALAR_TYPE_MODE (TREE_TYPE (tem))))))
6165 vectype_in = tem;
6167 if (code == COND_EXPR)
6169 /* Record how the non-reduction-def value of COND_EXPR is defined. */
6170 if (dt == vect_constant_def)
6172 cond_reduc_dt = dt;
6173 cond_reduc_val = op;
6175 if (dt == vect_induction_def
6176 && def_stmt_info
6177 && is_nonwrapping_integer_induction (def_stmt_info, loop))
6179 cond_reduc_dt = dt;
6180 cond_stmt_vinfo = def_stmt_info;
6184 if (!vectype_in)
6185 vectype_in = STMT_VINFO_VECTYPE (phi_info);
6186 STMT_VINFO_REDUC_VECTYPE_IN (reduc_info) = vectype_in;
6188 enum vect_reduction_type v_reduc_type = STMT_VINFO_REDUC_TYPE (phi_info);
6189 STMT_VINFO_REDUC_TYPE (reduc_info) = v_reduc_type;
6190 /* If we have a condition reduction, see if we can simplify it further. */
6191 if (v_reduc_type == COND_REDUCTION)
6193 if (slp_node)
6194 return false;
6196 /* When the condition uses the reduction value in the condition, fail. */
6197 if (STMT_VINFO_REDUC_IDX (stmt_info) == 0)
6199 if (dump_enabled_p ())
6200 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6201 "condition depends on previous iteration\n");
6202 return false;
6205 if (reduc_chain_length == 1
6206 && direct_internal_fn_supported_p (IFN_FOLD_EXTRACT_LAST,
6207 vectype_in, OPTIMIZE_FOR_SPEED))
6209 if (dump_enabled_p ())
6210 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6211 "optimizing condition reduction with"
6212 " FOLD_EXTRACT_LAST.\n");
6213 STMT_VINFO_REDUC_TYPE (reduc_info) = EXTRACT_LAST_REDUCTION;
6215 else if (cond_reduc_dt == vect_induction_def)
6217 tree base
6218 = STMT_VINFO_LOOP_PHI_EVOLUTION_BASE_UNCHANGED (cond_stmt_vinfo);
6219 tree step = STMT_VINFO_LOOP_PHI_EVOLUTION_PART (cond_stmt_vinfo);
6221 gcc_assert (TREE_CODE (base) == INTEGER_CST
6222 && TREE_CODE (step) == INTEGER_CST);
6223 cond_reduc_val = NULL_TREE;
6224 enum tree_code cond_reduc_op_code = ERROR_MARK;
6225 tree res = PHI_RESULT (STMT_VINFO_STMT (cond_stmt_vinfo));
6226 if (!types_compatible_p (TREE_TYPE (res), TREE_TYPE (base)))
6228 /* Find a suitable value: below base for MAX_EXPR, above base for
6229 MIN_EXPR; for now punt if base is the minimum value of the type for
6230 MAX_EXPR or the maximum value of the type for MIN_EXPR. */
6231 else if (tree_int_cst_sgn (step) == -1)
6233 cond_reduc_op_code = MIN_EXPR;
6234 if (tree_int_cst_sgn (base) == -1)
6235 cond_reduc_val = build_int_cst (TREE_TYPE (base), 0);
6236 else if (tree_int_cst_lt (base,
6237 TYPE_MAX_VALUE (TREE_TYPE (base))))
6238 cond_reduc_val
6239 = int_const_binop (PLUS_EXPR, base, integer_one_node);
6241 else
6243 cond_reduc_op_code = MAX_EXPR;
6244 if (tree_int_cst_sgn (base) == 1)
6245 cond_reduc_val = build_int_cst (TREE_TYPE (base), 0);
6246 else if (tree_int_cst_lt (TYPE_MIN_VALUE (TREE_TYPE (base)),
6247 base))
6248 cond_reduc_val
6249 = int_const_binop (MINUS_EXPR, base, integer_one_node);
6251 if (cond_reduc_val)
6253 if (dump_enabled_p ())
6254 dump_printf_loc (MSG_NOTE, vect_location,
6255 "condition expression based on "
6256 "integer induction.\n");
6257 STMT_VINFO_REDUC_CODE (reduc_info) = cond_reduc_op_code;
6258 STMT_VINFO_VEC_INDUC_COND_INITIAL_VAL (reduc_info)
6259 = cond_reduc_val;
6260 STMT_VINFO_REDUC_TYPE (reduc_info) = INTEGER_INDUC_COND_REDUCTION;
6263 else if (cond_reduc_dt == vect_constant_def)
6265 enum vect_def_type cond_initial_dt;
6266 tree cond_initial_val
6267 = PHI_ARG_DEF_FROM_EDGE (reduc_def_phi, loop_preheader_edge (loop));
6269 gcc_assert (cond_reduc_val != NULL_TREE);
6270 vect_is_simple_use (cond_initial_val, loop_vinfo, &cond_initial_dt);
6271 if (cond_initial_dt == vect_constant_def
6272 && types_compatible_p (TREE_TYPE (cond_initial_val),
6273 TREE_TYPE (cond_reduc_val)))
6275 tree e = fold_binary (LE_EXPR, boolean_type_node,
6276 cond_initial_val, cond_reduc_val);
6277 if (e && (integer_onep (e) || integer_zerop (e)))
6279 if (dump_enabled_p ())
6280 dump_printf_loc (MSG_NOTE, vect_location,
6281 "condition expression based on "
6282 "compile time constant.\n");
6283 /* Record reduction code at analysis stage. */
6284 STMT_VINFO_REDUC_CODE (reduc_info)
6285 = integer_onep (e) ? MAX_EXPR : MIN_EXPR;
6286 STMT_VINFO_REDUC_TYPE (reduc_info) = CONST_COND_REDUCTION;
6292 if (STMT_VINFO_LIVE_P (phi_info))
6293 return false;
6295 if (slp_node)
6296 ncopies = 1;
6297 else
6298 ncopies = vect_get_num_copies (loop_vinfo, vectype_in);
6300 gcc_assert (ncopies >= 1);
6302 poly_uint64 nunits_out = TYPE_VECTOR_SUBPARTS (vectype_out);
6304 if (nested_cycle)
6306 gcc_assert (STMT_VINFO_DEF_TYPE (reduc_info)
6307 == vect_double_reduction_def);
6308 double_reduc = true;
6311 /* 4.2. Check support for the epilog operation.
6313 If STMT represents a reduction pattern, then the type of the
6314 reduction variable may be different than the type of the rest
6315 of the arguments. For example, consider the case of accumulation
6316 of shorts into an int accumulator. The original code:
6317 S1: int_a = (int) short_a;
6318 orig_stmt-> S2: int_acc = plus <int_a ,int_acc>;
6320 was replaced with:
6321 STMT: int_acc = widen_sum <short_a, int_acc>
6323 This means that:
6324 1. The tree-code that is used to create the vector operation in the
6325 epilog code (that reduces the partial results) is not the
6326 tree-code of STMT, but is rather the tree-code of the original
6327 stmt from the pattern that STMT is replacing. I.e, in the example
6328 above we want to use 'widen_sum' in the loop, but 'plus' in the
6329 epilog.
6330 2. The type (mode) we use to check available target support
6331 for the vector operation to be created in the *epilog*, is
6332 determined by the type of the reduction variable (in the example
6333 above we'd check this: optab_handler (plus_optab, vect_int_mode)).
6334 However the type (mode) we use to check available target support
6335 for the vector operation to be created *inside the loop*, is
6336 determined by the type of the other arguments to STMT (in the
6337 example we'd check this: optab_handler (widen_sum_optab,
6338 vect_short_mode)).
6340 This is contrary to "regular" reductions, in which the types of all
6341 the arguments are the same as the type of the reduction variable.
6342 For "regular" reductions we can therefore use the same vector type
6343 (and also the same tree-code) when generating the epilog code and
6344 when generating the code inside the loop. */
6346 enum tree_code orig_code = STMT_VINFO_REDUC_CODE (phi_info);
6347 STMT_VINFO_REDUC_CODE (reduc_info) = orig_code;
6349 vect_reduction_type reduction_type = STMT_VINFO_REDUC_TYPE (reduc_info);
6350 if (reduction_type == TREE_CODE_REDUCTION)
6352 /* Check whether it's ok to change the order of the computation.
6353 Generally, when vectorizing a reduction we change the order of the
6354 computation. This may change the behavior of the program in some
6355 cases, so we need to check that this is ok. One exception is when
6356 vectorizing an outer-loop: the inner-loop is executed sequentially,
6357 and therefore vectorizing reductions in the inner-loop during
6358 outer-loop vectorization is safe. */
6359 if (needs_fold_left_reduction_p (scalar_type, orig_code))
6361 /* When vectorizing a reduction chain w/o SLP the reduction PHI
6362 is not directly used in stmt. */
6363 if (!only_slp_reduc_chain
6364 && reduc_chain_length != 1)
6366 if (dump_enabled_p ())
6367 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6368 "in-order reduction chain without SLP.\n");
6369 return false;
6371 STMT_VINFO_REDUC_TYPE (reduc_info)
6372 = reduction_type = FOLD_LEFT_REDUCTION;
6374 else if (!commutative_tree_code (orig_code)
6375 || !associative_tree_code (orig_code))
6377 if (dump_enabled_p ())
6378 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6379 "reduction: not commutative/associative");
6380 return false;
6384 if ((double_reduc || reduction_type != TREE_CODE_REDUCTION)
6385 && ncopies > 1)
6387 if (dump_enabled_p ())
6388 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6389 "multiple types in double reduction or condition "
6390 "reduction or fold-left reduction.\n");
6391 return false;
6394 internal_fn reduc_fn = IFN_LAST;
6395 if (reduction_type == TREE_CODE_REDUCTION
6396 || reduction_type == FOLD_LEFT_REDUCTION
6397 || reduction_type == INTEGER_INDUC_COND_REDUCTION
6398 || reduction_type == CONST_COND_REDUCTION)
6400 if (reduction_type == FOLD_LEFT_REDUCTION
6401 ? fold_left_reduction_fn (orig_code, &reduc_fn)
6402 : reduction_fn_for_scalar_code (orig_code, &reduc_fn))
6404 if (reduc_fn != IFN_LAST
6405 && !direct_internal_fn_supported_p (reduc_fn, vectype_out,
6406 OPTIMIZE_FOR_SPEED))
6408 if (dump_enabled_p ())
6409 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6410 "reduc op not supported by target.\n");
6412 reduc_fn = IFN_LAST;
6415 else
6417 if (!nested_cycle || double_reduc)
6419 if (dump_enabled_p ())
6420 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6421 "no reduc code for scalar code.\n");
6423 return false;
6427 else if (reduction_type == COND_REDUCTION)
6429 int scalar_precision
6430 = GET_MODE_PRECISION (SCALAR_TYPE_MODE (scalar_type));
6431 cr_index_scalar_type = make_unsigned_type (scalar_precision);
6432 cr_index_vector_type = build_vector_type (cr_index_scalar_type,
6433 nunits_out);
6435 if (direct_internal_fn_supported_p (IFN_REDUC_MAX, cr_index_vector_type,
6436 OPTIMIZE_FOR_SPEED))
6437 reduc_fn = IFN_REDUC_MAX;
6439 STMT_VINFO_REDUC_FN (reduc_info) = reduc_fn;
6441 if (reduction_type != EXTRACT_LAST_REDUCTION
6442 && (!nested_cycle || double_reduc)
6443 && reduc_fn == IFN_LAST
6444 && !nunits_out.is_constant ())
6446 if (dump_enabled_p ())
6447 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6448 "missing target support for reduction on"
6449 " variable-length vectors.\n");
6450 return false;
6453 /* For SLP reductions, see if there is a neutral value we can use. */
6454 tree neutral_op = NULL_TREE;
6455 if (slp_node)
6456 neutral_op = neutral_op_for_slp_reduction
6457 (slp_node_instance->reduc_phis, vectype_out, orig_code,
6458 REDUC_GROUP_FIRST_ELEMENT (stmt_info) != NULL);
6460 if (double_reduc && reduction_type == FOLD_LEFT_REDUCTION)
6462 /* We can't support in-order reductions of code such as this:
6464 for (int i = 0; i < n1; ++i)
6465 for (int j = 0; j < n2; ++j)
6466 l += a[j];
6468 since GCC effectively transforms the loop when vectorizing:
6470 for (int i = 0; i < n1 / VF; ++i)
6471 for (int j = 0; j < n2; ++j)
6472 for (int k = 0; k < VF; ++k)
6473 l += a[j];
6475 which is a reassociation of the original operation. */
6476 if (dump_enabled_p ())
6477 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6478 "in-order double reduction not supported.\n");
6480 return false;
6483 if (reduction_type == FOLD_LEFT_REDUCTION
6484 && slp_node
6485 && !REDUC_GROUP_FIRST_ELEMENT (stmt_info))
6487 /* We cannot use in-order reductions in this case because there is
6488 an implicit reassociation of the operations involved. */
6489 if (dump_enabled_p ())
6490 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6491 "in-order unchained SLP reductions not supported.\n");
6492 return false;
6495 /* For double reductions, and for SLP reductions with a neutral value,
6496 we construct a variable-length initial vector by loading a vector
6497 full of the neutral value and then shift-and-inserting the start
6498 values into the low-numbered elements. */
6499 if ((double_reduc || neutral_op)
6500 && !nunits_out.is_constant ()
6501 && !direct_internal_fn_supported_p (IFN_VEC_SHL_INSERT,
6502 vectype_out, OPTIMIZE_FOR_SPEED))
6504 if (dump_enabled_p ())
6505 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6506 "reduction on variable-length vectors requires"
6507 " target support for a vector-shift-and-insert"
6508 " operation.\n");
6509 return false;
6512 /* Check extra constraints for variable-length unchained SLP reductions. */
6513 if (STMT_SLP_TYPE (stmt_info)
6514 && !REDUC_GROUP_FIRST_ELEMENT (stmt_info)
6515 && !nunits_out.is_constant ())
6517 /* We checked above that we could build the initial vector when
6518 there's a neutral element value. Check here for the case in
6519 which each SLP statement has its own initial value and in which
6520 that value needs to be repeated for every instance of the
6521 statement within the initial vector. */
6522 unsigned int group_size = SLP_INSTANCE_GROUP_SIZE (slp_node_instance);
6523 if (!neutral_op
6524 && !can_duplicate_and_interleave_p (loop_vinfo, group_size,
6525 TREE_TYPE (vectype_out)))
6527 if (dump_enabled_p ())
6528 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6529 "unsupported form of SLP reduction for"
6530 " variable-length vectors: cannot build"
6531 " initial vector.\n");
6532 return false;
6534 /* The epilogue code relies on the number of elements being a multiple
6535 of the group size. The duplicate-and-interleave approach to setting
6536 up the initial vector does too. */
6537 if (!multiple_p (nunits_out, group_size))
6539 if (dump_enabled_p ())
6540 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6541 "unsupported form of SLP reduction for"
6542 " variable-length vectors: the vector size"
6543 " is not a multiple of the number of results.\n");
6544 return false;
6548 if (reduction_type == COND_REDUCTION)
6550 widest_int ni;
6552 if (! max_loop_iterations (loop, &ni))
6554 if (dump_enabled_p ())
6555 dump_printf_loc (MSG_NOTE, vect_location,
6556 "loop count not known, cannot create cond "
6557 "reduction.\n");
6558 return false;
6560 /* Convert backedges to iterations. */
6561 ni += 1;
6563 /* The additional index will be the same type as the condition. Check
6564 that the loop count fits into this type less one (because we'll use up
6565 the zero slot for when there are no matches). */
6566 tree max_index = TYPE_MAX_VALUE (cr_index_scalar_type);
6567 if (wi::geu_p (ni, wi::to_widest (max_index)))
6569 if (dump_enabled_p ())
6570 dump_printf_loc (MSG_NOTE, vect_location,
6571 "loop size is greater than data size.\n");
6572 return false;
6576 /* In case the vectorization factor (VF) is bigger than the number
6577 of elements that we can fit in a vectype (nunits), we have to generate
6578 more than one vector stmt, i.e. we need to "unroll" the
6579 vector stmt by a factor VF/nunits. For more details see documentation
6580 in vectorizable_operation. */
6582 /* If the reduction is used in an outer loop we need to generate
6583 VF intermediate results, like so (e.g. for ncopies=2):
6584 r0 = phi (init, r0)
6585 r1 = phi (init, r1)
6586 r0 = x0 + r0;
6587 r1 = x1 + r1;
6588 (i.e. we generate VF results in 2 registers).
6589 In this case we have a separate def-use cycle for each copy, and therefore
6590 for each copy we get the vector def for the reduction variable from the
6591 respective phi node created for this copy.
6593 Otherwise (the reduction is unused in the loop nest), we can combine
6594 together intermediate results, like so (e.g. for ncopies=2):
6595 r = phi (init, r)
6596 r = x0 + r;
6597 r = x1 + r;
6598 (i.e. we generate VF/2 results in a single register).
6599 In this case for each copy we get the vector def for the reduction variable
6600 from the vectorized reduction operation generated in the previous iteration.
6602 This only works when we see both the reduction PHI and its only consumer
6603 in vectorizable_reduction and there are no intermediate stmts
6604 participating. */
6605 if (ncopies > 1
6606 && (STMT_VINFO_RELEVANT (stmt_info) <= vect_used_only_live)
6607 && reduc_chain_length == 1)
6608 single_defuse_cycle = true;
6610 if (single_defuse_cycle || lane_reduc_code_p)
6612 gcc_assert (code != COND_EXPR);
6614 /* 4. Supportable by target? */
6615 bool ok = true;
6617 /* 4.1. check support for the operation in the loop */
6618 optab optab = optab_for_tree_code (code, vectype_in, optab_vector);
6619 if (!optab)
6621 if (dump_enabled_p ())
6622 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6623 "no optab.\n");
6624 ok = false;
6627 machine_mode vec_mode = TYPE_MODE (vectype_in);
6628 if (ok && optab_handler (optab, vec_mode) == CODE_FOR_nothing)
6630 if (dump_enabled_p ())
6631 dump_printf (MSG_NOTE, "op not supported by target.\n");
6632 if (maybe_ne (GET_MODE_SIZE (vec_mode), UNITS_PER_WORD)
6633 || !vect_worthwhile_without_simd_p (loop_vinfo, code))
6634 ok = false;
6635 else
6636 if (dump_enabled_p ())
6637 dump_printf (MSG_NOTE, "proceeding using word mode.\n");
6640 /* Worthwhile without SIMD support? */
6641 if (ok
6642 && !VECTOR_MODE_P (TYPE_MODE (vectype_in))
6643 && !vect_worthwhile_without_simd_p (loop_vinfo, code))
6645 if (dump_enabled_p ())
6646 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6647 "not worthwhile without SIMD support.\n");
6648 ok = false;
6651 /* lane-reducing operations have to go through vect_transform_reduction.
6652 For the other cases try without the single cycle optimization. */
6653 if (!ok)
6655 if (lane_reduc_code_p)
6656 return false;
6657 else
6658 single_defuse_cycle = false;
6661 STMT_VINFO_FORCE_SINGLE_CYCLE (reduc_info) = single_defuse_cycle;
6663 /* If the reduction stmt is one of the patterns that have lane
6664 reduction embedded, we cannot handle the case of ! single_defuse_cycle. */
6665 if ((ncopies > 1 && ! single_defuse_cycle)
6666 && lane_reduc_code_p)
6668 if (dump_enabled_p ())
6669 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6670 "multi def-use cycle not possible for lane-reducing "
6671 "reduction operation\n");
6672 return false;
6675 if (slp_node)
6676 vec_num = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node);
6677 else
6678 vec_num = 1;
6680 vect_model_reduction_cost (stmt_info, reduc_fn, reduction_type, ncopies,
6681 cost_vec);
6682 if (dump_enabled_p ()
6683 && reduction_type == FOLD_LEFT_REDUCTION)
6684 dump_printf_loc (MSG_NOTE, vect_location,
6685 "using an in-order (fold-left) reduction.\n");
6686 STMT_VINFO_TYPE (orig_stmt_of_analysis) = cycle_phi_info_type;
6687 /* All but single defuse-cycle optimized, lane-reducing and fold-left
6688 reductions go through their own vectorizable_* routines. */
6689 if (!single_defuse_cycle
6690 && code != DOT_PROD_EXPR
6691 && code != WIDEN_SUM_EXPR
6692 && code != SAD_EXPR
6693 && reduction_type != FOLD_LEFT_REDUCTION)
6695 stmt_vec_info tem
6696 = vect_stmt_to_vectorize (STMT_VINFO_REDUC_DEF (phi_info));
6697 if (slp_node && REDUC_GROUP_FIRST_ELEMENT (tem))
6699 gcc_assert (!REDUC_GROUP_NEXT_ELEMENT (tem));
6700 tem = REDUC_GROUP_FIRST_ELEMENT (tem);
6702 STMT_VINFO_DEF_TYPE (vect_orig_stmt (tem)) = vect_internal_def;
6703 STMT_VINFO_DEF_TYPE (tem) = vect_internal_def;
6705 else if (loop_vinfo && LOOP_VINFO_CAN_FULLY_MASK_P (loop_vinfo))
6707 vec_loop_masks *masks = &LOOP_VINFO_MASKS (loop_vinfo);
6708 internal_fn cond_fn = get_conditional_internal_fn (code);
6710 if (reduction_type != FOLD_LEFT_REDUCTION
6711 && !use_mask_by_cond_expr_p (code, cond_fn, vectype_in)
6712 && (cond_fn == IFN_LAST
6713 || !direct_internal_fn_supported_p (cond_fn, vectype_in,
6714 OPTIMIZE_FOR_SPEED)))
6716 if (dump_enabled_p ())
6717 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6718 "can't use a fully-masked loop because no"
6719 " conditional operation is available.\n");
6720 LOOP_VINFO_CAN_FULLY_MASK_P (loop_vinfo) = false;
6722 else if (reduction_type == FOLD_LEFT_REDUCTION
6723 && reduc_fn == IFN_LAST
6724 && !expand_vec_cond_expr_p (vectype_in,
6725 truth_type_for (vectype_in),
6726 SSA_NAME))
6728 if (dump_enabled_p ())
6729 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6730 "can't use a fully-masked loop because no"
6731 " conditional operation is available.\n");
6732 LOOP_VINFO_CAN_FULLY_MASK_P (loop_vinfo) = false;
6734 else
6735 vect_record_loop_mask (loop_vinfo, masks, ncopies * vec_num,
6736 vectype_in, NULL);
6738 return true;
6741 /* Transform the definition stmt STMT_INFO of a reduction PHI backedge
6742 value. */
6744 bool
6745 vect_transform_reduction (stmt_vec_info stmt_info, gimple_stmt_iterator *gsi,
6746 stmt_vec_info *vec_stmt, slp_tree slp_node)
6748 tree vectype_out = STMT_VINFO_VECTYPE (stmt_info);
6749 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
6750 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
6751 int i;
6752 int ncopies;
6753 int j;
6754 int vec_num;
6756 stmt_vec_info reduc_info = info_for_reduction (stmt_info);
6757 gcc_assert (reduc_info->is_reduc_info);
6759 if (nested_in_vect_loop_p (loop, stmt_info))
6761 loop = loop->inner;
6762 gcc_assert (STMT_VINFO_DEF_TYPE (reduc_info) == vect_double_reduction_def);
6765 gassign *stmt = as_a <gassign *> (stmt_info->stmt);
6766 enum tree_code code = gimple_assign_rhs_code (stmt);
6767 int op_type = TREE_CODE_LENGTH (code);
6769 /* Flatten RHS. */
6770 tree ops[3];
6771 switch (get_gimple_rhs_class (code))
6773 case GIMPLE_TERNARY_RHS:
6774 ops[2] = gimple_assign_rhs3 (stmt);
6775 /* Fall thru. */
6776 case GIMPLE_BINARY_RHS:
6777 ops[0] = gimple_assign_rhs1 (stmt);
6778 ops[1] = gimple_assign_rhs2 (stmt);
6779 break;
6780 default:
6781 gcc_unreachable ();
6784 /* All uses but the last are expected to be defined in the loop.
6785 The last use is the reduction variable. In case of nested cycle this
6786 assumption is not true: we use reduc_index to record the index of the
6787 reduction variable. */
6788 stmt_vec_info phi_info = STMT_VINFO_REDUC_DEF (vect_orig_stmt (stmt_info));
6789 gphi *reduc_def_phi = as_a <gphi *> (phi_info->stmt);
6790 int reduc_index = STMT_VINFO_REDUC_IDX (stmt_info);
6791 tree vectype_in = STMT_VINFO_REDUC_VECTYPE_IN (reduc_info);
6793 if (slp_node)
6795 ncopies = 1;
6796 vec_num = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node);
6798 else
6800 ncopies = vect_get_num_copies (loop_vinfo, vectype_in);
6801 vec_num = 1;
6804 internal_fn cond_fn = get_conditional_internal_fn (code);
6805 vec_loop_masks *masks = &LOOP_VINFO_MASKS (loop_vinfo);
6806 bool mask_by_cond_expr = use_mask_by_cond_expr_p (code, cond_fn, vectype_in);
6808 /* Transform. */
6809 stmt_vec_info new_stmt_info = NULL;
6810 stmt_vec_info prev_stmt_info;
6811 tree new_temp = NULL_TREE;
6812 auto_vec<tree> vec_oprnds0;
6813 auto_vec<tree> vec_oprnds1;
6814 auto_vec<tree> vec_oprnds2;
6815 tree def0;
6817 if (dump_enabled_p ())
6818 dump_printf_loc (MSG_NOTE, vect_location, "transform reduction.\n");
6820 /* FORNOW: Multiple types are not supported for condition. */
6821 if (code == COND_EXPR)
6822 gcc_assert (ncopies == 1);
6824 bool masked_loop_p = LOOP_VINFO_FULLY_MASKED_P (loop_vinfo);
6826 vect_reduction_type reduction_type = STMT_VINFO_REDUC_TYPE (reduc_info);
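/* Illustrative note: a FOLD_LEFT_REDUCTION keeps the scalar evaluation
order, accumulating the vector elements one lane at a time
(acc = acc OP v[0]; acc = acc OP v[1]; ...), which is why it is handed
off to a dedicated routine below rather than the generic path.  */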
6827 if (reduction_type == FOLD_LEFT_REDUCTION)
6829 internal_fn reduc_fn = STMT_VINFO_REDUC_FN (reduc_info);
6830 return vectorize_fold_left_reduction
6831 (stmt_info, gsi, vec_stmt, slp_node, reduc_def_phi, code,
6832 reduc_fn, ops, vectype_in, reduc_index, masks);
6835 bool single_defuse_cycle = STMT_VINFO_FORCE_SINGLE_CYCLE (reduc_info);
6836 gcc_assert (single_defuse_cycle
6837 || code == DOT_PROD_EXPR
6838 || code == WIDEN_SUM_EXPR
6839 || code == SAD_EXPR);
6841 /* Create the destination vector */
6842 tree scalar_dest = gimple_assign_lhs (stmt);
6843 tree vec_dest = vect_create_destination_var (scalar_dest, vectype_out);
6845 prev_stmt_info = NULL;
6846 if (!slp_node)
6848 vec_oprnds0.create (1);
6849 vec_oprnds1.create (1);
6850 if (op_type == ternary_op)
6851 vec_oprnds2.create (1);
6854 for (j = 0; j < ncopies; j++)
6856 /* Handle uses. */
6857 if (j == 0)
6859 if (slp_node)
6861 /* Get vec defs for all the operands except the reduction index,
6862 ensuring the ordering of the ops in the vector is kept. */
6863 auto_vec<vec<tree>, 3> vec_defs;
6864 vect_get_slp_defs (slp_node, &vec_defs);
6865 vec_oprnds0.safe_splice (vec_defs[0]);
6866 vec_defs[0].release ();
6867 vec_oprnds1.safe_splice (vec_defs[1]);
6868 vec_defs[1].release ();
6869 if (op_type == ternary_op)
6871 vec_oprnds2.safe_splice (vec_defs[2]);
6872 vec_defs[2].release ();
6875 else
6877 vec_oprnds0.quick_push
6878 (vect_get_vec_def_for_operand (ops[0], stmt_info));
6879 vec_oprnds1.quick_push
6880 (vect_get_vec_def_for_operand (ops[1], stmt_info));
6881 if (op_type == ternary_op)
6882 vec_oprnds2.quick_push
6883 (vect_get_vec_def_for_operand (ops[2], stmt_info));
6886 else
6888 if (!slp_node)
6890 gcc_assert (reduc_index != -1 || ! single_defuse_cycle);
6892 if (single_defuse_cycle && reduc_index == 0)
6893 vec_oprnds0[0] = gimple_get_lhs (new_stmt_info->stmt);
6894 else
6895 vec_oprnds0[0]
6896 = vect_get_vec_def_for_stmt_copy (loop_vinfo,
6897 vec_oprnds0[0]);
6898 if (single_defuse_cycle && reduc_index == 1)
6899 vec_oprnds1[0] = gimple_get_lhs (new_stmt_info->stmt);
6900 else
6901 vec_oprnds1[0]
6902 = vect_get_vec_def_for_stmt_copy (loop_vinfo,
6903 vec_oprnds1[0]);
6904 if (op_type == ternary_op)
6906 if (single_defuse_cycle && reduc_index == 2)
6907 vec_oprnds2[0] = gimple_get_lhs (new_stmt_info->stmt);
6908 else
6909 vec_oprnds2[0]
6910 = vect_get_vec_def_for_stmt_copy (loop_vinfo,
6911 vec_oprnds2[0]);
6916 FOR_EACH_VEC_ELT (vec_oprnds0, i, def0)
6918 tree vop[3] = { def0, vec_oprnds1[i], NULL_TREE };
6919 if (masked_loop_p && !mask_by_cond_expr)
6921 /* Make sure that the reduction accumulator is vop[0]. */
6922 if (reduc_index == 1)
6924 gcc_assert (commutative_tree_code (code));
6925 std::swap (vop[0], vop[1]);
6927 tree mask = vect_get_loop_mask (gsi, masks, vec_num * ncopies,
6928 vectype_in, i * ncopies + j);
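/* A sketch of what this emits, target permitting: for code == PLUS_EXPR,
cond_fn is IFN_COND_ADD and the call built below is
new_temp = .COND_ADD (mask, vop[0], vop[1], vop[0]),
i.e. lanes switched off by the loop mask keep the accumulator value.  */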
6929 gcall *call = gimple_build_call_internal (cond_fn, 4, mask,
6930 vop[0], vop[1],
6931 vop[0]);
6932 new_temp = make_ssa_name (vec_dest, call);
6933 gimple_call_set_lhs (call, new_temp);
6934 gimple_call_set_nothrow (call, true);
6935 new_stmt_info
6936 = vect_finish_stmt_generation (stmt_info, call, gsi);
6938 else
6940 if (op_type == ternary_op)
6941 vop[2] = vec_oprnds2[i];
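/* Rough sketch of the alternative masking strategy used just below:
instead of a conditional internal function, one of the operands is
rewritten with a VEC_COND_EXPR on the loop mask (e.g. selecting zero
for inactive lanes of a DOT_PROD_EXPR), so masked-off lanes contribute
nothing to the accumulator.  */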
6943 if (masked_loop_p && mask_by_cond_expr)
6945 tree mask = vect_get_loop_mask (gsi, masks,
6946 vec_num * ncopies,
6947 vectype_in, i * ncopies + j);
6948 build_vect_cond_expr (code, vop, mask, gsi);
6951 gassign *new_stmt = gimple_build_assign (vec_dest, code,
6952 vop[0], vop[1], vop[2]);
6953 new_temp = make_ssa_name (vec_dest, new_stmt);
6954 gimple_assign_set_lhs (new_stmt, new_temp);
6955 new_stmt_info
6956 = vect_finish_stmt_generation (stmt_info, new_stmt, gsi);
6959 if (slp_node)
6960 SLP_TREE_VEC_STMTS (slp_node).quick_push (new_stmt_info);
6963 if (slp_node || single_defuse_cycle)
6964 continue;
6966 if (j == 0)
6967 STMT_VINFO_VEC_STMT (stmt_info) = *vec_stmt = new_stmt_info;
6968 else
6969 STMT_VINFO_RELATED_STMT (prev_stmt_info) = new_stmt_info;
6971 prev_stmt_info = new_stmt_info;
6974 if (single_defuse_cycle && !slp_node)
6975 STMT_VINFO_VEC_STMT (stmt_info) = *vec_stmt = new_stmt_info;
6977 return true;
6980 /* Transform phase of a cycle PHI. */
6982 bool
6983 vect_transform_cycle_phi (stmt_vec_info stmt_info, stmt_vec_info *vec_stmt,
6984 slp_tree slp_node, slp_instance slp_node_instance)
6986 tree vectype_out = STMT_VINFO_VECTYPE (stmt_info);
6987 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
6988 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
6989 int i;
6990 int ncopies;
6991 stmt_vec_info prev_phi_info;
6992 int j;
6993 bool nested_cycle = false;
6994 int vec_num;
6996 if (nested_in_vect_loop_p (loop, stmt_info))
6998 loop = loop->inner;
6999 nested_cycle = true;
7002 stmt_vec_info reduc_stmt_info = STMT_VINFO_REDUC_DEF (stmt_info);
7003 reduc_stmt_info = vect_stmt_to_vectorize (reduc_stmt_info);
7004 stmt_vec_info reduc_info = info_for_reduction (stmt_info);
7005 gcc_assert (reduc_info->is_reduc_info);
7007 if (STMT_VINFO_REDUC_TYPE (reduc_info) == EXTRACT_LAST_REDUCTION
7008 || STMT_VINFO_REDUC_TYPE (reduc_info) == FOLD_LEFT_REDUCTION)
7009 /* Leave the scalar phi in place. */
7010 return true;
7012 tree vectype_in = STMT_VINFO_REDUC_VECTYPE_IN (reduc_info);
7013 /* For a nested cycle we do not fill the above. */
7014 if (!vectype_in)
7015 vectype_in = STMT_VINFO_VECTYPE (stmt_info);
7016 gcc_assert (vectype_in);
7018 if (slp_node)
7020 /* The size vect_schedule_slp_instance computes is off for us. */
7021 vec_num = vect_get_num_vectors
7022 (LOOP_VINFO_VECT_FACTOR (loop_vinfo)
7023 * SLP_TREE_SCALAR_STMTS (slp_node).length (), vectype_in);
7024 ncopies = 1;
7026 else
7028 vec_num = 1;
7029 ncopies = vect_get_num_copies (loop_vinfo, vectype_in);
7032 /* Check whether we should use a single PHI node and accumulate
7033 vectors to one before the backedge. */
7034 if (STMT_VINFO_FORCE_SINGLE_CYCLE (reduc_info))
7035 ncopies = 1;
7037 /* Create the destination vector */
7038 gphi *phi = as_a <gphi *> (stmt_info->stmt);
7039 tree vec_dest = vect_create_destination_var (gimple_phi_result (phi),
7040 vectype_out);
7042 /* Get the loop-entry arguments. */
7043 tree vec_initial_def;
7044 auto_vec<tree> vec_initial_defs;
7045 if (slp_node)
7047 vec_initial_defs.reserve (vec_num);
7048 gcc_assert (slp_node == slp_node_instance->reduc_phis);
7049 stmt_vec_info first = REDUC_GROUP_FIRST_ELEMENT (reduc_stmt_info);
7050 tree neutral_op
7051 = neutral_op_for_slp_reduction (slp_node, vectype_out,
7052 STMT_VINFO_REDUC_CODE (reduc_info),
7053 first != NULL);
7054 get_initial_defs_for_reduction (slp_node_instance->reduc_phis,
7055 &vec_initial_defs, vec_num,
7056 first != NULL, neutral_op);
7058 else
7060 /* Get at the scalar def before the loop; it defines the initial
7061 value of the reduction variable. */
7062 tree initial_def = PHI_ARG_DEF_FROM_EDGE (phi,
7063 loop_preheader_edge (loop));
7064 /* Optimize: if initial_def is for REDUC_MAX smaller than the base
7065 and we can't use zero for induc_val, use initial_def. Similarly
7066 for REDUC_MIN and initial_def larger than the base. */
7067 if (STMT_VINFO_REDUC_TYPE (reduc_info) == INTEGER_INDUC_COND_REDUCTION)
7069 tree induc_val = STMT_VINFO_VEC_INDUC_COND_INITIAL_VAL (reduc_info);
7070 if (TREE_CODE (initial_def) == INTEGER_CST
7071 && !integer_zerop (induc_val)
7072 && ((STMT_VINFO_REDUC_CODE (reduc_info) == MAX_EXPR
7073 && tree_int_cst_lt (initial_def, induc_val))
7074 || (STMT_VINFO_REDUC_CODE (reduc_info) == MIN_EXPR
7075 && tree_int_cst_lt (induc_val, initial_def))))
7077 induc_val = initial_def;
7078 /* Communicate we used the initial_def to epilogue
7079 generation. */
7080 STMT_VINFO_VEC_INDUC_COND_INITIAL_VAL (reduc_info) = NULL_TREE;
7082 vec_initial_def = build_vector_from_val (vectype_out, induc_val);
7084 else if (nested_cycle)
7086 /* Do not use an adjustment def as that case is not supported
7087 correctly if ncopies is not one. */
7088 vec_initial_def = vect_get_vec_def_for_operand (initial_def,
7089 reduc_stmt_info);
7091 else
7093 tree adjustment_def = NULL_TREE;
7094 tree *adjustment_defp = &adjustment_def;
7095 enum tree_code code = STMT_VINFO_REDUC_CODE (reduc_info);
7096 if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_double_reduction_def)
7097 adjustment_defp = NULL;
7098 vec_initial_def
7099 = get_initial_def_for_reduction (reduc_stmt_info, code,
7100 initial_def, adjustment_defp);
7101 STMT_VINFO_REDUC_EPILOGUE_ADJUSTMENT (reduc_info) = adjustment_def;
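/* Illustrative example: for a sum reduction with scalar initial value X,
get_initial_def_for_reduction may start the vector accumulator at
{ 0, 0, ... } and record X as the adjustment, which the epilogue code
then adds back to the reduced result.  */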
7103 vec_initial_defs.create (1);
7104 vec_initial_defs.quick_push (vec_initial_def);
7107 /* Generate the reduction PHIs upfront. */
7108 prev_phi_info = NULL;
7109 for (i = 0; i < vec_num; i++)
7111 tree vec_init_def = vec_initial_defs[i];
7112 for (j = 0; j < ncopies; j++)
7114 /* Create the reduction-phi that defines the reduction
7115 operand. */
7116 gphi *new_phi = create_phi_node (vec_dest, loop->header);
7117 stmt_vec_info new_phi_info = loop_vinfo->add_stmt (new_phi);
7119 /* Set the loop-entry arg of the reduction-phi. */
7120 if (j != 0 && nested_cycle)
7121 vec_init_def = vect_get_vec_def_for_stmt_copy (loop_vinfo,
7122 vec_init_def);
7123 add_phi_arg (new_phi, vec_init_def, loop_preheader_edge (loop),
7124 UNKNOWN_LOCATION);
7126 /* The loop-latch arg is set in epilogue processing. */
7128 if (slp_node)
7129 SLP_TREE_VEC_STMTS (slp_node).quick_push (new_phi_info);
7130 else
7132 if (j == 0)
7133 STMT_VINFO_VEC_STMT (stmt_info) = *vec_stmt = new_phi_info;
7134 else
7135 STMT_VINFO_RELATED_STMT (prev_phi_info) = new_phi_info;
7136 prev_phi_info = new_phi_info;
7141 return true;
7144 /* Vectorizes LC PHIs. */
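/* An LC (loop-closed SSA) PHI is a single-argument PHI sitting on a loop
exit edge, carrying a value defined inside the loop to its uses outside
the loop.  */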
7146 bool
7147 vectorizable_lc_phi (stmt_vec_info stmt_info, stmt_vec_info *vec_stmt,
7148 slp_tree slp_node)
7150 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
7151 if (!loop_vinfo
7152 || !is_a <gphi *> (stmt_info->stmt)
7153 || gimple_phi_num_args (stmt_info->stmt) != 1)
7154 return false;
7156 if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_internal_def
7157 && STMT_VINFO_DEF_TYPE (stmt_info) != vect_double_reduction_def)
7158 return false;
7160 if (!vec_stmt) /* transformation not required. */
7162 STMT_VINFO_TYPE (stmt_info) = lc_phi_info_type;
7163 return true;
7166 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
7167 tree scalar_dest = gimple_phi_result (stmt_info->stmt);
7168 basic_block bb = gimple_bb (stmt_info->stmt);
7169 edge e = single_pred_edge (bb);
7170 tree vec_dest = vect_create_destination_var (scalar_dest, vectype);
7171 vec<tree> vec_oprnds = vNULL;
7172 vect_get_vec_defs (gimple_phi_arg_def (stmt_info->stmt, 0), NULL_TREE,
7173 stmt_info, &vec_oprnds, NULL, slp_node);
7174 if (slp_node)
7176 unsigned vec_num = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node);
7177 gcc_assert (vec_oprnds.length () == vec_num);
7178 for (unsigned i = 0; i < vec_num; i++)
7180 /* Create the vectorized LC PHI node. */
7181 gphi *new_phi = create_phi_node (vec_dest, bb);
7182 add_phi_arg (new_phi, vec_oprnds[i], e, UNKNOWN_LOCATION);
7183 stmt_vec_info new_phi_info = loop_vinfo->add_stmt (new_phi);
7184 SLP_TREE_VEC_STMTS (slp_node).quick_push (new_phi_info);
7187 else
7189 unsigned ncopies = vect_get_num_copies (loop_vinfo, vectype);
7190 stmt_vec_info prev_phi_info = NULL;
7191 for (unsigned i = 0; i < ncopies; i++)
7193 if (i != 0)
7194 vect_get_vec_defs_for_stmt_copy (loop_vinfo, &vec_oprnds, NULL);
7195 /* Create the vectorized LC PHI node. */
7196 gphi *new_phi = create_phi_node (vec_dest, bb);
7197 add_phi_arg (new_phi, vec_oprnds[0], e, UNKNOWN_LOCATION);
7198 stmt_vec_info new_phi_info = loop_vinfo->add_stmt (new_phi);
7199 if (i == 0)
7200 STMT_VINFO_VEC_STMT (stmt_info) = *vec_stmt = new_phi_info;
7201 else
7202 STMT_VINFO_RELATED_STMT (prev_phi_info) = new_phi_info;
7203 prev_phi_info = new_phi_info;
7206 vec_oprnds.release ();
7208 return true;
7212 /* Function vect_min_worthwhile_factor.
7214 For a loop where we could vectorize the operation indicated by CODE,
7215 return the minimum vectorization factor that makes it worthwhile
7216 to use generic vectors. */
7217 static unsigned int
7218 vect_min_worthwhile_factor (enum tree_code code)
7220 switch (code)
7222 case PLUS_EXPR:
7223 case MINUS_EXPR:
7224 case NEGATE_EXPR:
7225 return 4;
7227 case BIT_AND_EXPR:
7228 case BIT_IOR_EXPR:
7229 case BIT_XOR_EXPR:
7230 case BIT_NOT_EXPR:
7231 return 2;
7233 default:
7234 return INT_MAX;
7238 /* Return true if VINFO indicates we are doing loop vectorization and if
7239 it is worth decomposing CODE operations into scalar operations for
7240 that loop's vectorization factor. */
7242 bool
7243 vect_worthwhile_without_simd_p (vec_info *vinfo, tree_code code)
7245 loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo);
7246 unsigned HOST_WIDE_INT value;
7247 return (loop_vinfo
7248 && LOOP_VINFO_VECT_FACTOR (loop_vinfo).is_constant (&value)
7249 && value >= vect_min_worthwhile_factor (code));
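/* For example (illustrative numbers): with a constant vectorization factor
of 4, a PLUS_EXPR operation passes the check above (4 >= 4), whereas with
a factor of 2 only the bitwise codes do (2 >= 2).  This is a coarse
heuristic for emulated (non-SIMD) vectors, not a full cost model.  */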
7252 /* Function vectorizable_induction
7254 Check if STMT_INFO performs an induction computation that can be vectorized.
7255 If VEC_STMT is also passed, vectorize the induction PHI: create a vectorized
7256 phi to replace it, put it in VEC_STMT, and add it to the same basic block.
7257 Return true if STMT_INFO is vectorizable in this way. */
7259 bool
7260 vectorizable_induction (stmt_vec_info stmt_info,
7261 gimple_stmt_iterator *gsi ATTRIBUTE_UNUSED,
7262 stmt_vec_info *vec_stmt, slp_tree slp_node,
7263 stmt_vector_for_cost *cost_vec)
7265 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
7266 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
7267 unsigned ncopies;
7268 bool nested_in_vect_loop = false;
7269 class loop *iv_loop;
7270 tree vec_def;
7271 edge pe = loop_preheader_edge (loop);
7272 basic_block new_bb;
7273 tree new_vec, vec_init, vec_step, t;
7274 tree new_name;
7275 gimple *new_stmt;
7276 gphi *induction_phi;
7277 tree induc_def, vec_dest;
7278 tree init_expr, step_expr;
7279 poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
7280 unsigned i;
7281 tree expr;
7282 gimple_seq stmts;
7283 imm_use_iterator imm_iter;
7284 use_operand_p use_p;
7285 gimple *exit_phi;
7286 edge latch_e;
7287 tree loop_arg;
7288 gimple_stmt_iterator si;
7290 gphi *phi = dyn_cast <gphi *> (stmt_info->stmt);
7291 if (!phi)
7292 return false;
7294 if (!STMT_VINFO_RELEVANT_P (stmt_info))
7295 return false;
7297 /* Make sure it was recognized as induction computation. */
7298 if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_induction_def)
7299 return false;
7301 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
7302 poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (vectype);
7304 if (slp_node)
7305 ncopies = 1;
7306 else
7307 ncopies = vect_get_num_copies (loop_vinfo, vectype);
7308 gcc_assert (ncopies >= 1);
7310 /* FORNOW. These restrictions should be relaxed. */
7311 if (nested_in_vect_loop_p (loop, stmt_info))
7313 imm_use_iterator imm_iter;
7314 use_operand_p use_p;
7315 gimple *exit_phi;
7316 edge latch_e;
7317 tree loop_arg;
7319 if (ncopies > 1)
7321 if (dump_enabled_p ())
7322 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7323 "multiple types in nested loop.\n");
7324 return false;
7327 /* FORNOW: outer loop induction with SLP not supported. */
7328 if (STMT_SLP_TYPE (stmt_info))
7329 return false;
7331 exit_phi = NULL;
7332 latch_e = loop_latch_edge (loop->inner);
7333 loop_arg = PHI_ARG_DEF_FROM_EDGE (phi, latch_e);
7334 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, loop_arg)
7336 gimple *use_stmt = USE_STMT (use_p);
7337 if (is_gimple_debug (use_stmt))
7338 continue;
7340 if (!flow_bb_inside_loop_p (loop->inner, gimple_bb (use_stmt)))
7342 exit_phi = use_stmt;
7343 break;
7346 if (exit_phi)
7348 stmt_vec_info exit_phi_vinfo = loop_vinfo->lookup_stmt (exit_phi);
7349 if (!(STMT_VINFO_RELEVANT_P (exit_phi_vinfo)
7350 && !STMT_VINFO_LIVE_P (exit_phi_vinfo)))
7352 if (dump_enabled_p ())
7353 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7354 "inner-loop induction only used outside "
7355 "of the outer vectorized loop.\n");
7356 return false;
7360 nested_in_vect_loop = true;
7361 iv_loop = loop->inner;
7363 else
7364 iv_loop = loop;
7365 gcc_assert (iv_loop == (gimple_bb (phi))->loop_father);
7367 if (slp_node && !nunits.is_constant ())
7369 /* The current SLP code creates the initial value element-by-element. */
7370 if (dump_enabled_p ())
7371 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7372 "SLP induction not supported for variable-length"
7373 " vectors.\n");
7374 return false;
7377 if (!vec_stmt) /* transformation not required. */
7379 STMT_VINFO_TYPE (stmt_info) = induc_vec_info_type;
7380 DUMP_VECT_SCOPE ("vectorizable_induction");
7381 vect_model_induction_cost (stmt_info, ncopies, cost_vec);
7382 return true;
7385 /* Transform. */
7387 /* Compute a vector variable, initialized with the first VF values of
7388 the induction variable. E.g., for an iv with IV_PHI='X' and
7389 evolution S, for a vector of 4 units, we want to compute:
7390 [X, X + S, X + 2*S, X + 3*S]. */
7392 if (dump_enabled_p ())
7393 dump_printf_loc (MSG_NOTE, vect_location, "transform induction phi.\n");
7395 latch_e = loop_latch_edge (iv_loop);
7396 loop_arg = PHI_ARG_DEF_FROM_EDGE (phi, latch_e);
7398 step_expr = STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_info);
7399 gcc_assert (step_expr != NULL_TREE);
7400 tree step_vectype = get_same_sized_vectype (TREE_TYPE (step_expr), vectype);
7402 pe = loop_preheader_edge (iv_loop);
7403 init_expr = PHI_ARG_DEF_FROM_EDGE (phi,
7404 loop_preheader_edge (iv_loop));
7406 stmts = NULL;
7407 if (!nested_in_vect_loop)
7409 /* Convert the initial value to the IV update type. */
7410 tree new_type = TREE_TYPE (step_expr);
7411 init_expr = gimple_convert (&stmts, new_type, init_expr);
7413 /* If we are using the loop mask to "peel" for alignment then we need
7414 to adjust the start value here. */
7415 tree skip_niters = LOOP_VINFO_MASK_SKIP_NITERS (loop_vinfo);
7416 if (skip_niters != NULL_TREE)
7418 if (FLOAT_TYPE_P (vectype))
7419 skip_niters = gimple_build (&stmts, FLOAT_EXPR, new_type,
7420 skip_niters);
7421 else
7422 skip_niters = gimple_convert (&stmts, new_type, skip_niters);
7423 tree skip_step = gimple_build (&stmts, MULT_EXPR, new_type,
7424 skip_niters, step_expr);
7425 init_expr = gimple_build (&stmts, MINUS_EXPR, new_type,
7426 init_expr, skip_step);
7430 if (stmts)
7432 new_bb = gsi_insert_seq_on_edge_immediate (pe, stmts);
7433 gcc_assert (!new_bb);
7436 /* Find the first insertion point in the BB. */
7437 basic_block bb = gimple_bb (phi);
7438 si = gsi_after_labels (bb);
7440 /* For SLP induction we have to generate several IVs as for example
7441 with group size 3 we need [i, i, i, i + S] [i + S, i + S, i + 2*S, i + 2*S]
7442 [i + 2*S, i + 3*S, i + 3*S, i + 3*S]. The step is the same uniform
7443 [VF*S, VF*S, VF*S, VF*S] for all. */
7444 if (slp_node)
7446 /* Enforced above. */
7447 unsigned int const_nunits = nunits.to_constant ();
7449 /* Generate [VF*S, VF*S, ... ]. */
7450 if (SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr)))
7452 expr = build_int_cst (integer_type_node, vf);
7453 expr = fold_convert (TREE_TYPE (step_expr), expr);
7455 else
7456 expr = build_int_cst (TREE_TYPE (step_expr), vf);
7457 new_name = fold_build2 (MULT_EXPR, TREE_TYPE (step_expr),
7458 expr, step_expr);
7459 if (! CONSTANT_CLASS_P (new_name))
7460 new_name = vect_init_vector (stmt_info, new_name,
7461 TREE_TYPE (step_expr), NULL);
7462 new_vec = build_vector_from_val (step_vectype, new_name);
7463 vec_step = vect_init_vector (stmt_info, new_vec, step_vectype, NULL);
7465 /* Now generate the IVs. */
7466 unsigned group_size = SLP_TREE_SCALAR_STMTS (slp_node).length ();
7467 unsigned nvects = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node);
7468 unsigned elts = const_nunits * nvects;
7469 unsigned nivs = least_common_multiple (group_size,
7470 const_nunits) / const_nunits;
7471 gcc_assert (elts % group_size == 0);
7472 tree elt = init_expr;
7473 unsigned ivn;
7474 for (ivn = 0; ivn < nivs; ++ivn)
7476 tree_vector_builder elts (step_vectype, const_nunits, 1);
7477 stmts = NULL;
7478 for (unsigned eltn = 0; eltn < const_nunits; ++eltn)
7480 if (ivn*const_nunits + eltn >= group_size
7481 && (ivn * const_nunits + eltn) % group_size == 0)
7482 elt = gimple_build (&stmts, PLUS_EXPR, TREE_TYPE (elt),
7483 elt, step_expr);
7484 elts.quick_push (elt);
7486 vec_init = gimple_build_vector (&stmts, &elts);
7487 vec_init = gimple_convert (&stmts, vectype, vec_init);
7488 if (stmts)
7490 new_bb = gsi_insert_seq_on_edge_immediate (pe, stmts);
7491 gcc_assert (!new_bb);
7494 /* Create the induction-phi that defines the induction-operand. */
7495 vec_dest = vect_get_new_vect_var (vectype, vect_simple_var, "vec_iv_");
7496 induction_phi = create_phi_node (vec_dest, iv_loop->header);
7497 stmt_vec_info induction_phi_info
7498 = loop_vinfo->add_stmt (induction_phi);
7499 induc_def = PHI_RESULT (induction_phi);
7501 /* Create the iv update inside the loop */
7502 gimple_seq stmts = NULL;
7503 vec_def = gimple_convert (&stmts, step_vectype, induc_def);
7504 vec_def = gimple_build (&stmts,
7505 PLUS_EXPR, step_vectype, vec_def, vec_step);
7506 vec_def = gimple_convert (&stmts, vectype, vec_def);
7507 loop_vinfo->add_stmt (SSA_NAME_DEF_STMT (vec_def));
7508 gsi_insert_seq_before (&si, stmts, GSI_SAME_STMT);
7510 /* Set the arguments of the phi node: */
7511 add_phi_arg (induction_phi, vec_init, pe, UNKNOWN_LOCATION);
7512 add_phi_arg (induction_phi, vec_def, loop_latch_edge (iv_loop),
7513 UNKNOWN_LOCATION);
7515 SLP_TREE_VEC_STMTS (slp_node).quick_push (induction_phi_info);
7518 /* Re-use IVs when we can. */
7519 if (ivn < nvects)
7521 unsigned vfp
7522 = least_common_multiple (group_size, const_nunits) / group_size;
7523 /* Generate [VF'*S, VF'*S, ... ]. */
7524 if (SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr)))
7526 expr = build_int_cst (integer_type_node, vfp);
7527 expr = fold_convert (TREE_TYPE (step_expr), expr);
7529 else
7530 expr = build_int_cst (TREE_TYPE (step_expr), vfp);
7531 new_name = fold_build2 (MULT_EXPR, TREE_TYPE (step_expr),
7532 expr, step_expr);
7533 if (! CONSTANT_CLASS_P (new_name))
7534 new_name = vect_init_vector (stmt_info, new_name,
7535 TREE_TYPE (step_expr), NULL);
7536 new_vec = build_vector_from_val (step_vectype, new_name);
7537 vec_step = vect_init_vector (stmt_info, new_vec, step_vectype, NULL);
7538 for (; ivn < nvects; ++ivn)
7540 gimple *iv = SLP_TREE_VEC_STMTS (slp_node)[ivn - nivs]->stmt;
7541 tree def;
7542 if (gimple_code (iv) == GIMPLE_PHI)
7543 def = gimple_phi_result (iv);
7544 else
7545 def = gimple_assign_lhs (iv);
7546 gimple_seq stmts = NULL;
7547 def = gimple_convert (&stmts, step_vectype, def);
7548 def = gimple_build (&stmts,
7549 PLUS_EXPR, step_vectype, def, vec_step);
7550 def = gimple_convert (&stmts, vectype, def);
7551 if (gimple_code (iv) == GIMPLE_PHI)
7552 gsi_insert_seq_before (&si, stmts, GSI_SAME_STMT);
7553 else
7555 gimple_stmt_iterator tgsi = gsi_for_stmt (iv);
7556 gsi_insert_seq_after (&tgsi, stmts, GSI_CONTINUE_LINKING);
7558 SLP_TREE_VEC_STMTS (slp_node).quick_push
7559 (loop_vinfo->add_stmt (SSA_NAME_DEF_STMT (def)));
7563 return true;
7566 /* Create the vector that holds the initial_value of the induction. */
7567 if (nested_in_vect_loop)
7569 /* iv_loop is nested in the loop to be vectorized. init_expr had already
7570 been created during vectorization of previous stmts. We obtain it
7571 from the STMT_VINFO_VEC_STMT of the defining stmt. */
7572 vec_init = vect_get_vec_def_for_operand (init_expr, stmt_info);
7573 /* If the initial value is not of proper type, convert it. */
7574 if (!useless_type_conversion_p (vectype, TREE_TYPE (vec_init)))
7576 new_stmt
7577 = gimple_build_assign (vect_get_new_ssa_name (vectype,
7578 vect_simple_var,
7579 "vec_iv_"),
7580 VIEW_CONVERT_EXPR,
7581 build1 (VIEW_CONVERT_EXPR, vectype,
7582 vec_init));
7583 vec_init = gimple_assign_lhs (new_stmt);
7584 new_bb = gsi_insert_on_edge_immediate (loop_preheader_edge (iv_loop),
7585 new_stmt);
7586 gcc_assert (!new_bb);
7587 loop_vinfo->add_stmt (new_stmt);
7590 else
7592 /* iv_loop is the loop to be vectorized. Create:
7593 vec_init = [X, X+S, X+2*S, X+3*S] (S = step_expr, X = init_expr) */
7594 stmts = NULL;
7595 new_name = gimple_convert (&stmts, TREE_TYPE (step_expr), init_expr);
7597 unsigned HOST_WIDE_INT const_nunits;
7598 if (nunits.is_constant (&const_nunits))
7600 tree_vector_builder elts (step_vectype, const_nunits, 1);
7601 elts.quick_push (new_name);
7602 for (i = 1; i < const_nunits; i++)
7604 /* Create: new_name_i = new_name + step_expr */
7605 new_name = gimple_build (&stmts, PLUS_EXPR, TREE_TYPE (new_name),
7606 new_name, step_expr);
7607 elts.quick_push (new_name);
7609 /* Create a vector from [new_name_0, new_name_1, ...,
7610 new_name_nunits-1] */
7611 vec_init = gimple_build_vector (&stmts, &elts);
7613 else if (INTEGRAL_TYPE_P (TREE_TYPE (step_expr)))
7614 /* Build the initial value directly from a VEC_SERIES_EXPR. */
7615 vec_init = gimple_build (&stmts, VEC_SERIES_EXPR, step_vectype,
7616 new_name, step_expr);
7617 else
7619 /* Build:
7620 [base, base, base, ...]
7621 + (vectype) [0, 1, 2, ...] * [step, step, step, ...]. */
7622 gcc_assert (SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr)));
7623 gcc_assert (flag_associative_math);
7624 tree index = build_index_vector (step_vectype, 0, 1);
7625 tree base_vec = gimple_build_vector_from_val (&stmts, step_vectype,
7626 new_name);
7627 tree step_vec = gimple_build_vector_from_val (&stmts, step_vectype,
7628 step_expr);
7629 vec_init = gimple_build (&stmts, FLOAT_EXPR, step_vectype, index);
7630 vec_init = gimple_build (&stmts, MULT_EXPR, step_vectype,
7631 vec_init, step_vec);
7632 vec_init = gimple_build (&stmts, PLUS_EXPR, step_vectype,
7633 vec_init, base_vec);
7635 vec_init = gimple_convert (&stmts, vectype, vec_init);
7637 if (stmts)
7639 new_bb = gsi_insert_seq_on_edge_immediate (pe, stmts);
7640 gcc_assert (!new_bb);
7645 /* Create the vector that holds the step of the induction. */
7646 if (nested_in_vect_loop)
7647 /* iv_loop is nested in the loop to be vectorized. Generate:
7648 vec_step = [S, S, S, S] */
7649 new_name = step_expr;
7650 else
7652 /* iv_loop is the loop to be vectorized. Generate:
7653 vec_step = [VF*S, VF*S, VF*S, VF*S] */
7654 gimple_seq seq = NULL;
7655 if (SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr)))
7657 expr = build_int_cst (integer_type_node, vf);
7658 expr = gimple_build (&seq, FLOAT_EXPR, TREE_TYPE (step_expr), expr);
7660 else
7661 expr = build_int_cst (TREE_TYPE (step_expr), vf);
7662 new_name = gimple_build (&seq, MULT_EXPR, TREE_TYPE (step_expr),
7663 expr, step_expr);
7664 if (seq)
7666 new_bb = gsi_insert_seq_on_edge_immediate (pe, seq);
7667 gcc_assert (!new_bb);
7671 t = unshare_expr (new_name);
7672 gcc_assert (CONSTANT_CLASS_P (new_name)
7673 || TREE_CODE (new_name) == SSA_NAME);
7674 new_vec = build_vector_from_val (step_vectype, t);
7675 vec_step = vect_init_vector (stmt_info, new_vec, step_vectype, NULL);
7678 /* Create the following def-use cycle:
7679 loop prolog:
7680 vec_init = ...
7681 vec_step = ...
7682 loop:
7683 vec_iv = PHI <vec_init, vec_loop>
7685 STMT
7687 vec_loop = vec_iv + vec_step; */
7689 /* Create the induction-phi that defines the induction-operand. */
7690 vec_dest = vect_get_new_vect_var (vectype, vect_simple_var, "vec_iv_");
7691 induction_phi = create_phi_node (vec_dest, iv_loop->header);
7692 stmt_vec_info induction_phi_info = loop_vinfo->add_stmt (induction_phi);
7693 induc_def = PHI_RESULT (induction_phi);
7695 /* Create the iv update inside the loop */
7696 stmts = NULL;
7697 vec_def = gimple_convert (&stmts, step_vectype, induc_def);
7698 vec_def = gimple_build (&stmts, PLUS_EXPR, step_vectype, vec_def, vec_step);
7699 vec_def = gimple_convert (&stmts, vectype, vec_def);
7700 gsi_insert_seq_before (&si, stmts, GSI_SAME_STMT);
7701 new_stmt = SSA_NAME_DEF_STMT (vec_def);
7702 stmt_vec_info new_stmt_info = loop_vinfo->add_stmt (new_stmt);
7704 /* Set the arguments of the phi node: */
7705 add_phi_arg (induction_phi, vec_init, pe, UNKNOWN_LOCATION);
7706 add_phi_arg (induction_phi, vec_def, loop_latch_edge (iv_loop),
7707 UNKNOWN_LOCATION);
7709 STMT_VINFO_VEC_STMT (stmt_info) = *vec_stmt = induction_phi_info;
7711 /* In case that vectorization factor (VF) is bigger than the number
7712 of elements that we can fit in a vectype (nunits), we have to generate
7713 more than one vector stmt - i.e - we need to "unroll" the
7714 vector stmt by a factor VF/nunits. For more details see documentation
7715 in vectorizable_operation. */
7717 if (ncopies > 1)
7719 gimple_seq seq = NULL;
7720 stmt_vec_info prev_stmt_vinfo;
7721 /* FORNOW. This restriction should be relaxed. */
7722 gcc_assert (!nested_in_vect_loop);
7724 /* Create the vector that holds the step of the induction. */
7725 if (SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr)))
7727 expr = build_int_cst (integer_type_node, nunits);
7728 expr = gimple_build (&seq, FLOAT_EXPR, TREE_TYPE (step_expr), expr);
7730 else
7731 expr = build_int_cst (TREE_TYPE (step_expr), nunits);
7732 new_name = gimple_build (&seq, MULT_EXPR, TREE_TYPE (step_expr),
7733 expr, step_expr);
7734 if (seq)
7736 new_bb = gsi_insert_seq_on_edge_immediate (pe, seq);
7737 gcc_assert (!new_bb);
7740 t = unshare_expr (new_name);
7741 gcc_assert (CONSTANT_CLASS_P (new_name)
7742 || TREE_CODE (new_name) == SSA_NAME);
7743 new_vec = build_vector_from_val (step_vectype, t);
7744 vec_step = vect_init_vector (stmt_info, new_vec, step_vectype, NULL);
7746 vec_def = induc_def;
7747 prev_stmt_vinfo = induction_phi_info;
7748 for (i = 1; i < ncopies; i++)
7750 /* vec_i = vec_prev + vec_step */
7751 gimple_seq stmts = NULL;
7752 vec_def = gimple_convert (&stmts, step_vectype, vec_def);
7753 vec_def = gimple_build (&stmts,
7754 PLUS_EXPR, step_vectype, vec_def, vec_step);
7755 vec_def = gimple_convert (&stmts, vectype, vec_def);
7757 gsi_insert_seq_before (&si, stmts, GSI_SAME_STMT);
7758 new_stmt = SSA_NAME_DEF_STMT (vec_def);
7759 new_stmt_info = loop_vinfo->add_stmt (new_stmt);
7760 STMT_VINFO_RELATED_STMT (prev_stmt_vinfo) = new_stmt_info;
7761 prev_stmt_vinfo = new_stmt_info;
7765 if (nested_in_vect_loop)
7767 /* Find the loop-closed exit-phi of the induction, and record
7768 the final vector of induction results: */
7769 exit_phi = NULL;
7770 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, loop_arg)
7772 gimple *use_stmt = USE_STMT (use_p);
7773 if (is_gimple_debug (use_stmt))
7774 continue;
7776 if (!flow_bb_inside_loop_p (iv_loop, gimple_bb (use_stmt)))
7778 exit_phi = use_stmt;
7779 break;
7782 if (exit_phi)
7784 stmt_vec_info stmt_vinfo = loop_vinfo->lookup_stmt (exit_phi);
7785 /* FORNOW. Currently not supporting the case that an inner-loop induction
7786 is not used in the outer-loop (i.e. only outside the outer-loop). */
7787 gcc_assert (STMT_VINFO_RELEVANT_P (stmt_vinfo)
7788 && !STMT_VINFO_LIVE_P (stmt_vinfo));
7790 STMT_VINFO_VEC_STMT (stmt_vinfo) = new_stmt_info;
7791 if (dump_enabled_p ())
7792 dump_printf_loc (MSG_NOTE, vect_location,
7793 "vector of inductions after inner-loop:%G",
7794 new_stmt);
7799 if (dump_enabled_p ())
7800 dump_printf_loc (MSG_NOTE, vect_location,
7801 "transform induction: created def-use cycle: %G%G",
7802 induction_phi, SSA_NAME_DEF_STMT (vec_def));
7804 return true;
7807 /* Function vectorizable_live_operation.
7809 STMT_INFO computes a value that is used outside the loop. Check if
7810 it can be supported. */
7812 bool
7813 vectorizable_live_operation (stmt_vec_info stmt_info,
7814 gimple_stmt_iterator *gsi,
7815 slp_tree slp_node, slp_instance slp_node_instance,
7816 int slp_index, bool vec_stmt_p,
7817 stmt_vector_for_cost *)
7819 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
7820 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
7821 imm_use_iterator imm_iter;
7822 tree lhs, lhs_type, bitsize, vec_bitsize;
7823 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
7824 poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (vectype);
7825 int ncopies;
7826 gimple *use_stmt;
7827 auto_vec<tree> vec_oprnds;
7828 int vec_entry = 0;
7829 poly_uint64 vec_index = 0;
7831 gcc_assert (STMT_VINFO_LIVE_P (stmt_info));
7833 /* If a stmt of a reduction is live, vectorize it via
7834 vect_create_epilog_for_reduction. vectorizable_reduction assessed
7835 validity so just trigger the transform here. */
7836 if (STMT_VINFO_REDUC_DEF (vect_orig_stmt (stmt_info)))
7838 if (!vec_stmt_p)
7839 return true;
7840 if (slp_node)
7842 /* For reduction chains the meta-info is attached to
7843 the group leader. */
7844 if (REDUC_GROUP_FIRST_ELEMENT (stmt_info))
7845 stmt_info = REDUC_GROUP_FIRST_ELEMENT (stmt_info);
7846 /* For SLP reductions we vectorize the epilogue for
7847 all involved stmts together. */
7848 else if (slp_index != 0)
7849 return true;
7851 stmt_vec_info reduc_info = info_for_reduction (stmt_info);
7852 gcc_assert (reduc_info->is_reduc_info);
7853 if (STMT_VINFO_REDUC_TYPE (reduc_info) == FOLD_LEFT_REDUCTION
7854 || STMT_VINFO_REDUC_TYPE (reduc_info) == EXTRACT_LAST_REDUCTION)
7855 return true;
7856 vect_create_epilog_for_reduction (stmt_info, slp_node,
7857 slp_node_instance);
7858 return true;
7861 /* FORNOW. CHECKME. */
7862 if (nested_in_vect_loop_p (loop, stmt_info))
7863 return false;
7865 /* If STMT is not relevant and it is a simple assignment and its inputs are
7866 invariant then it can remain in place, unvectorized. The original last
7867 scalar value that it computes will be used. */
7868 if (!STMT_VINFO_RELEVANT_P (stmt_info))
7870 gcc_assert (is_simple_and_all_uses_invariant (stmt_info, loop_vinfo));
7871 if (dump_enabled_p ())
7872 dump_printf_loc (MSG_NOTE, vect_location,
7873 "statement is simple and uses invariant. Leaving in "
7874 "place.\n");
7875 return true;
7878 if (slp_node)
7879 ncopies = 1;
7880 else
7881 ncopies = vect_get_num_copies (loop_vinfo, vectype);
7883 if (slp_node)
7885 gcc_assert (slp_index >= 0);
7887 int num_scalar = SLP_TREE_SCALAR_STMTS (slp_node).length ();
7888 int num_vec = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node);
7890 /* Get the last occurrence of the scalar index from the concatenation of
7891 all the slp vectors. Calculate which slp vector it is and the index
7892 within. */
7893 poly_uint64 pos = (num_vec * nunits) - num_scalar + slp_index;
7895 /* Calculate which vector contains the result, and which lane of
7896 that vector we need. */
7897 if (!can_div_trunc_p (pos, nunits, &vec_entry, &vec_index))
7899 if (dump_enabled_p ())
7900 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7901 "Cannot determine which vector holds the"
7902 " final result.\n");
7903 return false;
7907 if (!vec_stmt_p)
7909 /* No transformation required. */
7910 if (LOOP_VINFO_CAN_FULLY_MASK_P (loop_vinfo))
7912 if (!direct_internal_fn_supported_p (IFN_EXTRACT_LAST, vectype,
7913 OPTIMIZE_FOR_SPEED))
7915 if (dump_enabled_p ())
7916 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7917 "can't use a fully-masked loop because "
7918 "the target doesn't support extract last "
7919 "reduction.\n");
7920 LOOP_VINFO_CAN_FULLY_MASK_P (loop_vinfo) = false;
7922 else if (slp_node)
7924 if (dump_enabled_p ())
7925 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7926 "can't use a fully-masked loop because an "
7927 "SLP statement is live after the loop.\n");
7928 LOOP_VINFO_CAN_FULLY_MASK_P (loop_vinfo) = false;
7930 else if (ncopies > 1)
7932 if (dump_enabled_p ())
7933 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7934 "can't use a fully-masked loop because"
7935 " ncopies is greater than 1.\n");
7936 LOOP_VINFO_CAN_FULLY_MASK_P (loop_vinfo) = false;
7938 else
7940 gcc_assert (ncopies == 1 && !slp_node);
7941 vect_record_loop_mask (loop_vinfo,
7942 &LOOP_VINFO_MASKS (loop_vinfo),
7943 1, vectype, NULL);
7946 return true;
7949 /* Use the lhs of the original scalar statement. */
7950 gimple *stmt = vect_orig_stmt (stmt_info)->stmt;
7952 lhs = (is_a <gphi *> (stmt)) ? gimple_phi_result (stmt)
7953 : gimple_get_lhs (stmt);
7954 lhs_type = TREE_TYPE (lhs);
7956 bitsize = (VECTOR_BOOLEAN_TYPE_P (vectype)
7957 ? bitsize_int (TYPE_PRECISION (TREE_TYPE (vectype)))
7958 : TYPE_SIZE (TREE_TYPE (vectype)));
7959 vec_bitsize = TYPE_SIZE (vectype);
7961 /* Get the vectorized lhs of STMT and the lane to use (counted in bits). */
7962 tree vec_lhs, bitstart;
7963 if (slp_node)
7965 gcc_assert (!LOOP_VINFO_FULLY_MASKED_P (loop_vinfo));
7967 /* Get the correct slp vectorized stmt. */
7968 gimple *vec_stmt = SLP_TREE_VEC_STMTS (slp_node)[vec_entry]->stmt;
7969 if (gphi *phi = dyn_cast <gphi *> (vec_stmt))
7970 vec_lhs = gimple_phi_result (phi);
7971 else
7972 vec_lhs = gimple_get_lhs (vec_stmt);
7974 /* Get entry to use. */
7975 bitstart = bitsize_int (vec_index);
7976 bitstart = int_const_binop (MULT_EXPR, bitsize, bitstart);
7978 else
7980 enum vect_def_type dt = STMT_VINFO_DEF_TYPE (stmt_info);
7981 vec_lhs = vect_get_vec_def_for_operand_1 (stmt_info, dt);
7982 gcc_checking_assert (ncopies == 1
7983 || !LOOP_VINFO_FULLY_MASKED_P (loop_vinfo));
7985 /* For multiple copies, get the last copy. */
7986 for (int i = 1; i < ncopies; ++i)
7987 vec_lhs = vect_get_vec_def_for_stmt_copy (loop_vinfo, vec_lhs);
7989 /* Get the last lane in the vector. */
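/* E.g. for a V4SI vector: vec_bitsize is 128 and bitsize is 32, so
bitstart becomes 96, selecting the last lane below (illustrative sizes).  */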
7990 bitstart = int_const_binop (MINUS_EXPR, vec_bitsize, bitsize);
7993 gimple_seq stmts = NULL;
7994 tree new_tree;
7995 if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
7997 /* Emit:
7999 SCALAR_RES = EXTRACT_LAST <VEC_LHS, MASK>
8001 where VEC_LHS is the vectorized live-out result and MASK is
8002 the loop mask for the final iteration. */
8003 gcc_assert (ncopies == 1 && !slp_node);
8004 tree scalar_type = TREE_TYPE (STMT_VINFO_VECTYPE (stmt_info));
8005 tree mask = vect_get_loop_mask (gsi, &LOOP_VINFO_MASKS (loop_vinfo),
8006 1, vectype, 0);
8007 tree scalar_res = gimple_build (&stmts, CFN_EXTRACT_LAST,
8008 scalar_type, mask, vec_lhs);
8010 /* Convert the extracted vector element to the required scalar type. */
8011 new_tree = gimple_convert (&stmts, lhs_type, scalar_res);
8013 else
8015 tree bftype = TREE_TYPE (vectype);
8016 if (VECTOR_BOOLEAN_TYPE_P (vectype))
8017 bftype = build_nonstandard_integer_type (tree_to_uhwi (bitsize), 1);
8018 new_tree = build3 (BIT_FIELD_REF, bftype, vec_lhs, bitsize, bitstart);
8019 new_tree = force_gimple_operand (fold_convert (lhs_type, new_tree),
8020 &stmts, true, NULL_TREE);
8023 if (stmts)
8024 gsi_insert_seq_on_edge_immediate (single_exit (loop), stmts);
8026 /* Replace use of lhs with newly computed result. If the use stmt is a
8027 single arg PHI, just replace all uses of the PHI result. This is necessary
8028 because the lcssa PHI defining lhs may come before the newly inserted stmt. */
8029 use_operand_p use_p;
8030 FOR_EACH_IMM_USE_STMT (use_stmt, imm_iter, lhs)
8031 if (!flow_bb_inside_loop_p (loop, gimple_bb (use_stmt))
8032 && !is_gimple_debug (use_stmt))
8034 if (gimple_code (use_stmt) == GIMPLE_PHI
8035 && gimple_phi_num_args (use_stmt) == 1)
8037 replace_uses_by (gimple_phi_result (use_stmt), new_tree);
8039 else
8041 FOR_EACH_IMM_USE_ON_STMT (use_p, imm_iter)
8042 SET_USE (use_p, new_tree);
8044 update_stmt (use_stmt);
8047 return true;
8050 /* Kill any debug uses outside LOOP of SSA names defined in STMT_INFO. */
8052 static void
8053 vect_loop_kill_debug_uses (class loop *loop, stmt_vec_info stmt_info)
8055 ssa_op_iter op_iter;
8056 imm_use_iterator imm_iter;
8057 def_operand_p def_p;
8058 gimple *ustmt;
8060 FOR_EACH_PHI_OR_STMT_DEF (def_p, stmt_info->stmt, op_iter, SSA_OP_DEF)
8062 FOR_EACH_IMM_USE_STMT (ustmt, imm_iter, DEF_FROM_PTR (def_p))
8064 basic_block bb;
8066 if (!is_gimple_debug (ustmt))
8067 continue;
8069 bb = gimple_bb (ustmt);
8071 if (!flow_bb_inside_loop_p (loop, bb))
8073 if (gimple_debug_bind_p (ustmt))
8075 if (dump_enabled_p ())
8076 dump_printf_loc (MSG_NOTE, vect_location,
8077 "killing debug use\n");
8079 gimple_debug_bind_reset_value (ustmt);
8080 update_stmt (ustmt);
8082 else
8083 gcc_unreachable ();
8089 /* Given loop represented by LOOP_VINFO, return true if computation of
8090 LOOP_VINFO_NITERS (= LOOP_VINFO_NITERSM1 + 1) doesn't overflow, false
8091 otherwise. */
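/* For example, if the IV type is a 32-bit unsigned type and NITERSM1 is
0xffffffff, NITERS wraps to zero; both the constant check and the
max-iterations check below then fail, so the function returns false
(illustrative values).  */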
8093 static bool
8094 loop_niters_no_overflow (loop_vec_info loop_vinfo)
8096 /* Constant case. */
8097 if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
8099 tree cst_niters = LOOP_VINFO_NITERS (loop_vinfo);
8100 tree cst_nitersm1 = LOOP_VINFO_NITERSM1 (loop_vinfo);
8102 gcc_assert (TREE_CODE (cst_niters) == INTEGER_CST);
8103 gcc_assert (TREE_CODE (cst_nitersm1) == INTEGER_CST);
8104 if (wi::to_widest (cst_nitersm1) < wi::to_widest (cst_niters))
8105 return true;
8108 widest_int max;
8109 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
8110 /* Check the upper bound of loop niters. */
8111 if (get_max_loop_iterations (loop, &max))
8113 tree type = TREE_TYPE (LOOP_VINFO_NITERS (loop_vinfo));
8114 signop sgn = TYPE_SIGN (type);
8115 widest_int type_max = widest_int::from (wi::max_value (type), sgn);
8116 if (max < type_max)
8117 return true;
8119 return false;
8122 /* Return a mask type with half the number of elements as OLD_TYPE,
8123 given that it should have mode NEW_MODE. */
8125 tree
8126 vect_halve_mask_nunits (tree old_type, machine_mode new_mode)
8128 poly_uint64 nunits = exact_div (TYPE_VECTOR_SUBPARTS (old_type), 2);
8129 return build_truth_vector_type_for_mode (nunits, new_mode);
8132 /* Return a mask type with twice as many elements as OLD_TYPE,
8133 given that it should have mode NEW_MODE. */
8135 tree
8136 vect_double_mask_nunits (tree old_type, machine_mode new_mode)
8138 poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (old_type) * 2;
8139 return build_truth_vector_type_for_mode (nunits, new_mode);
8142 /* Record that a fully-masked version of LOOP_VINFO would need MASKS to
8143 contain a sequence of NVECTORS masks that each control a vector of type
8144 VECTYPE. If SCALAR_MASK is nonnull, the fully-masked loop would AND
8145 these vector masks with the vector version of SCALAR_MASK. */
8147 void
8148 vect_record_loop_mask (loop_vec_info loop_vinfo, vec_loop_masks *masks,
8149 unsigned int nvectors, tree vectype, tree scalar_mask)
8151 gcc_assert (nvectors != 0);
8152 if (masks->length () < nvectors)
8153 masks->safe_grow_cleared (nvectors);
8154 rgroup_masks *rgm = &(*masks)[nvectors - 1];
8155 /* The number of scalars per iteration and the number of vectors are
8156 both compile-time constants. */
8157 unsigned int nscalars_per_iter
8158 = exact_div (nvectors * TYPE_VECTOR_SUBPARTS (vectype),
8159 LOOP_VINFO_VECT_FACTOR (loop_vinfo)).to_constant ();
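/* Illustrative numbers: with a vectorization factor of 16, an rgroup of
2 masks of 8 elements each gives nscalars_per_iter = 1, while 2 masks of
16 elements each gives 2.  */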
8161 if (scalar_mask)
8163 scalar_cond_masked_key cond (scalar_mask, nvectors);
8164 loop_vinfo->scalar_cond_masked_set.add (cond);
8167 if (rgm->max_nscalars_per_iter < nscalars_per_iter)
8169 rgm->max_nscalars_per_iter = nscalars_per_iter;
8170 rgm->mask_type = truth_type_for (vectype);
8174 /* Given a complete set of masks MASKS, extract mask number INDEX
8175 for an rgroup that operates on NVECTORS vectors of type VECTYPE,
8176 where 0 <= INDEX < NVECTORS. Insert any set-up statements before GSI.
8178 See the comment above vec_loop_masks for more details about the mask
8179 arrangement. */
8181 tree
8182 vect_get_loop_mask (gimple_stmt_iterator *gsi, vec_loop_masks *masks,
8183 unsigned int nvectors, tree vectype, unsigned int index)
8185 rgroup_masks *rgm = &(*masks)[nvectors - 1];
8186 tree mask_type = rgm->mask_type;
8188 /* Populate the rgroup's mask array, if this is the first time we've
8189 used it. */
8190 if (rgm->masks.is_empty ())
8192 rgm->masks.safe_grow_cleared (nvectors);
8193 for (unsigned int i = 0; i < nvectors; ++i)
8195 tree mask = make_temp_ssa_name (mask_type, NULL, "loop_mask");
8196 /* Provide a dummy definition until the real one is available. */
8197 SSA_NAME_DEF_STMT (mask) = gimple_build_nop ();
8198 rgm->masks[i] = mask;
8202 tree mask = rgm->masks[index];
8203 if (maybe_ne (TYPE_VECTOR_SUBPARTS (mask_type),
8204 TYPE_VECTOR_SUBPARTS (vectype)))
8206 /* A loop mask for data type X can be reused for data type Y
8207 if X has N times more elements than Y and if Y's elements
8208 are N times bigger than X's. In this case each sequence
8209 of N elements in the loop mask will be all-zero or all-one.
8210 We can then view-convert the mask so that each sequence of
8211 N elements is replaced by a single element. */
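/* Concretely (illustrative shapes): a mask created for 16 QImode lanes can
be reused for 8 HImode lanes; each pair of mask elements is identical, so
the VIEW_CONVERT below collapses every pair into a single element.  */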
8212 gcc_assert (multiple_p (TYPE_VECTOR_SUBPARTS (mask_type),
8213 TYPE_VECTOR_SUBPARTS (vectype)));
8214 gimple_seq seq = NULL;
8215 mask_type = truth_type_for (vectype);
8216 mask = gimple_build (&seq, VIEW_CONVERT_EXPR, mask_type, mask);
8217 if (seq)
8218 gsi_insert_seq_before (gsi, seq, GSI_SAME_STMT);
8220 return mask;
8223 /* Scale profiling counters by estimation for LOOP which is vectorized
8224 by factor VF. */
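/* A rough example: if the preheader count is 10 and the estimated number of
iterations of the vectorized loop is 10, the header count is rescaled to
about 10 * (10 + 1) = 110 and the exit probability to 1/(10 + 1)
(illustrative figures).  */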
8226 static void
8227 scale_profile_for_vect_loop (class loop *loop, unsigned vf)
8229 edge preheader = loop_preheader_edge (loop);
8230 /* Reduce loop iterations by the vectorization factor. */
8231 gcov_type new_est_niter = niter_for_unrolled_loop (loop, vf);
8232 profile_count freq_h = loop->header->count, freq_e = preheader->count ();
8234 if (freq_h.nonzero_p ())
8236 profile_probability p;
8238 /* Avoid dropping loop body profile counter to 0 because of zero count
8239 in loop's preheader. */
8240 if (!(freq_e == profile_count::zero ()))
8241 freq_e = freq_e.force_nonzero ();
8242 p = freq_e.apply_scale (new_est_niter + 1, 1).probability_in (freq_h);
8243 scale_loop_frequencies (loop, p);
8246 edge exit_e = single_exit (loop);
8247 exit_e->probability = profile_probability::always ()
8248 .apply_scale (1, new_est_niter + 1);
8250 edge exit_l = single_pred_edge (loop->latch);
8251 profile_probability prob = exit_l->probability;
8252 exit_l->probability = exit_e->probability.invert ();
8253 if (prob.initialized_p () && exit_l->probability.initialized_p ())
8254 scale_bbs_frequencies (&loop->latch, 1, exit_l->probability / prob);
8257 /* Vectorize STMT_INFO if relevant, inserting any new instructions before GSI.
8258 When vectorizing STMT_INFO as a store, set *SEEN_STORE to its
8259 stmt_vec_info. */
8261 static void
8262 vect_transform_loop_stmt (loop_vec_info loop_vinfo, stmt_vec_info stmt_info,
8263 gimple_stmt_iterator *gsi, stmt_vec_info *seen_store)
8265 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
8266 poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
8268 if (dump_enabled_p ())
8269 dump_printf_loc (MSG_NOTE, vect_location,
8270 "------>vectorizing statement: %G", stmt_info->stmt);
8272 if (MAY_HAVE_DEBUG_BIND_STMTS && !STMT_VINFO_LIVE_P (stmt_info))
8273 vect_loop_kill_debug_uses (loop, stmt_info);
8275 if (!STMT_VINFO_RELEVANT_P (stmt_info)
8276 && !STMT_VINFO_LIVE_P (stmt_info))
8277 return;
8279 if (STMT_VINFO_VECTYPE (stmt_info))
8281 poly_uint64 nunits
8282 = TYPE_VECTOR_SUBPARTS (STMT_VINFO_VECTYPE (stmt_info));
8283 if (!STMT_SLP_TYPE (stmt_info)
8284 && maybe_ne (nunits, vf)
8285 && dump_enabled_p ())
8286 /* For SLP, VF is set according to the unrolling factor, not
8287 to the vector size, hence this message is not valid for SLP. */
8288 dump_printf_loc (MSG_NOTE, vect_location, "multiple-types.\n");
8291 /* Pure SLP statements have already been vectorized. We still need
8292 to apply loop vectorization to hybrid SLP statements. */
8293 if (PURE_SLP_STMT (stmt_info))
8294 return;
8296 if (dump_enabled_p ())
8297 dump_printf_loc (MSG_NOTE, vect_location, "transform statement.\n");
8299 if (vect_transform_stmt (stmt_info, gsi, NULL, NULL))
8300 *seen_store = stmt_info;
8303 /* Helper function to pass to simplify_replace_tree to enable replacing tree's
8304 in the hash_map with its corresponding values. */
8306 static tree
8307 find_in_mapping (tree t, void *context)
8309 hash_map<tree,tree>* mapping = (hash_map<tree, tree>*) context;
8311 tree *value = mapping->get (t);
8312 return value ? *value : t;
8315 /* Update EPILOGUE's loop_vec_info. EPILOGUE was constructed as a copy of the
8316 original loop that has now been vectorized.
8318 The inits of the data_references need to be advanced with the number of
8319 iterations of the main loop. This has been computed in vect_do_peeling and
8320 is stored in parameter ADVANCE. We first restore the data_references
8321 initial offset with the values recorded in ORIG_DRS_INIT.
8323 Since the loop_vec_info of this EPILOGUE was constructed for the original
8324 loop, its stmt_vec_infos all point to the original statements. These need
8325 to be updated to point to their corresponding copies as well as the SSA_NAMES
8326 in their PATTERN_DEF_SEQs and RELATED_STMTs.
8328 The data_reference's connections also need to be updated. Their
8329 corresponding dr_vec_info need to be reconnected to the EPILOGUE's
8330 stmt_vec_infos, their statements need to point to their corresponding copy,
8331 if they are gather loads or scatter stores then their reference needs to be
8332 updated to point to its corresponding copy and finally we set
8333 'base_misaligned' to false as we have already peeled for alignment in the
8334 prologue of the main loop. */
8336 static void
8337 update_epilogue_loop_vinfo (class loop *epilogue, tree advance)
8339 loop_vec_info epilogue_vinfo = loop_vec_info_for_loop (epilogue);
8340 auto_vec<gimple *> stmt_worklist;
8341 hash_map<tree,tree> mapping;
8342 gimple *orig_stmt, *new_stmt;
8343 gimple_stmt_iterator epilogue_gsi;
8344 gphi_iterator epilogue_phi_gsi;
8345 stmt_vec_info stmt_vinfo = NULL, related_vinfo;
8346 basic_block *epilogue_bbs = get_loop_body (epilogue);
8347 unsigned i;
8349 LOOP_VINFO_BBS (epilogue_vinfo) = epilogue_bbs;
8351 /* Advance data_reference's with the number of iterations of the previous
8352 loop and its prologue. */
8353 vect_update_inits_of_drs (epilogue_vinfo, advance, PLUS_EXPR);
8356 /* The EPILOGUE loop is a copy of the original loop so they share the same
8357 gimple UIDs. In this loop we update the loop_vec_info of the EPILOGUE to
8358 point to the copied statements. We also create a mapping of all LHS' in
8359 the original loop and all the LHS' in the EPILOGUE and create worklists to
8360 update the STMT_VINFO_PATTERN_DEF_SEQs and STMT_VINFO_RELATED_STMTs. */
8361 for (unsigned i = 0; i < epilogue->num_nodes; ++i)
8363 for (epilogue_phi_gsi = gsi_start_phis (epilogue_bbs[i]);
8364 !gsi_end_p (epilogue_phi_gsi); gsi_next (&epilogue_phi_gsi))
8366 new_stmt = epilogue_phi_gsi.phi ();
8368 gcc_assert (gimple_uid (new_stmt) > 0);
8369 stmt_vinfo
8370 = epilogue_vinfo->stmt_vec_infos[gimple_uid (new_stmt) - 1];
8372 orig_stmt = STMT_VINFO_STMT (stmt_vinfo);
8373 STMT_VINFO_STMT (stmt_vinfo) = new_stmt;
8375 mapping.put (gimple_phi_result (orig_stmt),
8376 gimple_phi_result (new_stmt));
8377 /* PHI nodes cannot have patterns or related statements. */
8378 gcc_assert (STMT_VINFO_PATTERN_DEF_SEQ (stmt_vinfo) == NULL
8379 && STMT_VINFO_RELATED_STMT (stmt_vinfo) == NULL);
8382 for (epilogue_gsi = gsi_start_bb (epilogue_bbs[i]);
8383 !gsi_end_p (epilogue_gsi); gsi_next (&epilogue_gsi))
8385 new_stmt = gsi_stmt (epilogue_gsi);
8387 gcc_assert (gimple_uid (new_stmt) > 0);
8388 stmt_vinfo
8389 = epilogue_vinfo->stmt_vec_infos[gimple_uid (new_stmt) - 1];
8391 orig_stmt = STMT_VINFO_STMT (stmt_vinfo);
8392 STMT_VINFO_STMT (stmt_vinfo) = new_stmt;
8394 if (tree old_lhs = gimple_get_lhs (orig_stmt))
8395 mapping.put (old_lhs, gimple_get_lhs (new_stmt));
8397 if (STMT_VINFO_PATTERN_DEF_SEQ (stmt_vinfo))
8399 gimple_seq seq = STMT_VINFO_PATTERN_DEF_SEQ (stmt_vinfo);
8400 for (gimple_stmt_iterator gsi = gsi_start (seq);
8401 !gsi_end_p (gsi); gsi_next (&gsi))
8402 stmt_worklist.safe_push (gsi_stmt (gsi));
8405 related_vinfo = STMT_VINFO_RELATED_STMT (stmt_vinfo);
8406 if (related_vinfo != NULL && related_vinfo != stmt_vinfo)
8408 gimple *stmt = STMT_VINFO_STMT (related_vinfo);
8409 stmt_worklist.safe_push (stmt);
8410 /* Set BB such that the assert in
8411 'get_initial_def_for_reduction' is able to determine that
8412 the BB of the related stmt is inside this loop. */
8413 gimple_set_bb (stmt,
8414 gimple_bb (new_stmt));
8415 related_vinfo = STMT_VINFO_RELATED_STMT (related_vinfo);
8416 gcc_assert (related_vinfo == NULL
8417 || related_vinfo == stmt_vinfo);
8422 /* The PATTERN_DEF_SEQs and RELATED_STMTs in the epilogue were constructed
8423 using the original main loop and thus need to be updated to refer to the
8424 cloned variables used in the epilogue. */
8425 for (unsigned i = 0; i < stmt_worklist.length (); ++i)
8427 gimple *stmt = stmt_worklist[i];
8428 tree *new_op;
8430 for (unsigned j = 1; j < gimple_num_ops (stmt); ++j)
8432 tree op = gimple_op (stmt, j);
8433 if ((new_op = mapping.get(op)))
8434 gimple_set_op (stmt, j, *new_op);
8435 else
8437 /* PR92429: The last argument of simplify_replace_tree disables
8438 folding when replacing arguments. This is required as
8439 otherwise you might end up with different statements than the
8440 ones analyzed in vect_loop_analyze, leading to different
8441 vectorization. */
8442 op = simplify_replace_tree (op, NULL_TREE, NULL_TREE,
8443 &find_in_mapping, &mapping, false);
8444 gimple_set_op (stmt, j, op);
8449 struct data_reference *dr;
8450 vec<data_reference_p> datarefs = epilogue_vinfo->shared->datarefs;
8451 FOR_EACH_VEC_ELT (datarefs, i, dr)
8453 orig_stmt = DR_STMT (dr);
8454 gcc_assert (gimple_uid (orig_stmt) > 0);
8455 stmt_vinfo = epilogue_vinfo->stmt_vec_infos[gimple_uid (orig_stmt) - 1];
8456 /* Data references for gather loads and scatter stores do not use the
8457 updated offset we set using ADVANCE. Instead we have to make sure the
8458 reference in each data reference points to the corresponding copy of
8459 the original in the epilogue. */
8460 if (STMT_VINFO_MEMORY_ACCESS_TYPE (vect_stmt_to_vectorize (stmt_vinfo))
8461 == VMAT_GATHER_SCATTER)
8463 DR_REF (dr)
8464 = simplify_replace_tree (DR_REF (dr), NULL_TREE, NULL_TREE,
8465 &find_in_mapping, &mapping);
8466 DR_BASE_ADDRESS (dr)
8467 = simplify_replace_tree (DR_BASE_ADDRESS (dr), NULL_TREE, NULL_TREE,
8468 &find_in_mapping, &mapping);
8470 DR_STMT (dr) = STMT_VINFO_STMT (stmt_vinfo);
8471 stmt_vinfo->dr_aux.stmt = stmt_vinfo;
8472 /* The vector size of the epilogue is smaller than that of the main loop
8473 so the alignment requirement is either the same or lower. This means
8474 the DR will by definition be aligned. */
8475 STMT_VINFO_DR_INFO (stmt_vinfo)->base_misaligned = false;
8478 epilogue_vinfo->shared->datarefs_copy.release ();
8479 epilogue_vinfo->shared->save_datarefs ();
8482 /* Function vect_transform_loop.
8484 The analysis phase has determined that the loop is vectorizable.
8485 Vectorize the loop - create vectorized stmts to replace the scalar
8486 stmts in the loop, and update the loop exit condition.
8487 Returns the scalar epilogue loop if any. */
8489 class loop *
8490 vect_transform_loop (loop_vec_info loop_vinfo)
8492 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
8493 class loop *epilogue = NULL;
8494 basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
8495 int nbbs = loop->num_nodes;
8496 int i;
8497 tree niters_vector = NULL_TREE;
8498 tree step_vector = NULL_TREE;
8499 tree niters_vector_mult_vf = NULL_TREE;
8500 poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
8501 unsigned int lowest_vf = constant_lower_bound (vf);
8502 gimple *stmt;
8503 bool check_profitability = false;
8504 unsigned int th;
8506 DUMP_VECT_SCOPE ("vec_transform_loop");
8508 loop_vinfo->shared->check_datarefs ();
8510 /* Use the more conservative vectorization threshold. If the number
8511 of iterations is constant, assume the cost check has been performed
8512 by our caller. If the threshold makes all loops profitable that
8513 run at least the (estimated) vectorization factor number of times,
8514 checking is pointless, too. */
8515 th = LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo);
8516 if (vect_apply_runtime_profitability_check_p (loop_vinfo))
8518 if (dump_enabled_p ())
8519 dump_printf_loc (MSG_NOTE, vect_location,
8520 "Profitability threshold is %d loop iterations.\n",
8521 th);
8522 check_profitability = true;
8525 /* Make sure there exists a single-predecessor exit bb. Do this before
8526 versioning. */
8527 edge e = single_exit (loop);
8528 if (! single_pred_p (e->dest))
8530 split_loop_exit_edge (e, true);
8531 if (dump_enabled_p ())
8532 dump_printf (MSG_NOTE, "split exit edge\n");
8535 /* Version the loop first, if required, so the profitability check
8536 comes first. */
8538 if (LOOP_REQUIRES_VERSIONING (loop_vinfo))
8540 class loop *sloop
8541 = vect_loop_versioning (loop_vinfo);
8542 sloop->force_vectorize = false;
8543 check_profitability = false;
8546 /* Make sure there exists a single-predecessor exit bb also on the
8547 scalar loop copy. Do this after versioning but before peeling
8548 so the CFG structure is fine for both the scalar and the if-converted
8549 loop, letting slpeel_duplicate_current_defs_from_edges see matched
8550 loop-closed PHI nodes on the exit. */
8551 if (LOOP_VINFO_SCALAR_LOOP (loop_vinfo))
8553 e = single_exit (LOOP_VINFO_SCALAR_LOOP (loop_vinfo));
8554 if (! single_pred_p (e->dest))
8556 split_loop_exit_edge (e, true);
8557 if (dump_enabled_p ())
8558 dump_printf (MSG_NOTE, "split exit edge of scalar loop\n");
8562 tree niters = vect_build_loop_niters (loop_vinfo);
8563 LOOP_VINFO_NITERS_UNCHANGED (loop_vinfo) = niters;
8564 tree nitersm1 = unshare_expr (LOOP_VINFO_NITERSM1 (loop_vinfo));
8565 bool niters_no_overflow = loop_niters_no_overflow (loop_vinfo);
8566 tree advance;
8567 drs_init_vec orig_drs_init;
8569 epilogue = vect_do_peeling (loop_vinfo, niters, nitersm1, &niters_vector,
8570 &step_vector, &niters_vector_mult_vf, th,
8571 check_profitability, niters_no_overflow,
8572 &advance);
8574 if (LOOP_VINFO_SCALAR_LOOP (loop_vinfo)
8575 && LOOP_VINFO_SCALAR_LOOP_SCALING (loop_vinfo).initialized_p ())
8576 scale_loop_frequencies (LOOP_VINFO_SCALAR_LOOP (loop_vinfo),
8577 LOOP_VINFO_SCALAR_LOOP_SCALING (loop_vinfo));
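/* Worked example (the numbers are illustrative only): with a
compile-time NITERS of 16, a constant VF of 4 and no loop masking, the
branch below simply sets NITERS_VECTOR to the constant 4 with a step of
1; otherwise vect_gen_vector_loop_niters emits the runtime computation. */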
8579 if (niters_vector == NULL_TREE)
8581 if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
8582 && !LOOP_VINFO_FULLY_MASKED_P (loop_vinfo)
8583 && known_eq (lowest_vf, vf))
8585 niters_vector
8586 = build_int_cst (TREE_TYPE (LOOP_VINFO_NITERS (loop_vinfo)),
8587 LOOP_VINFO_INT_NITERS (loop_vinfo) / lowest_vf);
8588 step_vector = build_one_cst (TREE_TYPE (niters));
8590 else
8591 vect_gen_vector_loop_niters (loop_vinfo, niters, &niters_vector,
8592 &step_vector, niters_no_overflow);
8595 /* 1) Make sure the loop header has exactly two entries
8596 2) Make sure we have a preheader basic block. */
8598 gcc_assert (EDGE_COUNT (loop->header->preds) == 2);
8600 split_edge (loop_preheader_edge (loop));
8602 if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo)
8603 && vect_use_loop_mask_for_alignment_p (loop_vinfo))
8604 /* This will deal with any possible peeling. */
8605 vect_prepare_for_masked_peels (loop_vinfo);
8607 /* Schedule the SLP instances first, then handle loop vectorization
8608 below. */
8609 if (!loop_vinfo->slp_instances.is_empty ())
8611 DUMP_VECT_SCOPE ("scheduling SLP instances");
8612 vect_schedule_slp (loop_vinfo);
8615 /* FORNOW: the vectorizer supports only loops whose body consists
8616 of one basic block (header + empty latch). When the vectorizer
8617 supports more involved loop forms, the order in which the BBs are
8618 traversed needs to be reconsidered. */
8620 for (i = 0; i < nbbs; i++)
8622 basic_block bb = bbs[i];
8623 stmt_vec_info stmt_info;
8625 for (gphi_iterator si = gsi_start_phis (bb); !gsi_end_p (si);
8626 gsi_next (&si))
8628 gphi *phi = si.phi ();
8629 if (dump_enabled_p ())
8630 dump_printf_loc (MSG_NOTE, vect_location,
8631 "------>vectorizing phi: %G", phi);
8632 stmt_info = loop_vinfo->lookup_stmt (phi);
8633 if (!stmt_info)
8634 continue;
8636 if (MAY_HAVE_DEBUG_BIND_STMTS && !STMT_VINFO_LIVE_P (stmt_info))
8637 vect_loop_kill_debug_uses (loop, stmt_info);
8639 if (!STMT_VINFO_RELEVANT_P (stmt_info)
8640 && !STMT_VINFO_LIVE_P (stmt_info))
8641 continue;
8643 if (STMT_VINFO_VECTYPE (stmt_info)
8644 && (maybe_ne
8645 (TYPE_VECTOR_SUBPARTS (STMT_VINFO_VECTYPE (stmt_info)), vf))
8646 && dump_enabled_p ())
8647 dump_printf_loc (MSG_NOTE, vect_location, "multiple-types.\n");
8649 if ((STMT_VINFO_DEF_TYPE (stmt_info) == vect_induction_def
8650 || STMT_VINFO_DEF_TYPE (stmt_info) == vect_reduction_def
8651 || STMT_VINFO_DEF_TYPE (stmt_info) == vect_double_reduction_def
8652 || STMT_VINFO_DEF_TYPE (stmt_info) == vect_nested_cycle
8653 || STMT_VINFO_DEF_TYPE (stmt_info) == vect_internal_def)
8654 && ! PURE_SLP_STMT (stmt_info))
8656 if (dump_enabled_p ())
8657 dump_printf_loc (MSG_NOTE, vect_location, "transform phi.\n");
8658 vect_transform_stmt (stmt_info, NULL, NULL, NULL);
8662 for (gimple_stmt_iterator si = gsi_start_bb (bb);
8663 !gsi_end_p (si);)
8665 stmt = gsi_stmt (si);
8666 /* During vectorization remove existing clobber stmts. */
8667 if (gimple_clobber_p (stmt))
8669 unlink_stmt_vdef (stmt);
8670 gsi_remove (&si, true);
8671 release_defs (stmt);
8673 else
8675 stmt_info = loop_vinfo->lookup_stmt (stmt);
8677 /* vector stmts created in the outer-loop during vectorization of
8678 stmts in an inner-loop may not have a stmt_info, and do not
8679 need to be vectorized. */
8680 stmt_vec_info seen_store = NULL;
8681 if (stmt_info)
8683 if (STMT_VINFO_IN_PATTERN_P (stmt_info))
8685 gimple *def_seq = STMT_VINFO_PATTERN_DEF_SEQ (stmt_info);
8686 for (gimple_stmt_iterator subsi = gsi_start (def_seq);
8687 !gsi_end_p (subsi); gsi_next (&subsi))
8689 stmt_vec_info pat_stmt_info
8690 = loop_vinfo->lookup_stmt (gsi_stmt (subsi));
8691 vect_transform_loop_stmt (loop_vinfo, pat_stmt_info,
8692 &si, &seen_store);
8694 stmt_vec_info pat_stmt_info
8695 = STMT_VINFO_RELATED_STMT (stmt_info);
8696 vect_transform_loop_stmt (loop_vinfo, pat_stmt_info, &si,
8697 &seen_store);
8699 vect_transform_loop_stmt (loop_vinfo, stmt_info, &si,
8700 &seen_store);
8702 gsi_next (&si);
8703 if (seen_store)
8705 if (STMT_VINFO_GROUPED_ACCESS (seen_store))
8706 /* Interleaving. The vectorization of the
8707 interleaving chain was completed - free all
8708 the stores in the chain. */
8709 vect_remove_stores (DR_GROUP_FIRST_ELEMENT (seen_store));
8710 else
8711 /* Free the attached stmt_vec_info and remove the stmt. */
8712 loop_vinfo->remove_stmt (stmt_info);
8717 /* Stub out scalar statements that must not survive vectorization.
8718 Doing this here helps with grouped statements, or statements that
8719 are involved in patterns. */
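/* For instance (a made-up GIMPLE fragment): a leftover scalar
'_7 = .MASK_LOAD (&a[i_3], 32B, mask_5);' whose result is not a vector
is replaced below by '_7 = 0;', since the real work is now done by the
vectorized copy of the statement. */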
8720 for (gimple_stmt_iterator gsi = gsi_start_bb (bb);
8721 !gsi_end_p (gsi); gsi_next (&gsi))
8723 gcall *call = dyn_cast <gcall *> (gsi_stmt (gsi));
8724 if (call && gimple_call_internal_p (call, IFN_MASK_LOAD))
8726 tree lhs = gimple_get_lhs (call);
8727 if (!VECTOR_TYPE_P (TREE_TYPE (lhs)))
8729 tree zero = build_zero_cst (TREE_TYPE (lhs));
8730 gimple *new_stmt = gimple_build_assign (lhs, zero);
8731 gsi_replace (&gsi, new_stmt, true);
8735 } /* BBs in loop */
8737 /* The vectorization factor is always > 1, so if we use an IV increment of 1,
8738 a zero NITERS becomes a nonzero NITERS_VECTOR. */
8739 if (integer_onep (step_vector))
8740 niters_no_overflow = true;
8741 vect_set_loop_condition (loop, loop_vinfo, niters_vector, step_vector,
8742 niters_vector_mult_vf, !niters_no_overflow);
8744 unsigned int assumed_vf = vect_vf_for_cost (loop_vinfo);
8745 scale_profile_for_vect_loop (loop, assumed_vf);
8747 /* True if the final iteration might not handle a full vector's
8748 worth of scalar iterations. */
8749 bool final_iter_may_be_partial = LOOP_VINFO_FULLY_MASKED_P (loop_vinfo);
8750 /* The minimum number of iterations performed by the epilogue. This
8751 is 1 when peeling for gaps because we always need a final scalar
8752 iteration. */
8753 int min_epilogue_iters = LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo) ? 1 : 0;
8754 /* +1 to convert latch counts to loop iteration counts,
8755 -min_epilogue_iters to remove iterations that cannot be performed
8756 by the vector code. */
8757 int bias_for_lowest = 1 - min_epilogue_iters;
8758 int bias_for_assumed = bias_for_lowest;
8759 int alignment_npeels = LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo);
8760 if (alignment_npeels && LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
8762 /* When the amount of peeling is known at compile time, the first
8763 iteration will have exactly alignment_npeels active elements.
8764 In the worst case it will have at least one. */
8765 int min_first_active = (alignment_npeels > 0 ? alignment_npeels : 1);
8766 bias_for_lowest += lowest_vf - min_first_active;
8767 bias_for_assumed += assumed_vf - min_first_active;
8769 /* In these calculations the "- 1" converts loop iteration counts
8770 back to latch counts. */
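/* Worked example (numbers chosen for illustration): with VF = 4, no
peeling for gaps and no loop masking, min_epilogue_iters is 0 and
bias_for_lowest is 1; an original latch bound of 99 (100 iterations)
then becomes floor ((99 + 1) / 4) - 1 = 24, i.e. the vector loop
executes at most 25 iterations. */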
8771 if (loop->any_upper_bound)
8772 loop->nb_iterations_upper_bound
8773 = (final_iter_may_be_partial
8774 ? wi::udiv_ceil (loop->nb_iterations_upper_bound + bias_for_lowest,
8775 lowest_vf) - 1
8776 : wi::udiv_floor (loop->nb_iterations_upper_bound + bias_for_lowest,
8777 lowest_vf) - 1);
8778 if (loop->any_likely_upper_bound)
8779 loop->nb_iterations_likely_upper_bound
8780 = (final_iter_may_be_partial
8781 ? wi::udiv_ceil (loop->nb_iterations_likely_upper_bound
8782 + bias_for_lowest, lowest_vf) - 1
8783 : wi::udiv_floor (loop->nb_iterations_likely_upper_bound
8784 + bias_for_lowest, lowest_vf) - 1);
8785 if (loop->any_estimate)
8786 loop->nb_iterations_estimate
8787 = (final_iter_may_be_partial
8788 ? wi::udiv_ceil (loop->nb_iterations_estimate + bias_for_assumed,
8789 assumed_vf) - 1
8790 : wi::udiv_floor (loop->nb_iterations_estimate + bias_for_assumed,
8791 assumed_vf) - 1);
8793 if (dump_enabled_p ())
8795 if (!LOOP_VINFO_EPILOGUE_P (loop_vinfo))
8797 dump_printf_loc (MSG_NOTE, vect_location,
8798 "LOOP VECTORIZED\n");
8799 if (loop->inner)
8800 dump_printf_loc (MSG_NOTE, vect_location,
8801 "OUTER LOOP VECTORIZED\n");
8802 dump_printf (MSG_NOTE, "\n");
8804 else
8805 dump_printf_loc (MSG_NOTE, vect_location,
8806 "LOOP EPILOGUE VECTORIZED (MODE=%s)\n",
8807 GET_MODE_NAME (loop_vinfo->vector_mode));
8810 /* Loops vectorized with a variable factor won't benefit from
8811 unrolling/peeling. */
8812 if (!vf.is_constant ())
8814 loop->unroll = 1;
8815 if (dump_enabled_p ())
8816 dump_printf_loc (MSG_NOTE, vect_location, "Disabling unrolling due to"
8817 " variable-length vectorization factor\n");
8819 /* Free SLP instances here because otherwise stmt reference counting
8820 won't work. */
8821 slp_instance instance;
8822 FOR_EACH_VEC_ELT (LOOP_VINFO_SLP_INSTANCES (loop_vinfo), i, instance)
8823 vect_free_slp_instance (instance, true);
8824 LOOP_VINFO_SLP_INSTANCES (loop_vinfo).release ();
8825 /* Clear the safelen field since its value is invalid after vectorization,
8826 as the vectorized loop can have loop-carried dependencies. */
8827 loop->safelen = 0;
8829 if (epilogue)
8831 update_epilogue_loop_vinfo (epilogue, advance);
8833 epilogue->simduid = loop->simduid;
8834 epilogue->force_vectorize = loop->force_vectorize;
8835 epilogue->dont_vectorize = false;
8838 return epilogue;
8841 /* The code below tries to perform a simple optimization - revert
8842 if-conversion for masked stores, i.e. if the mask of a store is zero,
8843 do not perform the store and, if possible, skip the stored value producers too.
8844 For example,
8845 for (i=0; i<n; i++)
8846 if (c[i])
8848 p1[i] += 1;
8849 p2[i] = p3[i] +2;
8851 this transformation will produce the following semi-hammock:
8853 if (!mask__ifc__42.18_165 == { 0, 0, 0, 0, 0, 0, 0, 0 })
8855 vect__11.19_170 = MASK_LOAD (vectp_p1.20_168, 0B, mask__ifc__42.18_165);
8856 vect__12.22_172 = vect__11.19_170 + vect_cst__171;
8857 MASK_STORE (vectp_p1.23_175, 0B, mask__ifc__42.18_165, vect__12.22_172);
8858 vect__18.25_182 = MASK_LOAD (vectp_p3.26_180, 0B, mask__ifc__42.18_165);
8859 vect__19.28_184 = vect__18.25_182 + vect_cst__183;
8860 MASK_STORE (vectp_p2.29_187, 0B, mask__ifc__42.18_165, vect__19.28_184);
8864 void
8865 optimize_mask_stores (class loop *loop)
8867 basic_block *bbs = get_loop_body (loop);
8868 unsigned nbbs = loop->num_nodes;
8869 unsigned i;
8870 basic_block bb;
8871 class loop *bb_loop;
8872 gimple_stmt_iterator gsi;
8873 gimple *stmt;
8874 auto_vec<gimple *> worklist;
8875 auto_purge_vect_location sentinel;
8877 vect_location = find_loop_location (loop);
8878 /* Pick up all masked stores in the loop, if any. */
8879 for (i = 0; i < nbbs; i++)
8881 bb = bbs[i];
8882 for (gsi = gsi_start_bb (bb); !gsi_end_p (gsi);
8883 gsi_next (&gsi))
8885 stmt = gsi_stmt (gsi);
8886 if (gimple_call_internal_p (stmt, IFN_MASK_STORE))
8887 worklist.safe_push (stmt);
8891 free (bbs);
8892 if (worklist.is_empty ())
8893 return;
8895 /* Loop has masked stores. */
8896 while (!worklist.is_empty ())
8898 gimple *last, *last_store;
8899 edge e, efalse;
8900 tree mask;
8901 basic_block store_bb, join_bb;
8902 gimple_stmt_iterator gsi_to;
8903 tree vdef, new_vdef;
8904 gphi *phi;
8905 tree vectype;
8906 tree zero;
8908 last = worklist.pop ();
8909 mask = gimple_call_arg (last, 2);
8910 bb = gimple_bb (last);
8911 /* Create then_bb and the if-then structure in the CFG; then_bb belongs
8912 to the same loop as if_bb. It could be different from LOOP when a
8913 two-level loop nest is vectorized and the mask_store belongs to the
8914 inner one. */
8915 e = split_block (bb, last);
8916 bb_loop = bb->loop_father;
8917 gcc_assert (loop == bb_loop || flow_loop_nested_p (loop, bb_loop));
8918 join_bb = e->dest;
8919 store_bb = create_empty_bb (bb);
8920 add_bb_to_loop (store_bb, bb_loop);
8921 e->flags = EDGE_TRUE_VALUE;
8922 efalse = make_edge (bb, store_bb, EDGE_FALSE_VALUE);
8923 /* Treat the path into STORE_BB as unlikely. */
8924 efalse->probability = profile_probability::unlikely ();
8925 store_bb->count = efalse->count ();
8926 make_single_succ_edge (store_bb, join_bb, EDGE_FALLTHRU);
8927 if (dom_info_available_p (CDI_DOMINATORS))
8928 set_immediate_dominator (CDI_DOMINATORS, store_bb, bb);
8929 if (dump_enabled_p ())
8930 dump_printf_loc (MSG_NOTE, vect_location,
8931 "Create new block %d to sink mask stores.",
8932 store_bb->index);
8933 /* Create vector comparison with boolean result. */
8934 vectype = TREE_TYPE (mask);
8935 zero = build_zero_cst (vectype);
8936 stmt = gimple_build_cond (EQ_EXPR, mask, zero, NULL_TREE, NULL_TREE);
8937 gsi = gsi_last_bb (bb);
8938 gsi_insert_after (&gsi, stmt, GSI_SAME_STMT);
8939 /* Create new PHI node for vdef of the last masked store:
8940 .MEM_2 = VDEF <.MEM_1>
8941 will be converted to
8942 .MEM.3 = VDEF <.MEM_1>
8943 and new PHI node will be created in join bb
8944 .MEM_2 = PHI <.MEM_1, .MEM_3>
8946 vdef = gimple_vdef (last);
8947 new_vdef = make_ssa_name (gimple_vop (cfun), last);
8948 gimple_set_vdef (last, new_vdef);
8949 phi = create_phi_node (vdef, join_bb);
8950 add_phi_arg (phi, new_vdef, EDGE_SUCC (store_bb, 0), UNKNOWN_LOCATION);
8952 /* Put all masked stores with the same mask to STORE_BB if possible. */
8953 while (true)
8955 gimple_stmt_iterator gsi_from;
8956 gimple *stmt1 = NULL;
8958 /* Move masked store to STORE_BB. */
8959 last_store = last;
8960 gsi = gsi_for_stmt (last);
8961 gsi_from = gsi;
8962 /* Shift GSI to the previous stmt for further traversal. */
8963 gsi_prev (&gsi);
8964 gsi_to = gsi_start_bb (store_bb);
8965 gsi_move_before (&gsi_from, &gsi_to);
8967 /* Set up GSI_TO at the start of the now non-empty STORE_BB. */
8967 gsi_to = gsi_start_bb (store_bb);
8968 if (dump_enabled_p ())
8969 dump_printf_loc (MSG_NOTE, vect_location,
8970 "Move stmt to created bb\n%G", last);
8971 /* Move all stored value producers if possible. */
8972 while (!gsi_end_p (gsi))
8974 tree lhs;
8975 imm_use_iterator imm_iter;
8976 use_operand_p use_p;
8977 bool res;
8979 /* Skip debug statements. */
8980 if (is_gimple_debug (gsi_stmt (gsi)))
8982 gsi_prev (&gsi);
8983 continue;
8985 stmt1 = gsi_stmt (gsi);
8986 /* Do not consider statements writing to memory or having a
8987 volatile operand. */
8988 if (gimple_vdef (stmt1)
8989 || gimple_has_volatile_ops (stmt1))
8990 break;
8991 gsi_from = gsi;
8992 gsi_prev (&gsi);
8993 lhs = gimple_get_lhs (stmt1);
8994 if (!lhs)
8995 break;
8997 /* LHS of vectorized stmt must be SSA_NAME. */
8998 if (TREE_CODE (lhs) != SSA_NAME)
8999 break;
9001 if (!VECTOR_TYPE_P (TREE_TYPE (lhs)))
9003 /* Remove dead scalar statement. */
9004 if (has_zero_uses (lhs))
9006 gsi_remove (&gsi_from, true);
9007 continue;
9011 /* Check that LHS does not have uses outside of STORE_BB. */
9012 res = true;
9013 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, lhs)
9015 gimple *use_stmt;
9016 use_stmt = USE_STMT (use_p);
9017 if (is_gimple_debug (use_stmt))
9018 continue;
9019 if (gimple_bb (use_stmt) != store_bb)
9021 res = false;
9022 break;
9025 if (!res)
9026 break;
9028 if (gimple_vuse (stmt1)
9029 && gimple_vuse (stmt1) != gimple_vuse (last_store))
9030 break;
9032 /* Can move STMT1 to STORE_BB. */
9033 if (dump_enabled_p ())
9034 dump_printf_loc (MSG_NOTE, vect_location,
9035 "Move stmt to created bb\n%G", stmt1);
9036 gsi_move_before (&gsi_from, &gsi_to);
9037 /* Shift GSI_TO for further insertion. */
9038 gsi_prev (&gsi_to);
9040 /* Put other masked stores with the same mask to STORE_BB. */
9041 if (worklist.is_empty ()
9042 || gimple_call_arg (worklist.last (), 2) != mask
9043 || worklist.last () != stmt1)
9044 break;
9045 last = worklist.pop ();
9047 add_phi_arg (phi, gimple_vuse (last_store), e, UNKNOWN_LOCATION);
9051 /* Decide whether it is possible to use a zero-based induction variable
9052 when vectorizing LOOP_VINFO with a fully-masked loop. If it is,
9053 return the value that the induction variable must be able to hold
9054 in order to ensure that the loop ends with an all-false mask.
9055 Return -1 otherwise. */
9056 widest_int
9057 vect_iv_limit_for_full_masking (loop_vec_info loop_vinfo)
9059 tree niters_skip = LOOP_VINFO_MASK_SKIP_NITERS (loop_vinfo);
9060 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
9061 unsigned HOST_WIDE_INT max_vf = vect_max_vf (loop_vinfo);
9063 /* Calculate the value that the induction variable must be able
9064 to hit in order to ensure that we end the loop with an all-false mask.
9065 This involves adding the maximum number of inactive trailing scalar
9066 iterations. */
9067 widest_int iv_limit = -1;
9068 if (max_loop_iterations (loop, &iv_limit))
9070 if (niters_skip)
9072 /* Add the maximum number of skipped iterations to the
9073 maximum iteration count. */
9074 if (TREE_CODE (niters_skip) == INTEGER_CST)
9075 iv_limit += wi::to_widest (niters_skip);
9076 else
9077 iv_limit += max_vf - 1;
9079 else if (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo))
9080 /* Make a conservatively-correct assumption. */
9081 iv_limit += max_vf - 1;
9083 /* IV_LIMIT is the maximum number of latch iterations, which is also
9084 the maximum in-range IV value. Round this value down to the previous
9085 vector alignment boundary and then add an extra full iteration. */
9086 poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
9087 iv_limit = (iv_limit & -(int) known_alignment (vf)) + max_vf;
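/* Illustration with invented numbers: for VF = 4 (so max_vf = 4) and a
maximum latch count of 10, this yields (10 & -4) + 4 = 8 + 4 = 12,
so the IV must be able to hold the value 12. */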
9089 return iv_limit;