[Ada] Missing range check on assignment to bit-packed array
[official-gcc.git] / gcc / tree-vect-loop.c
blobb49ab152012a5c7fe9cc0564e58d296447f9ffb1
1 /* Loop Vectorization
2 Copyright (C) 2003-2019 Free Software Foundation, Inc.
3 Contributed by Dorit Naishlos <dorit@il.ibm.com> and
4 Ira Rosen <irar@il.ibm.com>
6 This file is part of GCC.
8 GCC is free software; you can redistribute it and/or modify it under
9 the terms of the GNU General Public License as published by the Free
10 Software Foundation; either version 3, or (at your option) any later
11 version.
13 GCC is distributed in the hope that it will be useful, but WITHOUT ANY
14 WARRANTY; without even the implied warranty of MERCHANTABILITY or
15 FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
16 for more details.
18 You should have received a copy of the GNU General Public License
19 along with GCC; see the file COPYING3. If not see
20 <http://www.gnu.org/licenses/>. */
22 #include "config.h"
23 #include "system.h"
24 #include "coretypes.h"
25 #include "backend.h"
26 #include "target.h"
27 #include "rtl.h"
28 #include "tree.h"
29 #include "gimple.h"
30 #include "cfghooks.h"
31 #include "tree-pass.h"
32 #include "ssa.h"
33 #include "optabs-tree.h"
34 #include "diagnostic-core.h"
35 #include "fold-const.h"
36 #include "stor-layout.h"
37 #include "cfganal.h"
38 #include "gimplify.h"
39 #include "gimple-iterator.h"
40 #include "gimplify-me.h"
41 #include "tree-ssa-loop-ivopts.h"
42 #include "tree-ssa-loop-manip.h"
43 #include "tree-ssa-loop-niter.h"
44 #include "tree-ssa-loop.h"
45 #include "cfgloop.h"
46 #include "params.h"
47 #include "tree-scalar-evolution.h"
48 #include "tree-vectorizer.h"
49 #include "gimple-fold.h"
50 #include "cgraph.h"
51 #include "tree-cfg.h"
52 #include "tree-if-conv.h"
53 #include "internal-fn.h"
54 #include "tree-vector-builder.h"
55 #include "vec-perm-indices.h"
56 #include "tree-eh.h"
58 /* Loop Vectorization Pass.
60 This pass tries to vectorize loops.
62 For example, the vectorizer transforms the following simple loop:
64 short a[N]; short b[N]; short c[N]; int i;
66 for (i=0; i<N; i++){
67 a[i] = b[i] + c[i];
68 }
70 as if it was manually vectorized by rewriting the source code into:
72 typedef int __attribute__((mode(V8HI))) v8hi;
73 short a[N]; short b[N]; short c[N]; int i;
74 v8hi *pa = (v8hi*)a, *pb = (v8hi*)b, *pc = (v8hi*)c;
75 v8hi va, vb, vc;
77 for (i=0; i<N/8; i++){
78 vb = pb[i];
79 vc = pc[i];
80 va = vb + vc;
81 pa[i] = va;
82 }
84 The main entry to this pass is vectorize_loops(), in which
85 the vectorizer applies a set of analyses on a given set of loops,
86 followed by the actual vectorization transformation for the loops that
87 had successfully passed the analysis phase.
88 Throughout this pass we make a distinction between two types of
89 data: scalars (which are represented by SSA_NAMES), and memory references
90 ("data-refs"). These two types of data require different handling both
91 during analysis and transformation. The types of data-refs that the
92 vectorizer currently supports are ARRAY_REFS whose base is an array DECL
93 (not a pointer), and INDIRECT_REFS through pointers; both array and pointer
94 accesses are required to have a simple (consecutive) access pattern.
96 Analysis phase:
97 ===============
98 The driver for the analysis phase is vect_analyze_loop().
99 It applies a set of analyses, some of which rely on the scalar evolution
100 analyzer (scev) developed by Sebastian Pop.
102 During the analysis phase the vectorizer records some information
103 per stmt in a "stmt_vec_info" struct which is attached to each stmt in the
104 loop, as well as general information about the loop as a whole, which is
105 recorded in a "loop_vec_info" struct attached to each loop.
107 Transformation phase:
108 =====================
109 The loop transformation phase scans all the stmts in the loop, and
110 creates a vector stmt (or a sequence of stmts) for each scalar stmt S in
111 the loop that needs to be vectorized. It inserts the vector code sequence
112 just before the scalar stmt S, and records a pointer to the vector code
113 in STMT_VINFO_VEC_STMT (stmt_info) (stmt_info is the stmt_vec_info struct
114 attached to S). This pointer will be used for the vectorization of following
115 stmts which use the def of stmt S. Stmt S is removed if it writes to memory;
116 otherwise, we rely on dead code elimination for removing it.
118 For example, say stmt S1 was vectorized into stmt VS1:
120 VS1: vb = px[i];
121 S1: b = x[i]; STMT_VINFO_VEC_STMT (stmt_info (S1)) = VS1
122 S2: a = b;
124 To vectorize stmt S2, the vectorizer first finds the stmt that defines
125 the operand 'b' (S1), and gets the relevant vector def 'vb' from the
126 vector stmt VS1 pointed to by STMT_VINFO_VEC_STMT (stmt_info (S1)). The
127 resulting sequence would be:
129 VS1: vb = px[i];
130 S1: b = x[i]; STMT_VINFO_VEC_STMT (stmt_info (S1)) = VS1
131 VS2: va = vb;
132 S2: a = b; STMT_VINFO_VEC_STMT (stmt_info (S2)) = VS2
134 Operands that are not SSA_NAMEs, are data-refs that appear in
135 load/store operations (like 'x[i]' in S1), and are handled differently.
137 Target modeling:
138 =================
139 Currently the only target specific information that is used is the
140 size of the vector (in bytes) - "TARGET_VECTORIZE_UNITS_PER_SIMD_WORD".
141 Targets that can support different sizes of vectors, for now will need
142 to specify one value for "TARGET_VECTORIZE_UNITS_PER_SIMD_WORD". More
143 flexibility will be added in the future.
145 Since we only vectorize operations whose vector form can be
146 expressed using existing tree codes, to verify that an operation is
147 supported, the vectorizer checks the relevant optab at the relevant
148 machine_mode (e.g, optab_handler (add_optab, V8HImode)). If
149 the value found is CODE_FOR_nothing, then there's no target support, and
150 we can't vectorize the stmt.
152 For additional information on this project see:
153 http://gcc.gnu.org/projects/tree-ssa/vectorization.html
154 */
156 static void vect_estimate_min_profitable_iters (loop_vec_info, int *, int *);
158 /* Subroutine of vect_determine_vf_for_stmt that handles only one
159 statement. VECTYPE_MAYBE_SET_P is true if STMT_VINFO_VECTYPE
160 may already be set for general statements (not just data refs). */
162 static opt_result
163 vect_determine_vf_for_stmt_1 (stmt_vec_info stmt_info,
164 bool vectype_maybe_set_p,
165 poly_uint64 *vf,
166 vec<stmt_vec_info > *mask_producers)
168 gimple *stmt = stmt_info->stmt;
170 if ((!STMT_VINFO_RELEVANT_P (stmt_info)
171 && !STMT_VINFO_LIVE_P (stmt_info))
172 || gimple_clobber_p (stmt))
174 if (dump_enabled_p ())
175 dump_printf_loc (MSG_NOTE, vect_location, "skip.\n");
176 return opt_result::success ();
179 tree stmt_vectype, nunits_vectype;
180 opt_result res = vect_get_vector_types_for_stmt (stmt_info, &stmt_vectype,
181 &nunits_vectype);
182 if (!res)
183 return res;
185 if (stmt_vectype)
187 if (STMT_VINFO_VECTYPE (stmt_info))
188 /* The only case when a vectype had been already set is for stmts
189 that contain a data ref, or for "pattern-stmts" (stmts generated
190 by the vectorizer to represent/replace a certain idiom). */
191 gcc_assert ((STMT_VINFO_DATA_REF (stmt_info)
192 || vectype_maybe_set_p)
193 && STMT_VINFO_VECTYPE (stmt_info) == stmt_vectype);
194 else if (stmt_vectype == boolean_type_node)
195 mask_producers->safe_push (stmt_info);
196 else
197 STMT_VINFO_VECTYPE (stmt_info) = stmt_vectype;
200 if (nunits_vectype)
201 vect_update_max_nunits (vf, nunits_vectype);
203 return opt_result::success ();
206 /* Subroutine of vect_determine_vectorization_factor. Set the vector
207 types of STMT_INFO and all attached pattern statements and update
208 the vectorization factor VF accordingly. If some of the statements
209 produce a mask result whose vector type can only be calculated later,
210 add them to MASK_PRODUCERS. Return true on success or false if
211 something prevented vectorization. */
213 static opt_result
214 vect_determine_vf_for_stmt (stmt_vec_info stmt_info, poly_uint64 *vf,
215 vec<stmt_vec_info > *mask_producers)
217 vec_info *vinfo = stmt_info->vinfo;
218 if (dump_enabled_p ())
219 dump_printf_loc (MSG_NOTE, vect_location, "==> examining statement: %G",
220 stmt_info->stmt);
221 opt_result res
222 = vect_determine_vf_for_stmt_1 (stmt_info, false, vf, mask_producers);
223 if (!res)
224 return res;
226 if (STMT_VINFO_IN_PATTERN_P (stmt_info)
227 && STMT_VINFO_RELATED_STMT (stmt_info))
229 gimple *pattern_def_seq = STMT_VINFO_PATTERN_DEF_SEQ (stmt_info);
230 stmt_info = STMT_VINFO_RELATED_STMT (stmt_info);
232 /* If a pattern statement has def stmts, analyze them too. */
233 for (gimple_stmt_iterator si = gsi_start (pattern_def_seq);
234 !gsi_end_p (si); gsi_next (&si))
236 stmt_vec_info def_stmt_info = vinfo->lookup_stmt (gsi_stmt (si));
237 if (dump_enabled_p ())
238 dump_printf_loc (MSG_NOTE, vect_location,
239 "==> examining pattern def stmt: %G",
240 def_stmt_info->stmt);
241 if (!vect_determine_vf_for_stmt_1 (def_stmt_info, true,
242 vf, mask_producers))
243 res = vect_determine_vf_for_stmt_1 (def_stmt_info, true,
244 vf, mask_producers);
245 if (!res)
246 return res;
249 if (dump_enabled_p ())
250 dump_printf_loc (MSG_NOTE, vect_location,
251 "==> examining pattern statement: %G",
252 stmt_info->stmt);
253 res = vect_determine_vf_for_stmt_1 (stmt_info, true, vf, mask_producers);
254 if (!res)
255 return res;
258 return opt_result::success ();
261 /* Function vect_determine_vectorization_factor
263 Determine the vectorization factor (VF). VF is the number of data elements
264 that are operated upon in parallel in a single iteration of the vectorized
265 loop. For example, when vectorizing a loop that operates on 4byte elements,
266 on a target with vector size (VS) 16byte, the VF is set to 4, since 4
267 elements can fit in a single vector register.
269 We currently support vectorization of loops in which all types operated upon
270 are of the same size. Therefore this function currently sets VF according to
271 the size of the types operated upon, and fails if there are multiple sizes
272 in the loop.
274 VF is also the factor by which the loop iterations are strip-mined, e.g.:
275 original loop:
276 for (i=0; i<N; i++){
277 a[i] = b[i] + c[i];
280 vectorized loop:
281 for (i=0; i<N; i+=VF){
282 a[i:VF] = b[i:VF] + c[i:VF];
286 static opt_result
287 vect_determine_vectorization_factor (loop_vec_info loop_vinfo)
289 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
290 basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
291 unsigned nbbs = loop->num_nodes;
292 poly_uint64 vectorization_factor = 1;
293 tree scalar_type = NULL_TREE;
294 gphi *phi;
295 tree vectype;
296 stmt_vec_info stmt_info;
297 unsigned i;
298 auto_vec<stmt_vec_info> mask_producers;
300 DUMP_VECT_SCOPE ("vect_determine_vectorization_factor");
302 for (i = 0; i < nbbs; i++)
304 basic_block bb = bbs[i];
306 for (gphi_iterator si = gsi_start_phis (bb); !gsi_end_p (si);
307 gsi_next (&si))
309 phi = si.phi ();
310 stmt_info = loop_vinfo->lookup_stmt (phi);
311 if (dump_enabled_p ())
312 dump_printf_loc (MSG_NOTE, vect_location, "==> examining phi: %G",
313 phi);
315 gcc_assert (stmt_info);
317 if (STMT_VINFO_RELEVANT_P (stmt_info)
318 || STMT_VINFO_LIVE_P (stmt_info))
320 gcc_assert (!STMT_VINFO_VECTYPE (stmt_info));
321 scalar_type = TREE_TYPE (PHI_RESULT (phi));
323 if (dump_enabled_p ())
324 dump_printf_loc (MSG_NOTE, vect_location,
325 "get vectype for scalar type: %T\n",
326 scalar_type);
328 vectype = get_vectype_for_scalar_type (scalar_type);
329 if (!vectype)
330 return opt_result::failure_at (phi,
331 "not vectorized: unsupported "
332 "data-type %T\n",
333 scalar_type);
334 STMT_VINFO_VECTYPE (stmt_info) = vectype;
336 if (dump_enabled_p ())
337 dump_printf_loc (MSG_NOTE, vect_location, "vectype: %T\n",
338 vectype);
340 if (dump_enabled_p ())
342 dump_printf_loc (MSG_NOTE, vect_location, "nunits = ");
343 dump_dec (MSG_NOTE, TYPE_VECTOR_SUBPARTS (vectype));
344 dump_printf (MSG_NOTE, "\n");
347 vect_update_max_nunits (&vectorization_factor, vectype);
351 for (gimple_stmt_iterator si = gsi_start_bb (bb); !gsi_end_p (si);
352 gsi_next (&si))
354 stmt_info = loop_vinfo->lookup_stmt (gsi_stmt (si));
355 opt_result res
356 = vect_determine_vf_for_stmt (stmt_info, &vectorization_factor,
357 &mask_producers);
358 if (!res)
359 return res;
363 /* TODO: Analyze cost. Decide if worth while to vectorize. */
364 if (dump_enabled_p ())
366 dump_printf_loc (MSG_NOTE, vect_location, "vectorization factor = ");
367 dump_dec (MSG_NOTE, vectorization_factor);
368 dump_printf (MSG_NOTE, "\n");
371 if (known_le (vectorization_factor, 1U))
372 return opt_result::failure_at (vect_location,
373 "not vectorized: unsupported data-type\n");
374 LOOP_VINFO_VECT_FACTOR (loop_vinfo) = vectorization_factor;
376 for (i = 0; i < mask_producers.length (); i++)
378 stmt_info = mask_producers[i];
379 opt_tree mask_type = vect_get_mask_type_for_stmt (stmt_info);
380 if (!mask_type)
381 return opt_result::propagate_failure (mask_type);
382 STMT_VINFO_VECTYPE (stmt_info) = mask_type;
385 return opt_result::success ();
389 /* Function vect_is_simple_iv_evolution.

   FORNOW: A simple evolution of an induction variable in the loop is
   considered a polynomial evolution.

   Examine ACCESS_FN with respect to the loop numbered LOOP_NB.  Return
   false if ACCESS_FN has no evolution part in that loop, or if that
   evolution part is itself a chrec.  Otherwise store the result of
   unshare_expr on the initial condition in *INIT and the evolution part
   in *STEP, and return true only if the step is one of:
     - an INTEGER_CST;
     - an SSA_NAME whose defining statement has no basic block or a
       basic block outside the loop, and whose type is integral, or is
       a scalar float type while flag_associative_math is set;
     - a REAL_CST while flag_associative_math is set.
   Note that *INIT and *STEP are written even when the step is rejected
   and false is returned.  */
394 static bool
395 vect_is_simple_iv_evolution (unsigned loop_nb, tree access_fn, tree * init,
396 tree * step)
398 tree init_expr;
399 tree step_expr;
400 tree evolution_part = evolution_part_in_loop_num (access_fn, loop_nb);
401 basic_block bb;
403 /* When there is no evolution in this loop, the evolution function
404 is not "simple". */
405 if (evolution_part == NULL_TREE)
406 return false;
408 /* When the evolution is a polynomial of degree >= 2
409 the evolution function is not "simple". */
410 if (tree_is_chrec (evolution_part))
411 return false;
413 step_expr = evolution_part;
414 init_expr = unshare_expr (initial_condition_in_loop_num (access_fn, loop_nb));
416 if (dump_enabled_p ())
417 dump_printf_loc (MSG_NOTE, vect_location, "step: %T, init: %T\n",
418 step_expr, init_expr);
/* The outputs are published before the step is validated below.  */
420 *init = init_expr;
421 *step = step_expr;
/* Reject the step unless it matches one of the accepted forms listed in
   the function comment; BB is only used to test where an SSA_NAME step
   is defined.  */
423 if (TREE_CODE (step_expr) != INTEGER_CST
424 && (TREE_CODE (step_expr) != SSA_NAME
425 || ((bb = gimple_bb (SSA_NAME_DEF_STMT (step_expr)))
426 && flow_bb_inside_loop_p (get_loop (cfun, loop_nb), bb))
427 || (!INTEGRAL_TYPE_P (TREE_TYPE (step_expr))
428 && (!SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr))
429 || !flag_associative_math)))
430 && (TREE_CODE (step_expr) != REAL_CST
431 || !flag_associative_math))
433 if (dump_enabled_p ())
434 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
435 "step unknown.\n");
436 return false;
439 return true;
442 /* Return true if PHI, described by STMT_INFO, is the inner PHI in
443 what we are assuming is a double reduction. For example, given
444 a structure like this:
446 outer1:
447 x_1 = PHI <x_4(outer2), ...>;
450 inner:
451 x_2 = PHI <x_1(outer1), ...>;
453 x_3 = ...;
456 outer2:
457 x_4 = PHI <x_3(inner)>;
460 outer loop analysis would treat x_1 as a double reduction phi and
461 this function would then return true for x_2. */
463 static bool
464 vect_inner_phi_in_double_reduction_p (stmt_vec_info stmt_info, gphi *phi)
466 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
467 use_operand_p use_p;
468 ssa_op_iter op_iter;
469 FOR_EACH_PHI_ARG (use_p, phi, op_iter, SSA_OP_USE)
470 if (stmt_vec_info def_info = loop_vinfo->lookup_def (USE_FROM_PTR (use_p)))
471 if (STMT_VINFO_DEF_TYPE (def_info) == vect_double_reduction_def)
472 return true;
473 return false;
476 /* Function vect_analyze_scalar_cycles_1.
478 Examine the cross iteration def-use cycles of scalar variables
479 in LOOP. LOOP_VINFO represents the loop that is now being
480 considered for vectorization (can be LOOP, or an outer-loop
481 enclosing LOOP). */
483 static void
484 vect_analyze_scalar_cycles_1 (loop_vec_info loop_vinfo, struct loop *loop)
486 basic_block bb = loop->header;
487 tree init, step;
488 auto_vec<stmt_vec_info, 64> worklist;
489 gphi_iterator gsi;
490 bool double_reduc;
492 DUMP_VECT_SCOPE ("vect_analyze_scalar_cycles");
494 /* First - identify all inductions. Reduction detection assumes that all the
495 inductions have been identified, therefore, this order must not be
496 changed. */
497 for (gsi = gsi_start_phis (bb); !gsi_end_p (gsi); gsi_next (&gsi))
499 gphi *phi = gsi.phi ();
500 tree access_fn = NULL;
501 tree def = PHI_RESULT (phi);
502 stmt_vec_info stmt_vinfo = loop_vinfo->lookup_stmt (phi);
504 if (dump_enabled_p ())
505 dump_printf_loc (MSG_NOTE, vect_location, "Analyze phi: %G", phi);
507 /* Skip virtual phi's. The data dependences that are associated with
508 virtual defs/uses (i.e., memory accesses) are analyzed elsewhere. */
509 if (virtual_operand_p (def))
510 continue;
512 STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_unknown_def_type;
514 /* Analyze the evolution function. */
515 access_fn = analyze_scalar_evolution (loop, def);
516 if (access_fn)
518 STRIP_NOPS (access_fn);
519 if (dump_enabled_p ())
520 dump_printf_loc (MSG_NOTE, vect_location,
521 "Access function of PHI: %T\n", access_fn);
522 STMT_VINFO_LOOP_PHI_EVOLUTION_BASE_UNCHANGED (stmt_vinfo)
523 = initial_condition_in_loop_num (access_fn, loop->num);
524 STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_vinfo)
525 = evolution_part_in_loop_num (access_fn, loop->num);
528 if (!access_fn
529 || vect_inner_phi_in_double_reduction_p (stmt_vinfo, phi)
530 || !vect_is_simple_iv_evolution (loop->num, access_fn, &init, &step)
531 || (LOOP_VINFO_LOOP (loop_vinfo) != loop
532 && TREE_CODE (step) != INTEGER_CST))
534 worklist.safe_push (stmt_vinfo);
535 continue;
538 gcc_assert (STMT_VINFO_LOOP_PHI_EVOLUTION_BASE_UNCHANGED (stmt_vinfo)
539 != NULL_TREE);
540 gcc_assert (STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_vinfo) != NULL_TREE);
542 if (dump_enabled_p ())
543 dump_printf_loc (MSG_NOTE, vect_location, "Detected induction.\n");
544 STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_induction_def;
548 /* Second - identify all reductions and nested cycles. */
549 while (worklist.length () > 0)
551 stmt_vec_info stmt_vinfo = worklist.pop ();
552 gphi *phi = as_a <gphi *> (stmt_vinfo->stmt);
553 tree def = PHI_RESULT (phi);
555 if (dump_enabled_p ())
556 dump_printf_loc (MSG_NOTE, vect_location, "Analyze phi: %G", phi);
558 gcc_assert (!virtual_operand_p (def)
559 && STMT_VINFO_DEF_TYPE (stmt_vinfo) == vect_unknown_def_type);
561 stmt_vec_info reduc_stmt_info
562 = vect_force_simple_reduction (loop_vinfo, stmt_vinfo,
563 &double_reduc, false);
564 if (reduc_stmt_info)
566 if (double_reduc)
568 if (dump_enabled_p ())
569 dump_printf_loc (MSG_NOTE, vect_location,
570 "Detected double reduction.\n");
572 STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_double_reduction_def;
573 STMT_VINFO_DEF_TYPE (reduc_stmt_info)
574 = vect_double_reduction_def;
576 else
578 if (loop != LOOP_VINFO_LOOP (loop_vinfo))
580 if (dump_enabled_p ())
581 dump_printf_loc (MSG_NOTE, vect_location,
582 "Detected vectorizable nested cycle.\n");
584 STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_nested_cycle;
585 STMT_VINFO_DEF_TYPE (reduc_stmt_info) = vect_nested_cycle;
587 else
589 if (dump_enabled_p ())
590 dump_printf_loc (MSG_NOTE, vect_location,
591 "Detected reduction.\n");
593 STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_reduction_def;
594 STMT_VINFO_DEF_TYPE (reduc_stmt_info) = vect_reduction_def;
595 /* Store the reduction cycles for possible vectorization in
596 loop-aware SLP if it was not detected as reduction
597 chain. */
598 if (! REDUC_GROUP_FIRST_ELEMENT (reduc_stmt_info))
599 LOOP_VINFO_REDUCTIONS (loop_vinfo).safe_push
600 (reduc_stmt_info);
604 else
605 if (dump_enabled_p ())
606 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
607 "Unknown def-use cycle pattern.\n");
612 /* Function vect_analyze_scalar_cycles.
614 Examine the cross iteration def-use cycles of scalar variables, by
615 analyzing the loop-header PHIs of scalar variables. Classify each
616 cycle as one of the following: invariant, induction, reduction, unknown.
617 We do that for the loop represented by LOOP_VINFO, and also to its
618 inner-loop, if exists.
619 Examples for scalar cycles:
621 Example1: reduction:
623 loop1:
624 for (i=0; i<N; i++)
625 sum += a[i];
627 Example2: induction:
629 loop2:
630 for (i=0; i<N; i++)
631 a[i] = i; */
633 static void
634 vect_analyze_scalar_cycles (loop_vec_info loop_vinfo)
636 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
638 vect_analyze_scalar_cycles_1 (loop_vinfo, loop);
640 /* When vectorizing an outer-loop, the inner-loop is executed sequentially.
641 Reductions in such inner-loop therefore have different properties than
642 the reductions in the nest that gets vectorized:
643 1. When vectorized, they are executed in the same order as in the original
644 scalar loop, so we can't change the order of computation when
645 vectorizing them.
646 2. FIXME: Inner-loop reductions can be used in the inner-loop, so the
647 current checks are too strict. */
649 if (loop->inner)
650 vect_analyze_scalar_cycles_1 (loop_vinfo, loop->inner);
653 /* Transfer group and reduction information from STMT_INFO to its
654 pattern stmt. */
656 static void
657 vect_fixup_reduc_chain (stmt_vec_info stmt_info)
659 stmt_vec_info firstp = STMT_VINFO_RELATED_STMT (stmt_info);
660 stmt_vec_info stmtp;
661 gcc_assert (!REDUC_GROUP_FIRST_ELEMENT (firstp)
662 && REDUC_GROUP_FIRST_ELEMENT (stmt_info));
663 REDUC_GROUP_SIZE (firstp) = REDUC_GROUP_SIZE (stmt_info);
666 stmtp = STMT_VINFO_RELATED_STMT (stmt_info);
667 REDUC_GROUP_FIRST_ELEMENT (stmtp) = firstp;
668 stmt_info = REDUC_GROUP_NEXT_ELEMENT (stmt_info);
669 if (stmt_info)
670 REDUC_GROUP_NEXT_ELEMENT (stmtp)
671 = STMT_VINFO_RELATED_STMT (stmt_info);
673 while (stmt_info);
674 STMT_VINFO_DEF_TYPE (stmtp) = vect_reduction_def;
677 /* Fixup scalar cycles that now have their stmts detected as patterns. */
679 static void
680 vect_fixup_scalar_cycles_with_patterns (loop_vec_info loop_vinfo)
682 stmt_vec_info first;
683 unsigned i;
685 FOR_EACH_VEC_ELT (LOOP_VINFO_REDUCTION_CHAINS (loop_vinfo), i, first)
686 if (STMT_VINFO_IN_PATTERN_P (first))
688 stmt_vec_info next = REDUC_GROUP_NEXT_ELEMENT (first);
689 while (next)
691 if (! STMT_VINFO_IN_PATTERN_P (next))
692 break;
693 next = REDUC_GROUP_NEXT_ELEMENT (next);
695 /* If not all stmt in the chain are patterns try to handle
696 the chain without patterns. */
697 if (! next)
699 vect_fixup_reduc_chain (first);
700 LOOP_VINFO_REDUCTION_CHAINS (loop_vinfo)[i]
701 = STMT_VINFO_RELATED_STMT (first);
706 /* Function vect_get_loop_niters.

   Determine how many iterations the loop is executed and place it
   in NUMBER_OF_ITERATIONS.  Place the number of latch iterations
   in NUMBER_OF_ITERATIONSM1.  Place the condition under which the
   niter information holds in ASSUMPTIONS.

   The outputs are first initialized to boolean_true_node (ASSUMPTIONS)
   and chrec_dont_know (both iteration counts).  They keep those values
   when single_exit yields no exit edge, when the niter analysis fails
   or yields an undetermined count, and when the "may be zero" condition
   is neither a comparison nor a constant.

   Return the loop exit condition.  */
716 static gcond *
717 vect_get_loop_niters (struct loop *loop, tree *assumptions,
718 tree *number_of_iterations, tree *number_of_iterationsm1)
720 edge exit = single_exit (loop);
721 struct tree_niter_desc niter_desc;
722 tree niter_assumptions, niter, may_be_zero;
723 gcond *cond = get_loop_exit_condition (loop);
/* Pessimistic defaults, returned unchanged on every early exit below
   except the "always zero" case.  */
725 *assumptions = boolean_true_node;
726 *number_of_iterationsm1 = chrec_dont_know;
727 *number_of_iterations = chrec_dont_know;
728 DUMP_VECT_SCOPE ("get_loop_niters");
730 if (!exit)
731 return cond;
733 may_be_zero = NULL_TREE;
734 if (!number_of_iterations_exit_assumptions (loop, exit, &niter_desc, NULL)
735 || chrec_contains_undetermined (niter_desc.niter))
736 return cond;
738 niter_assumptions = niter_desc.assumptions;
739 may_be_zero = niter_desc.may_be_zero;
740 niter = niter_desc.niter;
/* A MAY_BE_ZERO that is the constant zero carries no information.  */
742 if (may_be_zero && integer_zerop (may_be_zero))
743 may_be_zero = NULL_TREE;
745 if (may_be_zero)
747 if (COMPARISON_CLASS_P (may_be_zero))
749 /* Try to combine may_be_zero with assumptions, this can simplify
750 computation of niter expression. */
751 if (niter_assumptions && !integer_nonzerop (niter_assumptions))
752 niter_assumptions = fold_build2 (TRUTH_AND_EXPR, boolean_type_node,
753 niter_assumptions,
754 fold_build1 (TRUTH_NOT_EXPR,
755 boolean_type_node,
756 may_be_zero));
757 else
758 niter = fold_build3 (COND_EXPR, TREE_TYPE (niter), may_be_zero,
759 build_int_cst (TREE_TYPE (niter), 0),
760 rewrite_to_non_trapping_overflow (niter));
762 may_be_zero = NULL_TREE;
/* MAY_BE_ZERO is a nonzero constant: report zero latch iterations and
   one iteration, leaving ASSUMPTIONS at its default.  */
764 else if (integer_nonzerop (may_be_zero))
766 *number_of_iterationsm1 = build_int_cst (TREE_TYPE (niter), 0);
767 *number_of_iterations = build_int_cst (TREE_TYPE (niter), 1);
768 return cond;
/* Any other form of MAY_BE_ZERO is not handled; the outputs stay at
   their defaults.  */
770 else
771 return cond;
774 *assumptions = niter_assumptions;
775 *number_of_iterationsm1 = niter;
777 /* We want the number of loop header executions which is the number
778 of latch executions plus one.
779 ??? For UINT_MAX latch executions this number overflows to zero
780 for loops like do { n++; } while (n != 0); */
781 if (niter && !chrec_contains_undetermined (niter))
782 niter = fold_build2 (PLUS_EXPR, TREE_TYPE (niter), unshare_expr (niter),
783 build_int_cst (TREE_TYPE (niter), 1));
784 *number_of_iterations = niter;
786 return cond;
789 /* Function bb_in_loop_p
791 Used as predicate for dfs order traversal of the loop bbs. */
793 static bool
794 bb_in_loop_p (const_basic_block bb, const void *data)
796 const struct loop *const loop = (const struct loop *)data;
797 if (flow_bb_inside_loop_p (loop, bb))
798 return true;
799 return false;
803 /* Create and initialize a new loop_vec_info struct for LOOP_IN, as well as
   stmt_vec_info structs for all the stmts in LOOP_IN.

   The blocks of the loop are enumerated into BBS starting from the loop
   header; every PHI and statement in them gets its uid reset to 0 and is
   registered through add_stmt.  While scanning, the third argument of a
   matching IFN_GOMP_SIMD_LANE call is recorded in SIMD_IF_COND when it is
   zero or an SSA_NAME.  SHARED is forwarded to the vec_info base.  */
806 _loop_vec_info::_loop_vec_info (struct loop *loop_in, vec_info_shared *shared)
807 : vec_info (vec_info::loop, init_cost (loop_in), shared),
808 loop (loop_in),
809 bbs (XCNEWVEC (basic_block, loop->num_nodes)),
810 num_itersm1 (NULL_TREE),
811 num_iters (NULL_TREE),
812 num_iters_unchanged (NULL_TREE),
813 num_iters_assumptions (NULL_TREE),
814 th (0),
815 versioning_threshold (0),
816 vectorization_factor (0),
817 max_vectorization_factor (0),
818 mask_skip_niters (NULL_TREE),
819 mask_compare_type (NULL_TREE),
820 simd_if_cond (NULL_TREE),
821 unaligned_dr (NULL),
822 peeling_for_alignment (0),
823 ptr_mask (0),
824 ivexpr_map (NULL),
825 scan_map (NULL),
826 slp_unrolling_factor (1),
827 single_scalar_iteration_cost (0),
828 vectorizable (false),
829 can_fully_mask_p (true),
830 fully_masked_p (false),
831 peeling_for_gaps (false),
832 peeling_for_niter (false),
833 operands_swapped (false),
834 no_data_dependencies (false),
835 has_mask_store (false),
836 scalar_loop_scaling (profile_probability::uninitialized ()),
837 scalar_loop (NULL),
838 orig_loop_info (NULL)
840 /* CHECKME: We want to visit all BBs before their successors (except for
841 latch blocks, for which this assertion wouldn't hold). In the simple
842 case of the loop forms we allow, a dfs order of the BBs would be the same
843 as reversed postorder traversal, so we are safe. */
845 unsigned int nbbs = dfs_enumerate_from (loop->header, 0, bb_in_loop_p,
846 bbs, loop->num_nodes, loop);
/* The enumeration must have found every block of the loop.  */
847 gcc_assert (nbbs == loop->num_nodes);
849 for (unsigned int i = 0; i < nbbs; i++)
851 basic_block bb = bbs[i];
852 gimple_stmt_iterator si;
/* Register the PHIs of the block first, then its statements.  */
854 for (si = gsi_start_phis (bb); !gsi_end_p (si); gsi_next (&si))
856 gimple *phi = gsi_stmt (si);
857 gimple_set_uid (phi, 0);
858 add_stmt (phi);
861 for (si = gsi_start_bb (bb); !gsi_end_p (si); gsi_next (&si))
863 gimple *stmt = gsi_stmt (si);
864 gimple_set_uid (stmt, 0);
865 add_stmt (stmt);
866 /* If .GOMP_SIMD_LANE call for the current loop has 3 arguments, the
867 third argument is the #pragma omp simd if (x) condition, when 0,
868 loop shouldn't be vectorized, when non-zero constant, it should
869 be vectorized normally, otherwise versioned with vectorized loop
870 done if the condition is non-zero at runtime. */
871 if (loop_in->simduid
872 && is_gimple_call (stmt)
873 && gimple_call_internal_p (stmt)
874 && gimple_call_internal_fn (stmt) == IFN_GOMP_SIMD_LANE
875 && gimple_call_num_args (stmt) >= 3
876 && TREE_CODE (gimple_call_arg (stmt, 0)) == SSA_NAME
877 && (loop_in->simduid
878 == SSA_NAME_VAR (gimple_call_arg (stmt, 0))))
880 tree arg = gimple_call_arg (stmt, 2);
/* Only a zero or a run-time (SSA_NAME) condition is recorded; any other
   value is asserted to be a nonzero constant and ignored.  */
881 if (integer_zerop (arg) || TREE_CODE (arg) == SSA_NAME)
882 simd_if_cond = arg;
883 else
884 gcc_assert (integer_nonzerop (arg));
890 /* Free all levels of MASKS. */
892 void
893 release_vec_loop_masks (vec_loop_masks *masks)
895 rgroup_masks *rgm;
896 unsigned int i;
897 FOR_EACH_VEC_ELT (*masks, i, rgm)
898 rgm->masks.release ();
899 masks->release ();
902 /* Free all memory used by the _loop_vec_info, as well as all the
903 stmt_vec_info structs of all the stmts in the loop. */
905 _loop_vec_info::~_loop_vec_info ()
907 int nbbs;
908 gimple_stmt_iterator si;
909 int j;
911 nbbs = loop->num_nodes;
912 for (j = 0; j < nbbs; j++)
914 basic_block bb = bbs[j];
915 for (si = gsi_start_bb (bb); !gsi_end_p (si); )
917 gimple *stmt = gsi_stmt (si);
919 /* We may have broken canonical form by moving a constant
920 into RHS1 of a commutative op. Fix such occurrences. */
921 if (operands_swapped && is_gimple_assign (stmt))
923 enum tree_code code = gimple_assign_rhs_code (stmt);
925 if ((code == PLUS_EXPR
926 || code == POINTER_PLUS_EXPR
927 || code == MULT_EXPR)
928 && CONSTANT_CLASS_P (gimple_assign_rhs1 (stmt)))
929 swap_ssa_operands (stmt,
930 gimple_assign_rhs1_ptr (stmt),
931 gimple_assign_rhs2_ptr (stmt));
932 else if (code == COND_EXPR
933 && CONSTANT_CLASS_P (gimple_assign_rhs2 (stmt)))
935 tree cond_expr = gimple_assign_rhs1 (stmt);
936 enum tree_code cond_code = TREE_CODE (cond_expr);
938 if (TREE_CODE_CLASS (cond_code) == tcc_comparison)
940 bool honor_nans = HONOR_NANS (TREE_OPERAND (cond_expr,
941 0));
942 cond_code = invert_tree_comparison (cond_code,
943 honor_nans);
944 if (cond_code != ERROR_MARK)
946 TREE_SET_CODE (cond_expr, cond_code);
947 swap_ssa_operands (stmt,
948 gimple_assign_rhs2_ptr (stmt),
949 gimple_assign_rhs3_ptr (stmt));
954 gsi_next (&si);
958 free (bbs);
960 release_vec_loop_masks (&masks);
961 delete ivexpr_map;
962 delete scan_map;
964 loop->aux = NULL;
967 /* Return an invariant or register for EXPR and emit necessary
968 computations in the LOOP_VINFO loop preheader. */
970 tree
971 cse_and_gimplify_to_preheader (loop_vec_info loop_vinfo, tree expr)
973 if (is_gimple_reg (expr)
974 || is_gimple_min_invariant (expr))
975 return expr;
977 if (! loop_vinfo->ivexpr_map)
978 loop_vinfo->ivexpr_map = new hash_map<tree_operand_hash, tree>;
979 tree &cached = loop_vinfo->ivexpr_map->get_or_insert (expr);
980 if (! cached)
982 gimple_seq stmts = NULL;
983 cached = force_gimple_operand (unshare_expr (expr),
984 &stmts, true, NULL_TREE);
985 if (stmts)
987 edge e = loop_preheader_edge (LOOP_VINFO_LOOP (loop_vinfo));
988 gsi_insert_seq_on_edge_immediate (e, stmts);
991 return cached;
994 /* Return true if we can use CMP_TYPE as the comparison type to produce
995 all masks required to mask LOOP_VINFO. */
997 static bool
998 can_produce_all_loop_masks_p (loop_vec_info loop_vinfo, tree cmp_type)
1000 rgroup_masks *rgm;
1001 unsigned int i;
1002 FOR_EACH_VEC_ELT (LOOP_VINFO_MASKS (loop_vinfo), i, rgm)
1003 if (rgm->mask_type != NULL_TREE
1004 && !direct_internal_fn_supported_p (IFN_WHILE_ULT,
1005 cmp_type, rgm->mask_type,
1006 OPTIMIZE_FOR_SPEED))
1007 return false;
1008 return true;
1011 /* Calculate the maximum number of scalars per iteration for every
1012 rgroup in LOOP_VINFO. */
1014 static unsigned int
1015 vect_get_max_nscalars_per_iter (loop_vec_info loop_vinfo)
1017 unsigned int res = 1;
1018 unsigned int i;
1019 rgroup_masks *rgm;
1020 FOR_EACH_VEC_ELT (LOOP_VINFO_MASKS (loop_vinfo), i, rgm)
1021 res = MAX (res, rgm->max_nscalars_per_iter);
1022 return res;
1025 /* Each statement in LOOP_VINFO can be masked where necessary. Check
1026 whether we can actually generate the masks required. Return true if so,
1027 storing the type of the scalar IV in LOOP_VINFO_MASK_COMPARE_TYPE. */
1029 static bool
1030 vect_verify_full_masking (loop_vec_info loop_vinfo)
1032 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1033 unsigned int min_ni_width;
1034 unsigned int max_nscalars_per_iter
1035 = vect_get_max_nscalars_per_iter (loop_vinfo);
1037 /* Use a normal loop if there are no statements that need masking.
1038 This only happens in rare degenerate cases: it means that the loop
1039 has no loads, no stores, and no live-out values. */
1040 if (LOOP_VINFO_MASKS (loop_vinfo).is_empty ())
1041 return false;
1043 /* Get the maximum number of iterations that is representable
1044 in the counter type. */
1045 tree ni_type = TREE_TYPE (LOOP_VINFO_NITERSM1 (loop_vinfo));
1046 widest_int max_ni = wi::to_widest (TYPE_MAX_VALUE (ni_type)) + 1;
1048 /* Get a more refined estimate for the number of iterations. */
1049 widest_int max_back_edges;
1050 if (max_loop_iterations (loop, &max_back_edges))
1051 max_ni = wi::smin (max_ni, max_back_edges + 1);
1053 /* Account for rgroup masks, in which each bit is replicated N times. */
1054 max_ni *= max_nscalars_per_iter;
1056 /* Work out how many bits we need to represent the limit. */
1057 min_ni_width = wi::min_precision (max_ni, UNSIGNED);
1059 /* Find a scalar mode for which WHILE_ULT is supported. */
1060 opt_scalar_int_mode cmp_mode_iter;
1061 tree cmp_type = NULL_TREE;
1062 tree iv_type = NULL_TREE;
1063 widest_int iv_limit = vect_iv_limit_for_full_masking (loop_vinfo);
1064 unsigned int iv_precision = UINT_MAX;
1066 if (iv_limit != -1)
1067 iv_precision = wi::min_precision (iv_limit * max_nscalars_per_iter,
1068 UNSIGNED);
1070 FOR_EACH_MODE_IN_CLASS (cmp_mode_iter, MODE_INT)
1072 unsigned int cmp_bits = GET_MODE_BITSIZE (cmp_mode_iter.require ());
1073 if (cmp_bits >= min_ni_width
1074 && targetm.scalar_mode_supported_p (cmp_mode_iter.require ()))
1076 tree this_type = build_nonstandard_integer_type (cmp_bits, true);
1077 if (this_type
1078 && can_produce_all_loop_masks_p (loop_vinfo, this_type))
1080 /* Although we could stop as soon as we find a valid mode,
1081 there are at least two reasons why that's not always the
1082 best choice:
1084 - An IV that's Pmode or wider is more likely to be reusable
1085 in address calculations than an IV that's narrower than
1086 Pmode.
1088 - Doing the comparison in IV_PRECISION or wider allows
1089 a natural 0-based IV, whereas using a narrower comparison
1090 type requires mitigations against wrap-around.
1092 Conversely, if the IV limit is variable, doing the comparison
1093 in a wider type than the original type can introduce
1094 unnecessary extensions, so picking the widest valid mode
1095 is not always a good choice either.
1097 Here we prefer the first IV type that's Pmode or wider,
1098 and the first comparison type that's IV_PRECISION or wider.
1099 (The comparison type must be no wider than the IV type,
1100 to avoid extensions in the vector loop.)
1102 ??? We might want to try continuing beyond Pmode for ILP32
1103 targets if CMP_BITS < IV_PRECISION. */
1104 iv_type = this_type;
1105 if (!cmp_type || iv_precision > TYPE_PRECISION (cmp_type))
1106 cmp_type = this_type;
1107 if (cmp_bits >= GET_MODE_BITSIZE (Pmode))
1108 break;
1113 if (!cmp_type)
1114 return false;
1116 LOOP_VINFO_MASK_COMPARE_TYPE (loop_vinfo) = cmp_type;
1117 LOOP_VINFO_MASK_IV_TYPE (loop_vinfo) = iv_type;
1118 return true;
1121 /* Calculate the cost of one scalar iteration of the loop. */
1122 static void
1123 vect_compute_single_scalar_iteration_cost (loop_vec_info loop_vinfo)
1125 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1126 basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
1127 int nbbs = loop->num_nodes, factor;
1128 int innerloop_iters, i;
1130 DUMP_VECT_SCOPE ("vect_compute_single_scalar_iteration_cost");
1132 /* Gather costs for statements in the scalar loop. */
1134 /* FORNOW. */
1135 innerloop_iters = 1;
1136 if (loop->inner)
1137 innerloop_iters = 50; /* FIXME */
1139 for (i = 0; i < nbbs; i++)
1141 gimple_stmt_iterator si;
1142 basic_block bb = bbs[i];
1144 if (bb->loop_father == loop->inner)
1145 factor = innerloop_iters;
1146 else
1147 factor = 1;
1149 for (si = gsi_start_bb (bb); !gsi_end_p (si); gsi_next (&si))
1151 gimple *stmt = gsi_stmt (si);
1152 stmt_vec_info stmt_info = loop_vinfo->lookup_stmt (stmt);
1154 if (!is_gimple_assign (stmt) && !is_gimple_call (stmt))
1155 continue;
1157 /* Skip stmts that are not vectorized inside the loop. */
1158 stmt_vec_info vstmt_info = vect_stmt_to_vectorize (stmt_info);
1159 if (!STMT_VINFO_RELEVANT_P (vstmt_info)
1160 && (!STMT_VINFO_LIVE_P (vstmt_info)
1161 || !VECTORIZABLE_CYCLE_DEF
1162 (STMT_VINFO_DEF_TYPE (vstmt_info))))
1163 continue;
1165 vect_cost_for_stmt kind;
1166 if (STMT_VINFO_DATA_REF (stmt_info))
1168 if (DR_IS_READ (STMT_VINFO_DATA_REF (stmt_info)))
1169 kind = scalar_load;
1170 else
1171 kind = scalar_store;
1173 else
1174 kind = scalar_stmt;
1176 record_stmt_cost (&LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo),
1177 factor, kind, stmt_info, 0, vect_prologue);
1181 /* Now accumulate cost. */
1182 void *target_cost_data = init_cost (loop);
1183 stmt_info_for_cost *si;
1184 int j;
1185 FOR_EACH_VEC_ELT (LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo),
1186 j, si)
1187 (void) add_stmt_cost (target_cost_data, si->count,
1188 si->kind, si->stmt_info, si->misalign,
1189 vect_body);
1190 unsigned dummy, body_cost = 0;
1191 finish_cost (target_cost_data, &dummy, &body_cost, &dummy);
1192 destroy_cost_data (target_cost_data);
1193 LOOP_VINFO_SINGLE_SCALAR_ITERATION_COST (loop_vinfo) = body_cost;
1197 /* Function vect_analyze_loop_form_1.
1199 Verify that certain CFG restrictions hold, including:
1200 - the loop has a pre-header
1201 - the loop has a single entry and exit
1202 - the loop exit condition is simple enough
1203 - the number of iterations can be analyzed, i.e, a countable loop. The
1204 niter could be analyzed under some assumptions. */
1206 opt_result
1207 vect_analyze_loop_form_1 (struct loop *loop, gcond **loop_cond,
1208 tree *assumptions, tree *number_of_iterationsm1,
1209 tree *number_of_iterations, gcond **inner_loop_cond)
1211 DUMP_VECT_SCOPE ("vect_analyze_loop_form");
1213 /* Different restrictions apply when we are considering an inner-most loop,
1214 vs. an outer (nested) loop.
1215 (FORNOW. May want to relax some of these restrictions in the future). */
1217 if (!loop->inner)
1219 /* Inner-most loop. We currently require that the number of BBs is
1220 exactly 2 (the header and latch). Vectorizable inner-most loops
1221 look like this:
1223 (pre-header)
1225 header <--------+
1226 | | |
1227 | +--> latch --+
1229 (exit-bb) */
1231 if (loop->num_nodes != 2)
1232 return opt_result::failure_at (vect_location,
1233 "not vectorized:"
1234 " control flow in loop.\n");
1236 if (empty_block_p (loop->header))
1237 return opt_result::failure_at (vect_location,
1238 "not vectorized: empty loop.\n");
1240 else
1242 struct loop *innerloop = loop->inner;
1243 edge entryedge;
1245 /* Nested loop. We currently require that the loop is doubly-nested,
1246 contains a single inner loop, and the number of BBs is exactly 5.
1247 Vectorizable outer-loops look like this:
1249 (pre-header)
1251 header <---+
1253 inner-loop |
1255 tail ------+
1257 (exit-bb)
1259 The inner-loop has the properties expected of inner-most loops
1260 as described above. */
1262 if ((loop->inner)->inner || (loop->inner)->next)
1263 return opt_result::failure_at (vect_location,
1264 "not vectorized:"
1265 " multiple nested loops.\n");
1267 if (loop->num_nodes != 5)
1268 return opt_result::failure_at (vect_location,
1269 "not vectorized:"
1270 " control flow in loop.\n");
1272 entryedge = loop_preheader_edge (innerloop);
1273 if (entryedge->src != loop->header
1274 || !single_exit (innerloop)
1275 || single_exit (innerloop)->dest != EDGE_PRED (loop->latch, 0)->src)
1276 return opt_result::failure_at (vect_location,
1277 "not vectorized:"
1278 " unsupported outerloop form.\n");
1280 /* Analyze the inner-loop. */
1281 tree inner_niterm1, inner_niter, inner_assumptions;
1282 opt_result res
1283 = vect_analyze_loop_form_1 (loop->inner, inner_loop_cond,
1284 &inner_assumptions, &inner_niterm1,
1285 &inner_niter, NULL);
1286 if (!res)
1288 if (dump_enabled_p ())
1289 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1290 "not vectorized: Bad inner loop.\n");
1291 return res;
1294 /* Don't support analyzing niter under assumptions for inner
1295 loop. */
1296 if (!integer_onep (inner_assumptions))
1297 return opt_result::failure_at (vect_location,
1298 "not vectorized: Bad inner loop.\n");
1300 if (!expr_invariant_in_loop_p (loop, inner_niter))
1301 return opt_result::failure_at (vect_location,
1302 "not vectorized: inner-loop count not"
1303 " invariant.\n");
1305 if (dump_enabled_p ())
1306 dump_printf_loc (MSG_NOTE, vect_location,
1307 "Considering outer-loop vectorization.\n");
1310 if (!single_exit (loop))
1311 return opt_result::failure_at (vect_location,
1312 "not vectorized: multiple exits.\n");
1313 if (EDGE_COUNT (loop->header->preds) != 2)
1314 return opt_result::failure_at (vect_location,
1315 "not vectorized:"
1316 " too many incoming edges.\n");
1318 /* We assume that the loop exit condition is at the end of the loop. i.e,
1319 that the loop is represented as a do-while (with a proper if-guard
1320 before the loop if needed), where the loop header contains all the
1321 executable statements, and the latch is empty. */
1322 if (!empty_block_p (loop->latch)
1323 || !gimple_seq_empty_p (phi_nodes (loop->latch)))
1324 return opt_result::failure_at (vect_location,
1325 "not vectorized: latch block not empty.\n");
1327 /* Make sure the exit is not abnormal. */
1328 edge e = single_exit (loop);
1329 if (e->flags & EDGE_ABNORMAL)
1330 return opt_result::failure_at (vect_location,
1331 "not vectorized:"
1332 " abnormal loop exit edge.\n");
1334 *loop_cond = vect_get_loop_niters (loop, assumptions, number_of_iterations,
1335 number_of_iterationsm1);
1336 if (!*loop_cond)
1337 return opt_result::failure_at
1338 (vect_location,
1339 "not vectorized: complicated exit condition.\n");
1341 if (integer_zerop (*assumptions)
1342 || !*number_of_iterations
1343 || chrec_contains_undetermined (*number_of_iterations))
1344 return opt_result::failure_at
1345 (*loop_cond,
1346 "not vectorized: number of iterations cannot be computed.\n");
1348 if (integer_zerop (*number_of_iterations))
1349 return opt_result::failure_at
1350 (*loop_cond,
1351 "not vectorized: number of iterations = 0.\n");
1353 return opt_result::success ();
1356 /* Analyze LOOP form and return a loop_vec_info if it is of suitable form. */
1358 opt_loop_vec_info
1359 vect_analyze_loop_form (struct loop *loop, vec_info_shared *shared)
1361 tree assumptions, number_of_iterations, number_of_iterationsm1;
1362 gcond *loop_cond, *inner_loop_cond = NULL;
1364 opt_result res
1365 = vect_analyze_loop_form_1 (loop, &loop_cond,
1366 &assumptions, &number_of_iterationsm1,
1367 &number_of_iterations, &inner_loop_cond);
1368 if (!res)
1369 return opt_loop_vec_info::propagate_failure (res);
1371 loop_vec_info loop_vinfo = new _loop_vec_info (loop, shared);
1372 LOOP_VINFO_NITERSM1 (loop_vinfo) = number_of_iterationsm1;
1373 LOOP_VINFO_NITERS (loop_vinfo) = number_of_iterations;
1374 LOOP_VINFO_NITERS_UNCHANGED (loop_vinfo) = number_of_iterations;
1375 if (!integer_onep (assumptions))
1377 /* We consider to vectorize this loop by versioning it under
1378 some assumptions. In order to do this, we need to clear
1379 existing information computed by scev and niter analyzer. */
1380 scev_reset_htab ();
1381 free_numbers_of_iterations_estimates (loop);
1382 /* Also set flag for this loop so that following scev and niter
1383 analysis are done under the assumptions. */
1384 loop_constraint_set (loop, LOOP_C_FINITE);
1385 /* Also record the assumptions for versioning. */
1386 LOOP_VINFO_NITERS_ASSUMPTIONS (loop_vinfo) = assumptions;
1389 if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
1391 if (dump_enabled_p ())
1393 dump_printf_loc (MSG_NOTE, vect_location,
1394 "Symbolic number of iterations is ");
1395 dump_generic_expr (MSG_NOTE, TDF_DETAILS, number_of_iterations);
1396 dump_printf (MSG_NOTE, "\n");
1400 stmt_vec_info loop_cond_info = loop_vinfo->lookup_stmt (loop_cond);
1401 STMT_VINFO_TYPE (loop_cond_info) = loop_exit_ctrl_vec_info_type;
1402 if (inner_loop_cond)
1404 stmt_vec_info inner_loop_cond_info
1405 = loop_vinfo->lookup_stmt (inner_loop_cond);
1406 STMT_VINFO_TYPE (inner_loop_cond_info) = loop_exit_ctrl_vec_info_type;
1409 gcc_assert (!loop->aux);
1410 loop->aux = loop_vinfo;
1411 return opt_loop_vec_info::success (loop_vinfo);
1416 /* Scan the loop stmts and dependent on whether there are any (non-)SLP
1417 statements update the vectorization factor. */
1419 static void
1420 vect_update_vf_for_slp (loop_vec_info loop_vinfo)
1422 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1423 basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
1424 int nbbs = loop->num_nodes;
1425 poly_uint64 vectorization_factor;
1426 int i;
1428 DUMP_VECT_SCOPE ("vect_update_vf_for_slp");
1430 vectorization_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
1431 gcc_assert (known_ne (vectorization_factor, 0U));
1433 /* If all the stmts in the loop can be SLPed, we perform only SLP, and
1434 vectorization factor of the loop is the unrolling factor required by
1435 the SLP instances. If that unrolling factor is 1, we say, that we
1436 perform pure SLP on loop - cross iteration parallelism is not
1437 exploited. */
1438 bool only_slp_in_loop = true;
1439 for (i = 0; i < nbbs; i++)
1441 basic_block bb = bbs[i];
1442 for (gimple_stmt_iterator si = gsi_start_bb (bb); !gsi_end_p (si);
1443 gsi_next (&si))
1445 stmt_vec_info stmt_info = loop_vinfo->lookup_stmt (gsi_stmt (si));
1446 stmt_info = vect_stmt_to_vectorize (stmt_info);
1447 if ((STMT_VINFO_RELEVANT_P (stmt_info)
1448 || VECTORIZABLE_CYCLE_DEF (STMT_VINFO_DEF_TYPE (stmt_info)))
1449 && !PURE_SLP_STMT (stmt_info))
1450 /* STMT needs both SLP and loop-based vectorization. */
1451 only_slp_in_loop = false;
1455 if (only_slp_in_loop)
1457 if (dump_enabled_p ())
1458 dump_printf_loc (MSG_NOTE, vect_location,
1459 "Loop contains only SLP stmts\n");
1460 vectorization_factor = LOOP_VINFO_SLP_UNROLLING_FACTOR (loop_vinfo);
1462 else
1464 if (dump_enabled_p ())
1465 dump_printf_loc (MSG_NOTE, vect_location,
1466 "Loop contains SLP and non-SLP stmts\n");
1467 /* Both the vectorization factor and unroll factor have the form
1468 current_vector_size * X for some rational X, so they must have
1469 a common multiple. */
1470 vectorization_factor
1471 = force_common_multiple (vectorization_factor,
1472 LOOP_VINFO_SLP_UNROLLING_FACTOR (loop_vinfo));
1475 LOOP_VINFO_VECT_FACTOR (loop_vinfo) = vectorization_factor;
1476 if (dump_enabled_p ())
1478 dump_printf_loc (MSG_NOTE, vect_location,
1479 "Updating vectorization factor to ");
1480 dump_dec (MSG_NOTE, vectorization_factor);
1481 dump_printf (MSG_NOTE, ".\n");
1485 /* Return true if STMT_INFO describes a double reduction phi and if
1486 the other phi in the reduction is also relevant for vectorization.
1487 This rejects cases such as:
1489 outer1:
1490 x_1 = PHI <x_3(outer2), ...>;
1493 inner:
1494 x_2 = ...;
1497 outer2:
1498 x_3 = PHI <x_2(inner)>;
1500 if nothing in x_2 or elsewhere makes x_1 relevant. */
1502 static bool
1503 vect_active_double_reduction_p (stmt_vec_info stmt_info)
1505 if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_double_reduction_def)
1506 return false;
1508 return STMT_VINFO_RELEVANT_P (STMT_VINFO_REDUC_DEF (stmt_info));
1511 /* Function vect_analyze_loop_operations.
1513 Scan the loop stmts and make sure they are all vectorizable. */
1515 static opt_result
1516 vect_analyze_loop_operations (loop_vec_info loop_vinfo)
1518 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1519 basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
1520 int nbbs = loop->num_nodes;
1521 int i;
1522 stmt_vec_info stmt_info;
1523 bool need_to_vectorize = false;
1524 bool ok;
1526 DUMP_VECT_SCOPE ("vect_analyze_loop_operations");
1528 auto_vec<stmt_info_for_cost> cost_vec;
1530 for (i = 0; i < nbbs; i++)
1532 basic_block bb = bbs[i];
1534 for (gphi_iterator si = gsi_start_phis (bb); !gsi_end_p (si);
1535 gsi_next (&si))
1537 gphi *phi = si.phi ();
1538 ok = true;
1540 stmt_info = loop_vinfo->lookup_stmt (phi);
1541 if (dump_enabled_p ())
1542 dump_printf_loc (MSG_NOTE, vect_location, "examining phi: %G", phi);
1543 if (virtual_operand_p (gimple_phi_result (phi)))
1544 continue;
1546 /* Inner-loop loop-closed exit phi in outer-loop vectorization
1547 (i.e., a phi in the tail of the outer-loop). */
1548 if (! is_loop_header_bb_p (bb))
1550 /* FORNOW: we currently don't support the case that these phis
1551 are not used in the outerloop (unless it is double reduction,
1552 i.e., this phi is vect_reduction_def), cause this case
1553 requires to actually do something here. */
1554 if (STMT_VINFO_LIVE_P (stmt_info)
1555 && !vect_active_double_reduction_p (stmt_info))
1556 return opt_result::failure_at (phi,
1557 "Unsupported loop-closed phi"
1558 " in outer-loop.\n");
1560 /* If PHI is used in the outer loop, we check that its operand
1561 is defined in the inner loop. */
1562 if (STMT_VINFO_RELEVANT_P (stmt_info))
1564 tree phi_op;
1566 if (gimple_phi_num_args (phi) != 1)
1567 return opt_result::failure_at (phi, "unsupported phi");
1569 phi_op = PHI_ARG_DEF (phi, 0);
1570 stmt_vec_info op_def_info = loop_vinfo->lookup_def (phi_op);
1571 if (!op_def_info)
1572 return opt_result::failure_at (phi, "unsupported phi");
1574 if (STMT_VINFO_RELEVANT (op_def_info) != vect_used_in_outer
1575 && (STMT_VINFO_RELEVANT (op_def_info)
1576 != vect_used_in_outer_by_reduction))
1577 return opt_result::failure_at (phi, "unsupported phi");
1580 continue;
1583 gcc_assert (stmt_info);
1585 if ((STMT_VINFO_RELEVANT (stmt_info) == vect_used_in_scope
1586 || STMT_VINFO_LIVE_P (stmt_info))
1587 && STMT_VINFO_DEF_TYPE (stmt_info) != vect_induction_def)
1588 /* A scalar-dependence cycle that we don't support. */
1589 return opt_result::failure_at (phi,
1590 "not vectorized:"
1591 " scalar dependence cycle.\n");
1593 if (STMT_VINFO_RELEVANT_P (stmt_info))
1595 need_to_vectorize = true;
1596 if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_induction_def
1597 && ! PURE_SLP_STMT (stmt_info))
1598 ok = vectorizable_induction (stmt_info, NULL, NULL, NULL,
1599 &cost_vec);
1600 else if ((STMT_VINFO_DEF_TYPE (stmt_info) == vect_reduction_def
1601 || STMT_VINFO_DEF_TYPE (stmt_info) == vect_nested_cycle)
1602 && ! PURE_SLP_STMT (stmt_info))
1603 ok = vectorizable_reduction (stmt_info, NULL, NULL, NULL, NULL,
1604 &cost_vec);
1607 /* SLP PHIs are tested by vect_slp_analyze_node_operations. */
1608 if (ok
1609 && STMT_VINFO_LIVE_P (stmt_info)
1610 && !PURE_SLP_STMT (stmt_info))
1611 ok = vectorizable_live_operation (stmt_info, NULL, NULL, -1, NULL,
1612 &cost_vec);
1614 if (!ok)
1615 return opt_result::failure_at (phi,
1616 "not vectorized: relevant phi not "
1617 "supported: %G",
1618 static_cast <gimple *> (phi));
1621 for (gimple_stmt_iterator si = gsi_start_bb (bb); !gsi_end_p (si);
1622 gsi_next (&si))
1624 gimple *stmt = gsi_stmt (si);
1625 if (!gimple_clobber_p (stmt))
1627 opt_result res
1628 = vect_analyze_stmt (loop_vinfo->lookup_stmt (stmt),
1629 &need_to_vectorize,
1630 NULL, NULL, &cost_vec);
1631 if (!res)
1632 return res;
1635 } /* bbs */
1637 add_stmt_costs (loop_vinfo->target_cost_data, &cost_vec);
1639 /* All operations in the loop are either irrelevant (deal with loop
1640 control, or dead), or only used outside the loop and can be moved
1641 out of the loop (e.g. invariants, inductions). The loop can be
1642 optimized away by scalar optimizations. We're better off not
1643 touching this loop. */
1644 if (!need_to_vectorize)
1646 if (dump_enabled_p ())
1647 dump_printf_loc (MSG_NOTE, vect_location,
1648 "All the computation can be taken out of the loop.\n");
1649 return opt_result::failure_at
1650 (vect_location,
1651 "not vectorized: redundant loop. no profit to vectorize.\n");
1654 return opt_result::success ();
1657 /* Analyze the cost of the loop described by LOOP_VINFO. Decide if it
1658 is worthwhile to vectorize. Return 1 if definitely yes, 0 if
1659 definitely no, or -1 if it's worth retrying. */
1661 static int
1662 vect_analyze_loop_costing (loop_vec_info loop_vinfo)
1664 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1665 unsigned int assumed_vf = vect_vf_for_cost (loop_vinfo);
1667 /* Only fully-masked loops can have iteration counts less than the
1668 vectorization factor. */
1669 if (!LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
1671 HOST_WIDE_INT max_niter;
1673 if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
1674 max_niter = LOOP_VINFO_INT_NITERS (loop_vinfo);
1675 else
1676 max_niter = max_stmt_executions_int (loop);
1678 if (max_niter != -1
1679 && (unsigned HOST_WIDE_INT) max_niter < assumed_vf)
1681 if (dump_enabled_p ())
1682 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1683 "not vectorized: iteration count smaller than "
1684 "vectorization factor.\n");
1685 return 0;
1689 int min_profitable_iters, min_profitable_estimate;
1690 vect_estimate_min_profitable_iters (loop_vinfo, &min_profitable_iters,
1691 &min_profitable_estimate);
1693 if (min_profitable_iters < 0)
1695 if (dump_enabled_p ())
1696 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1697 "not vectorized: vectorization not profitable.\n");
1698 if (dump_enabled_p ())
1699 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1700 "not vectorized: vector version will never be "
1701 "profitable.\n");
1702 return -1;
1705 int min_scalar_loop_bound = (PARAM_VALUE (PARAM_MIN_VECT_LOOP_BOUND)
1706 * assumed_vf);
1708 /* Use the cost model only if it is more conservative than user specified
1709 threshold. */
1710 unsigned int th = (unsigned) MAX (min_scalar_loop_bound,
1711 min_profitable_iters);
1713 LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo) = th;
1715 if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
1716 && LOOP_VINFO_INT_NITERS (loop_vinfo) < th)
1718 if (dump_enabled_p ())
1719 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1720 "not vectorized: vectorization not profitable.\n");
1721 if (dump_enabled_p ())
1722 dump_printf_loc (MSG_NOTE, vect_location,
1723 "not vectorized: iteration count smaller than user "
1724 "specified loop bound parameter or minimum profitable "
1725 "iterations (whichever is more conservative).\n");
1726 return 0;
1729 HOST_WIDE_INT estimated_niter = estimated_stmt_executions_int (loop);
1730 if (estimated_niter == -1)
1731 estimated_niter = likely_max_stmt_executions_int (loop);
1732 if (estimated_niter != -1
1733 && ((unsigned HOST_WIDE_INT) estimated_niter
1734 < MAX (th, (unsigned) min_profitable_estimate)))
1736 if (dump_enabled_p ())
1737 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1738 "not vectorized: estimated iteration count too "
1739 "small.\n");
1740 if (dump_enabled_p ())
1741 dump_printf_loc (MSG_NOTE, vect_location,
1742 "not vectorized: estimated iteration count smaller "
1743 "than specified loop bound parameter or minimum "
1744 "profitable iterations (whichever is more "
1745 "conservative).\n");
1746 return -1;
1749 return 1;
1752 static opt_result
1753 vect_get_datarefs_in_loop (loop_p loop, basic_block *bbs,
1754 vec<data_reference_p> *datarefs,
1755 unsigned int *n_stmts)
1757 *n_stmts = 0;
1758 for (unsigned i = 0; i < loop->num_nodes; i++)
1759 for (gimple_stmt_iterator gsi = gsi_start_bb (bbs[i]);
1760 !gsi_end_p (gsi); gsi_next (&gsi))
1762 gimple *stmt = gsi_stmt (gsi);
1763 if (is_gimple_debug (stmt))
1764 continue;
1765 ++(*n_stmts);
1766 opt_result res = vect_find_stmt_data_reference (loop, stmt, datarefs);
1767 if (!res)
1769 if (is_gimple_call (stmt) && loop->safelen)
1771 tree fndecl = gimple_call_fndecl (stmt), op;
1772 if (fndecl != NULL_TREE)
1774 cgraph_node *node = cgraph_node::get (fndecl);
1775 if (node != NULL && node->simd_clones != NULL)
1777 unsigned int j, n = gimple_call_num_args (stmt);
1778 for (j = 0; j < n; j++)
1780 op = gimple_call_arg (stmt, j);
1781 if (DECL_P (op)
1782 || (REFERENCE_CLASS_P (op)
1783 && get_base_address (op)))
1784 break;
1786 op = gimple_call_lhs (stmt);
1787 /* Ignore #pragma omp declare simd functions
1788 if they don't have data references in the
1789 call stmt itself. */
1790 if (j == n
1791 && !(op
1792 && (DECL_P (op)
1793 || (REFERENCE_CLASS_P (op)
1794 && get_base_address (op)))))
1795 continue;
1799 return res;
1801 /* If dependence analysis will give up due to the limit on the
1802 number of datarefs stop here and fail fatally. */
1803 if (datarefs->length ()
1804 > (unsigned)PARAM_VALUE (PARAM_LOOP_MAX_DATAREFS_FOR_DATADEPS))
1805 return opt_result::failure_at (stmt, "exceeded param "
1806 "loop-max-datarefs-for-datadeps\n");
1808 return opt_result::success ();
1811 /* Look for SLP-only access groups and turn each individual access into its own
1812 group. */
1813 static void
1814 vect_dissolve_slp_only_groups (loop_vec_info loop_vinfo)
1816 unsigned int i;
1817 struct data_reference *dr;
1819 DUMP_VECT_SCOPE ("vect_dissolve_slp_only_groups");
1821 vec<data_reference_p> datarefs = loop_vinfo->shared->datarefs;
1822 FOR_EACH_VEC_ELT (datarefs, i, dr)
1824 gcc_assert (DR_REF (dr));
1825 stmt_vec_info stmt_info = loop_vinfo->lookup_stmt (DR_STMT (dr));
1827 /* Check if the load is a part of an interleaving chain. */
1828 if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
1830 stmt_vec_info first_element = DR_GROUP_FIRST_ELEMENT (stmt_info);
1831 unsigned int group_size = DR_GROUP_SIZE (first_element);
1833 /* Check if SLP-only groups. */
1834 if (!STMT_SLP_TYPE (stmt_info)
1835 && STMT_VINFO_SLP_VECT_ONLY (first_element))
1837 /* Dissolve the group. */
1838 STMT_VINFO_SLP_VECT_ONLY (first_element) = false;
1840 stmt_vec_info vinfo = first_element;
1841 while (vinfo)
1843 stmt_vec_info next = DR_GROUP_NEXT_ELEMENT (vinfo);
1844 DR_GROUP_FIRST_ELEMENT (vinfo) = vinfo;
1845 DR_GROUP_NEXT_ELEMENT (vinfo) = NULL;
1846 DR_GROUP_SIZE (vinfo) = 1;
1847 DR_GROUP_GAP (vinfo) = group_size - 1;
1848 vinfo = next;
1855 /* Function vect_analyze_loop_2.
1857 Apply a set of analyses on LOOP, and create a loop_vec_info struct
1858 for it. The different analyses will record information in the
1859 loop_vec_info struct. */
1860 static opt_result
1861 vect_analyze_loop_2 (loop_vec_info loop_vinfo, bool &fatal, unsigned *n_stmts)
1863 opt_result ok = opt_result::success ();
1864 int res;
1865 unsigned int max_vf = MAX_VECTORIZATION_FACTOR;
1866 poly_uint64 min_vf = 2;
1868 /* The first group of checks is independent of the vector size. */
1869 fatal = true;
1871 if (LOOP_VINFO_SIMD_IF_COND (loop_vinfo)
1872 && integer_zerop (LOOP_VINFO_SIMD_IF_COND (loop_vinfo)))
1873 return opt_result::failure_at (vect_location,
1874 "not vectorized: simd if(0)\n");
1876 /* Find all data references in the loop (which correspond to vdefs/vuses)
1877 and analyze their evolution in the loop. */
1879 loop_p loop = LOOP_VINFO_LOOP (loop_vinfo);
1881 /* Gather the data references and count stmts in the loop. */
1882 if (!LOOP_VINFO_DATAREFS (loop_vinfo).exists ())
1884 opt_result res
1885 = vect_get_datarefs_in_loop (loop, LOOP_VINFO_BBS (loop_vinfo),
1886 &LOOP_VINFO_DATAREFS (loop_vinfo),
1887 n_stmts);
1888 if (!res)
1890 if (dump_enabled_p ())
1891 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1892 "not vectorized: loop contains function "
1893 "calls or data references that cannot "
1894 "be analyzed\n");
1895 return res;
1897 loop_vinfo->shared->save_datarefs ();
1899 else
1900 loop_vinfo->shared->check_datarefs ();
1902 /* Analyze the data references and also adjust the minimal
1903 vectorization factor according to the loads and stores. */
1905 ok = vect_analyze_data_refs (loop_vinfo, &min_vf, &fatal);
1906 if (!ok)
1908 if (dump_enabled_p ())
1909 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1910 "bad data references.\n");
1911 return ok;
1914 /* Classify all cross-iteration scalar data-flow cycles.
1915 Cross-iteration cycles caused by virtual phis are analyzed separately. */
1916 vect_analyze_scalar_cycles (loop_vinfo);
1918 vect_pattern_recog (loop_vinfo);
1920 vect_fixup_scalar_cycles_with_patterns (loop_vinfo);
1922 /* Analyze the access patterns of the data-refs in the loop (consecutive,
1923 complex, etc.). FORNOW: Only handle consecutive access pattern. */
1925 ok = vect_analyze_data_ref_accesses (loop_vinfo);
1926 if (!ok)
1928 if (dump_enabled_p ())
1929 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1930 "bad data access.\n");
1931 return ok;
1934 /* Data-flow analysis to detect stmts that do not need to be vectorized. */
1936 ok = vect_mark_stmts_to_be_vectorized (loop_vinfo, &fatal);
1937 if (!ok)
1939 if (dump_enabled_p ())
1940 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1941 "unexpected pattern.\n");
1942 return ok;
1945 /* While the rest of the analysis below depends on it in some way. */
1946 fatal = false;
1948 /* Analyze data dependences between the data-refs in the loop
1949 and adjust the maximum vectorization factor according to
1950 the dependences.
1951 FORNOW: fail at the first data dependence that we encounter. */
1953 ok = vect_analyze_data_ref_dependences (loop_vinfo, &max_vf);
1954 if (!ok)
1956 if (dump_enabled_p ())
1957 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1958 "bad data dependence.\n");
1959 return ok;
1961 if (max_vf != MAX_VECTORIZATION_FACTOR
1962 && maybe_lt (max_vf, min_vf))
1963 return opt_result::failure_at (vect_location, "bad data dependence.\n");
1964 LOOP_VINFO_MAX_VECT_FACTOR (loop_vinfo) = max_vf;
1966 ok = vect_determine_vectorization_factor (loop_vinfo);
1967 if (!ok)
1969 if (dump_enabled_p ())
1970 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1971 "can't determine vectorization factor.\n");
1972 return ok;
1974 if (max_vf != MAX_VECTORIZATION_FACTOR
1975 && maybe_lt (max_vf, LOOP_VINFO_VECT_FACTOR (loop_vinfo)))
1976 return opt_result::failure_at (vect_location, "bad data dependence.\n");
1978 /* Compute the scalar iteration cost. */
1979 vect_compute_single_scalar_iteration_cost (loop_vinfo);
1981 poly_uint64 saved_vectorization_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
1982 unsigned th;
1984 /* Check the SLP opportunities in the loop, analyze and build SLP trees. */
1985 ok = vect_analyze_slp (loop_vinfo, *n_stmts);
1986 if (!ok)
1987 return ok;
1989 /* If there are any SLP instances mark them as pure_slp. */
1990 bool slp = vect_make_slp_decision (loop_vinfo);
1991 if (slp)
1993 /* Find stmts that need to be both vectorized and SLPed. */
1994 vect_detect_hybrid_slp (loop_vinfo);
1996 /* Update the vectorization factor based on the SLP decision. */
1997 vect_update_vf_for_slp (loop_vinfo);
2000 bool saved_can_fully_mask_p = LOOP_VINFO_CAN_FULLY_MASK_P (loop_vinfo);
2002 /* We don't expect to have to roll back to anything other than an empty
2003 set of rgroups. */
2004 gcc_assert (LOOP_VINFO_MASKS (loop_vinfo).is_empty ());
2006 /* This is the point where we can re-start analysis with SLP forced off. */
2007 start_over:
2009 /* Now the vectorization factor is final. */
2010 poly_uint64 vectorization_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
2011 gcc_assert (known_ne (vectorization_factor, 0U));
2013 if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo) && dump_enabled_p ())
2015 dump_printf_loc (MSG_NOTE, vect_location,
2016 "vectorization_factor = ");
2017 dump_dec (MSG_NOTE, vectorization_factor);
2018 dump_printf (MSG_NOTE, ", niters = %wd\n",
2019 LOOP_VINFO_INT_NITERS (loop_vinfo));
2022 HOST_WIDE_INT max_niter
2023 = likely_max_stmt_executions_int (LOOP_VINFO_LOOP (loop_vinfo));
2025 /* Analyze the alignment of the data-refs in the loop.
2026 Fail if a data reference is found that cannot be vectorized. */
2028 ok = vect_analyze_data_refs_alignment (loop_vinfo);
2029 if (!ok)
2031 if (dump_enabled_p ())
2032 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2033 "bad data alignment.\n");
2034 return ok;
2037 /* Prune the list of ddrs to be tested at run-time by versioning for alias.
2038 It is important to call pruning after vect_analyze_data_ref_accesses,
2039 since we use grouping information gathered by interleaving analysis. */
2040 ok = vect_prune_runtime_alias_test_list (loop_vinfo);
2041 if (!ok)
2042 return ok;
2044 /* Do not invoke vect_enhance_data_refs_alignment for epilogue
2045 vectorization, since we do not want to add extra peeling or
2046 add versioning for alignment. */
2047 if (!LOOP_VINFO_EPILOGUE_P (loop_vinfo))
2048 /* This pass will decide on using loop versioning and/or loop peeling in
2049 order to enhance the alignment of data references in the loop. */
2050 ok = vect_enhance_data_refs_alignment (loop_vinfo);
2051 else
2052 ok = vect_verify_datarefs_alignment (loop_vinfo);
2053 if (!ok)
2054 return ok;
2056 if (slp)
2058 /* Analyze operations in the SLP instances. Note this may
2059 remove unsupported SLP instances which makes the above
2060 SLP kind detection invalid. */
2061 unsigned old_size = LOOP_VINFO_SLP_INSTANCES (loop_vinfo).length ();
2062 vect_slp_analyze_operations (loop_vinfo);
2063 if (LOOP_VINFO_SLP_INSTANCES (loop_vinfo).length () != old_size)
2065 ok = opt_result::failure_at (vect_location,
2066 "unsupported SLP instances\n");
2067 goto again;
2071 /* Dissolve SLP-only groups. */
2072 vect_dissolve_slp_only_groups (loop_vinfo);
2074 /* Scan all the remaining operations in the loop that are not subject
2075 to SLP and make sure they are vectorizable. */
2076 ok = vect_analyze_loop_operations (loop_vinfo);
2077 if (!ok)
2079 if (dump_enabled_p ())
2080 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2081 "bad operation or unsupported loop bound.\n");
2082 return ok;
2085 /* Decide whether to use a fully-masked loop for this vectorization
2086 factor. */
2087 LOOP_VINFO_FULLY_MASKED_P (loop_vinfo)
2088 = (LOOP_VINFO_CAN_FULLY_MASK_P (loop_vinfo)
2089 && vect_verify_full_masking (loop_vinfo));
2090 if (dump_enabled_p ())
2092 if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
2093 dump_printf_loc (MSG_NOTE, vect_location,
2094 "using a fully-masked loop.\n");
2095 else
2096 dump_printf_loc (MSG_NOTE, vect_location,
2097 "not using a fully-masked loop.\n");
2100 /* If epilog loop is required because of data accesses with gaps,
2101 one additional iteration needs to be peeled. Check if there is
2102 enough iterations for vectorization. */
2103 if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo)
2104 && LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
2105 && !LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
2107 poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
2108 tree scalar_niters = LOOP_VINFO_NITERSM1 (loop_vinfo);
2110 if (known_lt (wi::to_widest (scalar_niters), vf))
2111 return opt_result::failure_at (vect_location,
2112 "loop has no enough iterations to"
2113 " support peeling for gaps.\n");
2116 /* Check the costings of the loop make vectorizing worthwhile. */
2117 res = vect_analyze_loop_costing (loop_vinfo);
2118 if (res < 0)
2120 ok = opt_result::failure_at (vect_location,
2121 "Loop costings may not be worthwhile.\n");
2122 goto again;
2124 if (!res)
2125 return opt_result::failure_at (vect_location,
2126 "Loop costings not worthwhile.\n");
2128 /* Decide whether we need to create an epilogue loop to handle
2129 remaining scalar iterations. */
2130 th = LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo);
2132 unsigned HOST_WIDE_INT const_vf;
2133 if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
2134 /* The main loop handles all iterations. */
2135 LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo) = false;
2136 else if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
2137 && LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) >= 0)
2139 /* Work out the (constant) number of iterations that need to be
2140 peeled for reasons other than niters. */
2141 unsigned int peel_niter = LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo);
2142 if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo))
2143 peel_niter += 1;
2144 if (!multiple_p (LOOP_VINFO_INT_NITERS (loop_vinfo) - peel_niter,
2145 LOOP_VINFO_VECT_FACTOR (loop_vinfo)))
2146 LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo) = true;
2148 else if (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo)
2149 /* ??? When peeling for gaps but not alignment, we could
2150 try to check whether the (variable) niters is known to be
2151 VF * N + 1. That's something of a niche case though. */
2152 || LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo)
2153 || !LOOP_VINFO_VECT_FACTOR (loop_vinfo).is_constant (&const_vf)
2154 || ((tree_ctz (LOOP_VINFO_NITERS (loop_vinfo))
2155 < (unsigned) exact_log2 (const_vf))
2156 /* In case of versioning, check if the maximum number of
2157 iterations is greater than th. If they are identical,
2158 the epilogue is unnecessary. */
2159 && (!LOOP_REQUIRES_VERSIONING (loop_vinfo)
2160 || ((unsigned HOST_WIDE_INT) max_niter
2161 > (th / const_vf) * const_vf))))
2162 LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo) = true;
2164 /* If an epilogue loop is required make sure we can create one. */
2165 if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo)
2166 || LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo))
2168 if (dump_enabled_p ())
2169 dump_printf_loc (MSG_NOTE, vect_location, "epilog loop required\n");
2170 if (!vect_can_advance_ivs_p (loop_vinfo)
2171 || !slpeel_can_duplicate_loop_p (LOOP_VINFO_LOOP (loop_vinfo),
2172 single_exit (LOOP_VINFO_LOOP
2173 (loop_vinfo))))
2175 ok = opt_result::failure_at (vect_location,
2176 "not vectorized: can't create required "
2177 "epilog loop\n");
2178 goto again;
2182 /* During peeling, we need to check if number of loop iterations is
2183 enough for both peeled prolog loop and vector loop. This check
2184 can be merged along with threshold check of loop versioning, so
2185 increase threshold for this case if necessary. */
2186 if (LOOP_REQUIRES_VERSIONING (loop_vinfo))
2188 poly_uint64 niters_th = 0;
2190 if (!vect_use_loop_mask_for_alignment_p (loop_vinfo))
2192 /* Niters for peeled prolog loop. */
2193 if (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) < 0)
2195 dr_vec_info *dr_info = LOOP_VINFO_UNALIGNED_DR (loop_vinfo);
2196 tree vectype = STMT_VINFO_VECTYPE (dr_info->stmt);
2197 niters_th += TYPE_VECTOR_SUBPARTS (vectype) - 1;
2199 else
2200 niters_th += LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo);
2203 /* Niters for at least one iteration of vectorized loop. */
2204 if (!LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
2205 niters_th += LOOP_VINFO_VECT_FACTOR (loop_vinfo);
2206 /* One additional iteration because of peeling for gap. */
2207 if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo))
2208 niters_th += 1;
2209 LOOP_VINFO_VERSIONING_THRESHOLD (loop_vinfo) = niters_th;
2212 gcc_assert (known_eq (vectorization_factor,
2213 LOOP_VINFO_VECT_FACTOR (loop_vinfo)));
2215 /* Ok to vectorize! */
2216 return opt_result::success ();
2218 again:
2219 /* Ensure that "ok" is false (with an opt_problem if dumping is enabled). */
2220 gcc_assert (!ok);
2222 /* Try again with SLP forced off but if we didn't do any SLP there is
2223 no point in re-trying. */
2224 if (!slp)
2225 return ok;
2227 /* If there are reduction chains re-trying will fail anyway. */
2228 if (! LOOP_VINFO_REDUCTION_CHAINS (loop_vinfo).is_empty ())
2229 return ok;
2231 /* Likewise if the grouped loads or stores in the SLP cannot be handled
2232 via interleaving or lane instructions. */
2233 slp_instance instance;
2234 slp_tree node;
2235 unsigned i, j;
2236 FOR_EACH_VEC_ELT (LOOP_VINFO_SLP_INSTANCES (loop_vinfo), i, instance)
2238 stmt_vec_info vinfo;
2239 vinfo = SLP_TREE_SCALAR_STMTS (SLP_INSTANCE_TREE (instance))[0];
2240 if (! STMT_VINFO_GROUPED_ACCESS (vinfo))
2241 continue;
2242 vinfo = DR_GROUP_FIRST_ELEMENT (vinfo);
2243 unsigned int size = DR_GROUP_SIZE (vinfo);
2244 tree vectype = STMT_VINFO_VECTYPE (vinfo);
2245 if (! vect_store_lanes_supported (vectype, size, false)
2246 && ! known_eq (TYPE_VECTOR_SUBPARTS (vectype), 1U)
2247 && ! vect_grouped_store_supported (vectype, size))
2248 return opt_result::failure_at (vinfo->stmt,
2249 "unsupported grouped store\n");
2250 FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (instance), j, node)
2252 vinfo = SLP_TREE_SCALAR_STMTS (node)[0];
2253 vinfo = DR_GROUP_FIRST_ELEMENT (vinfo);
2254 bool single_element_p = !DR_GROUP_NEXT_ELEMENT (vinfo);
2255 size = DR_GROUP_SIZE (vinfo);
2256 vectype = STMT_VINFO_VECTYPE (vinfo);
2257 if (! vect_load_lanes_supported (vectype, size, false)
2258 && ! vect_grouped_load_supported (vectype, single_element_p,
2259 size))
2260 return opt_result::failure_at (vinfo->stmt,
2261 "unsupported grouped load\n");
2265 if (dump_enabled_p ())
2266 dump_printf_loc (MSG_NOTE, vect_location,
2267 "re-trying with SLP disabled\n");
2269 /* Roll back state appropriately. No SLP this time. */
2270 slp = false;
2271 /* Restore vectorization factor as it were without SLP. */
2272 LOOP_VINFO_VECT_FACTOR (loop_vinfo) = saved_vectorization_factor;
2273 /* Free the SLP instances. */
2274 FOR_EACH_VEC_ELT (LOOP_VINFO_SLP_INSTANCES (loop_vinfo), j, instance)
2275 vect_free_slp_instance (instance, false);
2276 LOOP_VINFO_SLP_INSTANCES (loop_vinfo).release ();
2277 /* Reset SLP type to loop_vect on all stmts. */
2278 for (i = 0; i < LOOP_VINFO_LOOP (loop_vinfo)->num_nodes; ++i)
2280 basic_block bb = LOOP_VINFO_BBS (loop_vinfo)[i];
2281 for (gimple_stmt_iterator si = gsi_start_phis (bb);
2282 !gsi_end_p (si); gsi_next (&si))
2284 stmt_vec_info stmt_info = loop_vinfo->lookup_stmt (gsi_stmt (si));
2285 STMT_SLP_TYPE (stmt_info) = loop_vect;
2287 for (gimple_stmt_iterator si = gsi_start_bb (bb);
2288 !gsi_end_p (si); gsi_next (&si))
2290 stmt_vec_info stmt_info = loop_vinfo->lookup_stmt (gsi_stmt (si));
2291 STMT_SLP_TYPE (stmt_info) = loop_vect;
2292 if (STMT_VINFO_IN_PATTERN_P (stmt_info))
2294 gimple *pattern_def_seq = STMT_VINFO_PATTERN_DEF_SEQ (stmt_info);
2295 stmt_info = STMT_VINFO_RELATED_STMT (stmt_info);
2296 STMT_SLP_TYPE (stmt_info) = loop_vect;
2297 for (gimple_stmt_iterator pi = gsi_start (pattern_def_seq);
2298 !gsi_end_p (pi); gsi_next (&pi))
2299 STMT_SLP_TYPE (loop_vinfo->lookup_stmt (gsi_stmt (pi)))
2300 = loop_vect;
2304 /* Free optimized alias test DDRS. */
2305 LOOP_VINFO_LOWER_BOUNDS (loop_vinfo).truncate (0);
2306 LOOP_VINFO_COMP_ALIAS_DDRS (loop_vinfo).release ();
2307 LOOP_VINFO_CHECK_UNEQUAL_ADDRS (loop_vinfo).release ();
2308 /* Reset target cost data. */
2309 destroy_cost_data (LOOP_VINFO_TARGET_COST_DATA (loop_vinfo));
2310 LOOP_VINFO_TARGET_COST_DATA (loop_vinfo)
2311 = init_cost (LOOP_VINFO_LOOP (loop_vinfo));
2312 /* Reset accumulated rgroup information. */
2313 release_vec_loop_masks (&LOOP_VINFO_MASKS (loop_vinfo));
2314 /* Reset assorted flags. */
2315 LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo) = false;
2316 LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo) = false;
2317 LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo) = 0;
2318 LOOP_VINFO_VERSIONING_THRESHOLD (loop_vinfo) = 0;
2319 LOOP_VINFO_CAN_FULLY_MASK_P (loop_vinfo) = saved_can_fully_mask_p;
2321 goto start_over;
2324 /* Function vect_analyze_loop.
2326 Apply a set of analyses on LOOP, and create a loop_vec_info struct
2327 for it. The different analyses will record information in the
2328 loop_vec_info struct. If ORIG_LOOP_VINFO is not NULL epilogue must
2329 be vectorized. */
     /* The analysis is attempted repeatedly, once per candidate vector size:
        first with current_vector_size == 0 and then with the entries of the
        list filled in by targetm.vectorize.autovectorize_vector_sizes.
        SHARED is passed on to vect_analyze_loop_form and receives the loop
        nest found by find_loop_nest.

        Returns the loop_vec_info of an accepted attempt, or a failure
        result when the loop is rejected up front, an attempt fails fatally,
        or no candidate size succeeds.  */
2330 opt_loop_vec_info
2331 vect_analyze_loop (struct loop *loop, loop_vec_info orig_loop_vinfo,
2332 vec_info_shared *shared)
2334 auto_vector_sizes vector_sizes;
2336 /* Autodetect first vector size we try. */
2337 current_vector_size = 0;
2338 targetm.vectorize.autovectorize_vector_sizes (&vector_sizes,
2339 loop->simdlen != 0);
     /* Index into VECTOR_SIZES of the next candidate size to try.  */
2340 unsigned int next_size = 0;
2342 DUMP_VECT_SCOPE ("analyze_loop_nest");
     /* Refuse an inner loop whose enclosing loop already carries a
        loop_vec_info that is marked vectorizable.  */
2344 if (loop_outer (loop)
2345 && loop_vec_info_for_loop (loop_outer (loop))
2346 && LOOP_VINFO_VECTORIZABLE_P (loop_vec_info_for_loop (loop_outer (loop))))
2347 return opt_loop_vec_info::failure_at (vect_location,
2348 "outer-loop already vectorized.\n");
2350 if (!find_loop_nest (loop, &shared->loop_nest))
2351 return opt_loop_vec_info::failure_at
2352 (vect_location,
2353 "not vectorized: loop nest containing two or more consecutive inner"
2354 " loops cannot be vectorized\n");
2356 unsigned n_stmts = 0;
2357 poly_uint64 autodetected_vector_size = 0;
     /* FIRST_LOOP_VINFO remembers the first successful analysis whose
        vectorization factor may differ from loop->simdlen, together with the
        vector size it was obtained with (FIRST_VECTOR_SIZE).  It is the
        fallback result once every candidate size has been tried.  */
2358 opt_loop_vec_info first_loop_vinfo = opt_loop_vec_info::success (NULL);
2359 poly_uint64 first_vector_size = 0;
     /* One iteration per vector size attempted.  */
2360 while (1)
2362 /* Check the CFG characteristics of the loop (nesting, entry/exit). */
2363 opt_loop_vec_info loop_vinfo
2364 = vect_analyze_loop_form (loop, shared);
2365 if (!loop_vinfo)
2367 if (dump_enabled_p ())
2368 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2369 "bad loop form.\n");
2370 gcc_checking_assert (first_loop_vinfo == NULL);
2371 return loop_vinfo;
     /* Passed to vect_analyze_loop_2, which may set it; when set, no further
        vector sizes are tried (see the check further down).  */
2374 bool fatal = false;
2376 if (orig_loop_vinfo)
2377 LOOP_VINFO_ORIG_LOOP_INFO (loop_vinfo) = orig_loop_vinfo;
2379 opt_result res = vect_analyze_loop_2 (loop_vinfo, fatal, &n_stmts);
2380 if (res)
2382 LOOP_VINFO_VECTORIZABLE_P (loop_vinfo) = 1;
     /* A simdlen was requested and the chosen vectorization factor may
        differ from it: keep only the first such result as a fallback and
        carry on with the remaining sizes.  */
2384 if (loop->simdlen
2385 && maybe_ne (LOOP_VINFO_VECT_FACTOR (loop_vinfo),
2386 (unsigned HOST_WIDE_INT) loop->simdlen))
2388 if (first_loop_vinfo == NULL)
2390 first_loop_vinfo = loop_vinfo;
2391 first_vector_size = current_vector_size;
     /* NOTE(review): presumably detaches the remembered candidate from LOOP
        so that a later attempt can attach its own info; loop->aux is
        restored from FIRST_LOOP_VINFO below when the fallback is chosen.
        Confirm against vect_analyze_loop_form.  */
2392 loop->aux = NULL;
2394 else
2395 delete loop_vinfo;
     /* Otherwise this result is accepted as is: discard any remembered
        fallback and return it.  */
2397 else
2399 delete first_loop_vinfo;
2400 return loop_vinfo;
     /* The analysis failed for this vector size; free its loop_vec_info.  */
2403 else
2404 delete loop_vinfo;
     /* NEXT_SIZE is still 0 only during the first attempt, which ran with
        current_vector_size initially 0; record the size that ended up being
        used for it.  */
2406 if (next_size == 0)
2407 autodetected_vector_size = current_vector_size;
     /* Skip the candidate that equals the size already tried by the
        autodetected attempt.  */
2409 if (next_size < vector_sizes.length ()
2410 && known_eq (vector_sizes[next_size], autodetected_vector_size))
2411 next_size += 1;
     /* A fatal failure ends the search without trying other sizes.  */
2413 if (fatal)
2415 gcc_checking_assert (first_loop_vinfo == NULL);
2416 return opt_loop_vec_info::propagate_failure (res);
     /* No candidate size is left (or no vector size was determined at all):
        fall back to the remembered result if there is one, restoring the
        vector size and loop->aux that belong to it; otherwise report the
        last failure.  */
2419 if (next_size == vector_sizes.length ()
2420 || known_eq (current_vector_size, 0U))
2422 if (first_loop_vinfo)
2424 current_vector_size = first_vector_size;
2425 loop->aux = (loop_vec_info) first_loop_vinfo;
2426 if (dump_enabled_p ())
2428 dump_printf_loc (MSG_NOTE, vect_location,
2429 "***** Choosing vector size ");
2430 dump_dec (MSG_NOTE, current_vector_size);
2431 dump_printf (MSG_NOTE, "\n");
2433 return first_loop_vinfo;
2435 else
2436 return opt_loop_vec_info::propagate_failure (res);
2439 /* Try the next biggest vector size. */
2440 current_vector_size = vector_sizes[next_size++];
2441 if (dump_enabled_p ())
2443 dump_printf_loc (MSG_NOTE, vect_location,
2444 "***** Re-trying analysis with "
2445 "vector size ");
2446 dump_dec (MSG_NOTE, current_vector_size);
2447 dump_printf (MSG_NOTE, "\n");
2452 /* Return true if there is an in-order reduction function for CODE, storing
2453 it in *REDUC_FN if so. */
2455 static bool
2456 fold_left_reduction_fn (tree_code code, internal_fn *reduc_fn)
2458 switch (code)
2460 case PLUS_EXPR:
2461 *reduc_fn = IFN_FOLD_LEFT_PLUS;
2462 return true;
2464 default:
2465 return false;
2469 /* Function reduction_fn_for_scalar_code
2471 Input:
2472 CODE - tree_code of a reduction operations.
2474 Output:
2475 REDUC_FN - the corresponding internal function to be used to reduce the
2476 vector of partial results into a single scalar result, or IFN_LAST
2477 if the operation is a supported reduction operation, but does not have
2478 such an internal function.
2480 Return FALSE if CODE currently cannot be vectorized as reduction. */
2482 static bool
2483 reduction_fn_for_scalar_code (enum tree_code code, internal_fn *reduc_fn)
2485 switch (code)
2487 case MAX_EXPR:
2488 *reduc_fn = IFN_REDUC_MAX;
2489 return true;
2491 case MIN_EXPR:
2492 *reduc_fn = IFN_REDUC_MIN;
2493 return true;
2495 case PLUS_EXPR:
2496 *reduc_fn = IFN_REDUC_PLUS;
2497 return true;
2499 case BIT_AND_EXPR:
2500 *reduc_fn = IFN_REDUC_AND;
2501 return true;
2503 case BIT_IOR_EXPR:
2504 *reduc_fn = IFN_REDUC_IOR;
2505 return true;
2507 case BIT_XOR_EXPR:
2508 *reduc_fn = IFN_REDUC_XOR;
2509 return true;
2511 case MULT_EXPR:
2512 case MINUS_EXPR:
2513 *reduc_fn = IFN_LAST;
2514 return true;
2516 default:
2517 return false;
2521 /* If there is a neutral value X such that SLP reduction NODE would not
2522 be affected by the introduction of additional X elements, return that X,
2523 otherwise return null. CODE is the code of the reduction. REDUC_CHAIN
2524 is true if the SLP statements perform a single reduction, false if each
2525 statement performs an independent reduction. */
2527 static tree
2528 neutral_op_for_slp_reduction (slp_tree slp_node, tree_code code,
2529 bool reduc_chain)
2531 vec<stmt_vec_info> stmts = SLP_TREE_SCALAR_STMTS (slp_node);
2532 stmt_vec_info stmt_vinfo = stmts[0];
2533 tree vector_type = STMT_VINFO_VECTYPE (stmt_vinfo);
2534 tree scalar_type = TREE_TYPE (vector_type);
2535 struct loop *loop = gimple_bb (stmt_vinfo->stmt)->loop_father;
2536 gcc_assert (loop);
2538 switch (code)
2540 case WIDEN_SUM_EXPR:
2541 case DOT_PROD_EXPR:
2542 case SAD_EXPR:
2543 case PLUS_EXPR:
2544 case MINUS_EXPR:
2545 case BIT_IOR_EXPR:
2546 case BIT_XOR_EXPR:
2547 return build_zero_cst (scalar_type);
2549 case MULT_EXPR:
2550 return build_one_cst (scalar_type);
2552 case BIT_AND_EXPR:
2553 return build_all_ones_cst (scalar_type);
2555 case MAX_EXPR:
2556 case MIN_EXPR:
2557 /* For MIN/MAX the initial values are neutral. A reduction chain
2558 has only a single initial value, so that value is neutral for
2559 all statements. */
2560 if (reduc_chain)
2561 return PHI_ARG_DEF_FROM_EDGE (stmt_vinfo->stmt,
2562 loop_preheader_edge (loop));
2563 return NULL_TREE;
2565 default:
2566 return NULL_TREE;
2570 /* Error reporting helper for vect_is_simple_reduction below. GIMPLE statement
2571 STMT is printed with a message MSG. */
2573 static void
2574 report_vect_op (dump_flags_t msg_type, gimple *stmt, const char *msg)
2576 dump_printf_loc (msg_type, vect_location, "%s%G", msg, stmt);
2579 /* DEF_STMT_INFO occurs in a loop that contains a potential reduction
2580 operation. Return true if the results of DEF_STMT_INFO are something
2581 that can be accumulated by such a reduction. */
2583 static bool
2584 vect_valid_reduction_input_p (stmt_vec_info def_stmt_info)
2586 return (is_gimple_assign (def_stmt_info->stmt)
2587 || is_gimple_call (def_stmt_info->stmt)
2588 || STMT_VINFO_DEF_TYPE (def_stmt_info) == vect_induction_def
2589 || (gimple_code (def_stmt_info->stmt) == GIMPLE_PHI
2590 && STMT_VINFO_DEF_TYPE (def_stmt_info) == vect_internal_def
2591 && !is_loop_header_bb_p (gimple_bb (def_stmt_info->stmt))));
2594 /* Detect SLP reduction of the form:
2596 #a1 = phi <a5, a0>
2597 a2 = operation (a1)
2598 a3 = operation (a2)
2599 a4 = operation (a3)
2600 a5 = operation (a4)
2602 #a = phi <a5>
2604 PHI is the reduction phi node (#a1 = phi <a5, a0> above)
2605 FIRST_STMT is the first reduction stmt in the chain
2606 (a2 = operation (a1)).
2608 Return TRUE if a reduction chain was detected. */
     /* LOOP_INFO is the loop_vec_info used to look up statement info.  On
        success the statements of the chain are linked together through
        REDUC_GROUP_FIRST_ELEMENT / REDUC_GROUP_NEXT_ELEMENT, the chain
        length is stored in REDUC_GROUP_SIZE of its first element, and that
        first element is pushed onto LOOP_VINFO_REDUCTION_CHAINS.  Operands
        of chain statements may be swapped in place so that the reduction
        operand is the second operand.  */
2610 static bool
2611 vect_is_slp_reduction (loop_vec_info loop_info, gimple *phi,
2612 gimple *first_stmt)
2614 struct loop *loop = (gimple_bb (phi))->loop_father;
2615 struct loop *vect_loop = LOOP_VINFO_LOOP (loop_info);
2616 enum tree_code code;
2617 gimple *loop_use_stmt = NULL;
2618 stmt_vec_info use_stmt_info;
2619 tree lhs;
2620 imm_use_iterator imm_iter;
2621 use_operand_p use_p;
2622 int nloop_uses, size = 0, n_out_of_loop_uses;
2623 bool found = false;
     /* Only PHIs whose containing loop is the loop being vectorized itself
        are considered.  */
2625 if (loop != vect_loop)
2626 return false;
2628 auto_vec<stmt_vec_info, 8> reduc_chain;
2629 lhs = PHI_RESULT (phi);
     /* Every statement of the chain must use the same operation as
        FIRST_STMT.  */
2630 code = gimple_assign_rhs_code (first_stmt);
     /* Follow the uses starting at the PHI result: each round inspects the
        non-debug uses of LHS, appends the using statement to REDUC_CHAIN and
        moves LHS on to that statement's result, until PHI is reached
        again.  */
2631 while (1)
2633 nloop_uses = 0;
2634 n_out_of_loop_uses = 0;
2635 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, lhs)
2637 gimple *use_stmt = USE_STMT (use_p);
2638 if (is_gimple_debug (use_stmt))
2639 continue;
2641 /* Check if we got back to the reduction phi. */
2642 if (use_stmt == phi)
2644 loop_use_stmt = use_stmt;
2645 found = true;
2646 break;
2649 if (flow_bb_inside_loop_p (loop, gimple_bb (use_stmt)))
2651 loop_use_stmt = use_stmt;
2652 nloop_uses++;
2654 else
2655 n_out_of_loop_uses++;
2657 /* There can be either a single use in the loop or two uses in
2658 phi nodes. */
2659 if (nloop_uses > 1 || (n_out_of_loop_uses && nloop_uses))
2660 return false;
2663 if (found)
2664 break;
2666 /* We reached a statement with no loop uses. */
2667 if (nloop_uses == 0)
2668 return false;
2670 /* This is a loop exit phi, and we haven't reached the reduction phi. */
2671 if (gimple_code (loop_use_stmt) == GIMPLE_PHI)
2672 return false;
2674 if (!is_gimple_assign (loop_use_stmt)
2675 || code != gimple_assign_rhs_code (loop_use_stmt)
2676 || !flow_bb_inside_loop_p (loop, gimple_bb (loop_use_stmt)))
2677 return false;
2679 /* Insert USE_STMT into reduction chain. */
2680 use_stmt_info = loop_info->lookup_stmt (loop_use_stmt);
2681 reduc_chain.safe_push (use_stmt_info);
2683 lhs = gimple_assign_lhs (loop_use_stmt);
2684 size++;
     /* SIZE counts the statements pushed onto REDUC_CHAIN; a chain needs at
        least two of them.  */
2687 if (!found || loop_use_stmt != phi || size < 2)
2688 return false;
2690 /* Swap the operands, if needed, to make the reduction operand be the second
2691 operand. */
     /* LHS tracks the value flowing along the chain, starting again from the
        PHI result.  */
2692 lhs = PHI_RESULT (phi);
2693 for (unsigned i = 0; i < reduc_chain.length (); ++i)
2695 gassign *next_stmt = as_a <gassign *> (reduc_chain[i]->stmt);
     /* The reduction operand is already the second operand: only the first
        operand needs to be validated.  */
2696 if (gimple_assign_rhs2 (next_stmt) == lhs)
2698 tree op = gimple_assign_rhs1 (next_stmt);
2699 stmt_vec_info def_stmt_info = loop_info->lookup_def (op);
2701 /* Check that the other def is either defined in the loop
2702 ("vect_internal_def"), or it's an induction (defined by a
2703 loop-header phi-node). */
2704 if (def_stmt_info
2705 && flow_bb_inside_loop_p (loop, gimple_bb (def_stmt_info->stmt))
2706 && vect_valid_reduction_input_p (def_stmt_info))
2708 lhs = gimple_assign_lhs (next_stmt);
2709 continue;
2712 return false;
     /* Otherwise validate the second operand and, if it is acceptable, swap
        the two operands.  */
2714 else
2716 tree op = gimple_assign_rhs2 (next_stmt);
2717 stmt_vec_info def_stmt_info = loop_info->lookup_def (op);
2719 /* Check that the other def is either defined in the loop
2720 ("vect_internal_def"), or it's an induction (defined by a
2721 loop-header phi-node). */
2722 if (def_stmt_info
2723 && flow_bb_inside_loop_p (loop, gimple_bb (def_stmt_info->stmt))
2724 && vect_valid_reduction_input_p (def_stmt_info))
2726 if (dump_enabled_p ())
2727 dump_printf_loc (MSG_NOTE, vect_location, "swapping oprnds: %G",
2728 next_stmt);
2730 swap_ssa_operands (next_stmt,
2731 gimple_assign_rhs1_ptr (next_stmt),
2732 gimple_assign_rhs2_ptr (next_stmt));
2733 update_stmt (next_stmt);
     /* Record in LOOP_INFO that a swap left a constant as first operand.  */
2735 if (CONSTANT_CLASS_P (gimple_assign_rhs1 (next_stmt)))
2736 LOOP_VINFO_OPERANDS_SWAPPED (loop_info) = true;
2738 else
2739 return false;
2742 lhs = gimple_assign_lhs (next_stmt);
2745 /* Build up the actual chain. */
     /* The chain holds at least two elements here (SIZE >= 2 was checked
        above), so the unsigned "length () - 1" cannot wrap.  */
2746 for (unsigned i = 0; i < reduc_chain.length () - 1; ++i)
2748 REDUC_GROUP_FIRST_ELEMENT (reduc_chain[i]) = reduc_chain[0];
2749 REDUC_GROUP_NEXT_ELEMENT (reduc_chain[i]) = reduc_chain[i+1];
     /* The last element terminates the list.  */
2751 REDUC_GROUP_FIRST_ELEMENT (reduc_chain.last ()) = reduc_chain[0];
2752 REDUC_GROUP_NEXT_ELEMENT (reduc_chain.last ()) = NULL;
2754 /* Save the chain for further analysis in SLP detection. */
2755 LOOP_VINFO_REDUCTION_CHAINS (loop_info).safe_push (reduc_chain[0]);
2756 REDUC_GROUP_SIZE (reduc_chain[0]) = size;
2758 return true;
2761 /* Return true if we need an in-order reduction for operation CODE
2762 on type TYPE. NEED_WRAPPING_INTEGRAL_OVERFLOW is true if integer
2763 overflow must wrap. */
2765 static bool
2766 needs_fold_left_reduction_p (tree type, tree_code code,
2767 bool need_wrapping_integral_overflow)
2769 /* CHECKME: check for !flag_finite_math_only too? */
2770 if (SCALAR_FLOAT_TYPE_P (type))
2771 switch (code)
2773 case MIN_EXPR:
2774 case MAX_EXPR:
2775 return false;
2777 default:
2778 return !flag_associative_math;
2781 if (INTEGRAL_TYPE_P (type))
2783 if (!operation_no_trapping_overflow (type, code))
2784 return true;
2785 if (need_wrapping_integral_overflow
2786 && !TYPE_OVERFLOW_WRAPS (type)
2787 && operation_can_overflow (code))
2788 return true;
2789 return false;
2792 if (SAT_FIXED_POINT_TYPE_P (type))
2793 return true;
2795 return false;
2798 /* Return true if the reduction PHI in LOOP with latch arg LOOP_ARG and
2799 reduction operation CODE has a handled computation expression. */
     /* LOC is the location used for the dump output.  The function first
        searches, via SSA use operands, for a path from LOOP_ARG back to the
        result of PHI, and then validates the statements on that path.  The
        result is false when validation fails or when the path negates the
        reduction value (see NEG below).  */
2801 bool
2802 check_reduction_path (dump_user_location_t loc, loop_p loop, gphi *phi,
2803 tree loop_arg, enum tree_code code)
     /* Stack of (operand iterator, current use) pairs describing the path
        currently being explored.  */
2805 auto_vec<std::pair<ssa_op_iter, use_operand_p> > path;
     /* SSA name versions already queued for exploration.  */
2806 auto_bitmap visited;
2807 tree lookfor = PHI_RESULT (phi);
2808 ssa_op_iter curri;
     /* Position CURR on the PHI argument that equals LOOP_ARG.  */
2809 use_operand_p curr = op_iter_init_phiuse (&curri, phi, SSA_OP_USE);
2810 while (USE_FROM_PTR (curr) != loop_arg)
2811 curr = op_iter_next_use (&curri);
     /* NOTE(review): presumably marks the PHI operand iterator as exhausted
        so that backtracking to this first entry does not continue with the
        other PHI arguments; confirm against ssa_op_iter.  */
2812 curri.i = curri.numops;
2815 path.safe_push (std::make_pair (curri, curr));
2816 tree use = USE_FROM_PTR (curr);
     /* Reached the PHI result: the path is complete.  */
2817 if (use == lookfor)
2818 break;
2819 gimple *def = SSA_NAME_DEF_STMT (use);
     /* Definitions without a statement or outside LOOP are dead ends:
        backtrack to the next unexplored operand.  */
2820 if (gimple_nop_p (def)
2821 || ! flow_bb_inside_loop_p (loop, gimple_bb (def)))
2823 pop:
2826 std::pair<ssa_op_iter, use_operand_p> x = path.pop ();
2827 curri = x.first;
2828 curr = x.second;
2830 curr = op_iter_next_use (&curri);
2831 /* Skip already visited or non-SSA operands (from iterating
2832 over PHI args). */
2833 while (curr != NULL_USE_OPERAND_P
2834 && (TREE_CODE (USE_FROM_PTR (curr)) != SSA_NAME
2835 || ! bitmap_set_bit (visited,
2836 SSA_NAME_VERSION
2837 (USE_FROM_PTR (curr)))));
2839 while (curr == NULL_USE_OPERAND_P && ! path.is_empty ());
     /* Nothing is left to explore: give up the search.  */
2840 if (curr == NULL_USE_OPERAND_P)
2841 break;
     /* Otherwise descend into the operands of the defining statement,
        starting at its first SSA operand not seen before.  */
2843 else
2845 if (gimple_code (def) == GIMPLE_PHI)
2846 curr = op_iter_init_phiuse (&curri, as_a <gphi *>(def), SSA_OP_USE);
2847 else
2848 curr = op_iter_init_use (&curri, def, SSA_OP_USE);
2849 while (curr != NULL_USE_OPERAND_P
2850 && (TREE_CODE (USE_FROM_PTR (curr)) != SSA_NAME
2851 || ! bitmap_set_bit (visited,
2852 SSA_NAME_VERSION
2853 (USE_FROM_PTR (curr)))))
2854 curr = op_iter_next_use (&curri);
2855 if (curr == NULL_USE_OPERAND_P)
2856 goto pop;
2859 while (1);
2860 if (dump_file && (dump_flags & TDF_DETAILS))
2862 dump_printf_loc (MSG_NOTE, loc, "reduction path: ");
2863 unsigned i;
2864 std::pair<ssa_op_iter, use_operand_p> *x;
2865 FOR_EACH_VEC_ELT (path, i, x)
2866 dump_printf (MSG_NOTE, "%T ", USE_FROM_PTR (x->second));
2867 dump_printf (MSG_NOTE, "\n");
2870 /* Check whether the reduction path detected is valid. */
     /* An empty path means no path was found.  */
2871 bool fail = path.length () == 0;
2872 bool neg = false;
     /* The first path entry is not checked (the walk starts at index 1).
        Every other operand on the path must have a single use, and the
        statement using it must be an assignment whose operation is CODE; a
        MINUS_EXPR is additionally tolerated when CODE is PLUS_EXPR.  */
2873 for (unsigned i = 1; i < path.length (); ++i)
2875 gimple *use_stmt = USE_STMT (path[i].second);
2876 tree op = USE_FROM_PTR (path[i].second);
2877 if (! has_single_use (op)
2878 || ! is_gimple_assign (use_stmt))
2880 fail = true;
2881 break;
2883 if (gimple_assign_rhs_code (use_stmt) != code)
2885 if (code == PLUS_EXPR
2886 && gimple_assign_rhs_code (use_stmt) == MINUS_EXPR)
2888 /* Track whether we negate the reduction value each iteration. */
2889 if (gimple_assign_rhs2 (use_stmt) == op)
2890 neg = ! neg;
2892 else
2894 fail = true;
2895 break;
     /* A path that leaves the reduction value negated is rejected as
        well.  */
2899 return ! fail && ! neg;
2903 /* Function vect_is_simple_reduction
2905 (1) Detect a cross-iteration def-use cycle that represents a simple
2906 reduction computation. We look for the following pattern:
2908 loop_header:
2909 a1 = phi < a0, a2 >
2910 a3 = ...
2911 a2 = operation (a3, a1)
2913 or
2915 a3 = ...
2916 loop_header:
2917 a1 = phi < a0, a2 >
2918 a2 = operation (a3, a1)
2920 such that:
2921 1. operation is commutative and associative and it is safe to
2922 change the order of the computation
2923 2. no uses for a2 in the loop (a2 is used out of the loop)
2924 3. no uses of a1 in the loop besides the reduction operation
2925 4. no uses of a1 outside the loop.
2927 Conditions 1,4 are tested here.
2928 Conditions 2,3 are tested in vect_mark_stmts_to_be_vectorized.
2930 (2) Detect a cross-iteration def-use cycle in nested loops, i.e.,
2931 nested cycles.
2933 (3) Detect cycles of phi nodes in outer-loop vectorization, i.e., double
2934 reductions:
2936 a1 = phi < a0, a2 >
2937 inner loop (def of a3)
2938 a2 = phi < a3 >
2940 (4) Detect condition expressions, ie:
2941 for (int i = 0; i < N; i++)
2942 if (a[i] < val)
2943 ret_val = a[i];
2947 static stmt_vec_info
2948 vect_is_simple_reduction (loop_vec_info loop_info, stmt_vec_info phi_info,
2949 bool *double_reduc,
2950 bool need_wrapping_integral_overflow,
2951 enum vect_reduction_type *v_reduc_type)
2953 gphi *phi = as_a <gphi *> (phi_info->stmt);
2954 struct loop *loop = (gimple_bb (phi))->loop_father;
2955 struct loop *vect_loop = LOOP_VINFO_LOOP (loop_info);
2956 bool nested_in_vect_loop = flow_loop_nested_p (vect_loop, loop);
2957 gimple *phi_use_stmt = NULL;
2958 enum tree_code orig_code, code;
2959 tree op1, op2, op3 = NULL_TREE, op4 = NULL_TREE;
2960 tree type;
2961 tree name;
2962 imm_use_iterator imm_iter;
2963 use_operand_p use_p;
2964 bool phi_def;
2966 *double_reduc = false;
2967 *v_reduc_type = TREE_CODE_REDUCTION;
2969 tree phi_name = PHI_RESULT (phi);
2970 /* ??? If there are no uses of the PHI result the inner loop reduction
2971 won't be detected as possibly double-reduction by vectorizable_reduction
2972 because that tries to walk the PHI arg from the preheader edge which
2973 can be constant. See PR60382. */
2974 if (has_zero_uses (phi_name))
2975 return NULL;
2976 unsigned nphi_def_loop_uses = 0;
2977 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, phi_name)
2979 gimple *use_stmt = USE_STMT (use_p);
2980 if (is_gimple_debug (use_stmt))
2981 continue;
2983 if (!flow_bb_inside_loop_p (loop, gimple_bb (use_stmt)))
2985 if (dump_enabled_p ())
2986 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2987 "intermediate value used outside loop.\n");
2989 return NULL;
2992 nphi_def_loop_uses++;
2993 phi_use_stmt = use_stmt;
2996 edge latch_e = loop_latch_edge (loop);
2997 tree loop_arg = PHI_ARG_DEF_FROM_EDGE (phi, latch_e);
2998 if (TREE_CODE (loop_arg) != SSA_NAME)
3000 if (dump_enabled_p ())
3001 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3002 "reduction: not ssa_name: %T\n", loop_arg);
3003 return NULL;
3006 stmt_vec_info def_stmt_info = loop_info->lookup_def (loop_arg);
3007 if (!def_stmt_info
3008 || !flow_bb_inside_loop_p (loop, gimple_bb (def_stmt_info->stmt)))
3009 return NULL;
3011 if (gassign *def_stmt = dyn_cast <gassign *> (def_stmt_info->stmt))
3013 name = gimple_assign_lhs (def_stmt);
3014 phi_def = false;
3016 else if (gphi *def_stmt = dyn_cast <gphi *> (def_stmt_info->stmt))
3018 name = PHI_RESULT (def_stmt);
3019 phi_def = true;
3021 else
3023 if (dump_enabled_p ())
3024 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3025 "reduction: unhandled reduction operation: %G",
3026 def_stmt_info->stmt);
3027 return NULL;
3030 unsigned nlatch_def_loop_uses = 0;
3031 auto_vec<gphi *, 3> lcphis;
3032 bool inner_loop_of_double_reduc = false;
3033 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, name)
3035 gimple *use_stmt = USE_STMT (use_p);
3036 if (is_gimple_debug (use_stmt))
3037 continue;
3038 if (flow_bb_inside_loop_p (loop, gimple_bb (use_stmt)))
3039 nlatch_def_loop_uses++;
3040 else
3042 /* We can have more than one loop-closed PHI. */
3043 lcphis.safe_push (as_a <gphi *> (use_stmt));
3044 if (nested_in_vect_loop
3045 && (STMT_VINFO_DEF_TYPE (loop_info->lookup_stmt (use_stmt))
3046 == vect_double_reduction_def))
3047 inner_loop_of_double_reduc = true;
3051 /* If this isn't a nested cycle or if the nested cycle reduction value
3052 is used ouside of the inner loop we cannot handle uses of the reduction
3053 value. */
3054 if ((!nested_in_vect_loop || inner_loop_of_double_reduc)
3055 && (nlatch_def_loop_uses > 1 || nphi_def_loop_uses > 1))
3057 if (dump_enabled_p ())
3058 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3059 "reduction used in loop.\n");
3060 return NULL;
3063 /* If DEF_STMT is a phi node itself, we expect it to have a single argument
3064 defined in the inner loop. */
3065 if (phi_def)
3067 gphi *def_stmt = as_a <gphi *> (def_stmt_info->stmt);
3068 op1 = PHI_ARG_DEF (def_stmt, 0);
3070 if (gimple_phi_num_args (def_stmt) != 1
3071 || TREE_CODE (op1) != SSA_NAME)
3073 if (dump_enabled_p ())
3074 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3075 "unsupported phi node definition.\n");
3077 return NULL;
3080 gimple *def1 = SSA_NAME_DEF_STMT (op1);
3081 if (gimple_bb (def1)
3082 && flow_bb_inside_loop_p (loop, gimple_bb (def_stmt))
3083 && loop->inner
3084 && flow_bb_inside_loop_p (loop->inner, gimple_bb (def1))
3085 && is_gimple_assign (def1)
3086 && is_a <gphi *> (phi_use_stmt)
3087 && flow_bb_inside_loop_p (loop->inner, gimple_bb (phi_use_stmt)))
3089 if (dump_enabled_p ())
3090 report_vect_op (MSG_NOTE, def_stmt,
3091 "detected double reduction: ");
3093 *double_reduc = true;
3094 return def_stmt_info;
3097 return NULL;
3100 /* If we are vectorizing an inner reduction we are executing that
3101 in the original order only in case we are not dealing with a
3102 double reduction. */
3103 bool check_reduction = true;
3104 if (flow_loop_nested_p (vect_loop, loop))
3106 gphi *lcphi;
3107 unsigned i;
3108 check_reduction = false;
3109 FOR_EACH_VEC_ELT (lcphis, i, lcphi)
3110 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, gimple_phi_result (lcphi))
3112 gimple *use_stmt = USE_STMT (use_p);
3113 if (is_gimple_debug (use_stmt))
3114 continue;
3115 if (! flow_bb_inside_loop_p (vect_loop, gimple_bb (use_stmt)))
3116 check_reduction = true;
3120 gassign *def_stmt = as_a <gassign *> (def_stmt_info->stmt);
3121 code = orig_code = gimple_assign_rhs_code (def_stmt);
3123 if (nested_in_vect_loop && !check_reduction)
3125 /* FIXME: Even for non-reductions code generation is funneled
3126 through vectorizable_reduction for the stmt defining the
3127 PHI latch value. So we have to artificially restrict ourselves
3128 for the supported operations. */
3129 switch (get_gimple_rhs_class (code))
3131 case GIMPLE_BINARY_RHS:
3132 case GIMPLE_TERNARY_RHS:
3133 break;
3134 default:
3135 /* Not supported by vectorizable_reduction. */
3136 if (dump_enabled_p ())
3137 report_vect_op (MSG_MISSED_OPTIMIZATION, def_stmt,
3138 "nested cycle: not handled operation: ");
3139 return NULL;
3141 if (dump_enabled_p ())
3142 report_vect_op (MSG_NOTE, def_stmt, "detected nested cycle: ");
3143 return def_stmt_info;
3146 /* We can handle "res -= x[i]", which is non-associative by
3147 simply rewriting this into "res += -x[i]". Avoid changing
3148 gimple instruction for the first simple tests and only do this
3149 if we're allowed to change code at all. */
3150 if (code == MINUS_EXPR && gimple_assign_rhs2 (def_stmt) != phi_name)
3151 code = PLUS_EXPR;
3153 if (code == COND_EXPR)
3155 if (! nested_in_vect_loop)
3156 *v_reduc_type = COND_REDUCTION;
3158 op3 = gimple_assign_rhs1 (def_stmt);
3159 if (COMPARISON_CLASS_P (op3))
3161 op4 = TREE_OPERAND (op3, 1);
3162 op3 = TREE_OPERAND (op3, 0);
3164 if (op3 == phi_name || op4 == phi_name)
3166 if (dump_enabled_p ())
3167 report_vect_op (MSG_MISSED_OPTIMIZATION, def_stmt,
3168 "reduction: condition depends on previous"
3169 " iteration: ");
3170 return NULL;
3173 op1 = gimple_assign_rhs2 (def_stmt);
3174 op2 = gimple_assign_rhs3 (def_stmt);
3176 else if (!commutative_tree_code (code) || !associative_tree_code (code))
3178 if (dump_enabled_p ())
3179 report_vect_op (MSG_MISSED_OPTIMIZATION, def_stmt,
3180 "reduction: not commutative/associative: ");
3181 return NULL;
3183 else if (get_gimple_rhs_class (code) == GIMPLE_BINARY_RHS)
3185 op1 = gimple_assign_rhs1 (def_stmt);
3186 op2 = gimple_assign_rhs2 (def_stmt);
3188 else
3190 if (dump_enabled_p ())
3191 report_vect_op (MSG_MISSED_OPTIMIZATION, def_stmt,
3192 "reduction: not handled operation: ");
3193 return NULL;
3196 if (TREE_CODE (op1) != SSA_NAME && TREE_CODE (op2) != SSA_NAME)
3198 if (dump_enabled_p ())
3199 report_vect_op (MSG_MISSED_OPTIMIZATION, def_stmt,
3200 "reduction: both uses not ssa_names: ");
3202 return NULL;
3205 type = TREE_TYPE (gimple_assign_lhs (def_stmt));
3206 if ((TREE_CODE (op1) == SSA_NAME
3207 && !types_compatible_p (type,TREE_TYPE (op1)))
3208 || (TREE_CODE (op2) == SSA_NAME
3209 && !types_compatible_p (type, TREE_TYPE (op2)))
3210 || (op3 && TREE_CODE (op3) == SSA_NAME
3211 && !types_compatible_p (type, TREE_TYPE (op3)))
3212 || (op4 && TREE_CODE (op4) == SSA_NAME
3213 && !types_compatible_p (type, TREE_TYPE (op4))))
3215 if (dump_enabled_p ())
3217 dump_printf_loc (MSG_NOTE, vect_location,
3218 "reduction: multiple types: operation type: "
3219 "%T, operands types: %T,%T",
3220 type, TREE_TYPE (op1), TREE_TYPE (op2));
3221 if (op3)
3222 dump_printf (MSG_NOTE, ",%T", TREE_TYPE (op3));
3224 if (op4)
3225 dump_printf (MSG_NOTE, ",%T", TREE_TYPE (op4));
3226 dump_printf (MSG_NOTE, "\n");
3229 return NULL;
3232 /* Check whether it's ok to change the order of the computation.
3233 Generally, when vectorizing a reduction we change the order of the
3234 computation. This may change the behavior of the program in some
3235 cases, so we need to check that this is ok. One exception is when
3236 vectorizing an outer-loop: the inner-loop is executed sequentially,
3237 and therefore vectorizing reductions in the inner-loop during
3238 outer-loop vectorization is safe. */
3239 if (check_reduction
3240 && *v_reduc_type == TREE_CODE_REDUCTION
3241 && needs_fold_left_reduction_p (type, code,
3242 need_wrapping_integral_overflow))
3243 *v_reduc_type = FOLD_LEFT_REDUCTION;
3245 /* Reduction is safe. We're dealing with one of the following:
3246 1) integer arithmetic and no trapv
3247 2) floating point arithmetic, and special flags permit this optimization
3248 3) nested cycle (i.e., outer loop vectorization). */
3249 stmt_vec_info def1_info = loop_info->lookup_def (op1);
3250 stmt_vec_info def2_info = loop_info->lookup_def (op2);
3251 if (code != COND_EXPR && !def1_info && !def2_info)
3253 if (dump_enabled_p ())
3254 report_vect_op (MSG_NOTE, def_stmt, "reduction: no defs for operands: ");
3255 return NULL;
3258 /* Check that one def is the reduction def, defined by PHI,
3259 the other def is either defined in the loop ("vect_internal_def"),
3260 or it's an induction (defined by a loop-header phi-node). */
3262 if (def2_info
3263 && def2_info->stmt == phi
3264 && (code == COND_EXPR
3265 || !def1_info
3266 || !flow_bb_inside_loop_p (loop, gimple_bb (def1_info->stmt))
3267 || vect_valid_reduction_input_p (def1_info)))
3269 if (dump_enabled_p ())
3270 report_vect_op (MSG_NOTE, def_stmt, "detected reduction: ");
3271 return def_stmt_info;
3274 if (def1_info
3275 && def1_info->stmt == phi
3276 && (code == COND_EXPR
3277 || !def2_info
3278 || !flow_bb_inside_loop_p (loop, gimple_bb (def2_info->stmt))
3279 || vect_valid_reduction_input_p (def2_info)))
3281 if (! nested_in_vect_loop && orig_code != MINUS_EXPR)
3283 /* Check if we can swap operands (just for simplicity - so that
3284 the rest of the code can assume that the reduction variable
3285 is always the last (second) argument). */
3286 if (code == COND_EXPR)
3288 /* Swap cond_expr by inverting the condition. */
3289 tree cond_expr = gimple_assign_rhs1 (def_stmt);
3290 enum tree_code invert_code = ERROR_MARK;
3291 enum tree_code cond_code = TREE_CODE (cond_expr);
3293 if (TREE_CODE_CLASS (cond_code) == tcc_comparison)
3295 bool honor_nans = HONOR_NANS (TREE_OPERAND (cond_expr, 0));
3296 invert_code = invert_tree_comparison (cond_code, honor_nans);
3298 if (invert_code != ERROR_MARK)
3300 TREE_SET_CODE (cond_expr, invert_code);
3301 swap_ssa_operands (def_stmt,
3302 gimple_assign_rhs2_ptr (def_stmt),
3303 gimple_assign_rhs3_ptr (def_stmt));
3305 else
3307 if (dump_enabled_p ())
3308 report_vect_op (MSG_NOTE, def_stmt,
3309 "detected reduction: cannot swap operands "
3310 "for cond_expr");
3311 return NULL;
3314 else
3315 swap_ssa_operands (def_stmt, gimple_assign_rhs1_ptr (def_stmt),
3316 gimple_assign_rhs2_ptr (def_stmt));
3318 if (dump_enabled_p ())
3319 report_vect_op (MSG_NOTE, def_stmt,
3320 "detected reduction: need to swap operands: ");
3322 if (CONSTANT_CLASS_P (gimple_assign_rhs1 (def_stmt)))
3323 LOOP_VINFO_OPERANDS_SWAPPED (loop_info) = true;
3325 else
3327 if (dump_enabled_p ())
3328 report_vect_op (MSG_NOTE, def_stmt, "detected reduction: ");
3331 return def_stmt_info;
3334 /* Try to find SLP reduction chain. */
3335 if (! nested_in_vect_loop
3336 && code != COND_EXPR
3337 && orig_code != MINUS_EXPR
3338 && vect_is_slp_reduction (loop_info, phi, def_stmt))
3340 if (dump_enabled_p ())
3341 report_vect_op (MSG_NOTE, def_stmt,
3342 "reduction: detected reduction chain: ");
3344 return def_stmt_info;
3347 /* Look for the expression computing loop_arg from loop PHI result. */
3348 if (check_reduction_path (vect_location, loop, phi, loop_arg, code))
3349 return def_stmt_info;
3351 if (dump_enabled_p ())
3353 report_vect_op (MSG_MISSED_OPTIMIZATION, def_stmt,
3354 "reduction: unknown pattern: ");
3357 return NULL;
3360 /* Wrapper around vect_is_simple_reduction, which will modify code
3361 in-place if it enables detection of more reductions. Arguments
3362 as there. */
3364 stmt_vec_info
3365 vect_force_simple_reduction (loop_vec_info loop_info, stmt_vec_info phi_info,
3366 bool *double_reduc,
3367 bool need_wrapping_integral_overflow)
3369 enum vect_reduction_type v_reduc_type;
3370 stmt_vec_info def_info
3371 = vect_is_simple_reduction (loop_info, phi_info, double_reduc,
3372 need_wrapping_integral_overflow,
3373 &v_reduc_type);
3374 if (def_info)
3376 STMT_VINFO_REDUC_TYPE (phi_info) = v_reduc_type;
3377 STMT_VINFO_REDUC_DEF (phi_info) = def_info;
3378 STMT_VINFO_REDUC_TYPE (def_info) = v_reduc_type;
3379 STMT_VINFO_REDUC_DEF (def_info) = phi_info;
3381 return def_info;
3384 /* Calculate cost of peeling the loop PEEL_ITERS_PROLOGUE times. */
3386 vect_get_known_peeling_cost (loop_vec_info loop_vinfo, int peel_iters_prologue,
3387 int *peel_iters_epilogue,
3388 stmt_vector_for_cost *scalar_cost_vec,
3389 stmt_vector_for_cost *prologue_cost_vec,
3390 stmt_vector_for_cost *epilogue_cost_vec)
3392 int retval = 0;
3393 int assumed_vf = vect_vf_for_cost (loop_vinfo);
3395 if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
3397 *peel_iters_epilogue = assumed_vf / 2;
3398 if (dump_enabled_p ())
3399 dump_printf_loc (MSG_NOTE, vect_location,
3400 "cost model: epilogue peel iters set to vf/2 "
3401 "because loop iterations are unknown .\n");
3403 /* If peeled iterations are known but number of scalar loop
3404 iterations are unknown, count a taken branch per peeled loop. */
3405 retval = record_stmt_cost (prologue_cost_vec, 1, cond_branch_taken,
3406 NULL, 0, vect_prologue);
3407 retval += record_stmt_cost (epilogue_cost_vec, 1, cond_branch_taken,
3408 NULL, 0, vect_epilogue);
3410 else
3412 int niters = LOOP_VINFO_INT_NITERS (loop_vinfo);
3413 peel_iters_prologue = niters < peel_iters_prologue ?
3414 niters : peel_iters_prologue;
3415 *peel_iters_epilogue = (niters - peel_iters_prologue) % assumed_vf;
3416 /* If we need to peel for gaps, but no peeling is required, we have to
3417 peel VF iterations. */
3418 if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo) && !*peel_iters_epilogue)
3419 *peel_iters_epilogue = assumed_vf;
3422 stmt_info_for_cost *si;
3423 int j;
3424 if (peel_iters_prologue)
3425 FOR_EACH_VEC_ELT (*scalar_cost_vec, j, si)
3426 retval += record_stmt_cost (prologue_cost_vec,
3427 si->count * peel_iters_prologue,
3428 si->kind, si->stmt_info, si->misalign,
3429 vect_prologue);
3430 if (*peel_iters_epilogue)
3431 FOR_EACH_VEC_ELT (*scalar_cost_vec, j, si)
3432 retval += record_stmt_cost (epilogue_cost_vec,
3433 si->count * *peel_iters_epilogue,
3434 si->kind, si->stmt_info, si->misalign,
3435 vect_epilogue);
3437 return retval;
3440 /* Function vect_estimate_min_profitable_iters
3442 Return the number of iterations required for the vector version of the
3443 loop to be profitable relative to the cost of the scalar version of the
3444 loop.
3446 *RET_MIN_PROFITABLE_NITERS is a cost model profitability threshold
3447 of iterations for vectorization. -1 value means loop vectorization
3448 is not profitable. This returned value may be used for dynamic
3449 profitability check.
3451 *RET_MIN_PROFITABLE_ESTIMATE is a profitability threshold to be used
3452 for static check against estimated number of iterations. */
3454 static void
3455 vect_estimate_min_profitable_iters (loop_vec_info loop_vinfo,
3456 int *ret_min_profitable_niters,
3457 int *ret_min_profitable_estimate)
3459 int min_profitable_iters;
3460 int min_profitable_estimate;
3461 int peel_iters_prologue;
3462 int peel_iters_epilogue;
3463 unsigned vec_inside_cost = 0;
3464 int vec_outside_cost = 0;
3465 unsigned vec_prologue_cost = 0;
3466 unsigned vec_epilogue_cost = 0;
3467 int scalar_single_iter_cost = 0;
3468 int scalar_outside_cost = 0;
3469 int assumed_vf = vect_vf_for_cost (loop_vinfo);
3470 int npeel = LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo);
3471 void *target_cost_data = LOOP_VINFO_TARGET_COST_DATA (loop_vinfo);
3473 /* Cost model disabled. */
3474 if (unlimited_cost_model (LOOP_VINFO_LOOP (loop_vinfo)))
3476 if (dump_enabled_p ())
3477 dump_printf_loc (MSG_NOTE, vect_location, "cost model disabled.\n");
3478 *ret_min_profitable_niters = 0;
3479 *ret_min_profitable_estimate = 0;
3480 return;
3483 /* Requires loop versioning tests to handle misalignment. */
3484 if (LOOP_REQUIRES_VERSIONING_FOR_ALIGNMENT (loop_vinfo))
3486 /* FIXME: Make cost depend on complexity of individual check. */
3487 unsigned len = LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).length ();
3488 (void) add_stmt_cost (target_cost_data, len, vector_stmt, NULL, 0,
3489 vect_prologue);
3490 if (dump_enabled_p ())
3491 dump_printf (MSG_NOTE,
3492 "cost model: Adding cost of checks for loop "
3493 "versioning to treat misalignment.\n");
3496 /* Requires loop versioning with alias checks. */
3497 if (LOOP_REQUIRES_VERSIONING_FOR_ALIAS (loop_vinfo))
3499 /* FIXME: Make cost depend on complexity of individual check. */
3500 unsigned len = LOOP_VINFO_COMP_ALIAS_DDRS (loop_vinfo).length ();
3501 (void) add_stmt_cost (target_cost_data, len, vector_stmt, NULL, 0,
3502 vect_prologue);
3503 len = LOOP_VINFO_CHECK_UNEQUAL_ADDRS (loop_vinfo).length ();
3504 if (len)
3505 /* Count LEN - 1 ANDs and LEN comparisons. */
3506 (void) add_stmt_cost (target_cost_data, len * 2 - 1, scalar_stmt,
3507 NULL, 0, vect_prologue);
3508 len = LOOP_VINFO_LOWER_BOUNDS (loop_vinfo).length ();
3509 if (len)
3511 /* Count LEN - 1 ANDs and LEN comparisons. */
3512 unsigned int nstmts = len * 2 - 1;
3513 /* +1 for each bias that needs adding. */
3514 for (unsigned int i = 0; i < len; ++i)
3515 if (!LOOP_VINFO_LOWER_BOUNDS (loop_vinfo)[i].unsigned_p)
3516 nstmts += 1;
3517 (void) add_stmt_cost (target_cost_data, nstmts, scalar_stmt,
3518 NULL, 0, vect_prologue);
3520 if (dump_enabled_p ())
3521 dump_printf (MSG_NOTE,
3522 "cost model: Adding cost of checks for loop "
3523 "versioning aliasing.\n");
3526 /* Requires loop versioning with niter checks. */
3527 if (LOOP_REQUIRES_VERSIONING_FOR_NITERS (loop_vinfo))
3529 /* FIXME: Make cost depend on complexity of individual check. */
3530 (void) add_stmt_cost (target_cost_data, 1, vector_stmt, NULL, 0,
3531 vect_prologue);
3532 if (dump_enabled_p ())
3533 dump_printf (MSG_NOTE,
3534 "cost model: Adding cost of checks for loop "
3535 "versioning niters.\n");
3538 if (LOOP_REQUIRES_VERSIONING (loop_vinfo))
3539 (void) add_stmt_cost (target_cost_data, 1, cond_branch_taken, NULL, 0,
3540 vect_prologue);
3542 /* Count statements in scalar loop. Using this as scalar cost for a single
3543 iteration for now.
3545 TODO: Add outer loop support.
3547 TODO: Consider assigning different costs to different scalar
3548 statements. */
3550 scalar_single_iter_cost
3551 = LOOP_VINFO_SINGLE_SCALAR_ITERATION_COST (loop_vinfo);
3553 /* Add additional cost for the peeled instructions in prologue and epilogue
3554 loop. (For fully-masked loops there will be no peeling.)
3556 FORNOW: If we don't know the value of peel_iters for prologue or epilogue
3557 at compile-time - we assume it's vf/2 (the worst would be vf-1).
3559 TODO: Build an expression that represents peel_iters for prologue and
3560 epilogue to be used in a run-time test. */
3562 if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
3564 peel_iters_prologue = 0;
3565 peel_iters_epilogue = 0;
3567 if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo))
3569 /* We need to peel exactly one iteration. */
3570 peel_iters_epilogue += 1;
3571 stmt_info_for_cost *si;
3572 int j;
3573 FOR_EACH_VEC_ELT (LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo),
3574 j, si)
3575 (void) add_stmt_cost (target_cost_data, si->count,
3576 si->kind, si->stmt_info, si->misalign,
3577 vect_epilogue);
3580 else if (npeel < 0)
3582 peel_iters_prologue = assumed_vf / 2;
3583 if (dump_enabled_p ())
3584 dump_printf (MSG_NOTE, "cost model: "
3585 "prologue peel iters set to vf/2.\n");
3587 /* If peeling for alignment is unknown, loop bound of main loop becomes
3588 unknown. */
3589 peel_iters_epilogue = assumed_vf / 2;
3590 if (dump_enabled_p ())
3591 dump_printf (MSG_NOTE, "cost model: "
3592 "epilogue peel iters set to vf/2 because "
3593 "peeling for alignment is unknown.\n");
3595 /* If peeled iterations are unknown, count a taken branch and a not taken
3596 branch per peeled loop. Even if scalar loop iterations are known,
3597 vector iterations are not known since peeled prologue iterations are
3598 not known. Hence guards remain the same. */
3599 (void) add_stmt_cost (target_cost_data, 1, cond_branch_taken,
3600 NULL, 0, vect_prologue);
3601 (void) add_stmt_cost (target_cost_data, 1, cond_branch_not_taken,
3602 NULL, 0, vect_prologue);
3603 (void) add_stmt_cost (target_cost_data, 1, cond_branch_taken,
3604 NULL, 0, vect_epilogue);
3605 (void) add_stmt_cost (target_cost_data, 1, cond_branch_not_taken,
3606 NULL, 0, vect_epilogue);
3607 stmt_info_for_cost *si;
3608 int j;
3609 FOR_EACH_VEC_ELT (LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo), j, si)
3611 (void) add_stmt_cost (target_cost_data,
3612 si->count * peel_iters_prologue,
3613 si->kind, si->stmt_info, si->misalign,
3614 vect_prologue);
3615 (void) add_stmt_cost (target_cost_data,
3616 si->count * peel_iters_epilogue,
3617 si->kind, si->stmt_info, si->misalign,
3618 vect_epilogue);
3621 else
3623 stmt_vector_for_cost prologue_cost_vec, epilogue_cost_vec;
3624 stmt_info_for_cost *si;
3625 int j;
3626 void *data = LOOP_VINFO_TARGET_COST_DATA (loop_vinfo);
3628 prologue_cost_vec.create (2);
3629 epilogue_cost_vec.create (2);
3630 peel_iters_prologue = npeel;
3632 (void) vect_get_known_peeling_cost (loop_vinfo, peel_iters_prologue,
3633 &peel_iters_epilogue,
3634 &LOOP_VINFO_SCALAR_ITERATION_COST
3635 (loop_vinfo),
3636 &prologue_cost_vec,
3637 &epilogue_cost_vec);
3639 FOR_EACH_VEC_ELT (prologue_cost_vec, j, si)
3640 (void) add_stmt_cost (data, si->count, si->kind, si->stmt_info,
3641 si->misalign, vect_prologue);
3643 FOR_EACH_VEC_ELT (epilogue_cost_vec, j, si)
3644 (void) add_stmt_cost (data, si->count, si->kind, si->stmt_info,
3645 si->misalign, vect_epilogue);
3647 prologue_cost_vec.release ();
3648 epilogue_cost_vec.release ();
3651 /* FORNOW: The scalar outside cost is incremented in one of the
3652 following ways:
3654 1. The vectorizer checks for alignment and aliasing and generates
3655 a condition that allows dynamic vectorization. A cost model
3656 check is ANDED with the versioning condition. Hence scalar code
3657 path now has the added cost of the versioning check.
3659 if (cost > th & versioning_check)
3660 jmp to vector code
3662 Hence run-time scalar is incremented by not-taken branch cost.
3664 2. The vectorizer then checks if a prologue is required. If the
3665 cost model check was not done before during versioning, it has to
3666 be done before the prologue check.
3668 if (cost <= th)
3669 prologue = scalar_iters
3670 if (prologue == 0)
3671 jmp to vector code
3672 else
3673 execute prologue
3674 if (prologue == num_iters)
3675 go to exit
3677 Hence the run-time scalar cost is incremented by a taken branch,
3678 plus a not-taken branch, plus a taken branch cost.
3680 3. The vectorizer then checks if an epilogue is required. If the
3681 cost model check was not done before during prologue check, it
3682 has to be done with the epilogue check.
3684 if (prologue == 0)
3685 jmp to vector code
3686 else
3687 execute prologue
3688 if (prologue == num_iters)
3689 go to exit
3690 vector code:
3691 if ((cost <= th) | (scalar_iters-prologue-epilogue == 0))
3692 jmp to epilogue
3694 Hence the run-time scalar cost should be incremented by 2 taken
3695 branches.
3697 TODO: The back end may reorder the BBS's differently and reverse
3698 conditions/branch directions. Change the estimates below to
3699 something more reasonable. */
3701 /* If the number of iterations is known and we do not do versioning, we can
3702 decide whether to vectorize at compile time. Hence the scalar version
3703 do not carry cost model guard costs. */
3704 if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
3705 || LOOP_REQUIRES_VERSIONING (loop_vinfo))
3707 /* Cost model check occurs at versioning. */
3708 if (LOOP_REQUIRES_VERSIONING (loop_vinfo))
3709 scalar_outside_cost += vect_get_stmt_cost (cond_branch_not_taken);
3710 else
3712 /* Cost model check occurs at prologue generation. */
3713 if (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) < 0)
3714 scalar_outside_cost += 2 * vect_get_stmt_cost (cond_branch_taken)
3715 + vect_get_stmt_cost (cond_branch_not_taken);
3716 /* Cost model check occurs at epilogue generation. */
3717 else
3718 scalar_outside_cost += 2 * vect_get_stmt_cost (cond_branch_taken);
3722 /* Complete the target-specific cost calculations. */
3723 finish_cost (LOOP_VINFO_TARGET_COST_DATA (loop_vinfo), &vec_prologue_cost,
3724 &vec_inside_cost, &vec_epilogue_cost);
3726 vec_outside_cost = (int)(vec_prologue_cost + vec_epilogue_cost);
3728 if (dump_enabled_p ())
3730 dump_printf_loc (MSG_NOTE, vect_location, "Cost model analysis: \n");
3731 dump_printf (MSG_NOTE, " Vector inside of loop cost: %d\n",
3732 vec_inside_cost);
3733 dump_printf (MSG_NOTE, " Vector prologue cost: %d\n",
3734 vec_prologue_cost);
3735 dump_printf (MSG_NOTE, " Vector epilogue cost: %d\n",
3736 vec_epilogue_cost);
3737 dump_printf (MSG_NOTE, " Scalar iteration cost: %d\n",
3738 scalar_single_iter_cost);
3739 dump_printf (MSG_NOTE, " Scalar outside cost: %d\n",
3740 scalar_outside_cost);
3741 dump_printf (MSG_NOTE, " Vector outside cost: %d\n",
3742 vec_outside_cost);
3743 dump_printf (MSG_NOTE, " prologue iterations: %d\n",
3744 peel_iters_prologue);
3745 dump_printf (MSG_NOTE, " epilogue iterations: %d\n",
3746 peel_iters_epilogue);
3749 /* Calculate number of iterations required to make the vector version
3750 profitable, relative to the loop bodies only. The following condition
3751 must hold true:
3752 SIC * niters + SOC > VIC * ((niters - NPEEL) / VF) + VOC
3753 where
3754 SIC = scalar iteration cost, VIC = vector iteration cost,
3755 VOC = vector outside cost, VF = vectorization factor,
3756 NPEEL = prologue iterations + epilogue iterations,
3757 SOC = scalar outside cost for run time cost model check. */
3759 int saving_per_viter = (scalar_single_iter_cost * assumed_vf
3760 - vec_inside_cost);
3761 if (saving_per_viter <= 0)
3763 if (LOOP_VINFO_LOOP (loop_vinfo)->force_vectorize)
3764 warning_at (vect_location.get_location_t (), OPT_Wopenmp_simd,
3765 "vectorization did not happen for a simd loop");
3767 if (dump_enabled_p ())
3768 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3769 "cost model: the vector iteration cost = %d "
3770 "divided by the scalar iteration cost = %d "
3771 "is greater or equal to the vectorization factor = %d"
3772 ".\n",
3773 vec_inside_cost, scalar_single_iter_cost, assumed_vf);
3774 *ret_min_profitable_niters = -1;
3775 *ret_min_profitable_estimate = -1;
3776 return;
3779 /* ??? The "if" arm is written to handle all cases; see below for what
3780 we would do for !LOOP_VINFO_FULLY_MASKED_P. */
3781 if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
3783 /* Rewriting the condition above in terms of the number of
3784 vector iterations (vniters) rather than the number of
3785 scalar iterations (niters) gives:
3787 SIC * (vniters * VF + NPEEL) + SOC > VIC * vniters + VOC
3789 <==> vniters * (SIC * VF - VIC) > VOC - SIC * NPEEL - SOC
3791 For integer N, X and Y when X > 0:
3793 N * X > Y <==> N >= (Y /[floor] X) + 1. */
3794 int outside_overhead = (vec_outside_cost
3795 - scalar_single_iter_cost * peel_iters_prologue
3796 - scalar_single_iter_cost * peel_iters_epilogue
3797 - scalar_outside_cost);
3798 /* We're only interested in cases that require at least one
3799 vector iteration. */
3800 int min_vec_niters = 1;
3801 if (outside_overhead > 0)
3802 min_vec_niters = outside_overhead / saving_per_viter + 1;
3804 if (dump_enabled_p ())
3805 dump_printf (MSG_NOTE, " Minimum number of vector iterations: %d\n",
3806 min_vec_niters);
3808 if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
3810 /* Now that we know the minimum number of vector iterations,
3811 find the minimum niters for which the scalar cost is larger:
3813 SIC * niters > VIC * vniters + VOC - SOC
3815 We know that the minimum niters is no more than
3816 vniters * VF + NPEEL, but it might be (and often is) less
3817 than that if a partial vector iteration is cheaper than the
3818 equivalent scalar code. */
3819 int threshold = (vec_inside_cost * min_vec_niters
3820 + vec_outside_cost
3821 - scalar_outside_cost);
3822 if (threshold <= 0)
3823 min_profitable_iters = 1;
3824 else
3825 min_profitable_iters = threshold / scalar_single_iter_cost + 1;
3827 else
3828 /* Convert the number of vector iterations into a number of
3829 scalar iterations. */
3830 min_profitable_iters = (min_vec_niters * assumed_vf
3831 + peel_iters_prologue
3832 + peel_iters_epilogue);
3834 else
3836 min_profitable_iters = ((vec_outside_cost - scalar_outside_cost)
3837 * assumed_vf
3838 - vec_inside_cost * peel_iters_prologue
3839 - vec_inside_cost * peel_iters_epilogue);
3840 if (min_profitable_iters <= 0)
3841 min_profitable_iters = 0;
3842 else
3844 min_profitable_iters /= saving_per_viter;
3846 if ((scalar_single_iter_cost * assumed_vf * min_profitable_iters)
3847 <= (((int) vec_inside_cost * min_profitable_iters)
3848 + (((int) vec_outside_cost - scalar_outside_cost)
3849 * assumed_vf)))
3850 min_profitable_iters++;
3854 if (dump_enabled_p ())
3855 dump_printf (MSG_NOTE,
3856 " Calculated minimum iters for profitability: %d\n",
3857 min_profitable_iters);
3859 if (!LOOP_VINFO_FULLY_MASKED_P (loop_vinfo)
3860 && min_profitable_iters < (assumed_vf + peel_iters_prologue))
3861 /* We want the vectorized loop to execute at least once. */
3862 min_profitable_iters = assumed_vf + peel_iters_prologue;
3864 if (dump_enabled_p ())
3865 dump_printf_loc (MSG_NOTE, vect_location,
3866 " Runtime profitability threshold = %d\n",
3867 min_profitable_iters);
3869 *ret_min_profitable_niters = min_profitable_iters;
3871 /* Calculate number of iterations required to make the vector version
3872 profitable, relative to the loop bodies only.
3874 Non-vectorized variant is SIC * niters and it must win over vector
3875 variant on the expected loop trip count. The following condition must hold true:
3876 SIC * niters > VIC * ((niters - NPEEL) / VF) + VOC + SOC */
3878 if (vec_outside_cost <= 0)
3879 min_profitable_estimate = 0;
3880 else if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
3882 /* This is a repeat of the code above, but with + SOC rather
3883 than - SOC. */
3884 int outside_overhead = (vec_outside_cost
3885 - scalar_single_iter_cost * peel_iters_prologue
3886 - scalar_single_iter_cost * peel_iters_epilogue
3887 + scalar_outside_cost);
3888 int min_vec_niters = 1;
3889 if (outside_overhead > 0)
3890 min_vec_niters = outside_overhead / saving_per_viter + 1;
3892 if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
3894 int threshold = (vec_inside_cost * min_vec_niters
3895 + vec_outside_cost
3896 + scalar_outside_cost);
3897 min_profitable_estimate = threshold / scalar_single_iter_cost + 1;
3899 else
3900 min_profitable_estimate = (min_vec_niters * assumed_vf
3901 + peel_iters_prologue
3902 + peel_iters_epilogue);
3904 else
3906 min_profitable_estimate = ((vec_outside_cost + scalar_outside_cost)
3907 * assumed_vf
3908 - vec_inside_cost * peel_iters_prologue
3909 - vec_inside_cost * peel_iters_epilogue)
3910 / ((scalar_single_iter_cost * assumed_vf)
3911 - vec_inside_cost);
3913 min_profitable_estimate = MAX (min_profitable_estimate, min_profitable_iters);
3914 if (dump_enabled_p ())
3915 dump_printf_loc (MSG_NOTE, vect_location,
3916 " Static estimate profitability threshold = %d\n",
3917 min_profitable_estimate);
3919 *ret_min_profitable_estimate = min_profitable_estimate;
3922 /* Writes into SEL a mask for a vec_perm, equivalent to a vec_shr by OFFSET
3923 vector elements (not bits) for a vector with NELT elements. */
3924 static void
3925 calc_vec_perm_mask_for_shift (unsigned int offset, unsigned int nelt,
3926 vec_perm_builder *sel)
3928 /* The encoding is a single stepped pattern. Any wrap-around is handled
3929 by vec_perm_indices. */
3930 sel->new_vector (nelt, 1, 3);
3931 for (unsigned int i = 0; i < 3; i++)
3932 sel->quick_push (i + offset);
3935 /* Checks whether the target supports whole-vector shifts for vectors of mode
3936 MODE. This is the case if _either_ the platform handles vec_shr_optab, _or_
3937 it supports vec_perm_const with masks for all necessary shift amounts. */
3938 static bool
3939 have_whole_vector_shift (machine_mode mode)
3941 if (optab_handler (vec_shr_optab, mode) != CODE_FOR_nothing)
3942 return true;
3944 /* Variable-length vectors should be handled via the optab. */
3945 unsigned int nelt;
3946 if (!GET_MODE_NUNITS (mode).is_constant (&nelt))
3947 return false;
3949 vec_perm_builder sel;
3950 vec_perm_indices indices;
3951 for (unsigned int i = nelt / 2; i >= 1; i /= 2)
3953 calc_vec_perm_mask_for_shift (i, nelt, &sel);
3954 indices.new_vector (sel, 2, nelt);
3955 if (!can_vec_perm_const_p (mode, indices, false))
3956 return false;
3958 return true;
3961 /* TODO: Close dependency between vect_model_*_cost and vectorizable_*
3962 functions. Design better to avoid maintenance issues. */
3964 /* Function vect_model_reduction_cost.
3966 Models cost for a reduction operation, including the vector ops
3967 generated within the strip-mine loop, the initial definition before
3968 the loop, and the epilogue code that must be generated. */
3970 static void
3971 vect_model_reduction_cost (stmt_vec_info stmt_info, internal_fn reduc_fn,
3972 int ncopies, stmt_vector_for_cost *cost_vec)
/* Every cost is recorded in COST_VEC through record_stmt_cost.  The local
   totals PROLOGUE_COST, INSIDE_COST and EPILOGUE_COST are only used for
   the dump message at the end of the function.  */
3974 int prologue_cost = 0, epilogue_cost = 0, inside_cost;
3975 enum tree_code code;
3976 optab optab;
3977 tree vectype;
3978 machine_mode mode;
3979 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
3980 struct loop *loop = NULL;
/* LOOP stays NULL when STMT_INFO has no loop_vec_info; the nesting test
   further down is skipped in that case.  */
3982 if (loop_vinfo)
3983 loop = LOOP_VINFO_LOOP (loop_vinfo);
3985 /* Condition reductions generate two reductions in the loop. */
3986 vect_reduction_type reduction_type
3987 = STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info);
3988 if (reduction_type == COND_REDUCTION)
3989 ncopies *= 2;
3991 vectype = STMT_VINFO_VECTYPE (stmt_info);
3992 mode = TYPE_MODE (vectype);
3993 stmt_vec_info orig_stmt_info = vect_orig_stmt (stmt_info);
/* The tree code is read from ORIG_STMT_INFO's statement rather than
   from STMT_INFO's.  */
3995 code = gimple_assign_rhs_code (orig_stmt_info->stmt);
3997 if (reduction_type == EXTRACT_LAST_REDUCTION
3998 || reduction_type == FOLD_LEFT_REDUCTION)
4000 /* No extra instructions needed in the prologue. */
4001 prologue_cost = 0;
4003 if (reduction_type == EXTRACT_LAST_REDUCTION || reduc_fn != IFN_LAST)
4004 /* Count one reduction-like operation per vector. */
4005 inside_cost = record_stmt_cost (cost_vec, ncopies, vec_to_scalar,
4006 stmt_info, 0, vect_body);
4007 else
4009 /* Use NELEMENTS extracts and NELEMENTS scalar ops. */
4010 unsigned int nelements = ncopies * vect_nunits_for_cost (vectype);
4011 inside_cost = record_stmt_cost (cost_vec, nelements,
4012 vec_to_scalar, stmt_info, 0,
4013 vect_body);
4014 inside_cost += record_stmt_cost (cost_vec, nelements,
4015 scalar_stmt, stmt_info, 0,
4016 vect_body);
4019 else
4021 /* Add in cost for initial definition.
4022 For cond reduction we have four vectors: initial index, step,
4023 initial result of the data reduction, initial value of the index
4024 reduction. */
4025 int prologue_stmts = reduction_type == COND_REDUCTION ? 4 : 1;
4026 prologue_cost += record_stmt_cost (cost_vec, prologue_stmts,
4027 scalar_to_vec, stmt_info, 0,
4028 vect_prologue);
4030 /* Cost of reduction op inside loop. */
4031 inside_cost = record_stmt_cost (cost_vec, ncopies, vector_stmt,
4032 stmt_info, 0, vect_body);
4035 /* Determine cost of epilogue code.
4037 We have a reduction operator that will reduce the vector in one statement.
4038 Also requires scalar extract. */
4040 if (!loop || !nested_in_vect_loop_p (loop, orig_stmt_info))
4042 if (reduc_fn != IFN_LAST)
4044 if (reduction_type == COND_REDUCTION)
4046 /* An EQ stmt and a COND_EXPR stmt. */
4047 epilogue_cost += record_stmt_cost (cost_vec, 2,
4048 vector_stmt, stmt_info, 0,
4049 vect_epilogue);
4050 /* Reduction of the max index and a reduction of the found
4051 values. */
4052 epilogue_cost += record_stmt_cost (cost_vec, 2,
4053 vec_to_scalar, stmt_info, 0,
4054 vect_epilogue);
4055 /* A broadcast of the max value. */
4056 epilogue_cost += record_stmt_cost (cost_vec, 1,
4057 scalar_to_vec, stmt_info, 0,
4058 vect_epilogue);
4060 else
/* One vector statement for the reduction itself plus one scalar
   extract.  */
4062 epilogue_cost += record_stmt_cost (cost_vec, 1, vector_stmt,
4063 stmt_info, 0, vect_epilogue);
4064 epilogue_cost += record_stmt_cost (cost_vec, 1,
4065 vec_to_scalar, stmt_info, 0,
4066 vect_epilogue);
4069 else if (reduction_type == COND_REDUCTION)
4071 unsigned estimated_nunits = vect_nunits_for_cost (vectype);
4072 /* Extraction of scalar elements. */
4073 epilogue_cost += record_stmt_cost (cost_vec,
4074 2 * estimated_nunits,
4075 vec_to_scalar, stmt_info, 0,
4076 vect_epilogue);
4077 /* Scalar max reductions via COND_EXPR / MAX_EXPR. */
4078 epilogue_cost += record_stmt_cost (cost_vec,
4079 2 * estimated_nunits - 3,
4080 scalar_stmt, stmt_info, 0,
4081 vect_epilogue);
4083 else if (reduction_type == EXTRACT_LAST_REDUCTION
4084 || reduction_type == FOLD_LEFT_REDUCTION)
4085 /* No extra instructions needed in the epilogue. */
4087 else
/* No reduction function is available and the reduction type is none
   of the special ones handled above: derive the element count from
   the vector size and the size of the scalar result.  */
4089 int vec_size_in_bits = tree_to_uhwi (TYPE_SIZE (vectype));
4090 tree bitsize =
4091 TYPE_SIZE (TREE_TYPE (gimple_assign_lhs (orig_stmt_info->stmt)));
4092 int element_bitsize = tree_to_uhwi (bitsize);
4093 int nelements = vec_size_in_bits / element_bitsize;
/* A COND_EXPR reduction is looked up through the MAX_EXPR optab.  */
4095 if (code == COND_EXPR)
4096 code = MAX_EXPR;
4098 optab = optab_for_tree_code (code, vectype, optab_default);
4100 /* We have a whole vector shift available. */
4101 if (optab != unknown_optab
4102 && VECTOR_MODE_P (mode)
4103 && optab_handler (optab, mode) != CODE_FOR_nothing
4104 && have_whole_vector_shift (mode))
4106 /* Final reduction via vector shifts and the reduction operator.
4107 Also requires scalar extract. */
4108 epilogue_cost += record_stmt_cost (cost_vec,
4109 exact_log2 (nelements) * 2,
4110 vector_stmt, stmt_info, 0,
4111 vect_epilogue);
4112 epilogue_cost += record_stmt_cost (cost_vec, 1,
4113 vec_to_scalar, stmt_info, 0,
4114 vect_epilogue);
4116 else
4117 /* Use extracts and reduction op for final reduction. For N
4118 elements, we have N extracts and N-1 reduction ops.
   NOTE(review): all 2N-1 operations are recorded with kind
   vector_stmt here, unlike the branches above that use
   vec_to_scalar for extracts - confirm this is intended.  */
4119 epilogue_cost += record_stmt_cost (cost_vec,
4120 nelements + nelements - 1,
4121 vector_stmt, stmt_info, 0,
4122 vect_epilogue);
/* Report the three totals when dumping is enabled.  */
4126 if (dump_enabled_p ())
4127 dump_printf (MSG_NOTE,
4128 "vect_model_reduction_cost: inside_cost = %d, "
4129 "prologue_cost = %d, epilogue_cost = %d .\n", inside_cost,
4130 prologue_cost, epilogue_cost);
4134 /* Function vect_model_induction_cost.
4136 Models cost for induction operations.

   STMT_INFO is the statement the costs are attributed to, NCOPIES is
   the number of vector statements counted for the loop body, and
   COST_VEC receives the recorded costs.  Nothing is recorded when
   STMT_INFO is a pure SLP statement.  */
4138 static void
4139 vect_model_induction_cost (stmt_vec_info stmt_info, int ncopies,
4140 stmt_vector_for_cost *cost_vec)
4142 unsigned inside_cost, prologue_cost;
/* Pure SLP statements are skipped here without recording any cost.  */
4144 if (PURE_SLP_STMT (stmt_info))
4145 return;
4147 /* loop cost for vec_loop. */
4148 inside_cost = record_stmt_cost (cost_vec, ncopies, vector_stmt,
4149 stmt_info, 0, vect_body);
4151 /* prologue cost for vec_init and vec_step. */
4152 prologue_cost = record_stmt_cost (cost_vec, 2, scalar_to_vec,
4153 stmt_info, 0, vect_prologue);
/* NOTE(review): INSIDE_COST and PROLOGUE_COST are unsigned but are
   printed with %d below.  */
4155 if (dump_enabled_p ())
4156 dump_printf_loc (MSG_NOTE, vect_location,
4157 "vect_model_induction_cost: inside_cost = %d, "
4158 "prologue_cost = %d .\n", inside_cost, prologue_cost);
4163 /* Function get_initial_def_for_reduction
4165 Input:
4166 STMT_VINFO - a stmt that performs a reduction operation in the loop.
4167 INIT_VAL - the initial value of the reduction variable
4169 Output:
4170 ADJUSTMENT_DEF - a tree that holds a value to be added to the final result
4171 of the reduction (used for adjusting the epilog - see below).
   May be NULL, in which case Option2 below is used.  When it is
   nonnull it is set to INIT_VAL for the add/mult/bitwise codes and
   to NULL_TREE for min/max/cond_expr.
4172 Return a vector variable, initialized according to the operation that
4173 STMT_VINFO performs. This vector will be used as the initial value
4174 of the vector of partial results.
4176 Option1 (adjust in epilog): Initialize the vector as follows:
4177 add/bit or/xor: [0,0,...,0,0]
4178 mult: [1,1,...,1,1]; bit and: [-1,-1,...,-1,-1]
4179 min/max/cond_expr: [init_val,init_val,..,init_val,init_val]
4180 and when necessary (e.g. add/mult case) let the caller know
4181 that it needs to adjust the result by init_val.
4183 Option2: Initialize the vector as follows:
4184 add/bit or/xor: [init_val,0,0,...,0]
4185 mult: [init_val,1,1,...,1]; bit and: [init_val,-1,-1,...,-1]
4186 min/max/cond_expr: [init_val,init_val,...,init_val]
4187 and no adjustments are needed.
4189 For example, for the following code:
4191 s = init_val;
4192 for (i=0;i<n;i++)
4193 s = s + a[i];
4195 STMT_VINFO is 's = s + a[i]', and the reduction variable is 's'.
4196 For a vector of 4 units, we want to return either [init_val,0,0,0],
4197 or [0,0,0,0] and let the caller know that it needs to adjust
4198 the result at the end by 'init_val'.
4200 FORNOW, we are using the 'adjust in epilog' scheme, because this way the
4201 initialization vector is simpler (same element in all entries), if
4202 ADJUSTMENT_DEF is not NULL, and Option2 otherwise.
4204 A cost model should help decide between these two schemes. */
4206 tree
4207 get_initial_def_for_reduction (stmt_vec_info stmt_vinfo, tree init_val,
4208 tree *adjustment_def)
4210 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_vinfo);
4211 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
4212 tree scalar_type = TREE_TYPE (init_val);
4213 tree vectype = get_vectype_for_scalar_type (scalar_type);
4214 enum tree_code code = gimple_assign_rhs_code (stmt_vinfo->stmt);
4215 tree def_for_init;
4216 tree init_def;
/* Fill values for the non-INIT_VAL elements; they default to zero and
   are overridden for MULT_EXPR and BIT_AND_EXPR below.  */
4217 REAL_VALUE_TYPE real_init_val = dconst0;
4218 int int_init_val = 0;
/* Statements built while constructing the vector; inserted on the loop
   preheader edge before returning.  */
4219 gimple_seq stmts = NULL;
4221 gcc_assert (vectype);
4223 gcc_assert (POINTER_TYPE_P (scalar_type) || INTEGRAL_TYPE_P (scalar_type)
4224 || SCALAR_FLOAT_TYPE_P (scalar_type));
4226 gcc_assert (nested_in_vect_loop_p (loop, stmt_vinfo)
4227 || loop == (gimple_bb (stmt_vinfo->stmt))->loop_father);
4229 vect_reduction_type reduction_type
4230 = STMT_VINFO_VEC_REDUCTION_TYPE (stmt_vinfo);
4232 switch (code)
/* These codes share one path: a fill value of 0, 1 or -1 depending on
   CODE, with INIT_VAL either reported through ADJUSTMENT_DEF or placed
   in the first element.  */
4234 case WIDEN_SUM_EXPR:
4235 case DOT_PROD_EXPR:
4236 case SAD_EXPR:
4237 case PLUS_EXPR:
4238 case MINUS_EXPR:
4239 case BIT_IOR_EXPR:
4240 case BIT_XOR_EXPR:
4241 case MULT_EXPR:
4242 case BIT_AND_EXPR:
4244 /* ADJUSTMENT_DEF is NULL when called from
4245 vect_create_epilog_for_reduction to vectorize double reduction. */
4246 if (adjustment_def)
4247 *adjustment_def = init_val;
4249 if (code == MULT_EXPR)
4251 real_init_val = dconst1;
4252 int_init_val = 1;
/* All bits set is the fill value for bitwise and.  */
4255 if (code == BIT_AND_EXPR)
4256 int_init_val = -1;
4258 if (SCALAR_FLOAT_TYPE_P (scalar_type))
4259 def_for_init = build_real (scalar_type, real_init_val);
4260 else
4261 def_for_init = build_int_cst (scalar_type, int_init_val);
4263 if (adjustment_def)
4264 /* Option1: the first element is '0' or '1' as well. */
4265 init_def = gimple_build_vector_from_val (&stmts, vectype,
4266 def_for_init);
4267 else if (!TYPE_VECTOR_SUBPARTS (vectype).is_constant ())
4269 /* Option2 (variable length): the first element is INIT_VAL. */
4270 init_def = gimple_build_vector_from_val (&stmts, vectype,
4271 def_for_init);
4272 init_def = gimple_build (&stmts, CFN_VEC_SHL_INSERT,
4273 vectype, init_def, init_val);
4275 else
4277 /* Option2: the first element is INIT_VAL. */
4278 tree_vector_builder elts (vectype, 1, 2);
4279 elts.quick_push (init_val);
4280 elts.quick_push (def_for_init);
4281 init_def = gimple_build_vector (&stmts, &elts);
4284 break;
4286 case MIN_EXPR:
4287 case MAX_EXPR:
4288 case COND_EXPR:
/* No epilog adjustment is ever requested for these codes.  */
4290 if (adjustment_def)
4292 *adjustment_def = NULL_TREE;
4293 if (reduction_type != COND_REDUCTION
4294 && reduction_type != EXTRACT_LAST_REDUCTION)
4296 init_def = vect_get_vec_def_for_operand (init_val, stmt_vinfo);
4297 break;
/* Otherwise convert INIT_VAL to the vector element type and use it
   for every element.  */
4300 init_val = gimple_convert (&stmts, TREE_TYPE (vectype), init_val);
4301 init_def = gimple_build_vector_from_val (&stmts, vectype, init_val);
4303 break;
4305 default:
4306 gcc_unreachable ();
/* Emit any statements built above on the preheader edge of LOOP.  */
4309 if (stmts)
4310 gsi_insert_seq_on_edge_immediate (loop_preheader_edge (loop), stmts);
4311 return init_def;
4314 /* Get at the initial defs for the reduction PHIs in SLP_NODE.
4315 NUMBER_OF_VECTORS is the number of vector defs to create.
4316 If NEUTRAL_OP is nonnull, introducing extra elements of that
4317 value will not change the result.
   The created vector defs are stored in VEC_OPRNDS.
   If REDUC_CHAIN is true, only the very first element is taken from
   the PHI's preheader argument and every other element is NEUTRAL_OP,
   which must then be nonnull.  */
4319 static void
4320 get_initial_defs_for_reduction (slp_tree slp_node,
4321 vec<tree> *vec_oprnds,
4322 unsigned int number_of_vectors,
4323 bool reduc_chain, tree neutral_op)
4325 vec<stmt_vec_info> stmts = SLP_TREE_SCALAR_STMTS (slp_node);
4326 stmt_vec_info stmt_vinfo = stmts[0];
4327 unsigned HOST_WIDE_INT nunits;
4328 unsigned j, number_of_places_left_in_vector;
4329 tree vector_type;
4330 unsigned int group_size = stmts.length ();
4331 unsigned int i;
4332 struct loop *loop;
4334 vector_type = STMT_VINFO_VECTYPE (stmt_vinfo);
4336 gcc_assert (STMT_VINFO_DEF_TYPE (stmt_vinfo) == vect_reduction_def);
4338 loop = (gimple_bb (stmt_vinfo->stmt))->loop_father;
4339 gcc_assert (loop);
4340 edge pe = loop_preheader_edge (loop);
/* A reduction chain needs a neutral value for the elements that do not
   carry the single initial value.  */
4342 gcc_assert (!reduc_chain || neutral_op);
4344 /* NUMBER_OF_COPIES is the number of times we need to use the same values in
4345 created vectors. It is greater than 1 if unrolling is performed.
   NOTE(review): there is no NUMBER_OF_COPIES variable in this
   function; the name is used here only to describe the layout.
4347 For example, we have two scalar operands, s1 and s2 (e.g., group of
4348 strided accesses of size two), while NUNITS is four (i.e., four scalars
4349 of this type can be packed in a vector). The output vector will contain
4350 two copies of each scalar operand: {s1, s2, s1, s2}. (NUMBER_OF_COPIES
4351 will be 2).
4353 If REDUC_GROUP_SIZE > NUNITS, the scalars will be split into several
4354 vectors containing the operands.
4356 For example, NUNITS is four as before, and the group size is 8
4357 (s1, s2, ..., s8). We will create two vectors {s1, s2, s3, s4} and
4358 {s5, s6, s7, s8}. */
/* When the number of vector elements is not a compile-time constant,
   collect GROUP_SIZE elements at a time instead.  */
4360 if (!TYPE_VECTOR_SUBPARTS (vector_type).is_constant (&nunits))
4361 nunits = group_size;
4363 number_of_places_left_in_vector = nunits;
4364 bool constant_p = true;
4365 tree_vector_builder elts (vector_type, nunits, 1);
4366 elts.quick_grow (nunits);
4367 gimple_seq ctor_seq = NULL;
4368 for (j = 0; j < nunits * number_of_vectors; ++j)
4370 tree op;
/* Cycle through the statements of the group.  */
4371 i = j % group_size;
4372 stmt_vinfo = stmts[i];
4374 /* Get the def before the loop. In reduction chain we have only
4375 one initial value. Else we have as many as PHIs in the group. */
4376 if (reduc_chain)
4377 op = j != 0 ? neutral_op : PHI_ARG_DEF_FROM_EDGE (stmt_vinfo->stmt, pe);
/* The left-hand side below is the number of elements already placed,
   counting the vectors completed so far in VEC_OPRNDS; once that
   reaches GROUP_SIZE, pad with NEUTRAL_OP if there is one.  */
4378 else if (((vec_oprnds->length () + 1) * nunits
4379 - number_of_places_left_in_vector >= group_size)
4380 && neutral_op)
4381 op = neutral_op;
4382 else
4383 op = PHI_ARG_DEF_FROM_EDGE (stmt_vinfo->stmt, pe);
4385 /* Create 'vect_ = {op0,op1,...,opn}'. */
4386 number_of_places_left_in_vector--;
4387 elts[nunits - number_of_places_left_in_vector - 1] = op;
4388 if (!CONSTANT_CLASS_P (op))
4389 constant_p = false;
4391 if (number_of_places_left_in_vector == 0)
4393 tree init;
/* With all-constant elements and no neutral value it is enough for
   the vector length to be a multiple of NUNITS; otherwise ELTS must
   cover the whole vector to be built directly.  */
4394 if (constant_p && !neutral_op
4395 ? multiple_p (TYPE_VECTOR_SUBPARTS (vector_type), nunits)
4396 : known_eq (TYPE_VECTOR_SUBPARTS (vector_type), nunits))
4397 /* Build the vector directly from ELTS. */
4398 init = gimple_build_vector (&ctor_seq, &elts);
4399 else if (neutral_op)
4401 /* Build a vector of the neutral value and shift the
4402 other elements into place. */
4403 init = gimple_build_vector_from_val (&ctor_seq, vector_type,
4404 neutral_op);
4405 int k = nunits;
/* Trailing elements equal to NEUTRAL_OP need no insertion; the
   remaining ones are inserted from last to first.  */
4406 while (k > 0 && elts[k - 1] == neutral_op)
4407 k -= 1;
4408 while (k > 0)
4410 k -= 1;
4411 init = gimple_build (&ctor_seq, CFN_VEC_SHL_INSERT,
4412 vector_type, init, elts[k]);
4415 else
4417 /* First time round, duplicate ELTS to fill the
4418 required number of vectors. */
4419 duplicate_and_interleave (&ctor_seq, vector_type, elts,
4420 number_of_vectors, *vec_oprnds);
4421 break;
4423 vec_oprnds->quick_push (init);
/* Start collecting the next vector.  */
4425 number_of_places_left_in_vector = nunits;
4426 elts.new_vector (vector_type, nunits, 1);
4427 elts.quick_grow (nunits);
4428 constant_p = true;
/* Emit the statements that build the vectors on the preheader edge.  */
4431 if (ctor_seq != NULL)
4432 gsi_insert_seq_on_edge_immediate (pe, ctor_seq);
4436 /* Function vect_create_epilog_for_reduction
4438 Create code at the loop-epilog to finalize the result of a reduction
4439 computation.
4441 VECT_DEFS is list of vector of partial results, i.e., the lhs's of vector
4442 reduction statements.
4443 STMT_INFO is the scalar reduction stmt that is being vectorized.
4444 NCOPIES is > 1 in case the vectorization factor (VF) is bigger than the
4445 number of elements that we can fit in a vectype (nunits). In this case
4446 we have to generate more than one vector stmt - i.e - we need to "unroll"
4447 the vector stmt by a factor VF/nunits. For more details see documentation
4448 in vectorizable_operation.
4449 REDUC_FN is the internal function for the epilog reduction.
4450 REDUCTION_PHIS is a list of the phi-nodes that carry the reduction
4451 computation.
4452 REDUC_INDEX is the index of the operand in the right hand side of the
4453 statement that is defined by REDUCTION_PHI.
4454 DOUBLE_REDUC is TRUE if double reduction phi nodes should be handled.
4455 SLP_NODE is an SLP node containing a group of reduction statements. The
4456 first one in this group is STMT_INFO.
4457 INDUC_VAL is for INTEGER_INDUC_COND_REDUCTION the value to use for the case
4458 when the COND_EXPR is never true in the loop. For MAX_EXPR, it needs to
4459 be smaller than any value of the IV in the loop, for MIN_EXPR larger than
4460 any value of the IV in the loop.
4461 INDUC_CODE is the code for epilog reduction if INTEGER_INDUC_COND_REDUCTION.
4462 NEUTRAL_OP is the value given by neutral_op_for_slp_reduction; it is
4463 null if this is not an SLP reduction.
4465 This function:
4466 1. Creates the reduction def-use cycles: sets the arguments for
4467 REDUCTION_PHIS:
4468 The loop-entry argument is the vectorized initial-value of the reduction.
4469 The loop-latch argument is taken from VECT_DEFS - the vector of partial
4470 sums.
4471 2. "Reduces" each vector of partial results VECT_DEFS into a single result,
4472 by calling the function specified by REDUC_FN if available, or by
4473 other means (whole-vector shifts or a scalar loop).
4474 The function also creates a new phi node at the loop exit to preserve
4475 loop-closed form, as illustrated below.
4477 The flow at the entry to this function:
4479 loop:
4480 vec_def = phi <null, null> # REDUCTION_PHI
4481 VECT_DEF = vector_stmt # vectorized form of STMT_INFO
4482 s_loop = scalar_stmt # (scalar) STMT_INFO
4483 loop_exit:
4484 s_out0 = phi <s_loop> # (scalar) EXIT_PHI
4485 use <s_out0>
4486 use <s_out0>
4488 The above is transformed by this function into:
4490 loop:
4491 vec_def = phi <vec_init, VECT_DEF> # REDUCTION_PHI
4492 VECT_DEF = vector_stmt # vectorized form of STMT_INFO
4493 s_loop = scalar_stmt # (scalar) STMT_INFO
4494 loop_exit:
4495 s_out0 = phi <s_loop> # (scalar) EXIT_PHI
4496 v_out1 = phi <VECT_DEF> # NEW_EXIT_PHI
4497 v_out2 = reduce <v_out1>
4498 s_out3 = extract_field <v_out2, 0>
4499 s_out4 = adjust_result <s_out3>
4500 use <s_out4>
4501 use <s_out4>
4504 static void
4505 vect_create_epilog_for_reduction (vec<tree> vect_defs,
4506 stmt_vec_info stmt_info,
4507 gimple *reduc_def_stmt,
4508 int ncopies, internal_fn reduc_fn,
4509 vec<stmt_vec_info> reduction_phis,
4510 bool double_reduc,
4511 slp_tree slp_node,
4512 slp_instance slp_node_instance,
4513 tree induc_val, enum tree_code induc_code,
4514 tree neutral_op)
4516 stmt_vec_info prev_phi_info;
4517 tree vectype;
4518 machine_mode mode;
4519 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
4520 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo), *outer_loop = NULL;
4521 basic_block exit_bb;
4522 tree scalar_dest;
4523 tree scalar_type;
4524 gimple *new_phi = NULL, *phi;
4525 stmt_vec_info phi_info;
4526 gimple_stmt_iterator exit_gsi;
4527 tree vec_dest;
4528 tree new_temp = NULL_TREE, new_dest, new_name, new_scalar_dest;
4529 gimple *epilog_stmt = NULL;
4530 enum tree_code code = gimple_assign_rhs_code (stmt_info->stmt);
4531 gimple *exit_phi;
4532 tree bitsize;
4533 tree adjustment_def = NULL;
4534 tree vec_initial_def = NULL;
4535 tree expr, def, initial_def = NULL;
4536 tree orig_name, scalar_result;
4537 imm_use_iterator imm_iter, phi_imm_iter;
4538 use_operand_p use_p, phi_use_p;
4539 gimple *use_stmt;
4540 stmt_vec_info reduction_phi_info = NULL;
4541 bool nested_in_vect_loop = false;
4542 auto_vec<gimple *> new_phis;
4543 auto_vec<stmt_vec_info> inner_phis;
4544 int j, i;
4545 auto_vec<tree> scalar_results;
4546 unsigned int group_size = 1, k, ratio;
4547 auto_vec<tree> vec_initial_defs;
4548 auto_vec<gimple *> phis;
4549 bool slp_reduc = false;
4550 bool direct_slp_reduc;
4551 tree new_phi_result;
4552 stmt_vec_info inner_phi = NULL;
4553 tree induction_index = NULL_TREE;
4555 if (slp_node)
4556 group_size = SLP_TREE_SCALAR_STMTS (slp_node).length ();
4558 if (nested_in_vect_loop_p (loop, stmt_info))
4560 outer_loop = loop;
4561 loop = loop->inner;
4562 nested_in_vect_loop = true;
4563 gcc_assert (!slp_node);
4566 vectype = STMT_VINFO_VECTYPE (stmt_info);
4567 gcc_assert (vectype);
4568 mode = TYPE_MODE (vectype);
4570 /* 1. Create the reduction def-use cycle:
4571 Set the arguments of REDUCTION_PHIS, i.e., transform
4573 loop:
4574 vec_def = phi <null, null> # REDUCTION_PHI
4575 VECT_DEF = vector_stmt # vectorized form of STMT
4578 into:
4580 loop:
4581 vec_def = phi <vec_init, VECT_DEF> # REDUCTION_PHI
4582 VECT_DEF = vector_stmt # vectorized form of STMT
4585 (in case of SLP, do it for all the phis). */
4587 /* Get the loop-entry arguments. */
4588 enum vect_def_type initial_def_dt = vect_unknown_def_type;
4589 if (slp_node)
4591 unsigned vec_num = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node);
4592 vec_initial_defs.reserve (vec_num);
4593 get_initial_defs_for_reduction (slp_node_instance->reduc_phis,
4594 &vec_initial_defs, vec_num,
4595 REDUC_GROUP_FIRST_ELEMENT (stmt_info),
4596 neutral_op);
4598 else
4600 /* Get at the scalar def before the loop, that defines the initial value
4601 of the reduction variable. */
4602 initial_def = PHI_ARG_DEF_FROM_EDGE (reduc_def_stmt,
4603 loop_preheader_edge (loop));
4604 /* Optimize: if initial_def is for REDUC_MAX smaller than the base
4605 and we can't use zero for induc_val, use initial_def. Similarly
4606 for REDUC_MIN and initial_def larger than the base. */
4607 if (TREE_CODE (initial_def) == INTEGER_CST
4608 && (STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info)
4609 == INTEGER_INDUC_COND_REDUCTION)
4610 && !integer_zerop (induc_val)
4611 && ((induc_code == MAX_EXPR
4612 && tree_int_cst_lt (initial_def, induc_val))
4613 || (induc_code == MIN_EXPR
4614 && tree_int_cst_lt (induc_val, initial_def))))
4615 induc_val = initial_def;
4617 if (double_reduc)
4618 /* In case of double reduction we only create a vector variable
4619 to be put in the reduction phi node. The actual statement
4620 creation is done later in this function. */
4621 vec_initial_def = vect_create_destination_var (initial_def, vectype);
4622 else if (nested_in_vect_loop)
4624 /* Do not use an adjustment def as that case is not supported
4625 correctly if ncopies is not one. */
4626 vect_is_simple_use (initial_def, loop_vinfo, &initial_def_dt);
4627 vec_initial_def = vect_get_vec_def_for_operand (initial_def,
4628 stmt_info);
4630 else
4631 vec_initial_def
4632 = get_initial_def_for_reduction (stmt_info, initial_def,
4633 &adjustment_def);
4634 vec_initial_defs.create (1);
4635 vec_initial_defs.quick_push (vec_initial_def);
4638 /* Set phi nodes arguments. */
4639 FOR_EACH_VEC_ELT (reduction_phis, i, phi_info)
4641 tree vec_init_def = vec_initial_defs[i];
4642 tree def = vect_defs[i];
4643 for (j = 0; j < ncopies; j++)
4645 if (j != 0)
4647 phi_info = STMT_VINFO_RELATED_STMT (phi_info);
4648 if (nested_in_vect_loop)
4649 vec_init_def
4650 = vect_get_vec_def_for_stmt_copy (loop_vinfo, vec_init_def);
4653 /* Set the loop-entry arg of the reduction-phi. */
4655 gphi *phi = as_a <gphi *> (phi_info->stmt);
4656 if (STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info)
4657 == INTEGER_INDUC_COND_REDUCTION)
4659 /* Initialise the reduction phi to zero. This prevents initial
4660 values of non-zero interfering with the reduction op. */
4661 gcc_assert (ncopies == 1);
4662 gcc_assert (i == 0);
4664 tree vec_init_def_type = TREE_TYPE (vec_init_def);
4665 tree induc_val_vec
4666 = build_vector_from_val (vec_init_def_type, induc_val);
4668 add_phi_arg (phi, induc_val_vec, loop_preheader_edge (loop),
4669 UNKNOWN_LOCATION);
4671 else
4672 add_phi_arg (phi, vec_init_def, loop_preheader_edge (loop),
4673 UNKNOWN_LOCATION);
4675 /* Set the loop-latch arg for the reduction-phi. */
4676 if (j > 0)
4677 def = vect_get_vec_def_for_stmt_copy (loop_vinfo, def);
4679 add_phi_arg (phi, def, loop_latch_edge (loop), UNKNOWN_LOCATION);
4681 if (dump_enabled_p ())
4682 dump_printf_loc (MSG_NOTE, vect_location,
4683 "transform reduction: created def-use cycle: %G%G",
4684 phi, SSA_NAME_DEF_STMT (def));
4688 /* For cond reductions we want to create a new vector (INDEX_COND_EXPR)
4689 which is updated with the current index of the loop for every match of
4690 the original loop's cond_expr (VEC_STMT). This results in a vector
4691 containing the last time the condition passed for that vector lane.
4692 The first match will be a 1 to allow 0 to be used for non-matching
4693 indexes. If there are no matches at all then the vector will be all
4694 zeroes. */
4695 if (STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info) == COND_REDUCTION)
4697 tree indx_before_incr, indx_after_incr;
4698 poly_uint64 nunits_out = TYPE_VECTOR_SUBPARTS (vectype);
4700 gimple *vec_stmt = STMT_VINFO_VEC_STMT (stmt_info)->stmt;
4701 gcc_assert (gimple_assign_rhs_code (vec_stmt) == VEC_COND_EXPR);
4703 int scalar_precision
4704 = GET_MODE_PRECISION (SCALAR_TYPE_MODE (TREE_TYPE (vectype)));
4705 tree cr_index_scalar_type = make_unsigned_type (scalar_precision);
4706 tree cr_index_vector_type = build_vector_type
4707 (cr_index_scalar_type, TYPE_VECTOR_SUBPARTS (vectype));
4709 /* First we create a simple vector induction variable which starts
4710 with the values {1,2,3,...} (SERIES_VECT) and increments by the
4711 vector size (STEP). */
4713 /* Create a {1,2,3,...} vector. */
4714 tree series_vect = build_index_vector (cr_index_vector_type, 1, 1);
4716 /* Create a vector of the step value. */
4717 tree step = build_int_cst (cr_index_scalar_type, nunits_out);
4718 tree vec_step = build_vector_from_val (cr_index_vector_type, step);
4720 /* Create an induction variable. */
4721 gimple_stmt_iterator incr_gsi;
4722 bool insert_after;
4723 standard_iv_increment_position (loop, &incr_gsi, &insert_after);
4724 create_iv (series_vect, vec_step, NULL_TREE, loop, &incr_gsi,
4725 insert_after, &indx_before_incr, &indx_after_incr);
4727 /* Next create a new phi node vector (NEW_PHI_TREE) which starts
4728 filled with zeros (VEC_ZERO). */
4730 /* Create a vector of 0s. */
4731 tree zero = build_zero_cst (cr_index_scalar_type);
4732 tree vec_zero = build_vector_from_val (cr_index_vector_type, zero);
4734 /* Create a vector phi node. */
4735 tree new_phi_tree = make_ssa_name (cr_index_vector_type);
4736 new_phi = create_phi_node (new_phi_tree, loop->header);
4737 loop_vinfo->add_stmt (new_phi);
4738 add_phi_arg (as_a <gphi *> (new_phi), vec_zero,
4739 loop_preheader_edge (loop), UNKNOWN_LOCATION);
4741 /* Now take the condition from the loops original cond_expr
4742 (VEC_STMT) and produce a new cond_expr (INDEX_COND_EXPR) which for
4743 every match uses values from the induction variable
4744 (INDEX_BEFORE_INCR) otherwise uses values from the phi node
4745 (NEW_PHI_TREE).
4746 Finally, we update the phi (NEW_PHI_TREE) to take the value of
4747 the new cond_expr (INDEX_COND_EXPR). */
4749 /* Duplicate the condition from vec_stmt. */
4750 tree ccompare = unshare_expr (gimple_assign_rhs1 (vec_stmt));
4752 /* Create a conditional, where the condition is taken from vec_stmt
4753 (CCOMPARE), then is the induction index (INDEX_BEFORE_INCR) and
4754 else is the phi (NEW_PHI_TREE). */
4755 tree index_cond_expr = build3 (VEC_COND_EXPR, cr_index_vector_type,
4756 ccompare, indx_before_incr,
4757 new_phi_tree);
4758 induction_index = make_ssa_name (cr_index_vector_type);
4759 gimple *index_condition = gimple_build_assign (induction_index,
4760 index_cond_expr);
4761 gsi_insert_before (&incr_gsi, index_condition, GSI_SAME_STMT);
4762 stmt_vec_info index_vec_info = loop_vinfo->add_stmt (index_condition);
4763 STMT_VINFO_VECTYPE (index_vec_info) = cr_index_vector_type;
4765 /* Update the phi with the vec cond. */
4766 add_phi_arg (as_a <gphi *> (new_phi), induction_index,
4767 loop_latch_edge (loop), UNKNOWN_LOCATION);
4770 /* 2. Create epilog code.
4771 The reduction epilog code operates across the elements of the vector
4772 of partial results computed by the vectorized loop.
4773 The reduction epilog code consists of:
4775 step 1: compute the scalar result in a vector (v_out2)
4776 step 2: extract the scalar result (s_out3) from the vector (v_out2)
4777 step 3: adjust the scalar result (s_out3) if needed.
4779 Step 1 can be accomplished using one of the following three schemes:
4780 (scheme 1) using reduc_fn, if available.
4781 (scheme 2) using whole-vector shifts, if available.
4782 (scheme 3) using a scalar loop. In this case steps 1+2 above are
4783 combined.
4785 The overall epilog code looks like this:
4787 s_out0 = phi <s_loop> # original EXIT_PHI
4788 v_out1 = phi <VECT_DEF> # NEW_EXIT_PHI
4789 v_out2 = reduce <v_out1> # step 1
4790 s_out3 = extract_field <v_out2, 0> # step 2
4791 s_out4 = adjust_result <s_out3> # step 3
4793 (step 3 is optional, and steps 1 and 2 may be combined).
4794 Lastly, the uses of s_out0 are replaced by s_out4. */
4797 /* 2.1 Create new loop-exit-phis to preserve loop-closed form:
4798 v_out1 = phi <VECT_DEF>
4799 Store them in NEW_PHIS. */
4801 exit_bb = single_exit (loop)->dest;
4802 prev_phi_info = NULL;
4803 new_phis.create (vect_defs.length ());
4804 FOR_EACH_VEC_ELT (vect_defs, i, def)
4806 for (j = 0; j < ncopies; j++)
4808 tree new_def = copy_ssa_name (def);
4809 phi = create_phi_node (new_def, exit_bb);
4810 stmt_vec_info phi_info = loop_vinfo->add_stmt (phi);
4811 if (j == 0)
4812 new_phis.quick_push (phi);
4813 else
4815 def = vect_get_vec_def_for_stmt_copy (loop_vinfo, def);
4816 STMT_VINFO_RELATED_STMT (prev_phi_info) = phi_info;
4819 SET_PHI_ARG_DEF (phi, single_exit (loop)->dest_idx, def);
4820 prev_phi_info = phi_info;
4824 /* The epilogue is created for the outer-loop, i.e., for the loop being
4825 vectorized. Create exit phis for the outer loop. */
4826 if (double_reduc)
4828 loop = outer_loop;
4829 exit_bb = single_exit (loop)->dest;
4830 inner_phis.create (vect_defs.length ());
4831 FOR_EACH_VEC_ELT (new_phis, i, phi)
4833 stmt_vec_info phi_info = loop_vinfo->lookup_stmt (phi);
4834 tree new_result = copy_ssa_name (PHI_RESULT (phi));
4835 gphi *outer_phi = create_phi_node (new_result, exit_bb);
4836 SET_PHI_ARG_DEF (outer_phi, single_exit (loop)->dest_idx,
4837 PHI_RESULT (phi));
4838 prev_phi_info = loop_vinfo->add_stmt (outer_phi);
4839 inner_phis.quick_push (phi_info);
4840 new_phis[i] = outer_phi;
4841 while (STMT_VINFO_RELATED_STMT (phi_info))
4843 phi_info = STMT_VINFO_RELATED_STMT (phi_info);
4844 new_result = copy_ssa_name (PHI_RESULT (phi_info->stmt));
4845 outer_phi = create_phi_node (new_result, exit_bb);
4846 SET_PHI_ARG_DEF (outer_phi, single_exit (loop)->dest_idx,
4847 PHI_RESULT (phi_info->stmt));
4848 stmt_vec_info outer_phi_info = loop_vinfo->add_stmt (outer_phi);
4849 STMT_VINFO_RELATED_STMT (prev_phi_info) = outer_phi_info;
4850 prev_phi_info = outer_phi_info;
4855 exit_gsi = gsi_after_labels (exit_bb);
4857 /* 2.2 Get the relevant tree-code to use in the epilog for schemes 2,3
4858 (i.e. when reduc_fn is not available) and in the final adjustment
4859 code (if needed). Also get the original scalar reduction variable as
4860 defined in the loop. In case STMT is a "pattern-stmt" (i.e. - it
4861 represents a reduction pattern), the tree-code and scalar-def are
4862 taken from the original stmt that the pattern-stmt (STMT) replaces.
4863 Otherwise (it is a regular reduction) - the tree-code and scalar-def
4864 are taken from STMT. */
4866 stmt_vec_info orig_stmt_info = vect_orig_stmt (stmt_info);
4867 if (orig_stmt_info != stmt_info)
4869 /* Reduction pattern */
4870 gcc_assert (STMT_VINFO_IN_PATTERN_P (orig_stmt_info));
4871 gcc_assert (STMT_VINFO_RELATED_STMT (orig_stmt_info) == stmt_info);
4874 code = gimple_assign_rhs_code (orig_stmt_info->stmt);
4875 /* For MINUS_EXPR the initial vector is [init_val,0,...,0], therefore,
4876 partial results are added and not subtracted. */
4877 if (code == MINUS_EXPR)
4878 code = PLUS_EXPR;
4880 scalar_dest = gimple_assign_lhs (orig_stmt_info->stmt);
4881 scalar_type = TREE_TYPE (scalar_dest);
4882 scalar_results.create (group_size);
4883 new_scalar_dest = vect_create_destination_var (scalar_dest, NULL);
4884 bitsize = TYPE_SIZE (scalar_type);
4886 /* In case this is a reduction in an inner-loop while vectorizing an outer
4887 loop - we don't need to extract a single scalar result at the end of the
4888 inner-loop (unless it is double reduction, i.e., the use of reduction is
4889 outside the outer-loop). The final vector of partial results will be used
4890 in the vectorized outer-loop, or reduced to a scalar result at the end of
4891 the outer-loop. */
4892 if (nested_in_vect_loop && !double_reduc)
4893 goto vect_finalize_reduction;
4895 /* SLP reduction without reduction chain, e.g.,
4896 # a1 = phi <a2, a0>
4897 # b1 = phi <b2, b0>
4898 a2 = operation (a1)
4899 b2 = operation (b1) */
4900 slp_reduc = (slp_node && !REDUC_GROUP_FIRST_ELEMENT (stmt_info));
4902 /* True if we should implement SLP_REDUC using native reduction operations
4903 instead of scalar operations. */
4904 direct_slp_reduc = (reduc_fn != IFN_LAST
4905 && slp_reduc
4906 && !TYPE_VECTOR_SUBPARTS (vectype).is_constant ());
4908 /* In case of reduction chain, e.g.,
4909 # a1 = phi <a3, a0>
4910 a2 = operation (a1)
4911 a3 = operation (a2),
4913 we may end up with more than one vector result. Here we reduce them to
4914 one vector. */
4915 if (REDUC_GROUP_FIRST_ELEMENT (stmt_info) || direct_slp_reduc)
4917 tree first_vect = PHI_RESULT (new_phis[0]);
4918 gassign *new_vec_stmt = NULL;
4919 vec_dest = vect_create_destination_var (scalar_dest, vectype);
4920 for (k = 1; k < new_phis.length (); k++)
4922 gimple *next_phi = new_phis[k];
4923 tree second_vect = PHI_RESULT (next_phi);
4924 tree tem = make_ssa_name (vec_dest, new_vec_stmt);
4925 new_vec_stmt = gimple_build_assign (tem, code,
4926 first_vect, second_vect);
4927 gsi_insert_before (&exit_gsi, new_vec_stmt, GSI_SAME_STMT);
4928 first_vect = tem;
4931 new_phi_result = first_vect;
4932 if (new_vec_stmt)
4934 new_phis.truncate (0);
4935 new_phis.safe_push (new_vec_stmt);
4938 /* Likewise if we couldn't use a single defuse cycle. */
4939 else if (ncopies > 1)
4941 gcc_assert (new_phis.length () == 1);
4942 tree first_vect = PHI_RESULT (new_phis[0]);
4943 gassign *new_vec_stmt = NULL;
4944 vec_dest = vect_create_destination_var (scalar_dest, vectype);
4945 stmt_vec_info next_phi_info = loop_vinfo->lookup_stmt (new_phis[0]);
4946 for (int k = 1; k < ncopies; ++k)
4948 next_phi_info = STMT_VINFO_RELATED_STMT (next_phi_info);
4949 tree second_vect = PHI_RESULT (next_phi_info->stmt);
4950 tree tem = make_ssa_name (vec_dest, new_vec_stmt);
4951 new_vec_stmt = gimple_build_assign (tem, code,
4952 first_vect, second_vect);
4953 gsi_insert_before (&exit_gsi, new_vec_stmt, GSI_SAME_STMT);
4954 first_vect = tem;
4956 new_phi_result = first_vect;
4957 new_phis.truncate (0);
4958 new_phis.safe_push (new_vec_stmt);
4960 else
4961 new_phi_result = PHI_RESULT (new_phis[0]);
4963 if (STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info) == COND_REDUCTION
4964 && reduc_fn != IFN_LAST)
4966 /* For condition reductions, we have a vector (NEW_PHI_RESULT) containing
4967 various data values where the condition matched and another vector
4968 (INDUCTION_INDEX) containing all the indexes of those matches. We
4969 need to extract the last matching index (which will be the index with
4970 highest value) and use this to index into the data vector.
4971 For the case where there were no matches, the data vector will contain
4972 all default values and the index vector will be all zeros. */
4974 /* Get various versions of the type of the vector of indexes. */
4975 tree index_vec_type = TREE_TYPE (induction_index);
4976 gcc_checking_assert (TYPE_UNSIGNED (index_vec_type));
4977 tree index_scalar_type = TREE_TYPE (index_vec_type);
4978 tree index_vec_cmp_type = build_same_sized_truth_vector_type
4979 (index_vec_type);
4981 /* Get an unsigned integer version of the type of the data vector. */
4982 int scalar_precision
4983 = GET_MODE_PRECISION (SCALAR_TYPE_MODE (scalar_type));
4984 tree scalar_type_unsigned = make_unsigned_type (scalar_precision);
4985 tree vectype_unsigned = build_vector_type
4986 (scalar_type_unsigned, TYPE_VECTOR_SUBPARTS (vectype));
4988 /* First we need to create a vector (ZERO_VEC) of zeros and another
4989 vector (MAX_INDEX_VEC) filled with the last matching index, which we
4990 can create using a MAX reduction and then expanding.
4991 In the case where the loop never made any matches, the max index will
4992 be zero. */
4994 /* Vector of {0, 0, 0,...}. */
4995 tree zero_vec = make_ssa_name (vectype);
4996 tree zero_vec_rhs = build_zero_cst (vectype);
4997 gimple *zero_vec_stmt = gimple_build_assign (zero_vec, zero_vec_rhs);
4998 gsi_insert_before (&exit_gsi, zero_vec_stmt, GSI_SAME_STMT);
5000 /* Find maximum value from the vector of found indexes. */
5001 tree max_index = make_ssa_name (index_scalar_type);
5002 gcall *max_index_stmt = gimple_build_call_internal (IFN_REDUC_MAX,
5003 1, induction_index);
5004 gimple_call_set_lhs (max_index_stmt, max_index);
5005 gsi_insert_before (&exit_gsi, max_index_stmt, GSI_SAME_STMT);
5007 /* Vector of {max_index, max_index, max_index,...}. */
5008 tree max_index_vec = make_ssa_name (index_vec_type);
5009 tree max_index_vec_rhs = build_vector_from_val (index_vec_type,
5010 max_index);
5011 gimple *max_index_vec_stmt = gimple_build_assign (max_index_vec,
5012 max_index_vec_rhs);
5013 gsi_insert_before (&exit_gsi, max_index_vec_stmt, GSI_SAME_STMT);
5015 /* Next we compare the new vector (MAX_INDEX_VEC) full of max indexes
5016 with the vector (INDUCTION_INDEX) of found indexes, choosing values
5017 from the data vector (NEW_PHI_RESULT) for matches, 0 (ZERO_VEC)
5018 otherwise. Only one value should match, resulting in a vector
5019 (VEC_COND) with one data value and the rest zeros.
5020 In the case where the loop never made any matches, every index will
5021 match, resulting in a vector with all data values (which will all be
5022 the default value). */
5024 /* Compare the max index vector to the vector of found indexes to find
5025 the position of the max value. */
5026 tree vec_compare = make_ssa_name (index_vec_cmp_type);
5027 gimple *vec_compare_stmt = gimple_build_assign (vec_compare, EQ_EXPR,
5028 induction_index,
5029 max_index_vec);
5030 gsi_insert_before (&exit_gsi, vec_compare_stmt, GSI_SAME_STMT);
5032 /* Use the compare to choose either values from the data vector or
5033 zero. */
5034 tree vec_cond = make_ssa_name (vectype);
5035 gimple *vec_cond_stmt = gimple_build_assign (vec_cond, VEC_COND_EXPR,
5036 vec_compare, new_phi_result,
5037 zero_vec);
5038 gsi_insert_before (&exit_gsi, vec_cond_stmt, GSI_SAME_STMT);
5040 /* Finally we need to extract the data value from the vector (VEC_COND)
5041 into a scalar (MATCHED_DATA_REDUC). Logically we want to do an OR
5042 reduction, but because this doesn't exist, we can use a MAX reduction
5043 instead. The data value might be signed or a float so we need to cast
5044 it first.
5045 In the case where the loop never made any matches, the data values are
5046 all identical, and so will reduce down correctly. */
5048 /* Make the matched data values unsigned. */
5049 tree vec_cond_cast = make_ssa_name (vectype_unsigned);
5050 tree vec_cond_cast_rhs = build1 (VIEW_CONVERT_EXPR, vectype_unsigned,
5051 vec_cond);
5052 gimple *vec_cond_cast_stmt = gimple_build_assign (vec_cond_cast,
5053 VIEW_CONVERT_EXPR,
5054 vec_cond_cast_rhs);
5055 gsi_insert_before (&exit_gsi, vec_cond_cast_stmt, GSI_SAME_STMT);
5057 /* Reduce down to a scalar value. */
5058 tree data_reduc = make_ssa_name (scalar_type_unsigned);
5059 gcall *data_reduc_stmt = gimple_build_call_internal (IFN_REDUC_MAX,
5060 1, vec_cond_cast);
5061 gimple_call_set_lhs (data_reduc_stmt, data_reduc);
5062 gsi_insert_before (&exit_gsi, data_reduc_stmt, GSI_SAME_STMT);
5064 /* Convert the reduced value back to the result type and set as the
5065 result. */
5066 gimple_seq stmts = NULL;
5067 new_temp = gimple_build (&stmts, VIEW_CONVERT_EXPR, scalar_type,
5068 data_reduc);
5069 gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
5070 scalar_results.safe_push (new_temp);
5072 else if (STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info) == COND_REDUCTION
5073 && reduc_fn == IFN_LAST)
5075 /* Condition reduction without supported IFN_REDUC_MAX. Generate
5076 idx = 0;
5077 idx_val = induction_index[0];
5078 val = data_reduc[0];
5079 for (idx = 0, val = init, i = 0; i < nelts; ++i)
5080 if (induction_index[i] > idx_val)
5081 val = data_reduc[i], idx_val = induction_index[i];
5082 return val; */
5084 tree data_eltype = TREE_TYPE (TREE_TYPE (new_phi_result));
5085 tree idx_eltype = TREE_TYPE (TREE_TYPE (induction_index));
5086 unsigned HOST_WIDE_INT el_size = tree_to_uhwi (TYPE_SIZE (idx_eltype));
5087 poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (TREE_TYPE (induction_index));
5088 /* Enforced by vectorizable_reduction, which ensures we have target
5089 support before allowing a conditional reduction on variable-length
5090 vectors. */
5091 unsigned HOST_WIDE_INT v_size = el_size * nunits.to_constant ();
5092 tree idx_val = NULL_TREE, val = NULL_TREE;
5093 for (unsigned HOST_WIDE_INT off = 0; off < v_size; off += el_size)
5095 tree old_idx_val = idx_val;
5096 tree old_val = val;
5097 idx_val = make_ssa_name (idx_eltype);
5098 epilog_stmt = gimple_build_assign (idx_val, BIT_FIELD_REF,
5099 build3 (BIT_FIELD_REF, idx_eltype,
5100 induction_index,
5101 bitsize_int (el_size),
5102 bitsize_int (off)));
5103 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5104 val = make_ssa_name (data_eltype);
5105 epilog_stmt = gimple_build_assign (val, BIT_FIELD_REF,
5106 build3 (BIT_FIELD_REF,
5107 data_eltype,
5108 new_phi_result,
5109 bitsize_int (el_size),
5110 bitsize_int (off)));
5111 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5112 if (off != 0)
5114 tree new_idx_val = idx_val;
5115 if (off != v_size - el_size)
5117 new_idx_val = make_ssa_name (idx_eltype);
5118 epilog_stmt = gimple_build_assign (new_idx_val,
5119 MAX_EXPR, idx_val,
5120 old_idx_val);
5121 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5123 tree new_val = make_ssa_name (data_eltype);
5124 epilog_stmt = gimple_build_assign (new_val,
5125 COND_EXPR,
5126 build2 (GT_EXPR,
5127 boolean_type_node,
5128 idx_val,
5129 old_idx_val),
5130 val, old_val);
5131 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5132 idx_val = new_idx_val;
5133 val = new_val;
5136 /* Convert the reduced value back to the result type and set as the
5137 result. */
5138 gimple_seq stmts = NULL;
5139 val = gimple_convert (&stmts, scalar_type, val);
5140 gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
5141 scalar_results.safe_push (val);
5144 /* 2.3 Create the reduction code, using one of the three schemes described
5145 above. In SLP we simply need to extract all the elements from the
5146 vector (without reducing them), so we use scalar shifts. */
5147 else if (reduc_fn != IFN_LAST && !slp_reduc)
5149 tree tmp;
5150 tree vec_elem_type;
5152 /* Case 1: Create:
5153 v_out2 = reduc_expr <v_out1> */
5155 if (dump_enabled_p ())
5156 dump_printf_loc (MSG_NOTE, vect_location,
5157 "Reduce using direct vector reduction.\n");
5159 vec_elem_type = TREE_TYPE (TREE_TYPE (new_phi_result));
5160 if (!useless_type_conversion_p (scalar_type, vec_elem_type))
5162 tree tmp_dest
5163 = vect_create_destination_var (scalar_dest, vec_elem_type);
5164 epilog_stmt = gimple_build_call_internal (reduc_fn, 1,
5165 new_phi_result);
5166 gimple_set_lhs (epilog_stmt, tmp_dest);
5167 new_temp = make_ssa_name (tmp_dest, epilog_stmt);
5168 gimple_set_lhs (epilog_stmt, new_temp);
5169 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5171 epilog_stmt = gimple_build_assign (new_scalar_dest, NOP_EXPR,
5172 new_temp);
5174 else
5176 epilog_stmt = gimple_build_call_internal (reduc_fn, 1,
5177 new_phi_result);
5178 gimple_set_lhs (epilog_stmt, new_scalar_dest);
5181 new_temp = make_ssa_name (new_scalar_dest, epilog_stmt);
5182 gimple_set_lhs (epilog_stmt, new_temp);
5183 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5185 if ((STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info)
5186 == INTEGER_INDUC_COND_REDUCTION)
5187 && !operand_equal_p (initial_def, induc_val, 0))
5189 /* Earlier we set the initial value to be a vector of induc_val
5190 values. Check the result and if it is induc_val then replace
5191 with the original initial value, unless induc_val is
5192 the same as initial_def already. */
5193 tree zcompare = build2 (EQ_EXPR, boolean_type_node, new_temp,
5194 induc_val);
5196 tmp = make_ssa_name (new_scalar_dest);
5197 epilog_stmt = gimple_build_assign (tmp, COND_EXPR, zcompare,
5198 initial_def, new_temp);
5199 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5200 new_temp = tmp;
5203 scalar_results.safe_push (new_temp);
5205 else if (direct_slp_reduc)
5207 /* Here we create one vector for each of the REDUC_GROUP_SIZE results,
5208 with the elements for other SLP statements replaced with the
5209 neutral value. We can then do a normal reduction on each vector. */
5211 /* Enforced by vectorizable_reduction. */
5212 gcc_assert (new_phis.length () == 1);
5213 gcc_assert (pow2p_hwi (group_size));
5215 slp_tree orig_phis_slp_node = slp_node_instance->reduc_phis;
5216 vec<stmt_vec_info> orig_phis
5217 = SLP_TREE_SCALAR_STMTS (orig_phis_slp_node);
5218 gimple_seq seq = NULL;
5220 /* Build a vector {0, 1, 2, ...}, with the same number of elements
5221 and the same element size as VECTYPE. */
5222 tree index = build_index_vector (vectype, 0, 1);
5223 tree index_type = TREE_TYPE (index);
5224 tree index_elt_type = TREE_TYPE (index_type);
5225 tree mask_type = build_same_sized_truth_vector_type (index_type);
5227 /* Create a vector that, for each element, identifies which of
5228 the REDUC_GROUP_SIZE results should use it. */
5229 tree index_mask = build_int_cst (index_elt_type, group_size - 1);
5230 index = gimple_build (&seq, BIT_AND_EXPR, index_type, index,
5231 build_vector_from_val (index_type, index_mask));
5233 /* Get a neutral vector value. This is simply a splat of the neutral
5234 scalar value if we have one, otherwise the initial scalar value
5235 is itself a neutral value. */
5236 tree vector_identity = NULL_TREE;
5237 if (neutral_op)
5238 vector_identity = gimple_build_vector_from_val (&seq, vectype,
5239 neutral_op);
5240 for (unsigned int i = 0; i < group_size; ++i)
5242 /* If there's no universal neutral value, we can use the
5243 initial scalar value from the original PHI. This is used
5244 for MIN and MAX reduction, for example. */
5245 if (!neutral_op)
5247 tree scalar_value
5248 = PHI_ARG_DEF_FROM_EDGE (orig_phis[i]->stmt,
5249 loop_preheader_edge (loop));
5250 vector_identity = gimple_build_vector_from_val (&seq, vectype,
5251 scalar_value);
5254 /* Calculate the equivalent of:
5256 sel[j] = (index[j] == i);
5258 which selects the elements of NEW_PHI_RESULT that should
5259 be included in the result. */
5260 tree compare_val = build_int_cst (index_elt_type, i);
5261 compare_val = build_vector_from_val (index_type, compare_val);
5262 tree sel = gimple_build (&seq, EQ_EXPR, mask_type,
5263 index, compare_val);
5265 /* Calculate the equivalent of:
5267 vec = sel ? new_phi_result : vector_identity;
5269 VEC is now suitable for a full vector reduction. */
5270 tree vec = gimple_build (&seq, VEC_COND_EXPR, vectype,
5271 sel, new_phi_result, vector_identity);
5273 /* Do the reduction and convert it to the appropriate type. */
5274 tree scalar = gimple_build (&seq, as_combined_fn (reduc_fn),
5275 TREE_TYPE (vectype), vec);
5276 scalar = gimple_convert (&seq, scalar_type, scalar);
5277 scalar_results.safe_push (scalar);
5279 gsi_insert_seq_before (&exit_gsi, seq, GSI_SAME_STMT);
5281 else
5283 bool reduce_with_shift;
5284 tree vec_temp;
5286 /* COND reductions all do the final reduction with MAX_EXPR
5287 or MIN_EXPR. */
5288 if (code == COND_EXPR)
5290 if (STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info)
5291 == INTEGER_INDUC_COND_REDUCTION)
5292 code = induc_code;
5293 else if (STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info)
5294 == CONST_COND_REDUCTION)
5295 code = STMT_VINFO_VEC_CONST_COND_REDUC_CODE (stmt_info);
5296 else
5297 code = MAX_EXPR;
5300 /* See if the target wants to do the final (shift) reduction
5301 in a vector mode of smaller size and first reduce upper/lower
5302 halves against each other. */
5303 enum machine_mode mode1 = mode;
5304 unsigned sz = tree_to_uhwi (TYPE_SIZE_UNIT (vectype));
5305 unsigned sz1 = sz;
5306 if (!slp_reduc
5307 && (mode1 = targetm.vectorize.split_reduction (mode)) != mode)
5308 sz1 = GET_MODE_SIZE (mode1).to_constant ();
5310 tree vectype1 = get_vectype_for_scalar_type_and_size (scalar_type, sz1);
5311 reduce_with_shift = have_whole_vector_shift (mode1);
5312 if (!VECTOR_MODE_P (mode1))
5313 reduce_with_shift = false;
5314 else
5316 optab optab = optab_for_tree_code (code, vectype1, optab_default);
5317 if (optab_handler (optab, mode1) == CODE_FOR_nothing)
5318 reduce_with_shift = false;
5321 /* First reduce the vector to the desired vector size we should
5322 do shift reduction on by combining upper and lower halves. */
5323 new_temp = new_phi_result;
5324 while (sz > sz1)
5326 gcc_assert (!slp_reduc);
5327 sz /= 2;
5328 vectype1 = get_vectype_for_scalar_type_and_size (scalar_type, sz);
5330 /* The target has to make sure we support lowpart/highpart
5331 extraction, either via direct vector extract or through
5332 an integer mode punning. */
5333 tree dst1, dst2;
5334 if (convert_optab_handler (vec_extract_optab,
5335 TYPE_MODE (TREE_TYPE (new_temp)),
5336 TYPE_MODE (vectype1))
5337 != CODE_FOR_nothing)
5339 /* Extract sub-vectors directly once vec_extract becomes
5340 a conversion optab. */
5341 dst1 = make_ssa_name (vectype1);
5342 epilog_stmt
5343 = gimple_build_assign (dst1, BIT_FIELD_REF,
5344 build3 (BIT_FIELD_REF, vectype1,
5345 new_temp, TYPE_SIZE (vectype1),
5346 bitsize_int (0)));
5347 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5348 dst2 = make_ssa_name (vectype1);
5349 epilog_stmt
5350 = gimple_build_assign (dst2, BIT_FIELD_REF,
5351 build3 (BIT_FIELD_REF, vectype1,
5352 new_temp, TYPE_SIZE (vectype1),
5353 bitsize_int (sz * BITS_PER_UNIT)));
5354 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5356 else
5358 /* Extract via punning to appropriately sized integer mode
5359 vector. */
5360 tree eltype = build_nonstandard_integer_type (sz * BITS_PER_UNIT,
5362 tree etype = build_vector_type (eltype, 2);
5363 gcc_assert (convert_optab_handler (vec_extract_optab,
5364 TYPE_MODE (etype),
5365 TYPE_MODE (eltype))
5366 != CODE_FOR_nothing);
5367 tree tem = make_ssa_name (etype);
5368 epilog_stmt = gimple_build_assign (tem, VIEW_CONVERT_EXPR,
5369 build1 (VIEW_CONVERT_EXPR,
5370 etype, new_temp));
5371 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5372 new_temp = tem;
5373 tem = make_ssa_name (eltype);
5374 epilog_stmt
5375 = gimple_build_assign (tem, BIT_FIELD_REF,
5376 build3 (BIT_FIELD_REF, eltype,
5377 new_temp, TYPE_SIZE (eltype),
5378 bitsize_int (0)));
5379 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5380 dst1 = make_ssa_name (vectype1);
5381 epilog_stmt = gimple_build_assign (dst1, VIEW_CONVERT_EXPR,
5382 build1 (VIEW_CONVERT_EXPR,
5383 vectype1, tem));
5384 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5385 tem = make_ssa_name (eltype);
5386 epilog_stmt
5387 = gimple_build_assign (tem, BIT_FIELD_REF,
5388 build3 (BIT_FIELD_REF, eltype,
5389 new_temp, TYPE_SIZE (eltype),
5390 bitsize_int (sz * BITS_PER_UNIT)));
5391 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5392 dst2 = make_ssa_name (vectype1);
5393 epilog_stmt = gimple_build_assign (dst2, VIEW_CONVERT_EXPR,
5394 build1 (VIEW_CONVERT_EXPR,
5395 vectype1, tem));
5396 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5399 new_temp = make_ssa_name (vectype1);
5400 epilog_stmt = gimple_build_assign (new_temp, code, dst1, dst2);
5401 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5404 if (reduce_with_shift && !slp_reduc)
5406 int element_bitsize = tree_to_uhwi (bitsize);
5407 /* Enforced by vectorizable_reduction, which disallows SLP reductions
5408 for variable-length vectors and also requires direct target support
5409 for loop reductions. */
5410 int vec_size_in_bits = tree_to_uhwi (TYPE_SIZE (vectype1));
5411 int nelements = vec_size_in_bits / element_bitsize;
5412 vec_perm_builder sel;
5413 vec_perm_indices indices;
5415 int elt_offset;
5417 tree zero_vec = build_zero_cst (vectype1);
5418 /* Case 2: Create:
5419 for (offset = nelements/2; offset >= 1; offset/=2)
5421 Create: va' = vec_shift <va, offset>
5422 Create: va = vop <va, va'>
5423 } */
5425 tree rhs;
5427 if (dump_enabled_p ())
5428 dump_printf_loc (MSG_NOTE, vect_location,
5429 "Reduce using vector shifts\n");
5431 vec_dest = vect_create_destination_var (scalar_dest, vectype1);
5432 for (elt_offset = nelements / 2;
5433 elt_offset >= 1;
5434 elt_offset /= 2)
5436 calc_vec_perm_mask_for_shift (elt_offset, nelements, &sel);
5437 indices.new_vector (sel, 2, nelements);
5438 tree mask = vect_gen_perm_mask_any (vectype1, indices);
5439 epilog_stmt = gimple_build_assign (vec_dest, VEC_PERM_EXPR,
5440 new_temp, zero_vec, mask);
5441 new_name = make_ssa_name (vec_dest, epilog_stmt);
5442 gimple_assign_set_lhs (epilog_stmt, new_name);
5443 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5445 epilog_stmt = gimple_build_assign (vec_dest, code, new_name,
5446 new_temp);
5447 new_temp = make_ssa_name (vec_dest, epilog_stmt);
5448 gimple_assign_set_lhs (epilog_stmt, new_temp);
5449 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5452 /* 2.4 Extract the final scalar result. Create:
5453 s_out3 = extract_field <v_out2, bitpos> */
5455 if (dump_enabled_p ())
5456 dump_printf_loc (MSG_NOTE, vect_location,
5457 "extract scalar result\n");
5459 rhs = build3 (BIT_FIELD_REF, scalar_type, new_temp,
5460 bitsize, bitsize_zero_node);
5461 epilog_stmt = gimple_build_assign (new_scalar_dest, rhs);
5462 new_temp = make_ssa_name (new_scalar_dest, epilog_stmt);
5463 gimple_assign_set_lhs (epilog_stmt, new_temp);
5464 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5465 scalar_results.safe_push (new_temp);
5467 else
5469 /* Case 3: Create:
5470 s = extract_field <v_out2, 0>
5471 for (offset = element_size;
5472 offset < vector_size;
5473 offset += element_size;)
5475 Create: s' = extract_field <v_out2, offset>
5476 Create: s = op <s, s'> // For non SLP cases
5477 } */
5479 if (dump_enabled_p ())
5480 dump_printf_loc (MSG_NOTE, vect_location,
5481 "Reduce using scalar code.\n");
5483 int vec_size_in_bits = tree_to_uhwi (TYPE_SIZE (vectype1));
5484 int element_bitsize = tree_to_uhwi (bitsize);
5485 FOR_EACH_VEC_ELT (new_phis, i, new_phi)
5487 int bit_offset;
5488 if (gimple_code (new_phi) == GIMPLE_PHI)
5489 vec_temp = PHI_RESULT (new_phi);
5490 else
5491 vec_temp = gimple_assign_lhs (new_phi);
5492 tree rhs = build3 (BIT_FIELD_REF, scalar_type, vec_temp, bitsize,
5493 bitsize_zero_node);
5494 epilog_stmt = gimple_build_assign (new_scalar_dest, rhs);
5495 new_temp = make_ssa_name (new_scalar_dest, epilog_stmt);
5496 gimple_assign_set_lhs (epilog_stmt, new_temp);
5497 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5499 /* In SLP we don't need to apply reduction operation, so we just
5500 collect s' values in SCALAR_RESULTS. */
5501 if (slp_reduc)
5502 scalar_results.safe_push (new_temp);
5504 for (bit_offset = element_bitsize;
5505 bit_offset < vec_size_in_bits;
5506 bit_offset += element_bitsize)
5508 tree bitpos = bitsize_int (bit_offset);
5509 tree rhs = build3 (BIT_FIELD_REF, scalar_type, vec_temp,
5510 bitsize, bitpos);
5512 epilog_stmt = gimple_build_assign (new_scalar_dest, rhs);
5513 new_name = make_ssa_name (new_scalar_dest, epilog_stmt);
5514 gimple_assign_set_lhs (epilog_stmt, new_name);
5515 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5517 if (slp_reduc)
5519 /* In SLP we don't need to apply reduction operation, so
5520 we just collect s' values in SCALAR_RESULTS. */
5521 new_temp = new_name;
5522 scalar_results.safe_push (new_name);
5524 else
5526 epilog_stmt = gimple_build_assign (new_scalar_dest, code,
5527 new_name, new_temp);
5528 new_temp = make_ssa_name (new_scalar_dest, epilog_stmt);
5529 gimple_assign_set_lhs (epilog_stmt, new_temp);
5530 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5535 /* The only case where we need to reduce scalar results in SLP, is
5536 unrolling. If the size of SCALAR_RESULTS is greater than
5537 REDUC_GROUP_SIZE, we reduce them combining elements modulo
5538 REDUC_GROUP_SIZE. */
5539 if (slp_reduc)
5541 tree res, first_res, new_res;
5542 gimple *new_stmt;
5544 /* Reduce multiple scalar results in case of SLP unrolling. */
5545 for (j = group_size; scalar_results.iterate (j, &res);
5546 j++)
5548 first_res = scalar_results[j % group_size];
5549 new_stmt = gimple_build_assign (new_scalar_dest, code,
5550 first_res, res);
5551 new_res = make_ssa_name (new_scalar_dest, new_stmt);
5552 gimple_assign_set_lhs (new_stmt, new_res);
5553 gsi_insert_before (&exit_gsi, new_stmt, GSI_SAME_STMT);
5554 scalar_results[j % group_size] = new_res;
5557 else
5558 /* Not SLP - we have one scalar to keep in SCALAR_RESULTS. */
5559 scalar_results.safe_push (new_temp);
5562 if ((STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info)
5563 == INTEGER_INDUC_COND_REDUCTION)
5564 && !operand_equal_p (initial_def, induc_val, 0))
5566 /* Earlier we set the initial value to be a vector of induc_val
5567 values. Check the result and if it is induc_val then replace
5568 with the original initial value, unless induc_val is
5569 the same as initial_def already. */
5570 tree zcompare = build2 (EQ_EXPR, boolean_type_node, new_temp,
5571 induc_val);
5573 tree tmp = make_ssa_name (new_scalar_dest);
5574 epilog_stmt = gimple_build_assign (tmp, COND_EXPR, zcompare,
5575 initial_def, new_temp);
5576 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5577 scalar_results[0] = tmp;
5581 vect_finalize_reduction:
5583 if (double_reduc)
5584 loop = loop->inner;
5586 /* 2.5 Adjust the final result by the initial value of the reduction
5587 variable. (When such adjustment is not needed, then
5588 'adjustment_def' is zero). For example, if code is PLUS we create:
5589 new_temp = loop_exit_def + adjustment_def */
5591 if (adjustment_def)
5593 gcc_assert (!slp_reduc);
5594 if (nested_in_vect_loop)
5596 new_phi = new_phis[0];
5597 gcc_assert (TREE_CODE (TREE_TYPE (adjustment_def)) == VECTOR_TYPE);
5598 expr = build2 (code, vectype, PHI_RESULT (new_phi), adjustment_def);
5599 new_dest = vect_create_destination_var (scalar_dest, vectype);
5601 else
5603 new_temp = scalar_results[0];
5604 gcc_assert (TREE_CODE (TREE_TYPE (adjustment_def)) != VECTOR_TYPE);
5605 expr = build2 (code, scalar_type, new_temp, adjustment_def);
5606 new_dest = vect_create_destination_var (scalar_dest, scalar_type);
5609 epilog_stmt = gimple_build_assign (new_dest, expr);
5610 new_temp = make_ssa_name (new_dest, epilog_stmt);
5611 gimple_assign_set_lhs (epilog_stmt, new_temp);
5612 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5613 if (nested_in_vect_loop)
5615 stmt_vec_info epilog_stmt_info = loop_vinfo->add_stmt (epilog_stmt);
5616 STMT_VINFO_RELATED_STMT (epilog_stmt_info)
5617 = STMT_VINFO_RELATED_STMT (loop_vinfo->lookup_stmt (new_phi));
5619 if (!double_reduc)
5620 scalar_results.quick_push (new_temp);
5621 else
5622 scalar_results[0] = new_temp;
5624 else
5625 scalar_results[0] = new_temp;
5627 new_phis[0] = epilog_stmt;
5630 /* 2.6 Handle the loop-exit phis. Replace the uses of scalar loop-exit
5631 phis with new adjusted scalar results, i.e., replace use <s_out0>
5632 with use <s_out4>.
5634 Transform:
5635 loop_exit:
5636 s_out0 = phi <s_loop> # (scalar) EXIT_PHI
5637 v_out1 = phi <VECT_DEF> # NEW_EXIT_PHI
5638 v_out2 = reduce <v_out1>
5639 s_out3 = extract_field <v_out2, 0>
5640 s_out4 = adjust_result <s_out3>
5641 use <s_out0>
5642 use <s_out0>
5644 into:
5646 loop_exit:
5647 s_out0 = phi <s_loop> # (scalar) EXIT_PHI
5648 v_out1 = phi <VECT_DEF> # NEW_EXIT_PHI
5649 v_out2 = reduce <v_out1>
5650 s_out3 = extract_field <v_out2, 0>
5651 s_out4 = adjust_result <s_out3>
5652 use <s_out4>
5653 use <s_out4> */
5656 /* In SLP reduction chain we reduce vector results into one vector if
5657 necessary, hence we set here REDUC_GROUP_SIZE to 1. SCALAR_DEST is the
5658 LHS of the last stmt in the reduction chain, since we are looking for
5659 the loop exit phi node. */
5660 if (REDUC_GROUP_FIRST_ELEMENT (stmt_info))
5662 stmt_vec_info dest_stmt_info
5663 = vect_orig_stmt (SLP_TREE_SCALAR_STMTS (slp_node)[group_size - 1]);
5664 scalar_dest = gimple_assign_lhs (dest_stmt_info->stmt);
5665 group_size = 1;
5668 /* In SLP we may have several statements in NEW_PHIS and REDUCTION_PHIS (in
5669 case that REDUC_GROUP_SIZE is greater than vectorization factor).
5670 Therefore, we need to match SCALAR_RESULTS with corresponding statements.
5671 The first (REDUC_GROUP_SIZE / number of new vector stmts) scalar results
5672 correspond to the first vector stmt, etc.
5673 (RATIO is equal to (REDUC_GROUP_SIZE / number of new vector stmts)). */
5674 if (group_size > new_phis.length ())
5676 ratio = group_size / new_phis.length ();
5677 gcc_assert (!(group_size % new_phis.length ()));
5679 else
5680 ratio = 1;
5682 stmt_vec_info epilog_stmt_info = NULL;
5683 for (k = 0; k < group_size; k++)
5685 if (k % ratio == 0)
5687 epilog_stmt_info = loop_vinfo->lookup_stmt (new_phis[k / ratio]);
5688 reduction_phi_info = reduction_phis[k / ratio];
5689 if (double_reduc)
5690 inner_phi = inner_phis[k / ratio];
5693 if (slp_reduc)
5695 stmt_vec_info scalar_stmt_info = SLP_TREE_SCALAR_STMTS (slp_node)[k];
5697 orig_stmt_info = STMT_VINFO_RELATED_STMT (scalar_stmt_info);
5698 /* SLP statements can't participate in patterns. */
5699 gcc_assert (!orig_stmt_info);
5700 scalar_dest = gimple_assign_lhs (scalar_stmt_info->stmt);
5703 phis.create (3);
5704 /* Find the loop-closed-use at the loop exit of the original scalar
5705 result. (The reduction result is expected to have two immediate uses -
5706 one at the latch block, and one at the loop exit). */
5707 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, scalar_dest)
5708 if (!flow_bb_inside_loop_p (loop, gimple_bb (USE_STMT (use_p)))
5709 && !is_gimple_debug (USE_STMT (use_p)))
5710 phis.safe_push (USE_STMT (use_p));
5712 /* While we expect to have found an exit_phi because of loop-closed-ssa
5713 form we can end up without one if the scalar cycle is dead. */
5715 FOR_EACH_VEC_ELT (phis, i, exit_phi)
5717 if (outer_loop)
5719 stmt_vec_info exit_phi_vinfo
5720 = loop_vinfo->lookup_stmt (exit_phi);
5721 gphi *vect_phi;
5723 if (double_reduc)
5724 STMT_VINFO_VEC_STMT (exit_phi_vinfo) = inner_phi;
5725 else
5726 STMT_VINFO_VEC_STMT (exit_phi_vinfo) = epilog_stmt_info;
5727 if (!double_reduc
5728 || STMT_VINFO_DEF_TYPE (exit_phi_vinfo)
5729 != vect_double_reduction_def)
5730 continue;
5732 /* Handle double reduction:
5734 stmt1: s1 = phi <s0, s2> - double reduction phi (outer loop)
5735 stmt2: s3 = phi <s1, s4> - (regular) reduc phi (inner loop)
5736 stmt3: s4 = use (s3) - (regular) reduc stmt (inner loop)
5737 stmt4: s2 = phi <s4> - double reduction stmt (outer loop)
5739 At that point the regular reduction (stmt2 and stmt3) is
5740 already vectorized, as well as the exit phi node, stmt4.
5741 Here we vectorize the phi node of double reduction, stmt1, and
5742 update all relevant statements. */
5744 /* Go through all the uses of s2 to find double reduction phi
5745 node, i.e., stmt1 above. */
5746 orig_name = PHI_RESULT (exit_phi);
5747 FOR_EACH_IMM_USE_STMT (use_stmt, imm_iter, orig_name)
5749 stmt_vec_info use_stmt_vinfo;
5750 tree vect_phi_init, preheader_arg, vect_phi_res;
5751 basic_block bb = gimple_bb (use_stmt);
5753 /* Check that USE_STMT is really double reduction phi
5754 node. */
5755 if (gimple_code (use_stmt) != GIMPLE_PHI
5756 || gimple_phi_num_args (use_stmt) != 2
5757 || bb->loop_father != outer_loop)
5758 continue;
5759 use_stmt_vinfo = loop_vinfo->lookup_stmt (use_stmt);
5760 if (!use_stmt_vinfo
5761 || STMT_VINFO_DEF_TYPE (use_stmt_vinfo)
5762 != vect_double_reduction_def)
5763 continue;
5765 /* Create vector phi node for double reduction:
5766 vs1 = phi <vs0, vs2>
5767 vs1 was created previously in this function by a call to
5768 vect_get_vec_def_for_operand and is stored in
5769 vec_initial_def;
5770 vs2 is defined by INNER_PHI, the vectorized EXIT_PHI;
5771 vs0 is created here. */
5773 /* Create vector phi node. */
5774 vect_phi = create_phi_node (vec_initial_def, bb);
5775 loop_vec_info_for_loop (outer_loop)->add_stmt (vect_phi);
5777 /* Create vs0 - initial def of the double reduction phi. */
5778 preheader_arg = PHI_ARG_DEF_FROM_EDGE (use_stmt,
5779 loop_preheader_edge (outer_loop));
5780 vect_phi_init = get_initial_def_for_reduction
5781 (stmt_info, preheader_arg, NULL);
5783 /* Update phi node arguments with vs0 and vs2. */
5784 add_phi_arg (vect_phi, vect_phi_init,
5785 loop_preheader_edge (outer_loop),
5786 UNKNOWN_LOCATION);
5787 add_phi_arg (vect_phi, PHI_RESULT (inner_phi->stmt),
5788 loop_latch_edge (outer_loop), UNKNOWN_LOCATION);
5789 if (dump_enabled_p ())
5790 dump_printf_loc (MSG_NOTE, vect_location,
5791 "created double reduction phi node: %G",
5792 vect_phi);
5794 vect_phi_res = PHI_RESULT (vect_phi);
5796 /* Replace the use, i.e., set the correct vs1 in the regular
5797 reduction phi node. FORNOW, NCOPIES is always 1, so the
5798 loop is redundant. */
5799 stmt_vec_info use_info = reduction_phi_info;
5800 for (j = 0; j < ncopies; j++)
5802 edge pr_edge = loop_preheader_edge (loop);
5803 SET_PHI_ARG_DEF (as_a <gphi *> (use_info->stmt),
5804 pr_edge->dest_idx, vect_phi_res);
5805 use_info = STMT_VINFO_RELATED_STMT (use_info);
5811 phis.release ();
5812 if (nested_in_vect_loop)
5814 if (double_reduc)
5815 loop = outer_loop;
5816 else
5817 continue;
5820 phis.create (3);
5821 /* Find the loop-closed-use at the loop exit of the original scalar
5822 result. (The reduction result is expected to have two immediate uses,
5823 one at the latch block, and one at the loop exit). For double
5824 reductions we are looking for exit phis of the outer loop. */
5825 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, scalar_dest)
5827 if (!flow_bb_inside_loop_p (loop, gimple_bb (USE_STMT (use_p))))
5829 if (!is_gimple_debug (USE_STMT (use_p)))
5830 phis.safe_push (USE_STMT (use_p));
5832 else
5834 if (double_reduc && gimple_code (USE_STMT (use_p)) == GIMPLE_PHI)
5836 tree phi_res = PHI_RESULT (USE_STMT (use_p));
5838 FOR_EACH_IMM_USE_FAST (phi_use_p, phi_imm_iter, phi_res)
5840 if (!flow_bb_inside_loop_p (loop,
5841 gimple_bb (USE_STMT (phi_use_p)))
5842 && !is_gimple_debug (USE_STMT (phi_use_p)))
5843 phis.safe_push (USE_STMT (phi_use_p));
5849 FOR_EACH_VEC_ELT (phis, i, exit_phi)
5851 /* Replace the uses: */
5852 orig_name = PHI_RESULT (exit_phi);
5853 scalar_result = scalar_results[k];
5854 FOR_EACH_IMM_USE_STMT (use_stmt, imm_iter, orig_name)
5855 FOR_EACH_IMM_USE_ON_STMT (use_p, imm_iter)
5856 SET_USE (use_p, scalar_result);
5859 phis.release ();
5863 /* Return a vector of type VECTYPE that is equal to the vector select
5864 operation "MASK ? VEC : IDENTITY". Insert the select statements
5865 before GSI. */
5867 static tree
5868 merge_with_identity (gimple_stmt_iterator *gsi, tree mask, tree vectype,
5869 tree vec, tree identity)
5871 tree cond = make_temp_ssa_name (vectype, NULL, "cond");
5872 gimple *new_stmt = gimple_build_assign (cond, VEC_COND_EXPR,
5873 mask, vec, identity);
5874 gsi_insert_before (gsi, new_stmt, GSI_SAME_STMT);
5875 return cond;
5878 /* Successively apply CODE to each element of VECTOR_RHS, in left-to-right
5879 order, starting with LHS. Insert the extraction statements before GSI and
5880 associate the new scalar SSA names with variable SCALAR_DEST.
5881 Return the SSA name for the result. */
5883 static tree
5884 vect_expand_fold_left (gimple_stmt_iterator *gsi, tree scalar_dest,
5885 tree_code code, tree lhs, tree vector_rhs)
5887 tree vectype = TREE_TYPE (vector_rhs);
5888 tree scalar_type = TREE_TYPE (vectype);
5889 tree bitsize = TYPE_SIZE (scalar_type);
5890 unsigned HOST_WIDE_INT vec_size_in_bits = tree_to_uhwi (TYPE_SIZE (vectype));
5891 unsigned HOST_WIDE_INT element_bitsize = tree_to_uhwi (bitsize);
5893 for (unsigned HOST_WIDE_INT bit_offset = 0;
5894 bit_offset < vec_size_in_bits;
5895 bit_offset += element_bitsize)
5897 tree bitpos = bitsize_int (bit_offset);
5898 tree rhs = build3 (BIT_FIELD_REF, scalar_type, vector_rhs,
5899 bitsize, bitpos);
5901 gassign *stmt = gimple_build_assign (scalar_dest, rhs);
5902 rhs = make_ssa_name (scalar_dest, stmt);
5903 gimple_assign_set_lhs (stmt, rhs);
5904 gsi_insert_before (gsi, stmt, GSI_SAME_STMT);
5906 stmt = gimple_build_assign (scalar_dest, code, lhs, rhs);
5907 tree new_name = make_ssa_name (scalar_dest, stmt);
5908 gimple_assign_set_lhs (stmt, new_name);
5909 gsi_insert_before (gsi, stmt, GSI_SAME_STMT);
5910 lhs = new_name;
5912 return lhs;
5915 /* Get a masked internal function equivalent to REDUC_FN. VECTYPE_IN is the
5916 type of the vector input. */
5918 static internal_fn
5919 get_masked_reduction_fn (internal_fn reduc_fn, tree vectype_in)
5921 internal_fn mask_reduc_fn;
5923 switch (reduc_fn)
5925 case IFN_FOLD_LEFT_PLUS:
5926 mask_reduc_fn = IFN_MASK_FOLD_LEFT_PLUS;
5927 break;
5929 default:
5930 return IFN_LAST;
5933 if (direct_internal_fn_supported_p (mask_reduc_fn, vectype_in,
5934 OPTIMIZE_FOR_SPEED))
5935 return mask_reduc_fn;
5936 return IFN_LAST;
5939 /* Perform an in-order reduction (FOLD_LEFT_REDUCTION). STMT_INFO is the
5940 statement that sets the live-out value. REDUC_DEF_STMT is the phi
5941 statement. CODE is the operation performed by STMT_INFO and OPS are
5942 its scalar operands. REDUC_INDEX is the index of the operand in
5943 OPS that is set by REDUC_DEF_STMT. REDUC_FN is the function that
5944 implements in-order reduction, or IFN_LAST if we should open-code it.
5945 VECTYPE_IN is the type of the vector input. MASKS specifies the masks
5946 that should be used to control the operation in a fully-masked loop. */
5948 static bool
5949 vectorize_fold_left_reduction (stmt_vec_info stmt_info,
5950 gimple_stmt_iterator *gsi,
5951 stmt_vec_info *vec_stmt, slp_tree slp_node,
5952 gimple *reduc_def_stmt,
5953 tree_code code, internal_fn reduc_fn,
5954 tree ops[3], tree vectype_in,
5955 int reduc_index, vec_loop_masks *masks)
5957 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
5958 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
5959 tree vectype_out = STMT_VINFO_VECTYPE (stmt_info);
5960 stmt_vec_info new_stmt_info = NULL;
5961 internal_fn mask_reduc_fn = get_masked_reduction_fn (reduc_fn, vectype_in);
5963 int ncopies;
5964 if (slp_node)
5965 ncopies = 1;
5966 else
5967 ncopies = vect_get_num_copies (loop_vinfo, vectype_in);
5969 gcc_assert (!nested_in_vect_loop_p (loop, stmt_info));
5970 gcc_assert (ncopies == 1);
5971 gcc_assert (TREE_CODE_LENGTH (code) == binary_op);
5972 gcc_assert (reduc_index == (code == MINUS_EXPR ? 0 : 1));
5973 gcc_assert (STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info)
5974 == FOLD_LEFT_REDUCTION);
5976 if (slp_node)
5977 gcc_assert (known_eq (TYPE_VECTOR_SUBPARTS (vectype_out),
5978 TYPE_VECTOR_SUBPARTS (vectype_in)));
5980 tree op0 = ops[1 - reduc_index];
5982 int group_size = 1;
5983 stmt_vec_info scalar_dest_def_info;
5984 auto_vec<tree> vec_oprnds0;
5985 if (slp_node)
5987 auto_vec<vec<tree> > vec_defs (2);
5988 auto_vec<tree> sops(2);
5989 sops.quick_push (ops[0]);
5990 sops.quick_push (ops[1]);
5991 vect_get_slp_defs (sops, slp_node, &vec_defs);
5992 vec_oprnds0.safe_splice (vec_defs[1 - reduc_index]);
5993 vec_defs[0].release ();
5994 vec_defs[1].release ();
5995 group_size = SLP_TREE_SCALAR_STMTS (slp_node).length ();
5996 scalar_dest_def_info = SLP_TREE_SCALAR_STMTS (slp_node)[group_size - 1];
5998 else
6000 tree loop_vec_def0 = vect_get_vec_def_for_operand (op0, stmt_info);
6001 vec_oprnds0.create (1);
6002 vec_oprnds0.quick_push (loop_vec_def0);
6003 scalar_dest_def_info = stmt_info;
6006 tree scalar_dest = gimple_assign_lhs (scalar_dest_def_info->stmt);
6007 tree scalar_type = TREE_TYPE (scalar_dest);
6008 tree reduc_var = gimple_phi_result (reduc_def_stmt);
6010 int vec_num = vec_oprnds0.length ();
6011 gcc_assert (vec_num == 1 || slp_node);
6012 tree vec_elem_type = TREE_TYPE (vectype_out);
6013 gcc_checking_assert (useless_type_conversion_p (scalar_type, vec_elem_type));
6015 tree vector_identity = NULL_TREE;
6016 if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
6017 vector_identity = build_zero_cst (vectype_out);
6019 tree scalar_dest_var = vect_create_destination_var (scalar_dest, NULL);
6020 int i;
6021 tree def0;
6022 FOR_EACH_VEC_ELT (vec_oprnds0, i, def0)
6024 gimple *new_stmt;
6025 tree mask = NULL_TREE;
6026 if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
6027 mask = vect_get_loop_mask (gsi, masks, vec_num, vectype_in, i);
6029 /* Handle MINUS by adding the negative. */
6030 if (reduc_fn != IFN_LAST && code == MINUS_EXPR)
6032 tree negated = make_ssa_name (vectype_out);
6033 new_stmt = gimple_build_assign (negated, NEGATE_EXPR, def0);
6034 gsi_insert_before (gsi, new_stmt, GSI_SAME_STMT);
6035 def0 = negated;
6038 if (mask && mask_reduc_fn == IFN_LAST)
6039 def0 = merge_with_identity (gsi, mask, vectype_out, def0,
6040 vector_identity);
6042 /* On the first iteration the input is simply the scalar phi
6043 result, and for subsequent iterations it is the output of
6044 the preceding operation. */
6045 if (reduc_fn != IFN_LAST || (mask && mask_reduc_fn != IFN_LAST))
6047 if (mask && mask_reduc_fn != IFN_LAST)
6048 new_stmt = gimple_build_call_internal (mask_reduc_fn, 3, reduc_var,
6049 def0, mask);
6050 else
6051 new_stmt = gimple_build_call_internal (reduc_fn, 2, reduc_var,
6052 def0);
6053 /* For chained SLP reductions the output of the previous reduction
6054 operation serves as the input of the next. For the final statement
6055 the output cannot be a temporary - we reuse the original
6056 scalar destination of the last statement. */
6057 if (i != vec_num - 1)
6059 gimple_set_lhs (new_stmt, scalar_dest_var);
6060 reduc_var = make_ssa_name (scalar_dest_var, new_stmt);
6061 gimple_set_lhs (new_stmt, reduc_var);
6064 else
6066 reduc_var = vect_expand_fold_left (gsi, scalar_dest_var, code,
6067 reduc_var, def0);
6068 new_stmt = SSA_NAME_DEF_STMT (reduc_var);
6069 /* Remove the statement, so that we can use the same code paths
6070 as for statements that we've just created. */
6071 gimple_stmt_iterator tmp_gsi = gsi_for_stmt (new_stmt);
6072 gsi_remove (&tmp_gsi, true);
6075 if (i == vec_num - 1)
6077 gimple_set_lhs (new_stmt, scalar_dest);
6078 new_stmt_info = vect_finish_replace_stmt (scalar_dest_def_info,
6079 new_stmt);
6081 else
6082 new_stmt_info = vect_finish_stmt_generation (scalar_dest_def_info,
6083 new_stmt, gsi);
6085 if (slp_node)
6086 SLP_TREE_VEC_STMTS (slp_node).quick_push (new_stmt_info);
6089 if (!slp_node)
6090 STMT_VINFO_VEC_STMT (stmt_info) = *vec_stmt = new_stmt_info;
6092 return true;
6095 /* Function is_nonwrapping_integer_induction.
6097 Check if STMT_VINO (which is part of loop LOOP) both increments and
6098 does not cause overflow. */
6100 static bool
6101 is_nonwrapping_integer_induction (stmt_vec_info stmt_vinfo, struct loop *loop)
6103 gphi *phi = as_a <gphi *> (stmt_vinfo->stmt);
6104 tree base = STMT_VINFO_LOOP_PHI_EVOLUTION_BASE_UNCHANGED (stmt_vinfo);
6105 tree step = STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_vinfo);
6106 tree lhs_type = TREE_TYPE (gimple_phi_result (phi));
6107 widest_int ni, max_loop_value, lhs_max;
6108 wi::overflow_type overflow = wi::OVF_NONE;
6110 /* Make sure the loop is integer based. */
6111 if (TREE_CODE (base) != INTEGER_CST
6112 || TREE_CODE (step) != INTEGER_CST)
6113 return false;
6115 /* Check that the max size of the loop will not wrap. */
6117 if (TYPE_OVERFLOW_UNDEFINED (lhs_type))
6118 return true;
6120 if (! max_stmt_executions (loop, &ni))
6121 return false;
6123 max_loop_value = wi::mul (wi::to_widest (step), ni, TYPE_SIGN (lhs_type),
6124 &overflow);
6125 if (overflow)
6126 return false;
6128 max_loop_value = wi::add (wi::to_widest (base), max_loop_value,
6129 TYPE_SIGN (lhs_type), &overflow);
6130 if (overflow)
6131 return false;
6133 return (wi::min_precision (max_loop_value, TYPE_SIGN (lhs_type))
6134 <= TYPE_PRECISION (lhs_type));
6137 /* Check if masking can be supported by inserting a conditional expression.
6138 CODE is the code for the operation. COND_FN is the conditional internal
6139 function, if it exists. VECTYPE_IN is the type of the vector input. */
6140 static bool
6141 use_mask_by_cond_expr_p (enum tree_code code, internal_fn cond_fn,
6142 tree vectype_in)
6144 if (cond_fn != IFN_LAST
6145 && direct_internal_fn_supported_p (cond_fn, vectype_in,
6146 OPTIMIZE_FOR_SPEED))
6147 return false;
6149 switch (code)
6151 case DOT_PROD_EXPR:
6152 case SAD_EXPR:
6153 return true;
6155 default:
6156 return false;
6160 /* Insert a conditional expression to enable masked vectorization. CODE is the
6161 code for the operation. VOP is the array of operands. MASK is the loop
6162 mask. GSI is a statement iterator used to place the new conditional
6163 expression. */
6164 static void
6165 build_vect_cond_expr (enum tree_code code, tree vop[3], tree mask,
6166 gimple_stmt_iterator *gsi)
6168 switch (code)
6170 case DOT_PROD_EXPR:
6172 tree vectype = TREE_TYPE (vop[1]);
6173 tree zero = build_zero_cst (vectype);
6174 tree masked_op1 = make_temp_ssa_name (vectype, NULL, "masked_op1");
6175 gassign *select = gimple_build_assign (masked_op1, VEC_COND_EXPR,
6176 mask, vop[1], zero);
6177 gsi_insert_before (gsi, select, GSI_SAME_STMT);
6178 vop[1] = masked_op1;
6179 break;
6182 case SAD_EXPR:
6184 tree vectype = TREE_TYPE (vop[1]);
6185 tree masked_op1 = make_temp_ssa_name (vectype, NULL, "masked_op1");
6186 gassign *select = gimple_build_assign (masked_op1, VEC_COND_EXPR,
6187 mask, vop[1], vop[0]);
6188 gsi_insert_before (gsi, select, GSI_SAME_STMT);
6189 vop[1] = masked_op1;
6190 break;
6193 default:
6194 gcc_unreachable ();
6198 /* Function vectorizable_reduction.
6200 Check if STMT_INFO performs a reduction operation that can be vectorized.
6201 If VEC_STMT is also passed, vectorize STMT_INFO: create a vectorized
6202 stmt to replace it, put it in VEC_STMT, and insert it at GSI.
6203 Return true if STMT_INFO is vectorizable in this way.
6205 This function also handles reduction idioms (patterns) that have been
6206 recognized in advance during vect_pattern_recog. In this case, STMT_INFO
6207 may be of this form:
6208 X = pattern_expr (arg0, arg1, ..., X)
6209 and its STMT_VINFO_RELATED_STMT points to the last stmt in the original
6210 sequence that had been detected and replaced by the pattern-stmt
6211 (STMT_INFO).
6213 This function also handles reduction of condition expressions, for example:
6214 for (int i = 0; i < N; i++)
6215 if (a[i] < value)
6216 last = a[i];
6217 This is handled by vectorising the loop and creating an additional vector
6218 containing the loop indexes for which "a[i] < value" was true. In the
6219 function epilogue this is reduced to a single max value and then used to
6220 index into the vector of results.
6222 In some cases of reduction patterns, the type of the reduction variable X is
6223 different than the type of the other arguments of STMT_INFO.
6224 In such cases, the vectype that is used when transforming STMT_INFO into
6225 a vector stmt is different than the vectype that is used to determine the
6226 vectorization factor, because it consists of a different number of elements
6227 than the actual number of elements that are being operated upon in parallel.
6229 For example, consider an accumulation of shorts into an int accumulator.
6230 On some targets it's possible to vectorize this pattern operating on 8
6231 shorts at a time (hence, the vectype for purposes of determining the
6232 vectorization factor should be V8HI); on the other hand, the vectype that
6233 is used to create the vector form is actually V4SI (the type of the result).
6235 Upon entry to this function, STMT_VINFO_VECTYPE records the vectype that
6236 indicates what is the actual level of parallelism (V8HI in the example), so
6237 that the right vectorization factor would be derived. This vectype
6238 corresponds to the type of arguments to the reduction stmt, and should *NOT*
6239 be used to create the vectorized stmt. The right vectype for the vectorized
6240 stmt is obtained from the type of the result X:
6241 get_vectype_for_scalar_type (TREE_TYPE (X))
6243 This means that, contrary to "regular" reductions (or "regular" stmts in
6244 general), the following equation:
6245 STMT_VINFO_VECTYPE == get_vectype_for_scalar_type (TREE_TYPE (X))
6246 does *NOT* necessarily hold for reduction patterns. */
6248 bool
6249 vectorizable_reduction (stmt_vec_info stmt_info, gimple_stmt_iterator *gsi,
6250 stmt_vec_info *vec_stmt, slp_tree slp_node,
6251 slp_instance slp_node_instance,
6252 stmt_vector_for_cost *cost_vec)
6254 tree vec_dest;
6255 tree scalar_dest;
6256 tree vectype_out = STMT_VINFO_VECTYPE (stmt_info);
6257 tree vectype_in = NULL_TREE;
6258 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
6259 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
6260 enum tree_code code, orig_code;
6261 internal_fn reduc_fn;
6262 machine_mode vec_mode;
6263 int op_type;
6264 optab optab;
6265 tree new_temp = NULL_TREE;
6266 enum vect_def_type dt, cond_reduc_dt = vect_unknown_def_type;
6267 stmt_vec_info cond_stmt_vinfo = NULL;
6268 enum tree_code cond_reduc_op_code = ERROR_MARK;
6269 tree scalar_type;
6270 bool is_simple_use;
6271 int i;
6272 int ncopies;
6273 int epilog_copies;
6274 stmt_vec_info prev_stmt_info, prev_phi_info;
6275 bool single_defuse_cycle = false;
6276 stmt_vec_info new_stmt_info = NULL;
6277 int j;
6278 tree ops[3];
6279 enum vect_def_type dts[3];
6280 bool nested_cycle = false, found_nested_cycle_def = false;
6281 bool double_reduc = false;
6282 basic_block def_bb;
6283 struct loop * def_stmt_loop;
6284 tree def_arg;
6285 auto_vec<tree> vec_oprnds0;
6286 auto_vec<tree> vec_oprnds1;
6287 auto_vec<tree> vec_oprnds2;
6288 auto_vec<tree> vect_defs;
6289 auto_vec<stmt_vec_info> phis;
6290 int vec_num;
6291 tree def0, tem;
6292 tree cr_index_scalar_type = NULL_TREE, cr_index_vector_type = NULL_TREE;
6293 tree cond_reduc_val = NULL_TREE;
6295 /* Make sure it was already recognized as a reduction computation. */
6296 if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_reduction_def
6297 && STMT_VINFO_DEF_TYPE (stmt_info) != vect_nested_cycle)
6298 return false;
6300 if (nested_in_vect_loop_p (loop, stmt_info))
6302 loop = loop->inner;
6303 nested_cycle = true;
6306 if (REDUC_GROUP_FIRST_ELEMENT (stmt_info))
6307 gcc_assert (slp_node
6308 && REDUC_GROUP_FIRST_ELEMENT (stmt_info) == stmt_info);
6310 if (gphi *phi = dyn_cast <gphi *> (stmt_info->stmt))
6312 tree phi_result = gimple_phi_result (phi);
6313 /* Analysis is fully done on the reduction stmt invocation. */
6314 if (! vec_stmt)
6316 if (slp_node)
6317 slp_node_instance->reduc_phis = slp_node;
6319 STMT_VINFO_TYPE (stmt_info) = reduc_vec_info_type;
6320 return true;
6323 if (STMT_VINFO_REDUC_TYPE (stmt_info) == FOLD_LEFT_REDUCTION)
6324 /* Leave the scalar phi in place. Note that checking
6325 STMT_VINFO_VEC_REDUCTION_TYPE (as below) only works
6326 for reductions involving a single statement. */
6327 return true;
6329 stmt_vec_info reduc_stmt_info = STMT_VINFO_REDUC_DEF (stmt_info);
6330 reduc_stmt_info = vect_stmt_to_vectorize (reduc_stmt_info);
6332 if (STMT_VINFO_VEC_REDUCTION_TYPE (reduc_stmt_info)
6333 == EXTRACT_LAST_REDUCTION)
6334 /* Leave the scalar phi in place. */
6335 return true;
6337 gassign *reduc_stmt = as_a <gassign *> (reduc_stmt_info->stmt);
6338 code = gimple_assign_rhs_code (reduc_stmt);
6339 for (unsigned k = 1; k < gimple_num_ops (reduc_stmt); ++k)
6341 tree op = gimple_op (reduc_stmt, k);
6342 if (op == phi_result)
6343 continue;
6344 if (k == 1 && code == COND_EXPR)
6345 continue;
6346 bool is_simple_use = vect_is_simple_use (op, loop_vinfo, &dt);
6347 gcc_assert (is_simple_use);
6348 if (dt == vect_constant_def || dt == vect_external_def)
6349 continue;
6350 if (!vectype_in
6351 || (GET_MODE_SIZE (SCALAR_TYPE_MODE (TREE_TYPE (vectype_in)))
6352 < GET_MODE_SIZE (SCALAR_TYPE_MODE (TREE_TYPE (op)))))
6353 vectype_in = get_vectype_for_scalar_type (TREE_TYPE (op));
6354 break;
6356 /* For a nested cycle we might end up with an operation like
6357 phi_result * phi_result. */
6358 if (!vectype_in)
6359 vectype_in = STMT_VINFO_VECTYPE (stmt_info);
6360 gcc_assert (vectype_in);
6362 if (slp_node)
6363 ncopies = 1;
6364 else
6365 ncopies = vect_get_num_copies (loop_vinfo, vectype_in);
6367 stmt_vec_info use_stmt_info;
6368 if (ncopies > 1
6369 && STMT_VINFO_RELEVANT (reduc_stmt_info) <= vect_used_only_live
6370 && (use_stmt_info = loop_vinfo->lookup_single_use (phi_result))
6371 && vect_stmt_to_vectorize (use_stmt_info) == reduc_stmt_info)
6372 single_defuse_cycle = true;
6374 /* Create the destination vector */
6375 scalar_dest = gimple_assign_lhs (reduc_stmt);
6376 vec_dest = vect_create_destination_var (scalar_dest, vectype_out);
6378 if (slp_node)
6379 /* The size vect_schedule_slp_instance computes is off for us. */
6380 vec_num = vect_get_num_vectors
6381 (LOOP_VINFO_VECT_FACTOR (loop_vinfo)
6382 * SLP_TREE_SCALAR_STMTS (slp_node).length (),
6383 vectype_in);
6384 else
6385 vec_num = 1;
6387 /* Generate the reduction PHIs upfront. */
6388 prev_phi_info = NULL;
6389 for (j = 0; j < ncopies; j++)
6391 if (j == 0 || !single_defuse_cycle)
6393 for (i = 0; i < vec_num; i++)
6395 /* Create the reduction-phi that defines the reduction
6396 operand. */
6397 gimple *new_phi = create_phi_node (vec_dest, loop->header);
6398 stmt_vec_info new_phi_info = loop_vinfo->add_stmt (new_phi);
6400 if (slp_node)
6401 SLP_TREE_VEC_STMTS (slp_node).quick_push (new_phi_info);
6402 else
6404 if (j == 0)
6405 STMT_VINFO_VEC_STMT (stmt_info)
6406 = *vec_stmt = new_phi_info;
6407 else
6408 STMT_VINFO_RELATED_STMT (prev_phi_info) = new_phi_info;
6409 prev_phi_info = new_phi_info;
6415 return true;
6418 /* 1. Is vectorizable reduction? */
6419 /* Not supportable if the reduction variable is used in the loop, unless
6420 it's a reduction chain. */
6421 if (STMT_VINFO_RELEVANT (stmt_info) > vect_used_in_outer
6422 && !REDUC_GROUP_FIRST_ELEMENT (stmt_info))
6423 return false;
6425 /* Reductions that are not used even in an enclosing outer-loop,
6426 are expected to be "live" (used out of the loop). */
6427 if (STMT_VINFO_RELEVANT (stmt_info) == vect_unused_in_scope
6428 && !STMT_VINFO_LIVE_P (stmt_info))
6429 return false;
6431 /* 2. Has this been recognized as a reduction pattern?
6433 Check if STMT represents a pattern that has been recognized
6434 in earlier analysis stages. For stmts that represent a pattern,
6435 the STMT_VINFO_RELATED_STMT field records the last stmt in
6436 the original sequence that constitutes the pattern. */
6438 stmt_vec_info orig_stmt_info = STMT_VINFO_RELATED_STMT (stmt_info);
6439 if (orig_stmt_info)
6441 gcc_assert (STMT_VINFO_IN_PATTERN_P (orig_stmt_info));
6442 gcc_assert (!STMT_VINFO_IN_PATTERN_P (stmt_info));
6445 /* 3. Check the operands of the operation. The first operands are defined
6446 inside the loop body. The last operand is the reduction variable,
6447 which is defined by the loop-header-phi. */
6449 gassign *stmt = as_a <gassign *> (stmt_info->stmt);
6451 /* Flatten RHS. */
6452 switch (get_gimple_rhs_class (gimple_assign_rhs_code (stmt)))
6454 case GIMPLE_BINARY_RHS:
6455 code = gimple_assign_rhs_code (stmt);
6456 op_type = TREE_CODE_LENGTH (code);
6457 gcc_assert (op_type == binary_op);
6458 ops[0] = gimple_assign_rhs1 (stmt);
6459 ops[1] = gimple_assign_rhs2 (stmt);
6460 break;
6462 case GIMPLE_TERNARY_RHS:
6463 code = gimple_assign_rhs_code (stmt);
6464 op_type = TREE_CODE_LENGTH (code);
6465 gcc_assert (op_type == ternary_op);
6466 ops[0] = gimple_assign_rhs1 (stmt);
6467 ops[1] = gimple_assign_rhs2 (stmt);
6468 ops[2] = gimple_assign_rhs3 (stmt);
6469 break;
6471 case GIMPLE_UNARY_RHS:
6472 return false;
6474 default:
6475 gcc_unreachable ();
6478 if (code == COND_EXPR && slp_node)
6479 return false;
6481 scalar_dest = gimple_assign_lhs (stmt);
6482 scalar_type = TREE_TYPE (scalar_dest);
6483 if (!POINTER_TYPE_P (scalar_type) && !INTEGRAL_TYPE_P (scalar_type)
6484 && !SCALAR_FLOAT_TYPE_P (scalar_type))
6485 return false;
6487 /* Do not try to vectorize bit-precision reductions. */
6488 if (!type_has_mode_precision_p (scalar_type))
6489 return false;
6491 /* All uses but the last are expected to be defined in the loop.
6492 The last use is the reduction variable. In case of nested cycle this
6493 assumption is not true: we use reduc_index to record the index of the
6494 reduction variable. */
6495 stmt_vec_info reduc_def_info;
6496 if (orig_stmt_info)
6497 reduc_def_info = STMT_VINFO_REDUC_DEF (orig_stmt_info);
6498 else
6499 reduc_def_info = STMT_VINFO_REDUC_DEF (stmt_info);
6500 gcc_assert (reduc_def_info);
6501 gphi *reduc_def_phi = as_a <gphi *> (reduc_def_info->stmt);
6502 tree reduc_def = PHI_RESULT (reduc_def_phi);
6503 int reduc_index = -1;
6504 for (i = 0; i < op_type; i++)
6506 /* The condition of COND_EXPR is checked in vectorizable_condition(). */
6507 if (i == 0 && code == COND_EXPR)
6508 continue;
6510 stmt_vec_info def_stmt_info;
6511 is_simple_use = vect_is_simple_use (ops[i], loop_vinfo, &dts[i], &tem,
6512 &def_stmt_info);
6513 dt = dts[i];
6514 gcc_assert (is_simple_use);
6515 if (dt == vect_reduction_def
6516 && ops[i] == reduc_def)
6518 reduc_index = i;
6519 continue;
6521 else if (tem)
6523 /* To properly compute ncopies we are interested in the widest
6524 input type in case we're looking at a widening accumulation. */
6525 if (!vectype_in
6526 || (GET_MODE_SIZE (SCALAR_TYPE_MODE (TREE_TYPE (vectype_in)))
6527 < GET_MODE_SIZE (SCALAR_TYPE_MODE (TREE_TYPE (tem)))))
6528 vectype_in = tem;
6531 if (dt != vect_internal_def
6532 && dt != vect_external_def
6533 && dt != vect_constant_def
6534 && dt != vect_induction_def
6535 && !(dt == vect_nested_cycle && nested_cycle))
6536 return false;
6538 if (dt == vect_nested_cycle
6539 && ops[i] == reduc_def)
6541 found_nested_cycle_def = true;
6542 reduc_index = i;
6545 if (i == 1 && code == COND_EXPR)
6547 /* Record how value of COND_EXPR is defined. */
6548 if (dt == vect_constant_def)
6550 cond_reduc_dt = dt;
6551 cond_reduc_val = ops[i];
6553 if (dt == vect_induction_def
6554 && def_stmt_info
6555 && is_nonwrapping_integer_induction (def_stmt_info, loop))
6557 cond_reduc_dt = dt;
6558 cond_stmt_vinfo = def_stmt_info;
6563 if (!vectype_in)
6564 vectype_in = vectype_out;
6566 /* When vectorizing a reduction chain w/o SLP the reduction PHI is not
6567 directly used in stmt. */
6568 if (reduc_index == -1)
6570 if (STMT_VINFO_REDUC_TYPE (stmt_info) == FOLD_LEFT_REDUCTION)
6572 if (dump_enabled_p ())
6573 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6574 "in-order reduction chain without SLP.\n");
6575 return false;
6579 if (!(reduc_index == -1
6580 || dts[reduc_index] == vect_reduction_def
6581 || dts[reduc_index] == vect_nested_cycle
6582 || ((dts[reduc_index] == vect_internal_def
6583 || dts[reduc_index] == vect_external_def
6584 || dts[reduc_index] == vect_constant_def
6585 || dts[reduc_index] == vect_induction_def)
6586 && nested_cycle && found_nested_cycle_def)))
6588 /* For pattern recognized stmts, orig_stmt might be a reduction,
6589 but some helper statements for the pattern might not, or
6590 might be COND_EXPRs with reduction uses in the condition. */
6591 gcc_assert (orig_stmt_info);
6592 return false;
6595 /* PHIs should not participate in patterns. */
6596 gcc_assert (!STMT_VINFO_RELATED_STMT (reduc_def_info));
6597 enum vect_reduction_type v_reduc_type
6598 = STMT_VINFO_REDUC_TYPE (reduc_def_info);
6599 stmt_vec_info tmp = STMT_VINFO_REDUC_DEF (reduc_def_info);
6601 STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info) = v_reduc_type;
6602 /* If we have a condition reduction, see if we can simplify it further. */
6603 if (v_reduc_type == COND_REDUCTION)
6605 /* TODO: We can't yet handle reduction chains, since we need to treat
6606 each COND_EXPR in the chain specially, not just the last one.
6607 E.g. for:
6609 x_1 = PHI <x_3, ...>
6610 x_2 = a_2 ? ... : x_1;
6611 x_3 = a_3 ? ... : x_2;
6613 we're interested in the last element in x_3 for which a_2 || a_3
6614 is true, whereas the current reduction chain handling would
6615 vectorize x_2 as a normal VEC_COND_EXPR and only treat x_3
6616 as a reduction operation. */
6617 if (reduc_index == -1)
6619 if (dump_enabled_p ())
6620 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6621 "conditional reduction chains not supported\n");
6622 return false;
6625 /* vect_is_simple_reduction ensured that operand 2 is the
6626 loop-carried operand. */
6627 gcc_assert (reduc_index == 2);
6629 /* Loop peeling modifies initial value of reduction PHI, which
6630 makes the reduction stmt to be transformed different to the
6631 original stmt analyzed. We need to record reduction code for
6632 CONST_COND_REDUCTION type reduction at analyzing stage, thus
6633 it can be used directly at transform stage. */
6634 if (STMT_VINFO_VEC_CONST_COND_REDUC_CODE (stmt_info) == MAX_EXPR
6635 || STMT_VINFO_VEC_CONST_COND_REDUC_CODE (stmt_info) == MIN_EXPR)
6637 /* Also set the reduction type to CONST_COND_REDUCTION. */
6638 gcc_assert (cond_reduc_dt == vect_constant_def);
6639 STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info) = CONST_COND_REDUCTION;
6641 else if (direct_internal_fn_supported_p (IFN_FOLD_EXTRACT_LAST,
6642 vectype_in, OPTIMIZE_FOR_SPEED))
6644 if (dump_enabled_p ())
6645 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6646 "optimizing condition reduction with"
6647 " FOLD_EXTRACT_LAST.\n");
6648 STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info) = EXTRACT_LAST_REDUCTION;
6650 else if (cond_reduc_dt == vect_induction_def)
6652 tree base
6653 = STMT_VINFO_LOOP_PHI_EVOLUTION_BASE_UNCHANGED (cond_stmt_vinfo);
6654 tree step = STMT_VINFO_LOOP_PHI_EVOLUTION_PART (cond_stmt_vinfo);
6656 gcc_assert (TREE_CODE (base) == INTEGER_CST
6657 && TREE_CODE (step) == INTEGER_CST);
6658 cond_reduc_val = NULL_TREE;
6659 /* Find a suitable value, for MAX_EXPR below base, for MIN_EXPR
6660 above base; punt if base is the minimum value of the type for
6661 MAX_EXPR or maximum value of the type for MIN_EXPR for now. */
6662 if (tree_int_cst_sgn (step) == -1)
6664 cond_reduc_op_code = MIN_EXPR;
6665 if (tree_int_cst_sgn (base) == -1)
6666 cond_reduc_val = build_int_cst (TREE_TYPE (base), 0);
6667 else if (tree_int_cst_lt (base,
6668 TYPE_MAX_VALUE (TREE_TYPE (base))))
6669 cond_reduc_val
6670 = int_const_binop (PLUS_EXPR, base, integer_one_node);
6672 else
6674 cond_reduc_op_code = MAX_EXPR;
6675 if (tree_int_cst_sgn (base) == 1)
6676 cond_reduc_val = build_int_cst (TREE_TYPE (base), 0);
6677 else if (tree_int_cst_lt (TYPE_MIN_VALUE (TREE_TYPE (base)),
6678 base))
6679 cond_reduc_val
6680 = int_const_binop (MINUS_EXPR, base, integer_one_node);
6682 if (cond_reduc_val)
6684 if (dump_enabled_p ())
6685 dump_printf_loc (MSG_NOTE, vect_location,
6686 "condition expression based on "
6687 "integer induction.\n");
6688 STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info)
6689 = INTEGER_INDUC_COND_REDUCTION;
6692 else if (cond_reduc_dt == vect_constant_def)
6694 enum vect_def_type cond_initial_dt;
6695 gimple *def_stmt = SSA_NAME_DEF_STMT (ops[reduc_index]);
6696 tree cond_initial_val
6697 = PHI_ARG_DEF_FROM_EDGE (def_stmt, loop_preheader_edge (loop));
6699 gcc_assert (cond_reduc_val != NULL_TREE);
6700 vect_is_simple_use (cond_initial_val, loop_vinfo, &cond_initial_dt);
6701 if (cond_initial_dt == vect_constant_def
6702 && types_compatible_p (TREE_TYPE (cond_initial_val),
6703 TREE_TYPE (cond_reduc_val)))
6705 tree e = fold_binary (LE_EXPR, boolean_type_node,
6706 cond_initial_val, cond_reduc_val);
6707 if (e && (integer_onep (e) || integer_zerop (e)))
6709 if (dump_enabled_p ())
6710 dump_printf_loc (MSG_NOTE, vect_location,
6711 "condition expression based on "
6712 "compile time constant.\n");
6713 /* Record reduction code at analysis stage. */
6714 STMT_VINFO_VEC_CONST_COND_REDUC_CODE (stmt_info)
6715 = integer_onep (e) ? MAX_EXPR : MIN_EXPR;
6716 STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info)
6717 = CONST_COND_REDUCTION;
6723 if (orig_stmt_info)
6724 gcc_assert (tmp == orig_stmt_info
6725 || REDUC_GROUP_FIRST_ELEMENT (tmp) == orig_stmt_info);
6726 else
6727 /* We changed STMT to be the first stmt in reduction chain, hence we
6728 check that in this case the first element in the chain is STMT. */
6729 gcc_assert (tmp == stmt_info
6730 || REDUC_GROUP_FIRST_ELEMENT (tmp) == stmt_info);
6732 if (STMT_VINFO_LIVE_P (reduc_def_info))
6733 return false;
6735 if (slp_node)
6736 ncopies = 1;
6737 else
6738 ncopies = vect_get_num_copies (loop_vinfo, vectype_in);
6740 gcc_assert (ncopies >= 1);
6742 vec_mode = TYPE_MODE (vectype_in);
6743 poly_uint64 nunits_out = TYPE_VECTOR_SUBPARTS (vectype_out);
6745 if (nested_cycle)
6747 def_bb = gimple_bb (reduc_def_phi);
6748 def_stmt_loop = def_bb->loop_father;
6749 def_arg = PHI_ARG_DEF_FROM_EDGE (reduc_def_phi,
6750 loop_preheader_edge (def_stmt_loop));
6751 stmt_vec_info def_arg_stmt_info = loop_vinfo->lookup_def (def_arg);
6752 if (def_arg_stmt_info
6753 && (STMT_VINFO_DEF_TYPE (def_arg_stmt_info)
6754 == vect_double_reduction_def))
6755 double_reduc = true;
6758 vect_reduction_type reduction_type
6759 = STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info);
6760 if ((double_reduc || reduction_type != TREE_CODE_REDUCTION)
6761 && ncopies > 1)
6763 if (dump_enabled_p ())
6764 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6765 "multiple types in double reduction or condition "
6766 "reduction.\n");
6767 return false;
6770 if (code == COND_EXPR)
6772 /* Only call during the analysis stage, otherwise we'll lose
6773 STMT_VINFO_TYPE. */
6774 if (!vec_stmt && !vectorizable_condition (stmt_info, gsi, NULL,
6775 true, NULL, cost_vec))
6777 if (dump_enabled_p ())
6778 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6779 "unsupported condition in reduction\n");
6780 return false;
6783 else if (code == LSHIFT_EXPR || code == RSHIFT_EXPR
6784 || code == LROTATE_EXPR || code == RROTATE_EXPR)
6786 /* Only call during the analysis stage, otherwise we'll lose
6787 STMT_VINFO_TYPE. We only support this for nested cycles
6788 without double reductions at the moment. */
6789 if (!nested_cycle
6790 || double_reduc
6791 || (!vec_stmt && !vectorizable_shift (stmt_info, gsi, NULL,
6792 NULL, cost_vec)))
6794 if (dump_enabled_p ())
6795 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6796 "unsupported shift or rotation in reduction\n");
6797 return false;
6800 else
6802 /* 4. Supportable by target? */
6804 /* 4.1. check support for the operation in the loop */
6805 optab = optab_for_tree_code (code, vectype_in, optab_default);
6806 if (!optab)
6808 if (dump_enabled_p ())
6809 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6810 "no optab.\n");
6812 return false;
6815 if (optab_handler (optab, vec_mode) == CODE_FOR_nothing)
6817 if (dump_enabled_p ())
6818 dump_printf (MSG_NOTE, "op not supported by target.\n");
6820 if (maybe_ne (GET_MODE_SIZE (vec_mode), UNITS_PER_WORD)
6821 || !vect_worthwhile_without_simd_p (loop_vinfo, code))
6822 return false;
6824 if (dump_enabled_p ())
6825 dump_printf (MSG_NOTE, "proceeding using word mode.\n");
6828 /* Worthwhile without SIMD support? */
6829 if (!VECTOR_MODE_P (TYPE_MODE (vectype_in))
6830 && !vect_worthwhile_without_simd_p (loop_vinfo, code))
6832 if (dump_enabled_p ())
6833 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6834 "not worthwhile without SIMD support.\n");
6836 return false;
6840 /* 4.2. Check support for the epilog operation.
6842 If STMT represents a reduction pattern, then the type of the
6843 reduction variable may be different than the type of the rest
6844 of the arguments. For example, consider the case of accumulation
6845 of shorts into an int accumulator; The original code:
6846 S1: int_a = (int) short_a;
6847 orig_stmt-> S2: int_acc = plus <int_a ,int_acc>;
6849 was replaced with:
6850 STMT: int_acc = widen_sum <short_a, int_acc>
6852 This means that:
6853 1. The tree-code that is used to create the vector operation in the
6854 epilog code (that reduces the partial results) is not the
6855 tree-code of STMT, but is rather the tree-code of the original
6856 stmt from the pattern that STMT is replacing. I.e, in the example
6857 above we want to use 'widen_sum' in the loop, but 'plus' in the
6858 epilog.
6859 2. The type (mode) we use to check available target support
6860 for the vector operation to be created in the *epilog*, is
6861 determined by the type of the reduction variable (in the example
6862 above we'd check this: optab_handler (plus_optab, vect_int_mode)).
6863 However the type (mode) we use to check available target support
6864 for the vector operation to be created *inside the loop*, is
6865 determined by the type of the other arguments to STMT (in the
6866 example we'd check this: optab_handler (widen_sum_optab,
6867 vect_short_mode)).
6869 This is contrary to "regular" reductions, in which the types of all
6870 the arguments are the same as the type of the reduction variable.
6871 For "regular" reductions we can therefore use the same vector type
6872 (and also the same tree-code) when generating the epilog code and
6873 when generating the code inside the loop. */
6875 if (orig_stmt_info
6876 && (reduction_type == TREE_CODE_REDUCTION
6877 || reduction_type == FOLD_LEFT_REDUCTION))
6879 /* This is a reduction pattern: get the vectype from the type of the
6880 reduction variable, and get the tree-code from orig_stmt. */
6881 orig_code = gimple_assign_rhs_code (orig_stmt_info->stmt);
6882 gcc_assert (vectype_out);
6883 vec_mode = TYPE_MODE (vectype_out);
6885 else
6887 /* Regular reduction: the same vectype and tree-code as used for
6888 the vector code inside the loop can be used for the epilog code. */
6889 orig_code = code;
6891 if (code == MINUS_EXPR)
6892 orig_code = PLUS_EXPR;
6894 /* For simple condition reductions, replace with the actual expression
6895 we want to base our reduction around. */
6896 if (reduction_type == CONST_COND_REDUCTION)
6898 orig_code = STMT_VINFO_VEC_CONST_COND_REDUC_CODE (stmt_info);
6899 gcc_assert (orig_code == MAX_EXPR || orig_code == MIN_EXPR);
6901 else if (reduction_type == INTEGER_INDUC_COND_REDUCTION)
6902 orig_code = cond_reduc_op_code;
6905 reduc_fn = IFN_LAST;
6907 if (reduction_type == TREE_CODE_REDUCTION
6908 || reduction_type == FOLD_LEFT_REDUCTION
6909 || reduction_type == INTEGER_INDUC_COND_REDUCTION
6910 || reduction_type == CONST_COND_REDUCTION)
6912 if (reduction_type == FOLD_LEFT_REDUCTION
6913 ? fold_left_reduction_fn (orig_code, &reduc_fn)
6914 : reduction_fn_for_scalar_code (orig_code, &reduc_fn))
6916 if (reduc_fn != IFN_LAST
6917 && !direct_internal_fn_supported_p (reduc_fn, vectype_out,
6918 OPTIMIZE_FOR_SPEED))
6920 if (dump_enabled_p ())
6921 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6922 "reduc op not supported by target.\n");
6924 reduc_fn = IFN_LAST;
6927 else
6929 if (!nested_cycle || double_reduc)
6931 if (dump_enabled_p ())
6932 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6933 "no reduc code for scalar code.\n");
6935 return false;
6939 else if (reduction_type == COND_REDUCTION)
6941 int scalar_precision
6942 = GET_MODE_PRECISION (SCALAR_TYPE_MODE (scalar_type));
6943 cr_index_scalar_type = make_unsigned_type (scalar_precision);
6944 cr_index_vector_type = build_vector_type (cr_index_scalar_type,
6945 nunits_out);
6947 if (direct_internal_fn_supported_p (IFN_REDUC_MAX, cr_index_vector_type,
6948 OPTIMIZE_FOR_SPEED))
6949 reduc_fn = IFN_REDUC_MAX;
6952 if (reduction_type != EXTRACT_LAST_REDUCTION
6953 && (!nested_cycle || double_reduc)
6954 && reduc_fn == IFN_LAST
6955 && !nunits_out.is_constant ())
6957 if (dump_enabled_p ())
6958 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6959 "missing target support for reduction on"
6960 " variable-length vectors.\n");
6961 return false;
6964 /* For SLP reductions, see if there is a neutral value we can use. */
6965 tree neutral_op = NULL_TREE;
6966 if (slp_node)
6967 neutral_op = neutral_op_for_slp_reduction
6968 (slp_node_instance->reduc_phis, code,
6969 REDUC_GROUP_FIRST_ELEMENT (stmt_info) != NULL);
6971 if (double_reduc && reduction_type == FOLD_LEFT_REDUCTION)
6973 /* We can't support in-order reductions of code such as this:
6975 for (int i = 0; i < n1; ++i)
6976 for (int j = 0; j < n2; ++j)
6977 l += a[j];
6979 since GCC effectively transforms the loop when vectorizing:
6981 for (int i = 0; i < n1 / VF; ++i)
6982 for (int j = 0; j < n2; ++j)
6983 for (int k = 0; k < VF; ++k)
6984 l += a[j];
6986 which is a reassociation of the original operation. */
6987 if (dump_enabled_p ())
6988 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6989 "in-order double reduction not supported.\n");
6991 return false;
6994 if (reduction_type == FOLD_LEFT_REDUCTION
6995 && slp_node
6996 && !REDUC_GROUP_FIRST_ELEMENT (stmt_info))
6998 /* We cannot use in-order reductions in this case because there is
6999 an implicit reassociation of the operations involved. */
7000 if (dump_enabled_p ())
7001 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7002 "in-order unchained SLP reductions not supported.\n");
7003 return false;
7006 /* For double reductions, and for SLP reductions with a neutral value,
7007 we construct a variable-length initial vector by loading a vector
7008 full of the neutral value and then shift-and-inserting the start
7009 values into the low-numbered elements. */
7010 if ((double_reduc || neutral_op)
7011 && !nunits_out.is_constant ()
7012 && !direct_internal_fn_supported_p (IFN_VEC_SHL_INSERT,
7013 vectype_out, OPTIMIZE_FOR_SPEED))
7015 if (dump_enabled_p ())
7016 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7017 "reduction on variable-length vectors requires"
7018 " target support for a vector-shift-and-insert"
7019 " operation.\n");
7020 return false;
7023 /* Check extra constraints for variable-length unchained SLP reductions. */
7024 if (STMT_SLP_TYPE (stmt_info)
7025 && !REDUC_GROUP_FIRST_ELEMENT (stmt_info)
7026 && !nunits_out.is_constant ())
7028 /* We checked above that we could build the initial vector when
7029 there's a neutral element value. Check here for the case in
7030 which each SLP statement has its own initial value and in which
7031 that value needs to be repeated for every instance of the
7032 statement within the initial vector. */
7033 unsigned int group_size = SLP_TREE_SCALAR_STMTS (slp_node).length ();
7034 scalar_mode elt_mode = SCALAR_TYPE_MODE (TREE_TYPE (vectype_out));
7035 if (!neutral_op
7036 && !can_duplicate_and_interleave_p (group_size, elt_mode))
7038 if (dump_enabled_p ())
7039 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7040 "unsupported form of SLP reduction for"
7041 " variable-length vectors: cannot build"
7042 " initial vector.\n");
7043 return false;
7045 /* The epilogue code relies on the number of elements being a multiple
7046 of the group size. The duplicate-and-interleave approach to setting
7047 up the initial vector does too. */
7048 if (!multiple_p (nunits_out, group_size))
7050 if (dump_enabled_p ())
7051 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7052 "unsupported form of SLP reduction for"
7053 " variable-length vectors: the vector size"
7054 " is not a multiple of the number of results.\n");
7055 return false;
7059 /* In case of widening multiplication by a constant, we update the type
7060 of the constant to be the type of the other operand. We check that the
7061 constant fits the type in the pattern recognition pass. */
7062 if (code == DOT_PROD_EXPR
7063 && !types_compatible_p (TREE_TYPE (ops[0]), TREE_TYPE (ops[1])))
7065 if (TREE_CODE (ops[0]) == INTEGER_CST)
7066 ops[0] = fold_convert (TREE_TYPE (ops[1]), ops[0]);
7067 else if (TREE_CODE (ops[1]) == INTEGER_CST)
7068 ops[1] = fold_convert (TREE_TYPE (ops[0]), ops[1]);
7069 else
7071 if (dump_enabled_p ())
7072 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7073 "invalid types in dot-prod\n");
7075 return false;
7079 if (reduction_type == COND_REDUCTION)
7081 widest_int ni;
7083 if (! max_loop_iterations (loop, &ni))
7085 if (dump_enabled_p ())
7086 dump_printf_loc (MSG_NOTE, vect_location,
7087 "loop count not known, cannot create cond "
7088 "reduction.\n");
7089 return false;
7091 /* Convert backedges to iterations. */
7092 ni += 1;
7094 /* The additional index will be the same type as the condition. Check
7095 that the loop can fit into this less one (because we'll use up the
7096 zero slot for when there are no matches). */
7097 tree max_index = TYPE_MAX_VALUE (cr_index_scalar_type);
7098 if (wi::geu_p (ni, wi::to_widest (max_index)))
7100 if (dump_enabled_p ())
7101 dump_printf_loc (MSG_NOTE, vect_location,
7102 "loop size is greater than data size.\n");
7103 return false;
7107 /* In case the vectorization factor (VF) is bigger than the number
7108 of elements that we can fit in a vectype (nunits), we have to generate
7109 more than one vector stmt - i.e - we need to "unroll" the
7110 vector stmt by a factor VF/nunits. For more details see documentation
7111 in vectorizable_operation. */
7113 /* If the reduction is used in an outer loop we need to generate
7114 VF intermediate results, like so (e.g. for ncopies=2):
7115 r0 = phi (init, r0)
7116 r1 = phi (init, r1)
7117 r0 = x0 + r0;
7118 r1 = x1 + r1;
7119 (i.e. we generate VF results in 2 registers).
7120 In this case we have a separate def-use cycle for each copy, and therefore
7121 for each copy we get the vector def for the reduction variable from the
7122 respective phi node created for this copy.
7124 Otherwise (the reduction is unused in the loop nest), we can combine
7125 together intermediate results, like so (e.g. for ncopies=2):
7126 r = phi (init, r)
7127 r = x0 + r;
7128 r = x1 + r;
7129 (i.e. we generate VF/2 results in a single register).
7130 In this case for each copy we get the vector def for the reduction variable
7131 from the vectorized reduction operation generated in the previous iteration.
7133 This only works when we see both the reduction PHI and its only consumer
7134 in vectorizable_reduction and there are no intermediate stmts
7135 participating. */
7136 stmt_vec_info use_stmt_info;
7137 tree reduc_phi_result = gimple_phi_result (reduc_def_phi);
7138 if (ncopies > 1
7139 && (STMT_VINFO_RELEVANT (stmt_info) <= vect_used_only_live)
7140 && (use_stmt_info = loop_vinfo->lookup_single_use (reduc_phi_result))
7141 && vect_stmt_to_vectorize (use_stmt_info) == stmt_info)
7143 single_defuse_cycle = true;
7144 epilog_copies = 1;
7146 else
7147 epilog_copies = ncopies;
7149 /* If the reduction stmt is one of the patterns that have lane
7150 reduction embedded we cannot handle the case of ! single_defuse_cycle. */
7151 if ((ncopies > 1
7152 && ! single_defuse_cycle)
7153 && (code == DOT_PROD_EXPR
7154 || code == WIDEN_SUM_EXPR
7155 || code == SAD_EXPR))
7157 if (dump_enabled_p ())
7158 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7159 "multi def-use cycle not possible for lane-reducing "
7160 "reduction operation\n");
7161 return false;
7164 if (slp_node)
7165 vec_num = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node);
7166 else
7167 vec_num = 1;
7169 internal_fn cond_fn = get_conditional_internal_fn (code);
7170 vec_loop_masks *masks = &LOOP_VINFO_MASKS (loop_vinfo);
7171 bool mask_by_cond_expr = use_mask_by_cond_expr_p (code, cond_fn, vectype_in);
7173 if (!vec_stmt) /* transformation not required. */
7175 vect_model_reduction_cost (stmt_info, reduc_fn, ncopies, cost_vec);
7176 if (loop_vinfo && LOOP_VINFO_CAN_FULLY_MASK_P (loop_vinfo))
7178 if (reduction_type != FOLD_LEFT_REDUCTION
7179 && !mask_by_cond_expr
7180 && (cond_fn == IFN_LAST
7181 || !direct_internal_fn_supported_p (cond_fn, vectype_in,
7182 OPTIMIZE_FOR_SPEED)))
7184 if (dump_enabled_p ())
7185 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7186 "can't use a fully-masked loop because no"
7187 " conditional operation is available.\n");
7188 LOOP_VINFO_CAN_FULLY_MASK_P (loop_vinfo) = false;
7190 else if (reduc_index == -1)
7192 if (dump_enabled_p ())
7193 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7194 "can't use a fully-masked loop for chained"
7195 " reductions.\n");
7196 LOOP_VINFO_CAN_FULLY_MASK_P (loop_vinfo) = false;
7198 else
7199 vect_record_loop_mask (loop_vinfo, masks, ncopies * vec_num,
7200 vectype_in);
7202 if (dump_enabled_p ()
7203 && reduction_type == FOLD_LEFT_REDUCTION)
7204 dump_printf_loc (MSG_NOTE, vect_location,
7205 "using an in-order (fold-left) reduction.\n");
7206 STMT_VINFO_TYPE (stmt_info) = reduc_vec_info_type;
7207 return true;
7210 /* Transform. */
7212 if (dump_enabled_p ())
7213 dump_printf_loc (MSG_NOTE, vect_location, "transform reduction.\n");
7215 /* FORNOW: Multiple types are not supported for condition. */
7216 if (code == COND_EXPR)
7217 gcc_assert (ncopies == 1);
7219 bool masked_loop_p = LOOP_VINFO_FULLY_MASKED_P (loop_vinfo);
7221 if (reduction_type == FOLD_LEFT_REDUCTION)
7222 return vectorize_fold_left_reduction
7223 (stmt_info, gsi, vec_stmt, slp_node, reduc_def_phi, code,
7224 reduc_fn, ops, vectype_in, reduc_index, masks);
7226 if (reduction_type == EXTRACT_LAST_REDUCTION)
7228 gcc_assert (!slp_node);
7229 return vectorizable_condition (stmt_info, gsi, vec_stmt,
7230 true, NULL, NULL);
7233 /* Create the destination vector */
7234 vec_dest = vect_create_destination_var (scalar_dest, vectype_out);
7236 prev_stmt_info = NULL;
7237 prev_phi_info = NULL;
7238 if (!slp_node)
7240 vec_oprnds0.create (1);
7241 vec_oprnds1.create (1);
7242 if (op_type == ternary_op)
7243 vec_oprnds2.create (1);
7246 phis.create (vec_num);
7247 vect_defs.create (vec_num);
7248 if (!slp_node)
7249 vect_defs.quick_push (NULL_TREE);
7251 if (slp_node)
7252 phis.splice (SLP_TREE_VEC_STMTS (slp_node_instance->reduc_phis));
7253 else
7254 phis.quick_push (STMT_VINFO_VEC_STMT (reduc_def_info));
7256 for (j = 0; j < ncopies; j++)
7258 if (code == COND_EXPR)
7260 gcc_assert (!slp_node);
7261 vectorizable_condition (stmt_info, gsi, vec_stmt,
7262 true, NULL, NULL);
7263 break;
7265 if (code == LSHIFT_EXPR
7266 || code == RSHIFT_EXPR)
7268 vectorizable_shift (stmt_info, gsi, vec_stmt, slp_node, NULL);
7269 break;
7272 /* Handle uses. */
7273 if (j == 0)
7275 if (slp_node)
7277 /* Get vec defs for all the operands except the reduction index,
7278 ensuring the ordering of the ops in the vector is kept. */
7279 auto_vec<tree, 3> slp_ops;
7280 auto_vec<vec<tree>, 3> vec_defs;
7282 slp_ops.quick_push (ops[0]);
7283 slp_ops.quick_push (ops[1]);
7284 if (op_type == ternary_op)
7285 slp_ops.quick_push (ops[2]);
7287 vect_get_slp_defs (slp_ops, slp_node, &vec_defs);
7289 vec_oprnds0.safe_splice (vec_defs[0]);
7290 vec_defs[0].release ();
7291 vec_oprnds1.safe_splice (vec_defs[1]);
7292 vec_defs[1].release ();
7293 if (op_type == ternary_op)
7295 vec_oprnds2.safe_splice (vec_defs[2]);
7296 vec_defs[2].release ();
7299 else
7301 vec_oprnds0.quick_push
7302 (vect_get_vec_def_for_operand (ops[0], stmt_info));
7303 vec_oprnds1.quick_push
7304 (vect_get_vec_def_for_operand (ops[1], stmt_info));
7305 if (op_type == ternary_op)
7306 vec_oprnds2.quick_push
7307 (vect_get_vec_def_for_operand (ops[2], stmt_info));
7310 else
7312 if (!slp_node)
7314 gcc_assert (reduc_index != -1 || ! single_defuse_cycle);
7316 if (single_defuse_cycle && reduc_index == 0)
7317 vec_oprnds0[0] = gimple_get_lhs (new_stmt_info->stmt);
7318 else
7319 vec_oprnds0[0]
7320 = vect_get_vec_def_for_stmt_copy (loop_vinfo,
7321 vec_oprnds0[0]);
7322 if (single_defuse_cycle && reduc_index == 1)
7323 vec_oprnds1[0] = gimple_get_lhs (new_stmt_info->stmt);
7324 else
7325 vec_oprnds1[0]
7326 = vect_get_vec_def_for_stmt_copy (loop_vinfo,
7327 vec_oprnds1[0]);
7328 if (op_type == ternary_op)
7330 if (single_defuse_cycle && reduc_index == 2)
7331 vec_oprnds2[0] = gimple_get_lhs (new_stmt_info->stmt);
7332 else
7333 vec_oprnds2[0]
7334 = vect_get_vec_def_for_stmt_copy (loop_vinfo,
7335 vec_oprnds2[0]);
7340 FOR_EACH_VEC_ELT (vec_oprnds0, i, def0)
7342 tree vop[3] = { def0, vec_oprnds1[i], NULL_TREE };
7343 if (masked_loop_p && !mask_by_cond_expr)
7345 /* Make sure that the reduction accumulator is vop[0]. */
7346 if (reduc_index == 1)
7348 gcc_assert (commutative_tree_code (code));
7349 std::swap (vop[0], vop[1]);
7351 tree mask = vect_get_loop_mask (gsi, masks, vec_num * ncopies,
7352 vectype_in, i * ncopies + j);
7353 gcall *call = gimple_build_call_internal (cond_fn, 4, mask,
7354 vop[0], vop[1],
7355 vop[0]);
7356 new_temp = make_ssa_name (vec_dest, call);
7357 gimple_call_set_lhs (call, new_temp);
7358 gimple_call_set_nothrow (call, true);
7359 new_stmt_info
7360 = vect_finish_stmt_generation (stmt_info, call, gsi);
7362 else
7364 if (op_type == ternary_op)
7365 vop[2] = vec_oprnds2[i];
7367 if (masked_loop_p && mask_by_cond_expr)
7369 tree mask = vect_get_loop_mask (gsi, masks,
7370 vec_num * ncopies,
7371 vectype_in, i * ncopies + j);
7372 build_vect_cond_expr (code, vop, mask, gsi);
7375 gassign *new_stmt = gimple_build_assign (vec_dest, code,
7376 vop[0], vop[1], vop[2]);
7377 new_temp = make_ssa_name (vec_dest, new_stmt);
7378 gimple_assign_set_lhs (new_stmt, new_temp);
7379 new_stmt_info
7380 = vect_finish_stmt_generation (stmt_info, new_stmt, gsi);
7383 if (slp_node)
7385 SLP_TREE_VEC_STMTS (slp_node).quick_push (new_stmt_info);
7386 vect_defs.quick_push (new_temp);
7388 else
7389 vect_defs[0] = new_temp;
7392 if (slp_node)
7393 continue;
7395 if (j == 0)
7396 STMT_VINFO_VEC_STMT (stmt_info) = *vec_stmt = new_stmt_info;
7397 else
7398 STMT_VINFO_RELATED_STMT (prev_stmt_info) = new_stmt_info;
7400 prev_stmt_info = new_stmt_info;
7403 /* Finalize the reduction-phi (set its arguments) and create the
7404 epilog reduction code. */
7405 if ((!single_defuse_cycle || code == COND_EXPR) && !slp_node)
7406 vect_defs[0] = gimple_get_lhs ((*vec_stmt)->stmt);
7408 vect_create_epilog_for_reduction (vect_defs, stmt_info, reduc_def_phi,
7409 epilog_copies, reduc_fn, phis,
7410 double_reduc, slp_node, slp_node_instance,
7411 cond_reduc_val, cond_reduc_op_code,
7412 neutral_op);
7414 return true;
7417 /* Function vect_min_worthwhile_factor.
7419 For a loop where we could vectorize the operation indicated by CODE,
7420 return the minimum vectorization factor that makes it worthwhile
7421 to use generic vectors. */
7422 static unsigned int
7423 vect_min_worthwhile_factor (enum tree_code code)
7425 switch (code)
7427 case PLUS_EXPR:
7428 case MINUS_EXPR:
7429 case NEGATE_EXPR:
7430 return 4;
7432 case BIT_AND_EXPR:
7433 case BIT_IOR_EXPR:
7434 case BIT_XOR_EXPR:
7435 case BIT_NOT_EXPR:
7436 return 2;
7438 default:
7439 return INT_MAX;
7443 /* Return true if VINFO indicates we are doing loop vectorization and if
7444 it is worth decomposing CODE operations into scalar operations for
7445 that loop's vectorization factor. */
7447 bool
7448 vect_worthwhile_without_simd_p (vec_info *vinfo, tree_code code)
7450 loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo);
7451 unsigned HOST_WIDE_INT value;
7452 return (loop_vinfo
7453 && LOOP_VINFO_VECT_FACTOR (loop_vinfo).is_constant (&value)
7454 && value >= vect_min_worthwhile_factor (code));
7457 /* Function vectorizable_induction
7459 Check if STMT_INFO performs an induction computation that can be vectorized.
7460 If VEC_STMT is also passed, vectorize the induction PHI: create a vectorized
7461 phi to replace it, put it in VEC_STMT, and add it to the same basic block.
7462 Return true if STMT_INFO is vectorizable in this way. */
7464 bool
7465 vectorizable_induction (stmt_vec_info stmt_info,
7466 gimple_stmt_iterator *gsi ATTRIBUTE_UNUSED,
7467 stmt_vec_info *vec_stmt, slp_tree slp_node,
7468 stmt_vector_for_cost *cost_vec)
7470 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
7471 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
7472 unsigned ncopies;
7473 bool nested_in_vect_loop = false;
7474 struct loop *iv_loop;
7475 tree vec_def;
7476 edge pe = loop_preheader_edge (loop);
7477 basic_block new_bb;
7478 tree new_vec, vec_init, vec_step, t;
7479 tree new_name;
7480 gimple *new_stmt;
7481 gphi *induction_phi;
7482 tree induc_def, vec_dest;
7483 tree init_expr, step_expr;
7484 poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
7485 unsigned i;
7486 tree expr;
7487 gimple_seq stmts;
7488 imm_use_iterator imm_iter;
7489 use_operand_p use_p;
7490 gimple *exit_phi;
7491 edge latch_e;
7492 tree loop_arg;
7493 gimple_stmt_iterator si;
7495 gphi *phi = dyn_cast <gphi *> (stmt_info->stmt);
7496 if (!phi)
7497 return false;
7499 if (!STMT_VINFO_RELEVANT_P (stmt_info))
7500 return false;
7502 /* Make sure it was recognized as induction computation. */
7503 if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_induction_def)
7504 return false;
7506 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
7507 poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (vectype);
7509 if (slp_node)
7510 ncopies = 1;
7511 else
7512 ncopies = vect_get_num_copies (loop_vinfo, vectype);
7513 gcc_assert (ncopies >= 1);
7515 /* FORNOW. These restrictions should be relaxed. */
7516 if (nested_in_vect_loop_p (loop, stmt_info))
7518 imm_use_iterator imm_iter;
7519 use_operand_p use_p;
7520 gimple *exit_phi;
7521 edge latch_e;
7522 tree loop_arg;
7524 if (ncopies > 1)
7526 if (dump_enabled_p ())
7527 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7528 "multiple types in nested loop.\n");
7529 return false;
7532 /* FORNOW: outer loop induction with SLP not supported. */
7533 if (STMT_SLP_TYPE (stmt_info))
7534 return false;
7536 exit_phi = NULL;
7537 latch_e = loop_latch_edge (loop->inner);
7538 loop_arg = PHI_ARG_DEF_FROM_EDGE (phi, latch_e);
7539 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, loop_arg)
7541 gimple *use_stmt = USE_STMT (use_p);
7542 if (is_gimple_debug (use_stmt))
7543 continue;
7545 if (!flow_bb_inside_loop_p (loop->inner, gimple_bb (use_stmt)))
7547 exit_phi = use_stmt;
7548 break;
7551 if (exit_phi)
7553 stmt_vec_info exit_phi_vinfo = loop_vinfo->lookup_stmt (exit_phi);
7554 if (!(STMT_VINFO_RELEVANT_P (exit_phi_vinfo)
7555 && !STMT_VINFO_LIVE_P (exit_phi_vinfo)))
7557 if (dump_enabled_p ())
7558 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7559 "inner-loop induction only used outside "
7560 "of the outer vectorized loop.\n");
7561 return false;
7565 nested_in_vect_loop = true;
7566 iv_loop = loop->inner;
7568 else
7569 iv_loop = loop;
7570 gcc_assert (iv_loop == (gimple_bb (phi))->loop_father);
7572 if (slp_node && !nunits.is_constant ())
7574 /* The current SLP code creates the initial value element-by-element. */
7575 if (dump_enabled_p ())
7576 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7577 "SLP induction not supported for variable-length"
7578 " vectors.\n");
7579 return false;
7582 if (!vec_stmt) /* transformation not required. */
7584 STMT_VINFO_TYPE (stmt_info) = induc_vec_info_type;
7585 DUMP_VECT_SCOPE ("vectorizable_induction");
7586 vect_model_induction_cost (stmt_info, ncopies, cost_vec);
7587 return true;
7590 /* Transform. */
7592 /* Compute a vector variable, initialized with the first VF values of
7593 the induction variable. E.g., for an iv with IV_PHI='X' and
7594 evolution S, for a vector of 4 units, we want to compute:
7595 [X, X + S, X + 2*S, X + 3*S]. */
7597 if (dump_enabled_p ())
7598 dump_printf_loc (MSG_NOTE, vect_location, "transform induction phi.\n");
7600 latch_e = loop_latch_edge (iv_loop);
7601 loop_arg = PHI_ARG_DEF_FROM_EDGE (phi, latch_e);
7603 step_expr = STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_info);
7604 gcc_assert (step_expr != NULL_TREE);
7606 pe = loop_preheader_edge (iv_loop);
7607 init_expr = PHI_ARG_DEF_FROM_EDGE (phi,
7608 loop_preheader_edge (iv_loop));
7610 stmts = NULL;
7611 if (!nested_in_vect_loop)
7613 /* Convert the initial value to the desired type. */
7614 tree new_type = TREE_TYPE (vectype);
7615 init_expr = gimple_convert (&stmts, new_type, init_expr);
7617 /* If we are using the loop mask to "peel" for alignment then we need
7618 to adjust the start value here. */
7619 tree skip_niters = LOOP_VINFO_MASK_SKIP_NITERS (loop_vinfo);
7620 if (skip_niters != NULL_TREE)
7622 if (FLOAT_TYPE_P (vectype))
7623 skip_niters = gimple_build (&stmts, FLOAT_EXPR, new_type,
7624 skip_niters);
7625 else
7626 skip_niters = gimple_convert (&stmts, new_type, skip_niters);
7627 tree skip_step = gimple_build (&stmts, MULT_EXPR, new_type,
7628 skip_niters, step_expr);
7629 init_expr = gimple_build (&stmts, MINUS_EXPR, new_type,
7630 init_expr, skip_step);
7634 /* Convert the step to the desired type. */
7635 step_expr = gimple_convert (&stmts, TREE_TYPE (vectype), step_expr);
7637 if (stmts)
7639 new_bb = gsi_insert_seq_on_edge_immediate (pe, stmts);
7640 gcc_assert (!new_bb);
7643 /* Find the first insertion point in the BB. */
7644 basic_block bb = gimple_bb (phi);
7645 si = gsi_after_labels (bb);
7647 /* For SLP induction we have to generate several IVs as for example
7648 with group size 3 we need [i, i, i, i + S] [i + S, i + S, i + 2*S, i + 2*S]
7649 [i + 2*S, i + 3*S, i + 3*S, i + 3*S]. The step is the same uniform
7650 [VF*S, VF*S, VF*S, VF*S] for all. */
7651 if (slp_node)
7653 /* Enforced above. */
7654 unsigned int const_nunits = nunits.to_constant ();
7656 /* Generate [VF*S, VF*S, ... ]. */
7657 if (SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr)))
7659 expr = build_int_cst (integer_type_node, vf);
7660 expr = fold_convert (TREE_TYPE (step_expr), expr);
7662 else
7663 expr = build_int_cst (TREE_TYPE (step_expr), vf);
7664 new_name = fold_build2 (MULT_EXPR, TREE_TYPE (step_expr),
7665 expr, step_expr);
7666 if (! CONSTANT_CLASS_P (new_name))
7667 new_name = vect_init_vector (stmt_info, new_name,
7668 TREE_TYPE (step_expr), NULL);
7669 new_vec = build_vector_from_val (vectype, new_name);
7670 vec_step = vect_init_vector (stmt_info, new_vec, vectype, NULL);
7672 /* Now generate the IVs. */
7673 unsigned group_size = SLP_TREE_SCALAR_STMTS (slp_node).length ();
7674 unsigned nvects = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node);
7675 unsigned elts = const_nunits * nvects;
7676 unsigned nivs = least_common_multiple (group_size,
7677 const_nunits) / const_nunits;
7678 gcc_assert (elts % group_size == 0);
7679 tree elt = init_expr;
7680 unsigned ivn;
7681 for (ivn = 0; ivn < nivs; ++ivn)
7683 tree_vector_builder elts (vectype, const_nunits, 1);
7684 stmts = NULL;
7685 for (unsigned eltn = 0; eltn < const_nunits; ++eltn)
7687 if (ivn*const_nunits + eltn >= group_size
7688 && (ivn * const_nunits + eltn) % group_size == 0)
7689 elt = gimple_build (&stmts, PLUS_EXPR, TREE_TYPE (elt),
7690 elt, step_expr);
7691 elts.quick_push (elt);
7693 vec_init = gimple_build_vector (&stmts, &elts);
7694 if (stmts)
7696 new_bb = gsi_insert_seq_on_edge_immediate (pe, stmts);
7697 gcc_assert (!new_bb);
7700 /* Create the induction-phi that defines the induction-operand. */
7701 vec_dest = vect_get_new_vect_var (vectype, vect_simple_var, "vec_iv_");
7702 induction_phi = create_phi_node (vec_dest, iv_loop->header);
7703 stmt_vec_info induction_phi_info
7704 = loop_vinfo->add_stmt (induction_phi);
7705 induc_def = PHI_RESULT (induction_phi);
7707 /* Create the iv update inside the loop */
7708 vec_def = make_ssa_name (vec_dest);
7709 new_stmt = gimple_build_assign (vec_def, PLUS_EXPR, induc_def, vec_step);
7710 gsi_insert_before (&si, new_stmt, GSI_SAME_STMT);
7711 loop_vinfo->add_stmt (new_stmt);
7713 /* Set the arguments of the phi node: */
7714 add_phi_arg (induction_phi, vec_init, pe, UNKNOWN_LOCATION);
7715 add_phi_arg (induction_phi, vec_def, loop_latch_edge (iv_loop),
7716 UNKNOWN_LOCATION);
7718 SLP_TREE_VEC_STMTS (slp_node).quick_push (induction_phi_info);
7721 /* Re-use IVs when we can. */
7722 if (ivn < nvects)
7724 unsigned vfp
7725 = least_common_multiple (group_size, const_nunits) / group_size;
7726 /* Generate [VF'*S, VF'*S, ... ]. */
7727 if (SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr)))
7729 expr = build_int_cst (integer_type_node, vfp);
7730 expr = fold_convert (TREE_TYPE (step_expr), expr);
7732 else
7733 expr = build_int_cst (TREE_TYPE (step_expr), vfp);
7734 new_name = fold_build2 (MULT_EXPR, TREE_TYPE (step_expr),
7735 expr, step_expr);
7736 if (! CONSTANT_CLASS_P (new_name))
7737 new_name = vect_init_vector (stmt_info, new_name,
7738 TREE_TYPE (step_expr), NULL);
7739 new_vec = build_vector_from_val (vectype, new_name);
7740 vec_step = vect_init_vector (stmt_info, new_vec, vectype, NULL);
7741 for (; ivn < nvects; ++ivn)
7743 gimple *iv = SLP_TREE_VEC_STMTS (slp_node)[ivn - nivs]->stmt;
7744 tree def;
7745 if (gimple_code (iv) == GIMPLE_PHI)
7746 def = gimple_phi_result (iv);
7747 else
7748 def = gimple_assign_lhs (iv);
7749 new_stmt = gimple_build_assign (make_ssa_name (vectype),
7750 PLUS_EXPR,
7751 def, vec_step);
7752 if (gimple_code (iv) == GIMPLE_PHI)
7753 gsi_insert_before (&si, new_stmt, GSI_SAME_STMT);
7754 else
7756 gimple_stmt_iterator tgsi = gsi_for_stmt (iv);
7757 gsi_insert_after (&tgsi, new_stmt, GSI_CONTINUE_LINKING);
7759 SLP_TREE_VEC_STMTS (slp_node).quick_push
7760 (loop_vinfo->add_stmt (new_stmt));
7764 return true;
7767 /* Create the vector that holds the initial_value of the induction. */
7768 if (nested_in_vect_loop)
7770 /* iv_loop is nested in the loop to be vectorized. init_expr had already
7771 been created during vectorization of previous stmts. We obtain it
7772 from the STMT_VINFO_VEC_STMT of the defining stmt. */
7773 vec_init = vect_get_vec_def_for_operand (init_expr, stmt_info);
7774 /* If the initial value is not of proper type, convert it. */
7775 if (!useless_type_conversion_p (vectype, TREE_TYPE (vec_init)))
7777 new_stmt
7778 = gimple_build_assign (vect_get_new_ssa_name (vectype,
7779 vect_simple_var,
7780 "vec_iv_"),
7781 VIEW_CONVERT_EXPR,
7782 build1 (VIEW_CONVERT_EXPR, vectype,
7783 vec_init));
7784 vec_init = gimple_assign_lhs (new_stmt);
7785 new_bb = gsi_insert_on_edge_immediate (loop_preheader_edge (iv_loop),
7786 new_stmt);
7787 gcc_assert (!new_bb);
7788 loop_vinfo->add_stmt (new_stmt);
7791 else
7793 /* iv_loop is the loop to be vectorized. Create:
7794 vec_init = [X, X+S, X+2*S, X+3*S] (S = step_expr, X = init_expr) */
7795 stmts = NULL;
7796 new_name = gimple_convert (&stmts, TREE_TYPE (vectype), init_expr);
7798 unsigned HOST_WIDE_INT const_nunits;
7799 if (nunits.is_constant (&const_nunits))
7801 tree_vector_builder elts (vectype, const_nunits, 1);
7802 elts.quick_push (new_name);
7803 for (i = 1; i < const_nunits; i++)
7805 /* Create: new_name_i = new_name + step_expr */
7806 new_name = gimple_build (&stmts, PLUS_EXPR, TREE_TYPE (new_name),
7807 new_name, step_expr);
7808 elts.quick_push (new_name);
7810 /* Create a vector from [new_name_0, new_name_1, ...,
7811 new_name_nunits-1] */
7812 vec_init = gimple_build_vector (&stmts, &elts);
7814 else if (INTEGRAL_TYPE_P (TREE_TYPE (step_expr)))
7815 /* Build the initial value directly from a VEC_SERIES_EXPR. */
7816 vec_init = gimple_build (&stmts, VEC_SERIES_EXPR, vectype,
7817 new_name, step_expr);
7818 else
7820 /* Build:
7821 [base, base, base, ...]
7822 + (vectype) [0, 1, 2, ...] * [step, step, step, ...]. */
7823 gcc_assert (SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr)));
7824 gcc_assert (flag_associative_math);
7825 tree index = build_index_vector (vectype, 0, 1);
7826 tree base_vec = gimple_build_vector_from_val (&stmts, vectype,
7827 new_name);
7828 tree step_vec = gimple_build_vector_from_val (&stmts, vectype,
7829 step_expr);
7830 vec_init = gimple_build (&stmts, FLOAT_EXPR, vectype, index);
7831 vec_init = gimple_build (&stmts, MULT_EXPR, vectype,
7832 vec_init, step_vec);
7833 vec_init = gimple_build (&stmts, PLUS_EXPR, vectype,
7834 vec_init, base_vec);
7837 if (stmts)
7839 new_bb = gsi_insert_seq_on_edge_immediate (pe, stmts);
7840 gcc_assert (!new_bb);
7845 /* Create the vector that holds the step of the induction. */
7846 if (nested_in_vect_loop)
7847 /* iv_loop is nested in the loop to be vectorized. Generate:
7848 vec_step = [S, S, S, S] */
7849 new_name = step_expr;
7850 else
7852 /* iv_loop is the loop to be vectorized. Generate:
7853 vec_step = [VF*S, VF*S, VF*S, VF*S] */
7854 gimple_seq seq = NULL;
7855 if (SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr)))
7857 expr = build_int_cst (integer_type_node, vf);
7858 expr = gimple_build (&seq, FLOAT_EXPR, TREE_TYPE (step_expr), expr);
7860 else
7861 expr = build_int_cst (TREE_TYPE (step_expr), vf);
7862 new_name = gimple_build (&seq, MULT_EXPR, TREE_TYPE (step_expr),
7863 expr, step_expr);
7864 if (seq)
7866 new_bb = gsi_insert_seq_on_edge_immediate (pe, seq);
7867 gcc_assert (!new_bb);
7871 t = unshare_expr (new_name);
7872 gcc_assert (CONSTANT_CLASS_P (new_name)
7873 || TREE_CODE (new_name) == SSA_NAME);
7874 new_vec = build_vector_from_val (vectype, t);
7875 vec_step = vect_init_vector (stmt_info, new_vec, vectype, NULL);
7878 /* Create the following def-use cycle:
7879 loop prolog:
7880 vec_init = ...
7881 vec_step = ...
7882 loop:
7883 vec_iv = PHI <vec_init, vec_loop>
7885 STMT
7887 vec_loop = vec_iv + vec_step; */
7889 /* Create the induction-phi that defines the induction-operand. */
7890 vec_dest = vect_get_new_vect_var (vectype, vect_simple_var, "vec_iv_");
7891 induction_phi = create_phi_node (vec_dest, iv_loop->header);
7892 stmt_vec_info induction_phi_info = loop_vinfo->add_stmt (induction_phi);
7893 induc_def = PHI_RESULT (induction_phi);
7895 /* Create the iv update inside the loop */
7896 vec_def = make_ssa_name (vec_dest);
7897 new_stmt = gimple_build_assign (vec_def, PLUS_EXPR, induc_def, vec_step);
7898 gsi_insert_before (&si, new_stmt, GSI_SAME_STMT);
7899 stmt_vec_info new_stmt_info = loop_vinfo->add_stmt (new_stmt);
7901 /* Set the arguments of the phi node: */
7902 add_phi_arg (induction_phi, vec_init, pe, UNKNOWN_LOCATION);
7903 add_phi_arg (induction_phi, vec_def, loop_latch_edge (iv_loop),
7904 UNKNOWN_LOCATION);
7906 STMT_VINFO_VEC_STMT (stmt_info) = *vec_stmt = induction_phi_info;
7908 /* In case that vectorization factor (VF) is bigger than the number
7909 of elements that we can fit in a vectype (nunits), we have to generate
7910 more than one vector stmt - i.e - we need to "unroll" the
7911 vector stmt by a factor VF/nunits. For more details see documentation
7912 in vectorizable_operation. */
7914 if (ncopies > 1)
7916 gimple_seq seq = NULL;
7917 stmt_vec_info prev_stmt_vinfo;
7918 /* FORNOW. This restriction should be relaxed. */
7919 gcc_assert (!nested_in_vect_loop);
7921 /* Create the vector that holds the step of the induction. */
7922 if (SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr)))
7924 expr = build_int_cst (integer_type_node, nunits);
7925 expr = gimple_build (&seq, FLOAT_EXPR, TREE_TYPE (step_expr), expr);
7927 else
7928 expr = build_int_cst (TREE_TYPE (step_expr), nunits);
7929 new_name = gimple_build (&seq, MULT_EXPR, TREE_TYPE (step_expr),
7930 expr, step_expr);
7931 if (seq)
7933 new_bb = gsi_insert_seq_on_edge_immediate (pe, seq);
7934 gcc_assert (!new_bb);
7937 t = unshare_expr (new_name);
7938 gcc_assert (CONSTANT_CLASS_P (new_name)
7939 || TREE_CODE (new_name) == SSA_NAME);
7940 new_vec = build_vector_from_val (vectype, t);
7941 vec_step = vect_init_vector (stmt_info, new_vec, vectype, NULL);
7943 vec_def = induc_def;
7944 prev_stmt_vinfo = induction_phi_info;
7945 for (i = 1; i < ncopies; i++)
7947 /* vec_i = vec_prev + vec_step */
7948 new_stmt = gimple_build_assign (vec_dest, PLUS_EXPR,
7949 vec_def, vec_step);
7950 vec_def = make_ssa_name (vec_dest, new_stmt);
7951 gimple_assign_set_lhs (new_stmt, vec_def);
7953 gsi_insert_before (&si, new_stmt, GSI_SAME_STMT);
7954 new_stmt_info = loop_vinfo->add_stmt (new_stmt);
7955 STMT_VINFO_RELATED_STMT (prev_stmt_vinfo) = new_stmt_info;
7956 prev_stmt_vinfo = new_stmt_info;
7960 if (nested_in_vect_loop)
7962 /* Find the loop-closed exit-phi of the induction, and record
7963 the final vector of induction results: */
7964 exit_phi = NULL;
7965 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, loop_arg)
7967 gimple *use_stmt = USE_STMT (use_p);
7968 if (is_gimple_debug (use_stmt))
7969 continue;
7971 if (!flow_bb_inside_loop_p (iv_loop, gimple_bb (use_stmt)))
7973 exit_phi = use_stmt;
7974 break;
7977 if (exit_phi)
7979 stmt_vec_info stmt_vinfo = loop_vinfo->lookup_stmt (exit_phi);
7980 /* FORNOW. Currently not supporting the case that an inner-loop induction
7981 is not used in the outer-loop (i.e. only outside the outer-loop). */
7982 gcc_assert (STMT_VINFO_RELEVANT_P (stmt_vinfo)
7983 && !STMT_VINFO_LIVE_P (stmt_vinfo));
7985 STMT_VINFO_VEC_STMT (stmt_vinfo) = new_stmt_info;
7986 if (dump_enabled_p ())
7987 dump_printf_loc (MSG_NOTE, vect_location,
7988 "vector of inductions after inner-loop:%G",
7989 new_stmt);
7994 if (dump_enabled_p ())
7995 dump_printf_loc (MSG_NOTE, vect_location,
7996 "transform induction: created def-use cycle: %G%G",
7997 induction_phi, SSA_NAME_DEF_STMT (vec_def));
7999 return true;
8002 /* Function vectorizable_live_operation.
8004 STMT_INFO computes a value that is used outside the loop. Check if
8005 it can be supported. */
8007 bool
8008 vectorizable_live_operation (stmt_vec_info stmt_info,
8009 gimple_stmt_iterator *gsi ATTRIBUTE_UNUSED,
8010 slp_tree slp_node, int slp_index,
8011 stmt_vec_info *vec_stmt,
8012 stmt_vector_for_cost *)
8014 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
8015 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
8016 imm_use_iterator imm_iter;
8017 tree lhs, lhs_type, bitsize, vec_bitsize;
8018 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
8019 poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (vectype);
8020 int ncopies;
8021 gimple *use_stmt;
8022 auto_vec<tree> vec_oprnds;
8023 int vec_entry = 0;
8024 poly_uint64 vec_index = 0;
8026 gcc_assert (STMT_VINFO_LIVE_P (stmt_info));
8028 if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_reduction_def)
8029 return false;
8031 /* FORNOW. CHECKME. */
8032 if (nested_in_vect_loop_p (loop, stmt_info))
8033 return false;
8035 /* If STMT is not relevant and it is a simple assignment and its inputs are
8036 invariant then it can remain in place, unvectorized. The original last
8037 scalar value that it computes will be used. */
8038 if (!STMT_VINFO_RELEVANT_P (stmt_info))
8040 gcc_assert (is_simple_and_all_uses_invariant (stmt_info, loop_vinfo));
8041 if (dump_enabled_p ())
8042 dump_printf_loc (MSG_NOTE, vect_location,
8043 "statement is simple and uses invariant. Leaving in "
8044 "place.\n");
8045 return true;
8048 if (slp_node)
8049 ncopies = 1;
8050 else
8051 ncopies = vect_get_num_copies (loop_vinfo, vectype);
8053 if (slp_node)
8055 gcc_assert (slp_index >= 0);
8057 int num_scalar = SLP_TREE_SCALAR_STMTS (slp_node).length ();
8058 int num_vec = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node);
8060 /* Get the last occurrence of the scalar index from the concatenation of
8061 all the slp vectors. Calculate which slp vector it is and the index
8062 within. */
8063 poly_uint64 pos = (num_vec * nunits) - num_scalar + slp_index;
8065 /* Calculate which vector contains the result, and which lane of
8066 that vector we need. */
8067 if (!can_div_trunc_p (pos, nunits, &vec_entry, &vec_index))
8069 if (dump_enabled_p ())
8070 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
8071 "Cannot determine which vector holds the"
8072 " final result.\n");
8073 return false;
8077 if (!vec_stmt)
8079 /* No transformation required. */
8080 if (LOOP_VINFO_CAN_FULLY_MASK_P (loop_vinfo))
8082 if (!direct_internal_fn_supported_p (IFN_EXTRACT_LAST, vectype,
8083 OPTIMIZE_FOR_SPEED))
8085 if (dump_enabled_p ())
8086 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
8087 "can't use a fully-masked loop because "
8088 "the target doesn't support extract last "
8089 "reduction.\n");
8090 LOOP_VINFO_CAN_FULLY_MASK_P (loop_vinfo) = false;
8092 else if (slp_node)
8094 if (dump_enabled_p ())
8095 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
8096 "can't use a fully-masked loop because an "
8097 "SLP statement is live after the loop.\n");
8098 LOOP_VINFO_CAN_FULLY_MASK_P (loop_vinfo) = false;
8100 else if (ncopies > 1)
8102 if (dump_enabled_p ())
8103 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
8104 "can't use a fully-masked loop because"
8105 " ncopies is greater than 1.\n");
8106 LOOP_VINFO_CAN_FULLY_MASK_P (loop_vinfo) = false;
8108 else
8110 gcc_assert (ncopies == 1 && !slp_node);
8111 vect_record_loop_mask (loop_vinfo,
8112 &LOOP_VINFO_MASKS (loop_vinfo),
8113 1, vectype);
8116 return true;
8119 /* Use the lhs of the original scalar statement. */
8120 gimple *stmt = vect_orig_stmt (stmt_info)->stmt;
8122 lhs = (is_a <gphi *> (stmt)) ? gimple_phi_result (stmt)
8123 : gimple_get_lhs (stmt);
8124 lhs_type = TREE_TYPE (lhs);
8126 bitsize = (VECTOR_BOOLEAN_TYPE_P (vectype)
8127 ? bitsize_int (TYPE_PRECISION (TREE_TYPE (vectype)))
8128 : TYPE_SIZE (TREE_TYPE (vectype)));
8129 vec_bitsize = TYPE_SIZE (vectype);
8131 /* Get the vectorized lhs of STMT and the lane to use (counted in bits). */
8132 tree vec_lhs, bitstart;
8133 if (slp_node)
8135 gcc_assert (!LOOP_VINFO_FULLY_MASKED_P (loop_vinfo));
8137 /* Get the correct slp vectorized stmt. */
8138 gimple *vec_stmt = SLP_TREE_VEC_STMTS (slp_node)[vec_entry]->stmt;
8139 if (gphi *phi = dyn_cast <gphi *> (vec_stmt))
8140 vec_lhs = gimple_phi_result (phi);
8141 else
8142 vec_lhs = gimple_get_lhs (vec_stmt);
8144 /* Get entry to use. */
8145 bitstart = bitsize_int (vec_index);
8146 bitstart = int_const_binop (MULT_EXPR, bitsize, bitstart);
8148 else
8150 enum vect_def_type dt = STMT_VINFO_DEF_TYPE (stmt_info);
8151 vec_lhs = vect_get_vec_def_for_operand_1 (stmt_info, dt);
8152 gcc_checking_assert (ncopies == 1
8153 || !LOOP_VINFO_FULLY_MASKED_P (loop_vinfo));
8155 /* For multiple copies, get the last copy. */
8156 for (int i = 1; i < ncopies; ++i)
8157 vec_lhs = vect_get_vec_def_for_stmt_copy (loop_vinfo, vec_lhs);
8159 /* Get the last lane in the vector. */
8160 bitstart = int_const_binop (MINUS_EXPR, vec_bitsize, bitsize);
8163 gimple_seq stmts = NULL;
8164 tree new_tree;
8165 if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
8167 /* Emit:
8169 SCALAR_RES = EXTRACT_LAST <VEC_LHS, MASK>
8171 where VEC_LHS is the vectorized live-out result and MASK is
8172 the loop mask for the final iteration. */
8173 gcc_assert (ncopies == 1 && !slp_node);
8174 tree scalar_type = TREE_TYPE (STMT_VINFO_VECTYPE (stmt_info));
8175 tree mask = vect_get_loop_mask (gsi, &LOOP_VINFO_MASKS (loop_vinfo),
8176 1, vectype, 0);
8177 tree scalar_res = gimple_build (&stmts, CFN_EXTRACT_LAST,
8178 scalar_type, mask, vec_lhs);
8180 /* Convert the extracted vector element to the required scalar type. */
8181 new_tree = gimple_convert (&stmts, lhs_type, scalar_res);
8183 else
8185 tree bftype = TREE_TYPE (vectype);
8186 if (VECTOR_BOOLEAN_TYPE_P (vectype))
8187 bftype = build_nonstandard_integer_type (tree_to_uhwi (bitsize), 1);
8188 new_tree = build3 (BIT_FIELD_REF, bftype, vec_lhs, bitsize, bitstart);
8189 new_tree = force_gimple_operand (fold_convert (lhs_type, new_tree),
8190 &stmts, true, NULL_TREE);
8193 if (stmts)
8194 gsi_insert_seq_on_edge_immediate (single_exit (loop), stmts);
8196 /* Replace use of lhs with newly computed result. If the use stmt is a
8197 single arg PHI, just replace all uses of PHI result. It's necessary
8198 because lcssa PHI defining lhs may be before newly inserted stmt. */
8199 use_operand_p use_p;
8200 FOR_EACH_IMM_USE_STMT (use_stmt, imm_iter, lhs)
8201 if (!flow_bb_inside_loop_p (loop, gimple_bb (use_stmt))
8202 && !is_gimple_debug (use_stmt))
8204 if (gimple_code (use_stmt) == GIMPLE_PHI
8205 && gimple_phi_num_args (use_stmt) == 1)
8207 replace_uses_by (gimple_phi_result (use_stmt), new_tree);
8209 else
8211 FOR_EACH_IMM_USE_ON_STMT (use_p, imm_iter)
8212 SET_USE (use_p, new_tree);
8214 update_stmt (use_stmt);
8217 return true;
8220 /* Kill any debug uses outside LOOP of SSA names defined in STMT_INFO. */
8222 static void
8223 vect_loop_kill_debug_uses (struct loop *loop, stmt_vec_info stmt_info)
8225 ssa_op_iter op_iter;
8226 imm_use_iterator imm_iter;
8227 def_operand_p def_p;
8228 gimple *ustmt;
8230 FOR_EACH_PHI_OR_STMT_DEF (def_p, stmt_info->stmt, op_iter, SSA_OP_DEF)
8232 FOR_EACH_IMM_USE_STMT (ustmt, imm_iter, DEF_FROM_PTR (def_p))
8234 basic_block bb;
8236 if (!is_gimple_debug (ustmt))
8237 continue;
8239 bb = gimple_bb (ustmt);
8241 if (!flow_bb_inside_loop_p (loop, bb))
8243 if (gimple_debug_bind_p (ustmt))
8245 if (dump_enabled_p ())
8246 dump_printf_loc (MSG_NOTE, vect_location,
8247 "killing debug use\n");
8249 gimple_debug_bind_reset_value (ustmt);
8250 update_stmt (ustmt);
8252 else
8253 gcc_unreachable ();
8259 /* Given loop represented by LOOP_VINFO, return true if computation of
8260 LOOP_VINFO_NITERS (= LOOP_VINFO_NITERSM1 + 1) doesn't overflow, false
8261 otherwise. */
8263 static bool
8264 loop_niters_no_overflow (loop_vec_info loop_vinfo)
8266 /* Constant case. */
8267 if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
8269 tree cst_niters = LOOP_VINFO_NITERS (loop_vinfo);
8270 tree cst_nitersm1 = LOOP_VINFO_NITERSM1 (loop_vinfo);
8272 gcc_assert (TREE_CODE (cst_niters) == INTEGER_CST);
8273 gcc_assert (TREE_CODE (cst_nitersm1) == INTEGER_CST);
8274 if (wi::to_widest (cst_nitersm1) < wi::to_widest (cst_niters))
8275 return true;
8278 widest_int max;
8279 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
8280 /* Check the upper bound of loop niters. */
8281 if (get_max_loop_iterations (loop, &max))
8283 tree type = TREE_TYPE (LOOP_VINFO_NITERS (loop_vinfo));
8284 signop sgn = TYPE_SIGN (type);
8285 widest_int type_max = widest_int::from (wi::max_value (type), sgn);
8286 if (max < type_max)
8287 return true;
8289 return false;
8292 /* Return a mask type with half the number of elements as TYPE. */
8294 tree
8295 vect_halve_mask_nunits (tree type)
8297 poly_uint64 nunits = exact_div (TYPE_VECTOR_SUBPARTS (type), 2);
8298 return build_truth_vector_type (nunits, current_vector_size);
8301 /* Return a mask type with twice as many elements as TYPE. */
8303 tree
8304 vect_double_mask_nunits (tree type)
8306 poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (type) * 2;
8307 return build_truth_vector_type (nunits, current_vector_size);
8310 /* Record that a fully-masked version of LOOP_VINFO would need MASKS to
8311 contain a sequence of NVECTORS masks that each control a vector of type
8312 VECTYPE. */
8314 void
8315 vect_record_loop_mask (loop_vec_info loop_vinfo, vec_loop_masks *masks,
8316 unsigned int nvectors, tree vectype)
8318 gcc_assert (nvectors != 0);
8319 if (masks->length () < nvectors)
8320 masks->safe_grow_cleared (nvectors);
8321 rgroup_masks *rgm = &(*masks)[nvectors - 1];
8322 /* The number of scalars per iteration and the number of vectors are
8323 both compile-time constants. */
8324 unsigned int nscalars_per_iter
8325 = exact_div (nvectors * TYPE_VECTOR_SUBPARTS (vectype),
8326 LOOP_VINFO_VECT_FACTOR (loop_vinfo)).to_constant ();
8327 if (rgm->max_nscalars_per_iter < nscalars_per_iter)
8329 rgm->max_nscalars_per_iter = nscalars_per_iter;
8330 rgm->mask_type = build_same_sized_truth_vector_type (vectype);
8334 /* Given a complete set of masks MASKS, extract mask number INDEX
8335 for an rgroup that operates on NVECTORS vectors of type VECTYPE,
8336 where 0 <= INDEX < NVECTORS. Insert any set-up statements before GSI.
8338 See the comment above vec_loop_masks for more details about the mask
8339 arrangement. */
8341 tree
8342 vect_get_loop_mask (gimple_stmt_iterator *gsi, vec_loop_masks *masks,
8343 unsigned int nvectors, tree vectype, unsigned int index)
8345 rgroup_masks *rgm = &(*masks)[nvectors - 1];
8346 tree mask_type = rgm->mask_type;
8348 /* Populate the rgroup's mask array, if this is the first time we've
8349 used it. */
8350 if (rgm->masks.is_empty ())
8352 rgm->masks.safe_grow_cleared (nvectors);
8353 for (unsigned int i = 0; i < nvectors; ++i)
8355 tree mask = make_temp_ssa_name (mask_type, NULL, "loop_mask");
8356 /* Provide a dummy definition until the real one is available. */
8357 SSA_NAME_DEF_STMT (mask) = gimple_build_nop ();
8358 rgm->masks[i] = mask;
8362 tree mask = rgm->masks[index];
8363 if (maybe_ne (TYPE_VECTOR_SUBPARTS (mask_type),
8364 TYPE_VECTOR_SUBPARTS (vectype)))
8366 /* A loop mask for data type X can be reused for data type Y
8367 if X has N times more elements than Y and if Y's elements
8368 are N times bigger than X's. In this case each sequence
8369 of N elements in the loop mask will be all-zero or all-one.
8370 We can then view-convert the mask so that each sequence of
8371 N elements is replaced by a single element. */
8372 gcc_assert (multiple_p (TYPE_VECTOR_SUBPARTS (mask_type),
8373 TYPE_VECTOR_SUBPARTS (vectype)));
8374 gimple_seq seq = NULL;
8375 mask_type = build_same_sized_truth_vector_type (vectype);
8376 mask = gimple_build (&seq, VIEW_CONVERT_EXPR, mask_type, mask);
8377 if (seq)
8378 gsi_insert_seq_before (gsi, seq, GSI_SAME_STMT);
8380 return mask;
8383 /* Scale profiling counters by estimation for LOOP which is vectorized
8384 by factor VF. */
8386 static void
8387 scale_profile_for_vect_loop (struct loop *loop, unsigned vf)
8389 edge preheader = loop_preheader_edge (loop);
8390 /* Reduce loop iterations by the vectorization factor. */
8391 gcov_type new_est_niter = niter_for_unrolled_loop (loop, vf);
8392 profile_count freq_h = loop->header->count, freq_e = preheader->count ();
8394 if (freq_h.nonzero_p ())
8396 profile_probability p;
8398 /* Avoid dropping loop body profile counter to 0 because of zero count
8399 in loop's preheader. */
8400 if (!(freq_e == profile_count::zero ()))
8401 freq_e = freq_e.force_nonzero ();
8402 p = freq_e.apply_scale (new_est_niter + 1, 1).probability_in (freq_h);
8403 scale_loop_frequencies (loop, p);
8406 edge exit_e = single_exit (loop);
8407 exit_e->probability = profile_probability::always ()
8408 .apply_scale (1, new_est_niter + 1);
8410 edge exit_l = single_pred_edge (loop->latch);
8411 profile_probability prob = exit_l->probability;
8412 exit_l->probability = exit_e->probability.invert ();
8413 if (prob.initialized_p () && exit_l->probability.initialized_p ())
8414 scale_bbs_frequencies (&loop->latch, 1, exit_l->probability / prob);
8417 /* Vectorize STMT_INFO if relevant, inserting any new instructions before GSI.
8418 When vectorizing STMT_INFO as a store, set *SEEN_STORE to its
8419 stmt_vec_info. */
8421 static void
8422 vect_transform_loop_stmt (loop_vec_info loop_vinfo, stmt_vec_info stmt_info,
8423 gimple_stmt_iterator *gsi, stmt_vec_info *seen_store)
8425 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
8426 poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
8428 if (dump_enabled_p ())
8429 dump_printf_loc (MSG_NOTE, vect_location,
8430 "------>vectorizing statement: %G", stmt_info->stmt);
8432 if (MAY_HAVE_DEBUG_BIND_STMTS && !STMT_VINFO_LIVE_P (stmt_info))
8433 vect_loop_kill_debug_uses (loop, stmt_info);
8435 if (!STMT_VINFO_RELEVANT_P (stmt_info)
8436 && !STMT_VINFO_LIVE_P (stmt_info))
8437 return;
8439 if (STMT_VINFO_VECTYPE (stmt_info))
8441 poly_uint64 nunits
8442 = TYPE_VECTOR_SUBPARTS (STMT_VINFO_VECTYPE (stmt_info));
8443 if (!STMT_SLP_TYPE (stmt_info)
8444 && maybe_ne (nunits, vf)
8445 && dump_enabled_p ())
8446 /* For SLP VF is set according to unrolling factor, and not
8447 to vector size, hence for SLP this print is not valid. */
8448 dump_printf_loc (MSG_NOTE, vect_location, "multiple-types.\n");
8451 /* Pure SLP statements have already been vectorized. We still need
8452 to apply loop vectorization to hybrid SLP statements. */
8453 if (PURE_SLP_STMT (stmt_info))
8454 return;
8456 if (dump_enabled_p ())
8457 dump_printf_loc (MSG_NOTE, vect_location, "transform statement.\n");
8459 if (vect_transform_stmt (stmt_info, gsi, NULL, NULL))
8460 *seen_store = stmt_info;
8463 /* Function vect_transform_loop.
8465 The analysis phase has determined that the loop is vectorizable.
8466 Vectorize the loop - create vectorized stmts to replace the scalar
8467 stmts in the loop, and update the loop exit condition.
8468 Returns scalar epilogue loop if any. */
8470 struct loop *
8471 vect_transform_loop (loop_vec_info loop_vinfo)
8473 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
8474 struct loop *epilogue = NULL;
8475 basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
8476 int nbbs = loop->num_nodes;
8477 int i;
8478 tree niters_vector = NULL_TREE;
8479 tree step_vector = NULL_TREE;
8480 tree niters_vector_mult_vf = NULL_TREE;
8481 poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
8482 unsigned int lowest_vf = constant_lower_bound (vf);
8483 gimple *stmt;
8484 bool check_profitability = false;
8485 unsigned int th;
8487 DUMP_VECT_SCOPE ("vec_transform_loop");
8489 loop_vinfo->shared->check_datarefs ();
8491 /* Use the more conservative vectorization threshold. If the number
8492 of iterations is constant assume the cost check has been performed
8493 by our caller. If the threshold makes all loops profitable that
8494 run at least the (estimated) vectorization factor number of times
8495 checking is pointless, too. */
8496 th = LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo);
8497 if (th >= vect_vf_for_cost (loop_vinfo)
8498 && !LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
8500 if (dump_enabled_p ())
8501 dump_printf_loc (MSG_NOTE, vect_location,
8502 "Profitability threshold is %d loop iterations.\n",
8503 th);
8504 check_profitability = true;
8507 /* Make sure there exists a single-predecessor exit bb. Do this before
8508 versioning. */
8509 edge e = single_exit (loop);
8510 if (! single_pred_p (e->dest))
8512 split_loop_exit_edge (e, true);
8513 if (dump_enabled_p ())
8514 dump_printf (MSG_NOTE, "split exit edge\n");
8517 /* Version the loop first, if required, so the profitability check
8518 comes first. */
8520 if (LOOP_REQUIRES_VERSIONING (loop_vinfo))
8522 poly_uint64 versioning_threshold
8523 = LOOP_VINFO_VERSIONING_THRESHOLD (loop_vinfo);
8524 if (check_profitability
8525 && ordered_p (poly_uint64 (th), versioning_threshold))
8527 versioning_threshold = ordered_max (poly_uint64 (th),
8528 versioning_threshold);
8529 check_profitability = false;
8531 struct loop *sloop
8532 = vect_loop_versioning (loop_vinfo, th, check_profitability,
8533 versioning_threshold);
8534 sloop->force_vectorize = false;
8535 check_profitability = false;
8538 /* Make sure there exists a single-predecessor exit bb also on the
8539 scalar loop copy. Do this after versioning but before peeling
8540 so CFG structure is fine for both scalar and if-converted loop
8541 to make slpeel_duplicate_current_defs_from_edges face matched
8542 loop closed PHI nodes on the exit. */
8543 if (LOOP_VINFO_SCALAR_LOOP (loop_vinfo))
8545 e = single_exit (LOOP_VINFO_SCALAR_LOOP (loop_vinfo));
8546 if (! single_pred_p (e->dest))
8548 split_loop_exit_edge (e, true);
8549 if (dump_enabled_p ())
8550 dump_printf (MSG_NOTE, "split exit edge of scalar loop\n");
8554 tree niters = vect_build_loop_niters (loop_vinfo);
8555 LOOP_VINFO_NITERS_UNCHANGED (loop_vinfo) = niters;
8556 tree nitersm1 = unshare_expr (LOOP_VINFO_NITERSM1 (loop_vinfo));
8557 bool niters_no_overflow = loop_niters_no_overflow (loop_vinfo);
8558 epilogue = vect_do_peeling (loop_vinfo, niters, nitersm1, &niters_vector,
8559 &step_vector, &niters_vector_mult_vf, th,
8560 check_profitability, niters_no_overflow);
8561 if (LOOP_VINFO_SCALAR_LOOP (loop_vinfo)
8562 && LOOP_VINFO_SCALAR_LOOP_SCALING (loop_vinfo).initialized_p ())
8563 scale_loop_frequencies (LOOP_VINFO_SCALAR_LOOP (loop_vinfo),
8564 LOOP_VINFO_SCALAR_LOOP_SCALING (loop_vinfo));
8566 if (niters_vector == NULL_TREE)
8568 if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
8569 && !LOOP_VINFO_FULLY_MASKED_P (loop_vinfo)
8570 && known_eq (lowest_vf, vf))
8572 niters_vector
8573 = build_int_cst (TREE_TYPE (LOOP_VINFO_NITERS (loop_vinfo)),
8574 LOOP_VINFO_INT_NITERS (loop_vinfo) / lowest_vf);
8575 step_vector = build_one_cst (TREE_TYPE (niters));
8577 else
8578 vect_gen_vector_loop_niters (loop_vinfo, niters, &niters_vector,
8579 &step_vector, niters_no_overflow);
8582 /* 1) Make sure the loop header has exactly two entries
8583 2) Make sure we have a preheader basic block. */
8585 gcc_assert (EDGE_COUNT (loop->header->preds) == 2);
8587 split_edge (loop_preheader_edge (loop));
8589 if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo)
8590 && vect_use_loop_mask_for_alignment_p (loop_vinfo))
8591 /* This will deal with any possible peeling. */
8592 vect_prepare_for_masked_peels (loop_vinfo);
8594 /* Schedule the SLP instances first, then handle loop vectorization
8595 below. */
8596 if (!loop_vinfo->slp_instances.is_empty ())
8598 DUMP_VECT_SCOPE ("scheduling SLP instances");
8599 vect_schedule_slp (loop_vinfo);
8602 /* FORNOW: the vectorizer supports only loops which body consist
8603 of one basic block (header + empty latch). When the vectorizer will
8604 support more involved loop forms, the order by which the BBs are
8605 traversed need to be reconsidered. */
8607 for (i = 0; i < nbbs; i++)
8609 basic_block bb = bbs[i];
8610 stmt_vec_info stmt_info;
8612 for (gphi_iterator si = gsi_start_phis (bb); !gsi_end_p (si);
8613 gsi_next (&si))
8615 gphi *phi = si.phi ();
8616 if (dump_enabled_p ())
8617 dump_printf_loc (MSG_NOTE, vect_location,
8618 "------>vectorizing phi: %G", phi);
8619 stmt_info = loop_vinfo->lookup_stmt (phi);
8620 if (!stmt_info)
8621 continue;
8623 if (MAY_HAVE_DEBUG_BIND_STMTS && !STMT_VINFO_LIVE_P (stmt_info))
8624 vect_loop_kill_debug_uses (loop, stmt_info);
8626 if (!STMT_VINFO_RELEVANT_P (stmt_info)
8627 && !STMT_VINFO_LIVE_P (stmt_info))
8628 continue;
8630 if (STMT_VINFO_VECTYPE (stmt_info)
8631 && (maybe_ne
8632 (TYPE_VECTOR_SUBPARTS (STMT_VINFO_VECTYPE (stmt_info)), vf))
8633 && dump_enabled_p ())
8634 dump_printf_loc (MSG_NOTE, vect_location, "multiple-types.\n");
8636 if ((STMT_VINFO_DEF_TYPE (stmt_info) == vect_induction_def
8637 || STMT_VINFO_DEF_TYPE (stmt_info) == vect_reduction_def
8638 || STMT_VINFO_DEF_TYPE (stmt_info) == vect_nested_cycle)
8639 && ! PURE_SLP_STMT (stmt_info))
8641 if (dump_enabled_p ())
8642 dump_printf_loc (MSG_NOTE, vect_location, "transform phi.\n");
8643 vect_transform_stmt (stmt_info, NULL, NULL, NULL);
8647 for (gimple_stmt_iterator si = gsi_start_bb (bb);
8648 !gsi_end_p (si);)
8650 stmt = gsi_stmt (si);
8651 /* During vectorization remove existing clobber stmts. */
8652 if (gimple_clobber_p (stmt))
8654 unlink_stmt_vdef (stmt);
8655 gsi_remove (&si, true);
8656 release_defs (stmt);
8658 else
8660 stmt_info = loop_vinfo->lookup_stmt (stmt);
8662 /* vector stmts created in the outer-loop during vectorization of
8663 stmts in an inner-loop may not have a stmt_info, and do not
8664 need to be vectorized. */
8665 stmt_vec_info seen_store = NULL;
8666 if (stmt_info)
8668 if (STMT_VINFO_IN_PATTERN_P (stmt_info))
8670 gimple *def_seq = STMT_VINFO_PATTERN_DEF_SEQ (stmt_info);
8671 for (gimple_stmt_iterator subsi = gsi_start (def_seq);
8672 !gsi_end_p (subsi); gsi_next (&subsi))
8674 stmt_vec_info pat_stmt_info
8675 = loop_vinfo->lookup_stmt (gsi_stmt (subsi));
8676 vect_transform_loop_stmt (loop_vinfo, pat_stmt_info,
8677 &si, &seen_store);
8679 stmt_vec_info pat_stmt_info
8680 = STMT_VINFO_RELATED_STMT (stmt_info);
8681 vect_transform_loop_stmt (loop_vinfo, pat_stmt_info, &si,
8682 &seen_store);
8684 vect_transform_loop_stmt (loop_vinfo, stmt_info, &si,
8685 &seen_store);
8687 gsi_next (&si);
8688 if (seen_store)
8690 if (STMT_VINFO_GROUPED_ACCESS (seen_store))
8691 /* Interleaving. If IS_STORE is TRUE, the
8692 vectorization of the interleaving chain was
8693 completed - free all the stores in the chain. */
8694 vect_remove_stores (DR_GROUP_FIRST_ELEMENT (seen_store));
8695 else
8696 /* Free the attached stmt_vec_info and remove the stmt. */
8697 loop_vinfo->remove_stmt (stmt_info);
8702 /* Stub out scalar statements that must not survive vectorization.
8703 Doing this here helps with grouped statements, or statements that
8704 are involved in patterns. */
8705 for (gimple_stmt_iterator gsi = gsi_start_bb (bb);
8706 !gsi_end_p (gsi); gsi_next (&gsi))
8708 gcall *call = dyn_cast <gcall *> (gsi_stmt (gsi));
8709 if (call && gimple_call_internal_p (call, IFN_MASK_LOAD))
8711 tree lhs = gimple_get_lhs (call);
8712 if (!VECTOR_TYPE_P (TREE_TYPE (lhs)))
8714 tree zero = build_zero_cst (TREE_TYPE (lhs));
8715 gimple *new_stmt = gimple_build_assign (lhs, zero);
8716 gsi_replace (&gsi, new_stmt, true);
8720 } /* BBs in loop */
8722 /* The vectorization factor is always > 1, so if we use an IV increment of 1,
8723 a zero NITERS becomes a nonzero NITERS_VECTOR. */
8724 if (integer_onep (step_vector))
8725 niters_no_overflow = true;
8726 vect_set_loop_condition (loop, loop_vinfo, niters_vector, step_vector,
8727 niters_vector_mult_vf, !niters_no_overflow);
8729 unsigned int assumed_vf = vect_vf_for_cost (loop_vinfo);
8730 scale_profile_for_vect_loop (loop, assumed_vf);
8732 /* True if the final iteration might not handle a full vector's
8733 worth of scalar iterations. */
8734 bool final_iter_may_be_partial = LOOP_VINFO_FULLY_MASKED_P (loop_vinfo);
8735 /* The minimum number of iterations performed by the epilogue. This
8736 is 1 when peeling for gaps because we always need a final scalar
8737 iteration. */
8738 int min_epilogue_iters = LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo) ? 1 : 0;
8739 /* +1 to convert latch counts to loop iteration counts,
8740 -min_epilogue_iters to remove iterations that cannot be performed
8741 by the vector code. */
8742 int bias_for_lowest = 1 - min_epilogue_iters;
8743 int bias_for_assumed = bias_for_lowest;
8744 int alignment_npeels = LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo);
8745 if (alignment_npeels && LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
8747 /* When the amount of peeling is known at compile time, the first
8748 iteration will have exactly alignment_npeels active elements.
8749 In the worst case it will have at least one. */
8750 int min_first_active = (alignment_npeels > 0 ? alignment_npeels : 1);
8751 bias_for_lowest += lowest_vf - min_first_active;
8752 bias_for_assumed += assumed_vf - min_first_active;
8754 /* In these calculations the "- 1" converts loop iteration counts
8755 back to latch counts. */
8756 if (loop->any_upper_bound)
8757 loop->nb_iterations_upper_bound
8758 = (final_iter_may_be_partial
8759 ? wi::udiv_ceil (loop->nb_iterations_upper_bound + bias_for_lowest,
8760 lowest_vf) - 1
8761 : wi::udiv_floor (loop->nb_iterations_upper_bound + bias_for_lowest,
8762 lowest_vf) - 1);
8763 if (loop->any_likely_upper_bound)
8764 loop->nb_iterations_likely_upper_bound
8765 = (final_iter_may_be_partial
8766 ? wi::udiv_ceil (loop->nb_iterations_likely_upper_bound
8767 + bias_for_lowest, lowest_vf) - 1
8768 : wi::udiv_floor (loop->nb_iterations_likely_upper_bound
8769 + bias_for_lowest, lowest_vf) - 1);
8770 if (loop->any_estimate)
8771 loop->nb_iterations_estimate
8772 = (final_iter_may_be_partial
8773 ? wi::udiv_ceil (loop->nb_iterations_estimate + bias_for_assumed,
8774 assumed_vf) - 1
8775 : wi::udiv_floor (loop->nb_iterations_estimate + bias_for_assumed,
8776 assumed_vf) - 1);
8778 if (dump_enabled_p ())
8780 if (!LOOP_VINFO_EPILOGUE_P (loop_vinfo))
8782 dump_printf_loc (MSG_NOTE, vect_location,
8783 "LOOP VECTORIZED\n");
8784 if (loop->inner)
8785 dump_printf_loc (MSG_NOTE, vect_location,
8786 "OUTER LOOP VECTORIZED\n");
8787 dump_printf (MSG_NOTE, "\n");
8789 else
8791 dump_printf_loc (MSG_NOTE, vect_location,
8792 "LOOP EPILOGUE VECTORIZED (VS=");
8793 dump_dec (MSG_NOTE, current_vector_size);
8794 dump_printf (MSG_NOTE, ")\n");
8798 /* Loops vectorized with a variable factor won't benefit from
8799 unrolling/peeling. */
8800 if (!vf.is_constant ())
8802 loop->unroll = 1;
8803 if (dump_enabled_p ())
8804 dump_printf_loc (MSG_NOTE, vect_location, "Disabling unrolling due to"
8805 " variable-length vectorization factor\n");
8807 /* Free SLP instances here because otherwise stmt reference counting
8808 won't work. */
8809 slp_instance instance;
8810 FOR_EACH_VEC_ELT (LOOP_VINFO_SLP_INSTANCES (loop_vinfo), i, instance)
8811 vect_free_slp_instance (instance, true);
8812 LOOP_VINFO_SLP_INSTANCES (loop_vinfo).release ();
8813 /* Clear-up safelen field since its value is invalid after vectorization
8814 since vectorized loop can have loop-carried dependencies. */
8815 loop->safelen = 0;
8817 /* Don't vectorize epilogue for epilogue. */
8818 if (LOOP_VINFO_EPILOGUE_P (loop_vinfo))
8819 epilogue = NULL;
8821 if (!PARAM_VALUE (PARAM_VECT_EPILOGUES_NOMASK))
8822 epilogue = NULL;
8824 if (epilogue)
8826 auto_vector_sizes vector_sizes;
8827 targetm.vectorize.autovectorize_vector_sizes (&vector_sizes, false);
8828 unsigned int next_size = 0;
8830 /* Note LOOP_VINFO_NITERS_KNOWN_P and LOOP_VINFO_INT_NITERS work
8831 on niters already adjusted for the iterations of the prologue. */
8832 if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
8833 && known_eq (vf, lowest_vf))
8835 unsigned HOST_WIDE_INT eiters
8836 = (LOOP_VINFO_INT_NITERS (loop_vinfo)
8837 - LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo));
8838 eiters
8839 = eiters % lowest_vf + LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo);
8840 epilogue->nb_iterations_upper_bound = eiters - 1;
8841 epilogue->any_upper_bound = true;
8843 unsigned int ratio;
8844 while (next_size < vector_sizes.length ()
8845 && !(constant_multiple_p (current_vector_size,
8846 vector_sizes[next_size], &ratio)
8847 && eiters >= lowest_vf / ratio))
8848 next_size += 1;
8850 else
8851 while (next_size < vector_sizes.length ()
8852 && maybe_lt (current_vector_size, vector_sizes[next_size]))
8853 next_size += 1;
8855 if (next_size == vector_sizes.length ())
8856 epilogue = NULL;
8859 if (epilogue)
8861 epilogue->force_vectorize = loop->force_vectorize;
8862 epilogue->safelen = loop->safelen;
8863 epilogue->dont_vectorize = false;
8865 /* We may need to if-convert epilogue to vectorize it. */
8866 if (LOOP_VINFO_SCALAR_LOOP (loop_vinfo))
8867 tree_if_conversion (epilogue);
8870 return epilogue;
8873 /* The code below is trying to perform simple optimization - revert
8874 if-conversion for masked stores, i.e. if the mask of a store is zero
8875 do not perform it and all stored value producers also if possible.
8876 For example,
8877 for (i=0; i<n; i++)
8878 if (c[i])
8880 p1[i] += 1;
8881 p2[i] = p3[i] +2;
8883 this transformation will produce the following semi-hammock:
8885 if (!mask__ifc__42.18_165 == { 0, 0, 0, 0, 0, 0, 0, 0 })
8887 vect__11.19_170 = MASK_LOAD (vectp_p1.20_168, 0B, mask__ifc__42.18_165);
8888 vect__12.22_172 = vect__11.19_170 + vect_cst__171;
8889 MASK_STORE (vectp_p1.23_175, 0B, mask__ifc__42.18_165, vect__12.22_172);
8890 vect__18.25_182 = MASK_LOAD (vectp_p3.26_180, 0B, mask__ifc__42.18_165);
8891 vect__19.28_184 = vect__18.25_182 + vect_cst__183;
8892 MASK_STORE (vectp_p2.29_187, 0B, mask__ifc__42.18_165, vect__19.28_184);
8896 void
8897 optimize_mask_stores (struct loop *loop)
8899 basic_block *bbs = get_loop_body (loop);
8900 unsigned nbbs = loop->num_nodes;
8901 unsigned i;
8902 basic_block bb;
8903 struct loop *bb_loop;
8904 gimple_stmt_iterator gsi;
8905 gimple *stmt;
8906 auto_vec<gimple *> worklist;
8907 auto_purge_vect_location sentinel;
8909 vect_location = find_loop_location (loop);
8910 /* Pick up all masked stores in loop if any. */
8911 for (i = 0; i < nbbs; i++)
8913 bb = bbs[i];
8914 for (gsi = gsi_start_bb (bb); !gsi_end_p (gsi);
8915 gsi_next (&gsi))
8917 stmt = gsi_stmt (gsi);
8918 if (gimple_call_internal_p (stmt, IFN_MASK_STORE))
8919 worklist.safe_push (stmt);
8923 free (bbs);
8924 if (worklist.is_empty ())
8925 return;
8927 /* Loop has masked stores. */
8928 while (!worklist.is_empty ())
8930 gimple *last, *last_store;
8931 edge e, efalse;
8932 tree mask;
8933 basic_block store_bb, join_bb;
8934 gimple_stmt_iterator gsi_to;
8935 tree vdef, new_vdef;
8936 gphi *phi;
8937 tree vectype;
8938 tree zero;
8940 last = worklist.pop ();
8941 mask = gimple_call_arg (last, 2);
8942 bb = gimple_bb (last);
8943 /* Create then_bb and if-then structure in CFG, then_bb belongs to
8944 the same loop as if_bb. It could be different to LOOP when two
8945 level loop-nest is vectorized and mask_store belongs to the inner
8946 one. */
8947 e = split_block (bb, last);
8948 bb_loop = bb->loop_father;
8949 gcc_assert (loop == bb_loop || flow_loop_nested_p (loop, bb_loop));
8950 join_bb = e->dest;
8951 store_bb = create_empty_bb (bb);
8952 add_bb_to_loop (store_bb, bb_loop);
8953 e->flags = EDGE_TRUE_VALUE;
8954 efalse = make_edge (bb, store_bb, EDGE_FALSE_VALUE);
8955 /* Put STORE_BB to likely part. */
8956 efalse->probability = profile_probability::unlikely ();
8957 store_bb->count = efalse->count ();
8958 make_single_succ_edge (store_bb, join_bb, EDGE_FALLTHRU);
8959 if (dom_info_available_p (CDI_DOMINATORS))
8960 set_immediate_dominator (CDI_DOMINATORS, store_bb, bb);
8961 if (dump_enabled_p ())
8962 dump_printf_loc (MSG_NOTE, vect_location,
8963 "Create new block %d to sink mask stores.",
8964 store_bb->index);
8965 /* Create vector comparison with boolean result. */
8966 vectype = TREE_TYPE (mask);
8967 zero = build_zero_cst (vectype);
8968 stmt = gimple_build_cond (EQ_EXPR, mask, zero, NULL_TREE, NULL_TREE);
8969 gsi = gsi_last_bb (bb);
8970 gsi_insert_after (&gsi, stmt, GSI_SAME_STMT);
8971 /* Create new PHI node for vdef of the last masked store:
8972 .MEM_2 = VDEF <.MEM_1>
8973 will be converted to
8974 .MEM.3 = VDEF <.MEM_1>
8975 and new PHI node will be created in join bb
8976 .MEM_2 = PHI <.MEM_1, .MEM_3>
8978 vdef = gimple_vdef (last);
8979 new_vdef = make_ssa_name (gimple_vop (cfun), last);
8980 gimple_set_vdef (last, new_vdef);
8981 phi = create_phi_node (vdef, join_bb);
8982 add_phi_arg (phi, new_vdef, EDGE_SUCC (store_bb, 0), UNKNOWN_LOCATION);
8984 /* Put all masked stores with the same mask to STORE_BB if possible. */
8985 while (true)
8987 gimple_stmt_iterator gsi_from;
8988 gimple *stmt1 = NULL;
8990 /* Move masked store to STORE_BB. */
8991 last_store = last;
8992 gsi = gsi_for_stmt (last);
8993 gsi_from = gsi;
8994 /* Shift GSI to the previous stmt for further traversal. */
8995 gsi_prev (&gsi);
8996 gsi_to = gsi_start_bb (store_bb);
8997 gsi_move_before (&gsi_from, &gsi_to);
8998 /* Setup GSI_TO to the non-empty block start. */
8999 gsi_to = gsi_start_bb (store_bb);
9000 if (dump_enabled_p ())
9001 dump_printf_loc (MSG_NOTE, vect_location,
9002 "Move stmt to created bb\n%G", last);
9003 /* Move all stored value producers if possible. */
9004 while (!gsi_end_p (gsi))
9006 tree lhs;
9007 imm_use_iterator imm_iter;
9008 use_operand_p use_p;
9009 bool res;
9011 /* Skip debug statements. */
9012 if (is_gimple_debug (gsi_stmt (gsi)))
9014 gsi_prev (&gsi);
9015 continue;
9017 stmt1 = gsi_stmt (gsi);
9018 /* Do not consider statements writing to memory or having
9019 volatile operand. */
9020 if (gimple_vdef (stmt1)
9021 || gimple_has_volatile_ops (stmt1))
9022 break;
9023 gsi_from = gsi;
9024 gsi_prev (&gsi);
9025 lhs = gimple_get_lhs (stmt1);
9026 if (!lhs)
9027 break;
9029 /* LHS of vectorized stmt must be SSA_NAME. */
9030 if (TREE_CODE (lhs) != SSA_NAME)
9031 break;
9033 if (!VECTOR_TYPE_P (TREE_TYPE (lhs)))
9035 /* Remove dead scalar statement. */
9036 if (has_zero_uses (lhs))
9038 gsi_remove (&gsi_from, true);
9039 continue;
9043 /* Check that LHS does not have uses outside of STORE_BB. */
9044 res = true;
9045 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, lhs)
9047 gimple *use_stmt;
9048 use_stmt = USE_STMT (use_p);
9049 if (is_gimple_debug (use_stmt))
9050 continue;
9051 if (gimple_bb (use_stmt) != store_bb)
9053 res = false;
9054 break;
9057 if (!res)
9058 break;
9060 if (gimple_vuse (stmt1)
9061 && gimple_vuse (stmt1) != gimple_vuse (last_store))
9062 break;
9064 /* Can move STMT1 to STORE_BB. */
9065 if (dump_enabled_p ())
9066 dump_printf_loc (MSG_NOTE, vect_location,
9067 "Move stmt to created bb\n%G", stmt1);
9068 gsi_move_before (&gsi_from, &gsi_to);
9069 /* Shift GSI_TO for further insertion. */
9070 gsi_prev (&gsi_to);
9072 /* Put other masked stores with the same mask to STORE_BB. */
9073 if (worklist.is_empty ()
9074 || gimple_call_arg (worklist.last (), 2) != mask
9075 || worklist.last () != stmt1)
9076 break;
9077 last = worklist.pop ();
9079 add_phi_arg (phi, gimple_vuse (last_store), e, UNKNOWN_LOCATION);
9083 /* Decide whether it is possible to use a zero-based induction variable
9084 when vectorizing LOOP_VINFO with a fully-masked loop. If it is,
9085 return the value that the induction variable must be able to hold
9086 in order to ensure that the loop ends with an all-false mask.
9087 Return -1 otherwise. */
9088 widest_int
9089 vect_iv_limit_for_full_masking (loop_vec_info loop_vinfo)
9091 tree niters_skip = LOOP_VINFO_MASK_SKIP_NITERS (loop_vinfo);
9092 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
9093 unsigned HOST_WIDE_INT max_vf = vect_max_vf (loop_vinfo);
9095 /* Calculate the value that the induction variable must be able
9096 to hit in order to ensure that we end the loop with an all-false mask.
9097 This involves adding the maximum number of inactive trailing scalar
9098 iterations. */
9099 widest_int iv_limit = -1;
9100 if (max_loop_iterations (loop, &iv_limit))
9102 if (niters_skip)
9104 /* Add the maximum number of skipped iterations to the
9105 maximum iteration count. */
9106 if (TREE_CODE (niters_skip) == INTEGER_CST)
9107 iv_limit += wi::to_widest (niters_skip);
9108 else
9109 iv_limit += max_vf - 1;
9111 else if (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo))
9112 /* Make a conservatively-correct assumption. */
9113 iv_limit += max_vf - 1;
9115 /* IV_LIMIT is the maximum number of latch iterations, which is also
9116 the maximum in-range IV value. Round this value down to the previous
9117 vector alignment boundary and then add an extra full iteration. */
9118 poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
9119 iv_limit = (iv_limit & -(int) known_alignment (vf)) + max_vf;
9121 return iv_limit;