[26/46] Make more use of dyn_cast in tree-vect*
[official-gcc.git] / gcc / tree-vect-loop.c
blobcaa1c6b872539d24c0a682f0ed62b39390447a87
1 /* Loop Vectorization
2 Copyright (C) 2003-2018 Free Software Foundation, Inc.
3 Contributed by Dorit Naishlos <dorit@il.ibm.com> and
4 Ira Rosen <irar@il.ibm.com>
6 This file is part of GCC.
8 GCC is free software; you can redistribute it and/or modify it under
9 the terms of the GNU General Public License as published by the Free
10 Software Foundation; either version 3, or (at your option) any later
11 version.
13 GCC is distributed in the hope that it will be useful, but WITHOUT ANY
14 WARRANTY; without even the implied warranty of MERCHANTABILITY or
15 FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
16 for more details.
18 You should have received a copy of the GNU General Public License
19 along with GCC; see the file COPYING3. If not see
20 <http://www.gnu.org/licenses/>. */
22 #include "config.h"
23 #include "system.h"
24 #include "coretypes.h"
25 #include "backend.h"
26 #include "target.h"
27 #include "rtl.h"
28 #include "tree.h"
29 #include "gimple.h"
30 #include "cfghooks.h"
31 #include "tree-pass.h"
32 #include "ssa.h"
33 #include "optabs-tree.h"
34 #include "diagnostic-core.h"
35 #include "fold-const.h"
36 #include "stor-layout.h"
37 #include "cfganal.h"
38 #include "gimplify.h"
39 #include "gimple-iterator.h"
40 #include "gimplify-me.h"
41 #include "tree-ssa-loop-ivopts.h"
42 #include "tree-ssa-loop-manip.h"
43 #include "tree-ssa-loop-niter.h"
44 #include "tree-ssa-loop.h"
45 #include "cfgloop.h"
46 #include "params.h"
47 #include "tree-scalar-evolution.h"
48 #include "tree-vectorizer.h"
49 #include "gimple-fold.h"
50 #include "cgraph.h"
51 #include "tree-cfg.h"
52 #include "tree-if-conv.h"
53 #include "internal-fn.h"
54 #include "tree-vector-builder.h"
55 #include "vec-perm-indices.h"
56 #include "tree-eh.h"
58 /* Loop Vectorization Pass.
60 This pass tries to vectorize loops.
62 For example, the vectorizer transforms the following simple loop:
64 short a[N]; short b[N]; short c[N]; int i;
66 for (i=0; i<N; i++){
67 a[i] = b[i] + c[i];
70 as if it was manually vectorized by rewriting the source code into:
72 typedef int __attribute__((mode(V8HI))) v8hi;
73 short a[N]; short b[N]; short c[N]; int i;
74 v8hi *pa = (v8hi*)a, *pb = (v8hi*)b, *pc = (v8hi*)c;
75 v8hi va, vb, vc;
77 for (i=0; i<N/8; i++){
78 vb = pb[i];
79 vc = pc[i];
80 va = vb + vc;
81 pa[i] = va;
84 The main entry to this pass is vectorize_loops(), in which
85 the vectorizer applies a set of analyses on a given set of loops,
86 followed by the actual vectorization transformation for the loops that
87 had successfully passed the analysis phase.
88 Throughout this pass we make a distinction between two types of
89 data: scalars (which are represented by SSA_NAMES), and memory references
90 ("data-refs"). These two types of data require different handling both
91 during analysis and transformation. The types of data-refs that the
92 vectorizer currently supports are ARRAY_REFS which base is an array DECL
93 (not a pointer), and INDIRECT_REFS through pointers; both array and pointer
94 accesses are required to have a simple (consecutive) access pattern.
96 Analysis phase:
97 ===============
98 The driver for the analysis phase is vect_analyze_loop().
99 It applies a set of analyses, some of which rely on the scalar evolution
100 analyzer (scev) developed by Sebastian Pop.
102 During the analysis phase the vectorizer records some information
103 per stmt in a "stmt_vec_info" struct which is attached to each stmt in the
104 loop, as well as general information about the loop as a whole, which is
105 recorded in a "loop_vec_info" struct attached to each loop.
107 Transformation phase:
108 =====================
109 The loop transformation phase scans all the stmts in the loop, and
110 creates a vector stmt (or a sequence of stmts) for each scalar stmt S in
111 the loop that needs to be vectorized. It inserts the vector code sequence
112 just before the scalar stmt S, and records a pointer to the vector code
113 in STMT_VINFO_VEC_STMT (stmt_info) (stmt_info is the stmt_vec_info struct
114 attached to S). This pointer will be used for the vectorization of following
115 stmts which use the def of stmt S. Stmt S is removed if it writes to memory;
116 otherwise, we rely on dead code elimination for removing it.
118 For example, say stmt S1 was vectorized into stmt VS1:
120 VS1: vb = px[i];
121 S1: b = x[i]; STMT_VINFO_VEC_STMT (stmt_info (S1)) = VS1
122 S2: a = b;
124 To vectorize stmt S2, the vectorizer first finds the stmt that defines
125 the operand 'b' (S1), and gets the relevant vector def 'vb' from the
126 vector stmt VS1 pointed to by STMT_VINFO_VEC_STMT (stmt_info (S1)). The
127 resulting sequence would be:
129 VS1: vb = px[i];
130 S1: b = x[i]; STMT_VINFO_VEC_STMT (stmt_info (S1)) = VS1
131 VS2: va = vb;
132 S2: a = b; STMT_VINFO_VEC_STMT (stmt_info (S2)) = VS2
134 Operands that are not SSA_NAMEs, are data-refs that appear in
135 load/store operations (like 'x[i]' in S1), and are handled differently.
137 Target modeling:
138 =================
139 Currently the only target specific information that is used is the
140 size of the vector (in bytes) - "TARGET_VECTORIZE_UNITS_PER_SIMD_WORD".
141 Targets that can support different sizes of vectors, for now will need
142 to specify one value for "TARGET_VECTORIZE_UNITS_PER_SIMD_WORD". More
143 flexibility will be added in the future.
145 Since we only vectorize operations which vector form can be
146 expressed using existing tree codes, to verify that an operation is
147 supported, the vectorizer checks the relevant optab at the relevant
148 machine_mode (e.g, optab_handler (add_optab, V8HImode)). If
149 the value found is CODE_FOR_nothing, then there's no target support, and
150 we can't vectorize the stmt.
152 For additional information on this project see:
153 http://gcc.gnu.org/projects/tree-ssa/vectorization.html
156 static void vect_estimate_min_profitable_iters (loop_vec_info, int *, int *);
158 /* Subroutine of vect_determine_vf_for_stmt that handles only one
159 statement. VECTYPE_MAYBE_SET_P is true if STMT_VINFO_VECTYPE
160 may already be set for general statements (not just data refs). */
162 static bool
163 vect_determine_vf_for_stmt_1 (stmt_vec_info stmt_info,
164 bool vectype_maybe_set_p,
165 poly_uint64 *vf,
166 vec<stmt_vec_info > *mask_producers)
168 gimple *stmt = stmt_info->stmt;
170 if ((!STMT_VINFO_RELEVANT_P (stmt_info)
171 && !STMT_VINFO_LIVE_P (stmt_info))
172 || gimple_clobber_p (stmt))
174 if (dump_enabled_p ())
175 dump_printf_loc (MSG_NOTE, vect_location, "skip.\n");
176 return true;
179 tree stmt_vectype, nunits_vectype;
180 if (!vect_get_vector_types_for_stmt (stmt_info, &stmt_vectype,
181 &nunits_vectype))
182 return false;
184 if (stmt_vectype)
186 if (STMT_VINFO_VECTYPE (stmt_info))
187 /* The only case when a vectype had been already set is for stmts
188 that contain a data ref, or for "pattern-stmts" (stmts generated
189 by the vectorizer to represent/replace a certain idiom). */
190 gcc_assert ((STMT_VINFO_DATA_REF (stmt_info)
191 || vectype_maybe_set_p)
192 && STMT_VINFO_VECTYPE (stmt_info) == stmt_vectype);
193 else if (stmt_vectype == boolean_type_node)
194 mask_producers->safe_push (stmt_info);
195 else
196 STMT_VINFO_VECTYPE (stmt_info) = stmt_vectype;
199 if (nunits_vectype)
200 vect_update_max_nunits (vf, nunits_vectype);
202 return true;
205 /* Subroutine of vect_determine_vectorization_factor. Set the vector
206 types of STMT_INFO and all attached pattern statements and update
207 the vectorization factor VF accordingly. If some of the statements
208 produce a mask result whose vector type can only be calculated later,
209 add them to MASK_PRODUCERS. Return true on success or false if
210 something prevented vectorization. */
212 static bool
213 vect_determine_vf_for_stmt (stmt_vec_info stmt_info, poly_uint64 *vf,
214 vec<stmt_vec_info > *mask_producers)
216 vec_info *vinfo = stmt_info->vinfo;
217 if (dump_enabled_p ())
219 dump_printf_loc (MSG_NOTE, vect_location, "==> examining statement: ");
220 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, stmt_info->stmt, 0);
222 if (!vect_determine_vf_for_stmt_1 (stmt_info, false, vf, mask_producers))
223 return false;
225 if (STMT_VINFO_IN_PATTERN_P (stmt_info)
226 && STMT_VINFO_RELATED_STMT (stmt_info))
228 gimple *pattern_def_seq = STMT_VINFO_PATTERN_DEF_SEQ (stmt_info);
229 stmt_info = STMT_VINFO_RELATED_STMT (stmt_info);
231 /* If a pattern statement has def stmts, analyze them too. */
232 for (gimple_stmt_iterator si = gsi_start (pattern_def_seq);
233 !gsi_end_p (si); gsi_next (&si))
235 stmt_vec_info def_stmt_info = vinfo->lookup_stmt (gsi_stmt (si));
236 if (dump_enabled_p ())
238 dump_printf_loc (MSG_NOTE, vect_location,
239 "==> examining pattern def stmt: ");
240 dump_gimple_stmt (MSG_NOTE, TDF_SLIM,
241 def_stmt_info->stmt, 0);
243 if (!vect_determine_vf_for_stmt_1 (def_stmt_info, true,
244 vf, mask_producers))
245 return false;
248 if (dump_enabled_p ())
250 dump_printf_loc (MSG_NOTE, vect_location,
251 "==> examining pattern statement: ");
252 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, stmt_info->stmt, 0);
254 if (!vect_determine_vf_for_stmt_1 (stmt_info, true, vf, mask_producers))
255 return false;
258 return true;
261 /* Function vect_determine_vectorization_factor
263 Determine the vectorization factor (VF). VF is the number of data elements
264 that are operated upon in parallel in a single iteration of the vectorized
265 loop. For example, when vectorizing a loop that operates on 4byte elements,
266 on a target with vector size (VS) 16byte, the VF is set to 4, since 4
267 elements can fit in a single vector register.
269 We currently support vectorization of loops in which all types operated upon
270 are of the same size. Therefore this function currently sets VF according to
271 the size of the types operated upon, and fails if there are multiple sizes
272 in the loop.
274 VF is also the factor by which the loop iterations are strip-mined, e.g.:
275 original loop:
276 for (i=0; i<N; i++){
277 a[i] = b[i] + c[i];
280 vectorized loop:
281 for (i=0; i<N; i+=VF){
282 a[i:VF] = b[i:VF] + c[i:VF];
286 static bool
287 vect_determine_vectorization_factor (loop_vec_info loop_vinfo)
289 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
290 basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
291 unsigned nbbs = loop->num_nodes;
292 poly_uint64 vectorization_factor = 1;
293 tree scalar_type = NULL_TREE;
294 gphi *phi;
295 tree vectype;
296 stmt_vec_info stmt_info;
297 unsigned i;
298 auto_vec<stmt_vec_info> mask_producers;
300 DUMP_VECT_SCOPE ("vect_determine_vectorization_factor");
302 for (i = 0; i < nbbs; i++)
304 basic_block bb = bbs[i];
306 for (gphi_iterator si = gsi_start_phis (bb); !gsi_end_p (si);
307 gsi_next (&si))
309 phi = si.phi ();
310 stmt_info = loop_vinfo->lookup_stmt (phi);
311 if (dump_enabled_p ())
313 dump_printf_loc (MSG_NOTE, vect_location, "==> examining phi: ");
314 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, phi, 0);
317 gcc_assert (stmt_info);
319 if (STMT_VINFO_RELEVANT_P (stmt_info)
320 || STMT_VINFO_LIVE_P (stmt_info))
322 gcc_assert (!STMT_VINFO_VECTYPE (stmt_info));
323 scalar_type = TREE_TYPE (PHI_RESULT (phi));
325 if (dump_enabled_p ())
327 dump_printf_loc (MSG_NOTE, vect_location,
328 "get vectype for scalar type: ");
329 dump_generic_expr (MSG_NOTE, TDF_SLIM, scalar_type);
330 dump_printf (MSG_NOTE, "\n");
333 vectype = get_vectype_for_scalar_type (scalar_type);
334 if (!vectype)
336 if (dump_enabled_p ())
338 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
339 "not vectorized: unsupported "
340 "data-type ");
341 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
342 scalar_type);
343 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
345 return false;
347 STMT_VINFO_VECTYPE (stmt_info) = vectype;
349 if (dump_enabled_p ())
351 dump_printf_loc (MSG_NOTE, vect_location, "vectype: ");
352 dump_generic_expr (MSG_NOTE, TDF_SLIM, vectype);
353 dump_printf (MSG_NOTE, "\n");
356 if (dump_enabled_p ())
358 dump_printf_loc (MSG_NOTE, vect_location, "nunits = ");
359 dump_dec (MSG_NOTE, TYPE_VECTOR_SUBPARTS (vectype));
360 dump_printf (MSG_NOTE, "\n");
363 vect_update_max_nunits (&vectorization_factor, vectype);
367 for (gimple_stmt_iterator si = gsi_start_bb (bb); !gsi_end_p (si);
368 gsi_next (&si))
370 stmt_info = loop_vinfo->lookup_stmt (gsi_stmt (si));
371 if (!vect_determine_vf_for_stmt (stmt_info, &vectorization_factor,
372 &mask_producers))
373 return false;
377 /* TODO: Analyze cost. Decide if worth while to vectorize. */
378 if (dump_enabled_p ())
380 dump_printf_loc (MSG_NOTE, vect_location, "vectorization factor = ");
381 dump_dec (MSG_NOTE, vectorization_factor);
382 dump_printf (MSG_NOTE, "\n");
385 if (known_le (vectorization_factor, 1U))
387 if (dump_enabled_p ())
388 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
389 "not vectorized: unsupported data-type\n");
390 return false;
392 LOOP_VINFO_VECT_FACTOR (loop_vinfo) = vectorization_factor;
394 for (i = 0; i < mask_producers.length (); i++)
396 stmt_info = mask_producers[i];
397 tree mask_type = vect_get_mask_type_for_stmt (stmt_info);
398 if (!mask_type)
399 return false;
400 STMT_VINFO_VECTYPE (stmt_info) = mask_type;
403 return true;
407 /* Function vect_is_simple_iv_evolution.
409 FORNOW: A simple evolution of an induction variables in the loop is
410 considered a polynomial evolution. */
412 static bool
413 vect_is_simple_iv_evolution (unsigned loop_nb, tree access_fn, tree * init,
414 tree * step)
416 tree init_expr;
417 tree step_expr;
418 tree evolution_part = evolution_part_in_loop_num (access_fn, loop_nb);
419 basic_block bb;
421 /* When there is no evolution in this loop, the evolution function
422 is not "simple". */
423 if (evolution_part == NULL_TREE)
424 return false;
426 /* When the evolution is a polynomial of degree >= 2
427 the evolution function is not "simple". */
428 if (tree_is_chrec (evolution_part))
429 return false;
431 step_expr = evolution_part;
432 init_expr = unshare_expr (initial_condition_in_loop_num (access_fn, loop_nb));
434 if (dump_enabled_p ())
436 dump_printf_loc (MSG_NOTE, vect_location, "step: ");
437 dump_generic_expr (MSG_NOTE, TDF_SLIM, step_expr);
438 dump_printf (MSG_NOTE, ", init: ");
439 dump_generic_expr (MSG_NOTE, TDF_SLIM, init_expr);
440 dump_printf (MSG_NOTE, "\n");
443 *init = init_expr;
444 *step = step_expr;
446 if (TREE_CODE (step_expr) != INTEGER_CST
447 && (TREE_CODE (step_expr) != SSA_NAME
448 || ((bb = gimple_bb (SSA_NAME_DEF_STMT (step_expr)))
449 && flow_bb_inside_loop_p (get_loop (cfun, loop_nb), bb))
450 || (!INTEGRAL_TYPE_P (TREE_TYPE (step_expr))
451 && (!SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr))
452 || !flag_associative_math)))
453 && (TREE_CODE (step_expr) != REAL_CST
454 || !flag_associative_math))
456 if (dump_enabled_p ())
457 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
458 "step unknown.\n");
459 return false;
462 return true;
465 /* Function vect_analyze_scalar_cycles_1.
467 Examine the cross iteration def-use cycles of scalar variables
468 in LOOP. LOOP_VINFO represents the loop that is now being
469 considered for vectorization (can be LOOP, or an outer-loop
470 enclosing LOOP). */
472 static void
473 vect_analyze_scalar_cycles_1 (loop_vec_info loop_vinfo, struct loop *loop)
475 basic_block bb = loop->header;
476 tree init, step;
477 auto_vec<gimple *, 64> worklist;
478 gphi_iterator gsi;
479 bool double_reduc;
481 DUMP_VECT_SCOPE ("vect_analyze_scalar_cycles");
483 /* First - identify all inductions. Reduction detection assumes that all the
484 inductions have been identified, therefore, this order must not be
485 changed. */
486 for (gsi = gsi_start_phis (bb); !gsi_end_p (gsi); gsi_next (&gsi))
488 gphi *phi = gsi.phi ();
489 tree access_fn = NULL;
490 tree def = PHI_RESULT (phi);
491 stmt_vec_info stmt_vinfo = loop_vinfo->lookup_stmt (phi);
493 if (dump_enabled_p ())
495 dump_printf_loc (MSG_NOTE, vect_location, "Analyze phi: ");
496 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, phi, 0);
499 /* Skip virtual phi's. The data dependences that are associated with
500 virtual defs/uses (i.e., memory accesses) are analyzed elsewhere. */
501 if (virtual_operand_p (def))
502 continue;
504 STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_unknown_def_type;
506 /* Analyze the evolution function. */
507 access_fn = analyze_scalar_evolution (loop, def);
508 if (access_fn)
510 STRIP_NOPS (access_fn);
511 if (dump_enabled_p ())
513 dump_printf_loc (MSG_NOTE, vect_location,
514 "Access function of PHI: ");
515 dump_generic_expr (MSG_NOTE, TDF_SLIM, access_fn);
516 dump_printf (MSG_NOTE, "\n");
518 STMT_VINFO_LOOP_PHI_EVOLUTION_BASE_UNCHANGED (stmt_vinfo)
519 = initial_condition_in_loop_num (access_fn, loop->num);
520 STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_vinfo)
521 = evolution_part_in_loop_num (access_fn, loop->num);
524 if (!access_fn
525 || !vect_is_simple_iv_evolution (loop->num, access_fn, &init, &step)
526 || (LOOP_VINFO_LOOP (loop_vinfo) != loop
527 && TREE_CODE (step) != INTEGER_CST))
529 worklist.safe_push (phi);
530 continue;
533 gcc_assert (STMT_VINFO_LOOP_PHI_EVOLUTION_BASE_UNCHANGED (stmt_vinfo)
534 != NULL_TREE);
535 gcc_assert (STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_vinfo) != NULL_TREE);
537 if (dump_enabled_p ())
538 dump_printf_loc (MSG_NOTE, vect_location, "Detected induction.\n");
539 STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_induction_def;
543 /* Second - identify all reductions and nested cycles. */
544 while (worklist.length () > 0)
546 gimple *phi = worklist.pop ();
547 tree def = PHI_RESULT (phi);
548 stmt_vec_info stmt_vinfo = vinfo_for_stmt (phi);
550 if (dump_enabled_p ())
552 dump_printf_loc (MSG_NOTE, vect_location, "Analyze phi: ");
553 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, phi, 0);
556 gcc_assert (!virtual_operand_p (def)
557 && STMT_VINFO_DEF_TYPE (stmt_vinfo) == vect_unknown_def_type);
559 stmt_vec_info reduc_stmt_info
560 = vect_force_simple_reduction (loop_vinfo, stmt_vinfo,
561 &double_reduc, false);
562 if (reduc_stmt_info)
564 if (double_reduc)
566 if (dump_enabled_p ())
567 dump_printf_loc (MSG_NOTE, vect_location,
568 "Detected double reduction.\n");
570 STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_double_reduction_def;
571 STMT_VINFO_DEF_TYPE (reduc_stmt_info)
572 = vect_double_reduction_def;
574 else
576 if (loop != LOOP_VINFO_LOOP (loop_vinfo))
578 if (dump_enabled_p ())
579 dump_printf_loc (MSG_NOTE, vect_location,
580 "Detected vectorizable nested cycle.\n");
582 STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_nested_cycle;
583 STMT_VINFO_DEF_TYPE (reduc_stmt_info) = vect_nested_cycle;
585 else
587 if (dump_enabled_p ())
588 dump_printf_loc (MSG_NOTE, vect_location,
589 "Detected reduction.\n");
591 STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_reduction_def;
592 STMT_VINFO_DEF_TYPE (reduc_stmt_info) = vect_reduction_def;
593 /* Store the reduction cycles for possible vectorization in
594 loop-aware SLP if it was not detected as reduction
595 chain. */
596 if (! REDUC_GROUP_FIRST_ELEMENT (reduc_stmt_info))
597 LOOP_VINFO_REDUCTIONS (loop_vinfo).safe_push
598 (reduc_stmt_info);
602 else
603 if (dump_enabled_p ())
604 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
605 "Unknown def-use cycle pattern.\n");
610 /* Function vect_analyze_scalar_cycles.
612 Examine the cross iteration def-use cycles of scalar variables, by
613 analyzing the loop-header PHIs of scalar variables. Classify each
614 cycle as one of the following: invariant, induction, reduction, unknown.
615 We do that for the loop represented by LOOP_VINFO, and also to its
616 inner-loop, if exists.
617 Examples for scalar cycles:
619 Example1: reduction:
621 loop1:
622 for (i=0; i<N; i++)
623 sum += a[i];
625 Example2: induction:
627 loop2:
628 for (i=0; i<N; i++)
629 a[i] = i; */
631 static void
632 vect_analyze_scalar_cycles (loop_vec_info loop_vinfo)
634 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
636 vect_analyze_scalar_cycles_1 (loop_vinfo, loop);
638 /* When vectorizing an outer-loop, the inner-loop is executed sequentially.
639 Reductions in such inner-loop therefore have different properties than
640 the reductions in the nest that gets vectorized:
641 1. When vectorized, they are executed in the same order as in the original
642 scalar loop, so we can't change the order of computation when
643 vectorizing them.
644 2. FIXME: Inner-loop reductions can be used in the inner-loop, so the
645 current checks are too strict. */
647 if (loop->inner)
648 vect_analyze_scalar_cycles_1 (loop_vinfo, loop->inner);
651 /* Transfer group and reduction information from STMT to its pattern stmt. */
653 static void
654 vect_fixup_reduc_chain (gimple *stmt)
656 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
657 stmt_vec_info firstp = STMT_VINFO_RELATED_STMT (stmt_info);
658 stmt_vec_info stmtp;
659 gcc_assert (!REDUC_GROUP_FIRST_ELEMENT (firstp)
660 && REDUC_GROUP_FIRST_ELEMENT (stmt_info));
661 REDUC_GROUP_SIZE (firstp) = REDUC_GROUP_SIZE (stmt_info);
664 stmtp = STMT_VINFO_RELATED_STMT (stmt_info);
665 REDUC_GROUP_FIRST_ELEMENT (stmtp) = firstp;
666 stmt_info = REDUC_GROUP_NEXT_ELEMENT (stmt_info);
667 if (stmt_info)
668 REDUC_GROUP_NEXT_ELEMENT (stmtp)
669 = STMT_VINFO_RELATED_STMT (stmt_info);
671 while (stmt_info);
672 STMT_VINFO_DEF_TYPE (stmtp) = vect_reduction_def;
675 /* Fixup scalar cycles that now have their stmts detected as patterns. */
677 static void
678 vect_fixup_scalar_cycles_with_patterns (loop_vec_info loop_vinfo)
680 stmt_vec_info first;
681 unsigned i;
683 FOR_EACH_VEC_ELT (LOOP_VINFO_REDUCTION_CHAINS (loop_vinfo), i, first)
684 if (STMT_VINFO_IN_PATTERN_P (first))
686 stmt_vec_info next = REDUC_GROUP_NEXT_ELEMENT (first);
687 while (next)
689 if (! STMT_VINFO_IN_PATTERN_P (next))
690 break;
691 next = REDUC_GROUP_NEXT_ELEMENT (next);
693 /* If not all stmt in the chain are patterns try to handle
694 the chain without patterns. */
695 if (! next)
697 vect_fixup_reduc_chain (first);
698 LOOP_VINFO_REDUCTION_CHAINS (loop_vinfo)[i]
699 = STMT_VINFO_RELATED_STMT (first);
704 /* Function vect_get_loop_niters.
706 Determine how many iterations the loop is executed and place it
707 in NUMBER_OF_ITERATIONS. Place the number of latch iterations
708 in NUMBER_OF_ITERATIONSM1. Place the condition under which the
709 niter information holds in ASSUMPTIONS.
711 Return the loop exit condition. */
714 static gcond *
715 vect_get_loop_niters (struct loop *loop, tree *assumptions,
716 tree *number_of_iterations, tree *number_of_iterationsm1)
718 edge exit = single_exit (loop);
719 struct tree_niter_desc niter_desc;
720 tree niter_assumptions, niter, may_be_zero;
721 gcond *cond = get_loop_exit_condition (loop);
723 *assumptions = boolean_true_node;
724 *number_of_iterationsm1 = chrec_dont_know;
725 *number_of_iterations = chrec_dont_know;
726 DUMP_VECT_SCOPE ("get_loop_niters");
728 if (!exit)
729 return cond;
731 niter = chrec_dont_know;
732 may_be_zero = NULL_TREE;
733 niter_assumptions = boolean_true_node;
734 if (!number_of_iterations_exit_assumptions (loop, exit, &niter_desc, NULL)
735 || chrec_contains_undetermined (niter_desc.niter))
736 return cond;
738 niter_assumptions = niter_desc.assumptions;
739 may_be_zero = niter_desc.may_be_zero;
740 niter = niter_desc.niter;
742 if (may_be_zero && integer_zerop (may_be_zero))
743 may_be_zero = NULL_TREE;
745 if (may_be_zero)
747 if (COMPARISON_CLASS_P (may_be_zero))
749 /* Try to combine may_be_zero with assumptions, this can simplify
750 computation of niter expression. */
751 if (niter_assumptions && !integer_nonzerop (niter_assumptions))
752 niter_assumptions = fold_build2 (TRUTH_AND_EXPR, boolean_type_node,
753 niter_assumptions,
754 fold_build1 (TRUTH_NOT_EXPR,
755 boolean_type_node,
756 may_be_zero));
757 else
758 niter = fold_build3 (COND_EXPR, TREE_TYPE (niter), may_be_zero,
759 build_int_cst (TREE_TYPE (niter), 0),
760 rewrite_to_non_trapping_overflow (niter));
762 may_be_zero = NULL_TREE;
764 else if (integer_nonzerop (may_be_zero))
766 *number_of_iterationsm1 = build_int_cst (TREE_TYPE (niter), 0);
767 *number_of_iterations = build_int_cst (TREE_TYPE (niter), 1);
768 return cond;
770 else
771 return cond;
774 *assumptions = niter_assumptions;
775 *number_of_iterationsm1 = niter;
777 /* We want the number of loop header executions which is the number
778 of latch executions plus one.
779 ??? For UINT_MAX latch executions this number overflows to zero
780 for loops like do { n++; } while (n != 0); */
781 if (niter && !chrec_contains_undetermined (niter))
782 niter = fold_build2 (PLUS_EXPR, TREE_TYPE (niter), unshare_expr (niter),
783 build_int_cst (TREE_TYPE (niter), 1));
784 *number_of_iterations = niter;
786 return cond;
789 /* Function bb_in_loop_p
791 Used as predicate for dfs order traversal of the loop bbs. */
793 static bool
794 bb_in_loop_p (const_basic_block bb, const void *data)
796 const struct loop *const loop = (const struct loop *)data;
797 if (flow_bb_inside_loop_p (loop, bb))
798 return true;
799 return false;
803 /* Create and initialize a new loop_vec_info struct for LOOP_IN, as well as
804 stmt_vec_info structs for all the stmts in LOOP_IN. */
806 _loop_vec_info::_loop_vec_info (struct loop *loop_in, vec_info_shared *shared)
807 : vec_info (vec_info::loop, init_cost (loop_in), shared),
808 loop (loop_in),
809 bbs (XCNEWVEC (basic_block, loop->num_nodes)),
810 num_itersm1 (NULL_TREE),
811 num_iters (NULL_TREE),
812 num_iters_unchanged (NULL_TREE),
813 num_iters_assumptions (NULL_TREE),
814 th (0),
815 versioning_threshold (0),
816 vectorization_factor (0),
817 max_vectorization_factor (0),
818 mask_skip_niters (NULL_TREE),
819 mask_compare_type (NULL_TREE),
820 unaligned_dr (NULL),
821 peeling_for_alignment (0),
822 ptr_mask (0),
823 ivexpr_map (NULL),
824 slp_unrolling_factor (1),
825 single_scalar_iteration_cost (0),
826 vectorizable (false),
827 can_fully_mask_p (true),
828 fully_masked_p (false),
829 peeling_for_gaps (false),
830 peeling_for_niter (false),
831 operands_swapped (false),
832 no_data_dependencies (false),
833 has_mask_store (false),
834 scalar_loop (NULL),
835 orig_loop_info (NULL)
837 /* Create/Update stmt_info for all stmts in the loop. */
838 basic_block *body = get_loop_body (loop);
839 for (unsigned int i = 0; i < loop->num_nodes; i++)
841 basic_block bb = body[i];
842 gimple_stmt_iterator si;
844 for (si = gsi_start_phis (bb); !gsi_end_p (si); gsi_next (&si))
846 gimple *phi = gsi_stmt (si);
847 gimple_set_uid (phi, 0);
848 add_stmt (phi);
851 for (si = gsi_start_bb (bb); !gsi_end_p (si); gsi_next (&si))
853 gimple *stmt = gsi_stmt (si);
854 gimple_set_uid (stmt, 0);
855 add_stmt (stmt);
858 free (body);
860 /* CHECKME: We want to visit all BBs before their successors (except for
861 latch blocks, for which this assertion wouldn't hold). In the simple
862 case of the loop forms we allow, a dfs order of the BBs would the same
863 as reversed postorder traversal, so we are safe. */
865 unsigned int nbbs = dfs_enumerate_from (loop->header, 0, bb_in_loop_p,
866 bbs, loop->num_nodes, loop);
867 gcc_assert (nbbs == loop->num_nodes);
870 /* Free all levels of MASKS. */
872 void
873 release_vec_loop_masks (vec_loop_masks *masks)
875 rgroup_masks *rgm;
876 unsigned int i;
877 FOR_EACH_VEC_ELT (*masks, i, rgm)
878 rgm->masks.release ();
879 masks->release ();
882 /* Free all memory used by the _loop_vec_info, as well as all the
883 stmt_vec_info structs of all the stmts in the loop. */
885 _loop_vec_info::~_loop_vec_info ()
887 int nbbs;
888 gimple_stmt_iterator si;
889 int j;
891 /* ??? We're releasing loop_vinfos en-block. */
892 set_stmt_vec_info_vec (&stmt_vec_infos);
893 nbbs = loop->num_nodes;
894 for (j = 0; j < nbbs; j++)
896 basic_block bb = bbs[j];
897 for (si = gsi_start_phis (bb); !gsi_end_p (si); gsi_next (&si))
898 free_stmt_vec_info (gsi_stmt (si));
900 for (si = gsi_start_bb (bb); !gsi_end_p (si); )
902 gimple *stmt = gsi_stmt (si);
904 /* We may have broken canonical form by moving a constant
905 into RHS1 of a commutative op. Fix such occurrences. */
906 if (operands_swapped && is_gimple_assign (stmt))
908 enum tree_code code = gimple_assign_rhs_code (stmt);
910 if ((code == PLUS_EXPR
911 || code == POINTER_PLUS_EXPR
912 || code == MULT_EXPR)
913 && CONSTANT_CLASS_P (gimple_assign_rhs1 (stmt)))
914 swap_ssa_operands (stmt,
915 gimple_assign_rhs1_ptr (stmt),
916 gimple_assign_rhs2_ptr (stmt));
917 else if (code == COND_EXPR
918 && CONSTANT_CLASS_P (gimple_assign_rhs2 (stmt)))
920 tree cond_expr = gimple_assign_rhs1 (stmt);
921 enum tree_code cond_code = TREE_CODE (cond_expr);
923 if (TREE_CODE_CLASS (cond_code) == tcc_comparison)
925 bool honor_nans = HONOR_NANS (TREE_OPERAND (cond_expr,
926 0));
927 cond_code = invert_tree_comparison (cond_code,
928 honor_nans);
929 if (cond_code != ERROR_MARK)
931 TREE_SET_CODE (cond_expr, cond_code);
932 swap_ssa_operands (stmt,
933 gimple_assign_rhs2_ptr (stmt),
934 gimple_assign_rhs3_ptr (stmt));
940 /* Free stmt_vec_info. */
941 free_stmt_vec_info (stmt);
942 gsi_next (&si);
946 free (bbs);
948 release_vec_loop_masks (&masks);
949 delete ivexpr_map;
951 loop->aux = NULL;
954 /* Return an invariant or register for EXPR and emit necessary
955 computations in the LOOP_VINFO loop preheader. */
957 tree
958 cse_and_gimplify_to_preheader (loop_vec_info loop_vinfo, tree expr)
960 if (is_gimple_reg (expr)
961 || is_gimple_min_invariant (expr))
962 return expr;
964 if (! loop_vinfo->ivexpr_map)
965 loop_vinfo->ivexpr_map = new hash_map<tree_operand_hash, tree>;
966 tree &cached = loop_vinfo->ivexpr_map->get_or_insert (expr);
967 if (! cached)
969 gimple_seq stmts = NULL;
970 cached = force_gimple_operand (unshare_expr (expr),
971 &stmts, true, NULL_TREE);
972 if (stmts)
974 edge e = loop_preheader_edge (LOOP_VINFO_LOOP (loop_vinfo));
975 gsi_insert_seq_on_edge_immediate (e, stmts);
978 return cached;
981 /* Return true if we can use CMP_TYPE as the comparison type to produce
982 all masks required to mask LOOP_VINFO. */
984 static bool
985 can_produce_all_loop_masks_p (loop_vec_info loop_vinfo, tree cmp_type)
987 rgroup_masks *rgm;
988 unsigned int i;
989 FOR_EACH_VEC_ELT (LOOP_VINFO_MASKS (loop_vinfo), i, rgm)
990 if (rgm->mask_type != NULL_TREE
991 && !direct_internal_fn_supported_p (IFN_WHILE_ULT,
992 cmp_type, rgm->mask_type,
993 OPTIMIZE_FOR_SPEED))
994 return false;
995 return true;
998 /* Calculate the maximum number of scalars per iteration for every
999 rgroup in LOOP_VINFO. */
1001 static unsigned int
1002 vect_get_max_nscalars_per_iter (loop_vec_info loop_vinfo)
1004 unsigned int res = 1;
1005 unsigned int i;
1006 rgroup_masks *rgm;
1007 FOR_EACH_VEC_ELT (LOOP_VINFO_MASKS (loop_vinfo), i, rgm)
1008 res = MAX (res, rgm->max_nscalars_per_iter);
1009 return res;
1012 /* Each statement in LOOP_VINFO can be masked where necessary. Check
1013 whether we can actually generate the masks required. Return true if so,
1014 storing the type of the scalar IV in LOOP_VINFO_MASK_COMPARE_TYPE. */
1016 static bool
1017 vect_verify_full_masking (loop_vec_info loop_vinfo)
1019 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1020 unsigned int min_ni_width;
1022 /* Use a normal loop if there are no statements that need masking.
1023 This only happens in rare degenerate cases: it means that the loop
1024 has no loads, no stores, and no live-out values. */
1025 if (LOOP_VINFO_MASKS (loop_vinfo).is_empty ())
1026 return false;
1028 /* Get the maximum number of iterations that is representable
1029 in the counter type. */
1030 tree ni_type = TREE_TYPE (LOOP_VINFO_NITERSM1 (loop_vinfo));
1031 widest_int max_ni = wi::to_widest (TYPE_MAX_VALUE (ni_type)) + 1;
1033 /* Get a more refined estimate for the number of iterations. */
1034 widest_int max_back_edges;
1035 if (max_loop_iterations (loop, &max_back_edges))
1036 max_ni = wi::smin (max_ni, max_back_edges + 1);
1038 /* Account for rgroup masks, in which each bit is replicated N times. */
1039 max_ni *= vect_get_max_nscalars_per_iter (loop_vinfo);
1041 /* Work out how many bits we need to represent the limit. */
1042 min_ni_width = wi::min_precision (max_ni, UNSIGNED);
1044 /* Find a scalar mode for which WHILE_ULT is supported. */
1045 opt_scalar_int_mode cmp_mode_iter;
1046 tree cmp_type = NULL_TREE;
1047 FOR_EACH_MODE_IN_CLASS (cmp_mode_iter, MODE_INT)
1049 unsigned int cmp_bits = GET_MODE_BITSIZE (cmp_mode_iter.require ());
1050 if (cmp_bits >= min_ni_width
1051 && targetm.scalar_mode_supported_p (cmp_mode_iter.require ()))
1053 tree this_type = build_nonstandard_integer_type (cmp_bits, true);
1054 if (this_type
1055 && can_produce_all_loop_masks_p (loop_vinfo, this_type))
1057 /* Although we could stop as soon as we find a valid mode,
1058 it's often better to continue until we hit Pmode, since the
1059 operands to the WHILE are more likely to be reusable in
1060 address calculations. */
1061 cmp_type = this_type;
1062 if (cmp_bits >= GET_MODE_BITSIZE (Pmode))
1063 break;
1068 if (!cmp_type)
1069 return false;
1071 LOOP_VINFO_MASK_COMPARE_TYPE (loop_vinfo) = cmp_type;
1072 return true;
1075 /* Calculate the cost of one scalar iteration of the loop. */
1076 static void
1077 vect_compute_single_scalar_iteration_cost (loop_vec_info loop_vinfo)
1079 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1080 basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
1081 int nbbs = loop->num_nodes, factor;
1082 int innerloop_iters, i;
1084 /* Gather costs for statements in the scalar loop. */
1086 /* FORNOW. */
1087 innerloop_iters = 1;
1088 if (loop->inner)
1089 innerloop_iters = 50; /* FIXME */
1091 for (i = 0; i < nbbs; i++)
1093 gimple_stmt_iterator si;
1094 basic_block bb = bbs[i];
1096 if (bb->loop_father == loop->inner)
1097 factor = innerloop_iters;
1098 else
1099 factor = 1;
1101 for (si = gsi_start_bb (bb); !gsi_end_p (si); gsi_next (&si))
1103 gimple *stmt = gsi_stmt (si);
1104 stmt_vec_info stmt_info = loop_vinfo->lookup_stmt (stmt);
1106 if (!is_gimple_assign (stmt) && !is_gimple_call (stmt))
1107 continue;
1109 /* Skip stmts that are not vectorized inside the loop. */
1110 if (stmt_info
1111 && !STMT_VINFO_RELEVANT_P (stmt_info)
1112 && (!STMT_VINFO_LIVE_P (stmt_info)
1113 || !VECTORIZABLE_CYCLE_DEF (STMT_VINFO_DEF_TYPE (stmt_info)))
1114 && !STMT_VINFO_IN_PATTERN_P (stmt_info))
1115 continue;
1117 vect_cost_for_stmt kind;
1118 if (STMT_VINFO_DATA_REF (stmt_info))
1120 if (DR_IS_READ (STMT_VINFO_DATA_REF (stmt_info)))
1121 kind = scalar_load;
1122 else
1123 kind = scalar_store;
1125 else
1126 kind = scalar_stmt;
1128 record_stmt_cost (&LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo),
1129 factor, kind, stmt_info, 0, vect_prologue);
1133 /* Now accumulate cost. */
1134 void *target_cost_data = init_cost (loop);
1135 stmt_info_for_cost *si;
1136 int j;
1137 FOR_EACH_VEC_ELT (LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo),
1138 j, si)
1139 (void) add_stmt_cost (target_cost_data, si->count,
1140 si->kind, si->stmt_info, si->misalign,
1141 vect_body);
1142 unsigned dummy, body_cost = 0;
1143 finish_cost (target_cost_data, &dummy, &body_cost, &dummy);
1144 destroy_cost_data (target_cost_data);
1145 LOOP_VINFO_SINGLE_SCALAR_ITERATION_COST (loop_vinfo) = body_cost;
1149 /* Function vect_analyze_loop_form_1.
1151 Verify that certain CFG restrictions hold, including:
1152 - the loop has a pre-header
1153 - the loop has a single entry and exit
1154 - the loop exit condition is simple enough
1155 - the number of iterations can be analyzed, i.e, a countable loop. The
1156 niter could be analyzed under some assumptions. */
1158 bool
1159 vect_analyze_loop_form_1 (struct loop *loop, gcond **loop_cond,
1160 tree *assumptions, tree *number_of_iterationsm1,
1161 tree *number_of_iterations, gcond **inner_loop_cond)
1163 DUMP_VECT_SCOPE ("vect_analyze_loop_form");
1165 /* Different restrictions apply when we are considering an inner-most loop,
1166 vs. an outer (nested) loop.
1167 (FORNOW. May want to relax some of these restrictions in the future). */
1169 if (!loop->inner)
1171 /* Inner-most loop. We currently require that the number of BBs is
1172 exactly 2 (the header and latch). Vectorizable inner-most loops
1173 look like this:
1175 (pre-header)
1177 header <--------+
1178 | | |
1179 | +--> latch --+
1181 (exit-bb) */
1183 if (loop->num_nodes != 2)
1185 if (dump_enabled_p ())
1186 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1187 "not vectorized: control flow in loop.\n");
1188 return false;
1191 if (empty_block_p (loop->header))
1193 if (dump_enabled_p ())
1194 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1195 "not vectorized: empty loop.\n");
1196 return false;
1199 else
1201 struct loop *innerloop = loop->inner;
1202 edge entryedge;
1204 /* Nested loop. We currently require that the loop is doubly-nested,
1205 contains a single inner loop, and the number of BBs is exactly 5.
1206 Vectorizable outer-loops look like this:
1208 (pre-header)
1210 header <---+
1212 inner-loop |
1214 tail ------+
1216 (exit-bb)
1218 The inner-loop has the properties expected of inner-most loops
1219 as described above. */
1221 if ((loop->inner)->inner || (loop->inner)->next)
1223 if (dump_enabled_p ())
1224 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1225 "not vectorized: multiple nested loops.\n");
1226 return false;
1229 if (loop->num_nodes != 5)
1231 if (dump_enabled_p ())
1232 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1233 "not vectorized: control flow in loop.\n");
1234 return false;
1237 entryedge = loop_preheader_edge (innerloop);
1238 if (entryedge->src != loop->header
1239 || !single_exit (innerloop)
1240 || single_exit (innerloop)->dest != EDGE_PRED (loop->latch, 0)->src)
1242 if (dump_enabled_p ())
1243 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1244 "not vectorized: unsupported outerloop form.\n");
1245 return false;
1248 /* Analyze the inner-loop. */
1249 tree inner_niterm1, inner_niter, inner_assumptions;
1250 if (! vect_analyze_loop_form_1 (loop->inner, inner_loop_cond,
1251 &inner_assumptions, &inner_niterm1,
1252 &inner_niter, NULL)
1253 /* Don't support analyzing niter under assumptions for inner
1254 loop. */
1255 || !integer_onep (inner_assumptions))
1257 if (dump_enabled_p ())
1258 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1259 "not vectorized: Bad inner loop.\n");
1260 return false;
1263 if (!expr_invariant_in_loop_p (loop, inner_niter))
1265 if (dump_enabled_p ())
1266 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1267 "not vectorized: inner-loop count not"
1268 " invariant.\n");
1269 return false;
1272 if (dump_enabled_p ())
1273 dump_printf_loc (MSG_NOTE, vect_location,
1274 "Considering outer-loop vectorization.\n");
1277 if (!single_exit (loop)
1278 || EDGE_COUNT (loop->header->preds) != 2)
1280 if (dump_enabled_p ())
1282 if (!single_exit (loop))
1283 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1284 "not vectorized: multiple exits.\n");
1285 else if (EDGE_COUNT (loop->header->preds) != 2)
1286 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1287 "not vectorized: too many incoming edges.\n");
1289 return false;
1292 /* We assume that the loop exit condition is at the end of the loop. i.e,
1293 that the loop is represented as a do-while (with a proper if-guard
1294 before the loop if needed), where the loop header contains all the
1295 executable statements, and the latch is empty. */
1296 if (!empty_block_p (loop->latch)
1297 || !gimple_seq_empty_p (phi_nodes (loop->latch)))
1299 if (dump_enabled_p ())
1300 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1301 "not vectorized: latch block not empty.\n");
1302 return false;
1305 /* Make sure the exit is not abnormal. */
1306 edge e = single_exit (loop);
1307 if (e->flags & EDGE_ABNORMAL)
1309 if (dump_enabled_p ())
1310 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1311 "not vectorized: abnormal loop exit edge.\n");
1312 return false;
1315 *loop_cond = vect_get_loop_niters (loop, assumptions, number_of_iterations,
1316 number_of_iterationsm1);
1317 if (!*loop_cond)
1319 if (dump_enabled_p ())
1320 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1321 "not vectorized: complicated exit condition.\n");
1322 return false;
1325 if (integer_zerop (*assumptions)
1326 || !*number_of_iterations
1327 || chrec_contains_undetermined (*number_of_iterations))
1329 if (dump_enabled_p ())
1330 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1331 "not vectorized: number of iterations cannot be "
1332 "computed.\n");
1333 return false;
1336 if (integer_zerop (*number_of_iterations))
1338 if (dump_enabled_p ())
1339 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1340 "not vectorized: number of iterations = 0.\n");
1341 return false;
1344 return true;
1347 /* Analyze LOOP form and return a loop_vec_info if it is of suitable form. */
1349 loop_vec_info
1350 vect_analyze_loop_form (struct loop *loop, vec_info_shared *shared)
1352 tree assumptions, number_of_iterations, number_of_iterationsm1;
1353 gcond *loop_cond, *inner_loop_cond = NULL;
1355 if (! vect_analyze_loop_form_1 (loop, &loop_cond,
1356 &assumptions, &number_of_iterationsm1,
1357 &number_of_iterations, &inner_loop_cond))
1358 return NULL;
1360 loop_vec_info loop_vinfo = new _loop_vec_info (loop, shared);
1361 LOOP_VINFO_NITERSM1 (loop_vinfo) = number_of_iterationsm1;
1362 LOOP_VINFO_NITERS (loop_vinfo) = number_of_iterations;
1363 LOOP_VINFO_NITERS_UNCHANGED (loop_vinfo) = number_of_iterations;
1364 if (!integer_onep (assumptions))
1366 /* We consider to vectorize this loop by versioning it under
1367 some assumptions. In order to do this, we need to clear
1368 existing information computed by scev and niter analyzer. */
1369 scev_reset_htab ();
1370 free_numbers_of_iterations_estimates (loop);
1371 /* Also set flag for this loop so that following scev and niter
1372 analysis are done under the assumptions. */
1373 loop_constraint_set (loop, LOOP_C_FINITE);
1374 /* Also record the assumptions for versioning. */
1375 LOOP_VINFO_NITERS_ASSUMPTIONS (loop_vinfo) = assumptions;
1378 if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
1380 if (dump_enabled_p ())
1382 dump_printf_loc (MSG_NOTE, vect_location,
1383 "Symbolic number of iterations is ");
1384 dump_generic_expr (MSG_NOTE, TDF_DETAILS, number_of_iterations);
1385 dump_printf (MSG_NOTE, "\n");
1389 stmt_vec_info loop_cond_info = loop_vinfo->lookup_stmt (loop_cond);
1390 STMT_VINFO_TYPE (loop_cond_info) = loop_exit_ctrl_vec_info_type;
1391 if (inner_loop_cond)
1393 stmt_vec_info inner_loop_cond_info
1394 = loop_vinfo->lookup_stmt (inner_loop_cond);
1395 STMT_VINFO_TYPE (inner_loop_cond_info) = loop_exit_ctrl_vec_info_type;
1398 gcc_assert (!loop->aux);
1399 loop->aux = loop_vinfo;
1400 return loop_vinfo;
1405 /* Scan the loop stmts and dependent on whether there are any (non-)SLP
1406 statements update the vectorization factor. */
1408 static void
1409 vect_update_vf_for_slp (loop_vec_info loop_vinfo)
1411 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1412 basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
1413 int nbbs = loop->num_nodes;
1414 poly_uint64 vectorization_factor;
1415 int i;
1417 DUMP_VECT_SCOPE ("vect_update_vf_for_slp");
1419 vectorization_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
1420 gcc_assert (known_ne (vectorization_factor, 0U));
1422 /* If all the stmts in the loop can be SLPed, we perform only SLP, and
1423 vectorization factor of the loop is the unrolling factor required by
1424 the SLP instances. If that unrolling factor is 1, we say, that we
1425 perform pure SLP on loop - cross iteration parallelism is not
1426 exploited. */
1427 bool only_slp_in_loop = true;
1428 for (i = 0; i < nbbs; i++)
1430 basic_block bb = bbs[i];
1431 for (gimple_stmt_iterator si = gsi_start_bb (bb); !gsi_end_p (si);
1432 gsi_next (&si))
1434 stmt_vec_info stmt_info = loop_vinfo->lookup_stmt (gsi_stmt (si));
1435 if (STMT_VINFO_IN_PATTERN_P (stmt_info)
1436 && STMT_VINFO_RELATED_STMT (stmt_info))
1437 stmt_info = STMT_VINFO_RELATED_STMT (stmt_info);
1438 if ((STMT_VINFO_RELEVANT_P (stmt_info)
1439 || VECTORIZABLE_CYCLE_DEF (STMT_VINFO_DEF_TYPE (stmt_info)))
1440 && !PURE_SLP_STMT (stmt_info))
1441 /* STMT needs both SLP and loop-based vectorization. */
1442 only_slp_in_loop = false;
1446 if (only_slp_in_loop)
1448 dump_printf_loc (MSG_NOTE, vect_location,
1449 "Loop contains only SLP stmts\n");
1450 vectorization_factor = LOOP_VINFO_SLP_UNROLLING_FACTOR (loop_vinfo);
1452 else
1454 dump_printf_loc (MSG_NOTE, vect_location,
1455 "Loop contains SLP and non-SLP stmts\n");
1456 /* Both the vectorization factor and unroll factor have the form
1457 current_vector_size * X for some rational X, so they must have
1458 a common multiple. */
1459 vectorization_factor
1460 = force_common_multiple (vectorization_factor,
1461 LOOP_VINFO_SLP_UNROLLING_FACTOR (loop_vinfo));
1464 LOOP_VINFO_VECT_FACTOR (loop_vinfo) = vectorization_factor;
1465 if (dump_enabled_p ())
1467 dump_printf_loc (MSG_NOTE, vect_location,
1468 "Updating vectorization factor to ");
1469 dump_dec (MSG_NOTE, vectorization_factor);
1470 dump_printf (MSG_NOTE, ".\n");
1474 /* Return true if STMT_INFO describes a double reduction phi and if
1475 the other phi in the reduction is also relevant for vectorization.
1476 This rejects cases such as:
1478 outer1:
1479 x_1 = PHI <x_3(outer2), ...>;
1482 inner:
1483 x_2 = ...;
1486 outer2:
1487 x_3 = PHI <x_2(inner)>;
1489 if nothing in x_2 or elsewhere makes x_1 relevant. */
1491 static bool
1492 vect_active_double_reduction_p (stmt_vec_info stmt_info)
1494 if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_double_reduction_def)
1495 return false;
1497 return STMT_VINFO_RELEVANT_P (STMT_VINFO_REDUC_DEF (stmt_info));
1500 /* Function vect_analyze_loop_operations.
1502 Scan the loop stmts and make sure they are all vectorizable. */
1504 static bool
1505 vect_analyze_loop_operations (loop_vec_info loop_vinfo)
1507 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1508 basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
1509 int nbbs = loop->num_nodes;
1510 int i;
1511 stmt_vec_info stmt_info;
1512 bool need_to_vectorize = false;
1513 bool ok;
1515 DUMP_VECT_SCOPE ("vect_analyze_loop_operations");
1517 stmt_vector_for_cost cost_vec;
1518 cost_vec.create (2);
1520 for (i = 0; i < nbbs; i++)
1522 basic_block bb = bbs[i];
1524 for (gphi_iterator si = gsi_start_phis (bb); !gsi_end_p (si);
1525 gsi_next (&si))
1527 gphi *phi = si.phi ();
1528 ok = true;
1530 stmt_info = loop_vinfo->lookup_stmt (phi);
1531 if (dump_enabled_p ())
1533 dump_printf_loc (MSG_NOTE, vect_location, "examining phi: ");
1534 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, phi, 0);
1536 if (virtual_operand_p (gimple_phi_result (phi)))
1537 continue;
1539 /* Inner-loop loop-closed exit phi in outer-loop vectorization
1540 (i.e., a phi in the tail of the outer-loop). */
1541 if (! is_loop_header_bb_p (bb))
1543 /* FORNOW: we currently don't support the case that these phis
1544 are not used in the outerloop (unless it is double reduction,
1545 i.e., this phi is vect_reduction_def), cause this case
1546 requires to actually do something here. */
1547 if (STMT_VINFO_LIVE_P (stmt_info)
1548 && !vect_active_double_reduction_p (stmt_info))
1550 if (dump_enabled_p ())
1551 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1552 "Unsupported loop-closed phi in "
1553 "outer-loop.\n");
1554 return false;
1557 /* If PHI is used in the outer loop, we check that its operand
1558 is defined in the inner loop. */
1559 if (STMT_VINFO_RELEVANT_P (stmt_info))
1561 tree phi_op;
1563 if (gimple_phi_num_args (phi) != 1)
1564 return false;
1566 phi_op = PHI_ARG_DEF (phi, 0);
1567 stmt_vec_info op_def_info = loop_vinfo->lookup_def (phi_op);
1568 if (!op_def_info)
1569 return false;
1571 if (STMT_VINFO_RELEVANT (op_def_info) != vect_used_in_outer
1572 && (STMT_VINFO_RELEVANT (op_def_info)
1573 != vect_used_in_outer_by_reduction))
1574 return false;
1577 continue;
1580 gcc_assert (stmt_info);
1582 if ((STMT_VINFO_RELEVANT (stmt_info) == vect_used_in_scope
1583 || STMT_VINFO_LIVE_P (stmt_info))
1584 && STMT_VINFO_DEF_TYPE (stmt_info) != vect_induction_def)
1586 /* A scalar-dependence cycle that we don't support. */
1587 if (dump_enabled_p ())
1588 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1589 "not vectorized: scalar dependence cycle.\n");
1590 return false;
1593 if (STMT_VINFO_RELEVANT_P (stmt_info))
1595 need_to_vectorize = true;
1596 if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_induction_def
1597 && ! PURE_SLP_STMT (stmt_info))
1598 ok = vectorizable_induction (phi, NULL, NULL, NULL, &cost_vec);
1599 else if ((STMT_VINFO_DEF_TYPE (stmt_info) == vect_reduction_def
1600 || STMT_VINFO_DEF_TYPE (stmt_info) == vect_nested_cycle)
1601 && ! PURE_SLP_STMT (stmt_info))
1602 ok = vectorizable_reduction (phi, NULL, NULL, NULL, NULL,
1603 &cost_vec);
1606 /* SLP PHIs are tested by vect_slp_analyze_node_operations. */
1607 if (ok
1608 && STMT_VINFO_LIVE_P (stmt_info)
1609 && !PURE_SLP_STMT (stmt_info))
1610 ok = vectorizable_live_operation (phi, NULL, NULL, -1, NULL,
1611 &cost_vec);
1613 if (!ok)
1615 if (dump_enabled_p ())
1617 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1618 "not vectorized: relevant phi not "
1619 "supported: ");
1620 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, phi, 0);
1622 return false;
1626 for (gimple_stmt_iterator si = gsi_start_bb (bb); !gsi_end_p (si);
1627 gsi_next (&si))
1629 gimple *stmt = gsi_stmt (si);
1630 if (!gimple_clobber_p (stmt)
1631 && !vect_analyze_stmt (stmt, &need_to_vectorize, NULL, NULL,
1632 &cost_vec))
1633 return false;
1635 } /* bbs */
1637 add_stmt_costs (loop_vinfo->target_cost_data, &cost_vec);
1638 cost_vec.release ();
1640 /* All operations in the loop are either irrelevant (deal with loop
1641 control, or dead), or only used outside the loop and can be moved
1642 out of the loop (e.g. invariants, inductions). The loop can be
1643 optimized away by scalar optimizations. We're better off not
1644 touching this loop. */
1645 if (!need_to_vectorize)
1647 if (dump_enabled_p ())
1648 dump_printf_loc (MSG_NOTE, vect_location,
1649 "All the computation can be taken out of the loop.\n");
1650 if (dump_enabled_p ())
1651 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1652 "not vectorized: redundant loop. no profit to "
1653 "vectorize.\n");
1654 return false;
1657 return true;
1660 /* Analyze the cost of the loop described by LOOP_VINFO. Decide if it
1661 is worthwhile to vectorize. Return 1 if definitely yes, 0 if
1662 definitely no, or -1 if it's worth retrying. */
1664 static int
1665 vect_analyze_loop_costing (loop_vec_info loop_vinfo)
1667 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1668 unsigned int assumed_vf = vect_vf_for_cost (loop_vinfo);
1670 /* Only fully-masked loops can have iteration counts less than the
1671 vectorization factor. */
1672 if (!LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
1674 HOST_WIDE_INT max_niter;
1676 if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
1677 max_niter = LOOP_VINFO_INT_NITERS (loop_vinfo);
1678 else
1679 max_niter = max_stmt_executions_int (loop);
1681 if (max_niter != -1
1682 && (unsigned HOST_WIDE_INT) max_niter < assumed_vf)
1684 if (dump_enabled_p ())
1685 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1686 "not vectorized: iteration count smaller than "
1687 "vectorization factor.\n");
1688 return 0;
1692 int min_profitable_iters, min_profitable_estimate;
1693 vect_estimate_min_profitable_iters (loop_vinfo, &min_profitable_iters,
1694 &min_profitable_estimate);
1696 if (min_profitable_iters < 0)
1698 if (dump_enabled_p ())
1699 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1700 "not vectorized: vectorization not profitable.\n");
1701 if (dump_enabled_p ())
1702 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1703 "not vectorized: vector version will never be "
1704 "profitable.\n");
1705 return -1;
1708 int min_scalar_loop_bound = (PARAM_VALUE (PARAM_MIN_VECT_LOOP_BOUND)
1709 * assumed_vf);
1711 /* Use the cost model only if it is more conservative than user specified
1712 threshold. */
1713 unsigned int th = (unsigned) MAX (min_scalar_loop_bound,
1714 min_profitable_iters);
1716 LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo) = th;
1718 if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
1719 && LOOP_VINFO_INT_NITERS (loop_vinfo) < th)
1721 if (dump_enabled_p ())
1722 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1723 "not vectorized: vectorization not profitable.\n");
1724 if (dump_enabled_p ())
1725 dump_printf_loc (MSG_NOTE, vect_location,
1726 "not vectorized: iteration count smaller than user "
1727 "specified loop bound parameter or minimum profitable "
1728 "iterations (whichever is more conservative).\n");
1729 return 0;
1732 HOST_WIDE_INT estimated_niter = estimated_stmt_executions_int (loop);
1733 if (estimated_niter == -1)
1734 estimated_niter = likely_max_stmt_executions_int (loop);
1735 if (estimated_niter != -1
1736 && ((unsigned HOST_WIDE_INT) estimated_niter
1737 < MAX (th, (unsigned) min_profitable_estimate)))
1739 if (dump_enabled_p ())
1740 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1741 "not vectorized: estimated iteration count too "
1742 "small.\n");
1743 if (dump_enabled_p ())
1744 dump_printf_loc (MSG_NOTE, vect_location,
1745 "not vectorized: estimated iteration count smaller "
1746 "than specified loop bound parameter or minimum "
1747 "profitable iterations (whichever is more "
1748 "conservative).\n");
1749 return -1;
1752 return 1;
1755 static bool
1756 vect_get_datarefs_in_loop (loop_p loop, basic_block *bbs,
1757 vec<data_reference_p> *datarefs,
1758 unsigned int *n_stmts)
1760 *n_stmts = 0;
1761 for (unsigned i = 0; i < loop->num_nodes; i++)
1762 for (gimple_stmt_iterator gsi = gsi_start_bb (bbs[i]);
1763 !gsi_end_p (gsi); gsi_next (&gsi))
1765 gimple *stmt = gsi_stmt (gsi);
1766 if (is_gimple_debug (stmt))
1767 continue;
1768 ++(*n_stmts);
1769 if (!vect_find_stmt_data_reference (loop, stmt, datarefs))
1771 if (is_gimple_call (stmt) && loop->safelen)
1773 tree fndecl = gimple_call_fndecl (stmt), op;
1774 if (fndecl != NULL_TREE)
1776 cgraph_node *node = cgraph_node::get (fndecl);
1777 if (node != NULL && node->simd_clones != NULL)
1779 unsigned int j, n = gimple_call_num_args (stmt);
1780 for (j = 0; j < n; j++)
1782 op = gimple_call_arg (stmt, j);
1783 if (DECL_P (op)
1784 || (REFERENCE_CLASS_P (op)
1785 && get_base_address (op)))
1786 break;
1788 op = gimple_call_lhs (stmt);
1789 /* Ignore #pragma omp declare simd functions
1790 if they don't have data references in the
1791 call stmt itself. */
1792 if (j == n
1793 && !(op
1794 && (DECL_P (op)
1795 || (REFERENCE_CLASS_P (op)
1796 && get_base_address (op)))))
1797 continue;
1801 return false;
1803 /* If dependence analysis will give up due to the limit on the
1804 number of datarefs stop here and fail fatally. */
1805 if (datarefs->length ()
1806 > (unsigned)PARAM_VALUE (PARAM_LOOP_MAX_DATAREFS_FOR_DATADEPS))
1807 return false;
1809 return true;
1812 /* Function vect_analyze_loop_2.
1814 Apply a set of analyses on LOOP, and create a loop_vec_info struct
1815 for it. The different analyses will record information in the
1816 loop_vec_info struct. */
1817 static bool
1818 vect_analyze_loop_2 (loop_vec_info loop_vinfo, bool &fatal, unsigned *n_stmts)
1820 bool ok;
1821 int res;
1822 unsigned int max_vf = MAX_VECTORIZATION_FACTOR;
1823 poly_uint64 min_vf = 2;
1825 /* The first group of checks is independent of the vector size. */
1826 fatal = true;
1828 /* Find all data references in the loop (which correspond to vdefs/vuses)
1829 and analyze their evolution in the loop. */
1831 loop_p loop = LOOP_VINFO_LOOP (loop_vinfo);
1833 /* Gather the data references and count stmts in the loop. */
1834 if (!LOOP_VINFO_DATAREFS (loop_vinfo).exists ())
1836 if (!vect_get_datarefs_in_loop (loop, LOOP_VINFO_BBS (loop_vinfo),
1837 &LOOP_VINFO_DATAREFS (loop_vinfo),
1838 n_stmts))
1840 if (dump_enabled_p ())
1841 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1842 "not vectorized: loop contains function "
1843 "calls or data references that cannot "
1844 "be analyzed\n");
1845 return false;
1847 loop_vinfo->shared->save_datarefs ();
1849 else
1850 loop_vinfo->shared->check_datarefs ();
1852 /* Analyze the data references and also adjust the minimal
1853 vectorization factor according to the loads and stores. */
1855 ok = vect_analyze_data_refs (loop_vinfo, &min_vf);
1856 if (!ok)
1858 if (dump_enabled_p ())
1859 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1860 "bad data references.\n");
1861 return false;
1864 /* Classify all cross-iteration scalar data-flow cycles.
1865 Cross-iteration cycles caused by virtual phis are analyzed separately. */
1866 vect_analyze_scalar_cycles (loop_vinfo);
1868 vect_pattern_recog (loop_vinfo);
1870 vect_fixup_scalar_cycles_with_patterns (loop_vinfo);
1872 /* Analyze the access patterns of the data-refs in the loop (consecutive,
1873 complex, etc.). FORNOW: Only handle consecutive access pattern. */
1875 ok = vect_analyze_data_ref_accesses (loop_vinfo);
1876 if (!ok)
1878 if (dump_enabled_p ())
1879 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1880 "bad data access.\n");
1881 return false;
1884 /* Data-flow analysis to detect stmts that do not need to be vectorized. */
1886 ok = vect_mark_stmts_to_be_vectorized (loop_vinfo);
1887 if (!ok)
1889 if (dump_enabled_p ())
1890 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1891 "unexpected pattern.\n");
1892 return false;
1895 /* While the rest of the analysis below depends on it in some way. */
1896 fatal = false;
1898 /* Analyze data dependences between the data-refs in the loop
1899 and adjust the maximum vectorization factor according to
1900 the dependences.
1901 FORNOW: fail at the first data dependence that we encounter. */
1903 ok = vect_analyze_data_ref_dependences (loop_vinfo, &max_vf);
1904 if (!ok
1905 || (max_vf != MAX_VECTORIZATION_FACTOR
1906 && maybe_lt (max_vf, min_vf)))
1908 if (dump_enabled_p ())
1909 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1910 "bad data dependence.\n");
1911 return false;
1913 LOOP_VINFO_MAX_VECT_FACTOR (loop_vinfo) = max_vf;
1915 ok = vect_determine_vectorization_factor (loop_vinfo);
1916 if (!ok)
1918 if (dump_enabled_p ())
1919 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1920 "can't determine vectorization factor.\n");
1921 return false;
1923 if (max_vf != MAX_VECTORIZATION_FACTOR
1924 && maybe_lt (max_vf, LOOP_VINFO_VECT_FACTOR (loop_vinfo)))
1926 if (dump_enabled_p ())
1927 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1928 "bad data dependence.\n");
1929 return false;
1932 /* Compute the scalar iteration cost. */
1933 vect_compute_single_scalar_iteration_cost (loop_vinfo);
1935 poly_uint64 saved_vectorization_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
1936 unsigned th;
1938 /* Check the SLP opportunities in the loop, analyze and build SLP trees. */
1939 ok = vect_analyze_slp (loop_vinfo, *n_stmts);
1940 if (!ok)
1941 return false;
1943 /* If there are any SLP instances mark them as pure_slp. */
1944 bool slp = vect_make_slp_decision (loop_vinfo);
1945 if (slp)
1947 /* Find stmts that need to be both vectorized and SLPed. */
1948 vect_detect_hybrid_slp (loop_vinfo);
1950 /* Update the vectorization factor based on the SLP decision. */
1951 vect_update_vf_for_slp (loop_vinfo);
1954 bool saved_can_fully_mask_p = LOOP_VINFO_CAN_FULLY_MASK_P (loop_vinfo);
1956 /* We don't expect to have to roll back to anything other than an empty
1957 set of rgroups. */
1958 gcc_assert (LOOP_VINFO_MASKS (loop_vinfo).is_empty ());
1960 /* This is the point where we can re-start analysis with SLP forced off. */
1961 start_over:
1963 /* Now the vectorization factor is final. */
1964 poly_uint64 vectorization_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
1965 gcc_assert (known_ne (vectorization_factor, 0U));
1967 if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo) && dump_enabled_p ())
1969 dump_printf_loc (MSG_NOTE, vect_location,
1970 "vectorization_factor = ");
1971 dump_dec (MSG_NOTE, vectorization_factor);
1972 dump_printf (MSG_NOTE, ", niters = " HOST_WIDE_INT_PRINT_DEC "\n",
1973 LOOP_VINFO_INT_NITERS (loop_vinfo));
1976 HOST_WIDE_INT max_niter
1977 = likely_max_stmt_executions_int (LOOP_VINFO_LOOP (loop_vinfo));
1979 /* Analyze the alignment of the data-refs in the loop.
1980 Fail if a data reference is found that cannot be vectorized. */
1982 ok = vect_analyze_data_refs_alignment (loop_vinfo);
1983 if (!ok)
1985 if (dump_enabled_p ())
1986 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1987 "bad data alignment.\n");
1988 return false;
1991 /* Prune the list of ddrs to be tested at run-time by versioning for alias.
1992 It is important to call pruning after vect_analyze_data_ref_accesses,
1993 since we use grouping information gathered by interleaving analysis. */
1994 ok = vect_prune_runtime_alias_test_list (loop_vinfo);
1995 if (!ok)
1996 return false;
1998 /* Do not invoke vect_enhance_data_refs_alignment for eplilogue
1999 vectorization. */
2000 if (!LOOP_VINFO_EPILOGUE_P (loop_vinfo))
2002 /* This pass will decide on using loop versioning and/or loop peeling in
2003 order to enhance the alignment of data references in the loop. */
2004 ok = vect_enhance_data_refs_alignment (loop_vinfo);
2005 if (!ok)
2007 if (dump_enabled_p ())
2008 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2009 "bad data alignment.\n");
2010 return false;
2014 if (slp)
2016 /* Analyze operations in the SLP instances. Note this may
2017 remove unsupported SLP instances which makes the above
2018 SLP kind detection invalid. */
2019 unsigned old_size = LOOP_VINFO_SLP_INSTANCES (loop_vinfo).length ();
2020 vect_slp_analyze_operations (loop_vinfo);
2021 if (LOOP_VINFO_SLP_INSTANCES (loop_vinfo).length () != old_size)
2022 goto again;
2025 /* Scan all the remaining operations in the loop that are not subject
2026 to SLP and make sure they are vectorizable. */
2027 ok = vect_analyze_loop_operations (loop_vinfo);
2028 if (!ok)
2030 if (dump_enabled_p ())
2031 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2032 "bad operation or unsupported loop bound.\n");
2033 return false;
2036 /* Decide whether to use a fully-masked loop for this vectorization
2037 factor. */
2038 LOOP_VINFO_FULLY_MASKED_P (loop_vinfo)
2039 = (LOOP_VINFO_CAN_FULLY_MASK_P (loop_vinfo)
2040 && vect_verify_full_masking (loop_vinfo));
2041 if (dump_enabled_p ())
2043 if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
2044 dump_printf_loc (MSG_NOTE, vect_location,
2045 "using a fully-masked loop.\n");
2046 else
2047 dump_printf_loc (MSG_NOTE, vect_location,
2048 "not using a fully-masked loop.\n");
2051 /* If epilog loop is required because of data accesses with gaps,
2052 one additional iteration needs to be peeled. Check if there is
2053 enough iterations for vectorization. */
2054 if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo)
2055 && LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
2056 && !LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
2058 poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
2059 tree scalar_niters = LOOP_VINFO_NITERSM1 (loop_vinfo);
2061 if (known_lt (wi::to_widest (scalar_niters), vf))
2063 if (dump_enabled_p ())
2064 dump_printf_loc (MSG_NOTE, vect_location,
2065 "loop has no enough iterations to support"
2066 " peeling for gaps.\n");
2067 return false;
2071 /* Check the costings of the loop make vectorizing worthwhile. */
2072 res = vect_analyze_loop_costing (loop_vinfo);
2073 if (res < 0)
2074 goto again;
2075 if (!res)
2077 if (dump_enabled_p ())
2078 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2079 "Loop costings not worthwhile.\n");
2080 return false;
2083 /* Decide whether we need to create an epilogue loop to handle
2084 remaining scalar iterations. */
2085 th = LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo);
2087 unsigned HOST_WIDE_INT const_vf;
2088 if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
2089 /* The main loop handles all iterations. */
2090 LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo) = false;
2091 else if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
2092 && LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) > 0)
2094 if (!multiple_p (LOOP_VINFO_INT_NITERS (loop_vinfo)
2095 - LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo),
2096 LOOP_VINFO_VECT_FACTOR (loop_vinfo)))
2097 LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo) = true;
2099 else if (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo)
2100 || !LOOP_VINFO_VECT_FACTOR (loop_vinfo).is_constant (&const_vf)
2101 || ((tree_ctz (LOOP_VINFO_NITERS (loop_vinfo))
2102 < (unsigned) exact_log2 (const_vf))
2103 /* In case of versioning, check if the maximum number of
2104 iterations is greater than th. If they are identical,
2105 the epilogue is unnecessary. */
2106 && (!LOOP_REQUIRES_VERSIONING (loop_vinfo)
2107 || ((unsigned HOST_WIDE_INT) max_niter
2108 > (th / const_vf) * const_vf))))
2109 LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo) = true;
2111 /* If an epilogue loop is required make sure we can create one. */
2112 if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo)
2113 || LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo))
2115 if (dump_enabled_p ())
2116 dump_printf_loc (MSG_NOTE, vect_location, "epilog loop required\n");
2117 if (!vect_can_advance_ivs_p (loop_vinfo)
2118 || !slpeel_can_duplicate_loop_p (LOOP_VINFO_LOOP (loop_vinfo),
2119 single_exit (LOOP_VINFO_LOOP
2120 (loop_vinfo))))
2122 if (dump_enabled_p ())
2123 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2124 "not vectorized: can't create required "
2125 "epilog loop\n");
2126 goto again;
2130 /* During peeling, we need to check if number of loop iterations is
2131 enough for both peeled prolog loop and vector loop. This check
2132 can be merged along with threshold check of loop versioning, so
2133 increase threshold for this case if necessary. */
2134 if (LOOP_REQUIRES_VERSIONING (loop_vinfo))
2136 poly_uint64 niters_th = 0;
2138 if (!vect_use_loop_mask_for_alignment_p (loop_vinfo))
2140 /* Niters for peeled prolog loop. */
2141 if (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) < 0)
2143 struct data_reference *dr = LOOP_VINFO_UNALIGNED_DR (loop_vinfo);
2144 tree vectype = STMT_VINFO_VECTYPE (vect_dr_stmt (dr));
2145 niters_th += TYPE_VECTOR_SUBPARTS (vectype) - 1;
2147 else
2148 niters_th += LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo);
2151 /* Niters for at least one iteration of vectorized loop. */
2152 if (!LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
2153 niters_th += LOOP_VINFO_VECT_FACTOR (loop_vinfo);
2154 /* One additional iteration because of peeling for gap. */
2155 if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo))
2156 niters_th += 1;
2157 LOOP_VINFO_VERSIONING_THRESHOLD (loop_vinfo) = niters_th;
2160 gcc_assert (known_eq (vectorization_factor,
2161 LOOP_VINFO_VECT_FACTOR (loop_vinfo)));
2163 /* Ok to vectorize! */
2164 return true;
2166 again:
2167 /* Try again with SLP forced off but if we didn't do any SLP there is
2168 no point in re-trying. */
2169 if (!slp)
2170 return false;
2172 /* If there are reduction chains re-trying will fail anyway. */
2173 if (! LOOP_VINFO_REDUCTION_CHAINS (loop_vinfo).is_empty ())
2174 return false;
2176 /* Likewise if the grouped loads or stores in the SLP cannot be handled
2177 via interleaving or lane instructions. */
2178 slp_instance instance;
2179 slp_tree node;
2180 unsigned i, j;
2181 FOR_EACH_VEC_ELT (LOOP_VINFO_SLP_INSTANCES (loop_vinfo), i, instance)
2183 stmt_vec_info vinfo;
2184 vinfo = SLP_TREE_SCALAR_STMTS (SLP_INSTANCE_TREE (instance))[0];
2185 if (! STMT_VINFO_GROUPED_ACCESS (vinfo))
2186 continue;
2187 vinfo = DR_GROUP_FIRST_ELEMENT (vinfo);
2188 unsigned int size = DR_GROUP_SIZE (vinfo);
2189 tree vectype = STMT_VINFO_VECTYPE (vinfo);
2190 if (! vect_store_lanes_supported (vectype, size, false)
2191 && ! known_eq (TYPE_VECTOR_SUBPARTS (vectype), 1U)
2192 && ! vect_grouped_store_supported (vectype, size))
2193 return false;
2194 FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (instance), j, node)
2196 vinfo = SLP_TREE_SCALAR_STMTS (node)[0];
2197 vinfo = DR_GROUP_FIRST_ELEMENT (vinfo);
2198 bool single_element_p = !DR_GROUP_NEXT_ELEMENT (vinfo);
2199 size = DR_GROUP_SIZE (vinfo);
2200 vectype = STMT_VINFO_VECTYPE (vinfo);
2201 if (! vect_load_lanes_supported (vectype, size, false)
2202 && ! vect_grouped_load_supported (vectype, single_element_p,
2203 size))
2204 return false;
2208 if (dump_enabled_p ())
2209 dump_printf_loc (MSG_NOTE, vect_location,
2210 "re-trying with SLP disabled\n");
2212 /* Roll back state appropriately. No SLP this time. */
2213 slp = false;
2214 /* Restore vectorization factor as it were without SLP. */
2215 LOOP_VINFO_VECT_FACTOR (loop_vinfo) = saved_vectorization_factor;
2216 /* Free the SLP instances. */
2217 FOR_EACH_VEC_ELT (LOOP_VINFO_SLP_INSTANCES (loop_vinfo), j, instance)
2218 vect_free_slp_instance (instance, false);
2219 LOOP_VINFO_SLP_INSTANCES (loop_vinfo).release ();
2220 /* Reset SLP type to loop_vect on all stmts. */
2221 for (i = 0; i < LOOP_VINFO_LOOP (loop_vinfo)->num_nodes; ++i)
2223 basic_block bb = LOOP_VINFO_BBS (loop_vinfo)[i];
2224 for (gimple_stmt_iterator si = gsi_start_phis (bb);
2225 !gsi_end_p (si); gsi_next (&si))
2227 stmt_vec_info stmt_info = loop_vinfo->lookup_stmt (gsi_stmt (si));
2228 STMT_SLP_TYPE (stmt_info) = loop_vect;
2230 for (gimple_stmt_iterator si = gsi_start_bb (bb);
2231 !gsi_end_p (si); gsi_next (&si))
2233 stmt_vec_info stmt_info = loop_vinfo->lookup_stmt (gsi_stmt (si));
2234 STMT_SLP_TYPE (stmt_info) = loop_vect;
2235 if (STMT_VINFO_IN_PATTERN_P (stmt_info))
2237 gimple *pattern_def_seq = STMT_VINFO_PATTERN_DEF_SEQ (stmt_info);
2238 stmt_info = STMT_VINFO_RELATED_STMT (stmt_info);
2239 STMT_SLP_TYPE (stmt_info) = loop_vect;
2240 for (gimple_stmt_iterator pi = gsi_start (pattern_def_seq);
2241 !gsi_end_p (pi); gsi_next (&pi))
2242 STMT_SLP_TYPE (loop_vinfo->lookup_stmt (gsi_stmt (pi)))
2243 = loop_vect;
2247 /* Free optimized alias test DDRS. */
2248 LOOP_VINFO_LOWER_BOUNDS (loop_vinfo).truncate (0);
2249 LOOP_VINFO_COMP_ALIAS_DDRS (loop_vinfo).release ();
2250 LOOP_VINFO_CHECK_UNEQUAL_ADDRS (loop_vinfo).release ();
2251 /* Reset target cost data. */
2252 destroy_cost_data (LOOP_VINFO_TARGET_COST_DATA (loop_vinfo));
2253 LOOP_VINFO_TARGET_COST_DATA (loop_vinfo)
2254 = init_cost (LOOP_VINFO_LOOP (loop_vinfo));
2255 /* Reset accumulated rgroup information. */
2256 release_vec_loop_masks (&LOOP_VINFO_MASKS (loop_vinfo));
2257 /* Reset assorted flags. */
2258 LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo) = false;
2259 LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo) = false;
2260 LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo) = 0;
2261 LOOP_VINFO_VERSIONING_THRESHOLD (loop_vinfo) = 0;
2262 LOOP_VINFO_CAN_FULLY_MASK_P (loop_vinfo) = saved_can_fully_mask_p;
2264 goto start_over;
2267 /* Function vect_analyze_loop.
2269 Apply a set of analyses on LOOP, and create a loop_vec_info struct
2270 for it. The different analyses will record information in the
2271 loop_vec_info struct. If ORIG_LOOP_VINFO is not NULL epilogue must
2272 be vectorized. */
2273 loop_vec_info
2274 vect_analyze_loop (struct loop *loop, loop_vec_info orig_loop_vinfo,
2275 vec_info_shared *shared)
2277 loop_vec_info loop_vinfo;
2278 auto_vector_sizes vector_sizes;
2280 /* Autodetect first vector size we try. */
2281 current_vector_size = 0;
2282 targetm.vectorize.autovectorize_vector_sizes (&vector_sizes);
2283 unsigned int next_size = 0;
2285 DUMP_VECT_SCOPE ("analyze_loop_nest");
2287 if (loop_outer (loop)
2288 && loop_vec_info_for_loop (loop_outer (loop))
2289 && LOOP_VINFO_VECTORIZABLE_P (loop_vec_info_for_loop (loop_outer (loop))))
2291 if (dump_enabled_p ())
2292 dump_printf_loc (MSG_NOTE, vect_location,
2293 "outer-loop already vectorized.\n");
2294 return NULL;
2297 if (!find_loop_nest (loop, &shared->loop_nest))
2299 if (dump_enabled_p ())
2300 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2301 "not vectorized: loop nest containing two "
2302 "or more consecutive inner loops cannot be "
2303 "vectorized\n");
2304 return NULL;
2307 unsigned n_stmts = 0;
2308 poly_uint64 autodetected_vector_size = 0;
2309 while (1)
2311 /* Check the CFG characteristics of the loop (nesting, entry/exit). */
2312 loop_vinfo = vect_analyze_loop_form (loop, shared);
2313 if (!loop_vinfo)
2315 if (dump_enabled_p ())
2316 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2317 "bad loop form.\n");
2318 return NULL;
2321 bool fatal = false;
2323 if (orig_loop_vinfo)
2324 LOOP_VINFO_ORIG_LOOP_INFO (loop_vinfo) = orig_loop_vinfo;
2326 if (vect_analyze_loop_2 (loop_vinfo, fatal, &n_stmts))
2328 LOOP_VINFO_VECTORIZABLE_P (loop_vinfo) = 1;
2330 return loop_vinfo;
2333 delete loop_vinfo;
2335 if (next_size == 0)
2336 autodetected_vector_size = current_vector_size;
2338 if (next_size < vector_sizes.length ()
2339 && known_eq (vector_sizes[next_size], autodetected_vector_size))
2340 next_size += 1;
2342 if (fatal
2343 || next_size == vector_sizes.length ()
2344 || known_eq (current_vector_size, 0U))
2345 return NULL;
2347 /* Try the next biggest vector size. */
2348 current_vector_size = vector_sizes[next_size++];
2349 if (dump_enabled_p ())
2351 dump_printf_loc (MSG_NOTE, vect_location,
2352 "***** Re-trying analysis with "
2353 "vector size ");
2354 dump_dec (MSG_NOTE, current_vector_size);
2355 dump_printf (MSG_NOTE, "\n");
2360 /* Return true if there is an in-order reduction function for CODE, storing
2361 it in *REDUC_FN if so. */
2363 static bool
2364 fold_left_reduction_fn (tree_code code, internal_fn *reduc_fn)
2366 switch (code)
2368 case PLUS_EXPR:
2369 *reduc_fn = IFN_FOLD_LEFT_PLUS;
2370 return true;
2372 default:
2373 return false;
2377 /* Function reduction_fn_for_scalar_code
2379 Input:
2380 CODE - tree_code of a reduction operations.
2382 Output:
2383 REDUC_FN - the corresponding internal function to be used to reduce the
2384 vector of partial results into a single scalar result, or IFN_LAST
2385 if the operation is a supported reduction operation, but does not have
2386 such an internal function.
2388 Return FALSE if CODE currently cannot be vectorized as reduction. */
2390 static bool
2391 reduction_fn_for_scalar_code (enum tree_code code, internal_fn *reduc_fn)
2393 switch (code)
2395 case MAX_EXPR:
2396 *reduc_fn = IFN_REDUC_MAX;
2397 return true;
2399 case MIN_EXPR:
2400 *reduc_fn = IFN_REDUC_MIN;
2401 return true;
2403 case PLUS_EXPR:
2404 *reduc_fn = IFN_REDUC_PLUS;
2405 return true;
2407 case BIT_AND_EXPR:
2408 *reduc_fn = IFN_REDUC_AND;
2409 return true;
2411 case BIT_IOR_EXPR:
2412 *reduc_fn = IFN_REDUC_IOR;
2413 return true;
2415 case BIT_XOR_EXPR:
2416 *reduc_fn = IFN_REDUC_XOR;
2417 return true;
2419 case MULT_EXPR:
2420 case MINUS_EXPR:
2421 *reduc_fn = IFN_LAST;
2422 return true;
2424 default:
2425 return false;
2429 /* If there is a neutral value X such that SLP reduction NODE would not
2430 be affected by the introduction of additional X elements, return that X,
2431 otherwise return null. CODE is the code of the reduction. REDUC_CHAIN
2432 is true if the SLP statements perform a single reduction, false if each
2433 statement performs an independent reduction. */
2435 static tree
2436 neutral_op_for_slp_reduction (slp_tree slp_node, tree_code code,
2437 bool reduc_chain)
2439 vec<stmt_vec_info> stmts = SLP_TREE_SCALAR_STMTS (slp_node);
2440 stmt_vec_info stmt_vinfo = stmts[0];
2441 tree vector_type = STMT_VINFO_VECTYPE (stmt_vinfo);
2442 tree scalar_type = TREE_TYPE (vector_type);
2443 struct loop *loop = gimple_bb (stmt_vinfo->stmt)->loop_father;
2444 gcc_assert (loop);
2446 switch (code)
2448 case WIDEN_SUM_EXPR:
2449 case DOT_PROD_EXPR:
2450 case SAD_EXPR:
2451 case PLUS_EXPR:
2452 case MINUS_EXPR:
2453 case BIT_IOR_EXPR:
2454 case BIT_XOR_EXPR:
2455 return build_zero_cst (scalar_type);
2457 case MULT_EXPR:
2458 return build_one_cst (scalar_type);
2460 case BIT_AND_EXPR:
2461 return build_all_ones_cst (scalar_type);
2463 case MAX_EXPR:
2464 case MIN_EXPR:
2465 /* For MIN/MAX the initial values are neutral. A reduction chain
2466 has only a single initial value, so that value is neutral for
2467 all statements. */
2468 if (reduc_chain)
2469 return PHI_ARG_DEF_FROM_EDGE (stmt_vinfo->stmt,
2470 loop_preheader_edge (loop));
2471 return NULL_TREE;
2473 default:
2474 return NULL_TREE;
2478 /* Error reporting helper for vect_is_simple_reduction below. GIMPLE statement
2479 STMT is printed with a message MSG. */
2481 static void
2482 report_vect_op (dump_flags_t msg_type, gimple *stmt, const char *msg)
2484 dump_printf_loc (msg_type, vect_location, "%s", msg);
2485 dump_gimple_stmt (msg_type, TDF_SLIM, stmt, 0);
2488 /* DEF_STMT_INFO occurs in a loop that contains a potential reduction
2489 operation. Return true if the results of DEF_STMT_INFO are something
2490 that can be accumulated by such a reduction. */
2492 static bool
2493 vect_valid_reduction_input_p (stmt_vec_info def_stmt_info)
2495 return (is_gimple_assign (def_stmt_info->stmt)
2496 || is_gimple_call (def_stmt_info->stmt)
2497 || STMT_VINFO_DEF_TYPE (def_stmt_info) == vect_induction_def
2498 || (gimple_code (def_stmt_info->stmt) == GIMPLE_PHI
2499 && STMT_VINFO_DEF_TYPE (def_stmt_info) == vect_internal_def
2500 && !is_loop_header_bb_p (gimple_bb (def_stmt_info->stmt))));
2503 /* Detect SLP reduction of the form:
2505 #a1 = phi <a5, a0>
2506 a2 = operation (a1)
2507 a3 = operation (a2)
2508 a4 = operation (a3)
2509 a5 = operation (a4)
2511 #a = phi <a5>
2513 PHI is the reduction phi node (#a1 = phi <a5, a0> above)
2514 FIRST_STMT is the first reduction stmt in the chain
2515 (a2 = operation (a1)).
2517 Return TRUE if a reduction chain was detected. */
2519 static bool
2520 vect_is_slp_reduction (loop_vec_info loop_info, gimple *phi,
2521 gimple *first_stmt)
2523 struct loop *loop = (gimple_bb (phi))->loop_father;
2524 struct loop *vect_loop = LOOP_VINFO_LOOP (loop_info);
2525 enum tree_code code;
2526 gimple *loop_use_stmt = NULL;
2527 stmt_vec_info use_stmt_info, current_stmt_info = NULL;
2528 tree lhs;
2529 imm_use_iterator imm_iter;
2530 use_operand_p use_p;
2531 int nloop_uses, size = 0, n_out_of_loop_uses;
2532 bool found = false;
2534 if (loop != vect_loop)
2535 return false;
2537 lhs = PHI_RESULT (phi);
2538 code = gimple_assign_rhs_code (first_stmt);
2539 while (1)
2541 nloop_uses = 0;
2542 n_out_of_loop_uses = 0;
2543 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, lhs)
2545 gimple *use_stmt = USE_STMT (use_p);
2546 if (is_gimple_debug (use_stmt))
2547 continue;
2549 /* Check if we got back to the reduction phi. */
2550 if (use_stmt == phi)
2552 loop_use_stmt = use_stmt;
2553 found = true;
2554 break;
2557 if (flow_bb_inside_loop_p (loop, gimple_bb (use_stmt)))
2559 loop_use_stmt = use_stmt;
2560 nloop_uses++;
2562 else
2563 n_out_of_loop_uses++;
2565 /* There are can be either a single use in the loop or two uses in
2566 phi nodes. */
2567 if (nloop_uses > 1 || (n_out_of_loop_uses && nloop_uses))
2568 return false;
2571 if (found)
2572 break;
2574 /* We reached a statement with no loop uses. */
2575 if (nloop_uses == 0)
2576 return false;
2578 /* This is a loop exit phi, and we haven't reached the reduction phi. */
2579 if (gimple_code (loop_use_stmt) == GIMPLE_PHI)
2580 return false;
2582 if (!is_gimple_assign (loop_use_stmt)
2583 || code != gimple_assign_rhs_code (loop_use_stmt)
2584 || !flow_bb_inside_loop_p (loop, gimple_bb (loop_use_stmt)))
2585 return false;
2587 /* Insert USE_STMT into reduction chain. */
2588 use_stmt_info = loop_info->lookup_stmt (loop_use_stmt);
2589 if (current_stmt_info)
2591 REDUC_GROUP_NEXT_ELEMENT (current_stmt_info) = use_stmt_info;
2592 REDUC_GROUP_FIRST_ELEMENT (use_stmt_info)
2593 = REDUC_GROUP_FIRST_ELEMENT (current_stmt_info);
2595 else
2596 REDUC_GROUP_FIRST_ELEMENT (use_stmt_info) = use_stmt_info;
2598 lhs = gimple_assign_lhs (loop_use_stmt);
2599 current_stmt_info = use_stmt_info;
2600 size++;
2603 if (!found || loop_use_stmt != phi || size < 2)
2604 return false;
2606 /* Swap the operands, if needed, to make the reduction operand be the second
2607 operand. */
2608 lhs = PHI_RESULT (phi);
2609 stmt_vec_info next_stmt_info = REDUC_GROUP_FIRST_ELEMENT (current_stmt_info);
2610 while (next_stmt_info)
2612 gassign *next_stmt = as_a <gassign *> (next_stmt_info->stmt);
2613 if (gimple_assign_rhs2 (next_stmt) == lhs)
2615 tree op = gimple_assign_rhs1 (next_stmt);
2616 stmt_vec_info def_stmt_info = loop_info->lookup_def (op);
2618 /* Check that the other def is either defined in the loop
2619 ("vect_internal_def"), or it's an induction (defined by a
2620 loop-header phi-node). */
2621 if (def_stmt_info
2622 && flow_bb_inside_loop_p (loop, gimple_bb (def_stmt_info->stmt))
2623 && vect_valid_reduction_input_p (def_stmt_info))
2625 lhs = gimple_assign_lhs (next_stmt);
2626 next_stmt_info = REDUC_GROUP_NEXT_ELEMENT (next_stmt_info);
2627 continue;
2630 return false;
2632 else
2634 tree op = gimple_assign_rhs2 (next_stmt);
2635 stmt_vec_info def_stmt_info = loop_info->lookup_def (op);
2637 /* Check that the other def is either defined in the loop
2638 ("vect_internal_def"), or it's an induction (defined by a
2639 loop-header phi-node). */
2640 if (def_stmt_info
2641 && flow_bb_inside_loop_p (loop, gimple_bb (def_stmt_info->stmt))
2642 && vect_valid_reduction_input_p (def_stmt_info))
2644 if (dump_enabled_p ())
2646 dump_printf_loc (MSG_NOTE, vect_location, "swapping oprnds: ");
2647 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, next_stmt, 0);
2650 swap_ssa_operands (next_stmt,
2651 gimple_assign_rhs1_ptr (next_stmt),
2652 gimple_assign_rhs2_ptr (next_stmt));
2653 update_stmt (next_stmt);
2655 if (CONSTANT_CLASS_P (gimple_assign_rhs1 (next_stmt)))
2656 LOOP_VINFO_OPERANDS_SWAPPED (loop_info) = true;
2658 else
2659 return false;
2662 lhs = gimple_assign_lhs (next_stmt);
2663 next_stmt_info = REDUC_GROUP_NEXT_ELEMENT (next_stmt_info);
2666 /* Save the chain for further analysis in SLP detection. */
2667 stmt_vec_info first_stmt_info
2668 = REDUC_GROUP_FIRST_ELEMENT (current_stmt_info);
2669 LOOP_VINFO_REDUCTION_CHAINS (loop_info).safe_push (first_stmt_info);
2670 REDUC_GROUP_SIZE (first_stmt_info) = size;
2672 return true;
2675 /* Return true if we need an in-order reduction for operation CODE
2676 on type TYPE. NEED_WRAPPING_INTEGRAL_OVERFLOW is true if integer
2677 overflow must wrap. */
2679 static bool
2680 needs_fold_left_reduction_p (tree type, tree_code code,
2681 bool need_wrapping_integral_overflow)
2683 /* CHECKME: check for !flag_finite_math_only too? */
2684 if (SCALAR_FLOAT_TYPE_P (type))
2685 switch (code)
2687 case MIN_EXPR:
2688 case MAX_EXPR:
2689 return false;
2691 default:
2692 return !flag_associative_math;
2695 if (INTEGRAL_TYPE_P (type))
2697 if (!operation_no_trapping_overflow (type, code))
2698 return true;
2699 if (need_wrapping_integral_overflow
2700 && !TYPE_OVERFLOW_WRAPS (type)
2701 && operation_can_overflow (code))
2702 return true;
2703 return false;
2706 if (SAT_FIXED_POINT_TYPE_P (type))
2707 return true;
2709 return false;
2712 /* Return true if the reduction PHI in LOOP with latch arg LOOP_ARG and
2713 reduction operation CODE has a handled computation expression. */
2715 bool
2716 check_reduction_path (dump_user_location_t loc, loop_p loop, gphi *phi,
2717 tree loop_arg, enum tree_code code)
2719 auto_vec<std::pair<ssa_op_iter, use_operand_p> > path;
2720 auto_bitmap visited;
2721 tree lookfor = PHI_RESULT (phi);
2722 ssa_op_iter curri;
2723 use_operand_p curr = op_iter_init_phiuse (&curri, phi, SSA_OP_USE);
2724 while (USE_FROM_PTR (curr) != loop_arg)
2725 curr = op_iter_next_use (&curri);
2726 curri.i = curri.numops;
2729 path.safe_push (std::make_pair (curri, curr));
2730 tree use = USE_FROM_PTR (curr);
2731 if (use == lookfor)
2732 break;
2733 gimple *def = SSA_NAME_DEF_STMT (use);
2734 if (gimple_nop_p (def)
2735 || ! flow_bb_inside_loop_p (loop, gimple_bb (def)))
2737 pop:
2740 std::pair<ssa_op_iter, use_operand_p> x = path.pop ();
2741 curri = x.first;
2742 curr = x.second;
2744 curr = op_iter_next_use (&curri);
2745 /* Skip already visited or non-SSA operands (from iterating
2746 over PHI args). */
2747 while (curr != NULL_USE_OPERAND_P
2748 && (TREE_CODE (USE_FROM_PTR (curr)) != SSA_NAME
2749 || ! bitmap_set_bit (visited,
2750 SSA_NAME_VERSION
2751 (USE_FROM_PTR (curr)))));
2753 while (curr == NULL_USE_OPERAND_P && ! path.is_empty ());
2754 if (curr == NULL_USE_OPERAND_P)
2755 break;
2757 else
2759 if (gimple_code (def) == GIMPLE_PHI)
2760 curr = op_iter_init_phiuse (&curri, as_a <gphi *>(def), SSA_OP_USE);
2761 else
2762 curr = op_iter_init_use (&curri, def, SSA_OP_USE);
2763 while (curr != NULL_USE_OPERAND_P
2764 && (TREE_CODE (USE_FROM_PTR (curr)) != SSA_NAME
2765 || ! bitmap_set_bit (visited,
2766 SSA_NAME_VERSION
2767 (USE_FROM_PTR (curr)))))
2768 curr = op_iter_next_use (&curri);
2769 if (curr == NULL_USE_OPERAND_P)
2770 goto pop;
2773 while (1);
2774 if (dump_file && (dump_flags & TDF_DETAILS))
2776 dump_printf_loc (MSG_NOTE, loc, "reduction path: ");
2777 unsigned i;
2778 std::pair<ssa_op_iter, use_operand_p> *x;
2779 FOR_EACH_VEC_ELT (path, i, x)
2781 dump_generic_expr (MSG_NOTE, TDF_SLIM, USE_FROM_PTR (x->second));
2782 dump_printf (MSG_NOTE, " ");
2784 dump_printf (MSG_NOTE, "\n");
2787 /* Check whether the reduction path detected is valid. */
2788 bool fail = path.length () == 0;
2789 bool neg = false;
2790 for (unsigned i = 1; i < path.length (); ++i)
2792 gimple *use_stmt = USE_STMT (path[i].second);
2793 tree op = USE_FROM_PTR (path[i].second);
2794 if (! has_single_use (op)
2795 || ! is_gimple_assign (use_stmt))
2797 fail = true;
2798 break;
2800 if (gimple_assign_rhs_code (use_stmt) != code)
2802 if (code == PLUS_EXPR
2803 && gimple_assign_rhs_code (use_stmt) == MINUS_EXPR)
2805 /* Track whether we negate the reduction value each iteration. */
2806 if (gimple_assign_rhs2 (use_stmt) == op)
2807 neg = ! neg;
2809 else
2811 fail = true;
2812 break;
2816 return ! fail && ! neg;
2820 /* Function vect_is_simple_reduction
2822 (1) Detect a cross-iteration def-use cycle that represents a simple
2823 reduction computation. We look for the following pattern:
2825 loop_header:
2826 a1 = phi < a0, a2 >
2827 a3 = ...
2828 a2 = operation (a3, a1)
2832 a3 = ...
2833 loop_header:
2834 a1 = phi < a0, a2 >
2835 a2 = operation (a3, a1)
2837 such that:
2838 1. operation is commutative and associative and it is safe to
2839 change the order of the computation
2840 2. no uses for a2 in the loop (a2 is used out of the loop)
2841 3. no uses of a1 in the loop besides the reduction operation
2842 4. no uses of a1 outside the loop.
2844 Conditions 1,4 are tested here.
2845 Conditions 2,3 are tested in vect_mark_stmts_to_be_vectorized.
2847 (2) Detect a cross-iteration def-use cycle in nested loops, i.e.,
2848 nested cycles.
2850 (3) Detect cycles of phi nodes in outer-loop vectorization, i.e., double
2851 reductions:
2853 a1 = phi < a0, a2 >
2854 inner loop (def of a3)
2855 a2 = phi < a3 >
2857 (4) Detect condition expressions, ie:
2858 for (int i = 0; i < N; i++)
2859 if (a[i] < val)
2860 ret_val = a[i];
2864 static stmt_vec_info
2865 vect_is_simple_reduction (loop_vec_info loop_info, stmt_vec_info phi_info,
2866 bool *double_reduc,
2867 bool need_wrapping_integral_overflow,
2868 enum vect_reduction_type *v_reduc_type)
2870 gphi *phi = as_a <gphi *> (phi_info->stmt);
2871 struct loop *loop = (gimple_bb (phi))->loop_father;
2872 struct loop *vect_loop = LOOP_VINFO_LOOP (loop_info);
2873 gimple *phi_use_stmt = NULL;
2874 enum tree_code orig_code, code;
2875 tree op1, op2, op3 = NULL_TREE, op4 = NULL_TREE;
2876 tree type;
2877 int nloop_uses;
2878 tree name;
2879 imm_use_iterator imm_iter;
2880 use_operand_p use_p;
2881 bool phi_def;
2883 *double_reduc = false;
2884 *v_reduc_type = TREE_CODE_REDUCTION;
2886 tree phi_name = PHI_RESULT (phi);
2887 /* ??? If there are no uses of the PHI result the inner loop reduction
2888 won't be detected as possibly double-reduction by vectorizable_reduction
2889 because that tries to walk the PHI arg from the preheader edge which
2890 can be constant. See PR60382. */
2891 if (has_zero_uses (phi_name))
2892 return NULL;
2893 nloop_uses = 0;
2894 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, phi_name)
2896 gimple *use_stmt = USE_STMT (use_p);
2897 if (is_gimple_debug (use_stmt))
2898 continue;
2900 if (!flow_bb_inside_loop_p (loop, gimple_bb (use_stmt)))
2902 if (dump_enabled_p ())
2903 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2904 "intermediate value used outside loop.\n");
2906 return NULL;
2909 nloop_uses++;
2910 if (nloop_uses > 1)
2912 if (dump_enabled_p ())
2913 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2914 "reduction value used in loop.\n");
2915 return NULL;
2918 phi_use_stmt = use_stmt;
2921 edge latch_e = loop_latch_edge (loop);
2922 tree loop_arg = PHI_ARG_DEF_FROM_EDGE (phi, latch_e);
2923 if (TREE_CODE (loop_arg) != SSA_NAME)
2925 if (dump_enabled_p ())
2927 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2928 "reduction: not ssa_name: ");
2929 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, loop_arg);
2930 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
2932 return NULL;
2935 stmt_vec_info def_stmt_info = loop_info->lookup_def (loop_arg);
2936 if (!def_stmt_info)
2937 return NULL;
2939 if (gassign *def_stmt = dyn_cast <gassign *> (def_stmt_info->stmt))
2941 name = gimple_assign_lhs (def_stmt);
2942 phi_def = false;
2944 else if (gphi *def_stmt = dyn_cast <gphi *> (def_stmt_info->stmt))
2946 name = PHI_RESULT (def_stmt);
2947 phi_def = true;
2949 else
2951 if (dump_enabled_p ())
2953 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2954 "reduction: unhandled reduction operation: ");
2955 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
2956 def_stmt_info->stmt, 0);
2958 return NULL;
2961 nloop_uses = 0;
2962 auto_vec<gphi *, 3> lcphis;
2963 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, name)
2965 gimple *use_stmt = USE_STMT (use_p);
2966 if (is_gimple_debug (use_stmt))
2967 continue;
2968 if (flow_bb_inside_loop_p (loop, gimple_bb (use_stmt)))
2969 nloop_uses++;
2970 else
2971 /* We can have more than one loop-closed PHI. */
2972 lcphis.safe_push (as_a <gphi *> (use_stmt));
2973 if (nloop_uses > 1)
2975 if (dump_enabled_p ())
2976 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2977 "reduction used in loop.\n");
2978 return NULL;
2982 /* If DEF_STMT is a phi node itself, we expect it to have a single argument
2983 defined in the inner loop. */
2984 if (phi_def)
2986 gphi *def_stmt = as_a <gphi *> (def_stmt_info->stmt);
2987 op1 = PHI_ARG_DEF (def_stmt, 0);
2989 if (gimple_phi_num_args (def_stmt) != 1
2990 || TREE_CODE (op1) != SSA_NAME)
2992 if (dump_enabled_p ())
2993 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2994 "unsupported phi node definition.\n");
2996 return NULL;
2999 gimple *def1 = SSA_NAME_DEF_STMT (op1);
3000 if (gimple_bb (def1)
3001 && flow_bb_inside_loop_p (loop, gimple_bb (def_stmt))
3002 && loop->inner
3003 && flow_bb_inside_loop_p (loop->inner, gimple_bb (def1))
3004 && is_gimple_assign (def1)
3005 && flow_bb_inside_loop_p (loop->inner, gimple_bb (phi_use_stmt)))
3007 if (dump_enabled_p ())
3008 report_vect_op (MSG_NOTE, def_stmt,
3009 "detected double reduction: ");
3011 *double_reduc = true;
3012 return def_stmt_info;
3015 return NULL;
3018 /* If we are vectorizing an inner reduction we are executing that
3019 in the original order only in case we are not dealing with a
3020 double reduction. */
3021 bool check_reduction = true;
3022 if (flow_loop_nested_p (vect_loop, loop))
3024 gphi *lcphi;
3025 unsigned i;
3026 check_reduction = false;
3027 FOR_EACH_VEC_ELT (lcphis, i, lcphi)
3028 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, gimple_phi_result (lcphi))
3030 gimple *use_stmt = USE_STMT (use_p);
3031 if (is_gimple_debug (use_stmt))
3032 continue;
3033 if (! flow_bb_inside_loop_p (vect_loop, gimple_bb (use_stmt)))
3034 check_reduction = true;
3038 gassign *def_stmt = as_a <gassign *> (def_stmt_info->stmt);
3039 bool nested_in_vect_loop = flow_loop_nested_p (vect_loop, loop);
3040 code = orig_code = gimple_assign_rhs_code (def_stmt);
3042 /* We can handle "res -= x[i]", which is non-associative by
3043 simply rewriting this into "res += -x[i]". Avoid changing
3044 gimple instruction for the first simple tests and only do this
3045 if we're allowed to change code at all. */
3046 if (code == MINUS_EXPR && gimple_assign_rhs2 (def_stmt) != phi_name)
3047 code = PLUS_EXPR;
3049 if (code == COND_EXPR)
3051 if (! nested_in_vect_loop)
3052 *v_reduc_type = COND_REDUCTION;
3054 op3 = gimple_assign_rhs1 (def_stmt);
3055 if (COMPARISON_CLASS_P (op3))
3057 op4 = TREE_OPERAND (op3, 1);
3058 op3 = TREE_OPERAND (op3, 0);
3060 if (op3 == phi_name || op4 == phi_name)
3062 if (dump_enabled_p ())
3063 report_vect_op (MSG_MISSED_OPTIMIZATION, def_stmt,
3064 "reduction: condition depends on previous"
3065 " iteration: ");
3066 return NULL;
3069 op1 = gimple_assign_rhs2 (def_stmt);
3070 op2 = gimple_assign_rhs3 (def_stmt);
3072 else if (!commutative_tree_code (code) || !associative_tree_code (code))
3074 if (dump_enabled_p ())
3075 report_vect_op (MSG_MISSED_OPTIMIZATION, def_stmt,
3076 "reduction: not commutative/associative: ");
3077 return NULL;
3079 else if (get_gimple_rhs_class (code) == GIMPLE_BINARY_RHS)
3081 op1 = gimple_assign_rhs1 (def_stmt);
3082 op2 = gimple_assign_rhs2 (def_stmt);
3084 else
3086 if (dump_enabled_p ())
3087 report_vect_op (MSG_MISSED_OPTIMIZATION, def_stmt,
3088 "reduction: not handled operation: ");
3089 return NULL;
3092 if (TREE_CODE (op1) != SSA_NAME && TREE_CODE (op2) != SSA_NAME)
3094 if (dump_enabled_p ())
3095 report_vect_op (MSG_MISSED_OPTIMIZATION, def_stmt,
3096 "reduction: both uses not ssa_names: ");
3098 return NULL;
3101 type = TREE_TYPE (gimple_assign_lhs (def_stmt));
3102 if ((TREE_CODE (op1) == SSA_NAME
3103 && !types_compatible_p (type,TREE_TYPE (op1)))
3104 || (TREE_CODE (op2) == SSA_NAME
3105 && !types_compatible_p (type, TREE_TYPE (op2)))
3106 || (op3 && TREE_CODE (op3) == SSA_NAME
3107 && !types_compatible_p (type, TREE_TYPE (op3)))
3108 || (op4 && TREE_CODE (op4) == SSA_NAME
3109 && !types_compatible_p (type, TREE_TYPE (op4))))
3111 if (dump_enabled_p ())
3113 dump_printf_loc (MSG_NOTE, vect_location,
3114 "reduction: multiple types: operation type: ");
3115 dump_generic_expr (MSG_NOTE, TDF_SLIM, type);
3116 dump_printf (MSG_NOTE, ", operands types: ");
3117 dump_generic_expr (MSG_NOTE, TDF_SLIM,
3118 TREE_TYPE (op1));
3119 dump_printf (MSG_NOTE, ",");
3120 dump_generic_expr (MSG_NOTE, TDF_SLIM,
3121 TREE_TYPE (op2));
3122 if (op3)
3124 dump_printf (MSG_NOTE, ",");
3125 dump_generic_expr (MSG_NOTE, TDF_SLIM,
3126 TREE_TYPE (op3));
3129 if (op4)
3131 dump_printf (MSG_NOTE, ",");
3132 dump_generic_expr (MSG_NOTE, TDF_SLIM,
3133 TREE_TYPE (op4));
3135 dump_printf (MSG_NOTE, "\n");
3138 return NULL;
3141 /* Check whether it's ok to change the order of the computation.
3142 Generally, when vectorizing a reduction we change the order of the
3143 computation. This may change the behavior of the program in some
3144 cases, so we need to check that this is ok. One exception is when
3145 vectorizing an outer-loop: the inner-loop is executed sequentially,
3146 and therefore vectorizing reductions in the inner-loop during
3147 outer-loop vectorization is safe. */
3148 if (check_reduction
3149 && *v_reduc_type == TREE_CODE_REDUCTION
3150 && needs_fold_left_reduction_p (type, code,
3151 need_wrapping_integral_overflow))
3152 *v_reduc_type = FOLD_LEFT_REDUCTION;
3154 /* Reduction is safe. We're dealing with one of the following:
3155 1) integer arithmetic and no trapv
3156 2) floating point arithmetic, and special flags permit this optimization
3157 3) nested cycle (i.e., outer loop vectorization). */
3158 stmt_vec_info def1_info = loop_info->lookup_def (op1);
3159 stmt_vec_info def2_info = loop_info->lookup_def (op2);
3160 if (code != COND_EXPR && !def1_info && !def2_info)
3162 if (dump_enabled_p ())
3163 report_vect_op (MSG_NOTE, def_stmt, "reduction: no defs for operands: ");
3164 return NULL;
3167 /* Check that one def is the reduction def, defined by PHI,
3168 the other def is either defined in the loop ("vect_internal_def"),
3169 or it's an induction (defined by a loop-header phi-node). */
3171 if (def2_info
3172 && def2_info->stmt == phi
3173 && (code == COND_EXPR
3174 || !def1_info
3175 || vect_valid_reduction_input_p (def1_info)))
3177 if (dump_enabled_p ())
3178 report_vect_op (MSG_NOTE, def_stmt, "detected reduction: ");
3179 return def_stmt_info;
3182 if (def1_info
3183 && def1_info->stmt == phi
3184 && (code == COND_EXPR
3185 || !def2_info
3186 || vect_valid_reduction_input_p (def2_info)))
3188 if (! nested_in_vect_loop && orig_code != MINUS_EXPR)
3190 /* Check if we can swap operands (just for simplicity - so that
3191 the rest of the code can assume that the reduction variable
3192 is always the last (second) argument). */
3193 if (code == COND_EXPR)
3195 /* Swap cond_expr by inverting the condition. */
3196 tree cond_expr = gimple_assign_rhs1 (def_stmt);
3197 enum tree_code invert_code = ERROR_MARK;
3198 enum tree_code cond_code = TREE_CODE (cond_expr);
3200 if (TREE_CODE_CLASS (cond_code) == tcc_comparison)
3202 bool honor_nans = HONOR_NANS (TREE_OPERAND (cond_expr, 0));
3203 invert_code = invert_tree_comparison (cond_code, honor_nans);
3205 if (invert_code != ERROR_MARK)
3207 TREE_SET_CODE (cond_expr, invert_code);
3208 swap_ssa_operands (def_stmt,
3209 gimple_assign_rhs2_ptr (def_stmt),
3210 gimple_assign_rhs3_ptr (def_stmt));
3212 else
3214 if (dump_enabled_p ())
3215 report_vect_op (MSG_NOTE, def_stmt,
3216 "detected reduction: cannot swap operands "
3217 "for cond_expr");
3218 return NULL;
3221 else
3222 swap_ssa_operands (def_stmt, gimple_assign_rhs1_ptr (def_stmt),
3223 gimple_assign_rhs2_ptr (def_stmt));
3225 if (dump_enabled_p ())
3226 report_vect_op (MSG_NOTE, def_stmt,
3227 "detected reduction: need to swap operands: ");
3229 if (CONSTANT_CLASS_P (gimple_assign_rhs1 (def_stmt)))
3230 LOOP_VINFO_OPERANDS_SWAPPED (loop_info) = true;
3232 else
3234 if (dump_enabled_p ())
3235 report_vect_op (MSG_NOTE, def_stmt, "detected reduction: ");
3238 return def_stmt_info;
3241 /* Try to find SLP reduction chain. */
3242 if (! nested_in_vect_loop
3243 && code != COND_EXPR
3244 && orig_code != MINUS_EXPR
3245 && vect_is_slp_reduction (loop_info, phi, def_stmt))
3247 if (dump_enabled_p ())
3248 report_vect_op (MSG_NOTE, def_stmt,
3249 "reduction: detected reduction chain: ");
3251 return def_stmt_info;
3254 /* Dissolve group eventually half-built by vect_is_slp_reduction. */
3255 stmt_vec_info first = REDUC_GROUP_FIRST_ELEMENT (vinfo_for_stmt (def_stmt));
3256 while (first)
3258 stmt_vec_info next = REDUC_GROUP_NEXT_ELEMENT (first);
3259 REDUC_GROUP_FIRST_ELEMENT (first) = NULL;
3260 REDUC_GROUP_NEXT_ELEMENT (first) = NULL;
3261 first = next;
3264 /* Look for the expression computing loop_arg from loop PHI result. */
3265 if (check_reduction_path (vect_location, loop, phi, loop_arg, code))
3266 return def_stmt_info;
3268 if (dump_enabled_p ())
3270 report_vect_op (MSG_MISSED_OPTIMIZATION, def_stmt,
3271 "reduction: unknown pattern: ");
3274 return NULL;
3277 /* Wrapper around vect_is_simple_reduction, which will modify code
3278 in-place if it enables detection of more reductions. Arguments
3279 as there. */
3281 stmt_vec_info
3282 vect_force_simple_reduction (loop_vec_info loop_info, stmt_vec_info phi_info,
3283 bool *double_reduc,
3284 bool need_wrapping_integral_overflow)
3286 enum vect_reduction_type v_reduc_type;
3287 stmt_vec_info def_info
3288 = vect_is_simple_reduction (loop_info, phi_info, double_reduc,
3289 need_wrapping_integral_overflow,
3290 &v_reduc_type);
3291 if (def_info)
3293 STMT_VINFO_REDUC_TYPE (phi_info) = v_reduc_type;
3294 STMT_VINFO_REDUC_DEF (phi_info) = def_info;
3295 STMT_VINFO_REDUC_TYPE (def_info) = v_reduc_type;
3296 STMT_VINFO_REDUC_DEF (def_info) = phi_info;
3298 return def_info;
3301 /* Calculate cost of peeling the loop PEEL_ITERS_PROLOGUE times. */
3303 vect_get_known_peeling_cost (loop_vec_info loop_vinfo, int peel_iters_prologue,
3304 int *peel_iters_epilogue,
3305 stmt_vector_for_cost *scalar_cost_vec,
3306 stmt_vector_for_cost *prologue_cost_vec,
3307 stmt_vector_for_cost *epilogue_cost_vec)
3309 int retval = 0;
3310 int assumed_vf = vect_vf_for_cost (loop_vinfo);
3312 if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
3314 *peel_iters_epilogue = assumed_vf / 2;
3315 if (dump_enabled_p ())
3316 dump_printf_loc (MSG_NOTE, vect_location,
3317 "cost model: epilogue peel iters set to vf/2 "
3318 "because loop iterations are unknown .\n");
3320 /* If peeled iterations are known but number of scalar loop
3321 iterations are unknown, count a taken branch per peeled loop. */
3322 retval = record_stmt_cost (prologue_cost_vec, 1, cond_branch_taken,
3323 NULL, 0, vect_prologue);
3324 retval = record_stmt_cost (prologue_cost_vec, 1, cond_branch_taken,
3325 NULL, 0, vect_epilogue);
3327 else
3329 int niters = LOOP_VINFO_INT_NITERS (loop_vinfo);
3330 peel_iters_prologue = niters < peel_iters_prologue ?
3331 niters : peel_iters_prologue;
3332 *peel_iters_epilogue = (niters - peel_iters_prologue) % assumed_vf;
3333 /* If we need to peel for gaps, but no peeling is required, we have to
3334 peel VF iterations. */
3335 if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo) && !*peel_iters_epilogue)
3336 *peel_iters_epilogue = assumed_vf;
3339 stmt_info_for_cost *si;
3340 int j;
3341 if (peel_iters_prologue)
3342 FOR_EACH_VEC_ELT (*scalar_cost_vec, j, si)
3343 retval += record_stmt_cost (prologue_cost_vec,
3344 si->count * peel_iters_prologue,
3345 si->kind, si->stmt_info, si->misalign,
3346 vect_prologue);
3347 if (*peel_iters_epilogue)
3348 FOR_EACH_VEC_ELT (*scalar_cost_vec, j, si)
3349 retval += record_stmt_cost (epilogue_cost_vec,
3350 si->count * *peel_iters_epilogue,
3351 si->kind, si->stmt_info, si->misalign,
3352 vect_epilogue);
3354 return retval;
3357 /* Function vect_estimate_min_profitable_iters
3359 Return the number of iterations required for the vector version of the
3360 loop to be profitable relative to the cost of the scalar version of the
3361 loop.
3363 *RET_MIN_PROFITABLE_NITERS is a cost model profitability threshold
3364 of iterations for vectorization. -1 value means loop vectorization
3365 is not profitable. This returned value may be used for dynamic
3366 profitability check.
3368 *RET_MIN_PROFITABLE_ESTIMATE is a profitability threshold to be used
3369 for static check against estimated number of iterations. */
3371 static void
3372 vect_estimate_min_profitable_iters (loop_vec_info loop_vinfo,
3373 int *ret_min_profitable_niters,
3374 int *ret_min_profitable_estimate)
3376 int min_profitable_iters;
3377 int min_profitable_estimate;
3378 int peel_iters_prologue;
3379 int peel_iters_epilogue;
3380 unsigned vec_inside_cost = 0;
3381 int vec_outside_cost = 0;
3382 unsigned vec_prologue_cost = 0;
3383 unsigned vec_epilogue_cost = 0;
3384 int scalar_single_iter_cost = 0;
3385 int scalar_outside_cost = 0;
3386 int assumed_vf = vect_vf_for_cost (loop_vinfo);
3387 int npeel = LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo);
3388 void *target_cost_data = LOOP_VINFO_TARGET_COST_DATA (loop_vinfo);
3390 /* Cost model disabled. */
3391 if (unlimited_cost_model (LOOP_VINFO_LOOP (loop_vinfo)))
3393 dump_printf_loc (MSG_NOTE, vect_location, "cost model disabled.\n");
3394 *ret_min_profitable_niters = 0;
3395 *ret_min_profitable_estimate = 0;
3396 return;
3399 /* Requires loop versioning tests to handle misalignment. */
3400 if (LOOP_REQUIRES_VERSIONING_FOR_ALIGNMENT (loop_vinfo))
3402 /* FIXME: Make cost depend on complexity of individual check. */
3403 unsigned len = LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).length ();
3404 (void) add_stmt_cost (target_cost_data, len, vector_stmt, NULL, 0,
3405 vect_prologue);
3406 dump_printf (MSG_NOTE,
3407 "cost model: Adding cost of checks for loop "
3408 "versioning to treat misalignment.\n");
3411 /* Requires loop versioning with alias checks. */
3412 if (LOOP_REQUIRES_VERSIONING_FOR_ALIAS (loop_vinfo))
3414 /* FIXME: Make cost depend on complexity of individual check. */
3415 unsigned len = LOOP_VINFO_COMP_ALIAS_DDRS (loop_vinfo).length ();
3416 (void) add_stmt_cost (target_cost_data, len, vector_stmt, NULL, 0,
3417 vect_prologue);
3418 len = LOOP_VINFO_CHECK_UNEQUAL_ADDRS (loop_vinfo).length ();
3419 if (len)
3420 /* Count LEN - 1 ANDs and LEN comparisons. */
3421 (void) add_stmt_cost (target_cost_data, len * 2 - 1, scalar_stmt,
3422 NULL, 0, vect_prologue);
3423 len = LOOP_VINFO_LOWER_BOUNDS (loop_vinfo).length ();
3424 if (len)
3426 /* Count LEN - 1 ANDs and LEN comparisons. */
3427 unsigned int nstmts = len * 2 - 1;
3428 /* +1 for each bias that needs adding. */
3429 for (unsigned int i = 0; i < len; ++i)
3430 if (!LOOP_VINFO_LOWER_BOUNDS (loop_vinfo)[i].unsigned_p)
3431 nstmts += 1;
3432 (void) add_stmt_cost (target_cost_data, nstmts, scalar_stmt,
3433 NULL, 0, vect_prologue);
3435 dump_printf (MSG_NOTE,
3436 "cost model: Adding cost of checks for loop "
3437 "versioning aliasing.\n");
3440 /* Requires loop versioning with niter checks. */
3441 if (LOOP_REQUIRES_VERSIONING_FOR_NITERS (loop_vinfo))
3443 /* FIXME: Make cost depend on complexity of individual check. */
3444 (void) add_stmt_cost (target_cost_data, 1, vector_stmt, NULL, 0,
3445 vect_prologue);
3446 dump_printf (MSG_NOTE,
3447 "cost model: Adding cost of checks for loop "
3448 "versioning niters.\n");
3451 if (LOOP_REQUIRES_VERSIONING (loop_vinfo))
3452 (void) add_stmt_cost (target_cost_data, 1, cond_branch_taken, NULL, 0,
3453 vect_prologue);
3455 /* Count statements in scalar loop. Using this as scalar cost for a single
3456 iteration for now.
3458 TODO: Add outer loop support.
3460 TODO: Consider assigning different costs to different scalar
3461 statements. */
3463 scalar_single_iter_cost
3464 = LOOP_VINFO_SINGLE_SCALAR_ITERATION_COST (loop_vinfo);
3466 /* Add additional cost for the peeled instructions in prologue and epilogue
3467 loop. (For fully-masked loops there will be no peeling.)
3469 FORNOW: If we don't know the value of peel_iters for prologue or epilogue
3470 at compile-time - we assume it's vf/2 (the worst would be vf-1).
3472 TODO: Build an expression that represents peel_iters for prologue and
3473 epilogue to be used in a run-time test. */
3475 if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
3477 peel_iters_prologue = 0;
3478 peel_iters_epilogue = 0;
3480 if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo))
3482 /* We need to peel exactly one iteration. */
3483 peel_iters_epilogue += 1;
3484 stmt_info_for_cost *si;
3485 int j;
3486 FOR_EACH_VEC_ELT (LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo),
3487 j, si)
3488 (void) add_stmt_cost (target_cost_data, si->count,
3489 si->kind, si->stmt_info, si->misalign,
3490 vect_epilogue);
3493 else if (npeel < 0)
3495 peel_iters_prologue = assumed_vf / 2;
3496 dump_printf (MSG_NOTE, "cost model: "
3497 "prologue peel iters set to vf/2.\n");
3499 /* If peeling for alignment is unknown, loop bound of main loop becomes
3500 unknown. */
3501 peel_iters_epilogue = assumed_vf / 2;
3502 dump_printf (MSG_NOTE, "cost model: "
3503 "epilogue peel iters set to vf/2 because "
3504 "peeling for alignment is unknown.\n");
3506 /* If peeled iterations are unknown, count a taken branch and a not taken
3507 branch per peeled loop. Even if scalar loop iterations are known,
3508 vector iterations are not known since peeled prologue iterations are
3509 not known. Hence guards remain the same. */
3510 (void) add_stmt_cost (target_cost_data, 1, cond_branch_taken,
3511 NULL, 0, vect_prologue);
3512 (void) add_stmt_cost (target_cost_data, 1, cond_branch_not_taken,
3513 NULL, 0, vect_prologue);
3514 (void) add_stmt_cost (target_cost_data, 1, cond_branch_taken,
3515 NULL, 0, vect_epilogue);
3516 (void) add_stmt_cost (target_cost_data, 1, cond_branch_not_taken,
3517 NULL, 0, vect_epilogue);
3518 stmt_info_for_cost *si;
3519 int j;
3520 FOR_EACH_VEC_ELT (LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo), j, si)
3522 (void) add_stmt_cost (target_cost_data,
3523 si->count * peel_iters_prologue,
3524 si->kind, si->stmt_info, si->misalign,
3525 vect_prologue);
3526 (void) add_stmt_cost (target_cost_data,
3527 si->count * peel_iters_epilogue,
3528 si->kind, si->stmt_info, si->misalign,
3529 vect_epilogue);
3532 else
3534 stmt_vector_for_cost prologue_cost_vec, epilogue_cost_vec;
3535 stmt_info_for_cost *si;
3536 int j;
3537 void *data = LOOP_VINFO_TARGET_COST_DATA (loop_vinfo);
3539 prologue_cost_vec.create (2);
3540 epilogue_cost_vec.create (2);
3541 peel_iters_prologue = npeel;
3543 (void) vect_get_known_peeling_cost (loop_vinfo, peel_iters_prologue,
3544 &peel_iters_epilogue,
3545 &LOOP_VINFO_SCALAR_ITERATION_COST
3546 (loop_vinfo),
3547 &prologue_cost_vec,
3548 &epilogue_cost_vec);
3550 FOR_EACH_VEC_ELT (prologue_cost_vec, j, si)
3551 (void) add_stmt_cost (data, si->count, si->kind, si->stmt_info,
3552 si->misalign, vect_prologue);
3554 FOR_EACH_VEC_ELT (epilogue_cost_vec, j, si)
3555 (void) add_stmt_cost (data, si->count, si->kind, si->stmt_info,
3556 si->misalign, vect_epilogue);
3558 prologue_cost_vec.release ();
3559 epilogue_cost_vec.release ();
3562 /* FORNOW: The scalar outside cost is incremented in one of the
3563 following ways:
3565 1. The vectorizer checks for alignment and aliasing and generates
3566 a condition that allows dynamic vectorization. A cost model
3567 check is ANDED with the versioning condition. Hence scalar code
3568 path now has the added cost of the versioning check.
3570 if (cost > th & versioning_check)
3571 jmp to vector code
3573 Hence run-time scalar is incremented by not-taken branch cost.
3575 2. The vectorizer then checks if a prologue is required. If the
3576 cost model check was not done before during versioning, it has to
3577 be done before the prologue check.
3579 if (cost <= th)
3580 prologue = scalar_iters
3581 if (prologue == 0)
3582 jmp to vector code
3583 else
3584 execute prologue
3585 if (prologue == num_iters)
3586 go to exit
3588 Hence the run-time scalar cost is incremented by a taken branch,
3589 plus a not-taken branch, plus a taken branch cost.
3591 3. The vectorizer then checks if an epilogue is required. If the
3592 cost model check was not done before during prologue check, it
3593 has to be done with the epilogue check.
3595 if (prologue == 0)
3596 jmp to vector code
3597 else
3598 execute prologue
3599 if (prologue == num_iters)
3600 go to exit
3601 vector code:
3602 if ((cost <= th) | (scalar_iters-prologue-epilogue == 0))
3603 jmp to epilogue
3605 Hence the run-time scalar cost should be incremented by 2 taken
3606 branches.
3608 TODO: The back end may reorder the BBS's differently and reverse
3609 conditions/branch directions. Change the estimates below to
3610 something more reasonable. */
3612 /* If the number of iterations is known and we do not do versioning, we can
3613 decide whether to vectorize at compile time. Hence the scalar version
3614 do not carry cost model guard costs. */
3615 if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
3616 || LOOP_REQUIRES_VERSIONING (loop_vinfo))
3618 /* Cost model check occurs at versioning. */
3619 if (LOOP_REQUIRES_VERSIONING (loop_vinfo))
3620 scalar_outside_cost += vect_get_stmt_cost (cond_branch_not_taken);
3621 else
3623 /* Cost model check occurs at prologue generation. */
3624 if (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) < 0)
3625 scalar_outside_cost += 2 * vect_get_stmt_cost (cond_branch_taken)
3626 + vect_get_stmt_cost (cond_branch_not_taken);
3627 /* Cost model check occurs at epilogue generation. */
3628 else
3629 scalar_outside_cost += 2 * vect_get_stmt_cost (cond_branch_taken);
3633 /* Complete the target-specific cost calculations. */
3634 finish_cost (LOOP_VINFO_TARGET_COST_DATA (loop_vinfo), &vec_prologue_cost,
3635 &vec_inside_cost, &vec_epilogue_cost);
3637 vec_outside_cost = (int)(vec_prologue_cost + vec_epilogue_cost);
3639 if (dump_enabled_p ())
3641 dump_printf_loc (MSG_NOTE, vect_location, "Cost model analysis: \n");
3642 dump_printf (MSG_NOTE, " Vector inside of loop cost: %d\n",
3643 vec_inside_cost);
3644 dump_printf (MSG_NOTE, " Vector prologue cost: %d\n",
3645 vec_prologue_cost);
3646 dump_printf (MSG_NOTE, " Vector epilogue cost: %d\n",
3647 vec_epilogue_cost);
3648 dump_printf (MSG_NOTE, " Scalar iteration cost: %d\n",
3649 scalar_single_iter_cost);
3650 dump_printf (MSG_NOTE, " Scalar outside cost: %d\n",
3651 scalar_outside_cost);
3652 dump_printf (MSG_NOTE, " Vector outside cost: %d\n",
3653 vec_outside_cost);
3654 dump_printf (MSG_NOTE, " prologue iterations: %d\n",
3655 peel_iters_prologue);
3656 dump_printf (MSG_NOTE, " epilogue iterations: %d\n",
3657 peel_iters_epilogue);
3660 /* Calculate number of iterations required to make the vector version
3661 profitable, relative to the loop bodies only. The following condition
3662 must hold true:
3663 SIC * niters + SOC > VIC * ((niters-PL_ITERS-EP_ITERS)/VF) + VOC
3664 where
3665 SIC = scalar iteration cost, VIC = vector iteration cost,
3666 VOC = vector outside cost, VF = vectorization factor,
3667 PL_ITERS = prologue iterations, EP_ITERS= epilogue iterations
3668 SOC = scalar outside cost for run time cost model check. */
3670 if ((scalar_single_iter_cost * assumed_vf) > (int) vec_inside_cost)
3672 min_profitable_iters = ((vec_outside_cost - scalar_outside_cost)
3673 * assumed_vf
3674 - vec_inside_cost * peel_iters_prologue
3675 - vec_inside_cost * peel_iters_epilogue);
3676 if (min_profitable_iters <= 0)
3677 min_profitable_iters = 0;
3678 else
3680 min_profitable_iters /= ((scalar_single_iter_cost * assumed_vf)
3681 - vec_inside_cost);
3683 if ((scalar_single_iter_cost * assumed_vf * min_profitable_iters)
3684 <= (((int) vec_inside_cost * min_profitable_iters)
3685 + (((int) vec_outside_cost - scalar_outside_cost)
3686 * assumed_vf)))
3687 min_profitable_iters++;
3690 /* vector version will never be profitable. */
3691 else
3693 if (LOOP_VINFO_LOOP (loop_vinfo)->force_vectorize)
3694 warning_at (vect_location.get_location_t (), OPT_Wopenmp_simd,
3695 "vectorization did not happen for a simd loop");
3697 if (dump_enabled_p ())
3698 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3699 "cost model: the vector iteration cost = %d "
3700 "divided by the scalar iteration cost = %d "
3701 "is greater or equal to the vectorization factor = %d"
3702 ".\n",
3703 vec_inside_cost, scalar_single_iter_cost, assumed_vf);
3704 *ret_min_profitable_niters = -1;
3705 *ret_min_profitable_estimate = -1;
3706 return;
3709 dump_printf (MSG_NOTE,
3710 " Calculated minimum iters for profitability: %d\n",
3711 min_profitable_iters);
3713 if (!LOOP_VINFO_FULLY_MASKED_P (loop_vinfo)
3714 && min_profitable_iters < (assumed_vf + peel_iters_prologue))
3715 /* We want the vectorized loop to execute at least once. */
3716 min_profitable_iters = assumed_vf + peel_iters_prologue;
3718 if (dump_enabled_p ())
3719 dump_printf_loc (MSG_NOTE, vect_location,
3720 " Runtime profitability threshold = %d\n",
3721 min_profitable_iters);
3723 *ret_min_profitable_niters = min_profitable_iters;
3725 /* Calculate number of iterations required to make the vector version
3726 profitable, relative to the loop bodies only.
3728 Non-vectorized variant is SIC * niters and it must win over vector
3729 variant on the expected loop trip count. The following condition must hold true:
3730 SIC * niters > VIC * ((niters-PL_ITERS-EP_ITERS)/VF) + VOC + SOC */
3732 if (vec_outside_cost <= 0)
3733 min_profitable_estimate = 0;
3734 else
3736 min_profitable_estimate = ((vec_outside_cost + scalar_outside_cost)
3737 * assumed_vf
3738 - vec_inside_cost * peel_iters_prologue
3739 - vec_inside_cost * peel_iters_epilogue)
3740 / ((scalar_single_iter_cost * assumed_vf)
3741 - vec_inside_cost);
3743 min_profitable_estimate = MAX (min_profitable_estimate, min_profitable_iters);
3744 if (dump_enabled_p ())
3745 dump_printf_loc (MSG_NOTE, vect_location,
3746 " Static estimate profitability threshold = %d\n",
3747 min_profitable_estimate);
3749 *ret_min_profitable_estimate = min_profitable_estimate;
3752 /* Writes into SEL a mask for a vec_perm, equivalent to a vec_shr by OFFSET
3753 vector elements (not bits) for a vector with NELT elements. */
3754 static void
3755 calc_vec_perm_mask_for_shift (unsigned int offset, unsigned int nelt,
3756 vec_perm_builder *sel)
3758 /* The encoding is a single stepped pattern. Any wrap-around is handled
3759 by vec_perm_indices. */
3760 sel->new_vector (nelt, 1, 3);
3761 for (unsigned int i = 0; i < 3; i++)
3762 sel->quick_push (i + offset);
3765 /* Checks whether the target supports whole-vector shifts for vectors of mode
3766 MODE. This is the case if _either_ the platform handles vec_shr_optab, _or_
3767 it supports vec_perm_const with masks for all necessary shift amounts. */
3768 static bool
3769 have_whole_vector_shift (machine_mode mode)
3771 if (optab_handler (vec_shr_optab, mode) != CODE_FOR_nothing)
3772 return true;
3774 /* Variable-length vectors should be handled via the optab. */
3775 unsigned int nelt;
3776 if (!GET_MODE_NUNITS (mode).is_constant (&nelt))
3777 return false;
3779 vec_perm_builder sel;
3780 vec_perm_indices indices;
3781 for (unsigned int i = nelt / 2; i >= 1; i /= 2)
3783 calc_vec_perm_mask_for_shift (i, nelt, &sel);
3784 indices.new_vector (sel, 2, nelt);
3785 if (!can_vec_perm_const_p (mode, indices, false))
3786 return false;
3788 return true;
3791 /* TODO: Close dependency between vect_model_*_cost and vectorizable_*
3792 functions. Design better to avoid maintenance issues. */
3794 /* Function vect_model_reduction_cost.
3796 Models cost for a reduction operation, including the vector ops
3797 generated within the strip-mine loop, the initial definition before
3798 the loop, and the epilogue code that must be generated. */
3800 static void
3801 vect_model_reduction_cost (stmt_vec_info stmt_info, internal_fn reduc_fn,
3802 int ncopies, stmt_vector_for_cost *cost_vec)
3804 int prologue_cost = 0, epilogue_cost = 0, inside_cost;
3805 enum tree_code code;
3806 optab optab;
3807 tree vectype;
3808 machine_mode mode;
3809 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
3810 struct loop *loop = NULL;
3812 if (loop_vinfo)
3813 loop = LOOP_VINFO_LOOP (loop_vinfo);
3815 /* Condition reductions generate two reductions in the loop. */
3816 vect_reduction_type reduction_type
3817 = STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info);
3818 if (reduction_type == COND_REDUCTION)
3819 ncopies *= 2;
3821 vectype = STMT_VINFO_VECTYPE (stmt_info);
3822 mode = TYPE_MODE (vectype);
3823 stmt_vec_info orig_stmt_info = STMT_VINFO_RELATED_STMT (stmt_info);
3825 if (!orig_stmt_info)
3826 orig_stmt_info = stmt_info;
3828 code = gimple_assign_rhs_code (orig_stmt_info->stmt);
3830 if (reduction_type == EXTRACT_LAST_REDUCTION
3831 || reduction_type == FOLD_LEFT_REDUCTION)
3833 /* No extra instructions needed in the prologue. */
3834 prologue_cost = 0;
3836 if (reduction_type == EXTRACT_LAST_REDUCTION || reduc_fn != IFN_LAST)
3837 /* Count one reduction-like operation per vector. */
3838 inside_cost = record_stmt_cost (cost_vec, ncopies, vec_to_scalar,
3839 stmt_info, 0, vect_body);
3840 else
3842 /* Use NELEMENTS extracts and NELEMENTS scalar ops. */
3843 unsigned int nelements = ncopies * vect_nunits_for_cost (vectype);
3844 inside_cost = record_stmt_cost (cost_vec, nelements,
3845 vec_to_scalar, stmt_info, 0,
3846 vect_body);
3847 inside_cost += record_stmt_cost (cost_vec, nelements,
3848 scalar_stmt, stmt_info, 0,
3849 vect_body);
3852 else
3854 /* Add in cost for initial definition.
3855 For cond reduction we have four vectors: initial index, step,
3856 initial result of the data reduction, initial value of the index
3857 reduction. */
3858 int prologue_stmts = reduction_type == COND_REDUCTION ? 4 : 1;
3859 prologue_cost += record_stmt_cost (cost_vec, prologue_stmts,
3860 scalar_to_vec, stmt_info, 0,
3861 vect_prologue);
3863 /* Cost of reduction op inside loop. */
3864 inside_cost = record_stmt_cost (cost_vec, ncopies, vector_stmt,
3865 stmt_info, 0, vect_body);
3868 /* Determine cost of epilogue code.
3870 We have a reduction operator that will reduce the vector in one statement.
3871 Also requires scalar extract. */
3873 if (!loop || !nested_in_vect_loop_p (loop, orig_stmt_info))
3875 if (reduc_fn != IFN_LAST)
3877 if (reduction_type == COND_REDUCTION)
3879 /* An EQ stmt and an COND_EXPR stmt. */
3880 epilogue_cost += record_stmt_cost (cost_vec, 2,
3881 vector_stmt, stmt_info, 0,
3882 vect_epilogue);
3883 /* Reduction of the max index and a reduction of the found
3884 values. */
3885 epilogue_cost += record_stmt_cost (cost_vec, 2,
3886 vec_to_scalar, stmt_info, 0,
3887 vect_epilogue);
3888 /* A broadcast of the max value. */
3889 epilogue_cost += record_stmt_cost (cost_vec, 1,
3890 scalar_to_vec, stmt_info, 0,
3891 vect_epilogue);
3893 else
3895 epilogue_cost += record_stmt_cost (cost_vec, 1, vector_stmt,
3896 stmt_info, 0, vect_epilogue);
3897 epilogue_cost += record_stmt_cost (cost_vec, 1,
3898 vec_to_scalar, stmt_info, 0,
3899 vect_epilogue);
3902 else if (reduction_type == COND_REDUCTION)
3904 unsigned estimated_nunits = vect_nunits_for_cost (vectype);
3905 /* Extraction of scalar elements. */
3906 epilogue_cost += record_stmt_cost (cost_vec,
3907 2 * estimated_nunits,
3908 vec_to_scalar, stmt_info, 0,
3909 vect_epilogue);
3910 /* Scalar max reductions via COND_EXPR / MAX_EXPR. */
3911 epilogue_cost += record_stmt_cost (cost_vec,
3912 2 * estimated_nunits - 3,
3913 scalar_stmt, stmt_info, 0,
3914 vect_epilogue);
3916 else if (reduction_type == EXTRACT_LAST_REDUCTION
3917 || reduction_type == FOLD_LEFT_REDUCTION)
3918 /* No extra instructions need in the epilogue. */
3920 else
3922 int vec_size_in_bits = tree_to_uhwi (TYPE_SIZE (vectype));
3923 tree bitsize =
3924 TYPE_SIZE (TREE_TYPE (gimple_assign_lhs (orig_stmt_info->stmt)));
3925 int element_bitsize = tree_to_uhwi (bitsize);
3926 int nelements = vec_size_in_bits / element_bitsize;
3928 if (code == COND_EXPR)
3929 code = MAX_EXPR;
3931 optab = optab_for_tree_code (code, vectype, optab_default);
3933 /* We have a whole vector shift available. */
3934 if (optab != unknown_optab
3935 && VECTOR_MODE_P (mode)
3936 && optab_handler (optab, mode) != CODE_FOR_nothing
3937 && have_whole_vector_shift (mode))
3939 /* Final reduction via vector shifts and the reduction operator.
3940 Also requires scalar extract. */
3941 epilogue_cost += record_stmt_cost (cost_vec,
3942 exact_log2 (nelements) * 2,
3943 vector_stmt, stmt_info, 0,
3944 vect_epilogue);
3945 epilogue_cost += record_stmt_cost (cost_vec, 1,
3946 vec_to_scalar, stmt_info, 0,
3947 vect_epilogue);
3949 else
3950 /* Use extracts and reduction op for final reduction. For N
3951 elements, we have N extracts and N-1 reduction ops. */
3952 epilogue_cost += record_stmt_cost (cost_vec,
3953 nelements + nelements - 1,
3954 vector_stmt, stmt_info, 0,
3955 vect_epilogue);
3959 if (dump_enabled_p ())
3960 dump_printf (MSG_NOTE,
3961 "vect_model_reduction_cost: inside_cost = %d, "
3962 "prologue_cost = %d, epilogue_cost = %d .\n", inside_cost,
3963 prologue_cost, epilogue_cost);
3967 /* Function vect_model_induction_cost.
3969 Models cost for induction operations. */
3971 static void
3972 vect_model_induction_cost (stmt_vec_info stmt_info, int ncopies,
3973 stmt_vector_for_cost *cost_vec)
3975 unsigned inside_cost, prologue_cost;
3977 if (PURE_SLP_STMT (stmt_info))
3978 return;
3980 /* loop cost for vec_loop. */
3981 inside_cost = record_stmt_cost (cost_vec, ncopies, vector_stmt,
3982 stmt_info, 0, vect_body);
3984 /* prologue cost for vec_init and vec_step. */
3985 prologue_cost = record_stmt_cost (cost_vec, 2, scalar_to_vec,
3986 stmt_info, 0, vect_prologue);
3988 if (dump_enabled_p ())
3989 dump_printf_loc (MSG_NOTE, vect_location,
3990 "vect_model_induction_cost: inside_cost = %d, "
3991 "prologue_cost = %d .\n", inside_cost, prologue_cost);
3996 /* Function get_initial_def_for_reduction
3998 Input:
3999 STMT - a stmt that performs a reduction operation in the loop.
4000 INIT_VAL - the initial value of the reduction variable
4002 Output:
4003 ADJUSTMENT_DEF - a tree that holds a value to be added to the final result
4004 of the reduction (used for adjusting the epilog - see below).
4005 Return a vector variable, initialized according to the operation that STMT
4006 performs. This vector will be used as the initial value of the
4007 vector of partial results.
4009 Option1 (adjust in epilog): Initialize the vector as follows:
4010 add/bit or/xor: [0,0,...,0,0]
4011 mult/bit and: [1,1,...,1,1]
4012 min/max/cond_expr: [init_val,init_val,..,init_val,init_val]
4013 and when necessary (e.g. add/mult case) let the caller know
4014 that it needs to adjust the result by init_val.
4016 Option2: Initialize the vector as follows:
4017 add/bit or/xor: [init_val,0,0,...,0]
4018 mult/bit and: [init_val,1,1,...,1]
4019 min/max/cond_expr: [init_val,init_val,...,init_val]
4020 and no adjustments are needed.
4022 For example, for the following code:
4024 s = init_val;
4025 for (i=0;i<n;i++)
4026 s = s + a[i];
4028 STMT is 's = s + a[i]', and the reduction variable is 's'.
4029 For a vector of 4 units, we want to return either [0,0,0,init_val],
4030 or [0,0,0,0] and let the caller know that it needs to adjust
4031 the result at the end by 'init_val'.
4033 FORNOW, we are using the 'adjust in epilog' scheme, because this way the
4034 initialization vector is simpler (same element in all entries), if
4035 ADJUSTMENT_DEF is not NULL, and Option2 otherwise.
4037 A cost model should help decide between these two schemes. */
4039 tree
4040 get_initial_def_for_reduction (gimple *stmt, tree init_val,
4041 tree *adjustment_def)
4043 stmt_vec_info stmt_vinfo = vinfo_for_stmt (stmt);
4044 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_vinfo);
4045 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
4046 tree scalar_type = TREE_TYPE (init_val);
4047 tree vectype = get_vectype_for_scalar_type (scalar_type);
4048 enum tree_code code = gimple_assign_rhs_code (stmt);
4049 tree def_for_init;
4050 tree init_def;
4051 REAL_VALUE_TYPE real_init_val = dconst0;
4052 int int_init_val = 0;
4053 gimple_seq stmts = NULL;
4055 gcc_assert (vectype);
4057 gcc_assert (POINTER_TYPE_P (scalar_type) || INTEGRAL_TYPE_P (scalar_type)
4058 || SCALAR_FLOAT_TYPE_P (scalar_type));
4060 gcc_assert (nested_in_vect_loop_p (loop, stmt)
4061 || loop == (gimple_bb (stmt))->loop_father);
4063 vect_reduction_type reduction_type
4064 = STMT_VINFO_VEC_REDUCTION_TYPE (stmt_vinfo);
4066 switch (code)
4068 case WIDEN_SUM_EXPR:
4069 case DOT_PROD_EXPR:
4070 case SAD_EXPR:
4071 case PLUS_EXPR:
4072 case MINUS_EXPR:
4073 case BIT_IOR_EXPR:
4074 case BIT_XOR_EXPR:
4075 case MULT_EXPR:
4076 case BIT_AND_EXPR:
4078 /* ADJUSTMENT_DEF is NULL when called from
4079 vect_create_epilog_for_reduction to vectorize double reduction. */
4080 if (adjustment_def)
4081 *adjustment_def = init_val;
4083 if (code == MULT_EXPR)
4085 real_init_val = dconst1;
4086 int_init_val = 1;
4089 if (code == BIT_AND_EXPR)
4090 int_init_val = -1;
4092 if (SCALAR_FLOAT_TYPE_P (scalar_type))
4093 def_for_init = build_real (scalar_type, real_init_val);
4094 else
4095 def_for_init = build_int_cst (scalar_type, int_init_val);
4097 if (adjustment_def)
4098 /* Option1: the first element is '0' or '1' as well. */
4099 init_def = gimple_build_vector_from_val (&stmts, vectype,
4100 def_for_init);
4101 else if (!TYPE_VECTOR_SUBPARTS (vectype).is_constant ())
4103 /* Option2 (variable length): the first element is INIT_VAL. */
4104 init_def = gimple_build_vector_from_val (&stmts, vectype,
4105 def_for_init);
4106 init_def = gimple_build (&stmts, CFN_VEC_SHL_INSERT,
4107 vectype, init_def, init_val);
4109 else
4111 /* Option2: the first element is INIT_VAL. */
4112 tree_vector_builder elts (vectype, 1, 2);
4113 elts.quick_push (init_val);
4114 elts.quick_push (def_for_init);
4115 init_def = gimple_build_vector (&stmts, &elts);
4118 break;
4120 case MIN_EXPR:
4121 case MAX_EXPR:
4122 case COND_EXPR:
4124 if (adjustment_def)
4126 *adjustment_def = NULL_TREE;
4127 if (reduction_type != COND_REDUCTION
4128 && reduction_type != EXTRACT_LAST_REDUCTION)
4130 init_def = vect_get_vec_def_for_operand (init_val, stmt);
4131 break;
4134 init_val = gimple_convert (&stmts, TREE_TYPE (vectype), init_val);
4135 init_def = gimple_build_vector_from_val (&stmts, vectype, init_val);
4137 break;
4139 default:
4140 gcc_unreachable ();
4143 if (stmts)
4144 gsi_insert_seq_on_edge_immediate (loop_preheader_edge (loop), stmts);
4145 return init_def;
4148 /* Get at the initial defs for the reduction PHIs in SLP_NODE.
4149 NUMBER_OF_VECTORS is the number of vector defs to create.
4150 If NEUTRAL_OP is nonnull, introducing extra elements of that
4151 value will not change the result. */
4153 static void
4154 get_initial_defs_for_reduction (slp_tree slp_node,
4155 vec<tree> *vec_oprnds,
4156 unsigned int number_of_vectors,
4157 bool reduc_chain, tree neutral_op)
4159 vec<stmt_vec_info> stmts = SLP_TREE_SCALAR_STMTS (slp_node);
4160 stmt_vec_info stmt_vinfo = stmts[0];
4161 unsigned HOST_WIDE_INT nunits;
4162 unsigned j, number_of_places_left_in_vector;
4163 tree vector_type;
4164 tree vop;
4165 int group_size = stmts.length ();
4166 unsigned int vec_num, i;
4167 unsigned number_of_copies = 1;
4168 vec<tree> voprnds;
4169 voprnds.create (number_of_vectors);
4170 struct loop *loop;
4171 auto_vec<tree, 16> permute_results;
4173 vector_type = STMT_VINFO_VECTYPE (stmt_vinfo);
4175 gcc_assert (STMT_VINFO_DEF_TYPE (stmt_vinfo) == vect_reduction_def);
4177 loop = (gimple_bb (stmt_vinfo->stmt))->loop_father;
4178 gcc_assert (loop);
4179 edge pe = loop_preheader_edge (loop);
4181 gcc_assert (!reduc_chain || neutral_op);
4183 /* NUMBER_OF_COPIES is the number of times we need to use the same values in
4184 created vectors. It is greater than 1 if unrolling is performed.
4186 For example, we have two scalar operands, s1 and s2 (e.g., group of
4187 strided accesses of size two), while NUNITS is four (i.e., four scalars
4188 of this type can be packed in a vector). The output vector will contain
4189 two copies of each scalar operand: {s1, s2, s1, s2}. (NUMBER_OF_COPIES
4190 will be 2).
4192 If REDUC_GROUP_SIZE > NUNITS, the scalars will be split into several
4193 vectors containing the operands.
4195 For example, NUNITS is four as before, and the group size is 8
4196 (s1, s2, ..., s8). We will create two vectors {s1, s2, s3, s4} and
4197 {s5, s6, s7, s8}. */
4199 if (!TYPE_VECTOR_SUBPARTS (vector_type).is_constant (&nunits))
4200 nunits = group_size;
4202 number_of_copies = nunits * number_of_vectors / group_size;
4204 number_of_places_left_in_vector = nunits;
4205 bool constant_p = true;
4206 tree_vector_builder elts (vector_type, nunits, 1);
4207 elts.quick_grow (nunits);
4208 for (j = 0; j < number_of_copies; j++)
4210 for (i = group_size - 1; stmts.iterate (i, &stmt_vinfo); i--)
4212 tree op;
4213 /* Get the def before the loop. In reduction chain we have only
4214 one initial value. */
4215 if ((j != (number_of_copies - 1)
4216 || (reduc_chain && i != 0))
4217 && neutral_op)
4218 op = neutral_op;
4219 else
4220 op = PHI_ARG_DEF_FROM_EDGE (stmt_vinfo->stmt, pe);
4222 /* Create 'vect_ = {op0,op1,...,opn}'. */
4223 number_of_places_left_in_vector--;
4224 elts[number_of_places_left_in_vector] = op;
4225 if (!CONSTANT_CLASS_P (op))
4226 constant_p = false;
4228 if (number_of_places_left_in_vector == 0)
4230 gimple_seq ctor_seq = NULL;
4231 tree init;
4232 if (constant_p && !neutral_op
4233 ? multiple_p (TYPE_VECTOR_SUBPARTS (vector_type), nunits)
4234 : known_eq (TYPE_VECTOR_SUBPARTS (vector_type), nunits))
4235 /* Build the vector directly from ELTS. */
4236 init = gimple_build_vector (&ctor_seq, &elts);
4237 else if (neutral_op)
4239 /* Build a vector of the neutral value and shift the
4240 other elements into place. */
4241 init = gimple_build_vector_from_val (&ctor_seq, vector_type,
4242 neutral_op);
4243 int k = nunits;
4244 while (k > 0 && elts[k - 1] == neutral_op)
4245 k -= 1;
4246 while (k > 0)
4248 k -= 1;
4249 init = gimple_build (&ctor_seq, CFN_VEC_SHL_INSERT,
4250 vector_type, init, elts[k]);
4253 else
4255 /* First time round, duplicate ELTS to fill the
4256 required number of vectors, then cherry pick the
4257 appropriate result for each iteration. */
4258 if (vec_oprnds->is_empty ())
4259 duplicate_and_interleave (&ctor_seq, vector_type, elts,
4260 number_of_vectors,
4261 permute_results);
4262 init = permute_results[number_of_vectors - j - 1];
4264 if (ctor_seq != NULL)
4265 gsi_insert_seq_on_edge_immediate (pe, ctor_seq);
4266 voprnds.quick_push (init);
4268 number_of_places_left_in_vector = nunits;
4269 elts.new_vector (vector_type, nunits, 1);
4270 elts.quick_grow (nunits);
4271 constant_p = true;
4276 /* Since the vectors are created in the reverse order, we should invert
4277 them. */
4278 vec_num = voprnds.length ();
4279 for (j = vec_num; j != 0; j--)
4281 vop = voprnds[j - 1];
4282 vec_oprnds->quick_push (vop);
4285 voprnds.release ();
4287 /* In case that VF is greater than the unrolling factor needed for the SLP
4288 group of stmts, NUMBER_OF_VECTORS to be created is greater than
4289 NUMBER_OF_SCALARS/NUNITS or NUNITS/NUMBER_OF_SCALARS, and hence we have
4290 to replicate the vectors. */
4291 tree neutral_vec = NULL;
4292 while (number_of_vectors > vec_oprnds->length ())
4294 if (neutral_op)
4296 if (!neutral_vec)
4298 gimple_seq ctor_seq = NULL;
4299 neutral_vec = gimple_build_vector_from_val
4300 (&ctor_seq, vector_type, neutral_op);
4301 if (ctor_seq != NULL)
4302 gsi_insert_seq_on_edge_immediate (pe, ctor_seq);
4304 vec_oprnds->quick_push (neutral_vec);
4306 else
4308 for (i = 0; vec_oprnds->iterate (i, &vop) && i < vec_num; i++)
4309 vec_oprnds->quick_push (vop);
4315 /* Function vect_create_epilog_for_reduction
4317 Create code at the loop-epilog to finalize the result of a reduction
4318 computation.
4320 VECT_DEFS is list of vector of partial results, i.e., the lhs's of vector
4321 reduction statements.
4322 STMT is the scalar reduction stmt that is being vectorized.
4323 NCOPIES is > 1 in case the vectorization factor (VF) is bigger than the
4324 number of elements that we can fit in a vectype (nunits). In this case
4325 we have to generate more than one vector stmt - i.e - we need to "unroll"
4326 the vector stmt by a factor VF/nunits. For more details see documentation
4327 in vectorizable_operation.
4328 REDUC_FN is the internal function for the epilog reduction.
4329 REDUCTION_PHIS is a list of the phi-nodes that carry the reduction
4330 computation.
4331 REDUC_INDEX is the index of the operand in the right hand side of the
4332 statement that is defined by REDUCTION_PHI.
4333 DOUBLE_REDUC is TRUE if double reduction phi nodes should be handled.
4334 SLP_NODE is an SLP node containing a group of reduction statements. The
4335 first one in this group is STMT.
4336 INDUC_VAL is for INTEGER_INDUC_COND_REDUCTION the value to use for the case
4337 when the COND_EXPR is never true in the loop. For MAX_EXPR, it needs to
4338 be smaller than any value of the IV in the loop, for MIN_EXPR larger than
4339 any value of the IV in the loop.
4340 INDUC_CODE is the code for epilog reduction if INTEGER_INDUC_COND_REDUCTION.
4341 NEUTRAL_OP is the value given by neutral_op_for_slp_reduction; it is
4342 null if this is not an SLP reduction
4344 This function:
4345 1. Creates the reduction def-use cycles: sets the arguments for
4346 REDUCTION_PHIS:
4347 The loop-entry argument is the vectorized initial-value of the reduction.
4348 The loop-latch argument is taken from VECT_DEFS - the vector of partial
4349 sums.
4350 2. "Reduces" each vector of partial results VECT_DEFS into a single result,
4351 by calling the function specified by REDUC_FN if available, or by
4352 other means (whole-vector shifts or a scalar loop).
4353 The function also creates a new phi node at the loop exit to preserve
4354 loop-closed form, as illustrated below.
4356 The flow at the entry to this function:
4358 loop:
4359 vec_def = phi <null, null> # REDUCTION_PHI
4360 VECT_DEF = vector_stmt # vectorized form of STMT
4361 s_loop = scalar_stmt # (scalar) STMT
4362 loop_exit:
4363 s_out0 = phi <s_loop> # (scalar) EXIT_PHI
4364 use <s_out0>
4365 use <s_out0>
4367 The above is transformed by this function into:
4369 loop:
4370 vec_def = phi <vec_init, VECT_DEF> # REDUCTION_PHI
4371 VECT_DEF = vector_stmt # vectorized form of STMT
4372 s_loop = scalar_stmt # (scalar) STMT
4373 loop_exit:
4374 s_out0 = phi <s_loop> # (scalar) EXIT_PHI
4375 v_out1 = phi <VECT_DEF> # NEW_EXIT_PHI
4376 v_out2 = reduce <v_out1>
4377 s_out3 = extract_field <v_out2, 0>
4378 s_out4 = adjust_result <s_out3>
4379 use <s_out4>
4380 use <s_out4>
4383 static void
4384 vect_create_epilog_for_reduction (vec<tree> vect_defs, gimple *stmt,
4385 gimple *reduc_def_stmt,
4386 int ncopies, internal_fn reduc_fn,
4387 vec<stmt_vec_info> reduction_phis,
4388 bool double_reduc,
4389 slp_tree slp_node,
4390 slp_instance slp_node_instance,
4391 tree induc_val, enum tree_code induc_code,
4392 tree neutral_op)
4394 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
4395 stmt_vec_info prev_phi_info;
4396 tree vectype;
4397 machine_mode mode;
4398 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
4399 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo), *outer_loop = NULL;
4400 basic_block exit_bb;
4401 tree scalar_dest;
4402 tree scalar_type;
4403 gimple *new_phi = NULL, *phi;
4404 stmt_vec_info phi_info;
4405 gimple_stmt_iterator exit_gsi;
4406 tree vec_dest;
4407 tree new_temp = NULL_TREE, new_dest, new_name, new_scalar_dest;
4408 gimple *epilog_stmt = NULL;
4409 enum tree_code code = gimple_assign_rhs_code (stmt);
4410 gimple *exit_phi;
4411 tree bitsize;
4412 tree adjustment_def = NULL;
4413 tree vec_initial_def = NULL;
4414 tree expr, def, initial_def = NULL;
4415 tree orig_name, scalar_result;
4416 imm_use_iterator imm_iter, phi_imm_iter;
4417 use_operand_p use_p, phi_use_p;
4418 gimple *use_stmt;
4419 stmt_vec_info reduction_phi_info = NULL;
4420 bool nested_in_vect_loop = false;
4421 auto_vec<gimple *> new_phis;
4422 auto_vec<stmt_vec_info> inner_phis;
4423 enum vect_def_type dt = vect_unknown_def_type;
4424 int j, i;
4425 auto_vec<tree> scalar_results;
4426 unsigned int group_size = 1, k, ratio;
4427 auto_vec<tree> vec_initial_defs;
4428 auto_vec<gimple *> phis;
4429 bool slp_reduc = false;
4430 bool direct_slp_reduc;
4431 tree new_phi_result;
4432 stmt_vec_info inner_phi = NULL;
4433 tree induction_index = NULL_TREE;
4435 if (slp_node)
4436 group_size = SLP_TREE_SCALAR_STMTS (slp_node).length ();
4438 if (nested_in_vect_loop_p (loop, stmt))
4440 outer_loop = loop;
4441 loop = loop->inner;
4442 nested_in_vect_loop = true;
4443 gcc_assert (!slp_node);
4446 vectype = STMT_VINFO_VECTYPE (stmt_info);
4447 gcc_assert (vectype);
4448 mode = TYPE_MODE (vectype);
4450 /* 1. Create the reduction def-use cycle:
4451 Set the arguments of REDUCTION_PHIS, i.e., transform
4453 loop:
4454 vec_def = phi <null, null> # REDUCTION_PHI
4455 VECT_DEF = vector_stmt # vectorized form of STMT
4458 into:
4460 loop:
4461 vec_def = phi <vec_init, VECT_DEF> # REDUCTION_PHI
4462 VECT_DEF = vector_stmt # vectorized form of STMT
4465 (in case of SLP, do it for all the phis). */
4467 /* Get the loop-entry arguments. */
4468 enum vect_def_type initial_def_dt = vect_unknown_def_type;
4469 if (slp_node)
4471 unsigned vec_num = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node);
4472 vec_initial_defs.reserve (vec_num);
4473 get_initial_defs_for_reduction (slp_node_instance->reduc_phis,
4474 &vec_initial_defs, vec_num,
4475 REDUC_GROUP_FIRST_ELEMENT (stmt_info),
4476 neutral_op);
4478 else
4480 /* Get at the scalar def before the loop, that defines the initial value
4481 of the reduction variable. */
4482 initial_def = PHI_ARG_DEF_FROM_EDGE (reduc_def_stmt,
4483 loop_preheader_edge (loop));
4484 /* Optimize: if initial_def is for REDUC_MAX smaller than the base
4485 and we can't use zero for induc_val, use initial_def. Similarly
4486 for REDUC_MIN and initial_def larger than the base. */
4487 if (TREE_CODE (initial_def) == INTEGER_CST
4488 && (STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info)
4489 == INTEGER_INDUC_COND_REDUCTION)
4490 && !integer_zerop (induc_val)
4491 && ((induc_code == MAX_EXPR
4492 && tree_int_cst_lt (initial_def, induc_val))
4493 || (induc_code == MIN_EXPR
4494 && tree_int_cst_lt (induc_val, initial_def))))
4495 induc_val = initial_def;
4497 if (double_reduc)
4498 /* In case of double reduction we only create a vector variable
4499 to be put in the reduction phi node. The actual statement
4500 creation is done later in this function. */
4501 vec_initial_def = vect_create_destination_var (initial_def, vectype);
4502 else if (nested_in_vect_loop)
4504 /* Do not use an adjustment def as that case is not supported
4505 correctly if ncopies is not one. */
4506 vect_is_simple_use (initial_def, loop_vinfo, &initial_def_dt);
4507 vec_initial_def = vect_get_vec_def_for_operand (initial_def, stmt);
4509 else
4510 vec_initial_def = get_initial_def_for_reduction (stmt, initial_def,
4511 &adjustment_def);
4512 vec_initial_defs.create (1);
4513 vec_initial_defs.quick_push (vec_initial_def);
4516 /* Set phi nodes arguments. */
4517 FOR_EACH_VEC_ELT (reduction_phis, i, phi_info)
4519 tree vec_init_def = vec_initial_defs[i];
4520 tree def = vect_defs[i];
4521 for (j = 0; j < ncopies; j++)
4523 if (j != 0)
4525 phi_info = STMT_VINFO_RELATED_STMT (phi_info);
4526 if (nested_in_vect_loop)
4527 vec_init_def
4528 = vect_get_vec_def_for_stmt_copy (initial_def_dt,
4529 vec_init_def);
4532 /* Set the loop-entry arg of the reduction-phi. */
4534 gphi *phi = as_a <gphi *> (phi_info->stmt);
4535 if (STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info)
4536 == INTEGER_INDUC_COND_REDUCTION)
4538 /* Initialise the reduction phi to zero. This prevents initial
4539 values of non-zero interferring with the reduction op. */
4540 gcc_assert (ncopies == 1);
4541 gcc_assert (i == 0);
4543 tree vec_init_def_type = TREE_TYPE (vec_init_def);
4544 tree induc_val_vec
4545 = build_vector_from_val (vec_init_def_type, induc_val);
4547 add_phi_arg (phi, induc_val_vec, loop_preheader_edge (loop),
4548 UNKNOWN_LOCATION);
4550 else
4551 add_phi_arg (phi, vec_init_def, loop_preheader_edge (loop),
4552 UNKNOWN_LOCATION);
4554 /* Set the loop-latch arg for the reduction-phi. */
4555 if (j > 0)
4556 def = vect_get_vec_def_for_stmt_copy (vect_unknown_def_type, def);
4558 add_phi_arg (phi, def, loop_latch_edge (loop), UNKNOWN_LOCATION);
4560 if (dump_enabled_p ())
4562 dump_printf_loc (MSG_NOTE, vect_location,
4563 "transform reduction: created def-use cycle: ");
4564 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, phi, 0);
4565 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, SSA_NAME_DEF_STMT (def), 0);
4570 /* For cond reductions we want to create a new vector (INDEX_COND_EXPR)
4571 which is updated with the current index of the loop for every match of
4572 the original loop's cond_expr (VEC_STMT). This results in a vector
4573 containing the last time the condition passed for that vector lane.
4574 The first match will be a 1 to allow 0 to be used for non-matching
4575 indexes. If there are no matches at all then the vector will be all
4576 zeroes. */
4577 if (STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info) == COND_REDUCTION)
4579 tree indx_before_incr, indx_after_incr;
4580 poly_uint64 nunits_out = TYPE_VECTOR_SUBPARTS (vectype);
4582 gimple *vec_stmt = STMT_VINFO_VEC_STMT (stmt_info)->stmt;
4583 gcc_assert (gimple_assign_rhs_code (vec_stmt) == VEC_COND_EXPR);
4585 int scalar_precision
4586 = GET_MODE_PRECISION (SCALAR_TYPE_MODE (TREE_TYPE (vectype)));
4587 tree cr_index_scalar_type = make_unsigned_type (scalar_precision);
4588 tree cr_index_vector_type = build_vector_type
4589 (cr_index_scalar_type, TYPE_VECTOR_SUBPARTS (vectype));
4591 /* First we create a simple vector induction variable which starts
4592 with the values {1,2,3,...} (SERIES_VECT) and increments by the
4593 vector size (STEP). */
4595 /* Create a {1,2,3,...} vector. */
4596 tree series_vect = build_index_vector (cr_index_vector_type, 1, 1);
4598 /* Create a vector of the step value. */
4599 tree step = build_int_cst (cr_index_scalar_type, nunits_out);
4600 tree vec_step = build_vector_from_val (cr_index_vector_type, step);
4602 /* Create an induction variable. */
4603 gimple_stmt_iterator incr_gsi;
4604 bool insert_after;
4605 standard_iv_increment_position (loop, &incr_gsi, &insert_after);
4606 create_iv (series_vect, vec_step, NULL_TREE, loop, &incr_gsi,
4607 insert_after, &indx_before_incr, &indx_after_incr);
4609 /* Next create a new phi node vector (NEW_PHI_TREE) which starts
4610 filled with zeros (VEC_ZERO). */
4612 /* Create a vector of 0s. */
4613 tree zero = build_zero_cst (cr_index_scalar_type);
4614 tree vec_zero = build_vector_from_val (cr_index_vector_type, zero);
4616 /* Create a vector phi node. */
4617 tree new_phi_tree = make_ssa_name (cr_index_vector_type);
4618 new_phi = create_phi_node (new_phi_tree, loop->header);
4619 loop_vinfo->add_stmt (new_phi);
4620 add_phi_arg (as_a <gphi *> (new_phi), vec_zero,
4621 loop_preheader_edge (loop), UNKNOWN_LOCATION);
4623 /* Now take the condition from the loops original cond_expr
4624 (VEC_STMT) and produce a new cond_expr (INDEX_COND_EXPR) which for
4625 every match uses values from the induction variable
4626 (INDEX_BEFORE_INCR) otherwise uses values from the phi node
4627 (NEW_PHI_TREE).
4628 Finally, we update the phi (NEW_PHI_TREE) to take the value of
4629 the new cond_expr (INDEX_COND_EXPR). */
4631 /* Duplicate the condition from vec_stmt. */
4632 tree ccompare = unshare_expr (gimple_assign_rhs1 (vec_stmt));
4634 /* Create a conditional, where the condition is taken from vec_stmt
4635 (CCOMPARE), then is the induction index (INDEX_BEFORE_INCR) and
4636 else is the phi (NEW_PHI_TREE). */
4637 tree index_cond_expr = build3 (VEC_COND_EXPR, cr_index_vector_type,
4638 ccompare, indx_before_incr,
4639 new_phi_tree);
4640 induction_index = make_ssa_name (cr_index_vector_type);
4641 gimple *index_condition = gimple_build_assign (induction_index,
4642 index_cond_expr);
4643 gsi_insert_before (&incr_gsi, index_condition, GSI_SAME_STMT);
4644 stmt_vec_info index_vec_info = loop_vinfo->add_stmt (index_condition);
4645 STMT_VINFO_VECTYPE (index_vec_info) = cr_index_vector_type;
4647 /* Update the phi with the vec cond. */
4648 add_phi_arg (as_a <gphi *> (new_phi), induction_index,
4649 loop_latch_edge (loop), UNKNOWN_LOCATION);
4652 /* 2. Create epilog code.
4653 The reduction epilog code operates across the elements of the vector
4654 of partial results computed by the vectorized loop.
4655 The reduction epilog code consists of:
4657 step 1: compute the scalar result in a vector (v_out2)
4658 step 2: extract the scalar result (s_out3) from the vector (v_out2)
4659 step 3: adjust the scalar result (s_out3) if needed.
4661 Step 1 can be accomplished using one the following three schemes:
4662 (scheme 1) using reduc_fn, if available.
4663 (scheme 2) using whole-vector shifts, if available.
4664 (scheme 3) using a scalar loop. In this case steps 1+2 above are
4665 combined.
4667 The overall epilog code looks like this:
4669 s_out0 = phi <s_loop> # original EXIT_PHI
4670 v_out1 = phi <VECT_DEF> # NEW_EXIT_PHI
4671 v_out2 = reduce <v_out1> # step 1
4672 s_out3 = extract_field <v_out2, 0> # step 2
4673 s_out4 = adjust_result <s_out3> # step 3
4675 (step 3 is optional, and steps 1 and 2 may be combined).
4676 Lastly, the uses of s_out0 are replaced by s_out4. */
4679 /* 2.1 Create new loop-exit-phis to preserve loop-closed form:
4680 v_out1 = phi <VECT_DEF>
4681 Store them in NEW_PHIS. */
4683 exit_bb = single_exit (loop)->dest;
4684 prev_phi_info = NULL;
4685 new_phis.create (vect_defs.length ());
4686 FOR_EACH_VEC_ELT (vect_defs, i, def)
4688 for (j = 0; j < ncopies; j++)
4690 tree new_def = copy_ssa_name (def);
4691 phi = create_phi_node (new_def, exit_bb);
4692 stmt_vec_info phi_info = loop_vinfo->add_stmt (phi);
4693 if (j == 0)
4694 new_phis.quick_push (phi);
4695 else
4697 def = vect_get_vec_def_for_stmt_copy (dt, def);
4698 STMT_VINFO_RELATED_STMT (prev_phi_info) = phi_info;
4701 SET_PHI_ARG_DEF (phi, single_exit (loop)->dest_idx, def);
4702 prev_phi_info = phi_info;
4706 /* The epilogue is created for the outer-loop, i.e., for the loop being
4707 vectorized. Create exit phis for the outer loop. */
4708 if (double_reduc)
4710 loop = outer_loop;
4711 exit_bb = single_exit (loop)->dest;
4712 inner_phis.create (vect_defs.length ());
4713 FOR_EACH_VEC_ELT (new_phis, i, phi)
4715 stmt_vec_info phi_info = loop_vinfo->lookup_stmt (phi);
4716 tree new_result = copy_ssa_name (PHI_RESULT (phi));
4717 gphi *outer_phi = create_phi_node (new_result, exit_bb);
4718 SET_PHI_ARG_DEF (outer_phi, single_exit (loop)->dest_idx,
4719 PHI_RESULT (phi));
4720 prev_phi_info = loop_vinfo->add_stmt (outer_phi);
4721 inner_phis.quick_push (phi_info);
4722 new_phis[i] = outer_phi;
4723 while (STMT_VINFO_RELATED_STMT (phi_info))
4725 phi_info = STMT_VINFO_RELATED_STMT (phi_info);
4726 new_result = copy_ssa_name (PHI_RESULT (phi_info->stmt));
4727 outer_phi = create_phi_node (new_result, exit_bb);
4728 SET_PHI_ARG_DEF (outer_phi, single_exit (loop)->dest_idx,
4729 PHI_RESULT (phi_info->stmt));
4730 stmt_vec_info outer_phi_info = loop_vinfo->add_stmt (outer_phi);
4731 STMT_VINFO_RELATED_STMT (prev_phi_info) = outer_phi_info;
4732 prev_phi_info = outer_phi_info;
4737 exit_gsi = gsi_after_labels (exit_bb);
4739 /* 2.2 Get the relevant tree-code to use in the epilog for schemes 2,3
4740 (i.e. when reduc_fn is not available) and in the final adjustment
4741 code (if needed). Also get the original scalar reduction variable as
4742 defined in the loop. In case STMT is a "pattern-stmt" (i.e. - it
4743 represents a reduction pattern), the tree-code and scalar-def are
4744 taken from the original stmt that the pattern-stmt (STMT) replaces.
4745 Otherwise (it is a regular reduction) - the tree-code and scalar-def
4746 are taken from STMT. */
4748 stmt_vec_info orig_stmt_info = STMT_VINFO_RELATED_STMT (stmt_info);
4749 if (!orig_stmt_info)
4751 /* Regular reduction */
4752 orig_stmt_info = stmt_info;
4754 else
4756 /* Reduction pattern */
4757 gcc_assert (STMT_VINFO_IN_PATTERN_P (orig_stmt_info));
4758 gcc_assert (STMT_VINFO_RELATED_STMT (orig_stmt_info) == stmt_info);
4761 code = gimple_assign_rhs_code (orig_stmt_info->stmt);
4762 /* For MINUS_EXPR the initial vector is [init_val,0,...,0], therefore,
4763 partial results are added and not subtracted. */
4764 if (code == MINUS_EXPR)
4765 code = PLUS_EXPR;
4767 scalar_dest = gimple_assign_lhs (orig_stmt_info->stmt);
4768 scalar_type = TREE_TYPE (scalar_dest);
4769 scalar_results.create (group_size);
4770 new_scalar_dest = vect_create_destination_var (scalar_dest, NULL);
4771 bitsize = TYPE_SIZE (scalar_type);
4773 /* In case this is a reduction in an inner-loop while vectorizing an outer
4774 loop - we don't need to extract a single scalar result at the end of the
4775 inner-loop (unless it is double reduction, i.e., the use of reduction is
4776 outside the outer-loop). The final vector of partial results will be used
4777 in the vectorized outer-loop, or reduced to a scalar result at the end of
4778 the outer-loop. */
4779 if (nested_in_vect_loop && !double_reduc)
4780 goto vect_finalize_reduction;
4782 /* SLP reduction without reduction chain, e.g.,
4783 # a1 = phi <a2, a0>
4784 # b1 = phi <b2, b0>
4785 a2 = operation (a1)
4786 b2 = operation (b1) */
4787 slp_reduc = (slp_node && !REDUC_GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)));
4789 /* True if we should implement SLP_REDUC using native reduction operations
4790 instead of scalar operations. */
4791 direct_slp_reduc = (reduc_fn != IFN_LAST
4792 && slp_reduc
4793 && !TYPE_VECTOR_SUBPARTS (vectype).is_constant ());
4795 /* In case of reduction chain, e.g.,
4796 # a1 = phi <a3, a0>
4797 a2 = operation (a1)
4798 a3 = operation (a2),
4800 we may end up with more than one vector result. Here we reduce them to
4801 one vector. */
4802 if (REDUC_GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)) || direct_slp_reduc)
4804 tree first_vect = PHI_RESULT (new_phis[0]);
4805 gassign *new_vec_stmt = NULL;
4806 vec_dest = vect_create_destination_var (scalar_dest, vectype);
4807 for (k = 1; k < new_phis.length (); k++)
4809 gimple *next_phi = new_phis[k];
4810 tree second_vect = PHI_RESULT (next_phi);
4811 tree tem = make_ssa_name (vec_dest, new_vec_stmt);
4812 new_vec_stmt = gimple_build_assign (tem, code,
4813 first_vect, second_vect);
4814 gsi_insert_before (&exit_gsi, new_vec_stmt, GSI_SAME_STMT);
4815 first_vect = tem;
4818 new_phi_result = first_vect;
4819 if (new_vec_stmt)
4821 new_phis.truncate (0);
4822 new_phis.safe_push (new_vec_stmt);
4825 /* Likewise if we couldn't use a single defuse cycle. */
4826 else if (ncopies > 1)
4828 gcc_assert (new_phis.length () == 1);
4829 tree first_vect = PHI_RESULT (new_phis[0]);
4830 gassign *new_vec_stmt = NULL;
4831 vec_dest = vect_create_destination_var (scalar_dest, vectype);
4832 gimple *next_phi = new_phis[0];
4833 for (int k = 1; k < ncopies; ++k)
4835 next_phi = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (next_phi));
4836 tree second_vect = PHI_RESULT (next_phi);
4837 tree tem = make_ssa_name (vec_dest, new_vec_stmt);
4838 new_vec_stmt = gimple_build_assign (tem, code,
4839 first_vect, second_vect);
4840 gsi_insert_before (&exit_gsi, new_vec_stmt, GSI_SAME_STMT);
4841 first_vect = tem;
4843 new_phi_result = first_vect;
4844 new_phis.truncate (0);
4845 new_phis.safe_push (new_vec_stmt);
4847 else
4848 new_phi_result = PHI_RESULT (new_phis[0]);
4850 if (STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info) == COND_REDUCTION
4851 && reduc_fn != IFN_LAST)
4853 /* For condition reductions, we have a vector (NEW_PHI_RESULT) containing
4854 various data values where the condition matched and another vector
4855 (INDUCTION_INDEX) containing all the indexes of those matches. We
4856 need to extract the last matching index (which will be the index with
4857 highest value) and use this to index into the data vector.
4858 For the case where there were no matches, the data vector will contain
4859 all default values and the index vector will be all zeros. */
4861 /* Get various versions of the type of the vector of indexes. */
4862 tree index_vec_type = TREE_TYPE (induction_index);
4863 gcc_checking_assert (TYPE_UNSIGNED (index_vec_type));
4864 tree index_scalar_type = TREE_TYPE (index_vec_type);
4865 tree index_vec_cmp_type = build_same_sized_truth_vector_type
4866 (index_vec_type);
4868 /* Get an unsigned integer version of the type of the data vector. */
4869 int scalar_precision
4870 = GET_MODE_PRECISION (SCALAR_TYPE_MODE (scalar_type));
4871 tree scalar_type_unsigned = make_unsigned_type (scalar_precision);
4872 tree vectype_unsigned = build_vector_type
4873 (scalar_type_unsigned, TYPE_VECTOR_SUBPARTS (vectype));
4875 /* First we need to create a vector (ZERO_VEC) of zeros and another
4876 vector (MAX_INDEX_VEC) filled with the last matching index, which we
4877 can create using a MAX reduction and then expanding.
4878 In the case where the loop never made any matches, the max index will
4879 be zero. */
4881 /* Vector of {0, 0, 0,...}. */
4882 tree zero_vec = make_ssa_name (vectype);
4883 tree zero_vec_rhs = build_zero_cst (vectype);
4884 gimple *zero_vec_stmt = gimple_build_assign (zero_vec, zero_vec_rhs);
4885 gsi_insert_before (&exit_gsi, zero_vec_stmt, GSI_SAME_STMT);
4887 /* Find maximum value from the vector of found indexes. */
4888 tree max_index = make_ssa_name (index_scalar_type);
4889 gcall *max_index_stmt = gimple_build_call_internal (IFN_REDUC_MAX,
4890 1, induction_index);
4891 gimple_call_set_lhs (max_index_stmt, max_index);
4892 gsi_insert_before (&exit_gsi, max_index_stmt, GSI_SAME_STMT);
4894 /* Vector of {max_index, max_index, max_index,...}. */
4895 tree max_index_vec = make_ssa_name (index_vec_type);
4896 tree max_index_vec_rhs = build_vector_from_val (index_vec_type,
4897 max_index);
4898 gimple *max_index_vec_stmt = gimple_build_assign (max_index_vec,
4899 max_index_vec_rhs);
4900 gsi_insert_before (&exit_gsi, max_index_vec_stmt, GSI_SAME_STMT);
4902 /* Next we compare the new vector (MAX_INDEX_VEC) full of max indexes
4903 with the vector (INDUCTION_INDEX) of found indexes, choosing values
4904 from the data vector (NEW_PHI_RESULT) for matches, 0 (ZERO_VEC)
4905 otherwise. Only one value should match, resulting in a vector
4906 (VEC_COND) with one data value and the rest zeros.
4907 In the case where the loop never made any matches, every index will
4908 match, resulting in a vector with all data values (which will all be
4909 the default value). */
4911 /* Compare the max index vector to the vector of found indexes to find
4912 the position of the max value. */
4913 tree vec_compare = make_ssa_name (index_vec_cmp_type);
4914 gimple *vec_compare_stmt = gimple_build_assign (vec_compare, EQ_EXPR,
4915 induction_index,
4916 max_index_vec);
4917 gsi_insert_before (&exit_gsi, vec_compare_stmt, GSI_SAME_STMT);
4919 /* Use the compare to choose either values from the data vector or
4920 zero. */
4921 tree vec_cond = make_ssa_name (vectype);
4922 gimple *vec_cond_stmt = gimple_build_assign (vec_cond, VEC_COND_EXPR,
4923 vec_compare, new_phi_result,
4924 zero_vec);
4925 gsi_insert_before (&exit_gsi, vec_cond_stmt, GSI_SAME_STMT);
4927 /* Finally we need to extract the data value from the vector (VEC_COND)
4928 into a scalar (MATCHED_DATA_REDUC). Logically we want to do a OR
4929 reduction, but because this doesn't exist, we can use a MAX reduction
4930 instead. The data value might be signed or a float so we need to cast
4931 it first.
4932 In the case where the loop never made any matches, the data values are
4933 all identical, and so will reduce down correctly. */
4935 /* Make the matched data values unsigned. */
4936 tree vec_cond_cast = make_ssa_name (vectype_unsigned);
4937 tree vec_cond_cast_rhs = build1 (VIEW_CONVERT_EXPR, vectype_unsigned,
4938 vec_cond);
4939 gimple *vec_cond_cast_stmt = gimple_build_assign (vec_cond_cast,
4940 VIEW_CONVERT_EXPR,
4941 vec_cond_cast_rhs);
4942 gsi_insert_before (&exit_gsi, vec_cond_cast_stmt, GSI_SAME_STMT);
4944 /* Reduce down to a scalar value. */
4945 tree data_reduc = make_ssa_name (scalar_type_unsigned);
4946 gcall *data_reduc_stmt = gimple_build_call_internal (IFN_REDUC_MAX,
4947 1, vec_cond_cast);
4948 gimple_call_set_lhs (data_reduc_stmt, data_reduc);
4949 gsi_insert_before (&exit_gsi, data_reduc_stmt, GSI_SAME_STMT);
4951 /* Convert the reduced value back to the result type and set as the
4952 result. */
4953 gimple_seq stmts = NULL;
4954 new_temp = gimple_build (&stmts, VIEW_CONVERT_EXPR, scalar_type,
4955 data_reduc);
4956 gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
4957 scalar_results.safe_push (new_temp);
4959 else if (STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info) == COND_REDUCTION
4960 && reduc_fn == IFN_LAST)
4962 /* Condition reduction without supported IFN_REDUC_MAX. Generate
4963 idx = 0;
4964 idx_val = induction_index[0];
4965 val = data_reduc[0];
4966 for (idx = 0, val = init, i = 0; i < nelts; ++i)
4967 if (induction_index[i] > idx_val)
4968 val = data_reduc[i], idx_val = induction_index[i];
4969 return val; */
4971 tree data_eltype = TREE_TYPE (TREE_TYPE (new_phi_result));
4972 tree idx_eltype = TREE_TYPE (TREE_TYPE (induction_index));
4973 unsigned HOST_WIDE_INT el_size = tree_to_uhwi (TYPE_SIZE (idx_eltype));
4974 poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (TREE_TYPE (induction_index));
4975 /* Enforced by vectorizable_reduction, which ensures we have target
4976 support before allowing a conditional reduction on variable-length
4977 vectors. */
4978 unsigned HOST_WIDE_INT v_size = el_size * nunits.to_constant ();
4979 tree idx_val = NULL_TREE, val = NULL_TREE;
4980 for (unsigned HOST_WIDE_INT off = 0; off < v_size; off += el_size)
4982 tree old_idx_val = idx_val;
4983 tree old_val = val;
4984 idx_val = make_ssa_name (idx_eltype);
4985 epilog_stmt = gimple_build_assign (idx_val, BIT_FIELD_REF,
4986 build3 (BIT_FIELD_REF, idx_eltype,
4987 induction_index,
4988 bitsize_int (el_size),
4989 bitsize_int (off)));
4990 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
4991 val = make_ssa_name (data_eltype);
4992 epilog_stmt = gimple_build_assign (val, BIT_FIELD_REF,
4993 build3 (BIT_FIELD_REF,
4994 data_eltype,
4995 new_phi_result,
4996 bitsize_int (el_size),
4997 bitsize_int (off)));
4998 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
4999 if (off != 0)
5001 tree new_idx_val = idx_val;
5002 tree new_val = val;
5003 if (off != v_size - el_size)
5005 new_idx_val = make_ssa_name (idx_eltype);
5006 epilog_stmt = gimple_build_assign (new_idx_val,
5007 MAX_EXPR, idx_val,
5008 old_idx_val);
5009 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5011 new_val = make_ssa_name (data_eltype);
5012 epilog_stmt = gimple_build_assign (new_val,
5013 COND_EXPR,
5014 build2 (GT_EXPR,
5015 boolean_type_node,
5016 idx_val,
5017 old_idx_val),
5018 val, old_val);
5019 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5020 idx_val = new_idx_val;
5021 val = new_val;
5024 /* Convert the reduced value back to the result type and set as the
5025 result. */
5026 gimple_seq stmts = NULL;
5027 val = gimple_convert (&stmts, scalar_type, val);
5028 gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
5029 scalar_results.safe_push (val);
5032 /* 2.3 Create the reduction code, using one of the three schemes described
5033 above. In SLP we simply need to extract all the elements from the
5034 vector (without reducing them), so we use scalar shifts. */
5035 else if (reduc_fn != IFN_LAST && !slp_reduc)
5037 tree tmp;
5038 tree vec_elem_type;
5040 /* Case 1: Create:
5041 v_out2 = reduc_expr <v_out1> */
5043 if (dump_enabled_p ())
5044 dump_printf_loc (MSG_NOTE, vect_location,
5045 "Reduce using direct vector reduction.\n");
5047 vec_elem_type = TREE_TYPE (TREE_TYPE (new_phi_result));
5048 if (!useless_type_conversion_p (scalar_type, vec_elem_type))
5050 tree tmp_dest
5051 = vect_create_destination_var (scalar_dest, vec_elem_type);
5052 epilog_stmt = gimple_build_call_internal (reduc_fn, 1,
5053 new_phi_result);
5054 gimple_set_lhs (epilog_stmt, tmp_dest);
5055 new_temp = make_ssa_name (tmp_dest, epilog_stmt);
5056 gimple_set_lhs (epilog_stmt, new_temp);
5057 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5059 epilog_stmt = gimple_build_assign (new_scalar_dest, NOP_EXPR,
5060 new_temp);
5062 else
5064 epilog_stmt = gimple_build_call_internal (reduc_fn, 1,
5065 new_phi_result);
5066 gimple_set_lhs (epilog_stmt, new_scalar_dest);
5069 new_temp = make_ssa_name (new_scalar_dest, epilog_stmt);
5070 gimple_set_lhs (epilog_stmt, new_temp);
5071 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5073 if ((STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info)
5074 == INTEGER_INDUC_COND_REDUCTION)
5075 && !operand_equal_p (initial_def, induc_val, 0))
5077 /* Earlier we set the initial value to be a vector if induc_val
5078 values. Check the result and if it is induc_val then replace
5079 with the original initial value, unless induc_val is
5080 the same as initial_def already. */
5081 tree zcompare = build2 (EQ_EXPR, boolean_type_node, new_temp,
5082 induc_val);
5084 tmp = make_ssa_name (new_scalar_dest);
5085 epilog_stmt = gimple_build_assign (tmp, COND_EXPR, zcompare,
5086 initial_def, new_temp);
5087 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5088 new_temp = tmp;
5091 scalar_results.safe_push (new_temp);
5093 else if (direct_slp_reduc)
5095 /* Here we create one vector for each of the REDUC_GROUP_SIZE results,
5096 with the elements for other SLP statements replaced with the
5097 neutral value. We can then do a normal reduction on each vector. */
5099 /* Enforced by vectorizable_reduction. */
5100 gcc_assert (new_phis.length () == 1);
5101 gcc_assert (pow2p_hwi (group_size));
5103 slp_tree orig_phis_slp_node = slp_node_instance->reduc_phis;
5104 vec<stmt_vec_info> orig_phis
5105 = SLP_TREE_SCALAR_STMTS (orig_phis_slp_node);
5106 gimple_seq seq = NULL;
5108 /* Build a vector {0, 1, 2, ...}, with the same number of elements
5109 and the same element size as VECTYPE. */
5110 tree index = build_index_vector (vectype, 0, 1);
5111 tree index_type = TREE_TYPE (index);
5112 tree index_elt_type = TREE_TYPE (index_type);
5113 tree mask_type = build_same_sized_truth_vector_type (index_type);
5115 /* Create a vector that, for each element, identifies which of
5116 the REDUC_GROUP_SIZE results should use it. */
5117 tree index_mask = build_int_cst (index_elt_type, group_size - 1);
5118 index = gimple_build (&seq, BIT_AND_EXPR, index_type, index,
5119 build_vector_from_val (index_type, index_mask));
5121 /* Get a neutral vector value. This is simply a splat of the neutral
5122 scalar value if we have one, otherwise the initial scalar value
5123 is itself a neutral value. */
5124 tree vector_identity = NULL_TREE;
5125 if (neutral_op)
5126 vector_identity = gimple_build_vector_from_val (&seq, vectype,
5127 neutral_op);
5128 for (unsigned int i = 0; i < group_size; ++i)
5130 /* If there's no univeral neutral value, we can use the
5131 initial scalar value from the original PHI. This is used
5132 for MIN and MAX reduction, for example. */
5133 if (!neutral_op)
5135 tree scalar_value
5136 = PHI_ARG_DEF_FROM_EDGE (orig_phis[i]->stmt,
5137 loop_preheader_edge (loop));
5138 vector_identity = gimple_build_vector_from_val (&seq, vectype,
5139 scalar_value);
5142 /* Calculate the equivalent of:
5144 sel[j] = (index[j] == i);
5146 which selects the elements of NEW_PHI_RESULT that should
5147 be included in the result. */
5148 tree compare_val = build_int_cst (index_elt_type, i);
5149 compare_val = build_vector_from_val (index_type, compare_val);
5150 tree sel = gimple_build (&seq, EQ_EXPR, mask_type,
5151 index, compare_val);
5153 /* Calculate the equivalent of:
5155 vec = seq ? new_phi_result : vector_identity;
5157 VEC is now suitable for a full vector reduction. */
5158 tree vec = gimple_build (&seq, VEC_COND_EXPR, vectype,
5159 sel, new_phi_result, vector_identity);
5161 /* Do the reduction and convert it to the appropriate type. */
5162 tree scalar = gimple_build (&seq, as_combined_fn (reduc_fn),
5163 TREE_TYPE (vectype), vec);
5164 scalar = gimple_convert (&seq, scalar_type, scalar);
5165 scalar_results.safe_push (scalar);
5167 gsi_insert_seq_before (&exit_gsi, seq, GSI_SAME_STMT);
5169 else
5171 bool reduce_with_shift;
5172 tree vec_temp;
5174 /* COND reductions all do the final reduction with MAX_EXPR
5175 or MIN_EXPR. */
5176 if (code == COND_EXPR)
5178 if (STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info)
5179 == INTEGER_INDUC_COND_REDUCTION)
5180 code = induc_code;
5181 else
5182 code = MAX_EXPR;
5185 /* See if the target wants to do the final (shift) reduction
5186 in a vector mode of smaller size and first reduce upper/lower
5187 halves against each other. */
5188 enum machine_mode mode1 = mode;
5189 tree vectype1 = vectype;
5190 unsigned sz = tree_to_uhwi (TYPE_SIZE_UNIT (vectype));
5191 unsigned sz1 = sz;
5192 if (!slp_reduc
5193 && (mode1 = targetm.vectorize.split_reduction (mode)) != mode)
5194 sz1 = GET_MODE_SIZE (mode1).to_constant ();
5196 vectype1 = get_vectype_for_scalar_type_and_size (scalar_type, sz1);
5197 reduce_with_shift = have_whole_vector_shift (mode1);
5198 if (!VECTOR_MODE_P (mode1))
5199 reduce_with_shift = false;
5200 else
5202 optab optab = optab_for_tree_code (code, vectype1, optab_default);
5203 if (optab_handler (optab, mode1) == CODE_FOR_nothing)
5204 reduce_with_shift = false;
5207 /* First reduce the vector to the desired vector size we should
5208 do shift reduction on by combining upper and lower halves. */
5209 new_temp = new_phi_result;
5210 while (sz > sz1)
5212 gcc_assert (!slp_reduc);
5213 sz /= 2;
5214 vectype1 = get_vectype_for_scalar_type_and_size (scalar_type, sz);
5216 /* The target has to make sure we support lowpart/highpart
5217 extraction, either via direct vector extract or through
5218 an integer mode punning. */
5219 tree dst1, dst2;
5220 if (convert_optab_handler (vec_extract_optab,
5221 TYPE_MODE (TREE_TYPE (new_temp)),
5222 TYPE_MODE (vectype1))
5223 != CODE_FOR_nothing)
5225 /* Extract sub-vectors directly once vec_extract becomes
5226 a conversion optab. */
5227 dst1 = make_ssa_name (vectype1);
5228 epilog_stmt
5229 = gimple_build_assign (dst1, BIT_FIELD_REF,
5230 build3 (BIT_FIELD_REF, vectype1,
5231 new_temp, TYPE_SIZE (vectype1),
5232 bitsize_int (0)));
5233 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5234 dst2 = make_ssa_name (vectype1);
5235 epilog_stmt
5236 = gimple_build_assign (dst2, BIT_FIELD_REF,
5237 build3 (BIT_FIELD_REF, vectype1,
5238 new_temp, TYPE_SIZE (vectype1),
5239 bitsize_int (sz * BITS_PER_UNIT)));
5240 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5242 else
5244 /* Extract via punning to appropriately sized integer mode
5245 vector. */
5246 tree eltype = build_nonstandard_integer_type (sz * BITS_PER_UNIT,
5248 tree etype = build_vector_type (eltype, 2);
5249 gcc_assert (convert_optab_handler (vec_extract_optab,
5250 TYPE_MODE (etype),
5251 TYPE_MODE (eltype))
5252 != CODE_FOR_nothing);
5253 tree tem = make_ssa_name (etype);
5254 epilog_stmt = gimple_build_assign (tem, VIEW_CONVERT_EXPR,
5255 build1 (VIEW_CONVERT_EXPR,
5256 etype, new_temp));
5257 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5258 new_temp = tem;
5259 tem = make_ssa_name (eltype);
5260 epilog_stmt
5261 = gimple_build_assign (tem, BIT_FIELD_REF,
5262 build3 (BIT_FIELD_REF, eltype,
5263 new_temp, TYPE_SIZE (eltype),
5264 bitsize_int (0)));
5265 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5266 dst1 = make_ssa_name (vectype1);
5267 epilog_stmt = gimple_build_assign (dst1, VIEW_CONVERT_EXPR,
5268 build1 (VIEW_CONVERT_EXPR,
5269 vectype1, tem));
5270 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5271 tem = make_ssa_name (eltype);
5272 epilog_stmt
5273 = gimple_build_assign (tem, BIT_FIELD_REF,
5274 build3 (BIT_FIELD_REF, eltype,
5275 new_temp, TYPE_SIZE (eltype),
5276 bitsize_int (sz * BITS_PER_UNIT)));
5277 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5278 dst2 = make_ssa_name (vectype1);
5279 epilog_stmt = gimple_build_assign (dst2, VIEW_CONVERT_EXPR,
5280 build1 (VIEW_CONVERT_EXPR,
5281 vectype1, tem));
5282 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5285 new_temp = make_ssa_name (vectype1);
5286 epilog_stmt = gimple_build_assign (new_temp, code, dst1, dst2);
5287 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5290 if (reduce_with_shift && !slp_reduc)
5292 int element_bitsize = tree_to_uhwi (bitsize);
5293 /* Enforced by vectorizable_reduction, which disallows SLP reductions
5294 for variable-length vectors and also requires direct target support
5295 for loop reductions. */
5296 int vec_size_in_bits = tree_to_uhwi (TYPE_SIZE (vectype1));
5297 int nelements = vec_size_in_bits / element_bitsize;
5298 vec_perm_builder sel;
5299 vec_perm_indices indices;
5301 int elt_offset;
5303 tree zero_vec = build_zero_cst (vectype1);
5304 /* Case 2: Create:
5305 for (offset = nelements/2; offset >= 1; offset/=2)
5307 Create: va' = vec_shift <va, offset>
5308 Create: va = vop <va, va'>
5309 } */
5311 tree rhs;
5313 if (dump_enabled_p ())
5314 dump_printf_loc (MSG_NOTE, vect_location,
5315 "Reduce using vector shifts\n");
5317 mode1 = TYPE_MODE (vectype1);
5318 vec_dest = vect_create_destination_var (scalar_dest, vectype1);
5319 for (elt_offset = nelements / 2;
5320 elt_offset >= 1;
5321 elt_offset /= 2)
5323 calc_vec_perm_mask_for_shift (elt_offset, nelements, &sel);
5324 indices.new_vector (sel, 2, nelements);
5325 tree mask = vect_gen_perm_mask_any (vectype1, indices);
5326 epilog_stmt = gimple_build_assign (vec_dest, VEC_PERM_EXPR,
5327 new_temp, zero_vec, mask);
5328 new_name = make_ssa_name (vec_dest, epilog_stmt);
5329 gimple_assign_set_lhs (epilog_stmt, new_name);
5330 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5332 epilog_stmt = gimple_build_assign (vec_dest, code, new_name,
5333 new_temp);
5334 new_temp = make_ssa_name (vec_dest, epilog_stmt);
5335 gimple_assign_set_lhs (epilog_stmt, new_temp);
5336 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5339 /* 2.4 Extract the final scalar result. Create:
5340 s_out3 = extract_field <v_out2, bitpos> */
5342 if (dump_enabled_p ())
5343 dump_printf_loc (MSG_NOTE, vect_location,
5344 "extract scalar result\n");
5346 rhs = build3 (BIT_FIELD_REF, scalar_type, new_temp,
5347 bitsize, bitsize_zero_node);
5348 epilog_stmt = gimple_build_assign (new_scalar_dest, rhs);
5349 new_temp = make_ssa_name (new_scalar_dest, epilog_stmt);
5350 gimple_assign_set_lhs (epilog_stmt, new_temp);
5351 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5352 scalar_results.safe_push (new_temp);
5354 else
5356 /* Case 3: Create:
5357 s = extract_field <v_out2, 0>
5358 for (offset = element_size;
5359 offset < vector_size;
5360 offset += element_size;)
5362 Create: s' = extract_field <v_out2, offset>
5363 Create: s = op <s, s'> // For non SLP cases
5364 } */
5366 if (dump_enabled_p ())
5367 dump_printf_loc (MSG_NOTE, vect_location,
5368 "Reduce using scalar code.\n");
5370 int vec_size_in_bits = tree_to_uhwi (TYPE_SIZE (vectype1));
5371 int element_bitsize = tree_to_uhwi (bitsize);
5372 FOR_EACH_VEC_ELT (new_phis, i, new_phi)
5374 int bit_offset;
5375 if (gimple_code (new_phi) == GIMPLE_PHI)
5376 vec_temp = PHI_RESULT (new_phi);
5377 else
5378 vec_temp = gimple_assign_lhs (new_phi);
5379 tree rhs = build3 (BIT_FIELD_REF, scalar_type, vec_temp, bitsize,
5380 bitsize_zero_node);
5381 epilog_stmt = gimple_build_assign (new_scalar_dest, rhs);
5382 new_temp = make_ssa_name (new_scalar_dest, epilog_stmt);
5383 gimple_assign_set_lhs (epilog_stmt, new_temp);
5384 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5386 /* In SLP we don't need to apply reduction operation, so we just
5387 collect s' values in SCALAR_RESULTS. */
5388 if (slp_reduc)
5389 scalar_results.safe_push (new_temp);
5391 for (bit_offset = element_bitsize;
5392 bit_offset < vec_size_in_bits;
5393 bit_offset += element_bitsize)
5395 tree bitpos = bitsize_int (bit_offset);
5396 tree rhs = build3 (BIT_FIELD_REF, scalar_type, vec_temp,
5397 bitsize, bitpos);
5399 epilog_stmt = gimple_build_assign (new_scalar_dest, rhs);
5400 new_name = make_ssa_name (new_scalar_dest, epilog_stmt);
5401 gimple_assign_set_lhs (epilog_stmt, new_name);
5402 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5404 if (slp_reduc)
5406 /* In SLP we don't need to apply reduction operation, so
5407 we just collect s' values in SCALAR_RESULTS. */
5408 new_temp = new_name;
5409 scalar_results.safe_push (new_name);
5411 else
5413 epilog_stmt = gimple_build_assign (new_scalar_dest, code,
5414 new_name, new_temp);
5415 new_temp = make_ssa_name (new_scalar_dest, epilog_stmt);
5416 gimple_assign_set_lhs (epilog_stmt, new_temp);
5417 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5422 /* The only case where we need to reduce scalar results in SLP, is
5423 unrolling. If the size of SCALAR_RESULTS is greater than
5424 REDUC_GROUP_SIZE, we reduce them combining elements modulo
5425 REDUC_GROUP_SIZE. */
5426 if (slp_reduc)
5428 tree res, first_res, new_res;
5429 gimple *new_stmt;
5431 /* Reduce multiple scalar results in case of SLP unrolling. */
5432 for (j = group_size; scalar_results.iterate (j, &res);
5433 j++)
5435 first_res = scalar_results[j % group_size];
5436 new_stmt = gimple_build_assign (new_scalar_dest, code,
5437 first_res, res);
5438 new_res = make_ssa_name (new_scalar_dest, new_stmt);
5439 gimple_assign_set_lhs (new_stmt, new_res);
5440 gsi_insert_before (&exit_gsi, new_stmt, GSI_SAME_STMT);
5441 scalar_results[j % group_size] = new_res;
5444 else
5445 /* Not SLP - we have one scalar to keep in SCALAR_RESULTS. */
5446 scalar_results.safe_push (new_temp);
5449 if ((STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info)
5450 == INTEGER_INDUC_COND_REDUCTION)
5451 && !operand_equal_p (initial_def, induc_val, 0))
5453 /* Earlier we set the initial value to be a vector if induc_val
5454 values. Check the result and if it is induc_val then replace
5455 with the original initial value, unless induc_val is
5456 the same as initial_def already. */
5457 tree zcompare = build2 (EQ_EXPR, boolean_type_node, new_temp,
5458 induc_val);
5460 tree tmp = make_ssa_name (new_scalar_dest);
5461 epilog_stmt = gimple_build_assign (tmp, COND_EXPR, zcompare,
5462 initial_def, new_temp);
5463 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5464 scalar_results[0] = tmp;
5468 vect_finalize_reduction:
5470 if (double_reduc)
5471 loop = loop->inner;
5473 /* 2.5 Adjust the final result by the initial value of the reduction
5474 variable. (When such adjustment is not needed, then
5475 'adjustment_def' is zero). For example, if code is PLUS we create:
5476 new_temp = loop_exit_def + adjustment_def */
5478 if (adjustment_def)
5480 gcc_assert (!slp_reduc);
5481 if (nested_in_vect_loop)
5483 new_phi = new_phis[0];
5484 gcc_assert (TREE_CODE (TREE_TYPE (adjustment_def)) == VECTOR_TYPE);
5485 expr = build2 (code, vectype, PHI_RESULT (new_phi), adjustment_def);
5486 new_dest = vect_create_destination_var (scalar_dest, vectype);
5488 else
5490 new_temp = scalar_results[0];
5491 gcc_assert (TREE_CODE (TREE_TYPE (adjustment_def)) != VECTOR_TYPE);
5492 expr = build2 (code, scalar_type, new_temp, adjustment_def);
5493 new_dest = vect_create_destination_var (scalar_dest, scalar_type);
5496 epilog_stmt = gimple_build_assign (new_dest, expr);
5497 new_temp = make_ssa_name (new_dest, epilog_stmt);
5498 gimple_assign_set_lhs (epilog_stmt, new_temp);
5499 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5500 if (nested_in_vect_loop)
5502 stmt_vec_info epilog_stmt_info = loop_vinfo->add_stmt (epilog_stmt);
5503 STMT_VINFO_RELATED_STMT (epilog_stmt_info)
5504 = STMT_VINFO_RELATED_STMT (loop_vinfo->lookup_stmt (new_phi));
5506 if (!double_reduc)
5507 scalar_results.quick_push (new_temp);
5508 else
5509 scalar_results[0] = new_temp;
5511 else
5512 scalar_results[0] = new_temp;
5514 new_phis[0] = epilog_stmt;
5517 /* 2.6 Handle the loop-exit phis. Replace the uses of scalar loop-exit
5518 phis with new adjusted scalar results, i.e., replace use <s_out0>
5519 with use <s_out4>.
5521 Transform:
5522 loop_exit:
5523 s_out0 = phi <s_loop> # (scalar) EXIT_PHI
5524 v_out1 = phi <VECT_DEF> # NEW_EXIT_PHI
5525 v_out2 = reduce <v_out1>
5526 s_out3 = extract_field <v_out2, 0>
5527 s_out4 = adjust_result <s_out3>
5528 use <s_out0>
5529 use <s_out0>
5531 into:
5533 loop_exit:
5534 s_out0 = phi <s_loop> # (scalar) EXIT_PHI
5535 v_out1 = phi <VECT_DEF> # NEW_EXIT_PHI
5536 v_out2 = reduce <v_out1>
5537 s_out3 = extract_field <v_out2, 0>
5538 s_out4 = adjust_result <s_out3>
5539 use <s_out4>
5540 use <s_out4> */
5543 /* In SLP reduction chain we reduce vector results into one vector if
5544 necessary, hence we set here REDUC_GROUP_SIZE to 1. SCALAR_DEST is the
5545 LHS of the last stmt in the reduction chain, since we are looking for
5546 the loop exit phi node. */
5547 if (REDUC_GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)))
5549 stmt_vec_info dest_stmt_info
5550 = SLP_TREE_SCALAR_STMTS (slp_node)[group_size - 1];
5551 /* Handle reduction patterns. */
5552 if (STMT_VINFO_RELATED_STMT (dest_stmt_info))
5553 dest_stmt_info = STMT_VINFO_RELATED_STMT (dest_stmt_info);
5555 scalar_dest = gimple_assign_lhs (dest_stmt_info->stmt);
5556 group_size = 1;
5559 /* In SLP we may have several statements in NEW_PHIS and REDUCTION_PHIS (in
5560 case that REDUC_GROUP_SIZE is greater than vectorization factor).
5561 Therefore, we need to match SCALAR_RESULTS with corresponding statements.
5562 The first (REDUC_GROUP_SIZE / number of new vector stmts) scalar results
5563 correspond to the first vector stmt, etc.
5564 (RATIO is equal to (REDUC_GROUP_SIZE / number of new vector stmts)). */
5565 if (group_size > new_phis.length ())
5567 ratio = group_size / new_phis.length ();
5568 gcc_assert (!(group_size % new_phis.length ()));
5570 else
5571 ratio = 1;
5573 for (k = 0; k < group_size; k++)
5575 if (k % ratio == 0)
5577 epilog_stmt = new_phis[k / ratio];
5578 reduction_phi_info = reduction_phis[k / ratio];
5579 if (double_reduc)
5580 inner_phi = inner_phis[k / ratio];
5583 if (slp_reduc)
5585 stmt_vec_info scalar_stmt_info = SLP_TREE_SCALAR_STMTS (slp_node)[k];
5587 orig_stmt_info = STMT_VINFO_RELATED_STMT (scalar_stmt_info);
5588 /* SLP statements can't participate in patterns. */
5589 gcc_assert (!orig_stmt_info);
5590 scalar_dest = gimple_assign_lhs (scalar_stmt_info->stmt);
5593 phis.create (3);
5594 /* Find the loop-closed-use at the loop exit of the original scalar
5595 result. (The reduction result is expected to have two immediate uses -
5596 one at the latch block, and one at the loop exit). */
5597 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, scalar_dest)
5598 if (!flow_bb_inside_loop_p (loop, gimple_bb (USE_STMT (use_p)))
5599 && !is_gimple_debug (USE_STMT (use_p)))
5600 phis.safe_push (USE_STMT (use_p));
5602 /* While we expect to have found an exit_phi because of loop-closed-ssa
5603 form we can end up without one if the scalar cycle is dead. */
5605 FOR_EACH_VEC_ELT (phis, i, exit_phi)
5607 if (outer_loop)
5609 stmt_vec_info exit_phi_vinfo
5610 = loop_vinfo->lookup_stmt (exit_phi);
5611 gphi *vect_phi;
5613 /* FORNOW. Currently not supporting the case that an inner-loop
5614 reduction is not used in the outer-loop (but only outside the
5615 outer-loop), unless it is double reduction. */
5616 gcc_assert ((STMT_VINFO_RELEVANT_P (exit_phi_vinfo)
5617 && !STMT_VINFO_LIVE_P (exit_phi_vinfo))
5618 || double_reduc);
5620 if (double_reduc)
5621 STMT_VINFO_VEC_STMT (exit_phi_vinfo) = inner_phi;
5622 else
5623 STMT_VINFO_VEC_STMT (exit_phi_vinfo)
5624 = vinfo_for_stmt (epilog_stmt);
5625 if (!double_reduc
5626 || STMT_VINFO_DEF_TYPE (exit_phi_vinfo)
5627 != vect_double_reduction_def)
5628 continue;
5630 /* Handle double reduction:
5632 stmt1: s1 = phi <s0, s2> - double reduction phi (outer loop)
5633 stmt2: s3 = phi <s1, s4> - (regular) reduc phi (inner loop)
5634 stmt3: s4 = use (s3) - (regular) reduc stmt (inner loop)
5635 stmt4: s2 = phi <s4> - double reduction stmt (outer loop)
5637 At that point the regular reduction (stmt2 and stmt3) is
5638 already vectorized, as well as the exit phi node, stmt4.
5639 Here we vectorize the phi node of double reduction, stmt1, and
5640 update all relevant statements. */
5642 /* Go through all the uses of s2 to find double reduction phi
5643 node, i.e., stmt1 above. */
5644 orig_name = PHI_RESULT (exit_phi);
5645 FOR_EACH_IMM_USE_STMT (use_stmt, imm_iter, orig_name)
5647 stmt_vec_info use_stmt_vinfo;
5648 tree vect_phi_init, preheader_arg, vect_phi_res;
5649 basic_block bb = gimple_bb (use_stmt);
5651 /* Check that USE_STMT is really double reduction phi
5652 node. */
5653 if (gimple_code (use_stmt) != GIMPLE_PHI
5654 || gimple_phi_num_args (use_stmt) != 2
5655 || bb->loop_father != outer_loop)
5656 continue;
5657 use_stmt_vinfo = loop_vinfo->lookup_stmt (use_stmt);
5658 if (!use_stmt_vinfo
5659 || STMT_VINFO_DEF_TYPE (use_stmt_vinfo)
5660 != vect_double_reduction_def)
5661 continue;
5663 /* Create vector phi node for double reduction:
5664 vs1 = phi <vs0, vs2>
5665 vs1 was created previously in this function by a call to
5666 vect_get_vec_def_for_operand and is stored in
5667 vec_initial_def;
5668 vs2 is defined by INNER_PHI, the vectorized EXIT_PHI;
5669 vs0 is created here. */
5671 /* Create vector phi node. */
5672 vect_phi = create_phi_node (vec_initial_def, bb);
5673 loop_vec_info_for_loop (outer_loop)->add_stmt (vect_phi);
5675 /* Create vs0 - initial def of the double reduction phi. */
5676 preheader_arg = PHI_ARG_DEF_FROM_EDGE (use_stmt,
5677 loop_preheader_edge (outer_loop));
5678 vect_phi_init = get_initial_def_for_reduction
5679 (stmt, preheader_arg, NULL);
5681 /* Update phi node arguments with vs0 and vs2. */
5682 add_phi_arg (vect_phi, vect_phi_init,
5683 loop_preheader_edge (outer_loop),
5684 UNKNOWN_LOCATION);
5685 add_phi_arg (vect_phi, PHI_RESULT (inner_phi->stmt),
5686 loop_latch_edge (outer_loop), UNKNOWN_LOCATION);
5687 if (dump_enabled_p ())
5689 dump_printf_loc (MSG_NOTE, vect_location,
5690 "created double reduction phi node: ");
5691 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, vect_phi, 0);
5694 vect_phi_res = PHI_RESULT (vect_phi);
5696 /* Replace the use, i.e., set the correct vs1 in the regular
5697 reduction phi node. FORNOW, NCOPIES is always 1, so the
5698 loop is redundant. */
5699 stmt_vec_info use_info = reduction_phi_info;
5700 for (j = 0; j < ncopies; j++)
5702 edge pr_edge = loop_preheader_edge (loop);
5703 SET_PHI_ARG_DEF (as_a <gphi *> (use_info->stmt),
5704 pr_edge->dest_idx, vect_phi_res);
5705 use_info = STMT_VINFO_RELATED_STMT (use_info);
5711 phis.release ();
5712 if (nested_in_vect_loop)
5714 if (double_reduc)
5715 loop = outer_loop;
5716 else
5717 continue;
5720 phis.create (3);
5721 /* Find the loop-closed-use at the loop exit of the original scalar
5722 result. (The reduction result is expected to have two immediate uses,
5723 one at the latch block, and one at the loop exit). For double
5724 reductions we are looking for exit phis of the outer loop. */
5725 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, scalar_dest)
5727 if (!flow_bb_inside_loop_p (loop, gimple_bb (USE_STMT (use_p))))
5729 if (!is_gimple_debug (USE_STMT (use_p)))
5730 phis.safe_push (USE_STMT (use_p));
5732 else
5734 if (double_reduc && gimple_code (USE_STMT (use_p)) == GIMPLE_PHI)
5736 tree phi_res = PHI_RESULT (USE_STMT (use_p));
5738 FOR_EACH_IMM_USE_FAST (phi_use_p, phi_imm_iter, phi_res)
5740 if (!flow_bb_inside_loop_p (loop,
5741 gimple_bb (USE_STMT (phi_use_p)))
5742 && !is_gimple_debug (USE_STMT (phi_use_p)))
5743 phis.safe_push (USE_STMT (phi_use_p));
5749 FOR_EACH_VEC_ELT (phis, i, exit_phi)
5751 /* Replace the uses: */
5752 orig_name = PHI_RESULT (exit_phi);
5753 scalar_result = scalar_results[k];
5754 FOR_EACH_IMM_USE_STMT (use_stmt, imm_iter, orig_name)
5755 FOR_EACH_IMM_USE_ON_STMT (use_p, imm_iter)
5756 SET_USE (use_p, scalar_result);
5759 phis.release ();
5763 /* Return a vector of type VECTYPE that is equal to the vector select
5764 operation "MASK ? VEC : IDENTITY". Insert the select statements
5765 before GSI. */
5767 static tree
5768 merge_with_identity (gimple_stmt_iterator *gsi, tree mask, tree vectype,
5769 tree vec, tree identity)
5771 tree cond = make_temp_ssa_name (vectype, NULL, "cond");
5772 gimple *new_stmt = gimple_build_assign (cond, VEC_COND_EXPR,
5773 mask, vec, identity);
5774 gsi_insert_before (gsi, new_stmt, GSI_SAME_STMT);
5775 return cond;
5778 /* Successively apply CODE to each element of VECTOR_RHS, in left-to-right
5779 order, starting with LHS. Insert the extraction statements before GSI and
5780 associate the new scalar SSA names with variable SCALAR_DEST.
5781 Return the SSA name for the result. */
5783 static tree
5784 vect_expand_fold_left (gimple_stmt_iterator *gsi, tree scalar_dest,
5785 tree_code code, tree lhs, tree vector_rhs)
5787 tree vectype = TREE_TYPE (vector_rhs);
5788 tree scalar_type = TREE_TYPE (vectype);
5789 tree bitsize = TYPE_SIZE (scalar_type);
5790 unsigned HOST_WIDE_INT vec_size_in_bits = tree_to_uhwi (TYPE_SIZE (vectype));
5791 unsigned HOST_WIDE_INT element_bitsize = tree_to_uhwi (bitsize);
5793 for (unsigned HOST_WIDE_INT bit_offset = 0;
5794 bit_offset < vec_size_in_bits;
5795 bit_offset += element_bitsize)
5797 tree bitpos = bitsize_int (bit_offset);
5798 tree rhs = build3 (BIT_FIELD_REF, scalar_type, vector_rhs,
5799 bitsize, bitpos);
5801 gassign *stmt = gimple_build_assign (scalar_dest, rhs);
5802 rhs = make_ssa_name (scalar_dest, stmt);
5803 gimple_assign_set_lhs (stmt, rhs);
5804 gsi_insert_before (gsi, stmt, GSI_SAME_STMT);
5806 stmt = gimple_build_assign (scalar_dest, code, lhs, rhs);
5807 tree new_name = make_ssa_name (scalar_dest, stmt);
5808 gimple_assign_set_lhs (stmt, new_name);
5809 gsi_insert_before (gsi, stmt, GSI_SAME_STMT);
5810 lhs = new_name;
5812 return lhs;
5815 /* Perform an in-order reduction (FOLD_LEFT_REDUCTION). STMT is the
5816 statement that sets the live-out value. REDUC_DEF_STMT is the phi
5817 statement. CODE is the operation performed by STMT and OPS are
5818 its scalar operands. REDUC_INDEX is the index of the operand in
5819 OPS that is set by REDUC_DEF_STMT. REDUC_FN is the function that
5820 implements in-order reduction, or IFN_LAST if we should open-code it.
5821 VECTYPE_IN is the type of the vector input. MASKS specifies the masks
5822 that should be used to control the operation in a fully-masked loop. */
5824 static bool
5825 vectorize_fold_left_reduction (gimple *stmt, gimple_stmt_iterator *gsi,
5826 stmt_vec_info *vec_stmt, slp_tree slp_node,
5827 gimple *reduc_def_stmt,
5828 tree_code code, internal_fn reduc_fn,
5829 tree ops[3], tree vectype_in,
5830 int reduc_index, vec_loop_masks *masks)
5832 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
5833 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
5834 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
5835 tree vectype_out = STMT_VINFO_VECTYPE (stmt_info);
5836 stmt_vec_info new_stmt_info = NULL;
5838 int ncopies;
5839 if (slp_node)
5840 ncopies = 1;
5841 else
5842 ncopies = vect_get_num_copies (loop_vinfo, vectype_in);
5844 gcc_assert (!nested_in_vect_loop_p (loop, stmt));
5845 gcc_assert (ncopies == 1);
5846 gcc_assert (TREE_CODE_LENGTH (code) == binary_op);
5847 gcc_assert (reduc_index == (code == MINUS_EXPR ? 0 : 1));
5848 gcc_assert (STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info)
5849 == FOLD_LEFT_REDUCTION);
5851 if (slp_node)
5852 gcc_assert (known_eq (TYPE_VECTOR_SUBPARTS (vectype_out),
5853 TYPE_VECTOR_SUBPARTS (vectype_in)));
5855 tree op0 = ops[1 - reduc_index];
5857 int group_size = 1;
5858 stmt_vec_info scalar_dest_def_info;
5859 auto_vec<tree> vec_oprnds0;
5860 if (slp_node)
5862 vect_get_vec_defs (op0, NULL_TREE, stmt, &vec_oprnds0, NULL, slp_node);
5863 group_size = SLP_TREE_SCALAR_STMTS (slp_node).length ();
5864 scalar_dest_def_info = SLP_TREE_SCALAR_STMTS (slp_node)[group_size - 1];
5866 else
5868 tree loop_vec_def0 = vect_get_vec_def_for_operand (op0, stmt);
5869 vec_oprnds0.create (1);
5870 vec_oprnds0.quick_push (loop_vec_def0);
5871 scalar_dest_def_info = stmt_info;
5874 tree scalar_dest = gimple_assign_lhs (scalar_dest_def_info->stmt);
5875 tree scalar_type = TREE_TYPE (scalar_dest);
5876 tree reduc_var = gimple_phi_result (reduc_def_stmt);
5878 int vec_num = vec_oprnds0.length ();
5879 gcc_assert (vec_num == 1 || slp_node);
5880 tree vec_elem_type = TREE_TYPE (vectype_out);
5881 gcc_checking_assert (useless_type_conversion_p (scalar_type, vec_elem_type));
5883 tree vector_identity = NULL_TREE;
5884 if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
5885 vector_identity = build_zero_cst (vectype_out);
5887 tree scalar_dest_var = vect_create_destination_var (scalar_dest, NULL);
5888 int i;
5889 tree def0;
5890 FOR_EACH_VEC_ELT (vec_oprnds0, i, def0)
5892 gimple *new_stmt;
5893 tree mask = NULL_TREE;
5894 if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
5895 mask = vect_get_loop_mask (gsi, masks, vec_num, vectype_in, i);
5897 /* Handle MINUS by adding the negative. */
5898 if (reduc_fn != IFN_LAST && code == MINUS_EXPR)
5900 tree negated = make_ssa_name (vectype_out);
5901 new_stmt = gimple_build_assign (negated, NEGATE_EXPR, def0);
5902 gsi_insert_before (gsi, new_stmt, GSI_SAME_STMT);
5903 def0 = negated;
5906 if (mask)
5907 def0 = merge_with_identity (gsi, mask, vectype_out, def0,
5908 vector_identity);
5910 /* On the first iteration the input is simply the scalar phi
5911 result, and for subsequent iterations it is the output of
5912 the preceding operation. */
5913 if (reduc_fn != IFN_LAST)
5915 new_stmt = gimple_build_call_internal (reduc_fn, 2, reduc_var, def0);
5916 /* For chained SLP reductions the output of the previous reduction
5917 operation serves as the input of the next. For the final statement
5918 the output cannot be a temporary - we reuse the original
5919 scalar destination of the last statement. */
5920 if (i != vec_num - 1)
5922 gimple_set_lhs (new_stmt, scalar_dest_var);
5923 reduc_var = make_ssa_name (scalar_dest_var, new_stmt);
5924 gimple_set_lhs (new_stmt, reduc_var);
5927 else
5929 reduc_var = vect_expand_fold_left (gsi, scalar_dest_var, code,
5930 reduc_var, def0);
5931 new_stmt = SSA_NAME_DEF_STMT (reduc_var);
5932 /* Remove the statement, so that we can use the same code paths
5933 as for statements that we've just created. */
5934 gimple_stmt_iterator tmp_gsi = gsi_for_stmt (new_stmt);
5935 gsi_remove (&tmp_gsi, false);
5938 if (i == vec_num - 1)
5940 gimple_set_lhs (new_stmt, scalar_dest);
5941 new_stmt_info = vect_finish_replace_stmt (scalar_dest_def_info,
5942 new_stmt);
5944 else
5945 new_stmt_info = vect_finish_stmt_generation (scalar_dest_def_info,
5946 new_stmt, gsi);
5948 if (slp_node)
5949 SLP_TREE_VEC_STMTS (slp_node).quick_push (new_stmt_info);
5952 if (!slp_node)
5953 STMT_VINFO_VEC_STMT (stmt_info) = *vec_stmt = new_stmt_info;
5955 return true;
5958 /* Function is_nonwrapping_integer_induction.
5960 Check if STMT (which is part of loop LOOP) both increments and
5961 does not cause overflow. */
5963 static bool
5964 is_nonwrapping_integer_induction (gimple *stmt, struct loop *loop)
5966 stmt_vec_info stmt_vinfo = vinfo_for_stmt (stmt);
5967 tree base = STMT_VINFO_LOOP_PHI_EVOLUTION_BASE_UNCHANGED (stmt_vinfo);
5968 tree step = STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_vinfo);
5969 tree lhs_type = TREE_TYPE (gimple_phi_result (stmt));
5970 widest_int ni, max_loop_value, lhs_max;
5971 wi::overflow_type overflow = wi::OVF_NONE;
5973 /* Make sure the loop is integer based. */
5974 if (TREE_CODE (base) != INTEGER_CST
5975 || TREE_CODE (step) != INTEGER_CST)
5976 return false;
5978 /* Check that the max size of the loop will not wrap. */
5980 if (TYPE_OVERFLOW_UNDEFINED (lhs_type))
5981 return true;
5983 if (! max_stmt_executions (loop, &ni))
5984 return false;
5986 max_loop_value = wi::mul (wi::to_widest (step), ni, TYPE_SIGN (lhs_type),
5987 &overflow);
5988 if (overflow)
5989 return false;
5991 max_loop_value = wi::add (wi::to_widest (base), max_loop_value,
5992 TYPE_SIGN (lhs_type), &overflow);
5993 if (overflow)
5994 return false;
5996 return (wi::min_precision (max_loop_value, TYPE_SIGN (lhs_type))
5997 <= TYPE_PRECISION (lhs_type));
6000 /* Function vectorizable_reduction.
6002 Check if STMT performs a reduction operation that can be vectorized.
6003 If VEC_STMT is also passed, vectorize the STMT: create a vectorized
6004 stmt to replace it, put it in VEC_STMT, and insert it at GSI.
6005 Return FALSE if not a vectorizable STMT, TRUE otherwise.
6007 This function also handles reduction idioms (patterns) that have been
6008 recognized in advance during vect_pattern_recog. In this case, STMT may be
6009 of this form:
6010 X = pattern_expr (arg0, arg1, ..., X)
6011 and it's STMT_VINFO_RELATED_STMT points to the last stmt in the original
6012 sequence that had been detected and replaced by the pattern-stmt (STMT).
6014 This function also handles reduction of condition expressions, for example:
6015 for (int i = 0; i < N; i++)
6016 if (a[i] < value)
6017 last = a[i];
6018 This is handled by vectorising the loop and creating an additional vector
6019 containing the loop indexes for which "a[i] < value" was true. In the
6020 function epilogue this is reduced to a single max value and then used to
6021 index into the vector of results.
6023 In some cases of reduction patterns, the type of the reduction variable X is
6024 different than the type of the other arguments of STMT.
6025 In such cases, the vectype that is used when transforming STMT into a vector
6026 stmt is different than the vectype that is used to determine the
6027 vectorization factor, because it consists of a different number of elements
6028 than the actual number of elements that are being operated upon in parallel.
6030 For example, consider an accumulation of shorts into an int accumulator.
6031 On some targets it's possible to vectorize this pattern operating on 8
6032 shorts at a time (hence, the vectype for purposes of determining the
6033 vectorization factor should be V8HI); on the other hand, the vectype that
6034 is used to create the vector form is actually V4SI (the type of the result).
6036 Upon entry to this function, STMT_VINFO_VECTYPE records the vectype that
6037 indicates what is the actual level of parallelism (V8HI in the example), so
6038 that the right vectorization factor would be derived. This vectype
6039 corresponds to the type of arguments to the reduction stmt, and should *NOT*
6040 be used to create the vectorized stmt. The right vectype for the vectorized
6041 stmt is obtained from the type of the result X:
6042 get_vectype_for_scalar_type (TREE_TYPE (X))
6044 This means that, contrary to "regular" reductions (or "regular" stmts in
6045 general), the following equation:
6046 STMT_VINFO_VECTYPE == get_vectype_for_scalar_type (TREE_TYPE (X))
6047 does *NOT* necessarily hold for reduction patterns. */
6049 bool
6050 vectorizable_reduction (gimple *stmt, gimple_stmt_iterator *gsi,
6051 stmt_vec_info *vec_stmt, slp_tree slp_node,
6052 slp_instance slp_node_instance,
6053 stmt_vector_for_cost *cost_vec)
6055 tree vec_dest;
6056 tree scalar_dest;
6057 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
6058 tree vectype_out = STMT_VINFO_VECTYPE (stmt_info);
6059 tree vectype_in = NULL_TREE;
6060 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
6061 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
6062 enum tree_code code, orig_code;
6063 internal_fn reduc_fn;
6064 machine_mode vec_mode;
6065 int op_type;
6066 optab optab;
6067 tree new_temp = NULL_TREE;
6068 enum vect_def_type dt, cond_reduc_dt = vect_unknown_def_type;
6069 gimple *cond_reduc_def_stmt = NULL;
6070 enum tree_code cond_reduc_op_code = ERROR_MARK;
6071 tree scalar_type;
6072 bool is_simple_use;
6073 int i;
6074 int ncopies;
6075 int epilog_copies;
6076 stmt_vec_info prev_stmt_info, prev_phi_info;
6077 bool single_defuse_cycle = false;
6078 stmt_vec_info new_stmt_info = NULL;
6079 int j;
6080 tree ops[3];
6081 enum vect_def_type dts[3];
6082 bool nested_cycle = false, found_nested_cycle_def = false;
6083 bool double_reduc = false;
6084 basic_block def_bb;
6085 struct loop * def_stmt_loop;
6086 tree def_arg;
6087 auto_vec<tree> vec_oprnds0;
6088 auto_vec<tree> vec_oprnds1;
6089 auto_vec<tree> vec_oprnds2;
6090 auto_vec<tree> vect_defs;
6091 auto_vec<stmt_vec_info> phis;
6092 int vec_num;
6093 tree def0, tem;
6094 tree cr_index_scalar_type = NULL_TREE, cr_index_vector_type = NULL_TREE;
6095 tree cond_reduc_val = NULL_TREE;
6097 /* Make sure it was already recognized as a reduction computation. */
6098 if (STMT_VINFO_DEF_TYPE (vinfo_for_stmt (stmt)) != vect_reduction_def
6099 && STMT_VINFO_DEF_TYPE (vinfo_for_stmt (stmt)) != vect_nested_cycle)
6100 return false;
6102 if (nested_in_vect_loop_p (loop, stmt))
6104 loop = loop->inner;
6105 nested_cycle = true;
6108 if (REDUC_GROUP_FIRST_ELEMENT (stmt_info))
6109 gcc_assert (slp_node
6110 && REDUC_GROUP_FIRST_ELEMENT (stmt_info) == stmt_info);
6112 if (gphi *phi = dyn_cast <gphi *> (stmt))
6114 tree phi_result = gimple_phi_result (phi);
6115 /* Analysis is fully done on the reduction stmt invocation. */
6116 if (! vec_stmt)
6118 if (slp_node)
6119 slp_node_instance->reduc_phis = slp_node;
6121 STMT_VINFO_TYPE (stmt_info) = reduc_vec_info_type;
6122 return true;
6125 if (STMT_VINFO_REDUC_TYPE (stmt_info) == FOLD_LEFT_REDUCTION)
6126 /* Leave the scalar phi in place. Note that checking
6127 STMT_VINFO_VEC_REDUCTION_TYPE (as below) only works
6128 for reductions involving a single statement. */
6129 return true;
6131 stmt_vec_info reduc_stmt_info = STMT_VINFO_REDUC_DEF (stmt_info);
6132 if (STMT_VINFO_IN_PATTERN_P (reduc_stmt_info))
6133 reduc_stmt_info = STMT_VINFO_RELATED_STMT (reduc_stmt_info);
6135 if (STMT_VINFO_VEC_REDUCTION_TYPE (reduc_stmt_info)
6136 == EXTRACT_LAST_REDUCTION)
6137 /* Leave the scalar phi in place. */
6138 return true;
6140 gassign *reduc_stmt = as_a <gassign *> (reduc_stmt_info->stmt);
6141 for (unsigned k = 1; k < gimple_num_ops (reduc_stmt); ++k)
6143 tree op = gimple_op (reduc_stmt, k);
6144 if (op == phi_result)
6145 continue;
6146 if (k == 1
6147 && gimple_assign_rhs_code (reduc_stmt) == COND_EXPR)
6148 continue;
6149 if (!vectype_in
6150 || (GET_MODE_SIZE (SCALAR_TYPE_MODE (TREE_TYPE (vectype_in)))
6151 < GET_MODE_SIZE (SCALAR_TYPE_MODE (TREE_TYPE (op)))))
6152 vectype_in = get_vectype_for_scalar_type (TREE_TYPE (op));
6153 break;
6155 gcc_assert (vectype_in);
6157 if (slp_node)
6158 ncopies = 1;
6159 else
6160 ncopies = vect_get_num_copies (loop_vinfo, vectype_in);
6162 stmt_vec_info use_stmt_info;
6163 if (ncopies > 1
6164 && STMT_VINFO_RELEVANT (reduc_stmt_info) <= vect_used_only_live
6165 && (use_stmt_info = loop_vinfo->lookup_single_use (phi_result))
6166 && (use_stmt_info == reduc_stmt_info
6167 || STMT_VINFO_RELATED_STMT (use_stmt_info) == reduc_stmt))
6168 single_defuse_cycle = true;
6170 /* Create the destination vector */
6171 scalar_dest = gimple_assign_lhs (reduc_stmt);
6172 vec_dest = vect_create_destination_var (scalar_dest, vectype_out);
6174 if (slp_node)
6175 /* The size vect_schedule_slp_instance computes is off for us. */
6176 vec_num = vect_get_num_vectors
6177 (LOOP_VINFO_VECT_FACTOR (loop_vinfo)
6178 * SLP_TREE_SCALAR_STMTS (slp_node).length (),
6179 vectype_in);
6180 else
6181 vec_num = 1;
6183 /* Generate the reduction PHIs upfront. */
6184 prev_phi_info = NULL;
6185 for (j = 0; j < ncopies; j++)
6187 if (j == 0 || !single_defuse_cycle)
6189 for (i = 0; i < vec_num; i++)
6191 /* Create the reduction-phi that defines the reduction
6192 operand. */
6193 gimple *new_phi = create_phi_node (vec_dest, loop->header);
6194 stmt_vec_info new_phi_info = loop_vinfo->add_stmt (new_phi);
6196 if (slp_node)
6197 SLP_TREE_VEC_STMTS (slp_node).quick_push (new_phi_info);
6198 else
6200 if (j == 0)
6201 STMT_VINFO_VEC_STMT (stmt_info)
6202 = *vec_stmt = new_phi_info;
6203 else
6204 STMT_VINFO_RELATED_STMT (prev_phi_info) = new_phi_info;
6205 prev_phi_info = new_phi_info;
6211 return true;
6214 /* 1. Is vectorizable reduction? */
6215 /* Not supportable if the reduction variable is used in the loop, unless
6216 it's a reduction chain. */
6217 if (STMT_VINFO_RELEVANT (stmt_info) > vect_used_in_outer
6218 && !REDUC_GROUP_FIRST_ELEMENT (stmt_info))
6219 return false;
6221 /* Reductions that are not used even in an enclosing outer-loop,
6222 are expected to be "live" (used out of the loop). */
6223 if (STMT_VINFO_RELEVANT (stmt_info) == vect_unused_in_scope
6224 && !STMT_VINFO_LIVE_P (stmt_info))
6225 return false;
6227 /* 2. Has this been recognized as a reduction pattern?
6229 Check if STMT represents a pattern that has been recognized
6230 in earlier analysis stages. For stmts that represent a pattern,
6231 the STMT_VINFO_RELATED_STMT field records the last stmt in
6232 the original sequence that constitutes the pattern. */
6234 stmt_vec_info orig_stmt_info = STMT_VINFO_RELATED_STMT (stmt_info);
6235 if (orig_stmt_info)
6237 gcc_assert (STMT_VINFO_IN_PATTERN_P (orig_stmt_info));
6238 gcc_assert (!STMT_VINFO_IN_PATTERN_P (stmt_info));
6241 /* 3. Check the operands of the operation. The first operands are defined
6242 inside the loop body. The last operand is the reduction variable,
6243 which is defined by the loop-header-phi. */
6245 gcc_assert (is_gimple_assign (stmt));
6247 /* Flatten RHS. */
6248 switch (get_gimple_rhs_class (gimple_assign_rhs_code (stmt)))
6250 case GIMPLE_BINARY_RHS:
6251 code = gimple_assign_rhs_code (stmt);
6252 op_type = TREE_CODE_LENGTH (code);
6253 gcc_assert (op_type == binary_op);
6254 ops[0] = gimple_assign_rhs1 (stmt);
6255 ops[1] = gimple_assign_rhs2 (stmt);
6256 break;
6258 case GIMPLE_TERNARY_RHS:
6259 code = gimple_assign_rhs_code (stmt);
6260 op_type = TREE_CODE_LENGTH (code);
6261 gcc_assert (op_type == ternary_op);
6262 ops[0] = gimple_assign_rhs1 (stmt);
6263 ops[1] = gimple_assign_rhs2 (stmt);
6264 ops[2] = gimple_assign_rhs3 (stmt);
6265 break;
6267 case GIMPLE_UNARY_RHS:
6268 return false;
6270 default:
6271 gcc_unreachable ();
6274 if (code == COND_EXPR && slp_node)
6275 return false;
6277 scalar_dest = gimple_assign_lhs (stmt);
6278 scalar_type = TREE_TYPE (scalar_dest);
6279 if (!POINTER_TYPE_P (scalar_type) && !INTEGRAL_TYPE_P (scalar_type)
6280 && !SCALAR_FLOAT_TYPE_P (scalar_type))
6281 return false;
6283 /* Do not try to vectorize bit-precision reductions. */
6284 if (!type_has_mode_precision_p (scalar_type))
6285 return false;
6287 /* All uses but the last are expected to be defined in the loop.
6288 The last use is the reduction variable. In case of nested cycle this
6289 assumption is not true: we use reduc_index to record the index of the
6290 reduction variable. */
6291 stmt_vec_info reduc_def_info = NULL;
6292 int reduc_index = -1;
6293 for (i = 0; i < op_type; i++)
6295 /* The condition of COND_EXPR is checked in vectorizable_condition(). */
6296 if (i == 0 && code == COND_EXPR)
6297 continue;
6299 stmt_vec_info def_stmt_info;
6300 is_simple_use = vect_is_simple_use (ops[i], loop_vinfo, &dts[i], &tem,
6301 &def_stmt_info);
6302 dt = dts[i];
6303 gcc_assert (is_simple_use);
6304 if (dt == vect_reduction_def)
6306 reduc_def_info = def_stmt_info;
6307 reduc_index = i;
6308 continue;
6310 else if (tem)
6312 /* To properly compute ncopies we are interested in the widest
6313 input type in case we're looking at a widening accumulation. */
6314 if (!vectype_in
6315 || (GET_MODE_SIZE (SCALAR_TYPE_MODE (TREE_TYPE (vectype_in)))
6316 < GET_MODE_SIZE (SCALAR_TYPE_MODE (TREE_TYPE (tem)))))
6317 vectype_in = tem;
6320 if (dt != vect_internal_def
6321 && dt != vect_external_def
6322 && dt != vect_constant_def
6323 && dt != vect_induction_def
6324 && !(dt == vect_nested_cycle && nested_cycle))
6325 return false;
6327 if (dt == vect_nested_cycle)
6329 found_nested_cycle_def = true;
6330 reduc_def_info = def_stmt_info;
6331 reduc_index = i;
6334 if (i == 1 && code == COND_EXPR)
6336 /* Record how value of COND_EXPR is defined. */
6337 if (dt == vect_constant_def)
6339 cond_reduc_dt = dt;
6340 cond_reduc_val = ops[i];
6342 if (dt == vect_induction_def
6343 && def_stmt_info
6344 && is_nonwrapping_integer_induction (def_stmt_info, loop))
6346 cond_reduc_dt = dt;
6347 cond_reduc_def_stmt = def_stmt_info;
6352 if (!vectype_in)
6353 vectype_in = vectype_out;
6355 /* When vectorizing a reduction chain w/o SLP the reduction PHI is not
6356 directy used in stmt. */
6357 if (reduc_index == -1)
6359 if (STMT_VINFO_REDUC_TYPE (stmt_info) == FOLD_LEFT_REDUCTION)
6361 if (dump_enabled_p ())
6362 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6363 "in-order reduction chain without SLP.\n");
6364 return false;
6367 if (orig_stmt_info)
6368 reduc_def_info = STMT_VINFO_REDUC_DEF (orig_stmt_info);
6369 else
6370 reduc_def_info = STMT_VINFO_REDUC_DEF (stmt_info);
6373 if (! reduc_def_info)
6374 return false;
6376 gphi *reduc_def_phi = dyn_cast <gphi *> (reduc_def_info->stmt);
6377 if (!reduc_def_phi)
6378 return false;
6380 if (!(reduc_index == -1
6381 || dts[reduc_index] == vect_reduction_def
6382 || dts[reduc_index] == vect_nested_cycle
6383 || ((dts[reduc_index] == vect_internal_def
6384 || dts[reduc_index] == vect_external_def
6385 || dts[reduc_index] == vect_constant_def
6386 || dts[reduc_index] == vect_induction_def)
6387 && nested_cycle && found_nested_cycle_def)))
6389 /* For pattern recognized stmts, orig_stmt might be a reduction,
6390 but some helper statements for the pattern might not, or
6391 might be COND_EXPRs with reduction uses in the condition. */
6392 gcc_assert (orig_stmt_info);
6393 return false;
6396 /* PHIs should not participate in patterns. */
6397 gcc_assert (!STMT_VINFO_RELATED_STMT (reduc_def_info));
6398 enum vect_reduction_type v_reduc_type
6399 = STMT_VINFO_REDUC_TYPE (reduc_def_info);
6400 stmt_vec_info tmp = STMT_VINFO_REDUC_DEF (reduc_def_info);
6402 STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info) = v_reduc_type;
6403 /* If we have a condition reduction, see if we can simplify it further. */
6404 if (v_reduc_type == COND_REDUCTION)
6406 /* TODO: We can't yet handle reduction chains, since we need to treat
6407 each COND_EXPR in the chain specially, not just the last one.
6408 E.g. for:
6410 x_1 = PHI <x_3, ...>
6411 x_2 = a_2 ? ... : x_1;
6412 x_3 = a_3 ? ... : x_2;
6414 we're interested in the last element in x_3 for which a_2 || a_3
6415 is true, whereas the current reduction chain handling would
6416 vectorize x_2 as a normal VEC_COND_EXPR and only treat x_3
6417 as a reduction operation. */
6418 if (reduc_index == -1)
6420 if (dump_enabled_p ())
6421 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6422 "conditional reduction chains not supported\n");
6423 return false;
6426 /* vect_is_simple_reduction ensured that operand 2 is the
6427 loop-carried operand. */
6428 gcc_assert (reduc_index == 2);
6430 /* Loop peeling modifies initial value of reduction PHI, which
6431 makes the reduction stmt to be transformed different to the
6432 original stmt analyzed. We need to record reduction code for
6433 CONST_COND_REDUCTION type reduction at analyzing stage, thus
6434 it can be used directly at transform stage. */
6435 if (STMT_VINFO_VEC_CONST_COND_REDUC_CODE (stmt_info) == MAX_EXPR
6436 || STMT_VINFO_VEC_CONST_COND_REDUC_CODE (stmt_info) == MIN_EXPR)
6438 /* Also set the reduction type to CONST_COND_REDUCTION. */
6439 gcc_assert (cond_reduc_dt == vect_constant_def);
6440 STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info) = CONST_COND_REDUCTION;
6442 else if (direct_internal_fn_supported_p (IFN_FOLD_EXTRACT_LAST,
6443 vectype_in, OPTIMIZE_FOR_SPEED))
6445 if (dump_enabled_p ())
6446 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6447 "optimizing condition reduction with"
6448 " FOLD_EXTRACT_LAST.\n");
6449 STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info) = EXTRACT_LAST_REDUCTION;
6451 else if (cond_reduc_dt == vect_induction_def)
6453 stmt_vec_info cond_stmt_vinfo = vinfo_for_stmt (cond_reduc_def_stmt);
6454 tree base
6455 = STMT_VINFO_LOOP_PHI_EVOLUTION_BASE_UNCHANGED (cond_stmt_vinfo);
6456 tree step = STMT_VINFO_LOOP_PHI_EVOLUTION_PART (cond_stmt_vinfo);
6458 gcc_assert (TREE_CODE (base) == INTEGER_CST
6459 && TREE_CODE (step) == INTEGER_CST);
6460 cond_reduc_val = NULL_TREE;
6461 /* Find a suitable value, for MAX_EXPR below base, for MIN_EXPR
6462 above base; punt if base is the minimum value of the type for
6463 MAX_EXPR or maximum value of the type for MIN_EXPR for now. */
6464 if (tree_int_cst_sgn (step) == -1)
6466 cond_reduc_op_code = MIN_EXPR;
6467 if (tree_int_cst_sgn (base) == -1)
6468 cond_reduc_val = build_int_cst (TREE_TYPE (base), 0);
6469 else if (tree_int_cst_lt (base,
6470 TYPE_MAX_VALUE (TREE_TYPE (base))))
6471 cond_reduc_val
6472 = int_const_binop (PLUS_EXPR, base, integer_one_node);
6474 else
6476 cond_reduc_op_code = MAX_EXPR;
6477 if (tree_int_cst_sgn (base) == 1)
6478 cond_reduc_val = build_int_cst (TREE_TYPE (base), 0);
6479 else if (tree_int_cst_lt (TYPE_MIN_VALUE (TREE_TYPE (base)),
6480 base))
6481 cond_reduc_val
6482 = int_const_binop (MINUS_EXPR, base, integer_one_node);
6484 if (cond_reduc_val)
6486 if (dump_enabled_p ())
6487 dump_printf_loc (MSG_NOTE, vect_location,
6488 "condition expression based on "
6489 "integer induction.\n");
6490 STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info)
6491 = INTEGER_INDUC_COND_REDUCTION;
6494 else if (cond_reduc_dt == vect_constant_def)
6496 enum vect_def_type cond_initial_dt;
6497 gimple *def_stmt = SSA_NAME_DEF_STMT (ops[reduc_index]);
6498 tree cond_initial_val
6499 = PHI_ARG_DEF_FROM_EDGE (def_stmt, loop_preheader_edge (loop));
6501 gcc_assert (cond_reduc_val != NULL_TREE);
6502 vect_is_simple_use (cond_initial_val, loop_vinfo, &cond_initial_dt);
6503 if (cond_initial_dt == vect_constant_def
6504 && types_compatible_p (TREE_TYPE (cond_initial_val),
6505 TREE_TYPE (cond_reduc_val)))
6507 tree e = fold_binary (LE_EXPR, boolean_type_node,
6508 cond_initial_val, cond_reduc_val);
6509 if (e && (integer_onep (e) || integer_zerop (e)))
6511 if (dump_enabled_p ())
6512 dump_printf_loc (MSG_NOTE, vect_location,
6513 "condition expression based on "
6514 "compile time constant.\n");
6515 /* Record reduction code at analysis stage. */
6516 STMT_VINFO_VEC_CONST_COND_REDUC_CODE (stmt_info)
6517 = integer_onep (e) ? MAX_EXPR : MIN_EXPR;
6518 STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info)
6519 = CONST_COND_REDUCTION;
6525 if (orig_stmt_info)
6526 gcc_assert (tmp == orig_stmt_info
6527 || REDUC_GROUP_FIRST_ELEMENT (tmp) == orig_stmt_info);
6528 else
6529 /* We changed STMT to be the first stmt in reduction chain, hence we
6530 check that in this case the first element in the chain is STMT. */
6531 gcc_assert (tmp == stmt_info
6532 || REDUC_GROUP_FIRST_ELEMENT (tmp) == stmt_info);
6534 if (STMT_VINFO_LIVE_P (reduc_def_info))
6535 return false;
6537 if (slp_node)
6538 ncopies = 1;
6539 else
6540 ncopies = vect_get_num_copies (loop_vinfo, vectype_in);
6542 gcc_assert (ncopies >= 1);
6544 vec_mode = TYPE_MODE (vectype_in);
6545 poly_uint64 nunits_out = TYPE_VECTOR_SUBPARTS (vectype_out);
6547 if (code == COND_EXPR)
6549 /* Only call during the analysis stage, otherwise we'll lose
6550 STMT_VINFO_TYPE. */
6551 if (!vec_stmt && !vectorizable_condition (stmt, gsi, NULL,
6552 ops[reduc_index], 0, NULL,
6553 cost_vec))
6555 if (dump_enabled_p ())
6556 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6557 "unsupported condition in reduction\n");
6558 return false;
6561 else
6563 /* 4. Supportable by target? */
6565 if (code == LSHIFT_EXPR || code == RSHIFT_EXPR
6566 || code == LROTATE_EXPR || code == RROTATE_EXPR)
6568 /* Shifts and rotates are only supported by vectorizable_shifts,
6569 not vectorizable_reduction. */
6570 if (dump_enabled_p ())
6571 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6572 "unsupported shift or rotation.\n");
6573 return false;
6576 /* 4.1. check support for the operation in the loop */
6577 optab = optab_for_tree_code (code, vectype_in, optab_default);
6578 if (!optab)
6580 if (dump_enabled_p ())
6581 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6582 "no optab.\n");
6584 return false;
6587 if (optab_handler (optab, vec_mode) == CODE_FOR_nothing)
6589 if (dump_enabled_p ())
6590 dump_printf (MSG_NOTE, "op not supported by target.\n");
6592 if (maybe_ne (GET_MODE_SIZE (vec_mode), UNITS_PER_WORD)
6593 || !vect_worthwhile_without_simd_p (loop_vinfo, code))
6594 return false;
6596 if (dump_enabled_p ())
6597 dump_printf (MSG_NOTE, "proceeding using word mode.\n");
6600 /* Worthwhile without SIMD support? */
6601 if (!VECTOR_MODE_P (TYPE_MODE (vectype_in))
6602 && !vect_worthwhile_without_simd_p (loop_vinfo, code))
6604 if (dump_enabled_p ())
6605 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6606 "not worthwhile without SIMD support.\n");
6608 return false;
6612 /* 4.2. Check support for the epilog operation.
6614 If STMT represents a reduction pattern, then the type of the
6615 reduction variable may be different than the type of the rest
6616 of the arguments. For example, consider the case of accumulation
6617 of shorts into an int accumulator; The original code:
6618 S1: int_a = (int) short_a;
6619 orig_stmt-> S2: int_acc = plus <int_a ,int_acc>;
6621 was replaced with:
6622 STMT: int_acc = widen_sum <short_a, int_acc>
6624 This means that:
6625 1. The tree-code that is used to create the vector operation in the
6626 epilog code (that reduces the partial results) is not the
6627 tree-code of STMT, but is rather the tree-code of the original
6628 stmt from the pattern that STMT is replacing. I.e, in the example
6629 above we want to use 'widen_sum' in the loop, but 'plus' in the
6630 epilog.
6631 2. The type (mode) we use to check available target support
6632 for the vector operation to be created in the *epilog*, is
6633 determined by the type of the reduction variable (in the example
6634 above we'd check this: optab_handler (plus_optab, vect_int_mode])).
6635 However the type (mode) we use to check available target support
6636 for the vector operation to be created *inside the loop*, is
6637 determined by the type of the other arguments to STMT (in the
6638 example we'd check this: optab_handler (widen_sum_optab,
6639 vect_short_mode)).
6641 This is contrary to "regular" reductions, in which the types of all
6642 the arguments are the same as the type of the reduction variable.
6643 For "regular" reductions we can therefore use the same vector type
6644 (and also the same tree-code) when generating the epilog code and
6645 when generating the code inside the loop. */
6647 vect_reduction_type reduction_type
6648 = STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info);
6649 if (orig_stmt_info
6650 && (reduction_type == TREE_CODE_REDUCTION
6651 || reduction_type == FOLD_LEFT_REDUCTION))
6653 /* This is a reduction pattern: get the vectype from the type of the
6654 reduction variable, and get the tree-code from orig_stmt. */
6655 orig_code = gimple_assign_rhs_code (orig_stmt_info->stmt);
6656 gcc_assert (vectype_out);
6657 vec_mode = TYPE_MODE (vectype_out);
6659 else
6661 /* Regular reduction: use the same vectype and tree-code as used for
6662 the vector code inside the loop can be used for the epilog code. */
6663 orig_code = code;
6665 if (code == MINUS_EXPR)
6666 orig_code = PLUS_EXPR;
6668 /* For simple condition reductions, replace with the actual expression
6669 we want to base our reduction around. */
6670 if (reduction_type == CONST_COND_REDUCTION)
6672 orig_code = STMT_VINFO_VEC_CONST_COND_REDUC_CODE (stmt_info);
6673 gcc_assert (orig_code == MAX_EXPR || orig_code == MIN_EXPR);
6675 else if (reduction_type == INTEGER_INDUC_COND_REDUCTION)
6676 orig_code = cond_reduc_op_code;
6679 if (nested_cycle)
6681 def_bb = gimple_bb (reduc_def_phi);
6682 def_stmt_loop = def_bb->loop_father;
6683 def_arg = PHI_ARG_DEF_FROM_EDGE (reduc_def_phi,
6684 loop_preheader_edge (def_stmt_loop));
6685 stmt_vec_info def_arg_stmt_info = loop_vinfo->lookup_def (def_arg);
6686 if (def_arg_stmt_info
6687 && (STMT_VINFO_DEF_TYPE (def_arg_stmt_info)
6688 == vect_double_reduction_def))
6689 double_reduc = true;
6692 reduc_fn = IFN_LAST;
6694 if (reduction_type == TREE_CODE_REDUCTION
6695 || reduction_type == FOLD_LEFT_REDUCTION
6696 || reduction_type == INTEGER_INDUC_COND_REDUCTION
6697 || reduction_type == CONST_COND_REDUCTION)
6699 if (reduction_type == FOLD_LEFT_REDUCTION
6700 ? fold_left_reduction_fn (orig_code, &reduc_fn)
6701 : reduction_fn_for_scalar_code (orig_code, &reduc_fn))
6703 if (reduc_fn != IFN_LAST
6704 && !direct_internal_fn_supported_p (reduc_fn, vectype_out,
6705 OPTIMIZE_FOR_SPEED))
6707 if (dump_enabled_p ())
6708 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6709 "reduc op not supported by target.\n");
6711 reduc_fn = IFN_LAST;
6714 else
6716 if (!nested_cycle || double_reduc)
6718 if (dump_enabled_p ())
6719 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6720 "no reduc code for scalar code.\n");
6722 return false;
6726 else if (reduction_type == COND_REDUCTION)
6728 int scalar_precision
6729 = GET_MODE_PRECISION (SCALAR_TYPE_MODE (scalar_type));
6730 cr_index_scalar_type = make_unsigned_type (scalar_precision);
6731 cr_index_vector_type = build_vector_type (cr_index_scalar_type,
6732 nunits_out);
6734 if (direct_internal_fn_supported_p (IFN_REDUC_MAX, cr_index_vector_type,
6735 OPTIMIZE_FOR_SPEED))
6736 reduc_fn = IFN_REDUC_MAX;
6739 if (reduction_type != EXTRACT_LAST_REDUCTION
6740 && reduc_fn == IFN_LAST
6741 && !nunits_out.is_constant ())
6743 if (dump_enabled_p ())
6744 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6745 "missing target support for reduction on"
6746 " variable-length vectors.\n");
6747 return false;
6750 if ((double_reduc || reduction_type != TREE_CODE_REDUCTION)
6751 && ncopies > 1)
6753 if (dump_enabled_p ())
6754 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6755 "multiple types in double reduction or condition "
6756 "reduction.\n");
6757 return false;
6760 /* For SLP reductions, see if there is a neutral value we can use. */
6761 tree neutral_op = NULL_TREE;
6762 if (slp_node)
6763 neutral_op = neutral_op_for_slp_reduction
6764 (slp_node_instance->reduc_phis, code,
6765 REDUC_GROUP_FIRST_ELEMENT (stmt_info) != NULL_STMT_VEC_INFO);
6767 if (double_reduc && reduction_type == FOLD_LEFT_REDUCTION)
6769 /* We can't support in-order reductions of code such as this:
6771 for (int i = 0; i < n1; ++i)
6772 for (int j = 0; j < n2; ++j)
6773 l += a[j];
6775 since GCC effectively transforms the loop when vectorizing:
6777 for (int i = 0; i < n1 / VF; ++i)
6778 for (int j = 0; j < n2; ++j)
6779 for (int k = 0; k < VF; ++k)
6780 l += a[j];
6782 which is a reassociation of the original operation. */
6783 if (dump_enabled_p ())
6784 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6785 "in-order double reduction not supported.\n");
6787 return false;
6790 if (reduction_type == FOLD_LEFT_REDUCTION
6791 && slp_node
6792 && !REDUC_GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)))
6794 /* We cannot use in-order reductions in this case because there is
6795 an implicit reassociation of the operations involved. */
6796 if (dump_enabled_p ())
6797 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6798 "in-order unchained SLP reductions not supported.\n");
6799 return false;
6802 /* For double reductions, and for SLP reductions with a neutral value,
6803 we construct a variable-length initial vector by loading a vector
6804 full of the neutral value and then shift-and-inserting the start
6805 values into the low-numbered elements. */
6806 if ((double_reduc || neutral_op)
6807 && !nunits_out.is_constant ()
6808 && !direct_internal_fn_supported_p (IFN_VEC_SHL_INSERT,
6809 vectype_out, OPTIMIZE_FOR_SPEED))
6811 if (dump_enabled_p ())
6812 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6813 "reduction on variable-length vectors requires"
6814 " target support for a vector-shift-and-insert"
6815 " operation.\n");
6816 return false;
6819 /* Check extra constraints for variable-length unchained SLP reductions. */
6820 if (STMT_SLP_TYPE (stmt_info)
6821 && !REDUC_GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt))
6822 && !nunits_out.is_constant ())
6824 /* We checked above that we could build the initial vector when
6825 there's a neutral element value. Check here for the case in
6826 which each SLP statement has its own initial value and in which
6827 that value needs to be repeated for every instance of the
6828 statement within the initial vector. */
6829 unsigned int group_size = SLP_TREE_SCALAR_STMTS (slp_node).length ();
6830 scalar_mode elt_mode = SCALAR_TYPE_MODE (TREE_TYPE (vectype_out));
6831 if (!neutral_op
6832 && !can_duplicate_and_interleave_p (group_size, elt_mode))
6834 if (dump_enabled_p ())
6835 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6836 "unsupported form of SLP reduction for"
6837 " variable-length vectors: cannot build"
6838 " initial vector.\n");
6839 return false;
6841 /* The epilogue code relies on the number of elements being a multiple
6842 of the group size. The duplicate-and-interleave approach to setting
6843 up the the initial vector does too. */
6844 if (!multiple_p (nunits_out, group_size))
6846 if (dump_enabled_p ())
6847 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6848 "unsupported form of SLP reduction for"
6849 " variable-length vectors: the vector size"
6850 " is not a multiple of the number of results.\n");
6851 return false;
6855 /* In case of widenning multiplication by a constant, we update the type
6856 of the constant to be the type of the other operand. We check that the
6857 constant fits the type in the pattern recognition pass. */
6858 if (code == DOT_PROD_EXPR
6859 && !types_compatible_p (TREE_TYPE (ops[0]), TREE_TYPE (ops[1])))
6861 if (TREE_CODE (ops[0]) == INTEGER_CST)
6862 ops[0] = fold_convert (TREE_TYPE (ops[1]), ops[0]);
6863 else if (TREE_CODE (ops[1]) == INTEGER_CST)
6864 ops[1] = fold_convert (TREE_TYPE (ops[0]), ops[1]);
6865 else
6867 if (dump_enabled_p ())
6868 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6869 "invalid types in dot-prod\n");
6871 return false;
6875 if (reduction_type == COND_REDUCTION)
6877 widest_int ni;
6879 if (! max_loop_iterations (loop, &ni))
6881 if (dump_enabled_p ())
6882 dump_printf_loc (MSG_NOTE, vect_location,
6883 "loop count not known, cannot create cond "
6884 "reduction.\n");
6885 return false;
6887 /* Convert backedges to iterations. */
6888 ni += 1;
6890 /* The additional index will be the same type as the condition. Check
6891 that the loop can fit into this less one (because we'll use up the
6892 zero slot for when there are no matches). */
6893 tree max_index = TYPE_MAX_VALUE (cr_index_scalar_type);
6894 if (wi::geu_p (ni, wi::to_widest (max_index)))
6896 if (dump_enabled_p ())
6897 dump_printf_loc (MSG_NOTE, vect_location,
6898 "loop size is greater than data size.\n");
6899 return false;
6903 /* In case the vectorization factor (VF) is bigger than the number
6904 of elements that we can fit in a vectype (nunits), we have to generate
6905 more than one vector stmt - i.e - we need to "unroll" the
6906 vector stmt by a factor VF/nunits. For more details see documentation
6907 in vectorizable_operation. */
6909 /* If the reduction is used in an outer loop we need to generate
6910 VF intermediate results, like so (e.g. for ncopies=2):
6911 r0 = phi (init, r0)
6912 r1 = phi (init, r1)
6913 r0 = x0 + r0;
6914 r1 = x1 + r1;
6915 (i.e. we generate VF results in 2 registers).
6916 In this case we have a separate def-use cycle for each copy, and therefore
6917 for each copy we get the vector def for the reduction variable from the
6918 respective phi node created for this copy.
6920 Otherwise (the reduction is unused in the loop nest), we can combine
6921 together intermediate results, like so (e.g. for ncopies=2):
6922 r = phi (init, r)
6923 r = x0 + r;
6924 r = x1 + r;
6925 (i.e. we generate VF/2 results in a single register).
6926 In this case for each copy we get the vector def for the reduction variable
6927 from the vectorized reduction operation generated in the previous iteration.
6929 This only works when we see both the reduction PHI and its only consumer
6930 in vectorizable_reduction and there are no intermediate stmts
6931 participating. */
6932 stmt_vec_info use_stmt_info;
6933 tree reduc_phi_result = gimple_phi_result (reduc_def_phi);
6934 if (ncopies > 1
6935 && (STMT_VINFO_RELEVANT (stmt_info) <= vect_used_only_live)
6936 && (use_stmt_info = loop_vinfo->lookup_single_use (reduc_phi_result))
6937 && (use_stmt_info == stmt_info
6938 || STMT_VINFO_RELATED_STMT (use_stmt_info) == stmt))
6940 single_defuse_cycle = true;
6941 epilog_copies = 1;
6943 else
6944 epilog_copies = ncopies;
6946 /* If the reduction stmt is one of the patterns that have lane
6947 reduction embedded we cannot handle the case of ! single_defuse_cycle. */
6948 if ((ncopies > 1
6949 && ! single_defuse_cycle)
6950 && (code == DOT_PROD_EXPR
6951 || code == WIDEN_SUM_EXPR
6952 || code == SAD_EXPR))
6954 if (dump_enabled_p ())
6955 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6956 "multi def-use cycle not possible for lane-reducing "
6957 "reduction operation\n");
6958 return false;
6961 if (slp_node)
6962 vec_num = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node);
6963 else
6964 vec_num = 1;
6966 internal_fn cond_fn = get_conditional_internal_fn (code);
6967 vec_loop_masks *masks = &LOOP_VINFO_MASKS (loop_vinfo);
6969 if (!vec_stmt) /* transformation not required. */
6971 vect_model_reduction_cost (stmt_info, reduc_fn, ncopies, cost_vec);
6972 if (loop_vinfo && LOOP_VINFO_CAN_FULLY_MASK_P (loop_vinfo))
6974 if (reduction_type != FOLD_LEFT_REDUCTION
6975 && (cond_fn == IFN_LAST
6976 || !direct_internal_fn_supported_p (cond_fn, vectype_in,
6977 OPTIMIZE_FOR_SPEED)))
6979 if (dump_enabled_p ())
6980 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6981 "can't use a fully-masked loop because no"
6982 " conditional operation is available.\n");
6983 LOOP_VINFO_CAN_FULLY_MASK_P (loop_vinfo) = false;
6985 else if (reduc_index == -1)
6987 if (dump_enabled_p ())
6988 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6989 "can't use a fully-masked loop for chained"
6990 " reductions.\n");
6991 LOOP_VINFO_CAN_FULLY_MASK_P (loop_vinfo) = false;
6993 else
6994 vect_record_loop_mask (loop_vinfo, masks, ncopies * vec_num,
6995 vectype_in);
6997 if (dump_enabled_p ()
6998 && reduction_type == FOLD_LEFT_REDUCTION)
6999 dump_printf_loc (MSG_NOTE, vect_location,
7000 "using an in-order (fold-left) reduction.\n");
7001 STMT_VINFO_TYPE (stmt_info) = reduc_vec_info_type;
7002 return true;
7005 /* Transform. */
7007 if (dump_enabled_p ())
7008 dump_printf_loc (MSG_NOTE, vect_location, "transform reduction.\n");
7010 /* FORNOW: Multiple types are not supported for condition. */
7011 if (code == COND_EXPR)
7012 gcc_assert (ncopies == 1);
7014 bool masked_loop_p = LOOP_VINFO_FULLY_MASKED_P (loop_vinfo);
7016 if (reduction_type == FOLD_LEFT_REDUCTION)
7017 return vectorize_fold_left_reduction
7018 (stmt, gsi, vec_stmt, slp_node, reduc_def_phi, code,
7019 reduc_fn, ops, vectype_in, reduc_index, masks);
7021 if (reduction_type == EXTRACT_LAST_REDUCTION)
7023 gcc_assert (!slp_node);
7024 return vectorizable_condition (stmt, gsi, vec_stmt,
7025 NULL, reduc_index, NULL, NULL);
7028 /* Create the destination vector */
7029 vec_dest = vect_create_destination_var (scalar_dest, vectype_out);
7031 prev_stmt_info = NULL;
7032 prev_phi_info = NULL;
7033 if (!slp_node)
7035 vec_oprnds0.create (1);
7036 vec_oprnds1.create (1);
7037 if (op_type == ternary_op)
7038 vec_oprnds2.create (1);
7041 phis.create (vec_num);
7042 vect_defs.create (vec_num);
7043 if (!slp_node)
7044 vect_defs.quick_push (NULL_TREE);
7046 if (slp_node)
7047 phis.splice (SLP_TREE_VEC_STMTS (slp_node_instance->reduc_phis));
7048 else
7049 phis.quick_push (STMT_VINFO_VEC_STMT (reduc_def_info));
7051 for (j = 0; j < ncopies; j++)
7053 if (code == COND_EXPR)
7055 gcc_assert (!slp_node);
7056 vectorizable_condition (stmt, gsi, vec_stmt,
7057 PHI_RESULT (phis[0]->stmt),
7058 reduc_index, NULL, NULL);
7059 /* Multiple types are not supported for condition. */
7060 break;
7063 /* Handle uses. */
7064 if (j == 0)
7066 if (slp_node)
7068 /* Get vec defs for all the operands except the reduction index,
7069 ensuring the ordering of the ops in the vector is kept. */
7070 auto_vec<tree, 3> slp_ops;
7071 auto_vec<vec<tree>, 3> vec_defs;
7073 slp_ops.quick_push (ops[0]);
7074 slp_ops.quick_push (ops[1]);
7075 if (op_type == ternary_op)
7076 slp_ops.quick_push (ops[2]);
7078 vect_get_slp_defs (slp_ops, slp_node, &vec_defs);
7080 vec_oprnds0.safe_splice (vec_defs[0]);
7081 vec_defs[0].release ();
7082 vec_oprnds1.safe_splice (vec_defs[1]);
7083 vec_defs[1].release ();
7084 if (op_type == ternary_op)
7086 vec_oprnds2.safe_splice (vec_defs[2]);
7087 vec_defs[2].release ();
7090 else
7092 vec_oprnds0.quick_push
7093 (vect_get_vec_def_for_operand (ops[0], stmt));
7094 vec_oprnds1.quick_push
7095 (vect_get_vec_def_for_operand (ops[1], stmt));
7096 if (op_type == ternary_op)
7097 vec_oprnds2.quick_push
7098 (vect_get_vec_def_for_operand (ops[2], stmt));
7101 else
7103 if (!slp_node)
7105 gcc_assert (reduc_index != -1 || ! single_defuse_cycle);
7107 if (single_defuse_cycle && reduc_index == 0)
7108 vec_oprnds0[0] = gimple_get_lhs (new_stmt_info->stmt);
7109 else
7110 vec_oprnds0[0]
7111 = vect_get_vec_def_for_stmt_copy (dts[0], vec_oprnds0[0]);
7112 if (single_defuse_cycle && reduc_index == 1)
7113 vec_oprnds1[0] = gimple_get_lhs (new_stmt_info->stmt);
7114 else
7115 vec_oprnds1[0]
7116 = vect_get_vec_def_for_stmt_copy (dts[1], vec_oprnds1[0]);
7117 if (op_type == ternary_op)
7119 if (single_defuse_cycle && reduc_index == 2)
7120 vec_oprnds2[0] = gimple_get_lhs (new_stmt_info->stmt);
7121 else
7122 vec_oprnds2[0]
7123 = vect_get_vec_def_for_stmt_copy (dts[2], vec_oprnds2[0]);
7128 FOR_EACH_VEC_ELT (vec_oprnds0, i, def0)
7130 tree vop[3] = { def0, vec_oprnds1[i], NULL_TREE };
7131 if (masked_loop_p)
7133 /* Make sure that the reduction accumulator is vop[0]. */
7134 if (reduc_index == 1)
7136 gcc_assert (commutative_tree_code (code));
7137 std::swap (vop[0], vop[1]);
7139 tree mask = vect_get_loop_mask (gsi, masks, vec_num * ncopies,
7140 vectype_in, i * ncopies + j);
7141 gcall *call = gimple_build_call_internal (cond_fn, 4, mask,
7142 vop[0], vop[1],
7143 vop[0]);
7144 new_temp = make_ssa_name (vec_dest, call);
7145 gimple_call_set_lhs (call, new_temp);
7146 gimple_call_set_nothrow (call, true);
7147 new_stmt_info = vect_finish_stmt_generation (stmt, call, gsi);
7149 else
7151 if (op_type == ternary_op)
7152 vop[2] = vec_oprnds2[i];
7154 gassign *new_stmt = gimple_build_assign (vec_dest, code,
7155 vop[0], vop[1], vop[2]);
7156 new_temp = make_ssa_name (vec_dest, new_stmt);
7157 gimple_assign_set_lhs (new_stmt, new_temp);
7158 new_stmt_info
7159 = vect_finish_stmt_generation (stmt, new_stmt, gsi);
7162 if (slp_node)
7164 SLP_TREE_VEC_STMTS (slp_node).quick_push (new_stmt_info);
7165 vect_defs.quick_push (new_temp);
7167 else
7168 vect_defs[0] = new_temp;
7171 if (slp_node)
7172 continue;
7174 if (j == 0)
7175 STMT_VINFO_VEC_STMT (stmt_info) = *vec_stmt = new_stmt_info;
7176 else
7177 STMT_VINFO_RELATED_STMT (prev_stmt_info) = new_stmt_info;
7179 prev_stmt_info = new_stmt_info;
7182 /* Finalize the reduction-phi (set its arguments) and create the
7183 epilog reduction code. */
7184 if ((!single_defuse_cycle || code == COND_EXPR) && !slp_node)
7185 vect_defs[0] = gimple_get_lhs ((*vec_stmt)->stmt);
7187 vect_create_epilog_for_reduction (vect_defs, stmt, reduc_def_phi,
7188 epilog_copies, reduc_fn, phis,
7189 double_reduc, slp_node, slp_node_instance,
7190 cond_reduc_val, cond_reduc_op_code,
7191 neutral_op);
7193 return true;
7196 /* Function vect_min_worthwhile_factor.
7198 For a loop where we could vectorize the operation indicated by CODE,
7199 return the minimum vectorization factor that makes it worthwhile
7200 to use generic vectors. */
7201 static unsigned int
7202 vect_min_worthwhile_factor (enum tree_code code)
7204 switch (code)
7206 case PLUS_EXPR:
7207 case MINUS_EXPR:
7208 case NEGATE_EXPR:
7209 return 4;
7211 case BIT_AND_EXPR:
7212 case BIT_IOR_EXPR:
7213 case BIT_XOR_EXPR:
7214 case BIT_NOT_EXPR:
7215 return 2;
7217 default:
7218 return INT_MAX;
7222 /* Return true if VINFO indicates we are doing loop vectorization and if
7223 it is worth decomposing CODE operations into scalar operations for
7224 that loop's vectorization factor. */
7226 bool
7227 vect_worthwhile_without_simd_p (vec_info *vinfo, tree_code code)
7229 loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo);
7230 unsigned HOST_WIDE_INT value;
7231 return (loop_vinfo
7232 && LOOP_VINFO_VECT_FACTOR (loop_vinfo).is_constant (&value)
7233 && value >= vect_min_worthwhile_factor (code));
7236 /* Function vectorizable_induction
7238 Check if PHI performs an induction computation that can be vectorized.
7239 If VEC_STMT is also passed, vectorize the induction PHI: create a vectorized
7240 phi to replace it, put it in VEC_STMT, and add it to the same basic block.
7241 Return FALSE if not a vectorizable STMT, TRUE otherwise. */
7243 bool
7244 vectorizable_induction (gimple *phi,
7245 gimple_stmt_iterator *gsi ATTRIBUTE_UNUSED,
7246 stmt_vec_info *vec_stmt, slp_tree slp_node,
7247 stmt_vector_for_cost *cost_vec)
7249 stmt_vec_info stmt_info = vinfo_for_stmt (phi);
7250 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
7251 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
7252 unsigned ncopies;
7253 bool nested_in_vect_loop = false;
7254 struct loop *iv_loop;
7255 tree vec_def;
7256 edge pe = loop_preheader_edge (loop);
7257 basic_block new_bb;
7258 tree new_vec, vec_init, vec_step, t;
7259 tree new_name;
7260 gimple *new_stmt;
7261 gphi *induction_phi;
7262 tree induc_def, vec_dest;
7263 tree init_expr, step_expr;
7264 poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
7265 unsigned i;
7266 tree expr;
7267 gimple_seq stmts;
7268 imm_use_iterator imm_iter;
7269 use_operand_p use_p;
7270 gimple *exit_phi;
7271 edge latch_e;
7272 tree loop_arg;
7273 gimple_stmt_iterator si;
7274 basic_block bb = gimple_bb (phi);
7276 if (gimple_code (phi) != GIMPLE_PHI)
7277 return false;
7279 if (!STMT_VINFO_RELEVANT_P (stmt_info))
7280 return false;
7282 /* Make sure it was recognized as induction computation. */
7283 if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_induction_def)
7284 return false;
7286 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
7287 poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (vectype);
7289 if (slp_node)
7290 ncopies = 1;
7291 else
7292 ncopies = vect_get_num_copies (loop_vinfo, vectype);
7293 gcc_assert (ncopies >= 1);
7295 /* FORNOW. These restrictions should be relaxed. */
7296 if (nested_in_vect_loop_p (loop, phi))
7298 imm_use_iterator imm_iter;
7299 use_operand_p use_p;
7300 gimple *exit_phi;
7301 edge latch_e;
7302 tree loop_arg;
7304 if (ncopies > 1)
7306 if (dump_enabled_p ())
7307 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7308 "multiple types in nested loop.\n");
7309 return false;
7312 /* FORNOW: outer loop induction with SLP not supported. */
7313 if (STMT_SLP_TYPE (stmt_info))
7314 return false;
7316 exit_phi = NULL;
7317 latch_e = loop_latch_edge (loop->inner);
7318 loop_arg = PHI_ARG_DEF_FROM_EDGE (phi, latch_e);
7319 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, loop_arg)
7321 gimple *use_stmt = USE_STMT (use_p);
7322 if (is_gimple_debug (use_stmt))
7323 continue;
7325 if (!flow_bb_inside_loop_p (loop->inner, gimple_bb (use_stmt)))
7327 exit_phi = use_stmt;
7328 break;
7331 if (exit_phi)
7333 stmt_vec_info exit_phi_vinfo = loop_vinfo->lookup_stmt (exit_phi);
7334 if (!(STMT_VINFO_RELEVANT_P (exit_phi_vinfo)
7335 && !STMT_VINFO_LIVE_P (exit_phi_vinfo)))
7337 if (dump_enabled_p ())
7338 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7339 "inner-loop induction only used outside "
7340 "of the outer vectorized loop.\n");
7341 return false;
7345 nested_in_vect_loop = true;
7346 iv_loop = loop->inner;
7348 else
7349 iv_loop = loop;
7350 gcc_assert (iv_loop == (gimple_bb (phi))->loop_father);
7352 if (slp_node && !nunits.is_constant ())
7354 /* The current SLP code creates the initial value element-by-element. */
7355 if (dump_enabled_p ())
7356 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7357 "SLP induction not supported for variable-length"
7358 " vectors.\n");
7359 return false;
7362 if (!vec_stmt) /* transformation not required. */
7364 STMT_VINFO_TYPE (stmt_info) = induc_vec_info_type;
7365 DUMP_VECT_SCOPE ("vectorizable_induction");
7366 vect_model_induction_cost (stmt_info, ncopies, cost_vec);
7367 return true;
7370 /* Transform. */
7372 /* Compute a vector variable, initialized with the first VF values of
7373 the induction variable. E.g., for an iv with IV_PHI='X' and
7374 evolution S, for a vector of 4 units, we want to compute:
7375 [X, X + S, X + 2*S, X + 3*S]. */
7377 if (dump_enabled_p ())
7378 dump_printf_loc (MSG_NOTE, vect_location, "transform induction phi.\n");
7380 latch_e = loop_latch_edge (iv_loop);
7381 loop_arg = PHI_ARG_DEF_FROM_EDGE (phi, latch_e);
7383 step_expr = STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_info);
7384 gcc_assert (step_expr != NULL_TREE);
7386 pe = loop_preheader_edge (iv_loop);
7387 init_expr = PHI_ARG_DEF_FROM_EDGE (phi,
7388 loop_preheader_edge (iv_loop));
7390 stmts = NULL;
7391 if (!nested_in_vect_loop)
7393 /* Convert the initial value to the desired type. */
7394 tree new_type = TREE_TYPE (vectype);
7395 init_expr = gimple_convert (&stmts, new_type, init_expr);
7397 /* If we are using the loop mask to "peel" for alignment then we need
7398 to adjust the start value here. */
7399 tree skip_niters = LOOP_VINFO_MASK_SKIP_NITERS (loop_vinfo);
7400 if (skip_niters != NULL_TREE)
7402 if (FLOAT_TYPE_P (vectype))
7403 skip_niters = gimple_build (&stmts, FLOAT_EXPR, new_type,
7404 skip_niters);
7405 else
7406 skip_niters = gimple_convert (&stmts, new_type, skip_niters);
7407 tree skip_step = gimple_build (&stmts, MULT_EXPR, new_type,
7408 skip_niters, step_expr);
7409 init_expr = gimple_build (&stmts, MINUS_EXPR, new_type,
7410 init_expr, skip_step);
7414 /* Convert the step to the desired type. */
7415 step_expr = gimple_convert (&stmts, TREE_TYPE (vectype), step_expr);
7417 if (stmts)
7419 new_bb = gsi_insert_seq_on_edge_immediate (pe, stmts);
7420 gcc_assert (!new_bb);
7423 /* Find the first insertion point in the BB. */
7424 si = gsi_after_labels (bb);
7426 /* For SLP induction we have to generate several IVs as for example
7427 with group size 3 we need [i, i, i, i + S] [i + S, i + S, i + 2*S, i + 2*S]
7428 [i + 2*S, i + 3*S, i + 3*S, i + 3*S]. The step is the same uniform
7429 [VF*S, VF*S, VF*S, VF*S] for all. */
7430 if (slp_node)
7432 /* Enforced above. */
7433 unsigned int const_nunits = nunits.to_constant ();
7435 /* Generate [VF*S, VF*S, ... ]. */
7436 if (SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr)))
7438 expr = build_int_cst (integer_type_node, vf);
7439 expr = fold_convert (TREE_TYPE (step_expr), expr);
7441 else
7442 expr = build_int_cst (TREE_TYPE (step_expr), vf);
7443 new_name = fold_build2 (MULT_EXPR, TREE_TYPE (step_expr),
7444 expr, step_expr);
7445 if (! CONSTANT_CLASS_P (new_name))
7446 new_name = vect_init_vector (phi, new_name,
7447 TREE_TYPE (step_expr), NULL);
7448 new_vec = build_vector_from_val (vectype, new_name);
7449 vec_step = vect_init_vector (phi, new_vec, vectype, NULL);
7451 /* Now generate the IVs. */
7452 unsigned group_size = SLP_TREE_SCALAR_STMTS (slp_node).length ();
7453 unsigned nvects = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node);
7454 unsigned elts = const_nunits * nvects;
7455 unsigned nivs = least_common_multiple (group_size,
7456 const_nunits) / const_nunits;
7457 gcc_assert (elts % group_size == 0);
7458 tree elt = init_expr;
7459 unsigned ivn;
7460 for (ivn = 0; ivn < nivs; ++ivn)
7462 tree_vector_builder elts (vectype, const_nunits, 1);
7463 stmts = NULL;
7464 for (unsigned eltn = 0; eltn < const_nunits; ++eltn)
7466 if (ivn*const_nunits + eltn >= group_size
7467 && (ivn * const_nunits + eltn) % group_size == 0)
7468 elt = gimple_build (&stmts, PLUS_EXPR, TREE_TYPE (elt),
7469 elt, step_expr);
7470 elts.quick_push (elt);
7472 vec_init = gimple_build_vector (&stmts, &elts);
7473 if (stmts)
7475 new_bb = gsi_insert_seq_on_edge_immediate (pe, stmts);
7476 gcc_assert (!new_bb);
7479 /* Create the induction-phi that defines the induction-operand. */
7480 vec_dest = vect_get_new_vect_var (vectype, vect_simple_var, "vec_iv_");
7481 induction_phi = create_phi_node (vec_dest, iv_loop->header);
7482 stmt_vec_info induction_phi_info
7483 = loop_vinfo->add_stmt (induction_phi);
7484 induc_def = PHI_RESULT (induction_phi);
7486 /* Create the iv update inside the loop */
7487 vec_def = make_ssa_name (vec_dest);
7488 new_stmt = gimple_build_assign (vec_def, PLUS_EXPR, induc_def, vec_step);
7489 gsi_insert_before (&si, new_stmt, GSI_SAME_STMT);
7490 loop_vinfo->add_stmt (new_stmt);
7492 /* Set the arguments of the phi node: */
7493 add_phi_arg (induction_phi, vec_init, pe, UNKNOWN_LOCATION);
7494 add_phi_arg (induction_phi, vec_def, loop_latch_edge (iv_loop),
7495 UNKNOWN_LOCATION);
7497 SLP_TREE_VEC_STMTS (slp_node).quick_push (induction_phi_info);
7500 /* Re-use IVs when we can. */
7501 if (ivn < nvects)
7503 unsigned vfp
7504 = least_common_multiple (group_size, const_nunits) / group_size;
7505 /* Generate [VF'*S, VF'*S, ... ]. */
7506 if (SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr)))
7508 expr = build_int_cst (integer_type_node, vfp);
7509 expr = fold_convert (TREE_TYPE (step_expr), expr);
7511 else
7512 expr = build_int_cst (TREE_TYPE (step_expr), vfp);
7513 new_name = fold_build2 (MULT_EXPR, TREE_TYPE (step_expr),
7514 expr, step_expr);
7515 if (! CONSTANT_CLASS_P (new_name))
7516 new_name = vect_init_vector (phi, new_name,
7517 TREE_TYPE (step_expr), NULL);
7518 new_vec = build_vector_from_val (vectype, new_name);
7519 vec_step = vect_init_vector (phi, new_vec, vectype, NULL);
7520 for (; ivn < nvects; ++ivn)
7522 gimple *iv = SLP_TREE_VEC_STMTS (slp_node)[ivn - nivs]->stmt;
7523 tree def;
7524 if (gimple_code (iv) == GIMPLE_PHI)
7525 def = gimple_phi_result (iv);
7526 else
7527 def = gimple_assign_lhs (iv);
7528 new_stmt = gimple_build_assign (make_ssa_name (vectype),
7529 PLUS_EXPR,
7530 def, vec_step);
7531 if (gimple_code (iv) == GIMPLE_PHI)
7532 gsi_insert_before (&si, new_stmt, GSI_SAME_STMT);
7533 else
7535 gimple_stmt_iterator tgsi = gsi_for_stmt (iv);
7536 gsi_insert_after (&tgsi, new_stmt, GSI_CONTINUE_LINKING);
7538 SLP_TREE_VEC_STMTS (slp_node).quick_push
7539 (loop_vinfo->add_stmt (new_stmt));
7543 return true;
7546 /* Create the vector that holds the initial_value of the induction. */
7547 if (nested_in_vect_loop)
7549 /* iv_loop is nested in the loop to be vectorized. init_expr had already
7550 been created during vectorization of previous stmts. We obtain it
7551 from the STMT_VINFO_VEC_STMT of the defining stmt. */
7552 vec_init = vect_get_vec_def_for_operand (init_expr, phi);
7553 /* If the initial value is not of proper type, convert it. */
7554 if (!useless_type_conversion_p (vectype, TREE_TYPE (vec_init)))
7556 new_stmt
7557 = gimple_build_assign (vect_get_new_ssa_name (vectype,
7558 vect_simple_var,
7559 "vec_iv_"),
7560 VIEW_CONVERT_EXPR,
7561 build1 (VIEW_CONVERT_EXPR, vectype,
7562 vec_init));
7563 vec_init = gimple_assign_lhs (new_stmt);
7564 new_bb = gsi_insert_on_edge_immediate (loop_preheader_edge (iv_loop),
7565 new_stmt);
7566 gcc_assert (!new_bb);
7567 loop_vinfo->add_stmt (new_stmt);
7570 else
7572 /* iv_loop is the loop to be vectorized. Create:
7573 vec_init = [X, X+S, X+2*S, X+3*S] (S = step_expr, X = init_expr) */
7574 stmts = NULL;
7575 new_name = gimple_convert (&stmts, TREE_TYPE (vectype), init_expr);
7577 unsigned HOST_WIDE_INT const_nunits;
7578 if (nunits.is_constant (&const_nunits))
7580 tree_vector_builder elts (vectype, const_nunits, 1);
7581 elts.quick_push (new_name);
7582 for (i = 1; i < const_nunits; i++)
7584 /* Create: new_name_i = new_name + step_expr */
7585 new_name = gimple_build (&stmts, PLUS_EXPR, TREE_TYPE (new_name),
7586 new_name, step_expr);
7587 elts.quick_push (new_name);
7589 /* Create a vector from [new_name_0, new_name_1, ...,
7590 new_name_nunits-1] */
7591 vec_init = gimple_build_vector (&stmts, &elts);
7593 else if (INTEGRAL_TYPE_P (TREE_TYPE (step_expr)))
7594 /* Build the initial value directly from a VEC_SERIES_EXPR. */
7595 vec_init = gimple_build (&stmts, VEC_SERIES_EXPR, vectype,
7596 new_name, step_expr);
7597 else
7599 /* Build:
7600 [base, base, base, ...]
7601 + (vectype) [0, 1, 2, ...] * [step, step, step, ...]. */
7602 gcc_assert (SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr)));
7603 gcc_assert (flag_associative_math);
7604 tree index = build_index_vector (vectype, 0, 1);
7605 tree base_vec = gimple_build_vector_from_val (&stmts, vectype,
7606 new_name);
7607 tree step_vec = gimple_build_vector_from_val (&stmts, vectype,
7608 step_expr);
7609 vec_init = gimple_build (&stmts, FLOAT_EXPR, vectype, index);
7610 vec_init = gimple_build (&stmts, MULT_EXPR, vectype,
7611 vec_init, step_vec);
7612 vec_init = gimple_build (&stmts, PLUS_EXPR, vectype,
7613 vec_init, base_vec);
7616 if (stmts)
7618 new_bb = gsi_insert_seq_on_edge_immediate (pe, stmts);
7619 gcc_assert (!new_bb);
7624 /* Create the vector that holds the step of the induction. */
7625 if (nested_in_vect_loop)
7626 /* iv_loop is nested in the loop to be vectorized. Generate:
7627 vec_step = [S, S, S, S] */
7628 new_name = step_expr;
7629 else
7631 /* iv_loop is the loop to be vectorized. Generate:
7632 vec_step = [VF*S, VF*S, VF*S, VF*S] */
7633 gimple_seq seq = NULL;
7634 if (SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr)))
7636 expr = build_int_cst (integer_type_node, vf);
7637 expr = gimple_build (&seq, FLOAT_EXPR, TREE_TYPE (step_expr), expr);
7639 else
7640 expr = build_int_cst (TREE_TYPE (step_expr), vf);
7641 new_name = gimple_build (&seq, MULT_EXPR, TREE_TYPE (step_expr),
7642 expr, step_expr);
7643 if (seq)
7645 new_bb = gsi_insert_seq_on_edge_immediate (pe, seq);
7646 gcc_assert (!new_bb);
7650 t = unshare_expr (new_name);
7651 gcc_assert (CONSTANT_CLASS_P (new_name)
7652 || TREE_CODE (new_name) == SSA_NAME);
7653 new_vec = build_vector_from_val (vectype, t);
7654 vec_step = vect_init_vector (phi, new_vec, vectype, NULL);
7657 /* Create the following def-use cycle:
7658 loop prolog:
7659 vec_init = ...
7660 vec_step = ...
7661 loop:
7662 vec_iv = PHI <vec_init, vec_loop>
7664 STMT
7666 vec_loop = vec_iv + vec_step; */
7668 /* Create the induction-phi that defines the induction-operand. */
7669 vec_dest = vect_get_new_vect_var (vectype, vect_simple_var, "vec_iv_");
7670 induction_phi = create_phi_node (vec_dest, iv_loop->header);
7671 stmt_vec_info induction_phi_info = loop_vinfo->add_stmt (induction_phi);
7672 induc_def = PHI_RESULT (induction_phi);
7674 /* Create the iv update inside the loop */
7675 vec_def = make_ssa_name (vec_dest);
7676 new_stmt = gimple_build_assign (vec_def, PLUS_EXPR, induc_def, vec_step);
7677 gsi_insert_before (&si, new_stmt, GSI_SAME_STMT);
7678 stmt_vec_info new_stmt_info = loop_vinfo->add_stmt (new_stmt);
7680 /* Set the arguments of the phi node: */
7681 add_phi_arg (induction_phi, vec_init, pe, UNKNOWN_LOCATION);
7682 add_phi_arg (induction_phi, vec_def, loop_latch_edge (iv_loop),
7683 UNKNOWN_LOCATION);
7685 STMT_VINFO_VEC_STMT (stmt_info) = *vec_stmt = induction_phi_info;
7687 /* In case that vectorization factor (VF) is bigger than the number
7688 of elements that we can fit in a vectype (nunits), we have to generate
7689 more than one vector stmt - i.e - we need to "unroll" the
7690 vector stmt by a factor VF/nunits. For more details see documentation
7691 in vectorizable_operation. */
7693 if (ncopies > 1)
7695 gimple_seq seq = NULL;
7696 stmt_vec_info prev_stmt_vinfo;
7697 /* FORNOW. This restriction should be relaxed. */
7698 gcc_assert (!nested_in_vect_loop);
7700 /* Create the vector that holds the step of the induction. */
7701 if (SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr)))
7703 expr = build_int_cst (integer_type_node, nunits);
7704 expr = gimple_build (&seq, FLOAT_EXPR, TREE_TYPE (step_expr), expr);
7706 else
7707 expr = build_int_cst (TREE_TYPE (step_expr), nunits);
7708 new_name = gimple_build (&seq, MULT_EXPR, TREE_TYPE (step_expr),
7709 expr, step_expr);
7710 if (seq)
7712 new_bb = gsi_insert_seq_on_edge_immediate (pe, seq);
7713 gcc_assert (!new_bb);
7716 t = unshare_expr (new_name);
7717 gcc_assert (CONSTANT_CLASS_P (new_name)
7718 || TREE_CODE (new_name) == SSA_NAME);
7719 new_vec = build_vector_from_val (vectype, t);
7720 vec_step = vect_init_vector (phi, new_vec, vectype, NULL);
7722 vec_def = induc_def;
7723 prev_stmt_vinfo = induction_phi_info;
7724 for (i = 1; i < ncopies; i++)
7726 /* vec_i = vec_prev + vec_step */
7727 new_stmt = gimple_build_assign (vec_dest, PLUS_EXPR,
7728 vec_def, vec_step);
7729 vec_def = make_ssa_name (vec_dest, new_stmt);
7730 gimple_assign_set_lhs (new_stmt, vec_def);
7732 gsi_insert_before (&si, new_stmt, GSI_SAME_STMT);
7733 new_stmt_info = loop_vinfo->add_stmt (new_stmt);
7734 STMT_VINFO_RELATED_STMT (prev_stmt_vinfo) = new_stmt_info;
7735 prev_stmt_vinfo = new_stmt_info;
7739 if (nested_in_vect_loop)
7741 /* Find the loop-closed exit-phi of the induction, and record
7742 the final vector of induction results: */
7743 exit_phi = NULL;
7744 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, loop_arg)
7746 gimple *use_stmt = USE_STMT (use_p);
7747 if (is_gimple_debug (use_stmt))
7748 continue;
7750 if (!flow_bb_inside_loop_p (iv_loop, gimple_bb (use_stmt)))
7752 exit_phi = use_stmt;
7753 break;
7756 if (exit_phi)
7758 stmt_vec_info stmt_vinfo = loop_vinfo->lookup_stmt (exit_phi);
7759 /* FORNOW. Currently not supporting the case that an inner-loop induction
7760 is not used in the outer-loop (i.e. only outside the outer-loop). */
7761 gcc_assert (STMT_VINFO_RELEVANT_P (stmt_vinfo)
7762 && !STMT_VINFO_LIVE_P (stmt_vinfo));
7764 STMT_VINFO_VEC_STMT (stmt_vinfo) = new_stmt_info;
7765 if (dump_enabled_p ())
7767 dump_printf_loc (MSG_NOTE, vect_location,
7768 "vector of inductions after inner-loop:");
7769 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, new_stmt, 0);
7775 if (dump_enabled_p ())
7777 dump_printf_loc (MSG_NOTE, vect_location,
7778 "transform induction: created def-use cycle: ");
7779 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, induction_phi, 0);
7780 dump_gimple_stmt (MSG_NOTE, TDF_SLIM,
7781 SSA_NAME_DEF_STMT (vec_def), 0);
7784 return true;
7787 /* Function vectorizable_live_operation.
7789 STMT computes a value that is used outside the loop. Check if
7790 it can be supported. */
7792 bool
7793 vectorizable_live_operation (gimple *stmt,
7794 gimple_stmt_iterator *gsi ATTRIBUTE_UNUSED,
7795 slp_tree slp_node, int slp_index,
7796 stmt_vec_info *vec_stmt,
7797 stmt_vector_for_cost *)
7799 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
7800 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
7801 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
7802 imm_use_iterator imm_iter;
7803 tree lhs, lhs_type, bitsize, vec_bitsize;
7804 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
7805 poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (vectype);
7806 int ncopies;
7807 gimple *use_stmt;
7808 auto_vec<tree> vec_oprnds;
7809 int vec_entry = 0;
7810 poly_uint64 vec_index = 0;
7812 gcc_assert (STMT_VINFO_LIVE_P (stmt_info));
7814 if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_reduction_def)
7815 return false;
7817 /* FORNOW. CHECKME. */
7818 if (nested_in_vect_loop_p (loop, stmt))
7819 return false;
7821 /* If STMT is not relevant and it is a simple assignment and its inputs are
7822 invariant then it can remain in place, unvectorized. The original last
7823 scalar value that it computes will be used. */
7824 if (!STMT_VINFO_RELEVANT_P (stmt_info))
7826 gcc_assert (is_simple_and_all_uses_invariant (stmt, loop_vinfo));
7827 if (dump_enabled_p ())
7828 dump_printf_loc (MSG_NOTE, vect_location,
7829 "statement is simple and uses invariant. Leaving in "
7830 "place.\n");
7831 return true;
7834 if (slp_node)
7835 ncopies = 1;
7836 else
7837 ncopies = vect_get_num_copies (loop_vinfo, vectype);
7839 if (slp_node)
7841 gcc_assert (slp_index >= 0);
7843 int num_scalar = SLP_TREE_SCALAR_STMTS (slp_node).length ();
7844 int num_vec = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node);
7846 /* Get the last occurrence of the scalar index from the concatenation of
7847 all the slp vectors. Calculate which slp vector it is and the index
7848 within. */
7849 poly_uint64 pos = (num_vec * nunits) - num_scalar + slp_index;
7851 /* Calculate which vector contains the result, and which lane of
7852 that vector we need. */
7853 if (!can_div_trunc_p (pos, nunits, &vec_entry, &vec_index))
7855 if (dump_enabled_p ())
7856 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7857 "Cannot determine which vector holds the"
7858 " final result.\n");
7859 return false;
7863 if (!vec_stmt)
7865 /* No transformation required. */
7866 if (LOOP_VINFO_CAN_FULLY_MASK_P (loop_vinfo))
7868 if (!direct_internal_fn_supported_p (IFN_EXTRACT_LAST, vectype,
7869 OPTIMIZE_FOR_SPEED))
7871 if (dump_enabled_p ())
7872 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7873 "can't use a fully-masked loop because "
7874 "the target doesn't support extract last "
7875 "reduction.\n");
7876 LOOP_VINFO_CAN_FULLY_MASK_P (loop_vinfo) = false;
7878 else if (slp_node)
7880 if (dump_enabled_p ())
7881 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7882 "can't use a fully-masked loop because an "
7883 "SLP statement is live after the loop.\n");
7884 LOOP_VINFO_CAN_FULLY_MASK_P (loop_vinfo) = false;
7886 else if (ncopies > 1)
7888 if (dump_enabled_p ())
7889 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7890 "can't use a fully-masked loop because"
7891 " ncopies is greater than 1.\n");
7892 LOOP_VINFO_CAN_FULLY_MASK_P (loop_vinfo) = false;
7894 else
7896 gcc_assert (ncopies == 1 && !slp_node);
7897 vect_record_loop_mask (loop_vinfo,
7898 &LOOP_VINFO_MASKS (loop_vinfo),
7899 1, vectype);
7902 return true;
7905 /* If stmt has a related stmt, then use that for getting the lhs. */
7906 if (is_pattern_stmt_p (stmt_info))
7907 stmt = STMT_VINFO_RELATED_STMT (stmt_info);
7909 lhs = (is_a <gphi *> (stmt)) ? gimple_phi_result (stmt)
7910 : gimple_get_lhs (stmt);
7911 lhs_type = TREE_TYPE (lhs);
7913 bitsize = (VECTOR_BOOLEAN_TYPE_P (vectype)
7914 ? bitsize_int (TYPE_PRECISION (TREE_TYPE (vectype)))
7915 : TYPE_SIZE (TREE_TYPE (vectype)));
7916 vec_bitsize = TYPE_SIZE (vectype);
7918 /* Get the vectorized lhs of STMT and the lane to use (counted in bits). */
7919 tree vec_lhs, bitstart;
7920 if (slp_node)
7922 gcc_assert (!LOOP_VINFO_FULLY_MASKED_P (loop_vinfo));
7924 /* Get the correct slp vectorized stmt. */
7925 gimple *vec_stmt = SLP_TREE_VEC_STMTS (slp_node)[vec_entry]->stmt;
7926 if (gphi *phi = dyn_cast <gphi *> (vec_stmt))
7927 vec_lhs = gimple_phi_result (phi);
7928 else
7929 vec_lhs = gimple_get_lhs (vec_stmt);
7931 /* Get entry to use. */
7932 bitstart = bitsize_int (vec_index);
7933 bitstart = int_const_binop (MULT_EXPR, bitsize, bitstart);
7935 else
7937 enum vect_def_type dt = STMT_VINFO_DEF_TYPE (stmt_info);
7938 vec_lhs = vect_get_vec_def_for_operand_1 (stmt_info, dt);
7939 gcc_checking_assert (ncopies == 1
7940 || !LOOP_VINFO_FULLY_MASKED_P (loop_vinfo));
7942 /* For multiple copies, get the last copy. */
7943 for (int i = 1; i < ncopies; ++i)
7944 vec_lhs = vect_get_vec_def_for_stmt_copy (vect_unknown_def_type,
7945 vec_lhs);
7947 /* Get the last lane in the vector. */
7948 bitstart = int_const_binop (MINUS_EXPR, vec_bitsize, bitsize);
7951 gimple_seq stmts = NULL;
7952 tree new_tree;
7953 if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
7955 /* Emit:
7957 SCALAR_RES = EXTRACT_LAST <VEC_LHS, MASK>
7959 where VEC_LHS is the vectorized live-out result and MASK is
7960 the loop mask for the final iteration. */
7961 gcc_assert (ncopies == 1 && !slp_node);
7962 tree scalar_type = TREE_TYPE (STMT_VINFO_VECTYPE (stmt_info));
7963 tree mask = vect_get_loop_mask (gsi, &LOOP_VINFO_MASKS (loop_vinfo),
7964 1, vectype, 0);
7965 tree scalar_res = gimple_build (&stmts, CFN_EXTRACT_LAST,
7966 scalar_type, mask, vec_lhs);
7968 /* Convert the extracted vector element to the required scalar type. */
7969 new_tree = gimple_convert (&stmts, lhs_type, scalar_res);
7971 else
7973 tree bftype = TREE_TYPE (vectype);
7974 if (VECTOR_BOOLEAN_TYPE_P (vectype))
7975 bftype = build_nonstandard_integer_type (tree_to_uhwi (bitsize), 1);
7976 new_tree = build3 (BIT_FIELD_REF, bftype, vec_lhs, bitsize, bitstart);
7977 new_tree = force_gimple_operand (fold_convert (lhs_type, new_tree),
7978 &stmts, true, NULL_TREE);
7981 if (stmts)
7982 gsi_insert_seq_on_edge_immediate (single_exit (loop), stmts);
7984 /* Replace use of lhs with newly computed result. If the use stmt is a
7985 single arg PHI, just replace all uses of PHI result. It's necessary
7986 because lcssa PHI defining lhs may be before newly inserted stmt. */
7987 use_operand_p use_p;
7988 FOR_EACH_IMM_USE_STMT (use_stmt, imm_iter, lhs)
7989 if (!flow_bb_inside_loop_p (loop, gimple_bb (use_stmt))
7990 && !is_gimple_debug (use_stmt))
7992 if (gimple_code (use_stmt) == GIMPLE_PHI
7993 && gimple_phi_num_args (use_stmt) == 1)
7995 replace_uses_by (gimple_phi_result (use_stmt), new_tree);
7997 else
7999 FOR_EACH_IMM_USE_ON_STMT (use_p, imm_iter)
8000 SET_USE (use_p, new_tree);
8002 update_stmt (use_stmt);
8005 return true;
8008 /* Kill any debug uses outside LOOP of SSA names defined in STMT. */
8010 static void
8011 vect_loop_kill_debug_uses (struct loop *loop, gimple *stmt)
8013 ssa_op_iter op_iter;
8014 imm_use_iterator imm_iter;
8015 def_operand_p def_p;
8016 gimple *ustmt;
8018 FOR_EACH_PHI_OR_STMT_DEF (def_p, stmt, op_iter, SSA_OP_DEF)
8020 FOR_EACH_IMM_USE_STMT (ustmt, imm_iter, DEF_FROM_PTR (def_p))
8022 basic_block bb;
8024 if (!is_gimple_debug (ustmt))
8025 continue;
8027 bb = gimple_bb (ustmt);
8029 if (!flow_bb_inside_loop_p (loop, bb))
8031 if (gimple_debug_bind_p (ustmt))
8033 if (dump_enabled_p ())
8034 dump_printf_loc (MSG_NOTE, vect_location,
8035 "killing debug use\n");
8037 gimple_debug_bind_reset_value (ustmt);
8038 update_stmt (ustmt);
8040 else
8041 gcc_unreachable ();
8047 /* Given loop represented by LOOP_VINFO, return true if computation of
8048 LOOP_VINFO_NITERS (= LOOP_VINFO_NITERSM1 + 1) doesn't overflow, false
8049 otherwise. */
8051 static bool
8052 loop_niters_no_overflow (loop_vec_info loop_vinfo)
8054 /* Constant case. */
8055 if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
8057 tree cst_niters = LOOP_VINFO_NITERS (loop_vinfo);
8058 tree cst_nitersm1 = LOOP_VINFO_NITERSM1 (loop_vinfo);
8060 gcc_assert (TREE_CODE (cst_niters) == INTEGER_CST);
8061 gcc_assert (TREE_CODE (cst_nitersm1) == INTEGER_CST);
8062 if (wi::to_widest (cst_nitersm1) < wi::to_widest (cst_niters))
8063 return true;
8066 widest_int max;
8067 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
8068 /* Check the upper bound of loop niters. */
8069 if (get_max_loop_iterations (loop, &max))
8071 tree type = TREE_TYPE (LOOP_VINFO_NITERS (loop_vinfo));
8072 signop sgn = TYPE_SIGN (type);
8073 widest_int type_max = widest_int::from (wi::max_value (type), sgn);
8074 if (max < type_max)
8075 return true;
8077 return false;
8080 /* Return a mask type with half the number of elements as TYPE. */
8082 tree
8083 vect_halve_mask_nunits (tree type)
8085 poly_uint64 nunits = exact_div (TYPE_VECTOR_SUBPARTS (type), 2);
8086 return build_truth_vector_type (nunits, current_vector_size);
8089 /* Return a mask type with twice as many elements as TYPE. */
8091 tree
8092 vect_double_mask_nunits (tree type)
8094 poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (type) * 2;
8095 return build_truth_vector_type (nunits, current_vector_size);
8098 /* Record that a fully-masked version of LOOP_VINFO would need MASKS to
8099 contain a sequence of NVECTORS masks that each control a vector of type
8100 VECTYPE. */
8102 void
8103 vect_record_loop_mask (loop_vec_info loop_vinfo, vec_loop_masks *masks,
8104 unsigned int nvectors, tree vectype)
8106 gcc_assert (nvectors != 0);
8107 if (masks->length () < nvectors)
8108 masks->safe_grow_cleared (nvectors);
8109 rgroup_masks *rgm = &(*masks)[nvectors - 1];
8110 /* The number of scalars per iteration and the number of vectors are
8111 both compile-time constants. */
8112 unsigned int nscalars_per_iter
8113 = exact_div (nvectors * TYPE_VECTOR_SUBPARTS (vectype),
8114 LOOP_VINFO_VECT_FACTOR (loop_vinfo)).to_constant ();
8115 if (rgm->max_nscalars_per_iter < nscalars_per_iter)
8117 rgm->max_nscalars_per_iter = nscalars_per_iter;
8118 rgm->mask_type = build_same_sized_truth_vector_type (vectype);
8122 /* Given a complete set of masks MASKS, extract mask number INDEX
8123 for an rgroup that operates on NVECTORS vectors of type VECTYPE,
8124 where 0 <= INDEX < NVECTORS. Insert any set-up statements before GSI.
8126 See the comment above vec_loop_masks for more details about the mask
8127 arrangement. */
8129 tree
8130 vect_get_loop_mask (gimple_stmt_iterator *gsi, vec_loop_masks *masks,
8131 unsigned int nvectors, tree vectype, unsigned int index)
8133 rgroup_masks *rgm = &(*masks)[nvectors - 1];
8134 tree mask_type = rgm->mask_type;
8136 /* Populate the rgroup's mask array, if this is the first time we've
8137 used it. */
8138 if (rgm->masks.is_empty ())
8140 rgm->masks.safe_grow_cleared (nvectors);
8141 for (unsigned int i = 0; i < nvectors; ++i)
8143 tree mask = make_temp_ssa_name (mask_type, NULL, "loop_mask");
8144 /* Provide a dummy definition until the real one is available. */
8145 SSA_NAME_DEF_STMT (mask) = gimple_build_nop ();
8146 rgm->masks[i] = mask;
8150 tree mask = rgm->masks[index];
8151 if (maybe_ne (TYPE_VECTOR_SUBPARTS (mask_type),
8152 TYPE_VECTOR_SUBPARTS (vectype)))
8154 /* A loop mask for data type X can be reused for data type Y
8155 if X has N times more elements than Y and if Y's elements
8156 are N times bigger than X's. In this case each sequence
8157 of N elements in the loop mask will be all-zero or all-one.
8158 We can then view-convert the mask so that each sequence of
8159 N elements is replaced by a single element. */
8160 gcc_assert (multiple_p (TYPE_VECTOR_SUBPARTS (mask_type),
8161 TYPE_VECTOR_SUBPARTS (vectype)));
8162 gimple_seq seq = NULL;
8163 mask_type = build_same_sized_truth_vector_type (vectype);
8164 mask = gimple_build (&seq, VIEW_CONVERT_EXPR, mask_type, mask);
8165 if (seq)
8166 gsi_insert_seq_before (gsi, seq, GSI_SAME_STMT);
8168 return mask;
8171 /* Scale profiling counters by estimation for LOOP which is vectorized
8172 by factor VF. */
8174 static void
8175 scale_profile_for_vect_loop (struct loop *loop, unsigned vf)
8177 edge preheader = loop_preheader_edge (loop);
8178 /* Reduce loop iterations by the vectorization factor. */
8179 gcov_type new_est_niter = niter_for_unrolled_loop (loop, vf);
8180 profile_count freq_h = loop->header->count, freq_e = preheader->count ();
8182 if (freq_h.nonzero_p ())
8184 profile_probability p;
8186 /* Avoid dropping loop body profile counter to 0 because of zero count
8187 in loop's preheader. */
8188 if (!(freq_e == profile_count::zero ()))
8189 freq_e = freq_e.force_nonzero ();
8190 p = freq_e.apply_scale (new_est_niter + 1, 1).probability_in (freq_h);
8191 scale_loop_frequencies (loop, p);
8194 edge exit_e = single_exit (loop);
8195 exit_e->probability = profile_probability::always ()
8196 .apply_scale (1, new_est_niter + 1);
8198 edge exit_l = single_pred_edge (loop->latch);
8199 profile_probability prob = exit_l->probability;
8200 exit_l->probability = exit_e->probability.invert ();
8201 if (prob.initialized_p () && exit_l->probability.initialized_p ())
8202 scale_bbs_frequencies (&loop->latch, 1, exit_l->probability / prob);
8205 /* Vectorize STMT if relevant, inserting any new instructions before GSI.
8206 When vectorizing STMT as a store, set *SEEN_STORE to its stmt_vec_info.
8207 *SLP_SCHEDULE is a running record of whether we have called
8208 vect_schedule_slp. */
8210 static void
8211 vect_transform_loop_stmt (loop_vec_info loop_vinfo, gimple *stmt,
8212 gimple_stmt_iterator *gsi,
8213 stmt_vec_info *seen_store, bool *slp_scheduled)
8215 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
8216 poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
8217 stmt_vec_info stmt_info = loop_vinfo->lookup_stmt (stmt);
8218 if (!stmt_info)
8219 return;
8221 if (dump_enabled_p ())
8223 dump_printf_loc (MSG_NOTE, vect_location,
8224 "------>vectorizing statement: ");
8225 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, stmt, 0);
8228 if (MAY_HAVE_DEBUG_BIND_STMTS && !STMT_VINFO_LIVE_P (stmt_info))
8229 vect_loop_kill_debug_uses (loop, stmt);
8231 if (!STMT_VINFO_RELEVANT_P (stmt_info)
8232 && !STMT_VINFO_LIVE_P (stmt_info))
8233 return;
8235 if (STMT_VINFO_VECTYPE (stmt_info))
8237 poly_uint64 nunits
8238 = TYPE_VECTOR_SUBPARTS (STMT_VINFO_VECTYPE (stmt_info));
8239 if (!STMT_SLP_TYPE (stmt_info)
8240 && maybe_ne (nunits, vf)
8241 && dump_enabled_p ())
8242 /* For SLP VF is set according to unrolling factor, and not
8243 to vector size, hence for SLP this print is not valid. */
8244 dump_printf_loc (MSG_NOTE, vect_location, "multiple-types.\n");
8247 /* SLP. Schedule all the SLP instances when the first SLP stmt is
8248 reached. */
8249 if (slp_vect_type slptype = STMT_SLP_TYPE (stmt_info))
8252 if (!*slp_scheduled)
8254 *slp_scheduled = true;
8256 DUMP_VECT_SCOPE ("scheduling SLP instances");
8258 vect_schedule_slp (loop_vinfo);
8261 /* Hybrid SLP stmts must be vectorized in addition to SLP. */
8262 if (slptype == pure_slp)
8263 return;
8266 if (dump_enabled_p ())
8267 dump_printf_loc (MSG_NOTE, vect_location, "transform statement.\n");
8269 bool grouped_store = false;
8270 if (vect_transform_stmt (stmt, gsi, &grouped_store, NULL, NULL))
8271 *seen_store = stmt_info;
8274 /* Function vect_transform_loop.
8276 The analysis phase has determined that the loop is vectorizable.
8277 Vectorize the loop - created vectorized stmts to replace the scalar
8278 stmts in the loop, and update the loop exit condition.
8279 Returns scalar epilogue loop if any. */
8281 struct loop *
8282 vect_transform_loop (loop_vec_info loop_vinfo)
8284 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
8285 struct loop *epilogue = NULL;
8286 basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
8287 int nbbs = loop->num_nodes;
8288 int i;
8289 tree niters_vector = NULL_TREE;
8290 tree step_vector = NULL_TREE;
8291 tree niters_vector_mult_vf = NULL_TREE;
8292 poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
8293 unsigned int lowest_vf = constant_lower_bound (vf);
8294 bool slp_scheduled = false;
8295 gimple *stmt;
8296 bool check_profitability = false;
8297 unsigned int th;
8299 DUMP_VECT_SCOPE ("vec_transform_loop");
8301 loop_vinfo->shared->check_datarefs ();
8303 /* Use the more conservative vectorization threshold. If the number
8304 of iterations is constant assume the cost check has been performed
8305 by our caller. If the threshold makes all loops profitable that
8306 run at least the (estimated) vectorization factor number of times
8307 checking is pointless, too. */
8308 th = LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo);
8309 if (th >= vect_vf_for_cost (loop_vinfo)
8310 && !LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
8312 if (dump_enabled_p ())
8313 dump_printf_loc (MSG_NOTE, vect_location,
8314 "Profitability threshold is %d loop iterations.\n",
8315 th);
8316 check_profitability = true;
8319 /* Make sure there exists a single-predecessor exit bb. Do this before
8320 versioning. */
8321 edge e = single_exit (loop);
8322 if (! single_pred_p (e->dest))
8324 split_loop_exit_edge (e);
8325 if (dump_enabled_p ())
8326 dump_printf (MSG_NOTE, "split exit edge\n");
8329 /* Version the loop first, if required, so the profitability check
8330 comes first. */
8332 if (LOOP_REQUIRES_VERSIONING (loop_vinfo))
8334 poly_uint64 versioning_threshold
8335 = LOOP_VINFO_VERSIONING_THRESHOLD (loop_vinfo);
8336 if (check_profitability
8337 && ordered_p (poly_uint64 (th), versioning_threshold))
8339 versioning_threshold = ordered_max (poly_uint64 (th),
8340 versioning_threshold);
8341 check_profitability = false;
8343 vect_loop_versioning (loop_vinfo, th, check_profitability,
8344 versioning_threshold);
8345 check_profitability = false;
8348 /* Make sure there exists a single-predecessor exit bb also on the
8349 scalar loop copy. Do this after versioning but before peeling
8350 so CFG structure is fine for both scalar and if-converted loop
8351 to make slpeel_duplicate_current_defs_from_edges face matched
8352 loop closed PHI nodes on the exit. */
8353 if (LOOP_VINFO_SCALAR_LOOP (loop_vinfo))
8355 e = single_exit (LOOP_VINFO_SCALAR_LOOP (loop_vinfo));
8356 if (! single_pred_p (e->dest))
8358 split_loop_exit_edge (e);
8359 if (dump_enabled_p ())
8360 dump_printf (MSG_NOTE, "split exit edge of scalar loop\n");
8364 tree niters = vect_build_loop_niters (loop_vinfo);
8365 LOOP_VINFO_NITERS_UNCHANGED (loop_vinfo) = niters;
8366 tree nitersm1 = unshare_expr (LOOP_VINFO_NITERSM1 (loop_vinfo));
8367 bool niters_no_overflow = loop_niters_no_overflow (loop_vinfo);
8368 epilogue = vect_do_peeling (loop_vinfo, niters, nitersm1, &niters_vector,
8369 &step_vector, &niters_vector_mult_vf, th,
8370 check_profitability, niters_no_overflow);
8372 if (niters_vector == NULL_TREE)
8374 if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
8375 && !LOOP_VINFO_FULLY_MASKED_P (loop_vinfo)
8376 && known_eq (lowest_vf, vf))
8378 niters_vector
8379 = build_int_cst (TREE_TYPE (LOOP_VINFO_NITERS (loop_vinfo)),
8380 LOOP_VINFO_INT_NITERS (loop_vinfo) / lowest_vf);
8381 step_vector = build_one_cst (TREE_TYPE (niters));
8383 else
8384 vect_gen_vector_loop_niters (loop_vinfo, niters, &niters_vector,
8385 &step_vector, niters_no_overflow);
8388 /* 1) Make sure the loop header has exactly two entries
8389 2) Make sure we have a preheader basic block. */
8391 gcc_assert (EDGE_COUNT (loop->header->preds) == 2);
8393 split_edge (loop_preheader_edge (loop));
8395 if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo)
8396 && vect_use_loop_mask_for_alignment_p (loop_vinfo))
8397 /* This will deal with any possible peeling. */
8398 vect_prepare_for_masked_peels (loop_vinfo);
8400 /* FORNOW: the vectorizer supports only loops which body consist
8401 of one basic block (header + empty latch). When the vectorizer will
8402 support more involved loop forms, the order by which the BBs are
8403 traversed need to be reconsidered. */
8405 for (i = 0; i < nbbs; i++)
8407 basic_block bb = bbs[i];
8408 stmt_vec_info stmt_info;
8410 for (gphi_iterator si = gsi_start_phis (bb); !gsi_end_p (si);
8411 gsi_next (&si))
8413 gphi *phi = si.phi ();
8414 if (dump_enabled_p ())
8416 dump_printf_loc (MSG_NOTE, vect_location,
8417 "------>vectorizing phi: ");
8418 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, phi, 0);
8420 stmt_info = loop_vinfo->lookup_stmt (phi);
8421 if (!stmt_info)
8422 continue;
8424 if (MAY_HAVE_DEBUG_BIND_STMTS && !STMT_VINFO_LIVE_P (stmt_info))
8425 vect_loop_kill_debug_uses (loop, phi);
8427 if (!STMT_VINFO_RELEVANT_P (stmt_info)
8428 && !STMT_VINFO_LIVE_P (stmt_info))
8429 continue;
8431 if (STMT_VINFO_VECTYPE (stmt_info)
8432 && (maybe_ne
8433 (TYPE_VECTOR_SUBPARTS (STMT_VINFO_VECTYPE (stmt_info)), vf))
8434 && dump_enabled_p ())
8435 dump_printf_loc (MSG_NOTE, vect_location, "multiple-types.\n");
8437 if ((STMT_VINFO_DEF_TYPE (stmt_info) == vect_induction_def
8438 || STMT_VINFO_DEF_TYPE (stmt_info) == vect_reduction_def
8439 || STMT_VINFO_DEF_TYPE (stmt_info) == vect_nested_cycle)
8440 && ! PURE_SLP_STMT (stmt_info))
8442 if (dump_enabled_p ())
8443 dump_printf_loc (MSG_NOTE, vect_location, "transform phi.\n");
8444 vect_transform_stmt (phi, NULL, NULL, NULL, NULL);
8448 for (gimple_stmt_iterator si = gsi_start_bb (bb);
8449 !gsi_end_p (si);)
8451 stmt = gsi_stmt (si);
8452 /* During vectorization remove existing clobber stmts. */
8453 if (gimple_clobber_p (stmt))
8455 unlink_stmt_vdef (stmt);
8456 gsi_remove (&si, true);
8457 release_defs (stmt);
8459 else
8461 stmt_info = loop_vinfo->lookup_stmt (stmt);
8463 /* vector stmts created in the outer-loop during vectorization of
8464 stmts in an inner-loop may not have a stmt_info, and do not
8465 need to be vectorized. */
8466 stmt_vec_info seen_store = NULL;
8467 if (stmt_info)
8469 if (STMT_VINFO_IN_PATTERN_P (stmt_info))
8471 gimple *def_seq = STMT_VINFO_PATTERN_DEF_SEQ (stmt_info);
8472 for (gimple_stmt_iterator subsi = gsi_start (def_seq);
8473 !gsi_end_p (subsi); gsi_next (&subsi))
8474 vect_transform_loop_stmt (loop_vinfo,
8475 gsi_stmt (subsi), &si,
8476 &seen_store,
8477 &slp_scheduled);
8478 gimple *pat_stmt = STMT_VINFO_RELATED_STMT (stmt_info);
8479 vect_transform_loop_stmt (loop_vinfo, pat_stmt, &si,
8480 &seen_store, &slp_scheduled);
8482 vect_transform_loop_stmt (loop_vinfo, stmt, &si,
8483 &seen_store, &slp_scheduled);
8485 if (seen_store)
8487 if (STMT_VINFO_GROUPED_ACCESS (seen_store))
8489 /* Interleaving. If IS_STORE is TRUE, the
8490 vectorization of the interleaving chain was
8491 completed - free all the stores in the chain. */
8492 gsi_next (&si);
8493 vect_remove_stores (DR_GROUP_FIRST_ELEMENT (seen_store));
8495 else
8497 /* Free the attached stmt_vec_info and remove the
8498 stmt. */
8499 free_stmt_vec_info (stmt);
8500 unlink_stmt_vdef (stmt);
8501 gsi_remove (&si, true);
8502 release_defs (stmt);
8505 else
8506 gsi_next (&si);
8510 /* Stub out scalar statements that must not survive vectorization.
8511 Doing this here helps with grouped statements, or statements that
8512 are involved in patterns. */
8513 for (gimple_stmt_iterator gsi = gsi_start_bb (bb);
8514 !gsi_end_p (gsi); gsi_next (&gsi))
8516 gcall *call = dyn_cast <gcall *> (gsi_stmt (gsi));
8517 if (call && gimple_call_internal_p (call, IFN_MASK_LOAD))
8519 tree lhs = gimple_get_lhs (call);
8520 if (!VECTOR_TYPE_P (TREE_TYPE (lhs)))
8522 tree zero = build_zero_cst (TREE_TYPE (lhs));
8523 gimple *new_stmt = gimple_build_assign (lhs, zero);
8524 gsi_replace (&gsi, new_stmt, true);
8528 } /* BBs in loop */
8530 /* The vectorization factor is always > 1, so if we use an IV increment of 1.
8531 a zero NITERS becomes a nonzero NITERS_VECTOR. */
8532 if (integer_onep (step_vector))
8533 niters_no_overflow = true;
8534 vect_set_loop_condition (loop, loop_vinfo, niters_vector, step_vector,
8535 niters_vector_mult_vf, !niters_no_overflow);
8537 unsigned int assumed_vf = vect_vf_for_cost (loop_vinfo);
8538 scale_profile_for_vect_loop (loop, assumed_vf);
8540 /* True if the final iteration might not handle a full vector's
8541 worth of scalar iterations. */
8542 bool final_iter_may_be_partial = LOOP_VINFO_FULLY_MASKED_P (loop_vinfo);
8543 /* The minimum number of iterations performed by the epilogue. This
8544 is 1 when peeling for gaps because we always need a final scalar
8545 iteration. */
8546 int min_epilogue_iters = LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo) ? 1 : 0;
8547 /* +1 to convert latch counts to loop iteration counts,
8548 -min_epilogue_iters to remove iterations that cannot be performed
8549 by the vector code. */
8550 int bias_for_lowest = 1 - min_epilogue_iters;
8551 int bias_for_assumed = bias_for_lowest;
8552 int alignment_npeels = LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo);
8553 if (alignment_npeels && LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
8555 /* When the amount of peeling is known at compile time, the first
8556 iteration will have exactly alignment_npeels active elements.
8557 In the worst case it will have at least one. */
8558 int min_first_active = (alignment_npeels > 0 ? alignment_npeels : 1);
8559 bias_for_lowest += lowest_vf - min_first_active;
8560 bias_for_assumed += assumed_vf - min_first_active;
8562 /* In these calculations the "- 1" converts loop iteration counts
8563 back to latch counts. */
8564 if (loop->any_upper_bound)
8565 loop->nb_iterations_upper_bound
8566 = (final_iter_may_be_partial
8567 ? wi::udiv_ceil (loop->nb_iterations_upper_bound + bias_for_lowest,
8568 lowest_vf) - 1
8569 : wi::udiv_floor (loop->nb_iterations_upper_bound + bias_for_lowest,
8570 lowest_vf) - 1);
8571 if (loop->any_likely_upper_bound)
8572 loop->nb_iterations_likely_upper_bound
8573 = (final_iter_may_be_partial
8574 ? wi::udiv_ceil (loop->nb_iterations_likely_upper_bound
8575 + bias_for_lowest, lowest_vf) - 1
8576 : wi::udiv_floor (loop->nb_iterations_likely_upper_bound
8577 + bias_for_lowest, lowest_vf) - 1);
8578 if (loop->any_estimate)
8579 loop->nb_iterations_estimate
8580 = (final_iter_may_be_partial
8581 ? wi::udiv_ceil (loop->nb_iterations_estimate + bias_for_assumed,
8582 assumed_vf) - 1
8583 : wi::udiv_floor (loop->nb_iterations_estimate + bias_for_assumed,
8584 assumed_vf) - 1);
8586 if (dump_enabled_p ())
8588 if (!LOOP_VINFO_EPILOGUE_P (loop_vinfo))
8590 dump_printf_loc (MSG_NOTE, vect_location,
8591 "LOOP VECTORIZED\n");
8592 if (loop->inner)
8593 dump_printf_loc (MSG_NOTE, vect_location,
8594 "OUTER LOOP VECTORIZED\n");
8595 dump_printf (MSG_NOTE, "\n");
8597 else
8599 dump_printf_loc (MSG_NOTE, vect_location,
8600 "LOOP EPILOGUE VECTORIZED (VS=");
8601 dump_dec (MSG_NOTE, current_vector_size);
8602 dump_printf (MSG_NOTE, ")\n");
8606 /* Free SLP instances here because otherwise stmt reference counting
8607 won't work. */
8608 slp_instance instance;
8609 FOR_EACH_VEC_ELT (LOOP_VINFO_SLP_INSTANCES (loop_vinfo), i, instance)
8610 vect_free_slp_instance (instance, true);
8611 LOOP_VINFO_SLP_INSTANCES (loop_vinfo).release ();
8612 /* Clear-up safelen field since its value is invalid after vectorization
8613 since vectorized loop can have loop-carried dependencies. */
8614 loop->safelen = 0;
8616 /* Don't vectorize epilogue for epilogue. */
8617 if (LOOP_VINFO_EPILOGUE_P (loop_vinfo))
8618 epilogue = NULL;
8620 if (!PARAM_VALUE (PARAM_VECT_EPILOGUES_NOMASK))
8621 epilogue = NULL;
8623 if (epilogue)
8625 auto_vector_sizes vector_sizes;
8626 targetm.vectorize.autovectorize_vector_sizes (&vector_sizes);
8627 unsigned int next_size = 0;
8629 if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
8630 && LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) >= 0
8631 && known_eq (vf, lowest_vf))
8633 unsigned int eiters
8634 = (LOOP_VINFO_INT_NITERS (loop_vinfo)
8635 - LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo));
8636 eiters = eiters % lowest_vf;
8637 epilogue->nb_iterations_upper_bound = eiters - 1;
8639 unsigned int ratio;
8640 while (next_size < vector_sizes.length ()
8641 && !(constant_multiple_p (current_vector_size,
8642 vector_sizes[next_size], &ratio)
8643 && eiters >= lowest_vf / ratio))
8644 next_size += 1;
8646 else
8647 while (next_size < vector_sizes.length ()
8648 && maybe_lt (current_vector_size, vector_sizes[next_size]))
8649 next_size += 1;
8651 if (next_size == vector_sizes.length ())
8652 epilogue = NULL;
8655 if (epilogue)
8657 epilogue->force_vectorize = loop->force_vectorize;
8658 epilogue->safelen = loop->safelen;
8659 epilogue->dont_vectorize = false;
8661 /* We may need to if-convert epilogue to vectorize it. */
8662 if (LOOP_VINFO_SCALAR_LOOP (loop_vinfo))
8663 tree_if_conversion (epilogue);
8666 return epilogue;
8669 /* The code below is trying to perform simple optimization - revert
8670 if-conversion for masked stores, i.e. if the mask of a store is zero
8671 do not perform it and all stored value producers also if possible.
8672 For example,
8673 for (i=0; i<n; i++)
8674 if (c[i])
8676 p1[i] += 1;
8677 p2[i] = p3[i] +2;
8679 this transformation will produce the following semi-hammock:
8681 if (!mask__ifc__42.18_165 == { 0, 0, 0, 0, 0, 0, 0, 0 })
8683 vect__11.19_170 = MASK_LOAD (vectp_p1.20_168, 0B, mask__ifc__42.18_165);
8684 vect__12.22_172 = vect__11.19_170 + vect_cst__171;
8685 MASK_STORE (vectp_p1.23_175, 0B, mask__ifc__42.18_165, vect__12.22_172);
8686 vect__18.25_182 = MASK_LOAD (vectp_p3.26_180, 0B, mask__ifc__42.18_165);
8687 vect__19.28_184 = vect__18.25_182 + vect_cst__183;
8688 MASK_STORE (vectp_p2.29_187, 0B, mask__ifc__42.18_165, vect__19.28_184);
8692 void
8693 optimize_mask_stores (struct loop *loop)
8695 basic_block *bbs = get_loop_body (loop);
8696 unsigned nbbs = loop->num_nodes;
8697 unsigned i;
8698 basic_block bb;
8699 struct loop *bb_loop;
8700 gimple_stmt_iterator gsi;
8701 gimple *stmt;
8702 auto_vec<gimple *> worklist;
8704 vect_location = find_loop_location (loop);
8705 /* Pick up all masked stores in loop if any. */
8706 for (i = 0; i < nbbs; i++)
8708 bb = bbs[i];
8709 for (gsi = gsi_start_bb (bb); !gsi_end_p (gsi);
8710 gsi_next (&gsi))
8712 stmt = gsi_stmt (gsi);
8713 if (gimple_call_internal_p (stmt, IFN_MASK_STORE))
8714 worklist.safe_push (stmt);
8718 free (bbs);
8719 if (worklist.is_empty ())
8720 return;
8722 /* Loop has masked stores. */
8723 while (!worklist.is_empty ())
8725 gimple *last, *last_store;
8726 edge e, efalse;
8727 tree mask;
8728 basic_block store_bb, join_bb;
8729 gimple_stmt_iterator gsi_to;
8730 tree vdef, new_vdef;
8731 gphi *phi;
8732 tree vectype;
8733 tree zero;
8735 last = worklist.pop ();
8736 mask = gimple_call_arg (last, 2);
8737 bb = gimple_bb (last);
8738 /* Create then_bb and if-then structure in CFG, then_bb belongs to
8739 the same loop as if_bb. It could be different to LOOP when two
8740 level loop-nest is vectorized and mask_store belongs to the inner
8741 one. */
8742 e = split_block (bb, last);
8743 bb_loop = bb->loop_father;
8744 gcc_assert (loop == bb_loop || flow_loop_nested_p (loop, bb_loop));
8745 join_bb = e->dest;
8746 store_bb = create_empty_bb (bb);
8747 add_bb_to_loop (store_bb, bb_loop);
8748 e->flags = EDGE_TRUE_VALUE;
8749 efalse = make_edge (bb, store_bb, EDGE_FALSE_VALUE);
8750 /* Put STORE_BB to likely part. */
8751 efalse->probability = profile_probability::unlikely ();
8752 store_bb->count = efalse->count ();
8753 make_single_succ_edge (store_bb, join_bb, EDGE_FALLTHRU);
8754 if (dom_info_available_p (CDI_DOMINATORS))
8755 set_immediate_dominator (CDI_DOMINATORS, store_bb, bb);
8756 if (dump_enabled_p ())
8757 dump_printf_loc (MSG_NOTE, vect_location,
8758 "Create new block %d to sink mask stores.",
8759 store_bb->index);
8760 /* Create vector comparison with boolean result. */
8761 vectype = TREE_TYPE (mask);
8762 zero = build_zero_cst (vectype);
8763 stmt = gimple_build_cond (EQ_EXPR, mask, zero, NULL_TREE, NULL_TREE);
8764 gsi = gsi_last_bb (bb);
8765 gsi_insert_after (&gsi, stmt, GSI_SAME_STMT);
8766 /* Create new PHI node for vdef of the last masked store:
8767 .MEM_2 = VDEF <.MEM_1>
8768 will be converted to
8769 .MEM.3 = VDEF <.MEM_1>
8770 and new PHI node will be created in join bb
8771 .MEM_2 = PHI <.MEM_1, .MEM_3>
8773 vdef = gimple_vdef (last);
8774 new_vdef = make_ssa_name (gimple_vop (cfun), last);
8775 gimple_set_vdef (last, new_vdef);
8776 phi = create_phi_node (vdef, join_bb);
8777 add_phi_arg (phi, new_vdef, EDGE_SUCC (store_bb, 0), UNKNOWN_LOCATION);
8779 /* Put all masked stores with the same mask to STORE_BB if possible. */
8780 while (true)
8782 gimple_stmt_iterator gsi_from;
8783 gimple *stmt1 = NULL;
8785 /* Move masked store to STORE_BB. */
8786 last_store = last;
8787 gsi = gsi_for_stmt (last);
8788 gsi_from = gsi;
8789 /* Shift GSI to the previous stmt for further traversal. */
8790 gsi_prev (&gsi);
8791 gsi_to = gsi_start_bb (store_bb);
8792 gsi_move_before (&gsi_from, &gsi_to);
8793 /* Setup GSI_TO to the non-empty block start. */
8794 gsi_to = gsi_start_bb (store_bb);
8795 if (dump_enabled_p ())
8797 dump_printf_loc (MSG_NOTE, vect_location,
8798 "Move stmt to created bb\n");
8799 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, last, 0);
8801 /* Move all stored value producers if possible. */
8802 while (!gsi_end_p (gsi))
8804 tree lhs;
8805 imm_use_iterator imm_iter;
8806 use_operand_p use_p;
8807 bool res;
8809 /* Skip debug statements. */
8810 if (is_gimple_debug (gsi_stmt (gsi)))
8812 gsi_prev (&gsi);
8813 continue;
8815 stmt1 = gsi_stmt (gsi);
8816 /* Do not consider statements writing to memory or having
8817 volatile operand. */
8818 if (gimple_vdef (stmt1)
8819 || gimple_has_volatile_ops (stmt1))
8820 break;
8821 gsi_from = gsi;
8822 gsi_prev (&gsi);
8823 lhs = gimple_get_lhs (stmt1);
8824 if (!lhs)
8825 break;
8827 /* LHS of vectorized stmt must be SSA_NAME. */
8828 if (TREE_CODE (lhs) != SSA_NAME)
8829 break;
8831 if (!VECTOR_TYPE_P (TREE_TYPE (lhs)))
8833 /* Remove dead scalar statement. */
8834 if (has_zero_uses (lhs))
8836 gsi_remove (&gsi_from, true);
8837 continue;
8841 /* Check that LHS does not have uses outside of STORE_BB. */
8842 res = true;
8843 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, lhs)
8845 gimple *use_stmt;
8846 use_stmt = USE_STMT (use_p);
8847 if (is_gimple_debug (use_stmt))
8848 continue;
8849 if (gimple_bb (use_stmt) != store_bb)
8851 res = false;
8852 break;
8855 if (!res)
8856 break;
8858 if (gimple_vuse (stmt1)
8859 && gimple_vuse (stmt1) != gimple_vuse (last_store))
8860 break;
8862 /* Can move STMT1 to STORE_BB. */
8863 if (dump_enabled_p ())
8865 dump_printf_loc (MSG_NOTE, vect_location,
8866 "Move stmt to created bb\n");
8867 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, stmt1, 0);
8869 gsi_move_before (&gsi_from, &gsi_to);
8870 /* Shift GSI_TO for further insertion. */
8871 gsi_prev (&gsi_to);
8873 /* Put other masked stores with the same mask to STORE_BB. */
8874 if (worklist.is_empty ()
8875 || gimple_call_arg (worklist.last (), 2) != mask
8876 || worklist.last () != stmt1)
8877 break;
8878 last = worklist.pop ();
8880 add_phi_arg (phi, gimple_vuse (last_store), e, UNKNOWN_LOCATION);