[34/46] Alter interface to vect_get_vec_def_for_stmt_copy
[official-gcc.git] / gcc / tree-vect-loop.c
blob 553916a837745ca3951c277313f3db9abe36791c
1 /* Loop Vectorization
2 Copyright (C) 2003-2018 Free Software Foundation, Inc.
3 Contributed by Dorit Naishlos <dorit@il.ibm.com> and
4 Ira Rosen <irar@il.ibm.com>
6 This file is part of GCC.
8 GCC is free software; you can redistribute it and/or modify it under
9 the terms of the GNU General Public License as published by the Free
10 Software Foundation; either version 3, or (at your option) any later
11 version.
13 GCC is distributed in the hope that it will be useful, but WITHOUT ANY
14 WARRANTY; without even the implied warranty of MERCHANTABILITY or
15 FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
16 for more details.
18 You should have received a copy of the GNU General Public License
19 along with GCC; see the file COPYING3. If not see
20 <http://www.gnu.org/licenses/>. */
22 #include "config.h"
23 #include "system.h"
24 #include "coretypes.h"
25 #include "backend.h"
26 #include "target.h"
27 #include "rtl.h"
28 #include "tree.h"
29 #include "gimple.h"
30 #include "cfghooks.h"
31 #include "tree-pass.h"
32 #include "ssa.h"
33 #include "optabs-tree.h"
34 #include "diagnostic-core.h"
35 #include "fold-const.h"
36 #include "stor-layout.h"
37 #include "cfganal.h"
38 #include "gimplify.h"
39 #include "gimple-iterator.h"
40 #include "gimplify-me.h"
41 #include "tree-ssa-loop-ivopts.h"
42 #include "tree-ssa-loop-manip.h"
43 #include "tree-ssa-loop-niter.h"
44 #include "tree-ssa-loop.h"
45 #include "cfgloop.h"
46 #include "params.h"
47 #include "tree-scalar-evolution.h"
48 #include "tree-vectorizer.h"
49 #include "gimple-fold.h"
50 #include "cgraph.h"
51 #include "tree-cfg.h"
52 #include "tree-if-conv.h"
53 #include "internal-fn.h"
54 #include "tree-vector-builder.h"
55 #include "vec-perm-indices.h"
56 #include "tree-eh.h"
58 /* Loop Vectorization Pass.
60 This pass tries to vectorize loops.
62 For example, the vectorizer transforms the following simple loop:
64 short a[N]; short b[N]; short c[N]; int i;
66 for (i=0; i<N; i++){
67 a[i] = b[i] + c[i];
70 as if it was manually vectorized by rewriting the source code into:
72 typedef int __attribute__((mode(V8HI))) v8hi;
73 short a[N]; short b[N]; short c[N]; int i;
74 v8hi *pa = (v8hi*)a, *pb = (v8hi*)b, *pc = (v8hi*)c;
75 v8hi va, vb, vc;
77 for (i=0; i<N/8; i++){
78 vb = pb[i];
79 vc = pc[i];
80 va = vb + vc;
81 pa[i] = va;
84 The main entry to this pass is vectorize_loops(), in which
85 the vectorizer applies a set of analyses on a given set of loops,
86 followed by the actual vectorization transformation for the loops that
87 had successfully passed the analysis phase.
88 Throughout this pass we make a distinction between two types of
89 data: scalars (which are represented by SSA_NAMES), and memory references
90 ("data-refs"). These two types of data require different handling both
91 during analysis and transformation. The types of data-refs that the
92 vectorizer currently supports are ARRAY_REFS whose base is an array DECL
93 (not a pointer), and INDIRECT_REFS through pointers; both array and pointer
94 accesses are required to have a simple (consecutive) access pattern.
96 Analysis phase:
97 ===============
98 The driver for the analysis phase is vect_analyze_loop().
99 It applies a set of analyses, some of which rely on the scalar evolution
100 analyzer (scev) developed by Sebastian Pop.
102 During the analysis phase the vectorizer records some information
103 per stmt in a "stmt_vec_info" struct which is attached to each stmt in the
104 loop, as well as general information about the loop as a whole, which is
105 recorded in a "loop_vec_info" struct attached to each loop.
107 Transformation phase:
108 =====================
109 The loop transformation phase scans all the stmts in the loop, and
110 creates a vector stmt (or a sequence of stmts) for each scalar stmt S in
111 the loop that needs to be vectorized. It inserts the vector code sequence
112 just before the scalar stmt S, and records a pointer to the vector code
113 in STMT_VINFO_VEC_STMT (stmt_info) (stmt_info is the stmt_vec_info struct
114 attached to S). This pointer will be used for the vectorization of following
115 stmts which use the def of stmt S. Stmt S is removed if it writes to memory;
116 otherwise, we rely on dead code elimination for removing it.
118 For example, say stmt S1 was vectorized into stmt VS1:
120 VS1: vb = px[i];
121 S1: b = x[i]; STMT_VINFO_VEC_STMT (stmt_info (S1)) = VS1
122 S2: a = b;
124 To vectorize stmt S2, the vectorizer first finds the stmt that defines
125 the operand 'b' (S1), and gets the relevant vector def 'vb' from the
126 vector stmt VS1 pointed to by STMT_VINFO_VEC_STMT (stmt_info (S1)). The
127 resulting sequence would be:
129 VS1: vb = px[i];
130 S1: b = x[i]; STMT_VINFO_VEC_STMT (stmt_info (S1)) = VS1
131 VS2: va = vb;
132 S2: a = b; STMT_VINFO_VEC_STMT (stmt_info (S2)) = VS2
134 Operands that are not SSA_NAMEs are data-refs that appear in
135 load/store operations (like 'x[i]' in S1), and are handled differently.
137 Target modeling:
138 =================
139 Currently the only target specific information that is used is the
140 size of the vector (in bytes) - "TARGET_VECTORIZE_UNITS_PER_SIMD_WORD".
141 Targets that can support different sizes of vectors, for now will need
142 to specify one value for "TARGET_VECTORIZE_UNITS_PER_SIMD_WORD". More
143 flexibility will be added in the future.
145 Since we only vectorize operations whose vector form can be
146 expressed using existing tree codes, to verify that an operation is
147 supported, the vectorizer checks the relevant optab at the relevant
148 machine_mode (e.g., optab_handler (add_optab, V8HImode)). If
149 the value found is CODE_FOR_nothing, then there's no target support, and
150 we can't vectorize the stmt.
152 For additional information on this project see:
153 http://gcc.gnu.org/projects/tree-ssa/vectorization.html
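   Illustrative sketch (added here; not part of the original comment): the
   optab query described above amounts to a check of the form

     if (optab_handler (add_optab, V8HImode) == CODE_FOR_nothing)
       return false;

   where add_optab and V8HImode stand for whatever operation and vector
   mode the scalar statement requires.  */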
156 static void vect_estimate_min_profitable_iters (loop_vec_info, int *, int *);
158 /* Subroutine of vect_determine_vf_for_stmt that handles only one
159 statement. VECTYPE_MAYBE_SET_P is true if STMT_VINFO_VECTYPE
160 may already be set for general statements (not just data refs). */
162 static bool
163 vect_determine_vf_for_stmt_1 (stmt_vec_info stmt_info,
164 bool vectype_maybe_set_p,
165 poly_uint64 *vf,
166 vec<stmt_vec_info > *mask_producers)
168 gimple *stmt = stmt_info->stmt;
170 if ((!STMT_VINFO_RELEVANT_P (stmt_info)
171 && !STMT_VINFO_LIVE_P (stmt_info))
172 || gimple_clobber_p (stmt))
174 if (dump_enabled_p ())
175 dump_printf_loc (MSG_NOTE, vect_location, "skip.\n");
176 return true;
179 tree stmt_vectype, nunits_vectype;
180 if (!vect_get_vector_types_for_stmt (stmt_info, &stmt_vectype,
181 &nunits_vectype))
182 return false;
184 if (stmt_vectype)
186 if (STMT_VINFO_VECTYPE (stmt_info))
187 /* The only case when a vectype had been already set is for stmts
188 that contain a data ref, or for "pattern-stmts" (stmts generated
189 by the vectorizer to represent/replace a certain idiom). */
190 gcc_assert ((STMT_VINFO_DATA_REF (stmt_info)
191 || vectype_maybe_set_p)
192 && STMT_VINFO_VECTYPE (stmt_info) == stmt_vectype);
193 else if (stmt_vectype == boolean_type_node)
194 mask_producers->safe_push (stmt_info);
195 else
196 STMT_VINFO_VECTYPE (stmt_info) = stmt_vectype;
199 if (nunits_vectype)
200 vect_update_max_nunits (vf, nunits_vectype);
202 return true;
205 /* Subroutine of vect_determine_vectorization_factor. Set the vector
206 types of STMT_INFO and all attached pattern statements and update
207 the vectorization factor VF accordingly. If some of the statements
208 produce a mask result whose vector type can only be calculated later,
209 add them to MASK_PRODUCERS. Return true on success or false if
210 something prevented vectorization. */
212 static bool
213 vect_determine_vf_for_stmt (stmt_vec_info stmt_info, poly_uint64 *vf,
214 vec<stmt_vec_info > *mask_producers)
216 vec_info *vinfo = stmt_info->vinfo;
217 if (dump_enabled_p ())
219 dump_printf_loc (MSG_NOTE, vect_location, "==> examining statement: ");
220 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, stmt_info->stmt, 0);
222 if (!vect_determine_vf_for_stmt_1 (stmt_info, false, vf, mask_producers))
223 return false;
225 if (STMT_VINFO_IN_PATTERN_P (stmt_info)
226 && STMT_VINFO_RELATED_STMT (stmt_info))
228 gimple *pattern_def_seq = STMT_VINFO_PATTERN_DEF_SEQ (stmt_info);
229 stmt_info = STMT_VINFO_RELATED_STMT (stmt_info);
231 /* If a pattern statement has def stmts, analyze them too. */
232 for (gimple_stmt_iterator si = gsi_start (pattern_def_seq);
233 !gsi_end_p (si); gsi_next (&si))
235 stmt_vec_info def_stmt_info = vinfo->lookup_stmt (gsi_stmt (si));
236 if (dump_enabled_p ())
238 dump_printf_loc (MSG_NOTE, vect_location,
239 "==> examining pattern def stmt: ");
240 dump_gimple_stmt (MSG_NOTE, TDF_SLIM,
241 def_stmt_info->stmt, 0);
243 if (!vect_determine_vf_for_stmt_1 (def_stmt_info, true,
244 vf, mask_producers))
245 return false;
248 if (dump_enabled_p ())
250 dump_printf_loc (MSG_NOTE, vect_location,
251 "==> examining pattern statement: ");
252 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, stmt_info->stmt, 0);
254 if (!vect_determine_vf_for_stmt_1 (stmt_info, true, vf, mask_producers))
255 return false;
258 return true;
261 /* Function vect_determine_vectorization_factor
263 Determine the vectorization factor (VF). VF is the number of data elements
264 that are operated upon in parallel in a single iteration of the vectorized
265 loop. For example, when vectorizing a loop that operates on 4byte elements,
266 on a target with vector size (VS) 16byte, the VF is set to 4, since 4
267 elements can fit in a single vector register.
269 We currently support vectorization of loops in which all types operated upon
270 are of the same size. Therefore this function currently sets VF according to
271 the size of the types operated upon, and fails if there are multiple sizes
272 in the loop.
274 VF is also the factor by which the loop iterations are strip-mined, e.g.:
275 original loop:
276 for (i=0; i<N; i++){
277 a[i] = b[i] + c[i];
280 vectorized loop:
281 for (i=0; i<N; i+=VF){
282 a[i:VF] = b[i:VF] + c[i:VF];
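   Worked example (added for illustration; not in the original comment):
   with a 16-byte vector size, 4-byte ints give TYPE_VECTOR_SUBPARTS = 4
   and therefore VF = 4, while 2-byte shorts give VF = 8.  A loop that
   mixes both element sizes is the "multiple sizes" case described above.  */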
286 static bool
287 vect_determine_vectorization_factor (loop_vec_info loop_vinfo)
289 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
290 basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
291 unsigned nbbs = loop->num_nodes;
292 poly_uint64 vectorization_factor = 1;
293 tree scalar_type = NULL_TREE;
294 gphi *phi;
295 tree vectype;
296 stmt_vec_info stmt_info;
297 unsigned i;
298 auto_vec<stmt_vec_info> mask_producers;
300 DUMP_VECT_SCOPE ("vect_determine_vectorization_factor");
302 for (i = 0; i < nbbs; i++)
304 basic_block bb = bbs[i];
306 for (gphi_iterator si = gsi_start_phis (bb); !gsi_end_p (si);
307 gsi_next (&si))
309 phi = si.phi ();
310 stmt_info = loop_vinfo->lookup_stmt (phi);
311 if (dump_enabled_p ())
313 dump_printf_loc (MSG_NOTE, vect_location, "==> examining phi: ");
314 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, phi, 0);
317 gcc_assert (stmt_info);
319 if (STMT_VINFO_RELEVANT_P (stmt_info)
320 || STMT_VINFO_LIVE_P (stmt_info))
322 gcc_assert (!STMT_VINFO_VECTYPE (stmt_info));
323 scalar_type = TREE_TYPE (PHI_RESULT (phi));
325 if (dump_enabled_p ())
327 dump_printf_loc (MSG_NOTE, vect_location,
328 "get vectype for scalar type: ");
329 dump_generic_expr (MSG_NOTE, TDF_SLIM, scalar_type);
330 dump_printf (MSG_NOTE, "\n");
333 vectype = get_vectype_for_scalar_type (scalar_type);
334 if (!vectype)
336 if (dump_enabled_p ())
338 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
339 "not vectorized: unsupported "
340 "data-type ");
341 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
342 scalar_type);
343 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
345 return false;
347 STMT_VINFO_VECTYPE (stmt_info) = vectype;
349 if (dump_enabled_p ())
351 dump_printf_loc (MSG_NOTE, vect_location, "vectype: ");
352 dump_generic_expr (MSG_NOTE, TDF_SLIM, vectype);
353 dump_printf (MSG_NOTE, "\n");
356 if (dump_enabled_p ())
358 dump_printf_loc (MSG_NOTE, vect_location, "nunits = ");
359 dump_dec (MSG_NOTE, TYPE_VECTOR_SUBPARTS (vectype));
360 dump_printf (MSG_NOTE, "\n");
363 vect_update_max_nunits (&vectorization_factor, vectype);
367 for (gimple_stmt_iterator si = gsi_start_bb (bb); !gsi_end_p (si);
368 gsi_next (&si))
370 stmt_info = loop_vinfo->lookup_stmt (gsi_stmt (si));
371 if (!vect_determine_vf_for_stmt (stmt_info, &vectorization_factor,
372 &mask_producers))
373 return false;
377 /* TODO: Analyze cost. Decide if worth while to vectorize. */
378 if (dump_enabled_p ())
380 dump_printf_loc (MSG_NOTE, vect_location, "vectorization factor = ");
381 dump_dec (MSG_NOTE, vectorization_factor);
382 dump_printf (MSG_NOTE, "\n");
385 if (known_le (vectorization_factor, 1U))
387 if (dump_enabled_p ())
388 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
389 "not vectorized: unsupported data-type\n");
390 return false;
392 LOOP_VINFO_VECT_FACTOR (loop_vinfo) = vectorization_factor;
394 for (i = 0; i < mask_producers.length (); i++)
396 stmt_info = mask_producers[i];
397 tree mask_type = vect_get_mask_type_for_stmt (stmt_info);
398 if (!mask_type)
399 return false;
400 STMT_VINFO_VECTYPE (stmt_info) = mask_type;
403 return true;
407 /* Function vect_is_simple_iv_evolution.
409 FORNOW: A simple evolution of an induction variable in the loop is
410 considered a polynomial evolution. */
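/* For instance (an illustrative note, not part of the original comment):
   an induction such as

     i_next = i + 4;

   has the scalar-evolution access function {i_init, +, 4}_loop, so *INIT
   becomes i_init and *STEP becomes 4.  An evolution whose step itself
   varies within the loop is a chrec of degree >= 2 and is rejected.  */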
412 static bool
413 vect_is_simple_iv_evolution (unsigned loop_nb, tree access_fn, tree * init,
414 tree * step)
416 tree init_expr;
417 tree step_expr;
418 tree evolution_part = evolution_part_in_loop_num (access_fn, loop_nb);
419 basic_block bb;
421 /* When there is no evolution in this loop, the evolution function
422 is not "simple". */
423 if (evolution_part == NULL_TREE)
424 return false;
426 /* When the evolution is a polynomial of degree >= 2
427 the evolution function is not "simple". */
428 if (tree_is_chrec (evolution_part))
429 return false;
431 step_expr = evolution_part;
432 init_expr = unshare_expr (initial_condition_in_loop_num (access_fn, loop_nb));
434 if (dump_enabled_p ())
436 dump_printf_loc (MSG_NOTE, vect_location, "step: ");
437 dump_generic_expr (MSG_NOTE, TDF_SLIM, step_expr);
438 dump_printf (MSG_NOTE, ", init: ");
439 dump_generic_expr (MSG_NOTE, TDF_SLIM, init_expr);
440 dump_printf (MSG_NOTE, "\n");
443 *init = init_expr;
444 *step = step_expr;
446 if (TREE_CODE (step_expr) != INTEGER_CST
447 && (TREE_CODE (step_expr) != SSA_NAME
448 || ((bb = gimple_bb (SSA_NAME_DEF_STMT (step_expr)))
449 && flow_bb_inside_loop_p (get_loop (cfun, loop_nb), bb))
450 || (!INTEGRAL_TYPE_P (TREE_TYPE (step_expr))
451 && (!SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr))
452 || !flag_associative_math)))
453 && (TREE_CODE (step_expr) != REAL_CST
454 || !flag_associative_math))
456 if (dump_enabled_p ())
457 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
458 "step unknown.\n");
459 return false;
462 return true;
465 /* Function vect_analyze_scalar_cycles_1.
467 Examine the cross iteration def-use cycles of scalar variables
468 in LOOP. LOOP_VINFO represents the loop that is now being
469 considered for vectorization (can be LOOP, or an outer-loop
470 enclosing LOOP). */
472 static void
473 vect_analyze_scalar_cycles_1 (loop_vec_info loop_vinfo, struct loop *loop)
475 basic_block bb = loop->header;
476 tree init, step;
477 auto_vec<stmt_vec_info, 64> worklist;
478 gphi_iterator gsi;
479 bool double_reduc;
481 DUMP_VECT_SCOPE ("vect_analyze_scalar_cycles");
483 /* First - identify all inductions. Reduction detection assumes that all the
484 inductions have been identified, therefore, this order must not be
485 changed. */
486 for (gsi = gsi_start_phis (bb); !gsi_end_p (gsi); gsi_next (&gsi))
488 gphi *phi = gsi.phi ();
489 tree access_fn = NULL;
490 tree def = PHI_RESULT (phi);
491 stmt_vec_info stmt_vinfo = loop_vinfo->lookup_stmt (phi);
493 if (dump_enabled_p ())
495 dump_printf_loc (MSG_NOTE, vect_location, "Analyze phi: ");
496 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, phi, 0);
499 /* Skip virtual phi's. The data dependences that are associated with
500 virtual defs/uses (i.e., memory accesses) are analyzed elsewhere. */
501 if (virtual_operand_p (def))
502 continue;
504 STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_unknown_def_type;
506 /* Analyze the evolution function. */
507 access_fn = analyze_scalar_evolution (loop, def);
508 if (access_fn)
510 STRIP_NOPS (access_fn);
511 if (dump_enabled_p ())
513 dump_printf_loc (MSG_NOTE, vect_location,
514 "Access function of PHI: ");
515 dump_generic_expr (MSG_NOTE, TDF_SLIM, access_fn);
516 dump_printf (MSG_NOTE, "\n");
518 STMT_VINFO_LOOP_PHI_EVOLUTION_BASE_UNCHANGED (stmt_vinfo)
519 = initial_condition_in_loop_num (access_fn, loop->num);
520 STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_vinfo)
521 = evolution_part_in_loop_num (access_fn, loop->num);
524 if (!access_fn
525 || !vect_is_simple_iv_evolution (loop->num, access_fn, &init, &step)
526 || (LOOP_VINFO_LOOP (loop_vinfo) != loop
527 && TREE_CODE (step) != INTEGER_CST))
529 worklist.safe_push (stmt_vinfo);
530 continue;
533 gcc_assert (STMT_VINFO_LOOP_PHI_EVOLUTION_BASE_UNCHANGED (stmt_vinfo)
534 != NULL_TREE);
535 gcc_assert (STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_vinfo) != NULL_TREE);
537 if (dump_enabled_p ())
538 dump_printf_loc (MSG_NOTE, vect_location, "Detected induction.\n");
539 STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_induction_def;
543 /* Second - identify all reductions and nested cycles. */
544 while (worklist.length () > 0)
546 stmt_vec_info stmt_vinfo = worklist.pop ();
547 gphi *phi = as_a <gphi *> (stmt_vinfo->stmt);
548 tree def = PHI_RESULT (phi);
550 if (dump_enabled_p ())
552 dump_printf_loc (MSG_NOTE, vect_location, "Analyze phi: ");
553 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, phi, 0);
556 gcc_assert (!virtual_operand_p (def)
557 && STMT_VINFO_DEF_TYPE (stmt_vinfo) == vect_unknown_def_type);
559 stmt_vec_info reduc_stmt_info
560 = vect_force_simple_reduction (loop_vinfo, stmt_vinfo,
561 &double_reduc, false);
562 if (reduc_stmt_info)
564 if (double_reduc)
566 if (dump_enabled_p ())
567 dump_printf_loc (MSG_NOTE, vect_location,
568 "Detected double reduction.\n");
570 STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_double_reduction_def;
571 STMT_VINFO_DEF_TYPE (reduc_stmt_info)
572 = vect_double_reduction_def;
574 else
576 if (loop != LOOP_VINFO_LOOP (loop_vinfo))
578 if (dump_enabled_p ())
579 dump_printf_loc (MSG_NOTE, vect_location,
580 "Detected vectorizable nested cycle.\n");
582 STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_nested_cycle;
583 STMT_VINFO_DEF_TYPE (reduc_stmt_info) = vect_nested_cycle;
585 else
587 if (dump_enabled_p ())
588 dump_printf_loc (MSG_NOTE, vect_location,
589 "Detected reduction.\n");
591 STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_reduction_def;
592 STMT_VINFO_DEF_TYPE (reduc_stmt_info) = vect_reduction_def;
593 /* Store the reduction cycles for possible vectorization in
594 loop-aware SLP if it was not detected as reduction
595 chain. */
596 if (! REDUC_GROUP_FIRST_ELEMENT (reduc_stmt_info))
597 LOOP_VINFO_REDUCTIONS (loop_vinfo).safe_push
598 (reduc_stmt_info);
602 else
603 if (dump_enabled_p ())
604 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
605 "Unknown def-use cycle pattern.\n");
610 /* Function vect_analyze_scalar_cycles.
612 Examine the cross iteration def-use cycles of scalar variables, by
613 analyzing the loop-header PHIs of scalar variables. Classify each
614 cycle as one of the following: invariant, induction, reduction, unknown.
615 We do that for the loop represented by LOOP_VINFO, and also for its
616 inner-loop, if it exists.
617 Examples for scalar cycles:
619 Example1: reduction:
621 loop1:
622 for (i=0; i<N; i++)
623 sum += a[i];
625 Example2: induction:
627 loop2:
628 for (i=0; i<N; i++)
629 a[i] = i; */
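/* A further illustrative case (added here, not part of the original
   comment): a nested accumulation such as

     for (i = 0; i < N; i++)
       for (j = 0; j < M; j++)
         sum += a[i][j];

   is the shape of cycle that the analysis below classifies as a double
   reduction when the outer loop is the one being considered.  */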
631 static void
632 vect_analyze_scalar_cycles (loop_vec_info loop_vinfo)
634 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
636 vect_analyze_scalar_cycles_1 (loop_vinfo, loop);
638 /* When vectorizing an outer-loop, the inner-loop is executed sequentially.
639 Reductions in such inner-loop therefore have different properties than
640 the reductions in the nest that gets vectorized:
641 1. When vectorized, they are executed in the same order as in the original
642 scalar loop, so we can't change the order of computation when
643 vectorizing them.
644 2. FIXME: Inner-loop reductions can be used in the inner-loop, so the
645 current checks are too strict. */
647 if (loop->inner)
648 vect_analyze_scalar_cycles_1 (loop_vinfo, loop->inner);
651 /* Transfer group and reduction information from STMT_INFO to its
652 pattern stmt. */
654 static void
655 vect_fixup_reduc_chain (stmt_vec_info stmt_info)
657 stmt_vec_info firstp = STMT_VINFO_RELATED_STMT (stmt_info);
658 stmt_vec_info stmtp;
659 gcc_assert (!REDUC_GROUP_FIRST_ELEMENT (firstp)
660 && REDUC_GROUP_FIRST_ELEMENT (stmt_info));
661 REDUC_GROUP_SIZE (firstp) = REDUC_GROUP_SIZE (stmt_info);
664 stmtp = STMT_VINFO_RELATED_STMT (stmt_info);
665 REDUC_GROUP_FIRST_ELEMENT (stmtp) = firstp;
666 stmt_info = REDUC_GROUP_NEXT_ELEMENT (stmt_info);
667 if (stmt_info)
668 REDUC_GROUP_NEXT_ELEMENT (stmtp)
669 = STMT_VINFO_RELATED_STMT (stmt_info);
671 while (stmt_info);
672 STMT_VINFO_DEF_TYPE (stmtp) = vect_reduction_def;
675 /* Fixup scalar cycles that now have their stmts detected as patterns. */
677 static void
678 vect_fixup_scalar_cycles_with_patterns (loop_vec_info loop_vinfo)
680 stmt_vec_info first;
681 unsigned i;
683 FOR_EACH_VEC_ELT (LOOP_VINFO_REDUCTION_CHAINS (loop_vinfo), i, first)
684 if (STMT_VINFO_IN_PATTERN_P (first))
686 stmt_vec_info next = REDUC_GROUP_NEXT_ELEMENT (first);
687 while (next)
689 if (! STMT_VINFO_IN_PATTERN_P (next))
690 break;
691 next = REDUC_GROUP_NEXT_ELEMENT (next);
693 /* If not all stmts in the chain are patterns, try to handle
694 the chain without patterns. */
695 if (! next)
697 vect_fixup_reduc_chain (first);
698 LOOP_VINFO_REDUCTION_CHAINS (loop_vinfo)[i]
699 = STMT_VINFO_RELATED_STMT (first);
704 /* Function vect_get_loop_niters.
706 Determine how many iterations the loop is executed and place it
707 in NUMBER_OF_ITERATIONS. Place the number of latch iterations
708 in NUMBER_OF_ITERATIONSM1. Place the condition under which the
709 niter information holds in ASSUMPTIONS.
711 Return the loop exit condition. */
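/* Illustrative example (not part of the original comment): in the do-while
   form the vectorizer requires, a loop whose body executes n times takes
   its latch edge n - 1 times, so NUMBER_OF_ITERATIONSM1 is n - 1 and
   NUMBER_OF_ITERATIONS (the header execution count) is n.  */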
714 static gcond *
715 vect_get_loop_niters (struct loop *loop, tree *assumptions,
716 tree *number_of_iterations, tree *number_of_iterationsm1)
718 edge exit = single_exit (loop);
719 struct tree_niter_desc niter_desc;
720 tree niter_assumptions, niter, may_be_zero;
721 gcond *cond = get_loop_exit_condition (loop);
723 *assumptions = boolean_true_node;
724 *number_of_iterationsm1 = chrec_dont_know;
725 *number_of_iterations = chrec_dont_know;
726 DUMP_VECT_SCOPE ("get_loop_niters");
728 if (!exit)
729 return cond;
731 niter = chrec_dont_know;
732 may_be_zero = NULL_TREE;
733 niter_assumptions = boolean_true_node;
734 if (!number_of_iterations_exit_assumptions (loop, exit, &niter_desc, NULL)
735 || chrec_contains_undetermined (niter_desc.niter))
736 return cond;
738 niter_assumptions = niter_desc.assumptions;
739 may_be_zero = niter_desc.may_be_zero;
740 niter = niter_desc.niter;
742 if (may_be_zero && integer_zerop (may_be_zero))
743 may_be_zero = NULL_TREE;
745 if (may_be_zero)
747 if (COMPARISON_CLASS_P (may_be_zero))
749 /* Try to combine may_be_zero with assumptions, this can simplify
750 computation of niter expression. */
751 if (niter_assumptions && !integer_nonzerop (niter_assumptions))
752 niter_assumptions = fold_build2 (TRUTH_AND_EXPR, boolean_type_node,
753 niter_assumptions,
754 fold_build1 (TRUTH_NOT_EXPR,
755 boolean_type_node,
756 may_be_zero));
757 else
758 niter = fold_build3 (COND_EXPR, TREE_TYPE (niter), may_be_zero,
759 build_int_cst (TREE_TYPE (niter), 0),
760 rewrite_to_non_trapping_overflow (niter));
762 may_be_zero = NULL_TREE;
764 else if (integer_nonzerop (may_be_zero))
766 *number_of_iterationsm1 = build_int_cst (TREE_TYPE (niter), 0);
767 *number_of_iterations = build_int_cst (TREE_TYPE (niter), 1);
768 return cond;
770 else
771 return cond;
774 *assumptions = niter_assumptions;
775 *number_of_iterationsm1 = niter;
777 /* We want the number of loop header executions which is the number
778 of latch executions plus one.
779 ??? For UINT_MAX latch executions this number overflows to zero
780 for loops like do { n++; } while (n != 0); */
781 if (niter && !chrec_contains_undetermined (niter))
782 niter = fold_build2 (PLUS_EXPR, TREE_TYPE (niter), unshare_expr (niter),
783 build_int_cst (TREE_TYPE (niter), 1));
784 *number_of_iterations = niter;
786 return cond;
789 /* Function bb_in_loop_p
791 Used as predicate for dfs order traversal of the loop bbs. */
793 static bool
794 bb_in_loop_p (const_basic_block bb, const void *data)
796 const struct loop *const loop = (const struct loop *)data;
797 if (flow_bb_inside_loop_p (loop, bb))
798 return true;
799 return false;
803 /* Create and initialize a new loop_vec_info struct for LOOP_IN, as well as
804 stmt_vec_info structs for all the stmts in LOOP_IN. */
806 _loop_vec_info::_loop_vec_info (struct loop *loop_in, vec_info_shared *shared)
807 : vec_info (vec_info::loop, init_cost (loop_in), shared),
808 loop (loop_in),
809 bbs (XCNEWVEC (basic_block, loop->num_nodes)),
810 num_itersm1 (NULL_TREE),
811 num_iters (NULL_TREE),
812 num_iters_unchanged (NULL_TREE),
813 num_iters_assumptions (NULL_TREE),
814 th (0),
815 versioning_threshold (0),
816 vectorization_factor (0),
817 max_vectorization_factor (0),
818 mask_skip_niters (NULL_TREE),
819 mask_compare_type (NULL_TREE),
820 unaligned_dr (NULL),
821 peeling_for_alignment (0),
822 ptr_mask (0),
823 ivexpr_map (NULL),
824 slp_unrolling_factor (1),
825 single_scalar_iteration_cost (0),
826 vectorizable (false),
827 can_fully_mask_p (true),
828 fully_masked_p (false),
829 peeling_for_gaps (false),
830 peeling_for_niter (false),
831 operands_swapped (false),
832 no_data_dependencies (false),
833 has_mask_store (false),
834 scalar_loop (NULL),
835 orig_loop_info (NULL)
837 /* Create/Update stmt_info for all stmts in the loop. */
838 basic_block *body = get_loop_body (loop);
839 for (unsigned int i = 0; i < loop->num_nodes; i++)
841 basic_block bb = body[i];
842 gimple_stmt_iterator si;
844 for (si = gsi_start_phis (bb); !gsi_end_p (si); gsi_next (&si))
846 gimple *phi = gsi_stmt (si);
847 gimple_set_uid (phi, 0);
848 add_stmt (phi);
851 for (si = gsi_start_bb (bb); !gsi_end_p (si); gsi_next (&si))
853 gimple *stmt = gsi_stmt (si);
854 gimple_set_uid (stmt, 0);
855 add_stmt (stmt);
858 free (body);
860 /* CHECKME: We want to visit all BBs before their successors (except for
861 latch blocks, for which this assertion wouldn't hold). In the simple
862 case of the loop forms we allow, a dfs order of the BBs would be the same
863 as reversed postorder traversal, so we are safe. */
865 unsigned int nbbs = dfs_enumerate_from (loop->header, 0, bb_in_loop_p,
866 bbs, loop->num_nodes, loop);
867 gcc_assert (nbbs == loop->num_nodes);
870 /* Free all levels of MASKS. */
872 void
873 release_vec_loop_masks (vec_loop_masks *masks)
875 rgroup_masks *rgm;
876 unsigned int i;
877 FOR_EACH_VEC_ELT (*masks, i, rgm)
878 rgm->masks.release ();
879 masks->release ();
882 /* Free all memory used by the _loop_vec_info, as well as all the
883 stmt_vec_info structs of all the stmts in the loop. */
885 _loop_vec_info::~_loop_vec_info ()
887 int nbbs;
888 gimple_stmt_iterator si;
889 int j;
891 /* ??? We're releasing loop_vinfos en-block. */
892 set_stmt_vec_info_vec (&stmt_vec_infos);
893 nbbs = loop->num_nodes;
894 for (j = 0; j < nbbs; j++)
896 basic_block bb = bbs[j];
897 for (si = gsi_start_phis (bb); !gsi_end_p (si); gsi_next (&si))
898 free_stmt_vec_info (gsi_stmt (si));
900 for (si = gsi_start_bb (bb); !gsi_end_p (si); )
902 gimple *stmt = gsi_stmt (si);
904 /* We may have broken canonical form by moving a constant
905 into RHS1 of a commutative op. Fix such occurrences. */
906 if (operands_swapped && is_gimple_assign (stmt))
908 enum tree_code code = gimple_assign_rhs_code (stmt);
910 if ((code == PLUS_EXPR
911 || code == POINTER_PLUS_EXPR
912 || code == MULT_EXPR)
913 && CONSTANT_CLASS_P (gimple_assign_rhs1 (stmt)))
914 swap_ssa_operands (stmt,
915 gimple_assign_rhs1_ptr (stmt),
916 gimple_assign_rhs2_ptr (stmt));
917 else if (code == COND_EXPR
918 && CONSTANT_CLASS_P (gimple_assign_rhs2 (stmt)))
920 tree cond_expr = gimple_assign_rhs1 (stmt);
921 enum tree_code cond_code = TREE_CODE (cond_expr);
923 if (TREE_CODE_CLASS (cond_code) == tcc_comparison)
925 bool honor_nans = HONOR_NANS (TREE_OPERAND (cond_expr,
926 0));
927 cond_code = invert_tree_comparison (cond_code,
928 honor_nans);
929 if (cond_code != ERROR_MARK)
931 TREE_SET_CODE (cond_expr, cond_code);
932 swap_ssa_operands (stmt,
933 gimple_assign_rhs2_ptr (stmt),
934 gimple_assign_rhs3_ptr (stmt));
940 /* Free stmt_vec_info. */
941 free_stmt_vec_info (stmt);
942 gsi_next (&si);
946 free (bbs);
948 release_vec_loop_masks (&masks);
949 delete ivexpr_map;
951 loop->aux = NULL;
954 /* Return an invariant or register for EXPR and emit necessary
955 computations in the LOOP_VINFO loop preheader. */
957 tree
958 cse_and_gimplify_to_preheader (loop_vec_info loop_vinfo, tree expr)
960 if (is_gimple_reg (expr)
961 || is_gimple_min_invariant (expr))
962 return expr;
964 if (! loop_vinfo->ivexpr_map)
965 loop_vinfo->ivexpr_map = new hash_map<tree_operand_hash, tree>;
966 tree &cached = loop_vinfo->ivexpr_map->get_or_insert (expr);
967 if (! cached)
969 gimple_seq stmts = NULL;
970 cached = force_gimple_operand (unshare_expr (expr),
971 &stmts, true, NULL_TREE);
972 if (stmts)
974 edge e = loop_preheader_edge (LOOP_VINFO_LOOP (loop_vinfo));
975 gsi_insert_seq_on_edge_immediate (e, stmts);
978 return cached;
981 /* Return true if we can use CMP_TYPE as the comparison type to produce
982 all masks required to mask LOOP_VINFO. */
984 static bool
985 can_produce_all_loop_masks_p (loop_vec_info loop_vinfo, tree cmp_type)
987 rgroup_masks *rgm;
988 unsigned int i;
989 FOR_EACH_VEC_ELT (LOOP_VINFO_MASKS (loop_vinfo), i, rgm)
990 if (rgm->mask_type != NULL_TREE
991 && !direct_internal_fn_supported_p (IFN_WHILE_ULT,
992 cmp_type, rgm->mask_type,
993 OPTIMIZE_FOR_SPEED))
994 return false;
995 return true;
998 /* Calculate the maximum number of scalars per iteration for every
999 rgroup in LOOP_VINFO. */
1001 static unsigned int
1002 vect_get_max_nscalars_per_iter (loop_vec_info loop_vinfo)
1004 unsigned int res = 1;
1005 unsigned int i;
1006 rgroup_masks *rgm;
1007 FOR_EACH_VEC_ELT (LOOP_VINFO_MASKS (loop_vinfo), i, rgm)
1008 res = MAX (res, rgm->max_nscalars_per_iter);
1009 return res;
1012 /* Each statement in LOOP_VINFO can be masked where necessary. Check
1013 whether we can actually generate the masks required. Return true if so,
1014 storing the type of the scalar IV in LOOP_VINFO_MASK_COMPARE_TYPE. */
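/* Worked example (an illustration, not part of the original comment):
   if the loop runs at most 1000 iterations and the widest rgroup needs
   2 mask bits per scalar iteration, the WHILE_ULT comparison must count
   up to 2000, which takes 11 bits; any supported integer mode at least
   that wide can then serve as the comparison type, with Pmode preferred
   as explained in the code below.  */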
1016 static bool
1017 vect_verify_full_masking (loop_vec_info loop_vinfo)
1019 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1020 unsigned int min_ni_width;
1022 /* Use a normal loop if there are no statements that need masking.
1023 This only happens in rare degenerate cases: it means that the loop
1024 has no loads, no stores, and no live-out values. */
1025 if (LOOP_VINFO_MASKS (loop_vinfo).is_empty ())
1026 return false;
1028 /* Get the maximum number of iterations that is representable
1029 in the counter type. */
1030 tree ni_type = TREE_TYPE (LOOP_VINFO_NITERSM1 (loop_vinfo));
1031 widest_int max_ni = wi::to_widest (TYPE_MAX_VALUE (ni_type)) + 1;
1033 /* Get a more refined estimate for the number of iterations. */
1034 widest_int max_back_edges;
1035 if (max_loop_iterations (loop, &max_back_edges))
1036 max_ni = wi::smin (max_ni, max_back_edges + 1);
1038 /* Account for rgroup masks, in which each bit is replicated N times. */
1039 max_ni *= vect_get_max_nscalars_per_iter (loop_vinfo);
1041 /* Work out how many bits we need to represent the limit. */
1042 min_ni_width = wi::min_precision (max_ni, UNSIGNED);
1044 /* Find a scalar mode for which WHILE_ULT is supported. */
1045 opt_scalar_int_mode cmp_mode_iter;
1046 tree cmp_type = NULL_TREE;
1047 FOR_EACH_MODE_IN_CLASS (cmp_mode_iter, MODE_INT)
1049 unsigned int cmp_bits = GET_MODE_BITSIZE (cmp_mode_iter.require ());
1050 if (cmp_bits >= min_ni_width
1051 && targetm.scalar_mode_supported_p (cmp_mode_iter.require ()))
1053 tree this_type = build_nonstandard_integer_type (cmp_bits, true);
1054 if (this_type
1055 && can_produce_all_loop_masks_p (loop_vinfo, this_type))
1057 /* Although we could stop as soon as we find a valid mode,
1058 it's often better to continue until we hit Pmode, since the
1059 operands to the WHILE are more likely to be reusable in
1060 address calculations. */
1061 cmp_type = this_type;
1062 if (cmp_bits >= GET_MODE_BITSIZE (Pmode))
1063 break;
1068 if (!cmp_type)
1069 return false;
1071 LOOP_VINFO_MASK_COMPARE_TYPE (loop_vinfo) = cmp_type;
1072 return true;
1075 /* Calculate the cost of one scalar iteration of the loop. */
1076 static void
1077 vect_compute_single_scalar_iteration_cost (loop_vec_info loop_vinfo)
1079 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1080 basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
1081 int nbbs = loop->num_nodes, factor;
1082 int innerloop_iters, i;
1084 /* Gather costs for statements in the scalar loop. */
1086 /* FORNOW. */
1087 innerloop_iters = 1;
1088 if (loop->inner)
1089 innerloop_iters = 50; /* FIXME */
1091 for (i = 0; i < nbbs; i++)
1093 gimple_stmt_iterator si;
1094 basic_block bb = bbs[i];
1096 if (bb->loop_father == loop->inner)
1097 factor = innerloop_iters;
1098 else
1099 factor = 1;
1101 for (si = gsi_start_bb (bb); !gsi_end_p (si); gsi_next (&si))
1103 gimple *stmt = gsi_stmt (si);
1104 stmt_vec_info stmt_info = loop_vinfo->lookup_stmt (stmt);
1106 if (!is_gimple_assign (stmt) && !is_gimple_call (stmt))
1107 continue;
1109 /* Skip stmts that are not vectorized inside the loop. */
1110 if (stmt_info
1111 && !STMT_VINFO_RELEVANT_P (stmt_info)
1112 && (!STMT_VINFO_LIVE_P (stmt_info)
1113 || !VECTORIZABLE_CYCLE_DEF (STMT_VINFO_DEF_TYPE (stmt_info)))
1114 && !STMT_VINFO_IN_PATTERN_P (stmt_info))
1115 continue;
1117 vect_cost_for_stmt kind;
1118 if (STMT_VINFO_DATA_REF (stmt_info))
1120 if (DR_IS_READ (STMT_VINFO_DATA_REF (stmt_info)))
1121 kind = scalar_load;
1122 else
1123 kind = scalar_store;
1125 else
1126 kind = scalar_stmt;
1128 record_stmt_cost (&LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo),
1129 factor, kind, stmt_info, 0, vect_prologue);
1133 /* Now accumulate cost. */
1134 void *target_cost_data = init_cost (loop);
1135 stmt_info_for_cost *si;
1136 int j;
1137 FOR_EACH_VEC_ELT (LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo),
1138 j, si)
1139 (void) add_stmt_cost (target_cost_data, si->count,
1140 si->kind, si->stmt_info, si->misalign,
1141 vect_body);
1142 unsigned dummy, body_cost = 0;
1143 finish_cost (target_cost_data, &dummy, &body_cost, &dummy);
1144 destroy_cost_data (target_cost_data);
1145 LOOP_VINFO_SINGLE_SCALAR_ITERATION_COST (loop_vinfo) = body_cost;
1149 /* Function vect_analyze_loop_form_1.
1151 Verify that certain CFG restrictions hold, including:
1152 - the loop has a pre-header
1153 - the loop has a single entry and exit
1154 - the loop exit condition is simple enough
1155 - the number of iterations can be analyzed, i.e., a countable loop. The
1156 niter could be analyzed under some assumptions. */
1158 bool
1159 vect_analyze_loop_form_1 (struct loop *loop, gcond **loop_cond,
1160 tree *assumptions, tree *number_of_iterationsm1,
1161 tree *number_of_iterations, gcond **inner_loop_cond)
1163 DUMP_VECT_SCOPE ("vect_analyze_loop_form");
1165 /* Different restrictions apply when we are considering an inner-most loop,
1166 vs. an outer (nested) loop.
1167 (FORNOW. May want to relax some of these restrictions in the future). */
1169 if (!loop->inner)
1171 /* Inner-most loop. We currently require that the number of BBs is
1172 exactly 2 (the header and latch). Vectorizable inner-most loops
1173 look like this:
1175 (pre-header)
1177 header <--------+
1178 | | |
1179 | +--> latch --+
1181 (exit-bb) */
1183 if (loop->num_nodes != 2)
1185 if (dump_enabled_p ())
1186 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1187 "not vectorized: control flow in loop.\n");
1188 return false;
1191 if (empty_block_p (loop->header))
1193 if (dump_enabled_p ())
1194 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1195 "not vectorized: empty loop.\n");
1196 return false;
1199 else
1201 struct loop *innerloop = loop->inner;
1202 edge entryedge;
1204 /* Nested loop. We currently require that the loop is doubly-nested,
1205 contains a single inner loop, and the number of BBs is exactly 5.
1206 Vectorizable outer-loops look like this:
1208 (pre-header)
1210 header <---+
1212 inner-loop |
1214 tail ------+
1216 (exit-bb)
1218 The inner-loop has the properties expected of inner-most loops
1219 as described above. */
1221 if ((loop->inner)->inner || (loop->inner)->next)
1223 if (dump_enabled_p ())
1224 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1225 "not vectorized: multiple nested loops.\n");
1226 return false;
1229 if (loop->num_nodes != 5)
1231 if (dump_enabled_p ())
1232 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1233 "not vectorized: control flow in loop.\n");
1234 return false;
1237 entryedge = loop_preheader_edge (innerloop);
1238 if (entryedge->src != loop->header
1239 || !single_exit (innerloop)
1240 || single_exit (innerloop)->dest != EDGE_PRED (loop->latch, 0)->src)
1242 if (dump_enabled_p ())
1243 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1244 "not vectorized: unsupported outerloop form.\n");
1245 return false;
1248 /* Analyze the inner-loop. */
1249 tree inner_niterm1, inner_niter, inner_assumptions;
1250 if (! vect_analyze_loop_form_1 (loop->inner, inner_loop_cond,
1251 &inner_assumptions, &inner_niterm1,
1252 &inner_niter, NULL)
1253 /* Don't support analyzing niter under assumptions for inner
1254 loop. */
1255 || !integer_onep (inner_assumptions))
1257 if (dump_enabled_p ())
1258 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1259 "not vectorized: Bad inner loop.\n");
1260 return false;
1263 if (!expr_invariant_in_loop_p (loop, inner_niter))
1265 if (dump_enabled_p ())
1266 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1267 "not vectorized: inner-loop count not"
1268 " invariant.\n");
1269 return false;
1272 if (dump_enabled_p ())
1273 dump_printf_loc (MSG_NOTE, vect_location,
1274 "Considering outer-loop vectorization.\n");
1277 if (!single_exit (loop)
1278 || EDGE_COUNT (loop->header->preds) != 2)
1280 if (dump_enabled_p ())
1282 if (!single_exit (loop))
1283 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1284 "not vectorized: multiple exits.\n");
1285 else if (EDGE_COUNT (loop->header->preds) != 2)
1286 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1287 "not vectorized: too many incoming edges.\n");
1289 return false;
1292 /* We assume that the loop exit condition is at the end of the loop, i.e.,
1293 that the loop is represented as a do-while (with a proper if-guard
1294 before the loop if needed), where the loop header contains all the
1295 executable statements, and the latch is empty. */
1296 if (!empty_block_p (loop->latch)
1297 || !gimple_seq_empty_p (phi_nodes (loop->latch)))
1299 if (dump_enabled_p ())
1300 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1301 "not vectorized: latch block not empty.\n");
1302 return false;
1305 /* Make sure the exit is not abnormal. */
1306 edge e = single_exit (loop);
1307 if (e->flags & EDGE_ABNORMAL)
1309 if (dump_enabled_p ())
1310 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1311 "not vectorized: abnormal loop exit edge.\n");
1312 return false;
1315 *loop_cond = vect_get_loop_niters (loop, assumptions, number_of_iterations,
1316 number_of_iterationsm1);
1317 if (!*loop_cond)
1319 if (dump_enabled_p ())
1320 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1321 "not vectorized: complicated exit condition.\n");
1322 return false;
1325 if (integer_zerop (*assumptions)
1326 || !*number_of_iterations
1327 || chrec_contains_undetermined (*number_of_iterations))
1329 if (dump_enabled_p ())
1330 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1331 "not vectorized: number of iterations cannot be "
1332 "computed.\n");
1333 return false;
1336 if (integer_zerop (*number_of_iterations))
1338 if (dump_enabled_p ())
1339 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1340 "not vectorized: number of iterations = 0.\n");
1341 return false;
1344 return true;
1347 /* Analyze LOOP form and return a loop_vec_info if it is of suitable form. */
1349 loop_vec_info
1350 vect_analyze_loop_form (struct loop *loop, vec_info_shared *shared)
1352 tree assumptions, number_of_iterations, number_of_iterationsm1;
1353 gcond *loop_cond, *inner_loop_cond = NULL;
1355 if (! vect_analyze_loop_form_1 (loop, &loop_cond,
1356 &assumptions, &number_of_iterationsm1,
1357 &number_of_iterations, &inner_loop_cond))
1358 return NULL;
1360 loop_vec_info loop_vinfo = new _loop_vec_info (loop, shared);
1361 LOOP_VINFO_NITERSM1 (loop_vinfo) = number_of_iterationsm1;
1362 LOOP_VINFO_NITERS (loop_vinfo) = number_of_iterations;
1363 LOOP_VINFO_NITERS_UNCHANGED (loop_vinfo) = number_of_iterations;
1364 if (!integer_onep (assumptions))
1366 /* We consider vectorizing this loop by versioning it under
1367 some assumptions. In order to do this, we need to clear
1368 existing information computed by scev and niter analyzer. */
1369 scev_reset_htab ();
1370 free_numbers_of_iterations_estimates (loop);
1371 /* Also set flag for this loop so that following scev and niter
1372 analysis are done under the assumptions. */
1373 loop_constraint_set (loop, LOOP_C_FINITE);
1374 /* Also record the assumptions for versioning. */
1375 LOOP_VINFO_NITERS_ASSUMPTIONS (loop_vinfo) = assumptions;
1378 if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
1380 if (dump_enabled_p ())
1382 dump_printf_loc (MSG_NOTE, vect_location,
1383 "Symbolic number of iterations is ");
1384 dump_generic_expr (MSG_NOTE, TDF_DETAILS, number_of_iterations);
1385 dump_printf (MSG_NOTE, "\n");
1389 stmt_vec_info loop_cond_info = loop_vinfo->lookup_stmt (loop_cond);
1390 STMT_VINFO_TYPE (loop_cond_info) = loop_exit_ctrl_vec_info_type;
1391 if (inner_loop_cond)
1393 stmt_vec_info inner_loop_cond_info
1394 = loop_vinfo->lookup_stmt (inner_loop_cond);
1395 STMT_VINFO_TYPE (inner_loop_cond_info) = loop_exit_ctrl_vec_info_type;
1398 gcc_assert (!loop->aux);
1399 loop->aux = loop_vinfo;
1400 return loop_vinfo;
1405 /* Scan the loop stmts and dependent on whether there are any (non-)SLP
1406 statements update the vectorization factor. */
1408 static void
1409 vect_update_vf_for_slp (loop_vec_info loop_vinfo)
1411 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1412 basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
1413 int nbbs = loop->num_nodes;
1414 poly_uint64 vectorization_factor;
1415 int i;
1417 DUMP_VECT_SCOPE ("vect_update_vf_for_slp");
1419 vectorization_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
1420 gcc_assert (known_ne (vectorization_factor, 0U));
1422 /* If all the stmts in the loop can be SLPed, we perform only SLP, and
1423 vectorization factor of the loop is the unrolling factor required by
1424 the SLP instances. If that unrolling factor is 1, we say that we
1425 perform pure SLP on the loop; cross iteration parallelism is not
1426 exploited. */
1427 bool only_slp_in_loop = true;
1428 for (i = 0; i < nbbs; i++)
1430 basic_block bb = bbs[i];
1431 for (gimple_stmt_iterator si = gsi_start_bb (bb); !gsi_end_p (si);
1432 gsi_next (&si))
1434 stmt_vec_info stmt_info = loop_vinfo->lookup_stmt (gsi_stmt (si));
1435 if (STMT_VINFO_IN_PATTERN_P (stmt_info)
1436 && STMT_VINFO_RELATED_STMT (stmt_info))
1437 stmt_info = STMT_VINFO_RELATED_STMT (stmt_info);
1438 if ((STMT_VINFO_RELEVANT_P (stmt_info)
1439 || VECTORIZABLE_CYCLE_DEF (STMT_VINFO_DEF_TYPE (stmt_info)))
1440 && !PURE_SLP_STMT (stmt_info))
1441 /* STMT needs both SLP and loop-based vectorization. */
1442 only_slp_in_loop = false;
1446 if (only_slp_in_loop)
1448 dump_printf_loc (MSG_NOTE, vect_location,
1449 "Loop contains only SLP stmts\n");
1450 vectorization_factor = LOOP_VINFO_SLP_UNROLLING_FACTOR (loop_vinfo);
1452 else
1454 dump_printf_loc (MSG_NOTE, vect_location,
1455 "Loop contains SLP and non-SLP stmts\n");
1456 /* Both the vectorization factor and unroll factor have the form
1457 current_vector_size * X for some rational X, so they must have
1458 a common multiple. */
1459 vectorization_factor
1460 = force_common_multiple (vectorization_factor,
1461 LOOP_VINFO_SLP_UNROLLING_FACTOR (loop_vinfo));
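      /* For instance (illustrative): a vectorization factor of 4 and an
	 SLP unrolling factor of 2 give a common multiple of 4, whereas an
	 unrolling factor of 8 would raise the result to 8.  */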
1464 LOOP_VINFO_VECT_FACTOR (loop_vinfo) = vectorization_factor;
1465 if (dump_enabled_p ())
1467 dump_printf_loc (MSG_NOTE, vect_location,
1468 "Updating vectorization factor to ");
1469 dump_dec (MSG_NOTE, vectorization_factor);
1470 dump_printf (MSG_NOTE, ".\n");
1474 /* Return true if STMT_INFO describes a double reduction phi and if
1475 the other phi in the reduction is also relevant for vectorization.
1476 This rejects cases such as:
1478 outer1:
1479 x_1 = PHI <x_3(outer2), ...>;
1482 inner:
1483 x_2 = ...;
1486 outer2:
1487 x_3 = PHI <x_2(inner)>;
1489 if nothing in x_2 or elsewhere makes x_1 relevant. */
1491 static bool
1492 vect_active_double_reduction_p (stmt_vec_info stmt_info)
1494 if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_double_reduction_def)
1495 return false;
1497 return STMT_VINFO_RELEVANT_P (STMT_VINFO_REDUC_DEF (stmt_info));
1500 /* Function vect_analyze_loop_operations.
1502 Scan the loop stmts and make sure they are all vectorizable. */
1504 static bool
1505 vect_analyze_loop_operations (loop_vec_info loop_vinfo)
1507 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1508 basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
1509 int nbbs = loop->num_nodes;
1510 int i;
1511 stmt_vec_info stmt_info;
1512 bool need_to_vectorize = false;
1513 bool ok;
1515 DUMP_VECT_SCOPE ("vect_analyze_loop_operations");
1517 stmt_vector_for_cost cost_vec;
1518 cost_vec.create (2);
1520 for (i = 0; i < nbbs; i++)
1522 basic_block bb = bbs[i];
1524 for (gphi_iterator si = gsi_start_phis (bb); !gsi_end_p (si);
1525 gsi_next (&si))
1527 gphi *phi = si.phi ();
1528 ok = true;
1530 stmt_info = loop_vinfo->lookup_stmt (phi);
1531 if (dump_enabled_p ())
1533 dump_printf_loc (MSG_NOTE, vect_location, "examining phi: ");
1534 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, phi, 0);
1536 if (virtual_operand_p (gimple_phi_result (phi)))
1537 continue;
1539 /* Inner-loop loop-closed exit phi in outer-loop vectorization
1540 (i.e., a phi in the tail of the outer-loop). */
1541 if (! is_loop_header_bb_p (bb))
1543 /* FORNOW: we currently don't support the case that these phis
1544 are not used in the outerloop (unless it is double reduction,
1545 i.e., this phi is vect_reduction_def), cause this case
1546 requires to actually do something here. */
1547 if (STMT_VINFO_LIVE_P (stmt_info)
1548 && !vect_active_double_reduction_p (stmt_info))
1550 if (dump_enabled_p ())
1551 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1552 "Unsupported loop-closed phi in "
1553 "outer-loop.\n");
1554 return false;
1557 /* If PHI is used in the outer loop, we check that its operand
1558 is defined in the inner loop. */
1559 if (STMT_VINFO_RELEVANT_P (stmt_info))
1561 tree phi_op;
1563 if (gimple_phi_num_args (phi) != 1)
1564 return false;
1566 phi_op = PHI_ARG_DEF (phi, 0);
1567 stmt_vec_info op_def_info = loop_vinfo->lookup_def (phi_op);
1568 if (!op_def_info)
1569 return false;
1571 if (STMT_VINFO_RELEVANT (op_def_info) != vect_used_in_outer
1572 && (STMT_VINFO_RELEVANT (op_def_info)
1573 != vect_used_in_outer_by_reduction))
1574 return false;
1577 continue;
1580 gcc_assert (stmt_info);
1582 if ((STMT_VINFO_RELEVANT (stmt_info) == vect_used_in_scope
1583 || STMT_VINFO_LIVE_P (stmt_info))
1584 && STMT_VINFO_DEF_TYPE (stmt_info) != vect_induction_def)
1586 /* A scalar-dependence cycle that we don't support. */
1587 if (dump_enabled_p ())
1588 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1589 "not vectorized: scalar dependence cycle.\n");
1590 return false;
1593 if (STMT_VINFO_RELEVANT_P (stmt_info))
1595 need_to_vectorize = true;
1596 if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_induction_def
1597 && ! PURE_SLP_STMT (stmt_info))
1598 ok = vectorizable_induction (stmt_info, NULL, NULL, NULL,
1599 &cost_vec);
1600 else if ((STMT_VINFO_DEF_TYPE (stmt_info) == vect_reduction_def
1601 || STMT_VINFO_DEF_TYPE (stmt_info) == vect_nested_cycle)
1602 && ! PURE_SLP_STMT (stmt_info))
1603 ok = vectorizable_reduction (stmt_info, NULL, NULL, NULL, NULL,
1604 &cost_vec);
1607 /* SLP PHIs are tested by vect_slp_analyze_node_operations. */
1608 if (ok
1609 && STMT_VINFO_LIVE_P (stmt_info)
1610 && !PURE_SLP_STMT (stmt_info))
1611 ok = vectorizable_live_operation (stmt_info, NULL, NULL, -1, NULL,
1612 &cost_vec);
1614 if (!ok)
1616 if (dump_enabled_p ())
1618 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1619 "not vectorized: relevant phi not "
1620 "supported: ");
1621 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, phi, 0);
1623 return false;
1627 for (gimple_stmt_iterator si = gsi_start_bb (bb); !gsi_end_p (si);
1628 gsi_next (&si))
1630 gimple *stmt = gsi_stmt (si);
1631 if (!gimple_clobber_p (stmt)
1632 && !vect_analyze_stmt (loop_vinfo->lookup_stmt (stmt),
1633 &need_to_vectorize,
1634 NULL, NULL, &cost_vec))
1635 return false;
1637 } /* bbs */
1639 add_stmt_costs (loop_vinfo->target_cost_data, &cost_vec);
1640 cost_vec.release ();
1642 /* All operations in the loop are either irrelevant (deal with loop
1643 control, or dead), or only used outside the loop and can be moved
1644 out of the loop (e.g. invariants, inductions). The loop can be
1645 optimized away by scalar optimizations. We're better off not
1646 touching this loop. */
1647 if (!need_to_vectorize)
1649 if (dump_enabled_p ())
1650 dump_printf_loc (MSG_NOTE, vect_location,
1651 "All the computation can be taken out of the loop.\n");
1652 if (dump_enabled_p ())
1653 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1654 "not vectorized: redundant loop. no profit to "
1655 "vectorize.\n");
1656 return false;
1659 return true;
1662 /* Analyze the cost of the loop described by LOOP_VINFO. Decide if it
1663 is worthwhile to vectorize. Return 1 if definitely yes, 0 if
1664 definitely no, or -1 if it's worth retrying. */
1666 static int
1667 vect_analyze_loop_costing (loop_vec_info loop_vinfo)
1669 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1670 unsigned int assumed_vf = vect_vf_for_cost (loop_vinfo);
1672 /* Only fully-masked loops can have iteration counts less than the
1673 vectorization factor. */
1674 if (!LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
1676 HOST_WIDE_INT max_niter;
1678 if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
1679 max_niter = LOOP_VINFO_INT_NITERS (loop_vinfo);
1680 else
1681 max_niter = max_stmt_executions_int (loop);
1683 if (max_niter != -1
1684 && (unsigned HOST_WIDE_INT) max_niter < assumed_vf)
1686 if (dump_enabled_p ())
1687 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1688 "not vectorized: iteration count smaller than "
1689 "vectorization factor.\n");
1690 return 0;
1694 int min_profitable_iters, min_profitable_estimate;
1695 vect_estimate_min_profitable_iters (loop_vinfo, &min_profitable_iters,
1696 &min_profitable_estimate);
1698 if (min_profitable_iters < 0)
1700 if (dump_enabled_p ())
1701 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1702 "not vectorized: vectorization not profitable.\n");
1703 if (dump_enabled_p ())
1704 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1705 "not vectorized: vector version will never be "
1706 "profitable.\n");
1707 return -1;
1710 int min_scalar_loop_bound = (PARAM_VALUE (PARAM_MIN_VECT_LOOP_BOUND)
1711 * assumed_vf);
1713 /* Use the cost model only if it is more conservative than user specified
1714 threshold. */
1715 unsigned int th = (unsigned) MAX (min_scalar_loop_bound,
1716 min_profitable_iters);
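  /* For instance (illustrative, with made-up numbers): assumed_vf = 4 and
     --param min-vect-loop-bound=2 give min_scalar_loop_bound = 8; if the
     cost model reports min_profitable_iters = 11, the threshold becomes
     MAX (8, 11) = 11.  */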
1718 LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo) = th;
1720 if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
1721 && LOOP_VINFO_INT_NITERS (loop_vinfo) < th)
1723 if (dump_enabled_p ())
1724 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1725 "not vectorized: vectorization not profitable.\n");
1726 if (dump_enabled_p ())
1727 dump_printf_loc (MSG_NOTE, vect_location,
1728 "not vectorized: iteration count smaller than user "
1729 "specified loop bound parameter or minimum profitable "
1730 "iterations (whichever is more conservative).\n");
1731 return 0;
1734 HOST_WIDE_INT estimated_niter = estimated_stmt_executions_int (loop);
1735 if (estimated_niter == -1)
1736 estimated_niter = likely_max_stmt_executions_int (loop);
1737 if (estimated_niter != -1
1738 && ((unsigned HOST_WIDE_INT) estimated_niter
1739 < MAX (th, (unsigned) min_profitable_estimate)))
1741 if (dump_enabled_p ())
1742 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1743 "not vectorized: estimated iteration count too "
1744 "small.\n");
1745 if (dump_enabled_p ())
1746 dump_printf_loc (MSG_NOTE, vect_location,
1747 "not vectorized: estimated iteration count smaller "
1748 "than specified loop bound parameter or minimum "
1749 "profitable iterations (whichever is more "
1750 "conservative).\n");
1751 return -1;
1754 return 1;
1757 static bool
1758 vect_get_datarefs_in_loop (loop_p loop, basic_block *bbs,
1759 vec<data_reference_p> *datarefs,
1760 unsigned int *n_stmts)
1762 *n_stmts = 0;
1763 for (unsigned i = 0; i < loop->num_nodes; i++)
1764 for (gimple_stmt_iterator gsi = gsi_start_bb (bbs[i]);
1765 !gsi_end_p (gsi); gsi_next (&gsi))
1767 gimple *stmt = gsi_stmt (gsi);
1768 if (is_gimple_debug (stmt))
1769 continue;
1770 ++(*n_stmts);
1771 if (!vect_find_stmt_data_reference (loop, stmt, datarefs))
1773 if (is_gimple_call (stmt) && loop->safelen)
1775 tree fndecl = gimple_call_fndecl (stmt), op;
1776 if (fndecl != NULL_TREE)
1778 cgraph_node *node = cgraph_node::get (fndecl);
1779 if (node != NULL && node->simd_clones != NULL)
1781 unsigned int j, n = gimple_call_num_args (stmt);
1782 for (j = 0; j < n; j++)
1784 op = gimple_call_arg (stmt, j);
1785 if (DECL_P (op)
1786 || (REFERENCE_CLASS_P (op)
1787 && get_base_address (op)))
1788 break;
1790 op = gimple_call_lhs (stmt);
1791 /* Ignore #pragma omp declare simd functions
1792 if they don't have data references in the
1793 call stmt itself. */
1794 if (j == n
1795 && !(op
1796 && (DECL_P (op)
1797 || (REFERENCE_CLASS_P (op)
1798 && get_base_address (op)))))
1799 continue;
1803 return false;
1805 /* If dependence analysis would give up due to the limit on the
1806 number of datarefs stop here and fail fatally. */
1807 if (datarefs->length ()
1808 > (unsigned)PARAM_VALUE (PARAM_LOOP_MAX_DATAREFS_FOR_DATADEPS))
1809 return false;
1811 return true;
1814 /* Function vect_analyze_loop_2.
1816 Apply a set of analyses on LOOP, and create a loop_vec_info struct
1817 for it. The different analyses will record information in the
1818 loop_vec_info struct. */
1819 static bool
1820 vect_analyze_loop_2 (loop_vec_info loop_vinfo, bool &fatal, unsigned *n_stmts)
1822 bool ok;
1823 int res;
1824 unsigned int max_vf = MAX_VECTORIZATION_FACTOR;
1825 poly_uint64 min_vf = 2;
1827 /* The first group of checks is independent of the vector size. */
1828 fatal = true;
1830 /* Find all data references in the loop (which correspond to vdefs/vuses)
1831 and analyze their evolution in the loop. */
1833 loop_p loop = LOOP_VINFO_LOOP (loop_vinfo);
1835 /* Gather the data references and count stmts in the loop. */
1836 if (!LOOP_VINFO_DATAREFS (loop_vinfo).exists ())
1838 if (!vect_get_datarefs_in_loop (loop, LOOP_VINFO_BBS (loop_vinfo),
1839 &LOOP_VINFO_DATAREFS (loop_vinfo),
1840 n_stmts))
1842 if (dump_enabled_p ())
1843 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1844 "not vectorized: loop contains function "
1845 "calls or data references that cannot "
1846 "be analyzed\n");
1847 return false;
1849 loop_vinfo->shared->save_datarefs ();
1851 else
1852 loop_vinfo->shared->check_datarefs ();
1854 /* Analyze the data references and also adjust the minimal
1855 vectorization factor according to the loads and stores. */
1857 ok = vect_analyze_data_refs (loop_vinfo, &min_vf);
1858 if (!ok)
1860 if (dump_enabled_p ())
1861 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1862 "bad data references.\n");
1863 return false;
1866 /* Classify all cross-iteration scalar data-flow cycles.
1867 Cross-iteration cycles caused by virtual phis are analyzed separately. */
1868 vect_analyze_scalar_cycles (loop_vinfo);
1870 vect_pattern_recog (loop_vinfo);
1872 vect_fixup_scalar_cycles_with_patterns (loop_vinfo);
1874 /* Analyze the access patterns of the data-refs in the loop (consecutive,
1875 complex, etc.). FORNOW: Only handle consecutive access pattern. */
1877 ok = vect_analyze_data_ref_accesses (loop_vinfo);
1878 if (!ok)
1880 if (dump_enabled_p ())
1881 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1882 "bad data access.\n");
1883 return false;
1886 /* Data-flow analysis to detect stmts that do not need to be vectorized. */
1888 ok = vect_mark_stmts_to_be_vectorized (loop_vinfo);
1889 if (!ok)
1891 if (dump_enabled_p ())
1892 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1893 "unexpected pattern.\n");
1894 return false;
1897 /* The rest of the analysis below depends on the vector size in some way, so from here on failures are no longer fatal. */
1898 fatal = false;
1900 /* Analyze data dependences between the data-refs in the loop
1901 and adjust the maximum vectorization factor according to
1902 the dependences.
1903 FORNOW: fail at the first data dependence that we encounter. */
1905 ok = vect_analyze_data_ref_dependences (loop_vinfo, &max_vf);
1906 if (!ok
1907 || (max_vf != MAX_VECTORIZATION_FACTOR
1908 && maybe_lt (max_vf, min_vf)))
1910 if (dump_enabled_p ())
1911 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1912 "bad data dependence.\n");
1913 return false;
1915 LOOP_VINFO_MAX_VECT_FACTOR (loop_vinfo) = max_vf;
1917 ok = vect_determine_vectorization_factor (loop_vinfo);
1918 if (!ok)
1920 if (dump_enabled_p ())
1921 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1922 "can't determine vectorization factor.\n");
1923 return false;
1925 if (max_vf != MAX_VECTORIZATION_FACTOR
1926 && maybe_lt (max_vf, LOOP_VINFO_VECT_FACTOR (loop_vinfo)))
1928 if (dump_enabled_p ())
1929 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1930 "bad data dependence.\n");
1931 return false;
1934 /* Compute the scalar iteration cost. */
1935 vect_compute_single_scalar_iteration_cost (loop_vinfo);
1937 poly_uint64 saved_vectorization_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
1938 unsigned th;
1940 /* Check the SLP opportunities in the loop, analyze and build SLP trees. */
1941 ok = vect_analyze_slp (loop_vinfo, *n_stmts);
1942 if (!ok)
1943 return false;
1945 /* If there are any SLP instances mark them as pure_slp. */
1946 bool slp = vect_make_slp_decision (loop_vinfo);
1947 if (slp)
1949 /* Find stmts that need to be both vectorized and SLPed. */
1950 vect_detect_hybrid_slp (loop_vinfo);
1952 /* Update the vectorization factor based on the SLP decision. */
1953 vect_update_vf_for_slp (loop_vinfo);
1956 bool saved_can_fully_mask_p = LOOP_VINFO_CAN_FULLY_MASK_P (loop_vinfo);
1958 /* We don't expect to have to roll back to anything other than an empty
1959 set of rgroups. */
1960 gcc_assert (LOOP_VINFO_MASKS (loop_vinfo).is_empty ());
1962 /* This is the point where we can re-start analysis with SLP forced off. */
1963 start_over:
1965 /* Now the vectorization factor is final. */
1966 poly_uint64 vectorization_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
1967 gcc_assert (known_ne (vectorization_factor, 0U));
1969 if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo) && dump_enabled_p ())
1971 dump_printf_loc (MSG_NOTE, vect_location,
1972 "vectorization_factor = ");
1973 dump_dec (MSG_NOTE, vectorization_factor);
1974 dump_printf (MSG_NOTE, ", niters = " HOST_WIDE_INT_PRINT_DEC "\n",
1975 LOOP_VINFO_INT_NITERS (loop_vinfo));
1978 HOST_WIDE_INT max_niter
1979 = likely_max_stmt_executions_int (LOOP_VINFO_LOOP (loop_vinfo));
1981 /* Analyze the alignment of the data-refs in the loop.
1982 Fail if a data reference is found that cannot be vectorized. */
1984 ok = vect_analyze_data_refs_alignment (loop_vinfo);
1985 if (!ok)
1987 if (dump_enabled_p ())
1988 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1989 "bad data alignment.\n");
1990 return false;
1993 /* Prune the list of ddrs to be tested at run-time by versioning for alias.
1994 It is important to call pruning after vect_analyze_data_ref_accesses,
1995 since we use grouping information gathered by interleaving analysis. */
1996 ok = vect_prune_runtime_alias_test_list (loop_vinfo);
1997 if (!ok)
1998 return false;
2000 /* Do not invoke vect_enhance_data_refs_alignment for epilogue
2001 vectorization. */
2002 if (!LOOP_VINFO_EPILOGUE_P (loop_vinfo))
2004 /* This pass will decide on using loop versioning and/or loop peeling in
2005 order to enhance the alignment of data references in the loop. */
2006 ok = vect_enhance_data_refs_alignment (loop_vinfo);
2007 if (!ok)
2009 if (dump_enabled_p ())
2010 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2011 "bad data alignment.\n");
2012 return false;
2016 if (slp)
2018 /* Analyze operations in the SLP instances. Note this may
2019 remove unsupported SLP instances which makes the above
2020 SLP kind detection invalid. */
2021 unsigned old_size = LOOP_VINFO_SLP_INSTANCES (loop_vinfo).length ();
2022 vect_slp_analyze_operations (loop_vinfo);
2023 if (LOOP_VINFO_SLP_INSTANCES (loop_vinfo).length () != old_size)
2024 goto again;
2027 /* Scan all the remaining operations in the loop that are not subject
2028 to SLP and make sure they are vectorizable. */
2029 ok = vect_analyze_loop_operations (loop_vinfo);
2030 if (!ok)
2032 if (dump_enabled_p ())
2033 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2034 "bad operation or unsupported loop bound.\n");
2035 return false;
2038 /* Decide whether to use a fully-masked loop for this vectorization
2039 factor. */
2040 LOOP_VINFO_FULLY_MASKED_P (loop_vinfo)
2041 = (LOOP_VINFO_CAN_FULLY_MASK_P (loop_vinfo)
2042 && vect_verify_full_masking (loop_vinfo));
2043 if (dump_enabled_p ())
2045 if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
2046 dump_printf_loc (MSG_NOTE, vect_location,
2047 "using a fully-masked loop.\n");
2048 else
2049 dump_printf_loc (MSG_NOTE, vect_location,
2050 "not using a fully-masked loop.\n");
2053 /* If epilog loop is required because of data accesses with gaps,
2054 one additional iteration needs to be peeled. Check if there are
2055 enough iterations for vectorization. */
2056 if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo)
2057 && LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
2058 && !LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
2060 poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
2061 tree scalar_niters = LOOP_VINFO_NITERSM1 (loop_vinfo);
2063 if (known_lt (wi::to_widest (scalar_niters), vf))
2065 if (dump_enabled_p ())
2066 dump_printf_loc (MSG_NOTE, vect_location,
2067 "loop has no enough iterations to support"
2068 " peeling for gaps.\n");
2069 return false;
2073 /* Check the costings of the loop make vectorizing worthwhile. */
2074 res = vect_analyze_loop_costing (loop_vinfo);
2075 if (res < 0)
2076 goto again;
2077 if (!res)
2079 if (dump_enabled_p ())
2080 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2081 "Loop costings not worthwhile.\n");
2082 return false;
2085 /* Decide whether we need to create an epilogue loop to handle
2086 remaining scalar iterations. */
2087 th = LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo);
2089 unsigned HOST_WIDE_INT const_vf;
2090 if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
2091 /* The main loop handles all iterations. */
2092 LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo) = false;
2093 else if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
2094 && LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) > 0)
2096 if (!multiple_p (LOOP_VINFO_INT_NITERS (loop_vinfo)
2097 - LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo),
2098 LOOP_VINFO_VECT_FACTOR (loop_vinfo)))
2099 LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo) = true;
2101 else if (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo)
2102 || !LOOP_VINFO_VECT_FACTOR (loop_vinfo).is_constant (&const_vf)
2103 || ((tree_ctz (LOOP_VINFO_NITERS (loop_vinfo))
2104 < (unsigned) exact_log2 (const_vf))
2105 /* In case of versioning, check if the maximum number of
2106 iterations is greater than th. If they are identical,
2107 the epilogue is unnecessary. */
2108 && (!LOOP_REQUIRES_VERSIONING (loop_vinfo)
2109 || ((unsigned HOST_WIDE_INT) max_niter
2110 > (th / const_vf) * const_vf))))
2111 LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo) = true;
2113 /* If an epilogue loop is required make sure we can create one. */
2114 if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo)
2115 || LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo))
2117 if (dump_enabled_p ())
2118 dump_printf_loc (MSG_NOTE, vect_location, "epilog loop required\n");
2119 if (!vect_can_advance_ivs_p (loop_vinfo)
2120 || !slpeel_can_duplicate_loop_p (LOOP_VINFO_LOOP (loop_vinfo),
2121 single_exit (LOOP_VINFO_LOOP
2122 (loop_vinfo))))
2124 if (dump_enabled_p ())
2125 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2126 "not vectorized: can't create required "
2127 "epilog loop\n");
2128 goto again;
2132 /* During peeling, we need to check if the number of loop iterations is
2133 enough for both peeled prolog loop and vector loop. This check
2134 can be merged along with threshold check of loop versioning, so
2135 increase threshold for this case if necessary. */
2136 if (LOOP_REQUIRES_VERSIONING (loop_vinfo))
2138 poly_uint64 niters_th = 0;
2140 if (!vect_use_loop_mask_for_alignment_p (loop_vinfo))
2142 /* Niters for peeled prolog loop. */
2143 if (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) < 0)
2145 struct data_reference *dr = LOOP_VINFO_UNALIGNED_DR (loop_vinfo);
2146 tree vectype = STMT_VINFO_VECTYPE (vect_dr_stmt (dr));
2147 niters_th += TYPE_VECTOR_SUBPARTS (vectype) - 1;
2149 else
2150 niters_th += LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo);
2153 /* Niters for at least one iteration of vectorized loop. */
2154 if (!LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
2155 niters_th += LOOP_VINFO_VECT_FACTOR (loop_vinfo);
2156 /* One additional iteration because of peeling for gap. */
2157 if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo))
2158 niters_th += 1;
2159 LOOP_VINFO_VERSIONING_THRESHOLD (loop_vinfo) = niters_th;
2162 gcc_assert (known_eq (vectorization_factor,
2163 LOOP_VINFO_VECT_FACTOR (loop_vinfo)));
2165 /* Ok to vectorize! */
2166 return true;
2168 again:
2169 /* Try again with SLP forced off but if we didn't do any SLP there is
2170 no point in re-trying. */
2171 if (!slp)
2172 return false;
2174 /* If there are reduction chains re-trying will fail anyway. */
2175 if (! LOOP_VINFO_REDUCTION_CHAINS (loop_vinfo).is_empty ())
2176 return false;
2178 /* Likewise if the grouped loads or stores in the SLP cannot be handled
2179 via interleaving or lane instructions. */
2180 slp_instance instance;
2181 slp_tree node;
2182 unsigned i, j;
2183 FOR_EACH_VEC_ELT (LOOP_VINFO_SLP_INSTANCES (loop_vinfo), i, instance)
2185 stmt_vec_info vinfo;
2186 vinfo = SLP_TREE_SCALAR_STMTS (SLP_INSTANCE_TREE (instance))[0];
2187 if (! STMT_VINFO_GROUPED_ACCESS (vinfo))
2188 continue;
2189 vinfo = DR_GROUP_FIRST_ELEMENT (vinfo);
2190 unsigned int size = DR_GROUP_SIZE (vinfo);
2191 tree vectype = STMT_VINFO_VECTYPE (vinfo);
2192 if (! vect_store_lanes_supported (vectype, size, false)
2193 && ! known_eq (TYPE_VECTOR_SUBPARTS (vectype), 1U)
2194 && ! vect_grouped_store_supported (vectype, size))
2195 return false;
2196 FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (instance), j, node)
2198 vinfo = SLP_TREE_SCALAR_STMTS (node)[0];
2199 vinfo = DR_GROUP_FIRST_ELEMENT (vinfo);
2200 bool single_element_p = !DR_GROUP_NEXT_ELEMENT (vinfo);
2201 size = DR_GROUP_SIZE (vinfo);
2202 vectype = STMT_VINFO_VECTYPE (vinfo);
2203 if (! vect_load_lanes_supported (vectype, size, false)
2204 && ! vect_grouped_load_supported (vectype, single_element_p,
2205 size))
2206 return false;
2210 if (dump_enabled_p ())
2211 dump_printf_loc (MSG_NOTE, vect_location,
2212 "re-trying with SLP disabled\n");
2214 /* Roll back state appropriately. No SLP this time. */
2215 slp = false;
2217 /* Restore vectorization factor as it was without SLP. */
2217 LOOP_VINFO_VECT_FACTOR (loop_vinfo) = saved_vectorization_factor;
2218 /* Free the SLP instances. */
2219 FOR_EACH_VEC_ELT (LOOP_VINFO_SLP_INSTANCES (loop_vinfo), j, instance)
2220 vect_free_slp_instance (instance, false);
2221 LOOP_VINFO_SLP_INSTANCES (loop_vinfo).release ();
2222 /* Reset SLP type to loop_vect on all stmts. */
2223 for (i = 0; i < LOOP_VINFO_LOOP (loop_vinfo)->num_nodes; ++i)
2225 basic_block bb = LOOP_VINFO_BBS (loop_vinfo)[i];
2226 for (gimple_stmt_iterator si = gsi_start_phis (bb);
2227 !gsi_end_p (si); gsi_next (&si))
2229 stmt_vec_info stmt_info = loop_vinfo->lookup_stmt (gsi_stmt (si));
2230 STMT_SLP_TYPE (stmt_info) = loop_vect;
2232 for (gimple_stmt_iterator si = gsi_start_bb (bb);
2233 !gsi_end_p (si); gsi_next (&si))
2235 stmt_vec_info stmt_info = loop_vinfo->lookup_stmt (gsi_stmt (si));
2236 STMT_SLP_TYPE (stmt_info) = loop_vect;
2237 if (STMT_VINFO_IN_PATTERN_P (stmt_info))
2239 gimple *pattern_def_seq = STMT_VINFO_PATTERN_DEF_SEQ (stmt_info);
2240 stmt_info = STMT_VINFO_RELATED_STMT (stmt_info);
2241 STMT_SLP_TYPE (stmt_info) = loop_vect;
2242 for (gimple_stmt_iterator pi = gsi_start (pattern_def_seq);
2243 !gsi_end_p (pi); gsi_next (&pi))
2244 STMT_SLP_TYPE (loop_vinfo->lookup_stmt (gsi_stmt (pi)))
2245 = loop_vect;
2249 /* Free optimized alias test DDRS. */
2250 LOOP_VINFO_LOWER_BOUNDS (loop_vinfo).truncate (0);
2251 LOOP_VINFO_COMP_ALIAS_DDRS (loop_vinfo).release ();
2252 LOOP_VINFO_CHECK_UNEQUAL_ADDRS (loop_vinfo).release ();
2253 /* Reset target cost data. */
2254 destroy_cost_data (LOOP_VINFO_TARGET_COST_DATA (loop_vinfo));
2255 LOOP_VINFO_TARGET_COST_DATA (loop_vinfo)
2256 = init_cost (LOOP_VINFO_LOOP (loop_vinfo));
2257 /* Reset accumulated rgroup information. */
2258 release_vec_loop_masks (&LOOP_VINFO_MASKS (loop_vinfo));
2259 /* Reset assorted flags. */
2260 LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo) = false;
2261 LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo) = false;
2262 LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo) = 0;
2263 LOOP_VINFO_VERSIONING_THRESHOLD (loop_vinfo) = 0;
2264 LOOP_VINFO_CAN_FULLY_MASK_P (loop_vinfo) = saved_can_fully_mask_p;
2266 goto start_over;
2269 /* Function vect_analyze_loop.
2271 Apply a set of analyses on LOOP, and create a loop_vec_info struct
2272 for it. The different analyses will record information in the
2273 loop_vec_info struct. If ORIG_LOOP_VINFO is not NULL, the epilogue must
2274 be vectorized. */
2275 loop_vec_info
2276 vect_analyze_loop (struct loop *loop, loop_vec_info orig_loop_vinfo,
2277 vec_info_shared *shared)
2279 loop_vec_info loop_vinfo;
2280 auto_vector_sizes vector_sizes;
2282 /* Autodetect first vector size we try. */
2283 current_vector_size = 0;
2284 targetm.vectorize.autovectorize_vector_sizes (&vector_sizes);
2285 unsigned int next_size = 0;
2287 DUMP_VECT_SCOPE ("analyze_loop_nest");
2289 if (loop_outer (loop)
2290 && loop_vec_info_for_loop (loop_outer (loop))
2291 && LOOP_VINFO_VECTORIZABLE_P (loop_vec_info_for_loop (loop_outer (loop))))
2293 if (dump_enabled_p ())
2294 dump_printf_loc (MSG_NOTE, vect_location,
2295 "outer-loop already vectorized.\n");
2296 return NULL;
2299 if (!find_loop_nest (loop, &shared->loop_nest))
2301 if (dump_enabled_p ())
2302 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2303 "not vectorized: loop nest containing two "
2304 "or more consecutive inner loops cannot be "
2305 "vectorized\n");
2306 return NULL;
2309 unsigned n_stmts = 0;
2310 poly_uint64 autodetected_vector_size = 0;
2311 while (1)
2313 /* Check the CFG characteristics of the loop (nesting, entry/exit). */
2314 loop_vinfo = vect_analyze_loop_form (loop, shared);
2315 if (!loop_vinfo)
2317 if (dump_enabled_p ())
2318 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2319 "bad loop form.\n");
2320 return NULL;
2323 bool fatal = false;
2325 if (orig_loop_vinfo)
2326 LOOP_VINFO_ORIG_LOOP_INFO (loop_vinfo) = orig_loop_vinfo;
2328 if (vect_analyze_loop_2 (loop_vinfo, fatal, &n_stmts))
2330 LOOP_VINFO_VECTORIZABLE_P (loop_vinfo) = 1;
2332 return loop_vinfo;
2335 delete loop_vinfo;
2337 if (next_size == 0)
2338 autodetected_vector_size = current_vector_size;
2340 if (next_size < vector_sizes.length ()
2341 && known_eq (vector_sizes[next_size], autodetected_vector_size))
2342 next_size += 1;
2344 if (fatal
2345 || next_size == vector_sizes.length ()
2346 || known_eq (current_vector_size, 0U))
2347 return NULL;
2349 /* Try the next biggest vector size. */
2350 current_vector_size = vector_sizes[next_size++];
2351 if (dump_enabled_p ())
2353 dump_printf_loc (MSG_NOTE, vect_location,
2354 "***** Re-trying analysis with "
2355 "vector size ");
2356 dump_dec (MSG_NOTE, current_vector_size);
2357 dump_printf (MSG_NOTE, "\n");
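/* Editor's note: the while (1) loop above re-runs the whole analysis once
   per candidate vector size returned by
   targetm.vectorize.autovectorize_vector_sizes.  It returns as soon as one
   size succeeds, and gives up when the failure was fatal, when all
   candidate sizes have been tried, or when no vector size was ever chosen
   (current_vector_size stayed 0).  As a purely illustrative example, a
   target supporting 512-, 256- and 128-bit vectors might report the sizes
   64, 32 and 16 bytes, tried in that order.  */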
2362 /* Return true if there is an in-order reduction function for CODE, storing
2363 it in *REDUC_FN if so. */
2365 static bool
2366 fold_left_reduction_fn (tree_code code, internal_fn *reduc_fn)
2368 switch (code)
2370 case PLUS_EXPR:
2371 *reduc_fn = IFN_FOLD_LEFT_PLUS;
2372 return true;
2374 default:
2375 return false;
2379 /* Function reduction_fn_for_scalar_code
2381 Input:
2382 CODE - tree_code of a reduction operation.
2384 Output:
2385 REDUC_FN - the corresponding internal function to be used to reduce the
2386 vector of partial results into a single scalar result, or IFN_LAST
2387 if the operation is a supported reduction operation, but does not have
2388 such an internal function.
2390 Return FALSE if CODE currently cannot be vectorized as a reduction. */
2392 static bool
2393 reduction_fn_for_scalar_code (enum tree_code code, internal_fn *reduc_fn)
2395 switch (code)
2397 case MAX_EXPR:
2398 *reduc_fn = IFN_REDUC_MAX;
2399 return true;
2401 case MIN_EXPR:
2402 *reduc_fn = IFN_REDUC_MIN;
2403 return true;
2405 case PLUS_EXPR:
2406 *reduc_fn = IFN_REDUC_PLUS;
2407 return true;
2409 case BIT_AND_EXPR:
2410 *reduc_fn = IFN_REDUC_AND;
2411 return true;
2413 case BIT_IOR_EXPR:
2414 *reduc_fn = IFN_REDUC_IOR;
2415 return true;
2417 case BIT_XOR_EXPR:
2418 *reduc_fn = IFN_REDUC_XOR;
2419 return true;
2421 case MULT_EXPR:
2422 case MINUS_EXPR:
2423 *reduc_fn = IFN_LAST;
2424 return true;
2426 default:
2427 return false;
2431 /* If there is a neutral value X such that SLP reduction NODE would not
2432 be affected by the introduction of additional X elements, return that X,
2433 otherwise return null. CODE is the code of the reduction. REDUC_CHAIN
2434 is true if the SLP statements perform a single reduction, false if each
2435 statement performs an independent reduction. */
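/* Editor's note: "neutral" here means an identity element for CODE, so the
   vectorizer may pad a vector with extra copies of it without changing the
   reduction result.  For instance (assuming a PLUS_EXPR reduction over int)
   widening {a, b, c} to {a, b, c, 0} leaves the sum unchanged, just as
   padding with 1 would for MULT_EXPR or with all-ones for BIT_AND_EXPR.  */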
2437 static tree
2438 neutral_op_for_slp_reduction (slp_tree slp_node, tree_code code,
2439 bool reduc_chain)
2441 vec<stmt_vec_info> stmts = SLP_TREE_SCALAR_STMTS (slp_node);
2442 stmt_vec_info stmt_vinfo = stmts[0];
2443 tree vector_type = STMT_VINFO_VECTYPE (stmt_vinfo);
2444 tree scalar_type = TREE_TYPE (vector_type);
2445 struct loop *loop = gimple_bb (stmt_vinfo->stmt)->loop_father;
2446 gcc_assert (loop);
2448 switch (code)
2450 case WIDEN_SUM_EXPR:
2451 case DOT_PROD_EXPR:
2452 case SAD_EXPR:
2453 case PLUS_EXPR:
2454 case MINUS_EXPR:
2455 case BIT_IOR_EXPR:
2456 case BIT_XOR_EXPR:
2457 return build_zero_cst (scalar_type);
2459 case MULT_EXPR:
2460 return build_one_cst (scalar_type);
2462 case BIT_AND_EXPR:
2463 return build_all_ones_cst (scalar_type);
2465 case MAX_EXPR:
2466 case MIN_EXPR:
2467 /* For MIN/MAX the initial values are neutral. A reduction chain
2468 has only a single initial value, so that value is neutral for
2469 all statements. */
2470 if (reduc_chain)
2471 return PHI_ARG_DEF_FROM_EDGE (stmt_vinfo->stmt,
2472 loop_preheader_edge (loop));
2473 return NULL_TREE;
2475 default:
2476 return NULL_TREE;
2480 /* Error reporting helper for vect_is_simple_reduction below. GIMPLE statement
2481 STMT is printed with a message MSG. */
2483 static void
2484 report_vect_op (dump_flags_t msg_type, gimple *stmt, const char *msg)
2486 dump_printf_loc (msg_type, vect_location, "%s", msg);
2487 dump_gimple_stmt (msg_type, TDF_SLIM, stmt, 0);
2490 /* DEF_STMT_INFO occurs in a loop that contains a potential reduction
2491 operation. Return true if the results of DEF_STMT_INFO are something
2492 that can be accumulated by such a reduction. */
2494 static bool
2495 vect_valid_reduction_input_p (stmt_vec_info def_stmt_info)
2497 return (is_gimple_assign (def_stmt_info->stmt)
2498 || is_gimple_call (def_stmt_info->stmt)
2499 || STMT_VINFO_DEF_TYPE (def_stmt_info) == vect_induction_def
2500 || (gimple_code (def_stmt_info->stmt) == GIMPLE_PHI
2501 && STMT_VINFO_DEF_TYPE (def_stmt_info) == vect_internal_def
2502 && !is_loop_header_bb_p (gimple_bb (def_stmt_info->stmt))));
2505 /* Detect SLP reduction of the form:
2507 #a1 = phi <a5, a0>
2508 a2 = operation (a1)
2509 a3 = operation (a2)
2510 a4 = operation (a3)
2511 a5 = operation (a4)
2513 #a = phi <a5>
2515 PHI is the reduction phi node (#a1 = phi <a5, a0> above)
2516 FIRST_STMT is the first reduction stmt in the chain
2517 (a2 = operation (a1)).
2519 Return TRUE if a reduction chain was detected. */
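/* Editor's note: an illustrative source loop that typically produces such a
   chain (the exact shape is an assumption for the example, not a
   requirement):

     for (i = 0; i < n; i++)
       {
         sum = sum + a[2*i];
         sum = sum + a[2*i + 1];
       }

   Each statement feeds the next through SUM and the last one feeds the
   loop PHI, matching the a1 -> a2 -> ... -> a5 pattern documented above.  */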
2521 static bool
2522 vect_is_slp_reduction (loop_vec_info loop_info, gimple *phi,
2523 gimple *first_stmt)
2525 struct loop *loop = (gimple_bb (phi))->loop_father;
2526 struct loop *vect_loop = LOOP_VINFO_LOOP (loop_info);
2527 enum tree_code code;
2528 gimple *loop_use_stmt = NULL;
2529 stmt_vec_info use_stmt_info, current_stmt_info = NULL;
2530 tree lhs;
2531 imm_use_iterator imm_iter;
2532 use_operand_p use_p;
2533 int nloop_uses, size = 0, n_out_of_loop_uses;
2534 bool found = false;
2536 if (loop != vect_loop)
2537 return false;
2539 lhs = PHI_RESULT (phi);
2540 code = gimple_assign_rhs_code (first_stmt);
2541 while (1)
2543 nloop_uses = 0;
2544 n_out_of_loop_uses = 0;
2545 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, lhs)
2547 gimple *use_stmt = USE_STMT (use_p);
2548 if (is_gimple_debug (use_stmt))
2549 continue;
2551 /* Check if we got back to the reduction phi. */
2552 if (use_stmt == phi)
2554 loop_use_stmt = use_stmt;
2555 found = true;
2556 break;
2559 if (flow_bb_inside_loop_p (loop, gimple_bb (use_stmt)))
2561 loop_use_stmt = use_stmt;
2562 nloop_uses++;
2564 else
2565 n_out_of_loop_uses++;
2567 /* There can be either a single use in the loop or two uses in
2568 phi nodes. */
2569 if (nloop_uses > 1 || (n_out_of_loop_uses && nloop_uses))
2570 return false;
2573 if (found)
2574 break;
2576 /* We reached a statement with no loop uses. */
2577 if (nloop_uses == 0)
2578 return false;
2580 /* This is a loop exit phi, and we haven't reached the reduction phi. */
2581 if (gimple_code (loop_use_stmt) == GIMPLE_PHI)
2582 return false;
2584 if (!is_gimple_assign (loop_use_stmt)
2585 || code != gimple_assign_rhs_code (loop_use_stmt)
2586 || !flow_bb_inside_loop_p (loop, gimple_bb (loop_use_stmt)))
2587 return false;
2589 /* Insert USE_STMT into reduction chain. */
2590 use_stmt_info = loop_info->lookup_stmt (loop_use_stmt);
2591 if (current_stmt_info)
2593 REDUC_GROUP_NEXT_ELEMENT (current_stmt_info) = use_stmt_info;
2594 REDUC_GROUP_FIRST_ELEMENT (use_stmt_info)
2595 = REDUC_GROUP_FIRST_ELEMENT (current_stmt_info);
2597 else
2598 REDUC_GROUP_FIRST_ELEMENT (use_stmt_info) = use_stmt_info;
2600 lhs = gimple_assign_lhs (loop_use_stmt);
2601 current_stmt_info = use_stmt_info;
2602 size++;
2605 if (!found || loop_use_stmt != phi || size < 2)
2606 return false;
2608 /* Swap the operands, if needed, to make the reduction operand be the second
2609 operand. */
2610 lhs = PHI_RESULT (phi);
2611 stmt_vec_info next_stmt_info = REDUC_GROUP_FIRST_ELEMENT (current_stmt_info);
2612 while (next_stmt_info)
2614 gassign *next_stmt = as_a <gassign *> (next_stmt_info->stmt);
2615 if (gimple_assign_rhs2 (next_stmt) == lhs)
2617 tree op = gimple_assign_rhs1 (next_stmt);
2618 stmt_vec_info def_stmt_info = loop_info->lookup_def (op);
2620 /* Check that the other def is either defined in the loop
2621 ("vect_internal_def"), or it's an induction (defined by a
2622 loop-header phi-node). */
2623 if (def_stmt_info
2624 && flow_bb_inside_loop_p (loop, gimple_bb (def_stmt_info->stmt))
2625 && vect_valid_reduction_input_p (def_stmt_info))
2627 lhs = gimple_assign_lhs (next_stmt);
2628 next_stmt_info = REDUC_GROUP_NEXT_ELEMENT (next_stmt_info);
2629 continue;
2632 return false;
2634 else
2636 tree op = gimple_assign_rhs2 (next_stmt);
2637 stmt_vec_info def_stmt_info = loop_info->lookup_def (op);
2639 /* Check that the other def is either defined in the loop
2640 ("vect_internal_def"), or it's an induction (defined by a
2641 loop-header phi-node). */
2642 if (def_stmt_info
2643 && flow_bb_inside_loop_p (loop, gimple_bb (def_stmt_info->stmt))
2644 && vect_valid_reduction_input_p (def_stmt_info))
2646 if (dump_enabled_p ())
2648 dump_printf_loc (MSG_NOTE, vect_location, "swapping oprnds: ");
2649 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, next_stmt, 0);
2652 swap_ssa_operands (next_stmt,
2653 gimple_assign_rhs1_ptr (next_stmt),
2654 gimple_assign_rhs2_ptr (next_stmt));
2655 update_stmt (next_stmt);
2657 if (CONSTANT_CLASS_P (gimple_assign_rhs1 (next_stmt)))
2658 LOOP_VINFO_OPERANDS_SWAPPED (loop_info) = true;
2660 else
2661 return false;
2664 lhs = gimple_assign_lhs (next_stmt);
2665 next_stmt_info = REDUC_GROUP_NEXT_ELEMENT (next_stmt_info);
2668 /* Save the chain for further analysis in SLP detection. */
2669 stmt_vec_info first_stmt_info
2670 = REDUC_GROUP_FIRST_ELEMENT (current_stmt_info);
2671 LOOP_VINFO_REDUCTION_CHAINS (loop_info).safe_push (first_stmt_info);
2672 REDUC_GROUP_SIZE (first_stmt_info) = size;
2674 return true;
2677 /* Return true if we need an in-order reduction for operation CODE
2678 on type TYPE. NEED_WRAPPING_INTEGRAL_OVERFLOW is true if integer
2679 overflow must wrap. */
2681 static bool
2682 needs_fold_left_reduction_p (tree type, tree_code code,
2683 bool need_wrapping_integral_overflow)
2685 /* CHECKME: check for !flag_finite_math_only too? */
2686 if (SCALAR_FLOAT_TYPE_P (type))
2687 switch (code)
2689 case MIN_EXPR:
2690 case MAX_EXPR:
2691 return false;
2693 default:
2694 return !flag_associative_math;
2697 if (INTEGRAL_TYPE_P (type))
2699 if (!operation_no_trapping_overflow (type, code))
2700 return true;
2701 if (need_wrapping_integral_overflow
2702 && !TYPE_OVERFLOW_WRAPS (type)
2703 && operation_can_overflow (code))
2704 return true;
2705 return false;
2708 if (SAT_FIXED_POINT_TYPE_P (type))
2709 return true;
2711 return false;
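/* Editor's note: a small illustration of why the floating-point case above
   defaults to an in-order reduction.  IEEE addition is not associative; in
   double precision (0.1 + 0.2) + 0.3 and 0.1 + (0.2 + 0.3) round to two
   different values, so summing the lanes of a vector in a tree order can
   change the final result unless -fassociative-math (or -ffast-math)
   permits the reassociation.  Similarly, the integral cases request an
   in-order reduction when the operation could trap on overflow, or when
   wrapping semantics are required but the type does not guarantee them.  */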
2714 /* Return true if the reduction PHI in LOOP with latch arg LOOP_ARG and
2715 reduction operation CODE has a handled computation expression. */
2717 bool
2718 check_reduction_path (dump_user_location_t loc, loop_p loop, gphi *phi,
2719 tree loop_arg, enum tree_code code)
2721 auto_vec<std::pair<ssa_op_iter, use_operand_p> > path;
2722 auto_bitmap visited;
2723 tree lookfor = PHI_RESULT (phi);
2724 ssa_op_iter curri;
2725 use_operand_p curr = op_iter_init_phiuse (&curri, phi, SSA_OP_USE);
2726 while (USE_FROM_PTR (curr) != loop_arg)
2727 curr = op_iter_next_use (&curri);
2728 curri.i = curri.numops;
2731 path.safe_push (std::make_pair (curri, curr));
2732 tree use = USE_FROM_PTR (curr);
2733 if (use == lookfor)
2734 break;
2735 gimple *def = SSA_NAME_DEF_STMT (use);
2736 if (gimple_nop_p (def)
2737 || ! flow_bb_inside_loop_p (loop, gimple_bb (def)))
2739 pop:
2742 std::pair<ssa_op_iter, use_operand_p> x = path.pop ();
2743 curri = x.first;
2744 curr = x.second;
2746 curr = op_iter_next_use (&curri);
2747 /* Skip already visited or non-SSA operands (from iterating
2748 over PHI args). */
2749 while (curr != NULL_USE_OPERAND_P
2750 && (TREE_CODE (USE_FROM_PTR (curr)) != SSA_NAME
2751 || ! bitmap_set_bit (visited,
2752 SSA_NAME_VERSION
2753 (USE_FROM_PTR (curr)))));
2755 while (curr == NULL_USE_OPERAND_P && ! path.is_empty ());
2756 if (curr == NULL_USE_OPERAND_P)
2757 break;
2759 else
2761 if (gimple_code (def) == GIMPLE_PHI)
2762 curr = op_iter_init_phiuse (&curri, as_a <gphi *>(def), SSA_OP_USE);
2763 else
2764 curr = op_iter_init_use (&curri, def, SSA_OP_USE);
2765 while (curr != NULL_USE_OPERAND_P
2766 && (TREE_CODE (USE_FROM_PTR (curr)) != SSA_NAME
2767 || ! bitmap_set_bit (visited,
2768 SSA_NAME_VERSION
2769 (USE_FROM_PTR (curr)))))
2770 curr = op_iter_next_use (&curri);
2771 if (curr == NULL_USE_OPERAND_P)
2772 goto pop;
2775 while (1);
2776 if (dump_file && (dump_flags & TDF_DETAILS))
2778 dump_printf_loc (MSG_NOTE, loc, "reduction path: ");
2779 unsigned i;
2780 std::pair<ssa_op_iter, use_operand_p> *x;
2781 FOR_EACH_VEC_ELT (path, i, x)
2783 dump_generic_expr (MSG_NOTE, TDF_SLIM, USE_FROM_PTR (x->second));
2784 dump_printf (MSG_NOTE, " ");
2786 dump_printf (MSG_NOTE, "\n");
2789 /* Check whether the reduction path detected is valid. */
2790 bool fail = path.length () == 0;
2791 bool neg = false;
2792 for (unsigned i = 1; i < path.length (); ++i)
2794 gimple *use_stmt = USE_STMT (path[i].second);
2795 tree op = USE_FROM_PTR (path[i].second);
2796 if (! has_single_use (op)
2797 || ! is_gimple_assign (use_stmt))
2799 fail = true;
2800 break;
2802 if (gimple_assign_rhs_code (use_stmt) != code)
2804 if (code == PLUS_EXPR
2805 && gimple_assign_rhs_code (use_stmt) == MINUS_EXPR)
2807 /* Track whether we negate the reduction value each iteration. */
2808 if (gimple_assign_rhs2 (use_stmt) == op)
2809 neg = ! neg;
2811 else
2813 fail = true;
2814 break;
2818 return ! fail && ! neg;
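/* Editor's note: a sketch of what the walk above records, for an
   illustrative loop body of the form

     x1 = phi <x0, x3>
     x2 = x1 + a[i];
     x3 = x2 - b[i];

   with LOOP_ARG = x3 and CODE = PLUS_EXPR.  The uses pushed onto PATH are
   x3 (in the PHI), x2 (in the definition of x3) and finally x1, which is
   LOOKFOR, so the search stops.  The validation loop then requires each
   recorded step to be a single-use assignment with code CODE; a MINUS_EXPR
   step is tolerated for a PLUS_EXPR reduction, with NEG tracking whether
   the accumulated value ends up negated overall (in which case the path is
   rejected).  */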
2822 /* Function vect_is_simple_reduction
2824 (1) Detect a cross-iteration def-use cycle that represents a simple
2825 reduction computation. We look for the following pattern:
2827 loop_header:
2828 a1 = phi < a0, a2 >
2829 a3 = ...
2830 a2 = operation (a3, a1)
2834 a3 = ...
2835 loop_header:
2836 a1 = phi < a0, a2 >
2837 a2 = operation (a3, a1)
2839 such that:
2840 1. operation is commutative and associative and it is safe to
2841 change the order of the computation
2842 2. no uses for a2 in the loop (a2 is used out of the loop)
2843 3. no uses of a1 in the loop besides the reduction operation
2844 4. no uses of a1 outside the loop.
2846 Conditions 1,4 are tested here.
2847 Conditions 2,3 are tested in vect_mark_stmts_to_be_vectorized.
2849 (2) Detect a cross-iteration def-use cycle in nested loops, i.e.,
2850 nested cycles.
2852 (3) Detect cycles of phi nodes in outer-loop vectorization, i.e., double
2853 reductions:
2855 a1 = phi < a0, a2 >
2856 inner loop (def of a3)
2857 a2 = phi < a3 >
2859 (4) Detect condition expressions, i.e.:
2860 for (int i = 0; i < N; i++)
2861 if (a[i] < val)
2862 ret_val = a[i];
2866 static stmt_vec_info
2867 vect_is_simple_reduction (loop_vec_info loop_info, stmt_vec_info phi_info,
2868 bool *double_reduc,
2869 bool need_wrapping_integral_overflow,
2870 enum vect_reduction_type *v_reduc_type)
2872 gphi *phi = as_a <gphi *> (phi_info->stmt);
2873 struct loop *loop = (gimple_bb (phi))->loop_father;
2874 struct loop *vect_loop = LOOP_VINFO_LOOP (loop_info);
2875 gimple *phi_use_stmt = NULL;
2876 enum tree_code orig_code, code;
2877 tree op1, op2, op3 = NULL_TREE, op4 = NULL_TREE;
2878 tree type;
2879 int nloop_uses;
2880 tree name;
2881 imm_use_iterator imm_iter;
2882 use_operand_p use_p;
2883 bool phi_def;
2885 *double_reduc = false;
2886 *v_reduc_type = TREE_CODE_REDUCTION;
2888 tree phi_name = PHI_RESULT (phi);
2889 /* ??? If there are no uses of the PHI result the inner loop reduction
2890 won't be detected as possibly double-reduction by vectorizable_reduction
2891 because that tries to walk the PHI arg from the preheader edge which
2892 can be constant. See PR60382. */
2893 if (has_zero_uses (phi_name))
2894 return NULL;
2895 nloop_uses = 0;
2896 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, phi_name)
2898 gimple *use_stmt = USE_STMT (use_p);
2899 if (is_gimple_debug (use_stmt))
2900 continue;
2902 if (!flow_bb_inside_loop_p (loop, gimple_bb (use_stmt)))
2904 if (dump_enabled_p ())
2905 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2906 "intermediate value used outside loop.\n");
2908 return NULL;
2911 nloop_uses++;
2912 if (nloop_uses > 1)
2914 if (dump_enabled_p ())
2915 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2916 "reduction value used in loop.\n");
2917 return NULL;
2920 phi_use_stmt = use_stmt;
2923 edge latch_e = loop_latch_edge (loop);
2924 tree loop_arg = PHI_ARG_DEF_FROM_EDGE (phi, latch_e);
2925 if (TREE_CODE (loop_arg) != SSA_NAME)
2927 if (dump_enabled_p ())
2929 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2930 "reduction: not ssa_name: ");
2931 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, loop_arg);
2932 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
2934 return NULL;
2937 stmt_vec_info def_stmt_info = loop_info->lookup_def (loop_arg);
2938 if (!def_stmt_info)
2939 return NULL;
2941 if (gassign *def_stmt = dyn_cast <gassign *> (def_stmt_info->stmt))
2943 name = gimple_assign_lhs (def_stmt);
2944 phi_def = false;
2946 else if (gphi *def_stmt = dyn_cast <gphi *> (def_stmt_info->stmt))
2948 name = PHI_RESULT (def_stmt);
2949 phi_def = true;
2951 else
2953 if (dump_enabled_p ())
2955 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2956 "reduction: unhandled reduction operation: ");
2957 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
2958 def_stmt_info->stmt, 0);
2960 return NULL;
2963 nloop_uses = 0;
2964 auto_vec<gphi *, 3> lcphis;
2965 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, name)
2967 gimple *use_stmt = USE_STMT (use_p);
2968 if (is_gimple_debug (use_stmt))
2969 continue;
2970 if (flow_bb_inside_loop_p (loop, gimple_bb (use_stmt)))
2971 nloop_uses++;
2972 else
2973 /* We can have more than one loop-closed PHI. */
2974 lcphis.safe_push (as_a <gphi *> (use_stmt));
2975 if (nloop_uses > 1)
2977 if (dump_enabled_p ())
2978 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2979 "reduction used in loop.\n");
2980 return NULL;
2984 /* If DEF_STMT is a phi node itself, we expect it to have a single argument
2985 defined in the inner loop. */
2986 if (phi_def)
2988 gphi *def_stmt = as_a <gphi *> (def_stmt_info->stmt);
2989 op1 = PHI_ARG_DEF (def_stmt, 0);
2991 if (gimple_phi_num_args (def_stmt) != 1
2992 || TREE_CODE (op1) != SSA_NAME)
2994 if (dump_enabled_p ())
2995 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2996 "unsupported phi node definition.\n");
2998 return NULL;
3001 gimple *def1 = SSA_NAME_DEF_STMT (op1);
3002 if (gimple_bb (def1)
3003 && flow_bb_inside_loop_p (loop, gimple_bb (def_stmt))
3004 && loop->inner
3005 && flow_bb_inside_loop_p (loop->inner, gimple_bb (def1))
3006 && is_gimple_assign (def1)
3007 && flow_bb_inside_loop_p (loop->inner, gimple_bb (phi_use_stmt)))
3009 if (dump_enabled_p ())
3010 report_vect_op (MSG_NOTE, def_stmt,
3011 "detected double reduction: ");
3013 *double_reduc = true;
3014 return def_stmt_info;
3017 return NULL;
3020 /* If we are vectorizing an inner reduction, we execute it in the
3021 original order only when we are not dealing with a
3022 double reduction. */
3023 bool check_reduction = true;
3024 if (flow_loop_nested_p (vect_loop, loop))
3026 gphi *lcphi;
3027 unsigned i;
3028 check_reduction = false;
3029 FOR_EACH_VEC_ELT (lcphis, i, lcphi)
3030 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, gimple_phi_result (lcphi))
3032 gimple *use_stmt = USE_STMT (use_p);
3033 if (is_gimple_debug (use_stmt))
3034 continue;
3035 if (! flow_bb_inside_loop_p (vect_loop, gimple_bb (use_stmt)))
3036 check_reduction = true;
3040 gassign *def_stmt = as_a <gassign *> (def_stmt_info->stmt);
3041 bool nested_in_vect_loop = flow_loop_nested_p (vect_loop, loop);
3042 code = orig_code = gimple_assign_rhs_code (def_stmt);
3044 /* We can handle "res -= x[i]", which is non-associative, by
3045 simply rewriting this into "res += -x[i]". Avoid changing
3046 gimple instruction for the first simple tests and only do this
3047 if we're allowed to change code at all. */
3048 if (code == MINUS_EXPR && gimple_assign_rhs2 (def_stmt) != phi_name)
3049 code = PLUS_EXPR;
3051 if (code == COND_EXPR)
3053 if (! nested_in_vect_loop)
3054 *v_reduc_type = COND_REDUCTION;
3056 op3 = gimple_assign_rhs1 (def_stmt);
3057 if (COMPARISON_CLASS_P (op3))
3059 op4 = TREE_OPERAND (op3, 1);
3060 op3 = TREE_OPERAND (op3, 0);
3062 if (op3 == phi_name || op4 == phi_name)
3064 if (dump_enabled_p ())
3065 report_vect_op (MSG_MISSED_OPTIMIZATION, def_stmt,
3066 "reduction: condition depends on previous"
3067 " iteration: ");
3068 return NULL;
3071 op1 = gimple_assign_rhs2 (def_stmt);
3072 op2 = gimple_assign_rhs3 (def_stmt);
3074 else if (!commutative_tree_code (code) || !associative_tree_code (code))
3076 if (dump_enabled_p ())
3077 report_vect_op (MSG_MISSED_OPTIMIZATION, def_stmt,
3078 "reduction: not commutative/associative: ");
3079 return NULL;
3081 else if (get_gimple_rhs_class (code) == GIMPLE_BINARY_RHS)
3083 op1 = gimple_assign_rhs1 (def_stmt);
3084 op2 = gimple_assign_rhs2 (def_stmt);
3086 else
3088 if (dump_enabled_p ())
3089 report_vect_op (MSG_MISSED_OPTIMIZATION, def_stmt,
3090 "reduction: not handled operation: ");
3091 return NULL;
3094 if (TREE_CODE (op1) != SSA_NAME && TREE_CODE (op2) != SSA_NAME)
3096 if (dump_enabled_p ())
3097 report_vect_op (MSG_MISSED_OPTIMIZATION, def_stmt,
3098 "reduction: both uses not ssa_names: ");
3100 return NULL;
3103 type = TREE_TYPE (gimple_assign_lhs (def_stmt));
3104 if ((TREE_CODE (op1) == SSA_NAME
3105 && !types_compatible_p (type, TREE_TYPE (op1)))
3106 || (TREE_CODE (op2) == SSA_NAME
3107 && !types_compatible_p (type, TREE_TYPE (op2)))
3108 || (op3 && TREE_CODE (op3) == SSA_NAME
3109 && !types_compatible_p (type, TREE_TYPE (op3)))
3110 || (op4 && TREE_CODE (op4) == SSA_NAME
3111 && !types_compatible_p (type, TREE_TYPE (op4))))
3113 if (dump_enabled_p ())
3115 dump_printf_loc (MSG_NOTE, vect_location,
3116 "reduction: multiple types: operation type: ");
3117 dump_generic_expr (MSG_NOTE, TDF_SLIM, type);
3118 dump_printf (MSG_NOTE, ", operands types: ");
3119 dump_generic_expr (MSG_NOTE, TDF_SLIM,
3120 TREE_TYPE (op1));
3121 dump_printf (MSG_NOTE, ",");
3122 dump_generic_expr (MSG_NOTE, TDF_SLIM,
3123 TREE_TYPE (op2));
3124 if (op3)
3126 dump_printf (MSG_NOTE, ",");
3127 dump_generic_expr (MSG_NOTE, TDF_SLIM,
3128 TREE_TYPE (op3));
3131 if (op4)
3133 dump_printf (MSG_NOTE, ",");
3134 dump_generic_expr (MSG_NOTE, TDF_SLIM,
3135 TREE_TYPE (op4));
3137 dump_printf (MSG_NOTE, "\n");
3140 return NULL;
3143 /* Check whether it's ok to change the order of the computation.
3144 Generally, when vectorizing a reduction we change the order of the
3145 computation. This may change the behavior of the program in some
3146 cases, so we need to check that this is ok. One exception is when
3147 vectorizing an outer-loop: the inner-loop is executed sequentially,
3148 and therefore vectorizing reductions in the inner-loop during
3149 outer-loop vectorization is safe. */
3150 if (check_reduction
3151 && *v_reduc_type == TREE_CODE_REDUCTION
3152 && needs_fold_left_reduction_p (type, code,
3153 need_wrapping_integral_overflow))
3154 *v_reduc_type = FOLD_LEFT_REDUCTION;
3156 /* Reduction is safe. We're dealing with one of the following:
3157 1) integer arithmetic and no trapv
3158 2) floating point arithmetic, and special flags permit this optimization
3159 3) nested cycle (i.e., outer loop vectorization). */
3160 stmt_vec_info def1_info = loop_info->lookup_def (op1);
3161 stmt_vec_info def2_info = loop_info->lookup_def (op2);
3162 if (code != COND_EXPR && !def1_info && !def2_info)
3164 if (dump_enabled_p ())
3165 report_vect_op (MSG_NOTE, def_stmt, "reduction: no defs for operands: ");
3166 return NULL;
3169 /* Check that one def is the reduction def, defined by PHI,
3170 the other def is either defined in the loop ("vect_internal_def"),
3171 or it's an induction (defined by a loop-header phi-node). */
3173 if (def2_info
3174 && def2_info->stmt == phi
3175 && (code == COND_EXPR
3176 || !def1_info
3177 || vect_valid_reduction_input_p (def1_info)))
3179 if (dump_enabled_p ())
3180 report_vect_op (MSG_NOTE, def_stmt, "detected reduction: ");
3181 return def_stmt_info;
3184 if (def1_info
3185 && def1_info->stmt == phi
3186 && (code == COND_EXPR
3187 || !def2_info
3188 || vect_valid_reduction_input_p (def2_info)))
3190 if (! nested_in_vect_loop && orig_code != MINUS_EXPR)
3192 /* Check if we can swap operands (just for simplicity - so that
3193 the rest of the code can assume that the reduction variable
3194 is always the last (second) argument). */
3195 if (code == COND_EXPR)
3197 /* Swap cond_expr by inverting the condition. */
3198 tree cond_expr = gimple_assign_rhs1 (def_stmt);
3199 enum tree_code invert_code = ERROR_MARK;
3200 enum tree_code cond_code = TREE_CODE (cond_expr);
3202 if (TREE_CODE_CLASS (cond_code) == tcc_comparison)
3204 bool honor_nans = HONOR_NANS (TREE_OPERAND (cond_expr, 0));
3205 invert_code = invert_tree_comparison (cond_code, honor_nans);
3207 if (invert_code != ERROR_MARK)
3209 TREE_SET_CODE (cond_expr, invert_code);
3210 swap_ssa_operands (def_stmt,
3211 gimple_assign_rhs2_ptr (def_stmt),
3212 gimple_assign_rhs3_ptr (def_stmt));
3214 else
3216 if (dump_enabled_p ())
3217 report_vect_op (MSG_NOTE, def_stmt,
3218 "detected reduction: cannot swap operands "
3219 "for cond_expr");
3220 return NULL;
3223 else
3224 swap_ssa_operands (def_stmt, gimple_assign_rhs1_ptr (def_stmt),
3225 gimple_assign_rhs2_ptr (def_stmt));
3227 if (dump_enabled_p ())
3228 report_vect_op (MSG_NOTE, def_stmt,
3229 "detected reduction: need to swap operands: ");
3231 if (CONSTANT_CLASS_P (gimple_assign_rhs1 (def_stmt)))
3232 LOOP_VINFO_OPERANDS_SWAPPED (loop_info) = true;
3234 else
3236 if (dump_enabled_p ())
3237 report_vect_op (MSG_NOTE, def_stmt, "detected reduction: ");
3240 return def_stmt_info;
3243 /* Try to find SLP reduction chain. */
3244 if (! nested_in_vect_loop
3245 && code != COND_EXPR
3246 && orig_code != MINUS_EXPR
3247 && vect_is_slp_reduction (loop_info, phi, def_stmt))
3249 if (dump_enabled_p ())
3250 report_vect_op (MSG_NOTE, def_stmt,
3251 "reduction: detected reduction chain: ");
3253 return def_stmt_info;
3256 /* Dissolve a group possibly left half-built by vect_is_slp_reduction. */
3257 stmt_vec_info first = REDUC_GROUP_FIRST_ELEMENT (def_stmt_info);
3258 while (first)
3260 stmt_vec_info next = REDUC_GROUP_NEXT_ELEMENT (first);
3261 REDUC_GROUP_FIRST_ELEMENT (first) = NULL;
3262 REDUC_GROUP_NEXT_ELEMENT (first) = NULL;
3263 first = next;
3266 /* Look for the expression computing loop_arg from loop PHI result. */
3267 if (check_reduction_path (vect_location, loop, phi, loop_arg, code))
3268 return def_stmt_info;
3270 if (dump_enabled_p ())
3272 report_vect_op (MSG_MISSED_OPTIMIZATION, def_stmt,
3273 "reduction: unknown pattern: ");
3276 return NULL;
3279 /* Wrapper around vect_is_simple_reduction, which will modify code
3280 in-place if it enables detection of more reductions. Arguments
3281 as there. */
3283 stmt_vec_info
3284 vect_force_simple_reduction (loop_vec_info loop_info, stmt_vec_info phi_info,
3285 bool *double_reduc,
3286 bool need_wrapping_integral_overflow)
3288 enum vect_reduction_type v_reduc_type;
3289 stmt_vec_info def_info
3290 = vect_is_simple_reduction (loop_info, phi_info, double_reduc,
3291 need_wrapping_integral_overflow,
3292 &v_reduc_type);
3293 if (def_info)
3295 STMT_VINFO_REDUC_TYPE (phi_info) = v_reduc_type;
3296 STMT_VINFO_REDUC_DEF (phi_info) = def_info;
3297 STMT_VINFO_REDUC_TYPE (def_info) = v_reduc_type;
3298 STMT_VINFO_REDUC_DEF (def_info) = phi_info;
3300 return def_info;
3303 /* Calculate cost of peeling the loop PEEL_ITERS_PROLOGUE times. */
3305 vect_get_known_peeling_cost (loop_vec_info loop_vinfo, int peel_iters_prologue,
3306 int *peel_iters_epilogue,
3307 stmt_vector_for_cost *scalar_cost_vec,
3308 stmt_vector_for_cost *prologue_cost_vec,
3309 stmt_vector_for_cost *epilogue_cost_vec)
3311 int retval = 0;
3312 int assumed_vf = vect_vf_for_cost (loop_vinfo);
3314 if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
3316 *peel_iters_epilogue = assumed_vf / 2;
3317 if (dump_enabled_p ())
3318 dump_printf_loc (MSG_NOTE, vect_location,
3319 "cost model: epilogue peel iters set to vf/2 "
3320 "because loop iterations are unknown .\n");
3322 /* If peeled iterations are known but the number of scalar loop
3323 iterations is unknown, count a taken branch per peeled loop. */
3324 retval = record_stmt_cost (prologue_cost_vec, 1, cond_branch_taken,
3325 NULL, 0, vect_prologue);
3326 retval = record_stmt_cost (prologue_cost_vec, 1, cond_branch_taken,
3327 NULL, 0, vect_epilogue);
3329 else
3331 int niters = LOOP_VINFO_INT_NITERS (loop_vinfo);
3332 peel_iters_prologue = niters < peel_iters_prologue ?
3333 niters : peel_iters_prologue;
3334 *peel_iters_epilogue = (niters - peel_iters_prologue) % assumed_vf;
3335 /* If we need to peel for gaps but no epilogue peeling would otherwise
3336 be required, we have to peel VF iterations. */
3337 if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo) && !*peel_iters_epilogue)
3338 *peel_iters_epilogue = assumed_vf;
3341 stmt_info_for_cost *si;
3342 int j;
3343 if (peel_iters_prologue)
3344 FOR_EACH_VEC_ELT (*scalar_cost_vec, j, si)
3345 retval += record_stmt_cost (prologue_cost_vec,
3346 si->count * peel_iters_prologue,
3347 si->kind, si->stmt_info, si->misalign,
3348 vect_prologue);
3349 if (*peel_iters_epilogue)
3350 FOR_EACH_VEC_ELT (*scalar_cost_vec, j, si)
3351 retval += record_stmt_cost (epilogue_cost_vec,
3352 si->count * *peel_iters_epilogue,
3353 si->kind, si->stmt_info, si->misalign,
3354 vect_epilogue);
3356 return retval;
3359 /* Function vect_estimate_min_profitable_iters
3361 Return the number of iterations required for the vector version of the
3362 loop to be profitable relative to the cost of the scalar version of the
3363 loop.
3365 *RET_MIN_PROFITABLE_NITERS is a cost model profitability threshold
3366 of iterations for vectorization. -1 value means loop vectorization
3367 is not profitable. This returned value may be used for dynamic
3368 profitability check.
3370 *RET_MIN_PROFITABLE_ESTIMATE is a profitability threshold to be used
3371 for static check against estimated number of iterations. */
3373 static void
3374 vect_estimate_min_profitable_iters (loop_vec_info loop_vinfo,
3375 int *ret_min_profitable_niters,
3376 int *ret_min_profitable_estimate)
3378 int min_profitable_iters;
3379 int min_profitable_estimate;
3380 int peel_iters_prologue;
3381 int peel_iters_epilogue;
3382 unsigned vec_inside_cost = 0;
3383 int vec_outside_cost = 0;
3384 unsigned vec_prologue_cost = 0;
3385 unsigned vec_epilogue_cost = 0;
3386 int scalar_single_iter_cost = 0;
3387 int scalar_outside_cost = 0;
3388 int assumed_vf = vect_vf_for_cost (loop_vinfo);
3389 int npeel = LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo);
3390 void *target_cost_data = LOOP_VINFO_TARGET_COST_DATA (loop_vinfo);
3392 /* Cost model disabled. */
3393 if (unlimited_cost_model (LOOP_VINFO_LOOP (loop_vinfo)))
3395 dump_printf_loc (MSG_NOTE, vect_location, "cost model disabled.\n");
3396 *ret_min_profitable_niters = 0;
3397 *ret_min_profitable_estimate = 0;
3398 return;
3401 /* Requires loop versioning tests to handle misalignment. */
3402 if (LOOP_REQUIRES_VERSIONING_FOR_ALIGNMENT (loop_vinfo))
3404 /* FIXME: Make cost depend on complexity of individual check. */
3405 unsigned len = LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).length ();
3406 (void) add_stmt_cost (target_cost_data, len, vector_stmt, NULL, 0,
3407 vect_prologue);
3408 dump_printf (MSG_NOTE,
3409 "cost model: Adding cost of checks for loop "
3410 "versioning to treat misalignment.\n");
3413 /* Requires loop versioning with alias checks. */
3414 if (LOOP_REQUIRES_VERSIONING_FOR_ALIAS (loop_vinfo))
3416 /* FIXME: Make cost depend on complexity of individual check. */
3417 unsigned len = LOOP_VINFO_COMP_ALIAS_DDRS (loop_vinfo).length ();
3418 (void) add_stmt_cost (target_cost_data, len, vector_stmt, NULL, 0,
3419 vect_prologue);
3420 len = LOOP_VINFO_CHECK_UNEQUAL_ADDRS (loop_vinfo).length ();
3421 if (len)
3422 /* Count LEN - 1 ANDs and LEN comparisons. */
3423 (void) add_stmt_cost (target_cost_data, len * 2 - 1, scalar_stmt,
3424 NULL, 0, vect_prologue);
3425 len = LOOP_VINFO_LOWER_BOUNDS (loop_vinfo).length ();
3426 if (len)
3428 /* Count LEN - 1 ANDs and LEN comparisons. */
3429 unsigned int nstmts = len * 2 - 1;
3430 /* +1 for each bias that needs adding. */
3431 for (unsigned int i = 0; i < len; ++i)
3432 if (!LOOP_VINFO_LOWER_BOUNDS (loop_vinfo)[i].unsigned_p)
3433 nstmts += 1;
3434 (void) add_stmt_cost (target_cost_data, nstmts, scalar_stmt,
3435 NULL, 0, vect_prologue);
3437 dump_printf (MSG_NOTE,
3438 "cost model: Adding cost of checks for loop "
3439 "versioning aliasing.\n");
3442 /* Requires loop versioning with niter checks. */
3443 if (LOOP_REQUIRES_VERSIONING_FOR_NITERS (loop_vinfo))
3445 /* FIXME: Make cost depend on complexity of individual check. */
3446 (void) add_stmt_cost (target_cost_data, 1, vector_stmt, NULL, 0,
3447 vect_prologue);
3448 dump_printf (MSG_NOTE,
3449 "cost model: Adding cost of checks for loop "
3450 "versioning niters.\n");
3453 if (LOOP_REQUIRES_VERSIONING (loop_vinfo))
3454 (void) add_stmt_cost (target_cost_data, 1, cond_branch_taken, NULL, 0,
3455 vect_prologue);
3457 /* Count statements in scalar loop. Using this as scalar cost for a single
3458 iteration for now.
3460 TODO: Add outer loop support.
3462 TODO: Consider assigning different costs to different scalar
3463 statements. */
3465 scalar_single_iter_cost
3466 = LOOP_VINFO_SINGLE_SCALAR_ITERATION_COST (loop_vinfo);
3468 /* Add additional cost for the peeled instructions in prologue and epilogue
3469 loop. (For fully-masked loops there will be no peeling.)
3471 FORNOW: If we don't know the value of peel_iters for prologue or epilogue
3472 at compile-time - we assume it's vf/2 (the worst would be vf-1).
3474 TODO: Build an expression that represents peel_iters for prologue and
3475 epilogue to be used in a run-time test. */
3477 if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
3479 peel_iters_prologue = 0;
3480 peel_iters_epilogue = 0;
3482 if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo))
3484 /* We need to peel exactly one iteration. */
3485 peel_iters_epilogue += 1;
3486 stmt_info_for_cost *si;
3487 int j;
3488 FOR_EACH_VEC_ELT (LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo),
3489 j, si)
3490 (void) add_stmt_cost (target_cost_data, si->count,
3491 si->kind, si->stmt_info, si->misalign,
3492 vect_epilogue);
3495 else if (npeel < 0)
3497 peel_iters_prologue = assumed_vf / 2;
3498 dump_printf (MSG_NOTE, "cost model: "
3499 "prologue peel iters set to vf/2.\n");
3501 /* If peeling for alignment is unknown, loop bound of main loop becomes
3502 unknown. */
3503 peel_iters_epilogue = assumed_vf / 2;
3504 dump_printf (MSG_NOTE, "cost model: "
3505 "epilogue peel iters set to vf/2 because "
3506 "peeling for alignment is unknown.\n");
3508 /* If peeled iterations are unknown, count a taken branch and a not taken
3509 branch per peeled loop. Even if scalar loop iterations are known,
3510 vector iterations are not known since peeled prologue iterations are
3511 not known. Hence guards remain the same. */
3512 (void) add_stmt_cost (target_cost_data, 1, cond_branch_taken,
3513 NULL, 0, vect_prologue);
3514 (void) add_stmt_cost (target_cost_data, 1, cond_branch_not_taken,
3515 NULL, 0, vect_prologue);
3516 (void) add_stmt_cost (target_cost_data, 1, cond_branch_taken,
3517 NULL, 0, vect_epilogue);
3518 (void) add_stmt_cost (target_cost_data, 1, cond_branch_not_taken,
3519 NULL, 0, vect_epilogue);
3520 stmt_info_for_cost *si;
3521 int j;
3522 FOR_EACH_VEC_ELT (LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo), j, si)
3524 (void) add_stmt_cost (target_cost_data,
3525 si->count * peel_iters_prologue,
3526 si->kind, si->stmt_info, si->misalign,
3527 vect_prologue);
3528 (void) add_stmt_cost (target_cost_data,
3529 si->count * peel_iters_epilogue,
3530 si->kind, si->stmt_info, si->misalign,
3531 vect_epilogue);
3534 else
3536 stmt_vector_for_cost prologue_cost_vec, epilogue_cost_vec;
3537 stmt_info_for_cost *si;
3538 int j;
3539 void *data = LOOP_VINFO_TARGET_COST_DATA (loop_vinfo);
3541 prologue_cost_vec.create (2);
3542 epilogue_cost_vec.create (2);
3543 peel_iters_prologue = npeel;
3545 (void) vect_get_known_peeling_cost (loop_vinfo, peel_iters_prologue,
3546 &peel_iters_epilogue,
3547 &LOOP_VINFO_SCALAR_ITERATION_COST
3548 (loop_vinfo),
3549 &prologue_cost_vec,
3550 &epilogue_cost_vec);
3552 FOR_EACH_VEC_ELT (prologue_cost_vec, j, si)
3553 (void) add_stmt_cost (data, si->count, si->kind, si->stmt_info,
3554 si->misalign, vect_prologue);
3556 FOR_EACH_VEC_ELT (epilogue_cost_vec, j, si)
3557 (void) add_stmt_cost (data, si->count, si->kind, si->stmt_info,
3558 si->misalign, vect_epilogue);
3560 prologue_cost_vec.release ();
3561 epilogue_cost_vec.release ();
3564 /* FORNOW: The scalar outside cost is incremented in one of the
3565 following ways:
3567 1. The vectorizer checks for alignment and aliasing and generates
3568 a condition that allows dynamic vectorization. A cost model
3569 check is ANDED with the versioning condition. Hence scalar code
3570 path now has the added cost of the versioning check.
3572 if (cost > th & versioning_check)
3573 jmp to vector code
3575 Hence the run-time scalar cost is incremented by a not-taken branch cost.
3577 2. The vectorizer then checks if a prologue is required. If the
3578 cost model check was not done before during versioning, it has to
3579 be done before the prologue check.
3581 if (cost <= th)
3582 prologue = scalar_iters
3583 if (prologue == 0)
3584 jmp to vector code
3585 else
3586 execute prologue
3587 if (prologue == num_iters)
3588 go to exit
3590 Hence the run-time scalar cost is incremented by a taken branch,
3591 plus a not-taken branch, plus a taken branch cost.
3593 3. The vectorizer then checks if an epilogue is required. If the
3594 cost model check was not done before during prologue check, it
3595 has to be done with the epilogue check.
3597 if (prologue == 0)
3598 jmp to vector code
3599 else
3600 execute prologue
3601 if (prologue == num_iters)
3602 go to exit
3603 vector code:
3604 if ((cost <= th) | (scalar_iters-prologue-epilogue == 0))
3605 jmp to epilogue
3607 Hence the run-time scalar cost should be incremented by 2 taken
3608 branches.
3610 TODO: The back end may reorder the BBS's differently and reverse
3611 conditions/branch directions. Change the estimates below to
3612 something more reasonable. */
3614 /* If the number of iterations is known and we do not do versioning, we can
3615 decide whether to vectorize at compile time. Hence the scalar version
3616 does not carry cost model guard costs. */
3617 if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
3618 || LOOP_REQUIRES_VERSIONING (loop_vinfo))
3620 /* Cost model check occurs at versioning. */
3621 if (LOOP_REQUIRES_VERSIONING (loop_vinfo))
3622 scalar_outside_cost += vect_get_stmt_cost (cond_branch_not_taken);
3623 else
3625 /* Cost model check occurs at prologue generation. */
3626 if (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) < 0)
3627 scalar_outside_cost += 2 * vect_get_stmt_cost (cond_branch_taken)
3628 + vect_get_stmt_cost (cond_branch_not_taken);
3629 /* Cost model check occurs at epilogue generation. */
3630 else
3631 scalar_outside_cost += 2 * vect_get_stmt_cost (cond_branch_taken);
3635 /* Complete the target-specific cost calculations. */
3636 finish_cost (LOOP_VINFO_TARGET_COST_DATA (loop_vinfo), &vec_prologue_cost,
3637 &vec_inside_cost, &vec_epilogue_cost);
3639 vec_outside_cost = (int)(vec_prologue_cost + vec_epilogue_cost);
3641 if (dump_enabled_p ())
3643 dump_printf_loc (MSG_NOTE, vect_location, "Cost model analysis: \n");
3644 dump_printf (MSG_NOTE, " Vector inside of loop cost: %d\n",
3645 vec_inside_cost);
3646 dump_printf (MSG_NOTE, " Vector prologue cost: %d\n",
3647 vec_prologue_cost);
3648 dump_printf (MSG_NOTE, " Vector epilogue cost: %d\n",
3649 vec_epilogue_cost);
3650 dump_printf (MSG_NOTE, " Scalar iteration cost: %d\n",
3651 scalar_single_iter_cost);
3652 dump_printf (MSG_NOTE, " Scalar outside cost: %d\n",
3653 scalar_outside_cost);
3654 dump_printf (MSG_NOTE, " Vector outside cost: %d\n",
3655 vec_outside_cost);
3656 dump_printf (MSG_NOTE, " prologue iterations: %d\n",
3657 peel_iters_prologue);
3658 dump_printf (MSG_NOTE, " epilogue iterations: %d\n",
3659 peel_iters_epilogue);
3662 /* Calculate number of iterations required to make the vector version
3663 profitable, relative to the loop bodies only. The following condition
3664 must hold true:
3665 SIC * niters + SOC > VIC * ((niters-PL_ITERS-EP_ITERS)/VF) + VOC
3666 where
3667 SIC = scalar iteration cost, VIC = vector iteration cost,
3668 VOC = vector outside cost, VF = vectorization factor,
3669 PL_ITERS = prologue iterations, EP_ITERS = epilogue iterations
3670 SOC = scalar outside cost for run time cost model check. */
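  /* Editor's note (illustrative example only; the numbers are invented and
     do not come from any target's cost tables): with SIC = 4, VIC = 6,
     VF = 4, VOC = 14, SOC = 6 and no prologue/epilogue peeling, the code
     below computes ((14 - 6) * 4) / (4 * 4 - 6) = 32 / 10 = 3 by truncating
     integer division, then sees that 4 * 4 * 3 = 48 <= 6 * 3 + (14 - 6) * 4
     = 50 and bumps the result to 4, i.e. at least four scalar iterations
     are needed before the vector loop is expected to pay off.  */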
3672 if ((scalar_single_iter_cost * assumed_vf) > (int) vec_inside_cost)
3674 min_profitable_iters = ((vec_outside_cost - scalar_outside_cost)
3675 * assumed_vf
3676 - vec_inside_cost * peel_iters_prologue
3677 - vec_inside_cost * peel_iters_epilogue);
3678 if (min_profitable_iters <= 0)
3679 min_profitable_iters = 0;
3680 else
3682 min_profitable_iters /= ((scalar_single_iter_cost * assumed_vf)
3683 - vec_inside_cost);
3685 if ((scalar_single_iter_cost * assumed_vf * min_profitable_iters)
3686 <= (((int) vec_inside_cost * min_profitable_iters)
3687 + (((int) vec_outside_cost - scalar_outside_cost)
3688 * assumed_vf)))
3689 min_profitable_iters++;
3692 /* vector version will never be profitable. */
3693 else
3695 if (LOOP_VINFO_LOOP (loop_vinfo)->force_vectorize)
3696 warning_at (vect_location.get_location_t (), OPT_Wopenmp_simd,
3697 "vectorization did not happen for a simd loop");
3699 if (dump_enabled_p ())
3700 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3701 "cost model: the vector iteration cost = %d "
3702 "divided by the scalar iteration cost = %d "
3703 "is greater or equal to the vectorization factor = %d"
3704 ".\n",
3705 vec_inside_cost, scalar_single_iter_cost, assumed_vf);
3706 *ret_min_profitable_niters = -1;
3707 *ret_min_profitable_estimate = -1;
3708 return;
3711 dump_printf (MSG_NOTE,
3712 " Calculated minimum iters for profitability: %d\n",
3713 min_profitable_iters);
3715 if (!LOOP_VINFO_FULLY_MASKED_P (loop_vinfo)
3716 && min_profitable_iters < (assumed_vf + peel_iters_prologue))
3717 /* We want the vectorized loop to execute at least once. */
3718 min_profitable_iters = assumed_vf + peel_iters_prologue;
3720 if (dump_enabled_p ())
3721 dump_printf_loc (MSG_NOTE, vect_location,
3722 " Runtime profitability threshold = %d\n",
3723 min_profitable_iters);
3725 *ret_min_profitable_niters = min_profitable_iters;
3727 /* Calculate number of iterations required to make the vector version
3728 profitable, relative to the loop bodies only.
3730 Non-vectorized variant is SIC * niters and it must win over the vector
3731 variant at the expected loop trip count. The following condition must hold true:
3732 SIC * niters > VIC * ((niters-PL_ITERS-EP_ITERS)/VF) + VOC + SOC */
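/* Continuing the illustrative numbers used for the runtime threshold
   (SIC = 4, VIC = 8, VF = 4, VOC = 20, SOC = 6, no peeling), the code
   below computes
     ((VOC + SOC) * VF) / (SIC * VF - VIC) = (26 * 4) / 8 = 13
   and then clamps the result to be at least the runtime threshold. */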
3734 if (vec_outside_cost <= 0)
3735 min_profitable_estimate = 0;
3736 else
3738 min_profitable_estimate = ((vec_outside_cost + scalar_outside_cost)
3739 * assumed_vf
3740 - vec_inside_cost * peel_iters_prologue
3741 - vec_inside_cost * peel_iters_epilogue)
3742 / ((scalar_single_iter_cost * assumed_vf)
3743 - vec_inside_cost);
3745 min_profitable_estimate = MAX (min_profitable_estimate, min_profitable_iters);
3746 if (dump_enabled_p ())
3747 dump_printf_loc (MSG_NOTE, vect_location,
3748 " Static estimate profitability threshold = %d\n",
3749 min_profitable_estimate);
3751 *ret_min_profitable_estimate = min_profitable_estimate;
3754 /* Writes into SEL a mask for a vec_perm, equivalent to a vec_shr by OFFSET
3755 vector elements (not bits) for a vector with NELT elements. */
3756 static void
3757 calc_vec_perm_mask_for_shift (unsigned int offset, unsigned int nelt,
3758 vec_perm_builder *sel)
3760 /* The encoding is a single stepped pattern. Any wrap-around is handled
3761 by vec_perm_indices. */
3762 sel->new_vector (nelt, 1, 3);
3763 for (unsigned int i = 0; i < 3; i++)
3764 sel->quick_push (i + offset);
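/* For example, for NELT = 8 and OFFSET = 2 the three encoded elements
   {2, 3, 4} expand to the selector {2, 3, 4, 5, 6, 7, 8, 9}. Used as a
   two-input permutation whose second input is a zero vector (as in the
   reduction epilogue below), indices 8 and 9 select zeros, so the whole
   vector is shifted down by two elements with zero fill. */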
3767 /* Checks whether the target supports whole-vector shifts for vectors of mode
3768 MODE. This is the case if _either_ the platform handles vec_shr_optab, _or_
3769 it supports vec_perm_const with masks for all necessary shift amounts. */
3770 static bool
3771 have_whole_vector_shift (machine_mode mode)
3773 if (optab_handler (vec_shr_optab, mode) != CODE_FOR_nothing)
3774 return true;
3776 /* Variable-length vectors should be handled via the optab. */
3777 unsigned int nelt;
3778 if (!GET_MODE_NUNITS (mode).is_constant (&nelt))
3779 return false;
3781 vec_perm_builder sel;
3782 vec_perm_indices indices;
3783 for (unsigned int i = nelt / 2; i >= 1; i /= 2)
3785 calc_vec_perm_mask_for_shift (i, nelt, &sel);
3786 indices.new_vector (sel, 2, nelt);
3787 if (!can_vec_perm_const_p (mode, indices, false))
3788 return false;
3790 return true;
3793 /* TODO: Close dependency between vect_model_*_cost and vectorizable_*
3794 functions. Design better to avoid maintenance issues. */
3796 /* Function vect_model_reduction_cost.
3798 Models cost for a reduction operation, including the vector ops
3799 generated within the strip-mine loop, the initial definition before
3800 the loop, and the epilogue code that must be generated. */
3802 static void
3803 vect_model_reduction_cost (stmt_vec_info stmt_info, internal_fn reduc_fn,
3804 int ncopies, stmt_vector_for_cost *cost_vec)
3806 int prologue_cost = 0, epilogue_cost = 0, inside_cost;
3807 enum tree_code code;
3808 optab optab;
3809 tree vectype;
3810 machine_mode mode;
3811 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
3812 struct loop *loop = NULL;
3814 if (loop_vinfo)
3815 loop = LOOP_VINFO_LOOP (loop_vinfo);
3817 /* Condition reductions generate two reductions in the loop. */
3818 vect_reduction_type reduction_type
3819 = STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info);
3820 if (reduction_type == COND_REDUCTION)
3821 ncopies *= 2;
3823 vectype = STMT_VINFO_VECTYPE (stmt_info);
3824 mode = TYPE_MODE (vectype);
3825 stmt_vec_info orig_stmt_info = STMT_VINFO_RELATED_STMT (stmt_info);
3827 if (!orig_stmt_info)
3828 orig_stmt_info = stmt_info;
3830 code = gimple_assign_rhs_code (orig_stmt_info->stmt);
3832 if (reduction_type == EXTRACT_LAST_REDUCTION
3833 || reduction_type == FOLD_LEFT_REDUCTION)
3835 /* No extra instructions needed in the prologue. */
3836 prologue_cost = 0;
3838 if (reduction_type == EXTRACT_LAST_REDUCTION || reduc_fn != IFN_LAST)
3839 /* Count one reduction-like operation per vector. */
3840 inside_cost = record_stmt_cost (cost_vec, ncopies, vec_to_scalar,
3841 stmt_info, 0, vect_body);
3842 else
3844 /* Use NELEMENTS extracts and NELEMENTS scalar ops. */
3845 unsigned int nelements = ncopies * vect_nunits_for_cost (vectype);
3846 inside_cost = record_stmt_cost (cost_vec, nelements,
3847 vec_to_scalar, stmt_info, 0,
3848 vect_body);
3849 inside_cost += record_stmt_cost (cost_vec, nelements,
3850 scalar_stmt, stmt_info, 0,
3851 vect_body);
3854 else
3856 /* Add in cost for initial definition.
3857 For cond reduction we have four vectors: initial index, step,
3858 initial result of the data reduction, initial value of the index
3859 reduction. */
3860 int prologue_stmts = reduction_type == COND_REDUCTION ? 4 : 1;
3861 prologue_cost += record_stmt_cost (cost_vec, prologue_stmts,
3862 scalar_to_vec, stmt_info, 0,
3863 vect_prologue);
3865 /* Cost of reduction op inside loop. */
3866 inside_cost = record_stmt_cost (cost_vec, ncopies, vector_stmt,
3867 stmt_info, 0, vect_body);
3870 /* Determine cost of epilogue code.
3872 We have a reduction operator that will reduce the vector in one statement.
3873 Also requires scalar extract. */
3875 if (!loop || !nested_in_vect_loop_p (loop, orig_stmt_info))
3877 if (reduc_fn != IFN_LAST)
3879 if (reduction_type == COND_REDUCTION)
3881 /* An EQ stmt and a COND_EXPR stmt. */
3882 epilogue_cost += record_stmt_cost (cost_vec, 2,
3883 vector_stmt, stmt_info, 0,
3884 vect_epilogue);
3885 /* Reduction of the max index and a reduction of the found
3886 values. */
3887 epilogue_cost += record_stmt_cost (cost_vec, 2,
3888 vec_to_scalar, stmt_info, 0,
3889 vect_epilogue);
3890 /* A broadcast of the max value. */
3891 epilogue_cost += record_stmt_cost (cost_vec, 1,
3892 scalar_to_vec, stmt_info, 0,
3893 vect_epilogue);
3895 else
3897 epilogue_cost += record_stmt_cost (cost_vec, 1, vector_stmt,
3898 stmt_info, 0, vect_epilogue);
3899 epilogue_cost += record_stmt_cost (cost_vec, 1,
3900 vec_to_scalar, stmt_info, 0,
3901 vect_epilogue);
3904 else if (reduction_type == COND_REDUCTION)
3906 unsigned estimated_nunits = vect_nunits_for_cost (vectype);
3907 /* Extraction of scalar elements. */
3908 epilogue_cost += record_stmt_cost (cost_vec,
3909 2 * estimated_nunits,
3910 vec_to_scalar, stmt_info, 0,
3911 vect_epilogue);
3912 /* Scalar max reductions via COND_EXPR / MAX_EXPR. */
3913 epilogue_cost += record_stmt_cost (cost_vec,
3914 2 * estimated_nunits - 3,
3915 scalar_stmt, stmt_info, 0,
3916 vect_epilogue);
3918 else if (reduction_type == EXTRACT_LAST_REDUCTION
3919 || reduction_type == FOLD_LEFT_REDUCTION)
3920 /* No extra instructions needed in the epilogue. */
3922 else
3924 int vec_size_in_bits = tree_to_uhwi (TYPE_SIZE (vectype));
3925 tree bitsize =
3926 TYPE_SIZE (TREE_TYPE (gimple_assign_lhs (orig_stmt_info->stmt)));
3927 int element_bitsize = tree_to_uhwi (bitsize);
3928 int nelements = vec_size_in_bits / element_bitsize;
3930 if (code == COND_EXPR)
3931 code = MAX_EXPR;
3933 optab = optab_for_tree_code (code, vectype, optab_default);
3935 /* We have a whole vector shift available. */
3936 if (optab != unknown_optab
3937 && VECTOR_MODE_P (mode)
3938 && optab_handler (optab, mode) != CODE_FOR_nothing
3939 && have_whole_vector_shift (mode))
3941 /* Final reduction via vector shifts and the reduction operator.
3942 Also requires scalar extract. */
3943 epilogue_cost += record_stmt_cost (cost_vec,
3944 exact_log2 (nelements) * 2,
3945 vector_stmt, stmt_info, 0,
3946 vect_epilogue);
3947 epilogue_cost += record_stmt_cost (cost_vec, 1,
3948 vec_to_scalar, stmt_info, 0,
3949 vect_epilogue);
3951 else
3952 /* Use extracts and reduction op for final reduction. For N
3953 elements, we have N extracts and N-1 reduction ops. */
3954 epilogue_cost += record_stmt_cost (cost_vec,
3955 nelements + nelements - 1,
3956 vector_stmt, stmt_info, 0,
3957 vect_epilogue);
3961 if (dump_enabled_p ())
3962 dump_printf (MSG_NOTE,
3963 "vect_model_reduction_cost: inside_cost = %d, "
3964 "prologue_cost = %d, epilogue_cost = %d .\n", inside_cost,
3965 prologue_cost, epilogue_cost);
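/* As a minimal illustration of the accounting above: a plain sum reduction
   with NCOPIES = 1, a usable REDUC_FN and no outer-loop nesting is costed
   as one scalar_to_vec in the prologue (the initial definition), one
   vector_stmt in the loop body, and one vector_stmt plus one vec_to_scalar
   in the epilogue (the reduction proper and the final extract). */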
3969 /* Function vect_model_induction_cost.
3971 Models cost for induction operations. */
3973 static void
3974 vect_model_induction_cost (stmt_vec_info stmt_info, int ncopies,
3975 stmt_vector_for_cost *cost_vec)
3977 unsigned inside_cost, prologue_cost;
3979 if (PURE_SLP_STMT (stmt_info))
3980 return;
3982 /* loop cost for vec_loop. */
3983 inside_cost = record_stmt_cost (cost_vec, ncopies, vector_stmt,
3984 stmt_info, 0, vect_body);
3986 /* prologue cost for vec_init and vec_step. */
3987 prologue_cost = record_stmt_cost (cost_vec, 2, scalar_to_vec,
3988 stmt_info, 0, vect_prologue);
3990 if (dump_enabled_p ())
3991 dump_printf_loc (MSG_NOTE, vect_location,
3992 "vect_model_induction_cost: inside_cost = %d, "
3993 "prologue_cost = %d .\n", inside_cost, prologue_cost);
3998 /* Function get_initial_def_for_reduction
4000 Input:
4001 STMT_VINFO - a stmt that performs a reduction operation in the loop.
4002 INIT_VAL - the initial value of the reduction variable
4004 Output:
4005 ADJUSTMENT_DEF - a tree that holds a value to be added to the final result
4006 of the reduction (used for adjusting the epilog - see below).
4007 Return a vector variable, initialized according to the operation that
4008 STMT_VINFO performs. This vector will be used as the initial value
4009 of the vector of partial results.
4011 Option1 (adjust in epilog): Initialize the vector as follows:
4012 add/bit or/xor: [0,0,...,0,0]
4013 mult/bit and: [1,1,...,1,1]
4014 min/max/cond_expr: [init_val,init_val,..,init_val,init_val]
4015 and when necessary (e.g. add/mult case) let the caller know
4016 that it needs to adjust the result by init_val.
4018 Option2: Initialize the vector as follows:
4019 add/bit or/xor: [init_val,0,0,...,0]
4020 mult/bit and: [init_val,1,1,...,1]
4021 min/max/cond_expr: [init_val,init_val,...,init_val]
4022 and no adjustments are needed.
4024 For example, for the following code:
4026 s = init_val;
4027 for (i=0;i<n;i++)
4028 s = s + a[i];
4030 STMT_VINFO is 's = s + a[i]', and the reduction variable is 's'.
4031 For a vector of 4 units, we want to return either [0,0,0,init_val],
4032 or [0,0,0,0] and let the caller know that it needs to adjust
4033 the result at the end by 'init_val'.
4035 FORNOW, we use the 'adjust in epilog' scheme (Option1) when ADJUSTMENT_DEF
4036 is not NULL, because this way the initialization vector is simpler (same
4037 element in all entries), and Option2 otherwise.
4039 A cost model should help decide between these two schemes. */
4041 tree
4042 get_initial_def_for_reduction (stmt_vec_info stmt_vinfo, tree init_val,
4043 tree *adjustment_def)
4045 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_vinfo);
4046 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
4047 tree scalar_type = TREE_TYPE (init_val);
4048 tree vectype = get_vectype_for_scalar_type (scalar_type);
4049 enum tree_code code = gimple_assign_rhs_code (stmt_vinfo->stmt);
4050 tree def_for_init;
4051 tree init_def;
4052 REAL_VALUE_TYPE real_init_val = dconst0;
4053 int int_init_val = 0;
4054 gimple_seq stmts = NULL;
4056 gcc_assert (vectype);
4058 gcc_assert (POINTER_TYPE_P (scalar_type) || INTEGRAL_TYPE_P (scalar_type)
4059 || SCALAR_FLOAT_TYPE_P (scalar_type));
4061 gcc_assert (nested_in_vect_loop_p (loop, stmt_vinfo)
4062 || loop == (gimple_bb (stmt_vinfo->stmt))->loop_father);
4064 vect_reduction_type reduction_type
4065 = STMT_VINFO_VEC_REDUCTION_TYPE (stmt_vinfo);
4067 switch (code)
4069 case WIDEN_SUM_EXPR:
4070 case DOT_PROD_EXPR:
4071 case SAD_EXPR:
4072 case PLUS_EXPR:
4073 case MINUS_EXPR:
4074 case BIT_IOR_EXPR:
4075 case BIT_XOR_EXPR:
4076 case MULT_EXPR:
4077 case BIT_AND_EXPR:
4079 /* ADJUSTMENT_DEF is NULL when called from
4080 vect_create_epilog_for_reduction to vectorize double reduction. */
4081 if (adjustment_def)
4082 *adjustment_def = init_val;
4084 if (code == MULT_EXPR)
4086 real_init_val = dconst1;
4087 int_init_val = 1;
4090 if (code == BIT_AND_EXPR)
4091 int_init_val = -1;
4093 if (SCALAR_FLOAT_TYPE_P (scalar_type))
4094 def_for_init = build_real (scalar_type, real_init_val);
4095 else
4096 def_for_init = build_int_cst (scalar_type, int_init_val);
4098 if (adjustment_def)
4099 /* Option1: the first element is '0' or '1' as well. */
4100 init_def = gimple_build_vector_from_val (&stmts, vectype,
4101 def_for_init);
4102 else if (!TYPE_VECTOR_SUBPARTS (vectype).is_constant ())
4104 /* Option2 (variable length): the first element is INIT_VAL. */
4105 init_def = gimple_build_vector_from_val (&stmts, vectype,
4106 def_for_init);
4107 init_def = gimple_build (&stmts, CFN_VEC_SHL_INSERT,
4108 vectype, init_def, init_val);
4110 else
4112 /* Option2: the first element is INIT_VAL. */
4113 tree_vector_builder elts (vectype, 1, 2);
4114 elts.quick_push (init_val);
4115 elts.quick_push (def_for_init);
4116 init_def = gimple_build_vector (&stmts, &elts);
4119 break;
4121 case MIN_EXPR:
4122 case MAX_EXPR:
4123 case COND_EXPR:
4125 if (adjustment_def)
4127 *adjustment_def = NULL_TREE;
4128 if (reduction_type != COND_REDUCTION
4129 && reduction_type != EXTRACT_LAST_REDUCTION)
4131 init_def = vect_get_vec_def_for_operand (init_val, stmt_vinfo);
4132 break;
4135 init_val = gimple_convert (&stmts, TREE_TYPE (vectype), init_val);
4136 init_def = gimple_build_vector_from_val (&stmts, vectype, init_val);
4138 break;
4140 default:
4141 gcc_unreachable ();
4144 if (stmts)
4145 gsi_insert_seq_on_edge_immediate (loop_preheader_edge (loop), stmts);
4146 return init_def;
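/* For example, for a MULT_EXPR reduction with INIT_VAL = 5 and four
   elements per vector, Option1 returns {1, 1, 1, 1} and sets
   ADJUSTMENT_DEF to 5, so the caller folds the 5 back into the final
   result, whereas Option2 returns {5, 1, 1, 1} and needs no adjustment. */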
4149 /* Get at the initial defs for the reduction PHIs in SLP_NODE.
4150 NUMBER_OF_VECTORS is the number of vector defs to create.
4151 If NEUTRAL_OP is nonnull, introducing extra elements of that
4152 value will not change the result. */
4154 static void
4155 get_initial_defs_for_reduction (slp_tree slp_node,
4156 vec<tree> *vec_oprnds,
4157 unsigned int number_of_vectors,
4158 bool reduc_chain, tree neutral_op)
4160 vec<stmt_vec_info> stmts = SLP_TREE_SCALAR_STMTS (slp_node);
4161 stmt_vec_info stmt_vinfo = stmts[0];
4162 unsigned HOST_WIDE_INT nunits;
4163 unsigned j, number_of_places_left_in_vector;
4164 tree vector_type;
4165 tree vop;
4166 int group_size = stmts.length ();
4167 unsigned int vec_num, i;
4168 unsigned number_of_copies = 1;
4169 vec<tree> voprnds;
4170 voprnds.create (number_of_vectors);
4171 struct loop *loop;
4172 auto_vec<tree, 16> permute_results;
4174 vector_type = STMT_VINFO_VECTYPE (stmt_vinfo);
4176 gcc_assert (STMT_VINFO_DEF_TYPE (stmt_vinfo) == vect_reduction_def);
4178 loop = (gimple_bb (stmt_vinfo->stmt))->loop_father;
4179 gcc_assert (loop);
4180 edge pe = loop_preheader_edge (loop);
4182 gcc_assert (!reduc_chain || neutral_op);
4184 /* NUMBER_OF_COPIES is the number of times we need to use the same values in
4185 created vectors. It is greater than 1 if unrolling is performed.
4187 For example, we have two scalar operands, s1 and s2 (e.g., group of
4188 strided accesses of size two), while NUNITS is four (i.e., four scalars
4189 of this type can be packed in a vector). The output vector will contain
4190 two copies of each scalar operand: {s1, s2, s1, s2}. (NUMBER_OF_COPIES
4191 will be 2).
4193 If REDUC_GROUP_SIZE > NUNITS, the scalars will be split into several
4194 vectors containing the operands.
4196 For example, NUNITS is four as before, and the group size is 8
4197 (s1, s2, ..., s8). We will create two vectors {s1, s2, s3, s4} and
4198 {s5, s6, s7, s8}. */
4200 if (!TYPE_VECTOR_SUBPARTS (vector_type).is_constant (&nunits))
4201 nunits = group_size;
4203 number_of_copies = nunits * number_of_vectors / group_size;
4205 number_of_places_left_in_vector = nunits;
4206 bool constant_p = true;
4207 tree_vector_builder elts (vector_type, nunits, 1);
4208 elts.quick_grow (nunits);
4209 for (j = 0; j < number_of_copies; j++)
4211 for (i = group_size - 1; stmts.iterate (i, &stmt_vinfo); i--)
4213 tree op;
4214 /* Get the def before the loop. In reduction chain we have only
4215 one initial value. */
4216 if ((j != (number_of_copies - 1)
4217 || (reduc_chain && i != 0))
4218 && neutral_op)
4219 op = neutral_op;
4220 else
4221 op = PHI_ARG_DEF_FROM_EDGE (stmt_vinfo->stmt, pe);
4223 /* Create 'vect_ = {op0,op1,...,opn}'. */
4224 number_of_places_left_in_vector--;
4225 elts[number_of_places_left_in_vector] = op;
4226 if (!CONSTANT_CLASS_P (op))
4227 constant_p = false;
4229 if (number_of_places_left_in_vector == 0)
4231 gimple_seq ctor_seq = NULL;
4232 tree init;
4233 if (constant_p && !neutral_op
4234 ? multiple_p (TYPE_VECTOR_SUBPARTS (vector_type), nunits)
4235 : known_eq (TYPE_VECTOR_SUBPARTS (vector_type), nunits))
4236 /* Build the vector directly from ELTS. */
4237 init = gimple_build_vector (&ctor_seq, &elts);
4238 else if (neutral_op)
4240 /* Build a vector of the neutral value and shift the
4241 other elements into place. */
4242 init = gimple_build_vector_from_val (&ctor_seq, vector_type,
4243 neutral_op);
4244 int k = nunits;
4245 while (k > 0 && elts[k - 1] == neutral_op)
4246 k -= 1;
4247 while (k > 0)
4249 k -= 1;
4250 init = gimple_build (&ctor_seq, CFN_VEC_SHL_INSERT,
4251 vector_type, init, elts[k]);
4254 else
4256 /* First time round, duplicate ELTS to fill the
4257 required number of vectors, then cherry pick the
4258 appropriate result for each iteration. */
4259 if (vec_oprnds->is_empty ())
4260 duplicate_and_interleave (&ctor_seq, vector_type, elts,
4261 number_of_vectors,
4262 permute_results);
4263 init = permute_results[number_of_vectors - j - 1];
4265 if (ctor_seq != NULL)
4266 gsi_insert_seq_on_edge_immediate (pe, ctor_seq);
4267 voprnds.quick_push (init);
4269 number_of_places_left_in_vector = nunits;
4270 elts.new_vector (vector_type, nunits, 1);
4271 elts.quick_grow (nunits);
4272 constant_p = true;
4277 /* Since the vectors are created in the reverse order, we should invert
4278 them. */
4279 vec_num = voprnds.length ();
4280 for (j = vec_num; j != 0; j--)
4282 vop = voprnds[j - 1];
4283 vec_oprnds->quick_push (vop);
4286 voprnds.release ();
4288 /* In case that VF is greater than the unrolling factor needed for the SLP
4289 group of stmts, NUMBER_OF_VECTORS to be created is greater than
4290 NUMBER_OF_SCALARS/NUNITS or NUNITS/NUMBER_OF_SCALARS, and hence we have
4291 to replicate the vectors. */
4292 tree neutral_vec = NULL;
4293 while (number_of_vectors > vec_oprnds->length ())
4295 if (neutral_op)
4297 if (!neutral_vec)
4299 gimple_seq ctor_seq = NULL;
4300 neutral_vec = gimple_build_vector_from_val
4301 (&ctor_seq, vector_type, neutral_op);
4302 if (ctor_seq != NULL)
4303 gsi_insert_seq_on_edge_immediate (pe, ctor_seq);
4305 vec_oprnds->quick_push (neutral_vec);
4307 else
4309 for (i = 0; vec_oprnds->iterate (i, &vop) && i < vec_num; i++)
4310 vec_oprnds->quick_push (vop);
4316 /* Function vect_create_epilog_for_reduction
4318 Create code at the loop-epilog to finalize the result of a reduction
4319 computation.
4321 VECT_DEFS is list of vector of partial results, i.e., the lhs's of vector
4322 reduction statements.
4323 STMT_INFO is the scalar reduction stmt that is being vectorized.
4324 NCOPIES is > 1 in case the vectorization factor (VF) is bigger than the
4325 number of elements that we can fit in a vectype (nunits). In this case
4326 we have to generate more than one vector stmt - i.e - we need to "unroll"
4327 the vector stmt by a factor VF/nunits. For more details see documentation
4328 in vectorizable_operation.
4329 REDUC_FN is the internal function for the epilog reduction.
4330 REDUCTION_PHIS is a list of the phi-nodes that carry the reduction
4331 computation.
4332 REDUC_INDEX is the index of the operand in the right hand side of the
4333 statement that is defined by REDUCTION_PHI.
4334 DOUBLE_REDUC is TRUE if double reduction phi nodes should be handled.
4335 SLP_NODE is an SLP node containing a group of reduction statements. The
4336 first one in this group is STMT_INFO.
4337 INDUC_VAL is for INTEGER_INDUC_COND_REDUCTION the value to use for the case
4338 when the COND_EXPR is never true in the loop. For MAX_EXPR, it needs to
4339 be smaller than any value of the IV in the loop, for MIN_EXPR larger than
4340 any value of the IV in the loop.
4341 INDUC_CODE is the code for epilog reduction if INTEGER_INDUC_COND_REDUCTION.
4342 NEUTRAL_OP is the value given by neutral_op_for_slp_reduction; it is
4343 null if this is not an SLP reduction.
4345 This function:
4346 1. Creates the reduction def-use cycles: sets the arguments for
4347 REDUCTION_PHIS:
4348 The loop-entry argument is the vectorized initial-value of the reduction.
4349 The loop-latch argument is taken from VECT_DEFS - the vector of partial
4350 sums.
4351 2. "Reduces" each vector of partial results VECT_DEFS into a single result,
4352 by calling the function specified by REDUC_FN if available, or by
4353 other means (whole-vector shifts or a scalar loop).
4354 The function also creates a new phi node at the loop exit to preserve
4355 loop-closed form, as illustrated below.
4357 The flow at the entry to this function:
4359 loop:
4360 vec_def = phi <null, null> # REDUCTION_PHI
4361 VECT_DEF = vector_stmt # vectorized form of STMT_INFO
4362 s_loop = scalar_stmt # (scalar) STMT_INFO
4363 loop_exit:
4364 s_out0 = phi <s_loop> # (scalar) EXIT_PHI
4365 use <s_out0>
4366 use <s_out0>
4368 The above is transformed by this function into:
4370 loop:
4371 vec_def = phi <vec_init, VECT_DEF> # REDUCTION_PHI
4372 VECT_DEF = vector_stmt # vectorized form of STMT_INFO
4373 s_loop = scalar_stmt # (scalar) STMT_INFO
4374 loop_exit:
4375 s_out0 = phi <s_loop> # (scalar) EXIT_PHI
4376 v_out1 = phi <VECT_DEF> # NEW_EXIT_PHI
4377 v_out2 = reduce <v_out1>
4378 s_out3 = extract_field <v_out2, 0>
4379 s_out4 = adjust_result <s_out3>
4380 use <s_out4>
4381 use <s_out4>
4384 static void
4385 vect_create_epilog_for_reduction (vec<tree> vect_defs,
4386 stmt_vec_info stmt_info,
4387 gimple *reduc_def_stmt,
4388 int ncopies, internal_fn reduc_fn,
4389 vec<stmt_vec_info> reduction_phis,
4390 bool double_reduc,
4391 slp_tree slp_node,
4392 slp_instance slp_node_instance,
4393 tree induc_val, enum tree_code induc_code,
4394 tree neutral_op)
4396 stmt_vec_info prev_phi_info;
4397 tree vectype;
4398 machine_mode mode;
4399 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
4400 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo), *outer_loop = NULL;
4401 basic_block exit_bb;
4402 tree scalar_dest;
4403 tree scalar_type;
4404 gimple *new_phi = NULL, *phi;
4405 stmt_vec_info phi_info;
4406 gimple_stmt_iterator exit_gsi;
4407 tree vec_dest;
4408 tree new_temp = NULL_TREE, new_dest, new_name, new_scalar_dest;
4409 gimple *epilog_stmt = NULL;
4410 enum tree_code code = gimple_assign_rhs_code (stmt_info->stmt);
4411 gimple *exit_phi;
4412 tree bitsize;
4413 tree adjustment_def = NULL;
4414 tree vec_initial_def = NULL;
4415 tree expr, def, initial_def = NULL;
4416 tree orig_name, scalar_result;
4417 imm_use_iterator imm_iter, phi_imm_iter;
4418 use_operand_p use_p, phi_use_p;
4419 gimple *use_stmt;
4420 stmt_vec_info reduction_phi_info = NULL;
4421 bool nested_in_vect_loop = false;
4422 auto_vec<gimple *> new_phis;
4423 auto_vec<stmt_vec_info> inner_phis;
4424 int j, i;
4425 auto_vec<tree> scalar_results;
4426 unsigned int group_size = 1, k, ratio;
4427 auto_vec<tree> vec_initial_defs;
4428 auto_vec<gimple *> phis;
4429 bool slp_reduc = false;
4430 bool direct_slp_reduc;
4431 tree new_phi_result;
4432 stmt_vec_info inner_phi = NULL;
4433 tree induction_index = NULL_TREE;
4435 if (slp_node)
4436 group_size = SLP_TREE_SCALAR_STMTS (slp_node).length ();
4438 if (nested_in_vect_loop_p (loop, stmt_info))
4440 outer_loop = loop;
4441 loop = loop->inner;
4442 nested_in_vect_loop = true;
4443 gcc_assert (!slp_node);
4446 vectype = STMT_VINFO_VECTYPE (stmt_info);
4447 gcc_assert (vectype);
4448 mode = TYPE_MODE (vectype);
4450 /* 1. Create the reduction def-use cycle:
4451 Set the arguments of REDUCTION_PHIS, i.e., transform
4453 loop:
4454 vec_def = phi <null, null> # REDUCTION_PHI
4455 VECT_DEF = vector_stmt # vectorized form of STMT
4458 into:
4460 loop:
4461 vec_def = phi <vec_init, VECT_DEF> # REDUCTION_PHI
4462 VECT_DEF = vector_stmt # vectorized form of STMT
4465 (in case of SLP, do it for all the phis). */
4467 /* Get the loop-entry arguments. */
4468 enum vect_def_type initial_def_dt = vect_unknown_def_type;
4469 if (slp_node)
4471 unsigned vec_num = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node);
4472 vec_initial_defs.reserve (vec_num);
4473 get_initial_defs_for_reduction (slp_node_instance->reduc_phis,
4474 &vec_initial_defs, vec_num,
4475 REDUC_GROUP_FIRST_ELEMENT (stmt_info),
4476 neutral_op);
4478 else
4480 /* Get at the scalar def before the loop, that defines the initial value
4481 of the reduction variable. */
4482 initial_def = PHI_ARG_DEF_FROM_EDGE (reduc_def_stmt,
4483 loop_preheader_edge (loop));
4484 /* Optimize: if initial_def is for REDUC_MAX smaller than the base
4485 and we can't use zero for induc_val, use initial_def. Similarly
4486 for REDUC_MIN and initial_def larger than the base. */
4487 if (TREE_CODE (initial_def) == INTEGER_CST
4488 && (STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info)
4489 == INTEGER_INDUC_COND_REDUCTION)
4490 && !integer_zerop (induc_val)
4491 && ((induc_code == MAX_EXPR
4492 && tree_int_cst_lt (initial_def, induc_val))
4493 || (induc_code == MIN_EXPR
4494 && tree_int_cst_lt (induc_val, initial_def))))
4495 induc_val = initial_def;
4497 if (double_reduc)
4498 /* In case of double reduction we only create a vector variable
4499 to be put in the reduction phi node. The actual statement
4500 creation is done later in this function. */
4501 vec_initial_def = vect_create_destination_var (initial_def, vectype);
4502 else if (nested_in_vect_loop)
4504 /* Do not use an adjustment def as that case is not supported
4505 correctly if ncopies is not one. */
4506 vect_is_simple_use (initial_def, loop_vinfo, &initial_def_dt);
4507 vec_initial_def = vect_get_vec_def_for_operand (initial_def,
4508 stmt_info);
4510 else
4511 vec_initial_def
4512 = get_initial_def_for_reduction (stmt_info, initial_def,
4513 &adjustment_def);
4514 vec_initial_defs.create (1);
4515 vec_initial_defs.quick_push (vec_initial_def);
4518 /* Set phi nodes arguments. */
4519 FOR_EACH_VEC_ELT (reduction_phis, i, phi_info)
4521 tree vec_init_def = vec_initial_defs[i];
4522 tree def = vect_defs[i];
4523 for (j = 0; j < ncopies; j++)
4525 if (j != 0)
4527 phi_info = STMT_VINFO_RELATED_STMT (phi_info);
4528 if (nested_in_vect_loop)
4529 vec_init_def
4530 = vect_get_vec_def_for_stmt_copy (loop_vinfo, vec_init_def);
4533 /* Set the loop-entry arg of the reduction-phi. */
4535 gphi *phi = as_a <gphi *> (phi_info->stmt);
4536 if (STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info)
4537 == INTEGER_INDUC_COND_REDUCTION)
4539 /* Initialise the reduction phi to zero. This prevents non-zero
4540 initial values from interfering with the reduction op. */
4541 gcc_assert (ncopies == 1);
4542 gcc_assert (i == 0);
4544 tree vec_init_def_type = TREE_TYPE (vec_init_def);
4545 tree induc_val_vec
4546 = build_vector_from_val (vec_init_def_type, induc_val);
4548 add_phi_arg (phi, induc_val_vec, loop_preheader_edge (loop),
4549 UNKNOWN_LOCATION);
4551 else
4552 add_phi_arg (phi, vec_init_def, loop_preheader_edge (loop),
4553 UNKNOWN_LOCATION);
4555 /* Set the loop-latch arg for the reduction-phi. */
4556 if (j > 0)
4557 def = vect_get_vec_def_for_stmt_copy (loop_vinfo, def);
4559 add_phi_arg (phi, def, loop_latch_edge (loop), UNKNOWN_LOCATION);
4561 if (dump_enabled_p ())
4563 dump_printf_loc (MSG_NOTE, vect_location,
4564 "transform reduction: created def-use cycle: ");
4565 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, phi, 0);
4566 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, SSA_NAME_DEF_STMT (def), 0);
4571 /* For cond reductions we want to create a new vector (INDEX_COND_EXPR)
4572 which is updated with the current index of the loop for every match of
4573 the original loop's cond_expr (VEC_STMT). This results in a vector
4574 containing the last time the condition passed for that vector lane.
4575 The first match will be a 1 to allow 0 to be used for non-matching
4576 indexes. If there are no matches at all then the vector will be all
4577 zeroes. */
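/* For example, with four lanes the index IV takes the values {1, 2, 3, 4}
   in the first vector iteration and {5, 6, 7, 8} in the second. If lane 2
   matches in the first iteration and lane 1 in the second, the final
   vector is {0, 6, 3, 0}; its largest element (6) identifies the last
   match overall. */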
4578 if (STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info) == COND_REDUCTION)
4580 tree indx_before_incr, indx_after_incr;
4581 poly_uint64 nunits_out = TYPE_VECTOR_SUBPARTS (vectype);
4583 gimple *vec_stmt = STMT_VINFO_VEC_STMT (stmt_info)->stmt;
4584 gcc_assert (gimple_assign_rhs_code (vec_stmt) == VEC_COND_EXPR);
4586 int scalar_precision
4587 = GET_MODE_PRECISION (SCALAR_TYPE_MODE (TREE_TYPE (vectype)));
4588 tree cr_index_scalar_type = make_unsigned_type (scalar_precision);
4589 tree cr_index_vector_type = build_vector_type
4590 (cr_index_scalar_type, TYPE_VECTOR_SUBPARTS (vectype));
4592 /* First we create a simple vector induction variable which starts
4593 with the values {1,2,3,...} (SERIES_VECT) and increments by the
4594 vector size (STEP). */
4596 /* Create a {1,2,3,...} vector. */
4597 tree series_vect = build_index_vector (cr_index_vector_type, 1, 1);
4599 /* Create a vector of the step value. */
4600 tree step = build_int_cst (cr_index_scalar_type, nunits_out);
4601 tree vec_step = build_vector_from_val (cr_index_vector_type, step);
4603 /* Create an induction variable. */
4604 gimple_stmt_iterator incr_gsi;
4605 bool insert_after;
4606 standard_iv_increment_position (loop, &incr_gsi, &insert_after);
4607 create_iv (series_vect, vec_step, NULL_TREE, loop, &incr_gsi,
4608 insert_after, &indx_before_incr, &indx_after_incr);
4610 /* Next create a new phi node vector (NEW_PHI_TREE) which starts
4611 filled with zeros (VEC_ZERO). */
4613 /* Create a vector of 0s. */
4614 tree zero = build_zero_cst (cr_index_scalar_type);
4615 tree vec_zero = build_vector_from_val (cr_index_vector_type, zero);
4617 /* Create a vector phi node. */
4618 tree new_phi_tree = make_ssa_name (cr_index_vector_type);
4619 new_phi = create_phi_node (new_phi_tree, loop->header);
4620 loop_vinfo->add_stmt (new_phi);
4621 add_phi_arg (as_a <gphi *> (new_phi), vec_zero,
4622 loop_preheader_edge (loop), UNKNOWN_LOCATION);
4624 /* Now take the condition from the loop's original cond_expr
4625 (VEC_STMT) and produce a new cond_expr (INDEX_COND_EXPR) which for
4626 every match uses values from the induction variable
4627 (INDEX_BEFORE_INCR) otherwise uses values from the phi node
4628 (NEW_PHI_TREE).
4629 Finally, we update the phi (NEW_PHI_TREE) to take the value of
4630 the new cond_expr (INDEX_COND_EXPR). */
4632 /* Duplicate the condition from vec_stmt. */
4633 tree ccompare = unshare_expr (gimple_assign_rhs1 (vec_stmt));
4635 /* Create a conditional, where the condition is taken from vec_stmt
4636 (CCOMPARE), the "then" value is the induction index (INDEX_BEFORE_INCR)
4637 and the "else" value is the phi (NEW_PHI_TREE). */
4638 tree index_cond_expr = build3 (VEC_COND_EXPR, cr_index_vector_type,
4639 ccompare, indx_before_incr,
4640 new_phi_tree);
4641 induction_index = make_ssa_name (cr_index_vector_type);
4642 gimple *index_condition = gimple_build_assign (induction_index,
4643 index_cond_expr);
4644 gsi_insert_before (&incr_gsi, index_condition, GSI_SAME_STMT);
4645 stmt_vec_info index_vec_info = loop_vinfo->add_stmt (index_condition);
4646 STMT_VINFO_VECTYPE (index_vec_info) = cr_index_vector_type;
4648 /* Update the phi with the vec cond. */
4649 add_phi_arg (as_a <gphi *> (new_phi), induction_index,
4650 loop_latch_edge (loop), UNKNOWN_LOCATION);
4653 /* 2. Create epilog code.
4654 The reduction epilog code operates across the elements of the vector
4655 of partial results computed by the vectorized loop.
4656 The reduction epilog code consists of:
4658 step 1: compute the scalar result in a vector (v_out2)
4659 step 2: extract the scalar result (s_out3) from the vector (v_out2)
4660 step 3: adjust the scalar result (s_out3) if needed.
4662 Step 1 can be accomplished using one of the following three schemes:
4663 (scheme 1) using reduc_fn, if available.
4664 (scheme 2) using whole-vector shifts, if available.
4665 (scheme 3) using a scalar loop. In this case steps 1+2 above are
4666 combined.
4668 The overall epilog code looks like this:
4670 s_out0 = phi <s_loop> # original EXIT_PHI
4671 v_out1 = phi <VECT_DEF> # NEW_EXIT_PHI
4672 v_out2 = reduce <v_out1> # step 1
4673 s_out3 = extract_field <v_out2, 0> # step 2
4674 s_out4 = adjust_result <s_out3> # step 3
4676 (step 3 is optional, and steps 1 and 2 may be combined).
4677 Lastly, the uses of s_out0 are replaced by s_out4. */
4680 /* 2.1 Create new loop-exit-phis to preserve loop-closed form:
4681 v_out1 = phi <VECT_DEF>
4682 Store them in NEW_PHIS. */
4684 exit_bb = single_exit (loop)->dest;
4685 prev_phi_info = NULL;
4686 new_phis.create (vect_defs.length ());
4687 FOR_EACH_VEC_ELT (vect_defs, i, def)
4689 for (j = 0; j < ncopies; j++)
4691 tree new_def = copy_ssa_name (def);
4692 phi = create_phi_node (new_def, exit_bb);
4693 stmt_vec_info phi_info = loop_vinfo->add_stmt (phi);
4694 if (j == 0)
4695 new_phis.quick_push (phi);
4696 else
4698 def = vect_get_vec_def_for_stmt_copy (loop_vinfo, def);
4699 STMT_VINFO_RELATED_STMT (prev_phi_info) = phi_info;
4702 SET_PHI_ARG_DEF (phi, single_exit (loop)->dest_idx, def);
4703 prev_phi_info = phi_info;
4707 /* The epilogue is created for the outer-loop, i.e., for the loop being
4708 vectorized. Create exit phis for the outer loop. */
4709 if (double_reduc)
4711 loop = outer_loop;
4712 exit_bb = single_exit (loop)->dest;
4713 inner_phis.create (vect_defs.length ());
4714 FOR_EACH_VEC_ELT (new_phis, i, phi)
4716 stmt_vec_info phi_info = loop_vinfo->lookup_stmt (phi);
4717 tree new_result = copy_ssa_name (PHI_RESULT (phi));
4718 gphi *outer_phi = create_phi_node (new_result, exit_bb);
4719 SET_PHI_ARG_DEF (outer_phi, single_exit (loop)->dest_idx,
4720 PHI_RESULT (phi));
4721 prev_phi_info = loop_vinfo->add_stmt (outer_phi);
4722 inner_phis.quick_push (phi_info);
4723 new_phis[i] = outer_phi;
4724 while (STMT_VINFO_RELATED_STMT (phi_info))
4726 phi_info = STMT_VINFO_RELATED_STMT (phi_info);
4727 new_result = copy_ssa_name (PHI_RESULT (phi_info->stmt));
4728 outer_phi = create_phi_node (new_result, exit_bb);
4729 SET_PHI_ARG_DEF (outer_phi, single_exit (loop)->dest_idx,
4730 PHI_RESULT (phi_info->stmt));
4731 stmt_vec_info outer_phi_info = loop_vinfo->add_stmt (outer_phi);
4732 STMT_VINFO_RELATED_STMT (prev_phi_info) = outer_phi_info;
4733 prev_phi_info = outer_phi_info;
4738 exit_gsi = gsi_after_labels (exit_bb);
4740 /* 2.2 Get the relevant tree-code to use in the epilog for schemes 2,3
4741 (i.e. when reduc_fn is not available) and in the final adjustment
4742 code (if needed). Also get the original scalar reduction variable as
4743 defined in the loop. In case STMT is a "pattern-stmt" (i.e. - it
4744 represents a reduction pattern), the tree-code and scalar-def are
4745 taken from the original stmt that the pattern-stmt (STMT) replaces.
4746 Otherwise (it is a regular reduction) - the tree-code and scalar-def
4747 are taken from STMT. */
4749 stmt_vec_info orig_stmt_info = STMT_VINFO_RELATED_STMT (stmt_info);
4750 if (!orig_stmt_info)
4752 /* Regular reduction */
4753 orig_stmt_info = stmt_info;
4755 else
4757 /* Reduction pattern */
4758 gcc_assert (STMT_VINFO_IN_PATTERN_P (orig_stmt_info));
4759 gcc_assert (STMT_VINFO_RELATED_STMT (orig_stmt_info) == stmt_info);
4762 code = gimple_assign_rhs_code (orig_stmt_info->stmt);
4763 /* For MINUS_EXPR the initial vector is [init_val,0,...,0], therefore,
4764 partial results are added and not subtracted. */
4765 if (code == MINUS_EXPR)
4766 code = PLUS_EXPR;
4768 scalar_dest = gimple_assign_lhs (orig_stmt_info->stmt);
4769 scalar_type = TREE_TYPE (scalar_dest);
4770 scalar_results.create (group_size);
4771 new_scalar_dest = vect_create_destination_var (scalar_dest, NULL);
4772 bitsize = TYPE_SIZE (scalar_type);
4774 /* In case this is a reduction in an inner-loop while vectorizing an outer
4775 loop - we don't need to extract a single scalar result at the end of the
4776 inner-loop (unless it is double reduction, i.e., the use of reduction is
4777 outside the outer-loop). The final vector of partial results will be used
4778 in the vectorized outer-loop, or reduced to a scalar result at the end of
4779 the outer-loop. */
4780 if (nested_in_vect_loop && !double_reduc)
4781 goto vect_finalize_reduction;
4783 /* SLP reduction without reduction chain, e.g.,
4784 # a1 = phi <a2, a0>
4785 # b1 = phi <b2, b0>
4786 a2 = operation (a1)
4787 b2 = operation (b1) */
4788 slp_reduc = (slp_node && !REDUC_GROUP_FIRST_ELEMENT (stmt_info));
4790 /* True if we should implement SLP_REDUC using native reduction operations
4791 instead of scalar operations. */
4792 direct_slp_reduc = (reduc_fn != IFN_LAST
4793 && slp_reduc
4794 && !TYPE_VECTOR_SUBPARTS (vectype).is_constant ());
4796 /* In case of reduction chain, e.g.,
4797 # a1 = phi <a3, a0>
4798 a2 = operation (a1)
4799 a3 = operation (a2),
4801 we may end up with more than one vector result. Here we reduce them to
4802 one vector. */
4803 if (REDUC_GROUP_FIRST_ELEMENT (stmt_info) || direct_slp_reduc)
4805 tree first_vect = PHI_RESULT (new_phis[0]);
4806 gassign *new_vec_stmt = NULL;
4807 vec_dest = vect_create_destination_var (scalar_dest, vectype);
4808 for (k = 1; k < new_phis.length (); k++)
4810 gimple *next_phi = new_phis[k];
4811 tree second_vect = PHI_RESULT (next_phi);
4812 tree tem = make_ssa_name (vec_dest, new_vec_stmt);
4813 new_vec_stmt = gimple_build_assign (tem, code,
4814 first_vect, second_vect);
4815 gsi_insert_before (&exit_gsi, new_vec_stmt, GSI_SAME_STMT);
4816 first_vect = tem;
4819 new_phi_result = first_vect;
4820 if (new_vec_stmt)
4822 new_phis.truncate (0);
4823 new_phis.safe_push (new_vec_stmt);
4826 /* Likewise if we couldn't use a single defuse cycle. */
4827 else if (ncopies > 1)
4829 gcc_assert (new_phis.length () == 1);
4830 tree first_vect = PHI_RESULT (new_phis[0]);
4831 gassign *new_vec_stmt = NULL;
4832 vec_dest = vect_create_destination_var (scalar_dest, vectype);
4833 stmt_vec_info next_phi_info = loop_vinfo->lookup_stmt (new_phis[0]);
4834 for (int k = 1; k < ncopies; ++k)
4836 next_phi_info = STMT_VINFO_RELATED_STMT (next_phi_info);
4837 tree second_vect = PHI_RESULT (next_phi_info->stmt);
4838 tree tem = make_ssa_name (vec_dest, new_vec_stmt);
4839 new_vec_stmt = gimple_build_assign (tem, code,
4840 first_vect, second_vect);
4841 gsi_insert_before (&exit_gsi, new_vec_stmt, GSI_SAME_STMT);
4842 first_vect = tem;
4844 new_phi_result = first_vect;
4845 new_phis.truncate (0);
4846 new_phis.safe_push (new_vec_stmt);
4848 else
4849 new_phi_result = PHI_RESULT (new_phis[0]);
4851 if (STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info) == COND_REDUCTION
4852 && reduc_fn != IFN_LAST)
4854 /* For condition reductions, we have a vector (NEW_PHI_RESULT) containing
4855 various data values where the condition matched and another vector
4856 (INDUCTION_INDEX) containing all the indexes of those matches. We
4857 need to extract the last matching index (which will be the index with
4858 highest value) and use this to index into the data vector.
4859 For the case where there were no matches, the data vector will contain
4860 all default values and the index vector will be all zeros. */
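/* For instance, with INDUCTION_INDEX = {0, 6, 3, 0} and NEW_PHI_RESULT
   = {d0, d1, d2, d3}: the max index is 6, the comparison below selects
   lane 1 only, the VEC_COND yields {0, d1, 0, 0}, and the final unsigned
   MAX reduction leaves d1 as the scalar result. */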
4862 /* Get various versions of the type of the vector of indexes. */
4863 tree index_vec_type = TREE_TYPE (induction_index);
4864 gcc_checking_assert (TYPE_UNSIGNED (index_vec_type));
4865 tree index_scalar_type = TREE_TYPE (index_vec_type);
4866 tree index_vec_cmp_type = build_same_sized_truth_vector_type
4867 (index_vec_type);
4869 /* Get an unsigned integer version of the type of the data vector. */
4870 int scalar_precision
4871 = GET_MODE_PRECISION (SCALAR_TYPE_MODE (scalar_type));
4872 tree scalar_type_unsigned = make_unsigned_type (scalar_precision);
4873 tree vectype_unsigned = build_vector_type
4874 (scalar_type_unsigned, TYPE_VECTOR_SUBPARTS (vectype));
4876 /* First we need to create a vector (ZERO_VEC) of zeros and another
4877 vector (MAX_INDEX_VEC) filled with the last matching index, which we
4878 can create using a MAX reduction and then expanding.
4879 In the case where the loop never made any matches, the max index will
4880 be zero. */
4882 /* Vector of {0, 0, 0,...}. */
4883 tree zero_vec = make_ssa_name (vectype);
4884 tree zero_vec_rhs = build_zero_cst (vectype);
4885 gimple *zero_vec_stmt = gimple_build_assign (zero_vec, zero_vec_rhs);
4886 gsi_insert_before (&exit_gsi, zero_vec_stmt, GSI_SAME_STMT);
4888 /* Find maximum value from the vector of found indexes. */
4889 tree max_index = make_ssa_name (index_scalar_type);
4890 gcall *max_index_stmt = gimple_build_call_internal (IFN_REDUC_MAX,
4891 1, induction_index);
4892 gimple_call_set_lhs (max_index_stmt, max_index);
4893 gsi_insert_before (&exit_gsi, max_index_stmt, GSI_SAME_STMT);
4895 /* Vector of {max_index, max_index, max_index,...}. */
4896 tree max_index_vec = make_ssa_name (index_vec_type);
4897 tree max_index_vec_rhs = build_vector_from_val (index_vec_type,
4898 max_index);
4899 gimple *max_index_vec_stmt = gimple_build_assign (max_index_vec,
4900 max_index_vec_rhs);
4901 gsi_insert_before (&exit_gsi, max_index_vec_stmt, GSI_SAME_STMT);
4903 /* Next we compare the new vector (MAX_INDEX_VEC) full of max indexes
4904 with the vector (INDUCTION_INDEX) of found indexes, choosing values
4905 from the data vector (NEW_PHI_RESULT) for matches, 0 (ZERO_VEC)
4906 otherwise. Only one value should match, resulting in a vector
4907 (VEC_COND) with one data value and the rest zeros.
4908 In the case where the loop never made any matches, every index will
4909 match, resulting in a vector with all data values (which will all be
4910 the default value). */
4912 /* Compare the max index vector to the vector of found indexes to find
4913 the position of the max value. */
4914 tree vec_compare = make_ssa_name (index_vec_cmp_type);
4915 gimple *vec_compare_stmt = gimple_build_assign (vec_compare, EQ_EXPR,
4916 induction_index,
4917 max_index_vec);
4918 gsi_insert_before (&exit_gsi, vec_compare_stmt, GSI_SAME_STMT);
4920 /* Use the compare to choose either values from the data vector or
4921 zero. */
4922 tree vec_cond = make_ssa_name (vectype);
4923 gimple *vec_cond_stmt = gimple_build_assign (vec_cond, VEC_COND_EXPR,
4924 vec_compare, new_phi_result,
4925 zero_vec);
4926 gsi_insert_before (&exit_gsi, vec_cond_stmt, GSI_SAME_STMT);
4928 /* Finally we need to extract the data value from the vector (VEC_COND)
4929 into a scalar (MATCHED_DATA_REDUC). Logically we want to do an OR
4930 reduction, but because this doesn't exist, we can use a MAX reduction
4931 instead. The data value might be signed or a float so we need to cast
4932 it first.
4933 In the case where the loop never made any matches, the data values are
4934 all identical, and so will reduce down correctly. */
4936 /* Make the matched data values unsigned. */
4937 tree vec_cond_cast = make_ssa_name (vectype_unsigned);
4938 tree vec_cond_cast_rhs = build1 (VIEW_CONVERT_EXPR, vectype_unsigned,
4939 vec_cond);
4940 gimple *vec_cond_cast_stmt = gimple_build_assign (vec_cond_cast,
4941 VIEW_CONVERT_EXPR,
4942 vec_cond_cast_rhs);
4943 gsi_insert_before (&exit_gsi, vec_cond_cast_stmt, GSI_SAME_STMT);
4945 /* Reduce down to a scalar value. */
4946 tree data_reduc = make_ssa_name (scalar_type_unsigned);
4947 gcall *data_reduc_stmt = gimple_build_call_internal (IFN_REDUC_MAX,
4948 1, vec_cond_cast);
4949 gimple_call_set_lhs (data_reduc_stmt, data_reduc);
4950 gsi_insert_before (&exit_gsi, data_reduc_stmt, GSI_SAME_STMT);
4952 /* Convert the reduced value back to the result type and set as the
4953 result. */
4954 gimple_seq stmts = NULL;
4955 new_temp = gimple_build (&stmts, VIEW_CONVERT_EXPR, scalar_type,
4956 data_reduc);
4957 gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
4958 scalar_results.safe_push (new_temp);
4960 else if (STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info) == COND_REDUCTION
4961 && reduc_fn == IFN_LAST)
4963 /* Condition reduction without supported IFN_REDUC_MAX. Generate
4964 idx = 0;
4965 idx_val = induction_index[0];
4966 val = data_reduc[0];
4967 for (idx = 0, val = init, i = 0; i < nelts; ++i)
4968 if (induction_index[i] > idx_val)
4969 val = data_reduc[i], idx_val = induction_index[i];
4970 return val; */
4972 tree data_eltype = TREE_TYPE (TREE_TYPE (new_phi_result));
4973 tree idx_eltype = TREE_TYPE (TREE_TYPE (induction_index));
4974 unsigned HOST_WIDE_INT el_size = tree_to_uhwi (TYPE_SIZE (idx_eltype));
4975 poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (TREE_TYPE (induction_index));
4976 /* Enforced by vectorizable_reduction, which ensures we have target
4977 support before allowing a conditional reduction on variable-length
4978 vectors. */
4979 unsigned HOST_WIDE_INT v_size = el_size * nunits.to_constant ();
4980 tree idx_val = NULL_TREE, val = NULL_TREE;
4981 for (unsigned HOST_WIDE_INT off = 0; off < v_size; off += el_size)
4983 tree old_idx_val = idx_val;
4984 tree old_val = val;
4985 idx_val = make_ssa_name (idx_eltype);
4986 epilog_stmt = gimple_build_assign (idx_val, BIT_FIELD_REF,
4987 build3 (BIT_FIELD_REF, idx_eltype,
4988 induction_index,
4989 bitsize_int (el_size),
4990 bitsize_int (off)));
4991 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
4992 val = make_ssa_name (data_eltype);
4993 epilog_stmt = gimple_build_assign (val, BIT_FIELD_REF,
4994 build3 (BIT_FIELD_REF,
4995 data_eltype,
4996 new_phi_result,
4997 bitsize_int (el_size),
4998 bitsize_int (off)));
4999 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5000 if (off != 0)
5002 tree new_idx_val = idx_val;
5003 tree new_val = val;
5004 if (off != v_size - el_size)
5006 new_idx_val = make_ssa_name (idx_eltype);
5007 epilog_stmt = gimple_build_assign (new_idx_val,
5008 MAX_EXPR, idx_val,
5009 old_idx_val);
5010 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5012 new_val = make_ssa_name (data_eltype);
5013 epilog_stmt = gimple_build_assign (new_val,
5014 COND_EXPR,
5015 build2 (GT_EXPR,
5016 boolean_type_node,
5017 idx_val,
5018 old_idx_val),
5019 val, old_val);
5020 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5021 idx_val = new_idx_val;
5022 val = new_val;
5025 /* Convert the reduced value back to the result type and set as the
5026 result. */
5027 gimple_seq stmts = NULL;
5028 val = gimple_convert (&stmts, scalar_type, val);
5029 gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
5030 scalar_results.safe_push (val);
5033 /* 2.3 Create the reduction code, using one of the three schemes described
5034 above. In SLP we simply need to extract all the elements from the
5035 vector (without reducing them), so we use scalar shifts. */
5036 else if (reduc_fn != IFN_LAST && !slp_reduc)
5038 tree tmp;
5039 tree vec_elem_type;
5041 /* Case 1: Create:
5042 v_out2 = reduc_expr <v_out1> */
5044 if (dump_enabled_p ())
5045 dump_printf_loc (MSG_NOTE, vect_location,
5046 "Reduce using direct vector reduction.\n");
5048 vec_elem_type = TREE_TYPE (TREE_TYPE (new_phi_result));
5049 if (!useless_type_conversion_p (scalar_type, vec_elem_type))
5051 tree tmp_dest
5052 = vect_create_destination_var (scalar_dest, vec_elem_type);
5053 epilog_stmt = gimple_build_call_internal (reduc_fn, 1,
5054 new_phi_result);
5055 gimple_set_lhs (epilog_stmt, tmp_dest);
5056 new_temp = make_ssa_name (tmp_dest, epilog_stmt);
5057 gimple_set_lhs (epilog_stmt, new_temp);
5058 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5060 epilog_stmt = gimple_build_assign (new_scalar_dest, NOP_EXPR,
5061 new_temp);
5063 else
5065 epilog_stmt = gimple_build_call_internal (reduc_fn, 1,
5066 new_phi_result);
5067 gimple_set_lhs (epilog_stmt, new_scalar_dest);
5070 new_temp = make_ssa_name (new_scalar_dest, epilog_stmt);
5071 gimple_set_lhs (epilog_stmt, new_temp);
5072 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5074 if ((STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info)
5075 == INTEGER_INDUC_COND_REDUCTION)
5076 && !operand_equal_p (initial_def, induc_val, 0))
5078 /* Earlier we set the initial value to be a vector of induc_val
5079 values. Check the result and if it is induc_val then replace
5080 with the original initial value, unless induc_val is
5081 the same as initial_def already. */
5082 tree zcompare = build2 (EQ_EXPR, boolean_type_node, new_temp,
5083 induc_val);
5085 tmp = make_ssa_name (new_scalar_dest);
5086 epilog_stmt = gimple_build_assign (tmp, COND_EXPR, zcompare,
5087 initial_def, new_temp);
5088 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5089 new_temp = tmp;
5092 scalar_results.safe_push (new_temp);
5094 else if (direct_slp_reduc)
5096 /* Here we create one vector for each of the REDUC_GROUP_SIZE results,
5097 with the elements for other SLP statements replaced with the
5098 neutral value. We can then do a normal reduction on each vector. */
5100 /* Enforced by vectorizable_reduction. */
5101 gcc_assert (new_phis.length () == 1);
5102 gcc_assert (pow2p_hwi (group_size));
5104 slp_tree orig_phis_slp_node = slp_node_instance->reduc_phis;
5105 vec<stmt_vec_info> orig_phis
5106 = SLP_TREE_SCALAR_STMTS (orig_phis_slp_node);
5107 gimple_seq seq = NULL;
5109 /* Build a vector {0, 1, 2, ...}, with the same number of elements
5110 and the same element size as VECTYPE. */
5111 tree index = build_index_vector (vectype, 0, 1);
5112 tree index_type = TREE_TYPE (index);
5113 tree index_elt_type = TREE_TYPE (index_type);
5114 tree mask_type = build_same_sized_truth_vector_type (index_type);
5116 /* Create a vector that, for each element, identifies which of
5117 the REDUC_GROUP_SIZE results should use it. */
5118 tree index_mask = build_int_cst (index_elt_type, group_size - 1);
5119 index = gimple_build (&seq, BIT_AND_EXPR, index_type, index,
5120 build_vector_from_val (index_type, index_mask));
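/* For instance, with REDUC_GROUP_SIZE = 2 and eight lanes the masked
   index vector is {0, 1, 0, 1, 0, 1, 0, 1}: result 0 keeps the even
   lanes of NEW_PHI_RESULT and result 1 the odd lanes, the other lanes
   being replaced by the neutral value before each full-vector
   reduction below. */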
5122 /* Get a neutral vector value. This is simply a splat of the neutral
5123 scalar value if we have one, otherwise the initial scalar value
5124 is itself a neutral value. */
5125 tree vector_identity = NULL_TREE;
5126 if (neutral_op)
5127 vector_identity = gimple_build_vector_from_val (&seq, vectype,
5128 neutral_op);
5129 for (unsigned int i = 0; i < group_size; ++i)
5131 /* If there's no universal neutral value, we can use the
5132 initial scalar value from the original PHI. This is used
5133 for MIN and MAX reduction, for example. */
5134 if (!neutral_op)
5136 tree scalar_value
5137 = PHI_ARG_DEF_FROM_EDGE (orig_phis[i]->stmt,
5138 loop_preheader_edge (loop));
5139 vector_identity = gimple_build_vector_from_val (&seq, vectype,
5140 scalar_value);
5143 /* Calculate the equivalent of:
5145 sel[j] = (index[j] == i);
5147 which selects the elements of NEW_PHI_RESULT that should
5148 be included in the result. */
5149 tree compare_val = build_int_cst (index_elt_type, i);
5150 compare_val = build_vector_from_val (index_type, compare_val);
5151 tree sel = gimple_build (&seq, EQ_EXPR, mask_type,
5152 index, compare_val);
5154 /* Calculate the equivalent of:
5156 vec = sel ? new_phi_result : vector_identity;
5158 VEC is now suitable for a full vector reduction. */
5159 tree vec = gimple_build (&seq, VEC_COND_EXPR, vectype,
5160 sel, new_phi_result, vector_identity);
5162 /* Do the reduction and convert it to the appropriate type. */
5163 tree scalar = gimple_build (&seq, as_combined_fn (reduc_fn),
5164 TREE_TYPE (vectype), vec);
5165 scalar = gimple_convert (&seq, scalar_type, scalar);
5166 scalar_results.safe_push (scalar);
5168 gsi_insert_seq_before (&exit_gsi, seq, GSI_SAME_STMT);
5170 else
5172 bool reduce_with_shift;
5173 tree vec_temp;
5175 /* COND reductions all do the final reduction with MAX_EXPR
5176 or MIN_EXPR. */
5177 if (code == COND_EXPR)
5179 if (STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info)
5180 == INTEGER_INDUC_COND_REDUCTION)
5181 code = induc_code;
5182 else
5183 code = MAX_EXPR;
5186 /* See if the target wants to do the final (shift) reduction
5187 in a vector mode of smaller size and first reduce upper/lower
5188 halves against each other. */
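/* For instance, on a target whose split_reduction hook maps, say,
   V8SImode down to V4SImode, the code below first extracts the low and
   high V4SI halves of the V8SI accumulator and combines them with CODE,
   so the subsequent shift reduction only has to handle four elements.
   (The mode names are purely illustrative.) */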
5189 enum machine_mode mode1 = mode;
5190 tree vectype1 = vectype;
5191 unsigned sz = tree_to_uhwi (TYPE_SIZE_UNIT (vectype));
5192 unsigned sz1 = sz;
5193 if (!slp_reduc
5194 && (mode1 = targetm.vectorize.split_reduction (mode)) != mode)
5195 sz1 = GET_MODE_SIZE (mode1).to_constant ();
5197 vectype1 = get_vectype_for_scalar_type_and_size (scalar_type, sz1);
5198 reduce_with_shift = have_whole_vector_shift (mode1);
5199 if (!VECTOR_MODE_P (mode1))
5200 reduce_with_shift = false;
5201 else
5203 optab optab = optab_for_tree_code (code, vectype1, optab_default);
5204 if (optab_handler (optab, mode1) == CODE_FOR_nothing)
5205 reduce_with_shift = false;
5208 /* First reduce the vector to the vector size we want to do the shift
5209 reduction on, by combining upper and lower halves. */
5210 new_temp = new_phi_result;
5211 while (sz > sz1)
5213 gcc_assert (!slp_reduc);
5214 sz /= 2;
5215 vectype1 = get_vectype_for_scalar_type_and_size (scalar_type, sz);
5217 /* The target has to make sure we support lowpart/highpart
5218 extraction, either via direct vector extract or through
5219 integer mode punning. */
5220 tree dst1, dst2;
5221 if (convert_optab_handler (vec_extract_optab,
5222 TYPE_MODE (TREE_TYPE (new_temp)),
5223 TYPE_MODE (vectype1))
5224 != CODE_FOR_nothing)
5226 /* Extract sub-vectors directly once vec_extract becomes
5227 a conversion optab. */
5228 dst1 = make_ssa_name (vectype1);
5229 epilog_stmt
5230 = gimple_build_assign (dst1, BIT_FIELD_REF,
5231 build3 (BIT_FIELD_REF, vectype1,
5232 new_temp, TYPE_SIZE (vectype1),
5233 bitsize_int (0)));
5234 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5235 dst2 = make_ssa_name (vectype1);
5236 epilog_stmt
5237 = gimple_build_assign (dst2, BIT_FIELD_REF,
5238 build3 (BIT_FIELD_REF, vectype1,
5239 new_temp, TYPE_SIZE (vectype1),
5240 bitsize_int (sz * BITS_PER_UNIT)));
5241 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5243 else
5245 /* Extract via punning to appropriately sized integer mode
5246 vector. */
5247 tree eltype = build_nonstandard_integer_type (sz * BITS_PER_UNIT, 1);
5249 tree etype = build_vector_type (eltype, 2);
5250 gcc_assert (convert_optab_handler (vec_extract_optab,
5251 TYPE_MODE (etype),
5252 TYPE_MODE (eltype))
5253 != CODE_FOR_nothing);
5254 tree tem = make_ssa_name (etype);
5255 epilog_stmt = gimple_build_assign (tem, VIEW_CONVERT_EXPR,
5256 build1 (VIEW_CONVERT_EXPR,
5257 etype, new_temp));
5258 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5259 new_temp = tem;
5260 tem = make_ssa_name (eltype);
5261 epilog_stmt
5262 = gimple_build_assign (tem, BIT_FIELD_REF,
5263 build3 (BIT_FIELD_REF, eltype,
5264 new_temp, TYPE_SIZE (eltype),
5265 bitsize_int (0)));
5266 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5267 dst1 = make_ssa_name (vectype1);
5268 epilog_stmt = gimple_build_assign (dst1, VIEW_CONVERT_EXPR,
5269 build1 (VIEW_CONVERT_EXPR,
5270 vectype1, tem));
5271 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5272 tem = make_ssa_name (eltype);
5273 epilog_stmt
5274 = gimple_build_assign (tem, BIT_FIELD_REF,
5275 build3 (BIT_FIELD_REF, eltype,
5276 new_temp, TYPE_SIZE (eltype),
5277 bitsize_int (sz * BITS_PER_UNIT)));
5278 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5279 dst2 = make_ssa_name (vectype1);
5280 epilog_stmt = gimple_build_assign (dst2, VIEW_CONVERT_EXPR,
5281 build1 (VIEW_CONVERT_EXPR,
5282 vectype1, tem));
5283 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5286 new_temp = make_ssa_name (vectype1);
5287 epilog_stmt = gimple_build_assign (new_temp, code, dst1, dst2);
5288 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5291 if (reduce_with_shift && !slp_reduc)
5293 int element_bitsize = tree_to_uhwi (bitsize);
5294 /* Enforced by vectorizable_reduction, which disallows SLP reductions
5295 for variable-length vectors and also requires direct target support
5296 for loop reductions. */
5297 int vec_size_in_bits = tree_to_uhwi (TYPE_SIZE (vectype1));
5298 int nelements = vec_size_in_bits / element_bitsize;
5299 vec_perm_builder sel;
5300 vec_perm_indices indices;
5302 int elt_offset;
5304 tree zero_vec = build_zero_cst (vectype1);
5305 /* Case 2: Create:
5306 for (offset = nelements/2; offset >= 1; offset/=2)
5307 {
5308 Create: va' = vec_shift <va, offset>
5309 Create: va = vop <va, va'>
5310 } */
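   /* As an illustration, for a four-element vector <a, b, c, d> and a
      PLUS reduction the loop below emits roughly

        t  = VEC_PERM <va, {0,...}, {2, 3, 4, 5}>    (shift down two lanes)
        va = va + t                                  ( <a+c, b+d, 0, 0> )
        t  = VEC_PERM <va, {0,...}, {1, 2, 3, 4}>    (shift down one lane)
        va = va + t                                  (lane 0 = a+b+c+d)

      so the scalar result is then taken from element zero.  */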
5312 tree rhs;
5314 if (dump_enabled_p ())
5315 dump_printf_loc (MSG_NOTE, vect_location,
5316 "Reduce using vector shifts\n");
5318 mode1 = TYPE_MODE (vectype1);
5319 vec_dest = vect_create_destination_var (scalar_dest, vectype1);
5320 for (elt_offset = nelements / 2;
5321 elt_offset >= 1;
5322 elt_offset /= 2)
5324 calc_vec_perm_mask_for_shift (elt_offset, nelements, &sel);
5325 indices.new_vector (sel, 2, nelements);
5326 tree mask = vect_gen_perm_mask_any (vectype1, indices);
5327 epilog_stmt = gimple_build_assign (vec_dest, VEC_PERM_EXPR,
5328 new_temp, zero_vec, mask);
5329 new_name = make_ssa_name (vec_dest, epilog_stmt);
5330 gimple_assign_set_lhs (epilog_stmt, new_name);
5331 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5333 epilog_stmt = gimple_build_assign (vec_dest, code, new_name,
5334 new_temp);
5335 new_temp = make_ssa_name (vec_dest, epilog_stmt);
5336 gimple_assign_set_lhs (epilog_stmt, new_temp);
5337 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5340 /* 2.4 Extract the final scalar result. Create:
5341 s_out3 = extract_field <v_out2, bitpos> */
5343 if (dump_enabled_p ())
5344 dump_printf_loc (MSG_NOTE, vect_location,
5345 "extract scalar result\n");
5347 rhs = build3 (BIT_FIELD_REF, scalar_type, new_temp,
5348 bitsize, bitsize_zero_node);
5349 epilog_stmt = gimple_build_assign (new_scalar_dest, rhs);
5350 new_temp = make_ssa_name (new_scalar_dest, epilog_stmt);
5351 gimple_assign_set_lhs (epilog_stmt, new_temp);
5352 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5353 scalar_results.safe_push (new_temp);
5355 else
5357 /* Case 3: Create:
5358 s = extract_field <v_out2, 0>
5359 for (offset = element_size;
5360 offset < vector_size;
5361 offset += element_size;)
5362 {
5363 Create: s' = extract_field <v_out2, offset>
5364 Create: s = op <s, s'> // For non SLP cases
5365 } */
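   /* For instance, with 32-bit elements and a four-element vector this
      emits roughly (names illustrative)

        s  = BIT_FIELD_REF <v_out2, 32, 0>
        s' = BIT_FIELD_REF <v_out2, 32, 32>
        s  = s op s'
        s' = BIT_FIELD_REF <v_out2, 32, 64>
        s  = s op s'
        s' = BIT_FIELD_REF <v_out2, 32, 96>
        s  = s op s'

      whereas in the SLP case the extracted values are only collected in
      SCALAR_RESULTS and not combined.  */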
5367 if (dump_enabled_p ())
5368 dump_printf_loc (MSG_NOTE, vect_location,
5369 "Reduce using scalar code.\n");
5371 int vec_size_in_bits = tree_to_uhwi (TYPE_SIZE (vectype1));
5372 int element_bitsize = tree_to_uhwi (bitsize);
5373 FOR_EACH_VEC_ELT (new_phis, i, new_phi)
5375 int bit_offset;
5376 if (gimple_code (new_phi) == GIMPLE_PHI)
5377 vec_temp = PHI_RESULT (new_phi);
5378 else
5379 vec_temp = gimple_assign_lhs (new_phi);
5380 tree rhs = build3 (BIT_FIELD_REF, scalar_type, vec_temp, bitsize,
5381 bitsize_zero_node);
5382 epilog_stmt = gimple_build_assign (new_scalar_dest, rhs);
5383 new_temp = make_ssa_name (new_scalar_dest, epilog_stmt);
5384 gimple_assign_set_lhs (epilog_stmt, new_temp);
5385 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5387 /* In SLP we don't need to apply the reduction operation, so we just
5388 collect s' values in SCALAR_RESULTS. */
5389 if (slp_reduc)
5390 scalar_results.safe_push (new_temp);
5392 for (bit_offset = element_bitsize;
5393 bit_offset < vec_size_in_bits;
5394 bit_offset += element_bitsize)
5396 tree bitpos = bitsize_int (bit_offset);
5397 tree rhs = build3 (BIT_FIELD_REF, scalar_type, vec_temp,
5398 bitsize, bitpos);
5400 epilog_stmt = gimple_build_assign (new_scalar_dest, rhs);
5401 new_name = make_ssa_name (new_scalar_dest, epilog_stmt);
5402 gimple_assign_set_lhs (epilog_stmt, new_name);
5403 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5405 if (slp_reduc)
5407 /* In SLP we don't need to apply the reduction operation, so
5408 we just collect s' values in SCALAR_RESULTS. */
5409 new_temp = new_name;
5410 scalar_results.safe_push (new_name);
5412 else
5414 epilog_stmt = gimple_build_assign (new_scalar_dest, code,
5415 new_name, new_temp);
5416 new_temp = make_ssa_name (new_scalar_dest, epilog_stmt);
5417 gimple_assign_set_lhs (epilog_stmt, new_temp);
5418 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5423 /* The only case where we need to reduce scalar results in SLP is
5424 unrolling. If the size of SCALAR_RESULTS is greater than
5425 REDUC_GROUP_SIZE, we reduce them combining elements modulo
5426 REDUC_GROUP_SIZE. */
5427 if (slp_reduc)
5429 tree res, first_res, new_res;
5430 gimple *new_stmt;
5432 /* Reduce multiple scalar results in case of SLP unrolling. */
5433 for (j = group_size; scalar_results.iterate (j, &res);
5434 j++)
5436 first_res = scalar_results[j % group_size];
5437 new_stmt = gimple_build_assign (new_scalar_dest, code,
5438 first_res, res);
5439 new_res = make_ssa_name (new_scalar_dest, new_stmt);
5440 gimple_assign_set_lhs (new_stmt, new_res);
5441 gsi_insert_before (&exit_gsi, new_stmt, GSI_SAME_STMT);
5442 scalar_results[j % group_size] = new_res;
5445 else
5446 /* Not SLP - we have one scalar to keep in SCALAR_RESULTS. */
5447 scalar_results.safe_push (new_temp);
5450 if ((STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info)
5451 == INTEGER_INDUC_COND_REDUCTION)
5452 && !operand_equal_p (initial_def, induc_val, 0))
5454 /* Earlier we set the initial value to be a vector of induc_val
5455 values. Check the result and if it is induc_val then replace
5456 with the original initial value, unless induc_val is
5457 the same as initial_def already. */
5458 tree zcompare = build2 (EQ_EXPR, boolean_type_node, new_temp,
5459 induc_val);
5461 tree tmp = make_ssa_name (new_scalar_dest);
5462 epilog_stmt = gimple_build_assign (tmp, COND_EXPR, zcompare,
5463 initial_def, new_temp);
5464 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5465 scalar_results[0] = tmp;
5469 vect_finalize_reduction:
5471 if (double_reduc)
5472 loop = loop->inner;
5474 /* 2.5 Adjust the final result by the initial value of the reduction
5475 variable. (When such adjustment is not needed, then
5476 'adjustment_def' is zero). For example, if code is PLUS we create:
5477 new_temp = loop_exit_def + adjustment_def */
5479 if (adjustment_def)
5481 gcc_assert (!slp_reduc);
5482 if (nested_in_vect_loop)
5484 new_phi = new_phis[0];
5485 gcc_assert (TREE_CODE (TREE_TYPE (adjustment_def)) == VECTOR_TYPE);
5486 expr = build2 (code, vectype, PHI_RESULT (new_phi), adjustment_def);
5487 new_dest = vect_create_destination_var (scalar_dest, vectype);
5489 else
5491 new_temp = scalar_results[0];
5492 gcc_assert (TREE_CODE (TREE_TYPE (adjustment_def)) != VECTOR_TYPE);
5493 expr = build2 (code, scalar_type, new_temp, adjustment_def);
5494 new_dest = vect_create_destination_var (scalar_dest, scalar_type);
5497 epilog_stmt = gimple_build_assign (new_dest, expr);
5498 new_temp = make_ssa_name (new_dest, epilog_stmt);
5499 gimple_assign_set_lhs (epilog_stmt, new_temp);
5500 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5501 if (nested_in_vect_loop)
5503 stmt_vec_info epilog_stmt_info = loop_vinfo->add_stmt (epilog_stmt);
5504 STMT_VINFO_RELATED_STMT (epilog_stmt_info)
5505 = STMT_VINFO_RELATED_STMT (loop_vinfo->lookup_stmt (new_phi));
5507 if (!double_reduc)
5508 scalar_results.quick_push (new_temp);
5509 else
5510 scalar_results[0] = new_temp;
5512 else
5513 scalar_results[0] = new_temp;
5515 new_phis[0] = epilog_stmt;
5518 /* 2.6 Handle the loop-exit phis. Replace the uses of scalar loop-exit
5519 phis with new adjusted scalar results, i.e., replace use <s_out0>
5520 with use <s_out4>.
5522 Transform:
5523 loop_exit:
5524 s_out0 = phi <s_loop> # (scalar) EXIT_PHI
5525 v_out1 = phi <VECT_DEF> # NEW_EXIT_PHI
5526 v_out2 = reduce <v_out1>
5527 s_out3 = extract_field <v_out2, 0>
5528 s_out4 = adjust_result <s_out3>
5529 use <s_out0>
5530 use <s_out0>
5532 into:
5534 loop_exit:
5535 s_out0 = phi <s_loop> # (scalar) EXIT_PHI
5536 v_out1 = phi <VECT_DEF> # NEW_EXIT_PHI
5537 v_out2 = reduce <v_out1>
5538 s_out3 = extract_field <v_out2, 0>
5539 s_out4 = adjust_result <s_out3>
5540 use <s_out4>
5541 use <s_out4> */
5544 /* In an SLP reduction chain we reduce the vector results into one vector if
5545 necessary; hence we set REDUC_GROUP_SIZE to 1 here. SCALAR_DEST is the
5546 LHS of the last stmt in the reduction chain, since we are looking for
5547 the loop exit phi node. */
5548 if (REDUC_GROUP_FIRST_ELEMENT (stmt_info))
5550 stmt_vec_info dest_stmt_info
5551 = SLP_TREE_SCALAR_STMTS (slp_node)[group_size - 1];
5552 /* Handle reduction patterns. */
5553 if (STMT_VINFO_RELATED_STMT (dest_stmt_info))
5554 dest_stmt_info = STMT_VINFO_RELATED_STMT (dest_stmt_info);
5556 scalar_dest = gimple_assign_lhs (dest_stmt_info->stmt);
5557 group_size = 1;
5560 /* In SLP we may have several statements in NEW_PHIS and REDUCTION_PHIS (in
5561 case that REDUC_GROUP_SIZE is greater than vectorization factor).
5562 Therefore, we need to match SCALAR_RESULTS with corresponding statements.
5563 The first (REDUC_GROUP_SIZE / number of new vector stmts) scalar results
5564 correspond to the first vector stmt, etc.
5565 (RATIO is equal to (REDUC_GROUP_SIZE / number of new vector stmts)). */
5566 if (group_size > new_phis.length ())
5568 ratio = group_size / new_phis.length ();
5569 gcc_assert (!(group_size % new_phis.length ()));
5571 else
5572 ratio = 1;
5574 stmt_vec_info epilog_stmt_info = NULL;
5575 for (k = 0; k < group_size; k++)
5577 if (k % ratio == 0)
5579 epilog_stmt_info = loop_vinfo->lookup_stmt (new_phis[k / ratio]);
5580 reduction_phi_info = reduction_phis[k / ratio];
5581 if (double_reduc)
5582 inner_phi = inner_phis[k / ratio];
5585 if (slp_reduc)
5587 stmt_vec_info scalar_stmt_info = SLP_TREE_SCALAR_STMTS (slp_node)[k];
5589 orig_stmt_info = STMT_VINFO_RELATED_STMT (scalar_stmt_info);
5590 /* SLP statements can't participate in patterns. */
5591 gcc_assert (!orig_stmt_info);
5592 scalar_dest = gimple_assign_lhs (scalar_stmt_info->stmt);
5595 phis.create (3);
5596 /* Find the loop-closed-use at the loop exit of the original scalar
5597 result. (The reduction result is expected to have two immediate uses -
5598 one at the latch block, and one at the loop exit). */
5599 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, scalar_dest)
5600 if (!flow_bb_inside_loop_p (loop, gimple_bb (USE_STMT (use_p)))
5601 && !is_gimple_debug (USE_STMT (use_p)))
5602 phis.safe_push (USE_STMT (use_p));
5604 /* While we expect to have found an exit_phi because of loop-closed-ssa
5605 form we can end up without one if the scalar cycle is dead. */
5607 FOR_EACH_VEC_ELT (phis, i, exit_phi)
5609 if (outer_loop)
5611 stmt_vec_info exit_phi_vinfo
5612 = loop_vinfo->lookup_stmt (exit_phi);
5613 gphi *vect_phi;
5615 /* FORNOW. Currently not supporting the case that an inner-loop
5616 reduction is not used in the outer-loop (but only outside the
5617 outer-loop), unless it is a double reduction. */
5618 gcc_assert ((STMT_VINFO_RELEVANT_P (exit_phi_vinfo)
5619 && !STMT_VINFO_LIVE_P (exit_phi_vinfo))
5620 || double_reduc);
5622 if (double_reduc)
5623 STMT_VINFO_VEC_STMT (exit_phi_vinfo) = inner_phi;
5624 else
5625 STMT_VINFO_VEC_STMT (exit_phi_vinfo) = epilog_stmt_info;
5626 if (!double_reduc
5627 || STMT_VINFO_DEF_TYPE (exit_phi_vinfo)
5628 != vect_double_reduction_def)
5629 continue;
5631 /* Handle double reduction:
5633 stmt1: s1 = phi <s0, s2> - double reduction phi (outer loop)
5634 stmt2: s3 = phi <s1, s4> - (regular) reduc phi (inner loop)
5635 stmt3: s4 = use (s3) - (regular) reduc stmt (inner loop)
5636 stmt4: s2 = phi <s4> - double reduction stmt (outer loop)
5638 At that point the regular reduction (stmt2 and stmt3) is
5639 already vectorized, as well as the exit phi node, stmt4.
5640 Here we vectorize the phi node of double reduction, stmt1, and
5641 update all relevant statements. */
5643 /* Go through all the uses of s2 to find double reduction phi
5644 node, i.e., stmt1 above. */
5645 orig_name = PHI_RESULT (exit_phi);
5646 FOR_EACH_IMM_USE_STMT (use_stmt, imm_iter, orig_name)
5648 stmt_vec_info use_stmt_vinfo;
5649 tree vect_phi_init, preheader_arg, vect_phi_res;
5650 basic_block bb = gimple_bb (use_stmt);
5652 /* Check that USE_STMT is really a double reduction phi
5653 node. */
5654 if (gimple_code (use_stmt) != GIMPLE_PHI
5655 || gimple_phi_num_args (use_stmt) != 2
5656 || bb->loop_father != outer_loop)
5657 continue;
5658 use_stmt_vinfo = loop_vinfo->lookup_stmt (use_stmt);
5659 if (!use_stmt_vinfo
5660 || STMT_VINFO_DEF_TYPE (use_stmt_vinfo)
5661 != vect_double_reduction_def)
5662 continue;
5664 /* Create vector phi node for double reduction:
5665 vs1 = phi <vs0, vs2>
5666 vs1 was created previously in this function by a call to
5667 vect_get_vec_def_for_operand and is stored in
5668 vec_initial_def;
5669 vs2 is defined by INNER_PHI, the vectorized EXIT_PHI;
5670 vs0 is created here. */
5672 /* Create vector phi node. */
5673 vect_phi = create_phi_node (vec_initial_def, bb);
5674 loop_vec_info_for_loop (outer_loop)->add_stmt (vect_phi);
5676 /* Create vs0 - initial def of the double reduction phi. */
5677 preheader_arg = PHI_ARG_DEF_FROM_EDGE (use_stmt,
5678 loop_preheader_edge (outer_loop));
5679 vect_phi_init = get_initial_def_for_reduction
5680 (stmt_info, preheader_arg, NULL);
5682 /* Update phi node arguments with vs0 and vs2. */
5683 add_phi_arg (vect_phi, vect_phi_init,
5684 loop_preheader_edge (outer_loop),
5685 UNKNOWN_LOCATION);
5686 add_phi_arg (vect_phi, PHI_RESULT (inner_phi->stmt),
5687 loop_latch_edge (outer_loop), UNKNOWN_LOCATION);
5688 if (dump_enabled_p ())
5690 dump_printf_loc (MSG_NOTE, vect_location,
5691 "created double reduction phi node: ");
5692 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, vect_phi, 0);
5695 vect_phi_res = PHI_RESULT (vect_phi);
5697 /* Replace the use, i.e., set the correct vs1 in the regular
5698 reduction phi node. FORNOW, NCOPIES is always 1, so the
5699 loop is redundant. */
5700 stmt_vec_info use_info = reduction_phi_info;
5701 for (j = 0; j < ncopies; j++)
5703 edge pr_edge = loop_preheader_edge (loop);
5704 SET_PHI_ARG_DEF (as_a <gphi *> (use_info->stmt),
5705 pr_edge->dest_idx, vect_phi_res);
5706 use_info = STMT_VINFO_RELATED_STMT (use_info);
5712 phis.release ();
5713 if (nested_in_vect_loop)
5715 if (double_reduc)
5716 loop = outer_loop;
5717 else
5718 continue;
5721 phis.create (3);
5722 /* Find the loop-closed-use at the loop exit of the original scalar
5723 result. (The reduction result is expected to have two immediate uses,
5724 one at the latch block, and one at the loop exit). For double
5725 reductions we are looking for exit phis of the outer loop. */
5726 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, scalar_dest)
5728 if (!flow_bb_inside_loop_p (loop, gimple_bb (USE_STMT (use_p))))
5730 if (!is_gimple_debug (USE_STMT (use_p)))
5731 phis.safe_push (USE_STMT (use_p));
5733 else
5735 if (double_reduc && gimple_code (USE_STMT (use_p)) == GIMPLE_PHI)
5737 tree phi_res = PHI_RESULT (USE_STMT (use_p));
5739 FOR_EACH_IMM_USE_FAST (phi_use_p, phi_imm_iter, phi_res)
5741 if (!flow_bb_inside_loop_p (loop,
5742 gimple_bb (USE_STMT (phi_use_p)))
5743 && !is_gimple_debug (USE_STMT (phi_use_p)))
5744 phis.safe_push (USE_STMT (phi_use_p));
5750 FOR_EACH_VEC_ELT (phis, i, exit_phi)
5752 /* Replace the uses: */
5753 orig_name = PHI_RESULT (exit_phi);
5754 scalar_result = scalar_results[k];
5755 FOR_EACH_IMM_USE_STMT (use_stmt, imm_iter, orig_name)
5756 FOR_EACH_IMM_USE_ON_STMT (use_p, imm_iter)
5757 SET_USE (use_p, scalar_result);
5760 phis.release ();
5764 /* Return a vector of type VECTYPE that is equal to the vector select
5765 operation "MASK ? VEC : IDENTITY". Insert the select statements
5766 before GSI. */
5768 static tree
5769 merge_with_identity (gimple_stmt_iterator *gsi, tree mask, tree vectype,
5770 tree vec, tree identity)
5772 tree cond = make_temp_ssa_name (vectype, NULL, "cond");
5773 gimple *new_stmt = gimple_build_assign (cond, VEC_COND_EXPR,
5774 mask, vec, identity);
5775 gsi_insert_before (gsi, new_stmt, GSI_SAME_STMT);
5776 return cond;
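/* For instance, given a loop mask MASK, a vector accumulator VEC and a
   zero IDENTITY (names illustrative), the helper above emits a single

     cond = VEC_COND_EXPR <mask, vec, { 0, ... }>

   statement before GSI and hands back the new SSA name.  */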
5779 /* Successively apply CODE to each element of VECTOR_RHS, in left-to-right
5780 order, starting with LHS. Insert the extraction statements before GSI and
5781 associate the new scalar SSA names with variable SCALAR_DEST.
5782 Return the SSA name for the result. */
5784 static tree
5785 vect_expand_fold_left (gimple_stmt_iterator *gsi, tree scalar_dest,
5786 tree_code code, tree lhs, tree vector_rhs)
5788 tree vectype = TREE_TYPE (vector_rhs);
5789 tree scalar_type = TREE_TYPE (vectype);
5790 tree bitsize = TYPE_SIZE (scalar_type);
5791 unsigned HOST_WIDE_INT vec_size_in_bits = tree_to_uhwi (TYPE_SIZE (vectype));
5792 unsigned HOST_WIDE_INT element_bitsize = tree_to_uhwi (bitsize);
5794 for (unsigned HOST_WIDE_INT bit_offset = 0;
5795 bit_offset < vec_size_in_bits;
5796 bit_offset += element_bitsize)
5798 tree bitpos = bitsize_int (bit_offset);
5799 tree rhs = build3 (BIT_FIELD_REF, scalar_type, vector_rhs,
5800 bitsize, bitpos);
5802 gassign *stmt = gimple_build_assign (scalar_dest, rhs);
5803 rhs = make_ssa_name (scalar_dest, stmt);
5804 gimple_assign_set_lhs (stmt, rhs);
5805 gsi_insert_before (gsi, stmt, GSI_SAME_STMT);
5807 stmt = gimple_build_assign (scalar_dest, code, lhs, rhs);
5808 tree new_name = make_ssa_name (scalar_dest, stmt);
5809 gimple_assign_set_lhs (stmt, new_name);
5810 gsi_insert_before (gsi, stmt, GSI_SAME_STMT);
5811 lhs = new_name;
5813 return lhs;
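/* As an illustration, for a four-element VECTOR_RHS and PLUS_EXPR this
   produces a strictly left-to-right chain, roughly

     e   = BIT_FIELD_REF <vector_rhs, sz, 0>
     lhs = lhs + e
     e   = BIT_FIELD_REF <vector_rhs, sz, sz>
     lhs = lhs + e
     ... one extract and one scalar operation per element ...

   which keeps the same association as the original scalar loop.  */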
5816 /* Perform an in-order reduction (FOLD_LEFT_REDUCTION). STMT_INFO is the
5817 statement that sets the live-out value. REDUC_DEF_STMT is the phi
5818 statement. CODE is the operation performed by STMT_INFO and OPS are
5819 its scalar operands. REDUC_INDEX is the index of the operand in
5820 OPS that is set by REDUC_DEF_STMT. REDUC_FN is the function that
5821 implements in-order reduction, or IFN_LAST if we should open-code it.
5822 VECTYPE_IN is the type of the vector input. MASKS specifies the masks
5823 that should be used to control the operation in a fully-masked loop. */
5825 static bool
5826 vectorize_fold_left_reduction (stmt_vec_info stmt_info,
5827 gimple_stmt_iterator *gsi,
5828 stmt_vec_info *vec_stmt, slp_tree slp_node,
5829 gimple *reduc_def_stmt,
5830 tree_code code, internal_fn reduc_fn,
5831 tree ops[3], tree vectype_in,
5832 int reduc_index, vec_loop_masks *masks)
5834 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
5835 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
5836 tree vectype_out = STMT_VINFO_VECTYPE (stmt_info);
5837 stmt_vec_info new_stmt_info = NULL;
5839 int ncopies;
5840 if (slp_node)
5841 ncopies = 1;
5842 else
5843 ncopies = vect_get_num_copies (loop_vinfo, vectype_in);
5845 gcc_assert (!nested_in_vect_loop_p (loop, stmt_info));
5846 gcc_assert (ncopies == 1);
5847 gcc_assert (TREE_CODE_LENGTH (code) == binary_op);
5848 gcc_assert (reduc_index == (code == MINUS_EXPR ? 0 : 1));
5849 gcc_assert (STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info)
5850 == FOLD_LEFT_REDUCTION);
5852 if (slp_node)
5853 gcc_assert (known_eq (TYPE_VECTOR_SUBPARTS (vectype_out),
5854 TYPE_VECTOR_SUBPARTS (vectype_in)));
5856 tree op0 = ops[1 - reduc_index];
5858 int group_size = 1;
5859 stmt_vec_info scalar_dest_def_info;
5860 auto_vec<tree> vec_oprnds0;
5861 if (slp_node)
5863 vect_get_vec_defs (op0, NULL_TREE, stmt_info, &vec_oprnds0, NULL,
5864 slp_node);
5865 group_size = SLP_TREE_SCALAR_STMTS (slp_node).length ();
5866 scalar_dest_def_info = SLP_TREE_SCALAR_STMTS (slp_node)[group_size - 1];
5868 else
5870 tree loop_vec_def0 = vect_get_vec_def_for_operand (op0, stmt_info);
5871 vec_oprnds0.create (1);
5872 vec_oprnds0.quick_push (loop_vec_def0);
5873 scalar_dest_def_info = stmt_info;
5876 tree scalar_dest = gimple_assign_lhs (scalar_dest_def_info->stmt);
5877 tree scalar_type = TREE_TYPE (scalar_dest);
5878 tree reduc_var = gimple_phi_result (reduc_def_stmt);
5880 int vec_num = vec_oprnds0.length ();
5881 gcc_assert (vec_num == 1 || slp_node);
5882 tree vec_elem_type = TREE_TYPE (vectype_out);
5883 gcc_checking_assert (useless_type_conversion_p (scalar_type, vec_elem_type));
5885 tree vector_identity = NULL_TREE;
5886 if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
5887 vector_identity = build_zero_cst (vectype_out);
5889 tree scalar_dest_var = vect_create_destination_var (scalar_dest, NULL);
5890 int i;
5891 tree def0;
5892 FOR_EACH_VEC_ELT (vec_oprnds0, i, def0)
5894 gimple *new_stmt;
5895 tree mask = NULL_TREE;
5896 if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
5897 mask = vect_get_loop_mask (gsi, masks, vec_num, vectype_in, i);
5899 /* Handle MINUS by adding the negative. */
5900 if (reduc_fn != IFN_LAST && code == MINUS_EXPR)
5902 tree negated = make_ssa_name (vectype_out);
5903 new_stmt = gimple_build_assign (negated, NEGATE_EXPR, def0);
5904 gsi_insert_before (gsi, new_stmt, GSI_SAME_STMT);
5905 def0 = negated;
5908 if (mask)
5909 def0 = merge_with_identity (gsi, mask, vectype_out, def0,
5910 vector_identity);
5912 /* On the first iteration the input is simply the scalar phi
5913 result, and for subsequent iterations it is the output of
5914 the preceding operation. */
5915 if (reduc_fn != IFN_LAST)
5917 new_stmt = gimple_build_call_internal (reduc_fn, 2, reduc_var, def0);
5918 /* For chained SLP reductions the output of the previous reduction
5919 operation serves as the input of the next. For the final statement
5920 the output cannot be a temporary - we reuse the original
5921 scalar destination of the last statement. */
5922 if (i != vec_num - 1)
5924 gimple_set_lhs (new_stmt, scalar_dest_var);
5925 reduc_var = make_ssa_name (scalar_dest_var, new_stmt);
5926 gimple_set_lhs (new_stmt, reduc_var);
5929 else
5931 reduc_var = vect_expand_fold_left (gsi, scalar_dest_var, code,
5932 reduc_var, def0);
5933 new_stmt = SSA_NAME_DEF_STMT (reduc_var);
5934 /* Remove the statement, so that we can use the same code paths
5935 as for statements that we've just created. */
5936 gimple_stmt_iterator tmp_gsi = gsi_for_stmt (new_stmt);
5937 gsi_remove (&tmp_gsi, false);
5940 if (i == vec_num - 1)
5942 gimple_set_lhs (new_stmt, scalar_dest);
5943 new_stmt_info = vect_finish_replace_stmt (scalar_dest_def_info,
5944 new_stmt);
5946 else
5947 new_stmt_info = vect_finish_stmt_generation (scalar_dest_def_info,
5948 new_stmt, gsi);
5950 if (slp_node)
5951 SLP_TREE_VEC_STMTS (slp_node).quick_push (new_stmt_info);
5954 if (!slp_node)
5955 STMT_VINFO_VEC_STMT (stmt_info) = *vec_stmt = new_stmt_info;
5957 return true;
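/* Sketch of the generated code: when the target provides REDUC_FN, each
   vector of inputs becomes a single in-order reduction call, e.g. for a
   plus reduction roughly

     masked    = VEC_COND_EXPR <loop_mask, vec_def, { 0, ... }>   (if masked)
     reduc_var = .FOLD_LEFT_PLUS (reduc_var, masked)

   while without such a function the elements are accumulated one at a
   time via vect_expand_fold_left above.  */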
5960 /* Function is_nonwrapping_integer_induction.
5962 Check if STMT_VINFO (which is part of loop LOOP) both increments and
5963 does not cause overflow. */
5965 static bool
5966 is_nonwrapping_integer_induction (stmt_vec_info stmt_vinfo, struct loop *loop)
5968 gphi *phi = as_a <gphi *> (stmt_vinfo->stmt);
5969 tree base = STMT_VINFO_LOOP_PHI_EVOLUTION_BASE_UNCHANGED (stmt_vinfo);
5970 tree step = STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_vinfo);
5971 tree lhs_type = TREE_TYPE (gimple_phi_result (phi));
5972 widest_int ni, max_loop_value, lhs_max;
5973 wi::overflow_type overflow = wi::OVF_NONE;
5975 /* Make sure the loop is integer based. */
5976 if (TREE_CODE (base) != INTEGER_CST
5977 || TREE_CODE (step) != INTEGER_CST)
5978 return false;
5980 /* Check that the max size of the loop will not wrap. */
5982 if (TYPE_OVERFLOW_UNDEFINED (lhs_type))
5983 return true;
5985 if (! max_stmt_executions (loop, &ni))
5986 return false;
5988 max_loop_value = wi::mul (wi::to_widest (step), ni, TYPE_SIGN (lhs_type),
5989 &overflow);
5990 if (overflow)
5991 return false;
5993 max_loop_value = wi::add (wi::to_widest (base), max_loop_value,
5994 TYPE_SIGN (lhs_type), &overflow);
5995 if (overflow)
5996 return false;
5998 return (wi::min_precision (max_loop_value, TYPE_SIGN (lhs_type))
5999 <= TYPE_PRECISION (lhs_type));
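/* Worked example (purely illustrative): for a 16-bit unsigned induction
   with BASE 0 and STEP 3 in a loop executing at most 30000 times, the
   maximum value is 3 * 30000 = 90000, which needs 17 bits and thus wraps,
   so this returns false; with at most 20000 iterations the maximum is
   60000, which fits in 16 bits, and this returns true.  */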
6002 /* Function vectorizable_reduction.
6004 Check if STMT_INFO performs a reduction operation that can be vectorized.
6005 If VEC_STMT is also passed, vectorize STMT_INFO: create a vectorized
6006 stmt to replace it, put it in VEC_STMT, and insert it at GSI.
6007 Return true if STMT_INFO is vectorizable in this way.
6009 This function also handles reduction idioms (patterns) that have been
6010 recognized in advance during vect_pattern_recog. In this case, STMT_INFO
6011 may be of this form:
6012 X = pattern_expr (arg0, arg1, ..., X)
6013 and its STMT_VINFO_RELATED_STMT points to the last stmt in the original
6014 sequence that had been detected and replaced by the pattern-stmt
6015 (STMT_INFO).
6017 This function also handles reduction of condition expressions, for example:
6018 for (int i = 0; i < N; i++)
6019 if (a[i] < value)
6020 last = a[i];
6021 This is handled by vectorising the loop and creating an additional vector
6022 containing the loop indexes for which "a[i] < value" was true. In the
6023 function epilogue this is reduced to a single max value and then used to
6024 index into the vector of results.
6026 In some cases of reduction patterns, the type of the reduction variable X is
6027 different than the type of the other arguments of STMT_INFO.
6028 In such cases, the vectype that is used when transforming STMT_INFO into
6029 a vector stmt is different than the vectype that is used to determine the
6030 vectorization factor, because it consists of a different number of elements
6031 than the actual number of elements that are being operated upon in parallel.
6033 For example, consider an accumulation of shorts into an int accumulator.
6034 On some targets it's possible to vectorize this pattern operating on 8
6035 shorts at a time (hence, the vectype for purposes of determining the
6036 vectorization factor should be V8HI); on the other hand, the vectype that
6037 is used to create the vector form is actually V4SI (the type of the result).
6039 Upon entry to this function, STMT_VINFO_VECTYPE records the vectype that
6040 indicates what is the actual level of parallelism (V8HI in the example), so
6041 that the right vectorization factor would be derived. This vectype
6042 corresponds to the type of arguments to the reduction stmt, and should *NOT*
6043 be used to create the vectorized stmt. The right vectype for the vectorized
6044 stmt is obtained from the type of the result X:
6045 get_vectype_for_scalar_type (TREE_TYPE (X))
6047 This means that, contrary to "regular" reductions (or "regular" stmts in
6048 general), the following equation:
6049 STMT_VINFO_VECTYPE == get_vectype_for_scalar_type (TREE_TYPE (X))
6050 does *NOT* necessarily hold for reduction patterns. */
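/* A concrete (illustrative) instance of the note above:

     short a[N];  int sum = 0;
     for (i = 0; i < N; i++)
       sum += a[i];

   when recognized as a widening summation, STMT_VINFO_VECTYPE is the
   short-based vectype used to compute the vectorization factor (V8HI in
   the example), while the vectorized statement itself is created with the
   vectype obtained from TREE_TYPE (sum) (V4SI in the example).  */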
6052 bool
6053 vectorizable_reduction (stmt_vec_info stmt_info, gimple_stmt_iterator *gsi,
6054 stmt_vec_info *vec_stmt, slp_tree slp_node,
6055 slp_instance slp_node_instance,
6056 stmt_vector_for_cost *cost_vec)
6058 tree vec_dest;
6059 tree scalar_dest;
6060 tree vectype_out = STMT_VINFO_VECTYPE (stmt_info);
6061 tree vectype_in = NULL_TREE;
6062 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
6063 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
6064 enum tree_code code, orig_code;
6065 internal_fn reduc_fn;
6066 machine_mode vec_mode;
6067 int op_type;
6068 optab optab;
6069 tree new_temp = NULL_TREE;
6070 enum vect_def_type dt, cond_reduc_dt = vect_unknown_def_type;
6071 stmt_vec_info cond_stmt_vinfo = NULL;
6072 enum tree_code cond_reduc_op_code = ERROR_MARK;
6073 tree scalar_type;
6074 bool is_simple_use;
6075 int i;
6076 int ncopies;
6077 int epilog_copies;
6078 stmt_vec_info prev_stmt_info, prev_phi_info;
6079 bool single_defuse_cycle = false;
6080 stmt_vec_info new_stmt_info = NULL;
6081 int j;
6082 tree ops[3];
6083 enum vect_def_type dts[3];
6084 bool nested_cycle = false, found_nested_cycle_def = false;
6085 bool double_reduc = false;
6086 basic_block def_bb;
6087 struct loop * def_stmt_loop;
6088 tree def_arg;
6089 auto_vec<tree> vec_oprnds0;
6090 auto_vec<tree> vec_oprnds1;
6091 auto_vec<tree> vec_oprnds2;
6092 auto_vec<tree> vect_defs;
6093 auto_vec<stmt_vec_info> phis;
6094 int vec_num;
6095 tree def0, tem;
6096 tree cr_index_scalar_type = NULL_TREE, cr_index_vector_type = NULL_TREE;
6097 tree cond_reduc_val = NULL_TREE;
6099 /* Make sure it was already recognized as a reduction computation. */
6100 if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_reduction_def
6101 && STMT_VINFO_DEF_TYPE (stmt_info) != vect_nested_cycle)
6102 return false;
6104 if (nested_in_vect_loop_p (loop, stmt_info))
6106 loop = loop->inner;
6107 nested_cycle = true;
6110 if (REDUC_GROUP_FIRST_ELEMENT (stmt_info))
6111 gcc_assert (slp_node
6112 && REDUC_GROUP_FIRST_ELEMENT (stmt_info) == stmt_info);
6114 if (gphi *phi = dyn_cast <gphi *> (stmt_info->stmt))
6116 tree phi_result = gimple_phi_result (phi);
6117 /* Analysis is fully done on the reduction stmt invocation. */
6118 if (! vec_stmt)
6120 if (slp_node)
6121 slp_node_instance->reduc_phis = slp_node;
6123 STMT_VINFO_TYPE (stmt_info) = reduc_vec_info_type;
6124 return true;
6127 if (STMT_VINFO_REDUC_TYPE (stmt_info) == FOLD_LEFT_REDUCTION)
6128 /* Leave the scalar phi in place. Note that checking
6129 STMT_VINFO_VEC_REDUCTION_TYPE (as below) only works
6130 for reductions involving a single statement. */
6131 return true;
6133 stmt_vec_info reduc_stmt_info = STMT_VINFO_REDUC_DEF (stmt_info);
6134 if (STMT_VINFO_IN_PATTERN_P (reduc_stmt_info))
6135 reduc_stmt_info = STMT_VINFO_RELATED_STMT (reduc_stmt_info);
6137 if (STMT_VINFO_VEC_REDUCTION_TYPE (reduc_stmt_info)
6138 == EXTRACT_LAST_REDUCTION)
6139 /* Leave the scalar phi in place. */
6140 return true;
6142 gassign *reduc_stmt = as_a <gassign *> (reduc_stmt_info->stmt);
6143 for (unsigned k = 1; k < gimple_num_ops (reduc_stmt); ++k)
6145 tree op = gimple_op (reduc_stmt, k);
6146 if (op == phi_result)
6147 continue;
6148 if (k == 1
6149 && gimple_assign_rhs_code (reduc_stmt) == COND_EXPR)
6150 continue;
6151 if (!vectype_in
6152 || (GET_MODE_SIZE (SCALAR_TYPE_MODE (TREE_TYPE (vectype_in)))
6153 < GET_MODE_SIZE (SCALAR_TYPE_MODE (TREE_TYPE (op)))))
6154 vectype_in = get_vectype_for_scalar_type (TREE_TYPE (op));
6155 break;
6157 gcc_assert (vectype_in);
6159 if (slp_node)
6160 ncopies = 1;
6161 else
6162 ncopies = vect_get_num_copies (loop_vinfo, vectype_in);
6164 stmt_vec_info use_stmt_info;
6165 if (ncopies > 1
6166 && STMT_VINFO_RELEVANT (reduc_stmt_info) <= vect_used_only_live
6167 && (use_stmt_info = loop_vinfo->lookup_single_use (phi_result))
6168 && (use_stmt_info == reduc_stmt_info
6169 || STMT_VINFO_RELATED_STMT (use_stmt_info) == reduc_stmt_info))
6170 single_defuse_cycle = true;
6172 /* Create the destination vector */
6173 scalar_dest = gimple_assign_lhs (reduc_stmt);
6174 vec_dest = vect_create_destination_var (scalar_dest, vectype_out);
6176 if (slp_node)
6177 /* The size vect_schedule_slp_instance computes is off for us. */
6178 vec_num = vect_get_num_vectors
6179 (LOOP_VINFO_VECT_FACTOR (loop_vinfo)
6180 * SLP_TREE_SCALAR_STMTS (slp_node).length (),
6181 vectype_in);
6182 else
6183 vec_num = 1;
6185 /* Generate the reduction PHIs upfront. */
6186 prev_phi_info = NULL;
6187 for (j = 0; j < ncopies; j++)
6189 if (j == 0 || !single_defuse_cycle)
6191 for (i = 0; i < vec_num; i++)
6193 /* Create the reduction-phi that defines the reduction
6194 operand. */
6195 gimple *new_phi = create_phi_node (vec_dest, loop->header);
6196 stmt_vec_info new_phi_info = loop_vinfo->add_stmt (new_phi);
6198 if (slp_node)
6199 SLP_TREE_VEC_STMTS (slp_node).quick_push (new_phi_info);
6200 else
6202 if (j == 0)
6203 STMT_VINFO_VEC_STMT (stmt_info)
6204 = *vec_stmt = new_phi_info;
6205 else
6206 STMT_VINFO_RELATED_STMT (prev_phi_info) = new_phi_info;
6207 prev_phi_info = new_phi_info;
6213 return true;
6216 /* 1. Is vectorizable reduction? */
6217 /* Not supportable if the reduction variable is used in the loop, unless
6218 it's a reduction chain. */
6219 if (STMT_VINFO_RELEVANT (stmt_info) > vect_used_in_outer
6220 && !REDUC_GROUP_FIRST_ELEMENT (stmt_info))
6221 return false;
6223 /* Reductions that are not used even in an enclosing outer-loop
6224 are expected to be "live" (used out of the loop). */
6225 if (STMT_VINFO_RELEVANT (stmt_info) == vect_unused_in_scope
6226 && !STMT_VINFO_LIVE_P (stmt_info))
6227 return false;
6229 /* 2. Has this been recognized as a reduction pattern?
6231 Check if STMT represents a pattern that has been recognized
6232 in earlier analysis stages. For stmts that represent a pattern,
6233 the STMT_VINFO_RELATED_STMT field records the last stmt in
6234 the original sequence that constitutes the pattern. */
6236 stmt_vec_info orig_stmt_info = STMT_VINFO_RELATED_STMT (stmt_info);
6237 if (orig_stmt_info)
6239 gcc_assert (STMT_VINFO_IN_PATTERN_P (orig_stmt_info));
6240 gcc_assert (!STMT_VINFO_IN_PATTERN_P (stmt_info));
6243 /* 3. Check the operands of the operation. The first operands are defined
6244 inside the loop body. The last operand is the reduction variable,
6245 which is defined by the loop-header-phi. */
6247 gassign *stmt = as_a <gassign *> (stmt_info->stmt);
6249 /* Flatten RHS. */
6250 switch (get_gimple_rhs_class (gimple_assign_rhs_code (stmt)))
6252 case GIMPLE_BINARY_RHS:
6253 code = gimple_assign_rhs_code (stmt);
6254 op_type = TREE_CODE_LENGTH (code);
6255 gcc_assert (op_type == binary_op);
6256 ops[0] = gimple_assign_rhs1 (stmt);
6257 ops[1] = gimple_assign_rhs2 (stmt);
6258 break;
6260 case GIMPLE_TERNARY_RHS:
6261 code = gimple_assign_rhs_code (stmt);
6262 op_type = TREE_CODE_LENGTH (code);
6263 gcc_assert (op_type == ternary_op);
6264 ops[0] = gimple_assign_rhs1 (stmt);
6265 ops[1] = gimple_assign_rhs2 (stmt);
6266 ops[2] = gimple_assign_rhs3 (stmt);
6267 break;
6269 case GIMPLE_UNARY_RHS:
6270 return false;
6272 default:
6273 gcc_unreachable ();
6276 if (code == COND_EXPR && slp_node)
6277 return false;
6279 scalar_dest = gimple_assign_lhs (stmt);
6280 scalar_type = TREE_TYPE (scalar_dest);
6281 if (!POINTER_TYPE_P (scalar_type) && !INTEGRAL_TYPE_P (scalar_type)
6282 && !SCALAR_FLOAT_TYPE_P (scalar_type))
6283 return false;
6285 /* Do not try to vectorize bit-precision reductions. */
6286 if (!type_has_mode_precision_p (scalar_type))
6287 return false;
6289 /* All uses but the last are expected to be defined in the loop.
6290 The last use is the reduction variable. In case of nested cycle this
6291 assumption is not true: we use reduc_index to record the index of the
6292 reduction variable. */
6293 stmt_vec_info reduc_def_info = NULL;
6294 int reduc_index = -1;
6295 for (i = 0; i < op_type; i++)
6297 /* The condition of COND_EXPR is checked in vectorizable_condition(). */
6298 if (i == 0 && code == COND_EXPR)
6299 continue;
6301 stmt_vec_info def_stmt_info;
6302 is_simple_use = vect_is_simple_use (ops[i], loop_vinfo, &dts[i], &tem,
6303 &def_stmt_info);
6304 dt = dts[i];
6305 gcc_assert (is_simple_use);
6306 if (dt == vect_reduction_def)
6308 reduc_def_info = def_stmt_info;
6309 reduc_index = i;
6310 continue;
6312 else if (tem)
6314 /* To properly compute ncopies we are interested in the widest
6315 input type in case we're looking at a widening accumulation. */
6316 if (!vectype_in
6317 || (GET_MODE_SIZE (SCALAR_TYPE_MODE (TREE_TYPE (vectype_in)))
6318 < GET_MODE_SIZE (SCALAR_TYPE_MODE (TREE_TYPE (tem)))))
6319 vectype_in = tem;
6322 if (dt != vect_internal_def
6323 && dt != vect_external_def
6324 && dt != vect_constant_def
6325 && dt != vect_induction_def
6326 && !(dt == vect_nested_cycle && nested_cycle))
6327 return false;
6329 if (dt == vect_nested_cycle)
6331 found_nested_cycle_def = true;
6332 reduc_def_info = def_stmt_info;
6333 reduc_index = i;
6336 if (i == 1 && code == COND_EXPR)
6338 /* Record how value of COND_EXPR is defined. */
6339 if (dt == vect_constant_def)
6341 cond_reduc_dt = dt;
6342 cond_reduc_val = ops[i];
6344 if (dt == vect_induction_def
6345 && def_stmt_info
6346 && is_nonwrapping_integer_induction (def_stmt_info, loop))
6348 cond_reduc_dt = dt;
6349 cond_stmt_vinfo = def_stmt_info;
6354 if (!vectype_in)
6355 vectype_in = vectype_out;
6357 /* When vectorizing a reduction chain w/o SLP the reduction PHI is not
6358 directly used in stmt. */
6359 if (reduc_index == -1)
6361 if (STMT_VINFO_REDUC_TYPE (stmt_info) == FOLD_LEFT_REDUCTION)
6363 if (dump_enabled_p ())
6364 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6365 "in-order reduction chain without SLP.\n");
6366 return false;
6369 if (orig_stmt_info)
6370 reduc_def_info = STMT_VINFO_REDUC_DEF (orig_stmt_info);
6371 else
6372 reduc_def_info = STMT_VINFO_REDUC_DEF (stmt_info);
6375 if (! reduc_def_info)
6376 return false;
6378 gphi *reduc_def_phi = dyn_cast <gphi *> (reduc_def_info->stmt);
6379 if (!reduc_def_phi)
6380 return false;
6382 if (!(reduc_index == -1
6383 || dts[reduc_index] == vect_reduction_def
6384 || dts[reduc_index] == vect_nested_cycle
6385 || ((dts[reduc_index] == vect_internal_def
6386 || dts[reduc_index] == vect_external_def
6387 || dts[reduc_index] == vect_constant_def
6388 || dts[reduc_index] == vect_induction_def)
6389 && nested_cycle && found_nested_cycle_def)))
6391 /* For pattern recognized stmts, orig_stmt might be a reduction,
6392 but some helper statements for the pattern might not, or
6393 might be COND_EXPRs with reduction uses in the condition. */
6394 gcc_assert (orig_stmt_info);
6395 return false;
6398 /* PHIs should not participate in patterns. */
6399 gcc_assert (!STMT_VINFO_RELATED_STMT (reduc_def_info));
6400 enum vect_reduction_type v_reduc_type
6401 = STMT_VINFO_REDUC_TYPE (reduc_def_info);
6402 stmt_vec_info tmp = STMT_VINFO_REDUC_DEF (reduc_def_info);
6404 STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info) = v_reduc_type;
6405 /* If we have a condition reduction, see if we can simplify it further. */
6406 if (v_reduc_type == COND_REDUCTION)
6408 /* TODO: We can't yet handle reduction chains, since we need to treat
6409 each COND_EXPR in the chain specially, not just the last one.
6410 E.g. for:
6412 x_1 = PHI <x_3, ...>
6413 x_2 = a_2 ? ... : x_1;
6414 x_3 = a_3 ? ... : x_2;
6416 we're interested in the last element in x_3 for which a_2 || a_3
6417 is true, whereas the current reduction chain handling would
6418 vectorize x_2 as a normal VEC_COND_EXPR and only treat x_3
6419 as a reduction operation. */
6420 if (reduc_index == -1)
6422 if (dump_enabled_p ())
6423 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6424 "conditional reduction chains not supported\n");
6425 return false;
6428 /* vect_is_simple_reduction ensured that operand 2 is the
6429 loop-carried operand. */
6430 gcc_assert (reduc_index == 2);
6432 /* Loop peeling modifies the initial value of the reduction PHI, which
6433 makes the reduction stmt that is transformed differ from the
6434 original stmt that was analyzed. We therefore need to record the
6435 reduction code for a CONST_COND_REDUCTION type reduction at the
6436 analysis stage, so that it can be used directly at the transform stage. */
6437 if (STMT_VINFO_VEC_CONST_COND_REDUC_CODE (stmt_info) == MAX_EXPR
6438 || STMT_VINFO_VEC_CONST_COND_REDUC_CODE (stmt_info) == MIN_EXPR)
6440 /* Also set the reduction type to CONST_COND_REDUCTION. */
6441 gcc_assert (cond_reduc_dt == vect_constant_def);
6442 STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info) = CONST_COND_REDUCTION;
6444 else if (direct_internal_fn_supported_p (IFN_FOLD_EXTRACT_LAST,
6445 vectype_in, OPTIMIZE_FOR_SPEED))
6447 if (dump_enabled_p ())
6448 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6449 "optimizing condition reduction with"
6450 " FOLD_EXTRACT_LAST.\n");
6451 STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info) = EXTRACT_LAST_REDUCTION;
6453 else if (cond_reduc_dt == vect_induction_def)
6455 tree base
6456 = STMT_VINFO_LOOP_PHI_EVOLUTION_BASE_UNCHANGED (cond_stmt_vinfo);
6457 tree step = STMT_VINFO_LOOP_PHI_EVOLUTION_PART (cond_stmt_vinfo);
6459 gcc_assert (TREE_CODE (base) == INTEGER_CST
6460 && TREE_CODE (step) == INTEGER_CST);
6461 cond_reduc_val = NULL_TREE;
6462 /* Find a suitable value, for MAX_EXPR below base, for MIN_EXPR
6463 above base; punt if base is the minimum value of the type for
6464 MAX_EXPR or maximum value of the type for MIN_EXPR for now. */
6465 if (tree_int_cst_sgn (step) == -1)
6467 cond_reduc_op_code = MIN_EXPR;
6468 if (tree_int_cst_sgn (base) == -1)
6469 cond_reduc_val = build_int_cst (TREE_TYPE (base), 0);
6470 else if (tree_int_cst_lt (base,
6471 TYPE_MAX_VALUE (TREE_TYPE (base))))
6472 cond_reduc_val
6473 = int_const_binop (PLUS_EXPR, base, integer_one_node);
6475 else
6477 cond_reduc_op_code = MAX_EXPR;
6478 if (tree_int_cst_sgn (base) == 1)
6479 cond_reduc_val = build_int_cst (TREE_TYPE (base), 0);
6480 else if (tree_int_cst_lt (TYPE_MIN_VALUE (TREE_TYPE (base)),
6481 base))
6482 cond_reduc_val
6483 = int_const_binop (MINUS_EXPR, base, integer_one_node);
6485 if (cond_reduc_val)
6487 if (dump_enabled_p ())
6488 dump_printf_loc (MSG_NOTE, vect_location,
6489 "condition expression based on "
6490 "integer induction.\n");
6491 STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info)
6492 = INTEGER_INDUC_COND_REDUCTION;
6495 else if (cond_reduc_dt == vect_constant_def)
6497 enum vect_def_type cond_initial_dt;
6498 gimple *def_stmt = SSA_NAME_DEF_STMT (ops[reduc_index]);
6499 tree cond_initial_val
6500 = PHI_ARG_DEF_FROM_EDGE (def_stmt, loop_preheader_edge (loop));
6502 gcc_assert (cond_reduc_val != NULL_TREE);
6503 vect_is_simple_use (cond_initial_val, loop_vinfo, &cond_initial_dt);
6504 if (cond_initial_dt == vect_constant_def
6505 && types_compatible_p (TREE_TYPE (cond_initial_val),
6506 TREE_TYPE (cond_reduc_val)))
6508 tree e = fold_binary (LE_EXPR, boolean_type_node,
6509 cond_initial_val, cond_reduc_val);
6510 if (e && (integer_onep (e) || integer_zerop (e)))
6512 if (dump_enabled_p ())
6513 dump_printf_loc (MSG_NOTE, vect_location,
6514 "condition expression based on "
6515 "compile time constant.\n");
6516 /* Record reduction code at analysis stage. */
6517 STMT_VINFO_VEC_CONST_COND_REDUC_CODE (stmt_info)
6518 = integer_onep (e) ? MAX_EXPR : MIN_EXPR;
6519 STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info)
6520 = CONST_COND_REDUCTION;
6526 if (orig_stmt_info)
6527 gcc_assert (tmp == orig_stmt_info
6528 || REDUC_GROUP_FIRST_ELEMENT (tmp) == orig_stmt_info);
6529 else
6530 /* We changed STMT to be the first stmt in reduction chain, hence we
6531 check that in this case the first element in the chain is STMT. */
6532 gcc_assert (tmp == stmt_info
6533 || REDUC_GROUP_FIRST_ELEMENT (tmp) == stmt_info);
6535 if (STMT_VINFO_LIVE_P (reduc_def_info))
6536 return false;
6538 if (slp_node)
6539 ncopies = 1;
6540 else
6541 ncopies = vect_get_num_copies (loop_vinfo, vectype_in);
6543 gcc_assert (ncopies >= 1);
6545 vec_mode = TYPE_MODE (vectype_in);
6546 poly_uint64 nunits_out = TYPE_VECTOR_SUBPARTS (vectype_out);
6548 if (code == COND_EXPR)
6550 /* Only call during the analysis stage, otherwise we'll lose
6551 STMT_VINFO_TYPE. */
6552 if (!vec_stmt && !vectorizable_condition (stmt_info, gsi, NULL,
6553 ops[reduc_index], 0, NULL,
6554 cost_vec))
6556 if (dump_enabled_p ())
6557 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6558 "unsupported condition in reduction\n");
6559 return false;
6562 else
6564 /* 4. Supportable by target? */
6566 if (code == LSHIFT_EXPR || code == RSHIFT_EXPR
6567 || code == LROTATE_EXPR || code == RROTATE_EXPR)
6569 /* Shifts and rotates are only supported by vectorizable_shift,
6570 not vectorizable_reduction. */
6571 if (dump_enabled_p ())
6572 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6573 "unsupported shift or rotation.\n");
6574 return false;
6577 /* 4.1. check support for the operation in the loop */
6578 optab = optab_for_tree_code (code, vectype_in, optab_default);
6579 if (!optab)
6581 if (dump_enabled_p ())
6582 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6583 "no optab.\n");
6585 return false;
6588 if (optab_handler (optab, vec_mode) == CODE_FOR_nothing)
6590 if (dump_enabled_p ())
6591 dump_printf (MSG_NOTE, "op not supported by target.\n");
6593 if (maybe_ne (GET_MODE_SIZE (vec_mode), UNITS_PER_WORD)
6594 || !vect_worthwhile_without_simd_p (loop_vinfo, code))
6595 return false;
6597 if (dump_enabled_p ())
6598 dump_printf (MSG_NOTE, "proceeding using word mode.\n");
6601 /* Worthwhile without SIMD support? */
6602 if (!VECTOR_MODE_P (TYPE_MODE (vectype_in))
6603 && !vect_worthwhile_without_simd_p (loop_vinfo, code))
6605 if (dump_enabled_p ())
6606 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6607 "not worthwhile without SIMD support.\n");
6609 return false;
6613 /* 4.2. Check support for the epilog operation.
6615 If STMT represents a reduction pattern, then the type of the
6616 reduction variable may be different than the type of the rest
6617 of the arguments. For example, consider the case of accumulation
6618 of shorts into an int accumulator; The original code:
6619 S1: int_a = (int) short_a;
6620 orig_stmt-> S2: int_acc = plus <int_a ,int_acc>;
6622 was replaced with:
6623 STMT: int_acc = widen_sum <short_a, int_acc>
6625 This means that:
6626 1. The tree-code that is used to create the vector operation in the
6627 epilog code (that reduces the partial results) is not the
6628 tree-code of STMT, but is rather the tree-code of the original
6629 stmt from the pattern that STMT is replacing. I.e, in the example
6630 above we want to use 'widen_sum' in the loop, but 'plus' in the
6631 epilog.
6632 2. The type (mode) we use to check available target support
6633 for the vector operation to be created in the *epilog*, is
6634 determined by the type of the reduction variable (in the example
6635 above we'd check this: optab_handler (plus_optab, vect_int_mode])).
6636 However the type (mode) we use to check available target support
6637 for the vector operation to be created *inside the loop*, is
6638 determined by the type of the other arguments to STMT (in the
6639 example we'd check this: optab_handler (widen_sum_optab,
6640 vect_short_mode)).
6642 This is contrary to "regular" reductions, in which the types of all
6643 the arguments are the same as the type of the reduction variable.
6644 For "regular" reductions we can therefore use the same vector type
6645 (and also the same tree-code) when generating the epilog code and
6646 when generating the code inside the loop. */
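   /* Concretely, for the widen_sum example above the in-loop check would
      query something like widen_sum_optab with the short-based vector
      mode, while the epilog check below queries the reduction support
      (e.g. plus) with the int-based vector mode of the result.  */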
6648 vect_reduction_type reduction_type
6649 = STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info);
6650 if (orig_stmt_info
6651 && (reduction_type == TREE_CODE_REDUCTION
6652 || reduction_type == FOLD_LEFT_REDUCTION))
6654 /* This is a reduction pattern: get the vectype from the type of the
6655 reduction variable, and get the tree-code from orig_stmt. */
6656 orig_code = gimple_assign_rhs_code (orig_stmt_info->stmt);
6657 gcc_assert (vectype_out);
6658 vec_mode = TYPE_MODE (vectype_out);
6660 else
6662 /* Regular reduction: the same vectype and tree-code that are used for
6663 the vector code inside the loop can also be used for the epilog code. */
6664 orig_code = code;
6666 if (code == MINUS_EXPR)
6667 orig_code = PLUS_EXPR;
6669 /* For simple condition reductions, replace with the actual expression
6670 we want to base our reduction around. */
6671 if (reduction_type == CONST_COND_REDUCTION)
6673 orig_code = STMT_VINFO_VEC_CONST_COND_REDUC_CODE (stmt_info);
6674 gcc_assert (orig_code == MAX_EXPR || orig_code == MIN_EXPR);
6676 else if (reduction_type == INTEGER_INDUC_COND_REDUCTION)
6677 orig_code = cond_reduc_op_code;
6680 if (nested_cycle)
6682 def_bb = gimple_bb (reduc_def_phi);
6683 def_stmt_loop = def_bb->loop_father;
6684 def_arg = PHI_ARG_DEF_FROM_EDGE (reduc_def_phi,
6685 loop_preheader_edge (def_stmt_loop));
6686 stmt_vec_info def_arg_stmt_info = loop_vinfo->lookup_def (def_arg);
6687 if (def_arg_stmt_info
6688 && (STMT_VINFO_DEF_TYPE (def_arg_stmt_info)
6689 == vect_double_reduction_def))
6690 double_reduc = true;
6693 reduc_fn = IFN_LAST;
6695 if (reduction_type == TREE_CODE_REDUCTION
6696 || reduction_type == FOLD_LEFT_REDUCTION
6697 || reduction_type == INTEGER_INDUC_COND_REDUCTION
6698 || reduction_type == CONST_COND_REDUCTION)
6700 if (reduction_type == FOLD_LEFT_REDUCTION
6701 ? fold_left_reduction_fn (orig_code, &reduc_fn)
6702 : reduction_fn_for_scalar_code (orig_code, &reduc_fn))
6704 if (reduc_fn != IFN_LAST
6705 && !direct_internal_fn_supported_p (reduc_fn, vectype_out,
6706 OPTIMIZE_FOR_SPEED))
6708 if (dump_enabled_p ())
6709 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6710 "reduc op not supported by target.\n");
6712 reduc_fn = IFN_LAST;
6715 else
6717 if (!nested_cycle || double_reduc)
6719 if (dump_enabled_p ())
6720 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6721 "no reduc code for scalar code.\n");
6723 return false;
6727 else if (reduction_type == COND_REDUCTION)
6729 int scalar_precision
6730 = GET_MODE_PRECISION (SCALAR_TYPE_MODE (scalar_type));
6731 cr_index_scalar_type = make_unsigned_type (scalar_precision);
6732 cr_index_vector_type = build_vector_type (cr_index_scalar_type,
6733 nunits_out);
6735 if (direct_internal_fn_supported_p (IFN_REDUC_MAX, cr_index_vector_type,
6736 OPTIMIZE_FOR_SPEED))
6737 reduc_fn = IFN_REDUC_MAX;
6740 if (reduction_type != EXTRACT_LAST_REDUCTION
6741 && reduc_fn == IFN_LAST
6742 && !nunits_out.is_constant ())
6744 if (dump_enabled_p ())
6745 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6746 "missing target support for reduction on"
6747 " variable-length vectors.\n");
6748 return false;
6751 if ((double_reduc || reduction_type != TREE_CODE_REDUCTION)
6752 && ncopies > 1)
6754 if (dump_enabled_p ())
6755 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6756 "multiple types in double reduction or condition "
6757 "reduction.\n");
6758 return false;
6761 /* For SLP reductions, see if there is a neutral value we can use. */
6762 tree neutral_op = NULL_TREE;
6763 if (slp_node)
6764 neutral_op = neutral_op_for_slp_reduction
6765 (slp_node_instance->reduc_phis, code,
6766 REDUC_GROUP_FIRST_ELEMENT (stmt_info) != NULL_STMT_VEC_INFO);
6768 if (double_reduc && reduction_type == FOLD_LEFT_REDUCTION)
6770 /* We can't support in-order reductions of code such as this:
6772 for (int i = 0; i < n1; ++i)
6773 for (int j = 0; j < n2; ++j)
6774 l += a[j];
6776 since GCC effectively transforms the loop when vectorizing:
6778 for (int i = 0; i < n1 / VF; ++i)
6779 for (int j = 0; j < n2; ++j)
6780 for (int k = 0; k < VF; ++k)
6781 l += a[j];
6783 which is a reassociation of the original operation. */
6784 if (dump_enabled_p ())
6785 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6786 "in-order double reduction not supported.\n");
6788 return false;
6791 if (reduction_type == FOLD_LEFT_REDUCTION
6792 && slp_node
6793 && !REDUC_GROUP_FIRST_ELEMENT (stmt_info))
6795 /* We cannot use in-order reductions in this case because there is
6796 an implicit reassociation of the operations involved. */
6797 if (dump_enabled_p ())
6798 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6799 "in-order unchained SLP reductions not supported.\n");
6800 return false;
6803 /* For double reductions, and for SLP reductions with a neutral value,
6804 we construct a variable-length initial vector by loading a vector
6805 full of the neutral value and then shift-and-inserting the start
6806 values into the low-numbered elements. */
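   /* E.g. for a PLUS reduction with start value S this builds a vector of
      zeros (the neutral value) and then shift-inserts S, giving
      { S, 0, ..., 0 } regardless of the possibly non-constant number of
      lanes.  */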
6807 if ((double_reduc || neutral_op)
6808 && !nunits_out.is_constant ()
6809 && !direct_internal_fn_supported_p (IFN_VEC_SHL_INSERT,
6810 vectype_out, OPTIMIZE_FOR_SPEED))
6812 if (dump_enabled_p ())
6813 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6814 "reduction on variable-length vectors requires"
6815 " target support for a vector-shift-and-insert"
6816 " operation.\n");
6817 return false;
6820 /* Check extra constraints for variable-length unchained SLP reductions. */
6821 if (STMT_SLP_TYPE (stmt_info)
6822 && !REDUC_GROUP_FIRST_ELEMENT (stmt_info)
6823 && !nunits_out.is_constant ())
6825 /* We checked above that we could build the initial vector when
6826 there's a neutral element value. Check here for the case in
6827 which each SLP statement has its own initial value and in which
6828 that value needs to be repeated for every instance of the
6829 statement within the initial vector. */
6830 unsigned int group_size = SLP_TREE_SCALAR_STMTS (slp_node).length ();
6831 scalar_mode elt_mode = SCALAR_TYPE_MODE (TREE_TYPE (vectype_out));
6832 if (!neutral_op
6833 && !can_duplicate_and_interleave_p (group_size, elt_mode))
6835 if (dump_enabled_p ())
6836 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6837 "unsupported form of SLP reduction for"
6838 " variable-length vectors: cannot build"
6839 " initial vector.\n");
6840 return false;
6842 /* The epilogue code relies on the number of elements being a multiple
6843 of the group size. The duplicate-and-interleave approach to setting
6844 up the initial vector does too. */
6845 if (!multiple_p (nunits_out, group_size))
6847 if (dump_enabled_p ())
6848 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6849 "unsupported form of SLP reduction for"
6850 " variable-length vectors: the vector size"
6851 " is not a multiple of the number of results.\n");
6852 return false;
6856 /* In case of widening multiplication by a constant, we update the type
6857 of the constant to be the type of the other operand. We check that the
6858 constant fits the type in the pattern recognition pass. */
6859 if (code == DOT_PROD_EXPR
6860 && !types_compatible_p (TREE_TYPE (ops[0]), TREE_TYPE (ops[1])))
6862 if (TREE_CODE (ops[0]) == INTEGER_CST)
6863 ops[0] = fold_convert (TREE_TYPE (ops[1]), ops[0]);
6864 else if (TREE_CODE (ops[1]) == INTEGER_CST)
6865 ops[1] = fold_convert (TREE_TYPE (ops[0]), ops[1]);
6866 else
6868 if (dump_enabled_p ())
6869 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6870 "invalid types in dot-prod\n");
6872 return false;
6876 if (reduction_type == COND_REDUCTION)
6878 widest_int ni;
6880 if (! max_loop_iterations (loop, &ni))
6882 if (dump_enabled_p ())
6883 dump_printf_loc (MSG_NOTE, vect_location,
6884 "loop count not known, cannot create cond "
6885 "reduction.\n");
6886 return false;
6888 /* Convert backedges to iterations. */
6889 ni += 1;
6891 /* The additional index will be the same type as the condition. Check
6892 that the loop count fits into this type less one (because we use up
6893 the zero slot for iterations where there are no matches). */
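   /* For instance (illustrative only): with a 16-bit index type there are
      65535 usable index values, one of which (zero) is reserved for "no
      match", so a loop that may iterate 65535 times or more is rejected
      here.  */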
6894 tree max_index = TYPE_MAX_VALUE (cr_index_scalar_type);
6895 if (wi::geu_p (ni, wi::to_widest (max_index)))
6897 if (dump_enabled_p ())
6898 dump_printf_loc (MSG_NOTE, vect_location,
6899 "loop size is greater than data size.\n");
6900 return false;
6904 /* In case the vectorization factor (VF) is bigger than the number
6905 of elements that we can fit in a vectype (nunits), we have to generate
6906 more than one vector stmt - i.e - we need to "unroll" the
6907 vector stmt by a factor VF/nunits. For more details see documentation
6908 in vectorizable_operation. */
6910 /* If the reduction is used in an outer loop we need to generate
6911 VF intermediate results, like so (e.g. for ncopies=2):
6912 r0 = phi (init, r0)
6913 r1 = phi (init, r1)
6914 r0 = x0 + r0;
6915 r1 = x1 + r1;
6916 (i.e. we generate VF results in 2 registers).
6917 In this case we have a separate def-use cycle for each copy, and therefore
6918 for each copy we get the vector def for the reduction variable from the
6919 respective phi node created for this copy.
6921 Otherwise (the reduction is unused in the loop nest), we can combine
6922 together intermediate results, like so (e.g. for ncopies=2):
6923 r = phi (init, r)
6924 r = x0 + r;
6925 r = x1 + r;
6926 (i.e. we generate VF/2 results in a single register).
6927 In this case for each copy we get the vector def for the reduction variable
6928 from the vectorized reduction operation generated in the previous iteration.
6930 This only works when we see both the reduction PHI and its only consumer
6931 in vectorizable_reduction and there are no intermediate stmts
6932 participating. */
6933 stmt_vec_info use_stmt_info;
6934 tree reduc_phi_result = gimple_phi_result (reduc_def_phi);
6935 if (ncopies > 1
6936 && (STMT_VINFO_RELEVANT (stmt_info) <= vect_used_only_live)
6937 && (use_stmt_info = loop_vinfo->lookup_single_use (reduc_phi_result))
6938 && (use_stmt_info == stmt_info
6939 || STMT_VINFO_RELATED_STMT (use_stmt_info) == stmt_info))
6941 single_defuse_cycle = true;
6942 epilog_copies = 1;
6944 else
6945 epilog_copies = ncopies;
6947 /* If the reduction stmt is one of the patterns that have lane
6948 reduction embedded we cannot handle the case of ! single_defuse_cycle. */
6949 if ((ncopies > 1
6950 && ! single_defuse_cycle)
6951 && (code == DOT_PROD_EXPR
6952 || code == WIDEN_SUM_EXPR
6953 || code == SAD_EXPR))
6955 if (dump_enabled_p ())
6956 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6957 "multi def-use cycle not possible for lane-reducing "
6958 "reduction operation\n");
6959 return false;
6962 if (slp_node)
6963 vec_num = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node);
6964 else
6965 vec_num = 1;
6967 internal_fn cond_fn = get_conditional_internal_fn (code);
6968 vec_loop_masks *masks = &LOOP_VINFO_MASKS (loop_vinfo);
6970 if (!vec_stmt) /* transformation not required. */
6972 vect_model_reduction_cost (stmt_info, reduc_fn, ncopies, cost_vec);
6973 if (loop_vinfo && LOOP_VINFO_CAN_FULLY_MASK_P (loop_vinfo))
6975 if (reduction_type != FOLD_LEFT_REDUCTION
6976 && (cond_fn == IFN_LAST
6977 || !direct_internal_fn_supported_p (cond_fn, vectype_in,
6978 OPTIMIZE_FOR_SPEED)))
6980 if (dump_enabled_p ())
6981 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6982 "can't use a fully-masked loop because no"
6983 " conditional operation is available.\n");
6984 LOOP_VINFO_CAN_FULLY_MASK_P (loop_vinfo) = false;
6986 else if (reduc_index == -1)
6988 if (dump_enabled_p ())
6989 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6990 "can't use a fully-masked loop for chained"
6991 " reductions.\n");
6992 LOOP_VINFO_CAN_FULLY_MASK_P (loop_vinfo) = false;
6994 else
6995 vect_record_loop_mask (loop_vinfo, masks, ncopies * vec_num,
6996 vectype_in);
6998 if (dump_enabled_p ()
6999 && reduction_type == FOLD_LEFT_REDUCTION)
7000 dump_printf_loc (MSG_NOTE, vect_location,
7001 "using an in-order (fold-left) reduction.\n");
7002 STMT_VINFO_TYPE (stmt_info) = reduc_vec_info_type;
7003 return true;
7006 /* Transform. */
7008 if (dump_enabled_p ())
7009 dump_printf_loc (MSG_NOTE, vect_location, "transform reduction.\n");
7011 /* FORNOW: Multiple types are not supported for condition. */
7012 if (code == COND_EXPR)
7013 gcc_assert (ncopies == 1);
7015 bool masked_loop_p = LOOP_VINFO_FULLY_MASKED_P (loop_vinfo);
7017 if (reduction_type == FOLD_LEFT_REDUCTION)
7018 return vectorize_fold_left_reduction
7019 (stmt_info, gsi, vec_stmt, slp_node, reduc_def_phi, code,
7020 reduc_fn, ops, vectype_in, reduc_index, masks);
7022 if (reduction_type == EXTRACT_LAST_REDUCTION)
7024 gcc_assert (!slp_node);
7025 return vectorizable_condition (stmt_info, gsi, vec_stmt,
7026 NULL, reduc_index, NULL, NULL);
7029 /* Create the destination vector */
7030 vec_dest = vect_create_destination_var (scalar_dest, vectype_out);
7032 prev_stmt_info = NULL;
7033 prev_phi_info = NULL;
7034 if (!slp_node)
7036 vec_oprnds0.create (1);
7037 vec_oprnds1.create (1);
7038 if (op_type == ternary_op)
7039 vec_oprnds2.create (1);
7042 phis.create (vec_num);
7043 vect_defs.create (vec_num);
7044 if (!slp_node)
7045 vect_defs.quick_push (NULL_TREE);
7047 if (slp_node)
7048 phis.splice (SLP_TREE_VEC_STMTS (slp_node_instance->reduc_phis));
7049 else
7050 phis.quick_push (STMT_VINFO_VEC_STMT (reduc_def_info));
7052 for (j = 0; j < ncopies; j++)
7054 if (code == COND_EXPR)
7056 gcc_assert (!slp_node);
7057 vectorizable_condition (stmt_info, gsi, vec_stmt,
7058 PHI_RESULT (phis[0]->stmt),
7059 reduc_index, NULL, NULL);
7060 /* Multiple types are not supported for condition. */
7061 break;
7064 /* Handle uses. */
7065 if (j == 0)
7067 if (slp_node)
7069 /* Get vec defs for all the operands except the reduction index,
7070 ensuring the ordering of the ops in the vector is kept. */
7071 auto_vec<tree, 3> slp_ops;
7072 auto_vec<vec<tree>, 3> vec_defs;
7074 slp_ops.quick_push (ops[0]);
7075 slp_ops.quick_push (ops[1]);
7076 if (op_type == ternary_op)
7077 slp_ops.quick_push (ops[2]);
7079 vect_get_slp_defs (slp_ops, slp_node, &vec_defs);
7081 vec_oprnds0.safe_splice (vec_defs[0]);
7082 vec_defs[0].release ();
7083 vec_oprnds1.safe_splice (vec_defs[1]);
7084 vec_defs[1].release ();
7085 if (op_type == ternary_op)
7087 vec_oprnds2.safe_splice (vec_defs[2]);
7088 vec_defs[2].release ();
7091 else
7093 vec_oprnds0.quick_push
7094 (vect_get_vec_def_for_operand (ops[0], stmt_info));
7095 vec_oprnds1.quick_push
7096 (vect_get_vec_def_for_operand (ops[1], stmt_info));
7097 if (op_type == ternary_op)
7098 vec_oprnds2.quick_push
7099 (vect_get_vec_def_for_operand (ops[2], stmt_info));
7102 else
7104 if (!slp_node)
7106 gcc_assert (reduc_index != -1 || ! single_defuse_cycle);
7108 if (single_defuse_cycle && reduc_index == 0)
7109 vec_oprnds0[0] = gimple_get_lhs (new_stmt_info->stmt);
7110 else
7111 vec_oprnds0[0]
7112 = vect_get_vec_def_for_stmt_copy (loop_vinfo,
7113 vec_oprnds0[0]);
7114 if (single_defuse_cycle && reduc_index == 1)
7115 vec_oprnds1[0] = gimple_get_lhs (new_stmt_info->stmt);
7116 else
7117 vec_oprnds1[0]
7118 = vect_get_vec_def_for_stmt_copy (loop_vinfo,
7119 vec_oprnds1[0]);
7120 if (op_type == ternary_op)
7122 if (single_defuse_cycle && reduc_index == 2)
7123 vec_oprnds2[0] = gimple_get_lhs (new_stmt_info->stmt);
7124 else
7125 vec_oprnds2[0]
7126 = vect_get_vec_def_for_stmt_copy (loop_vinfo,
7127 vec_oprnds2[0]);
7132 FOR_EACH_VEC_ELT (vec_oprnds0, i, def0)
7134 tree vop[3] = { def0, vec_oprnds1[i], NULL_TREE };
7135 if (masked_loop_p)
7137 /* Make sure that the reduction accumulator is vop[0]. */
7138 if (reduc_index == 1)
7140 gcc_assert (commutative_tree_code (code));
7141 std::swap (vop[0], vop[1]);
7143 tree mask = vect_get_loop_mask (gsi, masks, vec_num * ncopies,
7144 vectype_in, i * ncopies + j);
7145 gcall *call = gimple_build_call_internal (cond_fn, 4, mask,
7146 vop[0], vop[1],
7147 vop[0]);
7148 new_temp = make_ssa_name (vec_dest, call);
7149 gimple_call_set_lhs (call, new_temp);
7150 gimple_call_set_nothrow (call, true);
7151 new_stmt_info
7152 = vect_finish_stmt_generation (stmt_info, call, gsi);
7154 else
7156 if (op_type == ternary_op)
7157 vop[2] = vec_oprnds2[i];
7159 gassign *new_stmt = gimple_build_assign (vec_dest, code,
7160 vop[0], vop[1], vop[2]);
7161 new_temp = make_ssa_name (vec_dest, new_stmt);
7162 gimple_assign_set_lhs (new_stmt, new_temp);
7163 new_stmt_info
7164 = vect_finish_stmt_generation (stmt_info, new_stmt, gsi);
7167 if (slp_node)
7169 SLP_TREE_VEC_STMTS (slp_node).quick_push (new_stmt_info);
7170 vect_defs.quick_push (new_temp);
7172 else
7173 vect_defs[0] = new_temp;
7176 if (slp_node)
7177 continue;
7179 if (j == 0)
7180 STMT_VINFO_VEC_STMT (stmt_info) = *vec_stmt = new_stmt_info;
7181 else
7182 STMT_VINFO_RELATED_STMT (prev_stmt_info) = new_stmt_info;
7184 prev_stmt_info = new_stmt_info;
7187 /* Finalize the reduction-phi (set its arguments) and create the
7188 epilog reduction code. */
7189 if ((!single_defuse_cycle || code == COND_EXPR) && !slp_node)
7190 vect_defs[0] = gimple_get_lhs ((*vec_stmt)->stmt);
7192 vect_create_epilog_for_reduction (vect_defs, stmt_info, reduc_def_phi,
7193 epilog_copies, reduc_fn, phis,
7194 double_reduc, slp_node, slp_node_instance,
7195 cond_reduc_val, cond_reduc_op_code,
7196 neutral_op);
7198 return true;
7201 /* Function vect_min_worthwhile_factor.
7203 For a loop where we could vectorize the operation indicated by CODE,
7204 return the minimum vectorization factor that makes it worthwhile
7205 to use generic vectors. */
7206 static unsigned int
7207 vect_min_worthwhile_factor (enum tree_code code)
7209 switch (code)
7211 case PLUS_EXPR:
7212 case MINUS_EXPR:
7213 case NEGATE_EXPR:
7214 return 4;
7216 case BIT_AND_EXPR:
7217 case BIT_IOR_EXPR:
7218 case BIT_XOR_EXPR:
7219 case BIT_NOT_EXPR:
7220 return 2;
7222 default:
7223 return INT_MAX;
7227 /* Return true if VINFO indicates we are doing loop vectorization and if
7228 it is worth decomposing CODE operations into scalar operations for
7229 that loop's vectorization factor. */
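/* For example, with a constant vectorization factor of 2 the bitwise
   operations above qualify (minimum factor 2) while PLUS_EXPR does not
   (minimum factor 4); with a factor of 4 both do.  */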
7231 bool
7232 vect_worthwhile_without_simd_p (vec_info *vinfo, tree_code code)
7234 loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo);
7235 unsigned HOST_WIDE_INT value;
7236 return (loop_vinfo
7237 && LOOP_VINFO_VECT_FACTOR (loop_vinfo).is_constant (&value)
7238 && value >= vect_min_worthwhile_factor (code));
7241 /* Function vectorizable_induction
7243 Check if STMT_INFO performs an induction computation that can be vectorized.
7244 If VEC_STMT is also passed, vectorize the induction PHI: create a vectorized
7245 phi to replace it, put it in VEC_STMT, and add it to the same basic block.
7246 Return true if STMT_INFO is vectorizable in this way. */
7248 bool
7249 vectorizable_induction (stmt_vec_info stmt_info,
7250 gimple_stmt_iterator *gsi ATTRIBUTE_UNUSED,
7251 stmt_vec_info *vec_stmt, slp_tree slp_node,
7252 stmt_vector_for_cost *cost_vec)
7254 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
7255 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
7256 unsigned ncopies;
7257 bool nested_in_vect_loop = false;
7258 struct loop *iv_loop;
7259 tree vec_def;
7260 edge pe = loop_preheader_edge (loop);
7261 basic_block new_bb;
7262 tree new_vec, vec_init, vec_step, t;
7263 tree new_name;
7264 gimple *new_stmt;
7265 gphi *induction_phi;
7266 tree induc_def, vec_dest;
7267 tree init_expr, step_expr;
7268 poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
7269 unsigned i;
7270 tree expr;
7271 gimple_seq stmts;
7272 imm_use_iterator imm_iter;
7273 use_operand_p use_p;
7274 gimple *exit_phi;
7275 edge latch_e;
7276 tree loop_arg;
7277 gimple_stmt_iterator si;
7279 gphi *phi = dyn_cast <gphi *> (stmt_info->stmt);
7280 if (!phi)
7281 return false;
7283 if (!STMT_VINFO_RELEVANT_P (stmt_info))
7284 return false;
7286 /* Make sure it was recognized as induction computation. */
7287 if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_induction_def)
7288 return false;
7290 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
7291 poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (vectype);
7293 if (slp_node)
7294 ncopies = 1;
7295 else
7296 ncopies = vect_get_num_copies (loop_vinfo, vectype);
7297 gcc_assert (ncopies >= 1);
7299 /* FORNOW. These restrictions should be relaxed. */
7300 if (nested_in_vect_loop_p (loop, stmt_info))
7302 imm_use_iterator imm_iter;
7303 use_operand_p use_p;
7304 gimple *exit_phi;
7305 edge latch_e;
7306 tree loop_arg;
7308 if (ncopies > 1)
7310 if (dump_enabled_p ())
7311 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7312 "multiple types in nested loop.\n");
7313 return false;
7316 /* FORNOW: outer loop induction with SLP not supported. */
7317 if (STMT_SLP_TYPE (stmt_info))
7318 return false;
7320 exit_phi = NULL;
7321 latch_e = loop_latch_edge (loop->inner);
7322 loop_arg = PHI_ARG_DEF_FROM_EDGE (phi, latch_e);
7323 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, loop_arg)
7325 gimple *use_stmt = USE_STMT (use_p);
7326 if (is_gimple_debug (use_stmt))
7327 continue;
7329 if (!flow_bb_inside_loop_p (loop->inner, gimple_bb (use_stmt)))
7331 exit_phi = use_stmt;
7332 break;
7335 if (exit_phi)
7337 stmt_vec_info exit_phi_vinfo = loop_vinfo->lookup_stmt (exit_phi);
7338 if (!(STMT_VINFO_RELEVANT_P (exit_phi_vinfo)
7339 && !STMT_VINFO_LIVE_P (exit_phi_vinfo)))
7341 if (dump_enabled_p ())
7342 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7343 "inner-loop induction only used outside "
7344 "of the outer vectorized loop.\n");
7345 return false;
7349 nested_in_vect_loop = true;
7350 iv_loop = loop->inner;
7352 else
7353 iv_loop = loop;
7354 gcc_assert (iv_loop == (gimple_bb (phi))->loop_father);
7356 if (slp_node && !nunits.is_constant ())
7358 /* The current SLP code creates the initial value element-by-element. */
7359 if (dump_enabled_p ())
7360 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7361 "SLP induction not supported for variable-length"
7362 " vectors.\n");
7363 return false;
7366 if (!vec_stmt) /* transformation not required. */
7368 STMT_VINFO_TYPE (stmt_info) = induc_vec_info_type;
7369 DUMP_VECT_SCOPE ("vectorizable_induction");
7370 vect_model_induction_cost (stmt_info, ncopies, cost_vec);
7371 return true;
7374 /* Transform. */
7376 /* Compute a vector variable, initialized with the first VF values of
7377 the induction variable. E.g., for an iv with IV_PHI='X' and
7378 evolution S, for a vector of 4 units, we want to compute:
7379 [X, X + S, X + 2*S, X + 3*S]. */
7381 if (dump_enabled_p ())
7382 dump_printf_loc (MSG_NOTE, vect_location, "transform induction phi.\n");
7384 latch_e = loop_latch_edge (iv_loop);
7385 loop_arg = PHI_ARG_DEF_FROM_EDGE (phi, latch_e);
7387 step_expr = STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_info);
7388 gcc_assert (step_expr != NULL_TREE);
7390 pe = loop_preheader_edge (iv_loop);
7391 init_expr = PHI_ARG_DEF_FROM_EDGE (phi,
7392 loop_preheader_edge (iv_loop));
7394 stmts = NULL;
7395 if (!nested_in_vect_loop)
7397 /* Convert the initial value to the desired type. */
7398 tree new_type = TREE_TYPE (vectype);
7399 init_expr = gimple_convert (&stmts, new_type, init_expr);
7401 /* If we are using the loop mask to "peel" for alignment then we need
7402 to adjust the start value here. */
7403 tree skip_niters = LOOP_VINFO_MASK_SKIP_NITERS (loop_vinfo);
7404 if (skip_niters != NULL_TREE)
7406 if (FLOAT_TYPE_P (vectype))
7407 skip_niters = gimple_build (&stmts, FLOAT_EXPR, new_type,
7408 skip_niters);
7409 else
7410 skip_niters = gimple_convert (&stmts, new_type, skip_niters);
7411 tree skip_step = gimple_build (&stmts, MULT_EXPR, new_type,
7412 skip_niters, step_expr);
7413 init_expr = gimple_build (&stmts, MINUS_EXPR, new_type,
7414 init_expr, skip_step);
7418 /* Convert the step to the desired type. */
7419 step_expr = gimple_convert (&stmts, TREE_TYPE (vectype), step_expr);
7421 if (stmts)
7423 new_bb = gsi_insert_seq_on_edge_immediate (pe, stmts);
7424 gcc_assert (!new_bb);
7427 /* Find the first insertion point in the BB. */
7428 basic_block bb = gimple_bb (phi);
7429 si = gsi_after_labels (bb);
7431 /* For SLP induction we have to generate several IVs; for example, with
7432 group size 3 we need [i, i, i, i + S], [i + S, i + S, i + 2*S, i + 2*S],
7433 [i + 2*S, i + 3*S, i + 3*S, i + 3*S]. The step is the same uniform
7434 [VF*S, VF*S, VF*S, VF*S] for all of them. */
7435 if (slp_node)
7437 /* Enforced above. */
7438 unsigned int const_nunits = nunits.to_constant ();
7440 /* Generate [VF*S, VF*S, ... ]. */
7441 if (SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr)))
7443 expr = build_int_cst (integer_type_node, vf);
7444 expr = fold_convert (TREE_TYPE (step_expr), expr);
7446 else
7447 expr = build_int_cst (TREE_TYPE (step_expr), vf);
7448 new_name = fold_build2 (MULT_EXPR, TREE_TYPE (step_expr),
7449 expr, step_expr);
7450 if (! CONSTANT_CLASS_P (new_name))
7451 new_name = vect_init_vector (stmt_info, new_name,
7452 TREE_TYPE (step_expr), NULL);
7453 new_vec = build_vector_from_val (vectype, new_name);
7454 vec_step = vect_init_vector (stmt_info, new_vec, vectype, NULL);
7456 /* Now generate the IVs. */
7457 unsigned group_size = SLP_TREE_SCALAR_STMTS (slp_node).length ();
7458 unsigned nvects = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node);
7459 unsigned elts = const_nunits * nvects;
7460 unsigned nivs = least_common_multiple (group_size,
7461 const_nunits) / const_nunits;
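/* E.g. for group_size 3 and const_nunits 4 (the example in the comment
   above), nivs = lcm (3, 4) / 4 = 3 IV vectors are needed.  */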
7462 gcc_assert (elts % group_size == 0);
7463 tree elt = init_expr;
7464 unsigned ivn;
7465 for (ivn = 0; ivn < nivs; ++ivn)
7467 tree_vector_builder elts (vectype, const_nunits, 1);
7468 stmts = NULL;
7469 for (unsigned eltn = 0; eltn < const_nunits; ++eltn)
7471 if (ivn*const_nunits + eltn >= group_size
7472 && (ivn * const_nunits + eltn) % group_size == 0)
7473 elt = gimple_build (&stmts, PLUS_EXPR, TREE_TYPE (elt),
7474 elt, step_expr);
7475 elts.quick_push (elt);
7477 vec_init = gimple_build_vector (&stmts, &elts);
7478 if (stmts)
7480 new_bb = gsi_insert_seq_on_edge_immediate (pe, stmts);
7481 gcc_assert (!new_bb);
7484 /* Create the induction-phi that defines the induction-operand. */
7485 vec_dest = vect_get_new_vect_var (vectype, vect_simple_var, "vec_iv_");
7486 induction_phi = create_phi_node (vec_dest, iv_loop->header);
7487 stmt_vec_info induction_phi_info
7488 = loop_vinfo->add_stmt (induction_phi);
7489 induc_def = PHI_RESULT (induction_phi);
7491 /* Create the iv update inside the loop */
7492 vec_def = make_ssa_name (vec_dest);
7493 new_stmt = gimple_build_assign (vec_def, PLUS_EXPR, induc_def, vec_step);
7494 gsi_insert_before (&si, new_stmt, GSI_SAME_STMT);
7495 loop_vinfo->add_stmt (new_stmt);
7497 /* Set the arguments of the phi node: */
7498 add_phi_arg (induction_phi, vec_init, pe, UNKNOWN_LOCATION);
7499 add_phi_arg (induction_phi, vec_def, loop_latch_edge (iv_loop),
7500 UNKNOWN_LOCATION);
7502 SLP_TREE_VEC_STMTS (slp_node).quick_push (induction_phi_info);
7505 /* Re-use IVs when we can. */
7506 if (ivn < nvects)
7508 unsigned vfp
7509 = least_common_multiple (group_size, const_nunits) / group_size;
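/* E.g. for group_size 3 and const_nunits 4, VF' = lcm (3, 4) / 3 = 4:
   each reused IV vector advances every lane by four steps.  */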
7510 /* Generate [VF'*S, VF'*S, ... ]. */
7511 if (SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr)))
7513 expr = build_int_cst (integer_type_node, vfp);
7514 expr = fold_convert (TREE_TYPE (step_expr), expr);
7516 else
7517 expr = build_int_cst (TREE_TYPE (step_expr), vfp);
7518 new_name = fold_build2 (MULT_EXPR, TREE_TYPE (step_expr),
7519 expr, step_expr);
7520 if (! CONSTANT_CLASS_P (new_name))
7521 new_name = vect_init_vector (stmt_info, new_name,
7522 TREE_TYPE (step_expr), NULL);
7523 new_vec = build_vector_from_val (vectype, new_name);
7524 vec_step = vect_init_vector (stmt_info, new_vec, vectype, NULL);
7525 for (; ivn < nvects; ++ivn)
7527 gimple *iv = SLP_TREE_VEC_STMTS (slp_node)[ivn - nivs]->stmt;
7528 tree def;
7529 if (gimple_code (iv) == GIMPLE_PHI)
7530 def = gimple_phi_result (iv);
7531 else
7532 def = gimple_assign_lhs (iv);
7533 new_stmt = gimple_build_assign (make_ssa_name (vectype),
7534 PLUS_EXPR,
7535 def, vec_step);
7536 if (gimple_code (iv) == GIMPLE_PHI)
7537 gsi_insert_before (&si, new_stmt, GSI_SAME_STMT);
7538 else
7540 gimple_stmt_iterator tgsi = gsi_for_stmt (iv);
7541 gsi_insert_after (&tgsi, new_stmt, GSI_CONTINUE_LINKING);
7543 SLP_TREE_VEC_STMTS (slp_node).quick_push
7544 (loop_vinfo->add_stmt (new_stmt));
7548 return true;
7551 /* Create the vector that holds the initial_value of the induction. */
7552 if (nested_in_vect_loop)
7554 /* iv_loop is nested in the loop to be vectorized. init_expr has already
7555 been created during vectorization of previous stmts. We obtain it
7556 from the STMT_VINFO_VEC_STMT of the defining stmt. */
7557 vec_init = vect_get_vec_def_for_operand (init_expr, stmt_info);
7558 /* If the initial value is not of proper type, convert it. */
7559 if (!useless_type_conversion_p (vectype, TREE_TYPE (vec_init)))
7561 new_stmt
7562 = gimple_build_assign (vect_get_new_ssa_name (vectype,
7563 vect_simple_var,
7564 "vec_iv_"),
7565 VIEW_CONVERT_EXPR,
7566 build1 (VIEW_CONVERT_EXPR, vectype,
7567 vec_init));
7568 vec_init = gimple_assign_lhs (new_stmt);
7569 new_bb = gsi_insert_on_edge_immediate (loop_preheader_edge (iv_loop),
7570 new_stmt);
7571 gcc_assert (!new_bb);
7572 loop_vinfo->add_stmt (new_stmt);
7575 else
7577 /* iv_loop is the loop to be vectorized. Create:
7578 vec_init = [X, X+S, X+2*S, X+3*S] (S = step_expr, X = init_expr) */
7579 stmts = NULL;
7580 new_name = gimple_convert (&stmts, TREE_TYPE (vectype), init_expr);
7582 unsigned HOST_WIDE_INT const_nunits;
7583 if (nunits.is_constant (&const_nunits))
7585 tree_vector_builder elts (vectype, const_nunits, 1);
7586 elts.quick_push (new_name);
7587 for (i = 1; i < const_nunits; i++)
7589 /* Create: new_name_i = new_name + step_expr */
7590 new_name = gimple_build (&stmts, PLUS_EXPR, TREE_TYPE (new_name),
7591 new_name, step_expr);
7592 elts.quick_push (new_name);
7594 /* Create a vector from [new_name_0, new_name_1, ...,
7595 new_name_nunits-1] */
7596 vec_init = gimple_build_vector (&stmts, &elts);
7598 else if (INTEGRAL_TYPE_P (TREE_TYPE (step_expr)))
7599 /* Build the initial value directly from a VEC_SERIES_EXPR. */
7600 vec_init = gimple_build (&stmts, VEC_SERIES_EXPR, vectype,
7601 new_name, step_expr);
7602 else
7604 /* Build:
7605 [base, base, base, ...]
7606 + (vectype) [0, 1, 2, ...] * [step, step, step, ...]. */
7607 gcc_assert (SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr)));
7608 gcc_assert (flag_associative_math);
7609 tree index = build_index_vector (vectype, 0, 1);
7610 tree base_vec = gimple_build_vector_from_val (&stmts, vectype,
7611 new_name);
7612 tree step_vec = gimple_build_vector_from_val (&stmts, vectype,
7613 step_expr);
7614 vec_init = gimple_build (&stmts, FLOAT_EXPR, vectype, index);
7615 vec_init = gimple_build (&stmts, MULT_EXPR, vectype,
7616 vec_init, step_vec);
7617 vec_init = gimple_build (&stmts, PLUS_EXPR, vectype,
7618 vec_init, base_vec);
7621 if (stmts)
7623 new_bb = gsi_insert_seq_on_edge_immediate (pe, stmts);
7624 gcc_assert (!new_bb);
7629 /* Create the vector that holds the step of the induction. */
7630 if (nested_in_vect_loop)
7631 /* iv_loop is nested in the loop to be vectorized. Generate:
7632 vec_step = [S, S, S, S] */
7633 new_name = step_expr;
7634 else
7636 /* iv_loop is the loop to be vectorized. Generate:
7637 vec_step = [VF*S, VF*S, VF*S, VF*S] */
7638 gimple_seq seq = NULL;
7639 if (SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr)))
7641 expr = build_int_cst (integer_type_node, vf);
7642 expr = gimple_build (&seq, FLOAT_EXPR, TREE_TYPE (step_expr), expr);
7644 else
7645 expr = build_int_cst (TREE_TYPE (step_expr), vf);
7646 new_name = gimple_build (&seq, MULT_EXPR, TREE_TYPE (step_expr),
7647 expr, step_expr);
7648 if (seq)
7650 new_bb = gsi_insert_seq_on_edge_immediate (pe, seq);
7651 gcc_assert (!new_bb);
7655 t = unshare_expr (new_name);
7656 gcc_assert (CONSTANT_CLASS_P (new_name)
7657 || TREE_CODE (new_name) == SSA_NAME);
7658 new_vec = build_vector_from_val (vectype, t);
7659 vec_step = vect_init_vector (stmt_info, new_vec, vectype, NULL);
7662 /* Create the following def-use cycle:
7663 loop prolog:
7664 vec_init = ...
7665 vec_step = ...
7666 loop:
7667 vec_iv = PHI <vec_init, vec_loop>
7669 STMT
7671 vec_loop = vec_iv + vec_step; */
7673 /* Create the induction-phi that defines the induction-operand. */
7674 vec_dest = vect_get_new_vect_var (vectype, vect_simple_var, "vec_iv_");
7675 induction_phi = create_phi_node (vec_dest, iv_loop->header);
7676 stmt_vec_info induction_phi_info = loop_vinfo->add_stmt (induction_phi);
7677 induc_def = PHI_RESULT (induction_phi);
7679 /* Create the iv update inside the loop */
7680 vec_def = make_ssa_name (vec_dest);
7681 new_stmt = gimple_build_assign (vec_def, PLUS_EXPR, induc_def, vec_step);
7682 gsi_insert_before (&si, new_stmt, GSI_SAME_STMT);
7683 stmt_vec_info new_stmt_info = loop_vinfo->add_stmt (new_stmt);
7685 /* Set the arguments of the phi node: */
7686 add_phi_arg (induction_phi, vec_init, pe, UNKNOWN_LOCATION);
7687 add_phi_arg (induction_phi, vec_def, loop_latch_edge (iv_loop),
7688 UNKNOWN_LOCATION);
7690 STMT_VINFO_VEC_STMT (stmt_info) = *vec_stmt = induction_phi_info;
7692 /* In case the vectorization factor (VF) is bigger than the number
7693 of elements that we can fit in a vectype (nunits), we have to generate
7694 more than one vector stmt - i.e., we need to "unroll" the
7695 vector stmt by a factor VF/nunits. For more details see documentation
7696 in vectorizable_operation. */
7698 if (ncopies > 1)
7700 gimple_seq seq = NULL;
7701 stmt_vec_info prev_stmt_vinfo;
7702 /* FORNOW. This restriction should be relaxed. */
7703 gcc_assert (!nested_in_vect_loop);
7705 /* Create the vector that holds the step of the induction. */
7706 if (SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr)))
7708 expr = build_int_cst (integer_type_node, nunits);
7709 expr = gimple_build (&seq, FLOAT_EXPR, TREE_TYPE (step_expr), expr);
7711 else
7712 expr = build_int_cst (TREE_TYPE (step_expr), nunits);
7713 new_name = gimple_build (&seq, MULT_EXPR, TREE_TYPE (step_expr),
7714 expr, step_expr);
7715 if (seq)
7717 new_bb = gsi_insert_seq_on_edge_immediate (pe, seq);
7718 gcc_assert (!new_bb);
7721 t = unshare_expr (new_name);
7722 gcc_assert (CONSTANT_CLASS_P (new_name)
7723 || TREE_CODE (new_name) == SSA_NAME);
7724 new_vec = build_vector_from_val (vectype, t);
7725 vec_step = vect_init_vector (stmt_info, new_vec, vectype, NULL);
7727 vec_def = induc_def;
7728 prev_stmt_vinfo = induction_phi_info;
7729 for (i = 1; i < ncopies; i++)
7731 /* vec_i = vec_prev + vec_step */
7732 new_stmt = gimple_build_assign (vec_dest, PLUS_EXPR,
7733 vec_def, vec_step);
7734 vec_def = make_ssa_name (vec_dest, new_stmt);
7735 gimple_assign_set_lhs (new_stmt, vec_def);
7737 gsi_insert_before (&si, new_stmt, GSI_SAME_STMT);
7738 new_stmt_info = loop_vinfo->add_stmt (new_stmt);
7739 STMT_VINFO_RELATED_STMT (prev_stmt_vinfo) = new_stmt_info;
7740 prev_stmt_vinfo = new_stmt_info;
7744 if (nested_in_vect_loop)
7746 /* Find the loop-closed exit-phi of the induction, and record
7747 the final vector of induction results: */
7748 exit_phi = NULL;
7749 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, loop_arg)
7751 gimple *use_stmt = USE_STMT (use_p);
7752 if (is_gimple_debug (use_stmt))
7753 continue;
7755 if (!flow_bb_inside_loop_p (iv_loop, gimple_bb (use_stmt)))
7757 exit_phi = use_stmt;
7758 break;
7761 if (exit_phi)
7763 stmt_vec_info stmt_vinfo = loop_vinfo->lookup_stmt (exit_phi);
7764 /* FORNOW. Currently not supporting the case that an inner-loop induction
7765 is not used in the outer-loop (i.e. only outside the outer-loop). */
7766 gcc_assert (STMT_VINFO_RELEVANT_P (stmt_vinfo)
7767 && !STMT_VINFO_LIVE_P (stmt_vinfo));
7769 STMT_VINFO_VEC_STMT (stmt_vinfo) = new_stmt_info;
7770 if (dump_enabled_p ())
7772 dump_printf_loc (MSG_NOTE, vect_location,
7773 "vector of inductions after inner-loop:");
7774 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, new_stmt, 0);
7780 if (dump_enabled_p ())
7782 dump_printf_loc (MSG_NOTE, vect_location,
7783 "transform induction: created def-use cycle: ");
7784 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, induction_phi, 0);
7785 dump_gimple_stmt (MSG_NOTE, TDF_SLIM,
7786 SSA_NAME_DEF_STMT (vec_def), 0);
7789 return true;
7792 /* Function vectorizable_live_operation.
7794 STMT_INFO computes a value that is used outside the loop. Check if
7795 it can be supported. */
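/* A typical example is the value of the last scalar iteration that is
   read after the loop; the scalar result is extracted from the final
   lane of the last vector copy (or via EXTRACT_LAST for fully-masked
   loops).  */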
7797 bool
7798 vectorizable_live_operation (stmt_vec_info stmt_info,
7799 gimple_stmt_iterator *gsi ATTRIBUTE_UNUSED,
7800 slp_tree slp_node, int slp_index,
7801 stmt_vec_info *vec_stmt,
7802 stmt_vector_for_cost *)
7804 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
7805 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
7806 imm_use_iterator imm_iter;
7807 tree lhs, lhs_type, bitsize, vec_bitsize;
7808 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
7809 poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (vectype);
7810 int ncopies;
7811 gimple *use_stmt;
7812 auto_vec<tree> vec_oprnds;
7813 int vec_entry = 0;
7814 poly_uint64 vec_index = 0;
7816 gcc_assert (STMT_VINFO_LIVE_P (stmt_info));
7818 if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_reduction_def)
7819 return false;
7821 /* FORNOW. CHECKME. */
7822 if (nested_in_vect_loop_p (loop, stmt_info))
7823 return false;
7825 /* If STMT is not relevant, is a simple assignment, and its inputs are
7826 invariant, then it can remain in place, unvectorized. The original last
7827 scalar value that it computes will be used. */
7828 if (!STMT_VINFO_RELEVANT_P (stmt_info))
7830 gcc_assert (is_simple_and_all_uses_invariant (stmt_info, loop_vinfo));
7831 if (dump_enabled_p ())
7832 dump_printf_loc (MSG_NOTE, vect_location,
7833 "statement is simple and uses invariant. Leaving in "
7834 "place.\n");
7835 return true;
7838 if (slp_node)
7839 ncopies = 1;
7840 else
7841 ncopies = vect_get_num_copies (loop_vinfo, vectype);
7843 if (slp_node)
7845 gcc_assert (slp_index >= 0);
7847 int num_scalar = SLP_TREE_SCALAR_STMTS (slp_node).length ();
7848 int num_vec = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node);
7850 /* Get the last occurrence of the scalar index from the concatenation of
7851 all the slp vectors. Calculate which slp vector it is and the index
7852 within. */
7853 poly_uint64 pos = (num_vec * nunits) - num_scalar + slp_index;
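/* E.g. with num_vec == 3, nunits == 4 and num_scalar == 3, slp_index 1
   gives pos = 12 - 3 + 1 = 10, i.e. vec_entry 2 and vec_index 2.  */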
7855 /* Calculate which vector contains the result, and which lane of
7856 that vector we need. */
7857 if (!can_div_trunc_p (pos, nunits, &vec_entry, &vec_index))
7859 if (dump_enabled_p ())
7860 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7861 "Cannot determine which vector holds the"
7862 " final result.\n");
7863 return false;
7867 if (!vec_stmt)
7869 /* No transformation required. */
7870 if (LOOP_VINFO_CAN_FULLY_MASK_P (loop_vinfo))
7872 if (!direct_internal_fn_supported_p (IFN_EXTRACT_LAST, vectype,
7873 OPTIMIZE_FOR_SPEED))
7875 if (dump_enabled_p ())
7876 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7877 "can't use a fully-masked loop because "
7878 "the target doesn't support extract last "
7879 "reduction.\n");
7880 LOOP_VINFO_CAN_FULLY_MASK_P (loop_vinfo) = false;
7882 else if (slp_node)
7884 if (dump_enabled_p ())
7885 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7886 "can't use a fully-masked loop because an "
7887 "SLP statement is live after the loop.\n");
7888 LOOP_VINFO_CAN_FULLY_MASK_P (loop_vinfo) = false;
7890 else if (ncopies > 1)
7892 if (dump_enabled_p ())
7893 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7894 "can't use a fully-masked loop because"
7895 " ncopies is greater than 1.\n");
7896 LOOP_VINFO_CAN_FULLY_MASK_P (loop_vinfo) = false;
7898 else
7900 gcc_assert (ncopies == 1 && !slp_node);
7901 vect_record_loop_mask (loop_vinfo,
7902 &LOOP_VINFO_MASKS (loop_vinfo),
7903 1, vectype);
7906 return true;
7909 /* If stmt has a related stmt, then use that for getting the lhs. */
7910 gimple *stmt = (is_pattern_stmt_p (stmt_info)
7911 ? STMT_VINFO_RELATED_STMT (stmt_info)->stmt
7912 : stmt_info->stmt);
7914 lhs = (is_a <gphi *> (stmt)) ? gimple_phi_result (stmt)
7915 : gimple_get_lhs (stmt);
7916 lhs_type = TREE_TYPE (lhs);
7918 bitsize = (VECTOR_BOOLEAN_TYPE_P (vectype)
7919 ? bitsize_int (TYPE_PRECISION (TREE_TYPE (vectype)))
7920 : TYPE_SIZE (TREE_TYPE (vectype)));
7921 vec_bitsize = TYPE_SIZE (vectype);
7923 /* Get the vectorized lhs of STMT and the lane to use (counted in bits). */
7924 tree vec_lhs, bitstart;
7925 if (slp_node)
7927 gcc_assert (!LOOP_VINFO_FULLY_MASKED_P (loop_vinfo));
7929 /* Get the correct slp vectorized stmt. */
7930 gimple *vec_stmt = SLP_TREE_VEC_STMTS (slp_node)[vec_entry]->stmt;
7931 if (gphi *phi = dyn_cast <gphi *> (vec_stmt))
7932 vec_lhs = gimple_phi_result (phi);
7933 else
7934 vec_lhs = gimple_get_lhs (vec_stmt);
7936 /* Get entry to use. */
7937 bitstart = bitsize_int (vec_index);
7938 bitstart = int_const_binop (MULT_EXPR, bitsize, bitstart);
7940 else
7942 enum vect_def_type dt = STMT_VINFO_DEF_TYPE (stmt_info);
7943 vec_lhs = vect_get_vec_def_for_operand_1 (stmt_info, dt);
7944 gcc_checking_assert (ncopies == 1
7945 || !LOOP_VINFO_FULLY_MASKED_P (loop_vinfo));
7947 /* For multiple copies, get the last copy. */
7948 for (int i = 1; i < ncopies; ++i)
7949 vec_lhs = vect_get_vec_def_for_stmt_copy (loop_vinfo, vec_lhs);
7951 /* Get the last lane in the vector. */
7952 bitstart = int_const_binop (MINUS_EXPR, vec_bitsize, bitsize);
7955 gimple_seq stmts = NULL;
7956 tree new_tree;
7957 if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
7959 /* Emit:
7961 SCALAR_RES = EXTRACT_LAST <VEC_LHS, MASK>
7963 where VEC_LHS is the vectorized live-out result and MASK is
7964 the loop mask for the final iteration. */
7965 gcc_assert (ncopies == 1 && !slp_node);
7966 tree scalar_type = TREE_TYPE (STMT_VINFO_VECTYPE (stmt_info));
7967 tree mask = vect_get_loop_mask (gsi, &LOOP_VINFO_MASKS (loop_vinfo),
7968 1, vectype, 0);
7969 tree scalar_res = gimple_build (&stmts, CFN_EXTRACT_LAST,
7970 scalar_type, mask, vec_lhs);
7972 /* Convert the extracted vector element to the required scalar type. */
7973 new_tree = gimple_convert (&stmts, lhs_type, scalar_res);
7975 else
7977 tree bftype = TREE_TYPE (vectype);
7978 if (VECTOR_BOOLEAN_TYPE_P (vectype))
7979 bftype = build_nonstandard_integer_type (tree_to_uhwi (bitsize), 1);
7980 new_tree = build3 (BIT_FIELD_REF, bftype, vec_lhs, bitsize, bitstart);
7981 new_tree = force_gimple_operand (fold_convert (lhs_type, new_tree),
7982 &stmts, true, NULL_TREE);
7985 if (stmts)
7986 gsi_insert_seq_on_edge_immediate (single_exit (loop), stmts);
7988 /* Replace the use of lhs with the newly computed result. If the use stmt is
7989 a single-argument PHI, just replace all uses of the PHI result. This is
7990 necessary because the lcssa PHI defining lhs may come before the newly inserted stmt. */
7991 use_operand_p use_p;
7992 FOR_EACH_IMM_USE_STMT (use_stmt, imm_iter, lhs)
7993 if (!flow_bb_inside_loop_p (loop, gimple_bb (use_stmt))
7994 && !is_gimple_debug (use_stmt))
7996 if (gimple_code (use_stmt) == GIMPLE_PHI
7997 && gimple_phi_num_args (use_stmt) == 1)
7999 replace_uses_by (gimple_phi_result (use_stmt), new_tree);
8001 else
8003 FOR_EACH_IMM_USE_ON_STMT (use_p, imm_iter)
8004 SET_USE (use_p, new_tree);
8006 update_stmt (use_stmt);
8009 return true;
8012 /* Kill any debug uses outside LOOP of SSA names defined in STMT_INFO. */
8014 static void
8015 vect_loop_kill_debug_uses (struct loop *loop, stmt_vec_info stmt_info)
8017 ssa_op_iter op_iter;
8018 imm_use_iterator imm_iter;
8019 def_operand_p def_p;
8020 gimple *ustmt;
8022 FOR_EACH_PHI_OR_STMT_DEF (def_p, stmt_info->stmt, op_iter, SSA_OP_DEF)
8024 FOR_EACH_IMM_USE_STMT (ustmt, imm_iter, DEF_FROM_PTR (def_p))
8026 basic_block bb;
8028 if (!is_gimple_debug (ustmt))
8029 continue;
8031 bb = gimple_bb (ustmt);
8033 if (!flow_bb_inside_loop_p (loop, bb))
8035 if (gimple_debug_bind_p (ustmt))
8037 if (dump_enabled_p ())
8038 dump_printf_loc (MSG_NOTE, vect_location,
8039 "killing debug use\n");
8041 gimple_debug_bind_reset_value (ustmt);
8042 update_stmt (ustmt);
8044 else
8045 gcc_unreachable ();
8051 /* Given loop represented by LOOP_VINFO, return true if computation of
8052 LOOP_VINFO_NITERS (= LOOP_VINFO_NITERSM1 + 1) doesn't overflow, false
8053 otherwise. */
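/* For example, if NITERSM1 is the maximum value of a 32-bit unsigned
   type, NITERSM1 + 1 wraps to zero, so NITERS would overflow.  */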
8055 static bool
8056 loop_niters_no_overflow (loop_vec_info loop_vinfo)
8058 /* Constant case. */
8059 if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
8061 tree cst_niters = LOOP_VINFO_NITERS (loop_vinfo);
8062 tree cst_nitersm1 = LOOP_VINFO_NITERSM1 (loop_vinfo);
8064 gcc_assert (TREE_CODE (cst_niters) == INTEGER_CST);
8065 gcc_assert (TREE_CODE (cst_nitersm1) == INTEGER_CST);
8066 if (wi::to_widest (cst_nitersm1) < wi::to_widest (cst_niters))
8067 return true;
8070 widest_int max;
8071 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
8072 /* Check the upper bound of loop niters. */
8073 if (get_max_loop_iterations (loop, &max))
8075 tree type = TREE_TYPE (LOOP_VINFO_NITERS (loop_vinfo));
8076 signop sgn = TYPE_SIGN (type);
8077 widest_int type_max = widest_int::from (wi::max_value (type), sgn);
8078 if (max < type_max)
8079 return true;
8081 return false;
8084 /* Return a mask type with half the number of elements as TYPE. */
8086 tree
8087 vect_halve_mask_nunits (tree type)
8089 poly_uint64 nunits = exact_div (TYPE_VECTOR_SUBPARTS (type), 2);
8090 return build_truth_vector_type (nunits, current_vector_size);
8093 /* Return a mask type with twice as many elements as TYPE. */
8095 tree
8096 vect_double_mask_nunits (tree type)
8098 poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (type) * 2;
8099 return build_truth_vector_type (nunits, current_vector_size);
8102 /* Record that a fully-masked version of LOOP_VINFO would need MASKS to
8103 contain a sequence of NVECTORS masks that each control a vector of type
8104 VECTYPE. */
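/* For example, with a vectorization factor of 8 and an rgroup that needs
   two 8-element vectors per copy (NVECTORS == 2), the rgroup controls
   2 * 8 / 8 = 2 scalars per scalar iteration.  */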
8106 void
8107 vect_record_loop_mask (loop_vec_info loop_vinfo, vec_loop_masks *masks,
8108 unsigned int nvectors, tree vectype)
8110 gcc_assert (nvectors != 0);
8111 if (masks->length () < nvectors)
8112 masks->safe_grow_cleared (nvectors);
8113 rgroup_masks *rgm = &(*masks)[nvectors - 1];
8114 /* The number of scalars per iteration and the number of vectors are
8115 both compile-time constants. */
8116 unsigned int nscalars_per_iter
8117 = exact_div (nvectors * TYPE_VECTOR_SUBPARTS (vectype),
8118 LOOP_VINFO_VECT_FACTOR (loop_vinfo)).to_constant ();
8119 if (rgm->max_nscalars_per_iter < nscalars_per_iter)
8121 rgm->max_nscalars_per_iter = nscalars_per_iter;
8122 rgm->mask_type = build_same_sized_truth_vector_type (vectype);
8126 /* Given a complete set of masks MASKS, extract mask number INDEX
8127 for an rgroup that operates on NVECTORS vectors of type VECTYPE,
8128 where 0 <= INDEX < NVECTORS. Insert any set-up statements before GSI.
8130 See the comment above vec_loop_masks for more details about the mask
8131 arrangement. */
8133 tree
8134 vect_get_loop_mask (gimple_stmt_iterator *gsi, vec_loop_masks *masks,
8135 unsigned int nvectors, tree vectype, unsigned int index)
8137 rgroup_masks *rgm = &(*masks)[nvectors - 1];
8138 tree mask_type = rgm->mask_type;
8140 /* Populate the rgroup's mask array, if this is the first time we've
8141 used it. */
8142 if (rgm->masks.is_empty ())
8144 rgm->masks.safe_grow_cleared (nvectors);
8145 for (unsigned int i = 0; i < nvectors; ++i)
8147 tree mask = make_temp_ssa_name (mask_type, NULL, "loop_mask");
8148 /* Provide a dummy definition until the real one is available. */
8149 SSA_NAME_DEF_STMT (mask) = gimple_build_nop ();
8150 rgm->masks[i] = mask;
8154 tree mask = rgm->masks[index];
8155 if (maybe_ne (TYPE_VECTOR_SUBPARTS (mask_type),
8156 TYPE_VECTOR_SUBPARTS (vectype)))
8158 /* A loop mask for data type X can be reused for data type Y
8159 if X has N times more elements than Y and if Y's elements
8160 are N times bigger than X's. In this case each sequence
8161 of N elements in the loop mask will be all-zero or all-one.
8162 We can then view-convert the mask so that each sequence of
8163 N elements is replaced by a single element. */
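/* For example, a mask built for eight 16-bit elements can be reused
   for four 32-bit elements: each pair of mask elements is all-zero or
   all-one and collapses to one element of the wider mask.  */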
8164 gcc_assert (multiple_p (TYPE_VECTOR_SUBPARTS (mask_type),
8165 TYPE_VECTOR_SUBPARTS (vectype)));
8166 gimple_seq seq = NULL;
8167 mask_type = build_same_sized_truth_vector_type (vectype);
8168 mask = gimple_build (&seq, VIEW_CONVERT_EXPR, mask_type, mask);
8169 if (seq)
8170 gsi_insert_seq_before (gsi, seq, GSI_SAME_STMT);
8172 return mask;
8175 /* Scale the profile counters of LOOP, which is vectorized by factor VF,
8176 based on the estimated iteration count of the vectorized loop. */
8178 static void
8179 scale_profile_for_vect_loop (struct loop *loop, unsigned vf)
8181 edge preheader = loop_preheader_edge (loop);
8182 /* Reduce loop iterations by the vectorization factor. */
8183 gcov_type new_est_niter = niter_for_unrolled_loop (loop, vf);
8184 profile_count freq_h = loop->header->count, freq_e = preheader->count ();
8186 if (freq_h.nonzero_p ())
8188 profile_probability p;
8190 /* Avoid dropping loop body profile counter to 0 because of zero count
8191 in loop's preheader. */
8192 if (!(freq_e == profile_count::zero ()))
8193 freq_e = freq_e.force_nonzero ();
8194 p = freq_e.apply_scale (new_est_niter + 1, 1).probability_in (freq_h);
8195 scale_loop_frequencies (loop, p);
8198 edge exit_e = single_exit (loop);
8199 exit_e->probability = profile_probability::always ()
8200 .apply_scale (1, new_est_niter + 1);
8202 edge exit_l = single_pred_edge (loop->latch);
8203 profile_probability prob = exit_l->probability;
8204 exit_l->probability = exit_e->probability.invert ();
8205 if (prob.initialized_p () && exit_l->probability.initialized_p ())
8206 scale_bbs_frequencies (&loop->latch, 1, exit_l->probability / prob);
8209 /* Vectorize STMT_INFO if relevant, inserting any new instructions before GSI.
8210 When vectorizing STMT_INFO as a store, set *SEEN_STORE to its stmt_vec_info.
8211 *SLP_SCHEDULED is a running record of whether we have called
8212 vect_schedule_slp. */
8214 static void
8215 vect_transform_loop_stmt (loop_vec_info loop_vinfo, stmt_vec_info stmt_info,
8216 gimple_stmt_iterator *gsi,
8217 stmt_vec_info *seen_store, bool *slp_scheduled)
8219 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
8220 poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
8222 if (dump_enabled_p ())
8224 dump_printf_loc (MSG_NOTE, vect_location,
8225 "------>vectorizing statement: ");
8226 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, stmt_info->stmt, 0);
8229 if (MAY_HAVE_DEBUG_BIND_STMTS && !STMT_VINFO_LIVE_P (stmt_info))
8230 vect_loop_kill_debug_uses (loop, stmt_info);
8232 if (!STMT_VINFO_RELEVANT_P (stmt_info)
8233 && !STMT_VINFO_LIVE_P (stmt_info))
8234 return;
8236 if (STMT_VINFO_VECTYPE (stmt_info))
8238 poly_uint64 nunits
8239 = TYPE_VECTOR_SUBPARTS (STMT_VINFO_VECTYPE (stmt_info));
8240 if (!STMT_SLP_TYPE (stmt_info)
8241 && maybe_ne (nunits, vf)
8242 && dump_enabled_p ())
8243 /* For SLP, VF is set according to the unrolling factor, and not
8244 to the vector size, hence for SLP this print is not valid. */
8245 dump_printf_loc (MSG_NOTE, vect_location, "multiple-types.\n");
8248 /* SLP. Schedule all the SLP instances when the first SLP stmt is
8249 reached. */
8250 if (slp_vect_type slptype = STMT_SLP_TYPE (stmt_info))
8253 if (!*slp_scheduled)
8255 *slp_scheduled = true;
8257 DUMP_VECT_SCOPE ("scheduling SLP instances");
8259 vect_schedule_slp (loop_vinfo);
8262 /* Hybrid SLP stmts must be vectorized in addition to SLP. */
8263 if (slptype == pure_slp)
8264 return;
8267 if (dump_enabled_p ())
8268 dump_printf_loc (MSG_NOTE, vect_location, "transform statement.\n");
8270 bool grouped_store = false;
8271 if (vect_transform_stmt (stmt_info, gsi, &grouped_store, NULL, NULL))
8272 *seen_store = stmt_info;
8275 /* Function vect_transform_loop.
8277 The analysis phase has determined that the loop is vectorizable.
8278 Vectorize the loop - create vectorized stmts to replace the scalar
8279 stmts in the loop, and update the loop exit condition.
8280 Returns the scalar epilogue loop if any. */
8282 struct loop *
8283 vect_transform_loop (loop_vec_info loop_vinfo)
8285 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
8286 struct loop *epilogue = NULL;
8287 basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
8288 int nbbs = loop->num_nodes;
8289 int i;
8290 tree niters_vector = NULL_TREE;
8291 tree step_vector = NULL_TREE;
8292 tree niters_vector_mult_vf = NULL_TREE;
8293 poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
8294 unsigned int lowest_vf = constant_lower_bound (vf);
8295 bool slp_scheduled = false;
8296 gimple *stmt;
8297 bool check_profitability = false;
8298 unsigned int th;
8300 DUMP_VECT_SCOPE ("vec_transform_loop");
8302 loop_vinfo->shared->check_datarefs ();
8304 /* Use the more conservative vectorization threshold. If the number
8305 of iterations is constant, assume the cost check has been performed
8306 by our caller. If the threshold makes all loops profitable that
8307 run at least the (estimated) vectorization factor number of times,
8308 checking is pointless, too. */
8309 th = LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo);
8310 if (th >= vect_vf_for_cost (loop_vinfo)
8311 && !LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
8313 if (dump_enabled_p ())
8314 dump_printf_loc (MSG_NOTE, vect_location,
8315 "Profitability threshold is %d loop iterations.\n",
8316 th);
8317 check_profitability = true;
8320 /* Make sure there exists a single-predecessor exit bb. Do this before
8321 versioning. */
8322 edge e = single_exit (loop);
8323 if (! single_pred_p (e->dest))
8325 split_loop_exit_edge (e);
8326 if (dump_enabled_p ())
8327 dump_printf (MSG_NOTE, "split exit edge\n");
8330 /* Version the loop first, if required, so the profitability check
8331 comes first. */
8333 if (LOOP_REQUIRES_VERSIONING (loop_vinfo))
8335 poly_uint64 versioning_threshold
8336 = LOOP_VINFO_VERSIONING_THRESHOLD (loop_vinfo);
8337 if (check_profitability
8338 && ordered_p (poly_uint64 (th), versioning_threshold))
8340 versioning_threshold = ordered_max (poly_uint64 (th),
8341 versioning_threshold);
8342 check_profitability = false;
8344 vect_loop_versioning (loop_vinfo, th, check_profitability,
8345 versioning_threshold);
8346 check_profitability = false;
8349 /* Make sure there exists a single-predecessor exit bb also on the
8350 scalar loop copy. Do this after versioning but before peeling,
8351 so the CFG structure is fine for both the scalar and the if-converted
8352 loop, and slpeel_duplicate_current_defs_from_edges sees matched
8353 loop-closed PHI nodes on the exit. */
8354 if (LOOP_VINFO_SCALAR_LOOP (loop_vinfo))
8356 e = single_exit (LOOP_VINFO_SCALAR_LOOP (loop_vinfo));
8357 if (! single_pred_p (e->dest))
8359 split_loop_exit_edge (e);
8360 if (dump_enabled_p ())
8361 dump_printf (MSG_NOTE, "split exit edge of scalar loop\n");
8365 tree niters = vect_build_loop_niters (loop_vinfo);
8366 LOOP_VINFO_NITERS_UNCHANGED (loop_vinfo) = niters;
8367 tree nitersm1 = unshare_expr (LOOP_VINFO_NITERSM1 (loop_vinfo));
8368 bool niters_no_overflow = loop_niters_no_overflow (loop_vinfo);
8369 epilogue = vect_do_peeling (loop_vinfo, niters, nitersm1, &niters_vector,
8370 &step_vector, &niters_vector_mult_vf, th,
8371 check_profitability, niters_no_overflow);
8373 if (niters_vector == NULL_TREE)
8375 if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
8376 && !LOOP_VINFO_FULLY_MASKED_P (loop_vinfo)
8377 && known_eq (lowest_vf, vf))
8379 niters_vector
8380 = build_int_cst (TREE_TYPE (LOOP_VINFO_NITERS (loop_vinfo)),
8381 LOOP_VINFO_INT_NITERS (loop_vinfo) / lowest_vf);
8382 step_vector = build_one_cst (TREE_TYPE (niters));
8384 else
8385 vect_gen_vector_loop_niters (loop_vinfo, niters, &niters_vector,
8386 &step_vector, niters_no_overflow);
8389 /* 1) Make sure the loop header has exactly two entries
8390 2) Make sure we have a preheader basic block. */
8392 gcc_assert (EDGE_COUNT (loop->header->preds) == 2);
8394 split_edge (loop_preheader_edge (loop));
8396 if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo)
8397 && vect_use_loop_mask_for_alignment_p (loop_vinfo))
8398 /* This will deal with any possible peeling. */
8399 vect_prepare_for_masked_peels (loop_vinfo);
8401 /* FORNOW: the vectorizer supports only loops whose body consists
8402 of one basic block (header + empty latch). When the vectorizer
8403 supports more involved loop forms, the order in which the BBs are
8404 traversed needs to be reconsidered. */
8406 for (i = 0; i < nbbs; i++)
8408 basic_block bb = bbs[i];
8409 stmt_vec_info stmt_info;
8411 for (gphi_iterator si = gsi_start_phis (bb); !gsi_end_p (si);
8412 gsi_next (&si))
8414 gphi *phi = si.phi ();
8415 if (dump_enabled_p ())
8417 dump_printf_loc (MSG_NOTE, vect_location,
8418 "------>vectorizing phi: ");
8419 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, phi, 0);
8421 stmt_info = loop_vinfo->lookup_stmt (phi);
8422 if (!stmt_info)
8423 continue;
8425 if (MAY_HAVE_DEBUG_BIND_STMTS && !STMT_VINFO_LIVE_P (stmt_info))
8426 vect_loop_kill_debug_uses (loop, stmt_info);
8428 if (!STMT_VINFO_RELEVANT_P (stmt_info)
8429 && !STMT_VINFO_LIVE_P (stmt_info))
8430 continue;
8432 if (STMT_VINFO_VECTYPE (stmt_info)
8433 && (maybe_ne
8434 (TYPE_VECTOR_SUBPARTS (STMT_VINFO_VECTYPE (stmt_info)), vf))
8435 && dump_enabled_p ())
8436 dump_printf_loc (MSG_NOTE, vect_location, "multiple-types.\n");
8438 if ((STMT_VINFO_DEF_TYPE (stmt_info) == vect_induction_def
8439 || STMT_VINFO_DEF_TYPE (stmt_info) == vect_reduction_def
8440 || STMT_VINFO_DEF_TYPE (stmt_info) == vect_nested_cycle)
8441 && ! PURE_SLP_STMT (stmt_info))
8443 if (dump_enabled_p ())
8444 dump_printf_loc (MSG_NOTE, vect_location, "transform phi.\n");
8445 vect_transform_stmt (stmt_info, NULL, NULL, NULL, NULL);
8449 for (gimple_stmt_iterator si = gsi_start_bb (bb);
8450 !gsi_end_p (si);)
8452 stmt = gsi_stmt (si);
8453 /* During vectorization remove existing clobber stmts. */
8454 if (gimple_clobber_p (stmt))
8456 unlink_stmt_vdef (stmt);
8457 gsi_remove (&si, true);
8458 release_defs (stmt);
8460 else
8462 stmt_info = loop_vinfo->lookup_stmt (stmt);
8464 /* vector stmts created in the outer-loop during vectorization of
8465 stmts in an inner-loop may not have a stmt_info, and do not
8466 need to be vectorized. */
8467 stmt_vec_info seen_store = NULL;
8468 if (stmt_info)
8470 if (STMT_VINFO_IN_PATTERN_P (stmt_info))
8472 gimple *def_seq = STMT_VINFO_PATTERN_DEF_SEQ (stmt_info);
8473 for (gimple_stmt_iterator subsi = gsi_start (def_seq);
8474 !gsi_end_p (subsi); gsi_next (&subsi))
8476 stmt_vec_info pat_stmt_info
8477 = loop_vinfo->lookup_stmt (gsi_stmt (subsi));
8478 vect_transform_loop_stmt (loop_vinfo, pat_stmt_info,
8479 &si, &seen_store,
8480 &slp_scheduled);
8482 stmt_vec_info pat_stmt_info
8483 = STMT_VINFO_RELATED_STMT (stmt_info);
8484 vect_transform_loop_stmt (loop_vinfo, pat_stmt_info, &si,
8485 &seen_store, &slp_scheduled);
8487 vect_transform_loop_stmt (loop_vinfo, stmt_info, &si,
8488 &seen_store, &slp_scheduled);
8490 if (seen_store)
8492 if (STMT_VINFO_GROUPED_ACCESS (seen_store))
8494 /* Interleaving. The vectorization of the
8495 interleaving chain was completed - free
8496 all the stores in the chain. */
8497 gsi_next (&si);
8498 vect_remove_stores (DR_GROUP_FIRST_ELEMENT (seen_store));
8500 else
8502 /* Free the attached stmt_vec_info and remove the
8503 stmt. */
8504 free_stmt_vec_info (stmt);
8505 unlink_stmt_vdef (stmt);
8506 gsi_remove (&si, true);
8507 release_defs (stmt);
8510 else
8511 gsi_next (&si);
8515 /* Stub out scalar statements that must not survive vectorization.
8516 Doing this here helps with grouped statements, or statements that
8517 are involved in patterns. */
8518 for (gimple_stmt_iterator gsi = gsi_start_bb (bb);
8519 !gsi_end_p (gsi); gsi_next (&gsi))
8521 gcall *call = dyn_cast <gcall *> (gsi_stmt (gsi));
8522 if (call && gimple_call_internal_p (call, IFN_MASK_LOAD))
8524 tree lhs = gimple_get_lhs (call);
8525 if (!VECTOR_TYPE_P (TREE_TYPE (lhs)))
8527 tree zero = build_zero_cst (TREE_TYPE (lhs));
8528 gimple *new_stmt = gimple_build_assign (lhs, zero);
8529 gsi_replace (&gsi, new_stmt, true);
8533 } /* BBs in loop */
8535 /* The vectorization factor is always > 1, so if we use an IV increment of 1,
8536 a zero NITERS becomes a nonzero NITERS_VECTOR. */
8537 if (integer_onep (step_vector))
8538 niters_no_overflow = true;
8539 vect_set_loop_condition (loop, loop_vinfo, niters_vector, step_vector,
8540 niters_vector_mult_vf, !niters_no_overflow);
8542 unsigned int assumed_vf = vect_vf_for_cost (loop_vinfo);
8543 scale_profile_for_vect_loop (loop, assumed_vf);
8545 /* True if the final iteration might not handle a full vector's
8546 worth of scalar iterations. */
8547 bool final_iter_may_be_partial = LOOP_VINFO_FULLY_MASKED_P (loop_vinfo);
8548 /* The minimum number of iterations performed by the epilogue. This
8549 is 1 when peeling for gaps because we always need a final scalar
8550 iteration. */
8551 int min_epilogue_iters = LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo) ? 1 : 0;
8552 /* +1 to convert latch counts to loop iteration counts,
8553 -min_epilogue_iters to remove iterations that cannot be performed
8554 by the vector code. */
8555 int bias_for_lowest = 1 - min_epilogue_iters;
8556 int bias_for_assumed = bias_for_lowest;
8557 int alignment_npeels = LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo);
8558 if (alignment_npeels && LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
8560 /* When the amount of peeling is known at compile time, the first
8561 iteration will have exactly alignment_npeels active elements.
8562 In the worst case it will have at least one. */
8563 int min_first_active = (alignment_npeels > 0 ? alignment_npeels : 1);
8564 bias_for_lowest += lowest_vf - min_first_active;
8565 bias_for_assumed += assumed_vf - min_first_active;
8567 /* In these calculations the "- 1" converts loop iteration counts
8568 back to latch counts. */
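/* For example, with lowest_vf == 4, bias_for_lowest == 1 and a scalar
   latch-count bound of 10 (i.e. 11 iterations), the non-partial case
   gives floor ((10 + 1) / 4) - 1 = 1, i.e. at most two vector
   iterations.  */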
8569 if (loop->any_upper_bound)
8570 loop->nb_iterations_upper_bound
8571 = (final_iter_may_be_partial
8572 ? wi::udiv_ceil (loop->nb_iterations_upper_bound + bias_for_lowest,
8573 lowest_vf) - 1
8574 : wi::udiv_floor (loop->nb_iterations_upper_bound + bias_for_lowest,
8575 lowest_vf) - 1);
8576 if (loop->any_likely_upper_bound)
8577 loop->nb_iterations_likely_upper_bound
8578 = (final_iter_may_be_partial
8579 ? wi::udiv_ceil (loop->nb_iterations_likely_upper_bound
8580 + bias_for_lowest, lowest_vf) - 1
8581 : wi::udiv_floor (loop->nb_iterations_likely_upper_bound
8582 + bias_for_lowest, lowest_vf) - 1);
8583 if (loop->any_estimate)
8584 loop->nb_iterations_estimate
8585 = (final_iter_may_be_partial
8586 ? wi::udiv_ceil (loop->nb_iterations_estimate + bias_for_assumed,
8587 assumed_vf) - 1
8588 : wi::udiv_floor (loop->nb_iterations_estimate + bias_for_assumed,
8589 assumed_vf) - 1);
8591 if (dump_enabled_p ())
8593 if (!LOOP_VINFO_EPILOGUE_P (loop_vinfo))
8595 dump_printf_loc (MSG_NOTE, vect_location,
8596 "LOOP VECTORIZED\n");
8597 if (loop->inner)
8598 dump_printf_loc (MSG_NOTE, vect_location,
8599 "OUTER LOOP VECTORIZED\n");
8600 dump_printf (MSG_NOTE, "\n");
8602 else
8604 dump_printf_loc (MSG_NOTE, vect_location,
8605 "LOOP EPILOGUE VECTORIZED (VS=");
8606 dump_dec (MSG_NOTE, current_vector_size);
8607 dump_printf (MSG_NOTE, ")\n");
8611 /* Free SLP instances here because otherwise stmt reference counting
8612 won't work. */
8613 slp_instance instance;
8614 FOR_EACH_VEC_ELT (LOOP_VINFO_SLP_INSTANCES (loop_vinfo), i, instance)
8615 vect_free_slp_instance (instance, true);
8616 LOOP_VINFO_SLP_INSTANCES (loop_vinfo).release ();
8617 /* Clear the safelen field since its value is invalid after vectorization:
8618 the vectorized loop can have loop-carried dependencies. */
8619 loop->safelen = 0;
8621 /* Don't vectorize epilogue for epilogue. */
8622 if (LOOP_VINFO_EPILOGUE_P (loop_vinfo))
8623 epilogue = NULL;
8625 if (!PARAM_VALUE (PARAM_VECT_EPILOGUES_NOMASK))
8626 epilogue = NULL;
8628 if (epilogue)
8630 auto_vector_sizes vector_sizes;
8631 targetm.vectorize.autovectorize_vector_sizes (&vector_sizes);
8632 unsigned int next_size = 0;
8634 if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
8635 && LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) >= 0
8636 && known_eq (vf, lowest_vf))
8638 unsigned int eiters
8639 = (LOOP_VINFO_INT_NITERS (loop_vinfo)
8640 - LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo));
8641 eiters = eiters % lowest_vf;
8642 epilogue->nb_iterations_upper_bound = eiters - 1;
8644 unsigned int ratio;
8645 while (next_size < vector_sizes.length ()
8646 && !(constant_multiple_p (current_vector_size,
8647 vector_sizes[next_size], &ratio)
8648 && eiters >= lowest_vf / ratio))
8649 next_size += 1;
8651 else
8652 while (next_size < vector_sizes.length ()
8653 && maybe_lt (current_vector_size, vector_sizes[next_size]))
8654 next_size += 1;
8656 if (next_size == vector_sizes.length ())
8657 epilogue = NULL;
8660 if (epilogue)
8662 epilogue->force_vectorize = loop->force_vectorize;
8663 epilogue->safelen = loop->safelen;
8664 epilogue->dont_vectorize = false;
8666 /* We may need to if-convert epilogue to vectorize it. */
8667 if (LOOP_VINFO_SCALAR_LOOP (loop_vinfo))
8668 tree_if_conversion (epilogue);
8671 return epilogue;
8674 /* The code below tries to perform a simple optimization - revert
8675 if-conversion for masked stores: if the mask of a store is zero, do not
8676 perform it, and where possible skip the stored-value producers as well.
8677 For example,
8678 for (i=0; i<n; i++)
8679 if (c[i])
8681 p1[i] += 1;
8682 p2[i] = p3[i] +2;
8684 this transformation will produce the following semi-hammock:
8686 if (!mask__ifc__42.18_165 == { 0, 0, 0, 0, 0, 0, 0, 0 })
8688 vect__11.19_170 = MASK_LOAD (vectp_p1.20_168, 0B, mask__ifc__42.18_165);
8689 vect__12.22_172 = vect__11.19_170 + vect_cst__171;
8690 MASK_STORE (vectp_p1.23_175, 0B, mask__ifc__42.18_165, vect__12.22_172);
8691 vect__18.25_182 = MASK_LOAD (vectp_p3.26_180, 0B, mask__ifc__42.18_165);
8692 vect__19.28_184 = vect__18.25_182 + vect_cst__183;
8693 MASK_STORE (vectp_p2.29_187, 0B, mask__ifc__42.18_165, vect__19.28_184);
8697 void
8698 optimize_mask_stores (struct loop *loop)
8700 basic_block *bbs = get_loop_body (loop);
8701 unsigned nbbs = loop->num_nodes;
8702 unsigned i;
8703 basic_block bb;
8704 struct loop *bb_loop;
8705 gimple_stmt_iterator gsi;
8706 gimple *stmt;
8707 auto_vec<gimple *> worklist;
8709 vect_location = find_loop_location (loop);
8710 /* Pick up all masked stores in loop if any. */
8711 for (i = 0; i < nbbs; i++)
8713 bb = bbs[i];
8714 for (gsi = gsi_start_bb (bb); !gsi_end_p (gsi);
8715 gsi_next (&gsi))
8717 stmt = gsi_stmt (gsi);
8718 if (gimple_call_internal_p (stmt, IFN_MASK_STORE))
8719 worklist.safe_push (stmt);
8723 free (bbs);
8724 if (worklist.is_empty ())
8725 return;
8727 /* Loop has masked stores. */
8728 while (!worklist.is_empty ())
8730 gimple *last, *last_store;
8731 edge e, efalse;
8732 tree mask;
8733 basic_block store_bb, join_bb;
8734 gimple_stmt_iterator gsi_to;
8735 tree vdef, new_vdef;
8736 gphi *phi;
8737 tree vectype;
8738 tree zero;
8740 last = worklist.pop ();
8741 mask = gimple_call_arg (last, 2);
8742 bb = gimple_bb (last);
8743 /* Create then_bb and the if-then structure in the CFG; then_bb belongs to
8744 the same loop as if_bb. It could be different from LOOP when a
8745 two-level loop nest is vectorized and the mask_store belongs to the
8746 inner one. */
8747 e = split_block (bb, last);
8748 bb_loop = bb->loop_father;
8749 gcc_assert (loop == bb_loop || flow_loop_nested_p (loop, bb_loop));
8750 join_bb = e->dest;
8751 store_bb = create_empty_bb (bb);
8752 add_bb_to_loop (store_bb, bb_loop);
8753 e->flags = EDGE_TRUE_VALUE;
8754 efalse = make_edge (bb, store_bb, EDGE_FALSE_VALUE);
8755 /* Put STORE_BB on the false edge (taken when the mask is not all-zero). */
8756 efalse->probability = profile_probability::unlikely ();
8757 store_bb->count = efalse->count ();
8758 make_single_succ_edge (store_bb, join_bb, EDGE_FALLTHRU);
8759 if (dom_info_available_p (CDI_DOMINATORS))
8760 set_immediate_dominator (CDI_DOMINATORS, store_bb, bb);
8761 if (dump_enabled_p ())
8762 dump_printf_loc (MSG_NOTE, vect_location,
8763 "Create new block %d to sink mask stores.",
8764 store_bb->index);
8765 /* Create vector comparison with boolean result. */
8766 vectype = TREE_TYPE (mask);
8767 zero = build_zero_cst (vectype);
8768 stmt = gimple_build_cond (EQ_EXPR, mask, zero, NULL_TREE, NULL_TREE);
8769 gsi = gsi_last_bb (bb);
8770 gsi_insert_after (&gsi, stmt, GSI_SAME_STMT);
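/* The guard inserted at the end of BB looks like this (illustrative, SSA
   names made up):  if (mask_1 == { 0, ... })  with the TRUE edge bypassing
   STORE_BB.  */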
8771 /* Create a new PHI node for the vdef of the last masked store:
8772 .MEM_2 = VDEF <.MEM_1>
8773 will be converted to
8774 .MEM_3 = VDEF <.MEM_1>
8775 and a new PHI node will be created in the join bb:
8776 .MEM_2 = PHI <.MEM_1, .MEM_3>
8777 */
8778 vdef = gimple_vdef (last);
8779 new_vdef = make_ssa_name (gimple_vop (cfun), last);
8780 gimple_set_vdef (last, new_vdef);
8781 phi = create_phi_node (vdef, join_bb);
8782 add_phi_arg (phi, new_vdef, EDGE_SUCC (store_bb, 0), UNKNOWN_LOCATION);
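/* The PHI argument for the direct BB -> JOIN_BB edge (taken when the mask
   is all zeros) is added only after all stores for this mask have been
   sunk; see the add_phi_arg call at the bottom of the enclosing loop.  */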
8784 /* Put all masked stores with the same mask to STORE_BB if possible. */
8785 while (true)
8786 {
8787 gimple_stmt_iterator gsi_from;
8788 gimple *stmt1 = NULL;
8790 /* Move masked store to STORE_BB. */
8791 last_store = last;
8792 gsi = gsi_for_stmt (last);
8793 gsi_from = gsi;
8794 /* Shift GSI to the previous stmt for further traversal. */
8795 gsi_prev (&gsi);
8796 gsi_to = gsi_start_bb (store_bb);
8797 gsi_move_before (&gsi_from, &gsi_to);
8798 /* Set GSI_TO to the start of the now non-empty block. */
8799 gsi_to = gsi_start_bb (store_bb);
8800 if (dump_enabled_p ())
8801 {
8802 dump_printf_loc (MSG_NOTE, vect_location,
8803 "Move stmt to created bb\n");
8804 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, last, 0);
8805 }
8806 /* Move all stored value producers if possible. */
8807 while (!gsi_end_p (gsi))
8808 {
8809 tree lhs;
8810 imm_use_iterator imm_iter;
8811 use_operand_p use_p;
8812 bool res;
8814 /* Skip debug statements. */
8815 if (is_gimple_debug (gsi_stmt (gsi)))
8816 {
8817 gsi_prev (&gsi);
8818 continue;
8819 }
8820 stmt1 = gsi_stmt (gsi);
8821 /* Do not consider statements that write to memory or have
8822 a volatile operand. */
8823 if (gimple_vdef (stmt1)
8824 || gimple_has_volatile_ops (stmt1))
8825 break;
8826 gsi_from = gsi;
8827 gsi_prev (&gsi);
8828 lhs = gimple_get_lhs (stmt1);
8829 if (!lhs)
8830 break;
8832 /* LHS of vectorized stmt must be SSA_NAME. */
8833 if (TREE_CODE (lhs) != SSA_NAME)
8834 break;
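/* A producer that does not compute a vector value is never sunk into
   STORE_BB: if it is dead it is deleted on the spot, otherwise the
   backward scan stops here.  */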
8836 if (!VECTOR_TYPE_P (TREE_TYPE (lhs)))
8837 {
8838 /* Remove dead scalar statement. */
8839 if (has_zero_uses (lhs))
8840 {
8841 gsi_remove (&gsi_from, true);
8842 continue;
8843 }
8844 break;
8845 }
8846 /* Check that LHS does not have uses outside of STORE_BB. */
8847 res = true;
8848 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, lhs)
8849 {
8850 gimple *use_stmt;
8851 use_stmt = USE_STMT (use_p);
8852 if (is_gimple_debug (use_stmt))
8853 continue;
8854 if (gimple_bb (use_stmt) != store_bb)
8855 {
8856 res = false;
8857 break;
8858 }
8859 }
8860 if (!res)
8861 break;
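/* Only sink a producer whose VUSE matches that of the store being sunk;
   a different VUSE would indicate an intervening memory access between
   the producer and the store.  */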
8863 if (gimple_vuse (stmt1)
8864 && gimple_vuse (stmt1) != gimple_vuse (last_store))
8865 break;
8867 /* Can move STMT1 to STORE_BB. */
8868 if (dump_enabled_p ())
8869 {
8870 dump_printf_loc (MSG_NOTE, vect_location,
8871 "Move stmt to created bb\n");
8872 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, stmt1, 0);
8873 }
8874 gsi_move_before (&gsi_from, &gsi_to);
8875 /* Shift GSI_TO for further insertion. */
8876 gsi_prev (&gsi_to);
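/* Each statement is inserted just above the one moved before it, so
   although BB is scanned bottom-up, the moved statements keep their
   original order in STORE_BB.  */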
8877 }
8878 /* Put other masked stores with the same mask to STORE_BB. */
8879 if (worklist.is_empty ()
8880 || gimple_call_arg (worklist.last (), 2) != mask
8881 || worklist.last () != stmt1)
8882 break;
8883 last = worklist.pop ();
8884 }
8885 add_phi_arg (phi, gimple_vuse (last_store), e, UNKNOWN_LOCATION);