gcc/tree-vect-loop.c
1 /* Loop Vectorization
2 Copyright (C) 2003-2018 Free Software Foundation, Inc.
3 Contributed by Dorit Naishlos <dorit@il.ibm.com> and
4 Ira Rosen <irar@il.ibm.com>
6 This file is part of GCC.
8 GCC is free software; you can redistribute it and/or modify it under
9 the terms of the GNU General Public License as published by the Free
10 Software Foundation; either version 3, or (at your option) any later
11 version.
13 GCC is distributed in the hope that it will be useful, but WITHOUT ANY
14 WARRANTY; without even the implied warranty of MERCHANTABILITY or
15 FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
16 for more details.
18 You should have received a copy of the GNU General Public License
19 along with GCC; see the file COPYING3. If not see
20 <http://www.gnu.org/licenses/>. */
22 #include "config.h"
23 #include "system.h"
24 #include "coretypes.h"
25 #include "backend.h"
26 #include "target.h"
27 #include "rtl.h"
28 #include "tree.h"
29 #include "gimple.h"
30 #include "cfghooks.h"
31 #include "tree-pass.h"
32 #include "ssa.h"
33 #include "optabs-tree.h"
34 #include "diagnostic-core.h"
35 #include "fold-const.h"
36 #include "stor-layout.h"
37 #include "cfganal.h"
38 #include "gimplify.h"
39 #include "gimple-iterator.h"
40 #include "gimplify-me.h"
41 #include "tree-ssa-loop-ivopts.h"
42 #include "tree-ssa-loop-manip.h"
43 #include "tree-ssa-loop-niter.h"
44 #include "tree-ssa-loop.h"
45 #include "cfgloop.h"
46 #include "params.h"
47 #include "tree-scalar-evolution.h"
48 #include "tree-vectorizer.h"
49 #include "gimple-fold.h"
50 #include "cgraph.h"
51 #include "tree-cfg.h"
52 #include "tree-if-conv.h"
53 #include "internal-fn.h"
54 #include "tree-vector-builder.h"
55 #include "vec-perm-indices.h"
56 #include "tree-eh.h"
58 /* Loop Vectorization Pass.
60 This pass tries to vectorize loops.
62 For example, the vectorizer transforms the following simple loop:
64 short a[N]; short b[N]; short c[N]; int i;
66 for (i=0; i<N; i++){
67 a[i] = b[i] + c[i];
68 }
70 as if it were manually vectorized by rewriting the source code into:
72 typedef int __attribute__((mode(V8HI))) v8hi;
73 short a[N]; short b[N]; short c[N]; int i;
74 v8hi *pa = (v8hi*)a, *pb = (v8hi*)b, *pc = (v8hi*)c;
75 v8hi va, vb, vc;
77 for (i=0; i<N/8; i++){
78 vb = pb[i];
79 vc = pc[i];
80 va = vb + vc;
81 pa[i] = va;
82 }
84 The main entry to this pass is vectorize_loops(), in which
85 the vectorizer applies a set of analyses on a given set of loops,
86 followed by the actual vectorization transformation for the loops that
87 had successfully passed the analysis phase.
88 Throughout this pass we make a distinction between two types of
89 data: scalars (which are represented by SSA_NAMES), and memory references
90 ("data-refs"). These two types of data require different handling both
91 during analysis and transformation. The types of data-refs that the
92 vectorizer currently supports are ARRAY_REFS whose base is an array DECL
93 (not a pointer), and INDIRECT_REFS through pointers; both array and pointer
94 accesses are required to have a simple (consecutive) access pattern.
96 Analysis phase:
97 ===============
98 The driver for the analysis phase is vect_analyze_loop().
99 It applies a set of analyses, some of which rely on the scalar evolution
100 analyzer (scev) developed by Sebastian Pop.
102 During the analysis phase the vectorizer records some information
103 per stmt in a "stmt_vec_info" struct which is attached to each stmt in the
104 loop, as well as general information about the loop as a whole, which is
105 recorded in a "loop_vec_info" struct attached to each loop.
107 Transformation phase:
108 =====================
109 The loop transformation phase scans all the stmts in the loop, and
110 creates a vector stmt (or a sequence of stmts) for each scalar stmt S in
111 the loop that needs to be vectorized. It inserts the vector code sequence
112 just before the scalar stmt S, and records a pointer to the vector code
113 in STMT_VINFO_VEC_STMT (stmt_info) (stmt_info is the stmt_vec_info struct
114 attached to S). This pointer will be used for the vectorization of following
115 stmts which use the def of stmt S. Stmt S is removed if it writes to memory;
116 otherwise, we rely on dead code elimination for removing it.
118 For example, say stmt S1 was vectorized into stmt VS1:
120 VS1: vb = px[i];
121 S1: b = x[i]; STMT_VINFO_VEC_STMT (stmt_info (S1)) = VS1
122 S2: a = b;
124 To vectorize stmt S2, the vectorizer first finds the stmt that defines
125 the operand 'b' (S1), and gets the relevant vector def 'vb' from the
126 vector stmt VS1 pointed to by STMT_VINFO_VEC_STMT (stmt_info (S1)). The
127 resulting sequence would be:
129 VS1: vb = px[i];
130 S1: b = x[i]; STMT_VINFO_VEC_STMT (stmt_info (S1)) = VS1
131 VS2: va = vb;
132 S2: a = b; STMT_VINFO_VEC_STMT (stmt_info (S2)) = VS2
134 Operands that are not SSA_NAMEs are data-refs that appear in
135 load/store operations (like 'x[i]' in S1), and are handled differently.
137 Target modeling:
138 =================
139 Currently the only target specific information that is used is the
140 size of the vector (in bytes) - "TARGET_VECTORIZE_UNITS_PER_SIMD_WORD".
141 Targets that can support different sizes of vectors will, for now, need
142 to specify one value for "TARGET_VECTORIZE_UNITS_PER_SIMD_WORD". More
143 flexibility will be added in the future.
145 Since we only vectorize operations whose vector form can be
146 expressed using existing tree codes, to verify that an operation is
147 supported, the vectorizer checks the relevant optab at the relevant
148 machine_mode (e.g., optab_handler (add_optab, V8HImode)). If
149 the value found is CODE_FOR_nothing, then there's no target support, and
150 we can't vectorize the stmt.
152 For additional information on this project see:
153 http://gcc.gnu.org/projects/tree-ssa/vectorization.html
154 */
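/* Editorial sketch, not part of the original file: the optab query described
   in the comment above, in isolation.  It assumes a target that defines
   V8HImode; the function name is made up for illustration.  */
static bool
example_target_supports_v8hi_add_p (void)
{
  /* CODE_FOR_nothing means the target has no instruction for adding two
     V8HImode vectors, so a statement needing that operation cannot be
     vectorized.  */
  return optab_handler (add_optab, V8HImode) != CODE_FOR_nothing;
}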
156 static void vect_estimate_min_profitable_iters (loop_vec_info, int *, int *);
158 /* Subroutine of vect_determine_vf_for_stmt that handles only one
159 statement. VECTYPE_MAYBE_SET_P is true if STMT_VINFO_VECTYPE
160 may already be set for general statements (not just data refs). */
162 static bool
163 vect_determine_vf_for_stmt_1 (stmt_vec_info stmt_info,
164 bool vectype_maybe_set_p,
165 poly_uint64 *vf,
166 vec<stmt_vec_info > *mask_producers)
168 gimple *stmt = stmt_info->stmt;
170 if ((!STMT_VINFO_RELEVANT_P (stmt_info)
171 && !STMT_VINFO_LIVE_P (stmt_info))
172 || gimple_clobber_p (stmt))
174 if (dump_enabled_p ())
175 dump_printf_loc (MSG_NOTE, vect_location, "skip.\n");
176 return true;
179 tree stmt_vectype, nunits_vectype;
180 if (!vect_get_vector_types_for_stmt (stmt_info, &stmt_vectype,
181 &nunits_vectype))
182 return false;
184 if (stmt_vectype)
186 if (STMT_VINFO_VECTYPE (stmt_info))
187 /* The only case when a vectype had been already set is for stmts
188 that contain a data ref, or for "pattern-stmts" (stmts generated
189 by the vectorizer to represent/replace a certain idiom). */
190 gcc_assert ((STMT_VINFO_DATA_REF (stmt_info)
191 || vectype_maybe_set_p)
192 && STMT_VINFO_VECTYPE (stmt_info) == stmt_vectype);
193 else if (stmt_vectype == boolean_type_node)
194 mask_producers->safe_push (stmt_info);
195 else
196 STMT_VINFO_VECTYPE (stmt_info) = stmt_vectype;
199 if (nunits_vectype)
200 vect_update_max_nunits (vf, nunits_vectype);
202 return true;
205 /* Subroutine of vect_determine_vectorization_factor. Set the vector
206 types of STMT_INFO and all attached pattern statements and update
207 the vectorization factor VF accordingly. If some of the statements
208 produce a mask result whose vector type can only be calculated later,
209 add them to MASK_PRODUCERS. Return true on success or false if
210 something prevented vectorization. */
212 static bool
213 vect_determine_vf_for_stmt (stmt_vec_info stmt_info, poly_uint64 *vf,
214 vec<stmt_vec_info > *mask_producers)
216 vec_info *vinfo = stmt_info->vinfo;
217 if (dump_enabled_p ())
219 dump_printf_loc (MSG_NOTE, vect_location, "==> examining statement: ");
220 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, stmt_info->stmt, 0);
222 if (!vect_determine_vf_for_stmt_1 (stmt_info, false, vf, mask_producers))
223 return false;
225 if (STMT_VINFO_IN_PATTERN_P (stmt_info)
226 && STMT_VINFO_RELATED_STMT (stmt_info))
228 gimple *pattern_def_seq = STMT_VINFO_PATTERN_DEF_SEQ (stmt_info);
229 stmt_info = STMT_VINFO_RELATED_STMT (stmt_info);
231 /* If a pattern statement has def stmts, analyze them too. */
232 for (gimple_stmt_iterator si = gsi_start (pattern_def_seq);
233 !gsi_end_p (si); gsi_next (&si))
235 stmt_vec_info def_stmt_info = vinfo->lookup_stmt (gsi_stmt (si));
236 if (dump_enabled_p ())
238 dump_printf_loc (MSG_NOTE, vect_location,
239 "==> examining pattern def stmt: ");
240 dump_gimple_stmt (MSG_NOTE, TDF_SLIM,
241 def_stmt_info->stmt, 0);
243 if (!vect_determine_vf_for_stmt_1 (def_stmt_info, true,
244 vf, mask_producers))
245 return false;
248 if (dump_enabled_p ())
250 dump_printf_loc (MSG_NOTE, vect_location,
251 "==> examining pattern statement: ");
252 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, stmt_info->stmt, 0);
254 if (!vect_determine_vf_for_stmt_1 (stmt_info, true, vf, mask_producers))
255 return false;
258 return true;
261 /* Function vect_determine_vectorization_factor
263 Determine the vectorization factor (VF). VF is the number of data elements
264 that are operated upon in parallel in a single iteration of the vectorized
265 loop. For example, when vectorizing a loop that operates on 4-byte elements,
266 on a target with a vector size (VS) of 16 bytes, the VF is set to 4, since 4
267 elements can fit in a single vector register.
269 We currently support vectorization of loops in which all types operated upon
270 are of the same size. Therefore this function currently sets VF according to
271 the size of the types operated upon, and fails if there are multiple sizes
272 in the loop.
274 VF is also the factor by which the loop iterations are strip-mined, e.g.:
275 original loop:
276 for (i=0; i<N; i++){
277 a[i] = b[i] + c[i];
278 }
280 vectorized loop:
281 for (i=0; i<N; i+=VF){
282 a[i:VF] = b[i:VF] + c[i:VF];
283 }
284 */
286 static bool
287 vect_determine_vectorization_factor (loop_vec_info loop_vinfo)
289 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
290 basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
291 unsigned nbbs = loop->num_nodes;
292 poly_uint64 vectorization_factor = 1;
293 tree scalar_type = NULL_TREE;
294 gphi *phi;
295 tree vectype;
296 stmt_vec_info stmt_info;
297 unsigned i;
298 auto_vec<stmt_vec_info> mask_producers;
300 DUMP_VECT_SCOPE ("vect_determine_vectorization_factor");
302 for (i = 0; i < nbbs; i++)
304 basic_block bb = bbs[i];
306 for (gphi_iterator si = gsi_start_phis (bb); !gsi_end_p (si);
307 gsi_next (&si))
309 phi = si.phi ();
310 stmt_info = loop_vinfo->lookup_stmt (phi);
311 if (dump_enabled_p ())
313 dump_printf_loc (MSG_NOTE, vect_location, "==> examining phi: ");
314 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, phi, 0);
317 gcc_assert (stmt_info);
319 if (STMT_VINFO_RELEVANT_P (stmt_info)
320 || STMT_VINFO_LIVE_P (stmt_info))
322 gcc_assert (!STMT_VINFO_VECTYPE (stmt_info));
323 scalar_type = TREE_TYPE (PHI_RESULT (phi));
325 if (dump_enabled_p ())
327 dump_printf_loc (MSG_NOTE, vect_location,
328 "get vectype for scalar type: ");
329 dump_generic_expr (MSG_NOTE, TDF_SLIM, scalar_type);
330 dump_printf (MSG_NOTE, "\n");
333 vectype = get_vectype_for_scalar_type (scalar_type);
334 if (!vectype)
336 if (dump_enabled_p ())
338 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
339 "not vectorized: unsupported "
340 "data-type ");
341 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
342 scalar_type);
343 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
345 return false;
347 STMT_VINFO_VECTYPE (stmt_info) = vectype;
349 if (dump_enabled_p ())
351 dump_printf_loc (MSG_NOTE, vect_location, "vectype: ");
352 dump_generic_expr (MSG_NOTE, TDF_SLIM, vectype);
353 dump_printf (MSG_NOTE, "\n");
356 if (dump_enabled_p ())
358 dump_printf_loc (MSG_NOTE, vect_location, "nunits = ");
359 dump_dec (MSG_NOTE, TYPE_VECTOR_SUBPARTS (vectype));
360 dump_printf (MSG_NOTE, "\n");
363 vect_update_max_nunits (&vectorization_factor, vectype);
367 for (gimple_stmt_iterator si = gsi_start_bb (bb); !gsi_end_p (si);
368 gsi_next (&si))
370 stmt_info = loop_vinfo->lookup_stmt (gsi_stmt (si));
371 if (!vect_determine_vf_for_stmt (stmt_info, &vectorization_factor,
372 &mask_producers))
373 return false;
377 /* TODO: Analyze cost. Decide if worth while to vectorize. */
378 if (dump_enabled_p ())
380 dump_printf_loc (MSG_NOTE, vect_location, "vectorization factor = ");
381 dump_dec (MSG_NOTE, vectorization_factor);
382 dump_printf (MSG_NOTE, "\n");
385 if (known_le (vectorization_factor, 1U))
387 if (dump_enabled_p ())
388 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
389 "not vectorized: unsupported data-type\n");
390 return false;
392 LOOP_VINFO_VECT_FACTOR (loop_vinfo) = vectorization_factor;
394 for (i = 0; i < mask_producers.length (); i++)
396 stmt_info = mask_producers[i];
397 tree mask_type = vect_get_mask_type_for_stmt (stmt_info);
398 if (!mask_type)
399 return false;
400 STMT_VINFO_VECTYPE (stmt_info) = mask_type;
403 return true;
404 }
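/* Editorial sketch, not part of the original file: how one statement's vector
   type feeds the VF computation above.  On a target with 16-byte vectors an
   "int" statement gets vectype V4SI, TYPE_VECTOR_SUBPARTS is 4, and the VF
   becomes at least 4.  The function name is made up for illustration.  */
static void
example_update_vf_for_int_stmt (poly_uint64 *vf)
{
  tree vectype = get_vectype_for_scalar_type (integer_type_node);
  if (vectype)
    vect_update_max_nunits (vf, vectype);
}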
407 /* Function vect_is_simple_iv_evolution.
409 FORNOW: A simple evolution of an induction variable in the loop is
410 considered a polynomial evolution. */
412 static bool
413 vect_is_simple_iv_evolution (unsigned loop_nb, tree access_fn, tree * init,
414 tree * step)
416 tree init_expr;
417 tree step_expr;
418 tree evolution_part = evolution_part_in_loop_num (access_fn, loop_nb);
419 basic_block bb;
421 /* When there is no evolution in this loop, the evolution function
422 is not "simple". */
423 if (evolution_part == NULL_TREE)
424 return false;
426 /* When the evolution is a polynomial of degree >= 2
427 the evolution function is not "simple". */
428 if (tree_is_chrec (evolution_part))
429 return false;
431 step_expr = evolution_part;
432 init_expr = unshare_expr (initial_condition_in_loop_num (access_fn, loop_nb));
434 if (dump_enabled_p ())
436 dump_printf_loc (MSG_NOTE, vect_location, "step: ");
437 dump_generic_expr (MSG_NOTE, TDF_SLIM, step_expr);
438 dump_printf (MSG_NOTE, ", init: ");
439 dump_generic_expr (MSG_NOTE, TDF_SLIM, init_expr);
440 dump_printf (MSG_NOTE, "\n");
443 *init = init_expr;
444 *step = step_expr;
446 if (TREE_CODE (step_expr) != INTEGER_CST
447 && (TREE_CODE (step_expr) != SSA_NAME
448 || ((bb = gimple_bb (SSA_NAME_DEF_STMT (step_expr)))
449 && flow_bb_inside_loop_p (get_loop (cfun, loop_nb), bb))
450 || (!INTEGRAL_TYPE_P (TREE_TYPE (step_expr))
451 && (!SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr))
452 || !flag_associative_math)))
453 && (TREE_CODE (step_expr) != REAL_CST
454 || !flag_associative_math))
456 if (dump_enabled_p ())
457 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
458 "step unknown.\n");
459 return false;
462 return true;
463 }
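/* Editorial sketch, not part of the original file: the scev view of a
   "simple" IV as used above.  For "i = i + 4" the access function is the
   chrec {i_0, +, 4}_loop, whose evolution part is the INTEGER_CST 4 and
   whose initial condition is i_0.  The function name is made up for
   illustration.  */
static bool
example_iv_has_constant_step_p (struct loop *loop, tree iv_def)
{
  tree access_fn = analyze_scalar_evolution (loop, iv_def);
  tree step = evolution_part_in_loop_num (access_fn, loop->num);
  return step != NULL_TREE && TREE_CODE (step) == INTEGER_CST;
}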
465 /* Function vect_analyze_scalar_cycles_1.
467 Examine the cross iteration def-use cycles of scalar variables
468 in LOOP. LOOP_VINFO represents the loop that is now being
469 considered for vectorization (can be LOOP, or an outer-loop
470 enclosing LOOP). */
472 static void
473 vect_analyze_scalar_cycles_1 (loop_vec_info loop_vinfo, struct loop *loop)
475 basic_block bb = loop->header;
476 tree init, step;
477 auto_vec<stmt_vec_info, 64> worklist;
478 gphi_iterator gsi;
479 bool double_reduc;
481 DUMP_VECT_SCOPE ("vect_analyze_scalar_cycles");
483 /* First - identify all inductions. Reduction detection assumes that all the
484 inductions have been identified, therefore, this order must not be
485 changed. */
486 for (gsi = gsi_start_phis (bb); !gsi_end_p (gsi); gsi_next (&gsi))
488 gphi *phi = gsi.phi ();
489 tree access_fn = NULL;
490 tree def = PHI_RESULT (phi);
491 stmt_vec_info stmt_vinfo = loop_vinfo->lookup_stmt (phi);
493 if (dump_enabled_p ())
495 dump_printf_loc (MSG_NOTE, vect_location, "Analyze phi: ");
496 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, phi, 0);
499 /* Skip virtual phi's. The data dependences that are associated with
500 virtual defs/uses (i.e., memory accesses) are analyzed elsewhere. */
501 if (virtual_operand_p (def))
502 continue;
504 STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_unknown_def_type;
506 /* Analyze the evolution function. */
507 access_fn = analyze_scalar_evolution (loop, def);
508 if (access_fn)
510 STRIP_NOPS (access_fn);
511 if (dump_enabled_p ())
513 dump_printf_loc (MSG_NOTE, vect_location,
514 "Access function of PHI: ");
515 dump_generic_expr (MSG_NOTE, TDF_SLIM, access_fn);
516 dump_printf (MSG_NOTE, "\n");
518 STMT_VINFO_LOOP_PHI_EVOLUTION_BASE_UNCHANGED (stmt_vinfo)
519 = initial_condition_in_loop_num (access_fn, loop->num);
520 STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_vinfo)
521 = evolution_part_in_loop_num (access_fn, loop->num);
524 if (!access_fn
525 || !vect_is_simple_iv_evolution (loop->num, access_fn, &init, &step)
526 || (LOOP_VINFO_LOOP (loop_vinfo) != loop
527 && TREE_CODE (step) != INTEGER_CST))
529 worklist.safe_push (stmt_vinfo);
530 continue;
533 gcc_assert (STMT_VINFO_LOOP_PHI_EVOLUTION_BASE_UNCHANGED (stmt_vinfo)
534 != NULL_TREE);
535 gcc_assert (STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_vinfo) != NULL_TREE);
537 if (dump_enabled_p ())
538 dump_printf_loc (MSG_NOTE, vect_location, "Detected induction.\n");
539 STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_induction_def;
543 /* Second - identify all reductions and nested cycles. */
544 while (worklist.length () > 0)
546 stmt_vec_info stmt_vinfo = worklist.pop ();
547 gphi *phi = as_a <gphi *> (stmt_vinfo->stmt);
548 tree def = PHI_RESULT (phi);
550 if (dump_enabled_p ())
552 dump_printf_loc (MSG_NOTE, vect_location, "Analyze phi: ");
553 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, phi, 0);
556 gcc_assert (!virtual_operand_p (def)
557 && STMT_VINFO_DEF_TYPE (stmt_vinfo) == vect_unknown_def_type);
559 stmt_vec_info reduc_stmt_info
560 = vect_force_simple_reduction (loop_vinfo, stmt_vinfo,
561 &double_reduc, false);
562 if (reduc_stmt_info)
564 if (double_reduc)
566 if (dump_enabled_p ())
567 dump_printf_loc (MSG_NOTE, vect_location,
568 "Detected double reduction.\n");
570 STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_double_reduction_def;
571 STMT_VINFO_DEF_TYPE (reduc_stmt_info)
572 = vect_double_reduction_def;
574 else
576 if (loop != LOOP_VINFO_LOOP (loop_vinfo))
578 if (dump_enabled_p ())
579 dump_printf_loc (MSG_NOTE, vect_location,
580 "Detected vectorizable nested cycle.\n");
582 STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_nested_cycle;
583 STMT_VINFO_DEF_TYPE (reduc_stmt_info) = vect_nested_cycle;
585 else
587 if (dump_enabled_p ())
588 dump_printf_loc (MSG_NOTE, vect_location,
589 "Detected reduction.\n");
591 STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_reduction_def;
592 STMT_VINFO_DEF_TYPE (reduc_stmt_info) = vect_reduction_def;
593 /* Store the reduction cycles for possible vectorization in
594 loop-aware SLP if it was not detected as reduction
595 chain. */
596 if (! REDUC_GROUP_FIRST_ELEMENT (reduc_stmt_info))
597 LOOP_VINFO_REDUCTIONS (loop_vinfo).safe_push
598 (reduc_stmt_info);
602 else
603 if (dump_enabled_p ())
604 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
605 "Unknown def-use cycle pattern.\n");
610 /* Function vect_analyze_scalar_cycles.
612 Examine the cross iteration def-use cycles of scalar variables, by
613 analyzing the loop-header PHIs of scalar variables. Classify each
614 cycle as one of the following: invariant, induction, reduction, unknown.
615 We do that for the loop represented by LOOP_VINFO, and also for its
616 inner-loop, if it exists.
617 Examples for scalar cycles:
619 Example1: reduction:
621 loop1:
622 for (i=0; i<N; i++)
623 sum += a[i];
625 Example2: induction:
627 loop2:
628 for (i=0; i<N; i++)
629 a[i] = i; */
631 static void
632 vect_analyze_scalar_cycles (loop_vec_info loop_vinfo)
634 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
636 vect_analyze_scalar_cycles_1 (loop_vinfo, loop);
638 /* When vectorizing an outer-loop, the inner-loop is executed sequentially.
639 Reductions in such inner-loop therefore have different properties than
640 the reductions in the nest that gets vectorized:
641 1. When vectorized, they are executed in the same order as in the original
642 scalar loop, so we can't change the order of computation when
643 vectorizing them.
644 2. FIXME: Inner-loop reductions can be used in the inner-loop, so the
645 current checks are too strict. */
647 if (loop->inner)
648 vect_analyze_scalar_cycles_1 (loop_vinfo, loop->inner);
651 /* Transfer group and reduction information from STMT_INFO to its
652 pattern stmt. */
654 static void
655 vect_fixup_reduc_chain (stmt_vec_info stmt_info)
657 stmt_vec_info firstp = STMT_VINFO_RELATED_STMT (stmt_info);
658 stmt_vec_info stmtp;
659 gcc_assert (!REDUC_GROUP_FIRST_ELEMENT (firstp)
660 && REDUC_GROUP_FIRST_ELEMENT (stmt_info));
661 REDUC_GROUP_SIZE (firstp) = REDUC_GROUP_SIZE (stmt_info);
664 stmtp = STMT_VINFO_RELATED_STMT (stmt_info);
665 REDUC_GROUP_FIRST_ELEMENT (stmtp) = firstp;
666 stmt_info = REDUC_GROUP_NEXT_ELEMENT (stmt_info);
667 if (stmt_info)
668 REDUC_GROUP_NEXT_ELEMENT (stmtp)
669 = STMT_VINFO_RELATED_STMT (stmt_info);
671 while (stmt_info);
672 STMT_VINFO_DEF_TYPE (stmtp) = vect_reduction_def;
675 /* Fixup scalar cycles that now have their stmts detected as patterns. */
677 static void
678 vect_fixup_scalar_cycles_with_patterns (loop_vec_info loop_vinfo)
680 stmt_vec_info first;
681 unsigned i;
683 FOR_EACH_VEC_ELT (LOOP_VINFO_REDUCTION_CHAINS (loop_vinfo), i, first)
684 if (STMT_VINFO_IN_PATTERN_P (first))
686 stmt_vec_info next = REDUC_GROUP_NEXT_ELEMENT (first);
687 while (next)
689 if (! STMT_VINFO_IN_PATTERN_P (next))
690 break;
691 next = REDUC_GROUP_NEXT_ELEMENT (next);
693 /* If not all stmts in the chain are patterns, try to handle
694 the chain without patterns. */
695 if (! next)
697 vect_fixup_reduc_chain (first);
698 LOOP_VINFO_REDUCTION_CHAINS (loop_vinfo)[i]
699 = STMT_VINFO_RELATED_STMT (first);
704 /* Function vect_get_loop_niters.
706 Determine how many times the loop is executed and place it
707 in NUMBER_OF_ITERATIONS. Place the number of latch iterations
708 in NUMBER_OF_ITERATIONSM1. Place the condition under which the
709 niter information holds in ASSUMPTIONS.
711 Return the loop exit condition. */
714 static gcond *
715 vect_get_loop_niters (struct loop *loop, tree *assumptions,
716 tree *number_of_iterations, tree *number_of_iterationsm1)
718 edge exit = single_exit (loop);
719 struct tree_niter_desc niter_desc;
720 tree niter_assumptions, niter, may_be_zero;
721 gcond *cond = get_loop_exit_condition (loop);
723 *assumptions = boolean_true_node;
724 *number_of_iterationsm1 = chrec_dont_know;
725 *number_of_iterations = chrec_dont_know;
726 DUMP_VECT_SCOPE ("get_loop_niters");
728 if (!exit)
729 return cond;
731 niter = chrec_dont_know;
732 may_be_zero = NULL_TREE;
733 niter_assumptions = boolean_true_node;
734 if (!number_of_iterations_exit_assumptions (loop, exit, &niter_desc, NULL)
735 || chrec_contains_undetermined (niter_desc.niter))
736 return cond;
738 niter_assumptions = niter_desc.assumptions;
739 may_be_zero = niter_desc.may_be_zero;
740 niter = niter_desc.niter;
742 if (may_be_zero && integer_zerop (may_be_zero))
743 may_be_zero = NULL_TREE;
745 if (may_be_zero)
747 if (COMPARISON_CLASS_P (may_be_zero))
749 /* Try to combine may_be_zero with assumptions, this can simplify
750 computation of niter expression. */
751 if (niter_assumptions && !integer_nonzerop (niter_assumptions))
752 niter_assumptions = fold_build2 (TRUTH_AND_EXPR, boolean_type_node,
753 niter_assumptions,
754 fold_build1 (TRUTH_NOT_EXPR,
755 boolean_type_node,
756 may_be_zero));
757 else
758 niter = fold_build3 (COND_EXPR, TREE_TYPE (niter), may_be_zero,
759 build_int_cst (TREE_TYPE (niter), 0),
760 rewrite_to_non_trapping_overflow (niter));
762 may_be_zero = NULL_TREE;
764 else if (integer_nonzerop (may_be_zero))
766 *number_of_iterationsm1 = build_int_cst (TREE_TYPE (niter), 0);
767 *number_of_iterations = build_int_cst (TREE_TYPE (niter), 1);
768 return cond;
770 else
771 return cond;
774 *assumptions = niter_assumptions;
775 *number_of_iterationsm1 = niter;
777 /* We want the number of loop header executions which is the number
778 of latch executions plus one.
779 ??? For UINT_MAX latch executions this number overflows to zero
780 for loops like do { n++; } while (n != 0); */
781 if (niter && !chrec_contains_undetermined (niter))
782 niter = fold_build2 (PLUS_EXPR, TREE_TYPE (niter), unshare_expr (niter),
783 build_int_cst (TREE_TYPE (niter), 1));
784 *number_of_iterations = niter;
786 return cond;
787 }
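/* Editorial sketch, not part of the original file: a typical consumer of the
   routine above.  For "for (i = 0; i < n; i++)" with n > 0 the latch runs
   n - 1 times, so NUMBER_OF_ITERATIONSM1 is n - 1 and NUMBER_OF_ITERATIONS
   is n.  The function name is made up for illustration.  */
static bool
example_loop_niters_computable_p (struct loop *loop)
{
  tree assumptions, niters, nitersm1;
  gcond *cond = vect_get_loop_niters (loop, &assumptions, &niters, &nitersm1);
  return (cond != NULL
          && niters != chrec_dont_know
          && !chrec_contains_undetermined (niters));
}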
789 /* Function bb_in_loop_p
791 Used as predicate for dfs order traversal of the loop bbs. */
793 static bool
794 bb_in_loop_p (const_basic_block bb, const void *data)
796 const struct loop *const loop = (const struct loop *)data;
797 if (flow_bb_inside_loop_p (loop, bb))
798 return true;
799 return false;
803 /* Create and initialize a new loop_vec_info struct for LOOP_IN, as well as
804 stmt_vec_info structs for all the stmts in LOOP_IN. */
806 _loop_vec_info::_loop_vec_info (struct loop *loop_in, vec_info_shared *shared)
807 : vec_info (vec_info::loop, init_cost (loop_in), shared),
808 loop (loop_in),
809 bbs (XCNEWVEC (basic_block, loop->num_nodes)),
810 num_itersm1 (NULL_TREE),
811 num_iters (NULL_TREE),
812 num_iters_unchanged (NULL_TREE),
813 num_iters_assumptions (NULL_TREE),
814 th (0),
815 versioning_threshold (0),
816 vectorization_factor (0),
817 max_vectorization_factor (0),
818 mask_skip_niters (NULL_TREE),
819 mask_compare_type (NULL_TREE),
820 unaligned_dr (NULL),
821 peeling_for_alignment (0),
822 ptr_mask (0),
823 ivexpr_map (NULL),
824 slp_unrolling_factor (1),
825 single_scalar_iteration_cost (0),
826 vectorizable (false),
827 can_fully_mask_p (true),
828 fully_masked_p (false),
829 peeling_for_gaps (false),
830 peeling_for_niter (false),
831 operands_swapped (false),
832 no_data_dependencies (false),
833 has_mask_store (false),
834 scalar_loop (NULL),
835 orig_loop_info (NULL)
837 /* Create/Update stmt_info for all stmts in the loop. */
838 basic_block *body = get_loop_body (loop);
839 for (unsigned int i = 0; i < loop->num_nodes; i++)
841 basic_block bb = body[i];
842 gimple_stmt_iterator si;
844 for (si = gsi_start_phis (bb); !gsi_end_p (si); gsi_next (&si))
846 gimple *phi = gsi_stmt (si);
847 gimple_set_uid (phi, 0);
848 add_stmt (phi);
851 for (si = gsi_start_bb (bb); !gsi_end_p (si); gsi_next (&si))
853 gimple *stmt = gsi_stmt (si);
854 gimple_set_uid (stmt, 0);
855 add_stmt (stmt);
858 free (body);
860 /* CHECKME: We want to visit all BBs before their successors (except for
861 latch blocks, for which this assertion wouldn't hold). In the simple
862 case of the loop forms we allow, a dfs order of the BBs would be the same
863 as reversed postorder traversal, so we are safe. */
865 unsigned int nbbs = dfs_enumerate_from (loop->header, 0, bb_in_loop_p,
866 bbs, loop->num_nodes, loop);
867 gcc_assert (nbbs == loop->num_nodes);
870 /* Free all levels of MASKS. */
872 void
873 release_vec_loop_masks (vec_loop_masks *masks)
875 rgroup_masks *rgm;
876 unsigned int i;
877 FOR_EACH_VEC_ELT (*masks, i, rgm)
878 rgm->masks.release ();
879 masks->release ();
882 /* Free all memory used by the _loop_vec_info, as well as all the
883 stmt_vec_info structs of all the stmts in the loop. */
885 _loop_vec_info::~_loop_vec_info ()
887 int nbbs;
888 gimple_stmt_iterator si;
889 int j;
891 nbbs = loop->num_nodes;
892 for (j = 0; j < nbbs; j++)
894 basic_block bb = bbs[j];
895 for (si = gsi_start_bb (bb); !gsi_end_p (si); )
897 gimple *stmt = gsi_stmt (si);
899 /* We may have broken canonical form by moving a constant
900 into RHS1 of a commutative op. Fix such occurrences. */
901 if (operands_swapped && is_gimple_assign (stmt))
903 enum tree_code code = gimple_assign_rhs_code (stmt);
905 if ((code == PLUS_EXPR
906 || code == POINTER_PLUS_EXPR
907 || code == MULT_EXPR)
908 && CONSTANT_CLASS_P (gimple_assign_rhs1 (stmt)))
909 swap_ssa_operands (stmt,
910 gimple_assign_rhs1_ptr (stmt),
911 gimple_assign_rhs2_ptr (stmt));
912 else if (code == COND_EXPR
913 && CONSTANT_CLASS_P (gimple_assign_rhs2 (stmt)))
915 tree cond_expr = gimple_assign_rhs1 (stmt);
916 enum tree_code cond_code = TREE_CODE (cond_expr);
918 if (TREE_CODE_CLASS (cond_code) == tcc_comparison)
920 bool honor_nans = HONOR_NANS (TREE_OPERAND (cond_expr,
921 0));
922 cond_code = invert_tree_comparison (cond_code,
923 honor_nans);
924 if (cond_code != ERROR_MARK)
926 TREE_SET_CODE (cond_expr, cond_code);
927 swap_ssa_operands (stmt,
928 gimple_assign_rhs2_ptr (stmt),
929 gimple_assign_rhs3_ptr (stmt));
934 gsi_next (&si);
938 free (bbs);
940 release_vec_loop_masks (&masks);
941 delete ivexpr_map;
943 loop->aux = NULL;
946 /* Return an invariant or register for EXPR and emit necessary
947 computations in the LOOP_VINFO loop preheader. */
949 tree
950 cse_and_gimplify_to_preheader (loop_vec_info loop_vinfo, tree expr)
952 if (is_gimple_reg (expr)
953 || is_gimple_min_invariant (expr))
954 return expr;
956 if (! loop_vinfo->ivexpr_map)
957 loop_vinfo->ivexpr_map = new hash_map<tree_operand_hash, tree>;
958 tree &cached = loop_vinfo->ivexpr_map->get_or_insert (expr);
959 if (! cached)
961 gimple_seq stmts = NULL;
962 cached = force_gimple_operand (unshare_expr (expr),
963 &stmts, true, NULL_TREE);
964 if (stmts)
966 edge e = loop_preheader_edge (LOOP_VINFO_LOOP (loop_vinfo));
967 gsi_insert_seq_on_edge_immediate (e, stmts);
970 return cached;
971 }
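/* Editorial sketch, not part of the original file: a typical use of the
   routine above.  Any statements needed to compute the expression are
   emitted once on the preheader edge, and later requests for the same
   expression reuse the cached SSA name.  The function name is made up for
   illustration.  */
static tree
example_niters_on_preheader (loop_vec_info loop_vinfo)
{
  return cse_and_gimplify_to_preheader (loop_vinfo,
                                        LOOP_VINFO_NITERS (loop_vinfo));
}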
973 /* Return true if we can use CMP_TYPE as the comparison type to produce
974 all masks required to mask LOOP_VINFO. */
976 static bool
977 can_produce_all_loop_masks_p (loop_vec_info loop_vinfo, tree cmp_type)
979 rgroup_masks *rgm;
980 unsigned int i;
981 FOR_EACH_VEC_ELT (LOOP_VINFO_MASKS (loop_vinfo), i, rgm)
982 if (rgm->mask_type != NULL_TREE
983 && !direct_internal_fn_supported_p (IFN_WHILE_ULT,
984 cmp_type, rgm->mask_type,
985 OPTIMIZE_FOR_SPEED))
986 return false;
987 return true;
990 /* Calculate the maximum number of scalars per iteration over all
991 rgroups in LOOP_VINFO. */
993 static unsigned int
994 vect_get_max_nscalars_per_iter (loop_vec_info loop_vinfo)
996 unsigned int res = 1;
997 unsigned int i;
998 rgroup_masks *rgm;
999 FOR_EACH_VEC_ELT (LOOP_VINFO_MASKS (loop_vinfo), i, rgm)
1000 res = MAX (res, rgm->max_nscalars_per_iter);
1001 return res;
1004 /* Each statement in LOOP_VINFO can be masked where necessary. Check
1005 whether we can actually generate the masks required. Return true if so,
1006 storing the type of the scalar IV in LOOP_VINFO_MASK_COMPARE_TYPE. */
1008 static bool
1009 vect_verify_full_masking (loop_vec_info loop_vinfo)
1011 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1012 unsigned int min_ni_width;
1014 /* Use a normal loop if there are no statements that need masking.
1015 This only happens in rare degenerate cases: it means that the loop
1016 has no loads, no stores, and no live-out values. */
1017 if (LOOP_VINFO_MASKS (loop_vinfo).is_empty ())
1018 return false;
1020 /* Get the maximum number of iterations that is representable
1021 in the counter type. */
1022 tree ni_type = TREE_TYPE (LOOP_VINFO_NITERSM1 (loop_vinfo));
1023 widest_int max_ni = wi::to_widest (TYPE_MAX_VALUE (ni_type)) + 1;
1025 /* Get a more refined estimate for the number of iterations. */
1026 widest_int max_back_edges;
1027 if (max_loop_iterations (loop, &max_back_edges))
1028 max_ni = wi::smin (max_ni, max_back_edges + 1);
1030 /* Account for rgroup masks, in which each bit is replicated N times. */
1031 max_ni *= vect_get_max_nscalars_per_iter (loop_vinfo);
1033 /* Work out how many bits we need to represent the limit. */
1034 min_ni_width = wi::min_precision (max_ni, UNSIGNED);
1036 /* Find a scalar mode for which WHILE_ULT is supported. */
1037 opt_scalar_int_mode cmp_mode_iter;
1038 tree cmp_type = NULL_TREE;
1039 FOR_EACH_MODE_IN_CLASS (cmp_mode_iter, MODE_INT)
1041 unsigned int cmp_bits = GET_MODE_BITSIZE (cmp_mode_iter.require ());
1042 if (cmp_bits >= min_ni_width
1043 && targetm.scalar_mode_supported_p (cmp_mode_iter.require ()))
1045 tree this_type = build_nonstandard_integer_type (cmp_bits, true);
1046 if (this_type
1047 && can_produce_all_loop_masks_p (loop_vinfo, this_type))
1049 /* Although we could stop as soon as we find a valid mode,
1050 it's often better to continue until we hit Pmode, since the
1051 operands to the WHILE are more likely to be reusable in
1052 address calculations. */
1053 cmp_type = this_type;
1054 if (cmp_bits >= GET_MODE_BITSIZE (Pmode))
1055 break;
1060 if (!cmp_type)
1061 return false;
1063 LOOP_VINFO_MASK_COMPARE_TYPE (loop_vinfo) = cmp_type;
1064 return true;
1065 }
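/* Editorial sketch, not part of the original file: the width computation
   used above, in isolation.  For example, 999 back edges and at most 2
   scalars per iteration in the largest rgroup give max_ni = 2000, and
   wi::min_precision (2000, UNSIGNED) is 11, so any WHILE_ULT-capable
   integer mode of at least 11 bits (preferring up to Pmode) can serve as
   the comparison type.  The function name is made up for illustration.  */
static unsigned int
example_min_ni_width (const widest_int &max_back_edges,
                      unsigned int max_nscalars_per_iter)
{
  widest_int max_ni = (max_back_edges + 1) * max_nscalars_per_iter;
  return wi::min_precision (max_ni, UNSIGNED);
}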
1067 /* Calculate the cost of one scalar iteration of the loop. */
1068 static void
1069 vect_compute_single_scalar_iteration_cost (loop_vec_info loop_vinfo)
1071 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1072 basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
1073 int nbbs = loop->num_nodes, factor;
1074 int innerloop_iters, i;
1076 /* Gather costs for statements in the scalar loop. */
1078 /* FORNOW. */
1079 innerloop_iters = 1;
1080 if (loop->inner)
1081 innerloop_iters = 50; /* FIXME */
1083 for (i = 0; i < nbbs; i++)
1085 gimple_stmt_iterator si;
1086 basic_block bb = bbs[i];
1088 if (bb->loop_father == loop->inner)
1089 factor = innerloop_iters;
1090 else
1091 factor = 1;
1093 for (si = gsi_start_bb (bb); !gsi_end_p (si); gsi_next (&si))
1095 gimple *stmt = gsi_stmt (si);
1096 stmt_vec_info stmt_info = loop_vinfo->lookup_stmt (stmt);
1098 if (!is_gimple_assign (stmt) && !is_gimple_call (stmt))
1099 continue;
1101 /* Skip stmts that are not vectorized inside the loop. */
1102 if (stmt_info
1103 && !STMT_VINFO_RELEVANT_P (stmt_info)
1104 && (!STMT_VINFO_LIVE_P (stmt_info)
1105 || !VECTORIZABLE_CYCLE_DEF (STMT_VINFO_DEF_TYPE (stmt_info)))
1106 && !STMT_VINFO_IN_PATTERN_P (stmt_info))
1107 continue;
1109 vect_cost_for_stmt kind;
1110 if (STMT_VINFO_DATA_REF (stmt_info))
1112 if (DR_IS_READ (STMT_VINFO_DATA_REF (stmt_info)))
1113 kind = scalar_load;
1114 else
1115 kind = scalar_store;
1117 else
1118 kind = scalar_stmt;
1120 record_stmt_cost (&LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo),
1121 factor, kind, stmt_info, 0, vect_prologue);
1125 /* Now accumulate cost. */
1126 void *target_cost_data = init_cost (loop);
1127 stmt_info_for_cost *si;
1128 int j;
1129 FOR_EACH_VEC_ELT (LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo),
1130 j, si)
1131 (void) add_stmt_cost (target_cost_data, si->count,
1132 si->kind, si->stmt_info, si->misalign,
1133 vect_body);
1134 unsigned dummy, body_cost = 0;
1135 finish_cost (target_cost_data, &dummy, &body_cost, &dummy);
1136 destroy_cost_data (target_cost_data);
1137 LOOP_VINFO_SINGLE_SCALAR_ITERATION_COST (loop_vinfo) = body_cost;
1138 }
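/* Editorial sketch, not part of the original file: how a single scalar
   statement is accounted for by the routine above.  A load in the inner
   loop of a nest is recorded FACTOR times (50 with the FIXME value above)
   with kind scalar_load, just as record_stmt_cost is used above.  The
   function name is made up for illustration.  */
static void
example_record_scalar_load (loop_vec_info loop_vinfo, stmt_vec_info stmt_info)
{
  record_stmt_cost (&LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo),
                    50, scalar_load, stmt_info, 0, vect_prologue);
}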
1141 /* Function vect_analyze_loop_form_1.
1143 Verify that certain CFG restrictions hold, including:
1144 - the loop has a pre-header
1145 - the loop has a single entry and exit
1146 - the loop exit condition is simple enough
1147 - the number of iterations can be analyzed, i.e, a countable loop. The
1148 niter could be analyzed under some assumptions. */
1150 bool
1151 vect_analyze_loop_form_1 (struct loop *loop, gcond **loop_cond,
1152 tree *assumptions, tree *number_of_iterationsm1,
1153 tree *number_of_iterations, gcond **inner_loop_cond)
1155 DUMP_VECT_SCOPE ("vect_analyze_loop_form");
1157 /* Different restrictions apply when we are considering an inner-most loop,
1158 vs. an outer (nested) loop.
1159 (FORNOW. May want to relax some of these restrictions in the future). */
1161 if (!loop->inner)
1163 /* Inner-most loop. We currently require that the number of BBs is
1164 exactly 2 (the header and latch). Vectorizable inner-most loops
1165 look like this:
1167 (pre-header)
1169 header <--------+
1170 | | |
1171 | +--> latch --+
1173 (exit-bb) */
1175 if (loop->num_nodes != 2)
1177 if (dump_enabled_p ())
1178 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1179 "not vectorized: control flow in loop.\n");
1180 return false;
1183 if (empty_block_p (loop->header))
1185 if (dump_enabled_p ())
1186 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1187 "not vectorized: empty loop.\n");
1188 return false;
1191 else
1193 struct loop *innerloop = loop->inner;
1194 edge entryedge;
1196 /* Nested loop. We currently require that the loop is doubly-nested,
1197 contains a single inner loop, and the number of BBs is exactly 5.
1198 Vectorizable outer-loops look like this:
1200 (pre-header)
1202 header <---+
1204 inner-loop |
1206 tail ------+
1208 (exit-bb)
1210 The inner-loop has the properties expected of inner-most loops
1211 as described above. */
1213 if ((loop->inner)->inner || (loop->inner)->next)
1215 if (dump_enabled_p ())
1216 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1217 "not vectorized: multiple nested loops.\n");
1218 return false;
1221 if (loop->num_nodes != 5)
1223 if (dump_enabled_p ())
1224 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1225 "not vectorized: control flow in loop.\n");
1226 return false;
1229 entryedge = loop_preheader_edge (innerloop);
1230 if (entryedge->src != loop->header
1231 || !single_exit (innerloop)
1232 || single_exit (innerloop)->dest != EDGE_PRED (loop->latch, 0)->src)
1234 if (dump_enabled_p ())
1235 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1236 "not vectorized: unsupported outerloop form.\n");
1237 return false;
1240 /* Analyze the inner-loop. */
1241 tree inner_niterm1, inner_niter, inner_assumptions;
1242 if (! vect_analyze_loop_form_1 (loop->inner, inner_loop_cond,
1243 &inner_assumptions, &inner_niterm1,
1244 &inner_niter, NULL)
1245 /* Don't support analyzing niter under assumptions for inner
1246 loop. */
1247 || !integer_onep (inner_assumptions))
1249 if (dump_enabled_p ())
1250 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1251 "not vectorized: Bad inner loop.\n");
1252 return false;
1255 if (!expr_invariant_in_loop_p (loop, inner_niter))
1257 if (dump_enabled_p ())
1258 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1259 "not vectorized: inner-loop count not"
1260 " invariant.\n");
1261 return false;
1264 if (dump_enabled_p ())
1265 dump_printf_loc (MSG_NOTE, vect_location,
1266 "Considering outer-loop vectorization.\n");
1269 if (!single_exit (loop)
1270 || EDGE_COUNT (loop->header->preds) != 2)
1272 if (dump_enabled_p ())
1274 if (!single_exit (loop))
1275 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1276 "not vectorized: multiple exits.\n");
1277 else if (EDGE_COUNT (loop->header->preds) != 2)
1278 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1279 "not vectorized: too many incoming edges.\n");
1281 return false;
1284 /* We assume that the loop exit condition is at the end of the loop, i.e.,
1285 that the loop is represented as a do-while (with a proper if-guard
1286 before the loop if needed), where the loop header contains all the
1287 executable statements, and the latch is empty. */
1288 if (!empty_block_p (loop->latch)
1289 || !gimple_seq_empty_p (phi_nodes (loop->latch)))
1291 if (dump_enabled_p ())
1292 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1293 "not vectorized: latch block not empty.\n");
1294 return false;
1297 /* Make sure the exit is not abnormal. */
1298 edge e = single_exit (loop);
1299 if (e->flags & EDGE_ABNORMAL)
1301 if (dump_enabled_p ())
1302 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1303 "not vectorized: abnormal loop exit edge.\n");
1304 return false;
1307 *loop_cond = vect_get_loop_niters (loop, assumptions, number_of_iterations,
1308 number_of_iterationsm1);
1309 if (!*loop_cond)
1311 if (dump_enabled_p ())
1312 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1313 "not vectorized: complicated exit condition.\n");
1314 return false;
1317 if (integer_zerop (*assumptions)
1318 || !*number_of_iterations
1319 || chrec_contains_undetermined (*number_of_iterations))
1321 if (dump_enabled_p ())
1322 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1323 "not vectorized: number of iterations cannot be "
1324 "computed.\n");
1325 return false;
1328 if (integer_zerop (*number_of_iterations))
1330 if (dump_enabled_p ())
1331 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1332 "not vectorized: number of iterations = 0.\n");
1333 return false;
1336 return true;
1339 /* Analyze LOOP form and return a loop_vec_info if it is of suitable form. */
1341 loop_vec_info
1342 vect_analyze_loop_form (struct loop *loop, vec_info_shared *shared)
1344 tree assumptions, number_of_iterations, number_of_iterationsm1;
1345 gcond *loop_cond, *inner_loop_cond = NULL;
1347 if (! vect_analyze_loop_form_1 (loop, &loop_cond,
1348 &assumptions, &number_of_iterationsm1,
1349 &number_of_iterations, &inner_loop_cond))
1350 return NULL;
1352 loop_vec_info loop_vinfo = new _loop_vec_info (loop, shared);
1353 LOOP_VINFO_NITERSM1 (loop_vinfo) = number_of_iterationsm1;
1354 LOOP_VINFO_NITERS (loop_vinfo) = number_of_iterations;
1355 LOOP_VINFO_NITERS_UNCHANGED (loop_vinfo) = number_of_iterations;
1356 if (!integer_onep (assumptions))
1358 /* We consider vectorizing this loop by versioning it under
1359 some assumptions. In order to do this, we need to clear
1360 existing information computed by scev and niter analyzer. */
1361 scev_reset_htab ();
1362 free_numbers_of_iterations_estimates (loop);
1363 /* Also set flag for this loop so that following scev and niter
1364 analysis are done under the assumptions. */
1365 loop_constraint_set (loop, LOOP_C_FINITE);
1366 /* Also record the assumptions for versioning. */
1367 LOOP_VINFO_NITERS_ASSUMPTIONS (loop_vinfo) = assumptions;
1370 if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
1372 if (dump_enabled_p ())
1374 dump_printf_loc (MSG_NOTE, vect_location,
1375 "Symbolic number of iterations is ");
1376 dump_generic_expr (MSG_NOTE, TDF_DETAILS, number_of_iterations);
1377 dump_printf (MSG_NOTE, "\n");
1381 stmt_vec_info loop_cond_info = loop_vinfo->lookup_stmt (loop_cond);
1382 STMT_VINFO_TYPE (loop_cond_info) = loop_exit_ctrl_vec_info_type;
1383 if (inner_loop_cond)
1385 stmt_vec_info inner_loop_cond_info
1386 = loop_vinfo->lookup_stmt (inner_loop_cond);
1387 STMT_VINFO_TYPE (inner_loop_cond_info) = loop_exit_ctrl_vec_info_type;
1390 gcc_assert (!loop->aux);
1391 loop->aux = loop_vinfo;
1392 return loop_vinfo;
1397 /* Scan the loop stmts and, depending on whether there are any (non-)SLP
1398 statements, update the vectorization factor. */
1400 static void
1401 vect_update_vf_for_slp (loop_vec_info loop_vinfo)
1403 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1404 basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
1405 int nbbs = loop->num_nodes;
1406 poly_uint64 vectorization_factor;
1407 int i;
1409 DUMP_VECT_SCOPE ("vect_update_vf_for_slp");
1411 vectorization_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
1412 gcc_assert (known_ne (vectorization_factor, 0U));
1414 /* If all the stmts in the loop can be SLPed, we perform only SLP, and
1415 the vectorization factor of the loop is the unrolling factor required by
1416 the SLP instances. If that unrolling factor is 1, we say that we
1417 perform pure SLP on the loop; cross-iteration parallelism is not
1418 exploited. */
1419 bool only_slp_in_loop = true;
1420 for (i = 0; i < nbbs; i++)
1422 basic_block bb = bbs[i];
1423 for (gimple_stmt_iterator si = gsi_start_bb (bb); !gsi_end_p (si);
1424 gsi_next (&si))
1426 stmt_vec_info stmt_info = loop_vinfo->lookup_stmt (gsi_stmt (si));
1427 if (STMT_VINFO_IN_PATTERN_P (stmt_info)
1428 && STMT_VINFO_RELATED_STMT (stmt_info))
1429 stmt_info = STMT_VINFO_RELATED_STMT (stmt_info);
1430 if ((STMT_VINFO_RELEVANT_P (stmt_info)
1431 || VECTORIZABLE_CYCLE_DEF (STMT_VINFO_DEF_TYPE (stmt_info)))
1432 && !PURE_SLP_STMT (stmt_info))
1433 /* STMT needs both SLP and loop-based vectorization. */
1434 only_slp_in_loop = false;
1438 if (only_slp_in_loop)
1440 dump_printf_loc (MSG_NOTE, vect_location,
1441 "Loop contains only SLP stmts\n");
1442 vectorization_factor = LOOP_VINFO_SLP_UNROLLING_FACTOR (loop_vinfo);
1444 else
1446 dump_printf_loc (MSG_NOTE, vect_location,
1447 "Loop contains SLP and non-SLP stmts\n");
1448 /* Both the vectorization factor and unroll factor have the form
1449 current_vector_size * X for some rational X, so they must have
1450 a common multiple. */
1451 vectorization_factor
1452 = force_common_multiple (vectorization_factor,
1453 LOOP_VINFO_SLP_UNROLLING_FACTOR (loop_vinfo));
1456 LOOP_VINFO_VECT_FACTOR (loop_vinfo) = vectorization_factor;
1457 if (dump_enabled_p ())
1459 dump_printf_loc (MSG_NOTE, vect_location,
1460 "Updating vectorization factor to ");
1461 dump_dec (MSG_NOTE, vectorization_factor);
1462 dump_printf (MSG_NOTE, ".\n");
1463 }
1464 }
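/* Editorial sketch, not part of the original file: the combination rule
   described above.  A loop VF of 4 and an SLP unrolling factor of 6
   combine to their least common multiple 12; an unrolling factor of 1
   leaves the VF unchanged.  The function name is made up for
   illustration.  */
static poly_uint64
example_combined_vf (poly_uint64 loop_vf, poly_uint64 slp_uf)
{
  return force_common_multiple (loop_vf, slp_uf);
}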
1466 /* Return true if STMT_INFO describes a double reduction phi and if
1467 the other phi in the reduction is also relevant for vectorization.
1468 This rejects cases such as:
1470 outer1:
1471 x_1 = PHI <x_3(outer2), ...>;
1474 inner:
1475 x_2 = ...;
1478 outer2:
1479 x_3 = PHI <x_2(inner)>;
1481 if nothing in x_2 or elsewhere makes x_1 relevant. */
1483 static bool
1484 vect_active_double_reduction_p (stmt_vec_info stmt_info)
1486 if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_double_reduction_def)
1487 return false;
1489 return STMT_VINFO_RELEVANT_P (STMT_VINFO_REDUC_DEF (stmt_info));
1492 /* Function vect_analyze_loop_operations.
1494 Scan the loop stmts and make sure they are all vectorizable. */
1496 static bool
1497 vect_analyze_loop_operations (loop_vec_info loop_vinfo)
1499 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1500 basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
1501 int nbbs = loop->num_nodes;
1502 int i;
1503 stmt_vec_info stmt_info;
1504 bool need_to_vectorize = false;
1505 bool ok;
1507 DUMP_VECT_SCOPE ("vect_analyze_loop_operations");
1509 stmt_vector_for_cost cost_vec;
1510 cost_vec.create (2);
1512 for (i = 0; i < nbbs; i++)
1514 basic_block bb = bbs[i];
1516 for (gphi_iterator si = gsi_start_phis (bb); !gsi_end_p (si);
1517 gsi_next (&si))
1519 gphi *phi = si.phi ();
1520 ok = true;
1522 stmt_info = loop_vinfo->lookup_stmt (phi);
1523 if (dump_enabled_p ())
1525 dump_printf_loc (MSG_NOTE, vect_location, "examining phi: ");
1526 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, phi, 0);
1528 if (virtual_operand_p (gimple_phi_result (phi)))
1529 continue;
1531 /* Inner-loop loop-closed exit phi in outer-loop vectorization
1532 (i.e., a phi in the tail of the outer-loop). */
1533 if (! is_loop_header_bb_p (bb))
1535 /* FORNOW: we currently don't support the case that these phis
1536 are not used in the outerloop (unless it is a double reduction,
1537 i.e., this phi is vect_reduction_def), because this case
1538 requires us to actually do something here. */
1539 if (STMT_VINFO_LIVE_P (stmt_info)
1540 && !vect_active_double_reduction_p (stmt_info))
1542 if (dump_enabled_p ())
1543 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1544 "Unsupported loop-closed phi in "
1545 "outer-loop.\n");
1546 return false;
1549 /* If PHI is used in the outer loop, we check that its operand
1550 is defined in the inner loop. */
1551 if (STMT_VINFO_RELEVANT_P (stmt_info))
1553 tree phi_op;
1555 if (gimple_phi_num_args (phi) != 1)
1556 return false;
1558 phi_op = PHI_ARG_DEF (phi, 0);
1559 stmt_vec_info op_def_info = loop_vinfo->lookup_def (phi_op);
1560 if (!op_def_info)
1561 return false;
1563 if (STMT_VINFO_RELEVANT (op_def_info) != vect_used_in_outer
1564 && (STMT_VINFO_RELEVANT (op_def_info)
1565 != vect_used_in_outer_by_reduction))
1566 return false;
1569 continue;
1572 gcc_assert (stmt_info);
1574 if ((STMT_VINFO_RELEVANT (stmt_info) == vect_used_in_scope
1575 || STMT_VINFO_LIVE_P (stmt_info))
1576 && STMT_VINFO_DEF_TYPE (stmt_info) != vect_induction_def)
1578 /* A scalar-dependence cycle that we don't support. */
1579 if (dump_enabled_p ())
1580 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1581 "not vectorized: scalar dependence cycle.\n");
1582 return false;
1585 if (STMT_VINFO_RELEVANT_P (stmt_info))
1587 need_to_vectorize = true;
1588 if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_induction_def
1589 && ! PURE_SLP_STMT (stmt_info))
1590 ok = vectorizable_induction (stmt_info, NULL, NULL, NULL,
1591 &cost_vec);
1592 else if ((STMT_VINFO_DEF_TYPE (stmt_info) == vect_reduction_def
1593 || STMT_VINFO_DEF_TYPE (stmt_info) == vect_nested_cycle)
1594 && ! PURE_SLP_STMT (stmt_info))
1595 ok = vectorizable_reduction (stmt_info, NULL, NULL, NULL, NULL,
1596 &cost_vec);
1599 /* SLP PHIs are tested by vect_slp_analyze_node_operations. */
1600 if (ok
1601 && STMT_VINFO_LIVE_P (stmt_info)
1602 && !PURE_SLP_STMT (stmt_info))
1603 ok = vectorizable_live_operation (stmt_info, NULL, NULL, -1, NULL,
1604 &cost_vec);
1606 if (!ok)
1608 if (dump_enabled_p ())
1610 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1611 "not vectorized: relevant phi not "
1612 "supported: ");
1613 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, phi, 0);
1615 return false;
1619 for (gimple_stmt_iterator si = gsi_start_bb (bb); !gsi_end_p (si);
1620 gsi_next (&si))
1622 gimple *stmt = gsi_stmt (si);
1623 if (!gimple_clobber_p (stmt)
1624 && !vect_analyze_stmt (loop_vinfo->lookup_stmt (stmt),
1625 &need_to_vectorize,
1626 NULL, NULL, &cost_vec))
1627 return false;
1629 } /* bbs */
1631 add_stmt_costs (loop_vinfo->target_cost_data, &cost_vec);
1632 cost_vec.release ();
1634 /* All operations in the loop are either irrelevant (deal with loop
1635 control, or dead), or only used outside the loop and can be moved
1636 out of the loop (e.g. invariants, inductions). The loop can be
1637 optimized away by scalar optimizations. We're better off not
1638 touching this loop. */
1639 if (!need_to_vectorize)
1641 if (dump_enabled_p ())
1642 dump_printf_loc (MSG_NOTE, vect_location,
1643 "All the computation can be taken out of the loop.\n");
1644 if (dump_enabled_p ())
1645 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1646 "not vectorized: redundant loop. no profit to "
1647 "vectorize.\n");
1648 return false;
1651 return true;
1654 /* Analyze the cost of the loop described by LOOP_VINFO. Decide if it
1655 is worthwhile to vectorize. Return 1 if definitely yes, 0 if
1656 definitely no, or -1 if it's worth retrying. */
1658 static int
1659 vect_analyze_loop_costing (loop_vec_info loop_vinfo)
1661 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1662 unsigned int assumed_vf = vect_vf_for_cost (loop_vinfo);
1664 /* Only fully-masked loops can have iteration counts less than the
1665 vectorization factor. */
1666 if (!LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
1668 HOST_WIDE_INT max_niter;
1670 if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
1671 max_niter = LOOP_VINFO_INT_NITERS (loop_vinfo);
1672 else
1673 max_niter = max_stmt_executions_int (loop);
1675 if (max_niter != -1
1676 && (unsigned HOST_WIDE_INT) max_niter < assumed_vf)
1678 if (dump_enabled_p ())
1679 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1680 "not vectorized: iteration count smaller than "
1681 "vectorization factor.\n");
1682 return 0;
1686 int min_profitable_iters, min_profitable_estimate;
1687 vect_estimate_min_profitable_iters (loop_vinfo, &min_profitable_iters,
1688 &min_profitable_estimate);
1690 if (min_profitable_iters < 0)
1692 if (dump_enabled_p ())
1693 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1694 "not vectorized: vectorization not profitable.\n");
1695 if (dump_enabled_p ())
1696 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1697 "not vectorized: vector version will never be "
1698 "profitable.\n");
1699 return -1;
1702 int min_scalar_loop_bound = (PARAM_VALUE (PARAM_MIN_VECT_LOOP_BOUND)
1703 * assumed_vf);
1705 /* Use the cost model only if it is more conservative than user specified
1706 threshold. */
1707 unsigned int th = (unsigned) MAX (min_scalar_loop_bound,
1708 min_profitable_iters);
1710 LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo) = th;
1712 if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
1713 && LOOP_VINFO_INT_NITERS (loop_vinfo) < th)
1715 if (dump_enabled_p ())
1716 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1717 "not vectorized: vectorization not profitable.\n");
1718 if (dump_enabled_p ())
1719 dump_printf_loc (MSG_NOTE, vect_location,
1720 "not vectorized: iteration count smaller than user "
1721 "specified loop bound parameter or minimum profitable "
1722 "iterations (whichever is more conservative).\n");
1723 return 0;
1726 HOST_WIDE_INT estimated_niter = estimated_stmt_executions_int (loop);
1727 if (estimated_niter == -1)
1728 estimated_niter = likely_max_stmt_executions_int (loop);
1729 if (estimated_niter != -1
1730 && ((unsigned HOST_WIDE_INT) estimated_niter
1731 < MAX (th, (unsigned) min_profitable_estimate)))
1733 if (dump_enabled_p ())
1734 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1735 "not vectorized: estimated iteration count too "
1736 "small.\n");
1737 if (dump_enabled_p ())
1738 dump_printf_loc (MSG_NOTE, vect_location,
1739 "not vectorized: estimated iteration count smaller "
1740 "than specified loop bound parameter or minimum "
1741 "profitable iterations (whichever is more "
1742 "conservative).\n");
1743 return -1;
1746 return 1;
1747 }
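/* Editorial sketch, not part of the original file: the threshold computation
   above, in isolation.  With the default --param min-vect-loop-bound=0, an
   assumed VF of 4 and min_profitable_iters of 12, the threshold is 12, so a
   loop known to run fewer than 12 iterations is not vectorized.  The
   function name is made up for illustration.  */
static unsigned int
example_cost_model_threshold (unsigned int assumed_vf,
                              int min_profitable_iters)
{
  int min_scalar_loop_bound = (PARAM_VALUE (PARAM_MIN_VECT_LOOP_BOUND)
                               * assumed_vf);
  return (unsigned) MAX (min_scalar_loop_bound, min_profitable_iters);
}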
1749 static bool
1750 vect_get_datarefs_in_loop (loop_p loop, basic_block *bbs,
1751 vec<data_reference_p> *datarefs,
1752 unsigned int *n_stmts)
1754 *n_stmts = 0;
1755 for (unsigned i = 0; i < loop->num_nodes; i++)
1756 for (gimple_stmt_iterator gsi = gsi_start_bb (bbs[i]);
1757 !gsi_end_p (gsi); gsi_next (&gsi))
1759 gimple *stmt = gsi_stmt (gsi);
1760 if (is_gimple_debug (stmt))
1761 continue;
1762 ++(*n_stmts);
1763 if (!vect_find_stmt_data_reference (loop, stmt, datarefs))
1765 if (is_gimple_call (stmt) && loop->safelen)
1767 tree fndecl = gimple_call_fndecl (stmt), op;
1768 if (fndecl != NULL_TREE)
1770 cgraph_node *node = cgraph_node::get (fndecl);
1771 if (node != NULL && node->simd_clones != NULL)
1773 unsigned int j, n = gimple_call_num_args (stmt);
1774 for (j = 0; j < n; j++)
1776 op = gimple_call_arg (stmt, j);
1777 if (DECL_P (op)
1778 || (REFERENCE_CLASS_P (op)
1779 && get_base_address (op)))
1780 break;
1782 op = gimple_call_lhs (stmt);
1783 /* Ignore #pragma omp declare simd functions
1784 if they don't have data references in the
1785 call stmt itself. */
1786 if (j == n
1787 && !(op
1788 && (DECL_P (op)
1789 || (REFERENCE_CLASS_P (op)
1790 && get_base_address (op)))))
1791 continue;
1795 return false;
1797 /* If dependence analysis will give up due to the limit on the
1798 number of datarefs stop here and fail fatally. */
1799 if (datarefs->length ()
1800 > (unsigned)PARAM_VALUE (PARAM_LOOP_MAX_DATAREFS_FOR_DATADEPS))
1801 return false;
1803 return true;
1806 /* Function vect_analyze_loop_2.
1808 Apply a set of analyses on LOOP, and create a loop_vec_info struct
1809 for it. The different analyses will record information in the
1810 loop_vec_info struct. */
1811 static bool
1812 vect_analyze_loop_2 (loop_vec_info loop_vinfo, bool &fatal, unsigned *n_stmts)
1814 bool ok;
1815 int res;
1816 unsigned int max_vf = MAX_VECTORIZATION_FACTOR;
1817 poly_uint64 min_vf = 2;
1819 /* The first group of checks is independent of the vector size. */
1820 fatal = true;
1822 /* Find all data references in the loop (which correspond to vdefs/vuses)
1823 and analyze their evolution in the loop. */
1825 loop_p loop = LOOP_VINFO_LOOP (loop_vinfo);
1827 /* Gather the data references and count stmts in the loop. */
1828 if (!LOOP_VINFO_DATAREFS (loop_vinfo).exists ())
1830 if (!vect_get_datarefs_in_loop (loop, LOOP_VINFO_BBS (loop_vinfo),
1831 &LOOP_VINFO_DATAREFS (loop_vinfo),
1832 n_stmts))
1834 if (dump_enabled_p ())
1835 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1836 "not vectorized: loop contains function "
1837 "calls or data references that cannot "
1838 "be analyzed\n");
1839 return false;
1841 loop_vinfo->shared->save_datarefs ();
1843 else
1844 loop_vinfo->shared->check_datarefs ();
1846 /* Analyze the data references and also adjust the minimal
1847 vectorization factor according to the loads and stores. */
1849 ok = vect_analyze_data_refs (loop_vinfo, &min_vf);
1850 if (!ok)
1852 if (dump_enabled_p ())
1853 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1854 "bad data references.\n");
1855 return false;
1858 /* Classify all cross-iteration scalar data-flow cycles.
1859 Cross-iteration cycles caused by virtual phis are analyzed separately. */
1860 vect_analyze_scalar_cycles (loop_vinfo);
1862 vect_pattern_recog (loop_vinfo);
1864 vect_fixup_scalar_cycles_with_patterns (loop_vinfo);
1866 /* Analyze the access patterns of the data-refs in the loop (consecutive,
1867 complex, etc.). FORNOW: Only handle consecutive access pattern. */
1869 ok = vect_analyze_data_ref_accesses (loop_vinfo);
1870 if (!ok)
1872 if (dump_enabled_p ())
1873 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1874 "bad data access.\n");
1875 return false;
1878 /* Data-flow analysis to detect stmts that do not need to be vectorized. */
1880 ok = vect_mark_stmts_to_be_vectorized (loop_vinfo);
1881 if (!ok)
1883 if (dump_enabled_p ())
1884 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1885 "unexpected pattern.\n");
1886 return false;
1889 /* From here on, failures are no longer fatal: the rest of the analysis below depends on the vector size in some way, so it is worth retrying with a different size. */
1890 fatal = false;
1892 /* Analyze data dependences between the data-refs in the loop
1893 and adjust the maximum vectorization factor according to
1894 the dependences.
1895 FORNOW: fail at the first data dependence that we encounter. */
1897 ok = vect_analyze_data_ref_dependences (loop_vinfo, &max_vf);
1898 if (!ok
1899 || (max_vf != MAX_VECTORIZATION_FACTOR
1900 && maybe_lt (max_vf, min_vf)))
1902 if (dump_enabled_p ())
1903 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1904 "bad data dependence.\n");
1905 return false;
1907 LOOP_VINFO_MAX_VECT_FACTOR (loop_vinfo) = max_vf;
1909 ok = vect_determine_vectorization_factor (loop_vinfo);
1910 if (!ok)
1912 if (dump_enabled_p ())
1913 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1914 "can't determine vectorization factor.\n");
1915 return false;
1917 if (max_vf != MAX_VECTORIZATION_FACTOR
1918 && maybe_lt (max_vf, LOOP_VINFO_VECT_FACTOR (loop_vinfo)))
1920 if (dump_enabled_p ())
1921 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1922 "bad data dependence.\n");
1923 return false;
1926 /* Compute the scalar iteration cost. */
1927 vect_compute_single_scalar_iteration_cost (loop_vinfo);
1929 poly_uint64 saved_vectorization_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
1930 unsigned th;
1932 /* Check the SLP opportunities in the loop, analyze and build SLP trees. */
1933 ok = vect_analyze_slp (loop_vinfo, *n_stmts);
1934 if (!ok)
1935 return false;
1937 /* If there are any SLP instances mark them as pure_slp. */
1938 bool slp = vect_make_slp_decision (loop_vinfo);
1939 if (slp)
1941 /* Find stmts that need to be both vectorized and SLPed. */
1942 vect_detect_hybrid_slp (loop_vinfo);
1944 /* Update the vectorization factor based on the SLP decision. */
1945 vect_update_vf_for_slp (loop_vinfo);
1948 bool saved_can_fully_mask_p = LOOP_VINFO_CAN_FULLY_MASK_P (loop_vinfo);
1950 /* We don't expect to have to roll back to anything other than an empty
1951 set of rgroups. */
1952 gcc_assert (LOOP_VINFO_MASKS (loop_vinfo).is_empty ());
1954 /* This is the point where we can re-start analysis with SLP forced off. */
1955 start_over:
1957 /* Now the vectorization factor is final. */
1958 poly_uint64 vectorization_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
1959 gcc_assert (known_ne (vectorization_factor, 0U));
1961 if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo) && dump_enabled_p ())
1963 dump_printf_loc (MSG_NOTE, vect_location,
1964 "vectorization_factor = ");
1965 dump_dec (MSG_NOTE, vectorization_factor);
1966 dump_printf (MSG_NOTE, ", niters = " HOST_WIDE_INT_PRINT_DEC "\n",
1967 LOOP_VINFO_INT_NITERS (loop_vinfo));
1970 HOST_WIDE_INT max_niter
1971 = likely_max_stmt_executions_int (LOOP_VINFO_LOOP (loop_vinfo));
1973 /* Analyze the alignment of the data-refs in the loop.
1974 Fail if a data reference is found that cannot be vectorized. */
1976 ok = vect_analyze_data_refs_alignment (loop_vinfo);
1977 if (!ok)
1979 if (dump_enabled_p ())
1980 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1981 "bad data alignment.\n");
1982 return false;
1985 /* Prune the list of ddrs to be tested at run-time by versioning for alias.
1986 It is important to call pruning after vect_analyze_data_ref_accesses,
1987 since we use grouping information gathered by interleaving analysis. */
1988 ok = vect_prune_runtime_alias_test_list (loop_vinfo);
1989 if (!ok)
1990 return false;
1992 /* Do not invoke vect_enhance_data_refs_alignment for epilogue
1993 vectorization. */
1994 if (!LOOP_VINFO_EPILOGUE_P (loop_vinfo))
1996 /* This pass will decide on using loop versioning and/or loop peeling in
1997 order to enhance the alignment of data references in the loop. */
1998 ok = vect_enhance_data_refs_alignment (loop_vinfo);
1999 if (!ok)
2001 if (dump_enabled_p ())
2002 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2003 "bad data alignment.\n");
2004 return false;
2008 if (slp)
2010 /* Analyze operations in the SLP instances. Note this may
2011 remove unsupported SLP instances which makes the above
2012 SLP kind detection invalid. */
2013 unsigned old_size = LOOP_VINFO_SLP_INSTANCES (loop_vinfo).length ();
2014 vect_slp_analyze_operations (loop_vinfo);
2015 if (LOOP_VINFO_SLP_INSTANCES (loop_vinfo).length () != old_size)
2016 goto again;
2019 /* Scan all the remaining operations in the loop that are not subject
2020 to SLP and make sure they are vectorizable. */
2021 ok = vect_analyze_loop_operations (loop_vinfo);
2022 if (!ok)
2024 if (dump_enabled_p ())
2025 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2026 "bad operation or unsupported loop bound.\n");
2027 return false;
2030 /* Decide whether to use a fully-masked loop for this vectorization
2031 factor. */
2032 LOOP_VINFO_FULLY_MASKED_P (loop_vinfo)
2033 = (LOOP_VINFO_CAN_FULLY_MASK_P (loop_vinfo)
2034 && vect_verify_full_masking (loop_vinfo));
2035 if (dump_enabled_p ())
2037 if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
2038 dump_printf_loc (MSG_NOTE, vect_location,
2039 "using a fully-masked loop.\n");
2040 else
2041 dump_printf_loc (MSG_NOTE, vect_location,
2042 "not using a fully-masked loop.\n");
2045 /* If an epilogue loop is required because of data accesses with gaps,
2046 one additional iteration needs to be peeled. Check if there are
2047 enough iterations for vectorization. */
2048 if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo)
2049 && LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
2050 && !LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
2052 poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
2053 tree scalar_niters = LOOP_VINFO_NITERSM1 (loop_vinfo);
2055 if (known_lt (wi::to_widest (scalar_niters), vf))
2057 if (dump_enabled_p ())
2058 dump_printf_loc (MSG_NOTE, vect_location,
2059 "loop has no enough iterations to support"
2060 " peeling for gaps.\n");
2061 return false;
2065 /* Check that the costings of the loop make vectorizing worthwhile. */
2066 res = vect_analyze_loop_costing (loop_vinfo);
2067 if (res < 0)
2068 goto again;
2069 if (!res)
2071 if (dump_enabled_p ())
2072 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2073 "Loop costings not worthwhile.\n");
2074 return false;
2077 /* Decide whether we need to create an epilogue loop to handle
2078 remaining scalar iterations. */
2079 th = LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo);
2081 unsigned HOST_WIDE_INT const_vf;
2082 if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
2083 /* The main loop handles all iterations. */
2084 LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo) = false;
2085 else if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
2086 && LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) > 0)
2088 if (!multiple_p (LOOP_VINFO_INT_NITERS (loop_vinfo)
2089 - LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo),
2090 LOOP_VINFO_VECT_FACTOR (loop_vinfo)))
2091 LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo) = true;
2093 else if (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo)
2094 || !LOOP_VINFO_VECT_FACTOR (loop_vinfo).is_constant (&const_vf)
2095 || ((tree_ctz (LOOP_VINFO_NITERS (loop_vinfo))
2096 < (unsigned) exact_log2 (const_vf))
2097 /* In case of versioning, check if the maximum number of
2098 iterations is greater than th. If they are identical,
2099 the epilogue is unnecessary. */
2100 && (!LOOP_REQUIRES_VERSIONING (loop_vinfo)
2101 || ((unsigned HOST_WIDE_INT) max_niter
2102 > (th / const_vf) * const_vf))))
2103 LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo) = true;
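  /* As an illustrative example (hypothetical loop, no particular target):
     with a constant VF of 8 and no peeling for alignment, a loop whose
     niter expression is known to be a multiple of 8 (tree_ctz >= 3) needs
     no epilogue here, whereas one whose niters is only a multiple of 4
     (tree_ctz == 2) sets PEELING_FOR_NITER above (assuming no versioning
     threshold already makes the epilogue redundant).  */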
2105 /* If an epilogue loop is required make sure we can create one. */
2106 if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo)
2107 || LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo))
2109 if (dump_enabled_p ())
2110 dump_printf_loc (MSG_NOTE, vect_location, "epilog loop required\n");
2111 if (!vect_can_advance_ivs_p (loop_vinfo)
2112 || !slpeel_can_duplicate_loop_p (LOOP_VINFO_LOOP (loop_vinfo),
2113 single_exit (LOOP_VINFO_LOOP
2114 (loop_vinfo))))
2116 if (dump_enabled_p ())
2117 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2118 "not vectorized: can't create required "
2119 "epilog loop\n");
2120 goto again;
2124 /* During peeling, we need to check whether the number of loop iterations
2125 is enough for both the peeled prolog loop and the vector loop. This
2126 check can be merged with the threshold check of loop versioning, so
2127 increase the threshold for this case if necessary. */
2128 if (LOOP_REQUIRES_VERSIONING (loop_vinfo))
2130 poly_uint64 niters_th = 0;
2132 if (!vect_use_loop_mask_for_alignment_p (loop_vinfo))
2134 /* Niters for peeled prolog loop. */
2135 if (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) < 0)
2137 dr_vec_info *dr_info = LOOP_VINFO_UNALIGNED_DR (loop_vinfo);
2138 tree vectype = STMT_VINFO_VECTYPE (dr_info->stmt);
2139 niters_th += TYPE_VECTOR_SUBPARTS (vectype) - 1;
2141 else
2142 niters_th += LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo);
2145 /* Niters for at least one iteration of vectorized loop. */
2146 if (!LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
2147 niters_th += LOOP_VINFO_VECT_FACTOR (loop_vinfo);
2148 /* One additional iteration because of peeling for gaps. */
2149 if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo))
2150 niters_th += 1;
2151 LOOP_VINFO_VERSIONING_THRESHOLD (loop_vinfo) = niters_th;
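  /* A hypothetical example of the accumulation above: with VF = 4, an
     unknown alignment peel on a 4-element vector type (up to 3 prologue
     iterations), no full masking and peeling for gaps, the versioning
     threshold becomes 3 + 4 + 1 = 8 scalar iterations.  */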
2154 gcc_assert (known_eq (vectorization_factor,
2155 LOOP_VINFO_VECT_FACTOR (loop_vinfo)));
2157 /* Ok to vectorize! */
2158 return true;
2160 again:
2161 /* Try again with SLP forced off, but if we didn't do any SLP there is
2162 no point in re-trying. */
2163 if (!slp)
2164 return false;
2166 /* If there are reduction chains re-trying will fail anyway. */
2167 if (! LOOP_VINFO_REDUCTION_CHAINS (loop_vinfo).is_empty ())
2168 return false;
2170 /* Likewise if the grouped loads or stores in the SLP cannot be handled
2171 via interleaving or lane instructions. */
2172 slp_instance instance;
2173 slp_tree node;
2174 unsigned i, j;
2175 FOR_EACH_VEC_ELT (LOOP_VINFO_SLP_INSTANCES (loop_vinfo), i, instance)
2177 stmt_vec_info vinfo;
2178 vinfo = SLP_TREE_SCALAR_STMTS (SLP_INSTANCE_TREE (instance))[0];
2179 if (! STMT_VINFO_GROUPED_ACCESS (vinfo))
2180 continue;
2181 vinfo = DR_GROUP_FIRST_ELEMENT (vinfo);
2182 unsigned int size = DR_GROUP_SIZE (vinfo);
2183 tree vectype = STMT_VINFO_VECTYPE (vinfo);
2184 if (! vect_store_lanes_supported (vectype, size, false)
2185 && ! known_eq (TYPE_VECTOR_SUBPARTS (vectype), 1U)
2186 && ! vect_grouped_store_supported (vectype, size))
2187 return false;
2188 FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (instance), j, node)
2190 vinfo = SLP_TREE_SCALAR_STMTS (node)[0];
2191 vinfo = DR_GROUP_FIRST_ELEMENT (vinfo);
2192 bool single_element_p = !DR_GROUP_NEXT_ELEMENT (vinfo);
2193 size = DR_GROUP_SIZE (vinfo);
2194 vectype = STMT_VINFO_VECTYPE (vinfo);
2195 if (! vect_load_lanes_supported (vectype, size, false)
2196 && ! vect_grouped_load_supported (vectype, single_element_p,
2197 size))
2198 return false;
2202 if (dump_enabled_p ())
2203 dump_printf_loc (MSG_NOTE, vect_location,
2204 "re-trying with SLP disabled\n");
2206 /* Roll back state appropriately. No SLP this time. */
2207 slp = false;
2208 /* Restore the vectorization factor to what it was without SLP. */
2209 LOOP_VINFO_VECT_FACTOR (loop_vinfo) = saved_vectorization_factor;
2210 /* Free the SLP instances. */
2211 FOR_EACH_VEC_ELT (LOOP_VINFO_SLP_INSTANCES (loop_vinfo), j, instance)
2212 vect_free_slp_instance (instance, false);
2213 LOOP_VINFO_SLP_INSTANCES (loop_vinfo).release ();
2214 /* Reset SLP type to loop_vect on all stmts. */
2215 for (i = 0; i < LOOP_VINFO_LOOP (loop_vinfo)->num_nodes; ++i)
2217 basic_block bb = LOOP_VINFO_BBS (loop_vinfo)[i];
2218 for (gimple_stmt_iterator si = gsi_start_phis (bb);
2219 !gsi_end_p (si); gsi_next (&si))
2221 stmt_vec_info stmt_info = loop_vinfo->lookup_stmt (gsi_stmt (si));
2222 STMT_SLP_TYPE (stmt_info) = loop_vect;
2224 for (gimple_stmt_iterator si = gsi_start_bb (bb);
2225 !gsi_end_p (si); gsi_next (&si))
2227 stmt_vec_info stmt_info = loop_vinfo->lookup_stmt (gsi_stmt (si));
2228 STMT_SLP_TYPE (stmt_info) = loop_vect;
2229 if (STMT_VINFO_IN_PATTERN_P (stmt_info))
2231 gimple *pattern_def_seq = STMT_VINFO_PATTERN_DEF_SEQ (stmt_info);
2232 stmt_info = STMT_VINFO_RELATED_STMT (stmt_info);
2233 STMT_SLP_TYPE (stmt_info) = loop_vect;
2234 for (gimple_stmt_iterator pi = gsi_start (pattern_def_seq);
2235 !gsi_end_p (pi); gsi_next (&pi))
2236 STMT_SLP_TYPE (loop_vinfo->lookup_stmt (gsi_stmt (pi)))
2237 = loop_vect;
2241 /* Free optimized alias test DDRS. */
2242 LOOP_VINFO_LOWER_BOUNDS (loop_vinfo).truncate (0);
2243 LOOP_VINFO_COMP_ALIAS_DDRS (loop_vinfo).release ();
2244 LOOP_VINFO_CHECK_UNEQUAL_ADDRS (loop_vinfo).release ();
2245 /* Reset target cost data. */
2246 destroy_cost_data (LOOP_VINFO_TARGET_COST_DATA (loop_vinfo));
2247 LOOP_VINFO_TARGET_COST_DATA (loop_vinfo)
2248 = init_cost (LOOP_VINFO_LOOP (loop_vinfo));
2249 /* Reset accumulated rgroup information. */
2250 release_vec_loop_masks (&LOOP_VINFO_MASKS (loop_vinfo));
2251 /* Reset assorted flags. */
2252 LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo) = false;
2253 LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo) = false;
2254 LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo) = 0;
2255 LOOP_VINFO_VERSIONING_THRESHOLD (loop_vinfo) = 0;
2256 LOOP_VINFO_CAN_FULLY_MASK_P (loop_vinfo) = saved_can_fully_mask_p;
2258 goto start_over;
2261 /* Function vect_analyze_loop.
2263 Apply a set of analyses on LOOP, and create a loop_vec_info struct
2264 for it. The different analyses will record information in the
2265 loop_vec_info struct. If ORIG_LOOP_VINFO is not NULL, an epilogue
2266 loop for it must be vectorized. */
2267 loop_vec_info
2268 vect_analyze_loop (struct loop *loop, loop_vec_info orig_loop_vinfo,
2269 vec_info_shared *shared)
2271 loop_vec_info loop_vinfo;
2272 auto_vector_sizes vector_sizes;
2274 /* Autodetect first vector size we try. */
2275 current_vector_size = 0;
2276 targetm.vectorize.autovectorize_vector_sizes (&vector_sizes);
2277 unsigned int next_size = 0;
2279 DUMP_VECT_SCOPE ("analyze_loop_nest");
2281 if (loop_outer (loop)
2282 && loop_vec_info_for_loop (loop_outer (loop))
2283 && LOOP_VINFO_VECTORIZABLE_P (loop_vec_info_for_loop (loop_outer (loop))))
2285 if (dump_enabled_p ())
2286 dump_printf_loc (MSG_NOTE, vect_location,
2287 "outer-loop already vectorized.\n");
2288 return NULL;
2291 if (!find_loop_nest (loop, &shared->loop_nest))
2293 if (dump_enabled_p ())
2294 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2295 "not vectorized: loop nest containing two "
2296 "or more consecutive inner loops cannot be "
2297 "vectorized\n");
2298 return NULL;
2301 unsigned n_stmts = 0;
2302 poly_uint64 autodetected_vector_size = 0;
2303 while (1)
2305 /* Check the CFG characteristics of the loop (nesting, entry/exit). */
2306 loop_vinfo = vect_analyze_loop_form (loop, shared);
2307 if (!loop_vinfo)
2309 if (dump_enabled_p ())
2310 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2311 "bad loop form.\n");
2312 return NULL;
2315 bool fatal = false;
2317 if (orig_loop_vinfo)
2318 LOOP_VINFO_ORIG_LOOP_INFO (loop_vinfo) = orig_loop_vinfo;
2320 if (vect_analyze_loop_2 (loop_vinfo, fatal, &n_stmts))
2322 LOOP_VINFO_VECTORIZABLE_P (loop_vinfo) = 1;
2324 return loop_vinfo;
2327 delete loop_vinfo;
2329 if (next_size == 0)
2330 autodetected_vector_size = current_vector_size;
2332 if (next_size < vector_sizes.length ()
2333 && known_eq (vector_sizes[next_size], autodetected_vector_size))
2334 next_size += 1;
2336 if (fatal
2337 || next_size == vector_sizes.length ()
2338 || known_eq (current_vector_size, 0U))
2339 return NULL;
2341 /* Try the next biggest vector size. */
2342 current_vector_size = vector_sizes[next_size++];
2343 if (dump_enabled_p ())
2345 dump_printf_loc (MSG_NOTE, vect_location,
2346 "***** Re-trying analysis with "
2347 "vector size ");
2348 dump_dec (MSG_NOTE, current_vector_size);
2349 dump_printf (MSG_NOTE, "\n");
2354 /* Return true if there is an in-order reduction function for CODE, storing
2355 it in *REDUC_FN if so. */
2357 static bool
2358 fold_left_reduction_fn (tree_code code, internal_fn *reduc_fn)
2360 switch (code)
2362 case PLUS_EXPR:
2363 *reduc_fn = IFN_FOLD_LEFT_PLUS;
2364 return true;
2366 default:
2367 return false;
2371 /* Function reduction_fn_for_scalar_code
2373 Input:
2374 CODE - tree_code of a reduction operation.
2376 Output:
2377 REDUC_FN - the corresponding internal function to be used to reduce the
2378 vector of partial results into a single scalar result, or IFN_LAST
2379 if the operation is a supported reduction operation, but does not have
2380 such an internal function.
2382 Return FALSE if CODE currently cannot be vectorized as a reduction. */
2384 static bool
2385 reduction_fn_for_scalar_code (enum tree_code code, internal_fn *reduc_fn)
2387 switch (code)
2389 case MAX_EXPR:
2390 *reduc_fn = IFN_REDUC_MAX;
2391 return true;
2393 case MIN_EXPR:
2394 *reduc_fn = IFN_REDUC_MIN;
2395 return true;
2397 case PLUS_EXPR:
2398 *reduc_fn = IFN_REDUC_PLUS;
2399 return true;
2401 case BIT_AND_EXPR:
2402 *reduc_fn = IFN_REDUC_AND;
2403 return true;
2405 case BIT_IOR_EXPR:
2406 *reduc_fn = IFN_REDUC_IOR;
2407 return true;
2409 case BIT_XOR_EXPR:
2410 *reduc_fn = IFN_REDUC_XOR;
2411 return true;
2413 case MULT_EXPR:
2414 case MINUS_EXPR:
2415 *reduc_fn = IFN_LAST;
2416 return true;
2418 default:
2419 return false;
2423 /* If there is a neutral value X such that SLP reduction NODE would not
2424 be affected by the introduction of additional X elements, return that X,
2425 otherwise return null. CODE is the code of the reduction. REDUC_CHAIN
2426 is true if the SLP statements perform a single reduction, false if each
2427 statement performs an independent reduction. */
2429 static tree
2430 neutral_op_for_slp_reduction (slp_tree slp_node, tree_code code,
2431 bool reduc_chain)
2433 vec<stmt_vec_info> stmts = SLP_TREE_SCALAR_STMTS (slp_node);
2434 stmt_vec_info stmt_vinfo = stmts[0];
2435 tree vector_type = STMT_VINFO_VECTYPE (stmt_vinfo);
2436 tree scalar_type = TREE_TYPE (vector_type);
2437 struct loop *loop = gimple_bb (stmt_vinfo->stmt)->loop_father;
2438 gcc_assert (loop);
2440 switch (code)
2442 case WIDEN_SUM_EXPR:
2443 case DOT_PROD_EXPR:
2444 case SAD_EXPR:
2445 case PLUS_EXPR:
2446 case MINUS_EXPR:
2447 case BIT_IOR_EXPR:
2448 case BIT_XOR_EXPR:
2449 return build_zero_cst (scalar_type);
2451 case MULT_EXPR:
2452 return build_one_cst (scalar_type);
2454 case BIT_AND_EXPR:
2455 return build_all_ones_cst (scalar_type);
2457 case MAX_EXPR:
2458 case MIN_EXPR:
2459 /* For MIN/MAX the initial values are neutral. A reduction chain
2460 has only a single initial value, so that value is neutral for
2461 all statements. */
2462 if (reduc_chain)
2463 return PHI_ARG_DEF_FROM_EDGE (stmt_vinfo->stmt,
2464 loop_preheader_edge (loop));
2465 return NULL_TREE;
2467 default:
2468 return NULL_TREE;
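/* For instance, padding a PLUS_EXPR or BIT_XOR_EXPR reduction with extra
   zeros, a MULT_EXPR reduction with ones, or a BIT_AND_EXPR reduction with
   all-ones values leaves the accumulated result unchanged, which is what
   makes the constants returned above neutral.  */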
2472 /* Error reporting helper for vect_is_simple_reduction below. GIMPLE statement
2473 STMT is printed with a message MSG. */
2475 static void
2476 report_vect_op (dump_flags_t msg_type, gimple *stmt, const char *msg)
2478 dump_printf_loc (msg_type, vect_location, "%s", msg);
2479 dump_gimple_stmt (msg_type, TDF_SLIM, stmt, 0);
2482 /* DEF_STMT_INFO occurs in a loop that contains a potential reduction
2483 operation. Return true if the results of DEF_STMT_INFO are something
2484 that can be accumulated by such a reduction. */
2486 static bool
2487 vect_valid_reduction_input_p (stmt_vec_info def_stmt_info)
2489 return (is_gimple_assign (def_stmt_info->stmt)
2490 || is_gimple_call (def_stmt_info->stmt)
2491 || STMT_VINFO_DEF_TYPE (def_stmt_info) == vect_induction_def
2492 || (gimple_code (def_stmt_info->stmt) == GIMPLE_PHI
2493 && STMT_VINFO_DEF_TYPE (def_stmt_info) == vect_internal_def
2494 && !is_loop_header_bb_p (gimple_bb (def_stmt_info->stmt))));
2497 /* Detect SLP reduction of the form:
2499 #a1 = phi <a5, a0>
2500 a2 = operation (a1)
2501 a3 = operation (a2)
2502 a4 = operation (a3)
2503 a5 = operation (a4)
2505 #a = phi <a5>
2507 PHI is the reduction phi node (#a1 = phi <a5, a0> above)
2508 FIRST_STMT is the first reduction stmt in the chain
2509 (a2 = operation (a1)).
2511 Return TRUE if a reduction chain was detected. */
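/* An illustrative source-level form of such a chain (hypothetical code;
   the statements above correspond to the left-to-right accumulation):

     for (i = 0; i < n; i++)
       s = s + a[4*i] + a[4*i+1] + a[4*i+2] + a[4*i+3];

   Each intermediate sum has a single use in the next statement, and only
   the final value feeds the loop-closed PHI.  */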
2513 static bool
2514 vect_is_slp_reduction (loop_vec_info loop_info, gimple *phi,
2515 gimple *first_stmt)
2517 struct loop *loop = (gimple_bb (phi))->loop_father;
2518 struct loop *vect_loop = LOOP_VINFO_LOOP (loop_info);
2519 enum tree_code code;
2520 gimple *loop_use_stmt = NULL;
2521 stmt_vec_info use_stmt_info, current_stmt_info = NULL;
2522 tree lhs;
2523 imm_use_iterator imm_iter;
2524 use_operand_p use_p;
2525 int nloop_uses, size = 0, n_out_of_loop_uses;
2526 bool found = false;
2528 if (loop != vect_loop)
2529 return false;
2531 lhs = PHI_RESULT (phi);
2532 code = gimple_assign_rhs_code (first_stmt);
2533 while (1)
2535 nloop_uses = 0;
2536 n_out_of_loop_uses = 0;
2537 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, lhs)
2539 gimple *use_stmt = USE_STMT (use_p);
2540 if (is_gimple_debug (use_stmt))
2541 continue;
2543 /* Check if we got back to the reduction phi. */
2544 if (use_stmt == phi)
2546 loop_use_stmt = use_stmt;
2547 found = true;
2548 break;
2551 if (flow_bb_inside_loop_p (loop, gimple_bb (use_stmt)))
2553 loop_use_stmt = use_stmt;
2554 nloop_uses++;
2556 else
2557 n_out_of_loop_uses++;
2559 /* There can be either a single use in the loop or two uses in
2560 phi nodes. */
2561 if (nloop_uses > 1 || (n_out_of_loop_uses && nloop_uses))
2562 return false;
2565 if (found)
2566 break;
2568 /* We reached a statement with no loop uses. */
2569 if (nloop_uses == 0)
2570 return false;
2572 /* This is a loop exit phi, and we haven't reached the reduction phi. */
2573 if (gimple_code (loop_use_stmt) == GIMPLE_PHI)
2574 return false;
2576 if (!is_gimple_assign (loop_use_stmt)
2577 || code != gimple_assign_rhs_code (loop_use_stmt)
2578 || !flow_bb_inside_loop_p (loop, gimple_bb (loop_use_stmt)))
2579 return false;
2581 /* Insert USE_STMT into reduction chain. */
2582 use_stmt_info = loop_info->lookup_stmt (loop_use_stmt);
2583 if (current_stmt_info)
2585 REDUC_GROUP_NEXT_ELEMENT (current_stmt_info) = use_stmt_info;
2586 REDUC_GROUP_FIRST_ELEMENT (use_stmt_info)
2587 = REDUC_GROUP_FIRST_ELEMENT (current_stmt_info);
2589 else
2590 REDUC_GROUP_FIRST_ELEMENT (use_stmt_info) = use_stmt_info;
2592 lhs = gimple_assign_lhs (loop_use_stmt);
2593 current_stmt_info = use_stmt_info;
2594 size++;
2597 if (!found || loop_use_stmt != phi || size < 2)
2598 return false;
2600 /* Swap the operands, if needed, to make the reduction operand be the second
2601 operand. */
2602 lhs = PHI_RESULT (phi);
2603 stmt_vec_info next_stmt_info = REDUC_GROUP_FIRST_ELEMENT (current_stmt_info);
2604 while (next_stmt_info)
2606 gassign *next_stmt = as_a <gassign *> (next_stmt_info->stmt);
2607 if (gimple_assign_rhs2 (next_stmt) == lhs)
2609 tree op = gimple_assign_rhs1 (next_stmt);
2610 stmt_vec_info def_stmt_info = loop_info->lookup_def (op);
2612 /* Check that the other def is either defined in the loop
2613 ("vect_internal_def"), or it's an induction (defined by a
2614 loop-header phi-node). */
2615 if (def_stmt_info
2616 && flow_bb_inside_loop_p (loop, gimple_bb (def_stmt_info->stmt))
2617 && vect_valid_reduction_input_p (def_stmt_info))
2619 lhs = gimple_assign_lhs (next_stmt);
2620 next_stmt_info = REDUC_GROUP_NEXT_ELEMENT (next_stmt_info);
2621 continue;
2624 return false;
2626 else
2628 tree op = gimple_assign_rhs2 (next_stmt);
2629 stmt_vec_info def_stmt_info = loop_info->lookup_def (op);
2631 /* Check that the other def is either defined in the loop
2632 ("vect_internal_def"), or it's an induction (defined by a
2633 loop-header phi-node). */
2634 if (def_stmt_info
2635 && flow_bb_inside_loop_p (loop, gimple_bb (def_stmt_info->stmt))
2636 && vect_valid_reduction_input_p (def_stmt_info))
2638 if (dump_enabled_p ())
2640 dump_printf_loc (MSG_NOTE, vect_location, "swapping oprnds: ");
2641 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, next_stmt, 0);
2644 swap_ssa_operands (next_stmt,
2645 gimple_assign_rhs1_ptr (next_stmt),
2646 gimple_assign_rhs2_ptr (next_stmt));
2647 update_stmt (next_stmt);
2649 if (CONSTANT_CLASS_P (gimple_assign_rhs1 (next_stmt)))
2650 LOOP_VINFO_OPERANDS_SWAPPED (loop_info) = true;
2652 else
2653 return false;
2656 lhs = gimple_assign_lhs (next_stmt);
2657 next_stmt_info = REDUC_GROUP_NEXT_ELEMENT (next_stmt_info);
2660 /* Save the chain for further analysis in SLP detection. */
2661 stmt_vec_info first_stmt_info
2662 = REDUC_GROUP_FIRST_ELEMENT (current_stmt_info);
2663 LOOP_VINFO_REDUCTION_CHAINS (loop_info).safe_push (first_stmt_info);
2664 REDUC_GROUP_SIZE (first_stmt_info) = size;
2666 return true;
2669 /* Return true if we need an in-order reduction for operation CODE
2670 on type TYPE. NEED_WRAPPING_INTEGRAL_OVERFLOW is true if integer
2671 overflow must wrap. */
2673 static bool
2674 needs_fold_left_reduction_p (tree type, tree_code code,
2675 bool need_wrapping_integral_overflow)
2677 /* CHECKME: check for !flag_finite_math_only too? */
2678 if (SCALAR_FLOAT_TYPE_P (type))
2679 switch (code)
2681 case MIN_EXPR:
2682 case MAX_EXPR:
2683 return false;
2685 default:
2686 return !flag_associative_math;
2689 if (INTEGRAL_TYPE_P (type))
2691 if (!operation_no_trapping_overflow (type, code))
2692 return true;
2693 if (need_wrapping_integral_overflow
2694 && !TYPE_OVERFLOW_WRAPS (type)
2695 && operation_can_overflow (code))
2696 return true;
2697 return false;
2700 if (SAT_FIXED_POINT_TYPE_P (type))
2701 return true;
2703 return false;
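/* As an example (illustrative, target-independent): a sum of doubles
   compiled without -fassociative-math must keep the original evaluation
   order, so the function above returns true and the caller requests
   FOLD_LEFT_REDUCTION; the same sum over an unsigned integer type wraps on
   overflow and returns false, allowing a normal reassociated reduction.  */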
2706 /* Return true if the reduction PHI in LOOP with latch arg LOOP_ARG and
2707 reduction operation CODE has a handled computation expression. */
2709 bool
2710 check_reduction_path (dump_user_location_t loc, loop_p loop, gphi *phi,
2711 tree loop_arg, enum tree_code code)
2713 auto_vec<std::pair<ssa_op_iter, use_operand_p> > path;
2714 auto_bitmap visited;
2715 tree lookfor = PHI_RESULT (phi);
2716 ssa_op_iter curri;
2717 use_operand_p curr = op_iter_init_phiuse (&curri, phi, SSA_OP_USE);
2718 while (USE_FROM_PTR (curr) != loop_arg)
2719 curr = op_iter_next_use (&curri);
2720 curri.i = curri.numops;
2723 path.safe_push (std::make_pair (curri, curr));
2724 tree use = USE_FROM_PTR (curr);
2725 if (use == lookfor)
2726 break;
2727 gimple *def = SSA_NAME_DEF_STMT (use);
2728 if (gimple_nop_p (def)
2729 || ! flow_bb_inside_loop_p (loop, gimple_bb (def)))
2731 pop:
2734 std::pair<ssa_op_iter, use_operand_p> x = path.pop ();
2735 curri = x.first;
2736 curr = x.second;
2738 curr = op_iter_next_use (&curri);
2739 /* Skip already visited or non-SSA operands (from iterating
2740 over PHI args). */
2741 while (curr != NULL_USE_OPERAND_P
2742 && (TREE_CODE (USE_FROM_PTR (curr)) != SSA_NAME
2743 || ! bitmap_set_bit (visited,
2744 SSA_NAME_VERSION
2745 (USE_FROM_PTR (curr)))));
2747 while (curr == NULL_USE_OPERAND_P && ! path.is_empty ());
2748 if (curr == NULL_USE_OPERAND_P)
2749 break;
2751 else
2753 if (gimple_code (def) == GIMPLE_PHI)
2754 curr = op_iter_init_phiuse (&curri, as_a <gphi *>(def), SSA_OP_USE);
2755 else
2756 curr = op_iter_init_use (&curri, def, SSA_OP_USE);
2757 while (curr != NULL_USE_OPERAND_P
2758 && (TREE_CODE (USE_FROM_PTR (curr)) != SSA_NAME
2759 || ! bitmap_set_bit (visited,
2760 SSA_NAME_VERSION
2761 (USE_FROM_PTR (curr)))))
2762 curr = op_iter_next_use (&curri);
2763 if (curr == NULL_USE_OPERAND_P)
2764 goto pop;
2767 while (1);
2768 if (dump_file && (dump_flags & TDF_DETAILS))
2770 dump_printf_loc (MSG_NOTE, loc, "reduction path: ");
2771 unsigned i;
2772 std::pair<ssa_op_iter, use_operand_p> *x;
2773 FOR_EACH_VEC_ELT (path, i, x)
2775 dump_generic_expr (MSG_NOTE, TDF_SLIM, USE_FROM_PTR (x->second));
2776 dump_printf (MSG_NOTE, " ");
2778 dump_printf (MSG_NOTE, "\n");
2781 /* Check whether the reduction path detected is valid. */
2782 bool fail = path.length () == 0;
2783 bool neg = false;
2784 for (unsigned i = 1; i < path.length (); ++i)
2786 gimple *use_stmt = USE_STMT (path[i].second);
2787 tree op = USE_FROM_PTR (path[i].second);
2788 if (! has_single_use (op)
2789 || ! is_gimple_assign (use_stmt))
2791 fail = true;
2792 break;
2794 if (gimple_assign_rhs_code (use_stmt) != code)
2796 if (code == PLUS_EXPR
2797 && gimple_assign_rhs_code (use_stmt) == MINUS_EXPR)
2799 /* Track whether we negate the reduction value each iteration. */
2800 if (gimple_assign_rhs2 (use_stmt) == op)
2801 neg = ! neg;
2803 else
2805 fail = true;
2806 break;
2810 return ! fail && ! neg;
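/* Illustrative behaviour of the walk above (hypothetical snippets): for a
   body computing s_1 = (s_phi + a[i]) + b[i], each intermediate value has a
   single use and each statement has the expected CODE, so the path is
   accepted; if CODE is PLUS_EXPR but an intermediate statement computes
   t = x - s_phi, the reduction value is negated each iteration, NEG ends up
   set and the path is rejected.  */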
2814 /* Function vect_is_simple_reduction
2816 (1) Detect a cross-iteration def-use cycle that represents a simple
2817 reduction computation. We look for the following pattern:
2819 loop_header:
2820 a1 = phi < a0, a2 >
2821 a3 = ...
2822 a2 = operation (a3, a1)
2826 a3 = ...
2827 loop_header:
2828 a1 = phi < a0, a2 >
2829 a2 = operation (a3, a1)
2831 such that:
2832 1. operation is commutative and associative and it is safe to
2833 change the order of the computation
2834 2. no uses for a2 in the loop (a2 is used out of the loop)
2835 3. no uses of a1 in the loop besides the reduction operation
2836 4. no uses of a1 outside the loop.
2838 Conditions 1,4 are tested here.
2839 Conditions 2,3 are tested in vect_mark_stmts_to_be_vectorized.
2841 (2) Detect a cross-iteration def-use cycle in nested loops, i.e.,
2842 nested cycles.
2844 (3) Detect cycles of phi nodes in outer-loop vectorization, i.e., double
2845 reductions:
2847 a1 = phi < a0, a2 >
2848 inner loop (def of a3)
2849 a2 = phi < a3 >
2851 (4) Detect condition expressions, i.e.:
2852 for (int i = 0; i < N; i++)
2853 if (a[i] < val)
2854 ret_val = a[i];
2858 static stmt_vec_info
2859 vect_is_simple_reduction (loop_vec_info loop_info, stmt_vec_info phi_info,
2860 bool *double_reduc,
2861 bool need_wrapping_integral_overflow,
2862 enum vect_reduction_type *v_reduc_type)
2864 gphi *phi = as_a <gphi *> (phi_info->stmt);
2865 struct loop *loop = (gimple_bb (phi))->loop_father;
2866 struct loop *vect_loop = LOOP_VINFO_LOOP (loop_info);
2867 gimple *phi_use_stmt = NULL;
2868 enum tree_code orig_code, code;
2869 tree op1, op2, op3 = NULL_TREE, op4 = NULL_TREE;
2870 tree type;
2871 int nloop_uses;
2872 tree name;
2873 imm_use_iterator imm_iter;
2874 use_operand_p use_p;
2875 bool phi_def;
2877 *double_reduc = false;
2878 *v_reduc_type = TREE_CODE_REDUCTION;
2880 tree phi_name = PHI_RESULT (phi);
2881 /* ??? If there are no uses of the PHI result the inner loop reduction
2882 won't be detected as possibly double-reduction by vectorizable_reduction
2883 because that tries to walk the PHI arg from the preheader edge which
2884 can be constant. See PR60382. */
2885 if (has_zero_uses (phi_name))
2886 return NULL;
2887 nloop_uses = 0;
2888 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, phi_name)
2890 gimple *use_stmt = USE_STMT (use_p);
2891 if (is_gimple_debug (use_stmt))
2892 continue;
2894 if (!flow_bb_inside_loop_p (loop, gimple_bb (use_stmt)))
2896 if (dump_enabled_p ())
2897 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2898 "intermediate value used outside loop.\n");
2900 return NULL;
2903 nloop_uses++;
2904 if (nloop_uses > 1)
2906 if (dump_enabled_p ())
2907 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2908 "reduction value used in loop.\n");
2909 return NULL;
2912 phi_use_stmt = use_stmt;
2915 edge latch_e = loop_latch_edge (loop);
2916 tree loop_arg = PHI_ARG_DEF_FROM_EDGE (phi, latch_e);
2917 if (TREE_CODE (loop_arg) != SSA_NAME)
2919 if (dump_enabled_p ())
2921 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2922 "reduction: not ssa_name: ");
2923 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, loop_arg);
2924 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
2926 return NULL;
2929 stmt_vec_info def_stmt_info = loop_info->lookup_def (loop_arg);
2930 if (!def_stmt_info)
2931 return NULL;
2933 if (gassign *def_stmt = dyn_cast <gassign *> (def_stmt_info->stmt))
2935 name = gimple_assign_lhs (def_stmt);
2936 phi_def = false;
2938 else if (gphi *def_stmt = dyn_cast <gphi *> (def_stmt_info->stmt))
2940 name = PHI_RESULT (def_stmt);
2941 phi_def = true;
2943 else
2945 if (dump_enabled_p ())
2947 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2948 "reduction: unhandled reduction operation: ");
2949 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
2950 def_stmt_info->stmt, 0);
2952 return NULL;
2955 nloop_uses = 0;
2956 auto_vec<gphi *, 3> lcphis;
2957 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, name)
2959 gimple *use_stmt = USE_STMT (use_p);
2960 if (is_gimple_debug (use_stmt))
2961 continue;
2962 if (flow_bb_inside_loop_p (loop, gimple_bb (use_stmt)))
2963 nloop_uses++;
2964 else
2965 /* We can have more than one loop-closed PHI. */
2966 lcphis.safe_push (as_a <gphi *> (use_stmt));
2967 if (nloop_uses > 1)
2969 if (dump_enabled_p ())
2970 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2971 "reduction used in loop.\n");
2972 return NULL;
2976 /* If DEF_STMT is a phi node itself, we expect it to have a single argument
2977 defined in the inner loop. */
2978 if (phi_def)
2980 gphi *def_stmt = as_a <gphi *> (def_stmt_info->stmt);
2981 op1 = PHI_ARG_DEF (def_stmt, 0);
2983 if (gimple_phi_num_args (def_stmt) != 1
2984 || TREE_CODE (op1) != SSA_NAME)
2986 if (dump_enabled_p ())
2987 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2988 "unsupported phi node definition.\n");
2990 return NULL;
2993 gimple *def1 = SSA_NAME_DEF_STMT (op1);
2994 if (gimple_bb (def1)
2995 && flow_bb_inside_loop_p (loop, gimple_bb (def_stmt))
2996 && loop->inner
2997 && flow_bb_inside_loop_p (loop->inner, gimple_bb (def1))
2998 && is_gimple_assign (def1)
2999 && flow_bb_inside_loop_p (loop->inner, gimple_bb (phi_use_stmt)))
3001 if (dump_enabled_p ())
3002 report_vect_op (MSG_NOTE, def_stmt,
3003 "detected double reduction: ");
3005 *double_reduc = true;
3006 return def_stmt_info;
3009 return NULL;
3012 /* If we are vectorizing an inner reduction, we execute it in the
3013 original order only when we are not dealing with a double
3014 reduction. */
3015 bool check_reduction = true;
3016 if (flow_loop_nested_p (vect_loop, loop))
3018 gphi *lcphi;
3019 unsigned i;
3020 check_reduction = false;
3021 FOR_EACH_VEC_ELT (lcphis, i, lcphi)
3022 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, gimple_phi_result (lcphi))
3024 gimple *use_stmt = USE_STMT (use_p);
3025 if (is_gimple_debug (use_stmt))
3026 continue;
3027 if (! flow_bb_inside_loop_p (vect_loop, gimple_bb (use_stmt)))
3028 check_reduction = true;
3032 gassign *def_stmt = as_a <gassign *> (def_stmt_info->stmt);
3033 bool nested_in_vect_loop = flow_loop_nested_p (vect_loop, loop);
3034 code = orig_code = gimple_assign_rhs_code (def_stmt);
3036 /* We can handle "res -= x[i]", which is non-associative, by
3037 simply rewriting it into "res += -x[i]". Avoid changing the
3038 gimple instruction for the first simple tests and only do this
3039 if we're allowed to change the code at all. */
3040 if (code == MINUS_EXPR && gimple_assign_rhs2 (def_stmt) != phi_name)
3041 code = PLUS_EXPR;
3043 if (code == COND_EXPR)
3045 if (! nested_in_vect_loop)
3046 *v_reduc_type = COND_REDUCTION;
3048 op3 = gimple_assign_rhs1 (def_stmt);
3049 if (COMPARISON_CLASS_P (op3))
3051 op4 = TREE_OPERAND (op3, 1);
3052 op3 = TREE_OPERAND (op3, 0);
3054 if (op3 == phi_name || op4 == phi_name)
3056 if (dump_enabled_p ())
3057 report_vect_op (MSG_MISSED_OPTIMIZATION, def_stmt,
3058 "reduction: condition depends on previous"
3059 " iteration: ");
3060 return NULL;
3063 op1 = gimple_assign_rhs2 (def_stmt);
3064 op2 = gimple_assign_rhs3 (def_stmt);
3066 else if (!commutative_tree_code (code) || !associative_tree_code (code))
3068 if (dump_enabled_p ())
3069 report_vect_op (MSG_MISSED_OPTIMIZATION, def_stmt,
3070 "reduction: not commutative/associative: ");
3071 return NULL;
3073 else if (get_gimple_rhs_class (code) == GIMPLE_BINARY_RHS)
3075 op1 = gimple_assign_rhs1 (def_stmt);
3076 op2 = gimple_assign_rhs2 (def_stmt);
3078 else
3080 if (dump_enabled_p ())
3081 report_vect_op (MSG_MISSED_OPTIMIZATION, def_stmt,
3082 "reduction: not handled operation: ");
3083 return NULL;
3086 if (TREE_CODE (op1) != SSA_NAME && TREE_CODE (op2) != SSA_NAME)
3088 if (dump_enabled_p ())
3089 report_vect_op (MSG_MISSED_OPTIMIZATION, def_stmt,
3090 "reduction: both uses not ssa_names: ");
3092 return NULL;
3095 type = TREE_TYPE (gimple_assign_lhs (def_stmt));
3096 if ((TREE_CODE (op1) == SSA_NAME
3097 && !types_compatible_p (type,TREE_TYPE (op1)))
3098 || (TREE_CODE (op2) == SSA_NAME
3099 && !types_compatible_p (type, TREE_TYPE (op2)))
3100 || (op3 && TREE_CODE (op3) == SSA_NAME
3101 && !types_compatible_p (type, TREE_TYPE (op3)))
3102 || (op4 && TREE_CODE (op4) == SSA_NAME
3103 && !types_compatible_p (type, TREE_TYPE (op4))))
3105 if (dump_enabled_p ())
3107 dump_printf_loc (MSG_NOTE, vect_location,
3108 "reduction: multiple types: operation type: ");
3109 dump_generic_expr (MSG_NOTE, TDF_SLIM, type);
3110 dump_printf (MSG_NOTE, ", operands types: ");
3111 dump_generic_expr (MSG_NOTE, TDF_SLIM,
3112 TREE_TYPE (op1));
3113 dump_printf (MSG_NOTE, ",");
3114 dump_generic_expr (MSG_NOTE, TDF_SLIM,
3115 TREE_TYPE (op2));
3116 if (op3)
3118 dump_printf (MSG_NOTE, ",");
3119 dump_generic_expr (MSG_NOTE, TDF_SLIM,
3120 TREE_TYPE (op3));
3123 if (op4)
3125 dump_printf (MSG_NOTE, ",");
3126 dump_generic_expr (MSG_NOTE, TDF_SLIM,
3127 TREE_TYPE (op4));
3129 dump_printf (MSG_NOTE, "\n");
3132 return NULL;
3135 /* Check whether it's ok to change the order of the computation.
3136 Generally, when vectorizing a reduction we change the order of the
3137 computation. This may change the behavior of the program in some
3138 cases, so we need to check that this is ok. One exception is when
3139 vectorizing an outer-loop: the inner-loop is executed sequentially,
3140 and therefore vectorizing reductions in the inner-loop during
3141 outer-loop vectorization is safe. */
3142 if (check_reduction
3143 && *v_reduc_type == TREE_CODE_REDUCTION
3144 && needs_fold_left_reduction_p (type, code,
3145 need_wrapping_integral_overflow))
3146 *v_reduc_type = FOLD_LEFT_REDUCTION;
3148 /* Reduction is safe. We're dealing with one of the following:
3149 1) integer arithmetic and no trapv
3150 2) floating point arithmetic, and special flags permit this optimization
3151 3) nested cycle (i.e., outer loop vectorization). */
3152 stmt_vec_info def1_info = loop_info->lookup_def (op1);
3153 stmt_vec_info def2_info = loop_info->lookup_def (op2);
3154 if (code != COND_EXPR && !def1_info && !def2_info)
3156 if (dump_enabled_p ())
3157 report_vect_op (MSG_NOTE, def_stmt, "reduction: no defs for operands: ");
3158 return NULL;
3161 /* Check that one def is the reduction def, defined by PHI,
3162 the other def is either defined in the loop ("vect_internal_def"),
3163 or it's an induction (defined by a loop-header phi-node). */
3165 if (def2_info
3166 && def2_info->stmt == phi
3167 && (code == COND_EXPR
3168 || !def1_info
3169 || vect_valid_reduction_input_p (def1_info)))
3171 if (dump_enabled_p ())
3172 report_vect_op (MSG_NOTE, def_stmt, "detected reduction: ");
3173 return def_stmt_info;
3176 if (def1_info
3177 && def1_info->stmt == phi
3178 && (code == COND_EXPR
3179 || !def2_info
3180 || vect_valid_reduction_input_p (def2_info)))
3182 if (! nested_in_vect_loop && orig_code != MINUS_EXPR)
3184 /* Check if we can swap operands (just for simplicity - so that
3185 the rest of the code can assume that the reduction variable
3186 is always the last (second) argument). */
3187 if (code == COND_EXPR)
3189 /* Swap cond_expr by inverting the condition. */
3190 tree cond_expr = gimple_assign_rhs1 (def_stmt);
3191 enum tree_code invert_code = ERROR_MARK;
3192 enum tree_code cond_code = TREE_CODE (cond_expr);
3194 if (TREE_CODE_CLASS (cond_code) == tcc_comparison)
3196 bool honor_nans = HONOR_NANS (TREE_OPERAND (cond_expr, 0));
3197 invert_code = invert_tree_comparison (cond_code, honor_nans);
3199 if (invert_code != ERROR_MARK)
3201 TREE_SET_CODE (cond_expr, invert_code);
3202 swap_ssa_operands (def_stmt,
3203 gimple_assign_rhs2_ptr (def_stmt),
3204 gimple_assign_rhs3_ptr (def_stmt));
3206 else
3208 if (dump_enabled_p ())
3209 report_vect_op (MSG_NOTE, def_stmt,
3210 "detected reduction: cannot swap operands "
3211 "for cond_expr");
3212 return NULL;
3215 else
3216 swap_ssa_operands (def_stmt, gimple_assign_rhs1_ptr (def_stmt),
3217 gimple_assign_rhs2_ptr (def_stmt));
3219 if (dump_enabled_p ())
3220 report_vect_op (MSG_NOTE, def_stmt,
3221 "detected reduction: need to swap operands: ");
3223 if (CONSTANT_CLASS_P (gimple_assign_rhs1 (def_stmt)))
3224 LOOP_VINFO_OPERANDS_SWAPPED (loop_info) = true;
3226 else
3228 if (dump_enabled_p ())
3229 report_vect_op (MSG_NOTE, def_stmt, "detected reduction: ");
3232 return def_stmt_info;
3235 /* Try to find SLP reduction chain. */
3236 if (! nested_in_vect_loop
3237 && code != COND_EXPR
3238 && orig_code != MINUS_EXPR
3239 && vect_is_slp_reduction (loop_info, phi, def_stmt))
3241 if (dump_enabled_p ())
3242 report_vect_op (MSG_NOTE, def_stmt,
3243 "reduction: detected reduction chain: ");
3245 return def_stmt_info;
3248 /* Dissolve a group possibly half-built by vect_is_slp_reduction. */
3249 stmt_vec_info first = REDUC_GROUP_FIRST_ELEMENT (def_stmt_info);
3250 while (first)
3252 stmt_vec_info next = REDUC_GROUP_NEXT_ELEMENT (first);
3253 REDUC_GROUP_FIRST_ELEMENT (first) = NULL;
3254 REDUC_GROUP_NEXT_ELEMENT (first) = NULL;
3255 first = next;
3258 /* Look for the expression computing loop_arg from loop PHI result. */
3259 if (check_reduction_path (vect_location, loop, phi, loop_arg, code))
3260 return def_stmt_info;
3262 if (dump_enabled_p ())
3264 report_vect_op (MSG_MISSED_OPTIMIZATION, def_stmt,
3265 "reduction: unknown pattern: ");
3268 return NULL;
3271 /* Wrapper around vect_is_simple_reduction, which will modify code
3272 in-place if it enables detection of more reductions. The arguments
3273 are the same as for that function. */
3275 stmt_vec_info
3276 vect_force_simple_reduction (loop_vec_info loop_info, stmt_vec_info phi_info,
3277 bool *double_reduc,
3278 bool need_wrapping_integral_overflow)
3280 enum vect_reduction_type v_reduc_type;
3281 stmt_vec_info def_info
3282 = vect_is_simple_reduction (loop_info, phi_info, double_reduc,
3283 need_wrapping_integral_overflow,
3284 &v_reduc_type);
3285 if (def_info)
3287 STMT_VINFO_REDUC_TYPE (phi_info) = v_reduc_type;
3288 STMT_VINFO_REDUC_DEF (phi_info) = def_info;
3289 STMT_VINFO_REDUC_TYPE (def_info) = v_reduc_type;
3290 STMT_VINFO_REDUC_DEF (def_info) = phi_info;
3292 return def_info;
3295 /* Calculate cost of peeling the loop PEEL_ITERS_PROLOGUE times. */
3297 vect_get_known_peeling_cost (loop_vec_info loop_vinfo, int peel_iters_prologue,
3298 int *peel_iters_epilogue,
3299 stmt_vector_for_cost *scalar_cost_vec,
3300 stmt_vector_for_cost *prologue_cost_vec,
3301 stmt_vector_for_cost *epilogue_cost_vec)
3303 int retval = 0;
3304 int assumed_vf = vect_vf_for_cost (loop_vinfo);
3306 if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
3308 *peel_iters_epilogue = assumed_vf / 2;
3309 if (dump_enabled_p ())
3310 dump_printf_loc (MSG_NOTE, vect_location,
3311 "cost model: epilogue peel iters set to vf/2 "
3312 "because loop iterations are unknown .\n");
3314 /* If peeled iterations are known but number of scalar loop
3315 iterations are unknown, count a taken branch per peeled loop. */
3316 retval = record_stmt_cost (prologue_cost_vec, 1, cond_branch_taken,
3317 NULL, 0, vect_prologue);
3318 retval = record_stmt_cost (prologue_cost_vec, 1, cond_branch_taken,
3319 NULL, 0, vect_epilogue);
3321 else
3323 int niters = LOOP_VINFO_INT_NITERS (loop_vinfo);
3324 peel_iters_prologue = niters < peel_iters_prologue ?
3325 niters : peel_iters_prologue;
3326 *peel_iters_epilogue = (niters - peel_iters_prologue) % assumed_vf;
3327 /* If we need to peel for gaps but the computed epilogue peel count is
3328 zero, we have to peel VF iterations. */
3329 if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo) && !*peel_iters_epilogue)
3330 *peel_iters_epilogue = assumed_vf;
3333 stmt_info_for_cost *si;
3334 int j;
3335 if (peel_iters_prologue)
3336 FOR_EACH_VEC_ELT (*scalar_cost_vec, j, si)
3337 retval += record_stmt_cost (prologue_cost_vec,
3338 si->count * peel_iters_prologue,
3339 si->kind, si->stmt_info, si->misalign,
3340 vect_prologue);
3341 if (*peel_iters_epilogue)
3342 FOR_EACH_VEC_ELT (*scalar_cost_vec, j, si)
3343 retval += record_stmt_cost (epilogue_cost_vec,
3344 si->count * *peel_iters_epilogue,
3345 si->kind, si->stmt_info, si->misalign,
3346 vect_epilogue);
3348 return retval;
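/* For instance (hypothetical values): with a known trip count of 17, an
   assumed VF of 4 and a prologue peel of 3 iterations, the epilogue peel
   count computed above is (17 - 3) % 4 = 2, and each scalar statement cost
   is charged 3 times to the prologue and 2 times to the epilogue.  */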
3351 /* Function vect_estimate_min_profitable_iters
3353 Return the number of iterations required for the vector version of the
3354 loop to be profitable relative to the cost of the scalar version of the
3355 loop.
3357 *RET_MIN_PROFITABLE_NITERS is a cost model profitability threshold
3358 of iterations for vectorization. A value of -1 means loop vectorization
3359 is not profitable. This returned value may be used for a dynamic
3360 profitability check.
3362 *RET_MIN_PROFITABLE_ESTIMATE is a profitability threshold to be used
3363 for static check against estimated number of iterations. */
3365 static void
3366 vect_estimate_min_profitable_iters (loop_vec_info loop_vinfo,
3367 int *ret_min_profitable_niters,
3368 int *ret_min_profitable_estimate)
3370 int min_profitable_iters;
3371 int min_profitable_estimate;
3372 int peel_iters_prologue;
3373 int peel_iters_epilogue;
3374 unsigned vec_inside_cost = 0;
3375 int vec_outside_cost = 0;
3376 unsigned vec_prologue_cost = 0;
3377 unsigned vec_epilogue_cost = 0;
3378 int scalar_single_iter_cost = 0;
3379 int scalar_outside_cost = 0;
3380 int assumed_vf = vect_vf_for_cost (loop_vinfo);
3381 int npeel = LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo);
3382 void *target_cost_data = LOOP_VINFO_TARGET_COST_DATA (loop_vinfo);
3384 /* Cost model disabled. */
3385 if (unlimited_cost_model (LOOP_VINFO_LOOP (loop_vinfo)))
3387 dump_printf_loc (MSG_NOTE, vect_location, "cost model disabled.\n");
3388 *ret_min_profitable_niters = 0;
3389 *ret_min_profitable_estimate = 0;
3390 return;
3393 /* Requires loop versioning tests to handle misalignment. */
3394 if (LOOP_REQUIRES_VERSIONING_FOR_ALIGNMENT (loop_vinfo))
3396 /* FIXME: Make cost depend on complexity of individual check. */
3397 unsigned len = LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).length ();
3398 (void) add_stmt_cost (target_cost_data, len, vector_stmt, NULL, 0,
3399 vect_prologue);
3400 dump_printf (MSG_NOTE,
3401 "cost model: Adding cost of checks for loop "
3402 "versioning to treat misalignment.\n");
3405 /* Requires loop versioning with alias checks. */
3406 if (LOOP_REQUIRES_VERSIONING_FOR_ALIAS (loop_vinfo))
3408 /* FIXME: Make cost depend on complexity of individual check. */
3409 unsigned len = LOOP_VINFO_COMP_ALIAS_DDRS (loop_vinfo).length ();
3410 (void) add_stmt_cost (target_cost_data, len, vector_stmt, NULL, 0,
3411 vect_prologue);
3412 len = LOOP_VINFO_CHECK_UNEQUAL_ADDRS (loop_vinfo).length ();
3413 if (len)
3414 /* Count LEN - 1 ANDs and LEN comparisons. */
3415 (void) add_stmt_cost (target_cost_data, len * 2 - 1, scalar_stmt,
3416 NULL, 0, vect_prologue);
3417 len = LOOP_VINFO_LOWER_BOUNDS (loop_vinfo).length ();
3418 if (len)
3420 /* Count LEN - 1 ANDs and LEN comparisons. */
3421 unsigned int nstmts = len * 2 - 1;
3422 /* +1 for each bias that needs adding. */
3423 for (unsigned int i = 0; i < len; ++i)
3424 if (!LOOP_VINFO_LOWER_BOUNDS (loop_vinfo)[i].unsigned_p)
3425 nstmts += 1;
3426 (void) add_stmt_cost (target_cost_data, nstmts, scalar_stmt,
3427 NULL, 0, vect_prologue);
3429 dump_printf (MSG_NOTE,
3430 "cost model: Adding cost of checks for loop "
3431 "versioning aliasing.\n");
3434 /* Requires loop versioning with niter checks. */
3435 if (LOOP_REQUIRES_VERSIONING_FOR_NITERS (loop_vinfo))
3437 /* FIXME: Make cost depend on complexity of individual check. */
3438 (void) add_stmt_cost (target_cost_data, 1, vector_stmt, NULL, 0,
3439 vect_prologue);
3440 dump_printf (MSG_NOTE,
3441 "cost model: Adding cost of checks for loop "
3442 "versioning niters.\n");
3445 if (LOOP_REQUIRES_VERSIONING (loop_vinfo))
3446 (void) add_stmt_cost (target_cost_data, 1, cond_branch_taken, NULL, 0,
3447 vect_prologue);
3449 /* Count statements in scalar loop. Using this as scalar cost for a single
3450 iteration for now.
3452 TODO: Add outer loop support.
3454 TODO: Consider assigning different costs to different scalar
3455 statements. */
3457 scalar_single_iter_cost
3458 = LOOP_VINFO_SINGLE_SCALAR_ITERATION_COST (loop_vinfo);
3460 /* Add additional cost for the peeled instructions in prologue and epilogue
3461 loop. (For fully-masked loops there will be no peeling.)
3463 FORNOW: If we don't know the value of peel_iters for prologue or epilogue
3464 at compile-time - we assume it's vf/2 (the worst would be vf-1).
3466 TODO: Build an expression that represents peel_iters for prologue and
3467 epilogue to be used in a run-time test. */
3469 if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
3471 peel_iters_prologue = 0;
3472 peel_iters_epilogue = 0;
3474 if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo))
3476 /* We need to peel exactly one iteration. */
3477 peel_iters_epilogue += 1;
3478 stmt_info_for_cost *si;
3479 int j;
3480 FOR_EACH_VEC_ELT (LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo),
3481 j, si)
3482 (void) add_stmt_cost (target_cost_data, si->count,
3483 si->kind, si->stmt_info, si->misalign,
3484 vect_epilogue);
3487 else if (npeel < 0)
3489 peel_iters_prologue = assumed_vf / 2;
3490 dump_printf (MSG_NOTE, "cost model: "
3491 "prologue peel iters set to vf/2.\n");
3493 /* If peeling for alignment is unknown, the loop bound of the main loop
3494 becomes unknown. */
3495 peel_iters_epilogue = assumed_vf / 2;
3496 dump_printf (MSG_NOTE, "cost model: "
3497 "epilogue peel iters set to vf/2 because "
3498 "peeling for alignment is unknown.\n");
3500 /* If peeled iterations are unknown, count a taken branch and a not taken
3501 branch per peeled loop. Even if scalar loop iterations are known,
3502 vector iterations are not known since peeled prologue iterations are
3503 not known. Hence guards remain the same. */
3504 (void) add_stmt_cost (target_cost_data, 1, cond_branch_taken,
3505 NULL, 0, vect_prologue);
3506 (void) add_stmt_cost (target_cost_data, 1, cond_branch_not_taken,
3507 NULL, 0, vect_prologue);
3508 (void) add_stmt_cost (target_cost_data, 1, cond_branch_taken,
3509 NULL, 0, vect_epilogue);
3510 (void) add_stmt_cost (target_cost_data, 1, cond_branch_not_taken,
3511 NULL, 0, vect_epilogue);
3512 stmt_info_for_cost *si;
3513 int j;
3514 FOR_EACH_VEC_ELT (LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo), j, si)
3516 (void) add_stmt_cost (target_cost_data,
3517 si->count * peel_iters_prologue,
3518 si->kind, si->stmt_info, si->misalign,
3519 vect_prologue);
3520 (void) add_stmt_cost (target_cost_data,
3521 si->count * peel_iters_epilogue,
3522 si->kind, si->stmt_info, si->misalign,
3523 vect_epilogue);
3526 else
3528 stmt_vector_for_cost prologue_cost_vec, epilogue_cost_vec;
3529 stmt_info_for_cost *si;
3530 int j;
3531 void *data = LOOP_VINFO_TARGET_COST_DATA (loop_vinfo);
3533 prologue_cost_vec.create (2);
3534 epilogue_cost_vec.create (2);
3535 peel_iters_prologue = npeel;
3537 (void) vect_get_known_peeling_cost (loop_vinfo, peel_iters_prologue,
3538 &peel_iters_epilogue,
3539 &LOOP_VINFO_SCALAR_ITERATION_COST
3540 (loop_vinfo),
3541 &prologue_cost_vec,
3542 &epilogue_cost_vec);
3544 FOR_EACH_VEC_ELT (prologue_cost_vec, j, si)
3545 (void) add_stmt_cost (data, si->count, si->kind, si->stmt_info,
3546 si->misalign, vect_prologue);
3548 FOR_EACH_VEC_ELT (epilogue_cost_vec, j, si)
3549 (void) add_stmt_cost (data, si->count, si->kind, si->stmt_info,
3550 si->misalign, vect_epilogue);
3552 prologue_cost_vec.release ();
3553 epilogue_cost_vec.release ();
3556 /* FORNOW: The scalar outside cost is incremented in one of the
3557 following ways:
3559 1. The vectorizer checks for alignment and aliasing and generates
3560 a condition that allows dynamic vectorization. A cost model
3561 check is ANDED with the versioning condition. Hence scalar code
3562 path now has the added cost of the versioning check.
3564 if (cost > th & versioning_check)
3565 jmp to vector code
3567 Hence run-time scalar is incremented by not-taken branch cost.
3569 2. The vectorizer then checks if a prologue is required. If the
3570 cost model check was not done before during versioning, it has to
3571 be done before the prologue check.
3573 if (cost <= th)
3574 prologue = scalar_iters
3575 if (prologue == 0)
3576 jmp to vector code
3577 else
3578 execute prologue
3579 if (prologue == num_iters)
3580 go to exit
3582 Hence the run-time scalar cost is incremented by a taken branch,
3583 plus a not-taken branch, plus a taken branch cost.
3585 3. The vectorizer then checks if an epilogue is required. If the
3586 cost model check was not done before during prologue check, it
3587 has to be done with the epilogue check.
3589 if (prologue == 0)
3590 jmp to vector code
3591 else
3592 execute prologue
3593 if (prologue == num_iters)
3594 go to exit
3595 vector code:
3596 if ((cost <= th) | (scalar_iters-prologue-epilogue == 0))
3597 jmp to epilogue
3599 Hence the run-time scalar cost should be incremented by 2 taken
3600 branches.
3602 TODO: The back end may reorder the BBS's differently and reverse
3603 conditions/branch directions. Change the estimates below to
3604 something more reasonable. */
3606 /* If the number of iterations is known and we do not do versioning, we can
3607 decide whether to vectorize at compile time. Hence the scalar version
3608 does not carry cost model guard costs. */
3609 if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
3610 || LOOP_REQUIRES_VERSIONING (loop_vinfo))
3612 /* Cost model check occurs at versioning. */
3613 if (LOOP_REQUIRES_VERSIONING (loop_vinfo))
3614 scalar_outside_cost += vect_get_stmt_cost (cond_branch_not_taken);
3615 else
3617 /* Cost model check occurs at prologue generation. */
3618 if (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) < 0)
3619 scalar_outside_cost += 2 * vect_get_stmt_cost (cond_branch_taken)
3620 + vect_get_stmt_cost (cond_branch_not_taken);
3621 /* Cost model check occurs at epilogue generation. */
3622 else
3623 scalar_outside_cost += 2 * vect_get_stmt_cost (cond_branch_taken);
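  /* To make the cases above concrete (illustrative only): with versioning
     the scalar path is charged one not-taken branch; without versioning but
     with an unknown alignment peel it is charged two taken plus one
     not-taken branch; otherwise it is charged two taken branches for the
     check done at epilogue generation.  */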
3627 /* Complete the target-specific cost calculations. */
3628 finish_cost (LOOP_VINFO_TARGET_COST_DATA (loop_vinfo), &vec_prologue_cost,
3629 &vec_inside_cost, &vec_epilogue_cost);
3631 vec_outside_cost = (int)(vec_prologue_cost + vec_epilogue_cost);
3633 if (dump_enabled_p ())
3635 dump_printf_loc (MSG_NOTE, vect_location, "Cost model analysis: \n");
3636 dump_printf (MSG_NOTE, " Vector inside of loop cost: %d\n",
3637 vec_inside_cost);
3638 dump_printf (MSG_NOTE, " Vector prologue cost: %d\n",
3639 vec_prologue_cost);
3640 dump_printf (MSG_NOTE, " Vector epilogue cost: %d\n",
3641 vec_epilogue_cost);
3642 dump_printf (MSG_NOTE, " Scalar iteration cost: %d\n",
3643 scalar_single_iter_cost);
3644 dump_printf (MSG_NOTE, " Scalar outside cost: %d\n",
3645 scalar_outside_cost);
3646 dump_printf (MSG_NOTE, " Vector outside cost: %d\n",
3647 vec_outside_cost);
3648 dump_printf (MSG_NOTE, " prologue iterations: %d\n",
3649 peel_iters_prologue);
3650 dump_printf (MSG_NOTE, " epilogue iterations: %d\n",
3651 peel_iters_epilogue);
3654 /* Calculate number of iterations required to make the vector version
3655 profitable, relative to the loop bodies only. The following condition
3656 must hold true:
3657 SIC * niters + SOC > VIC * ((niters-PL_ITERS-EP_ITERS)/VF) + VOC
3658 where
3659 SIC = scalar iteration cost, VIC = vector iteration cost,
3660 VOC = vector outside cost, VF = vectorization factor,
3661 PL_ITERS = prologue iterations, EP_ITERS= epilogue iterations
3662 SOC = scalar outside cost for run time cost model check. */
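  /* A worked example with hypothetical costs (not from any target): with
     SIC = 4, VIC = 6, VOC = 20, SOC = 2, VF = 4 and no peeling, the
     condition becomes 4 * niters + 2 > 6 * niters / 4 + 20, i.e.
     niters > 7.2, so the computation below yields a threshold of 8
     iterations.  */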
3664 if ((scalar_single_iter_cost * assumed_vf) > (int) vec_inside_cost)
3666 min_profitable_iters = ((vec_outside_cost - scalar_outside_cost)
3667 * assumed_vf
3668 - vec_inside_cost * peel_iters_prologue
3669 - vec_inside_cost * peel_iters_epilogue);
3670 if (min_profitable_iters <= 0)
3671 min_profitable_iters = 0;
3672 else
3674 min_profitable_iters /= ((scalar_single_iter_cost * assumed_vf)
3675 - vec_inside_cost);
3677 if ((scalar_single_iter_cost * assumed_vf * min_profitable_iters)
3678 <= (((int) vec_inside_cost * min_profitable_iters)
3679 + (((int) vec_outside_cost - scalar_outside_cost)
3680 * assumed_vf)))
3681 min_profitable_iters++;
3684 /* vector version will never be profitable. */
3685 else
3687 if (LOOP_VINFO_LOOP (loop_vinfo)->force_vectorize)
3688 warning_at (vect_location.get_location_t (), OPT_Wopenmp_simd,
3689 "vectorization did not happen for a simd loop");
3691 if (dump_enabled_p ())
3692 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3693 "cost model: the vector iteration cost = %d "
3694 "divided by the scalar iteration cost = %d "
3695 "is greater or equal to the vectorization factor = %d"
3696 ".\n",
3697 vec_inside_cost, scalar_single_iter_cost, assumed_vf);
3698 *ret_min_profitable_niters = -1;
3699 *ret_min_profitable_estimate = -1;
3700 return;
3703 dump_printf (MSG_NOTE,
3704 " Calculated minimum iters for profitability: %d\n",
3705 min_profitable_iters);
3707 if (!LOOP_VINFO_FULLY_MASKED_P (loop_vinfo)
3708 && min_profitable_iters < (assumed_vf + peel_iters_prologue))
3709 /* We want the vectorized loop to execute at least once. */
3710 min_profitable_iters = assumed_vf + peel_iters_prologue;
3712 if (dump_enabled_p ())
3713 dump_printf_loc (MSG_NOTE, vect_location,
3714 " Runtime profitability threshold = %d\n",
3715 min_profitable_iters);
3717 *ret_min_profitable_niters = min_profitable_iters;
3719 /* Calculate number of iterations required to make the vector version
3720 profitable, relative to the loop bodies only.
3722 The non-vectorized variant costs SIC * niters, and the vector variant
3723 must beat it at the expected loop trip count, i.e. the following condition must hold true:
3724 SIC * niters > VIC * ((niters-PL_ITERS-EP_ITERS)/VF) + VOC + SOC */
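   /* Continuing the illustrative numbers used above (SIC = 4, VIC = 8,
      VF = 4, VOC = 40, SOC = 8, no peeling): because SOC now sits on the
      vector side of the inequality, the static estimate computed below is
	(VOC + SOC) * VF / (SIC * VF - VIC) = 48 * 4 / 8 = 24,
      which is then clamped to be no smaller than MIN_PROFITABLE_ITERS.  */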
3726 if (vec_outside_cost <= 0)
3727 min_profitable_estimate = 0;
3728 else
3730 min_profitable_estimate = ((vec_outside_cost + scalar_outside_cost)
3731 * assumed_vf
3732 - vec_inside_cost * peel_iters_prologue
3733 - vec_inside_cost * peel_iters_epilogue)
3734 / ((scalar_single_iter_cost * assumed_vf)
3735 - vec_inside_cost);
3737 min_profitable_estimate = MAX (min_profitable_estimate, min_profitable_iters);
3738 if (dump_enabled_p ())
3739 dump_printf_loc (MSG_NOTE, vect_location,
3740 " Static estimate profitability threshold = %d\n",
3741 min_profitable_estimate);
3743 *ret_min_profitable_estimate = min_profitable_estimate;
3746 /* Writes into SEL a mask for a vec_perm, equivalent to a vec_shr by OFFSET
3747 vector elements (not bits) for a vector with NELT elements. */
3748 static void
3749 calc_vec_perm_mask_for_shift (unsigned int offset, unsigned int nelt,
3750 vec_perm_builder *sel)
3752 /* The encoding is a single stepped pattern. Any wrap-around is handled
3753 by vec_perm_indices. */
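  /* For example, with OFFSET == 2 and NELT == 8 the three encoded elements
     are {2, 3, 4}; vec_perm_indices extends this stepped pattern to the
     full mask {2, 3, 4, 5, 6, 7, 8, 9}, where the out-of-range indices 8
     and 9 select from the second input of the VEC_PERM_EXPR (a zero vector
     in the reduction epilogue below) and so shift in zeros.  */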
3754 sel->new_vector (nelt, 1, 3);
3755 for (unsigned int i = 0; i < 3; i++)
3756 sel->quick_push (i + offset);
3759 /* Checks whether the target supports whole-vector shifts for vectors of mode
3760 MODE. This is the case if _either_ the platform handles vec_shr_optab, _or_
3761 it supports vec_perm_const with masks for all necessary shift amounts. */
3762 static bool
3763 have_whole_vector_shift (machine_mode mode)
3765 if (optab_handler (vec_shr_optab, mode) != CODE_FOR_nothing)
3766 return true;
3768 /* Variable-length vectors should be handled via the optab. */
3769 unsigned int nelt;
3770 if (!GET_MODE_NUNITS (mode).is_constant (&nelt))
3771 return false;
3773 vec_perm_builder sel;
3774 vec_perm_indices indices;
3775 for (unsigned int i = nelt / 2; i >= 1; i /= 2)
3777 calc_vec_perm_mask_for_shift (i, nelt, &sel);
3778 indices.new_vector (sel, 2, nelt);
3779 if (!can_vec_perm_const_p (mode, indices, false))
3780 return false;
3782 return true;
3785 /* TODO: Close dependency between vect_model_*_cost and vectorizable_*
3786 functions. Design better to avoid maintenance issues. */
3788 /* Function vect_model_reduction_cost.
3790 Models cost for a reduction operation, including the vector ops
3791 generated within the strip-mine loop, the initial definition before
3792 the loop, and the epilogue code that must be generated. */
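   As a worked example of what the code below records, a COND_REDUCTION
   with NCOPIES == 1 and a supported REDUC_FN is costed as:
     prologue: 4 scalar_to_vec (initial index, step, initial data result,
	       initial index result);
     body:     2 vector_stmt (NCOPIES is doubled because the loop carries
	       two reductions);
     epilogue: 2 vector_stmt + 2 vec_to_scalar + 1 scalar_to_vec.  */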
3794 static void
3795 vect_model_reduction_cost (stmt_vec_info stmt_info, internal_fn reduc_fn,
3796 int ncopies, stmt_vector_for_cost *cost_vec)
3798 int prologue_cost = 0, epilogue_cost = 0, inside_cost;
3799 enum tree_code code;
3800 optab optab;
3801 tree vectype;
3802 machine_mode mode;
3803 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
3804 struct loop *loop = NULL;
3806 if (loop_vinfo)
3807 loop = LOOP_VINFO_LOOP (loop_vinfo);
3809 /* Condition reductions generate two reductions in the loop. */
3810 vect_reduction_type reduction_type
3811 = STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info);
3812 if (reduction_type == COND_REDUCTION)
3813 ncopies *= 2;
3815 vectype = STMT_VINFO_VECTYPE (stmt_info);
3816 mode = TYPE_MODE (vectype);
3817 stmt_vec_info orig_stmt_info = STMT_VINFO_RELATED_STMT (stmt_info);
3819 if (!orig_stmt_info)
3820 orig_stmt_info = stmt_info;
3822 code = gimple_assign_rhs_code (orig_stmt_info->stmt);
3824 if (reduction_type == EXTRACT_LAST_REDUCTION
3825 || reduction_type == FOLD_LEFT_REDUCTION)
3827 /* No extra instructions needed in the prologue. */
3828 prologue_cost = 0;
3830 if (reduction_type == EXTRACT_LAST_REDUCTION || reduc_fn != IFN_LAST)
3831 /* Count one reduction-like operation per vector. */
3832 inside_cost = record_stmt_cost (cost_vec, ncopies, vec_to_scalar,
3833 stmt_info, 0, vect_body);
3834 else
3836 /* Use NELEMENTS extracts and NELEMENTS scalar ops. */
3837 unsigned int nelements = ncopies * vect_nunits_for_cost (vectype);
3838 inside_cost = record_stmt_cost (cost_vec, nelements,
3839 vec_to_scalar, stmt_info, 0,
3840 vect_body);
3841 inside_cost += record_stmt_cost (cost_vec, nelements,
3842 scalar_stmt, stmt_info, 0,
3843 vect_body);
3846 else
3848 /* Add in cost for initial definition.
3849 For cond reduction we have four vectors: initial index, step,
3850 initial result of the data reduction, initial value of the index
3851 reduction. */
3852 int prologue_stmts = reduction_type == COND_REDUCTION ? 4 : 1;
3853 prologue_cost += record_stmt_cost (cost_vec, prologue_stmts,
3854 scalar_to_vec, stmt_info, 0,
3855 vect_prologue);
3857 /* Cost of reduction op inside loop. */
3858 inside_cost = record_stmt_cost (cost_vec, ncopies, vector_stmt,
3859 stmt_info, 0, vect_body);
3862 /* Determine cost of epilogue code.
3864 We have a reduction operator that will reduce the vector in one statement.
3865 Also requires scalar extract. */
3867 if (!loop || !nested_in_vect_loop_p (loop, orig_stmt_info))
3869 if (reduc_fn != IFN_LAST)
3871 if (reduction_type == COND_REDUCTION)
3873 /* An EQ stmt and a COND_EXPR stmt. */
3874 epilogue_cost += record_stmt_cost (cost_vec, 2,
3875 vector_stmt, stmt_info, 0,
3876 vect_epilogue);
3877 /* Reduction of the max index and a reduction of the found
3878 values. */
3879 epilogue_cost += record_stmt_cost (cost_vec, 2,
3880 vec_to_scalar, stmt_info, 0,
3881 vect_epilogue);
3882 /* A broadcast of the max value. */
3883 epilogue_cost += record_stmt_cost (cost_vec, 1,
3884 scalar_to_vec, stmt_info, 0,
3885 vect_epilogue);
3887 else
3889 epilogue_cost += record_stmt_cost (cost_vec, 1, vector_stmt,
3890 stmt_info, 0, vect_epilogue);
3891 epilogue_cost += record_stmt_cost (cost_vec, 1,
3892 vec_to_scalar, stmt_info, 0,
3893 vect_epilogue);
3896 else if (reduction_type == COND_REDUCTION)
3898 unsigned estimated_nunits = vect_nunits_for_cost (vectype);
3899 /* Extraction of scalar elements. */
3900 epilogue_cost += record_stmt_cost (cost_vec,
3901 2 * estimated_nunits,
3902 vec_to_scalar, stmt_info, 0,
3903 vect_epilogue);
3904 /* Scalar max reductions via COND_EXPR / MAX_EXPR. */
3905 epilogue_cost += record_stmt_cost (cost_vec,
3906 2 * estimated_nunits - 3,
3907 scalar_stmt, stmt_info, 0,
3908 vect_epilogue);
3910 else if (reduction_type == EXTRACT_LAST_REDUCTION
3911 || reduction_type == FOLD_LEFT_REDUCTION)
3912 /* No extra instructions needed in the epilogue. */
3914 else
3916 int vec_size_in_bits = tree_to_uhwi (TYPE_SIZE (vectype));
3917 tree bitsize =
3918 TYPE_SIZE (TREE_TYPE (gimple_assign_lhs (orig_stmt_info->stmt)));
3919 int element_bitsize = tree_to_uhwi (bitsize);
3920 int nelements = vec_size_in_bits / element_bitsize;
3922 if (code == COND_EXPR)
3923 code = MAX_EXPR;
3925 optab = optab_for_tree_code (code, vectype, optab_default);
3927 /* We have a whole vector shift available. */
3928 if (optab != unknown_optab
3929 && VECTOR_MODE_P (mode)
3930 && optab_handler (optab, mode) != CODE_FOR_nothing
3931 && have_whole_vector_shift (mode))
3933 /* Final reduction via vector shifts and the reduction operator.
3934 Also requires scalar extract. */
3935 epilogue_cost += record_stmt_cost (cost_vec,
3936 exact_log2 (nelements) * 2,
3937 vector_stmt, stmt_info, 0,
3938 vect_epilogue);
3939 epilogue_cost += record_stmt_cost (cost_vec, 1,
3940 vec_to_scalar, stmt_info, 0,
3941 vect_epilogue);
3943 else
3944 /* Use extracts and reduction op for final reduction. For N
3945 elements, we have N extracts and N-1 reduction ops. */
3946 epilogue_cost += record_stmt_cost (cost_vec,
3947 nelements + nelements - 1,
3948 vector_stmt, stmt_info, 0,
3949 vect_epilogue);
3953 if (dump_enabled_p ())
3954 dump_printf (MSG_NOTE,
3955 "vect_model_reduction_cost: inside_cost = %d, "
3956 "prologue_cost = %d, epilogue_cost = %d .\n", inside_cost,
3957 prologue_cost, epilogue_cost);
3961 /* Function vect_model_induction_cost.
3963 Models cost for induction operations. */
3965 static void
3966 vect_model_induction_cost (stmt_vec_info stmt_info, int ncopies,
3967 stmt_vector_for_cost *cost_vec)
3969 unsigned inside_cost, prologue_cost;
3971 if (PURE_SLP_STMT (stmt_info))
3972 return;
3974 /* loop cost for vec_loop. */
3975 inside_cost = record_stmt_cost (cost_vec, ncopies, vector_stmt,
3976 stmt_info, 0, vect_body);
3978 /* prologue cost for vec_init and vec_step. */
3979 prologue_cost = record_stmt_cost (cost_vec, 2, scalar_to_vec,
3980 stmt_info, 0, vect_prologue);
3982 if (dump_enabled_p ())
3983 dump_printf_loc (MSG_NOTE, vect_location,
3984 "vect_model_induction_cost: inside_cost = %d, "
3985 "prologue_cost = %d .\n", inside_cost, prologue_cost);
3990 /* Function get_initial_def_for_reduction
3992 Input:
3993 STMT_VINFO - a stmt that performs a reduction operation in the loop.
3994 INIT_VAL - the initial value of the reduction variable
3996 Output:
3997 ADJUSTMENT_DEF - a tree that holds a value to be added to the final result
3998 of the reduction (used for adjusting the epilog - see below).
3999 Return a vector variable, initialized according to the operation that
4000 STMT_VINFO performs. This vector will be used as the initial value
4001 of the vector of partial results.
4003 Option1 (adjust in epilog): Initialize the vector as follows:
4004 add/bit or/xor: [0,0,...,0,0]
4005 mult/bit and: [1,1,...,1,1]
4006 min/max/cond_expr: [init_val,init_val,..,init_val,init_val]
4007 and when necessary (e.g. add/mult case) let the caller know
4008 that it needs to adjust the result by init_val.
4010 Option2: Initialize the vector as follows:
4011 add/bit or/xor: [init_val,0,0,...,0]
4012 mult/bit and: [init_val,1,1,...,1]
4013 min/max/cond_expr: [init_val,init_val,...,init_val]
4014 and no adjustments are needed.
4016 For example, for the following code:
4018 s = init_val;
4019 for (i=0;i<n;i++)
4020 s = s + a[i];
4022 STMT_VINFO is 's = s + a[i]', and the reduction variable is 's'.
4023 For a vector of 4 units, we want to return either [init_val,0,0,0],
4024 or [0,0,0,0] and let the caller know that it needs to adjust
4025 the result at the end by 'init_val'.
4027 FORNOW, we use the 'adjust in epilog' scheme (Option1) when ADJUSTMENT_DEF
4028 is not NULL, because this way the initialization vector is simpler (same
4029 element in all entries), and Option2 otherwise.
4031 A cost model should help decide between these two schemes. */
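   As a further illustration of the two schemes: for a MULT_EXPR reduction
   with init_val == 6 and a 4-element vector, Option1 returns {1,1,1,1} and
   passes 6 back through *ADJUSTMENT_DEF, whereas Option2 would return
   {6,1,1,1} with no adjustment needed.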
4033 tree
4034 get_initial_def_for_reduction (stmt_vec_info stmt_vinfo, tree init_val,
4035 tree *adjustment_def)
4037 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_vinfo);
4038 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
4039 tree scalar_type = TREE_TYPE (init_val);
4040 tree vectype = get_vectype_for_scalar_type (scalar_type);
4041 enum tree_code code = gimple_assign_rhs_code (stmt_vinfo->stmt);
4042 tree def_for_init;
4043 tree init_def;
4044 REAL_VALUE_TYPE real_init_val = dconst0;
4045 int int_init_val = 0;
4046 gimple_seq stmts = NULL;
4048 gcc_assert (vectype);
4050 gcc_assert (POINTER_TYPE_P (scalar_type) || INTEGRAL_TYPE_P (scalar_type)
4051 || SCALAR_FLOAT_TYPE_P (scalar_type));
4053 gcc_assert (nested_in_vect_loop_p (loop, stmt_vinfo)
4054 || loop == (gimple_bb (stmt_vinfo->stmt))->loop_father);
4056 vect_reduction_type reduction_type
4057 = STMT_VINFO_VEC_REDUCTION_TYPE (stmt_vinfo);
4059 switch (code)
4061 case WIDEN_SUM_EXPR:
4062 case DOT_PROD_EXPR:
4063 case SAD_EXPR:
4064 case PLUS_EXPR:
4065 case MINUS_EXPR:
4066 case BIT_IOR_EXPR:
4067 case BIT_XOR_EXPR:
4068 case MULT_EXPR:
4069 case BIT_AND_EXPR:
4071 /* ADJUSTMENT_DEF is NULL when called from
4072 vect_create_epilog_for_reduction to vectorize double reduction. */
4073 if (adjustment_def)
4074 *adjustment_def = init_val;
4076 if (code == MULT_EXPR)
4078 real_init_val = dconst1;
4079 int_init_val = 1;
4082 if (code == BIT_AND_EXPR)
4083 int_init_val = -1;
4085 if (SCALAR_FLOAT_TYPE_P (scalar_type))
4086 def_for_init = build_real (scalar_type, real_init_val);
4087 else
4088 def_for_init = build_int_cst (scalar_type, int_init_val);
4090 if (adjustment_def)
4091 /* Option1: the first element is '0' or '1' as well. */
4092 init_def = gimple_build_vector_from_val (&stmts, vectype,
4093 def_for_init);
4094 else if (!TYPE_VECTOR_SUBPARTS (vectype).is_constant ())
4096 /* Option2 (variable length): the first element is INIT_VAL. */
4097 init_def = gimple_build_vector_from_val (&stmts, vectype,
4098 def_for_init);
4099 init_def = gimple_build (&stmts, CFN_VEC_SHL_INSERT,
4100 vectype, init_def, init_val);
4102 else
4104 /* Option2: the first element is INIT_VAL. */
4105 tree_vector_builder elts (vectype, 1, 2);
4106 elts.quick_push (init_val);
4107 elts.quick_push (def_for_init);
4108 init_def = gimple_build_vector (&stmts, &elts);
4111 break;
4113 case MIN_EXPR:
4114 case MAX_EXPR:
4115 case COND_EXPR:
4117 if (adjustment_def)
4119 *adjustment_def = NULL_TREE;
4120 if (reduction_type != COND_REDUCTION
4121 && reduction_type != EXTRACT_LAST_REDUCTION)
4123 init_def = vect_get_vec_def_for_operand (init_val, stmt_vinfo);
4124 break;
4127 init_val = gimple_convert (&stmts, TREE_TYPE (vectype), init_val);
4128 init_def = gimple_build_vector_from_val (&stmts, vectype, init_val);
4130 break;
4132 default:
4133 gcc_unreachable ();
4136 if (stmts)
4137 gsi_insert_seq_on_edge_immediate (loop_preheader_edge (loop), stmts);
4138 return init_def;
4141 /* Get at the initial defs for the reduction PHIs in SLP_NODE.
4142 NUMBER_OF_VECTORS is the number of vector defs to create.
4143 If NEUTRAL_OP is nonnull, introducing extra elements of that
4144 value will not change the result. */
4146 static void
4147 get_initial_defs_for_reduction (slp_tree slp_node,
4148 vec<tree> *vec_oprnds,
4149 unsigned int number_of_vectors,
4150 bool reduc_chain, tree neutral_op)
4152 vec<stmt_vec_info> stmts = SLP_TREE_SCALAR_STMTS (slp_node);
4153 stmt_vec_info stmt_vinfo = stmts[0];
4154 unsigned HOST_WIDE_INT nunits;
4155 unsigned j, number_of_places_left_in_vector;
4156 tree vector_type;
4157 tree vop;
4158 int group_size = stmts.length ();
4159 unsigned int vec_num, i;
4160 unsigned number_of_copies = 1;
4161 vec<tree> voprnds;
4162 voprnds.create (number_of_vectors);
4163 struct loop *loop;
4164 auto_vec<tree, 16> permute_results;
4166 vector_type = STMT_VINFO_VECTYPE (stmt_vinfo);
4168 gcc_assert (STMT_VINFO_DEF_TYPE (stmt_vinfo) == vect_reduction_def);
4170 loop = (gimple_bb (stmt_vinfo->stmt))->loop_father;
4171 gcc_assert (loop);
4172 edge pe = loop_preheader_edge (loop);
4174 gcc_assert (!reduc_chain || neutral_op);
4176 /* NUMBER_OF_COPIES is the number of times we need to use the same values in
4177 created vectors. It is greater than 1 if unrolling is performed.
4179 For example, we have two scalar operands, s1 and s2 (e.g., group of
4180 strided accesses of size two), while NUNITS is four (i.e., four scalars
4181 of this type can be packed in a vector). The output vector will contain
4182 two copies of each scalar operand: {s1, s2, s1, s2}. (NUMBER_OF_COPIES
4183 will be 2).
4185 If REDUC_GROUP_SIZE > NUNITS, the scalars will be split into several
4186 vectors containing the operands.
4188 For example, NUNITS is four as before, and the group size is 8
4189 (s1, s2, ..., s8). We will create two vectors {s1, s2, s3, s4} and
4190 {s5, s6, s7, s8}. */
4192 if (!TYPE_VECTOR_SUBPARTS (vector_type).is_constant (&nunits))
4193 nunits = group_size;
4195 number_of_copies = nunits * number_of_vectors / group_size;
4197 number_of_places_left_in_vector = nunits;
4198 bool constant_p = true;
4199 tree_vector_builder elts (vector_type, nunits, 1);
4200 elts.quick_grow (nunits);
4201 for (j = 0; j < number_of_copies; j++)
4203 for (i = group_size - 1; stmts.iterate (i, &stmt_vinfo); i--)
4205 tree op;
4206 /* Get the def before the loop. In reduction chain we have only
4207 one initial value. */
4208 if ((j != (number_of_copies - 1)
4209 || (reduc_chain && i != 0))
4210 && neutral_op)
4211 op = neutral_op;
4212 else
4213 op = PHI_ARG_DEF_FROM_EDGE (stmt_vinfo->stmt, pe);
4215 /* Create 'vect_ = {op0,op1,...,opn}'. */
4216 number_of_places_left_in_vector--;
4217 elts[number_of_places_left_in_vector] = op;
4218 if (!CONSTANT_CLASS_P (op))
4219 constant_p = false;
4221 if (number_of_places_left_in_vector == 0)
4223 gimple_seq ctor_seq = NULL;
4224 tree init;
4225 if (constant_p && !neutral_op
4226 ? multiple_p (TYPE_VECTOR_SUBPARTS (vector_type), nunits)
4227 : known_eq (TYPE_VECTOR_SUBPARTS (vector_type), nunits))
4228 /* Build the vector directly from ELTS. */
4229 init = gimple_build_vector (&ctor_seq, &elts);
4230 else if (neutral_op)
4232 /* Build a vector of the neutral value and shift the
4233 other elements into place. */
4234 init = gimple_build_vector_from_val (&ctor_seq, vector_type,
4235 neutral_op);
4236 int k = nunits;
4237 while (k > 0 && elts[k - 1] == neutral_op)
4238 k -= 1;
4239 while (k > 0)
4241 k -= 1;
4242 init = gimple_build (&ctor_seq, CFN_VEC_SHL_INSERT,
4243 vector_type, init, elts[k]);
4246 else
4248 /* First time round, duplicate ELTS to fill the
4249 required number of vectors, then cherry pick the
4250 appropriate result for each iteration. */
4251 if (vec_oprnds->is_empty ())
4252 duplicate_and_interleave (&ctor_seq, vector_type, elts,
4253 number_of_vectors,
4254 permute_results);
4255 init = permute_results[number_of_vectors - j - 1];
4257 if (ctor_seq != NULL)
4258 gsi_insert_seq_on_edge_immediate (pe, ctor_seq);
4259 voprnds.quick_push (init);
4261 number_of_places_left_in_vector = nunits;
4262 elts.new_vector (vector_type, nunits, 1);
4263 elts.quick_grow (nunits);
4264 constant_p = true;
4269 /* Since the vectors are created in the reverse order, we should invert
4270 them. */
4271 vec_num = voprnds.length ();
4272 for (j = vec_num; j != 0; j--)
4274 vop = voprnds[j - 1];
4275 vec_oprnds->quick_push (vop);
4278 voprnds.release ();
4280 /* If VF is greater than the unrolling factor needed for the SLP
4281 group of stmts, NUMBER_OF_VECTORS to be created is greater than
4282 NUMBER_OF_SCALARS/NUNITS or NUNITS/NUMBER_OF_SCALARS, and hence we have
4283 to replicate the vectors. */
4284 tree neutral_vec = NULL;
4285 while (number_of_vectors > vec_oprnds->length ())
4287 if (neutral_op)
4289 if (!neutral_vec)
4291 gimple_seq ctor_seq = NULL;
4292 neutral_vec = gimple_build_vector_from_val
4293 (&ctor_seq, vector_type, neutral_op);
4294 if (ctor_seq != NULL)
4295 gsi_insert_seq_on_edge_immediate (pe, ctor_seq);
4297 vec_oprnds->quick_push (neutral_vec);
4299 else
4301 for (i = 0; vec_oprnds->iterate (i, &vop) && i < vec_num; i++)
4302 vec_oprnds->quick_push (vop);
4308 /* Function vect_create_epilog_for_reduction
4310 Create code at the loop-epilog to finalize the result of a reduction
4311 computation.
4313 VECT_DEFS is list of vector of partial results, i.e., the lhs's of vector
4314 reduction statements.
4315 STMT_INFO is the scalar reduction stmt that is being vectorized.
4316 NCOPIES is > 1 in case the vectorization factor (VF) is bigger than the
4317 number of elements that we can fit in a vectype (nunits). In this case
4318 we have to generate more than one vector stmt - i.e - we need to "unroll"
4319 the vector stmt by a factor VF/nunits. For more details see documentation
4320 in vectorizable_operation.
4321 REDUC_FN is the internal function for the epilog reduction.
4322 REDUCTION_PHIS is a list of the phi-nodes that carry the reduction
4323 computation.
4324 REDUC_INDEX is the index of the operand in the right hand side of the
4325 statement that is defined by REDUCTION_PHI.
4326 DOUBLE_REDUC is TRUE if double reduction phi nodes should be handled.
4327 SLP_NODE is an SLP node containing a group of reduction statements. The
4328 first one in this group is STMT_INFO.
4329 INDUC_VAL is for INTEGER_INDUC_COND_REDUCTION the value to use for the case
4330 when the COND_EXPR is never true in the loop. For MAX_EXPR, it needs to
4331 be smaller than any value of the IV in the loop, for MIN_EXPR larger than
4332 any value of the IV in the loop.
4333 INDUC_CODE is the code for epilog reduction if INTEGER_INDUC_COND_REDUCTION.
4334 NEUTRAL_OP is the value given by neutral_op_for_slp_reduction; it is
4335 null if this is not an SLP reduction
4337 This function:
4338 1. Creates the reduction def-use cycles: sets the arguments for
4339 REDUCTION_PHIS:
4340 The loop-entry argument is the vectorized initial-value of the reduction.
4341 The loop-latch argument is taken from VECT_DEFS - the vector of partial
4342 sums.
4343 2. "Reduces" each vector of partial results VECT_DEFS into a single result,
4344 by calling the function specified by REDUC_FN if available, or by
4345 other means (whole-vector shifts or a scalar loop).
4346 The function also creates a new phi node at the loop exit to preserve
4347 loop-closed form, as illustrated below.
4349 The flow at the entry to this function:
4351 loop:
4352 vec_def = phi <null, null> # REDUCTION_PHI
4353 VECT_DEF = vector_stmt # vectorized form of STMT_INFO
4354 s_loop = scalar_stmt # (scalar) STMT_INFO
4355 loop_exit:
4356 s_out0 = phi <s_loop> # (scalar) EXIT_PHI
4357 use <s_out0>
4358 use <s_out0>
4360 The above is transformed by this function into:
4362 loop:
4363 vec_def = phi <vec_init, VECT_DEF> # REDUCTION_PHI
4364 VECT_DEF = vector_stmt # vectorized form of STMT_INFO
4365 s_loop = scalar_stmt # (scalar) STMT_INFO
4366 loop_exit:
4367 s_out0 = phi <s_loop> # (scalar) EXIT_PHI
4368 v_out1 = phi <VECT_DEF> # NEW_EXIT_PHI
4369 v_out2 = reduce <v_out1>
4370 s_out3 = extract_field <v_out2, 0>
4371 s_out4 = adjust_result <s_out3>
4372 use <s_out4>
4373 use <s_out4>
4376 static void
4377 vect_create_epilog_for_reduction (vec<tree> vect_defs,
4378 stmt_vec_info stmt_info,
4379 gimple *reduc_def_stmt,
4380 int ncopies, internal_fn reduc_fn,
4381 vec<stmt_vec_info> reduction_phis,
4382 bool double_reduc,
4383 slp_tree slp_node,
4384 slp_instance slp_node_instance,
4385 tree induc_val, enum tree_code induc_code,
4386 tree neutral_op)
4388 stmt_vec_info prev_phi_info;
4389 tree vectype;
4390 machine_mode mode;
4391 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
4392 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo), *outer_loop = NULL;
4393 basic_block exit_bb;
4394 tree scalar_dest;
4395 tree scalar_type;
4396 gimple *new_phi = NULL, *phi;
4397 stmt_vec_info phi_info;
4398 gimple_stmt_iterator exit_gsi;
4399 tree vec_dest;
4400 tree new_temp = NULL_TREE, new_dest, new_name, new_scalar_dest;
4401 gimple *epilog_stmt = NULL;
4402 enum tree_code code = gimple_assign_rhs_code (stmt_info->stmt);
4403 gimple *exit_phi;
4404 tree bitsize;
4405 tree adjustment_def = NULL;
4406 tree vec_initial_def = NULL;
4407 tree expr, def, initial_def = NULL;
4408 tree orig_name, scalar_result;
4409 imm_use_iterator imm_iter, phi_imm_iter;
4410 use_operand_p use_p, phi_use_p;
4411 gimple *use_stmt;
4412 stmt_vec_info reduction_phi_info = NULL;
4413 bool nested_in_vect_loop = false;
4414 auto_vec<gimple *> new_phis;
4415 auto_vec<stmt_vec_info> inner_phis;
4416 int j, i;
4417 auto_vec<tree> scalar_results;
4418 unsigned int group_size = 1, k, ratio;
4419 auto_vec<tree> vec_initial_defs;
4420 auto_vec<gimple *> phis;
4421 bool slp_reduc = false;
4422 bool direct_slp_reduc;
4423 tree new_phi_result;
4424 stmt_vec_info inner_phi = NULL;
4425 tree induction_index = NULL_TREE;
4427 if (slp_node)
4428 group_size = SLP_TREE_SCALAR_STMTS (slp_node).length ();
4430 if (nested_in_vect_loop_p (loop, stmt_info))
4432 outer_loop = loop;
4433 loop = loop->inner;
4434 nested_in_vect_loop = true;
4435 gcc_assert (!slp_node);
4438 vectype = STMT_VINFO_VECTYPE (stmt_info);
4439 gcc_assert (vectype);
4440 mode = TYPE_MODE (vectype);
4442 /* 1. Create the reduction def-use cycle:
4443 Set the arguments of REDUCTION_PHIS, i.e., transform
4445 loop:
4446 vec_def = phi <null, null> # REDUCTION_PHI
4447 VECT_DEF = vector_stmt # vectorized form of STMT
4450 into:
4452 loop:
4453 vec_def = phi <vec_init, VECT_DEF> # REDUCTION_PHI
4454 VECT_DEF = vector_stmt # vectorized form of STMT
4457 (in case of SLP, do it for all the phis). */
4459 /* Get the loop-entry arguments. */
4460 enum vect_def_type initial_def_dt = vect_unknown_def_type;
4461 if (slp_node)
4463 unsigned vec_num = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node);
4464 vec_initial_defs.reserve (vec_num);
4465 get_initial_defs_for_reduction (slp_node_instance->reduc_phis,
4466 &vec_initial_defs, vec_num,
4467 REDUC_GROUP_FIRST_ELEMENT (stmt_info),
4468 neutral_op);
4470 else
4472 /* Get at the scalar def before the loop, that defines the initial value
4473 of the reduction variable. */
4474 initial_def = PHI_ARG_DEF_FROM_EDGE (reduc_def_stmt,
4475 loop_preheader_edge (loop));
4476 /* Optimize: if initial_def is for REDUC_MAX smaller than the base
4477 and we can't use zero for induc_val, use initial_def. Similarly
4478 for REDUC_MIN and initial_def larger than the base. */
4479 if (TREE_CODE (initial_def) == INTEGER_CST
4480 && (STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info)
4481 == INTEGER_INDUC_COND_REDUCTION)
4482 && !integer_zerop (induc_val)
4483 && ((induc_code == MAX_EXPR
4484 && tree_int_cst_lt (initial_def, induc_val))
4485 || (induc_code == MIN_EXPR
4486 && tree_int_cst_lt (induc_val, initial_def))))
4487 induc_val = initial_def;
4489 if (double_reduc)
4490 /* In case of double reduction we only create a vector variable
4491 to be put in the reduction phi node. The actual statement
4492 creation is done later in this function. */
4493 vec_initial_def = vect_create_destination_var (initial_def, vectype);
4494 else if (nested_in_vect_loop)
4496 /* Do not use an adjustment def as that case is not supported
4497 correctly if ncopies is not one. */
4498 vect_is_simple_use (initial_def, loop_vinfo, &initial_def_dt);
4499 vec_initial_def = vect_get_vec_def_for_operand (initial_def,
4500 stmt_info);
4502 else
4503 vec_initial_def
4504 = get_initial_def_for_reduction (stmt_info, initial_def,
4505 &adjustment_def);
4506 vec_initial_defs.create (1);
4507 vec_initial_defs.quick_push (vec_initial_def);
4510 /* Set phi nodes arguments. */
4511 FOR_EACH_VEC_ELT (reduction_phis, i, phi_info)
4513 tree vec_init_def = vec_initial_defs[i];
4514 tree def = vect_defs[i];
4515 for (j = 0; j < ncopies; j++)
4517 if (j != 0)
4519 phi_info = STMT_VINFO_RELATED_STMT (phi_info);
4520 if (nested_in_vect_loop)
4521 vec_init_def
4522 = vect_get_vec_def_for_stmt_copy (loop_vinfo, vec_init_def);
4525 /* Set the loop-entry arg of the reduction-phi. */
4527 gphi *phi = as_a <gphi *> (phi_info->stmt);
4528 if (STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info)
4529 == INTEGER_INDUC_COND_REDUCTION)
4531 /* Initialise the reduction phi to zero. This prevents non-zero
4532 initial values from interfering with the reduction op. */
4533 gcc_assert (ncopies == 1);
4534 gcc_assert (i == 0);
4536 tree vec_init_def_type = TREE_TYPE (vec_init_def);
4537 tree induc_val_vec
4538 = build_vector_from_val (vec_init_def_type, induc_val);
4540 add_phi_arg (phi, induc_val_vec, loop_preheader_edge (loop),
4541 UNKNOWN_LOCATION);
4543 else
4544 add_phi_arg (phi, vec_init_def, loop_preheader_edge (loop),
4545 UNKNOWN_LOCATION);
4547 /* Set the loop-latch arg for the reduction-phi. */
4548 if (j > 0)
4549 def = vect_get_vec_def_for_stmt_copy (loop_vinfo, def);
4551 add_phi_arg (phi, def, loop_latch_edge (loop), UNKNOWN_LOCATION);
4553 if (dump_enabled_p ())
4555 dump_printf_loc (MSG_NOTE, vect_location,
4556 "transform reduction: created def-use cycle: ");
4557 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, phi, 0);
4558 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, SSA_NAME_DEF_STMT (def), 0);
4563 /* For cond reductions we want to create a new vector (INDEX_COND_EXPR)
4564 which is updated with the current index of the loop for every match of
4565 the original loop's cond_expr (VEC_STMT). This results in a vector
4566 containing the last time the condition passed for that vector lane.
4567 The first match will be a 1 to allow 0 to be used for non-matching
4568 indexes. If there are no matches at all then the vector will be all
4569 zeroes. */
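  /* Illustration: with a 4-lane vector and 8 scalar iterations, the index
     IV created below starts at {1,2,3,4} and steps to {5,6,7,8} for the
     second vector iteration.  A lane whose condition matched only in the
     first vector iteration keeps its value from {1,2,3,4}, a lane that
     also matched in the second iteration is overwritten with the larger
     value, and a lane that never matched stays 0.  The epilogue can then
     find the last match with a MAX reduction of this vector.  */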
4570 if (STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info) == COND_REDUCTION)
4572 tree indx_before_incr, indx_after_incr;
4573 poly_uint64 nunits_out = TYPE_VECTOR_SUBPARTS (vectype);
4575 gimple *vec_stmt = STMT_VINFO_VEC_STMT (stmt_info)->stmt;
4576 gcc_assert (gimple_assign_rhs_code (vec_stmt) == VEC_COND_EXPR);
4578 int scalar_precision
4579 = GET_MODE_PRECISION (SCALAR_TYPE_MODE (TREE_TYPE (vectype)));
4580 tree cr_index_scalar_type = make_unsigned_type (scalar_precision);
4581 tree cr_index_vector_type = build_vector_type
4582 (cr_index_scalar_type, TYPE_VECTOR_SUBPARTS (vectype));
4584 /* First we create a simple vector induction variable which starts
4585 with the values {1,2,3,...} (SERIES_VECT) and increments by the
4586 vector size (STEP). */
4588 /* Create a {1,2,3,...} vector. */
4589 tree series_vect = build_index_vector (cr_index_vector_type, 1, 1);
4591 /* Create a vector of the step value. */
4592 tree step = build_int_cst (cr_index_scalar_type, nunits_out);
4593 tree vec_step = build_vector_from_val (cr_index_vector_type, step);
4595 /* Create an induction variable. */
4596 gimple_stmt_iterator incr_gsi;
4597 bool insert_after;
4598 standard_iv_increment_position (loop, &incr_gsi, &insert_after);
4599 create_iv (series_vect, vec_step, NULL_TREE, loop, &incr_gsi,
4600 insert_after, &indx_before_incr, &indx_after_incr);
4602 /* Next create a new phi node vector (NEW_PHI_TREE) which starts
4603 filled with zeros (VEC_ZERO). */
4605 /* Create a vector of 0s. */
4606 tree zero = build_zero_cst (cr_index_scalar_type);
4607 tree vec_zero = build_vector_from_val (cr_index_vector_type, zero);
4609 /* Create a vector phi node. */
4610 tree new_phi_tree = make_ssa_name (cr_index_vector_type);
4611 new_phi = create_phi_node (new_phi_tree, loop->header);
4612 loop_vinfo->add_stmt (new_phi);
4613 add_phi_arg (as_a <gphi *> (new_phi), vec_zero,
4614 loop_preheader_edge (loop), UNKNOWN_LOCATION);
4616 /* Now take the condition from the loop's original cond_expr
4617 (VEC_STMT) and produce a new cond_expr (INDEX_COND_EXPR) which for
4618 every match uses values from the induction variable
4619 (INDEX_BEFORE_INCR) otherwise uses values from the phi node
4620 (NEW_PHI_TREE).
4621 Finally, we update the phi (NEW_PHI_TREE) to take the value of
4622 the new cond_expr (INDEX_COND_EXPR). */
4624 /* Duplicate the condition from vec_stmt. */
4625 tree ccompare = unshare_expr (gimple_assign_rhs1 (vec_stmt));
4627 /* Create a conditional, where the condition is taken from vec_stmt
4628 (CCOMPARE); the 'then' value is the induction index (INDEX_BEFORE_INCR)
4629 and the 'else' value is the phi (NEW_PHI_TREE). */
4630 tree index_cond_expr = build3 (VEC_COND_EXPR, cr_index_vector_type,
4631 ccompare, indx_before_incr,
4632 new_phi_tree);
4633 induction_index = make_ssa_name (cr_index_vector_type);
4634 gimple *index_condition = gimple_build_assign (induction_index,
4635 index_cond_expr);
4636 gsi_insert_before (&incr_gsi, index_condition, GSI_SAME_STMT);
4637 stmt_vec_info index_vec_info = loop_vinfo->add_stmt (index_condition);
4638 STMT_VINFO_VECTYPE (index_vec_info) = cr_index_vector_type;
4640 /* Update the phi with the vec cond. */
4641 add_phi_arg (as_a <gphi *> (new_phi), induction_index,
4642 loop_latch_edge (loop), UNKNOWN_LOCATION);
4645 /* 2. Create epilog code.
4646 The reduction epilog code operates across the elements of the vector
4647 of partial results computed by the vectorized loop.
4648 The reduction epilog code consists of:
4650 step 1: compute the scalar result in a vector (v_out2)
4651 step 2: extract the scalar result (s_out3) from the vector (v_out2)
4652 step 3: adjust the scalar result (s_out3) if needed.
4654 Step 1 can be accomplished using one the following three schemes:
4655 (scheme 1) using reduc_fn, if available.
4656 (scheme 2) using whole-vector shifts, if available.
4657 (scheme 3) using a scalar loop. In this case steps 1+2 above are
4658 combined.
4660 The overall epilog code looks like this:
4662 s_out0 = phi <s_loop> # original EXIT_PHI
4663 v_out1 = phi <VECT_DEF> # NEW_EXIT_PHI
4664 v_out2 = reduce <v_out1> # step 1
4665 s_out3 = extract_field <v_out2, 0> # step 2
4666 s_out4 = adjust_result <s_out3> # step 3
4668 (step 3 is optional, and steps 1 and 2 may be combined).
4669 Lastly, the uses of s_out0 are replaced by s_out4. */
4672 /* 2.1 Create new loop-exit-phis to preserve loop-closed form:
4673 v_out1 = phi <VECT_DEF>
4674 Store them in NEW_PHIS. */
4676 exit_bb = single_exit (loop)->dest;
4677 prev_phi_info = NULL;
4678 new_phis.create (vect_defs.length ());
4679 FOR_EACH_VEC_ELT (vect_defs, i, def)
4681 for (j = 0; j < ncopies; j++)
4683 tree new_def = copy_ssa_name (def);
4684 phi = create_phi_node (new_def, exit_bb);
4685 stmt_vec_info phi_info = loop_vinfo->add_stmt (phi);
4686 if (j == 0)
4687 new_phis.quick_push (phi);
4688 else
4690 def = vect_get_vec_def_for_stmt_copy (loop_vinfo, def);
4691 STMT_VINFO_RELATED_STMT (prev_phi_info) = phi_info;
4694 SET_PHI_ARG_DEF (phi, single_exit (loop)->dest_idx, def);
4695 prev_phi_info = phi_info;
4699 /* The epilogue is created for the outer-loop, i.e., for the loop being
4700 vectorized. Create exit phis for the outer loop. */
4701 if (double_reduc)
4703 loop = outer_loop;
4704 exit_bb = single_exit (loop)->dest;
4705 inner_phis.create (vect_defs.length ());
4706 FOR_EACH_VEC_ELT (new_phis, i, phi)
4708 stmt_vec_info phi_info = loop_vinfo->lookup_stmt (phi);
4709 tree new_result = copy_ssa_name (PHI_RESULT (phi));
4710 gphi *outer_phi = create_phi_node (new_result, exit_bb);
4711 SET_PHI_ARG_DEF (outer_phi, single_exit (loop)->dest_idx,
4712 PHI_RESULT (phi));
4713 prev_phi_info = loop_vinfo->add_stmt (outer_phi);
4714 inner_phis.quick_push (phi_info);
4715 new_phis[i] = outer_phi;
4716 while (STMT_VINFO_RELATED_STMT (phi_info))
4718 phi_info = STMT_VINFO_RELATED_STMT (phi_info);
4719 new_result = copy_ssa_name (PHI_RESULT (phi_info->stmt));
4720 outer_phi = create_phi_node (new_result, exit_bb);
4721 SET_PHI_ARG_DEF (outer_phi, single_exit (loop)->dest_idx,
4722 PHI_RESULT (phi_info->stmt));
4723 stmt_vec_info outer_phi_info = loop_vinfo->add_stmt (outer_phi);
4724 STMT_VINFO_RELATED_STMT (prev_phi_info) = outer_phi_info;
4725 prev_phi_info = outer_phi_info;
4730 exit_gsi = gsi_after_labels (exit_bb);
4732 /* 2.2 Get the relevant tree-code to use in the epilog for schemes 2,3
4733 (i.e. when reduc_fn is not available) and in the final adjustment
4734 code (if needed). Also get the original scalar reduction variable as
4735 defined in the loop. In case STMT is a "pattern-stmt" (i.e. - it
4736 represents a reduction pattern), the tree-code and scalar-def are
4737 taken from the original stmt that the pattern-stmt (STMT) replaces.
4738 Otherwise (it is a regular reduction) - the tree-code and scalar-def
4739 are taken from STMT. */
4741 stmt_vec_info orig_stmt_info = STMT_VINFO_RELATED_STMT (stmt_info);
4742 if (!orig_stmt_info)
4744 /* Regular reduction */
4745 orig_stmt_info = stmt_info;
4747 else
4749 /* Reduction pattern */
4750 gcc_assert (STMT_VINFO_IN_PATTERN_P (orig_stmt_info));
4751 gcc_assert (STMT_VINFO_RELATED_STMT (orig_stmt_info) == stmt_info);
4754 code = gimple_assign_rhs_code (orig_stmt_info->stmt);
4755 /* For MINUS_EXPR the initial vector is [init_val,0,...,0], therefore,
4756 partial results are added and not subtracted. */
4757 if (code == MINUS_EXPR)
4758 code = PLUS_EXPR;
4760 scalar_dest = gimple_assign_lhs (orig_stmt_info->stmt);
4761 scalar_type = TREE_TYPE (scalar_dest);
4762 scalar_results.create (group_size);
4763 new_scalar_dest = vect_create_destination_var (scalar_dest, NULL);
4764 bitsize = TYPE_SIZE (scalar_type);
4766 /* In case this is a reduction in an inner-loop while vectorizing an outer
4767 loop - we don't need to extract a single scalar result at the end of the
4768 inner-loop (unless it is double reduction, i.e., the use of reduction is
4769 outside the outer-loop). The final vector of partial results will be used
4770 in the vectorized outer-loop, or reduced to a scalar result at the end of
4771 the outer-loop. */
4772 if (nested_in_vect_loop && !double_reduc)
4773 goto vect_finalize_reduction;
4775 /* SLP reduction without reduction chain, e.g.,
4776 # a1 = phi <a2, a0>
4777 # b1 = phi <b2, b0>
4778 a2 = operation (a1)
4779 b2 = operation (b1) */
4780 slp_reduc = (slp_node && !REDUC_GROUP_FIRST_ELEMENT (stmt_info));
4782 /* True if we should implement SLP_REDUC using native reduction operations
4783 instead of scalar operations. */
4784 direct_slp_reduc = (reduc_fn != IFN_LAST
4785 && slp_reduc
4786 && !TYPE_VECTOR_SUBPARTS (vectype).is_constant ());
4788 /* In case of reduction chain, e.g.,
4789 # a1 = phi <a3, a0>
4790 a2 = operation (a1)
4791 a3 = operation (a2),
4793 we may end up with more than one vector result. Here we reduce them to
4794 one vector. */
4795 if (REDUC_GROUP_FIRST_ELEMENT (stmt_info) || direct_slp_reduc)
4797 tree first_vect = PHI_RESULT (new_phis[0]);
4798 gassign *new_vec_stmt = NULL;
4799 vec_dest = vect_create_destination_var (scalar_dest, vectype);
4800 for (k = 1; k < new_phis.length (); k++)
4802 gimple *next_phi = new_phis[k];
4803 tree second_vect = PHI_RESULT (next_phi);
4804 tree tem = make_ssa_name (vec_dest, new_vec_stmt);
4805 new_vec_stmt = gimple_build_assign (tem, code,
4806 first_vect, second_vect);
4807 gsi_insert_before (&exit_gsi, new_vec_stmt, GSI_SAME_STMT);
4808 first_vect = tem;
4811 new_phi_result = first_vect;
4812 if (new_vec_stmt)
4814 new_phis.truncate (0);
4815 new_phis.safe_push (new_vec_stmt);
4818 /* Likewise if we couldn't use a single def-use cycle. */
4819 else if (ncopies > 1)
4821 gcc_assert (new_phis.length () == 1);
4822 tree first_vect = PHI_RESULT (new_phis[0]);
4823 gassign *new_vec_stmt = NULL;
4824 vec_dest = vect_create_destination_var (scalar_dest, vectype);
4825 stmt_vec_info next_phi_info = loop_vinfo->lookup_stmt (new_phis[0]);
4826 for (int k = 1; k < ncopies; ++k)
4828 next_phi_info = STMT_VINFO_RELATED_STMT (next_phi_info);
4829 tree second_vect = PHI_RESULT (next_phi_info->stmt);
4830 tree tem = make_ssa_name (vec_dest, new_vec_stmt);
4831 new_vec_stmt = gimple_build_assign (tem, code,
4832 first_vect, second_vect);
4833 gsi_insert_before (&exit_gsi, new_vec_stmt, GSI_SAME_STMT);
4834 first_vect = tem;
4836 new_phi_result = first_vect;
4837 new_phis.truncate (0);
4838 new_phis.safe_push (new_vec_stmt);
4840 else
4841 new_phi_result = PHI_RESULT (new_phis[0]);
4843 if (STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info) == COND_REDUCTION
4844 && reduc_fn != IFN_LAST)
4846 /* For condition reductions, we have a vector (NEW_PHI_RESULT) containing
4847 various data values where the condition matched and another vector
4848 (INDUCTION_INDEX) containing all the indexes of those matches. We
4849 need to extract the last matching index (which will be the index with
4850 highest value) and use this to index into the data vector.
4851 For the case where there were no matches, the data vector will contain
4852 all default values and the index vector will be all zeros. */
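  /* Sketch of the sequence built below, with illustrative 4-lane values:
     if INDUCTION_INDEX is {0, 6, 0, 4} and NEW_PHI_RESULT is
     {d0, d1, d2, d3}, the MAX reduction yields 6, the splat and compare
     give {false, true, false, false}, the VEC_COND keeps {0, d1, 0, 0},
     and the final unsigned MAX reduction extracts d1 (an unsigned value is
     never smaller than the zero lanes).  */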
4854 /* Get various versions of the type of the vector of indexes. */
4855 tree index_vec_type = TREE_TYPE (induction_index);
4856 gcc_checking_assert (TYPE_UNSIGNED (index_vec_type));
4857 tree index_scalar_type = TREE_TYPE (index_vec_type);
4858 tree index_vec_cmp_type = build_same_sized_truth_vector_type
4859 (index_vec_type);
4861 /* Get an unsigned integer version of the type of the data vector. */
4862 int scalar_precision
4863 = GET_MODE_PRECISION (SCALAR_TYPE_MODE (scalar_type));
4864 tree scalar_type_unsigned = make_unsigned_type (scalar_precision);
4865 tree vectype_unsigned = build_vector_type
4866 (scalar_type_unsigned, TYPE_VECTOR_SUBPARTS (vectype));
4868 /* First we need to create a vector (ZERO_VEC) of zeros and another
4869 vector (MAX_INDEX_VEC) filled with the last matching index, which we
4870 can create using a MAX reduction and then expanding.
4871 In the case where the loop never made any matches, the max index will
4872 be zero. */
4874 /* Vector of {0, 0, 0,...}. */
4875 tree zero_vec = make_ssa_name (vectype);
4876 tree zero_vec_rhs = build_zero_cst (vectype);
4877 gimple *zero_vec_stmt = gimple_build_assign (zero_vec, zero_vec_rhs);
4878 gsi_insert_before (&exit_gsi, zero_vec_stmt, GSI_SAME_STMT);
4880 /* Find maximum value from the vector of found indexes. */
4881 tree max_index = make_ssa_name (index_scalar_type);
4882 gcall *max_index_stmt = gimple_build_call_internal (IFN_REDUC_MAX,
4883 1, induction_index);
4884 gimple_call_set_lhs (max_index_stmt, max_index);
4885 gsi_insert_before (&exit_gsi, max_index_stmt, GSI_SAME_STMT);
4887 /* Vector of {max_index, max_index, max_index,...}. */
4888 tree max_index_vec = make_ssa_name (index_vec_type);
4889 tree max_index_vec_rhs = build_vector_from_val (index_vec_type,
4890 max_index);
4891 gimple *max_index_vec_stmt = gimple_build_assign (max_index_vec,
4892 max_index_vec_rhs);
4893 gsi_insert_before (&exit_gsi, max_index_vec_stmt, GSI_SAME_STMT);
4895 /* Next we compare the new vector (MAX_INDEX_VEC) full of max indexes
4896 with the vector (INDUCTION_INDEX) of found indexes, choosing values
4897 from the data vector (NEW_PHI_RESULT) for matches, 0 (ZERO_VEC)
4898 otherwise. Only one value should match, resulting in a vector
4899 (VEC_COND) with one data value and the rest zeros.
4900 In the case where the loop never made any matches, every index will
4901 match, resulting in a vector with all data values (which will all be
4902 the default value). */
4904 /* Compare the max index vector to the vector of found indexes to find
4905 the position of the max value. */
4906 tree vec_compare = make_ssa_name (index_vec_cmp_type);
4907 gimple *vec_compare_stmt = gimple_build_assign (vec_compare, EQ_EXPR,
4908 induction_index,
4909 max_index_vec);
4910 gsi_insert_before (&exit_gsi, vec_compare_stmt, GSI_SAME_STMT);
4912 /* Use the compare to choose either values from the data vector or
4913 zero. */
4914 tree vec_cond = make_ssa_name (vectype);
4915 gimple *vec_cond_stmt = gimple_build_assign (vec_cond, VEC_COND_EXPR,
4916 vec_compare, new_phi_result,
4917 zero_vec);
4918 gsi_insert_before (&exit_gsi, vec_cond_stmt, GSI_SAME_STMT);
4920 /* Finally we need to extract the data value from the vector (VEC_COND)
4921 into a scalar (MATCHED_DATA_REDUC). Logically we want to do an OR
4922 reduction, but because this doesn't exist, we can use a MAX reduction
4923 instead. The data value might be signed or a float so we need to cast
4924 it first.
4925 In the case where the loop never made any matches, the data values are
4926 all identical, and so will reduce down correctly. */
4928 /* Make the matched data values unsigned. */
4929 tree vec_cond_cast = make_ssa_name (vectype_unsigned);
4930 tree vec_cond_cast_rhs = build1 (VIEW_CONVERT_EXPR, vectype_unsigned,
4931 vec_cond);
4932 gimple *vec_cond_cast_stmt = gimple_build_assign (vec_cond_cast,
4933 VIEW_CONVERT_EXPR,
4934 vec_cond_cast_rhs);
4935 gsi_insert_before (&exit_gsi, vec_cond_cast_stmt, GSI_SAME_STMT);
4937 /* Reduce down to a scalar value. */
4938 tree data_reduc = make_ssa_name (scalar_type_unsigned);
4939 gcall *data_reduc_stmt = gimple_build_call_internal (IFN_REDUC_MAX,
4940 1, vec_cond_cast);
4941 gimple_call_set_lhs (data_reduc_stmt, data_reduc);
4942 gsi_insert_before (&exit_gsi, data_reduc_stmt, GSI_SAME_STMT);
4944 /* Convert the reduced value back to the result type and set as the
4945 result. */
4946 gimple_seq stmts = NULL;
4947 new_temp = gimple_build (&stmts, VIEW_CONVERT_EXPR, scalar_type,
4948 data_reduc);
4949 gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
4950 scalar_results.safe_push (new_temp);
4952 else if (STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info) == COND_REDUCTION
4953 && reduc_fn == IFN_LAST)
4955 /* Condition reduction without supported IFN_REDUC_MAX. Generate
4956 idx = 0;
4957 idx_val = induction_index[0];
4958 val = data_reduc[0];
4959 for (idx = 0, val = init, i = 0; i < nelts; ++i)
4960 if (induction_index[i] > idx_val)
4961 val = data_reduc[i], idx_val = induction_index[i];
4962 return val; */
4964 tree data_eltype = TREE_TYPE (TREE_TYPE (new_phi_result));
4965 tree idx_eltype = TREE_TYPE (TREE_TYPE (induction_index));
4966 unsigned HOST_WIDE_INT el_size = tree_to_uhwi (TYPE_SIZE (idx_eltype));
4967 poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (TREE_TYPE (induction_index));
4968 /* Enforced by vectorizable_reduction, which ensures we have target
4969 support before allowing a conditional reduction on variable-length
4970 vectors. */
4971 unsigned HOST_WIDE_INT v_size = el_size * nunits.to_constant ();
4972 tree idx_val = NULL_TREE, val = NULL_TREE;
4973 for (unsigned HOST_WIDE_INT off = 0; off < v_size; off += el_size)
4975 tree old_idx_val = idx_val;
4976 tree old_val = val;
4977 idx_val = make_ssa_name (idx_eltype);
4978 epilog_stmt = gimple_build_assign (idx_val, BIT_FIELD_REF,
4979 build3 (BIT_FIELD_REF, idx_eltype,
4980 induction_index,
4981 bitsize_int (el_size),
4982 bitsize_int (off)));
4983 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
4984 val = make_ssa_name (data_eltype);
4985 epilog_stmt = gimple_build_assign (val, BIT_FIELD_REF,
4986 build3 (BIT_FIELD_REF,
4987 data_eltype,
4988 new_phi_result,
4989 bitsize_int (el_size),
4990 bitsize_int (off)));
4991 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
4992 if (off != 0)
4994 tree new_idx_val = idx_val;
4995 tree new_val = val;
4996 if (off != v_size - el_size)
4998 new_idx_val = make_ssa_name (idx_eltype);
4999 epilog_stmt = gimple_build_assign (new_idx_val,
5000 MAX_EXPR, idx_val,
5001 old_idx_val);
5002 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5004 new_val = make_ssa_name (data_eltype);
5005 epilog_stmt = gimple_build_assign (new_val,
5006 COND_EXPR,
5007 build2 (GT_EXPR,
5008 boolean_type_node,
5009 idx_val,
5010 old_idx_val),
5011 val, old_val);
5012 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5013 idx_val = new_idx_val;
5014 val = new_val;
5017 /* Convert the reduced value back to the result type and set as the
5018 result. */
5019 gimple_seq stmts = NULL;
5020 val = gimple_convert (&stmts, scalar_type, val);
5021 gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
5022 scalar_results.safe_push (val);
5025 /* 2.3 Create the reduction code, using one of the three schemes described
5026 above. In SLP we simply need to extract all the elements from the
5027 vector (without reducing them), so we use scalar shifts. */
5028 else if (reduc_fn != IFN_LAST && !slp_reduc)
5030 tree tmp;
5031 tree vec_elem_type;
5033 /* Case 1: Create:
5034 v_out2 = reduc_expr <v_out1> */
5036 if (dump_enabled_p ())
5037 dump_printf_loc (MSG_NOTE, vect_location,
5038 "Reduce using direct vector reduction.\n");
5040 vec_elem_type = TREE_TYPE (TREE_TYPE (new_phi_result));
5041 if (!useless_type_conversion_p (scalar_type, vec_elem_type))
5043 tree tmp_dest
5044 = vect_create_destination_var (scalar_dest, vec_elem_type);
5045 epilog_stmt = gimple_build_call_internal (reduc_fn, 1,
5046 new_phi_result);
5047 gimple_set_lhs (epilog_stmt, tmp_dest);
5048 new_temp = make_ssa_name (tmp_dest, epilog_stmt);
5049 gimple_set_lhs (epilog_stmt, new_temp);
5050 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5052 epilog_stmt = gimple_build_assign (new_scalar_dest, NOP_EXPR,
5053 new_temp);
5055 else
5057 epilog_stmt = gimple_build_call_internal (reduc_fn, 1,
5058 new_phi_result);
5059 gimple_set_lhs (epilog_stmt, new_scalar_dest);
5062 new_temp = make_ssa_name (new_scalar_dest, epilog_stmt);
5063 gimple_set_lhs (epilog_stmt, new_temp);
5064 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5066 if ((STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info)
5067 == INTEGER_INDUC_COND_REDUCTION)
5068 && !operand_equal_p (initial_def, induc_val, 0))
5070 /* Earlier we set the initial value to be a vector of induc_val
5071 values. Check the result and if it is induc_val then replace
5072 with the original initial value, unless induc_val is
5073 the same as initial_def already. */
5074 tree zcompare = build2 (EQ_EXPR, boolean_type_node, new_temp,
5075 induc_val);
5077 tmp = make_ssa_name (new_scalar_dest);
5078 epilog_stmt = gimple_build_assign (tmp, COND_EXPR, zcompare,
5079 initial_def, new_temp);
5080 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5081 new_temp = tmp;
5084 scalar_results.safe_push (new_temp);
5086 else if (direct_slp_reduc)
5088 /* Here we create one vector for each of the REDUC_GROUP_SIZE results,
5089 with the elements for other SLP statements replaced with the
5090 neutral value. We can then do a normal reduction on each vector. */
5092 /* Enforced by vectorizable_reduction. */
5093 gcc_assert (new_phis.length () == 1);
5094 gcc_assert (pow2p_hwi (group_size));
5096 slp_tree orig_phis_slp_node = slp_node_instance->reduc_phis;
5097 vec<stmt_vec_info> orig_phis
5098 = SLP_TREE_SCALAR_STMTS (orig_phis_slp_node);
5099 gimple_seq seq = NULL;
5101 /* Build a vector {0, 1, 2, ...}, with the same number of elements
5102 and the same element size as VECTYPE. */
5103 tree index = build_index_vector (vectype, 0, 1);
5104 tree index_type = TREE_TYPE (index);
5105 tree index_elt_type = TREE_TYPE (index_type);
5106 tree mask_type = build_same_sized_truth_vector_type (index_type);
5108 /* Create a vector that, for each element, identifies which of
5109 the REDUC_GROUP_SIZE results should use it. */
5110 tree index_mask = build_int_cst (index_elt_type, group_size - 1);
5111 index = gimple_build (&seq, BIT_AND_EXPR, index_type, index,
5112 build_vector_from_val (index_type, index_mask));
5114 /* Get a neutral vector value. This is simply a splat of the neutral
5115 scalar value if we have one, otherwise the initial scalar value
5116 is itself a neutral value. */
5117 tree vector_identity = NULL_TREE;
5118 if (neutral_op)
5119 vector_identity = gimple_build_vector_from_val (&seq, vectype,
5120 neutral_op);
5121 for (unsigned int i = 0; i < group_size; ++i)
5123 /* If there's no universal neutral value, we can use the
5124 initial scalar value from the original PHI. This is used
5125 for MIN and MAX reduction, for example. */
5126 if (!neutral_op)
5128 tree scalar_value
5129 = PHI_ARG_DEF_FROM_EDGE (orig_phis[i]->stmt,
5130 loop_preheader_edge (loop));
5131 vector_identity = gimple_build_vector_from_val (&seq, vectype,
5132 scalar_value);
5135 /* Calculate the equivalent of:
5137 sel[j] = (index[j] == i);
5139 which selects the elements of NEW_PHI_RESULT that should
5140 be included in the result. */
5141 tree compare_val = build_int_cst (index_elt_type, i);
5142 compare_val = build_vector_from_val (index_type, compare_val);
5143 tree sel = gimple_build (&seq, EQ_EXPR, mask_type,
5144 index, compare_val);
5146 /* Calculate the equivalent of:
5148 vec = sel ? new_phi_result : vector_identity;
5150 VEC is now suitable for a full vector reduction. */
5151 tree vec = gimple_build (&seq, VEC_COND_EXPR, vectype,
5152 sel, new_phi_result, vector_identity);
5154 /* Do the reduction and convert it to the appropriate type. */
5155 tree scalar = gimple_build (&seq, as_combined_fn (reduc_fn),
5156 TREE_TYPE (vectype), vec);
5157 scalar = gimple_convert (&seq, scalar_type, scalar);
5158 scalar_results.safe_push (scalar);
5160 gsi_insert_seq_before (&exit_gsi, seq, GSI_SAME_STMT);
5162 else
5164 bool reduce_with_shift;
5165 tree vec_temp;
5167 /* COND reductions all do the final reduction with MAX_EXPR
5168 or MIN_EXPR. */
5169 if (code == COND_EXPR)
5171 if (STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info)
5172 == INTEGER_INDUC_COND_REDUCTION)
5173 code = induc_code;
5174 else
5175 code = MAX_EXPR;
5178 /* See if the target wants to do the final (shift) reduction
5179 in a vector mode of smaller size and first reduce upper/lower
5180 halves against each other. */
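      /* For example (illustrative modes only; the choice is entirely up to
	 the targetm.vectorize.split_reduction hook): a target that prefers
	 128-bit reductions would have a 256-bit V8SI accumulator split into
	 two V4SI halves, combined with CODE in the loop below, after which
	 the V4SI result is reduced by the shift or scalar schemes that
	 follow.  */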
5181 enum machine_mode mode1 = mode;
5182 tree vectype1 = vectype;
5183 unsigned sz = tree_to_uhwi (TYPE_SIZE_UNIT (vectype));
5184 unsigned sz1 = sz;
5185 if (!slp_reduc
5186 && (mode1 = targetm.vectorize.split_reduction (mode)) != mode)
5187 sz1 = GET_MODE_SIZE (mode1).to_constant ();
5189 vectype1 = get_vectype_for_scalar_type_and_size (scalar_type, sz1);
5190 reduce_with_shift = have_whole_vector_shift (mode1);
5191 if (!VECTOR_MODE_P (mode1))
5192 reduce_with_shift = false;
5193 else
5195 optab optab = optab_for_tree_code (code, vectype1, optab_default);
5196 if (optab_handler (optab, mode1) == CODE_FOR_nothing)
5197 reduce_with_shift = false;
5200 /* First reduce the vector to the desired vector size on which we
5201 should do the shift reduction, by combining upper and lower halves. */
5202 new_temp = new_phi_result;
5203 while (sz > sz1)
5205 gcc_assert (!slp_reduc);
5206 sz /= 2;
5207 vectype1 = get_vectype_for_scalar_type_and_size (scalar_type, sz);
5209 /* The target has to make sure we support lowpart/highpart
5210 extraction, either via direct vector extract or through
5211 integer mode punning. */
5212 tree dst1, dst2;
5213 if (convert_optab_handler (vec_extract_optab,
5214 TYPE_MODE (TREE_TYPE (new_temp)),
5215 TYPE_MODE (vectype1))
5216 != CODE_FOR_nothing)
5218 /* Extract sub-vectors directly once vec_extract becomes
5219 a conversion optab. */
5220 dst1 = make_ssa_name (vectype1);
5221 epilog_stmt
5222 = gimple_build_assign (dst1, BIT_FIELD_REF,
5223 build3 (BIT_FIELD_REF, vectype1,
5224 new_temp, TYPE_SIZE (vectype1),
5225 bitsize_int (0)));
5226 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5227 dst2 = make_ssa_name (vectype1);
5228 epilog_stmt
5229 = gimple_build_assign (dst2, BIT_FIELD_REF,
5230 build3 (BIT_FIELD_REF, vectype1,
5231 new_temp, TYPE_SIZE (vectype1),
5232 bitsize_int (sz * BITS_PER_UNIT)));
5233 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5235 else
5237 /* Extract via punning to an appropriately sized integer mode
5238 vector. */
5239 tree eltype = build_nonstandard_integer_type (sz * BITS_PER_UNIT,
5240 1);
5241 tree etype = build_vector_type (eltype, 2);
5242 gcc_assert (convert_optab_handler (vec_extract_optab,
5243 TYPE_MODE (etype),
5244 TYPE_MODE (eltype))
5245 != CODE_FOR_nothing);
5246 tree tem = make_ssa_name (etype);
5247 epilog_stmt = gimple_build_assign (tem, VIEW_CONVERT_EXPR,
5248 build1 (VIEW_CONVERT_EXPR,
5249 etype, new_temp));
5250 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5251 new_temp = tem;
5252 tem = make_ssa_name (eltype);
5253 epilog_stmt
5254 = gimple_build_assign (tem, BIT_FIELD_REF,
5255 build3 (BIT_FIELD_REF, eltype,
5256 new_temp, TYPE_SIZE (eltype),
5257 bitsize_int (0)));
5258 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5259 dst1 = make_ssa_name (vectype1);
5260 epilog_stmt = gimple_build_assign (dst1, VIEW_CONVERT_EXPR,
5261 build1 (VIEW_CONVERT_EXPR,
5262 vectype1, tem));
5263 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5264 tem = make_ssa_name (eltype);
5265 epilog_stmt
5266 = gimple_build_assign (tem, BIT_FIELD_REF,
5267 build3 (BIT_FIELD_REF, eltype,
5268 new_temp, TYPE_SIZE (eltype),
5269 bitsize_int (sz * BITS_PER_UNIT)));
5270 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5271 dst2 = make_ssa_name (vectype1);
5272 epilog_stmt = gimple_build_assign (dst2, VIEW_CONVERT_EXPR,
5273 build1 (VIEW_CONVERT_EXPR,
5274 vectype1, tem));
5275 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5278 new_temp = make_ssa_name (vectype1);
5279 epilog_stmt = gimple_build_assign (new_temp, code, dst1, dst2);
5280 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5283 if (reduce_with_shift && !slp_reduc)
5285 int element_bitsize = tree_to_uhwi (bitsize);
5286 /* Enforced by vectorizable_reduction, which disallows SLP reductions
5287 for variable-length vectors and also requires direct target support
5288 for loop reductions. */
5289 int vec_size_in_bits = tree_to_uhwi (TYPE_SIZE (vectype1));
5290 int nelements = vec_size_in_bits / element_bitsize;
5291 vec_perm_builder sel;
5292 vec_perm_indices indices;
5294 int elt_offset;
5296 tree zero_vec = build_zero_cst (vectype1);
5297 /* Case 2: Create:
5298 for (offset = nelements/2; offset >= 1; offset/=2)
5300 Create: va' = vec_shift <va, offset>
5301 Create: va = vop <va, va'>
5302 } */
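/* For instance (illustrative, a PLUS reduction of a four-element
   vector { a, b, c, d }):

       offset 2:  va' = vec_shift <va, 2> = { c, d, 0, 0 }
                  va  = va + va'          = { a+c, b+d, c, d }
       offset 1:  va' = vec_shift <va, 1> = { b+d, c, d, 0 }
                  va  = va + va'          = { a+b+c+d, ... }

   after which element 0 holds the scalar result.  */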
5304 tree rhs;
5306 if (dump_enabled_p ())
5307 dump_printf_loc (MSG_NOTE, vect_location,
5308 "Reduce using vector shifts\n");
5310 mode1 = TYPE_MODE (vectype1);
5311 vec_dest = vect_create_destination_var (scalar_dest, vectype1);
5312 for (elt_offset = nelements / 2;
5313 elt_offset >= 1;
5314 elt_offset /= 2)
5316 calc_vec_perm_mask_for_shift (elt_offset, nelements, &sel);
5317 indices.new_vector (sel, 2, nelements);
5318 tree mask = vect_gen_perm_mask_any (vectype1, indices);
5319 epilog_stmt = gimple_build_assign (vec_dest, VEC_PERM_EXPR,
5320 new_temp, zero_vec, mask);
5321 new_name = make_ssa_name (vec_dest, epilog_stmt);
5322 gimple_assign_set_lhs (epilog_stmt, new_name);
5323 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5325 epilog_stmt = gimple_build_assign (vec_dest, code, new_name,
5326 new_temp);
5327 new_temp = make_ssa_name (vec_dest, epilog_stmt);
5328 gimple_assign_set_lhs (epilog_stmt, new_temp);
5329 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5332 /* 2.4 Extract the final scalar result. Create:
5333 s_out3 = extract_field <v_out2, bitpos> */
5335 if (dump_enabled_p ())
5336 dump_printf_loc (MSG_NOTE, vect_location,
5337 "extract scalar result\n");
5339 rhs = build3 (BIT_FIELD_REF, scalar_type, new_temp,
5340 bitsize, bitsize_zero_node);
5341 epilog_stmt = gimple_build_assign (new_scalar_dest, rhs);
5342 new_temp = make_ssa_name (new_scalar_dest, epilog_stmt);
5343 gimple_assign_set_lhs (epilog_stmt, new_temp);
5344 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5345 scalar_results.safe_push (new_temp);
5347 else
5349 /* Case 3: Create:
5350 s = extract_field <v_out2, 0>
5351 for (offset = element_size;
5352 offset < vector_size;
5353 offset += element_size;)
5355 Create: s' = extract_field <v_out2, offset>
5356 Create: s = op <s, s'> // For non SLP cases
5357 } */
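/* E.g. (illustrative) for a PLUS reduction of a four-element vector V
   this becomes

       s = v[0];  s = s + v[1];  s = s + v[2];  s = s + v[3];

   whereas in the SLP case each extracted element is pushed to
   SCALAR_RESULTS without applying the operation.  */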
5359 if (dump_enabled_p ())
5360 dump_printf_loc (MSG_NOTE, vect_location,
5361 "Reduce using scalar code.\n");
5363 int vec_size_in_bits = tree_to_uhwi (TYPE_SIZE (vectype1));
5364 int element_bitsize = tree_to_uhwi (bitsize);
5365 FOR_EACH_VEC_ELT (new_phis, i, new_phi)
5367 int bit_offset;
5368 if (gimple_code (new_phi) == GIMPLE_PHI)
5369 vec_temp = PHI_RESULT (new_phi);
5370 else
5371 vec_temp = gimple_assign_lhs (new_phi);
5372 tree rhs = build3 (BIT_FIELD_REF, scalar_type, vec_temp, bitsize,
5373 bitsize_zero_node);
5374 epilog_stmt = gimple_build_assign (new_scalar_dest, rhs);
5375 new_temp = make_ssa_name (new_scalar_dest, epilog_stmt);
5376 gimple_assign_set_lhs (epilog_stmt, new_temp);
5377 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5379 /* In SLP we don't need to apply the reduction operation, so we
5380 just collect the s' values in SCALAR_RESULTS. */
5381 if (slp_reduc)
5382 scalar_results.safe_push (new_temp);
5384 for (bit_offset = element_bitsize;
5385 bit_offset < vec_size_in_bits;
5386 bit_offset += element_bitsize)
5388 tree bitpos = bitsize_int (bit_offset);
5389 tree rhs = build3 (BIT_FIELD_REF, scalar_type, vec_temp,
5390 bitsize, bitpos);
5392 epilog_stmt = gimple_build_assign (new_scalar_dest, rhs);
5393 new_name = make_ssa_name (new_scalar_dest, epilog_stmt);
5394 gimple_assign_set_lhs (epilog_stmt, new_name);
5395 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5397 if (slp_reduc)
5399 /* In SLP we don't need to apply the reduction operation, so
5400 we just collect the s' values in SCALAR_RESULTS. */
5401 new_temp = new_name;
5402 scalar_results.safe_push (new_name);
5404 else
5406 epilog_stmt = gimple_build_assign (new_scalar_dest, code,
5407 new_name, new_temp);
5408 new_temp = make_ssa_name (new_scalar_dest, epilog_stmt);
5409 gimple_assign_set_lhs (epilog_stmt, new_temp);
5410 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5415 /* The only case where we need to reduce scalar results in SLP, is
5416 unrolling. If the size of SCALAR_RESULTS is greater than
5417 REDUC_GROUP_SIZE, we reduce them combining elements modulo
5418 REDUC_GROUP_SIZE. */
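/* E.g. (illustrative) with REDUC_GROUP_SIZE == 2 and four collected
   scalar results r0, r1, r2, r3 (two unrolled copies), the loop below
   leaves

       scalar_results[0] = r0 CODE r2
       scalar_results[1] = r1 CODE r3  */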
5419 if (slp_reduc)
5421 tree res, first_res, new_res;
5422 gimple *new_stmt;
5424 /* Reduce multiple scalar results in case of SLP unrolling. */
5425 for (j = group_size; scalar_results.iterate (j, &res);
5426 j++)
5428 first_res = scalar_results[j % group_size];
5429 new_stmt = gimple_build_assign (new_scalar_dest, code,
5430 first_res, res);
5431 new_res = make_ssa_name (new_scalar_dest, new_stmt);
5432 gimple_assign_set_lhs (new_stmt, new_res);
5433 gsi_insert_before (&exit_gsi, new_stmt, GSI_SAME_STMT);
5434 scalar_results[j % group_size] = new_res;
5437 else
5438 /* Not SLP - we have one scalar to keep in SCALAR_RESULTS. */
5439 scalar_results.safe_push (new_temp);
5442 if ((STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info)
5443 == INTEGER_INDUC_COND_REDUCTION)
5444 && !operand_equal_p (initial_def, induc_val, 0))
5446 /* Earlier we set the initial value to be a vector of induc_val
5447 values. Check the result, and if it is induc_val then replace it
5448 with the original initial value, unless induc_val is already
5449 the same as initial_def. */
5450 tree zcompare = build2 (EQ_EXPR, boolean_type_node, new_temp,
5451 induc_val);
5453 tree tmp = make_ssa_name (new_scalar_dest);
5454 epilog_stmt = gimple_build_assign (tmp, COND_EXPR, zcompare,
5455 initial_def, new_temp);
5456 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5457 scalar_results[0] = tmp;
5461 vect_finalize_reduction:
5463 if (double_reduc)
5464 loop = loop->inner;
5466 /* 2.5 Adjust the final result by the initial value of the reduction
5467 variable. (When such adjustment is not needed, then
5468 'adjustment_def' is zero). For example, if code is PLUS we create:
5469 new_temp = loop_exit_def + adjustment_def */
5471 if (adjustment_def)
5473 gcc_assert (!slp_reduc);
5474 if (nested_in_vect_loop)
5476 new_phi = new_phis[0];
5477 gcc_assert (TREE_CODE (TREE_TYPE (adjustment_def)) == VECTOR_TYPE);
5478 expr = build2 (code, vectype, PHI_RESULT (new_phi), adjustment_def);
5479 new_dest = vect_create_destination_var (scalar_dest, vectype);
5481 else
5483 new_temp = scalar_results[0];
5484 gcc_assert (TREE_CODE (TREE_TYPE (adjustment_def)) != VECTOR_TYPE);
5485 expr = build2 (code, scalar_type, new_temp, adjustment_def);
5486 new_dest = vect_create_destination_var (scalar_dest, scalar_type);
5489 epilog_stmt = gimple_build_assign (new_dest, expr);
5490 new_temp = make_ssa_name (new_dest, epilog_stmt);
5491 gimple_assign_set_lhs (epilog_stmt, new_temp);
5492 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5493 if (nested_in_vect_loop)
5495 stmt_vec_info epilog_stmt_info = loop_vinfo->add_stmt (epilog_stmt);
5496 STMT_VINFO_RELATED_STMT (epilog_stmt_info)
5497 = STMT_VINFO_RELATED_STMT (loop_vinfo->lookup_stmt (new_phi));
5499 if (!double_reduc)
5500 scalar_results.quick_push (new_temp);
5501 else
5502 scalar_results[0] = new_temp;
5504 else
5505 scalar_results[0] = new_temp;
5507 new_phis[0] = epilog_stmt;
5510 /* 2.6 Handle the loop-exit phis. Replace the uses of scalar loop-exit
5511 phis with new adjusted scalar results, i.e., replace use <s_out0>
5512 with use <s_out4>.
5514 Transform:
5515 loop_exit:
5516 s_out0 = phi <s_loop> # (scalar) EXIT_PHI
5517 v_out1 = phi <VECT_DEF> # NEW_EXIT_PHI
5518 v_out2 = reduce <v_out1>
5519 s_out3 = extract_field <v_out2, 0>
5520 s_out4 = adjust_result <s_out3>
5521 use <s_out0>
5522 use <s_out0>
5524 into:
5526 loop_exit:
5527 s_out0 = phi <s_loop> # (scalar) EXIT_PHI
5528 v_out1 = phi <VECT_DEF> # NEW_EXIT_PHI
5529 v_out2 = reduce <v_out1>
5530 s_out3 = extract_field <v_out2, 0>
5531 s_out4 = adjust_result <s_out3>
5532 use <s_out4>
5533 use <s_out4> */
5536 /* In a SLP reduction chain we reduce the vector results into one vector
5537 if necessary; hence we set REDUC_GROUP_SIZE to 1 here. SCALAR_DEST is the
5538 LHS of the last stmt in the reduction chain, since we are looking for
5539 the loop exit phi node. */
5540 if (REDUC_GROUP_FIRST_ELEMENT (stmt_info))
5542 stmt_vec_info dest_stmt_info
5543 = SLP_TREE_SCALAR_STMTS (slp_node)[group_size - 1];
5544 /* Handle reduction patterns. */
5545 if (STMT_VINFO_RELATED_STMT (dest_stmt_info))
5546 dest_stmt_info = STMT_VINFO_RELATED_STMT (dest_stmt_info);
5548 scalar_dest = gimple_assign_lhs (dest_stmt_info->stmt);
5549 group_size = 1;
5552 /* In SLP we may have several statements in NEW_PHIS and REDUCTION_PHIS (in
5553 case that REDUC_GROUP_SIZE is greater than vectorization factor).
5554 Therefore, we need to match SCALAR_RESULTS with corresponding statements.
5555 The first (REDUC_GROUP_SIZE / number of new vector stmts) scalar results
5556 correspond to the first vector stmt, etc.
5557 (RATIO is equal to (REDUC_GROUP_SIZE / number of new vector stmts)). */
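/* E.g. (illustrative) with REDUC_GROUP_SIZE == 4 and two vector stmts
   in NEW_PHIS, RATIO is 2: scalar results 0 and 1 are matched with the
   first vector stmt, scalar results 2 and 3 with the second.  */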
5558 if (group_size > new_phis.length ())
5560 ratio = group_size / new_phis.length ();
5561 gcc_assert (!(group_size % new_phis.length ()));
5563 else
5564 ratio = 1;
5566 stmt_vec_info epilog_stmt_info = NULL;
5567 for (k = 0; k < group_size; k++)
5569 if (k % ratio == 0)
5571 epilog_stmt_info = loop_vinfo->lookup_stmt (new_phis[k / ratio]);
5572 reduction_phi_info = reduction_phis[k / ratio];
5573 if (double_reduc)
5574 inner_phi = inner_phis[k / ratio];
5577 if (slp_reduc)
5579 stmt_vec_info scalar_stmt_info = SLP_TREE_SCALAR_STMTS (slp_node)[k];
5581 orig_stmt_info = STMT_VINFO_RELATED_STMT (scalar_stmt_info);
5582 /* SLP statements can't participate in patterns. */
5583 gcc_assert (!orig_stmt_info);
5584 scalar_dest = gimple_assign_lhs (scalar_stmt_info->stmt);
5587 phis.create (3);
5588 /* Find the loop-closed-use at the loop exit of the original scalar
5589 result. (The reduction result is expected to have two immediate uses -
5590 one at the latch block, and one at the loop exit). */
5591 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, scalar_dest)
5592 if (!flow_bb_inside_loop_p (loop, gimple_bb (USE_STMT (use_p)))
5593 && !is_gimple_debug (USE_STMT (use_p)))
5594 phis.safe_push (USE_STMT (use_p));
5596 /* While we expect to have found an exit_phi because of loop-closed-ssa
5597 form we can end up without one if the scalar cycle is dead. */
5599 FOR_EACH_VEC_ELT (phis, i, exit_phi)
5601 if (outer_loop)
5603 stmt_vec_info exit_phi_vinfo
5604 = loop_vinfo->lookup_stmt (exit_phi);
5605 gphi *vect_phi;
5607 /* FORNOW. Currently not supporting the case that an inner-loop
5608 reduction is not used in the outer-loop (but only outside the
5609 outer-loop), unless it is a double reduction. */
5610 gcc_assert ((STMT_VINFO_RELEVANT_P (exit_phi_vinfo)
5611 && !STMT_VINFO_LIVE_P (exit_phi_vinfo))
5612 || double_reduc);
5614 if (double_reduc)
5615 STMT_VINFO_VEC_STMT (exit_phi_vinfo) = inner_phi;
5616 else
5617 STMT_VINFO_VEC_STMT (exit_phi_vinfo) = epilog_stmt_info;
5618 if (!double_reduc
5619 || STMT_VINFO_DEF_TYPE (exit_phi_vinfo)
5620 != vect_double_reduction_def)
5621 continue;
5623 /* Handle double reduction:
5625 stmt1: s1 = phi <s0, s2> - double reduction phi (outer loop)
5626 stmt2: s3 = phi <s1, s4> - (regular) reduc phi (inner loop)
5627 stmt3: s4 = use (s3) - (regular) reduc stmt (inner loop)
5628 stmt4: s2 = phi <s4> - double reduction stmt (outer loop)
5630 At that point the regular reduction (stmt2 and stmt3) is
5631 already vectorized, as well as the exit phi node, stmt4.
5632 Here we vectorize the phi node of double reduction, stmt1, and
5633 update all relevant statements. */
5635 /* Go through all the uses of s2 to find double reduction phi
5636 node, i.e., stmt1 above. */
5637 orig_name = PHI_RESULT (exit_phi);
5638 FOR_EACH_IMM_USE_STMT (use_stmt, imm_iter, orig_name)
5640 stmt_vec_info use_stmt_vinfo;
5641 tree vect_phi_init, preheader_arg, vect_phi_res;
5642 basic_block bb = gimple_bb (use_stmt);
5644 /* Check that USE_STMT is really double reduction phi
5645 node. */
5646 if (gimple_code (use_stmt) != GIMPLE_PHI
5647 || gimple_phi_num_args (use_stmt) != 2
5648 || bb->loop_father != outer_loop)
5649 continue;
5650 use_stmt_vinfo = loop_vinfo->lookup_stmt (use_stmt);
5651 if (!use_stmt_vinfo
5652 || STMT_VINFO_DEF_TYPE (use_stmt_vinfo)
5653 != vect_double_reduction_def)
5654 continue;
5656 /* Create vector phi node for double reduction:
5657 vs1 = phi <vs0, vs2>
5658 vs1 was created previously in this function by a call to
5659 vect_get_vec_def_for_operand and is stored in
5660 vec_initial_def;
5661 vs2 is defined by INNER_PHI, the vectorized EXIT_PHI;
5662 vs0 is created here. */
5664 /* Create vector phi node. */
5665 vect_phi = create_phi_node (vec_initial_def, bb);
5666 loop_vec_info_for_loop (outer_loop)->add_stmt (vect_phi);
5668 /* Create vs0 - initial def of the double reduction phi. */
5669 preheader_arg = PHI_ARG_DEF_FROM_EDGE (use_stmt,
5670 loop_preheader_edge (outer_loop));
5671 vect_phi_init = get_initial_def_for_reduction
5672 (stmt_info, preheader_arg, NULL);
5674 /* Update phi node arguments with vs0 and vs2. */
5675 add_phi_arg (vect_phi, vect_phi_init,
5676 loop_preheader_edge (outer_loop),
5677 UNKNOWN_LOCATION);
5678 add_phi_arg (vect_phi, PHI_RESULT (inner_phi->stmt),
5679 loop_latch_edge (outer_loop), UNKNOWN_LOCATION);
5680 if (dump_enabled_p ())
5682 dump_printf_loc (MSG_NOTE, vect_location,
5683 "created double reduction phi node: ");
5684 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, vect_phi, 0);
5687 vect_phi_res = PHI_RESULT (vect_phi);
5689 /* Replace the use, i.e., set the correct vs1 in the regular
5690 reduction phi node. FORNOW, NCOPIES is always 1, so the
5691 loop is redundant. */
5692 stmt_vec_info use_info = reduction_phi_info;
5693 for (j = 0; j < ncopies; j++)
5695 edge pr_edge = loop_preheader_edge (loop);
5696 SET_PHI_ARG_DEF (as_a <gphi *> (use_info->stmt),
5697 pr_edge->dest_idx, vect_phi_res);
5698 use_info = STMT_VINFO_RELATED_STMT (use_info);
5704 phis.release ();
5705 if (nested_in_vect_loop)
5707 if (double_reduc)
5708 loop = outer_loop;
5709 else
5710 continue;
5713 phis.create (3);
5714 /* Find the loop-closed-use at the loop exit of the original scalar
5715 result. (The reduction result is expected to have two immediate uses,
5716 one at the latch block, and one at the loop exit). For double
5717 reductions we are looking for exit phis of the outer loop. */
5718 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, scalar_dest)
5720 if (!flow_bb_inside_loop_p (loop, gimple_bb (USE_STMT (use_p))))
5722 if (!is_gimple_debug (USE_STMT (use_p)))
5723 phis.safe_push (USE_STMT (use_p));
5725 else
5727 if (double_reduc && gimple_code (USE_STMT (use_p)) == GIMPLE_PHI)
5729 tree phi_res = PHI_RESULT (USE_STMT (use_p));
5731 FOR_EACH_IMM_USE_FAST (phi_use_p, phi_imm_iter, phi_res)
5733 if (!flow_bb_inside_loop_p (loop,
5734 gimple_bb (USE_STMT (phi_use_p)))
5735 && !is_gimple_debug (USE_STMT (phi_use_p)))
5736 phis.safe_push (USE_STMT (phi_use_p));
5742 FOR_EACH_VEC_ELT (phis, i, exit_phi)
5744 /* Replace the uses: */
5745 orig_name = PHI_RESULT (exit_phi);
5746 scalar_result = scalar_results[k];
5747 FOR_EACH_IMM_USE_STMT (use_stmt, imm_iter, orig_name)
5748 FOR_EACH_IMM_USE_ON_STMT (use_p, imm_iter)
5749 SET_USE (use_p, scalar_result);
5752 phis.release ();
5756 /* Return a vector of type VECTYPE that is equal to the vector select
5757 operation "MASK ? VEC : IDENTITY". Insert the select statements
5758 before GSI. */
5760 static tree
5761 merge_with_identity (gimple_stmt_iterator *gsi, tree mask, tree vectype,
5762 tree vec, tree identity)
5764 tree cond = make_temp_ssa_name (vectype, NULL, "cond");
5765 gimple *new_stmt = gimple_build_assign (cond, VEC_COND_EXPR,
5766 mask, vec, identity);
5767 gsi_insert_before (gsi, new_stmt, GSI_SAME_STMT);
5768 return cond;
5771 /* Successively apply CODE to each element of VECTOR_RHS, in left-to-right
5772 order, starting with LHS. Insert the extraction statements before GSI and
5773 associate the new scalar SSA names with variable SCALAR_DEST.
5774 Return the SSA name for the result. */
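/* E.g. (illustrative) for a four-element VECTOR_RHS the expansion is

       s = CODE <LHS, v[0]>
       s = CODE <s, v[1]>
       s = CODE <s, v[2]>
       s = CODE <s, v[3]>

   and the final S is returned.  */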
5776 static tree
5777 vect_expand_fold_left (gimple_stmt_iterator *gsi, tree scalar_dest,
5778 tree_code code, tree lhs, tree vector_rhs)
5780 tree vectype = TREE_TYPE (vector_rhs);
5781 tree scalar_type = TREE_TYPE (vectype);
5782 tree bitsize = TYPE_SIZE (scalar_type);
5783 unsigned HOST_WIDE_INT vec_size_in_bits = tree_to_uhwi (TYPE_SIZE (vectype));
5784 unsigned HOST_WIDE_INT element_bitsize = tree_to_uhwi (bitsize);
5786 for (unsigned HOST_WIDE_INT bit_offset = 0;
5787 bit_offset < vec_size_in_bits;
5788 bit_offset += element_bitsize)
5790 tree bitpos = bitsize_int (bit_offset);
5791 tree rhs = build3 (BIT_FIELD_REF, scalar_type, vector_rhs,
5792 bitsize, bitpos);
5794 gassign *stmt = gimple_build_assign (scalar_dest, rhs);
5795 rhs = make_ssa_name (scalar_dest, stmt);
5796 gimple_assign_set_lhs (stmt, rhs);
5797 gsi_insert_before (gsi, stmt, GSI_SAME_STMT);
5799 stmt = gimple_build_assign (scalar_dest, code, lhs, rhs);
5800 tree new_name = make_ssa_name (scalar_dest, stmt);
5801 gimple_assign_set_lhs (stmt, new_name);
5802 gsi_insert_before (gsi, stmt, GSI_SAME_STMT);
5803 lhs = new_name;
5805 return lhs;
5808 /* Perform an in-order reduction (FOLD_LEFT_REDUCTION). STMT_INFO is the
5809 statement that sets the live-out value. REDUC_DEF_STMT is the phi
5810 statement. CODE is the operation performed by STMT_INFO and OPS are
5811 its scalar operands. REDUC_INDEX is the index of the operand in
5812 OPS that is set by REDUC_DEF_STMT. REDUC_FN is the function that
5813 implements in-order reduction, or IFN_LAST if we should open-code it.
5814 VECTYPE_IN is the type of the vector input. MASKS specifies the masks
5815 that should be used to control the operation in a fully-masked loop. */
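/* A sketch (illustrative only) of what is generated per vector operand
   when REDUC_FN is available, here assuming an in-order float addition
   in a fully-masked loop:

       vdef = VEC_COND_EXPR <loop_mask, vdef, { 0.0, ... }>;
       reduc_var = .FOLD_LEFT_PLUS (reduc_var, vdef);

   When REDUC_FN is IFN_LAST the reduction is instead open-coded one
   element at a time via vect_expand_fold_left.  */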
5817 static bool
5818 vectorize_fold_left_reduction (stmt_vec_info stmt_info,
5819 gimple_stmt_iterator *gsi,
5820 stmt_vec_info *vec_stmt, slp_tree slp_node,
5821 gimple *reduc_def_stmt,
5822 tree_code code, internal_fn reduc_fn,
5823 tree ops[3], tree vectype_in,
5824 int reduc_index, vec_loop_masks *masks)
5826 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
5827 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
5828 tree vectype_out = STMT_VINFO_VECTYPE (stmt_info);
5829 stmt_vec_info new_stmt_info = NULL;
5831 int ncopies;
5832 if (slp_node)
5833 ncopies = 1;
5834 else
5835 ncopies = vect_get_num_copies (loop_vinfo, vectype_in);
5837 gcc_assert (!nested_in_vect_loop_p (loop, stmt_info));
5838 gcc_assert (ncopies == 1);
5839 gcc_assert (TREE_CODE_LENGTH (code) == binary_op);
5840 gcc_assert (reduc_index == (code == MINUS_EXPR ? 0 : 1));
5841 gcc_assert (STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info)
5842 == FOLD_LEFT_REDUCTION);
5844 if (slp_node)
5845 gcc_assert (known_eq (TYPE_VECTOR_SUBPARTS (vectype_out),
5846 TYPE_VECTOR_SUBPARTS (vectype_in)));
5848 tree op0 = ops[1 - reduc_index];
5850 int group_size = 1;
5851 stmt_vec_info scalar_dest_def_info;
5852 auto_vec<tree> vec_oprnds0;
5853 if (slp_node)
5855 vect_get_vec_defs (op0, NULL_TREE, stmt_info, &vec_oprnds0, NULL,
5856 slp_node);
5857 group_size = SLP_TREE_SCALAR_STMTS (slp_node).length ();
5858 scalar_dest_def_info = SLP_TREE_SCALAR_STMTS (slp_node)[group_size - 1];
5860 else
5862 tree loop_vec_def0 = vect_get_vec_def_for_operand (op0, stmt_info);
5863 vec_oprnds0.create (1);
5864 vec_oprnds0.quick_push (loop_vec_def0);
5865 scalar_dest_def_info = stmt_info;
5868 tree scalar_dest = gimple_assign_lhs (scalar_dest_def_info->stmt);
5869 tree scalar_type = TREE_TYPE (scalar_dest);
5870 tree reduc_var = gimple_phi_result (reduc_def_stmt);
5872 int vec_num = vec_oprnds0.length ();
5873 gcc_assert (vec_num == 1 || slp_node);
5874 tree vec_elem_type = TREE_TYPE (vectype_out);
5875 gcc_checking_assert (useless_type_conversion_p (scalar_type, vec_elem_type));
5877 tree vector_identity = NULL_TREE;
5878 if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
5879 vector_identity = build_zero_cst (vectype_out);
5881 tree scalar_dest_var = vect_create_destination_var (scalar_dest, NULL);
5882 int i;
5883 tree def0;
5884 FOR_EACH_VEC_ELT (vec_oprnds0, i, def0)
5886 gimple *new_stmt;
5887 tree mask = NULL_TREE;
5888 if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
5889 mask = vect_get_loop_mask (gsi, masks, vec_num, vectype_in, i);
5891 /* Handle MINUS by adding the negative. */
5892 if (reduc_fn != IFN_LAST && code == MINUS_EXPR)
5894 tree negated = make_ssa_name (vectype_out);
5895 new_stmt = gimple_build_assign (negated, NEGATE_EXPR, def0);
5896 gsi_insert_before (gsi, new_stmt, GSI_SAME_STMT);
5897 def0 = negated;
5900 if (mask)
5901 def0 = merge_with_identity (gsi, mask, vectype_out, def0,
5902 vector_identity);
5904 /* On the first iteration the input is simply the scalar phi
5905 result, and for subsequent iterations it is the output of
5906 the preceding operation. */
5907 if (reduc_fn != IFN_LAST)
5909 new_stmt = gimple_build_call_internal (reduc_fn, 2, reduc_var, def0);
5910 /* For chained SLP reductions the output of the previous reduction
5911 operation serves as the input of the next. For the final statement
5912 the output cannot be a temporary - we reuse the original
5913 scalar destination of the last statement. */
5914 if (i != vec_num - 1)
5916 gimple_set_lhs (new_stmt, scalar_dest_var);
5917 reduc_var = make_ssa_name (scalar_dest_var, new_stmt);
5918 gimple_set_lhs (new_stmt, reduc_var);
5921 else
5923 reduc_var = vect_expand_fold_left (gsi, scalar_dest_var, code,
5924 reduc_var, def0);
5925 new_stmt = SSA_NAME_DEF_STMT (reduc_var);
5926 /* Remove the statement, so that we can use the same code paths
5927 as for statements that we've just created. */
5928 gimple_stmt_iterator tmp_gsi = gsi_for_stmt (new_stmt);
5929 gsi_remove (&tmp_gsi, false);
5932 if (i == vec_num - 1)
5934 gimple_set_lhs (new_stmt, scalar_dest);
5935 new_stmt_info = vect_finish_replace_stmt (scalar_dest_def_info,
5936 new_stmt);
5938 else
5939 new_stmt_info = vect_finish_stmt_generation (scalar_dest_def_info,
5940 new_stmt, gsi);
5942 if (slp_node)
5943 SLP_TREE_VEC_STMTS (slp_node).quick_push (new_stmt_info);
5946 if (!slp_node)
5947 STMT_VINFO_VEC_STMT (stmt_info) = *vec_stmt = new_stmt_info;
5949 return true;
5952 /* Function is_nonwrapping_integer_induction.
5954 Check if STMT_VINFO (which is part of loop LOOP) describes an integer
5955 induction that increments and does not cause overflow. */
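/* Put differently (illustrative): with NI the maximum number of
   iterations of LOOP, the function checks that

       BASE + STEP * NI

   computed in infinite precision still fits the precision of the
   induction's type (or that overflow is undefined for that type).  */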
5957 static bool
5958 is_nonwrapping_integer_induction (stmt_vec_info stmt_vinfo, struct loop *loop)
5960 gphi *phi = as_a <gphi *> (stmt_vinfo->stmt);
5961 tree base = STMT_VINFO_LOOP_PHI_EVOLUTION_BASE_UNCHANGED (stmt_vinfo);
5962 tree step = STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_vinfo);
5963 tree lhs_type = TREE_TYPE (gimple_phi_result (phi));
5964 widest_int ni, max_loop_value, lhs_max;
5965 wi::overflow_type overflow = wi::OVF_NONE;
5967 /* Make sure the loop is integer based. */
5968 if (TREE_CODE (base) != INTEGER_CST
5969 || TREE_CODE (step) != INTEGER_CST)
5970 return false;
5972 /* Check that the max size of the loop will not wrap. */
5974 if (TYPE_OVERFLOW_UNDEFINED (lhs_type))
5975 return true;
5977 if (! max_stmt_executions (loop, &ni))
5978 return false;
5980 max_loop_value = wi::mul (wi::to_widest (step), ni, TYPE_SIGN (lhs_type),
5981 &overflow);
5982 if (overflow)
5983 return false;
5985 max_loop_value = wi::add (wi::to_widest (base), max_loop_value,
5986 TYPE_SIGN (lhs_type), &overflow);
5987 if (overflow)
5988 return false;
5990 return (wi::min_precision (max_loop_value, TYPE_SIGN (lhs_type))
5991 <= TYPE_PRECISION (lhs_type));
5994 /* Function vectorizable_reduction.
5996 Check if STMT_INFO performs a reduction operation that can be vectorized.
5997 If VEC_STMT is also passed, vectorize STMT_INFO: create a vectorized
5998 stmt to replace it, put it in VEC_STMT, and insert it at GSI.
5999 Return true if STMT_INFO is vectorizable in this way.
6001 This function also handles reduction idioms (patterns) that have been
6002 recognized in advance during vect_pattern_recog. In this case, STMT_INFO
6003 may be of this form:
6004 X = pattern_expr (arg0, arg1, ..., X)
6005 and its STMT_VINFO_RELATED_STMT points to the last stmt in the original
6006 sequence that had been detected and replaced by the pattern-stmt
6007 (STMT_INFO).
6009 This function also handles reduction of condition expressions, for example:
6010 for (int i = 0; i < N; i++)
6011 if (a[i] < value)
6012 last = a[i];
6013 This is handled by vectorising the loop and creating an additional vector
6014 containing the loop indexes for which "a[i] < value" was true. In the
6015 function epilogue this is reduced to a single max value and then used to
6016 index into the vector of results.
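   For example (an illustrative sketch, not the exact code shape), with a
   vectorization factor of 4 the loop body also maintains an induction
   vector of 1-based iteration indexes and, per lane, keeps the last index
   for which "a[i] < value" held.  The epilogue applies a REDUC_MAX to that
   index vector and uses the resulting position to pick the matching
   element of the data vector, index 0 meaning that no iteration matched.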
6018 In some cases of reduction patterns, the type of the reduction variable X is
6019 different than the type of the other arguments of STMT_INFO.
6020 In such cases, the vectype that is used when transforming STMT_INFO into
6021 a vector stmt is different than the vectype that is used to determine the
6022 vectorization factor, because it consists of a different number of elements
6023 than the actual number of elements that are being operated upon in parallel.
6025 For example, consider an accumulation of shorts into an int accumulator.
6026 On some targets it's possible to vectorize this pattern operating on 8
6027 shorts at a time (hence, the vectype for purposes of determining the
6028 vectorization factor should be V8HI); on the other hand, the vectype that
6029 is used to create the vector form is actually V4SI (the type of the result).
6031 Upon entry to this function, STMT_VINFO_VECTYPE records the vectype that
6032 indicates the actual level of parallelism (V8HI in the example), so that
6033 the right vectorization factor is derived. This vectype
6034 corresponds to the type of arguments to the reduction stmt, and should *NOT*
6035 be used to create the vectorized stmt. The right vectype for the vectorized
6036 stmt is obtained from the type of the result X:
6037 get_vectype_for_scalar_type (TREE_TYPE (X))
6039 This means that, contrary to "regular" reductions (or "regular" stmts in
6040 general), the following equation:
6041 STMT_VINFO_VECTYPE == get_vectype_for_scalar_type (TREE_TYPE (X))
6042 does *NOT* necessarily hold for reduction patterns. */
6044 bool
6045 vectorizable_reduction (stmt_vec_info stmt_info, gimple_stmt_iterator *gsi,
6046 stmt_vec_info *vec_stmt, slp_tree slp_node,
6047 slp_instance slp_node_instance,
6048 stmt_vector_for_cost *cost_vec)
6050 tree vec_dest;
6051 tree scalar_dest;
6052 tree vectype_out = STMT_VINFO_VECTYPE (stmt_info);
6053 tree vectype_in = NULL_TREE;
6054 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
6055 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
6056 enum tree_code code, orig_code;
6057 internal_fn reduc_fn;
6058 machine_mode vec_mode;
6059 int op_type;
6060 optab optab;
6061 tree new_temp = NULL_TREE;
6062 enum vect_def_type dt, cond_reduc_dt = vect_unknown_def_type;
6063 stmt_vec_info cond_stmt_vinfo = NULL;
6064 enum tree_code cond_reduc_op_code = ERROR_MARK;
6065 tree scalar_type;
6066 bool is_simple_use;
6067 int i;
6068 int ncopies;
6069 int epilog_copies;
6070 stmt_vec_info prev_stmt_info, prev_phi_info;
6071 bool single_defuse_cycle = false;
6072 stmt_vec_info new_stmt_info = NULL;
6073 int j;
6074 tree ops[3];
6075 enum vect_def_type dts[3];
6076 bool nested_cycle = false, found_nested_cycle_def = false;
6077 bool double_reduc = false;
6078 basic_block def_bb;
6079 struct loop * def_stmt_loop;
6080 tree def_arg;
6081 auto_vec<tree> vec_oprnds0;
6082 auto_vec<tree> vec_oprnds1;
6083 auto_vec<tree> vec_oprnds2;
6084 auto_vec<tree> vect_defs;
6085 auto_vec<stmt_vec_info> phis;
6086 int vec_num;
6087 tree def0, tem;
6088 tree cr_index_scalar_type = NULL_TREE, cr_index_vector_type = NULL_TREE;
6089 tree cond_reduc_val = NULL_TREE;
6091 /* Make sure it was already recognized as a reduction computation. */
6092 if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_reduction_def
6093 && STMT_VINFO_DEF_TYPE (stmt_info) != vect_nested_cycle)
6094 return false;
6096 if (nested_in_vect_loop_p (loop, stmt_info))
6098 loop = loop->inner;
6099 nested_cycle = true;
6102 if (REDUC_GROUP_FIRST_ELEMENT (stmt_info))
6103 gcc_assert (slp_node
6104 && REDUC_GROUP_FIRST_ELEMENT (stmt_info) == stmt_info);
6106 if (gphi *phi = dyn_cast <gphi *> (stmt_info->stmt))
6108 tree phi_result = gimple_phi_result (phi);
6109 /* Analysis is fully done on the reduction stmt invocation. */
6110 if (! vec_stmt)
6112 if (slp_node)
6113 slp_node_instance->reduc_phis = slp_node;
6115 STMT_VINFO_TYPE (stmt_info) = reduc_vec_info_type;
6116 return true;
6119 if (STMT_VINFO_REDUC_TYPE (stmt_info) == FOLD_LEFT_REDUCTION)
6120 /* Leave the scalar phi in place. Note that checking
6121 STMT_VINFO_VEC_REDUCTION_TYPE (as below) only works
6122 for reductions involving a single statement. */
6123 return true;
6125 stmt_vec_info reduc_stmt_info = STMT_VINFO_REDUC_DEF (stmt_info);
6126 if (STMT_VINFO_IN_PATTERN_P (reduc_stmt_info))
6127 reduc_stmt_info = STMT_VINFO_RELATED_STMT (reduc_stmt_info);
6129 if (STMT_VINFO_VEC_REDUCTION_TYPE (reduc_stmt_info)
6130 == EXTRACT_LAST_REDUCTION)
6131 /* Leave the scalar phi in place. */
6132 return true;
6134 gassign *reduc_stmt = as_a <gassign *> (reduc_stmt_info->stmt);
6135 for (unsigned k = 1; k < gimple_num_ops (reduc_stmt); ++k)
6137 tree op = gimple_op (reduc_stmt, k);
6138 if (op == phi_result)
6139 continue;
6140 if (k == 1
6141 && gimple_assign_rhs_code (reduc_stmt) == COND_EXPR)
6142 continue;
6143 if (!vectype_in
6144 || (GET_MODE_SIZE (SCALAR_TYPE_MODE (TREE_TYPE (vectype_in)))
6145 < GET_MODE_SIZE (SCALAR_TYPE_MODE (TREE_TYPE (op)))))
6146 vectype_in = get_vectype_for_scalar_type (TREE_TYPE (op));
6147 break;
6149 gcc_assert (vectype_in);
6151 if (slp_node)
6152 ncopies = 1;
6153 else
6154 ncopies = vect_get_num_copies (loop_vinfo, vectype_in);
6156 stmt_vec_info use_stmt_info;
6157 if (ncopies > 1
6158 && STMT_VINFO_RELEVANT (reduc_stmt_info) <= vect_used_only_live
6159 && (use_stmt_info = loop_vinfo->lookup_single_use (phi_result))
6160 && (use_stmt_info == reduc_stmt_info
6161 || STMT_VINFO_RELATED_STMT (use_stmt_info) == reduc_stmt_info))
6162 single_defuse_cycle = true;
6164 /* Create the destination vector */
6165 scalar_dest = gimple_assign_lhs (reduc_stmt);
6166 vec_dest = vect_create_destination_var (scalar_dest, vectype_out);
6168 if (slp_node)
6169 /* The size vect_schedule_slp_instance computes is off for us. */
6170 vec_num = vect_get_num_vectors
6171 (LOOP_VINFO_VECT_FACTOR (loop_vinfo)
6172 * SLP_TREE_SCALAR_STMTS (slp_node).length (),
6173 vectype_in);
6174 else
6175 vec_num = 1;
6177 /* Generate the reduction PHIs upfront. */
6178 prev_phi_info = NULL;
6179 for (j = 0; j < ncopies; j++)
6181 if (j == 0 || !single_defuse_cycle)
6183 for (i = 0; i < vec_num; i++)
6185 /* Create the reduction-phi that defines the reduction
6186 operand. */
6187 gimple *new_phi = create_phi_node (vec_dest, loop->header);
6188 stmt_vec_info new_phi_info = loop_vinfo->add_stmt (new_phi);
6190 if (slp_node)
6191 SLP_TREE_VEC_STMTS (slp_node).quick_push (new_phi_info);
6192 else
6194 if (j == 0)
6195 STMT_VINFO_VEC_STMT (stmt_info)
6196 = *vec_stmt = new_phi_info;
6197 else
6198 STMT_VINFO_RELATED_STMT (prev_phi_info) = new_phi_info;
6199 prev_phi_info = new_phi_info;
6205 return true;
6208 /* 1. Is vectorizable reduction? */
6209 /* Not supportable if the reduction variable is used in the loop, unless
6210 it's a reduction chain. */
6211 if (STMT_VINFO_RELEVANT (stmt_info) > vect_used_in_outer
6212 && !REDUC_GROUP_FIRST_ELEMENT (stmt_info))
6213 return false;
6215 /* Reductions that are not used even in an enclosing outer-loop
6216 are expected to be "live" (used out of the loop). */
6217 if (STMT_VINFO_RELEVANT (stmt_info) == vect_unused_in_scope
6218 && !STMT_VINFO_LIVE_P (stmt_info))
6219 return false;
6221 /* 2. Has this been recognized as a reduction pattern?
6223 Check if STMT represents a pattern that has been recognized
6224 in earlier analysis stages. For stmts that represent a pattern,
6225 the STMT_VINFO_RELATED_STMT field records the last stmt in
6226 the original sequence that constitutes the pattern. */
6228 stmt_vec_info orig_stmt_info = STMT_VINFO_RELATED_STMT (stmt_info);
6229 if (orig_stmt_info)
6231 gcc_assert (STMT_VINFO_IN_PATTERN_P (orig_stmt_info));
6232 gcc_assert (!STMT_VINFO_IN_PATTERN_P (stmt_info));
6235 /* 3. Check the operands of the operation. The first operands are defined
6236 inside the loop body. The last operand is the reduction variable,
6237 which is defined by the loop-header-phi. */
6239 gassign *stmt = as_a <gassign *> (stmt_info->stmt);
6241 /* Flatten RHS. */
6242 switch (get_gimple_rhs_class (gimple_assign_rhs_code (stmt)))
6244 case GIMPLE_BINARY_RHS:
6245 code = gimple_assign_rhs_code (stmt);
6246 op_type = TREE_CODE_LENGTH (code);
6247 gcc_assert (op_type == binary_op);
6248 ops[0] = gimple_assign_rhs1 (stmt);
6249 ops[1] = gimple_assign_rhs2 (stmt);
6250 break;
6252 case GIMPLE_TERNARY_RHS:
6253 code = gimple_assign_rhs_code (stmt);
6254 op_type = TREE_CODE_LENGTH (code);
6255 gcc_assert (op_type == ternary_op);
6256 ops[0] = gimple_assign_rhs1 (stmt);
6257 ops[1] = gimple_assign_rhs2 (stmt);
6258 ops[2] = gimple_assign_rhs3 (stmt);
6259 break;
6261 case GIMPLE_UNARY_RHS:
6262 return false;
6264 default:
6265 gcc_unreachable ();
6268 if (code == COND_EXPR && slp_node)
6269 return false;
6271 scalar_dest = gimple_assign_lhs (stmt);
6272 scalar_type = TREE_TYPE (scalar_dest);
6273 if (!POINTER_TYPE_P (scalar_type) && !INTEGRAL_TYPE_P (scalar_type)
6274 && !SCALAR_FLOAT_TYPE_P (scalar_type))
6275 return false;
6277 /* Do not try to vectorize bit-precision reductions. */
6278 if (!type_has_mode_precision_p (scalar_type))
6279 return false;
6281 /* All uses but the last are expected to be defined in the loop.
6282 The last use is the reduction variable. In the case of a nested cycle this
6283 assumption is not true: we use reduc_index to record the index of the
6284 reduction variable. */
6285 stmt_vec_info reduc_def_info = NULL;
6286 int reduc_index = -1;
6287 for (i = 0; i < op_type; i++)
6289 /* The condition of COND_EXPR is checked in vectorizable_condition(). */
6290 if (i == 0 && code == COND_EXPR)
6291 continue;
6293 stmt_vec_info def_stmt_info;
6294 is_simple_use = vect_is_simple_use (ops[i], loop_vinfo, &dts[i], &tem,
6295 &def_stmt_info);
6296 dt = dts[i];
6297 gcc_assert (is_simple_use);
6298 if (dt == vect_reduction_def)
6300 reduc_def_info = def_stmt_info;
6301 reduc_index = i;
6302 continue;
6304 else if (tem)
6306 /* To properly compute ncopies we are interested in the widest
6307 input type in case we're looking at a widening accumulation. */
6308 if (!vectype_in
6309 || (GET_MODE_SIZE (SCALAR_TYPE_MODE (TREE_TYPE (vectype_in)))
6310 < GET_MODE_SIZE (SCALAR_TYPE_MODE (TREE_TYPE (tem)))))
6311 vectype_in = tem;
6314 if (dt != vect_internal_def
6315 && dt != vect_external_def
6316 && dt != vect_constant_def
6317 && dt != vect_induction_def
6318 && !(dt == vect_nested_cycle && nested_cycle))
6319 return false;
6321 if (dt == vect_nested_cycle)
6323 found_nested_cycle_def = true;
6324 reduc_def_info = def_stmt_info;
6325 reduc_index = i;
6328 if (i == 1 && code == COND_EXPR)
6330 /* Record how value of COND_EXPR is defined. */
6331 if (dt == vect_constant_def)
6333 cond_reduc_dt = dt;
6334 cond_reduc_val = ops[i];
6336 if (dt == vect_induction_def
6337 && def_stmt_info
6338 && is_nonwrapping_integer_induction (def_stmt_info, loop))
6340 cond_reduc_dt = dt;
6341 cond_stmt_vinfo = def_stmt_info;
6346 if (!vectype_in)
6347 vectype_in = vectype_out;
6349 /* When vectorizing a reduction chain w/o SLP, the reduction PHI is not
6350 directly used in the stmt. */
6351 if (reduc_index == -1)
6353 if (STMT_VINFO_REDUC_TYPE (stmt_info) == FOLD_LEFT_REDUCTION)
6355 if (dump_enabled_p ())
6356 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6357 "in-order reduction chain without SLP.\n");
6358 return false;
6361 if (orig_stmt_info)
6362 reduc_def_info = STMT_VINFO_REDUC_DEF (orig_stmt_info);
6363 else
6364 reduc_def_info = STMT_VINFO_REDUC_DEF (stmt_info);
6367 if (! reduc_def_info)
6368 return false;
6370 gphi *reduc_def_phi = dyn_cast <gphi *> (reduc_def_info->stmt);
6371 if (!reduc_def_phi)
6372 return false;
6374 if (!(reduc_index == -1
6375 || dts[reduc_index] == vect_reduction_def
6376 || dts[reduc_index] == vect_nested_cycle
6377 || ((dts[reduc_index] == vect_internal_def
6378 || dts[reduc_index] == vect_external_def
6379 || dts[reduc_index] == vect_constant_def
6380 || dts[reduc_index] == vect_induction_def)
6381 && nested_cycle && found_nested_cycle_def)))
6383 /* For pattern recognized stmts, orig_stmt might be a reduction,
6384 but some helper statements for the pattern might not, or
6385 might be COND_EXPRs with reduction uses in the condition. */
6386 gcc_assert (orig_stmt_info);
6387 return false;
6390 /* PHIs should not participate in patterns. */
6391 gcc_assert (!STMT_VINFO_RELATED_STMT (reduc_def_info));
6392 enum vect_reduction_type v_reduc_type
6393 = STMT_VINFO_REDUC_TYPE (reduc_def_info);
6394 stmt_vec_info tmp = STMT_VINFO_REDUC_DEF (reduc_def_info);
6396 STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info) = v_reduc_type;
6397 /* If we have a condition reduction, see if we can simplify it further. */
6398 if (v_reduc_type == COND_REDUCTION)
6400 /* TODO: We can't yet handle reduction chains, since we need to treat
6401 each COND_EXPR in the chain specially, not just the last one.
6402 E.g. for:
6404 x_1 = PHI <x_3, ...>
6405 x_2 = a_2 ? ... : x_1;
6406 x_3 = a_3 ? ... : x_2;
6408 we're interested in the last element in x_3 for which a_2 || a_3
6409 is true, whereas the current reduction chain handling would
6410 vectorize x_2 as a normal VEC_COND_EXPR and only treat x_3
6411 as a reduction operation. */
6412 if (reduc_index == -1)
6414 if (dump_enabled_p ())
6415 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6416 "conditional reduction chains not supported\n");
6417 return false;
6420 /* vect_is_simple_reduction ensured that operand 2 is the
6421 loop-carried operand. */
6422 gcc_assert (reduc_index == 2);
6424 /* Loop peeling modifies the initial value of the reduction PHI, which
6425 makes the reduction stmt to be transformed different from the
6426 original stmt that was analyzed. We therefore record the reduction
6427 code for a CONST_COND_REDUCTION type reduction at the analysis stage,
6428 so that it can be used directly at the transform stage. */
6429 if (STMT_VINFO_VEC_CONST_COND_REDUC_CODE (stmt_info) == MAX_EXPR
6430 || STMT_VINFO_VEC_CONST_COND_REDUC_CODE (stmt_info) == MIN_EXPR)
6432 /* Also set the reduction type to CONST_COND_REDUCTION. */
6433 gcc_assert (cond_reduc_dt == vect_constant_def);
6434 STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info) = CONST_COND_REDUCTION;
6436 else if (direct_internal_fn_supported_p (IFN_FOLD_EXTRACT_LAST,
6437 vectype_in, OPTIMIZE_FOR_SPEED))
6439 if (dump_enabled_p ())
6440 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6441 "optimizing condition reduction with"
6442 " FOLD_EXTRACT_LAST.\n");
6443 STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info) = EXTRACT_LAST_REDUCTION;
6445 else if (cond_reduc_dt == vect_induction_def)
6447 tree base
6448 = STMT_VINFO_LOOP_PHI_EVOLUTION_BASE_UNCHANGED (cond_stmt_vinfo);
6449 tree step = STMT_VINFO_LOOP_PHI_EVOLUTION_PART (cond_stmt_vinfo);
6451 gcc_assert (TREE_CODE (base) == INTEGER_CST
6452 && TREE_CODE (step) == INTEGER_CST);
6453 cond_reduc_val = NULL_TREE;
6454 /* Find a suitable value: below BASE for MAX_EXPR, above BASE for
6455 MIN_EXPR; for now punt if BASE is the minimum value of the type
6456 for MAX_EXPR, or the maximum value of the type for MIN_EXPR. */
6457 if (tree_int_cst_sgn (step) == -1)
6459 cond_reduc_op_code = MIN_EXPR;
6460 if (tree_int_cst_sgn (base) == -1)
6461 cond_reduc_val = build_int_cst (TREE_TYPE (base), 0);
6462 else if (tree_int_cst_lt (base,
6463 TYPE_MAX_VALUE (TREE_TYPE (base))))
6464 cond_reduc_val
6465 = int_const_binop (PLUS_EXPR, base, integer_one_node);
6467 else
6469 cond_reduc_op_code = MAX_EXPR;
6470 if (tree_int_cst_sgn (base) == 1)
6471 cond_reduc_val = build_int_cst (TREE_TYPE (base), 0);
6472 else if (tree_int_cst_lt (TYPE_MIN_VALUE (TREE_TYPE (base)),
6473 base))
6474 cond_reduc_val
6475 = int_const_binop (MINUS_EXPR, base, integer_one_node);
6477 if (cond_reduc_val)
6479 if (dump_enabled_p ())
6480 dump_printf_loc (MSG_NOTE, vect_location,
6481 "condition expression based on "
6482 "integer induction.\n");
6483 STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info)
6484 = INTEGER_INDUC_COND_REDUCTION;
6487 else if (cond_reduc_dt == vect_constant_def)
6489 enum vect_def_type cond_initial_dt;
6490 gimple *def_stmt = SSA_NAME_DEF_STMT (ops[reduc_index]);
6491 tree cond_initial_val
6492 = PHI_ARG_DEF_FROM_EDGE (def_stmt, loop_preheader_edge (loop));
6494 gcc_assert (cond_reduc_val != NULL_TREE);
6495 vect_is_simple_use (cond_initial_val, loop_vinfo, &cond_initial_dt);
6496 if (cond_initial_dt == vect_constant_def
6497 && types_compatible_p (TREE_TYPE (cond_initial_val),
6498 TREE_TYPE (cond_reduc_val)))
6500 tree e = fold_binary (LE_EXPR, boolean_type_node,
6501 cond_initial_val, cond_reduc_val);
6502 if (e && (integer_onep (e) || integer_zerop (e)))
6504 if (dump_enabled_p ())
6505 dump_printf_loc (MSG_NOTE, vect_location,
6506 "condition expression based on "
6507 "compile time constant.\n");
6508 /* Record reduction code at analysis stage. */
6509 STMT_VINFO_VEC_CONST_COND_REDUC_CODE (stmt_info)
6510 = integer_onep (e) ? MAX_EXPR : MIN_EXPR;
6511 STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info)
6512 = CONST_COND_REDUCTION;
6518 if (orig_stmt_info)
6519 gcc_assert (tmp == orig_stmt_info
6520 || REDUC_GROUP_FIRST_ELEMENT (tmp) == orig_stmt_info);
6521 else
6522 /* We changed STMT to be the first stmt in reduction chain, hence we
6523 check that in this case the first element in the chain is STMT. */
6524 gcc_assert (tmp == stmt_info
6525 || REDUC_GROUP_FIRST_ELEMENT (tmp) == stmt_info);
6527 if (STMT_VINFO_LIVE_P (reduc_def_info))
6528 return false;
6530 if (slp_node)
6531 ncopies = 1;
6532 else
6533 ncopies = vect_get_num_copies (loop_vinfo, vectype_in);
6535 gcc_assert (ncopies >= 1);
6537 vec_mode = TYPE_MODE (vectype_in);
6538 poly_uint64 nunits_out = TYPE_VECTOR_SUBPARTS (vectype_out);
6540 if (code == COND_EXPR)
6542 /* Only call during the analysis stage, otherwise we'll lose
6543 STMT_VINFO_TYPE. */
6544 if (!vec_stmt && !vectorizable_condition (stmt_info, gsi, NULL,
6545 ops[reduc_index], 0, NULL,
6546 cost_vec))
6548 if (dump_enabled_p ())
6549 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6550 "unsupported condition in reduction\n");
6551 return false;
6554 else
6556 /* 4. Supportable by target? */
6558 if (code == LSHIFT_EXPR || code == RSHIFT_EXPR
6559 || code == LROTATE_EXPR || code == RROTATE_EXPR)
6561 /* Shifts and rotates are only supported by vectorizable_shift,
6562 not by vectorizable_reduction. */
6563 if (dump_enabled_p ())
6564 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6565 "unsupported shift or rotation.\n");
6566 return false;
6569 /* 4.1. check support for the operation in the loop */
6570 optab = optab_for_tree_code (code, vectype_in, optab_default);
6571 if (!optab)
6573 if (dump_enabled_p ())
6574 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6575 "no optab.\n");
6577 return false;
6580 if (optab_handler (optab, vec_mode) == CODE_FOR_nothing)
6582 if (dump_enabled_p ())
6583 dump_printf (MSG_NOTE, "op not supported by target.\n");
6585 if (maybe_ne (GET_MODE_SIZE (vec_mode), UNITS_PER_WORD)
6586 || !vect_worthwhile_without_simd_p (loop_vinfo, code))
6587 return false;
6589 if (dump_enabled_p ())
6590 dump_printf (MSG_NOTE, "proceeding using word mode.\n");
6593 /* Worthwhile without SIMD support? */
6594 if (!VECTOR_MODE_P (TYPE_MODE (vectype_in))
6595 && !vect_worthwhile_without_simd_p (loop_vinfo, code))
6597 if (dump_enabled_p ())
6598 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6599 "not worthwhile without SIMD support.\n");
6601 return false;
6605 /* 4.2. Check support for the epilog operation.
6607 If STMT represents a reduction pattern, then the type of the
6608 reduction variable may be different than the type of the rest
6609 of the arguments. For example, consider the case of accumulation
6610 of shorts into an int accumulator; The original code:
6611 S1: int_a = (int) short_a;
6612 orig_stmt-> S2: int_acc = plus <int_a ,int_acc>;
6614 was replaced with:
6615 STMT: int_acc = widen_sum <short_a, int_acc>
6617 This means that:
6618 1. The tree-code that is used to create the vector operation in the
6619 epilog code (that reduces the partial results) is not the
6620 tree-code of STMT, but is rather the tree-code of the original
6621 stmt from the pattern that STMT is replacing. I.e, in the example
6622 above we want to use 'widen_sum' in the loop, but 'plus' in the
6623 epilog.
6624 2. The type (mode) we use to check available target support
6625 for the vector operation to be created in the *epilog*, is
6626 determined by the type of the reduction variable (in the example
6627 above we'd check this: optab_handler (plus_optab, vect_int_mode)).
6628 However the type (mode) we use to check available target support
6629 for the vector operation to be created *inside the loop*, is
6630 determined by the type of the other arguments to STMT (in the
6631 example we'd check this: optab_handler (widen_sum_optab,
6632 vect_short_mode)).
6634 This is contrary to "regular" reductions, in which the types of all
6635 the arguments are the same as the type of the reduction variable.
6636 For "regular" reductions we can therefore use the same vector type
6637 (and also the same tree-code) when generating the epilog code and
6638 when generating the code inside the loop. */
6640 vect_reduction_type reduction_type
6641 = STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info);
6642 if (orig_stmt_info
6643 && (reduction_type == TREE_CODE_REDUCTION
6644 || reduction_type == FOLD_LEFT_REDUCTION))
6646 /* This is a reduction pattern: get the vectype from the type of the
6647 reduction variable, and get the tree-code from orig_stmt. */
6648 orig_code = gimple_assign_rhs_code (orig_stmt_info->stmt);
6649 gcc_assert (vectype_out);
6650 vec_mode = TYPE_MODE (vectype_out);
6652 else
6654 /* Regular reduction: the same vectype and tree-code that are used for
6655 the vector code inside the loop can also be used for the epilog code. */
6656 orig_code = code;
6658 if (code == MINUS_EXPR)
6659 orig_code = PLUS_EXPR;
6661 /* For simple condition reductions, replace with the actual expression
6662 we want to base our reduction around. */
6663 if (reduction_type == CONST_COND_REDUCTION)
6665 orig_code = STMT_VINFO_VEC_CONST_COND_REDUC_CODE (stmt_info);
6666 gcc_assert (orig_code == MAX_EXPR || orig_code == MIN_EXPR);
6668 else if (reduction_type == INTEGER_INDUC_COND_REDUCTION)
6669 orig_code = cond_reduc_op_code;
6672 if (nested_cycle)
6674 def_bb = gimple_bb (reduc_def_phi);
6675 def_stmt_loop = def_bb->loop_father;
6676 def_arg = PHI_ARG_DEF_FROM_EDGE (reduc_def_phi,
6677 loop_preheader_edge (def_stmt_loop));
6678 stmt_vec_info def_arg_stmt_info = loop_vinfo->lookup_def (def_arg);
6679 if (def_arg_stmt_info
6680 && (STMT_VINFO_DEF_TYPE (def_arg_stmt_info)
6681 == vect_double_reduction_def))
6682 double_reduc = true;
6685 reduc_fn = IFN_LAST;
6687 if (reduction_type == TREE_CODE_REDUCTION
6688 || reduction_type == FOLD_LEFT_REDUCTION
6689 || reduction_type == INTEGER_INDUC_COND_REDUCTION
6690 || reduction_type == CONST_COND_REDUCTION)
6692 if (reduction_type == FOLD_LEFT_REDUCTION
6693 ? fold_left_reduction_fn (orig_code, &reduc_fn)
6694 : reduction_fn_for_scalar_code (orig_code, &reduc_fn))
6696 if (reduc_fn != IFN_LAST
6697 && !direct_internal_fn_supported_p (reduc_fn, vectype_out,
6698 OPTIMIZE_FOR_SPEED))
6700 if (dump_enabled_p ())
6701 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6702 "reduc op not supported by target.\n");
6704 reduc_fn = IFN_LAST;
6707 else
6709 if (!nested_cycle || double_reduc)
6711 if (dump_enabled_p ())
6712 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6713 "no reduc code for scalar code.\n");
6715 return false;
6719 else if (reduction_type == COND_REDUCTION)
6721 int scalar_precision
6722 = GET_MODE_PRECISION (SCALAR_TYPE_MODE (scalar_type));
6723 cr_index_scalar_type = make_unsigned_type (scalar_precision);
6724 cr_index_vector_type = build_vector_type (cr_index_scalar_type,
6725 nunits_out);
6727 if (direct_internal_fn_supported_p (IFN_REDUC_MAX, cr_index_vector_type,
6728 OPTIMIZE_FOR_SPEED))
6729 reduc_fn = IFN_REDUC_MAX;
6732 if (reduction_type != EXTRACT_LAST_REDUCTION
6733 && reduc_fn == IFN_LAST
6734 && !nunits_out.is_constant ())
6736 if (dump_enabled_p ())
6737 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6738 "missing target support for reduction on"
6739 " variable-length vectors.\n");
6740 return false;
6743 if ((double_reduc || reduction_type != TREE_CODE_REDUCTION)
6744 && ncopies > 1)
6746 if (dump_enabled_p ())
6747 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6748 "multiple types in double reduction or condition "
6749 "reduction.\n");
6750 return false;
6753 /* For SLP reductions, see if there is a neutral value we can use. */
6754 tree neutral_op = NULL_TREE;
6755 if (slp_node)
6756 neutral_op = neutral_op_for_slp_reduction
6757 (slp_node_instance->reduc_phis, code,
6758 REDUC_GROUP_FIRST_ELEMENT (stmt_info) != NULL_STMT_VEC_INFO);
6760 if (double_reduc && reduction_type == FOLD_LEFT_REDUCTION)
6762 /* We can't support in-order reductions of code such as this:
6764 for (int i = 0; i < n1; ++i)
6765 for (int j = 0; j < n2; ++j)
6766 l += a[j];
6768 since GCC effectively transforms the loop when vectorizing:
6770 for (int i = 0; i < n1 / VF; ++i)
6771 for (int j = 0; j < n2; ++j)
6772 for (int k = 0; k < VF; ++k)
6773 l += a[j];
6775 which is a reassociation of the original operation. */
6776 if (dump_enabled_p ())
6777 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6778 "in-order double reduction not supported.\n");
6780 return false;
6783 if (reduction_type == FOLD_LEFT_REDUCTION
6784 && slp_node
6785 && !REDUC_GROUP_FIRST_ELEMENT (stmt_info))
6787 /* We cannot use in-order reductions in this case because there is
6788 an implicit reassociation of the operations involved. */
6789 if (dump_enabled_p ())
6790 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6791 "in-order unchained SLP reductions not supported.\n");
6792 return false;
6795 /* For double reductions, and for SLP reductions with a neutral value,
6796 we construct a variable-length initial vector by loading a vector
6797 full of the neutral value and then shift-and-inserting the start
6798 values into the low-numbered elements. */
6799 if ((double_reduc || neutral_op)
6800 && !nunits_out.is_constant ()
6801 && !direct_internal_fn_supported_p (IFN_VEC_SHL_INSERT,
6802 vectype_out, OPTIMIZE_FOR_SPEED))
6804 if (dump_enabled_p ())
6805 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6806 "reduction on variable-length vectors requires"
6807 " target support for a vector-shift-and-insert"
6808 " operation.\n");
6809 return false;
6812 /* Check extra constraints for variable-length unchained SLP reductions. */
6813 if (STMT_SLP_TYPE (stmt_info)
6814 && !REDUC_GROUP_FIRST_ELEMENT (stmt_info)
6815 && !nunits_out.is_constant ())
6817 /* We checked above that we could build the initial vector when
6818 there's a neutral element value. Check here for the case in
6819 which each SLP statement has its own initial value and in which
6820 that value needs to be repeated for every instance of the
6821 statement within the initial vector. */
6822 unsigned int group_size = SLP_TREE_SCALAR_STMTS (slp_node).length ();
6823 scalar_mode elt_mode = SCALAR_TYPE_MODE (TREE_TYPE (vectype_out));
6824 if (!neutral_op
6825 && !can_duplicate_and_interleave_p (group_size, elt_mode))
6827 if (dump_enabled_p ())
6828 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6829 "unsupported form of SLP reduction for"
6830 " variable-length vectors: cannot build"
6831 " initial vector.\n");
6832 return false;
6834 /* The epilogue code relies on the number of elements being a multiple
6835 of the group size. The duplicate-and-interleave approach to setting
6836 up the initial vector does too. */
6837 if (!multiple_p (nunits_out, group_size))
6839 if (dump_enabled_p ())
6840 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6841 "unsupported form of SLP reduction for"
6842 " variable-length vectors: the vector size"
6843 " is not a multiple of the number of results.\n");
6844 return false;
6848 /* In the case of a widening multiplication by a constant, we update the type
6849 of the constant to be the type of the other operand. We check that the
6850 constant fits the type in the pattern recognition pass. */
6851 if (code == DOT_PROD_EXPR
6852 && !types_compatible_p (TREE_TYPE (ops[0]), TREE_TYPE (ops[1])))
6854 if (TREE_CODE (ops[0]) == INTEGER_CST)
6855 ops[0] = fold_convert (TREE_TYPE (ops[1]), ops[0]);
6856 else if (TREE_CODE (ops[1]) == INTEGER_CST)
6857 ops[1] = fold_convert (TREE_TYPE (ops[0]), ops[1]);
6858 else
6860 if (dump_enabled_p ())
6861 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6862 "invalid types in dot-prod\n");
6864 return false;
6868 if (reduction_type == COND_REDUCTION)
6870 widest_int ni;
6872 if (! max_loop_iterations (loop, &ni))
6874 if (dump_enabled_p ())
6875 dump_printf_loc (MSG_NOTE, vect_location,
6876 "loop count not known, cannot create cond "
6877 "reduction.\n");
6878 return false;
6880 /* Convert backedges to iterations. */
6881 ni += 1;
6883 /* The additional index will have the same type as the condition. Check
6884 that the iteration count fits into this type less one (because we use
6885 up the zero slot for when there are no matches). */
6886 tree max_index = TYPE_MAX_VALUE (cr_index_scalar_type);
6887 if (wi::geu_p (ni, wi::to_widest (max_index)))
6889 if (dump_enabled_p ())
6890 dump_printf_loc (MSG_NOTE, vect_location,
6891 "loop size is greater than data size.\n");
6892 return false;
6896 /* In case the vectorization factor (VF) is bigger than the number
6897 of elements that we can fit in a vectype (nunits), we have to generate
6898 more than one vector stmt, i.e., we need to "unroll" the
6899 vector stmt by a factor VF/nunits. For more details see documentation
6900 in vectorizable_operation. */
6902 /* If the reduction is used in an outer loop we need to generate
6903 VF intermediate results, like so (e.g. for ncopies=2):
6904 r0 = phi (init, r0)
6905 r1 = phi (init, r1)
6906 r0 = x0 + r0;
6907 r1 = x1 + r1;
6908 (i.e. we generate VF results in 2 registers).
6909 In this case we have a separate def-use cycle for each copy, and therefore
6910 for each copy we get the vector def for the reduction variable from the
6911 respective phi node created for this copy.
6913 Otherwise (the reduction is unused in the loop nest), we can combine
6914 together intermediate results, like so (e.g. for ncopies=2):
6915 r = phi (init, r)
6916 r = x0 + r;
6917 r = x1 + r;
6918 (i.e. we generate VF/2 results in a single register).
6919 In this case for each copy we get the vector def for the reduction variable
6920 from the vectorized reduction operation generated in the previous iteration.
6922 This only works when we see both the reduction PHI and its only consumer
6923 in vectorizable_reduction and there are no intermediate stmts
6924 participating. */
6925 stmt_vec_info use_stmt_info;
6926 tree reduc_phi_result = gimple_phi_result (reduc_def_phi);
6927 if (ncopies > 1
6928 && (STMT_VINFO_RELEVANT (stmt_info) <= vect_used_only_live)
6929 && (use_stmt_info = loop_vinfo->lookup_single_use (reduc_phi_result))
6930 && (use_stmt_info == stmt_info
6931 || STMT_VINFO_RELATED_STMT (use_stmt_info) == stmt_info))
6933 single_defuse_cycle = true;
6934 epilog_copies = 1;
6936 else
6937 epilog_copies = ncopies;
6939 /* If the reduction stmt is one of the patterns that have a lane
6940 reduction embedded, we cannot handle the case of ! single_defuse_cycle.
6941 if ((ncopies > 1
6942 && ! single_defuse_cycle)
6943 && (code == DOT_PROD_EXPR
6944 || code == WIDEN_SUM_EXPR
6945 || code == SAD_EXPR))
6947 if (dump_enabled_p ())
6948 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6949 "multi def-use cycle not possible for lane-reducing "
6950 "reduction operation\n");
6951 return false;
6954 if (slp_node)
6955 vec_num = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node);
6956 else
6957 vec_num = 1;
6959 internal_fn cond_fn = get_conditional_internal_fn (code);
6960 vec_loop_masks *masks = &LOOP_VINFO_MASKS (loop_vinfo);
6962 if (!vec_stmt) /* transformation not required. */
6964 vect_model_reduction_cost (stmt_info, reduc_fn, ncopies, cost_vec);
6965 if (loop_vinfo && LOOP_VINFO_CAN_FULLY_MASK_P (loop_vinfo))
6967 if (reduction_type != FOLD_LEFT_REDUCTION
6968 && (cond_fn == IFN_LAST
6969 || !direct_internal_fn_supported_p (cond_fn, vectype_in,
6970 OPTIMIZE_FOR_SPEED)))
6972 if (dump_enabled_p ())
6973 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6974 "can't use a fully-masked loop because no"
6975 " conditional operation is available.\n");
6976 LOOP_VINFO_CAN_FULLY_MASK_P (loop_vinfo) = false;
6978 else if (reduc_index == -1)
6980 if (dump_enabled_p ())
6981 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6982 "can't use a fully-masked loop for chained"
6983 " reductions.\n");
6984 LOOP_VINFO_CAN_FULLY_MASK_P (loop_vinfo) = false;
6986 else
6987 vect_record_loop_mask (loop_vinfo, masks, ncopies * vec_num,
6988 vectype_in);
6990 if (dump_enabled_p ()
6991 && reduction_type == FOLD_LEFT_REDUCTION)
6992 dump_printf_loc (MSG_NOTE, vect_location,
6993 "using an in-order (fold-left) reduction.\n");
6994 STMT_VINFO_TYPE (stmt_info) = reduc_vec_info_type;
6995 return true;
6998 /* Transform. */
7000 if (dump_enabled_p ())
7001 dump_printf_loc (MSG_NOTE, vect_location, "transform reduction.\n");
7003 /* FORNOW: Multiple types are not supported for condition. */
7004 if (code == COND_EXPR)
7005 gcc_assert (ncopies == 1);
7007 bool masked_loop_p = LOOP_VINFO_FULLY_MASKED_P (loop_vinfo);
7009 if (reduction_type == FOLD_LEFT_REDUCTION)
7010 return vectorize_fold_left_reduction
7011 (stmt_info, gsi, vec_stmt, slp_node, reduc_def_phi, code,
7012 reduc_fn, ops, vectype_in, reduc_index, masks);
7014 if (reduction_type == EXTRACT_LAST_REDUCTION)
7016 gcc_assert (!slp_node);
7017 return vectorizable_condition (stmt_info, gsi, vec_stmt,
7018 NULL, reduc_index, NULL, NULL);
7021 /* Create the destination vector */
7022 vec_dest = vect_create_destination_var (scalar_dest, vectype_out);
7024 prev_stmt_info = NULL;
7025 prev_phi_info = NULL;
7026 if (!slp_node)
7028 vec_oprnds0.create (1);
7029 vec_oprnds1.create (1);
7030 if (op_type == ternary_op)
7031 vec_oprnds2.create (1);
7034 phis.create (vec_num);
7035 vect_defs.create (vec_num);
7036 if (!slp_node)
7037 vect_defs.quick_push (NULL_TREE);
7039 if (slp_node)
7040 phis.splice (SLP_TREE_VEC_STMTS (slp_node_instance->reduc_phis));
7041 else
7042 phis.quick_push (STMT_VINFO_VEC_STMT (reduc_def_info));
7044 for (j = 0; j < ncopies; j++)
7046 if (code == COND_EXPR)
7048 gcc_assert (!slp_node);
7049 vectorizable_condition (stmt_info, gsi, vec_stmt,
7050 PHI_RESULT (phis[0]->stmt),
7051 reduc_index, NULL, NULL);
7052 /* Multiple types are not supported for condition. */
7053 break;
7056 /* Handle uses. */
7057 if (j == 0)
7059 if (slp_node)
7061 /* Get vec defs for all the operands except the reduction index,
7062 ensuring the ordering of the ops in the vector is kept. */
7063 auto_vec<tree, 3> slp_ops;
7064 auto_vec<vec<tree>, 3> vec_defs;
7066 slp_ops.quick_push (ops[0]);
7067 slp_ops.quick_push (ops[1]);
7068 if (op_type == ternary_op)
7069 slp_ops.quick_push (ops[2]);
7071 vect_get_slp_defs (slp_ops, slp_node, &vec_defs);
7073 vec_oprnds0.safe_splice (vec_defs[0]);
7074 vec_defs[0].release ();
7075 vec_oprnds1.safe_splice (vec_defs[1]);
7076 vec_defs[1].release ();
7077 if (op_type == ternary_op)
7079 vec_oprnds2.safe_splice (vec_defs[2]);
7080 vec_defs[2].release ();
7083 else
7085 vec_oprnds0.quick_push
7086 (vect_get_vec_def_for_operand (ops[0], stmt_info));
7087 vec_oprnds1.quick_push
7088 (vect_get_vec_def_for_operand (ops[1], stmt_info));
7089 if (op_type == ternary_op)
7090 vec_oprnds2.quick_push
7091 (vect_get_vec_def_for_operand (ops[2], stmt_info));
7094 else
7096 if (!slp_node)
7098 gcc_assert (reduc_index != -1 || ! single_defuse_cycle);
7100 if (single_defuse_cycle && reduc_index == 0)
7101 vec_oprnds0[0] = gimple_get_lhs (new_stmt_info->stmt);
7102 else
7103 vec_oprnds0[0]
7104 = vect_get_vec_def_for_stmt_copy (loop_vinfo,
7105 vec_oprnds0[0]);
7106 if (single_defuse_cycle && reduc_index == 1)
7107 vec_oprnds1[0] = gimple_get_lhs (new_stmt_info->stmt);
7108 else
7109 vec_oprnds1[0]
7110 = vect_get_vec_def_for_stmt_copy (loop_vinfo,
7111 vec_oprnds1[0]);
7112 if (op_type == ternary_op)
7114 if (single_defuse_cycle && reduc_index == 2)
7115 vec_oprnds2[0] = gimple_get_lhs (new_stmt_info->stmt);
7116 else
7117 vec_oprnds2[0]
7118 = vect_get_vec_def_for_stmt_copy (loop_vinfo,
7119 vec_oprnds2[0]);
7124 FOR_EACH_VEC_ELT (vec_oprnds0, i, def0)
7126 tree vop[3] = { def0, vec_oprnds1[i], NULL_TREE };
7127 if (masked_loop_p)
7129 /* Make sure that the reduction accumulator is vop[0]. */
7130 if (reduc_index == 1)
7132 gcc_assert (commutative_tree_code (code));
7133 std::swap (vop[0], vop[1]);
7135 tree mask = vect_get_loop_mask (gsi, masks, vec_num * ncopies,
7136 vectype_in, i * ncopies + j);
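/* A sketch of the call built below, assuming CODE is PLUS_EXPR so that
   COND_FN would be the conditional add internal function (an assumption
   for illustration): COND_FN (mask, acc, x, acc) makes active lanes
   compute acc + x while inactive lanes keep the previous accumulator
   value acc, which is passed as the fallback operand.  */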
7137 gcall *call = gimple_build_call_internal (cond_fn, 4, mask,
7138 vop[0], vop[1],
7139 vop[0]);
7140 new_temp = make_ssa_name (vec_dest, call);
7141 gimple_call_set_lhs (call, new_temp);
7142 gimple_call_set_nothrow (call, true);
7143 new_stmt_info
7144 = vect_finish_stmt_generation (stmt_info, call, gsi);
7146 else
7148 if (op_type == ternary_op)
7149 vop[2] = vec_oprnds2[i];
7151 gassign *new_stmt = gimple_build_assign (vec_dest, code,
7152 vop[0], vop[1], vop[2]);
7153 new_temp = make_ssa_name (vec_dest, new_stmt);
7154 gimple_assign_set_lhs (new_stmt, new_temp);
7155 new_stmt_info
7156 = vect_finish_stmt_generation (stmt_info, new_stmt, gsi);
7159 if (slp_node)
7161 SLP_TREE_VEC_STMTS (slp_node).quick_push (new_stmt_info);
7162 vect_defs.quick_push (new_temp);
7164 else
7165 vect_defs[0] = new_temp;
7168 if (slp_node)
7169 continue;
7171 if (j == 0)
7172 STMT_VINFO_VEC_STMT (stmt_info) = *vec_stmt = new_stmt_info;
7173 else
7174 STMT_VINFO_RELATED_STMT (prev_stmt_info) = new_stmt_info;
7176 prev_stmt_info = new_stmt_info;
7179 /* Finalize the reduction-phi (set its arguments) and create the
7180 epilog reduction code. */
7181 if ((!single_defuse_cycle || code == COND_EXPR) && !slp_node)
7182 vect_defs[0] = gimple_get_lhs ((*vec_stmt)->stmt);
7184 vect_create_epilog_for_reduction (vect_defs, stmt_info, reduc_def_phi,
7185 epilog_copies, reduc_fn, phis,
7186 double_reduc, slp_node, slp_node_instance,
7187 cond_reduc_val, cond_reduc_op_code,
7188 neutral_op);
7190 return true;
7193 /* Function vect_min_worthwhile_factor.
7195 For a loop where we could vectorize the operation indicated by CODE,
7196 return the minimum vectorization factor that makes it worthwhile
7197 to use generic vectors. */
7198 static unsigned int
7199 vect_min_worthwhile_factor (enum tree_code code)
7201 switch (code)
7203 case PLUS_EXPR:
7204 case MINUS_EXPR:
7205 case NEGATE_EXPR:
7206 return 4;
7208 case BIT_AND_EXPR:
7209 case BIT_IOR_EXPR:
7210 case BIT_XOR_EXPR:
7211 case BIT_NOT_EXPR:
7212 return 2;
7214 default:
7215 return INT_MAX;
7219 /* Return true if VINFO indicates we are doing loop vectorization and if
7220 it is worth decomposing CODE operations into scalar operations for
7221 that loop's vectorization factor. */
7223 bool
7224 vect_worthwhile_without_simd_p (vec_info *vinfo, tree_code code)
7226 loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo);
7227 unsigned HOST_WIDE_INT value;
7228 return (loop_vinfo
7229 && LOOP_VINFO_VECT_FACTOR (loop_vinfo).is_constant (&value)
7230 && value >= vect_min_worthwhile_factor (code));
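/* For illustration only: with a constant vectorization factor of 4,
   vect_worthwhile_without_simd_p returns true for PLUS_EXPR
   (vect_min_worthwhile_factor gives 4) but false at factor 2, while
   bitwise codes such as BIT_AND_EXPR already qualify at factor 2.  */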
7233 /* Function vectorizable_induction
7235 Check if STMT_INFO performs an induction computation that can be vectorized.
7236 If VEC_STMT is also passed, vectorize the induction PHI: create a vectorized
7237 phi to replace it, put it in VEC_STMT, and add it to the same basic block.
7238 Return true if STMT_INFO is vectorizable in this way. */
7240 bool
7241 vectorizable_induction (stmt_vec_info stmt_info,
7242 gimple_stmt_iterator *gsi ATTRIBUTE_UNUSED,
7243 stmt_vec_info *vec_stmt, slp_tree slp_node,
7244 stmt_vector_for_cost *cost_vec)
7246 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
7247 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
7248 unsigned ncopies;
7249 bool nested_in_vect_loop = false;
7250 struct loop *iv_loop;
7251 tree vec_def;
7252 edge pe = loop_preheader_edge (loop);
7253 basic_block new_bb;
7254 tree new_vec, vec_init, vec_step, t;
7255 tree new_name;
7256 gimple *new_stmt;
7257 gphi *induction_phi;
7258 tree induc_def, vec_dest;
7259 tree init_expr, step_expr;
7260 poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
7261 unsigned i;
7262 tree expr;
7263 gimple_seq stmts;
7264 imm_use_iterator imm_iter;
7265 use_operand_p use_p;
7266 gimple *exit_phi;
7267 edge latch_e;
7268 tree loop_arg;
7269 gimple_stmt_iterator si;
7271 gphi *phi = dyn_cast <gphi *> (stmt_info->stmt);
7272 if (!phi)
7273 return false;
7275 if (!STMT_VINFO_RELEVANT_P (stmt_info))
7276 return false;
7278 /* Make sure it was recognized as induction computation. */
7279 if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_induction_def)
7280 return false;
7282 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
7283 poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (vectype);
7285 if (slp_node)
7286 ncopies = 1;
7287 else
7288 ncopies = vect_get_num_copies (loop_vinfo, vectype);
7289 gcc_assert (ncopies >= 1);
7291 /* FORNOW. These restrictions should be relaxed. */
7292 if (nested_in_vect_loop_p (loop, stmt_info))
7294 imm_use_iterator imm_iter;
7295 use_operand_p use_p;
7296 gimple *exit_phi;
7297 edge latch_e;
7298 tree loop_arg;
7300 if (ncopies > 1)
7302 if (dump_enabled_p ())
7303 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7304 "multiple types in nested loop.\n");
7305 return false;
7308 /* FORNOW: outer loop induction with SLP not supported. */
7309 if (STMT_SLP_TYPE (stmt_info))
7310 return false;
7312 exit_phi = NULL;
7313 latch_e = loop_latch_edge (loop->inner);
7314 loop_arg = PHI_ARG_DEF_FROM_EDGE (phi, latch_e);
7315 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, loop_arg)
7317 gimple *use_stmt = USE_STMT (use_p);
7318 if (is_gimple_debug (use_stmt))
7319 continue;
7321 if (!flow_bb_inside_loop_p (loop->inner, gimple_bb (use_stmt)))
7323 exit_phi = use_stmt;
7324 break;
7327 if (exit_phi)
7329 stmt_vec_info exit_phi_vinfo = loop_vinfo->lookup_stmt (exit_phi);
7330 if (!(STMT_VINFO_RELEVANT_P (exit_phi_vinfo)
7331 && !STMT_VINFO_LIVE_P (exit_phi_vinfo)))
7333 if (dump_enabled_p ())
7334 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7335 "inner-loop induction only used outside "
7336 "of the outer vectorized loop.\n");
7337 return false;
7341 nested_in_vect_loop = true;
7342 iv_loop = loop->inner;
7344 else
7345 iv_loop = loop;
7346 gcc_assert (iv_loop == (gimple_bb (phi))->loop_father);
7348 if (slp_node && !nunits.is_constant ())
7350 /* The current SLP code creates the initial value element-by-element. */
7351 if (dump_enabled_p ())
7352 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7353 "SLP induction not supported for variable-length"
7354 " vectors.\n");
7355 return false;
7358 if (!vec_stmt) /* transformation not required. */
7360 STMT_VINFO_TYPE (stmt_info) = induc_vec_info_type;
7361 DUMP_VECT_SCOPE ("vectorizable_induction");
7362 vect_model_induction_cost (stmt_info, ncopies, cost_vec);
7363 return true;
7366 /* Transform. */
7368 /* Compute a vector variable, initialized with the first VF values of
7369 the induction variable. E.g., for an iv with IV_PHI='X' and
7370 evolution S, for a vector of 4 units, we want to compute:
7371 [X, X + S, X + 2*S, X + 3*S]. */
7373 if (dump_enabled_p ())
7374 dump_printf_loc (MSG_NOTE, vect_location, "transform induction phi.\n");
7376 latch_e = loop_latch_edge (iv_loop);
7377 loop_arg = PHI_ARG_DEF_FROM_EDGE (phi, latch_e);
7379 step_expr = STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_info);
7380 gcc_assert (step_expr != NULL_TREE);
7382 pe = loop_preheader_edge (iv_loop);
7383 init_expr = PHI_ARG_DEF_FROM_EDGE (phi,
7384 loop_preheader_edge (iv_loop));
7386 stmts = NULL;
7387 if (!nested_in_vect_loop)
7389 /* Convert the initial value to the desired type. */
7390 tree new_type = TREE_TYPE (vectype);
7391 init_expr = gimple_convert (&stmts, new_type, init_expr);
7393 /* If we are using the loop mask to "peel" for alignment then we need
7394 to adjust the start value here. */
7395 tree skip_niters = LOOP_VINFO_MASK_SKIP_NITERS (loop_vinfo);
7396 if (skip_niters != NULL_TREE)
7398 if (FLOAT_TYPE_P (vectype))
7399 skip_niters = gimple_build (&stmts, FLOAT_EXPR, new_type,
7400 skip_niters);
7401 else
7402 skip_niters = gimple_convert (&stmts, new_type, skip_niters);
7403 tree skip_step = gimple_build (&stmts, MULT_EXPR, new_type,
7404 skip_niters, step_expr);
7405 init_expr = gimple_build (&stmts, MINUS_EXPR, new_type,
7406 init_expr, skip_step);
7410 /* Convert the step to the desired type. */
7411 step_expr = gimple_convert (&stmts, TREE_TYPE (vectype), step_expr);
7413 if (stmts)
7415 new_bb = gsi_insert_seq_on_edge_immediate (pe, stmts);
7416 gcc_assert (!new_bb);
7419 /* Find the first insertion point in the BB. */
7420 basic_block bb = gimple_bb (phi);
7421 si = gsi_after_labels (bb);
7423 /* For SLP induction we have to generate several IVs: for example,
7424 with group size 3 we need [i, i, i, i + S] [i + S, i + S, i + 2*S, i + 2*S]
7425 [i + 2*S, i + 3*S, i + 3*S, i + 3*S]. The step is the same uniform
7426 [VF*S, VF*S, VF*S, VF*S] for all. */
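/* A concrete instance of the scheme above (illustration only, hypothetical
   values): with initial value i = 0, step S = 1, group size 3 and 4-lane
   vectors, the generated IVs would be {0, 0, 0, 1}, {1, 1, 2, 2} and
   {2, 3, 3, 3}, all updated by the common step vector {4, 4, 4, 4}, since
   one vector iteration covers 4 group instances.  */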
7427 if (slp_node)
7429 /* Enforced above. */
7430 unsigned int const_nunits = nunits.to_constant ();
7432 /* Generate [VF*S, VF*S, ... ]. */
7433 if (SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr)))
7435 expr = build_int_cst (integer_type_node, vf);
7436 expr = fold_convert (TREE_TYPE (step_expr), expr);
7438 else
7439 expr = build_int_cst (TREE_TYPE (step_expr), vf);
7440 new_name = fold_build2 (MULT_EXPR, TREE_TYPE (step_expr),
7441 expr, step_expr);
7442 if (! CONSTANT_CLASS_P (new_name))
7443 new_name = vect_init_vector (stmt_info, new_name,
7444 TREE_TYPE (step_expr), NULL);
7445 new_vec = build_vector_from_val (vectype, new_name);
7446 vec_step = vect_init_vector (stmt_info, new_vec, vectype, NULL);
7448 /* Now generate the IVs. */
7449 unsigned group_size = SLP_TREE_SCALAR_STMTS (slp_node).length ();
7450 unsigned nvects = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node);
7451 unsigned elts = const_nunits * nvects;
7452 unsigned nivs = least_common_multiple (group_size,
7453 const_nunits) / const_nunits;
7454 gcc_assert (elts % group_size == 0);
7455 tree elt = init_expr;
7456 unsigned ivn;
7457 for (ivn = 0; ivn < nivs; ++ivn)
7459 tree_vector_builder elts (vectype, const_nunits, 1);
7460 stmts = NULL;
7461 for (unsigned eltn = 0; eltn < const_nunits; ++eltn)
7463 if (ivn*const_nunits + eltn >= group_size
7464 && (ivn * const_nunits + eltn) % group_size == 0)
7465 elt = gimple_build (&stmts, PLUS_EXPR, TREE_TYPE (elt),
7466 elt, step_expr);
7467 elts.quick_push (elt);
7469 vec_init = gimple_build_vector (&stmts, &elts);
7470 if (stmts)
7472 new_bb = gsi_insert_seq_on_edge_immediate (pe, stmts);
7473 gcc_assert (!new_bb);
7476 /* Create the induction-phi that defines the induction-operand. */
7477 vec_dest = vect_get_new_vect_var (vectype, vect_simple_var, "vec_iv_");
7478 induction_phi = create_phi_node (vec_dest, iv_loop->header);
7479 stmt_vec_info induction_phi_info
7480 = loop_vinfo->add_stmt (induction_phi);
7481 induc_def = PHI_RESULT (induction_phi);
7483 /* Create the iv update inside the loop */
7484 vec_def = make_ssa_name (vec_dest);
7485 new_stmt = gimple_build_assign (vec_def, PLUS_EXPR, induc_def, vec_step);
7486 gsi_insert_before (&si, new_stmt, GSI_SAME_STMT);
7487 loop_vinfo->add_stmt (new_stmt);
7489 /* Set the arguments of the phi node: */
7490 add_phi_arg (induction_phi, vec_init, pe, UNKNOWN_LOCATION);
7491 add_phi_arg (induction_phi, vec_def, loop_latch_edge (iv_loop),
7492 UNKNOWN_LOCATION);
7494 SLP_TREE_VEC_STMTS (slp_node).quick_push (induction_phi_info);
7497 /* Re-use IVs when we can. */
7498 if (ivn < nvects)
7500 unsigned vfp
7501 = least_common_multiple (group_size, const_nunits) / group_size;
7502 /* Generate [VF'*S, VF'*S, ... ]. */
7503 if (SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr)))
7505 expr = build_int_cst (integer_type_node, vfp);
7506 expr = fold_convert (TREE_TYPE (step_expr), expr);
7508 else
7509 expr = build_int_cst (TREE_TYPE (step_expr), vfp);
7510 new_name = fold_build2 (MULT_EXPR, TREE_TYPE (step_expr),
7511 expr, step_expr);
7512 if (! CONSTANT_CLASS_P (new_name))
7513 new_name = vect_init_vector (stmt_info, new_name,
7514 TREE_TYPE (step_expr), NULL);
7515 new_vec = build_vector_from_val (vectype, new_name);
7516 vec_step = vect_init_vector (stmt_info, new_vec, vectype, NULL);
7517 for (; ivn < nvects; ++ivn)
7519 gimple *iv = SLP_TREE_VEC_STMTS (slp_node)[ivn - nivs]->stmt;
7520 tree def;
7521 if (gimple_code (iv) == GIMPLE_PHI)
7522 def = gimple_phi_result (iv);
7523 else
7524 def = gimple_assign_lhs (iv);
7525 new_stmt = gimple_build_assign (make_ssa_name (vectype),
7526 PLUS_EXPR,
7527 def, vec_step);
7528 if (gimple_code (iv) == GIMPLE_PHI)
7529 gsi_insert_before (&si, new_stmt, GSI_SAME_STMT);
7530 else
7532 gimple_stmt_iterator tgsi = gsi_for_stmt (iv);
7533 gsi_insert_after (&tgsi, new_stmt, GSI_CONTINUE_LINKING);
7535 SLP_TREE_VEC_STMTS (slp_node).quick_push
7536 (loop_vinfo->add_stmt (new_stmt));
7540 return true;
7543 /* Create the vector that holds the initial_value of the induction. */
7544 if (nested_in_vect_loop)
7546 /* iv_loop is nested in the loop to be vectorized. init_expr had already
7547 been created during vectorization of previous stmts. We obtain it
7548 from the STMT_VINFO_VEC_STMT of the defining stmt. */
7549 vec_init = vect_get_vec_def_for_operand (init_expr, stmt_info);
7550 /* If the initial value is not of proper type, convert it. */
7551 if (!useless_type_conversion_p (vectype, TREE_TYPE (vec_init)))
7553 new_stmt
7554 = gimple_build_assign (vect_get_new_ssa_name (vectype,
7555 vect_simple_var,
7556 "vec_iv_"),
7557 VIEW_CONVERT_EXPR,
7558 build1 (VIEW_CONVERT_EXPR, vectype,
7559 vec_init));
7560 vec_init = gimple_assign_lhs (new_stmt);
7561 new_bb = gsi_insert_on_edge_immediate (loop_preheader_edge (iv_loop),
7562 new_stmt);
7563 gcc_assert (!new_bb);
7564 loop_vinfo->add_stmt (new_stmt);
7567 else
7569 /* iv_loop is the loop to be vectorized. Create:
7570 vec_init = [X, X+S, X+2*S, X+3*S] (S = step_expr, X = init_expr) */
7571 stmts = NULL;
7572 new_name = gimple_convert (&stmts, TREE_TYPE (vectype), init_expr);
7574 unsigned HOST_WIDE_INT const_nunits;
7575 if (nunits.is_constant (&const_nunits))
7577 tree_vector_builder elts (vectype, const_nunits, 1);
7578 elts.quick_push (new_name);
7579 for (i = 1; i < const_nunits; i++)
7581 /* Create: new_name_i = new_name + step_expr */
7582 new_name = gimple_build (&stmts, PLUS_EXPR, TREE_TYPE (new_name),
7583 new_name, step_expr);
7584 elts.quick_push (new_name);
7586 /* Create a vector from [new_name_0, new_name_1, ...,
7587 new_name_nunits-1] */
7588 vec_init = gimple_build_vector (&stmts, &elts);
7590 else if (INTEGRAL_TYPE_P (TREE_TYPE (step_expr)))
7591 /* Build the initial value directly from a VEC_SERIES_EXPR. */
7592 vec_init = gimple_build (&stmts, VEC_SERIES_EXPR, vectype,
7593 new_name, step_expr);
7594 else
7596 /* Build:
7597 [base, base, base, ...]
7598 + (vectype) [0, 1, 2, ...] * [step, step, step, ...]. */
7599 gcc_assert (SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr)));
7600 gcc_assert (flag_associative_math);
7601 tree index = build_index_vector (vectype, 0, 1);
7602 tree base_vec = gimple_build_vector_from_val (&stmts, vectype,
7603 new_name);
7604 tree step_vec = gimple_build_vector_from_val (&stmts, vectype,
7605 step_expr);
7606 vec_init = gimple_build (&stmts, FLOAT_EXPR, vectype, index);
7607 vec_init = gimple_build (&stmts, MULT_EXPR, vectype,
7608 vec_init, step_vec);
7609 vec_init = gimple_build (&stmts, PLUS_EXPR, vectype,
7610 vec_init, base_vec);
7613 if (stmts)
7615 new_bb = gsi_insert_seq_on_edge_immediate (pe, stmts);
7616 gcc_assert (!new_bb);
7621 /* Create the vector that holds the step of the induction. */
7622 if (nested_in_vect_loop)
7623 /* iv_loop is nested in the loop to be vectorized. Generate:
7624 vec_step = [S, S, S, S] */
7625 new_name = step_expr;
7626 else
7628 /* iv_loop is the loop to be vectorized. Generate:
7629 vec_step = [VF*S, VF*S, VF*S, VF*S] */
7630 gimple_seq seq = NULL;
7631 if (SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr)))
7633 expr = build_int_cst (integer_type_node, vf);
7634 expr = gimple_build (&seq, FLOAT_EXPR, TREE_TYPE (step_expr), expr);
7636 else
7637 expr = build_int_cst (TREE_TYPE (step_expr), vf);
7638 new_name = gimple_build (&seq, MULT_EXPR, TREE_TYPE (step_expr),
7639 expr, step_expr);
7640 if (seq)
7642 new_bb = gsi_insert_seq_on_edge_immediate (pe, seq);
7643 gcc_assert (!new_bb);
7647 t = unshare_expr (new_name);
7648 gcc_assert (CONSTANT_CLASS_P (new_name)
7649 || TREE_CODE (new_name) == SSA_NAME);
7650 new_vec = build_vector_from_val (vectype, t);
7651 vec_step = vect_init_vector (stmt_info, new_vec, vectype, NULL);
7654 /* Create the following def-use cycle:
7655 loop prolog:
7656 vec_init = ...
7657 vec_step = ...
7658 loop:
7659 vec_iv = PHI <vec_init, vec_loop>
7661 STMT
7663 vec_loop = vec_iv + vec_step; */
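/* For illustration only (hypothetical values): with init X = 0, step S = 1
   and a 4-lane vector type (single copy), this builds
   vec_init = {0, 1, 2, 3} and vec_step = {4, 4, 4, 4}, so successive
   iterations of the vector loop see {0,1,2,3}, {4,5,6,7}, {8,9,10,11}, ...  */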
7665 /* Create the induction-phi that defines the induction-operand. */
7666 vec_dest = vect_get_new_vect_var (vectype, vect_simple_var, "vec_iv_");
7667 induction_phi = create_phi_node (vec_dest, iv_loop->header);
7668 stmt_vec_info induction_phi_info = loop_vinfo->add_stmt (induction_phi);
7669 induc_def = PHI_RESULT (induction_phi);
7671 /* Create the iv update inside the loop */
7672 vec_def = make_ssa_name (vec_dest);
7673 new_stmt = gimple_build_assign (vec_def, PLUS_EXPR, induc_def, vec_step);
7674 gsi_insert_before (&si, new_stmt, GSI_SAME_STMT);
7675 stmt_vec_info new_stmt_info = loop_vinfo->add_stmt (new_stmt);
7677 /* Set the arguments of the phi node: */
7678 add_phi_arg (induction_phi, vec_init, pe, UNKNOWN_LOCATION);
7679 add_phi_arg (induction_phi, vec_def, loop_latch_edge (iv_loop),
7680 UNKNOWN_LOCATION);
7682 STMT_VINFO_VEC_STMT (stmt_info) = *vec_stmt = induction_phi_info;
7684 /* In case the vectorization factor (VF) is bigger than the number
7685 of elements that we can fit in a vectype (nunits), we have to generate
7686 more than one vector stmt - i.e., we need to "unroll" the
7687 vector stmt by a factor VF/nunits. For more details see documentation
7688 in vectorizable_operation. */
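/* A hypothetical example of the unrolling described above: with VF = 8 but
   only 4 lanes per vector (ncopies = 2), each extra copy is obtained from
   the previous one by adding {4*S, 4*S, 4*S, 4*S} (nunits * S), while the
   induction PHI itself still advances by VF * S = 8*S per loop iteration.  */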
7690 if (ncopies > 1)
7692 gimple_seq seq = NULL;
7693 stmt_vec_info prev_stmt_vinfo;
7694 /* FORNOW. This restriction should be relaxed. */
7695 gcc_assert (!nested_in_vect_loop);
7697 /* Create the vector that holds the step of the induction. */
7698 if (SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr)))
7700 expr = build_int_cst (integer_type_node, nunits);
7701 expr = gimple_build (&seq, FLOAT_EXPR, TREE_TYPE (step_expr), expr);
7703 else
7704 expr = build_int_cst (TREE_TYPE (step_expr), nunits);
7705 new_name = gimple_build (&seq, MULT_EXPR, TREE_TYPE (step_expr),
7706 expr, step_expr);
7707 if (seq)
7709 new_bb = gsi_insert_seq_on_edge_immediate (pe, seq);
7710 gcc_assert (!new_bb);
7713 t = unshare_expr (new_name);
7714 gcc_assert (CONSTANT_CLASS_P (new_name)
7715 || TREE_CODE (new_name) == SSA_NAME);
7716 new_vec = build_vector_from_val (vectype, t);
7717 vec_step = vect_init_vector (stmt_info, new_vec, vectype, NULL);
7719 vec_def = induc_def;
7720 prev_stmt_vinfo = induction_phi_info;
7721 for (i = 1; i < ncopies; i++)
7723 /* vec_i = vec_prev + vec_step */
7724 new_stmt = gimple_build_assign (vec_dest, PLUS_EXPR,
7725 vec_def, vec_step);
7726 vec_def = make_ssa_name (vec_dest, new_stmt);
7727 gimple_assign_set_lhs (new_stmt, vec_def);
7729 gsi_insert_before (&si, new_stmt, GSI_SAME_STMT);
7730 new_stmt_info = loop_vinfo->add_stmt (new_stmt);
7731 STMT_VINFO_RELATED_STMT (prev_stmt_vinfo) = new_stmt_info;
7732 prev_stmt_vinfo = new_stmt_info;
7736 if (nested_in_vect_loop)
7738 /* Find the loop-closed exit-phi of the induction, and record
7739 the final vector of induction results: */
7740 exit_phi = NULL;
7741 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, loop_arg)
7743 gimple *use_stmt = USE_STMT (use_p);
7744 if (is_gimple_debug (use_stmt))
7745 continue;
7747 if (!flow_bb_inside_loop_p (iv_loop, gimple_bb (use_stmt)))
7749 exit_phi = use_stmt;
7750 break;
7753 if (exit_phi)
7755 stmt_vec_info stmt_vinfo = loop_vinfo->lookup_stmt (exit_phi);
7756 /* FORNOW. Currently not supporting the case that an inner-loop induction
7757 is not used in the outer-loop (i.e. only outside the outer-loop). */
7758 gcc_assert (STMT_VINFO_RELEVANT_P (stmt_vinfo)
7759 && !STMT_VINFO_LIVE_P (stmt_vinfo));
7761 STMT_VINFO_VEC_STMT (stmt_vinfo) = new_stmt_info;
7762 if (dump_enabled_p ())
7764 dump_printf_loc (MSG_NOTE, vect_location,
7765 "vector of inductions after inner-loop:");
7766 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, new_stmt, 0);
7772 if (dump_enabled_p ())
7774 dump_printf_loc (MSG_NOTE, vect_location,
7775 "transform induction: created def-use cycle: ");
7776 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, induction_phi, 0);
7777 dump_gimple_stmt (MSG_NOTE, TDF_SLIM,
7778 SSA_NAME_DEF_STMT (vec_def), 0);
7781 return true;
7784 /* Function vectorizable_live_operation.
7786 STMT_INFO computes a value that is used outside the loop. Check if
7787 it can be supported. */
7789 bool
7790 vectorizable_live_operation (stmt_vec_info stmt_info,
7791 gimple_stmt_iterator *gsi ATTRIBUTE_UNUSED,
7792 slp_tree slp_node, int slp_index,
7793 stmt_vec_info *vec_stmt,
7794 stmt_vector_for_cost *)
7796 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
7797 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
7798 imm_use_iterator imm_iter;
7799 tree lhs, lhs_type, bitsize, vec_bitsize;
7800 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
7801 poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (vectype);
7802 int ncopies;
7803 gimple *use_stmt;
7804 auto_vec<tree> vec_oprnds;
7805 int vec_entry = 0;
7806 poly_uint64 vec_index = 0;
7808 gcc_assert (STMT_VINFO_LIVE_P (stmt_info));
7810 if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_reduction_def)
7811 return false;
7813 /* FORNOW. CHECKME. */
7814 if (nested_in_vect_loop_p (loop, stmt_info))
7815 return false;
7817 /* If STMT is not relevant and it is a simple assignment and its inputs are
7818 invariant then it can remain in place, unvectorized. The original last
7819 scalar value that it computes will be used. */
7820 if (!STMT_VINFO_RELEVANT_P (stmt_info))
7822 gcc_assert (is_simple_and_all_uses_invariant (stmt_info, loop_vinfo));
7823 if (dump_enabled_p ())
7824 dump_printf_loc (MSG_NOTE, vect_location,
7825 "statement is simple and uses invariant. Leaving in "
7826 "place.\n");
7827 return true;
7830 if (slp_node)
7831 ncopies = 1;
7832 else
7833 ncopies = vect_get_num_copies (loop_vinfo, vectype);
7835 if (slp_node)
7837 gcc_assert (slp_index >= 0);
7839 int num_scalar = SLP_TREE_SCALAR_STMTS (slp_node).length ();
7840 int num_vec = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node);
7842 /* Get the last occurrence of the scalar index from the concatenation of
7843 all the slp vectors. Calculate which slp vector it is and the index
7844 within. */
7845 poly_uint64 pos = (num_vec * nunits) - num_scalar + slp_index;
7847 /* Calculate which vector contains the result, and which lane of
7848 that vector we need. */
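/* Worked example (illustration only, hypothetical numbers): with
   num_scalar = 3, num_vec = 2, nunits = 4 and slp_index = 1, pos is
   2*4 - 3 + 1 = 6, giving vec_entry = 1 and vec_index = 2, i.e. lane 2
   of the second SLP vector holds the final value.  */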
7849 if (!can_div_trunc_p (pos, nunits, &vec_entry, &vec_index))
7851 if (dump_enabled_p ())
7852 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7853 "Cannot determine which vector holds the"
7854 " final result.\n");
7855 return false;
7859 if (!vec_stmt)
7861 /* No transformation required. */
7862 if (LOOP_VINFO_CAN_FULLY_MASK_P (loop_vinfo))
7864 if (!direct_internal_fn_supported_p (IFN_EXTRACT_LAST, vectype,
7865 OPTIMIZE_FOR_SPEED))
7867 if (dump_enabled_p ())
7868 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7869 "can't use a fully-masked loop because "
7870 "the target doesn't support extract last "
7871 "reduction.\n");
7872 LOOP_VINFO_CAN_FULLY_MASK_P (loop_vinfo) = false;
7874 else if (slp_node)
7876 if (dump_enabled_p ())
7877 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7878 "can't use a fully-masked loop because an "
7879 "SLP statement is live after the loop.\n");
7880 LOOP_VINFO_CAN_FULLY_MASK_P (loop_vinfo) = false;
7882 else if (ncopies > 1)
7884 if (dump_enabled_p ())
7885 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7886 "can't use a fully-masked loop because"
7887 " ncopies is greater than 1.\n");
7888 LOOP_VINFO_CAN_FULLY_MASK_P (loop_vinfo) = false;
7890 else
7892 gcc_assert (ncopies == 1 && !slp_node);
7893 vect_record_loop_mask (loop_vinfo,
7894 &LOOP_VINFO_MASKS (loop_vinfo),
7895 1, vectype);
7898 return true;
7901 /* If stmt has a related stmt, then use that for getting the lhs. */
7902 gimple *stmt = (is_pattern_stmt_p (stmt_info)
7903 ? STMT_VINFO_RELATED_STMT (stmt_info)->stmt
7904 : stmt_info->stmt);
7906 lhs = (is_a <gphi *> (stmt)) ? gimple_phi_result (stmt)
7907 : gimple_get_lhs (stmt);
7908 lhs_type = TREE_TYPE (lhs);
7910 bitsize = (VECTOR_BOOLEAN_TYPE_P (vectype)
7911 ? bitsize_int (TYPE_PRECISION (TREE_TYPE (vectype)))
7912 : TYPE_SIZE (TREE_TYPE (vectype)));
7913 vec_bitsize = TYPE_SIZE (vectype);
7915 /* Get the vectorized lhs of STMT and the lane to use (counted in bits). */
7916 tree vec_lhs, bitstart;
7917 if (slp_node)
7919 gcc_assert (!LOOP_VINFO_FULLY_MASKED_P (loop_vinfo));
7921 /* Get the correct slp vectorized stmt. */
7922 gimple *vec_stmt = SLP_TREE_VEC_STMTS (slp_node)[vec_entry]->stmt;
7923 if (gphi *phi = dyn_cast <gphi *> (vec_stmt))
7924 vec_lhs = gimple_phi_result (phi);
7925 else
7926 vec_lhs = gimple_get_lhs (vec_stmt);
7928 /* Get entry to use. */
7929 bitstart = bitsize_int (vec_index);
7930 bitstart = int_const_binop (MULT_EXPR, bitsize, bitstart);
7932 else
7934 enum vect_def_type dt = STMT_VINFO_DEF_TYPE (stmt_info);
7935 vec_lhs = vect_get_vec_def_for_operand_1 (stmt_info, dt);
7936 gcc_checking_assert (ncopies == 1
7937 || !LOOP_VINFO_FULLY_MASKED_P (loop_vinfo));
7939 /* For multiple copies, get the last copy. */
7940 for (int i = 1; i < ncopies; ++i)
7941 vec_lhs = vect_get_vec_def_for_stmt_copy (loop_vinfo, vec_lhs);
7943 /* Get the last lane in the vector. */
7944 bitstart = int_const_binop (MINUS_EXPR, vec_bitsize, bitsize);
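/* For example (illustration only): for a 4 x 32-bit vector, vec_bitsize is
   128 and bitsize is 32, so bitstart = 96 selects the last lane.  */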
7947 gimple_seq stmts = NULL;
7948 tree new_tree;
7949 if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
7951 /* Emit:
7953 SCALAR_RES = EXTRACT_LAST <VEC_LHS, MASK>
7955 where VEC_LHS is the vectorized live-out result and MASK is
7956 the loop mask for the final iteration. */
7957 gcc_assert (ncopies == 1 && !slp_node);
7958 tree scalar_type = TREE_TYPE (STMT_VINFO_VECTYPE (stmt_info));
7959 tree mask = vect_get_loop_mask (gsi, &LOOP_VINFO_MASKS (loop_vinfo),
7960 1, vectype, 0);
7961 tree scalar_res = gimple_build (&stmts, CFN_EXTRACT_LAST,
7962 scalar_type, mask, vec_lhs);
7964 /* Convert the extracted vector element to the required scalar type. */
7965 new_tree = gimple_convert (&stmts, lhs_type, scalar_res);
7967 else
7969 tree bftype = TREE_TYPE (vectype);
7970 if (VECTOR_BOOLEAN_TYPE_P (vectype))
7971 bftype = build_nonstandard_integer_type (tree_to_uhwi (bitsize), 1);
7972 new_tree = build3 (BIT_FIELD_REF, bftype, vec_lhs, bitsize, bitstart);
7973 new_tree = force_gimple_operand (fold_convert (lhs_type, new_tree),
7974 &stmts, true, NULL_TREE);
7977 if (stmts)
7978 gsi_insert_seq_on_edge_immediate (single_exit (loop), stmts);
7980 /* Replace use of lhs with newly computed result. If the use stmt is a
7981 single arg PHI, just replace all uses of PHI result. This is necessary
7982 because the lcssa PHI defining lhs may appear before the newly inserted stmt. */
7983 use_operand_p use_p;
7984 FOR_EACH_IMM_USE_STMT (use_stmt, imm_iter, lhs)
7985 if (!flow_bb_inside_loop_p (loop, gimple_bb (use_stmt))
7986 && !is_gimple_debug (use_stmt))
7988 if (gimple_code (use_stmt) == GIMPLE_PHI
7989 && gimple_phi_num_args (use_stmt) == 1)
7991 replace_uses_by (gimple_phi_result (use_stmt), new_tree);
7993 else
7995 FOR_EACH_IMM_USE_ON_STMT (use_p, imm_iter)
7996 SET_USE (use_p, new_tree);
7998 update_stmt (use_stmt);
8001 return true;
8004 /* Kill any debug uses outside LOOP of SSA names defined in STMT_INFO. */
8006 static void
8007 vect_loop_kill_debug_uses (struct loop *loop, stmt_vec_info stmt_info)
8009 ssa_op_iter op_iter;
8010 imm_use_iterator imm_iter;
8011 def_operand_p def_p;
8012 gimple *ustmt;
8014 FOR_EACH_PHI_OR_STMT_DEF (def_p, stmt_info->stmt, op_iter, SSA_OP_DEF)
8016 FOR_EACH_IMM_USE_STMT (ustmt, imm_iter, DEF_FROM_PTR (def_p))
8018 basic_block bb;
8020 if (!is_gimple_debug (ustmt))
8021 continue;
8023 bb = gimple_bb (ustmt);
8025 if (!flow_bb_inside_loop_p (loop, bb))
8027 if (gimple_debug_bind_p (ustmt))
8029 if (dump_enabled_p ())
8030 dump_printf_loc (MSG_NOTE, vect_location,
8031 "killing debug use\n");
8033 gimple_debug_bind_reset_value (ustmt);
8034 update_stmt (ustmt);
8036 else
8037 gcc_unreachable ();
8043 /* Given the loop represented by LOOP_VINFO, return true if computation of
8044 LOOP_VINFO_NITERS (= LOOP_VINFO_NITERSM1 + 1) doesn't overflow, false
8045 otherwise. */
8047 static bool
8048 loop_niters_no_overflow (loop_vec_info loop_vinfo)
8050 /* Constant case. */
8051 if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
8053 tree cst_niters = LOOP_VINFO_NITERS (loop_vinfo);
8054 tree cst_nitersm1 = LOOP_VINFO_NITERSM1 (loop_vinfo);
8056 gcc_assert (TREE_CODE (cst_niters) == INTEGER_CST);
8057 gcc_assert (TREE_CODE (cst_nitersm1) == INTEGER_CST);
8058 if (wi::to_widest (cst_nitersm1) < wi::to_widest (cst_niters))
8059 return true;
8062 widest_int max;
8063 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
8064 /* Check the upper bound of loop niters. */
8065 if (get_max_loop_iterations (loop, &max))
8067 tree type = TREE_TYPE (LOOP_VINFO_NITERS (loop_vinfo));
8068 signop sgn = TYPE_SIGN (type);
8069 widest_int type_max = widest_int::from (wi::max_value (type), sgn);
8070 if (max < type_max)
8071 return true;
8073 return false;
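/* Illustration only (hypothetical numbers): if NITERS has a 32-bit unsigned
   type, type_max is 4294967295; NITERSM1 + 1 is known not to wrap as long
   as the latch-execution bound is at most 4294967294, i.e. max < type_max.  */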
8076 /* Return a mask type with half the number of elements as TYPE. */
8078 tree
8079 vect_halve_mask_nunits (tree type)
8081 poly_uint64 nunits = exact_div (TYPE_VECTOR_SUBPARTS (type), 2);
8082 return build_truth_vector_type (nunits, current_vector_size);
8085 /* Return a mask type with twice as many elements as TYPE. */
8087 tree
8088 vect_double_mask_nunits (tree type)
8090 poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (type) * 2;
8091 return build_truth_vector_type (nunits, current_vector_size);
8094 /* Record that a fully-masked version of LOOP_VINFO would need MASKS to
8095 contain a sequence of NVECTORS masks that each control a vector of type
8096 VECTYPE. */
8098 void
8099 vect_record_loop_mask (loop_vec_info loop_vinfo, vec_loop_masks *masks,
8100 unsigned int nvectors, tree vectype)
8102 gcc_assert (nvectors != 0);
8103 if (masks->length () < nvectors)
8104 masks->safe_grow_cleared (nvectors);
8105 rgroup_masks *rgm = &(*masks)[nvectors - 1];
8106 /* The number of scalars per iteration and the number of vectors are
8107 both compile-time constants. */
8108 unsigned int nscalars_per_iter
8109 = exact_div (nvectors * TYPE_VECTOR_SUBPARTS (vectype),
8110 LOOP_VINFO_VECT_FACTOR (loop_vinfo)).to_constant ();
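/* A hypothetical example: with NVECTORS = 2, a VECTYPE of 8 elements and a
   vectorization factor of 16, nscalars_per_iter is 2 * 8 / 16 = 1, i.e.
   each scalar iteration contributes one element to this rgroup.  */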
8111 if (rgm->max_nscalars_per_iter < nscalars_per_iter)
8113 rgm->max_nscalars_per_iter = nscalars_per_iter;
8114 rgm->mask_type = build_same_sized_truth_vector_type (vectype);
8118 /* Given a complete set of masks MASKS, extract mask number INDEX
8119 for an rgroup that operates on NVECTORS vectors of type VECTYPE,
8120 where 0 <= INDEX < NVECTORS. Insert any set-up statements before GSI.
8122 See the comment above vec_loop_masks for more details about the mask
8123 arrangement. */
8125 tree
8126 vect_get_loop_mask (gimple_stmt_iterator *gsi, vec_loop_masks *masks,
8127 unsigned int nvectors, tree vectype, unsigned int index)
8129 rgroup_masks *rgm = &(*masks)[nvectors - 1];
8130 tree mask_type = rgm->mask_type;
8132 /* Populate the rgroup's mask array, if this is the first time we've
8133 used it. */
8134 if (rgm->masks.is_empty ())
8136 rgm->masks.safe_grow_cleared (nvectors);
8137 for (unsigned int i = 0; i < nvectors; ++i)
8139 tree mask = make_temp_ssa_name (mask_type, NULL, "loop_mask");
8140 /* Provide a dummy definition until the real one is available. */
8141 SSA_NAME_DEF_STMT (mask) = gimple_build_nop ();
8142 rgm->masks[i] = mask;
8146 tree mask = rgm->masks[index];
8147 if (maybe_ne (TYPE_VECTOR_SUBPARTS (mask_type),
8148 TYPE_VECTOR_SUBPARTS (vectype)))
8150 /* A loop mask for data type X can be reused for data type Y
8151 if X has N times more elements than Y and if Y's elements
8152 are N times bigger than X's. In this case each sequence
8153 of N elements in the loop mask will be all-zero or all-one.
8154 We can then view-convert the mask so that each sequence of
8155 N elements is replaced by a single element. */
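/* For instance (illustration only): a mask computed for a 16-element rgroup
   of 8-bit data can be reused for a 4-element rgroup of 32-bit data; each
   run of 4 mask elements is uniform, so the VIEW_CONVERT below collapses
   it to a single element of the narrower mask type.  */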
8156 gcc_assert (multiple_p (TYPE_VECTOR_SUBPARTS (mask_type),
8157 TYPE_VECTOR_SUBPARTS (vectype)));
8158 gimple_seq seq = NULL;
8159 mask_type = build_same_sized_truth_vector_type (vectype);
8160 mask = gimple_build (&seq, VIEW_CONVERT_EXPR, mask_type, mask);
8161 if (seq)
8162 gsi_insert_seq_before (gsi, seq, GSI_SAME_STMT);
8164 return mask;
8167 /* Scale the profile counters of LOOP, which is vectorized by factor VF,
8168 according to the estimated number of iterations. */
8170 static void
8171 scale_profile_for_vect_loop (struct loop *loop, unsigned vf)
8173 edge preheader = loop_preheader_edge (loop);
8174 /* Reduce loop iterations by the vectorization factor. */
8175 gcov_type new_est_niter = niter_for_unrolled_loop (loop, vf);
8176 profile_count freq_h = loop->header->count, freq_e = preheader->count ();
8178 if (freq_h.nonzero_p ())
8180 profile_probability p;
8182 /* Avoid dropping loop body profile counter to 0 because of zero count
8183 in loop's preheader. */
8184 if (!(freq_e == profile_count::zero ()))
8185 freq_e = freq_e.force_nonzero ();
8186 p = freq_e.apply_scale (new_est_niter + 1, 1).probability_in (freq_h);
8187 scale_loop_frequencies (loop, p);
8190 edge exit_e = single_exit (loop);
8191 exit_e->probability = profile_probability::always ()
8192 .apply_scale (1, new_est_niter + 1);
8194 edge exit_l = single_pred_edge (loop->latch);
8195 profile_probability prob = exit_l->probability;
8196 exit_l->probability = exit_e->probability.invert ();
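/* Illustration only (hypothetical estimate): if new_est_niter is 3, the
   exit edge above gets probability 1/4 and the latch edge, being its
   inversion, gets 3/4, matching a vector loop body that runs about
   4 times per entry.  */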
8197 if (prob.initialized_p () && exit_l->probability.initialized_p ())
8198 scale_bbs_frequencies (&loop->latch, 1, exit_l->probability / prob);
8201 /* Vectorize STMT_INFO if relevant, inserting any new instructions before GSI.
8202 When vectorizing STMT_INFO as a store, set *SEEN_STORE to its stmt_vec_info.
8203 *SLP_SCHEDULED is a running record of whether we have called
8204 vect_schedule_slp. */
8206 static void
8207 vect_transform_loop_stmt (loop_vec_info loop_vinfo, stmt_vec_info stmt_info,
8208 gimple_stmt_iterator *gsi,
8209 stmt_vec_info *seen_store, bool *slp_scheduled)
8211 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
8212 poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
8214 if (dump_enabled_p ())
8216 dump_printf_loc (MSG_NOTE, vect_location,
8217 "------>vectorizing statement: ");
8218 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, stmt_info->stmt, 0);
8221 if (MAY_HAVE_DEBUG_BIND_STMTS && !STMT_VINFO_LIVE_P (stmt_info))
8222 vect_loop_kill_debug_uses (loop, stmt_info);
8224 if (!STMT_VINFO_RELEVANT_P (stmt_info)
8225 && !STMT_VINFO_LIVE_P (stmt_info))
8226 return;
8228 if (STMT_VINFO_VECTYPE (stmt_info))
8230 poly_uint64 nunits
8231 = TYPE_VECTOR_SUBPARTS (STMT_VINFO_VECTYPE (stmt_info));
8232 if (!STMT_SLP_TYPE (stmt_info)
8233 && maybe_ne (nunits, vf)
8234 && dump_enabled_p ())
8235 /* For SLP, VF is set according to the unrolling factor, and not
8236 to the vector size, hence for SLP this print is not valid. */
8237 dump_printf_loc (MSG_NOTE, vect_location, "multiple-types.\n");
8240 /* SLP. Schedule all the SLP instances when the first SLP stmt is
8241 reached. */
8242 if (slp_vect_type slptype = STMT_SLP_TYPE (stmt_info))
8245 if (!*slp_scheduled)
8247 *slp_scheduled = true;
8249 DUMP_VECT_SCOPE ("scheduling SLP instances");
8251 vect_schedule_slp (loop_vinfo);
8254 /* Hybrid SLP stmts must be vectorized in addition to SLP. */
8255 if (slptype == pure_slp)
8256 return;
8259 if (dump_enabled_p ())
8260 dump_printf_loc (MSG_NOTE, vect_location, "transform statement.\n");
8262 bool grouped_store = false;
8263 if (vect_transform_stmt (stmt_info, gsi, &grouped_store, NULL, NULL))
8264 *seen_store = stmt_info;
8267 /* Function vect_transform_loop.
8269 The analysis phase has determined that the loop is vectorizable.
8270 Vectorize the loop - create vectorized stmts to replace the scalar
8271 stmts in the loop, and update the loop exit condition.
8272 Returns the scalar epilogue loop if any. */
8274 struct loop *
8275 vect_transform_loop (loop_vec_info loop_vinfo)
8277 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
8278 struct loop *epilogue = NULL;
8279 basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
8280 int nbbs = loop->num_nodes;
8281 int i;
8282 tree niters_vector = NULL_TREE;
8283 tree step_vector = NULL_TREE;
8284 tree niters_vector_mult_vf = NULL_TREE;
8285 poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
8286 unsigned int lowest_vf = constant_lower_bound (vf);
8287 bool slp_scheduled = false;
8288 gimple *stmt;
8289 bool check_profitability = false;
8290 unsigned int th;
8292 DUMP_VECT_SCOPE ("vec_transform_loop");
8294 loop_vinfo->shared->check_datarefs ();
8296 /* Use the more conservative vectorization threshold. If the number
8297 of iterations is constant, assume the cost check has been performed
8298 by our caller. If the threshold makes all loops profitable that
8299 run at least the (estimated) vectorization factor number of times,
8300 checking is pointless, too. */
8301 th = LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo);
8302 if (th >= vect_vf_for_cost (loop_vinfo)
8303 && !LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
8305 if (dump_enabled_p ())
8306 dump_printf_loc (MSG_NOTE, vect_location,
8307 "Profitability threshold is %d loop iterations.\n",
8308 th);
8309 check_profitability = true;
8312 /* Make sure there exists a single-predecessor exit bb. Do this before
8313 versioning. */
8314 edge e = single_exit (loop);
8315 if (! single_pred_p (e->dest))
8317 split_loop_exit_edge (e);
8318 if (dump_enabled_p ())
8319 dump_printf (MSG_NOTE, "split exit edge\n");
8322 /* Version the loop first, if required, so the profitability check
8323 comes first. */
8325 if (LOOP_REQUIRES_VERSIONING (loop_vinfo))
8327 poly_uint64 versioning_threshold
8328 = LOOP_VINFO_VERSIONING_THRESHOLD (loop_vinfo);
8329 if (check_profitability
8330 && ordered_p (poly_uint64 (th), versioning_threshold))
8332 versioning_threshold = ordered_max (poly_uint64 (th),
8333 versioning_threshold);
8334 check_profitability = false;
8336 vect_loop_versioning (loop_vinfo, th, check_profitability,
8337 versioning_threshold);
8338 check_profitability = false;
8341 /* Make sure there exists a single-predecessor exit bb also on the
8342 scalar loop copy. Do this after versioning but before peeling
8343 so the CFG structure is fine for both the scalar and the if-converted
8344 loop, and slpeel_duplicate_current_defs_from_edges faces matched
8345 loop-closed PHI nodes on the exit. */
8346 if (LOOP_VINFO_SCALAR_LOOP (loop_vinfo))
8348 e = single_exit (LOOP_VINFO_SCALAR_LOOP (loop_vinfo));
8349 if (! single_pred_p (e->dest))
8351 split_loop_exit_edge (e);
8352 if (dump_enabled_p ())
8353 dump_printf (MSG_NOTE, "split exit edge of scalar loop\n");
8357 tree niters = vect_build_loop_niters (loop_vinfo);
8358 LOOP_VINFO_NITERS_UNCHANGED (loop_vinfo) = niters;
8359 tree nitersm1 = unshare_expr (LOOP_VINFO_NITERSM1 (loop_vinfo));
8360 bool niters_no_overflow = loop_niters_no_overflow (loop_vinfo);
8361 epilogue = vect_do_peeling (loop_vinfo, niters, nitersm1, &niters_vector,
8362 &step_vector, &niters_vector_mult_vf, th,
8363 check_profitability, niters_no_overflow);
8365 if (niters_vector == NULL_TREE)
8367 if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
8368 && !LOOP_VINFO_FULLY_MASKED_P (loop_vinfo)
8369 && known_eq (lowest_vf, vf))
8371 niters_vector
8372 = build_int_cst (TREE_TYPE (LOOP_VINFO_NITERS (loop_vinfo)),
8373 LOOP_VINFO_INT_NITERS (loop_vinfo) / lowest_vf);
8374 step_vector = build_one_cst (TREE_TYPE (niters));
8376 else
8377 vect_gen_vector_loop_niters (loop_vinfo, niters, &niters_vector,
8378 &step_vector, niters_no_overflow);
8381 /* 1) Make sure the loop header has exactly two entries
8382 2) Make sure we have a preheader basic block. */
8384 gcc_assert (EDGE_COUNT (loop->header->preds) == 2);
8386 split_edge (loop_preheader_edge (loop));
8388 if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo)
8389 && vect_use_loop_mask_for_alignment_p (loop_vinfo))
8390 /* This will deal with any possible peeling. */
8391 vect_prepare_for_masked_peels (loop_vinfo);
8393 /* FORNOW: the vectorizer supports only loops whose body consists
8394 of one basic block (header + empty latch). When the vectorizer
8395 supports more involved loop forms, the order in which the BBs are
8396 traversed will need to be reconsidered. */
8398 for (i = 0; i < nbbs; i++)
8400 basic_block bb = bbs[i];
8401 stmt_vec_info stmt_info;
8403 for (gphi_iterator si = gsi_start_phis (bb); !gsi_end_p (si);
8404 gsi_next (&si))
8406 gphi *phi = si.phi ();
8407 if (dump_enabled_p ())
8409 dump_printf_loc (MSG_NOTE, vect_location,
8410 "------>vectorizing phi: ");
8411 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, phi, 0);
8413 stmt_info = loop_vinfo->lookup_stmt (phi);
8414 if (!stmt_info)
8415 continue;
8417 if (MAY_HAVE_DEBUG_BIND_STMTS && !STMT_VINFO_LIVE_P (stmt_info))
8418 vect_loop_kill_debug_uses (loop, stmt_info);
8420 if (!STMT_VINFO_RELEVANT_P (stmt_info)
8421 && !STMT_VINFO_LIVE_P (stmt_info))
8422 continue;
8424 if (STMT_VINFO_VECTYPE (stmt_info)
8425 && (maybe_ne
8426 (TYPE_VECTOR_SUBPARTS (STMT_VINFO_VECTYPE (stmt_info)), vf))
8427 && dump_enabled_p ())
8428 dump_printf_loc (MSG_NOTE, vect_location, "multiple-types.\n");
8430 if ((STMT_VINFO_DEF_TYPE (stmt_info) == vect_induction_def
8431 || STMT_VINFO_DEF_TYPE (stmt_info) == vect_reduction_def
8432 || STMT_VINFO_DEF_TYPE (stmt_info) == vect_nested_cycle)
8433 && ! PURE_SLP_STMT (stmt_info))
8435 if (dump_enabled_p ())
8436 dump_printf_loc (MSG_NOTE, vect_location, "transform phi.\n");
8437 vect_transform_stmt (stmt_info, NULL, NULL, NULL, NULL);
8441 for (gimple_stmt_iterator si = gsi_start_bb (bb);
8442 !gsi_end_p (si);)
8444 stmt = gsi_stmt (si);
8445 /* During vectorization remove existing clobber stmts. */
8446 if (gimple_clobber_p (stmt))
8448 unlink_stmt_vdef (stmt);
8449 gsi_remove (&si, true);
8450 release_defs (stmt);
8452 else
8454 stmt_info = loop_vinfo->lookup_stmt (stmt);
8456 /* vector stmts created in the outer-loop during vectorization of
8457 stmts in an inner-loop may not have a stmt_info, and do not
8458 need to be vectorized. */
8459 stmt_vec_info seen_store = NULL;
8460 if (stmt_info)
8462 if (STMT_VINFO_IN_PATTERN_P (stmt_info))
8464 gimple *def_seq = STMT_VINFO_PATTERN_DEF_SEQ (stmt_info);
8465 for (gimple_stmt_iterator subsi = gsi_start (def_seq);
8466 !gsi_end_p (subsi); gsi_next (&subsi))
8468 stmt_vec_info pat_stmt_info
8469 = loop_vinfo->lookup_stmt (gsi_stmt (subsi));
8470 vect_transform_loop_stmt (loop_vinfo, pat_stmt_info,
8471 &si, &seen_store,
8472 &slp_scheduled);
8474 stmt_vec_info pat_stmt_info
8475 = STMT_VINFO_RELATED_STMT (stmt_info);
8476 vect_transform_loop_stmt (loop_vinfo, pat_stmt_info, &si,
8477 &seen_store, &slp_scheduled);
8479 vect_transform_loop_stmt (loop_vinfo, stmt_info, &si,
8480 &seen_store, &slp_scheduled);
8482 gsi_next (&si);
8483 if (seen_store)
8485 if (STMT_VINFO_GROUPED_ACCESS (seen_store))
8486 /* Interleaving. If we have seen a grouped store, the
8487 vectorization of the interleaving chain was
8488 completed - free all the stores in the chain. */
8489 vect_remove_stores (DR_GROUP_FIRST_ELEMENT (seen_store));
8490 else
8491 /* Free the attached stmt_vec_info and remove the stmt. */
8492 loop_vinfo->remove_stmt (stmt_info);
8497 /* Stub out scalar statements that must not survive vectorization.
8498 Doing this here helps with grouped statements, or statements that
8499 are involved in patterns. */
8500 for (gimple_stmt_iterator gsi = gsi_start_bb (bb);
8501 !gsi_end_p (gsi); gsi_next (&gsi))
8503 gcall *call = dyn_cast <gcall *> (gsi_stmt (gsi));
8504 if (call && gimple_call_internal_p (call, IFN_MASK_LOAD))
8506 tree lhs = gimple_get_lhs (call);
8507 if (!VECTOR_TYPE_P (TREE_TYPE (lhs)))
8509 tree zero = build_zero_cst (TREE_TYPE (lhs));
8510 gimple *new_stmt = gimple_build_assign (lhs, zero);
8511 gsi_replace (&gsi, new_stmt, true);
8515 } /* BBs in loop */
8517 /* The vectorization factor is always > 1, so if we use an IV increment of 1,
8518 a zero NITERS becomes a nonzero NITERS_VECTOR. */
8519 if (integer_onep (step_vector))
8520 niters_no_overflow = true;
8521 vect_set_loop_condition (loop, loop_vinfo, niters_vector, step_vector,
8522 niters_vector_mult_vf, !niters_no_overflow);
8524 unsigned int assumed_vf = vect_vf_for_cost (loop_vinfo);
8525 scale_profile_for_vect_loop (loop, assumed_vf);
8527 /* True if the final iteration might not handle a full vector's
8528 worth of scalar iterations. */
8529 bool final_iter_may_be_partial = LOOP_VINFO_FULLY_MASKED_P (loop_vinfo);
8530 /* The minimum number of iterations performed by the epilogue. This
8531 is 1 when peeling for gaps because we always need a final scalar
8532 iteration. */
8533 int min_epilogue_iters = LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo) ? 1 : 0;
8534 /* +1 to convert latch counts to loop iteration counts,
8535 -min_epilogue_iters to remove iterations that cannot be performed
8536 by the vector code. */
8537 int bias_for_lowest = 1 - min_epilogue_iters;
8538 int bias_for_assumed = bias_for_lowest;
8539 int alignment_npeels = LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo);
8540 if (alignment_npeels && LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
8542 /* When the amount of peeling is known at compile time, the first
8543 iteration will have exactly alignment_npeels active elements.
8544 In the worst case it will have at least one. */
8545 int min_first_active = (alignment_npeels > 0 ? alignment_npeels : 1);
8546 bias_for_lowest += lowest_vf - min_first_active;
8547 bias_for_assumed += assumed_vf - min_first_active;
8549 /* In these calculations the "- 1" converts loop iteration counts
8550 back to latch counts. */
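/* Worked example (illustration only, hypothetical numbers): with a
   latch-count upper bound of 102 (at most 103 iterations), lowest_vf = 8,
   no peeling for gaps and no full masking, bias_for_lowest is 1 and the
   new bound is (102 + 1) / 8 - 1 = 11, i.e. at most 12 vector iterations
   covering 96 scalar ones.  With full masking the udiv_ceil form gives
   13 - 1 = 12, allowing a final partial vector.  */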
8551 if (loop->any_upper_bound)
8552 loop->nb_iterations_upper_bound
8553 = (final_iter_may_be_partial
8554 ? wi::udiv_ceil (loop->nb_iterations_upper_bound + bias_for_lowest,
8555 lowest_vf) - 1
8556 : wi::udiv_floor (loop->nb_iterations_upper_bound + bias_for_lowest,
8557 lowest_vf) - 1);
8558 if (loop->any_likely_upper_bound)
8559 loop->nb_iterations_likely_upper_bound
8560 = (final_iter_may_be_partial
8561 ? wi::udiv_ceil (loop->nb_iterations_likely_upper_bound
8562 + bias_for_lowest, lowest_vf) - 1
8563 : wi::udiv_floor (loop->nb_iterations_likely_upper_bound
8564 + bias_for_lowest, lowest_vf) - 1);
8565 if (loop->any_estimate)
8566 loop->nb_iterations_estimate
8567 = (final_iter_may_be_partial
8568 ? wi::udiv_ceil (loop->nb_iterations_estimate + bias_for_assumed,
8569 assumed_vf) - 1
8570 : wi::udiv_floor (loop->nb_iterations_estimate + bias_for_assumed,
8571 assumed_vf) - 1);
8573 if (dump_enabled_p ())
8575 if (!LOOP_VINFO_EPILOGUE_P (loop_vinfo))
8577 dump_printf_loc (MSG_NOTE, vect_location,
8578 "LOOP VECTORIZED\n");
8579 if (loop->inner)
8580 dump_printf_loc (MSG_NOTE, vect_location,
8581 "OUTER LOOP VECTORIZED\n");
8582 dump_printf (MSG_NOTE, "\n");
8584 else
8586 dump_printf_loc (MSG_NOTE, vect_location,
8587 "LOOP EPILOGUE VECTORIZED (VS=");
8588 dump_dec (MSG_NOTE, current_vector_size);
8589 dump_printf (MSG_NOTE, ")\n");
8593 /* Free SLP instances here because otherwise stmt reference counting
8594 won't work. */
8595 slp_instance instance;
8596 FOR_EACH_VEC_ELT (LOOP_VINFO_SLP_INSTANCES (loop_vinfo), i, instance)
8597 vect_free_slp_instance (instance, true);
8598 LOOP_VINFO_SLP_INSTANCES (loop_vinfo).release ();
8599 /* Clear the safelen field since its value is invalid after vectorization,
8600 as the vectorized loop can have loop-carried dependencies. */
8601 loop->safelen = 0;
8603 /* Don't vectorize epilogue for epilogue. */
8604 if (LOOP_VINFO_EPILOGUE_P (loop_vinfo))
8605 epilogue = NULL;
8607 if (!PARAM_VALUE (PARAM_VECT_EPILOGUES_NOMASK))
8608 epilogue = NULL;
8610 if (epilogue)
8612 auto_vector_sizes vector_sizes;
8613 targetm.vectorize.autovectorize_vector_sizes (&vector_sizes);
8614 unsigned int next_size = 0;
8616 if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
8617 && LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) >= 0
8618 && known_eq (vf, lowest_vf))
8620 unsigned int eiters
8621 = (LOOP_VINFO_INT_NITERS (loop_vinfo)
8622 - LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo));
8623 eiters = eiters % lowest_vf;
8624 epilogue->nb_iterations_upper_bound = eiters - 1;
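/* For example (illustration only): with 103 known iterations, no peeling
   for alignment and lowest_vf = 8, eiters is 103 % 8 = 7, so the epilogue
   loop runs at most 7 times (latch bound 6).  */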
8626 unsigned int ratio;
8627 while (next_size < vector_sizes.length ()
8628 && !(constant_multiple_p (current_vector_size,
8629 vector_sizes[next_size], &ratio)
8630 && eiters >= lowest_vf / ratio))
8631 next_size += 1;
8633 else
8634 while (next_size < vector_sizes.length ()
8635 && maybe_lt (current_vector_size, vector_sizes[next_size]))
8636 next_size += 1;
8638 if (next_size == vector_sizes.length ())
8639 epilogue = NULL;
8642 if (epilogue)
8644 epilogue->force_vectorize = loop->force_vectorize;
8645 epilogue->safelen = loop->safelen;
8646 epilogue->dont_vectorize = false;
8648 /* We may need to if-convert epilogue to vectorize it. */
8649 if (LOOP_VINFO_SCALAR_LOOP (loop_vinfo))
8650 tree_if_conversion (epilogue);
8653 return epilogue;
8656 /* The code below is trying to perform a simple optimization - revert
8657 if-conversion for masked stores, i.e. if the mask of a store is zero,
8658 do not perform it, and, where possible, skip the stored value producers as well.
8659 For example,
8660 for (i=0; i<n; i++)
8661 if (c[i])
8663 p1[i] += 1;
8664 p2[i] = p3[i] +2;
8666 this transformation will produce the following semi-hammock:
8668 if (!mask__ifc__42.18_165 == { 0, 0, 0, 0, 0, 0, 0, 0 })
8670 vect__11.19_170 = MASK_LOAD (vectp_p1.20_168, 0B, mask__ifc__42.18_165);
8671 vect__12.22_172 = vect__11.19_170 + vect_cst__171;
8672 MASK_STORE (vectp_p1.23_175, 0B, mask__ifc__42.18_165, vect__12.22_172);
8673 vect__18.25_182 = MASK_LOAD (vectp_p3.26_180, 0B, mask__ifc__42.18_165);
8674 vect__19.28_184 = vect__18.25_182 + vect_cst__183;
8675 MASK_STORE (vectp_p2.29_187, 0B, mask__ifc__42.18_165, vect__19.28_184);
8679 void
8680 optimize_mask_stores (struct loop *loop)
8682 basic_block *bbs = get_loop_body (loop);
8683 unsigned nbbs = loop->num_nodes;
8684 unsigned i;
8685 basic_block bb;
8686 struct loop *bb_loop;
8687 gimple_stmt_iterator gsi;
8688 gimple *stmt;
8689 auto_vec<gimple *> worklist;
8691 vect_location = find_loop_location (loop);
8692 /* Pick up all masked stores in loop if any. */
8693 for (i = 0; i < nbbs; i++)
8695 bb = bbs[i];
8696 for (gsi = gsi_start_bb (bb); !gsi_end_p (gsi);
8697 gsi_next (&gsi))
8699 stmt = gsi_stmt (gsi);
8700 if (gimple_call_internal_p (stmt, IFN_MASK_STORE))
8701 worklist.safe_push (stmt);
8705 free (bbs);
8706 if (worklist.is_empty ())
8707 return;
8709 /* Loop has masked stores. */
8710 while (!worklist.is_empty ())
8712 gimple *last, *last_store;
8713 edge e, efalse;
8714 tree mask;
8715 basic_block store_bb, join_bb;
8716 gimple_stmt_iterator gsi_to;
8717 tree vdef, new_vdef;
8718 gphi *phi;
8719 tree vectype;
8720 tree zero;
8722 last = worklist.pop ();
8723 mask = gimple_call_arg (last, 2);
8724 bb = gimple_bb (last);
8725 /* Create then_bb and if-then structure in CFG, then_bb belongs to
8726 the same loop as if_bb. It could be different from LOOP when a
8727 two-level loop nest is vectorized and the mask_store belongs to the
8728 inner one. */
8729 e = split_block (bb, last);
8730 bb_loop = bb->loop_father;
8731 gcc_assert (loop == bb_loop || flow_loop_nested_p (loop, bb_loop));
8732 join_bb = e->dest;
8733 store_bb = create_empty_bb (bb);
8734 add_bb_to_loop (store_bb, bb_loop);
8735 e->flags = EDGE_TRUE_VALUE;
8736 efalse = make_edge (bb, store_bb, EDGE_FALSE_VALUE);
8737 /* Put STORE_BB to likely part. */
8738 efalse->probability = profile_probability::unlikely ();
8739 store_bb->count = efalse->count ();
8740 make_single_succ_edge (store_bb, join_bb, EDGE_FALLTHRU);
8741 if (dom_info_available_p (CDI_DOMINATORS))
8742 set_immediate_dominator (CDI_DOMINATORS, store_bb, bb);
8743 if (dump_enabled_p ())
8744 dump_printf_loc (MSG_NOTE, vect_location,
8745 "Create new block %d to sink mask stores.",
8746 store_bb->index);
8747 /* Create vector comparison with boolean result. */
8748 vectype = TREE_TYPE (mask);
8749 zero = build_zero_cst (vectype);
8750 stmt = gimple_build_cond (EQ_EXPR, mask, zero, NULL_TREE, NULL_TREE);
8751 gsi = gsi_last_bb (bb);
8752 gsi_insert_after (&gsi, stmt, GSI_SAME_STMT);
8753 /* Create new PHI node for vdef of the last masked store:
8754 .MEM_2 = VDEF <.MEM_1>
8755 will be converted to
8756 .MEM.3 = VDEF <.MEM_1>
8757 and new PHI node will be created in join bb
8758 .MEM_2 = PHI <.MEM_1, .MEM_3>
8760 vdef = gimple_vdef (last);
8761 new_vdef = make_ssa_name (gimple_vop (cfun), last);
8762 gimple_set_vdef (last, new_vdef);
8763 phi = create_phi_node (vdef, join_bb);
8764 add_phi_arg (phi, new_vdef, EDGE_SUCC (store_bb, 0), UNKNOWN_LOCATION);
8766 /* Put all masked stores with the same mask to STORE_BB if possible. */
8767 while (true)
8769 gimple_stmt_iterator gsi_from;
8770 gimple *stmt1 = NULL;
8772 /* Move masked store to STORE_BB. */
8773 last_store = last;
8774 gsi = gsi_for_stmt (last);
8775 gsi_from = gsi;
8776 /* Shift GSI to the previous stmt for further traversal. */
8777 gsi_prev (&gsi);
8778 gsi_to = gsi_start_bb (store_bb);
8779 gsi_move_before (&gsi_from, &gsi_to);
8780 /* Setup GSI_TO to the non-empty block start. */
8781 gsi_to = gsi_start_bb (store_bb);
8782 if (dump_enabled_p ())
8784 dump_printf_loc (MSG_NOTE, vect_location,
8785 "Move stmt to created bb\n");
8786 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, last, 0);
8788 /* Move all stored value producers if possible. */
8789 while (!gsi_end_p (gsi))
8791 tree lhs;
8792 imm_use_iterator imm_iter;
8793 use_operand_p use_p;
8794 bool res;
8796 /* Skip debug statements. */
8797 if (is_gimple_debug (gsi_stmt (gsi)))
8799 gsi_prev (&gsi);
8800 continue;
8802 stmt1 = gsi_stmt (gsi);
8803 /* Do not consider statements writing to memory or having
8804 a volatile operand. */
8805 if (gimple_vdef (stmt1)
8806 || gimple_has_volatile_ops (stmt1))
8807 break;
8808 gsi_from = gsi;
8809 gsi_prev (&gsi);
8810 lhs = gimple_get_lhs (stmt1);
8811 if (!lhs)
8812 break;
8814 /* LHS of vectorized stmt must be SSA_NAME. */
8815 if (TREE_CODE (lhs) != SSA_NAME)
8816 break;
8818 if (!VECTOR_TYPE_P (TREE_TYPE (lhs)))
8820 /* Remove dead scalar statement. */
8821 if (has_zero_uses (lhs))
8823 gsi_remove (&gsi_from, true);
8824 continue;
8828 /* Check that LHS does not have uses outside of STORE_BB. */
8829 res = true;
8830 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, lhs)
8832 gimple *use_stmt;
8833 use_stmt = USE_STMT (use_p);
8834 if (is_gimple_debug (use_stmt))
8835 continue;
8836 if (gimple_bb (use_stmt) != store_bb)
8838 res = false;
8839 break;
8842 if (!res)
8843 break;
8845 if (gimple_vuse (stmt1)
8846 && gimple_vuse (stmt1) != gimple_vuse (last_store))
8847 break;
8849 /* Can move STMT1 to STORE_BB. */
8850 if (dump_enabled_p ())
8852 dump_printf_loc (MSG_NOTE, vect_location,
8853 "Move stmt to created bb\n");
8854 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, stmt1, 0);
8856 gsi_move_before (&gsi_from, &gsi_to);
8857 /* Shift GSI_TO for further insertion. */
8858 gsi_prev (&gsi_to);
8860 /* Put other masked stores with the same mask to STORE_BB. */
8861 if (worklist.is_empty ()
8862 || gimple_call_arg (worklist.last (), 2) != mask
8863 || worklist.last () != stmt1)
8864 break;
8865 last = worklist.pop ();
8867 add_phi_arg (phi, gimple_vuse (last_store), e, UNKNOWN_LOCATION);