[03/46] Remove unnecessary update of NUM_SLP_USES
[official-gcc.git] / gcc / tree-vect-loop.c
blob 9be3d31dd355b55775a400de0f58c3c88af6118f
1 /* Loop Vectorization
2 Copyright (C) 2003-2018 Free Software Foundation, Inc.
3 Contributed by Dorit Naishlos <dorit@il.ibm.com> and
4 Ira Rosen <irar@il.ibm.com>
6 This file is part of GCC.
8 GCC is free software; you can redistribute it and/or modify it under
9 the terms of the GNU General Public License as published by the Free
10 Software Foundation; either version 3, or (at your option) any later
11 version.
13 GCC is distributed in the hope that it will be useful, but WITHOUT ANY
14 WARRANTY; without even the implied warranty of MERCHANTABILITY or
15 FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
16 for more details.
18 You should have received a copy of the GNU General Public License
19 along with GCC; see the file COPYING3. If not see
20 <http://www.gnu.org/licenses/>. */
22 #include "config.h"
23 #include "system.h"
24 #include "coretypes.h"
25 #include "backend.h"
26 #include "target.h"
27 #include "rtl.h"
28 #include "tree.h"
29 #include "gimple.h"
30 #include "cfghooks.h"
31 #include "tree-pass.h"
32 #include "ssa.h"
33 #include "optabs-tree.h"
34 #include "diagnostic-core.h"
35 #include "fold-const.h"
36 #include "stor-layout.h"
37 #include "cfganal.h"
38 #include "gimplify.h"
39 #include "gimple-iterator.h"
40 #include "gimplify-me.h"
41 #include "tree-ssa-loop-ivopts.h"
42 #include "tree-ssa-loop-manip.h"
43 #include "tree-ssa-loop-niter.h"
44 #include "tree-ssa-loop.h"
45 #include "cfgloop.h"
46 #include "params.h"
47 #include "tree-scalar-evolution.h"
48 #include "tree-vectorizer.h"
49 #include "gimple-fold.h"
50 #include "cgraph.h"
51 #include "tree-cfg.h"
52 #include "tree-if-conv.h"
53 #include "internal-fn.h"
54 #include "tree-vector-builder.h"
55 #include "vec-perm-indices.h"
56 #include "tree-eh.h"
58 /* Loop Vectorization Pass.
60 This pass tries to vectorize loops.
62 For example, the vectorizer transforms the following simple loop:
64 short a[N]; short b[N]; short c[N]; int i;
66 for (i=0; i<N; i++){
67 a[i] = b[i] + c[i];
70 as if it were manually vectorized by rewriting the source code into:
72 typedef int __attribute__((mode(V8HI))) v8hi;
73 short a[N]; short b[N]; short c[N]; int i;
74 v8hi *pa = (v8hi*)a, *pb = (v8hi*)b, *pc = (v8hi*)c;
75 v8hi va, vb, vc;
77 for (i=0; i<N/8; i++){
78 vb = pb[i];
79 vc = pc[i];
80 va = vb + vc;
81 pa[i] = va;
84 The main entry to this pass is vectorize_loops(), in which
85 the vectorizer applies a set of analyses on a given set of loops,
86 followed by the actual vectorization transformation for the loops that
87 had successfully passed the analysis phase.
88 Throughout this pass we make a distinction between two types of
89 data: scalars (which are represented by SSA_NAMES), and memory references
90 ("data-refs"). These two types of data require different handling both
91 during analysis and transformation. The types of data-refs that the
92 vectorizer currently supports are ARRAY_REFS whose base is an array DECL
93 (not a pointer), and INDIRECT_REFS through pointers; both array and pointer
94 accesses are required to have a simple (consecutive) access pattern.
96 Analysis phase:
97 ===============
98 The driver for the analysis phase is vect_analyze_loop().
99 It applies a set of analyses, some of which rely on the scalar evolution
100 analyzer (scev) developed by Sebastian Pop.
102 During the analysis phase the vectorizer records some information
103 per stmt in a "stmt_vec_info" struct which is attached to each stmt in the
104 loop, as well as general information about the loop as a whole, which is
105 recorded in a "loop_vec_info" struct attached to each loop.
107 Transformation phase:
108 =====================
109 The loop transformation phase scans all the stmts in the loop, and
110 creates a vector stmt (or a sequence of stmts) for each scalar stmt S in
111 the loop that needs to be vectorized. It inserts the vector code sequence
112 just before the scalar stmt S, and records a pointer to the vector code
113 in STMT_VINFO_VEC_STMT (stmt_info) (stmt_info is the stmt_vec_info struct
114 attached to S). This pointer will be used for the vectorization of following
115 stmts which use the def of stmt S. Stmt S is removed if it writes to memory;
116 otherwise, we rely on dead code elimination for removing it.
118 For example, say stmt S1 was vectorized into stmt VS1:
120 VS1: vb = px[i];
121 S1: b = x[i]; STMT_VINFO_VEC_STMT (stmt_info (S1)) = VS1
122 S2: a = b;
124 To vectorize stmt S2, the vectorizer first finds the stmt that defines
125 the operand 'b' (S1), and gets the relevant vector def 'vb' from the
126 vector stmt VS1 pointed to by STMT_VINFO_VEC_STMT (stmt_info (S1)). The
127 resulting sequence would be:
129 VS1: vb = px[i];
130 S1: b = x[i]; STMT_VINFO_VEC_STMT (stmt_info (S1)) = VS1
131 VS2: va = vb;
132 S2: a = b; STMT_VINFO_VEC_STMT (stmt_info (S2)) = VS2
134 Operands that are not SSA_NAMEs are data-refs that appear in
135 load/store operations (like 'x[i]' in S1), and are handled differently.
137 Target modeling:
138 =================
139 Currently the only target specific information that is used is the
140 size of the vector (in bytes) - "TARGET_VECTORIZE_UNITS_PER_SIMD_WORD".
141 Targets that can support different sizes of vectors will, for now, need
142 to specify one value for "TARGET_VECTORIZE_UNITS_PER_SIMD_WORD". More
143 flexibility will be added in the future.
145 Since we only vectorize operations whose vector form can be
146 expressed using existing tree codes, to verify that an operation is
147 supported, the vectorizer checks the relevant optab at the relevant
148 machine_mode (e.g., optab_handler (add_optab, V8HImode)). If
149 the value found is CODE_FOR_nothing, then there's no target support, and
150 we can't vectorize the stmt.
152 For additional information on this project see:
153 http://gcc.gnu.org/projects/tree-ssa/vectorization.html
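/* For instance (an illustrative snippet, not code from this pass), support
   for adding two V8HImode vectors would be queried as:

     if (optab_handler (add_optab, V8HImode) == CODE_FOR_nothing)
       return false;

   where CODE_FOR_nothing means the target has no matching instruction.  */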
156 static void vect_estimate_min_profitable_iters (loop_vec_info, int *, int *);
158 /* Subroutine of vect_determine_vf_for_stmt that handles only one
159 statement. VECTYPE_MAYBE_SET_P is true if STMT_VINFO_VECTYPE
160 may already be set for general statements (not just data refs). */
162 static bool
163 vect_determine_vf_for_stmt_1 (stmt_vec_info stmt_info,
164 bool vectype_maybe_set_p,
165 poly_uint64 *vf,
166 vec<stmt_vec_info > *mask_producers)
168 gimple *stmt = stmt_info->stmt;
170 if ((!STMT_VINFO_RELEVANT_P (stmt_info)
171 && !STMT_VINFO_LIVE_P (stmt_info))
172 || gimple_clobber_p (stmt))
174 if (dump_enabled_p ())
175 dump_printf_loc (MSG_NOTE, vect_location, "skip.\n");
176 return true;
179 tree stmt_vectype, nunits_vectype;
180 if (!vect_get_vector_types_for_stmt (stmt_info, &stmt_vectype,
181 &nunits_vectype))
182 return false;
184 if (stmt_vectype)
186 if (STMT_VINFO_VECTYPE (stmt_info))
187 /* The only case when a vectype has already been set is for stmts
188 that contain a data ref, or for "pattern-stmts" (stmts generated
189 by the vectorizer to represent/replace a certain idiom). */
190 gcc_assert ((STMT_VINFO_DATA_REF (stmt_info)
191 || vectype_maybe_set_p)
192 && STMT_VINFO_VECTYPE (stmt_info) == stmt_vectype);
193 else if (stmt_vectype == boolean_type_node)
194 mask_producers->safe_push (stmt_info);
195 else
196 STMT_VINFO_VECTYPE (stmt_info) = stmt_vectype;
199 if (nunits_vectype)
200 vect_update_max_nunits (vf, nunits_vectype);
202 return true;
205 /* Subroutine of vect_determine_vectorization_factor. Set the vector
206 types of STMT_INFO and all attached pattern statements and update
207 the vectorization factor VF accordingly. If some of the statements
208 produce a mask result whose vector type can only be calculated later,
209 add them to MASK_PRODUCERS. Return true on success or false if
210 something prevented vectorization. */
212 static bool
213 vect_determine_vf_for_stmt (stmt_vec_info stmt_info, poly_uint64 *vf,
214 vec<stmt_vec_info > *mask_producers)
216 if (dump_enabled_p ())
218 dump_printf_loc (MSG_NOTE, vect_location, "==> examining statement: ");
219 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, stmt_info->stmt, 0);
221 if (!vect_determine_vf_for_stmt_1 (stmt_info, false, vf, mask_producers))
222 return false;
224 if (STMT_VINFO_IN_PATTERN_P (stmt_info)
225 && STMT_VINFO_RELATED_STMT (stmt_info))
227 gimple *pattern_def_seq = STMT_VINFO_PATTERN_DEF_SEQ (stmt_info);
228 stmt_info = vinfo_for_stmt (STMT_VINFO_RELATED_STMT (stmt_info));
230 /* If a pattern statement has def stmts, analyze them too. */
231 for (gimple_stmt_iterator si = gsi_start (pattern_def_seq);
232 !gsi_end_p (si); gsi_next (&si))
234 stmt_vec_info def_stmt_info = vinfo_for_stmt (gsi_stmt (si));
235 if (dump_enabled_p ())
237 dump_printf_loc (MSG_NOTE, vect_location,
238 "==> examining pattern def stmt: ");
239 dump_gimple_stmt (MSG_NOTE, TDF_SLIM,
240 def_stmt_info->stmt, 0);
242 if (!vect_determine_vf_for_stmt_1 (def_stmt_info, true,
243 vf, mask_producers))
244 return false;
247 if (dump_enabled_p ())
249 dump_printf_loc (MSG_NOTE, vect_location,
250 "==> examining pattern statement: ");
251 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, stmt_info->stmt, 0);
253 if (!vect_determine_vf_for_stmt_1 (stmt_info, true, vf, mask_producers))
254 return false;
257 return true;
260 /* Function vect_determine_vectorization_factor
262 Determine the vectorization factor (VF). VF is the number of data elements
263 that are operated upon in parallel in a single iteration of the vectorized
264 loop. For example, when vectorizing a loop that operates on 4-byte elements,
265 on a target with a vector size (VS) of 16 bytes, the VF is set to 4, since 4
266 elements can fit in a single vector register.
268 We currently support vectorization of loops in which all types operated upon
269 are of the same size. Therefore this function currently sets VF according to
270 the size of the types operated upon, and fails if there are multiple sizes
271 in the loop.
273 VF is also the factor by which the loop iterations are strip-mined, e.g.:
274 original loop:
275 for (i=0; i<N; i++){
276 a[i] = b[i] + c[i];
279 vectorized loop:
280 for (i=0; i<N; i+=VF){
281 a[i:VF] = b[i:VF] + c[i:VF];
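/* Illustrative only (a hypothetical helper, not part of GCC): the strip-mined
   loop above written out in plain C for VF == 4.  The first loop is the
   vector body, processing 4 lanes per iteration; the second is the scalar
   epilogue covering the N % VF remainder:

     void
     example_strip_mine (int *a, int *b, int *c, int n)
     {
       int i = 0;
       for (; i + 4 <= n; i += 4)
         for (int j = 0; j < 4; j++)
           a[i + j] = b[i + j] + c[i + j];
       for (; i < n; i++)
         a[i] = b[i] + c[i];
     }  */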
285 static bool
286 vect_determine_vectorization_factor (loop_vec_info loop_vinfo)
288 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
289 basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
290 unsigned nbbs = loop->num_nodes;
291 poly_uint64 vectorization_factor = 1;
292 tree scalar_type = NULL_TREE;
293 gphi *phi;
294 tree vectype;
295 stmt_vec_info stmt_info;
296 unsigned i;
297 auto_vec<stmt_vec_info> mask_producers;
299 DUMP_VECT_SCOPE ("vect_determine_vectorization_factor");
301 for (i = 0; i < nbbs; i++)
303 basic_block bb = bbs[i];
305 for (gphi_iterator si = gsi_start_phis (bb); !gsi_end_p (si);
306 gsi_next (&si))
308 phi = si.phi ();
309 stmt_info = vinfo_for_stmt (phi);
310 if (dump_enabled_p ())
312 dump_printf_loc (MSG_NOTE, vect_location, "==> examining phi: ");
313 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, phi, 0);
316 gcc_assert (stmt_info);
318 if (STMT_VINFO_RELEVANT_P (stmt_info)
319 || STMT_VINFO_LIVE_P (stmt_info))
321 gcc_assert (!STMT_VINFO_VECTYPE (stmt_info));
322 scalar_type = TREE_TYPE (PHI_RESULT (phi));
324 if (dump_enabled_p ())
326 dump_printf_loc (MSG_NOTE, vect_location,
327 "get vectype for scalar type: ");
328 dump_generic_expr (MSG_NOTE, TDF_SLIM, scalar_type);
329 dump_printf (MSG_NOTE, "\n");
332 vectype = get_vectype_for_scalar_type (scalar_type);
333 if (!vectype)
335 if (dump_enabled_p ())
337 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
338 "not vectorized: unsupported "
339 "data-type ");
340 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
341 scalar_type);
342 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
344 return false;
346 STMT_VINFO_VECTYPE (stmt_info) = vectype;
348 if (dump_enabled_p ())
350 dump_printf_loc (MSG_NOTE, vect_location, "vectype: ");
351 dump_generic_expr (MSG_NOTE, TDF_SLIM, vectype);
352 dump_printf (MSG_NOTE, "\n");
355 if (dump_enabled_p ())
357 dump_printf_loc (MSG_NOTE, vect_location, "nunits = ");
358 dump_dec (MSG_NOTE, TYPE_VECTOR_SUBPARTS (vectype));
359 dump_printf (MSG_NOTE, "\n");
362 vect_update_max_nunits (&vectorization_factor, vectype);
366 for (gimple_stmt_iterator si = gsi_start_bb (bb); !gsi_end_p (si);
367 gsi_next (&si))
369 stmt_info = vinfo_for_stmt (gsi_stmt (si));
370 if (!vect_determine_vf_for_stmt (stmt_info, &vectorization_factor,
371 &mask_producers))
372 return false;
376 /* TODO: Analyze cost. Decide if worthwhile to vectorize. */
377 if (dump_enabled_p ())
379 dump_printf_loc (MSG_NOTE, vect_location, "vectorization factor = ");
380 dump_dec (MSG_NOTE, vectorization_factor);
381 dump_printf (MSG_NOTE, "\n");
384 if (known_le (vectorization_factor, 1U))
386 if (dump_enabled_p ())
387 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
388 "not vectorized: unsupported data-type\n");
389 return false;
391 LOOP_VINFO_VECT_FACTOR (loop_vinfo) = vectorization_factor;
393 for (i = 0; i < mask_producers.length (); i++)
395 stmt_info = mask_producers[i];
396 tree mask_type = vect_get_mask_type_for_stmt (stmt_info);
397 if (!mask_type)
398 return false;
399 STMT_VINFO_VECTYPE (stmt_info) = mask_type;
402 return true;
406 /* Function vect_is_simple_iv_evolution.
408 FORNOW: A simple evolution of an induction variable in the loop is
409 considered a polynomial evolution. */
411 static bool
412 vect_is_simple_iv_evolution (unsigned loop_nb, tree access_fn, tree * init,
413 tree * step)
415 tree init_expr;
416 tree step_expr;
417 tree evolution_part = evolution_part_in_loop_num (access_fn, loop_nb);
418 basic_block bb;
420 /* When there is no evolution in this loop, the evolution function
421 is not "simple". */
422 if (evolution_part == NULL_TREE)
423 return false;
425 /* When the evolution is a polynomial of degree >= 2
426 the evolution function is not "simple". */
427 if (tree_is_chrec (evolution_part))
428 return false;
430 step_expr = evolution_part;
431 init_expr = unshare_expr (initial_condition_in_loop_num (access_fn, loop_nb));
433 if (dump_enabled_p ())
435 dump_printf_loc (MSG_NOTE, vect_location, "step: ");
436 dump_generic_expr (MSG_NOTE, TDF_SLIM, step_expr);
437 dump_printf (MSG_NOTE, ", init: ");
438 dump_generic_expr (MSG_NOTE, TDF_SLIM, init_expr);
439 dump_printf (MSG_NOTE, "\n");
442 *init = init_expr;
443 *step = step_expr;
445 if (TREE_CODE (step_expr) != INTEGER_CST
446 && (TREE_CODE (step_expr) != SSA_NAME
447 || ((bb = gimple_bb (SSA_NAME_DEF_STMT (step_expr)))
448 && flow_bb_inside_loop_p (get_loop (cfun, loop_nb), bb))
449 || (!INTEGRAL_TYPE_P (TREE_TYPE (step_expr))
450 && (!SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr))
451 || !flag_associative_math)))
452 && (TREE_CODE (step_expr) != REAL_CST
453 || !flag_associative_math))
455 if (dump_enabled_p ())
456 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
457 "step unknown.\n");
458 return false;
461 return true;
464 /* Function vect_analyze_scalar_cycles_1.
466 Examine the cross iteration def-use cycles of scalar variables
467 in LOOP. LOOP_VINFO represents the loop that is now being
468 considered for vectorization (can be LOOP, or an outer-loop
469 enclosing LOOP). */
471 static void
472 vect_analyze_scalar_cycles_1 (loop_vec_info loop_vinfo, struct loop *loop)
474 basic_block bb = loop->header;
475 tree init, step;
476 auto_vec<gimple *, 64> worklist;
477 gphi_iterator gsi;
478 bool double_reduc;
480 DUMP_VECT_SCOPE ("vect_analyze_scalar_cycles");
482 /* First - identify all inductions. Reduction detection assumes that all the
483 inductions have been identified; therefore, this order must not be
484 changed. */
485 for (gsi = gsi_start_phis (bb); !gsi_end_p (gsi); gsi_next (&gsi))
487 gphi *phi = gsi.phi ();
488 tree access_fn = NULL;
489 tree def = PHI_RESULT (phi);
490 stmt_vec_info stmt_vinfo = vinfo_for_stmt (phi);
492 if (dump_enabled_p ())
494 dump_printf_loc (MSG_NOTE, vect_location, "Analyze phi: ");
495 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, phi, 0);
498 /* Skip virtual phis. The data dependences that are associated with
499 virtual defs/uses (i.e., memory accesses) are analyzed elsewhere. */
500 if (virtual_operand_p (def))
501 continue;
503 STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_unknown_def_type;
505 /* Analyze the evolution function. */
506 access_fn = analyze_scalar_evolution (loop, def);
507 if (access_fn)
509 STRIP_NOPS (access_fn);
510 if (dump_enabled_p ())
512 dump_printf_loc (MSG_NOTE, vect_location,
513 "Access function of PHI: ");
514 dump_generic_expr (MSG_NOTE, TDF_SLIM, access_fn);
515 dump_printf (MSG_NOTE, "\n");
517 STMT_VINFO_LOOP_PHI_EVOLUTION_BASE_UNCHANGED (stmt_vinfo)
518 = initial_condition_in_loop_num (access_fn, loop->num);
519 STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_vinfo)
520 = evolution_part_in_loop_num (access_fn, loop->num);
523 if (!access_fn
524 || !vect_is_simple_iv_evolution (loop->num, access_fn, &init, &step)
525 || (LOOP_VINFO_LOOP (loop_vinfo) != loop
526 && TREE_CODE (step) != INTEGER_CST))
528 worklist.safe_push (phi);
529 continue;
532 gcc_assert (STMT_VINFO_LOOP_PHI_EVOLUTION_BASE_UNCHANGED (stmt_vinfo)
533 != NULL_TREE);
534 gcc_assert (STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_vinfo) != NULL_TREE);
536 if (dump_enabled_p ())
537 dump_printf_loc (MSG_NOTE, vect_location, "Detected induction.\n");
538 STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_induction_def;
542 /* Second - identify all reductions and nested cycles. */
543 while (worklist.length () > 0)
545 gimple *phi = worklist.pop ();
546 tree def = PHI_RESULT (phi);
547 stmt_vec_info stmt_vinfo = vinfo_for_stmt (phi);
548 gimple *reduc_stmt;
550 if (dump_enabled_p ())
552 dump_printf_loc (MSG_NOTE, vect_location, "Analyze phi: ");
553 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, phi, 0);
556 gcc_assert (!virtual_operand_p (def)
557 && STMT_VINFO_DEF_TYPE (stmt_vinfo) == vect_unknown_def_type);
559 reduc_stmt = vect_force_simple_reduction (loop_vinfo, phi,
560 &double_reduc, false);
561 if (reduc_stmt)
563 if (double_reduc)
565 if (dump_enabled_p ())
566 dump_printf_loc (MSG_NOTE, vect_location,
567 "Detected double reduction.\n");
569 STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_double_reduction_def;
570 STMT_VINFO_DEF_TYPE (vinfo_for_stmt (reduc_stmt)) =
571 vect_double_reduction_def;
573 else
575 if (loop != LOOP_VINFO_LOOP (loop_vinfo))
577 if (dump_enabled_p ())
578 dump_printf_loc (MSG_NOTE, vect_location,
579 "Detected vectorizable nested cycle.\n");
581 STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_nested_cycle;
582 STMT_VINFO_DEF_TYPE (vinfo_for_stmt (reduc_stmt)) =
583 vect_nested_cycle;
585 else
587 if (dump_enabled_p ())
588 dump_printf_loc (MSG_NOTE, vect_location,
589 "Detected reduction.\n");
591 STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_reduction_def;
592 STMT_VINFO_DEF_TYPE (vinfo_for_stmt (reduc_stmt)) =
593 vect_reduction_def;
594 /* Store the reduction cycles for possible vectorization in
595 loop-aware SLP if it was not detected as reduction
596 chain. */
597 if (! REDUC_GROUP_FIRST_ELEMENT (vinfo_for_stmt (reduc_stmt)))
598 LOOP_VINFO_REDUCTIONS (loop_vinfo).safe_push (reduc_stmt);
602 else
603 if (dump_enabled_p ())
604 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
605 "Unknown def-use cycle pattern.\n");
610 /* Function vect_analyze_scalar_cycles.
612 Examine the cross iteration def-use cycles of scalar variables, by
613 analyzing the loop-header PHIs of scalar variables. Classify each
614 cycle as one of the following: invariant, induction, reduction, unknown.
615 We do that for the loop represented by LOOP_VINFO, and also for its
616 inner-loop, if it exists.
617 Examples for scalar cycles:
619 Example1: reduction:
621 loop1:
622 for (i=0; i<N; i++)
623 sum += a[i];
625 Example2: induction:
627 loop2:
628 for (i=0; i<N; i++)
629 a[i] = i; */
631 static void
632 vect_analyze_scalar_cycles (loop_vec_info loop_vinfo)
634 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
636 vect_analyze_scalar_cycles_1 (loop_vinfo, loop);
638 /* When vectorizing an outer-loop, the inner-loop is executed sequentially.
639 Reductions in such an inner-loop therefore have different properties than
640 the reductions in the nest that gets vectorized:
641 1. When vectorized, they are executed in the same order as in the original
642 scalar loop, so we can't change the order of computation when
643 vectorizing them.
644 2. FIXME: Inner-loop reductions can be used in the inner-loop, so the
645 current checks are too strict. */
647 if (loop->inner)
648 vect_analyze_scalar_cycles_1 (loop_vinfo, loop->inner);
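/* Illustrative only (hypothetical source, not taken from GCC's testsuite):
   a double reduction as classified above, where the inner-loop sum is
   itself accumulated across the outer loop being vectorized:

     int
     example_double_reduction (int a[8][8])
     {
       int sum = 0;
       for (int i = 0; i < 8; i++)
         for (int j = 0; j < 8; j++)
           sum += a[i][j];
       return sum;
     }  */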
651 /* Transfer group and reduction information from STMT to its pattern stmt. */
653 static void
654 vect_fixup_reduc_chain (gimple *stmt)
656 gimple *firstp = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (stmt));
657 gimple *stmtp;
658 gcc_assert (!REDUC_GROUP_FIRST_ELEMENT (vinfo_for_stmt (firstp))
659 && REDUC_GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)));
660 REDUC_GROUP_SIZE (vinfo_for_stmt (firstp))
661 = REDUC_GROUP_SIZE (vinfo_for_stmt (stmt));
664 stmtp = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (stmt));
665 REDUC_GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmtp)) = firstp;
666 stmt = REDUC_GROUP_NEXT_ELEMENT (vinfo_for_stmt (stmt));
667 if (stmt)
668 REDUC_GROUP_NEXT_ELEMENT (vinfo_for_stmt (stmtp))
669 = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (stmt));
671 while (stmt);
672 STMT_VINFO_DEF_TYPE (vinfo_for_stmt (stmtp)) = vect_reduction_def;
675 /* Fixup scalar cycles that now have their stmts detected as patterns. */
677 static void
678 vect_fixup_scalar_cycles_with_patterns (loop_vec_info loop_vinfo)
680 gimple *first;
681 unsigned i;
683 FOR_EACH_VEC_ELT (LOOP_VINFO_REDUCTION_CHAINS (loop_vinfo), i, first)
684 if (STMT_VINFO_IN_PATTERN_P (vinfo_for_stmt (first)))
686 gimple *next = REDUC_GROUP_NEXT_ELEMENT (vinfo_for_stmt (first));
687 while (next)
689 if (! STMT_VINFO_IN_PATTERN_P (vinfo_for_stmt (next)))
690 break;
691 next = REDUC_GROUP_NEXT_ELEMENT (vinfo_for_stmt (next));
693 /* If not all stmts in the chain are patterns, try to handle
694 the chain without patterns. */
695 if (! next)
697 vect_fixup_reduc_chain (first);
698 LOOP_VINFO_REDUCTION_CHAINS (loop_vinfo)[i]
699 = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (first));
704 /* Function vect_get_loop_niters.
706 Determine the number of iterations the loop executes and place it
707 in NUMBER_OF_ITERATIONS. Place the number of latch iterations
708 in NUMBER_OF_ITERATIONSM1. Place the condition under which the
709 niter information holds in ASSUMPTIONS.
711 Return the loop exit condition. */
714 static gcond *
715 vect_get_loop_niters (struct loop *loop, tree *assumptions,
716 tree *number_of_iterations, tree *number_of_iterationsm1)
718 edge exit = single_exit (loop);
719 struct tree_niter_desc niter_desc;
720 tree niter_assumptions, niter, may_be_zero;
721 gcond *cond = get_loop_exit_condition (loop);
723 *assumptions = boolean_true_node;
724 *number_of_iterationsm1 = chrec_dont_know;
725 *number_of_iterations = chrec_dont_know;
726 DUMP_VECT_SCOPE ("get_loop_niters");
728 if (!exit)
729 return cond;
731 niter = chrec_dont_know;
732 may_be_zero = NULL_TREE;
733 niter_assumptions = boolean_true_node;
734 if (!number_of_iterations_exit_assumptions (loop, exit, &niter_desc, NULL)
735 || chrec_contains_undetermined (niter_desc.niter))
736 return cond;
738 niter_assumptions = niter_desc.assumptions;
739 may_be_zero = niter_desc.may_be_zero;
740 niter = niter_desc.niter;
742 if (may_be_zero && integer_zerop (may_be_zero))
743 may_be_zero = NULL_TREE;
745 if (may_be_zero)
747 if (COMPARISON_CLASS_P (may_be_zero))
749 /* Try to combine may_be_zero with assumptions, this can simplify
750 computation of niter expression. */
751 if (niter_assumptions && !integer_nonzerop (niter_assumptions))
752 niter_assumptions = fold_build2 (TRUTH_AND_EXPR, boolean_type_node,
753 niter_assumptions,
754 fold_build1 (TRUTH_NOT_EXPR,
755 boolean_type_node,
756 may_be_zero));
757 else
758 niter = fold_build3 (COND_EXPR, TREE_TYPE (niter), may_be_zero,
759 build_int_cst (TREE_TYPE (niter), 0),
760 rewrite_to_non_trapping_overflow (niter));
762 may_be_zero = NULL_TREE;
764 else if (integer_nonzerop (may_be_zero))
766 *number_of_iterationsm1 = build_int_cst (TREE_TYPE (niter), 0);
767 *number_of_iterations = build_int_cst (TREE_TYPE (niter), 1);
768 return cond;
770 else
771 return cond;
774 *assumptions = niter_assumptions;
775 *number_of_iterationsm1 = niter;
777 /* We want the number of loop header executions, which is the number
778 of latch executions plus one.
779 ??? For UINT_MAX latch executions this number overflows to zero
780 for loops like do { n++; } while (n != 0); */
781 if (niter && !chrec_contains_undetermined (niter))
782 niter = fold_build2 (PLUS_EXPR, TREE_TYPE (niter), unshare_expr (niter),
783 build_int_cst (TREE_TYPE (niter), 1));
784 *number_of_iterations = niter;
786 return cond;
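/* Worked example for the ??? note above (assuming a 32-bit unsigned n that
   starts at 0): in "do { n++; } while (n != 0)" the latch executes UINT_MAX
   times, so NUMBER_OF_ITERATIONSM1 is UINT_MAX while NUMBER_OF_ITERATIONS,
   computed as latch count + 1, wraps around to 0.  */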
789 /* Function bb_in_loop_p
791 Used as predicate for dfs order traversal of the loop bbs. */
793 static bool
794 bb_in_loop_p (const_basic_block bb, const void *data)
796 const struct loop *const loop = (const struct loop *)data;
797 if (flow_bb_inside_loop_p (loop, bb))
798 return true;
799 return false;
803 /* Create and initialize a new loop_vec_info struct for LOOP_IN, as well as
804 stmt_vec_info structs for all the stmts in LOOP_IN. */
806 _loop_vec_info::_loop_vec_info (struct loop *loop_in, vec_info_shared *shared)
807 : vec_info (vec_info::loop, init_cost (loop_in), shared),
808 loop (loop_in),
809 bbs (XCNEWVEC (basic_block, loop->num_nodes)),
810 num_itersm1 (NULL_TREE),
811 num_iters (NULL_TREE),
812 num_iters_unchanged (NULL_TREE),
813 num_iters_assumptions (NULL_TREE),
814 th (0),
815 versioning_threshold (0),
816 vectorization_factor (0),
817 max_vectorization_factor (0),
818 mask_skip_niters (NULL_TREE),
819 mask_compare_type (NULL_TREE),
820 unaligned_dr (NULL),
821 peeling_for_alignment (0),
822 ptr_mask (0),
823 ivexpr_map (NULL),
824 slp_unrolling_factor (1),
825 single_scalar_iteration_cost (0),
826 vectorizable (false),
827 can_fully_mask_p (true),
828 fully_masked_p (false),
829 peeling_for_gaps (false),
830 peeling_for_niter (false),
831 operands_swapped (false),
832 no_data_dependencies (false),
833 has_mask_store (false),
834 scalar_loop (NULL),
835 orig_loop_info (NULL)
837 /* Create/Update stmt_info for all stmts in the loop. */
838 basic_block *body = get_loop_body (loop);
839 for (unsigned int i = 0; i < loop->num_nodes; i++)
841 basic_block bb = body[i];
842 gimple_stmt_iterator si;
844 for (si = gsi_start_phis (bb); !gsi_end_p (si); gsi_next (&si))
846 gimple *phi = gsi_stmt (si);
847 gimple_set_uid (phi, 0);
848 set_vinfo_for_stmt (phi, new_stmt_vec_info (phi, this));
851 for (si = gsi_start_bb (bb); !gsi_end_p (si); gsi_next (&si))
853 gimple *stmt = gsi_stmt (si);
854 gimple_set_uid (stmt, 0);
855 set_vinfo_for_stmt (stmt, new_stmt_vec_info (stmt, this));
858 free (body);
860 /* CHECKME: We want to visit all BBs before their successors (except for
861 latch blocks, for which this assertion wouldn't hold). In the simple
862 case of the loop forms we allow, a dfs order of the BBs would be the same
863 as reversed postorder traversal, so we are safe. */
865 unsigned int nbbs = dfs_enumerate_from (loop->header, 0, bb_in_loop_p,
866 bbs, loop->num_nodes, loop);
867 gcc_assert (nbbs == loop->num_nodes);
870 /* Free all levels of MASKS. */
872 void
873 release_vec_loop_masks (vec_loop_masks *masks)
875 rgroup_masks *rgm;
876 unsigned int i;
877 FOR_EACH_VEC_ELT (*masks, i, rgm)
878 rgm->masks.release ();
879 masks->release ();
882 /* Free all memory used by the _loop_vec_info, as well as all the
883 stmt_vec_info structs of all the stmts in the loop. */
885 _loop_vec_info::~_loop_vec_info ()
887 int nbbs;
888 gimple_stmt_iterator si;
889 int j;
891 /* ??? We're releasing loop_vinfos en bloc. */
892 set_stmt_vec_info_vec (&stmt_vec_infos);
893 nbbs = loop->num_nodes;
894 for (j = 0; j < nbbs; j++)
896 basic_block bb = bbs[j];
897 for (si = gsi_start_phis (bb); !gsi_end_p (si); gsi_next (&si))
898 free_stmt_vec_info (gsi_stmt (si));
900 for (si = gsi_start_bb (bb); !gsi_end_p (si); )
902 gimple *stmt = gsi_stmt (si);
904 /* We may have broken canonical form by moving a constant
905 into RHS1 of a commutative op. Fix such occurrences. */
906 if (operands_swapped && is_gimple_assign (stmt))
908 enum tree_code code = gimple_assign_rhs_code (stmt);
910 if ((code == PLUS_EXPR
911 || code == POINTER_PLUS_EXPR
912 || code == MULT_EXPR)
913 && CONSTANT_CLASS_P (gimple_assign_rhs1 (stmt)))
914 swap_ssa_operands (stmt,
915 gimple_assign_rhs1_ptr (stmt),
916 gimple_assign_rhs2_ptr (stmt));
917 else if (code == COND_EXPR
918 && CONSTANT_CLASS_P (gimple_assign_rhs2 (stmt)))
920 tree cond_expr = gimple_assign_rhs1 (stmt);
921 enum tree_code cond_code = TREE_CODE (cond_expr);
923 if (TREE_CODE_CLASS (cond_code) == tcc_comparison)
925 bool honor_nans = HONOR_NANS (TREE_OPERAND (cond_expr,
926 0));
927 cond_code = invert_tree_comparison (cond_code,
928 honor_nans);
929 if (cond_code != ERROR_MARK)
931 TREE_SET_CODE (cond_expr, cond_code);
932 swap_ssa_operands (stmt,
933 gimple_assign_rhs2_ptr (stmt),
934 gimple_assign_rhs3_ptr (stmt));
940 /* Free stmt_vec_info. */
941 free_stmt_vec_info (stmt);
942 gsi_next (&si);
946 free (bbs);
948 release_vec_loop_masks (&masks);
949 delete ivexpr_map;
951 loop->aux = NULL;
954 /* Return an invariant or register for EXPR and emit necessary
955 computations in the LOOP_VINFO loop preheader. */
957 tree
958 cse_and_gimplify_to_preheader (loop_vec_info loop_vinfo, tree expr)
960 if (is_gimple_reg (expr)
961 || is_gimple_min_invariant (expr))
962 return expr;
964 if (! loop_vinfo->ivexpr_map)
965 loop_vinfo->ivexpr_map = new hash_map<tree_operand_hash, tree>;
966 tree &cached = loop_vinfo->ivexpr_map->get_or_insert (expr);
967 if (! cached)
969 gimple_seq stmts = NULL;
970 cached = force_gimple_operand (unshare_expr (expr),
971 &stmts, true, NULL_TREE);
972 if (stmts)
974 edge e = loop_preheader_edge (LOOP_VINFO_LOOP (loop_vinfo));
975 gsi_insert_seq_on_edge_immediate (e, stmts);
978 return cached;
981 /* Return true if we can use CMP_TYPE as the comparison type to produce
982 all masks required to mask LOOP_VINFO. */
984 static bool
985 can_produce_all_loop_masks_p (loop_vec_info loop_vinfo, tree cmp_type)
987 rgroup_masks *rgm;
988 unsigned int i;
989 FOR_EACH_VEC_ELT (LOOP_VINFO_MASKS (loop_vinfo), i, rgm)
990 if (rgm->mask_type != NULL_TREE
991 && !direct_internal_fn_supported_p (IFN_WHILE_ULT,
992 cmp_type, rgm->mask_type,
993 OPTIMIZE_FOR_SPEED))
994 return false;
995 return true;
998 /* Calculate the maximum number of scalars per iteration for every
999 rgroup in LOOP_VINFO. */
1001 static unsigned int
1002 vect_get_max_nscalars_per_iter (loop_vec_info loop_vinfo)
1004 unsigned int res = 1;
1005 unsigned int i;
1006 rgroup_masks *rgm;
1007 FOR_EACH_VEC_ELT (LOOP_VINFO_MASKS (loop_vinfo), i, rgm)
1008 res = MAX (res, rgm->max_nscalars_per_iter);
1009 return res;
1012 /* Each statement in LOOP_VINFO can be masked where necessary. Check
1013 whether we can actually generate the masks required. Return true if so,
1014 storing the type of the scalar IV in LOOP_VINFO_MASK_COMPARE_TYPE. */
1016 static bool
1017 vect_verify_full_masking (loop_vec_info loop_vinfo)
1019 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1020 unsigned int min_ni_width;
1022 /* Use a normal loop if there are no statements that need masking.
1023 This only happens in rare degenerate cases: it means that the loop
1024 has no loads, no stores, and no live-out values. */
1025 if (LOOP_VINFO_MASKS (loop_vinfo).is_empty ())
1026 return false;
1028 /* Get the maximum number of iterations that is representable
1029 in the counter type. */
1030 tree ni_type = TREE_TYPE (LOOP_VINFO_NITERSM1 (loop_vinfo));
1031 widest_int max_ni = wi::to_widest (TYPE_MAX_VALUE (ni_type)) + 1;
1033 /* Get a more refined estimate for the number of iterations. */
1034 widest_int max_back_edges;
1035 if (max_loop_iterations (loop, &max_back_edges))
1036 max_ni = wi::smin (max_ni, max_back_edges + 1);
1038 /* Account for rgroup masks, in which each bit is replicated N times. */
1039 max_ni *= vect_get_max_nscalars_per_iter (loop_vinfo);
1041 /* Work out how many bits we need to represent the limit. */
1042 min_ni_width = wi::min_precision (max_ni, UNSIGNED);
1044 /* Find a scalar mode for which WHILE_ULT is supported. */
1045 opt_scalar_int_mode cmp_mode_iter;
1046 tree cmp_type = NULL_TREE;
1047 FOR_EACH_MODE_IN_CLASS (cmp_mode_iter, MODE_INT)
1049 unsigned int cmp_bits = GET_MODE_BITSIZE (cmp_mode_iter.require ());
1050 if (cmp_bits >= min_ni_width
1051 && targetm.scalar_mode_supported_p (cmp_mode_iter.require ()))
1053 tree this_type = build_nonstandard_integer_type (cmp_bits, true);
1054 if (this_type
1055 && can_produce_all_loop_masks_p (loop_vinfo, this_type))
1057 /* Although we could stop as soon as we find a valid mode,
1058 it's often better to continue until we hit Pmode, since the
1059 operands to the WHILE are more likely to be reusable in
1060 address calculations. */
1061 cmp_type = this_type;
1062 if (cmp_bits >= GET_MODE_BITSIZE (Pmode))
1063 break;
1068 if (!cmp_type)
1069 return false;
1071 LOOP_VINFO_MASK_COMPARE_TYPE (loop_vinfo) = cmp_type;
1072 return true;
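/* Worked example with assumed numbers: for a 32-bit IV type the limit starts
   at 2^32; if max_loop_iterations bounds the latch count by 999 and the
   widest rgroup uses 2 scalars per iteration, the limit becomes
   1000 * 2 == 2000, which needs only 11 bits.  Any WHILE_ULT-capable
   comparison type of at least that width would do, though the search above
   keeps widening up to Pmode because wider operands are more reusable in
   address calculations.  */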
1075 /* Calculate the cost of one scalar iteration of the loop. */
1076 static void
1077 vect_compute_single_scalar_iteration_cost (loop_vec_info loop_vinfo)
1079 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1080 basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
1081 int nbbs = loop->num_nodes, factor;
1082 int innerloop_iters, i;
1084 /* Gather costs for statements in the scalar loop. */
1086 /* FORNOW. */
1087 innerloop_iters = 1;
1088 if (loop->inner)
1089 innerloop_iters = 50; /* FIXME */
1091 for (i = 0; i < nbbs; i++)
1093 gimple_stmt_iterator si;
1094 basic_block bb = bbs[i];
1096 if (bb->loop_father == loop->inner)
1097 factor = innerloop_iters;
1098 else
1099 factor = 1;
1101 for (si = gsi_start_bb (bb); !gsi_end_p (si); gsi_next (&si))
1103 gimple *stmt = gsi_stmt (si);
1104 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
1106 if (!is_gimple_assign (stmt) && !is_gimple_call (stmt))
1107 continue;
1109 /* Skip stmts that are not vectorized inside the loop. */
1110 if (stmt_info
1111 && !STMT_VINFO_RELEVANT_P (stmt_info)
1112 && (!STMT_VINFO_LIVE_P (stmt_info)
1113 || !VECTORIZABLE_CYCLE_DEF (STMT_VINFO_DEF_TYPE (stmt_info)))
1114 && !STMT_VINFO_IN_PATTERN_P (stmt_info))
1115 continue;
1117 vect_cost_for_stmt kind;
1118 if (STMT_VINFO_DATA_REF (stmt_info))
1120 if (DR_IS_READ (STMT_VINFO_DATA_REF (stmt_info)))
1121 kind = scalar_load;
1122 else
1123 kind = scalar_store;
1125 else
1126 kind = scalar_stmt;
1128 record_stmt_cost (&LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo),
1129 factor, kind, stmt_info, 0, vect_prologue);
1133 /* Now accumulate cost. */
1134 void *target_cost_data = init_cost (loop);
1135 stmt_info_for_cost *si;
1136 int j;
1137 FOR_EACH_VEC_ELT (LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo),
1138 j, si)
1140 struct _stmt_vec_info *stmt_info
1141 = si->stmt ? vinfo_for_stmt (si->stmt) : NULL;
1142 (void) add_stmt_cost (target_cost_data, si->count,
1143 si->kind, stmt_info, si->misalign,
1144 vect_body);
1146 unsigned dummy, body_cost = 0;
1147 finish_cost (target_cost_data, &dummy, &body_cost, &dummy);
1148 destroy_cost_data (target_cost_data);
1149 LOOP_VINFO_SINGLE_SCALAR_ITERATION_COST (loop_vinfo) = body_cost;
1153 /* Function vect_analyze_loop_form_1.
1155 Verify that certain CFG restrictions hold, including:
1156 - the loop has a pre-header
1157 - the loop has a single entry and exit
1158 - the loop exit condition is simple enough
1159 - the number of iterations can be analyzed, i.e., a countable loop. The
1160 niter could be analyzed under some assumptions. */
1162 bool
1163 vect_analyze_loop_form_1 (struct loop *loop, gcond **loop_cond,
1164 tree *assumptions, tree *number_of_iterationsm1,
1165 tree *number_of_iterations, gcond **inner_loop_cond)
1167 DUMP_VECT_SCOPE ("vect_analyze_loop_form");
1169 /* Different restrictions apply when we are considering an inner-most loop,
1170 vs. an outer (nested) loop.
1171 (FORNOW. May want to relax some of these restrictions in the future). */
1173 if (!loop->inner)
1175 /* Inner-most loop. We currently require that the number of BBs is
1176 exactly 2 (the header and latch). Vectorizable inner-most loops
1177 look like this:
1179 (pre-header)
1181 header <--------+
1182 | | |
1183 | +--> latch --+
1185 (exit-bb) */
1187 if (loop->num_nodes != 2)
1189 if (dump_enabled_p ())
1190 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1191 "not vectorized: control flow in loop.\n");
1192 return false;
1195 if (empty_block_p (loop->header))
1197 if (dump_enabled_p ())
1198 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1199 "not vectorized: empty loop.\n");
1200 return false;
1203 else
1205 struct loop *innerloop = loop->inner;
1206 edge entryedge;
1208 /* Nested loop. We currently require that the loop is doubly-nested,
1209 contains a single inner loop, and the number of BBs is exactly 5.
1210 Vectorizable outer-loops look like this:
1212 (pre-header)
1214 header <---+
1216 inner-loop |
1218 tail ------+
1220 (exit-bb)
1222 The inner-loop has the properties expected of inner-most loops
1223 as described above. */
1225 if ((loop->inner)->inner || (loop->inner)->next)
1227 if (dump_enabled_p ())
1228 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1229 "not vectorized: multiple nested loops.\n");
1230 return false;
1233 if (loop->num_nodes != 5)
1235 if (dump_enabled_p ())
1236 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1237 "not vectorized: control flow in loop.\n");
1238 return false;
1241 entryedge = loop_preheader_edge (innerloop);
1242 if (entryedge->src != loop->header
1243 || !single_exit (innerloop)
1244 || single_exit (innerloop)->dest != EDGE_PRED (loop->latch, 0)->src)
1246 if (dump_enabled_p ())
1247 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1248 "not vectorized: unsupported outerloop form.\n");
1249 return false;
1252 /* Analyze the inner-loop. */
1253 tree inner_niterm1, inner_niter, inner_assumptions;
1254 if (! vect_analyze_loop_form_1 (loop->inner, inner_loop_cond,
1255 &inner_assumptions, &inner_niterm1,
1256 &inner_niter, NULL)
1257 /* Don't support analyzing niter under assumptions for inner
1258 loop. */
1259 || !integer_onep (inner_assumptions))
1261 if (dump_enabled_p ())
1262 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1263 "not vectorized: Bad inner loop.\n");
1264 return false;
1267 if (!expr_invariant_in_loop_p (loop, inner_niter))
1269 if (dump_enabled_p ())
1270 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1271 "not vectorized: inner-loop count not"
1272 " invariant.\n");
1273 return false;
1276 if (dump_enabled_p ())
1277 dump_printf_loc (MSG_NOTE, vect_location,
1278 "Considering outer-loop vectorization.\n");
1281 if (!single_exit (loop)
1282 || EDGE_COUNT (loop->header->preds) != 2)
1284 if (dump_enabled_p ())
1286 if (!single_exit (loop))
1287 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1288 "not vectorized: multiple exits.\n");
1289 else if (EDGE_COUNT (loop->header->preds) != 2)
1290 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1291 "not vectorized: too many incoming edges.\n");
1293 return false;
1296 /* We assume that the loop exit condition is at the end of the loop, i.e.,
1297 that the loop is represented as a do-while (with a proper if-guard
1298 before the loop if needed), where the loop header contains all the
1299 executable statements, and the latch is empty. */
1300 if (!empty_block_p (loop->latch)
1301 || !gimple_seq_empty_p (phi_nodes (loop->latch)))
1303 if (dump_enabled_p ())
1304 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1305 "not vectorized: latch block not empty.\n");
1306 return false;
1309 /* Make sure the exit is not abnormal. */
1310 edge e = single_exit (loop);
1311 if (e->flags & EDGE_ABNORMAL)
1313 if (dump_enabled_p ())
1314 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1315 "not vectorized: abnormal loop exit edge.\n");
1316 return false;
1319 *loop_cond = vect_get_loop_niters (loop, assumptions, number_of_iterations,
1320 number_of_iterationsm1);
1321 if (!*loop_cond)
1323 if (dump_enabled_p ())
1324 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1325 "not vectorized: complicated exit condition.\n");
1326 return false;
1329 if (integer_zerop (*assumptions)
1330 || !*number_of_iterations
1331 || chrec_contains_undetermined (*number_of_iterations))
1333 if (dump_enabled_p ())
1334 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1335 "not vectorized: number of iterations cannot be "
1336 "computed.\n");
1337 return false;
1340 if (integer_zerop (*number_of_iterations))
1342 if (dump_enabled_p ())
1343 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1344 "not vectorized: number of iterations = 0.\n");
1345 return false;
1348 return true;
1351 /* Analyze LOOP form and return a loop_vec_info if it is of suitable form. */
1353 loop_vec_info
1354 vect_analyze_loop_form (struct loop *loop, vec_info_shared *shared)
1356 tree assumptions, number_of_iterations, number_of_iterationsm1;
1357 gcond *loop_cond, *inner_loop_cond = NULL;
1359 if (! vect_analyze_loop_form_1 (loop, &loop_cond,
1360 &assumptions, &number_of_iterationsm1,
1361 &number_of_iterations, &inner_loop_cond))
1362 return NULL;
1364 loop_vec_info loop_vinfo = new _loop_vec_info (loop, shared);
1365 LOOP_VINFO_NITERSM1 (loop_vinfo) = number_of_iterationsm1;
1366 LOOP_VINFO_NITERS (loop_vinfo) = number_of_iterations;
1367 LOOP_VINFO_NITERS_UNCHANGED (loop_vinfo) = number_of_iterations;
1368 if (!integer_onep (assumptions))
1370 /* We consider vectorizing this loop by versioning it under
1371 some assumptions. In order to do this, we need to clear
1372 existing information computed by scev and niter analyzer. */
1373 scev_reset_htab ();
1374 free_numbers_of_iterations_estimates (loop);
1375 /* Also set flag for this loop so that following scev and niter
1376 analysis are done under the assumptions. */
1377 loop_constraint_set (loop, LOOP_C_FINITE);
1378 /* Also record the assumptions for versioning. */
1379 LOOP_VINFO_NITERS_ASSUMPTIONS (loop_vinfo) = assumptions;
1382 if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
1384 if (dump_enabled_p ())
1386 dump_printf_loc (MSG_NOTE, vect_location,
1387 "Symbolic number of iterations is ");
1388 dump_generic_expr (MSG_NOTE, TDF_DETAILS, number_of_iterations);
1389 dump_printf (MSG_NOTE, "\n");
1393 STMT_VINFO_TYPE (vinfo_for_stmt (loop_cond)) = loop_exit_ctrl_vec_info_type;
1394 if (inner_loop_cond)
1395 STMT_VINFO_TYPE (vinfo_for_stmt (inner_loop_cond))
1396 = loop_exit_ctrl_vec_info_type;
1398 gcc_assert (!loop->aux);
1399 loop->aux = loop_vinfo;
1400 return loop_vinfo;
1405 /* Scan the loop stmts and, depending on whether there are any (non-)SLP
1406 statements, update the vectorization factor. */
1408 static void
1409 vect_update_vf_for_slp (loop_vec_info loop_vinfo)
1411 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1412 basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
1413 int nbbs = loop->num_nodes;
1414 poly_uint64 vectorization_factor;
1415 int i;
1417 DUMP_VECT_SCOPE ("vect_update_vf_for_slp");
1419 vectorization_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
1420 gcc_assert (known_ne (vectorization_factor, 0U));
1422 /* If all the stmts in the loop can be SLPed, we perform only SLP, and
1423 vectorization factor of the loop is the unrolling factor required by
1424 the SLP instances. If that unrolling factor is 1, we say that we
1425 perform pure SLP on the loop - cross-iteration parallelism is not
1426 exploited. */
1427 bool only_slp_in_loop = true;
1428 for (i = 0; i < nbbs; i++)
1430 basic_block bb = bbs[i];
1431 for (gimple_stmt_iterator si = gsi_start_bb (bb); !gsi_end_p (si);
1432 gsi_next (&si))
1434 gimple *stmt = gsi_stmt (si);
1435 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
1436 if (STMT_VINFO_IN_PATTERN_P (stmt_info)
1437 && STMT_VINFO_RELATED_STMT (stmt_info))
1439 stmt = STMT_VINFO_RELATED_STMT (stmt_info);
1440 stmt_info = vinfo_for_stmt (stmt);
1442 if ((STMT_VINFO_RELEVANT_P (stmt_info)
1443 || VECTORIZABLE_CYCLE_DEF (STMT_VINFO_DEF_TYPE (stmt_info)))
1444 && !PURE_SLP_STMT (stmt_info))
1445 /* STMT needs both SLP and loop-based vectorization. */
1446 only_slp_in_loop = false;
1450 if (only_slp_in_loop)
1452 dump_printf_loc (MSG_NOTE, vect_location,
1453 "Loop contains only SLP stmts\n");
1454 vectorization_factor = LOOP_VINFO_SLP_UNROLLING_FACTOR (loop_vinfo);
1456 else
1458 dump_printf_loc (MSG_NOTE, vect_location,
1459 "Loop contains SLP and non-SLP stmts\n");
1460 /* Both the vectorization factor and unroll factor have the form
1461 current_vector_size * X for some rational X, so they must have
1462 a common multiple. */
1463 vectorization_factor
1464 = force_common_multiple (vectorization_factor,
1465 LOOP_VINFO_SLP_UNROLLING_FACTOR (loop_vinfo));
1468 LOOP_VINFO_VECT_FACTOR (loop_vinfo) = vectorization_factor;
1469 if (dump_enabled_p ())
1471 dump_printf_loc (MSG_NOTE, vect_location,
1472 "Updating vectorization factor to ");
1473 dump_dec (MSG_NOTE, vectorization_factor);
1474 dump_printf (MSG_NOTE, ".\n");
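/* Worked example with assumed factors: if loop-based analysis chose a
   vectorization factor of 2 and the SLP instances require an unrolling
   factor of 8, the common multiple is 8 and the VF is raised to 8; for a
   VF of 4 and an unrolling factor of 2 it stays 4.  */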
1478 /* Return true if STMT_INFO describes a double reduction phi and if
1479 the other phi in the reduction is also relevant for vectorization.
1480 This rejects cases such as:
1482 outer1:
1483 x_1 = PHI <x_3(outer2), ...>;
1486 inner:
1487 x_2 = ...;
1490 outer2:
1491 x_3 = PHI <x_2(inner)>;
1493 if nothing in x_2 or elsewhere makes x_1 relevant. */
1495 static bool
1496 vect_active_double_reduction_p (stmt_vec_info stmt_info)
1498 if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_double_reduction_def)
1499 return false;
1501 gimple *other_phi = STMT_VINFO_REDUC_DEF (stmt_info);
1502 return STMT_VINFO_RELEVANT_P (vinfo_for_stmt (other_phi));
1505 /* Function vect_analyze_loop_operations.
1507 Scan the loop stmts and make sure they are all vectorizable. */
1509 static bool
1510 vect_analyze_loop_operations (loop_vec_info loop_vinfo)
1512 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1513 basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
1514 int nbbs = loop->num_nodes;
1515 int i;
1516 stmt_vec_info stmt_info;
1517 bool need_to_vectorize = false;
1518 bool ok;
1520 DUMP_VECT_SCOPE ("vect_analyze_loop_operations");
1522 stmt_vector_for_cost cost_vec;
1523 cost_vec.create (2);
1525 for (i = 0; i < nbbs; i++)
1527 basic_block bb = bbs[i];
1529 for (gphi_iterator si = gsi_start_phis (bb); !gsi_end_p (si);
1530 gsi_next (&si))
1532 gphi *phi = si.phi ();
1533 ok = true;
1535 stmt_info = vinfo_for_stmt (phi);
1536 if (dump_enabled_p ())
1538 dump_printf_loc (MSG_NOTE, vect_location, "examining phi: ");
1539 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, phi, 0);
1541 if (virtual_operand_p (gimple_phi_result (phi)))
1542 continue;
1544 /* Inner-loop loop-closed exit phi in outer-loop vectorization
1545 (i.e., a phi in the tail of the outer-loop). */
1546 if (! is_loop_header_bb_p (bb))
1548 /* FORNOW: we currently don't support the case that these phis
1549 are not used in the outer loop (unless it is a double reduction,
1550 i.e., this phi is vect_reduction_def), because this case
1551 would require us to actually do something here. */
1552 if (STMT_VINFO_LIVE_P (stmt_info)
1553 && !vect_active_double_reduction_p (stmt_info))
1555 if (dump_enabled_p ())
1556 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1557 "Unsupported loop-closed phi in "
1558 "outer-loop.\n");
1559 return false;
1562 /* If PHI is used in the outer loop, we check that its operand
1563 is defined in the inner loop. */
1564 if (STMT_VINFO_RELEVANT_P (stmt_info))
1566 tree phi_op;
1567 gimple *op_def_stmt;
1569 if (gimple_phi_num_args (phi) != 1)
1570 return false;
1572 phi_op = PHI_ARG_DEF (phi, 0);
1573 if (TREE_CODE (phi_op) != SSA_NAME)
1574 return false;
1576 op_def_stmt = SSA_NAME_DEF_STMT (phi_op);
1577 if (gimple_nop_p (op_def_stmt)
1578 || !flow_bb_inside_loop_p (loop, gimple_bb (op_def_stmt))
1579 || !vinfo_for_stmt (op_def_stmt))
1580 return false;
1582 if (STMT_VINFO_RELEVANT (vinfo_for_stmt (op_def_stmt))
1583 != vect_used_in_outer
1584 && STMT_VINFO_RELEVANT (vinfo_for_stmt (op_def_stmt))
1585 != vect_used_in_outer_by_reduction)
1586 return false;
1589 continue;
1592 gcc_assert (stmt_info);
1594 if ((STMT_VINFO_RELEVANT (stmt_info) == vect_used_in_scope
1595 || STMT_VINFO_LIVE_P (stmt_info))
1596 && STMT_VINFO_DEF_TYPE (stmt_info) != vect_induction_def)
1598 /* A scalar-dependence cycle that we don't support. */
1599 if (dump_enabled_p ())
1600 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1601 "not vectorized: scalar dependence cycle.\n");
1602 return false;
1605 if (STMT_VINFO_RELEVANT_P (stmt_info))
1607 need_to_vectorize = true;
1608 if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_induction_def
1609 && ! PURE_SLP_STMT (stmt_info))
1610 ok = vectorizable_induction (phi, NULL, NULL, NULL, &cost_vec);
1611 else if ((STMT_VINFO_DEF_TYPE (stmt_info) == vect_reduction_def
1612 || STMT_VINFO_DEF_TYPE (stmt_info) == vect_nested_cycle)
1613 && ! PURE_SLP_STMT (stmt_info))
1614 ok = vectorizable_reduction (phi, NULL, NULL, NULL, NULL,
1615 &cost_vec);
1618 /* SLP PHIs are tested by vect_slp_analyze_node_operations. */
1619 if (ok
1620 && STMT_VINFO_LIVE_P (stmt_info)
1621 && !PURE_SLP_STMT (stmt_info))
1622 ok = vectorizable_live_operation (phi, NULL, NULL, -1, NULL,
1623 &cost_vec);
1625 if (!ok)
1627 if (dump_enabled_p ())
1629 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1630 "not vectorized: relevant phi not "
1631 "supported: ");
1632 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, phi, 0);
1634 return false;
1638 for (gimple_stmt_iterator si = gsi_start_bb (bb); !gsi_end_p (si);
1639 gsi_next (&si))
1641 gimple *stmt = gsi_stmt (si);
1642 if (!gimple_clobber_p (stmt)
1643 && !vect_analyze_stmt (stmt, &need_to_vectorize, NULL, NULL,
1644 &cost_vec))
1645 return false;
1647 } /* bbs */
1649 add_stmt_costs (loop_vinfo->target_cost_data, &cost_vec);
1650 cost_vec.release ();
1652 /* All operations in the loop are either irrelevant (deal with loop
1653 control, or dead), or only used outside the loop and can be moved
1654 out of the loop (e.g. invariants, inductions). The loop can be
1655 optimized away by scalar optimizations. We're better off not
1656 touching this loop. */
1657 if (!need_to_vectorize)
1659 if (dump_enabled_p ())
1660 dump_printf_loc (MSG_NOTE, vect_location,
1661 "All the computation can be taken out of the loop.\n");
1662 if (dump_enabled_p ())
1663 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1664 "not vectorized: redundant loop. no profit to "
1665 "vectorize.\n");
1666 return false;
1669 return true;
1672 /* Analyze the cost of the loop described by LOOP_VINFO. Decide if it
1673 is worthwhile to vectorize. Return 1 if definitely yes, 0 if
1674 definitely no, or -1 if it's worth retrying. */
1676 static int
1677 vect_analyze_loop_costing (loop_vec_info loop_vinfo)
1679 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1680 unsigned int assumed_vf = vect_vf_for_cost (loop_vinfo);
1682 /* Only fully-masked loops can have iteration counts less than the
1683 vectorization factor. */
1684 if (!LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
1686 HOST_WIDE_INT max_niter;
1688 if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
1689 max_niter = LOOP_VINFO_INT_NITERS (loop_vinfo);
1690 else
1691 max_niter = max_stmt_executions_int (loop);
1693 if (max_niter != -1
1694 && (unsigned HOST_WIDE_INT) max_niter < assumed_vf)
1696 if (dump_enabled_p ())
1697 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1698 "not vectorized: iteration count smaller than "
1699 "vectorization factor.\n");
1700 return 0;
1704 int min_profitable_iters, min_profitable_estimate;
1705 vect_estimate_min_profitable_iters (loop_vinfo, &min_profitable_iters,
1706 &min_profitable_estimate);
1708 if (min_profitable_iters < 0)
1710 if (dump_enabled_p ())
1711 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1712 "not vectorized: vectorization not profitable.\n");
1713 if (dump_enabled_p ())
1714 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1715 "not vectorized: vector version will never be "
1716 "profitable.\n");
1717 return -1;
1720 int min_scalar_loop_bound = (PARAM_VALUE (PARAM_MIN_VECT_LOOP_BOUND)
1721 * assumed_vf);
1723 /* Use the cost model only if it is more conservative than the user-specified
1724 threshold. */
1725 unsigned int th = (unsigned) MAX (min_scalar_loop_bound,
1726 min_profitable_iters);
1728 LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo) = th;
1730 if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
1731 && LOOP_VINFO_INT_NITERS (loop_vinfo) < th)
1733 if (dump_enabled_p ())
1734 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1735 "not vectorized: vectorization not profitable.\n");
1736 if (dump_enabled_p ())
1737 dump_printf_loc (MSG_NOTE, vect_location,
1738 "not vectorized: iteration count smaller than user "
1739 "specified loop bound parameter or minimum profitable "
1740 "iterations (whichever is more conservative).\n");
1741 return 0;
1744 HOST_WIDE_INT estimated_niter = estimated_stmt_executions_int (loop);
1745 if (estimated_niter == -1)
1746 estimated_niter = likely_max_stmt_executions_int (loop);
1747 if (estimated_niter != -1
1748 && ((unsigned HOST_WIDE_INT) estimated_niter
1749 < MAX (th, (unsigned) min_profitable_estimate)))
1751 if (dump_enabled_p ())
1752 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1753 "not vectorized: estimated iteration count too "
1754 "small.\n");
1755 if (dump_enabled_p ())
1756 dump_printf_loc (MSG_NOTE, vect_location,
1757 "not vectorized: estimated iteration count smaller "
1758 "than specified loop bound parameter or minimum "
1759 "profitable iterations (whichever is more "
1760 "conservative).\n");
1761 return -1;
1764 return 1;
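/* Worked example with assumed values: with assumed_vf == 4,
   --param min-vect-loop-bound == 0 and min_profitable_iters == 7, the
   threshold above becomes MAX (0 * 4, 7) == 7, so a loop known to run 5
   iterations is rejected, while one estimated at 20 iterations passes
   (subject also to min_profitable_estimate).  */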
1767 static bool
1768 vect_get_datarefs_in_loop (loop_p loop, basic_block *bbs,
1769 vec<data_reference_p> *datarefs,
1770 unsigned int *n_stmts)
1772 *n_stmts = 0;
1773 for (unsigned i = 0; i < loop->num_nodes; i++)
1774 for (gimple_stmt_iterator gsi = gsi_start_bb (bbs[i]);
1775 !gsi_end_p (gsi); gsi_next (&gsi))
1777 gimple *stmt = gsi_stmt (gsi);
1778 if (is_gimple_debug (stmt))
1779 continue;
1780 ++(*n_stmts);
1781 if (!vect_find_stmt_data_reference (loop, stmt, datarefs))
1783 if (is_gimple_call (stmt) && loop->safelen)
1785 tree fndecl = gimple_call_fndecl (stmt), op;
1786 if (fndecl != NULL_TREE)
1788 cgraph_node *node = cgraph_node::get (fndecl);
1789 if (node != NULL && node->simd_clones != NULL)
1791 unsigned int j, n = gimple_call_num_args (stmt);
1792 for (j = 0; j < n; j++)
1794 op = gimple_call_arg (stmt, j);
1795 if (DECL_P (op)
1796 || (REFERENCE_CLASS_P (op)
1797 && get_base_address (op)))
1798 break;
1800 op = gimple_call_lhs (stmt);
1801 /* Ignore #pragma omp declare simd functions
1802 if they don't have data references in the
1803 call stmt itself. */
1804 if (j == n
1805 && !(op
1806 && (DECL_P (op)
1807 || (REFERENCE_CLASS_P (op)
1808 && get_base_address (op)))))
1809 continue;
1813 return false;
1815 /* If dependence analysis will give up due to the limit on the
1816 number of datarefs, stop here and fail fatally. */
1817 if (datarefs->length ()
1818 > (unsigned)PARAM_VALUE (PARAM_LOOP_MAX_DATAREFS_FOR_DATADEPS))
1819 return false;
1821 return true;
1824 /* Function vect_analyze_loop_2.
1826 Apply a set of analyses on LOOP, and create a loop_vec_info struct
1827 for it. The different analyses will record information in the
1828 loop_vec_info struct. */
1829 static bool
1830 vect_analyze_loop_2 (loop_vec_info loop_vinfo, bool &fatal, unsigned *n_stmts)
1832 bool ok;
1833 int res;
1834 unsigned int max_vf = MAX_VECTORIZATION_FACTOR;
1835 poly_uint64 min_vf = 2;
1837 /* The first group of checks is independent of the vector size. */
1838 fatal = true;
1840 /* Find all data references in the loop (which correspond to vdefs/vuses)
1841 and analyze their evolution in the loop. */
1843 loop_p loop = LOOP_VINFO_LOOP (loop_vinfo);
1845 /* Gather the data references and count stmts in the loop. */
1846 if (!LOOP_VINFO_DATAREFS (loop_vinfo).exists ())
1848 if (!vect_get_datarefs_in_loop (loop, LOOP_VINFO_BBS (loop_vinfo),
1849 &LOOP_VINFO_DATAREFS (loop_vinfo),
1850 n_stmts))
1852 if (dump_enabled_p ())
1853 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1854 "not vectorized: loop contains function "
1855 "calls or data references that cannot "
1856 "be analyzed\n");
1857 return false;
1859 loop_vinfo->shared->save_datarefs ();
1861 else
1862 loop_vinfo->shared->check_datarefs ();
1864 /* Analyze the data references and also adjust the minimal
1865 vectorization factor according to the loads and stores. */
1867 ok = vect_analyze_data_refs (loop_vinfo, &min_vf);
1868 if (!ok)
1870 if (dump_enabled_p ())
1871 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1872 "bad data references.\n");
1873 return false;
1876 /* Classify all cross-iteration scalar data-flow cycles.
1877 Cross-iteration cycles caused by virtual phis are analyzed separately. */
1878 vect_analyze_scalar_cycles (loop_vinfo);
1880 vect_pattern_recog (loop_vinfo);
1882 vect_fixup_scalar_cycles_with_patterns (loop_vinfo);
1884 /* Analyze the access patterns of the data-refs in the loop (consecutive,
1885 complex, etc.). FORNOW: Only handle consecutive access pattern. */
1887 ok = vect_analyze_data_ref_accesses (loop_vinfo);
1888 if (!ok)
1890 if (dump_enabled_p ())
1891 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1892 "bad data access.\n");
1893 return false;
1896 /* Data-flow analysis to detect stmts that do not need to be vectorized. */
1898 ok = vect_mark_stmts_to_be_vectorized (loop_vinfo);
1899 if (!ok)
1901 if (dump_enabled_p ())
1902 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1903 "unexpected pattern.\n");
1904 return false;
1907   /* The rest of the analysis below depends on the vector size in some
       way, so from here on failures are not fatal.  */
1908 fatal = false;
1910 /* Analyze data dependences between the data-refs in the loop
1911 and adjust the maximum vectorization factor according to
1912 the dependences.
1913 FORNOW: fail at the first data dependence that we encounter. */
1915 ok = vect_analyze_data_ref_dependences (loop_vinfo, &max_vf);
1916 if (!ok
1917 || (max_vf != MAX_VECTORIZATION_FACTOR
1918 && maybe_lt (max_vf, min_vf)))
1920 if (dump_enabled_p ())
1921 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1922 "bad data dependence.\n");
1923 return false;
1925 LOOP_VINFO_MAX_VECT_FACTOR (loop_vinfo) = max_vf;
1927 ok = vect_determine_vectorization_factor (loop_vinfo);
1928 if (!ok)
1930 if (dump_enabled_p ())
1931 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1932 "can't determine vectorization factor.\n");
1933 return false;
1935 if (max_vf != MAX_VECTORIZATION_FACTOR
1936 && maybe_lt (max_vf, LOOP_VINFO_VECT_FACTOR (loop_vinfo)))
1938 if (dump_enabled_p ())
1939 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1940 "bad data dependence.\n");
1941 return false;
1944 /* Compute the scalar iteration cost. */
1945 vect_compute_single_scalar_iteration_cost (loop_vinfo);
1947 poly_uint64 saved_vectorization_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
1948 unsigned th;
1950 /* Check the SLP opportunities in the loop, analyze and build SLP trees. */
1951 ok = vect_analyze_slp (loop_vinfo, *n_stmts);
1952 if (!ok)
1953 return false;
1955 /* If there are any SLP instances mark them as pure_slp. */
1956 bool slp = vect_make_slp_decision (loop_vinfo);
1957 if (slp)
1959 /* Find stmts that need to be both vectorized and SLPed. */
1960 vect_detect_hybrid_slp (loop_vinfo);
1962 /* Update the vectorization factor based on the SLP decision. */
1963 vect_update_vf_for_slp (loop_vinfo);
1966 bool saved_can_fully_mask_p = LOOP_VINFO_CAN_FULLY_MASK_P (loop_vinfo);
1968 /* We don't expect to have to roll back to anything other than an empty
1969 set of rgroups. */
1970 gcc_assert (LOOP_VINFO_MASKS (loop_vinfo).is_empty ());
1972 /* This is the point where we can re-start analysis with SLP forced off. */
1973 start_over:
1975 /* Now the vectorization factor is final. */
1976 poly_uint64 vectorization_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
1977 gcc_assert (known_ne (vectorization_factor, 0U));
1979 if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo) && dump_enabled_p ())
1981 dump_printf_loc (MSG_NOTE, vect_location,
1982 "vectorization_factor = ");
1983 dump_dec (MSG_NOTE, vectorization_factor);
1984 dump_printf (MSG_NOTE, ", niters = " HOST_WIDE_INT_PRINT_DEC "\n",
1985 LOOP_VINFO_INT_NITERS (loop_vinfo));
1988 HOST_WIDE_INT max_niter
1989 = likely_max_stmt_executions_int (LOOP_VINFO_LOOP (loop_vinfo));
1991 /* Analyze the alignment of the data-refs in the loop.
1992 Fail if a data reference is found that cannot be vectorized. */
1994 ok = vect_analyze_data_refs_alignment (loop_vinfo);
1995 if (!ok)
1997 if (dump_enabled_p ())
1998 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1999 "bad data alignment.\n");
2000 return false;
2003 /* Prune the list of ddrs to be tested at run-time by versioning for alias.
2004 It is important to call pruning after vect_analyze_data_ref_accesses,
2005 since we use grouping information gathered by interleaving analysis. */
2006 ok = vect_prune_runtime_alias_test_list (loop_vinfo);
2007 if (!ok)
2008 return false;
2010   /* Do not invoke vect_enhance_data_refs_alignment for epilogue
2011 vectorization. */
2012 if (!LOOP_VINFO_EPILOGUE_P (loop_vinfo))
2014 /* This pass will decide on using loop versioning and/or loop peeling in
2015 order to enhance the alignment of data references in the loop. */
2016 ok = vect_enhance_data_refs_alignment (loop_vinfo);
2017 if (!ok)
2019 if (dump_enabled_p ())
2020 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2021 "bad data alignment.\n");
2022 return false;
2026 if (slp)
2028 /* Analyze operations in the SLP instances. Note this may
2029 remove unsupported SLP instances which makes the above
2030 SLP kind detection invalid. */
2031 unsigned old_size = LOOP_VINFO_SLP_INSTANCES (loop_vinfo).length ();
2032 vect_slp_analyze_operations (loop_vinfo);
2033 if (LOOP_VINFO_SLP_INSTANCES (loop_vinfo).length () != old_size)
2034 goto again;
2037 /* Scan all the remaining operations in the loop that are not subject
2038 to SLP and make sure they are vectorizable. */
2039 ok = vect_analyze_loop_operations (loop_vinfo);
2040 if (!ok)
2042 if (dump_enabled_p ())
2043 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2044 "bad operation or unsupported loop bound.\n");
2045 return false;
2048 /* Decide whether to use a fully-masked loop for this vectorization
2049 factor. */
2050 LOOP_VINFO_FULLY_MASKED_P (loop_vinfo)
2051 = (LOOP_VINFO_CAN_FULLY_MASK_P (loop_vinfo)
2052 && vect_verify_full_masking (loop_vinfo));
2053 if (dump_enabled_p ())
2055 if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
2056 dump_printf_loc (MSG_NOTE, vect_location,
2057 "using a fully-masked loop.\n");
2058 else
2059 dump_printf_loc (MSG_NOTE, vect_location,
2060 "not using a fully-masked loop.\n");
2063 /* If epilog loop is required because of data accesses with gaps,
2064      one additional iteration needs to be peeled.  Check if there are
2065 enough iterations for vectorization. */
2066 if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo)
2067 && LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
2068 && !LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
2070 poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
2071 tree scalar_niters = LOOP_VINFO_NITERSM1 (loop_vinfo);
2073 if (known_lt (wi::to_widest (scalar_niters), vf))
2075 if (dump_enabled_p ())
2076 dump_printf_loc (MSG_NOTE, vect_location,
2077 			 "loop does not have enough iterations to support"
2078 " peeling for gaps.\n");
2079 return false;
2083   /* Check that the costings of the loop make vectorizing worthwhile.  */
2084 res = vect_analyze_loop_costing (loop_vinfo);
2085 if (res < 0)
2086 goto again;
2087 if (!res)
2089 if (dump_enabled_p ())
2090 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2091 "Loop costings not worthwhile.\n");
2092 return false;
2095 /* Decide whether we need to create an epilogue loop to handle
2096 remaining scalar iterations. */
2097 th = LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo);
2099 unsigned HOST_WIDE_INT const_vf;
2100 if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
2101 /* The main loop handles all iterations. */
2102 LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo) = false;
2103 else if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
2104 && LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) > 0)
2106 if (!multiple_p (LOOP_VINFO_INT_NITERS (loop_vinfo)
2107 - LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo),
2108 LOOP_VINFO_VECT_FACTOR (loop_vinfo)))
2109 LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo) = true;
2111 else if (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo)
2112 || !LOOP_VINFO_VECT_FACTOR (loop_vinfo).is_constant (&const_vf)
2113 || ((tree_ctz (LOOP_VINFO_NITERS (loop_vinfo))
2114 < (unsigned) exact_log2 (const_vf))
2115 /* In case of versioning, check if the maximum number of
2116 iterations is greater than th. If they are identical,
2117 the epilogue is unnecessary. */
2118 && (!LOOP_REQUIRES_VERSIONING (loop_vinfo)
2119 || ((unsigned HOST_WIDE_INT) max_niter
2120 > (th / const_vf) * const_vf))))
2121 LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo) = true;
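  /* Illustration (made-up values): with const_vf = 8, no peeling for
     alignment and NITERS a compile-time constant such as 64,
     tree_ctz (64) = 6 >= exact_log2 (8) = 3, so the condition above is
     false and no epilogue is needed for leftover iterations; an unknown
     NITERS usually has no known trailing zero bits and forces
     PEELING_FOR_NITER.  */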
2123   /* If an epilogue loop is required, make sure we can create one.  */
2124 if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo)
2125 || LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo))
2127 if (dump_enabled_p ())
2128 dump_printf_loc (MSG_NOTE, vect_location, "epilog loop required\n");
2129 if (!vect_can_advance_ivs_p (loop_vinfo)
2130 || !slpeel_can_duplicate_loop_p (LOOP_VINFO_LOOP (loop_vinfo),
2131 single_exit (LOOP_VINFO_LOOP
2132 (loop_vinfo))))
2134 if (dump_enabled_p ())
2135 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2136 "not vectorized: can't create required "
2137 "epilog loop\n");
2138 goto again;
2142   /* During peeling, we need to check if the number of loop iterations
2143      is enough for both the peeled prolog loop and the vector loop.
2144      This check can be merged with the threshold check of loop
2145      versioning, so increase the threshold for this case if necessary.  */
2146 if (LOOP_REQUIRES_VERSIONING (loop_vinfo))
2148 poly_uint64 niters_th = 0;
2150 if (!vect_use_loop_mask_for_alignment_p (loop_vinfo))
2152 /* Niters for peeled prolog loop. */
2153 if (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) < 0)
2155 struct data_reference *dr = LOOP_VINFO_UNALIGNED_DR (loop_vinfo);
2156 tree vectype
2157 = STMT_VINFO_VECTYPE (vinfo_for_stmt (vect_dr_stmt (dr)));
2158 niters_th += TYPE_VECTOR_SUBPARTS (vectype) - 1;
2160 else
2161 niters_th += LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo);
2164 /* Niters for at least one iteration of vectorized loop. */
2165 if (!LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
2166 niters_th += LOOP_VINFO_VECT_FACTOR (loop_vinfo);
2167 /* One additional iteration because of peeling for gap. */
2168 if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo))
2169 niters_th += 1;
2170 LOOP_VINFO_VERSIONING_THRESHOLD (loop_vinfo) = niters_th;
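  /* Example with made-up values: a known alignment peel of 3 iterations,
     VF = 8 (not fully masked) and peeling for gaps give
     niters_th = 3 + 8 + 1 = 12, so the runtime versioning check lets the
     vector loop run only for trip counts of roughly that many iterations
     or more.  */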
2173 gcc_assert (known_eq (vectorization_factor,
2174 LOOP_VINFO_VECT_FACTOR (loop_vinfo)));
2176 /* Ok to vectorize! */
2177 return true;
2179 again:
2180   /* Try again with SLP forced off, but if we didn't do any SLP there is
2181 no point in re-trying. */
2182 if (!slp)
2183 return false;
2185 /* If there are reduction chains re-trying will fail anyway. */
2186 if (! LOOP_VINFO_REDUCTION_CHAINS (loop_vinfo).is_empty ())
2187 return false;
2189 /* Likewise if the grouped loads or stores in the SLP cannot be handled
2190 via interleaving or lane instructions. */
2191 slp_instance instance;
2192 slp_tree node;
2193 unsigned i, j;
2194 FOR_EACH_VEC_ELT (LOOP_VINFO_SLP_INSTANCES (loop_vinfo), i, instance)
2196 stmt_vec_info vinfo;
2197 vinfo = vinfo_for_stmt
2198 (SLP_TREE_SCALAR_STMTS (SLP_INSTANCE_TREE (instance))[0]);
2199 if (! STMT_VINFO_GROUPED_ACCESS (vinfo))
2200 continue;
2201 vinfo = vinfo_for_stmt (DR_GROUP_FIRST_ELEMENT (vinfo));
2202 unsigned int size = DR_GROUP_SIZE (vinfo);
2203 tree vectype = STMT_VINFO_VECTYPE (vinfo);
2204 if (! vect_store_lanes_supported (vectype, size, false)
2205 && ! known_eq (TYPE_VECTOR_SUBPARTS (vectype), 1U)
2206 && ! vect_grouped_store_supported (vectype, size))
2207 return false;
2208 FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (instance), j, node)
2210 vinfo = vinfo_for_stmt (SLP_TREE_SCALAR_STMTS (node)[0]);
2211 vinfo = vinfo_for_stmt (DR_GROUP_FIRST_ELEMENT (vinfo));
2212 bool single_element_p = !DR_GROUP_NEXT_ELEMENT (vinfo);
2213 size = DR_GROUP_SIZE (vinfo);
2214 vectype = STMT_VINFO_VECTYPE (vinfo);
2215 if (! vect_load_lanes_supported (vectype, size, false)
2216 && ! vect_grouped_load_supported (vectype, single_element_p,
2217 size))
2218 return false;
2222 if (dump_enabled_p ())
2223 dump_printf_loc (MSG_NOTE, vect_location,
2224 "re-trying with SLP disabled\n");
2226 /* Roll back state appropriately. No SLP this time. */
2227 slp = false;
2228   /* Restore the vectorization factor to what it was without SLP.  */
2229 LOOP_VINFO_VECT_FACTOR (loop_vinfo) = saved_vectorization_factor;
2230 /* Free the SLP instances. */
2231 FOR_EACH_VEC_ELT (LOOP_VINFO_SLP_INSTANCES (loop_vinfo), j, instance)
2232 vect_free_slp_instance (instance, false);
2233 LOOP_VINFO_SLP_INSTANCES (loop_vinfo).release ();
2234 /* Reset SLP type to loop_vect on all stmts. */
2235 for (i = 0; i < LOOP_VINFO_LOOP (loop_vinfo)->num_nodes; ++i)
2237 basic_block bb = LOOP_VINFO_BBS (loop_vinfo)[i];
2238 for (gimple_stmt_iterator si = gsi_start_phis (bb);
2239 !gsi_end_p (si); gsi_next (&si))
2241 stmt_vec_info stmt_info = vinfo_for_stmt (gsi_stmt (si));
2242 STMT_SLP_TYPE (stmt_info) = loop_vect;
2244 for (gimple_stmt_iterator si = gsi_start_bb (bb);
2245 !gsi_end_p (si); gsi_next (&si))
2247 stmt_vec_info stmt_info = vinfo_for_stmt (gsi_stmt (si));
2248 STMT_SLP_TYPE (stmt_info) = loop_vect;
2249 if (STMT_VINFO_IN_PATTERN_P (stmt_info))
2251 gimple *pattern_def_seq = STMT_VINFO_PATTERN_DEF_SEQ (stmt_info);
2252 stmt_info = vinfo_for_stmt (STMT_VINFO_RELATED_STMT (stmt_info));
2253 STMT_SLP_TYPE (stmt_info) = loop_vect;
2254 for (gimple_stmt_iterator pi = gsi_start (pattern_def_seq);
2255 !gsi_end_p (pi); gsi_next (&pi))
2257 gimple *pstmt = gsi_stmt (pi);
2258 STMT_SLP_TYPE (vinfo_for_stmt (pstmt)) = loop_vect;
2263 /* Free optimized alias test DDRS. */
2264 LOOP_VINFO_LOWER_BOUNDS (loop_vinfo).truncate (0);
2265 LOOP_VINFO_COMP_ALIAS_DDRS (loop_vinfo).release ();
2266 LOOP_VINFO_CHECK_UNEQUAL_ADDRS (loop_vinfo).release ();
2267 /* Reset target cost data. */
2268 destroy_cost_data (LOOP_VINFO_TARGET_COST_DATA (loop_vinfo));
2269 LOOP_VINFO_TARGET_COST_DATA (loop_vinfo)
2270 = init_cost (LOOP_VINFO_LOOP (loop_vinfo));
2271 /* Reset accumulated rgroup information. */
2272 release_vec_loop_masks (&LOOP_VINFO_MASKS (loop_vinfo));
2273 /* Reset assorted flags. */
2274 LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo) = false;
2275 LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo) = false;
2276 LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo) = 0;
2277 LOOP_VINFO_VERSIONING_THRESHOLD (loop_vinfo) = 0;
2278 LOOP_VINFO_CAN_FULLY_MASK_P (loop_vinfo) = saved_can_fully_mask_p;
2280 goto start_over;
2283 /* Function vect_analyze_loop.
2285 Apply a set of analyses on LOOP, and create a loop_vec_info struct
2286 for it. The different analyses will record information in the
2287    loop_vec_info struct.  If ORIG_LOOP_VINFO is not NULL, the epilogue must
2288 be vectorized. */
2289 loop_vec_info
2290 vect_analyze_loop (struct loop *loop, loop_vec_info orig_loop_vinfo,
2291 vec_info_shared *shared)
2293 loop_vec_info loop_vinfo;
2294 auto_vector_sizes vector_sizes;
2296 /* Autodetect first vector size we try. */
2297 current_vector_size = 0;
2298 targetm.vectorize.autovectorize_vector_sizes (&vector_sizes);
2299 unsigned int next_size = 0;
2301 DUMP_VECT_SCOPE ("analyze_loop_nest");
2303 if (loop_outer (loop)
2304 && loop_vec_info_for_loop (loop_outer (loop))
2305 && LOOP_VINFO_VECTORIZABLE_P (loop_vec_info_for_loop (loop_outer (loop))))
2307 if (dump_enabled_p ())
2308 dump_printf_loc (MSG_NOTE, vect_location,
2309 "outer-loop already vectorized.\n");
2310 return NULL;
2313 if (!find_loop_nest (loop, &shared->loop_nest))
2315 if (dump_enabled_p ())
2316 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2317 "not vectorized: loop nest containing two "
2318 "or more consecutive inner loops cannot be "
2319 "vectorized\n");
2320 return NULL;
2323 unsigned n_stmts = 0;
2324 poly_uint64 autodetected_vector_size = 0;
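  /* The loop below retries the analysis once per vector size the target
     advertises.  Purely as an illustration: a target whose hook returns
     the sizes 64, 32 and 16 bytes leads to at most three attempts; the
     autodetected size is tried first and skipped if it reappears in the
     list, and a fatal failure (or running out of sizes) stops the
     retries.  */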
2325 while (1)
2327 /* Check the CFG characteristics of the loop (nesting, entry/exit). */
2328 loop_vinfo = vect_analyze_loop_form (loop, shared);
2329 if (!loop_vinfo)
2331 if (dump_enabled_p ())
2332 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2333 "bad loop form.\n");
2334 return NULL;
2337 bool fatal = false;
2339 if (orig_loop_vinfo)
2340 LOOP_VINFO_ORIG_LOOP_INFO (loop_vinfo) = orig_loop_vinfo;
2342 if (vect_analyze_loop_2 (loop_vinfo, fatal, &n_stmts))
2344 LOOP_VINFO_VECTORIZABLE_P (loop_vinfo) = 1;
2346 return loop_vinfo;
2349 delete loop_vinfo;
2351 if (next_size == 0)
2352 autodetected_vector_size = current_vector_size;
2354 if (next_size < vector_sizes.length ()
2355 && known_eq (vector_sizes[next_size], autodetected_vector_size))
2356 next_size += 1;
2358 if (fatal
2359 || next_size == vector_sizes.length ()
2360 || known_eq (current_vector_size, 0U))
2361 return NULL;
2363 /* Try the next biggest vector size. */
2364 current_vector_size = vector_sizes[next_size++];
2365 if (dump_enabled_p ())
2367 dump_printf_loc (MSG_NOTE, vect_location,
2368 "***** Re-trying analysis with "
2369 "vector size ");
2370 dump_dec (MSG_NOTE, current_vector_size);
2371 dump_printf (MSG_NOTE, "\n");
2376 /* Return true if there is an in-order reduction function for CODE, storing
2377 it in *REDUC_FN if so. */
2379 static bool
2380 fold_left_reduction_fn (tree_code code, internal_fn *reduc_fn)
2382 switch (code)
2384 case PLUS_EXPR:
2385 *reduc_fn = IFN_FOLD_LEFT_PLUS;
2386 return true;
2388 default:
2389 return false;
2393 /* Function reduction_fn_for_scalar_code
2395 Input:
2396    CODE - tree_code of a reduction operation.
2398 Output:
2399 REDUC_FN - the corresponding internal function to be used to reduce the
2400 vector of partial results into a single scalar result, or IFN_LAST
2401 if the operation is a supported reduction operation, but does not have
2402 such an internal function.
2404 Return FALSE if CODE currently cannot be vectorized as reduction. */
2406 static bool
2407 reduction_fn_for_scalar_code (enum tree_code code, internal_fn *reduc_fn)
2409 switch (code)
2411 case MAX_EXPR:
2412 *reduc_fn = IFN_REDUC_MAX;
2413 return true;
2415 case MIN_EXPR:
2416 *reduc_fn = IFN_REDUC_MIN;
2417 return true;
2419 case PLUS_EXPR:
2420 *reduc_fn = IFN_REDUC_PLUS;
2421 return true;
2423 case BIT_AND_EXPR:
2424 *reduc_fn = IFN_REDUC_AND;
2425 return true;
2427 case BIT_IOR_EXPR:
2428 *reduc_fn = IFN_REDUC_IOR;
2429 return true;
2431 case BIT_XOR_EXPR:
2432 *reduc_fn = IFN_REDUC_XOR;
2433 return true;
2435 case MULT_EXPR:
2436 case MINUS_EXPR:
2437 *reduc_fn = IFN_LAST;
2438 return true;
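      /* For example, "sum += a[i]" maps PLUS_EXPR above to
	 IFN_REDUC_PLUS, which reduces the vector of partial sums to a
	 single scalar in the epilogue; "prod *= a[i]" is still accepted
	 here, but with IFN_LAST the caller must reduce the final vector
	 by other means.  */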
2440 default:
2441 return false;
2445 /* If there is a neutral value X such that SLP reduction NODE would not
2446 be affected by the introduction of additional X elements, return that X,
2447 otherwise return null. CODE is the code of the reduction. REDUC_CHAIN
2448 is true if the SLP statements perform a single reduction, false if each
2449 statement performs an independent reduction. */
2451 static tree
2452 neutral_op_for_slp_reduction (slp_tree slp_node, tree_code code,
2453 bool reduc_chain)
2455 vec<gimple *> stmts = SLP_TREE_SCALAR_STMTS (slp_node);
2456 gimple *stmt = stmts[0];
2457 stmt_vec_info stmt_vinfo = vinfo_for_stmt (stmt);
2458 tree vector_type = STMT_VINFO_VECTYPE (stmt_vinfo);
2459 tree scalar_type = TREE_TYPE (vector_type);
2460 struct loop *loop = gimple_bb (stmt)->loop_father;
2461 gcc_assert (loop);
2463 switch (code)
2465 case WIDEN_SUM_EXPR:
2466 case DOT_PROD_EXPR:
2467 case SAD_EXPR:
2468 case PLUS_EXPR:
2469 case MINUS_EXPR:
2470 case BIT_IOR_EXPR:
2471 case BIT_XOR_EXPR:
2472 return build_zero_cst (scalar_type);
2474 case MULT_EXPR:
2475 return build_one_cst (scalar_type);
2477 case BIT_AND_EXPR:
2478 return build_all_ones_cst (scalar_type);
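      /* E.g. padding a PLUS_EXPR SLP reduction with extra zeros, a
	 MULT_EXPR reduction with extra ones, or a BIT_AND_EXPR reduction
	 with extra all-ones elements leaves the final result unchanged,
	 which is what makes these values neutral here.  */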
2480 case MAX_EXPR:
2481 case MIN_EXPR:
2482 /* For MIN/MAX the initial values are neutral. A reduction chain
2483 has only a single initial value, so that value is neutral for
2484 all statements. */
2485 if (reduc_chain)
2486 return PHI_ARG_DEF_FROM_EDGE (stmt, loop_preheader_edge (loop));
2487 return NULL_TREE;
2489 default:
2490 return NULL_TREE;
2494 /* Error reporting helper for vect_is_simple_reduction below. GIMPLE statement
2495 STMT is printed with a message MSG. */
2497 static void
2498 report_vect_op (dump_flags_t msg_type, gimple *stmt, const char *msg)
2500 dump_printf_loc (msg_type, vect_location, "%s", msg);
2501 dump_gimple_stmt (msg_type, TDF_SLIM, stmt, 0);
2505 /* Detect SLP reduction of the form:
2507 #a1 = phi <a5, a0>
2508 a2 = operation (a1)
2509 a3 = operation (a2)
2510 a4 = operation (a3)
2511 a5 = operation (a4)
2513 #a = phi <a5>
2515 PHI is the reduction phi node (#a1 = phi <a5, a0> above)
2516 FIRST_STMT is the first reduction stmt in the chain
2517 (a2 = operation (a1)).
2519 Return TRUE if a reduction chain was detected. */
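/* As a source-level sketch (illustrative only), a manually unrolled
   accumulation such as

       sum = sum + a[4*i];
       sum = sum + a[4*i+1];
       sum = sum + a[4*i+2];
       sum = sum + a[4*i+3];

   forms such a chain: the loop-header PHI of sum is the reduction PHI
   and the additions are linked through REDUC_GROUP_NEXT_ELEMENT.  */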
2521 static bool
2522 vect_is_slp_reduction (loop_vec_info loop_info, gimple *phi,
2523 gimple *first_stmt)
2525 struct loop *loop = (gimple_bb (phi))->loop_father;
2526 struct loop *vect_loop = LOOP_VINFO_LOOP (loop_info);
2527 enum tree_code code;
2528 gimple *current_stmt = NULL, *loop_use_stmt = NULL, *first, *next_stmt;
2529 stmt_vec_info use_stmt_info, current_stmt_info;
2530 tree lhs;
2531 imm_use_iterator imm_iter;
2532 use_operand_p use_p;
2533 int nloop_uses, size = 0, n_out_of_loop_uses;
2534 bool found = false;
2536 if (loop != vect_loop)
2537 return false;
2539 lhs = PHI_RESULT (phi);
2540 code = gimple_assign_rhs_code (first_stmt);
2541 while (1)
2543 nloop_uses = 0;
2544 n_out_of_loop_uses = 0;
2545 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, lhs)
2547 gimple *use_stmt = USE_STMT (use_p);
2548 if (is_gimple_debug (use_stmt))
2549 continue;
2551 /* Check if we got back to the reduction phi. */
2552 if (use_stmt == phi)
2554 loop_use_stmt = use_stmt;
2555 found = true;
2556 break;
2559 if (flow_bb_inside_loop_p (loop, gimple_bb (use_stmt)))
2561 loop_use_stmt = use_stmt;
2562 nloop_uses++;
2564 else
2565 n_out_of_loop_uses++;
2567 	  /* There can be either a single use in the loop or two uses in
2568 phi nodes. */
2569 if (nloop_uses > 1 || (n_out_of_loop_uses && nloop_uses))
2570 return false;
2573 if (found)
2574 break;
2576 /* We reached a statement with no loop uses. */
2577 if (nloop_uses == 0)
2578 return false;
2580 /* This is a loop exit phi, and we haven't reached the reduction phi. */
2581 if (gimple_code (loop_use_stmt) == GIMPLE_PHI)
2582 return false;
2584 if (!is_gimple_assign (loop_use_stmt)
2585 || code != gimple_assign_rhs_code (loop_use_stmt)
2586 || !flow_bb_inside_loop_p (loop, gimple_bb (loop_use_stmt)))
2587 return false;
2589 /* Insert USE_STMT into reduction chain. */
2590 use_stmt_info = vinfo_for_stmt (loop_use_stmt);
2591 if (current_stmt)
2593 current_stmt_info = vinfo_for_stmt (current_stmt);
2594 REDUC_GROUP_NEXT_ELEMENT (current_stmt_info) = loop_use_stmt;
2595 REDUC_GROUP_FIRST_ELEMENT (use_stmt_info)
2596 = REDUC_GROUP_FIRST_ELEMENT (current_stmt_info);
2598 else
2599 REDUC_GROUP_FIRST_ELEMENT (use_stmt_info) = loop_use_stmt;
2601 lhs = gimple_assign_lhs (loop_use_stmt);
2602 current_stmt = loop_use_stmt;
2603 size++;
2606 if (!found || loop_use_stmt != phi || size < 2)
2607 return false;
2609   /* Swap the operands, if needed, to make the reduction operand the second
2610 operand. */
2611 lhs = PHI_RESULT (phi);
2612 next_stmt = REDUC_GROUP_FIRST_ELEMENT (vinfo_for_stmt (current_stmt));
2613 while (next_stmt)
2615 if (gimple_assign_rhs2 (next_stmt) == lhs)
2617 tree op = gimple_assign_rhs1 (next_stmt);
2618 gimple *def_stmt = NULL;
2620 if (TREE_CODE (op) == SSA_NAME)
2621 def_stmt = SSA_NAME_DEF_STMT (op);
2623 /* Check that the other def is either defined in the loop
2624 ("vect_internal_def"), or it's an induction (defined by a
2625 loop-header phi-node). */
2626 if (def_stmt
2627 && gimple_bb (def_stmt)
2628 && flow_bb_inside_loop_p (loop, gimple_bb (def_stmt))
2629 && (is_gimple_assign (def_stmt)
2630 || is_gimple_call (def_stmt)
2631 || STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def_stmt))
2632 == vect_induction_def
2633 || (gimple_code (def_stmt) == GIMPLE_PHI
2634 && STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def_stmt))
2635 == vect_internal_def
2636 && !is_loop_header_bb_p (gimple_bb (def_stmt)))))
2638 lhs = gimple_assign_lhs (next_stmt);
2639 next_stmt = REDUC_GROUP_NEXT_ELEMENT (vinfo_for_stmt (next_stmt));
2640 continue;
2643 return false;
2645 else
2647 tree op = gimple_assign_rhs2 (next_stmt);
2648 gimple *def_stmt = NULL;
2650 if (TREE_CODE (op) == SSA_NAME)
2651 def_stmt = SSA_NAME_DEF_STMT (op);
2653 /* Check that the other def is either defined in the loop
2654 ("vect_internal_def"), or it's an induction (defined by a
2655 loop-header phi-node). */
2656 if (def_stmt
2657 && gimple_bb (def_stmt)
2658 && flow_bb_inside_loop_p (loop, gimple_bb (def_stmt))
2659 && (is_gimple_assign (def_stmt)
2660 || is_gimple_call (def_stmt)
2661 || STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def_stmt))
2662 == vect_induction_def
2663 || (gimple_code (def_stmt) == GIMPLE_PHI
2664 && STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def_stmt))
2665 == vect_internal_def
2666 && !is_loop_header_bb_p (gimple_bb (def_stmt)))))
2668 if (dump_enabled_p ())
2670 dump_printf_loc (MSG_NOTE, vect_location, "swapping oprnds: ");
2671 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, next_stmt, 0);
2674 swap_ssa_operands (next_stmt,
2675 gimple_assign_rhs1_ptr (next_stmt),
2676 gimple_assign_rhs2_ptr (next_stmt));
2677 update_stmt (next_stmt);
2679 if (CONSTANT_CLASS_P (gimple_assign_rhs1 (next_stmt)))
2680 LOOP_VINFO_OPERANDS_SWAPPED (loop_info) = true;
2682 else
2683 return false;
2686 lhs = gimple_assign_lhs (next_stmt);
2687 next_stmt = REDUC_GROUP_NEXT_ELEMENT (vinfo_for_stmt (next_stmt));
2690 /* Save the chain for further analysis in SLP detection. */
2691 first = REDUC_GROUP_FIRST_ELEMENT (vinfo_for_stmt (current_stmt));
2692 LOOP_VINFO_REDUCTION_CHAINS (loop_info).safe_push (first);
2693 REDUC_GROUP_SIZE (vinfo_for_stmt (first)) = size;
2695 return true;
2698 /* Return true if we need an in-order reduction for operation CODE
2699 on type TYPE. NEED_WRAPPING_INTEGRAL_OVERFLOW is true if integer
2700 overflow must wrap. */
2702 static bool
2703 needs_fold_left_reduction_p (tree type, tree_code code,
2704 bool need_wrapping_integral_overflow)
2706 /* CHECKME: check for !flag_finite_math_only too? */
2707 if (SCALAR_FLOAT_TYPE_P (type))
2708 switch (code)
2710 case MIN_EXPR:
2711 case MAX_EXPR:
2712 return false;
2714 default:
2715 return !flag_associative_math;
2718 if (INTEGRAL_TYPE_P (type))
2720 if (!operation_no_trapping_overflow (type, code))
2721 return true;
2722 if (need_wrapping_integral_overflow
2723 && !TYPE_OVERFLOW_WRAPS (type)
2724 && operation_can_overflow (code))
2725 return true;
2726 return false;
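      /* Examples: a float "sum += a[i]" without -fassociative-math needs
	 an in-order reduction (the float default above), float MIN/MAX do
	 not, a wrapping unsigned addition does not, and a signed addition
	 does when its overflow traps or when
	 NEED_WRAPPING_INTEGRAL_OVERFLOW asks for wrapping semantics the
	 type cannot guarantee.  */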
2729 if (SAT_FIXED_POINT_TYPE_P (type))
2730 return true;
2732 return false;
2735 /* Return true if the reduction PHI in LOOP with latch arg LOOP_ARG and
2736 reduction operation CODE has a handled computation expression. */
2738 bool
2739 check_reduction_path (dump_user_location_t loc, loop_p loop, gphi *phi,
2740 tree loop_arg, enum tree_code code)
2742 auto_vec<std::pair<ssa_op_iter, use_operand_p> > path;
2743 auto_bitmap visited;
2744 tree lookfor = PHI_RESULT (phi);
2745 ssa_op_iter curri;
2746 use_operand_p curr = op_iter_init_phiuse (&curri, phi, SSA_OP_USE);
2747 while (USE_FROM_PTR (curr) != loop_arg)
2748 curr = op_iter_next_use (&curri);
2749 curri.i = curri.numops;
2752 path.safe_push (std::make_pair (curri, curr));
2753 tree use = USE_FROM_PTR (curr);
2754 if (use == lookfor)
2755 break;
2756 gimple *def = SSA_NAME_DEF_STMT (use);
2757 if (gimple_nop_p (def)
2758 || ! flow_bb_inside_loop_p (loop, gimple_bb (def)))
2760 pop:
2763 std::pair<ssa_op_iter, use_operand_p> x = path.pop ();
2764 curri = x.first;
2765 curr = x.second;
2767 curr = op_iter_next_use (&curri);
2768 /* Skip already visited or non-SSA operands (from iterating
2769 over PHI args). */
2770 while (curr != NULL_USE_OPERAND_P
2771 && (TREE_CODE (USE_FROM_PTR (curr)) != SSA_NAME
2772 || ! bitmap_set_bit (visited,
2773 SSA_NAME_VERSION
2774 (USE_FROM_PTR (curr)))));
2776 while (curr == NULL_USE_OPERAND_P && ! path.is_empty ());
2777 if (curr == NULL_USE_OPERAND_P)
2778 break;
2780 else
2782 if (gimple_code (def) == GIMPLE_PHI)
2783 curr = op_iter_init_phiuse (&curri, as_a <gphi *>(def), SSA_OP_USE);
2784 else
2785 curr = op_iter_init_use (&curri, def, SSA_OP_USE);
2786 while (curr != NULL_USE_OPERAND_P
2787 && (TREE_CODE (USE_FROM_PTR (curr)) != SSA_NAME
2788 || ! bitmap_set_bit (visited,
2789 SSA_NAME_VERSION
2790 (USE_FROM_PTR (curr)))))
2791 curr = op_iter_next_use (&curri);
2792 if (curr == NULL_USE_OPERAND_P)
2793 goto pop;
2796 while (1);
2797 if (dump_file && (dump_flags & TDF_DETAILS))
2799 dump_printf_loc (MSG_NOTE, loc, "reduction path: ");
2800 unsigned i;
2801 std::pair<ssa_op_iter, use_operand_p> *x;
2802 FOR_EACH_VEC_ELT (path, i, x)
2804 dump_generic_expr (MSG_NOTE, TDF_SLIM, USE_FROM_PTR (x->second));
2805 dump_printf (MSG_NOTE, " ");
2807 dump_printf (MSG_NOTE, "\n");
2810 /* Check whether the reduction path detected is valid. */
2811 bool fail = path.length () == 0;
2812 bool neg = false;
2813 for (unsigned i = 1; i < path.length (); ++i)
2815 gimple *use_stmt = USE_STMT (path[i].second);
2816 tree op = USE_FROM_PTR (path[i].second);
2817 if (! has_single_use (op)
2818 || ! is_gimple_assign (use_stmt))
2820 fail = true;
2821 break;
2823 if (gimple_assign_rhs_code (use_stmt) != code)
2825 if (code == PLUS_EXPR
2826 && gimple_assign_rhs_code (use_stmt) == MINUS_EXPR)
2828 /* Track whether we negate the reduction value each iteration. */
2829 if (gimple_assign_rhs2 (use_stmt) == op)
2830 neg = ! neg;
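	      /* E.g. with CODE == PLUS_EXPR, "res = x - res" has the
		 reduction use as rhs2 and flips NEG here; a path ending
		 with NEG set is rejected below.  "res = res - x" keeps NEG
		 clear and is accepted (the caller later treats it as
		 "res += -x").  */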
2832 else
2834 fail = true;
2835 break;
2839 return ! fail && ! neg;
2843 /* Function vect_is_simple_reduction
2845 (1) Detect a cross-iteration def-use cycle that represents a simple
2846 reduction computation. We look for the following pattern:
2848 loop_header:
2849 a1 = phi < a0, a2 >
2850 a3 = ...
2851 a2 = operation (a3, a1)
2855 a3 = ...
2856 loop_header:
2857 a1 = phi < a0, a2 >
2858 a2 = operation (a3, a1)
2860 such that:
2861 1. operation is commutative and associative and it is safe to
2862 change the order of the computation
2863 2. no uses for a2 in the loop (a2 is used out of the loop)
2864 3. no uses of a1 in the loop besides the reduction operation
2865 4. no uses of a1 outside the loop.
2867 Conditions 1,4 are tested here.
2868 Conditions 2,3 are tested in vect_mark_stmts_to_be_vectorized.
2870 (2) Detect a cross-iteration def-use cycle in nested loops, i.e.,
2871 nested cycles.
2873 (3) Detect cycles of phi nodes in outer-loop vectorization, i.e., double
2874 reductions:
2876 a1 = phi < a0, a2 >
2877 inner loop (def of a3)
2878 a2 = phi < a3 >
2880    (4) Detect condition expressions, i.e.:
2881 for (int i = 0; i < N; i++)
2882 if (a[i] < val)
2883 ret_val = a[i];
2887 static gimple *
2888 vect_is_simple_reduction (loop_vec_info loop_info, gimple *phi,
2889 bool *double_reduc,
2890 bool need_wrapping_integral_overflow,
2891 enum vect_reduction_type *v_reduc_type)
2893 struct loop *loop = (gimple_bb (phi))->loop_father;
2894 struct loop *vect_loop = LOOP_VINFO_LOOP (loop_info);
2895 gimple *def_stmt, *def1 = NULL, *def2 = NULL, *phi_use_stmt = NULL;
2896 enum tree_code orig_code, code;
2897 tree op1, op2, op3 = NULL_TREE, op4 = NULL_TREE;
2898 tree type;
2899 int nloop_uses;
2900 tree name;
2901 imm_use_iterator imm_iter;
2902 use_operand_p use_p;
2903 bool phi_def;
2905 *double_reduc = false;
2906 *v_reduc_type = TREE_CODE_REDUCTION;
2908 tree phi_name = PHI_RESULT (phi);
2909 /* ??? If there are no uses of the PHI result the inner loop reduction
2910 won't be detected as possibly double-reduction by vectorizable_reduction
2911 because that tries to walk the PHI arg from the preheader edge which
2912 can be constant. See PR60382. */
2913 if (has_zero_uses (phi_name))
2914 return NULL;
2915 nloop_uses = 0;
2916 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, phi_name)
2918 gimple *use_stmt = USE_STMT (use_p);
2919 if (is_gimple_debug (use_stmt))
2920 continue;
2922 if (!flow_bb_inside_loop_p (loop, gimple_bb (use_stmt)))
2924 if (dump_enabled_p ())
2925 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2926 "intermediate value used outside loop.\n");
2928 return NULL;
2931 nloop_uses++;
2932 if (nloop_uses > 1)
2934 if (dump_enabled_p ())
2935 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2936 "reduction value used in loop.\n");
2937 return NULL;
2940 phi_use_stmt = use_stmt;
2943 edge latch_e = loop_latch_edge (loop);
2944 tree loop_arg = PHI_ARG_DEF_FROM_EDGE (phi, latch_e);
2945 if (TREE_CODE (loop_arg) != SSA_NAME)
2947 if (dump_enabled_p ())
2949 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2950 "reduction: not ssa_name: ");
2951 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, loop_arg);
2952 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
2954 return NULL;
2957 def_stmt = SSA_NAME_DEF_STMT (loop_arg);
2958 if (is_gimple_assign (def_stmt))
2960 name = gimple_assign_lhs (def_stmt);
2961 phi_def = false;
2963 else if (gimple_code (def_stmt) == GIMPLE_PHI)
2965 name = PHI_RESULT (def_stmt);
2966 phi_def = true;
2968 else
2970 if (dump_enabled_p ())
2972 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2973 "reduction: unhandled reduction operation: ");
2974 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, def_stmt, 0);
2976 return NULL;
2979 if (! flow_bb_inside_loop_p (loop, gimple_bb (def_stmt)))
2980 return NULL;
2982 nloop_uses = 0;
2983 auto_vec<gphi *, 3> lcphis;
2984 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, name)
2986 gimple *use_stmt = USE_STMT (use_p);
2987 if (is_gimple_debug (use_stmt))
2988 continue;
2989 if (flow_bb_inside_loop_p (loop, gimple_bb (use_stmt)))
2990 nloop_uses++;
2991 else
2992 /* We can have more than one loop-closed PHI. */
2993 lcphis.safe_push (as_a <gphi *> (use_stmt));
2994 if (nloop_uses > 1)
2996 if (dump_enabled_p ())
2997 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2998 "reduction used in loop.\n");
2999 return NULL;
3003 /* If DEF_STMT is a phi node itself, we expect it to have a single argument
3004 defined in the inner loop. */
3005 if (phi_def)
3007 op1 = PHI_ARG_DEF (def_stmt, 0);
3009 if (gimple_phi_num_args (def_stmt) != 1
3010 || TREE_CODE (op1) != SSA_NAME)
3012 if (dump_enabled_p ())
3013 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3014 "unsupported phi node definition.\n");
3016 return NULL;
3019 def1 = SSA_NAME_DEF_STMT (op1);
3020 if (gimple_bb (def1)
3021 && flow_bb_inside_loop_p (loop, gimple_bb (def_stmt))
3022 && loop->inner
3023 && flow_bb_inside_loop_p (loop->inner, gimple_bb (def1))
3024 && is_gimple_assign (def1)
3025 && flow_bb_inside_loop_p (loop->inner, gimple_bb (phi_use_stmt)))
3027 if (dump_enabled_p ())
3028 report_vect_op (MSG_NOTE, def_stmt,
3029 "detected double reduction: ");
3031 *double_reduc = true;
3032 return def_stmt;
3035 return NULL;
3038   /* If we are vectorizing an inner reduction, we execute it in the
3039      original order only when we are not dealing with a double
3040      reduction.  */
3041 bool check_reduction = true;
3042 if (flow_loop_nested_p (vect_loop, loop))
3044 gphi *lcphi;
3045 unsigned i;
3046 check_reduction = false;
3047 FOR_EACH_VEC_ELT (lcphis, i, lcphi)
3048 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, gimple_phi_result (lcphi))
3050 gimple *use_stmt = USE_STMT (use_p);
3051 if (is_gimple_debug (use_stmt))
3052 continue;
3053 if (! flow_bb_inside_loop_p (vect_loop, gimple_bb (use_stmt)))
3054 check_reduction = true;
3058 bool nested_in_vect_loop = flow_loop_nested_p (vect_loop, loop);
3059 code = orig_code = gimple_assign_rhs_code (def_stmt);
3061   /* We can handle "res -= x[i]", which is non-associative, by simply
3062      rewriting it as "res += -x[i]".  Avoid changing the gimple
3063      instruction for the first simple tests and only do this if we're
3064      allowed to change the code at all.  */
3065 if (code == MINUS_EXPR && gimple_assign_rhs2 (def_stmt) != phi_name)
3066 code = PLUS_EXPR;
3068 if (code == COND_EXPR)
3070 if (! nested_in_vect_loop)
3071 *v_reduc_type = COND_REDUCTION;
3073 op3 = gimple_assign_rhs1 (def_stmt);
3074 if (COMPARISON_CLASS_P (op3))
3076 op4 = TREE_OPERAND (op3, 1);
3077 op3 = TREE_OPERAND (op3, 0);
3079 if (op3 == phi_name || op4 == phi_name)
3081 if (dump_enabled_p ())
3082 report_vect_op (MSG_MISSED_OPTIMIZATION, def_stmt,
3083 "reduction: condition depends on previous"
3084 " iteration: ");
3085 return NULL;
3088 op1 = gimple_assign_rhs2 (def_stmt);
3089 op2 = gimple_assign_rhs3 (def_stmt);
3091 else if (!commutative_tree_code (code) || !associative_tree_code (code))
3093 if (dump_enabled_p ())
3094 report_vect_op (MSG_MISSED_OPTIMIZATION, def_stmt,
3095 "reduction: not commutative/associative: ");
3096 return NULL;
3098 else if (get_gimple_rhs_class (code) == GIMPLE_BINARY_RHS)
3100 op1 = gimple_assign_rhs1 (def_stmt);
3101 op2 = gimple_assign_rhs2 (def_stmt);
3103 else
3105 if (dump_enabled_p ())
3106 report_vect_op (MSG_MISSED_OPTIMIZATION, def_stmt,
3107 "reduction: not handled operation: ");
3108 return NULL;
3111 if (TREE_CODE (op1) != SSA_NAME && TREE_CODE (op2) != SSA_NAME)
3113 if (dump_enabled_p ())
3114 report_vect_op (MSG_MISSED_OPTIMIZATION, def_stmt,
3115 "reduction: both uses not ssa_names: ");
3117 return NULL;
3120 type = TREE_TYPE (gimple_assign_lhs (def_stmt));
3121 if ((TREE_CODE (op1) == SSA_NAME
3122        && !types_compatible_p (type, TREE_TYPE (op1)))
3123 || (TREE_CODE (op2) == SSA_NAME
3124 && !types_compatible_p (type, TREE_TYPE (op2)))
3125 || (op3 && TREE_CODE (op3) == SSA_NAME
3126 && !types_compatible_p (type, TREE_TYPE (op3)))
3127 || (op4 && TREE_CODE (op4) == SSA_NAME
3128 && !types_compatible_p (type, TREE_TYPE (op4))))
3130 if (dump_enabled_p ())
3132 dump_printf_loc (MSG_NOTE, vect_location,
3133 "reduction: multiple types: operation type: ");
3134 dump_generic_expr (MSG_NOTE, TDF_SLIM, type);
3135 dump_printf (MSG_NOTE, ", operands types: ");
3136 dump_generic_expr (MSG_NOTE, TDF_SLIM,
3137 TREE_TYPE (op1));
3138 dump_printf (MSG_NOTE, ",");
3139 dump_generic_expr (MSG_NOTE, TDF_SLIM,
3140 TREE_TYPE (op2));
3141 if (op3)
3143 dump_printf (MSG_NOTE, ",");
3144 dump_generic_expr (MSG_NOTE, TDF_SLIM,
3145 TREE_TYPE (op3));
3148 if (op4)
3150 dump_printf (MSG_NOTE, ",");
3151 dump_generic_expr (MSG_NOTE, TDF_SLIM,
3152 TREE_TYPE (op4));
3154 dump_printf (MSG_NOTE, "\n");
3157 return NULL;
3160 /* Check whether it's ok to change the order of the computation.
3161 Generally, when vectorizing a reduction we change the order of the
3162 computation. This may change the behavior of the program in some
3163 cases, so we need to check that this is ok. One exception is when
3164 vectorizing an outer-loop: the inner-loop is executed sequentially,
3165 and therefore vectorizing reductions in the inner-loop during
3166 outer-loop vectorization is safe. */
3167 if (check_reduction
3168 && *v_reduc_type == TREE_CODE_REDUCTION
3169 && needs_fold_left_reduction_p (type, code,
3170 need_wrapping_integral_overflow))
3171 *v_reduc_type = FOLD_LEFT_REDUCTION;
3173 /* Reduction is safe. We're dealing with one of the following:
3174 1) integer arithmetic and no trapv
3175 2) floating point arithmetic, and special flags permit this optimization
3176 3) nested cycle (i.e., outer loop vectorization). */
3177 if (TREE_CODE (op1) == SSA_NAME)
3178 def1 = SSA_NAME_DEF_STMT (op1);
3180 if (TREE_CODE (op2) == SSA_NAME)
3181 def2 = SSA_NAME_DEF_STMT (op2);
3183 if (code != COND_EXPR
3184 && ((!def1 || gimple_nop_p (def1)) && (!def2 || gimple_nop_p (def2))))
3186 if (dump_enabled_p ())
3187 report_vect_op (MSG_NOTE, def_stmt, "reduction: no defs for operands: ");
3188 return NULL;
3191 /* Check that one def is the reduction def, defined by PHI,
3192 the other def is either defined in the loop ("vect_internal_def"),
3193 or it's an induction (defined by a loop-header phi-node). */
3195 if (def2 && def2 == phi
3196 && (code == COND_EXPR
3197 || !def1 || gimple_nop_p (def1)
3198 || !flow_bb_inside_loop_p (loop, gimple_bb (def1))
3199 || (def1 && flow_bb_inside_loop_p (loop, gimple_bb (def1))
3200 && (is_gimple_assign (def1)
3201 || is_gimple_call (def1)
3202 || STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def1))
3203 == vect_induction_def
3204 || (gimple_code (def1) == GIMPLE_PHI
3205 && STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def1))
3206 == vect_internal_def
3207 && !is_loop_header_bb_p (gimple_bb (def1)))))))
3209 if (dump_enabled_p ())
3210 report_vect_op (MSG_NOTE, def_stmt, "detected reduction: ");
3211 return def_stmt;
3214 if (def1 && def1 == phi
3215 && (code == COND_EXPR
3216 || !def2 || gimple_nop_p (def2)
3217 || !flow_bb_inside_loop_p (loop, gimple_bb (def2))
3218 || (def2 && flow_bb_inside_loop_p (loop, gimple_bb (def2))
3219 && (is_gimple_assign (def2)
3220 || is_gimple_call (def2)
3221 || STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def2))
3222 == vect_induction_def
3223 || (gimple_code (def2) == GIMPLE_PHI
3224 && STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def2))
3225 == vect_internal_def
3226 && !is_loop_header_bb_p (gimple_bb (def2)))))))
3228 if (! nested_in_vect_loop && orig_code != MINUS_EXPR)
3230 /* Check if we can swap operands (just for simplicity - so that
3231 the rest of the code can assume that the reduction variable
3232 is always the last (second) argument). */
3233 if (code == COND_EXPR)
3235 /* Swap cond_expr by inverting the condition. */
3236 tree cond_expr = gimple_assign_rhs1 (def_stmt);
3237 enum tree_code invert_code = ERROR_MARK;
3238 enum tree_code cond_code = TREE_CODE (cond_expr);
3240 if (TREE_CODE_CLASS (cond_code) == tcc_comparison)
3242 bool honor_nans = HONOR_NANS (TREE_OPERAND (cond_expr, 0));
3243 invert_code = invert_tree_comparison (cond_code, honor_nans);
3245 if (invert_code != ERROR_MARK)
3247 TREE_SET_CODE (cond_expr, invert_code);
3248 swap_ssa_operands (def_stmt,
3249 gimple_assign_rhs2_ptr (def_stmt),
3250 gimple_assign_rhs3_ptr (def_stmt));
3252 else
3254 if (dump_enabled_p ())
3255 report_vect_op (MSG_NOTE, def_stmt,
3256 "detected reduction: cannot swap operands "
3257 "for cond_expr");
3258 return NULL;
3261 else
3262 swap_ssa_operands (def_stmt, gimple_assign_rhs1_ptr (def_stmt),
3263 gimple_assign_rhs2_ptr (def_stmt));
3265 if (dump_enabled_p ())
3266 report_vect_op (MSG_NOTE, def_stmt,
3267 "detected reduction: need to swap operands: ");
3269 if (CONSTANT_CLASS_P (gimple_assign_rhs1 (def_stmt)))
3270 LOOP_VINFO_OPERANDS_SWAPPED (loop_info) = true;
3272 else
3274 if (dump_enabled_p ())
3275 report_vect_op (MSG_NOTE, def_stmt, "detected reduction: ");
3278 return def_stmt;
3281 /* Try to find SLP reduction chain. */
3282 if (! nested_in_vect_loop
3283 && code != COND_EXPR
3284 && orig_code != MINUS_EXPR
3285 && vect_is_slp_reduction (loop_info, phi, def_stmt))
3287 if (dump_enabled_p ())
3288 report_vect_op (MSG_NOTE, def_stmt,
3289 "reduction: detected reduction chain: ");
3291 return def_stmt;
3294   /* Dissolve the group possibly half-built by vect_is_slp_reduction.  */
3295 gimple *first = REDUC_GROUP_FIRST_ELEMENT (vinfo_for_stmt (def_stmt));
3296 while (first)
3298 gimple *next = REDUC_GROUP_NEXT_ELEMENT (vinfo_for_stmt (first));
3299 REDUC_GROUP_FIRST_ELEMENT (vinfo_for_stmt (first)) = NULL;
3300 REDUC_GROUP_NEXT_ELEMENT (vinfo_for_stmt (first)) = NULL;
3301 first = next;
3304 /* Look for the expression computing loop_arg from loop PHI result. */
3305 if (check_reduction_path (vect_location, loop, as_a <gphi *> (phi), loop_arg,
3306 code))
3307 return def_stmt;
3309 if (dump_enabled_p ())
3311 report_vect_op (MSG_MISSED_OPTIMIZATION, def_stmt,
3312 "reduction: unknown pattern: ");
3315 return NULL;
3318 /* Wrapper around vect_is_simple_reduction, which will modify code
3319 in-place if it enables detection of more reductions. Arguments
3320 as there. */
3322 gimple *
3323 vect_force_simple_reduction (loop_vec_info loop_info, gimple *phi,
3324 bool *double_reduc,
3325 bool need_wrapping_integral_overflow)
3327 enum vect_reduction_type v_reduc_type;
3328 gimple *def = vect_is_simple_reduction (loop_info, phi, double_reduc,
3329 need_wrapping_integral_overflow,
3330 &v_reduc_type);
3331 if (def)
3333 stmt_vec_info reduc_def_info = vinfo_for_stmt (phi);
3334 STMT_VINFO_REDUC_TYPE (reduc_def_info) = v_reduc_type;
3335 STMT_VINFO_REDUC_DEF (reduc_def_info) = def;
3336 reduc_def_info = vinfo_for_stmt (def);
3337 STMT_VINFO_REDUC_TYPE (reduc_def_info) = v_reduc_type;
3338 STMT_VINFO_REDUC_DEF (reduc_def_info) = phi;
3340 return def;
3343 /* Calculate cost of peeling the loop PEEL_ITERS_PROLOGUE times. */
3344 int
3345 vect_get_known_peeling_cost (loop_vec_info loop_vinfo, int peel_iters_prologue,
3346 int *peel_iters_epilogue,
3347 stmt_vector_for_cost *scalar_cost_vec,
3348 stmt_vector_for_cost *prologue_cost_vec,
3349 stmt_vector_for_cost *epilogue_cost_vec)
3351 int retval = 0;
3352 int assumed_vf = vect_vf_for_cost (loop_vinfo);
3354 if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
3356 *peel_iters_epilogue = assumed_vf / 2;
3357 if (dump_enabled_p ())
3358 dump_printf_loc (MSG_NOTE, vect_location,
3359 "cost model: epilogue peel iters set to vf/2 "
3360 			 "because loop iterations are unknown.\n");
3362       /* If peeled iterations are known but the number of scalar loop
3363 	 iterations is unknown, count a taken branch per peeled loop.  */
3364 retval = record_stmt_cost (prologue_cost_vec, 1, cond_branch_taken,
3365 NULL, 0, vect_prologue);
3366 retval = record_stmt_cost (prologue_cost_vec, 1, cond_branch_taken,
3367 NULL, 0, vect_epilogue);
3369 else
3371 int niters = LOOP_VINFO_INT_NITERS (loop_vinfo);
3372 peel_iters_prologue = niters < peel_iters_prologue ?
3373 niters : peel_iters_prologue;
3374 *peel_iters_epilogue = (niters - peel_iters_prologue) % assumed_vf;
3375 /* If we need to peel for gaps, but no peeling is required, we have to
3376 peel VF iterations. */
3377 if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo) && !*peel_iters_epilogue)
3378 *peel_iters_epilogue = assumed_vf;
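      /* Worked example (illustrative): assumed_vf = 4, a known NITERS of
	 10 and peel_iters_prologue = 3 give
	 *peel_iters_epilogue = (10 - 3) % 4 = 3; had that come out as 0
	 with PEELING_FOR_GAPS set, it would be forced to a full 4 above.  */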
3381 stmt_info_for_cost *si;
3382 int j;
3383 if (peel_iters_prologue)
3384 FOR_EACH_VEC_ELT (*scalar_cost_vec, j, si)
3386 stmt_vec_info stmt_info
3387 = si->stmt ? vinfo_for_stmt (si->stmt) : NULL;
3388 retval += record_stmt_cost (prologue_cost_vec,
3389 si->count * peel_iters_prologue,
3390 si->kind, stmt_info, si->misalign,
3391 vect_prologue);
3393 if (*peel_iters_epilogue)
3394 FOR_EACH_VEC_ELT (*scalar_cost_vec, j, si)
3396 stmt_vec_info stmt_info
3397 = si->stmt ? vinfo_for_stmt (si->stmt) : NULL;
3398 retval += record_stmt_cost (epilogue_cost_vec,
3399 si->count * *peel_iters_epilogue,
3400 si->kind, stmt_info, si->misalign,
3401 vect_epilogue);
3404 return retval;
3407 /* Function vect_estimate_min_profitable_iters
3409 Return the number of iterations required for the vector version of the
3410 loop to be profitable relative to the cost of the scalar version of the
3411 loop.
3413 *RET_MIN_PROFITABLE_NITERS is a cost model profitability threshold
3414 of iterations for vectorization. -1 value means loop vectorization
3415 is not profitable. This returned value may be used for dynamic
3416 profitability check.
3418 *RET_MIN_PROFITABLE_ESTIMATE is a profitability threshold to be used
3419 for static check against estimated number of iterations. */
3421 static void
3422 vect_estimate_min_profitable_iters (loop_vec_info loop_vinfo,
3423 int *ret_min_profitable_niters,
3424 int *ret_min_profitable_estimate)
3426 int min_profitable_iters;
3427 int min_profitable_estimate;
3428 int peel_iters_prologue;
3429 int peel_iters_epilogue;
3430 unsigned vec_inside_cost = 0;
3431 int vec_outside_cost = 0;
3432 unsigned vec_prologue_cost = 0;
3433 unsigned vec_epilogue_cost = 0;
3434 int scalar_single_iter_cost = 0;
3435 int scalar_outside_cost = 0;
3436 int assumed_vf = vect_vf_for_cost (loop_vinfo);
3437 int npeel = LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo);
3438 void *target_cost_data = LOOP_VINFO_TARGET_COST_DATA (loop_vinfo);
3440 /* Cost model disabled. */
3441 if (unlimited_cost_model (LOOP_VINFO_LOOP (loop_vinfo)))
3443 dump_printf_loc (MSG_NOTE, vect_location, "cost model disabled.\n");
3444 *ret_min_profitable_niters = 0;
3445 *ret_min_profitable_estimate = 0;
3446 return;
3449 /* Requires loop versioning tests to handle misalignment. */
3450 if (LOOP_REQUIRES_VERSIONING_FOR_ALIGNMENT (loop_vinfo))
3452 /* FIXME: Make cost depend on complexity of individual check. */
3453 unsigned len = LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).length ();
3454 (void) add_stmt_cost (target_cost_data, len, vector_stmt, NULL, 0,
3455 vect_prologue);
3456 dump_printf (MSG_NOTE,
3457 "cost model: Adding cost of checks for loop "
3458 "versioning to treat misalignment.\n");
3461 /* Requires loop versioning with alias checks. */
3462 if (LOOP_REQUIRES_VERSIONING_FOR_ALIAS (loop_vinfo))
3464 /* FIXME: Make cost depend on complexity of individual check. */
3465 unsigned len = LOOP_VINFO_COMP_ALIAS_DDRS (loop_vinfo).length ();
3466 (void) add_stmt_cost (target_cost_data, len, vector_stmt, NULL, 0,
3467 vect_prologue);
3468 len = LOOP_VINFO_CHECK_UNEQUAL_ADDRS (loop_vinfo).length ();
3469 if (len)
3470 /* Count LEN - 1 ANDs and LEN comparisons. */
3471 (void) add_stmt_cost (target_cost_data, len * 2 - 1, scalar_stmt,
3472 NULL, 0, vect_prologue);
3473 len = LOOP_VINFO_LOWER_BOUNDS (loop_vinfo).length ();
3474 if (len)
3476 /* Count LEN - 1 ANDs and LEN comparisons. */
3477 unsigned int nstmts = len * 2 - 1;
3478 /* +1 for each bias that needs adding. */
3479 for (unsigned int i = 0; i < len; ++i)
3480 if (!LOOP_VINFO_LOWER_BOUNDS (loop_vinfo)[i].unsigned_p)
3481 nstmts += 1;
3482 (void) add_stmt_cost (target_cost_data, nstmts, scalar_stmt,
3483 NULL, 0, vect_prologue);
3485 dump_printf (MSG_NOTE,
3486 "cost model: Adding cost of checks for loop "
3487 "versioning aliasing.\n");
3490 /* Requires loop versioning with niter checks. */
3491 if (LOOP_REQUIRES_VERSIONING_FOR_NITERS (loop_vinfo))
3493 /* FIXME: Make cost depend on complexity of individual check. */
3494 (void) add_stmt_cost (target_cost_data, 1, vector_stmt, NULL, 0,
3495 vect_prologue);
3496 dump_printf (MSG_NOTE,
3497 "cost model: Adding cost of checks for loop "
3498 "versioning niters.\n");
3501 if (LOOP_REQUIRES_VERSIONING (loop_vinfo))
3502 (void) add_stmt_cost (target_cost_data, 1, cond_branch_taken, NULL, 0,
3503 vect_prologue);
3505 /* Count statements in scalar loop. Using this as scalar cost for a single
3506 iteration for now.
3508 TODO: Add outer loop support.
3510 TODO: Consider assigning different costs to different scalar
3511 statements. */
3513 scalar_single_iter_cost
3514 = LOOP_VINFO_SINGLE_SCALAR_ITERATION_COST (loop_vinfo);
3516 /* Add additional cost for the peeled instructions in prologue and epilogue
3517 loop. (For fully-masked loops there will be no peeling.)
3519 FORNOW: If we don't know the value of peel_iters for prologue or epilogue
3520 at compile-time - we assume it's vf/2 (the worst would be vf-1).
3522 TODO: Build an expression that represents peel_iters for prologue and
3523 epilogue to be used in a run-time test. */
3525 if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
3527 peel_iters_prologue = 0;
3528 peel_iters_epilogue = 0;
3530 if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo))
3532 /* We need to peel exactly one iteration. */
3533 peel_iters_epilogue += 1;
3534 stmt_info_for_cost *si;
3535 int j;
3536 FOR_EACH_VEC_ELT (LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo),
3537 j, si)
3539 struct _stmt_vec_info *stmt_info
3540 = si->stmt ? vinfo_for_stmt (si->stmt) : NULL;
3541 (void) add_stmt_cost (target_cost_data, si->count,
3542 si->kind, stmt_info, si->misalign,
3543 vect_epilogue);
3547 else if (npeel < 0)
3549 peel_iters_prologue = assumed_vf / 2;
3550 dump_printf (MSG_NOTE, "cost model: "
3551 "prologue peel iters set to vf/2.\n");
3553       /* If peeling for alignment is unknown, the loop bound of the main
3554 	 loop becomes unknown.  */
3555 peel_iters_epilogue = assumed_vf / 2;
3556 dump_printf (MSG_NOTE, "cost model: "
3557 "epilogue peel iters set to vf/2 because "
3558 "peeling for alignment is unknown.\n");
3560 /* If peeled iterations are unknown, count a taken branch and a not taken
3561 branch per peeled loop. Even if scalar loop iterations are known,
3562 vector iterations are not known since peeled prologue iterations are
3563 not known. Hence guards remain the same. */
3564 (void) add_stmt_cost (target_cost_data, 1, cond_branch_taken,
3565 NULL, 0, vect_prologue);
3566 (void) add_stmt_cost (target_cost_data, 1, cond_branch_not_taken,
3567 NULL, 0, vect_prologue);
3568 (void) add_stmt_cost (target_cost_data, 1, cond_branch_taken,
3569 NULL, 0, vect_epilogue);
3570 (void) add_stmt_cost (target_cost_data, 1, cond_branch_not_taken,
3571 NULL, 0, vect_epilogue);
3572 stmt_info_for_cost *si;
3573 int j;
3574 FOR_EACH_VEC_ELT (LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo), j, si)
3576 struct _stmt_vec_info *stmt_info
3577 = si->stmt ? vinfo_for_stmt (si->stmt) : NULL;
3578 (void) add_stmt_cost (target_cost_data,
3579 si->count * peel_iters_prologue,
3580 si->kind, stmt_info, si->misalign,
3581 vect_prologue);
3582 (void) add_stmt_cost (target_cost_data,
3583 si->count * peel_iters_epilogue,
3584 si->kind, stmt_info, si->misalign,
3585 vect_epilogue);
3588 else
3590 stmt_vector_for_cost prologue_cost_vec, epilogue_cost_vec;
3591 stmt_info_for_cost *si;
3592 int j;
3593 void *data = LOOP_VINFO_TARGET_COST_DATA (loop_vinfo);
3595 prologue_cost_vec.create (2);
3596 epilogue_cost_vec.create (2);
3597 peel_iters_prologue = npeel;
3599 (void) vect_get_known_peeling_cost (loop_vinfo, peel_iters_prologue,
3600 &peel_iters_epilogue,
3601 &LOOP_VINFO_SCALAR_ITERATION_COST
3602 (loop_vinfo),
3603 &prologue_cost_vec,
3604 &epilogue_cost_vec);
3606 FOR_EACH_VEC_ELT (prologue_cost_vec, j, si)
3608 struct _stmt_vec_info *stmt_info
3609 = si->stmt ? vinfo_for_stmt (si->stmt) : NULL;
3610 (void) add_stmt_cost (data, si->count, si->kind, stmt_info,
3611 si->misalign, vect_prologue);
3614 FOR_EACH_VEC_ELT (epilogue_cost_vec, j, si)
3616 struct _stmt_vec_info *stmt_info
3617 = si->stmt ? vinfo_for_stmt (si->stmt) : NULL;
3618 (void) add_stmt_cost (data, si->count, si->kind, stmt_info,
3619 si->misalign, vect_epilogue);
3622 prologue_cost_vec.release ();
3623 epilogue_cost_vec.release ();
3626 /* FORNOW: The scalar outside cost is incremented in one of the
3627 following ways:
3629 1. The vectorizer checks for alignment and aliasing and generates
3630 a condition that allows dynamic vectorization. A cost model
3631 check is ANDED with the versioning condition. Hence scalar code
3632 path now has the added cost of the versioning check.
3634 if (cost > th & versioning_check)
3635 jmp to vector code
3637      Hence the run-time scalar cost is incremented by the not-taken branch cost.
3639 2. The vectorizer then checks if a prologue is required. If the
3640 cost model check was not done before during versioning, it has to
3641 be done before the prologue check.
3643 if (cost <= th)
3644 prologue = scalar_iters
3645 if (prologue == 0)
3646 jmp to vector code
3647 else
3648 execute prologue
3649 if (prologue == num_iters)
3650 go to exit
3652 Hence the run-time scalar cost is incremented by a taken branch,
3653 plus a not-taken branch, plus a taken branch cost.
3655 3. The vectorizer then checks if an epilogue is required. If the
3656 cost model check was not done before during prologue check, it
3657 has to be done with the epilogue check.
3659 if (prologue == 0)
3660 jmp to vector code
3661 else
3662 execute prologue
3663 if (prologue == num_iters)
3664 go to exit
3665 vector code:
3666 if ((cost <= th) | (scalar_iters-prologue-epilogue == 0))
3667 jmp to epilogue
3669 Hence the run-time scalar cost should be incremented by 2 taken
3670 branches.
3672 TODO: The back end may reorder the BBS's differently and reverse
3673 conditions/branch directions. Change the estimates below to
3674 something more reasonable. */
3676 /* If the number of iterations is known and we do not do versioning, we can
3677 decide whether to vectorize at compile time. Hence the scalar version
3678      does not carry cost model guard costs.  */
3679 if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
3680 || LOOP_REQUIRES_VERSIONING (loop_vinfo))
3682 /* Cost model check occurs at versioning. */
3683 if (LOOP_REQUIRES_VERSIONING (loop_vinfo))
3684 scalar_outside_cost += vect_get_stmt_cost (cond_branch_not_taken);
3685 else
3687 /* Cost model check occurs at prologue generation. */
3688 if (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) < 0)
3689 scalar_outside_cost += 2 * vect_get_stmt_cost (cond_branch_taken)
3690 + vect_get_stmt_cost (cond_branch_not_taken);
3691 /* Cost model check occurs at epilogue generation. */
3692 else
3693 scalar_outside_cost += 2 * vect_get_stmt_cost (cond_branch_taken);
3697 /* Complete the target-specific cost calculations. */
3698 finish_cost (LOOP_VINFO_TARGET_COST_DATA (loop_vinfo), &vec_prologue_cost,
3699 &vec_inside_cost, &vec_epilogue_cost);
3701 vec_outside_cost = (int)(vec_prologue_cost + vec_epilogue_cost);
3703 if (dump_enabled_p ())
3705 dump_printf_loc (MSG_NOTE, vect_location, "Cost model analysis: \n");
3706 dump_printf (MSG_NOTE, " Vector inside of loop cost: %d\n",
3707 vec_inside_cost);
3708 dump_printf (MSG_NOTE, " Vector prologue cost: %d\n",
3709 vec_prologue_cost);
3710 dump_printf (MSG_NOTE, " Vector epilogue cost: %d\n",
3711 vec_epilogue_cost);
3712 dump_printf (MSG_NOTE, " Scalar iteration cost: %d\n",
3713 scalar_single_iter_cost);
3714 dump_printf (MSG_NOTE, " Scalar outside cost: %d\n",
3715 scalar_outside_cost);
3716 dump_printf (MSG_NOTE, " Vector outside cost: %d\n",
3717 vec_outside_cost);
3718 dump_printf (MSG_NOTE, " prologue iterations: %d\n",
3719 peel_iters_prologue);
3720 dump_printf (MSG_NOTE, " epilogue iterations: %d\n",
3721 peel_iters_epilogue);
3724 /* Calculate number of iterations required to make the vector version
3725 profitable, relative to the loop bodies only. The following condition
3726 must hold true:
3727 SIC * niters + SOC > VIC * ((niters-PL_ITERS-EP_ITERS)/VF) + VOC
3728 where
3729 SIC = scalar iteration cost, VIC = vector iteration cost,
3730 VOC = vector outside cost, VF = vectorization factor,
3731 PL_ITERS = prologue iterations, EP_ITERS= epilogue iterations
3732 SOC = scalar outside cost for run time cost model check. */
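      /* Worked example with made-up, purely illustrative numbers: with
	 SIC = 4, VIC = 6, VF = 4, VOC = 20, SOC = 2 and no peeling, the code
	 below computes
	   min_profitable_iters = ((20 - 2) * 4 - 0 - 0) / (4 * 4 - 6) = 7
	 and then bumps it to 8, because at 7 iterations the vector variant
	 is not yet strictly cheaper (4 * 4 * 7 <= 6 * 7 + (20 - 2) * 4).
	 Checking the condition above at niters = 8:
	   4 * 8 + 2 = 34  >  6 * ((8 - 0 - 0) / 4) + 20 = 32.  */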
3734 if ((scalar_single_iter_cost * assumed_vf) > (int) vec_inside_cost)
3736 min_profitable_iters = ((vec_outside_cost - scalar_outside_cost)
3737 * assumed_vf
3738 - vec_inside_cost * peel_iters_prologue
3739 - vec_inside_cost * peel_iters_epilogue);
3740 if (min_profitable_iters <= 0)
3741 min_profitable_iters = 0;
3742 else
3744 min_profitable_iters /= ((scalar_single_iter_cost * assumed_vf)
3745 - vec_inside_cost);
3747 if ((scalar_single_iter_cost * assumed_vf * min_profitable_iters)
3748 <= (((int) vec_inside_cost * min_profitable_iters)
3749 + (((int) vec_outside_cost - scalar_outside_cost)
3750 * assumed_vf)))
3751 min_profitable_iters++;
3754   /* The vector version will never be profitable. */
3755 else
3757 if (LOOP_VINFO_LOOP (loop_vinfo)->force_vectorize)
3758 warning_at (vect_location.get_location_t (), OPT_Wopenmp_simd,
3759 "vectorization did not happen for a simd loop");
3761 if (dump_enabled_p ())
3762 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3763 "cost model: the vector iteration cost = %d "
3764 "divided by the scalar iteration cost = %d "
3765                          "is greater than or equal to the vectorization factor = %d"
3766 ".\n",
3767 vec_inside_cost, scalar_single_iter_cost, assumed_vf);
3768 *ret_min_profitable_niters = -1;
3769 *ret_min_profitable_estimate = -1;
3770 return;
3773 dump_printf (MSG_NOTE,
3774 " Calculated minimum iters for profitability: %d\n",
3775 min_profitable_iters);
3777 if (!LOOP_VINFO_FULLY_MASKED_P (loop_vinfo)
3778 && min_profitable_iters < (assumed_vf + peel_iters_prologue))
3779 /* We want the vectorized loop to execute at least once. */
3780 min_profitable_iters = assumed_vf + peel_iters_prologue;
3782 if (dump_enabled_p ())
3783 dump_printf_loc (MSG_NOTE, vect_location,
3784 " Runtime profitability threshold = %d\n",
3785 min_profitable_iters);
3787 *ret_min_profitable_niters = min_profitable_iters;
3789 /* Calculate number of iterations required to make the vector version
3790 profitable, relative to the loop bodies only.
3792      The non-vectorized variant costs SIC * niters and it must win over the vector
3793 variant on the expected loop trip count. The following condition must hold true:
3794 SIC * niters > VIC * ((niters-PL_ITERS-EP_ITERS)/VF) + VOC + SOC */
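   /* Continuing the illustrative numbers used above (SIC = 4, VIC = 6,
      VF = 4, VOC = 20, SOC = 2, no peeling), the estimate computed below is
	((20 + 2) * 4 - 0 - 0) / (4 * 4 - 6) = 8,
      which the MAX further down clamps to be at least the runtime
      threshold computed earlier.  */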
3796 if (vec_outside_cost <= 0)
3797 min_profitable_estimate = 0;
3798 else
3800 min_profitable_estimate = ((vec_outside_cost + scalar_outside_cost)
3801 * assumed_vf
3802 - vec_inside_cost * peel_iters_prologue
3803 - vec_inside_cost * peel_iters_epilogue)
3804 / ((scalar_single_iter_cost * assumed_vf)
3805 - vec_inside_cost);
3807 min_profitable_estimate = MAX (min_profitable_estimate, min_profitable_iters);
3808 if (dump_enabled_p ())
3809 dump_printf_loc (MSG_NOTE, vect_location,
3810 " Static estimate profitability threshold = %d\n",
3811 min_profitable_estimate);
3813 *ret_min_profitable_estimate = min_profitable_estimate;
3816 /* Writes into SEL a mask for a vec_perm, equivalent to a vec_shr by OFFSET
3817 vector elements (not bits) for a vector with NELT elements. */
3818 static void
3819 calc_vec_perm_mask_for_shift (unsigned int offset, unsigned int nelt,
3820 vec_perm_builder *sel)
3822 /* The encoding is a single stepped pattern. Any wrap-around is handled
3823 by vec_perm_indices. */
3824 sel->new_vector (nelt, 1, 3);
3825 for (unsigned int i = 0; i < 3; i++)
3826 sel->quick_push (i + offset);
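/* For example, for OFFSET = 2 and NELT = 8 the encoded selector expands to
   { 2, 3, 4, 5, 6, 7, 8, 9 }.  When used as a two-input permutation with a
   zero vector as the second operand (as in the shift-based reduction
   scheme below), indices 8 and 9 pull in zeros, so the net effect is a
   whole-vector shift down by two elements.  */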
3829 /* Checks whether the target supports whole-vector shifts for vectors of mode
3830 MODE. This is the case if _either_ the platform handles vec_shr_optab, _or_
3831 it supports vec_perm_const with masks for all necessary shift amounts. */
3832 static bool
3833 have_whole_vector_shift (machine_mode mode)
3835 if (optab_handler (vec_shr_optab, mode) != CODE_FOR_nothing)
3836 return true;
3838 /* Variable-length vectors should be handled via the optab. */
3839 unsigned int nelt;
3840 if (!GET_MODE_NUNITS (mode).is_constant (&nelt))
3841 return false;
3843 vec_perm_builder sel;
3844 vec_perm_indices indices;
3845 for (unsigned int i = nelt / 2; i >= 1; i /= 2)
3847 calc_vec_perm_mask_for_shift (i, nelt, &sel);
3848 indices.new_vector (sel, 2, nelt);
3849 if (!can_vec_perm_const_p (mode, indices, false))
3850 return false;
3852 return true;
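  /* For example, for an 8-element vector mode the loop above checks shift
     amounts 4, 2 and 1 - exactly the offsets used by the shift-based
     reduction scheme in vect_create_epilog_for_reduction.  */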
3855 /* TODO: Close dependency between vect_model_*_cost and vectorizable_*
3856 functions. Design better to avoid maintenance issues. */
3858 /* Function vect_model_reduction_cost.
3860 Models cost for a reduction operation, including the vector ops
3861 generated within the strip-mine loop, the initial definition before
3862 the loop, and the epilogue code that must be generated. */
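/* As a rough illustration (assuming a plain sum reduction in a non-nested
   loop, with a direct internal reduction function available): the code
   below costs one scalar_to_vec statement in the prologue, NCOPIES
   vector_stmt operations in the loop body, and one vector_stmt plus one
   vec_to_scalar extract in the epilogue.  */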
3864 static void
3865 vect_model_reduction_cost (stmt_vec_info stmt_info, internal_fn reduc_fn,
3866 int ncopies, stmt_vector_for_cost *cost_vec)
3868 int prologue_cost = 0, epilogue_cost = 0, inside_cost;
3869 enum tree_code code;
3870 optab optab;
3871 tree vectype;
3872 gimple *orig_stmt;
3873 machine_mode mode;
3874 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
3875 struct loop *loop = NULL;
3877 if (loop_vinfo)
3878 loop = LOOP_VINFO_LOOP (loop_vinfo);
3880 /* Condition reductions generate two reductions in the loop. */
3881 vect_reduction_type reduction_type
3882 = STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info);
3883 if (reduction_type == COND_REDUCTION)
3884 ncopies *= 2;
3886 vectype = STMT_VINFO_VECTYPE (stmt_info);
3887 mode = TYPE_MODE (vectype);
3888 orig_stmt = STMT_VINFO_RELATED_STMT (stmt_info);
3890 if (!orig_stmt)
3891 orig_stmt = STMT_VINFO_STMT (stmt_info);
3893 code = gimple_assign_rhs_code (orig_stmt);
3895 if (reduction_type == EXTRACT_LAST_REDUCTION
3896 || reduction_type == FOLD_LEFT_REDUCTION)
3898 /* No extra instructions needed in the prologue. */
3899 prologue_cost = 0;
3901 if (reduction_type == EXTRACT_LAST_REDUCTION || reduc_fn != IFN_LAST)
3902 /* Count one reduction-like operation per vector. */
3903 inside_cost = record_stmt_cost (cost_vec, ncopies, vec_to_scalar,
3904 stmt_info, 0, vect_body);
3905 else
3907 /* Use NELEMENTS extracts and NELEMENTS scalar ops. */
3908 unsigned int nelements = ncopies * vect_nunits_for_cost (vectype);
3909 inside_cost = record_stmt_cost (cost_vec, nelements,
3910 vec_to_scalar, stmt_info, 0,
3911 vect_body);
3912 inside_cost += record_stmt_cost (cost_vec, nelements,
3913 scalar_stmt, stmt_info, 0,
3914 vect_body);
3917 else
3919 /* Add in cost for initial definition.
3920 For cond reduction we have four vectors: initial index, step,
3921 initial result of the data reduction, initial value of the index
3922 reduction. */
3923 int prologue_stmts = reduction_type == COND_REDUCTION ? 4 : 1;
3924 prologue_cost += record_stmt_cost (cost_vec, prologue_stmts,
3925 scalar_to_vec, stmt_info, 0,
3926 vect_prologue);
3928 /* Cost of reduction op inside loop. */
3929 inside_cost = record_stmt_cost (cost_vec, ncopies, vector_stmt,
3930 stmt_info, 0, vect_body);
3933 /* Determine cost of epilogue code.
3935 We have a reduction operator that will reduce the vector in one statement.
3936 Also requires scalar extract. */
3938 if (!loop || !nested_in_vect_loop_p (loop, orig_stmt))
3940 if (reduc_fn != IFN_LAST)
3942 if (reduction_type == COND_REDUCTION)
3944 	      /* An EQ stmt and a COND_EXPR stmt. */
3945 epilogue_cost += record_stmt_cost (cost_vec, 2,
3946 vector_stmt, stmt_info, 0,
3947 vect_epilogue);
3948 /* Reduction of the max index and a reduction of the found
3949 values. */
3950 epilogue_cost += record_stmt_cost (cost_vec, 2,
3951 vec_to_scalar, stmt_info, 0,
3952 vect_epilogue);
3953 /* A broadcast of the max value. */
3954 epilogue_cost += record_stmt_cost (cost_vec, 1,
3955 scalar_to_vec, stmt_info, 0,
3956 vect_epilogue);
3958 else
3960 epilogue_cost += record_stmt_cost (cost_vec, 1, vector_stmt,
3961 stmt_info, 0, vect_epilogue);
3962 epilogue_cost += record_stmt_cost (cost_vec, 1,
3963 vec_to_scalar, stmt_info, 0,
3964 vect_epilogue);
3967 else if (reduction_type == COND_REDUCTION)
3969 unsigned estimated_nunits = vect_nunits_for_cost (vectype);
3970 /* Extraction of scalar elements. */
3971 epilogue_cost += record_stmt_cost (cost_vec,
3972 2 * estimated_nunits,
3973 vec_to_scalar, stmt_info, 0,
3974 vect_epilogue);
3975 /* Scalar max reductions via COND_EXPR / MAX_EXPR. */
3976 epilogue_cost += record_stmt_cost (cost_vec,
3977 2 * estimated_nunits - 3,
3978 scalar_stmt, stmt_info, 0,
3979 vect_epilogue);
3981 else if (reduction_type == EXTRACT_LAST_REDUCTION
3982 || reduction_type == FOLD_LEFT_REDUCTION)
3983     /* No extra instructions needed in the epilogue. */
3985 else
3987 int vec_size_in_bits = tree_to_uhwi (TYPE_SIZE (vectype));
3988 tree bitsize =
3989 TYPE_SIZE (TREE_TYPE (gimple_assign_lhs (orig_stmt)));
3990 int element_bitsize = tree_to_uhwi (bitsize);
3991 int nelements = vec_size_in_bits / element_bitsize;
3993 if (code == COND_EXPR)
3994 code = MAX_EXPR;
3996 optab = optab_for_tree_code (code, vectype, optab_default);
3998 /* We have a whole vector shift available. */
3999 if (optab != unknown_optab
4000 && VECTOR_MODE_P (mode)
4001 && optab_handler (optab, mode) != CODE_FOR_nothing
4002 && have_whole_vector_shift (mode))
4004 /* Final reduction via vector shifts and the reduction operator.
4005 Also requires scalar extract. */
4006 epilogue_cost += record_stmt_cost (cost_vec,
4007 exact_log2 (nelements) * 2,
4008 vector_stmt, stmt_info, 0,
4009 vect_epilogue);
4010 epilogue_cost += record_stmt_cost (cost_vec, 1,
4011 vec_to_scalar, stmt_info, 0,
4012 vect_epilogue);
4014 else
4015 /* Use extracts and reduction op for final reduction. For N
4016 elements, we have N extracts and N-1 reduction ops. */
4017 epilogue_cost += record_stmt_cost (cost_vec,
4018 nelements + nelements - 1,
4019 vector_stmt, stmt_info, 0,
4020 vect_epilogue);
4024 if (dump_enabled_p ())
4025 dump_printf (MSG_NOTE,
4026 "vect_model_reduction_cost: inside_cost = %d, "
4027 "prologue_cost = %d, epilogue_cost = %d .\n", inside_cost,
4028 prologue_cost, epilogue_cost);
4032 /* Function vect_model_induction_cost.
4034 Models cost for induction operations. */
4036 static void
4037 vect_model_induction_cost (stmt_vec_info stmt_info, int ncopies,
4038 stmt_vector_for_cost *cost_vec)
4040 unsigned inside_cost, prologue_cost;
4042 if (PURE_SLP_STMT (stmt_info))
4043 return;
4045 /* loop cost for vec_loop. */
4046 inside_cost = record_stmt_cost (cost_vec, ncopies, vector_stmt,
4047 stmt_info, 0, vect_body);
4049 /* prologue cost for vec_init and vec_step. */
4050 prologue_cost = record_stmt_cost (cost_vec, 2, scalar_to_vec,
4051 stmt_info, 0, vect_prologue);
4053 if (dump_enabled_p ())
4054 dump_printf_loc (MSG_NOTE, vect_location,
4055 "vect_model_induction_cost: inside_cost = %d, "
4056 "prologue_cost = %d .\n", inside_cost, prologue_cost);
4061 /* Function get_initial_def_for_reduction
4063 Input:
4064 STMT - a stmt that performs a reduction operation in the loop.
4065 INIT_VAL - the initial value of the reduction variable
4067 Output:
4068 ADJUSTMENT_DEF - a tree that holds a value to be added to the final result
4069 of the reduction (used for adjusting the epilog - see below).
4070 Return a vector variable, initialized according to the operation that STMT
4071 performs. This vector will be used as the initial value of the
4072 vector of partial results.
4074 Option1 (adjust in epilog): Initialize the vector as follows:
4075 add/bit or/xor: [0,0,...,0,0]
4076 mult/bit and: [1,1,...,1,1]
4077 min/max/cond_expr: [init_val,init_val,..,init_val,init_val]
4078 and when necessary (e.g. add/mult case) let the caller know
4079 that it needs to adjust the result by init_val.
4081 Option2: Initialize the vector as follows:
4082 add/bit or/xor: [init_val,0,0,...,0]
4083 mult/bit and: [init_val,1,1,...,1]
4084 min/max/cond_expr: [init_val,init_val,...,init_val]
4085 and no adjustments are needed.
4087 For example, for the following code:
4089 s = init_val;
4090 for (i=0;i<n;i++)
4091 s = s + a[i];
4093 STMT is 's = s + a[i]', and the reduction variable is 's'.
4094 For a vector of 4 units, we want to return either [0,0,0,init_val],
4095 or [0,0,0,0] and let the caller know that it needs to adjust
4096 the result at the end by 'init_val'.
4098    FORNOW, we use the 'adjust in epilog' scheme (Option1) when ADJUSTMENT_DEF
4099    is not NULL, because this way the initialization vector is simpler (same
4100    element in all entries), and Option2 otherwise.
4102 A cost model should help decide between these two schemes. */
4104 tree
4105 get_initial_def_for_reduction (gimple *stmt, tree init_val,
4106 tree *adjustment_def)
4108 stmt_vec_info stmt_vinfo = vinfo_for_stmt (stmt);
4109 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_vinfo);
4110 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
4111 tree scalar_type = TREE_TYPE (init_val);
4112 tree vectype = get_vectype_for_scalar_type (scalar_type);
4113 enum tree_code code = gimple_assign_rhs_code (stmt);
4114 tree def_for_init;
4115 tree init_def;
4116 REAL_VALUE_TYPE real_init_val = dconst0;
4117 int int_init_val = 0;
4118 gimple_seq stmts = NULL;
4120 gcc_assert (vectype);
4122 gcc_assert (POINTER_TYPE_P (scalar_type) || INTEGRAL_TYPE_P (scalar_type)
4123 || SCALAR_FLOAT_TYPE_P (scalar_type));
4125 gcc_assert (nested_in_vect_loop_p (loop, stmt)
4126 || loop == (gimple_bb (stmt))->loop_father);
4128 vect_reduction_type reduction_type
4129 = STMT_VINFO_VEC_REDUCTION_TYPE (stmt_vinfo);
4131 switch (code)
4133 case WIDEN_SUM_EXPR:
4134 case DOT_PROD_EXPR:
4135 case SAD_EXPR:
4136 case PLUS_EXPR:
4137 case MINUS_EXPR:
4138 case BIT_IOR_EXPR:
4139 case BIT_XOR_EXPR:
4140 case MULT_EXPR:
4141 case BIT_AND_EXPR:
4143 /* ADJUSTMENT_DEF is NULL when called from
4144 vect_create_epilog_for_reduction to vectorize double reduction. */
4145 if (adjustment_def)
4146 *adjustment_def = init_val;
4148 if (code == MULT_EXPR)
4150 real_init_val = dconst1;
4151 int_init_val = 1;
4154 if (code == BIT_AND_EXPR)
4155 int_init_val = -1;
4157 if (SCALAR_FLOAT_TYPE_P (scalar_type))
4158 def_for_init = build_real (scalar_type, real_init_val);
4159 else
4160 def_for_init = build_int_cst (scalar_type, int_init_val);
4162 if (adjustment_def)
4163 /* Option1: the first element is '0' or '1' as well. */
4164 init_def = gimple_build_vector_from_val (&stmts, vectype,
4165 def_for_init);
4166 else if (!TYPE_VECTOR_SUBPARTS (vectype).is_constant ())
4168 /* Option2 (variable length): the first element is INIT_VAL. */
4169 init_def = gimple_build_vector_from_val (&stmts, vectype,
4170 def_for_init);
4171 init_def = gimple_build (&stmts, CFN_VEC_SHL_INSERT,
4172 vectype, init_def, init_val);
4174 else
4176 /* Option2: the first element is INIT_VAL. */
4177 tree_vector_builder elts (vectype, 1, 2);
4178 elts.quick_push (init_val);
4179 elts.quick_push (def_for_init);
4180 init_def = gimple_build_vector (&stmts, &elts);
4183 break;
4185 case MIN_EXPR:
4186 case MAX_EXPR:
4187 case COND_EXPR:
4189 if (adjustment_def)
4191 *adjustment_def = NULL_TREE;
4192 if (reduction_type != COND_REDUCTION
4193 && reduction_type != EXTRACT_LAST_REDUCTION)
4195 init_def = vect_get_vec_def_for_operand (init_val, stmt);
4196 break;
4199 init_val = gimple_convert (&stmts, TREE_TYPE (vectype), init_val);
4200 init_def = gimple_build_vector_from_val (&stmts, vectype, init_val);
4202 break;
4204 default:
4205 gcc_unreachable ();
4208 if (stmts)
4209 gsi_insert_seq_on_edge_immediate (loop_preheader_edge (loop), stmts);
4210 return init_def;
4213 /* Get at the initial defs for the reduction PHIs in SLP_NODE.
4214 NUMBER_OF_VECTORS is the number of vector defs to create.
4215 If NEUTRAL_OP is nonnull, introducing extra elements of that
4216 value will not change the result. */
4218 static void
4219 get_initial_defs_for_reduction (slp_tree slp_node,
4220 vec<tree> *vec_oprnds,
4221 unsigned int number_of_vectors,
4222 bool reduc_chain, tree neutral_op)
4224 vec<gimple *> stmts = SLP_TREE_SCALAR_STMTS (slp_node);
4225 gimple *stmt = stmts[0];
4226 stmt_vec_info stmt_vinfo = vinfo_for_stmt (stmt);
4227 unsigned HOST_WIDE_INT nunits;
4228 unsigned j, number_of_places_left_in_vector;
4229 tree vector_type;
4230 tree vop;
4231 int group_size = stmts.length ();
4232 unsigned int vec_num, i;
4233 unsigned number_of_copies = 1;
4234 vec<tree> voprnds;
4235 voprnds.create (number_of_vectors);
4236 struct loop *loop;
4237 auto_vec<tree, 16> permute_results;
4239 vector_type = STMT_VINFO_VECTYPE (stmt_vinfo);
4241 gcc_assert (STMT_VINFO_DEF_TYPE (stmt_vinfo) == vect_reduction_def);
4243 loop = (gimple_bb (stmt))->loop_father;
4244 gcc_assert (loop);
4245 edge pe = loop_preheader_edge (loop);
4247 gcc_assert (!reduc_chain || neutral_op);
4249 /* NUMBER_OF_COPIES is the number of times we need to use the same values in
4250 created vectors. It is greater than 1 if unrolling is performed.
4252 For example, we have two scalar operands, s1 and s2 (e.g., group of
4253 strided accesses of size two), while NUNITS is four (i.e., four scalars
4254 of this type can be packed in a vector). The output vector will contain
4255 two copies of each scalar operand: {s1, s2, s1, s2}. (NUMBER_OF_COPIES
4256 will be 2).
4258 If REDUC_GROUP_SIZE > NUNITS, the scalars will be split into several
4259 vectors containing the operands.
4261 For example, NUNITS is four as before, and the group size is 8
4262 (s1, s2, ..., s8). We will create two vectors {s1, s2, s3, s4} and
4263 {s5, s6, s7, s8}. */
4265 if (!TYPE_VECTOR_SUBPARTS (vector_type).is_constant (&nunits))
4266 nunits = group_size;
4268 number_of_copies = nunits * number_of_vectors / group_size;
4270 number_of_places_left_in_vector = nunits;
4271 bool constant_p = true;
4272 tree_vector_builder elts (vector_type, nunits, 1);
4273 elts.quick_grow (nunits);
4274 for (j = 0; j < number_of_copies; j++)
4276 for (i = group_size - 1; stmts.iterate (i, &stmt); i--)
4278 tree op;
4279 /* Get the def before the loop. In reduction chain we have only
4280 one initial value. */
4281 if ((j != (number_of_copies - 1)
4282 || (reduc_chain && i != 0))
4283 && neutral_op)
4284 op = neutral_op;
4285 else
4286 op = PHI_ARG_DEF_FROM_EDGE (stmt, pe);
4288 /* Create 'vect_ = {op0,op1,...,opn}'. */
4289 number_of_places_left_in_vector--;
4290 elts[number_of_places_left_in_vector] = op;
4291 if (!CONSTANT_CLASS_P (op))
4292 constant_p = false;
4294 if (number_of_places_left_in_vector == 0)
4296 gimple_seq ctor_seq = NULL;
4297 tree init;
4298 if (constant_p && !neutral_op
4299 ? multiple_p (TYPE_VECTOR_SUBPARTS (vector_type), nunits)
4300 : known_eq (TYPE_VECTOR_SUBPARTS (vector_type), nunits))
4301 /* Build the vector directly from ELTS. */
4302 init = gimple_build_vector (&ctor_seq, &elts);
4303 else if (neutral_op)
4305 /* Build a vector of the neutral value and shift the
4306 other elements into place. */
4307 init = gimple_build_vector_from_val (&ctor_seq, vector_type,
4308 neutral_op);
4309 int k = nunits;
4310 while (k > 0 && elts[k - 1] == neutral_op)
4311 k -= 1;
4312 while (k > 0)
4314 k -= 1;
4315 init = gimple_build (&ctor_seq, CFN_VEC_SHL_INSERT,
4316 vector_type, init, elts[k]);
4319 else
4321 /* First time round, duplicate ELTS to fill the
4322 required number of vectors, then cherry pick the
4323 appropriate result for each iteration. */
4324 if (vec_oprnds->is_empty ())
4325 duplicate_and_interleave (&ctor_seq, vector_type, elts,
4326 number_of_vectors,
4327 permute_results);
4328 init = permute_results[number_of_vectors - j - 1];
4330 if (ctor_seq != NULL)
4331 gsi_insert_seq_on_edge_immediate (pe, ctor_seq);
4332 voprnds.quick_push (init);
4334 number_of_places_left_in_vector = nunits;
4335 elts.new_vector (vector_type, nunits, 1);
4336 elts.quick_grow (nunits);
4337 constant_p = true;
4342 /* Since the vectors are created in the reverse order, we should invert
4343 them. */
4344 vec_num = voprnds.length ();
4345 for (j = vec_num; j != 0; j--)
4347 vop = voprnds[j - 1];
4348 vec_oprnds->quick_push (vop);
4351 voprnds.release ();
4353 /* In case that VF is greater than the unrolling factor needed for the SLP
4354 group of stmts, NUMBER_OF_VECTORS to be created is greater than
4355 NUMBER_OF_SCALARS/NUNITS or NUNITS/NUMBER_OF_SCALARS, and hence we have
4356 to replicate the vectors. */
4357 tree neutral_vec = NULL;
4358 while (number_of_vectors > vec_oprnds->length ())
4360 if (neutral_op)
4362 if (!neutral_vec)
4364 gimple_seq ctor_seq = NULL;
4365 neutral_vec = gimple_build_vector_from_val
4366 (&ctor_seq, vector_type, neutral_op);
4367 if (ctor_seq != NULL)
4368 gsi_insert_seq_on_edge_immediate (pe, ctor_seq);
4370 vec_oprnds->quick_push (neutral_vec);
4372 else
4374 for (i = 0; vec_oprnds->iterate (i, &vop) && i < vec_num; i++)
4375 vec_oprnds->quick_push (vop);
4381 /* Function vect_create_epilog_for_reduction
4383 Create code at the loop-epilog to finalize the result of a reduction
4384 computation.
4386 VECT_DEFS is list of vector of partial results, i.e., the lhs's of vector
4387 reduction statements.
4388 STMT is the scalar reduction stmt that is being vectorized.
4389 NCOPIES is > 1 in case the vectorization factor (VF) is bigger than the
4390 number of elements that we can fit in a vectype (nunits). In this case
4391    we have to generate more than one vector stmt - i.e., we need to "unroll"
4392 the vector stmt by a factor VF/nunits. For more details see documentation
4393 in vectorizable_operation.
4394 REDUC_FN is the internal function for the epilog reduction.
4395 REDUCTION_PHIS is a list of the phi-nodes that carry the reduction
4396 computation.
4397 REDUC_INDEX is the index of the operand in the right hand side of the
4398 statement that is defined by REDUCTION_PHI.
4399 DOUBLE_REDUC is TRUE if double reduction phi nodes should be handled.
4400 SLP_NODE is an SLP node containing a group of reduction statements. The
4401 first one in this group is STMT.
4402 INDUC_VAL is for INTEGER_INDUC_COND_REDUCTION the value to use for the case
4403 when the COND_EXPR is never true in the loop. For MAX_EXPR, it needs to
4404 be smaller than any value of the IV in the loop, for MIN_EXPR larger than
4405 any value of the IV in the loop.
4406 INDUC_CODE is the code for epilog reduction if INTEGER_INDUC_COND_REDUCTION.
4407 NEUTRAL_OP is the value given by neutral_op_for_slp_reduction; it is
4408    null if this is not an SLP reduction.
4410 This function:
4411 1. Creates the reduction def-use cycles: sets the arguments for
4412 REDUCTION_PHIS:
4413 The loop-entry argument is the vectorized initial-value of the reduction.
4414 The loop-latch argument is taken from VECT_DEFS - the vector of partial
4415 sums.
4416 2. "Reduces" each vector of partial results VECT_DEFS into a single result,
4417 by calling the function specified by REDUC_FN if available, or by
4418 other means (whole-vector shifts or a scalar loop).
4419 The function also creates a new phi node at the loop exit to preserve
4420 loop-closed form, as illustrated below.
4422 The flow at the entry to this function:
4424 loop:
4425 vec_def = phi <null, null> # REDUCTION_PHI
4426 VECT_DEF = vector_stmt # vectorized form of STMT
4427 s_loop = scalar_stmt # (scalar) STMT
4428 loop_exit:
4429 s_out0 = phi <s_loop> # (scalar) EXIT_PHI
4430 use <s_out0>
4431 use <s_out0>
4433 The above is transformed by this function into:
4435 loop:
4436 vec_def = phi <vec_init, VECT_DEF> # REDUCTION_PHI
4437 VECT_DEF = vector_stmt # vectorized form of STMT
4438 s_loop = scalar_stmt # (scalar) STMT
4439 loop_exit:
4440 s_out0 = phi <s_loop> # (scalar) EXIT_PHI
4441 v_out1 = phi <VECT_DEF> # NEW_EXIT_PHI
4442 v_out2 = reduce <v_out1>
4443 s_out3 = extract_field <v_out2, 0>
4444 s_out4 = adjust_result <s_out3>
4445 use <s_out4>
4446 use <s_out4>
4449 static void
4450 vect_create_epilog_for_reduction (vec<tree> vect_defs, gimple *stmt,
4451 gimple *reduc_def_stmt,
4452 int ncopies, internal_fn reduc_fn,
4453 vec<gimple *> reduction_phis,
4454 bool double_reduc,
4455 slp_tree slp_node,
4456 slp_instance slp_node_instance,
4457 tree induc_val, enum tree_code induc_code,
4458 tree neutral_op)
4460 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
4461 stmt_vec_info prev_phi_info;
4462 tree vectype;
4463 machine_mode mode;
4464 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
4465 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo), *outer_loop = NULL;
4466 basic_block exit_bb;
4467 tree scalar_dest;
4468 tree scalar_type;
4469 gimple *new_phi = NULL, *phi;
4470 gimple_stmt_iterator exit_gsi;
4471 tree vec_dest;
4472 tree new_temp = NULL_TREE, new_dest, new_name, new_scalar_dest;
4473 gimple *epilog_stmt = NULL;
4474 enum tree_code code = gimple_assign_rhs_code (stmt);
4475 gimple *exit_phi;
4476 tree bitsize;
4477 tree adjustment_def = NULL;
4478 tree vec_initial_def = NULL;
4479 tree expr, def, initial_def = NULL;
4480 tree orig_name, scalar_result;
4481 imm_use_iterator imm_iter, phi_imm_iter;
4482 use_operand_p use_p, phi_use_p;
4483 gimple *use_stmt, *orig_stmt, *reduction_phi = NULL;
4484 bool nested_in_vect_loop = false;
4485 auto_vec<gimple *> new_phis;
4486 auto_vec<gimple *> inner_phis;
4487 enum vect_def_type dt = vect_unknown_def_type;
4488 int j, i;
4489 auto_vec<tree> scalar_results;
4490 unsigned int group_size = 1, k, ratio;
4491 auto_vec<tree> vec_initial_defs;
4492 auto_vec<gimple *> phis;
4493 bool slp_reduc = false;
4494 bool direct_slp_reduc;
4495 tree new_phi_result;
4496 gimple *inner_phi = NULL;
4497 tree induction_index = NULL_TREE;
4499 if (slp_node)
4500 group_size = SLP_TREE_SCALAR_STMTS (slp_node).length ();
4502 if (nested_in_vect_loop_p (loop, stmt))
4504 outer_loop = loop;
4505 loop = loop->inner;
4506 nested_in_vect_loop = true;
4507 gcc_assert (!slp_node);
4510 vectype = STMT_VINFO_VECTYPE (stmt_info);
4511 gcc_assert (vectype);
4512 mode = TYPE_MODE (vectype);
4514 /* 1. Create the reduction def-use cycle:
4515 Set the arguments of REDUCTION_PHIS, i.e., transform
4517 loop:
4518 vec_def = phi <null, null> # REDUCTION_PHI
4519 VECT_DEF = vector_stmt # vectorized form of STMT
4522 into:
4524 loop:
4525 vec_def = phi <vec_init, VECT_DEF> # REDUCTION_PHI
4526 VECT_DEF = vector_stmt # vectorized form of STMT
4529 (in case of SLP, do it for all the phis). */
4531 /* Get the loop-entry arguments. */
4532 enum vect_def_type initial_def_dt = vect_unknown_def_type;
4533 if (slp_node)
4535 unsigned vec_num = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node);
4536 vec_initial_defs.reserve (vec_num);
4537 get_initial_defs_for_reduction (slp_node_instance->reduc_phis,
4538 &vec_initial_defs, vec_num,
4539 REDUC_GROUP_FIRST_ELEMENT (stmt_info),
4540 neutral_op);
4542 else
4544 /* Get at the scalar def before the loop, that defines the initial value
4545 of the reduction variable. */
4546 initial_def = PHI_ARG_DEF_FROM_EDGE (reduc_def_stmt,
4547 loop_preheader_edge (loop));
4548 /* Optimize: if initial_def is for REDUC_MAX smaller than the base
4549 and we can't use zero for induc_val, use initial_def. Similarly
4550 for REDUC_MIN and initial_def larger than the base. */
4551 if (TREE_CODE (initial_def) == INTEGER_CST
4552 && (STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info)
4553 == INTEGER_INDUC_COND_REDUCTION)
4554 && !integer_zerop (induc_val)
4555 && ((induc_code == MAX_EXPR
4556 && tree_int_cst_lt (initial_def, induc_val))
4557 || (induc_code == MIN_EXPR
4558 && tree_int_cst_lt (induc_val, initial_def))))
4559 induc_val = initial_def;
4561 if (double_reduc)
4562 /* In case of double reduction we only create a vector variable
4563 to be put in the reduction phi node. The actual statement
4564 creation is done later in this function. */
4565 vec_initial_def = vect_create_destination_var (initial_def, vectype);
4566 else if (nested_in_vect_loop)
4568 /* Do not use an adjustment def as that case is not supported
4569 correctly if ncopies is not one. */
4570 vect_is_simple_use (initial_def, loop_vinfo, &initial_def_dt);
4571 vec_initial_def = vect_get_vec_def_for_operand (initial_def, stmt);
4573 else
4574 vec_initial_def = get_initial_def_for_reduction (stmt, initial_def,
4575 &adjustment_def);
4576 vec_initial_defs.create (1);
4577 vec_initial_defs.quick_push (vec_initial_def);
4580 /* Set phi nodes arguments. */
4581 FOR_EACH_VEC_ELT (reduction_phis, i, phi)
4583 tree vec_init_def = vec_initial_defs[i];
4584 tree def = vect_defs[i];
4585 for (j = 0; j < ncopies; j++)
4587 if (j != 0)
4589 phi = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (phi));
4590 if (nested_in_vect_loop)
4591 vec_init_def
4592 = vect_get_vec_def_for_stmt_copy (initial_def_dt,
4593 vec_init_def);
4596 /* Set the loop-entry arg of the reduction-phi. */
4598 if (STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info)
4599 == INTEGER_INDUC_COND_REDUCTION)
4601 	      /* Initialise the reduction phi to zero. This prevents non-zero
4602 		 initial values from interfering with the reduction op. */
4603 gcc_assert (ncopies == 1);
4604 gcc_assert (i == 0);
4606 tree vec_init_def_type = TREE_TYPE (vec_init_def);
4607 tree induc_val_vec
4608 = build_vector_from_val (vec_init_def_type, induc_val);
4610 add_phi_arg (as_a <gphi *> (phi), induc_val_vec,
4611 loop_preheader_edge (loop), UNKNOWN_LOCATION);
4613 else
4614 add_phi_arg (as_a <gphi *> (phi), vec_init_def,
4615 loop_preheader_edge (loop), UNKNOWN_LOCATION);
4617 /* Set the loop-latch arg for the reduction-phi. */
4618 if (j > 0)
4619 def = vect_get_vec_def_for_stmt_copy (vect_unknown_def_type, def);
4621 add_phi_arg (as_a <gphi *> (phi), def, loop_latch_edge (loop),
4622 UNKNOWN_LOCATION);
4624 if (dump_enabled_p ())
4626 dump_printf_loc (MSG_NOTE, vect_location,
4627 "transform reduction: created def-use cycle: ");
4628 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, phi, 0);
4629 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, SSA_NAME_DEF_STMT (def), 0);
4634 /* For cond reductions we want to create a new vector (INDEX_COND_EXPR)
4635 which is updated with the current index of the loop for every match of
4636 the original loop's cond_expr (VEC_STMT). This results in a vector
4637 containing the last time the condition passed for that vector lane.
4638 The first match will be a 1 to allow 0 to be used for non-matching
4639 indexes. If there are no matches at all then the vector will be all
4640 zeroes. */
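  /* For instance, with four lanes and two vector iterations the index
     vector starts as {1, 2, 3, 4} and is stepped by 4 each iteration: a
     lane whose condition last matched in the second iteration ends up with
     its value from {5, 6, 7, 8}, a lane that only matched in the first
     iteration keeps its value from {1, 2, 3, 4}, and a lane that never
     matched stays 0.  */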
4641 if (STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info) == COND_REDUCTION)
4643 tree indx_before_incr, indx_after_incr;
4644 poly_uint64 nunits_out = TYPE_VECTOR_SUBPARTS (vectype);
4646 gimple *vec_stmt = STMT_VINFO_VEC_STMT (stmt_info);
4647 gcc_assert (gimple_assign_rhs_code (vec_stmt) == VEC_COND_EXPR);
4649 int scalar_precision
4650 = GET_MODE_PRECISION (SCALAR_TYPE_MODE (TREE_TYPE (vectype)));
4651 tree cr_index_scalar_type = make_unsigned_type (scalar_precision);
4652 tree cr_index_vector_type = build_vector_type
4653 (cr_index_scalar_type, TYPE_VECTOR_SUBPARTS (vectype));
4655 /* First we create a simple vector induction variable which starts
4656 with the values {1,2,3,...} (SERIES_VECT) and increments by the
4657 vector size (STEP). */
4659 /* Create a {1,2,3,...} vector. */
4660 tree series_vect = build_index_vector (cr_index_vector_type, 1, 1);
4662 /* Create a vector of the step value. */
4663 tree step = build_int_cst (cr_index_scalar_type, nunits_out);
4664 tree vec_step = build_vector_from_val (cr_index_vector_type, step);
4666 /* Create an induction variable. */
4667 gimple_stmt_iterator incr_gsi;
4668 bool insert_after;
4669 standard_iv_increment_position (loop, &incr_gsi, &insert_after);
4670 create_iv (series_vect, vec_step, NULL_TREE, loop, &incr_gsi,
4671 insert_after, &indx_before_incr, &indx_after_incr);
4673 /* Next create a new phi node vector (NEW_PHI_TREE) which starts
4674 filled with zeros (VEC_ZERO). */
4676 /* Create a vector of 0s. */
4677 tree zero = build_zero_cst (cr_index_scalar_type);
4678 tree vec_zero = build_vector_from_val (cr_index_vector_type, zero);
4680 /* Create a vector phi node. */
4681 tree new_phi_tree = make_ssa_name (cr_index_vector_type);
4682 new_phi = create_phi_node (new_phi_tree, loop->header);
4683 set_vinfo_for_stmt (new_phi,
4684 new_stmt_vec_info (new_phi, loop_vinfo));
4685 add_phi_arg (as_a <gphi *> (new_phi), vec_zero,
4686 loop_preheader_edge (loop), UNKNOWN_LOCATION);
4688     /* Now take the condition from the loop's original cond_expr
4689 (VEC_STMT) and produce a new cond_expr (INDEX_COND_EXPR) which for
4690 every match uses values from the induction variable
4691 (INDEX_BEFORE_INCR) otherwise uses values from the phi node
4692 (NEW_PHI_TREE).
4693 Finally, we update the phi (NEW_PHI_TREE) to take the value of
4694 the new cond_expr (INDEX_COND_EXPR). */
4696 /* Duplicate the condition from vec_stmt. */
4697 tree ccompare = unshare_expr (gimple_assign_rhs1 (vec_stmt));
4699 /* Create a conditional, where the condition is taken from vec_stmt
4700 (CCOMPARE), then is the induction index (INDEX_BEFORE_INCR) and
4701 else is the phi (NEW_PHI_TREE). */
4702 tree index_cond_expr = build3 (VEC_COND_EXPR, cr_index_vector_type,
4703 ccompare, indx_before_incr,
4704 new_phi_tree);
4705 induction_index = make_ssa_name (cr_index_vector_type);
4706 gimple *index_condition = gimple_build_assign (induction_index,
4707 index_cond_expr);
4708 gsi_insert_before (&incr_gsi, index_condition, GSI_SAME_STMT);
4709 stmt_vec_info index_vec_info = new_stmt_vec_info (index_condition,
4710 loop_vinfo);
4711 STMT_VINFO_VECTYPE (index_vec_info) = cr_index_vector_type;
4712 set_vinfo_for_stmt (index_condition, index_vec_info);
4714 /* Update the phi with the vec cond. */
4715 add_phi_arg (as_a <gphi *> (new_phi), induction_index,
4716 loop_latch_edge (loop), UNKNOWN_LOCATION);
4719 /* 2. Create epilog code.
4720 The reduction epilog code operates across the elements of the vector
4721 of partial results computed by the vectorized loop.
4722 The reduction epilog code consists of:
4724 step 1: compute the scalar result in a vector (v_out2)
4725 step 2: extract the scalar result (s_out3) from the vector (v_out2)
4726 step 3: adjust the scalar result (s_out3) if needed.
4728         Step 1 can be accomplished using one of the following three schemes:
4729 (scheme 1) using reduc_fn, if available.
4730 (scheme 2) using whole-vector shifts, if available.
4731 (scheme 3) using a scalar loop. In this case steps 1+2 above are
4732 combined.
4734 The overall epilog code looks like this:
4736 s_out0 = phi <s_loop> # original EXIT_PHI
4737 v_out1 = phi <VECT_DEF> # NEW_EXIT_PHI
4738 v_out2 = reduce <v_out1> # step 1
4739 s_out3 = extract_field <v_out2, 0> # step 2
4740 s_out4 = adjust_result <s_out3> # step 3
4742 (step 3 is optional, and steps 1 and 2 may be combined).
4743 Lastly, the uses of s_out0 are replaced by s_out4. */
4746 /* 2.1 Create new loop-exit-phis to preserve loop-closed form:
4747 v_out1 = phi <VECT_DEF>
4748 Store them in NEW_PHIS. */
4750 exit_bb = single_exit (loop)->dest;
4751 prev_phi_info = NULL;
4752 new_phis.create (vect_defs.length ());
4753 FOR_EACH_VEC_ELT (vect_defs, i, def)
4755 for (j = 0; j < ncopies; j++)
4757 tree new_def = copy_ssa_name (def);
4758 phi = create_phi_node (new_def, exit_bb);
4759 set_vinfo_for_stmt (phi, new_stmt_vec_info (phi, loop_vinfo));
4760 if (j == 0)
4761 new_phis.quick_push (phi);
4762 else
4764 def = vect_get_vec_def_for_stmt_copy (dt, def);
4765 STMT_VINFO_RELATED_STMT (prev_phi_info) = phi;
4768 SET_PHI_ARG_DEF (phi, single_exit (loop)->dest_idx, def);
4769 prev_phi_info = vinfo_for_stmt (phi);
4773 /* The epilogue is created for the outer-loop, i.e., for the loop being
4774 vectorized. Create exit phis for the outer loop. */
4775 if (double_reduc)
4777 loop = outer_loop;
4778 exit_bb = single_exit (loop)->dest;
4779 inner_phis.create (vect_defs.length ());
4780 FOR_EACH_VEC_ELT (new_phis, i, phi)
4782 tree new_result = copy_ssa_name (PHI_RESULT (phi));
4783 gphi *outer_phi = create_phi_node (new_result, exit_bb);
4784 SET_PHI_ARG_DEF (outer_phi, single_exit (loop)->dest_idx,
4785 PHI_RESULT (phi));
4786 set_vinfo_for_stmt (outer_phi, new_stmt_vec_info (outer_phi,
4787 loop_vinfo));
4788 inner_phis.quick_push (phi);
4789 new_phis[i] = outer_phi;
4790 prev_phi_info = vinfo_for_stmt (outer_phi);
4791 while (STMT_VINFO_RELATED_STMT (vinfo_for_stmt (phi)))
4793 phi = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (phi));
4794 new_result = copy_ssa_name (PHI_RESULT (phi));
4795 outer_phi = create_phi_node (new_result, exit_bb);
4796 SET_PHI_ARG_DEF (outer_phi, single_exit (loop)->dest_idx,
4797 PHI_RESULT (phi));
4798 set_vinfo_for_stmt (outer_phi, new_stmt_vec_info (outer_phi,
4799 loop_vinfo));
4800 STMT_VINFO_RELATED_STMT (prev_phi_info) = outer_phi;
4801 prev_phi_info = vinfo_for_stmt (outer_phi);
4806 exit_gsi = gsi_after_labels (exit_bb);
4808 /* 2.2 Get the relevant tree-code to use in the epilog for schemes 2,3
4809 (i.e. when reduc_fn is not available) and in the final adjustment
4810 code (if needed). Also get the original scalar reduction variable as
4811 defined in the loop. In case STMT is a "pattern-stmt" (i.e. - it
4812 represents a reduction pattern), the tree-code and scalar-def are
4813 taken from the original stmt that the pattern-stmt (STMT) replaces.
4814 Otherwise (it is a regular reduction) - the tree-code and scalar-def
4815 are taken from STMT. */
4817 orig_stmt = STMT_VINFO_RELATED_STMT (stmt_info);
4818 if (!orig_stmt)
4820 /* Regular reduction */
4821 orig_stmt = stmt;
4823 else
4825 /* Reduction pattern */
4826 stmt_vec_info stmt_vinfo = vinfo_for_stmt (orig_stmt);
4827 gcc_assert (STMT_VINFO_IN_PATTERN_P (stmt_vinfo));
4828 gcc_assert (STMT_VINFO_RELATED_STMT (stmt_vinfo) == stmt);
4831 code = gimple_assign_rhs_code (orig_stmt);
4832 /* For MINUS_EXPR the initial vector is [init_val,0,...,0], therefore,
4833 partial results are added and not subtracted. */
4834 if (code == MINUS_EXPR)
4835 code = PLUS_EXPR;
4837 scalar_dest = gimple_assign_lhs (orig_stmt);
4838 scalar_type = TREE_TYPE (scalar_dest);
4839 scalar_results.create (group_size);
4840 new_scalar_dest = vect_create_destination_var (scalar_dest, NULL);
4841 bitsize = TYPE_SIZE (scalar_type);
4843 /* In case this is a reduction in an inner-loop while vectorizing an outer
4844 loop - we don't need to extract a single scalar result at the end of the
4845      inner-loop (unless it is a double reduction, i.e., the use of the reduction is
4846 outside the outer-loop). The final vector of partial results will be used
4847 in the vectorized outer-loop, or reduced to a scalar result at the end of
4848 the outer-loop. */
4849 if (nested_in_vect_loop && !double_reduc)
4850 goto vect_finalize_reduction;
4852 /* SLP reduction without reduction chain, e.g.,
4853 # a1 = phi <a2, a0>
4854 # b1 = phi <b2, b0>
4855 a2 = operation (a1)
4856 b2 = operation (b1) */
4857 slp_reduc = (slp_node && !REDUC_GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)));
4859 /* True if we should implement SLP_REDUC using native reduction operations
4860 instead of scalar operations. */
4861 direct_slp_reduc = (reduc_fn != IFN_LAST
4862 && slp_reduc
4863 && !TYPE_VECTOR_SUBPARTS (vectype).is_constant ());
4865 /* In case of reduction chain, e.g.,
4866 # a1 = phi <a3, a0>
4867 a2 = operation (a1)
4868 a3 = operation (a2),
4870 we may end up with more than one vector result. Here we reduce them to
4871 one vector. */
4872 if (REDUC_GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)) || direct_slp_reduc)
4874 tree first_vect = PHI_RESULT (new_phis[0]);
4875 gassign *new_vec_stmt = NULL;
4876 vec_dest = vect_create_destination_var (scalar_dest, vectype);
4877 for (k = 1; k < new_phis.length (); k++)
4879 gimple *next_phi = new_phis[k];
4880 tree second_vect = PHI_RESULT (next_phi);
4881 tree tem = make_ssa_name (vec_dest, new_vec_stmt);
4882 new_vec_stmt = gimple_build_assign (tem, code,
4883 first_vect, second_vect);
4884 gsi_insert_before (&exit_gsi, new_vec_stmt, GSI_SAME_STMT);
4885 first_vect = tem;
4888 new_phi_result = first_vect;
4889 if (new_vec_stmt)
4891 new_phis.truncate (0);
4892 new_phis.safe_push (new_vec_stmt);
4895   /* Likewise if we couldn't use a single def-use cycle. */
4896 else if (ncopies > 1)
4898 gcc_assert (new_phis.length () == 1);
4899 tree first_vect = PHI_RESULT (new_phis[0]);
4900 gassign *new_vec_stmt = NULL;
4901 vec_dest = vect_create_destination_var (scalar_dest, vectype);
4902 gimple *next_phi = new_phis[0];
4903 for (int k = 1; k < ncopies; ++k)
4905 next_phi = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (next_phi));
4906 tree second_vect = PHI_RESULT (next_phi);
4907 tree tem = make_ssa_name (vec_dest, new_vec_stmt);
4908 new_vec_stmt = gimple_build_assign (tem, code,
4909 first_vect, second_vect);
4910 gsi_insert_before (&exit_gsi, new_vec_stmt, GSI_SAME_STMT);
4911 first_vect = tem;
4913 new_phi_result = first_vect;
4914 new_phis.truncate (0);
4915 new_phis.safe_push (new_vec_stmt);
4917 else
4918 new_phi_result = PHI_RESULT (new_phis[0]);
4920 if (STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info) == COND_REDUCTION
4921 && reduc_fn != IFN_LAST)
4923 /* For condition reductions, we have a vector (NEW_PHI_RESULT) containing
4924 various data values where the condition matched and another vector
4925 (INDUCTION_INDEX) containing all the indexes of those matches. We
4926 need to extract the last matching index (which will be the index with
4927 highest value) and use this to index into the data vector.
4928 For the case where there were no matches, the data vector will contain
4929 all default values and the index vector will be all zeros. */
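      /* Small illustration with hypothetical values: for
	 INDUCTION_INDEX = {0, 6, 0, 3} and NEW_PHI_RESULT = {d0, d1, d2, d3},
	 the REDUC_MAX below yields 6, the comparison selects lane 1, the
	 VEC_COND keeps d1 and zeroes the other lanes, and the final
	 REDUC_MAX over the unsigned-punned vector extracts d1 as the
	 scalar result.  */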
4931 /* Get various versions of the type of the vector of indexes. */
4932 tree index_vec_type = TREE_TYPE (induction_index);
4933 gcc_checking_assert (TYPE_UNSIGNED (index_vec_type));
4934 tree index_scalar_type = TREE_TYPE (index_vec_type);
4935 tree index_vec_cmp_type = build_same_sized_truth_vector_type
4936 (index_vec_type);
4938 /* Get an unsigned integer version of the type of the data vector. */
4939 int scalar_precision
4940 = GET_MODE_PRECISION (SCALAR_TYPE_MODE (scalar_type));
4941 tree scalar_type_unsigned = make_unsigned_type (scalar_precision);
4942 tree vectype_unsigned = build_vector_type
4943 (scalar_type_unsigned, TYPE_VECTOR_SUBPARTS (vectype));
4945 /* First we need to create a vector (ZERO_VEC) of zeros and another
4946 vector (MAX_INDEX_VEC) filled with the last matching index, which we
4947 can create using a MAX reduction and then expanding.
4948 In the case where the loop never made any matches, the max index will
4949 be zero. */
4951 /* Vector of {0, 0, 0,...}. */
4952 tree zero_vec = make_ssa_name (vectype);
4953 tree zero_vec_rhs = build_zero_cst (vectype);
4954 gimple *zero_vec_stmt = gimple_build_assign (zero_vec, zero_vec_rhs);
4955 gsi_insert_before (&exit_gsi, zero_vec_stmt, GSI_SAME_STMT);
4957 /* Find maximum value from the vector of found indexes. */
4958 tree max_index = make_ssa_name (index_scalar_type);
4959 gcall *max_index_stmt = gimple_build_call_internal (IFN_REDUC_MAX,
4960 1, induction_index);
4961 gimple_call_set_lhs (max_index_stmt, max_index);
4962 gsi_insert_before (&exit_gsi, max_index_stmt, GSI_SAME_STMT);
4964 /* Vector of {max_index, max_index, max_index,...}. */
4965 tree max_index_vec = make_ssa_name (index_vec_type);
4966 tree max_index_vec_rhs = build_vector_from_val (index_vec_type,
4967 max_index);
4968 gimple *max_index_vec_stmt = gimple_build_assign (max_index_vec,
4969 max_index_vec_rhs);
4970 gsi_insert_before (&exit_gsi, max_index_vec_stmt, GSI_SAME_STMT);
4972 /* Next we compare the new vector (MAX_INDEX_VEC) full of max indexes
4973 with the vector (INDUCTION_INDEX) of found indexes, choosing values
4974 from the data vector (NEW_PHI_RESULT) for matches, 0 (ZERO_VEC)
4975 otherwise. Only one value should match, resulting in a vector
4976 (VEC_COND) with one data value and the rest zeros.
4977 In the case where the loop never made any matches, every index will
4978 match, resulting in a vector with all data values (which will all be
4979 the default value). */
4981 /* Compare the max index vector to the vector of found indexes to find
4982 the position of the max value. */
4983 tree vec_compare = make_ssa_name (index_vec_cmp_type);
4984 gimple *vec_compare_stmt = gimple_build_assign (vec_compare, EQ_EXPR,
4985 induction_index,
4986 max_index_vec);
4987 gsi_insert_before (&exit_gsi, vec_compare_stmt, GSI_SAME_STMT);
4989 /* Use the compare to choose either values from the data vector or
4990 zero. */
4991 tree vec_cond = make_ssa_name (vectype);
4992 gimple *vec_cond_stmt = gimple_build_assign (vec_cond, VEC_COND_EXPR,
4993 vec_compare, new_phi_result,
4994 zero_vec);
4995 gsi_insert_before (&exit_gsi, vec_cond_stmt, GSI_SAME_STMT);
4997 /* Finally we need to extract the data value from the vector (VEC_COND)
4998 	 into a scalar (MATCHED_DATA_REDUC). Logically we want to do an OR
4999 reduction, but because this doesn't exist, we can use a MAX reduction
5000 instead. The data value might be signed or a float so we need to cast
5001 it first.
5002 In the case where the loop never made any matches, the data values are
5003 all identical, and so will reduce down correctly. */
5005 /* Make the matched data values unsigned. */
5006 tree vec_cond_cast = make_ssa_name (vectype_unsigned);
5007 tree vec_cond_cast_rhs = build1 (VIEW_CONVERT_EXPR, vectype_unsigned,
5008 vec_cond);
5009 gimple *vec_cond_cast_stmt = gimple_build_assign (vec_cond_cast,
5010 VIEW_CONVERT_EXPR,
5011 vec_cond_cast_rhs);
5012 gsi_insert_before (&exit_gsi, vec_cond_cast_stmt, GSI_SAME_STMT);
5014 /* Reduce down to a scalar value. */
5015 tree data_reduc = make_ssa_name (scalar_type_unsigned);
5016 gcall *data_reduc_stmt = gimple_build_call_internal (IFN_REDUC_MAX,
5017 1, vec_cond_cast);
5018 gimple_call_set_lhs (data_reduc_stmt, data_reduc);
5019 gsi_insert_before (&exit_gsi, data_reduc_stmt, GSI_SAME_STMT);
5021 /* Convert the reduced value back to the result type and set as the
5022 result. */
5023 gimple_seq stmts = NULL;
5024 new_temp = gimple_build (&stmts, VIEW_CONVERT_EXPR, scalar_type,
5025 data_reduc);
5026 gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
5027 scalar_results.safe_push (new_temp);
5029 else if (STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info) == COND_REDUCTION
5030 && reduc_fn == IFN_LAST)
5032 /* Condition reduction without supported IFN_REDUC_MAX. Generate
5033 idx = 0;
5034 idx_val = induction_index[0];
5035 val = data_reduc[0];
5036 for (idx = 0, val = init, i = 0; i < nelts; ++i)
5037 if (induction_index[i] > idx_val)
5038 val = data_reduc[i], idx_val = induction_index[i];
5039 return val; */
5041 tree data_eltype = TREE_TYPE (TREE_TYPE (new_phi_result));
5042 tree idx_eltype = TREE_TYPE (TREE_TYPE (induction_index));
5043 unsigned HOST_WIDE_INT el_size = tree_to_uhwi (TYPE_SIZE (idx_eltype));
5044 poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (TREE_TYPE (induction_index));
5045 /* Enforced by vectorizable_reduction, which ensures we have target
5046 support before allowing a conditional reduction on variable-length
5047 vectors. */
5048 unsigned HOST_WIDE_INT v_size = el_size * nunits.to_constant ();
5049 tree idx_val = NULL_TREE, val = NULL_TREE;
5050 for (unsigned HOST_WIDE_INT off = 0; off < v_size; off += el_size)
5052 tree old_idx_val = idx_val;
5053 tree old_val = val;
5054 idx_val = make_ssa_name (idx_eltype);
5055 epilog_stmt = gimple_build_assign (idx_val, BIT_FIELD_REF,
5056 build3 (BIT_FIELD_REF, idx_eltype,
5057 induction_index,
5058 bitsize_int (el_size),
5059 bitsize_int (off)));
5060 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5061 val = make_ssa_name (data_eltype);
5062 epilog_stmt = gimple_build_assign (val, BIT_FIELD_REF,
5063 build3 (BIT_FIELD_REF,
5064 data_eltype,
5065 new_phi_result,
5066 bitsize_int (el_size),
5067 bitsize_int (off)));
5068 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5069 if (off != 0)
5071 tree new_idx_val = idx_val;
5072 tree new_val = val;
5073 if (off != v_size - el_size)
5075 new_idx_val = make_ssa_name (idx_eltype);
5076 epilog_stmt = gimple_build_assign (new_idx_val,
5077 MAX_EXPR, idx_val,
5078 old_idx_val);
5079 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5081 new_val = make_ssa_name (data_eltype);
5082 epilog_stmt = gimple_build_assign (new_val,
5083 COND_EXPR,
5084 build2 (GT_EXPR,
5085 boolean_type_node,
5086 idx_val,
5087 old_idx_val),
5088 val, old_val);
5089 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5090 idx_val = new_idx_val;
5091 val = new_val;
5094 /* Convert the reduced value back to the result type and set as the
5095 result. */
5096 gimple_seq stmts = NULL;
5097 val = gimple_convert (&stmts, scalar_type, val);
5098 gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
5099 scalar_results.safe_push (val);
5102 /* 2.3 Create the reduction code, using one of the three schemes described
5103 above. In SLP we simply need to extract all the elements from the
5104 vector (without reducing them), so we use scalar shifts. */
5105 else if (reduc_fn != IFN_LAST && !slp_reduc)
5107 tree tmp;
5108 tree vec_elem_type;
5110 /* Case 1: Create:
5111 v_out2 = reduc_expr <v_out1> */
5113 if (dump_enabled_p ())
5114 dump_printf_loc (MSG_NOTE, vect_location,
5115 "Reduce using direct vector reduction.\n");
5117 vec_elem_type = TREE_TYPE (TREE_TYPE (new_phi_result));
5118 if (!useless_type_conversion_p (scalar_type, vec_elem_type))
5120 tree tmp_dest
5121 = vect_create_destination_var (scalar_dest, vec_elem_type);
5122 epilog_stmt = gimple_build_call_internal (reduc_fn, 1,
5123 new_phi_result);
5124 gimple_set_lhs (epilog_stmt, tmp_dest);
5125 new_temp = make_ssa_name (tmp_dest, epilog_stmt);
5126 gimple_set_lhs (epilog_stmt, new_temp);
5127 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5129 epilog_stmt = gimple_build_assign (new_scalar_dest, NOP_EXPR,
5130 new_temp);
5132 else
5134 epilog_stmt = gimple_build_call_internal (reduc_fn, 1,
5135 new_phi_result);
5136 gimple_set_lhs (epilog_stmt, new_scalar_dest);
5139 new_temp = make_ssa_name (new_scalar_dest, epilog_stmt);
5140 gimple_set_lhs (epilog_stmt, new_temp);
5141 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5143 if ((STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info)
5144 == INTEGER_INDUC_COND_REDUCTION)
5145 && !operand_equal_p (initial_def, induc_val, 0))
5147 	  /* Earlier we set the initial value to be a vector of induc_val
5148 	     values. Check the result and if it is induc_val then replace
5149 	     it with the original initial value, unless induc_val is
5150 	     the same as initial_def already. */
5151 tree zcompare = build2 (EQ_EXPR, boolean_type_node, new_temp,
5152 induc_val);
5154 tmp = make_ssa_name (new_scalar_dest);
5155 epilog_stmt = gimple_build_assign (tmp, COND_EXPR, zcompare,
5156 initial_def, new_temp);
5157 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5158 new_temp = tmp;
5161 scalar_results.safe_push (new_temp);
5163 else if (direct_slp_reduc)
5165 /* Here we create one vector for each of the REDUC_GROUP_SIZE results,
5166 with the elements for other SLP statements replaced with the
5167 neutral value. We can then do a normal reduction on each vector. */
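      /* Illustration (values only for exposition): with GROUP_SIZE = 2 and
	 an 8-element vector, the masked index built below is
	 {0, 1, 0, 1, 0, 1, 0, 1}; for SLP statement I the comparison
	 sel[j] = (index[j] == I) keeps every other lane of NEW_PHI_RESULT
	 and substitutes the neutral (or initial) value elsewhere, so each
	 statement gets a full-width vector that can be reduced normally.  */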
5169 /* Enforced by vectorizable_reduction. */
5170 gcc_assert (new_phis.length () == 1);
5171 gcc_assert (pow2p_hwi (group_size));
5173 slp_tree orig_phis_slp_node = slp_node_instance->reduc_phis;
5174 vec<gimple *> orig_phis = SLP_TREE_SCALAR_STMTS (orig_phis_slp_node);
5175 gimple_seq seq = NULL;
5177 /* Build a vector {0, 1, 2, ...}, with the same number of elements
5178 and the same element size as VECTYPE. */
5179 tree index = build_index_vector (vectype, 0, 1);
5180 tree index_type = TREE_TYPE (index);
5181 tree index_elt_type = TREE_TYPE (index_type);
5182 tree mask_type = build_same_sized_truth_vector_type (index_type);
5184 /* Create a vector that, for each element, identifies which of
5185 the REDUC_GROUP_SIZE results should use it. */
5186 tree index_mask = build_int_cst (index_elt_type, group_size - 1);
5187 index = gimple_build (&seq, BIT_AND_EXPR, index_type, index,
5188 build_vector_from_val (index_type, index_mask));
5190 /* Get a neutral vector value. This is simply a splat of the neutral
5191 scalar value if we have one, otherwise the initial scalar value
5192 is itself a neutral value. */
5193 tree vector_identity = NULL_TREE;
5194 if (neutral_op)
5195 vector_identity = gimple_build_vector_from_val (&seq, vectype,
5196 neutral_op);
5197 for (unsigned int i = 0; i < group_size; ++i)
5199 	  /* If there's no universal neutral value, we can use the
5200 initial scalar value from the original PHI. This is used
5201 for MIN and MAX reduction, for example. */
5202 if (!neutral_op)
5204 tree scalar_value
5205 = PHI_ARG_DEF_FROM_EDGE (orig_phis[i],
5206 loop_preheader_edge (loop));
5207 vector_identity = gimple_build_vector_from_val (&seq, vectype,
5208 scalar_value);
5211 /* Calculate the equivalent of:
5213 sel[j] = (index[j] == i);
5215 which selects the elements of NEW_PHI_RESULT that should
5216 be included in the result. */
5217 tree compare_val = build_int_cst (index_elt_type, i);
5218 compare_val = build_vector_from_val (index_type, compare_val);
5219 tree sel = gimple_build (&seq, EQ_EXPR, mask_type,
5220 index, compare_val);
5222 /* Calculate the equivalent of:
5224 	       vec = sel ? new_phi_result : vector_identity;
5226 VEC is now suitable for a full vector reduction. */
5227 tree vec = gimple_build (&seq, VEC_COND_EXPR, vectype,
5228 sel, new_phi_result, vector_identity);
5230 /* Do the reduction and convert it to the appropriate type. */
5231 tree scalar = gimple_build (&seq, as_combined_fn (reduc_fn),
5232 TREE_TYPE (vectype), vec);
5233 scalar = gimple_convert (&seq, scalar_type, scalar);
5234 scalar_results.safe_push (scalar);
5236 gsi_insert_seq_before (&exit_gsi, seq, GSI_SAME_STMT);
5238 else
5240 bool reduce_with_shift;
5241 tree vec_temp;
5243 /* COND reductions all do the final reduction with MAX_EXPR
5244 or MIN_EXPR. */
5245 if (code == COND_EXPR)
5247 if (STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info)
5248 == INTEGER_INDUC_COND_REDUCTION)
5249 code = induc_code;
5250 else
5251 code = MAX_EXPR;
5254 /* See if the target wants to do the final (shift) reduction
5255 in a vector mode of smaller size and first reduce upper/lower
5256 halves against each other. */
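      /* For example, a target might ask for a 256-bit reduction to be
	 split so that the upper and lower 128-bit halves are first combined
	 with CODE and only the 128-bit result is reduced by shifts; whether
	 and how far to split is the target's choice via
	 targetm.vectorize.split_reduction.  */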
5257 enum machine_mode mode1 = mode;
5258 tree vectype1 = vectype;
5259 unsigned sz = tree_to_uhwi (TYPE_SIZE_UNIT (vectype));
5260 unsigned sz1 = sz;
5261 if (!slp_reduc
5262 && (mode1 = targetm.vectorize.split_reduction (mode)) != mode)
5263 sz1 = GET_MODE_SIZE (mode1).to_constant ();
5265 vectype1 = get_vectype_for_scalar_type_and_size (scalar_type, sz1);
5266 reduce_with_shift = have_whole_vector_shift (mode1);
5267 if (!VECTOR_MODE_P (mode1))
5268 reduce_with_shift = false;
5269 else
5271 optab optab = optab_for_tree_code (code, vectype1, optab_default);
5272 if (optab_handler (optab, mode1) == CODE_FOR_nothing)
5273 reduce_with_shift = false;
5276       /* First reduce the vector to the desired vector size on which we
5277 	 should do the shift reduction, by combining upper and lower halves. */
5278 new_temp = new_phi_result;
5279 while (sz > sz1)
5281 gcc_assert (!slp_reduc);
5282 sz /= 2;
5283 vectype1 = get_vectype_for_scalar_type_and_size (scalar_type, sz);
5285 /* The target has to make sure we support lowpart/highpart
5286 extraction, either via direct vector extract or through
5287 	     integer mode punning. */
5288 tree dst1, dst2;
5289 if (convert_optab_handler (vec_extract_optab,
5290 TYPE_MODE (TREE_TYPE (new_temp)),
5291 TYPE_MODE (vectype1))
5292 != CODE_FOR_nothing)
5294 /* Extract sub-vectors directly once vec_extract becomes
5295 a conversion optab. */
5296 dst1 = make_ssa_name (vectype1);
5297 epilog_stmt
5298 = gimple_build_assign (dst1, BIT_FIELD_REF,
5299 build3 (BIT_FIELD_REF, vectype1,
5300 new_temp, TYPE_SIZE (vectype1),
5301 bitsize_int (0)));
5302 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5303 dst2 = make_ssa_name (vectype1);
5304 epilog_stmt
5305 = gimple_build_assign (dst2, BIT_FIELD_REF,
5306 build3 (BIT_FIELD_REF, vectype1,
5307 new_temp, TYPE_SIZE (vectype1),
5308 bitsize_int (sz * BITS_PER_UNIT)));
5309 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5311 else
5313 /* Extract via punning to appropriately sized integer mode
5314 vector. */
5315 		  tree eltype = build_nonstandard_integer_type (sz * BITS_PER_UNIT, 1);
5317 tree etype = build_vector_type (eltype, 2);
5318 gcc_assert (convert_optab_handler (vec_extract_optab,
5319 TYPE_MODE (etype),
5320 TYPE_MODE (eltype))
5321 != CODE_FOR_nothing);
5322 tree tem = make_ssa_name (etype);
5323 epilog_stmt = gimple_build_assign (tem, VIEW_CONVERT_EXPR,
5324 build1 (VIEW_CONVERT_EXPR,
5325 etype, new_temp));
5326 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5327 new_temp = tem;
5328 tem = make_ssa_name (eltype);
5329 epilog_stmt
5330 = gimple_build_assign (tem, BIT_FIELD_REF,
5331 build3 (BIT_FIELD_REF, eltype,
5332 new_temp, TYPE_SIZE (eltype),
5333 bitsize_int (0)));
5334 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5335 dst1 = make_ssa_name (vectype1);
5336 epilog_stmt = gimple_build_assign (dst1, VIEW_CONVERT_EXPR,
5337 build1 (VIEW_CONVERT_EXPR,
5338 vectype1, tem));
5339 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5340 tem = make_ssa_name (eltype);
5341 epilog_stmt
5342 = gimple_build_assign (tem, BIT_FIELD_REF,
5343 build3 (BIT_FIELD_REF, eltype,
5344 new_temp, TYPE_SIZE (eltype),
5345 bitsize_int (sz * BITS_PER_UNIT)));
5346 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5347 dst2 = make_ssa_name (vectype1);
5348 epilog_stmt = gimple_build_assign (dst2, VIEW_CONVERT_EXPR,
5349 build1 (VIEW_CONVERT_EXPR,
5350 vectype1, tem));
5351 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5354 new_temp = make_ssa_name (vectype1);
5355 epilog_stmt = gimple_build_assign (new_temp, code, dst1, dst2);
5356 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
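	  /* Illustrative sketch: with a 32-byte accumulator and a target
	     that prefers a 16-byte final reduction, one trip through the
	     loop above emits roughly

	       lo       = BIT_FIELD_REF <new_temp, 128, 0>;
	       hi       = BIT_FIELD_REF <new_temp, 128, 128>;
	       new_temp = lo CODE hi;

	     (or the equivalent extraction through an integer-mode
	     VIEW_CONVERT_EXPR when direct sub-vector extraction is not
	     supported).  */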
5359 if (reduce_with_shift && !slp_reduc)
5361 int element_bitsize = tree_to_uhwi (bitsize);
5362 /* Enforced by vectorizable_reduction, which disallows SLP reductions
5363 for variable-length vectors and also requires direct target support
5364 for loop reductions. */
5365 int vec_size_in_bits = tree_to_uhwi (TYPE_SIZE (vectype1));
5366 int nelements = vec_size_in_bits / element_bitsize;
5367 vec_perm_builder sel;
5368 vec_perm_indices indices;
5370 int elt_offset;
5372 tree zero_vec = build_zero_cst (vectype1);
5373 /* Case 2: Create:
5374 for (offset = nelements/2; offset >= 1; offset/=2)
5376 Create: va' = vec_shift <va, offset>
5377 Create: va = vop <va, va'>
5378 } */
5380 tree rhs;
5382 if (dump_enabled_p ())
5383 dump_printf_loc (MSG_NOTE, vect_location,
5384 "Reduce using vector shifts\n");
5386 mode1 = TYPE_MODE (vectype1);
5387 vec_dest = vect_create_destination_var (scalar_dest, vectype1);
5388 for (elt_offset = nelements / 2;
5389 elt_offset >= 1;
5390 elt_offset /= 2)
5392 calc_vec_perm_mask_for_shift (elt_offset, nelements, &sel);
5393 indices.new_vector (sel, 2, nelements);
5394 tree mask = vect_gen_perm_mask_any (vectype1, indices);
5395 epilog_stmt = gimple_build_assign (vec_dest, VEC_PERM_EXPR,
5396 new_temp, zero_vec, mask);
5397 new_name = make_ssa_name (vec_dest, epilog_stmt);
5398 gimple_assign_set_lhs (epilog_stmt, new_name);
5399 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5401 epilog_stmt = gimple_build_assign (vec_dest, code, new_name,
5402 new_temp);
5403 new_temp = make_ssa_name (vec_dest, epilog_stmt);
5404 gimple_assign_set_lhs (epilog_stmt, new_temp);
5405 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5408 /* 2.4 Extract the final scalar result. Create:
5409 s_out3 = extract_field <v_out2, bitpos> */
5411 if (dump_enabled_p ())
5412 dump_printf_loc (MSG_NOTE, vect_location,
5413 "extract scalar result\n");
5415 rhs = build3 (BIT_FIELD_REF, scalar_type, new_temp,
5416 bitsize, bitsize_zero_node);
5417 epilog_stmt = gimple_build_assign (new_scalar_dest, rhs);
5418 new_temp = make_ssa_name (new_scalar_dest, epilog_stmt);
5419 gimple_assign_set_lhs (epilog_stmt, new_temp);
5420 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5421 scalar_results.safe_push (new_temp);
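	      /* Illustrative sketch of the shift reduction above, for a
		 4-element vector and a PLUS reduction:

		   v' = VEC_PERM_EXPR <v, { 0, ... }, { 2, 3, 4, 5 }>;
		   v  = v + v';                       (shift by 2 elements)
		   v' = VEC_PERM_EXPR <v, { 0, ... }, { 1, 2, 3, 4 }>;
		   v  = v + v';                       (shift by 1 element)
		   s  = BIT_FIELD_REF <v, element_size, 0>;

		 Element 0 of V then holds the complete reduction.  */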
5423 else
5425 /* Case 3: Create:
5426 s = extract_field <v_out2, 0>
5427 for (offset = element_size;
5428 offset < vector_size;
5429 offset += element_size;)
5431 Create: s' = extract_field <v_out2, offset>
5432 Create: s = op <s, s'> // For non SLP cases
5433 } */
5435 if (dump_enabled_p ())
5436 dump_printf_loc (MSG_NOTE, vect_location,
5437 "Reduce using scalar code.\n");
5439 int vec_size_in_bits = tree_to_uhwi (TYPE_SIZE (vectype1));
5440 int element_bitsize = tree_to_uhwi (bitsize);
5441 FOR_EACH_VEC_ELT (new_phis, i, new_phi)
5443 int bit_offset;
5444 if (gimple_code (new_phi) == GIMPLE_PHI)
5445 vec_temp = PHI_RESULT (new_phi);
5446 else
5447 vec_temp = gimple_assign_lhs (new_phi);
5448 tree rhs = build3 (BIT_FIELD_REF, scalar_type, vec_temp, bitsize,
5449 bitsize_zero_node);
5450 epilog_stmt = gimple_build_assign (new_scalar_dest, rhs);
5451 new_temp = make_ssa_name (new_scalar_dest, epilog_stmt);
5452 gimple_assign_set_lhs (epilog_stmt, new_temp);
5453 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5455 /* In SLP we don't need to apply reduction operation, so we just
5456 collect s' values in SCALAR_RESULTS. */
5457 if (slp_reduc)
5458 scalar_results.safe_push (new_temp);
5460 for (bit_offset = element_bitsize;
5461 bit_offset < vec_size_in_bits;
5462 bit_offset += element_bitsize)
5464 tree bitpos = bitsize_int (bit_offset);
5465 tree rhs = build3 (BIT_FIELD_REF, scalar_type, vec_temp,
5466 bitsize, bitpos);
5468 epilog_stmt = gimple_build_assign (new_scalar_dest, rhs);
5469 new_name = make_ssa_name (new_scalar_dest, epilog_stmt);
5470 gimple_assign_set_lhs (epilog_stmt, new_name);
5471 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5473 if (slp_reduc)
5475 /* In SLP we don't need to apply reduction operation, so
5476 we just collect s' values in SCALAR_RESULTS. */
5477 new_temp = new_name;
5478 scalar_results.safe_push (new_name);
5480 else
5482 epilog_stmt = gimple_build_assign (new_scalar_dest, code,
5483 new_name, new_temp);
5484 new_temp = make_ssa_name (new_scalar_dest, epilog_stmt);
5485 gimple_assign_set_lhs (epilog_stmt, new_temp);
5486 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5491 	  /* The only case where we need to reduce scalar results in SLP is
5492 unrolling. If the size of SCALAR_RESULTS is greater than
5493 REDUC_GROUP_SIZE, we reduce them combining elements modulo
5494 REDUC_GROUP_SIZE. */
5495 if (slp_reduc)
5497 tree res, first_res, new_res;
5498 gimple *new_stmt;
5500 /* Reduce multiple scalar results in case of SLP unrolling. */
5501 for (j = group_size; scalar_results.iterate (j, &res);
5502 j++)
5504 first_res = scalar_results[j % group_size];
5505 new_stmt = gimple_build_assign (new_scalar_dest, code,
5506 first_res, res);
5507 new_res = make_ssa_name (new_scalar_dest, new_stmt);
5508 gimple_assign_set_lhs (new_stmt, new_res);
5509 gsi_insert_before (&exit_gsi, new_stmt, GSI_SAME_STMT);
5510 scalar_results[j % group_size] = new_res;
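	      /* Illustrative sketch: with REDUC_GROUP_SIZE == 2 and four
		 scalar results (i.e. the SLP instance was unrolled twice),
		 the loop above combines them pairwise:

		   scalar_results[0] = scalar_results[0] CODE scalar_results[2];
		   scalar_results[1] = scalar_results[1] CODE scalar_results[3];

		 leaving one result per group member.  */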
5513 else
5514 /* Not SLP - we have one scalar to keep in SCALAR_RESULTS. */
5515 scalar_results.safe_push (new_temp);
5518 if ((STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info)
5519 == INTEGER_INDUC_COND_REDUCTION)
5520 && !operand_equal_p (initial_def, induc_val, 0))
5522       /* Earlier we set the initial value to be a vector of induc_val
5523 values. Check the result and if it is induc_val then replace
5524 with the original initial value, unless induc_val is
5525 the same as initial_def already. */
5526 tree zcompare = build2 (EQ_EXPR, boolean_type_node, new_temp,
5527 induc_val);
5529 tree tmp = make_ssa_name (new_scalar_dest);
5530 epilog_stmt = gimple_build_assign (tmp, COND_EXPR, zcompare,
5531 initial_def, new_temp);
5532 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5533 scalar_results[0] = tmp;
5537 vect_finalize_reduction:
5539 if (double_reduc)
5540 loop = loop->inner;
5542 /* 2.5 Adjust the final result by the initial value of the reduction
5543 variable. (When such adjustment is not needed, then
5544 'adjustment_def' is zero). For example, if code is PLUS we create:
5545 new_temp = loop_exit_def + adjustment_def */
5547 if (adjustment_def)
5549 gcc_assert (!slp_reduc);
5550 if (nested_in_vect_loop)
5552 new_phi = new_phis[0];
5553 gcc_assert (TREE_CODE (TREE_TYPE (adjustment_def)) == VECTOR_TYPE);
5554 expr = build2 (code, vectype, PHI_RESULT (new_phi), adjustment_def);
5555 new_dest = vect_create_destination_var (scalar_dest, vectype);
5557 else
5559 new_temp = scalar_results[0];
5560 gcc_assert (TREE_CODE (TREE_TYPE (adjustment_def)) != VECTOR_TYPE);
5561 expr = build2 (code, scalar_type, new_temp, adjustment_def);
5562 new_dest = vect_create_destination_var (scalar_dest, scalar_type);
5565 epilog_stmt = gimple_build_assign (new_dest, expr);
5566 new_temp = make_ssa_name (new_dest, epilog_stmt);
5567 gimple_assign_set_lhs (epilog_stmt, new_temp);
5568 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5569 if (nested_in_vect_loop)
5571 set_vinfo_for_stmt (epilog_stmt,
5572 new_stmt_vec_info (epilog_stmt, loop_vinfo));
5573 STMT_VINFO_RELATED_STMT (vinfo_for_stmt (epilog_stmt)) =
5574 STMT_VINFO_RELATED_STMT (vinfo_for_stmt (new_phi));
5576 if (!double_reduc)
5577 scalar_results.quick_push (new_temp);
5578 else
5579 scalar_results[0] = new_temp;
5581 else
5582 scalar_results[0] = new_temp;
5584 new_phis[0] = epilog_stmt;
5587 /* 2.6 Handle the loop-exit phis. Replace the uses of scalar loop-exit
5588 phis with new adjusted scalar results, i.e., replace use <s_out0>
5589 with use <s_out4>.
5591 Transform:
5592 loop_exit:
5593 s_out0 = phi <s_loop> # (scalar) EXIT_PHI
5594 v_out1 = phi <VECT_DEF> # NEW_EXIT_PHI
5595 v_out2 = reduce <v_out1>
5596 s_out3 = extract_field <v_out2, 0>
5597 s_out4 = adjust_result <s_out3>
5598 use <s_out0>
5599 use <s_out0>
5601 into:
5603 loop_exit:
5604 s_out0 = phi <s_loop> # (scalar) EXIT_PHI
5605 v_out1 = phi <VECT_DEF> # NEW_EXIT_PHI
5606 v_out2 = reduce <v_out1>
5607 s_out3 = extract_field <v_out2, 0>
5608 s_out4 = adjust_result <s_out3>
5609 use <s_out4>
5610 use <s_out4> */
5613   /* For an SLP reduction chain we reduce the vector results into one vector
5614      if necessary, hence we set REDUC_GROUP_SIZE to 1 here.  SCALAR_DEST is the
5615 LHS of the last stmt in the reduction chain, since we are looking for
5616 the loop exit phi node. */
5617 if (REDUC_GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)))
5619 gimple *dest_stmt = SLP_TREE_SCALAR_STMTS (slp_node)[group_size - 1];
5620 /* Handle reduction patterns. */
5621 if (STMT_VINFO_RELATED_STMT (vinfo_for_stmt (dest_stmt)))
5622 dest_stmt = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (dest_stmt));
5624 scalar_dest = gimple_assign_lhs (dest_stmt);
5625 group_size = 1;
5628 /* In SLP we may have several statements in NEW_PHIS and REDUCTION_PHIS (in
5629 case that REDUC_GROUP_SIZE is greater than vectorization factor).
5630 Therefore, we need to match SCALAR_RESULTS with corresponding statements.
5631 The first (REDUC_GROUP_SIZE / number of new vector stmts) scalar results
5632 correspond to the first vector stmt, etc.
5633 (RATIO is equal to (REDUC_GROUP_SIZE / number of new vector stmts)). */
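  /* Illustrative sketch: with REDUC_GROUP_SIZE == 4 and two vector
     statements in NEW_PHIS, RATIO is 2, so scalar_results[0] and [1] are
     matched with new_phis[0] and scalar_results[2] and [3] with
     new_phis[1].  */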
5634 if (group_size > new_phis.length ())
5636 ratio = group_size / new_phis.length ();
5637 gcc_assert (!(group_size % new_phis.length ()));
5639 else
5640 ratio = 1;
5642 for (k = 0; k < group_size; k++)
5644 if (k % ratio == 0)
5646 epilog_stmt = new_phis[k / ratio];
5647 reduction_phi = reduction_phis[k / ratio];
5648 if (double_reduc)
5649 inner_phi = inner_phis[k / ratio];
5652 if (slp_reduc)
5654 gimple *current_stmt = SLP_TREE_SCALAR_STMTS (slp_node)[k];
5656 orig_stmt = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (current_stmt));
5657 /* SLP statements can't participate in patterns. */
5658 gcc_assert (!orig_stmt);
5659 scalar_dest = gimple_assign_lhs (current_stmt);
5662 phis.create (3);
5663 /* Find the loop-closed-use at the loop exit of the original scalar
5664 result. (The reduction result is expected to have two immediate uses -
5665 one at the latch block, and one at the loop exit). */
5666 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, scalar_dest)
5667 if (!flow_bb_inside_loop_p (loop, gimple_bb (USE_STMT (use_p)))
5668 && !is_gimple_debug (USE_STMT (use_p)))
5669 phis.safe_push (USE_STMT (use_p));
5671 /* While we expect to have found an exit_phi because of loop-closed-ssa
5672 form we can end up without one if the scalar cycle is dead. */
5674 FOR_EACH_VEC_ELT (phis, i, exit_phi)
5676 if (outer_loop)
5678 stmt_vec_info exit_phi_vinfo = vinfo_for_stmt (exit_phi);
5679 gphi *vect_phi;
5681 /* FORNOW. Currently not supporting the case that an inner-loop
5682 reduction is not used in the outer-loop (but only outside the
5683 	     outer-loop), unless it is a double reduction.  */
5684 gcc_assert ((STMT_VINFO_RELEVANT_P (exit_phi_vinfo)
5685 && !STMT_VINFO_LIVE_P (exit_phi_vinfo))
5686 || double_reduc);
5688 if (double_reduc)
5689 STMT_VINFO_VEC_STMT (exit_phi_vinfo) = inner_phi;
5690 else
5691 STMT_VINFO_VEC_STMT (exit_phi_vinfo) = epilog_stmt;
5692 if (!double_reduc
5693 || STMT_VINFO_DEF_TYPE (exit_phi_vinfo)
5694 != vect_double_reduction_def)
5695 continue;
5697 /* Handle double reduction:
5699 stmt1: s1 = phi <s0, s2> - double reduction phi (outer loop)
5700 stmt2: s3 = phi <s1, s4> - (regular) reduc phi (inner loop)
5701 stmt3: s4 = use (s3) - (regular) reduc stmt (inner loop)
5702 stmt4: s2 = phi <s4> - double reduction stmt (outer loop)
5704 At that point the regular reduction (stmt2 and stmt3) is
5705 already vectorized, as well as the exit phi node, stmt4.
5706 Here we vectorize the phi node of double reduction, stmt1, and
5707 update all relevant statements. */
5709 /* Go through all the uses of s2 to find double reduction phi
5710 node, i.e., stmt1 above. */
5711 orig_name = PHI_RESULT (exit_phi);
5712 FOR_EACH_IMM_USE_STMT (use_stmt, imm_iter, orig_name)
5714 stmt_vec_info use_stmt_vinfo;
5715 stmt_vec_info new_phi_vinfo;
5716 tree vect_phi_init, preheader_arg, vect_phi_res;
5717 basic_block bb = gimple_bb (use_stmt);
5718 gimple *use;
5720 	      /* Check that USE_STMT is really a double reduction phi
5721 node. */
5722 if (gimple_code (use_stmt) != GIMPLE_PHI
5723 || gimple_phi_num_args (use_stmt) != 2
5724 || bb->loop_father != outer_loop)
5725 continue;
5726 use_stmt_vinfo = vinfo_for_stmt (use_stmt);
5727 if (!use_stmt_vinfo
5728 || STMT_VINFO_DEF_TYPE (use_stmt_vinfo)
5729 != vect_double_reduction_def)
5730 continue;
5732 /* Create vector phi node for double reduction:
5733 vs1 = phi <vs0, vs2>
5734 vs1 was created previously in this function by a call to
5735 vect_get_vec_def_for_operand and is stored in
5736 vec_initial_def;
5737 vs2 is defined by INNER_PHI, the vectorized EXIT_PHI;
5738 vs0 is created here. */
5740 /* Create vector phi node. */
5741 vect_phi = create_phi_node (vec_initial_def, bb);
5742 new_phi_vinfo = new_stmt_vec_info (vect_phi,
5743 loop_vec_info_for_loop (outer_loop));
5744 set_vinfo_for_stmt (vect_phi, new_phi_vinfo);
5746 /* Create vs0 - initial def of the double reduction phi. */
5747 preheader_arg = PHI_ARG_DEF_FROM_EDGE (use_stmt,
5748 loop_preheader_edge (outer_loop));
5749 vect_phi_init = get_initial_def_for_reduction
5750 (stmt, preheader_arg, NULL);
5752 /* Update phi node arguments with vs0 and vs2. */
5753 add_phi_arg (vect_phi, vect_phi_init,
5754 loop_preheader_edge (outer_loop),
5755 UNKNOWN_LOCATION);
5756 add_phi_arg (vect_phi, PHI_RESULT (inner_phi),
5757 loop_latch_edge (outer_loop), UNKNOWN_LOCATION);
5758 if (dump_enabled_p ())
5760 dump_printf_loc (MSG_NOTE, vect_location,
5761 "created double reduction phi node: ");
5762 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, vect_phi, 0);
5765 vect_phi_res = PHI_RESULT (vect_phi);
5767 /* Replace the use, i.e., set the correct vs1 in the regular
5768 reduction phi node. FORNOW, NCOPIES is always 1, so the
5769 loop is redundant. */
5770 use = reduction_phi;
5771 for (j = 0; j < ncopies; j++)
5773 edge pr_edge = loop_preheader_edge (loop);
5774 SET_PHI_ARG_DEF (use, pr_edge->dest_idx, vect_phi_res);
5775 use = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (use));
5781 phis.release ();
5782 if (nested_in_vect_loop)
5784 if (double_reduc)
5785 loop = outer_loop;
5786 else
5787 continue;
5790 phis.create (3);
5791 /* Find the loop-closed-use at the loop exit of the original scalar
5792 result. (The reduction result is expected to have two immediate uses,
5793 one at the latch block, and one at the loop exit). For double
5794 reductions we are looking for exit phis of the outer loop. */
5795 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, scalar_dest)
5797 if (!flow_bb_inside_loop_p (loop, gimple_bb (USE_STMT (use_p))))
5799 if (!is_gimple_debug (USE_STMT (use_p)))
5800 phis.safe_push (USE_STMT (use_p));
5802 else
5804 if (double_reduc && gimple_code (USE_STMT (use_p)) == GIMPLE_PHI)
5806 tree phi_res = PHI_RESULT (USE_STMT (use_p));
5808 FOR_EACH_IMM_USE_FAST (phi_use_p, phi_imm_iter, phi_res)
5810 if (!flow_bb_inside_loop_p (loop,
5811 gimple_bb (USE_STMT (phi_use_p)))
5812 && !is_gimple_debug (USE_STMT (phi_use_p)))
5813 phis.safe_push (USE_STMT (phi_use_p));
5819 FOR_EACH_VEC_ELT (phis, i, exit_phi)
5821 /* Replace the uses: */
5822 orig_name = PHI_RESULT (exit_phi);
5823 scalar_result = scalar_results[k];
5824 FOR_EACH_IMM_USE_STMT (use_stmt, imm_iter, orig_name)
5825 FOR_EACH_IMM_USE_ON_STMT (use_p, imm_iter)
5826 SET_USE (use_p, scalar_result);
5829 phis.release ();
5833 /* Return a vector of type VECTYPE that is equal to the vector select
5834 operation "MASK ? VEC : IDENTITY". Insert the select statements
5835 before GSI. */
5837 static tree
5838 merge_with_identity (gimple_stmt_iterator *gsi, tree mask, tree vectype,
5839 tree vec, tree identity)
5841 tree cond = make_temp_ssa_name (vectype, NULL, "cond");
5842 gimple *new_stmt = gimple_build_assign (cond, VEC_COND_EXPR,
5843 mask, vec, identity);
5844 gsi_insert_before (gsi, new_stmt, GSI_SAME_STMT);
5845 return cond;
5848 /* Successively apply CODE to each element of VECTOR_RHS, in left-to-right
5849 order, starting with LHS. Insert the extraction statements before GSI and
5850 associate the new scalar SSA names with variable SCALAR_DEST.
5851 Return the SSA name for the result. */
5853 static tree
5854 vect_expand_fold_left (gimple_stmt_iterator *gsi, tree scalar_dest,
5855 tree_code code, tree lhs, tree vector_rhs)
5857 tree vectype = TREE_TYPE (vector_rhs);
5858 tree scalar_type = TREE_TYPE (vectype);
5859 tree bitsize = TYPE_SIZE (scalar_type);
5860 unsigned HOST_WIDE_INT vec_size_in_bits = tree_to_uhwi (TYPE_SIZE (vectype));
5861 unsigned HOST_WIDE_INT element_bitsize = tree_to_uhwi (bitsize);
5863 for (unsigned HOST_WIDE_INT bit_offset = 0;
5864 bit_offset < vec_size_in_bits;
5865 bit_offset += element_bitsize)
5867 tree bitpos = bitsize_int (bit_offset);
5868 tree rhs = build3 (BIT_FIELD_REF, scalar_type, vector_rhs,
5869 bitsize, bitpos);
5871 gassign *stmt = gimple_build_assign (scalar_dest, rhs);
5872 rhs = make_ssa_name (scalar_dest, stmt);
5873 gimple_assign_set_lhs (stmt, rhs);
5874 gsi_insert_before (gsi, stmt, GSI_SAME_STMT);
5876 stmt = gimple_build_assign (scalar_dest, code, lhs, rhs);
5877 tree new_name = make_ssa_name (scalar_dest, stmt);
5878 gimple_assign_set_lhs (stmt, new_name);
5879 gsi_insert_before (gsi, stmt, GSI_SAME_STMT);
5880 lhs = new_name;
5882 return lhs;
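/* Illustrative sketch: for a 4-element vector and a PLUS reduction the
   expansion above produces

     s_0 = BIT_FIELD_REF <vector_rhs, sz, 0>;      lhs = lhs + s_0;
     s_1 = BIT_FIELD_REF <vector_rhs, sz, sz>;     lhs = lhs + s_1;
     s_2 = BIT_FIELD_REF <vector_rhs, sz, 2*sz>;   lhs = lhs + s_2;
     s_3 = BIT_FIELD_REF <vector_rhs, sz, 3*sz>;   lhs = lhs + s_3;

   where SZ is the element size in bits.  */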
5885 /* Perform an in-order reduction (FOLD_LEFT_REDUCTION). STMT is the
5886 statement that sets the live-out value. REDUC_DEF_STMT is the phi
5887 statement. CODE is the operation performed by STMT and OPS are
5888 its scalar operands. REDUC_INDEX is the index of the operand in
5889 OPS that is set by REDUC_DEF_STMT. REDUC_FN is the function that
5890 implements in-order reduction, or IFN_LAST if we should open-code it.
5891 VECTYPE_IN is the type of the vector input. MASKS specifies the masks
5892 that should be used to control the operation in a fully-masked loop. */
5894 static bool
5895 vectorize_fold_left_reduction (gimple *stmt, gimple_stmt_iterator *gsi,
5896 gimple **vec_stmt, slp_tree slp_node,
5897 gimple *reduc_def_stmt,
5898 tree_code code, internal_fn reduc_fn,
5899 tree ops[3], tree vectype_in,
5900 int reduc_index, vec_loop_masks *masks)
5902 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
5903 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
5904 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
5905 tree vectype_out = STMT_VINFO_VECTYPE (stmt_info);
5906 gimple *new_stmt = NULL;
5908 int ncopies;
5909 if (slp_node)
5910 ncopies = 1;
5911 else
5912 ncopies = vect_get_num_copies (loop_vinfo, vectype_in);
5914 gcc_assert (!nested_in_vect_loop_p (loop, stmt));
5915 gcc_assert (ncopies == 1);
5916 gcc_assert (TREE_CODE_LENGTH (code) == binary_op);
5917 gcc_assert (reduc_index == (code == MINUS_EXPR ? 0 : 1));
5918 gcc_assert (STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info)
5919 == FOLD_LEFT_REDUCTION);
5921 if (slp_node)
5922 gcc_assert (known_eq (TYPE_VECTOR_SUBPARTS (vectype_out),
5923 TYPE_VECTOR_SUBPARTS (vectype_in)));
5925 tree op0 = ops[1 - reduc_index];
5927 int group_size = 1;
5928 gimple *scalar_dest_def;
5929 auto_vec<tree> vec_oprnds0;
5930 if (slp_node)
5932 vect_get_vec_defs (op0, NULL_TREE, stmt, &vec_oprnds0, NULL, slp_node);
5933 group_size = SLP_TREE_SCALAR_STMTS (slp_node).length ();
5934 scalar_dest_def = SLP_TREE_SCALAR_STMTS (slp_node)[group_size - 1];
5936 else
5938 tree loop_vec_def0 = vect_get_vec_def_for_operand (op0, stmt);
5939 vec_oprnds0.create (1);
5940 vec_oprnds0.quick_push (loop_vec_def0);
5941 scalar_dest_def = stmt;
5944 tree scalar_dest = gimple_assign_lhs (scalar_dest_def);
5945 tree scalar_type = TREE_TYPE (scalar_dest);
5946 tree reduc_var = gimple_phi_result (reduc_def_stmt);
5948 int vec_num = vec_oprnds0.length ();
5949 gcc_assert (vec_num == 1 || slp_node);
5950 tree vec_elem_type = TREE_TYPE (vectype_out);
5951 gcc_checking_assert (useless_type_conversion_p (scalar_type, vec_elem_type));
5953 tree vector_identity = NULL_TREE;
5954 if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
5955 vector_identity = build_zero_cst (vectype_out);
5957 tree scalar_dest_var = vect_create_destination_var (scalar_dest, NULL);
5958 int i;
5959 tree def0;
5960 FOR_EACH_VEC_ELT (vec_oprnds0, i, def0)
5962 tree mask = NULL_TREE;
5963 if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
5964 mask = vect_get_loop_mask (gsi, masks, vec_num, vectype_in, i);
5966 /* Handle MINUS by adding the negative. */
5967 if (reduc_fn != IFN_LAST && code == MINUS_EXPR)
5969 tree negated = make_ssa_name (vectype_out);
5970 new_stmt = gimple_build_assign (negated, NEGATE_EXPR, def0);
5971 gsi_insert_before (gsi, new_stmt, GSI_SAME_STMT);
5972 def0 = negated;
5975 if (mask)
5976 def0 = merge_with_identity (gsi, mask, vectype_out, def0,
5977 vector_identity);
5979 /* On the first iteration the input is simply the scalar phi
5980 result, and for subsequent iterations it is the output of
5981 the preceding operation. */
5982 if (reduc_fn != IFN_LAST)
5984 new_stmt = gimple_build_call_internal (reduc_fn, 2, reduc_var, def0);
5985 /* For chained SLP reductions the output of the previous reduction
5986 operation serves as the input of the next. For the final statement
5987 the output cannot be a temporary - we reuse the original
5988 scalar destination of the last statement. */
5989 if (i != vec_num - 1)
5991 gimple_set_lhs (new_stmt, scalar_dest_var);
5992 reduc_var = make_ssa_name (scalar_dest_var, new_stmt);
5993 gimple_set_lhs (new_stmt, reduc_var);
5996 else
5998 reduc_var = vect_expand_fold_left (gsi, scalar_dest_var, code,
5999 reduc_var, def0);
6000 new_stmt = SSA_NAME_DEF_STMT (reduc_var);
6001 /* Remove the statement, so that we can use the same code paths
6002 as for statements that we've just created. */
6003 gimple_stmt_iterator tmp_gsi = gsi_for_stmt (new_stmt);
6004 gsi_remove (&tmp_gsi, false);
6007 if (i == vec_num - 1)
6009 gimple_set_lhs (new_stmt, scalar_dest);
6010 vect_finish_replace_stmt (scalar_dest_def, new_stmt);
6012 else
6013 vect_finish_stmt_generation (scalar_dest_def, new_stmt, gsi);
6015 if (slp_node)
6016 SLP_TREE_VEC_STMTS (slp_node).quick_push (new_stmt);
6019 if (!slp_node)
6020 STMT_VINFO_VEC_STMT (stmt_info) = *vec_stmt = new_stmt;
6022 return true;
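/* Illustrative sketch: when an in-order reduction ifn is available
   (e.g. .FOLD_LEFT_PLUS), the code above turns an unmasked two-vector
   SLP chain into roughly

     tmp = .FOLD_LEFT_PLUS (reduc_phi_result, vec_def_0);
     res = .FOLD_LEFT_PLUS (tmp, vec_def_1);

   with RES replacing the lhs of the last scalar statement.  Without such
   an ifn, each vector is instead expanded element by element through
   vect_expand_fold_left.  */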
6025 /* Function is_nonwrapping_integer_induction.
6027    Check if STMT (which is part of loop LOOP) is an integer induction
6028    that does not cause overflow during the loop's execution.  */
6030 static bool
6031 is_nonwrapping_integer_induction (gimple *stmt, struct loop *loop)
6033 stmt_vec_info stmt_vinfo = vinfo_for_stmt (stmt);
6034 tree base = STMT_VINFO_LOOP_PHI_EVOLUTION_BASE_UNCHANGED (stmt_vinfo);
6035 tree step = STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_vinfo);
6036 tree lhs_type = TREE_TYPE (gimple_phi_result (stmt));
6037 widest_int ni, max_loop_value, lhs_max;
6038 wi::overflow_type overflow = wi::OVF_NONE;
6040 /* Make sure the loop is integer based. */
6041 if (TREE_CODE (base) != INTEGER_CST
6042 || TREE_CODE (step) != INTEGER_CST)
6043 return false;
6045 /* Check that the max size of the loop will not wrap. */
6047 if (TYPE_OVERFLOW_UNDEFINED (lhs_type))
6048 return true;
6050 if (! max_stmt_executions (loop, &ni))
6051 return false;
6053 max_loop_value = wi::mul (wi::to_widest (step), ni, TYPE_SIGN (lhs_type),
6054 &overflow);
6055 if (overflow)
6056 return false;
6058 max_loop_value = wi::add (wi::to_widest (base), max_loop_value,
6059 TYPE_SIGN (lhs_type), &overflow);
6060 if (overflow)
6061 return false;
6063 return (wi::min_precision (max_loop_value, TYPE_SIGN (lhs_type))
6064 <= TYPE_PRECISION (lhs_type));
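/* Illustrative example: for an induction with BASE == 0, STEP == 4 and
   at most 1000 executions of the statement, MAX_LOOP_VALUE is 4000,
   which needs far fewer bits than a 32-bit LHS_TYPE, so the induction is
   known not to wrap.  */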
6067 /* Function vectorizable_reduction.
6069 Check if STMT performs a reduction operation that can be vectorized.
6070 If VEC_STMT is also passed, vectorize the STMT: create a vectorized
6071 stmt to replace it, put it in VEC_STMT, and insert it at GSI.
6072 Return FALSE if not a vectorizable STMT, TRUE otherwise.
6074 This function also handles reduction idioms (patterns) that have been
6075 recognized in advance during vect_pattern_recog. In this case, STMT may be
6076 of this form:
6077 X = pattern_expr (arg0, arg1, ..., X)
6078    and its STMT_VINFO_RELATED_STMT points to the last stmt in the original
6079 sequence that had been detected and replaced by the pattern-stmt (STMT).
6081 This function also handles reduction of condition expressions, for example:
6082 for (int i = 0; i < N; i++)
6083 if (a[i] < value)
6084 last = a[i];
6085 This is handled by vectorising the loop and creating an additional vector
6086 containing the loop indexes for which "a[i] < value" was true. In the
6087 function epilogue this is reduced to a single max value and then used to
6088 index into the vector of results.
6090 In some cases of reduction patterns, the type of the reduction variable X is
6091 different than the type of the other arguments of STMT.
6092 In such cases, the vectype that is used when transforming STMT into a vector
6093 stmt is different than the vectype that is used to determine the
6094 vectorization factor, because it consists of a different number of elements
6095 than the actual number of elements that are being operated upon in parallel.
6097 For example, consider an accumulation of shorts into an int accumulator.
6098 On some targets it's possible to vectorize this pattern operating on 8
6099 shorts at a time (hence, the vectype for purposes of determining the
6100 vectorization factor should be V8HI); on the other hand, the vectype that
6101 is used to create the vector form is actually V4SI (the type of the result).
6103 Upon entry to this function, STMT_VINFO_VECTYPE records the vectype that
6104 indicates what is the actual level of parallelism (V8HI in the example), so
6105 that the right vectorization factor would be derived. This vectype
6106 corresponds to the type of arguments to the reduction stmt, and should *NOT*
6107 be used to create the vectorized stmt. The right vectype for the vectorized
6108 stmt is obtained from the type of the result X:
6109 get_vectype_for_scalar_type (TREE_TYPE (X))
6111 This means that, contrary to "regular" reductions (or "regular" stmts in
6112 general), the following equation:
6113 STMT_VINFO_VECTYPE == get_vectype_for_scalar_type (TREE_TYPE (X))
6114 does *NOT* necessarily hold for reduction patterns. */
6116 bool
6117 vectorizable_reduction (gimple *stmt, gimple_stmt_iterator *gsi,
6118 gimple **vec_stmt, slp_tree slp_node,
6119 slp_instance slp_node_instance,
6120 stmt_vector_for_cost *cost_vec)
6122 tree vec_dest;
6123 tree scalar_dest;
6124 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
6125 tree vectype_out = STMT_VINFO_VECTYPE (stmt_info);
6126 tree vectype_in = NULL_TREE;
6127 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
6128 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
6129 enum tree_code code, orig_code;
6130 internal_fn reduc_fn;
6131 machine_mode vec_mode;
6132 int op_type;
6133 optab optab;
6134 tree new_temp = NULL_TREE;
6135 gimple *def_stmt;
6136 enum vect_def_type dt, cond_reduc_dt = vect_unknown_def_type;
6137 gimple *cond_reduc_def_stmt = NULL;
6138 enum tree_code cond_reduc_op_code = ERROR_MARK;
6139 tree scalar_type;
6140 bool is_simple_use;
6141 gimple *orig_stmt;
6142 stmt_vec_info orig_stmt_info = NULL;
6143 int i;
6144 int ncopies;
6145 int epilog_copies;
6146 stmt_vec_info prev_stmt_info, prev_phi_info;
6147 bool single_defuse_cycle = false;
6148 gimple *new_stmt = NULL;
6149 int j;
6150 tree ops[3];
6151 enum vect_def_type dts[3];
6152 bool nested_cycle = false, found_nested_cycle_def = false;
6153 bool double_reduc = false;
6154 basic_block def_bb;
6155 struct loop * def_stmt_loop, *outer_loop = NULL;
6156 tree def_arg;
6157 gimple *def_arg_stmt;
6158 auto_vec<tree> vec_oprnds0;
6159 auto_vec<tree> vec_oprnds1;
6160 auto_vec<tree> vec_oprnds2;
6161 auto_vec<tree> vect_defs;
6162 auto_vec<gimple *> phis;
6163 int vec_num;
6164 tree def0, tem;
6165 tree cr_index_scalar_type = NULL_TREE, cr_index_vector_type = NULL_TREE;
6166 tree cond_reduc_val = NULL_TREE;
6168 /* Make sure it was already recognized as a reduction computation. */
6169 if (STMT_VINFO_DEF_TYPE (vinfo_for_stmt (stmt)) != vect_reduction_def
6170 && STMT_VINFO_DEF_TYPE (vinfo_for_stmt (stmt)) != vect_nested_cycle)
6171 return false;
6173 if (nested_in_vect_loop_p (loop, stmt))
6175 outer_loop = loop;
6176 loop = loop->inner;
6177 nested_cycle = true;
6180 if (REDUC_GROUP_FIRST_ELEMENT (stmt_info))
6181 gcc_assert (slp_node && REDUC_GROUP_FIRST_ELEMENT (stmt_info) == stmt);
6183 if (gimple_code (stmt) == GIMPLE_PHI)
6185 /* Analysis is fully done on the reduction stmt invocation. */
6186 if (! vec_stmt)
6188 if (slp_node)
6189 slp_node_instance->reduc_phis = slp_node;
6191 STMT_VINFO_TYPE (stmt_info) = reduc_vec_info_type;
6192 return true;
6195 if (STMT_VINFO_REDUC_TYPE (stmt_info) == FOLD_LEFT_REDUCTION)
6196 /* Leave the scalar phi in place. Note that checking
6197 STMT_VINFO_VEC_REDUCTION_TYPE (as below) only works
6198 for reductions involving a single statement. */
6199 return true;
6201 gimple *reduc_stmt = STMT_VINFO_REDUC_DEF (stmt_info);
6202 if (STMT_VINFO_IN_PATTERN_P (vinfo_for_stmt (reduc_stmt)))
6203 reduc_stmt = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (reduc_stmt));
6205 if (STMT_VINFO_VEC_REDUCTION_TYPE (vinfo_for_stmt (reduc_stmt))
6206 == EXTRACT_LAST_REDUCTION)
6207 /* Leave the scalar phi in place. */
6208 return true;
6210 gcc_assert (is_gimple_assign (reduc_stmt));
6211 for (unsigned k = 1; k < gimple_num_ops (reduc_stmt); ++k)
6213 tree op = gimple_op (reduc_stmt, k);
6214 if (op == gimple_phi_result (stmt))
6215 continue;
6216 if (k == 1
6217 && gimple_assign_rhs_code (reduc_stmt) == COND_EXPR)
6218 continue;
6219 if (!vectype_in
6220 || (GET_MODE_SIZE (SCALAR_TYPE_MODE (TREE_TYPE (vectype_in)))
6221 < GET_MODE_SIZE (SCALAR_TYPE_MODE (TREE_TYPE (op)))))
6222 vectype_in = get_vectype_for_scalar_type (TREE_TYPE (op));
6223 break;
6225 gcc_assert (vectype_in);
6227 if (slp_node)
6228 ncopies = 1;
6229 else
6230 ncopies = vect_get_num_copies (loop_vinfo, vectype_in);
6232 use_operand_p use_p;
6233 gimple *use_stmt;
6234 if (ncopies > 1
6235 && (STMT_VINFO_RELEVANT (vinfo_for_stmt (reduc_stmt))
6236 <= vect_used_only_live)
6237 && single_imm_use (gimple_phi_result (stmt), &use_p, &use_stmt)
6238 && (use_stmt == reduc_stmt
6239 || (STMT_VINFO_RELATED_STMT (vinfo_for_stmt (use_stmt))
6240 == reduc_stmt)))
6241 single_defuse_cycle = true;
6243 /* Create the destination vector */
6244 scalar_dest = gimple_assign_lhs (reduc_stmt);
6245 vec_dest = vect_create_destination_var (scalar_dest, vectype_out);
6247 if (slp_node)
6248 /* The size vect_schedule_slp_instance computes is off for us. */
6249 vec_num = vect_get_num_vectors
6250 (LOOP_VINFO_VECT_FACTOR (loop_vinfo)
6251 * SLP_TREE_SCALAR_STMTS (slp_node).length (),
6252 vectype_in);
6253 else
6254 vec_num = 1;
6256 /* Generate the reduction PHIs upfront. */
6257 prev_phi_info = NULL;
6258 for (j = 0; j < ncopies; j++)
6260 if (j == 0 || !single_defuse_cycle)
6262 for (i = 0; i < vec_num; i++)
6264 /* Create the reduction-phi that defines the reduction
6265 operand. */
6266 gimple *new_phi = create_phi_node (vec_dest, loop->header);
6267 set_vinfo_for_stmt (new_phi,
6268 new_stmt_vec_info (new_phi, loop_vinfo));
6270 if (slp_node)
6271 SLP_TREE_VEC_STMTS (slp_node).quick_push (new_phi);
6272 else
6274 if (j == 0)
6275 STMT_VINFO_VEC_STMT (stmt_info) = *vec_stmt = new_phi;
6276 else
6277 STMT_VINFO_RELATED_STMT (prev_phi_info) = new_phi;
6278 prev_phi_info = vinfo_for_stmt (new_phi);
6284 return true;
6287 /* 1. Is vectorizable reduction? */
6288 /* Not supportable if the reduction variable is used in the loop, unless
6289 it's a reduction chain. */
6290 if (STMT_VINFO_RELEVANT (stmt_info) > vect_used_in_outer
6291 && !REDUC_GROUP_FIRST_ELEMENT (stmt_info))
6292 return false;
6294 /* Reductions that are not used even in an enclosing outer-loop,
6295 are expected to be "live" (used out of the loop). */
6296 if (STMT_VINFO_RELEVANT (stmt_info) == vect_unused_in_scope
6297 && !STMT_VINFO_LIVE_P (stmt_info))
6298 return false;
6300 /* 2. Has this been recognized as a reduction pattern?
6302 Check if STMT represents a pattern that has been recognized
6303 in earlier analysis stages. For stmts that represent a pattern,
6304 the STMT_VINFO_RELATED_STMT field records the last stmt in
6305 the original sequence that constitutes the pattern. */
6307 orig_stmt = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (stmt));
6308 if (orig_stmt)
6310 orig_stmt_info = vinfo_for_stmt (orig_stmt);
6311 gcc_assert (STMT_VINFO_IN_PATTERN_P (orig_stmt_info));
6312 gcc_assert (!STMT_VINFO_IN_PATTERN_P (stmt_info));
6315 /* 3. Check the operands of the operation. The first operands are defined
6316 inside the loop body. The last operand is the reduction variable,
6317 which is defined by the loop-header-phi. */
6319 gcc_assert (is_gimple_assign (stmt));
6321 /* Flatten RHS. */
6322 switch (get_gimple_rhs_class (gimple_assign_rhs_code (stmt)))
6324 case GIMPLE_BINARY_RHS:
6325 code = gimple_assign_rhs_code (stmt);
6326 op_type = TREE_CODE_LENGTH (code);
6327 gcc_assert (op_type == binary_op);
6328 ops[0] = gimple_assign_rhs1 (stmt);
6329 ops[1] = gimple_assign_rhs2 (stmt);
6330 break;
6332 case GIMPLE_TERNARY_RHS:
6333 code = gimple_assign_rhs_code (stmt);
6334 op_type = TREE_CODE_LENGTH (code);
6335 gcc_assert (op_type == ternary_op);
6336 ops[0] = gimple_assign_rhs1 (stmt);
6337 ops[1] = gimple_assign_rhs2 (stmt);
6338 ops[2] = gimple_assign_rhs3 (stmt);
6339 break;
6341 case GIMPLE_UNARY_RHS:
6342 return false;
6344 default:
6345 gcc_unreachable ();
6348 if (code == COND_EXPR && slp_node)
6349 return false;
6351 scalar_dest = gimple_assign_lhs (stmt);
6352 scalar_type = TREE_TYPE (scalar_dest);
6353 if (!POINTER_TYPE_P (scalar_type) && !INTEGRAL_TYPE_P (scalar_type)
6354 && !SCALAR_FLOAT_TYPE_P (scalar_type))
6355 return false;
6357 /* Do not try to vectorize bit-precision reductions. */
6358 if (!type_has_mode_precision_p (scalar_type))
6359 return false;
6361 /* All uses but the last are expected to be defined in the loop.
6362 The last use is the reduction variable. In case of nested cycle this
6363 assumption is not true: we use reduc_index to record the index of the
6364 reduction variable. */
6365 gimple *reduc_def_stmt = NULL;
6366 int reduc_index = -1;
6367 for (i = 0; i < op_type; i++)
6369 /* The condition of COND_EXPR is checked in vectorizable_condition(). */
6370 if (i == 0 && code == COND_EXPR)
6371 continue;
6373 is_simple_use = vect_is_simple_use (ops[i], loop_vinfo,
6374 &dts[i], &tem, &def_stmt);
6375 dt = dts[i];
6376 gcc_assert (is_simple_use);
6377 if (dt == vect_reduction_def)
6379 reduc_def_stmt = def_stmt;
6380 reduc_index = i;
6381 continue;
6383 else if (tem)
6385 /* To properly compute ncopies we are interested in the widest
6386 input type in case we're looking at a widening accumulation. */
6387 if (!vectype_in
6388 || (GET_MODE_SIZE (SCALAR_TYPE_MODE (TREE_TYPE (vectype_in)))
6389 < GET_MODE_SIZE (SCALAR_TYPE_MODE (TREE_TYPE (tem)))))
6390 vectype_in = tem;
6393 if (dt != vect_internal_def
6394 && dt != vect_external_def
6395 && dt != vect_constant_def
6396 && dt != vect_induction_def
6397 && !(dt == vect_nested_cycle && nested_cycle))
6398 return false;
6400 if (dt == vect_nested_cycle)
6402 found_nested_cycle_def = true;
6403 reduc_def_stmt = def_stmt;
6404 reduc_index = i;
6407 if (i == 1 && code == COND_EXPR)
6409 /* Record how value of COND_EXPR is defined. */
6410 if (dt == vect_constant_def)
6412 cond_reduc_dt = dt;
6413 cond_reduc_val = ops[i];
6415 if (dt == vect_induction_def
6416 && def_stmt != NULL
6417 && is_nonwrapping_integer_induction (def_stmt, loop))
6419 cond_reduc_dt = dt;
6420 cond_reduc_def_stmt = def_stmt;
6425 if (!vectype_in)
6426 vectype_in = vectype_out;
6428 /* When vectorizing a reduction chain w/o SLP the reduction PHI is not
6429      directly used in stmt.  */
6430 if (reduc_index == -1)
6432 if (STMT_VINFO_REDUC_TYPE (stmt_info) == FOLD_LEFT_REDUCTION)
6434 if (dump_enabled_p ())
6435 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6436 "in-order reduction chain without SLP.\n");
6437 return false;
6440 if (orig_stmt)
6441 reduc_def_stmt = STMT_VINFO_REDUC_DEF (orig_stmt_info);
6442 else
6443 reduc_def_stmt = STMT_VINFO_REDUC_DEF (stmt_info);
6446 if (! reduc_def_stmt || gimple_code (reduc_def_stmt) != GIMPLE_PHI)
6447 return false;
6449 if (!(reduc_index == -1
6450 || dts[reduc_index] == vect_reduction_def
6451 || dts[reduc_index] == vect_nested_cycle
6452 || ((dts[reduc_index] == vect_internal_def
6453 || dts[reduc_index] == vect_external_def
6454 || dts[reduc_index] == vect_constant_def
6455 || dts[reduc_index] == vect_induction_def)
6456 && nested_cycle && found_nested_cycle_def)))
6458 /* For pattern recognized stmts, orig_stmt might be a reduction,
6459 but some helper statements for the pattern might not, or
6460 might be COND_EXPRs with reduction uses in the condition. */
6461 gcc_assert (orig_stmt);
6462 return false;
6465 stmt_vec_info reduc_def_info = vinfo_for_stmt (reduc_def_stmt);
6466 /* PHIs should not participate in patterns. */
6467 gcc_assert (!STMT_VINFO_RELATED_STMT (reduc_def_info));
6468 enum vect_reduction_type v_reduc_type
6469 = STMT_VINFO_REDUC_TYPE (reduc_def_info);
6470 gimple *tmp = STMT_VINFO_REDUC_DEF (reduc_def_info);
6472 STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info) = v_reduc_type;
6473 /* If we have a condition reduction, see if we can simplify it further. */
6474 if (v_reduc_type == COND_REDUCTION)
6476 /* TODO: We can't yet handle reduction chains, since we need to treat
6477 each COND_EXPR in the chain specially, not just the last one.
6478 E.g. for:
6480 x_1 = PHI <x_3, ...>
6481 x_2 = a_2 ? ... : x_1;
6482 x_3 = a_3 ? ... : x_2;
6484 we're interested in the last element in x_3 for which a_2 || a_3
6485 is true, whereas the current reduction chain handling would
6486 vectorize x_2 as a normal VEC_COND_EXPR and only treat x_3
6487 as a reduction operation. */
6488 if (reduc_index == -1)
6490 if (dump_enabled_p ())
6491 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6492 "conditional reduction chains not supported\n");
6493 return false;
6496 /* vect_is_simple_reduction ensured that operand 2 is the
6497 loop-carried operand. */
6498 gcc_assert (reduc_index == 2);
6500       /* Loop peeling modifies the initial value of the reduction PHI, which
6501 	 makes the reduction stmt that is transformed differ from the
6502 	 original stmt analyzed.  We need to record the reduction code for a
6503 	 CONST_COND_REDUCTION type reduction at the analysis stage, so that
6504 	 it can be used directly at the transform stage.  */
6505 if (STMT_VINFO_VEC_CONST_COND_REDUC_CODE (stmt_info) == MAX_EXPR
6506 || STMT_VINFO_VEC_CONST_COND_REDUC_CODE (stmt_info) == MIN_EXPR)
6508 /* Also set the reduction type to CONST_COND_REDUCTION. */
6509 gcc_assert (cond_reduc_dt == vect_constant_def);
6510 STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info) = CONST_COND_REDUCTION;
6512 else if (direct_internal_fn_supported_p (IFN_FOLD_EXTRACT_LAST,
6513 vectype_in, OPTIMIZE_FOR_SPEED))
6515 if (dump_enabled_p ())
6516 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6517 "optimizing condition reduction with"
6518 " FOLD_EXTRACT_LAST.\n");
6519 STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info) = EXTRACT_LAST_REDUCTION;
6521 else if (cond_reduc_dt == vect_induction_def)
6523 stmt_vec_info cond_stmt_vinfo = vinfo_for_stmt (cond_reduc_def_stmt);
6524 tree base
6525 = STMT_VINFO_LOOP_PHI_EVOLUTION_BASE_UNCHANGED (cond_stmt_vinfo);
6526 tree step = STMT_VINFO_LOOP_PHI_EVOLUTION_PART (cond_stmt_vinfo);
6528 gcc_assert (TREE_CODE (base) == INTEGER_CST
6529 && TREE_CODE (step) == INTEGER_CST);
6530 cond_reduc_val = NULL_TREE;
6531 /* Find a suitable value, for MAX_EXPR below base, for MIN_EXPR
6532 above base; punt if base is the minimum value of the type for
6533 MAX_EXPR or maximum value of the type for MIN_EXPR for now. */
6534 if (tree_int_cst_sgn (step) == -1)
6536 cond_reduc_op_code = MIN_EXPR;
6537 if (tree_int_cst_sgn (base) == -1)
6538 cond_reduc_val = build_int_cst (TREE_TYPE (base), 0);
6539 else if (tree_int_cst_lt (base,
6540 TYPE_MAX_VALUE (TREE_TYPE (base))))
6541 cond_reduc_val
6542 = int_const_binop (PLUS_EXPR, base, integer_one_node);
6544 else
6546 cond_reduc_op_code = MAX_EXPR;
6547 if (tree_int_cst_sgn (base) == 1)
6548 cond_reduc_val = build_int_cst (TREE_TYPE (base), 0);
6549 else if (tree_int_cst_lt (TYPE_MIN_VALUE (TREE_TYPE (base)),
6550 base))
6551 cond_reduc_val
6552 = int_const_binop (MINUS_EXPR, base, integer_one_node);
6554 if (cond_reduc_val)
6556 if (dump_enabled_p ())
6557 dump_printf_loc (MSG_NOTE, vect_location,
6558 "condition expression based on "
6559 "integer induction.\n");
6560 STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info)
6561 = INTEGER_INDUC_COND_REDUCTION;
6564 else if (cond_reduc_dt == vect_constant_def)
6566 enum vect_def_type cond_initial_dt;
6567 gimple *def_stmt = SSA_NAME_DEF_STMT (ops[reduc_index]);
6568 tree cond_initial_val
6569 = PHI_ARG_DEF_FROM_EDGE (def_stmt, loop_preheader_edge (loop));
6571 gcc_assert (cond_reduc_val != NULL_TREE);
6572 vect_is_simple_use (cond_initial_val, loop_vinfo, &cond_initial_dt);
6573 if (cond_initial_dt == vect_constant_def
6574 && types_compatible_p (TREE_TYPE (cond_initial_val),
6575 TREE_TYPE (cond_reduc_val)))
6577 tree e = fold_binary (LE_EXPR, boolean_type_node,
6578 cond_initial_val, cond_reduc_val);
6579 if (e && (integer_onep (e) || integer_zerop (e)))
6581 if (dump_enabled_p ())
6582 dump_printf_loc (MSG_NOTE, vect_location,
6583 "condition expression based on "
6584 "compile time constant.\n");
6585 /* Record reduction code at analysis stage. */
6586 STMT_VINFO_VEC_CONST_COND_REDUC_CODE (stmt_info)
6587 = integer_onep (e) ? MAX_EXPR : MIN_EXPR;
6588 STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info)
6589 = CONST_COND_REDUCTION;
6595 if (orig_stmt)
6596 gcc_assert (tmp == orig_stmt
6597 || (REDUC_GROUP_FIRST_ELEMENT (vinfo_for_stmt (tmp))
6598 == orig_stmt));
6599 else
6600 /* We changed STMT to be the first stmt in reduction chain, hence we
6601 check that in this case the first element in the chain is STMT. */
6602 gcc_assert (stmt == tmp
6603 || REDUC_GROUP_FIRST_ELEMENT (vinfo_for_stmt (tmp)) == stmt);
6605 if (STMT_VINFO_LIVE_P (vinfo_for_stmt (reduc_def_stmt)))
6606 return false;
6608 if (slp_node)
6609 ncopies = 1;
6610 else
6611 ncopies = vect_get_num_copies (loop_vinfo, vectype_in);
6613 gcc_assert (ncopies >= 1);
6615 vec_mode = TYPE_MODE (vectype_in);
6616 poly_uint64 nunits_out = TYPE_VECTOR_SUBPARTS (vectype_out);
6618 if (code == COND_EXPR)
6620 /* Only call during the analysis stage, otherwise we'll lose
6621 STMT_VINFO_TYPE. */
6622 if (!vec_stmt && !vectorizable_condition (stmt, gsi, NULL,
6623 ops[reduc_index], 0, NULL,
6624 cost_vec))
6626 if (dump_enabled_p ())
6627 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6628 "unsupported condition in reduction\n");
6629 return false;
6632 else
6634 /* 4. Supportable by target? */
6636 if (code == LSHIFT_EXPR || code == RSHIFT_EXPR
6637 || code == LROTATE_EXPR || code == RROTATE_EXPR)
6639 	  /* Shifts and rotates are only supported by vectorizable_shift,
6640 not vectorizable_reduction. */
6641 if (dump_enabled_p ())
6642 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6643 "unsupported shift or rotation.\n");
6644 return false;
6647 /* 4.1. check support for the operation in the loop */
6648 optab = optab_for_tree_code (code, vectype_in, optab_default);
6649 if (!optab)
6651 if (dump_enabled_p ())
6652 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6653 "no optab.\n");
6655 return false;
6658 if (optab_handler (optab, vec_mode) == CODE_FOR_nothing)
6660 if (dump_enabled_p ())
6661 dump_printf (MSG_NOTE, "op not supported by target.\n");
6663 if (maybe_ne (GET_MODE_SIZE (vec_mode), UNITS_PER_WORD)
6664 || !vect_worthwhile_without_simd_p (loop_vinfo, code))
6665 return false;
6667 if (dump_enabled_p ())
6668 dump_printf (MSG_NOTE, "proceeding using word mode.\n");
6671 /* Worthwhile without SIMD support? */
6672 if (!VECTOR_MODE_P (TYPE_MODE (vectype_in))
6673 && !vect_worthwhile_without_simd_p (loop_vinfo, code))
6675 if (dump_enabled_p ())
6676 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6677 "not worthwhile without SIMD support.\n");
6679 return false;
6683 /* 4.2. Check support for the epilog operation.
6685 If STMT represents a reduction pattern, then the type of the
6686 reduction variable may be different than the type of the rest
6687 of the arguments. For example, consider the case of accumulation
6688 of shorts into an int accumulator; The original code:
6689 S1: int_a = (int) short_a;
6690 orig_stmt-> S2: int_acc = plus <int_a ,int_acc>;
6692 was replaced with:
6693 STMT: int_acc = widen_sum <short_a, int_acc>
6695 This means that:
6696 1. The tree-code that is used to create the vector operation in the
6697 epilog code (that reduces the partial results) is not the
6698 tree-code of STMT, but is rather the tree-code of the original
6699 stmt from the pattern that STMT is replacing. I.e, in the example
6700 above we want to use 'widen_sum' in the loop, but 'plus' in the
6701 epilog.
6702 2. The type (mode) we use to check available target support
6703 for the vector operation to be created in the *epilog*, is
6704 determined by the type of the reduction variable (in the example
6705 above we'd check this: optab_handler (plus_optab, vect_int_mode])).
6706 However the type (mode) we use to check available target support
6707 for the vector operation to be created *inside the loop*, is
6708 determined by the type of the other arguments to STMT (in the
6709 example we'd check this: optab_handler (widen_sum_optab,
6710 vect_short_mode)).
6712 This is contrary to "regular" reductions, in which the types of all
6713 the arguments are the same as the type of the reduction variable.
6714 For "regular" reductions we can therefore use the same vector type
6715 (and also the same tree-code) when generating the epilog code and
6716 when generating the code inside the loop. */
6718 vect_reduction_type reduction_type
6719 = STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info);
6720 if (orig_stmt
6721 && (reduction_type == TREE_CODE_REDUCTION
6722 || reduction_type == FOLD_LEFT_REDUCTION))
6724 /* This is a reduction pattern: get the vectype from the type of the
6725 reduction variable, and get the tree-code from orig_stmt. */
6726 orig_code = gimple_assign_rhs_code (orig_stmt);
6727 gcc_assert (vectype_out);
6728 vec_mode = TYPE_MODE (vectype_out);
6730 else
6732       /* Regular reduction: the same vectype and tree-code as used for
6733 	 the vector code inside the loop can be used for the epilog code.  */
6734 orig_code = code;
6736 if (code == MINUS_EXPR)
6737 orig_code = PLUS_EXPR;
6739 /* For simple condition reductions, replace with the actual expression
6740 we want to base our reduction around. */
6741 if (reduction_type == CONST_COND_REDUCTION)
6743 orig_code = STMT_VINFO_VEC_CONST_COND_REDUC_CODE (stmt_info);
6744 gcc_assert (orig_code == MAX_EXPR || orig_code == MIN_EXPR);
6746 else if (reduction_type == INTEGER_INDUC_COND_REDUCTION)
6747 orig_code = cond_reduc_op_code;
6750 if (nested_cycle)
6752 def_bb = gimple_bb (reduc_def_stmt);
6753 def_stmt_loop = def_bb->loop_father;
6754 def_arg = PHI_ARG_DEF_FROM_EDGE (reduc_def_stmt,
6755 loop_preheader_edge (def_stmt_loop));
6756 if (TREE_CODE (def_arg) == SSA_NAME
6757 && (def_arg_stmt = SSA_NAME_DEF_STMT (def_arg))
6758 && gimple_code (def_arg_stmt) == GIMPLE_PHI
6759 && flow_bb_inside_loop_p (outer_loop, gimple_bb (def_arg_stmt))
6760 && vinfo_for_stmt (def_arg_stmt)
6761 && STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def_arg_stmt))
6762 == vect_double_reduction_def)
6763 double_reduc = true;
6766 reduc_fn = IFN_LAST;
6768 if (reduction_type == TREE_CODE_REDUCTION
6769 || reduction_type == FOLD_LEFT_REDUCTION
6770 || reduction_type == INTEGER_INDUC_COND_REDUCTION
6771 || reduction_type == CONST_COND_REDUCTION)
6773 if (reduction_type == FOLD_LEFT_REDUCTION
6774 ? fold_left_reduction_fn (orig_code, &reduc_fn)
6775 : reduction_fn_for_scalar_code (orig_code, &reduc_fn))
6777 if (reduc_fn != IFN_LAST
6778 && !direct_internal_fn_supported_p (reduc_fn, vectype_out,
6779 OPTIMIZE_FOR_SPEED))
6781 if (dump_enabled_p ())
6782 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6783 "reduc op not supported by target.\n");
6785 reduc_fn = IFN_LAST;
6788 else
6790 if (!nested_cycle || double_reduc)
6792 if (dump_enabled_p ())
6793 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6794 "no reduc code for scalar code.\n");
6796 return false;
6800 else if (reduction_type == COND_REDUCTION)
6802 int scalar_precision
6803 = GET_MODE_PRECISION (SCALAR_TYPE_MODE (scalar_type));
6804 cr_index_scalar_type = make_unsigned_type (scalar_precision);
6805 cr_index_vector_type = build_vector_type (cr_index_scalar_type,
6806 nunits_out);
6808 if (direct_internal_fn_supported_p (IFN_REDUC_MAX, cr_index_vector_type,
6809 OPTIMIZE_FOR_SPEED))
6810 reduc_fn = IFN_REDUC_MAX;
6813 if (reduction_type != EXTRACT_LAST_REDUCTION
6814 && reduc_fn == IFN_LAST
6815 && !nunits_out.is_constant ())
6817 if (dump_enabled_p ())
6818 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6819 "missing target support for reduction on"
6820 " variable-length vectors.\n");
6821 return false;
6824 if ((double_reduc || reduction_type != TREE_CODE_REDUCTION)
6825 && ncopies > 1)
6827 if (dump_enabled_p ())
6828 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6829 "multiple types in double reduction or condition "
6830 "reduction.\n");
6831 return false;
6834 /* For SLP reductions, see if there is a neutral value we can use. */
6835 tree neutral_op = NULL_TREE;
6836 if (slp_node)
6837 neutral_op = neutral_op_for_slp_reduction
6838 (slp_node_instance->reduc_phis, code,
6839 REDUC_GROUP_FIRST_ELEMENT (stmt_info) != NULL);
6841 if (double_reduc && reduction_type == FOLD_LEFT_REDUCTION)
6843 /* We can't support in-order reductions of code such as this:
6845 for (int i = 0; i < n1; ++i)
6846 for (int j = 0; j < n2; ++j)
6847 l += a[j];
6849 since GCC effectively transforms the loop when vectorizing:
6851 for (int i = 0; i < n1 / VF; ++i)
6852 for (int j = 0; j < n2; ++j)
6853 for (int k = 0; k < VF; ++k)
6854 l += a[j];
6856 which is a reassociation of the original operation. */
6857 if (dump_enabled_p ())
6858 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6859 "in-order double reduction not supported.\n");
6861 return false;
6864 if (reduction_type == FOLD_LEFT_REDUCTION
6865 && slp_node
6866 && !REDUC_GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)))
6868 /* We cannot use in-order reductions in this case because there is
6869 an implicit reassociation of the operations involved. */
6870 if (dump_enabled_p ())
6871 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6872 "in-order unchained SLP reductions not supported.\n");
6873 return false;
6876 /* For double reductions, and for SLP reductions with a neutral value,
6877 we construct a variable-length initial vector by loading a vector
6878 full of the neutral value and then shift-and-inserting the start
6879 values into the low-numbered elements. */
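  /* Illustrative sketch: for an SLP group with initial values { a, b },
     neutral value 0 and a variable-length vector, the initial vector is
     built roughly as

       init = { 0, 0, 0, ... };
       init = .VEC_SHL_INSERT (init, b);
       init = .VEC_SHL_INSERT (init, a);

     giving { a, b, 0, 0, ... }.  */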
6880 if ((double_reduc || neutral_op)
6881 && !nunits_out.is_constant ()
6882 && !direct_internal_fn_supported_p (IFN_VEC_SHL_INSERT,
6883 vectype_out, OPTIMIZE_FOR_SPEED))
6885 if (dump_enabled_p ())
6886 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6887 "reduction on variable-length vectors requires"
6888 " target support for a vector-shift-and-insert"
6889 " operation.\n");
6890 return false;
6893 /* Check extra constraints for variable-length unchained SLP reductions. */
6894 if (STMT_SLP_TYPE (stmt_info)
6895 && !REDUC_GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt))
6896 && !nunits_out.is_constant ())
6898 /* We checked above that we could build the initial vector when
6899 there's a neutral element value. Check here for the case in
6900 which each SLP statement has its own initial value and in which
6901 that value needs to be repeated for every instance of the
6902 statement within the initial vector. */
6903 unsigned int group_size = SLP_TREE_SCALAR_STMTS (slp_node).length ();
6904 scalar_mode elt_mode = SCALAR_TYPE_MODE (TREE_TYPE (vectype_out));
6905 if (!neutral_op
6906 && !can_duplicate_and_interleave_p (group_size, elt_mode))
6908 if (dump_enabled_p ())
6909 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6910 "unsupported form of SLP reduction for"
6911 " variable-length vectors: cannot build"
6912 " initial vector.\n");
6913 return false;
6915 /* The epilogue code relies on the number of elements being a multiple
6916 of the group size. The duplicate-and-interleave approach to setting
6917 	 up the initial vector does too.  */
6918 if (!multiple_p (nunits_out, group_size))
6920 if (dump_enabled_p ())
6921 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6922 "unsupported form of SLP reduction for"
6923 " variable-length vectors: the vector size"
6924 " is not a multiple of the number of results.\n");
6925 return false;
6929   /* In case of widening multiplication by a constant, we update the type
6930 of the constant to be the type of the other operand. We check that the
6931 constant fits the type in the pattern recognition pass. */
6932 if (code == DOT_PROD_EXPR
6933 && !types_compatible_p (TREE_TYPE (ops[0]), TREE_TYPE (ops[1])))
6935 if (TREE_CODE (ops[0]) == INTEGER_CST)
6936 ops[0] = fold_convert (TREE_TYPE (ops[1]), ops[0]);
6937 else if (TREE_CODE (ops[1]) == INTEGER_CST)
6938 ops[1] = fold_convert (TREE_TYPE (ops[0]), ops[1]);
6939 else
6941 if (dump_enabled_p ())
6942 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6943 "invalid types in dot-prod\n");
6945 return false;
6949 if (reduction_type == COND_REDUCTION)
6951 widest_int ni;
6953 if (! max_loop_iterations (loop, &ni))
6955 if (dump_enabled_p ())
6956 dump_printf_loc (MSG_NOTE, vect_location,
6957 "loop count not known, cannot create cond "
6958 "reduction.\n");
6959 return false;
6961 /* Convert backedges to iterations. */
6962 ni += 1;
6964       /* The additional index will have the same type as the condition.  Check
6965 	 that the iteration count fits into this type less one (because the
6966 	 zero slot is reserved for when there are no matches).  */
6967 tree max_index = TYPE_MAX_VALUE (cr_index_scalar_type);
6968 if (wi::geu_p (ni, wi::to_widest (max_index)))
6970 if (dump_enabled_p ())
6971 dump_printf_loc (MSG_NOTE, vect_location,
6972 "loop size is greater than data size.\n");
6973 return false;
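      /* Illustrative example: if the reduction operates on a 16-bit type,
	 the index vector also uses a 16-bit element type, so the check
	 above allows at most 65534 iterations (65535 minus the zero slot
	 reserved for "no match").  */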
6977 /* In case the vectorization factor (VF) is bigger than the number
6978 of elements that we can fit in a vectype (nunits), we have to generate
6979 more than one vector stmt - i.e - we need to "unroll" the
6980 vector stmt by a factor VF/nunits. For more details see documentation
6981 in vectorizable_operation. */
6983 /* If the reduction is used in an outer loop we need to generate
6984 VF intermediate results, like so (e.g. for ncopies=2):
6985 r0 = phi (init, r0)
6986 r1 = phi (init, r1)
6987 r0 = x0 + r0;
6988 r1 = x1 + r1;
6989 (i.e. we generate VF results in 2 registers).
6990 In this case we have a separate def-use cycle for each copy, and therefore
6991 for each copy we get the vector def for the reduction variable from the
6992 respective phi node created for this copy.
6994 Otherwise (the reduction is unused in the loop nest), we can combine
6995 together intermediate results, like so (e.g. for ncopies=2):
6996 r = phi (init, r)
6997 r = x0 + r;
6998 r = x1 + r;
6999 (i.e. we generate VF/2 results in a single register).
7000 In this case for each copy we get the vector def for the reduction variable
7001 from the vectorized reduction operation generated in the previous iteration.
7003 This only works when we see both the reduction PHI and its only consumer
7004 in vectorizable_reduction and there are no intermediate stmts
7005 participating. */
7006 use_operand_p use_p;
7007 gimple *use_stmt;
7008 if (ncopies > 1
7009 && (STMT_VINFO_RELEVANT (stmt_info) <= vect_used_only_live)
7010 && single_imm_use (gimple_phi_result (reduc_def_stmt), &use_p, &use_stmt)
7011 && (use_stmt == stmt
7012 || STMT_VINFO_RELATED_STMT (vinfo_for_stmt (use_stmt)) == stmt))
7014 single_defuse_cycle = true;
7015 epilog_copies = 1;
7017 else
7018 epilog_copies = ncopies;
7020 /* If the reduction stmt is one of the patterns that have lane
7021 reduction embedded we cannot handle the case of ! single_defuse_cycle. */
7022 if ((ncopies > 1
7023 && ! single_defuse_cycle)
7024 && (code == DOT_PROD_EXPR
7025 || code == WIDEN_SUM_EXPR
7026 || code == SAD_EXPR))
7028 if (dump_enabled_p ())
7029 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7030 "multi def-use cycle not possible for lane-reducing "
7031 "reduction operation\n");
7032 return false;
7035 if (slp_node)
7036 vec_num = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node);
7037 else
7038 vec_num = 1;
7040 internal_fn cond_fn = get_conditional_internal_fn (code);
7041 vec_loop_masks *masks = &LOOP_VINFO_MASKS (loop_vinfo);
7043 if (!vec_stmt) /* transformation not required. */
7045 vect_model_reduction_cost (stmt_info, reduc_fn, ncopies, cost_vec);
7046 if (loop_vinfo && LOOP_VINFO_CAN_FULLY_MASK_P (loop_vinfo))
7048 if (reduction_type != FOLD_LEFT_REDUCTION
7049 && (cond_fn == IFN_LAST
7050 || !direct_internal_fn_supported_p (cond_fn, vectype_in,
7051 OPTIMIZE_FOR_SPEED)))
7053 if (dump_enabled_p ())
7054 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7055 "can't use a fully-masked loop because no"
7056 " conditional operation is available.\n");
7057 LOOP_VINFO_CAN_FULLY_MASK_P (loop_vinfo) = false;
7059 else if (reduc_index == -1)
7061 if (dump_enabled_p ())
7062 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7063 "can't use a fully-masked loop for chained"
7064 " reductions.\n");
7065 LOOP_VINFO_CAN_FULLY_MASK_P (loop_vinfo) = false;
7067 else
7068 vect_record_loop_mask (loop_vinfo, masks, ncopies * vec_num,
7069 vectype_in);
7071 if (dump_enabled_p ()
7072 && reduction_type == FOLD_LEFT_REDUCTION)
7073 dump_printf_loc (MSG_NOTE, vect_location,
7074 "using an in-order (fold-left) reduction.\n");
7075 STMT_VINFO_TYPE (stmt_info) = reduc_vec_info_type;
7076 return true;
7079 /* Transform. */
7081 if (dump_enabled_p ())
7082 dump_printf_loc (MSG_NOTE, vect_location, "transform reduction.\n");
7084 /* FORNOW: Multiple types are not supported for condition. */
7085 if (code == COND_EXPR)
7086 gcc_assert (ncopies == 1);
7088 bool masked_loop_p = LOOP_VINFO_FULLY_MASKED_P (loop_vinfo);
7090 if (reduction_type == FOLD_LEFT_REDUCTION)
7091 return vectorize_fold_left_reduction
7092 (stmt, gsi, vec_stmt, slp_node, reduc_def_stmt, code,
7093 reduc_fn, ops, vectype_in, reduc_index, masks);
7095 if (reduction_type == EXTRACT_LAST_REDUCTION)
7097 gcc_assert (!slp_node);
7098 return vectorizable_condition (stmt, gsi, vec_stmt,
7099 NULL, reduc_index, NULL, NULL);
7102 /* Create the destination vector */
7103 vec_dest = vect_create_destination_var (scalar_dest, vectype_out);
7105 prev_stmt_info = NULL;
7106 prev_phi_info = NULL;
7107 if (!slp_node)
7109 vec_oprnds0.create (1);
7110 vec_oprnds1.create (1);
7111 if (op_type == ternary_op)
7112 vec_oprnds2.create (1);
7115 phis.create (vec_num);
7116 vect_defs.create (vec_num);
7117 if (!slp_node)
7118 vect_defs.quick_push (NULL_TREE);
7120 if (slp_node)
7121 phis.splice (SLP_TREE_VEC_STMTS (slp_node_instance->reduc_phis));
7122 else
7123 phis.quick_push (STMT_VINFO_VEC_STMT (vinfo_for_stmt (reduc_def_stmt)));
7125 for (j = 0; j < ncopies; j++)
7127 if (code == COND_EXPR)
7129 gcc_assert (!slp_node);
7130 vectorizable_condition (stmt, gsi, vec_stmt,
7131 PHI_RESULT (phis[0]),
7132 reduc_index, NULL, NULL);
7133 /* Multiple types are not supported for condition. */
7134 break;
7137 /* Handle uses. */
7138 if (j == 0)
7140 if (slp_node)
7142 /* Get vec defs for all the operands except the reduction index,
7143 ensuring the ordering of the ops in the vector is kept. */
7144 auto_vec<tree, 3> slp_ops;
7145 auto_vec<vec<tree>, 3> vec_defs;
7147 slp_ops.quick_push (ops[0]);
7148 slp_ops.quick_push (ops[1]);
7149 if (op_type == ternary_op)
7150 slp_ops.quick_push (ops[2]);
7152 vect_get_slp_defs (slp_ops, slp_node, &vec_defs);
7154 vec_oprnds0.safe_splice (vec_defs[0]);
7155 vec_defs[0].release ();
7156 vec_oprnds1.safe_splice (vec_defs[1]);
7157 vec_defs[1].release ();
7158 if (op_type == ternary_op)
7160 vec_oprnds2.safe_splice (vec_defs[2]);
7161 vec_defs[2].release ();
7164 else
7166 vec_oprnds0.quick_push
7167 (vect_get_vec_def_for_operand (ops[0], stmt));
7168 vec_oprnds1.quick_push
7169 (vect_get_vec_def_for_operand (ops[1], stmt));
7170 if (op_type == ternary_op)
7171 vec_oprnds2.quick_push
7172 (vect_get_vec_def_for_operand (ops[2], stmt));
7175 else
7177 if (!slp_node)
7179 gcc_assert (reduc_index != -1 || ! single_defuse_cycle);
7181 if (single_defuse_cycle && reduc_index == 0)
7182 vec_oprnds0[0] = gimple_get_lhs (new_stmt);
7183 else
7184 vec_oprnds0[0]
7185 = vect_get_vec_def_for_stmt_copy (dts[0], vec_oprnds0[0]);
7186 if (single_defuse_cycle && reduc_index == 1)
7187 vec_oprnds1[0] = gimple_get_lhs (new_stmt);
7188 else
7189 vec_oprnds1[0]
7190 = vect_get_vec_def_for_stmt_copy (dts[1], vec_oprnds1[0]);
7191 if (op_type == ternary_op)
7193 if (single_defuse_cycle && reduc_index == 2)
7194 vec_oprnds2[0] = gimple_get_lhs (new_stmt);
7195 else
7196 vec_oprnds2[0]
7197 = vect_get_vec_def_for_stmt_copy (dts[2], vec_oprnds2[0]);
7202 FOR_EACH_VEC_ELT (vec_oprnds0, i, def0)
7204 tree vop[3] = { def0, vec_oprnds1[i], NULL_TREE };
7205 if (masked_loop_p)
7207 /* Make sure that the reduction accumulator is vop[0]. */
7208 if (reduc_index == 1)
7210 gcc_assert (commutative_tree_code (code));
7211 std::swap (vop[0], vop[1]);
7213 tree mask = vect_get_loop_mask (gsi, masks, vec_num * ncopies,
7214 vectype_in, i * ncopies + j);
7215 gcall *call = gimple_build_call_internal (cond_fn, 4, mask,
7216 vop[0], vop[1],
7217 vop[0]);
7218 new_temp = make_ssa_name (vec_dest, call);
7219 gimple_call_set_lhs (call, new_temp);
7220 gimple_call_set_nothrow (call, true);
7221 new_stmt = call;
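/* For a PLUS_EXPR reduction this is expected to produce something like
   (SSA names invented for illustration):
     res_1 = .COND_ADD (loop_mask_2, acc_3, vec_4, acc_3);
   so active lanes accumulate and inactive lanes pass the accumulator
   through unchanged.  */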
7223 else
7225 if (op_type == ternary_op)
7226 vop[2] = vec_oprnds2[i];
7228 new_temp = make_ssa_name (vec_dest, new_stmt);
7229 new_stmt = gimple_build_assign (new_temp, code,
7230 vop[0], vop[1], vop[2]);
7232 vect_finish_stmt_generation (stmt, new_stmt, gsi);
7234 if (slp_node)
7236 SLP_TREE_VEC_STMTS (slp_node).quick_push (new_stmt);
7237 vect_defs.quick_push (new_temp);
7239 else
7240 vect_defs[0] = new_temp;
7243 if (slp_node)
7244 continue;
7246 if (j == 0)
7247 STMT_VINFO_VEC_STMT (stmt_info) = *vec_stmt = new_stmt;
7248 else
7249 STMT_VINFO_RELATED_STMT (prev_stmt_info) = new_stmt;
7251 prev_stmt_info = vinfo_for_stmt (new_stmt);
7254 /* Finalize the reduction-phi (set its arguments) and create the
7255 epilog reduction code. */
7256 if ((!single_defuse_cycle || code == COND_EXPR) && !slp_node)
7257 vect_defs[0] = gimple_get_lhs (*vec_stmt);
7259 vect_create_epilog_for_reduction (vect_defs, stmt, reduc_def_stmt,
7260 epilog_copies, reduc_fn, phis,
7261 double_reduc, slp_node, slp_node_instance,
7262 cond_reduc_val, cond_reduc_op_code,
7263 neutral_op);
7265 return true;
7268 /* Function vect_min_worthwhile_factor.
7270 For a loop where we could vectorize the operation indicated by CODE,
7271 return the minimum vectorization factor that makes it worthwhile
7272 to use generic vectors. */
7273 static unsigned int
7274 vect_min_worthwhile_factor (enum tree_code code)
7276 switch (code)
7278 case PLUS_EXPR:
7279 case MINUS_EXPR:
7280 case NEGATE_EXPR:
7281 return 4;
7283 case BIT_AND_EXPR:
7284 case BIT_IOR_EXPR:
7285 case BIT_XOR_EXPR:
7286 case BIT_NOT_EXPR:
7287 return 2;
7289 default:
7290 return INT_MAX;
7294 /* Return true if VINFO indicates we are doing loop vectorization and if
7295 it is worth decomposing CODE operations into scalar operations for
7296 that loop's vectorization factor. */
7298 bool
7299 vect_worthwhile_without_simd_p (vec_info *vinfo, tree_code code)
7301 loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo);
7302 unsigned HOST_WIDE_INT value;
7303 return (loop_vinfo
7304 && LOOP_VINFO_VECT_FACTOR (loop_vinfo).is_constant (&value)
7305 && value >= vect_min_worthwhile_factor (code));
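/* For example (assuming a constant vectorization factor): with VF == 4
   both PLUS_EXPR (threshold 4) and BIT_AND_EXPR (threshold 2) count as
   worthwhile, whereas with VF == 2 only the bitwise codes do.  */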
7308 /* Function vectorizable_induction
7310 Check if PHI performs an induction computation that can be vectorized.
7311 If VEC_STMT is also passed, vectorize the induction PHI: create a vectorized
7312 phi to replace it, put it in VEC_STMT, and add it to the same basic block.
7313 Return FALSE if not a vectorizable STMT, TRUE otherwise. */
7315 bool
7316 vectorizable_induction (gimple *phi,
7317 gimple_stmt_iterator *gsi ATTRIBUTE_UNUSED,
7318 gimple **vec_stmt, slp_tree slp_node,
7319 stmt_vector_for_cost *cost_vec)
7321 stmt_vec_info stmt_info = vinfo_for_stmt (phi);
7322 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
7323 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
7324 unsigned ncopies;
7325 bool nested_in_vect_loop = false;
7326 struct loop *iv_loop;
7327 tree vec_def;
7328 edge pe = loop_preheader_edge (loop);
7329 basic_block new_bb;
7330 tree new_vec, vec_init, vec_step, t;
7331 tree new_name;
7332 gimple *new_stmt;
7333 gphi *induction_phi;
7334 tree induc_def, vec_dest;
7335 tree init_expr, step_expr;
7336 poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
7337 unsigned i;
7338 tree expr;
7339 gimple_seq stmts;
7340 imm_use_iterator imm_iter;
7341 use_operand_p use_p;
7342 gimple *exit_phi;
7343 edge latch_e;
7344 tree loop_arg;
7345 gimple_stmt_iterator si;
7346 basic_block bb = gimple_bb (phi);
7348 if (gimple_code (phi) != GIMPLE_PHI)
7349 return false;
7351 if (!STMT_VINFO_RELEVANT_P (stmt_info))
7352 return false;
7354 /* Make sure it was recognized as induction computation. */
7355 if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_induction_def)
7356 return false;
7358 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
7359 poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (vectype);
7361 if (slp_node)
7362 ncopies = 1;
7363 else
7364 ncopies = vect_get_num_copies (loop_vinfo, vectype);
7365 gcc_assert (ncopies >= 1);
7367 /* FORNOW. These restrictions should be relaxed. */
7368 if (nested_in_vect_loop_p (loop, phi))
7370 imm_use_iterator imm_iter;
7371 use_operand_p use_p;
7372 gimple *exit_phi;
7373 edge latch_e;
7374 tree loop_arg;
7376 if (ncopies > 1)
7378 if (dump_enabled_p ())
7379 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7380 "multiple types in nested loop.\n");
7381 return false;
7384 /* FORNOW: outer loop induction with SLP not supported. */
7385 if (STMT_SLP_TYPE (stmt_info))
7386 return false;
7388 exit_phi = NULL;
7389 latch_e = loop_latch_edge (loop->inner);
7390 loop_arg = PHI_ARG_DEF_FROM_EDGE (phi, latch_e);
7391 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, loop_arg)
7393 gimple *use_stmt = USE_STMT (use_p);
7394 if (is_gimple_debug (use_stmt))
7395 continue;
7397 if (!flow_bb_inside_loop_p (loop->inner, gimple_bb (use_stmt)))
7399 exit_phi = use_stmt;
7400 break;
7403 if (exit_phi)
7405 stmt_vec_info exit_phi_vinfo = vinfo_for_stmt (exit_phi);
7406 if (!(STMT_VINFO_RELEVANT_P (exit_phi_vinfo)
7407 && !STMT_VINFO_LIVE_P (exit_phi_vinfo)))
7409 if (dump_enabled_p ())
7410 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7411 "inner-loop induction only used outside "
7412 "of the outer vectorized loop.\n");
7413 return false;
7417 nested_in_vect_loop = true;
7418 iv_loop = loop->inner;
7420 else
7421 iv_loop = loop;
7422 gcc_assert (iv_loop == (gimple_bb (phi))->loop_father);
7424 if (slp_node && !nunits.is_constant ())
7426 /* The current SLP code creates the initial value element-by-element. */
7427 if (dump_enabled_p ())
7428 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7429 "SLP induction not supported for variable-length"
7430 " vectors.\n");
7431 return false;
7434 if (!vec_stmt) /* transformation not required. */
7436 STMT_VINFO_TYPE (stmt_info) = induc_vec_info_type;
7437 DUMP_VECT_SCOPE ("vectorizable_induction");
7438 vect_model_induction_cost (stmt_info, ncopies, cost_vec);
7439 return true;
7442 /* Transform. */
7444 /* Compute a vector variable, initialized with the first VF values of
7445 the induction variable. E.g., for an iv with IV_PHI='X' and
7446 evolution S, for a vector of 4 units, we want to compute:
7447 [X, X + S, X + 2*S, X + 3*S]. */
7449 if (dump_enabled_p ())
7450 dump_printf_loc (MSG_NOTE, vect_location, "transform induction phi.\n");
7452 latch_e = loop_latch_edge (iv_loop);
7453 loop_arg = PHI_ARG_DEF_FROM_EDGE (phi, latch_e);
7455 step_expr = STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_info);
7456 gcc_assert (step_expr != NULL_TREE);
7458 pe = loop_preheader_edge (iv_loop);
7459 init_expr = PHI_ARG_DEF_FROM_EDGE (phi,
7460 loop_preheader_edge (iv_loop));
7462 stmts = NULL;
7463 if (!nested_in_vect_loop)
7465 /* Convert the initial value to the desired type. */
7466 tree new_type = TREE_TYPE (vectype);
7467 init_expr = gimple_convert (&stmts, new_type, init_expr);
7469 /* If we are using the loop mask to "peel" for alignment then we need
7470 to adjust the start value here. */
7471 tree skip_niters = LOOP_VINFO_MASK_SKIP_NITERS (loop_vinfo);
7472 if (skip_niters != NULL_TREE)
7474 if (FLOAT_TYPE_P (vectype))
7475 skip_niters = gimple_build (&stmts, FLOAT_EXPR, new_type,
7476 skip_niters);
7477 else
7478 skip_niters = gimple_convert (&stmts, new_type, skip_niters);
7479 tree skip_step = gimple_build (&stmts, MULT_EXPR, new_type,
7480 skip_niters, step_expr);
7481 init_expr = gimple_build (&stmts, MINUS_EXPR, new_type,
7482 init_expr, skip_step);
7486 /* Convert the step to the desired type. */
7487 step_expr = gimple_convert (&stmts, TREE_TYPE (vectype), step_expr);
7489 if (stmts)
7491 new_bb = gsi_insert_seq_on_edge_immediate (pe, stmts);
7492 gcc_assert (!new_bb);
7495 /* Find the first insertion point in the BB. */
7496 si = gsi_after_labels (bb);
7498 /* For SLP induction we have to generate several IVs as for example
7499 with group size 3 we need [i, i, i, i + S] [i + S, i + S, i + 2*S, i + 2*S]
7500 [i + 2*S, i + 3*S, i + 3*S, i + 3*S]. The step is the same uniform
7501 [VF*S, VF*S, VF*S, VF*S] for all. */
7502 if (slp_node)
7504 /* Enforced above. */
7505 unsigned int const_nunits = nunits.to_constant ();
7507 /* Generate [VF*S, VF*S, ... ]. */
7508 if (SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr)))
7510 expr = build_int_cst (integer_type_node, vf);
7511 expr = fold_convert (TREE_TYPE (step_expr), expr);
7513 else
7514 expr = build_int_cst (TREE_TYPE (step_expr), vf);
7515 new_name = fold_build2 (MULT_EXPR, TREE_TYPE (step_expr),
7516 expr, step_expr);
7517 if (! CONSTANT_CLASS_P (new_name))
7518 new_name = vect_init_vector (phi, new_name,
7519 TREE_TYPE (step_expr), NULL);
7520 new_vec = build_vector_from_val (vectype, new_name);
7521 vec_step = vect_init_vector (phi, new_vec, vectype, NULL);
7523 /* Now generate the IVs. */
7524 unsigned group_size = SLP_TREE_SCALAR_STMTS (slp_node).length ();
7525 unsigned nvects = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node);
7526 unsigned elts = const_nunits * nvects;
7527 unsigned nivs = least_common_multiple (group_size,
7528 const_nunits) / const_nunits;
7529 gcc_assert (elts % group_size == 0);
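/* Tying this to the example in the comment above (values illustrative):
   with GROUP_SIZE == 3 and CONST_NUNITS == 4, NIVS == lcm (3, 4) / 4 == 3,
   matching the three initial vectors shown there, and with NVECTS == 3,
   ELTS == 12 is indeed a multiple of the group size.  */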
7530 tree elt = init_expr;
7531 unsigned ivn;
7532 for (ivn = 0; ivn < nivs; ++ivn)
7534 tree_vector_builder elts (vectype, const_nunits, 1);
7535 stmts = NULL;
7536 for (unsigned eltn = 0; eltn < const_nunits; ++eltn)
7538 if (ivn*const_nunits + eltn >= group_size
7539 && (ivn * const_nunits + eltn) % group_size == 0)
7540 elt = gimple_build (&stmts, PLUS_EXPR, TREE_TYPE (elt),
7541 elt, step_expr);
7542 elts.quick_push (elt);
7544 vec_init = gimple_build_vector (&stmts, &elts);
7545 if (stmts)
7547 new_bb = gsi_insert_seq_on_edge_immediate (pe, stmts);
7548 gcc_assert (!new_bb);
7551 /* Create the induction-phi that defines the induction-operand. */
7552 vec_dest = vect_get_new_vect_var (vectype, vect_simple_var, "vec_iv_");
7553 induction_phi = create_phi_node (vec_dest, iv_loop->header);
7554 set_vinfo_for_stmt (induction_phi,
7555 new_stmt_vec_info (induction_phi, loop_vinfo));
7556 induc_def = PHI_RESULT (induction_phi);
7558 /* Create the iv update inside the loop */
7559 vec_def = make_ssa_name (vec_dest);
7560 new_stmt = gimple_build_assign (vec_def, PLUS_EXPR, induc_def, vec_step);
7561 gsi_insert_before (&si, new_stmt, GSI_SAME_STMT);
7562 set_vinfo_for_stmt (new_stmt, new_stmt_vec_info (new_stmt, loop_vinfo));
7564 /* Set the arguments of the phi node: */
7565 add_phi_arg (induction_phi, vec_init, pe, UNKNOWN_LOCATION);
7566 add_phi_arg (induction_phi, vec_def, loop_latch_edge (iv_loop),
7567 UNKNOWN_LOCATION);
7569 SLP_TREE_VEC_STMTS (slp_node).quick_push (induction_phi);
7572 /* Re-use IVs when we can. */
7573 if (ivn < nvects)
7575 unsigned vfp
7576 = least_common_multiple (group_size, const_nunits) / group_size;
7577 /* Generate [VF'*S, VF'*S, ... ]. */
7578 if (SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr)))
7580 expr = build_int_cst (integer_type_node, vfp);
7581 expr = fold_convert (TREE_TYPE (step_expr), expr);
7583 else
7584 expr = build_int_cst (TREE_TYPE (step_expr), vfp);
7585 new_name = fold_build2 (MULT_EXPR, TREE_TYPE (step_expr),
7586 expr, step_expr);
7587 if (! CONSTANT_CLASS_P (new_name))
7588 new_name = vect_init_vector (phi, new_name,
7589 TREE_TYPE (step_expr), NULL);
7590 new_vec = build_vector_from_val (vectype, new_name);
7591 vec_step = vect_init_vector (phi, new_vec, vectype, NULL);
7592 for (; ivn < nvects; ++ivn)
7594 gimple *iv = SLP_TREE_VEC_STMTS (slp_node)[ivn - nivs];
7595 tree def;
7596 if (gimple_code (iv) == GIMPLE_PHI)
7597 def = gimple_phi_result (iv);
7598 else
7599 def = gimple_assign_lhs (iv);
7600 new_stmt = gimple_build_assign (make_ssa_name (vectype),
7601 PLUS_EXPR,
7602 def, vec_step);
7603 if (gimple_code (iv) == GIMPLE_PHI)
7604 gsi_insert_before (&si, new_stmt, GSI_SAME_STMT);
7605 else
7607 gimple_stmt_iterator tgsi = gsi_for_stmt (iv);
7608 gsi_insert_after (&tgsi, new_stmt, GSI_CONTINUE_LINKING);
7610 set_vinfo_for_stmt (new_stmt,
7611 new_stmt_vec_info (new_stmt, loop_vinfo));
7612 SLP_TREE_VEC_STMTS (slp_node).quick_push (new_stmt);
7616 return true;
7619 /* Create the vector that holds the initial_value of the induction. */
7620 if (nested_in_vect_loop)
7622 /* iv_loop is nested in the loop to be vectorized. init_expr had already
7623 been created during vectorization of previous stmts. We obtain it
7624 from the STMT_VINFO_VEC_STMT of the defining stmt. */
7625 vec_init = vect_get_vec_def_for_operand (init_expr, phi);
7626 /* If the initial value is not of proper type, convert it. */
7627 if (!useless_type_conversion_p (vectype, TREE_TYPE (vec_init)))
7629 new_stmt
7630 = gimple_build_assign (vect_get_new_ssa_name (vectype,
7631 vect_simple_var,
7632 "vec_iv_"),
7633 VIEW_CONVERT_EXPR,
7634 build1 (VIEW_CONVERT_EXPR, vectype,
7635 vec_init));
7636 vec_init = gimple_assign_lhs (new_stmt);
7637 new_bb = gsi_insert_on_edge_immediate (loop_preheader_edge (iv_loop),
7638 new_stmt);
7639 gcc_assert (!new_bb);
7640 set_vinfo_for_stmt (new_stmt,
7641 new_stmt_vec_info (new_stmt, loop_vinfo));
7644 else
7646 /* iv_loop is the loop to be vectorized. Create:
7647 vec_init = [X, X+S, X+2*S, X+3*S] (S = step_expr, X = init_expr) */
7648 stmts = NULL;
7649 new_name = gimple_convert (&stmts, TREE_TYPE (vectype), init_expr);
7651 unsigned HOST_WIDE_INT const_nunits;
7652 if (nunits.is_constant (&const_nunits))
7654 tree_vector_builder elts (vectype, const_nunits, 1);
7655 elts.quick_push (new_name);
7656 for (i = 1; i < const_nunits; i++)
7658 /* Create: new_name_i = new_name + step_expr */
7659 new_name = gimple_build (&stmts, PLUS_EXPR, TREE_TYPE (new_name),
7660 new_name, step_expr);
7661 elts.quick_push (new_name);
7663 /* Create a vector from [new_name_0, new_name_1, ...,
7664 new_name_nunits-1] */
7665 vec_init = gimple_build_vector (&stmts, &elts);
7667 else if (INTEGRAL_TYPE_P (TREE_TYPE (step_expr)))
7668 /* Build the initial value directly from a VEC_SERIES_EXPR. */
7669 vec_init = gimple_build (&stmts, VEC_SERIES_EXPR, vectype,
7670 new_name, step_expr);
7671 else
7673 /* Build:
7674 [base, base, base, ...]
7675 + (vectype) [0, 1, 2, ...] * [step, step, step, ...]. */
7676 gcc_assert (SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr)));
7677 gcc_assert (flag_associative_math);
7678 tree index = build_index_vector (vectype, 0, 1);
7679 tree base_vec = gimple_build_vector_from_val (&stmts, vectype,
7680 new_name);
7681 tree step_vec = gimple_build_vector_from_val (&stmts, vectype,
7682 step_expr);
7683 vec_init = gimple_build (&stmts, FLOAT_EXPR, vectype, index);
7684 vec_init = gimple_build (&stmts, MULT_EXPR, vectype,
7685 vec_init, step_vec);
7686 vec_init = gimple_build (&stmts, PLUS_EXPR, vectype,
7687 vec_init, base_vec);
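/* E.g. (values illustrative) for a 4-element float IV with
   INIT_EXPR == 1.0 and STEP_EXPR == 0.5 this computes
   [0, 1, 2, 3] * [0.5, 0.5, 0.5, 0.5] + [1.0, 1.0, 1.0, 1.0]
   == [1.0, 1.5, 2.0, 2.5].  */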
7690 if (stmts)
7692 new_bb = gsi_insert_seq_on_edge_immediate (pe, stmts);
7693 gcc_assert (!new_bb);
7698 /* Create the vector that holds the step of the induction. */
7699 if (nested_in_vect_loop)
7700 /* iv_loop is nested in the loop to be vectorized. Generate:
7701 vec_step = [S, S, S, S] */
7702 new_name = step_expr;
7703 else
7705 /* iv_loop is the loop to be vectorized. Generate:
7706 vec_step = [VF*S, VF*S, VF*S, VF*S] */
7707 gimple_seq seq = NULL;
7708 if (SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr)))
7710 expr = build_int_cst (integer_type_node, vf);
7711 expr = gimple_build (&seq, FLOAT_EXPR, TREE_TYPE (step_expr), expr);
7713 else
7714 expr = build_int_cst (TREE_TYPE (step_expr), vf);
7715 new_name = gimple_build (&seq, MULT_EXPR, TREE_TYPE (step_expr),
7716 expr, step_expr);
7717 if (seq)
7719 new_bb = gsi_insert_seq_on_edge_immediate (pe, seq);
7720 gcc_assert (!new_bb);
7724 t = unshare_expr (new_name);
7725 gcc_assert (CONSTANT_CLASS_P (new_name)
7726 || TREE_CODE (new_name) == SSA_NAME);
7727 new_vec = build_vector_from_val (vectype, t);
7728 vec_step = vect_init_vector (phi, new_vec, vectype, NULL);
7731 /* Create the following def-use cycle:
7732 loop prolog:
7733 vec_init = ...
7734 vec_step = ...
7735 loop:
7736 vec_iv = PHI <vec_init, vec_loop>
7738 STMT
7740 vec_loop = vec_iv + vec_step; */
7742 /* Create the induction-phi that defines the induction-operand. */
7743 vec_dest = vect_get_new_vect_var (vectype, vect_simple_var, "vec_iv_");
7744 induction_phi = create_phi_node (vec_dest, iv_loop->header);
7745 set_vinfo_for_stmt (induction_phi,
7746 new_stmt_vec_info (induction_phi, loop_vinfo));
7747 induc_def = PHI_RESULT (induction_phi);
7749 /* Create the iv update inside the loop */
7750 vec_def = make_ssa_name (vec_dest);
7751 new_stmt = gimple_build_assign (vec_def, PLUS_EXPR, induc_def, vec_step);
7752 gsi_insert_before (&si, new_stmt, GSI_SAME_STMT);
7753 set_vinfo_for_stmt (new_stmt, new_stmt_vec_info (new_stmt, loop_vinfo));
7755 /* Set the arguments of the phi node: */
7756 add_phi_arg (induction_phi, vec_init, pe, UNKNOWN_LOCATION);
7757 add_phi_arg (induction_phi, vec_def, loop_latch_edge (iv_loop),
7758 UNKNOWN_LOCATION);
7760 STMT_VINFO_VEC_STMT (stmt_info) = *vec_stmt = induction_phi;
7762 /* In case the vectorization factor (VF) is bigger than the number
7763 of elements that we can fit in a vectype (nunits), we have to generate
7764 more than one vector stmt - i.e. - we need to "unroll" the
7765 vector stmt by a factor VF/nunits. For more details see documentation
7766 in vectorizable_operation. */
7768 if (ncopies > 1)
7770 gimple_seq seq = NULL;
7771 stmt_vec_info prev_stmt_vinfo;
7772 /* FORNOW. This restriction should be relaxed. */
7773 gcc_assert (!nested_in_vect_loop);
7775 /* Create the vector that holds the step of the induction. */
7776 if (SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr)))
7778 expr = build_int_cst (integer_type_node, nunits);
7779 expr = gimple_build (&seq, FLOAT_EXPR, TREE_TYPE (step_expr), expr);
7781 else
7782 expr = build_int_cst (TREE_TYPE (step_expr), nunits);
7783 new_name = gimple_build (&seq, MULT_EXPR, TREE_TYPE (step_expr),
7784 expr, step_expr);
7785 if (seq)
7787 new_bb = gsi_insert_seq_on_edge_immediate (pe, seq);
7788 gcc_assert (!new_bb);
7791 t = unshare_expr (new_name);
7792 gcc_assert (CONSTANT_CLASS_P (new_name)
7793 || TREE_CODE (new_name) == SSA_NAME);
7794 new_vec = build_vector_from_val (vectype, t);
7795 vec_step = vect_init_vector (phi, new_vec, vectype, NULL);
7797 vec_def = induc_def;
7798 prev_stmt_vinfo = vinfo_for_stmt (induction_phi);
7799 for (i = 1; i < ncopies; i++)
7801 /* vec_i = vec_prev + vec_step */
7802 new_stmt = gimple_build_assign (vec_dest, PLUS_EXPR,
7803 vec_def, vec_step);
7804 vec_def = make_ssa_name (vec_dest, new_stmt);
7805 gimple_assign_set_lhs (new_stmt, vec_def);
7807 gsi_insert_before (&si, new_stmt, GSI_SAME_STMT);
7808 set_vinfo_for_stmt (new_stmt,
7809 new_stmt_vec_info (new_stmt, loop_vinfo));
7810 STMT_VINFO_RELATED_STMT (prev_stmt_vinfo) = new_stmt;
7811 prev_stmt_vinfo = vinfo_for_stmt (new_stmt);
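/* As a sketch (numbers only illustrative): for a 4-element vector and
   scalar step S, the first copy holds [X, X+S, X+2*S, X+3*S] and each
   further copy adds [4*S, 4*S, 4*S, 4*S], so the second copy holds
   [X+4*S, ..., X+7*S].  */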
7815 if (nested_in_vect_loop)
7817 /* Find the loop-closed exit-phi of the induction, and record
7818 the final vector of induction results: */
7819 exit_phi = NULL;
7820 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, loop_arg)
7822 gimple *use_stmt = USE_STMT (use_p);
7823 if (is_gimple_debug (use_stmt))
7824 continue;
7826 if (!flow_bb_inside_loop_p (iv_loop, gimple_bb (use_stmt)))
7828 exit_phi = use_stmt;
7829 break;
7832 if (exit_phi)
7834 stmt_vec_info stmt_vinfo = vinfo_for_stmt (exit_phi);
7835 /* FORNOW. Currently not supporting the case that an inner-loop induction
7836 is not used in the outer-loop (i.e. only outside the outer-loop). */
7837 gcc_assert (STMT_VINFO_RELEVANT_P (stmt_vinfo)
7838 && !STMT_VINFO_LIVE_P (stmt_vinfo));
7840 STMT_VINFO_VEC_STMT (stmt_vinfo) = new_stmt;
7841 if (dump_enabled_p ())
7843 dump_printf_loc (MSG_NOTE, vect_location,
7844 "vector of inductions after inner-loop:");
7845 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, new_stmt, 0);
7851 if (dump_enabled_p ())
7853 dump_printf_loc (MSG_NOTE, vect_location,
7854 "transform induction: created def-use cycle: ");
7855 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, induction_phi, 0);
7856 dump_gimple_stmt (MSG_NOTE, TDF_SLIM,
7857 SSA_NAME_DEF_STMT (vec_def), 0);
7860 return true;
7863 /* Function vectorizable_live_operation.
7865 STMT computes a value that is used outside the loop. Check if
7866 it can be supported. */
7868 bool
7869 vectorizable_live_operation (gimple *stmt,
7870 gimple_stmt_iterator *gsi ATTRIBUTE_UNUSED,
7871 slp_tree slp_node, int slp_index,
7872 gimple **vec_stmt,
7873 stmt_vector_for_cost *)
7875 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
7876 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
7877 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
7878 imm_use_iterator imm_iter;
7879 tree lhs, lhs_type, bitsize, vec_bitsize;
7880 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
7881 poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (vectype);
7882 int ncopies;
7883 gimple *use_stmt;
7884 auto_vec<tree> vec_oprnds;
7885 int vec_entry = 0;
7886 poly_uint64 vec_index = 0;
7888 gcc_assert (STMT_VINFO_LIVE_P (stmt_info));
7890 if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_reduction_def)
7891 return false;
7893 /* FORNOW. CHECKME. */
7894 if (nested_in_vect_loop_p (loop, stmt))
7895 return false;
7897 /* If STMT is not relevant and it is a simple assignment and its inputs are
7898 invariant then it can remain in place, unvectorized. The original last
7899 scalar value that it computes will be used. */
7900 if (!STMT_VINFO_RELEVANT_P (stmt_info))
7902 gcc_assert (is_simple_and_all_uses_invariant (stmt, loop_vinfo));
7903 if (dump_enabled_p ())
7904 dump_printf_loc (MSG_NOTE, vect_location,
7905 "statement is simple and uses invariant. Leaving in "
7906 "place.\n");
7907 return true;
7910 if (slp_node)
7911 ncopies = 1;
7912 else
7913 ncopies = vect_get_num_copies (loop_vinfo, vectype);
7915 if (slp_node)
7917 gcc_assert (slp_index >= 0);
7919 int num_scalar = SLP_TREE_SCALAR_STMTS (slp_node).length ();
7920 int num_vec = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node);
7922 /* Get the last occurrence of the scalar index from the concatenation of
7923 all the slp vectors. Calculate which slp vector it is and the index
7924 within. */
7925 poly_uint64 pos = (num_vec * nunits) - num_scalar + slp_index;
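/* For instance (illustrative only): with NUM_VEC == 2, NUNITS == 4,
   NUM_SCALAR == 3 and SLP_INDEX == 2, POS == 2*4 - 3 + 2 == 7, which
   the division below splits into VEC_ENTRY == 1 and VEC_INDEX == 3
   (the last lane of the second vector).  */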
7927 /* Calculate which vector contains the result, and which lane of
7928 that vector we need. */
7929 if (!can_div_trunc_p (pos, nunits, &vec_entry, &vec_index))
7931 if (dump_enabled_p ())
7932 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7933 "Cannot determine which vector holds the"
7934 " final result.\n");
7935 return false;
7939 if (!vec_stmt)
7941 /* No transformation required. */
7942 if (LOOP_VINFO_CAN_FULLY_MASK_P (loop_vinfo))
7944 if (!direct_internal_fn_supported_p (IFN_EXTRACT_LAST, vectype,
7945 OPTIMIZE_FOR_SPEED))
7947 if (dump_enabled_p ())
7948 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7949 "can't use a fully-masked loop because "
7950 "the target doesn't support extract last "
7951 "reduction.\n");
7952 LOOP_VINFO_CAN_FULLY_MASK_P (loop_vinfo) = false;
7954 else if (slp_node)
7956 if (dump_enabled_p ())
7957 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7958 "can't use a fully-masked loop because an "
7959 "SLP statement is live after the loop.\n");
7960 LOOP_VINFO_CAN_FULLY_MASK_P (loop_vinfo) = false;
7962 else if (ncopies > 1)
7964 if (dump_enabled_p ())
7965 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7966 "can't use a fully-masked loop because"
7967 " ncopies is greater than 1.\n");
7968 LOOP_VINFO_CAN_FULLY_MASK_P (loop_vinfo) = false;
7970 else
7972 gcc_assert (ncopies == 1 && !slp_node);
7973 vect_record_loop_mask (loop_vinfo,
7974 &LOOP_VINFO_MASKS (loop_vinfo),
7975 1, vectype);
7978 return true;
7981 /* If stmt has a related stmt, then use that for getting the lhs. */
7982 if (is_pattern_stmt_p (stmt_info))
7983 stmt = STMT_VINFO_RELATED_STMT (stmt_info);
7985 lhs = (is_a <gphi *> (stmt)) ? gimple_phi_result (stmt)
7986 : gimple_get_lhs (stmt);
7987 lhs_type = TREE_TYPE (lhs);
7989 bitsize = (VECTOR_BOOLEAN_TYPE_P (vectype)
7990 ? bitsize_int (TYPE_PRECISION (TREE_TYPE (vectype)))
7991 : TYPE_SIZE (TREE_TYPE (vectype)));
7992 vec_bitsize = TYPE_SIZE (vectype);
7994 /* Get the vectorized lhs of STMT and the lane to use (counted in bits). */
7995 tree vec_lhs, bitstart;
7996 if (slp_node)
7998 gcc_assert (!LOOP_VINFO_FULLY_MASKED_P (loop_vinfo));
8000 /* Get the correct slp vectorized stmt. */
8001 gimple *vec_stmt = SLP_TREE_VEC_STMTS (slp_node)[vec_entry];
8002 if (gphi *phi = dyn_cast <gphi *> (vec_stmt))
8003 vec_lhs = gimple_phi_result (phi);
8004 else
8005 vec_lhs = gimple_get_lhs (vec_stmt);
8007 /* Get entry to use. */
8008 bitstart = bitsize_int (vec_index);
8009 bitstart = int_const_binop (MULT_EXPR, bitsize, bitstart);
8011 else
8013 enum vect_def_type dt = STMT_VINFO_DEF_TYPE (stmt_info);
8014 vec_lhs = vect_get_vec_def_for_operand_1 (stmt, dt);
8015 gcc_checking_assert (ncopies == 1
8016 || !LOOP_VINFO_FULLY_MASKED_P (loop_vinfo));
8018 /* For multiple copies, get the last copy. */
8019 for (int i = 1; i < ncopies; ++i)
8020 vec_lhs = vect_get_vec_def_for_stmt_copy (vect_unknown_def_type,
8021 vec_lhs);
8023 /* Get the last lane in the vector. */
8024 bitstart = int_const_binop (MINUS_EXPR, vec_bitsize, bitsize);
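/* E.g. (illustrative) for a 128-bit vector of 32-bit elements this is
   BITSTART == 128 - 32 == 96, and the access built from it later is
   roughly:
     new_tree = BIT_FIELD_REF <vec_lhs, 32, 96>;  */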
8027 gimple_seq stmts = NULL;
8028 tree new_tree;
8029 if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
8031 /* Emit:
8033 SCALAR_RES = EXTRACT_LAST <VEC_LHS, MASK>
8035 where VEC_LHS is the vectorized live-out result and MASK is
8036 the loop mask for the final iteration. */
8037 gcc_assert (ncopies == 1 && !slp_node);
8038 tree scalar_type = TREE_TYPE (STMT_VINFO_VECTYPE (stmt_info));
8039 tree mask = vect_get_loop_mask (gsi, &LOOP_VINFO_MASKS (loop_vinfo),
8040 1, vectype, 0);
8041 tree scalar_res = gimple_build (&stmts, CFN_EXTRACT_LAST,
8042 scalar_type, mask, vec_lhs);
8044 /* Convert the extracted vector element to the required scalar type. */
8045 new_tree = gimple_convert (&stmts, lhs_type, scalar_res);
8047 else
8049 tree bftype = TREE_TYPE (vectype);
8050 if (VECTOR_BOOLEAN_TYPE_P (vectype))
8051 bftype = build_nonstandard_integer_type (tree_to_uhwi (bitsize), 1);
8052 new_tree = build3 (BIT_FIELD_REF, bftype, vec_lhs, bitsize, bitstart);
8053 new_tree = force_gimple_operand (fold_convert (lhs_type, new_tree),
8054 &stmts, true, NULL_TREE);
8057 if (stmts)
8058 gsi_insert_seq_on_edge_immediate (single_exit (loop), stmts);
8060 /* Replace use of lhs with newly computed result. If the use stmt is a
8061 single arg PHI, just replace all uses of PHI result. It's necessary
8062 because lcssa PHI defining lhs may be before newly inserted stmt. */
8063 use_operand_p use_p;
8064 FOR_EACH_IMM_USE_STMT (use_stmt, imm_iter, lhs)
8065 if (!flow_bb_inside_loop_p (loop, gimple_bb (use_stmt))
8066 && !is_gimple_debug (use_stmt))
8068 if (gimple_code (use_stmt) == GIMPLE_PHI
8069 && gimple_phi_num_args (use_stmt) == 1)
8071 replace_uses_by (gimple_phi_result (use_stmt), new_tree);
8073 else
8075 FOR_EACH_IMM_USE_ON_STMT (use_p, imm_iter)
8076 SET_USE (use_p, new_tree);
8078 update_stmt (use_stmt);
8081 return true;
8084 /* Kill any debug uses outside LOOP of SSA names defined in STMT. */
8086 static void
8087 vect_loop_kill_debug_uses (struct loop *loop, gimple *stmt)
8089 ssa_op_iter op_iter;
8090 imm_use_iterator imm_iter;
8091 def_operand_p def_p;
8092 gimple *ustmt;
8094 FOR_EACH_PHI_OR_STMT_DEF (def_p, stmt, op_iter, SSA_OP_DEF)
8096 FOR_EACH_IMM_USE_STMT (ustmt, imm_iter, DEF_FROM_PTR (def_p))
8098 basic_block bb;
8100 if (!is_gimple_debug (ustmt))
8101 continue;
8103 bb = gimple_bb (ustmt);
8105 if (!flow_bb_inside_loop_p (loop, bb))
8107 if (gimple_debug_bind_p (ustmt))
8109 if (dump_enabled_p ())
8110 dump_printf_loc (MSG_NOTE, vect_location,
8111 "killing debug use\n");
8113 gimple_debug_bind_reset_value (ustmt);
8114 update_stmt (ustmt);
8116 else
8117 gcc_unreachable ();
8123 /* Given loop represented by LOOP_VINFO, return true if computation of
8124 LOOP_VINFO_NITERS (= LOOP_VINFO_NITERSM1 + 1) doesn't overflow, false
8125 otherwise. */
8127 static bool
8128 loop_niters_no_overflow (loop_vec_info loop_vinfo)
8130 /* Constant case. */
8131 if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
8133 tree cst_niters = LOOP_VINFO_NITERS (loop_vinfo);
8134 tree cst_nitersm1 = LOOP_VINFO_NITERSM1 (loop_vinfo);
8136 gcc_assert (TREE_CODE (cst_niters) == INTEGER_CST);
8137 gcc_assert (TREE_CODE (cst_nitersm1) == INTEGER_CST);
8138 if (wi::to_widest (cst_nitersm1) < wi::to_widest (cst_niters))
8139 return true;
8142 widest_int max;
8143 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
8144 /* Check the upper bound of loop niters. */
8145 if (get_max_loop_iterations (loop, &max))
8147 tree type = TREE_TYPE (LOOP_VINFO_NITERS (loop_vinfo));
8148 signop sgn = TYPE_SIGN (type);
8149 widest_int type_max = widest_int::from (wi::max_value (type), sgn);
8150 if (max < type_max)
8151 return true;
8153 return false;
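/* For example (assuming a 32-bit unsigned niters type): TYPE_MAX is
   0xffffffff, so if the latch count is known to be at most 0xfffffffe
   then NITERSM1 + 1 cannot wrap and we return true; if the latch count
   may reach 0xffffffff, NITERS could wrap to zero and we return false.  */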
8156 /* Return a mask type with half the number of elements as TYPE. */
8158 tree
8159 vect_halve_mask_nunits (tree type)
8161 poly_uint64 nunits = exact_div (TYPE_VECTOR_SUBPARTS (type), 2);
8162 return build_truth_vector_type (nunits, current_vector_size);
8165 /* Return a mask type with twice as many elements as TYPE. */
8167 tree
8168 vect_double_mask_nunits (tree type)
8170 poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (type) * 2;
8171 return build_truth_vector_type (nunits, current_vector_size);
8174 /* Record that a fully-masked version of LOOP_VINFO would need MASKS to
8175 contain a sequence of NVECTORS masks that each control a vector of type
8176 VECTYPE. */
8178 void
8179 vect_record_loop_mask (loop_vec_info loop_vinfo, vec_loop_masks *masks,
8180 unsigned int nvectors, tree vectype)
8182 gcc_assert (nvectors != 0);
8183 if (masks->length () < nvectors)
8184 masks->safe_grow_cleared (nvectors);
8185 rgroup_masks *rgm = &(*masks)[nvectors - 1];
8186 /* The number of scalars per iteration and the number of vectors are
8187 both compile-time constants. */
8188 unsigned int nscalars_per_iter
8189 = exact_div (nvectors * TYPE_VECTOR_SUBPARTS (vectype),
8190 LOOP_VINFO_VECT_FACTOR (loop_vinfo)).to_constant ();
8191 if (rgm->max_nscalars_per_iter < nscalars_per_iter)
8193 rgm->max_nscalars_per_iter = nscalars_per_iter;
8194 rgm->mask_type = build_same_sized_truth_vector_type (vectype);
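/* A small worked example (values illustrative): with a vectorization
   factor of 8, a VECTYPE holding 8 elements and NVECTORS == 2, the
   rgroup handles 2 * 8 / 8 == 2 scalar values per scalar iteration.  */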
8198 /* Given a complete set of masks MASKS, extract mask number INDEX
8199 for an rgroup that operates on NVECTORS vectors of type VECTYPE,
8200 where 0 <= INDEX < NVECTORS. Insert any set-up statements before GSI.
8202 See the comment above vec_loop_masks for more details about the mask
8203 arrangement. */
8205 tree
8206 vect_get_loop_mask (gimple_stmt_iterator *gsi, vec_loop_masks *masks,
8207 unsigned int nvectors, tree vectype, unsigned int index)
8209 rgroup_masks *rgm = &(*masks)[nvectors - 1];
8210 tree mask_type = rgm->mask_type;
8212 /* Populate the rgroup's mask array, if this is the first time we've
8213 used it. */
8214 if (rgm->masks.is_empty ())
8216 rgm->masks.safe_grow_cleared (nvectors);
8217 for (unsigned int i = 0; i < nvectors; ++i)
8219 tree mask = make_temp_ssa_name (mask_type, NULL, "loop_mask");
8220 /* Provide a dummy definition until the real one is available. */
8221 SSA_NAME_DEF_STMT (mask) = gimple_build_nop ();
8222 rgm->masks[i] = mask;
8226 tree mask = rgm->masks[index];
8227 if (maybe_ne (TYPE_VECTOR_SUBPARTS (mask_type),
8228 TYPE_VECTOR_SUBPARTS (vectype)))
8230 /* A loop mask for data type X can be reused for data type Y
8231 if X has N times more elements than Y and if Y's elements
8232 are N times bigger than X's. In this case each sequence
8233 of N elements in the loop mask will be all-zero or all-one.
8234 We can then view-convert the mask so that each sequence of
8235 N elements is replaced by a single element. */
8236 gcc_assert (multiple_p (TYPE_VECTOR_SUBPARTS (mask_type),
8237 TYPE_VECTOR_SUBPARTS (vectype)));
8238 gimple_seq seq = NULL;
8239 mask_type = build_same_sized_truth_vector_type (vectype);
8240 mask = gimple_build (&seq, VIEW_CONVERT_EXPR, mask_type, mask);
8241 if (seq)
8242 gsi_insert_seq_before (gsi, seq, GSI_SAME_STMT);
8244 return mask;
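/* As an illustration: a mask created for sixteen byte-sized elements
   can be reused for eight halfword-sized elements of the same total
   size; each pair of mask elements is known to be all-zero or all-one,
   so the VIEW_CONVERT_EXPR above yields a valid eight-element mask.  */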
8247 /* Scale profiling counters by estimation for LOOP which is vectorized
8248 by factor VF. */
8250 static void
8251 scale_profile_for_vect_loop (struct loop *loop, unsigned vf)
8253 edge preheader = loop_preheader_edge (loop);
8254 /* Reduce loop iterations by the vectorization factor. */
8255 gcov_type new_est_niter = niter_for_unrolled_loop (loop, vf);
8256 profile_count freq_h = loop->header->count, freq_e = preheader->count ();
8258 if (freq_h.nonzero_p ())
8260 profile_probability p;
8262 /* Avoid dropping loop body profile counter to 0 because of zero count
8263 in loop's preheader. */
8264 if (!(freq_e == profile_count::zero ()))
8265 freq_e = freq_e.force_nonzero ();
8266 p = freq_e.apply_scale (new_est_niter + 1, 1).probability_in (freq_h);
8267 scale_loop_frequencies (loop, p);
8270 edge exit_e = single_exit (loop);
8271 exit_e->probability = profile_probability::always ()
8272 .apply_scale (1, new_est_niter + 1);
8274 edge exit_l = single_pred_edge (loop->latch);
8275 profile_probability prob = exit_l->probability;
8276 exit_l->probability = exit_e->probability.invert ();
8277 if (prob.initialized_p () && exit_l->probability.initialized_p ())
8278 scale_bbs_frequencies (&loop->latch, 1, exit_l->probability / prob);
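/* A rough illustration (estimates only): if the loop was expected to
   iterate about 100 times and is vectorized with VF == 4,
   NEW_EST_NITER is about 24, so the exit edge gets a probability of
   roughly 1/25 and the body counts are rescaled to match.  */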
8281 /* Vectorize STMT if relevant, inserting any new instructions before GSI.
8282 When vectorizing STMT as a store, set *SEEN_STORE to its stmt_vec_info.
8283 *SLP_SCHEDULED is a running record of whether we have called
8284 vect_schedule_slp. */
8286 static void
8287 vect_transform_loop_stmt (loop_vec_info loop_vinfo, gimple *stmt,
8288 gimple_stmt_iterator *gsi,
8289 stmt_vec_info *seen_store, bool *slp_scheduled)
8291 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
8292 poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
8293 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
8294 if (!stmt_info)
8295 return;
8297 if (dump_enabled_p ())
8299 dump_printf_loc (MSG_NOTE, vect_location,
8300 "------>vectorizing statement: ");
8301 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, stmt, 0);
8304 if (MAY_HAVE_DEBUG_BIND_STMTS && !STMT_VINFO_LIVE_P (stmt_info))
8305 vect_loop_kill_debug_uses (loop, stmt);
8307 if (!STMT_VINFO_RELEVANT_P (stmt_info)
8308 && !STMT_VINFO_LIVE_P (stmt_info))
8309 return;
8311 if (STMT_VINFO_VECTYPE (stmt_info))
8313 poly_uint64 nunits
8314 = TYPE_VECTOR_SUBPARTS (STMT_VINFO_VECTYPE (stmt_info));
8315 if (!STMT_SLP_TYPE (stmt_info)
8316 && maybe_ne (nunits, vf)
8317 && dump_enabled_p ())
8318 /* For SLP, VF is set according to the unrolling factor, not to
8319 the vector size, hence for SLP this print is not valid. */
8320 dump_printf_loc (MSG_NOTE, vect_location, "multiple-types.\n");
8323 /* SLP. Schedule all the SLP instances when the first SLP stmt is
8324 reached. */
8325 if (slp_vect_type slptype = STMT_SLP_TYPE (stmt_info))
8328 if (!*slp_scheduled)
8330 *slp_scheduled = true;
8332 DUMP_VECT_SCOPE ("scheduling SLP instances");
8334 vect_schedule_slp (loop_vinfo);
8337 /* Hybrid SLP stmts must be vectorized in addition to SLP. */
8338 if (slptype == pure_slp)
8339 return;
8342 if (dump_enabled_p ())
8343 dump_printf_loc (MSG_NOTE, vect_location, "transform statement.\n");
8345 bool grouped_store = false;
8346 if (vect_transform_stmt (stmt, gsi, &grouped_store, NULL, NULL))
8347 *seen_store = stmt_info;
8350 /* Function vect_transform_loop.
8352 The analysis phase has determined that the loop is vectorizable.
8353 Vectorize the loop - create vectorized stmts to replace the scalar
8354 stmts in the loop, and update the loop exit condition.
8355 Returns the scalar epilogue loop if any. */
8357 struct loop *
8358 vect_transform_loop (loop_vec_info loop_vinfo)
8360 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
8361 struct loop *epilogue = NULL;
8362 basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
8363 int nbbs = loop->num_nodes;
8364 int i;
8365 tree niters_vector = NULL_TREE;
8366 tree step_vector = NULL_TREE;
8367 tree niters_vector_mult_vf = NULL_TREE;
8368 poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
8369 unsigned int lowest_vf = constant_lower_bound (vf);
8370 bool slp_scheduled = false;
8371 gimple *stmt;
8372 bool check_profitability = false;
8373 unsigned int th;
8375 DUMP_VECT_SCOPE ("vec_transform_loop");
8377 loop_vinfo->shared->check_datarefs ();
8379 /* Use the more conservative vectorization threshold. If the number
8380 of iterations is constant, assume the cost check has been performed
8381 by our caller. If the threshold makes all loops profitable that
8382 run at least the (estimated) vectorization factor number of times,
8383 checking is pointless, too. */
8384 th = LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo);
8385 if (th >= vect_vf_for_cost (loop_vinfo)
8386 && !LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
8388 if (dump_enabled_p ())
8389 dump_printf_loc (MSG_NOTE, vect_location,
8390 "Profitability threshold is %d loop iterations.\n",
8391 th);
8392 check_profitability = true;
8395 /* Make sure there exists a single-predecessor exit bb. Do this before
8396 versioning. */
8397 edge e = single_exit (loop);
8398 if (! single_pred_p (e->dest))
8400 split_loop_exit_edge (e);
8401 if (dump_enabled_p ())
8402 dump_printf (MSG_NOTE, "split exit edge\n");
8405 /* Version the loop first, if required, so the profitability check
8406 comes first. */
8408 if (LOOP_REQUIRES_VERSIONING (loop_vinfo))
8410 poly_uint64 versioning_threshold
8411 = LOOP_VINFO_VERSIONING_THRESHOLD (loop_vinfo);
8412 if (check_profitability
8413 && ordered_p (poly_uint64 (th), versioning_threshold))
8415 versioning_threshold = ordered_max (poly_uint64 (th),
8416 versioning_threshold);
8417 check_profitability = false;
8419 vect_loop_versioning (loop_vinfo, th, check_profitability,
8420 versioning_threshold);
8421 check_profitability = false;
8424 /* Make sure there exists a single-predecessor exit bb also on the
8425 scalar loop copy. Do this after versioning but before peeling
8426 so CFG structure is fine for both scalar and if-converted loop
8427 to make slpeel_duplicate_current_defs_from_edges face matched
8428 loop closed PHI nodes on the exit. */
8429 if (LOOP_VINFO_SCALAR_LOOP (loop_vinfo))
8431 e = single_exit (LOOP_VINFO_SCALAR_LOOP (loop_vinfo));
8432 if (! single_pred_p (e->dest))
8434 split_loop_exit_edge (e);
8435 if (dump_enabled_p ())
8436 dump_printf (MSG_NOTE, "split exit edge of scalar loop\n");
8440 tree niters = vect_build_loop_niters (loop_vinfo);
8441 LOOP_VINFO_NITERS_UNCHANGED (loop_vinfo) = niters;
8442 tree nitersm1 = unshare_expr (LOOP_VINFO_NITERSM1 (loop_vinfo));
8443 bool niters_no_overflow = loop_niters_no_overflow (loop_vinfo);
8444 epilogue = vect_do_peeling (loop_vinfo, niters, nitersm1, &niters_vector,
8445 &step_vector, &niters_vector_mult_vf, th,
8446 check_profitability, niters_no_overflow);
8448 if (niters_vector == NULL_TREE)
8450 if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
8451 && !LOOP_VINFO_FULLY_MASKED_P (loop_vinfo)
8452 && known_eq (lowest_vf, vf))
8454 niters_vector
8455 = build_int_cst (TREE_TYPE (LOOP_VINFO_NITERS (loop_vinfo)),
8456 LOOP_VINFO_INT_NITERS (loop_vinfo) / lowest_vf);
8457 step_vector = build_one_cst (TREE_TYPE (niters));
8459 else
8460 vect_gen_vector_loop_niters (loop_vinfo, niters, &niters_vector,
8461 &step_vector, niters_no_overflow);
8464 /* 1) Make sure the loop header has exactly two entries
8465 2) Make sure we have a preheader basic block. */
8467 gcc_assert (EDGE_COUNT (loop->header->preds) == 2);
8469 split_edge (loop_preheader_edge (loop));
8471 if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo)
8472 && vect_use_loop_mask_for_alignment_p (loop_vinfo))
8473 /* This will deal with any possible peeling. */
8474 vect_prepare_for_masked_peels (loop_vinfo);
8476 /* FORNOW: the vectorizer supports only loops whose body consists
8477 of one basic block (header + empty latch). When the vectorizer
8478 supports more involved loop forms, the order in which the BBs are
8479 traversed needs to be reconsidered. */
8481 for (i = 0; i < nbbs; i++)
8483 basic_block bb = bbs[i];
8484 stmt_vec_info stmt_info;
8486 for (gphi_iterator si = gsi_start_phis (bb); !gsi_end_p (si);
8487 gsi_next (&si))
8489 gphi *phi = si.phi ();
8490 if (dump_enabled_p ())
8492 dump_printf_loc (MSG_NOTE, vect_location,
8493 "------>vectorizing phi: ");
8494 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, phi, 0);
8496 stmt_info = vinfo_for_stmt (phi);
8497 if (!stmt_info)
8498 continue;
8500 if (MAY_HAVE_DEBUG_BIND_STMTS && !STMT_VINFO_LIVE_P (stmt_info))
8501 vect_loop_kill_debug_uses (loop, phi);
8503 if (!STMT_VINFO_RELEVANT_P (stmt_info)
8504 && !STMT_VINFO_LIVE_P (stmt_info))
8505 continue;
8507 if (STMT_VINFO_VECTYPE (stmt_info)
8508 && (maybe_ne
8509 (TYPE_VECTOR_SUBPARTS (STMT_VINFO_VECTYPE (stmt_info)), vf))
8510 && dump_enabled_p ())
8511 dump_printf_loc (MSG_NOTE, vect_location, "multiple-types.\n");
8513 if ((STMT_VINFO_DEF_TYPE (stmt_info) == vect_induction_def
8514 || STMT_VINFO_DEF_TYPE (stmt_info) == vect_reduction_def
8515 || STMT_VINFO_DEF_TYPE (stmt_info) == vect_nested_cycle)
8516 && ! PURE_SLP_STMT (stmt_info))
8518 if (dump_enabled_p ())
8519 dump_printf_loc (MSG_NOTE, vect_location, "transform phi.\n");
8520 vect_transform_stmt (phi, NULL, NULL, NULL, NULL);
8524 for (gimple_stmt_iterator si = gsi_start_bb (bb);
8525 !gsi_end_p (si);)
8527 stmt = gsi_stmt (si);
8528 /* During vectorization remove existing clobber stmts. */
8529 if (gimple_clobber_p (stmt))
8531 unlink_stmt_vdef (stmt);
8532 gsi_remove (&si, true);
8533 release_defs (stmt);
8535 else
8537 stmt_info = vinfo_for_stmt (stmt);
8539 /* vector stmts created in the outer-loop during vectorization of
8540 stmts in an inner-loop may not have a stmt_info, and do not
8541 need to be vectorized. */
8542 stmt_vec_info seen_store = NULL;
8543 if (stmt_info)
8545 if (STMT_VINFO_IN_PATTERN_P (stmt_info))
8547 gimple *def_seq = STMT_VINFO_PATTERN_DEF_SEQ (stmt_info);
8548 for (gimple_stmt_iterator subsi = gsi_start (def_seq);
8549 !gsi_end_p (subsi); gsi_next (&subsi))
8550 vect_transform_loop_stmt (loop_vinfo,
8551 gsi_stmt (subsi), &si,
8552 &seen_store,
8553 &slp_scheduled);
8554 gimple *pat_stmt = STMT_VINFO_RELATED_STMT (stmt_info);
8555 vect_transform_loop_stmt (loop_vinfo, pat_stmt, &si,
8556 &seen_store, &slp_scheduled);
8558 vect_transform_loop_stmt (loop_vinfo, stmt, &si,
8559 &seen_store, &slp_scheduled);
8561 if (seen_store)
8563 if (STMT_VINFO_GROUPED_ACCESS (seen_store))
8565 /* Interleaving. The vectorization of the
8566 interleaving chain has been completed - free
8567 all the stores in the chain. */
8568 gsi_next (&si);
8569 vect_remove_stores (DR_GROUP_FIRST_ELEMENT (seen_store));
8571 else
8573 /* Free the attached stmt_vec_info and remove the
8574 stmt. */
8575 free_stmt_vec_info (stmt);
8576 unlink_stmt_vdef (stmt);
8577 gsi_remove (&si, true);
8578 release_defs (stmt);
8581 else
8582 gsi_next (&si);
8586 /* Stub out scalar statements that must not survive vectorization.
8587 Doing this here helps with grouped statements, or statements that
8588 are involved in patterns. */
8589 for (gimple_stmt_iterator gsi = gsi_start_bb (bb);
8590 !gsi_end_p (gsi); gsi_next (&gsi))
8592 gcall *call = dyn_cast <gcall *> (gsi_stmt (gsi));
8593 if (call && gimple_call_internal_p (call, IFN_MASK_LOAD))
8595 tree lhs = gimple_get_lhs (call);
8596 if (!VECTOR_TYPE_P (TREE_TYPE (lhs)))
8598 tree zero = build_zero_cst (TREE_TYPE (lhs));
8599 gimple *new_stmt = gimple_build_assign (lhs, zero);
8600 gsi_replace (&gsi, new_stmt, true);
8604 } /* BBs in loop */
8606 /* The vectorization factor is always > 1, so if we use an IV increment of 1,
8607 a zero NITERS becomes a nonzero NITERS_VECTOR. */
8608 if (integer_onep (step_vector))
8609 niters_no_overflow = true;
8610 vect_set_loop_condition (loop, loop_vinfo, niters_vector, step_vector,
8611 niters_vector_mult_vf, !niters_no_overflow);
8613 unsigned int assumed_vf = vect_vf_for_cost (loop_vinfo);
8614 scale_profile_for_vect_loop (loop, assumed_vf);
8616 /* True if the final iteration might not handle a full vector's
8617 worth of scalar iterations. */
8618 bool final_iter_may_be_partial = LOOP_VINFO_FULLY_MASKED_P (loop_vinfo);
8619 /* The minimum number of iterations performed by the epilogue. This
8620 is 1 when peeling for gaps because we always need a final scalar
8621 iteration. */
8622 int min_epilogue_iters = LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo) ? 1 : 0;
8623 /* +1 to convert latch counts to loop iteration counts,
8624 -min_epilogue_iters to remove iterations that cannot be performed
8625 by the vector code. */
8626 int bias_for_lowest = 1 - min_epilogue_iters;
8627 int bias_for_assumed = bias_for_lowest;
8628 int alignment_npeels = LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo);
8629 if (alignment_npeels && LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
8631 /* When the amount of peeling is known at compile time, the first
8632 iteration will have exactly alignment_npeels active elements.
8633 In the worst case it will have at least one. */
8634 int min_first_active = (alignment_npeels > 0 ? alignment_npeels : 1);
8635 bias_for_lowest += lowest_vf - min_first_active;
8636 bias_for_assumed += assumed_vf - min_first_active;
8638 /* In these calculations the "- 1" converts loop iteration counts
8639 back to latch counts. */
8640 if (loop->any_upper_bound)
8641 loop->nb_iterations_upper_bound
8642 = (final_iter_may_be_partial
8643 ? wi::udiv_ceil (loop->nb_iterations_upper_bound + bias_for_lowest,
8644 lowest_vf) - 1
8645 : wi::udiv_floor (loop->nb_iterations_upper_bound + bias_for_lowest,
8646 lowest_vf) - 1);
8647 if (loop->any_likely_upper_bound)
8648 loop->nb_iterations_likely_upper_bound
8649 = (final_iter_may_be_partial
8650 ? wi::udiv_ceil (loop->nb_iterations_likely_upper_bound
8651 + bias_for_lowest, lowest_vf) - 1
8652 : wi::udiv_floor (loop->nb_iterations_likely_upper_bound
8653 + bias_for_lowest, lowest_vf) - 1);
8654 if (loop->any_estimate)
8655 loop->nb_iterations_estimate
8656 = (final_iter_may_be_partial
8657 ? wi::udiv_ceil (loop->nb_iterations_estimate + bias_for_assumed,
8658 assumed_vf) - 1
8659 : wi::udiv_floor (loop->nb_iterations_estimate + bias_for_assumed,
8660 assumed_vf) - 1);
8662 if (dump_enabled_p ())
8664 if (!LOOP_VINFO_EPILOGUE_P (loop_vinfo))
8666 dump_printf_loc (MSG_NOTE, vect_location,
8667 "LOOP VECTORIZED\n");
8668 if (loop->inner)
8669 dump_printf_loc (MSG_NOTE, vect_location,
8670 "OUTER LOOP VECTORIZED\n");
8671 dump_printf (MSG_NOTE, "\n");
8673 else
8675 dump_printf_loc (MSG_NOTE, vect_location,
8676 "LOOP EPILOGUE VECTORIZED (VS=");
8677 dump_dec (MSG_NOTE, current_vector_size);
8678 dump_printf (MSG_NOTE, ")\n");
8682 /* Free SLP instances here because otherwise stmt reference counting
8683 won't work. */
8684 slp_instance instance;
8685 FOR_EACH_VEC_ELT (LOOP_VINFO_SLP_INSTANCES (loop_vinfo), i, instance)
8686 vect_free_slp_instance (instance, true);
8687 LOOP_VINFO_SLP_INSTANCES (loop_vinfo).release ();
8688 /* Clear the safelen field since its value is invalid after vectorization:
8689 the vectorized loop can have loop-carried dependencies. */
8690 loop->safelen = 0;
8692 /* Don't vectorize epilogue for epilogue. */
8693 if (LOOP_VINFO_EPILOGUE_P (loop_vinfo))
8694 epilogue = NULL;
8696 if (!PARAM_VALUE (PARAM_VECT_EPILOGUES_NOMASK))
8697 epilogue = NULL;
8699 if (epilogue)
8701 auto_vector_sizes vector_sizes;
8702 targetm.vectorize.autovectorize_vector_sizes (&vector_sizes);
8703 unsigned int next_size = 0;
8705 if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
8706 && LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) >= 0
8707 && known_eq (vf, lowest_vf))
8709 unsigned int eiters
8710 = (LOOP_VINFO_INT_NITERS (loop_vinfo)
8711 - LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo));
8712 eiters = eiters % lowest_vf;
8713 epilogue->nb_iterations_upper_bound = eiters - 1;
8715 unsigned int ratio;
8716 while (next_size < vector_sizes.length ()
8717 && !(constant_multiple_p (current_vector_size,
8718 vector_sizes[next_size], &ratio)
8719 && eiters >= lowest_vf / ratio))
8720 next_size += 1;
8722 else
8723 while (next_size < vector_sizes.length ()
8724 && maybe_lt (current_vector_size, vector_sizes[next_size]))
8725 next_size += 1;
8727 if (next_size == vector_sizes.length ())
8728 epilogue = NULL;
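/* E.g. (purely illustrative) on a target offering 32-byte and 16-byte
   vectors with CURRENT_VECTOR_SIZE == 32, LOWEST_VF == 8 and a known
   EITERS == 5: the 32-byte size needs at least 8 leftover iterations
   and is skipped, while the 16-byte size (ratio 2) needs only 4, so
   the epilogue is attempted with 16-byte vectors.  */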
8731 if (epilogue)
8733 epilogue->force_vectorize = loop->force_vectorize;
8734 epilogue->safelen = loop->safelen;
8735 epilogue->dont_vectorize = false;
8737 /* We may need to if-convert epilogue to vectorize it. */
8738 if (LOOP_VINFO_SCALAR_LOOP (loop_vinfo))
8739 tree_if_conversion (epilogue);
8742 return epilogue;
8745 /* The code below tries to perform a simple optimization - reverting
8746 if-conversion for masked stores: if the mask of a store is all zero, do
8747 not perform the store and, if possible, skip the stored-value producers too.
8748 For example,
8749 for (i=0; i<n; i++)
8750 if (c[i])
8752 p1[i] += 1;
8753 p2[i] = p3[i] +2;
8755 this transformation will produce the following semi-hammock:
8757 if (!mask__ifc__42.18_165 == { 0, 0, 0, 0, 0, 0, 0, 0 })
8759 vect__11.19_170 = MASK_LOAD (vectp_p1.20_168, 0B, mask__ifc__42.18_165);
8760 vect__12.22_172 = vect__11.19_170 + vect_cst__171;
8761 MASK_STORE (vectp_p1.23_175, 0B, mask__ifc__42.18_165, vect__12.22_172);
8762 vect__18.25_182 = MASK_LOAD (vectp_p3.26_180, 0B, mask__ifc__42.18_165);
8763 vect__19.28_184 = vect__18.25_182 + vect_cst__183;
8764 MASK_STORE (vectp_p2.29_187, 0B, mask__ifc__42.18_165, vect__19.28_184);
8768 void
8769 optimize_mask_stores (struct loop *loop)
8770 {
8771 basic_block *bbs = get_loop_body (loop);
8772 unsigned nbbs = loop->num_nodes;
8773 unsigned i;
8774 basic_block bb;
8775 struct loop *bb_loop;
8776 gimple_stmt_iterator gsi;
8777 gimple *stmt;
8778 auto_vec<gimple *> worklist;
8780 vect_location = find_loop_location (loop);
8781 /* Pick up all masked stores in loop if any. */
8782 for (i = 0; i < nbbs; i++)
8783 {
8784 bb = bbs[i];
8785 for (gsi = gsi_start_bb (bb); !gsi_end_p (gsi);
8786 gsi_next (&gsi))
8787 {
8788 stmt = gsi_stmt (gsi);
8789 if (gimple_call_internal_p (stmt, IFN_MASK_STORE))
8790 worklist.safe_push (stmt);
8791 }
8792 }
8794 free (bbs);
8795 if (worklist.is_empty ())
8796 return;
8798 /* Loop has masked stores. */
8799 while (!worklist.is_empty ())
8800 {
8801 gimple *last, *last_store;
8802 edge e, efalse;
8803 tree mask;
8804 basic_block store_bb, join_bb;
8805 gimple_stmt_iterator gsi_to;
8806 tree vdef, new_vdef;
8807 gphi *phi;
8808 tree vectype;
8809 tree zero;
8811 last = worklist.pop ();
8812 mask = gimple_call_arg (last, 2);
8813 bb = gimple_bb (last);
8814 /* Create then_bb and the if-then structure in the CFG; then_bb belongs
8815 to the same loop as if_bb. That loop can differ from LOOP when a
8816 two-level loop nest is vectorized and the mask store belongs to the
8817 inner one. */
8818 e = split_block (bb, last);
8819 bb_loop = bb->loop_father;
8820 gcc_assert (loop == bb_loop || flow_loop_nested_p (loop, bb_loop));
8821 join_bb = e->dest;
8822 store_bb = create_empty_bb (bb);
8823 add_bb_to_loop (store_bb, bb_loop);
8824 e->flags = EDGE_TRUE_VALUE;
8825 efalse = make_edge (bb, store_bb, EDGE_FALSE_VALUE);
8826 /* Put STORE_BB to likely part. */
8827 efalse->probability = profile_probability::unlikely ();
8828 store_bb->count = efalse->count ();
8829 make_single_succ_edge (store_bb, join_bb, EDGE_FALLTHRU);
8830 if (dom_info_available_p (CDI_DOMINATORS))
8831 set_immediate_dominator (CDI_DOMINATORS, store_bb, bb);
8832 if (dump_enabled_p ())
8833 dump_printf_loc (MSG_NOTE, vect_location,
8834 "Create new block %d to sink mask stores.",
8835 store_bb->index);
8836 /* Create vector comparison with boolean result. */
8837 vectype = TREE_TYPE (mask);
8838 zero = build_zero_cst (vectype);
8839 stmt = gimple_build_cond (EQ_EXPR, mask, zero, NULL_TREE, NULL_TREE);
8840 gsi = gsi_last_bb (bb);
8841 gsi_insert_after (&gsi, stmt, GSI_SAME_STMT);
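/* At this point BB ends in a condition of the form (SSA names borrowed
   from the example comment before this function):
     if (mask__ifc__42.18_165 == { 0, 0, 0, 0, 0, 0, 0, 0 })
       goto join_bb;   <- E (EDGE_TRUE_VALUE): every lane is masked off,
                          the stores are skipped
     else
       goto store_bb;  <- EFALSE (EDGE_FALSE_VALUE): at least one lane is
                          live, the sunk stores execute  */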
8842 /* Create a new PHI node for the vdef of the last masked store:
8843 .MEM_2 = VDEF <.MEM_1>
8844 will be converted to
8845 .MEM_3 = VDEF <.MEM_1>
8846 and a new PHI node will be created in the join bb:
8847 .MEM_2 = PHI <.MEM_1, .MEM_3>
8848 */
8849 vdef = gimple_vdef (last);
8850 new_vdef = make_ssa_name (gimple_vop (cfun), last);
8851 gimple_set_vdef (last, new_vdef);
8852 phi = create_phi_node (vdef, join_bb);
8853 add_phi_arg (phi, new_vdef, EDGE_SUCC (store_bb, 0), UNKNOWN_LOCATION);
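/* Only the STORE_BB argument of PHI is known at this point; the argument
   for the bypassing edge E is added at the bottom of the enclosing loop
   (add_phi_arg (phi, gimple_vuse (last_store), e, ...)), once the earliest
   store moved into STORE_BB, and therefore the memory state reaching
   JOIN_BB when STORE_BB is skipped, is known.  */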
8855 /* Put all masked stores with the same mask to STORE_BB if possible. */
8856 while (true)
8857 {
8858 gimple_stmt_iterator gsi_from;
8859 gimple *stmt1 = NULL;
8861 /* Move masked store to STORE_BB. */
8862 last_store = last;
8863 gsi = gsi_for_stmt (last);
8864 gsi_from = gsi;
8865 /* Shift GSI to the previous stmt for further traversal. */
8866 gsi_prev (&gsi);
8867 gsi_to = gsi_start_bb (store_bb);
8868 gsi_move_before (&gsi_from, &gsi_to);
8869 /* Setup GSI_TO to the non-empty block start. */
8870 gsi_to = gsi_start_bb (store_bb);
8871 if (dump_enabled_p ())
8872 {
8873 dump_printf_loc (MSG_NOTE, vect_location,
8874 "Move stmt to created bb\n");
8875 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, last, 0);
8876 }
8877 /* Move all stored value producers if possible. */
8878 while (!gsi_end_p (gsi))
8879 {
8880 tree lhs;
8881 imm_use_iterator imm_iter;
8882 use_operand_p use_p;
8883 bool res;
8885 /* Skip debug statements. */
8886 if (is_gimple_debug (gsi_stmt (gsi)))
8887 {
8888 gsi_prev (&gsi);
8889 continue;
8890 }
8891 stmt1 = gsi_stmt (gsi);
8892 /* Do not consider statements that write to memory or have a volatile
8893 operand. */
8894 if (gimple_vdef (stmt1)
8895 || gimple_has_volatile_ops (stmt1))
8896 break;
8897 gsi_from = gsi;
8898 gsi_prev (&gsi);
8899 lhs = gimple_get_lhs (stmt1);
8900 if (!lhs)
8901 break;
8903 /* LHS of vectorized stmt must be SSA_NAME. */
8904 if (TREE_CODE (lhs) != SSA_NAME)
8905 break;
8907 if (!VECTOR_TYPE_P (TREE_TYPE (lhs)))
8908 {
8909 /* Remove dead scalar statement. */
8910 if (has_zero_uses (lhs))
8911 {
8912 gsi_remove (&gsi_from, true);
8913 continue;
8914 }
8915 break;
8916 }
8917 /* Check that LHS does not have uses outside of STORE_BB. */
8918 res = true;
8919 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, lhs)
8920 {
8921 gimple *use_stmt;
8922 use_stmt = USE_STMT (use_p);
8923 if (is_gimple_debug (use_stmt))
8924 continue;
8925 if (gimple_bb (use_stmt) != store_bb)
8926 {
8927 res = false;
8928 break;
8929 }
8930 }
8931 if (!res)
8932 break;
8934 if (gimple_vuse (stmt1)
8935 && gimple_vuse (stmt1) != gimple_vuse (last_store))
8936 break;
8938 /* Can move STMT1 to STORE_BB. */
8939 if (dump_enabled_p ())
8940 {
8941 dump_printf_loc (MSG_NOTE, vect_location,
8942 "Move stmt to created bb\n");
8943 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, stmt1, 0);
8944 }
8945 gsi_move_before (&gsi_from, &gsi_to);
8946 /* Shift GSI_TO for further insertion. */
8947 gsi_prev (&gsi_to);
8948 }
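/* To relate the checks above to the example before optimize_mask_stores:
   the addition vect__12.22_172 = vect__11.19_170 + vect_cst__171 and the
   MASK_LOAD feeding it are sunk because they have no vdef, no volatile
   operands, a vector SSA_NAME result used only inside STORE_BB, and a
   vuse compatible with the moved store; dead non-vector scalars are
   removed instead, and any other failing statement ends the backwards
   scan.  */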
8949 /* Put other masked stores with the same mask to STORE_BB. */
8950 if (worklist.is_empty ()
8951 || gimple_call_arg (worklist.last (), 2) != mask
8952 || worklist.last () != stmt1)
8953 break;
8954 last = worklist.pop ();
8955 }
8956 add_phi_arg (phi, gimple_vuse (last_store), e, UNKNOWN_LOCATION);