gcc/tree-vect-loop.c
1 /* Loop Vectorization
2 Copyright (C) 2003-2017 Free Software Foundation, Inc.
3 Contributed by Dorit Naishlos <dorit@il.ibm.com> and
4 Ira Rosen <irar@il.ibm.com>
6 This file is part of GCC.
8 GCC is free software; you can redistribute it and/or modify it under
9 the terms of the GNU General Public License as published by the Free
10 Software Foundation; either version 3, or (at your option) any later
11 version.
13 GCC is distributed in the hope that it will be useful, but WITHOUT ANY
14 WARRANTY; without even the implied warranty of MERCHANTABILITY or
15 FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
16 for more details.
18 You should have received a copy of the GNU General Public License
19 along with GCC; see the file COPYING3. If not see
20 <http://www.gnu.org/licenses/>. */
22 #include "config.h"
23 #include "system.h"
24 #include "coretypes.h"
25 #include "backend.h"
26 #include "target.h"
27 #include "rtl.h"
28 #include "tree.h"
29 #include "gimple.h"
30 #include "cfghooks.h"
31 #include "tree-pass.h"
32 #include "ssa.h"
33 #include "optabs-tree.h"
34 #include "diagnostic-core.h"
35 #include "fold-const.h"
36 #include "stor-layout.h"
37 #include "cfganal.h"
38 #include "gimplify.h"
39 #include "gimple-iterator.h"
40 #include "gimplify-me.h"
41 #include "tree-ssa-loop-ivopts.h"
42 #include "tree-ssa-loop-manip.h"
43 #include "tree-ssa-loop-niter.h"
44 #include "tree-ssa-loop.h"
45 #include "cfgloop.h"
46 #include "params.h"
47 #include "tree-scalar-evolution.h"
48 #include "tree-vectorizer.h"
49 #include "gimple-fold.h"
50 #include "cgraph.h"
51 #include "tree-cfg.h"
52 #include "tree-if-conv.h"
54 /* Loop Vectorization Pass.
56 This pass tries to vectorize loops.
58 For example, the vectorizer transforms the following simple loop:
60 short a[N]; short b[N]; short c[N]; int i;
62 for (i=0; i<N; i++){
63 a[i] = b[i] + c[i];
66 as if it was manually vectorized by rewriting the source code into:
68 typedef int __attribute__((mode(V8HI))) v8hi;
69 short a[N]; short b[N]; short c[N]; int i;
70 v8hi *pa = (v8hi*)a, *pb = (v8hi*)b, *pc = (v8hi*)c;
71 v8hi va, vb, vc;
73 for (i=0; i<N/8; i++){
74 vb = pb[i];
75 vc = pc[i];
76 va = vb + vc;
77 pa[i] = va;
80 The main entry to this pass is vectorize_loops(), in which
81 the vectorizer applies a set of analyses on a given set of loops,
82 followed by the actual vectorization transformation for the loops that
83 had successfully passed the analysis phase.
84 Throughout this pass we make a distinction between two types of
85 data: scalars (which are represented by SSA_NAMES), and memory references
86 ("data-refs"). These two types of data require different handling both
87 during analysis and transformation. The types of data-refs that the
88 vectorizer currently supports are ARRAY_REFS whose base is an array DECL
89 (not a pointer), and INDIRECT_REFS through pointers; both array and pointer
90 accesses are required to have a simple (consecutive) access pattern.
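     For illustration: accesses such as a[i] or *(p + i), advancing by one
     element per iteration, have the simple consecutive pattern meant here,
     while accesses such as a[2*i] (strided) or a[b[i]] (indirect) do not.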
92 Analysis phase:
93 ===============
94 The driver for the analysis phase is vect_analyze_loop().
95 It applies a set of analyses, some of which rely on the scalar evolution
96 analyzer (scev) developed by Sebastian Pop.
98 During the analysis phase the vectorizer records some information
99 per stmt in a "stmt_vec_info" struct which is attached to each stmt in the
100 loop, as well as general information about the loop as a whole, which is
101 recorded in a "loop_vec_info" struct attached to each loop.
103 Transformation phase:
104 =====================
105 The loop transformation phase scans all the stmts in the loop, and
106 creates a vector stmt (or a sequence of stmts) for each scalar stmt S in
107 the loop that needs to be vectorized. It inserts the vector code sequence
108 just before the scalar stmt S, and records a pointer to the vector code
109 in STMT_VINFO_VEC_STMT (stmt_info) (stmt_info is the stmt_vec_info struct
110 attached to S). This pointer will be used for the vectorization of following
111 stmts which use the def of stmt S. Stmt S is removed if it writes to memory;
112 otherwise, we rely on dead code elimination for removing it.
114 For example, say stmt S1 was vectorized into stmt VS1:
116 VS1: vb = px[i];
117 S1: b = x[i]; STMT_VINFO_VEC_STMT (stmt_info (S1)) = VS1
118 S2: a = b;
120 To vectorize stmt S2, the vectorizer first finds the stmt that defines
121 the operand 'b' (S1), and gets the relevant vector def 'vb' from the
122 vector stmt VS1 pointed to by STMT_VINFO_VEC_STMT (stmt_info (S1)). The
123 resulting sequence would be:
125 VS1: vb = px[i];
126 S1: b = x[i]; STMT_VINFO_VEC_STMT (stmt_info (S1)) = VS1
127 VS2: va = vb;
128 S2: a = b; STMT_VINFO_VEC_STMT (stmt_info (S2)) = VS2
130 Operands that are not SSA_NAMEs are data-refs that appear in
131 load/store operations (like 'x[i]' in S1), and are handled differently.
133 Target modeling:
134 =================
135 Currently the only target specific information that is used is the
136 size of the vector (in bytes) - "TARGET_VECTORIZE_UNITS_PER_SIMD_WORD".
137 Targets that can support different vector sizes will, for now, need
138 to specify one value for "TARGET_VECTORIZE_UNITS_PER_SIMD_WORD". More
139 flexibility will be added in the future.
141 Since we only vectorize operations whose vector form can be
142 expressed using existing tree codes, to verify that an operation is
143 supported, the vectorizer checks the relevant optab at the relevant
144 machine_mode (e.g, optab_handler (add_optab, V8HImode)). If
145 the value found is CODE_FOR_nothing, then there's no target support, and
146 we can't vectorize the stmt.
148 For additional information on this project see:
149 http://gcc.gnu.org/projects/tree-ssa/vectorization.html
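     As an illustration of the target check described under "Target modeling"
     above, a minimal sketch (assuming VECTYPE is the candidate vector type for
     an addition) would be:

       if (optab_handler (add_optab, TYPE_MODE (vectype)) == CODE_FOR_nothing)
         return false;    (no target support - we can't vectorize the stmt)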
152 static void vect_estimate_min_profitable_iters (loop_vec_info, int *, int *);
154 /* Function vect_determine_vectorization_factor
156 Determine the vectorization factor (VF). VF is the number of data elements
157 that are operated upon in parallel in a single iteration of the vectorized
158 loop. For example, when vectorizing a loop that operates on 4-byte elements,
159 on a target with a 16-byte vector size (VS), the VF is set to 4, since 4
160 elements can fit in a single vector register.
162 We currently support vectorization of loops in which all types operated upon
163 are of the same size. Therefore this function currently sets VF according to
164 the size of the types operated upon, and fails if there are multiple sizes
165 in the loop.
167 VF is also the factor by which the loop iterations are strip-mined, e.g.:
168 original loop:
169 for (i=0; i<N; i++){
170 a[i] = b[i] + c[i];
173 vectorized loop:
174 for (i=0; i<N; i+=VF){
175 a[i:VF] = b[i:VF] + c[i:VF];
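     In terms of the code below, VF is derived from TYPE_VECTOR_SUBPARTS of the
     vector type chosen for each stmt; e.g. (illustrative) a V8HI vector holding
     8 short elements yields nunits = 8, and the largest nunits seen over all
     stmts in the loop becomes the VF.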
179 static bool
180 vect_determine_vectorization_factor (loop_vec_info loop_vinfo)
182 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
183 basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
184 unsigned nbbs = loop->num_nodes;
185 unsigned int vectorization_factor = 0;
186 tree scalar_type = NULL_TREE;
187 gphi *phi;
188 tree vectype;
189 unsigned int nunits;
190 stmt_vec_info stmt_info;
191 unsigned i;
192 HOST_WIDE_INT dummy;
193 gimple *stmt, *pattern_stmt = NULL;
194 gimple_seq pattern_def_seq = NULL;
195 gimple_stmt_iterator pattern_def_si = gsi_none ();
196 bool analyze_pattern_stmt = false;
197 bool bool_result;
198 auto_vec<stmt_vec_info> mask_producers;
200 if (dump_enabled_p ())
201 dump_printf_loc (MSG_NOTE, vect_location,
202 "=== vect_determine_vectorization_factor ===\n");
204 for (i = 0; i < nbbs; i++)
206 basic_block bb = bbs[i];
208 for (gphi_iterator si = gsi_start_phis (bb); !gsi_end_p (si);
209 gsi_next (&si))
211 phi = si.phi ();
212 stmt_info = vinfo_for_stmt (phi);
213 if (dump_enabled_p ())
215 dump_printf_loc (MSG_NOTE, vect_location, "==> examining phi: ");
216 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, phi, 0);
219 gcc_assert (stmt_info);
221 if (STMT_VINFO_RELEVANT_P (stmt_info)
222 || STMT_VINFO_LIVE_P (stmt_info))
224 gcc_assert (!STMT_VINFO_VECTYPE (stmt_info));
225 scalar_type = TREE_TYPE (PHI_RESULT (phi));
227 if (dump_enabled_p ())
229 dump_printf_loc (MSG_NOTE, vect_location,
230 "get vectype for scalar type: ");
231 dump_generic_expr (MSG_NOTE, TDF_SLIM, scalar_type);
232 dump_printf (MSG_NOTE, "\n");
235 vectype = get_vectype_for_scalar_type (scalar_type);
236 if (!vectype)
238 if (dump_enabled_p ())
240 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
241 "not vectorized: unsupported "
242 "data-type ");
243 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
244 scalar_type);
245 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
247 return false;
249 STMT_VINFO_VECTYPE (stmt_info) = vectype;
251 if (dump_enabled_p ())
253 dump_printf_loc (MSG_NOTE, vect_location, "vectype: ");
254 dump_generic_expr (MSG_NOTE, TDF_SLIM, vectype);
255 dump_printf (MSG_NOTE, "\n");
258 nunits = TYPE_VECTOR_SUBPARTS (vectype);
259 if (dump_enabled_p ())
260 dump_printf_loc (MSG_NOTE, vect_location, "nunits = %d\n",
261 nunits);
263 if (!vectorization_factor
264 || (nunits > vectorization_factor))
265 vectorization_factor = nunits;
269 for (gimple_stmt_iterator si = gsi_start_bb (bb);
270 !gsi_end_p (si) || analyze_pattern_stmt;)
272 tree vf_vectype;
274 if (analyze_pattern_stmt)
275 stmt = pattern_stmt;
276 else
277 stmt = gsi_stmt (si);
279 stmt_info = vinfo_for_stmt (stmt);
281 if (dump_enabled_p ())
283 dump_printf_loc (MSG_NOTE, vect_location,
284 "==> examining statement: ");
285 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, stmt, 0);
288 gcc_assert (stmt_info);
290 /* Skip stmts which do not need to be vectorized. */
291 if ((!STMT_VINFO_RELEVANT_P (stmt_info)
292 && !STMT_VINFO_LIVE_P (stmt_info))
293 || gimple_clobber_p (stmt))
295 if (STMT_VINFO_IN_PATTERN_P (stmt_info)
296 && (pattern_stmt = STMT_VINFO_RELATED_STMT (stmt_info))
297 && (STMT_VINFO_RELEVANT_P (vinfo_for_stmt (pattern_stmt))
298 || STMT_VINFO_LIVE_P (vinfo_for_stmt (pattern_stmt))))
300 stmt = pattern_stmt;
301 stmt_info = vinfo_for_stmt (pattern_stmt);
302 if (dump_enabled_p ())
304 dump_printf_loc (MSG_NOTE, vect_location,
305 "==> examining pattern statement: ");
306 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, stmt, 0);
309 else
311 if (dump_enabled_p ())
312 dump_printf_loc (MSG_NOTE, vect_location, "skip.\n");
313 gsi_next (&si);
314 continue;
317 else if (STMT_VINFO_IN_PATTERN_P (stmt_info)
318 && (pattern_stmt = STMT_VINFO_RELATED_STMT (stmt_info))
319 && (STMT_VINFO_RELEVANT_P (vinfo_for_stmt (pattern_stmt))
320 || STMT_VINFO_LIVE_P (vinfo_for_stmt (pattern_stmt))))
321 analyze_pattern_stmt = true;
323 /* If a pattern statement has def stmts, analyze them too. */
324 if (is_pattern_stmt_p (stmt_info))
326 if (pattern_def_seq == NULL)
328 pattern_def_seq = STMT_VINFO_PATTERN_DEF_SEQ (stmt_info);
329 pattern_def_si = gsi_start (pattern_def_seq);
331 else if (!gsi_end_p (pattern_def_si))
332 gsi_next (&pattern_def_si);
333 if (pattern_def_seq != NULL)
335 gimple *pattern_def_stmt = NULL;
336 stmt_vec_info pattern_def_stmt_info = NULL;
338 while (!gsi_end_p (pattern_def_si))
340 pattern_def_stmt = gsi_stmt (pattern_def_si);
341 pattern_def_stmt_info
342 = vinfo_for_stmt (pattern_def_stmt);
343 if (STMT_VINFO_RELEVANT_P (pattern_def_stmt_info)
344 || STMT_VINFO_LIVE_P (pattern_def_stmt_info))
345 break;
346 gsi_next (&pattern_def_si);
349 if (!gsi_end_p (pattern_def_si))
351 if (dump_enabled_p ())
353 dump_printf_loc (MSG_NOTE, vect_location,
354 "==> examining pattern def stmt: ");
355 dump_gimple_stmt (MSG_NOTE, TDF_SLIM,
356 pattern_def_stmt, 0);
359 stmt = pattern_def_stmt;
360 stmt_info = pattern_def_stmt_info;
362 else
364 pattern_def_si = gsi_none ();
365 analyze_pattern_stmt = false;
368 else
369 analyze_pattern_stmt = false;
372 if (gimple_get_lhs (stmt) == NULL_TREE
373 /* MASK_STORE has no lhs, but is ok. */
374 && (!is_gimple_call (stmt)
375 || !gimple_call_internal_p (stmt)
376 || gimple_call_internal_fn (stmt) != IFN_MASK_STORE))
378 if (is_gimple_call (stmt))
380 /* Ignore calls with no lhs. These must be calls to
381 #pragma omp simd functions, and the vectorization factor
382 they really need can't be determined until
383 vectorizable_simd_clone_call. */
384 if (!analyze_pattern_stmt && gsi_end_p (pattern_def_si))
386 pattern_def_seq = NULL;
387 gsi_next (&si);
389 continue;
391 if (dump_enabled_p ())
393 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
394 "not vectorized: irregular stmt.");
395 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt,
398 return false;
401 if (VECTOR_MODE_P (TYPE_MODE (gimple_expr_type (stmt))))
403 if (dump_enabled_p ())
405 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
406 "not vectorized: vector stmt in loop:");
407 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
409 return false;
412 bool_result = false;
414 if (STMT_VINFO_VECTYPE (stmt_info))
416 /* The only case in which a vectype has already been set is for stmts
417 that contain a dataref, or for "pattern-stmts" (stmts
418 generated by the vectorizer to represent/replace a certain
419 idiom). */
420 gcc_assert (STMT_VINFO_DATA_REF (stmt_info)
421 || is_pattern_stmt_p (stmt_info)
422 || !gsi_end_p (pattern_def_si));
423 vectype = STMT_VINFO_VECTYPE (stmt_info);
425 else
427 gcc_assert (!STMT_VINFO_DATA_REF (stmt_info));
428 if (gimple_call_internal_p (stmt, IFN_MASK_STORE))
429 scalar_type = TREE_TYPE (gimple_call_arg (stmt, 3));
430 else
431 scalar_type = TREE_TYPE (gimple_get_lhs (stmt));
433 /* Bool ops don't participate in vectorization factor
434 computation. For comparisons use the compared types to
435 compute a factor. */
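	  /* E.g. (illustrative): for mask = a < b with int operands, the
	     compared type int drives the VF computation below, while the
	     mask-producing stmt itself is pushed onto MASK_PRODUCERS and has
	     its vectype assigned in the loop over MASK_PRODUCERS at the end
	     of this function.  */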
436 if (VECT_SCALAR_BOOLEAN_TYPE_P (scalar_type)
437 && is_gimple_assign (stmt)
438 && gimple_assign_rhs_code (stmt) != COND_EXPR)
440 if (STMT_VINFO_RELEVANT_P (stmt_info)
441 || STMT_VINFO_LIVE_P (stmt_info))
442 mask_producers.safe_push (stmt_info);
443 bool_result = true;
445 if (TREE_CODE_CLASS (gimple_assign_rhs_code (stmt))
446 == tcc_comparison
447 && !VECT_SCALAR_BOOLEAN_TYPE_P
448 (TREE_TYPE (gimple_assign_rhs1 (stmt))))
449 scalar_type = TREE_TYPE (gimple_assign_rhs1 (stmt));
450 else
452 if (!analyze_pattern_stmt && gsi_end_p (pattern_def_si))
454 pattern_def_seq = NULL;
455 gsi_next (&si);
457 continue;
461 if (dump_enabled_p ())
463 dump_printf_loc (MSG_NOTE, vect_location,
464 "get vectype for scalar type: ");
465 dump_generic_expr (MSG_NOTE, TDF_SLIM, scalar_type);
466 dump_printf (MSG_NOTE, "\n");
468 vectype = get_vectype_for_scalar_type (scalar_type);
469 if (!vectype)
471 if (dump_enabled_p ())
473 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
474 "not vectorized: unsupported "
475 "data-type ");
476 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
477 scalar_type);
478 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
480 return false;
483 if (!bool_result)
484 STMT_VINFO_VECTYPE (stmt_info) = vectype;
486 if (dump_enabled_p ())
488 dump_printf_loc (MSG_NOTE, vect_location, "vectype: ");
489 dump_generic_expr (MSG_NOTE, TDF_SLIM, vectype);
490 dump_printf (MSG_NOTE, "\n");
494 /* Don't try to compute the VF out of scalar types if the stmt
495 produces a boolean vector. Use the result vectype instead. */
496 if (VECTOR_BOOLEAN_TYPE_P (vectype))
497 vf_vectype = vectype;
498 else
500 /* The vectorization factor is determined by the smallest
501 scalar type (or the largest vector size, but we only
502 support one vector size per loop). */
503 if (!bool_result)
504 scalar_type = vect_get_smallest_scalar_type (stmt, &dummy,
505 &dummy);
506 if (dump_enabled_p ())
508 dump_printf_loc (MSG_NOTE, vect_location,
509 "get vectype for scalar type: ");
510 dump_generic_expr (MSG_NOTE, TDF_SLIM, scalar_type);
511 dump_printf (MSG_NOTE, "\n");
513 vf_vectype = get_vectype_for_scalar_type (scalar_type);
515 if (!vf_vectype)
517 if (dump_enabled_p ())
519 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
520 "not vectorized: unsupported data-type ");
521 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
522 scalar_type);
523 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
525 return false;
528 if ((GET_MODE_SIZE (TYPE_MODE (vectype))
529 != GET_MODE_SIZE (TYPE_MODE (vf_vectype))))
531 if (dump_enabled_p ())
533 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
534 "not vectorized: different sized vector "
535 "types in statement, ");
536 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
537 vectype);
538 dump_printf (MSG_MISSED_OPTIMIZATION, " and ");
539 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
540 vf_vectype);
541 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
543 return false;
546 if (dump_enabled_p ())
548 dump_printf_loc (MSG_NOTE, vect_location, "vectype: ");
549 dump_generic_expr (MSG_NOTE, TDF_SLIM, vf_vectype);
550 dump_printf (MSG_NOTE, "\n");
553 nunits = TYPE_VECTOR_SUBPARTS (vf_vectype);
554 if (dump_enabled_p ())
555 dump_printf_loc (MSG_NOTE, vect_location, "nunits = %d\n", nunits);
556 if (!vectorization_factor
557 || (nunits > vectorization_factor))
558 vectorization_factor = nunits;
560 if (!analyze_pattern_stmt && gsi_end_p (pattern_def_si))
562 pattern_def_seq = NULL;
563 gsi_next (&si);
568 /* TODO: Analyze cost. Decide if worth while to vectorize. */
569 if (dump_enabled_p ())
570 dump_printf_loc (MSG_NOTE, vect_location, "vectorization factor = %d\n",
571 vectorization_factor);
572 if (vectorization_factor <= 1)
574 if (dump_enabled_p ())
575 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
576 "not vectorized: unsupported data-type\n");
577 return false;
579 LOOP_VINFO_VECT_FACTOR (loop_vinfo) = vectorization_factor;
581 for (i = 0; i < mask_producers.length (); i++)
583 tree mask_type = NULL;
585 stmt = STMT_VINFO_STMT (mask_producers[i]);
587 if (is_gimple_assign (stmt)
588 && TREE_CODE_CLASS (gimple_assign_rhs_code (stmt)) == tcc_comparison
589 && !VECT_SCALAR_BOOLEAN_TYPE_P
590 (TREE_TYPE (gimple_assign_rhs1 (stmt))))
592 scalar_type = TREE_TYPE (gimple_assign_rhs1 (stmt));
593 mask_type = get_mask_type_for_scalar_type (scalar_type);
595 if (!mask_type)
597 if (dump_enabled_p ())
598 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
599 "not vectorized: unsupported mask\n");
600 return false;
603 else
605 tree rhs;
606 ssa_op_iter iter;
607 gimple *def_stmt;
608 enum vect_def_type dt;
610 FOR_EACH_SSA_TREE_OPERAND (rhs, stmt, iter, SSA_OP_USE)
612 if (!vect_is_simple_use (rhs, mask_producers[i]->vinfo,
613 &def_stmt, &dt, &vectype))
615 if (dump_enabled_p ())
617 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
618 "not vectorized: can't compute mask type "
619 "for statement, ");
620 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt,
623 return false;
626 /* No vectype probably means external definition.
627 Allow it in case there is another operand which
628 allows us to determine the mask type. */
629 if (!vectype)
630 continue;
632 if (!mask_type)
633 mask_type = vectype;
634 else if (TYPE_VECTOR_SUBPARTS (mask_type)
635 != TYPE_VECTOR_SUBPARTS (vectype))
637 if (dump_enabled_p ())
639 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
640 "not vectorized: different sized masks "
641 "types in statement, ");
642 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
643 mask_type);
644 dump_printf (MSG_MISSED_OPTIMIZATION, " and ");
645 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
646 vectype);
647 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
649 return false;
651 else if (VECTOR_BOOLEAN_TYPE_P (mask_type)
652 != VECTOR_BOOLEAN_TYPE_P (vectype))
654 if (dump_enabled_p ())
656 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
657 "not vectorized: mixed mask and "
658 "nonmask vector types in statement, ");
659 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
660 mask_type);
661 dump_printf (MSG_MISSED_OPTIMIZATION, " and ");
662 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
663 vectype);
664 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
666 return false;
670 /* We may compare a boolean value loaded as a vector of integers.
671 Fix mask_type in such a case. */
672 if (mask_type
673 && !VECTOR_BOOLEAN_TYPE_P (mask_type)
674 && gimple_code (stmt) == GIMPLE_ASSIGN
675 && TREE_CODE_CLASS (gimple_assign_rhs_code (stmt)) == tcc_comparison)
676 mask_type = build_same_sized_truth_vector_type (mask_type);
679 /* A missing mask_type should mean a loop-invariant predicate.
680 This is probably a subject for optimization in
681 if-conversion. */
682 if (!mask_type)
684 if (dump_enabled_p ())
686 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
687 "not vectorized: can't compute mask type "
688 "for statement, ");
689 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt,
692 return false;
695 STMT_VINFO_VECTYPE (mask_producers[i]) = mask_type;
698 return true;
702 /* Function vect_is_simple_iv_evolution.
704 FORNOW: A simple evolution of an induction variable in the loop is
705 considered a polynomial evolution. */
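/* For example (illustrative): for the pointer IV in

     for (i = 0; i < n; i++)
       p += 4;

   the access function of p is the chrec {p_0, +, 4}_1 (initial value p_0,
   step 4, in loop number 1), so *INIT would be p_0 and *STEP would be 4.  */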
707 static bool
708 vect_is_simple_iv_evolution (unsigned loop_nb, tree access_fn, tree * init,
709 tree * step)
711 tree init_expr;
712 tree step_expr;
713 tree evolution_part = evolution_part_in_loop_num (access_fn, loop_nb);
714 basic_block bb;
716 /* When there is no evolution in this loop, the evolution function
717 is not "simple". */
718 if (evolution_part == NULL_TREE)
719 return false;
721 /* When the evolution is a polynomial of degree >= 2
722 the evolution function is not "simple". */
723 if (tree_is_chrec (evolution_part))
724 return false;
726 step_expr = evolution_part;
727 init_expr = unshare_expr (initial_condition_in_loop_num (access_fn, loop_nb));
729 if (dump_enabled_p ())
731 dump_printf_loc (MSG_NOTE, vect_location, "step: ");
732 dump_generic_expr (MSG_NOTE, TDF_SLIM, step_expr);
733 dump_printf (MSG_NOTE, ", init: ");
734 dump_generic_expr (MSG_NOTE, TDF_SLIM, init_expr);
735 dump_printf (MSG_NOTE, "\n");
738 *init = init_expr;
739 *step = step_expr;
741 if (TREE_CODE (step_expr) != INTEGER_CST
742 && (TREE_CODE (step_expr) != SSA_NAME
743 || ((bb = gimple_bb (SSA_NAME_DEF_STMT (step_expr)))
744 && flow_bb_inside_loop_p (get_loop (cfun, loop_nb), bb))
745 || (!INTEGRAL_TYPE_P (TREE_TYPE (step_expr))
746 && (!SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr))
747 || !flag_associative_math)))
748 && (TREE_CODE (step_expr) != REAL_CST
749 || !flag_associative_math))
751 if (dump_enabled_p ())
752 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
753 "step unknown.\n");
754 return false;
757 return true;
760 /* Function vect_analyze_scalar_cycles_1.
762 Examine the cross iteration def-use cycles of scalar variables
763 in LOOP. LOOP_VINFO represents the loop that is now being
764 considered for vectorization (can be LOOP, or an outer-loop
765 enclosing LOOP). */
767 static void
768 vect_analyze_scalar_cycles_1 (loop_vec_info loop_vinfo, struct loop *loop)
770 basic_block bb = loop->header;
771 tree init, step;
772 auto_vec<gimple *, 64> worklist;
773 gphi_iterator gsi;
774 bool double_reduc;
776 if (dump_enabled_p ())
777 dump_printf_loc (MSG_NOTE, vect_location,
778 "=== vect_analyze_scalar_cycles ===\n");
780 /* First - identify all inductions. Reduction detection assumes that all the
781 inductions have been identified, therefore, this order must not be
782 changed. */
783 for (gsi = gsi_start_phis (bb); !gsi_end_p (gsi); gsi_next (&gsi))
785 gphi *phi = gsi.phi ();
786 tree access_fn = NULL;
787 tree def = PHI_RESULT (phi);
788 stmt_vec_info stmt_vinfo = vinfo_for_stmt (phi);
790 if (dump_enabled_p ())
792 dump_printf_loc (MSG_NOTE, vect_location, "Analyze phi: ");
793 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, phi, 0);
796 /* Skip virtual phi's. The data dependences that are associated with
797 virtual defs/uses (i.e., memory accesses) are analyzed elsewhere. */
798 if (virtual_operand_p (def))
799 continue;
801 STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_unknown_def_type;
803 /* Analyze the evolution function. */
804 access_fn = analyze_scalar_evolution (loop, def);
805 if (access_fn)
807 STRIP_NOPS (access_fn);
808 if (dump_enabled_p ())
810 dump_printf_loc (MSG_NOTE, vect_location,
811 "Access function of PHI: ");
812 dump_generic_expr (MSG_NOTE, TDF_SLIM, access_fn);
813 dump_printf (MSG_NOTE, "\n");
815 STMT_VINFO_LOOP_PHI_EVOLUTION_BASE_UNCHANGED (stmt_vinfo)
816 = initial_condition_in_loop_num (access_fn, loop->num);
817 STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_vinfo)
818 = evolution_part_in_loop_num (access_fn, loop->num);
821 if (!access_fn
822 || !vect_is_simple_iv_evolution (loop->num, access_fn, &init, &step)
823 || (LOOP_VINFO_LOOP (loop_vinfo) != loop
824 && TREE_CODE (step) != INTEGER_CST))
826 worklist.safe_push (phi);
827 continue;
830 gcc_assert (STMT_VINFO_LOOP_PHI_EVOLUTION_BASE_UNCHANGED (stmt_vinfo)
831 != NULL_TREE);
832 gcc_assert (STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_vinfo) != NULL_TREE);
834 if (dump_enabled_p ())
835 dump_printf_loc (MSG_NOTE, vect_location, "Detected induction.\n");
836 STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_induction_def;
840 /* Second - identify all reductions and nested cycles. */
841 while (worklist.length () > 0)
843 gimple *phi = worklist.pop ();
844 tree def = PHI_RESULT (phi);
845 stmt_vec_info stmt_vinfo = vinfo_for_stmt (phi);
846 gimple *reduc_stmt;
848 if (dump_enabled_p ())
850 dump_printf_loc (MSG_NOTE, vect_location, "Analyze phi: ");
851 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, phi, 0);
854 gcc_assert (!virtual_operand_p (def)
855 && STMT_VINFO_DEF_TYPE (stmt_vinfo) == vect_unknown_def_type);
857 reduc_stmt = vect_force_simple_reduction (loop_vinfo, phi,
858 &double_reduc, false);
859 if (reduc_stmt)
861 if (double_reduc)
863 if (dump_enabled_p ())
864 dump_printf_loc (MSG_NOTE, vect_location,
865 "Detected double reduction.\n");
867 STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_double_reduction_def;
868 STMT_VINFO_DEF_TYPE (vinfo_for_stmt (reduc_stmt)) =
869 vect_double_reduction_def;
871 else
873 if (loop != LOOP_VINFO_LOOP (loop_vinfo))
875 if (dump_enabled_p ())
876 dump_printf_loc (MSG_NOTE, vect_location,
877 "Detected vectorizable nested cycle.\n");
879 STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_nested_cycle;
880 STMT_VINFO_DEF_TYPE (vinfo_for_stmt (reduc_stmt)) =
881 vect_nested_cycle;
883 else
885 if (dump_enabled_p ())
886 dump_printf_loc (MSG_NOTE, vect_location,
887 "Detected reduction.\n");
889 STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_reduction_def;
890 STMT_VINFO_DEF_TYPE (vinfo_for_stmt (reduc_stmt)) =
891 vect_reduction_def;
892 /* Store the reduction cycles for possible vectorization in
893 loop-aware SLP if it was not detected as a reduction
894 chain. */
895 if (! GROUP_FIRST_ELEMENT (vinfo_for_stmt (reduc_stmt)))
896 LOOP_VINFO_REDUCTIONS (loop_vinfo).safe_push (reduc_stmt);
900 else
901 if (dump_enabled_p ())
902 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
903 "Unknown def-use cycle pattern.\n");
908 /* Function vect_analyze_scalar_cycles.
910 Examine the cross iteration def-use cycles of scalar variables, by
911 analyzing the loop-header PHIs of scalar variables. Classify each
912 cycle as one of the following: invariant, induction, reduction, unknown.
913 We do that for the loop represented by LOOP_VINFO, and also for its
914 inner-loop, if it exists.
915 Examples for scalar cycles:
917 Example1: reduction:
919 loop1:
920 for (i=0; i<N; i++)
921 sum += a[i];
923 Example2: induction:
925 loop2:
926 for (i=0; i<N; i++)
927 a[i] = i; */
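/* A cycle that would typically be left classified as unknown (illustrative):

     loop3:
     for (i=0; i<N; i++)
       x = x * 3 + a[i];

   here x is both scaled and accumulated across iterations, which does not
   match the simple reduction forms recognized by
   vect_force_simple_reduction.  */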
929 static void
930 vect_analyze_scalar_cycles (loop_vec_info loop_vinfo)
932 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
934 vect_analyze_scalar_cycles_1 (loop_vinfo, loop);
936 /* When vectorizing an outer-loop, the inner-loop is executed sequentially.
937 Reductions in such an inner-loop therefore have different properties than
938 the reductions in the nest that gets vectorized:
939 1. When vectorized, they are executed in the same order as in the original
940 scalar loop, so we can't change the order of computation when
941 vectorizing them.
942 2. FIXME: Inner-loop reductions can be used in the inner-loop, so the
943 current checks are too strict. */
945 if (loop->inner)
946 vect_analyze_scalar_cycles_1 (loop_vinfo, loop->inner);
949 /* Transfer group and reduction information from STMT to its pattern stmt. */
951 static void
952 vect_fixup_reduc_chain (gimple *stmt)
954 gimple *firstp = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (stmt));
955 gimple *stmtp;
956 gcc_assert (!GROUP_FIRST_ELEMENT (vinfo_for_stmt (firstp))
957 && GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)));
958 GROUP_SIZE (vinfo_for_stmt (firstp)) = GROUP_SIZE (vinfo_for_stmt (stmt));
961 stmtp = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (stmt));
962 GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmtp)) = firstp;
963 stmt = GROUP_NEXT_ELEMENT (vinfo_for_stmt (stmt));
964 if (stmt)
965 GROUP_NEXT_ELEMENT (vinfo_for_stmt (stmtp))
966 = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (stmt));
968 while (stmt);
969 STMT_VINFO_DEF_TYPE (vinfo_for_stmt (stmtp)) = vect_reduction_def;
972 /* Fixup scalar cycles that now have their stmts detected as patterns. */
974 static void
975 vect_fixup_scalar_cycles_with_patterns (loop_vec_info loop_vinfo)
977 gimple *first;
978 unsigned i;
980 FOR_EACH_VEC_ELT (LOOP_VINFO_REDUCTION_CHAINS (loop_vinfo), i, first)
981 if (STMT_VINFO_IN_PATTERN_P (vinfo_for_stmt (first)))
983 gimple *next = GROUP_NEXT_ELEMENT (vinfo_for_stmt (first));
984 while (next)
986 if (! STMT_VINFO_IN_PATTERN_P (vinfo_for_stmt (next)))
987 break;
988 next = GROUP_NEXT_ELEMENT (vinfo_for_stmt (next));
990 /* If not all stmts in the chain are patterns, try to handle
991 the chain without patterns. */
992 if (! next)
994 vect_fixup_reduc_chain (first);
995 LOOP_VINFO_REDUCTION_CHAINS (loop_vinfo)[i]
996 = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (first));
1001 /* Function vect_get_loop_niters.
1003 Determine the number of iterations the loop executes and place it
1004 in NUMBER_OF_ITERATIONS. Place the number of latch iterations
1005 in NUMBER_OF_ITERATIONSM1. Place the condition under which the
1006 niter information holds in ASSUMPTIONS.
1008 Return the loop exit condition. */
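/* For example (illustrative): for a loop driven by for (i = 0; i < 4; i++)
   the latch executes 3 times, so NUMBER_OF_ITERATIONSM1 is 3 and
   NUMBER_OF_ITERATIONS (the number of header executions) is 4; ASSUMPTIONS
   stays boolean_true_node when the niter analysis needs no extra
   conditions.  */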
1011 static gcond *
1012 vect_get_loop_niters (struct loop *loop, tree *assumptions,
1013 tree *number_of_iterations, tree *number_of_iterationsm1)
1015 edge exit = single_exit (loop);
1016 struct tree_niter_desc niter_desc;
1017 tree niter_assumptions, niter, may_be_zero;
1018 gcond *cond = get_loop_exit_condition (loop);
1020 *assumptions = boolean_true_node;
1021 *number_of_iterationsm1 = chrec_dont_know;
1022 *number_of_iterations = chrec_dont_know;
1023 if (dump_enabled_p ())
1024 dump_printf_loc (MSG_NOTE, vect_location,
1025 "=== get_loop_niters ===\n");
1027 if (!exit)
1028 return cond;
1030 niter = chrec_dont_know;
1031 may_be_zero = NULL_TREE;
1032 niter_assumptions = boolean_true_node;
1033 if (!number_of_iterations_exit_assumptions (loop, exit, &niter_desc, NULL)
1034 || chrec_contains_undetermined (niter_desc.niter))
1035 return cond;
1037 niter_assumptions = niter_desc.assumptions;
1038 may_be_zero = niter_desc.may_be_zero;
1039 niter = niter_desc.niter;
1041 if (may_be_zero && integer_zerop (may_be_zero))
1042 may_be_zero = NULL_TREE;
1044 if (may_be_zero)
1046 if (COMPARISON_CLASS_P (may_be_zero))
1048 /* Try to combine may_be_zero with assumptions, this can simplify
1049 computation of niter expression. */
1050 if (niter_assumptions && !integer_nonzerop (niter_assumptions))
1051 niter_assumptions = fold_build2 (TRUTH_AND_EXPR, boolean_type_node,
1052 niter_assumptions,
1053 fold_build1 (TRUTH_NOT_EXPR,
1054 boolean_type_node,
1055 may_be_zero));
1056 else
1057 niter = fold_build3 (COND_EXPR, TREE_TYPE (niter), may_be_zero,
1058 build_int_cst (TREE_TYPE (niter), 0), niter);
1060 may_be_zero = NULL_TREE;
1062 else if (integer_nonzerop (may_be_zero))
1064 *number_of_iterationsm1 = build_int_cst (TREE_TYPE (niter), 0);
1065 *number_of_iterations = build_int_cst (TREE_TYPE (niter), 1);
1066 return cond;
1068 else
1069 return cond;
1072 *assumptions = niter_assumptions;
1073 *number_of_iterationsm1 = niter;
1075 /* We want the number of loop header executions which is the number
1076 of latch executions plus one.
1077 ??? For UINT_MAX latch executions this number overflows to zero
1078 for loops like do { n++; } while (n != 0); */
1079 if (niter && !chrec_contains_undetermined (niter))
1080 niter = fold_build2 (PLUS_EXPR, TREE_TYPE (niter), unshare_expr (niter),
1081 build_int_cst (TREE_TYPE (niter), 1));
1082 *number_of_iterations = niter;
1084 return cond;
1087 /* Function bb_in_loop_p
1089 Used as predicate for dfs order traversal of the loop bbs. */
1091 static bool
1092 bb_in_loop_p (const_basic_block bb, const void *data)
1094 const struct loop *const loop = (const struct loop *)data;
1095 if (flow_bb_inside_loop_p (loop, bb))
1096 return true;
1097 return false;
1101 /* Create and initialize a new loop_vec_info struct for LOOP_IN, as well as
1102 stmt_vec_info structs for all the stmts in LOOP_IN. */
1104 _loop_vec_info::_loop_vec_info (struct loop *loop_in)
1105 : vec_info (vec_info::loop, init_cost (loop_in)),
1106 loop (loop_in),
1107 bbs (XCNEWVEC (basic_block, loop->num_nodes)),
1108 num_itersm1 (NULL_TREE),
1109 num_iters (NULL_TREE),
1110 num_iters_unchanged (NULL_TREE),
1111 num_iters_assumptions (NULL_TREE),
1112 th (0),
1113 vectorization_factor (0),
1114 max_vectorization_factor (0),
1115 unaligned_dr (NULL),
1116 peeling_for_alignment (0),
1117 ptr_mask (0),
1118 slp_unrolling_factor (1),
1119 single_scalar_iteration_cost (0),
1120 vectorizable (false),
1121 peeling_for_gaps (false),
1122 peeling_for_niter (false),
1123 operands_swapped (false),
1124 no_data_dependencies (false),
1125 has_mask_store (false),
1126 scalar_loop (NULL),
1127 orig_loop_info (NULL)
1129 /* Create/Update stmt_info for all stmts in the loop. */
1130 basic_block *body = get_loop_body (loop);
1131 for (unsigned int i = 0; i < loop->num_nodes; i++)
1133 basic_block bb = body[i];
1134 gimple_stmt_iterator si;
1136 for (si = gsi_start_phis (bb); !gsi_end_p (si); gsi_next (&si))
1138 gimple *phi = gsi_stmt (si);
1139 gimple_set_uid (phi, 0);
1140 set_vinfo_for_stmt (phi, new_stmt_vec_info (phi, this));
1143 for (si = gsi_start_bb (bb); !gsi_end_p (si); gsi_next (&si))
1145 gimple *stmt = gsi_stmt (si);
1146 gimple_set_uid (stmt, 0);
1147 set_vinfo_for_stmt (stmt, new_stmt_vec_info (stmt, this));
1150 free (body);
1152 /* CHECKME: We want to visit all BBs before their successors (except for
1153 latch blocks, for which this assertion wouldn't hold). In the simple
1154 case of the loop forms we allow, a dfs order of the BBs would be the same
1155 as reversed postorder traversal, so we are safe. */
1157 unsigned int nbbs = dfs_enumerate_from (loop->header, 0, bb_in_loop_p,
1158 bbs, loop->num_nodes, loop);
1159 gcc_assert (nbbs == loop->num_nodes);
1163 /* Free all memory used by the _loop_vec_info, as well as all the
1164 stmt_vec_info structs of all the stmts in the loop. */
1166 _loop_vec_info::~_loop_vec_info ()
1168 int nbbs;
1169 gimple_stmt_iterator si;
1170 int j;
1172 nbbs = loop->num_nodes;
1173 for (j = 0; j < nbbs; j++)
1175 basic_block bb = bbs[j];
1176 for (si = gsi_start_phis (bb); !gsi_end_p (si); gsi_next (&si))
1177 free_stmt_vec_info (gsi_stmt (si));
1179 for (si = gsi_start_bb (bb); !gsi_end_p (si); )
1181 gimple *stmt = gsi_stmt (si);
1183 /* We may have broken canonical form by moving a constant
1184 into RHS1 of a commutative op. Fix such occurrences. */
1185 if (operands_swapped && is_gimple_assign (stmt))
1187 enum tree_code code = gimple_assign_rhs_code (stmt);
1189 if ((code == PLUS_EXPR
1190 || code == POINTER_PLUS_EXPR
1191 || code == MULT_EXPR)
1192 && CONSTANT_CLASS_P (gimple_assign_rhs1 (stmt)))
1193 swap_ssa_operands (stmt,
1194 gimple_assign_rhs1_ptr (stmt),
1195 gimple_assign_rhs2_ptr (stmt));
1196 else if (code == COND_EXPR
1197 && CONSTANT_CLASS_P (gimple_assign_rhs2 (stmt)))
1199 tree cond_expr = gimple_assign_rhs1 (stmt);
1200 enum tree_code cond_code = TREE_CODE (cond_expr);
1202 if (TREE_CODE_CLASS (cond_code) == tcc_comparison)
1204 bool honor_nans = HONOR_NANS (TREE_OPERAND (cond_expr,
1205 0));
1206 cond_code = invert_tree_comparison (cond_code,
1207 honor_nans);
1208 if (cond_code != ERROR_MARK)
1210 TREE_SET_CODE (cond_expr, cond_code);
1211 swap_ssa_operands (stmt,
1212 gimple_assign_rhs2_ptr (stmt),
1213 gimple_assign_rhs3_ptr (stmt));
1219 /* Free stmt_vec_info. */
1220 free_stmt_vec_info (stmt);
1221 gsi_next (&si);
1225 free (bbs);
1227 loop->aux = NULL;
1231 /* Calculate the cost of one scalar iteration of the loop. */
1232 static void
1233 vect_compute_single_scalar_iteration_cost (loop_vec_info loop_vinfo)
1235 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1236 basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
1237 int nbbs = loop->num_nodes, factor, scalar_single_iter_cost = 0;
1238 int innerloop_iters, i;
1240 /* Count statements in the scalar loop. We use this as the scalar cost of a
1241 single iteration for now.
1243 TODO: Add outer loop support.
1245 TODO: Consider assigning different costs to different scalar
1246 statements. */
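  /* E.g. (illustrative): a body with one load, one add and one store
     contributes one scalar_load, one scalar_stmt and one scalar_store cost
     per iteration; stmts inside an inner loop are additionally weighted by
     the innerloop_iters factor below.  */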
1248 /* FORNOW. */
1249 innerloop_iters = 1;
1250 if (loop->inner)
1251 innerloop_iters = 50; /* FIXME */
1253 for (i = 0; i < nbbs; i++)
1255 gimple_stmt_iterator si;
1256 basic_block bb = bbs[i];
1258 if (bb->loop_father == loop->inner)
1259 factor = innerloop_iters;
1260 else
1261 factor = 1;
1263 for (si = gsi_start_bb (bb); !gsi_end_p (si); gsi_next (&si))
1265 gimple *stmt = gsi_stmt (si);
1266 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
1268 if (!is_gimple_assign (stmt) && !is_gimple_call (stmt))
1269 continue;
1271 /* Skip stmts that are not vectorized inside the loop. */
1272 if (stmt_info
1273 && !STMT_VINFO_RELEVANT_P (stmt_info)
1274 && (!STMT_VINFO_LIVE_P (stmt_info)
1275 || !VECTORIZABLE_CYCLE_DEF (STMT_VINFO_DEF_TYPE (stmt_info)))
1276 && !STMT_VINFO_IN_PATTERN_P (stmt_info))
1277 continue;
1279 vect_cost_for_stmt kind;
1280 if (STMT_VINFO_DATA_REF (stmt_info))
1282 if (DR_IS_READ (STMT_VINFO_DATA_REF (stmt_info)))
1283 kind = scalar_load;
1284 else
1285 kind = scalar_store;
1287 else
1288 kind = scalar_stmt;
1290 scalar_single_iter_cost
1291 += record_stmt_cost (&LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo),
1292 factor, kind, stmt_info, 0, vect_prologue);
1295 LOOP_VINFO_SINGLE_SCALAR_ITERATION_COST (loop_vinfo)
1296 = scalar_single_iter_cost;
1300 /* Function vect_analyze_loop_form_1.
1302 Verify that certain CFG restrictions hold, including:
1303 - the loop has a pre-header
1304 - the loop has a single entry and exit
1305 - the loop exit condition is simple enough
1306 - the number of iterations can be analyzed, i.e., a countable loop. The
1307 niter could be analyzed under some assumptions. */
1309 bool
1310 vect_analyze_loop_form_1 (struct loop *loop, gcond **loop_cond,
1311 tree *assumptions, tree *number_of_iterationsm1,
1312 tree *number_of_iterations, gcond **inner_loop_cond)
1314 if (dump_enabled_p ())
1315 dump_printf_loc (MSG_NOTE, vect_location,
1316 "=== vect_analyze_loop_form ===\n");
1318 /* Different restrictions apply when we are considering an inner-most loop,
1319 vs. an outer (nested) loop.
1320 (FORNOW. May want to relax some of these restrictions in the future). */
1322 if (!loop->inner)
1324 /* Inner-most loop. We currently require that the number of BBs is
1325 exactly 2 (the header and latch). Vectorizable inner-most loops
1326 look like this:
1328 (pre-header)
1330 header <--------+
1331 | | |
1332 | +--> latch --+
1334 (exit-bb) */
1336 if (loop->num_nodes != 2)
1338 if (dump_enabled_p ())
1339 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1340 "not vectorized: control flow in loop.\n");
1341 return false;
1344 if (empty_block_p (loop->header))
1346 if (dump_enabled_p ())
1347 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1348 "not vectorized: empty loop.\n");
1349 return false;
1352 else
1354 struct loop *innerloop = loop->inner;
1355 edge entryedge;
1357 /* Nested loop. We currently require that the loop is doubly-nested,
1358 contains a single inner loop, and the number of BBs is exactly 5.
1359 Vectorizable outer-loops look like this:
1361 (pre-header)
1363 header <---+
1365 inner-loop |
1367 tail ------+
1369 (exit-bb)
1371 The inner-loop has the properties expected of inner-most loops
1372 as described above. */
1374 if ((loop->inner)->inner || (loop->inner)->next)
1376 if (dump_enabled_p ())
1377 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1378 "not vectorized: multiple nested loops.\n");
1379 return false;
1382 if (loop->num_nodes != 5)
1384 if (dump_enabled_p ())
1385 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1386 "not vectorized: control flow in loop.\n");
1387 return false;
1390 entryedge = loop_preheader_edge (innerloop);
1391 if (entryedge->src != loop->header
1392 || !single_exit (innerloop)
1393 || single_exit (innerloop)->dest != EDGE_PRED (loop->latch, 0)->src)
1395 if (dump_enabled_p ())
1396 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1397 "not vectorized: unsupported outerloop form.\n");
1398 return false;
1401 /* Analyze the inner-loop. */
1402 tree inner_niterm1, inner_niter, inner_assumptions;
1403 if (! vect_analyze_loop_form_1 (loop->inner, inner_loop_cond,
1404 &inner_assumptions, &inner_niterm1,
1405 &inner_niter, NULL)
1406 /* Don't support analyzing niter under assumptions for inner
1407 loop. */
1408 || !integer_onep (inner_assumptions))
1410 if (dump_enabled_p ())
1411 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1412 "not vectorized: Bad inner loop.\n");
1413 return false;
1416 if (!expr_invariant_in_loop_p (loop, inner_niter))
1418 if (dump_enabled_p ())
1419 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1420 "not vectorized: inner-loop count not"
1421 " invariant.\n");
1422 return false;
1425 if (dump_enabled_p ())
1426 dump_printf_loc (MSG_NOTE, vect_location,
1427 "Considering outer-loop vectorization.\n");
1430 if (!single_exit (loop)
1431 || EDGE_COUNT (loop->header->preds) != 2)
1433 if (dump_enabled_p ())
1435 if (!single_exit (loop))
1436 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1437 "not vectorized: multiple exits.\n");
1438 else if (EDGE_COUNT (loop->header->preds) != 2)
1439 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1440 "not vectorized: too many incoming edges.\n");
1442 return false;
1445 /* We assume that the loop exit condition is at the end of the loop, i.e.,
1446 that the loop is represented as a do-while (with a proper if-guard
1447 before the loop if needed), where the loop header contains all the
1448 executable statements, and the latch is empty. */
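  /* E.g. (illustrative), the accepted shape corresponds to source of the form

       if (n > 0)
         do { a[i] = b[i] + c[i]; i++; } while (i < n);

     where the exit test is the last statement executed in the loop body.  */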
1449 if (!empty_block_p (loop->latch)
1450 || !gimple_seq_empty_p (phi_nodes (loop->latch)))
1452 if (dump_enabled_p ())
1453 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1454 "not vectorized: latch block not empty.\n");
1455 return false;
1458 /* Make sure the exit is not abnormal. */
1459 edge e = single_exit (loop);
1460 if (e->flags & EDGE_ABNORMAL)
1462 if (dump_enabled_p ())
1463 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1464 "not vectorized: abnormal loop exit edge.\n");
1465 return false;
1468 *loop_cond = vect_get_loop_niters (loop, assumptions, number_of_iterations,
1469 number_of_iterationsm1);
1470 if (!*loop_cond)
1472 if (dump_enabled_p ())
1473 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1474 "not vectorized: complicated exit condition.\n");
1475 return false;
1478 if (integer_zerop (*assumptions)
1479 || !*number_of_iterations
1480 || chrec_contains_undetermined (*number_of_iterations))
1482 if (dump_enabled_p ())
1483 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1484 "not vectorized: number of iterations cannot be "
1485 "computed.\n");
1486 return false;
1489 if (integer_zerop (*number_of_iterations))
1491 if (dump_enabled_p ())
1492 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1493 "not vectorized: number of iterations = 0.\n");
1494 return false;
1497 return true;
1500 /* Analyze LOOP form and return a loop_vec_info if it is of suitable form. */
1502 loop_vec_info
1503 vect_analyze_loop_form (struct loop *loop)
1505 tree assumptions, number_of_iterations, number_of_iterationsm1;
1506 gcond *loop_cond, *inner_loop_cond = NULL;
1508 if (! vect_analyze_loop_form_1 (loop, &loop_cond,
1509 &assumptions, &number_of_iterationsm1,
1510 &number_of_iterations, &inner_loop_cond))
1511 return NULL;
1513 loop_vec_info loop_vinfo = new _loop_vec_info (loop);
1514 LOOP_VINFO_NITERSM1 (loop_vinfo) = number_of_iterationsm1;
1515 LOOP_VINFO_NITERS (loop_vinfo) = number_of_iterations;
1516 LOOP_VINFO_NITERS_UNCHANGED (loop_vinfo) = number_of_iterations;
1517 if (!integer_onep (assumptions))
1519 /* We consider vectorizing this loop by versioning it under
1520 some assumptions. In order to do this, we need to clear
1521 existing information computed by scev and niter analyzer. */
1522 scev_reset_htab ();
1523 free_numbers_of_iterations_estimates (loop);
1524 /* Also set a flag for this loop so that subsequent scev and niter
1525 analyses are done under the assumptions.
1526 loop_constraint_set (loop, LOOP_C_FINITE);
1527 /* Also record the assumptions for versioning. */
1528 LOOP_VINFO_NITERS_ASSUMPTIONS (loop_vinfo) = assumptions;
1531 if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
1533 if (dump_enabled_p ())
1535 dump_printf_loc (MSG_NOTE, vect_location,
1536 "Symbolic number of iterations is ");
1537 dump_generic_expr (MSG_NOTE, TDF_DETAILS, number_of_iterations);
1538 dump_printf (MSG_NOTE, "\n");
1542 STMT_VINFO_TYPE (vinfo_for_stmt (loop_cond)) = loop_exit_ctrl_vec_info_type;
1543 if (inner_loop_cond)
1544 STMT_VINFO_TYPE (vinfo_for_stmt (inner_loop_cond))
1545 = loop_exit_ctrl_vec_info_type;
1547 gcc_assert (!loop->aux);
1548 loop->aux = loop_vinfo;
1549 return loop_vinfo;
1554 /* Scan the loop stmts and, depending on whether there are any (non-)SLP
1555 statements, update the vectorization factor. */
1557 static void
1558 vect_update_vf_for_slp (loop_vec_info loop_vinfo)
1560 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1561 basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
1562 int nbbs = loop->num_nodes;
1563 unsigned int vectorization_factor;
1564 int i;
1566 if (dump_enabled_p ())
1567 dump_printf_loc (MSG_NOTE, vect_location,
1568 "=== vect_update_vf_for_slp ===\n");
1570 vectorization_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
1571 gcc_assert (vectorization_factor != 0);
1573 /* If all the stmts in the loop can be SLPed, we perform only SLP, and
1574 the vectorization factor of the loop is the unrolling factor required by
1575 the SLP instances. If that unrolling factor is 1, we say that we
1576 perform pure SLP on the loop - cross-iteration parallelism is not
1577 exploited. */
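  /* E.g. (illustrative): with a VF of 4 from vect_determine_vectorization_factor
     and SLP instances requiring an unrolling factor of 2, a loop that also has
     non-SLP stmts gets VF = least_common_multiple (4, 2) = 4, whereas a loop
     containing only SLP stmts would use the unrolling factor 2 directly.  */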
1578 bool only_slp_in_loop = true;
1579 for (i = 0; i < nbbs; i++)
1581 basic_block bb = bbs[i];
1582 for (gimple_stmt_iterator si = gsi_start_bb (bb); !gsi_end_p (si);
1583 gsi_next (&si))
1585 gimple *stmt = gsi_stmt (si);
1586 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
1587 if (STMT_VINFO_IN_PATTERN_P (stmt_info)
1588 && STMT_VINFO_RELATED_STMT (stmt_info))
1590 stmt = STMT_VINFO_RELATED_STMT (stmt_info);
1591 stmt_info = vinfo_for_stmt (stmt);
1593 if ((STMT_VINFO_RELEVANT_P (stmt_info)
1594 || VECTORIZABLE_CYCLE_DEF (STMT_VINFO_DEF_TYPE (stmt_info)))
1595 && !PURE_SLP_STMT (stmt_info))
1596 /* STMT needs both SLP and loop-based vectorization. */
1597 only_slp_in_loop = false;
1601 if (only_slp_in_loop)
1603 dump_printf_loc (MSG_NOTE, vect_location,
1604 "Loop contains only SLP stmts\n");
1605 vectorization_factor = LOOP_VINFO_SLP_UNROLLING_FACTOR (loop_vinfo);
1607 else
1609 dump_printf_loc (MSG_NOTE, vect_location,
1610 "Loop contains SLP and non-SLP stmts\n");
1611 vectorization_factor
1612 = least_common_multiple (vectorization_factor,
1613 LOOP_VINFO_SLP_UNROLLING_FACTOR (loop_vinfo));
1616 LOOP_VINFO_VECT_FACTOR (loop_vinfo) = vectorization_factor;
1617 if (dump_enabled_p ())
1618 dump_printf_loc (MSG_NOTE, vect_location,
1619 "Updating vectorization factor to %d\n",
1620 vectorization_factor);
1623 /* Function vect_analyze_loop_operations.
1625 Scan the loop stmts and make sure they are all vectorizable. */
1627 static bool
1628 vect_analyze_loop_operations (loop_vec_info loop_vinfo)
1630 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1631 basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
1632 int nbbs = loop->num_nodes;
1633 int i;
1634 stmt_vec_info stmt_info;
1635 bool need_to_vectorize = false;
1636 bool ok;
1638 if (dump_enabled_p ())
1639 dump_printf_loc (MSG_NOTE, vect_location,
1640 "=== vect_analyze_loop_operations ===\n");
1642 for (i = 0; i < nbbs; i++)
1644 basic_block bb = bbs[i];
1646 for (gphi_iterator si = gsi_start_phis (bb); !gsi_end_p (si);
1647 gsi_next (&si))
1649 gphi *phi = si.phi ();
1650 ok = true;
1652 stmt_info = vinfo_for_stmt (phi);
1653 if (dump_enabled_p ())
1655 dump_printf_loc (MSG_NOTE, vect_location, "examining phi: ");
1656 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, phi, 0);
1658 if (virtual_operand_p (gimple_phi_result (phi)))
1659 continue;
1661 /* Inner-loop loop-closed exit phi in outer-loop vectorization
1662 (i.e., a phi in the tail of the outer-loop). */
1663 if (! is_loop_header_bb_p (bb))
1665 /* FORNOW: we currently don't support the case that these phis
1666 are not used in the outer-loop (unless it is a double reduction,
1667 i.e., this phi is vect_reduction_def), because this case
1668 requires us to actually do something here.
1669 if (STMT_VINFO_LIVE_P (stmt_info)
1670 && STMT_VINFO_DEF_TYPE (stmt_info)
1671 != vect_double_reduction_def)
1673 if (dump_enabled_p ())
1674 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1675 "Unsupported loop-closed phi in "
1676 "outer-loop.\n");
1677 return false;
1680 /* If PHI is used in the outer loop, we check that its operand
1681 is defined in the inner loop. */
1682 if (STMT_VINFO_RELEVANT_P (stmt_info))
1684 tree phi_op;
1685 gimple *op_def_stmt;
1687 if (gimple_phi_num_args (phi) != 1)
1688 return false;
1690 phi_op = PHI_ARG_DEF (phi, 0);
1691 if (TREE_CODE (phi_op) != SSA_NAME)
1692 return false;
1694 op_def_stmt = SSA_NAME_DEF_STMT (phi_op);
1695 if (gimple_nop_p (op_def_stmt)
1696 || !flow_bb_inside_loop_p (loop, gimple_bb (op_def_stmt))
1697 || !vinfo_for_stmt (op_def_stmt))
1698 return false;
1700 if (STMT_VINFO_RELEVANT (vinfo_for_stmt (op_def_stmt))
1701 != vect_used_in_outer
1702 && STMT_VINFO_RELEVANT (vinfo_for_stmt (op_def_stmt))
1703 != vect_used_in_outer_by_reduction)
1704 return false;
1707 continue;
1710 gcc_assert (stmt_info);
1712 if ((STMT_VINFO_RELEVANT (stmt_info) == vect_used_in_scope
1713 || STMT_VINFO_LIVE_P (stmt_info))
1714 && STMT_VINFO_DEF_TYPE (stmt_info) != vect_induction_def)
1716 /* A scalar-dependence cycle that we don't support. */
1717 if (dump_enabled_p ())
1718 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1719 "not vectorized: scalar dependence cycle.\n");
1720 return false;
1723 if (STMT_VINFO_RELEVANT_P (stmt_info))
1725 need_to_vectorize = true;
1726 if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_induction_def
1727 && ! PURE_SLP_STMT (stmt_info))
1728 ok = vectorizable_induction (phi, NULL, NULL, NULL);
1729 else if ((STMT_VINFO_DEF_TYPE (stmt_info) == vect_reduction_def
1730 || STMT_VINFO_DEF_TYPE (stmt_info) == vect_nested_cycle)
1731 && ! PURE_SLP_STMT (stmt_info))
1732 ok = vectorizable_reduction (phi, NULL, NULL, NULL, NULL);
1735 if (ok && STMT_VINFO_LIVE_P (stmt_info))
1736 ok = vectorizable_live_operation (phi, NULL, NULL, -1, NULL);
1738 if (!ok)
1740 if (dump_enabled_p ())
1742 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1743 "not vectorized: relevant phi not "
1744 "supported: ");
1745 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, phi, 0);
1747 return false;
1751 for (gimple_stmt_iterator si = gsi_start_bb (bb); !gsi_end_p (si);
1752 gsi_next (&si))
1754 gimple *stmt = gsi_stmt (si);
1755 if (!gimple_clobber_p (stmt)
1756 && !vect_analyze_stmt (stmt, &need_to_vectorize, NULL, NULL))
1757 return false;
1759 } /* bbs */
1761 /* All operations in the loop are either irrelevant (they deal with loop
1762 control, or are dead), or are only used outside the loop and can be moved
1763 out of the loop (e.g. invariants, inductions). The loop can be
1764 optimized away by scalar optimizations. We're better off not
1765 touching this loop. */
1766 if (!need_to_vectorize)
1768 if (dump_enabled_p ())
1769 dump_printf_loc (MSG_NOTE, vect_location,
1770 "All the computation can be taken out of the loop.\n");
1771 if (dump_enabled_p ())
1772 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1773 "not vectorized: redundant loop. no profit to "
1774 "vectorize.\n");
1775 return false;
1778 return true;
1782 /* Function vect_analyze_loop_2.
1784 Apply a set of analyses on LOOP, and create a loop_vec_info struct
1785 for it. The different analyses will record information in the
1786 loop_vec_info struct. */
1787 static bool
1788 vect_analyze_loop_2 (loop_vec_info loop_vinfo, bool &fatal)
1790 bool ok;
1791 int max_vf = MAX_VECTORIZATION_FACTOR;
1792 int min_vf = 2;
1793 unsigned int n_stmts = 0;
1795 /* The first group of checks is independent of the vector size. */
1796 fatal = true;
1798 /* Find all data references in the loop (which correspond to vdefs/vuses)
1799 and analyze their evolution in the loop. */
1801 basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
1803 loop_p loop = LOOP_VINFO_LOOP (loop_vinfo);
1804 if (!find_loop_nest (loop, &LOOP_VINFO_LOOP_NEST (loop_vinfo)))
1806 if (dump_enabled_p ())
1807 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1808 "not vectorized: loop nest containing two "
1809 "or more consecutive inner loops cannot be "
1810 "vectorized\n");
1811 return false;
1814 for (unsigned i = 0; i < loop->num_nodes; i++)
1815 for (gimple_stmt_iterator gsi = gsi_start_bb (bbs[i]);
1816 !gsi_end_p (gsi); gsi_next (&gsi))
1818 gimple *stmt = gsi_stmt (gsi);
1819 if (is_gimple_debug (stmt))
1820 continue;
1821 ++n_stmts;
1822 if (!find_data_references_in_stmt (loop, stmt,
1823 &LOOP_VINFO_DATAREFS (loop_vinfo)))
1825 if (is_gimple_call (stmt) && loop->safelen)
1827 tree fndecl = gimple_call_fndecl (stmt), op;
1828 if (fndecl != NULL_TREE)
1830 cgraph_node *node = cgraph_node::get (fndecl);
1831 if (node != NULL && node->simd_clones != NULL)
1833 unsigned int j, n = gimple_call_num_args (stmt);
1834 for (j = 0; j < n; j++)
1836 op = gimple_call_arg (stmt, j);
1837 if (DECL_P (op)
1838 || (REFERENCE_CLASS_P (op)
1839 && get_base_address (op)))
1840 break;
1842 op = gimple_call_lhs (stmt);
1843 /* Ignore #pragma omp declare simd functions
1844 if they don't have data references in the
1845 call stmt itself. */
1846 if (j == n
1847 && !(op
1848 && (DECL_P (op)
1849 || (REFERENCE_CLASS_P (op)
1850 && get_base_address (op)))))
1851 continue;
1855 if (dump_enabled_p ())
1856 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1857 "not vectorized: loop contains function "
1858 "calls or data references that cannot "
1859 "be analyzed\n");
1860 return false;
1864 /* Analyze the data references and also adjust the minimal
1865 vectorization factor according to the loads and stores. */
1867 ok = vect_analyze_data_refs (loop_vinfo, &min_vf);
1868 if (!ok)
1870 if (dump_enabled_p ())
1871 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1872 "bad data references.\n");
1873 return false;
1876 /* Classify all cross-iteration scalar data-flow cycles.
1877 Cross-iteration cycles caused by virtual phis are analyzed separately. */
1878 vect_analyze_scalar_cycles (loop_vinfo);
1880 vect_pattern_recog (loop_vinfo);
1882 vect_fixup_scalar_cycles_with_patterns (loop_vinfo);
1884 /* Analyze the access patterns of the data-refs in the loop (consecutive,
1885 complex, etc.). FORNOW: Only handle consecutive access pattern. */
1887 ok = vect_analyze_data_ref_accesses (loop_vinfo);
1888 if (!ok)
1890 if (dump_enabled_p ())
1891 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1892 "bad data access.\n");
1893 return false;
1896 /* Data-flow analysis to detect stmts that do not need to be vectorized. */
1898 ok = vect_mark_stmts_to_be_vectorized (loop_vinfo);
1899 if (!ok)
1901 if (dump_enabled_p ())
1902 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1903 "unexpected pattern.\n");
1904 return false;
1907 /* While the rest of the analysis below depends on it in some way. */
1908 fatal = false;
1910 /* Analyze data dependences between the data-refs in the loop
1911 and adjust the maximum vectorization factor according to
1912 the dependences.
1913 FORNOW: fail at the first data dependence that we encounter. */
1915 ok = vect_analyze_data_ref_dependences (loop_vinfo, &max_vf);
1916 if (!ok
1917 || max_vf < min_vf)
1919 if (dump_enabled_p ())
1920 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1921 "bad data dependence.\n");
1922 return false;
1924 LOOP_VINFO_MAX_VECT_FACTOR (loop_vinfo) = max_vf;
1926 ok = vect_determine_vectorization_factor (loop_vinfo);
1927 if (!ok)
1929 if (dump_enabled_p ())
1930 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1931 "can't determine vectorization factor.\n");
1932 return false;
1934 if (max_vf < LOOP_VINFO_VECT_FACTOR (loop_vinfo))
1936 if (dump_enabled_p ())
1937 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1938 "bad data dependence.\n");
1939 return false;
1942 /* Compute the scalar iteration cost. */
1943 vect_compute_single_scalar_iteration_cost (loop_vinfo);
1945 int saved_vectorization_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
1946 HOST_WIDE_INT estimated_niter;
1947 unsigned th;
1948 int min_scalar_loop_bound;
1950 /* Check the SLP opportunities in the loop, analyze and build SLP trees. */
1951 ok = vect_analyze_slp (loop_vinfo, n_stmts);
1952 if (!ok)
1953 return false;
1955 /* If there are any SLP instances mark them as pure_slp. */
1956 bool slp = vect_make_slp_decision (loop_vinfo);
1957 if (slp)
1959 /* Find stmts that need to be both vectorized and SLPed. */
1960 vect_detect_hybrid_slp (loop_vinfo);
1962 /* Update the vectorization factor based on the SLP decision. */
1963 vect_update_vf_for_slp (loop_vinfo);
1966 /* This is the point where we can re-start analysis with SLP forced off. */
1967 start_over:
1969 /* Now the vectorization factor is final. */
1970 unsigned vectorization_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
1971 gcc_assert (vectorization_factor != 0);
1973 if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo) && dump_enabled_p ())
1974 dump_printf_loc (MSG_NOTE, vect_location,
1975 "vectorization_factor = %d, niters = "
1976 HOST_WIDE_INT_PRINT_DEC "\n", vectorization_factor,
1977 LOOP_VINFO_INT_NITERS (loop_vinfo));
1979 HOST_WIDE_INT max_niter
1980 = likely_max_stmt_executions_int (LOOP_VINFO_LOOP (loop_vinfo));
1981 if ((LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
1982 && (LOOP_VINFO_INT_NITERS (loop_vinfo) < vectorization_factor))
1983 || (max_niter != -1
1984 && (unsigned HOST_WIDE_INT) max_niter < vectorization_factor))
1986 if (dump_enabled_p ())
1987 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1988 "not vectorized: iteration count smaller than "
1989 "vectorization factor.\n");
1990 return false;
1993 /* Analyze the alignment of the data-refs in the loop.
1994 Fail if a data reference is found that cannot be vectorized. */
1996 ok = vect_analyze_data_refs_alignment (loop_vinfo);
1997 if (!ok)
1999 if (dump_enabled_p ())
2000 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2001 "bad data alignment.\n");
2002 return false;
2005 /* Prune the list of ddrs to be tested at run-time by versioning for alias.
2006 It is important to call pruning after vect_analyze_data_ref_accesses,
2007 since we use grouping information gathered by interleaving analysis. */
2008 ok = vect_prune_runtime_alias_test_list (loop_vinfo);
2009 if (!ok)
2010 return false;
2012 /* Do not invoke vect_enhance_data_refs_alignment for epilogue
2013 vectorization. */
2014 if (!LOOP_VINFO_EPILOGUE_P (loop_vinfo))
2016 /* This pass will decide on using loop versioning and/or loop peeling in
2017 order to enhance the alignment of data references in the loop. */
2018 ok = vect_enhance_data_refs_alignment (loop_vinfo);
2019 if (!ok)
2021 if (dump_enabled_p ())
2022 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2023 "bad data alignment.\n");
2024 return false;
2028 if (slp)
2030 /* Analyze operations in the SLP instances. Note this may
2031 remove unsupported SLP instances which makes the above
2032 SLP kind detection invalid. */
2033 unsigned old_size = LOOP_VINFO_SLP_INSTANCES (loop_vinfo).length ();
2034 vect_slp_analyze_operations (loop_vinfo);
2035 if (LOOP_VINFO_SLP_INSTANCES (loop_vinfo).length () != old_size)
2036 goto again;
2039 /* Scan all the remaining operations in the loop that are not subject
2040 to SLP and make sure they are vectorizable. */
2041 ok = vect_analyze_loop_operations (loop_vinfo);
2042 if (!ok)
2044 if (dump_enabled_p ())
2045 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2046 "bad operation or unsupported loop bound.\n");
2047 return false;
2050 /* If epilog loop is required because of data accesses with gaps,
2051 one additional iteration needs to be peeled. Check if there is
2052 enough iterations for vectorization. */
2053 if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo)
2054 && LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
2056 int vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
2057 tree scalar_niters = LOOP_VINFO_NITERSM1 (loop_vinfo);
2059 if (wi::to_widest (scalar_niters) < vf)
2061 if (dump_enabled_p ())
2062 dump_printf_loc (MSG_NOTE, vect_location,
2063 "loop has no enough iterations to support"
2064 " peeling for gaps.\n");
2065 return false;
2069 /* Analyze cost. Decide if worth while to vectorize. */
2070 int min_profitable_estimate, min_profitable_iters;
2071 vect_estimate_min_profitable_iters (loop_vinfo, &min_profitable_iters,
2072 &min_profitable_estimate);
2074 if (min_profitable_iters < 0)
2076 if (dump_enabled_p ())
2077 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2078 "not vectorized: vectorization not profitable.\n");
2079 if (dump_enabled_p ())
2080 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2081 "not vectorized: vector version will never be "
2082 "profitable.\n");
2083 goto again;
2086 min_scalar_loop_bound = (PARAM_VALUE (PARAM_MIN_VECT_LOOP_BOUND)
2087 * vectorization_factor);
2089 /* Use the cost model only if it is more conservative than the
2090 user-specified threshold. */
2091 th = (unsigned) MAX (min_scalar_loop_bound, min_profitable_iters);
2093 LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo) = th;
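  /* Illustrative example (not taken from the sources): with
     --param min-vect-loop-bound=4 and a vectorization factor of 8 the
     user bound above is 32, so a cost-model result of
     min_profitable_iters == 20 yields th == MAX (32, 20) == 32; with
     the parameter left at 0 the cost-model value is used unchanged.  */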
2095 if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
2096 && LOOP_VINFO_INT_NITERS (loop_vinfo) < th)
2098 if (dump_enabled_p ())
2099 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2100 "not vectorized: vectorization not profitable.\n");
2101 if (dump_enabled_p ())
2102 dump_printf_loc (MSG_NOTE, vect_location,
2103 "not vectorized: iteration count smaller than user "
2104 "specified loop bound parameter or minimum profitable "
2105 "iterations (whichever is more conservative).\n");
2106 goto again;
2109 estimated_niter
2110 = estimated_stmt_executions_int (LOOP_VINFO_LOOP (loop_vinfo));
2111 if (estimated_niter == -1)
2112 estimated_niter = max_niter;
2113 if (estimated_niter != -1
2114 && ((unsigned HOST_WIDE_INT) estimated_niter
2115 < MAX (th, (unsigned) min_profitable_estimate)))
2117 if (dump_enabled_p ())
2118 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2119 "not vectorized: estimated iteration count too "
2120 "small.\n");
2121 if (dump_enabled_p ())
2122 dump_printf_loc (MSG_NOTE, vect_location,
2123 "not vectorized: estimated iteration count smaller "
2124 "than specified loop bound parameter or minimum "
2125 "profitable iterations (whichever is more "
2126 "conservative).\n");
2127 goto again;
2130 /* Decide whether we need to create an epilogue loop to handle
2131 remaining scalar iterations. */
2132 th = ((LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo)
2133 / LOOP_VINFO_VECT_FACTOR (loop_vinfo))
2134 * LOOP_VINFO_VECT_FACTOR (loop_vinfo));
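  /* Illustrative example: a cost-model threshold of 11 with a
     vectorization factor of 4 is rounded down to th == 8, the largest
     multiple of VF that does not exceed the threshold.  */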
2136 if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
2137 && LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) > 0)
2139 if (ctz_hwi (LOOP_VINFO_INT_NITERS (loop_vinfo)
2140 - LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo))
2141 < exact_log2 (LOOP_VINFO_VECT_FACTOR (loop_vinfo)))
2142 LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo) = true;
2144 else if (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo)
2145 || (tree_ctz (LOOP_VINFO_NITERS (loop_vinfo))
2146 < (unsigned)exact_log2 (LOOP_VINFO_VECT_FACTOR (loop_vinfo))
2147 /* In case of versioning, check if the maximum number of
2148 iterations is greater than th. If they are identical,
2149 the epilogue is unnecessary. */
2150 && (!LOOP_REQUIRES_VERSIONING (loop_vinfo)
2151 || (unsigned HOST_WIDE_INT) max_niter > th)))
2152 LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo) = true;
2154 /* If an epilogue loop is required make sure we can create one. */
2155 if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo)
2156 || LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo))
2158 if (dump_enabled_p ())
2159 dump_printf_loc (MSG_NOTE, vect_location, "epilog loop required\n");
2160 if (!vect_can_advance_ivs_p (loop_vinfo)
2161 || !slpeel_can_duplicate_loop_p (LOOP_VINFO_LOOP (loop_vinfo),
2162 single_exit (LOOP_VINFO_LOOP
2163 (loop_vinfo))))
2165 if (dump_enabled_p ())
2166 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2167 "not vectorized: can't create required "
2168 "epilog loop\n");
2169 goto again;
2173 /* During peeling, we need to check whether the number of loop iterations is
2174 enough for both the peeled prolog loop and the vector loop. This check
2175 can be merged with the threshold check of loop versioning, so
2176 increase the threshold for this case if necessary. */
2177 if (LOOP_REQUIRES_VERSIONING (loop_vinfo)
2178 && (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo)
2179 || LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo)))
2181 unsigned niters_th;
2183 /* Niters for peeled prolog loop. */
2184 if (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) < 0)
2186 struct data_reference *dr = LOOP_VINFO_UNALIGNED_DR (loop_vinfo);
2187 tree vectype = STMT_VINFO_VECTYPE (vinfo_for_stmt (DR_STMT (dr)));
2189 niters_th = TYPE_VECTOR_SUBPARTS (vectype) - 1;
2191 else
2192 niters_th = LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo);
2194 /* Niters for at least one iteration of vectorized loop. */
2195 niters_th += LOOP_VINFO_VECT_FACTOR (loop_vinfo);
2196 /* One additional iteration because of peeling for gap. */
2197 if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo))
2198 niters_th++;
2199 if (LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo) < niters_th)
2200 LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo) = niters_th;
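  /* Illustrative example (not taken from the sources): with an unknown
     prologue peel count, a V8HI vector type and a vectorization factor
     of 8, niters_th starts at 8 - 1 == 7, one full vector iteration adds
     the factor 8, and peeling for gaps adds one more, giving 16; the
     versioning threshold is raised to that value if it was smaller.  */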
2203 gcc_assert (vectorization_factor
2204 == (unsigned)LOOP_VINFO_VECT_FACTOR (loop_vinfo));
2206 /* Ok to vectorize! */
2207 return true;
2209 again:
2210 /* Try again with SLP forced off but if we didn't do any SLP there is
2211 no point in re-trying. */
2212 if (!slp)
2213 return false;
2215 /* If there are reduction chains re-trying will fail anyway. */
2216 if (! LOOP_VINFO_REDUCTION_CHAINS (loop_vinfo).is_empty ())
2217 return false;
2219 /* Likewise if the grouped loads or stores in the SLP cannot be handled
2220 via interleaving or lane instructions. */
2221 slp_instance instance;
2222 slp_tree node;
2223 unsigned i, j;
2224 FOR_EACH_VEC_ELT (LOOP_VINFO_SLP_INSTANCES (loop_vinfo), i, instance)
2226 stmt_vec_info vinfo;
2227 vinfo = vinfo_for_stmt
2228 (SLP_TREE_SCALAR_STMTS (SLP_INSTANCE_TREE (instance))[0]);
2229 if (! STMT_VINFO_GROUPED_ACCESS (vinfo))
2230 continue;
2231 vinfo = vinfo_for_stmt (STMT_VINFO_GROUP_FIRST_ELEMENT (vinfo));
2232 unsigned int size = STMT_VINFO_GROUP_SIZE (vinfo);
2233 tree vectype = STMT_VINFO_VECTYPE (vinfo);
2234 if (! vect_store_lanes_supported (vectype, size)
2235 && ! vect_grouped_store_supported (vectype, size))
2236 return false;
2237 FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (instance), j, node)
2239 vinfo = vinfo_for_stmt (SLP_TREE_SCALAR_STMTS (node)[0]);
2240 vinfo = vinfo_for_stmt (STMT_VINFO_GROUP_FIRST_ELEMENT (vinfo));
2241 bool single_element_p = !STMT_VINFO_GROUP_NEXT_ELEMENT (vinfo);
2242 size = STMT_VINFO_GROUP_SIZE (vinfo);
2243 vectype = STMT_VINFO_VECTYPE (vinfo);
2244 if (! vect_load_lanes_supported (vectype, size)
2245 && ! vect_grouped_load_supported (vectype, single_element_p,
2246 size))
2247 return false;
2251 if (dump_enabled_p ())
2252 dump_printf_loc (MSG_NOTE, vect_location,
2253 "re-trying with SLP disabled\n");
2255 /* Roll back state appropriately. No SLP this time. */
2256 slp = false;
2257 /* Restore the vectorization factor as it was without SLP. */
2258 LOOP_VINFO_VECT_FACTOR (loop_vinfo) = saved_vectorization_factor;
2259 /* Free the SLP instances. */
2260 FOR_EACH_VEC_ELT (LOOP_VINFO_SLP_INSTANCES (loop_vinfo), j, instance)
2261 vect_free_slp_instance (instance);
2262 LOOP_VINFO_SLP_INSTANCES (loop_vinfo).release ();
2263 /* Reset SLP type to loop_vect on all stmts. */
2264 for (i = 0; i < LOOP_VINFO_LOOP (loop_vinfo)->num_nodes; ++i)
2266 basic_block bb = LOOP_VINFO_BBS (loop_vinfo)[i];
2267 for (gimple_stmt_iterator si = gsi_start_phis (bb);
2268 !gsi_end_p (si); gsi_next (&si))
2270 stmt_vec_info stmt_info = vinfo_for_stmt (gsi_stmt (si));
2271 STMT_SLP_TYPE (stmt_info) = loop_vect;
2273 for (gimple_stmt_iterator si = gsi_start_bb (bb);
2274 !gsi_end_p (si); gsi_next (&si))
2276 stmt_vec_info stmt_info = vinfo_for_stmt (gsi_stmt (si));
2277 STMT_SLP_TYPE (stmt_info) = loop_vect;
2278 if (STMT_VINFO_IN_PATTERN_P (stmt_info))
2280 stmt_info = vinfo_for_stmt (STMT_VINFO_RELATED_STMT (stmt_info));
2281 STMT_SLP_TYPE (stmt_info) = loop_vect;
2282 for (gimple_stmt_iterator pi
2283 = gsi_start (STMT_VINFO_PATTERN_DEF_SEQ (stmt_info));
2284 !gsi_end_p (pi); gsi_next (&pi))
2286 gimple *pstmt = gsi_stmt (pi);
2287 STMT_SLP_TYPE (vinfo_for_stmt (pstmt)) = loop_vect;
2292 /* Free optimized alias test DDRS. */
2293 LOOP_VINFO_COMP_ALIAS_DDRS (loop_vinfo).release ();
2294 LOOP_VINFO_CHECK_UNEQUAL_ADDRS (loop_vinfo).release ();
2295 /* Reset target cost data. */
2296 destroy_cost_data (LOOP_VINFO_TARGET_COST_DATA (loop_vinfo));
2297 LOOP_VINFO_TARGET_COST_DATA (loop_vinfo)
2298 = init_cost (LOOP_VINFO_LOOP (loop_vinfo));
2299 /* Reset assorted flags. */
2300 LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo) = false;
2301 LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo) = false;
2302 LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo) = 0;
2304 goto start_over;
2307 /* Function vect_analyze_loop.
2309 Apply a set of analyses on LOOP, and create a loop_vec_info struct
2310 for it. The different analyses will record information in the
2311 loop_vec_info struct. If ORIG_LOOP_VINFO is not NULL, the epilogue must
2312 be vectorized. */
2313 loop_vec_info
2314 vect_analyze_loop (struct loop *loop, loop_vec_info orig_loop_vinfo)
2316 loop_vec_info loop_vinfo;
2317 unsigned int vector_sizes;
2319 /* Autodetect first vector size we try. */
2320 current_vector_size = 0;
2321 vector_sizes = targetm.vectorize.autovectorize_vector_sizes ();
2323 if (dump_enabled_p ())
2324 dump_printf_loc (MSG_NOTE, vect_location,
2325 "===== analyze_loop_nest =====\n");
2327 if (loop_outer (loop)
2328 && loop_vec_info_for_loop (loop_outer (loop))
2329 && LOOP_VINFO_VECTORIZABLE_P (loop_vec_info_for_loop (loop_outer (loop))))
2331 if (dump_enabled_p ())
2332 dump_printf_loc (MSG_NOTE, vect_location,
2333 "outer-loop already vectorized.\n");
2334 return NULL;
2337 while (1)
2339 /* Check the CFG characteristics of the loop (nesting, entry/exit). */
2340 loop_vinfo = vect_analyze_loop_form (loop);
2341 if (!loop_vinfo)
2343 if (dump_enabled_p ())
2344 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2345 "bad loop form.\n");
2346 return NULL;
2349 bool fatal = false;
2351 if (orig_loop_vinfo)
2352 LOOP_VINFO_ORIG_LOOP_INFO (loop_vinfo) = orig_loop_vinfo;
2354 if (vect_analyze_loop_2 (loop_vinfo, fatal))
2356 LOOP_VINFO_VECTORIZABLE_P (loop_vinfo) = 1;
2358 return loop_vinfo;
2361 delete loop_vinfo;
2363 vector_sizes &= ~current_vector_size;
2364 if (fatal
2365 || vector_sizes == 0
2366 || current_vector_size == 0)
2367 return NULL;
2369 /* Try the next biggest vector size. */
2370 current_vector_size = 1 << floor_log2 (vector_sizes);
2371 if (dump_enabled_p ())
2372 dump_printf_loc (MSG_NOTE, vect_location,
2373 "***** Re-trying analysis with "
2374 "vector size %d\n", current_vector_size);
2379 /* Function reduction_code_for_scalar_code
2381 Input:
2382 CODE - tree_code of a reduction operation.
2384 Output:
2385 REDUC_CODE - the corresponding tree-code to be used to reduce the
2386 vector of partial results into a single scalar result, or ERROR_MARK
2387 if the operation is a supported reduction operation, but does not have
2388 such a tree-code.
2390 Return FALSE if CODE currently cannot be vectorized as a reduction. */
2392 static bool
2393 reduction_code_for_scalar_code (enum tree_code code,
2394 enum tree_code *reduc_code)
2396 switch (code)
2398 case MAX_EXPR:
2399 *reduc_code = REDUC_MAX_EXPR;
2400 return true;
2402 case MIN_EXPR:
2403 *reduc_code = REDUC_MIN_EXPR;
2404 return true;
2406 case PLUS_EXPR:
2407 *reduc_code = REDUC_PLUS_EXPR;
2408 return true;
2410 case MULT_EXPR:
2411 case MINUS_EXPR:
2412 case BIT_IOR_EXPR:
2413 case BIT_XOR_EXPR:
2414 case BIT_AND_EXPR:
2415 *reduc_code = ERROR_MARK;
2416 return true;
2418 default:
2419 return false;
2424 /* Error reporting helper for vect_is_simple_reduction below. GIMPLE statement
2425 STMT is printed with a message MSG. */
2427 static void
2428 report_vect_op (dump_flags_t msg_type, gimple *stmt, const char *msg)
2430 dump_printf_loc (msg_type, vect_location, "%s", msg);
2431 dump_gimple_stmt (msg_type, TDF_SLIM, stmt, 0);
2435 /* Detect SLP reduction of the form:
2437 #a1 = phi <a5, a0>
2438 a2 = operation (a1)
2439 a3 = operation (a2)
2440 a4 = operation (a3)
2441 a5 = operation (a4)
2443 #a = phi <a5>
2445 PHI is the reduction phi node (#a1 = phi <a5, a0> above)
2446 FIRST_STMT is the first reduction stmt in the chain
2447 (a2 = operation (a1)).
2449 Return TRUE if a reduction chain was detected. */
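/* Illustrative example (not taken from the sources): a source loop such as

     for (i = 0; i < n; i++)
       {
         sum = sum + a[2*i];
         sum = sum + a[2*i+1];
       }

   produces two dependent additions feeding the reduction PHI each
   iteration, i.e. a reduction chain of size two matching the pattern
   above.  */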
2451 static bool
2452 vect_is_slp_reduction (loop_vec_info loop_info, gimple *phi,
2453 gimple *first_stmt)
2455 struct loop *loop = (gimple_bb (phi))->loop_father;
2456 struct loop *vect_loop = LOOP_VINFO_LOOP (loop_info);
2457 enum tree_code code;
2458 gimple *current_stmt = NULL, *loop_use_stmt = NULL, *first, *next_stmt;
2459 stmt_vec_info use_stmt_info, current_stmt_info;
2460 tree lhs;
2461 imm_use_iterator imm_iter;
2462 use_operand_p use_p;
2463 int nloop_uses, size = 0, n_out_of_loop_uses;
2464 bool found = false;
2466 if (loop != vect_loop)
2467 return false;
2469 lhs = PHI_RESULT (phi);
2470 code = gimple_assign_rhs_code (first_stmt);
2471 while (1)
2473 nloop_uses = 0;
2474 n_out_of_loop_uses = 0;
2475 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, lhs)
2477 gimple *use_stmt = USE_STMT (use_p);
2478 if (is_gimple_debug (use_stmt))
2479 continue;
2481 /* Check if we got back to the reduction phi. */
2482 if (use_stmt == phi)
2484 loop_use_stmt = use_stmt;
2485 found = true;
2486 break;
2489 if (flow_bb_inside_loop_p (loop, gimple_bb (use_stmt)))
2491 loop_use_stmt = use_stmt;
2492 nloop_uses++;
2494 else
2495 n_out_of_loop_uses++;
2497 /* There can be either a single use in the loop or two uses in
2498 phi nodes. */
2499 if (nloop_uses > 1 || (n_out_of_loop_uses && nloop_uses))
2500 return false;
2503 if (found)
2504 break;
2506 /* We reached a statement with no loop uses. */
2507 if (nloop_uses == 0)
2508 return false;
2510 /* This is a loop exit phi, and we haven't reached the reduction phi. */
2511 if (gimple_code (loop_use_stmt) == GIMPLE_PHI)
2512 return false;
2514 if (!is_gimple_assign (loop_use_stmt)
2515 || code != gimple_assign_rhs_code (loop_use_stmt)
2516 || !flow_bb_inside_loop_p (loop, gimple_bb (loop_use_stmt)))
2517 return false;
2519 /* Insert USE_STMT into reduction chain. */
2520 use_stmt_info = vinfo_for_stmt (loop_use_stmt);
2521 if (current_stmt)
2523 current_stmt_info = vinfo_for_stmt (current_stmt);
2524 GROUP_NEXT_ELEMENT (current_stmt_info) = loop_use_stmt;
2525 GROUP_FIRST_ELEMENT (use_stmt_info)
2526 = GROUP_FIRST_ELEMENT (current_stmt_info);
2528 else
2529 GROUP_FIRST_ELEMENT (use_stmt_info) = loop_use_stmt;
2531 lhs = gimple_assign_lhs (loop_use_stmt);
2532 current_stmt = loop_use_stmt;
2533 size++;
2536 if (!found || loop_use_stmt != phi || size < 2)
2537 return false;
2539 /* Swap the operands, if needed, to make the reduction operand be the second
2540 operand. */
2541 lhs = PHI_RESULT (phi);
2542 next_stmt = GROUP_FIRST_ELEMENT (vinfo_for_stmt (current_stmt));
2543 while (next_stmt)
2545 if (gimple_assign_rhs2 (next_stmt) == lhs)
2547 tree op = gimple_assign_rhs1 (next_stmt);
2548 gimple *def_stmt = NULL;
2550 if (TREE_CODE (op) == SSA_NAME)
2551 def_stmt = SSA_NAME_DEF_STMT (op);
2553 /* Check that the other def is either defined in the loop
2554 ("vect_internal_def"), or it's an induction (defined by a
2555 loop-header phi-node). */
2556 if (def_stmt
2557 && gimple_bb (def_stmt)
2558 && flow_bb_inside_loop_p (loop, gimple_bb (def_stmt))
2559 && (is_gimple_assign (def_stmt)
2560 || is_gimple_call (def_stmt)
2561 || STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def_stmt))
2562 == vect_induction_def
2563 || (gimple_code (def_stmt) == GIMPLE_PHI
2564 && STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def_stmt))
2565 == vect_internal_def
2566 && !is_loop_header_bb_p (gimple_bb (def_stmt)))))
2568 lhs = gimple_assign_lhs (next_stmt);
2569 next_stmt = GROUP_NEXT_ELEMENT (vinfo_for_stmt (next_stmt));
2570 continue;
2573 return false;
2575 else
2577 tree op = gimple_assign_rhs2 (next_stmt);
2578 gimple *def_stmt = NULL;
2580 if (TREE_CODE (op) == SSA_NAME)
2581 def_stmt = SSA_NAME_DEF_STMT (op);
2583 /* Check that the other def is either defined in the loop
2584 ("vect_internal_def"), or it's an induction (defined by a
2585 loop-header phi-node). */
2586 if (def_stmt
2587 && gimple_bb (def_stmt)
2588 && flow_bb_inside_loop_p (loop, gimple_bb (def_stmt))
2589 && (is_gimple_assign (def_stmt)
2590 || is_gimple_call (def_stmt)
2591 || STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def_stmt))
2592 == vect_induction_def
2593 || (gimple_code (def_stmt) == GIMPLE_PHI
2594 && STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def_stmt))
2595 == vect_internal_def
2596 && !is_loop_header_bb_p (gimple_bb (def_stmt)))))
2598 if (dump_enabled_p ())
2600 dump_printf_loc (MSG_NOTE, vect_location, "swapping oprnds: ");
2601 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, next_stmt, 0);
2604 swap_ssa_operands (next_stmt,
2605 gimple_assign_rhs1_ptr (next_stmt),
2606 gimple_assign_rhs2_ptr (next_stmt));
2607 update_stmt (next_stmt);
2609 if (CONSTANT_CLASS_P (gimple_assign_rhs1 (next_stmt)))
2610 LOOP_VINFO_OPERANDS_SWAPPED (loop_info) = true;
2612 else
2613 return false;
2616 lhs = gimple_assign_lhs (next_stmt);
2617 next_stmt = GROUP_NEXT_ELEMENT (vinfo_for_stmt (next_stmt));
2620 /* Save the chain for further analysis in SLP detection. */
2621 first = GROUP_FIRST_ELEMENT (vinfo_for_stmt (current_stmt));
2622 LOOP_VINFO_REDUCTION_CHAINS (loop_info).safe_push (first);
2623 GROUP_SIZE (vinfo_for_stmt (first)) = size;
2625 return true;
2629 /* Function vect_is_simple_reduction
2631 (1) Detect a cross-iteration def-use cycle that represents a simple
2632 reduction computation. We look for the following pattern:
2634 loop_header:
2635 a1 = phi < a0, a2 >
2636 a3 = ...
2637 a2 = operation (a3, a1)
2641 a3 = ...
2642 loop_header:
2643 a1 = phi < a0, a2 >
2644 a2 = operation (a3, a1)
2646 such that:
2647 1. operation is commutative and associative and it is safe to
2648 change the order of the computation
2649 2. no uses for a2 in the loop (a2 is used out of the loop)
2650 3. no uses of a1 in the loop besides the reduction operation
2651 4. no uses of a1 outside the loop.
2653 Conditions 1,4 are tested here.
2654 Conditions 2,3 are tested in vect_mark_stmts_to_be_vectorized.
2656 (2) Detect a cross-iteration def-use cycle in nested loops, i.e.,
2657 nested cycles.
2659 (3) Detect cycles of phi nodes in outer-loop vectorization, i.e., double
2660 reductions:
2662 a1 = phi < a0, a2 >
2663 inner loop (def of a3)
2664 a2 = phi < a3 >
2666 (4) Detect condition expressions, i.e.:
2667 for (int i = 0; i < N; i++)
2668 if (a[i] < val)
2669 ret_val = a[i];
2673 static gimple *
2674 vect_is_simple_reduction (loop_vec_info loop_info, gimple *phi,
2675 bool *double_reduc,
2676 bool need_wrapping_integral_overflow,
2677 enum vect_reduction_type *v_reduc_type)
2679 struct loop *loop = (gimple_bb (phi))->loop_father;
2680 struct loop *vect_loop = LOOP_VINFO_LOOP (loop_info);
2681 gimple *def_stmt, *def1 = NULL, *def2 = NULL, *phi_use_stmt = NULL;
2682 enum tree_code orig_code, code;
2683 tree op1, op2, op3 = NULL_TREE, op4 = NULL_TREE;
2684 tree type;
2685 int nloop_uses;
2686 tree name;
2687 imm_use_iterator imm_iter;
2688 use_operand_p use_p;
2689 bool phi_def;
2691 *double_reduc = false;
2692 *v_reduc_type = TREE_CODE_REDUCTION;
2694 tree phi_name = PHI_RESULT (phi);
2695 /* ??? If there are no uses of the PHI result the inner loop reduction
2696 won't be detected as possibly double-reduction by vectorizable_reduction
2697 because that tries to walk the PHI arg from the preheader edge which
2698 can be constant. See PR60382. */
2699 if (has_zero_uses (phi_name))
2700 return NULL;
2701 nloop_uses = 0;
2702 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, phi_name)
2704 gimple *use_stmt = USE_STMT (use_p);
2705 if (is_gimple_debug (use_stmt))
2706 continue;
2708 if (!flow_bb_inside_loop_p (loop, gimple_bb (use_stmt)))
2710 if (dump_enabled_p ())
2711 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2712 "intermediate value used outside loop.\n");
2714 return NULL;
2717 nloop_uses++;
2718 if (nloop_uses > 1)
2720 if (dump_enabled_p ())
2721 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2722 "reduction value used in loop.\n");
2723 return NULL;
2726 phi_use_stmt = use_stmt;
2729 edge latch_e = loop_latch_edge (loop);
2730 tree loop_arg = PHI_ARG_DEF_FROM_EDGE (phi, latch_e);
2731 if (TREE_CODE (loop_arg) != SSA_NAME)
2733 if (dump_enabled_p ())
2735 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2736 "reduction: not ssa_name: ");
2737 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, loop_arg);
2738 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
2740 return NULL;
2743 def_stmt = SSA_NAME_DEF_STMT (loop_arg);
2744 if (is_gimple_assign (def_stmt))
2746 name = gimple_assign_lhs (def_stmt);
2747 phi_def = false;
2749 else if (gimple_code (def_stmt) == GIMPLE_PHI)
2751 name = PHI_RESULT (def_stmt);
2752 phi_def = true;
2754 else
2756 if (dump_enabled_p ())
2758 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2759 "reduction: unhandled reduction operation: ");
2760 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, def_stmt, 0);
2762 return NULL;
2765 if (! flow_bb_inside_loop_p (loop, gimple_bb (def_stmt)))
2766 return NULL;
2768 nloop_uses = 0;
2769 auto_vec<gphi *, 3> lcphis;
2770 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, name)
2772 gimple *use_stmt = USE_STMT (use_p);
2773 if (is_gimple_debug (use_stmt))
2774 continue;
2775 if (flow_bb_inside_loop_p (loop, gimple_bb (use_stmt)))
2776 nloop_uses++;
2777 else
2778 /* We can have more than one loop-closed PHI. */
2779 lcphis.safe_push (as_a <gphi *> (use_stmt));
2780 if (nloop_uses > 1)
2782 if (dump_enabled_p ())
2783 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2784 "reduction used in loop.\n");
2785 return NULL;
2789 /* If DEF_STMT is a phi node itself, we expect it to have a single argument
2790 defined in the inner loop. */
2791 if (phi_def)
2793 op1 = PHI_ARG_DEF (def_stmt, 0);
2795 if (gimple_phi_num_args (def_stmt) != 1
2796 || TREE_CODE (op1) != SSA_NAME)
2798 if (dump_enabled_p ())
2799 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2800 "unsupported phi node definition.\n");
2802 return NULL;
2805 def1 = SSA_NAME_DEF_STMT (op1);
2806 if (gimple_bb (def1)
2807 && flow_bb_inside_loop_p (loop, gimple_bb (def_stmt))
2808 && loop->inner
2809 && flow_bb_inside_loop_p (loop->inner, gimple_bb (def1))
2810 && is_gimple_assign (def1)
2811 && flow_bb_inside_loop_p (loop->inner, gimple_bb (phi_use_stmt)))
2813 if (dump_enabled_p ())
2814 report_vect_op (MSG_NOTE, def_stmt,
2815 "detected double reduction: ");
2817 *double_reduc = true;
2818 return def_stmt;
2821 return NULL;
2824 /* If we are vectorizing an inner reduction we are executing that
2825 in the original order only in case we are not dealing with a
2826 double reduction. */
2827 bool check_reduction = true;
2828 if (flow_loop_nested_p (vect_loop, loop))
2830 gphi *lcphi;
2831 unsigned i;
2832 check_reduction = false;
2833 FOR_EACH_VEC_ELT (lcphis, i, lcphi)
2834 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, gimple_phi_result (lcphi))
2836 gimple *use_stmt = USE_STMT (use_p);
2837 if (is_gimple_debug (use_stmt))
2838 continue;
2839 if (! flow_bb_inside_loop_p (vect_loop, gimple_bb (use_stmt)))
2840 check_reduction = true;
2844 bool nested_in_vect_loop = flow_loop_nested_p (vect_loop, loop);
2845 code = orig_code = gimple_assign_rhs_code (def_stmt);
2847 /* We can handle "res -= x[i]", which is non-associative, by
2848 simply rewriting it into "res += -x[i]". Avoid changing the
2849 gimple instruction for the first simple tests and only do this
2850 if we're allowed to change code at all. */
2851 if (code == MINUS_EXPR && gimple_assign_rhs2 (def_stmt) != phi_name)
2852 code = PLUS_EXPR;
2854 if (code == COND_EXPR)
2856 if (! nested_in_vect_loop)
2857 *v_reduc_type = COND_REDUCTION;
2859 op3 = gimple_assign_rhs1 (def_stmt);
2860 if (COMPARISON_CLASS_P (op3))
2862 op4 = TREE_OPERAND (op3, 1);
2863 op3 = TREE_OPERAND (op3, 0);
2865 if (op3 == phi_name || op4 == phi_name)
2867 if (dump_enabled_p ())
2868 report_vect_op (MSG_MISSED_OPTIMIZATION, def_stmt,
2869 "reduction: condition depends on previous"
2870 " iteration: ");
2871 return NULL;
2874 op1 = gimple_assign_rhs2 (def_stmt);
2875 op2 = gimple_assign_rhs3 (def_stmt);
2877 else if (!commutative_tree_code (code) || !associative_tree_code (code))
2879 if (dump_enabled_p ())
2880 report_vect_op (MSG_MISSED_OPTIMIZATION, def_stmt,
2881 "reduction: not commutative/associative: ");
2882 return NULL;
2884 else if (get_gimple_rhs_class (code) == GIMPLE_BINARY_RHS)
2886 op1 = gimple_assign_rhs1 (def_stmt);
2887 op2 = gimple_assign_rhs2 (def_stmt);
2889 else
2891 if (dump_enabled_p ())
2892 report_vect_op (MSG_MISSED_OPTIMIZATION, def_stmt,
2893 "reduction: not handled operation: ");
2894 return NULL;
2897 if (TREE_CODE (op1) != SSA_NAME && TREE_CODE (op2) != SSA_NAME)
2899 if (dump_enabled_p ())
2900 report_vect_op (MSG_MISSED_OPTIMIZATION, def_stmt,
2901 "reduction: both uses not ssa_names: ");
2903 return NULL;
2906 type = TREE_TYPE (gimple_assign_lhs (def_stmt));
2907 if ((TREE_CODE (op1) == SSA_NAME
2908 && !types_compatible_p (type, TREE_TYPE (op1)))
2909 || (TREE_CODE (op2) == SSA_NAME
2910 && !types_compatible_p (type, TREE_TYPE (op2)))
2911 || (op3 && TREE_CODE (op3) == SSA_NAME
2912 && !types_compatible_p (type, TREE_TYPE (op3)))
2913 || (op4 && TREE_CODE (op4) == SSA_NAME
2914 && !types_compatible_p (type, TREE_TYPE (op4))))
2916 if (dump_enabled_p ())
2918 dump_printf_loc (MSG_NOTE, vect_location,
2919 "reduction: multiple types: operation type: ");
2920 dump_generic_expr (MSG_NOTE, TDF_SLIM, type);
2921 dump_printf (MSG_NOTE, ", operands types: ");
2922 dump_generic_expr (MSG_NOTE, TDF_SLIM,
2923 TREE_TYPE (op1));
2924 dump_printf (MSG_NOTE, ",");
2925 dump_generic_expr (MSG_NOTE, TDF_SLIM,
2926 TREE_TYPE (op2));
2927 if (op3)
2929 dump_printf (MSG_NOTE, ",");
2930 dump_generic_expr (MSG_NOTE, TDF_SLIM,
2931 TREE_TYPE (op3));
2934 if (op4)
2936 dump_printf (MSG_NOTE, ",");
2937 dump_generic_expr (MSG_NOTE, TDF_SLIM,
2938 TREE_TYPE (op4));
2940 dump_printf (MSG_NOTE, "\n");
2943 return NULL;
2946 /* Check that it's ok to change the order of the computation.
2947 Generally, when vectorizing a reduction we change the order of the
2948 computation. This may change the behavior of the program in some
2949 cases, so we need to check that this is ok. One exception is when
2950 vectorizing an outer-loop: the inner-loop is executed sequentially,
2951 and therefore vectorizing reductions in the inner-loop during
2952 outer-loop vectorization is safe. */
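  /* Illustrative example (not taken from the sources): vectorizing

       float s = 0.0f;
       for (i = 0; i < n; i++)
         s += a[i];

     accumulates VF partial sums that are combined after the loop, i.e.
     in a different order than the scalar code, so without
     -fassociative-math the reduction is rejected below.  When the
     reduction belongs to the inner loop of an outer-loop vectorization
     it still executes sequentially, which is why CHECK_REDUCTION may be
     false here.  */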
2954 if (*v_reduc_type != COND_REDUCTION
2955 && check_reduction)
2957 /* CHECKME: check for !flag_finite_math_only too? */
2958 if (SCALAR_FLOAT_TYPE_P (type) && !flag_associative_math)
2960 /* Changing the order of operations changes the semantics. */
2961 if (dump_enabled_p ())
2962 report_vect_op (MSG_MISSED_OPTIMIZATION, def_stmt,
2963 "reduction: unsafe fp math optimization: ");
2964 return NULL;
2966 else if (INTEGRAL_TYPE_P (type))
2968 if (!operation_no_trapping_overflow (type, code))
2970 /* Changing the order of operations changes the semantics. */
2971 if (dump_enabled_p ())
2972 report_vect_op (MSG_MISSED_OPTIMIZATION, def_stmt,
2973 "reduction: unsafe int math optimization"
2974 " (overflow traps): ");
2975 return NULL;
2977 if (need_wrapping_integral_overflow
2978 && !TYPE_OVERFLOW_WRAPS (type)
2979 && operation_can_overflow (code))
2981 /* Changing the order of operations changes the semantics. */
2982 if (dump_enabled_p ())
2983 report_vect_op (MSG_MISSED_OPTIMIZATION, def_stmt,
2984 "reduction: unsafe int math optimization"
2985 " (overflow doesn't wrap): ");
2986 return NULL;
2989 else if (SAT_FIXED_POINT_TYPE_P (type))
2991 /* Changing the order of operations changes the semantics. */
2992 if (dump_enabled_p ())
2993 report_vect_op (MSG_MISSED_OPTIMIZATION, def_stmt,
2994 "reduction: unsafe fixed-point math optimization: ");
2995 return NULL;
2999 /* Reduction is safe. We're dealing with one of the following:
3000 1) integer arithmetic and no trapv
3001 2) floating point arithmetic, and special flags permit this optimization
3002 3) nested cycle (i.e., outer loop vectorization). */
3003 if (TREE_CODE (op1) == SSA_NAME)
3004 def1 = SSA_NAME_DEF_STMT (op1);
3006 if (TREE_CODE (op2) == SSA_NAME)
3007 def2 = SSA_NAME_DEF_STMT (op2);
3009 if (code != COND_EXPR
3010 && ((!def1 || gimple_nop_p (def1)) && (!def2 || gimple_nop_p (def2))))
3012 if (dump_enabled_p ())
3013 report_vect_op (MSG_NOTE, def_stmt, "reduction: no defs for operands: ");
3014 return NULL;
3017 /* Check that one def is the reduction def, defined by PHI,
3018 the other def is either defined in the loop ("vect_internal_def"),
3019 or it's an induction (defined by a loop-header phi-node). */
3021 if (def2 && def2 == phi
3022 && (code == COND_EXPR
3023 || !def1 || gimple_nop_p (def1)
3024 || !flow_bb_inside_loop_p (loop, gimple_bb (def1))
3025 || (def1 && flow_bb_inside_loop_p (loop, gimple_bb (def1))
3026 && (is_gimple_assign (def1)
3027 || is_gimple_call (def1)
3028 || STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def1))
3029 == vect_induction_def
3030 || (gimple_code (def1) == GIMPLE_PHI
3031 && STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def1))
3032 == vect_internal_def
3033 && !is_loop_header_bb_p (gimple_bb (def1)))))))
3035 if (dump_enabled_p ())
3036 report_vect_op (MSG_NOTE, def_stmt, "detected reduction: ");
3037 return def_stmt;
3040 if (def1 && def1 == phi
3041 && (code == COND_EXPR
3042 || !def2 || gimple_nop_p (def2)
3043 || !flow_bb_inside_loop_p (loop, gimple_bb (def2))
3044 || (def2 && flow_bb_inside_loop_p (loop, gimple_bb (def2))
3045 && (is_gimple_assign (def2)
3046 || is_gimple_call (def2)
3047 || STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def2))
3048 == vect_induction_def
3049 || (gimple_code (def2) == GIMPLE_PHI
3050 && STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def2))
3051 == vect_internal_def
3052 && !is_loop_header_bb_p (gimple_bb (def2)))))))
3054 if (! nested_in_vect_loop && orig_code != MINUS_EXPR)
3056 /* Check if we can swap operands (just for simplicity - so that
3057 the rest of the code can assume that the reduction variable
3058 is always the last (second) argument). */
3059 if (code == COND_EXPR)
3061 /* Swap cond_expr by inverting the condition. */
3062 tree cond_expr = gimple_assign_rhs1 (def_stmt);
3063 enum tree_code invert_code = ERROR_MARK;
3064 enum tree_code cond_code = TREE_CODE (cond_expr);
3066 if (TREE_CODE_CLASS (cond_code) == tcc_comparison)
3068 bool honor_nans = HONOR_NANS (TREE_OPERAND (cond_expr, 0));
3069 invert_code = invert_tree_comparison (cond_code, honor_nans);
3071 if (invert_code != ERROR_MARK)
3073 TREE_SET_CODE (cond_expr, invert_code);
3074 swap_ssa_operands (def_stmt,
3075 gimple_assign_rhs2_ptr (def_stmt),
3076 gimple_assign_rhs3_ptr (def_stmt));
3078 else
3080 if (dump_enabled_p ())
3081 report_vect_op (MSG_NOTE, def_stmt,
3082 "detected reduction: cannot swap operands "
3083 "for cond_expr");
3084 return NULL;
3087 else
3088 swap_ssa_operands (def_stmt, gimple_assign_rhs1_ptr (def_stmt),
3089 gimple_assign_rhs2_ptr (def_stmt));
3091 if (dump_enabled_p ())
3092 report_vect_op (MSG_NOTE, def_stmt,
3093 "detected reduction: need to swap operands: ");
3095 if (CONSTANT_CLASS_P (gimple_assign_rhs1 (def_stmt)))
3096 LOOP_VINFO_OPERANDS_SWAPPED (loop_info) = true;
3098 else
3100 if (dump_enabled_p ())
3101 report_vect_op (MSG_NOTE, def_stmt, "detected reduction: ");
3104 return def_stmt;
3107 /* Try to find SLP reduction chain. */
3108 if (! nested_in_vect_loop
3109 && code != COND_EXPR
3110 && orig_code != MINUS_EXPR
3111 && vect_is_slp_reduction (loop_info, phi, def_stmt))
3113 if (dump_enabled_p ())
3114 report_vect_op (MSG_NOTE, def_stmt,
3115 "reduction: detected reduction chain: ");
3117 return def_stmt;
3120 /* Dissolve any group half-built by vect_is_slp_reduction. */
3121 gimple *first = GROUP_FIRST_ELEMENT (vinfo_for_stmt (def_stmt));
3122 while (first)
3124 gimple *next = GROUP_NEXT_ELEMENT (vinfo_for_stmt (first));
3125 GROUP_FIRST_ELEMENT (vinfo_for_stmt (first)) = NULL;
3126 GROUP_NEXT_ELEMENT (vinfo_for_stmt (first)) = NULL;
3127 first = next;
3130 /* Look for the expression computing loop_arg from loop PHI result. */
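  /* The walk below is an iterative depth-first search over the SSA
     operands.  Illustrative example (not taken from the sources): for

       sum_1 = PHI <sum_0 (preheader), sum_3 (latch)>
       sum_2 = sum_1 + a[i];
       sum_3 = sum_2 + b[i];

     it starts at the latch argument sum_3 and records the path
     sum_3 -> sum_2 -> sum_1; the validity check further down then
     requires every statement on that path to use the same operation
     code.  */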
3131 auto_vec<std::pair<ssa_op_iter, use_operand_p> > path;
3132 auto_bitmap visited;
3133 tree lookfor = PHI_RESULT (phi);
3134 ssa_op_iter curri;
3135 use_operand_p curr = op_iter_init_phiuse (&curri, as_a <gphi *>(phi),
3136 SSA_OP_USE);
3137 while (USE_FROM_PTR (curr) != loop_arg)
3138 curr = op_iter_next_use (&curri);
3139 curri.i = curri.numops;
3142 path.safe_push (std::make_pair (curri, curr));
3143 tree use = USE_FROM_PTR (curr);
3144 if (use == lookfor)
3145 break;
3146 gimple *def = SSA_NAME_DEF_STMT (use);
3147 if (gimple_nop_p (def)
3148 || ! flow_bb_inside_loop_p (loop, gimple_bb (def)))
3150 pop:
3153 std::pair<ssa_op_iter, use_operand_p> x = path.pop ();
3154 curri = x.first;
3155 curr = x.second;
3157 curr = op_iter_next_use (&curri);
3158 /* Skip already visited or non-SSA operands (from iterating
3159 over PHI args). */
3160 while (curr != NULL_USE_OPERAND_P
3161 && (TREE_CODE (USE_FROM_PTR (curr)) != SSA_NAME
3162 || ! bitmap_set_bit (visited,
3163 SSA_NAME_VERSION
3164 (USE_FROM_PTR (curr)))));
3166 while (curr == NULL_USE_OPERAND_P && ! path.is_empty ());
3167 if (curr == NULL_USE_OPERAND_P)
3168 break;
3170 else
3172 if (gimple_code (def) == GIMPLE_PHI)
3173 curr = op_iter_init_phiuse (&curri, as_a <gphi *>(def), SSA_OP_USE);
3174 else
3175 curr = op_iter_init_use (&curri, def, SSA_OP_USE);
3176 while (curr != NULL_USE_OPERAND_P
3177 && (TREE_CODE (USE_FROM_PTR (curr)) != SSA_NAME
3178 || ! bitmap_set_bit (visited,
3179 SSA_NAME_VERSION
3180 (USE_FROM_PTR (curr)))))
3181 curr = op_iter_next_use (&curri);
3182 if (curr == NULL_USE_OPERAND_P)
3183 goto pop;
3186 while (1);
3187 if (dump_file && (dump_flags & TDF_DETAILS))
3189 dump_printf_loc (MSG_NOTE, vect_location,
3190 "reduction path: ");
3191 unsigned i;
3192 std::pair<ssa_op_iter, use_operand_p> *x;
3193 FOR_EACH_VEC_ELT (path, i, x)
3195 dump_generic_expr (MSG_NOTE, TDF_SLIM, USE_FROM_PTR (x->second));
3196 dump_printf (MSG_NOTE, " ");
3198 dump_printf (MSG_NOTE, "\n");
3201 /* Check whether the reduction path detected is valid. */
3202 bool fail = path.length () == 0;
3203 bool neg = false;
3204 for (unsigned i = 1; i < path.length (); ++i)
3206 gimple *use_stmt = USE_STMT (path[i].second);
3207 tree op = USE_FROM_PTR (path[i].second);
3208 if (! has_single_use (op)
3209 || ! is_gimple_assign (use_stmt))
3211 fail = true;
3212 break;
3214 if (gimple_assign_rhs_code (use_stmt) != code)
3216 if (code == PLUS_EXPR
3217 && gimple_assign_rhs_code (use_stmt) == MINUS_EXPR)
3219 /* Track whether we negate the reduction value each iteration. */
3220 if (gimple_assign_rhs2 (use_stmt) == op)
3221 neg = ! neg;
3223 else
3225 fail = true;
3226 break;
3230 if (! fail && ! neg)
3231 return def_stmt;
3233 if (dump_enabled_p ())
3235 report_vect_op (MSG_MISSED_OPTIMIZATION, def_stmt,
3236 "reduction: unknown pattern: ");
3239 return NULL;
3242 /* Wrapper around vect_is_simple_reduction, which will modify code
3243 in-place if it enables detection of more reductions. Arguments
3244 are the same as for vect_is_simple_reduction. */
3246 gimple *
3247 vect_force_simple_reduction (loop_vec_info loop_info, gimple *phi,
3248 bool *double_reduc,
3249 bool need_wrapping_integral_overflow)
3251 enum vect_reduction_type v_reduc_type;
3252 gimple *def = vect_is_simple_reduction (loop_info, phi, double_reduc,
3253 need_wrapping_integral_overflow,
3254 &v_reduc_type);
3255 if (def)
3257 stmt_vec_info reduc_def_info = vinfo_for_stmt (phi);
3258 STMT_VINFO_REDUC_TYPE (reduc_def_info) = v_reduc_type;
3259 STMT_VINFO_REDUC_DEF (reduc_def_info) = def;
3260 reduc_def_info = vinfo_for_stmt (def);
3261 STMT_VINFO_REDUC_DEF (reduc_def_info) = phi;
3263 return def;
3266 /* Calculate cost of peeling the loop PEEL_ITERS_PROLOGUE times. */
3268 vect_get_known_peeling_cost (loop_vec_info loop_vinfo, int peel_iters_prologue,
3269 int *peel_iters_epilogue,
3270 stmt_vector_for_cost *scalar_cost_vec,
3271 stmt_vector_for_cost *prologue_cost_vec,
3272 stmt_vector_for_cost *epilogue_cost_vec)
3274 int retval = 0;
3275 int vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
3277 if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
3279 *peel_iters_epilogue = vf/2;
3280 if (dump_enabled_p ())
3281 dump_printf_loc (MSG_NOTE, vect_location,
3282 "cost model: epilogue peel iters set to vf/2 "
3283 "because loop iterations are unknown .\n");
3285 /* If peeled iterations are known but number of scalar loop
3286 iterations is unknown, count a taken branch per peeled loop. */
3287 retval = record_stmt_cost (prologue_cost_vec, 1, cond_branch_taken,
3288 NULL, 0, vect_prologue);
3289 retval = record_stmt_cost (prologue_cost_vec, 1, cond_branch_taken,
3290 NULL, 0, vect_epilogue);
3292 else
3294 int niters = LOOP_VINFO_INT_NITERS (loop_vinfo);
3295 peel_iters_prologue = niters < peel_iters_prologue ?
3296 niters : peel_iters_prologue;
3297 *peel_iters_epilogue = (niters - peel_iters_prologue) % vf;
3298 /* If we need to peel for gaps, but no peeling is required, we have to
3299 peel VF iterations. */
3300 if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo) && !*peel_iters_epilogue)
3301 *peel_iters_epilogue = vf;
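  /* Illustrative example (not taken from the sources): with
     NITERS == 100, PEEL_ITERS_PROLOGUE == 3 and VF == 8 the epilogue
     gets (100 - 3) % 8 == 1 iteration; had the remainder been 0 while
     peeling for gaps is required, a full VF == 8 iterations would be
     charged instead.  */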
3304 stmt_info_for_cost *si;
3305 int j;
3306 if (peel_iters_prologue)
3307 FOR_EACH_VEC_ELT (*scalar_cost_vec, j, si)
3309 stmt_vec_info stmt_info
3310 = si->stmt ? vinfo_for_stmt (si->stmt) : NULL;
3311 retval += record_stmt_cost (prologue_cost_vec,
3312 si->count * peel_iters_prologue,
3313 si->kind, stmt_info, si->misalign,
3314 vect_prologue);
3316 if (*peel_iters_epilogue)
3317 FOR_EACH_VEC_ELT (*scalar_cost_vec, j, si)
3319 stmt_vec_info stmt_info
3320 = si->stmt ? vinfo_for_stmt (si->stmt) : NULL;
3321 retval += record_stmt_cost (epilogue_cost_vec,
3322 si->count * *peel_iters_epilogue,
3323 si->kind, stmt_info, si->misalign,
3324 vect_epilogue);
3327 return retval;
3330 /* Function vect_estimate_min_profitable_iters
3332 Return the number of iterations required for the vector version of the
3333 loop to be profitable relative to the cost of the scalar version of the
3334 loop.
3336 *RET_MIN_PROFITABLE_NITERS is a cost model profitability threshold
3337 of iterations for vectorization. A value of -1 means loop vectorization
3338 is not profitable. This returned value may be used for dynamic
3339 profitability check.
3341 *RET_MIN_PROFITABLE_ESTIMATE is a profitability threshold to be used
3342 for static check against estimated number of iterations. */
3344 static void
3345 vect_estimate_min_profitable_iters (loop_vec_info loop_vinfo,
3346 int *ret_min_profitable_niters,
3347 int *ret_min_profitable_estimate)
3349 int min_profitable_iters;
3350 int min_profitable_estimate;
3351 int peel_iters_prologue;
3352 int peel_iters_epilogue;
3353 unsigned vec_inside_cost = 0;
3354 int vec_outside_cost = 0;
3355 unsigned vec_prologue_cost = 0;
3356 unsigned vec_epilogue_cost = 0;
3357 int scalar_single_iter_cost = 0;
3358 int scalar_outside_cost = 0;
3359 int vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
3360 int npeel = LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo);
3361 void *target_cost_data = LOOP_VINFO_TARGET_COST_DATA (loop_vinfo);
3363 /* Cost model disabled. */
3364 if (unlimited_cost_model (LOOP_VINFO_LOOP (loop_vinfo)))
3366 dump_printf_loc (MSG_NOTE, vect_location, "cost model disabled.\n");
3367 *ret_min_profitable_niters = 0;
3368 *ret_min_profitable_estimate = 0;
3369 return;
3372 /* Requires loop versioning tests to handle misalignment. */
3373 if (LOOP_REQUIRES_VERSIONING_FOR_ALIGNMENT (loop_vinfo))
3375 /* FIXME: Make cost depend on complexity of individual check. */
3376 unsigned len = LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).length ();
3377 (void) add_stmt_cost (target_cost_data, len, vector_stmt, NULL, 0,
3378 vect_prologue);
3379 dump_printf (MSG_NOTE,
3380 "cost model: Adding cost of checks for loop "
3381 "versioning to treat misalignment.\n");
3384 /* Requires loop versioning with alias checks. */
3385 if (LOOP_REQUIRES_VERSIONING_FOR_ALIAS (loop_vinfo))
3387 /* FIXME: Make cost depend on complexity of individual check. */
3388 unsigned len = LOOP_VINFO_COMP_ALIAS_DDRS (loop_vinfo).length ();
3389 (void) add_stmt_cost (target_cost_data, len, vector_stmt, NULL, 0,
3390 vect_prologue);
3391 len = LOOP_VINFO_CHECK_UNEQUAL_ADDRS (loop_vinfo).length ();
3392 if (len)
3393 /* Count LEN - 1 ANDs and LEN comparisons. */
3394 (void) add_stmt_cost (target_cost_data, len * 2 - 1, scalar_stmt,
3395 NULL, 0, vect_prologue);
3396 dump_printf (MSG_NOTE,
3397 "cost model: Adding cost of checks for loop "
3398 "versioning aliasing.\n");
3401 /* Requires loop versioning with niter checks. */
3402 if (LOOP_REQUIRES_VERSIONING_FOR_NITERS (loop_vinfo))
3404 /* FIXME: Make cost depend on complexity of individual check. */
3405 (void) add_stmt_cost (target_cost_data, 1, vector_stmt, NULL, 0,
3406 vect_prologue);
3407 dump_printf (MSG_NOTE,
3408 "cost model: Adding cost of checks for loop "
3409 "versioning niters.\n");
3412 if (LOOP_REQUIRES_VERSIONING (loop_vinfo))
3413 (void) add_stmt_cost (target_cost_data, 1, cond_branch_taken, NULL, 0,
3414 vect_prologue);
3416 /* Count statements in scalar loop. Using this as scalar cost for a single
3417 iteration for now.
3419 TODO: Add outer loop support.
3421 TODO: Consider assigning different costs to different scalar
3422 statements. */
3424 scalar_single_iter_cost
3425 = LOOP_VINFO_SINGLE_SCALAR_ITERATION_COST (loop_vinfo);
3427 /* Add additional cost for the peeled instructions in prologue and epilogue
3428 loop.
3430 FORNOW: If we don't know the value of peel_iters for prologue or epilogue
3431 at compile-time - we assume it's vf/2 (the worst would be vf-1).
3433 TODO: Build an expression that represents peel_iters for prologue and
3434 epilogue to be used in a run-time test. */
3436 if (npeel < 0)
3438 peel_iters_prologue = vf/2;
3439 dump_printf (MSG_NOTE, "cost model: "
3440 "prologue peel iters set to vf/2.\n");
3442 /* If peeling for alignment is unknown, loop bound of main loop becomes
3443 unknown. */
3444 peel_iters_epilogue = vf/2;
3445 dump_printf (MSG_NOTE, "cost model: "
3446 "epilogue peel iters set to vf/2 because "
3447 "peeling for alignment is unknown.\n");
3449 /* If peeled iterations are unknown, count a taken branch and a not taken
3450 branch per peeled loop. Even if scalar loop iterations are known,
3451 vector iterations are not known since peeled prologue iterations are
3452 not known. Hence guards remain the same. */
3453 (void) add_stmt_cost (target_cost_data, 1, cond_branch_taken,
3454 NULL, 0, vect_prologue);
3455 (void) add_stmt_cost (target_cost_data, 1, cond_branch_not_taken,
3456 NULL, 0, vect_prologue);
3457 (void) add_stmt_cost (target_cost_data, 1, cond_branch_taken,
3458 NULL, 0, vect_epilogue);
3459 (void) add_stmt_cost (target_cost_data, 1, cond_branch_not_taken,
3460 NULL, 0, vect_epilogue);
3461 stmt_info_for_cost *si;
3462 int j;
3463 FOR_EACH_VEC_ELT (LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo), j, si)
3465 struct _stmt_vec_info *stmt_info
3466 = si->stmt ? vinfo_for_stmt (si->stmt) : NULL;
3467 (void) add_stmt_cost (target_cost_data,
3468 si->count * peel_iters_prologue,
3469 si->kind, stmt_info, si->misalign,
3470 vect_prologue);
3471 (void) add_stmt_cost (target_cost_data,
3472 si->count * peel_iters_epilogue,
3473 si->kind, stmt_info, si->misalign,
3474 vect_epilogue);
3477 else
3479 stmt_vector_for_cost prologue_cost_vec, epilogue_cost_vec;
3480 stmt_info_for_cost *si;
3481 int j;
3482 void *data = LOOP_VINFO_TARGET_COST_DATA (loop_vinfo);
3484 prologue_cost_vec.create (2);
3485 epilogue_cost_vec.create (2);
3486 peel_iters_prologue = npeel;
3488 (void) vect_get_known_peeling_cost (loop_vinfo, peel_iters_prologue,
3489 &peel_iters_epilogue,
3490 &LOOP_VINFO_SCALAR_ITERATION_COST
3491 (loop_vinfo),
3492 &prologue_cost_vec,
3493 &epilogue_cost_vec);
3495 FOR_EACH_VEC_ELT (prologue_cost_vec, j, si)
3497 struct _stmt_vec_info *stmt_info
3498 = si->stmt ? vinfo_for_stmt (si->stmt) : NULL;
3499 (void) add_stmt_cost (data, si->count, si->kind, stmt_info,
3500 si->misalign, vect_prologue);
3503 FOR_EACH_VEC_ELT (epilogue_cost_vec, j, si)
3505 struct _stmt_vec_info *stmt_info
3506 = si->stmt ? vinfo_for_stmt (si->stmt) : NULL;
3507 (void) add_stmt_cost (data, si->count, si->kind, stmt_info,
3508 si->misalign, vect_epilogue);
3511 prologue_cost_vec.release ();
3512 epilogue_cost_vec.release ();
3515 /* FORNOW: The scalar outside cost is incremented in one of the
3516 following ways:
3518 1. The vectorizer checks for alignment and aliasing and generates
3519 a condition that allows dynamic vectorization. A cost model
3520 check is ANDED with the versioning condition. Hence scalar code
3521 path now has the added cost of the versioning check.
3523 if (cost > th & versioning_check)
3524 jmp to vector code
3526 Hence the run-time scalar cost is incremented by a not-taken branch cost.
3528 2. The vectorizer then checks if a prologue is required. If the
3529 cost model check was not done before during versioning, it has to
3530 be done before the prologue check.
3532 if (cost <= th)
3533 prologue = scalar_iters
3534 if (prologue == 0)
3535 jmp to vector code
3536 else
3537 execute prologue
3538 if (prologue == num_iters)
3539 go to exit
3541 Hence the run-time scalar cost is incremented by a taken branch,
3542 plus a not-taken branch, plus a taken branch cost.
3544 3. The vectorizer then checks if an epilogue is required. If the
3545 cost model check was not done before during prologue check, it
3546 has to be done with the epilogue check.
3548 if (prologue == 0)
3549 jmp to vector code
3550 else
3551 execute prologue
3552 if (prologue == num_iters)
3553 go to exit
3554 vector code:
3555 if ((cost <= th) | (scalar_iters-prologue-epilogue == 0))
3556 jmp to epilogue
3558 Hence the run-time scalar cost should be incremented by 2 taken
3559 branches.
3561 TODO: The back end may reorder the BBS's differently and reverse
3562 conditions/branch directions. Change the estimates below to
3563 something more reasonable. */
3565 /* If the number of iterations is known and we do not do versioning, we can
3566 decide whether to vectorize at compile time. Hence the scalar version
3567 does not carry cost model guard costs. */
3568 if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
3569 || LOOP_REQUIRES_VERSIONING (loop_vinfo))
3571 /* Cost model check occurs at versioning. */
3572 if (LOOP_REQUIRES_VERSIONING (loop_vinfo))
3573 scalar_outside_cost += vect_get_stmt_cost (cond_branch_not_taken);
3574 else
3576 /* Cost model check occurs at prologue generation. */
3577 if (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) < 0)
3578 scalar_outside_cost += 2 * vect_get_stmt_cost (cond_branch_taken)
3579 + vect_get_stmt_cost (cond_branch_not_taken);
3580 /* Cost model check occurs at epilogue generation. */
3581 else
3582 scalar_outside_cost += 2 * vect_get_stmt_cost (cond_branch_taken);
3586 /* Complete the target-specific cost calculations. */
3587 finish_cost (LOOP_VINFO_TARGET_COST_DATA (loop_vinfo), &vec_prologue_cost,
3588 &vec_inside_cost, &vec_epilogue_cost);
3590 vec_outside_cost = (int)(vec_prologue_cost + vec_epilogue_cost);
3592 if (dump_enabled_p ())
3594 dump_printf_loc (MSG_NOTE, vect_location, "Cost model analysis: \n");
3595 dump_printf (MSG_NOTE, " Vector inside of loop cost: %d\n",
3596 vec_inside_cost);
3597 dump_printf (MSG_NOTE, " Vector prologue cost: %d\n",
3598 vec_prologue_cost);
3599 dump_printf (MSG_NOTE, " Vector epilogue cost: %d\n",
3600 vec_epilogue_cost);
3601 dump_printf (MSG_NOTE, " Scalar iteration cost: %d\n",
3602 scalar_single_iter_cost);
3603 dump_printf (MSG_NOTE, " Scalar outside cost: %d\n",
3604 scalar_outside_cost);
3605 dump_printf (MSG_NOTE, " Vector outside cost: %d\n",
3606 vec_outside_cost);
3607 dump_printf (MSG_NOTE, " prologue iterations: %d\n",
3608 peel_iters_prologue);
3609 dump_printf (MSG_NOTE, " epilogue iterations: %d\n",
3610 peel_iters_epilogue);
3613 /* Calculate number of iterations required to make the vector version
3614 profitable, relative to the loop bodies only. The following condition
3615 must hold true:
3616 SIC * niters + SOC > VIC * ((niters-PL_ITERS-EP_ITERS)/VF) + VOC
3617 where
3618 SIC = scalar iteration cost, VIC = vector iteration cost,
3619 VOC = vector outside cost, VF = vectorization factor,
3620 PL_ITERS = prologue iterations, EP_ITERS= epilogue iterations
3621 SOC = scalar outside cost for run time cost model check. */
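  /* Illustrative numbers (not taken from the sources): with SIC == 4,
     VIC == 6, VOC == 20, SOC == 0, VF == 4 and no peeling, the condition
     4 * niters > 6 * (niters / 4) + 20 holds for niters > 8, and the
     computation below accordingly yields min_profitable_iters == 9
     before the clamping further down.  */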
3623 if ((scalar_single_iter_cost * vf) > (int) vec_inside_cost)
3625 if (vec_outside_cost <= 0)
3626 min_profitable_iters = 0;
3627 else
3629 min_profitable_iters = ((vec_outside_cost - scalar_outside_cost) * vf
3630 - vec_inside_cost * peel_iters_prologue
3631 - vec_inside_cost * peel_iters_epilogue)
3632 / ((scalar_single_iter_cost * vf)
3633 - vec_inside_cost);
3635 if ((scalar_single_iter_cost * vf * min_profitable_iters)
3636 <= (((int) vec_inside_cost * min_profitable_iters)
3637 + (((int) vec_outside_cost - scalar_outside_cost) * vf)))
3638 min_profitable_iters++;
3641 /* vector version will never be profitable. */
3642 else
3644 if (LOOP_VINFO_LOOP (loop_vinfo)->force_vectorize)
3645 warning_at (vect_location, OPT_Wopenmp_simd, "vectorization "
3646 "did not happen for a simd loop");
3648 if (dump_enabled_p ())
3649 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3650 "cost model: the vector iteration cost = %d "
3651 "divided by the scalar iteration cost = %d "
3652 "is greater or equal to the vectorization factor = %d"
3653 ".\n",
3654 vec_inside_cost, scalar_single_iter_cost, vf);
3655 *ret_min_profitable_niters = -1;
3656 *ret_min_profitable_estimate = -1;
3657 return;
3660 dump_printf (MSG_NOTE,
3661 " Calculated minimum iters for profitability: %d\n",
3662 min_profitable_iters);
3664 /* We want the vectorized loop to execute at least once. */
3665 if (min_profitable_iters < (vf + peel_iters_prologue))
3666 min_profitable_iters = vf + peel_iters_prologue;
3668 if (dump_enabled_p ())
3669 dump_printf_loc (MSG_NOTE, vect_location,
3670 " Runtime profitability threshold = %d\n",
3671 min_profitable_iters);
3673 *ret_min_profitable_niters = min_profitable_iters;
3675 /* Calculate number of iterations required to make the vector version
3676 profitable, relative to the loop bodies only.
3678 The cost of the non-vectorized variant is SIC * niters and it must win over
3679 the vector variant at the expected loop trip count. The following condition must hold true:
3680 SIC * niters > VIC * ((niters-PL_ITERS-EP_ITERS)/VF) + VOC + SOC */
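  /* The code below solves this for niters, i.e. (illustratively)
     min_profitable_estimate
       = ((VOC + SOC) * VF - VIC * (PL_ITERS + EP_ITERS))
         / (SIC * VF - VIC),
     and the result is then never allowed to drop below the runtime
     threshold computed above.  */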
3682 if (vec_outside_cost <= 0)
3683 min_profitable_estimate = 0;
3684 else
3686 min_profitable_estimate = ((vec_outside_cost + scalar_outside_cost) * vf
3687 - vec_inside_cost * peel_iters_prologue
3688 - vec_inside_cost * peel_iters_epilogue)
3689 / ((scalar_single_iter_cost * vf)
3690 - vec_inside_cost);
3692 min_profitable_estimate = MAX (min_profitable_estimate, min_profitable_iters);
3693 if (dump_enabled_p ())
3694 dump_printf_loc (MSG_NOTE, vect_location,
3695 " Static estimate profitability threshold = %d\n",
3696 min_profitable_estimate);
3698 *ret_min_profitable_estimate = min_profitable_estimate;
3701 /* Writes into SEL a mask for a vec_perm, equivalent to a vec_shr by OFFSET
3702 vector elements (not bits) for a vector with NELT elements. */
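/* Illustrative example: OFFSET == 2 and NELT == 4 produce
   SEL == {2, 3, 4, 5}, i.e. the last two elements of the first input
   followed by the first two elements of the second input, which is a
   shift by two elements.  */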
3703 static void
3704 calc_vec_perm_mask_for_shift (unsigned int offset, unsigned int nelt,
3705 vec_perm_indices *sel)
3707 unsigned int i;
3709 for (i = 0; i < nelt; i++)
3710 sel->quick_push ((i + offset) & (2 * nelt - 1));
3713 /* Checks whether the target supports whole-vector shifts for vectors of mode
3714 MODE. This is the case if _either_ the platform handles vec_shr_optab, _or_
3715 it supports vec_perm_const with masks for all necessary shift amounts. */
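/* Illustrative example: for a mode with nelt == 8 elements the loop below
   checks permutation masks for shifts by 4, 2 and 1 elements, the shift
   amounts used when a vector is reduced by repeated halving.  */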
3716 static bool
3717 have_whole_vector_shift (machine_mode mode)
3719 if (optab_handler (vec_shr_optab, mode) != CODE_FOR_nothing)
3720 return true;
3722 if (direct_optab_handler (vec_perm_const_optab, mode) == CODE_FOR_nothing)
3723 return false;
3725 unsigned int i, nelt = GET_MODE_NUNITS (mode);
3726 auto_vec_perm_indices sel (nelt);
3728 for (i = nelt/2; i >= 1; i/=2)
3730 sel.truncate (0);
3731 calc_vec_perm_mask_for_shift (i, nelt, &sel);
3732 if (!can_vec_perm_p (mode, false, &sel))
3733 return false;
3735 return true;
3738 /* TODO: There is a close dependency between the vect_model_*_cost and
3739 vectorizable_* functions.  Design them better to avoid maintenance issues. */
3741 /* Function vect_model_reduction_cost.
3743 Models cost for a reduction operation, including the vector ops
3744 generated within the strip-mine loop, the initial definition before
3745 the loop, and the epilogue code that must be generated. */
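/* For example (an illustrative breakdown, not a statement about any
   particular target's cost hooks): a plain PLUS reduction with
   NCOPIES == 1 and a supported REDUC_CODE is charged one vector_stmt in
   the loop body, one scalar_to_vec in the prologue, and one vector_stmt
   plus one vec_to_scalar in the epilogue.  */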
3747 static void
3748 vect_model_reduction_cost (stmt_vec_info stmt_info, enum tree_code reduc_code,
3749 int ncopies)
3751 int prologue_cost = 0, epilogue_cost = 0;
3752 enum tree_code code;
3753 optab optab;
3754 tree vectype;
3755 gimple *orig_stmt;
3756 machine_mode mode;
3757 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
3758 struct loop *loop = NULL;
3759 void *target_cost_data;
3761 if (loop_vinfo)
3763 loop = LOOP_VINFO_LOOP (loop_vinfo);
3764 target_cost_data = LOOP_VINFO_TARGET_COST_DATA (loop_vinfo);
3766 else
3767 target_cost_data = BB_VINFO_TARGET_COST_DATA (STMT_VINFO_BB_VINFO (stmt_info));
3769 /* Condition reductions generate two reductions in the loop. */
3770 if (STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info) == COND_REDUCTION)
3771 ncopies *= 2;
3773 /* Cost of reduction op inside loop. */
3774 unsigned inside_cost = add_stmt_cost (target_cost_data, ncopies, vector_stmt,
3775 stmt_info, 0, vect_body);
3777 vectype = STMT_VINFO_VECTYPE (stmt_info);
3778 mode = TYPE_MODE (vectype);
3779 orig_stmt = STMT_VINFO_RELATED_STMT (stmt_info);
3781 if (!orig_stmt)
3782 orig_stmt = STMT_VINFO_STMT (stmt_info);
3784 code = gimple_assign_rhs_code (orig_stmt);
3786 /* Add in cost for initial definition.
3787 For cond reduction we have four vectors: initial index, step, initial
3788 result of the data reduction, initial value of the index reduction. */
3789 int prologue_stmts = STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info)
3790 == COND_REDUCTION ? 4 : 1;
3791 prologue_cost += add_stmt_cost (target_cost_data, prologue_stmts,
3792 scalar_to_vec, stmt_info, 0,
3793 vect_prologue);
3795 /* Determine cost of epilogue code.
3797 We have a reduction operator that will reduce the vector in one statement.
3798 Also requires scalar extract. */
3800 if (!loop || !nested_in_vect_loop_p (loop, orig_stmt))
3802 if (reduc_code != ERROR_MARK)
3804 if (STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info) == COND_REDUCTION)
3806 /* An EQ stmt and a COND_EXPR stmt. */
3807 epilogue_cost += add_stmt_cost (target_cost_data, 2,
3808 vector_stmt, stmt_info, 0,
3809 vect_epilogue);
3810 /* Reduction of the max index and a reduction of the found
3811 values. */
3812 epilogue_cost += add_stmt_cost (target_cost_data, 2,
3813 vec_to_scalar, stmt_info, 0,
3814 vect_epilogue);
3815 /* A broadcast of the max value. */
3816 epilogue_cost += add_stmt_cost (target_cost_data, 1,
3817 scalar_to_vec, stmt_info, 0,
3818 vect_epilogue);
3820 else
3822 epilogue_cost += add_stmt_cost (target_cost_data, 1, vector_stmt,
3823 stmt_info, 0, vect_epilogue);
3824 epilogue_cost += add_stmt_cost (target_cost_data, 1,
3825 vec_to_scalar, stmt_info, 0,
3826 vect_epilogue);
3829 else if (STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info) == COND_REDUCTION)
3831 unsigned nunits = TYPE_VECTOR_SUBPARTS (vectype);
3832 /* Extraction of scalar elements. */
3833 epilogue_cost += add_stmt_cost (target_cost_data, 2 * nunits,
3834 vec_to_scalar, stmt_info, 0,
3835 vect_epilogue);
3836 /* Scalar max reductions via COND_EXPR / MAX_EXPR. */
3837 epilogue_cost += add_stmt_cost (target_cost_data, 2 * nunits - 3,
3838 scalar_stmt, stmt_info, 0,
3839 vect_epilogue);
3841 else
3843 int vec_size_in_bits = tree_to_uhwi (TYPE_SIZE (vectype));
3844 tree bitsize =
3845 TYPE_SIZE (TREE_TYPE (gimple_assign_lhs (orig_stmt)));
3846 int element_bitsize = tree_to_uhwi (bitsize);
3847 int nelements = vec_size_in_bits / element_bitsize;
3849 if (code == COND_EXPR)
3850 code = MAX_EXPR;
3852 optab = optab_for_tree_code (code, vectype, optab_default);
3854 /* We have a whole vector shift available. */
3855 if (optab != unknown_optab
3856 && VECTOR_MODE_P (mode)
3857 && optab_handler (optab, mode) != CODE_FOR_nothing
3858 && have_whole_vector_shift (mode))
3860 /* Final reduction via vector shifts and the reduction operator.
3861 Also requires scalar extract. */
3862 epilogue_cost += add_stmt_cost (target_cost_data,
3863 exact_log2 (nelements) * 2,
3864 vector_stmt, stmt_info, 0,
3865 vect_epilogue);
3866 epilogue_cost += add_stmt_cost (target_cost_data, 1,
3867 vec_to_scalar, stmt_info, 0,
3868 vect_epilogue);
3870 else
3871 /* Use extracts and reduction op for final reduction. For N
3872 elements, we have N extracts and N-1 reduction ops. */
3873 epilogue_cost += add_stmt_cost (target_cost_data,
3874 nelements + nelements - 1,
3875 vector_stmt, stmt_info, 0,
3876 vect_epilogue);
3880 if (dump_enabled_p ())
3881 dump_printf (MSG_NOTE,
3882 "vect_model_reduction_cost: inside_cost = %d, "
3883 "prologue_cost = %d, epilogue_cost = %d .\n", inside_cost,
3884 prologue_cost, epilogue_cost);
3888 /* Function vect_model_induction_cost.
3890 Models cost for induction operations. */
3892 static void
3893 vect_model_induction_cost (stmt_vec_info stmt_info, int ncopies)
3895 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
3896 void *target_cost_data = LOOP_VINFO_TARGET_COST_DATA (loop_vinfo);
3897 unsigned inside_cost, prologue_cost;
3899 if (PURE_SLP_STMT (stmt_info))
3900 return;
3902 /* loop cost for vec_loop. */
3903 inside_cost = add_stmt_cost (target_cost_data, ncopies, vector_stmt,
3904 stmt_info, 0, vect_body);
3906 /* prologue cost for vec_init and vec_step. */
3907 prologue_cost = add_stmt_cost (target_cost_data, 2, scalar_to_vec,
3908 stmt_info, 0, vect_prologue);
3910 if (dump_enabled_p ())
3911 dump_printf_loc (MSG_NOTE, vect_location,
3912 "vect_model_induction_cost: inside_cost = %d, "
3913 "prologue_cost = %d .\n", inside_cost, prologue_cost);
3918 /* Function get_initial_def_for_reduction
3920 Input:
3921 STMT - a stmt that performs a reduction operation in the loop.
3922 INIT_VAL - the initial value of the reduction variable
3924 Output:
3925 ADJUSTMENT_DEF - a tree that holds a value to be added to the final result
3926 of the reduction (used for adjusting the epilog - see below).
3927 Return a vector variable, initialized according to the operation that STMT
3928 performs. This vector will be used as the initial value of the
3929 vector of partial results.
3931 Option1 (adjust in epilog): Initialize the vector as follows:
3932 add/bit or/xor: [0,0,...,0,0]
3933 mult/bit and: [1,1,...,1,1]
3934 min/max/cond_expr: [init_val,init_val,..,init_val,init_val]
3935 and when necessary (e.g. add/mult case) let the caller know
3936 that it needs to adjust the result by init_val.
3938 Option2: Initialize the vector as follows:
3939 add/bit or/xor: [init_val,0,0,...,0]
3940 mult/bit and: [init_val,1,1,...,1]
3941 min/max/cond_expr: [init_val,init_val,...,init_val]
3942 and no adjustments are needed.
3944 For example, for the following code:
3946 s = init_val;
3947 for (i=0;i<n;i++)
3948 s = s + a[i];
3950 STMT is 's = s + a[i]', and the reduction variable is 's'.
3951 For a vector of 4 units, we want to return either [0,0,0,init_val],
3952 or [0,0,0,0] and let the caller know that it needs to adjust
3953 the result at the end by 'init_val'.
3955 FORNOW, we use Option1 (the 'adjust in epilog' scheme) when ADJUSTMENT_DEF
3956 is not NULL, because this way the initialization vector is simpler (the
3957 same element in all entries), and Option2 otherwise.
3959 A cost model should help decide between these two schemes. */
3961 tree
3962 get_initial_def_for_reduction (gimple *stmt, tree init_val,
3963 tree *adjustment_def)
3965 stmt_vec_info stmt_vinfo = vinfo_for_stmt (stmt);
3966 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_vinfo);
3967 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
3968 tree scalar_type = TREE_TYPE (init_val);
3969 tree vectype = get_vectype_for_scalar_type (scalar_type);
3970 int nunits;
3971 enum tree_code code = gimple_assign_rhs_code (stmt);
3972 tree def_for_init;
3973 tree init_def;
3974 int i;
3975 bool nested_in_vect_loop = false;
3976 REAL_VALUE_TYPE real_init_val = dconst0;
3977 int int_init_val = 0;
3978 gimple *def_stmt = NULL;
3979 gimple_seq stmts = NULL;
3981 gcc_assert (vectype);
3982 nunits = TYPE_VECTOR_SUBPARTS (vectype);
3984 gcc_assert (POINTER_TYPE_P (scalar_type) || INTEGRAL_TYPE_P (scalar_type)
3985 || SCALAR_FLOAT_TYPE_P (scalar_type));
3987 if (nested_in_vect_loop_p (loop, stmt))
3988 nested_in_vect_loop = true;
3989 else
3990 gcc_assert (loop == (gimple_bb (stmt))->loop_father);
3992 /* In case of double reduction we only create a vector variable to be put
3993 in the reduction phi node. The actual statement creation is done in
3994 vect_create_epilog_for_reduction. */
3995 if (adjustment_def && nested_in_vect_loop
3996 && TREE_CODE (init_val) == SSA_NAME
3997 && (def_stmt = SSA_NAME_DEF_STMT (init_val))
3998 && gimple_code (def_stmt) == GIMPLE_PHI
3999 && flow_bb_inside_loop_p (loop, gimple_bb (def_stmt))
4000 && vinfo_for_stmt (def_stmt)
4001 && STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def_stmt))
4002 == vect_double_reduction_def)
4004 *adjustment_def = NULL;
4005 return vect_create_destination_var (init_val, vectype);
4008 /* In case of a nested reduction do not use an adjustment def, as
4009 the epilogue generation does not handle that case correctly
4010 when ncopies is not one. */
4011 if (adjustment_def && nested_in_vect_loop)
4013 *adjustment_def = NULL;
4014 return vect_get_vec_def_for_operand (init_val, stmt);
4017 switch (code)
4019 case WIDEN_SUM_EXPR:
4020 case DOT_PROD_EXPR:
4021 case SAD_EXPR:
4022 case PLUS_EXPR:
4023 case MINUS_EXPR:
4024 case BIT_IOR_EXPR:
4025 case BIT_XOR_EXPR:
4026 case MULT_EXPR:
4027 case BIT_AND_EXPR:
4029 /* ADJUSTMENT_DEF is NULL when called from
4030 vect_create_epilog_for_reduction to vectorize double reduction. */
4031 if (adjustment_def)
4032 *adjustment_def = init_val;
4034 if (code == MULT_EXPR)
4036 real_init_val = dconst1;
4037 int_init_val = 1;
4040 if (code == BIT_AND_EXPR)
4041 int_init_val = -1;
4043 if (SCALAR_FLOAT_TYPE_P (scalar_type))
4044 def_for_init = build_real (scalar_type, real_init_val);
4045 else
4046 def_for_init = build_int_cst (scalar_type, int_init_val);
4048 if (adjustment_def)
4049 /* Option1: the first element is '0' or '1' as well. */
4050 init_def = gimple_build_vector_from_val (&stmts, vectype,
4051 def_for_init);
4052 else
4054 /* Option2: the first element is INIT_VAL. */
4055 auto_vec<tree, 32> elts (nunits);
4056 elts.quick_push (init_val);
4057 for (i = 1; i < nunits; ++i)
4058 elts.quick_push (def_for_init);
4059 init_def = gimple_build_vector (&stmts, vectype, elts);
4062 break;
4064 case MIN_EXPR:
4065 case MAX_EXPR:
4066 case COND_EXPR:
4068 if (adjustment_def)
4070 *adjustment_def = NULL_TREE;
4071 if (STMT_VINFO_VEC_REDUCTION_TYPE (stmt_vinfo) != COND_REDUCTION)
4073 init_def = vect_get_vec_def_for_operand (init_val, stmt);
4074 break;
4077 init_val = gimple_convert (&stmts, TREE_TYPE (vectype), init_val);
4078 init_def = gimple_build_vector_from_val (&stmts, vectype, init_val);
4080 break;
4082 default:
4083 gcc_unreachable ();
4086 if (stmts)
4087 gsi_insert_seq_on_edge_immediate (loop_preheader_edge (loop), stmts);
4088 return init_def;
4091 /* Get at the initial defs for the reduction PHIs in SLP_NODE.
4092 NUMBER_OF_VECTORS is the number of vector defs to create. */
4094 static void
4095 get_initial_defs_for_reduction (slp_tree slp_node,
4096 vec<tree> *vec_oprnds,
4097 unsigned int number_of_vectors,
4098 enum tree_code code, bool reduc_chain)
4100 vec<gimple *> stmts = SLP_TREE_SCALAR_STMTS (slp_node);
4101 gimple *stmt = stmts[0];
4102 stmt_vec_info stmt_vinfo = vinfo_for_stmt (stmt);
4103 unsigned nunits;
4104 unsigned j, number_of_places_left_in_vector;
4105 tree vector_type, scalar_type;
4106 tree vop;
4107 int group_size = stmts.length ();
4108 unsigned int vec_num, i;
4109 unsigned number_of_copies = 1;
4110 vec<tree> voprnds;
4111 voprnds.create (number_of_vectors);
4112 tree neutral_op = NULL;
4113 struct loop *loop;
4115 vector_type = STMT_VINFO_VECTYPE (stmt_vinfo);
4116 scalar_type = TREE_TYPE (vector_type);
4117 nunits = TYPE_VECTOR_SUBPARTS (vector_type);
4119 gcc_assert (STMT_VINFO_DEF_TYPE (stmt_vinfo) == vect_reduction_def);
4121 loop = (gimple_bb (stmt))->loop_father;
4122 gcc_assert (loop);
4123 edge pe = loop_preheader_edge (loop);
4125 /* op is the reduction operand of the first stmt already. */
4126 /* For additional copies (see the explanation of NUMBER_OF_COPIES below)
4127 we need either neutral operands or the original operands. See
4128 get_initial_def_for_reduction() for details. */
4129 switch (code)
4131 case WIDEN_SUM_EXPR:
4132 case DOT_PROD_EXPR:
4133 case SAD_EXPR:
4134 case PLUS_EXPR:
4135 case MINUS_EXPR:
4136 case BIT_IOR_EXPR:
4137 case BIT_XOR_EXPR:
4138 neutral_op = build_zero_cst (scalar_type);
4139 break;
4141 case MULT_EXPR:
4142 neutral_op = build_one_cst (scalar_type);
4143 break;
4145 case BIT_AND_EXPR:
4146 neutral_op = build_all_ones_cst (scalar_type);
4147 break;
4149 /* For MIN/MAX we don't have an easy neutral operand but
4150 the initial values can be used fine here. Only for
4151 a reduction chain do we have to force a neutral element. */
4152 case MAX_EXPR:
4153 case MIN_EXPR:
4154 if (! reduc_chain)
4155 neutral_op = NULL;
4156 else
4157 neutral_op = PHI_ARG_DEF_FROM_EDGE (stmt, pe);
4158 break;
4160 default:
4161 gcc_assert (! reduc_chain);
4162 neutral_op = NULL;
4165 /* NUMBER_OF_COPIES is the number of times we need to use the same values in
4166 created vectors. It is greater than 1 if unrolling is performed.
4168 For example, we have two scalar operands, s1 and s2 (e.g., group of
4169 strided accesses of size two), while NUNITS is four (i.e., four scalars
4170 of this type can be packed in a vector). The output vector will contain
4171 two copies of each scalar operand: {s1, s2, s1, s2}. (NUMBER_OF_COPIES
4172 will be 2).
4174 If GROUP_SIZE > NUNITS, the scalars will be split into several vectors
4175 containing the operands.
4177 For example, NUNITS is four as before, and the group size is 8
4178 (s1, s2, ..., s8). We will create two vectors {s1, s2, s3, s4} and
4179 {s5, s6, s7, s8}. */
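/* Concretely, NUMBER_OF_COPIES = NUNITS * NUMBER_OF_VECTORS / GROUP_SIZE,
   i.e. 4 * 1 / 2 = 2 in the first example above and 4 * 2 / 8 = 1 in the
   second (assuming one and two vectors are created, respectively).  */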
4181 number_of_copies = nunits * number_of_vectors / group_size;
4183 number_of_places_left_in_vector = nunits;
4184 auto_vec<tree, 32> elts (nunits);
4185 elts.quick_grow (nunits);
4186 for (j = 0; j < number_of_copies; j++)
4188 for (i = group_size - 1; stmts.iterate (i, &stmt); i--)
4190 tree op;
4191 /* Get the def before the loop. In reduction chain we have only
4192 one initial value. */
4193 if ((j != (number_of_copies - 1)
4194 || (reduc_chain && i != 0))
4195 && neutral_op)
4196 op = neutral_op;
4197 else
4198 op = PHI_ARG_DEF_FROM_EDGE (stmt, pe);
4200 /* Create 'vect_ = {op0,op1,...,opn}'. */
4201 number_of_places_left_in_vector--;
4202 elts[number_of_places_left_in_vector] = op;
4204 if (number_of_places_left_in_vector == 0)
4206 gimple_seq ctor_seq = NULL;
4207 tree init = gimple_build_vector (&ctor_seq, vector_type, elts);
4208 if (ctor_seq != NULL)
4209 gsi_insert_seq_on_edge_immediate (pe, ctor_seq);
4210 voprnds.quick_push (init);
4212 number_of_places_left_in_vector = nunits;
4217 /* Since the vectors are created in reverse order, reverse them back
4218 into the original order here. */
4219 vec_num = voprnds.length ();
4220 for (j = vec_num; j != 0; j--)
4222 vop = voprnds[j - 1];
4223 vec_oprnds->quick_push (vop);
4226 voprnds.release ();
4228 /* In case that VF is greater than the unrolling factor needed for the SLP
4229 group of stmts, NUMBER_OF_VECTORS to be created is greater than
4230 NUMBER_OF_SCALARS/NUNITS or NUNITS/NUMBER_OF_SCALARS, and hence we have
4231 to replicate the vectors. */
4232 tree neutral_vec = NULL;
4233 while (number_of_vectors > vec_oprnds->length ())
4235 if (neutral_op)
4237 if (!neutral_vec)
4239 gimple_seq ctor_seq = NULL;
4240 neutral_vec = gimple_build_vector_from_val
4241 (&ctor_seq, vector_type, neutral_op);
4242 if (ctor_seq != NULL)
4243 gsi_insert_seq_on_edge_immediate (pe, ctor_seq);
4245 vec_oprnds->quick_push (neutral_vec);
4247 else
4249 for (i = 0; vec_oprnds->iterate (i, &vop) && i < vec_num; i++)
4250 vec_oprnds->quick_push (vop);
4256 /* Function vect_create_epilog_for_reduction
4258 Create code at the loop-epilog to finalize the result of a reduction
4259 computation.
4261 VECT_DEFS is list of vector of partial results, i.e., the lhs's of vector
4262 reduction statements.
4263 STMT is the scalar reduction stmt that is being vectorized.
4264 NCOPIES is > 1 in case the vectorization factor (VF) is bigger than the
4265 number of elements that we can fit in a vectype (nunits). In this case
4266 we have to generate more than one vector stmt - i.e - we need to "unroll"
4267 the vector stmt by a factor VF/nunits. For more details see documentation
4268 in vectorizable_operation.
4269 REDUC_CODE is the tree-code for the epilog reduction.
4270 REDUCTION_PHIS is a list of the phi-nodes that carry the reduction
4271 computation.
4272 REDUC_INDEX is the index of the operand in the right hand side of the
4273 statement that is defined by REDUCTION_PHI.
4274 DOUBLE_REDUC is TRUE if double reduction phi nodes should be handled.
4275 SLP_NODE is an SLP node containing a group of reduction statements. The
4276 first one in this group is STMT.
4278 This function:
4279 1. Creates the reduction def-use cycles: sets the arguments for
4280 REDUCTION_PHIS:
4281 The loop-entry argument is the vectorized initial-value of the reduction.
4282 The loop-latch argument is taken from VECT_DEFS - the vector of partial
4283 sums.
4284 2. "Reduces" each vector of partial results VECT_DEFS into a single result,
4285 by applying the operation specified by REDUC_CODE if available, or by
4286 other means (whole-vector shifts or a scalar loop).
4287 The function also creates a new phi node at the loop exit to preserve
4288 loop-closed form, as illustrated below.
4290 The flow at the entry to this function:
4292 loop:
4293 vec_def = phi <null, null> # REDUCTION_PHI
4294 VECT_DEF = vector_stmt # vectorized form of STMT
4295 s_loop = scalar_stmt # (scalar) STMT
4296 loop_exit:
4297 s_out0 = phi <s_loop> # (scalar) EXIT_PHI
4298 use <s_out0>
4299 use <s_out0>
4301 The above is transformed by this function into:
4303 loop:
4304 vec_def = phi <vec_init, VECT_DEF> # REDUCTION_PHI
4305 VECT_DEF = vector_stmt # vectorized form of STMT
4306 s_loop = scalar_stmt # (scalar) STMT
4307 loop_exit:
4308 s_out0 = phi <s_loop> # (scalar) EXIT_PHI
4309 v_out1 = phi <VECT_DEF> # NEW_EXIT_PHI
4310 v_out2 = reduce <v_out1>
4311 s_out3 = extract_field <v_out2, 0>
4312 s_out4 = adjust_result <s_out3>
4313 use <s_out4>
4314 use <s_out4>
4317 static void
4318 vect_create_epilog_for_reduction (vec<tree> vect_defs, gimple *stmt,
4319 gimple *reduc_def_stmt,
4320 int ncopies, enum tree_code reduc_code,
4321 vec<gimple *> reduction_phis,
4322 bool double_reduc,
4323 slp_tree slp_node,
4324 slp_instance slp_node_instance)
4326 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
4327 stmt_vec_info prev_phi_info;
4328 tree vectype;
4329 machine_mode mode;
4330 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
4331 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo), *outer_loop = NULL;
4332 basic_block exit_bb;
4333 tree scalar_dest;
4334 tree scalar_type;
4335 gimple *new_phi = NULL, *phi;
4336 gimple_stmt_iterator exit_gsi;
4337 tree vec_dest;
4338 tree new_temp = NULL_TREE, new_dest, new_name, new_scalar_dest;
4339 gimple *epilog_stmt = NULL;
4340 enum tree_code code = gimple_assign_rhs_code (stmt);
4341 gimple *exit_phi;
4342 tree bitsize;
4343 tree adjustment_def = NULL;
4344 tree vec_initial_def = NULL;
4345 tree expr, def, initial_def = NULL;
4346 tree orig_name, scalar_result;
4347 imm_use_iterator imm_iter, phi_imm_iter;
4348 use_operand_p use_p, phi_use_p;
4349 gimple *use_stmt, *orig_stmt, *reduction_phi = NULL;
4350 bool nested_in_vect_loop = false;
4351 auto_vec<gimple *> new_phis;
4352 auto_vec<gimple *> inner_phis;
4353 enum vect_def_type dt = vect_unknown_def_type;
4354 int j, i;
4355 auto_vec<tree> scalar_results;
4356 unsigned int group_size = 1, k, ratio;
4357 auto_vec<tree> vec_initial_defs;
4358 auto_vec<gimple *> phis;
4359 bool slp_reduc = false;
4360 tree new_phi_result;
4361 gimple *inner_phi = NULL;
4362 tree induction_index = NULL_TREE;
4364 if (slp_node)
4365 group_size = SLP_TREE_SCALAR_STMTS (slp_node).length ();
4367 if (nested_in_vect_loop_p (loop, stmt))
4369 outer_loop = loop;
4370 loop = loop->inner;
4371 nested_in_vect_loop = true;
4372 gcc_assert (!slp_node);
4375 vectype = STMT_VINFO_VECTYPE (stmt_info);
4376 gcc_assert (vectype);
4377 mode = TYPE_MODE (vectype);
4379 /* 1. Create the reduction def-use cycle:
4380 Set the arguments of REDUCTION_PHIS, i.e., transform
4382 loop:
4383 vec_def = phi <null, null> # REDUCTION_PHI
4384 VECT_DEF = vector_stmt # vectorized form of STMT
4387 into:
4389 loop:
4390 vec_def = phi <vec_init, VECT_DEF> # REDUCTION_PHI
4391 VECT_DEF = vector_stmt # vectorized form of STMT
4394 (in case of SLP, do it for all the phis). */
4396 /* Get the loop-entry arguments. */
4397 enum vect_def_type initial_def_dt = vect_unknown_def_type;
4398 if (slp_node)
4400 unsigned vec_num = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node);
4401 vec_initial_defs.reserve (vec_num);
4402 get_initial_defs_for_reduction (slp_node_instance->reduc_phis,
4403 &vec_initial_defs, vec_num, code,
4404 GROUP_FIRST_ELEMENT (stmt_info));
4406 else
4408 /* Get at the scalar def before the loop, that defines the initial value
4409 of the reduction variable. */
4410 gimple *def_stmt;
4411 initial_def = PHI_ARG_DEF_FROM_EDGE (reduc_def_stmt,
4412 loop_preheader_edge (loop));
4413 vect_is_simple_use (initial_def, loop_vinfo, &def_stmt, &initial_def_dt);
4414 vec_initial_def = get_initial_def_for_reduction (stmt, initial_def,
4415 &adjustment_def);
4416 vec_initial_defs.create (1);
4417 vec_initial_defs.quick_push (vec_initial_def);
4420 /* Set phi nodes arguments. */
4421 FOR_EACH_VEC_ELT (reduction_phis, i, phi)
4423 tree vec_init_def = vec_initial_defs[i];
4424 tree def = vect_defs[i];
4425 for (j = 0; j < ncopies; j++)
4427 if (j != 0)
4429 phi = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (phi));
4430 if (nested_in_vect_loop)
4431 vec_init_def
4432 = vect_get_vec_def_for_stmt_copy (initial_def_dt,
4433 vec_init_def);
4436 /* Set the loop-entry arg of the reduction-phi. */
4438 if (STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info)
4439 == INTEGER_INDUC_COND_REDUCTION)
4441 /* Initialise the reduction phi to zero. This prevents non-zero
4442 initial values from interfering with the reduction op. */
4443 gcc_assert (ncopies == 1);
4444 gcc_assert (i == 0);
4446 tree vec_init_def_type = TREE_TYPE (vec_init_def);
4447 tree zero_vec = build_zero_cst (vec_init_def_type);
4449 add_phi_arg (as_a <gphi *> (phi), zero_vec,
4450 loop_preheader_edge (loop), UNKNOWN_LOCATION);
4452 else
4453 add_phi_arg (as_a <gphi *> (phi), vec_init_def,
4454 loop_preheader_edge (loop), UNKNOWN_LOCATION);
4456 /* Set the loop-latch arg for the reduction-phi. */
4457 if (j > 0)
4458 def = vect_get_vec_def_for_stmt_copy (vect_unknown_def_type, def);
4460 add_phi_arg (as_a <gphi *> (phi), def, loop_latch_edge (loop),
4461 UNKNOWN_LOCATION);
4463 if (dump_enabled_p ())
4465 dump_printf_loc (MSG_NOTE, vect_location,
4466 "transform reduction: created def-use cycle: ");
4467 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, phi, 0);
4468 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, SSA_NAME_DEF_STMT (def), 0);
4473 /* For cond reductions we want to create a new vector (INDEX_COND_EXPR)
4474 which is updated with the current index of the loop for every match of
4475 the original loop's cond_expr (VEC_STMT). This results in a vector
4476 containing the last time the condition passed for that vector lane.
4477 The first match will be a 1 to allow 0 to be used for non-matching
4478 indexes. If there are no matches at all then the vector will be all
4479 zeroes. */
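/* For example (an illustrative run with four lanes): the index vector
   starts as {0, 0, 0, 0}; if lane 1 matches in the first vector
   iteration (induction value 2) and lane 2 matches in the second
   (induction value 7), with no later matches, the final vector is
   {0, 2, 7, 0}, i.e. zero in the lanes that never matched.  */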
4480 if (STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info) == COND_REDUCTION)
4482 tree indx_before_incr, indx_after_incr;
4483 int nunits_out = TYPE_VECTOR_SUBPARTS (vectype);
4484 int k;
4486 gimple *vec_stmt = STMT_VINFO_VEC_STMT (stmt_info);
4487 gcc_assert (gimple_assign_rhs_code (vec_stmt) == VEC_COND_EXPR);
4489 int scalar_precision
4490 = GET_MODE_PRECISION (TYPE_MODE (TREE_TYPE (vectype)));
4491 tree cr_index_scalar_type = make_unsigned_type (scalar_precision);
4492 tree cr_index_vector_type = build_vector_type
4493 (cr_index_scalar_type, TYPE_VECTOR_SUBPARTS (vectype));
4495 /* First we create a simple vector induction variable which starts
4496 with the values {1,2,3,...} (SERIES_VECT) and increments by the
4497 vector size (STEP). */
4499 /* Create a {1,2,3,...} vector. */
4500 auto_vec<tree, 32> vtemp (nunits_out);
4501 for (k = 0; k < nunits_out; ++k)
4502 vtemp.quick_push (build_int_cst (cr_index_scalar_type, k + 1));
4503 tree series_vect = build_vector (cr_index_vector_type, vtemp);
4505 /* Create a vector of the step value. */
4506 tree step = build_int_cst (cr_index_scalar_type, nunits_out);
4507 tree vec_step = build_vector_from_val (cr_index_vector_type, step);
4509 /* Create an induction variable. */
4510 gimple_stmt_iterator incr_gsi;
4511 bool insert_after;
4512 standard_iv_increment_position (loop, &incr_gsi, &insert_after);
4513 create_iv (series_vect, vec_step, NULL_TREE, loop, &incr_gsi,
4514 insert_after, &indx_before_incr, &indx_after_incr);
4516 /* Next create a new phi node vector (NEW_PHI_TREE) which starts
4517 filled with zeros (VEC_ZERO). */
4519 /* Create a vector of 0s. */
4520 tree zero = build_zero_cst (cr_index_scalar_type);
4521 tree vec_zero = build_vector_from_val (cr_index_vector_type, zero);
4523 /* Create a vector phi node. */
4524 tree new_phi_tree = make_ssa_name (cr_index_vector_type);
4525 new_phi = create_phi_node (new_phi_tree, loop->header);
4526 set_vinfo_for_stmt (new_phi,
4527 new_stmt_vec_info (new_phi, loop_vinfo));
4528 add_phi_arg (as_a <gphi *> (new_phi), vec_zero,
4529 loop_preheader_edge (loop), UNKNOWN_LOCATION);
4531 /* Now take the condition from the loop's original cond_expr
4532 (VEC_STMT) and produce a new cond_expr (INDEX_COND_EXPR) which for
4533 every match uses values from the induction variable
4534 (INDEX_BEFORE_INCR) otherwise uses values from the phi node
4535 (NEW_PHI_TREE).
4536 Finally, we update the phi (NEW_PHI_TREE) to take the value of
4537 the new cond_expr (INDEX_COND_EXPR). */
4539 /* Duplicate the condition from vec_stmt. */
4540 tree ccompare = unshare_expr (gimple_assign_rhs1 (vec_stmt));
4542 /* Create a conditional, where the condition is taken from vec_stmt
4543 (CCOMPARE), the then-value is the induction index (INDEX_BEFORE_INCR)
4544 and the else-value is the phi (NEW_PHI_TREE). */
4545 tree index_cond_expr = build3 (VEC_COND_EXPR, cr_index_vector_type,
4546 ccompare, indx_before_incr,
4547 new_phi_tree);
4548 induction_index = make_ssa_name (cr_index_vector_type);
4549 gimple *index_condition = gimple_build_assign (induction_index,
4550 index_cond_expr);
4551 gsi_insert_before (&incr_gsi, index_condition, GSI_SAME_STMT);
4552 stmt_vec_info index_vec_info = new_stmt_vec_info (index_condition,
4553 loop_vinfo);
4554 STMT_VINFO_VECTYPE (index_vec_info) = cr_index_vector_type;
4555 set_vinfo_for_stmt (index_condition, index_vec_info);
4557 /* Update the phi with the vec cond. */
4558 add_phi_arg (as_a <gphi *> (new_phi), induction_index,
4559 loop_latch_edge (loop), UNKNOWN_LOCATION);
4562 /* 2. Create epilog code.
4563 The reduction epilog code operates across the elements of the vector
4564 of partial results computed by the vectorized loop.
4565 The reduction epilog code consists of:
4567 step 1: compute the scalar result in a vector (v_out2)
4568 step 2: extract the scalar result (s_out3) from the vector (v_out2)
4569 step 3: adjust the scalar result (s_out3) if needed.
4571 Step 1 can be accomplished using one of the following three schemes:
4572 (scheme 1) using reduc_code, if available.
4573 (scheme 2) using whole-vector shifts, if available.
4574 (scheme 3) using a scalar loop. In this case steps 1+2 above are
4575 combined.
4577 The overall epilog code looks like this:
4579 s_out0 = phi <s_loop> # original EXIT_PHI
4580 v_out1 = phi <VECT_DEF> # NEW_EXIT_PHI
4581 v_out2 = reduce <v_out1> # step 1
4582 s_out3 = extract_field <v_out2, 0> # step 2
4583 s_out4 = adjust_result <s_out3> # step 3
4585 (step 3 is optional, and steps 1 and 2 may be combined).
4586 Lastly, the uses of s_out0 are replaced by s_out4. */
4589 /* 2.1 Create new loop-exit-phis to preserve loop-closed form:
4590 v_out1 = phi <VECT_DEF>
4591 Store them in NEW_PHIS. */
4593 exit_bb = single_exit (loop)->dest;
4594 prev_phi_info = NULL;
4595 new_phis.create (vect_defs.length ());
4596 FOR_EACH_VEC_ELT (vect_defs, i, def)
4598 for (j = 0; j < ncopies; j++)
4600 tree new_def = copy_ssa_name (def);
4601 phi = create_phi_node (new_def, exit_bb);
4602 set_vinfo_for_stmt (phi, new_stmt_vec_info (phi, loop_vinfo));
4603 if (j == 0)
4604 new_phis.quick_push (phi);
4605 else
4607 def = vect_get_vec_def_for_stmt_copy (dt, def);
4608 STMT_VINFO_RELATED_STMT (prev_phi_info) = phi;
4611 SET_PHI_ARG_DEF (phi, single_exit (loop)->dest_idx, def);
4612 prev_phi_info = vinfo_for_stmt (phi);
4616 /* The epilogue is created for the outer-loop, i.e., for the loop being
4617 vectorized. Create exit phis for the outer loop. */
4618 if (double_reduc)
4620 loop = outer_loop;
4621 exit_bb = single_exit (loop)->dest;
4622 inner_phis.create (vect_defs.length ());
4623 FOR_EACH_VEC_ELT (new_phis, i, phi)
4625 tree new_result = copy_ssa_name (PHI_RESULT (phi));
4626 gphi *outer_phi = create_phi_node (new_result, exit_bb);
4627 SET_PHI_ARG_DEF (outer_phi, single_exit (loop)->dest_idx,
4628 PHI_RESULT (phi));
4629 set_vinfo_for_stmt (outer_phi, new_stmt_vec_info (outer_phi,
4630 loop_vinfo));
4631 inner_phis.quick_push (phi);
4632 new_phis[i] = outer_phi;
4633 prev_phi_info = vinfo_for_stmt (outer_phi);
4634 while (STMT_VINFO_RELATED_STMT (vinfo_for_stmt (phi)))
4636 phi = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (phi));
4637 new_result = copy_ssa_name (PHI_RESULT (phi));
4638 outer_phi = create_phi_node (new_result, exit_bb);
4639 SET_PHI_ARG_DEF (outer_phi, single_exit (loop)->dest_idx,
4640 PHI_RESULT (phi));
4641 set_vinfo_for_stmt (outer_phi, new_stmt_vec_info (outer_phi,
4642 loop_vinfo));
4643 STMT_VINFO_RELATED_STMT (prev_phi_info) = outer_phi;
4644 prev_phi_info = vinfo_for_stmt (outer_phi);
4649 exit_gsi = gsi_after_labels (exit_bb);
4651 /* 2.2 Get the relevant tree-code to use in the epilog for schemes 2,3
4652 (i.e. when reduc_code is not available) and in the final adjustment
4653 code (if needed). Also get the original scalar reduction variable as
4654 defined in the loop. In case STMT is a "pattern-stmt" (i.e. - it
4655 represents a reduction pattern), the tree-code and scalar-def are
4656 taken from the original stmt that the pattern-stmt (STMT) replaces.
4657 Otherwise (it is a regular reduction) - the tree-code and scalar-def
4658 are taken from STMT. */
4660 orig_stmt = STMT_VINFO_RELATED_STMT (stmt_info);
4661 if (!orig_stmt)
4663 /* Regular reduction */
4664 orig_stmt = stmt;
4666 else
4668 /* Reduction pattern */
4669 stmt_vec_info stmt_vinfo = vinfo_for_stmt (orig_stmt);
4670 gcc_assert (STMT_VINFO_IN_PATTERN_P (stmt_vinfo));
4671 gcc_assert (STMT_VINFO_RELATED_STMT (stmt_vinfo) == stmt);
4674 code = gimple_assign_rhs_code (orig_stmt);
4675 /* For MINUS_EXPR the initial vector is [init_val,0,...,0], therefore,
4676 partial results are added and not subtracted. */
4677 if (code == MINUS_EXPR)
4678 code = PLUS_EXPR;
4680 scalar_dest = gimple_assign_lhs (orig_stmt);
4681 scalar_type = TREE_TYPE (scalar_dest);
4682 scalar_results.create (group_size);
4683 new_scalar_dest = vect_create_destination_var (scalar_dest, NULL);
4684 bitsize = TYPE_SIZE (scalar_type);
4686 /* In case this is a reduction in an inner-loop while vectorizing an outer
4687 loop - we don't need to extract a single scalar result at the end of the
4688 inner-loop (unless it is double reduction, i.e., the use of reduction is
4689 outside the outer-loop). The final vector of partial results will be used
4690 in the vectorized outer-loop, or reduced to a scalar result at the end of
4691 the outer-loop. */
4692 if (nested_in_vect_loop && !double_reduc)
4693 goto vect_finalize_reduction;
4695 /* SLP reduction without reduction chain, e.g.,
4696 # a1 = phi <a2, a0>
4697 # b1 = phi <b2, b0>
4698 a2 = operation (a1)
4699 b2 = operation (b1) */
4700 slp_reduc = (slp_node && !GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)));
4702 /* In case of reduction chain, e.g.,
4703 # a1 = phi <a3, a0>
4704 a2 = operation (a1)
4705 a3 = operation (a2),
4707 we may end up with more than one vector result. Here we reduce them to
4708 one vector. */
4709 if (GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)))
4711 tree first_vect = PHI_RESULT (new_phis[0]);
4712 gassign *new_vec_stmt = NULL;
4713 vec_dest = vect_create_destination_var (scalar_dest, vectype);
4714 for (k = 1; k < new_phis.length (); k++)
4716 gimple *next_phi = new_phis[k];
4717 tree second_vect = PHI_RESULT (next_phi);
4718 tree tem = make_ssa_name (vec_dest, new_vec_stmt);
4719 new_vec_stmt = gimple_build_assign (tem, code,
4720 first_vect, second_vect);
4721 gsi_insert_before (&exit_gsi, new_vec_stmt, GSI_SAME_STMT);
4722 first_vect = tem;
4725 new_phi_result = first_vect;
4726 if (new_vec_stmt)
4728 new_phis.truncate (0);
4729 new_phis.safe_push (new_vec_stmt);
4732 /* Likewise if we couldn't use a single def-use cycle. */
4733 else if (ncopies > 1)
4735 gcc_assert (new_phis.length () == 1);
4736 tree first_vect = PHI_RESULT (new_phis[0]);
4737 gassign *new_vec_stmt = NULL;
4738 vec_dest = vect_create_destination_var (scalar_dest, vectype);
4739 gimple *next_phi = new_phis[0];
4740 for (int k = 1; k < ncopies; ++k)
4742 next_phi = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (next_phi));
4743 tree second_vect = PHI_RESULT (next_phi);
4744 tree tem = make_ssa_name (vec_dest, new_vec_stmt);
4745 new_vec_stmt = gimple_build_assign (tem, code,
4746 first_vect, second_vect);
4747 gsi_insert_before (&exit_gsi, new_vec_stmt, GSI_SAME_STMT);
4748 first_vect = tem;
4750 new_phi_result = first_vect;
4751 new_phis.truncate (0);
4752 new_phis.safe_push (new_vec_stmt);
4754 else
4755 new_phi_result = PHI_RESULT (new_phis[0]);
4757 if (STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info) == COND_REDUCTION
4758 && reduc_code != ERROR_MARK)
4760 /* For condition reductions, we have a vector (NEW_PHI_RESULT) containing
4761 various data values where the condition matched and another vector
4762 (INDUCTION_INDEX) containing all the indexes of those matches. We
4763 need to extract the last matching index (which will be the index with
4764 highest value) and use this to index into the data vector.
4765 For the case where there were no matches, the data vector will contain
4766 all default values and the index vector will be all zeros. */
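/* For example (illustrative values): with NEW_PHI_RESULT = {d0, d1, d2, d3}
   and INDUCTION_INDEX = {0, 2, 7, 0}, the max index is 7, the comparison
   selects lane 2 only, the VEC_COND below yields {0, 0, d2, 0}, and the
   unsigned MAX reduction finally extracts d2.  */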
4768 /* Get various versions of the type of the vector of indexes. */
4769 tree index_vec_type = TREE_TYPE (induction_index);
4770 gcc_checking_assert (TYPE_UNSIGNED (index_vec_type));
4771 tree index_scalar_type = TREE_TYPE (index_vec_type);
4772 tree index_vec_cmp_type = build_same_sized_truth_vector_type
4773 (index_vec_type);
4775 /* Get an unsigned integer version of the type of the data vector. */
4776 int scalar_precision
4777 = GET_MODE_PRECISION (SCALAR_TYPE_MODE (scalar_type));
4778 tree scalar_type_unsigned = make_unsigned_type (scalar_precision);
4779 tree vectype_unsigned = build_vector_type
4780 (scalar_type_unsigned, TYPE_VECTOR_SUBPARTS (vectype));
4782 /* First we need to create a vector (ZERO_VEC) of zeros and another
4783 vector (MAX_INDEX_VEC) filled with the last matching index, which we
4784 can create using a MAX reduction and then expanding.
4785 In the case where the loop never made any matches, the max index will
4786 be zero. */
4788 /* Vector of {0, 0, 0,...}. */
4789 tree zero_vec = make_ssa_name (vectype);
4790 tree zero_vec_rhs = build_zero_cst (vectype);
4791 gimple *zero_vec_stmt = gimple_build_assign (zero_vec, zero_vec_rhs);
4792 gsi_insert_before (&exit_gsi, zero_vec_stmt, GSI_SAME_STMT);
4794 /* Find maximum value from the vector of found indexes. */
4795 tree max_index = make_ssa_name (index_scalar_type);
4796 gimple *max_index_stmt = gimple_build_assign (max_index, REDUC_MAX_EXPR,
4797 induction_index);
4798 gsi_insert_before (&exit_gsi, max_index_stmt, GSI_SAME_STMT);
4800 /* Vector of {max_index, max_index, max_index,...}. */
4801 tree max_index_vec = make_ssa_name (index_vec_type);
4802 tree max_index_vec_rhs = build_vector_from_val (index_vec_type,
4803 max_index);
4804 gimple *max_index_vec_stmt = gimple_build_assign (max_index_vec,
4805 max_index_vec_rhs);
4806 gsi_insert_before (&exit_gsi, max_index_vec_stmt, GSI_SAME_STMT);
4808 /* Next we compare the new vector (MAX_INDEX_VEC) full of max indexes
4809 with the vector (INDUCTION_INDEX) of found indexes, choosing values
4810 from the data vector (NEW_PHI_RESULT) for matches, 0 (ZERO_VEC)
4811 otherwise. Only one value should match, resulting in a vector
4812 (VEC_COND) with one data value and the rest zeros.
4813 In the case where the loop never made any matches, every index will
4814 match, resulting in a vector with all data values (which will all be
4815 the default value). */
4817 /* Compare the max index vector to the vector of found indexes to find
4818 the position of the max value. */
4819 tree vec_compare = make_ssa_name (index_vec_cmp_type);
4820 gimple *vec_compare_stmt = gimple_build_assign (vec_compare, EQ_EXPR,
4821 induction_index,
4822 max_index_vec);
4823 gsi_insert_before (&exit_gsi, vec_compare_stmt, GSI_SAME_STMT);
4825 /* Use the compare to choose either values from the data vector or
4826 zero. */
4827 tree vec_cond = make_ssa_name (vectype);
4828 gimple *vec_cond_stmt = gimple_build_assign (vec_cond, VEC_COND_EXPR,
4829 vec_compare, new_phi_result,
4830 zero_vec);
4831 gsi_insert_before (&exit_gsi, vec_cond_stmt, GSI_SAME_STMT);
4833 /* Finally we need to extract the data value from the vector (VEC_COND)
4834 into a scalar (MATCHED_DATA_REDUC). Logically we want to do an OR
4835 reduction, but because this doesn't exist, we can use a MAX reduction
4836 instead. The data value might be signed or a float so we need to cast
4837 it first.
4838 In the case where the loop never made any matches, the data values are
4839 all identical, and so will reduce down correctly. */
4841 /* Make the matched data values unsigned. */
4842 tree vec_cond_cast = make_ssa_name (vectype_unsigned);
4843 tree vec_cond_cast_rhs = build1 (VIEW_CONVERT_EXPR, vectype_unsigned,
4844 vec_cond);
4845 gimple *vec_cond_cast_stmt = gimple_build_assign (vec_cond_cast,
4846 VIEW_CONVERT_EXPR,
4847 vec_cond_cast_rhs);
4848 gsi_insert_before (&exit_gsi, vec_cond_cast_stmt, GSI_SAME_STMT);
4850 /* Reduce down to a scalar value. */
4851 tree data_reduc = make_ssa_name (scalar_type_unsigned);
4852 optab ot = optab_for_tree_code (REDUC_MAX_EXPR, vectype_unsigned,
4853 optab_default);
4854 gcc_assert (optab_handler (ot, TYPE_MODE (vectype_unsigned))
4855 != CODE_FOR_nothing);
4856 gimple *data_reduc_stmt = gimple_build_assign (data_reduc,
4857 REDUC_MAX_EXPR,
4858 vec_cond_cast);
4859 gsi_insert_before (&exit_gsi, data_reduc_stmt, GSI_SAME_STMT);
4861 /* Convert the reduced value back to the result type and set as the
4862 result. */
4863 gimple_seq stmts = NULL;
4864 new_temp = gimple_build (&stmts, VIEW_CONVERT_EXPR, scalar_type,
4865 data_reduc);
4866 gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
4867 scalar_results.safe_push (new_temp);
4869 else if (STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info) == COND_REDUCTION
4870 && reduc_code == ERROR_MARK)
4872 /* Condition reduction without a supported REDUC_MAX_EXPR. Generate
4873 idx = 0;
4874 idx_val = induction_index[0];
4875 val = data_reduc[0];
4876 for (idx = 0, val = init, i = 0; i < nelts; ++i)
4877 if (induction_index[i] > idx_val)
4878 val = data_reduc[i], idx_val = induction_index[i];
4879 return val; */
4881 tree data_eltype = TREE_TYPE (TREE_TYPE (new_phi_result));
4882 tree idx_eltype = TREE_TYPE (TREE_TYPE (induction_index));
4883 unsigned HOST_WIDE_INT el_size = tree_to_uhwi (TYPE_SIZE (idx_eltype));
4884 unsigned HOST_WIDE_INT v_size
4885 = el_size * TYPE_VECTOR_SUBPARTS (TREE_TYPE (induction_index));
4886 tree idx_val = NULL_TREE, val = NULL_TREE;
4887 for (unsigned HOST_WIDE_INT off = 0; off < v_size; off += el_size)
4889 tree old_idx_val = idx_val;
4890 tree old_val = val;
4891 idx_val = make_ssa_name (idx_eltype);
4892 epilog_stmt = gimple_build_assign (idx_val, BIT_FIELD_REF,
4893 build3 (BIT_FIELD_REF, idx_eltype,
4894 induction_index,
4895 bitsize_int (el_size),
4896 bitsize_int (off)));
4897 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
4898 val = make_ssa_name (data_eltype);
4899 epilog_stmt = gimple_build_assign (val, BIT_FIELD_REF,
4900 build3 (BIT_FIELD_REF,
4901 data_eltype,
4902 new_phi_result,
4903 bitsize_int (el_size),
4904 bitsize_int (off)));
4905 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
4906 if (off != 0)
4908 tree new_idx_val = idx_val;
4909 tree new_val = val;
4910 if (off != v_size - el_size)
4912 new_idx_val = make_ssa_name (idx_eltype);
4913 epilog_stmt = gimple_build_assign (new_idx_val,
4914 MAX_EXPR, idx_val,
4915 old_idx_val);
4916 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
4918 new_val = make_ssa_name (data_eltype);
4919 epilog_stmt = gimple_build_assign (new_val,
4920 COND_EXPR,
4921 build2 (GT_EXPR,
4922 boolean_type_node,
4923 idx_val,
4924 old_idx_val),
4925 val, old_val);
4926 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
4927 idx_val = new_idx_val;
4928 val = new_val;
4931 /* Convert the reduced value back to the result type and set as the
4932 result. */
4933 gimple_seq stmts = NULL;
4934 val = gimple_convert (&stmts, scalar_type, val);
4935 gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
4936 scalar_results.safe_push (val);
4939 /* 2.3 Create the reduction code, using one of the three schemes described
4940 above. In SLP we simply need to extract all the elements from the
4941 vector (without reducing them), so we use scalar shifts. */
4942 else if (reduc_code != ERROR_MARK && !slp_reduc)
4944 tree tmp;
4945 tree vec_elem_type;
4947 /* Case 1: Create:
4948 v_out2 = reduc_expr <v_out1> */
4950 if (dump_enabled_p ())
4951 dump_printf_loc (MSG_NOTE, vect_location,
4952 "Reduce using direct vector reduction.\n");
4954 vec_elem_type = TREE_TYPE (TREE_TYPE (new_phi_result));
4955 if (!useless_type_conversion_p (scalar_type, vec_elem_type))
4957 tree tmp_dest =
4958 vect_create_destination_var (scalar_dest, vec_elem_type);
4959 tmp = build1 (reduc_code, vec_elem_type, new_phi_result);
4960 epilog_stmt = gimple_build_assign (tmp_dest, tmp);
4961 new_temp = make_ssa_name (tmp_dest, epilog_stmt);
4962 gimple_assign_set_lhs (epilog_stmt, new_temp);
4963 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
4965 tmp = build1 (NOP_EXPR, scalar_type, new_temp);
4967 else
4968 tmp = build1 (reduc_code, scalar_type, new_phi_result);
4970 epilog_stmt = gimple_build_assign (new_scalar_dest, tmp);
4971 new_temp = make_ssa_name (new_scalar_dest, epilog_stmt);
4972 gimple_assign_set_lhs (epilog_stmt, new_temp);
4973 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
4975 if (STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info)
4976 == INTEGER_INDUC_COND_REDUCTION)
4978 /* Earlier we set the initial value to be zero. Check the result
4979 and if it is zero then replace with the original initial
4980 value. */
4981 tree zero = build_zero_cst (scalar_type);
4982 tree zcompare = build2 (EQ_EXPR, boolean_type_node, new_temp, zero);
4984 tmp = make_ssa_name (new_scalar_dest);
4985 epilog_stmt = gimple_build_assign (tmp, COND_EXPR, zcompare,
4986 initial_def, new_temp);
4987 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
4988 new_temp = tmp;
4991 scalar_results.safe_push (new_temp);
4993 else
4995 bool reduce_with_shift = have_whole_vector_shift (mode);
4996 int element_bitsize = tree_to_uhwi (bitsize);
4997 int vec_size_in_bits = tree_to_uhwi (TYPE_SIZE (vectype));
4998 tree vec_temp;
5000 /* COND reductions all do the final reduction with MAX_EXPR. */
5001 if (code == COND_EXPR)
5002 code = MAX_EXPR;
5004 /* Regardless of whether we have a whole vector shift, if we're
5005 emulating the operation via tree-vect-generic, we don't want
5006 to use it. Only the first round of the reduction is likely
5007 to still be profitable via emulation. */
5008 /* ??? It might be better to emit a reduction tree code here, so that
5009 tree-vect-generic can expand the first round via bit tricks. */
5010 if (!VECTOR_MODE_P (mode))
5011 reduce_with_shift = false;
5012 else
5014 optab optab = optab_for_tree_code (code, vectype, optab_default);
5015 if (optab_handler (optab, mode) == CODE_FOR_nothing)
5016 reduce_with_shift = false;
5019 if (reduce_with_shift && !slp_reduc)
5021 int nelements = vec_size_in_bits / element_bitsize;
5022 auto_vec_perm_indices sel (nelements);
5024 int elt_offset;
5026 tree zero_vec = build_zero_cst (vectype);
5027 /* Case 2: Create:
5028 for (offset = nelements/2; offset >= 1; offset/=2)
5030 Create: va' = vec_shift <va, offset>
5031 Create: va = vop <va, va'>
5032 } */
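/* For example, with eight elements the loop below performs three rounds
   (element offsets 4, 2 and 1), each consisting of one VEC_PERM_EXPR and
   one reduction operation, before the single scalar extraction.  */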
5034 tree rhs;
5036 if (dump_enabled_p ())
5037 dump_printf_loc (MSG_NOTE, vect_location,
5038 "Reduce using vector shifts\n");
5040 vec_dest = vect_create_destination_var (scalar_dest, vectype);
5041 new_temp = new_phi_result;
5042 for (elt_offset = nelements / 2;
5043 elt_offset >= 1;
5044 elt_offset /= 2)
5046 sel.truncate (0);
5047 calc_vec_perm_mask_for_shift (elt_offset, nelements, &sel);
5048 tree mask = vect_gen_perm_mask_any (vectype, sel);
5049 epilog_stmt = gimple_build_assign (vec_dest, VEC_PERM_EXPR,
5050 new_temp, zero_vec, mask);
5051 new_name = make_ssa_name (vec_dest, epilog_stmt);
5052 gimple_assign_set_lhs (epilog_stmt, new_name);
5053 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5055 epilog_stmt = gimple_build_assign (vec_dest, code, new_name,
5056 new_temp);
5057 new_temp = make_ssa_name (vec_dest, epilog_stmt);
5058 gimple_assign_set_lhs (epilog_stmt, new_temp);
5059 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5062 /* 2.4 Extract the final scalar result. Create:
5063 s_out3 = extract_field <v_out2, bitpos> */
5065 if (dump_enabled_p ())
5066 dump_printf_loc (MSG_NOTE, vect_location,
5067 "extract scalar result\n");
5069 rhs = build3 (BIT_FIELD_REF, scalar_type, new_temp,
5070 bitsize, bitsize_zero_node);
5071 epilog_stmt = gimple_build_assign (new_scalar_dest, rhs);
5072 new_temp = make_ssa_name (new_scalar_dest, epilog_stmt);
5073 gimple_assign_set_lhs (epilog_stmt, new_temp);
5074 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5075 scalar_results.safe_push (new_temp);
5077 else
5079 /* Case 3: Create:
5080 s = extract_field <v_out2, 0>
5081 for (offset = element_size;
5082 offset < vector_size;
5083 offset += element_size;)
5085 Create: s' = extract_field <v_out2, offset>
5086 Create: s = op <s, s'> // For non SLP cases
5087 } */
5089 if (dump_enabled_p ())
5090 dump_printf_loc (MSG_NOTE, vect_location,
5091 "Reduce using scalar code.\n");
5093 vec_size_in_bits = tree_to_uhwi (TYPE_SIZE (vectype));
5094 FOR_EACH_VEC_ELT (new_phis, i, new_phi)
5096 int bit_offset;
5097 if (gimple_code (new_phi) == GIMPLE_PHI)
5098 vec_temp = PHI_RESULT (new_phi);
5099 else
5100 vec_temp = gimple_assign_lhs (new_phi);
5101 tree rhs = build3 (BIT_FIELD_REF, scalar_type, vec_temp, bitsize,
5102 bitsize_zero_node);
5103 epilog_stmt = gimple_build_assign (new_scalar_dest, rhs);
5104 new_temp = make_ssa_name (new_scalar_dest, epilog_stmt);
5105 gimple_assign_set_lhs (epilog_stmt, new_temp);
5106 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5108 /* In SLP we don't need to apply the reduction operation, so we just
5109 collect the s' values in SCALAR_RESULTS. */
5110 if (slp_reduc)
5111 scalar_results.safe_push (new_temp);
5113 for (bit_offset = element_bitsize;
5114 bit_offset < vec_size_in_bits;
5115 bit_offset += element_bitsize)
5117 tree bitpos = bitsize_int (bit_offset);
5118 tree rhs = build3 (BIT_FIELD_REF, scalar_type, vec_temp,
5119 bitsize, bitpos);
5121 epilog_stmt = gimple_build_assign (new_scalar_dest, rhs);
5122 new_name = make_ssa_name (new_scalar_dest, epilog_stmt);
5123 gimple_assign_set_lhs (epilog_stmt, new_name);
5124 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5126 if (slp_reduc)
5128 /* In SLP we don't need to apply the reduction operation, so
5129 we just collect the s' values in SCALAR_RESULTS. */
5130 new_temp = new_name;
5131 scalar_results.safe_push (new_name);
5133 else
5135 epilog_stmt = gimple_build_assign (new_scalar_dest, code,
5136 new_name, new_temp);
5137 new_temp = make_ssa_name (new_scalar_dest, epilog_stmt);
5138 gimple_assign_set_lhs (epilog_stmt, new_temp);
5139 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5144 /* The only case where we need to reduce scalar results in SLP is
5145 unrolling. If the size of SCALAR_RESULTS is greater than
5146 GROUP_SIZE, we reduce them by combining elements modulo
5147 GROUP_SIZE. */
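/* For example (hypothetical sizes): with GROUP_SIZE == 2 and four entries
   in SCALAR_RESULTS, the loop below folds entry 2 into entry 0 and entry 3
   into entry 1, leaving one scalar result per group member.  */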
5148 if (slp_reduc)
5150 tree res, first_res, new_res;
5151 gimple *new_stmt;
5153 /* Reduce multiple scalar results in case of SLP unrolling. */
5154 for (j = group_size; scalar_results.iterate (j, &res);
5155 j++)
5157 first_res = scalar_results[j % group_size];
5158 new_stmt = gimple_build_assign (new_scalar_dest, code,
5159 first_res, res);
5160 new_res = make_ssa_name (new_scalar_dest, new_stmt);
5161 gimple_assign_set_lhs (new_stmt, new_res);
5162 gsi_insert_before (&exit_gsi, new_stmt, GSI_SAME_STMT);
5163 scalar_results[j % group_size] = new_res;
5166 else
5167 /* Not SLP - we have one scalar to keep in SCALAR_RESULTS. */
5168 scalar_results.safe_push (new_temp);
5171 if (STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info)
5172 == INTEGER_INDUC_COND_REDUCTION)
5174 /* Earlier we set the initial value to be zero. Check the result
5175 and if it is zero then replace with the original initial
5176 value. */
5177 tree zero = build_zero_cst (scalar_type);
5178 tree zcompare = build2 (EQ_EXPR, boolean_type_node, new_temp, zero);
5180 tree tmp = make_ssa_name (new_scalar_dest);
5181 epilog_stmt = gimple_build_assign (tmp, COND_EXPR, zcompare,
5182 initial_def, new_temp);
5183 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5184 scalar_results[0] = tmp;
5188 vect_finalize_reduction:
5190 if (double_reduc)
5191 loop = loop->inner;
5193 /* 2.5 Adjust the final result by the initial value of the reduction
5194 variable. (When such adjustment is not needed, then
5195 'adjustment_def' is zero). For example, if code is PLUS we create:
5196 new_temp = loop_exit_def + adjustment_def */
5198 if (adjustment_def)
5200 gcc_assert (!slp_reduc);
5201 if (nested_in_vect_loop)
5203 new_phi = new_phis[0];
5204 gcc_assert (TREE_CODE (TREE_TYPE (adjustment_def)) == VECTOR_TYPE);
5205 expr = build2 (code, vectype, PHI_RESULT (new_phi), adjustment_def);
5206 new_dest = vect_create_destination_var (scalar_dest, vectype);
5208 else
5210 new_temp = scalar_results[0];
5211 gcc_assert (TREE_CODE (TREE_TYPE (adjustment_def)) != VECTOR_TYPE);
5212 expr = build2 (code, scalar_type, new_temp, adjustment_def);
5213 new_dest = vect_create_destination_var (scalar_dest, scalar_type);
5216 epilog_stmt = gimple_build_assign (new_dest, expr);
5217 new_temp = make_ssa_name (new_dest, epilog_stmt);
5218 gimple_assign_set_lhs (epilog_stmt, new_temp);
5219 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5220 if (nested_in_vect_loop)
5222 set_vinfo_for_stmt (epilog_stmt,
5223 new_stmt_vec_info (epilog_stmt, loop_vinfo));
5224 STMT_VINFO_RELATED_STMT (vinfo_for_stmt (epilog_stmt)) =
5225 STMT_VINFO_RELATED_STMT (vinfo_for_stmt (new_phi));
5227 if (!double_reduc)
5228 scalar_results.quick_push (new_temp);
5229 else
5230 scalar_results[0] = new_temp;
5232 else
5233 scalar_results[0] = new_temp;
5235 new_phis[0] = epilog_stmt;
5238 /* 2.6 Handle the loop-exit phis. Replace the uses of scalar loop-exit
5239 phis with new adjusted scalar results, i.e., replace use <s_out0>
5240 with use <s_out4>.
5242 Transform:
5243 loop_exit:
5244 s_out0 = phi <s_loop> # (scalar) EXIT_PHI
5245 v_out1 = phi <VECT_DEF> # NEW_EXIT_PHI
5246 v_out2 = reduce <v_out1>
5247 s_out3 = extract_field <v_out2, 0>
5248 s_out4 = adjust_result <s_out3>
5249 use <s_out0>
5250 use <s_out0>
5252 into:
5254 loop_exit:
5255 s_out0 = phi <s_loop> # (scalar) EXIT_PHI
5256 v_out1 = phi <VECT_DEF> # NEW_EXIT_PHI
5257 v_out2 = reduce <v_out1>
5258 s_out3 = extract_field <v_out2, 0>
5259 s_out4 = adjust_result <s_out3>
5260 use <s_out4>
5261 use <s_out4> */
5264 /* In SLP reduction chain we reduce vector results into one vector if
5265 necessary, hence we set here GROUP_SIZE to 1. SCALAR_DEST is the LHS of
5266 the last stmt in the reduction chain, since we are looking for the loop
5267 exit phi node. */
5268 if (GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)))
5270 gimple *dest_stmt = SLP_TREE_SCALAR_STMTS (slp_node)[group_size - 1];
5271 /* Handle reduction patterns. */
5272 if (STMT_VINFO_RELATED_STMT (vinfo_for_stmt (dest_stmt)))
5273 dest_stmt = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (dest_stmt));
5275 scalar_dest = gimple_assign_lhs (dest_stmt);
5276 group_size = 1;
5279 /* In SLP we may have several statements in NEW_PHIS and REDUCTION_PHIS (in
5280 case that GROUP_SIZE is greater than vectorization factor). Therefore, we
5281 need to match SCALAR_RESULTS with corresponding statements. The first
5282 (GROUP_SIZE / number of new vector stmts) scalar results correspond to
5283 the first vector stmt, etc.
5284 (RATIO is equal to (GROUP_SIZE / number of new vector stmts)). */
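/* For example (hypothetical sizes): with GROUP_SIZE == 4 and two entries
   in NEW_PHIS, RATIO is 2, so scalar results 0 and 1 are matched with the
   first vector stmt and results 2 and 3 with the second.  */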
5285 if (group_size > new_phis.length ())
5287 ratio = group_size / new_phis.length ();
5288 gcc_assert (!(group_size % new_phis.length ()));
5290 else
5291 ratio = 1;
5293 for (k = 0; k < group_size; k++)
5295 if (k % ratio == 0)
5297 epilog_stmt = new_phis[k / ratio];
5298 reduction_phi = reduction_phis[k / ratio];
5299 if (double_reduc)
5300 inner_phi = inner_phis[k / ratio];
5303 if (slp_reduc)
5305 gimple *current_stmt = SLP_TREE_SCALAR_STMTS (slp_node)[k];
5307 orig_stmt = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (current_stmt));
5308 /* SLP statements can't participate in patterns. */
5309 gcc_assert (!orig_stmt);
5310 scalar_dest = gimple_assign_lhs (current_stmt);
5313 phis.create (3);
5314 /* Find the loop-closed-use at the loop exit of the original scalar
5315 result. (The reduction result is expected to have two immediate uses -
5316 one at the latch block, and one at the loop exit). */
5317 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, scalar_dest)
5318 if (!flow_bb_inside_loop_p (loop, gimple_bb (USE_STMT (use_p)))
5319 && !is_gimple_debug (USE_STMT (use_p)))
5320 phis.safe_push (USE_STMT (use_p));
5322 /* While we expect to have found an exit_phi because of loop-closed-ssa
5323 form we can end up without one if the scalar cycle is dead. */
5325 FOR_EACH_VEC_ELT (phis, i, exit_phi)
5327 if (outer_loop)
5329 stmt_vec_info exit_phi_vinfo = vinfo_for_stmt (exit_phi);
5330 gphi *vect_phi;
5332 /* FORNOW. We do not currently support the case where an inner-loop
5333 reduction is not used in the outer-loop (but only outside the
5334 outer-loop), unless it is a double reduction. */
5335 gcc_assert ((STMT_VINFO_RELEVANT_P (exit_phi_vinfo)
5336 && !STMT_VINFO_LIVE_P (exit_phi_vinfo))
5337 || double_reduc);
5339 if (double_reduc)
5340 STMT_VINFO_VEC_STMT (exit_phi_vinfo) = inner_phi;
5341 else
5342 STMT_VINFO_VEC_STMT (exit_phi_vinfo) = epilog_stmt;
5343 if (!double_reduc
5344 || STMT_VINFO_DEF_TYPE (exit_phi_vinfo)
5345 != vect_double_reduction_def)
5346 continue;
5348 /* Handle double reduction:
5350 stmt1: s1 = phi <s0, s2> - double reduction phi (outer loop)
5351 stmt2: s3 = phi <s1, s4> - (regular) reduc phi (inner loop)
5352 stmt3: s4 = use (s3) - (regular) reduc stmt (inner loop)
5353 stmt4: s2 = phi <s4> - double reduction stmt (outer loop)
5355 At that point the regular reduction (stmt2 and stmt3) is
5356 already vectorized, as well as the exit phi node, stmt4.
5357 Here we vectorize the phi node of double reduction, stmt1, and
5358 update all relevant statements. */
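/* Illustrative source-level sketch of such a double reduction (not taken
   from the original sources):

       for (i = 0; i < N; i++)        <-- outer loop, stmt1/stmt4
         for (j = 0; j < M; j++)      <-- inner loop, stmt2/stmt3
           sum += a[i][j];

   The inner-loop accumulation is the regular reduction that is already
   vectorized at this point; the outer-loop PHI carrying SUM across outer
   iterations is the double reduction PHI handled below.  */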
5360 /* Go through all the uses of s2 to find double reduction phi
5361 node, i.e., stmt1 above. */
5362 orig_name = PHI_RESULT (exit_phi);
5363 FOR_EACH_IMM_USE_STMT (use_stmt, imm_iter, orig_name)
5365 stmt_vec_info use_stmt_vinfo;
5366 stmt_vec_info new_phi_vinfo;
5367 tree vect_phi_init, preheader_arg, vect_phi_res;
5368 basic_block bb = gimple_bb (use_stmt);
5369 gimple *use;
5371 /* Check that USE_STMT is really a double reduction phi
5372 node. */
5373 if (gimple_code (use_stmt) != GIMPLE_PHI
5374 || gimple_phi_num_args (use_stmt) != 2
5375 || bb->loop_father != outer_loop)
5376 continue;
5377 use_stmt_vinfo = vinfo_for_stmt (use_stmt);
5378 if (!use_stmt_vinfo
5379 || STMT_VINFO_DEF_TYPE (use_stmt_vinfo)
5380 != vect_double_reduction_def)
5381 continue;
5383 /* Create vector phi node for double reduction:
5384 vs1 = phi <vs0, vs2>
5385 vs1 was created previously in this function by a call to
5386 vect_get_vec_def_for_operand and is stored in
5387 vec_initial_def;
5388 vs2 is defined by INNER_PHI, the vectorized EXIT_PHI;
5389 vs0 is created here. */
5391 /* Create vector phi node. */
5392 vect_phi = create_phi_node (vec_initial_def, bb);
5393 new_phi_vinfo = new_stmt_vec_info (vect_phi,
5394 loop_vec_info_for_loop (outer_loop));
5395 set_vinfo_for_stmt (vect_phi, new_phi_vinfo);
5397 /* Create vs0 - initial def of the double reduction phi. */
5398 preheader_arg = PHI_ARG_DEF_FROM_EDGE (use_stmt,
5399 loop_preheader_edge (outer_loop));
5400 vect_phi_init = get_initial_def_for_reduction
5401 (stmt, preheader_arg, NULL);
5403 /* Update phi node arguments with vs0 and vs2. */
5404 add_phi_arg (vect_phi, vect_phi_init,
5405 loop_preheader_edge (outer_loop),
5406 UNKNOWN_LOCATION);
5407 add_phi_arg (vect_phi, PHI_RESULT (inner_phi),
5408 loop_latch_edge (outer_loop), UNKNOWN_LOCATION);
5409 if (dump_enabled_p ())
5411 dump_printf_loc (MSG_NOTE, vect_location,
5412 "created double reduction phi node: ");
5413 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, vect_phi, 0);
5416 vect_phi_res = PHI_RESULT (vect_phi);
5418 /* Replace the use, i.e., set the correct vs1 in the regular
5419 reduction phi node. FORNOW, NCOPIES is always 1, so the
5420 loop is redundant. */
5421 use = reduction_phi;
5422 for (j = 0; j < ncopies; j++)
5424 edge pr_edge = loop_preheader_edge (loop);
5425 SET_PHI_ARG_DEF (use, pr_edge->dest_idx, vect_phi_res);
5426 use = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (use));
5432 phis.release ();
5433 if (nested_in_vect_loop)
5435 if (double_reduc)
5436 loop = outer_loop;
5437 else
5438 continue;
5441 phis.create (3);
5442 /* Find the loop-closed-use at the loop exit of the original scalar
5443 result. (The reduction result is expected to have two immediate uses,
5444 one at the latch block, and one at the loop exit). For double
5445 reductions we are looking for exit phis of the outer loop. */
5446 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, scalar_dest)
5448 if (!flow_bb_inside_loop_p (loop, gimple_bb (USE_STMT (use_p))))
5450 if (!is_gimple_debug (USE_STMT (use_p)))
5451 phis.safe_push (USE_STMT (use_p));
5453 else
5455 if (double_reduc && gimple_code (USE_STMT (use_p)) == GIMPLE_PHI)
5457 tree phi_res = PHI_RESULT (USE_STMT (use_p));
5459 FOR_EACH_IMM_USE_FAST (phi_use_p, phi_imm_iter, phi_res)
5461 if (!flow_bb_inside_loop_p (loop,
5462 gimple_bb (USE_STMT (phi_use_p)))
5463 && !is_gimple_debug (USE_STMT (phi_use_p)))
5464 phis.safe_push (USE_STMT (phi_use_p));
5470 FOR_EACH_VEC_ELT (phis, i, exit_phi)
5472 /* Replace the uses: */
5473 orig_name = PHI_RESULT (exit_phi);
5474 scalar_result = scalar_results[k];
5475 FOR_EACH_IMM_USE_STMT (use_stmt, imm_iter, orig_name)
5476 FOR_EACH_IMM_USE_ON_STMT (use_p, imm_iter)
5477 SET_USE (use_p, scalar_result);
5480 phis.release ();
5485 /* Function is_nonwrapping_integer_induction.
5487 Check whether STMT (which is part of loop LOOP) is an induction that
5488 both increments and does not overflow. */
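/* Worked example (values invented): for an unsigned 8-bit induction with
   BASE = 10, STEP = 4 and a proven maximum of 50 iterations, the largest
   value reached is 10 + 4 * 50 = 210, which needs 8 bits, so the final
   precision check succeeds; with 100 iterations the value 410 would need
   9 bits and the induction is rejected as potentially wrapping.  */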
5490 static bool
5491 is_nonwrapping_integer_induction (gimple *stmt, struct loop *loop)
5493 stmt_vec_info stmt_vinfo = vinfo_for_stmt (stmt);
5494 tree base = STMT_VINFO_LOOP_PHI_EVOLUTION_BASE_UNCHANGED (stmt_vinfo);
5495 tree step = STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_vinfo);
5496 tree lhs_type = TREE_TYPE (gimple_phi_result (stmt));
5497 widest_int ni, max_loop_value, lhs_max;
5498 bool overflow = false;
5500 /* Make sure the loop is integer based. */
5501 if (TREE_CODE (base) != INTEGER_CST
5502 || TREE_CODE (step) != INTEGER_CST)
5503 return false;
5505 /* Check that the induction increments. */
5506 if (tree_int_cst_sgn (step) == -1)
5507 return false;
5509 /* Check that the max size of the loop will not wrap. */
5511 if (TYPE_OVERFLOW_UNDEFINED (lhs_type))
5512 return true;
5514 if (! max_stmt_executions (loop, &ni))
5515 return false;
5517 max_loop_value = wi::mul (wi::to_widest (step), ni, TYPE_SIGN (lhs_type),
5518 &overflow);
5519 if (overflow)
5520 return false;
5522 max_loop_value = wi::add (wi::to_widest (base), max_loop_value,
5523 TYPE_SIGN (lhs_type), &overflow);
5524 if (overflow)
5525 return false;
5527 return (wi::min_precision (max_loop_value, TYPE_SIGN (lhs_type))
5528 <= TYPE_PRECISION (lhs_type));
5531 /* Function vectorizable_reduction.
5533 Check if STMT performs a reduction operation that can be vectorized.
5534 If VEC_STMT is also passed, vectorize the STMT: create a vectorized
5535 stmt to replace it, put it in VEC_STMT, and insert it at GSI.
5536 Return FALSE if not a vectorizable STMT, TRUE otherwise.
5538 This function also handles reduction idioms (patterns) that have been
5539 recognized in advance during vect_pattern_recog. In this case, STMT may be
5540 of this form:
5541 X = pattern_expr (arg0, arg1, ..., X)
5542 and its STMT_VINFO_RELATED_STMT points to the last stmt in the original
5543 sequence that had been detected and replaced by the pattern-stmt (STMT).
5545 This function also handles reduction of condition expressions, for example:
5546 for (int i = 0; i < N; i++)
5547 if (a[i] < value)
5548 last = a[i];
5549 This is handled by vectorising the loop and creating an additional vector
5550 containing the loop indexes for which "a[i] < value" was true. In the
5551 function epilogue this is reduced to a single max value and then used to
5552 index into the vector of results.
5554 In some cases of reduction patterns, the type of the reduction variable X is
5555 different than the type of the other arguments of STMT.
5556 In such cases, the vectype that is used when transforming STMT into a vector
5557 stmt is different than the vectype that is used to determine the
5558 vectorization factor, because it consists of a different number of elements
5559 than the actual number of elements that are being operated upon in parallel.
5561 For example, consider an accumulation of shorts into an int accumulator.
5562 On some targets it's possible to vectorize this pattern operating on 8
5563 shorts at a time (hence, the vectype for purposes of determining the
5564 vectorization factor should be V8HI); on the other hand, the vectype that
5565 is used to create the vector form is actually V4SI (the type of the result).
5567 Upon entry to this function, STMT_VINFO_VECTYPE records the vectype that
5568 indicates the actual level of parallelism (V8HI in the example), so
5569 that the right vectorization factor would be derived. This vectype
5570 corresponds to the type of arguments to the reduction stmt, and should *NOT*
5571 be used to create the vectorized stmt. The right vectype for the vectorized
5572 stmt is obtained from the type of the result X:
5573 get_vectype_for_scalar_type (TREE_TYPE (X))
5575 This means that, contrary to "regular" reductions (or "regular" stmts in
5576 general), the following equation:
5577 STMT_VINFO_VECTYPE == get_vectype_for_scalar_type (TREE_TYPE (X))
5578 does *NOT* necessarily hold for reduction patterns. */
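/* Illustrative sketch of the condition reduction described above (not
   from the original sources): for

       for (i = 0; i < N; i++)
         if (a[i] < value)
           last = a[i];

   the vector loop conceptually keeps, besides the data,
       vec_ind = VEC_COND_EXPR <a_vec < value_vec, ind_vec, vec_ind>;
   where IND_VEC is an induction of the loop indexes.  The epilogue
   reduces VEC_IND with a maximum reduction to find the lane of the last
   match and uses that lane to extract the result.  */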
5580 bool
5581 vectorizable_reduction (gimple *stmt, gimple_stmt_iterator *gsi,
5582 gimple **vec_stmt, slp_tree slp_node,
5583 slp_instance slp_node_instance)
5585 tree vec_dest;
5586 tree scalar_dest;
5587 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
5588 tree vectype_out = STMT_VINFO_VECTYPE (stmt_info);
5589 tree vectype_in = NULL_TREE;
5590 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
5591 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
5592 enum tree_code code, orig_code, epilog_reduc_code;
5593 machine_mode vec_mode;
5594 int op_type;
5595 optab optab, reduc_optab;
5596 tree new_temp = NULL_TREE;
5597 gimple *def_stmt;
5598 enum vect_def_type dt, cond_reduc_dt = vect_unknown_def_type;
5599 tree scalar_type;
5600 bool is_simple_use;
5601 gimple *orig_stmt;
5602 stmt_vec_info orig_stmt_info = NULL;
5603 int i;
5604 int ncopies;
5605 int epilog_copies;
5606 stmt_vec_info prev_stmt_info, prev_phi_info;
5607 bool single_defuse_cycle = false;
5608 gimple *new_stmt = NULL;
5609 int j;
5610 tree ops[3];
5611 enum vect_def_type dts[3];
5612 bool nested_cycle = false, found_nested_cycle_def = false;
5613 bool double_reduc = false;
5614 basic_block def_bb;
5615 struct loop * def_stmt_loop, *outer_loop = NULL;
5616 tree def_arg;
5617 gimple *def_arg_stmt;
5618 auto_vec<tree> vec_oprnds0;
5619 auto_vec<tree> vec_oprnds1;
5620 auto_vec<tree> vec_oprnds2;
5621 auto_vec<tree> vect_defs;
5622 auto_vec<gimple *> phis;
5623 int vec_num;
5624 tree def0, tem;
5625 bool first_p = true;
5626 tree cr_index_scalar_type = NULL_TREE, cr_index_vector_type = NULL_TREE;
5627 tree cond_reduc_val = NULL_TREE;
5629 /* Make sure it was already recognized as a reduction computation. */
5630 if (STMT_VINFO_DEF_TYPE (vinfo_for_stmt (stmt)) != vect_reduction_def
5631 && STMT_VINFO_DEF_TYPE (vinfo_for_stmt (stmt)) != vect_nested_cycle)
5632 return false;
5634 if (nested_in_vect_loop_p (loop, stmt))
5636 outer_loop = loop;
5637 loop = loop->inner;
5638 nested_cycle = true;
5641 /* In case of reduction chain we switch to the first stmt in the chain, but
5642 we don't update STMT_INFO, since only the last stmt is marked as reduction
5643 and has reduction properties. */
5644 if (GROUP_FIRST_ELEMENT (stmt_info)
5645 && GROUP_FIRST_ELEMENT (stmt_info) != stmt)
5647 stmt = GROUP_FIRST_ELEMENT (stmt_info);
5648 first_p = false;
5651 if (gimple_code (stmt) == GIMPLE_PHI)
5653 /* Analysis is fully done on the reduction stmt invocation. */
5654 if (! vec_stmt)
5656 if (slp_node)
5657 slp_node_instance->reduc_phis = slp_node;
5659 STMT_VINFO_TYPE (stmt_info) = reduc_vec_info_type;
5660 return true;
5663 gimple *reduc_stmt = STMT_VINFO_REDUC_DEF (stmt_info);
5664 if (STMT_VINFO_IN_PATTERN_P (vinfo_for_stmt (reduc_stmt)))
5665 reduc_stmt = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (reduc_stmt));
5667 gcc_assert (is_gimple_assign (reduc_stmt));
5668 for (unsigned k = 1; k < gimple_num_ops (reduc_stmt); ++k)
5670 tree op = gimple_op (reduc_stmt, k);
5671 if (op == gimple_phi_result (stmt))
5672 continue;
5673 if (k == 1
5674 && gimple_assign_rhs_code (reduc_stmt) == COND_EXPR)
5675 continue;
5676 tem = get_vectype_for_scalar_type (TREE_TYPE (op));
5677 if (! vectype_in
5678 || TYPE_VECTOR_SUBPARTS (tem) < TYPE_VECTOR_SUBPARTS (vectype_in))
5679 vectype_in = tem;
5680 break;
5682 gcc_assert (vectype_in);
5684 if (slp_node)
5685 ncopies = 1;
5686 else
5687 ncopies = vect_get_num_copies (loop_vinfo, vectype_in);
5689 use_operand_p use_p;
5690 gimple *use_stmt;
5691 if (ncopies > 1
5692 && (STMT_VINFO_RELEVANT (vinfo_for_stmt (reduc_stmt))
5693 <= vect_used_only_live)
5694 && single_imm_use (gimple_phi_result (stmt), &use_p, &use_stmt)
5695 && (use_stmt == reduc_stmt
5696 || (STMT_VINFO_RELATED_STMT (vinfo_for_stmt (use_stmt))
5697 == reduc_stmt)))
5698 single_defuse_cycle = true;
5700 /* Create the destination vector */
5701 scalar_dest = gimple_assign_lhs (reduc_stmt);
5702 vec_dest = vect_create_destination_var (scalar_dest, vectype_out);
5704 if (slp_node)
5705 /* The size vect_schedule_slp_instance computes is off for us. */
5706 vec_num = ((LOOP_VINFO_VECT_FACTOR (loop_vinfo)
5707 * SLP_TREE_SCALAR_STMTS (slp_node).length ())
5708 / TYPE_VECTOR_SUBPARTS (vectype_in));
5709 else
5710 vec_num = 1;
5712 /* Generate the reduction PHIs upfront. */
5713 prev_phi_info = NULL;
5714 for (j = 0; j < ncopies; j++)
5716 if (j == 0 || !single_defuse_cycle)
5718 for (i = 0; i < vec_num; i++)
5720 /* Create the reduction-phi that defines the reduction
5721 operand. */
5722 gimple *new_phi = create_phi_node (vec_dest, loop->header);
5723 set_vinfo_for_stmt (new_phi,
5724 new_stmt_vec_info (new_phi, loop_vinfo));
5726 if (slp_node)
5727 SLP_TREE_VEC_STMTS (slp_node).quick_push (new_phi);
5728 else
5730 if (j == 0)
5731 STMT_VINFO_VEC_STMT (stmt_info) = *vec_stmt = new_phi;
5732 else
5733 STMT_VINFO_RELATED_STMT (prev_phi_info) = new_phi;
5734 prev_phi_info = vinfo_for_stmt (new_phi);
5740 return true;
5743 /* 1. Is vectorizable reduction? */
5744 /* Not supportable if the reduction variable is used in the loop, unless
5745 it's a reduction chain. */
5746 if (STMT_VINFO_RELEVANT (stmt_info) > vect_used_in_outer
5747 && !GROUP_FIRST_ELEMENT (stmt_info))
5748 return false;
5750 /* Reductions that are not used even in an enclosing outer-loop
5751 are expected to be "live" (used out of the loop). */
5752 if (STMT_VINFO_RELEVANT (stmt_info) == vect_unused_in_scope
5753 && !STMT_VINFO_LIVE_P (stmt_info))
5754 return false;
5756 /* 2. Has this been recognized as a reduction pattern?
5758 Check if STMT represents a pattern that has been recognized
5759 in earlier analysis stages. For stmts that represent a pattern,
5760 the STMT_VINFO_RELATED_STMT field records the last stmt in
5761 the original sequence that constitutes the pattern. */
5763 orig_stmt = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (stmt));
5764 if (orig_stmt)
5766 orig_stmt_info = vinfo_for_stmt (orig_stmt);
5767 gcc_assert (STMT_VINFO_IN_PATTERN_P (orig_stmt_info));
5768 gcc_assert (!STMT_VINFO_IN_PATTERN_P (stmt_info));
5771 /* 3. Check the operands of the operation. The first operands are defined
5772 inside the loop body. The last operand is the reduction variable,
5773 which is defined by the loop-header-phi. */
5775 gcc_assert (is_gimple_assign (stmt));
5777 /* Flatten RHS. */
5778 switch (get_gimple_rhs_class (gimple_assign_rhs_code (stmt)))
5780 case GIMPLE_BINARY_RHS:
5781 code = gimple_assign_rhs_code (stmt);
5782 op_type = TREE_CODE_LENGTH (code);
5783 gcc_assert (op_type == binary_op);
5784 ops[0] = gimple_assign_rhs1 (stmt);
5785 ops[1] = gimple_assign_rhs2 (stmt);
5786 break;
5788 case GIMPLE_TERNARY_RHS:
5789 code = gimple_assign_rhs_code (stmt);
5790 op_type = TREE_CODE_LENGTH (code);
5791 gcc_assert (op_type == ternary_op);
5792 ops[0] = gimple_assign_rhs1 (stmt);
5793 ops[1] = gimple_assign_rhs2 (stmt);
5794 ops[2] = gimple_assign_rhs3 (stmt);
5795 break;
5797 case GIMPLE_UNARY_RHS:
5798 return false;
5800 default:
5801 gcc_unreachable ();
5804 if (code == COND_EXPR && slp_node)
5805 return false;
5807 scalar_dest = gimple_assign_lhs (stmt);
5808 scalar_type = TREE_TYPE (scalar_dest);
5809 if (!POINTER_TYPE_P (scalar_type) && !INTEGRAL_TYPE_P (scalar_type)
5810 && !SCALAR_FLOAT_TYPE_P (scalar_type))
5811 return false;
5813 /* Do not try to vectorize bit-precision reductions. */
5814 if (!type_has_mode_precision_p (scalar_type))
5815 return false;
5817 /* All uses but the last are expected to be defined in the loop.
5818 The last use is the reduction variable. In case of nested cycle this
5819 assumption is not true: we use reduc_index to record the index of the
5820 reduction variable. */
5821 gimple *reduc_def_stmt = NULL;
5822 int reduc_index = -1;
5823 for (i = 0; i < op_type; i++)
5825 /* The condition of COND_EXPR is checked in vectorizable_condition(). */
5826 if (i == 0 && code == COND_EXPR)
5827 continue;
5829 is_simple_use = vect_is_simple_use (ops[i], loop_vinfo,
5830 &def_stmt, &dts[i], &tem);
5831 dt = dts[i];
5832 gcc_assert (is_simple_use);
5833 if (dt == vect_reduction_def)
5835 reduc_def_stmt = def_stmt;
5836 reduc_index = i;
5837 continue;
5839 else
5841 if (!vectype_in)
5842 vectype_in = tem;
5845 if (dt != vect_internal_def
5846 && dt != vect_external_def
5847 && dt != vect_constant_def
5848 && dt != vect_induction_def
5849 && !(dt == vect_nested_cycle && nested_cycle))
5850 return false;
5852 if (dt == vect_nested_cycle)
5854 found_nested_cycle_def = true;
5855 reduc_def_stmt = def_stmt;
5856 reduc_index = i;
5859 if (i == 1 && code == COND_EXPR)
5861 /* Record how value of COND_EXPR is defined. */
5862 if (dt == vect_constant_def)
5864 cond_reduc_dt = dt;
5865 cond_reduc_val = ops[i];
5867 if (dt == vect_induction_def && def_stmt != NULL
5868 && is_nonwrapping_integer_induction (def_stmt, loop))
5869 cond_reduc_dt = dt;
5873 if (!vectype_in)
5874 vectype_in = vectype_out;
5876 /* When vectorizing a reduction chain w/o SLP the reduction PHI is not
5877 directly used in stmt. */
5878 if (reduc_index == -1)
5880 if (orig_stmt)
5881 reduc_def_stmt = STMT_VINFO_REDUC_DEF (orig_stmt_info);
5882 else
5883 reduc_def_stmt = STMT_VINFO_REDUC_DEF (stmt_info);
5886 if (! reduc_def_stmt || gimple_code (reduc_def_stmt) != GIMPLE_PHI)
5887 return false;
5889 if (!(reduc_index == -1
5890 || dts[reduc_index] == vect_reduction_def
5891 || dts[reduc_index] == vect_nested_cycle
5892 || ((dts[reduc_index] == vect_internal_def
5893 || dts[reduc_index] == vect_external_def
5894 || dts[reduc_index] == vect_constant_def
5895 || dts[reduc_index] == vect_induction_def)
5896 && nested_cycle && found_nested_cycle_def)))
5898 /* For pattern recognized stmts, orig_stmt might be a reduction,
5899 but some helper statements for the pattern might not be, or
5900 might be COND_EXPRs with reduction uses in the condition. */
5901 gcc_assert (orig_stmt);
5902 return false;
5905 stmt_vec_info reduc_def_info = vinfo_for_stmt (reduc_def_stmt);
5906 enum vect_reduction_type v_reduc_type
5907 = STMT_VINFO_REDUC_TYPE (reduc_def_info);
5908 gimple *tmp = STMT_VINFO_REDUC_DEF (reduc_def_info);
5910 STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info) = v_reduc_type;
5911 /* If we have a condition reduction, see if we can simplify it further. */
5912 if (v_reduc_type == COND_REDUCTION)
5914 if (cond_reduc_dt == vect_induction_def)
5916 if (dump_enabled_p ())
5917 dump_printf_loc (MSG_NOTE, vect_location,
5918 "condition expression based on "
5919 "integer induction.\n");
5920 STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info)
5921 = INTEGER_INDUC_COND_REDUCTION;
5924 /* Loop peeling modifies the initial value of the reduction PHI, which
5925 makes the reduction stmt to be transformed differ from the original
5926 stmt that was analyzed. We need to record the reduction code for
5927 CONST_COND_REDUCTION-type reductions at the analysis stage, so that
5928 it can be used directly at the transform stage. */
5929 if (STMT_VINFO_VEC_CONST_COND_REDUC_CODE (stmt_info) == MAX_EXPR
5930 || STMT_VINFO_VEC_CONST_COND_REDUC_CODE (stmt_info) == MIN_EXPR)
5932 /* Also set the reduction type to CONST_COND_REDUCTION. */
5933 gcc_assert (cond_reduc_dt == vect_constant_def);
5934 STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info) = CONST_COND_REDUCTION;
5936 else if (cond_reduc_dt == vect_constant_def)
5938 enum vect_def_type cond_initial_dt;
5939 gimple *def_stmt = SSA_NAME_DEF_STMT (ops[reduc_index]);
5940 tree cond_initial_val
5941 = PHI_ARG_DEF_FROM_EDGE (def_stmt, loop_preheader_edge (loop));
5943 gcc_assert (cond_reduc_val != NULL_TREE);
5944 vect_is_simple_use (cond_initial_val, loop_vinfo,
5945 &def_stmt, &cond_initial_dt);
5946 if (cond_initial_dt == vect_constant_def
5947 && types_compatible_p (TREE_TYPE (cond_initial_val),
5948 TREE_TYPE (cond_reduc_val)))
5950 tree e = fold_binary (LE_EXPR, boolean_type_node,
5951 cond_initial_val, cond_reduc_val);
5952 if (e && (integer_onep (e) || integer_zerop (e)))
5954 if (dump_enabled_p ())
5955 dump_printf_loc (MSG_NOTE, vect_location,
5956 "condition expression based on "
5957 "compile time constant.\n");
5958 /* Record reduction code at analysis stage. */
5959 STMT_VINFO_VEC_CONST_COND_REDUC_CODE (stmt_info)
5960 = integer_onep (e) ? MAX_EXPR : MIN_EXPR;
5961 STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info)
5962 = CONST_COND_REDUCTION;
5968 if (orig_stmt)
5969 gcc_assert (tmp == orig_stmt
5970 || GROUP_FIRST_ELEMENT (vinfo_for_stmt (tmp)) == orig_stmt);
5971 else
5972 /* We changed STMT to be the first stmt in reduction chain, hence we
5973 check that in this case the first element in the chain is STMT. */
5974 gcc_assert (stmt == tmp
5975 || GROUP_FIRST_ELEMENT (vinfo_for_stmt (tmp)) == stmt);
5977 if (STMT_VINFO_LIVE_P (vinfo_for_stmt (reduc_def_stmt)))
5978 return false;
5980 if (slp_node)
5981 ncopies = 1;
5982 else
5983 ncopies = vect_get_num_copies (loop_vinfo, vectype_in);
5985 gcc_assert (ncopies >= 1);
5987 vec_mode = TYPE_MODE (vectype_in);
5989 if (code == COND_EXPR)
5991 /* Only call during the analysis stage, otherwise we'll lose
5992 STMT_VINFO_TYPE. */
5993 if (!vec_stmt && !vectorizable_condition (stmt, gsi, NULL,
5994 ops[reduc_index], 0, NULL))
5996 if (dump_enabled_p ())
5997 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5998 "unsupported condition in reduction\n");
5999 return false;
6002 else
6004 /* 4. Supportable by target? */
6006 if (code == LSHIFT_EXPR || code == RSHIFT_EXPR
6007 || code == LROTATE_EXPR || code == RROTATE_EXPR)
6009 /* Shifts and rotates are only supported by vectorizable_shift,
6010 not vectorizable_reduction. */
6011 if (dump_enabled_p ())
6012 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6013 "unsupported shift or rotation.\n");
6014 return false;
6017 /* 4.1. check support for the operation in the loop */
6018 optab = optab_for_tree_code (code, vectype_in, optab_default);
6019 if (!optab)
6021 if (dump_enabled_p ())
6022 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6023 "no optab.\n");
6025 return false;
6028 if (optab_handler (optab, vec_mode) == CODE_FOR_nothing)
6030 if (dump_enabled_p ())
6031 dump_printf (MSG_NOTE, "op not supported by target.\n");
6033 if (GET_MODE_SIZE (vec_mode) != UNITS_PER_WORD
6034 || !vect_worthwhile_without_simd_p (loop_vinfo, code))
6035 return false;
6037 if (dump_enabled_p ())
6038 dump_printf (MSG_NOTE, "proceeding using word mode.\n");
6041 /* Worthwhile without SIMD support? */
6042 if (!VECTOR_MODE_P (TYPE_MODE (vectype_in))
6043 && !vect_worthwhile_without_simd_p (loop_vinfo, code))
6045 if (dump_enabled_p ())
6046 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6047 "not worthwhile without SIMD support.\n");
6049 return false;
6053 /* 4.2. Check support for the epilog operation.
6055 If STMT represents a reduction pattern, then the type of the
6056 reduction variable may be different than the type of the rest
6057 of the arguments. For example, consider the case of accumulation
6058 of shorts into an int accumulator. The original code:
6059 S1: int_a = (int) short_a;
6060 orig_stmt-> S2: int_acc = plus <int_a ,int_acc>;
6062 was replaced with:
6063 STMT: int_acc = widen_sum <short_a, int_acc>
6065 This means that:
6066 1. The tree-code that is used to create the vector operation in the
6067 epilog code (that reduces the partial results) is not the
6068 tree-code of STMT, but is rather the tree-code of the original
6069 stmt from the pattern that STMT is replacing. I.e., in the example
6070 above we want to use 'widen_sum' in the loop, but 'plus' in the
6071 epilog.
6072 2. The type (mode) we use to check available target support
6073 for the vector operation to be created in the *epilog*, is
6074 determined by the type of the reduction variable (in the example
6075 above we'd check this: optab_handler (plus_optab, vect_int_mode)).
6076 However the type (mode) we use to check available target support
6077 for the vector operation to be created *inside the loop*, is
6078 determined by the type of the other arguments to STMT (in the
6079 example we'd check this: optab_handler (widen_sum_optab,
6080 vect_short_mode)).
6082 This is contrary to "regular" reductions, in which the types of all
6083 the arguments are the same as the type of the reduction variable.
6084 For "regular" reductions we can therefore use the same vector type
6085 (and also the same tree-code) when generating the epilog code and
6086 when generating the code inside the loop. */
6088 if (orig_stmt)
6090 /* This is a reduction pattern: get the vectype from the type of the
6091 reduction variable, and get the tree-code from orig_stmt. */
6092 gcc_assert (STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info)
6093 == TREE_CODE_REDUCTION);
6094 orig_code = gimple_assign_rhs_code (orig_stmt);
6095 gcc_assert (vectype_out);
6096 vec_mode = TYPE_MODE (vectype_out);
6098 else
6100 /* Regular reduction: the same vectype and tree-code as used for the
6101 vector code inside the loop can be used for the epilog code. */
6102 orig_code = code;
6104 if (code == MINUS_EXPR)
6105 orig_code = PLUS_EXPR;
6107 /* For simple condition reductions, replace with the actual expression
6108 we want to base our reduction around. */
6109 if (STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info) == CONST_COND_REDUCTION)
6111 orig_code = STMT_VINFO_VEC_CONST_COND_REDUC_CODE (stmt_info);
6112 gcc_assert (orig_code == MAX_EXPR || orig_code == MIN_EXPR);
6114 else if (STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info)
6115 == INTEGER_INDUC_COND_REDUCTION)
6116 orig_code = MAX_EXPR;
6119 if (nested_cycle)
6121 def_bb = gimple_bb (reduc_def_stmt);
6122 def_stmt_loop = def_bb->loop_father;
6123 def_arg = PHI_ARG_DEF_FROM_EDGE (reduc_def_stmt,
6124 loop_preheader_edge (def_stmt_loop));
6125 if (TREE_CODE (def_arg) == SSA_NAME
6126 && (def_arg_stmt = SSA_NAME_DEF_STMT (def_arg))
6127 && gimple_code (def_arg_stmt) == GIMPLE_PHI
6128 && flow_bb_inside_loop_p (outer_loop, gimple_bb (def_arg_stmt))
6129 && vinfo_for_stmt (def_arg_stmt)
6130 && STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def_arg_stmt))
6131 == vect_double_reduction_def)
6132 double_reduc = true;
6135 epilog_reduc_code = ERROR_MARK;
6137 if (STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info) != COND_REDUCTION)
6139 if (reduction_code_for_scalar_code (orig_code, &epilog_reduc_code))
6141 reduc_optab = optab_for_tree_code (epilog_reduc_code, vectype_out,
6142 optab_default);
6143 if (!reduc_optab)
6145 if (dump_enabled_p ())
6146 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6147 "no optab for reduction.\n");
6149 epilog_reduc_code = ERROR_MARK;
6151 else if (optab_handler (reduc_optab, vec_mode) == CODE_FOR_nothing)
6153 if (dump_enabled_p ())
6154 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6155 "reduc op not supported by target.\n");
6157 epilog_reduc_code = ERROR_MARK;
6160 else
6162 if (!nested_cycle || double_reduc)
6164 if (dump_enabled_p ())
6165 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6166 "no reduc code for scalar code.\n");
6168 return false;
6172 else
6174 int scalar_precision
6175 = GET_MODE_PRECISION (SCALAR_TYPE_MODE (scalar_type));
6176 cr_index_scalar_type = make_unsigned_type (scalar_precision);
6177 cr_index_vector_type = build_vector_type
6178 (cr_index_scalar_type, TYPE_VECTOR_SUBPARTS (vectype_out));
6180 optab = optab_for_tree_code (REDUC_MAX_EXPR, cr_index_vector_type,
6181 optab_default);
6182 if (optab_handler (optab, TYPE_MODE (cr_index_vector_type))
6183 != CODE_FOR_nothing)
6184 epilog_reduc_code = REDUC_MAX_EXPR;
6187 if ((double_reduc
6188 || STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info) != TREE_CODE_REDUCTION)
6189 && ncopies > 1)
6191 if (dump_enabled_p ())
6192 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6193 "multiple types in double reduction or condition "
6194 "reduction.\n");
6195 return false;
6198 /* In case of widening multiplication by a constant, we update the type
6199 of the constant to be the type of the other operand. We check that the
6200 constant fits the type in the pattern recognition pass. */
6201 if (code == DOT_PROD_EXPR
6202 && !types_compatible_p (TREE_TYPE (ops[0]), TREE_TYPE (ops[1])))
6204 if (TREE_CODE (ops[0]) == INTEGER_CST)
6205 ops[0] = fold_convert (TREE_TYPE (ops[1]), ops[0]);
6206 else if (TREE_CODE (ops[1]) == INTEGER_CST)
6207 ops[1] = fold_convert (TREE_TYPE (ops[0]), ops[1]);
6208 else
6210 if (dump_enabled_p ())
6211 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6212 "invalid types in dot-prod\n");
6214 return false;
6218 if (STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info) == COND_REDUCTION)
6220 widest_int ni;
6222 if (! max_loop_iterations (loop, &ni))
6224 if (dump_enabled_p ())
6225 dump_printf_loc (MSG_NOTE, vect_location,
6226 "loop count not known, cannot create cond "
6227 "reduction.\n");
6228 return false;
6230 /* Convert backedges to iterations. */
6231 ni += 1;
6233 /* The additional index will be the same type as the condition. Check
6234 that the loop iteration count fits into this type less one (because
6235 we use up the zero slot for iterations with no matches). */
6236 tree max_index = TYPE_MAX_VALUE (cr_index_scalar_type);
6237 if (wi::geu_p (ni, wi::to_widest (max_index)))
6239 if (dump_enabled_p ())
6240 dump_printf_loc (MSG_NOTE, vect_location,
6241 "loop size is greater than data size.\n");
6242 return false;
6246 /* In case the vectorization factor (VF) is bigger than the number
6247 of elements that we can fit in a vectype (nunits), we have to generate
6248 more than one vector stmt - i.e. - we need to "unroll" the
6249 vector stmt by a factor VF/nunits. For more details see documentation
6250 in vectorizable_operation. */
6252 /* If the reduction is used in an outer loop we need to generate
6253 VF intermediate results, like so (e.g. for ncopies=2):
6254 r0 = phi (init, r0)
6255 r1 = phi (init, r1)
6256 r0 = x0 + r0;
6257 r1 = x1 + r1;
6258 (i.e. we generate VF results in 2 registers).
6259 In this case we have a separate def-use cycle for each copy, and therefore
6260 for each copy we get the vector def for the reduction variable from the
6261 respective phi node created for this copy.
6263 Otherwise (the reduction is unused in the loop nest), we can combine
6264 together intermediate results, like so (e.g. for ncopies=2):
6265 r = phi (init, r)
6266 r = x0 + r;
6267 r = x1 + r;
6268 (i.e. we generate VF/2 results in a single register).
6269 In this case for each copy we get the vector def for the reduction variable
6270 from the vectorized reduction operation generated in the previous iteration.
6272 This only works when we see both the reduction PHI and its only consumer
6273 in vectorizable_reduction and there are no intermediate stmts
6274 participating. */
6275 use_operand_p use_p;
6276 gimple *use_stmt;
6277 if (ncopies > 1
6278 && (STMT_VINFO_RELEVANT (stmt_info) <= vect_used_only_live)
6279 && single_imm_use (gimple_phi_result (reduc_def_stmt), &use_p, &use_stmt)
6280 && (use_stmt == stmt
6281 || STMT_VINFO_RELATED_STMT (vinfo_for_stmt (use_stmt)) == stmt))
6283 single_defuse_cycle = true;
6284 epilog_copies = 1;
6286 else
6287 epilog_copies = ncopies;
6289 /* If the reduction stmt is one of the patterns that have lane
6290 reduction embedded we cannot handle the case of ! single_defuse_cycle. */
6291 if ((ncopies > 1
6292 && ! single_defuse_cycle)
6293 && (code == DOT_PROD_EXPR
6294 || code == WIDEN_SUM_EXPR
6295 || code == SAD_EXPR))
6297 if (dump_enabled_p ())
6298 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6299 "multi def-use cycle not possible for lane-reducing "
6300 "reduction operation\n");
6301 return false;
6304 if (!vec_stmt) /* transformation not required. */
6306 if (first_p)
6307 vect_model_reduction_cost (stmt_info, epilog_reduc_code, ncopies);
6308 STMT_VINFO_TYPE (stmt_info) = reduc_vec_info_type;
6309 return true;
6312 /* Transform. */
6314 if (dump_enabled_p ())
6315 dump_printf_loc (MSG_NOTE, vect_location, "transform reduction.\n");
6317 /* FORNOW: Multiple types are not supported for condition. */
6318 if (code == COND_EXPR)
6319 gcc_assert (ncopies == 1);
6321 /* Create the destination vector */
6322 vec_dest = vect_create_destination_var (scalar_dest, vectype_out);
6324 prev_stmt_info = NULL;
6325 prev_phi_info = NULL;
6326 if (slp_node)
6327 vec_num = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node);
6328 else
6330 vec_num = 1;
6331 vec_oprnds0.create (1);
6332 vec_oprnds1.create (1);
6333 if (op_type == ternary_op)
6334 vec_oprnds2.create (1);
6337 phis.create (vec_num);
6338 vect_defs.create (vec_num);
6339 if (!slp_node)
6340 vect_defs.quick_push (NULL_TREE);
6342 if (slp_node)
6343 phis.splice (SLP_TREE_VEC_STMTS (slp_node_instance->reduc_phis));
6344 else
6345 phis.quick_push (STMT_VINFO_VEC_STMT (vinfo_for_stmt (reduc_def_stmt)));
6347 for (j = 0; j < ncopies; j++)
6349 if (code == COND_EXPR)
6351 gcc_assert (!slp_node);
6352 vectorizable_condition (stmt, gsi, vec_stmt,
6353 PHI_RESULT (phis[0]),
6354 reduc_index, NULL);
6355 /* Multiple types are not supported for condition. */
6356 break;
6359 /* Handle uses. */
6360 if (j == 0)
6362 if (slp_node)
6364 /* Get vec defs for all the operands except the reduction index,
6365 ensuring the ordering of the ops in the vector is kept. */
6366 auto_vec<tree, 3> slp_ops;
6367 auto_vec<vec<tree>, 3> vec_defs;
6369 slp_ops.quick_push (ops[0]);
6370 slp_ops.quick_push (ops[1]);
6371 if (op_type == ternary_op)
6372 slp_ops.quick_push (ops[2]);
6374 vect_get_slp_defs (slp_ops, slp_node, &vec_defs);
6376 vec_oprnds0.safe_splice (vec_defs[0]);
6377 vec_defs[0].release ();
6378 vec_oprnds1.safe_splice (vec_defs[1]);
6379 vec_defs[1].release ();
6380 if (op_type == ternary_op)
6382 vec_oprnds2.safe_splice (vec_defs[2]);
6383 vec_defs[2].release ();
6386 else
6388 vec_oprnds0.quick_push
6389 (vect_get_vec_def_for_operand (ops[0], stmt));
6390 vec_oprnds1.quick_push
6391 (vect_get_vec_def_for_operand (ops[1], stmt));
6392 if (op_type == ternary_op)
6393 vec_oprnds2.quick_push
6394 (vect_get_vec_def_for_operand (ops[2], stmt));
6397 else
6399 if (!slp_node)
6401 gcc_assert (reduc_index != -1 || ! single_defuse_cycle);
6403 if (single_defuse_cycle && reduc_index == 0)
6404 vec_oprnds0[0] = gimple_assign_lhs (new_stmt);
6405 else
6406 vec_oprnds0[0]
6407 = vect_get_vec_def_for_stmt_copy (dts[0], vec_oprnds0[0]);
6408 if (single_defuse_cycle && reduc_index == 1)
6409 vec_oprnds1[0] = gimple_assign_lhs (new_stmt);
6410 else
6411 vec_oprnds1[0]
6412 = vect_get_vec_def_for_stmt_copy (dts[1], vec_oprnds1[0]);
6413 if (op_type == ternary_op)
6415 if (single_defuse_cycle && reduc_index == 2)
6416 vec_oprnds2[0] = gimple_assign_lhs (new_stmt);
6417 else
6418 vec_oprnds2[0]
6419 = vect_get_vec_def_for_stmt_copy (dts[2], vec_oprnds2[0]);
6424 FOR_EACH_VEC_ELT (vec_oprnds0, i, def0)
6426 tree vop[3] = { def0, vec_oprnds1[i], NULL_TREE };
6427 if (op_type == ternary_op)
6428 vop[2] = vec_oprnds2[i];
6430 new_temp = make_ssa_name (vec_dest, new_stmt);
6431 new_stmt = gimple_build_assign (new_temp, code,
6432 vop[0], vop[1], vop[2]);
6433 vect_finish_stmt_generation (stmt, new_stmt, gsi);
6435 if (slp_node)
6437 SLP_TREE_VEC_STMTS (slp_node).quick_push (new_stmt);
6438 vect_defs.quick_push (new_temp);
6440 else
6441 vect_defs[0] = new_temp;
6444 if (slp_node)
6445 continue;
6447 if (j == 0)
6448 STMT_VINFO_VEC_STMT (stmt_info) = *vec_stmt = new_stmt;
6449 else
6450 STMT_VINFO_RELATED_STMT (prev_stmt_info) = new_stmt;
6452 prev_stmt_info = vinfo_for_stmt (new_stmt);
6455 /* Finalize the reduction-phi (set its arguments) and create the
6456 epilog reduction code. */
6457 if ((!single_defuse_cycle || code == COND_EXPR) && !slp_node)
6458 vect_defs[0] = gimple_assign_lhs (*vec_stmt);
6460 vect_create_epilog_for_reduction (vect_defs, stmt, reduc_def_stmt,
6461 epilog_copies,
6462 epilog_reduc_code, phis,
6463 double_reduc, slp_node, slp_node_instance);
6465 return true;
6468 /* Function vect_min_worthwhile_factor.
6470 For a loop where we could vectorize the operation indicated by CODE,
6471 return the minimum vectorization factor that makes it worthwhile
6472 to use generic vectors. */
6474 vect_min_worthwhile_factor (enum tree_code code)
6476 switch (code)
6478 case PLUS_EXPR:
6479 case MINUS_EXPR:
6480 case NEGATE_EXPR:
6481 return 4;
6483 case BIT_AND_EXPR:
6484 case BIT_IOR_EXPR:
6485 case BIT_XOR_EXPR:
6486 case BIT_NOT_EXPR:
6487 return 2;
6489 default:
6490 return INT_MAX;
6494 /* Return true if VINFO indicates we are doing loop vectorization and if
6495 it is worth decomposing CODE operations into scalar operations for
6496 that loop's vectorization factor. */
6498 bool
6499 vect_worthwhile_without_simd_p (vec_info *vinfo, tree_code code)
6501 loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo);
6502 return (loop_vinfo
6503 && (LOOP_VINFO_VECT_FACTOR (loop_vinfo)
6504 >= vect_min_worthwhile_factor (code)));
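/* Illustrative example (not from the original sources): with emulated
   (non-SIMD) vectors, a PLUS_EXPR loop is only considered worthwhile when
   the vectorization factor is at least 4, BIT_AND_EXPR and the other
   bitwise codes already pay off at a factor of 2, and any other code
   effectively requires real SIMD support (INT_MAX).  */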
6507 /* Function vectorizable_induction
6509 Check if PHI performs an induction computation that can be vectorized.
6510 If VEC_STMT is also passed, vectorize the induction PHI: create a vectorized
6511 phi to replace it, put it in VEC_STMT, and add it to the same basic block.
6512 Return FALSE if not a vectorizable STMT, TRUE otherwise. */
6514 bool
6515 vectorizable_induction (gimple *phi,
6516 gimple_stmt_iterator *gsi ATTRIBUTE_UNUSED,
6517 gimple **vec_stmt, slp_tree slp_node)
6519 stmt_vec_info stmt_info = vinfo_for_stmt (phi);
6520 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
6521 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
6522 unsigned ncopies;
6523 bool nested_in_vect_loop = false;
6524 struct loop *iv_loop;
6525 tree vec_def;
6526 edge pe = loop_preheader_edge (loop);
6527 basic_block new_bb;
6528 tree new_vec, vec_init, vec_step, t;
6529 tree new_name;
6530 gimple *new_stmt;
6531 gphi *induction_phi;
6532 tree induc_def, vec_dest;
6533 tree init_expr, step_expr;
6534 int vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
6535 unsigned i;
6536 tree expr;
6537 gimple_seq stmts;
6538 imm_use_iterator imm_iter;
6539 use_operand_p use_p;
6540 gimple *exit_phi;
6541 edge latch_e;
6542 tree loop_arg;
6543 gimple_stmt_iterator si;
6544 basic_block bb = gimple_bb (phi);
6546 if (gimple_code (phi) != GIMPLE_PHI)
6547 return false;
6549 if (!STMT_VINFO_RELEVANT_P (stmt_info))
6550 return false;
6552 /* Make sure it was recognized as induction computation. */
6553 if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_induction_def)
6554 return false;
6556 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
6557 unsigned nunits = TYPE_VECTOR_SUBPARTS (vectype);
6559 if (slp_node)
6560 ncopies = 1;
6561 else
6562 ncopies = vect_get_num_copies (loop_vinfo, vectype);
6563 gcc_assert (ncopies >= 1);
6565 /* FORNOW. These restrictions should be relaxed. */
6566 if (nested_in_vect_loop_p (loop, phi))
6568 imm_use_iterator imm_iter;
6569 use_operand_p use_p;
6570 gimple *exit_phi;
6571 edge latch_e;
6572 tree loop_arg;
6574 if (ncopies > 1)
6576 if (dump_enabled_p ())
6577 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6578 "multiple types in nested loop.\n");
6579 return false;
6582 /* FORNOW: outer loop induction with SLP not supported. */
6583 if (STMT_SLP_TYPE (stmt_info))
6584 return false;
6586 exit_phi = NULL;
6587 latch_e = loop_latch_edge (loop->inner);
6588 loop_arg = PHI_ARG_DEF_FROM_EDGE (phi, latch_e);
6589 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, loop_arg)
6591 gimple *use_stmt = USE_STMT (use_p);
6592 if (is_gimple_debug (use_stmt))
6593 continue;
6595 if (!flow_bb_inside_loop_p (loop->inner, gimple_bb (use_stmt)))
6597 exit_phi = use_stmt;
6598 break;
6601 if (exit_phi)
6603 stmt_vec_info exit_phi_vinfo = vinfo_for_stmt (exit_phi);
6604 if (!(STMT_VINFO_RELEVANT_P (exit_phi_vinfo)
6605 && !STMT_VINFO_LIVE_P (exit_phi_vinfo)))
6607 if (dump_enabled_p ())
6608 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6609 "inner-loop induction only used outside "
6610 "of the outer vectorized loop.\n");
6611 return false;
6615 nested_in_vect_loop = true;
6616 iv_loop = loop->inner;
6618 else
6619 iv_loop = loop;
6620 gcc_assert (iv_loop == (gimple_bb (phi))->loop_father);
6622 if (!vec_stmt) /* transformation not required. */
6624 STMT_VINFO_TYPE (stmt_info) = induc_vec_info_type;
6625 if (dump_enabled_p ())
6626 dump_printf_loc (MSG_NOTE, vect_location,
6627 "=== vectorizable_induction ===\n");
6628 vect_model_induction_cost (stmt_info, ncopies);
6629 return true;
6632 /* Transform. */
6634 /* Compute a vector variable, initialized with the first VF values of
6635 the induction variable. E.g., for an iv with IV_PHI='X' and
6636 evolution S, for a vector of 4 units, we want to compute:
6637 [X, X + S, X + 2*S, X + 3*S]. */
6639 if (dump_enabled_p ())
6640 dump_printf_loc (MSG_NOTE, vect_location, "transform induction phi.\n");
6642 latch_e = loop_latch_edge (iv_loop);
6643 loop_arg = PHI_ARG_DEF_FROM_EDGE (phi, latch_e);
6645 step_expr = STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_info);
6646 gcc_assert (step_expr != NULL_TREE);
6648 pe = loop_preheader_edge (iv_loop);
6649 init_expr = PHI_ARG_DEF_FROM_EDGE (phi,
6650 loop_preheader_edge (iv_loop));
6652 /* Convert the step to the desired type. */
6653 stmts = NULL;
6654 step_expr = gimple_convert (&stmts, TREE_TYPE (vectype), step_expr);
6655 if (stmts)
6657 new_bb = gsi_insert_seq_on_edge_immediate (pe, stmts);
6658 gcc_assert (!new_bb);
6661 /* Find the first insertion point in the BB. */
6662 si = gsi_after_labels (bb);
6664 /* For SLP induction we have to generate several IVs as for example
6665 with group size 3 we need [i, i, i, i + S] [i + S, i + S, i + 2*S, i + 2*S]
6666 [i + 2*S, i + 3*S, i + 3*S, i + 3*S]. The step is the same uniform
6667 [VF*S, VF*S, VF*S, VF*S] for all. */
6668 if (slp_node)
6670 /* Convert the init to the desired type. */
6671 stmts = NULL;
6672 init_expr = gimple_convert (&stmts, TREE_TYPE (vectype), init_expr);
6673 if (stmts)
6675 new_bb = gsi_insert_seq_on_edge_immediate (pe, stmts);
6676 gcc_assert (!new_bb);
6679 /* Generate [VF*S, VF*S, ... ]. */
6680 if (SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr)))
6682 expr = build_int_cst (integer_type_node, vf);
6683 expr = fold_convert (TREE_TYPE (step_expr), expr);
6685 else
6686 expr = build_int_cst (TREE_TYPE (step_expr), vf);
6687 new_name = fold_build2 (MULT_EXPR, TREE_TYPE (step_expr),
6688 expr, step_expr);
6689 if (! CONSTANT_CLASS_P (new_name))
6690 new_name = vect_init_vector (phi, new_name,
6691 TREE_TYPE (step_expr), NULL);
6692 new_vec = build_vector_from_val (vectype, new_name);
6693 vec_step = vect_init_vector (phi, new_vec, vectype, NULL);
6695 /* Now generate the IVs. */
6696 unsigned group_size = SLP_TREE_SCALAR_STMTS (slp_node).length ();
6697 unsigned nvects = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node);
6698 unsigned elts = nunits * nvects;
6699 unsigned nivs = least_common_multiple (group_size, nunits) / nunits;
6700 gcc_assert (elts % group_size == 0);
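/* Illustrative example (numbers invented): with group_size = 2,
   nunits = 4 and nvects = 2 we get nivs = lcm (2, 4) / 4 = 1, so a single
   initial IV vector [i, i, i + S, i + S] is built here and the second
   vector is later derived from it by adding the re-use step
   [2*S, 2*S, 2*S, 2*S].  */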
6701 tree elt = init_expr;
6702 unsigned ivn;
6703 for (ivn = 0; ivn < nivs; ++ivn)
6705 auto_vec<tree, 32> elts (nunits);
6706 stmts = NULL;
6707 for (unsigned eltn = 0; eltn < nunits; ++eltn)
6709 if (ivn*nunits + eltn >= group_size
6710 && (ivn*nunits + eltn) % group_size == 0)
6711 elt = gimple_build (&stmts, PLUS_EXPR, TREE_TYPE (elt),
6712 elt, step_expr);
6713 elts.quick_push (elt);
6715 vec_init = gimple_build_vector (&stmts, vectype, elts);
6716 if (stmts)
6718 new_bb = gsi_insert_seq_on_edge_immediate (pe, stmts);
6719 gcc_assert (!new_bb);
6722 /* Create the induction-phi that defines the induction-operand. */
6723 vec_dest = vect_get_new_vect_var (vectype, vect_simple_var, "vec_iv_");
6724 induction_phi = create_phi_node (vec_dest, iv_loop->header);
6725 set_vinfo_for_stmt (induction_phi,
6726 new_stmt_vec_info (induction_phi, loop_vinfo));
6727 induc_def = PHI_RESULT (induction_phi);
6729 /* Create the iv update inside the loop */
6730 vec_def = make_ssa_name (vec_dest);
6731 new_stmt = gimple_build_assign (vec_def, PLUS_EXPR, induc_def, vec_step);
6732 gsi_insert_before (&si, new_stmt, GSI_SAME_STMT);
6733 set_vinfo_for_stmt (new_stmt, new_stmt_vec_info (new_stmt, loop_vinfo));
6735 /* Set the arguments of the phi node: */
6736 add_phi_arg (induction_phi, vec_init, pe, UNKNOWN_LOCATION);
6737 add_phi_arg (induction_phi, vec_def, loop_latch_edge (iv_loop),
6738 UNKNOWN_LOCATION);
6740 SLP_TREE_VEC_STMTS (slp_node).quick_push (induction_phi);
6743 /* Re-use IVs when we can. */
6744 if (ivn < nvects)
6746 unsigned vfp
6747 = least_common_multiple (group_size, nunits) / group_size;
6748 /* Generate [VF'*S, VF'*S, ... ]. */
6749 if (SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr)))
6751 expr = build_int_cst (integer_type_node, vfp);
6752 expr = fold_convert (TREE_TYPE (step_expr), expr);
6754 else
6755 expr = build_int_cst (TREE_TYPE (step_expr), vfp);
6756 new_name = fold_build2 (MULT_EXPR, TREE_TYPE (step_expr),
6757 expr, step_expr);
6758 if (! CONSTANT_CLASS_P (new_name))
6759 new_name = vect_init_vector (phi, new_name,
6760 TREE_TYPE (step_expr), NULL);
6761 new_vec = build_vector_from_val (vectype, new_name);
6762 vec_step = vect_init_vector (phi, new_vec, vectype, NULL);
6763 for (; ivn < nvects; ++ivn)
6765 gimple *iv = SLP_TREE_VEC_STMTS (slp_node)[ivn - nivs];
6766 tree def;
6767 if (gimple_code (iv) == GIMPLE_PHI)
6768 def = gimple_phi_result (iv);
6769 else
6770 def = gimple_assign_lhs (iv);
6771 new_stmt = gimple_build_assign (make_ssa_name (vectype),
6772 PLUS_EXPR,
6773 def, vec_step);
6774 if (gimple_code (iv) == GIMPLE_PHI)
6775 gsi_insert_before (&si, new_stmt, GSI_SAME_STMT);
6776 else
6778 gimple_stmt_iterator tgsi = gsi_for_stmt (iv);
6779 gsi_insert_after (&tgsi, new_stmt, GSI_CONTINUE_LINKING);
6781 set_vinfo_for_stmt (new_stmt,
6782 new_stmt_vec_info (new_stmt, loop_vinfo));
6783 SLP_TREE_VEC_STMTS (slp_node).quick_push (new_stmt);
6787 return true;
6790 /* Create the vector that holds the initial_value of the induction. */
6791 if (nested_in_vect_loop)
6793 /* iv_loop is nested in the loop to be vectorized. init_expr had already
6794 been created during vectorization of previous stmts. We obtain it
6795 from the STMT_VINFO_VEC_STMT of the defining stmt. */
6796 vec_init = vect_get_vec_def_for_operand (init_expr, phi);
6797 /* If the initial value is not of proper type, convert it. */
6798 if (!useless_type_conversion_p (vectype, TREE_TYPE (vec_init)))
6800 new_stmt
6801 = gimple_build_assign (vect_get_new_ssa_name (vectype,
6802 vect_simple_var,
6803 "vec_iv_"),
6804 VIEW_CONVERT_EXPR,
6805 build1 (VIEW_CONVERT_EXPR, vectype,
6806 vec_init));
6807 vec_init = gimple_assign_lhs (new_stmt);
6808 new_bb = gsi_insert_on_edge_immediate (loop_preheader_edge (iv_loop),
6809 new_stmt);
6810 gcc_assert (!new_bb);
6811 set_vinfo_for_stmt (new_stmt,
6812 new_stmt_vec_info (new_stmt, loop_vinfo));
6815 else
6817 /* iv_loop is the loop to be vectorized. Create:
6818 vec_init = [X, X+S, X+2*S, X+3*S] (S = step_expr, X = init_expr) */
6819 stmts = NULL;
6820 new_name = gimple_convert (&stmts, TREE_TYPE (vectype), init_expr);
6822 auto_vec<tree, 32> elts (nunits);
6823 elts.quick_push (new_name);
6824 for (i = 1; i < nunits; i++)
6826 /* Create: new_name_i = new_name + step_expr */
6827 new_name = gimple_build (&stmts, PLUS_EXPR, TREE_TYPE (new_name),
6828 new_name, step_expr);
6829 elts.quick_push (new_name);
6831 /* Create a vector from [new_name_0, new_name_1, ...,
6832 new_name_nunits-1] */
6833 vec_init = gimple_build_vector (&stmts, vectype, elts);
6834 if (stmts)
6836 new_bb = gsi_insert_seq_on_edge_immediate (pe, stmts);
6837 gcc_assert (!new_bb);
6842 /* Create the vector that holds the step of the induction. */
6843 if (nested_in_vect_loop)
6844 /* iv_loop is nested in the loop to be vectorized. Generate:
6845 vec_step = [S, S, S, S] */
6846 new_name = step_expr;
6847 else
6849 /* iv_loop is the loop to be vectorized. Generate:
6850 vec_step = [VF*S, VF*S, VF*S, VF*S] */
6851 gimple_seq seq = NULL;
6852 if (SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr)))
6854 expr = build_int_cst (integer_type_node, vf);
6855 expr = gimple_build (&seq, FLOAT_EXPR, TREE_TYPE (step_expr), expr);
6857 else
6858 expr = build_int_cst (TREE_TYPE (step_expr), vf);
6859 new_name = gimple_build (&seq, MULT_EXPR, TREE_TYPE (step_expr),
6860 expr, step_expr);
6861 if (seq)
6863 new_bb = gsi_insert_seq_on_edge_immediate (pe, seq);
6864 gcc_assert (!new_bb);
6868 t = unshare_expr (new_name);
6869 gcc_assert (CONSTANT_CLASS_P (new_name)
6870 || TREE_CODE (new_name) == SSA_NAME);
6871 new_vec = build_vector_from_val (vectype, t);
6872 vec_step = vect_init_vector (phi, new_vec, vectype, NULL);
6875 /* Create the following def-use cycle:
6876 loop prolog:
6877 vec_init = ...
6878 vec_step = ...
6879 loop:
6880 vec_iv = PHI <vec_init, vec_loop>
6882 STMT
6884 vec_loop = vec_iv + vec_step; */
6886 /* Create the induction-phi that defines the induction-operand. */
6887 vec_dest = vect_get_new_vect_var (vectype, vect_simple_var, "vec_iv_");
6888 induction_phi = create_phi_node (vec_dest, iv_loop->header);
6889 set_vinfo_for_stmt (induction_phi,
6890 new_stmt_vec_info (induction_phi, loop_vinfo));
6891 induc_def = PHI_RESULT (induction_phi);
6893 /* Create the iv update inside the loop */
6894 vec_def = make_ssa_name (vec_dest);
6895 new_stmt = gimple_build_assign (vec_def, PLUS_EXPR, induc_def, vec_step);
6896 gsi_insert_before (&si, new_stmt, GSI_SAME_STMT);
6897 set_vinfo_for_stmt (new_stmt, new_stmt_vec_info (new_stmt, loop_vinfo));
6899 /* Set the arguments of the phi node: */
6900 add_phi_arg (induction_phi, vec_init, pe, UNKNOWN_LOCATION);
6901 add_phi_arg (induction_phi, vec_def, loop_latch_edge (iv_loop),
6902 UNKNOWN_LOCATION);
6904 STMT_VINFO_VEC_STMT (stmt_info) = *vec_stmt = induction_phi;
6906 /* In case the vectorization factor (VF) is bigger than the number
6907 of elements that we can fit in a vectype (nunits), we have to generate
6908 more than one vector stmt - i.e. - we need to "unroll" the
6909 vector stmt by a factor VF/nunits. For more details see documentation
6910 in vectorizable_operation. */
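/* Illustrative sketch (not from the original sources): with VF = 8,
   nunits = 4 and ncopies = 2, the PHI result covers lanes 0..3 as
   [X, X+S, X+2*S, X+3*S]; the second copy is generated as
       vec_iv_1 = vec_iv_0 + [4*S, 4*S, 4*S, 4*S];
   covering lanes 4..7, while the PHI itself steps by [8*S, ...] per
   vector iteration.  */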
6912 if (ncopies > 1)
6914 gimple_seq seq = NULL;
6915 stmt_vec_info prev_stmt_vinfo;
6916 /* FORNOW. This restriction should be relaxed. */
6917 gcc_assert (!nested_in_vect_loop);
6919 /* Create the vector that holds the step of the induction. */
6920 if (SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr)))
6922 expr = build_int_cst (integer_type_node, nunits);
6923 expr = gimple_build (&seq, FLOAT_EXPR, TREE_TYPE (step_expr), expr);
6925 else
6926 expr = build_int_cst (TREE_TYPE (step_expr), nunits);
6927 new_name = gimple_build (&seq, MULT_EXPR, TREE_TYPE (step_expr),
6928 expr, step_expr);
6929 if (seq)
6931 new_bb = gsi_insert_seq_on_edge_immediate (pe, seq);
6932 gcc_assert (!new_bb);
6935 t = unshare_expr (new_name);
6936 gcc_assert (CONSTANT_CLASS_P (new_name)
6937 || TREE_CODE (new_name) == SSA_NAME);
6938 new_vec = build_vector_from_val (vectype, t);
6939 vec_step = vect_init_vector (phi, new_vec, vectype, NULL);
6941 vec_def = induc_def;
6942 prev_stmt_vinfo = vinfo_for_stmt (induction_phi);
6943 for (i = 1; i < ncopies; i++)
6945 /* vec_i = vec_prev + vec_step */
6946 new_stmt = gimple_build_assign (vec_dest, PLUS_EXPR,
6947 vec_def, vec_step);
6948 vec_def = make_ssa_name (vec_dest, new_stmt);
6949 gimple_assign_set_lhs (new_stmt, vec_def);
6951 gsi_insert_before (&si, new_stmt, GSI_SAME_STMT);
6952 set_vinfo_for_stmt (new_stmt,
6953 new_stmt_vec_info (new_stmt, loop_vinfo));
6954 STMT_VINFO_RELATED_STMT (prev_stmt_vinfo) = new_stmt;
6955 prev_stmt_vinfo = vinfo_for_stmt (new_stmt);
6959 if (nested_in_vect_loop)
6961 /* Find the loop-closed exit-phi of the induction, and record
6962 the final vector of induction results: */
6963 exit_phi = NULL;
6964 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, loop_arg)
6966 gimple *use_stmt = USE_STMT (use_p);
6967 if (is_gimple_debug (use_stmt))
6968 continue;
6970 if (!flow_bb_inside_loop_p (iv_loop, gimple_bb (use_stmt)))
6972 exit_phi = use_stmt;
6973 break;
6976 if (exit_phi)
6978 stmt_vec_info stmt_vinfo = vinfo_for_stmt (exit_phi);
6979 /* FORNOW. Currently not supporting the case that an inner-loop induction
6980 is not used in the outer-loop (i.e. only outside the outer-loop). */
6981 gcc_assert (STMT_VINFO_RELEVANT_P (stmt_vinfo)
6982 && !STMT_VINFO_LIVE_P (stmt_vinfo));
6984 STMT_VINFO_VEC_STMT (stmt_vinfo) = new_stmt;
6985 if (dump_enabled_p ())
6987 dump_printf_loc (MSG_NOTE, vect_location,
6988 "vector of inductions after inner-loop:");
6989 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, new_stmt, 0);
6995 if (dump_enabled_p ())
6997 dump_printf_loc (MSG_NOTE, vect_location,
6998 "transform induction: created def-use cycle: ");
6999 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, induction_phi, 0);
7000 dump_gimple_stmt (MSG_NOTE, TDF_SLIM,
7001 SSA_NAME_DEF_STMT (vec_def), 0);
7004 return true;
7007 /* Function vectorizable_live_operation.
7009 STMT computes a value that is used outside the loop. Check if
7010 it can be supported. */
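/* Illustrative sketch (not from the original sources):

       for (i = 0; i < n; i++)
         last = a[i] + 1;
       ... use of LAST after the loop ...

   LAST is "live": after vectorization its final value is obtained by
   extracting the last lane of the last vector copy with a BIT_FIELD_REF
   inserted on the loop exit edge, and the scalar uses outside the loop
   are redirected to that extracted value.  */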
7012 bool
7013 vectorizable_live_operation (gimple *stmt,
7014 gimple_stmt_iterator *gsi ATTRIBUTE_UNUSED,
7015 slp_tree slp_node, int slp_index,
7016 gimple **vec_stmt)
7018 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
7019 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
7020 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
7021 imm_use_iterator imm_iter;
7022 tree lhs, lhs_type, bitsize, vec_bitsize;
7023 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
7024 int nunits = TYPE_VECTOR_SUBPARTS (vectype);
7025 int ncopies;
7026 gimple *use_stmt;
7027 auto_vec<tree> vec_oprnds;
7029 gcc_assert (STMT_VINFO_LIVE_P (stmt_info));
7031 if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_reduction_def)
7032 return false;
7034 /* FORNOW. CHECKME. */
7035 if (nested_in_vect_loop_p (loop, stmt))
7036 return false;
7038 /* If STMT is not relevant and it is a simple assignment and its inputs are
7039 invariant then it can remain in place, unvectorized. The original last
7040 scalar value that it computes will be used. */
7041 if (!STMT_VINFO_RELEVANT_P (stmt_info))
7043 gcc_assert (is_simple_and_all_uses_invariant (stmt, loop_vinfo));
7044 if (dump_enabled_p ())
7045 dump_printf_loc (MSG_NOTE, vect_location,
7046 "statement is simple and uses invariant. Leaving in "
7047 "place.\n");
7048 return true;
7051 if (slp_node)
7052 ncopies = 1;
7053 else
7054 ncopies = vect_get_num_copies (loop_vinfo, vectype);
7056 if (!vec_stmt)
7057 /* No transformation required. */
7058 return true;
7060 /* If stmt has a related stmt, then use that for getting the lhs. */
7061 if (is_pattern_stmt_p (stmt_info))
7062 stmt = STMT_VINFO_RELATED_STMT (stmt_info);
7064 lhs = (is_a <gphi *> (stmt)) ? gimple_phi_result (stmt)
7065 : gimple_get_lhs (stmt);
7066 lhs_type = TREE_TYPE (lhs);
7068 bitsize = (VECTOR_BOOLEAN_TYPE_P (vectype)
7069 ? bitsize_int (TYPE_PRECISION (TREE_TYPE (vectype)))
7070 : TYPE_SIZE (TREE_TYPE (vectype)));
7071 vec_bitsize = TYPE_SIZE (vectype);
7073 /* Get the vectorized lhs of STMT and the lane to use (counted in bits). */
7074 tree vec_lhs, bitstart;
7075 if (slp_node)
7077 gcc_assert (slp_index >= 0);
7079 int num_scalar = SLP_TREE_SCALAR_STMTS (slp_node).length ();
7080 int num_vec = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node);
7082 /* Get the last occurrence of the scalar index from the concatenation of
7083 all the slp vectors. Calculate which slp vector it is and the index
7084 within. */
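/* Worked example (numbers invented): num_vec = 2, nunits = 4,
   num_scalar = 6, slp_index = 1 gives pos = 2 * 4 - 6 + 1 = 3, so
   vec_entry = 0 and vec_index = 3, i.e. the live value is lane 3 of the
   first vectorized stmt.  */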
7085 int pos = (num_vec * nunits) - num_scalar + slp_index;
7086 int vec_entry = pos / nunits;
7087 int vec_index = pos % nunits;
7089 /* Get the correct slp vectorized stmt. */
7090 vec_lhs = gimple_get_lhs (SLP_TREE_VEC_STMTS (slp_node)[vec_entry]);
7092 /* Get entry to use. */
7093 bitstart = bitsize_int (vec_index);
7094 bitstart = int_const_binop (MULT_EXPR, bitsize, bitstart);
7096 else
7098 enum vect_def_type dt = STMT_VINFO_DEF_TYPE (stmt_info);
7099 vec_lhs = vect_get_vec_def_for_operand_1 (stmt, dt);
7101 /* For multiple copies, get the last copy. */
7102 for (int i = 1; i < ncopies; ++i)
7103 vec_lhs = vect_get_vec_def_for_stmt_copy (vect_unknown_def_type,
7104 vec_lhs);
7106 /* Get the last lane in the vector. */
7107 bitstart = int_const_binop (MINUS_EXPR, vec_bitsize, bitsize);
7110 /* Create a new vectorized stmt for the uses of STMT and insert outside the
7111 loop. */
7112 gimple_seq stmts = NULL;
7113 tree bftype = TREE_TYPE (vectype);
7114 if (VECTOR_BOOLEAN_TYPE_P (vectype))
7115 bftype = build_nonstandard_integer_type (tree_to_uhwi (bitsize), 1);
7116 tree new_tree = build3 (BIT_FIELD_REF, bftype, vec_lhs, bitsize, bitstart);
7117 new_tree = force_gimple_operand (fold_convert (lhs_type, new_tree), &stmts,
7118 true, NULL_TREE);
7119 if (stmts)
7120 gsi_insert_seq_on_edge_immediate (single_exit (loop), stmts);
7122 /* Replace uses of lhs with the newly computed result. If the use stmt is a
7123 single-arg PHI, just replace all uses of the PHI result. This is necessary
7124 because the lcssa PHI defining lhs may appear before the newly inserted stmt. */
7125 use_operand_p use_p;
7126 FOR_EACH_IMM_USE_STMT (use_stmt, imm_iter, lhs)
7127 if (!flow_bb_inside_loop_p (loop, gimple_bb (use_stmt))
7128 && !is_gimple_debug (use_stmt))
7130 if (gimple_code (use_stmt) == GIMPLE_PHI
7131 && gimple_phi_num_args (use_stmt) == 1)
7133 replace_uses_by (gimple_phi_result (use_stmt), new_tree);
7135 else
7137 FOR_EACH_IMM_USE_ON_STMT (use_p, imm_iter)
7138 SET_USE (use_p, new_tree);
7140 update_stmt (use_stmt);
7143 return true;
7146 /* Kill any debug uses outside LOOP of SSA names defined in STMT. */
7148 static void
7149 vect_loop_kill_debug_uses (struct loop *loop, gimple *stmt)
7151 ssa_op_iter op_iter;
7152 imm_use_iterator imm_iter;
7153 def_operand_p def_p;
7154 gimple *ustmt;
7156 FOR_EACH_PHI_OR_STMT_DEF (def_p, stmt, op_iter, SSA_OP_DEF)
7158 FOR_EACH_IMM_USE_STMT (ustmt, imm_iter, DEF_FROM_PTR (def_p))
7160 basic_block bb;
7162 if (!is_gimple_debug (ustmt))
7163 continue;
7165 bb = gimple_bb (ustmt);
7167 if (!flow_bb_inside_loop_p (loop, bb))
7169 if (gimple_debug_bind_p (ustmt))
7171 if (dump_enabled_p ())
7172 dump_printf_loc (MSG_NOTE, vect_location,
7173 "killing debug use\n");
7175 gimple_debug_bind_reset_value (ustmt);
7176 update_stmt (ustmt);
7178 else
7179 gcc_unreachable ();
7185 /* Given a loop represented by LOOP_VINFO, return true if the computation of
7186 LOOP_VINFO_NITERS (= LOOP_VINFO_NITERSM1 + 1) doesn't overflow, false
7187 otherwise. */
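/* Since NITERS is computed as NITERSM1 + 1 in the same type, it can only
   overflow when NITERSM1 is the maximum value of that type, in which case
   NITERS wraps around to zero; the constant case below therefore simply
   checks that NITERSM1 < NITERS.  */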
7189 static bool
7190 loop_niters_no_overflow (loop_vec_info loop_vinfo)
7192 /* Constant case. */
7193 if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
7195 tree cst_niters = LOOP_VINFO_NITERS (loop_vinfo);
7196 tree cst_nitersm1 = LOOP_VINFO_NITERSM1 (loop_vinfo);
7198 gcc_assert (TREE_CODE (cst_niters) == INTEGER_CST);
7199 gcc_assert (TREE_CODE (cst_nitersm1) == INTEGER_CST);
7200 if (wi::to_widest (cst_nitersm1) < wi::to_widest (cst_niters))
7201 return true;
7204 widest_int max;
7205 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
7206 /* Check the upper bound of loop niters. */
7207 if (get_max_loop_iterations (loop, &max))
7209 tree type = TREE_TYPE (LOOP_VINFO_NITERS (loop_vinfo));
7210 signop sgn = TYPE_SIGN (type);
7211 widest_int type_max = widest_int::from (wi::max_value (type), sgn);
7212 if (max < type_max)
7213 return true;
7215 return false;
7218 /* Scale the profiling counters of LOOP, which is vectorized by factor VF,
7219 according to the estimated iteration count of the vectorized loop. */
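/* The loop body counts are scaled by
   p = preheader count * (new_est_niter + 1) / header count,
   so that afterwards the header count corresponds to roughly
   new_est_niter + 1 executions per entry into the loop.  */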
7221 static void
7222 scale_profile_for_vect_loop (struct loop *loop, unsigned vf)
7224 edge preheader = loop_preheader_edge (loop);
7225 /* Reduce loop iterations by the vectorization factor. */
7226 gcov_type new_est_niter = niter_for_unrolled_loop (loop, vf);
7227 profile_count freq_h = loop->header->count, freq_e = preheader->count;
7229 /* Use frequency only if counts are zero. */
7230 if (!(freq_h > 0) && !(freq_e > 0))
7232 freq_h = profile_count::from_gcov_type (loop->header->frequency);
7233 freq_e = profile_count::from_gcov_type (EDGE_FREQUENCY (preheader));
7235 if (freq_h > 0)
7237 profile_probability p;
7239 /* Avoid dropping loop body profile counter to 0 because of zero count
7240 in loop's preheader. */
7241 if (!(freq_e > profile_count::from_gcov_type (1)))
7242 freq_e = profile_count::from_gcov_type (1);
7243 p = freq_e.apply_scale (new_est_niter + 1, 1).probability_in (freq_h);
7244 scale_loop_frequencies (loop, p);
7247 basic_block exit_bb = single_pred (loop->latch);
7248 edge exit_e = single_exit (loop);
7249 exit_e->count = loop_preheader_edge (loop)->count;
7250 exit_e->probability = profile_probability::always ()
7251 .apply_scale (1, new_est_niter + 1);
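/* The loop exits once per entry, so the exit edge keeps the preheader
   count and gets probability 1 / (new_est_niter + 1) relative to the
   scaled header count.  */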
7253 edge exit_l = single_pred_edge (loop->latch);
7254 profile_probability prob = exit_l->probability;
7255 exit_l->probability = exit_e->probability.invert ();
7256 exit_l->count = exit_bb->count - exit_e->count;
7257 if (prob.initialized_p () && exit_l->probability.initialized_p ())
7258 scale_bbs_frequencies (&loop->latch, 1, exit_l->probability / prob);
7261 /* Function vect_transform_loop.
7263 The analysis phase has determined that the loop is vectorizable.
7264 Vectorize the loop: create vectorized stmts to replace the scalar
7265 stmts in the loop, and update the loop exit condition.
7266 Returns the scalar epilogue loop, if any, otherwise NULL. */
7268 struct loop *
7269 vect_transform_loop (loop_vec_info loop_vinfo)
7271 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
7272 struct loop *epilogue = NULL;
7273 basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
7274 int nbbs = loop->num_nodes;
7275 int i;
7276 tree niters_vector = NULL;
7277 int vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
7278 bool grouped_store;
7279 bool slp_scheduled = false;
7280 gimple *stmt, *pattern_stmt;
7281 gimple_seq pattern_def_seq = NULL;
7282 gimple_stmt_iterator pattern_def_si = gsi_none ();
7283 bool transform_pattern_stmt = false;
7284 bool check_profitability = false;
7285 int th;
7287 if (dump_enabled_p ())
7288 dump_printf_loc (MSG_NOTE, vect_location, "=== vect_transform_loop ===\n");
7290 /* Use the more conservative vectorization threshold. If the number
7291 of iterations is constant, assume the cost check has been performed
7292 by our caller. If the threshold makes all loops profitable that
7293 run at least the vectorization factor number of times, checking
7294 is pointless too. */
7295 th = LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo);
7296 if (th >= LOOP_VINFO_VECT_FACTOR (loop_vinfo)
7297 && !LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
7299 if (dump_enabled_p ())
7300 dump_printf_loc (MSG_NOTE, vect_location,
7301 "Profitability threshold is %d loop iterations.\n",
7302 th);
7303 check_profitability = true;
7306 /* Make sure there exists a single-predecessor exit bb. Do this before
7307 versioning. */
7308 edge e = single_exit (loop);
7309 if (! single_pred_p (e->dest))
7311 split_loop_exit_edge (e);
7312 if (dump_enabled_p ())
7313 dump_printf (MSG_NOTE, "split exit edge\n");
7316 /* Version the loop first, if required, so the profitability check
7317 comes first. */
7319 if (LOOP_REQUIRES_VERSIONING (loop_vinfo))
7321 vect_loop_versioning (loop_vinfo, th, check_profitability);
7322 check_profitability = false;
7325 /* Make sure there exists a single-predecessor exit bb also on the
7326 scalar loop copy. Do this after versioning but before peeling
7327 so the CFG structure is fine for both the scalar and the if-converted
7328 loop, and slpeel_duplicate_current_defs_from_edges sees matched
7329 loop-closed PHI nodes on the exit. */
7330 if (LOOP_VINFO_SCALAR_LOOP (loop_vinfo))
7332 e = single_exit (LOOP_VINFO_SCALAR_LOOP (loop_vinfo));
7333 if (! single_pred_p (e->dest))
7335 split_loop_exit_edge (e);
7336 if (dump_enabled_p ())
7337 dump_printf (MSG_NOTE, "split exit edge of scalar loop\n");
7341 tree niters = vect_build_loop_niters (loop_vinfo);
7342 LOOP_VINFO_NITERS_UNCHANGED (loop_vinfo) = niters;
7343 tree nitersm1 = unshare_expr (LOOP_VINFO_NITERSM1 (loop_vinfo));
7344 bool niters_no_overflow = loop_niters_no_overflow (loop_vinfo);
7345 epilogue = vect_do_peeling (loop_vinfo, niters, nitersm1, &niters_vector, th,
7346 check_profitability, niters_no_overflow);
7347 if (niters_vector == NULL_TREE)
7349 if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
7350 niters_vector
7351 = build_int_cst (TREE_TYPE (LOOP_VINFO_NITERS (loop_vinfo)),
7352 LOOP_VINFO_INT_NITERS (loop_vinfo) / vf);
7353 else
7354 vect_gen_vector_loop_niters (loop_vinfo, niters, &niters_vector,
7355 niters_no_overflow);
7358 /* 1) Make sure the loop header has exactly two entries
7359 2) Make sure we have a preheader basic block. */
7361 gcc_assert (EDGE_COUNT (loop->header->preds) == 2);
7363 split_edge (loop_preheader_edge (loop));
7365 /* FORNOW: the vectorizer supports only loops whose body consists
7366 of one basic block (header + empty latch). When the vectorizer
7367 supports more involved loop forms, the order in which the BBs are
7368 traversed will need to be reconsidered. */
7370 for (i = 0; i < nbbs; i++)
7372 basic_block bb = bbs[i];
7373 stmt_vec_info stmt_info;
7375 for (gphi_iterator si = gsi_start_phis (bb); !gsi_end_p (si);
7376 gsi_next (&si))
7378 gphi *phi = si.phi ();
7379 if (dump_enabled_p ())
7381 dump_printf_loc (MSG_NOTE, vect_location,
7382 "------>vectorizing phi: ");
7383 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, phi, 0);
7385 stmt_info = vinfo_for_stmt (phi);
7386 if (!stmt_info)
7387 continue;
7389 if (MAY_HAVE_DEBUG_STMTS && !STMT_VINFO_LIVE_P (stmt_info))
7390 vect_loop_kill_debug_uses (loop, phi);
7392 if (!STMT_VINFO_RELEVANT_P (stmt_info)
7393 && !STMT_VINFO_LIVE_P (stmt_info))
7394 continue;
7396 if (STMT_VINFO_VECTYPE (stmt_info)
7397 && (TYPE_VECTOR_SUBPARTS (STMT_VINFO_VECTYPE (stmt_info))
7398 != (unsigned HOST_WIDE_INT) vf)
7399 && dump_enabled_p ())
7400 dump_printf_loc (MSG_NOTE, vect_location, "multiple-types.\n");
7402 if ((STMT_VINFO_DEF_TYPE (stmt_info) == vect_induction_def
7403 || STMT_VINFO_DEF_TYPE (stmt_info) == vect_reduction_def
7404 || STMT_VINFO_DEF_TYPE (stmt_info) == vect_nested_cycle)
7405 && ! PURE_SLP_STMT (stmt_info))
7407 if (dump_enabled_p ())
7408 dump_printf_loc (MSG_NOTE, vect_location, "transform phi.\n");
7409 vect_transform_stmt (phi, NULL, NULL, NULL, NULL);
7413 pattern_stmt = NULL;
7414 for (gimple_stmt_iterator si = gsi_start_bb (bb);
7415 !gsi_end_p (si) || transform_pattern_stmt;)
7417 bool is_store;
7419 if (transform_pattern_stmt)
7420 stmt = pattern_stmt;
7421 else
7423 stmt = gsi_stmt (si);
7424 /* During vectorization remove existing clobber stmts. */
7425 if (gimple_clobber_p (stmt))
7427 unlink_stmt_vdef (stmt);
7428 gsi_remove (&si, true);
7429 release_defs (stmt);
7430 continue;
7434 if (dump_enabled_p ())
7436 dump_printf_loc (MSG_NOTE, vect_location,
7437 "------>vectorizing statement: ");
7438 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, stmt, 0);
7441 stmt_info = vinfo_for_stmt (stmt);
7443 /* vector stmts created in the outer-loop during vectorization of
7444 stmts in an inner-loop may not have a stmt_info, and do not
7445 need to be vectorized. */
7446 if (!stmt_info)
7448 gsi_next (&si);
7449 continue;
7452 if (MAY_HAVE_DEBUG_STMTS && !STMT_VINFO_LIVE_P (stmt_info))
7453 vect_loop_kill_debug_uses (loop, stmt);
7455 if (!STMT_VINFO_RELEVANT_P (stmt_info)
7456 && !STMT_VINFO_LIVE_P (stmt_info))
7458 if (STMT_VINFO_IN_PATTERN_P (stmt_info)
7459 && (pattern_stmt = STMT_VINFO_RELATED_STMT (stmt_info))
7460 && (STMT_VINFO_RELEVANT_P (vinfo_for_stmt (pattern_stmt))
7461 || STMT_VINFO_LIVE_P (vinfo_for_stmt (pattern_stmt))))
7463 stmt = pattern_stmt;
7464 stmt_info = vinfo_for_stmt (stmt);
7466 else
7468 gsi_next (&si);
7469 continue;
7472 else if (STMT_VINFO_IN_PATTERN_P (stmt_info)
7473 && (pattern_stmt = STMT_VINFO_RELATED_STMT (stmt_info))
7474 && (STMT_VINFO_RELEVANT_P (vinfo_for_stmt (pattern_stmt))
7475 || STMT_VINFO_LIVE_P (vinfo_for_stmt (pattern_stmt))))
7476 transform_pattern_stmt = true;
7478 /* If pattern statement has def stmts, vectorize them too. */
7479 if (is_pattern_stmt_p (stmt_info))
7481 if (pattern_def_seq == NULL)
7483 pattern_def_seq = STMT_VINFO_PATTERN_DEF_SEQ (stmt_info);
7484 pattern_def_si = gsi_start (pattern_def_seq);
7486 else if (!gsi_end_p (pattern_def_si))
7487 gsi_next (&pattern_def_si);
7488 if (pattern_def_seq != NULL)
7490 gimple *pattern_def_stmt = NULL;
7491 stmt_vec_info pattern_def_stmt_info = NULL;
7493 while (!gsi_end_p (pattern_def_si))
7495 pattern_def_stmt = gsi_stmt (pattern_def_si);
7496 pattern_def_stmt_info
7497 = vinfo_for_stmt (pattern_def_stmt);
7498 if (STMT_VINFO_RELEVANT_P (pattern_def_stmt_info)
7499 || STMT_VINFO_LIVE_P (pattern_def_stmt_info))
7500 break;
7501 gsi_next (&pattern_def_si);
7504 if (!gsi_end_p (pattern_def_si))
7506 if (dump_enabled_p ())
7508 dump_printf_loc (MSG_NOTE, vect_location,
7509 "==> vectorizing pattern def "
7510 "stmt: ");
7511 dump_gimple_stmt (MSG_NOTE, TDF_SLIM,
7512 pattern_def_stmt, 0);
7515 stmt = pattern_def_stmt;
7516 stmt_info = pattern_def_stmt_info;
7518 else
7520 pattern_def_si = gsi_none ();
7521 transform_pattern_stmt = false;
7524 else
7525 transform_pattern_stmt = false;
7528 if (STMT_VINFO_VECTYPE (stmt_info))
7530 unsigned int nunits
7531 = (unsigned int)
7532 TYPE_VECTOR_SUBPARTS (STMT_VINFO_VECTYPE (stmt_info));
7533 if (!STMT_SLP_TYPE (stmt_info)
7534 && nunits != (unsigned int) vf
7535 && dump_enabled_p ())
7536 /* For SLP, VF is set according to the unrolling factor, not to the
7537 vector size, hence for SLP this message is not valid. */
7538 dump_printf_loc (MSG_NOTE, vect_location, "multiple-types.\n");
7541 /* SLP. Schedule all the SLP instances when the first SLP stmt is
7542 reached. */
7543 if (STMT_SLP_TYPE (stmt_info))
7545 if (!slp_scheduled)
7547 slp_scheduled = true;
7549 if (dump_enabled_p ())
7550 dump_printf_loc (MSG_NOTE, vect_location,
7551 "=== scheduling SLP instances ===\n");
7553 vect_schedule_slp (loop_vinfo);
7556 /* Hybrid SLP stmts must be vectorized in addition to SLP. */
7557 if (!vinfo_for_stmt (stmt) || PURE_SLP_STMT (stmt_info))
7559 if (!transform_pattern_stmt && gsi_end_p (pattern_def_si))
7561 pattern_def_seq = NULL;
7562 gsi_next (&si);
7564 continue;
7568 /* -------- vectorize statement ------------ */
7569 if (dump_enabled_p ())
7570 dump_printf_loc (MSG_NOTE, vect_location, "transform statement.\n");
7572 grouped_store = false;
7573 is_store = vect_transform_stmt (stmt, &si, &grouped_store, NULL, NULL);
7574 if (is_store)
7576 if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
7578 /* Interleaving. If IS_STORE is TRUE, the vectorization of the
7579 interleaving chain was completed - free all the stores in
7580 the chain. */
7581 gsi_next (&si);
7582 vect_remove_stores (GROUP_FIRST_ELEMENT (stmt_info));
7584 else
7586 /* Free the attached stmt_vec_info and remove the stmt. */
7587 gimple *store = gsi_stmt (si);
7588 free_stmt_vec_info (store);
7589 unlink_stmt_vdef (store);
7590 gsi_remove (&si, true);
7591 release_defs (store);
7594 /* Stores can only appear at the end of pattern statements. */
7595 gcc_assert (!transform_pattern_stmt);
7596 pattern_def_seq = NULL;
7598 else if (!transform_pattern_stmt && gsi_end_p (pattern_def_si))
7600 pattern_def_seq = NULL;
7601 gsi_next (&si);
7603 } /* stmts in BB */
7604 } /* BBs in loop */
7606 slpeel_make_loop_iterate_ntimes (loop, niters_vector);
7608 scale_profile_for_vect_loop (loop, vf);
7610 /* The minimum number of iterations performed by the epilogue. This
7611 is 1 when peeling for gaps because we always need a final scalar
7612 iteration. */
7613 int min_epilogue_iters = LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo) ? 1 : 0;
7614 /* +1 to convert latch counts to loop iteration counts,
7615 -min_epilogue_iters to remove iterations that cannot be performed
7616 by the vector code. */
7617 int bias = 1 - min_epilogue_iters;
7618 /* In these calculations the "- 1" converts loop iteration counts
7619 back to latch counts. */
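/* For instance, with vf == 4 and no peeling for gaps (bias == 1), a known
   upper bound of 11 latch iterations (12 loop iterations) becomes
   (11 + 1) / 4 - 1 == 2 latch iterations of the vector loop; with peeling
   for gaps (bias == 0) it becomes 11 / 4 - 1 == 1, reserving at least one
   scalar iteration for the epilogue.  */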
7620 if (loop->any_upper_bound)
7621 loop->nb_iterations_upper_bound
7622 = wi::udiv_floor (loop->nb_iterations_upper_bound + bias, vf) - 1;
7623 if (loop->any_likely_upper_bound)
7624 loop->nb_iterations_likely_upper_bound
7625 = wi::udiv_floor (loop->nb_iterations_likely_upper_bound + bias, vf) - 1;
7626 if (loop->any_estimate)
7627 loop->nb_iterations_estimate
7628 = wi::udiv_floor (loop->nb_iterations_estimate + bias, vf) - 1;
7630 if (dump_enabled_p ())
7632 if (!LOOP_VINFO_EPILOGUE_P (loop_vinfo))
7634 dump_printf_loc (MSG_NOTE, vect_location,
7635 "LOOP VECTORIZED\n");
7636 if (loop->inner)
7637 dump_printf_loc (MSG_NOTE, vect_location,
7638 "OUTER LOOP VECTORIZED\n");
7639 dump_printf (MSG_NOTE, "\n");
7641 else
7642 dump_printf_loc (MSG_NOTE, vect_location,
7643 "LOOP EPILOGUE VECTORIZED (VS=%d)\n",
7644 current_vector_size);
7647 /* Free SLP instances here because otherwise stmt reference counting
7648 won't work. */
7649 slp_instance instance;
7650 FOR_EACH_VEC_ELT (LOOP_VINFO_SLP_INSTANCES (loop_vinfo), i, instance)
7651 vect_free_slp_instance (instance);
7652 LOOP_VINFO_SLP_INSTANCES (loop_vinfo).release ();
7653 /* Clear the safelen field since its value is invalid after vectorization,
7654 as the vectorized loop can have loop-carried dependencies. */
7655 loop->safelen = 0;
7657 /* Don't vectorize the epilogue of an epilogue loop. */
7658 if (LOOP_VINFO_EPILOGUE_P (loop_vinfo))
7659 epilogue = NULL;
7661 if (epilogue)
7663 unsigned int vector_sizes
7664 = targetm.vectorize.autovectorize_vector_sizes ();
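/* vector_sizes is a bitmask of power-of-two vector sizes, so masking with
   current_vector_size - 1 keeps only the sizes strictly smaller than the
   current one.  */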
7665 vector_sizes &= current_vector_size - 1;
7667 if (!PARAM_VALUE (PARAM_VECT_EPILOGUES_NOMASK))
7668 epilogue = NULL;
7669 else if (!vector_sizes)
7670 epilogue = NULL;
7671 else if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
7672 && LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) >= 0)
7674 int smallest_vec_size = 1 << ctz_hwi (vector_sizes);
7675 int ratio = current_vector_size / smallest_vec_size;
7676 int eiters = LOOP_VINFO_INT_NITERS (loop_vinfo)
7677 - LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo);
7678 eiters = eiters % vf;
7680 epilogue->nb_iterations_upper_bound = eiters - 1;
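/* If fewer iterations remain than the vectorization factor of the smallest
   supported vector size (vf / ratio), a vectorized epilogue could not run
   even once.  E.g. with current_vector_size == 32 and 16-byte vectors also
   supported (hypothetical numbers), ratio == 2 and the epilogue is dropped
   whenever eiters < vf / 2.  */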
7682 if (eiters < vf / ratio)
7683 epilogue = NULL;
7687 if (epilogue)
7689 epilogue->force_vectorize = loop->force_vectorize;
7690 epilogue->safelen = loop->safelen;
7691 epilogue->dont_vectorize = false;
7693 /* We may need to if-convert the epilogue in order to vectorize it. */
7694 if (LOOP_VINFO_SCALAR_LOOP (loop_vinfo))
7695 tree_if_conversion (epilogue);
7698 return epilogue;
7701 /* The code below tries to perform a simple optimization: revert
7702 if-conversion for masked stores, i.e. if the mask of a store is zero,
7703 skip the store and, if possible, the producers of the stored values as well.
7704 For example,
7705 for (i=0; i<n; i++)
7706 if (c[i])
7708 p1[i] += 1;
7709 p2[i] = p3[i] + 2;
7711 this transformation will produce the following semi-hammock:
7713 if (!mask__ifc__42.18_165 == { 0, 0, 0, 0, 0, 0, 0, 0 })
7715 vect__11.19_170 = MASK_LOAD (vectp_p1.20_168, 0B, mask__ifc__42.18_165);
7716 vect__12.22_172 = vect__11.19_170 + vect_cst__171;
7717 MASK_STORE (vectp_p1.23_175, 0B, mask__ifc__42.18_165, vect__12.22_172);
7718 vect__18.25_182 = MASK_LOAD (vectp_p3.26_180, 0B, mask__ifc__42.18_165);
7719 vect__19.28_184 = vect__18.25_182 + vect_cst__183;
7720 MASK_STORE (vectp_p2.29_187, 0B, mask__ifc__42.18_165, vect__19.28_184);
7724 void
7725 optimize_mask_stores (struct loop *loop)
7727 basic_block *bbs = get_loop_body (loop);
7728 unsigned nbbs = loop->num_nodes;
7729 unsigned i;
7730 basic_block bb;
7731 struct loop *bb_loop;
7732 gimple_stmt_iterator gsi;
7733 gimple *stmt;
7734 auto_vec<gimple *> worklist;
7736 vect_location = find_loop_location (loop);
7737 /* Pick up all masked stores in the loop, if any. */
7738 for (i = 0; i < nbbs; i++)
7740 bb = bbs[i];
7741 for (gsi = gsi_start_bb (bb); !gsi_end_p (gsi);
7742 gsi_next (&gsi))
7744 stmt = gsi_stmt (gsi);
7745 if (gimple_call_internal_p (stmt, IFN_MASK_STORE))
7746 worklist.safe_push (stmt);
7750 free (bbs);
7751 if (worklist.is_empty ())
7752 return;
7754 /* Loop has masked stores. */
7755 while (!worklist.is_empty ())
7757 gimple *last, *last_store;
7758 edge e, efalse;
7759 tree mask;
7760 basic_block store_bb, join_bb;
7761 gimple_stmt_iterator gsi_to;
7762 tree vdef, new_vdef;
7763 gphi *phi;
7764 tree vectype;
7765 tree zero;
7767 last = worklist.pop ();
7768 mask = gimple_call_arg (last, 2);
7769 bb = gimple_bb (last);
7770 /* Create then_bb and the if-then structure in the CFG; then_bb belongs to
7771 the same loop as if_bb. That loop could be different from LOOP when a
7772 two-level loop nest is vectorized and the mask_store belongs to the
7773 inner one. */
7774 e = split_block (bb, last);
7775 bb_loop = bb->loop_father;
7776 gcc_assert (loop == bb_loop || flow_loop_nested_p (loop, bb_loop));
7777 join_bb = e->dest;
7778 store_bb = create_empty_bb (bb);
7779 add_bb_to_loop (store_bb, bb_loop);
7780 e->flags = EDGE_TRUE_VALUE;
7781 efalse = make_edge (bb, store_bb, EDGE_FALSE_VALUE);
7782 /* Put STORE_BB to likely part. */
7783 efalse->probability = profile_probability::unlikely ();
7784 store_bb->frequency = PROB_ALWAYS - EDGE_FREQUENCY (efalse);
7785 make_single_succ_edge (store_bb, join_bb, EDGE_FALLTHRU);
7786 if (dom_info_available_p (CDI_DOMINATORS))
7787 set_immediate_dominator (CDI_DOMINATORS, store_bb, bb);
7788 if (dump_enabled_p ())
7789 dump_printf_loc (MSG_NOTE, vect_location,
7790 "Create new block %d to sink mask stores.",
7791 store_bb->index);
7792 /* Create vector comparison with boolean result. */
7793 vectype = TREE_TYPE (mask);
7794 zero = build_zero_cst (vectype);
7795 stmt = gimple_build_cond (EQ_EXPR, mask, zero, NULL_TREE, NULL_TREE);
7796 gsi = gsi_last_bb (bb);
7797 gsi_insert_after (&gsi, stmt, GSI_SAME_STMT);
7798 /* Create new PHI node for vdef of the last masked store:
7799 .MEM_2 = VDEF <.MEM_1>
7800 will be converted to
7801 .MEM.3 = VDEF <.MEM_1>
7802 and new PHI node will be created in join bb
7803 .MEM_2 = PHI <.MEM_1, .MEM_3>
7805 vdef = gimple_vdef (last);
7806 new_vdef = make_ssa_name (gimple_vop (cfun), last);
7807 gimple_set_vdef (last, new_vdef);
7808 phi = create_phi_node (vdef, join_bb);
7809 add_phi_arg (phi, new_vdef, EDGE_SUCC (store_bb, 0), UNKNOWN_LOCATION);
7811 /* Put all masked stores with the same mask to STORE_BB if possible. */
7812 while (true)
7814 gimple_stmt_iterator gsi_from;
7815 gimple *stmt1 = NULL;
7817 /* Move masked store to STORE_BB. */
7818 last_store = last;
7819 gsi = gsi_for_stmt (last);
7820 gsi_from = gsi;
7821 /* Shift GSI to the previous stmt for further traversal. */
7822 gsi_prev (&gsi);
7823 gsi_to = gsi_start_bb (store_bb);
7824 gsi_move_before (&gsi_from, &gsi_to);
7825 /* Set GSI_TO to the start of the now non-empty block. */
7826 gsi_to = gsi_start_bb (store_bb);
7827 if (dump_enabled_p ())
7829 dump_printf_loc (MSG_NOTE, vect_location,
7830 "Move stmt to created bb\n");
7831 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, last, 0);
7833 /* Move all stored value producers if possible. */
7834 while (!gsi_end_p (gsi))
7836 tree lhs;
7837 imm_use_iterator imm_iter;
7838 use_operand_p use_p;
7839 bool res;
7841 /* Skip debug statements. */
7842 if (is_gimple_debug (gsi_stmt (gsi)))
7844 gsi_prev (&gsi);
7845 continue;
7847 stmt1 = gsi_stmt (gsi);
7848 /* Do not consider statements writing to memory or having a
7849 volatile operand. */
7850 if (gimple_vdef (stmt1)
7851 || gimple_has_volatile_ops (stmt1))
7852 break;
7853 gsi_from = gsi;
7854 gsi_prev (&gsi);
7855 lhs = gimple_get_lhs (stmt1);
7856 if (!lhs)
7857 break;
7859 /* LHS of vectorized stmt must be SSA_NAME. */
7860 if (TREE_CODE (lhs) != SSA_NAME)
7861 break;
7863 if (!VECTOR_TYPE_P (TREE_TYPE (lhs)))
7865 /* Remove dead scalar statement. */
7866 if (has_zero_uses (lhs))
7868 gsi_remove (&gsi_from, true);
7869 continue;
7873 /* Check that LHS does not have uses outside of STORE_BB. */
7874 res = true;
7875 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, lhs)
7877 gimple *use_stmt;
7878 use_stmt = USE_STMT (use_p);
7879 if (is_gimple_debug (use_stmt))
7880 continue;
7881 if (gimple_bb (use_stmt) != store_bb)
7883 res = false;
7884 break;
7887 if (!res)
7888 break;
7890 if (gimple_vuse (stmt1)
7891 && gimple_vuse (stmt1) != gimple_vuse (last_store))
7892 break;
7894 /* Can move STMT1 to STORE_BB. */
7895 if (dump_enabled_p ())
7897 dump_printf_loc (MSG_NOTE, vect_location,
7898 "Move stmt to created bb\n");
7899 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, stmt1, 0);
7901 gsi_move_before (&gsi_from, &gsi_to);
7902 /* Shift GSI_TO for further insertion. */
7903 gsi_prev (&gsi_to);
7905 /* Put other masked stores with the same mask to STORE_BB. */
7906 if (worklist.is_empty ()
7907 || gimple_call_arg (worklist.last (), 2) != mask
7908 || worklist.last () != stmt1)
7909 break;
7910 last = worklist.pop ();
7912 add_phi_arg (phi, gimple_vuse (last_store), e, UNKNOWN_LOCATION);