gcc/tree-vect-loop.c
1 /* Loop Vectorization
2 Copyright (C) 2003-2017 Free Software Foundation, Inc.
3 Contributed by Dorit Naishlos <dorit@il.ibm.com> and
4 Ira Rosen <irar@il.ibm.com>
6 This file is part of GCC.
8 GCC is free software; you can redistribute it and/or modify it under
9 the terms of the GNU General Public License as published by the Free
10 Software Foundation; either version 3, or (at your option) any later
11 version.
13 GCC is distributed in the hope that it will be useful, but WITHOUT ANY
14 WARRANTY; without even the implied warranty of MERCHANTABILITY or
15 FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
16 for more details.
18 You should have received a copy of the GNU General Public License
19 along with GCC; see the file COPYING3. If not see
20 <http://www.gnu.org/licenses/>. */
22 #include "config.h"
23 #include "system.h"
24 #include "coretypes.h"
25 #include "backend.h"
26 #include "target.h"
27 #include "rtl.h"
28 #include "tree.h"
29 #include "gimple.h"
30 #include "cfghooks.h"
31 #include "tree-pass.h"
32 #include "ssa.h"
33 #include "optabs-tree.h"
34 #include "diagnostic-core.h"
35 #include "fold-const.h"
36 #include "stor-layout.h"
37 #include "cfganal.h"
38 #include "gimplify.h"
39 #include "gimple-iterator.h"
40 #include "gimplify-me.h"
41 #include "tree-ssa-loop-ivopts.h"
42 #include "tree-ssa-loop-manip.h"
43 #include "tree-ssa-loop-niter.h"
44 #include "tree-ssa-loop.h"
45 #include "cfgloop.h"
46 #include "params.h"
47 #include "tree-scalar-evolution.h"
48 #include "tree-vectorizer.h"
49 #include "gimple-fold.h"
50 #include "cgraph.h"
51 #include "tree-cfg.h"
52 #include "tree-if-conv.h"
54 /* Loop Vectorization Pass.
56 This pass tries to vectorize loops.
58 For example, the vectorizer transforms the following simple loop:
60 short a[N]; short b[N]; short c[N]; int i;
62 for (i=0; i<N; i++){
63 a[i] = b[i] + c[i];
 66    as if it were manually vectorized by rewriting the source code into:
68 typedef int __attribute__((mode(V8HI))) v8hi;
69 short a[N]; short b[N]; short c[N]; int i;
70 v8hi *pa = (v8hi*)a, *pb = (v8hi*)b, *pc = (v8hi*)c;
71 v8hi va, vb, vc;
73 for (i=0; i<N/8; i++){
74 vb = pb[i];
75 vc = pc[i];
76 va = vb + vc;
77 pa[i] = va;
80 The main entry to this pass is vectorize_loops(), in which
81 the vectorizer applies a set of analyses on a given set of loops,
82 followed by the actual vectorization transformation for the loops that
 83    have successfully passed the analysis phase.
84 Throughout this pass we make a distinction between two types of
85 data: scalars (which are represented by SSA_NAMES), and memory references
86 ("data-refs"). These two types of data require different handling both
87 during analysis and transformation. The types of data-refs that the
 88    vectorizer currently supports are ARRAY_REFS whose base is an array DECL
89 (not a pointer), and INDIRECT_REFS through pointers; both array and pointer
90 accesses are required to have a simple (consecutive) access pattern.
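     (A hedged illustration of the supported forms: an access such as a[i],
     where 'a' is an array DECL, is the ARRAY_REF case, while p[i] through a
     pointer 'p' is the INDIRECT_REF-style case; in both, the address must
     advance consecutively with 'i', so an access like a[2*i] would not
     qualify.)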
92 Analysis phase:
93 ===============
94 The driver for the analysis phase is vect_analyze_loop().
95 It applies a set of analyses, some of which rely on the scalar evolution
96 analyzer (scev) developed by Sebastian Pop.
98 During the analysis phase the vectorizer records some information
99 per stmt in a "stmt_vec_info" struct which is attached to each stmt in the
100 loop, as well as general information about the loop as a whole, which is
101 recorded in a "loop_vec_info" struct attached to each loop.
103 Transformation phase:
104 =====================
105 The loop transformation phase scans all the stmts in the loop, and
106 creates a vector stmt (or a sequence of stmts) for each scalar stmt S in
107 the loop that needs to be vectorized. It inserts the vector code sequence
108 just before the scalar stmt S, and records a pointer to the vector code
109 in STMT_VINFO_VEC_STMT (stmt_info) (stmt_info is the stmt_vec_info struct
110 attached to S). This pointer will be used for the vectorization of following
111 stmts which use the def of stmt S. Stmt S is removed if it writes to memory;
112 otherwise, we rely on dead code elimination for removing it.
114 For example, say stmt S1 was vectorized into stmt VS1:
116 VS1: vb = px[i];
117 S1: b = x[i]; STMT_VINFO_VEC_STMT (stmt_info (S1)) = VS1
118 S2: a = b;
120 To vectorize stmt S2, the vectorizer first finds the stmt that defines
121 the operand 'b' (S1), and gets the relevant vector def 'vb' from the
122 vector stmt VS1 pointed to by STMT_VINFO_VEC_STMT (stmt_info (S1)). The
123 resulting sequence would be:
125 VS1: vb = px[i];
126 S1: b = x[i]; STMT_VINFO_VEC_STMT (stmt_info (S1)) = VS1
127 VS2: va = vb;
128 S2: a = b; STMT_VINFO_VEC_STMT (stmt_info (S2)) = VS2
 130    Operands that are not SSA_NAMEs are data-refs that appear in
131 load/store operations (like 'x[i]' in S1), and are handled differently.
133 Target modeling:
134 =================
 135    Currently the only target-specific information that is used is the
 136    size of the vector (in bytes): "TARGET_VECTORIZE_UNITS_PER_SIMD_WORD".
 137    Targets that can support different vector sizes will, for now, need
 138    to specify a single value for "TARGET_VECTORIZE_UNITS_PER_SIMD_WORD".  More
139 flexibility will be added in the future.
 141    Since we only vectorize operations whose vector form can be
142 expressed using existing tree codes, to verify that an operation is
143 supported, the vectorizer checks the relevant optab at the relevant
 144    machine_mode (e.g., optab_handler (add_optab, V8HImode)).  If
145 the value found is CODE_FOR_nothing, then there's no target support, and
146 we can't vectorize the stmt.
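     As a rough sketch (reusing only the example optab and mode mentioned
     above, not a general recipe), the support check amounts to:

       if (optab_handler (add_optab, V8HImode) == CODE_FOR_nothing)
         return false;    (no target instruction for a V8HI addition)

     and any other handler value means the target can carry out the
     vectorized operation.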
148 For additional information on this project see:
149 http://gcc.gnu.org/projects/tree-ssa/vectorization.html
152 static void vect_estimate_min_profitable_iters (loop_vec_info, int *, int *);
154 /* Function vect_determine_vectorization_factor
156 Determine the vectorization factor (VF). VF is the number of data elements
157 that are operated upon in parallel in a single iteration of the vectorized
 158    loop.  For example, when vectorizing a loop that operates on 4-byte elements,
 159    on a target with a vector size (VS) of 16 bytes, the VF is set to 4, since 4
160 elements can fit in a single vector register.
162 We currently support vectorization of loops in which all types operated upon
163 are of the same size. Therefore this function currently sets VF according to
164 the size of the types operated upon, and fails if there are multiple sizes
165 in the loop.
167 VF is also the factor by which the loop iterations are strip-mined, e.g.:
168 original loop:
169 for (i=0; i<N; i++){
170 a[i] = b[i] + c[i];
173 vectorized loop:
174 for (i=0; i<N; i+=VF){
175 a[i:VF] = b[i:VF] + c[i:VF];
179 static bool
180 vect_determine_vectorization_factor (loop_vec_info loop_vinfo)
182 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
183 basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
184 unsigned nbbs = loop->num_nodes;
185 unsigned int vectorization_factor = 0;
186 tree scalar_type = NULL_TREE;
187 gphi *phi;
188 tree vectype;
189 unsigned int nunits;
190 stmt_vec_info stmt_info;
191 unsigned i;
192 HOST_WIDE_INT dummy;
193 gimple *stmt, *pattern_stmt = NULL;
194 gimple_seq pattern_def_seq = NULL;
195 gimple_stmt_iterator pattern_def_si = gsi_none ();
196 bool analyze_pattern_stmt = false;
197 bool bool_result;
198 auto_vec<stmt_vec_info> mask_producers;
200 if (dump_enabled_p ())
201 dump_printf_loc (MSG_NOTE, vect_location,
202 "=== vect_determine_vectorization_factor ===\n");
204 for (i = 0; i < nbbs; i++)
206 basic_block bb = bbs[i];
208 for (gphi_iterator si = gsi_start_phis (bb); !gsi_end_p (si);
209 gsi_next (&si))
211 phi = si.phi ();
212 stmt_info = vinfo_for_stmt (phi);
213 if (dump_enabled_p ())
215 dump_printf_loc (MSG_NOTE, vect_location, "==> examining phi: ");
216 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, phi, 0);
219 gcc_assert (stmt_info);
221 if (STMT_VINFO_RELEVANT_P (stmt_info)
222 || STMT_VINFO_LIVE_P (stmt_info))
224 gcc_assert (!STMT_VINFO_VECTYPE (stmt_info));
225 scalar_type = TREE_TYPE (PHI_RESULT (phi));
227 if (dump_enabled_p ())
229 dump_printf_loc (MSG_NOTE, vect_location,
230 "get vectype for scalar type: ");
231 dump_generic_expr (MSG_NOTE, TDF_SLIM, scalar_type);
232 dump_printf (MSG_NOTE, "\n");
235 vectype = get_vectype_for_scalar_type (scalar_type);
236 if (!vectype)
238 if (dump_enabled_p ())
240 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
241 "not vectorized: unsupported "
242 "data-type ");
243 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
244 scalar_type);
245 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
247 return false;
249 STMT_VINFO_VECTYPE (stmt_info) = vectype;
251 if (dump_enabled_p ())
253 dump_printf_loc (MSG_NOTE, vect_location, "vectype: ");
254 dump_generic_expr (MSG_NOTE, TDF_SLIM, vectype);
255 dump_printf (MSG_NOTE, "\n");
258 nunits = TYPE_VECTOR_SUBPARTS (vectype);
259 if (dump_enabled_p ())
260 dump_printf_loc (MSG_NOTE, vect_location, "nunits = %d\n",
261 nunits);
263 if (!vectorization_factor
264 || (nunits > vectorization_factor))
265 vectorization_factor = nunits;
269 for (gimple_stmt_iterator si = gsi_start_bb (bb);
270 !gsi_end_p (si) || analyze_pattern_stmt;)
272 tree vf_vectype;
274 if (analyze_pattern_stmt)
275 stmt = pattern_stmt;
276 else
277 stmt = gsi_stmt (si);
279 stmt_info = vinfo_for_stmt (stmt);
281 if (dump_enabled_p ())
283 dump_printf_loc (MSG_NOTE, vect_location,
284 "==> examining statement: ");
285 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, stmt, 0);
288 gcc_assert (stmt_info);
290 /* Skip stmts which do not need to be vectorized. */
291 if ((!STMT_VINFO_RELEVANT_P (stmt_info)
292 && !STMT_VINFO_LIVE_P (stmt_info))
293 || gimple_clobber_p (stmt))
295 if (STMT_VINFO_IN_PATTERN_P (stmt_info)
296 && (pattern_stmt = STMT_VINFO_RELATED_STMT (stmt_info))
297 && (STMT_VINFO_RELEVANT_P (vinfo_for_stmt (pattern_stmt))
298 || STMT_VINFO_LIVE_P (vinfo_for_stmt (pattern_stmt))))
300 stmt = pattern_stmt;
301 stmt_info = vinfo_for_stmt (pattern_stmt);
302 if (dump_enabled_p ())
304 dump_printf_loc (MSG_NOTE, vect_location,
305 "==> examining pattern statement: ");
306 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, stmt, 0);
309 else
311 if (dump_enabled_p ())
312 dump_printf_loc (MSG_NOTE, vect_location, "skip.\n");
313 gsi_next (&si);
314 continue;
317 else if (STMT_VINFO_IN_PATTERN_P (stmt_info)
318 && (pattern_stmt = STMT_VINFO_RELATED_STMT (stmt_info))
319 && (STMT_VINFO_RELEVANT_P (vinfo_for_stmt (pattern_stmt))
320 || STMT_VINFO_LIVE_P (vinfo_for_stmt (pattern_stmt))))
321 analyze_pattern_stmt = true;
323 /* If a pattern statement has def stmts, analyze them too. */
324 if (is_pattern_stmt_p (stmt_info))
326 if (pattern_def_seq == NULL)
328 pattern_def_seq = STMT_VINFO_PATTERN_DEF_SEQ (stmt_info);
329 pattern_def_si = gsi_start (pattern_def_seq);
331 else if (!gsi_end_p (pattern_def_si))
332 gsi_next (&pattern_def_si);
333 if (pattern_def_seq != NULL)
335 gimple *pattern_def_stmt = NULL;
336 stmt_vec_info pattern_def_stmt_info = NULL;
338 while (!gsi_end_p (pattern_def_si))
340 pattern_def_stmt = gsi_stmt (pattern_def_si);
341 pattern_def_stmt_info
342 = vinfo_for_stmt (pattern_def_stmt);
343 if (STMT_VINFO_RELEVANT_P (pattern_def_stmt_info)
344 || STMT_VINFO_LIVE_P (pattern_def_stmt_info))
345 break;
346 gsi_next (&pattern_def_si);
349 if (!gsi_end_p (pattern_def_si))
351 if (dump_enabled_p ())
353 dump_printf_loc (MSG_NOTE, vect_location,
354 "==> examining pattern def stmt: ");
355 dump_gimple_stmt (MSG_NOTE, TDF_SLIM,
356 pattern_def_stmt, 0);
359 stmt = pattern_def_stmt;
360 stmt_info = pattern_def_stmt_info;
362 else
364 pattern_def_si = gsi_none ();
365 analyze_pattern_stmt = false;
368 else
369 analyze_pattern_stmt = false;
372 if (gimple_get_lhs (stmt) == NULL_TREE
373 /* MASK_STORE has no lhs, but is ok. */
374 && (!is_gimple_call (stmt)
375 || !gimple_call_internal_p (stmt)
376 || gimple_call_internal_fn (stmt) != IFN_MASK_STORE))
378 if (is_gimple_call (stmt))
380 /* Ignore calls with no lhs. These must be calls to
 381    #pragma omp simd functions, and the vectorization factor
 382    they really need can't be determined until
383 vectorizable_simd_clone_call. */
384 if (!analyze_pattern_stmt && gsi_end_p (pattern_def_si))
386 pattern_def_seq = NULL;
387 gsi_next (&si);
389 continue;
391 if (dump_enabled_p ())
393 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
394 "not vectorized: irregular stmt.");
395 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt,
398 return false;
401 if (VECTOR_MODE_P (TYPE_MODE (gimple_expr_type (stmt))))
403 if (dump_enabled_p ())
405 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
406 "not vectorized: vector stmt in loop:");
407 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
409 return false;
412 bool_result = false;
414 if (STMT_VINFO_VECTYPE (stmt_info))
 416    /* The only case when a vectype has already been set is for stmts
417 that contain a dataref, or for "pattern-stmts" (stmts
418 generated by the vectorizer to represent/replace a certain
419 idiom). */
420 gcc_assert (STMT_VINFO_DATA_REF (stmt_info)
421 || is_pattern_stmt_p (stmt_info)
422 || !gsi_end_p (pattern_def_si));
423 vectype = STMT_VINFO_VECTYPE (stmt_info);
425 else
427 gcc_assert (!STMT_VINFO_DATA_REF (stmt_info));
428 if (gimple_call_internal_p (stmt, IFN_MASK_STORE))
429 scalar_type = TREE_TYPE (gimple_call_arg (stmt, 3));
430 else
431 scalar_type = TREE_TYPE (gimple_get_lhs (stmt));
433 /* Bool ops don't participate in vectorization factor
 434    computation.  For comparisons, use the compared types to
 435    compute a factor.  */
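       /* Hedged illustration only: for 'flag = a < b' with 32-bit ints
	  'a' and 'b', the factor is derived from the compared int operands
	  rather than from the boolean result 'flag'.  */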
436 if (VECT_SCALAR_BOOLEAN_TYPE_P (scalar_type)
437 && is_gimple_assign (stmt)
438 && gimple_assign_rhs_code (stmt) != COND_EXPR)
440 if (STMT_VINFO_RELEVANT_P (stmt_info)
441 || STMT_VINFO_LIVE_P (stmt_info))
442 mask_producers.safe_push (stmt_info);
443 bool_result = true;
445 if (TREE_CODE_CLASS (gimple_assign_rhs_code (stmt))
446 == tcc_comparison
447 && !VECT_SCALAR_BOOLEAN_TYPE_P
448 (TREE_TYPE (gimple_assign_rhs1 (stmt))))
449 scalar_type = TREE_TYPE (gimple_assign_rhs1 (stmt));
450 else
452 if (!analyze_pattern_stmt && gsi_end_p (pattern_def_si))
454 pattern_def_seq = NULL;
455 gsi_next (&si);
457 continue;
461 if (dump_enabled_p ())
463 dump_printf_loc (MSG_NOTE, vect_location,
464 "get vectype for scalar type: ");
465 dump_generic_expr (MSG_NOTE, TDF_SLIM, scalar_type);
466 dump_printf (MSG_NOTE, "\n");
468 vectype = get_vectype_for_scalar_type (scalar_type);
469 if (!vectype)
471 if (dump_enabled_p ())
473 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
474 "not vectorized: unsupported "
475 "data-type ");
476 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
477 scalar_type);
478 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
480 return false;
483 if (!bool_result)
484 STMT_VINFO_VECTYPE (stmt_info) = vectype;
486 if (dump_enabled_p ())
488 dump_printf_loc (MSG_NOTE, vect_location, "vectype: ");
489 dump_generic_expr (MSG_NOTE, TDF_SLIM, vectype);
490 dump_printf (MSG_NOTE, "\n");
 494    /* Don't try to compute VF from scalar types if the stmt
 495       produces a boolean vector.  Use the result vectype instead.  */
496 if (VECTOR_BOOLEAN_TYPE_P (vectype))
497 vf_vectype = vectype;
498 else
500 /* The vectorization factor is according to the smallest
501 scalar type (or the largest vector size, but we only
502 support one vector size per loop). */
503 if (!bool_result)
504 scalar_type = vect_get_smallest_scalar_type (stmt, &dummy,
505 &dummy);
506 if (dump_enabled_p ())
508 dump_printf_loc (MSG_NOTE, vect_location,
509 "get vectype for scalar type: ");
510 dump_generic_expr (MSG_NOTE, TDF_SLIM, scalar_type);
511 dump_printf (MSG_NOTE, "\n");
513 vf_vectype = get_vectype_for_scalar_type (scalar_type);
515 if (!vf_vectype)
517 if (dump_enabled_p ())
519 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
520 "not vectorized: unsupported data-type ");
521 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
522 scalar_type);
523 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
525 return false;
528 if ((GET_MODE_SIZE (TYPE_MODE (vectype))
529 != GET_MODE_SIZE (TYPE_MODE (vf_vectype))))
531 if (dump_enabled_p ())
533 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
534 "not vectorized: different sized vector "
535 "types in statement, ");
536 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
537 vectype);
538 dump_printf (MSG_MISSED_OPTIMIZATION, " and ");
539 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
540 vf_vectype);
541 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
543 return false;
546 if (dump_enabled_p ())
548 dump_printf_loc (MSG_NOTE, vect_location, "vectype: ");
549 dump_generic_expr (MSG_NOTE, TDF_SLIM, vf_vectype);
550 dump_printf (MSG_NOTE, "\n");
553 nunits = TYPE_VECTOR_SUBPARTS (vf_vectype);
554 if (dump_enabled_p ())
555 dump_printf_loc (MSG_NOTE, vect_location, "nunits = %d\n", nunits);
556 if (!vectorization_factor
557 || (nunits > vectorization_factor))
558 vectorization_factor = nunits;
560 if (!analyze_pattern_stmt && gsi_end_p (pattern_def_si))
562 pattern_def_seq = NULL;
563 gsi_next (&si);
568 /* TODO: Analyze cost. Decide if worth while to vectorize. */
569 if (dump_enabled_p ())
570 dump_printf_loc (MSG_NOTE, vect_location, "vectorization factor = %d\n",
571 vectorization_factor);
572 if (vectorization_factor <= 1)
574 if (dump_enabled_p ())
575 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
576 "not vectorized: unsupported data-type\n");
577 return false;
579 LOOP_VINFO_VECT_FACTOR (loop_vinfo) = vectorization_factor;
581 for (i = 0; i < mask_producers.length (); i++)
583 tree mask_type = NULL;
585 stmt = STMT_VINFO_STMT (mask_producers[i]);
587 if (is_gimple_assign (stmt)
588 && TREE_CODE_CLASS (gimple_assign_rhs_code (stmt)) == tcc_comparison
589 && !VECT_SCALAR_BOOLEAN_TYPE_P
590 (TREE_TYPE (gimple_assign_rhs1 (stmt))))
592 scalar_type = TREE_TYPE (gimple_assign_rhs1 (stmt));
593 mask_type = get_mask_type_for_scalar_type (scalar_type);
595 if (!mask_type)
597 if (dump_enabled_p ())
598 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
599 "not vectorized: unsupported mask\n");
600 return false;
603 else
605 tree rhs;
606 ssa_op_iter iter;
607 gimple *def_stmt;
608 enum vect_def_type dt;
610 FOR_EACH_SSA_TREE_OPERAND (rhs, stmt, iter, SSA_OP_USE)
612 if (!vect_is_simple_use (rhs, mask_producers[i]->vinfo,
613 &def_stmt, &dt, &vectype))
615 if (dump_enabled_p ())
617 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
618 "not vectorized: can't compute mask type "
619 "for statement, ");
620 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt,
623 return false;
626 /* No vectype probably means external definition.
 627    Allow it in case there is another operand that
 628    allows us to determine the mask type.  */
629 if (!vectype)
630 continue;
632 if (!mask_type)
633 mask_type = vectype;
634 else if (TYPE_VECTOR_SUBPARTS (mask_type)
635 != TYPE_VECTOR_SUBPARTS (vectype))
637 if (dump_enabled_p ())
639 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
640 "not vectorized: different sized masks "
641 "types in statement, ");
642 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
643 mask_type);
644 dump_printf (MSG_MISSED_OPTIMIZATION, " and ");
645 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
646 vectype);
647 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
649 return false;
651 else if (VECTOR_BOOLEAN_TYPE_P (mask_type)
652 != VECTOR_BOOLEAN_TYPE_P (vectype))
654 if (dump_enabled_p ())
656 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
657 "not vectorized: mixed mask and "
658 "nonmask vector types in statement, ");
659 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
660 mask_type);
661 dump_printf (MSG_MISSED_OPTIMIZATION, " and ");
662 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
663 vectype);
664 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
666 return false;
 670    /* We may compare a boolean value loaded as a vector of integers.
 671       Fix mask_type in such a case.  */
672 if (mask_type
673 && !VECTOR_BOOLEAN_TYPE_P (mask_type)
674 && gimple_code (stmt) == GIMPLE_ASSIGN
675 && TREE_CODE_CLASS (gimple_assign_rhs_code (stmt)) == tcc_comparison)
676 mask_type = build_same_sized_truth_vector_type (mask_type);
 679    /* A missing mask_type should mean a loop-invariant predicate.
680 This is probably a subject for optimization in
681 if-conversion. */
682 if (!mask_type)
684 if (dump_enabled_p ())
686 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
687 "not vectorized: can't compute mask type "
688 "for statement, ");
689 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt,
692 return false;
695 STMT_VINFO_VECTYPE (mask_producers[i]) = mask_type;
698 return true;
702 /* Function vect_is_simple_iv_evolution.
 704    FORNOW: A simple evolution of an induction variable in the loop is
705 considered a polynomial evolution. */
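/* Illustrative note (notation only, not new functionality): for the
   induction variable 'i' in 'for (i = 0; i < n; i++)' the scalar evolution
   in loop 1 is the degree-1 chrec {0, +, 1}_1, so *INIT becomes 0 and *STEP
   becomes 1; an evolution whose step is itself a chrec (degree >= 2) is
   rejected below.  */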
707 static bool
708 vect_is_simple_iv_evolution (unsigned loop_nb, tree access_fn, tree * init,
709 tree * step)
711 tree init_expr;
712 tree step_expr;
713 tree evolution_part = evolution_part_in_loop_num (access_fn, loop_nb);
714 basic_block bb;
716 /* When there is no evolution in this loop, the evolution function
717 is not "simple". */
718 if (evolution_part == NULL_TREE)
719 return false;
721 /* When the evolution is a polynomial of degree >= 2
722 the evolution function is not "simple". */
723 if (tree_is_chrec (evolution_part))
724 return false;
726 step_expr = evolution_part;
727 init_expr = unshare_expr (initial_condition_in_loop_num (access_fn, loop_nb));
729 if (dump_enabled_p ())
731 dump_printf_loc (MSG_NOTE, vect_location, "step: ");
732 dump_generic_expr (MSG_NOTE, TDF_SLIM, step_expr);
733 dump_printf (MSG_NOTE, ", init: ");
734 dump_generic_expr (MSG_NOTE, TDF_SLIM, init_expr);
735 dump_printf (MSG_NOTE, "\n");
738 *init = init_expr;
739 *step = step_expr;
741 if (TREE_CODE (step_expr) != INTEGER_CST
742 && (TREE_CODE (step_expr) != SSA_NAME
743 || ((bb = gimple_bb (SSA_NAME_DEF_STMT (step_expr)))
744 && flow_bb_inside_loop_p (get_loop (cfun, loop_nb), bb))
745 || (!INTEGRAL_TYPE_P (TREE_TYPE (step_expr))
746 && (!SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr))
747 || !flag_associative_math)))
748 && (TREE_CODE (step_expr) != REAL_CST
749 || !flag_associative_math))
751 if (dump_enabled_p ())
752 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
753 "step unknown.\n");
754 return false;
757 return true;
760 /* Function vect_analyze_scalar_cycles_1.
762 Examine the cross iteration def-use cycles of scalar variables
763 in LOOP. LOOP_VINFO represents the loop that is now being
764 considered for vectorization (can be LOOP, or an outer-loop
765 enclosing LOOP). */
767 static void
768 vect_analyze_scalar_cycles_1 (loop_vec_info loop_vinfo, struct loop *loop)
770 basic_block bb = loop->header;
771 tree init, step;
772 auto_vec<gimple *, 64> worklist;
773 gphi_iterator gsi;
774 bool double_reduc;
776 if (dump_enabled_p ())
777 dump_printf_loc (MSG_NOTE, vect_location,
778 "=== vect_analyze_scalar_cycles ===\n");
780 /* First - identify all inductions. Reduction detection assumes that all the
 781    inductions have been identified; therefore, this order must not be
782 changed. */
783 for (gsi = gsi_start_phis (bb); !gsi_end_p (gsi); gsi_next (&gsi))
785 gphi *phi = gsi.phi ();
786 tree access_fn = NULL;
787 tree def = PHI_RESULT (phi);
788 stmt_vec_info stmt_vinfo = vinfo_for_stmt (phi);
790 if (dump_enabled_p ())
792 dump_printf_loc (MSG_NOTE, vect_location, "Analyze phi: ");
793 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, phi, 0);
796 /* Skip virtual phi's. The data dependences that are associated with
797 virtual defs/uses (i.e., memory accesses) are analyzed elsewhere. */
798 if (virtual_operand_p (def))
799 continue;
801 STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_unknown_def_type;
803 /* Analyze the evolution function. */
804 access_fn = analyze_scalar_evolution (loop, def);
805 if (access_fn)
807 STRIP_NOPS (access_fn);
808 if (dump_enabled_p ())
810 dump_printf_loc (MSG_NOTE, vect_location,
811 "Access function of PHI: ");
812 dump_generic_expr (MSG_NOTE, TDF_SLIM, access_fn);
813 dump_printf (MSG_NOTE, "\n");
815 STMT_VINFO_LOOP_PHI_EVOLUTION_BASE_UNCHANGED (stmt_vinfo)
816 = initial_condition_in_loop_num (access_fn, loop->num);
817 STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_vinfo)
818 = evolution_part_in_loop_num (access_fn, loop->num);
821 if (!access_fn
822 || !vect_is_simple_iv_evolution (loop->num, access_fn, &init, &step)
823 || (LOOP_VINFO_LOOP (loop_vinfo) != loop
824 && TREE_CODE (step) != INTEGER_CST))
826 worklist.safe_push (phi);
827 continue;
830 gcc_assert (STMT_VINFO_LOOP_PHI_EVOLUTION_BASE_UNCHANGED (stmt_vinfo)
831 != NULL_TREE);
832 gcc_assert (STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_vinfo) != NULL_TREE);
834 if (dump_enabled_p ())
835 dump_printf_loc (MSG_NOTE, vect_location, "Detected induction.\n");
836 STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_induction_def;
840 /* Second - identify all reductions and nested cycles. */
841 while (worklist.length () > 0)
843 gimple *phi = worklist.pop ();
844 tree def = PHI_RESULT (phi);
845 stmt_vec_info stmt_vinfo = vinfo_for_stmt (phi);
846 gimple *reduc_stmt;
848 if (dump_enabled_p ())
850 dump_printf_loc (MSG_NOTE, vect_location, "Analyze phi: ");
851 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, phi, 0);
854 gcc_assert (!virtual_operand_p (def)
855 && STMT_VINFO_DEF_TYPE (stmt_vinfo) == vect_unknown_def_type);
857 reduc_stmt = vect_force_simple_reduction (loop_vinfo, phi,
858 &double_reduc, false);
859 if (reduc_stmt)
861 if (double_reduc)
863 if (dump_enabled_p ())
864 dump_printf_loc (MSG_NOTE, vect_location,
865 "Detected double reduction.\n");
867 STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_double_reduction_def;
868 STMT_VINFO_DEF_TYPE (vinfo_for_stmt (reduc_stmt)) =
869 vect_double_reduction_def;
871 else
873 if (loop != LOOP_VINFO_LOOP (loop_vinfo))
875 if (dump_enabled_p ())
876 dump_printf_loc (MSG_NOTE, vect_location,
877 "Detected vectorizable nested cycle.\n");
879 STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_nested_cycle;
880 STMT_VINFO_DEF_TYPE (vinfo_for_stmt (reduc_stmt)) =
881 vect_nested_cycle;
883 else
885 if (dump_enabled_p ())
886 dump_printf_loc (MSG_NOTE, vect_location,
887 "Detected reduction.\n");
889 STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_reduction_def;
890 STMT_VINFO_DEF_TYPE (vinfo_for_stmt (reduc_stmt)) =
891 vect_reduction_def;
892 /* Store the reduction cycles for possible vectorization in
893 loop-aware SLP if it was not detected as reduction
894 chain. */
895 if (! GROUP_FIRST_ELEMENT (vinfo_for_stmt (reduc_stmt)))
896 LOOP_VINFO_REDUCTIONS (loop_vinfo).safe_push (reduc_stmt);
900 else
901 if (dump_enabled_p ())
902 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
903 "Unknown def-use cycle pattern.\n");
908 /* Function vect_analyze_scalar_cycles.
910 Examine the cross iteration def-use cycles of scalar variables, by
911 analyzing the loop-header PHIs of scalar variables. Classify each
912 cycle as one of the following: invariant, induction, reduction, unknown.
 913    We do that for the loop represented by LOOP_VINFO, and also for its
 914    inner-loop, if one exists.
915 Examples for scalar cycles:
917 Example1: reduction:
919 loop1:
920 for (i=0; i<N; i++)
921 sum += a[i];
923 Example2: induction:
925 loop2:
926 for (i=0; i<N; i++)
927 a[i] = i; */
929 static void
930 vect_analyze_scalar_cycles (loop_vec_info loop_vinfo)
932 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
934 vect_analyze_scalar_cycles_1 (loop_vinfo, loop);
936 /* When vectorizing an outer-loop, the inner-loop is executed sequentially.
937 Reductions in such inner-loop therefore have different properties than
938 the reductions in the nest that gets vectorized:
939 1. When vectorized, they are executed in the same order as in the original
940 scalar loop, so we can't change the order of computation when
941 vectorizing them.
942 2. FIXME: Inner-loop reductions can be used in the inner-loop, so the
943 current checks are too strict. */
945 if (loop->inner)
946 vect_analyze_scalar_cycles_1 (loop_vinfo, loop->inner);
949 /* Transfer group and reduction information from STMT to its pattern stmt. */
951 static void
952 vect_fixup_reduc_chain (gimple *stmt)
954 gimple *firstp = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (stmt));
955 gimple *stmtp;
956 gcc_assert (!GROUP_FIRST_ELEMENT (vinfo_for_stmt (firstp))
957 && GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)));
958 GROUP_SIZE (vinfo_for_stmt (firstp)) = GROUP_SIZE (vinfo_for_stmt (stmt));
961 stmtp = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (stmt));
962 GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmtp)) = firstp;
963 stmt = GROUP_NEXT_ELEMENT (vinfo_for_stmt (stmt));
964 if (stmt)
965 GROUP_NEXT_ELEMENT (vinfo_for_stmt (stmtp))
966 = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (stmt));
968 while (stmt);
969 STMT_VINFO_DEF_TYPE (vinfo_for_stmt (stmtp)) = vect_reduction_def;
972 /* Fixup scalar cycles that now have their stmts detected as patterns. */
974 static void
975 vect_fixup_scalar_cycles_with_patterns (loop_vec_info loop_vinfo)
977 gimple *first;
978 unsigned i;
980 FOR_EACH_VEC_ELT (LOOP_VINFO_REDUCTION_CHAINS (loop_vinfo), i, first)
981 if (STMT_VINFO_IN_PATTERN_P (vinfo_for_stmt (first)))
983 gimple *next = GROUP_NEXT_ELEMENT (vinfo_for_stmt (first));
984 while (next)
986 if (! STMT_VINFO_IN_PATTERN_P (vinfo_for_stmt (next)))
987 break;
988 next = GROUP_NEXT_ELEMENT (vinfo_for_stmt (next));
 990    /* If not all stmts in the chain are patterns, try to handle
991 the chain without patterns. */
992 if (! next)
994 vect_fixup_reduc_chain (first);
995 LOOP_VINFO_REDUCTION_CHAINS (loop_vinfo)[i]
996 = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (first));
1001 /* Function vect_get_loop_niters.
1003 Determine how many iterations the loop is executed and place it
1004 in NUMBER_OF_ITERATIONS. Place the number of latch iterations
1005 in NUMBER_OF_ITERATIONSM1. Place the condition under which the
1006 niter information holds in ASSUMPTIONS.
1008 Return the loop exit condition. */
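/* Illustrative note: if the latch of LOOP executes 7 times, then
   NUMBER_OF_ITERATIONSM1 is set to 7 and NUMBER_OF_ITERATIONS, the number
   of header executions, to 7 + 1 = 8.  */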
1011 static gcond *
1012 vect_get_loop_niters (struct loop *loop, tree *assumptions,
1013 tree *number_of_iterations, tree *number_of_iterationsm1)
1015 edge exit = single_exit (loop);
1016 struct tree_niter_desc niter_desc;
1017 tree niter_assumptions, niter, may_be_zero;
1018 gcond *cond = get_loop_exit_condition (loop);
1020 *assumptions = boolean_true_node;
1021 *number_of_iterationsm1 = chrec_dont_know;
1022 *number_of_iterations = chrec_dont_know;
1023 if (dump_enabled_p ())
1024 dump_printf_loc (MSG_NOTE, vect_location,
1025 "=== get_loop_niters ===\n");
1027 if (!exit)
1028 return cond;
1030 niter = chrec_dont_know;
1031 may_be_zero = NULL_TREE;
1032 niter_assumptions = boolean_true_node;
1033 if (!number_of_iterations_exit_assumptions (loop, exit, &niter_desc, NULL)
1034 || chrec_contains_undetermined (niter_desc.niter))
1035 return cond;
1037 niter_assumptions = niter_desc.assumptions;
1038 may_be_zero = niter_desc.may_be_zero;
1039 niter = niter_desc.niter;
1041 if (may_be_zero && integer_zerop (may_be_zero))
1042 may_be_zero = NULL_TREE;
1044 if (may_be_zero)
1046 if (COMPARISON_CLASS_P (may_be_zero))
1048 /* Try to combine may_be_zero with assumptions, this can simplify
1049 computation of niter expression. */
1050 if (niter_assumptions && !integer_nonzerop (niter_assumptions))
1051 niter_assumptions = fold_build2 (TRUTH_AND_EXPR, boolean_type_node,
1052 niter_assumptions,
1053 fold_build1 (TRUTH_NOT_EXPR,
1054 boolean_type_node,
1055 may_be_zero));
1056 else
1057 niter = fold_build3 (COND_EXPR, TREE_TYPE (niter), may_be_zero,
1058 build_int_cst (TREE_TYPE (niter), 0), niter);
1060 may_be_zero = NULL_TREE;
1062 else if (integer_nonzerop (may_be_zero))
1064 *number_of_iterationsm1 = build_int_cst (TREE_TYPE (niter), 0);
1065 *number_of_iterations = build_int_cst (TREE_TYPE (niter), 1);
1066 return cond;
1068 else
1069 return cond;
1072 *assumptions = niter_assumptions;
1073 *number_of_iterationsm1 = niter;
1075 /* We want the number of loop header executions which is the number
1076 of latch executions plus one.
1077 ??? For UINT_MAX latch executions this number overflows to zero
1078 for loops like do { n++; } while (n != 0); */
1079 if (niter && !chrec_contains_undetermined (niter))
1080 niter = fold_build2 (PLUS_EXPR, TREE_TYPE (niter), unshare_expr (niter),
1081 build_int_cst (TREE_TYPE (niter), 1));
1082 *number_of_iterations = niter;
1084 return cond;
1087 /* Function bb_in_loop_p
1089 Used as predicate for dfs order traversal of the loop bbs. */
1091 static bool
1092 bb_in_loop_p (const_basic_block bb, const void *data)
1094 const struct loop *const loop = (const struct loop *)data;
1095 if (flow_bb_inside_loop_p (loop, bb))
1096 return true;
1097 return false;
1101 /* Create and initialize a new loop_vec_info struct for LOOP_IN, as well as
1102 stmt_vec_info structs for all the stmts in LOOP_IN. */
1104 _loop_vec_info::_loop_vec_info (struct loop *loop_in)
1105 : vec_info (vec_info::loop, init_cost (loop_in)),
1106 loop (loop_in),
1107 bbs (XCNEWVEC (basic_block, loop->num_nodes)),
1108 num_itersm1 (NULL_TREE),
1109 num_iters (NULL_TREE),
1110 num_iters_unchanged (NULL_TREE),
1111 num_iters_assumptions (NULL_TREE),
1112 th (0),
1113 vectorization_factor (0),
1114 max_vectorization_factor (0),
1115 unaligned_dr (NULL),
1116 peeling_for_alignment (0),
1117 ptr_mask (0),
1118 slp_unrolling_factor (1),
1119 single_scalar_iteration_cost (0),
1120 vectorizable (false),
1121 peeling_for_gaps (false),
1122 peeling_for_niter (false),
1123 operands_swapped (false),
1124 no_data_dependencies (false),
1125 has_mask_store (false),
1126 scalar_loop (NULL),
1127 orig_loop_info (NULL)
1129 /* Create/Update stmt_info for all stmts in the loop. */
1130 basic_block *body = get_loop_body (loop);
1131 for (unsigned int i = 0; i < loop->num_nodes; i++)
1133 basic_block bb = body[i];
1134 gimple_stmt_iterator si;
1136 for (si = gsi_start_phis (bb); !gsi_end_p (si); gsi_next (&si))
1138 gimple *phi = gsi_stmt (si);
1139 gimple_set_uid (phi, 0);
1140 set_vinfo_for_stmt (phi, new_stmt_vec_info (phi, this));
1143 for (si = gsi_start_bb (bb); !gsi_end_p (si); gsi_next (&si))
1145 gimple *stmt = gsi_stmt (si);
1146 gimple_set_uid (stmt, 0);
1147 set_vinfo_for_stmt (stmt, new_stmt_vec_info (stmt, this));
1150 free (body);
1152 /* CHECKME: We want to visit all BBs before their successors (except for
1153 latch blocks, for which this assertion wouldn't hold). In the simple
 1154    case of the loop forms we allow, a dfs order of the BBs would be the same
1155 as reversed postorder traversal, so we are safe. */
1157 unsigned int nbbs = dfs_enumerate_from (loop->header, 0, bb_in_loop_p,
1158 bbs, loop->num_nodes, loop);
1159 gcc_assert (nbbs == loop->num_nodes);
1163 /* Free all memory used by the _loop_vec_info, as well as all the
1164 stmt_vec_info structs of all the stmts in the loop. */
1166 _loop_vec_info::~_loop_vec_info ()
1168 int nbbs;
1169 gimple_stmt_iterator si;
1170 int j;
1172 nbbs = loop->num_nodes;
1173 for (j = 0; j < nbbs; j++)
1175 basic_block bb = bbs[j];
1176 for (si = gsi_start_phis (bb); !gsi_end_p (si); gsi_next (&si))
1177 free_stmt_vec_info (gsi_stmt (si));
1179 for (si = gsi_start_bb (bb); !gsi_end_p (si); )
1181 gimple *stmt = gsi_stmt (si);
1183 /* We may have broken canonical form by moving a constant
1184 into RHS1 of a commutative op. Fix such occurrences. */
1185 if (operands_swapped && is_gimple_assign (stmt))
1187 enum tree_code code = gimple_assign_rhs_code (stmt);
1189 if ((code == PLUS_EXPR
1190 || code == POINTER_PLUS_EXPR
1191 || code == MULT_EXPR)
1192 && CONSTANT_CLASS_P (gimple_assign_rhs1 (stmt)))
1193 swap_ssa_operands (stmt,
1194 gimple_assign_rhs1_ptr (stmt),
1195 gimple_assign_rhs2_ptr (stmt));
1196 else if (code == COND_EXPR
1197 && CONSTANT_CLASS_P (gimple_assign_rhs2 (stmt)))
1199 tree cond_expr = gimple_assign_rhs1 (stmt);
1200 enum tree_code cond_code = TREE_CODE (cond_expr);
1202 if (TREE_CODE_CLASS (cond_code) == tcc_comparison)
1204 bool honor_nans = HONOR_NANS (TREE_OPERAND (cond_expr,
1205 0));
1206 cond_code = invert_tree_comparison (cond_code,
1207 honor_nans);
1208 if (cond_code != ERROR_MARK)
1210 TREE_SET_CODE (cond_expr, cond_code);
1211 swap_ssa_operands (stmt,
1212 gimple_assign_rhs2_ptr (stmt),
1213 gimple_assign_rhs3_ptr (stmt));
1219 /* Free stmt_vec_info. */
1220 free_stmt_vec_info (stmt);
1221 gsi_next (&si);
1225 free (bbs);
1227 loop->aux = NULL;
1231 /* Calculate the cost of one scalar iteration of the loop. */
1232 static void
1233 vect_compute_single_scalar_iteration_cost (loop_vec_info loop_vinfo)
1235 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1236 basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
1237 int nbbs = loop->num_nodes, factor, scalar_single_iter_cost = 0;
1238 int innerloop_iters, i;
1240 /* Count statements in scalar loop. Using this as scalar cost for a single
1241 iteration for now.
1243 TODO: Add outer loop support.
1245 TODO: Consider assigning different costs to different scalar
1246 statements. */
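  /* Illustrative arithmetic only, assuming each counted stmt had a unit
     cost: a loop body with 8 such stmts, two of them inside an inner loop,
     would contribute 6 * 1 + 2 * 50 = 106, since inner-loop stmts are
     weighted by the FIXME factor of 50 used below.  */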
1248 /* FORNOW. */
1249 innerloop_iters = 1;
1250 if (loop->inner)
1251 innerloop_iters = 50; /* FIXME */
1253 for (i = 0; i < nbbs; i++)
1255 gimple_stmt_iterator si;
1256 basic_block bb = bbs[i];
1258 if (bb->loop_father == loop->inner)
1259 factor = innerloop_iters;
1260 else
1261 factor = 1;
1263 for (si = gsi_start_bb (bb); !gsi_end_p (si); gsi_next (&si))
1265 gimple *stmt = gsi_stmt (si);
1266 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
1268 if (!is_gimple_assign (stmt) && !is_gimple_call (stmt))
1269 continue;
1271 /* Skip stmts that are not vectorized inside the loop. */
1272 if (stmt_info
1273 && !STMT_VINFO_RELEVANT_P (stmt_info)
1274 && (!STMT_VINFO_LIVE_P (stmt_info)
1275 || !VECTORIZABLE_CYCLE_DEF (STMT_VINFO_DEF_TYPE (stmt_info)))
1276 && !STMT_VINFO_IN_PATTERN_P (stmt_info))
1277 continue;
1279 vect_cost_for_stmt kind;
1280 if (STMT_VINFO_DATA_REF (stmt_info))
1282 if (DR_IS_READ (STMT_VINFO_DATA_REF (stmt_info)))
1283 kind = scalar_load;
1284 else
1285 kind = scalar_store;
1287 else
1288 kind = scalar_stmt;
1290 scalar_single_iter_cost
1291 += record_stmt_cost (&LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo),
1292 factor, kind, stmt_info, 0, vect_prologue);
1295 LOOP_VINFO_SINGLE_SCALAR_ITERATION_COST (loop_vinfo)
1296 = scalar_single_iter_cost;
1300 /* Function vect_analyze_loop_form_1.
1302 Verify that certain CFG restrictions hold, including:
1303 - the loop has a pre-header
1304 - the loop has a single entry and exit
1305 - the loop exit condition is simple enough
 1306    - the number of iterations can be analyzed, i.e., a countable loop.  The
1307 niter could be analyzed under some assumptions. */
1309 bool
1310 vect_analyze_loop_form_1 (struct loop *loop, gcond **loop_cond,
1311 tree *assumptions, tree *number_of_iterationsm1,
1312 tree *number_of_iterations, gcond **inner_loop_cond)
1314 if (dump_enabled_p ())
1315 dump_printf_loc (MSG_NOTE, vect_location,
1316 "=== vect_analyze_loop_form ===\n");
1318 /* Different restrictions apply when we are considering an inner-most loop,
1319 vs. an outer (nested) loop.
1320 (FORNOW. May want to relax some of these restrictions in the future). */
1322 if (!loop->inner)
1324 /* Inner-most loop. We currently require that the number of BBs is
1325 exactly 2 (the header and latch). Vectorizable inner-most loops
1326 look like this:
1328 (pre-header)
1330 header <--------+
1331 | | |
1332 | +--> latch --+
1334 (exit-bb) */
1336 if (loop->num_nodes != 2)
1338 if (dump_enabled_p ())
1339 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1340 "not vectorized: control flow in loop.\n");
1341 return false;
1344 if (empty_block_p (loop->header))
1346 if (dump_enabled_p ())
1347 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1348 "not vectorized: empty loop.\n");
1349 return false;
1352 else
1354 struct loop *innerloop = loop->inner;
1355 edge entryedge;
1357 /* Nested loop. We currently require that the loop is doubly-nested,
1358 contains a single inner loop, and the number of BBs is exactly 5.
1359 Vectorizable outer-loops look like this:
1361 (pre-header)
1363 header <---+
1365 inner-loop |
1367 tail ------+
1369 (exit-bb)
1371 The inner-loop has the properties expected of inner-most loops
1372 as described above. */
1374 if ((loop->inner)->inner || (loop->inner)->next)
1376 if (dump_enabled_p ())
1377 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1378 "not vectorized: multiple nested loops.\n");
1379 return false;
1382 if (loop->num_nodes != 5)
1384 if (dump_enabled_p ())
1385 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1386 "not vectorized: control flow in loop.\n");
1387 return false;
1390 entryedge = loop_preheader_edge (innerloop);
1391 if (entryedge->src != loop->header
1392 || !single_exit (innerloop)
1393 || single_exit (innerloop)->dest != EDGE_PRED (loop->latch, 0)->src)
1395 if (dump_enabled_p ())
1396 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1397 "not vectorized: unsupported outerloop form.\n");
1398 return false;
1401 /* Analyze the inner-loop. */
1402 tree inner_niterm1, inner_niter, inner_assumptions;
1403 if (! vect_analyze_loop_form_1 (loop->inner, inner_loop_cond,
1404 &inner_assumptions, &inner_niterm1,
1405 &inner_niter, NULL)
1406 /* Don't support analyzing niter under assumptions for inner
1407 loop. */
1408 || !integer_onep (inner_assumptions))
1410 if (dump_enabled_p ())
1411 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1412 "not vectorized: Bad inner loop.\n");
1413 return false;
1416 if (!expr_invariant_in_loop_p (loop, inner_niter))
1418 if (dump_enabled_p ())
1419 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1420 "not vectorized: inner-loop count not"
1421 " invariant.\n");
1422 return false;
1425 if (dump_enabled_p ())
1426 dump_printf_loc (MSG_NOTE, vect_location,
1427 "Considering outer-loop vectorization.\n");
1430 if (!single_exit (loop)
1431 || EDGE_COUNT (loop->header->preds) != 2)
1433 if (dump_enabled_p ())
1435 if (!single_exit (loop))
1436 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1437 "not vectorized: multiple exits.\n");
1438 else if (EDGE_COUNT (loop->header->preds) != 2)
1439 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1440 "not vectorized: too many incoming edges.\n");
1442 return false;
 1445    /* We assume that the loop exit condition is at the end of the loop, i.e.,
1446 that the loop is represented as a do-while (with a proper if-guard
1447 before the loop if needed), where the loop header contains all the
1448 executable statements, and the latch is empty. */
1449 if (!empty_block_p (loop->latch)
1450 || !gimple_seq_empty_p (phi_nodes (loop->latch)))
1452 if (dump_enabled_p ())
1453 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1454 "not vectorized: latch block not empty.\n");
1455 return false;
1458 /* Make sure the exit is not abnormal. */
1459 edge e = single_exit (loop);
1460 if (e->flags & EDGE_ABNORMAL)
1462 if (dump_enabled_p ())
1463 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1464 "not vectorized: abnormal loop exit edge.\n");
1465 return false;
1468 *loop_cond = vect_get_loop_niters (loop, assumptions, number_of_iterations,
1469 number_of_iterationsm1);
1470 if (!*loop_cond)
1472 if (dump_enabled_p ())
1473 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1474 "not vectorized: complicated exit condition.\n");
1475 return false;
1478 if (integer_zerop (*assumptions)
1479 || !*number_of_iterations
1480 || chrec_contains_undetermined (*number_of_iterations))
1482 if (dump_enabled_p ())
1483 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1484 "not vectorized: number of iterations cannot be "
1485 "computed.\n");
1486 return false;
1489 if (integer_zerop (*number_of_iterations))
1491 if (dump_enabled_p ())
1492 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1493 "not vectorized: number of iterations = 0.\n");
1494 return false;
1497 return true;
1500 /* Analyze LOOP form and return a loop_vec_info if it is of suitable form. */
1502 loop_vec_info
1503 vect_analyze_loop_form (struct loop *loop)
1505 tree assumptions, number_of_iterations, number_of_iterationsm1;
1506 gcond *loop_cond, *inner_loop_cond = NULL;
1508 if (! vect_analyze_loop_form_1 (loop, &loop_cond,
1509 &assumptions, &number_of_iterationsm1,
1510 &number_of_iterations, &inner_loop_cond))
1511 return NULL;
1513 loop_vec_info loop_vinfo = new _loop_vec_info (loop);
1514 LOOP_VINFO_NITERSM1 (loop_vinfo) = number_of_iterationsm1;
1515 LOOP_VINFO_NITERS (loop_vinfo) = number_of_iterations;
1516 LOOP_VINFO_NITERS_UNCHANGED (loop_vinfo) = number_of_iterations;
1517 if (!integer_onep (assumptions))
 1519    /* We consider vectorizing this loop by versioning it under
1520 some assumptions. In order to do this, we need to clear
1521 existing information computed by scev and niter analyzer. */
1522 scev_reset_htab ();
1523 free_numbers_of_iterations_estimates (loop);
 1524    /* Also set a flag for this loop so that the following scev and niter
 1525       analyses are done under the assumptions.  */
1526 loop_constraint_set (loop, LOOP_C_FINITE);
1527 /* Also record the assumptions for versioning. */
1528 LOOP_VINFO_NITERS_ASSUMPTIONS (loop_vinfo) = assumptions;
1531 if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
1533 if (dump_enabled_p ())
1535 dump_printf_loc (MSG_NOTE, vect_location,
1536 "Symbolic number of iterations is ");
1537 dump_generic_expr (MSG_NOTE, TDF_DETAILS, number_of_iterations);
1538 dump_printf (MSG_NOTE, "\n");
1542 STMT_VINFO_TYPE (vinfo_for_stmt (loop_cond)) = loop_exit_ctrl_vec_info_type;
1543 if (inner_loop_cond)
1544 STMT_VINFO_TYPE (vinfo_for_stmt (inner_loop_cond))
1545 = loop_exit_ctrl_vec_info_type;
1547 gcc_assert (!loop->aux);
1548 loop->aux = loop_vinfo;
1549 return loop_vinfo;
 1554 /* Scan the loop stmts and, depending on whether there are any (non-)SLP
 1555    statements, update the vectorization factor.  */
1557 static void
1558 vect_update_vf_for_slp (loop_vec_info loop_vinfo)
1560 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1561 basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
1562 int nbbs = loop->num_nodes;
1563 unsigned int vectorization_factor;
1564 int i;
1566 if (dump_enabled_p ())
1567 dump_printf_loc (MSG_NOTE, vect_location,
1568 "=== vect_update_vf_for_slp ===\n");
1570 vectorization_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
1571 gcc_assert (vectorization_factor != 0);
1573 /* If all the stmts in the loop can be SLPed, we perform only SLP, and
 1574    the vectorization factor of the loop is the unrolling factor required by
 1575    the SLP instances.  If that unrolling factor is 1, we say that we
 1576    perform pure SLP on the loop; cross-iteration parallelism is not
1577 exploited. */
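  /* Worked example, for illustration only: when the loop also contains
     non-SLP stmts, a loop VF of 4 combined with an SLP unrolling factor of 2
     stays at least_common_multiple (4, 2) = 4, while an unrolling factor of 3
     would raise the VF to 12.  */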
1578 bool only_slp_in_loop = true;
1579 for (i = 0; i < nbbs; i++)
1581 basic_block bb = bbs[i];
1582 for (gimple_stmt_iterator si = gsi_start_bb (bb); !gsi_end_p (si);
1583 gsi_next (&si))
1585 gimple *stmt = gsi_stmt (si);
1586 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
1587 if (STMT_VINFO_IN_PATTERN_P (stmt_info)
1588 && STMT_VINFO_RELATED_STMT (stmt_info))
1590 stmt = STMT_VINFO_RELATED_STMT (stmt_info);
1591 stmt_info = vinfo_for_stmt (stmt);
1593 if ((STMT_VINFO_RELEVANT_P (stmt_info)
1594 || VECTORIZABLE_CYCLE_DEF (STMT_VINFO_DEF_TYPE (stmt_info)))
1595 && !PURE_SLP_STMT (stmt_info))
1596 /* STMT needs both SLP and loop-based vectorization. */
1597 only_slp_in_loop = false;
1601 if (only_slp_in_loop)
1603 dump_printf_loc (MSG_NOTE, vect_location,
1604 "Loop contains only SLP stmts\n");
1605 vectorization_factor = LOOP_VINFO_SLP_UNROLLING_FACTOR (loop_vinfo);
1607 else
1609 dump_printf_loc (MSG_NOTE, vect_location,
1610 "Loop contains SLP and non-SLP stmts\n");
1611 vectorization_factor
1612 = least_common_multiple (vectorization_factor,
1613 LOOP_VINFO_SLP_UNROLLING_FACTOR (loop_vinfo));
1616 LOOP_VINFO_VECT_FACTOR (loop_vinfo) = vectorization_factor;
1617 if (dump_enabled_p ())
1618 dump_printf_loc (MSG_NOTE, vect_location,
1619 "Updating vectorization factor to %d\n",
1620 vectorization_factor);
1623 /* Function vect_analyze_loop_operations.
1625 Scan the loop stmts and make sure they are all vectorizable. */
1627 static bool
1628 vect_analyze_loop_operations (loop_vec_info loop_vinfo)
1630 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1631 basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
1632 int nbbs = loop->num_nodes;
1633 int i;
1634 stmt_vec_info stmt_info;
1635 bool need_to_vectorize = false;
1636 bool ok;
1638 if (dump_enabled_p ())
1639 dump_printf_loc (MSG_NOTE, vect_location,
1640 "=== vect_analyze_loop_operations ===\n");
1642 for (i = 0; i < nbbs; i++)
1644 basic_block bb = bbs[i];
1646 for (gphi_iterator si = gsi_start_phis (bb); !gsi_end_p (si);
1647 gsi_next (&si))
1649 gphi *phi = si.phi ();
1650 ok = true;
1652 stmt_info = vinfo_for_stmt (phi);
1653 if (dump_enabled_p ())
1655 dump_printf_loc (MSG_NOTE, vect_location, "examining phi: ");
1656 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, phi, 0);
1658 if (virtual_operand_p (gimple_phi_result (phi)))
1659 continue;
1661 /* Inner-loop loop-closed exit phi in outer-loop vectorization
1662 (i.e., a phi in the tail of the outer-loop). */
1663 if (! is_loop_header_bb_p (bb))
1665 /* FORNOW: we currently don't support the case that these phis
1666 are not used in the outerloop (unless it is double reduction,
 1667    i.e., this phi is vect_reduction_def), because this case
 1668    requires us to actually do something here.  */
1669 if (STMT_VINFO_LIVE_P (stmt_info)
1670 && STMT_VINFO_DEF_TYPE (stmt_info)
1671 != vect_double_reduction_def)
1673 if (dump_enabled_p ())
1674 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1675 "Unsupported loop-closed phi in "
1676 "outer-loop.\n");
1677 return false;
1680 /* If PHI is used in the outer loop, we check that its operand
1681 is defined in the inner loop. */
1682 if (STMT_VINFO_RELEVANT_P (stmt_info))
1684 tree phi_op;
1685 gimple *op_def_stmt;
1687 if (gimple_phi_num_args (phi) != 1)
1688 return false;
1690 phi_op = PHI_ARG_DEF (phi, 0);
1691 if (TREE_CODE (phi_op) != SSA_NAME)
1692 return false;
1694 op_def_stmt = SSA_NAME_DEF_STMT (phi_op);
1695 if (gimple_nop_p (op_def_stmt)
1696 || !flow_bb_inside_loop_p (loop, gimple_bb (op_def_stmt))
1697 || !vinfo_for_stmt (op_def_stmt))
1698 return false;
1700 if (STMT_VINFO_RELEVANT (vinfo_for_stmt (op_def_stmt))
1701 != vect_used_in_outer
1702 && STMT_VINFO_RELEVANT (vinfo_for_stmt (op_def_stmt))
1703 != vect_used_in_outer_by_reduction)
1704 return false;
1707 continue;
1710 gcc_assert (stmt_info);
1712 if ((STMT_VINFO_RELEVANT (stmt_info) == vect_used_in_scope
1713 || STMT_VINFO_LIVE_P (stmt_info))
1714 && STMT_VINFO_DEF_TYPE (stmt_info) != vect_induction_def)
1716 /* A scalar-dependence cycle that we don't support. */
1717 if (dump_enabled_p ())
1718 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1719 "not vectorized: scalar dependence cycle.\n");
1720 return false;
1723 if (STMT_VINFO_RELEVANT_P (stmt_info))
1725 need_to_vectorize = true;
1726 if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_induction_def
1727 && ! PURE_SLP_STMT (stmt_info))
1728 ok = vectorizable_induction (phi, NULL, NULL, NULL);
1729 else if ((STMT_VINFO_DEF_TYPE (stmt_info) == vect_reduction_def
1730 || STMT_VINFO_DEF_TYPE (stmt_info) == vect_nested_cycle)
1731 && ! PURE_SLP_STMT (stmt_info))
1732 ok = vectorizable_reduction (phi, NULL, NULL, NULL, NULL);
1735 if (ok && STMT_VINFO_LIVE_P (stmt_info))
1736 ok = vectorizable_live_operation (phi, NULL, NULL, -1, NULL);
1738 if (!ok)
1740 if (dump_enabled_p ())
1742 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1743 "not vectorized: relevant phi not "
1744 "supported: ");
1745 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, phi, 0);
1747 return false;
1751 for (gimple_stmt_iterator si = gsi_start_bb (bb); !gsi_end_p (si);
1752 gsi_next (&si))
1754 gimple *stmt = gsi_stmt (si);
1755 if (!gimple_clobber_p (stmt)
1756 && !vect_analyze_stmt (stmt, &need_to_vectorize, NULL, NULL))
1757 return false;
1759 } /* bbs */
1761 /* All operations in the loop are either irrelevant (deal with loop
1762 control, or dead), or only used outside the loop and can be moved
1763 out of the loop (e.g. invariants, inductions). The loop can be
1764 optimized away by scalar optimizations. We're better off not
1765 touching this loop. */
1766 if (!need_to_vectorize)
1768 if (dump_enabled_p ())
1769 dump_printf_loc (MSG_NOTE, vect_location,
1770 "All the computation can be taken out of the loop.\n");
1771 if (dump_enabled_p ())
1772 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1773 "not vectorized: redundant loop. no profit to "
1774 "vectorize.\n");
1775 return false;
1778 return true;
1782 /* Function vect_analyze_loop_2.
1784 Apply a set of analyses on LOOP, and create a loop_vec_info struct
1785 for it. The different analyses will record information in the
1786 loop_vec_info struct. */
1787 static bool
1788 vect_analyze_loop_2 (loop_vec_info loop_vinfo, bool &fatal)
1790 bool ok;
1791 int max_vf = MAX_VECTORIZATION_FACTOR;
1792 int min_vf = 2;
1793 unsigned int n_stmts = 0;
1795 /* The first group of checks is independent of the vector size. */
1796 fatal = true;
1798 /* Find all data references in the loop (which correspond to vdefs/vuses)
1799 and analyze their evolution in the loop. */
1801 basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
1803 loop_p loop = LOOP_VINFO_LOOP (loop_vinfo);
1804 if (!find_loop_nest (loop, &LOOP_VINFO_LOOP_NEST (loop_vinfo)))
1806 if (dump_enabled_p ())
1807 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1808 "not vectorized: loop nest containing two "
1809 "or more consecutive inner loops cannot be "
1810 "vectorized\n");
1811 return false;
1814 for (unsigned i = 0; i < loop->num_nodes; i++)
1815 for (gimple_stmt_iterator gsi = gsi_start_bb (bbs[i]);
1816 !gsi_end_p (gsi); gsi_next (&gsi))
1818 gimple *stmt = gsi_stmt (gsi);
1819 if (is_gimple_debug (stmt))
1820 continue;
1821 ++n_stmts;
1822 if (!find_data_references_in_stmt (loop, stmt,
1823 &LOOP_VINFO_DATAREFS (loop_vinfo)))
1825 if (is_gimple_call (stmt) && loop->safelen)
1827 tree fndecl = gimple_call_fndecl (stmt), op;
1828 if (fndecl != NULL_TREE)
1830 cgraph_node *node = cgraph_node::get (fndecl);
1831 if (node != NULL && node->simd_clones != NULL)
1833 unsigned int j, n = gimple_call_num_args (stmt);
1834 for (j = 0; j < n; j++)
1836 op = gimple_call_arg (stmt, j);
1837 if (DECL_P (op)
1838 || (REFERENCE_CLASS_P (op)
1839 && get_base_address (op)))
1840 break;
1842 op = gimple_call_lhs (stmt);
1843 /* Ignore #pragma omp declare simd functions
1844 if they don't have data references in the
1845 call stmt itself. */
1846 if (j == n
1847 && !(op
1848 && (DECL_P (op)
1849 || (REFERENCE_CLASS_P (op)
1850 && get_base_address (op)))))
1851 continue;
1855 if (dump_enabled_p ())
1856 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1857 "not vectorized: loop contains function "
1858 "calls or data references that cannot "
1859 "be analyzed\n");
1860 return false;
1864 /* Analyze the data references and also adjust the minimal
1865 vectorization factor according to the loads and stores. */
1867 ok = vect_analyze_data_refs (loop_vinfo, &min_vf);
1868 if (!ok)
1870 if (dump_enabled_p ())
1871 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1872 "bad data references.\n");
1873 return false;
1876 /* Classify all cross-iteration scalar data-flow cycles.
1877 Cross-iteration cycles caused by virtual phis are analyzed separately. */
1878 vect_analyze_scalar_cycles (loop_vinfo);
1880 vect_pattern_recog (loop_vinfo);
1882 vect_fixup_scalar_cycles_with_patterns (loop_vinfo);
1884 /* Analyze the access patterns of the data-refs in the loop (consecutive,
1885 complex, etc.). FORNOW: Only handle consecutive access pattern. */
1887 ok = vect_analyze_data_ref_accesses (loop_vinfo);
1888 if (!ok)
1890 if (dump_enabled_p ())
1891 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1892 "bad data access.\n");
1893 return false;
1896 /* Data-flow analysis to detect stmts that do not need to be vectorized. */
1898 ok = vect_mark_stmts_to_be_vectorized (loop_vinfo);
1899 if (!ok)
1901 if (dump_enabled_p ())
1902 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1903 "unexpected pattern.\n");
1904 return false;
 1907  /* The rest of the analysis below, however, depends on the vector size in some way.  */
1908 fatal = false;
1910 /* Analyze data dependences between the data-refs in the loop
1911 and adjust the maximum vectorization factor according to
1912 the dependences.
1913 FORNOW: fail at the first data dependence that we encounter. */
1915 ok = vect_analyze_data_ref_dependences (loop_vinfo, &max_vf);
1916 if (!ok
1917 || max_vf < min_vf)
1919 if (dump_enabled_p ())
1920 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1921 "bad data dependence.\n");
1922 return false;
1924 LOOP_VINFO_MAX_VECT_FACTOR (loop_vinfo) = max_vf;
1926 ok = vect_determine_vectorization_factor (loop_vinfo);
1927 if (!ok)
1929 if (dump_enabled_p ())
1930 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1931 "can't determine vectorization factor.\n");
1932 return false;
1934 if (max_vf < LOOP_VINFO_VECT_FACTOR (loop_vinfo))
1936 if (dump_enabled_p ())
1937 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1938 "bad data dependence.\n");
1939 return false;
1942 /* Compute the scalar iteration cost. */
1943 vect_compute_single_scalar_iteration_cost (loop_vinfo);
1945 int saved_vectorization_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
1946 HOST_WIDE_INT estimated_niter;
1947 unsigned th;
1948 int min_scalar_loop_bound;
1950 /* Check the SLP opportunities in the loop, analyze and build SLP trees. */
1951 ok = vect_analyze_slp (loop_vinfo, n_stmts);
1952 if (!ok)
1953 return false;
1955 /* If there are any SLP instances mark them as pure_slp. */
1956 bool slp = vect_make_slp_decision (loop_vinfo);
1957 if (slp)
1959 /* Find stmts that need to be both vectorized and SLPed. */
1960 vect_detect_hybrid_slp (loop_vinfo);
1962 /* Update the vectorization factor based on the SLP decision. */
1963 vect_update_vf_for_slp (loop_vinfo);
1966 /* This is the point where we can re-start analysis with SLP forced off. */
1967 start_over:
1969 /* Now the vectorization factor is final. */
1970 unsigned vectorization_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
1971 gcc_assert (vectorization_factor != 0);
1973 if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo) && dump_enabled_p ())
1974 dump_printf_loc (MSG_NOTE, vect_location,
1975 "vectorization_factor = %d, niters = "
1976 HOST_WIDE_INT_PRINT_DEC "\n", vectorization_factor,
1977 LOOP_VINFO_INT_NITERS (loop_vinfo));
1979 HOST_WIDE_INT max_niter
1980 = likely_max_stmt_executions_int (LOOP_VINFO_LOOP (loop_vinfo));
1981 if ((LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
1982 && (LOOP_VINFO_INT_NITERS (loop_vinfo) < vectorization_factor))
1983 || (max_niter != -1
1984 && (unsigned HOST_WIDE_INT) max_niter < vectorization_factor))
1986 if (dump_enabled_p ())
1987 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1988 "not vectorized: iteration count smaller than "
1989 "vectorization factor.\n");
1990 return false;
1993 /* Analyze the alignment of the data-refs in the loop.
1994 Fail if a data reference is found that cannot be vectorized. */
1996 ok = vect_analyze_data_refs_alignment (loop_vinfo);
1997 if (!ok)
1999 if (dump_enabled_p ())
2000 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2001 "bad data alignment.\n");
2002 return false;
2005 /* Prune the list of ddrs to be tested at run-time by versioning for alias.
2006 It is important to call pruning after vect_analyze_data_ref_accesses,
2007 since we use grouping information gathered by interleaving analysis. */
2008 ok = vect_prune_runtime_alias_test_list (loop_vinfo);
2009 if (!ok)
2010 return false;
2012 /* Do not invoke vect_enhance_data_refs_alignment for epilogue
2013 vectorization. */
2014 if (!LOOP_VINFO_EPILOGUE_P (loop_vinfo))
2016 /* This pass will decide on using loop versioning and/or loop peeling in
2017 order to enhance the alignment of data references in the loop. */
2018 ok = vect_enhance_data_refs_alignment (loop_vinfo);
2019 if (!ok)
2021 if (dump_enabled_p ())
2022 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2023 "bad data alignment.\n");
2024 return false;
2028 if (slp)
2030 /* Analyze operations in the SLP instances. Note this may
2031 remove unsupported SLP instances which makes the above
2032 SLP kind detection invalid. */
2033 unsigned old_size = LOOP_VINFO_SLP_INSTANCES (loop_vinfo).length ();
2034 vect_slp_analyze_operations (loop_vinfo);
2035 if (LOOP_VINFO_SLP_INSTANCES (loop_vinfo).length () != old_size)
2036 goto again;
2039 /* Scan all the remaining operations in the loop that are not subject
2040 to SLP and make sure they are vectorizable. */
2041 ok = vect_analyze_loop_operations (loop_vinfo);
2042 if (!ok)
2044 if (dump_enabled_p ())
2045 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2046 "bad operation or unsupported loop bound.\n");
2047 return false;
2050 /* If epilog loop is required because of data accesses with gaps,
2051 one additional iteration needs to be peeled. Check if there are
2052 enough iterations for vectorization. */
2053 if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo)
2054 && LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
2056 int vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
2057 tree scalar_niters = LOOP_VINFO_NITERSM1 (loop_vinfo);
2059 if (wi::to_widest (scalar_niters) < vf)
2061 if (dump_enabled_p ())
2062 dump_printf_loc (MSG_NOTE, vect_location,
2063 "loop has no enough iterations to support"
2064 " peeling for gaps.\n");
2065 return false;
2069 /* Analyze cost. Decide if worth while to vectorize. */
2070 int min_profitable_estimate, min_profitable_iters;
2071 vect_estimate_min_profitable_iters (loop_vinfo, &min_profitable_iters,
2072 &min_profitable_estimate);
2074 if (min_profitable_iters < 0)
2076 if (dump_enabled_p ())
2077 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2078 "not vectorized: vectorization not profitable.\n");
2079 if (dump_enabled_p ())
2080 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2081 "not vectorized: vector version will never be "
2082 "profitable.\n");
2083 goto again;
2086 min_scalar_loop_bound = (PARAM_VALUE (PARAM_MIN_VECT_LOOP_BOUND)
2087 * vectorization_factor);
2089 /* Use the cost model only if it is more conservative than user specified
2090 threshold. */
2091 th = (unsigned) MAX (min_scalar_loop_bound, min_profitable_iters);
2093 LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo) = th;
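/* Editor's illustration (hypothetical values): with --param
   min-vect-loop-bound=2 and a vectorization factor of 4,
   min_scalar_loop_bound is 2 * 4 = 8; if the cost model computed
   min_profitable_iters = 10, then th = MAX (8, 10) = 10, i.e. the more
   conservative of the user bound and the cost-model bound.  */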
2095 if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
2096 && LOOP_VINFO_INT_NITERS (loop_vinfo) < th)
2098 if (dump_enabled_p ())
2099 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2100 "not vectorized: vectorization not profitable.\n");
2101 if (dump_enabled_p ())
2102 dump_printf_loc (MSG_NOTE, vect_location,
2103 "not vectorized: iteration count smaller than user "
2104 "specified loop bound parameter or minimum profitable "
2105 "iterations (whichever is more conservative).\n");
2106 goto again;
2109 estimated_niter
2110 = estimated_stmt_executions_int (LOOP_VINFO_LOOP (loop_vinfo));
2111 if (estimated_niter == -1)
2112 estimated_niter = max_niter;
2113 if (estimated_niter != -1
2114 && ((unsigned HOST_WIDE_INT) estimated_niter
2115 < MAX (th, (unsigned) min_profitable_estimate)))
2117 if (dump_enabled_p ())
2118 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2119 "not vectorized: estimated iteration count too "
2120 "small.\n");
2121 if (dump_enabled_p ())
2122 dump_printf_loc (MSG_NOTE, vect_location,
2123 "not vectorized: estimated iteration count smaller "
2124 "than specified loop bound parameter or minimum "
2125 "profitable iterations (whichever is more "
2126 "conservative).\n");
2127 goto again;
2130 /* Decide whether we need to create an epilogue loop to handle
2131 remaining scalar iterations. */
2132 th = ((LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo)
2133 / LOOP_VINFO_VECT_FACTOR (loop_vinfo))
2134 * LOOP_VINFO_VECT_FACTOR (loop_vinfo));
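/* For illustration only: the integer division rounds the cost-model
   threshold down to a multiple of the vectorization factor, e.g. a
   threshold of 10 with a vectorization factor of 4 yields
   th = (10 / 4) * 4 = 8.  */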
2136 if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
2137 && LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) > 0)
2139 if (ctz_hwi (LOOP_VINFO_INT_NITERS (loop_vinfo)
2140 - LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo))
2141 < exact_log2 (LOOP_VINFO_VECT_FACTOR (loop_vinfo)))
2142 LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo) = true;
2144 else if (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo)
2145 || (tree_ctz (LOOP_VINFO_NITERS (loop_vinfo))
2146 < (unsigned)exact_log2 (LOOP_VINFO_VECT_FACTOR (loop_vinfo))
2147 /* In case of versioning, check if the maximum number of
2148 iterations is greater than th. If they are identical,
2149 the epilogue is unnecessary. */
2150 && (!LOOP_REQUIRES_VERSIONING (loop_vinfo)
2151 || (unsigned HOST_WIDE_INT) max_niter > th)))
2152 LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo) = true;
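/* Worked example with hypothetical numbers: for known niters = 100,
   3 iterations peeled for alignment and a vectorization factor of 4,
   the remaining count 100 - 3 = 97 is odd, so its ctz (0) is smaller
   than log2 (4) = 2 and an epilogue loop is required for the leftover
   scalar iterations.  */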
2154 /* If an epilogue loop is required make sure we can create one. */
2155 if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo)
2156 || LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo))
2158 if (dump_enabled_p ())
2159 dump_printf_loc (MSG_NOTE, vect_location, "epilog loop required\n");
2160 if (!vect_can_advance_ivs_p (loop_vinfo)
2161 || !slpeel_can_duplicate_loop_p (LOOP_VINFO_LOOP (loop_vinfo),
2162 single_exit (LOOP_VINFO_LOOP
2163 (loop_vinfo))))
2165 if (dump_enabled_p ())
2166 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2167 "not vectorized: can't create required "
2168 "epilog loop\n");
2169 goto again;
2173 /* During peeling, we need to check if number of loop iterations is
2174 enough for both peeled prolog loop and vector loop. This check
2175 can be merged along with threshold check of loop versioning, so
2176 increase threshold for this case if necessary. */
2177 if (LOOP_REQUIRES_VERSIONING (loop_vinfo)
2178 && (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo)
2179 || LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo)))
2181 unsigned niters_th;
2183 /* Niters for peeled prolog loop. */
2184 if (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) < 0)
2186 struct data_reference *dr = LOOP_VINFO_UNALIGNED_DR (loop_vinfo);
2187 tree vectype = STMT_VINFO_VECTYPE (vinfo_for_stmt (DR_STMT (dr)));
2189 niters_th = TYPE_VECTOR_SUBPARTS (vectype) - 1;
2191 else
2192 niters_th = LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo);
2194 /* Niters for at least one iteration of vectorized loop. */
2195 niters_th += LOOP_VINFO_VECT_FACTOR (loop_vinfo);
2196 /* One additional iteration because of peeling for gap. */
2197 if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo))
2198 niters_th++;
2199 if (LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo) < niters_th)
2200 LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo) = niters_th;
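/* Illustration (assumed values): for a 4-element vector type with the
   alignment peel count unknown at compile time, plus peeling for gaps,
   niters_th = (4 - 1) + 4 + 1 = 8, and the versioning threshold is
   raised to at least that many iterations.  */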
2203 gcc_assert (vectorization_factor
2204 == (unsigned)LOOP_VINFO_VECT_FACTOR (loop_vinfo));
2206 /* Ok to vectorize! */
2207 return true;
2209 again:
2210 /* Try again with SLP forced off, but if we didn't do any SLP there is
2211 no point in re-trying. */
2212 if (!slp)
2213 return false;
2215 /* If there are reduction chains re-trying will fail anyway. */
2216 if (! LOOP_VINFO_REDUCTION_CHAINS (loop_vinfo).is_empty ())
2217 return false;
2219 /* Likewise if the grouped loads or stores in the SLP cannot be handled
2220 via interleaving or lane instructions. */
2221 slp_instance instance;
2222 slp_tree node;
2223 unsigned i, j;
2224 FOR_EACH_VEC_ELT (LOOP_VINFO_SLP_INSTANCES (loop_vinfo), i, instance)
2226 stmt_vec_info vinfo;
2227 vinfo = vinfo_for_stmt
2228 (SLP_TREE_SCALAR_STMTS (SLP_INSTANCE_TREE (instance))[0]);
2229 if (! STMT_VINFO_GROUPED_ACCESS (vinfo))
2230 continue;
2231 vinfo = vinfo_for_stmt (STMT_VINFO_GROUP_FIRST_ELEMENT (vinfo));
2232 unsigned int size = STMT_VINFO_GROUP_SIZE (vinfo);
2233 tree vectype = STMT_VINFO_VECTYPE (vinfo);
2234 if (! vect_store_lanes_supported (vectype, size)
2235 && ! vect_grouped_store_supported (vectype, size))
2236 return false;
2237 FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (instance), j, node)
2239 vinfo = vinfo_for_stmt (SLP_TREE_SCALAR_STMTS (node)[0]);
2240 vinfo = vinfo_for_stmt (STMT_VINFO_GROUP_FIRST_ELEMENT (vinfo));
2241 bool single_element_p = !STMT_VINFO_GROUP_NEXT_ELEMENT (vinfo);
2242 size = STMT_VINFO_GROUP_SIZE (vinfo);
2243 vectype = STMT_VINFO_VECTYPE (vinfo);
2244 if (! vect_load_lanes_supported (vectype, size)
2245 && ! vect_grouped_load_supported (vectype, single_element_p,
2246 size))
2247 return false;
2251 if (dump_enabled_p ())
2252 dump_printf_loc (MSG_NOTE, vect_location,
2253 "re-trying with SLP disabled\n");
2255 /* Roll back state appropriately. No SLP this time. */
2256 slp = false;
2257 /* Restore the vectorization factor as it was without SLP. */
2258 LOOP_VINFO_VECT_FACTOR (loop_vinfo) = saved_vectorization_factor;
2259 /* Free the SLP instances. */
2260 FOR_EACH_VEC_ELT (LOOP_VINFO_SLP_INSTANCES (loop_vinfo), j, instance)
2261 vect_free_slp_instance (instance);
2262 LOOP_VINFO_SLP_INSTANCES (loop_vinfo).release ();
2263 /* Reset SLP type to loop_vect on all stmts. */
2264 for (i = 0; i < LOOP_VINFO_LOOP (loop_vinfo)->num_nodes; ++i)
2266 basic_block bb = LOOP_VINFO_BBS (loop_vinfo)[i];
2267 for (gimple_stmt_iterator si = gsi_start_phis (bb);
2268 !gsi_end_p (si); gsi_next (&si))
2270 stmt_vec_info stmt_info = vinfo_for_stmt (gsi_stmt (si));
2271 STMT_SLP_TYPE (stmt_info) = loop_vect;
2273 for (gimple_stmt_iterator si = gsi_start_bb (bb);
2274 !gsi_end_p (si); gsi_next (&si))
2276 stmt_vec_info stmt_info = vinfo_for_stmt (gsi_stmt (si));
2277 STMT_SLP_TYPE (stmt_info) = loop_vect;
2278 if (STMT_VINFO_IN_PATTERN_P (stmt_info))
2280 stmt_info = vinfo_for_stmt (STMT_VINFO_RELATED_STMT (stmt_info));
2281 STMT_SLP_TYPE (stmt_info) = loop_vect;
2282 for (gimple_stmt_iterator pi
2283 = gsi_start (STMT_VINFO_PATTERN_DEF_SEQ (stmt_info));
2284 !gsi_end_p (pi); gsi_next (&pi))
2286 gimple *pstmt = gsi_stmt (pi);
2287 STMT_SLP_TYPE (vinfo_for_stmt (pstmt)) = loop_vect;
2292 /* Free optimized alias test DDRS. */
2293 LOOP_VINFO_COMP_ALIAS_DDRS (loop_vinfo).release ();
2294 LOOP_VINFO_CHECK_UNEQUAL_ADDRS (loop_vinfo).release ();
2295 /* Reset target cost data. */
2296 destroy_cost_data (LOOP_VINFO_TARGET_COST_DATA (loop_vinfo));
2297 LOOP_VINFO_TARGET_COST_DATA (loop_vinfo)
2298 = init_cost (LOOP_VINFO_LOOP (loop_vinfo));
2299 /* Reset assorted flags. */
2300 LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo) = false;
2301 LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo) = false;
2302 LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo) = 0;
2304 goto start_over;
2307 /* Function vect_analyze_loop.
2309 Apply a set of analyses on LOOP, and create a loop_vec_info struct
2310 for it. The different analyses will record information in the
2311 loop_vec_info struct. If ORIG_LOOP_VINFO is not NULL, the epilogue must
2312 be vectorized. */
2313 loop_vec_info
2314 vect_analyze_loop (struct loop *loop, loop_vec_info orig_loop_vinfo)
2316 loop_vec_info loop_vinfo;
2317 unsigned int vector_sizes;
2319 /* Autodetect first vector size we try. */
2320 current_vector_size = 0;
2321 vector_sizes = targetm.vectorize.autovectorize_vector_sizes ();
2323 if (dump_enabled_p ())
2324 dump_printf_loc (MSG_NOTE, vect_location,
2325 "===== analyze_loop_nest =====\n");
2327 if (loop_outer (loop)
2328 && loop_vec_info_for_loop (loop_outer (loop))
2329 && LOOP_VINFO_VECTORIZABLE_P (loop_vec_info_for_loop (loop_outer (loop))))
2331 if (dump_enabled_p ())
2332 dump_printf_loc (MSG_NOTE, vect_location,
2333 "outer-loop already vectorized.\n");
2334 return NULL;
2337 while (1)
2339 /* Check the CFG characteristics of the loop (nesting, entry/exit). */
2340 loop_vinfo = vect_analyze_loop_form (loop);
2341 if (!loop_vinfo)
2343 if (dump_enabled_p ())
2344 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2345 "bad loop form.\n");
2346 return NULL;
2349 bool fatal = false;
2351 if (orig_loop_vinfo)
2352 LOOP_VINFO_ORIG_LOOP_INFO (loop_vinfo) = orig_loop_vinfo;
2354 if (vect_analyze_loop_2 (loop_vinfo, fatal))
2356 LOOP_VINFO_VECTORIZABLE_P (loop_vinfo) = 1;
2358 return loop_vinfo;
2361 delete loop_vinfo;
2363 vector_sizes &= ~current_vector_size;
2364 if (fatal
2365 || vector_sizes == 0
2366 || current_vector_size == 0)
2367 return NULL;
2369 /* Try the next biggest vector size. */
2370 current_vector_size = 1 << floor_log2 (vector_sizes);
2371 if (dump_enabled_p ())
2372 dump_printf_loc (MSG_NOTE, vect_location,
2373 "***** Re-trying analysis with "
2374 "vector size %d\n", current_vector_size);
2379 /* Function reduction_code_for_scalar_code
2381 Input:
2382 CODE - tree_code of a reduction operations.
2384 Output:
2385 REDUC_CODE - the corresponding tree-code to be used to reduce the
2386 vector of partial results into a single scalar result, or ERROR_MARK
2387 if the operation is a supported reduction operation, but does not have
2388 such a tree-code.
2390 Return FALSE if CODE currently cannot be vectorized as a reduction. */
2392 static bool
2393 reduction_code_for_scalar_code (enum tree_code code,
2394 enum tree_code *reduc_code)
2396 switch (code)
2398 case MAX_EXPR:
2399 *reduc_code = REDUC_MAX_EXPR;
2400 return true;
2402 case MIN_EXPR:
2403 *reduc_code = REDUC_MIN_EXPR;
2404 return true;
2406 case PLUS_EXPR:
2407 *reduc_code = REDUC_PLUS_EXPR;
2408 return true;
2410 case MULT_EXPR:
2411 case MINUS_EXPR:
2412 case BIT_IOR_EXPR:
2413 case BIT_XOR_EXPR:
2414 case BIT_AND_EXPR:
2415 *reduc_code = ERROR_MARK;
2416 return true;
2418 default:
2419 return false;
2424 /* Error reporting helper for vect_is_simple_reduction below. GIMPLE statement
2425 STMT is printed with a message MSG. */
2427 static void
2428 report_vect_op (dump_flags_t msg_type, gimple *stmt, const char *msg)
2430 dump_printf_loc (msg_type, vect_location, "%s", msg);
2431 dump_gimple_stmt (msg_type, TDF_SLIM, stmt, 0);
2435 /* Detect SLP reduction of the form:
2437 #a1 = phi <a5, a0>
2438 a2 = operation (a1)
2439 a3 = operation (a2)
2440 a4 = operation (a3)
2441 a5 = operation (a4)
2443 #a = phi <a5>
2445 PHI is the reduction phi node (#a1 = phi <a5, a0> above)
2446 FIRST_STMT is the first reduction stmt in the chain
2447 (a2 = operation (a1)).
2449 Return TRUE if a reduction chain was detected. */
2451 static bool
2452 vect_is_slp_reduction (loop_vec_info loop_info, gimple *phi,
2453 gimple *first_stmt)
2455 struct loop *loop = (gimple_bb (phi))->loop_father;
2456 struct loop *vect_loop = LOOP_VINFO_LOOP (loop_info);
2457 enum tree_code code;
2458 gimple *current_stmt = NULL, *loop_use_stmt = NULL, *first, *next_stmt;
2459 stmt_vec_info use_stmt_info, current_stmt_info;
2460 tree lhs;
2461 imm_use_iterator imm_iter;
2462 use_operand_p use_p;
2463 int nloop_uses, size = 0, n_out_of_loop_uses;
2464 bool found = false;
2466 if (loop != vect_loop)
2467 return false;
2469 lhs = PHI_RESULT (phi);
2470 code = gimple_assign_rhs_code (first_stmt);
2471 while (1)
2473 nloop_uses = 0;
2474 n_out_of_loop_uses = 0;
2475 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, lhs)
2477 gimple *use_stmt = USE_STMT (use_p);
2478 if (is_gimple_debug (use_stmt))
2479 continue;
2481 /* Check if we got back to the reduction phi. */
2482 if (use_stmt == phi)
2484 loop_use_stmt = use_stmt;
2485 found = true;
2486 break;
2489 if (flow_bb_inside_loop_p (loop, gimple_bb (use_stmt)))
2491 loop_use_stmt = use_stmt;
2492 nloop_uses++;
2494 else
2495 n_out_of_loop_uses++;
2497 /* There can be either a single use in the loop or two uses in
2498 phi nodes. */
2499 if (nloop_uses > 1 || (n_out_of_loop_uses && nloop_uses))
2500 return false;
2503 if (found)
2504 break;
2506 /* We reached a statement with no loop uses. */
2507 if (nloop_uses == 0)
2508 return false;
2510 /* This is a loop exit phi, and we haven't reached the reduction phi. */
2511 if (gimple_code (loop_use_stmt) == GIMPLE_PHI)
2512 return false;
2514 if (!is_gimple_assign (loop_use_stmt)
2515 || code != gimple_assign_rhs_code (loop_use_stmt)
2516 || !flow_bb_inside_loop_p (loop, gimple_bb (loop_use_stmt)))
2517 return false;
2519 /* Insert USE_STMT into reduction chain. */
2520 use_stmt_info = vinfo_for_stmt (loop_use_stmt);
2521 if (current_stmt)
2523 current_stmt_info = vinfo_for_stmt (current_stmt);
2524 GROUP_NEXT_ELEMENT (current_stmt_info) = loop_use_stmt;
2525 GROUP_FIRST_ELEMENT (use_stmt_info)
2526 = GROUP_FIRST_ELEMENT (current_stmt_info);
2528 else
2529 GROUP_FIRST_ELEMENT (use_stmt_info) = loop_use_stmt;
2531 lhs = gimple_assign_lhs (loop_use_stmt);
2532 current_stmt = loop_use_stmt;
2533 size++;
2536 if (!found || loop_use_stmt != phi || size < 2)
2537 return false;
2539 /* Swap the operands, if needed, to make the reduction operand be the second
2540 operand. */
2541 lhs = PHI_RESULT (phi);
2542 next_stmt = GROUP_FIRST_ELEMENT (vinfo_for_stmt (current_stmt));
2543 while (next_stmt)
2545 if (gimple_assign_rhs2 (next_stmt) == lhs)
2547 tree op = gimple_assign_rhs1 (next_stmt);
2548 gimple *def_stmt = NULL;
2550 if (TREE_CODE (op) == SSA_NAME)
2551 def_stmt = SSA_NAME_DEF_STMT (op);
2553 /* Check that the other def is either defined in the loop
2554 ("vect_internal_def"), or it's an induction (defined by a
2555 loop-header phi-node). */
2556 if (def_stmt
2557 && gimple_bb (def_stmt)
2558 && flow_bb_inside_loop_p (loop, gimple_bb (def_stmt))
2559 && (is_gimple_assign (def_stmt)
2560 || is_gimple_call (def_stmt)
2561 || STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def_stmt))
2562 == vect_induction_def
2563 || (gimple_code (def_stmt) == GIMPLE_PHI
2564 && STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def_stmt))
2565 == vect_internal_def
2566 && !is_loop_header_bb_p (gimple_bb (def_stmt)))))
2568 lhs = gimple_assign_lhs (next_stmt);
2569 next_stmt = GROUP_NEXT_ELEMENT (vinfo_for_stmt (next_stmt));
2570 continue;
2573 return false;
2575 else
2577 tree op = gimple_assign_rhs2 (next_stmt);
2578 gimple *def_stmt = NULL;
2580 if (TREE_CODE (op) == SSA_NAME)
2581 def_stmt = SSA_NAME_DEF_STMT (op);
2583 /* Check that the other def is either defined in the loop
2584 ("vect_internal_def"), or it's an induction (defined by a
2585 loop-header phi-node). */
2586 if (def_stmt
2587 && gimple_bb (def_stmt)
2588 && flow_bb_inside_loop_p (loop, gimple_bb (def_stmt))
2589 && (is_gimple_assign (def_stmt)
2590 || is_gimple_call (def_stmt)
2591 || STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def_stmt))
2592 == vect_induction_def
2593 || (gimple_code (def_stmt) == GIMPLE_PHI
2594 && STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def_stmt))
2595 == vect_internal_def
2596 && !is_loop_header_bb_p (gimple_bb (def_stmt)))))
2598 if (dump_enabled_p ())
2600 dump_printf_loc (MSG_NOTE, vect_location, "swapping oprnds: ");
2601 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, next_stmt, 0);
2604 swap_ssa_operands (next_stmt,
2605 gimple_assign_rhs1_ptr (next_stmt),
2606 gimple_assign_rhs2_ptr (next_stmt));
2607 update_stmt (next_stmt);
2609 if (CONSTANT_CLASS_P (gimple_assign_rhs1 (next_stmt)))
2610 LOOP_VINFO_OPERANDS_SWAPPED (loop_info) = true;
2612 else
2613 return false;
2616 lhs = gimple_assign_lhs (next_stmt);
2617 next_stmt = GROUP_NEXT_ELEMENT (vinfo_for_stmt (next_stmt));
2620 /* Save the chain for further analysis in SLP detection. */
2621 first = GROUP_FIRST_ELEMENT (vinfo_for_stmt (current_stmt));
2622 LOOP_VINFO_REDUCTION_CHAINS (loop_info).safe_push (first);
2623 GROUP_SIZE (vinfo_for_stmt (first)) = size;
2625 return true;
2629 /* Function vect_is_simple_reduction
2631 (1) Detect a cross-iteration def-use cycle that represents a simple
2632 reduction computation. We look for the following pattern:
2634 loop_header:
2635 a1 = phi < a0, a2 >
2636 a3 = ...
2637 a2 = operation (a3, a1)
2641 a3 = ...
2642 loop_header:
2643 a1 = phi < a0, a2 >
2644 a2 = operation (a3, a1)
2646 such that:
2647 1. operation is commutative and associative and it is safe to
2648 change the order of the computation
2649 2. no uses for a2 in the loop (a2 is used out of the loop)
2650 3. no uses of a1 in the loop besides the reduction operation
2651 4. no uses of a1 outside the loop.
2653 Conditions 1,4 are tested here.
2654 Conditions 2,3 are tested in vect_mark_stmts_to_be_vectorized.
2656 (2) Detect a cross-iteration def-use cycle in nested loops, i.e.,
2657 nested cycles.
2659 (3) Detect cycles of phi nodes in outer-loop vectorization, i.e., double
2660 reductions:
2662 a1 = phi < a0, a2 >
2663 inner loop (def of a3)
2664 a2 = phi < a3 >
2666 (4) Detect condition expressions, i.e.:
2667 for (int i = 0; i < N; i++)
2668 if (a[i] < val)
2669 ret_val = a[i];
2673 static gimple *
2674 vect_is_simple_reduction (loop_vec_info loop_info, gimple *phi,
2675 bool *double_reduc,
2676 bool need_wrapping_integral_overflow,
2677 enum vect_reduction_type *v_reduc_type)
2679 struct loop *loop = (gimple_bb (phi))->loop_father;
2680 struct loop *vect_loop = LOOP_VINFO_LOOP (loop_info);
2681 gimple *def_stmt, *def1 = NULL, *def2 = NULL, *phi_use_stmt = NULL;
2682 enum tree_code orig_code, code;
2683 tree op1, op2, op3 = NULL_TREE, op4 = NULL_TREE;
2684 tree type;
2685 int nloop_uses;
2686 tree name;
2687 imm_use_iterator imm_iter;
2688 use_operand_p use_p;
2689 bool phi_def;
2691 *double_reduc = false;
2692 *v_reduc_type = TREE_CODE_REDUCTION;
2694 tree phi_name = PHI_RESULT (phi);
2695 /* ??? If there are no uses of the PHI result the inner loop reduction
2696 won't be detected as possibly double-reduction by vectorizable_reduction
2697 because that tries to walk the PHI arg from the preheader edge which
2698 can be constant. See PR60382. */
2699 if (has_zero_uses (phi_name))
2700 return NULL;
2701 nloop_uses = 0;
2702 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, phi_name)
2704 gimple *use_stmt = USE_STMT (use_p);
2705 if (is_gimple_debug (use_stmt))
2706 continue;
2708 if (!flow_bb_inside_loop_p (loop, gimple_bb (use_stmt)))
2710 if (dump_enabled_p ())
2711 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2712 "intermediate value used outside loop.\n");
2714 return NULL;
2717 nloop_uses++;
2718 if (nloop_uses > 1)
2720 if (dump_enabled_p ())
2721 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2722 "reduction value used in loop.\n");
2723 return NULL;
2726 phi_use_stmt = use_stmt;
2729 edge latch_e = loop_latch_edge (loop);
2730 tree loop_arg = PHI_ARG_DEF_FROM_EDGE (phi, latch_e);
2731 if (TREE_CODE (loop_arg) != SSA_NAME)
2733 if (dump_enabled_p ())
2735 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2736 "reduction: not ssa_name: ");
2737 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, loop_arg);
2738 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
2740 return NULL;
2743 def_stmt = SSA_NAME_DEF_STMT (loop_arg);
2744 if (is_gimple_assign (def_stmt))
2746 name = gimple_assign_lhs (def_stmt);
2747 phi_def = false;
2749 else if (gimple_code (def_stmt) == GIMPLE_PHI)
2751 name = PHI_RESULT (def_stmt);
2752 phi_def = true;
2754 else
2756 if (dump_enabled_p ())
2758 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2759 "reduction: unhandled reduction operation: ");
2760 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, def_stmt, 0);
2762 return NULL;
2765 if (! flow_bb_inside_loop_p (loop, gimple_bb (def_stmt)))
2766 return NULL;
2768 nloop_uses = 0;
2769 auto_vec<gphi *, 3> lcphis;
2770 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, name)
2772 gimple *use_stmt = USE_STMT (use_p);
2773 if (is_gimple_debug (use_stmt))
2774 continue;
2775 if (flow_bb_inside_loop_p (loop, gimple_bb (use_stmt)))
2776 nloop_uses++;
2777 else
2778 /* We can have more than one loop-closed PHI. */
2779 lcphis.safe_push (as_a <gphi *> (use_stmt));
2780 if (nloop_uses > 1)
2782 if (dump_enabled_p ())
2783 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2784 "reduction used in loop.\n");
2785 return NULL;
2789 /* If DEF_STMT is a phi node itself, we expect it to have a single argument
2790 defined in the inner loop. */
2791 if (phi_def)
2793 op1 = PHI_ARG_DEF (def_stmt, 0);
2795 if (gimple_phi_num_args (def_stmt) != 1
2796 || TREE_CODE (op1) != SSA_NAME)
2798 if (dump_enabled_p ())
2799 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2800 "unsupported phi node definition.\n");
2802 return NULL;
2805 def1 = SSA_NAME_DEF_STMT (op1);
2806 if (gimple_bb (def1)
2807 && flow_bb_inside_loop_p (loop, gimple_bb (def_stmt))
2808 && loop->inner
2809 && flow_bb_inside_loop_p (loop->inner, gimple_bb (def1))
2810 && is_gimple_assign (def1)
2811 && flow_bb_inside_loop_p (loop->inner, gimple_bb (phi_use_stmt)))
2813 if (dump_enabled_p ())
2814 report_vect_op (MSG_NOTE, def_stmt,
2815 "detected double reduction: ");
2817 *double_reduc = true;
2818 return def_stmt;
2821 return NULL;
2824 /* If we are vectorizing an inner reduction we are executing that
2825 in the original order only in case we are not dealing with a
2826 double reduction. */
2827 bool check_reduction = true;
2828 if (flow_loop_nested_p (vect_loop, loop))
2830 gphi *lcphi;
2831 unsigned i;
2832 check_reduction = false;
2833 FOR_EACH_VEC_ELT (lcphis, i, lcphi)
2834 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, gimple_phi_result (lcphi))
2836 gimple *use_stmt = USE_STMT (use_p);
2837 if (is_gimple_debug (use_stmt))
2838 continue;
2839 if (! flow_bb_inside_loop_p (vect_loop, gimple_bb (use_stmt)))
2840 check_reduction = true;
2844 bool nested_in_vect_loop = flow_loop_nested_p (vect_loop, loop);
2845 code = orig_code = gimple_assign_rhs_code (def_stmt);
2847 /* We can handle "res -= x[i]", which is non-associative by
2848 simply rewriting this into "res += -x[i]". Avoid changing
2849 the gimple instruction for the first simple tests, and only do this
2850 if we're allowed to change code at all. */
2851 if (code == MINUS_EXPR && gimple_assign_rhs2 (def_stmt) != phi_name)
2852 code = PLUS_EXPR;
2854 if (code == COND_EXPR)
2856 if (! nested_in_vect_loop)
2857 *v_reduc_type = COND_REDUCTION;
2859 op3 = gimple_assign_rhs1 (def_stmt);
2860 if (COMPARISON_CLASS_P (op3))
2862 op4 = TREE_OPERAND (op3, 1);
2863 op3 = TREE_OPERAND (op3, 0);
2865 if (op3 == phi_name || op4 == phi_name)
2867 if (dump_enabled_p ())
2868 report_vect_op (MSG_MISSED_OPTIMIZATION, def_stmt,
2869 "reduction: condition depends on previous"
2870 " iteration: ");
2871 return NULL;
2874 op1 = gimple_assign_rhs2 (def_stmt);
2875 op2 = gimple_assign_rhs3 (def_stmt);
2877 else if (!commutative_tree_code (code) || !associative_tree_code (code))
2879 if (dump_enabled_p ())
2880 report_vect_op (MSG_MISSED_OPTIMIZATION, def_stmt,
2881 "reduction: not commutative/associative: ");
2882 return NULL;
2884 else if (get_gimple_rhs_class (code) == GIMPLE_BINARY_RHS)
2886 op1 = gimple_assign_rhs1 (def_stmt);
2887 op2 = gimple_assign_rhs2 (def_stmt);
2889 else
2891 if (dump_enabled_p ())
2892 report_vect_op (MSG_MISSED_OPTIMIZATION, def_stmt,
2893 "reduction: not handled operation: ");
2894 return NULL;
2897 if (TREE_CODE (op1) != SSA_NAME && TREE_CODE (op2) != SSA_NAME)
2899 if (dump_enabled_p ())
2900 report_vect_op (MSG_MISSED_OPTIMIZATION, def_stmt,
2901 "reduction: both uses not ssa_names: ");
2903 return NULL;
2906 type = TREE_TYPE (gimple_assign_lhs (def_stmt));
2907 if ((TREE_CODE (op1) == SSA_NAME
2908 && !types_compatible_p (type,TREE_TYPE (op1)))
2909 || (TREE_CODE (op2) == SSA_NAME
2910 && !types_compatible_p (type, TREE_TYPE (op2)))
2911 || (op3 && TREE_CODE (op3) == SSA_NAME
2912 && !types_compatible_p (type, TREE_TYPE (op3)))
2913 || (op4 && TREE_CODE (op4) == SSA_NAME
2914 && !types_compatible_p (type, TREE_TYPE (op4))))
2916 if (dump_enabled_p ())
2918 dump_printf_loc (MSG_NOTE, vect_location,
2919 "reduction: multiple types: operation type: ");
2920 dump_generic_expr (MSG_NOTE, TDF_SLIM, type);
2921 dump_printf (MSG_NOTE, ", operands types: ");
2922 dump_generic_expr (MSG_NOTE, TDF_SLIM,
2923 TREE_TYPE (op1));
2924 dump_printf (MSG_NOTE, ",");
2925 dump_generic_expr (MSG_NOTE, TDF_SLIM,
2926 TREE_TYPE (op2));
2927 if (op3)
2929 dump_printf (MSG_NOTE, ",");
2930 dump_generic_expr (MSG_NOTE, TDF_SLIM,
2931 TREE_TYPE (op3));
2934 if (op4)
2936 dump_printf (MSG_NOTE, ",");
2937 dump_generic_expr (MSG_NOTE, TDF_SLIM,
2938 TREE_TYPE (op4));
2940 dump_printf (MSG_NOTE, "\n");
2943 return NULL;
2946 /* Check that it's ok to change the order of the computation.
2947 Generally, when vectorizing a reduction we change the order of the
2948 computation. This may change the behavior of the program in some
2949 cases, so we need to check that this is ok. One exception is when
2950 vectorizing an outer-loop: the inner-loop is executed sequentially,
2951 and therefore vectorizing reductions in the inner-loop during
2952 outer-loop vectorization is safe. */
2954 if (*v_reduc_type != COND_REDUCTION
2955 && check_reduction)
2957 /* CHECKME: check for !flag_finite_math_only too? */
2958 if (SCALAR_FLOAT_TYPE_P (type) && !flag_associative_math)
2960 /* Changing the order of operations changes the semantics. */
2961 if (dump_enabled_p ())
2962 report_vect_op (MSG_MISSED_OPTIMIZATION, def_stmt,
2963 "reduction: unsafe fp math optimization: ");
2964 return NULL;
2966 else if (INTEGRAL_TYPE_P (type))
2968 if (!operation_no_trapping_overflow (type, code))
2970 /* Changing the order of operations changes the semantics. */
2971 if (dump_enabled_p ())
2972 report_vect_op (MSG_MISSED_OPTIMIZATION, def_stmt,
2973 "reduction: unsafe int math optimization"
2974 " (overflow traps): ");
2975 return NULL;
2977 if (need_wrapping_integral_overflow
2978 && !TYPE_OVERFLOW_WRAPS (type)
2979 && operation_can_overflow (code))
2981 /* Changing the order of operations changes the semantics. */
2982 if (dump_enabled_p ())
2983 report_vect_op (MSG_MISSED_OPTIMIZATION, def_stmt,
2984 "reduction: unsafe int math optimization"
2985 " (overflow doesn't wrap): ");
2986 return NULL;
2989 else if (SAT_FIXED_POINT_TYPE_P (type))
2991 /* Changing the order of operations changes the semantics. */
2992 if (dump_enabled_p ())
2993 report_vect_op (MSG_MISSED_OPTIMIZATION, def_stmt,
2994 "reduction: unsafe fixed-point math optimization: ");
2995 return NULL;
2999 /* Reduction is safe. We're dealing with one of the following:
3000 1) integer arithmetic and no trapv
3001 2) floating point arithmetic, and special flags permit this optimization
3002 3) nested cycle (i.e., outer loop vectorization). */
3003 if (TREE_CODE (op1) == SSA_NAME)
3004 def1 = SSA_NAME_DEF_STMT (op1);
3006 if (TREE_CODE (op2) == SSA_NAME)
3007 def2 = SSA_NAME_DEF_STMT (op2);
3009 if (code != COND_EXPR
3010 && ((!def1 || gimple_nop_p (def1)) && (!def2 || gimple_nop_p (def2))))
3012 if (dump_enabled_p ())
3013 report_vect_op (MSG_NOTE, def_stmt, "reduction: no defs for operands: ");
3014 return NULL;
3017 /* Check that one def is the reduction def, defined by PHI,
3018 the other def is either defined in the loop ("vect_internal_def"),
3019 or it's an induction (defined by a loop-header phi-node). */
3021 if (def2 && def2 == phi
3022 && (code == COND_EXPR
3023 || !def1 || gimple_nop_p (def1)
3024 || !flow_bb_inside_loop_p (loop, gimple_bb (def1))
3025 || (def1 && flow_bb_inside_loop_p (loop, gimple_bb (def1))
3026 && (is_gimple_assign (def1)
3027 || is_gimple_call (def1)
3028 || STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def1))
3029 == vect_induction_def
3030 || (gimple_code (def1) == GIMPLE_PHI
3031 && STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def1))
3032 == vect_internal_def
3033 && !is_loop_header_bb_p (gimple_bb (def1)))))))
3035 if (dump_enabled_p ())
3036 report_vect_op (MSG_NOTE, def_stmt, "detected reduction: ");
3037 return def_stmt;
3040 if (def1 && def1 == phi
3041 && (code == COND_EXPR
3042 || !def2 || gimple_nop_p (def2)
3043 || !flow_bb_inside_loop_p (loop, gimple_bb (def2))
3044 || (def2 && flow_bb_inside_loop_p (loop, gimple_bb (def2))
3045 && (is_gimple_assign (def2)
3046 || is_gimple_call (def2)
3047 || STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def2))
3048 == vect_induction_def
3049 || (gimple_code (def2) == GIMPLE_PHI
3050 && STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def2))
3051 == vect_internal_def
3052 && !is_loop_header_bb_p (gimple_bb (def2)))))))
3054 if (! nested_in_vect_loop && orig_code != MINUS_EXPR)
3056 /* Check if we can swap operands (just for simplicity - so that
3057 the rest of the code can assume that the reduction variable
3058 is always the last (second) argument). */
3059 if (code == COND_EXPR)
3061 /* Swap cond_expr by inverting the condition. */
3062 tree cond_expr = gimple_assign_rhs1 (def_stmt);
3063 enum tree_code invert_code = ERROR_MARK;
3064 enum tree_code cond_code = TREE_CODE (cond_expr);
3066 if (TREE_CODE_CLASS (cond_code) == tcc_comparison)
3068 bool honor_nans = HONOR_NANS (TREE_OPERAND (cond_expr, 0));
3069 invert_code = invert_tree_comparison (cond_code, honor_nans);
3071 if (invert_code != ERROR_MARK)
3073 TREE_SET_CODE (cond_expr, invert_code);
3074 swap_ssa_operands (def_stmt,
3075 gimple_assign_rhs2_ptr (def_stmt),
3076 gimple_assign_rhs3_ptr (def_stmt));
3078 else
3080 if (dump_enabled_p ())
3081 report_vect_op (MSG_NOTE, def_stmt,
3082 "detected reduction: cannot swap operands "
3083 "for cond_expr");
3084 return NULL;
3087 else
3088 swap_ssa_operands (def_stmt, gimple_assign_rhs1_ptr (def_stmt),
3089 gimple_assign_rhs2_ptr (def_stmt));
3091 if (dump_enabled_p ())
3092 report_vect_op (MSG_NOTE, def_stmt,
3093 "detected reduction: need to swap operands: ");
3095 if (CONSTANT_CLASS_P (gimple_assign_rhs1 (def_stmt)))
3096 LOOP_VINFO_OPERANDS_SWAPPED (loop_info) = true;
3098 else
3100 if (dump_enabled_p ())
3101 report_vect_op (MSG_NOTE, def_stmt, "detected reduction: ");
3104 return def_stmt;
3107 /* Try to find SLP reduction chain. */
3108 if (! nested_in_vect_loop
3109 && code != COND_EXPR
3110 && orig_code != MINUS_EXPR
3111 && vect_is_slp_reduction (loop_info, phi, def_stmt))
3113 if (dump_enabled_p ())
3114 report_vect_op (MSG_NOTE, def_stmt,
3115 "reduction: detected reduction chain: ");
3117 return def_stmt;
3120 /* Dissolve a group possibly half-built by vect_is_slp_reduction. */
3121 gimple *first = GROUP_FIRST_ELEMENT (vinfo_for_stmt (def_stmt));
3122 while (first)
3124 gimple *next = GROUP_NEXT_ELEMENT (vinfo_for_stmt (first));
3125 GROUP_FIRST_ELEMENT (vinfo_for_stmt (first)) = NULL;
3126 GROUP_NEXT_ELEMENT (vinfo_for_stmt (first)) = NULL;
3127 first = next;
3130 /* Look for the expression computing loop_arg from loop PHI result. */
3131 auto_vec<std::pair<ssa_op_iter, use_operand_p> > path;
3132 auto_bitmap visited;
3133 tree lookfor = PHI_RESULT (phi);
3134 ssa_op_iter curri;
3135 use_operand_p curr = op_iter_init_phiuse (&curri, as_a <gphi *>(phi),
3136 SSA_OP_USE);
3137 while (USE_FROM_PTR (curr) != loop_arg)
3138 curr = op_iter_next_use (&curri);
3139 curri.i = curri.numops;
3142 path.safe_push (std::make_pair (curri, curr));
3143 tree use = USE_FROM_PTR (curr);
3144 if (use == lookfor)
3145 break;
3146 gimple *def = SSA_NAME_DEF_STMT (use);
3147 if (gimple_nop_p (def)
3148 || ! flow_bb_inside_loop_p (loop, gimple_bb (def)))
3150 pop:
3153 std::pair<ssa_op_iter, use_operand_p> x = path.pop ();
3154 curri = x.first;
3155 curr = x.second;
3157 curr = op_iter_next_use (&curri);
3158 /* Skip already visited or non-SSA operands (from iterating
3159 over PHI args). */
3160 while (curr != NULL_USE_OPERAND_P
3161 && (TREE_CODE (USE_FROM_PTR (curr)) != SSA_NAME
3162 || ! bitmap_set_bit (visited,
3163 SSA_NAME_VERSION
3164 (USE_FROM_PTR (curr)))));
3166 while (curr == NULL_USE_OPERAND_P && ! path.is_empty ());
3167 if (curr == NULL_USE_OPERAND_P)
3168 break;
3170 else
3172 if (gimple_code (def) == GIMPLE_PHI)
3173 curr = op_iter_init_phiuse (&curri, as_a <gphi *>(def), SSA_OP_USE);
3174 else
3175 curr = op_iter_init_use (&curri, def, SSA_OP_USE);
3176 while (curr != NULL_USE_OPERAND_P
3177 && (TREE_CODE (USE_FROM_PTR (curr)) != SSA_NAME
3178 || ! bitmap_set_bit (visited,
3179 SSA_NAME_VERSION
3180 (USE_FROM_PTR (curr)))))
3181 curr = op_iter_next_use (&curri);
3182 if (curr == NULL_USE_OPERAND_P)
3183 goto pop;
3186 while (1);
3187 if (dump_file && (dump_flags & TDF_DETAILS))
3189 dump_printf_loc (MSG_NOTE, vect_location,
3190 "reduction path: ");
3191 unsigned i;
3192 std::pair<ssa_op_iter, use_operand_p> *x;
3193 FOR_EACH_VEC_ELT (path, i, x)
3195 dump_generic_expr (MSG_NOTE, TDF_SLIM, USE_FROM_PTR (x->second));
3196 dump_printf (MSG_NOTE, " ");
3198 dump_printf (MSG_NOTE, "\n");
3201 /* Check whether the reduction path detected is valid. */
3202 bool fail = path.length () == 0;
3203 bool neg = false;
3204 for (unsigned i = 1; i < path.length (); ++i)
3206 gimple *use_stmt = USE_STMT (path[i].second);
3207 tree op = USE_FROM_PTR (path[i].second);
3208 if (! has_single_use (op)
3209 || ! is_gimple_assign (use_stmt))
3211 fail = true;
3212 break;
3214 if (gimple_assign_rhs_code (use_stmt) != code)
3216 if (code == PLUS_EXPR
3217 && gimple_assign_rhs_code (use_stmt) == MINUS_EXPR)
3219 /* Track whether we negate the reduction value each iteration. */
3220 if (gimple_assign_rhs2 (use_stmt) == op)
3221 neg = ! neg;
3223 else
3225 fail = true;
3226 break;
3230 if (! fail && ! neg)
3231 return def_stmt;
3233 if (dump_enabled_p ())
3235 report_vect_op (MSG_MISSED_OPTIMIZATION, def_stmt,
3236 "reduction: unknown pattern: ");
3239 return NULL;
3242 /* Wrapper around vect_is_simple_reduction, which will modify code
3243 in-place if it enables detection of more reductions. Arguments
3244 as there. */
3246 gimple *
3247 vect_force_simple_reduction (loop_vec_info loop_info, gimple *phi,
3248 bool *double_reduc,
3249 bool need_wrapping_integral_overflow)
3251 enum vect_reduction_type v_reduc_type;
3252 gimple *def = vect_is_simple_reduction (loop_info, phi, double_reduc,
3253 need_wrapping_integral_overflow,
3254 &v_reduc_type);
3255 if (def)
3257 stmt_vec_info reduc_def_info = vinfo_for_stmt (phi);
3258 STMT_VINFO_REDUC_TYPE (reduc_def_info) = v_reduc_type;
3259 STMT_VINFO_REDUC_DEF (reduc_def_info) = def;
3260 reduc_def_info = vinfo_for_stmt (def);
3261 STMT_VINFO_REDUC_DEF (reduc_def_info) = phi;
3263 return def;
3266 /* Calculate cost of peeling the loop PEEL_ITERS_PROLOGUE times. */
3268 vect_get_known_peeling_cost (loop_vec_info loop_vinfo, int peel_iters_prologue,
3269 int *peel_iters_epilogue,
3270 stmt_vector_for_cost *scalar_cost_vec,
3271 stmt_vector_for_cost *prologue_cost_vec,
3272 stmt_vector_for_cost *epilogue_cost_vec)
3274 int retval = 0;
3275 int vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
3277 if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
3279 *peel_iters_epilogue = vf/2;
3280 if (dump_enabled_p ())
3281 dump_printf_loc (MSG_NOTE, vect_location,
3282 "cost model: epilogue peel iters set to vf/2 "
3283 "because loop iterations are unknown .\n");
3285 /* If peeled iterations are known but number of scalar loop
3286 iterations are unknown, count a taken branch per peeled loop. */
3287 retval = record_stmt_cost (prologue_cost_vec, 1, cond_branch_taken,
3288 NULL, 0, vect_prologue);
3289 retval = record_stmt_cost (prologue_cost_vec, 1, cond_branch_taken,
3290 NULL, 0, vect_epilogue);
3292 else
3294 int niters = LOOP_VINFO_INT_NITERS (loop_vinfo);
3295 peel_iters_prologue = niters < peel_iters_prologue ?
3296 niters : peel_iters_prologue;
3297 *peel_iters_epilogue = (niters - peel_iters_prologue) % vf;
3298 /* If we need to peel for gaps, but no peeling is required, we have to
3299 peel VF iterations. */
3300 if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo) && !*peel_iters_epilogue)
3301 *peel_iters_epilogue = vf;
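/* Hypothetical example: with niters = 100, peel_iters_prologue = 3 and
   vf = 8, the epilogue peels (100 - 3) % 8 = 1 iteration; if peeling
   for gaps were required and that remainder were 0, a full vf = 8
   iterations would be counted instead.  */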
3304 stmt_info_for_cost *si;
3305 int j;
3306 if (peel_iters_prologue)
3307 FOR_EACH_VEC_ELT (*scalar_cost_vec, j, si)
3309 stmt_vec_info stmt_info
3310 = si->stmt ? vinfo_for_stmt (si->stmt) : NULL;
3311 retval += record_stmt_cost (prologue_cost_vec,
3312 si->count * peel_iters_prologue,
3313 si->kind, stmt_info, si->misalign,
3314 vect_prologue);
3316 if (*peel_iters_epilogue)
3317 FOR_EACH_VEC_ELT (*scalar_cost_vec, j, si)
3319 stmt_vec_info stmt_info
3320 = si->stmt ? vinfo_for_stmt (si->stmt) : NULL;
3321 retval += record_stmt_cost (epilogue_cost_vec,
3322 si->count * *peel_iters_epilogue,
3323 si->kind, stmt_info, si->misalign,
3324 vect_epilogue);
3327 return retval;
3330 /* Function vect_estimate_min_profitable_iters
3332 Return the number of iterations required for the vector version of the
3333 loop to be profitable relative to the cost of the scalar version of the
3334 loop.
3336 *RET_MIN_PROFITABLE_NITERS is a cost model profitability threshold
3337 of iterations for vectorization. -1 value means loop vectorization
3338 is not profitable. This returned value may be used for dynamic
3339 profitability check.
3341 *RET_MIN_PROFITABLE_ESTIMATE is a profitability threshold to be used
3342 for static check against estimated number of iterations. */
3344 static void
3345 vect_estimate_min_profitable_iters (loop_vec_info loop_vinfo,
3346 int *ret_min_profitable_niters,
3347 int *ret_min_profitable_estimate)
3349 int min_profitable_iters;
3350 int min_profitable_estimate;
3351 int peel_iters_prologue;
3352 int peel_iters_epilogue;
3353 unsigned vec_inside_cost = 0;
3354 int vec_outside_cost = 0;
3355 unsigned vec_prologue_cost = 0;
3356 unsigned vec_epilogue_cost = 0;
3357 int scalar_single_iter_cost = 0;
3358 int scalar_outside_cost = 0;
3359 int vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
3360 int npeel = LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo);
3361 void *target_cost_data = LOOP_VINFO_TARGET_COST_DATA (loop_vinfo);
3363 /* Cost model disabled. */
3364 if (unlimited_cost_model (LOOP_VINFO_LOOP (loop_vinfo)))
3366 dump_printf_loc (MSG_NOTE, vect_location, "cost model disabled.\n");
3367 *ret_min_profitable_niters = 0;
3368 *ret_min_profitable_estimate = 0;
3369 return;
3372 /* Requires loop versioning tests to handle misalignment. */
3373 if (LOOP_REQUIRES_VERSIONING_FOR_ALIGNMENT (loop_vinfo))
3375 /* FIXME: Make cost depend on complexity of individual check. */
3376 unsigned len = LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).length ();
3377 (void) add_stmt_cost (target_cost_data, len, vector_stmt, NULL, 0,
3378 vect_prologue);
3379 dump_printf (MSG_NOTE,
3380 "cost model: Adding cost of checks for loop "
3381 "versioning to treat misalignment.\n");
3384 /* Requires loop versioning with alias checks. */
3385 if (LOOP_REQUIRES_VERSIONING_FOR_ALIAS (loop_vinfo))
3387 /* FIXME: Make cost depend on complexity of individual check. */
3388 unsigned len = LOOP_VINFO_COMP_ALIAS_DDRS (loop_vinfo).length ();
3389 (void) add_stmt_cost (target_cost_data, len, vector_stmt, NULL, 0,
3390 vect_prologue);
3391 len = LOOP_VINFO_CHECK_UNEQUAL_ADDRS (loop_vinfo).length ();
3392 if (len)
3393 /* Count LEN - 1 ANDs and LEN comparisons. */
3394 (void) add_stmt_cost (target_cost_data, len * 2 - 1, scalar_stmt,
3395 NULL, 0, vect_prologue);
3396 dump_printf (MSG_NOTE,
3397 "cost model: Adding cost of checks for loop "
3398 "versioning aliasing.\n");
3401 /* Requires loop versioning with niter checks. */
3402 if (LOOP_REQUIRES_VERSIONING_FOR_NITERS (loop_vinfo))
3404 /* FIXME: Make cost depend on complexity of individual check. */
3405 (void) add_stmt_cost (target_cost_data, 1, vector_stmt, NULL, 0,
3406 vect_prologue);
3407 dump_printf (MSG_NOTE,
3408 "cost model: Adding cost of checks for loop "
3409 "versioning niters.\n");
3412 if (LOOP_REQUIRES_VERSIONING (loop_vinfo))
3413 (void) add_stmt_cost (target_cost_data, 1, cond_branch_taken, NULL, 0,
3414 vect_prologue);
3416 /* Count statements in scalar loop. Using this as scalar cost for a single
3417 iteration for now.
3419 TODO: Add outer loop support.
3421 TODO: Consider assigning different costs to different scalar
3422 statements. */
3424 scalar_single_iter_cost
3425 = LOOP_VINFO_SINGLE_SCALAR_ITERATION_COST (loop_vinfo);
3427 /* Add additional cost for the peeled instructions in prologue and epilogue
3428 loop.
3430 FORNOW: If we don't know the value of peel_iters for prologue or epilogue
3431 at compile-time, we assume it's vf/2 (the worst case would be vf-1).
3433 TODO: Build an expression that represents peel_iters for prologue and
3434 epilogue to be used in a run-time test. */
3436 if (npeel < 0)
3438 peel_iters_prologue = vf/2;
3439 dump_printf (MSG_NOTE, "cost model: "
3440 "prologue peel iters set to vf/2.\n");
3442 /* If peeling for alignment is unknown, loop bound of main loop becomes
3443 unknown. */
3444 peel_iters_epilogue = vf/2;
3445 dump_printf (MSG_NOTE, "cost model: "
3446 "epilogue peel iters set to vf/2 because "
3447 "peeling for alignment is unknown.\n");
3449 /* If peeled iterations are unknown, count a taken branch and a not taken
3450 branch per peeled loop. Even if scalar loop iterations are known,
3451 vector iterations are not known since peeled prologue iterations are
3452 not known. Hence guards remain the same. */
3453 (void) add_stmt_cost (target_cost_data, 1, cond_branch_taken,
3454 NULL, 0, vect_prologue);
3455 (void) add_stmt_cost (target_cost_data, 1, cond_branch_not_taken,
3456 NULL, 0, vect_prologue);
3457 (void) add_stmt_cost (target_cost_data, 1, cond_branch_taken,
3458 NULL, 0, vect_epilogue);
3459 (void) add_stmt_cost (target_cost_data, 1, cond_branch_not_taken,
3460 NULL, 0, vect_epilogue);
3461 stmt_info_for_cost *si;
3462 int j;
3463 FOR_EACH_VEC_ELT (LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo), j, si)
3465 struct _stmt_vec_info *stmt_info
3466 = si->stmt ? vinfo_for_stmt (si->stmt) : NULL;
3467 (void) add_stmt_cost (target_cost_data,
3468 si->count * peel_iters_prologue,
3469 si->kind, stmt_info, si->misalign,
3470 vect_prologue);
3471 (void) add_stmt_cost (target_cost_data,
3472 si->count * peel_iters_epilogue,
3473 si->kind, stmt_info, si->misalign,
3474 vect_epilogue);
3477 else
3479 stmt_vector_for_cost prologue_cost_vec, epilogue_cost_vec;
3480 stmt_info_for_cost *si;
3481 int j;
3482 void *data = LOOP_VINFO_TARGET_COST_DATA (loop_vinfo);
3484 prologue_cost_vec.create (2);
3485 epilogue_cost_vec.create (2);
3486 peel_iters_prologue = npeel;
3488 (void) vect_get_known_peeling_cost (loop_vinfo, peel_iters_prologue,
3489 &peel_iters_epilogue,
3490 &LOOP_VINFO_SCALAR_ITERATION_COST
3491 (loop_vinfo),
3492 &prologue_cost_vec,
3493 &epilogue_cost_vec);
3495 FOR_EACH_VEC_ELT (prologue_cost_vec, j, si)
3497 struct _stmt_vec_info *stmt_info
3498 = si->stmt ? vinfo_for_stmt (si->stmt) : NULL;
3499 (void) add_stmt_cost (data, si->count, si->kind, stmt_info,
3500 si->misalign, vect_prologue);
3503 FOR_EACH_VEC_ELT (epilogue_cost_vec, j, si)
3505 struct _stmt_vec_info *stmt_info
3506 = si->stmt ? vinfo_for_stmt (si->stmt) : NULL;
3507 (void) add_stmt_cost (data, si->count, si->kind, stmt_info,
3508 si->misalign, vect_epilogue);
3511 prologue_cost_vec.release ();
3512 epilogue_cost_vec.release ();
3515 /* FORNOW: The scalar outside cost is incremented in one of the
3516 following ways:
3518 1. The vectorizer checks for alignment and aliasing and generates
3519 a condition that allows dynamic vectorization. A cost model
3520 check is ANDED with the versioning condition. Hence scalar code
3521 path now has the added cost of the versioning check.
3523 if (cost > th & versioning_check)
3524 jmp to vector code
3526 Hence run-time scalar is incremented by not-taken branch cost.
3528 2. The vectorizer then checks if a prologue is required. If the
3529 cost model check was not done before during versioning, it has to
3530 be done before the prologue check.
3532 if (cost <= th)
3533 prologue = scalar_iters
3534 if (prologue == 0)
3535 jmp to vector code
3536 else
3537 execute prologue
3538 if (prologue == num_iters)
3539 go to exit
3541 Hence the run-time scalar cost is incremented by a taken branch,
3542 plus a not-taken branch, plus a taken branch cost.
3544 3. The vectorizer then checks if an epilogue is required. If the
3545 cost model check was not done before during prologue check, it
3546 has to be done with the epilogue check.
3548 if (prologue == 0)
3549 jmp to vector code
3550 else
3551 execute prologue
3552 if (prologue == num_iters)
3553 go to exit
3554 vector code:
3555 if ((cost <= th) | (scalar_iters-prologue-epilogue == 0))
3556 jmp to epilogue
3558 Hence the run-time scalar cost should be incremented by 2 taken
3559 branches.
3561 TODO: The back end may reorder the BBs differently and reverse
3562 conditions/branch directions. Change the estimates below to
3563 something more reasonable. */
3565 /* If the number of iterations is known and we do not do versioning, we can
3566 decide whether to vectorize at compile time. Hence the scalar version
3567 does not carry cost model guard costs. */
3568 if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
3569 || LOOP_REQUIRES_VERSIONING (loop_vinfo))
3571 /* Cost model check occurs at versioning. */
3572 if (LOOP_REQUIRES_VERSIONING (loop_vinfo))
3573 scalar_outside_cost += vect_get_stmt_cost (cond_branch_not_taken);
3574 else
3576 /* Cost model check occurs at prologue generation. */
3577 if (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) < 0)
3578 scalar_outside_cost += 2 * vect_get_stmt_cost (cond_branch_taken)
3579 + vect_get_stmt_cost (cond_branch_not_taken);
3580 /* Cost model check occurs at epilogue generation. */
3581 else
3582 scalar_outside_cost += 2 * vect_get_stmt_cost (cond_branch_taken);
3586 /* Complete the target-specific cost calculations. */
3587 finish_cost (LOOP_VINFO_TARGET_COST_DATA (loop_vinfo), &vec_prologue_cost,
3588 &vec_inside_cost, &vec_epilogue_cost);
3590 vec_outside_cost = (int)(vec_prologue_cost + vec_epilogue_cost);
3592 if (dump_enabled_p ())
3594 dump_printf_loc (MSG_NOTE, vect_location, "Cost model analysis: \n");
3595 dump_printf (MSG_NOTE, " Vector inside of loop cost: %d\n",
3596 vec_inside_cost);
3597 dump_printf (MSG_NOTE, " Vector prologue cost: %d\n",
3598 vec_prologue_cost);
3599 dump_printf (MSG_NOTE, " Vector epilogue cost: %d\n",
3600 vec_epilogue_cost);
3601 dump_printf (MSG_NOTE, " Scalar iteration cost: %d\n",
3602 scalar_single_iter_cost);
3603 dump_printf (MSG_NOTE, " Scalar outside cost: %d\n",
3604 scalar_outside_cost);
3605 dump_printf (MSG_NOTE, " Vector outside cost: %d\n",
3606 vec_outside_cost);
3607 dump_printf (MSG_NOTE, " prologue iterations: %d\n",
3608 peel_iters_prologue);
3609 dump_printf (MSG_NOTE, " epilogue iterations: %d\n",
3610 peel_iters_epilogue);
3613 /* Calculate number of iterations required to make the vector version
3614 profitable, relative to the loop bodies only. The following condition
3615 must hold true:
3616 SIC * niters + SOC > VIC * ((niters-PL_ITERS-EP_ITERS)/VF) + VOC
3617 where
3618 SIC = scalar iteration cost, VIC = vector iteration cost,
3619 VOC = vector outside cost, VF = vectorization factor,
3620 PL_ITERS = prologue iterations, EP_ITERS= epilogue iterations
3621 SOC = scalar outside cost for run time cost model check. */
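/* A worked instance with made-up costs: for SIC = 4, VIC = 6, VF = 4,
   VOC = 40, SOC = 8 and no peeling, the inequality becomes
   4 * niters + 8 > 1.5 * niters + 40, i.e. niters > 12.8, so the
   computation below yields min_profitable_iters = 13 (before it is
   raised below to at least VF plus the prologue iterations).  */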
3623 if ((scalar_single_iter_cost * vf) > (int) vec_inside_cost)
3625 if (vec_outside_cost <= 0)
3626 min_profitable_iters = 0;
3627 else
3629 min_profitable_iters = ((vec_outside_cost - scalar_outside_cost) * vf
3630 - vec_inside_cost * peel_iters_prologue
3631 - vec_inside_cost * peel_iters_epilogue)
3632 / ((scalar_single_iter_cost * vf)
3633 - vec_inside_cost);
3635 if ((scalar_single_iter_cost * vf * min_profitable_iters)
3636 <= (((int) vec_inside_cost * min_profitable_iters)
3637 + (((int) vec_outside_cost - scalar_outside_cost) * vf)))
3638 min_profitable_iters++;
3641 /* vector version will never be profitable. */
3642 else
3644 if (LOOP_VINFO_LOOP (loop_vinfo)->force_vectorize)
3645 warning_at (vect_location, OPT_Wopenmp_simd, "vectorization "
3646 "did not happen for a simd loop");
3648 if (dump_enabled_p ())
3649 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3650 "cost model: the vector iteration cost = %d "
3651 "divided by the scalar iteration cost = %d "
3652 "is greater or equal to the vectorization factor = %d"
3653 ".\n",
3654 vec_inside_cost, scalar_single_iter_cost, vf);
3655 *ret_min_profitable_niters = -1;
3656 *ret_min_profitable_estimate = -1;
3657 return;
3660 dump_printf (MSG_NOTE,
3661 " Calculated minimum iters for profitability: %d\n",
3662 min_profitable_iters);
3664 /* We want the vectorized loop to execute at least once. */
3665 if (min_profitable_iters < (vf + peel_iters_prologue))
3666 min_profitable_iters = vf + peel_iters_prologue;
3668 if (dump_enabled_p ())
3669 dump_printf_loc (MSG_NOTE, vect_location,
3670 " Runtime profitability threshold = %d\n",
3671 min_profitable_iters);
3673 *ret_min_profitable_niters = min_profitable_iters;
3675 /* Calculate number of iterations required to make the vector version
3676 profitable, relative to the loop bodies only.
3678 Non-vectorized variant is SIC * niters and it must win over vector
3679 variant on the expected loop trip count. The following condition must hold true:
3680 SIC * niters > VIC * ((niters-PL_ITERS-EP_ITERS)/VF) + VOC + SOC */
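/* Continuing the made-up numbers from the runtime check above
   (SIC = 4, VIC = 6, VF = 4, VOC = 40, SOC = 8, no peeling), the
   static condition is 4 * niters > 1.5 * niters + 48, and the integer
   computation below gives (40 + 8) * 4 / 10 = 19, which is then
   clamped to be at least min_profitable_iters.  */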
3682 if (vec_outside_cost <= 0)
3683 min_profitable_estimate = 0;
3684 else
3686 min_profitable_estimate = ((vec_outside_cost + scalar_outside_cost) * vf
3687 - vec_inside_cost * peel_iters_prologue
3688 - vec_inside_cost * peel_iters_epilogue)
3689 / ((scalar_single_iter_cost * vf)
3690 - vec_inside_cost);
3692 min_profitable_estimate = MAX (min_profitable_estimate, min_profitable_iters);
3693 if (dump_enabled_p ())
3694 dump_printf_loc (MSG_NOTE, vect_location,
3695 " Static estimate profitability threshold = %d\n",
3696 min_profitable_estimate);
3698 *ret_min_profitable_estimate = min_profitable_estimate;
3701 /* Writes into SEL a mask for a vec_perm, equivalent to a vec_shr by OFFSET
3702 vector elements (not bits) for a vector with NELT elements. */
3703 static void
3704 calc_vec_perm_mask_for_shift (unsigned int offset, unsigned int nelt,
3705 vec_perm_indices *sel)
3707 unsigned int i;
3709 for (i = 0; i < nelt; i++)
3710 sel->quick_push ((i + offset) & (2 * nelt - 1));
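/* Example for illustration: with nelt = 4 and offset = 1 the selector
   pushed above is {1, 2, 3, 4}, i.e. each output element is taken one
   position further along the (conceptually doubled) input, which
   shifts the vector down by one element.  */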
3713 /* Checks whether the target supports whole-vector shifts for vectors of mode
3714 MODE. This is the case if _either_ the platform handles vec_shr_optab, _or_
3715 it supports vec_perm_const with masks for all necessary shift amounts. */
3716 static bool
3717 have_whole_vector_shift (machine_mode mode)
3719 if (optab_handler (vec_shr_optab, mode) != CODE_FOR_nothing)
3720 return true;
3722 if (direct_optab_handler (vec_perm_const_optab, mode) == CODE_FOR_nothing)
3723 return false;
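/* The reduction epilogue only ever shifts by NELT/2, NELT/4, ..., 1,
   so those are the offsets checked below.  */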
3725 unsigned int i, nelt = GET_MODE_NUNITS (mode);
3726 auto_vec_perm_indices sel (nelt);
3728 for (i = nelt/2; i >= 1; i/=2)
3730 sel.truncate (0);
3731 calc_vec_perm_mask_for_shift (i, nelt, &sel);
3732 if (!can_vec_perm_p (mode, false, &sel))
3733 return false;
3735 return true;
3738 /* TODO: Close dependency between vect_model_*_cost and vectorizable_*
3739 functions. Design better to avoid maintenance issues. */
3741 /* Function vect_model_reduction_cost.
3743 Models cost for a reduction operation, including the vector ops
3744 generated within the strip-mine loop, the initial definition before
3745 the loop, and the epilogue code that must be generated. */
3747 static void
3748 vect_model_reduction_cost (stmt_vec_info stmt_info, enum tree_code reduc_code,
3749 int ncopies)
3751 int prologue_cost = 0, epilogue_cost = 0;
3752 enum tree_code code;
3753 optab optab;
3754 tree vectype;
3755 gimple *orig_stmt;
3756 machine_mode mode;
3757 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
3758 struct loop *loop = NULL;
3759 void *target_cost_data;
3761 if (loop_vinfo)
3763 loop = LOOP_VINFO_LOOP (loop_vinfo);
3764 target_cost_data = LOOP_VINFO_TARGET_COST_DATA (loop_vinfo);
3766 else
3767 target_cost_data = BB_VINFO_TARGET_COST_DATA (STMT_VINFO_BB_VINFO (stmt_info));
3769 /* Condition reductions generate two reductions in the loop. */
3770 if (STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info) == COND_REDUCTION)
3771 ncopies *= 2;
3773 /* Cost of reduction op inside loop. */
3774 unsigned inside_cost = add_stmt_cost (target_cost_data, ncopies, vector_stmt,
3775 stmt_info, 0, vect_body);
3777 vectype = STMT_VINFO_VECTYPE (stmt_info);
3778 mode = TYPE_MODE (vectype);
3779 orig_stmt = STMT_VINFO_RELATED_STMT (stmt_info);
3781 if (!orig_stmt)
3782 orig_stmt = STMT_VINFO_STMT (stmt_info);
3784 code = gimple_assign_rhs_code (orig_stmt);
3786 /* Add in cost for initial definition.
3787 For cond reduction we have four vectors: initial index, step, initial
3788 result of the data reduction, initial value of the index reduction. */
3789 int prologue_stmts = STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info)
3790 == COND_REDUCTION ? 4 : 1;
3791 prologue_cost += add_stmt_cost (target_cost_data, prologue_stmts,
3792 scalar_to_vec, stmt_info, 0,
3793 vect_prologue);
3795 /* Determine cost of epilogue code.
3797 We have a reduction operator that will reduce the vector in one statement.
3798 Also requires scalar extract. */
3800 if (!loop || !nested_in_vect_loop_p (loop, orig_stmt))
3802 if (reduc_code != ERROR_MARK)
3804 if (STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info) == COND_REDUCTION)
3806 /* An EQ stmt and a COND_EXPR stmt. */
3807 epilogue_cost += add_stmt_cost (target_cost_data, 2,
3808 vector_stmt, stmt_info, 0,
3809 vect_epilogue);
3810 /* Reduction of the max index and a reduction of the found
3811 values. */
3812 epilogue_cost += add_stmt_cost (target_cost_data, 2,
3813 vec_to_scalar, stmt_info, 0,
3814 vect_epilogue);
3815 /* A broadcast of the max value. */
3816 epilogue_cost += add_stmt_cost (target_cost_data, 1,
3817 scalar_to_vec, stmt_info, 0,
3818 vect_epilogue);
3820 else
3822 epilogue_cost += add_stmt_cost (target_cost_data, 1, vector_stmt,
3823 stmt_info, 0, vect_epilogue);
3824 epilogue_cost += add_stmt_cost (target_cost_data, 1,
3825 vec_to_scalar, stmt_info, 0,
3826 vect_epilogue);
3829 else if (STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info) == COND_REDUCTION)
3831 unsigned nunits = TYPE_VECTOR_SUBPARTS (vectype);
3832 /* Extraction of scalar elements. */
3833 epilogue_cost += add_stmt_cost (target_cost_data, 2 * nunits,
3834 vec_to_scalar, stmt_info, 0,
3835 vect_epilogue);
3836 /* Scalar max reductions via COND_EXPR / MAX_EXPR. */
3837 epilogue_cost += add_stmt_cost (target_cost_data, 2 * nunits - 3,
3838 scalar_stmt, stmt_info, 0,
3839 vect_epilogue);
3841 else
3843 int vec_size_in_bits = tree_to_uhwi (TYPE_SIZE (vectype));
3844 tree bitsize =
3845 TYPE_SIZE (TREE_TYPE (gimple_assign_lhs (orig_stmt)));
3846 int element_bitsize = tree_to_uhwi (bitsize);
3847 int nelements = vec_size_in_bits / element_bitsize;
3849 if (code == COND_EXPR)
3850 code = MAX_EXPR;
3852 optab = optab_for_tree_code (code, vectype, optab_default);
3854 /* We have a whole vector shift available. */
3855 if (optab != unknown_optab
3856 && VECTOR_MODE_P (mode)
3857 && optab_handler (optab, mode) != CODE_FOR_nothing
3858 && have_whole_vector_shift (mode))
3860 /* Final reduction via vector shifts and the reduction operator.
3861 Also requires scalar extract. */
3862 epilogue_cost += add_stmt_cost (target_cost_data,
3863 exact_log2 (nelements) * 2,
3864 vector_stmt, stmt_info, 0,
3865 vect_epilogue);
3866 epilogue_cost += add_stmt_cost (target_cost_data, 1,
3867 vec_to_scalar, stmt_info, 0,
3868 vect_epilogue);
3870 else
3871 /* Use extracts and reduction op for final reduction. For N
3872 elements, we have N extracts and N-1 reduction ops. */
3873 epilogue_cost += add_stmt_cost (target_cost_data,
3874 nelements + nelements - 1,
3875 vector_stmt, stmt_info, 0,
3876 vect_epilogue);
3880 if (dump_enabled_p ())
3881 dump_printf (MSG_NOTE,
3882 "vect_model_reduction_cost: inside_cost = %d, "
3883 "prologue_cost = %d, epilogue_cost = %d .\n", inside_cost,
3884 prologue_cost, epilogue_cost);
3888 /* Function vect_model_induction_cost.
3890 Models cost for induction operations. */
3892 static void
3893 vect_model_induction_cost (stmt_vec_info stmt_info, int ncopies)
3895 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
3896 void *target_cost_data = LOOP_VINFO_TARGET_COST_DATA (loop_vinfo);
3897 unsigned inside_cost, prologue_cost;
3899 if (PURE_SLP_STMT (stmt_info))
3900 return;
3902 /* loop cost for vec_loop. */
3903 inside_cost = add_stmt_cost (target_cost_data, ncopies, vector_stmt,
3904 stmt_info, 0, vect_body);
3906 /* prologue cost for vec_init and vec_step. */
3907 prologue_cost = add_stmt_cost (target_cost_data, 2, scalar_to_vec,
3908 stmt_info, 0, vect_prologue);
3910 if (dump_enabled_p ())
3911 dump_printf_loc (MSG_NOTE, vect_location,
3912 "vect_model_induction_cost: inside_cost = %d, "
3913 "prologue_cost = %d .\n", inside_cost, prologue_cost);
3918 /* Function get_initial_def_for_reduction
3920 Input:
3921 STMT - a stmt that performs a reduction operation in the loop.
3922 INIT_VAL - the initial value of the reduction variable
3924 Output:
3925 ADJUSTMENT_DEF - a tree that holds a value to be added to the final result
3926 of the reduction (used for adjusting the epilog - see below).
3927 Return a vector variable, initialized according to the operation that STMT
3928 performs. This vector will be used as the initial value of the
3929 vector of partial results.
3931 Option1 (adjust in epilog): Initialize the vector as follows:
3932 add/bit or/xor: [0,0,...,0,0]
3933 mult/bit and: [1,1,...,1,1]
3934 min/max/cond_expr: [init_val,init_val,..,init_val,init_val]
3935 and when necessary (e.g. add/mult case) let the caller know
3936 that it needs to adjust the result by init_val.
3938 Option2: Initialize the vector as follows:
3939 add/bit or/xor: [init_val,0,0,...,0]
3940 mult/bit and: [init_val,1,1,...,1]
3941 min/max/cond_expr: [init_val,init_val,...,init_val]
3942 and no adjustments are needed.
3944 For example, for the following code:
3946 s = init_val;
3947 for (i=0;i<n;i++)
3948 s = s + a[i];
3950 STMT is 's = s + a[i]', and the reduction variable is 's'.
3951 For a vector of 4 units, we want to return either [0,0,0,init_val],
3952 or [0,0,0,0] and let the caller know that it needs to adjust
3953 the result at the end by 'init_val'.
3955 FORNOW, we use the 'adjust in epilog' scheme (Option1) when ADJUSTMENT_DEF
3956 is not NULL, because its initialization vector is simpler (same element
3957 in all entries), and Option2 otherwise.
3959 A cost model should help decide between these two schemes. */
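/* Similarly for a product: with MULT_EXPR and init_val = 5, Option1
   returns [1,1,1,1] and reports an adjustment of 5 (the epilog then
   multiplies the final product by init_val), while Option2 returns
   [5,1,1,1] and needs no adjustment.  */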
3961 tree
3962 get_initial_def_for_reduction (gimple *stmt, tree init_val,
3963 tree *adjustment_def)
3965 stmt_vec_info stmt_vinfo = vinfo_for_stmt (stmt);
3966 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_vinfo);
3967 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
3968 tree scalar_type = TREE_TYPE (init_val);
3969 tree vectype = get_vectype_for_scalar_type (scalar_type);
3970 int nunits;
3971 enum tree_code code = gimple_assign_rhs_code (stmt);
3972 tree def_for_init;
3973 tree init_def;
3974 int i;
3975 bool nested_in_vect_loop = false;
3976 REAL_VALUE_TYPE real_init_val = dconst0;
3977 int int_init_val = 0;
3978 gimple *def_stmt = NULL;
3979 gimple_seq stmts = NULL;
3981 gcc_assert (vectype);
3982 nunits = TYPE_VECTOR_SUBPARTS (vectype);
3984 gcc_assert (POINTER_TYPE_P (scalar_type) || INTEGRAL_TYPE_P (scalar_type)
3985 || SCALAR_FLOAT_TYPE_P (scalar_type));
3987 if (nested_in_vect_loop_p (loop, stmt))
3988 nested_in_vect_loop = true;
3989 else
3990 gcc_assert (loop == (gimple_bb (stmt))->loop_father);
3992 /* In case of double reduction we only create a vector variable to be put
3993 in the reduction phi node. The actual statement creation is done in
3994 vect_create_epilog_for_reduction. */
3995 if (adjustment_def && nested_in_vect_loop
3996 && TREE_CODE (init_val) == SSA_NAME
3997 && (def_stmt = SSA_NAME_DEF_STMT (init_val))
3998 && gimple_code (def_stmt) == GIMPLE_PHI
3999 && flow_bb_inside_loop_p (loop, gimple_bb (def_stmt))
4000 && vinfo_for_stmt (def_stmt)
4001 && STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def_stmt))
4002 == vect_double_reduction_def)
4004 *adjustment_def = NULL;
4005 return vect_create_destination_var (init_val, vectype);
4008 /* In case of a nested reduction do not use an adjustment def, as that
4009 case is not handled correctly by the epilogue generation if ncopies
4010 is not one. */
4011 if (adjustment_def && nested_in_vect_loop)
4013 *adjustment_def = NULL;
4014 return vect_get_vec_def_for_operand (init_val, stmt);
4017 switch (code)
4019 case WIDEN_SUM_EXPR:
4020 case DOT_PROD_EXPR:
4021 case SAD_EXPR:
4022 case PLUS_EXPR:
4023 case MINUS_EXPR:
4024 case BIT_IOR_EXPR:
4025 case BIT_XOR_EXPR:
4026 case MULT_EXPR:
4027 case BIT_AND_EXPR:
4029 /* ADJUSTMENT_DEF is NULL when called from
4030 vect_create_epilog_for_reduction to vectorize double reduction. */
4031 if (adjustment_def)
4032 *adjustment_def = init_val;
4034 if (code == MULT_EXPR)
4036 real_init_val = dconst1;
4037 int_init_val = 1;
4040 if (code == BIT_AND_EXPR)
4041 int_init_val = -1;
4043 if (SCALAR_FLOAT_TYPE_P (scalar_type))
4044 def_for_init = build_real (scalar_type, real_init_val);
4045 else
4046 def_for_init = build_int_cst (scalar_type, int_init_val);
4048 if (adjustment_def)
4049 /* Option1: the first element is '0' or '1' as well. */
4050 init_def = gimple_build_vector_from_val (&stmts, vectype,
4051 def_for_init);
4052 else
4054 /* Option2: the first element is INIT_VAL. */
4055 auto_vec<tree, 32> elts (nunits);
4056 elts.quick_push (init_val);
4057 for (i = 1; i < nunits; ++i)
4058 elts.quick_push (def_for_init);
4059 init_def = gimple_build_vector (&stmts, vectype, elts);
4062 break;
4064 case MIN_EXPR:
4065 case MAX_EXPR:
4066 case COND_EXPR:
4068 if (adjustment_def)
4070 *adjustment_def = NULL_TREE;
4071 if (STMT_VINFO_VEC_REDUCTION_TYPE (stmt_vinfo) != COND_REDUCTION)
4073 init_def = vect_get_vec_def_for_operand (init_val, stmt);
4074 break;
4077 init_val = gimple_convert (&stmts, TREE_TYPE (vectype), init_val);
4078 init_def = gimple_build_vector_from_val (&stmts, vectype, init_val);
4080 break;
4082 default:
4083 gcc_unreachable ();
4086 if (stmts)
4087 gsi_insert_seq_on_edge_immediate (loop_preheader_edge (loop), stmts);
4088 return init_def;
4091 /* Get at the initial defs for the reduction PHIs in SLP_NODE.
4092 NUMBER_OF_VECTORS is the number of vector defs to create. */
4094 static void
4095 get_initial_defs_for_reduction (slp_tree slp_node,
4096 vec<tree> *vec_oprnds,
4097 unsigned int number_of_vectors,
4098 enum tree_code code, bool reduc_chain)
4100 vec<gimple *> stmts = SLP_TREE_SCALAR_STMTS (slp_node);
4101 gimple *stmt = stmts[0];
4102 stmt_vec_info stmt_vinfo = vinfo_for_stmt (stmt);
4103 unsigned nunits;
4104 unsigned j, number_of_places_left_in_vector;
4105 tree vector_type, scalar_type;
4106 tree vop;
4107 int group_size = stmts.length ();
4108 unsigned int vec_num, i;
4109 unsigned number_of_copies = 1;
4110 vec<tree> voprnds;
4111 voprnds.create (number_of_vectors);
4112 tree neutral_op = NULL;
4113 struct loop *loop;
4115 vector_type = STMT_VINFO_VECTYPE (stmt_vinfo);
4116 scalar_type = TREE_TYPE (vector_type);
4117 nunits = TYPE_VECTOR_SUBPARTS (vector_type);
4119 gcc_assert (STMT_VINFO_DEF_TYPE (stmt_vinfo) == vect_reduction_def);
4121 loop = (gimple_bb (stmt))->loop_father;
4122 gcc_assert (loop);
4123 edge pe = loop_preheader_edge (loop);
4125 /* op is the reduction operand of the first stmt already. */
4126 /* For additional copies (see the explanation of NUMBER_OF_COPIES below)
4127 we need either neutral operands or the original operands. See
4128 get_initial_def_for_reduction() for details. */
4129 switch (code)
4131 case WIDEN_SUM_EXPR:
4132 case DOT_PROD_EXPR:
4133 case SAD_EXPR:
4134 case PLUS_EXPR:
4135 case MINUS_EXPR:
4136 case BIT_IOR_EXPR:
4137 case BIT_XOR_EXPR:
4138 neutral_op = build_zero_cst (scalar_type);
4139 break;
4141 case MULT_EXPR:
4142 neutral_op = build_one_cst (scalar_type);
4143 break;
4145 case BIT_AND_EXPR:
4146 neutral_op = build_all_ones_cst (scalar_type);
4147 break;
4149 /* For MIN/MAX we don't have an easy neutral operand, but
4150 the initial values can be used fine here. Only for
4151 a reduction chain do we have to force a neutral element. */
4152 case MAX_EXPR:
4153 case MIN_EXPR:
4154 if (! reduc_chain)
4155 neutral_op = NULL;
4156 else
4157 neutral_op = PHI_ARG_DEF_FROM_EDGE (stmt, pe);
4158 break;
4160 default:
4161 gcc_assert (! reduc_chain);
4162 neutral_op = NULL;
4165 /* NUMBER_OF_COPIES is the number of times we need to use the same values in
4166 created vectors. It is greater than 1 if unrolling is performed.
4168 For example, we have two scalar operands, s1 and s2 (e.g., group of
4169 strided accesses of size two), while NUNITS is four (i.e., four scalars
4170 of this type can be packed in a vector). The output vector will contain
4171 two copies of each scalar operand: {s1, s2, s1, s2}. (NUMBER_OF_COPIES
4172 will be 2).
4174 If GROUP_SIZE > NUNITS, the scalars will be split into several vectors
4175 containing the operands.
4177 For example, NUNITS is four as before, and the group size is 8
4178 (s1, s2, ..., s8). We will create two vectors {s1, s2, s3, s4} and
4179 {s5, s6, s7, s8}. */
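/* In the first example above NUMBER_OF_COPIES is 4 * 1 / 2 = 2, and in
   the second it is 4 * 2 / 8 = 1, matching the formula below.  */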
4181 number_of_copies = nunits * number_of_vectors / group_size;
4183 number_of_places_left_in_vector = nunits;
4184 auto_vec<tree, 32> elts (nunits);
4185 elts.quick_grow (nunits);
4186 for (j = 0; j < number_of_copies; j++)
4188 for (i = group_size - 1; stmts.iterate (i, &stmt); i--)
4190 tree op;
4191 /* Get the def before the loop. In reduction chain we have only
4192 one initial value. */
4193 if ((j != (number_of_copies - 1)
4194 || (reduc_chain && i != 0))
4195 && neutral_op)
4196 op = neutral_op;
4197 else
4198 op = PHI_ARG_DEF_FROM_EDGE (stmt, pe);
4200 /* Create 'vect_ = {op0,op1,...,opn}'. */
4201 number_of_places_left_in_vector--;
4202 elts[number_of_places_left_in_vector] = op;
4204 if (number_of_places_left_in_vector == 0)
4206 gimple_seq ctor_seq = NULL;
4207 tree init = gimple_build_vector (&ctor_seq, vector_type, elts);
4208 if (ctor_seq != NULL)
4209 gsi_insert_seq_on_edge_immediate (pe, ctor_seq);
4210 voprnds.quick_push (init);
4212 number_of_places_left_in_vector = nunits;
4217 /* Since the vectors are created in the reverse order, we should invert
4218 them. */
4219 vec_num = voprnds.length ();
4220 for (j = vec_num; j != 0; j--)
4222 vop = voprnds[j - 1];
4223 vec_oprnds->quick_push (vop);
4226 voprnds.release ();
4228 /* In case that VF is greater than the unrolling factor needed for the SLP
4229 group of stmts, NUMBER_OF_VECTORS to be created is greater than
4230 NUMBER_OF_SCALARS/NUNITS or NUNITS/NUMBER_OF_SCALARS, and hence we have
4231 to replicate the vectors. */
4232 tree neutral_vec = NULL;
4233 while (number_of_vectors > vec_oprnds->length ())
4235 if (neutral_op)
4237 if (!neutral_vec)
4239 gimple_seq ctor_seq = NULL;
4240 neutral_vec = gimple_build_vector_from_val
4241 (&ctor_seq, vector_type, neutral_op);
4242 if (ctor_seq != NULL)
4243 gsi_insert_seq_on_edge_immediate (pe, ctor_seq);
4245 vec_oprnds->quick_push (neutral_vec);
4247 else
4249 for (i = 0; vec_oprnds->iterate (i, &vop) && i < vec_num; i++)
4250 vec_oprnds->quick_push (vop);
4256 /* Function vect_create_epilog_for_reduction
4258 Create code at the loop-epilog to finalize the result of a reduction
4259 computation.
4261 VECT_DEFS is list of vector of partial results, i.e., the lhs's of vector
4262 reduction statements.
4263 STMT is the scalar reduction stmt that is being vectorized.
4264 NCOPIES is > 1 in case the vectorization factor (VF) is bigger than the
4265 number of elements that we can fit in a vectype (nunits). In this case
4266 we have to generate more than one vector stmt - i.e - we need to "unroll"
4267 the vector stmt by a factor VF/nunits. For more details see documentation
4268 in vectorizable_operation.
4269 REDUC_CODE is the tree-code for the epilog reduction.
4270 REDUCTION_PHIS is a list of the phi-nodes that carry the reduction
4271 computation.
4272 REDUC_INDEX is the index of the operand in the right hand side of the
4273 statement that is defined by REDUCTION_PHI.
4274 DOUBLE_REDUC is TRUE if double reduction phi nodes should be handled.
4275 SLP_NODE is an SLP node containing a group of reduction statements. The
4276 first one in this group is STMT.
4278 This function:
4279 1. Creates the reduction def-use cycles: sets the arguments for
4280 REDUCTION_PHIS:
4281 The loop-entry argument is the vectorized initial-value of the reduction.
4282 The loop-latch argument is taken from VECT_DEFS - the vector of partial
4283 sums.
4284 2. "Reduces" each vector of partial results VECT_DEFS into a single result,
4285 by applying the operation specified by REDUC_CODE if available, or by
4286 other means (whole-vector shifts or a scalar loop).
4287 The function also creates a new phi node at the loop exit to preserve
4288 loop-closed form, as illustrated below.
4290 The flow at the entry to this function:
4292 loop:
4293 vec_def = phi <null, null> # REDUCTION_PHI
4294 VECT_DEF = vector_stmt # vectorized form of STMT
4295 s_loop = scalar_stmt # (scalar) STMT
4296 loop_exit:
4297 s_out0 = phi <s_loop> # (scalar) EXIT_PHI
4298 use <s_out0>
4299 use <s_out0>
4301 The above is transformed by this function into:
4303 loop:
4304 vec_def = phi <vec_init, VECT_DEF> # REDUCTION_PHI
4305 VECT_DEF = vector_stmt # vectorized form of STMT
4306 s_loop = scalar_stmt # (scalar) STMT
4307 loop_exit:
4308 s_out0 = phi <s_loop> # (scalar) EXIT_PHI
4309 v_out1 = phi <VECT_DEF> # NEW_EXIT_PHI
4310 v_out2 = reduce <v_out1>
4311 s_out3 = extract_field <v_out2, 0>
4312 s_out4 = adjust_result <s_out3>
4313 use <s_out4>
4314 use <s_out4>
4317 static void
4318 vect_create_epilog_for_reduction (vec<tree> vect_defs, gimple *stmt,
4319 gimple *reduc_def_stmt,
4320 int ncopies, enum tree_code reduc_code,
4321 vec<gimple *> reduction_phis,
4322 bool double_reduc,
4323 slp_tree slp_node,
4324 slp_instance slp_node_instance)
4326 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
4327 stmt_vec_info prev_phi_info;
4328 tree vectype;
4329 machine_mode mode;
4330 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
4331 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo), *outer_loop = NULL;
4332 basic_block exit_bb;
4333 tree scalar_dest;
4334 tree scalar_type;
4335 gimple *new_phi = NULL, *phi;
4336 gimple_stmt_iterator exit_gsi;
4337 tree vec_dest;
4338 tree new_temp = NULL_TREE, new_dest, new_name, new_scalar_dest;
4339 gimple *epilog_stmt = NULL;
4340 enum tree_code code = gimple_assign_rhs_code (stmt);
4341 gimple *exit_phi;
4342 tree bitsize;
4343 tree adjustment_def = NULL;
4344 tree vec_initial_def = NULL;
4345 tree expr, def, initial_def = NULL;
4346 tree orig_name, scalar_result;
4347 imm_use_iterator imm_iter, phi_imm_iter;
4348 use_operand_p use_p, phi_use_p;
4349 gimple *use_stmt, *orig_stmt, *reduction_phi = NULL;
4350 bool nested_in_vect_loop = false;
4351 auto_vec<gimple *> new_phis;
4352 auto_vec<gimple *> inner_phis;
4353 enum vect_def_type dt = vect_unknown_def_type;
4354 int j, i;
4355 auto_vec<tree> scalar_results;
4356 unsigned int group_size = 1, k, ratio;
4357 auto_vec<tree> vec_initial_defs;
4358 auto_vec<gimple *> phis;
4359 bool slp_reduc = false;
4360 tree new_phi_result;
4361 gimple *inner_phi = NULL;
4362 tree induction_index = NULL_TREE;
4364 if (slp_node)
4365 group_size = SLP_TREE_SCALAR_STMTS (slp_node).length ();
4367 if (nested_in_vect_loop_p (loop, stmt))
4369 outer_loop = loop;
4370 loop = loop->inner;
4371 nested_in_vect_loop = true;
4372 gcc_assert (!slp_node);
4375 vectype = STMT_VINFO_VECTYPE (stmt_info);
4376 gcc_assert (vectype);
4377 mode = TYPE_MODE (vectype);
4379 /* 1. Create the reduction def-use cycle:
4380 Set the arguments of REDUCTION_PHIS, i.e., transform
4382 loop:
4383 vec_def = phi <null, null> # REDUCTION_PHI
4384 VECT_DEF = vector_stmt # vectorized form of STMT
4387 into:
4389 loop:
4390 vec_def = phi <vec_init, VECT_DEF> # REDUCTION_PHI
4391 VECT_DEF = vector_stmt # vectorized form of STMT
4394 (in case of SLP, do it for all the phis). */
4396 /* Get the loop-entry arguments. */
4397 enum vect_def_type initial_def_dt = vect_unknown_def_type;
4398 if (slp_node)
4400 unsigned vec_num = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node);
4401 vec_initial_defs.reserve (vec_num);
4402 get_initial_defs_for_reduction (slp_node_instance->reduc_phis,
4403 &vec_initial_defs, vec_num, code,
4404 GROUP_FIRST_ELEMENT (stmt_info));
4406 else
4408 /* Get at the scalar def before the loop, that defines the initial value
4409 of the reduction variable. */
4410 gimple *def_stmt;
4411 initial_def = PHI_ARG_DEF_FROM_EDGE (reduc_def_stmt,
4412 loop_preheader_edge (loop));
4413 vect_is_simple_use (initial_def, loop_vinfo, &def_stmt, &initial_def_dt);
4414 vec_initial_def = get_initial_def_for_reduction (stmt, initial_def,
4415 &adjustment_def);
4416 vec_initial_defs.create (1);
4417 vec_initial_defs.quick_push (vec_initial_def);
4420 /* Set phi nodes arguments. */
4421 FOR_EACH_VEC_ELT (reduction_phis, i, phi)
4423 tree vec_init_def = vec_initial_defs[i];
4424 tree def = vect_defs[i];
4425 for (j = 0; j < ncopies; j++)
4427 if (j != 0)
4429 phi = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (phi));
4430 if (nested_in_vect_loop)
4431 vec_init_def
4432 = vect_get_vec_def_for_stmt_copy (initial_def_dt,
4433 vec_init_def);
4436 /* Set the loop-entry arg of the reduction-phi. */
4438 if (STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info)
4439 == INTEGER_INDUC_COND_REDUCTION)
4441 /* Initialise the reduction phi to zero. This prevents non-zero
4442 initial values interfering with the reduction op. */
4443 gcc_assert (ncopies == 1);
4444 gcc_assert (i == 0);
4446 tree vec_init_def_type = TREE_TYPE (vec_init_def);
4447 tree zero_vec = build_zero_cst (vec_init_def_type);
4449 add_phi_arg (as_a <gphi *> (phi), zero_vec,
4450 loop_preheader_edge (loop), UNKNOWN_LOCATION);
4452 else
4453 add_phi_arg (as_a <gphi *> (phi), vec_init_def,
4454 loop_preheader_edge (loop), UNKNOWN_LOCATION);
4456 /* Set the loop-latch arg for the reduction-phi. */
4457 if (j > 0)
4458 def = vect_get_vec_def_for_stmt_copy (vect_unknown_def_type, def);
4460 add_phi_arg (as_a <gphi *> (phi), def, loop_latch_edge (loop),
4461 UNKNOWN_LOCATION);
4463 if (dump_enabled_p ())
4465 dump_printf_loc (MSG_NOTE, vect_location,
4466 "transform reduction: created def-use cycle: ");
4467 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, phi, 0);
4468 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, SSA_NAME_DEF_STMT (def), 0);
4473 /* For cond reductions we want to create a new vector (INDEX_COND_EXPR)
4474 which is updated with the current index of the loop for every match of
4475 the original loop's cond_expr (VEC_STMT). This results in a vector
4476 containing the last time the condition passed for that vector lane.
4477 The first match will be a 1 to allow 0 to be used for non-matching
4478 indexes. If there are no matches at all then the vector will be all
4479 zeroes. */
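/* For example, with four lanes and two vector iterations the index
   vector created below takes the values {1,2,3,4} and then {5,6,7,8};
   a lane whose condition matched only in the first vector iteration
   keeps a value from {1,2,3,4}, one that matched again in the second
   keeps a value from {5,6,7,8}, and a lane that never matched stays
   at 0.  */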
4480 if (STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info) == COND_REDUCTION)
4482 tree indx_before_incr, indx_after_incr;
4483 int nunits_out = TYPE_VECTOR_SUBPARTS (vectype);
4484 int k;
4486 gimple *vec_stmt = STMT_VINFO_VEC_STMT (stmt_info);
4487 gcc_assert (gimple_assign_rhs_code (vec_stmt) == VEC_COND_EXPR);
4489 int scalar_precision
4490 = GET_MODE_PRECISION (SCALAR_TYPE_MODE (TREE_TYPE (vectype)));
4491 tree cr_index_scalar_type = make_unsigned_type (scalar_precision);
4492 tree cr_index_vector_type = build_vector_type
4493 (cr_index_scalar_type, TYPE_VECTOR_SUBPARTS (vectype));
4495 /* First we create a simple vector induction variable which starts
4496 with the values {1,2,3,...} (SERIES_VECT) and increments by the
4497 vector size (STEP). */
4499 /* Create a {1,2,3,...} vector. */
4500 auto_vec<tree, 32> vtemp (nunits_out);
4501 for (k = 0; k < nunits_out; ++k)
4502 vtemp.quick_push (build_int_cst (cr_index_scalar_type, k + 1));
4503 tree series_vect = build_vector (cr_index_vector_type, vtemp);
4505 /* Create a vector of the step value. */
4506 tree step = build_int_cst (cr_index_scalar_type, nunits_out);
4507 tree vec_step = build_vector_from_val (cr_index_vector_type, step);
4509 /* Create an induction variable. */
4510 gimple_stmt_iterator incr_gsi;
4511 bool insert_after;
4512 standard_iv_increment_position (loop, &incr_gsi, &insert_after);
4513 create_iv (series_vect, vec_step, NULL_TREE, loop, &incr_gsi,
4514 insert_after, &indx_before_incr, &indx_after_incr);
4516 /* Next create a new phi node vector (NEW_PHI_TREE) which starts
4517 filled with zeros (VEC_ZERO). */
4519 /* Create a vector of 0s. */
4520 tree zero = build_zero_cst (cr_index_scalar_type);
4521 tree vec_zero = build_vector_from_val (cr_index_vector_type, zero);
4523 /* Create a vector phi node. */
4524 tree new_phi_tree = make_ssa_name (cr_index_vector_type);
4525 new_phi = create_phi_node (new_phi_tree, loop->header);
4526 set_vinfo_for_stmt (new_phi,
4527 new_stmt_vec_info (new_phi, loop_vinfo));
4528 add_phi_arg (as_a <gphi *> (new_phi), vec_zero,
4529 loop_preheader_edge (loop), UNKNOWN_LOCATION);
4531 /* Now take the condition from the loop's original cond_expr
4532 (VEC_STMT) and produce a new cond_expr (INDEX_COND_EXPR) which for
4533 every match uses values from the induction variable
4534 (INDEX_BEFORE_INCR) otherwise uses values from the phi node
4535 (NEW_PHI_TREE).
4536 Finally, we update the phi (NEW_PHI_TREE) to take the value of
4537 the new cond_expr (INDEX_COND_EXPR). */
4539 /* Duplicate the condition from vec_stmt. */
4540 tree ccompare = unshare_expr (gimple_assign_rhs1 (vec_stmt));
4542 /* Create a conditional, where the condition is taken from vec_stmt
4543 (CCOMPARE), then is the induction index (INDEX_BEFORE_INCR) and
4544 else is the phi (NEW_PHI_TREE). */
4545 tree index_cond_expr = build3 (VEC_COND_EXPR, cr_index_vector_type,
4546 ccompare, indx_before_incr,
4547 new_phi_tree);
4548 induction_index = make_ssa_name (cr_index_vector_type);
4549 gimple *index_condition = gimple_build_assign (induction_index,
4550 index_cond_expr);
4551 gsi_insert_before (&incr_gsi, index_condition, GSI_SAME_STMT);
4552 stmt_vec_info index_vec_info = new_stmt_vec_info (index_condition,
4553 loop_vinfo);
4554 STMT_VINFO_VECTYPE (index_vec_info) = cr_index_vector_type;
4555 set_vinfo_for_stmt (index_condition, index_vec_info);
4557 /* Update the phi with the vec cond. */
4558 add_phi_arg (as_a <gphi *> (new_phi), induction_index,
4559 loop_latch_edge (loop), UNKNOWN_LOCATION);
4562 /* 2. Create epilog code.
4563 The reduction epilog code operates across the elements of the vector
4564 of partial results computed by the vectorized loop.
4565 The reduction epilog code consists of:
4567 step 1: compute the scalar result in a vector (v_out2)
4568 step 2: extract the scalar result (s_out3) from the vector (v_out2)
4569 step 3: adjust the scalar result (s_out3) if needed.
4571 Step 1 can be accomplished using one of the following three schemes:
4572 (scheme 1) using reduc_code, if available.
4573 (scheme 2) using whole-vector shifts, if available.
4574 (scheme 3) using a scalar loop. In this case steps 1+2 above are
4575 combined.
4577 The overall epilog code looks like this:
4579 s_out0 = phi <s_loop> # original EXIT_PHI
4580 v_out1 = phi <VECT_DEF> # NEW_EXIT_PHI
4581 v_out2 = reduce <v_out1> # step 1
4582 s_out3 = extract_field <v_out2, 0> # step 2
4583 s_out4 = adjust_result <s_out3> # step 3
4585 (step 3 is optional, and steps 1 and 2 may be combined).
4586 Lastly, the uses of s_out0 are replaced by s_out4. */
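/* In the code below, scheme 1 is used when REDUC_CODE is available and
   this is not an SLP reduction; otherwise scheme 2 is used if the target
   has a usable whole-vector shift (again only for non-SLP reductions);
   scheme 3 handles the remaining cases, including SLP, where all
   elements are extracted individually anyway.  */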
4589 /* 2.1 Create new loop-exit-phis to preserve loop-closed form:
4590 v_out1 = phi <VECT_DEF>
4591 Store them in NEW_PHIS. */
4593 exit_bb = single_exit (loop)->dest;
4594 prev_phi_info = NULL;
4595 new_phis.create (vect_defs.length ());
4596 FOR_EACH_VEC_ELT (vect_defs, i, def)
4598 for (j = 0; j < ncopies; j++)
4600 tree new_def = copy_ssa_name (def);
4601 phi = create_phi_node (new_def, exit_bb);
4602 set_vinfo_for_stmt (phi, new_stmt_vec_info (phi, loop_vinfo));
4603 if (j == 0)
4604 new_phis.quick_push (phi);
4605 else
4607 def = vect_get_vec_def_for_stmt_copy (dt, def);
4608 STMT_VINFO_RELATED_STMT (prev_phi_info) = phi;
4611 SET_PHI_ARG_DEF (phi, single_exit (loop)->dest_idx, def);
4612 prev_phi_info = vinfo_for_stmt (phi);
4616 /* The epilogue is created for the outer-loop, i.e., for the loop being
4617 vectorized. Create exit phis for the outer loop. */
4618 if (double_reduc)
4620 loop = outer_loop;
4621 exit_bb = single_exit (loop)->dest;
4622 inner_phis.create (vect_defs.length ());
4623 FOR_EACH_VEC_ELT (new_phis, i, phi)
4625 tree new_result = copy_ssa_name (PHI_RESULT (phi));
4626 gphi *outer_phi = create_phi_node (new_result, exit_bb);
4627 SET_PHI_ARG_DEF (outer_phi, single_exit (loop)->dest_idx,
4628 PHI_RESULT (phi));
4629 set_vinfo_for_stmt (outer_phi, new_stmt_vec_info (outer_phi,
4630 loop_vinfo));
4631 inner_phis.quick_push (phi);
4632 new_phis[i] = outer_phi;
4633 prev_phi_info = vinfo_for_stmt (outer_phi);
4634 while (STMT_VINFO_RELATED_STMT (vinfo_for_stmt (phi)))
4636 phi = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (phi));
4637 new_result = copy_ssa_name (PHI_RESULT (phi));
4638 outer_phi = create_phi_node (new_result, exit_bb);
4639 SET_PHI_ARG_DEF (outer_phi, single_exit (loop)->dest_idx,
4640 PHI_RESULT (phi));
4641 set_vinfo_for_stmt (outer_phi, new_stmt_vec_info (outer_phi,
4642 loop_vinfo));
4643 STMT_VINFO_RELATED_STMT (prev_phi_info) = outer_phi;
4644 prev_phi_info = vinfo_for_stmt (outer_phi);
4649 exit_gsi = gsi_after_labels (exit_bb);
4651 /* 2.2 Get the relevant tree-code to use in the epilog for schemes 2,3
4652 (i.e. when reduc_code is not available) and in the final adjustment
4653 code (if needed). Also get the original scalar reduction variable as
4654 defined in the loop. In case STMT is a "pattern-stmt" (i.e. - it
4655 represents a reduction pattern), the tree-code and scalar-def are
4656 taken from the original stmt that the pattern-stmt (STMT) replaces.
4657 Otherwise (it is a regular reduction) - the tree-code and scalar-def
4658 are taken from STMT. */
4660 orig_stmt = STMT_VINFO_RELATED_STMT (stmt_info);
4661 if (!orig_stmt)
4663 /* Regular reduction */
4664 orig_stmt = stmt;
4666 else
4668 /* Reduction pattern */
4669 stmt_vec_info stmt_vinfo = vinfo_for_stmt (orig_stmt);
4670 gcc_assert (STMT_VINFO_IN_PATTERN_P (stmt_vinfo));
4671 gcc_assert (STMT_VINFO_RELATED_STMT (stmt_vinfo) == stmt);
4674 code = gimple_assign_rhs_code (orig_stmt);
4675 /* For MINUS_EXPR the initial vector is [init_val,0,...,0], therefore,
4676 partial results are added and not subtracted. */
4677 if (code == MINUS_EXPR)
4678 code = PLUS_EXPR;
4680 scalar_dest = gimple_assign_lhs (orig_stmt);
4681 scalar_type = TREE_TYPE (scalar_dest);
4682 scalar_results.create (group_size);
4683 new_scalar_dest = vect_create_destination_var (scalar_dest, NULL);
4684 bitsize = TYPE_SIZE (scalar_type);
4686 /* In case this is a reduction in an inner-loop while vectorizing an outer
4687 loop - we don't need to extract a single scalar result at the end of the
4688 inner-loop (unless it is double reduction, i.e., the use of reduction is
4689 outside the outer-loop). The final vector of partial results will be used
4690 in the vectorized outer-loop, or reduced to a scalar result at the end of
4691 the outer-loop. */
4692 if (nested_in_vect_loop && !double_reduc)
4693 goto vect_finalize_reduction;
4695 /* SLP reduction without reduction chain, e.g.,
4696 # a1 = phi <a2, a0>
4697 # b1 = phi <b2, b0>
4698 a2 = operation (a1)
4699 b2 = operation (b1) */
4700 slp_reduc = (slp_node && !GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)));
4702 /* In case of reduction chain, e.g.,
4703 # a1 = phi <a3, a0>
4704 a2 = operation (a1)
4705 a3 = operation (a2),
4707 we may end up with more than one vector result. Here we reduce them to
4708 one vector. */
4709 if (GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)))
4711 tree first_vect = PHI_RESULT (new_phis[0]);
4712 gassign *new_vec_stmt = NULL;
4713 vec_dest = vect_create_destination_var (scalar_dest, vectype);
4714 for (k = 1; k < new_phis.length (); k++)
4716 gimple *next_phi = new_phis[k];
4717 tree second_vect = PHI_RESULT (next_phi);
4718 tree tem = make_ssa_name (vec_dest, new_vec_stmt);
4719 new_vec_stmt = gimple_build_assign (tem, code,
4720 first_vect, second_vect);
4721 gsi_insert_before (&exit_gsi, new_vec_stmt, GSI_SAME_STMT);
4722 first_vect = tem;
4725 new_phi_result = first_vect;
4726 if (new_vec_stmt)
4728 new_phis.truncate (0);
4729 new_phis.safe_push (new_vec_stmt);
4732 /* Likewise if we couldn't use a single def-use cycle. */
4733 else if (ncopies > 1)
4735 gcc_assert (new_phis.length () == 1);
4736 tree first_vect = PHI_RESULT (new_phis[0]);
4737 gassign *new_vec_stmt = NULL;
4738 vec_dest = vect_create_destination_var (scalar_dest, vectype);
4739 gimple *next_phi = new_phis[0];
4740 for (int k = 1; k < ncopies; ++k)
4742 next_phi = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (next_phi));
4743 tree second_vect = PHI_RESULT (next_phi);
4744 tree tem = make_ssa_name (vec_dest, new_vec_stmt);
4745 new_vec_stmt = gimple_build_assign (tem, code,
4746 first_vect, second_vect);
4747 gsi_insert_before (&exit_gsi, new_vec_stmt, GSI_SAME_STMT);
4748 first_vect = tem;
4750 new_phi_result = first_vect;
4751 new_phis.truncate (0);
4752 new_phis.safe_push (new_vec_stmt);
4754 else
4755 new_phi_result = PHI_RESULT (new_phis[0]);
4757 if (STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info) == COND_REDUCTION
4758 && reduc_code != ERROR_MARK)
4760 /* For condition reductions, we have a vector (NEW_PHI_RESULT) containing
4761 various data values where the condition matched and another vector
4762 (INDUCTION_INDEX) containing all the indexes of those matches. We
4763 need to extract the last matching index (which will be the index with
4764 highest value) and use this to index into the data vector.
4765 For the case where there were no matches, the data vector will contain
4766 all default values and the index vector will be all zeros. */
4768 /* Get various versions of the type of the vector of indexes. */
4769 tree index_vec_type = TREE_TYPE (induction_index);
4770 gcc_checking_assert (TYPE_UNSIGNED (index_vec_type));
4771 tree index_scalar_type = TREE_TYPE (index_vec_type);
4772 tree index_vec_cmp_type = build_same_sized_truth_vector_type
4773 (index_vec_type);
4775 /* Get an unsigned integer version of the type of the data vector. */
4776 int scalar_precision
4777 = GET_MODE_PRECISION (SCALAR_TYPE_MODE (scalar_type));
4778 tree scalar_type_unsigned = make_unsigned_type (scalar_precision);
4779 tree vectype_unsigned = build_vector_type
4780 (scalar_type_unsigned, TYPE_VECTOR_SUBPARTS (vectype));
4782 /* First we need to create a vector (ZERO_VEC) of zeros and another
4783 vector (MAX_INDEX_VEC) filled with the last matching index, which we
4784 can create using a MAX reduction and then expanding.
4785 In the case where the loop never made any matches, the max index will
4786 be zero. */
4788 /* Vector of {0, 0, 0,...}. */
4789 tree zero_vec = make_ssa_name (vectype);
4790 tree zero_vec_rhs = build_zero_cst (vectype);
4791 gimple *zero_vec_stmt = gimple_build_assign (zero_vec, zero_vec_rhs);
4792 gsi_insert_before (&exit_gsi, zero_vec_stmt, GSI_SAME_STMT);
4794 /* Find maximum value from the vector of found indexes. */
4795 tree max_index = make_ssa_name (index_scalar_type);
4796 gimple *max_index_stmt = gimple_build_assign (max_index, REDUC_MAX_EXPR,
4797 induction_index);
4798 gsi_insert_before (&exit_gsi, max_index_stmt, GSI_SAME_STMT);
4800 /* Vector of {max_index, max_index, max_index,...}. */
4801 tree max_index_vec = make_ssa_name (index_vec_type);
4802 tree max_index_vec_rhs = build_vector_from_val (index_vec_type,
4803 max_index);
4804 gimple *max_index_vec_stmt = gimple_build_assign (max_index_vec,
4805 max_index_vec_rhs);
4806 gsi_insert_before (&exit_gsi, max_index_vec_stmt, GSI_SAME_STMT);
4808 /* Next we compare the new vector (MAX_INDEX_VEC) full of max indexes
4809 with the vector (INDUCTION_INDEX) of found indexes, choosing values
4810 from the data vector (NEW_PHI_RESULT) for matches, 0 (ZERO_VEC)
4811 otherwise. Only one value should match, resulting in a vector
4812 (VEC_COND) with one data value and the rest zeros.
4813 In the case where the loop never made any matches, every index will
4814 match, resulting in a vector with all data values (which will all be
4815 the default value). */
4817 /* Compare the max index vector to the vector of found indexes to find
4818 the position of the max value. */
4819 tree vec_compare = make_ssa_name (index_vec_cmp_type);
4820 gimple *vec_compare_stmt = gimple_build_assign (vec_compare, EQ_EXPR,
4821 induction_index,
4822 max_index_vec);
4823 gsi_insert_before (&exit_gsi, vec_compare_stmt, GSI_SAME_STMT);
4825 /* Use the compare to choose either values from the data vector or
4826 zero. */
4827 tree vec_cond = make_ssa_name (vectype);
4828 gimple *vec_cond_stmt = gimple_build_assign (vec_cond, VEC_COND_EXPR,
4829 vec_compare, new_phi_result,
4830 zero_vec);
4831 gsi_insert_before (&exit_gsi, vec_cond_stmt, GSI_SAME_STMT);
4833 /* Finally we need to extract the data value from the vector (VEC_COND)
4834 into a scalar (MATCHED_DATA_REDUC). Logically we want to do an OR
4835 reduction, but because this doesn't exist, we can use a MAX reduction
4836 instead. The data value might be signed or a float so we need to cast
4837 it first.
4838 In the case where the loop never made any matches, the data values are
4839 all identical, and so will reduce down correctly. */
4841 /* Make the matched data values unsigned. */
4842 tree vec_cond_cast = make_ssa_name (vectype_unsigned);
4843 tree vec_cond_cast_rhs = build1 (VIEW_CONVERT_EXPR, vectype_unsigned,
4844 vec_cond);
4845 gimple *vec_cond_cast_stmt = gimple_build_assign (vec_cond_cast,
4846 VIEW_CONVERT_EXPR,
4847 vec_cond_cast_rhs);
4848 gsi_insert_before (&exit_gsi, vec_cond_cast_stmt, GSI_SAME_STMT);
4850 /* Reduce down to a scalar value. */
4851 tree data_reduc = make_ssa_name (scalar_type_unsigned);
4852 optab ot = optab_for_tree_code (REDUC_MAX_EXPR, vectype_unsigned,
4853 optab_default);
4854 gcc_assert (optab_handler (ot, TYPE_MODE (vectype_unsigned))
4855 != CODE_FOR_nothing);
4856 gimple *data_reduc_stmt = gimple_build_assign (data_reduc,
4857 REDUC_MAX_EXPR,
4858 vec_cond_cast);
4859 gsi_insert_before (&exit_gsi, data_reduc_stmt, GSI_SAME_STMT);
4861 /* Convert the reduced value back to the result type and set as the
4862 result. */
4863 gimple_seq stmts = NULL;
4864 new_temp = gimple_build (&stmts, VIEW_CONVERT_EXPR, scalar_type,
4865 data_reduc);
4866 gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
4867 scalar_results.safe_push (new_temp);
4869 else if (STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info) == COND_REDUCTION
4870 && reduc_code == ERROR_MARK)
4872 /* Condition reduction without a supported REDUC_MAX_EXPR. Generate
4873 idx = 0;
4874 idx_val = induction_index[0];
4875 val = data_reduc[0];
4876 for (idx = 0, val = init, i = 0; i < nelts; ++i)
4877 if (induction_index[i] > idx_val)
4878 val = data_reduc[i], idx_val = induction_index[i];
4879 return val; */
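/* For instance, with INDUCTION_INDEX = {0, 6, 2, 0} and NEW_PHI_RESULT
   holding {d0, d1, d2, d3}, the unrolled compare-and-select sequence
   below yields d1, the data value from the lane with the highest
   index, i.e. the lane whose condition matched last.  */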
4881 tree data_eltype = TREE_TYPE (TREE_TYPE (new_phi_result));
4882 tree idx_eltype = TREE_TYPE (TREE_TYPE (induction_index));
4883 unsigned HOST_WIDE_INT el_size = tree_to_uhwi (TYPE_SIZE (idx_eltype));
4884 unsigned HOST_WIDE_INT v_size
4885 = el_size * TYPE_VECTOR_SUBPARTS (TREE_TYPE (induction_index));
4886 tree idx_val = NULL_TREE, val = NULL_TREE;
4887 for (unsigned HOST_WIDE_INT off = 0; off < v_size; off += el_size)
4889 tree old_idx_val = idx_val;
4890 tree old_val = val;
4891 idx_val = make_ssa_name (idx_eltype);
4892 epilog_stmt = gimple_build_assign (idx_val, BIT_FIELD_REF,
4893 build3 (BIT_FIELD_REF, idx_eltype,
4894 induction_index,
4895 bitsize_int (el_size),
4896 bitsize_int (off)));
4897 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
4898 val = make_ssa_name (data_eltype);
4899 epilog_stmt = gimple_build_assign (val, BIT_FIELD_REF,
4900 build3 (BIT_FIELD_REF,
4901 data_eltype,
4902 new_phi_result,
4903 bitsize_int (el_size),
4904 bitsize_int (off)));
4905 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
4906 if (off != 0)
4908 tree new_idx_val = idx_val;
4909 tree new_val = val;
4910 if (off != v_size - el_size)
4912 new_idx_val = make_ssa_name (idx_eltype);
4913 epilog_stmt = gimple_build_assign (new_idx_val,
4914 MAX_EXPR, idx_val,
4915 old_idx_val);
4916 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
4918 new_val = make_ssa_name (data_eltype);
4919 epilog_stmt = gimple_build_assign (new_val,
4920 COND_EXPR,
4921 build2 (GT_EXPR,
4922 boolean_type_node,
4923 idx_val,
4924 old_idx_val),
4925 val, old_val);
4926 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
4927 idx_val = new_idx_val;
4928 val = new_val;
4931 /* Convert the reduced value back to the result type and set as the
4932 result. */
4933 gimple_seq stmts = NULL;
4934 val = gimple_convert (&stmts, scalar_type, val);
4935 gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
4936 scalar_results.safe_push (val);
4939 /* 2.3 Create the reduction code, using one of the three schemes described
4940 above. In SLP we simply need to extract all the elements from the
4941 vector (without reducing them), so we use scalar shifts. */
4942 else if (reduc_code != ERROR_MARK && !slp_reduc)
4944 tree tmp;
4945 tree vec_elem_type;
4947 /* Case 1: Create:
4948 v_out2 = reduc_expr <v_out1> */
4950 if (dump_enabled_p ())
4951 dump_printf_loc (MSG_NOTE, vect_location,
4952 "Reduce using direct vector reduction.\n");
4954 vec_elem_type = TREE_TYPE (TREE_TYPE (new_phi_result));
4955 if (!useless_type_conversion_p (scalar_type, vec_elem_type))
4957 tree tmp_dest =
4958 vect_create_destination_var (scalar_dest, vec_elem_type);
4959 tmp = build1 (reduc_code, vec_elem_type, new_phi_result);
4960 epilog_stmt = gimple_build_assign (tmp_dest, tmp);
4961 new_temp = make_ssa_name (tmp_dest, epilog_stmt);
4962 gimple_assign_set_lhs (epilog_stmt, new_temp);
4963 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
4965 tmp = build1 (NOP_EXPR, scalar_type, new_temp);
4967 else
4968 tmp = build1 (reduc_code, scalar_type, new_phi_result);
4970 epilog_stmt = gimple_build_assign (new_scalar_dest, tmp);
4971 new_temp = make_ssa_name (new_scalar_dest, epilog_stmt);
4972 gimple_assign_set_lhs (epilog_stmt, new_temp);
4973 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
4975 if (STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info)
4976 == INTEGER_INDUC_COND_REDUCTION)
4978 /* Earlier we set the initial value to be zero. Check the result
4979 and if it is zero then replace with the original initial
4980 value. */
4981 tree zero = build_zero_cst (scalar_type);
4982 tree zcompare = build2 (EQ_EXPR, boolean_type_node, new_temp, zero);
4984 tmp = make_ssa_name (new_scalar_dest);
4985 epilog_stmt = gimple_build_assign (tmp, COND_EXPR, zcompare,
4986 initial_def, new_temp);
4987 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
4988 new_temp = tmp;
4991 scalar_results.safe_push (new_temp);
4993 else
4995 bool reduce_with_shift = have_whole_vector_shift (mode);
4996 int element_bitsize = tree_to_uhwi (bitsize);
4997 int vec_size_in_bits = tree_to_uhwi (TYPE_SIZE (vectype));
4998 tree vec_temp;
5000 /* COND reductions all do the final reduction with MAX_EXPR. */
5001 if (code == COND_EXPR)
5002 code = MAX_EXPR;
5004 /* Regardless of whether we have a whole vector shift, if we're
5005 emulating the operation via tree-vect-generic, we don't want
5006 to use it. Only the first round of the reduction is likely
5007 to still be profitable via emulation. */
5008 /* ??? It might be better to emit a reduction tree code here, so that
5009 tree-vect-generic can expand the first round via bit tricks. */
5010 if (!VECTOR_MODE_P (mode))
5011 reduce_with_shift = false;
5012 else
5014 optab optab = optab_for_tree_code (code, vectype, optab_default);
5015 if (optab_handler (optab, mode) == CODE_FOR_nothing)
5016 reduce_with_shift = false;
5019 if (reduce_with_shift && !slp_reduc)
5021 int nelements = vec_size_in_bits / element_bitsize;
5022 auto_vec_perm_indices sel (nelements);
5024 int elt_offset;
5026 tree zero_vec = build_zero_cst (vectype);
5027 /* Case 2: Create:
5028 for (offset = nelements/2; offset >= 1; offset/=2)
5030 Create: va' = vec_shift <va, offset>
5031 Create: va = vop <va, va'>
5032 } */
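/* For instance, reducing {a,b,c,d} with a PLUS reduction: shifting by
   two gives {c,d,0,0} and adding yields {a+c, b+d, ...}; shifting that
   by one and adding again leaves the full sum a+b+c+d in the element
   that is extracted just below.  */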
5034 tree rhs;
5036 if (dump_enabled_p ())
5037 dump_printf_loc (MSG_NOTE, vect_location,
5038 "Reduce using vector shifts\n");
5040 vec_dest = vect_create_destination_var (scalar_dest, vectype);
5041 new_temp = new_phi_result;
5042 for (elt_offset = nelements / 2;
5043 elt_offset >= 1;
5044 elt_offset /= 2)
5046 sel.truncate (0);
5047 calc_vec_perm_mask_for_shift (elt_offset, nelements, &sel);
5048 tree mask = vect_gen_perm_mask_any (vectype, sel);
5049 epilog_stmt = gimple_build_assign (vec_dest, VEC_PERM_EXPR,
5050 new_temp, zero_vec, mask);
5051 new_name = make_ssa_name (vec_dest, epilog_stmt);
5052 gimple_assign_set_lhs (epilog_stmt, new_name);
5053 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5055 epilog_stmt = gimple_build_assign (vec_dest, code, new_name,
5056 new_temp);
5057 new_temp = make_ssa_name (vec_dest, epilog_stmt);
5058 gimple_assign_set_lhs (epilog_stmt, new_temp);
5059 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5062 /* 2.4 Extract the final scalar result. Create:
5063 s_out3 = extract_field <v_out2, bitpos> */
5065 if (dump_enabled_p ())
5066 dump_printf_loc (MSG_NOTE, vect_location,
5067 "extract scalar result\n");
5069 rhs = build3 (BIT_FIELD_REF, scalar_type, new_temp,
5070 bitsize, bitsize_zero_node);
5071 epilog_stmt = gimple_build_assign (new_scalar_dest, rhs);
5072 new_temp = make_ssa_name (new_scalar_dest, epilog_stmt);
5073 gimple_assign_set_lhs (epilog_stmt, new_temp);
5074 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5075 scalar_results.safe_push (new_temp);
5077 else
5079 /* Case 3: Create:
5080 s = extract_field <v_out2, 0>
5081 for (offset = element_size;
5082 offset < vector_size;
5083 offset += element_size;)
5085 Create: s' = extract_field <v_out2, offset>
5086 Create: s = op <s, s'> // For non SLP cases
5087 } */
5089 if (dump_enabled_p ())
5090 dump_printf_loc (MSG_NOTE, vect_location,
5091 "Reduce using scalar code.\n");
5093 vec_size_in_bits = tree_to_uhwi (TYPE_SIZE (vectype));
5094 FOR_EACH_VEC_ELT (new_phis, i, new_phi)
5096 int bit_offset;
5097 if (gimple_code (new_phi) == GIMPLE_PHI)
5098 vec_temp = PHI_RESULT (new_phi);
5099 else
5100 vec_temp = gimple_assign_lhs (new_phi);
5101 tree rhs = build3 (BIT_FIELD_REF, scalar_type, vec_temp, bitsize,
5102 bitsize_zero_node);
5103 epilog_stmt = gimple_build_assign (new_scalar_dest, rhs);
5104 new_temp = make_ssa_name (new_scalar_dest, epilog_stmt);
5105 gimple_assign_set_lhs (epilog_stmt, new_temp);
5106 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5108 /* In SLP we don't need to apply reduction operation, so we just
5109 collect s' values in SCALAR_RESULTS. */
5110 if (slp_reduc)
5111 scalar_results.safe_push (new_temp);
5113 for (bit_offset = element_bitsize;
5114 bit_offset < vec_size_in_bits;
5115 bit_offset += element_bitsize)
5117 tree bitpos = bitsize_int (bit_offset);
5118 tree rhs = build3 (BIT_FIELD_REF, scalar_type, vec_temp,
5119 bitsize, bitpos);
5121 epilog_stmt = gimple_build_assign (new_scalar_dest, rhs);
5122 new_name = make_ssa_name (new_scalar_dest, epilog_stmt);
5123 gimple_assign_set_lhs (epilog_stmt, new_name);
5124 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5126 if (slp_reduc)
5128 /* In SLP we don't need to apply reduction operation, so
5129 we just collect s' values in SCALAR_RESULTS. */
5130 new_temp = new_name;
5131 scalar_results.safe_push (new_name);
5133 else
5135 epilog_stmt = gimple_build_assign (new_scalar_dest, code,
5136 new_name, new_temp);
5137 new_temp = make_ssa_name (new_scalar_dest, epilog_stmt);
5138 gimple_assign_set_lhs (epilog_stmt, new_temp);
5139 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5144 /* The only case where we need to reduce scalar results in SLP is
5145 unrolling. If the size of SCALAR_RESULTS is greater than
5146 GROUP_SIZE, we reduce them by combining elements modulo
5147 GROUP_SIZE. */
5148 if (slp_reduc)
5150 tree res, first_res, new_res;
5151 gimple *new_stmt;
5153 /* Reduce multiple scalar results in case of SLP unrolling. */
5154 for (j = group_size; scalar_results.iterate (j, &res);
5155 j++)
5157 first_res = scalar_results[j % group_size];
5158 new_stmt = gimple_build_assign (new_scalar_dest, code,
5159 first_res, res);
5160 new_res = make_ssa_name (new_scalar_dest, new_stmt);
5161 gimple_assign_set_lhs (new_stmt, new_res);
5162 gsi_insert_before (&exit_gsi, new_stmt, GSI_SAME_STMT);
5163 scalar_results[j % group_size] = new_res;
5166 else
5167 /* Not SLP - we have one scalar to keep in SCALAR_RESULTS. */
5168 scalar_results.safe_push (new_temp);
5171 if (STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info)
5172 == INTEGER_INDUC_COND_REDUCTION)
5174 /* Earlier we set the initial value to be zero. Check the result
5175 and if it is zero then replace with the original initial
5176 value. */
5177 tree zero = build_zero_cst (scalar_type);
5178 tree zcompare = build2 (EQ_EXPR, boolean_type_node, new_temp, zero);
5180 tree tmp = make_ssa_name (new_scalar_dest);
5181 epilog_stmt = gimple_build_assign (tmp, COND_EXPR, zcompare,
5182 initial_def, new_temp);
5183 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5184 scalar_results[0] = tmp;
5188 vect_finalize_reduction:
5190 if (double_reduc)
5191 loop = loop->inner;
5193 /* 2.5 Adjust the final result by the initial value of the reduction
5194 variable. (When such adjustment is not needed, then
5195 'adjustment_def' is zero). For example, if code is PLUS we create:
5196 new_temp = loop_exit_def + adjustment_def */
5198 if (adjustment_def)
5200 gcc_assert (!slp_reduc);
5201 if (nested_in_vect_loop)
5203 new_phi = new_phis[0];
5204 gcc_assert (TREE_CODE (TREE_TYPE (adjustment_def)) == VECTOR_TYPE);
5205 expr = build2 (code, vectype, PHI_RESULT (new_phi), adjustment_def);
5206 new_dest = vect_create_destination_var (scalar_dest, vectype);
5208 else
5210 new_temp = scalar_results[0];
5211 gcc_assert (TREE_CODE (TREE_TYPE (adjustment_def)) != VECTOR_TYPE);
5212 expr = build2 (code, scalar_type, new_temp, adjustment_def);
5213 new_dest = vect_create_destination_var (scalar_dest, scalar_type);
5216 epilog_stmt = gimple_build_assign (new_dest, expr);
5217 new_temp = make_ssa_name (new_dest, epilog_stmt);
5218 gimple_assign_set_lhs (epilog_stmt, new_temp);
5219 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5220 if (nested_in_vect_loop)
5222 set_vinfo_for_stmt (epilog_stmt,
5223 new_stmt_vec_info (epilog_stmt, loop_vinfo));
5224 STMT_VINFO_RELATED_STMT (vinfo_for_stmt (epilog_stmt)) =
5225 STMT_VINFO_RELATED_STMT (vinfo_for_stmt (new_phi));
5227 if (!double_reduc)
5228 scalar_results.quick_push (new_temp);
5229 else
5230 scalar_results[0] = new_temp;
5232 else
5233 scalar_results[0] = new_temp;
5235 new_phis[0] = epilog_stmt;
5238 /* 2.6 Handle the loop-exit phis. Replace the uses of scalar loop-exit
5239 phis with new adjusted scalar results, i.e., replace use <s_out0>
5240 with use <s_out4>.
5242 Transform:
5243 loop_exit:
5244 s_out0 = phi <s_loop> # (scalar) EXIT_PHI
5245 v_out1 = phi <VECT_DEF> # NEW_EXIT_PHI
5246 v_out2 = reduce <v_out1>
5247 s_out3 = extract_field <v_out2, 0>
5248 s_out4 = adjust_result <s_out3>
5249 use <s_out0>
5250 use <s_out0>
5252 into:
5254 loop_exit:
5255 s_out0 = phi <s_loop> # (scalar) EXIT_PHI
5256 v_out1 = phi <VECT_DEF> # NEW_EXIT_PHI
5257 v_out2 = reduce <v_out1>
5258 s_out3 = extract_field <v_out2, 0>
5259 s_out4 = adjust_result <s_out3>
5260 use <s_out4>
5261 use <s_out4> */
5264 /* In SLP reduction chain we reduce vector results into one vector if
5265 necessary, hence we set here GROUP_SIZE to 1. SCALAR_DEST is the LHS of
5266 the last stmt in the reduction chain, since we are looking for the loop
5267 exit phi node. */
5268 if (GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)))
5270 gimple *dest_stmt = SLP_TREE_SCALAR_STMTS (slp_node)[group_size - 1];
5271 /* Handle reduction patterns. */
5272 if (STMT_VINFO_RELATED_STMT (vinfo_for_stmt (dest_stmt)))
5273 dest_stmt = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (dest_stmt));
5275 scalar_dest = gimple_assign_lhs (dest_stmt);
5276 group_size = 1;
5279 /* In SLP we may have several statements in NEW_PHIS and REDUCTION_PHIS (in
5280 case that GROUP_SIZE is greater than vectorization factor). Therefore, we
5281 need to match SCALAR_RESULTS with corresponding statements. The first
5282 (GROUP_SIZE / number of new vector stmts) scalar results correspond to
5283 the first vector stmt, etc.
5284 (RATIO is equal to (GROUP_SIZE / number of new vector stmts)). */
5285 if (group_size > new_phis.length ())
5287 ratio = group_size / new_phis.length ();
5288 gcc_assert (!(group_size % new_phis.length ()));
5290 else
5291 ratio = 1;
5293 for (k = 0; k < group_size; k++)
5295 if (k % ratio == 0)
5297 epilog_stmt = new_phis[k / ratio];
5298 reduction_phi = reduction_phis[k / ratio];
5299 if (double_reduc)
5300 inner_phi = inner_phis[k / ratio];
5303 if (slp_reduc)
5305 gimple *current_stmt = SLP_TREE_SCALAR_STMTS (slp_node)[k];
5307 orig_stmt = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (current_stmt));
5308 /* SLP statements can't participate in patterns. */
5309 gcc_assert (!orig_stmt);
5310 scalar_dest = gimple_assign_lhs (current_stmt);
5313 phis.create (3);
5314 /* Find the loop-closed-use at the loop exit of the original scalar
5315 result. (The reduction result is expected to have two immediate uses -
5316 one at the latch block, and one at the loop exit). */
5317 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, scalar_dest)
5318 if (!flow_bb_inside_loop_p (loop, gimple_bb (USE_STMT (use_p)))
5319 && !is_gimple_debug (USE_STMT (use_p)))
5320 phis.safe_push (USE_STMT (use_p));
5322 /* While we expect to have found an exit_phi because of loop-closed-ssa
5323 form we can end up without one if the scalar cycle is dead. */
5325 FOR_EACH_VEC_ELT (phis, i, exit_phi)
5327 if (outer_loop)
5329 stmt_vec_info exit_phi_vinfo = vinfo_for_stmt (exit_phi);
5330 gphi *vect_phi;
5332 /* FORNOW. We do not currently support the case where an inner-loop
5333 reduction is used only outside the outer-loop (and not in the
5334 outer-loop itself), unless it is a double reduction. */
5335 gcc_assert ((STMT_VINFO_RELEVANT_P (exit_phi_vinfo)
5336 && !STMT_VINFO_LIVE_P (exit_phi_vinfo))
5337 || double_reduc);
5339 if (double_reduc)
5340 STMT_VINFO_VEC_STMT (exit_phi_vinfo) = inner_phi;
5341 else
5342 STMT_VINFO_VEC_STMT (exit_phi_vinfo) = epilog_stmt;
5343 if (!double_reduc
5344 || STMT_VINFO_DEF_TYPE (exit_phi_vinfo)
5345 != vect_double_reduction_def)
5346 continue;
5348 /* Handle double reduction:
5350 stmt1: s1 = phi <s0, s2> - double reduction phi (outer loop)
5351 stmt2: s3 = phi <s1, s4> - (regular) reduc phi (inner loop)
5352 stmt3: s4 = use (s3) - (regular) reduc stmt (inner loop)
5353 stmt4: s2 = phi <s4> - double reduction stmt (outer loop)
5355 At that point the regular reduction (stmt2 and stmt3) is
5356 already vectorized, as well as the exit phi node, stmt4.
5357 Here we vectorize the phi node of double reduction, stmt1, and
5358 update all relevant statements. */
5360 /* Go through all the uses of s2 to find double reduction phi
5361 node, i.e., stmt1 above. */
5362 orig_name = PHI_RESULT (exit_phi);
5363 FOR_EACH_IMM_USE_STMT (use_stmt, imm_iter, orig_name)
5365 stmt_vec_info use_stmt_vinfo;
5366 stmt_vec_info new_phi_vinfo;
5367 tree vect_phi_init, preheader_arg, vect_phi_res;
5368 basic_block bb = gimple_bb (use_stmt);
5369 gimple *use;
5371 /* Check that USE_STMT is really double reduction phi
5372 node. */
5373 if (gimple_code (use_stmt) != GIMPLE_PHI
5374 || gimple_phi_num_args (use_stmt) != 2
5375 || bb->loop_father != outer_loop)
5376 continue;
5377 use_stmt_vinfo = vinfo_for_stmt (use_stmt);
5378 if (!use_stmt_vinfo
5379 || STMT_VINFO_DEF_TYPE (use_stmt_vinfo)
5380 != vect_double_reduction_def)
5381 continue;
5383 /* Create vector phi node for double reduction:
5384 vs1 = phi <vs0, vs2>
5385 vs1 was created previously in this function by a call to
5386 vect_get_vec_def_for_operand and is stored in
5387 vec_initial_def;
5388 vs2 is defined by INNER_PHI, the vectorized EXIT_PHI;
5389 vs0 is created here. */
5391 /* Create vector phi node. */
5392 vect_phi = create_phi_node (vec_initial_def, bb);
5393 new_phi_vinfo = new_stmt_vec_info (vect_phi,
5394 loop_vec_info_for_loop (outer_loop));
5395 set_vinfo_for_stmt (vect_phi, new_phi_vinfo);
5397 /* Create vs0 - initial def of the double reduction phi. */
5398 preheader_arg = PHI_ARG_DEF_FROM_EDGE (use_stmt,
5399 loop_preheader_edge (outer_loop));
5400 vect_phi_init = get_initial_def_for_reduction
5401 (stmt, preheader_arg, NULL);
5403 /* Update phi node arguments with vs0 and vs2. */
5404 add_phi_arg (vect_phi, vect_phi_init,
5405 loop_preheader_edge (outer_loop),
5406 UNKNOWN_LOCATION);
5407 add_phi_arg (vect_phi, PHI_RESULT (inner_phi),
5408 loop_latch_edge (outer_loop), UNKNOWN_LOCATION);
5409 if (dump_enabled_p ())
5411 dump_printf_loc (MSG_NOTE, vect_location,
5412 "created double reduction phi node: ");
5413 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, vect_phi, 0);
5416 vect_phi_res = PHI_RESULT (vect_phi);
5418 /* Replace the use, i.e., set the correct vs1 in the regular
5419 reduction phi node. FORNOW, NCOPIES is always 1, so the
5420 loop is redundant. */
5421 use = reduction_phi;
5422 for (j = 0; j < ncopies; j++)
5424 edge pr_edge = loop_preheader_edge (loop);
5425 SET_PHI_ARG_DEF (use, pr_edge->dest_idx, vect_phi_res);
5426 use = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (use));
5432 phis.release ();
5433 if (nested_in_vect_loop)
5435 if (double_reduc)
5436 loop = outer_loop;
5437 else
5438 continue;
5441 phis.create (3);
5442 /* Find the loop-closed-use at the loop exit of the original scalar
5443 result. (The reduction result is expected to have two immediate uses,
5444 one at the latch block, and one at the loop exit). For double
5445 reductions we are looking for exit phis of the outer loop. */
5446 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, scalar_dest)
5448 if (!flow_bb_inside_loop_p (loop, gimple_bb (USE_STMT (use_p))))
5450 if (!is_gimple_debug (USE_STMT (use_p)))
5451 phis.safe_push (USE_STMT (use_p));
5453 else
5455 if (double_reduc && gimple_code (USE_STMT (use_p)) == GIMPLE_PHI)
5457 tree phi_res = PHI_RESULT (USE_STMT (use_p));
5459 FOR_EACH_IMM_USE_FAST (phi_use_p, phi_imm_iter, phi_res)
5461 if (!flow_bb_inside_loop_p (loop,
5462 gimple_bb (USE_STMT (phi_use_p)))
5463 && !is_gimple_debug (USE_STMT (phi_use_p)))
5464 phis.safe_push (USE_STMT (phi_use_p));
5470 FOR_EACH_VEC_ELT (phis, i, exit_phi)
5472 /* Replace the uses: */
5473 orig_name = PHI_RESULT (exit_phi);
5474 scalar_result = scalar_results[k];
5475 FOR_EACH_IMM_USE_STMT (use_stmt, imm_iter, orig_name)
5476 FOR_EACH_IMM_USE_ON_STMT (use_p, imm_iter)
5477 SET_USE (use_p, scalar_result);
5480 phis.release ();
5485 /* Function is_nonwrapping_integer_induction.
5487 Check if STMT (which is part of loop LOOP) is an induction that both
5488 increments and does not cause overflow. */
5490 static bool
5491 is_nonwrapping_integer_induction (gimple *stmt, struct loop *loop)
5493 stmt_vec_info stmt_vinfo = vinfo_for_stmt (stmt);
5494 tree base = STMT_VINFO_LOOP_PHI_EVOLUTION_BASE_UNCHANGED (stmt_vinfo);
5495 tree step = STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_vinfo);
5496 tree lhs_type = TREE_TYPE (gimple_phi_result (stmt));
5497 widest_int ni, max_loop_value, lhs_max;
5498 bool overflow = false;
5500 /* Make sure the loop is integer based. */
5501 if (TREE_CODE (base) != INTEGER_CST
5502 || TREE_CODE (step) != INTEGER_CST)
5503 return false;
5505 /* Check that the induction increments. */
5506 if (tree_int_cst_sgn (step) == -1)
5507 return false;
5509 /* Check that the max size of the loop will not wrap. */
5511 if (TYPE_OVERFLOW_UNDEFINED (lhs_type))
5512 return true;
5514 if (! max_stmt_executions (loop, &ni))
5515 return false;
5517 max_loop_value = wi::mul (wi::to_widest (step), ni, TYPE_SIGN (lhs_type),
5518 &overflow);
5519 if (overflow)
5520 return false;
5522 max_loop_value = wi::add (wi::to_widest (base), max_loop_value,
5523 TYPE_SIGN (lhs_type), &overflow);
5524 if (overflow)
5525 return false;
5527 return (wi::min_precision (max_loop_value, TYPE_SIGN (lhs_type))
5528 <= TYPE_PRECISION (lhs_type));
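/* A worked example of the check above (values chosen only for illustration):
   for an 8-bit unsigned induction with base 16 and step 10, in a loop whose
   statements execute at most 20 times, max_loop_value is
   16 + 10 * 20 = 216, which needs 8 bits, so the induction is known not to
   wrap.  With at most 25 executions the value 266 would need 9 bits and the
   function returns false.  */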
5531 /* Function vectorizable_reduction.
5533 Check if STMT performs a reduction operation that can be vectorized.
5534 If VEC_STMT is also passed, vectorize the STMT: create a vectorized
5535 stmt to replace it, put it in VEC_STMT, and insert it at GSI.
5536 Return FALSE if not a vectorizable STMT, TRUE otherwise.
5538 This function also handles reduction idioms (patterns) that have been
5539 recognized in advance during vect_pattern_recog. In this case, STMT may be
5540 of this form:
5541 X = pattern_expr (arg0, arg1, ..., X)
5542 and its STMT_VINFO_RELATED_STMT points to the last stmt in the original
5543 sequence that had been detected and replaced by the pattern-stmt (STMT).
5545 This function also handles reduction of condition expressions, for example:
5546 for (int i = 0; i < N; i++)
5547 if (a[i] < value)
5548 last = a[i];
5549 This is handled by vectorizing the loop and creating an additional vector
5550 containing the loop indexes for which "a[i] < value" was true. In the
5551 function epilogue this is reduced to a single max value and then used to
5552 index into the vector of results.
5554 In some cases of reduction patterns, the type of the reduction variable X is
5555 different than the type of the other arguments of STMT.
5556 In such cases, the vectype that is used when transforming STMT into a vector
5557 stmt is different than the vectype that is used to determine the
5558 vectorization factor, because it consists of a different number of elements
5559 than the actual number of elements that are being operated upon in parallel.
5561 For example, consider an accumulation of shorts into an int accumulator.
5562 On some targets it's possible to vectorize this pattern operating on 8
5563 shorts at a time (hence, the vectype for purposes of determining the
5564 vectorization factor should be V8HI); on the other hand, the vectype that
5565 is used to create the vector form is actually V4SI (the type of the result).
5567 Upon entry to this function, STMT_VINFO_VECTYPE records the vectype that
5568 indicates what is the actual level of parallelism (V8HI in the example), so
5569 that the right vectorization factor would be derived. This vectype
5570 corresponds to the type of arguments to the reduction stmt, and should *NOT*
5571 be used to create the vectorized stmt. The right vectype for the vectorized
5572 stmt is obtained from the type of the result X:
5573 get_vectype_for_scalar_type (TREE_TYPE (X))
5575 This means that, contrary to "regular" reductions (or "regular" stmts in
5576 general), the following equation:
5577 STMT_VINFO_VECTYPE == get_vectype_for_scalar_type (TREE_TYPE (X))
5578 does *NOT* necessarily hold for reduction patterns. */
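/* A concrete instance of the distinction above (the vector modes V8HI and
   V4SI are just an assumption for a 128-bit vector target):

     short s[N]; int sum = 0;
     for (i = 0; i < N; i++)
       sum += s[i];

   is recognized as a widening summation, int_acc = widen_sum <s[i], int_acc>.
   Here STMT_VINFO_VECTYPE is V8HI (eight shorts are consumed per vector
   iteration, which determines the vectorization factor), while the vectorized
   stmt itself is created with get_vectype_for_scalar_type (int) == V4SI.  */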
5580 bool
5581 vectorizable_reduction (gimple *stmt, gimple_stmt_iterator *gsi,
5582 gimple **vec_stmt, slp_tree slp_node,
5583 slp_instance slp_node_instance)
5585 tree vec_dest;
5586 tree scalar_dest;
5587 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
5588 tree vectype_out = STMT_VINFO_VECTYPE (stmt_info);
5589 tree vectype_in = NULL_TREE;
5590 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
5591 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
5592 enum tree_code code, orig_code, epilog_reduc_code;
5593 machine_mode vec_mode;
5594 int op_type;
5595 optab optab, reduc_optab;
5596 tree new_temp = NULL_TREE;
5597 gimple *def_stmt;
5598 enum vect_def_type dt, cond_reduc_dt = vect_unknown_def_type;
5599 tree scalar_type;
5600 bool is_simple_use;
5601 gimple *orig_stmt;
5602 stmt_vec_info orig_stmt_info = NULL;
5603 int i;
5604 int ncopies;
5605 int epilog_copies;
5606 stmt_vec_info prev_stmt_info, prev_phi_info;
5607 bool single_defuse_cycle = false;
5608 gimple *new_stmt = NULL;
5609 int j;
5610 tree ops[3];
5611 enum vect_def_type dts[3];
5612 bool nested_cycle = false, found_nested_cycle_def = false;
5613 bool double_reduc = false;
5614 basic_block def_bb;
5615 struct loop * def_stmt_loop, *outer_loop = NULL;
5616 tree def_arg;
5617 gimple *def_arg_stmt;
5618 auto_vec<tree> vec_oprnds0;
5619 auto_vec<tree> vec_oprnds1;
5620 auto_vec<tree> vec_oprnds2;
5621 auto_vec<tree> vect_defs;
5622 auto_vec<gimple *> phis;
5623 int vec_num;
5624 tree def0, tem;
5625 bool first_p = true;
5626 tree cr_index_scalar_type = NULL_TREE, cr_index_vector_type = NULL_TREE;
5627 tree cond_reduc_val = NULL_TREE;
5629 /* Make sure it was already recognized as a reduction computation. */
5630 if (STMT_VINFO_DEF_TYPE (vinfo_for_stmt (stmt)) != vect_reduction_def
5631 && STMT_VINFO_DEF_TYPE (vinfo_for_stmt (stmt)) != vect_nested_cycle)
5632 return false;
5634 if (nested_in_vect_loop_p (loop, stmt))
5636 outer_loop = loop;
5637 loop = loop->inner;
5638 nested_cycle = true;
5641 /* In case of reduction chain we switch to the first stmt in the chain, but
5642 we don't update STMT_INFO, since only the last stmt is marked as reduction
5643 and has reduction properties. */
5644 if (GROUP_FIRST_ELEMENT (stmt_info)
5645 && GROUP_FIRST_ELEMENT (stmt_info) != stmt)
5647 stmt = GROUP_FIRST_ELEMENT (stmt_info);
5648 first_p = false;
5651 if (gimple_code (stmt) == GIMPLE_PHI)
5653 /* Analysis is fully done on the reduction stmt invocation. */
5654 if (! vec_stmt)
5656 if (slp_node)
5657 slp_node_instance->reduc_phis = slp_node;
5659 STMT_VINFO_TYPE (stmt_info) = reduc_vec_info_type;
5660 return true;
5663 gimple *reduc_stmt = STMT_VINFO_REDUC_DEF (stmt_info);
5664 if (STMT_VINFO_IN_PATTERN_P (vinfo_for_stmt (reduc_stmt)))
5665 reduc_stmt = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (reduc_stmt));
5667 gcc_assert (is_gimple_assign (reduc_stmt));
5668 for (unsigned k = 1; k < gimple_num_ops (reduc_stmt); ++k)
5670 tree op = gimple_op (reduc_stmt, k);
5671 if (op == gimple_phi_result (stmt))
5672 continue;
5673 if (k == 1
5674 && gimple_assign_rhs_code (reduc_stmt) == COND_EXPR)
5675 continue;
5676 tem = get_vectype_for_scalar_type (TREE_TYPE (op));
5677 if (! vectype_in
5678 || TYPE_VECTOR_SUBPARTS (tem) < TYPE_VECTOR_SUBPARTS (vectype_in))
5679 vectype_in = tem;
5680 break;
5682 gcc_assert (vectype_in);
5684 if (slp_node)
5685 ncopies = 1;
5686 else
5687 ncopies = vect_get_num_copies (loop_vinfo, vectype_in);
5689 use_operand_p use_p;
5690 gimple *use_stmt;
5691 if (ncopies > 1
5692 && (STMT_VINFO_RELEVANT (vinfo_for_stmt (reduc_stmt))
5693 <= vect_used_only_live)
5694 && single_imm_use (gimple_phi_result (stmt), &use_p, &use_stmt)
5695 && (use_stmt == reduc_stmt
5696 || (STMT_VINFO_RELATED_STMT (vinfo_for_stmt (use_stmt))
5697 == reduc_stmt)))
5698 single_defuse_cycle = true;
5700 /* Create the destination vector */
5701 scalar_dest = gimple_assign_lhs (reduc_stmt);
5702 vec_dest = vect_create_destination_var (scalar_dest, vectype_out);
5704 if (slp_node)
5705 /* The size vect_schedule_slp_instance computes is off for us. */
5706 vec_num = ((LOOP_VINFO_VECT_FACTOR (loop_vinfo)
5707 * SLP_TREE_SCALAR_STMTS (slp_node).length ())
5708 / TYPE_VECTOR_SUBPARTS (vectype_in));
5709 else
5710 vec_num = 1;
5712 /* Generate the reduction PHIs upfront. */
5713 prev_phi_info = NULL;
5714 for (j = 0; j < ncopies; j++)
5716 if (j == 0 || !single_defuse_cycle)
5718 for (i = 0; i < vec_num; i++)
5720 /* Create the reduction-phi that defines the reduction
5721 operand. */
5722 gimple *new_phi = create_phi_node (vec_dest, loop->header);
5723 set_vinfo_for_stmt (new_phi,
5724 new_stmt_vec_info (new_phi, loop_vinfo));
5726 if (slp_node)
5727 SLP_TREE_VEC_STMTS (slp_node).quick_push (new_phi);
5728 else
5730 if (j == 0)
5731 STMT_VINFO_VEC_STMT (stmt_info) = *vec_stmt = new_phi;
5732 else
5733 STMT_VINFO_RELATED_STMT (prev_phi_info) = new_phi;
5734 prev_phi_info = vinfo_for_stmt (new_phi);
5740 return true;
5743 /* 1. Is vectorizable reduction? */
5744 /* Not supportable if the reduction variable is used in the loop, unless
5745 it's a reduction chain. */
5746 if (STMT_VINFO_RELEVANT (stmt_info) > vect_used_in_outer
5747 && !GROUP_FIRST_ELEMENT (stmt_info))
5748 return false;
5750 /* Reductions that are not used even in an enclosing outer-loop
5751 are expected to be "live" (used out of the loop). */
5752 if (STMT_VINFO_RELEVANT (stmt_info) == vect_unused_in_scope
5753 && !STMT_VINFO_LIVE_P (stmt_info))
5754 return false;
5756 /* 2. Has this been recognized as a reduction pattern?
5758 Check if STMT represents a pattern that has been recognized
5759 in earlier analysis stages. For stmts that represent a pattern,
5760 the STMT_VINFO_RELATED_STMT field records the last stmt in
5761 the original sequence that constitutes the pattern. */
5763 orig_stmt = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (stmt));
5764 if (orig_stmt)
5766 orig_stmt_info = vinfo_for_stmt (orig_stmt);
5767 gcc_assert (STMT_VINFO_IN_PATTERN_P (orig_stmt_info));
5768 gcc_assert (!STMT_VINFO_IN_PATTERN_P (stmt_info));
5771 /* 3. Check the operands of the operation. The first operands are defined
5772 inside the loop body. The last operand is the reduction variable,
5773 which is defined by the loop-header-phi. */
5775 gcc_assert (is_gimple_assign (stmt));
5777 /* Flatten RHS. */
5778 switch (get_gimple_rhs_class (gimple_assign_rhs_code (stmt)))
5780 case GIMPLE_BINARY_RHS:
5781 code = gimple_assign_rhs_code (stmt);
5782 op_type = TREE_CODE_LENGTH (code);
5783 gcc_assert (op_type == binary_op);
5784 ops[0] = gimple_assign_rhs1 (stmt);
5785 ops[1] = gimple_assign_rhs2 (stmt);
5786 break;
5788 case GIMPLE_TERNARY_RHS:
5789 code = gimple_assign_rhs_code (stmt);
5790 op_type = TREE_CODE_LENGTH (code);
5791 gcc_assert (op_type == ternary_op);
5792 ops[0] = gimple_assign_rhs1 (stmt);
5793 ops[1] = gimple_assign_rhs2 (stmt);
5794 ops[2] = gimple_assign_rhs3 (stmt);
5795 break;
5797 case GIMPLE_UNARY_RHS:
5798 return false;
5800 default:
5801 gcc_unreachable ();
5804 if (code == COND_EXPR && slp_node)
5805 return false;
5807 scalar_dest = gimple_assign_lhs (stmt);
5808 scalar_type = TREE_TYPE (scalar_dest);
5809 if (!POINTER_TYPE_P (scalar_type) && !INTEGRAL_TYPE_P (scalar_type)
5810 && !SCALAR_FLOAT_TYPE_P (scalar_type))
5811 return false;
5813 /* Do not try to vectorize bit-precision reductions. */
5814 if (!type_has_mode_precision_p (scalar_type))
5815 return false;
5817 /* All uses but the last are expected to be defined in the loop.
5818 The last use is the reduction variable. In case of nested cycle this
5819 assumption is not true: we use reduc_index to record the index of the
5820 reduction variable. */
5821 gimple *reduc_def_stmt = NULL;
5822 int reduc_index = -1;
5823 for (i = 0; i < op_type; i++)
5825 /* The condition of COND_EXPR is checked in vectorizable_condition(). */
5826 if (i == 0 && code == COND_EXPR)
5827 continue;
5829 is_simple_use = vect_is_simple_use (ops[i], loop_vinfo,
5830 &def_stmt, &dts[i], &tem);
5831 dt = dts[i];
5832 gcc_assert (is_simple_use);
5833 if (dt == vect_reduction_def)
5835 reduc_def_stmt = def_stmt;
5836 reduc_index = i;
5837 continue;
5839 else if (tem)
5841 /* To properly compute ncopies we are interested in the widest
5842 input type in case we're looking at a widening accumulation. */
5843 if (!vectype_in
5844 || TYPE_VECTOR_SUBPARTS (vectype_in) > TYPE_VECTOR_SUBPARTS (tem))
5845 vectype_in = tem;
5848 if (dt != vect_internal_def
5849 && dt != vect_external_def
5850 && dt != vect_constant_def
5851 && dt != vect_induction_def
5852 && !(dt == vect_nested_cycle && nested_cycle))
5853 return false;
5855 if (dt == vect_nested_cycle)
5857 found_nested_cycle_def = true;
5858 reduc_def_stmt = def_stmt;
5859 reduc_index = i;
5862 if (i == 1 && code == COND_EXPR)
5864 /* Record how value of COND_EXPR is defined. */
5865 if (dt == vect_constant_def)
5867 cond_reduc_dt = dt;
5868 cond_reduc_val = ops[i];
5870 if (dt == vect_induction_def && def_stmt != NULL
5871 && is_nonwrapping_integer_induction (def_stmt, loop))
5872 cond_reduc_dt = dt;
5876 if (!vectype_in)
5877 vectype_in = vectype_out;
5879 /* When vectorizing a reduction chain w/o SLP the reduction PHI is not
5880 directly used in stmt. */
5881 if (reduc_index == -1)
5883 if (orig_stmt)
5884 reduc_def_stmt = STMT_VINFO_REDUC_DEF (orig_stmt_info);
5885 else
5886 reduc_def_stmt = STMT_VINFO_REDUC_DEF (stmt_info);
5889 if (! reduc_def_stmt || gimple_code (reduc_def_stmt) != GIMPLE_PHI)
5890 return false;
5892 if (!(reduc_index == -1
5893 || dts[reduc_index] == vect_reduction_def
5894 || dts[reduc_index] == vect_nested_cycle
5895 || ((dts[reduc_index] == vect_internal_def
5896 || dts[reduc_index] == vect_external_def
5897 || dts[reduc_index] == vect_constant_def
5898 || dts[reduc_index] == vect_induction_def)
5899 && nested_cycle && found_nested_cycle_def)))
5901 /* For pattern recognized stmts, orig_stmt might be a reduction,
5902 but some helper statements for the pattern might not, or
5903 might be COND_EXPRs with reduction uses in the condition. */
5904 gcc_assert (orig_stmt);
5905 return false;
5908 stmt_vec_info reduc_def_info = vinfo_for_stmt (reduc_def_stmt);
5909 enum vect_reduction_type v_reduc_type
5910 = STMT_VINFO_REDUC_TYPE (reduc_def_info);
5911 gimple *tmp = STMT_VINFO_REDUC_DEF (reduc_def_info);
5913 STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info) = v_reduc_type;
5914 /* If we have a condition reduction, see if we can simplify it further. */
5915 if (v_reduc_type == COND_REDUCTION)
5917 if (cond_reduc_dt == vect_induction_def)
5919 if (dump_enabled_p ())
5920 dump_printf_loc (MSG_NOTE, vect_location,
5921 "condition expression based on "
5922 "integer induction.\n");
5923 STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info)
5924 = INTEGER_INDUC_COND_REDUCTION;
5927 /* Loop peeling modifies the initial value of the reduction PHI, which
5928 makes the reduction stmt that is transformed differ from the original
5929 stmt that was analyzed. We therefore record the reduction code for a
5930 CONST_COND_REDUCTION type reduction at the analysis stage, so that it
5931 can be used directly at the transform stage. */
5932 if (STMT_VINFO_VEC_CONST_COND_REDUC_CODE (stmt_info) == MAX_EXPR
5933 || STMT_VINFO_VEC_CONST_COND_REDUC_CODE (stmt_info) == MIN_EXPR)
5935 /* Also set the reduction type to CONST_COND_REDUCTION. */
5936 gcc_assert (cond_reduc_dt == vect_constant_def);
5937 STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info) = CONST_COND_REDUCTION;
5939 else if (cond_reduc_dt == vect_constant_def)
5941 enum vect_def_type cond_initial_dt;
5942 gimple *def_stmt = SSA_NAME_DEF_STMT (ops[reduc_index]);
5943 tree cond_initial_val
5944 = PHI_ARG_DEF_FROM_EDGE (def_stmt, loop_preheader_edge (loop));
5946 gcc_assert (cond_reduc_val != NULL_TREE);
5947 vect_is_simple_use (cond_initial_val, loop_vinfo,
5948 &def_stmt, &cond_initial_dt);
5949 if (cond_initial_dt == vect_constant_def
5950 && types_compatible_p (TREE_TYPE (cond_initial_val),
5951 TREE_TYPE (cond_reduc_val)))
5953 tree e = fold_binary (LE_EXPR, boolean_type_node,
5954 cond_initial_val, cond_reduc_val);
5955 if (e && (integer_onep (e) || integer_zerop (e)))
5957 if (dump_enabled_p ())
5958 dump_printf_loc (MSG_NOTE, vect_location,
5959 "condition expression based on "
5960 "compile time constant.\n");
5961 /* Record reduction code at analysis stage. */
5962 STMT_VINFO_VEC_CONST_COND_REDUC_CODE (stmt_info)
5963 = integer_onep (e) ? MAX_EXPR : MIN_EXPR;
5964 STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info)
5965 = CONST_COND_REDUCTION;
5971 if (orig_stmt)
5972 gcc_assert (tmp == orig_stmt
5973 || GROUP_FIRST_ELEMENT (vinfo_for_stmt (tmp)) == orig_stmt);
5974 else
5975 /* We changed STMT to be the first stmt in reduction chain, hence we
5976 check that in this case the first element in the chain is STMT. */
5977 gcc_assert (stmt == tmp
5978 || GROUP_FIRST_ELEMENT (vinfo_for_stmt (tmp)) == stmt);
5980 if (STMT_VINFO_LIVE_P (vinfo_for_stmt (reduc_def_stmt)))
5981 return false;
5983 if (slp_node)
5984 ncopies = 1;
5985 else
5986 ncopies = vect_get_num_copies (loop_vinfo, vectype_in);
5988 gcc_assert (ncopies >= 1);
5990 vec_mode = TYPE_MODE (vectype_in);
5992 if (code == COND_EXPR)
5994 /* Only call during the analysis stage, otherwise we'll lose
5995 STMT_VINFO_TYPE. */
5996 if (!vec_stmt && !vectorizable_condition (stmt, gsi, NULL,
5997 ops[reduc_index], 0, NULL))
5999 if (dump_enabled_p ())
6000 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6001 "unsupported condition in reduction\n");
6002 return false;
6005 else
6007 /* 4. Supportable by target? */
6009 if (code == LSHIFT_EXPR || code == RSHIFT_EXPR
6010 || code == LROTATE_EXPR || code == RROTATE_EXPR)
6012 /* Shifts and rotates are only supported by vectorizable_shift,
6013 not vectorizable_reduction. */
6014 if (dump_enabled_p ())
6015 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6016 "unsupported shift or rotation.\n");
6017 return false;
6020 /* 4.1. check support for the operation in the loop */
6021 optab = optab_for_tree_code (code, vectype_in, optab_default);
6022 if (!optab)
6024 if (dump_enabled_p ())
6025 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6026 "no optab.\n");
6028 return false;
6031 if (optab_handler (optab, vec_mode) == CODE_FOR_nothing)
6033 if (dump_enabled_p ())
6034 dump_printf (MSG_NOTE, "op not supported by target.\n");
6036 if (GET_MODE_SIZE (vec_mode) != UNITS_PER_WORD
6037 || !vect_worthwhile_without_simd_p (loop_vinfo, code))
6038 return false;
6040 if (dump_enabled_p ())
6041 dump_printf (MSG_NOTE, "proceeding using word mode.\n");
6044 /* Worthwhile without SIMD support? */
6045 if (!VECTOR_MODE_P (TYPE_MODE (vectype_in))
6046 && !vect_worthwhile_without_simd_p (loop_vinfo, code))
6048 if (dump_enabled_p ())
6049 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6050 "not worthwhile without SIMD support.\n");
6052 return false;
6056 /* 4.2. Check support for the epilog operation.
6058 If STMT represents a reduction pattern, then the type of the
6059 reduction variable may be different than the type of the rest
6060 of the arguments. For example, consider the case of accumulation
6061 of shorts into an int accumulator; The original code:
6062 S1: int_a = (int) short_a;
6063 orig_stmt-> S2: int_acc = plus <int_a ,int_acc>;
6065 was replaced with:
6066 STMT: int_acc = widen_sum <short_a, int_acc>
6068 This means that:
6069 1. The tree-code that is used to create the vector operation in the
6070 epilog code (that reduces the partial results) is not the
6071 tree-code of STMT, but is rather the tree-code of the original
6072 stmt from the pattern that STMT is replacing. I.e, in the example
6073 above we want to use 'widen_sum' in the loop, but 'plus' in the
6074 epilog.
6075 2. The type (mode) we use to check available target support
6076 for the vector operation to be created in the *epilog*, is
6077 determined by the type of the reduction variable (in the example
6078 above we'd check this: optab_handler (plus_optab, vect_int_mode])).
6079 However the type (mode) we use to check available target support
6080 for the vector operation to be created *inside the loop*, is
6081 determined by the type of the other arguments to STMT (in the
6082 example we'd check this: optab_handler (widen_sum_optab,
6083 vect_short_mode)).
6085 This is contrary to "regular" reductions, in which the types of all
6086 the arguments are the same as the type of the reduction variable.
6087 For "regular" reductions we can therefore use the same vector type
6088 (and also the same tree-code) when generating the epilog code and
6089 when generating the code inside the loop. */
6091 if (orig_stmt)
6093 /* This is a reduction pattern: get the vectype from the type of the
6094 reduction variable, and get the tree-code from orig_stmt. */
6095 gcc_assert (STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info)
6096 == TREE_CODE_REDUCTION);
6097 orig_code = gimple_assign_rhs_code (orig_stmt);
6098 gcc_assert (vectype_out);
6099 vec_mode = TYPE_MODE (vectype_out);
6101 else
6103 /* Regular reduction: the same vectype and tree-code that are used for
6104 the vector code inside the loop can also be used for the epilog code. */
6105 orig_code = code;
6107 if (code == MINUS_EXPR)
6108 orig_code = PLUS_EXPR;
6110 /* For simple condition reductions, replace with the actual expression
6111 we want to base our reduction around. */
6112 if (STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info) == CONST_COND_REDUCTION)
6114 orig_code = STMT_VINFO_VEC_CONST_COND_REDUC_CODE (stmt_info);
6115 gcc_assert (orig_code == MAX_EXPR || orig_code == MIN_EXPR);
6117 else if (STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info)
6118 == INTEGER_INDUC_COND_REDUCTION)
6119 orig_code = MAX_EXPR;
6122 if (nested_cycle)
6124 def_bb = gimple_bb (reduc_def_stmt);
6125 def_stmt_loop = def_bb->loop_father;
6126 def_arg = PHI_ARG_DEF_FROM_EDGE (reduc_def_stmt,
6127 loop_preheader_edge (def_stmt_loop));
6128 if (TREE_CODE (def_arg) == SSA_NAME
6129 && (def_arg_stmt = SSA_NAME_DEF_STMT (def_arg))
6130 && gimple_code (def_arg_stmt) == GIMPLE_PHI
6131 && flow_bb_inside_loop_p (outer_loop, gimple_bb (def_arg_stmt))
6132 && vinfo_for_stmt (def_arg_stmt)
6133 && STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def_arg_stmt))
6134 == vect_double_reduction_def)
6135 double_reduc = true;
6138 epilog_reduc_code = ERROR_MARK;
6140 if (STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info) != COND_REDUCTION)
6142 if (reduction_code_for_scalar_code (orig_code, &epilog_reduc_code))
6144 reduc_optab = optab_for_tree_code (epilog_reduc_code, vectype_out,
6145 optab_default);
6146 if (!reduc_optab)
6148 if (dump_enabled_p ())
6149 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6150 "no optab for reduction.\n");
6152 epilog_reduc_code = ERROR_MARK;
6154 else if (optab_handler (reduc_optab, vec_mode) == CODE_FOR_nothing)
6156 if (dump_enabled_p ())
6157 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6158 "reduc op not supported by target.\n");
6160 epilog_reduc_code = ERROR_MARK;
6163 else
6165 if (!nested_cycle || double_reduc)
6167 if (dump_enabled_p ())
6168 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6169 "no reduc code for scalar code.\n");
6171 return false;
6175 else
6177 int scalar_precision
6178 = GET_MODE_PRECISION (SCALAR_TYPE_MODE (scalar_type));
6179 cr_index_scalar_type = make_unsigned_type (scalar_precision);
6180 cr_index_vector_type = build_vector_type
6181 (cr_index_scalar_type, TYPE_VECTOR_SUBPARTS (vectype_out));
6183 optab = optab_for_tree_code (REDUC_MAX_EXPR, cr_index_vector_type,
6184 optab_default);
6185 if (optab_handler (optab, TYPE_MODE (cr_index_vector_type))
6186 != CODE_FOR_nothing)
6187 epilog_reduc_code = REDUC_MAX_EXPR;
6190 if ((double_reduc
6191 || STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info) != TREE_CODE_REDUCTION)
6192 && ncopies > 1)
6194 if (dump_enabled_p ())
6195 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6196 "multiple types in double reduction or condition "
6197 "reduction.\n");
6198 return false;
6201 /* In case of widening multiplication by a constant, we update the type
6202 of the constant to be the type of the other operand. We check that the
6203 constant fits the type in the pattern recognition pass. */
6204 if (code == DOT_PROD_EXPR
6205 && !types_compatible_p (TREE_TYPE (ops[0]), TREE_TYPE (ops[1])))
6207 if (TREE_CODE (ops[0]) == INTEGER_CST)
6208 ops[0] = fold_convert (TREE_TYPE (ops[1]), ops[0]);
6209 else if (TREE_CODE (ops[1]) == INTEGER_CST)
6210 ops[1] = fold_convert (TREE_TYPE (ops[0]), ops[1]);
6211 else
6213 if (dump_enabled_p ())
6214 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6215 "invalid types in dot-prod\n");
6217 return false;
6221 if (STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info) == COND_REDUCTION)
6223 widest_int ni;
6225 if (! max_loop_iterations (loop, &ni))
6227 if (dump_enabled_p ())
6228 dump_printf_loc (MSG_NOTE, vect_location,
6229 "loop count not known, cannot create cond "
6230 "reduction.\n");
6231 return false;
6233 /* Convert backedges to iterations. */
6234 ni += 1;
6236 /* The additional index will have the same type as the condition. Check
6237 that the loop iteration count fits into this type less one (because
6238 the zero slot is used up for when there are no matches). */
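/* As an illustration (an 8-bit case chosen for simplicity): when the
   reduction values are "unsigned char", the index vector also uses an
   8-bit unsigned type whose maximum value is 255.  Since index 0 is
   reserved for "no match", a loop known to iterate at most 254 times
   passes the check below, while one that may iterate 255 times or more
   is rejected.  */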
6239 tree max_index = TYPE_MAX_VALUE (cr_index_scalar_type);
6240 if (wi::geu_p (ni, wi::to_widest (max_index)))
6242 if (dump_enabled_p ())
6243 dump_printf_loc (MSG_NOTE, vect_location,
6244 "loop size is greater than data size.\n");
6245 return false;
6249 /* In case the vectorization factor (VF) is bigger than the number
6250 of elements that we can fit in a vectype (nunits), we have to generate
6251 more than one vector stmt - i.e - we need to "unroll" the
6252 vector stmt by a factor VF/nunits. For more details see documentation
6253 in vectorizable_operation. */
6255 /* If the reduction is used in an outer loop we need to generate
6256 VF intermediate results, like so (e.g. for ncopies=2):
6257 r0 = phi (init, r0)
6258 r1 = phi (init, r1)
6259 r0 = x0 + r0;
6260 r1 = x1 + r1;
6261 (i.e. we generate VF results in 2 registers).
6262 In this case we have a separate def-use cycle for each copy, and therefore
6263 for each copy we get the vector def for the reduction variable from the
6264 respective phi node created for this copy.
6266 Otherwise (the reduction is unused in the loop nest), we can combine
6267 together intermediate results, like so (e.g. for ncopies=2):
6268 r = phi (init, r)
6269 r = x0 + r;
6270 r = x1 + r;
6271 (i.e. we generate VF/2 results in a single register).
6272 In this case for each copy we get the vector def for the reduction variable
6273 from the vectorized reduction operation generated in the previous iteration.
6275 This only works when we see both the reduction PHI and its only consumer
6276 in vectorizable_reduction and there are no intermediate stmts
6277 participating. */
6278 use_operand_p use_p;
6279 gimple *use_stmt;
6280 if (ncopies > 1
6281 && (STMT_VINFO_RELEVANT (stmt_info) <= vect_used_only_live)
6282 && single_imm_use (gimple_phi_result (reduc_def_stmt), &use_p, &use_stmt)
6283 && (use_stmt == stmt
6284 || STMT_VINFO_RELATED_STMT (vinfo_for_stmt (use_stmt)) == stmt))
6286 single_defuse_cycle = true;
6287 epilog_copies = 1;
6289 else
6290 epilog_copies = ncopies;
6292 /* If the reduction stmt is one of the patterns that have lane
6293 reduction embedded we cannot handle the case of ! single_defuse_cycle. */
6294 if ((ncopies > 1
6295 && ! single_defuse_cycle)
6296 && (code == DOT_PROD_EXPR
6297 || code == WIDEN_SUM_EXPR
6298 || code == SAD_EXPR))
6300 if (dump_enabled_p ())
6301 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6302 "multi def-use cycle not possible for lane-reducing "
6303 "reduction operation\n");
6304 return false;
6307 if (!vec_stmt) /* transformation not required. */
6309 if (first_p)
6310 vect_model_reduction_cost (stmt_info, epilog_reduc_code, ncopies);
6311 STMT_VINFO_TYPE (stmt_info) = reduc_vec_info_type;
6312 return true;
6315 /* Transform. */
6317 if (dump_enabled_p ())
6318 dump_printf_loc (MSG_NOTE, vect_location, "transform reduction.\n");
6320 /* FORNOW: Multiple types are not supported for condition. */
6321 if (code == COND_EXPR)
6322 gcc_assert (ncopies == 1);
6324 /* Create the destination vector */
6325 vec_dest = vect_create_destination_var (scalar_dest, vectype_out);
6327 prev_stmt_info = NULL;
6328 prev_phi_info = NULL;
6329 if (slp_node)
6330 vec_num = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node);
6331 else
6333 vec_num = 1;
6334 vec_oprnds0.create (1);
6335 vec_oprnds1.create (1);
6336 if (op_type == ternary_op)
6337 vec_oprnds2.create (1);
6340 phis.create (vec_num);
6341 vect_defs.create (vec_num);
6342 if (!slp_node)
6343 vect_defs.quick_push (NULL_TREE);
6345 if (slp_node)
6346 phis.splice (SLP_TREE_VEC_STMTS (slp_node_instance->reduc_phis));
6347 else
6348 phis.quick_push (STMT_VINFO_VEC_STMT (vinfo_for_stmt (reduc_def_stmt)));
6350 for (j = 0; j < ncopies; j++)
6352 if (code == COND_EXPR)
6354 gcc_assert (!slp_node);
6355 vectorizable_condition (stmt, gsi, vec_stmt,
6356 PHI_RESULT (phis[0]),
6357 reduc_index, NULL);
6358 /* Multiple types are not supported for condition. */
6359 break;
6362 /* Handle uses. */
6363 if (j == 0)
6365 if (slp_node)
6367 /* Get vec defs for all the operands except the reduction index,
6368 ensuring the ordering of the ops in the vector is kept. */
6369 auto_vec<tree, 3> slp_ops;
6370 auto_vec<vec<tree>, 3> vec_defs;
6372 slp_ops.quick_push (ops[0]);
6373 slp_ops.quick_push (ops[1]);
6374 if (op_type == ternary_op)
6375 slp_ops.quick_push (ops[2]);
6377 vect_get_slp_defs (slp_ops, slp_node, &vec_defs);
6379 vec_oprnds0.safe_splice (vec_defs[0]);
6380 vec_defs[0].release ();
6381 vec_oprnds1.safe_splice (vec_defs[1]);
6382 vec_defs[1].release ();
6383 if (op_type == ternary_op)
6385 vec_oprnds2.safe_splice (vec_defs[2]);
6386 vec_defs[2].release ();
6389 else
6391 vec_oprnds0.quick_push
6392 (vect_get_vec_def_for_operand (ops[0], stmt));
6393 vec_oprnds1.quick_push
6394 (vect_get_vec_def_for_operand (ops[1], stmt));
6395 if (op_type == ternary_op)
6396 vec_oprnds2.quick_push
6397 (vect_get_vec_def_for_operand (ops[2], stmt));
6400 else
6402 if (!slp_node)
6404 gcc_assert (reduc_index != -1 || ! single_defuse_cycle);
6406 if (single_defuse_cycle && reduc_index == 0)
6407 vec_oprnds0[0] = gimple_assign_lhs (new_stmt);
6408 else
6409 vec_oprnds0[0]
6410 = vect_get_vec_def_for_stmt_copy (dts[0], vec_oprnds0[0]);
6411 if (single_defuse_cycle && reduc_index == 1)
6412 vec_oprnds1[0] = gimple_assign_lhs (new_stmt);
6413 else
6414 vec_oprnds1[0]
6415 = vect_get_vec_def_for_stmt_copy (dts[1], vec_oprnds1[0]);
6416 if (op_type == ternary_op)
6418 if (single_defuse_cycle && reduc_index == 2)
6419 vec_oprnds2[0] = gimple_assign_lhs (new_stmt);
6420 else
6421 vec_oprnds2[0]
6422 = vect_get_vec_def_for_stmt_copy (dts[2], vec_oprnds2[0]);
6427 FOR_EACH_VEC_ELT (vec_oprnds0, i, def0)
6429 tree vop[3] = { def0, vec_oprnds1[i], NULL_TREE };
6430 if (op_type == ternary_op)
6431 vop[2] = vec_oprnds2[i];
6433 new_temp = make_ssa_name (vec_dest, new_stmt);
6434 new_stmt = gimple_build_assign (new_temp, code,
6435 vop[0], vop[1], vop[2]);
6436 vect_finish_stmt_generation (stmt, new_stmt, gsi);
6438 if (slp_node)
6440 SLP_TREE_VEC_STMTS (slp_node).quick_push (new_stmt);
6441 vect_defs.quick_push (new_temp);
6443 else
6444 vect_defs[0] = new_temp;
6447 if (slp_node)
6448 continue;
6450 if (j == 0)
6451 STMT_VINFO_VEC_STMT (stmt_info) = *vec_stmt = new_stmt;
6452 else
6453 STMT_VINFO_RELATED_STMT (prev_stmt_info) = new_stmt;
6455 prev_stmt_info = vinfo_for_stmt (new_stmt);
6458 /* Finalize the reduction-phi (set its arguments) and create the
6459 epilog reduction code. */
6460 if ((!single_defuse_cycle || code == COND_EXPR) && !slp_node)
6461 vect_defs[0] = gimple_assign_lhs (*vec_stmt);
6463 vect_create_epilog_for_reduction (vect_defs, stmt, reduc_def_stmt,
6464 epilog_copies,
6465 epilog_reduc_code, phis,
6466 double_reduc, slp_node, slp_node_instance);
6468 return true;
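/* Putting the above together, a plain sum reduction

     int sum = 0;
     for (i = 0; i < N; i++)
       sum += a[i];

   is transformed roughly as if rewritten (a sketch only, assuming a
   four-lane vector and ignoring prologue/epilogue loops and alignment
   handling):

     typedef int __attribute__((mode(V4SI))) v4si;
     v4si vsum = { 0, 0, 0, 0 };
     for (i = 0; i < N/4; i++)
       vsum += ((v4si *) a)[i];
     sum = vsum[0] + vsum[1] + vsum[2] + vsum[3];

   where the final cross-lane sum corresponds to the epilog code emitted by
   vect_create_epilog_for_reduction after the loop.  */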
6471 /* Function vect_min_worthwhile_factor.
6473 For a loop where we could vectorize the operation indicated by CODE,
6474 return the minimum vectorization factor that makes it worthwhile
6475 to use generic vectors. */
6477 vect_min_worthwhile_factor (enum tree_code code)
6479 switch (code)
6481 case PLUS_EXPR:
6482 case MINUS_EXPR:
6483 case NEGATE_EXPR:
6484 return 4;
6486 case BIT_AND_EXPR:
6487 case BIT_IOR_EXPR:
6488 case BIT_XOR_EXPR:
6489 case BIT_NOT_EXPR:
6490 return 2;
6492 default:
6493 return INT_MAX;
6497 /* Return true if VINFO indicates we are doing loop vectorization and if
6498 it is worth decomposing CODE operations into scalar operations for
6499 that loop's vectorization factor. */
6501 bool
6502 vect_worthwhile_without_simd_p (vec_info *vinfo, tree_code code)
6504 loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo);
6505 return (loop_vinfo
6506 && (LOOP_VINFO_VECT_FACTOR (loop_vinfo)
6507 >= vect_min_worthwhile_factor (code)));
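/* For example, with a vectorization factor of 4, decomposing PLUS_EXPR,
   MINUS_EXPR or NEGATE_EXPR into scalar operations is still considered
   worthwhile (minimum factor 4), whereas with a factor of 2 only the
   bitwise operations above qualify.  */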
6510 /* Function vectorizable_induction
6512 Check if PHI performs an induction computation that can be vectorized.
6513 If VEC_STMT is also passed, vectorize the induction PHI: create a vectorized
6514 phi to replace it, put it in VEC_STMT, and add it to the same basic block.
6515 Return FALSE if not a vectorizable STMT, TRUE otherwise. */
6517 bool
6518 vectorizable_induction (gimple *phi,
6519 gimple_stmt_iterator *gsi ATTRIBUTE_UNUSED,
6520 gimple **vec_stmt, slp_tree slp_node)
6522 stmt_vec_info stmt_info = vinfo_for_stmt (phi);
6523 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
6524 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
6525 unsigned ncopies;
6526 bool nested_in_vect_loop = false;
6527 struct loop *iv_loop;
6528 tree vec_def;
6529 edge pe = loop_preheader_edge (loop);
6530 basic_block new_bb;
6531 tree new_vec, vec_init, vec_step, t;
6532 tree new_name;
6533 gimple *new_stmt;
6534 gphi *induction_phi;
6535 tree induc_def, vec_dest;
6536 tree init_expr, step_expr;
6537 int vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
6538 unsigned i;
6539 tree expr;
6540 gimple_seq stmts;
6541 imm_use_iterator imm_iter;
6542 use_operand_p use_p;
6543 gimple *exit_phi;
6544 edge latch_e;
6545 tree loop_arg;
6546 gimple_stmt_iterator si;
6547 basic_block bb = gimple_bb (phi);
6549 if (gimple_code (phi) != GIMPLE_PHI)
6550 return false;
6552 if (!STMT_VINFO_RELEVANT_P (stmt_info))
6553 return false;
6555 /* Make sure it was recognized as induction computation. */
6556 if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_induction_def)
6557 return false;
6559 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
6560 unsigned nunits = TYPE_VECTOR_SUBPARTS (vectype);
6562 if (slp_node)
6563 ncopies = 1;
6564 else
6565 ncopies = vect_get_num_copies (loop_vinfo, vectype);
6566 gcc_assert (ncopies >= 1);
6568 /* FORNOW. These restrictions should be relaxed. */
6569 if (nested_in_vect_loop_p (loop, phi))
6571 imm_use_iterator imm_iter;
6572 use_operand_p use_p;
6573 gimple *exit_phi;
6574 edge latch_e;
6575 tree loop_arg;
6577 if (ncopies > 1)
6579 if (dump_enabled_p ())
6580 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6581 "multiple types in nested loop.\n");
6582 return false;
6585 /* FORNOW: outer loop induction with SLP not supported. */
6586 if (STMT_SLP_TYPE (stmt_info))
6587 return false;
6589 exit_phi = NULL;
6590 latch_e = loop_latch_edge (loop->inner);
6591 loop_arg = PHI_ARG_DEF_FROM_EDGE (phi, latch_e);
6592 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, loop_arg)
6594 gimple *use_stmt = USE_STMT (use_p);
6595 if (is_gimple_debug (use_stmt))
6596 continue;
6598 if (!flow_bb_inside_loop_p (loop->inner, gimple_bb (use_stmt)))
6600 exit_phi = use_stmt;
6601 break;
6604 if (exit_phi)
6606 stmt_vec_info exit_phi_vinfo = vinfo_for_stmt (exit_phi);
6607 if (!(STMT_VINFO_RELEVANT_P (exit_phi_vinfo)
6608 && !STMT_VINFO_LIVE_P (exit_phi_vinfo)))
6610 if (dump_enabled_p ())
6611 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6612 "inner-loop induction only used outside "
6613 "of the outer vectorized loop.\n");
6614 return false;
6618 nested_in_vect_loop = true;
6619 iv_loop = loop->inner;
6621 else
6622 iv_loop = loop;
6623 gcc_assert (iv_loop == (gimple_bb (phi))->loop_father);
6625 if (!vec_stmt) /* transformation not required. */
6627 STMT_VINFO_TYPE (stmt_info) = induc_vec_info_type;
6628 if (dump_enabled_p ())
6629 dump_printf_loc (MSG_NOTE, vect_location,
6630 "=== vectorizable_induction ===\n");
6631 vect_model_induction_cost (stmt_info, ncopies);
6632 return true;
6635 /* Transform. */
6637 /* Compute a vector variable, initialized with the first VF values of
6638 the induction variable. E.g., for an iv with IV_PHI='X' and
6639 evolution S, for a vector of 4 units, we want to compute:
6640 [X, X + S, X + 2*S, X + 3*S]. */
6642 if (dump_enabled_p ())
6643 dump_printf_loc (MSG_NOTE, vect_location, "transform induction phi.\n");
6645 latch_e = loop_latch_edge (iv_loop);
6646 loop_arg = PHI_ARG_DEF_FROM_EDGE (phi, latch_e);
6648 step_expr = STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_info);
6649 gcc_assert (step_expr != NULL_TREE);
6651 pe = loop_preheader_edge (iv_loop);
6652 init_expr = PHI_ARG_DEF_FROM_EDGE (phi,
6653 loop_preheader_edge (iv_loop));
6655 /* Convert the step to the desired type. */
6656 stmts = NULL;
6657 step_expr = gimple_convert (&stmts, TREE_TYPE (vectype), step_expr);
6658 if (stmts)
6660 new_bb = gsi_insert_seq_on_edge_immediate (pe, stmts);
6661 gcc_assert (!new_bb);
6664 /* Find the first insertion point in the BB. */
6665 si = gsi_after_labels (bb);
6667 /* For SLP induction we have to generate several IVs as for example
6668 with group size 3 we need [i, i, i, i + S] [i + S, i + S, i + 2*S, i + 2*S]
6669 [i + 2*S, i + 3*S, i + 3*S, i + 3*S]. The step is the same uniform
6670 [VF*S, VF*S, VF*S, VF*S] for all. */
6671 if (slp_node)
6673 /* Convert the init to the desired type. */
6674 stmts = NULL;
6675 init_expr = gimple_convert (&stmts, TREE_TYPE (vectype), init_expr);
6676 if (stmts)
6678 new_bb = gsi_insert_seq_on_edge_immediate (pe, stmts);
6679 gcc_assert (!new_bb);
6682 /* Generate [VF*S, VF*S, ... ]. */
6683 if (SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr)))
6685 expr = build_int_cst (integer_type_node, vf);
6686 expr = fold_convert (TREE_TYPE (step_expr), expr);
6688 else
6689 expr = build_int_cst (TREE_TYPE (step_expr), vf);
6690 new_name = fold_build2 (MULT_EXPR, TREE_TYPE (step_expr),
6691 expr, step_expr);
6692 if (! CONSTANT_CLASS_P (new_name))
6693 new_name = vect_init_vector (phi, new_name,
6694 TREE_TYPE (step_expr), NULL);
6695 new_vec = build_vector_from_val (vectype, new_name);
6696 vec_step = vect_init_vector (phi, new_vec, vectype, NULL);
6698 /* Now generate the IVs. */
6699 unsigned group_size = SLP_TREE_SCALAR_STMTS (slp_node).length ();
6700 unsigned nvects = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node);
6701 unsigned elts = nunits * nvects;
6702 unsigned nivs = least_common_multiple (group_size, nunits) / nunits;
6703 gcc_assert (elts % group_size == 0);
6704 tree elt = init_expr;
6705 unsigned ivn;
6706 for (ivn = 0; ivn < nivs; ++ivn)
6708 auto_vec<tree, 32> elts (nunits);
6709 stmts = NULL;
6710 for (unsigned eltn = 0; eltn < nunits; ++eltn)
6712 if (ivn*nunits + eltn >= group_size
6713 && (ivn*nunits + eltn) % group_size == 0)
6714 elt = gimple_build (&stmts, PLUS_EXPR, TREE_TYPE (elt),
6715 elt, step_expr);
6716 elts.quick_push (elt);
6718 vec_init = gimple_build_vector (&stmts, vectype, elts);
6719 if (stmts)
6721 new_bb = gsi_insert_seq_on_edge_immediate (pe, stmts);
6722 gcc_assert (!new_bb);
6725 /* Create the induction-phi that defines the induction-operand. */
6726 vec_dest = vect_get_new_vect_var (vectype, vect_simple_var, "vec_iv_");
6727 induction_phi = create_phi_node (vec_dest, iv_loop->header);
6728 set_vinfo_for_stmt (induction_phi,
6729 new_stmt_vec_info (induction_phi, loop_vinfo));
6730 induc_def = PHI_RESULT (induction_phi);
6732 /* Create the iv update inside the loop */
6733 vec_def = make_ssa_name (vec_dest);
6734 new_stmt = gimple_build_assign (vec_def, PLUS_EXPR, induc_def, vec_step);
6735 gsi_insert_before (&si, new_stmt, GSI_SAME_STMT);
6736 set_vinfo_for_stmt (new_stmt, new_stmt_vec_info (new_stmt, loop_vinfo));
6738 /* Set the arguments of the phi node: */
6739 add_phi_arg (induction_phi, vec_init, pe, UNKNOWN_LOCATION);
6740 add_phi_arg (induction_phi, vec_def, loop_latch_edge (iv_loop),
6741 UNKNOWN_LOCATION);
6743 SLP_TREE_VEC_STMTS (slp_node).quick_push (induction_phi);
6746 /* Re-use IVs when we can. */
6747 if (ivn < nvects)
6749 unsigned vfp
6750 = least_common_multiple (group_size, nunits) / group_size;
6751 /* Generate [VF'*S, VF'*S, ... ]. */
6752 if (SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr)))
6754 expr = build_int_cst (integer_type_node, vfp);
6755 expr = fold_convert (TREE_TYPE (step_expr), expr);
6757 else
6758 expr = build_int_cst (TREE_TYPE (step_expr), vfp);
6759 new_name = fold_build2 (MULT_EXPR, TREE_TYPE (step_expr),
6760 expr, step_expr);
6761 if (! CONSTANT_CLASS_P (new_name))
6762 new_name = vect_init_vector (phi, new_name,
6763 TREE_TYPE (step_expr), NULL);
6764 new_vec = build_vector_from_val (vectype, new_name);
6765 vec_step = vect_init_vector (phi, new_vec, vectype, NULL);
6766 for (; ivn < nvects; ++ivn)
6768 gimple *iv = SLP_TREE_VEC_STMTS (slp_node)[ivn - nivs];
6769 tree def;
6770 if (gimple_code (iv) == GIMPLE_PHI)
6771 def = gimple_phi_result (iv);
6772 else
6773 def = gimple_assign_lhs (iv);
6774 new_stmt = gimple_build_assign (make_ssa_name (vectype),
6775 PLUS_EXPR,
6776 def, vec_step);
6777 if (gimple_code (iv) == GIMPLE_PHI)
6778 gsi_insert_before (&si, new_stmt, GSI_SAME_STMT);
6779 else
6781 gimple_stmt_iterator tgsi = gsi_for_stmt (iv);
6782 gsi_insert_after (&tgsi, new_stmt, GSI_CONTINUE_LINKING);
6784 set_vinfo_for_stmt (new_stmt,
6785 new_stmt_vec_info (new_stmt, loop_vinfo));
6786 SLP_TREE_VEC_STMTS (slp_node).quick_push (new_stmt);
6790 return true;
6793 /* Create the vector that holds the initial_value of the induction. */
6794 if (nested_in_vect_loop)
6796 /* iv_loop is nested in the loop to be vectorized. init_expr had already
6797 been created during vectorization of previous stmts. We obtain it
6798 from the STMT_VINFO_VEC_STMT of the defining stmt. */
6799 vec_init = vect_get_vec_def_for_operand (init_expr, phi);
6800 /* If the initial value is not of proper type, convert it. */
6801 if (!useless_type_conversion_p (vectype, TREE_TYPE (vec_init)))
6803 new_stmt
6804 = gimple_build_assign (vect_get_new_ssa_name (vectype,
6805 vect_simple_var,
6806 "vec_iv_"),
6807 VIEW_CONVERT_EXPR,
6808 build1 (VIEW_CONVERT_EXPR, vectype,
6809 vec_init));
6810 vec_init = gimple_assign_lhs (new_stmt);
6811 new_bb = gsi_insert_on_edge_immediate (loop_preheader_edge (iv_loop),
6812 new_stmt);
6813 gcc_assert (!new_bb);
6814 set_vinfo_for_stmt (new_stmt,
6815 new_stmt_vec_info (new_stmt, loop_vinfo));
6818 else
6820 /* iv_loop is the loop to be vectorized. Create:
6821 vec_init = [X, X+S, X+2*S, X+3*S] (S = step_expr, X = init_expr) */
6822 stmts = NULL;
6823 new_name = gimple_convert (&stmts, TREE_TYPE (vectype), init_expr);
6825 auto_vec<tree, 32> elts (nunits);
6826 elts.quick_push (new_name);
6827 for (i = 1; i < nunits; i++)
6829 /* Create: new_name_i = new_name + step_expr */
6830 new_name = gimple_build (&stmts, PLUS_EXPR, TREE_TYPE (new_name),
6831 new_name, step_expr);
6832 elts.quick_push (new_name);
6834 /* Create a vector from [new_name_0, new_name_1, ...,
6835 new_name_nunits-1] */
6836 vec_init = gimple_build_vector (&stmts, vectype, elts);
6837 if (stmts)
6839 new_bb = gsi_insert_seq_on_edge_immediate (pe, stmts);
6840 gcc_assert (!new_bb);
6845 /* Create the vector that holds the step of the induction. */
6846 if (nested_in_vect_loop)
6847 /* iv_loop is nested in the loop to be vectorized. Generate:
6848 vec_step = [S, S, S, S] */
6849 new_name = step_expr;
6850 else
6852 /* iv_loop is the loop to be vectorized. Generate:
6853 vec_step = [VF*S, VF*S, VF*S, VF*S] */
6854 gimple_seq seq = NULL;
6855 if (SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr)))
6857 expr = build_int_cst (integer_type_node, vf);
6858 expr = gimple_build (&seq, FLOAT_EXPR, TREE_TYPE (step_expr), expr);
6860 else
6861 expr = build_int_cst (TREE_TYPE (step_expr), vf);
6862 new_name = gimple_build (&seq, MULT_EXPR, TREE_TYPE (step_expr),
6863 expr, step_expr);
6864 if (seq)
6866 new_bb = gsi_insert_seq_on_edge_immediate (pe, seq);
6867 gcc_assert (!new_bb);
6871 t = unshare_expr (new_name);
6872 gcc_assert (CONSTANT_CLASS_P (new_name)
6873 || TREE_CODE (new_name) == SSA_NAME);
6874 new_vec = build_vector_from_val (vectype, t);
6875 vec_step = vect_init_vector (phi, new_vec, vectype, NULL);
6878 /* Create the following def-use cycle:
6879 loop prolog:
6880 vec_init = ...
6881 vec_step = ...
6882 loop:
6883 vec_iv = PHI <vec_init, vec_loop>
6885 STMT
6887 vec_loop = vec_iv + vec_step; */
6889 /* Create the induction-phi that defines the induction-operand. */
6890 vec_dest = vect_get_new_vect_var (vectype, vect_simple_var, "vec_iv_");
6891 induction_phi = create_phi_node (vec_dest, iv_loop->header);
6892 set_vinfo_for_stmt (induction_phi,
6893 new_stmt_vec_info (induction_phi, loop_vinfo));
6894 induc_def = PHI_RESULT (induction_phi);
6896 /* Create the iv update inside the loop */
6897 vec_def = make_ssa_name (vec_dest);
6898 new_stmt = gimple_build_assign (vec_def, PLUS_EXPR, induc_def, vec_step);
6899 gsi_insert_before (&si, new_stmt, GSI_SAME_STMT);
6900 set_vinfo_for_stmt (new_stmt, new_stmt_vec_info (new_stmt, loop_vinfo));
6902 /* Set the arguments of the phi node: */
6903 add_phi_arg (induction_phi, vec_init, pe, UNKNOWN_LOCATION);
6904 add_phi_arg (induction_phi, vec_def, loop_latch_edge (iv_loop),
6905 UNKNOWN_LOCATION);
6907 STMT_VINFO_VEC_STMT (stmt_info) = *vec_stmt = induction_phi;
6909 /* In case the vectorization factor (VF) is bigger than the number
6910 of elements that we can fit in a vectype (nunits), we have to generate
6911 more than one vector stmt - i.e - we need to "unroll" the
6912 vector stmt by a factor VF/nunits. For more details see documentation
6913 in vectorizable_operation. */
6915 if (ncopies > 1)
6917 gimple_seq seq = NULL;
6918 stmt_vec_info prev_stmt_vinfo;
6919 /* FORNOW. This restriction should be relaxed. */
6920 gcc_assert (!nested_in_vect_loop);
6922 /* Create the vector that holds the step of the induction. */
6923 if (SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr)))
6925 expr = build_int_cst (integer_type_node, nunits);
6926 expr = gimple_build (&seq, FLOAT_EXPR, TREE_TYPE (step_expr), expr);
6928 else
6929 expr = build_int_cst (TREE_TYPE (step_expr), nunits);
6930 new_name = gimple_build (&seq, MULT_EXPR, TREE_TYPE (step_expr),
6931 expr, step_expr);
6932 if (seq)
6934 new_bb = gsi_insert_seq_on_edge_immediate (pe, seq);
6935 gcc_assert (!new_bb);
6938 t = unshare_expr (new_name);
6939 gcc_assert (CONSTANT_CLASS_P (new_name)
6940 || TREE_CODE (new_name) == SSA_NAME);
6941 new_vec = build_vector_from_val (vectype, t);
6942 vec_step = vect_init_vector (phi, new_vec, vectype, NULL);
6944 vec_def = induc_def;
6945 prev_stmt_vinfo = vinfo_for_stmt (induction_phi);
6946 for (i = 1; i < ncopies; i++)
6948 /* vec_i = vec_prev + vec_step */
6949 new_stmt = gimple_build_assign (vec_dest, PLUS_EXPR,
6950 vec_def, vec_step);
6951 vec_def = make_ssa_name (vec_dest, new_stmt);
6952 gimple_assign_set_lhs (new_stmt, vec_def);
6954 gsi_insert_before (&si, new_stmt, GSI_SAME_STMT);
6955 set_vinfo_for_stmt (new_stmt,
6956 new_stmt_vec_info (new_stmt, loop_vinfo));
6957 STMT_VINFO_RELATED_STMT (prev_stmt_vinfo) = new_stmt;
6958 prev_stmt_vinfo = vinfo_for_stmt (new_stmt);
6962 if (nested_in_vect_loop)
6964 /* Find the loop-closed exit-phi of the induction, and record
6965 the final vector of induction results: */
6966 exit_phi = NULL;
6967 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, loop_arg)
6969 gimple *use_stmt = USE_STMT (use_p);
6970 if (is_gimple_debug (use_stmt))
6971 continue;
6973 if (!flow_bb_inside_loop_p (iv_loop, gimple_bb (use_stmt)))
6975 exit_phi = use_stmt;
6976 break;
6979 if (exit_phi)
6981 stmt_vec_info stmt_vinfo = vinfo_for_stmt (exit_phi);
6982 /* FORNOW. Currently not supporting the case that an inner-loop induction
6983 is not used in the outer-loop (i.e. only outside the outer-loop). */
6984 gcc_assert (STMT_VINFO_RELEVANT_P (stmt_vinfo)
6985 && !STMT_VINFO_LIVE_P (stmt_vinfo));
6987 STMT_VINFO_VEC_STMT (stmt_vinfo) = new_stmt;
6988 if (dump_enabled_p ())
6990 dump_printf_loc (MSG_NOTE, vect_location,
6991 "vector of inductions after inner-loop:");
6992 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, new_stmt, 0);
6998 if (dump_enabled_p ())
7000 dump_printf_loc (MSG_NOTE, vect_location,
7001 "transform induction: created def-use cycle: ");
7002 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, induction_phi, 0);
7003 dump_gimple_stmt (MSG_NOTE, TDF_SLIM,
7004 SSA_NAME_DEF_STMT (vec_def), 0);
7007 return true;
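/* Illustration with concrete numbers (assuming VF == 4 and a simple
   counter IV with initial value 0 and step 1):

     loop prolog:
       vec_init = { 0, 1, 2, 3 }
       vec_step = { 4, 4, 4, 4 }
     loop:
       vec_iv = PHI <vec_init, vec_loop>
       ...
       vec_loop = vec_iv + vec_step;

   so in vector iteration k the IV vector holds { 4k, 4k+1, 4k+2, 4k+3 }.  */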
7010 /* Function vectorizable_live_operation.
7012 STMT computes a value that is used outside the loop. Check if
7013 it can be supported. */
7015 bool
7016 vectorizable_live_operation (gimple *stmt,
7017 gimple_stmt_iterator *gsi ATTRIBUTE_UNUSED,
7018 slp_tree slp_node, int slp_index,
7019 gimple **vec_stmt)
7021 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
7022 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
7023 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
7024 imm_use_iterator imm_iter;
7025 tree lhs, lhs_type, bitsize, vec_bitsize;
7026 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
7027 int nunits = TYPE_VECTOR_SUBPARTS (vectype);
7028 int ncopies;
7029 gimple *use_stmt;
7030 auto_vec<tree> vec_oprnds;
7032 gcc_assert (STMT_VINFO_LIVE_P (stmt_info));
7034 if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_reduction_def)
7035 return false;
7037 /* FORNOW. CHECKME. */
7038 if (nested_in_vect_loop_p (loop, stmt))
7039 return false;
7041 /* If STMT is not relevant and it is a simple assignment and its inputs are
7042 invariant then it can remain in place, unvectorized. The original last
7043 scalar value that it computes will be used. */
7044 if (!STMT_VINFO_RELEVANT_P (stmt_info))
7046 gcc_assert (is_simple_and_all_uses_invariant (stmt, loop_vinfo));
7047 if (dump_enabled_p ())
7048 dump_printf_loc (MSG_NOTE, vect_location,
7049 "statement is simple and uses invariant. Leaving in "
7050 "place.\n");
7051 return true;
7054 if (slp_node)
7055 ncopies = 1;
7056 else
7057 ncopies = vect_get_num_copies (loop_vinfo, vectype);
7059 if (!vec_stmt)
7060 /* No transformation required. */
7061 return true;
7063 /* If stmt has a related stmt, then use that for getting the lhs. */
7064 if (is_pattern_stmt_p (stmt_info))
7065 stmt = STMT_VINFO_RELATED_STMT (stmt_info);
7067 lhs = (is_a <gphi *> (stmt)) ? gimple_phi_result (stmt)
7068 : gimple_get_lhs (stmt);
7069 lhs_type = TREE_TYPE (lhs);
7071 bitsize = (VECTOR_BOOLEAN_TYPE_P (vectype)
7072 ? bitsize_int (TYPE_PRECISION (TREE_TYPE (vectype)))
7073 : TYPE_SIZE (TREE_TYPE (vectype)));
7074 vec_bitsize = TYPE_SIZE (vectype);
7076 /* Get the vectorized lhs of STMT and the lane to use (counted in bits). */
7077 tree vec_lhs, bitstart;
7078 if (slp_node)
7080 gcc_assert (slp_index >= 0);
7082 int num_scalar = SLP_TREE_SCALAR_STMTS (slp_node).length ();
7083 int num_vec = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node);
7085 /* Get the last occurrence of the scalar index from the concatenation of
7086 all the slp vectors. Calculate which slp vector it is and the index
7087 within. */
7088 int pos = (num_vec * nunits) - num_scalar + slp_index;
7089 int vec_entry = pos / nunits;
7090 int vec_index = pos % nunits;
7092 /* Get the correct slp vectorized stmt. */
7093 vec_lhs = gimple_get_lhs (SLP_TREE_VEC_STMTS (slp_node)[vec_entry]);
7095 /* Get entry to use. */
7096 bitstart = bitsize_int (vec_index);
7097 bitstart = int_const_binop (MULT_EXPR, bitsize, bitstart);
7099 else
7101 enum vect_def_type dt = STMT_VINFO_DEF_TYPE (stmt_info);
7102 vec_lhs = vect_get_vec_def_for_operand_1 (stmt, dt);
7104 /* For multiple copies, get the last copy. */
7105 for (int i = 1; i < ncopies; ++i)
7106 vec_lhs = vect_get_vec_def_for_stmt_copy (vect_unknown_def_type,
7107 vec_lhs);
7109 /* Get the last lane in the vector. */
7110 bitstart = int_const_binop (MINUS_EXPR, vec_bitsize, bitsize);
7113 /* Create a new vectorized stmt for the uses of STMT and insert outside the
7114 loop. */
7115 gimple_seq stmts = NULL;
7116 tree bftype = TREE_TYPE (vectype);
7117 if (VECTOR_BOOLEAN_TYPE_P (vectype))
7118 bftype = build_nonstandard_integer_type (tree_to_uhwi (bitsize), 1);
7119 tree new_tree = build3 (BIT_FIELD_REF, bftype, vec_lhs, bitsize, bitstart);
7120 new_tree = force_gimple_operand (fold_convert (lhs_type, new_tree), &stmts,
7121 true, NULL_TREE);
7122 if (stmts)
7123 gsi_insert_seq_on_edge_immediate (single_exit (loop), stmts);
7125 /* Replace use of lhs with newly computed result. If the use stmt is a
7126 single arg PHI, just replace all uses of the PHI result. This is needed
7127 because the lcssa PHI defining lhs may come before the newly inserted stmt. */
7128 use_operand_p use_p;
7129 FOR_EACH_IMM_USE_STMT (use_stmt, imm_iter, lhs)
7130 if (!flow_bb_inside_loop_p (loop, gimple_bb (use_stmt))
7131 && !is_gimple_debug (use_stmt))
7133 if (gimple_code (use_stmt) == GIMPLE_PHI
7134 && gimple_phi_num_args (use_stmt) == 1)
7136 replace_uses_by (gimple_phi_result (use_stmt), new_tree);
7138 else
7140 FOR_EACH_IMM_USE_ON_STMT (use_p, imm_iter)
7141 SET_USE (use_p, new_tree);
7143 update_stmt (use_stmt);
7146 return true;
7149 /* Kill any debug uses outside LOOP of SSA names defined in STMT. */
7151 static void
7152 vect_loop_kill_debug_uses (struct loop *loop, gimple *stmt)
7154 ssa_op_iter op_iter;
7155 imm_use_iterator imm_iter;
7156 def_operand_p def_p;
7157 gimple *ustmt;
7159 FOR_EACH_PHI_OR_STMT_DEF (def_p, stmt, op_iter, SSA_OP_DEF)
7161 FOR_EACH_IMM_USE_STMT (ustmt, imm_iter, DEF_FROM_PTR (def_p))
7163 basic_block bb;
7165 if (!is_gimple_debug (ustmt))
7166 continue;
7168 bb = gimple_bb (ustmt);
7170 if (!flow_bb_inside_loop_p (loop, bb))
7172 if (gimple_debug_bind_p (ustmt))
7174 if (dump_enabled_p ())
7175 dump_printf_loc (MSG_NOTE, vect_location,
7176 "killing debug use\n");
7178 gimple_debug_bind_reset_value (ustmt);
7179 update_stmt (ustmt);
7181 else
7182 gcc_unreachable ();
7188 /* Given loop represented by LOOP_VINFO, return true if computation of
7189 LOOP_VINFO_NITERS (= LOOP_VINFO_NITERSM1 + 1) doesn't overflow, false
7190 otherwise. */
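/* For example (illustrative): if NITERS has a 32-bit unsigned type and the
   latch can execute UINT_MAX times, NITERSM1 == 0xffffffff and
   NITERS == NITERSM1 + 1 wraps around to 0; the checks below detect this
   either from the constant values or from the recorded upper bound on the
   number of loop iterations.  */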
7192 static bool
7193 loop_niters_no_overflow (loop_vec_info loop_vinfo)
7195 /* Constant case. */
7196 if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
7198 tree cst_niters = LOOP_VINFO_NITERS (loop_vinfo);
7199 tree cst_nitersm1 = LOOP_VINFO_NITERSM1 (loop_vinfo);
7201 gcc_assert (TREE_CODE (cst_niters) == INTEGER_CST);
7202 gcc_assert (TREE_CODE (cst_nitersm1) == INTEGER_CST);
7203 if (wi::to_widest (cst_nitersm1) < wi::to_widest (cst_niters))
7204 return true;
7207 widest_int max;
7208 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
7209 /* Check the upper bound of loop niters. */
7210 if (get_max_loop_iterations (loop, &max))
7212 tree type = TREE_TYPE (LOOP_VINFO_NITERS (loop_vinfo));
7213 signop sgn = TYPE_SIGN (type);
7214 widest_int type_max = widest_int::from (wi::max_value (type), sgn);
7215 if (max < type_max)
7216 return true;
7218 return false;
7221 /* Scale the profiling counters of LOOP, which has been vectorized by
7222 factor VF, according to the new iteration estimate. */
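/* For example (illustrative numbers): if the scalar loop was estimated to
   iterate about 40 times and VF == 4, new_est_niter is roughly 10; the
   body counts are scaled so that the header executes new_est_niter + 1
   times per entry, the exit edge gets probability 1 / (new_est_niter + 1)
   and the latch edge the complementary probability.  */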
7224 static void
7225 scale_profile_for_vect_loop (struct loop *loop, unsigned vf)
7227 edge preheader = loop_preheader_edge (loop);
7228 /* Reduce loop iterations by the vectorization factor. */
7229 gcov_type new_est_niter = niter_for_unrolled_loop (loop, vf);
7230 profile_count freq_h = loop->header->count, freq_e = preheader->count ();
7232 if (freq_h.nonzero_p ())
7234 profile_probability p;
7236 /* Avoid dropping loop body profile counter to 0 because of zero count
7237 in loop's preheader. */
7238 if (!(freq_e == profile_count::zero ()))
7239 freq_e = freq_e.force_nonzero ();
7240 p = freq_e.apply_scale (new_est_niter + 1, 1).probability_in (freq_h);
7241 scale_loop_frequencies (loop, p);
7244 edge exit_e = single_exit (loop);
7245 exit_e->probability = profile_probability::always ()
7246 .apply_scale (1, new_est_niter + 1);
7248 edge exit_l = single_pred_edge (loop->latch);
7249 profile_probability prob = exit_l->probability;
7250 exit_l->probability = exit_e->probability.invert ();
7251 if (prob.initialized_p () && exit_l->probability.initialized_p ())
7252 scale_bbs_frequencies (&loop->latch, 1, exit_l->probability / prob);
7255 /* Function vect_transform_loop.
7257 The analysis phase has determined that the loop is vectorizable.
7258 Vectorize the loop - create vectorized stmts to replace the scalar
7259 stmts in the loop, and update the loop exit condition.
7260 Returns the scalar epilogue loop, if any. */
7262 struct loop *
7263 vect_transform_loop (loop_vec_info loop_vinfo)
7265 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
7266 struct loop *epilogue = NULL;
7267 basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
7268 int nbbs = loop->num_nodes;
7269 int i;
7270 tree niters_vector = NULL;
7271 int vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
7272 bool grouped_store;
7273 bool slp_scheduled = false;
7274 gimple *stmt, *pattern_stmt;
7275 gimple_seq pattern_def_seq = NULL;
7276 gimple_stmt_iterator pattern_def_si = gsi_none ();
7277 bool transform_pattern_stmt = false;
7278 bool check_profitability = false;
7279 int th;
7281 if (dump_enabled_p ())
7282 dump_printf_loc (MSG_NOTE, vect_location, "=== vect_transform_loop ===\n");
7284 /* Use the more conservative vectorization threshold.  If the number
7285 of iterations is constant, assume the cost check has been performed
7286 by our caller.  If the threshold makes all loops that run at least
7287 the vectorization factor number of times profitable, the runtime
7288 check is pointless, too. */
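/* For instance (illustrative numbers): with a vectorization factor of 4
   and a threshold of 3, every loop that enters the vector code runs at
   least 4 iterations, so a runtime profitability check could never fail
   and is skipped; with a threshold of 100 and an unknown iteration count
   the check is kept and, if the loop is versioned below, is folded into
   the versioning condition.  */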
7289 th = LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo);
7290 if (th >= LOOP_VINFO_VECT_FACTOR (loop_vinfo)
7291 && !LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
7293 if (dump_enabled_p ())
7294 dump_printf_loc (MSG_NOTE, vect_location,
7295 "Profitability threshold is %d loop iterations.\n",
7296 th);
7297 check_profitability = true;
7300 /* Make sure there exists a single-predecessor exit bb. Do this before
7301 versioning. */
7302 edge e = single_exit (loop);
7303 if (! single_pred_p (e->dest))
7305 split_loop_exit_edge (e);
7306 if (dump_enabled_p ())
7307 dump_printf (MSG_NOTE, "split exit edge\n");
7310 /* Version the loop first, if required, so the profitability check
7311 comes first. */
7313 if (LOOP_REQUIRES_VERSIONING (loop_vinfo))
7315 vect_loop_versioning (loop_vinfo, th, check_profitability);
7316 check_profitability = false;
7319 /* Make sure there exists a single-predecessor exit bb also on the
7320 scalar loop copy.  Do this after versioning but before peeling
7321 so the CFG structure is fine for both the scalar and the if-converted
7322 loop, and so slpeel_duplicate_current_defs_from_edges sees matched
7323 loop-closed PHI nodes on the exit. */
7324 if (LOOP_VINFO_SCALAR_LOOP (loop_vinfo))
7326 e = single_exit (LOOP_VINFO_SCALAR_LOOP (loop_vinfo));
7327 if (! single_pred_p (e->dest))
7329 split_loop_exit_edge (e);
7330 if (dump_enabled_p ())
7331 dump_printf (MSG_NOTE, "split exit edge of scalar loop\n");
7335 tree niters = vect_build_loop_niters (loop_vinfo);
7336 LOOP_VINFO_NITERS_UNCHANGED (loop_vinfo) = niters;
7337 tree nitersm1 = unshare_expr (LOOP_VINFO_NITERSM1 (loop_vinfo));
7338 bool niters_no_overflow = loop_niters_no_overflow (loop_vinfo);
7339 epilogue = vect_do_peeling (loop_vinfo, niters, nitersm1, &niters_vector, th,
7340 check_profitability, niters_no_overflow);
7341 if (niters_vector == NULL_TREE)
7343 if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
7344 niters_vector
7345 = build_int_cst (TREE_TYPE (LOOP_VINFO_NITERS (loop_vinfo)),
7346 LOOP_VINFO_INT_NITERS (loop_vinfo) / vf);
7347 else
7348 vect_gen_vector_loop_niters (loop_vinfo, niters, &niters_vector,
7349 niters_no_overflow);
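/* For example (illustrative): with 23 known scalar iterations and vf == 4,
   niters_vector is 23 / 4 == 5, so the vector loop runs 5 times and the
   remaining 3 iterations are handled by the epilogue loop created by
   vect_do_peeling above.  */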
7352 /* 1) Make sure the loop header has exactly two entries
7353 2) Make sure we have a preheader basic block. */
7355 gcc_assert (EDGE_COUNT (loop->header->preds) == 2);
7357 split_edge (loop_preheader_edge (loop));
7359 /* FORNOW: the vectorizer supports only loops whose body consists
7360 of one basic block (header + empty latch).  When the vectorizer
7361 supports more involved loop forms, the order in which the BBs are
7362 traversed needs to be reconsidered. */
7364 for (i = 0; i < nbbs; i++)
7366 basic_block bb = bbs[i];
7367 stmt_vec_info stmt_info;
7369 for (gphi_iterator si = gsi_start_phis (bb); !gsi_end_p (si);
7370 gsi_next (&si))
7372 gphi *phi = si.phi ();
7373 if (dump_enabled_p ())
7375 dump_printf_loc (MSG_NOTE, vect_location,
7376 "------>vectorizing phi: ");
7377 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, phi, 0);
7379 stmt_info = vinfo_for_stmt (phi);
7380 if (!stmt_info)
7381 continue;
7383 if (MAY_HAVE_DEBUG_STMTS && !STMT_VINFO_LIVE_P (stmt_info))
7384 vect_loop_kill_debug_uses (loop, phi);
7386 if (!STMT_VINFO_RELEVANT_P (stmt_info)
7387 && !STMT_VINFO_LIVE_P (stmt_info))
7388 continue;
7390 if (STMT_VINFO_VECTYPE (stmt_info)
7391 && (TYPE_VECTOR_SUBPARTS (STMT_VINFO_VECTYPE (stmt_info))
7392 != (unsigned HOST_WIDE_INT) vf)
7393 && dump_enabled_p ())
7394 dump_printf_loc (MSG_NOTE, vect_location, "multiple-types.\n");
7396 if ((STMT_VINFO_DEF_TYPE (stmt_info) == vect_induction_def
7397 || STMT_VINFO_DEF_TYPE (stmt_info) == vect_reduction_def
7398 || STMT_VINFO_DEF_TYPE (stmt_info) == vect_nested_cycle)
7399 && ! PURE_SLP_STMT (stmt_info))
7401 if (dump_enabled_p ())
7402 dump_printf_loc (MSG_NOTE, vect_location, "transform phi.\n");
7403 vect_transform_stmt (phi, NULL, NULL, NULL, NULL);
7407 pattern_stmt = NULL;
7408 for (gimple_stmt_iterator si = gsi_start_bb (bb);
7409 !gsi_end_p (si) || transform_pattern_stmt;)
7411 bool is_store;
7413 if (transform_pattern_stmt)
7414 stmt = pattern_stmt;
7415 else
7417 stmt = gsi_stmt (si);
7418 /* During vectorization remove existing clobber stmts. */
7419 if (gimple_clobber_p (stmt))
7421 unlink_stmt_vdef (stmt);
7422 gsi_remove (&si, true);
7423 release_defs (stmt);
7424 continue;
7428 if (dump_enabled_p ())
7430 dump_printf_loc (MSG_NOTE, vect_location,
7431 "------>vectorizing statement: ");
7432 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, stmt, 0);
7435 stmt_info = vinfo_for_stmt (stmt);
7437 /* vector stmts created in the outer-loop during vectorization of
7438 stmts in an inner-loop may not have a stmt_info, and do not
7439 need to be vectorized. */
7440 if (!stmt_info)
7442 gsi_next (&si);
7443 continue;
7446 if (MAY_HAVE_DEBUG_STMTS && !STMT_VINFO_LIVE_P (stmt_info))
7447 vect_loop_kill_debug_uses (loop, stmt);
7449 if (!STMT_VINFO_RELEVANT_P (stmt_info)
7450 && !STMT_VINFO_LIVE_P (stmt_info))
7452 if (STMT_VINFO_IN_PATTERN_P (stmt_info)
7453 && (pattern_stmt = STMT_VINFO_RELATED_STMT (stmt_info))
7454 && (STMT_VINFO_RELEVANT_P (vinfo_for_stmt (pattern_stmt))
7455 || STMT_VINFO_LIVE_P (vinfo_for_stmt (pattern_stmt))))
7457 stmt = pattern_stmt;
7458 stmt_info = vinfo_for_stmt (stmt);
7460 else
7462 gsi_next (&si);
7463 continue;
7466 else if (STMT_VINFO_IN_PATTERN_P (stmt_info)
7467 && (pattern_stmt = STMT_VINFO_RELATED_STMT (stmt_info))
7468 && (STMT_VINFO_RELEVANT_P (vinfo_for_stmt (pattern_stmt))
7469 || STMT_VINFO_LIVE_P (vinfo_for_stmt (pattern_stmt))))
7470 transform_pattern_stmt = true;
7472 /* If pattern statement has def stmts, vectorize them too. */
7473 if (is_pattern_stmt_p (stmt_info))
7475 if (pattern_def_seq == NULL)
7477 pattern_def_seq = STMT_VINFO_PATTERN_DEF_SEQ (stmt_info);
7478 pattern_def_si = gsi_start (pattern_def_seq);
7480 else if (!gsi_end_p (pattern_def_si))
7481 gsi_next (&pattern_def_si);
7482 if (pattern_def_seq != NULL)
7484 gimple *pattern_def_stmt = NULL;
7485 stmt_vec_info pattern_def_stmt_info = NULL;
7487 while (!gsi_end_p (pattern_def_si))
7489 pattern_def_stmt = gsi_stmt (pattern_def_si);
7490 pattern_def_stmt_info
7491 = vinfo_for_stmt (pattern_def_stmt);
7492 if (STMT_VINFO_RELEVANT_P (pattern_def_stmt_info)
7493 || STMT_VINFO_LIVE_P (pattern_def_stmt_info))
7494 break;
7495 gsi_next (&pattern_def_si);
7498 if (!gsi_end_p (pattern_def_si))
7500 if (dump_enabled_p ())
7502 dump_printf_loc (MSG_NOTE, vect_location,
7503 "==> vectorizing pattern def "
7504 "stmt: ");
7505 dump_gimple_stmt (MSG_NOTE, TDF_SLIM,
7506 pattern_def_stmt, 0);
7509 stmt = pattern_def_stmt;
7510 stmt_info = pattern_def_stmt_info;
7512 else
7514 pattern_def_si = gsi_none ();
7515 transform_pattern_stmt = false;
7518 else
7519 transform_pattern_stmt = false;
7522 if (STMT_VINFO_VECTYPE (stmt_info))
7524 unsigned int nunits
7525 = (unsigned int)
7526 TYPE_VECTOR_SUBPARTS (STMT_VINFO_VECTYPE (stmt_info));
7527 if (!STMT_SLP_TYPE (stmt_info)
7528 && nunits != (unsigned int) vf
7529 && dump_enabled_p ())
7530 /* For SLP, VF is set according to the unrolling factor and not
7531 to the vector size, hence this diagnostic is not valid for SLP. */
7532 dump_printf_loc (MSG_NOTE, vect_location, "multiple-types.\n");
7535 /* SLP. Schedule all the SLP instances when the first SLP stmt is
7536 reached. */
7537 if (STMT_SLP_TYPE (stmt_info))
7539 if (!slp_scheduled)
7541 slp_scheduled = true;
7543 if (dump_enabled_p ())
7544 dump_printf_loc (MSG_NOTE, vect_location,
7545 "=== scheduling SLP instances ===\n");
7547 vect_schedule_slp (loop_vinfo);
7550 /* Hybrid SLP stmts must be vectorized in addition to SLP. */
7551 if (!vinfo_for_stmt (stmt) || PURE_SLP_STMT (stmt_info))
7553 if (!transform_pattern_stmt && gsi_end_p (pattern_def_si))
7555 pattern_def_seq = NULL;
7556 gsi_next (&si);
7558 continue;
7562 /* -------- vectorize statement ------------ */
7563 if (dump_enabled_p ())
7564 dump_printf_loc (MSG_NOTE, vect_location, "transform statement.\n");
7566 grouped_store = false;
7567 is_store = vect_transform_stmt (stmt, &si, &grouped_store, NULL, NULL);
7568 if (is_store)
7570 if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
7572 /* Interleaving. If IS_STORE is TRUE, the vectorization of the
7573 interleaving chain was completed - free all the stores in
7574 the chain. */
7575 gsi_next (&si);
7576 vect_remove_stores (GROUP_FIRST_ELEMENT (stmt_info));
7578 else
7580 /* Free the attached stmt_vec_info and remove the stmt. */
7581 gimple *store = gsi_stmt (si);
7582 free_stmt_vec_info (store);
7583 unlink_stmt_vdef (store);
7584 gsi_remove (&si, true);
7585 release_defs (store);
7588 /* Stores can only appear at the end of pattern statements. */
7589 gcc_assert (!transform_pattern_stmt);
7590 pattern_def_seq = NULL;
7592 else if (!transform_pattern_stmt && gsi_end_p (pattern_def_si))
7594 pattern_def_seq = NULL;
7595 gsi_next (&si);
7597 } /* stmts in BB */
7598 } /* BBs in loop */
7600 slpeel_make_loop_iterate_ntimes (loop, niters_vector);
7602 scale_profile_for_vect_loop (loop, vf);
7604 /* The minimum number of iterations performed by the epilogue. This
7605 is 1 when peeling for gaps because we always need a final scalar
7606 iteration. */
7607 int min_epilogue_iters = LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo) ? 1 : 0;
7608 /* +1 to convert latch counts to loop iteration counts,
7609 -min_epilogue_iters to remove iterations that cannot be performed
7610 by the vector code. */
7611 int bias = 1 - min_epilogue_iters;
7612 /* In these calculations the "- 1" converts loop iteration counts
7613 back to latch counts. */
7614 if (loop->any_upper_bound)
7615 loop->nb_iterations_upper_bound
7616 = wi::udiv_floor (loop->nb_iterations_upper_bound + bias, vf) - 1;
7617 if (loop->any_likely_upper_bound)
7618 loop->nb_iterations_likely_upper_bound
7619 = wi::udiv_floor (loop->nb_iterations_likely_upper_bound + bias, vf) - 1;
7620 if (loop->any_estimate)
7621 loop->nb_iterations_estimate
7622 = wi::udiv_floor (loop->nb_iterations_estimate + bias, vf) - 1;
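/* Worked example with made-up numbers: with vf == 4, no peeling for gaps
   (so bias == 1) and an upper bound of 102 latch iterations (103 loop
   iterations), the vector loop executes at most floor (103 / 4) == 25
   iterations, so its latch bound becomes 25 - 1 == 24.  */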
7624 if (dump_enabled_p ())
7626 if (!LOOP_VINFO_EPILOGUE_P (loop_vinfo))
7628 dump_printf_loc (MSG_NOTE, vect_location,
7629 "LOOP VECTORIZED\n");
7630 if (loop->inner)
7631 dump_printf_loc (MSG_NOTE, vect_location,
7632 "OUTER LOOP VECTORIZED\n");
7633 dump_printf (MSG_NOTE, "\n");
7635 else
7636 dump_printf_loc (MSG_NOTE, vect_location,
7637 "LOOP EPILOGUE VECTORIZED (VS=%d)\n",
7638 current_vector_size);
7641 /* Free SLP instances here because otherwise stmt reference counting
7642 won't work. */
7643 slp_instance instance;
7644 FOR_EACH_VEC_ELT (LOOP_VINFO_SLP_INSTANCES (loop_vinfo), i, instance)
7645 vect_free_slp_instance (instance);
7646 LOOP_VINFO_SLP_INSTANCES (loop_vinfo).release ();
7647 /* Clear the safelen field since its value is invalid after vectorization:
7648 the vectorized loop can have loop-carried dependencies. */
7649 loop->safelen = 0;
7651 /* Don't vectorize the epilogue of an epilogue loop. */
7652 if (LOOP_VINFO_EPILOGUE_P (loop_vinfo))
7653 epilogue = NULL;
7655 if (epilogue)
7657 unsigned int vector_sizes
7658 = targetm.vectorize.autovectorize_vector_sizes ();
7659 vector_sizes &= current_vector_size - 1;
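/* The supported vector sizes are powers of two, so current_vector_size - 1
   acts as a mask keeping only the strictly smaller sizes; those are the
   candidate sizes for vectorizing the epilogue.  */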
7661 if (!PARAM_VALUE (PARAM_VECT_EPILOGUES_NOMASK))
7662 epilogue = NULL;
7663 else if (!vector_sizes)
7664 epilogue = NULL;
7665 else if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
7666 && LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) >= 0)
7668 int smallest_vec_size = 1 << ctz_hwi (vector_sizes);
7669 int ratio = current_vector_size / smallest_vec_size;
7670 int eiters = LOOP_VINFO_INT_NITERS (loop_vinfo)
7671 - LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo);
7672 eiters = eiters % vf;
7674 epilogue->nb_iterations_upper_bound = eiters - 1;
7676 if (eiters < vf / ratio)
7677 epilogue = NULL;
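/* Illustrative example with made-up numbers: with current_vector_size == 32,
   16-byte vectors also supported, vf == 8 and eiters == 5 leftover scalar
   iterations, the epilogue bound is set to 4; since ratio == 2 and
   5 >= vf / ratio == 4, the epilogue is kept and can later be vectorized
   with the smaller vector size.  */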
7681 if (epilogue)
7683 epilogue->force_vectorize = loop->force_vectorize;
7684 epilogue->safelen = loop->safelen;
7685 epilogue->dont_vectorize = false;
7687 /* We may need to if-convert the epilogue to be able to vectorize it. */
7688 if (LOOP_VINFO_SCALAR_LOOP (loop_vinfo))
7689 tree_if_conversion (epilogue);
7692 return epilogue;
7695 /* The code below tries to perform a simple optimization - reverting
7696 if-conversion for masked stores: if the mask of a store is zero, do not
7697 perform the store and, if possible, skip the stored-value producers too.
7698 For example,
7699 for (i=0; i<n; i++)
7700 if (c[i])
7702 p1[i] += 1;
7703 p2[i] = p3[i] + 2;
7705 this transformation will produce the following semi-hammock:
7707 if (!mask__ifc__42.18_165 == { 0, 0, 0, 0, 0, 0, 0, 0 })
7709 vect__11.19_170 = MASK_LOAD (vectp_p1.20_168, 0B, mask__ifc__42.18_165);
7710 vect__12.22_172 = vect__11.19_170 + vect_cst__171;
7711 MASK_STORE (vectp_p1.23_175, 0B, mask__ifc__42.18_165, vect__12.22_172);
7712 vect__18.25_182 = MASK_LOAD (vectp_p3.26_180, 0B, mask__ifc__42.18_165);
7713 vect__19.28_184 = vect__18.25_182 + vect_cst__183;
7714 MASK_STORE (vectp_p2.29_187, 0B, mask__ifc__42.18_165, vect__19.28_184);
7718 void
7719 optimize_mask_stores (struct loop *loop)
7721 basic_block *bbs = get_loop_body (loop);
7722 unsigned nbbs = loop->num_nodes;
7723 unsigned i;
7724 basic_block bb;
7725 struct loop *bb_loop;
7726 gimple_stmt_iterator gsi;
7727 gimple *stmt;
7728 auto_vec<gimple *> worklist;
7730 vect_location = find_loop_location (loop);
7731 /* Pick up all masked stores in loop if any. */
7732 for (i = 0; i < nbbs; i++)
7734 bb = bbs[i];
7735 for (gsi = gsi_start_bb (bb); !gsi_end_p (gsi);
7736 gsi_next (&gsi))
7738 stmt = gsi_stmt (gsi);
7739 if (gimple_call_internal_p (stmt, IFN_MASK_STORE))
7740 worklist.safe_push (stmt);
7744 free (bbs);
7745 if (worklist.is_empty ())
7746 return;
7748 /* Loop has masked stores. */
7749 while (!worklist.is_empty ())
7751 gimple *last, *last_store;
7752 edge e, efalse;
7753 tree mask;
7754 basic_block store_bb, join_bb;
7755 gimple_stmt_iterator gsi_to;
7756 tree vdef, new_vdef;
7757 gphi *phi;
7758 tree vectype;
7759 tree zero;
7761 last = worklist.pop ();
7762 mask = gimple_call_arg (last, 2);
7763 bb = gimple_bb (last);
7764 /* Create then_bb and the if-then structure in the CFG; then_bb belongs
7765 to the same loop as if_bb.  It can differ from LOOP when a two-level
7766 loop nest is vectorized and the mask_store belongs to the inner
7767 loop. */
7768 e = split_block (bb, last);
7769 bb_loop = bb->loop_father;
7770 gcc_assert (loop == bb_loop || flow_loop_nested_p (loop, bb_loop));
7771 join_bb = e->dest;
7772 store_bb = create_empty_bb (bb);
7773 add_bb_to_loop (store_bb, bb_loop);
7774 e->flags = EDGE_TRUE_VALUE;
7775 efalse = make_edge (bb, store_bb, EDGE_FALSE_VALUE);
7776 /* Mark the edge into STORE_BB as unlikely. */
7777 efalse->probability = profile_probability::unlikely ();
7778 store_bb->count = efalse->count ();
7779 make_single_succ_edge (store_bb, join_bb, EDGE_FALLTHRU);
7780 if (dom_info_available_p (CDI_DOMINATORS))
7781 set_immediate_dominator (CDI_DOMINATORS, store_bb, bb);
7782 if (dump_enabled_p ())
7783 dump_printf_loc (MSG_NOTE, vect_location,
7784 "Create new block %d to sink mask stores.",
7785 store_bb->index);
7786 /* Create vector comparison with boolean result. */
7787 vectype = TREE_TYPE (mask);
7788 zero = build_zero_cst (vectype);
7789 stmt = gimple_build_cond (EQ_EXPR, mask, zero, NULL_TREE, NULL_TREE);
7790 gsi = gsi_last_bb (bb);
7791 gsi_insert_after (&gsi, stmt, GSI_SAME_STMT);
7792 /* Create a new PHI node for the vdef of the last masked store:
7793 .MEM_2 = VDEF <.MEM_1>
7794 will be converted to
7795 .MEM_3 = VDEF <.MEM_1>
7796 and a new PHI node will be created in the join bb
7797 .MEM_2 = PHI <.MEM_1, .MEM_3>
7799 vdef = gimple_vdef (last);
7800 new_vdef = make_ssa_name (gimple_vop (cfun), last);
7801 gimple_set_vdef (last, new_vdef);
7802 phi = create_phi_node (vdef, join_bb);
7803 add_phi_arg (phi, new_vdef, EDGE_SUCC (store_bb, 0), UNKNOWN_LOCATION);
7805 /* Put all masked stores with the same mask to STORE_BB if possible. */
7806 while (true)
7808 gimple_stmt_iterator gsi_from;
7809 gimple *stmt1 = NULL;
7811 /* Move masked store to STORE_BB. */
7812 last_store = last;
7813 gsi = gsi_for_stmt (last);
7814 gsi_from = gsi;
7815 /* Shift GSI to the previous stmt for further traversal. */
7816 gsi_prev (&gsi);
7817 gsi_to = gsi_start_bb (store_bb);
7818 gsi_move_before (&gsi_from, &gsi_to);
7819 /* Reset GSI_TO to the start of the now non-empty STORE_BB. */
7820 gsi_to = gsi_start_bb (store_bb);
7821 if (dump_enabled_p ())
7823 dump_printf_loc (MSG_NOTE, vect_location,
7824 "Move stmt to created bb\n");
7825 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, last, 0);
7827 /* Move all stored value producers if possible. */
7828 while (!gsi_end_p (gsi))
7830 tree lhs;
7831 imm_use_iterator imm_iter;
7832 use_operand_p use_p;
7833 bool res;
7835 /* Skip debug statements. */
7836 if (is_gimple_debug (gsi_stmt (gsi)))
7838 gsi_prev (&gsi);
7839 continue;
7841 stmt1 = gsi_stmt (gsi);
7842 /* Do not consider statements writing to memory or having a
7843 volatile operand. */
7844 if (gimple_vdef (stmt1)
7845 || gimple_has_volatile_ops (stmt1))
7846 break;
7847 gsi_from = gsi;
7848 gsi_prev (&gsi);
7849 lhs = gimple_get_lhs (stmt1);
7850 if (!lhs)
7851 break;
7853 /* LHS of vectorized stmt must be SSA_NAME. */
7854 if (TREE_CODE (lhs) != SSA_NAME)
7855 break;
7857 if (!VECTOR_TYPE_P (TREE_TYPE (lhs)))
7859 /* Remove dead scalar statement. */
7860 if (has_zero_uses (lhs))
7862 gsi_remove (&gsi_from, true);
7863 continue;
7867 /* Check that LHS does not have uses outside of STORE_BB. */
7868 res = true;
7869 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, lhs)
7871 gimple *use_stmt;
7872 use_stmt = USE_STMT (use_p);
7873 if (is_gimple_debug (use_stmt))
7874 continue;
7875 if (gimple_bb (use_stmt) != store_bb)
7877 res = false;
7878 break;
7881 if (!res)
7882 break;
7884 if (gimple_vuse (stmt1)
7885 && gimple_vuse (stmt1) != gimple_vuse (last_store))
7886 break;
7888 /* Can move STMT1 to STORE_BB. */
7889 if (dump_enabled_p ())
7891 dump_printf_loc (MSG_NOTE, vect_location,
7892 "Move stmt to created bb\n");
7893 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, stmt1, 0);
7895 gsi_move_before (&gsi_from, &gsi_to);
7896 /* Shift GSI_TO for further insertion. */
7897 gsi_prev (&gsi_to);
7899 /* Put other masked stores with the same mask to STORE_BB. */
7900 if (worklist.is_empty ()
7901 || gimple_call_arg (worklist.last (), 2) != mask
7902 || worklist.last () != stmt1)
7903 break;
7904 last = worklist.pop ();
7906 add_phi_arg (phi, gimple_vuse (last_store), e, UNKNOWN_LOCATION);