gcc/tree-vect-loop.c
1 /* Loop Vectorization
2 Copyright (C) 2003-2016 Free Software Foundation, Inc.
3 Contributed by Dorit Naishlos <dorit@il.ibm.com> and
4 Ira Rosen <irar@il.ibm.com>
6 This file is part of GCC.
8 GCC is free software; you can redistribute it and/or modify it under
9 the terms of the GNU General Public License as published by the Free
10 Software Foundation; either version 3, or (at your option) any later
11 version.
13 GCC is distributed in the hope that it will be useful, but WITHOUT ANY
14 WARRANTY; without even the implied warranty of MERCHANTABILITY or
15 FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
16 for more details.
18 You should have received a copy of the GNU General Public License
19 along with GCC; see the file COPYING3. If not see
20 <http://www.gnu.org/licenses/>. */
22 #include "config.h"
23 #include "system.h"
24 #include "coretypes.h"
25 #include "backend.h"
26 #include "target.h"
27 #include "rtl.h"
28 #include "tree.h"
29 #include "gimple.h"
30 #include "cfghooks.h"
31 #include "tree-pass.h"
32 #include "ssa.h"
33 #include "optabs-tree.h"
34 #include "diagnostic-core.h"
35 #include "fold-const.h"
36 #include "stor-layout.h"
37 #include "cfganal.h"
38 #include "gimplify.h"
39 #include "gimple-iterator.h"
40 #include "gimplify-me.h"
41 #include "tree-ssa-loop-ivopts.h"
42 #include "tree-ssa-loop-manip.h"
43 #include "tree-ssa-loop-niter.h"
44 #include "tree-ssa-loop.h"
45 #include "cfgloop.h"
46 #include "params.h"
47 #include "tree-scalar-evolution.h"
48 #include "tree-vectorizer.h"
49 #include "gimple-fold.h"
50 #include "cgraph.h"
51 #include "tree-cfg.h"
53 /* Loop Vectorization Pass.
55 This pass tries to vectorize loops.
57 For example, the vectorizer transforms the following simple loop:
59 short a[N]; short b[N]; short c[N]; int i;
61 for (i=0; i<N; i++){
62        a[i] = b[i] + c[i];
        }
65 as if it was manually vectorized by rewriting the source code into:
67 typedef int __attribute__((mode(V8HI))) v8hi;
68 short a[N]; short b[N]; short c[N]; int i;
69 v8hi *pa = (v8hi*)a, *pb = (v8hi*)b, *pc = (v8hi*)c;
70 v8hi va, vb, vc;
72 for (i=0; i<N/8; i++){
73 vb = pb[i];
74 vc = pc[i];
75 va = vb + vc;
76        pa[i] = va;
        }
79 The main entry to this pass is vectorize_loops(), in which
80 the vectorizer applies a set of analyses on a given set of loops,
81 followed by the actual vectorization transformation for the loops that
82 had successfully passed the analysis phase.
83 Throughout this pass we make a distinction between two types of
84 data: scalars (which are represented by SSA_NAMES), and memory references
85 ("data-refs"). These two types of data require different handling both
86 during analysis and transformation. The types of data-refs that the
87 vectorizer currently supports are ARRAY_REFS whose base is an array DECL
88 (not a pointer), and INDIRECT_REFS through pointers; both array and pointer
89 accesses are required to have a simple (consecutive) access pattern.
91 Analysis phase:
92 ===============
93 The driver for the analysis phase is vect_analyze_loop().
94 It applies a set of analyses, some of which rely on the scalar evolution
95 analyzer (scev) developed by Sebastian Pop.
97 During the analysis phase the vectorizer records some information
98 per stmt in a "stmt_vec_info" struct which is attached to each stmt in the
99 loop, as well as general information about the loop as a whole, which is
100 recorded in a "loop_vec_info" struct attached to each loop.
102 Transformation phase:
103 =====================
104 The loop transformation phase scans all the stmts in the loop, and
105 creates a vector stmt (or a sequence of stmts) for each scalar stmt S in
106 the loop that needs to be vectorized. It inserts the vector code sequence
107 just before the scalar stmt S, and records a pointer to the vector code
108 in STMT_VINFO_VEC_STMT (stmt_info) (stmt_info is the stmt_vec_info struct
109 attached to S). This pointer will be used for the vectorization of following
110 stmts which use the def of stmt S. Stmt S is removed if it writes to memory;
111 otherwise, we rely on dead code elimination for removing it.
113 For example, say stmt S1 was vectorized into stmt VS1:
115 VS1: vb = px[i];
116 S1: b = x[i]; STMT_VINFO_VEC_STMT (stmt_info (S1)) = VS1
117 S2: a = b;
119 To vectorize stmt S2, the vectorizer first finds the stmt that defines
120 the operand 'b' (S1), and gets the relevant vector def 'vb' from the
121 vector stmt VS1 pointed to by STMT_VINFO_VEC_STMT (stmt_info (S1)). The
122 resulting sequence would be:
124 VS1: vb = px[i];
125 S1: b = x[i]; STMT_VINFO_VEC_STMT (stmt_info (S1)) = VS1
126 VS2: va = vb;
127 S2: a = b; STMT_VINFO_VEC_STMT (stmt_info (S2)) = VS2
129 Operands that are not SSA_NAMEs, are data-refs that appear in
130 load/store operations (like 'x[i]' in S1), and are handled differently.
132 Target modeling:
133 =================
134 Currently the only target specific information that is used is the
135 size of the vector (in bytes) - "TARGET_VECTORIZE_UNITS_PER_SIMD_WORD".
136 Targets that can support different sizes of vectors will, for now, need
137 to specify one value for "TARGET_VECTORIZE_UNITS_PER_SIMD_WORD". More
138 flexibility will be added in the future.
140 Since we only vectorize operations whose vector form can be
141 expressed using existing tree codes, to verify that an operation is
142 supported, the vectorizer checks the relevant optab at the relevant
143 machine_mode (e.g., optab_handler (add_optab, V8HImode)). If
144 the value found is CODE_FOR_nothing, then there's no target support, and
145 we can't vectorize the stmt.
147 For additional information on this project see:
148 http://gcc.gnu.org/projects/tree-ssa/vectorization.html
*/
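/* Roughly, the driver vectorize_loops () (defined in tree-vectorizer.c)
   does the following for each candidate loop:
     1. call vect_analyze_loop () to build and fill in a loop_vec_info
        (the analysis phase described above);
     2. if the analysis succeeded, call vect_transform_loop () to emit
        the vector code (the transformation phase).  */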
151 static void vect_estimate_min_profitable_iters (loop_vec_info, int *, int *);
153 /* Function vect_determine_vectorization_factor
155 Determine the vectorization factor (VF). VF is the number of data elements
156 that are operated upon in parallel in a single iteration of the vectorized
157 loop. For example, when vectorizing a loop that operates on 4-byte elements,
158 on a target with a vector size (VS) of 16 bytes, the VF is set to 4, since 4
159 elements can fit in a single vector register.
161 We currently support vectorization of loops in which all types operated upon
162 are of the same size. Therefore this function currently sets VF according to
163 the size of the types operated upon, and fails if there are multiple sizes
164 in the loop.
166 VF is also the factor by which the loop iterations are strip-mined, e.g.:
167 original loop:
168 for (i=0; i<N; i++){
169          a[i] = b[i] + c[i];
           }
172 vectorized loop:
173 for (i=0; i<N; i+=VF){
174          a[i:VF] = b[i:VF] + c[i:VF];
           }
*/
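/* When N is not a multiple of VF, the remaining N % VF iterations are
   executed by a scalar epilogue loop emitted after the vectorized loop,
   conceptually:

     vectorized loop:
        for (i=0; i+VF<=N; i+=VF){
          a[i:VF] = b[i:VF] + c[i:VF];
        }
     scalar epilogue:
        for (; i<N; i++){
          a[i] = b[i] + c[i];
        }

   The peeling/epilogue generation itself lives in tree-vect-loop-manip.c.  */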
178 static bool
179 vect_determine_vectorization_factor (loop_vec_info loop_vinfo)
181 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
182 basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
183 unsigned nbbs = loop->num_nodes;
184 unsigned int vectorization_factor = 0;
185 tree scalar_type;
186 gphi *phi;
187 tree vectype;
188 unsigned int nunits;
189 stmt_vec_info stmt_info;
190 unsigned i;
191 HOST_WIDE_INT dummy;
192 gimple *stmt, *pattern_stmt = NULL;
193 gimple_seq pattern_def_seq = NULL;
194 gimple_stmt_iterator pattern_def_si = gsi_none ();
195 bool analyze_pattern_stmt = false;
196 bool bool_result;
197 auto_vec<stmt_vec_info> mask_producers;
199 if (dump_enabled_p ())
200 dump_printf_loc (MSG_NOTE, vect_location,
201 "=== vect_determine_vectorization_factor ===\n");
203 for (i = 0; i < nbbs; i++)
205 basic_block bb = bbs[i];
207 for (gphi_iterator si = gsi_start_phis (bb); !gsi_end_p (si);
208 gsi_next (&si))
210 phi = si.phi ();
211 stmt_info = vinfo_for_stmt (phi);
212 if (dump_enabled_p ())
214 dump_printf_loc (MSG_NOTE, vect_location, "==> examining phi: ");
215 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, phi, 0);
218 gcc_assert (stmt_info);
220 if (STMT_VINFO_RELEVANT_P (stmt_info)
221 || STMT_VINFO_LIVE_P (stmt_info))
223 gcc_assert (!STMT_VINFO_VECTYPE (stmt_info));
224 scalar_type = TREE_TYPE (PHI_RESULT (phi));
226 if (dump_enabled_p ())
228 dump_printf_loc (MSG_NOTE, vect_location,
229 "get vectype for scalar type: ");
230 dump_generic_expr (MSG_NOTE, TDF_SLIM, scalar_type);
231 dump_printf (MSG_NOTE, "\n");
234 vectype = get_vectype_for_scalar_type (scalar_type);
235 if (!vectype)
237 if (dump_enabled_p ())
239 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
240 "not vectorized: unsupported "
241 "data-type ");
242 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
243 scalar_type);
244 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
246 return false;
248 STMT_VINFO_VECTYPE (stmt_info) = vectype;
250 if (dump_enabled_p ())
252 dump_printf_loc (MSG_NOTE, vect_location, "vectype: ");
253 dump_generic_expr (MSG_NOTE, TDF_SLIM, vectype);
254 dump_printf (MSG_NOTE, "\n");
257 nunits = TYPE_VECTOR_SUBPARTS (vectype);
258 if (dump_enabled_p ())
259 dump_printf_loc (MSG_NOTE, vect_location, "nunits = %d\n",
260 nunits);
262 if (!vectorization_factor
263 || (nunits > vectorization_factor))
264 vectorization_factor = nunits;
268 for (gimple_stmt_iterator si = gsi_start_bb (bb);
269 !gsi_end_p (si) || analyze_pattern_stmt;)
271 tree vf_vectype;
273 if (analyze_pattern_stmt)
274 stmt = pattern_stmt;
275 else
276 stmt = gsi_stmt (si);
278 stmt_info = vinfo_for_stmt (stmt);
280 if (dump_enabled_p ())
282 dump_printf_loc (MSG_NOTE, vect_location,
283 "==> examining statement: ");
284 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, stmt, 0);
287 gcc_assert (stmt_info);
289 /* Skip stmts which do not need to be vectorized. */
290 if ((!STMT_VINFO_RELEVANT_P (stmt_info)
291 && !STMT_VINFO_LIVE_P (stmt_info))
292 || gimple_clobber_p (stmt))
294 if (STMT_VINFO_IN_PATTERN_P (stmt_info)
295 && (pattern_stmt = STMT_VINFO_RELATED_STMT (stmt_info))
296 && (STMT_VINFO_RELEVANT_P (vinfo_for_stmt (pattern_stmt))
297 || STMT_VINFO_LIVE_P (vinfo_for_stmt (pattern_stmt))))
299 stmt = pattern_stmt;
300 stmt_info = vinfo_for_stmt (pattern_stmt);
301 if (dump_enabled_p ())
303 dump_printf_loc (MSG_NOTE, vect_location,
304 "==> examining pattern statement: ");
305 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, stmt, 0);
308 else
310 if (dump_enabled_p ())
311 dump_printf_loc (MSG_NOTE, vect_location, "skip.\n");
312 gsi_next (&si);
313 continue;
316 else if (STMT_VINFO_IN_PATTERN_P (stmt_info)
317 && (pattern_stmt = STMT_VINFO_RELATED_STMT (stmt_info))
318 && (STMT_VINFO_RELEVANT_P (vinfo_for_stmt (pattern_stmt))
319 || STMT_VINFO_LIVE_P (vinfo_for_stmt (pattern_stmt))))
320 analyze_pattern_stmt = true;
322 /* If a pattern statement has def stmts, analyze them too. */
323 if (is_pattern_stmt_p (stmt_info))
325 if (pattern_def_seq == NULL)
327 pattern_def_seq = STMT_VINFO_PATTERN_DEF_SEQ (stmt_info);
328 pattern_def_si = gsi_start (pattern_def_seq);
330 else if (!gsi_end_p (pattern_def_si))
331 gsi_next (&pattern_def_si);
332 if (pattern_def_seq != NULL)
334 gimple *pattern_def_stmt = NULL;
335 stmt_vec_info pattern_def_stmt_info = NULL;
337 while (!gsi_end_p (pattern_def_si))
339 pattern_def_stmt = gsi_stmt (pattern_def_si);
340 pattern_def_stmt_info
341 = vinfo_for_stmt (pattern_def_stmt);
342 if (STMT_VINFO_RELEVANT_P (pattern_def_stmt_info)
343 || STMT_VINFO_LIVE_P (pattern_def_stmt_info))
344 break;
345 gsi_next (&pattern_def_si);
348 if (!gsi_end_p (pattern_def_si))
350 if (dump_enabled_p ())
352 dump_printf_loc (MSG_NOTE, vect_location,
353 "==> examining pattern def stmt: ");
354 dump_gimple_stmt (MSG_NOTE, TDF_SLIM,
355 pattern_def_stmt, 0);
358 stmt = pattern_def_stmt;
359 stmt_info = pattern_def_stmt_info;
361 else
363 pattern_def_si = gsi_none ();
364 analyze_pattern_stmt = false;
367 else
368 analyze_pattern_stmt = false;
371 if (gimple_get_lhs (stmt) == NULL_TREE
372 /* MASK_STORE has no lhs, but is ok. */
373 && (!is_gimple_call (stmt)
374 || !gimple_call_internal_p (stmt)
375 || gimple_call_internal_fn (stmt) != IFN_MASK_STORE))
377 if (is_gimple_call (stmt))
379 /* Ignore calls with no lhs. These must be calls to
380 #pragma omp simd functions, and what vectorization factor
381 it really needs can't be determined until
382 vectorizable_simd_clone_call. */
383 if (!analyze_pattern_stmt && gsi_end_p (pattern_def_si))
385 pattern_def_seq = NULL;
386 gsi_next (&si);
388 continue;
390 if (dump_enabled_p ())
392 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
393 "not vectorized: irregular stmt.");
394 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt,
397 return false;
400 if (VECTOR_MODE_P (TYPE_MODE (gimple_expr_type (stmt))))
402 if (dump_enabled_p ())
404 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
405 "not vectorized: vector stmt in loop:");
406 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
408 return false;
411 bool_result = false;
413 if (STMT_VINFO_VECTYPE (stmt_info))
415 /* The only case when a vectype had been already set is for stmts
416 that contain a dataref, or for "pattern-stmts" (stmts
417 generated by the vectorizer to represent/replace a certain
418 idiom). */
419 gcc_assert (STMT_VINFO_DATA_REF (stmt_info)
420 || is_pattern_stmt_p (stmt_info)
421 || !gsi_end_p (pattern_def_si));
422 vectype = STMT_VINFO_VECTYPE (stmt_info);
424 else
426 gcc_assert (!STMT_VINFO_DATA_REF (stmt_info));
427 if (is_gimple_call (stmt)
428 && gimple_call_internal_p (stmt)
429 && gimple_call_internal_fn (stmt) == IFN_MASK_STORE)
430 scalar_type = TREE_TYPE (gimple_call_arg (stmt, 3));
431 else
432 scalar_type = TREE_TYPE (gimple_get_lhs (stmt));
434 /* Bool ops don't participate in vectorization factor
435 computation.  For comparisons, use the compared types to
436 compute a factor. */
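              /* For example, in "flag = a < b" with int A and B, the boolean
                 FLAG itself does not constrain the VF; the compared int type
                 does, which is why the tcc_comparison case below switches
                 SCALAR_TYPE to the type of the first comparison operand.  */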
437 if (TREE_CODE (scalar_type) == BOOLEAN_TYPE
438 && is_gimple_assign (stmt)
439 && gimple_assign_rhs_code (stmt) != COND_EXPR)
441 if (STMT_VINFO_RELEVANT_P (stmt_info)
442 || STMT_VINFO_LIVE_P (stmt_info))
443 mask_producers.safe_push (stmt_info);
444 bool_result = true;
446 if (gimple_code (stmt) == GIMPLE_ASSIGN
447 && TREE_CODE_CLASS (gimple_assign_rhs_code (stmt))
448 == tcc_comparison
449 && TREE_CODE (TREE_TYPE (gimple_assign_rhs1 (stmt)))
450 != BOOLEAN_TYPE)
451 scalar_type = TREE_TYPE (gimple_assign_rhs1 (stmt));
452 else
454 if (!analyze_pattern_stmt && gsi_end_p (pattern_def_si))
456 pattern_def_seq = NULL;
457 gsi_next (&si);
459 continue;
463 if (dump_enabled_p ())
465 dump_printf_loc (MSG_NOTE, vect_location,
466 "get vectype for scalar type: ");
467 dump_generic_expr (MSG_NOTE, TDF_SLIM, scalar_type);
468 dump_printf (MSG_NOTE, "\n");
470 vectype = get_vectype_for_scalar_type (scalar_type);
471 if (!vectype)
473 if (dump_enabled_p ())
475 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
476 "not vectorized: unsupported "
477 "data-type ");
478 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
479 scalar_type);
480 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
482 return false;
485 if (!bool_result)
486 STMT_VINFO_VECTYPE (stmt_info) = vectype;
488 if (dump_enabled_p ())
490 dump_printf_loc (MSG_NOTE, vect_location, "vectype: ");
491 dump_generic_expr (MSG_NOTE, TDF_SLIM, vectype);
492 dump_printf (MSG_NOTE, "\n");
496 /* Don't try to compute the VF out of scalar types if the stmt
497 produces a boolean vector.  Use the result vectype instead. */
498 if (VECTOR_BOOLEAN_TYPE_P (vectype))
499 vf_vectype = vectype;
500 else
502 /* The vectorization factor is according to the smallest
503 scalar type (or the largest vector size, but we only
504 support one vector size per loop). */
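              /* For example, for a widening stmt such as
                 "int_x = (int) short_y" the smallest scalar type is 'short',
                 so with 16-byte vectors this stmt allows 8 elements per
                 vector and raises the loop VF to 8 if it was smaller.  */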
505 if (!bool_result)
506 scalar_type = vect_get_smallest_scalar_type (stmt, &dummy,
507 &dummy);
508 if (dump_enabled_p ())
510 dump_printf_loc (MSG_NOTE, vect_location,
511 "get vectype for scalar type: ");
512 dump_generic_expr (MSG_NOTE, TDF_SLIM, scalar_type);
513 dump_printf (MSG_NOTE, "\n");
515 vf_vectype = get_vectype_for_scalar_type (scalar_type);
517 if (!vf_vectype)
519 if (dump_enabled_p ())
521 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
522 "not vectorized: unsupported data-type ");
523 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
524 scalar_type);
525 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
527 return false;
530 if ((GET_MODE_SIZE (TYPE_MODE (vectype))
531 != GET_MODE_SIZE (TYPE_MODE (vf_vectype))))
533 if (dump_enabled_p ())
535 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
536 "not vectorized: different sized vector "
537 "types in statement, ");
538 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
539 vectype);
540 dump_printf (MSG_MISSED_OPTIMIZATION, " and ");
541 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
542 vf_vectype);
543 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
545 return false;
548 if (dump_enabled_p ())
550 dump_printf_loc (MSG_NOTE, vect_location, "vectype: ");
551 dump_generic_expr (MSG_NOTE, TDF_SLIM, vf_vectype);
552 dump_printf (MSG_NOTE, "\n");
555 nunits = TYPE_VECTOR_SUBPARTS (vf_vectype);
556 if (dump_enabled_p ())
557 dump_printf_loc (MSG_NOTE, vect_location, "nunits = %d\n", nunits);
558 if (!vectorization_factor
559 || (nunits > vectorization_factor))
560 vectorization_factor = nunits;
562 if (!analyze_pattern_stmt && gsi_end_p (pattern_def_si))
564 pattern_def_seq = NULL;
565 gsi_next (&si);
570 /* TODO: Analyze cost. Decide if worth while to vectorize. */
571 if (dump_enabled_p ())
572 dump_printf_loc (MSG_NOTE, vect_location, "vectorization factor = %d\n",
573 vectorization_factor);
574 if (vectorization_factor <= 1)
576 if (dump_enabled_p ())
577 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
578 "not vectorized: unsupported data-type\n");
579 return false;
581 LOOP_VINFO_VECT_FACTOR (loop_vinfo) = vectorization_factor;
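  /* Now compute vector mask types for the boolean-producing stmts queued in
     MASK_PRODUCERS above.  These were skipped in the VF computation:
     a comparison derives its mask type from the compared scalar type, while
     other bool operations inherit the mask type of their vectorized
     operands.  */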
583 for (i = 0; i < mask_producers.length (); i++)
585 tree mask_type = NULL;
587 stmt = STMT_VINFO_STMT (mask_producers[i]);
589 if (gimple_code (stmt) == GIMPLE_ASSIGN
590 && TREE_CODE_CLASS (gimple_assign_rhs_code (stmt)) == tcc_comparison
591 && TREE_CODE (TREE_TYPE (gimple_assign_rhs1 (stmt))) != BOOLEAN_TYPE)
593 scalar_type = TREE_TYPE (gimple_assign_rhs1 (stmt));
594 mask_type = get_mask_type_for_scalar_type (scalar_type);
596 if (!mask_type)
598 if (dump_enabled_p ())
599 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
600 "not vectorized: unsupported mask\n");
601 return false;
604 else
606 tree rhs;
607 ssa_op_iter iter;
608 gimple *def_stmt;
609 enum vect_def_type dt;
611 FOR_EACH_SSA_TREE_OPERAND (rhs, stmt, iter, SSA_OP_USE)
613 if (!vect_is_simple_use (rhs, mask_producers[i]->vinfo,
614 &def_stmt, &dt, &vectype))
616 if (dump_enabled_p ())
618 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
619 "not vectorized: can't compute mask type "
620 "for statement, ");
621 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt,
624 return false;
627 /* No vectype probably means external definition.
628 Allow it in case there is another operand from which
629 the mask type can be determined. */
630 if (!vectype)
631 continue;
633 if (!mask_type)
634 mask_type = vectype;
635 else if (TYPE_VECTOR_SUBPARTS (mask_type)
636 != TYPE_VECTOR_SUBPARTS (vectype))
638 if (dump_enabled_p ())
640 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
641 "not vectorized: different sized masks "
642 "types in statement, ");
643 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
644 mask_type);
645 dump_printf (MSG_MISSED_OPTIMIZATION, " and ");
646 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
647 vectype);
648 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
650 return false;
652 else if (VECTOR_BOOLEAN_TYPE_P (mask_type)
653 != VECTOR_BOOLEAN_TYPE_P (vectype))
655 if (dump_enabled_p ())
657 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
658 "not vectorized: mixed mask and "
659 "nonmask vector types in statement, ");
660 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
661 mask_type);
662 dump_printf (MSG_MISSED_OPTIMIZATION, " and ");
663 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
664 vectype);
665 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
667 return false;
671 /* We may compare boolean value loaded as vector of integers.
672 Fix mask_type in such case. */
673 if (mask_type
674 && !VECTOR_BOOLEAN_TYPE_P (mask_type)
675 && gimple_code (stmt) == GIMPLE_ASSIGN
676 && TREE_CODE_CLASS (gimple_assign_rhs_code (stmt)) == tcc_comparison)
677 mask_type = build_same_sized_truth_vector_type (mask_type);
680 /* No mask_type should mean loop invariant predicate.
681 This is probably a subject for optimization in
682 if-conversion. */
683 if (!mask_type)
685 if (dump_enabled_p ())
687 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
688 "not vectorized: can't compute mask type "
689 "for statement, ");
690 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt,
693 return false;
696 STMT_VINFO_VECTYPE (mask_producers[i]) = mask_type;
699 return true;
703 /* Function vect_is_simple_iv_evolution.
705 FORNOW: A simple evolution of an induction variable in the loop is
706 considered a polynomial evolution. */
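/* For example, scev represents such a simple evolution as a first-degree
   chrec {init, +, step}_loopnum, e.g. {0, +, 4}_1 for an IV that starts at
   0 and is incremented by 4 on every iteration of loop 1.  An evolution
   whose step is itself a chrec (degree >= 2) is rejected below.  */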
708 static bool
709 vect_is_simple_iv_evolution (unsigned loop_nb, tree access_fn, tree * init,
710 tree * step)
712 tree init_expr;
713 tree step_expr;
714 tree evolution_part = evolution_part_in_loop_num (access_fn, loop_nb);
715 basic_block bb;
717 /* When there is no evolution in this loop, the evolution function
718 is not "simple". */
719 if (evolution_part == NULL_TREE)
720 return false;
722 /* When the evolution is a polynomial of degree >= 2
723 the evolution function is not "simple". */
724 if (tree_is_chrec (evolution_part))
725 return false;
727 step_expr = evolution_part;
728 init_expr = unshare_expr (initial_condition_in_loop_num (access_fn, loop_nb));
730 if (dump_enabled_p ())
732 dump_printf_loc (MSG_NOTE, vect_location, "step: ");
733 dump_generic_expr (MSG_NOTE, TDF_SLIM, step_expr);
734 dump_printf (MSG_NOTE, ", init: ");
735 dump_generic_expr (MSG_NOTE, TDF_SLIM, init_expr);
736 dump_printf (MSG_NOTE, "\n");
739 *init = init_expr;
740 *step = step_expr;
742 if (TREE_CODE (step_expr) != INTEGER_CST
743 && (TREE_CODE (step_expr) != SSA_NAME
744 || ((bb = gimple_bb (SSA_NAME_DEF_STMT (step_expr)))
745 && flow_bb_inside_loop_p (get_loop (cfun, loop_nb), bb))
746 || (!INTEGRAL_TYPE_P (TREE_TYPE (step_expr))
747 && (!SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr))
748 || !flag_associative_math)))
749 && (TREE_CODE (step_expr) != REAL_CST
750 || !flag_associative_math))
752 if (dump_enabled_p ())
753 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
754 "step unknown.\n");
755 return false;
758 return true;
761 /* Function vect_analyze_scalar_cycles_1.
763 Examine the cross iteration def-use cycles of scalar variables
764 in LOOP. LOOP_VINFO represents the loop that is now being
765 considered for vectorization (can be LOOP, or an outer-loop
766 enclosing LOOP). */
768 static void
769 vect_analyze_scalar_cycles_1 (loop_vec_info loop_vinfo, struct loop *loop)
771 basic_block bb = loop->header;
772 tree init, step;
773 auto_vec<gimple *, 64> worklist;
774 gphi_iterator gsi;
775 bool double_reduc;
777 if (dump_enabled_p ())
778 dump_printf_loc (MSG_NOTE, vect_location,
779 "=== vect_analyze_scalar_cycles ===\n");
781 /* First - identify all inductions. Reduction detection assumes that all the
782 inductions have been identified, therefore, this order must not be
783 changed. */
784 for (gsi = gsi_start_phis (bb); !gsi_end_p (gsi); gsi_next (&gsi))
786 gphi *phi = gsi.phi ();
787 tree access_fn = NULL;
788 tree def = PHI_RESULT (phi);
789 stmt_vec_info stmt_vinfo = vinfo_for_stmt (phi);
791 if (dump_enabled_p ())
793 dump_printf_loc (MSG_NOTE, vect_location, "Analyze phi: ");
794 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, phi, 0);
797 /* Skip virtual phi's. The data dependences that are associated with
798 virtual defs/uses (i.e., memory accesses) are analyzed elsewhere. */
799 if (virtual_operand_p (def))
800 continue;
802 STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_unknown_def_type;
804 /* Analyze the evolution function. */
805 access_fn = analyze_scalar_evolution (loop, def);
806 if (access_fn)
808 STRIP_NOPS (access_fn);
809 if (dump_enabled_p ())
811 dump_printf_loc (MSG_NOTE, vect_location,
812 "Access function of PHI: ");
813 dump_generic_expr (MSG_NOTE, TDF_SLIM, access_fn);
814 dump_printf (MSG_NOTE, "\n");
816 STMT_VINFO_LOOP_PHI_EVOLUTION_BASE_UNCHANGED (stmt_vinfo)
817 = initial_condition_in_loop_num (access_fn, loop->num);
818 STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_vinfo)
819 = evolution_part_in_loop_num (access_fn, loop->num);
822 if (!access_fn
823 || !vect_is_simple_iv_evolution (loop->num, access_fn, &init, &step)
824 || (LOOP_VINFO_LOOP (loop_vinfo) != loop
825 && TREE_CODE (step) != INTEGER_CST))
827 worklist.safe_push (phi);
828 continue;
831 gcc_assert (STMT_VINFO_LOOP_PHI_EVOLUTION_BASE_UNCHANGED (stmt_vinfo)
832 != NULL_TREE);
833 gcc_assert (STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_vinfo) != NULL_TREE);
835 if (dump_enabled_p ())
836 dump_printf_loc (MSG_NOTE, vect_location, "Detected induction.\n");
837 STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_induction_def;
841 /* Second - identify all reductions and nested cycles. */
842 while (worklist.length () > 0)
844 gimple *phi = worklist.pop ();
845 tree def = PHI_RESULT (phi);
846 stmt_vec_info stmt_vinfo = vinfo_for_stmt (phi);
847 gimple *reduc_stmt;
848 bool nested_cycle;
850 if (dump_enabled_p ())
852 dump_printf_loc (MSG_NOTE, vect_location, "Analyze phi: ");
853 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, phi, 0);
856 gcc_assert (!virtual_operand_p (def)
857 && STMT_VINFO_DEF_TYPE (stmt_vinfo) == vect_unknown_def_type);
859 nested_cycle = (loop != LOOP_VINFO_LOOP (loop_vinfo));
860 reduc_stmt = vect_force_simple_reduction (loop_vinfo, phi, !nested_cycle,
861 &double_reduc, false);
862 if (reduc_stmt)
864 if (double_reduc)
866 if (dump_enabled_p ())
867 dump_printf_loc (MSG_NOTE, vect_location,
868 "Detected double reduction.\n");
870 STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_double_reduction_def;
871 STMT_VINFO_DEF_TYPE (vinfo_for_stmt (reduc_stmt)) =
872 vect_double_reduction_def;
874 else
876 if (nested_cycle)
878 if (dump_enabled_p ())
879 dump_printf_loc (MSG_NOTE, vect_location,
880 "Detected vectorizable nested cycle.\n");
882 STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_nested_cycle;
883 STMT_VINFO_DEF_TYPE (vinfo_for_stmt (reduc_stmt)) =
884 vect_nested_cycle;
886 else
888 if (dump_enabled_p ())
889 dump_printf_loc (MSG_NOTE, vect_location,
890 "Detected reduction.\n");
892 STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_reduction_def;
893 STMT_VINFO_DEF_TYPE (vinfo_for_stmt (reduc_stmt)) =
894 vect_reduction_def;
895 /* Store the reduction cycles for possible vectorization in
896 loop-aware SLP. */
897 LOOP_VINFO_REDUCTIONS (loop_vinfo).safe_push (reduc_stmt);
901 else
902 if (dump_enabled_p ())
903 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
904 "Unknown def-use cycle pattern.\n");
909 /* Function vect_analyze_scalar_cycles.
911 Examine the cross iteration def-use cycles of scalar variables, by
912 analyzing the loop-header PHIs of scalar variables. Classify each
913 cycle as one of the following: invariant, induction, reduction, unknown.
914 We do that for the loop represented by LOOP_VINFO, and also for its
915 inner-loop, if it exists.
916 Examples for scalar cycles:
918 Example1: reduction:
920 loop1:
921 for (i=0; i<N; i++)
922 sum += a[i];
924 Example2: induction:
926 loop2:
927 for (i=0; i<N; i++)
928 a[i] = i; */
930 static void
931 vect_analyze_scalar_cycles (loop_vec_info loop_vinfo)
933 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
935 vect_analyze_scalar_cycles_1 (loop_vinfo, loop);
937 /* When vectorizing an outer-loop, the inner-loop is executed sequentially.
938 Reductions in such inner-loop therefore have different properties than
939 the reductions in the nest that gets vectorized:
940 1. When vectorized, they are executed in the same order as in the original
941 scalar loop, so we can't change the order of computation when
942 vectorizing them.
943 2. FIXME: Inner-loop reductions can be used in the inner-loop, so the
944 current checks are too strict. */
946 if (loop->inner)
947 vect_analyze_scalar_cycles_1 (loop_vinfo, loop->inner);
950 /* Transfer group and reduction information from STMT to its pattern stmt. */
952 static void
953 vect_fixup_reduc_chain (gimple *stmt)
955 gimple *firstp = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (stmt));
956 gimple *stmtp;
957 gcc_assert (!GROUP_FIRST_ELEMENT (vinfo_for_stmt (firstp))
958 && GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)));
959 GROUP_SIZE (vinfo_for_stmt (firstp)) = GROUP_SIZE (vinfo_for_stmt (stmt));
962 stmtp = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (stmt));
963 GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmtp)) = firstp;
964 stmt = GROUP_NEXT_ELEMENT (vinfo_for_stmt (stmt));
965 if (stmt)
966 GROUP_NEXT_ELEMENT (vinfo_for_stmt (stmtp))
967 = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (stmt));
969 while (stmt);
970 STMT_VINFO_DEF_TYPE (vinfo_for_stmt (stmtp)) = vect_reduction_def;
973 /* Fixup scalar cycles that now have their stmts detected as patterns. */
975 static void
976 vect_fixup_scalar_cycles_with_patterns (loop_vec_info loop_vinfo)
978 gimple *first;
979 unsigned i;
981 FOR_EACH_VEC_ELT (LOOP_VINFO_REDUCTION_CHAINS (loop_vinfo), i, first)
982 if (STMT_VINFO_IN_PATTERN_P (vinfo_for_stmt (first)))
984 gimple *next = GROUP_NEXT_ELEMENT (vinfo_for_stmt (first));
985 while (next)
987 if (! STMT_VINFO_IN_PATTERN_P (vinfo_for_stmt (next)))
988 break;
989 next = GROUP_NEXT_ELEMENT (vinfo_for_stmt (next));
991 /* If not all stmts in the chain are patterns, try to handle
992 the chain without patterns. */
993 if (! next)
995 vect_fixup_reduc_chain (first);
996 LOOP_VINFO_REDUCTION_CHAINS (loop_vinfo)[i]
997 = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (first));
1002 /* Function vect_get_loop_niters.
1004 Determine how many iterations the loop is executed and place it
1005 in NUMBER_OF_ITERATIONS. Place the number of latch iterations
1006 in NUMBER_OF_ITERATIONSM1. Place the condition under which the
1007 niter information holds in ASSUMPTIONS.
1009 Return the loop exit condition. */
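/* For example, for a loop whose body executes N times (N > 0) the latch is
   executed N - 1 times, so NUMBER_OF_ITERATIONSM1 is N - 1 and
   NUMBER_OF_ITERATIONS is N.  If the niter analysis only holds under extra
   conditions (e.g. that an IV does not wrap), those conditions are returned
   in ASSUMPTIONS so that the caller can version the loop on them.  */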
1012 static gcond *
1013 vect_get_loop_niters (struct loop *loop, tree *assumptions,
1014 tree *number_of_iterations, tree *number_of_iterationsm1)
1016 edge exit = single_exit (loop);
1017 struct tree_niter_desc niter_desc;
1018 tree niter_assumptions, niter, may_be_zero;
1019 gcond *cond = get_loop_exit_condition (loop);
1021 *assumptions = boolean_true_node;
1022 *number_of_iterationsm1 = chrec_dont_know;
1023 *number_of_iterations = chrec_dont_know;
1024 if (dump_enabled_p ())
1025 dump_printf_loc (MSG_NOTE, vect_location,
1026 "=== get_loop_niters ===\n");
1028 if (!exit)
1029 return cond;
1031 niter = chrec_dont_know;
1032 may_be_zero = NULL_TREE;
1033 niter_assumptions = boolean_true_node;
1034 if (!number_of_iterations_exit_assumptions (loop, exit, &niter_desc, NULL)
1035 || chrec_contains_undetermined (niter_desc.niter))
1036 return cond;
1038 niter_assumptions = niter_desc.assumptions;
1039 may_be_zero = niter_desc.may_be_zero;
1040 niter = niter_desc.niter;
1042 if (may_be_zero && integer_zerop (may_be_zero))
1043 may_be_zero = NULL_TREE;
1045 if (may_be_zero)
1047 if (COMPARISON_CLASS_P (may_be_zero))
1049 /* Try to combine may_be_zero with assumptions, this can simplify
1050 computation of niter expression. */
1051 if (niter_assumptions && !integer_nonzerop (niter_assumptions))
1052 niter_assumptions = fold_build2 (TRUTH_AND_EXPR, boolean_type_node,
1053 niter_assumptions,
1054 fold_build1 (TRUTH_NOT_EXPR,
1055 boolean_type_node,
1056 may_be_zero));
1057 else
1058 niter = fold_build3 (COND_EXPR, TREE_TYPE (niter), may_be_zero,
1059 build_int_cst (TREE_TYPE (niter), 0), niter);
1061 may_be_zero = NULL_TREE;
1063 else if (integer_nonzerop (may_be_zero))
1065 *number_of_iterationsm1 = build_int_cst (TREE_TYPE (niter), 0);
1066 *number_of_iterations = build_int_cst (TREE_TYPE (niter), 1);
1067 return cond;
1069 else
1070 return cond;
1073 *assumptions = niter_assumptions;
1074 *number_of_iterationsm1 = niter;
1076 /* We want the number of loop header executions which is the number
1077 of latch executions plus one.
1078 ??? For UINT_MAX latch executions this number overflows to zero
1079 for loops like do { n++; } while (n != 0); */
1080 if (niter && !chrec_contains_undetermined (niter))
1081 niter = fold_build2 (PLUS_EXPR, TREE_TYPE (niter), unshare_expr (niter),
1082 build_int_cst (TREE_TYPE (niter), 1));
1083 *number_of_iterations = niter;
1085 return cond;
1088 /* Function bb_in_loop_p
1090 Used as predicate for dfs order traversal of the loop bbs. */
1092 static bool
1093 bb_in_loop_p (const_basic_block bb, const void *data)
1095 const struct loop *const loop = (const struct loop *)data;
1096 if (flow_bb_inside_loop_p (loop, bb))
1097 return true;
1098 return false;
1102 /* Function new_loop_vec_info.
1104 Create and initialize a new loop_vec_info struct for LOOP, as well as
1105 stmt_vec_info structs for all the stmts in LOOP. */
1107 static loop_vec_info
1108 new_loop_vec_info (struct loop *loop)
1110 loop_vec_info res;
1111 basic_block *bbs;
1112 gimple_stmt_iterator si;
1113 unsigned int i, nbbs;
1115 res = (loop_vec_info) xcalloc (1, sizeof (struct _loop_vec_info));
1116 res->kind = vec_info::loop;
1117 LOOP_VINFO_LOOP (res) = loop;
1119 bbs = get_loop_body (loop);
1121 /* Create/Update stmt_info for all stmts in the loop. */
1122 for (i = 0; i < loop->num_nodes; i++)
1124 basic_block bb = bbs[i];
1126 for (si = gsi_start_phis (bb); !gsi_end_p (si); gsi_next (&si))
1128 gimple *phi = gsi_stmt (si);
1129 gimple_set_uid (phi, 0);
1130 set_vinfo_for_stmt (phi, new_stmt_vec_info (phi, res));
1133 for (si = gsi_start_bb (bb); !gsi_end_p (si); gsi_next (&si))
1135 gimple *stmt = gsi_stmt (si);
1136 gimple_set_uid (stmt, 0);
1137 set_vinfo_for_stmt (stmt, new_stmt_vec_info (stmt, res));
1141 /* CHECKME: We want to visit all BBs before their successors (except for
1142 latch blocks, for which this assertion wouldn't hold). In the simple
1143 case of the loop forms we allow, a dfs order of the BBs would be the same
1144 as reversed postorder traversal, so we are safe. */
1146 free (bbs);
1147 bbs = XCNEWVEC (basic_block, loop->num_nodes);
1148 nbbs = dfs_enumerate_from (loop->header, 0, bb_in_loop_p,
1149 bbs, loop->num_nodes, loop);
1150 gcc_assert (nbbs == loop->num_nodes);
1152 LOOP_VINFO_BBS (res) = bbs;
1153 LOOP_VINFO_NITERSM1 (res) = NULL;
1154 LOOP_VINFO_NITERS (res) = NULL;
1155 LOOP_VINFO_NITERS_UNCHANGED (res) = NULL;
1156 LOOP_VINFO_NITERS_ASSUMPTIONS (res) = NULL;
1157 LOOP_VINFO_COST_MODEL_THRESHOLD (res) = 0;
1158 LOOP_VINFO_VECTORIZABLE_P (res) = 0;
1159 LOOP_VINFO_PEELING_FOR_ALIGNMENT (res) = 0;
1160 LOOP_VINFO_VECT_FACTOR (res) = 0;
1161 LOOP_VINFO_LOOP_NEST (res) = vNULL;
1162 LOOP_VINFO_DATAREFS (res) = vNULL;
1163 LOOP_VINFO_DDRS (res) = vNULL;
1164 LOOP_VINFO_UNALIGNED_DR (res) = NULL;
1165 LOOP_VINFO_MAY_MISALIGN_STMTS (res) = vNULL;
1166 LOOP_VINFO_MAY_ALIAS_DDRS (res) = vNULL;
1167 LOOP_VINFO_GROUPED_STORES (res) = vNULL;
1168 LOOP_VINFO_REDUCTIONS (res) = vNULL;
1169 LOOP_VINFO_REDUCTION_CHAINS (res) = vNULL;
1170 LOOP_VINFO_SLP_INSTANCES (res) = vNULL;
1171 LOOP_VINFO_SLP_UNROLLING_FACTOR (res) = 1;
1172 LOOP_VINFO_TARGET_COST_DATA (res) = init_cost (loop);
1173 LOOP_VINFO_PEELING_FOR_GAPS (res) = false;
1174 LOOP_VINFO_PEELING_FOR_NITER (res) = false;
1175 LOOP_VINFO_OPERANDS_SWAPPED (res) = false;
1177 return res;
1181 /* Function destroy_loop_vec_info.
1183 Free LOOP_VINFO struct, as well as all the stmt_vec_info structs of all the
1184 stmts in the loop. */
1186 void
1187 destroy_loop_vec_info (loop_vec_info loop_vinfo, bool clean_stmts)
1189 struct loop *loop;
1190 basic_block *bbs;
1191 int nbbs;
1192 gimple_stmt_iterator si;
1193 int j;
1194 vec<slp_instance> slp_instances;
1195 slp_instance instance;
1196 bool swapped;
1198 if (!loop_vinfo)
1199 return;
1201 loop = LOOP_VINFO_LOOP (loop_vinfo);
1203 bbs = LOOP_VINFO_BBS (loop_vinfo);
1204 nbbs = clean_stmts ? loop->num_nodes : 0;
1205 swapped = LOOP_VINFO_OPERANDS_SWAPPED (loop_vinfo);
1207 for (j = 0; j < nbbs; j++)
1209 basic_block bb = bbs[j];
1210 for (si = gsi_start_phis (bb); !gsi_end_p (si); gsi_next (&si))
1211 free_stmt_vec_info (gsi_stmt (si));
1213 for (si = gsi_start_bb (bb); !gsi_end_p (si); )
1215 gimple *stmt = gsi_stmt (si);
1217 /* We may have broken canonical form by moving a constant
1218 into RHS1 of a commutative op. Fix such occurrences. */
1219 if (swapped && is_gimple_assign (stmt))
1221 enum tree_code code = gimple_assign_rhs_code (stmt);
1223 if ((code == PLUS_EXPR
1224 || code == POINTER_PLUS_EXPR
1225 || code == MULT_EXPR)
1226 && CONSTANT_CLASS_P (gimple_assign_rhs1 (stmt)))
1227 swap_ssa_operands (stmt,
1228 gimple_assign_rhs1_ptr (stmt),
1229 gimple_assign_rhs2_ptr (stmt));
1232 /* Free stmt_vec_info. */
1233 free_stmt_vec_info (stmt);
1234 gsi_next (&si);
1238 free (LOOP_VINFO_BBS (loop_vinfo));
1239 vect_destroy_datarefs (loop_vinfo);
1240 free_dependence_relations (LOOP_VINFO_DDRS (loop_vinfo));
1241 LOOP_VINFO_LOOP_NEST (loop_vinfo).release ();
1242 LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).release ();
1243 LOOP_VINFO_COMP_ALIAS_DDRS (loop_vinfo).release ();
1244 LOOP_VINFO_MAY_ALIAS_DDRS (loop_vinfo).release ();
1245 slp_instances = LOOP_VINFO_SLP_INSTANCES (loop_vinfo);
1246 FOR_EACH_VEC_ELT (slp_instances, j, instance)
1247 vect_free_slp_instance (instance);
1249 LOOP_VINFO_SLP_INSTANCES (loop_vinfo).release ();
1250 LOOP_VINFO_GROUPED_STORES (loop_vinfo).release ();
1251 LOOP_VINFO_REDUCTIONS (loop_vinfo).release ();
1252 LOOP_VINFO_REDUCTION_CHAINS (loop_vinfo).release ();
1254 destroy_cost_data (LOOP_VINFO_TARGET_COST_DATA (loop_vinfo));
1255 loop_vinfo->scalar_cost_vec.release ();
1257 free (loop_vinfo);
1258 loop->aux = NULL;
1262 /* Calculate the cost of one scalar iteration of the loop. */
1263 static void
1264 vect_compute_single_scalar_iteration_cost (loop_vec_info loop_vinfo)
1266 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1267 basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
1268 int nbbs = loop->num_nodes, factor, scalar_single_iter_cost = 0;
1269 int innerloop_iters, i;
1271 /* Count statements in scalar loop. Using this as scalar cost for a single
1272 iteration for now.
1274 TODO: Add outer loop support.
1276 TODO: Consider assigning different costs to different scalar
1277 statements. */
1279 /* FORNOW. */
1280 innerloop_iters = 1;
1281 if (loop->inner)
1282 innerloop_iters = 50; /* FIXME */
1284 for (i = 0; i < nbbs; i++)
1286 gimple_stmt_iterator si;
1287 basic_block bb = bbs[i];
1289 if (bb->loop_father == loop->inner)
1290 factor = innerloop_iters;
1291 else
1292 factor = 1;
1294 for (si = gsi_start_bb (bb); !gsi_end_p (si); gsi_next (&si))
1296 gimple *stmt = gsi_stmt (si);
1297 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
1299 if (!is_gimple_assign (stmt) && !is_gimple_call (stmt))
1300 continue;
1302 /* Skip stmts that are not vectorized inside the loop. */
1303 if (stmt_info
1304 && !STMT_VINFO_RELEVANT_P (stmt_info)
1305 && (!STMT_VINFO_LIVE_P (stmt_info)
1306 || !VECTORIZABLE_CYCLE_DEF (STMT_VINFO_DEF_TYPE (stmt_info)))
1307 && !STMT_VINFO_IN_PATTERN_P (stmt_info))
1308 continue;
1310 vect_cost_for_stmt kind;
1311 if (STMT_VINFO_DATA_REF (vinfo_for_stmt (stmt)))
1313 if (DR_IS_READ (STMT_VINFO_DATA_REF (vinfo_for_stmt (stmt))))
1314 kind = scalar_load;
1315 else
1316 kind = scalar_store;
1318 else
1319 kind = scalar_stmt;
1321 scalar_single_iter_cost
1322 += record_stmt_cost (&LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo),
1323 factor, kind, NULL, 0, vect_prologue);
1326 LOOP_VINFO_SINGLE_SCALAR_ITERATION_COST (loop_vinfo)
1327 = scalar_single_iter_cost;
1331 /* Function vect_analyze_loop_form_1.
1333 Verify that certain CFG restrictions hold, including:
1334 - the loop has a pre-header
1335 - the loop has a single entry and exit
1336 - the loop exit condition is simple enough
1337 - the number of iterations can be analyzed, i.e., a countable loop. The
1338 niter could be analyzed under some assumptions. */
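/* For example, "while (p->next) p = p->next;" has no number of iterations
   that scev can compute and is rejected here, whereas
   "for (i = 0; i < n; i++)" is countable, possibly only under assumptions
   (e.g. about overflow) that are then recorded for loop versioning.  */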
1340 bool
1341 vect_analyze_loop_form_1 (struct loop *loop, gcond **loop_cond,
1342 tree *assumptions, tree *number_of_iterationsm1,
1343 tree *number_of_iterations, gcond **inner_loop_cond)
1345 if (dump_enabled_p ())
1346 dump_printf_loc (MSG_NOTE, vect_location,
1347 "=== vect_analyze_loop_form ===\n");
1349 /* Different restrictions apply when we are considering an inner-most loop,
1350 vs. an outer (nested) loop.
1351 (FORNOW. May want to relax some of these restrictions in the future). */
1353 if (!loop->inner)
1355 /* Inner-most loop. We currently require that the number of BBs is
1356 exactly 2 (the header and latch). Vectorizable inner-most loops
1357 look like this:
1359 (pre-header)
1361 header <--------+
1362 | | |
1363 | +--> latch --+
1365 (exit-bb) */
1367 if (loop->num_nodes != 2)
1369 if (dump_enabled_p ())
1370 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1371 "not vectorized: control flow in loop.\n");
1372 return false;
1375 if (empty_block_p (loop->header))
1377 if (dump_enabled_p ())
1378 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1379 "not vectorized: empty loop.\n");
1380 return false;
1383 else
1385 struct loop *innerloop = loop->inner;
1386 edge entryedge;
1388 /* Nested loop. We currently require that the loop is doubly-nested,
1389 contains a single inner loop, and the number of BBs is exactly 5.
1390 Vectorizable outer-loops look like this:
1392 (pre-header)
1394 header <---+
1396 inner-loop |
1398 tail ------+
1400 (exit-bb)
1402 The inner-loop has the properties expected of inner-most loops
1403 as described above. */
1405 if ((loop->inner)->inner || (loop->inner)->next)
1407 if (dump_enabled_p ())
1408 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1409 "not vectorized: multiple nested loops.\n");
1410 return false;
1413 if (loop->num_nodes != 5)
1415 if (dump_enabled_p ())
1416 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1417 "not vectorized: control flow in loop.\n");
1418 return false;
1421 entryedge = loop_preheader_edge (innerloop);
1422 if (entryedge->src != loop->header
1423 || !single_exit (innerloop)
1424 || single_exit (innerloop)->dest != EDGE_PRED (loop->latch, 0)->src)
1426 if (dump_enabled_p ())
1427 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1428 "not vectorized: unsupported outerloop form.\n");
1429 return false;
1432 /* Analyze the inner-loop. */
1433 tree inner_niterm1, inner_niter, inner_assumptions;
1434 if (! vect_analyze_loop_form_1 (loop->inner, inner_loop_cond,
1435 &inner_assumptions, &inner_niterm1,
1436 &inner_niter, NULL)
1437 /* Don't support analyzing niter under assumptions for inner
1438 loop. */
1439 || !integer_onep (inner_assumptions))
1441 if (dump_enabled_p ())
1442 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1443 "not vectorized: Bad inner loop.\n");
1444 return false;
1447 if (!expr_invariant_in_loop_p (loop, inner_niter))
1449 if (dump_enabled_p ())
1450 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1451 "not vectorized: inner-loop count not"
1452 " invariant.\n");
1453 return false;
1456 if (dump_enabled_p ())
1457 dump_printf_loc (MSG_NOTE, vect_location,
1458 "Considering outer-loop vectorization.\n");
1461 if (!single_exit (loop)
1462 || EDGE_COUNT (loop->header->preds) != 2)
1464 if (dump_enabled_p ())
1466 if (!single_exit (loop))
1467 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1468 "not vectorized: multiple exits.\n");
1469 else if (EDGE_COUNT (loop->header->preds) != 2)
1470 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1471 "not vectorized: too many incoming edges.\n");
1473 return false;
1476 /* We assume that the loop exit condition is at the end of the loop, i.e.,
1477 that the loop is represented as a do-while (with a proper if-guard
1478 before the loop if needed), where the loop header contains all the
1479 executable statements, and the latch is empty. */
1480 if (!empty_block_p (loop->latch)
1481 || !gimple_seq_empty_p (phi_nodes (loop->latch)))
1483 if (dump_enabled_p ())
1484 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1485 "not vectorized: latch block not empty.\n");
1486 return false;
1489 /* Make sure the exit is not abnormal. */
1490 edge e = single_exit (loop);
1491 if (e->flags & EDGE_ABNORMAL)
1493 if (dump_enabled_p ())
1494 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1495 "not vectorized: abnormal loop exit edge.\n");
1496 return false;
1499 *loop_cond = vect_get_loop_niters (loop, assumptions, number_of_iterations,
1500 number_of_iterationsm1);
1501 if (!*loop_cond)
1503 if (dump_enabled_p ())
1504 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1505 "not vectorized: complicated exit condition.\n");
1506 return false;
1509 if (integer_zerop (*assumptions)
1510 || !*number_of_iterations
1511 || chrec_contains_undetermined (*number_of_iterations))
1513 if (dump_enabled_p ())
1514 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1515 "not vectorized: number of iterations cannot be "
1516 "computed.\n");
1517 return false;
1520 if (integer_zerop (*number_of_iterations))
1522 if (dump_enabled_p ())
1523 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1524 "not vectorized: number of iterations = 0.\n");
1525 return false;
1528 return true;
1531 /* Analyze LOOP form and return a loop_vec_info if it is of suitable form. */
1533 loop_vec_info
1534 vect_analyze_loop_form (struct loop *loop)
1536 tree assumptions, number_of_iterations, number_of_iterationsm1;
1537 gcond *loop_cond, *inner_loop_cond = NULL;
1539 if (! vect_analyze_loop_form_1 (loop, &loop_cond,
1540 &assumptions, &number_of_iterationsm1,
1541 &number_of_iterations, &inner_loop_cond))
1542 return NULL;
1544 loop_vec_info loop_vinfo = new_loop_vec_info (loop);
1545 LOOP_VINFO_NITERSM1 (loop_vinfo) = number_of_iterationsm1;
1546 LOOP_VINFO_NITERS (loop_vinfo) = number_of_iterations;
1547 LOOP_VINFO_NITERS_UNCHANGED (loop_vinfo) = number_of_iterations;
1548 if (!integer_onep (assumptions))
1550 /* We consider to vectorize this loop by versioning it under
1551 some assumptions. In order to do this, we need to clear
1552 existing information computed by scev and niter analyzer. */
1553 scev_reset_htab ();
1554 free_numbers_of_iterations_estimates_loop (loop);
1555 /* Also set flag for this loop so that following scev and niter
1556 analysis are done under the assumptions. */
1557 loop_constraint_set (loop, LOOP_C_FINITE);
1558 /* Also record the assumptions for versioning. */
1559 LOOP_VINFO_NITERS_ASSUMPTIONS (loop_vinfo) = assumptions;
1562 if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
1564 if (dump_enabled_p ())
1566 dump_printf_loc (MSG_NOTE, vect_location,
1567 "Symbolic number of iterations is ");
1568 dump_generic_expr (MSG_NOTE, TDF_DETAILS, number_of_iterations);
1569 dump_printf (MSG_NOTE, "\n");
1573 STMT_VINFO_TYPE (vinfo_for_stmt (loop_cond)) = loop_exit_ctrl_vec_info_type;
1574 if (inner_loop_cond)
1575 STMT_VINFO_TYPE (vinfo_for_stmt (inner_loop_cond))
1576 = loop_exit_ctrl_vec_info_type;
1578 gcc_assert (!loop->aux);
1579 loop->aux = loop_vinfo;
1580 return loop_vinfo;
1585 /* Scan the loop stmts and dependent on whether there are any (non-)SLP
1586 statements update the vectorization factor. */
1588 static void
1589 vect_update_vf_for_slp (loop_vec_info loop_vinfo)
1591 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1592 basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
1593 int nbbs = loop->num_nodes;
1594 unsigned int vectorization_factor;
1595 int i;
1597 if (dump_enabled_p ())
1598 dump_printf_loc (MSG_NOTE, vect_location,
1599 "=== vect_update_vf_for_slp ===\n");
1601 vectorization_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
1602 gcc_assert (vectorization_factor != 0);
1604 /* If all the stmts in the loop can be SLPed, we perform only SLP, and
1605 vectorization factor of the loop is the unrolling factor required by
1606 the SLP instances.  If that unrolling factor is 1, we say that we
1607 perform pure SLP on the loop; cross-iteration parallelism is not
1608 exploited. */
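  /* For example, if the statement analysis gave VF = 4 but the SLP instances
     require an unrolling factor of 8, the hybrid case below sets the loop
     VF to least_common_multiple (4, 8) = 8.  */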
1609 bool only_slp_in_loop = true;
1610 for (i = 0; i < nbbs; i++)
1612 basic_block bb = bbs[i];
1613 for (gimple_stmt_iterator si = gsi_start_bb (bb); !gsi_end_p (si);
1614 gsi_next (&si))
1616 gimple *stmt = gsi_stmt (si);
1617 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
1618 if (STMT_VINFO_IN_PATTERN_P (stmt_info)
1619 && STMT_VINFO_RELATED_STMT (stmt_info))
1621 stmt = STMT_VINFO_RELATED_STMT (stmt_info);
1622 stmt_info = vinfo_for_stmt (stmt);
1624 if ((STMT_VINFO_RELEVANT_P (stmt_info)
1625 || VECTORIZABLE_CYCLE_DEF (STMT_VINFO_DEF_TYPE (stmt_info)))
1626 && !PURE_SLP_STMT (stmt_info))
1627 /* STMT needs both SLP and loop-based vectorization. */
1628 only_slp_in_loop = false;
1632 if (only_slp_in_loop)
1633 vectorization_factor = LOOP_VINFO_SLP_UNROLLING_FACTOR (loop_vinfo);
1634 else
1635 vectorization_factor
1636 = least_common_multiple (vectorization_factor,
1637 LOOP_VINFO_SLP_UNROLLING_FACTOR (loop_vinfo));
1639 LOOP_VINFO_VECT_FACTOR (loop_vinfo) = vectorization_factor;
1640 if (dump_enabled_p ())
1641 dump_printf_loc (MSG_NOTE, vect_location,
1642 "Updating vectorization factor to %d\n",
1643 vectorization_factor);
1646 /* Function vect_analyze_loop_operations.
1648 Scan the loop stmts and make sure they are all vectorizable. */
1650 static bool
1651 vect_analyze_loop_operations (loop_vec_info loop_vinfo)
1653 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1654 basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
1655 int nbbs = loop->num_nodes;
1656 int i;
1657 stmt_vec_info stmt_info;
1658 bool need_to_vectorize = false;
1659 bool ok;
1661 if (dump_enabled_p ())
1662 dump_printf_loc (MSG_NOTE, vect_location,
1663 "=== vect_analyze_loop_operations ===\n");
1665 for (i = 0; i < nbbs; i++)
1667 basic_block bb = bbs[i];
1669 for (gphi_iterator si = gsi_start_phis (bb); !gsi_end_p (si);
1670 gsi_next (&si))
1672 gphi *phi = si.phi ();
1673 ok = true;
1675 stmt_info = vinfo_for_stmt (phi);
1676 if (dump_enabled_p ())
1678 dump_printf_loc (MSG_NOTE, vect_location, "examining phi: ");
1679 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, phi, 0);
1681 if (virtual_operand_p (gimple_phi_result (phi)))
1682 continue;
1684 /* Inner-loop loop-closed exit phi in outer-loop vectorization
1685 (i.e., a phi in the tail of the outer-loop). */
1686 if (! is_loop_header_bb_p (bb))
1688 /* FORNOW: we currently don't support the case that these phis
1689 are not used in the outerloop (unless it is double reduction,
1690 i.e., this phi is vect_reduction_def), because this case
1691 requires us to actually do something here. */
1692 if ((!STMT_VINFO_RELEVANT_P (stmt_info)
1693 || STMT_VINFO_LIVE_P (stmt_info))
1694 && STMT_VINFO_DEF_TYPE (stmt_info)
1695 != vect_double_reduction_def)
1697 if (dump_enabled_p ())
1698 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1699 "Unsupported loop-closed phi in "
1700 "outer-loop.\n");
1701 return false;
1704 /* If PHI is used in the outer loop, we check that its operand
1705 is defined in the inner loop. */
1706 if (STMT_VINFO_RELEVANT_P (stmt_info))
1708 tree phi_op;
1709 gimple *op_def_stmt;
1711 if (gimple_phi_num_args (phi) != 1)
1712 return false;
1714 phi_op = PHI_ARG_DEF (phi, 0);
1715 if (TREE_CODE (phi_op) != SSA_NAME)
1716 return false;
1718 op_def_stmt = SSA_NAME_DEF_STMT (phi_op);
1719 if (gimple_nop_p (op_def_stmt)
1720 || !flow_bb_inside_loop_p (loop, gimple_bb (op_def_stmt))
1721 || !vinfo_for_stmt (op_def_stmt))
1722 return false;
1724 if (STMT_VINFO_RELEVANT (vinfo_for_stmt (op_def_stmt))
1725 != vect_used_in_outer
1726 && STMT_VINFO_RELEVANT (vinfo_for_stmt (op_def_stmt))
1727 != vect_used_in_outer_by_reduction)
1728 return false;
1731 continue;
1734 gcc_assert (stmt_info);
1736 if ((STMT_VINFO_RELEVANT (stmt_info) == vect_used_in_scope
1737 || STMT_VINFO_LIVE_P (stmt_info))
1738 && STMT_VINFO_DEF_TYPE (stmt_info) != vect_induction_def)
1740 /* A scalar-dependence cycle that we don't support. */
1741 if (dump_enabled_p ())
1742 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1743 "not vectorized: scalar dependence cycle.\n");
1744 return false;
1747 if (STMT_VINFO_RELEVANT_P (stmt_info))
1749 need_to_vectorize = true;
1750 if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_induction_def)
1751 ok = vectorizable_induction (phi, NULL, NULL);
1754 if (ok && STMT_VINFO_LIVE_P (stmt_info))
1755 ok = vectorizable_live_operation (phi, NULL, NULL, -1, NULL);
1757 if (!ok)
1759 if (dump_enabled_p ())
1761 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1762 "not vectorized: relevant phi not "
1763 "supported: ");
1764 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, phi, 0);
1766 return false;
1770 for (gimple_stmt_iterator si = gsi_start_bb (bb); !gsi_end_p (si);
1771 gsi_next (&si))
1773 gimple *stmt = gsi_stmt (si);
1774 if (!gimple_clobber_p (stmt)
1775 && !vect_analyze_stmt (stmt, &need_to_vectorize, NULL))
1776 return false;
1778 } /* bbs */
1780 /* All operations in the loop are either irrelevant (deal with loop
1781 control, or dead), or only used outside the loop and can be moved
1782 out of the loop (e.g. invariants, inductions). The loop can be
1783 optimized away by scalar optimizations. We're better off not
1784 touching this loop. */
1785 if (!need_to_vectorize)
1787 if (dump_enabled_p ())
1788 dump_printf_loc (MSG_NOTE, vect_location,
1789 "All the computation can be taken out of the loop.\n");
1790 if (dump_enabled_p ())
1791 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1792 "not vectorized: redundant loop. no profit to "
1793 "vectorize.\n");
1794 return false;
1797 return true;
1801 /* Function vect_analyze_loop_2.
1803 Apply a set of analyses on LOOP, and create a loop_vec_info struct
1804 for it. The different analyses will record information in the
1805 loop_vec_info struct. */
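/* The analyses run roughly in this order: find the loop nest and the data
   references, classify scalar cycles, run pattern recognition, analyze
   data-ref access patterns, mark the stmts that need to be vectorized,
   check data dependences, determine the vectorization factor, build SLP
   trees, and only then the alignment and costing checks that may restart
   the analysis with SLP forced off (see the start_over label below).  */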
1806 static bool
1807 vect_analyze_loop_2 (loop_vec_info loop_vinfo, bool &fatal)
1809 bool ok;
1810 int max_vf = MAX_VECTORIZATION_FACTOR;
1811 int min_vf = 2;
1812 unsigned int n_stmts = 0;
1814 /* The first group of checks is independent of the vector size. */
1815 fatal = true;
1817 /* Find all data references in the loop (which correspond to vdefs/vuses)
1818 and analyze their evolution in the loop. */
1820 basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
1822 loop_p loop = LOOP_VINFO_LOOP (loop_vinfo);
1823 if (!find_loop_nest (loop, &LOOP_VINFO_LOOP_NEST (loop_vinfo)))
1825 if (dump_enabled_p ())
1826 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1827 "not vectorized: loop nest containing two "
1828 "or more consecutive inner loops cannot be "
1829 "vectorized\n");
1830 return false;
1833 for (unsigned i = 0; i < loop->num_nodes; i++)
1834 for (gimple_stmt_iterator gsi = gsi_start_bb (bbs[i]);
1835 !gsi_end_p (gsi); gsi_next (&gsi))
1837 gimple *stmt = gsi_stmt (gsi);
1838 if (is_gimple_debug (stmt))
1839 continue;
1840 ++n_stmts;
1841 if (!find_data_references_in_stmt (loop, stmt,
1842 &LOOP_VINFO_DATAREFS (loop_vinfo)))
1844 if (is_gimple_call (stmt) && loop->safelen)
1846 tree fndecl = gimple_call_fndecl (stmt), op;
1847 if (fndecl != NULL_TREE)
1849 cgraph_node *node = cgraph_node::get (fndecl);
1850 if (node != NULL && node->simd_clones != NULL)
1852 unsigned int j, n = gimple_call_num_args (stmt);
1853 for (j = 0; j < n; j++)
1855 op = gimple_call_arg (stmt, j);
1856 if (DECL_P (op)
1857 || (REFERENCE_CLASS_P (op)
1858 && get_base_address (op)))
1859 break;
1861 op = gimple_call_lhs (stmt);
1862 /* Ignore #pragma omp declare simd functions
1863 if they don't have data references in the
1864 call stmt itself. */
1865 if (j == n
1866 && !(op
1867 && (DECL_P (op)
1868 || (REFERENCE_CLASS_P (op)
1869 && get_base_address (op)))))
1870 continue;
1874 if (dump_enabled_p ())
1875 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1876 "not vectorized: loop contains function "
1877 "calls or data references that cannot "
1878 "be analyzed\n");
1879 return false;
1883 /* Analyze the data references and also adjust the minimal
1884 vectorization factor according to the loads and stores. */
1886 ok = vect_analyze_data_refs (loop_vinfo, &min_vf);
1887 if (!ok)
1889 if (dump_enabled_p ())
1890 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1891 "bad data references.\n");
1892 return false;
1895 /* Classify all cross-iteration scalar data-flow cycles.
1896 Cross-iteration cycles caused by virtual phis are analyzed separately. */
1897 vect_analyze_scalar_cycles (loop_vinfo);
1899 vect_pattern_recog (loop_vinfo);
1901 vect_fixup_scalar_cycles_with_patterns (loop_vinfo);
1903 /* Analyze the access patterns of the data-refs in the loop (consecutive,
1904 complex, etc.). FORNOW: Only handle consecutive access pattern. */
1906 ok = vect_analyze_data_ref_accesses (loop_vinfo);
1907 if (!ok)
1909 if (dump_enabled_p ())
1910 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1911 "bad data access.\n");
1912 return false;
1915 /* Data-flow analysis to detect stmts that do not need to be vectorized. */
1917 ok = vect_mark_stmts_to_be_vectorized (loop_vinfo);
1918 if (!ok)
1920 if (dump_enabled_p ())
1921 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1922 "unexpected pattern.\n");
1923 return false;
1926 /* The rest of the analysis below depends on the vector size in some way, so from here on a failure is no longer considered fatal. */
1927 fatal = false;
1929 /* Analyze data dependences between the data-refs in the loop
1930 and adjust the maximum vectorization factor according to
1931 the dependences.
1932 FORNOW: fail at the first data dependence that we encounter. */
1934 ok = vect_analyze_data_ref_dependences (loop_vinfo, &max_vf);
1935 if (!ok
1936 || max_vf < min_vf)
1938 if (dump_enabled_p ())
1939 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1940 "bad data dependence.\n");
1941 return false;
1944 ok = vect_determine_vectorization_factor (loop_vinfo);
1945 if (!ok)
1947 if (dump_enabled_p ())
1948 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1949 "can't determine vectorization factor.\n");
1950 return false;
1952 if (max_vf < LOOP_VINFO_VECT_FACTOR (loop_vinfo))
1954 if (dump_enabled_p ())
1955 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1956 "bad data dependence.\n");
1957 return false;
1960 /* Compute the scalar iteration cost. */
1961 vect_compute_single_scalar_iteration_cost (loop_vinfo);
1963 int saved_vectorization_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
1964 HOST_WIDE_INT estimated_niter;
1965 unsigned th;
1966 int min_scalar_loop_bound;
1968 /* Check the SLP opportunities in the loop, analyze and build SLP trees. */
1969 ok = vect_analyze_slp (loop_vinfo, n_stmts);
1970 if (!ok)
1971 return false;
1973 /* If there are any SLP instances mark them as pure_slp. */
1974 bool slp = vect_make_slp_decision (loop_vinfo);
1975 if (slp)
1977 /* Find stmts that need to be both vectorized and SLPed. */
1978 vect_detect_hybrid_slp (loop_vinfo);
1980 /* Update the vectorization factor based on the SLP decision. */
1981 vect_update_vf_for_slp (loop_vinfo);
1984 /* This is the point where we can re-start analysis with SLP forced off. */
1985 start_over:
1987 /* Now the vectorization factor is final. */
1988 unsigned vectorization_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
1989 gcc_assert (vectorization_factor != 0);
1991 if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo) && dump_enabled_p ())
1992 dump_printf_loc (MSG_NOTE, vect_location,
1993 "vectorization_factor = %d, niters = "
1994 HOST_WIDE_INT_PRINT_DEC "\n", vectorization_factor,
1995 LOOP_VINFO_INT_NITERS (loop_vinfo));
1997 HOST_WIDE_INT max_niter
1998 = likely_max_stmt_executions_int (LOOP_VINFO_LOOP (loop_vinfo));
1999 if ((LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
2000 && (LOOP_VINFO_INT_NITERS (loop_vinfo) < vectorization_factor))
2001 || (max_niter != -1
2002 && (unsigned HOST_WIDE_INT) max_niter < vectorization_factor))
2004 if (dump_enabled_p ())
2005 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2006 "not vectorized: iteration count smaller than "
2007 "vectorization factor.\n");
2008 return false;
2011 /* Analyze the alignment of the data-refs in the loop.
2012 Fail if a data reference is found that cannot be vectorized. */
2014 ok = vect_analyze_data_refs_alignment (loop_vinfo);
2015 if (!ok)
2017 if (dump_enabled_p ())
2018 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2019 "bad data alignment.\n");
2020 return false;
2023 /* Prune the list of ddrs to be tested at run-time by versioning for alias.
2024 It is important to call pruning after vect_analyze_data_ref_accesses,
2025 since we use grouping information gathered by interleaving analysis. */
2026 ok = vect_prune_runtime_alias_test_list (loop_vinfo);
2027 if (!ok)
2028 return false;
2030 /* This pass will decide on using loop versioning and/or loop peeling in
2031 order to enhance the alignment of data references in the loop. */
2032 ok = vect_enhance_data_refs_alignment (loop_vinfo);
2033 if (!ok)
2035 if (dump_enabled_p ())
2036 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2037 "bad data alignment.\n");
2038 return false;
2041 if (slp)
2043 /* Analyze operations in the SLP instances. Note this may
2044 remove unsupported SLP instances which makes the above
2045 SLP kind detection invalid. */
2046 unsigned old_size = LOOP_VINFO_SLP_INSTANCES (loop_vinfo).length ();
2047 vect_slp_analyze_operations (LOOP_VINFO_SLP_INSTANCES (loop_vinfo),
2048 LOOP_VINFO_TARGET_COST_DATA (loop_vinfo));
2049 if (LOOP_VINFO_SLP_INSTANCES (loop_vinfo).length () != old_size)
2050 goto again;
2053 /* Scan all the remaining operations in the loop that are not subject
2054 to SLP and make sure they are vectorizable. */
2055 ok = vect_analyze_loop_operations (loop_vinfo);
2056 if (!ok)
2058 if (dump_enabled_p ())
2059 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2060 "bad operation or unsupported loop bound.\n");
2061 return false;
2064 /* Analyze cost. Decide if worth while to vectorize. */
2065 int min_profitable_estimate, min_profitable_iters;
2066 vect_estimate_min_profitable_iters (loop_vinfo, &min_profitable_iters,
2067 &min_profitable_estimate);
2069 if (min_profitable_iters < 0)
2071 if (dump_enabled_p ())
2072 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2073 "not vectorized: vectorization not profitable.\n");
2074 if (dump_enabled_p ())
2075 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2076 "not vectorized: vector version will never be "
2077 "profitable.\n");
2078 goto again;
2081 min_scalar_loop_bound = ((PARAM_VALUE (PARAM_MIN_VECT_LOOP_BOUND)
2082 * vectorization_factor) - 1);
2084 /* Use the cost model only if it is more conservative than the user-specified
2085 threshold. */
2086 th = (unsigned) min_scalar_loop_bound;
2087 if (min_profitable_iters
2088 && (!min_scalar_loop_bound
2089 || min_profitable_iters > min_scalar_loop_bound))
2090 th = (unsigned) min_profitable_iters;
2092 LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo) = th;
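/* Illustrative example with hypothetical numbers: with
   --param min-vect-loop-bound=4 and a vectorization factor of 4,
   min_scalar_loop_bound is 4 * 4 - 1 == 15; if the cost model computed
   min_profitable_iters == 20, the more conservative value 20 becomes TH,
   whereas min_profitable_iters == 10 would leave TH at 15.  */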
2094 if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
2095 && LOOP_VINFO_INT_NITERS (loop_vinfo) <= th)
2097 if (dump_enabled_p ())
2098 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2099 "not vectorized: vectorization not profitable.\n");
2100 if (dump_enabled_p ())
2101 dump_printf_loc (MSG_NOTE, vect_location,
2102 "not vectorized: iteration count smaller than user "
2103 "specified loop bound parameter or minimum profitable "
2104 "iterations (whichever is more conservative).\n");
2105 goto again;
2108 estimated_niter
2109 = estimated_stmt_executions_int (LOOP_VINFO_LOOP (loop_vinfo));
2110 if (estimated_niter == -1)
2111 estimated_niter = max_niter;
2112 if (estimated_niter != -1
2113 && ((unsigned HOST_WIDE_INT) estimated_niter
2114 <= MAX (th, (unsigned)min_profitable_estimate)))
2116 if (dump_enabled_p ())
2117 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2118 "not vectorized: estimated iteration count too "
2119 "small.\n");
2120 if (dump_enabled_p ())
2121 dump_printf_loc (MSG_NOTE, vect_location,
2122 "not vectorized: estimated iteration count smaller "
2123 "than specified loop bound parameter or minimum "
2124 "profitable iterations (whichever is more "
2125 "conservative).\n");
2126 goto again;
2129 /* Decide whether we need to create an epilogue loop to handle
2130 remaining scalar iterations. */
2131 th = ((LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo) + 1)
2132 / LOOP_VINFO_VECT_FACTOR (loop_vinfo))
2133 * LOOP_VINFO_VECT_FACTOR (loop_vinfo);
2135 if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
2136 && LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) > 0)
2138 if (ctz_hwi (LOOP_VINFO_INT_NITERS (loop_vinfo)
2139 - LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo))
2140 < exact_log2 (LOOP_VINFO_VECT_FACTOR (loop_vinfo)))
2141 LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo) = true;
2143 else if (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo)
2144 || (tree_ctz (LOOP_VINFO_NITERS (loop_vinfo))
2145 < (unsigned)exact_log2 (LOOP_VINFO_VECT_FACTOR (loop_vinfo))
2146 /* In case of versioning, check if the maximum number of
2147 iterations is greater than th. If they are identical,
2148 the epilogue is unnecessary. */
2149 && (!LOOP_REQUIRES_VERSIONING (loop_vinfo)
2150 || (unsigned HOST_WIDE_INT) max_niter > th)))
2151 LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo) = true;
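/* Worked example (illustrative only): with VF == 4, a known iteration
   count of 10, no peeling for alignment and no versioning,
   tree_ctz (10) == 1 is smaller than exact_log2 (4) == 2, so
   PEELING_FOR_NITER is set and an epilogue loop will later handle the
   10 % 4 == 2 leftover scalar iterations.  */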
2153 /* If an epilogue loop is required make sure we can create one. */
2154 if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo)
2155 || LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo))
2157 if (dump_enabled_p ())
2158 dump_printf_loc (MSG_NOTE, vect_location, "epilog loop required\n");
2159 if (!vect_can_advance_ivs_p (loop_vinfo)
2160 || !slpeel_can_duplicate_loop_p (LOOP_VINFO_LOOP (loop_vinfo),
2161 single_exit (LOOP_VINFO_LOOP
2162 (loop_vinfo))))
2164 if (dump_enabled_p ())
2165 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2166 "not vectorized: can't create required "
2167 "epilog loop\n");
2168 goto again;
2172 gcc_assert (vectorization_factor
2173 == (unsigned)LOOP_VINFO_VECT_FACTOR (loop_vinfo));
2175 /* Ok to vectorize! */
2176 return true;
2178 again:
2179 /* Try again with SLP forced off, but if we didn't do any SLP there is
2180 no point in re-trying. */
2181 if (!slp)
2182 return false;
2184 /* If there are reduction chains re-trying will fail anyway. */
2185 if (! LOOP_VINFO_REDUCTION_CHAINS (loop_vinfo).is_empty ())
2186 return false;
2188 /* Likewise if the grouped loads or stores in the SLP cannot be handled
2189 via interleaving or lane instructions. */
2190 slp_instance instance;
2191 slp_tree node;
2192 unsigned i, j;
2193 FOR_EACH_VEC_ELT (LOOP_VINFO_SLP_INSTANCES (loop_vinfo), i, instance)
2195 stmt_vec_info vinfo;
2196 vinfo = vinfo_for_stmt
2197 (SLP_TREE_SCALAR_STMTS (SLP_INSTANCE_TREE (instance))[0]);
2198 if (! STMT_VINFO_GROUPED_ACCESS (vinfo))
2199 continue;
2200 vinfo = vinfo_for_stmt (STMT_VINFO_GROUP_FIRST_ELEMENT (vinfo));
2201 unsigned int size = STMT_VINFO_GROUP_SIZE (vinfo);
2202 tree vectype = STMT_VINFO_VECTYPE (vinfo);
2203 if (! vect_store_lanes_supported (vectype, size)
2204 && ! vect_grouped_store_supported (vectype, size))
2205 return false;
2206 FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (instance), j, node)
2208 vinfo = vinfo_for_stmt (SLP_TREE_SCALAR_STMTS (node)[0]);
2209 vinfo = vinfo_for_stmt (STMT_VINFO_GROUP_FIRST_ELEMENT (vinfo));
2210 bool single_element_p = !STMT_VINFO_GROUP_NEXT_ELEMENT (vinfo);
2211 size = STMT_VINFO_GROUP_SIZE (vinfo);
2212 vectype = STMT_VINFO_VECTYPE (vinfo);
2213 if (! vect_load_lanes_supported (vectype, size)
2214 && ! vect_grouped_load_supported (vectype, single_element_p,
2215 size))
2216 return false;
2220 if (dump_enabled_p ())
2221 dump_printf_loc (MSG_NOTE, vect_location,
2222 "re-trying with SLP disabled\n");
2224 /* Roll back state appropriately. No SLP this time. */
2225 slp = false;
2226 /* Restore the vectorization factor to what it was without SLP. */
2227 LOOP_VINFO_VECT_FACTOR (loop_vinfo) = saved_vectorization_factor;
2228 /* Free the SLP instances. */
2229 FOR_EACH_VEC_ELT (LOOP_VINFO_SLP_INSTANCES (loop_vinfo), j, instance)
2230 vect_free_slp_instance (instance);
2231 LOOP_VINFO_SLP_INSTANCES (loop_vinfo).release ();
2232 /* Reset SLP type to loop_vect on all stmts. */
2233 for (i = 0; i < LOOP_VINFO_LOOP (loop_vinfo)->num_nodes; ++i)
2235 basic_block bb = LOOP_VINFO_BBS (loop_vinfo)[i];
2236 for (gimple_stmt_iterator si = gsi_start_bb (bb);
2237 !gsi_end_p (si); gsi_next (&si))
2239 stmt_vec_info stmt_info = vinfo_for_stmt (gsi_stmt (si));
2240 STMT_SLP_TYPE (stmt_info) = loop_vect;
2241 if (STMT_VINFO_IN_PATTERN_P (stmt_info))
2243 stmt_info = vinfo_for_stmt (STMT_VINFO_RELATED_STMT (stmt_info));
2244 STMT_SLP_TYPE (stmt_info) = loop_vect;
2245 for (gimple_stmt_iterator pi
2246 = gsi_start (STMT_VINFO_PATTERN_DEF_SEQ (stmt_info));
2247 !gsi_end_p (pi); gsi_next (&pi))
2249 gimple *pstmt = gsi_stmt (pi);
2250 STMT_SLP_TYPE (vinfo_for_stmt (pstmt)) = loop_vect;
2255 /* Free optimized alias test DDRS. */
2256 LOOP_VINFO_COMP_ALIAS_DDRS (loop_vinfo).release ();
2257 /* Reset target cost data. */
2258 destroy_cost_data (LOOP_VINFO_TARGET_COST_DATA (loop_vinfo));
2259 LOOP_VINFO_TARGET_COST_DATA (loop_vinfo)
2260 = init_cost (LOOP_VINFO_LOOP (loop_vinfo));
2261 /* Reset assorted flags. */
2262 LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo) = false;
2263 LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo) = false;
2264 LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo) = 0;
2266 goto start_over;
2269 /* Function vect_analyze_loop.
2271 Apply a set of analyses on LOOP, and create a loop_vec_info struct
2272 for it. The different analyses will record information in the
2273 loop_vec_info struct. */
2274 loop_vec_info
2275 vect_analyze_loop (struct loop *loop)
2277 loop_vec_info loop_vinfo;
2278 unsigned int vector_sizes;
2280 /* Autodetect first vector size we try. */
2281 current_vector_size = 0;
2282 vector_sizes = targetm.vectorize.autovectorize_vector_sizes ();
2284 if (dump_enabled_p ())
2285 dump_printf_loc (MSG_NOTE, vect_location,
2286 "===== analyze_loop_nest =====\n");
2288 if (loop_outer (loop)
2289 && loop_vec_info_for_loop (loop_outer (loop))
2290 && LOOP_VINFO_VECTORIZABLE_P (loop_vec_info_for_loop (loop_outer (loop))))
2292 if (dump_enabled_p ())
2293 dump_printf_loc (MSG_NOTE, vect_location,
2294 "outer-loop already vectorized.\n");
2295 return NULL;
2298 while (1)
2300 /* Check the CFG characteristics of the loop (nesting, entry/exit). */
2301 loop_vinfo = vect_analyze_loop_form (loop);
2302 if (!loop_vinfo)
2304 if (dump_enabled_p ())
2305 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2306 "bad loop form.\n");
2307 return NULL;
2310 bool fatal = false;
2311 if (vect_analyze_loop_2 (loop_vinfo, fatal))
2313 LOOP_VINFO_VECTORIZABLE_P (loop_vinfo) = 1;
2315 return loop_vinfo;
2318 destroy_loop_vec_info (loop_vinfo, true);
2320 vector_sizes &= ~current_vector_size;
2321 if (fatal
2322 || vector_sizes == 0
2323 || current_vector_size == 0)
2324 return NULL;
2326 /* Try the next biggest vector size. */
2327 current_vector_size = 1 << floor_log2 (vector_sizes);
2328 if (dump_enabled_p ())
2329 dump_printf_loc (MSG_NOTE, vect_location,
2330 "***** Re-trying analysis with "
2331 "vector size %d\n", current_vector_size);
2336 /* Function reduction_code_for_scalar_code
2338 Input:
2339 CODE - tree_code of a reduction operation.
2341 Output:
2342 REDUC_CODE - the corresponding tree-code to be used to reduce the
2343 vector of partial results into a single scalar result, or ERROR_MARK
2344 if the operation is a supported reduction operation, but does not have
2345 such a tree-code.
2347 Return FALSE if CODE currently cannot be vectorized as a reduction. */
2349 static bool
2350 reduction_code_for_scalar_code (enum tree_code code,
2351 enum tree_code *reduc_code)
2353 switch (code)
2355 case MAX_EXPR:
2356 *reduc_code = REDUC_MAX_EXPR;
2357 return true;
2359 case MIN_EXPR:
2360 *reduc_code = REDUC_MIN_EXPR;
2361 return true;
2363 case PLUS_EXPR:
2364 *reduc_code = REDUC_PLUS_EXPR;
2365 return true;
2367 case MULT_EXPR:
2368 case MINUS_EXPR:
2369 case BIT_IOR_EXPR:
2370 case BIT_XOR_EXPR:
2371 case BIT_AND_EXPR:
2372 *reduc_code = ERROR_MARK;
2373 return true;
2375 default:
2376 return false;
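/* Usage sketch (illustrative): for a loop computing s = MAX (s, a[i]),
   MAX_EXPR maps to REDUC_MAX_EXPR, which collapses the vector of partial
   maxima into one scalar in the epilogue.  MINUS_EXPR is accepted as a
   reduction but has no dedicated reduction tree code, so ERROR_MARK is
   returned and the epilogue falls back to shifts and extracts.  */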
2381 /* Error reporting helper for vect_is_simple_reduction below. GIMPLE statement
2382 STMT is printed with a message MSG. */
2384 static void
2385 report_vect_op (int msg_type, gimple *stmt, const char *msg)
2387 dump_printf_loc (msg_type, vect_location, "%s", msg);
2388 dump_gimple_stmt (msg_type, TDF_SLIM, stmt, 0);
2392 /* Detect SLP reduction of the form:
2394 #a1 = phi <a5, a0>
2395 a2 = operation (a1)
2396 a3 = operation (a2)
2397 a4 = operation (a3)
2398 a5 = operation (a4)
2400 #a = phi <a5>
2402 PHI is the reduction phi node (#a1 = phi <a5, a0> above)
2403 FIRST_STMT is the first reduction stmt in the chain
2404 (a2 = operation (a1)).
2406 Return TRUE if a reduction chain was detected. */
2408 static bool
2409 vect_is_slp_reduction (loop_vec_info loop_info, gimple *phi,
2410 gimple *first_stmt)
2412 struct loop *loop = (gimple_bb (phi))->loop_father;
2413 struct loop *vect_loop = LOOP_VINFO_LOOP (loop_info);
2414 enum tree_code code;
2415 gimple *current_stmt = NULL, *loop_use_stmt = NULL, *first, *next_stmt;
2416 stmt_vec_info use_stmt_info, current_stmt_info;
2417 tree lhs;
2418 imm_use_iterator imm_iter;
2419 use_operand_p use_p;
2420 int nloop_uses, size = 0, n_out_of_loop_uses;
2421 bool found = false;
2423 if (loop != vect_loop)
2424 return false;
2426 lhs = PHI_RESULT (phi);
2427 code = gimple_assign_rhs_code (first_stmt);
2428 while (1)
2430 nloop_uses = 0;
2431 n_out_of_loop_uses = 0;
2432 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, lhs)
2434 gimple *use_stmt = USE_STMT (use_p);
2435 if (is_gimple_debug (use_stmt))
2436 continue;
2438 /* Check if we got back to the reduction phi. */
2439 if (use_stmt == phi)
2441 loop_use_stmt = use_stmt;
2442 found = true;
2443 break;
2446 if (flow_bb_inside_loop_p (loop, gimple_bb (use_stmt)))
2448 loop_use_stmt = use_stmt;
2449 nloop_uses++;
2451 else
2452 n_out_of_loop_uses++;
2454 /* There can be either a single use in the loop or two uses in
2455 phi nodes. */
2456 if (nloop_uses > 1 || (n_out_of_loop_uses && nloop_uses))
2457 return false;
2460 if (found)
2461 break;
2463 /* We reached a statement with no loop uses. */
2464 if (nloop_uses == 0)
2465 return false;
2467 /* This is a loop exit phi, and we haven't reached the reduction phi. */
2468 if (gimple_code (loop_use_stmt) == GIMPLE_PHI)
2469 return false;
2471 if (!is_gimple_assign (loop_use_stmt)
2472 || code != gimple_assign_rhs_code (loop_use_stmt)
2473 || !flow_bb_inside_loop_p (loop, gimple_bb (loop_use_stmt)))
2474 return false;
2476 /* Insert USE_STMT into reduction chain. */
2477 use_stmt_info = vinfo_for_stmt (loop_use_stmt);
2478 if (current_stmt)
2480 current_stmt_info = vinfo_for_stmt (current_stmt);
2481 GROUP_NEXT_ELEMENT (current_stmt_info) = loop_use_stmt;
2482 GROUP_FIRST_ELEMENT (use_stmt_info)
2483 = GROUP_FIRST_ELEMENT (current_stmt_info);
2485 else
2486 GROUP_FIRST_ELEMENT (use_stmt_info) = loop_use_stmt;
2488 lhs = gimple_assign_lhs (loop_use_stmt);
2489 current_stmt = loop_use_stmt;
2490 size++;
2493 if (!found || loop_use_stmt != phi || size < 2)
2494 return false;
2496 /* Swap the operands, if needed, to make the reduction operand be the second
2497 operand. */
2498 lhs = PHI_RESULT (phi);
2499 next_stmt = GROUP_FIRST_ELEMENT (vinfo_for_stmt (current_stmt));
2500 while (next_stmt)
2502 if (gimple_assign_rhs2 (next_stmt) == lhs)
2504 tree op = gimple_assign_rhs1 (next_stmt);
2505 gimple *def_stmt = NULL;
2507 if (TREE_CODE (op) == SSA_NAME)
2508 def_stmt = SSA_NAME_DEF_STMT (op);
2510 /* Check that the other def is either defined in the loop
2511 ("vect_internal_def"), or it's an induction (defined by a
2512 loop-header phi-node). */
2513 if (def_stmt
2514 && gimple_bb (def_stmt)
2515 && flow_bb_inside_loop_p (loop, gimple_bb (def_stmt))
2516 && (is_gimple_assign (def_stmt)
2517 || is_gimple_call (def_stmt)
2518 || STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def_stmt))
2519 == vect_induction_def
2520 || (gimple_code (def_stmt) == GIMPLE_PHI
2521 && STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def_stmt))
2522 == vect_internal_def
2523 && !is_loop_header_bb_p (gimple_bb (def_stmt)))))
2525 lhs = gimple_assign_lhs (next_stmt);
2526 next_stmt = GROUP_NEXT_ELEMENT (vinfo_for_stmt (next_stmt));
2527 continue;
2530 return false;
2532 else
2534 tree op = gimple_assign_rhs2 (next_stmt);
2535 gimple *def_stmt = NULL;
2537 if (TREE_CODE (op) == SSA_NAME)
2538 def_stmt = SSA_NAME_DEF_STMT (op);
2540 /* Check that the other def is either defined in the loop
2541 ("vect_internal_def"), or it's an induction (defined by a
2542 loop-header phi-node). */
2543 if (def_stmt
2544 && gimple_bb (def_stmt)
2545 && flow_bb_inside_loop_p (loop, gimple_bb (def_stmt))
2546 && (is_gimple_assign (def_stmt)
2547 || is_gimple_call (def_stmt)
2548 || STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def_stmt))
2549 == vect_induction_def
2550 || (gimple_code (def_stmt) == GIMPLE_PHI
2551 && STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def_stmt))
2552 == vect_internal_def
2553 && !is_loop_header_bb_p (gimple_bb (def_stmt)))))
2555 if (dump_enabled_p ())
2557 dump_printf_loc (MSG_NOTE, vect_location, "swapping oprnds: ");
2558 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, next_stmt, 0);
2561 swap_ssa_operands (next_stmt,
2562 gimple_assign_rhs1_ptr (next_stmt),
2563 gimple_assign_rhs2_ptr (next_stmt));
2564 update_stmt (next_stmt);
2566 if (CONSTANT_CLASS_P (gimple_assign_rhs1 (next_stmt)))
2567 LOOP_VINFO_OPERANDS_SWAPPED (loop_info) = true;
2569 else
2570 return false;
2573 lhs = gimple_assign_lhs (next_stmt);
2574 next_stmt = GROUP_NEXT_ELEMENT (vinfo_for_stmt (next_stmt));
2577 /* Save the chain for further analysis in SLP detection. */
2578 first = GROUP_FIRST_ELEMENT (vinfo_for_stmt (current_stmt));
2579 LOOP_VINFO_REDUCTION_CHAINS (loop_info).safe_push (first);
2580 GROUP_SIZE (vinfo_for_stmt (first)) = size;
2582 return true;
2586 /* Function vect_is_simple_reduction
2588 (1) Detect a cross-iteration def-use cycle that represents a simple
2589 reduction computation. We look for the following pattern:
2591 loop_header:
2592 a1 = phi < a0, a2 >
2593 a3 = ...
2594 a2 = operation (a3, a1)
2598 a3 = ...
2599 loop_header:
2600 a1 = phi < a0, a2 >
2601 a2 = operation (a3, a1)
2603 such that:
2604 1. operation is commutative and associative and it is safe to
2605 change the order of the computation (if CHECK_REDUCTION is true)
2606 2. no uses for a2 in the loop (a2 is used out of the loop)
2607 3. no uses of a1 in the loop besides the reduction operation
2608 4. no uses of a1 outside the loop.
2610 Conditions 1,4 are tested here.
2611 Conditions 2,3 are tested in vect_mark_stmts_to_be_vectorized.
2613 (2) Detect a cross-iteration def-use cycle in nested loops, i.e.,
2614 nested cycles, if CHECK_REDUCTION is false.
2616 (3) Detect cycles of phi nodes in outer-loop vectorization, i.e., double
2617 reductions:
2619 a1 = phi < a0, a2 >
2620 inner loop (def of a3)
2621 a2 = phi < a3 >
2623 (4) Detect condition expressions, i.e.:
2624 for (int i = 0; i < N; i++)
2625 if (a[i] < val)
2626 ret_val = a[i];
2630 static gimple *
2631 vect_is_simple_reduction (loop_vec_info loop_info, gimple *phi,
2632 bool check_reduction, bool *double_reduc,
2633 bool need_wrapping_integral_overflow,
2634 enum vect_reduction_type *v_reduc_type)
2636 struct loop *loop = (gimple_bb (phi))->loop_father;
2637 struct loop *vect_loop = LOOP_VINFO_LOOP (loop_info);
2638 edge latch_e = loop_latch_edge (loop);
2639 tree loop_arg = PHI_ARG_DEF_FROM_EDGE (phi, latch_e);
2640 gimple *def_stmt, *def1 = NULL, *def2 = NULL, *phi_use_stmt = NULL;
2641 enum tree_code orig_code, code;
2642 tree op1, op2, op3 = NULL_TREE, op4 = NULL_TREE;
2643 tree type;
2644 int nloop_uses;
2645 tree name;
2646 imm_use_iterator imm_iter;
2647 use_operand_p use_p;
2648 bool phi_def;
2650 *double_reduc = false;
2651 *v_reduc_type = TREE_CODE_REDUCTION;
2653 /* If CHECK_REDUCTION is true, we assume inner-most loop vectorization,
2654 otherwise, we assume outer loop vectorization. */
2655 gcc_assert ((check_reduction && loop == vect_loop)
2656 || (!check_reduction && flow_loop_nested_p (vect_loop, loop)));
2658 name = PHI_RESULT (phi);
2659 /* ??? If there are no uses of the PHI result the inner loop reduction
2660 won't be detected as possibly double-reduction by vectorizable_reduction
2661 because that tries to walk the PHI arg from the preheader edge which
2662 can be constant. See PR60382. */
2663 if (has_zero_uses (name))
2664 return NULL;
2665 nloop_uses = 0;
2666 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, name)
2668 gimple *use_stmt = USE_STMT (use_p);
2669 if (is_gimple_debug (use_stmt))
2670 continue;
2672 if (!flow_bb_inside_loop_p (loop, gimple_bb (use_stmt)))
2674 if (dump_enabled_p ())
2675 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2676 "intermediate value used outside loop.\n");
2678 return NULL;
2681 nloop_uses++;
2682 if (nloop_uses > 1)
2684 if (dump_enabled_p ())
2685 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2686 "reduction used in loop.\n");
2687 return NULL;
2690 phi_use_stmt = use_stmt;
2693 if (TREE_CODE (loop_arg) != SSA_NAME)
2695 if (dump_enabled_p ())
2697 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2698 "reduction: not ssa_name: ");
2699 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, loop_arg);
2700 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
2702 return NULL;
2705 def_stmt = SSA_NAME_DEF_STMT (loop_arg);
2706 if (!def_stmt)
2708 if (dump_enabled_p ())
2709 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2710 "reduction: no def_stmt.\n");
2711 return NULL;
2714 if (!is_gimple_assign (def_stmt) && gimple_code (def_stmt) != GIMPLE_PHI)
2716 if (dump_enabled_p ())
2717 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, def_stmt, 0);
2718 return NULL;
2721 if (is_gimple_assign (def_stmt))
2723 name = gimple_assign_lhs (def_stmt);
2724 phi_def = false;
2726 else
2728 name = PHI_RESULT (def_stmt);
2729 phi_def = true;
2732 nloop_uses = 0;
2733 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, name)
2735 gimple *use_stmt = USE_STMT (use_p);
2736 if (is_gimple_debug (use_stmt))
2737 continue;
2738 if (flow_bb_inside_loop_p (loop, gimple_bb (use_stmt)))
2739 nloop_uses++;
2740 if (nloop_uses > 1)
2742 if (dump_enabled_p ())
2743 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2744 "reduction used in loop.\n");
2745 return NULL;
2749 /* If DEF_STMT is a phi node itself, we expect it to have a single argument
2750 defined in the inner loop. */
2751 if (phi_def)
2753 op1 = PHI_ARG_DEF (def_stmt, 0);
2755 if (gimple_phi_num_args (def_stmt) != 1
2756 || TREE_CODE (op1) != SSA_NAME)
2758 if (dump_enabled_p ())
2759 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2760 "unsupported phi node definition.\n");
2762 return NULL;
2765 def1 = SSA_NAME_DEF_STMT (op1);
2766 if (gimple_bb (def1)
2767 && flow_bb_inside_loop_p (loop, gimple_bb (def_stmt))
2768 && loop->inner
2769 && flow_bb_inside_loop_p (loop->inner, gimple_bb (def1))
2770 && is_gimple_assign (def1)
2771 && flow_bb_inside_loop_p (loop->inner, gimple_bb (phi_use_stmt)))
2773 if (dump_enabled_p ())
2774 report_vect_op (MSG_NOTE, def_stmt,
2775 "detected double reduction: ");
2777 *double_reduc = true;
2778 return def_stmt;
2781 return NULL;
2784 code = orig_code = gimple_assign_rhs_code (def_stmt);
2786 /* We can handle "res -= x[i]", which is non-associative by
2787 simply rewriting this into "res += -x[i]". Avoid changing
2788 gimple instruction for the first simple tests and only do this
2789 if we're allowed to change code at all. */
2790 if (code == MINUS_EXPR
2791 && (op1 = gimple_assign_rhs1 (def_stmt))
2792 && TREE_CODE (op1) == SSA_NAME
2793 && SSA_NAME_DEF_STMT (op1) == phi)
2794 code = PLUS_EXPR;
2796 if (code == COND_EXPR)
2798 if (check_reduction)
2799 *v_reduc_type = COND_REDUCTION;
2801 else if (!commutative_tree_code (code) || !associative_tree_code (code))
2803 if (dump_enabled_p ())
2804 report_vect_op (MSG_MISSED_OPTIMIZATION, def_stmt,
2805 "reduction: not commutative/associative: ");
2806 return NULL;
2809 if (get_gimple_rhs_class (code) != GIMPLE_BINARY_RHS)
2811 if (code != COND_EXPR)
2813 if (dump_enabled_p ())
2814 report_vect_op (MSG_MISSED_OPTIMIZATION, def_stmt,
2815 "reduction: not binary operation: ");
2817 return NULL;
2820 op3 = gimple_assign_rhs1 (def_stmt);
2821 if (COMPARISON_CLASS_P (op3))
2823 op4 = TREE_OPERAND (op3, 1);
2824 op3 = TREE_OPERAND (op3, 0);
2827 op1 = gimple_assign_rhs2 (def_stmt);
2828 op2 = gimple_assign_rhs3 (def_stmt);
2830 if (TREE_CODE (op1) != SSA_NAME && TREE_CODE (op2) != SSA_NAME)
2832 if (dump_enabled_p ())
2833 report_vect_op (MSG_MISSED_OPTIMIZATION, def_stmt,
2834 "reduction: uses not ssa_names: ");
2836 return NULL;
2839 else
2841 op1 = gimple_assign_rhs1 (def_stmt);
2842 op2 = gimple_assign_rhs2 (def_stmt);
2844 if (TREE_CODE (op1) != SSA_NAME && TREE_CODE (op2) != SSA_NAME)
2846 if (dump_enabled_p ())
2847 report_vect_op (MSG_MISSED_OPTIMIZATION, def_stmt,
2848 "reduction: uses not ssa_names: ");
2850 return NULL;
2854 type = TREE_TYPE (gimple_assign_lhs (def_stmt));
2855 if ((TREE_CODE (op1) == SSA_NAME
2856 && !types_compatible_p (type,TREE_TYPE (op1)))
2857 || (TREE_CODE (op2) == SSA_NAME
2858 && !types_compatible_p (type, TREE_TYPE (op2)))
2859 || (op3 && TREE_CODE (op3) == SSA_NAME
2860 && !types_compatible_p (type, TREE_TYPE (op3)))
2861 || (op4 && TREE_CODE (op4) == SSA_NAME
2862 && !types_compatible_p (type, TREE_TYPE (op4))))
2864 if (dump_enabled_p ())
2866 dump_printf_loc (MSG_NOTE, vect_location,
2867 "reduction: multiple types: operation type: ");
2868 dump_generic_expr (MSG_NOTE, TDF_SLIM, type);
2869 dump_printf (MSG_NOTE, ", operands types: ");
2870 dump_generic_expr (MSG_NOTE, TDF_SLIM,
2871 TREE_TYPE (op1));
2872 dump_printf (MSG_NOTE, ",");
2873 dump_generic_expr (MSG_NOTE, TDF_SLIM,
2874 TREE_TYPE (op2));
2875 if (op3)
2877 dump_printf (MSG_NOTE, ",");
2878 dump_generic_expr (MSG_NOTE, TDF_SLIM,
2879 TREE_TYPE (op3));
2882 if (op4)
2884 dump_printf (MSG_NOTE, ",");
2885 dump_generic_expr (MSG_NOTE, TDF_SLIM,
2886 TREE_TYPE (op4));
2888 dump_printf (MSG_NOTE, "\n");
2891 return NULL;
2894 /* Check that it's ok to change the order of the computation.
2895 Generally, when vectorizing a reduction we change the order of the
2896 computation. This may change the behavior of the program in some
2897 cases, so we need to check that this is ok. One exception is when
2898 vectorizing an outer-loop: the inner-loop is executed sequentially,
2899 and therefore vectorizing reductions in the inner-loop during
2900 outer-loop vectorization is safe. */
2902 if (*v_reduc_type != COND_REDUCTION
2903 && check_reduction)
2905 /* CHECKME: check for !flag_finite_math_only too? */
2906 if (SCALAR_FLOAT_TYPE_P (type) && !flag_associative_math)
2908 /* Changing the order of operations changes the semantics. */
2909 if (dump_enabled_p ())
2910 report_vect_op (MSG_MISSED_OPTIMIZATION, def_stmt,
2911 "reduction: unsafe fp math optimization: ");
2912 return NULL;
2914 else if (INTEGRAL_TYPE_P (type))
2916 if (!operation_no_trapping_overflow (type, code))
2918 /* Changing the order of operations changes the semantics. */
2919 if (dump_enabled_p ())
2920 report_vect_op (MSG_MISSED_OPTIMIZATION, def_stmt,
2921 "reduction: unsafe int math optimization"
2922 " (overflow traps): ");
2923 return NULL;
2925 if (need_wrapping_integral_overflow
2926 && !TYPE_OVERFLOW_WRAPS (type)
2927 && operation_can_overflow (code))
2929 /* Changing the order of operations changes the semantics. */
2930 if (dump_enabled_p ())
2931 report_vect_op (MSG_MISSED_OPTIMIZATION, def_stmt,
2932 "reduction: unsafe int math optimization"
2933 " (overflow doesn't wrap): ");
2934 return NULL;
2937 else if (SAT_FIXED_POINT_TYPE_P (type))
2939 /* Changing the order of operations changes the semantics. */
2940 if (dump_enabled_p ())
2941 report_vect_op (MSG_MISSED_OPTIMIZATION, def_stmt,
2942 "reduction: unsafe fixed-point math optimization: ");
2943 return NULL;
2947 /* Reduction is safe. We're dealing with one of the following:
2948 1) integer arithmetic and no trapv
2949 2) floating point arithmetic, and special flags permit this optimization
2950 3) nested cycle (i.e., outer loop vectorization). */
2951 if (TREE_CODE (op1) == SSA_NAME)
2952 def1 = SSA_NAME_DEF_STMT (op1);
2954 if (TREE_CODE (op2) == SSA_NAME)
2955 def2 = SSA_NAME_DEF_STMT (op2);
2957 if (code != COND_EXPR
2958 && ((!def1 || gimple_nop_p (def1)) && (!def2 || gimple_nop_p (def2))))
2960 if (dump_enabled_p ())
2961 report_vect_op (MSG_NOTE, def_stmt, "reduction: no defs for operands: ");
2962 return NULL;
2965 /* Check that one def is the reduction def, defined by PHI,
2966 the other def is either defined in the loop ("vect_internal_def"),
2967 or it's an induction (defined by a loop-header phi-node). */
2969 if (def2 && def2 == phi
2970 && (code == COND_EXPR
2971 || !def1 || gimple_nop_p (def1)
2972 || !flow_bb_inside_loop_p (loop, gimple_bb (def1))
2973 || (def1 && flow_bb_inside_loop_p (loop, gimple_bb (def1))
2974 && (is_gimple_assign (def1)
2975 || is_gimple_call (def1)
2976 || STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def1))
2977 == vect_induction_def
2978 || (gimple_code (def1) == GIMPLE_PHI
2979 && STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def1))
2980 == vect_internal_def
2981 && !is_loop_header_bb_p (gimple_bb (def1)))))))
2983 if (dump_enabled_p ())
2984 report_vect_op (MSG_NOTE, def_stmt, "detected reduction: ");
2985 return def_stmt;
2988 if (def1 && def1 == phi
2989 && (code == COND_EXPR
2990 || !def2 || gimple_nop_p (def2)
2991 || !flow_bb_inside_loop_p (loop, gimple_bb (def2))
2992 || (def2 && flow_bb_inside_loop_p (loop, gimple_bb (def2))
2993 && (is_gimple_assign (def2)
2994 || is_gimple_call (def2)
2995 || STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def2))
2996 == vect_induction_def
2997 || (gimple_code (def2) == GIMPLE_PHI
2998 && STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def2))
2999 == vect_internal_def
3000 && !is_loop_header_bb_p (gimple_bb (def2)))))))
3002 if (check_reduction
3003 && orig_code != MINUS_EXPR)
3005 if (code == COND_EXPR)
3007 /* No currently known use where this case would be useful. */
3008 if (dump_enabled_p ())
3009 report_vect_op (MSG_NOTE, def_stmt,
3010 "detected reduction: cannot currently swap "
3011 "operands for cond_expr");
3012 return NULL;
3015 /* Swap operands (just for simplicity - so that the rest of the code
3016 can assume that the reduction variable is always the last (second)
3017 argument). */
3018 if (dump_enabled_p ())
3019 report_vect_op (MSG_NOTE, def_stmt,
3020 "detected reduction: need to swap operands: ");
3022 swap_ssa_operands (def_stmt, gimple_assign_rhs1_ptr (def_stmt),
3023 gimple_assign_rhs2_ptr (def_stmt));
3025 if (CONSTANT_CLASS_P (gimple_assign_rhs1 (def_stmt)))
3026 LOOP_VINFO_OPERANDS_SWAPPED (loop_info) = true;
3028 else
3030 if (dump_enabled_p ())
3031 report_vect_op (MSG_NOTE, def_stmt, "detected reduction: ");
3034 return def_stmt;
3037 /* Try to find SLP reduction chain. */
3038 if (check_reduction && code != COND_EXPR
3039 && vect_is_slp_reduction (loop_info, phi, def_stmt))
3041 if (dump_enabled_p ())
3042 report_vect_op (MSG_NOTE, def_stmt,
3043 "reduction: detected reduction chain: ");
3045 return def_stmt;
3048 if (dump_enabled_p ())
3049 report_vect_op (MSG_MISSED_OPTIMIZATION, def_stmt,
3050 "reduction: unknown pattern: ");
3052 return NULL;
3055 /* Wrapper around vect_is_simple_reduction, which will modify code
3056 in-place if that enables detection of more reductions. The arguments
3057 are the same as for that function. */
3059 gimple *
3060 vect_force_simple_reduction (loop_vec_info loop_info, gimple *phi,
3061 bool check_reduction, bool *double_reduc,
3062 bool need_wrapping_integral_overflow)
3064 enum vect_reduction_type v_reduc_type;
3065 return vect_is_simple_reduction (loop_info, phi, check_reduction,
3066 double_reduc,
3067 need_wrapping_integral_overflow,
3068 &v_reduc_type);
3071 /* Calculate cost of peeling the loop PEEL_ITERS_PROLOGUE times. */
3073 vect_get_known_peeling_cost (loop_vec_info loop_vinfo, int peel_iters_prologue,
3074 int *peel_iters_epilogue,
3075 stmt_vector_for_cost *scalar_cost_vec,
3076 stmt_vector_for_cost *prologue_cost_vec,
3077 stmt_vector_for_cost *epilogue_cost_vec)
3079 int retval = 0;
3080 int vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
3082 if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
3084 *peel_iters_epilogue = vf/2;
3085 if (dump_enabled_p ())
3086 dump_printf_loc (MSG_NOTE, vect_location,
3087 "cost model: epilogue peel iters set to vf/2 "
3088 "because loop iterations are unknown .\n");
3090 /* If peeled iterations are known but the number of scalar loop
3091 iterations is unknown, count a taken branch per peeled loop. */
3092 retval = record_stmt_cost (prologue_cost_vec, 1, cond_branch_taken,
3093 NULL, 0, vect_prologue);
3094 retval = record_stmt_cost (prologue_cost_vec, 1, cond_branch_taken,
3095 NULL, 0, vect_epilogue);
3097 else
3099 int niters = LOOP_VINFO_INT_NITERS (loop_vinfo);
3100 peel_iters_prologue = niters < peel_iters_prologue ?
3101 niters : peel_iters_prologue;
3102 *peel_iters_epilogue = (niters - peel_iters_prologue) % vf;
3103 /* If we need to peel for gaps but the computed epilogue peel count is
3104 zero, we have to peel VF iterations. */
3105 if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo) && !*peel_iters_epilogue)
3106 *peel_iters_epilogue = vf;
3109 stmt_info_for_cost *si;
3110 int j;
3111 if (peel_iters_prologue)
3112 FOR_EACH_VEC_ELT (*scalar_cost_vec, j, si)
3113 retval += record_stmt_cost (prologue_cost_vec,
3114 si->count * peel_iters_prologue,
3115 si->kind, NULL, si->misalign,
3116 vect_prologue);
3117 if (*peel_iters_epilogue)
3118 FOR_EACH_VEC_ELT (*scalar_cost_vec, j, si)
3119 retval += record_stmt_cost (epilogue_cost_vec,
3120 si->count * *peel_iters_epilogue,
3121 si->kind, NULL, si->misalign,
3122 vect_epilogue);
3124 return retval;
3127 /* Function vect_estimate_min_profitable_iters
3129 Return the number of iterations required for the vector version of the
3130 loop to be profitable relative to the cost of the scalar version of the
3131 loop.
3133 *RET_MIN_PROFITABLE_NITERS is a cost model profitability threshold
3134 of iterations for vectorization. A value of -1 means loop vectorization
3135 is not profitable. This returned value may be used for a dynamic
3136 profitability check.
3138 *RET_MIN_PROFITABLE_ESTIMATE is a profitability threshold to be used
3139 for static check against estimated number of iterations. */
3141 static void
3142 vect_estimate_min_profitable_iters (loop_vec_info loop_vinfo,
3143 int *ret_min_profitable_niters,
3144 int *ret_min_profitable_estimate)
3146 int min_profitable_iters;
3147 int min_profitable_estimate;
3148 int peel_iters_prologue;
3149 int peel_iters_epilogue;
3150 unsigned vec_inside_cost = 0;
3151 int vec_outside_cost = 0;
3152 unsigned vec_prologue_cost = 0;
3153 unsigned vec_epilogue_cost = 0;
3154 int scalar_single_iter_cost = 0;
3155 int scalar_outside_cost = 0;
3156 int vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
3157 int npeel = LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo);
3158 void *target_cost_data = LOOP_VINFO_TARGET_COST_DATA (loop_vinfo);
3160 /* Cost model disabled. */
3161 if (unlimited_cost_model (LOOP_VINFO_LOOP (loop_vinfo)))
3163 dump_printf_loc (MSG_NOTE, vect_location, "cost model disabled.\n");
3164 *ret_min_profitable_niters = 0;
3165 *ret_min_profitable_estimate = 0;
3166 return;
3169 /* Requires loop versioning tests to handle misalignment. */
3170 if (LOOP_REQUIRES_VERSIONING_FOR_ALIGNMENT (loop_vinfo))
3172 /* FIXME: Make cost depend on complexity of individual check. */
3173 unsigned len = LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).length ();
3174 (void) add_stmt_cost (target_cost_data, len, vector_stmt, NULL, 0,
3175 vect_prologue);
3176 dump_printf (MSG_NOTE,
3177 "cost model: Adding cost of checks for loop "
3178 "versioning to treat misalignment.\n");
3181 /* Requires loop versioning with alias checks. */
3182 if (LOOP_REQUIRES_VERSIONING_FOR_ALIAS (loop_vinfo))
3184 /* FIXME: Make cost depend on complexity of individual check. */
3185 unsigned len = LOOP_VINFO_COMP_ALIAS_DDRS (loop_vinfo).length ();
3186 (void) add_stmt_cost (target_cost_data, len, vector_stmt, NULL, 0,
3187 vect_prologue);
3188 dump_printf (MSG_NOTE,
3189 "cost model: Adding cost of checks for loop "
3190 "versioning aliasing.\n");
3193 /* Requires loop versioning with niter checks. */
3194 if (LOOP_REQUIRES_VERSIONING_FOR_NITERS (loop_vinfo))
3196 /* FIXME: Make cost depend on complexity of individual check. */
3197 (void) add_stmt_cost (target_cost_data, 1, vector_stmt, NULL, 0,
3198 vect_prologue);
3199 dump_printf (MSG_NOTE,
3200 "cost model: Adding cost of checks for loop "
3201 "versioning niters.\n");
3204 if (LOOP_REQUIRES_VERSIONING (loop_vinfo))
3205 (void) add_stmt_cost (target_cost_data, 1, cond_branch_taken, NULL, 0,
3206 vect_prologue);
3208 /* Count statements in scalar loop. Using this as scalar cost for a single
3209 iteration for now.
3211 TODO: Add outer loop support.
3213 TODO: Consider assigning different costs to different scalar
3214 statements. */
3216 scalar_single_iter_cost
3217 = LOOP_VINFO_SINGLE_SCALAR_ITERATION_COST (loop_vinfo);
3219 /* Add additional cost for the peeled instructions in prologue and epilogue
3220 loop.
3222 FORNOW: If we don't know the value of peel_iters for prologue or epilogue
3223 at compile-time - we assume it's vf/2 (the worst would be vf-1).
3225 TODO: Build an expression that represents peel_iters for prologue and
3226 epilogue to be used in a run-time test. */
3228 if (npeel < 0)
3230 peel_iters_prologue = vf/2;
3231 dump_printf (MSG_NOTE, "cost model: "
3232 "prologue peel iters set to vf/2.\n");
3234 /* If peeling for alignment is unknown, the loop bound of the main loop
3235 becomes unknown. */
3236 peel_iters_epilogue = vf/2;
3237 dump_printf (MSG_NOTE, "cost model: "
3238 "epilogue peel iters set to vf/2 because "
3239 "peeling for alignment is unknown.\n");
3241 /* If peeled iterations are unknown, count a taken branch and a not taken
3242 branch per peeled loop. Even if scalar loop iterations are known,
3243 vector iterations are not known since peeled prologue iterations are
3244 not known. Hence guards remain the same. */
3245 (void) add_stmt_cost (target_cost_data, 1, cond_branch_taken,
3246 NULL, 0, vect_prologue);
3247 (void) add_stmt_cost (target_cost_data, 1, cond_branch_not_taken,
3248 NULL, 0, vect_prologue);
3249 (void) add_stmt_cost (target_cost_data, 1, cond_branch_taken,
3250 NULL, 0, vect_epilogue);
3251 (void) add_stmt_cost (target_cost_data, 1, cond_branch_not_taken,
3252 NULL, 0, vect_epilogue);
3253 stmt_info_for_cost *si;
3254 int j;
3255 FOR_EACH_VEC_ELT (LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo), j, si)
3257 struct _stmt_vec_info *stmt_info
3258 = si->stmt ? vinfo_for_stmt (si->stmt) : NULL;
3259 (void) add_stmt_cost (target_cost_data,
3260 si->count * peel_iters_prologue,
3261 si->kind, stmt_info, si->misalign,
3262 vect_prologue);
3263 (void) add_stmt_cost (target_cost_data,
3264 si->count * peel_iters_epilogue,
3265 si->kind, stmt_info, si->misalign,
3266 vect_epilogue);
3269 else
3271 stmt_vector_for_cost prologue_cost_vec, epilogue_cost_vec;
3272 stmt_info_for_cost *si;
3273 int j;
3274 void *data = LOOP_VINFO_TARGET_COST_DATA (loop_vinfo);
3276 prologue_cost_vec.create (2);
3277 epilogue_cost_vec.create (2);
3278 peel_iters_prologue = npeel;
3280 (void) vect_get_known_peeling_cost (loop_vinfo, peel_iters_prologue,
3281 &peel_iters_epilogue,
3282 &LOOP_VINFO_SCALAR_ITERATION_COST
3283 (loop_vinfo),
3284 &prologue_cost_vec,
3285 &epilogue_cost_vec);
3287 FOR_EACH_VEC_ELT (prologue_cost_vec, j, si)
3289 struct _stmt_vec_info *stmt_info
3290 = si->stmt ? vinfo_for_stmt (si->stmt) : NULL;
3291 (void) add_stmt_cost (data, si->count, si->kind, stmt_info,
3292 si->misalign, vect_prologue);
3295 FOR_EACH_VEC_ELT (epilogue_cost_vec, j, si)
3297 struct _stmt_vec_info *stmt_info
3298 = si->stmt ? vinfo_for_stmt (si->stmt) : NULL;
3299 (void) add_stmt_cost (data, si->count, si->kind, stmt_info,
3300 si->misalign, vect_epilogue);
3303 prologue_cost_vec.release ();
3304 epilogue_cost_vec.release ();
3307 /* FORNOW: The scalar outside cost is incremented in one of the
3308 following ways:
3310 1. The vectorizer checks for alignment and aliasing and generates
3311 a condition that allows dynamic vectorization. A cost model
3312 check is ANDed with the versioning condition. Hence the scalar code
3313 path now has the added cost of the versioning check.
3315 if (cost > th & versioning_check)
3316 jmp to vector code
3318 Hence run-time scalar is incremented by not-taken branch cost.
3320 2. The vectorizer then checks if a prologue is required. If the
3321 cost model check was not done before during versioning, it has to
3322 be done before the prologue check.
3324 if (cost <= th)
3325 prologue = scalar_iters
3326 if (prologue == 0)
3327 jmp to vector code
3328 else
3329 execute prologue
3330 if (prologue == num_iters)
3331 go to exit
3333 Hence the run-time scalar cost is incremented by a taken branch,
3334 plus a not-taken branch, plus a taken branch cost.
3336 3. The vectorizer then checks if an epilogue is required. If the
3337 cost model check was not done before during prologue check, it
3338 has to be done with the epilogue check.
3340 if (prologue == 0)
3341 jmp to vector code
3342 else
3343 execute prologue
3344 if (prologue == num_iters)
3345 go to exit
3346 vector code:
3347 if ((cost <= th) | (scalar_iters-prologue-epilogue == 0))
3348 jmp to epilogue
3350 Hence the run-time scalar cost should be incremented by 2 taken
3351 branches.
3353 TODO: The back end may reorder the BBS's differently and reverse
3354 conditions/branch directions. Change the estimates below to
3355 something more reasonable. */
3357 /* If the number of iterations is known and we do not do versioning, we can
3358 decide whether to vectorize at compile time. Hence the scalar version
3359 does not carry cost model guard costs. */
3360 if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
3361 || LOOP_REQUIRES_VERSIONING (loop_vinfo))
3363 /* Cost model check occurs at versioning. */
3364 if (LOOP_REQUIRES_VERSIONING (loop_vinfo))
3365 scalar_outside_cost += vect_get_stmt_cost (cond_branch_not_taken);
3366 else
3368 /* Cost model check occurs at prologue generation. */
3369 if (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) < 0)
3370 scalar_outside_cost += 2 * vect_get_stmt_cost (cond_branch_taken)
3371 + vect_get_stmt_cost (cond_branch_not_taken);
3372 /* Cost model check occurs at epilogue generation. */
3373 else
3374 scalar_outside_cost += 2 * vect_get_stmt_cost (cond_branch_taken);
3378 /* Complete the target-specific cost calculations. */
3379 finish_cost (LOOP_VINFO_TARGET_COST_DATA (loop_vinfo), &vec_prologue_cost,
3380 &vec_inside_cost, &vec_epilogue_cost);
3382 vec_outside_cost = (int)(vec_prologue_cost + vec_epilogue_cost);
3384 if (dump_enabled_p ())
3386 dump_printf_loc (MSG_NOTE, vect_location, "Cost model analysis: \n");
3387 dump_printf (MSG_NOTE, " Vector inside of loop cost: %d\n",
3388 vec_inside_cost);
3389 dump_printf (MSG_NOTE, " Vector prologue cost: %d\n",
3390 vec_prologue_cost);
3391 dump_printf (MSG_NOTE, " Vector epilogue cost: %d\n",
3392 vec_epilogue_cost);
3393 dump_printf (MSG_NOTE, " Scalar iteration cost: %d\n",
3394 scalar_single_iter_cost);
3395 dump_printf (MSG_NOTE, " Scalar outside cost: %d\n",
3396 scalar_outside_cost);
3397 dump_printf (MSG_NOTE, " Vector outside cost: %d\n",
3398 vec_outside_cost);
3399 dump_printf (MSG_NOTE, " prologue iterations: %d\n",
3400 peel_iters_prologue);
3401 dump_printf (MSG_NOTE, " epilogue iterations: %d\n",
3402 peel_iters_epilogue);
3405 /* Calculate number of iterations required to make the vector version
3406 profitable, relative to the loop bodies only. The following condition
3407 must hold true:
3408 SIC * niters + SOC > VIC * ((niters-PL_ITERS-EP_ITERS)/VF) + VOC
3409 where
3410 SIC = scalar iteration cost, VIC = vector iteration cost,
3411 VOC = vector outside cost, VF = vectorization factor,
3412 PL_ITERS = prologue iterations, EP_ITERS= epilogue iterations
3413 SOC = scalar outside cost for run time cost model check. */
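/* Worked example with made-up costs: SIC = 4, VIC = 6, VOC = 20, SOC = 6,
   VF = 4 and no peeling (PL_ITERS = EP_ITERS = 0) give
   min_profitable_iters = ((20 - 6) * 4) / (4 * 4 - 6) = 56 / 10 = 5,
   which the correction below bumps to 6 because
   4 * 4 * 5 <= 6 * 5 + (20 - 6) * 4.  */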
3415 if ((scalar_single_iter_cost * vf) > (int) vec_inside_cost)
3417 if (vec_outside_cost <= 0)
3418 min_profitable_iters = 1;
3419 else
3421 min_profitable_iters = ((vec_outside_cost - scalar_outside_cost) * vf
3422 - vec_inside_cost * peel_iters_prologue
3423 - vec_inside_cost * peel_iters_epilogue)
3424 / ((scalar_single_iter_cost * vf)
3425 - vec_inside_cost);
3427 if ((scalar_single_iter_cost * vf * min_profitable_iters)
3428 <= (((int) vec_inside_cost * min_profitable_iters)
3429 + (((int) vec_outside_cost - scalar_outside_cost) * vf)))
3430 min_profitable_iters++;
3433 /* vector version will never be profitable. */
3434 else
3436 if (LOOP_VINFO_LOOP (loop_vinfo)->force_vectorize)
3437 warning_at (vect_location, OPT_Wopenmp_simd, "vectorization "
3438 "did not happen for a simd loop");
3440 if (dump_enabled_p ())
3441 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3442 "cost model: the vector iteration cost = %d "
3443 "divided by the scalar iteration cost = %d "
3444 "is greater or equal to the vectorization factor = %d"
3445 ".\n",
3446 vec_inside_cost, scalar_single_iter_cost, vf);
3447 *ret_min_profitable_niters = -1;
3448 *ret_min_profitable_estimate = -1;
3449 return;
3452 dump_printf (MSG_NOTE,
3453 " Calculated minimum iters for profitability: %d\n",
3454 min_profitable_iters);
3456 min_profitable_iters =
3457 min_profitable_iters < vf ? vf : min_profitable_iters;
3459 /* Because the condition we create is:
3460 if (niters <= min_profitable_iters)
3461 then skip the vectorized loop. */
3462 min_profitable_iters--;
3464 if (dump_enabled_p ())
3465 dump_printf_loc (MSG_NOTE, vect_location,
3466 " Runtime profitability threshold = %d\n",
3467 min_profitable_iters);
3469 *ret_min_profitable_niters = min_profitable_iters;
3471 /* Calculate number of iterations required to make the vector version
3472 profitable, relative to the loop bodies only.
3474 The cost of the non-vectorized variant is SIC * niters; the vector
3475 variant must win over it on the expected loop trip count, i.e. the following condition must hold true:
3476 SIC * niters > VIC * ((niters-PL_ITERS-EP_ITERS)/VF) + VOC + SOC */
3478 if (vec_outside_cost <= 0)
3479 min_profitable_estimate = 1;
3480 else
3482 min_profitable_estimate = ((vec_outside_cost + scalar_outside_cost) * vf
3483 - vec_inside_cost * peel_iters_prologue
3484 - vec_inside_cost * peel_iters_epilogue)
3485 / ((scalar_single_iter_cost * vf)
3486 - vec_inside_cost);
3488 min_profitable_estimate --;
3489 min_profitable_estimate = MAX (min_profitable_estimate, min_profitable_iters);
3490 if (dump_enabled_p ())
3491 dump_printf_loc (MSG_NOTE, vect_location,
3492 " Static estimate profitability threshold = %d\n",
3493 min_profitable_estimate);
3495 *ret_min_profitable_estimate = min_profitable_estimate;
3498 /* Writes into SEL a mask for a vec_perm, equivalent to a vec_shr by OFFSET
3499 vector elements (not bits) for a vector of mode MODE. */
3500 static void
3501 calc_vec_perm_mask_for_shift (enum machine_mode mode, unsigned int offset,
3502 unsigned char *sel)
3504 unsigned int i, nelt = GET_MODE_NUNITS (mode);
3506 for (i = 0; i < nelt; i++)
3507 sel[i] = (i + offset) & (2*nelt - 1);
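/* Illustrative example: for a 4-element mode and OFFSET == 1 the mask is
   { 1, 2, 3, 4 }; indices >= nelt select from the second vec_perm input,
   which supplies the elements shifted in.  */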
3510 /* Checks whether the target supports whole-vector shifts for vectors of mode
3511 MODE. This is the case if _either_ the platform handles vec_shr_optab, _or_
3512 it supports vec_perm_const with masks for all necessary shift amounts. */
3513 static bool
3514 have_whole_vector_shift (enum machine_mode mode)
3516 if (optab_handler (vec_shr_optab, mode) != CODE_FOR_nothing)
3517 return true;
3519 if (direct_optab_handler (vec_perm_const_optab, mode) == CODE_FOR_nothing)
3520 return false;
3522 unsigned int i, nelt = GET_MODE_NUNITS (mode);
3523 unsigned char *sel = XALLOCAVEC (unsigned char, nelt);
3525 for (i = nelt/2; i >= 1; i/=2)
3527 calc_vec_perm_mask_for_shift (mode, i, sel);
3528 if (!can_vec_perm_p (mode, false, sel))
3529 return false;
3531 return true;
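/* Only the power-of-two shift amounts nelt/2, nelt/4, ..., 1 are checked
   above because those are the amounts the log2-style reduction epilogue
   uses when reducing a vector by repeated halving.  */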
3534 /* Return the reduction operand (with index REDUC_INDEX) of STMT. */
3536 static tree
3537 get_reduction_op (gimple *stmt, int reduc_index)
3539 switch (get_gimple_rhs_class (gimple_assign_rhs_code (stmt)))
3541 case GIMPLE_SINGLE_RHS:
3542 gcc_assert (TREE_OPERAND_LENGTH (gimple_assign_rhs1 (stmt))
3543 == ternary_op);
3544 return TREE_OPERAND (gimple_assign_rhs1 (stmt), reduc_index);
3545 case GIMPLE_UNARY_RHS:
3546 return gimple_assign_rhs1 (stmt);
3547 case GIMPLE_BINARY_RHS:
3548 return (reduc_index
3549 ? gimple_assign_rhs2 (stmt) : gimple_assign_rhs1 (stmt));
3550 case GIMPLE_TERNARY_RHS:
3551 return gimple_op (stmt, reduc_index + 1);
3552 default:
3553 gcc_unreachable ();
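/* Usage sketch (illustrative): for a binary reduction stmt
   "s_1 = x_2 + s_3" with REDUC_INDEX == 1 this returns s_3 (rhs2), the
   value carried around the loop; REDUC_INDEX == 0 would return x_2.  */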
3557 /* TODO: There is a close dependency between the vect_model_*_cost and the
3558 vectorizable_* functions. Improve the design to avoid maintenance issues. */
3560 /* Function vect_model_reduction_cost.
3562 Models cost for a reduction operation, including the vector ops
3563 generated within the strip-mine loop, the initial definition before
3564 the loop, and the epilogue code that must be generated. */
3566 static bool
3567 vect_model_reduction_cost (stmt_vec_info stmt_info, enum tree_code reduc_code,
3568 int ncopies, int reduc_index)
3570 int prologue_cost = 0, epilogue_cost = 0;
3571 enum tree_code code;
3572 optab optab;
3573 tree vectype;
3574 gimple *stmt, *orig_stmt;
3575 tree reduction_op;
3576 machine_mode mode;
3577 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
3578 struct loop *loop = NULL;
3579 void *target_cost_data;
3581 if (loop_vinfo)
3583 loop = LOOP_VINFO_LOOP (loop_vinfo);
3584 target_cost_data = LOOP_VINFO_TARGET_COST_DATA (loop_vinfo);
3586 else
3587 target_cost_data = BB_VINFO_TARGET_COST_DATA (STMT_VINFO_BB_VINFO (stmt_info));
3589 /* Condition reductions generate two reductions in the loop. */
3590 if (STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info) == COND_REDUCTION)
3591 ncopies *= 2;
3593 /* Cost of reduction op inside loop. */
3594 unsigned inside_cost = add_stmt_cost (target_cost_data, ncopies, vector_stmt,
3595 stmt_info, 0, vect_body);
3596 stmt = STMT_VINFO_STMT (stmt_info);
3598 reduction_op = get_reduction_op (stmt, reduc_index);
3600 vectype = get_vectype_for_scalar_type (TREE_TYPE (reduction_op));
3601 if (!vectype)
3603 if (dump_enabled_p ())
3605 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3606 "unsupported data-type ");
3607 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
3608 TREE_TYPE (reduction_op));
3609 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
3611 return false;
3614 mode = TYPE_MODE (vectype);
3615 orig_stmt = STMT_VINFO_RELATED_STMT (stmt_info);
3617 if (!orig_stmt)
3618 orig_stmt = STMT_VINFO_STMT (stmt_info);
3620 code = gimple_assign_rhs_code (orig_stmt);
3622 /* Add in cost for initial definition.
3623 For cond reduction we have four vectors: initial index, step, initial
3624 result of the data reduction, initial value of the index reduction. */
3625 int prologue_stmts = STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info)
3626 == COND_REDUCTION ? 4 : 1;
3627 prologue_cost += add_stmt_cost (target_cost_data, prologue_stmts,
3628 scalar_to_vec, stmt_info, 0,
3629 vect_prologue);
3631 /* Determine cost of epilogue code.
3633 We have a reduction operator that will reduce the vector in one statement.
3634 Also requires scalar extract. */
3636 if (!loop || !nested_in_vect_loop_p (loop, orig_stmt))
3638 if (reduc_code != ERROR_MARK)
3640 if (STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info) == COND_REDUCTION)
3642 /* An EQ stmt and a COND_EXPR stmt. */
3643 epilogue_cost += add_stmt_cost (target_cost_data, 2,
3644 vector_stmt, stmt_info, 0,
3645 vect_epilogue);
3646 /* Reduction of the max index and a reduction of the found
3647 values. */
3648 epilogue_cost += add_stmt_cost (target_cost_data, 2,
3649 vec_to_scalar, stmt_info, 0,
3650 vect_epilogue);
3651 /* A broadcast of the max value. */
3652 epilogue_cost += add_stmt_cost (target_cost_data, 1,
3653 scalar_to_vec, stmt_info, 0,
3654 vect_epilogue);
3656 else
3658 epilogue_cost += add_stmt_cost (target_cost_data, 1, vector_stmt,
3659 stmt_info, 0, vect_epilogue);
3660 epilogue_cost += add_stmt_cost (target_cost_data, 1,
3661 vec_to_scalar, stmt_info, 0,
3662 vect_epilogue);
3665 else
3667 int vec_size_in_bits = tree_to_uhwi (TYPE_SIZE (vectype));
3668 tree bitsize =
3669 TYPE_SIZE (TREE_TYPE (gimple_assign_lhs (orig_stmt)));
3670 int element_bitsize = tree_to_uhwi (bitsize);
3671 int nelements = vec_size_in_bits / element_bitsize;
3673 optab = optab_for_tree_code (code, vectype, optab_default);
3675 /* We have a whole vector shift available. */
3676 if (VECTOR_MODE_P (mode)
3677 && optab_handler (optab, mode) != CODE_FOR_nothing
3678 && have_whole_vector_shift (mode))
3680 /* Final reduction via vector shifts and the reduction operator.
3681 Also requires scalar extract. */
3682 epilogue_cost += add_stmt_cost (target_cost_data,
3683 exact_log2 (nelements) * 2,
3684 vector_stmt, stmt_info, 0,
3685 vect_epilogue);
3686 epilogue_cost += add_stmt_cost (target_cost_data, 1,
3687 vec_to_scalar, stmt_info, 0,
3688 vect_epilogue);
3690 else
3691 /* Use extracts and reduction op for final reduction. For N
3692 elements, we have N extracts and N-1 reduction ops. */
3693 epilogue_cost += add_stmt_cost (target_cost_data,
3694 nelements + nelements - 1,
3695 vector_stmt, stmt_info, 0,
3696 vect_epilogue);
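/* E.g. with a 4-element vector and no usable whole-vector shift this
   models 4 extracts plus 3 scalar reduction ops, i.e. 7 statements in the
   epilogue (illustrative count only).  */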
3700 if (dump_enabled_p ())
3701 dump_printf (MSG_NOTE,
3702 "vect_model_reduction_cost: inside_cost = %d, "
3703 "prologue_cost = %d, epilogue_cost = %d .\n", inside_cost,
3704 prologue_cost, epilogue_cost);
3706 return true;
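/* Illustrative sketch only (not part of this pass, kept out of the build):
   how the statement counts chosen above combine for a COND_REDUCTION when a
   direct reduc_code is available.  The per-kind weights are hypothetical
   stand-ins for what the target cost hooks would return.  */
#if 0
static unsigned
cond_reduction_cost_sketch (unsigned ncopies, unsigned vector_stmt_cost,
                            unsigned scalar_to_vec_cost,
                            unsigned vec_to_scalar_cost)
{
  /* Body: NCOPIES vector statements.  */
  unsigned inside = ncopies * vector_stmt_cost;
  /* Prologue: initial index, step, initial data result and initial index
     result, each broadcast from a scalar.  */
  unsigned prologue = 4 * scalar_to_vec_cost;
  /* Epilogue: an EQ plus a COND_EXPR, two vector-to-scalar reductions,
     and one broadcast of the max value.  */
  unsigned epilogue = (2 * vector_stmt_cost
                       + 2 * vec_to_scalar_cost
                       + 1 * scalar_to_vec_cost);
  return inside + prologue + epilogue;
}
#endif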
3710 /* Function vect_model_induction_cost.
3712 Models cost for induction operations. */
3714 static void
3715 vect_model_induction_cost (stmt_vec_info stmt_info, int ncopies)
3717 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
3718 void *target_cost_data = LOOP_VINFO_TARGET_COST_DATA (loop_vinfo);
3719 unsigned inside_cost, prologue_cost;
3721 /* loop cost for vec_loop. */
3722 inside_cost = add_stmt_cost (target_cost_data, ncopies, vector_stmt,
3723 stmt_info, 0, vect_body);
3725 /* prologue cost for vec_init and vec_step. */
3726 prologue_cost = add_stmt_cost (target_cost_data, 2, scalar_to_vec,
3727 stmt_info, 0, vect_prologue);
3729 if (dump_enabled_p ())
3730 dump_printf_loc (MSG_NOTE, vect_location,
3731 "vect_model_induction_cost: inside_cost = %d, "
3732 "prologue_cost = %d .\n", inside_cost, prologue_cost);
3736 /* Function get_initial_def_for_induction
3738 Input:
3739 STMT - a stmt that performs an induction operation in the loop.
3740 IV_PHI - the initial value of the induction variable
3742 Output:
3743 Return a vector variable, initialized with the first VF values of
3744 the induction variable. E.g., for an iv with IV_PHI='X' and
3745 evolution S, for a vector of 4 units, we want to return:
3746 [X, X + S, X + 2*S, X + 3*S]. */
3748 static tree
3749 get_initial_def_for_induction (gimple *iv_phi)
3751 stmt_vec_info stmt_vinfo = vinfo_for_stmt (iv_phi);
3752 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_vinfo);
3753 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
3754 tree vectype;
3755 int nunits;
3756 edge pe = loop_preheader_edge (loop);
3757 struct loop *iv_loop;
3758 basic_block new_bb;
3759 tree new_vec, vec_init, vec_step, t;
3760 tree new_name;
3761 gimple *new_stmt;
3762 gphi *induction_phi;
3763 tree induc_def, vec_def, vec_dest;
3764 tree init_expr, step_expr;
3765 int vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
3766 int i;
3767 int ncopies;
3768 tree expr;
3769 stmt_vec_info phi_info = vinfo_for_stmt (iv_phi);
3770 bool nested_in_vect_loop = false;
3771 gimple_seq stmts;
3772 imm_use_iterator imm_iter;
3773 use_operand_p use_p;
3774 gimple *exit_phi;
3775 edge latch_e;
3776 tree loop_arg;
3777 gimple_stmt_iterator si;
3778 basic_block bb = gimple_bb (iv_phi);
3779 tree stepvectype;
3780 tree resvectype;
3782 /* Is phi in an inner-loop, while vectorizing an enclosing outer-loop? */
3783 if (nested_in_vect_loop_p (loop, iv_phi))
3785 nested_in_vect_loop = true;
3786 iv_loop = loop->inner;
3788 else
3789 iv_loop = loop;
3790 gcc_assert (iv_loop == (gimple_bb (iv_phi))->loop_father);
3792 latch_e = loop_latch_edge (iv_loop);
3793 loop_arg = PHI_ARG_DEF_FROM_EDGE (iv_phi, latch_e);
3795 step_expr = STMT_VINFO_LOOP_PHI_EVOLUTION_PART (phi_info);
3796 gcc_assert (step_expr != NULL_TREE);
3798 pe = loop_preheader_edge (iv_loop);
3799 init_expr = PHI_ARG_DEF_FROM_EDGE (iv_phi,
3800 loop_preheader_edge (iv_loop));
3802 vectype = get_vectype_for_scalar_type (TREE_TYPE (init_expr));
3803 resvectype = get_vectype_for_scalar_type (TREE_TYPE (PHI_RESULT (iv_phi)));
3804 gcc_assert (vectype);
3805 nunits = TYPE_VECTOR_SUBPARTS (vectype);
3806 ncopies = vf / nunits;
3808 gcc_assert (phi_info);
3809 gcc_assert (ncopies >= 1);
3811 /* Convert the step to the desired type. */
3812 stmts = NULL;
3813 step_expr = gimple_convert (&stmts, TREE_TYPE (vectype), step_expr);
3814 if (stmts)
3816 new_bb = gsi_insert_seq_on_edge_immediate (pe, stmts);
3817 gcc_assert (!new_bb);
3820 /* Find the first insertion point in the BB. */
3821 si = gsi_after_labels (bb);
3823 /* Create the vector that holds the initial_value of the induction. */
3824 if (nested_in_vect_loop)
3826 /* iv_loop is nested in the loop to be vectorized. init_expr had already
3827 been created during vectorization of previous stmts. We obtain it
3828 from the STMT_VINFO_VEC_STMT of the defining stmt. */
3829 vec_init = vect_get_vec_def_for_operand (init_expr, iv_phi);
3830 /* If the initial value is not of proper type, convert it. */
3831 if (!useless_type_conversion_p (vectype, TREE_TYPE (vec_init)))
3833 new_stmt
3834 = gimple_build_assign (vect_get_new_ssa_name (vectype,
3835 vect_simple_var,
3836 "vec_iv_"),
3837 VIEW_CONVERT_EXPR,
3838 build1 (VIEW_CONVERT_EXPR, vectype,
3839 vec_init));
3840 vec_init = gimple_assign_lhs (new_stmt);
3841 new_bb = gsi_insert_on_edge_immediate (loop_preheader_edge (iv_loop),
3842 new_stmt);
3843 gcc_assert (!new_bb);
3844 set_vinfo_for_stmt (new_stmt,
3845 new_stmt_vec_info (new_stmt, loop_vinfo));
3848 else
3850 vec<constructor_elt, va_gc> *v;
3852 /* iv_loop is the loop to be vectorized. Create:
3853 vec_init = [X, X+S, X+2*S, X+3*S] (S = step_expr, X = init_expr) */
3854 stmts = NULL;
3855 new_name = gimple_convert (&stmts, TREE_TYPE (vectype), init_expr);
3857 vec_alloc (v, nunits);
3858 bool constant_p = is_gimple_min_invariant (new_name);
3859 CONSTRUCTOR_APPEND_ELT (v, NULL_TREE, new_name);
3860 for (i = 1; i < nunits; i++)
3862 /* Create: new_name_i = new_name + step_expr */
3863 new_name = gimple_build (&stmts, PLUS_EXPR, TREE_TYPE (new_name),
3864 new_name, step_expr);
3865 if (!is_gimple_min_invariant (new_name))
3866 constant_p = false;
3867 CONSTRUCTOR_APPEND_ELT (v, NULL_TREE, new_name);
3869 if (stmts)
3871 new_bb = gsi_insert_seq_on_edge_immediate (pe, stmts);
3872 gcc_assert (!new_bb);
3875 /* Create a vector from [new_name_0, new_name_1, ..., new_name_nunits-1] */
3876 if (constant_p)
3877 new_vec = build_vector_from_ctor (vectype, v);
3878 else
3879 new_vec = build_constructor (vectype, v);
3880 vec_init = vect_init_vector (iv_phi, new_vec, vectype, NULL);
3884 /* Create the vector that holds the step of the induction. */
3885 if (nested_in_vect_loop)
3886 /* iv_loop is nested in the loop to be vectorized. Generate:
3887 vec_step = [S, S, S, S] */
3888 new_name = step_expr;
3889 else
3891 /* iv_loop is the loop to be vectorized. Generate:
3892 vec_step = [VF*S, VF*S, VF*S, VF*S] */
3893 if (SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr)))
3895 expr = build_int_cst (integer_type_node, vf);
3896 expr = fold_convert (TREE_TYPE (step_expr), expr);
3898 else
3899 expr = build_int_cst (TREE_TYPE (step_expr), vf);
3900 new_name = fold_build2 (MULT_EXPR, TREE_TYPE (step_expr),
3901 expr, step_expr);
3902 if (TREE_CODE (step_expr) == SSA_NAME)
3903 new_name = vect_init_vector (iv_phi, new_name,
3904 TREE_TYPE (step_expr), NULL);
3907 t = unshare_expr (new_name);
3908 gcc_assert (CONSTANT_CLASS_P (new_name)
3909 || TREE_CODE (new_name) == SSA_NAME);
3910 stepvectype = get_vectype_for_scalar_type (TREE_TYPE (new_name));
3911 gcc_assert (stepvectype);
3912 new_vec = build_vector_from_val (stepvectype, t);
3913 vec_step = vect_init_vector (iv_phi, new_vec, stepvectype, NULL);
3916 /* Create the following def-use cycle:
3917 loop prolog:
3918 vec_init = ...
3919 vec_step = ...
3920 loop:
3921 vec_iv = PHI <vec_init, vec_loop>
3923 STMT
3925 vec_loop = vec_iv + vec_step; */
3927 /* Create the induction-phi that defines the induction-operand. */
3928 vec_dest = vect_get_new_vect_var (vectype, vect_simple_var, "vec_iv_");
3929 induction_phi = create_phi_node (vec_dest, iv_loop->header);
3930 set_vinfo_for_stmt (induction_phi,
3931 new_stmt_vec_info (induction_phi, loop_vinfo));
3932 induc_def = PHI_RESULT (induction_phi);
3934 /* Create the iv update inside the loop */
3935 new_stmt = gimple_build_assign (vec_dest, PLUS_EXPR, induc_def, vec_step);
3936 vec_def = make_ssa_name (vec_dest, new_stmt);
3937 gimple_assign_set_lhs (new_stmt, vec_def);
3938 gsi_insert_before (&si, new_stmt, GSI_SAME_STMT);
3939 set_vinfo_for_stmt (new_stmt, new_stmt_vec_info (new_stmt, loop_vinfo));
3941 /* Set the arguments of the phi node: */
3942 add_phi_arg (induction_phi, vec_init, pe, UNKNOWN_LOCATION);
3943 add_phi_arg (induction_phi, vec_def, loop_latch_edge (iv_loop),
3944 UNKNOWN_LOCATION);
3947 /* In case that vectorization factor (VF) is bigger than the number
3948 of elements that we can fit in a vectype (nunits), we have to generate
3949 more than one vector stmt - i.e - we need to "unroll" the
3950 vector stmt by a factor VF/nunits. For more details see documentation
3951 in vectorizable_operation. */
3953 if (ncopies > 1)
3955 stmt_vec_info prev_stmt_vinfo;
3956 /* FORNOW. This restriction should be relaxed. */
3957 gcc_assert (!nested_in_vect_loop);
3959 /* Create the vector that holds the step of the induction. */
3960 if (SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr)))
3962 expr = build_int_cst (integer_type_node, nunits);
3963 expr = fold_convert (TREE_TYPE (step_expr), expr);
3965 else
3966 expr = build_int_cst (TREE_TYPE (step_expr), nunits);
3967 new_name = fold_build2 (MULT_EXPR, TREE_TYPE (step_expr),
3968 expr, step_expr);
3969 if (TREE_CODE (step_expr) == SSA_NAME)
3970 new_name = vect_init_vector (iv_phi, new_name,
3971 TREE_TYPE (step_expr), NULL);
3972 t = unshare_expr (new_name);
3973 gcc_assert (CONSTANT_CLASS_P (new_name)
3974 || TREE_CODE (new_name) == SSA_NAME);
3975 new_vec = build_vector_from_val (stepvectype, t);
3976 vec_step = vect_init_vector (iv_phi, new_vec, stepvectype, NULL);
3978 vec_def = induc_def;
3979 prev_stmt_vinfo = vinfo_for_stmt (induction_phi);
3980 for (i = 1; i < ncopies; i++)
3982 /* vec_i = vec_prev + vec_step */
3983 new_stmt = gimple_build_assign (vec_dest, PLUS_EXPR,
3984 vec_def, vec_step);
3985 vec_def = make_ssa_name (vec_dest, new_stmt);
3986 gimple_assign_set_lhs (new_stmt, vec_def);
3988 gsi_insert_before (&si, new_stmt, GSI_SAME_STMT);
3989 if (!useless_type_conversion_p (resvectype, vectype))
3991 new_stmt
3992 = gimple_build_assign
3993 (vect_get_new_vect_var (resvectype, vect_simple_var,
3994 "vec_iv_"),
3995 VIEW_CONVERT_EXPR,
3996 build1 (VIEW_CONVERT_EXPR, resvectype,
3997 gimple_assign_lhs (new_stmt)));
3998 gimple_assign_set_lhs (new_stmt,
3999 make_ssa_name
4000 (gimple_assign_lhs (new_stmt), new_stmt));
4001 gsi_insert_before (&si, new_stmt, GSI_SAME_STMT);
4003 set_vinfo_for_stmt (new_stmt,
4004 new_stmt_vec_info (new_stmt, loop_vinfo));
4005 STMT_VINFO_RELATED_STMT (prev_stmt_vinfo) = new_stmt;
4006 prev_stmt_vinfo = vinfo_for_stmt (new_stmt);
4010 if (nested_in_vect_loop)
4012 /* Find the loop-closed exit-phi of the induction, and record
4013 the final vector of induction results: */
4014 exit_phi = NULL;
4015 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, loop_arg)
4017 gimple *use_stmt = USE_STMT (use_p);
4018 if (is_gimple_debug (use_stmt))
4019 continue;
4021 if (!flow_bb_inside_loop_p (iv_loop, gimple_bb (use_stmt)))
4023 exit_phi = use_stmt;
4024 break;
4027 if (exit_phi)
4029 stmt_vec_info stmt_vinfo = vinfo_for_stmt (exit_phi);
4030 /* FORNOW. Currently not supporting the case that an inner-loop induction
4031 is not used in the outer-loop (i.e. is used only outside the outer-loop). */
4032 gcc_assert (STMT_VINFO_RELEVANT_P (stmt_vinfo)
4033 && !STMT_VINFO_LIVE_P (stmt_vinfo));
4035 STMT_VINFO_VEC_STMT (stmt_vinfo) = new_stmt;
4036 if (dump_enabled_p ())
4038 dump_printf_loc (MSG_NOTE, vect_location,
4039 "vector of inductions after inner-loop:");
4040 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, new_stmt, 0);
4046 if (dump_enabled_p ())
4048 dump_printf_loc (MSG_NOTE, vect_location,
4049 "transform induction: created def-use cycle: ");
4050 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, induction_phi, 0);
4051 dump_gimple_stmt (MSG_NOTE, TDF_SLIM,
4052 SSA_NAME_DEF_STMT (vec_def), 0);
4055 STMT_VINFO_VEC_STMT (phi_info) = induction_phi;
4056 if (!useless_type_conversion_p (resvectype, vectype))
4058 new_stmt = gimple_build_assign (vect_get_new_vect_var (resvectype,
4059 vect_simple_var,
4060 "vec_iv_"),
4061 VIEW_CONVERT_EXPR,
4062 build1 (VIEW_CONVERT_EXPR, resvectype,
4063 induc_def));
4064 induc_def = make_ssa_name (gimple_assign_lhs (new_stmt), new_stmt);
4065 gimple_assign_set_lhs (new_stmt, induc_def);
4066 si = gsi_after_labels (bb);
4067 gsi_insert_before (&si, new_stmt, GSI_SAME_STMT);
4068 set_vinfo_for_stmt (new_stmt,
4069 new_stmt_vec_info (new_stmt, loop_vinfo));
4070 STMT_VINFO_RELATED_STMT (vinfo_for_stmt (new_stmt))
4071 = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (induction_phi));
4074 return induc_def;
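/* Illustrative sketch only (not part of this pass, kept out of the build):
   a scalar model of the def-use cycle built above, assuming a 4-element
   vector, initial value X and step S.  The names are hypothetical.  */
#if 0
static void
induction_ivs_sketch (int x, int s, int nvectors, int out[][4])
{
  int vec_iv[4], vec_step[4];
  int i, j;

  /* vec_init = [X, X+S, X+2*S, X+3*S].  */
  for (j = 0; j < 4; j++)
    vec_iv[j] = x + j * s;

  /* vec_step = [VF*S, VF*S, VF*S, VF*S], with VF == 4 here.  */
  for (j = 0; j < 4; j++)
    vec_step[j] = 4 * s;

  /* Each iteration: record the current IV vector, then
     vec_loop = vec_iv + vec_step.  */
  for (i = 0; i < nvectors; i++)
    for (j = 0; j < 4; j++)
      {
        out[i][j] = vec_iv[j];
        vec_iv[j] += vec_step[j];
      }
}
#endif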
4078 /* Function get_initial_def_for_reduction
4080 Input:
4081 STMT - a stmt that performs a reduction operation in the loop.
4082 INIT_VAL - the initial value of the reduction variable
4084 Output:
4085 ADJUSTMENT_DEF - a tree that holds a value to be added to the final result
4086 of the reduction (used for adjusting the epilog - see below).
4087 Return a vector variable, initialized according to the operation that STMT
4088 performs. This vector will be used as the initial value of the
4089 vector of partial results.
4091 Option1 (adjust in epilog): Initialize the vector as follows:
4092 add/bit or/xor: [0,0,...,0,0]
4093 mult/bit and: [1,1,...,1,1]
4094 min/max/cond_expr: [init_val,init_val,..,init_val,init_val]
4095 and when necessary (e.g. add/mult case) let the caller know
4096 that it needs to adjust the result by init_val.
4098 Option2: Initialize the vector as follows:
4099 add/bit or/xor: [init_val,0,0,...,0]
4100 mult/bit and: [init_val,1,1,...,1]
4101 min/max/cond_expr: [init_val,init_val,...,init_val]
4102 and no adjustments are needed.
4104 For example, for the following code:
4106 s = init_val;
4107 for (i=0;i<n;i++)
4108 s = s + a[i];
4110 STMT is 's = s + a[i]', and the reduction variable is 's'.
4111 For a vector of 4 units, we want to return either [0,0,0,init_val],
4112 or [0,0,0,0] and let the caller know that it needs to adjust
4113 the result at the end by 'init_val'.
4115 FORNOW, we use Option1 (the 'adjust in epilog' scheme) when ADJUSTMENT_DEF
4116 is not NULL, because its initialization vector is simpler (the same
4117 element in all entries), and Option2 otherwise.
4119 A cost model should help decide between these two schemes. */
4121 tree
4122 get_initial_def_for_reduction (gimple *stmt, tree init_val,
4123 tree *adjustment_def)
4125 stmt_vec_info stmt_vinfo = vinfo_for_stmt (stmt);
4126 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_vinfo);
4127 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
4128 tree scalar_type = TREE_TYPE (init_val);
4129 tree vectype = get_vectype_for_scalar_type (scalar_type);
4130 int nunits;
4131 enum tree_code code = gimple_assign_rhs_code (stmt);
4132 tree def_for_init;
4133 tree init_def;
4134 tree *elts;
4135 int i;
4136 bool nested_in_vect_loop = false;
4137 REAL_VALUE_TYPE real_init_val = dconst0;
4138 int int_init_val = 0;
4139 gimple *def_stmt = NULL;
4140 gimple_seq stmts = NULL;
4142 gcc_assert (vectype);
4143 nunits = TYPE_VECTOR_SUBPARTS (vectype);
4145 gcc_assert (POINTER_TYPE_P (scalar_type) || INTEGRAL_TYPE_P (scalar_type)
4146 || SCALAR_FLOAT_TYPE_P (scalar_type));
4148 if (nested_in_vect_loop_p (loop, stmt))
4149 nested_in_vect_loop = true;
4150 else
4151 gcc_assert (loop == (gimple_bb (stmt))->loop_father);
4153 /* In case of double reduction we only create a vector variable to be put
4154 in the reduction phi node. The actual statement creation is done in
4155 vect_create_epilog_for_reduction. */
4156 if (adjustment_def && nested_in_vect_loop
4157 && TREE_CODE (init_val) == SSA_NAME
4158 && (def_stmt = SSA_NAME_DEF_STMT (init_val))
4159 && gimple_code (def_stmt) == GIMPLE_PHI
4160 && flow_bb_inside_loop_p (loop, gimple_bb (def_stmt))
4161 && vinfo_for_stmt (def_stmt)
4162 && STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def_stmt))
4163 == vect_double_reduction_def)
4165 *adjustment_def = NULL;
4166 return vect_create_destination_var (init_val, vectype);
4169 /* In case of a nested reduction do not use an adjustment def, as the
4170 epilogue generation does not handle that case correctly when
4171 ncopies is not one. */
4172 if (adjustment_def && nested_in_vect_loop)
4174 *adjustment_def = NULL;
4175 return vect_get_vec_def_for_operand (init_val, stmt);
4178 switch (code)
4180 case WIDEN_SUM_EXPR:
4181 case DOT_PROD_EXPR:
4182 case SAD_EXPR:
4183 case PLUS_EXPR:
4184 case MINUS_EXPR:
4185 case BIT_IOR_EXPR:
4186 case BIT_XOR_EXPR:
4187 case MULT_EXPR:
4188 case BIT_AND_EXPR:
4189 /* ADJUSTMENT_DEF is NULL when called from
4190 vect_create_epilog_for_reduction to vectorize a double reduction. */
4191 if (adjustment_def)
4192 *adjustment_def = init_val;
4194 if (code == MULT_EXPR)
4196 real_init_val = dconst1;
4197 int_init_val = 1;
4200 if (code == BIT_AND_EXPR)
4201 int_init_val = -1;
4203 if (SCALAR_FLOAT_TYPE_P (scalar_type))
4204 def_for_init = build_real (scalar_type, real_init_val);
4205 else
4206 def_for_init = build_int_cst (scalar_type, int_init_val);
4208 /* Fill every element except the first with '0' or '1'. */
4209 elts = XALLOCAVEC (tree, nunits);
4210 for (i = nunits - 2; i >= 0; --i)
4211 elts[i + 1] = def_for_init;
4213 /* Option1: the first element is '0' or '1' as well. */
4214 if (adjustment_def)
4216 elts[0] = def_for_init;
4217 init_def = build_vector (vectype, elts);
4218 break;
4221 /* Option2: the first element is INIT_VAL. */
4222 elts[0] = init_val;
4223 if (TREE_CONSTANT (init_val))
4224 init_def = build_vector (vectype, elts);
4225 else
4227 vec<constructor_elt, va_gc> *v;
4228 vec_alloc (v, nunits);
4229 CONSTRUCTOR_APPEND_ELT (v, NULL_TREE, init_val);
4230 for (i = 1; i < nunits; ++i)
4231 CONSTRUCTOR_APPEND_ELT (v, NULL_TREE, elts[i]);
4232 init_def = build_constructor (vectype, v);
4235 break;
4237 case MIN_EXPR:
4238 case MAX_EXPR:
4239 case COND_EXPR:
4240 if (adjustment_def)
4242 *adjustment_def = NULL_TREE;
4243 if (STMT_VINFO_VEC_REDUCTION_TYPE (stmt_vinfo) != COND_REDUCTION)
4245 init_def = vect_get_vec_def_for_operand (init_val, stmt);
4246 break;
4249 init_val = gimple_convert (&stmts, TREE_TYPE (vectype), init_val);
4250 if (! gimple_seq_empty_p (stmts))
4251 gsi_insert_seq_on_edge_immediate (loop_preheader_edge (loop), stmts);
4252 init_def = build_vector_from_val (vectype, init_val);
4253 break;
4255 default:
4256 gcc_unreachable ();
4259 return init_def;
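/* Illustrative sketch only (not part of this pass, kept out of the build):
   why Option1 and Option2 above give the same result for a PLUS reduction,
   assuming 4 partial-sum lanes and N divisible by 4.  Names are
   hypothetical.  */
#if 0
static int
reduction_init_sketch (const int *a, int n, int init_val, int use_option1)
{
  int lanes[4] = { 0, 0, 0, 0 };
  int adjustment = 0;
  int i, result;

  if (use_option1)
    /* Option1: start from [0,0,0,0] and adjust by INIT_VAL in the epilog.  */
    adjustment = init_val;
  else
    /* Option2: start from [init_val,0,0,0]; no adjustment needed.  */
    lanes[0] = init_val;

  /* The vectorized loop accumulates partial sums per lane.  */
  for (i = 0; i < n; i++)
    lanes[i % 4] += a[i];

  /* Epilog: reduce the lanes, then apply the adjustment (if any).  */
  result = lanes[0] + lanes[1] + lanes[2] + lanes[3];
  return result + adjustment;
}
#endif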
4262 /* Function vect_create_epilog_for_reduction
4264 Create code at the loop-epilog to finalize the result of a reduction
4265 computation.
4267 VECT_DEFS is list of vector of partial results, i.e., the lhs's of vector
4268 reduction statements.
4269 STMT is the scalar reduction stmt that is being vectorized.
4270 NCOPIES is > 1 in case the vectorization factor (VF) is bigger than the
4271 number of elements that we can fit in a vectype (nunits). In this case
4272 we have to generate more than one vector stmt - i.e - we need to "unroll"
4273 the vector stmt by a factor VF/nunits. For more details see documentation
4274 in vectorizable_operation.
4275 REDUC_CODE is the tree-code for the epilog reduction.
4276 REDUCTION_PHIS is a list of the phi-nodes that carry the reduction
4277 computation.
4278 REDUC_INDEX is the index of the operand in the right hand side of the
4279 statement that is defined by REDUCTION_PHI.
4280 DOUBLE_REDUC is TRUE if double reduction phi nodes should be handled.
4281 SLP_NODE is an SLP node containing a group of reduction statements. The
4282 first one in this group is STMT.
4283 INDUCTION_INDEX is the index of the loop for condition reductions.
4284 Otherwise it is undefined.
4286 This function:
4287 1. Creates the reduction def-use cycles: sets the arguments for
4288 REDUCTION_PHIS:
4289 The loop-entry argument is the vectorized initial-value of the reduction.
4290 The loop-latch argument is taken from VECT_DEFS - the vector of partial
4291 sums.
4292 2. "Reduces" each vector of partial results VECT_DEFS into a single result,
4293 by applying the operation specified by REDUC_CODE if available, or by
4294 other means (whole-vector shifts or a scalar loop).
4295 The function also creates a new phi node at the loop exit to preserve
4296 loop-closed form, as illustrated below.
4298 The flow at the entry to this function:
4300 loop:
4301 vec_def = phi <null, null> # REDUCTION_PHI
4302 VECT_DEF = vector_stmt # vectorized form of STMT
4303 s_loop = scalar_stmt # (scalar) STMT
4304 loop_exit:
4305 s_out0 = phi <s_loop> # (scalar) EXIT_PHI
4306 use <s_out0>
4307 use <s_out0>
4309 The above is transformed by this function into:
4311 loop:
4312 vec_def = phi <vec_init, VECT_DEF> # REDUCTION_PHI
4313 VECT_DEF = vector_stmt # vectorized form of STMT
4314 s_loop = scalar_stmt # (scalar) STMT
4315 loop_exit:
4316 s_out0 = phi <s_loop> # (scalar) EXIT_PHI
4317 v_out1 = phi <VECT_DEF> # NEW_EXIT_PHI
4318 v_out2 = reduce <v_out1>
4319 s_out3 = extract_field <v_out2, 0>
4320 s_out4 = adjust_result <s_out3>
4321 use <s_out4>
4322 use <s_out4>
4325 static void
4326 vect_create_epilog_for_reduction (vec<tree> vect_defs, gimple *stmt,
4327 int ncopies, enum tree_code reduc_code,
4328 vec<gimple *> reduction_phis,
4329 int reduc_index, bool double_reduc,
4330 slp_tree slp_node, tree induction_index)
4332 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
4333 stmt_vec_info prev_phi_info;
4334 tree vectype;
4335 machine_mode mode;
4336 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
4337 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo), *outer_loop = NULL;
4338 basic_block exit_bb;
4339 tree scalar_dest;
4340 tree scalar_type;
4341 gimple *new_phi = NULL, *phi;
4342 gimple_stmt_iterator exit_gsi;
4343 tree vec_dest;
4344 tree new_temp = NULL_TREE, new_dest, new_name, new_scalar_dest;
4345 gimple *epilog_stmt = NULL;
4346 enum tree_code code = gimple_assign_rhs_code (stmt);
4347 gimple *exit_phi;
4348 tree bitsize;
4349 tree adjustment_def = NULL;
4350 tree vec_initial_def = NULL;
4351 tree reduction_op, expr, def, initial_def = NULL;
4352 tree orig_name, scalar_result;
4353 imm_use_iterator imm_iter, phi_imm_iter;
4354 use_operand_p use_p, phi_use_p;
4355 gimple *use_stmt, *orig_stmt, *reduction_phi = NULL;
4356 bool nested_in_vect_loop = false;
4357 auto_vec<gimple *> new_phis;
4358 auto_vec<gimple *> inner_phis;
4359 enum vect_def_type dt = vect_unknown_def_type;
4360 int j, i;
4361 auto_vec<tree> scalar_results;
4362 unsigned int group_size = 1, k, ratio;
4363 auto_vec<tree> vec_initial_defs;
4364 auto_vec<gimple *> phis;
4365 bool slp_reduc = false;
4366 tree new_phi_result;
4367 gimple *inner_phi = NULL;
4369 if (slp_node)
4370 group_size = SLP_TREE_SCALAR_STMTS (slp_node).length ();
4372 if (nested_in_vect_loop_p (loop, stmt))
4374 outer_loop = loop;
4375 loop = loop->inner;
4376 nested_in_vect_loop = true;
4377 gcc_assert (!slp_node);
4380 reduction_op = get_reduction_op (stmt, reduc_index);
4382 vectype = get_vectype_for_scalar_type (TREE_TYPE (reduction_op));
4383 gcc_assert (vectype);
4384 mode = TYPE_MODE (vectype);
4386 /* 1. Create the reduction def-use cycle:
4387 Set the arguments of REDUCTION_PHIS, i.e., transform
4389 loop:
4390 vec_def = phi <null, null> # REDUCTION_PHI
4391 VECT_DEF = vector_stmt # vectorized form of STMT
4394 into:
4396 loop:
4397 vec_def = phi <vec_init, VECT_DEF> # REDUCTION_PHI
4398 VECT_DEF = vector_stmt # vectorized form of STMT
4401 (in case of SLP, do it for all the phis). */
4403 /* Get the loop-entry arguments. */
4404 enum vect_def_type initial_def_dt = vect_unknown_def_type;
4405 if (slp_node)
4406 vect_get_vec_defs (reduction_op, NULL_TREE, stmt, &vec_initial_defs,
4407 NULL, slp_node, reduc_index);
4408 else
4410 /* Get at the scalar def before the loop, that defines the initial value
4411 of the reduction variable. */
4412 gimple *def_stmt = SSA_NAME_DEF_STMT (reduction_op);
4413 initial_def = PHI_ARG_DEF_FROM_EDGE (def_stmt,
4414 loop_preheader_edge (loop));
4415 vect_is_simple_use (initial_def, loop_vinfo, &def_stmt, &initial_def_dt);
4416 vec_initial_def = get_initial_def_for_reduction (stmt, initial_def,
4417 &adjustment_def);
4418 vec_initial_defs.create (1);
4419 vec_initial_defs.quick_push (vec_initial_def);
4422 /* Set phi nodes arguments. */
4423 FOR_EACH_VEC_ELT (reduction_phis, i, phi)
4425 tree vec_init_def, def;
4426 gimple_seq stmts;
4427 vec_init_def = force_gimple_operand (vec_initial_defs[i], &stmts,
4428 true, NULL_TREE);
4429 if (stmts)
4430 gsi_insert_seq_on_edge_immediate (loop_preheader_edge (loop), stmts);
4432 def = vect_defs[i];
4433 for (j = 0; j < ncopies; j++)
4435 if (j != 0)
4437 phi = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (phi));
4438 if (nested_in_vect_loop)
4439 vec_init_def
4440 = vect_get_vec_def_for_stmt_copy (initial_def_dt,
4441 vec_init_def);
4444 /* Set the loop-entry arg of the reduction-phi. */
4446 if (STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info)
4447 == INTEGER_INDUC_COND_REDUCTION)
4449 /* Initialise the reduction phi to zero. This prevents non-zero
4450 initial values from interfering with the reduction op. */
4451 gcc_assert (ncopies == 1);
4452 gcc_assert (i == 0);
4454 tree vec_init_def_type = TREE_TYPE (vec_init_def);
4455 tree zero_vec = build_zero_cst (vec_init_def_type);
4457 add_phi_arg (as_a <gphi *> (phi), zero_vec,
4458 loop_preheader_edge (loop), UNKNOWN_LOCATION);
4460 else
4461 add_phi_arg (as_a <gphi *> (phi), vec_init_def,
4462 loop_preheader_edge (loop), UNKNOWN_LOCATION);
4464 /* Set the loop-latch arg for the reduction-phi. */
4465 if (j > 0)
4466 def = vect_get_vec_def_for_stmt_copy (vect_unknown_def_type, def);
4468 add_phi_arg (as_a <gphi *> (phi), def, loop_latch_edge (loop),
4469 UNKNOWN_LOCATION);
4471 if (dump_enabled_p ())
4473 dump_printf_loc (MSG_NOTE, vect_location,
4474 "transform reduction: created def-use cycle: ");
4475 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, phi, 0);
4476 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, SSA_NAME_DEF_STMT (def), 0);
4481 /* 2. Create epilog code.
4482 The reduction epilog code operates across the elements of the vector
4483 of partial results computed by the vectorized loop.
4484 The reduction epilog code consists of:
4486 step 1: compute the scalar result in a vector (v_out2)
4487 step 2: extract the scalar result (s_out3) from the vector (v_out2)
4488 step 3: adjust the scalar result (s_out3) if needed.
4490 Step 1 can be accomplished using one of the following three schemes:
4491 (scheme 1) using reduc_code, if available.
4492 (scheme 2) using whole-vector shifts, if available.
4493 (scheme 3) using a scalar loop. In this case steps 1+2 above are
4494 combined.
4496 The overall epilog code looks like this:
4498 s_out0 = phi <s_loop> # original EXIT_PHI
4499 v_out1 = phi <VECT_DEF> # NEW_EXIT_PHI
4500 v_out2 = reduce <v_out1> # step 1
4501 s_out3 = extract_field <v_out2, 0> # step 2
4502 s_out4 = adjust_result <s_out3> # step 3
4504 (step 3 is optional, and steps 1 and 2 may be combined).
4505 Lastly, the uses of s_out0 are replaced by s_out4. */
4508 /* 2.1 Create new loop-exit-phis to preserve loop-closed form:
4509 v_out1 = phi <VECT_DEF>
4510 Store them in NEW_PHIS. */
4512 exit_bb = single_exit (loop)->dest;
4513 prev_phi_info = NULL;
4514 new_phis.create (vect_defs.length ());
4515 FOR_EACH_VEC_ELT (vect_defs, i, def)
4517 for (j = 0; j < ncopies; j++)
4519 tree new_def = copy_ssa_name (def);
4520 phi = create_phi_node (new_def, exit_bb);
4521 set_vinfo_for_stmt (phi, new_stmt_vec_info (phi, loop_vinfo));
4522 if (j == 0)
4523 new_phis.quick_push (phi);
4524 else
4526 def = vect_get_vec_def_for_stmt_copy (dt, def);
4527 STMT_VINFO_RELATED_STMT (prev_phi_info) = phi;
4530 SET_PHI_ARG_DEF (phi, single_exit (loop)->dest_idx, def);
4531 prev_phi_info = vinfo_for_stmt (phi);
4535 /* The epilogue is created for the outer-loop, i.e., for the loop being
4536 vectorized. Create exit phis for the outer loop. */
4537 if (double_reduc)
4539 loop = outer_loop;
4540 exit_bb = single_exit (loop)->dest;
4541 inner_phis.create (vect_defs.length ());
4542 FOR_EACH_VEC_ELT (new_phis, i, phi)
4544 tree new_result = copy_ssa_name (PHI_RESULT (phi));
4545 gphi *outer_phi = create_phi_node (new_result, exit_bb);
4546 SET_PHI_ARG_DEF (outer_phi, single_exit (loop)->dest_idx,
4547 PHI_RESULT (phi));
4548 set_vinfo_for_stmt (outer_phi, new_stmt_vec_info (outer_phi,
4549 loop_vinfo));
4550 inner_phis.quick_push (phi);
4551 new_phis[i] = outer_phi;
4552 prev_phi_info = vinfo_for_stmt (outer_phi);
4553 while (STMT_VINFO_RELATED_STMT (vinfo_for_stmt (phi)))
4555 phi = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (phi));
4556 new_result = copy_ssa_name (PHI_RESULT (phi));
4557 outer_phi = create_phi_node (new_result, exit_bb);
4558 SET_PHI_ARG_DEF (outer_phi, single_exit (loop)->dest_idx,
4559 PHI_RESULT (phi));
4560 set_vinfo_for_stmt (outer_phi, new_stmt_vec_info (outer_phi,
4561 loop_vinfo));
4562 STMT_VINFO_RELATED_STMT (prev_phi_info) = outer_phi;
4563 prev_phi_info = vinfo_for_stmt (outer_phi);
4568 exit_gsi = gsi_after_labels (exit_bb);
4570 /* 2.2 Get the relevant tree-code to use in the epilog for schemes 2,3
4571 (i.e. when reduc_code is not available) and in the final adjustment
4572 code (if needed). Also get the original scalar reduction variable as
4573 defined in the loop. In case STMT is a "pattern-stmt" (i.e. - it
4574 represents a reduction pattern), the tree-code and scalar-def are
4575 taken from the original stmt that the pattern-stmt (STMT) replaces.
4576 Otherwise (it is a regular reduction) - the tree-code and scalar-def
4577 are taken from STMT. */
4579 orig_stmt = STMT_VINFO_RELATED_STMT (stmt_info);
4580 if (!orig_stmt)
4582 /* Regular reduction */
4583 orig_stmt = stmt;
4585 else
4587 /* Reduction pattern */
4588 stmt_vec_info stmt_vinfo = vinfo_for_stmt (orig_stmt);
4589 gcc_assert (STMT_VINFO_IN_PATTERN_P (stmt_vinfo));
4590 gcc_assert (STMT_VINFO_RELATED_STMT (stmt_vinfo) == stmt);
4593 code = gimple_assign_rhs_code (orig_stmt);
4594 /* For MINUS_EXPR the initial vector is [init_val,0,...,0]; therefore,
4595 partial results are added and not subtracted. */
4596 if (code == MINUS_EXPR)
4597 code = PLUS_EXPR;
4599 scalar_dest = gimple_assign_lhs (orig_stmt);
4600 scalar_type = TREE_TYPE (scalar_dest);
4601 scalar_results.create (group_size);
4602 new_scalar_dest = vect_create_destination_var (scalar_dest, NULL);
4603 bitsize = TYPE_SIZE (scalar_type);
4605 /* In case this is a reduction in an inner-loop while vectorizing an outer
4606 loop - we don't need to extract a single scalar result at the end of the
4607 inner-loop (unless it is double reduction, i.e., the use of reduction is
4608 outside the outer-loop). The final vector of partial results will be used
4609 in the vectorized outer-loop, or reduced to a scalar result at the end of
4610 the outer-loop. */
4611 if (nested_in_vect_loop && !double_reduc)
4612 goto vect_finalize_reduction;
4614 /* SLP reduction without reduction chain, e.g.,
4615 # a1 = phi <a2, a0>
4616 # b1 = phi <b2, b0>
4617 a2 = operation (a1)
4618 b2 = operation (b1) */
4619 slp_reduc = (slp_node && !GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)));
4621 /* In case of reduction chain, e.g.,
4622 # a1 = phi <a3, a0>
4623 a2 = operation (a1)
4624 a3 = operation (a2),
4626 we may end up with more than one vector result. Here we reduce them to
4627 one vector. */
4628 if (GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)))
4630 tree first_vect = PHI_RESULT (new_phis[0]);
4631 tree tmp;
4632 gassign *new_vec_stmt = NULL;
4634 vec_dest = vect_create_destination_var (scalar_dest, vectype);
4635 for (k = 1; k < new_phis.length (); k++)
4637 gimple *next_phi = new_phis[k];
4638 tree second_vect = PHI_RESULT (next_phi);
4640 tmp = build2 (code, vectype, first_vect, second_vect);
4641 new_vec_stmt = gimple_build_assign (vec_dest, tmp);
4642 first_vect = make_ssa_name (vec_dest, new_vec_stmt);
4643 gimple_assign_set_lhs (new_vec_stmt, first_vect);
4644 gsi_insert_before (&exit_gsi, new_vec_stmt, GSI_SAME_STMT);
4647 new_phi_result = first_vect;
4648 if (new_vec_stmt)
4650 new_phis.truncate (0);
4651 new_phis.safe_push (new_vec_stmt);
4654 else
4655 new_phi_result = PHI_RESULT (new_phis[0]);
4657 if (STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info) == COND_REDUCTION)
4659 /* For condition reductions, we have a vector (NEW_PHI_RESULT) containing
4660 various data values where the condition matched and another vector
4661 (INDUCTION_INDEX) containing all the indexes of those matches. We
4662 need to extract the last matching index (which will be the index with
4663 highest value) and use this to index into the data vector.
4664 For the case where there were no matches, the data vector will contain
4665 all default values and the index vector will be all zeros. */
4667 /* Get various versions of the type of the vector of indexes. */
4668 tree index_vec_type = TREE_TYPE (induction_index);
4669 gcc_checking_assert (TYPE_UNSIGNED (index_vec_type));
4670 tree index_scalar_type = TREE_TYPE (index_vec_type);
4671 tree index_vec_cmp_type = build_same_sized_truth_vector_type
4672 (index_vec_type);
4674 /* Get an unsigned integer version of the type of the data vector. */
4675 int scalar_precision = GET_MODE_PRECISION (TYPE_MODE (scalar_type));
4676 tree scalar_type_unsigned = make_unsigned_type (scalar_precision);
4677 tree vectype_unsigned = build_vector_type
4678 (scalar_type_unsigned, TYPE_VECTOR_SUBPARTS (vectype));
4680 /* First we need to create a vector (ZERO_VEC) of zeros and another
4681 vector (MAX_INDEX_VEC) filled with the last matching index, which we
4682 can create using a MAX reduction and then expanding.
4683 In the case where the loop never made any matches, the max index will
4684 be zero. */
4686 /* Vector of {0, 0, 0,...}. */
4687 tree zero_vec = make_ssa_name (vectype);
4688 tree zero_vec_rhs = build_zero_cst (vectype);
4689 gimple *zero_vec_stmt = gimple_build_assign (zero_vec, zero_vec_rhs);
4690 gsi_insert_before (&exit_gsi, zero_vec_stmt, GSI_SAME_STMT);
4692 /* Find maximum value from the vector of found indexes. */
4693 tree max_index = make_ssa_name (index_scalar_type);
4694 gimple *max_index_stmt = gimple_build_assign (max_index, REDUC_MAX_EXPR,
4695 induction_index);
4696 gsi_insert_before (&exit_gsi, max_index_stmt, GSI_SAME_STMT);
4698 /* Vector of {max_index, max_index, max_index,...}. */
4699 tree max_index_vec = make_ssa_name (index_vec_type);
4700 tree max_index_vec_rhs = build_vector_from_val (index_vec_type,
4701 max_index);
4702 gimple *max_index_vec_stmt = gimple_build_assign (max_index_vec,
4703 max_index_vec_rhs);
4704 gsi_insert_before (&exit_gsi, max_index_vec_stmt, GSI_SAME_STMT);
4706 /* Next we compare the new vector (MAX_INDEX_VEC) full of max indexes
4707 with the vector (INDUCTION_INDEX) of found indexes, choosing values
4708 from the data vector (NEW_PHI_RESULT) for matches, 0 (ZERO_VEC)
4709 otherwise. Only one value should match, resulting in a vector
4710 (VEC_COND) with one data value and the rest zeros.
4711 In the case where the loop never made any matches, every index will
4712 match, resulting in a vector with all data values (which will all be
4713 the default value). */
4715 /* Compare the max index vector to the vector of found indexes to find
4716 the position of the max value. */
4717 tree vec_compare = make_ssa_name (index_vec_cmp_type);
4718 gimple *vec_compare_stmt = gimple_build_assign (vec_compare, EQ_EXPR,
4719 induction_index,
4720 max_index_vec);
4721 gsi_insert_before (&exit_gsi, vec_compare_stmt, GSI_SAME_STMT);
4723 /* Use the compare to choose either values from the data vector or
4724 zero. */
4725 tree vec_cond = make_ssa_name (vectype);
4726 gimple *vec_cond_stmt = gimple_build_assign (vec_cond, VEC_COND_EXPR,
4727 vec_compare, new_phi_result,
4728 zero_vec);
4729 gsi_insert_before (&exit_gsi, vec_cond_stmt, GSI_SAME_STMT);
4731 /* Finally we need to extract the data value from the vector (VEC_COND)
4732 into a scalar (MATCHED_DATA_REDUC). Logically we want to do an OR
4733 reduction, but because this doesn't exist, we can use a MAX reduction
4734 instead. The data value might be signed or a float so we need to cast
4735 it first.
4736 In the case where the loop never made any matches, the data values are
4737 all identical, and so will reduce down correctly. */
4739 /* Make the matched data values unsigned. */
4740 tree vec_cond_cast = make_ssa_name (vectype_unsigned);
4741 tree vec_cond_cast_rhs = build1 (VIEW_CONVERT_EXPR, vectype_unsigned,
4742 vec_cond);
4743 gimple *vec_cond_cast_stmt = gimple_build_assign (vec_cond_cast,
4744 VIEW_CONVERT_EXPR,
4745 vec_cond_cast_rhs);
4746 gsi_insert_before (&exit_gsi, vec_cond_cast_stmt, GSI_SAME_STMT);
4748 /* Reduce down to a scalar value. */
4749 tree data_reduc = make_ssa_name (scalar_type_unsigned);
4750 optab ot = optab_for_tree_code (REDUC_MAX_EXPR, vectype_unsigned,
4751 optab_default);
4752 gcc_assert (optab_handler (ot, TYPE_MODE (vectype_unsigned))
4753 != CODE_FOR_nothing);
4754 gimple *data_reduc_stmt = gimple_build_assign (data_reduc,
4755 REDUC_MAX_EXPR,
4756 vec_cond_cast);
4757 gsi_insert_before (&exit_gsi, data_reduc_stmt, GSI_SAME_STMT);
4759 /* Convert the reduced value back to the result type and set as the
4760 result. */
4761 tree data_reduc_cast = build1 (VIEW_CONVERT_EXPR, scalar_type,
4762 data_reduc);
4763 epilog_stmt = gimple_build_assign (new_scalar_dest, data_reduc_cast);
4764 new_temp = make_ssa_name (new_scalar_dest, epilog_stmt);
4765 gimple_assign_set_lhs (epilog_stmt, new_temp);
4766 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
4767 scalar_results.safe_push (new_temp);
4770 /* 2.3 Create the reduction code, using one of the three schemes described
4771 above. In SLP we simply need to extract all the elements from the
4772 vector (without reducing them), so we use scalar shifts. */
4773 else if (reduc_code != ERROR_MARK && !slp_reduc)
4775 tree tmp;
4776 tree vec_elem_type;
4778 /*** Case 1: Create:
4779 v_out2 = reduc_expr <v_out1> */
4781 if (dump_enabled_p ())
4782 dump_printf_loc (MSG_NOTE, vect_location,
4783 "Reduce using direct vector reduction.\n");
4785 vec_elem_type = TREE_TYPE (TREE_TYPE (new_phi_result));
4786 if (!useless_type_conversion_p (scalar_type, vec_elem_type))
4788 tree tmp_dest =
4789 vect_create_destination_var (scalar_dest, vec_elem_type);
4790 tmp = build1 (reduc_code, vec_elem_type, new_phi_result);
4791 epilog_stmt = gimple_build_assign (tmp_dest, tmp);
4792 new_temp = make_ssa_name (tmp_dest, epilog_stmt);
4793 gimple_assign_set_lhs (epilog_stmt, new_temp);
4794 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
4796 tmp = build1 (NOP_EXPR, scalar_type, new_temp);
4798 else
4799 tmp = build1 (reduc_code, scalar_type, new_phi_result);
4801 epilog_stmt = gimple_build_assign (new_scalar_dest, tmp);
4802 new_temp = make_ssa_name (new_scalar_dest, epilog_stmt);
4803 gimple_assign_set_lhs (epilog_stmt, new_temp);
4804 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
4806 if (STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info)
4807 == INTEGER_INDUC_COND_REDUCTION)
4809 /* Earlier we set the initial value to zero. Check the result
4810 and, if it is zero, replace it with the original initial
4811 value. */
4812 tree zero = build_zero_cst (scalar_type);
4813 tree zcompare = build2 (EQ_EXPR, boolean_type_node, new_temp, zero);
4815 tmp = make_ssa_name (new_scalar_dest);
4816 epilog_stmt = gimple_build_assign (tmp, COND_EXPR, zcompare,
4817 initial_def, new_temp);
4818 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
4819 new_temp = tmp;
4822 scalar_results.safe_push (new_temp);
4824 else
4826 bool reduce_with_shift = have_whole_vector_shift (mode);
4827 int element_bitsize = tree_to_uhwi (bitsize);
4828 int vec_size_in_bits = tree_to_uhwi (TYPE_SIZE (vectype));
4829 tree vec_temp;
4831 /* Regardless of whether we have a whole vector shift, if we're
4832 emulating the operation via tree-vect-generic, we don't want
4833 to use it. Only the first round of the reduction is likely
4834 to still be profitable via emulation. */
4835 /* ??? It might be better to emit a reduction tree code here, so that
4836 tree-vect-generic can expand the first round via bit tricks. */
4837 if (!VECTOR_MODE_P (mode))
4838 reduce_with_shift = false;
4839 else
4841 optab optab = optab_for_tree_code (code, vectype, optab_default);
4842 if (optab_handler (optab, mode) == CODE_FOR_nothing)
4843 reduce_with_shift = false;
4846 if (reduce_with_shift && !slp_reduc)
4848 int nelements = vec_size_in_bits / element_bitsize;
4849 unsigned char *sel = XALLOCAVEC (unsigned char, nelements);
4851 int elt_offset;
4853 tree zero_vec = build_zero_cst (vectype);
4854 /*** Case 2: Create:
4855 for (offset = nelements/2; offset >= 1; offset/=2)
4857 Create: va' = vec_shift <va, offset>
4858 Create: va = vop <va, va'>
4859 } */
4861 tree rhs;
4863 if (dump_enabled_p ())
4864 dump_printf_loc (MSG_NOTE, vect_location,
4865 "Reduce using vector shifts\n");
4867 vec_dest = vect_create_destination_var (scalar_dest, vectype);
4868 new_temp = new_phi_result;
4869 for (elt_offset = nelements / 2;
4870 elt_offset >= 1;
4871 elt_offset /= 2)
4873 calc_vec_perm_mask_for_shift (mode, elt_offset, sel);
4874 tree mask = vect_gen_perm_mask_any (vectype, sel);
4875 epilog_stmt = gimple_build_assign (vec_dest, VEC_PERM_EXPR,
4876 new_temp, zero_vec, mask);
4877 new_name = make_ssa_name (vec_dest, epilog_stmt);
4878 gimple_assign_set_lhs (epilog_stmt, new_name);
4879 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
4881 epilog_stmt = gimple_build_assign (vec_dest, code, new_name,
4882 new_temp);
4883 new_temp = make_ssa_name (vec_dest, epilog_stmt);
4884 gimple_assign_set_lhs (epilog_stmt, new_temp);
4885 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
4888 /* 2.4 Extract the final scalar result. Create:
4889 s_out3 = extract_field <v_out2, bitpos> */
4891 if (dump_enabled_p ())
4892 dump_printf_loc (MSG_NOTE, vect_location,
4893 "extract scalar result\n");
4895 rhs = build3 (BIT_FIELD_REF, scalar_type, new_temp,
4896 bitsize, bitsize_zero_node);
4897 epilog_stmt = gimple_build_assign (new_scalar_dest, rhs);
4898 new_temp = make_ssa_name (new_scalar_dest, epilog_stmt);
4899 gimple_assign_set_lhs (epilog_stmt, new_temp);
4900 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
4901 scalar_results.safe_push (new_temp);
4903 else
4905 /*** Case 3: Create:
4906 s = extract_field <v_out2, 0>
4907 for (offset = element_size;
4908 offset < vector_size;
4909 offset += element_size;)
4911 Create: s' = extract_field <v_out2, offset>
4912 Create: s = op <s, s'> // For non SLP cases
4913 } */
4915 if (dump_enabled_p ())
4916 dump_printf_loc (MSG_NOTE, vect_location,
4917 "Reduce using scalar code.\n");
4919 vec_size_in_bits = tree_to_uhwi (TYPE_SIZE (vectype));
4920 FOR_EACH_VEC_ELT (new_phis, i, new_phi)
4922 int bit_offset;
4923 if (gimple_code (new_phi) == GIMPLE_PHI)
4924 vec_temp = PHI_RESULT (new_phi);
4925 else
4926 vec_temp = gimple_assign_lhs (new_phi);
4927 tree rhs = build3 (BIT_FIELD_REF, scalar_type, vec_temp, bitsize,
4928 bitsize_zero_node);
4929 epilog_stmt = gimple_build_assign (new_scalar_dest, rhs);
4930 new_temp = make_ssa_name (new_scalar_dest, epilog_stmt);
4931 gimple_assign_set_lhs (epilog_stmt, new_temp);
4932 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
4934 /* In SLP we don't need to apply the reduction operation, so we just
4935 collect s' values in SCALAR_RESULTS. */
4936 if (slp_reduc)
4937 scalar_results.safe_push (new_temp);
4939 for (bit_offset = element_bitsize;
4940 bit_offset < vec_size_in_bits;
4941 bit_offset += element_bitsize)
4943 tree bitpos = bitsize_int (bit_offset);
4944 tree rhs = build3 (BIT_FIELD_REF, scalar_type, vec_temp,
4945 bitsize, bitpos);
4947 epilog_stmt = gimple_build_assign (new_scalar_dest, rhs);
4948 new_name = make_ssa_name (new_scalar_dest, epilog_stmt);
4949 gimple_assign_set_lhs (epilog_stmt, new_name);
4950 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
4952 if (slp_reduc)
4954 /* In SLP we don't need to apply the reduction operation, so
4955 we just collect s' values in SCALAR_RESULTS. */
4956 new_temp = new_name;
4957 scalar_results.safe_push (new_name);
4959 else
4961 epilog_stmt = gimple_build_assign (new_scalar_dest, code,
4962 new_name, new_temp);
4963 new_temp = make_ssa_name (new_scalar_dest, epilog_stmt);
4964 gimple_assign_set_lhs (epilog_stmt, new_temp);
4965 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
4970 /* The only case where we need to reduce scalar results in SLP is
4971 unrolling. If the size of SCALAR_RESULTS is greater than
4972 GROUP_SIZE, we reduce them by combining elements modulo
4973 GROUP_SIZE. */
4974 if (slp_reduc)
4976 tree res, first_res, new_res;
4977 gimple *new_stmt;
4979 /* Reduce multiple scalar results in case of SLP unrolling. */
4980 for (j = group_size; scalar_results.iterate (j, &res);
4981 j++)
4983 first_res = scalar_results[j % group_size];
4984 new_stmt = gimple_build_assign (new_scalar_dest, code,
4985 first_res, res);
4986 new_res = make_ssa_name (new_scalar_dest, new_stmt);
4987 gimple_assign_set_lhs (new_stmt, new_res);
4988 gsi_insert_before (&exit_gsi, new_stmt, GSI_SAME_STMT);
4989 scalar_results[j % group_size] = new_res;
4992 else
4993 /* Not SLP - we have one scalar to keep in SCALAR_RESULTS. */
4994 scalar_results.safe_push (new_temp);
4998 vect_finalize_reduction:
5000 if (double_reduc)
5001 loop = loop->inner;
5003 /* 2.5 Adjust the final result by the initial value of the reduction
5004 variable. (When such adjustment is not needed, then
5005 'adjustment_def' is zero). For example, if code is PLUS we create:
5006 new_temp = loop_exit_def + adjustment_def */
5008 if (adjustment_def)
5010 gcc_assert (!slp_reduc);
5011 if (nested_in_vect_loop)
5013 new_phi = new_phis[0];
5014 gcc_assert (TREE_CODE (TREE_TYPE (adjustment_def)) == VECTOR_TYPE);
5015 expr = build2 (code, vectype, PHI_RESULT (new_phi), adjustment_def);
5016 new_dest = vect_create_destination_var (scalar_dest, vectype);
5018 else
5020 new_temp = scalar_results[0];
5021 gcc_assert (TREE_CODE (TREE_TYPE (adjustment_def)) != VECTOR_TYPE);
5022 expr = build2 (code, scalar_type, new_temp, adjustment_def);
5023 new_dest = vect_create_destination_var (scalar_dest, scalar_type);
5026 epilog_stmt = gimple_build_assign (new_dest, expr);
5027 new_temp = make_ssa_name (new_dest, epilog_stmt);
5028 gimple_assign_set_lhs (epilog_stmt, new_temp);
5029 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5030 if (nested_in_vect_loop)
5032 set_vinfo_for_stmt (epilog_stmt,
5033 new_stmt_vec_info (epilog_stmt, loop_vinfo));
5034 STMT_VINFO_RELATED_STMT (vinfo_for_stmt (epilog_stmt)) =
5035 STMT_VINFO_RELATED_STMT (vinfo_for_stmt (new_phi));
5037 if (!double_reduc)
5038 scalar_results.quick_push (new_temp);
5039 else
5040 scalar_results[0] = new_temp;
5042 else
5043 scalar_results[0] = new_temp;
5045 new_phis[0] = epilog_stmt;
5048 /* 2.6 Handle the loop-exit phis. Replace the uses of scalar loop-exit
5049 phis with new adjusted scalar results, i.e., replace use <s_out0>
5050 with use <s_out4>.
5052 Transform:
5053 loop_exit:
5054 s_out0 = phi <s_loop> # (scalar) EXIT_PHI
5055 v_out1 = phi <VECT_DEF> # NEW_EXIT_PHI
5056 v_out2 = reduce <v_out1>
5057 s_out3 = extract_field <v_out2, 0>
5058 s_out4 = adjust_result <s_out3>
5059 use <s_out0>
5060 use <s_out0>
5062 into:
5064 loop_exit:
5065 s_out0 = phi <s_loop> # (scalar) EXIT_PHI
5066 v_out1 = phi <VECT_DEF> # NEW_EXIT_PHI
5067 v_out2 = reduce <v_out1>
5068 s_out3 = extract_field <v_out2, 0>
5069 s_out4 = adjust_result <s_out3>
5070 use <s_out4>
5071 use <s_out4> */
5074 /* In an SLP reduction chain we reduce vector results into one vector if
5075 necessary, hence we set GROUP_SIZE to 1 here. SCALAR_DEST is the LHS of
5076 the last stmt in the reduction chain, since we are looking for the loop
5077 exit phi node. */
5078 if (GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)))
5080 gimple *dest_stmt = SLP_TREE_SCALAR_STMTS (slp_node)[group_size - 1];
5081 /* Handle reduction patterns. */
5082 if (STMT_VINFO_RELATED_STMT (vinfo_for_stmt (dest_stmt)))
5083 dest_stmt = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (dest_stmt));
5085 scalar_dest = gimple_assign_lhs (dest_stmt);
5086 group_size = 1;
5089 /* In SLP we may have several statements in NEW_PHIS and REDUCTION_PHIS (in
5090 case GROUP_SIZE is greater than the vectorization factor). Therefore, we
5091 need to match SCALAR_RESULTS with corresponding statements. The first
5092 (GROUP_SIZE / number of new vector stmts) scalar results correspond to
5093 the first vector stmt, etc.
5094 (RATIO is equal to (GROUP_SIZE / number of new vector stmts)). */
5095 if (group_size > new_phis.length ())
5097 ratio = group_size / new_phis.length ();
5098 gcc_assert (!(group_size % new_phis.length ()));
5100 else
5101 ratio = 1;
5103 for (k = 0; k < group_size; k++)
5105 if (k % ratio == 0)
5107 epilog_stmt = new_phis[k / ratio];
5108 reduction_phi = reduction_phis[k / ratio];
5109 if (double_reduc)
5110 inner_phi = inner_phis[k / ratio];
5113 if (slp_reduc)
5115 gimple *current_stmt = SLP_TREE_SCALAR_STMTS (slp_node)[k];
5117 orig_stmt = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (current_stmt));
5118 /* SLP statements can't participate in patterns. */
5119 gcc_assert (!orig_stmt);
5120 scalar_dest = gimple_assign_lhs (current_stmt);
5123 phis.create (3);
5124 /* Find the loop-closed-use at the loop exit of the original scalar
5125 result. (The reduction result is expected to have two immediate uses -
5126 one at the latch block, and one at the loop exit). */
5127 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, scalar_dest)
5128 if (!flow_bb_inside_loop_p (loop, gimple_bb (USE_STMT (use_p)))
5129 && !is_gimple_debug (USE_STMT (use_p)))
5130 phis.safe_push (USE_STMT (use_p));
5132 /* While we expect to have found an exit_phi because of loop-closed-ssa
5133 form we can end up without one if the scalar cycle is dead. */
5135 FOR_EACH_VEC_ELT (phis, i, exit_phi)
5137 if (outer_loop)
5139 stmt_vec_info exit_phi_vinfo = vinfo_for_stmt (exit_phi);
5140 gphi *vect_phi;
5142 /* FORNOW. Currently not supporting the case that an inner-loop
5143 reduction is not used in the outer-loop (but only outside the
5144 outer-loop), unless it is double reduction. */
5145 gcc_assert ((STMT_VINFO_RELEVANT_P (exit_phi_vinfo)
5146 && !STMT_VINFO_LIVE_P (exit_phi_vinfo))
5147 || double_reduc);
5149 if (double_reduc)
5150 STMT_VINFO_VEC_STMT (exit_phi_vinfo) = inner_phi;
5151 else
5152 STMT_VINFO_VEC_STMT (exit_phi_vinfo) = epilog_stmt;
5153 if (!double_reduc
5154 || STMT_VINFO_DEF_TYPE (exit_phi_vinfo)
5155 != vect_double_reduction_def)
5156 continue;
5158 /* Handle double reduction:
5160 stmt1: s1 = phi <s0, s2> - double reduction phi (outer loop)
5161 stmt2: s3 = phi <s1, s4> - (regular) reduc phi (inner loop)
5162 stmt3: s4 = use (s3) - (regular) reduc stmt (inner loop)
5163 stmt4: s2 = phi <s4> - double reduction stmt (outer loop)
5165 At that point the regular reduction (stmt2 and stmt3) is
5166 already vectorized, as well as the exit phi node, stmt4.
5167 Here we vectorize the phi node of double reduction, stmt1, and
5168 update all relevant statements. */
5170 /* Go through all the uses of s2 to find double reduction phi
5171 node, i.e., stmt1 above. */
5172 orig_name = PHI_RESULT (exit_phi);
5173 FOR_EACH_IMM_USE_STMT (use_stmt, imm_iter, orig_name)
5175 stmt_vec_info use_stmt_vinfo;
5176 stmt_vec_info new_phi_vinfo;
5177 tree vect_phi_init, preheader_arg, vect_phi_res, init_def;
5178 basic_block bb = gimple_bb (use_stmt);
5179 gimple *use;
5181 /* Check that USE_STMT is really double reduction phi
5182 node. */
5183 if (gimple_code (use_stmt) != GIMPLE_PHI
5184 || gimple_phi_num_args (use_stmt) != 2
5185 || bb->loop_father != outer_loop)
5186 continue;
5187 use_stmt_vinfo = vinfo_for_stmt (use_stmt);
5188 if (!use_stmt_vinfo
5189 || STMT_VINFO_DEF_TYPE (use_stmt_vinfo)
5190 != vect_double_reduction_def)
5191 continue;
5193 /* Create vector phi node for double reduction:
5194 vs1 = phi <vs0, vs2>
5195 vs1 was created previously in this function by a call to
5196 vect_get_vec_def_for_operand and is stored in
5197 vec_initial_def;
5198 vs2 is defined by INNER_PHI, the vectorized EXIT_PHI;
5199 vs0 is created here. */
5201 /* Create vector phi node. */
5202 vect_phi = create_phi_node (vec_initial_def, bb);
5203 new_phi_vinfo = new_stmt_vec_info (vect_phi,
5204 loop_vec_info_for_loop (outer_loop));
5205 set_vinfo_for_stmt (vect_phi, new_phi_vinfo);
5207 /* Create vs0 - initial def of the double reduction phi. */
5208 preheader_arg = PHI_ARG_DEF_FROM_EDGE (use_stmt,
5209 loop_preheader_edge (outer_loop));
5210 init_def = get_initial_def_for_reduction (stmt,
5211 preheader_arg, NULL);
5212 vect_phi_init = vect_init_vector (use_stmt, init_def,
5213 vectype, NULL);
5215 /* Update phi node arguments with vs0 and vs2. */
5216 add_phi_arg (vect_phi, vect_phi_init,
5217 loop_preheader_edge (outer_loop),
5218 UNKNOWN_LOCATION);
5219 add_phi_arg (vect_phi, PHI_RESULT (inner_phi),
5220 loop_latch_edge (outer_loop), UNKNOWN_LOCATION);
5221 if (dump_enabled_p ())
5223 dump_printf_loc (MSG_NOTE, vect_location,
5224 "created double reduction phi node: ");
5225 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, vect_phi, 0);
5228 vect_phi_res = PHI_RESULT (vect_phi);
5230 /* Replace the use, i.e., set the correct vs1 in the regular
5231 reduction phi node. FORNOW, NCOPIES is always 1, so the
5232 loop is redundant. */
5233 use = reduction_phi;
5234 for (j = 0; j < ncopies; j++)
5236 edge pr_edge = loop_preheader_edge (loop);
5237 SET_PHI_ARG_DEF (use, pr_edge->dest_idx, vect_phi_res);
5238 use = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (use));
5244 phis.release ();
5245 if (nested_in_vect_loop)
5247 if (double_reduc)
5248 loop = outer_loop;
5249 else
5250 continue;
5253 phis.create (3);
5254 /* Find the loop-closed-use at the loop exit of the original scalar
5255 result. (The reduction result is expected to have two immediate uses,
5256 one at the latch block, and one at the loop exit). For double
5257 reductions we are looking for exit phis of the outer loop. */
5258 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, scalar_dest)
5260 if (!flow_bb_inside_loop_p (loop, gimple_bb (USE_STMT (use_p))))
5262 if (!is_gimple_debug (USE_STMT (use_p)))
5263 phis.safe_push (USE_STMT (use_p));
5265 else
5267 if (double_reduc && gimple_code (USE_STMT (use_p)) == GIMPLE_PHI)
5269 tree phi_res = PHI_RESULT (USE_STMT (use_p));
5271 FOR_EACH_IMM_USE_FAST (phi_use_p, phi_imm_iter, phi_res)
5273 if (!flow_bb_inside_loop_p (loop,
5274 gimple_bb (USE_STMT (phi_use_p)))
5275 && !is_gimple_debug (USE_STMT (phi_use_p)))
5276 phis.safe_push (USE_STMT (phi_use_p));
5282 FOR_EACH_VEC_ELT (phis, i, exit_phi)
5284 /* Replace the uses: */
5285 orig_name = PHI_RESULT (exit_phi);
5286 scalar_result = scalar_results[k];
5287 FOR_EACH_IMM_USE_STMT (use_stmt, imm_iter, orig_name)
5288 FOR_EACH_IMM_USE_ON_STMT (use_p, imm_iter)
5289 SET_USE (use_p, scalar_result);
5292 phis.release ();
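/* Illustrative sketch only (not part of this pass, kept out of the build):
   scheme 2 above (reduction via whole-vector shifts) in scalar terms,
   assuming 8 lanes and a PLUS reduction.  Each step combines the upper and
   lower halves of the live lanes, so log2(8) = 3 combining steps replace
   7 scalar additions.  */
#if 0
static int
shift_reduce_sketch (int lanes[8])
{
  int offset, j;

  /* for (offset = nelements/2; offset >= 1; offset /= 2)
       va' = vec_shift <va, offset>;  va = vop <va, va'>;  */
  for (offset = 8 / 2; offset >= 1; offset /= 2)
    for (j = 0; j < offset; j++)
      lanes[j] += lanes[j + offset];

  /* s_out3 = extract_field <v_out2, 0>.  */
  return lanes[0];
}
#endif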
5297 /* Function is_nonwrapping_integer_induction.
5299 Check that STMT (which is part of loop LOOP) is an integer induction
5300 that increments and does not cause overflow. */
5302 static bool
5303 is_nonwrapping_integer_induction (gimple *stmt, struct loop *loop)
5305 stmt_vec_info stmt_vinfo = vinfo_for_stmt (stmt);
5306 tree base = STMT_VINFO_LOOP_PHI_EVOLUTION_BASE_UNCHANGED (stmt_vinfo);
5307 tree step = STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_vinfo);
5308 tree lhs_type = TREE_TYPE (gimple_phi_result (stmt));
5309 widest_int ni, max_loop_value, lhs_max;
5310 bool overflow = false;
5312 /* Make sure the loop is integer based. */
5313 if (TREE_CODE (base) != INTEGER_CST
5314 || TREE_CODE (step) != INTEGER_CST)
5315 return false;
5317 /* Check that the induction increments. */
5318 if (tree_int_cst_sgn (step) == -1)
5319 return false;
5321 /* Check that the max size of the loop will not wrap. */
5323 if (TYPE_OVERFLOW_UNDEFINED (lhs_type))
5324 return true;
5326 if (! max_stmt_executions (loop, &ni))
5327 return false;
5329 max_loop_value = wi::mul (wi::to_widest (step), ni, TYPE_SIGN (lhs_type),
5330 &overflow);
5331 if (overflow)
5332 return false;
5334 max_loop_value = wi::add (wi::to_widest (base), max_loop_value,
5335 TYPE_SIGN (lhs_type), &overflow);
5336 if (overflow)
5337 return false;
5339 return (wi::min_precision (max_loop_value, TYPE_SIGN (lhs_type))
5340 <= TYPE_PRECISION (lhs_type));
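/* Illustrative sketch only (not part of this pass, kept out of the build):
   the non-wrapping test above in plain arithmetic, assuming an unsigned
   induction type of PREC bits with PREC < 64.  Names are hypothetical.  */
#if 0
static int
nonwrapping_sketch (unsigned long long base, unsigned long long step,
                    unsigned long long niters, unsigned prec)
{
  unsigned long long limit = (1ULL << prec) - 1;

  /* The largest value the IV reaches is base + step * niters; it must
     still fit in PREC bits.  Guard the multiplication against overflow.  */
  if (base > limit)
    return 0;
  if (step != 0 && niters > (limit - base) / step)
    return 0;
  return 1;
}
#endif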
5343 /* Function vectorizable_reduction.
5345 Check if STMT performs a reduction operation that can be vectorized.
5346 If VEC_STMT is also passed, vectorize the STMT: create a vectorized
5347 stmt to replace it, put it in VEC_STMT, and insert it at GSI.
5348 Return FALSE if not a vectorizable STMT, TRUE otherwise.
5350 This function also handles reduction idioms (patterns) that have been
5351 recognized in advance during vect_pattern_recog. In this case, STMT may be
5352 of this form:
5353 X = pattern_expr (arg0, arg1, ..., X)
5354 and its STMT_VINFO_RELATED_STMT points to the last stmt in the original
5355 sequence that had been detected and replaced by the pattern-stmt (STMT).
5357 This function also handles reduction of condition expressions, for example:
5358 for (int i = 0; i < N; i++)
5359 if (a[i] < value)
5360 last = a[i];
5361 This is handled by vectorizing the loop and creating an additional vector
5362 containing the loop indexes for which "a[i] < value" was true. In the
5363 function epilogue this is reduced to a single max value and then used to
5364 index into the vector of results.
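   A minimal sketch with hypothetical data: for N == 4, a vectorization
   factor of 4, value == 5 and a[] == { 7, 2, 9, 4 }, the comparison vector
   is { 0, 1, 0, 1 } and the index vector (1-based, with 0 meaning "no
   match") ends up as { 0, 2, 0, 4 }. The epilogue reduces the index vector
   to its maximum, 4, which identifies the last iteration for which the
   condition held, and the corresponding element of the result vector,
   a[3] == 4, becomes the value of "last".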
5366 In some cases of reduction patterns, the type of the reduction variable X is
5367 different than the type of the other arguments of STMT.
5368 In such cases, the vectype that is used when transforming STMT into a vector
5369 stmt is different than the vectype that is used to determine the
5370 vectorization factor, because it consists of a different number of elements
5371 than the actual number of elements that are being operated upon in parallel.
5373 For example, consider an accumulation of shorts into an int accumulator.
5374 On some targets it's possible to vectorize this pattern operating on 8
5375 shorts at a time (hence, the vectype for purposes of determining the
5376 vectorization factor should be V8HI); on the other hand, the vectype that
5377 is used to create the vector form is actually V4SI (the type of the result).
5379 Upon entry to this function, STMT_VINFO_VECTYPE records the vectype that
5380 indicates what is the actual level of parallelism (V8HI in the example), so
5381 that the right vectorization factor would be derived. This vectype
5382 corresponds to the type of arguments to the reduction stmt, and should *NOT*
5383 be used to create the vectorized stmt. The right vectype for the vectorized
5384 stmt is obtained from the type of the result X:
5385 get_vectype_for_scalar_type (TREE_TYPE (X))
5387 This means that, contrary to "regular" reductions (or "regular" stmts in
5388 general), the following equation:
5389 STMT_VINFO_VECTYPE == get_vectype_for_scalar_type (TREE_TYPE (X))
5390 does *NOT* necessarily hold for reduction patterns. */
5392 bool
5393 vectorizable_reduction (gimple *stmt, gimple_stmt_iterator *gsi,
5394 gimple **vec_stmt, slp_tree slp_node)
5396 tree vec_dest;
5397 tree scalar_dest;
5398 tree loop_vec_def0 = NULL_TREE, loop_vec_def1 = NULL_TREE;
5399 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
5400 tree vectype_out = STMT_VINFO_VECTYPE (stmt_info);
5401 tree vectype_in = NULL_TREE;
5402 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
5403 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
5404 enum tree_code code, orig_code, epilog_reduc_code;
5405 machine_mode vec_mode;
5406 int op_type;
5407 optab optab, reduc_optab;
5408 tree new_temp = NULL_TREE;
5409 gimple *def_stmt;
5410 enum vect_def_type dt, cond_reduc_dt = vect_unknown_def_type;
5411 gphi *new_phi = NULL;
5412 tree scalar_type;
5413 bool is_simple_use;
5414 gimple *orig_stmt;
5415 stmt_vec_info orig_stmt_info;
5416 tree expr = NULL_TREE;
5417 int i;
5418 int ncopies;
5419 int epilog_copies;
5420 stmt_vec_info prev_stmt_info, prev_phi_info;
5421 bool single_defuse_cycle = false;
5422 tree reduc_def = NULL_TREE;
5423 gimple *new_stmt = NULL;
5424 int j;
5425 tree ops[3];
5426 bool nested_cycle = false, found_nested_cycle_def = false;
5427 gimple *reduc_def_stmt = NULL;
5428 bool double_reduc = false, dummy;
5429 basic_block def_bb;
5430 struct loop * def_stmt_loop, *outer_loop = NULL;
5431 tree def_arg;
5432 gimple *def_arg_stmt;
5433 auto_vec<tree> vec_oprnds0;
5434 auto_vec<tree> vec_oprnds1;
5435 auto_vec<tree> vect_defs;
5436 auto_vec<gimple *> phis;
5437 int vec_num;
5438 tree def0, def1, tem, op1 = NULL_TREE;
5439 bool first_p = true;
5440 tree cr_index_scalar_type = NULL_TREE, cr_index_vector_type = NULL_TREE;
5441 tree cond_reduc_val = NULL_TREE, const_cond_cmp = NULL_TREE;
5443 /* In case of a reduction chain we switch to the first stmt in the chain, but
5444 we don't update STMT_INFO, since only the last stmt is marked as a reduction
5445 and has reduction properties. */
5446 if (GROUP_FIRST_ELEMENT (stmt_info)
5447 && GROUP_FIRST_ELEMENT (stmt_info) != stmt)
5449 stmt = GROUP_FIRST_ELEMENT (stmt_info);
5450 first_p = false;
5453 if (nested_in_vect_loop_p (loop, stmt))
5455 outer_loop = loop;
5456 loop = loop->inner;
5457 nested_cycle = true;
5460 /* 1. Is vectorizable reduction? */
5461 /* Not supportable if the reduction variable is used in the loop, unless
5462 it's a reduction chain. */
5463 if (STMT_VINFO_RELEVANT (stmt_info) > vect_used_in_outer
5464 && !GROUP_FIRST_ELEMENT (stmt_info))
5465 return false;
5467 /* Reductions that are not used even in an enclosing outer-loop
5468 are expected to be "live" (used out of the loop). */
5469 if (STMT_VINFO_RELEVANT (stmt_info) == vect_unused_in_scope
5470 && !STMT_VINFO_LIVE_P (stmt_info))
5471 return false;
5473 /* Make sure it was already recognized as a reduction computation. */
5474 if (STMT_VINFO_DEF_TYPE (vinfo_for_stmt (stmt)) != vect_reduction_def
5475 && STMT_VINFO_DEF_TYPE (vinfo_for_stmt (stmt)) != vect_nested_cycle)
5476 return false;
5478 /* 2. Has this been recognized as a reduction pattern?
5480 Check if STMT represents a pattern that has been recognized
5481 in earlier analysis stages. For stmts that represent a pattern,
5482 the STMT_VINFO_RELATED_STMT field records the last stmt in
5483 the original sequence that constitutes the pattern. */
5485 orig_stmt = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (stmt));
5486 if (orig_stmt)
5488 orig_stmt_info = vinfo_for_stmt (orig_stmt);
5489 gcc_assert (STMT_VINFO_IN_PATTERN_P (orig_stmt_info));
5490 gcc_assert (!STMT_VINFO_IN_PATTERN_P (stmt_info));
5493 /* 3. Check the operands of the operation. The first operands are defined
5494 inside the loop body. The last operand is the reduction variable,
5495 which is defined by the loop-header-phi. */
5497 gcc_assert (is_gimple_assign (stmt));
5499 /* Flatten RHS. */
5500 switch (get_gimple_rhs_class (gimple_assign_rhs_code (stmt)))
5502 case GIMPLE_SINGLE_RHS:
5503 op_type = TREE_OPERAND_LENGTH (gimple_assign_rhs1 (stmt));
5504 if (op_type == ternary_op)
5506 tree rhs = gimple_assign_rhs1 (stmt);
5507 ops[0] = TREE_OPERAND (rhs, 0);
5508 ops[1] = TREE_OPERAND (rhs, 1);
5509 ops[2] = TREE_OPERAND (rhs, 2);
5510 code = TREE_CODE (rhs);
5512 else
5513 return false;
5514 break;
5516 case GIMPLE_BINARY_RHS:
5517 code = gimple_assign_rhs_code (stmt);
5518 op_type = TREE_CODE_LENGTH (code);
5519 gcc_assert (op_type == binary_op);
5520 ops[0] = gimple_assign_rhs1 (stmt);
5521 ops[1] = gimple_assign_rhs2 (stmt);
5522 break;
5524 case GIMPLE_TERNARY_RHS:
5525 code = gimple_assign_rhs_code (stmt);
5526 op_type = TREE_CODE_LENGTH (code);
5527 gcc_assert (op_type == ternary_op);
5528 ops[0] = gimple_assign_rhs1 (stmt);
5529 ops[1] = gimple_assign_rhs2 (stmt);
5530 ops[2] = gimple_assign_rhs3 (stmt);
5531 break;
5533 case GIMPLE_UNARY_RHS:
5534 return false;
5536 default:
5537 gcc_unreachable ();
5539 /* The default is that the reduction variable is the last operand in the statement. */
5540 int reduc_index = op_type - 1;
5541 if (code == MINUS_EXPR)
5542 reduc_index = 0;
5544 if (code == COND_EXPR && slp_node)
5545 return false;
5547 scalar_dest = gimple_assign_lhs (stmt);
5548 scalar_type = TREE_TYPE (scalar_dest);
5549 if (!POINTER_TYPE_P (scalar_type) && !INTEGRAL_TYPE_P (scalar_type)
5550 && !SCALAR_FLOAT_TYPE_P (scalar_type))
5551 return false;
5553 /* Do not try to vectorize bit-precision reductions. */
5554 if ((TYPE_PRECISION (scalar_type)
5555 != GET_MODE_PRECISION (TYPE_MODE (scalar_type))))
5556 return false;
5558 /* All operands but the last are expected to be defined in the loop.
5559 The last operand is the reduction variable. In case of a nested cycle this
5560 assumption is not true: we use reduc_index to record the index of the
5561 reduction variable. */
5562 for (i = 0; i < op_type; i++)
5564 if (i == reduc_index)
5565 continue;
5567 /* The condition of COND_EXPR is checked in vectorizable_condition(). */
5568 if (i == 0 && code == COND_EXPR)
5569 continue;
5571 is_simple_use = vect_is_simple_use (ops[i], loop_vinfo,
5572 &def_stmt, &dt, &tem);
5573 if (!vectype_in)
5574 vectype_in = tem;
5575 gcc_assert (is_simple_use);
5577 if (dt != vect_internal_def
5578 && dt != vect_external_def
5579 && dt != vect_constant_def
5580 && dt != vect_induction_def
5581 && !(dt == vect_nested_cycle && nested_cycle))
5582 return false;
5584 if (dt == vect_nested_cycle)
5586 found_nested_cycle_def = true;
5587 reduc_def_stmt = def_stmt;
5588 reduc_index = i;
5591 if (i == 1 && code == COND_EXPR)
5593 /* Record how value of COND_EXPR is defined. */
5594 if (dt == vect_constant_def)
5596 cond_reduc_dt = dt;
5597 cond_reduc_val = ops[i];
5599 if (dt == vect_induction_def && def_stmt != NULL
5600 && is_nonwrapping_integer_induction (def_stmt, loop))
5601 cond_reduc_dt = dt;
5605 is_simple_use = vect_is_simple_use (ops[reduc_index], loop_vinfo,
5606 &def_stmt, &dt, &tem);
5607 if (!vectype_in)
5608 vectype_in = tem;
5609 gcc_assert (is_simple_use);
5610 if (!found_nested_cycle_def)
5611 reduc_def_stmt = def_stmt;
5613 if (reduc_def_stmt && gimple_code (reduc_def_stmt) != GIMPLE_PHI)
5614 return false;
5616 if (!(dt == vect_reduction_def
5617 || dt == vect_nested_cycle
5618 || ((dt == vect_internal_def || dt == vect_external_def
5619 || dt == vect_constant_def || dt == vect_induction_def)
5620 && nested_cycle && found_nested_cycle_def)))
5622 /* For pattern recognized stmts, orig_stmt might be a reduction,
5623 but some helper statements for the pattern might not, or
5624 might be COND_EXPRs with reduction uses in the condition. */
5625 gcc_assert (orig_stmt);
5626 return false;
5629 enum vect_reduction_type v_reduc_type;
5630 gimple *tmp = vect_is_simple_reduction (loop_vinfo, reduc_def_stmt,
5631 !nested_cycle, &dummy, false,
5632 &v_reduc_type);
5634 STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info) = v_reduc_type;
5635 /* If we have a condition reduction, see if we can simplify it further. */
5636 if (v_reduc_type == COND_REDUCTION)
5638 if (cond_reduc_dt == vect_induction_def)
5640 if (dump_enabled_p ())
5641 dump_printf_loc (MSG_NOTE, vect_location,
5642 "condition expression based on "
5643 "integer induction.\n");
5644 STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info)
5645 = INTEGER_INDUC_COND_REDUCTION;
5648 if (cond_reduc_dt == vect_constant_def)
5650 enum vect_def_type cond_initial_dt;
5651 gimple *def_stmt = SSA_NAME_DEF_STMT (ops[reduc_index]);
5652 tree cond_initial_val
5653 = PHI_ARG_DEF_FROM_EDGE (def_stmt, loop_preheader_edge (loop));
5655 gcc_assert (cond_reduc_val != NULL_TREE);
5656 vect_is_simple_use (cond_initial_val, loop_vinfo,
5657 &def_stmt, &cond_initial_dt);
5658 if (cond_initial_dt == vect_constant_def
5659 && types_compatible_p (TREE_TYPE (cond_initial_val),
5660 TREE_TYPE (cond_reduc_val)))
5662 tree e = fold_build2 (LE_EXPR, boolean_type_node,
5663 cond_initial_val, cond_reduc_val);
5664 if (e && (integer_onep (e) || integer_zerop (e)))
5666 if (dump_enabled_p ())
5667 dump_printf_loc (MSG_NOTE, vect_location,
5668 "condition expression based on "
5669 "compile time constant.\n");
5670 const_cond_cmp = e;
5671 STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info)
5672 = CONST_COND_REDUCTION;
5678 if (orig_stmt)
5679 gcc_assert (tmp == orig_stmt
5680 || GROUP_FIRST_ELEMENT (vinfo_for_stmt (tmp)) == orig_stmt);
5681 else
5682 /* We changed STMT to be the first stmt in reduction chain, hence we
5683 check that in this case the first element in the chain is STMT. */
5684 gcc_assert (stmt == tmp
5685 || GROUP_FIRST_ELEMENT (vinfo_for_stmt (tmp)) == stmt);
5687 if (STMT_VINFO_LIVE_P (vinfo_for_stmt (reduc_def_stmt)))
5688 return false;
5690 if (slp_node)
5691 ncopies = 1;
5692 else
5693 ncopies = (LOOP_VINFO_VECT_FACTOR (loop_vinfo)
5694 / TYPE_VECTOR_SUBPARTS (vectype_in));
5696 gcc_assert (ncopies >= 1);
5698 vec_mode = TYPE_MODE (vectype_in);
5700 if (code == COND_EXPR)
5702 /* Only call during the analysis stage, otherwise we'll lose
5703 STMT_VINFO_TYPE. */
5704 if (!vec_stmt && !vectorizable_condition (stmt, gsi, NULL,
5705 ops[reduc_index], 0, NULL))
5707 if (dump_enabled_p ())
5708 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5709 "unsupported condition in reduction\n");
5710 return false;
5713 else
5715 /* 4. Supportable by target? */
5717 if (code == LSHIFT_EXPR || code == RSHIFT_EXPR
5718 || code == LROTATE_EXPR || code == RROTATE_EXPR)
5720 /* Shifts and rotates are only supported by vectorizable_shifts,
5721 not vectorizable_reduction. */
5722 if (dump_enabled_p ())
5723 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5724 "unsupported shift or rotation.\n");
5725 return false;
5728 /* 4.1. check support for the operation in the loop */
5729 optab = optab_for_tree_code (code, vectype_in, optab_default);
5730 if (!optab)
5732 if (dump_enabled_p ())
5733 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5734 "no optab.\n");
5736 return false;
5739 if (optab_handler (optab, vec_mode) == CODE_FOR_nothing)
5741 if (dump_enabled_p ())
5742 dump_printf (MSG_NOTE, "op not supported by target.\n");
5744 if (GET_MODE_SIZE (vec_mode) != UNITS_PER_WORD
5745 || LOOP_VINFO_VECT_FACTOR (loop_vinfo)
5746 < vect_min_worthwhile_factor (code))
5747 return false;
5749 if (dump_enabled_p ())
5750 dump_printf (MSG_NOTE, "proceeding using word mode.\n");
5753 /* Worthwhile without SIMD support? */
5754 if (!VECTOR_MODE_P (TYPE_MODE (vectype_in))
5755 && LOOP_VINFO_VECT_FACTOR (loop_vinfo)
5756 < vect_min_worthwhile_factor (code))
5758 if (dump_enabled_p ())
5759 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5760 "not worthwhile without SIMD support.\n");
5762 return false;
5766 /* 4.2. Check support for the epilog operation.
5768 If STMT represents a reduction pattern, then the type of the
5769 reduction variable may be different than the type of the rest
5770 of the arguments. For example, consider the case of accumulation
5771 of shorts into an int accumulator; The original code:
5772 S1: int_a = (int) short_a;
5773 orig_stmt-> S2: int_acc = plus <int_a ,int_acc>;
5775 was replaced with:
5776 STMT: int_acc = widen_sum <short_a, int_acc>
5778 This means that:
5779 1. The tree-code that is used to create the vector operation in the
5780 epilog code (that reduces the partial results) is not the
5781 tree-code of STMT, but is rather the tree-code of the original
5782 stmt from the pattern that STMT is replacing. I.e, in the example
5783 above we want to use 'widen_sum' in the loop, but 'plus' in the
5784 epilog.
5785 2. The type (mode) we use to check available target support
5786 for the vector operation to be created in the *epilog*, is
5787 determined by the type of the reduction variable (in the example
5788 above we'd check this: optab_handler (plus_optab, vect_int_mode)).
5789 However the type (mode) we use to check available target support
5790 for the vector operation to be created *inside the loop*, is
5791 determined by the type of the other arguments to STMT (in the
5792 example we'd check this: optab_handler (widen_sum_optab,
5793 vect_short_mode)).
5795 This is contrary to "regular" reductions, in which the types of all
5796 the arguments are the same as the type of the reduction variable.
5797 For "regular" reductions we can therefore use the same vector type
5798 (and also the same tree-code) when generating the epilog code and
5799 when generating the code inside the loop. */
5801 if (orig_stmt)
5803 /* This is a reduction pattern: get the vectype from the type of the
5804 reduction variable, and get the tree-code from orig_stmt. */
5805 gcc_assert (STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info)
5806 == TREE_CODE_REDUCTION);
5807 orig_code = gimple_assign_rhs_code (orig_stmt);
5808 gcc_assert (vectype_out);
5809 vec_mode = TYPE_MODE (vectype_out);
5811 else
5813 /* Regular reduction: the same vectype and tree-code as used for
5814 the vector code inside the loop can be used for the epilog code. */
5815 orig_code = code;
5817 if (code == MINUS_EXPR)
5818 orig_code = PLUS_EXPR;
5820 /* For simple condition reductions, replace with the actual expression
5821 we want to base our reduction around. */
5822 if (STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info) == CONST_COND_REDUCTION)
5824 gcc_assert (const_cond_cmp != NULL_TREE);
5825 gcc_assert (integer_onep (const_cond_cmp)
5826 || integer_zerop (const_cond_cmp));
5827 orig_code = integer_onep (const_cond_cmp) ? MAX_EXPR : MIN_EXPR;
5829 else if (STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info)
5830 == INTEGER_INDUC_COND_REDUCTION)
5831 orig_code = MAX_EXPR;
5834 if (nested_cycle)
5836 def_bb = gimple_bb (reduc_def_stmt);
5837 def_stmt_loop = def_bb->loop_father;
5838 def_arg = PHI_ARG_DEF_FROM_EDGE (reduc_def_stmt,
5839 loop_preheader_edge (def_stmt_loop));
5840 if (TREE_CODE (def_arg) == SSA_NAME
5841 && (def_arg_stmt = SSA_NAME_DEF_STMT (def_arg))
5842 && gimple_code (def_arg_stmt) == GIMPLE_PHI
5843 && flow_bb_inside_loop_p (outer_loop, gimple_bb (def_arg_stmt))
5844 && vinfo_for_stmt (def_arg_stmt)
5845 && STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def_arg_stmt))
5846 == vect_double_reduction_def)
5847 double_reduc = true;
5850 epilog_reduc_code = ERROR_MARK;
5852 if (STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info) != COND_REDUCTION)
5854 if (reduction_code_for_scalar_code (orig_code, &epilog_reduc_code))
5856 reduc_optab = optab_for_tree_code (epilog_reduc_code, vectype_out,
5857 optab_default);
5858 if (!reduc_optab)
5860 if (dump_enabled_p ())
5861 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5862 "no optab for reduction.\n");
5864 epilog_reduc_code = ERROR_MARK;
5866 else if (optab_handler (reduc_optab, vec_mode) == CODE_FOR_nothing)
5868 if (dump_enabled_p ())
5869 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5870 "reduc op not supported by target.\n");
5872 epilog_reduc_code = ERROR_MARK;
5875 /* When epilog_reduc_code is ERROR_MARK then a reduction will be
5876 generated in the epilog using multiple expressions. This does not
5877 work for condition reductions. */
5878 if (epilog_reduc_code == ERROR_MARK
5879 && (STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info)
5880 == INTEGER_INDUC_COND_REDUCTION
5881 || STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info)
5882 == CONST_COND_REDUCTION))
5884 if (dump_enabled_p ())
5885 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5886 "no reduc code for scalar code.\n");
5887 return false;
5890 else
5892 if (!nested_cycle || double_reduc)
5894 if (dump_enabled_p ())
5895 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5896 "no reduc code for scalar code.\n");
5898 return false;
5902 else
5904 int scalar_precision = GET_MODE_PRECISION (TYPE_MODE (scalar_type));
5905 cr_index_scalar_type = make_unsigned_type (scalar_precision);
5906 cr_index_vector_type = build_vector_type
5907 (cr_index_scalar_type, TYPE_VECTOR_SUBPARTS (vectype_out));
5909 epilog_reduc_code = REDUC_MAX_EXPR;
5910 optab = optab_for_tree_code (REDUC_MAX_EXPR, cr_index_vector_type,
5911 optab_default);
5912 if (optab_handler (optab, TYPE_MODE (cr_index_vector_type))
5913 == CODE_FOR_nothing)
5915 if (dump_enabled_p ())
5916 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5917 "reduc max op not supported by target.\n");
5918 return false;
5922 if ((double_reduc
5923 || STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info) != TREE_CODE_REDUCTION)
5924 && ncopies > 1)
5926 if (dump_enabled_p ())
5927 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5928 "multiple types in double reduction or condition "
5929 "reduction.\n");
5930 return false;
5933 /* In case of widening multiplication by a constant, we update the type
5934 of the constant to be the type of the other operand. We check that the
5935 constant fits the type in the pattern recognition pass. */
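/* For instance (a hypothetical pattern): for a dot-product that accumulates
   sum += (int) b[i] * 2 with b[] of type short, ops[0] is the short operand
   b[i] while ops[1] is the integer constant 2; the constant is simply
   converted to short so that both multiplication operands share one type. */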
5936 if (code == DOT_PROD_EXPR
5937 && !types_compatible_p (TREE_TYPE (ops[0]), TREE_TYPE (ops[1])))
5939 if (TREE_CODE (ops[0]) == INTEGER_CST)
5940 ops[0] = fold_convert (TREE_TYPE (ops[1]), ops[0]);
5941 else if (TREE_CODE (ops[1]) == INTEGER_CST)
5942 ops[1] = fold_convert (TREE_TYPE (ops[0]), ops[1]);
5943 else
5945 if (dump_enabled_p ())
5946 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5947 "invalid types in dot-prod\n");
5949 return false;
5953 if (STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info) == COND_REDUCTION)
5955 widest_int ni;
5957 if (! max_loop_iterations (loop, &ni))
5959 if (dump_enabled_p ())
5960 dump_printf_loc (MSG_NOTE, vect_location,
5961 "loop count not known, cannot create cond "
5962 "reduction.\n");
5963 return false;
5965 /* Convert backedges to iterations. */
5966 ni += 1;
5968 /* The additional index will be the same type as the condition. Check
5969 that the iteration count fits into this type less one (because we'll use
5970 up the zero slot for when there are no matches). */
5971 tree max_index = TYPE_MAX_VALUE (cr_index_scalar_type);
5972 if (wi::geu_p (ni, wi::to_widest (max_index)))
5974 if (dump_enabled_p ())
5975 dump_printf_loc (MSG_NOTE, vect_location,
5976 "loop size is greater than data size.\n");
5977 return false;
5981 if (!vec_stmt) /* transformation not required. */
5983 if (first_p
5984 && !vect_model_reduction_cost (stmt_info, epilog_reduc_code, ncopies,
5985 reduc_index))
5986 return false;
5987 STMT_VINFO_TYPE (stmt_info) = reduc_vec_info_type;
5988 return true;
5991 /** Transform. **/
5993 if (dump_enabled_p ())
5994 dump_printf_loc (MSG_NOTE, vect_location, "transform reduction.\n");
5996 /* FORNOW: Multiple types are not supported for condition. */
5997 if (code == COND_EXPR)
5998 gcc_assert (ncopies == 1);
6000 /* Create the destination vector */
6001 vec_dest = vect_create_destination_var (scalar_dest, vectype_out);
6003 /* In case the vectorization factor (VF) is bigger than the number
6004 of elements that we can fit in a vectype (nunits), we have to generate
6005 more than one vector stmt - i.e - we need to "unroll" the
6006 vector stmt by a factor VF/nunits. For more details see documentation
6007 in vectorizable_operation. */
6009 /* If the reduction is used in an outer loop we need to generate
6010 VF intermediate results, like so (e.g. for ncopies=2):
6011 r0 = phi (init, r0)
6012 r1 = phi (init, r1)
6013 r0 = x0 + r0;
6014 r1 = x1 + r1;
6015 (i.e. we generate VF results in 2 registers).
6016 In this case we have a separate def-use cycle for each copy, and therefore
6017 for each copy we get the vector def for the reduction variable from the
6018 respective phi node created for this copy.
6020 Otherwise (the reduction is unused in the loop nest), we can combine
6021 together intermediate results, like so (e.g. for ncopies=2):
6022 r = phi (init, r)
6023 r = x0 + r;
6024 r = x1 + r;
6025 (i.e. we generate VF/2 results in a single register).
6026 In this case for each copy we get the vector def for the reduction variable
6027 from the vectorized reduction operation generated in the previous iteration.
6030 if (STMT_VINFO_RELEVANT (stmt_info) <= vect_used_only_live)
6032 single_defuse_cycle = true;
6033 epilog_copies = 1;
6035 else
6036 epilog_copies = ncopies;
6038 prev_stmt_info = NULL;
6039 prev_phi_info = NULL;
6040 if (slp_node)
6041 vec_num = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node);
6042 else
6044 vec_num = 1;
6045 vec_oprnds0.create (1);
6046 if (op_type == ternary_op)
6047 vec_oprnds1.create (1);
6050 phis.create (vec_num);
6051 vect_defs.create (vec_num);
6052 if (!slp_node)
6053 vect_defs.quick_push (NULL_TREE);
6055 for (j = 0; j < ncopies; j++)
6057 if (j == 0 || !single_defuse_cycle)
6059 for (i = 0; i < vec_num; i++)
6061 /* Create the reduction-phi that defines the reduction
6062 operand. */
6063 new_phi = create_phi_node (vec_dest, loop->header);
6064 set_vinfo_for_stmt (new_phi,
6065 new_stmt_vec_info (new_phi, loop_vinfo));
6066 if (j == 0 || slp_node)
6067 phis.quick_push (new_phi);
6071 if (code == COND_EXPR)
6073 gcc_assert (!slp_node);
6074 vectorizable_condition (stmt, gsi, vec_stmt,
6075 PHI_RESULT (phis[0]),
6076 reduc_index, NULL);
6077 /* Multiple types are not supported for condition. */
6078 break;
6081 /* Handle uses. */
6082 if (j == 0)
6084 if (slp_node)
6086 /* Get vec defs for all the operands except the reduction index,
6087 ensuring the ordering of the ops in the vector is kept. */
6088 auto_vec<tree, 3> slp_ops;
6089 auto_vec<vec<tree>, 3> vec_defs;
6091 slp_ops.quick_push ((reduc_index == 0) ? NULL : ops[0]);
6092 slp_ops.quick_push ((reduc_index == 1) ? NULL : ops[1]);
6093 if (op_type == ternary_op)
6094 slp_ops.quick_push ((reduc_index == 2) ? NULL : ops[2]);
6096 vect_get_slp_defs (slp_ops, slp_node, &vec_defs, -1);
6098 vec_oprnds0.safe_splice (vec_defs[(reduc_index == 0) ? 1 : 0]);
6099 if (op_type == ternary_op)
6100 vec_oprnds1.safe_splice (vec_defs[(reduc_index == 2) ? 1 : 2]);
6102 else
6104 loop_vec_def0 = vect_get_vec_def_for_operand (ops[!reduc_index],
6105 stmt);
6106 vec_oprnds0.quick_push (loop_vec_def0);
6107 if (op_type == ternary_op)
6109 op1 = (reduc_index == 0) ? ops[2] : ops[1];
6110 loop_vec_def1 = vect_get_vec_def_for_operand (op1, stmt);
6111 vec_oprnds1.quick_push (loop_vec_def1);
6115 else
6117 if (!slp_node)
6119 enum vect_def_type dt;
6120 gimple *dummy_stmt;
6122 vect_is_simple_use (ops[!reduc_index], loop_vinfo,
6123 &dummy_stmt, &dt);
6124 loop_vec_def0 = vect_get_vec_def_for_stmt_copy (dt,
6125 loop_vec_def0);
6126 vec_oprnds0[0] = loop_vec_def0;
6127 if (op_type == ternary_op)
6129 vect_is_simple_use (op1, loop_vinfo, &dummy_stmt, &dt);
6130 loop_vec_def1 = vect_get_vec_def_for_stmt_copy (dt,
6131 loop_vec_def1);
6132 vec_oprnds1[0] = loop_vec_def1;
6136 if (single_defuse_cycle)
6137 reduc_def = gimple_assign_lhs (new_stmt);
6139 STMT_VINFO_RELATED_STMT (prev_phi_info) = new_phi;
6142 FOR_EACH_VEC_ELT (vec_oprnds0, i, def0)
6144 if (slp_node)
6145 reduc_def = PHI_RESULT (phis[i]);
6146 else
6148 if (!single_defuse_cycle || j == 0)
6149 reduc_def = PHI_RESULT (new_phi);
6152 def1 = ((op_type == ternary_op)
6153 ? vec_oprnds1[i] : NULL);
6154 if (op_type == binary_op)
6156 if (reduc_index == 0)
6157 expr = build2 (code, vectype_out, reduc_def, def0);
6158 else
6159 expr = build2 (code, vectype_out, def0, reduc_def);
6161 else
6163 if (reduc_index == 0)
6164 expr = build3 (code, vectype_out, reduc_def, def0, def1);
6165 else
6167 if (reduc_index == 1)
6168 expr = build3 (code, vectype_out, def0, reduc_def, def1);
6169 else
6170 expr = build3 (code, vectype_out, def0, def1, reduc_def);
6174 new_stmt = gimple_build_assign (vec_dest, expr);
6175 new_temp = make_ssa_name (vec_dest, new_stmt);
6176 gimple_assign_set_lhs (new_stmt, new_temp);
6177 vect_finish_stmt_generation (stmt, new_stmt, gsi);
6179 if (slp_node)
6181 SLP_TREE_VEC_STMTS (slp_node).quick_push (new_stmt);
6182 vect_defs.quick_push (new_temp);
6184 else
6185 vect_defs[0] = new_temp;
6188 if (slp_node)
6189 continue;
6191 if (j == 0)
6192 STMT_VINFO_VEC_STMT (stmt_info) = *vec_stmt = new_stmt;
6193 else
6194 STMT_VINFO_RELATED_STMT (prev_stmt_info) = new_stmt;
6196 prev_stmt_info = vinfo_for_stmt (new_stmt);
6197 prev_phi_info = vinfo_for_stmt (new_phi);
6200 tree indx_before_incr, indx_after_incr, cond_name = NULL;
6202 /* Finalize the reduction-phi (set its arguments) and create the
6203 epilog reduction code. */
6204 if ((!single_defuse_cycle || code == COND_EXPR) && !slp_node)
6206 new_temp = gimple_assign_lhs (*vec_stmt);
6207 vect_defs[0] = new_temp;
6209 /* For cond reductions we want to create a new vector (INDEX_COND_EXPR)
6210 which is updated with the current index of the loop for every match of
6211 the original loop's cond_expr (VEC_STMT). This results in a vector
6212 containing the last time the condition passed for that vector lane.
6213 The first match will be a 1 to allow 0 to be used for non-matching
6214 indexes. If there are no matches at all then the vector will be all
6215 zeroes. */
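/* Sketch of the scheme with hypothetical values for a 4-lane vector: the
   induction created below produces { 1, 2, 3, 4 } on the first iteration and
   { 5, 6, 7, 8 } on the second. If only lane 2 matches on the second
   iteration, that lane of the index vector becomes 7, while the other lanes
   keep their previous values (still 0 if they never matched). */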
6216 if (STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info) == COND_REDUCTION)
6218 int nunits_out = TYPE_VECTOR_SUBPARTS (vectype_out);
6219 int k;
6221 gcc_assert (gimple_assign_rhs_code (*vec_stmt) == VEC_COND_EXPR);
6223 /* First we create a simple vector induction variable which starts
6224 with the values {1,2,3,...} (SERIES_VECT) and increments by the
6225 vector size (STEP). */
6227 /* Create a {1,2,3,...} vector. */
6228 tree *vtemp = XALLOCAVEC (tree, nunits_out);
6229 for (k = 0; k < nunits_out; ++k)
6230 vtemp[k] = build_int_cst (cr_index_scalar_type, k + 1);
6231 tree series_vect = build_vector (cr_index_vector_type, vtemp);
6233 /* Create a vector of the step value. */
6234 tree step = build_int_cst (cr_index_scalar_type, nunits_out);
6235 tree vec_step = build_vector_from_val (cr_index_vector_type, step);
6237 /* Create an induction variable. */
6238 gimple_stmt_iterator incr_gsi;
6239 bool insert_after;
6240 standard_iv_increment_position (loop, &incr_gsi, &insert_after);
6241 create_iv (series_vect, vec_step, NULL_TREE, loop, &incr_gsi,
6242 insert_after, &indx_before_incr, &indx_after_incr);
6244 /* Next create a new phi node vector (NEW_PHI_TREE) which starts
6245 filled with zeros (VEC_ZERO). */
6247 /* Create a vector of 0s. */
6248 tree zero = build_zero_cst (cr_index_scalar_type);
6249 tree vec_zero = build_vector_from_val (cr_index_vector_type, zero);
6251 /* Create a vector phi node. */
6252 tree new_phi_tree = make_ssa_name (cr_index_vector_type);
6253 new_phi = create_phi_node (new_phi_tree, loop->header);
6254 set_vinfo_for_stmt (new_phi,
6255 new_stmt_vec_info (new_phi, loop_vinfo));
6256 add_phi_arg (new_phi, vec_zero, loop_preheader_edge (loop),
6257 UNKNOWN_LOCATION);
6259 /* Now take the condition from the loops original cond_expr
6260 (VEC_STMT) and produce a new cond_expr (INDEX_COND_EXPR) which for
6261 every match uses values from the induction variable
6262 (INDEX_BEFORE_INCR) otherwise uses values from the phi node
6263 (NEW_PHI_TREE).
6264 Finally, we update the phi (NEW_PHI_TREE) to take the value of
6265 the new cond_expr (INDEX_COND_EXPR). */
6267 /* Duplicate the condition from vec_stmt. */
6268 tree ccompare = unshare_expr (gimple_assign_rhs1 (*vec_stmt));
6270 /* Create a conditional, where the condition is taken from vec_stmt
6271 (CCOMPARE), then is the induction index (INDEX_BEFORE_INCR) and
6272 else is the phi (NEW_PHI_TREE). */
6273 tree index_cond_expr = build3 (VEC_COND_EXPR, cr_index_vector_type,
6274 ccompare, indx_before_incr,
6275 new_phi_tree);
6276 cond_name = make_ssa_name (cr_index_vector_type);
6277 gimple *index_condition = gimple_build_assign (cond_name,
6278 index_cond_expr);
6279 gsi_insert_before (&incr_gsi, index_condition, GSI_SAME_STMT);
6280 stmt_vec_info index_vec_info = new_stmt_vec_info (index_condition,
6281 loop_vinfo);
6282 STMT_VINFO_VECTYPE (index_vec_info) = cr_index_vector_type;
6283 set_vinfo_for_stmt (index_condition, index_vec_info);
6285 /* Update the phi with the vec cond. */
6286 add_phi_arg (new_phi, cond_name, loop_latch_edge (loop),
6287 UNKNOWN_LOCATION);
6291 vect_create_epilog_for_reduction (vect_defs, stmt, epilog_copies,
6292 epilog_reduc_code, phis, reduc_index,
6293 double_reduc, slp_node, cond_name);
6295 return true;
6298 /* Function vect_min_worthwhile_factor.
6300 For a loop where we could vectorize the operation indicated by CODE,
6301 return the minimum vectorization factor that makes it worthwhile
6302 to use generic vectors. */
6304 vect_min_worthwhile_factor (enum tree_code code)
6306 switch (code)
6308 case PLUS_EXPR:
6309 case MINUS_EXPR:
6310 case NEGATE_EXPR:
6311 return 4;
6313 case BIT_AND_EXPR:
6314 case BIT_IOR_EXPR:
6315 case BIT_XOR_EXPR:
6316 case BIT_NOT_EXPR:
6317 return 2;
6319 default:
6320 return INT_MAX;
6325 /* Function vectorizable_induction
6327 Check if PHI performs an induction computation that can be vectorized.
6328 If VEC_STMT is also passed, vectorize the induction PHI: create a vectorized
6329 phi to replace it, put it in VEC_STMT, and add it to the same basic block.
6330 Return FALSE if not a vectorizable STMT, TRUE otherwise. */
6332 bool
6333 vectorizable_induction (gimple *phi,
6334 gimple_stmt_iterator *gsi ATTRIBUTE_UNUSED,
6335 gimple **vec_stmt)
6337 stmt_vec_info stmt_info = vinfo_for_stmt (phi);
6338 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
6339 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
6340 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
6341 int nunits = TYPE_VECTOR_SUBPARTS (vectype);
6342 int ncopies = LOOP_VINFO_VECT_FACTOR (loop_vinfo) / nunits;
6343 tree vec_def;
6345 gcc_assert (ncopies >= 1);
6346 /* FORNOW. These restrictions should be relaxed. */
6347 if (nested_in_vect_loop_p (loop, phi))
6349 imm_use_iterator imm_iter;
6350 use_operand_p use_p;
6351 gimple *exit_phi;
6352 edge latch_e;
6353 tree loop_arg;
6355 if (ncopies > 1)
6357 if (dump_enabled_p ())
6358 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6359 "multiple types in nested loop.\n");
6360 return false;
6363 exit_phi = NULL;
6364 latch_e = loop_latch_edge (loop->inner);
6365 loop_arg = PHI_ARG_DEF_FROM_EDGE (phi, latch_e);
6366 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, loop_arg)
6368 gimple *use_stmt = USE_STMT (use_p);
6369 if (is_gimple_debug (use_stmt))
6370 continue;
6372 if (!flow_bb_inside_loop_p (loop->inner, gimple_bb (use_stmt)))
6374 exit_phi = use_stmt;
6375 break;
6378 if (exit_phi)
6380 stmt_vec_info exit_phi_vinfo = vinfo_for_stmt (exit_phi);
6381 if (!(STMT_VINFO_RELEVANT_P (exit_phi_vinfo)
6382 && !STMT_VINFO_LIVE_P (exit_phi_vinfo)))
6384 if (dump_enabled_p ())
6385 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6386 "inner-loop induction only used outside "
6387 "of the outer vectorized loop.\n");
6388 return false;
6393 if (!STMT_VINFO_RELEVANT_P (stmt_info))
6394 return false;
6396 /* FORNOW: SLP not supported. */
6397 if (STMT_SLP_TYPE (stmt_info))
6398 return false;
6400 gcc_assert (STMT_VINFO_DEF_TYPE (stmt_info) == vect_induction_def);
6402 if (gimple_code (phi) != GIMPLE_PHI)
6403 return false;
6405 if (!vec_stmt) /* transformation not required. */
6407 STMT_VINFO_TYPE (stmt_info) = induc_vec_info_type;
6408 if (dump_enabled_p ())
6409 dump_printf_loc (MSG_NOTE, vect_location,
6410 "=== vectorizable_induction ===\n");
6411 vect_model_induction_cost (stmt_info, ncopies);
6412 return true;
6415 /** Transform. **/
6417 if (dump_enabled_p ())
6418 dump_printf_loc (MSG_NOTE, vect_location, "transform induction phi.\n");
6420 vec_def = get_initial_def_for_induction (phi);
6421 *vec_stmt = SSA_NAME_DEF_STMT (vec_def);
6422 return true;
6425 /* Function vectorizable_live_operation.
6427 STMT computes a value that is used outside the loop. Check if
6428 it can be supported. */
6430 bool
6431 vectorizable_live_operation (gimple *stmt,
6432 gimple_stmt_iterator *gsi ATTRIBUTE_UNUSED,
6433 slp_tree slp_node, int slp_index,
6434 gimple **vec_stmt)
6436 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
6437 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
6438 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
6439 imm_use_iterator imm_iter;
6440 tree lhs, lhs_type, bitsize, vec_bitsize;
6441 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
6442 int nunits = TYPE_VECTOR_SUBPARTS (vectype);
6443 int ncopies = LOOP_VINFO_VECT_FACTOR (loop_vinfo) / nunits;
6444 gimple *use_stmt;
6445 auto_vec<tree> vec_oprnds;
6447 gcc_assert (STMT_VINFO_LIVE_P (stmt_info));
6449 if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_reduction_def)
6450 return false;
6452 /* FORNOW. CHECKME. */
6453 if (nested_in_vect_loop_p (loop, stmt))
6454 return false;
6456 /* If STMT is not relevant and it is a simple assignment and its inputs are
6457 invariant then it can remain in place, unvectorized. The original last
6458 scalar value that it computes will be used. */
6459 if (!STMT_VINFO_RELEVANT_P (stmt_info))
6461 gcc_assert (is_simple_and_all_uses_invariant (stmt, loop_vinfo));
6462 if (dump_enabled_p ())
6463 dump_printf_loc (MSG_NOTE, vect_location,
6464 "statement is simple and uses invariant. Leaving in "
6465 "place.\n");
6466 return true;
6469 if (!vec_stmt)
6470 /* No transformation required. */
6471 return true;
6473 /* If stmt has a related stmt, then use that for getting the lhs. */
6474 if (is_pattern_stmt_p (stmt_info))
6475 stmt = STMT_VINFO_RELATED_STMT (stmt_info);
6477 lhs = (is_a <gphi *> (stmt)) ? gimple_phi_result (stmt)
6478 : gimple_get_lhs (stmt);
6479 lhs_type = TREE_TYPE (lhs);
6481 /* Find all uses of STMT outside the loop - there should be at least one. */
6482 auto_vec<gimple *, 4> worklist;
6483 FOR_EACH_IMM_USE_STMT (use_stmt, imm_iter, lhs)
6484 if (!flow_bb_inside_loop_p (loop, gimple_bb (use_stmt))
6485 && !is_gimple_debug (use_stmt))
6486 worklist.safe_push (use_stmt);
6487 gcc_assert (worklist.length () >= 1);
6489 bitsize = TYPE_SIZE (TREE_TYPE (vectype));
6490 vec_bitsize = TYPE_SIZE (vectype);
6492 /* Get the vectorized lhs of STMT and the lane to use (counted in bits). */
6493 tree vec_lhs, bitstart;
6494 if (slp_node)
6496 gcc_assert (slp_index >= 0);
6498 int num_scalar = SLP_TREE_SCALAR_STMTS (slp_node).length ();
6499 int num_vec = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node);
6501 /* Get the last occurrence of the scalar index from the concatenation of
6502 all the slp vectors. Calculate which slp vector it is and the index
6503 within. */
6504 int pos = (num_vec * nunits) - num_scalar + slp_index;
6505 int vec_entry = pos / nunits;
6506 int vec_index = pos % nunits;
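/* A worked example with hypothetical counts: for two SLP vectors of four
   lanes each (num_vec == 2, nunits == 4), six scalar stmts and
   slp_index == 5, the last occurrence sits at pos == 2 * 4 - 6 + 5 == 7,
   i.e. in vector 7 / 4 == 1 at lane 7 % 4 == 3. */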
6508 /* Get the correct slp vectorized stmt. */
6509 vec_lhs = gimple_get_lhs (SLP_TREE_VEC_STMTS (slp_node)[vec_entry]);
6511 /* Get entry to use. */
6512 bitstart = build_int_cst (unsigned_type_node, vec_index);
6513 bitstart = int_const_binop (MULT_EXPR, bitsize, bitstart);
6515 else
6517 enum vect_def_type dt = STMT_VINFO_DEF_TYPE (stmt_info);
6518 vec_lhs = vect_get_vec_def_for_operand_1 (stmt, dt);
6520 /* For multiple copies, get the last copy. */
6521 for (int i = 1; i < ncopies; ++i)
6522 vec_lhs = vect_get_vec_def_for_stmt_copy (vect_unknown_def_type,
6523 vec_lhs);
6525 /* Get the last lane in the vector. */
6526 bitstart = int_const_binop (MINUS_EXPR, vec_bitsize, bitsize);
6529 /* Create a new vectorized stmt for the uses of STMT and insert outside the
6530 loop. */
6531 gimple_seq stmts = NULL;
6532 tree new_tree = build3 (BIT_FIELD_REF, TREE_TYPE (vectype), vec_lhs, bitsize,
6533 bitstart);
6534 new_tree = force_gimple_operand (fold_convert (lhs_type, new_tree), &stmts,
6535 true, NULL_TREE);
6536 if (stmts)
6537 gsi_insert_seq_on_edge_immediate (single_exit (loop), stmts);
6539 /* Replace all uses of the USE_STMT in the worklist with the newly inserted
6540 statement. */
6541 while (!worklist.is_empty ())
6543 use_stmt = worklist.pop ();
6544 replace_uses_by (gimple_phi_result (use_stmt), new_tree);
6545 update_stmt (use_stmt);
6548 return true;
6551 /* Kill any debug uses outside LOOP of SSA names defined in STMT. */
6553 static void
6554 vect_loop_kill_debug_uses (struct loop *loop, gimple *stmt)
6556 ssa_op_iter op_iter;
6557 imm_use_iterator imm_iter;
6558 def_operand_p def_p;
6559 gimple *ustmt;
6561 FOR_EACH_PHI_OR_STMT_DEF (def_p, stmt, op_iter, SSA_OP_DEF)
6563 FOR_EACH_IMM_USE_STMT (ustmt, imm_iter, DEF_FROM_PTR (def_p))
6565 basic_block bb;
6567 if (!is_gimple_debug (ustmt))
6568 continue;
6570 bb = gimple_bb (ustmt);
6572 if (!flow_bb_inside_loop_p (loop, bb))
6574 if (gimple_debug_bind_p (ustmt))
6576 if (dump_enabled_p ())
6577 dump_printf_loc (MSG_NOTE, vect_location,
6578 "killing debug use\n");
6580 gimple_debug_bind_reset_value (ustmt);
6581 update_stmt (ustmt);
6583 else
6584 gcc_unreachable ();
6591 /* This function builds ni_name = number of iterations. Statements
6592 are emitted on the loop preheader edge. */
6594 static tree
6595 vect_build_loop_niters (loop_vec_info loop_vinfo)
6597 tree ni = unshare_expr (LOOP_VINFO_NITERS (loop_vinfo));
6598 if (TREE_CODE (ni) == INTEGER_CST)
6599 return ni;
6600 else
6602 tree ni_name, var;
6603 gimple_seq stmts = NULL;
6604 edge pe = loop_preheader_edge (LOOP_VINFO_LOOP (loop_vinfo));
6606 var = create_tmp_var (TREE_TYPE (ni), "niters");
6607 ni_name = force_gimple_operand (ni, &stmts, false, var);
6608 if (stmts)
6609 gsi_insert_seq_on_edge_immediate (pe, stmts);
6611 return ni_name;
6616 /* This function generates the following statements:
6618 ni_name = number of iterations loop executes
6619 ratio = ni_name / vf
6620 ratio_mult_vf_name = ratio * vf
6622 and places them on the loop preheader edge. */
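/* For example (with hypothetical counts, and assuming no peeling for gaps):
   for ni_name == 17 and vf == 4, the code below computes
   ratio == ((17 - 4) >> 2) + 1 == 4 and ratio_mult_vf == 4 << 2 == 16, so
   the vector loop covers the first 16 iterations and the epilogue the
   remaining one. */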
6624 static void
6625 vect_generate_tmps_on_preheader (loop_vec_info loop_vinfo,
6626 tree ni_name,
6627 tree *ratio_mult_vf_name_ptr,
6628 tree *ratio_name_ptr)
6630 tree ni_minus_gap_name;
6631 tree var;
6632 tree ratio_name;
6633 tree ratio_mult_vf_name;
6634 int vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
6635 edge pe = loop_preheader_edge (LOOP_VINFO_LOOP (loop_vinfo));
6636 tree log_vf;
6638 log_vf = build_int_cst (TREE_TYPE (ni_name), exact_log2 (vf));
6640 /* If epilogue loop is required because of data accesses with gaps, we
6641 subtract one iteration from the total number of iterations here for
6642 correct calculation of RATIO. */
6643 if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo))
6645 ni_minus_gap_name = fold_build2 (MINUS_EXPR, TREE_TYPE (ni_name),
6646 ni_name,
6647 build_one_cst (TREE_TYPE (ni_name)));
6648 if (!is_gimple_val (ni_minus_gap_name))
6650 var = create_tmp_var (TREE_TYPE (ni_name), "ni_gap");
6651 gimple *stmts = NULL;
6652 ni_minus_gap_name = force_gimple_operand (ni_minus_gap_name, &stmts,
6653 true, var);
6654 gsi_insert_seq_on_edge_immediate (pe, stmts);
6657 else
6658 ni_minus_gap_name = ni_name;
6660 /* Create: ratio = ni >> log2(vf) */
6661 /* ??? As we have ni == number of latch executions + 1, ni could
6662 have overflown to zero. So avoid computing ratio based on ni
6663 but compute it using the fact that we know ratio will be at least
6664 one, thus via (ni - vf) >> log2(vf) + 1. */
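/* Sketch of the overflow case with hypothetical numbers: for an unsigned
   32-bit NI and vf == 4, a loop whose latch executes 0xffffffff times has
   ni == 0 after wrapping; ni >> 2 would then yield 0, whereas
   ((ni - 4) >> 2) + 1 == (0xfffffffc >> 2) + 1 == 0x40000000, the correct
   ratio. */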
6665 ratio_name
6666 = fold_build2 (PLUS_EXPR, TREE_TYPE (ni_name),
6667 fold_build2 (RSHIFT_EXPR, TREE_TYPE (ni_name),
6668 fold_build2 (MINUS_EXPR, TREE_TYPE (ni_name),
6669 ni_minus_gap_name,
6670 build_int_cst
6671 (TREE_TYPE (ni_name), vf)),
6672 log_vf),
6673 build_int_cst (TREE_TYPE (ni_name), 1));
6674 if (!is_gimple_val (ratio_name))
6676 var = create_tmp_var (TREE_TYPE (ni_name), "bnd");
6677 gimple *stmts = NULL;
6678 ratio_name = force_gimple_operand (ratio_name, &stmts, true, var);
6679 gsi_insert_seq_on_edge_immediate (pe, stmts);
6681 *ratio_name_ptr = ratio_name;
6683 /* Create: ratio_mult_vf = ratio << log2 (vf). */
6685 if (ratio_mult_vf_name_ptr)
6687 ratio_mult_vf_name = fold_build2 (LSHIFT_EXPR, TREE_TYPE (ratio_name),
6688 ratio_name, log_vf);
6689 if (!is_gimple_val (ratio_mult_vf_name))
6691 var = create_tmp_var (TREE_TYPE (ni_name), "ratio_mult_vf");
6692 gimple *stmts = NULL;
6693 ratio_mult_vf_name = force_gimple_operand (ratio_mult_vf_name, &stmts,
6694 true, var);
6695 gsi_insert_seq_on_edge_immediate (pe, stmts);
6697 *ratio_mult_vf_name_ptr = ratio_mult_vf_name;
6700 return;
6704 /* Function vect_transform_loop.
6706 The analysis phase has determined that the loop is vectorizable.
6707 Vectorize the loop - created vectorized stmts to replace the scalar
6708 stmts in the loop, and update the loop exit condition. */
6710 void
6711 vect_transform_loop (loop_vec_info loop_vinfo)
6713 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
6714 basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
6715 int nbbs = loop->num_nodes;
6716 int i;
6717 tree ratio = NULL;
6718 int vectorization_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
6719 bool grouped_store;
6720 bool slp_scheduled = false;
6721 gimple *stmt, *pattern_stmt;
6722 gimple_seq pattern_def_seq = NULL;
6723 gimple_stmt_iterator pattern_def_si = gsi_none ();
6724 bool transform_pattern_stmt = false;
6725 bool check_profitability = false;
6726 int th;
6727 /* Record the number of iterations before we start tampering with the profile. */
6728 gcov_type expected_iterations = expected_loop_iterations_unbounded (loop);
6730 if (dump_enabled_p ())
6731 dump_printf_loc (MSG_NOTE, vect_location, "=== vec_transform_loop ===\n");
6733 /* If the profile is imprecise, we have a chance to fix it up. */
6734 if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
6735 expected_iterations = LOOP_VINFO_INT_NITERS (loop_vinfo);
6737 /* Use the more conservative vectorization threshold. If the number
6738 of iterations is constant assume the cost check has been performed
6739 by our caller. If the threshold makes all loops profitable that
6740 run at least the vectorization factor number of times checking
6741 is pointless, too. */
6742 th = LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo);
6743 if (th >= LOOP_VINFO_VECT_FACTOR (loop_vinfo) - 1
6744 && !LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
6746 if (dump_enabled_p ())
6747 dump_printf_loc (MSG_NOTE, vect_location,
6748 "Profitability threshold is %d loop iterations.\n",
6749 th);
6750 check_profitability = true;
6753 /* Make sure there exists a single-predecessor exit bb. Do this before
6754 versioning. */
6755 edge e = single_exit (loop);
6756 if (! single_pred_p (e->dest))
6758 split_loop_exit_edge (e);
6759 if (dump_enabled_p ())
6760 dump_printf (MSG_NOTE, "split exit edge\n");
6763 /* Version the loop first, if required, so the profitability check
6764 comes first. */
6766 if (LOOP_REQUIRES_VERSIONING (loop_vinfo))
6768 vect_loop_versioning (loop_vinfo, th, check_profitability);
6769 check_profitability = false;
6772 /* Make sure there exists a single-predecessor exit bb also on the
6773 scalar loop copy. Do this after versioning but before peeling
6774 so CFG structure is fine for both scalar and if-converted loop
6775 to make slpeel_duplicate_current_defs_from_edges face matched
6776 loop closed PHI nodes on the exit. */
6777 if (LOOP_VINFO_SCALAR_LOOP (loop_vinfo))
6779 e = single_exit (LOOP_VINFO_SCALAR_LOOP (loop_vinfo));
6780 if (! single_pred_p (e->dest))
6782 split_loop_exit_edge (e);
6783 if (dump_enabled_p ())
6784 dump_printf (MSG_NOTE, "split exit edge of scalar loop\n");
6788 tree ni_name = vect_build_loop_niters (loop_vinfo);
6789 LOOP_VINFO_NITERS_UNCHANGED (loop_vinfo) = ni_name;
6791 /* Peel the loop if there are data refs with unknown alignment.
6792 Only one store data ref with unknown alignment is allowed. */
6794 if (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo))
6796 vect_do_peeling_for_alignment (loop_vinfo, ni_name,
6797 th, check_profitability);
6798 check_profitability = false;
6799 /* The above adjusts LOOP_VINFO_NITERS, so cause ni_name to
6800 be re-computed. */
6801 ni_name = NULL_TREE;
6804 /* If the loop has a symbolic number of iterations 'n' (i.e. it's not a
6805 compile time constant), or it is a constant that doesn't divide by the
6806 vectorization factor, then an epilog loop needs to be created.
6807 We therefore duplicate the loop: the original loop will be vectorized,
6808 and will compute the first (n/VF) iterations. The second copy of the loop
6809 will remain scalar and will compute the remaining (n%VF) iterations.
6810 (VF is the vectorization factor). */
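/* As a numerical illustration (the counts are hypothetical): for n == 103
   and VF == 8, the vectorized loop executes 103 / 8 == 12 iterations,
   covering the first 96 scalar iterations, and the scalar epilogue executes
   the remaining 103 % 8 == 7 iterations. */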
6812 if (LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo)
6813 || LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo))
6815 tree ratio_mult_vf;
6816 if (!ni_name)
6817 ni_name = vect_build_loop_niters (loop_vinfo);
6818 vect_generate_tmps_on_preheader (loop_vinfo, ni_name, &ratio_mult_vf,
6819 &ratio);
6820 vect_do_peeling_for_loop_bound (loop_vinfo, ni_name, ratio_mult_vf,
6821 th, check_profitability);
6823 else if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
6824 ratio = build_int_cst (TREE_TYPE (LOOP_VINFO_NITERS (loop_vinfo)),
6825 LOOP_VINFO_INT_NITERS (loop_vinfo) / vectorization_factor);
6826 else
6828 if (!ni_name)
6829 ni_name = vect_build_loop_niters (loop_vinfo);
6830 vect_generate_tmps_on_preheader (loop_vinfo, ni_name, NULL, &ratio);
6833 /* 1) Make sure the loop header has exactly two entries
6834 2) Make sure we have a preheader basic block. */
6836 gcc_assert (EDGE_COUNT (loop->header->preds) == 2);
6838 split_edge (loop_preheader_edge (loop));
6840 /* FORNOW: the vectorizer supports only loops whose body consists
6841 of one basic block (header + empty latch). When the vectorizer
6842 supports more involved loop forms, the order in which the BBs are
6843 traversed will need to be reconsidered. */
6845 for (i = 0; i < nbbs; i++)
6847 basic_block bb = bbs[i];
6848 stmt_vec_info stmt_info;
6850 for (gphi_iterator si = gsi_start_phis (bb); !gsi_end_p (si);
6851 gsi_next (&si))
6853 gphi *phi = si.phi ();
6854 if (dump_enabled_p ())
6856 dump_printf_loc (MSG_NOTE, vect_location,
6857 "------>vectorizing phi: ");
6858 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, phi, 0);
6860 stmt_info = vinfo_for_stmt (phi);
6861 if (!stmt_info)
6862 continue;
6864 if (MAY_HAVE_DEBUG_STMTS && !STMT_VINFO_LIVE_P (stmt_info))
6865 vect_loop_kill_debug_uses (loop, phi);
6867 if (!STMT_VINFO_RELEVANT_P (stmt_info)
6868 && !STMT_VINFO_LIVE_P (stmt_info))
6869 continue;
6871 if (STMT_VINFO_VECTYPE (stmt_info)
6872 && (TYPE_VECTOR_SUBPARTS (STMT_VINFO_VECTYPE (stmt_info))
6873 != (unsigned HOST_WIDE_INT) vectorization_factor)
6874 && dump_enabled_p ())
6875 dump_printf_loc (MSG_NOTE, vect_location, "multiple-types.\n");
6877 if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_induction_def)
6879 if (dump_enabled_p ())
6880 dump_printf_loc (MSG_NOTE, vect_location, "transform phi.\n");
6881 vect_transform_stmt (phi, NULL, NULL, NULL, NULL);
6885 pattern_stmt = NULL;
6886 for (gimple_stmt_iterator si = gsi_start_bb (bb);
6887 !gsi_end_p (si) || transform_pattern_stmt;)
6889 bool is_store;
6891 if (transform_pattern_stmt)
6892 stmt = pattern_stmt;
6893 else
6895 stmt = gsi_stmt (si);
6896 /* During vectorization remove existing clobber stmts. */
6897 if (gimple_clobber_p (stmt))
6899 unlink_stmt_vdef (stmt);
6900 gsi_remove (&si, true);
6901 release_defs (stmt);
6902 continue;
6906 if (dump_enabled_p ())
6908 dump_printf_loc (MSG_NOTE, vect_location,
6909 "------>vectorizing statement: ");
6910 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, stmt, 0);
6913 stmt_info = vinfo_for_stmt (stmt);
6915 /* vector stmts created in the outer-loop during vectorization of
6916 stmts in an inner-loop may not have a stmt_info, and do not
6917 need to be vectorized. */
6918 if (!stmt_info)
6920 gsi_next (&si);
6921 continue;
6924 if (MAY_HAVE_DEBUG_STMTS && !STMT_VINFO_LIVE_P (stmt_info))
6925 vect_loop_kill_debug_uses (loop, stmt);
6927 if (!STMT_VINFO_RELEVANT_P (stmt_info)
6928 && !STMT_VINFO_LIVE_P (stmt_info))
6930 if (STMT_VINFO_IN_PATTERN_P (stmt_info)
6931 && (pattern_stmt = STMT_VINFO_RELATED_STMT (stmt_info))
6932 && (STMT_VINFO_RELEVANT_P (vinfo_for_stmt (pattern_stmt))
6933 || STMT_VINFO_LIVE_P (vinfo_for_stmt (pattern_stmt))))
6935 stmt = pattern_stmt;
6936 stmt_info = vinfo_for_stmt (stmt);
6938 else
6940 gsi_next (&si);
6941 continue;
6944 else if (STMT_VINFO_IN_PATTERN_P (stmt_info)
6945 && (pattern_stmt = STMT_VINFO_RELATED_STMT (stmt_info))
6946 && (STMT_VINFO_RELEVANT_P (vinfo_for_stmt (pattern_stmt))
6947 || STMT_VINFO_LIVE_P (vinfo_for_stmt (pattern_stmt))))
6948 transform_pattern_stmt = true;
6950 /* If pattern statement has def stmts, vectorize them too. */
6951 if (is_pattern_stmt_p (stmt_info))
6953 if (pattern_def_seq == NULL)
6955 pattern_def_seq = STMT_VINFO_PATTERN_DEF_SEQ (stmt_info);
6956 pattern_def_si = gsi_start (pattern_def_seq);
6958 else if (!gsi_end_p (pattern_def_si))
6959 gsi_next (&pattern_def_si);
6960 if (pattern_def_seq != NULL)
6962 gimple *pattern_def_stmt = NULL;
6963 stmt_vec_info pattern_def_stmt_info = NULL;
6965 while (!gsi_end_p (pattern_def_si))
6967 pattern_def_stmt = gsi_stmt (pattern_def_si);
6968 pattern_def_stmt_info
6969 = vinfo_for_stmt (pattern_def_stmt);
6970 if (STMT_VINFO_RELEVANT_P (pattern_def_stmt_info)
6971 || STMT_VINFO_LIVE_P (pattern_def_stmt_info))
6972 break;
6973 gsi_next (&pattern_def_si);
6976 if (!gsi_end_p (pattern_def_si))
6978 if (dump_enabled_p ())
6980 dump_printf_loc (MSG_NOTE, vect_location,
6981 "==> vectorizing pattern def "
6982 "stmt: ");
6983 dump_gimple_stmt (MSG_NOTE, TDF_SLIM,
6984 pattern_def_stmt, 0);
6987 stmt = pattern_def_stmt;
6988 stmt_info = pattern_def_stmt_info;
6990 else
6992 pattern_def_si = gsi_none ();
6993 transform_pattern_stmt = false;
6996 else
6997 transform_pattern_stmt = false;
7000 if (STMT_VINFO_VECTYPE (stmt_info))
7002 unsigned int nunits
7003 = (unsigned int)
7004 TYPE_VECTOR_SUBPARTS (STMT_VINFO_VECTYPE (stmt_info));
7005 if (!STMT_SLP_TYPE (stmt_info)
7006 && nunits != (unsigned int) vectorization_factor
7007 && dump_enabled_p ())
7008 /* For SLP, VF is set according to the unrolling factor, and not
7009 to the vector size, hence for SLP this print is not valid. */
7010 dump_printf_loc (MSG_NOTE, vect_location, "multiple-types.\n");
7013 /* SLP. Schedule all the SLP instances when the first SLP stmt is
7014 reached. */
7015 if (STMT_SLP_TYPE (stmt_info))
7017 if (!slp_scheduled)
7019 slp_scheduled = true;
7021 if (dump_enabled_p ())
7022 dump_printf_loc (MSG_NOTE, vect_location,
7023 "=== scheduling SLP instances ===\n");
7025 vect_schedule_slp (loop_vinfo);
7028 /* Hybrid SLP stmts must be vectorized in addition to SLP. */
7029 if (!vinfo_for_stmt (stmt) || PURE_SLP_STMT (stmt_info))
7031 if (!transform_pattern_stmt && gsi_end_p (pattern_def_si))
7033 pattern_def_seq = NULL;
7034 gsi_next (&si);
7036 continue;
7040 /* -------- vectorize statement ------------ */
7041 if (dump_enabled_p ())
7042 dump_printf_loc (MSG_NOTE, vect_location, "transform statement.\n");
7044 grouped_store = false;
7045 is_store = vect_transform_stmt (stmt, &si, &grouped_store, NULL, NULL);
7046 if (is_store)
7048 if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
7050 /* Interleaving. If IS_STORE is TRUE, the vectorization of the
7051 interleaving chain was completed - free all the stores in
7052 the chain. */
7053 gsi_next (&si);
7054 vect_remove_stores (GROUP_FIRST_ELEMENT (stmt_info));
7056 else
7058 /* Free the attached stmt_vec_info and remove the stmt. */
7059 gimple *store = gsi_stmt (si);
7060 free_stmt_vec_info (store);
7061 unlink_stmt_vdef (store);
7062 gsi_remove (&si, true);
7063 release_defs (store);
7066 /* Stores can only appear at the end of pattern statements. */
7067 gcc_assert (!transform_pattern_stmt);
7068 pattern_def_seq = NULL;
7070 else if (!transform_pattern_stmt && gsi_end_p (pattern_def_si))
7072 pattern_def_seq = NULL;
7073 gsi_next (&si);
7075 } /* stmts in BB */
7076 } /* BBs in loop */
7078 slpeel_make_loop_iterate_ntimes (loop, ratio);
7080 /* Reduce loop iterations by the vectorization factor. */
7081 scale_loop_profile (loop, GCOV_COMPUTE_SCALE (1, vectorization_factor),
7082 expected_iterations / vectorization_factor);
7083 if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo))
7085 if (loop->nb_iterations_upper_bound != 0)
7086 loop->nb_iterations_upper_bound = loop->nb_iterations_upper_bound - 1;
7087 if (loop->nb_iterations_likely_upper_bound != 0)
7088 loop->nb_iterations_likely_upper_bound
7089 = loop->nb_iterations_likely_upper_bound - 1;
7091 loop->nb_iterations_upper_bound
7092 = wi::udiv_floor (loop->nb_iterations_upper_bound + 1,
7093 vectorization_factor) - 1;
7094 loop->nb_iterations_likely_upper_bound
7095 = wi::udiv_floor (loop->nb_iterations_likely_upper_bound + 1,
7096 vectorization_factor) - 1;
7098 if (loop->any_estimate)
7100 loop->nb_iterations_estimate
7101 = wi::udiv_floor (loop->nb_iterations_estimate, vectorization_factor);
7102 if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo)
7103 && loop->nb_iterations_estimate != 0)
7104 loop->nb_iterations_estimate = loop->nb_iterations_estimate - 1;
7107 if (dump_enabled_p ())
7109 dump_printf_loc (MSG_NOTE, vect_location,
7110 "LOOP VECTORIZED\n");
7111 if (loop->inner)
7112 dump_printf_loc (MSG_NOTE, vect_location,
7113 "OUTER LOOP VECTORIZED\n");
7114 dump_printf (MSG_NOTE, "\n");
7117 /* Free SLP instances here because otherwise stmt reference counting
7118 won't work. */
7119 slp_instance instance;
7120 FOR_EACH_VEC_ELT (LOOP_VINFO_SLP_INSTANCES (loop_vinfo), i, instance)
7121 vect_free_slp_instance (instance);
7122 LOOP_VINFO_SLP_INSTANCES (loop_vinfo).release ();
7123 /* Clear up the safelen field since its value is invalid after vectorization,
7124 as the vectorized loop can have loop-carried dependencies. */
7125 loop->safelen = 0;
7128 /* The code below tries to perform a simple optimization - revert
7129 if-conversion for masked stores, i.e. if the mask of a store is zero,
7130 do not perform the store and, if possible, skip all of its stored-value producers.
7131 For example,
7132 for (i=0; i<n; i++)
7133 if (c[i])
7135 p1[i] += 1;
7136 p2[i] = p3[i] +2;
7138 this transformation will produce the following semi-hammock:
7140 if (!mask__ifc__42.18_165 == { 0, 0, 0, 0, 0, 0, 0, 0 })
7142 vect__11.19_170 = MASK_LOAD (vectp_p1.20_168, 0B, mask__ifc__42.18_165);
7143 vect__12.22_172 = vect__11.19_170 + vect_cst__171;
7144 MASK_STORE (vectp_p1.23_175, 0B, mask__ifc__42.18_165, vect__12.22_172);
7145 vect__18.25_182 = MASK_LOAD (vectp_p3.26_180, 0B, mask__ifc__42.18_165);
7146 vect__19.28_184 = vect__18.25_182 + vect_cst__183;
7147 MASK_STORE (vectp_p2.29_187, 0B, mask__ifc__42.18_165, vect__19.28_184);
7151 void
7152 optimize_mask_stores (struct loop *loop)
7154 basic_block *bbs = get_loop_body (loop);
7155 unsigned nbbs = loop->num_nodes;
7156 unsigned i;
7157 basic_block bb;
7158 gimple_stmt_iterator gsi;
7159 gimple *stmt;
7160 auto_vec<gimple *> worklist;
7162 vect_location = find_loop_location (loop);
7163 /* Pick up all masked stores in the loop, if any. */
7164 for (i = 0; i < nbbs; i++)
7165 {
7166 bb = bbs[i];
7167 for (gsi = gsi_start_bb (bb); !gsi_end_p (gsi);
7168 gsi_next (&gsi))
7169 {
7170 stmt = gsi_stmt (gsi);
7171 if (is_gimple_call (stmt)
7172 && gimple_call_internal_p (stmt)
7173 && gimple_call_internal_fn (stmt) == IFN_MASK_STORE)
7174 worklist.safe_push (stmt);
7175 }
7176 }
7178 free (bbs);
7179 if (worklist.is_empty ())
7180 return;
7182 /* Loop has masked stores. */
7183 while (!worklist.is_empty ())
7184 {
7185 gimple *last, *last_store;
7186 edge e, efalse;
7187 tree mask;
7188 basic_block store_bb, join_bb;
7189 gimple_stmt_iterator gsi_to;
7190 tree vdef, new_vdef;
7191 gphi *phi;
7192 tree vectype;
7193 tree zero;
7195 last = worklist.pop ();
7196 mask = gimple_call_arg (last, 2);
7197 bb = gimple_bb (last);
7198 /* Create new bb. */
7199 e = split_block (bb, last);
7200 join_bb = e->dest;
7201 store_bb = create_empty_bb (bb);
7202 add_bb_to_loop (store_bb, loop);
7203 e->flags = EDGE_TRUE_VALUE;
7204 efalse = make_edge (bb, store_bb, EDGE_FALSE_VALUE);
7205 /* Put STORE_BB on the likely path. */
7206 efalse->probability = PROB_UNLIKELY;
7207 store_bb->frequency = PROB_ALWAYS - EDGE_FREQUENCY (efalse);
7208 make_edge (store_bb, join_bb, EDGE_FALLTHRU);
7209 if (dom_info_available_p (CDI_DOMINATORS))
7210 set_immediate_dominator (CDI_DOMINATORS, store_bb, bb);
7211 if (dump_enabled_p ())
7212 dump_printf_loc (MSG_NOTE, vect_location,
7213 "Create new block %d to sink mask stores.",
7214 store_bb->index);
7215 /* Create vector comparison with boolean result. */
7216 vectype = TREE_TYPE (mask);
7217 zero = build_zero_cst (vectype);
7218 stmt = gimple_build_cond (EQ_EXPR, mask, zero, NULL_TREE, NULL_TREE);
7219 gsi = gsi_last_bb (bb);
7220 gsi_insert_after (&gsi, stmt, GSI_SAME_STMT);
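/* For illustration only (the SSA names are made up): the condition built
   above reads roughly
     if (mask_123 == { 0, 0, 0, 0, 0, 0, 0, 0 }) goto join_bb; else goto store_bb;
   since E was marked EDGE_TRUE_VALUE, STORE_BB is entered only when at least
   one mask lane is set.  */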
7221 /* Create new PHI node for vdef of the last masked store:
7222 .MEM_2 = VDEF <.MEM_1>
7223 will be converted to
7224 .MEM.3 = VDEF <.MEM_1>
7225 and new PHI node will be created in join bb
7226 .MEM_2 = PHI <.MEM_1, .MEM_3>
7227 */
7228 vdef = gimple_vdef (last);
7229 new_vdef = make_ssa_name (gimple_vop (cfun), last);
7230 gimple_set_vdef (last, new_vdef);
7231 phi = create_phi_node (vdef, join_bb);
7232 add_phi_arg (phi, new_vdef, EDGE_SUCC (store_bb, 0), UNKNOWN_LOCATION);
7234 /* Put all masked stores with the same mask into STORE_BB if possible. */
7235 while (true)
7236 {
7237 gimple_stmt_iterator gsi_from;
7238 gimple *stmt1 = NULL;
7240 /* Move masked store to STORE_BB. */
7241 last_store = last;
7242 gsi = gsi_for_stmt (last);
7243 gsi_from = gsi;
7244 /* Shift GSI to the previous stmt for further traversal. */
7245 gsi_prev (&gsi);
7246 gsi_to = gsi_start_bb (store_bb);
7247 gsi_move_before (&gsi_from, &gsi_to);
7248 /* Set GSI_TO to the start of the now non-empty STORE_BB. */
7249 gsi_to = gsi_start_bb (store_bb);
7250 if (dump_enabled_p ())
7251 {
7252 dump_printf_loc (MSG_NOTE, vect_location,
7253 "Move stmt to created bb\n");
7254 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, last, 0);
7255 }
7256 /* Move all stored value producers if possible. */
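/* The checks below, paraphrased: a producer statement is sunk only if it does
   not write memory or touch volatiles, defines an SSA_NAME, and is either a
   dead scalar statement (which is simply removed) or produces a vector value
   whose non-debug uses are all inside STORE_BB and whose virtual use, if any,
   matches that of the masked store being sunk.  */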
7257 while (!gsi_end_p (gsi))
7258 {
7259 tree lhs;
7260 imm_use_iterator imm_iter;
7261 use_operand_p use_p;
7262 bool res;
7264 /* Skip debug statements. */
7265 if (is_gimple_debug (gsi_stmt (gsi)))
7266 {
7267 gsi_prev (&gsi);
7268 continue;
7269 }
7270 stmt1 = gsi_stmt (gsi);
7271 /* Do not consider statements that write to memory or have
7272 a volatile operand. */
7273 if (gimple_vdef (stmt1)
7274 || gimple_has_volatile_ops (stmt1))
7275 break;
7276 gsi_from = gsi;
7277 gsi_prev (&gsi);
7278 lhs = gimple_get_lhs (stmt1);
7279 if (!lhs)
7280 break;
7282 /* The LHS of a vectorized stmt must be an SSA_NAME. */
7283 if (TREE_CODE (lhs) != SSA_NAME)
7284 break;
7286 if (!VECTOR_TYPE_P (TREE_TYPE (lhs)))
7287 {
7288 /* Remove dead scalar statement. */
7289 if (has_zero_uses (lhs))
7290 {
7291 gsi_remove (&gsi_from, true);
7292 continue;
7293 }
7294 break;
7295 }
7296 /* Check that LHS does not have uses outside of STORE_BB. */
7297 res = true;
7298 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, lhs)
7299 {
7300 gimple *use_stmt;
7301 use_stmt = USE_STMT (use_p);
7302 if (is_gimple_debug (use_stmt))
7303 continue;
7304 if (gimple_bb (use_stmt) != store_bb)
7305 {
7306 res = false;
7307 break;
7308 }
7309 }
7310 if (!res)
7311 break;
7313 if (gimple_vuse (stmt1)
7314 && gimple_vuse (stmt1) != gimple_vuse (last_store))
7315 break;
7317 /* Can move STMT1 to STORE_BB. */
7318 if (dump_enabled_p ())
7319 {
7320 dump_printf_loc (MSG_NOTE, vect_location,
7321 "Move stmt to created bb\n");
7322 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, stmt1, 0);
7323 }
7324 gsi_move_before (&gsi_from, &gsi_to);
7325 /* Shift GSI_TO for further insertion. */
7326 gsi_prev (&gsi_to);
7327 }
7328 /* Put other masked stores with the same mask into STORE_BB. */
7329 if (worklist.is_empty ()
7330 || gimple_call_arg (worklist.last (), 2) != mask
7331 || worklist.last () != stmt1)
7332 break;
7333 last = worklist.pop ();
7334 }
7335 add_phi_arg (phi, gimple_vuse (last_store), e, UNKNOWN_LOCATION);
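/* Roughly: the PHI argument added on edge E above is the virtual use of the
   earliest store sunk into STORE_BB, i.e. the memory state that reaches the
   join block unchanged when the mask is all-zero and STORE_BB is bypassed.  */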