/* Loop Vectorization
   Copyright (C) 2003-2015 Free Software Foundation, Inc.
   Contributed by Dorit Naishlos <dorit@il.ibm.com> and
   Ira Rosen <irar@il.ibm.com>

This file is part of GCC.

GCC is free software; you can redistribute it and/or modify it under
the terms of the GNU General Public License as published by the Free
Software Foundation; either version 3, or (at your option) any later
version.

GCC is distributed in the hope that it will be useful, but WITHOUT ANY
WARRANTY; without even the implied warranty of MERCHANTABILITY or
FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
for more details.

You should have received a copy of the GNU General Public License
along with GCC; see the file COPYING3.  If not see
<http://www.gnu.org/licenses/>.  */
#include "config.h"
#include "system.h"
#include "coretypes.h"
#include "dumpfile.h"
#include "backend.h"
#include "cfghooks.h"
#include "tree.h"
#include "gimple.h"
#include "rtl.h"
#include "ssa.h"
#include "alias.h"
#include "fold-const.h"
#include "stor-layout.h"
#include "cfganal.h"
#include "gimple-pretty-print.h"
#include "internal-fn.h"
#include "gimplify.h"
#include "gimple-iterator.h"
#include "gimplify-me.h"
#include "tree-ssa-loop-ivopts.h"
#include "tree-ssa-loop-manip.h"
#include "tree-ssa-loop-niter.h"
#include "tree-pass.h"
#include "cfgloop.h"
#include "flags.h"
#include "insn-config.h"
#include "expmed.h"
#include "dojump.h"
#include "explow.h"
#include "calls.h"
#include "emit-rtl.h"
#include "varasm.h"
#include "stmt.h"
#include "expr.h"
#include "recog.h"
#include "insn-codes.h"
#include "optabs.h"
#include "params.h"
#include "diagnostic-core.h"
#include "tree-chrec.h"
#include "tree-scalar-evolution.h"
#include "tree-vectorizer.h"
#include "target.h"
/* Loop Vectorization Pass.

   This pass tries to vectorize loops.

   For example, the vectorizer transforms the following simple loop:

        short a[N]; short b[N]; short c[N]; int i;

        for (i=0; i<N; i++){
          a[i] = b[i] + c[i];
        }

   as if it was manually vectorized by rewriting the source code into:

        typedef int __attribute__((mode(V8HI))) v8hi;
        short a[N]; short b[N]; short c[N]; int i;
        v8hi *pa = (v8hi*)a, *pb = (v8hi*)b, *pc = (v8hi*)c;
        v8hi va, vb, vc;

        for (i=0; i<N/8; i++){
          vb = pb[i];
          vc = pc[i];
          va = vb + vc;
          pa[i] = va;
        }

   The main entry to this pass is vectorize_loops(), in which
   the vectorizer applies a set of analyses on a given set of loops,
   followed by the actual vectorization transformation for the loops that
   had successfully passed the analysis phase.
   Throughout this pass we make a distinction between two types of
   data: scalars (which are represented by SSA_NAMES), and memory references
   ("data-refs").  These two types of data require different handling both
   during analysis and transformation.  The types of data-refs that the
   vectorizer currently supports are ARRAY_REFS whose base is an array DECL
   (not a pointer), and INDIRECT_REFS through pointers; both array and pointer
   accesses are required to have a simple (consecutive) access pattern.

   Analysis phase:
   ===============
   The driver for the analysis phase is vect_analyze_loop().
   It applies a set of analyses, some of which rely on the scalar evolution
   analyzer (scev) developed by Sebastian Pop.

   During the analysis phase the vectorizer records some information
   per stmt in a "stmt_vec_info" struct which is attached to each stmt in the
   loop, as well as general information about the loop as a whole, which is
   recorded in a "loop_vec_info" struct attached to each loop.

   Transformation phase:
   =====================
   The loop transformation phase scans all the stmts in the loop, and
   creates a vector stmt (or a sequence of stmts) for each scalar stmt S in
   the loop that needs to be vectorized.  It inserts the vector code sequence
   just before the scalar stmt S, and records a pointer to the vector code
   in STMT_VINFO_VEC_STMT (stmt_info) (stmt_info is the stmt_vec_info struct
   attached to S).  This pointer will be used for the vectorization of
   following stmts which use the def of stmt S.  Stmt S is removed if it
   writes to memory; otherwise, we rely on dead code elimination for
   removing it.

   For example, say stmt S1 was vectorized into stmt VS1:

   VS1: vb = px[i];
   S1:  b = x[i];    STMT_VINFO_VEC_STMT (stmt_info (S1)) = VS1
   S2:  a = b;

   To vectorize stmt S2, the vectorizer first finds the stmt that defines
   the operand 'b' (S1), and gets the relevant vector def 'vb' from the
   vector stmt VS1 pointed to by STMT_VINFO_VEC_STMT (stmt_info (S1)).  The
   resulting sequence would be:

   VS1: vb = px[i];
   S1:  b = x[i];    STMT_VINFO_VEC_STMT (stmt_info (S1)) = VS1
   VS2: va = vb;
   S2:  a = b;       STMT_VINFO_VEC_STMT (stmt_info (S2)) = VS2

   Operands that are not SSA_NAMEs, are data-refs that appear in
   load/store operations (like 'x[i]' in S1), and are handled differently.

   Target modeling:
   =================
   Currently the only target specific information that is used is the
   size of the vector (in bytes) - "TARGET_VECTORIZE_UNITS_PER_SIMD_WORD".
   Targets that can support different sizes of vectors, for now will need
   to specify one value for "TARGET_VECTORIZE_UNITS_PER_SIMD_WORD".  More
   flexibility will be added in the future.

   Since we only vectorize operations whose vector form can be
   expressed using existing tree codes, to verify that an operation is
   supported, the vectorizer checks the relevant optab at the relevant
   machine_mode (e.g, optab_handler (add_optab, V8HImode)).  If
   the value found is CODE_FOR_nothing, then there's no target support, and
   we can't vectorize the stmt.

   For additional information on this project see:
   http://gcc.gnu.org/projects/tree-ssa/vectorization.html
*/
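/* Illustrative sketch (editorial addition, not part of the pass): the optab
   query described above boils down to a check of the following shape.  The
   helper name is hypothetical; optab_handler, add_optab, V8HImode and
   CODE_FOR_nothing are the real interfaces mentioned in the comment above.

     static bool
     example_target_supports_v8hi_add (void)
     {
       // CODE_FOR_nothing means the target has no instruction pattern for
       // this operation/mode pair, so the stmt cannot be vectorized.
       return optab_handler (add_optab, V8HImode) != CODE_FOR_nothing;
     }
*/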
static void vect_estimate_min_profitable_iters (loop_vec_info, int *, int *);
/* Function vect_determine_vectorization_factor

   Determine the vectorization factor (VF).  VF is the number of data elements
   that are operated upon in parallel in a single iteration of the vectorized
   loop.  For example, when vectorizing a loop that operates on 4byte elements,
   on a target with vector size (VS) 16byte, the VF is set to 4, since 4
   elements can fit in a single vector register.

   We currently support vectorization of loops in which all types operated upon
   are of the same size.  Therefore this function currently sets VF according to
   the size of the types operated upon, and fails if there are multiple sizes
   in the loop.

   VF is also the factor by which the loop iterations are strip-mined, e.g.:
   original loop:
        for (i=0; i<N; i++){
          a[i] = b[i] + c[i];
        }

   vectorized loop:
        for (i=0; i<N; i+=VF){
          a[i:VF] = b[i:VF] + c[i:VF];
        }
*/
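/* Editorial sketch of the arithmetic described above, assuming a 16-byte
   vector register: with 4-byte elements VF = 16 / 4 = 4, with 2-byte elements
   VF = 16 / 2 = 8.  The helper below is hypothetical and only mirrors that
   reasoning; the real code derives VF from TYPE_VECTOR_SUBPARTS of the
   vectype chosen for each statement and takes the maximum over the loop.

     static unsigned int
     example_vf (unsigned int vector_size_in_bytes,
                 unsigned int element_size_in_bytes)
     {
       return vector_size_in_bytes / element_size_in_bytes;
     }
*/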
191 static bool
192 vect_determine_vectorization_factor (loop_vec_info loop_vinfo)
194 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
195 basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
196 int nbbs = loop->num_nodes;
197 unsigned int vectorization_factor = 0;
198 tree scalar_type;
199 gphi *phi;
200 tree vectype;
201 unsigned int nunits;
202 stmt_vec_info stmt_info;
203 int i;
204 HOST_WIDE_INT dummy;
205 gimple stmt, pattern_stmt = NULL;
206 gimple_seq pattern_def_seq = NULL;
207 gimple_stmt_iterator pattern_def_si = gsi_none ();
208 bool analyze_pattern_stmt = false;
210 if (dump_enabled_p ())
211 dump_printf_loc (MSG_NOTE, vect_location,
212 "=== vect_determine_vectorization_factor ===\n");
214 for (i = 0; i < nbbs; i++)
216 basic_block bb = bbs[i];
218 for (gphi_iterator si = gsi_start_phis (bb); !gsi_end_p (si);
219 gsi_next (&si))
221 phi = si.phi ();
222 stmt_info = vinfo_for_stmt (phi);
223 if (dump_enabled_p ())
225 dump_printf_loc (MSG_NOTE, vect_location, "==> examining phi: ");
226 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, phi, 0);
227 dump_printf (MSG_NOTE, "\n");
230 gcc_assert (stmt_info);
232 if (STMT_VINFO_RELEVANT_P (stmt_info))
234 gcc_assert (!STMT_VINFO_VECTYPE (stmt_info));
235 scalar_type = TREE_TYPE (PHI_RESULT (phi));
237 if (dump_enabled_p ())
239 dump_printf_loc (MSG_NOTE, vect_location,
240 "get vectype for scalar type: ");
241 dump_generic_expr (MSG_NOTE, TDF_SLIM, scalar_type);
242 dump_printf (MSG_NOTE, "\n");
245 vectype = get_vectype_for_scalar_type (scalar_type);
246 if (!vectype)
248 if (dump_enabled_p ())
250 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
251 "not vectorized: unsupported "
252 "data-type ");
253 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
254 scalar_type);
255 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
257 return false;
259 STMT_VINFO_VECTYPE (stmt_info) = vectype;
261 if (dump_enabled_p ())
263 dump_printf_loc (MSG_NOTE, vect_location, "vectype: ");
264 dump_generic_expr (MSG_NOTE, TDF_SLIM, vectype);
265 dump_printf (MSG_NOTE, "\n");
268 nunits = TYPE_VECTOR_SUBPARTS (vectype);
269 if (dump_enabled_p ())
270 dump_printf_loc (MSG_NOTE, vect_location, "nunits = %d\n",
271 nunits);
273 if (!vectorization_factor
274 || (nunits > vectorization_factor))
275 vectorization_factor = nunits;
279 for (gimple_stmt_iterator si = gsi_start_bb (bb);
280 !gsi_end_p (si) || analyze_pattern_stmt;)
282 tree vf_vectype;
284 if (analyze_pattern_stmt)
285 stmt = pattern_stmt;
286 else
287 stmt = gsi_stmt (si);
289 stmt_info = vinfo_for_stmt (stmt);
291 if (dump_enabled_p ())
293 dump_printf_loc (MSG_NOTE, vect_location,
294 "==> examining statement: ");
295 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, stmt, 0);
296 dump_printf (MSG_NOTE, "\n");
299 gcc_assert (stmt_info);
301 /* Skip stmts which do not need to be vectorized. */
302 if ((!STMT_VINFO_RELEVANT_P (stmt_info)
303 && !STMT_VINFO_LIVE_P (stmt_info))
304 || gimple_clobber_p (stmt))
306 if (STMT_VINFO_IN_PATTERN_P (stmt_info)
307 && (pattern_stmt = STMT_VINFO_RELATED_STMT (stmt_info))
308 && (STMT_VINFO_RELEVANT_P (vinfo_for_stmt (pattern_stmt))
309 || STMT_VINFO_LIVE_P (vinfo_for_stmt (pattern_stmt))))
311 stmt = pattern_stmt;
312 stmt_info = vinfo_for_stmt (pattern_stmt);
313 if (dump_enabled_p ())
315 dump_printf_loc (MSG_NOTE, vect_location,
316 "==> examining pattern statement: ");
317 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, stmt, 0);
318 dump_printf (MSG_NOTE, "\n");
321 else
323 if (dump_enabled_p ())
324 dump_printf_loc (MSG_NOTE, vect_location, "skip.\n");
325 gsi_next (&si);
326 continue;
329 else if (STMT_VINFO_IN_PATTERN_P (stmt_info)
330 && (pattern_stmt = STMT_VINFO_RELATED_STMT (stmt_info))
331 && (STMT_VINFO_RELEVANT_P (vinfo_for_stmt (pattern_stmt))
332 || STMT_VINFO_LIVE_P (vinfo_for_stmt (pattern_stmt))))
333 analyze_pattern_stmt = true;
335 /* If a pattern statement has def stmts, analyze them too. */
336 if (is_pattern_stmt_p (stmt_info))
338 if (pattern_def_seq == NULL)
340 pattern_def_seq = STMT_VINFO_PATTERN_DEF_SEQ (stmt_info);
341 pattern_def_si = gsi_start (pattern_def_seq);
343 else if (!gsi_end_p (pattern_def_si))
344 gsi_next (&pattern_def_si);
345 if (pattern_def_seq != NULL)
347 gimple pattern_def_stmt = NULL;
348 stmt_vec_info pattern_def_stmt_info = NULL;
350 while (!gsi_end_p (pattern_def_si))
352 pattern_def_stmt = gsi_stmt (pattern_def_si);
353 pattern_def_stmt_info
354 = vinfo_for_stmt (pattern_def_stmt);
355 if (STMT_VINFO_RELEVANT_P (pattern_def_stmt_info)
356 || STMT_VINFO_LIVE_P (pattern_def_stmt_info))
357 break;
358 gsi_next (&pattern_def_si);
361 if (!gsi_end_p (pattern_def_si))
363 if (dump_enabled_p ())
365 dump_printf_loc (MSG_NOTE, vect_location,
366 "==> examining pattern def stmt: ");
367 dump_gimple_stmt (MSG_NOTE, TDF_SLIM,
368 pattern_def_stmt, 0);
369 dump_printf (MSG_NOTE, "\n");
372 stmt = pattern_def_stmt;
373 stmt_info = pattern_def_stmt_info;
375 else
377 pattern_def_si = gsi_none ();
378 analyze_pattern_stmt = false;
381 else
382 analyze_pattern_stmt = false;
385 if (gimple_get_lhs (stmt) == NULL_TREE
386 /* MASK_STORE has no lhs, but is ok. */
387 && (!is_gimple_call (stmt)
388 || !gimple_call_internal_p (stmt)
389 || gimple_call_internal_fn (stmt) != IFN_MASK_STORE))
391 if (is_gimple_call (stmt))
393 /* Ignore calls with no lhs. These must be calls to
394 #pragma omp simd functions, and what vectorization factor
395 it really needs can't be determined until
396 vectorizable_simd_clone_call. */
397 if (!analyze_pattern_stmt && gsi_end_p (pattern_def_si))
399 pattern_def_seq = NULL;
400 gsi_next (&si);
402 continue;
404 if (dump_enabled_p ())
406 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
407 "not vectorized: irregular stmt.");
408 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt,
410 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
412 return false;
415 if (VECTOR_MODE_P (TYPE_MODE (gimple_expr_type (stmt))))
417 if (dump_enabled_p ())
419 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
420 "not vectorized: vector stmt in loop:");
421 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
422 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
424 return false;
427 if (STMT_VINFO_VECTYPE (stmt_info))
429 /* The only case when a vectype had been already set is for stmts
430 that contain a dataref, or for "pattern-stmts" (stmts
431 generated by the vectorizer to represent/replace a certain
432 idiom). */
433 gcc_assert (STMT_VINFO_DATA_REF (stmt_info)
434 || is_pattern_stmt_p (stmt_info)
435 || !gsi_end_p (pattern_def_si));
436 vectype = STMT_VINFO_VECTYPE (stmt_info);
438 else
440 gcc_assert (!STMT_VINFO_DATA_REF (stmt_info));
441 if (is_gimple_call (stmt)
442 && gimple_call_internal_p (stmt)
443 && gimple_call_internal_fn (stmt) == IFN_MASK_STORE)
444 scalar_type = TREE_TYPE (gimple_call_arg (stmt, 3));
445 else
446 scalar_type = TREE_TYPE (gimple_get_lhs (stmt));
447 if (dump_enabled_p ())
449 dump_printf_loc (MSG_NOTE, vect_location,
450 "get vectype for scalar type: ");
451 dump_generic_expr (MSG_NOTE, TDF_SLIM, scalar_type);
452 dump_printf (MSG_NOTE, "\n");
454 vectype = get_vectype_for_scalar_type (scalar_type);
455 if (!vectype)
457 if (dump_enabled_p ())
459 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
460 "not vectorized: unsupported "
461 "data-type ");
462 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
463 scalar_type);
464 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
466 return false;
469 STMT_VINFO_VECTYPE (stmt_info) = vectype;
471 if (dump_enabled_p ())
473 dump_printf_loc (MSG_NOTE, vect_location, "vectype: ");
474 dump_generic_expr (MSG_NOTE, TDF_SLIM, vectype);
475 dump_printf (MSG_NOTE, "\n");
479 /* The vectorization factor is according to the smallest
480 scalar type (or the largest vector size, but we only
481 support one vector size per loop). */
482 scalar_type = vect_get_smallest_scalar_type (stmt, &dummy,
483 &dummy);
484 if (dump_enabled_p ())
486 dump_printf_loc (MSG_NOTE, vect_location,
487 "get vectype for scalar type: ");
488 dump_generic_expr (MSG_NOTE, TDF_SLIM, scalar_type);
489 dump_printf (MSG_NOTE, "\n");
491 vf_vectype = get_vectype_for_scalar_type (scalar_type);
492 if (!vf_vectype)
494 if (dump_enabled_p ())
496 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
497 "not vectorized: unsupported data-type ");
498 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
499 scalar_type);
500 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
502 return false;
505 if ((GET_MODE_SIZE (TYPE_MODE (vectype))
506 != GET_MODE_SIZE (TYPE_MODE (vf_vectype))))
508 if (dump_enabled_p ())
510 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
511 "not vectorized: different sized vector "
512 "types in statement, ");
513 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
514 vectype);
515 dump_printf (MSG_MISSED_OPTIMIZATION, " and ");
516 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
517 vf_vectype);
518 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
520 return false;
523 if (dump_enabled_p ())
525 dump_printf_loc (MSG_NOTE, vect_location, "vectype: ");
526 dump_generic_expr (MSG_NOTE, TDF_SLIM, vf_vectype);
527 dump_printf (MSG_NOTE, "\n");
530 nunits = TYPE_VECTOR_SUBPARTS (vf_vectype);
531 if (dump_enabled_p ())
532 dump_printf_loc (MSG_NOTE, vect_location, "nunits = %d\n", nunits);
533 if (!vectorization_factor
534 || (nunits > vectorization_factor))
535 vectorization_factor = nunits;
537 if (!analyze_pattern_stmt && gsi_end_p (pattern_def_si))
539 pattern_def_seq = NULL;
540 gsi_next (&si);
545 /* TODO: Analyze cost. Decide if worth while to vectorize. */
546 if (dump_enabled_p ())
547 dump_printf_loc (MSG_NOTE, vect_location, "vectorization factor = %d\n",
548 vectorization_factor);
549 if (vectorization_factor <= 1)
551 if (dump_enabled_p ())
552 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
553 "not vectorized: unsupported data-type\n");
554 return false;
556 LOOP_VINFO_VECT_FACTOR (loop_vinfo) = vectorization_factor;
558 return true;
/* Function vect_is_simple_iv_evolution.

   FORNOW: A simple evolution of an induction variable in the loop is
   considered a polynomial evolution.  */
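/* Editorial example of what "simple" means here: for an induction like

     for (i = 0; i < n; i++)
       p = p + 4;

   the scalar evolution of 'p' has the form {init, +, step} (here {p_0, +, 4})
   with a loop-invariant step, which this predicate accepts.  An evolution
   part that is itself a chrec (a polynomial of degree >= 2, e.g. the
   evolution of i*i) is rejected below.  */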
567 static bool
568 vect_is_simple_iv_evolution (unsigned loop_nb, tree access_fn, tree * init,
569 tree * step)
571 tree init_expr;
572 tree step_expr;
573 tree evolution_part = evolution_part_in_loop_num (access_fn, loop_nb);
574 basic_block bb;
576 /* When there is no evolution in this loop, the evolution function
577 is not "simple". */
578 if (evolution_part == NULL_TREE)
579 return false;
581 /* When the evolution is a polynomial of degree >= 2
582 the evolution function is not "simple". */
583 if (tree_is_chrec (evolution_part))
584 return false;
586 step_expr = evolution_part;
587 init_expr = unshare_expr (initial_condition_in_loop_num (access_fn, loop_nb));
589 if (dump_enabled_p ())
591 dump_printf_loc (MSG_NOTE, vect_location, "step: ");
592 dump_generic_expr (MSG_NOTE, TDF_SLIM, step_expr);
593 dump_printf (MSG_NOTE, ", init: ");
594 dump_generic_expr (MSG_NOTE, TDF_SLIM, init_expr);
595 dump_printf (MSG_NOTE, "\n");
598 *init = init_expr;
599 *step = step_expr;
601 if (TREE_CODE (step_expr) != INTEGER_CST
602 && (TREE_CODE (step_expr) != SSA_NAME
603 || ((bb = gimple_bb (SSA_NAME_DEF_STMT (step_expr)))
604 && flow_bb_inside_loop_p (get_loop (cfun, loop_nb), bb))
605 || (!INTEGRAL_TYPE_P (TREE_TYPE (step_expr))
606 && (!SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr))
607 || !flag_associative_math)))
608 && (TREE_CODE (step_expr) != REAL_CST
609 || !flag_associative_math))
611 if (dump_enabled_p ())
612 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
613 "step unknown.\n");
614 return false;
617 return true;
/* Function vect_analyze_scalar_cycles_1.

   Examine the cross iteration def-use cycles of scalar variables
   in LOOP.  LOOP_VINFO represents the loop that is now being
   considered for vectorization (can be LOOP, or an outer-loop
   enclosing LOOP).  */
627 static void
628 vect_analyze_scalar_cycles_1 (loop_vec_info loop_vinfo, struct loop *loop)
630 basic_block bb = loop->header;
631 tree init, step;
632 auto_vec<gimple, 64> worklist;
633 gphi_iterator gsi;
634 bool double_reduc;
636 if (dump_enabled_p ())
637 dump_printf_loc (MSG_NOTE, vect_location,
638 "=== vect_analyze_scalar_cycles ===\n");
640 /* First - identify all inductions. Reduction detection assumes that all the
641 inductions have been identified, therefore, this order must not be
642 changed. */
643 for (gsi = gsi_start_phis (bb); !gsi_end_p (gsi); gsi_next (&gsi))
645 gphi *phi = gsi.phi ();
646 tree access_fn = NULL;
647 tree def = PHI_RESULT (phi);
648 stmt_vec_info stmt_vinfo = vinfo_for_stmt (phi);
650 if (dump_enabled_p ())
652 dump_printf_loc (MSG_NOTE, vect_location, "Analyze phi: ");
653 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, phi, 0);
654 dump_printf (MSG_NOTE, "\n");
657 /* Skip virtual phi's. The data dependences that are associated with
658 virtual defs/uses (i.e., memory accesses) are analyzed elsewhere. */
659 if (virtual_operand_p (def))
660 continue;
662 STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_unknown_def_type;
664 /* Analyze the evolution function. */
665 access_fn = analyze_scalar_evolution (loop, def);
666 if (access_fn)
668 STRIP_NOPS (access_fn);
669 if (dump_enabled_p ())
671 dump_printf_loc (MSG_NOTE, vect_location,
672 "Access function of PHI: ");
673 dump_generic_expr (MSG_NOTE, TDF_SLIM, access_fn);
674 dump_printf (MSG_NOTE, "\n");
676 STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_vinfo)
677 = evolution_part_in_loop_num (access_fn, loop->num);
680 if (!access_fn
681 || !vect_is_simple_iv_evolution (loop->num, access_fn, &init, &step)
682 || (LOOP_VINFO_LOOP (loop_vinfo) != loop
683 && TREE_CODE (step) != INTEGER_CST))
685 worklist.safe_push (phi);
686 continue;
689 gcc_assert (STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_vinfo) != NULL_TREE);
691 if (dump_enabled_p ())
692 dump_printf_loc (MSG_NOTE, vect_location, "Detected induction.\n");
693 STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_induction_def;
697 /* Second - identify all reductions and nested cycles. */
698 while (worklist.length () > 0)
700 gimple phi = worklist.pop ();
701 tree def = PHI_RESULT (phi);
702 stmt_vec_info stmt_vinfo = vinfo_for_stmt (phi);
703 gimple reduc_stmt;
704 bool nested_cycle;
706 if (dump_enabled_p ())
708 dump_printf_loc (MSG_NOTE, vect_location, "Analyze phi: ");
709 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, phi, 0);
710 dump_printf (MSG_NOTE, "\n");
713 gcc_assert (!virtual_operand_p (def)
714 && STMT_VINFO_DEF_TYPE (stmt_vinfo) == vect_unknown_def_type);
716 nested_cycle = (loop != LOOP_VINFO_LOOP (loop_vinfo));
717 reduc_stmt = vect_force_simple_reduction (loop_vinfo, phi, !nested_cycle,
718 &double_reduc);
719 if (reduc_stmt)
721 if (double_reduc)
723 if (dump_enabled_p ())
724 dump_printf_loc (MSG_NOTE, vect_location,
725 "Detected double reduction.\n");
727 STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_double_reduction_def;
728 STMT_VINFO_DEF_TYPE (vinfo_for_stmt (reduc_stmt)) =
729 vect_double_reduction_def;
731 else
733 if (nested_cycle)
735 if (dump_enabled_p ())
736 dump_printf_loc (MSG_NOTE, vect_location,
737 "Detected vectorizable nested cycle.\n");
739 STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_nested_cycle;
740 STMT_VINFO_DEF_TYPE (vinfo_for_stmt (reduc_stmt)) =
741 vect_nested_cycle;
743 else
745 if (dump_enabled_p ())
746 dump_printf_loc (MSG_NOTE, vect_location,
747 "Detected reduction.\n");
749 STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_reduction_def;
750 STMT_VINFO_DEF_TYPE (vinfo_for_stmt (reduc_stmt)) =
751 vect_reduction_def;
752 /* Store the reduction cycles for possible vectorization in
753 loop-aware SLP. */
754 LOOP_VINFO_REDUCTIONS (loop_vinfo).safe_push (reduc_stmt);
758 else
759 if (dump_enabled_p ())
760 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
761 "Unknown def-use cycle pattern.\n");
/* Function vect_analyze_scalar_cycles.

   Examine the cross iteration def-use cycles of scalar variables, by
   analyzing the loop-header PHIs of scalar variables.  Classify each
   cycle as one of the following: invariant, induction, reduction, unknown.
   We do that for the loop represented by LOOP_VINFO, and also for its
   inner-loop, if it exists.
   Examples for scalar cycles:

   Example1: reduction:

              loop1:
              for (i=0; i<N; i++)
                 sum += a[i];

   Example2: induction:

              loop2:
              for (i=0; i<N; i++)
                 a[i] = i;  */
787 static void
788 vect_analyze_scalar_cycles (loop_vec_info loop_vinfo)
790 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
792 vect_analyze_scalar_cycles_1 (loop_vinfo, loop);
794 /* When vectorizing an outer-loop, the inner-loop is executed sequentially.
795 Reductions in such inner-loop therefore have different properties than
796 the reductions in the nest that gets vectorized:
797 1. When vectorized, they are executed in the same order as in the original
798 scalar loop, so we can't change the order of computation when
799 vectorizing them.
800 2. FIXME: Inner-loop reductions can be used in the inner-loop, so the
801 current checks are too strict. */
803 if (loop->inner)
804 vect_analyze_scalar_cycles_1 (loop_vinfo, loop->inner);
807 /* Transfer group and reduction information from STMT to its pattern stmt. */
809 static void
810 vect_fixup_reduc_chain (gimple stmt)
812 gimple firstp = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (stmt));
813 gimple stmtp;
814 gcc_assert (!GROUP_FIRST_ELEMENT (vinfo_for_stmt (firstp))
815 && GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)));
816 GROUP_SIZE (vinfo_for_stmt (firstp)) = GROUP_SIZE (vinfo_for_stmt (stmt));
819 stmtp = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (stmt));
820 GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmtp)) = firstp;
821 stmt = GROUP_NEXT_ELEMENT (vinfo_for_stmt (stmt));
822 if (stmt)
823 GROUP_NEXT_ELEMENT (vinfo_for_stmt (stmtp))
824 = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (stmt));
826 while (stmt);
827 STMT_VINFO_DEF_TYPE (vinfo_for_stmt (stmtp)) = vect_reduction_def;
830 /* Fixup scalar cycles that now have their stmts detected as patterns. */
832 static void
833 vect_fixup_scalar_cycles_with_patterns (loop_vec_info loop_vinfo)
835 gimple first;
836 unsigned i;
838 FOR_EACH_VEC_ELT (LOOP_VINFO_REDUCTION_CHAINS (loop_vinfo), i, first)
839 if (STMT_VINFO_IN_PATTERN_P (vinfo_for_stmt (first)))
841 vect_fixup_reduc_chain (first);
842 LOOP_VINFO_REDUCTION_CHAINS (loop_vinfo)[i]
843 = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (first));
/* Function vect_get_loop_niters.

   Determine how many iterations the loop is executed and place it
   in NUMBER_OF_ITERATIONS.  Place the number of latch iterations
   in NUMBER_OF_ITERATIONSM1.

   Return the loop exit condition.  */
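/* Editorial example of the distinction above: for a loop whose body executes
   4 times, the latch is executed 3 times, so NUMBER_OF_ITERATIONSM1 is 3 and
   NUMBER_OF_ITERATIONS (the number of header executions) is 3 + 1 = 4.  As
   noted in the function body, the "+ 1" can wrap to zero when the latch
   executes UINT_MAX times, e.g. for do { n++; } while (n != 0);  */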
856 static gcond *
857 vect_get_loop_niters (struct loop *loop, tree *number_of_iterations,
858 tree *number_of_iterationsm1)
860 tree niters;
862 if (dump_enabled_p ())
863 dump_printf_loc (MSG_NOTE, vect_location,
864 "=== get_loop_niters ===\n");
866 niters = number_of_latch_executions (loop);
867 *number_of_iterationsm1 = niters;
869 /* We want the number of loop header executions which is the number
870 of latch executions plus one.
871 ??? For UINT_MAX latch executions this number overflows to zero
872 for loops like do { n++; } while (n != 0); */
873 if (niters && !chrec_contains_undetermined (niters))
874 niters = fold_build2 (PLUS_EXPR, TREE_TYPE (niters), unshare_expr (niters),
875 build_int_cst (TREE_TYPE (niters), 1));
876 *number_of_iterations = niters;
878 return get_loop_exit_condition (loop);
/* Function bb_in_loop_p

   Used as predicate for dfs order traversal of the loop bbs.  */
886 static bool
887 bb_in_loop_p (const_basic_block bb, const void *data)
889 const struct loop *const loop = (const struct loop *)data;
890 if (flow_bb_inside_loop_p (loop, bb))
891 return true;
892 return false;
/* Function new_loop_vec_info.

   Create and initialize a new loop_vec_info struct for LOOP, as well as
   stmt_vec_info structs for all the stmts in LOOP.  */
901 static loop_vec_info
902 new_loop_vec_info (struct loop *loop)
904 loop_vec_info res;
905 basic_block *bbs;
906 gimple_stmt_iterator si;
907 unsigned int i, nbbs;
909 res = (loop_vec_info) xcalloc (1, sizeof (struct _loop_vec_info));
910 LOOP_VINFO_LOOP (res) = loop;
912 bbs = get_loop_body (loop);
914 /* Create/Update stmt_info for all stmts in the loop. */
915 for (i = 0; i < loop->num_nodes; i++)
917 basic_block bb = bbs[i];
919 /* BBs in a nested inner-loop will have been already processed (because
920 we will have called vect_analyze_loop_form for any nested inner-loop).
921 Therefore, for stmts in an inner-loop we just want to update the
922 STMT_VINFO_LOOP_VINFO field of their stmt_info to point to the new
923 loop_info of the outer-loop we are currently considering to vectorize
924 (instead of the loop_info of the inner-loop).
925 For stmts in other BBs we need to create a stmt_info from scratch. */
926 if (bb->loop_father != loop)
928 /* Inner-loop bb. */
929 gcc_assert (loop->inner && bb->loop_father == loop->inner);
930 for (si = gsi_start_phis (bb); !gsi_end_p (si); gsi_next (&si))
932 gimple phi = gsi_stmt (si);
933 stmt_vec_info stmt_info = vinfo_for_stmt (phi);
934 loop_vec_info inner_loop_vinfo =
935 STMT_VINFO_LOOP_VINFO (stmt_info);
936 gcc_assert (loop->inner == LOOP_VINFO_LOOP (inner_loop_vinfo));
937 STMT_VINFO_LOOP_VINFO (stmt_info) = res;
939 for (si = gsi_start_bb (bb); !gsi_end_p (si); gsi_next (&si))
941 gimple stmt = gsi_stmt (si);
942 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
943 loop_vec_info inner_loop_vinfo =
944 STMT_VINFO_LOOP_VINFO (stmt_info);
945 gcc_assert (loop->inner == LOOP_VINFO_LOOP (inner_loop_vinfo));
946 STMT_VINFO_LOOP_VINFO (stmt_info) = res;
949 else
951 /* bb in current nest. */
952 for (si = gsi_start_phis (bb); !gsi_end_p (si); gsi_next (&si))
954 gimple phi = gsi_stmt (si);
955 gimple_set_uid (phi, 0);
956 set_vinfo_for_stmt (phi, new_stmt_vec_info (phi, res, NULL));
959 for (si = gsi_start_bb (bb); !gsi_end_p (si); gsi_next (&si))
961 gimple stmt = gsi_stmt (si);
962 gimple_set_uid (stmt, 0);
963 set_vinfo_for_stmt (stmt, new_stmt_vec_info (stmt, res, NULL));
  /* CHECKME: We want to visit all BBs before their successors (except for
     latch blocks, for which this assertion wouldn't hold).  In the simple
     case of the loop forms we allow, a dfs order of the BBs would be the
     same as reversed postorder traversal, so we are safe.  */
973 free (bbs);
974 bbs = XCNEWVEC (basic_block, loop->num_nodes);
975 nbbs = dfs_enumerate_from (loop->header, 0, bb_in_loop_p,
976 bbs, loop->num_nodes, loop);
977 gcc_assert (nbbs == loop->num_nodes);
979 LOOP_VINFO_BBS (res) = bbs;
980 LOOP_VINFO_NITERSM1 (res) = NULL;
981 LOOP_VINFO_NITERS (res) = NULL;
982 LOOP_VINFO_NITERS_UNCHANGED (res) = NULL;
983 LOOP_VINFO_COST_MODEL_MIN_ITERS (res) = 0;
984 LOOP_VINFO_COST_MODEL_THRESHOLD (res) = 0;
985 LOOP_VINFO_VECTORIZABLE_P (res) = 0;
986 LOOP_VINFO_PEELING_FOR_ALIGNMENT (res) = 0;
987 LOOP_VINFO_VECT_FACTOR (res) = 0;
988 LOOP_VINFO_LOOP_NEST (res).create (3);
989 LOOP_VINFO_DATAREFS (res).create (10);
990 LOOP_VINFO_DDRS (res).create (10 * 10);
991 LOOP_VINFO_UNALIGNED_DR (res) = NULL;
992 LOOP_VINFO_MAY_MISALIGN_STMTS (res).create (
993 PARAM_VALUE (PARAM_VECT_MAX_VERSION_FOR_ALIGNMENT_CHECKS));
994 LOOP_VINFO_MAY_ALIAS_DDRS (res).create (
995 PARAM_VALUE (PARAM_VECT_MAX_VERSION_FOR_ALIAS_CHECKS));
996 LOOP_VINFO_GROUPED_STORES (res).create (10);
997 LOOP_VINFO_REDUCTIONS (res).create (10);
998 LOOP_VINFO_REDUCTION_CHAINS (res).create (10);
999 LOOP_VINFO_SLP_INSTANCES (res).create (10);
1000 LOOP_VINFO_SLP_UNROLLING_FACTOR (res) = 1;
1001 LOOP_VINFO_TARGET_COST_DATA (res) = init_cost (loop);
1002 LOOP_VINFO_PEELING_FOR_GAPS (res) = false;
1003 LOOP_VINFO_PEELING_FOR_NITER (res) = false;
1004 LOOP_VINFO_OPERANDS_SWAPPED (res) = false;
1006 return res;
/* Function destroy_loop_vec_info.

   Free LOOP_VINFO struct, as well as all the stmt_vec_info structs of all the
   stmts in the loop.  */
1015 void
1016 destroy_loop_vec_info (loop_vec_info loop_vinfo, bool clean_stmts)
1018 struct loop *loop;
1019 basic_block *bbs;
1020 int nbbs;
1021 gimple_stmt_iterator si;
1022 int j;
1023 vec<slp_instance> slp_instances;
1024 slp_instance instance;
1025 bool swapped;
1027 if (!loop_vinfo)
1028 return;
1030 loop = LOOP_VINFO_LOOP (loop_vinfo);
1032 bbs = LOOP_VINFO_BBS (loop_vinfo);
1033 nbbs = clean_stmts ? loop->num_nodes : 0;
1034 swapped = LOOP_VINFO_OPERANDS_SWAPPED (loop_vinfo);
1036 for (j = 0; j < nbbs; j++)
1038 basic_block bb = bbs[j];
1039 for (si = gsi_start_phis (bb); !gsi_end_p (si); gsi_next (&si))
1040 free_stmt_vec_info (gsi_stmt (si));
1042 for (si = gsi_start_bb (bb); !gsi_end_p (si); )
1044 gimple stmt = gsi_stmt (si);
1046 /* We may have broken canonical form by moving a constant
1047 into RHS1 of a commutative op. Fix such occurrences. */
1048 if (swapped && is_gimple_assign (stmt))
1050 enum tree_code code = gimple_assign_rhs_code (stmt);
1052 if ((code == PLUS_EXPR
1053 || code == POINTER_PLUS_EXPR
1054 || code == MULT_EXPR)
1055 && CONSTANT_CLASS_P (gimple_assign_rhs1 (stmt)))
1056 swap_ssa_operands (stmt,
1057 gimple_assign_rhs1_ptr (stmt),
1058 gimple_assign_rhs2_ptr (stmt));
1061 /* Free stmt_vec_info. */
1062 free_stmt_vec_info (stmt);
1063 gsi_next (&si);
1067 free (LOOP_VINFO_BBS (loop_vinfo));
1068 vect_destroy_datarefs (loop_vinfo, NULL);
1069 free_dependence_relations (LOOP_VINFO_DDRS (loop_vinfo));
1070 LOOP_VINFO_LOOP_NEST (loop_vinfo).release ();
1071 LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).release ();
1072 LOOP_VINFO_MAY_ALIAS_DDRS (loop_vinfo).release ();
1073 slp_instances = LOOP_VINFO_SLP_INSTANCES (loop_vinfo);
1074 FOR_EACH_VEC_ELT (slp_instances, j, instance)
1075 vect_free_slp_instance (instance);
1077 LOOP_VINFO_SLP_INSTANCES (loop_vinfo).release ();
1078 LOOP_VINFO_GROUPED_STORES (loop_vinfo).release ();
1079 LOOP_VINFO_REDUCTIONS (loop_vinfo).release ();
1080 LOOP_VINFO_REDUCTION_CHAINS (loop_vinfo).release ();
1082 delete LOOP_VINFO_PEELING_HTAB (loop_vinfo);
1083 LOOP_VINFO_PEELING_HTAB (loop_vinfo) = NULL;
1085 destroy_cost_data (LOOP_VINFO_TARGET_COST_DATA (loop_vinfo));
1086 loop_vinfo->scalar_cost_vec.release ();
1088 free (loop_vinfo);
1089 loop->aux = NULL;
1093 /* Calculate the cost of one scalar iteration of the loop. */
1094 static void
1095 vect_get_single_scalar_iteration_cost (loop_vec_info loop_vinfo)
1097 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1098 basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
1099 int nbbs = loop->num_nodes, factor, scalar_single_iter_cost = 0;
1100 int innerloop_iters, i;
  /* Count statements in scalar loop.  Using this as scalar cost for a single
     iteration for now.

     TODO: Add outer loop support.

     TODO: Consider assigning different costs to different scalar
     statements.  */
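  /* Editorial worked example (an assumed loop body, not taken from this
     file): a scalar iteration containing two loads, one add and one store
     would roughly be accounted as scalar_load + scalar_load + scalar_stmt
     + scalar_store, each weighted by FACTOR (innerloop_iters for stmts in
     an inner loop, 1 otherwise), via the record_stmt_cost calls in the
     loop that follows.  */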
1110 /* FORNOW. */
1111 innerloop_iters = 1;
1112 if (loop->inner)
1113 innerloop_iters = 50; /* FIXME */
1115 for (i = 0; i < nbbs; i++)
1117 gimple_stmt_iterator si;
1118 basic_block bb = bbs[i];
1120 if (bb->loop_father == loop->inner)
1121 factor = innerloop_iters;
1122 else
1123 factor = 1;
1125 for (si = gsi_start_bb (bb); !gsi_end_p (si); gsi_next (&si))
1127 gimple stmt = gsi_stmt (si);
1128 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
1130 if (!is_gimple_assign (stmt) && !is_gimple_call (stmt))
1131 continue;
1133 /* Skip stmts that are not vectorized inside the loop. */
1134 if (stmt_info
1135 && !STMT_VINFO_RELEVANT_P (stmt_info)
1136 && (!STMT_VINFO_LIVE_P (stmt_info)
1137 || !VECTORIZABLE_CYCLE_DEF (STMT_VINFO_DEF_TYPE (stmt_info)))
1138 && !STMT_VINFO_IN_PATTERN_P (stmt_info))
1139 continue;
1141 vect_cost_for_stmt kind;
1142 if (STMT_VINFO_DATA_REF (vinfo_for_stmt (stmt)))
1144 if (DR_IS_READ (STMT_VINFO_DATA_REF (vinfo_for_stmt (stmt))))
1145 kind = scalar_load;
1146 else
1147 kind = scalar_store;
1149 else
1150 kind = scalar_stmt;
1152 scalar_single_iter_cost
1153 += record_stmt_cost (&LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo),
1154 factor, kind, NULL, 0, vect_prologue);
1157 LOOP_VINFO_SINGLE_SCALAR_ITERATION_COST (loop_vinfo)
1158 = scalar_single_iter_cost;
/* Function vect_analyze_loop_1.

   Apply a set of analyses on LOOP, and create a loop_vec_info struct
   for it.  The different analyses will record information in the
   loop_vec_info struct.  This is a subset of the analyses applied in
   vect_analyze_loop, to be applied on an inner-loop nested in the loop
   that is now considered for (outer-loop) vectorization.  */
1170 static loop_vec_info
1171 vect_analyze_loop_1 (struct loop *loop)
1173 loop_vec_info loop_vinfo;
1175 if (dump_enabled_p ())
1176 dump_printf_loc (MSG_NOTE, vect_location,
1177 "===== analyze_loop_nest_1 =====\n");
1179 /* Check the CFG characteristics of the loop (nesting, entry/exit, etc. */
1181 loop_vinfo = vect_analyze_loop_form (loop);
1182 if (!loop_vinfo)
1184 if (dump_enabled_p ())
1185 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1186 "bad inner-loop form.\n");
1187 return NULL;
1190 return loop_vinfo;
/* Function vect_analyze_loop_form.

   Verify that certain CFG restrictions hold, including:
   - the loop has a pre-header
   - the loop has a single entry and exit
   - the loop exit condition is simple enough, and the number of iterations
     can be analyzed (a countable loop).  */
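/* Editorial illustration (assumed user code, not taken from this file) of
   the restrictions listed above:

     // Accepted form: single entry and exit, empty latch, countable.
     for (i = 0; i < n; i++)
       a[i] = b[i] + c[i];

     // Rejected: the early break adds a second exit edge, so the
     // single_exit check below fails ("not vectorized: multiple exits.").
     for (i = 0; i < n; i++)
       {
         if (a[i] == key)
           break;
         b[i] = 0;
       }
*/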
1202 loop_vec_info
1203 vect_analyze_loop_form (struct loop *loop)
1205 loop_vec_info loop_vinfo;
1206 gcond *loop_cond;
1207 tree number_of_iterations = NULL, number_of_iterationsm1 = NULL;
1208 loop_vec_info inner_loop_vinfo = NULL;
1210 if (dump_enabled_p ())
1211 dump_printf_loc (MSG_NOTE, vect_location,
1212 "=== vect_analyze_loop_form ===\n");
1214 /* Different restrictions apply when we are considering an inner-most loop,
1215 vs. an outer (nested) loop.
1216 (FORNOW. May want to relax some of these restrictions in the future). */
1218 if (!loop->inner)
1220 /* Inner-most loop. We currently require that the number of BBs is
1221 exactly 2 (the header and latch). Vectorizable inner-most loops
1222 look like this:
1224 (pre-header)
1226 header <--------+
1227 | | |
1228 | +--> latch --+
1230 (exit-bb) */
1232 if (loop->num_nodes != 2)
1234 if (dump_enabled_p ())
1235 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1236 "not vectorized: control flow in loop.\n");
1237 return NULL;
1240 if (empty_block_p (loop->header))
1242 if (dump_enabled_p ())
1243 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1244 "not vectorized: empty loop.\n");
1245 return NULL;
1248 else
1250 struct loop *innerloop = loop->inner;
1251 edge entryedge;
1253 /* Nested loop. We currently require that the loop is doubly-nested,
1254 contains a single inner loop, and the number of BBs is exactly 5.
1255 Vectorizable outer-loops look like this:
1257 (pre-header)
1259 header <---+
1261 inner-loop |
1263 tail ------+
1265 (exit-bb)
1267 The inner-loop has the properties expected of inner-most loops
1268 as described above. */
1270 if ((loop->inner)->inner || (loop->inner)->next)
1272 if (dump_enabled_p ())
1273 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1274 "not vectorized: multiple nested loops.\n");
1275 return NULL;
1278 /* Analyze the inner-loop. */
1279 inner_loop_vinfo = vect_analyze_loop_1 (loop->inner);
1280 if (!inner_loop_vinfo)
1282 if (dump_enabled_p ())
1283 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1284 "not vectorized: Bad inner loop.\n");
1285 return NULL;
1288 if (!expr_invariant_in_loop_p (loop,
1289 LOOP_VINFO_NITERS (inner_loop_vinfo)))
1291 if (dump_enabled_p ())
1292 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1293 "not vectorized: inner-loop count not"
1294 " invariant.\n");
1295 destroy_loop_vec_info (inner_loop_vinfo, true);
1296 return NULL;
1299 if (loop->num_nodes != 5)
1301 if (dump_enabled_p ())
1302 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1303 "not vectorized: control flow in loop.\n");
1304 destroy_loop_vec_info (inner_loop_vinfo, true);
1305 return NULL;
1308 gcc_assert (EDGE_COUNT (innerloop->header->preds) == 2);
1309 entryedge = EDGE_PRED (innerloop->header, 0);
1310 if (EDGE_PRED (innerloop->header, 0)->src == innerloop->latch)
1311 entryedge = EDGE_PRED (innerloop->header, 1);
1313 if (entryedge->src != loop->header
1314 || !single_exit (innerloop)
1315 || single_exit (innerloop)->dest != EDGE_PRED (loop->latch, 0)->src)
1317 if (dump_enabled_p ())
1318 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1319 "not vectorized: unsupported outerloop form.\n");
1320 destroy_loop_vec_info (inner_loop_vinfo, true);
1321 return NULL;
1324 if (dump_enabled_p ())
1325 dump_printf_loc (MSG_NOTE, vect_location,
1326 "Considering outer-loop vectorization.\n");
1329 if (!single_exit (loop)
1330 || EDGE_COUNT (loop->header->preds) != 2)
1332 if (dump_enabled_p ())
1334 if (!single_exit (loop))
1335 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1336 "not vectorized: multiple exits.\n");
1337 else if (EDGE_COUNT (loop->header->preds) != 2)
1338 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1339 "not vectorized: too many incoming edges.\n");
1341 if (inner_loop_vinfo)
1342 destroy_loop_vec_info (inner_loop_vinfo, true);
1343 return NULL;
1346 /* We assume that the loop exit condition is at the end of the loop. i.e,
1347 that the loop is represented as a do-while (with a proper if-guard
1348 before the loop if needed), where the loop header contains all the
1349 executable statements, and the latch is empty. */
1350 if (!empty_block_p (loop->latch)
1351 || !gimple_seq_empty_p (phi_nodes (loop->latch)))
1353 if (dump_enabled_p ())
1354 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1355 "not vectorized: latch block not empty.\n");
1356 if (inner_loop_vinfo)
1357 destroy_loop_vec_info (inner_loop_vinfo, true);
1358 return NULL;
1361 /* Make sure there exists a single-predecessor exit bb: */
1362 if (!single_pred_p (single_exit (loop)->dest))
1364 edge e = single_exit (loop);
1365 if (!(e->flags & EDGE_ABNORMAL))
1367 split_loop_exit_edge (e);
1368 if (dump_enabled_p ())
1369 dump_printf (MSG_NOTE, "split exit edge.\n");
1371 else
1373 if (dump_enabled_p ())
1374 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1375 "not vectorized: abnormal loop exit edge.\n");
1376 if (inner_loop_vinfo)
1377 destroy_loop_vec_info (inner_loop_vinfo, true);
1378 return NULL;
1382 loop_cond = vect_get_loop_niters (loop, &number_of_iterations,
1383 &number_of_iterationsm1);
1384 if (!loop_cond)
1386 if (dump_enabled_p ())
1387 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1388 "not vectorized: complicated exit condition.\n");
1389 if (inner_loop_vinfo)
1390 destroy_loop_vec_info (inner_loop_vinfo, true);
1391 return NULL;
1394 if (!number_of_iterations
1395 || chrec_contains_undetermined (number_of_iterations))
1397 if (dump_enabled_p ())
1398 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1399 "not vectorized: number of iterations cannot be "
1400 "computed.\n");
1401 if (inner_loop_vinfo)
1402 destroy_loop_vec_info (inner_loop_vinfo, true);
1403 return NULL;
1406 if (integer_zerop (number_of_iterations))
1408 if (dump_enabled_p ())
1409 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1410 "not vectorized: number of iterations = 0.\n");
1411 if (inner_loop_vinfo)
1412 destroy_loop_vec_info (inner_loop_vinfo, true);
1413 return NULL;
1416 loop_vinfo = new_loop_vec_info (loop);
1417 LOOP_VINFO_NITERSM1 (loop_vinfo) = number_of_iterationsm1;
1418 LOOP_VINFO_NITERS (loop_vinfo) = number_of_iterations;
1419 LOOP_VINFO_NITERS_UNCHANGED (loop_vinfo) = number_of_iterations;
1421 if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
1423 if (dump_enabled_p ())
1425 dump_printf_loc (MSG_NOTE, vect_location,
1426 "Symbolic number of iterations is ");
1427 dump_generic_expr (MSG_NOTE, TDF_DETAILS, number_of_iterations);
1428 dump_printf (MSG_NOTE, "\n");
1432 STMT_VINFO_TYPE (vinfo_for_stmt (loop_cond)) = loop_exit_ctrl_vec_info_type;
  /* CHECKME: May want to keep it around in the future.  */
1435 if (inner_loop_vinfo)
1436 destroy_loop_vec_info (inner_loop_vinfo, false);
1438 gcc_assert (!loop->aux);
1439 loop->aux = loop_vinfo;
1440 return loop_vinfo;
/* Scan the loop stmts and, depending on whether there are any (non-)SLP
   statements, update the vectorization factor.  */
1446 static void
1447 vect_update_vf_for_slp (loop_vec_info loop_vinfo)
1449 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1450 basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
1451 int nbbs = loop->num_nodes;
1452 unsigned int vectorization_factor;
1453 int i;
1455 if (dump_enabled_p ())
1456 dump_printf_loc (MSG_NOTE, vect_location,
1457 "=== vect_update_vf_for_slp ===\n");
1459 vectorization_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
1460 gcc_assert (vectorization_factor != 0);
  /* If all the stmts in the loop can be SLPed, we perform only SLP, and the
     vectorization factor of the loop is the unrolling factor required by
     the SLP instances.  If that unrolling factor is 1, we say that we
     perform pure SLP on the loop - cross iteration parallelism is not
     exploited.  */
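  /* Editorial worked example of the update done at the end of this function:
     if the loop vectorization factor was computed as 4 and the SLP instances
     require an unrolling factor of 6, a loop that also contains non-pure-SLP
     stmts gets least_common_multiple (4, 6) = 12, whereas a pure-SLP loop
     simply takes the SLP unrolling factor of 6.  */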
1467 bool only_slp_in_loop = true;
1468 for (i = 0; i < nbbs; i++)
1470 basic_block bb = bbs[i];
1471 for (gimple_stmt_iterator si = gsi_start_bb (bb); !gsi_end_p (si);
1472 gsi_next (&si))
1474 gimple stmt = gsi_stmt (si);
1475 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
1476 if (STMT_VINFO_IN_PATTERN_P (stmt_info)
1477 && STMT_VINFO_RELATED_STMT (stmt_info))
1479 stmt = STMT_VINFO_RELATED_STMT (stmt_info);
1480 stmt_info = vinfo_for_stmt (stmt);
1482 if ((STMT_VINFO_RELEVANT_P (stmt_info)
1483 || VECTORIZABLE_CYCLE_DEF (STMT_VINFO_DEF_TYPE (stmt_info)))
1484 && !PURE_SLP_STMT (stmt_info))
1485 /* STMT needs both SLP and loop-based vectorization. */
1486 only_slp_in_loop = false;
1490 if (only_slp_in_loop)
1491 vectorization_factor = LOOP_VINFO_SLP_UNROLLING_FACTOR (loop_vinfo);
1492 else
1493 vectorization_factor
1494 = least_common_multiple (vectorization_factor,
1495 LOOP_VINFO_SLP_UNROLLING_FACTOR (loop_vinfo));
1497 LOOP_VINFO_VECT_FACTOR (loop_vinfo) = vectorization_factor;
1498 if (dump_enabled_p ())
1499 dump_printf_loc (MSG_NOTE, vect_location,
1500 "Updating vectorization factor to %d\n",
1501 vectorization_factor);
/* Function vect_analyze_loop_operations.

   Scan the loop stmts and make sure they are all vectorizable.  */
1508 static bool
1509 vect_analyze_loop_operations (loop_vec_info loop_vinfo)
1511 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1512 basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
1513 int nbbs = loop->num_nodes;
1514 unsigned int vectorization_factor;
1515 int i;
1516 stmt_vec_info stmt_info;
1517 bool need_to_vectorize = false;
1518 int min_profitable_iters;
1519 int min_scalar_loop_bound;
1520 unsigned int th;
1521 bool ok;
1522 HOST_WIDE_INT max_niter;
1523 HOST_WIDE_INT estimated_niter;
1524 int min_profitable_estimate;
1526 if (dump_enabled_p ())
1527 dump_printf_loc (MSG_NOTE, vect_location,
1528 "=== vect_analyze_loop_operations ===\n");
1530 for (i = 0; i < nbbs; i++)
1532 basic_block bb = bbs[i];
1534 for (gphi_iterator si = gsi_start_phis (bb); !gsi_end_p (si);
1535 gsi_next (&si))
1537 gphi *phi = si.phi ();
1538 ok = true;
1540 stmt_info = vinfo_for_stmt (phi);
1541 if (dump_enabled_p ())
1543 dump_printf_loc (MSG_NOTE, vect_location, "examining phi: ");
1544 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, phi, 0);
1545 dump_printf (MSG_NOTE, "\n");
1548 /* Inner-loop loop-closed exit phi in outer-loop vectorization
1549 (i.e., a phi in the tail of the outer-loop). */
1550 if (! is_loop_header_bb_p (bb))
1552 /* FORNOW: we currently don't support the case that these phis
1553 are not used in the outerloop (unless it is double reduction,
1554 i.e., this phi is vect_reduction_def), cause this case
1555 requires to actually do something here. */
1556 if ((!STMT_VINFO_RELEVANT_P (stmt_info)
1557 || STMT_VINFO_LIVE_P (stmt_info))
1558 && STMT_VINFO_DEF_TYPE (stmt_info)
1559 != vect_double_reduction_def)
1561 if (dump_enabled_p ())
1562 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1563 "Unsupported loop-closed phi in "
1564 "outer-loop.\n");
1565 return false;
1568 /* If PHI is used in the outer loop, we check that its operand
1569 is defined in the inner loop. */
1570 if (STMT_VINFO_RELEVANT_P (stmt_info))
1572 tree phi_op;
1573 gimple op_def_stmt;
1575 if (gimple_phi_num_args (phi) != 1)
1576 return false;
1578 phi_op = PHI_ARG_DEF (phi, 0);
1579 if (TREE_CODE (phi_op) != SSA_NAME)
1580 return false;
1582 op_def_stmt = SSA_NAME_DEF_STMT (phi_op);
1583 if (gimple_nop_p (op_def_stmt)
1584 || !flow_bb_inside_loop_p (loop, gimple_bb (op_def_stmt))
1585 || !vinfo_for_stmt (op_def_stmt))
1586 return false;
1588 if (STMT_VINFO_RELEVANT (vinfo_for_stmt (op_def_stmt))
1589 != vect_used_in_outer
1590 && STMT_VINFO_RELEVANT (vinfo_for_stmt (op_def_stmt))
1591 != vect_used_in_outer_by_reduction)
1592 return false;
1595 continue;
1598 gcc_assert (stmt_info);
1600 if (STMT_VINFO_LIVE_P (stmt_info))
1602 /* FORNOW: not yet supported. */
1603 if (dump_enabled_p ())
1604 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1605 "not vectorized: value used after loop.\n");
1606 return false;
1609 if (STMT_VINFO_RELEVANT (stmt_info) == vect_used_in_scope
1610 && STMT_VINFO_DEF_TYPE (stmt_info) != vect_induction_def)
1612 /* A scalar-dependence cycle that we don't support. */
1613 if (dump_enabled_p ())
1614 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1615 "not vectorized: scalar dependence cycle.\n");
1616 return false;
1619 if (STMT_VINFO_RELEVANT_P (stmt_info))
1621 need_to_vectorize = true;
1622 if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_induction_def)
1623 ok = vectorizable_induction (phi, NULL, NULL);
1626 if (!ok)
1628 if (dump_enabled_p ())
1630 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1631 "not vectorized: relevant phi not "
1632 "supported: ");
1633 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, phi, 0);
1634 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
1636 return false;
1640 for (gimple_stmt_iterator si = gsi_start_bb (bb); !gsi_end_p (si);
1641 gsi_next (&si))
1643 gimple stmt = gsi_stmt (si);
1644 if (!gimple_clobber_p (stmt)
1645 && !vect_analyze_stmt (stmt, &need_to_vectorize, NULL))
1646 return false;
1648 } /* bbs */
1650 /* All operations in the loop are either irrelevant (deal with loop
1651 control, or dead), or only used outside the loop and can be moved
1652 out of the loop (e.g. invariants, inductions). The loop can be
1653 optimized away by scalar optimizations. We're better off not
1654 touching this loop. */
1655 if (!need_to_vectorize)
1657 if (dump_enabled_p ())
1658 dump_printf_loc (MSG_NOTE, vect_location,
1659 "All the computation can be taken out of the loop.\n");
1660 if (dump_enabled_p ())
1661 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1662 "not vectorized: redundant loop. no profit to "
1663 "vectorize.\n");
1664 return false;
1667 vectorization_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
1668 gcc_assert (vectorization_factor != 0);
1670 if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo) && dump_enabled_p ())
1671 dump_printf_loc (MSG_NOTE, vect_location,
1672 "vectorization_factor = %d, niters = "
1673 HOST_WIDE_INT_PRINT_DEC "\n", vectorization_factor,
1674 LOOP_VINFO_INT_NITERS (loop_vinfo));
1676 if ((LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
1677 && (LOOP_VINFO_INT_NITERS (loop_vinfo) < vectorization_factor))
1678 || ((max_niter = max_stmt_executions_int (loop)) != -1
1679 && (unsigned HOST_WIDE_INT) max_niter < vectorization_factor))
1681 if (dump_enabled_p ())
1682 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1683 "not vectorized: iteration count too small.\n");
1684 if (dump_enabled_p ())
1685 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1686 "not vectorized: iteration count smaller than "
1687 "vectorization factor.\n");
1688 return false;
1691 /* Analyze cost. Decide if worth while to vectorize. */
1693 vect_estimate_min_profitable_iters (loop_vinfo, &min_profitable_iters,
1694 &min_profitable_estimate);
1695 LOOP_VINFO_COST_MODEL_MIN_ITERS (loop_vinfo) = min_profitable_iters;
1697 if (min_profitable_iters < 0)
1699 if (dump_enabled_p ())
1700 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1701 "not vectorized: vectorization not profitable.\n");
1702 if (dump_enabled_p ())
1703 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1704 "not vectorized: vector version will never be "
1705 "profitable.\n");
1706 return false;
1709 min_scalar_loop_bound = ((PARAM_VALUE (PARAM_MIN_VECT_LOOP_BOUND)
1710 * vectorization_factor) - 1);
1713 /* Use the cost model only if it is more conservative than user specified
1714 threshold. */
1716 th = (unsigned) min_scalar_loop_bound;
1717 if (min_profitable_iters
1718 && (!min_scalar_loop_bound
1719 || min_profitable_iters > min_scalar_loop_bound))
1720 th = (unsigned) min_profitable_iters;
1722 LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo) = th;
1724 if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
1725 && LOOP_VINFO_INT_NITERS (loop_vinfo) <= th)
1727 if (dump_enabled_p ())
1728 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1729 "not vectorized: vectorization not profitable.\n");
1730 if (dump_enabled_p ())
1731 dump_printf_loc (MSG_NOTE, vect_location,
1732 "not vectorized: iteration count smaller than user "
1733 "specified loop bound parameter or minimum profitable "
1734 "iterations (whichever is more conservative).\n");
1735 return false;
1738 if ((estimated_niter = estimated_stmt_executions_int (loop)) != -1
1739 && ((unsigned HOST_WIDE_INT) estimated_niter
1740 <= MAX (th, (unsigned)min_profitable_estimate)))
1742 if (dump_enabled_p ())
1743 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1744 "not vectorized: estimated iteration count too "
1745 "small.\n");
1746 if (dump_enabled_p ())
1747 dump_printf_loc (MSG_NOTE, vect_location,
1748 "not vectorized: estimated iteration count smaller "
1749 "than specified loop bound parameter or minimum "
1750 "profitable iterations (whichever is more "
1751 "conservative).\n");
1752 return false;
1755 return true;
/* Function vect_analyze_loop_2.

   Apply a set of analyses on LOOP, and create a loop_vec_info struct
   for it.  The different analyses will record information in the
   loop_vec_info struct.  */
1764 static bool
1765 vect_analyze_loop_2 (loop_vec_info loop_vinfo)
1767 bool ok;
1768 int max_vf = MAX_VECTORIZATION_FACTOR;
1769 int min_vf = 2;
1770 unsigned int th;
1771 unsigned int n_stmts = 0;
  /* Find all data references in the loop (which correspond to vdefs/vuses)
     and analyze their evolution in the loop.  Also adjust the minimal
     vectorization factor according to the loads and stores.

     FORNOW: Handle only simple array references, whose alignment can be
     forced, and aligned pointer-references.  */
1780 ok = vect_analyze_data_refs (loop_vinfo, NULL, &min_vf, &n_stmts);
1781 if (!ok)
1783 if (dump_enabled_p ())
1784 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1785 "bad data references.\n");
1786 return false;
1789 /* Classify all cross-iteration scalar data-flow cycles.
1790 Cross-iteration cycles caused by virtual phis are analyzed separately. */
1792 vect_analyze_scalar_cycles (loop_vinfo);
1794 vect_pattern_recog (loop_vinfo, NULL);
1796 vect_fixup_scalar_cycles_with_patterns (loop_vinfo);
1798 /* Analyze the access patterns of the data-refs in the loop (consecutive,
1799 complex, etc.). FORNOW: Only handle consecutive access pattern. */
1801 ok = vect_analyze_data_ref_accesses (loop_vinfo, NULL);
1802 if (!ok)
1804 if (dump_enabled_p ())
1805 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1806 "bad data access.\n");
1807 return false;
1810 /* Data-flow analysis to detect stmts that do not need to be vectorized. */
1812 ok = vect_mark_stmts_to_be_vectorized (loop_vinfo);
1813 if (!ok)
1815 if (dump_enabled_p ())
1816 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1817 "unexpected pattern.\n");
1818 return false;
1821 /* Analyze data dependences between the data-refs in the loop
1822 and adjust the maximum vectorization factor according to
1823 the dependences.
1824 FORNOW: fail at the first data dependence that we encounter. */
1826 ok = vect_analyze_data_ref_dependences (loop_vinfo, &max_vf);
1827 if (!ok
1828 || max_vf < min_vf)
1830 if (dump_enabled_p ())
1831 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1832 "bad data dependence.\n");
1833 return false;
1836 ok = vect_determine_vectorization_factor (loop_vinfo);
1837 if (!ok)
1839 if (dump_enabled_p ())
1840 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1841 "can't determine vectorization factor.\n");
1842 return false;
1844 if (max_vf < LOOP_VINFO_VECT_FACTOR (loop_vinfo))
1846 if (dump_enabled_p ())
1847 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1848 "bad data dependence.\n");
1849 return false;
1852 /* Check the SLP opportunities in the loop, analyze and build SLP trees. */
1853 ok = vect_analyze_slp (loop_vinfo, NULL, n_stmts);
1854 if (!ok)
1855 return false;
1857 /* If there are any SLP instances mark them as pure_slp. */
1858 bool slp = vect_make_slp_decision (loop_vinfo);
1859 if (slp)
1861 /* Find stmts that need to be both vectorized and SLPed. */
1862 vect_detect_hybrid_slp (loop_vinfo);
1864 /* Update the vectorization factor based on the SLP decision. */
1865 vect_update_vf_for_slp (loop_vinfo);
1868 /* Analyze the alignment of the data-refs in the loop.
1869 Fail if a data reference is found that cannot be vectorized. */
1871 ok = vect_analyze_data_refs_alignment (loop_vinfo, NULL);
1872 if (!ok)
1874 if (dump_enabled_p ())
1875 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1876 "bad data alignment.\n");
1877 return false;
1880 /* Prune the list of ddrs to be tested at run-time by versioning for alias.
1881 It is important to call pruning after vect_analyze_data_ref_accesses,
1882 since we use grouping information gathered by interleaving analysis. */
1883 ok = vect_prune_runtime_alias_test_list (loop_vinfo);
1884 if (!ok)
1886 if (dump_enabled_p ())
1887 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1888 "number of versioning for alias "
1889 "run-time tests exceeds %d "
1890 "(--param vect-max-version-for-alias-checks)\n",
1891 PARAM_VALUE (PARAM_VECT_MAX_VERSION_FOR_ALIAS_CHECKS));
1892 return false;
1895 /* Compute the scalar iteration cost. */
1896 vect_get_single_scalar_iteration_cost (loop_vinfo);
1898 /* This pass will decide on using loop versioning and/or loop peeling in
1899 order to enhance the alignment of data references in the loop. */
1901 ok = vect_enhance_data_refs_alignment (loop_vinfo);
1902 if (!ok)
1904 if (dump_enabled_p ())
1905 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1906 "bad data alignment.\n");
1907 return false;
1910 if (slp)
1912 /* Analyze operations in the SLP instances. Note this may
1913 remove unsupported SLP instances which makes the above
1914 SLP kind detection invalid. */
1915 unsigned old_size = LOOP_VINFO_SLP_INSTANCES (loop_vinfo).length ();
1916 vect_slp_analyze_operations (LOOP_VINFO_SLP_INSTANCES (loop_vinfo),
1917 LOOP_VINFO_TARGET_COST_DATA (loop_vinfo));
1918 if (LOOP_VINFO_SLP_INSTANCES (loop_vinfo).length () != old_size)
1919 return false;
1922 /* Scan all the remaining operations in the loop that are not subject
1923 to SLP and make sure they are vectorizable. */
1924 ok = vect_analyze_loop_operations (loop_vinfo);
1925 if (!ok)
1927 if (dump_enabled_p ())
1928 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1929 "bad operation or unsupported loop bound.\n");
1930 return false;
1933 /* Decide whether we need to create an epilogue loop to handle
1934 remaining scalar iterations. */
1935 th = ((LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo) + 1)
1936 / LOOP_VINFO_VECT_FACTOR (loop_vinfo))
1937 * LOOP_VINFO_VECT_FACTOR (loop_vinfo);
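/* E.g. a cost model threshold of 13 together with a vectorization factor
   of 4 gives th == ((13 + 1) / 4) * 4 == 12; th is later compared against
   the maximum number of statement executions in the versioning check
   below.  */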
1939 if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
1940 && LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) > 0)
1942 if (ctz_hwi (LOOP_VINFO_INT_NITERS (loop_vinfo)
1943 - LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo))
1944 < exact_log2 (LOOP_VINFO_VECT_FACTOR (loop_vinfo)))
1945 LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo) = true;
1947 else if (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo)
1948 || (tree_ctz (LOOP_VINFO_NITERS (loop_vinfo))
1949 < (unsigned)exact_log2 (LOOP_VINFO_VECT_FACTOR (loop_vinfo))
1950 /* In case of versioning, check if the maximum number of
1951 iterations is greater than th. If they are identical,
1952 the epilogue is unnecessary. */
1953 && ((!LOOP_REQUIRES_VERSIONING_FOR_ALIAS (loop_vinfo)
1954 && !LOOP_REQUIRES_VERSIONING_FOR_ALIGNMENT (loop_vinfo))
1955 || (unsigned HOST_WIDE_INT)max_stmt_executions_int
1956 (LOOP_VINFO_LOOP (loop_vinfo)) > th)))
1957 LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo) = true;
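/* E.g. with 100 known iterations, 3 prologue iterations peeled for
   alignment and a vectorization factor of 8, ctz_hwi (97) == 0 is smaller
   than exact_log2 (8) == 3, so an epilogue loop is needed; with 4 peeled
   iterations the remaining 96 divide evenly by 8 and no epilogue is
   required.  */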
1959 /* If an epilogue loop is required make sure we can create one. */
1960 if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo)
1961 || LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo))
1963 if (dump_enabled_p ())
1964 dump_printf_loc (MSG_NOTE, vect_location, "epilog loop required\n");
1965 if (!vect_can_advance_ivs_p (loop_vinfo)
1966 || !slpeel_can_duplicate_loop_p (LOOP_VINFO_LOOP (loop_vinfo),
1967 single_exit (LOOP_VINFO_LOOP
1968 (loop_vinfo))))
1970 if (dump_enabled_p ())
1971 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1972 "not vectorized: can't create required "
1973 "epilog loop\n");
1974 return false;
1978 return true;
1981 /* Function vect_analyze_loop.
1983 Apply a set of analyses on LOOP, and create a loop_vec_info struct
1984 for it. The different analyses will record information in the
1985 loop_vec_info struct. */
1986 loop_vec_info
1987 vect_analyze_loop (struct loop *loop)
1989 loop_vec_info loop_vinfo;
1990 unsigned int vector_sizes;
1992 /* Autodetect first vector size we try. */
1993 current_vector_size = 0;
1994 vector_sizes = targetm.vectorize.autovectorize_vector_sizes ();
1996 if (dump_enabled_p ())
1997 dump_printf_loc (MSG_NOTE, vect_location,
1998 "===== analyze_loop_nest =====\n");
2000 if (loop_outer (loop)
2001 && loop_vec_info_for_loop (loop_outer (loop))
2002 && LOOP_VINFO_VECTORIZABLE_P (loop_vec_info_for_loop (loop_outer (loop))))
2004 if (dump_enabled_p ())
2005 dump_printf_loc (MSG_NOTE, vect_location,
2006 "outer-loop already vectorized.\n");
2007 return NULL;
2010 while (1)
2012 /* Check the CFG characteristics of the loop (nesting, entry/exit). */
2013 loop_vinfo = vect_analyze_loop_form (loop);
2014 if (!loop_vinfo)
2016 if (dump_enabled_p ())
2017 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2018 "bad loop form.\n");
2019 return NULL;
2022 if (vect_analyze_loop_2 (loop_vinfo))
2024 LOOP_VINFO_VECTORIZABLE_P (loop_vinfo) = 1;
2026 return loop_vinfo;
2029 destroy_loop_vec_info (loop_vinfo, true);
2031 vector_sizes &= ~current_vector_size;
2032 if (vector_sizes == 0
2033 || current_vector_size == 0)
2034 return NULL;
2036 /* Try the next biggest vector size. */
2037 current_vector_size = 1 << floor_log2 (vector_sizes);
2038 if (dump_enabled_p ())
2039 dump_printf_loc (MSG_NOTE, vect_location,
2040 "***** Re-trying analysis with "
2041 "vector size %d\n", current_vector_size);
2046 /* Function reduction_code_for_scalar_code
2048 Input:
2049 CODE - tree_code of a reduction operation.
2051 Output:
2052 REDUC_CODE - the corresponding tree-code to be used to reduce the
2053 vector of partial results into a single scalar result, or ERROR_MARK
2054 if the operation is a supported reduction operation, but does not have
2055 such a tree-code.
2057 Return FALSE if CODE currently cannot be vectorized as a reduction. */
2059 static bool
2060 reduction_code_for_scalar_code (enum tree_code code,
2061 enum tree_code *reduc_code)
2063 switch (code)
2065 case MAX_EXPR:
2066 *reduc_code = REDUC_MAX_EXPR;
2067 return true;
2069 case MIN_EXPR:
2070 *reduc_code = REDUC_MIN_EXPR;
2071 return true;
2073 case PLUS_EXPR:
2074 *reduc_code = REDUC_PLUS_EXPR;
2075 return true;
2077 case MULT_EXPR:
2078 case MINUS_EXPR:
2079 case BIT_IOR_EXPR:
2080 case BIT_XOR_EXPR:
2081 case BIT_AND_EXPR:
2082 *reduc_code = ERROR_MARK;
2083 return true;
2085 default:
2086 return false;
2091 /* Error reporting helper for vect_is_simple_reduction below. GIMPLE statement
2092 STMT is printed with a message MSG. */
2094 static void
2095 report_vect_op (int msg_type, gimple stmt, const char *msg)
2097 dump_printf_loc (msg_type, vect_location, "%s", msg);
2098 dump_gimple_stmt (msg_type, TDF_SLIM, stmt, 0);
2099 dump_printf (msg_type, "\n");
2103 /* Detect SLP reduction of the form:
2105 #a1 = phi <a5, a0>
2106 a2 = operation (a1)
2107 a3 = operation (a2)
2108 a4 = operation (a3)
2109 a5 = operation (a4)
2111 #a = phi <a5>
2113 PHI is the reduction phi node (#a1 = phi <a5, a0> above)
2114 FIRST_STMT is the first reduction stmt in the chain
2115 (a2 = operation (a1)).
2117 Return TRUE if a reduction chain was detected. */
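/* Illustrative source-level example (not part of the original comment):
   a manually unrolled summation such as

     s = s + a[4*i];
     s = s + a[4*i + 1];
     s = s + a[4*i + 2];
     s = s + a[4*i + 3];

   inside the loop has exactly this shape: the loop-header PHI for s is
   the reduction phi and each addition is one "operation" statement of
   the chain.  */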
2119 static bool
2120 vect_is_slp_reduction (loop_vec_info loop_info, gimple phi, gimple first_stmt)
2122 struct loop *loop = (gimple_bb (phi))->loop_father;
2123 struct loop *vect_loop = LOOP_VINFO_LOOP (loop_info);
2124 enum tree_code code;
2125 gimple current_stmt = NULL, loop_use_stmt = NULL, first, next_stmt;
2126 stmt_vec_info use_stmt_info, current_stmt_info;
2127 tree lhs;
2128 imm_use_iterator imm_iter;
2129 use_operand_p use_p;
2130 int nloop_uses, size = 0, n_out_of_loop_uses;
2131 bool found = false;
2133 if (loop != vect_loop)
2134 return false;
2136 lhs = PHI_RESULT (phi);
2137 code = gimple_assign_rhs_code (first_stmt);
2138 while (1)
2140 nloop_uses = 0;
2141 n_out_of_loop_uses = 0;
2142 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, lhs)
2144 gimple use_stmt = USE_STMT (use_p);
2145 if (is_gimple_debug (use_stmt))
2146 continue;
2148 /* Check if we got back to the reduction phi. */
2149 if (use_stmt == phi)
2151 loop_use_stmt = use_stmt;
2152 found = true;
2153 break;
2156 if (flow_bb_inside_loop_p (loop, gimple_bb (use_stmt)))
2158 loop_use_stmt = use_stmt;
2159 nloop_uses++;
2161 else
2162 n_out_of_loop_uses++;
2164 /* There can be either a single use in the loop or two uses in
2165 phi nodes. */
2166 if (nloop_uses > 1 || (n_out_of_loop_uses && nloop_uses))
2167 return false;
2170 if (found)
2171 break;
2173 /* We reached a statement with no loop uses. */
2174 if (nloop_uses == 0)
2175 return false;
2177 /* This is a loop exit phi, and we haven't reached the reduction phi. */
2178 if (gimple_code (loop_use_stmt) == GIMPLE_PHI)
2179 return false;
2181 if (!is_gimple_assign (loop_use_stmt)
2182 || code != gimple_assign_rhs_code (loop_use_stmt)
2183 || !flow_bb_inside_loop_p (loop, gimple_bb (loop_use_stmt)))
2184 return false;
2186 /* Insert USE_STMT into reduction chain. */
2187 use_stmt_info = vinfo_for_stmt (loop_use_stmt);
2188 if (current_stmt)
2190 current_stmt_info = vinfo_for_stmt (current_stmt);
2191 GROUP_NEXT_ELEMENT (current_stmt_info) = loop_use_stmt;
2192 GROUP_FIRST_ELEMENT (use_stmt_info)
2193 = GROUP_FIRST_ELEMENT (current_stmt_info);
2195 else
2196 GROUP_FIRST_ELEMENT (use_stmt_info) = loop_use_stmt;
2198 lhs = gimple_assign_lhs (loop_use_stmt);
2199 current_stmt = loop_use_stmt;
2200 size++;
2203 if (!found || loop_use_stmt != phi || size < 2)
2204 return false;
2206 /* Swap the operands, if needed, to make the reduction operand be the second
2207 operand. */
2208 lhs = PHI_RESULT (phi);
2209 next_stmt = GROUP_FIRST_ELEMENT (vinfo_for_stmt (current_stmt));
2210 while (next_stmt)
2212 if (gimple_assign_rhs2 (next_stmt) == lhs)
2214 tree op = gimple_assign_rhs1 (next_stmt);
2215 gimple def_stmt = NULL;
2217 if (TREE_CODE (op) == SSA_NAME)
2218 def_stmt = SSA_NAME_DEF_STMT (op);
2220 /* Check that the other def is either defined in the loop
2221 ("vect_internal_def"), or it's an induction (defined by a
2222 loop-header phi-node). */
2223 if (def_stmt
2224 && gimple_bb (def_stmt)
2225 && flow_bb_inside_loop_p (loop, gimple_bb (def_stmt))
2226 && (is_gimple_assign (def_stmt)
2227 || is_gimple_call (def_stmt)
2228 || STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def_stmt))
2229 == vect_induction_def
2230 || (gimple_code (def_stmt) == GIMPLE_PHI
2231 && STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def_stmt))
2232 == vect_internal_def
2233 && !is_loop_header_bb_p (gimple_bb (def_stmt)))))
2235 lhs = gimple_assign_lhs (next_stmt);
2236 next_stmt = GROUP_NEXT_ELEMENT (vinfo_for_stmt (next_stmt));
2237 continue;
2240 return false;
2242 else
2244 tree op = gimple_assign_rhs2 (next_stmt);
2245 gimple def_stmt = NULL;
2247 if (TREE_CODE (op) == SSA_NAME)
2248 def_stmt = SSA_NAME_DEF_STMT (op);
2250 /* Check that the other def is either defined in the loop
2251 ("vect_internal_def"), or it's an induction (defined by a
2252 loop-header phi-node). */
2253 if (def_stmt
2254 && gimple_bb (def_stmt)
2255 && flow_bb_inside_loop_p (loop, gimple_bb (def_stmt))
2256 && (is_gimple_assign (def_stmt)
2257 || is_gimple_call (def_stmt)
2258 || STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def_stmt))
2259 == vect_induction_def
2260 || (gimple_code (def_stmt) == GIMPLE_PHI
2261 && STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def_stmt))
2262 == vect_internal_def
2263 && !is_loop_header_bb_p (gimple_bb (def_stmt)))))
2265 if (dump_enabled_p ())
2267 dump_printf_loc (MSG_NOTE, vect_location, "swapping oprnds: ");
2268 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, next_stmt, 0);
2269 dump_printf (MSG_NOTE, "\n");
2272 swap_ssa_operands (next_stmt,
2273 gimple_assign_rhs1_ptr (next_stmt),
2274 gimple_assign_rhs2_ptr (next_stmt));
2275 update_stmt (next_stmt);
2277 if (CONSTANT_CLASS_P (gimple_assign_rhs1 (next_stmt)))
2278 LOOP_VINFO_OPERANDS_SWAPPED (loop_info) = true;
2280 else
2281 return false;
2284 lhs = gimple_assign_lhs (next_stmt);
2285 next_stmt = GROUP_NEXT_ELEMENT (vinfo_for_stmt (next_stmt));
2288 /* Save the chain for further analysis in SLP detection. */
2289 first = GROUP_FIRST_ELEMENT (vinfo_for_stmt (current_stmt));
2290 LOOP_VINFO_REDUCTION_CHAINS (loop_info).safe_push (first);
2291 GROUP_SIZE (vinfo_for_stmt (first)) = size;
2293 return true;
2297 /* Function vect_is_simple_reduction_1
2299 (1) Detect a cross-iteration def-use cycle that represents a simple
2300 reduction computation. We look for the following pattern:
2302 loop_header:
2303 a1 = phi < a0, a2 >
2304 a3 = ...
2305 a2 = operation (a3, a1)
2307 or
2309 a3 = ...
2310 loop_header:
2311 a1 = phi < a0, a2 >
2312 a2 = operation (a3, a1)
2314 such that:
2315 1. operation is commutative and associative and it is safe to
2316 change the order of the computation (if CHECK_REDUCTION is true)
2317 2. no uses for a2 in the loop (a2 is used out of the loop)
2318 3. no uses of a1 in the loop besides the reduction operation
2319 4. no uses of a1 outside the loop.
2321 Conditions 1,4 are tested here.
2322 Conditions 2,3 are tested in vect_mark_stmts_to_be_vectorized.
2324 (2) Detect a cross-iteration def-use cycle in nested loops, i.e.,
2325 nested cycles, if CHECK_REDUCTION is false.
2327 (3) Detect cycles of phi nodes in outer-loop vectorization, i.e., double
2328 reductions:
2330 a1 = phi < a0, a2 >
2331 inner loop (def of a3)
2332 a2 = phi < a3 >
2334 If MODIFY is true it also tries to rework the code in-place to enable
2335 detection of more reduction patterns. For the time being we rewrite
2336 "res -= RHS" into "res += -RHS" when it seems worthwhile. */
2339 static gimple
2340 vect_is_simple_reduction_1 (loop_vec_info loop_info, gimple phi,
2341 bool check_reduction, bool *double_reduc,
2342 bool modify)
2344 struct loop *loop = (gimple_bb (phi))->loop_father;
2345 struct loop *vect_loop = LOOP_VINFO_LOOP (loop_info);
2346 edge latch_e = loop_latch_edge (loop);
2347 tree loop_arg = PHI_ARG_DEF_FROM_EDGE (phi, latch_e);
2348 gimple def_stmt, def1 = NULL, def2 = NULL;
2349 enum tree_code orig_code, code;
2350 tree op1, op2, op3 = NULL_TREE, op4 = NULL_TREE;
2351 tree type;
2352 int nloop_uses;
2353 tree name;
2354 imm_use_iterator imm_iter;
2355 use_operand_p use_p;
2356 bool phi_def;
2358 *double_reduc = false;
2360 /* If CHECK_REDUCTION is true, we assume inner-most loop vectorization,
2361 otherwise, we assume outer loop vectorization. */
2362 gcc_assert ((check_reduction && loop == vect_loop)
2363 || (!check_reduction && flow_loop_nested_p (vect_loop, loop)));
2365 name = PHI_RESULT (phi);
2366 /* ??? If there are no uses of the PHI result the inner loop reduction
2367 won't be detected as possibly double-reduction by vectorizable_reduction
2368 because that tries to walk the PHI arg from the preheader edge which
2369 can be constant. See PR60382. */
2370 if (has_zero_uses (name))
2371 return NULL;
2372 nloop_uses = 0;
2373 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, name)
2375 gimple use_stmt = USE_STMT (use_p);
2376 if (is_gimple_debug (use_stmt))
2377 continue;
2379 if (!flow_bb_inside_loop_p (loop, gimple_bb (use_stmt)))
2381 if (dump_enabled_p ())
2382 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2383 "intermediate value used outside loop.\n");
2385 return NULL;
2388 nloop_uses++;
2389 if (nloop_uses > 1)
2391 if (dump_enabled_p ())
2392 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2393 "reduction used in loop.\n");
2394 return NULL;
2398 if (TREE_CODE (loop_arg) != SSA_NAME)
2400 if (dump_enabled_p ())
2402 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2403 "reduction: not ssa_name: ");
2404 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, loop_arg);
2405 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
2407 return NULL;
2410 def_stmt = SSA_NAME_DEF_STMT (loop_arg);
2411 if (!def_stmt)
2413 if (dump_enabled_p ())
2414 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2415 "reduction: no def_stmt.\n");
2416 return NULL;
2419 if (!is_gimple_assign (def_stmt) && gimple_code (def_stmt) != GIMPLE_PHI)
2421 if (dump_enabled_p ())
2423 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, def_stmt, 0);
2424 dump_printf (MSG_NOTE, "\n");
2426 return NULL;
2429 if (is_gimple_assign (def_stmt))
2431 name = gimple_assign_lhs (def_stmt);
2432 phi_def = false;
2434 else
2436 name = PHI_RESULT (def_stmt);
2437 phi_def = true;
2440 nloop_uses = 0;
2441 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, name)
2443 gimple use_stmt = USE_STMT (use_p);
2444 if (is_gimple_debug (use_stmt))
2445 continue;
2446 if (flow_bb_inside_loop_p (loop, gimple_bb (use_stmt)))
2447 nloop_uses++;
2448 if (nloop_uses > 1)
2450 if (dump_enabled_p ())
2451 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2452 "reduction used in loop.\n");
2453 return NULL;
2457 /* If DEF_STMT is a phi node itself, we expect it to have a single argument
2458 defined in the inner loop. */
2459 if (phi_def)
2461 op1 = PHI_ARG_DEF (def_stmt, 0);
2463 if (gimple_phi_num_args (def_stmt) != 1
2464 || TREE_CODE (op1) != SSA_NAME)
2466 if (dump_enabled_p ())
2467 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2468 "unsupported phi node definition.\n");
2470 return NULL;
2473 def1 = SSA_NAME_DEF_STMT (op1);
2474 if (gimple_bb (def1)
2475 && flow_bb_inside_loop_p (loop, gimple_bb (def_stmt))
2476 && loop->inner
2477 && flow_bb_inside_loop_p (loop->inner, gimple_bb (def1))
2478 && is_gimple_assign (def1))
2480 if (dump_enabled_p ())
2481 report_vect_op (MSG_NOTE, def_stmt,
2482 "detected double reduction: ");
2484 *double_reduc = true;
2485 return def_stmt;
2488 return NULL;
2491 code = orig_code = gimple_assign_rhs_code (def_stmt);
2493 /* We can handle "res -= x[i]", which is non-associative by
2494 simply rewriting this into "res += -x[i]". Avoid changing
2495 gimple instruction for the first simple tests and only do this
2496 if we're allowed to change code at all. */
2497 if (code == MINUS_EXPR
2498 && modify
2499 && (op1 = gimple_assign_rhs1 (def_stmt))
2500 && TREE_CODE (op1) == SSA_NAME
2501 && SSA_NAME_DEF_STMT (op1) == phi)
2502 code = PLUS_EXPR;
2504 if (check_reduction
2505 && (!commutative_tree_code (code) || !associative_tree_code (code)))
2507 if (dump_enabled_p ())
2508 report_vect_op (MSG_MISSED_OPTIMIZATION, def_stmt,
2509 "reduction: not commutative/associative: ");
2510 return NULL;
2513 if (get_gimple_rhs_class (code) != GIMPLE_BINARY_RHS)
2515 if (code != COND_EXPR)
2517 if (dump_enabled_p ())
2518 report_vect_op (MSG_MISSED_OPTIMIZATION, def_stmt,
2519 "reduction: not binary operation: ");
2521 return NULL;
2524 op3 = gimple_assign_rhs1 (def_stmt);
2525 if (COMPARISON_CLASS_P (op3))
2527 op4 = TREE_OPERAND (op3, 1);
2528 op3 = TREE_OPERAND (op3, 0);
2531 op1 = gimple_assign_rhs2 (def_stmt);
2532 op2 = gimple_assign_rhs3 (def_stmt);
2534 if (TREE_CODE (op1) != SSA_NAME && TREE_CODE (op2) != SSA_NAME)
2536 if (dump_enabled_p ())
2537 report_vect_op (MSG_MISSED_OPTIMIZATION, def_stmt,
2538 "reduction: uses not ssa_names: ");
2540 return NULL;
2543 else
2545 op1 = gimple_assign_rhs1 (def_stmt);
2546 op2 = gimple_assign_rhs2 (def_stmt);
2548 if (TREE_CODE (op1) != SSA_NAME && TREE_CODE (op2) != SSA_NAME)
2550 if (dump_enabled_p ())
2551 report_vect_op (MSG_MISSED_OPTIMIZATION, def_stmt,
2552 "reduction: uses not ssa_names: ");
2554 return NULL;
2558 type = TREE_TYPE (gimple_assign_lhs (def_stmt));
2559 if ((TREE_CODE (op1) == SSA_NAME
2560 && !types_compatible_p (type,TREE_TYPE (op1)))
2561 || (TREE_CODE (op2) == SSA_NAME
2562 && !types_compatible_p (type, TREE_TYPE (op2)))
2563 || (op3 && TREE_CODE (op3) == SSA_NAME
2564 && !types_compatible_p (type, TREE_TYPE (op3)))
2565 || (op4 && TREE_CODE (op4) == SSA_NAME
2566 && !types_compatible_p (type, TREE_TYPE (op4))))
2568 if (dump_enabled_p ())
2570 dump_printf_loc (MSG_NOTE, vect_location,
2571 "reduction: multiple types: operation type: ");
2572 dump_generic_expr (MSG_NOTE, TDF_SLIM, type);
2573 dump_printf (MSG_NOTE, ", operands types: ");
2574 dump_generic_expr (MSG_NOTE, TDF_SLIM,
2575 TREE_TYPE (op1));
2576 dump_printf (MSG_NOTE, ",");
2577 dump_generic_expr (MSG_NOTE, TDF_SLIM,
2578 TREE_TYPE (op2));
2579 if (op3)
2581 dump_printf (MSG_NOTE, ",");
2582 dump_generic_expr (MSG_NOTE, TDF_SLIM,
2583 TREE_TYPE (op3));
2586 if (op4)
2588 dump_printf (MSG_NOTE, ",");
2589 dump_generic_expr (MSG_NOTE, TDF_SLIM,
2590 TREE_TYPE (op4));
2592 dump_printf (MSG_NOTE, "\n");
2595 return NULL;
2598 /* Check that it's ok to change the order of the computation.
2599 Generally, when vectorizing a reduction we change the order of the
2600 computation. This may change the behavior of the program in some
2601 cases, so we need to check that this is ok. One exception is when
2602 vectorizing an outer-loop: the inner-loop is executed sequentially,
2603 and therefore vectorizing reductions in the inner-loop during
2604 outer-loop vectorization is safe. */
2606 /* CHECKME: check for !flag_finite_math_only too? */
2607 if (SCALAR_FLOAT_TYPE_P (type) && !flag_associative_math
2608 && check_reduction)
2610 /* Changing the order of operations changes the semantics. */
2611 if (dump_enabled_p ())
2612 report_vect_op (MSG_MISSED_OPTIMIZATION, def_stmt,
2613 "reduction: unsafe fp math optimization: ");
2614 return NULL;
2616 else if (INTEGRAL_TYPE_P (type) && TYPE_OVERFLOW_TRAPS (type)
2617 && check_reduction)
2619 /* Changing the order of operations changes the semantics. */
2620 if (dump_enabled_p ())
2621 report_vect_op (MSG_MISSED_OPTIMIZATION, def_stmt,
2622 "reduction: unsafe int math optimization: ");
2623 return NULL;
2625 else if (SAT_FIXED_POINT_TYPE_P (type) && check_reduction)
2627 /* Changing the order of operations changes the semantics. */
2628 if (dump_enabled_p ())
2629 report_vect_op (MSG_MISSED_OPTIMIZATION, def_stmt,
2630 "reduction: unsafe fixed-point math optimization: ");
2631 return NULL;
2634 /* If we detected "res -= x[i]" earlier, rewrite it into
2635 "res += -x[i]" now. If this turns out to be useless reassoc
2636 will clean it up again. */
2637 if (orig_code == MINUS_EXPR)
2639 tree rhs = gimple_assign_rhs2 (def_stmt);
2640 tree negrhs = make_ssa_name (TREE_TYPE (rhs));
2641 gimple negate_stmt = gimple_build_assign (negrhs, NEGATE_EXPR, rhs);
2642 gimple_stmt_iterator gsi = gsi_for_stmt (def_stmt);
2643 set_vinfo_for_stmt (negate_stmt, new_stmt_vec_info (negate_stmt,
2644 loop_info, NULL));
2645 gsi_insert_before (&gsi, negate_stmt, GSI_NEW_STMT);
2646 gimple_assign_set_rhs2 (def_stmt, negrhs);
2647 gimple_assign_set_rhs_code (def_stmt, PLUS_EXPR);
2648 update_stmt (def_stmt);
2651 /* Reduction is safe. We're dealing with one of the following:
2652 1) integer arithmetic and no trapv
2653 2) floating point arithmetic, and special flags permit this optimization
2654 3) nested cycle (i.e., outer loop vectorization). */
2655 if (TREE_CODE (op1) == SSA_NAME)
2656 def1 = SSA_NAME_DEF_STMT (op1);
2658 if (TREE_CODE (op2) == SSA_NAME)
2659 def2 = SSA_NAME_DEF_STMT (op2);
2661 if (code != COND_EXPR
2662 && ((!def1 || gimple_nop_p (def1)) && (!def2 || gimple_nop_p (def2))))
2664 if (dump_enabled_p ())
2665 report_vect_op (MSG_NOTE, def_stmt, "reduction: no defs for operands: ");
2666 return NULL;
2669 /* Check that one def is the reduction def, defined by PHI,
2670 the other def is either defined in the loop ("vect_internal_def"),
2671 or it's an induction (defined by a loop-header phi-node). */
2673 if (def2 && def2 == phi
2674 && (code == COND_EXPR
2675 || !def1 || gimple_nop_p (def1)
2676 || !flow_bb_inside_loop_p (loop, gimple_bb (def1))
2677 || (def1 && flow_bb_inside_loop_p (loop, gimple_bb (def1))
2678 && (is_gimple_assign (def1)
2679 || is_gimple_call (def1)
2680 || STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def1))
2681 == vect_induction_def
2682 || (gimple_code (def1) == GIMPLE_PHI
2683 && STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def1))
2684 == vect_internal_def
2685 && !is_loop_header_bb_p (gimple_bb (def1)))))))
2687 if (dump_enabled_p ())
2688 report_vect_op (MSG_NOTE, def_stmt, "detected reduction: ");
2689 return def_stmt;
2692 if (def1 && def1 == phi
2693 && (code == COND_EXPR
2694 || !def2 || gimple_nop_p (def2)
2695 || !flow_bb_inside_loop_p (loop, gimple_bb (def2))
2696 || (def2 && flow_bb_inside_loop_p (loop, gimple_bb (def2))
2697 && (is_gimple_assign (def2)
2698 || is_gimple_call (def2)
2699 || STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def2))
2700 == vect_induction_def
2701 || (gimple_code (def2) == GIMPLE_PHI
2702 && STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def2))
2703 == vect_internal_def
2704 && !is_loop_header_bb_p (gimple_bb (def2)))))))
2706 if (check_reduction)
2708 /* Swap operands (just for simplicity - so that the rest of the code
2709 can assume that the reduction variable is always the last (second)
2710 argument). */
2711 if (dump_enabled_p ())
2712 report_vect_op (MSG_NOTE, def_stmt,
2713 "detected reduction: need to swap operands: ");
2715 swap_ssa_operands (def_stmt, gimple_assign_rhs1_ptr (def_stmt),
2716 gimple_assign_rhs2_ptr (def_stmt));
2718 if (CONSTANT_CLASS_P (gimple_assign_rhs1 (def_stmt)))
2719 LOOP_VINFO_OPERANDS_SWAPPED (loop_info) = true;
2721 else
2723 if (dump_enabled_p ())
2724 report_vect_op (MSG_NOTE, def_stmt, "detected reduction: ");
2727 return def_stmt;
2730 /* Try to find SLP reduction chain. */
2731 if (check_reduction && vect_is_slp_reduction (loop_info, phi, def_stmt))
2733 if (dump_enabled_p ())
2734 report_vect_op (MSG_NOTE, def_stmt,
2735 "reduction: detected reduction chain: ");
2737 return def_stmt;
2740 if (dump_enabled_p ())
2741 report_vect_op (MSG_MISSED_OPTIMIZATION, def_stmt,
2742 "reduction: unknown pattern: ");
2744 return NULL;
2747 /* Wrapper around vect_is_simple_reduction_1, that won't modify code
2748 in-place. Arguments as there. */
2750 static gimple
2751 vect_is_simple_reduction (loop_vec_info loop_info, gimple phi,
2752 bool check_reduction, bool *double_reduc)
2754 return vect_is_simple_reduction_1 (loop_info, phi, check_reduction,
2755 double_reduc, false);
2758 /* Wrapper around vect_is_simple_reduction_1, which will modify code
2759 in-place if it enables detection of more reductions. Arguments
2760 as there. */
2762 gimple
2763 vect_force_simple_reduction (loop_vec_info loop_info, gimple phi,
2764 bool check_reduction, bool *double_reduc)
2766 return vect_is_simple_reduction_1 (loop_info, phi, check_reduction,
2767 double_reduc, true);
2770 /* Calculate cost of peeling the loop PEEL_ITERS_PROLOGUE times. */
2771 int
2772 vect_get_known_peeling_cost (loop_vec_info loop_vinfo, int peel_iters_prologue,
2773 int *peel_iters_epilogue,
2774 stmt_vector_for_cost *scalar_cost_vec,
2775 stmt_vector_for_cost *prologue_cost_vec,
2776 stmt_vector_for_cost *epilogue_cost_vec)
2778 int retval = 0;
2779 int vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
2781 if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
2783 *peel_iters_epilogue = vf/2;
2784 if (dump_enabled_p ())
2785 dump_printf_loc (MSG_NOTE, vect_location,
2786 "cost model: epilogue peel iters set to vf/2 "
2787 "because loop iterations are unknown .\n");
2789 /* If peeled iterations are known but the number of scalar loop
2790 iterations is unknown, count a taken branch per peeled loop. */
2791 retval = record_stmt_cost (prologue_cost_vec, 1, cond_branch_taken,
2792 NULL, 0, vect_prologue);
2793 retval = record_stmt_cost (prologue_cost_vec, 1, cond_branch_taken,
2794 NULL, 0, vect_epilogue);
2796 else
2798 int niters = LOOP_VINFO_INT_NITERS (loop_vinfo);
2799 peel_iters_prologue = niters < peel_iters_prologue ?
2800 niters : peel_iters_prologue;
2801 *peel_iters_epilogue = (niters - peel_iters_prologue) % vf;
2802 /* If we need to peel for gaps, but no peeling is required, we have to
2803 peel VF iterations. */
2804 if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo) && !*peel_iters_epilogue)
2805 *peel_iters_epilogue = vf;
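/* E.g. niters == 23, peel_iters_prologue == 3 and vf == 4 give
   (23 - 3) % 4 == 0 epilogue iterations; if the loop additionally needs
   peeling for gaps, the zero is bumped to a full vf == 4 iterations.  */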
2808 stmt_info_for_cost *si;
2809 int j;
2810 if (peel_iters_prologue)
2811 FOR_EACH_VEC_ELT (*scalar_cost_vec, j, si)
2812 retval += record_stmt_cost (prologue_cost_vec,
2813 si->count * peel_iters_prologue,
2814 si->kind, NULL, si->misalign,
2815 vect_prologue);
2816 if (*peel_iters_epilogue)
2817 FOR_EACH_VEC_ELT (*scalar_cost_vec, j, si)
2818 retval += record_stmt_cost (epilogue_cost_vec,
2819 si->count * *peel_iters_epilogue,
2820 si->kind, NULL, si->misalign,
2821 vect_epilogue);
2823 return retval;
2826 /* Function vect_estimate_min_profitable_iters
2828 Return the number of iterations required for the vector version of the
2829 loop to be profitable relative to the cost of the scalar version of the
2830 loop. */
2832 static void
2833 vect_estimate_min_profitable_iters (loop_vec_info loop_vinfo,
2834 int *ret_min_profitable_niters,
2835 int *ret_min_profitable_estimate)
2837 int min_profitable_iters;
2838 int min_profitable_estimate;
2839 int peel_iters_prologue;
2840 int peel_iters_epilogue;
2841 unsigned vec_inside_cost = 0;
2842 int vec_outside_cost = 0;
2843 unsigned vec_prologue_cost = 0;
2844 unsigned vec_epilogue_cost = 0;
2845 int scalar_single_iter_cost = 0;
2846 int scalar_outside_cost = 0;
2847 int vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
2848 int npeel = LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo);
2849 void *target_cost_data = LOOP_VINFO_TARGET_COST_DATA (loop_vinfo);
2851 /* Cost model disabled. */
2852 if (unlimited_cost_model (LOOP_VINFO_LOOP (loop_vinfo)))
2854 dump_printf_loc (MSG_NOTE, vect_location, "cost model disabled.\n");
2855 *ret_min_profitable_niters = 0;
2856 *ret_min_profitable_estimate = 0;
2857 return;
2860 /* Requires loop versioning tests to handle misalignment. */
2861 if (LOOP_REQUIRES_VERSIONING_FOR_ALIGNMENT (loop_vinfo))
2863 /* FIXME: Make cost depend on complexity of individual check. */
2864 unsigned len = LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).length ();
2865 (void) add_stmt_cost (target_cost_data, len, vector_stmt, NULL, 0,
2866 vect_prologue);
2867 dump_printf (MSG_NOTE,
2868 "cost model: Adding cost of checks for loop "
2869 "versioning to treat misalignment.\n");
2872 /* Requires loop versioning with alias checks. */
2873 if (LOOP_REQUIRES_VERSIONING_FOR_ALIAS (loop_vinfo))
2875 /* FIXME: Make cost depend on complexity of individual check. */
2876 unsigned len = LOOP_VINFO_MAY_ALIAS_DDRS (loop_vinfo).length ();
2877 (void) add_stmt_cost (target_cost_data, len, vector_stmt, NULL, 0,
2878 vect_prologue);
2879 dump_printf (MSG_NOTE,
2880 "cost model: Adding cost of checks for loop "
2881 "versioning aliasing.\n");
2884 if (LOOP_REQUIRES_VERSIONING_FOR_ALIGNMENT (loop_vinfo)
2885 || LOOP_REQUIRES_VERSIONING_FOR_ALIAS (loop_vinfo))
2886 (void) add_stmt_cost (target_cost_data, 1, cond_branch_taken, NULL, 0,
2887 vect_prologue);
2889 /* Count statements in scalar loop. Using this as scalar cost for a single
2890 iteration for now.
2892 TODO: Add outer loop support.
2894 TODO: Consider assigning different costs to different scalar
2895 statements. */
2897 scalar_single_iter_cost
2898 = LOOP_VINFO_SINGLE_SCALAR_ITERATION_COST (loop_vinfo);
2900 /* Add additional cost for the peeled instructions in prologue and epilogue
2901 loop.
2903 FORNOW: If we don't know the value of peel_iters for prologue or epilogue
2904 at compile-time - we assume it's vf/2 (the worst would be vf-1).
2906 TODO: Build an expression that represents peel_iters for prologue and
2907 epilogue to be used in a run-time test. */
2909 if (npeel < 0)
2911 peel_iters_prologue = vf/2;
2912 dump_printf (MSG_NOTE, "cost model: "
2913 "prologue peel iters set to vf/2.\n");
2915 /* If peeling for alignment is unknown, loop bound of main loop becomes
2916 unknown. */
2917 peel_iters_epilogue = vf/2;
2918 dump_printf (MSG_NOTE, "cost model: "
2919 "epilogue peel iters set to vf/2 because "
2920 "peeling for alignment is unknown.\n");
2922 /* If peeled iterations are unknown, count a taken branch and a not taken
2923 branch per peeled loop. Even if scalar loop iterations are known,
2924 vector iterations are not known since peeled prologue iterations are
2925 not known. Hence guards remain the same. */
2926 (void) add_stmt_cost (target_cost_data, 1, cond_branch_taken,
2927 NULL, 0, vect_prologue);
2928 (void) add_stmt_cost (target_cost_data, 1, cond_branch_not_taken,
2929 NULL, 0, vect_prologue);
2930 (void) add_stmt_cost (target_cost_data, 1, cond_branch_taken,
2931 NULL, 0, vect_epilogue);
2932 (void) add_stmt_cost (target_cost_data, 1, cond_branch_not_taken,
2933 NULL, 0, vect_epilogue);
2934 stmt_info_for_cost *si;
2935 int j;
2936 FOR_EACH_VEC_ELT (LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo), j, si)
2938 struct _stmt_vec_info *stmt_info
2939 = si->stmt ? vinfo_for_stmt (si->stmt) : NULL;
2940 (void) add_stmt_cost (target_cost_data,
2941 si->count * peel_iters_prologue,
2942 si->kind, stmt_info, si->misalign,
2943 vect_prologue);
2944 (void) add_stmt_cost (target_cost_data,
2945 si->count * peel_iters_epilogue,
2946 si->kind, stmt_info, si->misalign,
2947 vect_epilogue);
2950 else
2952 stmt_vector_for_cost prologue_cost_vec, epilogue_cost_vec;
2953 stmt_info_for_cost *si;
2954 int j;
2955 void *data = LOOP_VINFO_TARGET_COST_DATA (loop_vinfo);
2957 prologue_cost_vec.create (2);
2958 epilogue_cost_vec.create (2);
2959 peel_iters_prologue = npeel;
2961 (void) vect_get_known_peeling_cost (loop_vinfo, peel_iters_prologue,
2962 &peel_iters_epilogue,
2963 &LOOP_VINFO_SCALAR_ITERATION_COST
2964 (loop_vinfo),
2965 &prologue_cost_vec,
2966 &epilogue_cost_vec);
2968 FOR_EACH_VEC_ELT (prologue_cost_vec, j, si)
2970 struct _stmt_vec_info *stmt_info
2971 = si->stmt ? vinfo_for_stmt (si->stmt) : NULL;
2972 (void) add_stmt_cost (data, si->count, si->kind, stmt_info,
2973 si->misalign, vect_prologue);
2976 FOR_EACH_VEC_ELT (epilogue_cost_vec, j, si)
2978 struct _stmt_vec_info *stmt_info
2979 = si->stmt ? vinfo_for_stmt (si->stmt) : NULL;
2980 (void) add_stmt_cost (data, si->count, si->kind, stmt_info,
2981 si->misalign, vect_epilogue);
2984 prologue_cost_vec.release ();
2985 epilogue_cost_vec.release ();
2988 /* FORNOW: The scalar outside cost is incremented in one of the
2989 following ways:
2991 1. The vectorizer checks for alignment and aliasing and generates
2992 a condition that allows dynamic vectorization. A cost model
2993 check is ANDED with the versioning condition. Hence scalar code
2994 path now has the added cost of the versioning check.
2996 if (cost > th & versioning_check)
2997 jmp to vector code
2999 Hence run-time scalar is incremented by not-taken branch cost.
3001 2. The vectorizer then checks if a prologue is required. If the
3002 cost model check was not done before during versioning, it has to
3003 be done before the prologue check.
3005 if (cost <= th)
3006 prologue = scalar_iters
3007 if (prologue == 0)
3008 jmp to vector code
3009 else
3010 execute prologue
3011 if (prologue == num_iters)
3012 go to exit
3014 Hence the run-time scalar cost is incremented by a taken branch,
3015 plus a not-taken branch, plus a taken branch cost.
3017 3. The vectorizer then checks if an epilogue is required. If the
3018 cost model check was not done before during prologue check, it
3019 has to be done with the epilogue check.
3021 if (prologue == 0)
3022 jmp to vector code
3023 else
3024 execute prologue
3025 if (prologue == num_iters)
3026 go to exit
3027 vector code:
3028 if ((cost <= th) | (scalar_iters-prologue-epilogue == 0))
3029 jmp to epilogue
3031 Hence the run-time scalar cost should be incremented by 2 taken
3032 branches.
3034 TODO: The back end may reorder the BBs differently and reverse
3035 conditions/branch directions. Change the estimates below to
3036 something more reasonable. */
3038 /* If the number of iterations is known and we do not do versioning, we can
3039 decide whether to vectorize at compile time. Hence the scalar version
3040 does not carry cost model guard costs. */
3041 if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
3042 || LOOP_REQUIRES_VERSIONING_FOR_ALIGNMENT (loop_vinfo)
3043 || LOOP_REQUIRES_VERSIONING_FOR_ALIAS (loop_vinfo))
3045 /* Cost model check occurs at versioning. */
3046 if (LOOP_REQUIRES_VERSIONING_FOR_ALIGNMENT (loop_vinfo)
3047 || LOOP_REQUIRES_VERSIONING_FOR_ALIAS (loop_vinfo))
3048 scalar_outside_cost += vect_get_stmt_cost (cond_branch_not_taken);
3049 else
3051 /* Cost model check occurs at prologue generation. */
3052 if (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) < 0)
3053 scalar_outside_cost += 2 * vect_get_stmt_cost (cond_branch_taken)
3054 + vect_get_stmt_cost (cond_branch_not_taken);
3055 /* Cost model check occurs at epilogue generation. */
3056 else
3057 scalar_outside_cost += 2 * vect_get_stmt_cost (cond_branch_taken);
3061 /* Complete the target-specific cost calculations. */
3062 finish_cost (LOOP_VINFO_TARGET_COST_DATA (loop_vinfo), &vec_prologue_cost,
3063 &vec_inside_cost, &vec_epilogue_cost);
3065 vec_outside_cost = (int)(vec_prologue_cost + vec_epilogue_cost);
3067 if (dump_enabled_p ())
3069 dump_printf_loc (MSG_NOTE, vect_location, "Cost model analysis: \n");
3070 dump_printf (MSG_NOTE, " Vector inside of loop cost: %d\n",
3071 vec_inside_cost);
3072 dump_printf (MSG_NOTE, " Vector prologue cost: %d\n",
3073 vec_prologue_cost);
3074 dump_printf (MSG_NOTE, " Vector epilogue cost: %d\n",
3075 vec_epilogue_cost);
3076 dump_printf (MSG_NOTE, " Scalar iteration cost: %d\n",
3077 scalar_single_iter_cost);
3078 dump_printf (MSG_NOTE, " Scalar outside cost: %d\n",
3079 scalar_outside_cost);
3080 dump_printf (MSG_NOTE, " Vector outside cost: %d\n",
3081 vec_outside_cost);
3082 dump_printf (MSG_NOTE, " prologue iterations: %d\n",
3083 peel_iters_prologue);
3084 dump_printf (MSG_NOTE, " epilogue iterations: %d\n",
3085 peel_iters_epilogue);
3088 /* Calculate number of iterations required to make the vector version
3089 profitable, relative to the loop bodies only. The following condition
3090 must hold true:
3091 SIC * niters + SOC > VIC * ((niters-PL_ITERS-EP_ITERS)/VF) + VOC
3092 where
3093 SIC = scalar iteration cost, VIC = vector iteration cost,
3094 VOC = vector outside cost, VF = vectorization factor,
3095 PL_ITERS = prologue iterations, EP_ITERS= epilogue iterations
3096 SOC = scalar outside cost for run time cost model check. */
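/* Worked example with made-up costs: SIC == 4, VIC == 6, VF == 4,
   VOC == 20, SOC == 0 and no peeling.  The division below yields
   (20 * 4) / (4 * 4 - 6) == 8, the equality check bumps this to 9, the
   later clamp to VF leaves it at 9, and the final decrement turns it
   into the runtime guard "niters <= 8", i.e. the vector loop is entered
   only for at least 9 iterations.  */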
3098 if ((scalar_single_iter_cost * vf) > (int) vec_inside_cost)
3100 if (vec_outside_cost <= 0)
3101 min_profitable_iters = 1;
3102 else
3104 min_profitable_iters = ((vec_outside_cost - scalar_outside_cost) * vf
3105 - vec_inside_cost * peel_iters_prologue
3106 - vec_inside_cost * peel_iters_epilogue)
3107 / ((scalar_single_iter_cost * vf)
3108 - vec_inside_cost);
3110 if ((scalar_single_iter_cost * vf * min_profitable_iters)
3111 <= (((int) vec_inside_cost * min_profitable_iters)
3112 + (((int) vec_outside_cost - scalar_outside_cost) * vf)))
3113 min_profitable_iters++;
3116 /* vector version will never be profitable. */
3117 else
3119 if (LOOP_VINFO_LOOP (loop_vinfo)->force_vectorize)
3120 warning_at (vect_location, OPT_Wopenmp_simd, "vectorization "
3121 "did not happen for a simd loop");
3123 if (dump_enabled_p ())
3124 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3125 "cost model: the vector iteration cost = %d "
3126 "divided by the scalar iteration cost = %d "
3127 "is greater or equal to the vectorization factor = %d"
3128 ".\n",
3129 vec_inside_cost, scalar_single_iter_cost, vf);
3130 *ret_min_profitable_niters = -1;
3131 *ret_min_profitable_estimate = -1;
3132 return;
3135 dump_printf (MSG_NOTE,
3136 " Calculated minimum iters for profitability: %d\n",
3137 min_profitable_iters);
3139 min_profitable_iters =
3140 min_profitable_iters < vf ? vf : min_profitable_iters;
3142 /* Because the condition we create is:
3143 if (niters <= min_profitable_iters)
3144 then skip the vectorized loop. */
3145 min_profitable_iters--;
3147 if (dump_enabled_p ())
3148 dump_printf_loc (MSG_NOTE, vect_location,
3149 " Runtime profitability threshold = %d\n",
3150 min_profitable_iters);
3152 *ret_min_profitable_niters = min_profitable_iters;
3154 /* Calculate number of iterations required to make the vector version
3155 profitable, relative to the loop bodies only.
3157 Non-vectorized variant is SIC * niters and it must win over vector
3158 variant on the expected loop trip count. The following condition must hold true:
3159 SIC * niters > VIC * ((niters-PL_ITERS-EP_ITERS)/VF) + VOC + SOC */
3161 if (vec_outside_cost <= 0)
3162 min_profitable_estimate = 1;
3163 else
3165 min_profitable_estimate = ((vec_outside_cost + scalar_outside_cost) * vf
3166 - vec_inside_cost * peel_iters_prologue
3167 - vec_inside_cost * peel_iters_epilogue)
3168 / ((scalar_single_iter_cost * vf)
3169 - vec_inside_cost);
3171 min_profitable_estimate --;
3172 min_profitable_estimate = MAX (min_profitable_estimate, min_profitable_iters);
3173 if (dump_enabled_p ())
3174 dump_printf_loc (MSG_NOTE, vect_location,
3175 " Static estimate profitability threshold = %d\n",
3176 min_profitable_estimate);
3178 *ret_min_profitable_estimate = min_profitable_estimate;
3181 /* Writes into SEL a mask for a vec_perm, equivalent to a vec_shr by OFFSET
3182 vector elements (not bits) for a vector of mode MODE. */
3183 static void
3184 calc_vec_perm_mask_for_shift (enum machine_mode mode, unsigned int offset,
3185 unsigned char *sel)
3187 unsigned int i, nelt = GET_MODE_NUNITS (mode);
3189 for (i = 0; i < nelt; i++)
3190 sel[i] = (i + offset) & (2*nelt - 1);
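/* E.g. for nelt == 4 and offset == 2 the mask built above is
   {2, 3, 4, 5}: the last two elements of the first input followed by the
   first two elements of the second input, i.e. a whole-vector shift by
   two elements.  */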
3193 /* Checks whether the target supports whole-vector shifts for vectors of mode
3194 MODE. This is the case if _either_ the platform handles vec_shr_optab, _or_
3195 it supports vec_perm_const with masks for all necessary shift amounts. */
3196 static bool
3197 have_whole_vector_shift (enum machine_mode mode)
3199 if (optab_handler (vec_shr_optab, mode) != CODE_FOR_nothing)
3200 return true;
3202 if (direct_optab_handler (vec_perm_const_optab, mode) == CODE_FOR_nothing)
3203 return false;
3205 unsigned int i, nelt = GET_MODE_NUNITS (mode);
3206 unsigned char *sel = XALLOCAVEC (unsigned char, nelt);
3208 for (i = nelt/2; i >= 1; i/=2)
3210 calc_vec_perm_mask_for_shift (mode, i, sel);
3211 if (!can_vec_perm_p (mode, false, sel))
3212 return false;
3214 return true;
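/* E.g. for an 8-element vector mode the loop above asks for shift masks
   of 4, 2 and 1 elements - the halving steps a whole-vector-shift
   reduction epilogue performs.  */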
3217 /* Return the reduction operand (with index REDUC_INDEX) of STMT. */
3219 static tree
3220 get_reduction_op (gimple stmt, int reduc_index)
3222 switch (get_gimple_rhs_class (gimple_assign_rhs_code (stmt)))
3224 case GIMPLE_SINGLE_RHS:
3225 gcc_assert (TREE_OPERAND_LENGTH (gimple_assign_rhs1 (stmt))
3226 == ternary_op);
3227 return TREE_OPERAND (gimple_assign_rhs1 (stmt), reduc_index);
3228 case GIMPLE_UNARY_RHS:
3229 return gimple_assign_rhs1 (stmt);
3230 case GIMPLE_BINARY_RHS:
3231 return (reduc_index
3232 ? gimple_assign_rhs2 (stmt) : gimple_assign_rhs1 (stmt));
3233 case GIMPLE_TERNARY_RHS:
3234 return gimple_op (stmt, reduc_index + 1);
3235 default:
3236 gcc_unreachable ();
3240 /* TODO: Close dependency between vect_model_*_cost and vectorizable_*
3241 functions. Design better to avoid maintenance issues. */
3243 /* Function vect_model_reduction_cost.
3245 Models cost for a reduction operation, including the vector ops
3246 generated within the strip-mine loop, the initial definition before
3247 the loop, and the epilogue code that must be generated. */
3249 static bool
3250 vect_model_reduction_cost (stmt_vec_info stmt_info, enum tree_code reduc_code,
3251 int ncopies, int reduc_index)
3253 int prologue_cost = 0, epilogue_cost = 0;
3254 enum tree_code code;
3255 optab optab;
3256 tree vectype;
3257 gimple stmt, orig_stmt;
3258 tree reduction_op;
3259 machine_mode mode;
3260 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
3261 struct loop *loop = NULL;
3262 void *target_cost_data;
3264 if (loop_vinfo)
3266 loop = LOOP_VINFO_LOOP (loop_vinfo);
3267 target_cost_data = LOOP_VINFO_TARGET_COST_DATA (loop_vinfo);
3269 else
3270 target_cost_data = BB_VINFO_TARGET_COST_DATA (STMT_VINFO_BB_VINFO (stmt_info));
3272 /* Cost of reduction op inside loop. */
3273 unsigned inside_cost = add_stmt_cost (target_cost_data, ncopies, vector_stmt,
3274 stmt_info, 0, vect_body);
3275 stmt = STMT_VINFO_STMT (stmt_info);
3277 reduction_op = get_reduction_op (stmt, reduc_index);
3279 vectype = get_vectype_for_scalar_type (TREE_TYPE (reduction_op));
3280 if (!vectype)
3282 if (dump_enabled_p ())
3284 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3285 "unsupported data-type ");
3286 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
3287 TREE_TYPE (reduction_op));
3288 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
3290 return false;
3293 mode = TYPE_MODE (vectype);
3294 orig_stmt = STMT_VINFO_RELATED_STMT (stmt_info);
3296 if (!orig_stmt)
3297 orig_stmt = STMT_VINFO_STMT (stmt_info);
3299 code = gimple_assign_rhs_code (orig_stmt);
3301 /* Add in cost for initial definition. */
3302 prologue_cost += add_stmt_cost (target_cost_data, 1, scalar_to_vec,
3303 stmt_info, 0, vect_prologue);
3305 /* Determine cost of epilogue code.
3307 We have a reduction operator that will reduce the vector in one statement.
3308 Also requires scalar extract. */
3310 if (!loop || !nested_in_vect_loop_p (loop, orig_stmt))
3312 if (reduc_code != ERROR_MARK)
3314 epilogue_cost += add_stmt_cost (target_cost_data, 1, vector_stmt,
3315 stmt_info, 0, vect_epilogue);
3316 epilogue_cost += add_stmt_cost (target_cost_data, 1, vec_to_scalar,
3317 stmt_info, 0, vect_epilogue);
3319 else
3321 int vec_size_in_bits = tree_to_uhwi (TYPE_SIZE (vectype));
3322 tree bitsize =
3323 TYPE_SIZE (TREE_TYPE (gimple_assign_lhs (orig_stmt)));
3324 int element_bitsize = tree_to_uhwi (bitsize);
3325 int nelements = vec_size_in_bits / element_bitsize;
3327 optab = optab_for_tree_code (code, vectype, optab_default);
3329 /* We have a whole vector shift available. */
3330 if (VECTOR_MODE_P (mode)
3331 && optab_handler (optab, mode) != CODE_FOR_nothing
3332 && have_whole_vector_shift (mode))
3334 /* Final reduction via vector shifts and the reduction operator.
3335 Also requires scalar extract. */
3336 epilogue_cost += add_stmt_cost (target_cost_data,
3337 exact_log2 (nelements) * 2,
3338 vector_stmt, stmt_info, 0,
3339 vect_epilogue);
3340 epilogue_cost += add_stmt_cost (target_cost_data, 1,
3341 vec_to_scalar, stmt_info, 0,
3342 vect_epilogue);
3344 else
3345 /* Use extracts and reduction op for final reduction. For N
3346 elements, we have N extracts and N-1 reduction ops. */
3347 epilogue_cost += add_stmt_cost (target_cost_data,
3348 nelements + nelements - 1,
3349 vector_stmt, stmt_info, 0,
3350 vect_epilogue);
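/* E.g. for nelements == 4 the shift-based variant above is costed as
   exact_log2 (4) * 2 == 4 vector statements plus one extract, while the
   extract-based fallback is costed as 4 + 3 == 7 vector statements.  */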
3354 if (dump_enabled_p ())
3355 dump_printf (MSG_NOTE,
3356 "vect_model_reduction_cost: inside_cost = %d, "
3357 "prologue_cost = %d, epilogue_cost = %d .\n", inside_cost,
3358 prologue_cost, epilogue_cost);
3360 return true;
3364 /* Function vect_model_induction_cost.
3366 Models cost for induction operations. */
3368 static void
3369 vect_model_induction_cost (stmt_vec_info stmt_info, int ncopies)
3371 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
3372 void *target_cost_data = LOOP_VINFO_TARGET_COST_DATA (loop_vinfo);
3373 unsigned inside_cost, prologue_cost;
3375 /* loop cost for vec_loop. */
3376 inside_cost = add_stmt_cost (target_cost_data, ncopies, vector_stmt,
3377 stmt_info, 0, vect_body);
3379 /* prologue cost for vec_init and vec_step. */
3380 prologue_cost = add_stmt_cost (target_cost_data, 2, scalar_to_vec,
3381 stmt_info, 0, vect_prologue);
3383 if (dump_enabled_p ())
3384 dump_printf_loc (MSG_NOTE, vect_location,
3385 "vect_model_induction_cost: inside_cost = %d, "
3386 "prologue_cost = %d .\n", inside_cost, prologue_cost);
3390 /* Function get_initial_def_for_induction
3392 Input:
3393 STMT - a stmt that performs an induction operation in the loop.
3394 IV_PHI - the initial value of the induction variable
3396 Output:
3397 Return a vector variable, initialized with the first VF values of
3398 the induction variable. E.g., for an iv with IV_PHI='X' and
3399 evolution S, for a vector of 4 units, we want to return:
3400 [X, X + S, X + 2*S, X + 3*S]. */
3402 static tree
3403 get_initial_def_for_induction (gimple iv_phi)
3405 stmt_vec_info stmt_vinfo = vinfo_for_stmt (iv_phi);
3406 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_vinfo);
3407 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
3408 tree vectype;
3409 int nunits;
3410 edge pe = loop_preheader_edge (loop);
3411 struct loop *iv_loop;
3412 basic_block new_bb;
3413 tree new_vec, vec_init, vec_step, t;
3414 tree new_var;
3415 tree new_name;
3416 gimple init_stmt, new_stmt;
3417 gphi *induction_phi;
3418 tree induc_def, vec_def, vec_dest;
3419 tree init_expr, step_expr;
3420 int vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
3421 int i;
3422 int ncopies;
3423 tree expr;
3424 stmt_vec_info phi_info = vinfo_for_stmt (iv_phi);
3425 bool nested_in_vect_loop = false;
3426 gimple_seq stmts = NULL;
3427 imm_use_iterator imm_iter;
3428 use_operand_p use_p;
3429 gimple exit_phi;
3430 edge latch_e;
3431 tree loop_arg;
3432 gimple_stmt_iterator si;
3433 basic_block bb = gimple_bb (iv_phi);
3434 tree stepvectype;
3435 tree resvectype;
3437 /* Is phi in an inner-loop, while vectorizing an enclosing outer-loop? */
3438 if (nested_in_vect_loop_p (loop, iv_phi))
3440 nested_in_vect_loop = true;
3441 iv_loop = loop->inner;
3443 else
3444 iv_loop = loop;
3445 gcc_assert (iv_loop == (gimple_bb (iv_phi))->loop_father);
3447 latch_e = loop_latch_edge (iv_loop);
3448 loop_arg = PHI_ARG_DEF_FROM_EDGE (iv_phi, latch_e);
3450 step_expr = STMT_VINFO_LOOP_PHI_EVOLUTION_PART (phi_info);
3451 gcc_assert (step_expr != NULL_TREE);
3453 pe = loop_preheader_edge (iv_loop);
3454 init_expr = PHI_ARG_DEF_FROM_EDGE (iv_phi,
3455 loop_preheader_edge (iv_loop));
3457 vectype = get_vectype_for_scalar_type (TREE_TYPE (init_expr));
3458 resvectype = get_vectype_for_scalar_type (TREE_TYPE (PHI_RESULT (iv_phi)));
3459 gcc_assert (vectype);
3460 nunits = TYPE_VECTOR_SUBPARTS (vectype);
3461 ncopies = vf / nunits;
3463 gcc_assert (phi_info);
3464 gcc_assert (ncopies >= 1);
3466 /* Convert the step to the desired type. */
3467 step_expr = force_gimple_operand (fold_convert (TREE_TYPE (vectype),
3468 step_expr),
3469 &stmts, true, NULL_TREE);
3470 if (stmts)
3472 new_bb = gsi_insert_seq_on_edge_immediate (pe, stmts);
3473 gcc_assert (!new_bb);
3476 /* Find the first insertion point in the BB. */
3477 si = gsi_after_labels (bb);
3479 /* Create the vector that holds the initial_value of the induction. */
3480 if (nested_in_vect_loop)
3482 /* iv_loop is nested in the loop to be vectorized. init_expr had already
3483 been created during vectorization of previous stmts. We obtain it
3484 from the STMT_VINFO_VEC_STMT of the defining stmt. */
3485 vec_init = vect_get_vec_def_for_operand (init_expr, iv_phi, NULL);
3486 /* If the initial value is not of proper type, convert it. */
3487 if (!useless_type_conversion_p (vectype, TREE_TYPE (vec_init)))
3489 new_stmt
3490 = gimple_build_assign (vect_get_new_vect_var (vectype,
3491 vect_simple_var,
3492 "vec_iv_"),
3493 VIEW_CONVERT_EXPR,
3494 build1 (VIEW_CONVERT_EXPR, vectype,
3495 vec_init));
3496 vec_init = make_ssa_name (gimple_assign_lhs (new_stmt), new_stmt);
3497 gimple_assign_set_lhs (new_stmt, vec_init);
3498 new_bb = gsi_insert_on_edge_immediate (loop_preheader_edge (iv_loop),
3499 new_stmt);
3500 gcc_assert (!new_bb);
3501 set_vinfo_for_stmt (new_stmt,
3502 new_stmt_vec_info (new_stmt, loop_vinfo, NULL));
3505 else
3507 vec<constructor_elt, va_gc> *v;
3509 /* iv_loop is the loop to be vectorized. Create:
3510 vec_init = [X, X+S, X+2*S, X+3*S] (S = step_expr, X = init_expr) */
3511 new_var = vect_get_new_vect_var (TREE_TYPE (vectype),
3512 vect_scalar_var, "var_");
3513 new_name = force_gimple_operand (fold_convert (TREE_TYPE (vectype),
3514 init_expr),
3515 &stmts, false, new_var);
3516 if (stmts)
3518 new_bb = gsi_insert_seq_on_edge_immediate (pe, stmts);
3519 gcc_assert (!new_bb);
3522 vec_alloc (v, nunits);
3523 bool constant_p = is_gimple_min_invariant (new_name);
3524 CONSTRUCTOR_APPEND_ELT (v, NULL_TREE, new_name);
3525 for (i = 1; i < nunits; i++)
3527 /* Create: new_name_i = new_name + step_expr */
3528 new_name = fold_build2 (PLUS_EXPR, TREE_TYPE (new_name),
3529 new_name, step_expr);
3530 if (!is_gimple_min_invariant (new_name))
3532 init_stmt = gimple_build_assign (new_var, new_name);
3533 new_name = make_ssa_name (new_var, init_stmt);
3534 gimple_assign_set_lhs (init_stmt, new_name);
3535 new_bb = gsi_insert_on_edge_immediate (pe, init_stmt);
3536 gcc_assert (!new_bb);
3537 if (dump_enabled_p ())
3539 dump_printf_loc (MSG_NOTE, vect_location,
3540 "created new init_stmt: ");
3541 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, init_stmt, 0);
3542 dump_printf (MSG_NOTE, "\n");
3544 constant_p = false;
3546 CONSTRUCTOR_APPEND_ELT (v, NULL_TREE, new_name);
3548 /* Create a vector from [new_name_0, new_name_1, ..., new_name_nunits-1] */
3549 if (constant_p)
3550 new_vec = build_vector_from_ctor (vectype, v);
3551 else
3552 new_vec = build_constructor (vectype, v);
3553 vec_init = vect_init_vector (iv_phi, new_vec, vectype, NULL);
3557 /* Create the vector that holds the step of the induction. */
3558 if (nested_in_vect_loop)
3559 /* iv_loop is nested in the loop to be vectorized. Generate:
3560 vec_step = [S, S, S, S] */
3561 new_name = step_expr;
3562 else
3564 /* iv_loop is the loop to be vectorized. Generate:
3565 vec_step = [VF*S, VF*S, VF*S, VF*S] */
3566 if (SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr)))
3568 expr = build_int_cst (integer_type_node, vf);
3569 expr = fold_convert (TREE_TYPE (step_expr), expr);
3571 else
3572 expr = build_int_cst (TREE_TYPE (step_expr), vf);
3573 new_name = fold_build2 (MULT_EXPR, TREE_TYPE (step_expr),
3574 expr, step_expr);
3575 if (TREE_CODE (step_expr) == SSA_NAME)
3576 new_name = vect_init_vector (iv_phi, new_name,
3577 TREE_TYPE (step_expr), NULL);
3580 t = unshare_expr (new_name);
3581 gcc_assert (CONSTANT_CLASS_P (new_name)
3582 || TREE_CODE (new_name) == SSA_NAME);
3583 stepvectype = get_vectype_for_scalar_type (TREE_TYPE (new_name));
3584 gcc_assert (stepvectype);
3585 new_vec = build_vector_from_val (stepvectype, t);
3586 vec_step = vect_init_vector (iv_phi, new_vec, stepvectype, NULL);
3589 /* Create the following def-use cycle:
3590 loop prolog:
3591 vec_init = ...
3592 vec_step = ...
3593 loop:
3594 vec_iv = PHI <vec_init, vec_loop>
3596 STMT
3598 vec_loop = vec_iv + vec_step; */
3600 /* Create the induction-phi that defines the induction-operand. */
3601 vec_dest = vect_get_new_vect_var (vectype, vect_simple_var, "vec_iv_");
3602 induction_phi = create_phi_node (vec_dest, iv_loop->header);
3603 set_vinfo_for_stmt (induction_phi,
3604 new_stmt_vec_info (induction_phi, loop_vinfo, NULL));
3605 induc_def = PHI_RESULT (induction_phi);
3607 /* Create the iv update inside the loop */
3608 new_stmt = gimple_build_assign (vec_dest, PLUS_EXPR, induc_def, vec_step);
3609 vec_def = make_ssa_name (vec_dest, new_stmt);
3610 gimple_assign_set_lhs (new_stmt, vec_def);
3611 gsi_insert_before (&si, new_stmt, GSI_SAME_STMT);
3612 set_vinfo_for_stmt (new_stmt, new_stmt_vec_info (new_stmt, loop_vinfo,
3613 NULL));
3615 /* Set the arguments of the phi node: */
3616 add_phi_arg (induction_phi, vec_init, pe, UNKNOWN_LOCATION);
3617 add_phi_arg (induction_phi, vec_def, loop_latch_edge (iv_loop),
3618 UNKNOWN_LOCATION);
3621 /* In case that vectorization factor (VF) is bigger than the number
3622 of elements that we can fit in a vectype (nunits), we have to generate
3623 more than one vector stmt - i.e - we need to "unroll" the
3624 vector stmt by a factor VF/nunits. For more details see documentation
3625 in vectorizable_operation. */
3627 if (ncopies > 1)
3629 stmt_vec_info prev_stmt_vinfo;
3630 /* FORNOW. This restriction should be relaxed. */
3631 gcc_assert (!nested_in_vect_loop);
3633 /* Create the vector that holds the step of the induction. */
3634 if (SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr)))
3636 expr = build_int_cst (integer_type_node, nunits);
3637 expr = fold_convert (TREE_TYPE (step_expr), expr);
3639 else
3640 expr = build_int_cst (TREE_TYPE (step_expr), nunits);
3641 new_name = fold_build2 (MULT_EXPR, TREE_TYPE (step_expr),
3642 expr, step_expr);
3643 if (TREE_CODE (step_expr) == SSA_NAME)
3644 new_name = vect_init_vector (iv_phi, new_name,
3645 TREE_TYPE (step_expr), NULL);
3646 t = unshare_expr (new_name);
3647 gcc_assert (CONSTANT_CLASS_P (new_name)
3648 || TREE_CODE (new_name) == SSA_NAME);
3649 new_vec = build_vector_from_val (stepvectype, t);
3650 vec_step = vect_init_vector (iv_phi, new_vec, stepvectype, NULL);
3652 vec_def = induc_def;
3653 prev_stmt_vinfo = vinfo_for_stmt (induction_phi);
3654 for (i = 1; i < ncopies; i++)
3656 /* vec_i = vec_prev + vec_step */
3657 new_stmt = gimple_build_assign (vec_dest, PLUS_EXPR,
3658 vec_def, vec_step);
3659 vec_def = make_ssa_name (vec_dest, new_stmt);
3660 gimple_assign_set_lhs (new_stmt, vec_def);
3662 gsi_insert_before (&si, new_stmt, GSI_SAME_STMT);
3663 if (!useless_type_conversion_p (resvectype, vectype))
3665 new_stmt
3666 = gimple_build_assign
3667 (vect_get_new_vect_var (resvectype, vect_simple_var,
3668 "vec_iv_"),
3669 VIEW_CONVERT_EXPR,
3670 build1 (VIEW_CONVERT_EXPR, resvectype,
3671 gimple_assign_lhs (new_stmt)));
3672 gimple_assign_set_lhs (new_stmt,
3673 make_ssa_name
3674 (gimple_assign_lhs (new_stmt), new_stmt));
3675 gsi_insert_before (&si, new_stmt, GSI_SAME_STMT);
3677 set_vinfo_for_stmt (new_stmt,
3678 new_stmt_vec_info (new_stmt, loop_vinfo, NULL));
3679 STMT_VINFO_RELATED_STMT (prev_stmt_vinfo) = new_stmt;
3680 prev_stmt_vinfo = vinfo_for_stmt (new_stmt);
3684 if (nested_in_vect_loop)
3686 /* Find the loop-closed exit-phi of the induction, and record
3687 the final vector of induction results: */
3688 exit_phi = NULL;
3689 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, loop_arg)
3691 gimple use_stmt = USE_STMT (use_p);
3692 if (is_gimple_debug (use_stmt))
3693 continue;
3695 if (!flow_bb_inside_loop_p (iv_loop, gimple_bb (use_stmt)))
3697 exit_phi = use_stmt;
3698 break;
3701 if (exit_phi)
3703 stmt_vec_info stmt_vinfo = vinfo_for_stmt (exit_phi);
3704 /* FORNOW. Currently not supporting the case that an inner-loop induction
3705 is not used in the outer-loop (i.e. only outside the outer-loop). */
3706 gcc_assert (STMT_VINFO_RELEVANT_P (stmt_vinfo)
3707 && !STMT_VINFO_LIVE_P (stmt_vinfo));
3709 STMT_VINFO_VEC_STMT (stmt_vinfo) = new_stmt;
3710 if (dump_enabled_p ())
3712 dump_printf_loc (MSG_NOTE, vect_location,
3713 "vector of inductions after inner-loop:");
3714 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, new_stmt, 0);
3715 dump_printf (MSG_NOTE, "\n");
3721 if (dump_enabled_p ())
3723 dump_printf_loc (MSG_NOTE, vect_location,
3724 "transform induction: created def-use cycle: ");
3725 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, induction_phi, 0);
3726 dump_printf (MSG_NOTE, "\n");
3727 dump_gimple_stmt (MSG_NOTE, TDF_SLIM,
3728 SSA_NAME_DEF_STMT (vec_def), 0);
3729 dump_printf (MSG_NOTE, "\n");
3732 STMT_VINFO_VEC_STMT (phi_info) = induction_phi;
3733 if (!useless_type_conversion_p (resvectype, vectype))
3735 new_stmt = gimple_build_assign (vect_get_new_vect_var (resvectype,
3736 vect_simple_var,
3737 "vec_iv_"),
3738 VIEW_CONVERT_EXPR,
3739 build1 (VIEW_CONVERT_EXPR, resvectype,
3740 induc_def));
3741 induc_def = make_ssa_name (gimple_assign_lhs (new_stmt), new_stmt);
3742 gimple_assign_set_lhs (new_stmt, induc_def);
3743 si = gsi_after_labels (bb);
3744 gsi_insert_before (&si, new_stmt, GSI_SAME_STMT);
3745 set_vinfo_for_stmt (new_stmt,
3746 new_stmt_vec_info (new_stmt, loop_vinfo, NULL));
3747 STMT_VINFO_RELATED_STMT (vinfo_for_stmt (new_stmt))
3748 = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (induction_phi));
3751 return induc_def;
3755 /* Function get_initial_def_for_reduction
3757 Input:
3758 STMT - a stmt that performs a reduction operation in the loop.
3759 INIT_VAL - the initial value of the reduction variable
3761 Output:
3762 ADJUSTMENT_DEF - a tree that holds a value to be added to the final result
3763 of the reduction (used for adjusting the epilog - see below).
3764 Return a vector variable, initialized according to the operation that STMT
3765 performs. This vector will be used as the initial value of the
3766 vector of partial results.
3768 Option1 (adjust in epilog): Initialize the vector as follows:
3769 add/bit or/xor: [0,0,...,0,0]
3770 mult/bit and: [1,1,...,1,1]
3771 min/max/cond_expr: [init_val,init_val,..,init_val,init_val]
3772 and when necessary (e.g. add/mult case) let the caller know
3773 that it needs to adjust the result by init_val.
3775 Option2: Initialize the vector as follows:
3776 add/bit or/xor: [init_val,0,0,...,0]
3777 mult/bit and: [init_val,1,1,...,1]
3778 min/max/cond_expr: [init_val,init_val,...,init_val]
3779 and no adjustments are needed.
3781 For example, for the following code:
3783 s = init_val;
3784 for (i=0;i<n;i++)
3785 s = s + a[i];
3787 STMT is 's = s + a[i]', and the reduction variable is 's'.
3788 For a vector of 4 units, we want to return either [0,0,0,init_val],
3789 or [0,0,0,0] and let the caller know that it needs to adjust
3790 the result at the end by 'init_val'.
3792 FORNOW, we use Option1 (the 'adjust in epilog' scheme) when ADJUSTMENT_DEF
3793 is not NULL, because this way the initialization vector is simpler (the
3794 same element in all entries), and Option2 otherwise.
3796 A cost model should help decide between these two schemes. */
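/* For illustration, assuming a PLUS reduction with init_val = 5 and a
   4-element vector:
     Option1: initial vector {0, 0, 0, 0}, ADJUSTMENT_DEF = 5
              (the epilog adds 5 to the reduced result);
     Option2: initial vector {5, 0, 0, 0}, no adjustment needed.
   For MIN/MAX both options use {5, 5, 5, 5} and no adjustment.  */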
3798 tree
3799 get_initial_def_for_reduction (gimple stmt, tree init_val,
3800 tree *adjustment_def)
3802 stmt_vec_info stmt_vinfo = vinfo_for_stmt (stmt);
3803 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_vinfo);
3804 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
3805 tree scalar_type = TREE_TYPE (init_val);
3806 tree vectype = get_vectype_for_scalar_type (scalar_type);
3807 int nunits;
3808 enum tree_code code = gimple_assign_rhs_code (stmt);
3809 tree def_for_init;
3810 tree init_def;
3811 tree *elts;
3812 int i;
3813 bool nested_in_vect_loop = false;
3814 tree init_value;
3815 REAL_VALUE_TYPE real_init_val = dconst0;
3816 int int_init_val = 0;
3817 gimple def_stmt = NULL;
3819 gcc_assert (vectype);
3820 nunits = TYPE_VECTOR_SUBPARTS (vectype);
3822 gcc_assert (POINTER_TYPE_P (scalar_type) || INTEGRAL_TYPE_P (scalar_type)
3823 || SCALAR_FLOAT_TYPE_P (scalar_type));
3825 if (nested_in_vect_loop_p (loop, stmt))
3826 nested_in_vect_loop = true;
3827 else
3828 gcc_assert (loop == (gimple_bb (stmt))->loop_father);
3830 /* In case of double reduction we only create a vector variable to be put
3831 in the reduction phi node. The actual statement creation is done in
3832 vect_create_epilog_for_reduction. */
3833 if (adjustment_def && nested_in_vect_loop
3834 && TREE_CODE (init_val) == SSA_NAME
3835 && (def_stmt = SSA_NAME_DEF_STMT (init_val))
3836 && gimple_code (def_stmt) == GIMPLE_PHI
3837 && flow_bb_inside_loop_p (loop, gimple_bb (def_stmt))
3838 && vinfo_for_stmt (def_stmt)
3839 && STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def_stmt))
3840 == vect_double_reduction_def)
3842 *adjustment_def = NULL;
3843 return vect_create_destination_var (init_val, vectype);
3846 if (TREE_CONSTANT (init_val))
3848 if (SCALAR_FLOAT_TYPE_P (scalar_type))
3849 init_value = build_real (scalar_type, TREE_REAL_CST (init_val));
3850 else
3851 init_value = build_int_cst (scalar_type, TREE_INT_CST_LOW (init_val));
3853 else
3854 init_value = init_val;
3856 switch (code)
3858 case WIDEN_SUM_EXPR:
3859 case DOT_PROD_EXPR:
3860 case SAD_EXPR:
3861 case PLUS_EXPR:
3862 case MINUS_EXPR:
3863 case BIT_IOR_EXPR:
3864 case BIT_XOR_EXPR:
3865 case MULT_EXPR:
3866 case BIT_AND_EXPR:
3867 /* ADJUSTMENT_DEF is NULL when called from
3868 vect_create_epilog_for_reduction to vectorize double reduction. */
3869 if (adjustment_def)
3871 if (nested_in_vect_loop)
3872 *adjustment_def = vect_get_vec_def_for_operand (init_val, stmt,
3873 NULL);
3874 else
3875 *adjustment_def = init_val;
3878 if (code == MULT_EXPR)
3880 real_init_val = dconst1;
3881 int_init_val = 1;
3884 if (code == BIT_AND_EXPR)
3885 int_init_val = -1;
3887 if (SCALAR_FLOAT_TYPE_P (scalar_type))
3888 def_for_init = build_real (scalar_type, real_init_val);
3889 else
3890 def_for_init = build_int_cst (scalar_type, int_init_val);
3892 /* Fill all but the first element of the vector with '0' or '1'. */
3893 elts = XALLOCAVEC (tree, nunits);
3894 for (i = nunits - 2; i >= 0; --i)
3895 elts[i + 1] = def_for_init;
3897 /* Option1: the first element is '0' or '1' as well. */
3898 if (adjustment_def)
3900 elts[0] = def_for_init;
3901 init_def = build_vector (vectype, elts);
3902 break;
3905 /* Option2: the first element is INIT_VAL. */
3906 elts[0] = init_val;
3907 if (TREE_CONSTANT (init_val))
3908 init_def = build_vector (vectype, elts);
3909 else
3911 vec<constructor_elt, va_gc> *v;
3912 vec_alloc (v, nunits);
3913 CONSTRUCTOR_APPEND_ELT (v, NULL_TREE, init_val);
3914 for (i = 1; i < nunits; ++i)
3915 CONSTRUCTOR_APPEND_ELT (v, NULL_TREE, elts[i]);
3916 init_def = build_constructor (vectype, v);
3919 break;
3921 case MIN_EXPR:
3922 case MAX_EXPR:
3923 case COND_EXPR:
3924 if (adjustment_def)
3926 *adjustment_def = NULL_TREE;
3927 init_def = vect_get_vec_def_for_operand (init_val, stmt, NULL);
3928 break;
3931 init_def = build_vector_from_val (vectype, init_value);
3932 break;
3934 default:
3935 gcc_unreachable ();
3938 return init_def;
3941 /* Function vect_create_epilog_for_reduction
3943 Create code at the loop-epilog to finalize the result of a reduction
3944 computation.
3946 VECT_DEFS is list of vector of partial results, i.e., the lhs's of vector
3947 reduction statements.
3948 STMT is the scalar reduction stmt that is being vectorized.
3949 NCOPIES is > 1 in case the vectorization factor (VF) is bigger than the
3950 number of elements that we can fit in a vectype (nunits). In this case
3951 we have to generate more than one vector stmt - i.e - we need to "unroll"
3952 the vector stmt by a factor VF/nunits. For more details see documentation
3953 in vectorizable_operation.
3954 REDUC_CODE is the tree-code for the epilog reduction.
3955 REDUCTION_PHIS is a list of the phi-nodes that carry the reduction
3956 computation.
3957 REDUC_INDEX is the index of the operand in the right hand side of the
3958 statement that is defined by REDUCTION_PHI.
3959 DOUBLE_REDUC is TRUE if double reduction phi nodes should be handled.
3960 SLP_NODE is an SLP node containing a group of reduction statements. The
3961 first one in this group is STMT.
3963 This function:
3964 1. Creates the reduction def-use cycles: sets the arguments for
3965 REDUCTION_PHIS:
3966 The loop-entry argument is the vectorized initial-value of the reduction.
3967 The loop-latch argument is taken from VECT_DEFS - the vector of partial
3968 sums.
3969 2. "Reduces" each vector of partial results VECT_DEFS into a single result,
3970 by applying the operation specified by REDUC_CODE if available, or by
3971 other means (whole-vector shifts or a scalar loop).
3972 The function also creates a new phi node at the loop exit to preserve
3973 loop-closed form, as illustrated below.
3975 The flow at the entry to this function:
3977 loop:
3978 vec_def = phi <null, null> # REDUCTION_PHI
3979 VECT_DEF = vector_stmt # vectorized form of STMT
3980 s_loop = scalar_stmt # (scalar) STMT
3981 loop_exit:
3982 s_out0 = phi <s_loop> # (scalar) EXIT_PHI
3983 use <s_out0>
3984 use <s_out0>
3986 The above is transformed by this function into:
3988 loop:
3989 vec_def = phi <vec_init, VECT_DEF> # REDUCTION_PHI
3990 VECT_DEF = vector_stmt # vectorized form of STMT
3991 s_loop = scalar_stmt # (scalar) STMT
3992 loop_exit:
3993 s_out0 = phi <s_loop> # (scalar) EXIT_PHI
3994 v_out1 = phi <VECT_DEF> # NEW_EXIT_PHI
3995 v_out2 = reduce <v_out1>
3996 s_out3 = extract_field <v_out2, 0>
3997 s_out4 = adjust_result <s_out3>
3998 use <s_out4>
3999 use <s_out4>
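/* For illustration, for a PLUS reduction vectorized with a 4-element
   vector and Option1 initialization ({0,0,0,0} with ADJUSTMENT_DEF ==
   init_val), the epilog computes roughly:
     v_out1 = {p0, p1, p2, p3}                partial sums from the loop
     v_out2 = reduce <v_out1>                 p0 + p1 + p2 + p3
     s_out3 = extract_field <v_out2, 0>
     s_out4 = s_out3 + init_val               the adjustment step  */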
4002 static void
4003 vect_create_epilog_for_reduction (vec<tree> vect_defs, gimple stmt,
4004 int ncopies, enum tree_code reduc_code,
4005 vec<gimple> reduction_phis,
4006 int reduc_index, bool double_reduc,
4007 slp_tree slp_node)
4009 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
4010 stmt_vec_info prev_phi_info;
4011 tree vectype;
4012 machine_mode mode;
4013 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
4014 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo), *outer_loop = NULL;
4015 basic_block exit_bb;
4016 tree scalar_dest;
4017 tree scalar_type;
4018 gimple new_phi = NULL, phi;
4019 gimple_stmt_iterator exit_gsi;
4020 tree vec_dest;
4021 tree new_temp = NULL_TREE, new_dest, new_name, new_scalar_dest;
4022 gimple epilog_stmt = NULL;
4023 enum tree_code code = gimple_assign_rhs_code (stmt);
4024 gimple exit_phi;
4025 tree bitsize;
4026 tree adjustment_def = NULL;
4027 tree vec_initial_def = NULL;
4028 tree reduction_op, expr, def;
4029 tree orig_name, scalar_result;
4030 imm_use_iterator imm_iter, phi_imm_iter;
4031 use_operand_p use_p, phi_use_p;
4032 gimple use_stmt, orig_stmt, reduction_phi = NULL;
4033 bool nested_in_vect_loop = false;
4034 auto_vec<gimple> new_phis;
4035 auto_vec<gimple> inner_phis;
4036 enum vect_def_type dt = vect_unknown_def_type;
4037 int j, i;
4038 auto_vec<tree> scalar_results;
4039 unsigned int group_size = 1, k, ratio;
4040 auto_vec<tree> vec_initial_defs;
4041 auto_vec<gimple> phis;
4042 bool slp_reduc = false;
4043 tree new_phi_result;
4044 gimple inner_phi = NULL;
4046 if (slp_node)
4047 group_size = SLP_TREE_SCALAR_STMTS (slp_node).length ();
4049 if (nested_in_vect_loop_p (loop, stmt))
4051 outer_loop = loop;
4052 loop = loop->inner;
4053 nested_in_vect_loop = true;
4054 gcc_assert (!slp_node);
4057 reduction_op = get_reduction_op (stmt, reduc_index);
4059 vectype = get_vectype_for_scalar_type (TREE_TYPE (reduction_op));
4060 gcc_assert (vectype);
4061 mode = TYPE_MODE (vectype);
4063 /* 1. Create the reduction def-use cycle:
4064 Set the arguments of REDUCTION_PHIS, i.e., transform
4066 loop:
4067 vec_def = phi <null, null> # REDUCTION_PHI
4068 VECT_DEF = vector_stmt # vectorized form of STMT
4071 into:
4073 loop:
4074 vec_def = phi <vec_init, VECT_DEF> # REDUCTION_PHI
4075 VECT_DEF = vector_stmt # vectorized form of STMT
4078 (in case of SLP, do it for all the phis). */
4080 /* Get the loop-entry arguments. */
4081 if (slp_node)
4082 vect_get_vec_defs (reduction_op, NULL_TREE, stmt, &vec_initial_defs,
4083 NULL, slp_node, reduc_index);
4084 else
4086 vec_initial_defs.create (1);
4087 /* For the case of reduction, vect_get_vec_def_for_operand returns
4088 the scalar def before the loop, that defines the initial value
4089 of the reduction variable. */
4090 vec_initial_def = vect_get_vec_def_for_operand (reduction_op, stmt,
4091 &adjustment_def);
4092 vec_initial_defs.quick_push (vec_initial_def);
4095 /* Set phi nodes arguments. */
4096 FOR_EACH_VEC_ELT (reduction_phis, i, phi)
4098 tree vec_init_def, def;
4099 gimple_seq stmts;
4100 vec_init_def = force_gimple_operand (vec_initial_defs[i], &stmts,
4101 true, NULL_TREE);
4102 gsi_insert_seq_on_edge_immediate (loop_preheader_edge (loop), stmts);
4103 def = vect_defs[i];
4104 for (j = 0; j < ncopies; j++)
4106 /* Set the loop-entry arg of the reduction-phi. */
4107 add_phi_arg (as_a <gphi *> (phi), vec_init_def,
4108 loop_preheader_edge (loop), UNKNOWN_LOCATION);
4110 /* Set the loop-latch arg for the reduction-phi. */
4111 if (j > 0)
4112 def = vect_get_vec_def_for_stmt_copy (vect_unknown_def_type, def);
4114 add_phi_arg (as_a <gphi *> (phi), def, loop_latch_edge (loop),
4115 UNKNOWN_LOCATION);
4117 if (dump_enabled_p ())
4119 dump_printf_loc (MSG_NOTE, vect_location,
4120 "transform reduction: created def-use cycle: ");
4121 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, phi, 0);
4122 dump_printf (MSG_NOTE, "\n");
4123 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, SSA_NAME_DEF_STMT (def), 0);
4124 dump_printf (MSG_NOTE, "\n");
4127 phi = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (phi));
4131 /* 2. Create epilog code.
4132 The reduction epilog code operates across the elements of the vector
4133 of partial results computed by the vectorized loop.
4134 The reduction epilog code consists of:
4136 step 1: compute the scalar result in a vector (v_out2)
4137 step 2: extract the scalar result (s_out3) from the vector (v_out2)
4138 step 3: adjust the scalar result (s_out3) if needed.
4140 Step 1 can be accomplished using one of the following three schemes:
4141 (scheme 1) using reduc_code, if available.
4142 (scheme 2) using whole-vector shifts, if available.
4143 (scheme 3) using a scalar loop. In this case steps 1+2 above are
4144 combined.
4146 The overall epilog code looks like this:
4148 s_out0 = phi <s_loop> # original EXIT_PHI
4149 v_out1 = phi <VECT_DEF> # NEW_EXIT_PHI
4150 v_out2 = reduce <v_out1> # step 1
4151 s_out3 = extract_field <v_out2, 0> # step 2
4152 s_out4 = adjust_result <s_out3> # step 3
4154 (step 3 is optional, and steps 1 and 2 may be combined).
4155 Lastly, the uses of s_out0 are replaced by s_out4. */
4158 /* 2.1 Create new loop-exit-phis to preserve loop-closed form:
4159 v_out1 = phi <VECT_DEF>
4160 Store them in NEW_PHIS. */
4162 exit_bb = single_exit (loop)->dest;
4163 prev_phi_info = NULL;
4164 new_phis.create (vect_defs.length ());
4165 FOR_EACH_VEC_ELT (vect_defs, i, def)
4167 for (j = 0; j < ncopies; j++)
4169 tree new_def = copy_ssa_name (def);
4170 phi = create_phi_node (new_def, exit_bb);
4171 set_vinfo_for_stmt (phi, new_stmt_vec_info (phi, loop_vinfo, NULL));
4172 if (j == 0)
4173 new_phis.quick_push (phi);
4174 else
4176 def = vect_get_vec_def_for_stmt_copy (dt, def);
4177 STMT_VINFO_RELATED_STMT (prev_phi_info) = phi;
4180 SET_PHI_ARG_DEF (phi, single_exit (loop)->dest_idx, def);
4181 prev_phi_info = vinfo_for_stmt (phi);
4185 /* The epilogue is created for the outer-loop, i.e., for the loop being
4186 vectorized. Create exit phis for the outer loop. */
4187 if (double_reduc)
4189 loop = outer_loop;
4190 exit_bb = single_exit (loop)->dest;
4191 inner_phis.create (vect_defs.length ());
4192 FOR_EACH_VEC_ELT (new_phis, i, phi)
4194 tree new_result = copy_ssa_name (PHI_RESULT (phi));
4195 gphi *outer_phi = create_phi_node (new_result, exit_bb);
4196 SET_PHI_ARG_DEF (outer_phi, single_exit (loop)->dest_idx,
4197 PHI_RESULT (phi));
4198 set_vinfo_for_stmt (outer_phi, new_stmt_vec_info (outer_phi,
4199 loop_vinfo, NULL));
4200 inner_phis.quick_push (phi);
4201 new_phis[i] = outer_phi;
4202 prev_phi_info = vinfo_for_stmt (outer_phi);
4203 while (STMT_VINFO_RELATED_STMT (vinfo_for_stmt (phi)))
4205 phi = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (phi));
4206 new_result = copy_ssa_name (PHI_RESULT (phi));
4207 outer_phi = create_phi_node (new_result, exit_bb);
4208 SET_PHI_ARG_DEF (outer_phi, single_exit (loop)->dest_idx,
4209 PHI_RESULT (phi));
4210 set_vinfo_for_stmt (outer_phi, new_stmt_vec_info (outer_phi,
4211 loop_vinfo, NULL));
4212 STMT_VINFO_RELATED_STMT (prev_phi_info) = outer_phi;
4213 prev_phi_info = vinfo_for_stmt (outer_phi);
4218 exit_gsi = gsi_after_labels (exit_bb);
4220 /* 2.2 Get the relevant tree-code to use in the epilog for schemes 2,3
4221 (i.e. when reduc_code is not available) and in the final adjustment
4222 code (if needed). Also get the original scalar reduction variable as
4223 defined in the loop. In case STMT is a "pattern-stmt" (i.e. - it
4224 represents a reduction pattern), the tree-code and scalar-def are
4225 taken from the original stmt that the pattern-stmt (STMT) replaces.
4226 Otherwise (it is a regular reduction) - the tree-code and scalar-def
4227 are taken from STMT. */
4229 orig_stmt = STMT_VINFO_RELATED_STMT (stmt_info);
4230 if (!orig_stmt)
4232 /* Regular reduction */
4233 orig_stmt = stmt;
4235 else
4237 /* Reduction pattern */
4238 stmt_vec_info stmt_vinfo = vinfo_for_stmt (orig_stmt);
4239 gcc_assert (STMT_VINFO_IN_PATTERN_P (stmt_vinfo));
4240 gcc_assert (STMT_VINFO_RELATED_STMT (stmt_vinfo) == stmt);
4243 code = gimple_assign_rhs_code (orig_stmt);
4244 /* For MINUS_EXPR the initial vector is [init_val,0,...,0], therefore,
4245 partial results are added and not subtracted. */
4246 if (code == MINUS_EXPR)
4247 code = PLUS_EXPR;
4249 scalar_dest = gimple_assign_lhs (orig_stmt);
4250 scalar_type = TREE_TYPE (scalar_dest);
4251 scalar_results.create (group_size);
4252 new_scalar_dest = vect_create_destination_var (scalar_dest, NULL);
4253 bitsize = TYPE_SIZE (scalar_type);
4255 /* In case this is a reduction in an inner-loop while vectorizing an outer
4256 loop - we don't need to extract a single scalar result at the end of the
4257 inner-loop (unless it is double reduction, i.e., the use of reduction is
4258 outside the outer-loop). The final vector of partial results will be used
4259 in the vectorized outer-loop, or reduced to a scalar result at the end of
4260 the outer-loop. */
4261 if (nested_in_vect_loop && !double_reduc)
4262 goto vect_finalize_reduction;
4264 /* SLP reduction without reduction chain, e.g.,
4265 # a1 = phi <a2, a0>
4266 # b1 = phi <b2, b0>
4267 a2 = operation (a1)
4268 b2 = operation (b1) */
4269 slp_reduc = (slp_node && !GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)));
4271 /* In case of reduction chain, e.g.,
4272 # a1 = phi <a3, a0>
4273 a2 = operation (a1)
4274 a3 = operation (a2),
4276 we may end up with more than one vector result. Here we reduce them to
4277 one vector. */
4278 if (GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)))
4280 tree first_vect = PHI_RESULT (new_phis[0]);
4281 tree tmp;
4282 gassign *new_vec_stmt = NULL;
4284 vec_dest = vect_create_destination_var (scalar_dest, vectype);
4285 for (k = 1; k < new_phis.length (); k++)
4287 gimple next_phi = new_phis[k];
4288 tree second_vect = PHI_RESULT (next_phi);
4290 tmp = build2 (code, vectype, first_vect, second_vect);
4291 new_vec_stmt = gimple_build_assign (vec_dest, tmp);
4292 first_vect = make_ssa_name (vec_dest, new_vec_stmt);
4293 gimple_assign_set_lhs (new_vec_stmt, first_vect);
4294 gsi_insert_before (&exit_gsi, new_vec_stmt, GSI_SAME_STMT);
4297 new_phi_result = first_vect;
4298 if (new_vec_stmt)
4300 new_phis.truncate (0);
4301 new_phis.safe_push (new_vec_stmt);
4304 else
4305 new_phi_result = PHI_RESULT (new_phis[0]);
4307 /* 2.3 Create the reduction code, using one of the three schemes described
4308 above. In SLP we simply need to extract all the elements from the
4309 vector (without reducing them), so we use scalar shifts. */
4310 if (reduc_code != ERROR_MARK && !slp_reduc)
4312 tree tmp;
4313 tree vec_elem_type;
4315 /*** Case 1: Create:
4316 v_out2 = reduc_expr <v_out1> */
4318 if (dump_enabled_p ())
4319 dump_printf_loc (MSG_NOTE, vect_location,
4320 "Reduce using direct vector reduction.\n");
4322 vec_elem_type = TREE_TYPE (TREE_TYPE (new_phi_result));
4323 if (!useless_type_conversion_p (scalar_type, vec_elem_type))
4325 tree tmp_dest =
4326 vect_create_destination_var (scalar_dest, vec_elem_type);
4327 tmp = build1 (reduc_code, vec_elem_type, new_phi_result);
4328 epilog_stmt = gimple_build_assign (tmp_dest, tmp);
4329 new_temp = make_ssa_name (tmp_dest, epilog_stmt);
4330 gimple_assign_set_lhs (epilog_stmt, new_temp);
4331 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
4333 tmp = build1 (NOP_EXPR, scalar_type, new_temp);
4335 else
4336 tmp = build1 (reduc_code, scalar_type, new_phi_result);
4337 epilog_stmt = gimple_build_assign (new_scalar_dest, tmp);
4338 new_temp = make_ssa_name (new_scalar_dest, epilog_stmt);
4339 gimple_assign_set_lhs (epilog_stmt, new_temp);
4340 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
4341 scalar_results.safe_push (new_temp);
4343 else
4345 bool reduce_with_shift = have_whole_vector_shift (mode);
4346 int element_bitsize = tree_to_uhwi (bitsize);
4347 int vec_size_in_bits = tree_to_uhwi (TYPE_SIZE (vectype));
4348 tree vec_temp;
4350 /* Regardless of whether we have a whole vector shift, if we're
4351 emulating the operation via tree-vect-generic, we don't want
4352 to use it. Only the first round of the reduction is likely
4353 to still be profitable via emulation. */
4354 /* ??? It might be better to emit a reduction tree code here, so that
4355 tree-vect-generic can expand the first round via bit tricks. */
4356 if (!VECTOR_MODE_P (mode))
4357 reduce_with_shift = false;
4358 else
4360 optab optab = optab_for_tree_code (code, vectype, optab_default);
4361 if (optab_handler (optab, mode) == CODE_FOR_nothing)
4362 reduce_with_shift = false;
4365 if (reduce_with_shift && !slp_reduc)
4367 int nelements = vec_size_in_bits / element_bitsize;
4368 unsigned char *sel = XALLOCAVEC (unsigned char, nelements);
4370 int elt_offset;
4372 tree zero_vec = build_zero_cst (vectype);
4373 /*** Case 2: Create:
4374 for (offset = nelements/2; offset >= 1; offset/=2)
4376 Create: va' = vec_shift <va, offset>
4377 Create: va = vop <va, va'>
4378 } */
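/* For illustration (a sketch, assuming the permutation shifts elements
   towards element 0 and fills the tail with zeros, which is neutral for
   PLUS), reducing {1, 2, 3, 4}:
     offset 2:  va' = {3, 4, 0, 0}   va = {4, 6, 3, 4}
     offset 1:  va' = {6, 3, 4, 0}   va = {10, 9, 7, 4}
   element 0 of the final va then holds the full sum 10.  */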
4380 tree rhs;
4382 if (dump_enabled_p ())
4383 dump_printf_loc (MSG_NOTE, vect_location,
4384 "Reduce using vector shifts\n");
4386 vec_dest = vect_create_destination_var (scalar_dest, vectype);
4387 new_temp = new_phi_result;
4388 for (elt_offset = nelements / 2;
4389 elt_offset >= 1;
4390 elt_offset /= 2)
4392 calc_vec_perm_mask_for_shift (mode, elt_offset, sel);
4393 tree mask = vect_gen_perm_mask_any (vectype, sel);
4394 epilog_stmt = gimple_build_assign (vec_dest, VEC_PERM_EXPR,
4395 new_temp, zero_vec, mask);
4396 new_name = make_ssa_name (vec_dest, epilog_stmt);
4397 gimple_assign_set_lhs (epilog_stmt, new_name);
4398 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
4400 epilog_stmt = gimple_build_assign (vec_dest, code, new_name,
4401 new_temp);
4402 new_temp = make_ssa_name (vec_dest, epilog_stmt);
4403 gimple_assign_set_lhs (epilog_stmt, new_temp);
4404 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
4407 /* 2.4 Extract the final scalar result. Create:
4408 s_out3 = extract_field <v_out2, bitpos> */
4410 if (dump_enabled_p ())
4411 dump_printf_loc (MSG_NOTE, vect_location,
4412 "extract scalar result\n");
4414 rhs = build3 (BIT_FIELD_REF, scalar_type, new_temp,
4415 bitsize, bitsize_zero_node);
4416 epilog_stmt = gimple_build_assign (new_scalar_dest, rhs);
4417 new_temp = make_ssa_name (new_scalar_dest, epilog_stmt);
4418 gimple_assign_set_lhs (epilog_stmt, new_temp);
4419 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
4420 scalar_results.safe_push (new_temp);
4422 else
4424 /*** Case 3: Create:
4425 s = extract_field <v_out2, 0>
4426 for (offset = element_size;
4427 offset < vector_size;
4428 offset += element_size;)
4430 Create: s' = extract_field <v_out2, offset>
4431 Create: s = op <s, s'> // For non SLP cases
4432 } */
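/* For illustration, reducing {1, 2, 3, 4} (32-bit elements) with PLUS
   and scalar code:
     s  = extract_field <v_out2, 0>        s  = 1
     s' = extract_field <v_out2, 32>       s += s'  -> 3
     s' = extract_field <v_out2, 64>       s += s'  -> 6
     s' = extract_field <v_out2, 96>       s += s'  -> 10
   (in the SLP case the extracted values are kept separately instead).  */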
4434 if (dump_enabled_p ())
4435 dump_printf_loc (MSG_NOTE, vect_location,
4436 "Reduce using scalar code.\n");
4438 vec_size_in_bits = tree_to_uhwi (TYPE_SIZE (vectype));
4439 FOR_EACH_VEC_ELT (new_phis, i, new_phi)
4441 int bit_offset;
4442 if (gimple_code (new_phi) == GIMPLE_PHI)
4443 vec_temp = PHI_RESULT (new_phi);
4444 else
4445 vec_temp = gimple_assign_lhs (new_phi);
4446 tree rhs = build3 (BIT_FIELD_REF, scalar_type, vec_temp, bitsize,
4447 bitsize_zero_node);
4448 epilog_stmt = gimple_build_assign (new_scalar_dest, rhs);
4449 new_temp = make_ssa_name (new_scalar_dest, epilog_stmt);
4450 gimple_assign_set_lhs (epilog_stmt, new_temp);
4451 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
4453 /* In SLP we don't need to apply the reduction operation, so we just
4454 collect s' values in SCALAR_RESULTS. */
4455 if (slp_reduc)
4456 scalar_results.safe_push (new_temp);
4458 for (bit_offset = element_bitsize;
4459 bit_offset < vec_size_in_bits;
4460 bit_offset += element_bitsize)
4462 tree bitpos = bitsize_int (bit_offset);
4463 tree rhs = build3 (BIT_FIELD_REF, scalar_type, vec_temp,
4464 bitsize, bitpos);
4466 epilog_stmt = gimple_build_assign (new_scalar_dest, rhs);
4467 new_name = make_ssa_name (new_scalar_dest, epilog_stmt);
4468 gimple_assign_set_lhs (epilog_stmt, new_name);
4469 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
4471 if (slp_reduc)
4473 /* In SLP we don't need to apply the reduction operation, so
4474 we just collect s' values in SCALAR_RESULTS. */
4475 new_temp = new_name;
4476 scalar_results.safe_push (new_name);
4478 else
4480 epilog_stmt = gimple_build_assign (new_scalar_dest, code,
4481 new_name, new_temp);
4482 new_temp = make_ssa_name (new_scalar_dest, epilog_stmt);
4483 gimple_assign_set_lhs (epilog_stmt, new_temp);
4484 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
4489 /* The only case where we need to reduce scalar results in SLP is
4490 unrolling. If the size of SCALAR_RESULTS is greater than
4491 GROUP_SIZE, we reduce them by combining elements modulo
4492 GROUP_SIZE. */
4493 if (slp_reduc)
4495 tree res, first_res, new_res;
4496 gimple new_stmt;
4498 /* Reduce multiple scalar results in case of SLP unrolling. */
4499 for (j = group_size; scalar_results.iterate (j, &res);
4500 j++)
4502 first_res = scalar_results[j % group_size];
4503 new_stmt = gimple_build_assign (new_scalar_dest, code,
4504 first_res, res);
4505 new_res = make_ssa_name (new_scalar_dest, new_stmt);
4506 gimple_assign_set_lhs (new_stmt, new_res);
4507 gsi_insert_before (&exit_gsi, new_stmt, GSI_SAME_STMT);
4508 scalar_results[j % group_size] = new_res;
4511 else
4512 /* Not SLP - we have one scalar to keep in SCALAR_RESULTS. */
4513 scalar_results.safe_push (new_temp);
4517 vect_finalize_reduction:
4519 if (double_reduc)
4520 loop = loop->inner;
4522 /* 2.5 Adjust the final result by the initial value of the reduction
4523 variable. (When such adjustment is not needed, then
4524 'adjustment_def' is zero). For example, if code is PLUS we create:
4525 new_temp = loop_exit_def + adjustment_def */
4527 if (adjustment_def)
4529 gcc_assert (!slp_reduc);
4530 if (nested_in_vect_loop)
4532 new_phi = new_phis[0];
4533 gcc_assert (TREE_CODE (TREE_TYPE (adjustment_def)) == VECTOR_TYPE);
4534 expr = build2 (code, vectype, PHI_RESULT (new_phi), adjustment_def);
4535 new_dest = vect_create_destination_var (scalar_dest, vectype);
4537 else
4539 new_temp = scalar_results[0];
4540 gcc_assert (TREE_CODE (TREE_TYPE (adjustment_def)) != VECTOR_TYPE);
4541 expr = build2 (code, scalar_type, new_temp, adjustment_def);
4542 new_dest = vect_create_destination_var (scalar_dest, scalar_type);
4545 epilog_stmt = gimple_build_assign (new_dest, expr);
4546 new_temp = make_ssa_name (new_dest, epilog_stmt);
4547 gimple_assign_set_lhs (epilog_stmt, new_temp);
4548 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
4549 if (nested_in_vect_loop)
4551 set_vinfo_for_stmt (epilog_stmt,
4552 new_stmt_vec_info (epilog_stmt, loop_vinfo,
4553 NULL));
4554 STMT_VINFO_RELATED_STMT (vinfo_for_stmt (epilog_stmt)) =
4555 STMT_VINFO_RELATED_STMT (vinfo_for_stmt (new_phi));
4557 if (!double_reduc)
4558 scalar_results.quick_push (new_temp);
4559 else
4560 scalar_results[0] = new_temp;
4562 else
4563 scalar_results[0] = new_temp;
4565 new_phis[0] = epilog_stmt;
4568 /* 2.6 Handle the loop-exit phis. Replace the uses of scalar loop-exit
4569 phis with new adjusted scalar results, i.e., replace use <s_out0>
4570 with use <s_out4>.
4572 Transform:
4573 loop_exit:
4574 s_out0 = phi <s_loop> # (scalar) EXIT_PHI
4575 v_out1 = phi <VECT_DEF> # NEW_EXIT_PHI
4576 v_out2 = reduce <v_out1>
4577 s_out3 = extract_field <v_out2, 0>
4578 s_out4 = adjust_result <s_out3>
4579 use <s_out0>
4580 use <s_out0>
4582 into:
4584 loop_exit:
4585 s_out0 = phi <s_loop> # (scalar) EXIT_PHI
4586 v_out1 = phi <VECT_DEF> # NEW_EXIT_PHI
4587 v_out2 = reduce <v_out1>
4588 s_out3 = extract_field <v_out2, 0>
4589 s_out4 = adjust_result <s_out3>
4590 use <s_out4>
4591 use <s_out4> */
4594 /* In an SLP reduction chain we reduce the vector results into one vector if
4595 necessary, hence we set GROUP_SIZE to 1 here. SCALAR_DEST is the LHS of
4596 the last stmt in the reduction chain, since we are looking for the loop
4597 exit phi node. */
4598 if (GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)))
4600 gimple dest_stmt = SLP_TREE_SCALAR_STMTS (slp_node)[group_size - 1];
4601 /* Handle reduction patterns. */
4602 if (STMT_VINFO_RELATED_STMT (vinfo_for_stmt (dest_stmt)))
4603 dest_stmt = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (dest_stmt));
4605 scalar_dest = gimple_assign_lhs (dest_stmt);
4606 group_size = 1;
4609 /* In SLP we may have several statements in NEW_PHIS and REDUCTION_PHIS (in
4610 case GROUP_SIZE is greater than the vectorization factor). Therefore, we
4611 need to match SCALAR_RESULTS with corresponding statements. The first
4612 (GROUP_SIZE / number of new vector stmts) scalar results correspond to
4613 the first vector stmt, etc.
4614 (RATIO is equal to (GROUP_SIZE / number of new vector stmts)). */
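/* For illustration, with GROUP_SIZE == 4 and two vector statements in
   NEW_PHIS, RATIO == 2: scalar results 0 and 1 correspond to the first
   vector statement, and results 2 and 3 to the second.  */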
4615 if (group_size > new_phis.length ())
4617 ratio = group_size / new_phis.length ();
4618 gcc_assert (!(group_size % new_phis.length ()));
4620 else
4621 ratio = 1;
4623 for (k = 0; k < group_size; k++)
4625 if (k % ratio == 0)
4627 epilog_stmt = new_phis[k / ratio];
4628 reduction_phi = reduction_phis[k / ratio];
4629 if (double_reduc)
4630 inner_phi = inner_phis[k / ratio];
4633 if (slp_reduc)
4635 gimple current_stmt = SLP_TREE_SCALAR_STMTS (slp_node)[k];
4637 orig_stmt = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (current_stmt));
4638 /* SLP statements can't participate in patterns. */
4639 gcc_assert (!orig_stmt);
4640 scalar_dest = gimple_assign_lhs (current_stmt);
4643 phis.create (3);
4644 /* Find the loop-closed-use at the loop exit of the original scalar
4645 result. (The reduction result is expected to have two immediate uses -
4646 one at the latch block, and one at the loop exit). */
4647 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, scalar_dest)
4648 if (!flow_bb_inside_loop_p (loop, gimple_bb (USE_STMT (use_p)))
4649 && !is_gimple_debug (USE_STMT (use_p)))
4650 phis.safe_push (USE_STMT (use_p));
4652 /* While we expect to have found an exit_phi because of loop-closed-ssa
4653 form we can end up without one if the scalar cycle is dead. */
4655 FOR_EACH_VEC_ELT (phis, i, exit_phi)
4657 if (outer_loop)
4659 stmt_vec_info exit_phi_vinfo = vinfo_for_stmt (exit_phi);
4660 gphi *vect_phi;
4662 /* FORNOW. Currently not supporting the case that an inner-loop
4663 reduction is not used in the outer-loop (but only outside the
4664 outer-loop), unless it is double reduction. */
4665 gcc_assert ((STMT_VINFO_RELEVANT_P (exit_phi_vinfo)
4666 && !STMT_VINFO_LIVE_P (exit_phi_vinfo))
4667 || double_reduc);
4669 if (double_reduc)
4670 STMT_VINFO_VEC_STMT (exit_phi_vinfo) = inner_phi;
4671 else
4672 STMT_VINFO_VEC_STMT (exit_phi_vinfo) = epilog_stmt;
4673 if (!double_reduc
4674 || STMT_VINFO_DEF_TYPE (exit_phi_vinfo)
4675 != vect_double_reduction_def)
4676 continue;
4678 /* Handle double reduction:
4680 stmt1: s1 = phi <s0, s2> - double reduction phi (outer loop)
4681 stmt2: s3 = phi <s1, s4> - (regular) reduc phi (inner loop)
4682 stmt3: s4 = use (s3) - (regular) reduc stmt (inner loop)
4683 stmt4: s2 = phi <s4> - double reduction stmt (outer loop)
4685 At that point the regular reduction (stmt2 and stmt3) is
4686 already vectorized, as well as the exit phi node, stmt4.
4687 Here we vectorize the phi node of double reduction, stmt1, and
4688 update all relevant statements. */
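/* For illustration, a double reduction typically arises from a loop nest
   such as
     s = 0;
     for (i = 0; i < n; i++)          <- outer loop (stmt1/stmt4 above)
       for (j = 0; j < m; j++)        <- inner loop (stmt2/stmt3 above)
         s += a[i][j];
   where the result of the inner-loop reduction is itself carried around
   the outer loop.  */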
4690 /* Go through all the uses of s2 to find double reduction phi
4691 node, i.e., stmt1 above. */
4692 orig_name = PHI_RESULT (exit_phi);
4693 FOR_EACH_IMM_USE_STMT (use_stmt, imm_iter, orig_name)
4695 stmt_vec_info use_stmt_vinfo;
4696 stmt_vec_info new_phi_vinfo;
4697 tree vect_phi_init, preheader_arg, vect_phi_res, init_def;
4698 basic_block bb = gimple_bb (use_stmt);
4699 gimple use;
4701 /* Check that USE_STMT is really a double reduction phi
4702 node. */
4703 if (gimple_code (use_stmt) != GIMPLE_PHI
4704 || gimple_phi_num_args (use_stmt) != 2
4705 || bb->loop_father != outer_loop)
4706 continue;
4707 use_stmt_vinfo = vinfo_for_stmt (use_stmt);
4708 if (!use_stmt_vinfo
4709 || STMT_VINFO_DEF_TYPE (use_stmt_vinfo)
4710 != vect_double_reduction_def)
4711 continue;
4713 /* Create vector phi node for double reduction:
4714 vs1 = phi <vs0, vs2>
4715 vs1 was created previously in this function by a call to
4716 vect_get_vec_def_for_operand and is stored in
4717 vec_initial_def;
4718 vs2 is defined by INNER_PHI, the vectorized EXIT_PHI;
4719 vs0 is created here. */
4721 /* Create vector phi node. */
4722 vect_phi = create_phi_node (vec_initial_def, bb);
4723 new_phi_vinfo = new_stmt_vec_info (vect_phi,
4724 loop_vec_info_for_loop (outer_loop), NULL);
4725 set_vinfo_for_stmt (vect_phi, new_phi_vinfo);
4727 /* Create vs0 - initial def of the double reduction phi. */
4728 preheader_arg = PHI_ARG_DEF_FROM_EDGE (use_stmt,
4729 loop_preheader_edge (outer_loop));
4730 init_def = get_initial_def_for_reduction (stmt,
4731 preheader_arg, NULL);
4732 vect_phi_init = vect_init_vector (use_stmt, init_def,
4733 vectype, NULL);
4735 /* Update phi node arguments with vs0 and vs2. */
4736 add_phi_arg (vect_phi, vect_phi_init,
4737 loop_preheader_edge (outer_loop),
4738 UNKNOWN_LOCATION);
4739 add_phi_arg (vect_phi, PHI_RESULT (inner_phi),
4740 loop_latch_edge (outer_loop), UNKNOWN_LOCATION);
4741 if (dump_enabled_p ())
4743 dump_printf_loc (MSG_NOTE, vect_location,
4744 "created double reduction phi node: ");
4745 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, vect_phi, 0);
4746 dump_printf (MSG_NOTE, "\n");
4749 vect_phi_res = PHI_RESULT (vect_phi);
4751 /* Replace the use, i.e., set the correct vs1 in the regular
4752 reduction phi node. FORNOW, NCOPIES is always 1, so the
4753 loop is redundant. */
4754 use = reduction_phi;
4755 for (j = 0; j < ncopies; j++)
4757 edge pr_edge = loop_preheader_edge (loop);
4758 SET_PHI_ARG_DEF (use, pr_edge->dest_idx, vect_phi_res);
4759 use = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (use));
4765 phis.release ();
4766 if (nested_in_vect_loop)
4768 if (double_reduc)
4769 loop = outer_loop;
4770 else
4771 continue;
4774 phis.create (3);
4775 /* Find the loop-closed-use at the loop exit of the original scalar
4776 result. (The reduction result is expected to have two immediate uses,
4777 one at the latch block, and one at the loop exit). For double
4778 reductions we are looking for exit phis of the outer loop. */
4779 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, scalar_dest)
4781 if (!flow_bb_inside_loop_p (loop, gimple_bb (USE_STMT (use_p))))
4783 if (!is_gimple_debug (USE_STMT (use_p)))
4784 phis.safe_push (USE_STMT (use_p));
4786 else
4788 if (double_reduc && gimple_code (USE_STMT (use_p)) == GIMPLE_PHI)
4790 tree phi_res = PHI_RESULT (USE_STMT (use_p));
4792 FOR_EACH_IMM_USE_FAST (phi_use_p, phi_imm_iter, phi_res)
4794 if (!flow_bb_inside_loop_p (loop,
4795 gimple_bb (USE_STMT (phi_use_p)))
4796 && !is_gimple_debug (USE_STMT (phi_use_p)))
4797 phis.safe_push (USE_STMT (phi_use_p));
4803 FOR_EACH_VEC_ELT (phis, i, exit_phi)
4805 /* Replace the uses: */
4806 orig_name = PHI_RESULT (exit_phi);
4807 scalar_result = scalar_results[k];
4808 FOR_EACH_IMM_USE_STMT (use_stmt, imm_iter, orig_name)
4809 FOR_EACH_IMM_USE_ON_STMT (use_p, imm_iter)
4810 SET_USE (use_p, scalar_result);
4813 phis.release ();
4818 /* Function vectorizable_reduction.
4820 Check if STMT performs a reduction operation that can be vectorized.
4821 If VEC_STMT is also passed, vectorize the STMT: create a vectorized
4822 stmt to replace it, put it in VEC_STMT, and insert it at GSI.
4823 Return FALSE if not a vectorizable STMT, TRUE otherwise.
4825 This function also handles reduction idioms (patterns) that have been
4826 recognized in advance during vect_pattern_recog. In this case, STMT may be
4827 of this form:
4828 X = pattern_expr (arg0, arg1, ..., X)
4829 and its STMT_VINFO_RELATED_STMT points to the last stmt in the original
4830 sequence that had been detected and replaced by the pattern-stmt (STMT).
4832 In some cases of reduction patterns, the type of the reduction variable X is
4833 different than the type of the other arguments of STMT.
4834 In such cases, the vectype that is used when transforming STMT into a vector
4835 stmt is different than the vectype that is used to determine the
4836 vectorization factor, because it consists of a different number of elements
4837 than the actual number of elements that are being operated upon in parallel.
4839 For example, consider an accumulation of shorts into an int accumulator.
4840 On some targets it's possible to vectorize this pattern operating on 8
4841 shorts at a time (hence, the vectype for purposes of determining the
4842 vectorization factor should be V8HI); on the other hand, the vectype that
4843 is used to create the vector form is actually V4SI (the type of the result).
4845 Upon entry to this function, STMT_VINFO_VECTYPE records the vectype that
4846 indicates what is the actual level of parallelism (V8HI in the example), so
4847 that the right vectorization factor would be derived. This vectype
4848 corresponds to the type of arguments to the reduction stmt, and should *NOT*
4849 be used to create the vectorized stmt. The right vectype for the vectorized
4850 stmt is obtained from the type of the result X:
4851 get_vectype_for_scalar_type (TREE_TYPE (X))
4853 This means that, contrary to "regular" reductions (or "regular" stmts in
4854 general), the following equation:
4855 STMT_VINFO_VECTYPE == get_vectype_for_scalar_type (TREE_TYPE (X))
4856 does *NOT* necessarily hold for reduction patterns. */
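/* For illustration, such a pattern can arise from
     short a[N]; int sum = 0;
     for (i = 0; i < N; i++)
       sum += a[i];
   recognized as a widening-summation pattern: STMT_VINFO_VECTYPE would be
   V8HI (eight shorts determine the VF), while the vectorized statement
   itself produces V4SI values.  */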
4858 bool
4859 vectorizable_reduction (gimple stmt, gimple_stmt_iterator *gsi,
4860 gimple *vec_stmt, slp_tree slp_node)
4862 tree vec_dest;
4863 tree scalar_dest;
4864 tree loop_vec_def0 = NULL_TREE, loop_vec_def1 = NULL_TREE;
4865 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
4866 tree vectype_out = STMT_VINFO_VECTYPE (stmt_info);
4867 tree vectype_in = NULL_TREE;
4868 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
4869 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
4870 enum tree_code code, orig_code, epilog_reduc_code;
4871 machine_mode vec_mode;
4872 int op_type;
4873 optab optab, reduc_optab;
4874 tree new_temp = NULL_TREE;
4875 tree def;
4876 gimple def_stmt;
4877 enum vect_def_type dt;
4878 gphi *new_phi = NULL;
4879 tree scalar_type;
4880 bool is_simple_use;
4881 gimple orig_stmt;
4882 stmt_vec_info orig_stmt_info;
4883 tree expr = NULL_TREE;
4884 int i;
4885 int ncopies;
4886 int epilog_copies;
4887 stmt_vec_info prev_stmt_info, prev_phi_info;
4888 bool single_defuse_cycle = false;
4889 tree reduc_def = NULL_TREE;
4890 gimple new_stmt = NULL;
4891 int j;
4892 tree ops[3];
4893 bool nested_cycle = false, found_nested_cycle_def = false;
4894 gimple reduc_def_stmt = NULL;
4895 bool double_reduc = false, dummy;
4896 basic_block def_bb;
4897 struct loop * def_stmt_loop, *outer_loop = NULL;
4898 tree def_arg;
4899 gimple def_arg_stmt;
4900 auto_vec<tree> vec_oprnds0;
4901 auto_vec<tree> vec_oprnds1;
4902 auto_vec<tree> vect_defs;
4903 auto_vec<gimple> phis;
4904 int vec_num;
4905 tree def0, def1, tem, op0, op1 = NULL_TREE;
4906 bool first_p = true;
4908 /* In case of reduction chain we switch to the first stmt in the chain, but
4909 we don't update STMT_INFO, since only the last stmt is marked as reduction
4910 and has reduction properties. */
4911 if (GROUP_FIRST_ELEMENT (stmt_info)
4912 && GROUP_FIRST_ELEMENT (stmt_info) != stmt)
4914 stmt = GROUP_FIRST_ELEMENT (stmt_info);
4915 first_p = false;
4918 if (nested_in_vect_loop_p (loop, stmt))
4920 outer_loop = loop;
4921 loop = loop->inner;
4922 nested_cycle = true;
4925 /* 1. Is vectorizable reduction? */
4926 /* Not supportable if the reduction variable is used in the loop, unless
4927 it's a reduction chain. */
4928 if (STMT_VINFO_RELEVANT (stmt_info) > vect_used_in_outer
4929 && !GROUP_FIRST_ELEMENT (stmt_info))
4930 return false;
4932 /* Reductions that are not used even in an enclosing outer-loop
4933 are expected to be "live" (used out of the loop). */
4934 if (STMT_VINFO_RELEVANT (stmt_info) == vect_unused_in_scope
4935 && !STMT_VINFO_LIVE_P (stmt_info))
4936 return false;
4938 /* Make sure it was already recognized as a reduction computation. */
4939 if (STMT_VINFO_DEF_TYPE (vinfo_for_stmt (stmt)) != vect_reduction_def
4940 && STMT_VINFO_DEF_TYPE (vinfo_for_stmt (stmt)) != vect_nested_cycle)
4941 return false;
4943 /* 2. Has this been recognized as a reduction pattern?
4945 Check if STMT represents a pattern that has been recognized
4946 in earlier analysis stages. For stmts that represent a pattern,
4947 the STMT_VINFO_RELATED_STMT field records the last stmt in
4948 the original sequence that constitutes the pattern. */
4950 orig_stmt = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (stmt));
4951 if (orig_stmt)
4953 orig_stmt_info = vinfo_for_stmt (orig_stmt);
4954 gcc_assert (STMT_VINFO_IN_PATTERN_P (orig_stmt_info));
4955 gcc_assert (!STMT_VINFO_IN_PATTERN_P (stmt_info));
4958 /* 3. Check the operands of the operation. The first operands are defined
4959 inside the loop body. The last operand is the reduction variable,
4960 which is defined by the loop-header-phi. */
4962 gcc_assert (is_gimple_assign (stmt));
4964 /* Flatten RHS. */
4965 switch (get_gimple_rhs_class (gimple_assign_rhs_code (stmt)))
4967 case GIMPLE_SINGLE_RHS:
4968 op_type = TREE_OPERAND_LENGTH (gimple_assign_rhs1 (stmt));
4969 if (op_type == ternary_op)
4971 tree rhs = gimple_assign_rhs1 (stmt);
4972 ops[0] = TREE_OPERAND (rhs, 0);
4973 ops[1] = TREE_OPERAND (rhs, 1);
4974 ops[2] = TREE_OPERAND (rhs, 2);
4975 code = TREE_CODE (rhs);
4977 else
4978 return false;
4979 break;
4981 case GIMPLE_BINARY_RHS:
4982 code = gimple_assign_rhs_code (stmt);
4983 op_type = TREE_CODE_LENGTH (code);
4984 gcc_assert (op_type == binary_op);
4985 ops[0] = gimple_assign_rhs1 (stmt);
4986 ops[1] = gimple_assign_rhs2 (stmt);
4987 break;
4989 case GIMPLE_TERNARY_RHS:
4990 code = gimple_assign_rhs_code (stmt);
4991 op_type = TREE_CODE_LENGTH (code);
4992 gcc_assert (op_type == ternary_op);
4993 ops[0] = gimple_assign_rhs1 (stmt);
4994 ops[1] = gimple_assign_rhs2 (stmt);
4995 ops[2] = gimple_assign_rhs3 (stmt);
4996 break;
4998 case GIMPLE_UNARY_RHS:
4999 return false;
5001 default:
5002 gcc_unreachable ();
5004 /* The default is that the reduction variable is the last operand in the statement. */
5005 int reduc_index = op_type - 1;
5007 if (code == COND_EXPR && slp_node)
5008 return false;
5010 scalar_dest = gimple_assign_lhs (stmt);
5011 scalar_type = TREE_TYPE (scalar_dest);
5012 if (!POINTER_TYPE_P (scalar_type) && !INTEGRAL_TYPE_P (scalar_type)
5013 && !SCALAR_FLOAT_TYPE_P (scalar_type))
5014 return false;
5016 /* Do not try to vectorize bit-precision reductions. */
5017 if ((TYPE_PRECISION (scalar_type)
5018 != GET_MODE_PRECISION (TYPE_MODE (scalar_type))))
5019 return false;
5021 /* All uses but the last are expected to be defined in the loop.
5022 The last use is the reduction variable. In case of nested cycle this
5023 assumption is not true: we use reduc_index to record the index of the
5024 reduction variable. */
5025 for (i = 0; i < op_type - 1; i++)
5027 /* The condition of COND_EXPR is checked in vectorizable_condition(). */
5028 if (i == 0 && code == COND_EXPR)
5029 continue;
5031 is_simple_use = vect_is_simple_use_1 (ops[i], stmt, loop_vinfo, NULL,
5032 &def_stmt, &def, &dt, &tem);
5033 if (!vectype_in)
5034 vectype_in = tem;
5035 gcc_assert (is_simple_use);
5037 if (dt != vect_internal_def
5038 && dt != vect_external_def
5039 && dt != vect_constant_def
5040 && dt != vect_induction_def
5041 && !(dt == vect_nested_cycle && nested_cycle))
5042 return false;
5044 if (dt == vect_nested_cycle)
5046 found_nested_cycle_def = true;
5047 reduc_def_stmt = def_stmt;
5048 reduc_index = i;
5052 is_simple_use = vect_is_simple_use_1 (ops[i], stmt, loop_vinfo, NULL,
5053 &def_stmt, &def, &dt, &tem);
5054 if (!vectype_in)
5055 vectype_in = tem;
5056 gcc_assert (is_simple_use);
5057 if (!found_nested_cycle_def)
5058 reduc_def_stmt = def_stmt;
5060 if (reduc_def_stmt && gimple_code (reduc_def_stmt) != GIMPLE_PHI)
5061 return false;
5063 if (!(dt == vect_reduction_def
5064 || dt == vect_nested_cycle
5065 || ((dt == vect_internal_def || dt == vect_external_def
5066 || dt == vect_constant_def || dt == vect_induction_def)
5067 && nested_cycle && found_nested_cycle_def)))
5069 /* For pattern recognized stmts, orig_stmt might be a reduction,
5070 but some helper statements for the pattern might not, or
5071 might be COND_EXPRs with reduction uses in the condition. */
5072 gcc_assert (orig_stmt);
5073 return false;
5076 gimple tmp = vect_is_simple_reduction (loop_vinfo, reduc_def_stmt,
5077 !nested_cycle, &dummy);
5078 if (orig_stmt)
5079 gcc_assert (tmp == orig_stmt
5080 || GROUP_FIRST_ELEMENT (vinfo_for_stmt (tmp)) == orig_stmt);
5081 else
5082 /* We changed STMT to be the first stmt in reduction chain, hence we
5083 check that in this case the first element in the chain is STMT. */
5084 gcc_assert (stmt == tmp
5085 || GROUP_FIRST_ELEMENT (vinfo_for_stmt (tmp)) == stmt);
5087 if (STMT_VINFO_LIVE_P (vinfo_for_stmt (reduc_def_stmt)))
5088 return false;
5090 if (slp_node || PURE_SLP_STMT (stmt_info))
5091 ncopies = 1;
5092 else
5093 ncopies = (LOOP_VINFO_VECT_FACTOR (loop_vinfo)
5094 / TYPE_VECTOR_SUBPARTS (vectype_in));
5096 gcc_assert (ncopies >= 1);
5098 vec_mode = TYPE_MODE (vectype_in);
5100 if (code == COND_EXPR)
5102 if (!vectorizable_condition (stmt, gsi, NULL, ops[reduc_index], 0, NULL))
5104 if (dump_enabled_p ())
5105 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5106 "unsupported condition in reduction\n");
5108 return false;
5111 else
5113 /* 4. Supportable by target? */
5115 if (code == LSHIFT_EXPR || code == RSHIFT_EXPR
5116 || code == LROTATE_EXPR || code == RROTATE_EXPR)
5118 /* Shifts and rotates are only supported by vectorizable_shifts,
5119 not vectorizable_reduction. */
5120 if (dump_enabled_p ())
5121 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5122 "unsupported shift or rotation.\n");
5123 return false;
5126 /* 4.1. check support for the operation in the loop */
5127 optab = optab_for_tree_code (code, vectype_in, optab_default);
5128 if (!optab)
5130 if (dump_enabled_p ())
5131 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5132 "no optab.\n");
5134 return false;
5137 if (optab_handler (optab, vec_mode) == CODE_FOR_nothing)
5139 if (dump_enabled_p ())
5140 dump_printf (MSG_NOTE, "op not supported by target.\n");
5142 if (GET_MODE_SIZE (vec_mode) != UNITS_PER_WORD
5143 || LOOP_VINFO_VECT_FACTOR (loop_vinfo)
5144 < vect_min_worthwhile_factor (code))
5145 return false;
5147 if (dump_enabled_p ())
5148 dump_printf (MSG_NOTE, "proceeding using word mode.\n");
5151 /* Worthwhile without SIMD support? */
5152 if (!VECTOR_MODE_P (TYPE_MODE (vectype_in))
5153 && LOOP_VINFO_VECT_FACTOR (loop_vinfo)
5154 < vect_min_worthwhile_factor (code))
5156 if (dump_enabled_p ())
5157 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5158 "not worthwhile without SIMD support.\n");
5160 return false;
5164 /* 4.2. Check support for the epilog operation.
5166 If STMT represents a reduction pattern, then the type of the
5167 reduction variable may be different than the type of the rest
5168 of the arguments. For example, consider the case of accumulation
5169 of shorts into an int accumulator. The original code:
5170 S1: int_a = (int) short_a;
5171 orig_stmt-> S2: int_acc = plus <int_a ,int_acc>;
5173 was replaced with:
5174 STMT: int_acc = widen_sum <short_a, int_acc>
5176 This means that:
5177 1. The tree-code that is used to create the vector operation in the
5178 epilog code (that reduces the partial results) is not the
5179 tree-code of STMT, but is rather the tree-code of the original
5180 stmt from the pattern that STMT is replacing. I.e, in the example
5181 above we want to use 'widen_sum' in the loop, but 'plus' in the
5182 epilog.
5183 2. The type (mode) we use to check available target support
5184 for the vector operation to be created in the *epilog*, is
5185 determined by the type of the reduction variable (in the example
5186 above we'd check this: optab_handler (plus_optab, vect_int_mode)).
5187 However the type (mode) we use to check available target support
5188 for the vector operation to be created *inside the loop*, is
5189 determined by the type of the other arguments to STMT (in the
5190 example we'd check this: optab_handler (widen_sum_optab,
5191 vect_short_mode)).
5193 This is contrary to "regular" reductions, in which the types of all
5194 the arguments are the same as the type of the reduction variable.
5195 For "regular" reductions we can therefore use the same vector type
5196 (and also the same tree-code) when generating the epilog code and
5197 when generating the code inside the loop. */
5199 if (orig_stmt)
5201 /* This is a reduction pattern: get the vectype from the type of the
5202 reduction variable, and get the tree-code from orig_stmt. */
5203 orig_code = gimple_assign_rhs_code (orig_stmt);
5204 gcc_assert (vectype_out);
5205 vec_mode = TYPE_MODE (vectype_out);
5207 else
5209 /* Regular reduction: the same vectype and tree-code that are used for
5210 the vector code inside the loop can also be used for the epilog code. */
5211 orig_code = code;
5214 if (nested_cycle)
5216 def_bb = gimple_bb (reduc_def_stmt);
5217 def_stmt_loop = def_bb->loop_father;
5218 def_arg = PHI_ARG_DEF_FROM_EDGE (reduc_def_stmt,
5219 loop_preheader_edge (def_stmt_loop));
5220 if (TREE_CODE (def_arg) == SSA_NAME
5221 && (def_arg_stmt = SSA_NAME_DEF_STMT (def_arg))
5222 && gimple_code (def_arg_stmt) == GIMPLE_PHI
5223 && flow_bb_inside_loop_p (outer_loop, gimple_bb (def_arg_stmt))
5224 && vinfo_for_stmt (def_arg_stmt)
5225 && STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def_arg_stmt))
5226 == vect_double_reduction_def)
5227 double_reduc = true;
5230 epilog_reduc_code = ERROR_MARK;
5231 if (reduction_code_for_scalar_code (orig_code, &epilog_reduc_code))
5233 reduc_optab = optab_for_tree_code (epilog_reduc_code, vectype_out,
5234 optab_default);
5235 if (!reduc_optab)
5237 if (dump_enabled_p ())
5238 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5239 "no optab for reduction.\n");
5241 epilog_reduc_code = ERROR_MARK;
5243 else if (optab_handler (reduc_optab, vec_mode) == CODE_FOR_nothing)
5245 optab = scalar_reduc_to_vector (reduc_optab, vectype_out);
5246 if (optab_handler (optab, vec_mode) == CODE_FOR_nothing)
5248 if (dump_enabled_p ())
5249 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5250 "reduc op not supported by target.\n");
5252 epilog_reduc_code = ERROR_MARK;
5256 else
5258 if (!nested_cycle || double_reduc)
5260 if (dump_enabled_p ())
5261 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5262 "no reduc code for scalar code.\n");
5264 return false;
5268 if (double_reduc && ncopies > 1)
5270 if (dump_enabled_p ())
5271 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5272 "multiple types in double reduction\n");
5274 return false;
5277 /* In case of widening multiplication by a constant, we update the type
5278 of the constant to be the type of the other operand. We check that the
5279 constant fits the type in the pattern recognition pass. */
5280 if (code == DOT_PROD_EXPR
5281 && !types_compatible_p (TREE_TYPE (ops[0]), TREE_TYPE (ops[1])))
5283 if (TREE_CODE (ops[0]) == INTEGER_CST)
5284 ops[0] = fold_convert (TREE_TYPE (ops[1]), ops[0]);
5285 else if (TREE_CODE (ops[1]) == INTEGER_CST)
5286 ops[1] = fold_convert (TREE_TYPE (ops[0]), ops[1]);
5287 else
5289 if (dump_enabled_p ())
5290 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5291 "invalid types in dot-prod\n");
5293 return false;
5297 if (!vec_stmt) /* transformation not required. */
5299 if (first_p
5300 && !vect_model_reduction_cost (stmt_info, epilog_reduc_code, ncopies,
5301 reduc_index))
5302 return false;
5303 STMT_VINFO_TYPE (stmt_info) = reduc_vec_info_type;
5304 return true;
5307 /** Transform. **/
5309 if (dump_enabled_p ())
5310 dump_printf_loc (MSG_NOTE, vect_location, "transform reduction.\n");
5312 /* FORNOW: Multiple types are not supported for condition. */
5313 if (code == COND_EXPR)
5314 gcc_assert (ncopies == 1);
5316 /* Create the destination vector */
5317 vec_dest = vect_create_destination_var (scalar_dest, vectype_out);
5319 /* In case the vectorization factor (VF) is bigger than the number
5320 of elements that we can fit in a vectype (nunits), we have to generate
5321 more than one vector stmt - i.e - we need to "unroll" the
5322 vector stmt by a factor VF/nunits. For more details see documentation
5323 in vectorizable_operation. */
5325 /* If the reduction is used in an outer loop we need to generate
5326 VF intermediate results, like so (e.g. for ncopies=2):
5327 r0 = phi (init, r0)
5328 r1 = phi (init, r1)
5329 r0 = x0 + r0;
5330 r1 = x1 + r1;
5331 (i.e. we generate VF results in 2 registers).
5332 In this case we have a separate def-use cycle for each copy, and therefore
5333 for each copy we get the vector def for the reduction variable from the
5334 respective phi node created for this copy.
5336 Otherwise (the reduction is unused in the loop nest), we can combine
5337 together intermediate results, like so (e.g. for ncopies=2):
5338 r = phi (init, r)
5339 r = x0 + r;
5340 r = x1 + r;
5341 (i.e. we generate VF/2 results in a single register).
5342 In this case for each copy we get the vector def for the reduction variable
5343 from the vectorized reduction operation generated in the previous iteration.
5346 if (STMT_VINFO_RELEVANT (stmt_info) == vect_unused_in_scope)
5348 single_defuse_cycle = true;
5349 epilog_copies = 1;
5351 else
5352 epilog_copies = ncopies;
5354 prev_stmt_info = NULL;
5355 prev_phi_info = NULL;
5356 if (slp_node)
5357 vec_num = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node);
5358 else
5360 vec_num = 1;
5361 vec_oprnds0.create (1);
5362 if (op_type == ternary_op)
5363 vec_oprnds1.create (1);
5366 phis.create (vec_num);
5367 vect_defs.create (vec_num);
5368 if (!slp_node)
5369 vect_defs.quick_push (NULL_TREE);
5371 for (j = 0; j < ncopies; j++)
5373 if (j == 0 || !single_defuse_cycle)
5375 for (i = 0; i < vec_num; i++)
5377 /* Create the reduction-phi that defines the reduction
5378 operand. */
5379 new_phi = create_phi_node (vec_dest, loop->header);
5380 set_vinfo_for_stmt (new_phi,
5381 new_stmt_vec_info (new_phi, loop_vinfo,
5382 NULL));
5383 if (j == 0 || slp_node)
5384 phis.quick_push (new_phi);
5388 if (code == COND_EXPR)
5390 gcc_assert (!slp_node);
5391 vectorizable_condition (stmt, gsi, vec_stmt,
5392 PHI_RESULT (phis[0]),
5393 reduc_index, NULL);
5394 /* Multiple types are not supported for condition. */
5395 break;
5398 /* Handle uses. */
5399 if (j == 0)
5401 op0 = ops[!reduc_index];
5402 if (op_type == ternary_op)
5404 if (reduc_index == 0)
5405 op1 = ops[2];
5406 else
5407 op1 = ops[1];
5410 if (slp_node)
5411 vect_get_vec_defs (op0, op1, stmt, &vec_oprnds0, &vec_oprnds1,
5412 slp_node, -1);
5413 else
5415 loop_vec_def0 = vect_get_vec_def_for_operand (ops[!reduc_index],
5416 stmt, NULL);
5417 vec_oprnds0.quick_push (loop_vec_def0);
5418 if (op_type == ternary_op)
5420 loop_vec_def1 = vect_get_vec_def_for_operand (op1, stmt,
5421 NULL);
5422 vec_oprnds1.quick_push (loop_vec_def1);
5426 else
5428 if (!slp_node)
5430 enum vect_def_type dt;
5431 gimple dummy_stmt;
5432 tree dummy;
5434 vect_is_simple_use (ops[!reduc_index], stmt, loop_vinfo, NULL,
5435 &dummy_stmt, &dummy, &dt);
5436 loop_vec_def0 = vect_get_vec_def_for_stmt_copy (dt,
5437 loop_vec_def0);
5438 vec_oprnds0[0] = loop_vec_def0;
5439 if (op_type == ternary_op)
5441 vect_is_simple_use (op1, stmt, loop_vinfo, NULL, &dummy_stmt,
5442 &dummy, &dt);
5443 loop_vec_def1 = vect_get_vec_def_for_stmt_copy (dt,
5444 loop_vec_def1);
5445 vec_oprnds1[0] = loop_vec_def1;
5449 if (single_defuse_cycle)
5450 reduc_def = gimple_assign_lhs (new_stmt);
5452 STMT_VINFO_RELATED_STMT (prev_phi_info) = new_phi;
5455 FOR_EACH_VEC_ELT (vec_oprnds0, i, def0)
5457 if (slp_node)
5458 reduc_def = PHI_RESULT (phis[i]);
5459 else
5461 if (!single_defuse_cycle || j == 0)
5462 reduc_def = PHI_RESULT (new_phi);
5465 def1 = ((op_type == ternary_op)
5466 ? vec_oprnds1[i] : NULL);
5467 if (op_type == binary_op)
5469 if (reduc_index == 0)
5470 expr = build2 (code, vectype_out, reduc_def, def0);
5471 else
5472 expr = build2 (code, vectype_out, def0, reduc_def);
5474 else
5476 if (reduc_index == 0)
5477 expr = build3 (code, vectype_out, reduc_def, def0, def1);
5478 else
5480 if (reduc_index == 1)
5481 expr = build3 (code, vectype_out, def0, reduc_def, def1);
5482 else
5483 expr = build3 (code, vectype_out, def0, def1, reduc_def);
5487 new_stmt = gimple_build_assign (vec_dest, expr);
5488 new_temp = make_ssa_name (vec_dest, new_stmt);
5489 gimple_assign_set_lhs (new_stmt, new_temp);
5490 vect_finish_stmt_generation (stmt, new_stmt, gsi);
5492 if (slp_node)
5494 SLP_TREE_VEC_STMTS (slp_node).quick_push (new_stmt);
5495 vect_defs.quick_push (new_temp);
5497 else
5498 vect_defs[0] = new_temp;
5501 if (slp_node)
5502 continue;
5504 if (j == 0)
5505 STMT_VINFO_VEC_STMT (stmt_info) = *vec_stmt = new_stmt;
5506 else
5507 STMT_VINFO_RELATED_STMT (prev_stmt_info) = new_stmt;
5509 prev_stmt_info = vinfo_for_stmt (new_stmt);
5510 prev_phi_info = vinfo_for_stmt (new_phi);
5513 /* Finalize the reduction-phi (set its arguments) and create the
5514 epilog reduction code. */
5515 if ((!single_defuse_cycle || code == COND_EXPR) && !slp_node)
5517 new_temp = gimple_assign_lhs (*vec_stmt);
5518 vect_defs[0] = new_temp;
5521 vect_create_epilog_for_reduction (vect_defs, stmt, epilog_copies,
5522 epilog_reduc_code, phis, reduc_index,
5523 double_reduc, slp_node);
5525 return true;
5528 /* Function vect_min_worthwhile_factor.
5530 For a loop where we could vectorize the operation indicated by CODE,
5531 return the minimum vectorization factor that makes it worthwhile
5532 to use generic vectors. */
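/* For instance, a generic-vector PLUS_EXPR is only considered worthwhile
   for a vectorization factor of at least 4, the bitwise operations already
   at 2, and any other code is effectively rejected by returning INT_MAX.  */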
5533 int
5534 vect_min_worthwhile_factor (enum tree_code code)
5536 switch (code)
5538 case PLUS_EXPR:
5539 case MINUS_EXPR:
5540 case NEGATE_EXPR:
5541 return 4;
5543 case BIT_AND_EXPR:
5544 case BIT_IOR_EXPR:
5545 case BIT_XOR_EXPR:
5546 case BIT_NOT_EXPR:
5547 return 2;
5549 default:
5550 return INT_MAX;
5555 /* Function vectorizable_induction
5557 Check if PHI performs an induction computation that can be vectorized.
5558 If VEC_STMT is also passed, vectorize the induction PHI: create a vectorized
5559 phi to replace it, put it in VEC_STMT, and add it to the same basic block.
5560 Return FALSE if not a vectorizable STMT, TRUE otherwise. */
5562 bool
5563 vectorizable_induction (gimple phi, gimple_stmt_iterator *gsi ATTRIBUTE_UNUSED,
5564 gimple *vec_stmt)
5566 stmt_vec_info stmt_info = vinfo_for_stmt (phi);
5567 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
5568 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
5569 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
5570 int nunits = TYPE_VECTOR_SUBPARTS (vectype);
5571 int ncopies = LOOP_VINFO_VECT_FACTOR (loop_vinfo) / nunits;
5572 tree vec_def;
5574 gcc_assert (ncopies >= 1);
5575 /* FORNOW. These restrictions should be relaxed. */
5576 if (nested_in_vect_loop_p (loop, phi))
5578 imm_use_iterator imm_iter;
5579 use_operand_p use_p;
5580 gimple exit_phi;
5581 edge latch_e;
5582 tree loop_arg;
5584 if (ncopies > 1)
5586 if (dump_enabled_p ())
5587 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5588 "multiple types in nested loop.\n");
5589 return false;
5592 exit_phi = NULL;
5593 latch_e = loop_latch_edge (loop->inner);
5594 loop_arg = PHI_ARG_DEF_FROM_EDGE (phi, latch_e);
5595 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, loop_arg)
5597 gimple use_stmt = USE_STMT (use_p);
5598 if (is_gimple_debug (use_stmt))
5599 continue;
5601 if (!flow_bb_inside_loop_p (loop->inner, gimple_bb (use_stmt)))
5603 exit_phi = use_stmt;
5604 break;
5607 if (exit_phi)
5609 stmt_vec_info exit_phi_vinfo = vinfo_for_stmt (exit_phi);
5610 if (!(STMT_VINFO_RELEVANT_P (exit_phi_vinfo)
5611 && !STMT_VINFO_LIVE_P (exit_phi_vinfo)))
5613 if (dump_enabled_p ())
5614 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5615 "inner-loop induction only used outside "
5616 "of the outer vectorized loop.\n");
5617 return false;
5622 if (!STMT_VINFO_RELEVANT_P (stmt_info))
5623 return false;
5625 /* FORNOW: SLP not supported. */
5626 if (STMT_SLP_TYPE (stmt_info))
5627 return false;
5629 gcc_assert (STMT_VINFO_DEF_TYPE (stmt_info) == vect_induction_def);
5631 if (gimple_code (phi) != GIMPLE_PHI)
5632 return false;
5634 if (!vec_stmt) /* transformation not required. */
5636 STMT_VINFO_TYPE (stmt_info) = induc_vec_info_type;
5637 if (dump_enabled_p ())
5638 dump_printf_loc (MSG_NOTE, vect_location,
5639 "=== vectorizable_induction ===\n");
5640 vect_model_induction_cost (stmt_info, ncopies);
5641 return true;
5644 /** Transform. **/
5646 if (dump_enabled_p ())
5647 dump_printf_loc (MSG_NOTE, vect_location, "transform induction phi.\n");
5649 vec_def = get_initial_def_for_induction (phi);
5650 *vec_stmt = SSA_NAME_DEF_STMT (vec_def);
5651 return true;
5654 /* Function vectorizable_live_operation.
5656 STMT computes a value that is used outside the loop. Check if
5657 it can be supported. */
5659 bool
5660 vectorizable_live_operation (gimple stmt,
5661 gimple_stmt_iterator *gsi ATTRIBUTE_UNUSED,
5662 gimple *vec_stmt)
5664 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
5665 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
5666 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
5667 int i;
5668 int op_type;
5669 tree op;
5670 tree def;
5671 gimple def_stmt;
5672 enum vect_def_type dt;
5673 enum tree_code code;
5674 enum gimple_rhs_class rhs_class;
5676 gcc_assert (STMT_VINFO_LIVE_P (stmt_info));
5678 if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_reduction_def)
5679 return false;
5681 if (!is_gimple_assign (stmt))
5683 if (gimple_call_internal_p (stmt)
5684 && gimple_call_internal_fn (stmt) == IFN_GOMP_SIMD_LANE
5685 && gimple_call_lhs (stmt)
5686 && loop->simduid
5687 && TREE_CODE (gimple_call_arg (stmt, 0)) == SSA_NAME
5688 && loop->simduid
5689 == SSA_NAME_VAR (gimple_call_arg (stmt, 0)))
5691 edge e = single_exit (loop);
5692 basic_block merge_bb = e->dest;
5693 imm_use_iterator imm_iter;
5694 use_operand_p use_p;
5695 tree lhs = gimple_call_lhs (stmt);
5697 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, lhs)
5699 gimple use_stmt = USE_STMT (use_p);
5700 if (gimple_code (use_stmt) == GIMPLE_PHI
5701 && gimple_bb (use_stmt) == merge_bb)
5703 if (vec_stmt)
5705 tree vfm1
5706 = build_int_cst (unsigned_type_node,
5707 loop_vinfo->vectorization_factor - 1);
5708 SET_PHI_ARG_DEF (use_stmt, e->dest_idx, vfm1);
5710 return true;
5715 return false;
5718 if (TREE_CODE (gimple_assign_lhs (stmt)) != SSA_NAME)
5719 return false;
5721 /* FORNOW. CHECKME. */
5722 if (nested_in_vect_loop_p (loop, stmt))
5723 return false;
5725 code = gimple_assign_rhs_code (stmt);
5726 op_type = TREE_CODE_LENGTH (code);
5727 rhs_class = get_gimple_rhs_class (code);
5728 gcc_assert (rhs_class != GIMPLE_UNARY_RHS || op_type == unary_op);
5729 gcc_assert (rhs_class != GIMPLE_BINARY_RHS || op_type == binary_op);
5731 /* FORNOW: support only if all uses are invariant. This means
5732 that the scalar operations can remain in place, unvectorized.
5733 The original last scalar value that they compute will be used. */
5735 for (i = 0; i < op_type; i++)
5737 if (rhs_class == GIMPLE_SINGLE_RHS)
5738 op = TREE_OPERAND (gimple_op (stmt, 1), i);
5739 else
5740 op = gimple_op (stmt, i + 1);
5741 if (op
5742 && !vect_is_simple_use (op, stmt, loop_vinfo, NULL, &def_stmt, &def,
5743 &dt))
5745 if (dump_enabled_p ())
5746 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5747 "use not simple.\n");
5748 return false;
5751 if (dt != vect_external_def && dt != vect_constant_def)
5752 return false;
5755 /* No transformation is required for the cases we currently support. */
5756 return true;
5759 /* Kill any debug uses outside LOOP of SSA names defined in STMT. */
5761 static void
5762 vect_loop_kill_debug_uses (struct loop *loop, gimple stmt)
5764 ssa_op_iter op_iter;
5765 imm_use_iterator imm_iter;
5766 def_operand_p def_p;
5767 gimple ustmt;
5769 FOR_EACH_PHI_OR_STMT_DEF (def_p, stmt, op_iter, SSA_OP_DEF)
5771 FOR_EACH_IMM_USE_STMT (ustmt, imm_iter, DEF_FROM_PTR (def_p))
5773 basic_block bb;
5775 if (!is_gimple_debug (ustmt))
5776 continue;
5778 bb = gimple_bb (ustmt);
5780 if (!flow_bb_inside_loop_p (loop, bb))
5782 if (gimple_debug_bind_p (ustmt))
5784 if (dump_enabled_p ())
5785 dump_printf_loc (MSG_NOTE, vect_location,
5786 "killing debug use\n");
5788 gimple_debug_bind_reset_value (ustmt);
5789 update_stmt (ustmt);
5791 else
5792 gcc_unreachable ();
5799 /* This function builds ni_name = number of iterations. Statements
5800 are emitted on the loop preheader edge. */
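/* For a symbolic iteration count, e.g. n + 1, the expression is gimplified
   into a fresh "niters" variable whose defining statements are inserted on
   the preheader edge; a constant count is simply returned as-is.  */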
5802 static tree
5803 vect_build_loop_niters (loop_vec_info loop_vinfo)
5805 tree ni = unshare_expr (LOOP_VINFO_NITERS (loop_vinfo));
5806 if (TREE_CODE (ni) == INTEGER_CST)
5807 return ni;
5808 else
5810 tree ni_name, var;
5811 gimple_seq stmts = NULL;
5812 edge pe = loop_preheader_edge (LOOP_VINFO_LOOP (loop_vinfo));
5814 var = create_tmp_var (TREE_TYPE (ni), "niters");
5815 ni_name = force_gimple_operand (ni, &stmts, false, var);
5816 if (stmts)
5817 gsi_insert_seq_on_edge_immediate (pe, stmts);
5819 return ni_name;
5824 /* This function generates the following statements:
5826 ni_name = number of iterations the loop executes
5827 ratio = ni_name / vf
5828 ratio_mult_vf_name = ratio * vf
5830 and places them on the loop preheader edge. */
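/* For instance, with vf = 4 and a (possibly symbolic) iteration count of 10,
   ratio is 2 and ratio_mult_vf_name is 8, i.e. the number of iterations
   executed by the vectorized loop.  */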
5832 static void
5833 vect_generate_tmps_on_preheader (loop_vec_info loop_vinfo,
5834 tree ni_name,
5835 tree *ratio_mult_vf_name_ptr,
5836 tree *ratio_name_ptr)
5838 tree ni_minus_gap_name;
5839 tree var;
5840 tree ratio_name;
5841 tree ratio_mult_vf_name;
5842 int vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
5843 edge pe = loop_preheader_edge (LOOP_VINFO_LOOP (loop_vinfo));
5844 tree log_vf;
5846 log_vf = build_int_cst (TREE_TYPE (ni_name), exact_log2 (vf));
5848 /* If an epilogue loop is required because of data accesses with gaps, we
5849 subtract one iteration from the total number of iterations here for
5850 correct calculation of RATIO. */
5851 if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo))
5853 ni_minus_gap_name = fold_build2 (MINUS_EXPR, TREE_TYPE (ni_name),
5854 ni_name,
5855 build_one_cst (TREE_TYPE (ni_name)));
5856 if (!is_gimple_val (ni_minus_gap_name))
5858 var = create_tmp_var (TREE_TYPE (ni_name), "ni_gap");
5859 gimple stmts = NULL;
5860 ni_minus_gap_name = force_gimple_operand (ni_minus_gap_name, &stmts,
5861 true, var);
5862 gsi_insert_seq_on_edge_immediate (pe, stmts);
5865 else
5866 ni_minus_gap_name = ni_name;
5868 /* Create: ratio = ni >> log2(vf) */
5869 /* ??? As we have ni == number of latch executions + 1, ni could
5870 have overflowed to zero. So avoid computing ratio based on ni
5871 but compute it using the fact that we know ratio will be at least
5872 one, thus via (ni - vf) >> log2(vf) + 1. */
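/* E.g. with vf = 8 (log_vf = 3) and ni = 16 this computes
   ((16 - 8) >> 3) + 1 = 2, the same as 16 >> 3, but without relying on
   ni itself not having wrapped around to zero.  */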
5873 ratio_name
5874 = fold_build2 (PLUS_EXPR, TREE_TYPE (ni_name),
5875 fold_build2 (RSHIFT_EXPR, TREE_TYPE (ni_name),
5876 fold_build2 (MINUS_EXPR, TREE_TYPE (ni_name),
5877 ni_minus_gap_name,
5878 build_int_cst
5879 (TREE_TYPE (ni_name), vf)),
5880 log_vf),
5881 build_int_cst (TREE_TYPE (ni_name), 1));
5882 if (!is_gimple_val (ratio_name))
5884 var = create_tmp_var (TREE_TYPE (ni_name), "bnd");
5885 gimple stmts = NULL;
5886 ratio_name = force_gimple_operand (ratio_name, &stmts, true, var);
5887 gsi_insert_seq_on_edge_immediate (pe, stmts);
5889 *ratio_name_ptr = ratio_name;
5891 /* Create: ratio_mult_vf = ratio << log2 (vf). */
5893 if (ratio_mult_vf_name_ptr)
5895 ratio_mult_vf_name = fold_build2 (LSHIFT_EXPR, TREE_TYPE (ratio_name),
5896 ratio_name, log_vf);
5897 if (!is_gimple_val (ratio_mult_vf_name))
5899 var = create_tmp_var (TREE_TYPE (ni_name), "ratio_mult_vf");
5900 gimple stmts = NULL;
5901 ratio_mult_vf_name = force_gimple_operand (ratio_mult_vf_name, &stmts,
5902 true, var);
5903 gsi_insert_seq_on_edge_immediate (pe, stmts);
5905 *ratio_mult_vf_name_ptr = ratio_mult_vf_name;
5908 return;
5912 /* Function vect_transform_loop.
5914 The analysis phase has determined that the loop is vectorizable.
5915 Vectorize the loop - create vectorized stmts to replace the scalar
5916 stmts in the loop, and update the loop exit condition. */
5918 void
5919 vect_transform_loop (loop_vec_info loop_vinfo)
5921 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
5922 basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
5923 int nbbs = loop->num_nodes;
5924 int i;
5925 tree ratio = NULL;
5926 int vectorization_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
5927 bool grouped_store;
5928 bool slp_scheduled = false;
5929 gimple stmt, pattern_stmt;
5930 gimple_seq pattern_def_seq = NULL;
5931 gimple_stmt_iterator pattern_def_si = gsi_none ();
5932 bool transform_pattern_stmt = false;
5933 bool check_profitability = false;
5934 int th;
5935 /* Record number of iterations before we started tampering with the profile. */
5936 gcov_type expected_iterations = expected_loop_iterations_unbounded (loop);
5938 if (dump_enabled_p ())
5939 dump_printf_loc (MSG_NOTE, vect_location, "=== vec_transform_loop ===\n");
5941 /* If the profile is imprecise, we have a chance to fix it up. */
5942 if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
5943 expected_iterations = LOOP_VINFO_INT_NITERS (loop_vinfo);
5945 /* Use the more conservative vectorization threshold. If the number
5946 of iterations is constant, assume the cost check has been performed
5947 by our caller. If the threshold makes all loops that run at least
5948 the vectorization factor number of times profitable, checking
5949 is pointless, too. */
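/* E.g. with a vectorization factor of 8, a runtime profitability check is
   generated only when the threshold is at least 7 and the iteration count
   is not known at compile time.  */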
5950 th = LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo);
5951 if (th >= LOOP_VINFO_VECT_FACTOR (loop_vinfo) - 1
5952 && !LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
5954 if (dump_enabled_p ())
5955 dump_printf_loc (MSG_NOTE, vect_location,
5956 "Profitability threshold is %d loop iterations.\n",
5957 th);
5958 check_profitability = true;
5961 /* Version the loop first, if required, so the profitability check
5962 comes first. */
5964 if (LOOP_REQUIRES_VERSIONING_FOR_ALIGNMENT (loop_vinfo)
5965 || LOOP_REQUIRES_VERSIONING_FOR_ALIAS (loop_vinfo))
5967 vect_loop_versioning (loop_vinfo, th, check_profitability);
5968 check_profitability = false;
5971 tree ni_name = vect_build_loop_niters (loop_vinfo);
5972 LOOP_VINFO_NITERS_UNCHANGED (loop_vinfo) = ni_name;
5974 /* Peel the loop if there are data refs with unknown alignment.
5975 Only one data ref with unknown alignment is allowed.
5977 if (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo))
5979 vect_do_peeling_for_alignment (loop_vinfo, ni_name,
5980 th, check_profitability);
5981 check_profitability = false;
5982 /* The above adjusts LOOP_VINFO_NITERS, so cause ni_name to
5983 be re-computed. */
5984 ni_name = NULL_TREE;
5987 /* If the loop has a symbolic number of iterations 'n' (i.e. it's not a
5988 compile-time constant), or it is a constant that is not a multiple of the
5989 vectorization factor, then an epilog loop needs to be created.
5990 We therefore duplicate the loop: the original loop will be vectorized,
5991 and will compute the first (n/VF) iterations. The second copy of the loop
5992 will remain scalar and will compute the remaining (n%VF) iterations.
5993 (VF is the vectorization factor). */
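/* Conceptually, for a loop over n elements and VF = 4 the result is:

     for (i = 0; i < (n/4)*4; i += 4)    // vectorized loop: n/VF iterations
       ...vector stmts...
     for ( ; i < n; i++)                 // scalar epilogue: n%VF iterations
       ...original scalar stmts...                                          */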
5995 if (LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo)
5996 || LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo))
5998 tree ratio_mult_vf;
5999 if (!ni_name)
6000 ni_name = vect_build_loop_niters (loop_vinfo);
6001 vect_generate_tmps_on_preheader (loop_vinfo, ni_name, &ratio_mult_vf,
6002 &ratio);
6003 vect_do_peeling_for_loop_bound (loop_vinfo, ni_name, ratio_mult_vf,
6004 th, check_profitability);
6006 else if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
6007 ratio = build_int_cst (TREE_TYPE (LOOP_VINFO_NITERS (loop_vinfo)),
6008 LOOP_VINFO_INT_NITERS (loop_vinfo) / vectorization_factor);
6009 else
6011 if (!ni_name)
6012 ni_name = vect_build_loop_niters (loop_vinfo);
6013 vect_generate_tmps_on_preheader (loop_vinfo, ni_name, NULL, &ratio);
6016 /* 1) Make sure the loop header has exactly two entries
6017 2) Make sure we have a preheader basic block. */
6019 gcc_assert (EDGE_COUNT (loop->header->preds) == 2);
6021 split_edge (loop_preheader_edge (loop));
6023 /* FORNOW: the vectorizer supports only loops whose body consists
6024 of one basic block (header + empty latch). When the vectorizer
6025 supports more involved loop forms, the order in which the BBs are
6026 traversed will need to be reconsidered. */
6028 for (i = 0; i < nbbs; i++)
6030 basic_block bb = bbs[i];
6031 stmt_vec_info stmt_info;
6033 for (gphi_iterator si = gsi_start_phis (bb); !gsi_end_p (si);
6034 gsi_next (&si))
6036 gphi *phi = si.phi ();
6037 if (dump_enabled_p ())
6039 dump_printf_loc (MSG_NOTE, vect_location,
6040 "------>vectorizing phi: ");
6041 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, phi, 0);
6042 dump_printf (MSG_NOTE, "\n");
6044 stmt_info = vinfo_for_stmt (phi);
6045 if (!stmt_info)
6046 continue;
6048 if (MAY_HAVE_DEBUG_STMTS && !STMT_VINFO_LIVE_P (stmt_info))
6049 vect_loop_kill_debug_uses (loop, phi);
6051 if (!STMT_VINFO_RELEVANT_P (stmt_info)
6052 && !STMT_VINFO_LIVE_P (stmt_info))
6053 continue;
6055 if (STMT_VINFO_VECTYPE (stmt_info)
6056 && (TYPE_VECTOR_SUBPARTS (STMT_VINFO_VECTYPE (stmt_info))
6057 != (unsigned HOST_WIDE_INT) vectorization_factor)
6058 && dump_enabled_p ())
6059 dump_printf_loc (MSG_NOTE, vect_location, "multiple-types.\n");
6061 if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_induction_def)
6063 if (dump_enabled_p ())
6064 dump_printf_loc (MSG_NOTE, vect_location, "transform phi.\n");
6065 vect_transform_stmt (phi, NULL, NULL, NULL, NULL);
6069 pattern_stmt = NULL;
6070 for (gimple_stmt_iterator si = gsi_start_bb (bb);
6071 !gsi_end_p (si) || transform_pattern_stmt;)
6073 bool is_store;
6075 if (transform_pattern_stmt)
6076 stmt = pattern_stmt;
6077 else
6079 stmt = gsi_stmt (si);
6080 /* During vectorization remove existing clobber stmts. */
6081 if (gimple_clobber_p (stmt))
6083 unlink_stmt_vdef (stmt);
6084 gsi_remove (&si, true);
6085 release_defs (stmt);
6086 continue;
6090 if (dump_enabled_p ())
6092 dump_printf_loc (MSG_NOTE, vect_location,
6093 "------>vectorizing statement: ");
6094 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, stmt, 0);
6095 dump_printf (MSG_NOTE, "\n");
6098 stmt_info = vinfo_for_stmt (stmt);
6100 /* vector stmts created in the outer-loop during vectorization of
6101 stmts in an inner-loop may not have a stmt_info, and do not
6102 need to be vectorized. */
6103 if (!stmt_info)
6105 gsi_next (&si);
6106 continue;
6109 if (MAY_HAVE_DEBUG_STMTS && !STMT_VINFO_LIVE_P (stmt_info))
6110 vect_loop_kill_debug_uses (loop, stmt);
6112 if (!STMT_VINFO_RELEVANT_P (stmt_info)
6113 && !STMT_VINFO_LIVE_P (stmt_info))
6115 if (STMT_VINFO_IN_PATTERN_P (stmt_info)
6116 && (pattern_stmt = STMT_VINFO_RELATED_STMT (stmt_info))
6117 && (STMT_VINFO_RELEVANT_P (vinfo_for_stmt (pattern_stmt))
6118 || STMT_VINFO_LIVE_P (vinfo_for_stmt (pattern_stmt))))
6120 stmt = pattern_stmt;
6121 stmt_info = vinfo_for_stmt (stmt);
6123 else
6125 gsi_next (&si);
6126 continue;
6129 else if (STMT_VINFO_IN_PATTERN_P (stmt_info)
6130 && (pattern_stmt = STMT_VINFO_RELATED_STMT (stmt_info))
6131 && (STMT_VINFO_RELEVANT_P (vinfo_for_stmt (pattern_stmt))
6132 || STMT_VINFO_LIVE_P (vinfo_for_stmt (pattern_stmt))))
6133 transform_pattern_stmt = true;
6135 /* If pattern statement has def stmts, vectorize them too. */
6136 if (is_pattern_stmt_p (stmt_info))
6138 if (pattern_def_seq == NULL)
6140 pattern_def_seq = STMT_VINFO_PATTERN_DEF_SEQ (stmt_info);
6141 pattern_def_si = gsi_start (pattern_def_seq);
6143 else if (!gsi_end_p (pattern_def_si))
6144 gsi_next (&pattern_def_si);
6145 if (pattern_def_seq != NULL)
6147 gimple pattern_def_stmt = NULL;
6148 stmt_vec_info pattern_def_stmt_info = NULL;
6150 while (!gsi_end_p (pattern_def_si))
6152 pattern_def_stmt = gsi_stmt (pattern_def_si);
6153 pattern_def_stmt_info
6154 = vinfo_for_stmt (pattern_def_stmt);
6155 if (STMT_VINFO_RELEVANT_P (pattern_def_stmt_info)
6156 || STMT_VINFO_LIVE_P (pattern_def_stmt_info))
6157 break;
6158 gsi_next (&pattern_def_si);
6161 if (!gsi_end_p (pattern_def_si))
6163 if (dump_enabled_p ())
6165 dump_printf_loc (MSG_NOTE, vect_location,
6166 "==> vectorizing pattern def "
6167 "stmt: ");
6168 dump_gimple_stmt (MSG_NOTE, TDF_SLIM,
6169 pattern_def_stmt, 0);
6170 dump_printf (MSG_NOTE, "\n");
6173 stmt = pattern_def_stmt;
6174 stmt_info = pattern_def_stmt_info;
6176 else
6178 pattern_def_si = gsi_none ();
6179 transform_pattern_stmt = false;
6182 else
6183 transform_pattern_stmt = false;
6186 if (STMT_VINFO_VECTYPE (stmt_info))
6188 unsigned int nunits
6189 = (unsigned int)
6190 TYPE_VECTOR_SUBPARTS (STMT_VINFO_VECTYPE (stmt_info));
6191 if (!STMT_SLP_TYPE (stmt_info)
6192 && nunits != (unsigned int) vectorization_factor
6193 && dump_enabled_p ())
6194 /* For SLP, VF is set according to the unrolling factor, and not
6195 to the vector size, hence for SLP this message is not valid. */
6196 dump_printf_loc (MSG_NOTE, vect_location, "multiple-types.\n");
6199 /* SLP. Schedule all the SLP instances when the first SLP stmt is
6200 reached. */
6201 if (STMT_SLP_TYPE (stmt_info))
6203 if (!slp_scheduled)
6205 slp_scheduled = true;
6207 if (dump_enabled_p ())
6208 dump_printf_loc (MSG_NOTE, vect_location,
6209 "=== scheduling SLP instances ===\n");
6211 vect_schedule_slp (loop_vinfo, NULL);
6214 /* Hybrid SLP stmts must be vectorized in addition to SLP. */
6215 if (!vinfo_for_stmt (stmt) || PURE_SLP_STMT (stmt_info))
6217 if (!transform_pattern_stmt && gsi_end_p (pattern_def_si))
6219 pattern_def_seq = NULL;
6220 gsi_next (&si);
6222 continue;
6226 /* -------- vectorize statement ------------ */
6227 if (dump_enabled_p ())
6228 dump_printf_loc (MSG_NOTE, vect_location, "transform statement.\n");
6230 grouped_store = false;
6231 is_store = vect_transform_stmt (stmt, &si, &grouped_store, NULL, NULL);
6232 if (is_store)
6234 if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
6236 /* Interleaving. If IS_STORE is TRUE, the vectorization of the
6237 interleaving chain was completed - free all the stores in
6238 the chain. */
6239 gsi_next (&si);
6240 vect_remove_stores (GROUP_FIRST_ELEMENT (stmt_info));
6242 else
6244 /* Free the attached stmt_vec_info and remove the stmt. */
6245 gimple store = gsi_stmt (si);
6246 free_stmt_vec_info (store);
6247 unlink_stmt_vdef (store);
6248 gsi_remove (&si, true);
6249 release_defs (store);
6252 /* Stores can only appear at the end of pattern statements. */
6253 gcc_assert (!transform_pattern_stmt);
6254 pattern_def_seq = NULL;
6256 else if (!transform_pattern_stmt && gsi_end_p (pattern_def_si))
6258 pattern_def_seq = NULL;
6259 gsi_next (&si);
6261 } /* stmts in BB */
6262 } /* BBs in loop */
6264 slpeel_make_loop_iterate_ntimes (loop, ratio);
6266 /* Reduce loop iterations by the vectorization factor. */
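/* E.g. with a vectorization factor of 4, an expected iteration count of 100
   becomes 25, and the recorded upper bound and estimate are likewise divided
   by 4 (with one further iteration subtracted when peeling for gaps).  */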
6267 scale_loop_profile (loop, GCOV_COMPUTE_SCALE (1, vectorization_factor),
6268 expected_iterations / vectorization_factor);
6269 loop->nb_iterations_upper_bound
6270 = wi::udiv_floor (loop->nb_iterations_upper_bound, vectorization_factor);
6271 if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo)
6272 && loop->nb_iterations_upper_bound != 0)
6273 loop->nb_iterations_upper_bound = loop->nb_iterations_upper_bound - 1;
6274 if (loop->any_estimate)
6276 loop->nb_iterations_estimate
6277 = wi::udiv_floor (loop->nb_iterations_estimate, vectorization_factor);
6278 if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo)
6279 && loop->nb_iterations_estimate != 0)
6280 loop->nb_iterations_estimate = loop->nb_iterations_estimate - 1;
6283 if (dump_enabled_p ())
6285 dump_printf_loc (MSG_NOTE, vect_location,
6286 "LOOP VECTORIZED\n");
6287 if (loop->inner)
6288 dump_printf_loc (MSG_NOTE, vect_location,
6289 "OUTER LOOP VECTORIZED\n");
6290 dump_printf (MSG_NOTE, "\n");