1 /* Loop Vectorization
2 Copyright (C) 2003-2015 Free Software Foundation, Inc.
3 Contributed by Dorit Naishlos <dorit@il.ibm.com> and
4 Ira Rosen <irar@il.ibm.com>
6 This file is part of GCC.
8 GCC is free software; you can redistribute it and/or modify it under
9 the terms of the GNU General Public License as published by the Free
10 Software Foundation; either version 3, or (at your option) any later
11 version.
13 GCC is distributed in the hope that it will be useful, but WITHOUT ANY
14 WARRANTY; without even the implied warranty of MERCHANTABILITY or
15 FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
16 for more details.
18 You should have received a copy of the GNU General Public License
19 along with GCC; see the file COPYING3. If not see
20 <http://www.gnu.org/licenses/>. */
22 #include "config.h"
23 #include "system.h"
24 #include "coretypes.h"
25 #include "dumpfile.h"
26 #include "backend.h"
27 #include "cfghooks.h"
28 #include "tree.h"
29 #include "gimple.h"
30 #include "rtl.h"
31 #include "ssa.h"
32 #include "alias.h"
33 #include "fold-const.h"
34 #include "stor-layout.h"
35 #include "cfganal.h"
36 #include "gimple-pretty-print.h"
37 #include "internal-fn.h"
38 #include "gimplify.h"
39 #include "gimple-iterator.h"
40 #include "gimplify-me.h"
41 #include "tree-ssa-loop-ivopts.h"
42 #include "tree-ssa-loop-manip.h"
43 #include "tree-ssa-loop-niter.h"
44 #include "tree-pass.h"
45 #include "cfgloop.h"
46 #include "flags.h"
47 #include "insn-config.h"
48 #include "expmed.h"
49 #include "dojump.h"
50 #include "explow.h"
51 #include "calls.h"
52 #include "emit-rtl.h"
53 #include "varasm.h"
54 #include "stmt.h"
55 #include "expr.h"
56 #include "recog.h"
57 #include "insn-codes.h"
58 #include "optabs.h"
59 #include "params.h"
60 #include "diagnostic-core.h"
61 #include "tree-chrec.h"
62 #include "tree-scalar-evolution.h"
63 #include "tree-vectorizer.h"
64 #include "target.h"
66 /* Loop Vectorization Pass.
68 This pass tries to vectorize loops.
70 For example, the vectorizer transforms the following simple loop:
72 short a[N]; short b[N]; short c[N]; int i;
74 for (i=0; i<N; i++){
75 a[i] = b[i] + c[i];
78 as if it had been manually vectorized by rewriting the source code into:
80 typedef int __attribute__((mode(V8HI))) v8hi;
81 short a[N]; short b[N]; short c[N]; int i;
82 v8hi *pa = (v8hi*)a, *pb = (v8hi*)b, *pc = (v8hi*)c;
83 v8hi va, vb, vc;
85 for (i=0; i<N/8; i++){
86 vb = pb[i];
87 vc = pc[i];
88 va = vb + vc;
89 pa[i] = va;
92 The main entry to this pass is vectorize_loops(), in which
93 the vectorizer applies a set of analyses on a given set of loops,
94 followed by the actual vectorization transformation for the loops that
95 had successfully passed the analysis phase.
96 Throughout this pass we make a distinction between two types of
97 data: scalars (which are represented by SSA_NAMES), and memory references
98 ("data-refs"). These two types of data require different handling both
99 during analysis and transformation. The types of data-refs that the
100 vectorizer currently supports are ARRAY_REFS whose base is an array DECL
101 (not a pointer), and INDIRECT_REFS through pointers; both array and pointer
102 accesses are required to have a simple (consecutive) access pattern.
104 Analysis phase:
105 ===============
106 The driver for the analysis phase is vect_analyze_loop().
107 It applies a set of analyses, some of which rely on the scalar evolution
108 analyzer (scev) developed by Sebastian Pop.
110 During the analysis phase the vectorizer records some information
111 per stmt in a "stmt_vec_info" struct which is attached to each stmt in the
112 loop, as well as general information about the loop as a whole, which is
113 recorded in a "loop_vec_info" struct attached to each loop.
115 Transformation phase:
116 =====================
117 The loop transformation phase scans all the stmts in the loop, and
118 creates a vector stmt (or a sequence of stmts) for each scalar stmt S in
119 the loop that needs to be vectorized. It inserts the vector code sequence
120 just before the scalar stmt S, and records a pointer to the vector code
121 in STMT_VINFO_VEC_STMT (stmt_info) (stmt_info is the stmt_vec_info struct
122 attached to S). This pointer will be used for the vectorization of following
123 stmts which use the def of stmt S. Stmt S is removed if it writes to memory;
124 otherwise, we rely on dead code elimination for removing it.
126 For example, say stmt S1 was vectorized into stmt VS1:
128 VS1: vb = px[i];
129 S1: b = x[i]; STMT_VINFO_VEC_STMT (stmt_info (S1)) = VS1
130 S2: a = b;
132 To vectorize stmt S2, the vectorizer first finds the stmt that defines
133 the operand 'b' (S1), and gets the relevant vector def 'vb' from the
134 vector stmt VS1 pointed to by STMT_VINFO_VEC_STMT (stmt_info (S1)). The
135 resulting sequence would be:
137 VS1: vb = px[i];
138 S1: b = x[i]; STMT_VINFO_VEC_STMT (stmt_info (S1)) = VS1
139 VS2: va = vb;
140 S2: a = b; STMT_VINFO_VEC_STMT (stmt_info (S2)) = VS2
142 Operands that are not SSA_NAMEs are data-refs that appear in
143 load/store operations (like 'x[i]' in S1), and are handled differently.
145 Target modeling:
146 =================
147 Currently the only target specific information that is used is the
148 size of the vector (in bytes) - "TARGET_VECTORIZE_UNITS_PER_SIMD_WORD".
149 Targets that can support different vector sizes will, for now, need
150 to specify one value for "TARGET_VECTORIZE_UNITS_PER_SIMD_WORD". More
151 flexibility will be added in the future.
153 Since we only vectorize operations whose vector form can be
154 expressed using existing tree codes, to verify that an operation is
155 supported, the vectorizer checks the relevant optab at the relevant
156 machine_mode (e.g., optab_handler (add_optab, V8HImode)). If
157 the value found is CODE_FOR_nothing, then there's no target support, and
158 we can't vectorize the stmt (a sketch of this check follows this comment).
160 For additional information on this project see:
161 http://gcc.gnu.org/projects/tree-ssa/vectorization.html
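/* Illustration only (a sketch of the optab check described above, not
   code used by this pass): asking whether the target can add two
   V8HImode vectors amounts to

     optab_handler (add_optab, V8HImode) != CODE_FOR_nothing

   where CODE_FOR_nothing means the target provides no instruction
   pattern for that operation in that mode, so such a stmt cannot be
   vectorized.  */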
164 static void vect_estimate_min_profitable_iters (loop_vec_info, int *, int *);
166 /* Function vect_determine_vectorization_factor
168 Determine the vectorization factor (VF). VF is the number of data elements
169 that are operated upon in parallel in a single iteration of the vectorized
170 loop. For example, when vectorizing a loop that operates on 4-byte elements,
171 on a target with a vector size (VS) of 16 bytes, the VF is set to 4, since 4
172 elements can fit in a single vector register.
174 We currently support vectorization of loops in which all types operated upon
175 are of the same size. Therefore this function currently sets VF according to
176 the size of the types operated upon, and fails if there are multiple sizes
177 in the loop.
179 VF is also the factor by which the loop iterations are strip-mined, e.g.:
180 original loop:
181 for (i=0; i<N; i++){
182 a[i] = b[i] + c[i];
185 vectorized loop:
186 for (i=0; i<N; i+=VF){
187 a[i:VF] = b[i:VF] + c[i:VF];
191 static bool
192 vect_determine_vectorization_factor (loop_vec_info loop_vinfo)
194 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
195 basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
196 int nbbs = loop->num_nodes;
197 unsigned int vectorization_factor = 0;
198 tree scalar_type;
199 gphi *phi;
200 tree vectype;
201 unsigned int nunits;
202 stmt_vec_info stmt_info;
203 int i;
204 HOST_WIDE_INT dummy;
205 gimple stmt, pattern_stmt = NULL;
206 gimple_seq pattern_def_seq = NULL;
207 gimple_stmt_iterator pattern_def_si = gsi_none ();
208 bool analyze_pattern_stmt = false;
210 if (dump_enabled_p ())
211 dump_printf_loc (MSG_NOTE, vect_location,
212 "=== vect_determine_vectorization_factor ===\n");
214 for (i = 0; i < nbbs; i++)
216 basic_block bb = bbs[i];
218 for (gphi_iterator si = gsi_start_phis (bb); !gsi_end_p (si);
219 gsi_next (&si))
221 phi = si.phi ();
222 stmt_info = vinfo_for_stmt (phi);
223 if (dump_enabled_p ())
225 dump_printf_loc (MSG_NOTE, vect_location, "==> examining phi: ");
226 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, phi, 0);
227 dump_printf (MSG_NOTE, "\n");
230 gcc_assert (stmt_info);
232 if (STMT_VINFO_RELEVANT_P (stmt_info))
234 gcc_assert (!STMT_VINFO_VECTYPE (stmt_info));
235 scalar_type = TREE_TYPE (PHI_RESULT (phi));
237 if (dump_enabled_p ())
239 dump_printf_loc (MSG_NOTE, vect_location,
240 "get vectype for scalar type: ");
241 dump_generic_expr (MSG_NOTE, TDF_SLIM, scalar_type);
242 dump_printf (MSG_NOTE, "\n");
245 vectype = get_vectype_for_scalar_type (scalar_type);
246 if (!vectype)
248 if (dump_enabled_p ())
250 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
251 "not vectorized: unsupported "
252 "data-type ");
253 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
254 scalar_type);
255 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
257 return false;
259 STMT_VINFO_VECTYPE (stmt_info) = vectype;
261 if (dump_enabled_p ())
263 dump_printf_loc (MSG_NOTE, vect_location, "vectype: ");
264 dump_generic_expr (MSG_NOTE, TDF_SLIM, vectype);
265 dump_printf (MSG_NOTE, "\n");
268 nunits = TYPE_VECTOR_SUBPARTS (vectype);
269 if (dump_enabled_p ())
270 dump_printf_loc (MSG_NOTE, vect_location, "nunits = %d\n",
271 nunits);
273 if (!vectorization_factor
274 || (nunits > vectorization_factor))
275 vectorization_factor = nunits;
279 for (gimple_stmt_iterator si = gsi_start_bb (bb);
280 !gsi_end_p (si) || analyze_pattern_stmt;)
282 tree vf_vectype;
284 if (analyze_pattern_stmt)
285 stmt = pattern_stmt;
286 else
287 stmt = gsi_stmt (si);
289 stmt_info = vinfo_for_stmt (stmt);
291 if (dump_enabled_p ())
293 dump_printf_loc (MSG_NOTE, vect_location,
294 "==> examining statement: ");
295 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, stmt, 0);
296 dump_printf (MSG_NOTE, "\n");
299 gcc_assert (stmt_info);
301 /* Skip stmts which do not need to be vectorized. */
302 if ((!STMT_VINFO_RELEVANT_P (stmt_info)
303 && !STMT_VINFO_LIVE_P (stmt_info))
304 || gimple_clobber_p (stmt))
306 if (STMT_VINFO_IN_PATTERN_P (stmt_info)
307 && (pattern_stmt = STMT_VINFO_RELATED_STMT (stmt_info))
308 && (STMT_VINFO_RELEVANT_P (vinfo_for_stmt (pattern_stmt))
309 || STMT_VINFO_LIVE_P (vinfo_for_stmt (pattern_stmt))))
311 stmt = pattern_stmt;
312 stmt_info = vinfo_for_stmt (pattern_stmt);
313 if (dump_enabled_p ())
315 dump_printf_loc (MSG_NOTE, vect_location,
316 "==> examining pattern statement: ");
317 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, stmt, 0);
318 dump_printf (MSG_NOTE, "\n");
321 else
323 if (dump_enabled_p ())
324 dump_printf_loc (MSG_NOTE, vect_location, "skip.\n");
325 gsi_next (&si);
326 continue;
329 else if (STMT_VINFO_IN_PATTERN_P (stmt_info)
330 && (pattern_stmt = STMT_VINFO_RELATED_STMT (stmt_info))
331 && (STMT_VINFO_RELEVANT_P (vinfo_for_stmt (pattern_stmt))
332 || STMT_VINFO_LIVE_P (vinfo_for_stmt (pattern_stmt))))
333 analyze_pattern_stmt = true;
335 /* If a pattern statement has def stmts, analyze them too. */
336 if (is_pattern_stmt_p (stmt_info))
338 if (pattern_def_seq == NULL)
340 pattern_def_seq = STMT_VINFO_PATTERN_DEF_SEQ (stmt_info);
341 pattern_def_si = gsi_start (pattern_def_seq);
343 else if (!gsi_end_p (pattern_def_si))
344 gsi_next (&pattern_def_si);
345 if (pattern_def_seq != NULL)
347 gimple pattern_def_stmt = NULL;
348 stmt_vec_info pattern_def_stmt_info = NULL;
350 while (!gsi_end_p (pattern_def_si))
352 pattern_def_stmt = gsi_stmt (pattern_def_si);
353 pattern_def_stmt_info
354 = vinfo_for_stmt (pattern_def_stmt);
355 if (STMT_VINFO_RELEVANT_P (pattern_def_stmt_info)
356 || STMT_VINFO_LIVE_P (pattern_def_stmt_info))
357 break;
358 gsi_next (&pattern_def_si);
361 if (!gsi_end_p (pattern_def_si))
363 if (dump_enabled_p ())
365 dump_printf_loc (MSG_NOTE, vect_location,
366 "==> examining pattern def stmt: ");
367 dump_gimple_stmt (MSG_NOTE, TDF_SLIM,
368 pattern_def_stmt, 0);
369 dump_printf (MSG_NOTE, "\n");
372 stmt = pattern_def_stmt;
373 stmt_info = pattern_def_stmt_info;
375 else
377 pattern_def_si = gsi_none ();
378 analyze_pattern_stmt = false;
381 else
382 analyze_pattern_stmt = false;
385 if (gimple_get_lhs (stmt) == NULL_TREE
386 /* MASK_STORE has no lhs, but is ok. */
387 && (!is_gimple_call (stmt)
388 || !gimple_call_internal_p (stmt)
389 || gimple_call_internal_fn (stmt) != IFN_MASK_STORE))
391 if (is_gimple_call (stmt))
393 /* Ignore calls with no lhs. These must be calls to
394 #pragma omp simd functions, and what vectorization factor
395 it really needs can't be determined until
396 vectorizable_simd_clone_call. */
397 if (!analyze_pattern_stmt && gsi_end_p (pattern_def_si))
399 pattern_def_seq = NULL;
400 gsi_next (&si);
402 continue;
404 if (dump_enabled_p ())
406 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
407 "not vectorized: irregular stmt.");
408 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt,
410 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
412 return false;
415 if (VECTOR_MODE_P (TYPE_MODE (gimple_expr_type (stmt))))
417 if (dump_enabled_p ())
419 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
420 "not vectorized: vector stmt in loop:");
421 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
422 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
424 return false;
427 if (STMT_VINFO_VECTYPE (stmt_info))
429 /* The only case when a vectype has already been set is for stmts
430 that contain a dataref, or for "pattern-stmts" (stmts
431 generated by the vectorizer to represent/replace a certain
432 idiom). */
433 gcc_assert (STMT_VINFO_DATA_REF (stmt_info)
434 || is_pattern_stmt_p (stmt_info)
435 || !gsi_end_p (pattern_def_si));
436 vectype = STMT_VINFO_VECTYPE (stmt_info);
438 else
440 gcc_assert (!STMT_VINFO_DATA_REF (stmt_info));
441 if (is_gimple_call (stmt)
442 && gimple_call_internal_p (stmt)
443 && gimple_call_internal_fn (stmt) == IFN_MASK_STORE)
444 scalar_type = TREE_TYPE (gimple_call_arg (stmt, 3));
445 else
446 scalar_type = TREE_TYPE (gimple_get_lhs (stmt));
447 if (dump_enabled_p ())
449 dump_printf_loc (MSG_NOTE, vect_location,
450 "get vectype for scalar type: ");
451 dump_generic_expr (MSG_NOTE, TDF_SLIM, scalar_type);
452 dump_printf (MSG_NOTE, "\n");
454 vectype = get_vectype_for_scalar_type (scalar_type);
455 if (!vectype)
457 if (dump_enabled_p ())
459 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
460 "not vectorized: unsupported "
461 "data-type ");
462 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
463 scalar_type);
464 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
466 return false;
469 STMT_VINFO_VECTYPE (stmt_info) = vectype;
471 if (dump_enabled_p ())
473 dump_printf_loc (MSG_NOTE, vect_location, "vectype: ");
474 dump_generic_expr (MSG_NOTE, TDF_SLIM, vectype);
475 dump_printf (MSG_NOTE, "\n");
479 /* The vectorization factor is determined by the smallest
480 scalar type (or the largest vector size, but we only
481 support one vector size per loop). */
482 scalar_type = vect_get_smallest_scalar_type (stmt, &dummy,
483 &dummy);
484 if (dump_enabled_p ())
486 dump_printf_loc (MSG_NOTE, vect_location,
487 "get vectype for scalar type: ");
488 dump_generic_expr (MSG_NOTE, TDF_SLIM, scalar_type);
489 dump_printf (MSG_NOTE, "\n");
491 vf_vectype = get_vectype_for_scalar_type (scalar_type);
492 if (!vf_vectype)
494 if (dump_enabled_p ())
496 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
497 "not vectorized: unsupported data-type ");
498 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
499 scalar_type);
500 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
502 return false;
505 if ((GET_MODE_SIZE (TYPE_MODE (vectype))
506 != GET_MODE_SIZE (TYPE_MODE (vf_vectype))))
508 if (dump_enabled_p ())
510 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
511 "not vectorized: different sized vector "
512 "types in statement, ");
513 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
514 vectype);
515 dump_printf (MSG_MISSED_OPTIMIZATION, " and ");
516 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
517 vf_vectype);
518 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
520 return false;
523 if (dump_enabled_p ())
525 dump_printf_loc (MSG_NOTE, vect_location, "vectype: ");
526 dump_generic_expr (MSG_NOTE, TDF_SLIM, vf_vectype);
527 dump_printf (MSG_NOTE, "\n");
530 nunits = TYPE_VECTOR_SUBPARTS (vf_vectype);
531 if (dump_enabled_p ())
532 dump_printf_loc (MSG_NOTE, vect_location, "nunits = %d\n", nunits);
533 if (!vectorization_factor
534 || (nunits > vectorization_factor))
535 vectorization_factor = nunits;
537 if (!analyze_pattern_stmt && gsi_end_p (pattern_def_si))
539 pattern_def_seq = NULL;
540 gsi_next (&si);
545 /* TODO: Analyze cost. Decide if worth while to vectorize. */
546 if (dump_enabled_p ())
547 dump_printf_loc (MSG_NOTE, vect_location, "vectorization factor = %d\n",
548 vectorization_factor);
549 if (vectorization_factor <= 1)
551 if (dump_enabled_p ())
552 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
553 "not vectorized: unsupported data-type\n");
554 return false;
556 LOOP_VINFO_VECT_FACTOR (loop_vinfo) = vectorization_factor;
558 return true;
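/* Hypothetical example of the "smallest scalar type" rule used above,
   assuming 16-byte vectors:

     short *s; int *d;
     for (i = 0; i < N; i++)
       d[i] = s[i];

   the smallest scalar type in the loop is 'short', whose vectype
   (V8HI) holds 8 units, while 'int' yields V4SI with only 4; the
   vectorization factor becomes 8, the maximum nunits seen, and each
   vector of 8 shorts is widened into two vectors of 4 ints.  */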
562 /* Function vect_is_simple_iv_evolution.
564 FORNOW: A simple evolution of an induction variable in the loop is
565 considered a polynomial evolution. */
567 static bool
568 vect_is_simple_iv_evolution (unsigned loop_nb, tree access_fn, tree * init,
569 tree * step)
571 tree init_expr;
572 tree step_expr;
573 tree evolution_part = evolution_part_in_loop_num (access_fn, loop_nb);
574 basic_block bb;
576 /* When there is no evolution in this loop, the evolution function
577 is not "simple". */
578 if (evolution_part == NULL_TREE)
579 return false;
581 /* When the evolution is a polynomial of degree >= 2
582 the evolution function is not "simple". */
583 if (tree_is_chrec (evolution_part))
584 return false;
586 step_expr = evolution_part;
587 init_expr = unshare_expr (initial_condition_in_loop_num (access_fn, loop_nb));
589 if (dump_enabled_p ())
591 dump_printf_loc (MSG_NOTE, vect_location, "step: ");
592 dump_generic_expr (MSG_NOTE, TDF_SLIM, step_expr);
593 dump_printf (MSG_NOTE, ", init: ");
594 dump_generic_expr (MSG_NOTE, TDF_SLIM, init_expr);
595 dump_printf (MSG_NOTE, "\n");
598 *init = init_expr;
599 *step = step_expr;
601 if (TREE_CODE (step_expr) != INTEGER_CST
602 && (TREE_CODE (step_expr) != SSA_NAME
603 || ((bb = gimple_bb (SSA_NAME_DEF_STMT (step_expr)))
604 && flow_bb_inside_loop_p (get_loop (cfun, loop_nb), bb))
605 || (!INTEGRAL_TYPE_P (TREE_TYPE (step_expr))
606 && (!SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr))
607 || !flag_associative_math)))
608 && (TREE_CODE (step_expr) != REAL_CST
609 || !flag_associative_math))
611 if (dump_enabled_p ())
612 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
613 "step unknown.\n");
614 return false;
617 return true;
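/* Illustration (an assumed example, not taken from this file): for

     for (i = 0; i < n; i++)
       p = p + 4;

   the scalar evolution of p is the chrec {p_0, +, 4}_1: the evolution
   part returned in *STEP is the constant 4 and the initial condition
   returned in *INIT is p_0.  By contrast, a variable accumulating the
   IV itself (s += i) has the step {i_0, +, 1}_1, which is itself a
   chrec and is therefore rejected as not "simple" above.  */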
620 /* Function vect_analyze_scalar_cycles_1.
622 Examine the cross iteration def-use cycles of scalar variables
623 in LOOP. LOOP_VINFO represents the loop that is now being
624 considered for vectorization (can be LOOP, or an outer-loop
625 enclosing LOOP). */
627 static void
628 vect_analyze_scalar_cycles_1 (loop_vec_info loop_vinfo, struct loop *loop)
630 basic_block bb = loop->header;
631 tree init, step;
632 auto_vec<gimple, 64> worklist;
633 gphi_iterator gsi;
634 bool double_reduc;
636 if (dump_enabled_p ())
637 dump_printf_loc (MSG_NOTE, vect_location,
638 "=== vect_analyze_scalar_cycles ===\n");
640 /* First - identify all inductions. Reduction detection assumes that all the
641 inductions have been identified, therefore, this order must not be
642 changed. */
643 for (gsi = gsi_start_phis (bb); !gsi_end_p (gsi); gsi_next (&gsi))
645 gphi *phi = gsi.phi ();
646 tree access_fn = NULL;
647 tree def = PHI_RESULT (phi);
648 stmt_vec_info stmt_vinfo = vinfo_for_stmt (phi);
650 if (dump_enabled_p ())
652 dump_printf_loc (MSG_NOTE, vect_location, "Analyze phi: ");
653 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, phi, 0);
654 dump_printf (MSG_NOTE, "\n");
657 /* Skip virtual phi's. The data dependences that are associated with
658 virtual defs/uses (i.e., memory accesses) are analyzed elsewhere. */
659 if (virtual_operand_p (def))
660 continue;
662 STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_unknown_def_type;
664 /* Analyze the evolution function. */
665 access_fn = analyze_scalar_evolution (loop, def);
666 if (access_fn)
668 STRIP_NOPS (access_fn);
669 if (dump_enabled_p ())
671 dump_printf_loc (MSG_NOTE, vect_location,
672 "Access function of PHI: ");
673 dump_generic_expr (MSG_NOTE, TDF_SLIM, access_fn);
674 dump_printf (MSG_NOTE, "\n");
676 STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_vinfo)
677 = evolution_part_in_loop_num (access_fn, loop->num);
680 if (!access_fn
681 || !vect_is_simple_iv_evolution (loop->num, access_fn, &init, &step)
682 || (LOOP_VINFO_LOOP (loop_vinfo) != loop
683 && TREE_CODE (step) != INTEGER_CST))
685 worklist.safe_push (phi);
686 continue;
689 gcc_assert (STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_vinfo) != NULL_TREE);
691 if (dump_enabled_p ())
692 dump_printf_loc (MSG_NOTE, vect_location, "Detected induction.\n");
693 STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_induction_def;
697 /* Second - identify all reductions and nested cycles. */
698 while (worklist.length () > 0)
700 gimple phi = worklist.pop ();
701 tree def = PHI_RESULT (phi);
702 stmt_vec_info stmt_vinfo = vinfo_for_stmt (phi);
703 gimple reduc_stmt;
704 bool nested_cycle;
706 if (dump_enabled_p ())
708 dump_printf_loc (MSG_NOTE, vect_location, "Analyze phi: ");
709 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, phi, 0);
710 dump_printf (MSG_NOTE, "\n");
713 gcc_assert (!virtual_operand_p (def)
714 && STMT_VINFO_DEF_TYPE (stmt_vinfo) == vect_unknown_def_type);
716 nested_cycle = (loop != LOOP_VINFO_LOOP (loop_vinfo));
717 reduc_stmt = vect_force_simple_reduction (loop_vinfo, phi, !nested_cycle,
718 &double_reduc, false);
719 if (reduc_stmt)
721 if (double_reduc)
723 if (dump_enabled_p ())
724 dump_printf_loc (MSG_NOTE, vect_location,
725 "Detected double reduction.\n");
727 STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_double_reduction_def;
728 STMT_VINFO_DEF_TYPE (vinfo_for_stmt (reduc_stmt)) =
729 vect_double_reduction_def;
731 else
733 if (nested_cycle)
735 if (dump_enabled_p ())
736 dump_printf_loc (MSG_NOTE, vect_location,
737 "Detected vectorizable nested cycle.\n");
739 STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_nested_cycle;
740 STMT_VINFO_DEF_TYPE (vinfo_for_stmt (reduc_stmt)) =
741 vect_nested_cycle;
743 else
745 if (dump_enabled_p ())
746 dump_printf_loc (MSG_NOTE, vect_location,
747 "Detected reduction.\n");
749 STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_reduction_def;
750 STMT_VINFO_DEF_TYPE (vinfo_for_stmt (reduc_stmt)) =
751 vect_reduction_def;
752 /* Store the reduction cycles for possible vectorization in
753 loop-aware SLP. */
754 LOOP_VINFO_REDUCTIONS (loop_vinfo).safe_push (reduc_stmt);
758 else
759 if (dump_enabled_p ())
760 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
761 "Unknown def-use cycle pattern.\n");
766 /* Function vect_analyze_scalar_cycles.
768 Examine the cross iteration def-use cycles of scalar variables, by
769 analyzing the loop-header PHIs of scalar variables. Classify each
770 cycle as one of the following: invariant, induction, reduction, unknown.
771 We do that for the loop represented by LOOP_VINFO, and also for its
772 inner-loop, if it exists.
773 Examples for scalar cycles:
775 Example1: reduction:
777 loop1:
778 for (i=0; i<N; i++)
779 sum += a[i];
781 Example2: induction:
783 loop2:
784 for (i=0; i<N; i++)
785 a[i] = i; */
787 static void
788 vect_analyze_scalar_cycles (loop_vec_info loop_vinfo)
790 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
792 vect_analyze_scalar_cycles_1 (loop_vinfo, loop);
794 /* When vectorizing an outer-loop, the inner-loop is executed sequentially.
795 Reductions in such inner-loop therefore have different properties than
796 the reductions in the nest that gets vectorized:
797 1. When vectorized, they are executed in the same order as in the original
798 scalar loop, so we can't change the order of computation when
799 vectorizing them.
800 2. FIXME: Inner-loop reductions can be used in the inner-loop, so the
801 current checks are too strict. */
803 if (loop->inner)
804 vect_analyze_scalar_cycles_1 (loop_vinfo, loop->inner);
807 /* Transfer group and reduction information from STMT to its pattern stmt. */
809 static void
810 vect_fixup_reduc_chain (gimple stmt)
812 gimple firstp = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (stmt));
813 gimple stmtp;
814 gcc_assert (!GROUP_FIRST_ELEMENT (vinfo_for_stmt (firstp))
815 && GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)));
816 GROUP_SIZE (vinfo_for_stmt (firstp)) = GROUP_SIZE (vinfo_for_stmt (stmt));
819 stmtp = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (stmt));
820 GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmtp)) = firstp;
821 stmt = GROUP_NEXT_ELEMENT (vinfo_for_stmt (stmt));
822 if (stmt)
823 GROUP_NEXT_ELEMENT (vinfo_for_stmt (stmtp))
824 = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (stmt));
826 while (stmt);
827 STMT_VINFO_DEF_TYPE (vinfo_for_stmt (stmtp)) = vect_reduction_def;
830 /* Fixup scalar cycles that now have their stmts detected as patterns. */
832 static void
833 vect_fixup_scalar_cycles_with_patterns (loop_vec_info loop_vinfo)
835 gimple first;
836 unsigned i;
838 FOR_EACH_VEC_ELT (LOOP_VINFO_REDUCTION_CHAINS (loop_vinfo), i, first)
839 if (STMT_VINFO_IN_PATTERN_P (vinfo_for_stmt (first)))
841 vect_fixup_reduc_chain (first);
842 LOOP_VINFO_REDUCTION_CHAINS (loop_vinfo)[i]
843 = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (first));
847 /* Function vect_get_loop_niters.
849 Determine how many iterations the loop executes and place it
850 in NUMBER_OF_ITERATIONS. Place the number of latch iterations
851 in NUMBER_OF_ITERATIONSM1.
853 Return the loop exit condition. */
856 static gcond *
857 vect_get_loop_niters (struct loop *loop, tree *number_of_iterations,
858 tree *number_of_iterationsm1)
860 tree niters;
862 if (dump_enabled_p ())
863 dump_printf_loc (MSG_NOTE, vect_location,
864 "=== get_loop_niters ===\n");
866 niters = number_of_latch_executions (loop);
867 *number_of_iterationsm1 = niters;
869 /* We want the number of loop header executions which is the number
870 of latch executions plus one.
871 ??? For UINT_MAX latch executions this number overflows to zero
872 for loops like do { n++; } while (n != 0); */
873 if (niters && !chrec_contains_undetermined (niters))
874 niters = fold_build2 (PLUS_EXPR, TREE_TYPE (niters), unshare_expr (niters),
875 build_int_cst (TREE_TYPE (niters), 1));
876 *number_of_iterations = niters;
878 return get_loop_exit_condition (loop);
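/* Worked example (illustration only): a loop whose body runs for
   i = 0, 1, 2, 3 takes the latch edge 3 times while the header
   executes 4 times, so *NUMBER_OF_ITERATIONSM1 is 3 and
   *NUMBER_OF_ITERATIONS is 3 + 1 = 4.  The ??? above notes the corner
   case in which this "+ 1" wraps around to zero.  */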
882 /* Function bb_in_loop_p
884 Used as predicate for dfs order traversal of the loop bbs. */
886 static bool
887 bb_in_loop_p (const_basic_block bb, const void *data)
889 const struct loop *const loop = (const struct loop *)data;
890 if (flow_bb_inside_loop_p (loop, bb))
891 return true;
892 return false;
896 /* Function new_loop_vec_info.
898 Create and initialize a new loop_vec_info struct for LOOP, as well as
899 stmt_vec_info structs for all the stmts in LOOP. */
901 static loop_vec_info
902 new_loop_vec_info (struct loop *loop)
904 loop_vec_info res;
905 basic_block *bbs;
906 gimple_stmt_iterator si;
907 unsigned int i, nbbs;
909 res = (loop_vec_info) xcalloc (1, sizeof (struct _loop_vec_info));
910 LOOP_VINFO_LOOP (res) = loop;
912 bbs = get_loop_body (loop);
914 /* Create/Update stmt_info for all stmts in the loop. */
915 for (i = 0; i < loop->num_nodes; i++)
917 basic_block bb = bbs[i];
919 /* BBs in a nested inner-loop will have been already processed (because
920 we will have called vect_analyze_loop_form for any nested inner-loop).
921 Therefore, for stmts in an inner-loop we just want to update the
922 STMT_VINFO_LOOP_VINFO field of their stmt_info to point to the new
923 loop_info of the outer-loop we are currently considering to vectorize
924 (instead of the loop_info of the inner-loop).
925 For stmts in other BBs we need to create a stmt_info from scratch. */
926 if (bb->loop_father != loop)
928 /* Inner-loop bb. */
929 gcc_assert (loop->inner && bb->loop_father == loop->inner);
930 for (si = gsi_start_phis (bb); !gsi_end_p (si); gsi_next (&si))
932 gimple phi = gsi_stmt (si);
933 stmt_vec_info stmt_info = vinfo_for_stmt (phi);
934 loop_vec_info inner_loop_vinfo =
935 STMT_VINFO_LOOP_VINFO (stmt_info);
936 gcc_assert (loop->inner == LOOP_VINFO_LOOP (inner_loop_vinfo));
937 STMT_VINFO_LOOP_VINFO (stmt_info) = res;
939 for (si = gsi_start_bb (bb); !gsi_end_p (si); gsi_next (&si))
941 gimple stmt = gsi_stmt (si);
942 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
943 loop_vec_info inner_loop_vinfo =
944 STMT_VINFO_LOOP_VINFO (stmt_info);
945 gcc_assert (loop->inner == LOOP_VINFO_LOOP (inner_loop_vinfo));
946 STMT_VINFO_LOOP_VINFO (stmt_info) = res;
949 else
951 /* bb in current nest. */
952 for (si = gsi_start_phis (bb); !gsi_end_p (si); gsi_next (&si))
954 gimple phi = gsi_stmt (si);
955 gimple_set_uid (phi, 0);
956 set_vinfo_for_stmt (phi, new_stmt_vec_info (phi, res, NULL));
959 for (si = gsi_start_bb (bb); !gsi_end_p (si); gsi_next (&si))
961 gimple stmt = gsi_stmt (si);
962 gimple_set_uid (stmt, 0);
963 set_vinfo_for_stmt (stmt, new_stmt_vec_info (stmt, res, NULL));
968 /* CHECKME: We want to visit all BBs before their successors (except for
969 latch blocks, for which this assertion wouldn't hold). In the simple
970 case of the loop forms we allow, a dfs order of the BBs would be the same
971 as reversed postorder traversal, so we are safe. */
973 free (bbs);
974 bbs = XCNEWVEC (basic_block, loop->num_nodes);
975 nbbs = dfs_enumerate_from (loop->header, 0, bb_in_loop_p,
976 bbs, loop->num_nodes, loop);
977 gcc_assert (nbbs == loop->num_nodes);
979 LOOP_VINFO_BBS (res) = bbs;
980 LOOP_VINFO_NITERSM1 (res) = NULL;
981 LOOP_VINFO_NITERS (res) = NULL;
982 LOOP_VINFO_NITERS_UNCHANGED (res) = NULL;
983 LOOP_VINFO_COST_MODEL_MIN_ITERS (res) = 0;
984 LOOP_VINFO_COST_MODEL_THRESHOLD (res) = 0;
985 LOOP_VINFO_VECTORIZABLE_P (res) = 0;
986 LOOP_VINFO_PEELING_FOR_ALIGNMENT (res) = 0;
987 LOOP_VINFO_VECT_FACTOR (res) = 0;
988 LOOP_VINFO_LOOP_NEST (res).create (3);
989 LOOP_VINFO_DATAREFS (res).create (10);
990 LOOP_VINFO_DDRS (res).create (10 * 10);
991 LOOP_VINFO_UNALIGNED_DR (res) = NULL;
992 LOOP_VINFO_MAY_MISALIGN_STMTS (res).create (
993 PARAM_VALUE (PARAM_VECT_MAX_VERSION_FOR_ALIGNMENT_CHECKS));
994 LOOP_VINFO_MAY_ALIAS_DDRS (res).create (
995 PARAM_VALUE (PARAM_VECT_MAX_VERSION_FOR_ALIAS_CHECKS));
996 LOOP_VINFO_GROUPED_STORES (res).create (10);
997 LOOP_VINFO_REDUCTIONS (res).create (10);
998 LOOP_VINFO_REDUCTION_CHAINS (res).create (10);
999 LOOP_VINFO_SLP_INSTANCES (res).create (10);
1000 LOOP_VINFO_SLP_UNROLLING_FACTOR (res) = 1;
1001 LOOP_VINFO_TARGET_COST_DATA (res) = init_cost (loop);
1002 LOOP_VINFO_PEELING_FOR_GAPS (res) = false;
1003 LOOP_VINFO_PEELING_FOR_NITER (res) = false;
1004 LOOP_VINFO_OPERANDS_SWAPPED (res) = false;
1006 return res;
1010 /* Function destroy_loop_vec_info.
1012 Free LOOP_VINFO struct, as well as all the stmt_vec_info structs of all the
1013 stmts in the loop. */
1015 void
1016 destroy_loop_vec_info (loop_vec_info loop_vinfo, bool clean_stmts)
1018 struct loop *loop;
1019 basic_block *bbs;
1020 int nbbs;
1021 gimple_stmt_iterator si;
1022 int j;
1023 vec<slp_instance> slp_instances;
1024 slp_instance instance;
1025 bool swapped;
1027 if (!loop_vinfo)
1028 return;
1030 loop = LOOP_VINFO_LOOP (loop_vinfo);
1032 bbs = LOOP_VINFO_BBS (loop_vinfo);
1033 nbbs = clean_stmts ? loop->num_nodes : 0;
1034 swapped = LOOP_VINFO_OPERANDS_SWAPPED (loop_vinfo);
1036 for (j = 0; j < nbbs; j++)
1038 basic_block bb = bbs[j];
1039 for (si = gsi_start_phis (bb); !gsi_end_p (si); gsi_next (&si))
1040 free_stmt_vec_info (gsi_stmt (si));
1042 for (si = gsi_start_bb (bb); !gsi_end_p (si); )
1044 gimple stmt = gsi_stmt (si);
1046 /* We may have broken canonical form by moving a constant
1047 into RHS1 of a commutative op. Fix such occurrences. */
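/* For instance (illustration only): an assignment whose operands were
   swapped during analysis and left as  _1 = 5 + x_2  is rewritten back
   to the canonical  _1 = x_2 + 5  before its stmt_vec_info is freed.  */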
1048 if (swapped && is_gimple_assign (stmt))
1050 enum tree_code code = gimple_assign_rhs_code (stmt);
1052 if ((code == PLUS_EXPR
1053 || code == POINTER_PLUS_EXPR
1054 || code == MULT_EXPR)
1055 && CONSTANT_CLASS_P (gimple_assign_rhs1 (stmt)))
1056 swap_ssa_operands (stmt,
1057 gimple_assign_rhs1_ptr (stmt),
1058 gimple_assign_rhs2_ptr (stmt));
1061 /* Free stmt_vec_info. */
1062 free_stmt_vec_info (stmt);
1063 gsi_next (&si);
1067 free (LOOP_VINFO_BBS (loop_vinfo));
1068 vect_destroy_datarefs (loop_vinfo, NULL);
1069 free_dependence_relations (LOOP_VINFO_DDRS (loop_vinfo));
1070 LOOP_VINFO_LOOP_NEST (loop_vinfo).release ();
1071 LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).release ();
1072 LOOP_VINFO_MAY_ALIAS_DDRS (loop_vinfo).release ();
1073 slp_instances = LOOP_VINFO_SLP_INSTANCES (loop_vinfo);
1074 FOR_EACH_VEC_ELT (slp_instances, j, instance)
1075 vect_free_slp_instance (instance);
1077 LOOP_VINFO_SLP_INSTANCES (loop_vinfo).release ();
1078 LOOP_VINFO_GROUPED_STORES (loop_vinfo).release ();
1079 LOOP_VINFO_REDUCTIONS (loop_vinfo).release ();
1080 LOOP_VINFO_REDUCTION_CHAINS (loop_vinfo).release ();
1082 delete LOOP_VINFO_PEELING_HTAB (loop_vinfo);
1083 LOOP_VINFO_PEELING_HTAB (loop_vinfo) = NULL;
1085 destroy_cost_data (LOOP_VINFO_TARGET_COST_DATA (loop_vinfo));
1086 loop_vinfo->scalar_cost_vec.release ();
1088 free (loop_vinfo);
1089 loop->aux = NULL;
1093 /* Calculate the cost of one scalar iteration of the loop. */
1094 static void
1095 vect_get_single_scalar_iteration_cost (loop_vec_info loop_vinfo)
1097 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1098 basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
1099 int nbbs = loop->num_nodes, factor, scalar_single_iter_cost = 0;
1100 int innerloop_iters, i;
1102 /* Count statements in scalar loop. Using this as scalar cost for a single
1103 iteration for now.
1105 TODO: Add outer loop support.
1107 TODO: Consider assigning different costs to different scalar
1108 statements. */
1110 /* FORNOW. */
1111 innerloop_iters = 1;
1112 if (loop->inner)
1113 innerloop_iters = 50; /* FIXME */
1115 for (i = 0; i < nbbs; i++)
1117 gimple_stmt_iterator si;
1118 basic_block bb = bbs[i];
1120 if (bb->loop_father == loop->inner)
1121 factor = innerloop_iters;
1122 else
1123 factor = 1;
1125 for (si = gsi_start_bb (bb); !gsi_end_p (si); gsi_next (&si))
1127 gimple stmt = gsi_stmt (si);
1128 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
1130 if (!is_gimple_assign (stmt) && !is_gimple_call (stmt))
1131 continue;
1133 /* Skip stmts that are not vectorized inside the loop. */
1134 if (stmt_info
1135 && !STMT_VINFO_RELEVANT_P (stmt_info)
1136 && (!STMT_VINFO_LIVE_P (stmt_info)
1137 || !VECTORIZABLE_CYCLE_DEF (STMT_VINFO_DEF_TYPE (stmt_info)))
1138 && !STMT_VINFO_IN_PATTERN_P (stmt_info))
1139 continue;
1141 vect_cost_for_stmt kind;
1142 if (STMT_VINFO_DATA_REF (vinfo_for_stmt (stmt)))
1144 if (DR_IS_READ (STMT_VINFO_DATA_REF (vinfo_for_stmt (stmt))))
1145 kind = scalar_load;
1146 else
1147 kind = scalar_store;
1149 else
1150 kind = scalar_stmt;
1152 scalar_single_iter_cost
1153 += record_stmt_cost (&LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo),
1154 factor, kind, NULL, 0, vect_prologue);
1157 LOOP_VINFO_SINGLE_SCALAR_ITERATION_COST (loop_vinfo)
1158 = scalar_single_iter_cost;
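/* Hypothetical example of the counting above: an outer-loop body with
   three relevant scalar statements plus an inner loop containing two
   relevant statements contributes counts of 3 * 1 + 2 * 50 = 103,
   using the FIXME factor of 50 for inner-loop statements; each count
   is then scaled by the target's per-kind cost in record_stmt_cost.  */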
1162 /* Function vect_analyze_loop_1.
1164 Apply a set of analyses on LOOP, and create a loop_vec_info struct
1165 for it. The different analyses will record information in the
1166 loop_vec_info struct. This is a subset of the analyses applied in
1167 vect_analyze_loop, to be applied on an inner-loop nested in the loop
1168 that is now considered for (outer-loop) vectorization. */
1170 static loop_vec_info
1171 vect_analyze_loop_1 (struct loop *loop)
1173 loop_vec_info loop_vinfo;
1175 if (dump_enabled_p ())
1176 dump_printf_loc (MSG_NOTE, vect_location,
1177 "===== analyze_loop_nest_1 =====\n");
1179 /* Check the CFG characteristics of the loop (nesting, entry/exit, etc.). */
1181 loop_vinfo = vect_analyze_loop_form (loop);
1182 if (!loop_vinfo)
1184 if (dump_enabled_p ())
1185 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1186 "bad inner-loop form.\n");
1187 return NULL;
1190 return loop_vinfo;
1194 /* Function vect_analyze_loop_form.
1196 Verify that certain CFG restrictions hold, including:
1197 - the loop has a pre-header
1198 - the loop has a single entry and exit
1199 - the loop exit condition is simple enough, and the number of iterations
1200 can be analyzed (a countable loop). */
1202 loop_vec_info
1203 vect_analyze_loop_form (struct loop *loop)
1205 loop_vec_info loop_vinfo;
1206 gcond *loop_cond;
1207 tree number_of_iterations = NULL, number_of_iterationsm1 = NULL;
1208 loop_vec_info inner_loop_vinfo = NULL;
1210 if (dump_enabled_p ())
1211 dump_printf_loc (MSG_NOTE, vect_location,
1212 "=== vect_analyze_loop_form ===\n");
1214 /* Different restrictions apply when we are considering an inner-most loop,
1215 vs. an outer (nested) loop.
1216 (FORNOW. May want to relax some of these restrictions in the future). */
1218 if (!loop->inner)
1220 /* Inner-most loop. We currently require that the number of BBs is
1221 exactly 2 (the header and latch). Vectorizable inner-most loops
1222 look like this:
1224 (pre-header)
1226 header <--------+
1227 | | |
1228 | +--> latch --+
1230 (exit-bb) */
1232 if (loop->num_nodes != 2)
1234 if (dump_enabled_p ())
1235 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1236 "not vectorized: control flow in loop.\n");
1237 return NULL;
1240 if (empty_block_p (loop->header))
1242 if (dump_enabled_p ())
1243 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1244 "not vectorized: empty loop.\n");
1245 return NULL;
1248 else
1250 struct loop *innerloop = loop->inner;
1251 edge entryedge;
1253 /* Nested loop. We currently require that the loop is doubly-nested,
1254 contains a single inner loop, and the number of BBs is exactly 5.
1255 Vectorizable outer-loops look like this:
1257 (pre-header)
1259 header <---+
1261 inner-loop |
1263 tail ------+
1265 (exit-bb)
1267 The inner-loop has the properties expected of inner-most loops
1268 as described above. */
1270 if ((loop->inner)->inner || (loop->inner)->next)
1272 if (dump_enabled_p ())
1273 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1274 "not vectorized: multiple nested loops.\n");
1275 return NULL;
1278 /* Analyze the inner-loop. */
1279 inner_loop_vinfo = vect_analyze_loop_1 (loop->inner);
1280 if (!inner_loop_vinfo)
1282 if (dump_enabled_p ())
1283 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1284 "not vectorized: Bad inner loop.\n");
1285 return NULL;
1288 if (!expr_invariant_in_loop_p (loop,
1289 LOOP_VINFO_NITERS (inner_loop_vinfo)))
1291 if (dump_enabled_p ())
1292 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1293 "not vectorized: inner-loop count not"
1294 " invariant.\n");
1295 destroy_loop_vec_info (inner_loop_vinfo, true);
1296 return NULL;
1299 if (loop->num_nodes != 5)
1301 if (dump_enabled_p ())
1302 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1303 "not vectorized: control flow in loop.\n");
1304 destroy_loop_vec_info (inner_loop_vinfo, true);
1305 return NULL;
1308 gcc_assert (EDGE_COUNT (innerloop->header->preds) == 2);
1309 entryedge = EDGE_PRED (innerloop->header, 0);
1310 if (EDGE_PRED (innerloop->header, 0)->src == innerloop->latch)
1311 entryedge = EDGE_PRED (innerloop->header, 1);
1313 if (entryedge->src != loop->header
1314 || !single_exit (innerloop)
1315 || single_exit (innerloop)->dest != EDGE_PRED (loop->latch, 0)->src)
1317 if (dump_enabled_p ())
1318 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1319 "not vectorized: unsupported outerloop form.\n");
1320 destroy_loop_vec_info (inner_loop_vinfo, true);
1321 return NULL;
1324 if (dump_enabled_p ())
1325 dump_printf_loc (MSG_NOTE, vect_location,
1326 "Considering outer-loop vectorization.\n");
1329 if (!single_exit (loop)
1330 || EDGE_COUNT (loop->header->preds) != 2)
1332 if (dump_enabled_p ())
1334 if (!single_exit (loop))
1335 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1336 "not vectorized: multiple exits.\n");
1337 else if (EDGE_COUNT (loop->header->preds) != 2)
1338 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1339 "not vectorized: too many incoming edges.\n");
1341 if (inner_loop_vinfo)
1342 destroy_loop_vec_info (inner_loop_vinfo, true);
1343 return NULL;
1346 /* We assume that the loop exit condition is at the end of the loop, i.e.,
1347 that the loop is represented as a do-while (with a proper if-guard
1348 before the loop if needed), where the loop header contains all the
1349 executable statements, and the latch is empty. */
1350 if (!empty_block_p (loop->latch)
1351 || !gimple_seq_empty_p (phi_nodes (loop->latch)))
1353 if (dump_enabled_p ())
1354 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1355 "not vectorized: latch block not empty.\n");
1356 if (inner_loop_vinfo)
1357 destroy_loop_vec_info (inner_loop_vinfo, true);
1358 return NULL;
1361 /* Make sure there exists a single-predecessor exit bb: */
1362 if (!single_pred_p (single_exit (loop)->dest))
1364 edge e = single_exit (loop);
1365 if (!(e->flags & EDGE_ABNORMAL))
1367 split_loop_exit_edge (e);
1368 if (dump_enabled_p ())
1369 dump_printf (MSG_NOTE, "split exit edge.\n");
1371 else
1373 if (dump_enabled_p ())
1374 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1375 "not vectorized: abnormal loop exit edge.\n");
1376 if (inner_loop_vinfo)
1377 destroy_loop_vec_info (inner_loop_vinfo, true);
1378 return NULL;
1382 loop_cond = vect_get_loop_niters (loop, &number_of_iterations,
1383 &number_of_iterationsm1);
1384 if (!loop_cond)
1386 if (dump_enabled_p ())
1387 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1388 "not vectorized: complicated exit condition.\n");
1389 if (inner_loop_vinfo)
1390 destroy_loop_vec_info (inner_loop_vinfo, true);
1391 return NULL;
1394 if (!number_of_iterations
1395 || chrec_contains_undetermined (number_of_iterations))
1397 if (dump_enabled_p ())
1398 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1399 "not vectorized: number of iterations cannot be "
1400 "computed.\n");
1401 if (inner_loop_vinfo)
1402 destroy_loop_vec_info (inner_loop_vinfo, true);
1403 return NULL;
1406 if (integer_zerop (number_of_iterations))
1408 if (dump_enabled_p ())
1409 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1410 "not vectorized: number of iterations = 0.\n");
1411 if (inner_loop_vinfo)
1412 destroy_loop_vec_info (inner_loop_vinfo, true);
1413 return NULL;
1416 loop_vinfo = new_loop_vec_info (loop);
1417 LOOP_VINFO_NITERSM1 (loop_vinfo) = number_of_iterationsm1;
1418 LOOP_VINFO_NITERS (loop_vinfo) = number_of_iterations;
1419 LOOP_VINFO_NITERS_UNCHANGED (loop_vinfo) = number_of_iterations;
1421 if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
1423 if (dump_enabled_p ())
1425 dump_printf_loc (MSG_NOTE, vect_location,
1426 "Symbolic number of iterations is ");
1427 dump_generic_expr (MSG_NOTE, TDF_DETAILS, number_of_iterations);
1428 dump_printf (MSG_NOTE, "\n");
1432 STMT_VINFO_TYPE (vinfo_for_stmt (loop_cond)) = loop_exit_ctrl_vec_info_type;
1434 /* CHECKME: May want to keep it around it in the future. */
1435 if (inner_loop_vinfo)
1436 destroy_loop_vec_info (inner_loop_vinfo, false);
1438 gcc_assert (!loop->aux);
1439 loop->aux = loop_vinfo;
1440 return loop_vinfo;
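/* Hypothetical examples of the form restrictions checked above: an
   inner-most loop such as

     for (i = 0; i < n; i++)
       a[i] = b[i] + c[i];

   has exactly the header + latch structure required, whereas a body
   containing a branch, e.g.

     for (i = 0; i < n; i++)
       if (b[i] > 0)
         a[i] = b[i];

   introduces extra basic blocks and is rejected here as "control flow
   in loop" unless if-conversion has flattened the branch earlier.  */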
1443 /* Scan the loop stmts and, depending on whether there are any (non-)SLP
1444 statements, update the vectorization factor. */
1446 static void
1447 vect_update_vf_for_slp (loop_vec_info loop_vinfo)
1449 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1450 basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
1451 int nbbs = loop->num_nodes;
1452 unsigned int vectorization_factor;
1453 int i;
1455 if (dump_enabled_p ())
1456 dump_printf_loc (MSG_NOTE, vect_location,
1457 "=== vect_update_vf_for_slp ===\n");
1459 vectorization_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
1460 gcc_assert (vectorization_factor != 0);
1462 /* If all the stmts in the loop can be SLPed, we perform only SLP, and
1463 the vectorization factor of the loop is the unrolling factor required by
1464 the SLP instances. If that unrolling factor is 1, we say that we
1465 perform pure SLP on the loop - cross-iteration parallelism is not
1466 exploited. */
1467 bool only_slp_in_loop = true;
1468 for (i = 0; i < nbbs; i++)
1470 basic_block bb = bbs[i];
1471 for (gimple_stmt_iterator si = gsi_start_bb (bb); !gsi_end_p (si);
1472 gsi_next (&si))
1474 gimple stmt = gsi_stmt (si);
1475 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
1476 if (STMT_VINFO_IN_PATTERN_P (stmt_info)
1477 && STMT_VINFO_RELATED_STMT (stmt_info))
1479 stmt = STMT_VINFO_RELATED_STMT (stmt_info);
1480 stmt_info = vinfo_for_stmt (stmt);
1482 if ((STMT_VINFO_RELEVANT_P (stmt_info)
1483 || VECTORIZABLE_CYCLE_DEF (STMT_VINFO_DEF_TYPE (stmt_info)))
1484 && !PURE_SLP_STMT (stmt_info))
1485 /* STMT needs both SLP and loop-based vectorization. */
1486 only_slp_in_loop = false;
1490 if (only_slp_in_loop)
1491 vectorization_factor = LOOP_VINFO_SLP_UNROLLING_FACTOR (loop_vinfo);
1492 else
1493 vectorization_factor
1494 = least_common_multiple (vectorization_factor,
1495 LOOP_VINFO_SLP_UNROLLING_FACTOR (loop_vinfo));
1497 LOOP_VINFO_VECT_FACTOR (loop_vinfo) = vectorization_factor;
1498 if (dump_enabled_p ())
1499 dump_printf_loc (MSG_NOTE, vect_location,
1500 "Updating vectorization factor to %d\n",
1501 vectorization_factor);
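/* Worked example (hypothetical numbers): if the loop's vectorization
   factor was 4 but an SLP instance requires an unrolling factor of 8,
   a hybrid loop gets VF = least_common_multiple (4, 8) = 8; if every
   statement is covered by SLP ("pure SLP"), the VF is simply the SLP
   unrolling factor.  */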
1504 /* Function vect_analyze_loop_operations.
1506 Scan the loop stmts and make sure they are all vectorizable. */
1508 static bool
1509 vect_analyze_loop_operations (loop_vec_info loop_vinfo)
1511 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1512 basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
1513 int nbbs = loop->num_nodes;
1514 unsigned int vectorization_factor;
1515 int i;
1516 stmt_vec_info stmt_info;
1517 bool need_to_vectorize = false;
1518 int min_profitable_iters;
1519 int min_scalar_loop_bound;
1520 unsigned int th;
1521 bool ok;
1522 HOST_WIDE_INT max_niter;
1523 HOST_WIDE_INT estimated_niter;
1524 int min_profitable_estimate;
1526 if (dump_enabled_p ())
1527 dump_printf_loc (MSG_NOTE, vect_location,
1528 "=== vect_analyze_loop_operations ===\n");
1530 for (i = 0; i < nbbs; i++)
1532 basic_block bb = bbs[i];
1534 for (gphi_iterator si = gsi_start_phis (bb); !gsi_end_p (si);
1535 gsi_next (&si))
1537 gphi *phi = si.phi ();
1538 ok = true;
1540 stmt_info = vinfo_for_stmt (phi);
1541 if (dump_enabled_p ())
1543 dump_printf_loc (MSG_NOTE, vect_location, "examining phi: ");
1544 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, phi, 0);
1545 dump_printf (MSG_NOTE, "\n");
1548 /* Inner-loop loop-closed exit phi in outer-loop vectorization
1549 (i.e., a phi in the tail of the outer-loop). */
1550 if (! is_loop_header_bb_p (bb))
1552 /* FORNOW: we currently don't support the case that these phis
1553 are not used in the outer loop (unless it is a double reduction,
1554 i.e., this phi is vect_reduction_def), because this case
1555 requires us to actually do something here. */
1556 if ((!STMT_VINFO_RELEVANT_P (stmt_info)
1557 || STMT_VINFO_LIVE_P (stmt_info))
1558 && STMT_VINFO_DEF_TYPE (stmt_info)
1559 != vect_double_reduction_def)
1561 if (dump_enabled_p ())
1562 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1563 "Unsupported loop-closed phi in "
1564 "outer-loop.\n");
1565 return false;
1568 /* If PHI is used in the outer loop, we check that its operand
1569 is defined in the inner loop. */
1570 if (STMT_VINFO_RELEVANT_P (stmt_info))
1572 tree phi_op;
1573 gimple op_def_stmt;
1575 if (gimple_phi_num_args (phi) != 1)
1576 return false;
1578 phi_op = PHI_ARG_DEF (phi, 0);
1579 if (TREE_CODE (phi_op) != SSA_NAME)
1580 return false;
1582 op_def_stmt = SSA_NAME_DEF_STMT (phi_op);
1583 if (gimple_nop_p (op_def_stmt)
1584 || !flow_bb_inside_loop_p (loop, gimple_bb (op_def_stmt))
1585 || !vinfo_for_stmt (op_def_stmt))
1586 return false;
1588 if (STMT_VINFO_RELEVANT (vinfo_for_stmt (op_def_stmt))
1589 != vect_used_in_outer
1590 && STMT_VINFO_RELEVANT (vinfo_for_stmt (op_def_stmt))
1591 != vect_used_in_outer_by_reduction)
1592 return false;
1595 continue;
1598 gcc_assert (stmt_info);
1600 if (STMT_VINFO_LIVE_P (stmt_info))
1602 /* FORNOW: not yet supported. */
1603 if (dump_enabled_p ())
1604 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1605 "not vectorized: value used after loop.\n");
1606 return false;
1609 if (STMT_VINFO_RELEVANT (stmt_info) == vect_used_in_scope
1610 && STMT_VINFO_DEF_TYPE (stmt_info) != vect_induction_def)
1612 /* A scalar-dependence cycle that we don't support. */
1613 if (dump_enabled_p ())
1614 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1615 "not vectorized: scalar dependence cycle.\n");
1616 return false;
1619 if (STMT_VINFO_RELEVANT_P (stmt_info))
1621 need_to_vectorize = true;
1622 if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_induction_def)
1623 ok = vectorizable_induction (phi, NULL, NULL);
1626 if (!ok)
1628 if (dump_enabled_p ())
1630 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1631 "not vectorized: relevant phi not "
1632 "supported: ");
1633 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, phi, 0);
1634 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
1636 return false;
1640 for (gimple_stmt_iterator si = gsi_start_bb (bb); !gsi_end_p (si);
1641 gsi_next (&si))
1643 gimple stmt = gsi_stmt (si);
1644 if (!gimple_clobber_p (stmt)
1645 && !vect_analyze_stmt (stmt, &need_to_vectorize, NULL))
1646 return false;
1648 } /* bbs */
1650 /* All operations in the loop are either irrelevant (they deal with loop
1651 control, or are dead), or only used outside the loop and can be moved
1652 out of the loop (e.g. invariants, inductions). The loop can be
1653 optimized away by scalar optimizations. We're better off not
1654 touching this loop. */
1655 if (!need_to_vectorize)
1657 if (dump_enabled_p ())
1658 dump_printf_loc (MSG_NOTE, vect_location,
1659 "All the computation can be taken out of the loop.\n");
1660 if (dump_enabled_p ())
1661 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1662 "not vectorized: redundant loop. no profit to "
1663 "vectorize.\n");
1664 return false;
1667 vectorization_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
1668 gcc_assert (vectorization_factor != 0);
1670 if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo) && dump_enabled_p ())
1671 dump_printf_loc (MSG_NOTE, vect_location,
1672 "vectorization_factor = %d, niters = "
1673 HOST_WIDE_INT_PRINT_DEC "\n", vectorization_factor,
1674 LOOP_VINFO_INT_NITERS (loop_vinfo));
1676 if ((LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
1677 && (LOOP_VINFO_INT_NITERS (loop_vinfo) < vectorization_factor))
1678 || ((max_niter = max_stmt_executions_int (loop)) != -1
1679 && (unsigned HOST_WIDE_INT) max_niter < vectorization_factor))
1681 if (dump_enabled_p ())
1682 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1683 "not vectorized: iteration count too small.\n");
1684 if (dump_enabled_p ())
1685 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1686 "not vectorized: iteration count smaller than "
1687 "vectorization factor.\n");
1688 return false;
1691 /* Analyze cost. Decide if worth while to vectorize. */
1693 vect_estimate_min_profitable_iters (loop_vinfo, &min_profitable_iters,
1694 &min_profitable_estimate);
1695 LOOP_VINFO_COST_MODEL_MIN_ITERS (loop_vinfo) = min_profitable_iters;
1697 if (min_profitable_iters < 0)
1699 if (dump_enabled_p ())
1700 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1701 "not vectorized: vectorization not profitable.\n");
1702 if (dump_enabled_p ())
1703 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1704 "not vectorized: vector version will never be "
1705 "profitable.\n");
1706 return false;
1709 min_scalar_loop_bound = ((PARAM_VALUE (PARAM_MIN_VECT_LOOP_BOUND)
1710 * vectorization_factor) - 1);
1713 /* Use the cost model only if it is more conservative than the
1714 user-specified threshold. */
1716 th = (unsigned) min_scalar_loop_bound;
1717 if (min_profitable_iters
1718 && (!min_scalar_loop_bound
1719 || min_profitable_iters > min_scalar_loop_bound))
1720 th = (unsigned) min_profitable_iters;
1722 LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo) = th;
1724 if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
1725 && LOOP_VINFO_INT_NITERS (loop_vinfo) <= th)
1727 if (dump_enabled_p ())
1728 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1729 "not vectorized: vectorization not profitable.\n");
1730 if (dump_enabled_p ())
1731 dump_printf_loc (MSG_NOTE, vect_location,
1732 "not vectorized: iteration count smaller than user "
1733 "specified loop bound parameter or minimum profitable "
1734 "iterations (whichever is more conservative).\n");
1735 return false;
1738 if ((estimated_niter = estimated_stmt_executions_int (loop)) != -1
1739 && ((unsigned HOST_WIDE_INT) estimated_niter
1740 <= MAX (th, (unsigned)min_profitable_estimate)))
1742 if (dump_enabled_p ())
1743 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1744 "not vectorized: estimated iteration count too "
1745 "small.\n");
1746 if (dump_enabled_p ())
1747 dump_printf_loc (MSG_NOTE, vect_location,
1748 "not vectorized: estimated iteration count smaller "
1749 "than specified loop bound parameter or minimum "
1750 "profitable iterations (whichever is more "
1751 "conservative).\n");
1752 return false;
1755 return true;
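/* Worked example (hypothetical numbers): with a vectorization factor
   of 4, a cost-model answer of min_profitable_iters = 10 and a
   min_scalar_loop_bound of 7 (derived from --param min-vect-loop-bound
   and the VF), the threshold TH is the more conservative value 10, so
   a loop known to execute only 8 iterations is rejected above as not
   profitable to vectorize.  */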
1759 /* Function vect_analyze_loop_2.
1761 Apply a set of analyses on LOOP, and create a loop_vec_info struct
1762 for it. The different analyses will record information in the
1763 loop_vec_info struct. */
1764 static bool
1765 vect_analyze_loop_2 (loop_vec_info loop_vinfo)
1767 bool ok;
1768 int max_vf = MAX_VECTORIZATION_FACTOR;
1769 int min_vf = 2;
1770 unsigned int th;
1771 unsigned int n_stmts = 0;
1773 /* Find all data references in the loop (which correspond to vdefs/vuses)
1774 and analyze their evolution in the loop. Also adjust the minimal
1775 vectorization factor according to the loads and stores.
1777 FORNOW: Handle only simple array references whose
1778 alignment can be forced, and aligned pointer references. */
1780 ok = vect_analyze_data_refs (loop_vinfo, NULL, &min_vf, &n_stmts);
1781 if (!ok)
1783 if (dump_enabled_p ())
1784 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1785 "bad data references.\n");
1786 return false;
1789 /* Classify all cross-iteration scalar data-flow cycles.
1790 Cross-iteration cycles caused by virtual phis are analyzed separately. */
1792 vect_analyze_scalar_cycles (loop_vinfo);
1794 vect_pattern_recog (loop_vinfo, NULL);
1796 vect_fixup_scalar_cycles_with_patterns (loop_vinfo);
1798 /* Analyze the access patterns of the data-refs in the loop (consecutive,
1799 complex, etc.). FORNOW: Only handle consecutive access pattern. */
1801 ok = vect_analyze_data_ref_accesses (loop_vinfo, NULL);
1802 if (!ok)
1804 if (dump_enabled_p ())
1805 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1806 "bad data access.\n");
1807 return false;
1810 /* Data-flow analysis to detect stmts that do not need to be vectorized. */
1812 ok = vect_mark_stmts_to_be_vectorized (loop_vinfo);
1813 if (!ok)
1815 if (dump_enabled_p ())
1816 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1817 "unexpected pattern.\n");
1818 return false;
1821 /* Analyze data dependences between the data-refs in the loop
1822 and adjust the maximum vectorization factor according to
1823 the dependences.
1824 FORNOW: fail at the first data dependence that we encounter. */
1826 ok = vect_analyze_data_ref_dependences (loop_vinfo, &max_vf);
1827 if (!ok
1828 || max_vf < min_vf)
1830 if (dump_enabled_p ())
1831 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1832 "bad data dependence.\n");
1833 return false;
1836 ok = vect_determine_vectorization_factor (loop_vinfo);
1837 if (!ok)
1839 if (dump_enabled_p ())
1840 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1841 "can't determine vectorization factor.\n");
1842 return false;
1844 if (max_vf < LOOP_VINFO_VECT_FACTOR (loop_vinfo))
1846 if (dump_enabled_p ())
1847 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1848 "bad data dependence.\n");
1849 return false;
1852 /* Check the SLP opportunities in the loop, analyze and build SLP trees. */
1853 ok = vect_analyze_slp (loop_vinfo, NULL, n_stmts);
1854 if (!ok)
1855 return false;
1857 /* If there are any SLP instances mark them as pure_slp. */
1858 bool slp = vect_make_slp_decision (loop_vinfo);
1859 if (slp)
1861 /* Find stmts that need to be both vectorized and SLPed. */
1862 vect_detect_hybrid_slp (loop_vinfo);
1864 /* Update the vectorization factor based on the SLP decision. */
1865 vect_update_vf_for_slp (loop_vinfo);
1868 /* Analyze the alignment of the data-refs in the loop.
1869 Fail if a data reference is found that cannot be vectorized. */
1871 ok = vect_analyze_data_refs_alignment (loop_vinfo, NULL);
1872 if (!ok)
1874 if (dump_enabled_p ())
1875 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1876 "bad data alignment.\n");
1877 return false;
1880 /* Prune the list of ddrs to be tested at run-time by versioning for alias.
1881 It is important to call pruning after vect_analyze_data_ref_accesses,
1882 since we use grouping information gathered by interleaving analysis. */
1883 ok = vect_prune_runtime_alias_test_list (loop_vinfo);
1884 if (!ok)
1886 if (dump_enabled_p ())
1887 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1888 "number of versioning for alias "
1889 "run-time tests exceeds %d "
1890 "(--param vect-max-version-for-alias-checks)\n",
1891 PARAM_VALUE (PARAM_VECT_MAX_VERSION_FOR_ALIAS_CHECKS));
1892 return false;
1895 /* Compute the scalar iteration cost. */
1896 vect_get_single_scalar_iteration_cost (loop_vinfo);
1898 /* This pass will decide on using loop versioning and/or loop peeling in
1899 order to enhance the alignment of data references in the loop. */
1901 ok = vect_enhance_data_refs_alignment (loop_vinfo);
1902 if (!ok)
1904 if (dump_enabled_p ())
1905 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1906 "bad data alignment.\n");
1907 return false;
1910 if (slp)
1912 /* Analyze operations in the SLP instances. Note this may
1913 remove unsupported SLP instances which makes the above
1914 SLP kind detection invalid. */
1915 unsigned old_size = LOOP_VINFO_SLP_INSTANCES (loop_vinfo).length ();
1916 vect_slp_analyze_operations (LOOP_VINFO_SLP_INSTANCES (loop_vinfo),
1917 LOOP_VINFO_TARGET_COST_DATA (loop_vinfo));
1918 if (LOOP_VINFO_SLP_INSTANCES (loop_vinfo).length () != old_size)
1919 return false;
1922 /* Scan all the remaining operations in the loop that are not subject
1923 to SLP and make sure they are vectorizable. */
1924 ok = vect_analyze_loop_operations (loop_vinfo);
1925 if (!ok)
1927 if (dump_enabled_p ())
1928 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1929 "bad operation or unsupported loop bound.\n");
1930 return false;
1933 /* Decide whether we need to create an epilogue loop to handle
1934 remaining scalar iterations. */
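/* For instance (an illustrative case, assuming no run-time versioning is
   needed): with VF = 4, no peeling for alignment and a known trip count of
   10, tree_ctz (10) = 1 is smaller than log2 (VF) = 2, so
   LOOP_VINFO_PEELING_FOR_NITER is set and the epilogue loop executes the
   remaining 10 % 4 = 2 scalar iterations.  */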
1935 th = ((LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo) + 1)
1936 / LOOP_VINFO_VECT_FACTOR (loop_vinfo))
1937 * LOOP_VINFO_VECT_FACTOR (loop_vinfo);
1939 if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
1940 && LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) > 0)
1942 if (ctz_hwi (LOOP_VINFO_INT_NITERS (loop_vinfo)
1943 - LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo))
1944 < exact_log2 (LOOP_VINFO_VECT_FACTOR (loop_vinfo)))
1945 LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo) = true;
1947 else if (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo)
1948 || (tree_ctz (LOOP_VINFO_NITERS (loop_vinfo))
1949 < (unsigned)exact_log2 (LOOP_VINFO_VECT_FACTOR (loop_vinfo))
1950 /* In case of versioning, check if the maximum number of
1951 iterations is greater than th. If they are identical,
1952 the epilogue is unnecessary. */
1953 && ((!LOOP_REQUIRES_VERSIONING_FOR_ALIAS (loop_vinfo)
1954 && !LOOP_REQUIRES_VERSIONING_FOR_ALIGNMENT (loop_vinfo))
1955 || (unsigned HOST_WIDE_INT)max_stmt_executions_int
1956 (LOOP_VINFO_LOOP (loop_vinfo)) > th)))
1957 LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo) = true;
1959 /* If an epilogue loop is required make sure we can create one. */
1960 if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo)
1961 || LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo))
1963 if (dump_enabled_p ())
1964 dump_printf_loc (MSG_NOTE, vect_location, "epilog loop required\n");
1965 if (!vect_can_advance_ivs_p (loop_vinfo)
1966 || !slpeel_can_duplicate_loop_p (LOOP_VINFO_LOOP (loop_vinfo),
1967 single_exit (LOOP_VINFO_LOOP
1968 (loop_vinfo))))
1970 if (dump_enabled_p ())
1971 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1972 "not vectorized: can't create required "
1973 "epilog loop\n");
1974 return false;
1978 return true;
1981 /* Function vect_analyze_loop.
1983 Apply a set of analyses on LOOP, and create a loop_vec_info struct
1984 for it. The different analyses will record information in the
1985 loop_vec_info struct. */
1986 loop_vec_info
1987 vect_analyze_loop (struct loop *loop)
1989 loop_vec_info loop_vinfo;
1990 unsigned int vector_sizes;
1992 /* Autodetect first vector size we try. */
1993 current_vector_size = 0;
1994 vector_sizes = targetm.vectorize.autovectorize_vector_sizes ();
1996 if (dump_enabled_p ())
1997 dump_printf_loc (MSG_NOTE, vect_location,
1998 "===== analyze_loop_nest =====\n");
2000 if (loop_outer (loop)
2001 && loop_vec_info_for_loop (loop_outer (loop))
2002 && LOOP_VINFO_VECTORIZABLE_P (loop_vec_info_for_loop (loop_outer (loop))))
2004 if (dump_enabled_p ())
2005 dump_printf_loc (MSG_NOTE, vect_location,
2006 "outer-loop already vectorized.\n");
2007 return NULL;
2010 while (1)
2012 /* Check the CFG characteristics of the loop (nesting, entry/exit). */
2013 loop_vinfo = vect_analyze_loop_form (loop);
2014 if (!loop_vinfo)
2016 if (dump_enabled_p ())
2017 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2018 "bad loop form.\n");
2019 return NULL;
2022 if (vect_analyze_loop_2 (loop_vinfo))
2024 LOOP_VINFO_VECTORIZABLE_P (loop_vinfo) = 1;
2026 return loop_vinfo;
2029 destroy_loop_vec_info (loop_vinfo, true);
2031 vector_sizes &= ~current_vector_size;
2032 if (vector_sizes == 0
2033 || current_vector_size == 0)
2034 return NULL;
2036 /* Try the next biggest vector size. */
2037 current_vector_size = 1 << floor_log2 (vector_sizes);
2038 if (dump_enabled_p ())
2039 dump_printf_loc (MSG_NOTE, vect_location,
2040 "***** Re-trying analysis with "
2041 "vector size %d\n", current_vector_size);
2046 /* Function reduction_code_for_scalar_code
2048 Input:
2049 CODE - tree_code of a reduction operation.
2051 Output:
2052 REDUC_CODE - the corresponding tree-code to be used to reduce the
2053 vector of partial results into a single scalar result, or ERROR_MARK
2054 if the operation is a supported reduction operation, but does not have
2055 such a tree-code.
2057 Return FALSE if CODE currently cannot be vectorized as a reduction. */
2059 static bool
2060 reduction_code_for_scalar_code (enum tree_code code,
2061 enum tree_code *reduc_code)
2063 switch (code)
2065 case MAX_EXPR:
2066 *reduc_code = REDUC_MAX_EXPR;
2067 return true;
2069 case MIN_EXPR:
2070 *reduc_code = REDUC_MIN_EXPR;
2071 return true;
2073 case PLUS_EXPR:
2074 *reduc_code = REDUC_PLUS_EXPR;
2075 return true;
2077 case MULT_EXPR:
2078 case MINUS_EXPR:
2079 case BIT_IOR_EXPR:
2080 case BIT_XOR_EXPR:
2081 case BIT_AND_EXPR:
2082 *reduc_code = ERROR_MARK;
2083 return true;
2085 default:
2086 return false;
2091 /* Error reporting helper for vect_is_simple_reduction below. GIMPLE statement
2092 STMT is printed with a message MSG. */
2094 static void
2095 report_vect_op (int msg_type, gimple stmt, const char *msg)
2097 dump_printf_loc (msg_type, vect_location, "%s", msg);
2098 dump_gimple_stmt (msg_type, TDF_SLIM, stmt, 0);
2099 dump_printf (msg_type, "\n");
2103 /* Detect SLP reduction of the form:
2105 #a1 = phi <a5, a0>
2106 a2 = operation (a1)
2107 a3 = operation (a2)
2108 a4 = operation (a3)
2109 a5 = operation (a4)
2111 #a = phi <a5>
2113 PHI is the reduction phi node (#a1 = phi <a5, a0> above)
2114 FIRST_STMT is the first reduction stmt in the chain
2115 (a2 = operation (a1)).
2117 Return TRUE if a reduction chain was detected. */
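/* As an illustrative sketch (names are placeholders), a manually unrolled
   accumulation such as
     for (i = 0; i < n; i += 2)
       { sum += a[i]; sum += a[i + 1]; }
   produces a chain of two statements between the reduction phi and its
   latch argument, which is the kind of chain detected here.  */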
2119 static bool
2120 vect_is_slp_reduction (loop_vec_info loop_info, gimple phi, gimple first_stmt)
2122 struct loop *loop = (gimple_bb (phi))->loop_father;
2123 struct loop *vect_loop = LOOP_VINFO_LOOP (loop_info);
2124 enum tree_code code;
2125 gimple current_stmt = NULL, loop_use_stmt = NULL, first, next_stmt;
2126 stmt_vec_info use_stmt_info, current_stmt_info;
2127 tree lhs;
2128 imm_use_iterator imm_iter;
2129 use_operand_p use_p;
2130 int nloop_uses, size = 0, n_out_of_loop_uses;
2131 bool found = false;
2133 if (loop != vect_loop)
2134 return false;
2136 lhs = PHI_RESULT (phi);
2137 code = gimple_assign_rhs_code (first_stmt);
2138 while (1)
2140 nloop_uses = 0;
2141 n_out_of_loop_uses = 0;
2142 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, lhs)
2144 gimple use_stmt = USE_STMT (use_p);
2145 if (is_gimple_debug (use_stmt))
2146 continue;
2148 /* Check if we got back to the reduction phi. */
2149 if (use_stmt == phi)
2151 loop_use_stmt = use_stmt;
2152 found = true;
2153 break;
2156 if (flow_bb_inside_loop_p (loop, gimple_bb (use_stmt)))
2158 loop_use_stmt = use_stmt;
2159 nloop_uses++;
2161 else
2162 n_out_of_loop_uses++;
2164 /* There can be either a single use in the loop or two uses in
2165 phi nodes. */
2166 if (nloop_uses > 1 || (n_out_of_loop_uses && nloop_uses))
2167 return false;
2170 if (found)
2171 break;
2173 /* We reached a statement with no loop uses. */
2174 if (nloop_uses == 0)
2175 return false;
2177 /* This is a loop exit phi, and we haven't reached the reduction phi. */
2178 if (gimple_code (loop_use_stmt) == GIMPLE_PHI)
2179 return false;
2181 if (!is_gimple_assign (loop_use_stmt)
2182 || code != gimple_assign_rhs_code (loop_use_stmt)
2183 || !flow_bb_inside_loop_p (loop, gimple_bb (loop_use_stmt)))
2184 return false;
2186 /* Insert USE_STMT into reduction chain. */
2187 use_stmt_info = vinfo_for_stmt (loop_use_stmt);
2188 if (current_stmt)
2190 current_stmt_info = vinfo_for_stmt (current_stmt);
2191 GROUP_NEXT_ELEMENT (current_stmt_info) = loop_use_stmt;
2192 GROUP_FIRST_ELEMENT (use_stmt_info)
2193 = GROUP_FIRST_ELEMENT (current_stmt_info);
2195 else
2196 GROUP_FIRST_ELEMENT (use_stmt_info) = loop_use_stmt;
2198 lhs = gimple_assign_lhs (loop_use_stmt);
2199 current_stmt = loop_use_stmt;
2200 size++;
2203 if (!found || loop_use_stmt != phi || size < 2)
2204 return false;
2206 /* Swap the operands, if needed, so that the reduction operand is the second
2207 operand. */
2208 lhs = PHI_RESULT (phi);
2209 next_stmt = GROUP_FIRST_ELEMENT (vinfo_for_stmt (current_stmt));
2210 while (next_stmt)
2212 if (gimple_assign_rhs2 (next_stmt) == lhs)
2214 tree op = gimple_assign_rhs1 (next_stmt);
2215 gimple def_stmt = NULL;
2217 if (TREE_CODE (op) == SSA_NAME)
2218 def_stmt = SSA_NAME_DEF_STMT (op);
2220 /* Check that the other def is either defined in the loop
2221 ("vect_internal_def"), or it's an induction (defined by a
2222 loop-header phi-node). */
2223 if (def_stmt
2224 && gimple_bb (def_stmt)
2225 && flow_bb_inside_loop_p (loop, gimple_bb (def_stmt))
2226 && (is_gimple_assign (def_stmt)
2227 || is_gimple_call (def_stmt)
2228 || STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def_stmt))
2229 == vect_induction_def
2230 || (gimple_code (def_stmt) == GIMPLE_PHI
2231 && STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def_stmt))
2232 == vect_internal_def
2233 && !is_loop_header_bb_p (gimple_bb (def_stmt)))))
2235 lhs = gimple_assign_lhs (next_stmt);
2236 next_stmt = GROUP_NEXT_ELEMENT (vinfo_for_stmt (next_stmt));
2237 continue;
2240 return false;
2242 else
2244 tree op = gimple_assign_rhs2 (next_stmt);
2245 gimple def_stmt = NULL;
2247 if (TREE_CODE (op) == SSA_NAME)
2248 def_stmt = SSA_NAME_DEF_STMT (op);
2250 /* Check that the other def is either defined in the loop
2251 ("vect_internal_def"), or it's an induction (defined by a
2252 loop-header phi-node). */
2253 if (def_stmt
2254 && gimple_bb (def_stmt)
2255 && flow_bb_inside_loop_p (loop, gimple_bb (def_stmt))
2256 && (is_gimple_assign (def_stmt)
2257 || is_gimple_call (def_stmt)
2258 || STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def_stmt))
2259 == vect_induction_def
2260 || (gimple_code (def_stmt) == GIMPLE_PHI
2261 && STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def_stmt))
2262 == vect_internal_def
2263 && !is_loop_header_bb_p (gimple_bb (def_stmt)))))
2265 if (dump_enabled_p ())
2267 dump_printf_loc (MSG_NOTE, vect_location, "swapping oprnds: ");
2268 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, next_stmt, 0);
2269 dump_printf (MSG_NOTE, "\n");
2272 swap_ssa_operands (next_stmt,
2273 gimple_assign_rhs1_ptr (next_stmt),
2274 gimple_assign_rhs2_ptr (next_stmt));
2275 update_stmt (next_stmt);
2277 if (CONSTANT_CLASS_P (gimple_assign_rhs1 (next_stmt)))
2278 LOOP_VINFO_OPERANDS_SWAPPED (loop_info) = true;
2280 else
2281 return false;
2284 lhs = gimple_assign_lhs (next_stmt);
2285 next_stmt = GROUP_NEXT_ELEMENT (vinfo_for_stmt (next_stmt));
2288 /* Save the chain for further analysis in SLP detection. */
2289 first = GROUP_FIRST_ELEMENT (vinfo_for_stmt (current_stmt));
2290 LOOP_VINFO_REDUCTION_CHAINS (loop_info).safe_push (first);
2291 GROUP_SIZE (vinfo_for_stmt (first)) = size;
2293 return true;
2297 /* Function vect_is_simple_reduction_1
2299 (1) Detect a cross-iteration def-use cycle that represents a simple
2300 reduction computation. We look for the following pattern:
2302 loop_header:
2303 a1 = phi < a0, a2 >
2304 a3 = ...
2305 a2 = operation (a3, a1)
or
2309 a3 = ...
2310 loop_header:
2311 a1 = phi < a0, a2 >
2312 a2 = operation (a3, a1)
2314 such that:
2315 1. operation is commutative and associative and it is safe to
2316 change the order of the computation (if CHECK_REDUCTION is true)
2317 2. no uses for a2 in the loop (a2 is used out of the loop)
2318 3. no uses of a1 in the loop besides the reduction operation
2319 4. no uses of a1 outside the loop.
2321 Conditions 1,4 are tested here.
2322 Conditions 2,3 are tested in vect_mark_stmts_to_be_vectorized.
2324 (2) Detect a cross-iteration def-use cycle in nested loops, i.e.,
2325 nested cycles, if CHECK_REDUCTION is false.
2327 (3) Detect cycles of phi nodes in outer-loop vectorization, i.e., double
2328 reductions:
2330 a1 = phi < a0, a2 >
2331 inner loop (def of a3)
2332 a2 = phi < a3 >
2334 If MODIFY is true it also tries to rework the code in-place to enable
2335 detection of more reduction patterns. For the time being we rewrite
2336 "res -= RHS" into "res += -RHS" when it seems worthwhile. */
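/* As an illustrative example (placeholder names), the loop
     for (i = 0; i < n; i++)
       sum += a[i];
   exposes the cycle
     sum_1 = phi <sum_0, sum_2>
     sum_2 = a[i] + sum_1
   which matches pattern (1) with "operation" being a PLUS_EXPR.  */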
2339 static gimple
2340 vect_is_simple_reduction_1 (loop_vec_info loop_info, gimple phi,
2341 bool check_reduction, bool *double_reduc,
2342 bool modify, bool need_wrapping_integral_overflow)
2344 struct loop *loop = (gimple_bb (phi))->loop_father;
2345 struct loop *vect_loop = LOOP_VINFO_LOOP (loop_info);
2346 edge latch_e = loop_latch_edge (loop);
2347 tree loop_arg = PHI_ARG_DEF_FROM_EDGE (phi, latch_e);
2348 gimple def_stmt, def1 = NULL, def2 = NULL;
2349 enum tree_code orig_code, code;
2350 tree op1, op2, op3 = NULL_TREE, op4 = NULL_TREE;
2351 tree type;
2352 int nloop_uses;
2353 tree name;
2354 imm_use_iterator imm_iter;
2355 use_operand_p use_p;
2356 bool phi_def;
2358 *double_reduc = false;
2360 /* If CHECK_REDUCTION is true, we assume inner-most loop vectorization;
2361 otherwise, we assume outer loop vectorization. */
2362 gcc_assert ((check_reduction && loop == vect_loop)
2363 || (!check_reduction && flow_loop_nested_p (vect_loop, loop)));
2365 name = PHI_RESULT (phi);
2366 /* ??? If there are no uses of the PHI result the inner loop reduction
2367 won't be detected as possibly double-reduction by vectorizable_reduction
2368 because that tries to walk the PHI arg from the preheader edge which
2369 can be constant. See PR60382. */
2370 if (has_zero_uses (name))
2371 return NULL;
2372 nloop_uses = 0;
2373 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, name)
2375 gimple use_stmt = USE_STMT (use_p);
2376 if (is_gimple_debug (use_stmt))
2377 continue;
2379 if (!flow_bb_inside_loop_p (loop, gimple_bb (use_stmt)))
2381 if (dump_enabled_p ())
2382 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2383 "intermediate value used outside loop.\n");
2385 return NULL;
2388 nloop_uses++;
2389 if (nloop_uses > 1)
2391 if (dump_enabled_p ())
2392 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2393 "reduction used in loop.\n");
2394 return NULL;
2398 if (TREE_CODE (loop_arg) != SSA_NAME)
2400 if (dump_enabled_p ())
2402 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2403 "reduction: not ssa_name: ");
2404 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, loop_arg);
2405 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
2407 return NULL;
2410 def_stmt = SSA_NAME_DEF_STMT (loop_arg);
2411 if (!def_stmt)
2413 if (dump_enabled_p ())
2414 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2415 "reduction: no def_stmt.\n");
2416 return NULL;
2419 if (!is_gimple_assign (def_stmt) && gimple_code (def_stmt) != GIMPLE_PHI)
2421 if (dump_enabled_p ())
2423 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, def_stmt, 0);
2424 dump_printf (MSG_NOTE, "\n");
2426 return NULL;
2429 if (is_gimple_assign (def_stmt))
2431 name = gimple_assign_lhs (def_stmt);
2432 phi_def = false;
2434 else
2436 name = PHI_RESULT (def_stmt);
2437 phi_def = true;
2440 nloop_uses = 0;
2441 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, name)
2443 gimple use_stmt = USE_STMT (use_p);
2444 if (is_gimple_debug (use_stmt))
2445 continue;
2446 if (flow_bb_inside_loop_p (loop, gimple_bb (use_stmt)))
2447 nloop_uses++;
2448 if (nloop_uses > 1)
2450 if (dump_enabled_p ())
2451 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2452 "reduction used in loop.\n");
2453 return NULL;
2457 /* If DEF_STMT is a phi node itself, we expect it to have a single argument
2458 defined in the inner loop. */
2459 if (phi_def)
2461 op1 = PHI_ARG_DEF (def_stmt, 0);
2463 if (gimple_phi_num_args (def_stmt) != 1
2464 || TREE_CODE (op1) != SSA_NAME)
2466 if (dump_enabled_p ())
2467 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2468 "unsupported phi node definition.\n");
2470 return NULL;
2473 def1 = SSA_NAME_DEF_STMT (op1);
2474 if (gimple_bb (def1)
2475 && flow_bb_inside_loop_p (loop, gimple_bb (def_stmt))
2476 && loop->inner
2477 && flow_bb_inside_loop_p (loop->inner, gimple_bb (def1))
2478 && is_gimple_assign (def1))
2480 if (dump_enabled_p ())
2481 report_vect_op (MSG_NOTE, def_stmt,
2482 "detected double reduction: ");
2484 *double_reduc = true;
2485 return def_stmt;
2488 return NULL;
2491 code = orig_code = gimple_assign_rhs_code (def_stmt);
2493 /* We can handle "res -= x[i]", which is non-associative, by
2494 simply rewriting it into "res += -x[i]". Avoid changing the
2495 gimple instruction for the first simple tests and only do this
2496 if we're allowed to change the code at all. */
2497 if (code == MINUS_EXPR
2498 && modify
2499 && (op1 = gimple_assign_rhs1 (def_stmt))
2500 && TREE_CODE (op1) == SSA_NAME
2501 && SSA_NAME_DEF_STMT (op1) == phi)
2502 code = PLUS_EXPR;
2504 if (check_reduction
2505 && (!commutative_tree_code (code) || !associative_tree_code (code)))
2507 if (dump_enabled_p ())
2508 report_vect_op (MSG_MISSED_OPTIMIZATION, def_stmt,
2509 "reduction: not commutative/associative: ");
2510 return NULL;
2513 if (get_gimple_rhs_class (code) != GIMPLE_BINARY_RHS)
2515 if (code != COND_EXPR)
2517 if (dump_enabled_p ())
2518 report_vect_op (MSG_MISSED_OPTIMIZATION, def_stmt,
2519 "reduction: not binary operation: ");
2521 return NULL;
2524 op3 = gimple_assign_rhs1 (def_stmt);
2525 if (COMPARISON_CLASS_P (op3))
2527 op4 = TREE_OPERAND (op3, 1);
2528 op3 = TREE_OPERAND (op3, 0);
2531 op1 = gimple_assign_rhs2 (def_stmt);
2532 op2 = gimple_assign_rhs3 (def_stmt);
2534 if (TREE_CODE (op1) != SSA_NAME && TREE_CODE (op2) != SSA_NAME)
2536 if (dump_enabled_p ())
2537 report_vect_op (MSG_MISSED_OPTIMIZATION, def_stmt,
2538 "reduction: uses not ssa_names: ");
2540 return NULL;
2543 else
2545 op1 = gimple_assign_rhs1 (def_stmt);
2546 op2 = gimple_assign_rhs2 (def_stmt);
2548 if (TREE_CODE (op1) != SSA_NAME && TREE_CODE (op2) != SSA_NAME)
2550 if (dump_enabled_p ())
2551 report_vect_op (MSG_MISSED_OPTIMIZATION, def_stmt,
2552 "reduction: uses not ssa_names: ");
2554 return NULL;
2558 type = TREE_TYPE (gimple_assign_lhs (def_stmt));
2559 if ((TREE_CODE (op1) == SSA_NAME
2560 && !types_compatible_p (type,TREE_TYPE (op1)))
2561 || (TREE_CODE (op2) == SSA_NAME
2562 && !types_compatible_p (type, TREE_TYPE (op2)))
2563 || (op3 && TREE_CODE (op3) == SSA_NAME
2564 && !types_compatible_p (type, TREE_TYPE (op3)))
2565 || (op4 && TREE_CODE (op4) == SSA_NAME
2566 && !types_compatible_p (type, TREE_TYPE (op4))))
2568 if (dump_enabled_p ())
2570 dump_printf_loc (MSG_NOTE, vect_location,
2571 "reduction: multiple types: operation type: ");
2572 dump_generic_expr (MSG_NOTE, TDF_SLIM, type);
2573 dump_printf (MSG_NOTE, ", operands types: ");
2574 dump_generic_expr (MSG_NOTE, TDF_SLIM,
2575 TREE_TYPE (op1));
2576 dump_printf (MSG_NOTE, ",");
2577 dump_generic_expr (MSG_NOTE, TDF_SLIM,
2578 TREE_TYPE (op2));
2579 if (op3)
2581 dump_printf (MSG_NOTE, ",");
2582 dump_generic_expr (MSG_NOTE, TDF_SLIM,
2583 TREE_TYPE (op3));
2586 if (op4)
2588 dump_printf (MSG_NOTE, ",");
2589 dump_generic_expr (MSG_NOTE, TDF_SLIM,
2590 TREE_TYPE (op4));
2592 dump_printf (MSG_NOTE, "\n");
2595 return NULL;
2598 /* Check that it's ok to change the order of the computation.
2599 Generally, when vectorizing a reduction we change the order of the
2600 computation. This may change the behavior of the program in some
2601 cases, so we need to check that this is ok. One exception is when
2602 vectorizing an outer-loop: the inner-loop is executed sequentially,
2603 and therefore vectorizing reductions in the inner-loop during
2604 outer-loop vectorization is safe. */
2606 /* CHECKME: check for !flag_finite_math_only too? */
2607 if (SCALAR_FLOAT_TYPE_P (type) && !flag_associative_math
2608 && check_reduction)
2610 /* Changing the order of operations changes the semantics. */
2611 if (dump_enabled_p ())
2612 report_vect_op (MSG_MISSED_OPTIMIZATION, def_stmt,
2613 "reduction: unsafe fp math optimization: ");
2614 return NULL;
2616 else if (INTEGRAL_TYPE_P (type) && check_reduction)
2618 if (TYPE_OVERFLOW_TRAPS (type))
2620 /* Changing the order of operations changes the semantics. */
2621 if (dump_enabled_p ())
2622 report_vect_op (MSG_MISSED_OPTIMIZATION, def_stmt,
2623 "reduction: unsafe int math optimization"
2624 " (overflow traps): ");
2625 return NULL;
2627 if (need_wrapping_integral_overflow && !TYPE_OVERFLOW_WRAPS (type))
2629 /* Changing the order of operations changes the semantics. */
2630 if (dump_enabled_p ())
2631 report_vect_op (MSG_MISSED_OPTIMIZATION, def_stmt,
2632 "reduction: unsafe int math optimization"
2633 " (overflow doesn't wrap): ");
2634 return NULL;
2637 else if (SAT_FIXED_POINT_TYPE_P (type) && check_reduction)
2639 /* Changing the order of operations changes the semantics. */
2640 if (dump_enabled_p ())
2641 report_vect_op (MSG_MISSED_OPTIMIZATION, def_stmt,
2642 "reduction: unsafe fixed-point math optimization: ");
2643 return NULL;
2646 /* If we detected "res -= x[i]" earlier, rewrite it into
2647 "res += -x[i]" now. If this turns out to be useless reassoc
2648 will clean it up again. */
2649 if (orig_code == MINUS_EXPR)
2651 tree rhs = gimple_assign_rhs2 (def_stmt);
2652 tree negrhs = make_ssa_name (TREE_TYPE (rhs));
2653 gimple negate_stmt = gimple_build_assign (negrhs, NEGATE_EXPR, rhs);
2654 gimple_stmt_iterator gsi = gsi_for_stmt (def_stmt);
2655 set_vinfo_for_stmt (negate_stmt, new_stmt_vec_info (negate_stmt,
2656 loop_info, NULL));
2657 gsi_insert_before (&gsi, negate_stmt, GSI_NEW_STMT);
2658 gimple_assign_set_rhs2 (def_stmt, negrhs);
2659 gimple_assign_set_rhs_code (def_stmt, PLUS_EXPR);
2660 update_stmt (def_stmt);
2663 /* Reduction is safe. We're dealing with one of the following:
2664 1) integer arithmetic and no trapv
2665 2) floating point arithmetic, and special flags permit this optimization
2666 3) nested cycle (i.e., outer loop vectorization). */
2667 if (TREE_CODE (op1) == SSA_NAME)
2668 def1 = SSA_NAME_DEF_STMT (op1);
2670 if (TREE_CODE (op2) == SSA_NAME)
2671 def2 = SSA_NAME_DEF_STMT (op2);
2673 if (code != COND_EXPR
2674 && ((!def1 || gimple_nop_p (def1)) && (!def2 || gimple_nop_p (def2))))
2676 if (dump_enabled_p ())
2677 report_vect_op (MSG_NOTE, def_stmt, "reduction: no defs for operands: ");
2678 return NULL;
2681 /* Check that one def is the reduction def, defined by PHI,
2682 the other def is either defined in the loop ("vect_internal_def"),
2683 or it's an induction (defined by a loop-header phi-node). */
2685 if (def2 && def2 == phi
2686 && (code == COND_EXPR
2687 || !def1 || gimple_nop_p (def1)
2688 || !flow_bb_inside_loop_p (loop, gimple_bb (def1))
2689 || (def1 && flow_bb_inside_loop_p (loop, gimple_bb (def1))
2690 && (is_gimple_assign (def1)
2691 || is_gimple_call (def1)
2692 || STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def1))
2693 == vect_induction_def
2694 || (gimple_code (def1) == GIMPLE_PHI
2695 && STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def1))
2696 == vect_internal_def
2697 && !is_loop_header_bb_p (gimple_bb (def1)))))))
2699 if (dump_enabled_p ())
2700 report_vect_op (MSG_NOTE, def_stmt, "detected reduction: ");
2701 return def_stmt;
2704 if (def1 && def1 == phi
2705 && (code == COND_EXPR
2706 || !def2 || gimple_nop_p (def2)
2707 || !flow_bb_inside_loop_p (loop, gimple_bb (def2))
2708 || (def2 && flow_bb_inside_loop_p (loop, gimple_bb (def2))
2709 && (is_gimple_assign (def2)
2710 || is_gimple_call (def2)
2711 || STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def2))
2712 == vect_induction_def
2713 || (gimple_code (def2) == GIMPLE_PHI
2714 && STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def2))
2715 == vect_internal_def
2716 && !is_loop_header_bb_p (gimple_bb (def2)))))))
2718 if (check_reduction)
2720 /* Swap operands (just for simplicity - so that the rest of the code
2721 can assume that the reduction variable is always the last (second)
2722 argument). */
2723 if (dump_enabled_p ())
2724 report_vect_op (MSG_NOTE, def_stmt,
2725 "detected reduction: need to swap operands: ");
2727 swap_ssa_operands (def_stmt, gimple_assign_rhs1_ptr (def_stmt),
2728 gimple_assign_rhs2_ptr (def_stmt));
2730 if (CONSTANT_CLASS_P (gimple_assign_rhs1 (def_stmt)))
2731 LOOP_VINFO_OPERANDS_SWAPPED (loop_info) = true;
2733 else
2735 if (dump_enabled_p ())
2736 report_vect_op (MSG_NOTE, def_stmt, "detected reduction: ");
2739 return def_stmt;
2742 /* Try to find SLP reduction chain. */
2743 if (check_reduction && vect_is_slp_reduction (loop_info, phi, def_stmt))
2745 if (dump_enabled_p ())
2746 report_vect_op (MSG_NOTE, def_stmt,
2747 "reduction: detected reduction chain: ");
2749 return def_stmt;
2752 if (dump_enabled_p ())
2753 report_vect_op (MSG_MISSED_OPTIMIZATION, def_stmt,
2754 "reduction: unknown pattern: ");
2756 return NULL;
2759 /* Wrapper around vect_is_simple_reduction_1, that won't modify code
2760 in-place. Arguments as there. */
2762 static gimple
2763 vect_is_simple_reduction (loop_vec_info loop_info, gimple phi,
2764 bool check_reduction, bool *double_reduc,
2765 bool need_wrapping_integral_overflow)
2767 return vect_is_simple_reduction_1 (loop_info, phi, check_reduction,
2768 double_reduc, false,
2769 need_wrapping_integral_overflow);
2772 /* Wrapper around vect_is_simple_reduction_1, which will modify code
2773 in-place if it enables detection of more reductions. Arguments
2774 as there. */
2776 gimple
2777 vect_force_simple_reduction (loop_vec_info loop_info, gimple phi,
2778 bool check_reduction, bool *double_reduc,
2779 bool need_wrapping_integral_overflow)
2781 return vect_is_simple_reduction_1 (loop_info, phi, check_reduction,
2782 double_reduc, true,
2783 need_wrapping_integral_overflow);
2786 /* Calculate cost of peeling the loop PEEL_ITERS_PROLOGUE times. */
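/* For example (illustrative numbers only): with VF = 8, a prologue of 3
   peeled iterations and 20 known scalar iterations, the epilogue is charged
   (20 - 3) % 8 = 1 iteration, and the scalar per-iteration costs are scaled
   by those peel counts.  */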
int
2788 vect_get_known_peeling_cost (loop_vec_info loop_vinfo, int peel_iters_prologue,
2789 int *peel_iters_epilogue,
2790 stmt_vector_for_cost *scalar_cost_vec,
2791 stmt_vector_for_cost *prologue_cost_vec,
2792 stmt_vector_for_cost *epilogue_cost_vec)
2794 int retval = 0;
2795 int vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
2797 if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
2799 *peel_iters_epilogue = vf/2;
2800 if (dump_enabled_p ())
2801 dump_printf_loc (MSG_NOTE, vect_location,
2802 "cost model: epilogue peel iters set to vf/2 "
2803 "because loop iterations are unknown.\n");
2805 /* If peeled iterations are known but the number of scalar loop
2806 iterations is unknown, count a taken branch per peeled loop. */
2807 retval = record_stmt_cost (prologue_cost_vec, 1, cond_branch_taken,
2808 NULL, 0, vect_prologue);
2809 retval = record_stmt_cost (prologue_cost_vec, 1, cond_branch_taken,
2810 NULL, 0, vect_epilogue);
2812 else
2814 int niters = LOOP_VINFO_INT_NITERS (loop_vinfo);
2815 peel_iters_prologue = niters < peel_iters_prologue ?
2816 niters : peel_iters_prologue;
2817 *peel_iters_epilogue = (niters - peel_iters_prologue) % vf;
2818 /* If we need to peel for gaps, but no epilogue peeling turned out to be
2819 required, we still have to peel VF iterations. */
2820 if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo) && !*peel_iters_epilogue)
2821 *peel_iters_epilogue = vf;
2824 stmt_info_for_cost *si;
2825 int j;
2826 if (peel_iters_prologue)
2827 FOR_EACH_VEC_ELT (*scalar_cost_vec, j, si)
2828 retval += record_stmt_cost (prologue_cost_vec,
2829 si->count * peel_iters_prologue,
2830 si->kind, NULL, si->misalign,
2831 vect_prologue);
2832 if (*peel_iters_epilogue)
2833 FOR_EACH_VEC_ELT (*scalar_cost_vec, j, si)
2834 retval += record_stmt_cost (epilogue_cost_vec,
2835 si->count * *peel_iters_epilogue,
2836 si->kind, NULL, si->misalign,
2837 vect_epilogue);
2839 return retval;
2842 /* Function vect_estimate_min_profitable_iters
2844 Return the number of iterations required for the vector version of the
2845 loop to be profitable relative to the cost of the scalar version of the
2846 loop. */
2848 static void
2849 vect_estimate_min_profitable_iters (loop_vec_info loop_vinfo,
2850 int *ret_min_profitable_niters,
2851 int *ret_min_profitable_estimate)
2853 int min_profitable_iters;
2854 int min_profitable_estimate;
2855 int peel_iters_prologue;
2856 int peel_iters_epilogue;
2857 unsigned vec_inside_cost = 0;
2858 int vec_outside_cost = 0;
2859 unsigned vec_prologue_cost = 0;
2860 unsigned vec_epilogue_cost = 0;
2861 int scalar_single_iter_cost = 0;
2862 int scalar_outside_cost = 0;
2863 int vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
2864 int npeel = LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo);
2865 void *target_cost_data = LOOP_VINFO_TARGET_COST_DATA (loop_vinfo);
2867 /* Cost model disabled. */
2868 if (unlimited_cost_model (LOOP_VINFO_LOOP (loop_vinfo)))
2870 dump_printf_loc (MSG_NOTE, vect_location, "cost model disabled.\n");
2871 *ret_min_profitable_niters = 0;
2872 *ret_min_profitable_estimate = 0;
2873 return;
2876 /* Requires loop versioning tests to handle misalignment. */
2877 if (LOOP_REQUIRES_VERSIONING_FOR_ALIGNMENT (loop_vinfo))
2879 /* FIXME: Make cost depend on complexity of individual check. */
2880 unsigned len = LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).length ();
2881 (void) add_stmt_cost (target_cost_data, len, vector_stmt, NULL, 0,
2882 vect_prologue);
2883 dump_printf (MSG_NOTE,
2884 "cost model: Adding cost of checks for loop "
2885 "versioning to treat misalignment.\n");
2888 /* Requires loop versioning with alias checks. */
2889 if (LOOP_REQUIRES_VERSIONING_FOR_ALIAS (loop_vinfo))
2891 /* FIXME: Make cost depend on complexity of individual check. */
2892 unsigned len = LOOP_VINFO_MAY_ALIAS_DDRS (loop_vinfo).length ();
2893 (void) add_stmt_cost (target_cost_data, len, vector_stmt, NULL, 0,
2894 vect_prologue);
2895 dump_printf (MSG_NOTE,
2896 "cost model: Adding cost of checks for loop "
2897 "versioning for aliasing.\n");
2900 if (LOOP_REQUIRES_VERSIONING_FOR_ALIGNMENT (loop_vinfo)
2901 || LOOP_REQUIRES_VERSIONING_FOR_ALIAS (loop_vinfo))
2902 (void) add_stmt_cost (target_cost_data, 1, cond_branch_taken, NULL, 0,
2903 vect_prologue);
2905 /* Count statements in scalar loop. Using this as scalar cost for a single
2906 iteration for now.
2908 TODO: Add outer loop support.
2910 TODO: Consider assigning different costs to different scalar
2911 statements. */
2913 scalar_single_iter_cost
2914 = LOOP_VINFO_SINGLE_SCALAR_ITERATION_COST (loop_vinfo);
2916 /* Add additional cost for the peeled instructions in prologue and epilogue
2917 loop.
2919 FORNOW: If we don't know the value of peel_iters for prologue or epilogue
2920 at compile-time - we assume it's vf/2 (the worst would be vf-1).
2922 TODO: Build an expression that represents peel_iters for prologue and
2923 epilogue to be used in a run-time test. */
2925 if (npeel < 0)
2927 peel_iters_prologue = vf/2;
2928 dump_printf (MSG_NOTE, "cost model: "
2929 "prologue peel iters set to vf/2.\n");
2931 /* If peeling for alignment is unknown, loop bound of main loop becomes
2932 unknown. */
2933 peel_iters_epilogue = vf/2;
2934 dump_printf (MSG_NOTE, "cost model: "
2935 "epilogue peel iters set to vf/2 because "
2936 "peeling for alignment is unknown.\n");
2938 /* If peeled iterations are unknown, count a taken branch and a not taken
2939 branch per peeled loop. Even if scalar loop iterations are known,
2940 vector iterations are not known since peeled prologue iterations are
2941 not known. Hence guards remain the same. */
2942 (void) add_stmt_cost (target_cost_data, 1, cond_branch_taken,
2943 NULL, 0, vect_prologue);
2944 (void) add_stmt_cost (target_cost_data, 1, cond_branch_not_taken,
2945 NULL, 0, vect_prologue);
2946 (void) add_stmt_cost (target_cost_data, 1, cond_branch_taken,
2947 NULL, 0, vect_epilogue);
2948 (void) add_stmt_cost (target_cost_data, 1, cond_branch_not_taken,
2949 NULL, 0, vect_epilogue);
2950 stmt_info_for_cost *si;
2951 int j;
2952 FOR_EACH_VEC_ELT (LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo), j, si)
2954 struct _stmt_vec_info *stmt_info
2955 = si->stmt ? vinfo_for_stmt (si->stmt) : NULL;
2956 (void) add_stmt_cost (target_cost_data,
2957 si->count * peel_iters_prologue,
2958 si->kind, stmt_info, si->misalign,
2959 vect_prologue);
2960 (void) add_stmt_cost (target_cost_data,
2961 si->count * peel_iters_epilogue,
2962 si->kind, stmt_info, si->misalign,
2963 vect_epilogue);
2966 else
2968 stmt_vector_for_cost prologue_cost_vec, epilogue_cost_vec;
2969 stmt_info_for_cost *si;
2970 int j;
2971 void *data = LOOP_VINFO_TARGET_COST_DATA (loop_vinfo);
2973 prologue_cost_vec.create (2);
2974 epilogue_cost_vec.create (2);
2975 peel_iters_prologue = npeel;
2977 (void) vect_get_known_peeling_cost (loop_vinfo, peel_iters_prologue,
2978 &peel_iters_epilogue,
2979 &LOOP_VINFO_SCALAR_ITERATION_COST
2980 (loop_vinfo),
2981 &prologue_cost_vec,
2982 &epilogue_cost_vec);
2984 FOR_EACH_VEC_ELT (prologue_cost_vec, j, si)
2986 struct _stmt_vec_info *stmt_info
2987 = si->stmt ? vinfo_for_stmt (si->stmt) : NULL;
2988 (void) add_stmt_cost (data, si->count, si->kind, stmt_info,
2989 si->misalign, vect_prologue);
2992 FOR_EACH_VEC_ELT (epilogue_cost_vec, j, si)
2994 struct _stmt_vec_info *stmt_info
2995 = si->stmt ? vinfo_for_stmt (si->stmt) : NULL;
2996 (void) add_stmt_cost (data, si->count, si->kind, stmt_info,
2997 si->misalign, vect_epilogue);
3000 prologue_cost_vec.release ();
3001 epilogue_cost_vec.release ();
3004 /* FORNOW: The scalar outside cost is incremented in one of the
3005 following ways:
3007 1. The vectorizer checks for alignment and aliasing and generates
3008 a condition that allows dynamic vectorization. A cost model
3009 check is ANDED with the versioning condition. Hence scalar code
3010 path now has the added cost of the versioning check.
3012 if (cost > th & versioning_check)
3013 jmp to vector code
3015 Hence run-time scalar is incremented by not-taken branch cost.
3017 2. The vectorizer then checks if a prologue is required. If the
3018 cost model check was not done before during versioning, it has to
3019 be done before the prologue check.
3021 if (cost <= th)
3022 prologue = scalar_iters
3023 if (prologue == 0)
3024 jmp to vector code
3025 else
3026 execute prologue
3027 if (prologue == num_iters)
3028 go to exit
3030 Hence the run-time scalar cost is incremented by a taken branch,
3031 plus a not-taken branch, plus a taken branch cost.
3033 3. The vectorizer then checks if an epilogue is required. If the
3034 cost model check was not done before during prologue check, it
3035 has to be done with the epilogue check.
3037 if (prologue == 0)
3038 jmp to vector code
3039 else
3040 execute prologue
3041 if (prologue == num_iters)
3042 go to exit
3043 vector code:
3044 if ((cost <= th) | (scalar_iters-prologue-epilogue == 0))
3045 jmp to epilogue
3047 Hence the run-time scalar cost should be incremented by 2 taken
3048 branches.
3050 TODO: The back end may reorder the BBs differently and reverse
3051 conditions/branch directions. Change the estimates below to
3052 something more reasonable. */
3054 /* If the number of iterations is known and we do not do versioning, we can
3055 decide whether to vectorize at compile time. Hence the scalar version
3056 does not carry cost model guard costs. */
3057 if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
3058 || LOOP_REQUIRES_VERSIONING_FOR_ALIGNMENT (loop_vinfo)
3059 || LOOP_REQUIRES_VERSIONING_FOR_ALIAS (loop_vinfo))
3061 /* Cost model check occurs at versioning. */
3062 if (LOOP_REQUIRES_VERSIONING_FOR_ALIGNMENT (loop_vinfo)
3063 || LOOP_REQUIRES_VERSIONING_FOR_ALIAS (loop_vinfo))
3064 scalar_outside_cost += vect_get_stmt_cost (cond_branch_not_taken);
3065 else
3067 /* Cost model check occurs at prologue generation. */
3068 if (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) < 0)
3069 scalar_outside_cost += 2 * vect_get_stmt_cost (cond_branch_taken)
3070 + vect_get_stmt_cost (cond_branch_not_taken);
3071 /* Cost model check occurs at epilogue generation. */
3072 else
3073 scalar_outside_cost += 2 * vect_get_stmt_cost (cond_branch_taken);
3077 /* Complete the target-specific cost calculations. */
3078 finish_cost (LOOP_VINFO_TARGET_COST_DATA (loop_vinfo), &vec_prologue_cost,
3079 &vec_inside_cost, &vec_epilogue_cost);
3081 vec_outside_cost = (int)(vec_prologue_cost + vec_epilogue_cost);
3083 if (dump_enabled_p ())
3085 dump_printf_loc (MSG_NOTE, vect_location, "Cost model analysis: \n");
3086 dump_printf (MSG_NOTE, " Vector inside of loop cost: %d\n",
3087 vec_inside_cost);
3088 dump_printf (MSG_NOTE, " Vector prologue cost: %d\n",
3089 vec_prologue_cost);
3090 dump_printf (MSG_NOTE, " Vector epilogue cost: %d\n",
3091 vec_epilogue_cost);
3092 dump_printf (MSG_NOTE, " Scalar iteration cost: %d\n",
3093 scalar_single_iter_cost);
3094 dump_printf (MSG_NOTE, " Scalar outside cost: %d\n",
3095 scalar_outside_cost);
3096 dump_printf (MSG_NOTE, " Vector outside cost: %d\n",
3097 vec_outside_cost);
3098 dump_printf (MSG_NOTE, " prologue iterations: %d\n",
3099 peel_iters_prologue);
3100 dump_printf (MSG_NOTE, " epilogue iterations: %d\n",
3101 peel_iters_epilogue);
3104 /* Calculate number of iterations required to make the vector version
3105 profitable, relative to the loop bodies only. The following condition
3106 must hold true:
3107 SIC * niters + SOC > VIC * ((niters-PL_ITERS-EP_ITERS)/VF) + VOC
3108 where
3109 SIC = scalar iteration cost, VIC = vector iteration cost,
3110 VOC = vector outside cost, VF = vectorization factor,
3111 PL_ITERS = prologue iterations, EP_ITERS = epilogue iterations,
3112 SOC = scalar outside cost for run time cost model check. */
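/* As a purely illustrative numeric example: with SIC = 1, VIC = 2, VF = 4,
   VOC = 10, SOC = 0 and no peeling, the condition reads
   niters > 2 * (niters / 4) + 10, so the two versions break even at about
   20 iterations and the vector loop only pays off for larger trip counts.  */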
3114 if ((scalar_single_iter_cost * vf) > (int) vec_inside_cost)
3116 if (vec_outside_cost <= 0)
3117 min_profitable_iters = 1;
3118 else
3120 min_profitable_iters = ((vec_outside_cost - scalar_outside_cost) * vf
3121 - vec_inside_cost * peel_iters_prologue
3122 - vec_inside_cost * peel_iters_epilogue)
3123 / ((scalar_single_iter_cost * vf)
3124 - vec_inside_cost);
3126 if ((scalar_single_iter_cost * vf * min_profitable_iters)
3127 <= (((int) vec_inside_cost * min_profitable_iters)
3128 + (((int) vec_outside_cost - scalar_outside_cost) * vf)))
3129 min_profitable_iters++;
3132 /* vector version will never be profitable. */
3133 else
3135 if (LOOP_VINFO_LOOP (loop_vinfo)->force_vectorize)
3136 warning_at (vect_location, OPT_Wopenmp_simd, "vectorization "
3137 "did not happen for a simd loop");
3139 if (dump_enabled_p ())
3140 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3141 "cost model: the vector iteration cost = %d "
3142 "divided by the scalar iteration cost = %d "
3143 "is greater than or equal to the vectorization factor = %d"
3144 ".\n",
3145 vec_inside_cost, scalar_single_iter_cost, vf);
3146 *ret_min_profitable_niters = -1;
3147 *ret_min_profitable_estimate = -1;
3148 return;
3151 dump_printf (MSG_NOTE,
3152 " Calculated minimum iters for profitability: %d\n",
3153 min_profitable_iters);
3155 min_profitable_iters =
3156 min_profitable_iters < vf ? vf : min_profitable_iters;
3158 /* Because the condition we create is:
3159 if (niters <= min_profitable_iters)
3160 then skip the vectorized loop. */
3161 min_profitable_iters--;
3163 if (dump_enabled_p ())
3164 dump_printf_loc (MSG_NOTE, vect_location,
3165 " Runtime profitability threshold = %d\n",
3166 min_profitable_iters);
3168 *ret_min_profitable_niters = min_profitable_iters;
3170 /* Calculate number of iterations required to make the vector version
3171 profitable, relative to the loop bodies only.
3173 The non-vectorized variant costs SIC * niters and must win over the vector
3174 variant on the expected loop trip count. The following condition must hold true:
3175 SIC * niters > VIC * ((niters-PL_ITERS-EP_ITERS)/VF) + VOC + SOC */
3177 if (vec_outside_cost <= 0)
3178 min_profitable_estimate = 1;
3179 else
3181 min_profitable_estimate = ((vec_outside_cost + scalar_outside_cost) * vf
3182 - vec_inside_cost * peel_iters_prologue
3183 - vec_inside_cost * peel_iters_epilogue)
3184 / ((scalar_single_iter_cost * vf)
3185 - vec_inside_cost);
3187 min_profitable_estimate --;
3188 min_profitable_estimate = MAX (min_profitable_estimate, min_profitable_iters);
3189 if (dump_enabled_p ())
3190 dump_printf_loc (MSG_NOTE, vect_location,
3191 " Static estimate profitability threshold = %d\n",
3192 min_profitable_estimate);
3194 *ret_min_profitable_estimate = min_profitable_estimate;
3197 /* Writes into SEL a mask for a vec_perm, equivalent to a vec_shr by OFFSET
3198 vector elements (not bits) for a vector of mode MODE. */
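/* For example (purely illustrative), for a 4-element vector mode and
   OFFSET 1 the selector is {1, 2, 3, 4}: each result element is taken from
   the next position, with index 4 referring to the first element of the
   second vec_perm input.  */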
3199 static void
3200 calc_vec_perm_mask_for_shift (enum machine_mode mode, unsigned int offset,
3201 unsigned char *sel)
3203 unsigned int i, nelt = GET_MODE_NUNITS (mode);
3205 for (i = 0; i < nelt; i++)
3206 sel[i] = (i + offset) & (2*nelt - 1);
3209 /* Checks whether the target supports whole-vector shifts for vectors of mode
3210 MODE. This is the case if _either_ the platform handles vec_shr_optab, _or_
3211 it supports vec_perm_const with masks for all necessary shift amounts. */
3212 static bool
3213 have_whole_vector_shift (enum machine_mode mode)
3215 if (optab_handler (vec_shr_optab, mode) != CODE_FOR_nothing)
3216 return true;
3218 if (direct_optab_handler (vec_perm_const_optab, mode) == CODE_FOR_nothing)
3219 return false;
3221 unsigned int i, nelt = GET_MODE_NUNITS (mode);
3222 unsigned char *sel = XALLOCAVEC (unsigned char, nelt);
3224 for (i = nelt/2; i >= 1; i/=2)
3226 calc_vec_perm_mask_for_shift (mode, i, sel);
3227 if (!can_vec_perm_p (mode, false, sel))
3228 return false;
3230 return true;
3233 /* Return the reduction operand (with index REDUC_INDEX) of STMT. */
3235 static tree
3236 get_reduction_op (gimple stmt, int reduc_index)
3238 switch (get_gimple_rhs_class (gimple_assign_rhs_code (stmt)))
3240 case GIMPLE_SINGLE_RHS:
3241 gcc_assert (TREE_OPERAND_LENGTH (gimple_assign_rhs1 (stmt))
3242 == ternary_op);
3243 return TREE_OPERAND (gimple_assign_rhs1 (stmt), reduc_index);
3244 case GIMPLE_UNARY_RHS:
3245 return gimple_assign_rhs1 (stmt);
3246 case GIMPLE_BINARY_RHS:
3247 return (reduc_index
3248 ? gimple_assign_rhs2 (stmt) : gimple_assign_rhs1 (stmt));
3249 case GIMPLE_TERNARY_RHS:
3250 return gimple_op (stmt, reduc_index + 1);
3251 default:
3252 gcc_unreachable ();
3256 /* TODO: Close dependency between vect_model_*_cost and vectorizable_*
3257 functions. Design better to avoid maintenance issues. */
3259 /* Function vect_model_reduction_cost.
3261 Models cost for a reduction operation, including the vector ops
3262 generated within the strip-mine loop, the initial definition before
3263 the loop, and the epilogue code that must be generated. */
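/* As an illustrative example: for a 4-element vector reduction with no
   REDUC_* tree code and no whole-vector shift available, the epilogue below
   is charged N + (N - 1) = 7 vector_stmt operations (4 extracts plus 3
   scalar reduction ops).  */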
3265 static bool
3266 vect_model_reduction_cost (stmt_vec_info stmt_info, enum tree_code reduc_code,
3267 int ncopies, int reduc_index)
3269 int prologue_cost = 0, epilogue_cost = 0;
3270 enum tree_code code;
3271 optab optab;
3272 tree vectype;
3273 gimple stmt, orig_stmt;
3274 tree reduction_op;
3275 machine_mode mode;
3276 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
3277 struct loop *loop = NULL;
3278 void *target_cost_data;
3280 if (loop_vinfo)
3282 loop = LOOP_VINFO_LOOP (loop_vinfo);
3283 target_cost_data = LOOP_VINFO_TARGET_COST_DATA (loop_vinfo);
3285 else
3286 target_cost_data = BB_VINFO_TARGET_COST_DATA (STMT_VINFO_BB_VINFO (stmt_info));
3288 /* Cost of reduction op inside loop. */
3289 unsigned inside_cost = add_stmt_cost (target_cost_data, ncopies, vector_stmt,
3290 stmt_info, 0, vect_body);
3291 stmt = STMT_VINFO_STMT (stmt_info);
3293 reduction_op = get_reduction_op (stmt, reduc_index);
3295 vectype = get_vectype_for_scalar_type (TREE_TYPE (reduction_op));
3296 if (!vectype)
3298 if (dump_enabled_p ())
3300 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3301 "unsupported data-type ");
3302 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
3303 TREE_TYPE (reduction_op));
3304 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
3306 return false;
3309 mode = TYPE_MODE (vectype);
3310 orig_stmt = STMT_VINFO_RELATED_STMT (stmt_info);
3312 if (!orig_stmt)
3313 orig_stmt = STMT_VINFO_STMT (stmt_info);
3315 code = gimple_assign_rhs_code (orig_stmt);
3317 /* Add in cost for initial definition. */
3318 prologue_cost += add_stmt_cost (target_cost_data, 1, scalar_to_vec,
3319 stmt_info, 0, vect_prologue);
3321 /* Determine cost of epilogue code.
3323 We have a reduction operator that will reduce the vector in one statement.
3324 Also requires scalar extract. */
3326 if (!loop || !nested_in_vect_loop_p (loop, orig_stmt))
3328 if (reduc_code != ERROR_MARK)
3330 epilogue_cost += add_stmt_cost (target_cost_data, 1, vector_stmt,
3331 stmt_info, 0, vect_epilogue);
3332 epilogue_cost += add_stmt_cost (target_cost_data, 1, vec_to_scalar,
3333 stmt_info, 0, vect_epilogue);
3335 else
3337 int vec_size_in_bits = tree_to_uhwi (TYPE_SIZE (vectype));
3338 tree bitsize =
3339 TYPE_SIZE (TREE_TYPE (gimple_assign_lhs (orig_stmt)));
3340 int element_bitsize = tree_to_uhwi (bitsize);
3341 int nelements = vec_size_in_bits / element_bitsize;
3343 optab = optab_for_tree_code (code, vectype, optab_default);
3345 /* We have a whole vector shift available. */
3346 if (VECTOR_MODE_P (mode)
3347 && optab_handler (optab, mode) != CODE_FOR_nothing
3348 && have_whole_vector_shift (mode))
3350 /* Final reduction via vector shifts and the reduction operator.
3351 Also requires scalar extract. */
3352 epilogue_cost += add_stmt_cost (target_cost_data,
3353 exact_log2 (nelements) * 2,
3354 vector_stmt, stmt_info, 0,
3355 vect_epilogue);
3356 epilogue_cost += add_stmt_cost (target_cost_data, 1,
3357 vec_to_scalar, stmt_info, 0,
3358 vect_epilogue);
3360 else
3361 /* Use extracts and reduction op for final reduction. For N
3362 elements, we have N extracts and N-1 reduction ops. */
3363 epilogue_cost += add_stmt_cost (target_cost_data,
3364 nelements + nelements - 1,
3365 vector_stmt, stmt_info, 0,
3366 vect_epilogue);
3370 if (dump_enabled_p ())
3371 dump_printf (MSG_NOTE,
3372 "vect_model_reduction_cost: inside_cost = %d, "
3373 "prologue_cost = %d, epilogue_cost = %d .\n", inside_cost,
3374 prologue_cost, epilogue_cost);
3376 return true;
3380 /* Function vect_model_induction_cost.
3382 Models cost for induction operations. */
3384 static void
3385 vect_model_induction_cost (stmt_vec_info stmt_info, int ncopies)
3387 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
3388 void *target_cost_data = LOOP_VINFO_TARGET_COST_DATA (loop_vinfo);
3389 unsigned inside_cost, prologue_cost;
3391 /* loop cost for vec_loop. */
3392 inside_cost = add_stmt_cost (target_cost_data, ncopies, vector_stmt,
3393 stmt_info, 0, vect_body);
3395 /* prologue cost for vec_init and vec_step. */
3396 prologue_cost = add_stmt_cost (target_cost_data, 2, scalar_to_vec,
3397 stmt_info, 0, vect_prologue);
3399 if (dump_enabled_p ())
3400 dump_printf_loc (MSG_NOTE, vect_location,
3401 "vect_model_induction_cost: inside_cost = %d, "
3402 "prologue_cost = %d .\n", inside_cost, prologue_cost);
3406 /* Function get_initial_def_for_induction
3408 Input:
3409 STMT - a stmt that performs an induction operation in the loop.
3410 IV_PHI - the initial value of the induction variable
3412 Output:
3413 Return a vector variable, initialized with the first VF values of
3414 the induction variable. E.g., for an iv with IV_PHI='X' and
3415 evolution S, for a vector of 4 units, we want to return:
3416 [X, X + S, X + 2*S, X + 3*S]. */
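/* For instance (an illustrative case), an induction variable with initial
   value 0, step 1 and VF = 4 gets the initial vector {0, 1, 2, 3}; the step
   vector used to advance it by a full vector iteration is then {4, 4, 4, 4}.  */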
3418 static tree
3419 get_initial_def_for_induction (gimple iv_phi)
3421 stmt_vec_info stmt_vinfo = vinfo_for_stmt (iv_phi);
3422 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_vinfo);
3423 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
3424 tree vectype;
3425 int nunits;
3426 edge pe = loop_preheader_edge (loop);
3427 struct loop *iv_loop;
3428 basic_block new_bb;
3429 tree new_vec, vec_init, vec_step, t;
3430 tree new_var;
3431 tree new_name;
3432 gimple init_stmt, new_stmt;
3433 gphi *induction_phi;
3434 tree induc_def, vec_def, vec_dest;
3435 tree init_expr, step_expr;
3436 int vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
3437 int i;
3438 int ncopies;
3439 tree expr;
3440 stmt_vec_info phi_info = vinfo_for_stmt (iv_phi);
3441 bool nested_in_vect_loop = false;
3442 gimple_seq stmts = NULL;
3443 imm_use_iterator imm_iter;
3444 use_operand_p use_p;
3445 gimple exit_phi;
3446 edge latch_e;
3447 tree loop_arg;
3448 gimple_stmt_iterator si;
3449 basic_block bb = gimple_bb (iv_phi);
3450 tree stepvectype;
3451 tree resvectype;
3453 /* Is phi in an inner-loop, while vectorizing an enclosing outer-loop? */
3454 if (nested_in_vect_loop_p (loop, iv_phi))
3456 nested_in_vect_loop = true;
3457 iv_loop = loop->inner;
3459 else
3460 iv_loop = loop;
3461 gcc_assert (iv_loop == (gimple_bb (iv_phi))->loop_father);
3463 latch_e = loop_latch_edge (iv_loop);
3464 loop_arg = PHI_ARG_DEF_FROM_EDGE (iv_phi, latch_e);
3466 step_expr = STMT_VINFO_LOOP_PHI_EVOLUTION_PART (phi_info);
3467 gcc_assert (step_expr != NULL_TREE);
3469 pe = loop_preheader_edge (iv_loop);
3470 init_expr = PHI_ARG_DEF_FROM_EDGE (iv_phi,
3471 loop_preheader_edge (iv_loop));
3473 vectype = get_vectype_for_scalar_type (TREE_TYPE (init_expr));
3474 resvectype = get_vectype_for_scalar_type (TREE_TYPE (PHI_RESULT (iv_phi)));
3475 gcc_assert (vectype);
3476 nunits = TYPE_VECTOR_SUBPARTS (vectype);
3477 ncopies = vf / nunits;
3479 gcc_assert (phi_info);
3480 gcc_assert (ncopies >= 1);
3482 /* Convert the step to the desired type. */
3483 step_expr = force_gimple_operand (fold_convert (TREE_TYPE (vectype),
3484 step_expr),
3485 &stmts, true, NULL_TREE);
3486 if (stmts)
3488 new_bb = gsi_insert_seq_on_edge_immediate (pe, stmts);
3489 gcc_assert (!new_bb);
3492 /* Find the first insertion point in the BB. */
3493 si = gsi_after_labels (bb);
3495 /* Create the vector that holds the initial_value of the induction. */
3496 if (nested_in_vect_loop)
3498 /* iv_loop is nested in the loop to be vectorized. init_expr had already
3499 been created during vectorization of previous stmts. We obtain it
3500 from the STMT_VINFO_VEC_STMT of the defining stmt. */
3501 vec_init = vect_get_vec_def_for_operand (init_expr, iv_phi, NULL);
3502 /* If the initial value is not of proper type, convert it. */
3503 if (!useless_type_conversion_p (vectype, TREE_TYPE (vec_init)))
3505 new_stmt
3506 = gimple_build_assign (vect_get_new_vect_var (vectype,
3507 vect_simple_var,
3508 "vec_iv_"),
3509 VIEW_CONVERT_EXPR,
3510 build1 (VIEW_CONVERT_EXPR, vectype,
3511 vec_init));
3512 vec_init = make_ssa_name (gimple_assign_lhs (new_stmt), new_stmt);
3513 gimple_assign_set_lhs (new_stmt, vec_init);
3514 new_bb = gsi_insert_on_edge_immediate (loop_preheader_edge (iv_loop),
3515 new_stmt);
3516 gcc_assert (!new_bb);
3517 set_vinfo_for_stmt (new_stmt,
3518 new_stmt_vec_info (new_stmt, loop_vinfo, NULL));
3521 else
3523 vec<constructor_elt, va_gc> *v;
3525 /* iv_loop is the loop to be vectorized. Create:
3526 vec_init = [X, X+S, X+2*S, X+3*S] (S = step_expr, X = init_expr) */
3527 new_var = vect_get_new_vect_var (TREE_TYPE (vectype),
3528 vect_scalar_var, "var_");
3529 new_name = force_gimple_operand (fold_convert (TREE_TYPE (vectype),
3530 init_expr),
3531 &stmts, false, new_var);
3532 if (stmts)
3534 new_bb = gsi_insert_seq_on_edge_immediate (pe, stmts);
3535 gcc_assert (!new_bb);
3538 vec_alloc (v, nunits);
3539 bool constant_p = is_gimple_min_invariant (new_name);
3540 CONSTRUCTOR_APPEND_ELT (v, NULL_TREE, new_name);
3541 for (i = 1; i < nunits; i++)
3543 /* Create: new_name_i = new_name + step_expr */
3544 new_name = fold_build2 (PLUS_EXPR, TREE_TYPE (new_name),
3545 new_name, step_expr);
3546 if (!is_gimple_min_invariant (new_name))
3548 init_stmt = gimple_build_assign (new_var, new_name);
3549 new_name = make_ssa_name (new_var, init_stmt);
3550 gimple_assign_set_lhs (init_stmt, new_name);
3551 new_bb = gsi_insert_on_edge_immediate (pe, init_stmt);
3552 gcc_assert (!new_bb);
3553 if (dump_enabled_p ())
3555 dump_printf_loc (MSG_NOTE, vect_location,
3556 "created new init_stmt: ");
3557 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, init_stmt, 0);
3558 dump_printf (MSG_NOTE, "\n");
3560 constant_p = false;
3562 CONSTRUCTOR_APPEND_ELT (v, NULL_TREE, new_name);
3564 /* Create a vector from [new_name_0, new_name_1, ..., new_name_nunits-1] */
3565 if (constant_p)
3566 new_vec = build_vector_from_ctor (vectype, v);
3567 else
3568 new_vec = build_constructor (vectype, v);
3569 vec_init = vect_init_vector (iv_phi, new_vec, vectype, NULL);
3573 /* Create the vector that holds the step of the induction. */
3574 if (nested_in_vect_loop)
3575 /* iv_loop is nested in the loop to be vectorized. Generate:
3576 vec_step = [S, S, S, S] */
3577 new_name = step_expr;
3578 else
3580 /* iv_loop is the loop to be vectorized. Generate:
3581 vec_step = [VF*S, VF*S, VF*S, VF*S] */
3582 if (SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr)))
3584 expr = build_int_cst (integer_type_node, vf);
3585 expr = fold_convert (TREE_TYPE (step_expr), expr);
3587 else
3588 expr = build_int_cst (TREE_TYPE (step_expr), vf);
3589 new_name = fold_build2 (MULT_EXPR, TREE_TYPE (step_expr),
3590 expr, step_expr);
3591 if (TREE_CODE (step_expr) == SSA_NAME)
3592 new_name = vect_init_vector (iv_phi, new_name,
3593 TREE_TYPE (step_expr), NULL);
3596 t = unshare_expr (new_name);
3597 gcc_assert (CONSTANT_CLASS_P (new_name)
3598 || TREE_CODE (new_name) == SSA_NAME);
3599 stepvectype = get_vectype_for_scalar_type (TREE_TYPE (new_name));
3600 gcc_assert (stepvectype);
3601 new_vec = build_vector_from_val (stepvectype, t);
3602 vec_step = vect_init_vector (iv_phi, new_vec, stepvectype, NULL);
3605 /* Create the following def-use cycle:
3606 loop prolog:
3607 vec_init = ...
3608 vec_step = ...
3609 loop:
3610 vec_iv = PHI <vec_init, vec_loop>
3612 STMT
3614 vec_loop = vec_iv + vec_step; */
3616 /* Create the induction-phi that defines the induction-operand. */
3617 vec_dest = vect_get_new_vect_var (vectype, vect_simple_var, "vec_iv_");
3618 induction_phi = create_phi_node (vec_dest, iv_loop->header);
3619 set_vinfo_for_stmt (induction_phi,
3620 new_stmt_vec_info (induction_phi, loop_vinfo, NULL));
3621 induc_def = PHI_RESULT (induction_phi);
3623 /* Create the iv update inside the loop */
3624 new_stmt = gimple_build_assign (vec_dest, PLUS_EXPR, induc_def, vec_step);
3625 vec_def = make_ssa_name (vec_dest, new_stmt);
3626 gimple_assign_set_lhs (new_stmt, vec_def);
3627 gsi_insert_before (&si, new_stmt, GSI_SAME_STMT);
3628 set_vinfo_for_stmt (new_stmt, new_stmt_vec_info (new_stmt, loop_vinfo,
3629 NULL));
3631 /* Set the arguments of the phi node: */
3632 add_phi_arg (induction_phi, vec_init, pe, UNKNOWN_LOCATION);
3633 add_phi_arg (induction_phi, vec_def, loop_latch_edge (iv_loop),
3634 UNKNOWN_LOCATION);
3637 /* In case the vectorization factor (VF) is bigger than the number
3638 of elements that we can fit in a vectype (nunits), we have to generate
3639 more than one vector stmt - i.e., we need to "unroll" the
3640 vector stmt by a factor of VF/nunits. For more details see the documentation
3641 in vectorizable_operation. */
3643 if (ncopies > 1)
3645 stmt_vec_info prev_stmt_vinfo;
3646 /* FORNOW. This restriction should be relaxed. */
3647 gcc_assert (!nested_in_vect_loop);
3649 /* Create the vector that holds the step of the induction. */
3650 if (SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr)))
3652 expr = build_int_cst (integer_type_node, nunits);
3653 expr = fold_convert (TREE_TYPE (step_expr), expr);
3655 else
3656 expr = build_int_cst (TREE_TYPE (step_expr), nunits);
3657 new_name = fold_build2 (MULT_EXPR, TREE_TYPE (step_expr),
3658 expr, step_expr);
3659 if (TREE_CODE (step_expr) == SSA_NAME)
3660 new_name = vect_init_vector (iv_phi, new_name,
3661 TREE_TYPE (step_expr), NULL);
3662 t = unshare_expr (new_name);
3663 gcc_assert (CONSTANT_CLASS_P (new_name)
3664 || TREE_CODE (new_name) == SSA_NAME);
3665 new_vec = build_vector_from_val (stepvectype, t);
3666 vec_step = vect_init_vector (iv_phi, new_vec, stepvectype, NULL);
3668 vec_def = induc_def;
3669 prev_stmt_vinfo = vinfo_for_stmt (induction_phi);
3670 for (i = 1; i < ncopies; i++)
3672 /* vec_i = vec_prev + vec_step */
3673 new_stmt = gimple_build_assign (vec_dest, PLUS_EXPR,
3674 vec_def, vec_step);
3675 vec_def = make_ssa_name (vec_dest, new_stmt);
3676 gimple_assign_set_lhs (new_stmt, vec_def);
3678 gsi_insert_before (&si, new_stmt, GSI_SAME_STMT);
3679 if (!useless_type_conversion_p (resvectype, vectype))
3681 new_stmt
3682 = gimple_build_assign
3683 (vect_get_new_vect_var (resvectype, vect_simple_var,
3684 "vec_iv_"),
3685 VIEW_CONVERT_EXPR,
3686 build1 (VIEW_CONVERT_EXPR, resvectype,
3687 gimple_assign_lhs (new_stmt)));
3688 gimple_assign_set_lhs (new_stmt,
3689 make_ssa_name
3690 (gimple_assign_lhs (new_stmt), new_stmt));
3691 gsi_insert_before (&si, new_stmt, GSI_SAME_STMT);
3693 set_vinfo_for_stmt (new_stmt,
3694 new_stmt_vec_info (new_stmt, loop_vinfo, NULL));
3695 STMT_VINFO_RELATED_STMT (prev_stmt_vinfo) = new_stmt;
3696 prev_stmt_vinfo = vinfo_for_stmt (new_stmt);
3700 if (nested_in_vect_loop)
3702 /* Find the loop-closed exit-phi of the induction, and record
3703 the final vector of induction results: */
3704 exit_phi = NULL;
3705 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, loop_arg)
3707 gimple use_stmt = USE_STMT (use_p);
3708 if (is_gimple_debug (use_stmt))
3709 continue;
3711 if (!flow_bb_inside_loop_p (iv_loop, gimple_bb (use_stmt)))
3713 exit_phi = use_stmt;
3714 break;
3717 if (exit_phi)
3719 stmt_vec_info stmt_vinfo = vinfo_for_stmt (exit_phi);
3720 /* FORNOW. Currently not supporting the case that an inner-loop induction
3721 is not used in the outer-loop (i.e. only outside the outer-loop). */
3722 gcc_assert (STMT_VINFO_RELEVANT_P (stmt_vinfo)
3723 && !STMT_VINFO_LIVE_P (stmt_vinfo));
3725 STMT_VINFO_VEC_STMT (stmt_vinfo) = new_stmt;
3726 if (dump_enabled_p ())
3728 dump_printf_loc (MSG_NOTE, vect_location,
3729 "vector of inductions after inner-loop:");
3730 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, new_stmt, 0);
3731 dump_printf (MSG_NOTE, "\n");
3737 if (dump_enabled_p ())
3739 dump_printf_loc (MSG_NOTE, vect_location,
3740 "transform induction: created def-use cycle: ");
3741 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, induction_phi, 0);
3742 dump_printf (MSG_NOTE, "\n");
3743 dump_gimple_stmt (MSG_NOTE, TDF_SLIM,
3744 SSA_NAME_DEF_STMT (vec_def), 0);
3745 dump_printf (MSG_NOTE, "\n");
3748 STMT_VINFO_VEC_STMT (phi_info) = induction_phi;
3749 if (!useless_type_conversion_p (resvectype, vectype))
3751 new_stmt = gimple_build_assign (vect_get_new_vect_var (resvectype,
3752 vect_simple_var,
3753 "vec_iv_"),
3754 VIEW_CONVERT_EXPR,
3755 build1 (VIEW_CONVERT_EXPR, resvectype,
3756 induc_def));
3757 induc_def = make_ssa_name (gimple_assign_lhs (new_stmt), new_stmt);
3758 gimple_assign_set_lhs (new_stmt, induc_def);
3759 si = gsi_after_labels (bb);
3760 gsi_insert_before (&si, new_stmt, GSI_SAME_STMT);
3761 set_vinfo_for_stmt (new_stmt,
3762 new_stmt_vec_info (new_stmt, loop_vinfo, NULL));
3763 STMT_VINFO_RELATED_STMT (vinfo_for_stmt (new_stmt))
3764 = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (induction_phi));
3767 return induc_def;
3771 /* Function get_initial_def_for_reduction
3773 Input:
3774 STMT - a stmt that performs a reduction operation in the loop.
3775 INIT_VAL - the initial value of the reduction variable
3777 Output:
3778 ADJUSTMENT_DEF - a tree that holds a value to be added to the final result
3779 of the reduction (used for adjusting the epilog - see below).
3780 Return a vector variable, initialized according to the operation that STMT
3781 performs. This vector will be used as the initial value of the
3782 vector of partial results.
3784 Option1 (adjust in epilog): Initialize the vector as follows:
3785 add/bit or/xor: [0,0,...,0,0]
3786 mult/bit and: [1,1,...,1,1]
3787 min/max/cond_expr: [init_val,init_val,..,init_val,init_val]
3788 and when necessary (e.g. add/mult case) let the caller know
3789 that it needs to adjust the result by init_val.
3791 Option2: Initialize the vector as follows:
3792 add/bit or/xor: [init_val,0,0,...,0]
3793 mult/bit and: [init_val,1,1,...,1]
3794 min/max/cond_expr: [init_val,init_val,...,init_val]
3795 and no adjustments are needed.
3797 For example, for the following code:
3799 s = init_val;
3800 for (i=0;i<n;i++)
3801 s = s + a[i];
3803 STMT is 's = s + a[i]', and the reduction variable is 's'.
3804    For a vector of 4 units, we want to return either [init_val,0,0,0]
3805    (Option2), or [0,0,0,0] (Option1) and let the caller know that it
3806    needs to adjust the result at the end by 'init_val'.
3808    FORNOW, we use the 'adjust in epilog' scheme (Option1) when ADJUSTMENT_DEF
3809    is not NULL, because its initialization vector is simpler (the same element
3810    in all entries), and Option2 otherwise.
3812 A cost model should help decide between these two schemes. */
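/* Illustrative example (values chosen only for exposition): for the loop
   above with init_val = 5, a PLUS reduction and 4 elements per vector,
   Option1 returns [0,0,0,0] and reports ADJUSTMENT_DEF = 5, while Option2
   returns [5,0,0,0] and needs no adjustment; in both cases the elements of
   the returned vector seed the vector of partial sums.  */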
3814 tree
3815 get_initial_def_for_reduction (gimple stmt, tree init_val,
3816 tree *adjustment_def)
3818 stmt_vec_info stmt_vinfo = vinfo_for_stmt (stmt);
3819 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_vinfo);
3820 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
3821 tree scalar_type = TREE_TYPE (init_val);
3822 tree vectype = get_vectype_for_scalar_type (scalar_type);
3823 int nunits;
3824 enum tree_code code = gimple_assign_rhs_code (stmt);
3825 tree def_for_init;
3826 tree init_def;
3827 tree *elts;
3828 int i;
3829 bool nested_in_vect_loop = false;
3830 tree init_value;
3831 REAL_VALUE_TYPE real_init_val = dconst0;
3832 int int_init_val = 0;
3833 gimple def_stmt = NULL;
3835 gcc_assert (vectype);
3836 nunits = TYPE_VECTOR_SUBPARTS (vectype);
3838 gcc_assert (POINTER_TYPE_P (scalar_type) || INTEGRAL_TYPE_P (scalar_type)
3839 || SCALAR_FLOAT_TYPE_P (scalar_type));
3841 if (nested_in_vect_loop_p (loop, stmt))
3842 nested_in_vect_loop = true;
3843 else
3844 gcc_assert (loop == (gimple_bb (stmt))->loop_father);
3846 /* In case of double reduction we only create a vector variable to be put
3847 in the reduction phi node. The actual statement creation is done in
3848 vect_create_epilog_for_reduction. */
3849 if (adjustment_def && nested_in_vect_loop
3850 && TREE_CODE (init_val) == SSA_NAME
3851 && (def_stmt = SSA_NAME_DEF_STMT (init_val))
3852 && gimple_code (def_stmt) == GIMPLE_PHI
3853 && flow_bb_inside_loop_p (loop, gimple_bb (def_stmt))
3854 && vinfo_for_stmt (def_stmt)
3855 && STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def_stmt))
3856 == vect_double_reduction_def)
3858 *adjustment_def = NULL;
3859 return vect_create_destination_var (init_val, vectype);
3862 if (TREE_CONSTANT (init_val))
3864 if (SCALAR_FLOAT_TYPE_P (scalar_type))
3865 init_value = build_real (scalar_type, TREE_REAL_CST (init_val));
3866 else
3867 init_value = build_int_cst (scalar_type, TREE_INT_CST_LOW (init_val));
3869 else
3870 init_value = init_val;
3872 switch (code)
3874 case WIDEN_SUM_EXPR:
3875 case DOT_PROD_EXPR:
3876 case SAD_EXPR:
3877 case PLUS_EXPR:
3878 case MINUS_EXPR:
3879 case BIT_IOR_EXPR:
3880 case BIT_XOR_EXPR:
3881 case MULT_EXPR:
3882 case BIT_AND_EXPR:
3883       /* ADJUSTMENT_DEF is NULL when called from
3884 vect_create_epilog_for_reduction to vectorize double reduction. */
3885 if (adjustment_def)
3887 if (nested_in_vect_loop)
3888 *adjustment_def = vect_get_vec_def_for_operand (init_val, stmt,
3889 NULL);
3890 else
3891 *adjustment_def = init_val;
3894 if (code == MULT_EXPR)
3896 real_init_val = dconst1;
3897 int_init_val = 1;
3900 if (code == BIT_AND_EXPR)
3901 int_init_val = -1;
3903 if (SCALAR_FLOAT_TYPE_P (scalar_type))
3904 def_for_init = build_real (scalar_type, real_init_val);
3905 else
3906 def_for_init = build_int_cst (scalar_type, int_init_val);
3908 /* Create a vector of '0' or '1' except the first element. */
3909 elts = XALLOCAVEC (tree, nunits);
3910 for (i = nunits - 2; i >= 0; --i)
3911 elts[i + 1] = def_for_init;
3913 /* Option1: the first element is '0' or '1' as well. */
3914 if (adjustment_def)
3916 elts[0] = def_for_init;
3917 init_def = build_vector (vectype, elts);
3918 break;
3921 /* Option2: the first element is INIT_VAL. */
3922 elts[0] = init_val;
3923 if (TREE_CONSTANT (init_val))
3924 init_def = build_vector (vectype, elts);
3925 else
3927 vec<constructor_elt, va_gc> *v;
3928 vec_alloc (v, nunits);
3929 CONSTRUCTOR_APPEND_ELT (v, NULL_TREE, init_val);
3930 for (i = 1; i < nunits; ++i)
3931 CONSTRUCTOR_APPEND_ELT (v, NULL_TREE, elts[i]);
3932 init_def = build_constructor (vectype, v);
3935 break;
3937 case MIN_EXPR:
3938 case MAX_EXPR:
3939 case COND_EXPR:
3940 if (adjustment_def)
3942 *adjustment_def = NULL_TREE;
3943 init_def = vect_get_vec_def_for_operand (init_val, stmt, NULL);
3944 break;
3947 init_def = build_vector_from_val (vectype, init_value);
3948 break;
3950 default:
3951 gcc_unreachable ();
3954 return init_def;
3957 /* Function vect_create_epilog_for_reduction
3959 Create code at the loop-epilog to finalize the result of a reduction
3960 computation.
3962 VECT_DEFS is list of vector of partial results, i.e., the lhs's of vector
3963 reduction statements.
3964 STMT is the scalar reduction stmt that is being vectorized.
3965 NCOPIES is > 1 in case the vectorization factor (VF) is bigger than the
3966 number of elements that we can fit in a vectype (nunits). In this case
3967 we have to generate more than one vector stmt - i.e - we need to "unroll"
3968 the vector stmt by a factor VF/nunits. For more details see documentation
3969 in vectorizable_operation.
3970 REDUC_CODE is the tree-code for the epilog reduction.
3971 REDUCTION_PHIS is a list of the phi-nodes that carry the reduction
3972 computation.
3973 REDUC_INDEX is the index of the operand in the right hand side of the
3974 statement that is defined by REDUCTION_PHI.
3975 DOUBLE_REDUC is TRUE if double reduction phi nodes should be handled.
3976 SLP_NODE is an SLP node containing a group of reduction statements. The
3977 first one in this group is STMT.
3979 This function:
3980 1. Creates the reduction def-use cycles: sets the arguments for
3981 REDUCTION_PHIS:
3982 The loop-entry argument is the vectorized initial-value of the reduction.
3983 The loop-latch argument is taken from VECT_DEFS - the vector of partial
3984 sums.
3985 2. "Reduces" each vector of partial results VECT_DEFS into a single result,
3986 by applying the operation specified by REDUC_CODE if available, or by
3987 other means (whole-vector shifts or a scalar loop).
3988 The function also creates a new phi node at the loop exit to preserve
3989 loop-closed form, as illustrated below.
3991 The flow at the entry to this function:
3993 loop:
3994 vec_def = phi <null, null> # REDUCTION_PHI
3995 VECT_DEF = vector_stmt # vectorized form of STMT
3996 s_loop = scalar_stmt # (scalar) STMT
3997 loop_exit:
3998 s_out0 = phi <s_loop> # (scalar) EXIT_PHI
3999 use <s_out0>
4000 use <s_out0>
4002 The above is transformed by this function into:
4004 loop:
4005 vec_def = phi <vec_init, VECT_DEF> # REDUCTION_PHI
4006 VECT_DEF = vector_stmt # vectorized form of STMT
4007 s_loop = scalar_stmt # (scalar) STMT
4008 loop_exit:
4009 s_out0 = phi <s_loop> # (scalar) EXIT_PHI
4010 v_out1 = phi <VECT_DEF> # NEW_EXIT_PHI
4011 v_out2 = reduce <v_out1>
4012 s_out3 = extract_field <v_out2, 0>
4013 s_out4 = adjust_result <s_out3>
4014 use <s_out4>
4015 use <s_out4>
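   An informal reading of the diagram above: for a plain sum reduction,
   v_out2 = reduce <v_out1> combines the vector of partial sums into a
   single value, s_out3 = extract_field picks that value out of the vector,
   and s_out4 = adjust_result adds back the initial value when the
   'adjust in epilog' initialization scheme was used (see
   get_initial_def_for_reduction above).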
4018 static void
4019 vect_create_epilog_for_reduction (vec<tree> vect_defs, gimple stmt,
4020 int ncopies, enum tree_code reduc_code,
4021 vec<gimple> reduction_phis,
4022 int reduc_index, bool double_reduc,
4023 slp_tree slp_node)
4025 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
4026 stmt_vec_info prev_phi_info;
4027 tree vectype;
4028 machine_mode mode;
4029 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
4030 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo), *outer_loop = NULL;
4031 basic_block exit_bb;
4032 tree scalar_dest;
4033 tree scalar_type;
4034 gimple new_phi = NULL, phi;
4035 gimple_stmt_iterator exit_gsi;
4036 tree vec_dest;
4037 tree new_temp = NULL_TREE, new_dest, new_name, new_scalar_dest;
4038 gimple epilog_stmt = NULL;
4039 enum tree_code code = gimple_assign_rhs_code (stmt);
4040 gimple exit_phi;
4041 tree bitsize;
4042 tree adjustment_def = NULL;
4043 tree vec_initial_def = NULL;
4044 tree reduction_op, expr, def;
4045 tree orig_name, scalar_result;
4046 imm_use_iterator imm_iter, phi_imm_iter;
4047 use_operand_p use_p, phi_use_p;
4048 gimple use_stmt, orig_stmt, reduction_phi = NULL;
4049 bool nested_in_vect_loop = false;
4050 auto_vec<gimple> new_phis;
4051 auto_vec<gimple> inner_phis;
4052 enum vect_def_type dt = vect_unknown_def_type;
4053 int j, i;
4054 auto_vec<tree> scalar_results;
4055 unsigned int group_size = 1, k, ratio;
4056 auto_vec<tree> vec_initial_defs;
4057 auto_vec<gimple> phis;
4058 bool slp_reduc = false;
4059 tree new_phi_result;
4060 gimple inner_phi = NULL;
4062 if (slp_node)
4063 group_size = SLP_TREE_SCALAR_STMTS (slp_node).length ();
4065 if (nested_in_vect_loop_p (loop, stmt))
4067 outer_loop = loop;
4068 loop = loop->inner;
4069 nested_in_vect_loop = true;
4070 gcc_assert (!slp_node);
4073 reduction_op = get_reduction_op (stmt, reduc_index);
4075 vectype = get_vectype_for_scalar_type (TREE_TYPE (reduction_op));
4076 gcc_assert (vectype);
4077 mode = TYPE_MODE (vectype);
4079 /* 1. Create the reduction def-use cycle:
4080 Set the arguments of REDUCTION_PHIS, i.e., transform
4082 loop:
4083 vec_def = phi <null, null> # REDUCTION_PHI
4084 VECT_DEF = vector_stmt # vectorized form of STMT
4087 into:
4089 loop:
4090 vec_def = phi <vec_init, VECT_DEF> # REDUCTION_PHI
4091 VECT_DEF = vector_stmt # vectorized form of STMT
4094 (in case of SLP, do it for all the phis). */
4096 /* Get the loop-entry arguments. */
4097 if (slp_node)
4098 vect_get_vec_defs (reduction_op, NULL_TREE, stmt, &vec_initial_defs,
4099 NULL, slp_node, reduc_index);
4100 else
4102 vec_initial_defs.create (1);
4103 /* For the case of reduction, vect_get_vec_def_for_operand returns
4104 the scalar def before the loop, that defines the initial value
4105 of the reduction variable. */
4106 vec_initial_def = vect_get_vec_def_for_operand (reduction_op, stmt,
4107 &adjustment_def);
4108 vec_initial_defs.quick_push (vec_initial_def);
4111 /* Set phi nodes arguments. */
4112 FOR_EACH_VEC_ELT (reduction_phis, i, phi)
4114 tree vec_init_def, def;
4115 gimple_seq stmts;
4116 vec_init_def = force_gimple_operand (vec_initial_defs[i], &stmts,
4117 true, NULL_TREE);
4118 gsi_insert_seq_on_edge_immediate (loop_preheader_edge (loop), stmts);
4119 def = vect_defs[i];
4120 for (j = 0; j < ncopies; j++)
4122 /* Set the loop-entry arg of the reduction-phi. */
4123 add_phi_arg (as_a <gphi *> (phi), vec_init_def,
4124 loop_preheader_edge (loop), UNKNOWN_LOCATION);
4126 /* Set the loop-latch arg for the reduction-phi. */
4127 if (j > 0)
4128 def = vect_get_vec_def_for_stmt_copy (vect_unknown_def_type, def);
4130 add_phi_arg (as_a <gphi *> (phi), def, loop_latch_edge (loop),
4131 UNKNOWN_LOCATION);
4133 if (dump_enabled_p ())
4135 dump_printf_loc (MSG_NOTE, vect_location,
4136 "transform reduction: created def-use cycle: ");
4137 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, phi, 0);
4138 dump_printf (MSG_NOTE, "\n");
4139 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, SSA_NAME_DEF_STMT (def), 0);
4140 dump_printf (MSG_NOTE, "\n");
4143 phi = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (phi));
4147 /* 2. Create epilog code.
4148 The reduction epilog code operates across the elements of the vector
4149 of partial results computed by the vectorized loop.
4150 The reduction epilog code consists of:
4152 step 1: compute the scalar result in a vector (v_out2)
4153 step 2: extract the scalar result (s_out3) from the vector (v_out2)
4154 step 3: adjust the scalar result (s_out3) if needed.
4156           Step 1 can be accomplished using one of the following three schemes:
4157 (scheme 1) using reduc_code, if available.
4158 (scheme 2) using whole-vector shifts, if available.
4159 (scheme 3) using a scalar loop. In this case steps 1+2 above are
4160 combined.
4162 The overall epilog code looks like this:
4164 s_out0 = phi <s_loop> # original EXIT_PHI
4165 v_out1 = phi <VECT_DEF> # NEW_EXIT_PHI
4166 v_out2 = reduce <v_out1> # step 1
4167 s_out3 = extract_field <v_out2, 0> # step 2
4168 s_out4 = adjust_result <s_out3> # step 3
4170 (step 3 is optional, and steps 1 and 2 may be combined).
4171 Lastly, the uses of s_out0 are replaced by s_out4. */
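   /* Informal summary of how a scheme is chosen below (a reading of the
      code, not a guarantee): scheme 1 is used when REDUC_CODE is supported
      by the target and this is not an SLP reduction, scheme 2 when the
      target has whole-vector shifts for this mode, and scheme 3 otherwise;
      SLP reductions always extract all the elements with scalar code, as
      noted in 2.3 below.  */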
4174 /* 2.1 Create new loop-exit-phis to preserve loop-closed form:
4175 v_out1 = phi <VECT_DEF>
4176 Store them in NEW_PHIS. */
4178 exit_bb = single_exit (loop)->dest;
4179 prev_phi_info = NULL;
4180 new_phis.create (vect_defs.length ());
4181 FOR_EACH_VEC_ELT (vect_defs, i, def)
4183 for (j = 0; j < ncopies; j++)
4185 tree new_def = copy_ssa_name (def);
4186 phi = create_phi_node (new_def, exit_bb);
4187 set_vinfo_for_stmt (phi, new_stmt_vec_info (phi, loop_vinfo, NULL));
4188 if (j == 0)
4189 new_phis.quick_push (phi);
4190 else
4192 def = vect_get_vec_def_for_stmt_copy (dt, def);
4193 STMT_VINFO_RELATED_STMT (prev_phi_info) = phi;
4196 SET_PHI_ARG_DEF (phi, single_exit (loop)->dest_idx, def);
4197 prev_phi_info = vinfo_for_stmt (phi);
4201 /* The epilogue is created for the outer-loop, i.e., for the loop being
4202 vectorized. Create exit phis for the outer loop. */
4203 if (double_reduc)
4205 loop = outer_loop;
4206 exit_bb = single_exit (loop)->dest;
4207 inner_phis.create (vect_defs.length ());
4208 FOR_EACH_VEC_ELT (new_phis, i, phi)
4210 tree new_result = copy_ssa_name (PHI_RESULT (phi));
4211 gphi *outer_phi = create_phi_node (new_result, exit_bb);
4212 SET_PHI_ARG_DEF (outer_phi, single_exit (loop)->dest_idx,
4213 PHI_RESULT (phi));
4214 set_vinfo_for_stmt (outer_phi, new_stmt_vec_info (outer_phi,
4215 loop_vinfo, NULL));
4216 inner_phis.quick_push (phi);
4217 new_phis[i] = outer_phi;
4218 prev_phi_info = vinfo_for_stmt (outer_phi);
4219 while (STMT_VINFO_RELATED_STMT (vinfo_for_stmt (phi)))
4221 phi = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (phi));
4222 new_result = copy_ssa_name (PHI_RESULT (phi));
4223 outer_phi = create_phi_node (new_result, exit_bb);
4224 SET_PHI_ARG_DEF (outer_phi, single_exit (loop)->dest_idx,
4225 PHI_RESULT (phi));
4226 set_vinfo_for_stmt (outer_phi, new_stmt_vec_info (outer_phi,
4227 loop_vinfo, NULL));
4228 STMT_VINFO_RELATED_STMT (prev_phi_info) = outer_phi;
4229 prev_phi_info = vinfo_for_stmt (outer_phi);
4234 exit_gsi = gsi_after_labels (exit_bb);
4236 /* 2.2 Get the relevant tree-code to use in the epilog for schemes 2,3
4237 (i.e. when reduc_code is not available) and in the final adjustment
4238 code (if needed). Also get the original scalar reduction variable as
4239 defined in the loop. In case STMT is a "pattern-stmt" (i.e. - it
4240 represents a reduction pattern), the tree-code and scalar-def are
4241 taken from the original stmt that the pattern-stmt (STMT) replaces.
4242 Otherwise (it is a regular reduction) - the tree-code and scalar-def
4243 are taken from STMT. */
4245 orig_stmt = STMT_VINFO_RELATED_STMT (stmt_info);
4246 if (!orig_stmt)
4248 /* Regular reduction */
4249 orig_stmt = stmt;
4251 else
4253 /* Reduction pattern */
4254 stmt_vec_info stmt_vinfo = vinfo_for_stmt (orig_stmt);
4255 gcc_assert (STMT_VINFO_IN_PATTERN_P (stmt_vinfo));
4256 gcc_assert (STMT_VINFO_RELATED_STMT (stmt_vinfo) == stmt);
4259 code = gimple_assign_rhs_code (orig_stmt);
4260 /* For MINUS_EXPR the initial vector is [init_val,0,...,0], therefore,
4261 partial results are added and not subtracted. */
4262 if (code == MINUS_EXPR)
4263 code = PLUS_EXPR;
4265 scalar_dest = gimple_assign_lhs (orig_stmt);
4266 scalar_type = TREE_TYPE (scalar_dest);
4267 scalar_results.create (group_size);
4268 new_scalar_dest = vect_create_destination_var (scalar_dest, NULL);
4269 bitsize = TYPE_SIZE (scalar_type);
4271 /* In case this is a reduction in an inner-loop while vectorizing an outer
4272 loop - we don't need to extract a single scalar result at the end of the
4273 inner-loop (unless it is double reduction, i.e., the use of reduction is
4274 outside the outer-loop). The final vector of partial results will be used
4275 in the vectorized outer-loop, or reduced to a scalar result at the end of
4276 the outer-loop. */
4277 if (nested_in_vect_loop && !double_reduc)
4278 goto vect_finalize_reduction;
4280 /* SLP reduction without reduction chain, e.g.,
4281 # a1 = phi <a2, a0>
4282 # b1 = phi <b2, b0>
4283 a2 = operation (a1)
4284 b2 = operation (b1) */
4285 slp_reduc = (slp_node && !GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)));
4287 /* In case of reduction chain, e.g.,
4288 # a1 = phi <a3, a0>
4289 a2 = operation (a1)
4290 a3 = operation (a2),
4292 we may end up with more than one vector result. Here we reduce them to
4293 one vector. */
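  /* For instance (hypothetical shapes): if the chain produced two vector
     partial results v0 and v1, the code below emits v0' = v0 OP v1 and the
     epilog then reduces the single vector v0'.  */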
4294 if (GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)))
4296 tree first_vect = PHI_RESULT (new_phis[0]);
4297 tree tmp;
4298 gassign *new_vec_stmt = NULL;
4300 vec_dest = vect_create_destination_var (scalar_dest, vectype);
4301 for (k = 1; k < new_phis.length (); k++)
4303 gimple next_phi = new_phis[k];
4304 tree second_vect = PHI_RESULT (next_phi);
4306 tmp = build2 (code, vectype, first_vect, second_vect);
4307 new_vec_stmt = gimple_build_assign (vec_dest, tmp);
4308 first_vect = make_ssa_name (vec_dest, new_vec_stmt);
4309 gimple_assign_set_lhs (new_vec_stmt, first_vect);
4310 gsi_insert_before (&exit_gsi, new_vec_stmt, GSI_SAME_STMT);
4313 new_phi_result = first_vect;
4314 if (new_vec_stmt)
4316 new_phis.truncate (0);
4317 new_phis.safe_push (new_vec_stmt);
4320 else
4321 new_phi_result = PHI_RESULT (new_phis[0]);
4323 /* 2.3 Create the reduction code, using one of the three schemes described
4324 above. In SLP we simply need to extract all the elements from the
4325 vector (without reducing them), so we use scalar shifts. */
4326 if (reduc_code != ERROR_MARK && !slp_reduc)
4328 tree tmp;
4329 tree vec_elem_type;
4331 /*** Case 1: Create:
4332 v_out2 = reduc_expr <v_out1> */
4334 if (dump_enabled_p ())
4335 dump_printf_loc (MSG_NOTE, vect_location,
4336 "Reduce using direct vector reduction.\n");
4338 vec_elem_type = TREE_TYPE (TREE_TYPE (new_phi_result));
4339 if (!useless_type_conversion_p (scalar_type, vec_elem_type))
4341 tree tmp_dest =
4342 vect_create_destination_var (scalar_dest, vec_elem_type);
4343 tmp = build1 (reduc_code, vec_elem_type, new_phi_result);
4344 epilog_stmt = gimple_build_assign (tmp_dest, tmp);
4345 new_temp = make_ssa_name (tmp_dest, epilog_stmt);
4346 gimple_assign_set_lhs (epilog_stmt, new_temp);
4347 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
4349 tmp = build1 (NOP_EXPR, scalar_type, new_temp);
4351 else
4352 tmp = build1 (reduc_code, scalar_type, new_phi_result);
4353 epilog_stmt = gimple_build_assign (new_scalar_dest, tmp);
4354 new_temp = make_ssa_name (new_scalar_dest, epilog_stmt);
4355 gimple_assign_set_lhs (epilog_stmt, new_temp);
4356 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
4357 scalar_results.safe_push (new_temp);
4359 else
4361 bool reduce_with_shift = have_whole_vector_shift (mode);
4362 int element_bitsize = tree_to_uhwi (bitsize);
4363 int vec_size_in_bits = tree_to_uhwi (TYPE_SIZE (vectype));
4364 tree vec_temp;
4366 /* Regardless of whether we have a whole vector shift, if we're
4367 emulating the operation via tree-vect-generic, we don't want
4368 to use it. Only the first round of the reduction is likely
4369 to still be profitable via emulation. */
4370 /* ??? It might be better to emit a reduction tree code here, so that
4371 tree-vect-generic can expand the first round via bit tricks. */
4372 if (!VECTOR_MODE_P (mode))
4373 reduce_with_shift = false;
4374 else
4376 optab optab = optab_for_tree_code (code, vectype, optab_default);
4377 if (optab_handler (optab, mode) == CODE_FOR_nothing)
4378 reduce_with_shift = false;
4381 if (reduce_with_shift && !slp_reduc)
4383 int nelements = vec_size_in_bits / element_bitsize;
4384 unsigned char *sel = XALLOCAVEC (unsigned char, nelements);
4386 int elt_offset;
4388 tree zero_vec = build_zero_cst (vectype);
4389 /*** Case 2: Create:
4390 for (offset = nelements/2; offset >= 1; offset/=2)
4392 Create: va' = vec_shift <va, offset>
4393 Create: va = vop <va, va'>
4394 } */
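          /* Example with made-up lane values: for a V4SI sum whose lanes are
             {a, b, c, d}, the first round shifts by 2 elements and adds,
             giving {a+c, b+d, ...}; the second shifts by 1 and adds, giving
             {a+b+c+d, ...}, so the element at bit offset 0 holds the final
             sum that the BIT_FIELD_REF below extracts.  */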
4396 tree rhs;
4398 if (dump_enabled_p ())
4399 dump_printf_loc (MSG_NOTE, vect_location,
4400 "Reduce using vector shifts\n");
4402 vec_dest = vect_create_destination_var (scalar_dest, vectype);
4403 new_temp = new_phi_result;
4404 for (elt_offset = nelements / 2;
4405 elt_offset >= 1;
4406 elt_offset /= 2)
4408 calc_vec_perm_mask_for_shift (mode, elt_offset, sel);
4409 tree mask = vect_gen_perm_mask_any (vectype, sel);
4410 epilog_stmt = gimple_build_assign (vec_dest, VEC_PERM_EXPR,
4411 new_temp, zero_vec, mask);
4412 new_name = make_ssa_name (vec_dest, epilog_stmt);
4413 gimple_assign_set_lhs (epilog_stmt, new_name);
4414 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
4416 epilog_stmt = gimple_build_assign (vec_dest, code, new_name,
4417 new_temp);
4418 new_temp = make_ssa_name (vec_dest, epilog_stmt);
4419 gimple_assign_set_lhs (epilog_stmt, new_temp);
4420 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
4423 /* 2.4 Extract the final scalar result. Create:
4424 s_out3 = extract_field <v_out2, bitpos> */
4426 if (dump_enabled_p ())
4427 dump_printf_loc (MSG_NOTE, vect_location,
4428 "extract scalar result\n");
4430 rhs = build3 (BIT_FIELD_REF, scalar_type, new_temp,
4431 bitsize, bitsize_zero_node);
4432 epilog_stmt = gimple_build_assign (new_scalar_dest, rhs);
4433 new_temp = make_ssa_name (new_scalar_dest, epilog_stmt);
4434 gimple_assign_set_lhs (epilog_stmt, new_temp);
4435 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
4436 scalar_results.safe_push (new_temp);
4438 else
4440 /*** Case 3: Create:
4441 s = extract_field <v_out2, 0>
4442 for (offset = element_size;
4443 offset < vector_size;
4444 offset += element_size;)
4446 Create: s' = extract_field <v_out2, offset>
4447 Create: s = op <s, s'> // For non SLP cases
4448 } */
4450 if (dump_enabled_p ())
4451 dump_printf_loc (MSG_NOTE, vect_location,
4452 "Reduce using scalar code.\n");
4454 vec_size_in_bits = tree_to_uhwi (TYPE_SIZE (vectype));
4455 FOR_EACH_VEC_ELT (new_phis, i, new_phi)
4457 int bit_offset;
4458 if (gimple_code (new_phi) == GIMPLE_PHI)
4459 vec_temp = PHI_RESULT (new_phi);
4460 else
4461 vec_temp = gimple_assign_lhs (new_phi);
4462 tree rhs = build3 (BIT_FIELD_REF, scalar_type, vec_temp, bitsize,
4463 bitsize_zero_node);
4464 epilog_stmt = gimple_build_assign (new_scalar_dest, rhs);
4465 new_temp = make_ssa_name (new_scalar_dest, epilog_stmt);
4466 gimple_assign_set_lhs (epilog_stmt, new_temp);
4467 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
4469 /* In SLP we don't need to apply reduction operation, so we just
4470 collect s' values in SCALAR_RESULTS. */
4471 if (slp_reduc)
4472 scalar_results.safe_push (new_temp);
4474 for (bit_offset = element_bitsize;
4475 bit_offset < vec_size_in_bits;
4476 bit_offset += element_bitsize)
4478 tree bitpos = bitsize_int (bit_offset);
4479 tree rhs = build3 (BIT_FIELD_REF, scalar_type, vec_temp,
4480 bitsize, bitpos);
4482 epilog_stmt = gimple_build_assign (new_scalar_dest, rhs);
4483 new_name = make_ssa_name (new_scalar_dest, epilog_stmt);
4484 gimple_assign_set_lhs (epilog_stmt, new_name);
4485 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
4487 if (slp_reduc)
4489 /* In SLP we don't need to apply reduction operation, so
4490 we just collect s' values in SCALAR_RESULTS. */
4491 new_temp = new_name;
4492 scalar_results.safe_push (new_name);
4494 else
4496 epilog_stmt = gimple_build_assign (new_scalar_dest, code,
4497 new_name, new_temp);
4498 new_temp = make_ssa_name (new_scalar_dest, epilog_stmt);
4499 gimple_assign_set_lhs (epilog_stmt, new_temp);
4500 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
4505           /* The only case in which we need to reduce scalar results in SLP is
4506              unrolling.  If the size of SCALAR_RESULTS is greater than
4507              GROUP_SIZE, we reduce them by combining elements modulo
4508              GROUP_SIZE.  */
4509 if (slp_reduc)
4511 tree res, first_res, new_res;
4512 gimple new_stmt;
4514 /* Reduce multiple scalar results in case of SLP unrolling. */
4515 for (j = group_size; scalar_results.iterate (j, &res);
4516 j++)
4518 first_res = scalar_results[j % group_size];
4519 new_stmt = gimple_build_assign (new_scalar_dest, code,
4520 first_res, res);
4521 new_res = make_ssa_name (new_scalar_dest, new_stmt);
4522 gimple_assign_set_lhs (new_stmt, new_res);
4523 gsi_insert_before (&exit_gsi, new_stmt, GSI_SAME_STMT);
4524 scalar_results[j % group_size] = new_res;
4527 else
4528 /* Not SLP - we have one scalar to keep in SCALAR_RESULTS. */
4529 scalar_results.safe_push (new_temp);
4533 vect_finalize_reduction:
4535 if (double_reduc)
4536 loop = loop->inner;
4538 /* 2.5 Adjust the final result by the initial value of the reduction
4539 variable. (When such adjustment is not needed, then
4540 'adjustment_def' is zero). For example, if code is PLUS we create:
4541 new_temp = loop_exit_def + adjustment_def */
4543 if (adjustment_def)
4545 gcc_assert (!slp_reduc);
4546 if (nested_in_vect_loop)
4548 new_phi = new_phis[0];
4549 gcc_assert (TREE_CODE (TREE_TYPE (adjustment_def)) == VECTOR_TYPE);
4550 expr = build2 (code, vectype, PHI_RESULT (new_phi), adjustment_def);
4551 new_dest = vect_create_destination_var (scalar_dest, vectype);
4553 else
4555 new_temp = scalar_results[0];
4556 gcc_assert (TREE_CODE (TREE_TYPE (adjustment_def)) != VECTOR_TYPE);
4557 expr = build2 (code, scalar_type, new_temp, adjustment_def);
4558 new_dest = vect_create_destination_var (scalar_dest, scalar_type);
4561 epilog_stmt = gimple_build_assign (new_dest, expr);
4562 new_temp = make_ssa_name (new_dest, epilog_stmt);
4563 gimple_assign_set_lhs (epilog_stmt, new_temp);
4564 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
4565 if (nested_in_vect_loop)
4567 set_vinfo_for_stmt (epilog_stmt,
4568 new_stmt_vec_info (epilog_stmt, loop_vinfo,
4569 NULL));
4570 STMT_VINFO_RELATED_STMT (vinfo_for_stmt (epilog_stmt)) =
4571 STMT_VINFO_RELATED_STMT (vinfo_for_stmt (new_phi));
4573 if (!double_reduc)
4574 scalar_results.quick_push (new_temp);
4575 else
4576 scalar_results[0] = new_temp;
4578 else
4579 scalar_results[0] = new_temp;
4581 new_phis[0] = epilog_stmt;
4584 /* 2.6 Handle the loop-exit phis. Replace the uses of scalar loop-exit
4585 phis with new adjusted scalar results, i.e., replace use <s_out0>
4586 with use <s_out4>.
4588 Transform:
4589 loop_exit:
4590 s_out0 = phi <s_loop> # (scalar) EXIT_PHI
4591 v_out1 = phi <VECT_DEF> # NEW_EXIT_PHI
4592 v_out2 = reduce <v_out1>
4593 s_out3 = extract_field <v_out2, 0>
4594 s_out4 = adjust_result <s_out3>
4595 use <s_out0>
4596 use <s_out0>
4598 into:
4600 loop_exit:
4601 s_out0 = phi <s_loop> # (scalar) EXIT_PHI
4602 v_out1 = phi <VECT_DEF> # NEW_EXIT_PHI
4603 v_out2 = reduce <v_out1>
4604 s_out3 = extract_field <v_out2, 0>
4605 s_out4 = adjust_result <s_out3>
4606 use <s_out4>
4607 use <s_out4> */
4610   /* In an SLP reduction chain we reduce vector results into one vector if
4611      necessary, hence we set GROUP_SIZE to 1 here.  SCALAR_DEST is the LHS of
4612 the last stmt in the reduction chain, since we are looking for the loop
4613 exit phi node. */
4614 if (GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)))
4616 gimple dest_stmt = SLP_TREE_SCALAR_STMTS (slp_node)[group_size - 1];
4617 /* Handle reduction patterns. */
4618 if (STMT_VINFO_RELATED_STMT (vinfo_for_stmt (dest_stmt)))
4619 dest_stmt = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (dest_stmt));
4621 scalar_dest = gimple_assign_lhs (dest_stmt);
4622 group_size = 1;
4625 /* In SLP we may have several statements in NEW_PHIS and REDUCTION_PHIS (in
4626      case GROUP_SIZE is greater than the vectorization factor).  Therefore, we
4627 need to match SCALAR_RESULTS with corresponding statements. The first
4628 (GROUP_SIZE / number of new vector stmts) scalar results correspond to
4629 the first vector stmt, etc.
4630 (RATIO is equal to (GROUP_SIZE / number of new vector stmts)). */
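  /* A made-up example of this mapping: with GROUP_SIZE = 4 and two vector
     stmts in NEW_PHIS, RATIO = 2, so scalar results 0 and 1 correspond to
     the first vector stmt / reduction phi and results 2 and 3 to the
     second.  */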
4631 if (group_size > new_phis.length ())
4633 ratio = group_size / new_phis.length ();
4634 gcc_assert (!(group_size % new_phis.length ()));
4636 else
4637 ratio = 1;
4639 for (k = 0; k < group_size; k++)
4641 if (k % ratio == 0)
4643 epilog_stmt = new_phis[k / ratio];
4644 reduction_phi = reduction_phis[k / ratio];
4645 if (double_reduc)
4646 inner_phi = inner_phis[k / ratio];
4649 if (slp_reduc)
4651 gimple current_stmt = SLP_TREE_SCALAR_STMTS (slp_node)[k];
4653 orig_stmt = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (current_stmt));
4654 /* SLP statements can't participate in patterns. */
4655 gcc_assert (!orig_stmt);
4656 scalar_dest = gimple_assign_lhs (current_stmt);
4659 phis.create (3);
4660 /* Find the loop-closed-use at the loop exit of the original scalar
4661 result. (The reduction result is expected to have two immediate uses -
4662 one at the latch block, and one at the loop exit). */
4663 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, scalar_dest)
4664 if (!flow_bb_inside_loop_p (loop, gimple_bb (USE_STMT (use_p)))
4665 && !is_gimple_debug (USE_STMT (use_p)))
4666 phis.safe_push (USE_STMT (use_p));
4668 /* While we expect to have found an exit_phi because of loop-closed-ssa
4669 form we can end up without one if the scalar cycle is dead. */
4671 FOR_EACH_VEC_ELT (phis, i, exit_phi)
4673 if (outer_loop)
4675 stmt_vec_info exit_phi_vinfo = vinfo_for_stmt (exit_phi);
4676 gphi *vect_phi;
4678 /* FORNOW. Currently not supporting the case that an inner-loop
4679 reduction is not used in the outer-loop (but only outside the
4680 outer-loop), unless it is double reduction. */
4681 gcc_assert ((STMT_VINFO_RELEVANT_P (exit_phi_vinfo)
4682 && !STMT_VINFO_LIVE_P (exit_phi_vinfo))
4683 || double_reduc);
4685 if (double_reduc)
4686 STMT_VINFO_VEC_STMT (exit_phi_vinfo) = inner_phi;
4687 else
4688 STMT_VINFO_VEC_STMT (exit_phi_vinfo) = epilog_stmt;
4689 if (!double_reduc
4690 || STMT_VINFO_DEF_TYPE (exit_phi_vinfo)
4691 != vect_double_reduction_def)
4692 continue;
4694 /* Handle double reduction:
4696 stmt1: s1 = phi <s0, s2> - double reduction phi (outer loop)
4697 stmt2: s3 = phi <s1, s4> - (regular) reduc phi (inner loop)
4698 stmt3: s4 = use (s3) - (regular) reduc stmt (inner loop)
4699 stmt4: s2 = phi <s4> - double reduction stmt (outer loop)
4701 At that point the regular reduction (stmt2 and stmt3) is
4702 already vectorized, as well as the exit phi node, stmt4.
4703 Here we vectorize the phi node of double reduction, stmt1, and
4704 update all relevant statements. */
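              /* In source terms this situation typically comes from a nest
                 such as
                   for (i = 0; i < n; i++)
                     for (j = 0; j < m; j++)
                       s += a[i][j];
                 when the outer loop is being vectorized: s1/s2 are the
                 outer-loop copies of s and s3/s4 the inner-loop ones
                 (an illustrative example).  */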
4706 /* Go through all the uses of s2 to find double reduction phi
4707 node, i.e., stmt1 above. */
4708 orig_name = PHI_RESULT (exit_phi);
4709 FOR_EACH_IMM_USE_STMT (use_stmt, imm_iter, orig_name)
4711 stmt_vec_info use_stmt_vinfo;
4712 stmt_vec_info new_phi_vinfo;
4713 tree vect_phi_init, preheader_arg, vect_phi_res, init_def;
4714 basic_block bb = gimple_bb (use_stmt);
4715 gimple use;
4717 /* Check that USE_STMT is really double reduction phi
4718 node. */
4719 if (gimple_code (use_stmt) != GIMPLE_PHI
4720 || gimple_phi_num_args (use_stmt) != 2
4721 || bb->loop_father != outer_loop)
4722 continue;
4723 use_stmt_vinfo = vinfo_for_stmt (use_stmt);
4724 if (!use_stmt_vinfo
4725 || STMT_VINFO_DEF_TYPE (use_stmt_vinfo)
4726 != vect_double_reduction_def)
4727 continue;
4729 /* Create vector phi node for double reduction:
4730 vs1 = phi <vs0, vs2>
4731 vs1 was created previously in this function by a call to
4732 vect_get_vec_def_for_operand and is stored in
4733 vec_initial_def;
4734 vs2 is defined by INNER_PHI, the vectorized EXIT_PHI;
4735 vs0 is created here. */
4737 /* Create vector phi node. */
4738 vect_phi = create_phi_node (vec_initial_def, bb);
4739 new_phi_vinfo = new_stmt_vec_info (vect_phi,
4740 loop_vec_info_for_loop (outer_loop), NULL);
4741 set_vinfo_for_stmt (vect_phi, new_phi_vinfo);
4743 /* Create vs0 - initial def of the double reduction phi. */
4744 preheader_arg = PHI_ARG_DEF_FROM_EDGE (use_stmt,
4745 loop_preheader_edge (outer_loop));
4746 init_def = get_initial_def_for_reduction (stmt,
4747 preheader_arg, NULL);
4748 vect_phi_init = vect_init_vector (use_stmt, init_def,
4749 vectype, NULL);
4751 /* Update phi node arguments with vs0 and vs2. */
4752 add_phi_arg (vect_phi, vect_phi_init,
4753 loop_preheader_edge (outer_loop),
4754 UNKNOWN_LOCATION);
4755 add_phi_arg (vect_phi, PHI_RESULT (inner_phi),
4756 loop_latch_edge (outer_loop), UNKNOWN_LOCATION);
4757 if (dump_enabled_p ())
4759 dump_printf_loc (MSG_NOTE, vect_location,
4760 "created double reduction phi node: ");
4761 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, vect_phi, 0);
4762 dump_printf (MSG_NOTE, "\n");
4765 vect_phi_res = PHI_RESULT (vect_phi);
4767 /* Replace the use, i.e., set the correct vs1 in the regular
4768 reduction phi node. FORNOW, NCOPIES is always 1, so the
4769 loop is redundant. */
4770 use = reduction_phi;
4771 for (j = 0; j < ncopies; j++)
4773 edge pr_edge = loop_preheader_edge (loop);
4774 SET_PHI_ARG_DEF (use, pr_edge->dest_idx, vect_phi_res);
4775 use = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (use));
4781 phis.release ();
4782 if (nested_in_vect_loop)
4784 if (double_reduc)
4785 loop = outer_loop;
4786 else
4787 continue;
4790 phis.create (3);
4791 /* Find the loop-closed-use at the loop exit of the original scalar
4792 result. (The reduction result is expected to have two immediate uses,
4793 one at the latch block, and one at the loop exit). For double
4794 reductions we are looking for exit phis of the outer loop. */
4795 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, scalar_dest)
4797 if (!flow_bb_inside_loop_p (loop, gimple_bb (USE_STMT (use_p))))
4799 if (!is_gimple_debug (USE_STMT (use_p)))
4800 phis.safe_push (USE_STMT (use_p));
4802 else
4804 if (double_reduc && gimple_code (USE_STMT (use_p)) == GIMPLE_PHI)
4806 tree phi_res = PHI_RESULT (USE_STMT (use_p));
4808 FOR_EACH_IMM_USE_FAST (phi_use_p, phi_imm_iter, phi_res)
4810 if (!flow_bb_inside_loop_p (loop,
4811 gimple_bb (USE_STMT (phi_use_p)))
4812 && !is_gimple_debug (USE_STMT (phi_use_p)))
4813 phis.safe_push (USE_STMT (phi_use_p));
4819 FOR_EACH_VEC_ELT (phis, i, exit_phi)
4821 /* Replace the uses: */
4822 orig_name = PHI_RESULT (exit_phi);
4823 scalar_result = scalar_results[k];
4824 FOR_EACH_IMM_USE_STMT (use_stmt, imm_iter, orig_name)
4825 FOR_EACH_IMM_USE_ON_STMT (use_p, imm_iter)
4826 SET_USE (use_p, scalar_result);
4829 phis.release ();
4834 /* Function vectorizable_reduction.
4836 Check if STMT performs a reduction operation that can be vectorized.
4837 If VEC_STMT is also passed, vectorize the STMT: create a vectorized
4838 stmt to replace it, put it in VEC_STMT, and insert it at GSI.
4839 Return FALSE if not a vectorizable STMT, TRUE otherwise.
4841 This function also handles reduction idioms (patterns) that have been
4842 recognized in advance during vect_pattern_recog. In this case, STMT may be
4843 of this form:
4844 X = pattern_expr (arg0, arg1, ..., X)
4845    and its STMT_VINFO_RELATED_STMT points to the last stmt in the original
4846 sequence that had been detected and replaced by the pattern-stmt (STMT).
4848 In some cases of reduction patterns, the type of the reduction variable X is
4849 different than the type of the other arguments of STMT.
4850 In such cases, the vectype that is used when transforming STMT into a vector
4851 stmt is different than the vectype that is used to determine the
4852 vectorization factor, because it consists of a different number of elements
4853 than the actual number of elements that are being operated upon in parallel.
4855 For example, consider an accumulation of shorts into an int accumulator.
4856 On some targets it's possible to vectorize this pattern operating on 8
4857 shorts at a time (hence, the vectype for purposes of determining the
4858 vectorization factor should be V8HI); on the other hand, the vectype that
4859 is used to create the vector form is actually V4SI (the type of the result).
4861 Upon entry to this function, STMT_VINFO_VECTYPE records the vectype that
4862 indicates what is the actual level of parallelism (V8HI in the example), so
4863 that the right vectorization factor would be derived. This vectype
4864 corresponds to the type of arguments to the reduction stmt, and should *NOT*
4865 be used to create the vectorized stmt. The right vectype for the vectorized
4866 stmt is obtained from the type of the result X:
4867 get_vectype_for_scalar_type (TREE_TYPE (X))
4869 This means that, contrary to "regular" reductions (or "regular" stmts in
4870 general), the following equation:
4871 STMT_VINFO_VECTYPE == get_vectype_for_scalar_type (TREE_TYPE (X))
4872 does *NOT* necessarily hold for reduction patterns. */
4874 bool
4875 vectorizable_reduction (gimple stmt, gimple_stmt_iterator *gsi,
4876 gimple *vec_stmt, slp_tree slp_node)
4878 tree vec_dest;
4879 tree scalar_dest;
4880 tree loop_vec_def0 = NULL_TREE, loop_vec_def1 = NULL_TREE;
4881 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
4882 tree vectype_out = STMT_VINFO_VECTYPE (stmt_info);
4883 tree vectype_in = NULL_TREE;
4884 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
4885 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
4886 enum tree_code code, orig_code, epilog_reduc_code;
4887 machine_mode vec_mode;
4888 int op_type;
4889 optab optab, reduc_optab;
4890 tree new_temp = NULL_TREE;
4891 tree def;
4892 gimple def_stmt;
4893 enum vect_def_type dt;
4894 gphi *new_phi = NULL;
4895 tree scalar_type;
4896 bool is_simple_use;
4897 gimple orig_stmt;
4898 stmt_vec_info orig_stmt_info;
4899 tree expr = NULL_TREE;
4900 int i;
4901 int ncopies;
4902 int epilog_copies;
4903 stmt_vec_info prev_stmt_info, prev_phi_info;
4904 bool single_defuse_cycle = false;
4905 tree reduc_def = NULL_TREE;
4906 gimple new_stmt = NULL;
4907 int j;
4908 tree ops[3];
4909 bool nested_cycle = false, found_nested_cycle_def = false;
4910 gimple reduc_def_stmt = NULL;
4911 bool double_reduc = false, dummy;
4912 basic_block def_bb;
4913 struct loop * def_stmt_loop, *outer_loop = NULL;
4914 tree def_arg;
4915 gimple def_arg_stmt;
4916 auto_vec<tree> vec_oprnds0;
4917 auto_vec<tree> vec_oprnds1;
4918 auto_vec<tree> vect_defs;
4919 auto_vec<gimple> phis;
4920 int vec_num;
4921 tree def0, def1, tem, op0, op1 = NULL_TREE;
4922 bool first_p = true;
4924 /* In case of reduction chain we switch to the first stmt in the chain, but
4925 we don't update STMT_INFO, since only the last stmt is marked as reduction
4926 and has reduction properties. */
4927 if (GROUP_FIRST_ELEMENT (stmt_info)
4928 && GROUP_FIRST_ELEMENT (stmt_info) != stmt)
4930 stmt = GROUP_FIRST_ELEMENT (stmt_info);
4931 first_p = false;
4934 if (nested_in_vect_loop_p (loop, stmt))
4936 outer_loop = loop;
4937 loop = loop->inner;
4938 nested_cycle = true;
4941 /* 1. Is vectorizable reduction? */
4942 /* Not supportable if the reduction variable is used in the loop, unless
4943 it's a reduction chain. */
4944 if (STMT_VINFO_RELEVANT (stmt_info) > vect_used_in_outer
4945 && !GROUP_FIRST_ELEMENT (stmt_info))
4946 return false;
4948 /* Reductions that are not used even in an enclosing outer-loop,
4949 are expected to be "live" (used out of the loop). */
4950 if (STMT_VINFO_RELEVANT (stmt_info) == vect_unused_in_scope
4951 && !STMT_VINFO_LIVE_P (stmt_info))
4952 return false;
4954 /* Make sure it was already recognized as a reduction computation. */
4955 if (STMT_VINFO_DEF_TYPE (vinfo_for_stmt (stmt)) != vect_reduction_def
4956 && STMT_VINFO_DEF_TYPE (vinfo_for_stmt (stmt)) != vect_nested_cycle)
4957 return false;
4959 /* 2. Has this been recognized as a reduction pattern?
4961 Check if STMT represents a pattern that has been recognized
4962 in earlier analysis stages. For stmts that represent a pattern,
4963 the STMT_VINFO_RELATED_STMT field records the last stmt in
4964 the original sequence that constitutes the pattern. */
4966 orig_stmt = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (stmt));
4967 if (orig_stmt)
4969 orig_stmt_info = vinfo_for_stmt (orig_stmt);
4970 gcc_assert (STMT_VINFO_IN_PATTERN_P (orig_stmt_info));
4971 gcc_assert (!STMT_VINFO_IN_PATTERN_P (stmt_info));
4974 /* 3. Check the operands of the operation. The first operands are defined
4975 inside the loop body. The last operand is the reduction variable,
4976 which is defined by the loop-header-phi. */
4978 gcc_assert (is_gimple_assign (stmt));
4980 /* Flatten RHS. */
4981 switch (get_gimple_rhs_class (gimple_assign_rhs_code (stmt)))
4983 case GIMPLE_SINGLE_RHS:
4984 op_type = TREE_OPERAND_LENGTH (gimple_assign_rhs1 (stmt));
4985 if (op_type == ternary_op)
4987 tree rhs = gimple_assign_rhs1 (stmt);
4988 ops[0] = TREE_OPERAND (rhs, 0);
4989 ops[1] = TREE_OPERAND (rhs, 1);
4990 ops[2] = TREE_OPERAND (rhs, 2);
4991 code = TREE_CODE (rhs);
4993 else
4994 return false;
4995 break;
4997 case GIMPLE_BINARY_RHS:
4998 code = gimple_assign_rhs_code (stmt);
4999 op_type = TREE_CODE_LENGTH (code);
5000 gcc_assert (op_type == binary_op);
5001 ops[0] = gimple_assign_rhs1 (stmt);
5002 ops[1] = gimple_assign_rhs2 (stmt);
5003 break;
5005 case GIMPLE_TERNARY_RHS:
5006 code = gimple_assign_rhs_code (stmt);
5007 op_type = TREE_CODE_LENGTH (code);
5008 gcc_assert (op_type == ternary_op);
5009 ops[0] = gimple_assign_rhs1 (stmt);
5010 ops[1] = gimple_assign_rhs2 (stmt);
5011 ops[2] = gimple_assign_rhs3 (stmt);
5012 break;
5014 case GIMPLE_UNARY_RHS:
5015 return false;
5017 default:
5018 gcc_unreachable ();
5020   /* The default is that the reduction variable is the last operand in the statement.  */
5021 int reduc_index = op_type - 1;
5023 if (code == COND_EXPR && slp_node)
5024 return false;
5026 scalar_dest = gimple_assign_lhs (stmt);
5027 scalar_type = TREE_TYPE (scalar_dest);
5028 if (!POINTER_TYPE_P (scalar_type) && !INTEGRAL_TYPE_P (scalar_type)
5029 && !SCALAR_FLOAT_TYPE_P (scalar_type))
5030 return false;
5032 /* Do not try to vectorize bit-precision reductions. */
5033 if ((TYPE_PRECISION (scalar_type)
5034 != GET_MODE_PRECISION (TYPE_MODE (scalar_type))))
5035 return false;
5037 /* All uses but the last are expected to be defined in the loop.
5038 The last use is the reduction variable. In case of nested cycle this
5039 assumption is not true: we use reduc_index to record the index of the
5040 reduction variable. */
5041 for (i = 0; i < op_type - 1; i++)
5043 /* The condition of COND_EXPR is checked in vectorizable_condition(). */
5044 if (i == 0 && code == COND_EXPR)
5045 continue;
5047 is_simple_use = vect_is_simple_use_1 (ops[i], stmt, loop_vinfo, NULL,
5048 &def_stmt, &def, &dt, &tem);
5049 if (!vectype_in)
5050 vectype_in = tem;
5051 gcc_assert (is_simple_use);
5053 if (dt != vect_internal_def
5054 && dt != vect_external_def
5055 && dt != vect_constant_def
5056 && dt != vect_induction_def
5057 && !(dt == vect_nested_cycle && nested_cycle))
5058 return false;
5060 if (dt == vect_nested_cycle)
5062 found_nested_cycle_def = true;
5063 reduc_def_stmt = def_stmt;
5064 reduc_index = i;
5068 is_simple_use = vect_is_simple_use_1 (ops[i], stmt, loop_vinfo, NULL,
5069 &def_stmt, &def, &dt, &tem);
5070 if (!vectype_in)
5071 vectype_in = tem;
5072 gcc_assert (is_simple_use);
5073 if (!found_nested_cycle_def)
5074 reduc_def_stmt = def_stmt;
5076 if (reduc_def_stmt && gimple_code (reduc_def_stmt) != GIMPLE_PHI)
5077 return false;
5079 if (!(dt == vect_reduction_def
5080 || dt == vect_nested_cycle
5081 || ((dt == vect_internal_def || dt == vect_external_def
5082 || dt == vect_constant_def || dt == vect_induction_def)
5083 && nested_cycle && found_nested_cycle_def)))
5085 /* For pattern recognized stmts, orig_stmt might be a reduction,
5086 but some helper statements for the pattern might not, or
5087 might be COND_EXPRs with reduction uses in the condition. */
5088 gcc_assert (orig_stmt);
5089 return false;
5092 gimple tmp = vect_is_simple_reduction (loop_vinfo, reduc_def_stmt,
5093 !nested_cycle, &dummy, false);
5094 if (orig_stmt)
5095 gcc_assert (tmp == orig_stmt
5096 || GROUP_FIRST_ELEMENT (vinfo_for_stmt (tmp)) == orig_stmt);
5097 else
5098 /* We changed STMT to be the first stmt in reduction chain, hence we
5099 check that in this case the first element in the chain is STMT. */
5100 gcc_assert (stmt == tmp
5101 || GROUP_FIRST_ELEMENT (vinfo_for_stmt (tmp)) == stmt);
5103 if (STMT_VINFO_LIVE_P (vinfo_for_stmt (reduc_def_stmt)))
5104 return false;
5106 if (slp_node || PURE_SLP_STMT (stmt_info))
5107 ncopies = 1;
5108 else
5109 ncopies = (LOOP_VINFO_VECT_FACTOR (loop_vinfo)
5110 / TYPE_VECTOR_SUBPARTS (vectype_in));
5112 gcc_assert (ncopies >= 1);
5114 vec_mode = TYPE_MODE (vectype_in);
5116 if (code == COND_EXPR)
5118 if (!vectorizable_condition (stmt, gsi, NULL, ops[reduc_index], 0, NULL))
5120 if (dump_enabled_p ())
5121 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5122 "unsupported condition in reduction\n");
5124 return false;
5127 else
5129 /* 4. Supportable by target? */
5131 if (code == LSHIFT_EXPR || code == RSHIFT_EXPR
5132 || code == LROTATE_EXPR || code == RROTATE_EXPR)
5134           /* Shifts and rotates are only supported by vectorizable_shift,
5135 not vectorizable_reduction. */
5136 if (dump_enabled_p ())
5137 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5138 "unsupported shift or rotation.\n");
5139 return false;
5142 /* 4.1. check support for the operation in the loop */
5143 optab = optab_for_tree_code (code, vectype_in, optab_default);
5144 if (!optab)
5146 if (dump_enabled_p ())
5147 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5148 "no optab.\n");
5150 return false;
5153 if (optab_handler (optab, vec_mode) == CODE_FOR_nothing)
5155 if (dump_enabled_p ())
5156 dump_printf (MSG_NOTE, "op not supported by target.\n");
5158 if (GET_MODE_SIZE (vec_mode) != UNITS_PER_WORD
5159 || LOOP_VINFO_VECT_FACTOR (loop_vinfo)
5160 < vect_min_worthwhile_factor (code))
5161 return false;
5163 if (dump_enabled_p ())
5164 dump_printf (MSG_NOTE, "proceeding using word mode.\n");
5167 /* Worthwhile without SIMD support? */
5168 if (!VECTOR_MODE_P (TYPE_MODE (vectype_in))
5169 && LOOP_VINFO_VECT_FACTOR (loop_vinfo)
5170 < vect_min_worthwhile_factor (code))
5172 if (dump_enabled_p ())
5173 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5174 "not worthwhile without SIMD support.\n");
5176 return false;
5180 /* 4.2. Check support for the epilog operation.
5182 If STMT represents a reduction pattern, then the type of the
5183 reduction variable may be different than the type of the rest
5184 of the arguments. For example, consider the case of accumulation
5185 of shorts into an int accumulator; The original code:
5186 S1: int_a = (int) short_a;
5187 orig_stmt-> S2: int_acc = plus <int_a ,int_acc>;
5189 was replaced with:
5190 STMT: int_acc = widen_sum <short_a, int_acc>
5192 This means that:
5193 1. The tree-code that is used to create the vector operation in the
5194 epilog code (that reduces the partial results) is not the
5195 tree-code of STMT, but is rather the tree-code of the original
5196 stmt from the pattern that STMT is replacing. I.e, in the example
5197 above we want to use 'widen_sum' in the loop, but 'plus' in the
5198 epilog.
5199 2. The type (mode) we use to check available target support
5200 for the vector operation to be created in the *epilog*, is
5201 determined by the type of the reduction variable (in the example
5202 above we'd check this: optab_handler (plus_optab, vect_int_mode])).
5203 However the type (mode) we use to check available target support
5204 for the vector operation to be created *inside the loop*, is
5205 determined by the type of the other arguments to STMT (in the
5206 example we'd check this: optab_handler (widen_sum_optab,
5207 vect_short_mode)).
5209 This is contrary to "regular" reductions, in which the types of all
5210 the arguments are the same as the type of the reduction variable.
5211 For "regular" reductions we can therefore use the same vector type
5212 (and also the same tree-code) when generating the epilog code and
5213 when generating the code inside the loop. */
5215 if (orig_stmt)
5217 /* This is a reduction pattern: get the vectype from the type of the
5218 reduction variable, and get the tree-code from orig_stmt. */
5219 orig_code = gimple_assign_rhs_code (orig_stmt);
5220 gcc_assert (vectype_out);
5221 vec_mode = TYPE_MODE (vectype_out);
5223 else
5225       /* Regular reduction: the same vectype and tree-code used for the vector
5226          code inside the loop can also be used for the epilog code.  */
5227 orig_code = code;
5230 if (nested_cycle)
5232 def_bb = gimple_bb (reduc_def_stmt);
5233 def_stmt_loop = def_bb->loop_father;
5234 def_arg = PHI_ARG_DEF_FROM_EDGE (reduc_def_stmt,
5235 loop_preheader_edge (def_stmt_loop));
5236 if (TREE_CODE (def_arg) == SSA_NAME
5237 && (def_arg_stmt = SSA_NAME_DEF_STMT (def_arg))
5238 && gimple_code (def_arg_stmt) == GIMPLE_PHI
5239 && flow_bb_inside_loop_p (outer_loop, gimple_bb (def_arg_stmt))
5240 && vinfo_for_stmt (def_arg_stmt)
5241 && STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def_arg_stmt))
5242 == vect_double_reduction_def)
5243 double_reduc = true;
5246 epilog_reduc_code = ERROR_MARK;
5247 if (reduction_code_for_scalar_code (orig_code, &epilog_reduc_code))
5249 reduc_optab = optab_for_tree_code (epilog_reduc_code, vectype_out,
5250 optab_default);
5251 if (!reduc_optab)
5253 if (dump_enabled_p ())
5254 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5255 "no optab for reduction.\n");
5257 epilog_reduc_code = ERROR_MARK;
5259 else if (optab_handler (reduc_optab, vec_mode) == CODE_FOR_nothing)
5261 optab = scalar_reduc_to_vector (reduc_optab, vectype_out);
5262 if (optab_handler (optab, vec_mode) == CODE_FOR_nothing)
5264 if (dump_enabled_p ())
5265 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5266 "reduc op not supported by target.\n");
5268 epilog_reduc_code = ERROR_MARK;
5272 else
5274 if (!nested_cycle || double_reduc)
5276 if (dump_enabled_p ())
5277 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5278 "no reduc code for scalar code.\n");
5280 return false;
5284 if (double_reduc && ncopies > 1)
5286 if (dump_enabled_p ())
5287 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5288 "multiple types in double reduction\n");
5290 return false;
5293   /* In case of widening multiplication by a constant, we update the type
5294 of the constant to be the type of the other operand. We check that the
5295 constant fits the type in the pattern recognition pass. */
5296 if (code == DOT_PROD_EXPR
5297 && !types_compatible_p (TREE_TYPE (ops[0]), TREE_TYPE (ops[1])))
5299 if (TREE_CODE (ops[0]) == INTEGER_CST)
5300 ops[0] = fold_convert (TREE_TYPE (ops[1]), ops[0]);
5301 else if (TREE_CODE (ops[1]) == INTEGER_CST)
5302 ops[1] = fold_convert (TREE_TYPE (ops[0]), ops[1]);
5303 else
5305 if (dump_enabled_p ())
5306 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5307 "invalid types in dot-prod\n");
5309 return false;
5313 if (!vec_stmt) /* transformation not required. */
5315 if (first_p
5316 && !vect_model_reduction_cost (stmt_info, epilog_reduc_code, ncopies,
5317 reduc_index))
5318 return false;
5319 STMT_VINFO_TYPE (stmt_info) = reduc_vec_info_type;
5320 return true;
5323 /** Transform. **/
5325 if (dump_enabled_p ())
5326 dump_printf_loc (MSG_NOTE, vect_location, "transform reduction.\n");
5328 /* FORNOW: Multiple types are not supported for condition. */
5329 if (code == COND_EXPR)
5330 gcc_assert (ncopies == 1);
5332 /* Create the destination vector */
5333 vec_dest = vect_create_destination_var (scalar_dest, vectype_out);
5335 /* In case the vectorization factor (VF) is bigger than the number
5336 of elements that we can fit in a vectype (nunits), we have to generate
5337 more than one vector stmt - i.e - we need to "unroll" the
5338 vector stmt by a factor VF/nunits. For more details see documentation
5339 in vectorizable_operation. */
5341 /* If the reduction is used in an outer loop we need to generate
5342 VF intermediate results, like so (e.g. for ncopies=2):
5343 r0 = phi (init, r0)
5344 r1 = phi (init, r1)
5345 r0 = x0 + r0;
5346 r1 = x1 + r1;
5347 (i.e. we generate VF results in 2 registers).
5348 In this case we have a separate def-use cycle for each copy, and therefore
5349 for each copy we get the vector def for the reduction variable from the
5350 respective phi node created for this copy.
5352 Otherwise (the reduction is unused in the loop nest), we can combine
5353 together intermediate results, like so (e.g. for ncopies=2):
5354 r = phi (init, r)
5355 r = x0 + r;
5356 r = x1 + r;
5357 (i.e. we generate VF/2 results in a single register).
5358 In this case for each copy we get the vector def for the reduction variable
5359 from the vectorized reduction operation generated in the previous iteration.
5362 if (STMT_VINFO_RELEVANT (stmt_info) == vect_unused_in_scope)
5364 single_defuse_cycle = true;
5365 epilog_copies = 1;
5367 else
5368 epilog_copies = ncopies;
5370 prev_stmt_info = NULL;
5371 prev_phi_info = NULL;
5372 if (slp_node)
5373 vec_num = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node);
5374 else
5376 vec_num = 1;
5377 vec_oprnds0.create (1);
5378 if (op_type == ternary_op)
5379 vec_oprnds1.create (1);
5382 phis.create (vec_num);
5383 vect_defs.create (vec_num);
5384 if (!slp_node)
5385 vect_defs.quick_push (NULL_TREE);
5387 for (j = 0; j < ncopies; j++)
5389 if (j == 0 || !single_defuse_cycle)
5391 for (i = 0; i < vec_num; i++)
5393 /* Create the reduction-phi that defines the reduction
5394 operand. */
5395 new_phi = create_phi_node (vec_dest, loop->header);
5396 set_vinfo_for_stmt (new_phi,
5397 new_stmt_vec_info (new_phi, loop_vinfo,
5398 NULL));
5399 if (j == 0 || slp_node)
5400 phis.quick_push (new_phi);
5404 if (code == COND_EXPR)
5406 gcc_assert (!slp_node);
5407 vectorizable_condition (stmt, gsi, vec_stmt,
5408 PHI_RESULT (phis[0]),
5409 reduc_index, NULL);
5410 /* Multiple types are not supported for condition. */
5411 break;
5414 /* Handle uses. */
5415 if (j == 0)
5417 op0 = ops[!reduc_index];
5418 if (op_type == ternary_op)
5420 if (reduc_index == 0)
5421 op1 = ops[2];
5422 else
5423 op1 = ops[1];
5426 if (slp_node)
5427 vect_get_vec_defs (op0, op1, stmt, &vec_oprnds0, &vec_oprnds1,
5428 slp_node, -1);
5429 else
5431 loop_vec_def0 = vect_get_vec_def_for_operand (ops[!reduc_index],
5432 stmt, NULL);
5433 vec_oprnds0.quick_push (loop_vec_def0);
5434 if (op_type == ternary_op)
5436 loop_vec_def1 = vect_get_vec_def_for_operand (op1, stmt,
5437 NULL);
5438 vec_oprnds1.quick_push (loop_vec_def1);
5442 else
5444 if (!slp_node)
5446 enum vect_def_type dt;
5447 gimple dummy_stmt;
5448 tree dummy;
5450 vect_is_simple_use (ops[!reduc_index], stmt, loop_vinfo, NULL,
5451 &dummy_stmt, &dummy, &dt);
5452 loop_vec_def0 = vect_get_vec_def_for_stmt_copy (dt,
5453 loop_vec_def0);
5454 vec_oprnds0[0] = loop_vec_def0;
5455 if (op_type == ternary_op)
5457 vect_is_simple_use (op1, stmt, loop_vinfo, NULL, &dummy_stmt,
5458 &dummy, &dt);
5459 loop_vec_def1 = vect_get_vec_def_for_stmt_copy (dt,
5460 loop_vec_def1);
5461 vec_oprnds1[0] = loop_vec_def1;
5465 if (single_defuse_cycle)
5466 reduc_def = gimple_assign_lhs (new_stmt);
5468 STMT_VINFO_RELATED_STMT (prev_phi_info) = new_phi;
5471 FOR_EACH_VEC_ELT (vec_oprnds0, i, def0)
5473 if (slp_node)
5474 reduc_def = PHI_RESULT (phis[i]);
5475 else
5477 if (!single_defuse_cycle || j == 0)
5478 reduc_def = PHI_RESULT (new_phi);
5481 def1 = ((op_type == ternary_op)
5482 ? vec_oprnds1[i] : NULL);
5483 if (op_type == binary_op)
5485 if (reduc_index == 0)
5486 expr = build2 (code, vectype_out, reduc_def, def0);
5487 else
5488 expr = build2 (code, vectype_out, def0, reduc_def);
5490 else
5492 if (reduc_index == 0)
5493 expr = build3 (code, vectype_out, reduc_def, def0, def1);
5494 else
5496 if (reduc_index == 1)
5497 expr = build3 (code, vectype_out, def0, reduc_def, def1);
5498 else
5499 expr = build3 (code, vectype_out, def0, def1, reduc_def);
5503 new_stmt = gimple_build_assign (vec_dest, expr);
5504 new_temp = make_ssa_name (vec_dest, new_stmt);
5505 gimple_assign_set_lhs (new_stmt, new_temp);
5506 vect_finish_stmt_generation (stmt, new_stmt, gsi);
5508 if (slp_node)
5510 SLP_TREE_VEC_STMTS (slp_node).quick_push (new_stmt);
5511 vect_defs.quick_push (new_temp);
5513 else
5514 vect_defs[0] = new_temp;
5517 if (slp_node)
5518 continue;
5520 if (j == 0)
5521 STMT_VINFO_VEC_STMT (stmt_info) = *vec_stmt = new_stmt;
5522 else
5523 STMT_VINFO_RELATED_STMT (prev_stmt_info) = new_stmt;
5525 prev_stmt_info = vinfo_for_stmt (new_stmt);
5526 prev_phi_info = vinfo_for_stmt (new_phi);
5529 /* Finalize the reduction-phi (set its arguments) and create the
5530 epilog reduction code. */
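/* For a plain sum reduction the epilog conceptually extracts and adds up
   the lanes of the final vector of partial sums; e.g. (illustrative) for
   a 4-lane accumulator {s0, s1, s2, s3} it emits code computing
   s = s0 + s1 + s2 + s3, either via a target reduction instruction or a
   log2(nunits) sequence of shifts and adds, and then combines the result
   with the original scalar initial value.  */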
5531 if ((!single_defuse_cycle || code == COND_EXPR) && !slp_node)
5533 new_temp = gimple_assign_lhs (*vec_stmt);
5534 vect_defs[0] = new_temp;
5537 vect_create_epilog_for_reduction (vect_defs, stmt, epilog_copies,
5538 epilog_reduc_code, phis, reduc_index,
5539 double_reduc, slp_node);
5541 return true;
5544 /* Function vect_min_worthwhile_factor.
5546 For a loop where we could vectorize the operation indicated by CODE,
5547 return the minimum vectorization factor that makes it worthwhile
5548 to use generic vectors. */
5549 int
5550 vect_min_worthwhile_factor (enum tree_code code)
5552 switch (code)
5554 case PLUS_EXPR:
5555 case MINUS_EXPR:
5556 case NEGATE_EXPR:
5557 return 4;
5559 case BIT_AND_EXPR:
5560 case BIT_IOR_EXPR:
5561 case BIT_XOR_EXPR:
5562 case BIT_NOT_EXPR:
5563 return 2;
5565 default:
5566 return INT_MAX;
5567 }
5568 }
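/* Illustrative sketch only: how a caller that already knows the loop's
   vectorization factor VF might consult the threshold above before
   committing to generic (emulated) vectors.  The helper name is
   hypothetical and does not exist elsewhere in this file.  */

static bool
example_generic_vectors_worthwhile_p (enum tree_code code, int vf)
{
  /* PLUS/MINUS/NEGATE pay off from VF >= 4, the bitwise operations
     from VF >= 2, and any other operation never does (INT_MAX).  */
  return vf >= vect_min_worthwhile_factor (code);
}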
5571 /* Function vectorizable_induction
5573 Check if PHI performs an induction computation that can be vectorized.
5574 If VEC_STMT is also passed, vectorize the induction PHI: create a vectorized
5575 phi to replace it, put it in VEC_STMT, and add it to the same basic block.
5576 Return FALSE if not a vectorizable STMT, TRUE otherwise. */
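/* Illustrative example (not part of the original comment): a simple
   scalar induction

       i_1 = PHI <0(preheader), i_2(latch)>
       i_2 = i_1 + 1;

   is, for 4-element vectors, turned into a vector phi whose initial
   value is {0, 1, 2, 3} and which is advanced by {4, 4, 4, 4} on every
   vector iteration.  */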
5578 bool
5579 vectorizable_induction (gimple phi, gimple_stmt_iterator *gsi ATTRIBUTE_UNUSED,
5580 gimple *vec_stmt)
5582 stmt_vec_info stmt_info = vinfo_for_stmt (phi);
5583 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
5584 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
5585 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
5586 int nunits = TYPE_VECTOR_SUBPARTS (vectype);
5587 int ncopies = LOOP_VINFO_VECT_FACTOR (loop_vinfo) / nunits;
5588 tree vec_def;
5590 gcc_assert (ncopies >= 1);
5591 /* FORNOW. These restrictions should be relaxed. */
5592 if (nested_in_vect_loop_p (loop, phi))
5594 imm_use_iterator imm_iter;
5595 use_operand_p use_p;
5596 gimple exit_phi;
5597 edge latch_e;
5598 tree loop_arg;
5600 if (ncopies > 1)
5602 if (dump_enabled_p ())
5603 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5604 "multiple types in nested loop.\n");
5605 return false;
5608 exit_phi = NULL;
5609 latch_e = loop_latch_edge (loop->inner);
5610 loop_arg = PHI_ARG_DEF_FROM_EDGE (phi, latch_e);
5611 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, loop_arg)
5613 gimple use_stmt = USE_STMT (use_p);
5614 if (is_gimple_debug (use_stmt))
5615 continue;
5617 if (!flow_bb_inside_loop_p (loop->inner, gimple_bb (use_stmt)))
5619 exit_phi = use_stmt;
5620 break;
5623 if (exit_phi)
5625 stmt_vec_info exit_phi_vinfo = vinfo_for_stmt (exit_phi);
5626 if (!(STMT_VINFO_RELEVANT_P (exit_phi_vinfo)
5627 && !STMT_VINFO_LIVE_P (exit_phi_vinfo)))
5629 if (dump_enabled_p ())
5630 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5631 "inner-loop induction only used outside "
5632 "of the outer vectorized loop.\n");
5633 return false;
5638 if (!STMT_VINFO_RELEVANT_P (stmt_info))
5639 return false;
5641 /* FORNOW: SLP not supported. */
5642 if (STMT_SLP_TYPE (stmt_info))
5643 return false;
5645 gcc_assert (STMT_VINFO_DEF_TYPE (stmt_info) == vect_induction_def);
5647 if (gimple_code (phi) != GIMPLE_PHI)
5648 return false;
5650 if (!vec_stmt) /* transformation not required. */
5652 STMT_VINFO_TYPE (stmt_info) = induc_vec_info_type;
5653 if (dump_enabled_p ())
5654 dump_printf_loc (MSG_NOTE, vect_location,
5655 "=== vectorizable_induction ===\n");
5656 vect_model_induction_cost (stmt_info, ncopies);
5657 return true;
5660 /** Transform. **/
5662 if (dump_enabled_p ())
5663 dump_printf_loc (MSG_NOTE, vect_location, "transform induction phi.\n");
5665 vec_def = get_initial_def_for_induction (phi);
5666 *vec_stmt = SSA_NAME_DEF_STMT (vec_def);
5667 return true;
5670 /* Function vectorizable_live_operation.
5672 STMT computes a value that is used outside the loop. Check if
5673 it can be supported. */
5675 bool
5676 vectorizable_live_operation (gimple stmt,
5677 gimple_stmt_iterator *gsi ATTRIBUTE_UNUSED,
5678 gimple *vec_stmt)
5680 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
5681 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
5682 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
5683 int i;
5684 int op_type;
5685 tree op;
5686 tree def;
5687 gimple def_stmt;
5688 enum vect_def_type dt;
5689 enum tree_code code;
5690 enum gimple_rhs_class rhs_class;
5692 gcc_assert (STMT_VINFO_LIVE_P (stmt_info));
5694 if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_reduction_def)
5695 return false;
5697 if (!is_gimple_assign (stmt))
5699 if (gimple_call_internal_p (stmt)
5700 && gimple_call_internal_fn (stmt) == IFN_GOMP_SIMD_LANE
5701 && gimple_call_lhs (stmt)
5702 && loop->simduid
5703 && TREE_CODE (gimple_call_arg (stmt, 0)) == SSA_NAME
5704 && loop->simduid
5705 == SSA_NAME_VAR (gimple_call_arg (stmt, 0)))
5707 edge e = single_exit (loop);
5708 basic_block merge_bb = e->dest;
5709 imm_use_iterator imm_iter;
5710 use_operand_p use_p;
5711 tree lhs = gimple_call_lhs (stmt);
5713 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, lhs)
5715 gimple use_stmt = USE_STMT (use_p);
5716 if (gimple_code (use_stmt) == GIMPLE_PHI
5717 && gimple_bb (use_stmt) == merge_bb)
5719 if (vec_stmt)
5721 tree vfm1
5722 = build_int_cst (unsigned_type_node,
5723 loop_vinfo->vectorization_factor - 1);
5724 SET_PHI_ARG_DEF (use_stmt, e->dest_idx, vfm1);
5726 return true;
5731 return false;
5734 if (TREE_CODE (gimple_assign_lhs (stmt)) != SSA_NAME)
5735 return false;
5737 /* FORNOW. CHECKME. */
5738 if (nested_in_vect_loop_p (loop, stmt))
5739 return false;
5741 code = gimple_assign_rhs_code (stmt);
5742 op_type = TREE_CODE_LENGTH (code);
5743 rhs_class = get_gimple_rhs_class (code);
5744 gcc_assert (rhs_class != GIMPLE_UNARY_RHS || op_type == unary_op);
5745 gcc_assert (rhs_class != GIMPLE_BINARY_RHS || op_type == binary_op);
5747 /* FORNOW: support only if all uses are invariant. This means
5748 that the scalar operations can remain in place, unvectorized.
5749 The original last scalar value that they compute will be used. */
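/* Illustrative example (hypothetical source, not from this file):

       x = ...;                      <-- defined before the loop
       for (i = 0; i < n; i++)
         {
           a[i] = b[i] + c[i];       <-- vectorized
           t = x + 5;                <-- live: only used after the loop
         }
       use (t);

   All operands of the live statement are loop-invariant or constant, so
   the scalar statement can stay in place and its last computed value is
   what the use after the loop sees.  */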
5751 for (i = 0; i < op_type; i++)
5753 if (rhs_class == GIMPLE_SINGLE_RHS)
5754 op = TREE_OPERAND (gimple_op (stmt, 1), i);
5755 else
5756 op = gimple_op (stmt, i + 1);
5757 if (op
5758 && !vect_is_simple_use (op, stmt, loop_vinfo, NULL, &def_stmt, &def,
5759 &dt))
5761 if (dump_enabled_p ())
5762 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5763 "use not simple.\n");
5764 return false;
5767 if (dt != vect_external_def && dt != vect_constant_def)
5768 return false;
5771 /* No transformation is required for the cases we currently support. */
5772 return true;
5775 /* Kill any debug uses outside LOOP of SSA names defined in STMT. */
5777 static void
5778 vect_loop_kill_debug_uses (struct loop *loop, gimple stmt)
5780 ssa_op_iter op_iter;
5781 imm_use_iterator imm_iter;
5782 def_operand_p def_p;
5783 gimple ustmt;
5785 FOR_EACH_PHI_OR_STMT_DEF (def_p, stmt, op_iter, SSA_OP_DEF)
5787 FOR_EACH_IMM_USE_STMT (ustmt, imm_iter, DEF_FROM_PTR (def_p))
5789 basic_block bb;
5791 if (!is_gimple_debug (ustmt))
5792 continue;
5794 bb = gimple_bb (ustmt);
5796 if (!flow_bb_inside_loop_p (loop, bb))
5798 if (gimple_debug_bind_p (ustmt))
5800 if (dump_enabled_p ())
5801 dump_printf_loc (MSG_NOTE, vect_location,
5802 "killing debug use\n");
5804 gimple_debug_bind_reset_value (ustmt);
5805 update_stmt (ustmt);
5807 else
5808 gcc_unreachable ();
5815 /* This function builds ni_name = number of iterations. Statements
5816 are emitted on the loop preheader edge. */
5818 static tree
5819 vect_build_loop_niters (loop_vec_info loop_vinfo)
5821 tree ni = unshare_expr (LOOP_VINFO_NITERS (loop_vinfo));
5822 if (TREE_CODE (ni) == INTEGER_CST)
5823 return ni;
5824 else
5826 tree ni_name, var;
5827 gimple_seq stmts = NULL;
5828 edge pe = loop_preheader_edge (LOOP_VINFO_LOOP (loop_vinfo));
5830 var = create_tmp_var (TREE_TYPE (ni), "niters");
5831 ni_name = force_gimple_operand (ni, &stmts, false, var);
5832 if (stmts)
5833 gsi_insert_seq_on_edge_immediate (pe, stmts);
5835 return ni_name;
5840 /* This function generates the following statements:
5842 ni_name = number of iterations loop executes
5843 ratio = ni_name / vf
5844 ratio_mult_vf_name = ratio * vf
5846 and places them on the loop preheader edge. */
5848 static void
5849 vect_generate_tmps_on_preheader (loop_vec_info loop_vinfo,
5850 tree ni_name,
5851 tree *ratio_mult_vf_name_ptr,
5852 tree *ratio_name_ptr)
5854 tree ni_minus_gap_name;
5855 tree var;
5856 tree ratio_name;
5857 tree ratio_mult_vf_name;
5858 int vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
5859 edge pe = loop_preheader_edge (LOOP_VINFO_LOOP (loop_vinfo));
5860 tree log_vf;
5862 log_vf = build_int_cst (TREE_TYPE (ni_name), exact_log2 (vf));
5864 /* If an epilogue loop is required because of data accesses with gaps, we
5865 subtract one iteration from the total number of iterations here for
5866 correct calculation of RATIO. */
5867 if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo))
5869 ni_minus_gap_name = fold_build2 (MINUS_EXPR, TREE_TYPE (ni_name),
5870 ni_name,
5871 build_one_cst (TREE_TYPE (ni_name)));
5872 if (!is_gimple_val (ni_minus_gap_name))
5874 var = create_tmp_var (TREE_TYPE (ni_name), "ni_gap");
5875 gimple stmts = NULL;
5876 ni_minus_gap_name = force_gimple_operand (ni_minus_gap_name, &stmts,
5877 true, var);
5878 gsi_insert_seq_on_edge_immediate (pe, stmts);
5881 else
5882 ni_minus_gap_name = ni_name;
5884 /* Create: ratio = ni >> log2(vf) */
5885 /* ??? As we have ni == number of latch executions + 1, ni could
5886 have overflowed to zero. So avoid computing the ratio based on ni
5887 but compute it using the fact that we know ratio will be at least
5888 one, thus via (ni - vf) >> log2(vf) + 1. */
5889 ratio_name
5890 = fold_build2 (PLUS_EXPR, TREE_TYPE (ni_name),
5891 fold_build2 (RSHIFT_EXPR, TREE_TYPE (ni_name),
5892 fold_build2 (MINUS_EXPR, TREE_TYPE (ni_name),
5893 ni_minus_gap_name,
5894 build_int_cst
5895 (TREE_TYPE (ni_name), vf)),
5896 log_vf),
5897 build_int_cst (TREE_TYPE (ni_name), 1));
5898 if (!is_gimple_val (ratio_name))
5900 var = create_tmp_var (TREE_TYPE (ni_name), "bnd");
5901 gimple stmts = NULL;
5902 ratio_name = force_gimple_operand (ratio_name, &stmts, true, var);
5903 gsi_insert_seq_on_edge_immediate (pe, stmts);
5905 *ratio_name_ptr = ratio_name;
5907 /* Create: ratio_mult_vf = ratio << log2 (vf). */
5909 if (ratio_mult_vf_name_ptr)
5911 ratio_mult_vf_name = fold_build2 (LSHIFT_EXPR, TREE_TYPE (ratio_name),
5912 ratio_name, log_vf);
5913 if (!is_gimple_val (ratio_mult_vf_name))
5915 var = create_tmp_var (TREE_TYPE (ni_name), "ratio_mult_vf");
5916 gimple stmts = NULL;
5917 ratio_mult_vf_name = force_gimple_operand (ratio_mult_vf_name, &stmts,
5918 true, var);
5919 gsi_insert_seq_on_edge_immediate (pe, stmts);
5921 *ratio_mult_vf_name_ptr = ratio_mult_vf_name;
5924 return;
5925 }
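/* Illustrative, self-contained sketch of the overflow-safe ratio
   computation above, written in plain C for a 32-bit unsigned iteration
   count; the function is hypothetical and only meant to show the
   arithmetic.  NITERS is the total number of scalar iterations (latch
   executions + 1, so it may have wrapped to 0) and VF is a power-of-two
   vectorization factor the loop is known to reach.  */

static unsigned int
example_vector_iteration_count (unsigned int niters, unsigned int vf)
{
  unsigned int log_vf = 0;
  while ((1u << log_vf) < vf)
    log_vf++;

  /* "niters >> log_vf" would give 0 if NITERS wrapped around to 0;
     "((niters - vf) >> log_vf) + 1" still yields the correct count,
     e.g. ((10 - 4) >> 2) + 1 == 2 for niters == 10, vf == 4, and
     ((0u - 4) >> 2) + 1 == 0x40000000 for the wrapped case.  */
  return ((niters - vf) >> log_vf) + 1;
}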
5928 /* Function vect_transform_loop.
5930 The analysis phase has determined that the loop is vectorizable.
5931 Vectorize the loop - create vectorized stmts to replace the scalar
5932 stmts in the loop, and update the loop exit condition. */
5934 void
5935 vect_transform_loop (loop_vec_info loop_vinfo)
5937 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
5938 basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
5939 int nbbs = loop->num_nodes;
5940 int i;
5941 tree ratio = NULL;
5942 int vectorization_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
5943 bool grouped_store;
5944 bool slp_scheduled = false;
5945 gimple stmt, pattern_stmt;
5946 gimple_seq pattern_def_seq = NULL;
5947 gimple_stmt_iterator pattern_def_si = gsi_none ();
5948 bool transform_pattern_stmt = false;
5949 bool check_profitability = false;
5950 int th;
5951 /* Record number of iterations before we started tampering with the profile. */
5952 gcov_type expected_iterations = expected_loop_iterations_unbounded (loop);
5954 if (dump_enabled_p ())
5955 dump_printf_loc (MSG_NOTE, vect_location, "=== vec_transform_loop ===\n");
5957 /* If the profile is imprecise, we have a chance to fix it up. */
5958 if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
5959 expected_iterations = LOOP_VINFO_INT_NITERS (loop_vinfo);
5961 /* Use the more conservative vectorization threshold. If the number
5962 of iterations is constant assume the cost check has been performed
5963 by our caller. If the threshold makes all loops profitable that
5964 run at least the vectorization factor number of times, checking
5965 is pointless, too. */
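/* Illustrative numbers (assumed): with VF == 4, a threshold of 2 or
   less means any loop that reaches the vector code (and therefore runs
   at least 4 scalar iterations) is already profitable, so no runtime
   check is needed; a threshold of 3 or more combined with an unknown
   iteration count makes the test below set CHECK_PROFITABILITY.  */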
5966 th = LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo);
5967 if (th >= LOOP_VINFO_VECT_FACTOR (loop_vinfo) - 1
5968 && !LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
5970 if (dump_enabled_p ())
5971 dump_printf_loc (MSG_NOTE, vect_location,
5972 "Profitability threshold is %d loop iterations.\n",
5973 th);
5974 check_profitability = true;
5977 /* Version the loop first, if required, so the profitability check
5978 comes first. */
5980 if (LOOP_REQUIRES_VERSIONING_FOR_ALIGNMENT (loop_vinfo)
5981 || LOOP_REQUIRES_VERSIONING_FOR_ALIAS (loop_vinfo))
5983 vect_loop_versioning (loop_vinfo, th, check_profitability);
5984 check_profitability = false;
5987 tree ni_name = vect_build_loop_niters (loop_vinfo);
5988 LOOP_VINFO_NITERS_UNCHANGED (loop_vinfo) = ni_name;
5990 /* Peel the loop if there are data refs with unknown alignment.
5991 Only one data ref with unknown alignment is allowed. */
5993 if (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo))
5995 vect_do_peeling_for_alignment (loop_vinfo, ni_name,
5996 th, check_profitability);
5997 check_profitability = false;
5998 /* The above adjusts LOOP_VINFO_NITERS, so cause ni_name to
5999 be re-computed. */
6000 ni_name = NULL_TREE;
6003 /* If the loop has a symbolic number of iterations 'n' (i.e. it's not a
6004 compile time constant), or it is a constant that is not divisible by the
6005 vectorization factor, then an epilog loop needs to be created.
6006 We therefore duplicate the loop: the original loop will be vectorized,
6007 and will compute the first (n/VF) iterations. The second copy of the loop
6008 will remain scalar and will compute the remaining (n%VF) iterations.
6009 (VF is the vectorization factor). */
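/* For example (illustrative, following the style of the example at the
   top of this file), with VF == 4 the transformation conceptually
   produces:

       for (i = 0; i + 4 <= n; i += 4)
         a[i:i+3] = b[i:i+3] + c[i:i+3];   <-- vectorized main loop, n/4 iters
       for (; i < n; i++)
         a[i] = b[i] + c[i];               <-- scalar epilog, n%4 iters

   where a[i:i+3] is informal notation for one 4-element vector access.  */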
6011 if (LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo)
6012 || LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo))
6014 tree ratio_mult_vf;
6015 if (!ni_name)
6016 ni_name = vect_build_loop_niters (loop_vinfo);
6017 vect_generate_tmps_on_preheader (loop_vinfo, ni_name, &ratio_mult_vf,
6018 &ratio);
6019 vect_do_peeling_for_loop_bound (loop_vinfo, ni_name, ratio_mult_vf,
6020 th, check_profitability);
6022 else if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
6023 ratio = build_int_cst (TREE_TYPE (LOOP_VINFO_NITERS (loop_vinfo)),
6024 LOOP_VINFO_INT_NITERS (loop_vinfo) / vectorization_factor);
6025 else
6027 if (!ni_name)
6028 ni_name = vect_build_loop_niters (loop_vinfo);
6029 vect_generate_tmps_on_preheader (loop_vinfo, ni_name, NULL, &ratio);
6032 /* 1) Make sure the loop header has exactly two entries
6033 2) Make sure we have a preheader basic block. */
6035 gcc_assert (EDGE_COUNT (loop->header->preds) == 2);
6037 split_edge (loop_preheader_edge (loop));
6039 /* FORNOW: the vectorizer supports only loops whose body consists
6040 of one basic block (header + empty latch). When the vectorizer
6041 supports more involved loop forms, the order in which the BBs are
6042 traversed will need to be reconsidered. */
6044 for (i = 0; i < nbbs; i++)
6046 basic_block bb = bbs[i];
6047 stmt_vec_info stmt_info;
6049 for (gphi_iterator si = gsi_start_phis (bb); !gsi_end_p (si);
6050 gsi_next (&si))
6052 gphi *phi = si.phi ();
6053 if (dump_enabled_p ())
6055 dump_printf_loc (MSG_NOTE, vect_location,
6056 "------>vectorizing phi: ");
6057 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, phi, 0);
6058 dump_printf (MSG_NOTE, "\n");
6060 stmt_info = vinfo_for_stmt (phi);
6061 if (!stmt_info)
6062 continue;
6064 if (MAY_HAVE_DEBUG_STMTS && !STMT_VINFO_LIVE_P (stmt_info))
6065 vect_loop_kill_debug_uses (loop, phi);
6067 if (!STMT_VINFO_RELEVANT_P (stmt_info)
6068 && !STMT_VINFO_LIVE_P (stmt_info))
6069 continue;
6071 if (STMT_VINFO_VECTYPE (stmt_info)
6072 && (TYPE_VECTOR_SUBPARTS (STMT_VINFO_VECTYPE (stmt_info))
6073 != (unsigned HOST_WIDE_INT) vectorization_factor)
6074 && dump_enabled_p ())
6075 dump_printf_loc (MSG_NOTE, vect_location, "multiple-types.\n");
6077 if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_induction_def)
6079 if (dump_enabled_p ())
6080 dump_printf_loc (MSG_NOTE, vect_location, "transform phi.\n");
6081 vect_transform_stmt (phi, NULL, NULL, NULL, NULL);
6085 pattern_stmt = NULL;
6086 for (gimple_stmt_iterator si = gsi_start_bb (bb);
6087 !gsi_end_p (si) || transform_pattern_stmt;)
6089 bool is_store;
6091 if (transform_pattern_stmt)
6092 stmt = pattern_stmt;
6093 else
6095 stmt = gsi_stmt (si);
6096 /* During vectorization remove existing clobber stmts. */
6097 if (gimple_clobber_p (stmt))
6099 unlink_stmt_vdef (stmt);
6100 gsi_remove (&si, true);
6101 release_defs (stmt);
6102 continue;
6106 if (dump_enabled_p ())
6108 dump_printf_loc (MSG_NOTE, vect_location,
6109 "------>vectorizing statement: ");
6110 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, stmt, 0);
6111 dump_printf (MSG_NOTE, "\n");
6114 stmt_info = vinfo_for_stmt (stmt);
6116 /* vector stmts created in the outer-loop during vectorization of
6117 stmts in an inner-loop may not have a stmt_info, and do not
6118 need to be vectorized. */
6119 if (!stmt_info)
6121 gsi_next (&si);
6122 continue;
6125 if (MAY_HAVE_DEBUG_STMTS && !STMT_VINFO_LIVE_P (stmt_info))
6126 vect_loop_kill_debug_uses (loop, stmt);
6128 if (!STMT_VINFO_RELEVANT_P (stmt_info)
6129 && !STMT_VINFO_LIVE_P (stmt_info))
6131 if (STMT_VINFO_IN_PATTERN_P (stmt_info)
6132 && (pattern_stmt = STMT_VINFO_RELATED_STMT (stmt_info))
6133 && (STMT_VINFO_RELEVANT_P (vinfo_for_stmt (pattern_stmt))
6134 || STMT_VINFO_LIVE_P (vinfo_for_stmt (pattern_stmt))))
6136 stmt = pattern_stmt;
6137 stmt_info = vinfo_for_stmt (stmt);
6139 else
6141 gsi_next (&si);
6142 continue;
6145 else if (STMT_VINFO_IN_PATTERN_P (stmt_info)
6146 && (pattern_stmt = STMT_VINFO_RELATED_STMT (stmt_info))
6147 && (STMT_VINFO_RELEVANT_P (vinfo_for_stmt (pattern_stmt))
6148 || STMT_VINFO_LIVE_P (vinfo_for_stmt (pattern_stmt))))
6149 transform_pattern_stmt = true;
6151 /* If pattern statement has def stmts, vectorize them too. */
6152 if (is_pattern_stmt_p (stmt_info))
6154 if (pattern_def_seq == NULL)
6156 pattern_def_seq = STMT_VINFO_PATTERN_DEF_SEQ (stmt_info);
6157 pattern_def_si = gsi_start (pattern_def_seq);
6159 else if (!gsi_end_p (pattern_def_si))
6160 gsi_next (&pattern_def_si);
6161 if (pattern_def_seq != NULL)
6163 gimple pattern_def_stmt = NULL;
6164 stmt_vec_info pattern_def_stmt_info = NULL;
6166 while (!gsi_end_p (pattern_def_si))
6168 pattern_def_stmt = gsi_stmt (pattern_def_si);
6169 pattern_def_stmt_info
6170 = vinfo_for_stmt (pattern_def_stmt);
6171 if (STMT_VINFO_RELEVANT_P (pattern_def_stmt_info)
6172 || STMT_VINFO_LIVE_P (pattern_def_stmt_info))
6173 break;
6174 gsi_next (&pattern_def_si);
6177 if (!gsi_end_p (pattern_def_si))
6179 if (dump_enabled_p ())
6181 dump_printf_loc (MSG_NOTE, vect_location,
6182 "==> vectorizing pattern def "
6183 "stmt: ");
6184 dump_gimple_stmt (MSG_NOTE, TDF_SLIM,
6185 pattern_def_stmt, 0);
6186 dump_printf (MSG_NOTE, "\n");
6189 stmt = pattern_def_stmt;
6190 stmt_info = pattern_def_stmt_info;
6192 else
6194 pattern_def_si = gsi_none ();
6195 transform_pattern_stmt = false;
6198 else
6199 transform_pattern_stmt = false;
6202 if (STMT_VINFO_VECTYPE (stmt_info))
6204 unsigned int nunits
6205 = (unsigned int)
6206 TYPE_VECTOR_SUBPARTS (STMT_VINFO_VECTYPE (stmt_info));
6207 if (!STMT_SLP_TYPE (stmt_info)
6208 && nunits != (unsigned int) vectorization_factor
6209 && dump_enabled_p ())
6210 /* For SLP, VF is set according to the unrolling factor, and not
6211 to the vector size, hence for SLP this message is not valid. */
6212 dump_printf_loc (MSG_NOTE, vect_location, "multiple-types.\n");
6215 /* SLP. Schedule all the SLP instances when the first SLP stmt is
6216 reached. */
6217 if (STMT_SLP_TYPE (stmt_info))
6219 if (!slp_scheduled)
6221 slp_scheduled = true;
6223 if (dump_enabled_p ())
6224 dump_printf_loc (MSG_NOTE, vect_location,
6225 "=== scheduling SLP instances ===\n");
6227 vect_schedule_slp (loop_vinfo, NULL);
6230 /* Hybrid SLP stmts must be vectorized in addition to SLP. */
6231 if (!vinfo_for_stmt (stmt) || PURE_SLP_STMT (stmt_info))
6233 if (!transform_pattern_stmt && gsi_end_p (pattern_def_si))
6235 pattern_def_seq = NULL;
6236 gsi_next (&si);
6238 continue;
6242 /* -------- vectorize statement ------------ */
6243 if (dump_enabled_p ())
6244 dump_printf_loc (MSG_NOTE, vect_location, "transform statement.\n");
6246 grouped_store = false;
6247 is_store = vect_transform_stmt (stmt, &si, &grouped_store, NULL, NULL);
6248 if (is_store)
6250 if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
6252 /* Interleaving. If IS_STORE is TRUE, the vectorization of the
6253 interleaving chain was completed - free all the stores in
6254 the chain. */
6255 gsi_next (&si);
6256 vect_remove_stores (GROUP_FIRST_ELEMENT (stmt_info));
6258 else
6260 /* Free the attached stmt_vec_info and remove the stmt. */
6261 gimple store = gsi_stmt (si);
6262 free_stmt_vec_info (store);
6263 unlink_stmt_vdef (store);
6264 gsi_remove (&si, true);
6265 release_defs (store);
6268 /* Stores can only appear at the end of pattern statements. */
6269 gcc_assert (!transform_pattern_stmt);
6270 pattern_def_seq = NULL;
6272 else if (!transform_pattern_stmt && gsi_end_p (pattern_def_si))
6274 pattern_def_seq = NULL;
6275 gsi_next (&si);
6277 } /* stmts in BB */
6278 } /* BBs in loop */
6280 slpeel_make_loop_iterate_ntimes (loop, ratio);
6282 /* Reduce loop iterations by the vectorization factor. */
6283 scale_loop_profile (loop, GCOV_COMPUTE_SCALE (1, vectorization_factor),
6284 expected_iterations / vectorization_factor);
6285 loop->nb_iterations_upper_bound
6286 = wi::udiv_floor (loop->nb_iterations_upper_bound, vectorization_factor);
6287 if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo)
6288 && loop->nb_iterations_upper_bound != 0)
6289 loop->nb_iterations_upper_bound = loop->nb_iterations_upper_bound - 1;
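/* E.g. (illustrative numbers): an upper bound of 1003 scalar iterations
   with a vectorization factor of 4 becomes 1003/4 == 250, and 249 when
   an epilogue is peeled off because of gaps; the iteration estimate
   below is adjusted the same way.  */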
6290 if (loop->any_estimate)
6292 loop->nb_iterations_estimate
6293 = wi::udiv_floor (loop->nb_iterations_estimate, vectorization_factor);
6294 if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo)
6295 && loop->nb_iterations_estimate != 0)
6296 loop->nb_iterations_estimate = loop->nb_iterations_estimate - 1;
6299 if (dump_enabled_p ())
6301 dump_printf_loc (MSG_NOTE, vect_location,
6302 "LOOP VECTORIZED\n");
6303 if (loop->inner)
6304 dump_printf_loc (MSG_NOTE, vect_location,
6305 "OUTER LOOP VECTORIZED\n");
6306 dump_printf (MSG_NOTE, "\n");