gcc/tree-vect-loop.c
1 /* Loop Vectorization
2 Copyright (C) 2003-2013 Free Software Foundation, Inc.
3 Contributed by Dorit Naishlos <dorit@il.ibm.com> and
4 Ira Rosen <irar@il.ibm.com>
6 This file is part of GCC.
8 GCC is free software; you can redistribute it and/or modify it under
9 the terms of the GNU General Public License as published by the Free
10 Software Foundation; either version 3, or (at your option) any later
11 version.
13 GCC is distributed in the hope that it will be useful, but WITHOUT ANY
14 WARRANTY; without even the implied warranty of MERCHANTABILITY or
15 FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
16 for more details.
18 You should have received a copy of the GNU General Public License
19 along with GCC; see the file COPYING3. If not see
20 <http://www.gnu.org/licenses/>. */
22 #include "config.h"
23 #include "system.h"
24 #include "coretypes.h"
25 #include "dumpfile.h"
26 #include "tm.h"
27 #include "tree.h"
28 #include "stor-layout.h"
29 #include "basic-block.h"
30 #include "gimple-pretty-print.h"
31 #include "tree-ssa-alias.h"
32 #include "internal-fn.h"
33 #include "gimple-expr.h"
34 #include "is-a.h"
35 #include "gimple.h"
36 #include "gimplify.h"
37 #include "gimple-iterator.h"
38 #include "gimplify-me.h"
39 #include "gimple-ssa.h"
40 #include "tree-phinodes.h"
41 #include "ssa-iterators.h"
42 #include "stringpool.h"
43 #include "tree-ssanames.h"
44 #include "tree-ssa-loop-ivopts.h"
45 #include "tree-ssa-loop-manip.h"
46 #include "tree-ssa-loop-niter.h"
47 #include "tree-pass.h"
48 #include "cfgloop.h"
49 #include "expr.h"
50 #include "recog.h"
51 #include "optabs.h"
52 #include "params.h"
53 #include "diagnostic-core.h"
54 #include "tree-chrec.h"
55 #include "tree-scalar-evolution.h"
56 #include "tree-vectorizer.h"
57 #include "target.h"
59 /* Loop Vectorization Pass.
61 This pass tries to vectorize loops.
63 For example, the vectorizer transforms the following simple loop:
65 short a[N]; short b[N]; short c[N]; int i;
67 for (i=0; i<N; i++){
68 a[i] = b[i] + c[i];
69 }
71 as if it were manually vectorized by rewriting the source code into:
73 typedef int __attribute__((mode(V8HI))) v8hi;
74 short a[N]; short b[N]; short c[N]; int i;
75 v8hi *pa = (v8hi*)a, *pb = (v8hi*)b, *pc = (v8hi*)c;
76 v8hi va, vb, vc;
78 for (i=0; i<N/8; i++){
79 vb = pb[i];
80 vc = pc[i];
81 va = vb + vc;
82 pa[i] = va;
83 }
85 The main entry to this pass is vectorize_loops(), in which
86 the vectorizer applies a set of analyses on a given set of loops,
87 followed by the actual vectorization transformation for the loops that
88 have successfully passed the analysis phase.
89 Throughout this pass we make a distinction between two types of
90 data: scalars (which are represented by SSA_NAMES), and memory references
91 ("data-refs"). These two types of data require different handling both
92 during analysis and transformation. The types of data-refs that the
93 vectorizer currently supports are ARRAY_REFS whose base is an array DECL
94 (not a pointer), and INDIRECT_REFS through pointers; both array and pointer
95 accesses are required to have a simple (consecutive) access pattern.
97 Analysis phase:
98 ===============
99 The driver for the analysis phase is vect_analyze_loop().
100 It applies a set of analyses, some of which rely on the scalar evolution
101 analyzer (scev) developed by Sebastian Pop.
103 During the analysis phase the vectorizer records some information
104 per stmt in a "stmt_vec_info" struct which is attached to each stmt in the
105 loop, as well as general information about the loop as a whole, which is
106 recorded in a "loop_vec_info" struct attached to each loop.
108 Transformation phase:
109 =====================
110 The loop transformation phase scans all the stmts in the loop, and
111 creates a vector stmt (or a sequence of stmts) for each scalar stmt S in
112 the loop that needs to be vectorized. It inserts the vector code sequence
113 just before the scalar stmt S, and records a pointer to the vector code
114 in STMT_VINFO_VEC_STMT (stmt_info) (stmt_info is the stmt_vec_info struct
115 attached to S). This pointer will be used for the vectorization of following
116 stmts which use the def of stmt S. Stmt S is removed if it writes to memory;
117 otherwise, we rely on dead code elimination for removing it.
119 For example, say stmt S1 was vectorized into stmt VS1:
121 VS1: vb = px[i];
122 S1: b = x[i]; STMT_VINFO_VEC_STMT (stmt_info (S1)) = VS1
123 S2: a = b;
125 To vectorize stmt S2, the vectorizer first finds the stmt that defines
126 the operand 'b' (S1), and gets the relevant vector def 'vb' from the
127 vector stmt VS1 pointed to by STMT_VINFO_VEC_STMT (stmt_info (S1)). The
128 resulting sequence would be:
130 VS1: vb = px[i];
131 S1: b = x[i]; STMT_VINFO_VEC_STMT (stmt_info (S1)) = VS1
132 VS2: va = vb;
133 S2: a = b; STMT_VINFO_VEC_STMT (stmt_info (S2)) = VS2
135 Operands that are not SSA_NAMEs are data-refs that appear in
136 load/store operations (like 'x[i]' in S1), and are handled differently.
138 Target modeling:
139 =================
140 Currently the only target specific information that is used is the
141 size of the vector (in bytes) - "TARGET_VECTORIZE_UNITS_PER_SIMD_WORD".
142 Targets that can support different sizes of vectors will, for now, need
143 to specify one value for "TARGET_VECTORIZE_UNITS_PER_SIMD_WORD". More
144 flexibility will be added in the future.
146 Since we only vectorize operations whose vector form can be
147 expressed using existing tree codes, to verify that an operation is
148 supported, the vectorizer checks the relevant optab at the relevant
149 machine_mode (e.g., optab_handler (add_optab, V8HImode)). If
150 the value found is CODE_FOR_nothing, then there's no target support, and
151 we can't vectorize the stmt.
153 For additional information on this project see:
154 http://gcc.gnu.org/projects/tree-ssa/vectorization.html
155 */
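/* Editor's addition - an illustrative, hedged sketch of the target-support
   check described above, shown in isolation.  It uses only the GCC-internal
   names already cited in this comment (optab_handler, add_optab,
   CODE_FOR_nothing, V8HImode); the helper name itself is hypothetical and is
   not part of this file.  */

static bool
example_v8hi_add_supported_p (void)
{
  /* CODE_FOR_nothing means the target has no instruction for an addition
     in V8HImode, so such a stmt could not be vectorized.  */
  return optab_handler (add_optab, V8HImode) != CODE_FOR_nothing;
}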
157 static void vect_estimate_min_profitable_iters (loop_vec_info, int *, int *);
159 /* Function vect_determine_vectorization_factor
161 Determine the vectorization factor (VF). VF is the number of data elements
162 that are operated upon in parallel in a single iteration of the vectorized
163 loop. For example, when vectorizing a loop that operates on 4-byte elements,
164 on a target with a vector size (VS) of 16 bytes, the VF is set to 4, since 4
165 elements can fit in a single vector register.
167 We currently support vectorization of loops in which all types operated upon
168 are of the same size. Therefore this function currently sets VF according to
169 the size of the types operated upon, and fails if there are multiple sizes
170 in the loop.
172 VF is also the factor by which the loop iterations are strip-mined, e.g.:
173 original loop:
174 for (i=0; i<N; i++){
175 a[i] = b[i] + c[i];
178 vectorized loop:
179 for (i=0; i<N; i+=VF){
180 a[i:VF] = b[i:VF] + c[i:VF];
184 static bool
185 vect_determine_vectorization_factor (loop_vec_info loop_vinfo)
187 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
188 basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
189 int nbbs = loop->num_nodes;
190 gimple_stmt_iterator si;
191 unsigned int vectorization_factor = 0;
192 tree scalar_type;
193 gimple phi;
194 tree vectype;
195 unsigned int nunits;
196 stmt_vec_info stmt_info;
197 int i;
198 HOST_WIDE_INT dummy;
199 gimple stmt, pattern_stmt = NULL;
200 gimple_seq pattern_def_seq = NULL;
201 gimple_stmt_iterator pattern_def_si = gsi_none ();
202 bool analyze_pattern_stmt = false;
204 if (dump_enabled_p ())
205 dump_printf_loc (MSG_NOTE, vect_location,
206 "=== vect_determine_vectorization_factor ===\n");
208 for (i = 0; i < nbbs; i++)
210 basic_block bb = bbs[i];
212 for (si = gsi_start_phis (bb); !gsi_end_p (si); gsi_next (&si))
214 phi = gsi_stmt (si);
215 stmt_info = vinfo_for_stmt (phi);
216 if (dump_enabled_p ())
218 dump_printf_loc (MSG_NOTE, vect_location, "==> examining phi: ");
219 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, phi, 0);
220 dump_printf (MSG_NOTE, "\n");
223 gcc_assert (stmt_info);
225 if (STMT_VINFO_RELEVANT_P (stmt_info))
227 gcc_assert (!STMT_VINFO_VECTYPE (stmt_info));
228 scalar_type = TREE_TYPE (PHI_RESULT (phi));
230 if (dump_enabled_p ())
232 dump_printf_loc (MSG_NOTE, vect_location,
233 "get vectype for scalar type: ");
234 dump_generic_expr (MSG_NOTE, TDF_SLIM, scalar_type);
235 dump_printf (MSG_NOTE, "\n");
238 vectype = get_vectype_for_scalar_type (scalar_type);
239 if (!vectype)
241 if (dump_enabled_p ())
243 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
244 "not vectorized: unsupported "
245 "data-type ");
246 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
247 scalar_type);
248 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
250 return false;
252 STMT_VINFO_VECTYPE (stmt_info) = vectype;
254 if (dump_enabled_p ())
256 dump_printf_loc (MSG_NOTE, vect_location, "vectype: ");
257 dump_generic_expr (MSG_NOTE, TDF_SLIM, vectype);
258 dump_printf (MSG_NOTE, "\n");
261 nunits = TYPE_VECTOR_SUBPARTS (vectype);
262 if (dump_enabled_p ())
263 dump_printf_loc (MSG_NOTE, vect_location, "nunits = %d\n",
264 nunits);
266 if (!vectorization_factor
267 || (nunits > vectorization_factor))
268 vectorization_factor = nunits;
272 for (si = gsi_start_bb (bb); !gsi_end_p (si) || analyze_pattern_stmt;)
274 tree vf_vectype;
276 if (analyze_pattern_stmt)
277 stmt = pattern_stmt;
278 else
279 stmt = gsi_stmt (si);
281 stmt_info = vinfo_for_stmt (stmt);
283 if (dump_enabled_p ())
285 dump_printf_loc (MSG_NOTE, vect_location,
286 "==> examining statement: ");
287 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, stmt, 0);
288 dump_printf (MSG_NOTE, "\n");
291 gcc_assert (stmt_info);
293 /* Skip stmts which do not need to be vectorized. */
294 if ((!STMT_VINFO_RELEVANT_P (stmt_info)
295 && !STMT_VINFO_LIVE_P (stmt_info))
296 || gimple_clobber_p (stmt))
298 if (STMT_VINFO_IN_PATTERN_P (stmt_info)
299 && (pattern_stmt = STMT_VINFO_RELATED_STMT (stmt_info))
300 && (STMT_VINFO_RELEVANT_P (vinfo_for_stmt (pattern_stmt))
301 || STMT_VINFO_LIVE_P (vinfo_for_stmt (pattern_stmt))))
303 stmt = pattern_stmt;
304 stmt_info = vinfo_for_stmt (pattern_stmt);
305 if (dump_enabled_p ())
307 dump_printf_loc (MSG_NOTE, vect_location,
308 "==> examining pattern statement: ");
309 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, stmt, 0);
310 dump_printf (MSG_NOTE, "\n");
313 else
315 if (dump_enabled_p ())
316 dump_printf_loc (MSG_NOTE, vect_location, "skip.\n");
317 gsi_next (&si);
318 continue;
321 else if (STMT_VINFO_IN_PATTERN_P (stmt_info)
322 && (pattern_stmt = STMT_VINFO_RELATED_STMT (stmt_info))
323 && (STMT_VINFO_RELEVANT_P (vinfo_for_stmt (pattern_stmt))
324 || STMT_VINFO_LIVE_P (vinfo_for_stmt (pattern_stmt))))
325 analyze_pattern_stmt = true;
327 /* If a pattern statement has def stmts, analyze them too. */
328 if (is_pattern_stmt_p (stmt_info))
330 if (pattern_def_seq == NULL)
332 pattern_def_seq = STMT_VINFO_PATTERN_DEF_SEQ (stmt_info);
333 pattern_def_si = gsi_start (pattern_def_seq);
335 else if (!gsi_end_p (pattern_def_si))
336 gsi_next (&pattern_def_si);
337 if (pattern_def_seq != NULL)
339 gimple pattern_def_stmt = NULL;
340 stmt_vec_info pattern_def_stmt_info = NULL;
342 while (!gsi_end_p (pattern_def_si))
344 pattern_def_stmt = gsi_stmt (pattern_def_si);
345 pattern_def_stmt_info
346 = vinfo_for_stmt (pattern_def_stmt);
347 if (STMT_VINFO_RELEVANT_P (pattern_def_stmt_info)
348 || STMT_VINFO_LIVE_P (pattern_def_stmt_info))
349 break;
350 gsi_next (&pattern_def_si);
353 if (!gsi_end_p (pattern_def_si))
355 if (dump_enabled_p ())
357 dump_printf_loc (MSG_NOTE, vect_location,
358 "==> examining pattern def stmt: ");
359 dump_gimple_stmt (MSG_NOTE, TDF_SLIM,
360 pattern_def_stmt, 0);
361 dump_printf (MSG_NOTE, "\n");
364 stmt = pattern_def_stmt;
365 stmt_info = pattern_def_stmt_info;
367 else
369 pattern_def_si = gsi_none ();
370 analyze_pattern_stmt = false;
373 else
374 analyze_pattern_stmt = false;
377 if (gimple_get_lhs (stmt) == NULL_TREE)
379 if (is_gimple_call (stmt))
381 /* Ignore calls with no lhs. These must be calls to
382 #pragma omp simd functions, and what vectorization factor
383 it really needs can't be determined until
384 vectorizable_simd_clone_call. */
385 if (!analyze_pattern_stmt && gsi_end_p (pattern_def_si))
387 pattern_def_seq = NULL;
388 gsi_next (&si);
390 continue;
392 if (dump_enabled_p ())
394 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
395 "not vectorized: irregular stmt.");
396 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt,
398 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
400 return false;
403 if (VECTOR_MODE_P (TYPE_MODE (gimple_expr_type (stmt))))
405 if (dump_enabled_p ())
407 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
408 "not vectorized: vector stmt in loop:");
409 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
410 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
412 return false;
415 if (STMT_VINFO_VECTYPE (stmt_info))
417 /* The only case when a vectype has already been set is for stmts
418 that contain a dataref, or for "pattern-stmts" (stmts
419 generated by the vectorizer to represent/replace a certain
420 idiom). */
421 gcc_assert (STMT_VINFO_DATA_REF (stmt_info)
422 || is_pattern_stmt_p (stmt_info)
423 || !gsi_end_p (pattern_def_si));
424 vectype = STMT_VINFO_VECTYPE (stmt_info);
426 else
428 gcc_assert (!STMT_VINFO_DATA_REF (stmt_info));
429 scalar_type = TREE_TYPE (gimple_get_lhs (stmt));
430 if (dump_enabled_p ())
432 dump_printf_loc (MSG_NOTE, vect_location,
433 "get vectype for scalar type: ");
434 dump_generic_expr (MSG_NOTE, TDF_SLIM, scalar_type);
435 dump_printf (MSG_NOTE, "\n");
437 vectype = get_vectype_for_scalar_type (scalar_type);
438 if (!vectype)
440 if (dump_enabled_p ())
442 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
443 "not vectorized: unsupported "
444 "data-type ");
445 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
446 scalar_type);
447 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
449 return false;
452 STMT_VINFO_VECTYPE (stmt_info) = vectype;
454 if (dump_enabled_p ())
456 dump_printf_loc (MSG_NOTE, vect_location, "vectype: ");
457 dump_generic_expr (MSG_NOTE, TDF_SLIM, vectype);
458 dump_printf (MSG_NOTE, "\n");
462 /* The vectorization factor is determined by the smallest
463 scalar type (or the largest vector size, but we only
464 support one vector size per loop). */
465 scalar_type = vect_get_smallest_scalar_type (stmt, &dummy,
466 &dummy);
467 if (dump_enabled_p ())
469 dump_printf_loc (MSG_NOTE, vect_location,
470 "get vectype for scalar type: ");
471 dump_generic_expr (MSG_NOTE, TDF_SLIM, scalar_type);
472 dump_printf (MSG_NOTE, "\n");
474 vf_vectype = get_vectype_for_scalar_type (scalar_type);
475 if (!vf_vectype)
477 if (dump_enabled_p ())
479 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
480 "not vectorized: unsupported data-type ");
481 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
482 scalar_type);
483 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
485 return false;
488 if ((GET_MODE_SIZE (TYPE_MODE (vectype))
489 != GET_MODE_SIZE (TYPE_MODE (vf_vectype))))
491 if (dump_enabled_p ())
493 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
494 "not vectorized: different sized vector "
495 "types in statement, ");
496 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
497 vectype);
498 dump_printf (MSG_MISSED_OPTIMIZATION, " and ");
499 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
500 vf_vectype);
501 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
503 return false;
506 if (dump_enabled_p ())
508 dump_printf_loc (MSG_NOTE, vect_location, "vectype: ");
509 dump_generic_expr (MSG_NOTE, TDF_SLIM, vf_vectype);
510 dump_printf (MSG_NOTE, "\n");
513 nunits = TYPE_VECTOR_SUBPARTS (vf_vectype);
514 if (dump_enabled_p ())
515 dump_printf_loc (MSG_NOTE, vect_location, "nunits = %d\n", nunits);
516 if (!vectorization_factor
517 || (nunits > vectorization_factor))
518 vectorization_factor = nunits;
520 if (!analyze_pattern_stmt && gsi_end_p (pattern_def_si))
522 pattern_def_seq = NULL;
523 gsi_next (&si);
528 /* TODO: Analyze cost. Decide if it is worthwhile to vectorize. */
529 if (dump_enabled_p ())
530 dump_printf_loc (MSG_NOTE, vect_location, "vectorization factor = %d\n",
531 vectorization_factor);
532 if (vectorization_factor <= 1)
534 if (dump_enabled_p ())
535 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
536 "not vectorized: unsupported data-type\n");
537 return false;
539 LOOP_VINFO_VECT_FACTOR (loop_vinfo) = vectorization_factor;
541 return true;
545 /* Function vect_is_simple_iv_evolution.
547 FORNOW: A simple evolution of an induction variable in the loop is
548 considered a polynomial evolution. */
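/* Editor's note - an illustrative example of the distinction made above, in
   the chrec notation used by the scalar evolution analyzer (the variable
   names are hypothetical):

     for (i = 0; i < n; i++)
       p = p + 4;

   scev describes p as the chrec {p_0, +, 4}_LOOP_NB: its evolution part is
   the constant 4 and its initial condition is p_0, so the evolution is
   "simple".  For an accumulation like s += i in the same loop, the evolution
   part of s is itself a chrec ({0, +, 1}_LOOP_NB), so the tree_is_chrec
   check in the function below rejects it.  */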
550 static bool
551 vect_is_simple_iv_evolution (unsigned loop_nb, tree access_fn, tree * init,
552 tree * step)
554 tree init_expr;
555 tree step_expr;
556 tree evolution_part = evolution_part_in_loop_num (access_fn, loop_nb);
557 basic_block bb;
559 /* When there is no evolution in this loop, the evolution function
560 is not "simple". */
561 if (evolution_part == NULL_TREE)
562 return false;
564 /* When the evolution is a polynomial of degree >= 2
565 the evolution function is not "simple". */
566 if (tree_is_chrec (evolution_part))
567 return false;
569 step_expr = evolution_part;
570 init_expr = unshare_expr (initial_condition_in_loop_num (access_fn, loop_nb));
572 if (dump_enabled_p ())
574 dump_printf_loc (MSG_NOTE, vect_location, "step: ");
575 dump_generic_expr (MSG_NOTE, TDF_SLIM, step_expr);
576 dump_printf (MSG_NOTE, ", init: ");
577 dump_generic_expr (MSG_NOTE, TDF_SLIM, init_expr);
578 dump_printf (MSG_NOTE, "\n");
581 *init = init_expr;
582 *step = step_expr;
584 if (TREE_CODE (step_expr) != INTEGER_CST
585 && (TREE_CODE (step_expr) != SSA_NAME
586 || ((bb = gimple_bb (SSA_NAME_DEF_STMT (step_expr)))
587 && flow_bb_inside_loop_p (get_loop (cfun, loop_nb), bb))
588 || (!INTEGRAL_TYPE_P (TREE_TYPE (step_expr))
589 && (!SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr))
590 || !flag_associative_math)))
591 && (TREE_CODE (step_expr) != REAL_CST
592 || !flag_associative_math))
594 if (dump_enabled_p ())
595 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
596 "step unknown.\n");
597 return false;
600 return true;
603 /* Function vect_analyze_scalar_cycles_1.
605 Examine the cross iteration def-use cycles of scalar variables
606 in LOOP. LOOP_VINFO represents the loop that is now being
607 considered for vectorization (can be LOOP, or an outer-loop
608 enclosing LOOP). */
610 static void
611 vect_analyze_scalar_cycles_1 (loop_vec_info loop_vinfo, struct loop *loop)
613 basic_block bb = loop->header;
614 tree init, step;
615 stack_vec<gimple, 64> worklist;
616 gimple_stmt_iterator gsi;
617 bool double_reduc;
619 if (dump_enabled_p ())
620 dump_printf_loc (MSG_NOTE, vect_location,
621 "=== vect_analyze_scalar_cycles ===\n");
623 /* First - identify all inductions. Reduction detection assumes that all the
624 inductions have been identified; therefore, this order must not be
625 changed. */
626 for (gsi = gsi_start_phis (bb); !gsi_end_p (gsi); gsi_next (&gsi))
628 gimple phi = gsi_stmt (gsi);
629 tree access_fn = NULL;
630 tree def = PHI_RESULT (phi);
631 stmt_vec_info stmt_vinfo = vinfo_for_stmt (phi);
633 if (dump_enabled_p ())
635 dump_printf_loc (MSG_NOTE, vect_location, "Analyze phi: ");
636 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, phi, 0);
637 dump_printf (MSG_NOTE, "\n");
640 /* Skip virtual phi's. The data dependences that are associated with
641 virtual defs/uses (i.e., memory accesses) are analyzed elsewhere. */
642 if (virtual_operand_p (def))
643 continue;
645 STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_unknown_def_type;
647 /* Analyze the evolution function. */
648 access_fn = analyze_scalar_evolution (loop, def);
649 if (access_fn)
651 STRIP_NOPS (access_fn);
652 if (dump_enabled_p ())
654 dump_printf_loc (MSG_NOTE, vect_location,
655 "Access function of PHI: ");
656 dump_generic_expr (MSG_NOTE, TDF_SLIM, access_fn);
657 dump_printf (MSG_NOTE, "\n");
659 STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_vinfo)
660 = evolution_part_in_loop_num (access_fn, loop->num);
663 if (!access_fn
664 || !vect_is_simple_iv_evolution (loop->num, access_fn, &init, &step)
665 || (LOOP_VINFO_LOOP (loop_vinfo) != loop
666 && TREE_CODE (step) != INTEGER_CST))
668 worklist.safe_push (phi);
669 continue;
672 gcc_assert (STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_vinfo) != NULL_TREE);
674 if (dump_enabled_p ())
675 dump_printf_loc (MSG_NOTE, vect_location, "Detected induction.\n");
676 STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_induction_def;
680 /* Second - identify all reductions and nested cycles. */
681 while (worklist.length () > 0)
683 gimple phi = worklist.pop ();
684 tree def = PHI_RESULT (phi);
685 stmt_vec_info stmt_vinfo = vinfo_for_stmt (phi);
686 gimple reduc_stmt;
687 bool nested_cycle;
689 if (dump_enabled_p ())
691 dump_printf_loc (MSG_NOTE, vect_location, "Analyze phi: ");
692 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, phi, 0);
693 dump_printf (MSG_NOTE, "\n");
696 gcc_assert (!virtual_operand_p (def)
697 && STMT_VINFO_DEF_TYPE (stmt_vinfo) == vect_unknown_def_type);
699 nested_cycle = (loop != LOOP_VINFO_LOOP (loop_vinfo));
700 reduc_stmt = vect_force_simple_reduction (loop_vinfo, phi, !nested_cycle,
701 &double_reduc);
702 if (reduc_stmt)
704 if (double_reduc)
706 if (dump_enabled_p ())
707 dump_printf_loc (MSG_NOTE, vect_location,
708 "Detected double reduction.\n");
710 STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_double_reduction_def;
711 STMT_VINFO_DEF_TYPE (vinfo_for_stmt (reduc_stmt)) =
712 vect_double_reduction_def;
714 else
716 if (nested_cycle)
718 if (dump_enabled_p ())
719 dump_printf_loc (MSG_NOTE, vect_location,
720 "Detected vectorizable nested cycle.\n");
722 STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_nested_cycle;
723 STMT_VINFO_DEF_TYPE (vinfo_for_stmt (reduc_stmt)) =
724 vect_nested_cycle;
726 else
728 if (dump_enabled_p ())
729 dump_printf_loc (MSG_NOTE, vect_location,
730 "Detected reduction.\n");
732 STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_reduction_def;
733 STMT_VINFO_DEF_TYPE (vinfo_for_stmt (reduc_stmt)) =
734 vect_reduction_def;
735 /* Store the reduction cycles for possible vectorization in
736 loop-aware SLP. */
737 LOOP_VINFO_REDUCTIONS (loop_vinfo).safe_push (reduc_stmt);
741 else
742 if (dump_enabled_p ())
743 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
744 "Unknown def-use cycle pattern.\n");
749 /* Function vect_analyze_scalar_cycles.
751 Examine the cross iteration def-use cycles of scalar variables, by
752 analyzing the loop-header PHIs of scalar variables. Classify each
753 cycle as one of the following: invariant, induction, reduction, unknown.
754 We do that for the loop represented by LOOP_VINFO, and also for its
755 inner-loop, if it exists.
756 Examples for scalar cycles:
758 Example1: reduction:
760 loop1:
761 for (i=0; i<N; i++)
762 sum += a[i];
764 Example2: induction:
766 loop2:
767 for (i=0; i<N; i++)
768 a[i] = i; */
770 static void
771 vect_analyze_scalar_cycles (loop_vec_info loop_vinfo)
773 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
775 vect_analyze_scalar_cycles_1 (loop_vinfo, loop);
777 /* When vectorizing an outer-loop, the inner-loop is executed sequentially.
778 Reductions in such inner-loop therefore have different properties than
779 the reductions in the nest that gets vectorized:
780 1. When vectorized, they are executed in the same order as in the original
781 scalar loop, so we can't change the order of computation when
782 vectorizing them.
783 2. FIXME: Inner-loop reductions can be used in the inner-loop, so the
784 current checks are too strict. */
786 if (loop->inner)
787 vect_analyze_scalar_cycles_1 (loop_vinfo, loop->inner);
791 /* Function vect_get_loop_niters.
793 Determine the number of iterations the loop executes and place it
794 in NUMBER_OF_ITERATIONS. Place the number of latch iterations
795 in NUMBER_OF_ITERATIONSM1.
797 Return the loop exit condition. */
799 static gimple
800 vect_get_loop_niters (struct loop *loop, tree *number_of_iterations,
801 tree *number_of_iterationsm1)
803 tree niters;
805 if (dump_enabled_p ())
806 dump_printf_loc (MSG_NOTE, vect_location,
807 "=== get_loop_niters ===\n");
809 niters = number_of_latch_executions (loop);
810 *number_of_iterationsm1 = niters;
812 /* We want the number of loop header executions which is the number
813 of latch executions plus one.
814 ??? For UINT_MAX latch executions this number overflows to zero
815 for loops like do { n++; } while (n != 0); */
816 if (niters && !chrec_contains_undetermined (niters))
817 niters = fold_build2 (PLUS_EXPR, TREE_TYPE (niters), unshare_expr (niters),
818 build_int_cst (TREE_TYPE (niters), 1));
819 *number_of_iterations = niters;
821 return get_loop_exit_condition (loop);
825 /* Function bb_in_loop_p
827 Used as predicate for dfs order traversal of the loop bbs. */
829 static bool
830 bb_in_loop_p (const_basic_block bb, const void *data)
832 const struct loop *const loop = (const struct loop *)data;
833 if (flow_bb_inside_loop_p (loop, bb))
834 return true;
835 return false;
839 /* Function new_loop_vec_info.
841 Create and initialize a new loop_vec_info struct for LOOP, as well as
842 stmt_vec_info structs for all the stmts in LOOP. */
844 static loop_vec_info
845 new_loop_vec_info (struct loop *loop)
847 loop_vec_info res;
848 basic_block *bbs;
849 gimple_stmt_iterator si;
850 unsigned int i, nbbs;
852 res = (loop_vec_info) xcalloc (1, sizeof (struct _loop_vec_info));
853 LOOP_VINFO_LOOP (res) = loop;
855 bbs = get_loop_body (loop);
857 /* Create/Update stmt_info for all stmts in the loop. */
858 for (i = 0; i < loop->num_nodes; i++)
860 basic_block bb = bbs[i];
862 /* BBs in a nested inner-loop will already have been processed (because
863 we will have called vect_analyze_loop_form for any nested inner-loop).
864 Therefore, for stmts in an inner-loop we just want to update the
865 STMT_VINFO_LOOP_VINFO field of their stmt_info to point to the new
866 loop_info of the outer-loop we are currently considering to vectorize
867 (instead of the loop_info of the inner-loop).
868 For stmts in other BBs we need to create a stmt_info from scratch. */
869 if (bb->loop_father != loop)
871 /* Inner-loop bb. */
872 gcc_assert (loop->inner && bb->loop_father == loop->inner);
873 for (si = gsi_start_phis (bb); !gsi_end_p (si); gsi_next (&si))
875 gimple phi = gsi_stmt (si);
876 stmt_vec_info stmt_info = vinfo_for_stmt (phi);
877 loop_vec_info inner_loop_vinfo =
878 STMT_VINFO_LOOP_VINFO (stmt_info);
879 gcc_assert (loop->inner == LOOP_VINFO_LOOP (inner_loop_vinfo));
880 STMT_VINFO_LOOP_VINFO (stmt_info) = res;
882 for (si = gsi_start_bb (bb); !gsi_end_p (si); gsi_next (&si))
884 gimple stmt = gsi_stmt (si);
885 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
886 loop_vec_info inner_loop_vinfo =
887 STMT_VINFO_LOOP_VINFO (stmt_info);
888 gcc_assert (loop->inner == LOOP_VINFO_LOOP (inner_loop_vinfo));
889 STMT_VINFO_LOOP_VINFO (stmt_info) = res;
892 else
894 /* bb in current nest. */
895 for (si = gsi_start_phis (bb); !gsi_end_p (si); gsi_next (&si))
897 gimple phi = gsi_stmt (si);
898 gimple_set_uid (phi, 0);
899 set_vinfo_for_stmt (phi, new_stmt_vec_info (phi, res, NULL));
902 for (si = gsi_start_bb (bb); !gsi_end_p (si); gsi_next (&si))
904 gimple stmt = gsi_stmt (si);
905 gimple_set_uid (stmt, 0);
906 set_vinfo_for_stmt (stmt, new_stmt_vec_info (stmt, res, NULL));
911 /* CHECKME: We want to visit all BBs before their successors (except for
912 latch blocks, for which this assertion wouldn't hold). In the simple
913 case of the loop forms we allow, a dfs order of the BBs would be the same
914 as reversed postorder traversal, so we are safe. */
916 free (bbs);
917 bbs = XCNEWVEC (basic_block, loop->num_nodes);
918 nbbs = dfs_enumerate_from (loop->header, 0, bb_in_loop_p,
919 bbs, loop->num_nodes, loop);
920 gcc_assert (nbbs == loop->num_nodes);
922 LOOP_VINFO_BBS (res) = bbs;
923 LOOP_VINFO_NITERSM1 (res) = NULL;
924 LOOP_VINFO_NITERS (res) = NULL;
925 LOOP_VINFO_NITERS_UNCHANGED (res) = NULL;
926 LOOP_VINFO_COST_MODEL_MIN_ITERS (res) = 0;
927 LOOP_VINFO_VECTORIZABLE_P (res) = 0;
928 LOOP_VINFO_PEELING_FOR_ALIGNMENT (res) = 0;
929 LOOP_VINFO_VECT_FACTOR (res) = 0;
930 LOOP_VINFO_LOOP_NEST (res).create (3);
931 LOOP_VINFO_DATAREFS (res).create (10);
932 LOOP_VINFO_DDRS (res).create (10 * 10);
933 LOOP_VINFO_UNALIGNED_DR (res) = NULL;
934 LOOP_VINFO_MAY_MISALIGN_STMTS (res).create (
935 PARAM_VALUE (PARAM_VECT_MAX_VERSION_FOR_ALIGNMENT_CHECKS));
936 LOOP_VINFO_MAY_ALIAS_DDRS (res).create (
937 PARAM_VALUE (PARAM_VECT_MAX_VERSION_FOR_ALIAS_CHECKS));
938 LOOP_VINFO_GROUPED_STORES (res).create (10);
939 LOOP_VINFO_REDUCTIONS (res).create (10);
940 LOOP_VINFO_REDUCTION_CHAINS (res).create (10);
941 LOOP_VINFO_SLP_INSTANCES (res).create (10);
942 LOOP_VINFO_SLP_UNROLLING_FACTOR (res) = 1;
943 LOOP_VINFO_TARGET_COST_DATA (res) = init_cost (loop);
944 LOOP_VINFO_PEELING_FOR_GAPS (res) = false;
945 LOOP_VINFO_PEELING_FOR_NITER (res) = false;
946 LOOP_VINFO_OPERANDS_SWAPPED (res) = false;
948 return res;
952 /* Function destroy_loop_vec_info.
954 Free LOOP_VINFO struct, as well as all the stmt_vec_info structs of all the
955 stmts in the loop. */
957 void
958 destroy_loop_vec_info (loop_vec_info loop_vinfo, bool clean_stmts)
960 struct loop *loop;
961 basic_block *bbs;
962 int nbbs;
963 gimple_stmt_iterator si;
964 int j;
965 vec<slp_instance> slp_instances;
966 slp_instance instance;
967 bool swapped;
969 if (!loop_vinfo)
970 return;
972 loop = LOOP_VINFO_LOOP (loop_vinfo);
974 bbs = LOOP_VINFO_BBS (loop_vinfo);
975 nbbs = clean_stmts ? loop->num_nodes : 0;
976 swapped = LOOP_VINFO_OPERANDS_SWAPPED (loop_vinfo);
978 for (j = 0; j < nbbs; j++)
980 basic_block bb = bbs[j];
981 for (si = gsi_start_phis (bb); !gsi_end_p (si); gsi_next (&si))
982 free_stmt_vec_info (gsi_stmt (si));
984 for (si = gsi_start_bb (bb); !gsi_end_p (si); )
986 gimple stmt = gsi_stmt (si);
988 /* We may have broken canonical form by moving a constant
989 into RHS1 of a commutative op. Fix such occurrences. */
990 if (swapped && is_gimple_assign (stmt))
992 enum tree_code code = gimple_assign_rhs_code (stmt);
994 if ((code == PLUS_EXPR
995 || code == POINTER_PLUS_EXPR
996 || code == MULT_EXPR)
997 && CONSTANT_CLASS_P (gimple_assign_rhs1 (stmt)))
998 swap_ssa_operands (stmt,
999 gimple_assign_rhs1_ptr (stmt),
1000 gimple_assign_rhs2_ptr (stmt));
1003 /* Free stmt_vec_info. */
1004 free_stmt_vec_info (stmt);
1005 gsi_next (&si);
1009 free (LOOP_VINFO_BBS (loop_vinfo));
1010 vect_destroy_datarefs (loop_vinfo, NULL);
1011 free_dependence_relations (LOOP_VINFO_DDRS (loop_vinfo));
1012 LOOP_VINFO_LOOP_NEST (loop_vinfo).release ();
1013 LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).release ();
1014 LOOP_VINFO_MAY_ALIAS_DDRS (loop_vinfo).release ();
1015 slp_instances = LOOP_VINFO_SLP_INSTANCES (loop_vinfo);
1016 FOR_EACH_VEC_ELT (slp_instances, j, instance)
1017 vect_free_slp_instance (instance);
1019 LOOP_VINFO_SLP_INSTANCES (loop_vinfo).release ();
1020 LOOP_VINFO_GROUPED_STORES (loop_vinfo).release ();
1021 LOOP_VINFO_REDUCTIONS (loop_vinfo).release ();
1022 LOOP_VINFO_REDUCTION_CHAINS (loop_vinfo).release ();
1024 if (LOOP_VINFO_PEELING_HTAB (loop_vinfo).is_created ())
1025 LOOP_VINFO_PEELING_HTAB (loop_vinfo).dispose ();
1027 destroy_cost_data (LOOP_VINFO_TARGET_COST_DATA (loop_vinfo));
1029 free (loop_vinfo);
1030 loop->aux = NULL;
1034 /* Function vect_analyze_loop_1.
1036 Apply a set of analyses on LOOP, and create a loop_vec_info struct
1037 for it. The different analyses will record information in the
1038 loop_vec_info struct. This is a subset of the analyses applied in
1039 vect_analyze_loop, to be applied on an inner-loop nested in the loop
1040 that is now considered for (outer-loop) vectorization. */
1042 static loop_vec_info
1043 vect_analyze_loop_1 (struct loop *loop)
1045 loop_vec_info loop_vinfo;
1047 if (dump_enabled_p ())
1048 dump_printf_loc (MSG_NOTE, vect_location,
1049 "===== analyze_loop_nest_1 =====\n");
1051 /* Check the CFG characteristics of the loop (nesting, entry/exit, etc.). */
1053 loop_vinfo = vect_analyze_loop_form (loop);
1054 if (!loop_vinfo)
1056 if (dump_enabled_p ())
1057 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1058 "bad inner-loop form.\n");
1059 return NULL;
1062 return loop_vinfo;
1066 /* Function vect_analyze_loop_form.
1068 Verify that certain CFG restrictions hold, including:
1069 - the loop has a pre-header
1070 - the loop has a single entry and exit
1071 - the loop exit condition is simple enough, and the number of iterations
1072 can be analyzed (a countable loop). */
1074 loop_vec_info
1075 vect_analyze_loop_form (struct loop *loop)
1077 loop_vec_info loop_vinfo;
1078 gimple loop_cond;
1079 tree number_of_iterations = NULL, number_of_iterationsm1 = NULL;
1080 loop_vec_info inner_loop_vinfo = NULL;
1082 if (dump_enabled_p ())
1083 dump_printf_loc (MSG_NOTE, vect_location,
1084 "=== vect_analyze_loop_form ===\n");
1086 /* Different restrictions apply when we are considering an inner-most loop,
1087 vs. an outer (nested) loop.
1088 (FORNOW. May want to relax some of these restrictions in the future). */
1090 if (!loop->inner)
1092 /* Inner-most loop. We currently require that the number of BBs is
1093 exactly 2 (the header and latch). Vectorizable inner-most loops
1094 look like this:
1096 (pre-header)
1098 header <--------+
1099 | | |
1100 | +--> latch --+
1102 (exit-bb) */
1104 if (loop->num_nodes != 2)
1106 if (dump_enabled_p ())
1107 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1108 "not vectorized: control flow in loop.\n");
1109 return NULL;
1112 if (empty_block_p (loop->header))
1114 if (dump_enabled_p ())
1115 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1116 "not vectorized: empty loop.\n");
1117 return NULL;
1120 else
1122 struct loop *innerloop = loop->inner;
1123 edge entryedge;
1125 /* Nested loop. We currently require that the loop is doubly-nested,
1126 contains a single inner loop, and the number of BBs is exactly 5.
1127 Vectorizable outer-loops look like this:
1129 (pre-header)
1131 header <---+
1133 inner-loop |
1135 tail ------+
1137 (exit-bb)
1139 The inner-loop has the properties expected of inner-most loops
1140 as described above. */
1142 if ((loop->inner)->inner || (loop->inner)->next)
1144 if (dump_enabled_p ())
1145 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1146 "not vectorized: multiple nested loops.\n");
1147 return NULL;
1150 /* Analyze the inner-loop. */
1151 inner_loop_vinfo = vect_analyze_loop_1 (loop->inner);
1152 if (!inner_loop_vinfo)
1154 if (dump_enabled_p ())
1155 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1156 "not vectorized: Bad inner loop.\n");
1157 return NULL;
1160 if (!expr_invariant_in_loop_p (loop,
1161 LOOP_VINFO_NITERS (inner_loop_vinfo)))
1163 if (dump_enabled_p ())
1164 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1165 "not vectorized: inner-loop count not"
1166 " invariant.\n");
1167 destroy_loop_vec_info (inner_loop_vinfo, true);
1168 return NULL;
1171 if (loop->num_nodes != 5)
1173 if (dump_enabled_p ())
1174 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1175 "not vectorized: control flow in loop.\n");
1176 destroy_loop_vec_info (inner_loop_vinfo, true);
1177 return NULL;
1180 gcc_assert (EDGE_COUNT (innerloop->header->preds) == 2);
1181 entryedge = EDGE_PRED (innerloop->header, 0);
1182 if (EDGE_PRED (innerloop->header, 0)->src == innerloop->latch)
1183 entryedge = EDGE_PRED (innerloop->header, 1);
1185 if (entryedge->src != loop->header
1186 || !single_exit (innerloop)
1187 || single_exit (innerloop)->dest != EDGE_PRED (loop->latch, 0)->src)
1189 if (dump_enabled_p ())
1190 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1191 "not vectorized: unsupported outerloop form.\n");
1192 destroy_loop_vec_info (inner_loop_vinfo, true);
1193 return NULL;
1196 if (dump_enabled_p ())
1197 dump_printf_loc (MSG_NOTE, vect_location,
1198 "Considering outer-loop vectorization.\n");
1201 if (!single_exit (loop)
1202 || EDGE_COUNT (loop->header->preds) != 2)
1204 if (dump_enabled_p ())
1206 if (!single_exit (loop))
1207 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1208 "not vectorized: multiple exits.\n");
1209 else if (EDGE_COUNT (loop->header->preds) != 2)
1210 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1211 "not vectorized: too many incoming edges.\n");
1213 if (inner_loop_vinfo)
1214 destroy_loop_vec_info (inner_loop_vinfo, true);
1215 return NULL;
1218 /* We assume that the loop exit condition is at the end of the loop, i.e.,
1219 that the loop is represented as a do-while (with a proper if-guard
1220 before the loop if needed), where the loop header contains all the
1221 executable statements, and the latch is empty. */
1222 if (!empty_block_p (loop->latch)
1223 || !gimple_seq_empty_p (phi_nodes (loop->latch)))
1225 if (dump_enabled_p ())
1226 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1227 "not vectorized: latch block not empty.\n");
1228 if (inner_loop_vinfo)
1229 destroy_loop_vec_info (inner_loop_vinfo, true);
1230 return NULL;
1233 /* Make sure there exists a single-predecessor exit bb: */
1234 if (!single_pred_p (single_exit (loop)->dest))
1236 edge e = single_exit (loop);
1237 if (!(e->flags & EDGE_ABNORMAL))
1239 split_loop_exit_edge (e);
1240 if (dump_enabled_p ())
1241 dump_printf (MSG_NOTE, "split exit edge.\n");
1243 else
1245 if (dump_enabled_p ())
1246 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1247 "not vectorized: abnormal loop exit edge.\n");
1248 if (inner_loop_vinfo)
1249 destroy_loop_vec_info (inner_loop_vinfo, true);
1250 return NULL;
1254 loop_cond = vect_get_loop_niters (loop, &number_of_iterations,
1255 &number_of_iterationsm1);
1256 if (!loop_cond)
1258 if (dump_enabled_p ())
1259 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1260 "not vectorized: complicated exit condition.\n");
1261 if (inner_loop_vinfo)
1262 destroy_loop_vec_info (inner_loop_vinfo, true);
1263 return NULL;
1266 if (!number_of_iterations
1267 || chrec_contains_undetermined (number_of_iterations))
1269 if (dump_enabled_p ())
1270 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1271 "not vectorized: number of iterations cannot be "
1272 "computed.\n");
1273 if (inner_loop_vinfo)
1274 destroy_loop_vec_info (inner_loop_vinfo, true);
1275 return NULL;
1278 if (integer_zerop (number_of_iterations))
1280 if (dump_enabled_p ())
1281 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1282 "not vectorized: number of iterations = 0.\n");
1283 if (inner_loop_vinfo)
1284 destroy_loop_vec_info (inner_loop_vinfo, true);
1285 return NULL;
1288 loop_vinfo = new_loop_vec_info (loop);
1289 LOOP_VINFO_NITERSM1 (loop_vinfo) = number_of_iterationsm1;
1290 LOOP_VINFO_NITERS (loop_vinfo) = number_of_iterations;
1291 LOOP_VINFO_NITERS_UNCHANGED (loop_vinfo) = number_of_iterations;
1293 if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
1295 if (dump_enabled_p ())
1297 dump_printf_loc (MSG_NOTE, vect_location,
1298 "Symbolic number of iterations is ");
1299 dump_generic_expr (MSG_NOTE, TDF_DETAILS, number_of_iterations);
1300 dump_printf (MSG_NOTE, "\n");
1304 STMT_VINFO_TYPE (vinfo_for_stmt (loop_cond)) = loop_exit_ctrl_vec_info_type;
1306 /* CHECKME: May want to keep it around in the future. */
1307 if (inner_loop_vinfo)
1308 destroy_loop_vec_info (inner_loop_vinfo, false);
1310 gcc_assert (!loop->aux);
1311 loop->aux = loop_vinfo;
1312 return loop_vinfo;
1316 /* Function vect_analyze_loop_operations.
1318 Scan the loop stmts and make sure they are all vectorizable. */
1320 static bool
1321 vect_analyze_loop_operations (loop_vec_info loop_vinfo, bool slp)
1323 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1324 basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
1325 int nbbs = loop->num_nodes;
1326 gimple_stmt_iterator si;
1327 unsigned int vectorization_factor = 0;
1328 int i;
1329 gimple phi;
1330 stmt_vec_info stmt_info;
1331 bool need_to_vectorize = false;
1332 int min_profitable_iters;
1333 int min_scalar_loop_bound;
1334 unsigned int th;
1335 bool only_slp_in_loop = true, ok;
1336 HOST_WIDE_INT max_niter;
1337 HOST_WIDE_INT estimated_niter;
1338 int min_profitable_estimate;
1340 if (dump_enabled_p ())
1341 dump_printf_loc (MSG_NOTE, vect_location,
1342 "=== vect_analyze_loop_operations ===\n");
1344 gcc_assert (LOOP_VINFO_VECT_FACTOR (loop_vinfo));
1345 vectorization_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
1346 if (slp)
1348 /* If all the stmts in the loop can be SLPed, we perform only SLP, and
1349 vectorization factor of the loop is the unrolling factor required by
1350 the SLP instances. If that unrolling factor is 1, we say that we
1351 perform pure SLP on the loop - cross-iteration parallelism is not
1352 exploited. */
1353 for (i = 0; i < nbbs; i++)
1355 basic_block bb = bbs[i];
1356 for (si = gsi_start_bb (bb); !gsi_end_p (si); gsi_next (&si))
1358 gimple stmt = gsi_stmt (si);
1359 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
1360 gcc_assert (stmt_info);
1361 if ((STMT_VINFO_RELEVANT_P (stmt_info)
1362 || VECTORIZABLE_CYCLE_DEF (STMT_VINFO_DEF_TYPE (stmt_info)))
1363 && !PURE_SLP_STMT (stmt_info))
1364 /* STMT needs both SLP and loop-based vectorization. */
1365 only_slp_in_loop = false;
1369 if (only_slp_in_loop)
1370 vectorization_factor = LOOP_VINFO_SLP_UNROLLING_FACTOR (loop_vinfo);
1371 else
1372 vectorization_factor = least_common_multiple (vectorization_factor,
1373 LOOP_VINFO_SLP_UNROLLING_FACTOR (loop_vinfo));
1375 LOOP_VINFO_VECT_FACTOR (loop_vinfo) = vectorization_factor;
1376 if (dump_enabled_p ())
1377 dump_printf_loc (MSG_NOTE, vect_location,
1378 "Updating vectorization factor to %d\n",
1379 vectorization_factor);
1382 for (i = 0; i < nbbs; i++)
1384 basic_block bb = bbs[i];
1386 for (si = gsi_start_phis (bb); !gsi_end_p (si); gsi_next (&si))
1388 phi = gsi_stmt (si);
1389 ok = true;
1391 stmt_info = vinfo_for_stmt (phi);
1392 if (dump_enabled_p ())
1394 dump_printf_loc (MSG_NOTE, vect_location, "examining phi: ");
1395 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, phi, 0);
1396 dump_printf (MSG_NOTE, "\n");
1399 /* Inner-loop loop-closed exit phi in outer-loop vectorization
1400 (i.e., a phi in the tail of the outer-loop). */
1401 if (! is_loop_header_bb_p (bb))
1403 /* FORNOW: we currently don't support the case that these phis
1404 are not used in the outerloop (unless it is double reduction,
1405 i.e., this phi is vect_reduction_def), because this case
1406 requires us to actually do something here. */
1407 if ((!STMT_VINFO_RELEVANT_P (stmt_info)
1408 || STMT_VINFO_LIVE_P (stmt_info))
1409 && STMT_VINFO_DEF_TYPE (stmt_info)
1410 != vect_double_reduction_def)
1412 if (dump_enabled_p ())
1413 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1414 "Unsupported loop-closed phi in "
1415 "outer-loop.\n");
1416 return false;
1419 /* If PHI is used in the outer loop, we check that its operand
1420 is defined in the inner loop. */
1421 if (STMT_VINFO_RELEVANT_P (stmt_info))
1423 tree phi_op;
1424 gimple op_def_stmt;
1426 if (gimple_phi_num_args (phi) != 1)
1427 return false;
1429 phi_op = PHI_ARG_DEF (phi, 0);
1430 if (TREE_CODE (phi_op) != SSA_NAME)
1431 return false;
1433 op_def_stmt = SSA_NAME_DEF_STMT (phi_op);
1434 if (gimple_nop_p (op_def_stmt)
1435 || !flow_bb_inside_loop_p (loop, gimple_bb (op_def_stmt))
1436 || !vinfo_for_stmt (op_def_stmt))
1437 return false;
1439 if (STMT_VINFO_RELEVANT (vinfo_for_stmt (op_def_stmt))
1440 != vect_used_in_outer
1441 && STMT_VINFO_RELEVANT (vinfo_for_stmt (op_def_stmt))
1442 != vect_used_in_outer_by_reduction)
1443 return false;
1446 continue;
1449 gcc_assert (stmt_info);
1451 if (STMT_VINFO_LIVE_P (stmt_info))
1453 /* FORNOW: not yet supported. */
1454 if (dump_enabled_p ())
1455 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1456 "not vectorized: value used after loop.\n");
1457 return false;
1460 if (STMT_VINFO_RELEVANT (stmt_info) == vect_used_in_scope
1461 && STMT_VINFO_DEF_TYPE (stmt_info) != vect_induction_def)
1463 /* A scalar-dependence cycle that we don't support. */
1464 if (dump_enabled_p ())
1465 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1466 "not vectorized: scalar dependence cycle.\n");
1467 return false;
1470 if (STMT_VINFO_RELEVANT_P (stmt_info))
1472 need_to_vectorize = true;
1473 if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_induction_def)
1474 ok = vectorizable_induction (phi, NULL, NULL);
1477 if (!ok)
1479 if (dump_enabled_p ())
1481 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1482 "not vectorized: relevant phi not "
1483 "supported: ");
1484 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, phi, 0);
1485 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
1487 return false;
1491 for (si = gsi_start_bb (bb); !gsi_end_p (si); gsi_next (&si))
1493 gimple stmt = gsi_stmt (si);
1494 if (!gimple_clobber_p (stmt)
1495 && !vect_analyze_stmt (stmt, &need_to_vectorize, NULL))
1496 return false;
1498 } /* bbs */
1500 /* All operations in the loop are either irrelevant (deal with loop
1501 control, or dead), or only used outside the loop and can be moved
1502 out of the loop (e.g. invariants, inductions). The loop can be
1503 optimized away by scalar optimizations. We're better off not
1504 touching this loop. */
1505 if (!need_to_vectorize)
1507 if (dump_enabled_p ())
1508 dump_printf_loc (MSG_NOTE, vect_location,
1509 "All the computation can be taken out of the loop.\n");
1510 if (dump_enabled_p ())
1511 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1512 "not vectorized: redundant loop. no profit to "
1513 "vectorize.\n");
1514 return false;
1517 if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo) && dump_enabled_p ())
1518 dump_printf_loc (MSG_NOTE, vect_location,
1519 "vectorization_factor = %d, niters = "
1520 HOST_WIDE_INT_PRINT_DEC "\n", vectorization_factor,
1521 LOOP_VINFO_INT_NITERS (loop_vinfo));
1523 if ((LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
1524 && (LOOP_VINFO_INT_NITERS (loop_vinfo) < vectorization_factor))
1525 || ((max_niter = max_stmt_executions_int (loop)) != -1
1526 && (unsigned HOST_WIDE_INT) max_niter < vectorization_factor))
1528 if (dump_enabled_p ())
1529 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1530 "not vectorized: iteration count too small.\n");
1531 if (dump_enabled_p ())
1532 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1533 "not vectorized: iteration count smaller than "
1534 "vectorization factor.\n");
1535 return false;
1538 /* Analyze cost. Decide if it is worthwhile to vectorize. */
1540 /* Once VF is set, SLP costs should be updated since the number of created
1541 vector stmts depends on VF. */
1542 vect_update_slp_costs_according_to_vf (loop_vinfo);
1544 vect_estimate_min_profitable_iters (loop_vinfo, &min_profitable_iters,
1545 &min_profitable_estimate);
1546 LOOP_VINFO_COST_MODEL_MIN_ITERS (loop_vinfo) = min_profitable_iters;
1548 if (min_profitable_iters < 0)
1550 if (dump_enabled_p ())
1551 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1552 "not vectorized: vectorization not profitable.\n");
1553 if (dump_enabled_p ())
1554 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1555 "not vectorized: vector version will never be "
1556 "profitable.\n");
1557 return false;
1560 min_scalar_loop_bound = ((PARAM_VALUE (PARAM_MIN_VECT_LOOP_BOUND)
1561 * vectorization_factor) - 1);
1564 /* Use the cost model only if it is more conservative than the user-specified
1565 threshold. */
1567 th = (unsigned) min_scalar_loop_bound;
1568 if (min_profitable_iters
1569 && (!min_scalar_loop_bound
1570 || min_profitable_iters > min_scalar_loop_bound))
1571 th = (unsigned) min_profitable_iters;
1573 if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
1574 && LOOP_VINFO_INT_NITERS (loop_vinfo) <= th)
1576 if (dump_enabled_p ())
1577 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1578 "not vectorized: vectorization not profitable.\n");
1579 if (dump_enabled_p ())
1580 dump_printf_loc (MSG_NOTE, vect_location,
1581 "not vectorized: iteration count smaller than user "
1582 "specified loop bound parameter or minimum profitable "
1583 "iterations (whichever is more conservative).\n");
1584 return false;
1587 if ((estimated_niter = estimated_stmt_executions_int (loop)) != -1
1588 && ((unsigned HOST_WIDE_INT) estimated_niter
1589 <= MAX (th, (unsigned)min_profitable_estimate)))
1591 if (dump_enabled_p ())
1592 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1593 "not vectorized: estimated iteration count too "
1594 "small.\n");
1595 if (dump_enabled_p ())
1596 dump_printf_loc (MSG_NOTE, vect_location,
1597 "not vectorized: estimated iteration count smaller "
1598 "than specified loop bound parameter or minimum "
1599 "profitable iterations (whichever is more "
1600 "conservative).\n");
1601 return false;
1604 return true;
1608 /* Function vect_analyze_loop_2.
1610 Apply a set of analyses on LOOP, and create a loop_vec_info struct
1611 for it. The different analyses will record information in the
1612 loop_vec_info struct. */
1613 static bool
1614 vect_analyze_loop_2 (loop_vec_info loop_vinfo)
1616 bool ok, slp = false;
1617 int max_vf = MAX_VECTORIZATION_FACTOR;
1618 int min_vf = 2;
1620 /* Find all data references in the loop (which correspond to vdefs/vuses)
1621 and analyze their evolution in the loop. Also adjust the minimal
1622 vectorization factor according to the loads and stores.
1624 FORNOW: Handle only simple array references whose
1625 alignment can be forced, and aligned pointer references. */
1627 ok = vect_analyze_data_refs (loop_vinfo, NULL, &min_vf);
1628 if (!ok)
1630 if (dump_enabled_p ())
1631 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1632 "bad data references.\n");
1633 return false;
1636 /* Analyze the access patterns of the data-refs in the loop (consecutive,
1637 complex, etc.). FORNOW: Only handle consecutive access pattern. */
1639 ok = vect_analyze_data_ref_accesses (loop_vinfo, NULL);
1640 if (!ok)
1642 if (dump_enabled_p ())
1643 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1644 "bad data access.\n");
1645 return false;
1648 /* Classify all cross-iteration scalar data-flow cycles.
1649 Cross-iteration cycles caused by virtual phis are analyzed separately. */
1651 vect_analyze_scalar_cycles (loop_vinfo);
1653 vect_pattern_recog (loop_vinfo, NULL);
1655 /* Data-flow analysis to detect stmts that do not need to be vectorized. */
1657 ok = vect_mark_stmts_to_be_vectorized (loop_vinfo);
1658 if (!ok)
1660 if (dump_enabled_p ())
1661 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1662 "unexpected pattern.\n");
1663 return false;
1666 /* Analyze data dependences between the data-refs in the loop
1667 and adjust the maximum vectorization factor according to
1668 the dependences.
1669 FORNOW: fail at the first data dependence that we encounter. */
1671 ok = vect_analyze_data_ref_dependences (loop_vinfo, &max_vf);
1672 if (!ok
1673 || max_vf < min_vf)
1675 if (dump_enabled_p ())
1676 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1677 "bad data dependence.\n");
1678 return false;
1681 ok = vect_determine_vectorization_factor (loop_vinfo);
1682 if (!ok)
1684 if (dump_enabled_p ())
1685 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1686 "can't determine vectorization factor.\n");
1687 return false;
1689 if (max_vf < LOOP_VINFO_VECT_FACTOR (loop_vinfo))
1691 if (dump_enabled_p ())
1692 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1693 "bad data dependence.\n");
1694 return false;
1697 /* Analyze the alignment of the data-refs in the loop.
1698 Fail if a data reference is found that cannot be vectorized. */
1700 ok = vect_analyze_data_refs_alignment (loop_vinfo, NULL);
1701 if (!ok)
1703 if (dump_enabled_p ())
1704 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1705 "bad data alignment.\n");
1706 return false;
1709 /* Prune the list of ddrs to be tested at run-time by versioning for alias.
1710 It is important to call pruning after vect_analyze_data_ref_accesses,
1711 since we use grouping information gathered by interleaving analysis. */
1712 ok = vect_prune_runtime_alias_test_list (loop_vinfo);
1713 if (!ok)
1715 if (dump_enabled_p ())
1716 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1717 "too long list of versioning for alias "
1718 "run-time tests.\n");
1719 return false;
1722 /* This pass will decide on using loop versioning and/or loop peeling in
1723 order to enhance the alignment of data references in the loop. */
1725 ok = vect_enhance_data_refs_alignment (loop_vinfo);
1726 if (!ok)
1728 if (dump_enabled_p ())
1729 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1730 "bad data alignment.\n");
1731 return false;
1734 /* Check the SLP opportunities in the loop, analyze and build SLP trees. */
1735 ok = vect_analyze_slp (loop_vinfo, NULL);
1736 if (ok)
1738 /* Decide which possible SLP instances to SLP. */
1739 slp = vect_make_slp_decision (loop_vinfo);
1741 /* Find stmts that need to be both vectorized and SLPed. */
1742 vect_detect_hybrid_slp (loop_vinfo);
1744 else
1745 return false;
1747 /* Scan all the operations in the loop and make sure they are
1748 vectorizable. */
1750 ok = vect_analyze_loop_operations (loop_vinfo, slp);
1751 if (!ok)
1753 if (dump_enabled_p ())
1754 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1755 "bad operation or unsupported loop bound.\n");
1756 return false;
1759 /* Decide whether we need to create an epilogue loop to handle
1760 remaining scalar iterations. */
1761 if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
1762 && LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) > 0)
1764 if (ctz_hwi (LOOP_VINFO_INT_NITERS (loop_vinfo)
1765 - LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo))
1766 < exact_log2 (LOOP_VINFO_VECT_FACTOR (loop_vinfo)))
1767 LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo) = true;
1769 else if (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo)
1770 || (tree_ctz (LOOP_VINFO_NITERS (loop_vinfo))
1771 < (unsigned)exact_log2 (LOOP_VINFO_VECT_FACTOR (loop_vinfo))))
1772 LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo) = true;
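  /* Editor's note (illustrative): the ctz/exact_log2 comparisons above check
     whether the remaining iteration count is known to be a multiple of the
     vectorization factor.  E.g. with VF = 4 (exact_log2 == 2), a count with
     at least two trailing zero bits leaves no scalar remainder and needs no
     epilogue; otherwise PEELING_FOR_NITER is set.  */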
1774 /* If an epilogue loop is required, make sure we can create one. */
1775 if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo)
1776 || LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo))
1778 if (dump_enabled_p ())
1779 dump_printf_loc (MSG_NOTE, vect_location, "epilog loop required\n");
1780 if (!vect_can_advance_ivs_p (loop_vinfo)
1781 || !slpeel_can_duplicate_loop_p (LOOP_VINFO_LOOP (loop_vinfo),
1782 single_exit (LOOP_VINFO_LOOP
1783 (loop_vinfo))))
1785 if (dump_enabled_p ())
1786 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1787 "not vectorized: can't create required "
1788 "epilog loop\n");
1789 return false;
1793 return true;
1796 /* Function vect_analyze_loop.
1798 Apply a set of analyses on LOOP, and create a loop_vec_info struct
1799 for it. The different analyses will record information in the
1800 loop_vec_info struct. */
1801 loop_vec_info
1802 vect_analyze_loop (struct loop *loop)
1804 loop_vec_info loop_vinfo;
1805 unsigned int vector_sizes;
1807 /* Autodetect first vector size we try. */
1808 current_vector_size = 0;
1809 vector_sizes = targetm.vectorize.autovectorize_vector_sizes ();
1811 if (dump_enabled_p ())
1812 dump_printf_loc (MSG_NOTE, vect_location,
1813 "===== analyze_loop_nest =====\n");
1815 if (loop_outer (loop)
1816 && loop_vec_info_for_loop (loop_outer (loop))
1817 && LOOP_VINFO_VECTORIZABLE_P (loop_vec_info_for_loop (loop_outer (loop))))
1819 if (dump_enabled_p ())
1820 dump_printf_loc (MSG_NOTE, vect_location,
1821 "outer-loop already vectorized.\n");
1822 return NULL;
1825 while (1)
1827 /* Check the CFG characteristics of the loop (nesting, entry/exit). */
1828 loop_vinfo = vect_analyze_loop_form (loop);
1829 if (!loop_vinfo)
1831 if (dump_enabled_p ())
1832 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1833 "bad loop form.\n");
1834 return NULL;
1837 if (vect_analyze_loop_2 (loop_vinfo))
1839 LOOP_VINFO_VECTORIZABLE_P (loop_vinfo) = 1;
1841 return loop_vinfo;
1844 destroy_loop_vec_info (loop_vinfo, true);
1846 vector_sizes &= ~current_vector_size;
1847 if (vector_sizes == 0
1848 || current_vector_size == 0)
1849 return NULL;
1851 /* Try the next biggest vector size. */
1852 current_vector_size = 1 << floor_log2 (vector_sizes);
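      /* Editor's note (illustrative): autovectorize_vector_sizes returns a
         bitmask of the vector sizes (in bytes) the target supports.  If it
         returned 32 | 16 and the 32-byte attempt above just failed, the mask
         is now 16, and 1 << floor_log2 (16) selects 16 bytes for the retry.  */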
1853 if (dump_enabled_p ())
1854 dump_printf_loc (MSG_NOTE, vect_location,
1855 "***** Re-trying analysis with "
1856 "vector size %d\n", current_vector_size);
1861 /* Function reduction_code_for_scalar_code
1863 Input:
1864 CODE - tree_code of a reduction operation.
1866 Output:
1867 REDUC_CODE - the corresponding tree-code to be used to reduce the
1868 vector of partial results into a single scalar result (which
1869 will also reside in a vector) or ERROR_MARK if the operation is
1870 a supported reduction operation, but does not have such a tree-code.
1872 Return FALSE if CODE currently cannot be vectorized as a reduction. */
1874 static bool
1875 reduction_code_for_scalar_code (enum tree_code code,
1876 enum tree_code *reduc_code)
1878 switch (code)
1880 case MAX_EXPR:
1881 *reduc_code = REDUC_MAX_EXPR;
1882 return true;
1884 case MIN_EXPR:
1885 *reduc_code = REDUC_MIN_EXPR;
1886 return true;
1888 case PLUS_EXPR:
1889 *reduc_code = REDUC_PLUS_EXPR;
1890 return true;
1892 case MULT_EXPR:
1893 case MINUS_EXPR:
1894 case BIT_IOR_EXPR:
1895 case BIT_XOR_EXPR:
1896 case BIT_AND_EXPR:
1897 *reduc_code = ERROR_MARK;
1898 return true;
1900 default:
1901 return false;
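/* As an illustration: a scalar loop computing m = MAX (m, a[i]) reduces with
   MAX_EXPR, so the vector of partial maxima is collapsed with REDUC_MAX_EXPR.
   A bitwise-and reduction is also accepted, but gets ERROR_MARK as REDUC_CODE
   and its epilogue is generated with whole-vector shifts or element extracts
   instead (see vect_model_reduction_cost below).  */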
1906 /* Error reporting helper for vect_is_simple_reduction below. GIMPLE statement
1907 STMT is printed with a message MSG. */
1909 static void
1910 report_vect_op (int msg_type, gimple stmt, const char *msg)
1912 dump_printf_loc (msg_type, vect_location, "%s", msg);
1913 dump_gimple_stmt (msg_type, TDF_SLIM, stmt, 0);
1914 dump_printf (msg_type, "\n");
1918 /* Detect SLP reduction of the form:
1920 #a1 = phi <a5, a0>
1921 a2 = operation (a1)
1922 a3 = operation (a2)
1923 a4 = operation (a3)
1924 a5 = operation (a4)
1926 #a = phi <a5>
1928 PHI is the reduction phi node (#a1 = phi <a5, a0> above)
1929 FIRST_STMT is the first reduction stmt in the chain
1930 (a2 = operation (a1)).
1932 Return TRUE if a reduction chain was detected. */
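/* A hypothetical source loop that produces such a chain, with four copies of
   the reduction operation per iteration, is:

     for (i = 0; i < n; i++)
       sum = sum + a[4*i] + a[4*i+1] + a[4*i+2] + a[4*i+3];

   which gimplifies into a2 = a1 + ..., a3 = a2 + ..., a4 = a3 + ...,
   a5 = a4 + ... feeding back into the reduction PHI.  */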
1934 static bool
1935 vect_is_slp_reduction (loop_vec_info loop_info, gimple phi, gimple first_stmt)
1937 struct loop *loop = (gimple_bb (phi))->loop_father;
1938 struct loop *vect_loop = LOOP_VINFO_LOOP (loop_info);
1939 enum tree_code code;
1940 gimple current_stmt = NULL, loop_use_stmt = NULL, first, next_stmt;
1941 stmt_vec_info use_stmt_info, current_stmt_info;
1942 tree lhs;
1943 imm_use_iterator imm_iter;
1944 use_operand_p use_p;
1945 int nloop_uses, size = 0, n_out_of_loop_uses;
1946 bool found = false;
1948 if (loop != vect_loop)
1949 return false;
1951 lhs = PHI_RESULT (phi);
1952 code = gimple_assign_rhs_code (first_stmt);
1953 while (1)
1955 nloop_uses = 0;
1956 n_out_of_loop_uses = 0;
1957 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, lhs)
1959 gimple use_stmt = USE_STMT (use_p);
1960 if (is_gimple_debug (use_stmt))
1961 continue;
1963 use_stmt = USE_STMT (use_p);
1965 /* Check if we got back to the reduction phi. */
1966 if (use_stmt == phi)
1968 loop_use_stmt = use_stmt;
1969 found = true;
1970 break;
1973 if (flow_bb_inside_loop_p (loop, gimple_bb (use_stmt)))
1975 if (vinfo_for_stmt (use_stmt)
1976 && !STMT_VINFO_IN_PATTERN_P (vinfo_for_stmt (use_stmt)))
1978 loop_use_stmt = use_stmt;
1979 nloop_uses++;
1982 else
1983 n_out_of_loop_uses++;
1985 /* There can be either a single use in the loop or two uses in
1986 phi nodes. */
1987 if (nloop_uses > 1 || (n_out_of_loop_uses && nloop_uses))
1988 return false;
1991 if (found)
1992 break;
1994 /* We reached a statement with no loop uses. */
1995 if (nloop_uses == 0)
1996 return false;
1998 /* This is a loop exit phi, and we haven't reached the reduction phi. */
1999 if (gimple_code (loop_use_stmt) == GIMPLE_PHI)
2000 return false;
2002 if (!is_gimple_assign (loop_use_stmt)
2003 || code != gimple_assign_rhs_code (loop_use_stmt)
2004 || !flow_bb_inside_loop_p (loop, gimple_bb (loop_use_stmt)))
2005 return false;
2007 /* Insert USE_STMT into reduction chain. */
2008 use_stmt_info = vinfo_for_stmt (loop_use_stmt);
2009 if (current_stmt)
2011 current_stmt_info = vinfo_for_stmt (current_stmt);
2012 GROUP_NEXT_ELEMENT (current_stmt_info) = loop_use_stmt;
2013 GROUP_FIRST_ELEMENT (use_stmt_info)
2014 = GROUP_FIRST_ELEMENT (current_stmt_info);
2016 else
2017 GROUP_FIRST_ELEMENT (use_stmt_info) = loop_use_stmt;
2019 lhs = gimple_assign_lhs (loop_use_stmt);
2020 current_stmt = loop_use_stmt;
2021 size++;
2024 if (!found || loop_use_stmt != phi || size < 2)
2025 return false;
2027 /* Swap the operands, if needed, to make the reduction operand be the second
2028 operand. */
2029 lhs = PHI_RESULT (phi);
2030 next_stmt = GROUP_FIRST_ELEMENT (vinfo_for_stmt (current_stmt));
2031 while (next_stmt)
2033 if (gimple_assign_rhs2 (next_stmt) == lhs)
2035 tree op = gimple_assign_rhs1 (next_stmt);
2036 gimple def_stmt = NULL;
2038 if (TREE_CODE (op) == SSA_NAME)
2039 def_stmt = SSA_NAME_DEF_STMT (op);
2041 /* Check that the other def is either defined in the loop
2042 ("vect_internal_def"), or it's an induction (defined by a
2043 loop-header phi-node). */
2044 if (def_stmt
2045 && gimple_bb (def_stmt)
2046 && flow_bb_inside_loop_p (loop, gimple_bb (def_stmt))
2047 && (is_gimple_assign (def_stmt)
2048 || is_gimple_call (def_stmt)
2049 || STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def_stmt))
2050 == vect_induction_def
2051 || (gimple_code (def_stmt) == GIMPLE_PHI
2052 && STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def_stmt))
2053 == vect_internal_def
2054 && !is_loop_header_bb_p (gimple_bb (def_stmt)))))
2056 lhs = gimple_assign_lhs (next_stmt);
2057 next_stmt = GROUP_NEXT_ELEMENT (vinfo_for_stmt (next_stmt));
2058 continue;
2061 return false;
2063 else
2065 tree op = gimple_assign_rhs2 (next_stmt);
2066 gimple def_stmt = NULL;
2068 if (TREE_CODE (op) == SSA_NAME)
2069 def_stmt = SSA_NAME_DEF_STMT (op);
2071 /* Check that the other def is either defined in the loop
2072 ("vect_internal_def"), or it's an induction (defined by a
2073 loop-header phi-node). */
2074 if (def_stmt
2075 && gimple_bb (def_stmt)
2076 && flow_bb_inside_loop_p (loop, gimple_bb (def_stmt))
2077 && (is_gimple_assign (def_stmt)
2078 || is_gimple_call (def_stmt)
2079 || STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def_stmt))
2080 == vect_induction_def
2081 || (gimple_code (def_stmt) == GIMPLE_PHI
2082 && STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def_stmt))
2083 == vect_internal_def
2084 && !is_loop_header_bb_p (gimple_bb (def_stmt)))))
2086 if (dump_enabled_p ())
2088 dump_printf_loc (MSG_NOTE, vect_location, "swapping oprnds: ");
2089 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, next_stmt, 0);
2090 dump_printf (MSG_NOTE, "\n");
2093 swap_ssa_operands (next_stmt,
2094 gimple_assign_rhs1_ptr (next_stmt),
2095 gimple_assign_rhs2_ptr (next_stmt));
2096 update_stmt (next_stmt);
2098 if (CONSTANT_CLASS_P (gimple_assign_rhs1 (next_stmt)))
2099 LOOP_VINFO_OPERANDS_SWAPPED (loop_info) = true;
2101 else
2102 return false;
2105 lhs = gimple_assign_lhs (next_stmt);
2106 next_stmt = GROUP_NEXT_ELEMENT (vinfo_for_stmt (next_stmt));
2109 /* Save the chain for further analysis in SLP detection. */
2110 first = GROUP_FIRST_ELEMENT (vinfo_for_stmt (current_stmt));
2111 LOOP_VINFO_REDUCTION_CHAINS (loop_info).safe_push (first);
2112 GROUP_SIZE (vinfo_for_stmt (first)) = size;
2114 return true;
2118 /* Function vect_is_simple_reduction_1
2120 (1) Detect a cross-iteration def-use cycle that represents a simple
2121 reduction computation. We look for the following pattern:
2123 loop_header:
2124 a1 = phi < a0, a2 >
2125 a3 = ...
2126 a2 = operation (a3, a1)
2130 a3 = ...
2131 loop_header:
2132 a1 = phi < a0, a2 >
2133 a2 = operation (a3, a1)
2135 such that:
2136 1. operation is commutative and associative and it is safe to
2137 change the order of the computation (if CHECK_REDUCTION is true)
2138 2. no uses for a2 in the loop (a2 is used out of the loop)
2139 3. no uses of a1 in the loop besides the reduction operation
2140 4. no uses of a1 outside the loop.
2142 Conditions 1,4 are tested here.
2143 Conditions 2,3 are tested in vect_mark_stmts_to_be_vectorized.
2145 (2) Detect a cross-iteration def-use cycle in nested loops, i.e.,
2146 nested cycles, if CHECK_REDUCTION is false.
2148 (3) Detect cycles of phi nodes in outer-loop vectorization, i.e., double
2149 reductions:
2151 a1 = phi < a0, a2 >
2152 inner loop (def of a3)
2153 a2 = phi < a3 >
2155 If MODIFY is true it tries also to rework the code in-place to enable
2156 detection of more reduction patterns. For the time being we rewrite
2157 "res -= RHS" into "res += -RHS" when it seems worthwhile.  */
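/* For example, with MODIFY set a loop accumulating

     for (i = 0; i < n; i++)
       res = res - x[i];

   is rewritten in place into res = res + (-x[i]): a NEGATE_EXPR statement is
   inserted for the right-hand side and the MINUS_EXPR becomes a PLUS_EXPR,
   which is commutative and associative (see the MINUS_EXPR handling below).  */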
2160 static gimple
2161 vect_is_simple_reduction_1 (loop_vec_info loop_info, gimple phi,
2162 bool check_reduction, bool *double_reduc,
2163 bool modify)
2165 struct loop *loop = (gimple_bb (phi))->loop_father;
2166 struct loop *vect_loop = LOOP_VINFO_LOOP (loop_info);
2167 edge latch_e = loop_latch_edge (loop);
2168 tree loop_arg = PHI_ARG_DEF_FROM_EDGE (phi, latch_e);
2169 gimple def_stmt, def1 = NULL, def2 = NULL;
2170 enum tree_code orig_code, code;
2171 tree op1, op2, op3 = NULL_TREE, op4 = NULL_TREE;
2172 tree type;
2173 int nloop_uses;
2174 tree name;
2175 imm_use_iterator imm_iter;
2176 use_operand_p use_p;
2177 bool phi_def;
2179 *double_reduc = false;
2181 /* If CHECK_REDUCTION is true, we assume inner-most loop vectorization,
2182 otherwise, we assume outer loop vectorization. */
2183 gcc_assert ((check_reduction && loop == vect_loop)
2184 || (!check_reduction && flow_loop_nested_p (vect_loop, loop)));
2186 name = PHI_RESULT (phi);
2187 nloop_uses = 0;
2188 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, name)
2190 gimple use_stmt = USE_STMT (use_p);
2191 if (is_gimple_debug (use_stmt))
2192 continue;
2194 if (!flow_bb_inside_loop_p (loop, gimple_bb (use_stmt)))
2196 if (dump_enabled_p ())
2197 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2198 "intermediate value used outside loop.\n");
2200 return NULL;
2203 if (vinfo_for_stmt (use_stmt)
2204 && !is_pattern_stmt_p (vinfo_for_stmt (use_stmt)))
2205 nloop_uses++;
2206 if (nloop_uses > 1)
2208 if (dump_enabled_p ())
2209 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2210 "reduction used in loop.\n");
2211 return NULL;
2215 if (TREE_CODE (loop_arg) != SSA_NAME)
2217 if (dump_enabled_p ())
2219 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2220 "reduction: not ssa_name: ");
2221 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, loop_arg);
2222 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
2224 return NULL;
2227 def_stmt = SSA_NAME_DEF_STMT (loop_arg);
2228 if (!def_stmt)
2230 if (dump_enabled_p ())
2231 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2232 "reduction: no def_stmt.\n");
2233 return NULL;
2236 if (!is_gimple_assign (def_stmt) && gimple_code (def_stmt) != GIMPLE_PHI)
2238 if (dump_enabled_p ())
2240 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, def_stmt, 0);
2241 dump_printf (MSG_NOTE, "\n");
2243 return NULL;
2246 if (is_gimple_assign (def_stmt))
2248 name = gimple_assign_lhs (def_stmt);
2249 phi_def = false;
2251 else
2253 name = PHI_RESULT (def_stmt);
2254 phi_def = true;
2257 nloop_uses = 0;
2258 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, name)
2260 gimple use_stmt = USE_STMT (use_p);
2261 if (is_gimple_debug (use_stmt))
2262 continue;
2263 if (flow_bb_inside_loop_p (loop, gimple_bb (use_stmt))
2264 && vinfo_for_stmt (use_stmt)
2265 && !is_pattern_stmt_p (vinfo_for_stmt (use_stmt)))
2266 nloop_uses++;
2267 if (nloop_uses > 1)
2269 if (dump_enabled_p ())
2270 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2271 "reduction used in loop.\n");
2272 return NULL;
2276 /* If DEF_STMT is a phi node itself, we expect it to have a single argument
2277 defined in the inner loop. */
2278 if (phi_def)
2280 op1 = PHI_ARG_DEF (def_stmt, 0);
2282 if (gimple_phi_num_args (def_stmt) != 1
2283 || TREE_CODE (op1) != SSA_NAME)
2285 if (dump_enabled_p ())
2286 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2287 "unsupported phi node definition.\n");
2289 return NULL;
2292 def1 = SSA_NAME_DEF_STMT (op1);
2293 if (flow_bb_inside_loop_p (loop, gimple_bb (def_stmt))
2294 && loop->inner
2295 && flow_bb_inside_loop_p (loop->inner, gimple_bb (def1))
2296 && is_gimple_assign (def1))
2298 if (dump_enabled_p ())
2299 report_vect_op (MSG_NOTE, def_stmt,
2300 "detected double reduction: ");
2302 *double_reduc = true;
2303 return def_stmt;
2306 return NULL;
2309 code = orig_code = gimple_assign_rhs_code (def_stmt);
2311 /* We can handle "res -= x[i]", which is non-associative by
2312 simply rewriting this into "res += -x[i]". Avoid changing
2313 gimple instruction for the first simple tests and only do this
2314 if we're allowed to change code at all. */
2315 if (code == MINUS_EXPR
2316 && modify
2317 && (op1 = gimple_assign_rhs1 (def_stmt))
2318 && TREE_CODE (op1) == SSA_NAME
2319 && SSA_NAME_DEF_STMT (op1) == phi)
2320 code = PLUS_EXPR;
2322 if (check_reduction
2323 && (!commutative_tree_code (code) || !associative_tree_code (code)))
2325 if (dump_enabled_p ())
2326 report_vect_op (MSG_MISSED_OPTIMIZATION, def_stmt,
2327 "reduction: not commutative/associative: ");
2328 return NULL;
2331 if (get_gimple_rhs_class (code) != GIMPLE_BINARY_RHS)
2333 if (code != COND_EXPR)
2335 if (dump_enabled_p ())
2336 report_vect_op (MSG_MISSED_OPTIMIZATION, def_stmt,
2337 "reduction: not binary operation: ");
2339 return NULL;
2342 op3 = gimple_assign_rhs1 (def_stmt);
2343 if (COMPARISON_CLASS_P (op3))
2345 op4 = TREE_OPERAND (op3, 1);
2346 op3 = TREE_OPERAND (op3, 0);
2349 op1 = gimple_assign_rhs2 (def_stmt);
2350 op2 = gimple_assign_rhs3 (def_stmt);
2352 if (TREE_CODE (op1) != SSA_NAME && TREE_CODE (op2) != SSA_NAME)
2354 if (dump_enabled_p ())
2355 report_vect_op (MSG_MISSED_OPTIMIZATION, def_stmt,
2356 "reduction: uses not ssa_names: ");
2358 return NULL;
2361 else
2363 op1 = gimple_assign_rhs1 (def_stmt);
2364 op2 = gimple_assign_rhs2 (def_stmt);
2366 if (TREE_CODE (op1) != SSA_NAME && TREE_CODE (op2) != SSA_NAME)
2368 if (dump_enabled_p ())
2369 report_vect_op (MSG_MISSED_OPTIMIZATION, def_stmt,
2370 "reduction: uses not ssa_names: ");
2372 return NULL;
2376 type = TREE_TYPE (gimple_assign_lhs (def_stmt));
2377 if ((TREE_CODE (op1) == SSA_NAME
2378 && !types_compatible_p (type,TREE_TYPE (op1)))
2379 || (TREE_CODE (op2) == SSA_NAME
2380 && !types_compatible_p (type, TREE_TYPE (op2)))
2381 || (op3 && TREE_CODE (op3) == SSA_NAME
2382 && !types_compatible_p (type, TREE_TYPE (op3)))
2383 || (op4 && TREE_CODE (op4) == SSA_NAME
2384 && !types_compatible_p (type, TREE_TYPE (op4))))
2386 if (dump_enabled_p ())
2388 dump_printf_loc (MSG_NOTE, vect_location,
2389 "reduction: multiple types: operation type: ");
2390 dump_generic_expr (MSG_NOTE, TDF_SLIM, type);
2391 dump_printf (MSG_NOTE, ", operands types: ");
2392 dump_generic_expr (MSG_NOTE, TDF_SLIM,
2393 TREE_TYPE (op1));
2394 dump_printf (MSG_NOTE, ",");
2395 dump_generic_expr (MSG_NOTE, TDF_SLIM,
2396 TREE_TYPE (op2));
2397 if (op3)
2399 dump_printf (MSG_NOTE, ",");
2400 dump_generic_expr (MSG_NOTE, TDF_SLIM,
2401 TREE_TYPE (op3));
2404 if (op4)
2406 dump_printf (MSG_NOTE, ",");
2407 dump_generic_expr (MSG_NOTE, TDF_SLIM,
2408 TREE_TYPE (op4));
2410 dump_printf (MSG_NOTE, "\n");
2413 return NULL;
2416 /* Check that it's ok to change the order of the computation.
2417 Generally, when vectorizing a reduction we change the order of the
2418 computation. This may change the behavior of the program in some
2419 cases, so we need to check that this is ok. One exception is when
2420 vectorizing an outer-loop: the inner-loop is executed sequentially,
2421 and therefore vectorizing reductions in the inner-loop during
2422 outer-loop vectorization is safe. */
2424 /* CHECKME: check for !flag_finite_math_only too? */
2425 if (SCALAR_FLOAT_TYPE_P (type) && !flag_associative_math
2426 && check_reduction)
2428 /* Changing the order of operations changes the semantics. */
2429 if (dump_enabled_p ())
2430 report_vect_op (MSG_MISSED_OPTIMIZATION, def_stmt,
2431 "reduction: unsafe fp math optimization: ");
2432 return NULL;
2434 else if (INTEGRAL_TYPE_P (type) && TYPE_OVERFLOW_TRAPS (type)
2435 && check_reduction)
2437 /* Changing the order of operations changes the semantics. */
2438 if (dump_enabled_p ())
2439 report_vect_op (MSG_MISSED_OPTIMIZATION, def_stmt,
2440 "reduction: unsafe int math optimization: ");
2441 return NULL;
2443 else if (SAT_FIXED_POINT_TYPE_P (type) && check_reduction)
2445 /* Changing the order of operations changes the semantics. */
2446 if (dump_enabled_p ())
2447 report_vect_op (MSG_MISSED_OPTIMIZATION, def_stmt,
2448 "reduction: unsafe fixed-point math optimization: ");
2449 return NULL;
2452 /* If we detected "res -= x[i]" earlier, rewrite it into
2453 "res += -x[i]" now. If this turns out to be useless reassoc
2454 will clean it up again. */
2455 if (orig_code == MINUS_EXPR)
2457 tree rhs = gimple_assign_rhs2 (def_stmt);
2458 tree negrhs = make_ssa_name (TREE_TYPE (rhs), NULL);
2459 gimple negate_stmt = gimple_build_assign_with_ops (NEGATE_EXPR, negrhs,
2460 rhs, NULL);
2461 gimple_stmt_iterator gsi = gsi_for_stmt (def_stmt);
2462 set_vinfo_for_stmt (negate_stmt, new_stmt_vec_info (negate_stmt,
2463 loop_info, NULL));
2464 gsi_insert_before (&gsi, negate_stmt, GSI_NEW_STMT);
2465 gimple_assign_set_rhs2 (def_stmt, negrhs);
2466 gimple_assign_set_rhs_code (def_stmt, PLUS_EXPR);
2467 update_stmt (def_stmt);
2470 /* Reduction is safe. We're dealing with one of the following:
2471 1) integer arithmetic and no trapv
2472 2) floating point arithmetic, and special flags permit this optimization
2473 3) nested cycle (i.e., outer loop vectorization). */
2474 if (TREE_CODE (op1) == SSA_NAME)
2475 def1 = SSA_NAME_DEF_STMT (op1);
2477 if (TREE_CODE (op2) == SSA_NAME)
2478 def2 = SSA_NAME_DEF_STMT (op2);
2480 if (code != COND_EXPR
2481 && ((!def1 || gimple_nop_p (def1)) && (!def2 || gimple_nop_p (def2))))
2483 if (dump_enabled_p ())
2484 report_vect_op (MSG_NOTE, def_stmt, "reduction: no defs for operands: ");
2485 return NULL;
2488 /* Check that one def is the reduction def, defined by PHI,
2489 the other def is either defined in the loop ("vect_internal_def"),
2490 or it's an induction (defined by a loop-header phi-node). */
2492 if (def2 && def2 == phi
2493 && (code == COND_EXPR
2494 || !def1 || gimple_nop_p (def1)
2495 || !flow_bb_inside_loop_p (loop, gimple_bb (def1))
2496 || (def1 && flow_bb_inside_loop_p (loop, gimple_bb (def1))
2497 && (is_gimple_assign (def1)
2498 || is_gimple_call (def1)
2499 || STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def1))
2500 == vect_induction_def
2501 || (gimple_code (def1) == GIMPLE_PHI
2502 && STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def1))
2503 == vect_internal_def
2504 && !is_loop_header_bb_p (gimple_bb (def1)))))))
2506 if (dump_enabled_p ())
2507 report_vect_op (MSG_NOTE, def_stmt, "detected reduction: ");
2508 return def_stmt;
2511 if (def1 && def1 == phi
2512 && (code == COND_EXPR
2513 || !def2 || gimple_nop_p (def2)
2514 || !flow_bb_inside_loop_p (loop, gimple_bb (def2))
2515 || (def2 && flow_bb_inside_loop_p (loop, gimple_bb (def2))
2516 && (is_gimple_assign (def2)
2517 || is_gimple_call (def2)
2518 || STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def2))
2519 == vect_induction_def
2520 || (gimple_code (def2) == GIMPLE_PHI
2521 && STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def2))
2522 == vect_internal_def
2523 && !is_loop_header_bb_p (gimple_bb (def2)))))))
2525 if (check_reduction)
2527 /* Swap operands (just for simplicity - so that the rest of the code
2528 can assume that the reduction variable is always the last (second)
2529 argument). */
2530 if (dump_enabled_p ())
2531 report_vect_op (MSG_NOTE, def_stmt,
2532 "detected reduction: need to swap operands: ");
2534 swap_ssa_operands (def_stmt, gimple_assign_rhs1_ptr (def_stmt),
2535 gimple_assign_rhs2_ptr (def_stmt));
2537 if (CONSTANT_CLASS_P (gimple_assign_rhs1 (def_stmt)))
2538 LOOP_VINFO_OPERANDS_SWAPPED (loop_info) = true;
2540 else
2542 if (dump_enabled_p ())
2543 report_vect_op (MSG_NOTE, def_stmt, "detected reduction: ");
2546 return def_stmt;
2549 /* Try to find SLP reduction chain. */
2550 if (check_reduction && vect_is_slp_reduction (loop_info, phi, def_stmt))
2552 if (dump_enabled_p ())
2553 report_vect_op (MSG_NOTE, def_stmt,
2554 "reduction: detected reduction chain: ");
2556 return def_stmt;
2559 if (dump_enabled_p ())
2560 report_vect_op (MSG_MISSED_OPTIMIZATION, def_stmt,
2561 "reduction: unknown pattern: ");
2563 return NULL;
2566 /* Wrapper around vect_is_simple_reduction_1, that won't modify code
2567 in-place. Arguments as there. */
2569 static gimple
2570 vect_is_simple_reduction (loop_vec_info loop_info, gimple phi,
2571 bool check_reduction, bool *double_reduc)
2573 return vect_is_simple_reduction_1 (loop_info, phi, check_reduction,
2574 double_reduc, false);
2577 /* Wrapper around vect_is_simple_reduction_1, which will modify code
2578 in-place if it enables detection of more reductions. Arguments
2579 as there. */
2581 gimple
2582 vect_force_simple_reduction (loop_vec_info loop_info, gimple phi,
2583 bool check_reduction, bool *double_reduc)
2585 return vect_is_simple_reduction_1 (loop_info, phi, check_reduction,
2586 double_reduc, true);
2589 /* Calculate the cost of one scalar iteration of the loop. */
2590 static int
2591 vect_get_single_scalar_iteration_cost (loop_vec_info loop_vinfo)
2593 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
2594 basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
2595 int nbbs = loop->num_nodes, factor, scalar_single_iter_cost = 0;
2596 int innerloop_iters, i, stmt_cost;
2598 /* Count statements in scalar loop. Using this as scalar cost for a single
2599 iteration for now.
2601 TODO: Add outer loop support.
2603 TODO: Consider assigning different costs to different scalar
2604 statements. */
2606 /* FORNOW. */
2607 innerloop_iters = 1;
2608 if (loop->inner)
2609 innerloop_iters = 50; /* FIXME */
2611 for (i = 0; i < nbbs; i++)
2613 gimple_stmt_iterator si;
2614 basic_block bb = bbs[i];
2616 if (bb->loop_father == loop->inner)
2617 factor = innerloop_iters;
2618 else
2619 factor = 1;
2621 for (si = gsi_start_bb (bb); !gsi_end_p (si); gsi_next (&si))
2623 gimple stmt = gsi_stmt (si);
2624 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
2626 if (!is_gimple_assign (stmt) && !is_gimple_call (stmt))
2627 continue;
2629 /* Skip stmts that are not vectorized inside the loop. */
2630 if (stmt_info
2631 && !STMT_VINFO_RELEVANT_P (stmt_info)
2632 && (!STMT_VINFO_LIVE_P (stmt_info)
2633 || !VECTORIZABLE_CYCLE_DEF (STMT_VINFO_DEF_TYPE (stmt_info)))
2634 && !STMT_VINFO_IN_PATTERN_P (stmt_info))
2635 continue;
2637 if (STMT_VINFO_DATA_REF (vinfo_for_stmt (stmt)))
2639 if (DR_IS_READ (STMT_VINFO_DATA_REF (vinfo_for_stmt (stmt))))
2640 stmt_cost = vect_get_stmt_cost (scalar_load);
2641 else
2642 stmt_cost = vect_get_stmt_cost (scalar_store);
2644 else
2645 stmt_cost = vect_get_stmt_cost (scalar_stmt);
2647 scalar_single_iter_cost += stmt_cost * factor;
2650 return scalar_single_iter_cost;
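/* E.g. (illustrative): a loop body with one load, one addition and one store
   is costed as vect_get_stmt_cost (scalar_load) + vect_get_stmt_cost
   (scalar_stmt) + vect_get_stmt_cost (scalar_store); statements in a nested
   inner loop are additionally weighted by the FIXME factor of 50 above.  */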
2653 /* Calculate cost of peeling the loop PEEL_ITERS_PROLOGUE times. */
2654 int
2655 vect_get_known_peeling_cost (loop_vec_info loop_vinfo, int peel_iters_prologue,
2656 int *peel_iters_epilogue,
2657 int scalar_single_iter_cost,
2658 stmt_vector_for_cost *prologue_cost_vec,
2659 stmt_vector_for_cost *epilogue_cost_vec)
2661 int retval = 0;
2662 int vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
2664 if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
2666 *peel_iters_epilogue = vf/2;
2667 if (dump_enabled_p ())
2668 dump_printf_loc (MSG_NOTE, vect_location,
2669 "cost model: epilogue peel iters set to vf/2 "
2670 "because loop iterations are unknown.\n");
2672 /* If peeled iterations are known but the number of scalar loop
2673 iterations is unknown, count a taken branch per peeled loop. */
2674 retval = record_stmt_cost (prologue_cost_vec, 2, cond_branch_taken,
2675 NULL, 0, vect_prologue);
2677 else
2679 int niters = LOOP_VINFO_INT_NITERS (loop_vinfo);
2680 peel_iters_prologue = niters < peel_iters_prologue ?
2681 niters : peel_iters_prologue;
2682 *peel_iters_epilogue = (niters - peel_iters_prologue) % vf;
2683 /* If we need to peel for gaps, but no peeling is required, we have to
2684 peel VF iterations. */
2685 if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo) && !*peel_iters_epilogue)
2686 *peel_iters_epilogue = vf;
2689 if (peel_iters_prologue)
2690 retval += record_stmt_cost (prologue_cost_vec,
2691 peel_iters_prologue * scalar_single_iter_cost,
2692 scalar_stmt, NULL, 0, vect_prologue);
2693 if (*peel_iters_epilogue)
2694 retval += record_stmt_cost (epilogue_cost_vec,
2695 *peel_iters_epilogue * scalar_single_iter_cost,
2696 scalar_stmt, NULL, 0, vect_epilogue);
2697 return retval;
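/* Illustrative numbers: with a known trip count of 103, VF == 8 and a
   prologue peel of 5 iterations, the epilogue gets (103 - 5) % 8 == 2
   iterations; both peel counts are then charged as count * scalar single
   iteration cost into the prologue resp. epilogue cost vectors.  */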
2700 /* Function vect_estimate_min_profitable_iters
2702 Return the number of iterations required for the vector version of the
2703 loop to be profitable relative to the cost of the scalar version of the
2704 loop. */
2706 static void
2707 vect_estimate_min_profitable_iters (loop_vec_info loop_vinfo,
2708 int *ret_min_profitable_niters,
2709 int *ret_min_profitable_estimate)
2711 int min_profitable_iters;
2712 int min_profitable_estimate;
2713 int peel_iters_prologue;
2714 int peel_iters_epilogue;
2715 unsigned vec_inside_cost = 0;
2716 int vec_outside_cost = 0;
2717 unsigned vec_prologue_cost = 0;
2718 unsigned vec_epilogue_cost = 0;
2719 int scalar_single_iter_cost = 0;
2720 int scalar_outside_cost = 0;
2721 int vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
2722 int npeel = LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo);
2723 void *target_cost_data = LOOP_VINFO_TARGET_COST_DATA (loop_vinfo);
2725 /* Cost model disabled. */
2726 if (unlimited_cost_model (LOOP_VINFO_LOOP (loop_vinfo)))
2728 dump_printf_loc (MSG_NOTE, vect_location, "cost model disabled.\n");
2729 *ret_min_profitable_niters = 0;
2730 *ret_min_profitable_estimate = 0;
2731 return;
2734 /* Requires loop versioning tests to handle misalignment. */
2735 if (LOOP_REQUIRES_VERSIONING_FOR_ALIGNMENT (loop_vinfo))
2737 /* FIXME: Make cost depend on complexity of individual check. */
2738 unsigned len = LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).length ();
2739 (void) add_stmt_cost (target_cost_data, len, vector_stmt, NULL, 0,
2740 vect_prologue);
2741 dump_printf (MSG_NOTE,
2742 "cost model: Adding cost of checks for loop "
2743 "versioning to treat misalignment.\n");
2746 /* Requires loop versioning with alias checks. */
2747 if (LOOP_REQUIRES_VERSIONING_FOR_ALIAS (loop_vinfo))
2749 /* FIXME: Make cost depend on complexity of individual check. */
2750 unsigned len = LOOP_VINFO_MAY_ALIAS_DDRS (loop_vinfo).length ();
2751 (void) add_stmt_cost (target_cost_data, len, vector_stmt, NULL, 0,
2752 vect_prologue);
2753 dump_printf (MSG_NOTE,
2754 "cost model: Adding cost of checks for loop "
2755 "versioning to check aliasing.\n");
2758 if (LOOP_REQUIRES_VERSIONING_FOR_ALIGNMENT (loop_vinfo)
2759 || LOOP_REQUIRES_VERSIONING_FOR_ALIAS (loop_vinfo))
2760 (void) add_stmt_cost (target_cost_data, 1, cond_branch_taken, NULL, 0,
2761 vect_prologue);
2763 /* Count statements in scalar loop. Using this as scalar cost for a single
2764 iteration for now.
2766 TODO: Add outer loop support.
2768 TODO: Consider assigning different costs to different scalar
2769 statements. */
2771 scalar_single_iter_cost = vect_get_single_scalar_iteration_cost (loop_vinfo);
2773 /* Add additional cost for the peeled instructions in prologue and epilogue
2774 loop.
2776 FORNOW: If we don't know the value of peel_iters for prologue or epilogue
2777 at compile-time - we assume it's vf/2 (the worst would be vf-1).
2779 TODO: Build an expression that represents peel_iters for prologue and
2780 epilogue to be used in a run-time test. */
2782 if (npeel < 0)
2784 peel_iters_prologue = vf/2;
2785 dump_printf (MSG_NOTE, "cost model: "
2786 "prologue peel iters set to vf/2.\n");
2788 /* If peeling for alignment is unknown, the loop bound of the main loop
2789 becomes unknown. */
2790 peel_iters_epilogue = vf/2;
2791 dump_printf (MSG_NOTE, "cost model: "
2792 "epilogue peel iters set to vf/2 because "
2793 "peeling for alignment is unknown.\n");
2795 /* If peeled iterations are unknown, count a taken branch and a not taken
2796 branch per peeled loop. Even if scalar loop iterations are known,
2797 vector iterations are not known since peeled prologue iterations are
2798 not known. Hence guards remain the same. */
2799 (void) add_stmt_cost (target_cost_data, 2, cond_branch_taken,
2800 NULL, 0, vect_prologue);
2801 (void) add_stmt_cost (target_cost_data, 2, cond_branch_not_taken,
2802 NULL, 0, vect_prologue);
2803 /* FORNOW: Don't attempt to pass individual scalar instructions to
2804 the model; just assume linear cost for scalar iterations. */
2805 (void) add_stmt_cost (target_cost_data,
2806 peel_iters_prologue * scalar_single_iter_cost,
2807 scalar_stmt, NULL, 0, vect_prologue);
2808 (void) add_stmt_cost (target_cost_data,
2809 peel_iters_epilogue * scalar_single_iter_cost,
2810 scalar_stmt, NULL, 0, vect_epilogue);
2812 else
2814 stmt_vector_for_cost prologue_cost_vec, epilogue_cost_vec;
2815 stmt_info_for_cost *si;
2816 int j;
2817 void *data = LOOP_VINFO_TARGET_COST_DATA (loop_vinfo);
2819 prologue_cost_vec.create (2);
2820 epilogue_cost_vec.create (2);
2821 peel_iters_prologue = npeel;
2823 (void) vect_get_known_peeling_cost (loop_vinfo, peel_iters_prologue,
2824 &peel_iters_epilogue,
2825 scalar_single_iter_cost,
2826 &prologue_cost_vec,
2827 &epilogue_cost_vec);
2829 FOR_EACH_VEC_ELT (prologue_cost_vec, j, si)
2831 struct _stmt_vec_info *stmt_info
2832 = si->stmt ? vinfo_for_stmt (si->stmt) : NULL;
2833 (void) add_stmt_cost (data, si->count, si->kind, stmt_info,
2834 si->misalign, vect_prologue);
2837 FOR_EACH_VEC_ELT (epilogue_cost_vec, j, si)
2839 struct _stmt_vec_info *stmt_info
2840 = si->stmt ? vinfo_for_stmt (si->stmt) : NULL;
2841 (void) add_stmt_cost (data, si->count, si->kind, stmt_info,
2842 si->misalign, vect_epilogue);
2845 prologue_cost_vec.release ();
2846 epilogue_cost_vec.release ();
2849 /* FORNOW: The scalar outside cost is incremented in one of the
2850 following ways:
2852 1. The vectorizer checks for alignment and aliasing and generates
2853 a condition that allows dynamic vectorization. A cost model
2854 check is ANDED with the versioning condition. Hence scalar code
2855 path now has the added cost of the versioning check.
2857 if (cost > th & versioning_check)
2858 jmp to vector code
2860 Hence run-time scalar is incremented by not-taken branch cost.
2862 2. The vectorizer then checks if a prologue is required. If the
2863 cost model check was not done before during versioning, it has to
2864 be done before the prologue check.
2866 if (cost <= th)
2867 prologue = scalar_iters
2868 if (prologue == 0)
2869 jmp to vector code
2870 else
2871 execute prologue
2872 if (prologue == num_iters)
2873 go to exit
2875 Hence the run-time scalar cost is incremented by a taken branch,
2876 plus a not-taken branch, plus a taken branch cost.
2878 3. The vectorizer then checks if an epilogue is required. If the
2879 cost model check was not done before during prologue check, it
2880 has to be done with the epilogue check.
2882 if (prologue == 0)
2883 jmp to vector code
2884 else
2885 execute prologue
2886 if (prologue == num_iters)
2887 go to exit
2888 vector code:
2889 if ((cost <= th) | (scalar_iters-prologue-epilogue == 0))
2890 jmp to epilogue
2892 Hence the run-time scalar cost should be incremented by 2 taken
2893 branches.
2895 TODO: The back end may reorder the BBS's differently and reverse
2896 conditions/branch directions. Change the estimates below to
2897 something more reasonable. */
2899 /* If the number of iterations is known and we do not do versioning, we can
2900 decide whether to vectorize at compile time. Hence the scalar version
2901 does not carry cost model guard costs. */
2902 if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
2903 || LOOP_REQUIRES_VERSIONING_FOR_ALIGNMENT (loop_vinfo)
2904 || LOOP_REQUIRES_VERSIONING_FOR_ALIAS (loop_vinfo))
2906 /* Cost model check occurs at versioning. */
2907 if (LOOP_REQUIRES_VERSIONING_FOR_ALIGNMENT (loop_vinfo)
2908 || LOOP_REQUIRES_VERSIONING_FOR_ALIAS (loop_vinfo))
2909 scalar_outside_cost += vect_get_stmt_cost (cond_branch_not_taken);
2910 else
2912 /* Cost model check occurs at prologue generation. */
2913 if (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) < 0)
2914 scalar_outside_cost += 2 * vect_get_stmt_cost (cond_branch_taken)
2915 + vect_get_stmt_cost (cond_branch_not_taken);
2916 /* Cost model check occurs at epilogue generation. */
2917 else
2918 scalar_outside_cost += 2 * vect_get_stmt_cost (cond_branch_taken);
2922 /* Complete the target-specific cost calculations. */
2923 finish_cost (LOOP_VINFO_TARGET_COST_DATA (loop_vinfo), &vec_prologue_cost,
2924 &vec_inside_cost, &vec_epilogue_cost);
2926 vec_outside_cost = (int)(vec_prologue_cost + vec_epilogue_cost);
2928 /* Calculate number of iterations required to make the vector version
2929 profitable, relative to the loop bodies only. The following condition
2930 must hold true:
2931 SIC * niters + SOC > VIC * ((niters-PL_ITERS-EP_ITERS)/VF) + VOC
2932 where
2933 SIC = scalar iteration cost, VIC = vector iteration cost,
2934 VOC = vector outside cost, VF = vectorization factor,
2935 PL_ITERS = prologue iterations, EP_ITERS= epilogue iterations
2936 SOC = scalar outside cost for run time cost model check. */
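/* A worked example with purely made-up costs: SIC == 4, VIC == 6, VOC == 20,
   SOC == 6, VF == 4, no prologue/epilogue peeling.  SIC * VF == 16 exceeds
   VIC == 6, so the loop body is cheaper when vectorized.  The threshold is
   ((VOC - SOC) * VF) / (SIC * VF - VIC) == 56 / 10 == 5; because
   4 * 4 * 5 <= 6 * 5 + (20 - 6) * 4 holds, it is bumped to 6, then raised to
   at least VF and finally decremented for the "niters <= threshold" guard.  */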
2938 if ((scalar_single_iter_cost * vf) > (int) vec_inside_cost)
2940 if (vec_outside_cost <= 0)
2941 min_profitable_iters = 1;
2942 else
2944 min_profitable_iters = ((vec_outside_cost - scalar_outside_cost) * vf
2945 - vec_inside_cost * peel_iters_prologue
2946 - vec_inside_cost * peel_iters_epilogue)
2947 / ((scalar_single_iter_cost * vf)
2948 - vec_inside_cost);
2950 if ((scalar_single_iter_cost * vf * min_profitable_iters)
2951 <= (((int) vec_inside_cost * min_profitable_iters)
2952 + (((int) vec_outside_cost - scalar_outside_cost) * vf)))
2953 min_profitable_iters++;
2956 /* vector version will never be profitable. */
2957 else
2959 if (LOOP_VINFO_LOOP (loop_vinfo)->force_vect)
2960 warning_at (vect_location, OPT_Wopenmp_simd, "vectorization "
2961 "did not happen for a simd loop");
2963 if (dump_enabled_p ())
2964 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2965 "cost model: the vector iteration cost = %d "
2966 "divided by the scalar iteration cost = %d "
2967 "is greater than or equal to the vectorization factor = %d"
2968 ".\n",
2969 vec_inside_cost, scalar_single_iter_cost, vf);
2970 *ret_min_profitable_niters = -1;
2971 *ret_min_profitable_estimate = -1;
2972 return;
2975 if (dump_enabled_p ())
2977 dump_printf_loc (MSG_NOTE, vect_location, "Cost model analysis: \n");
2978 dump_printf (MSG_NOTE, " Vector inside of loop cost: %d\n",
2979 vec_inside_cost);
2980 dump_printf (MSG_NOTE, " Vector prologue cost: %d\n",
2981 vec_prologue_cost);
2982 dump_printf (MSG_NOTE, " Vector epilogue cost: %d\n",
2983 vec_epilogue_cost);
2984 dump_printf (MSG_NOTE, " Scalar iteration cost: %d\n",
2985 scalar_single_iter_cost);
2986 dump_printf (MSG_NOTE, " Scalar outside cost: %d\n",
2987 scalar_outside_cost);
2988 dump_printf (MSG_NOTE, " Vector outside cost: %d\n",
2989 vec_outside_cost);
2990 dump_printf (MSG_NOTE, " prologue iterations: %d\n",
2991 peel_iters_prologue);
2992 dump_printf (MSG_NOTE, " epilogue iterations: %d\n",
2993 peel_iters_epilogue);
2994 dump_printf (MSG_NOTE,
2995 " Calculated minimum iters for profitability: %d\n",
2996 min_profitable_iters);
2997 dump_printf (MSG_NOTE, "\n");
3000 min_profitable_iters =
3001 min_profitable_iters < vf ? vf : min_profitable_iters;
3003 /* Because the condition we create is:
3004 if (niters <= min_profitable_iters)
3005 then skip the vectorized loop. */
3006 min_profitable_iters--;
3008 if (dump_enabled_p ())
3009 dump_printf_loc (MSG_NOTE, vect_location,
3010 " Runtime profitability threshold = %d\n",
3011 min_profitable_iters);
3013 *ret_min_profitable_niters = min_profitable_iters;
3015 /* Calculate number of iterations required to make the vector version
3016 profitable, relative to the loop bodies only.
3018 Non-vectorized variant is SIC * niters and it must win over vector
3019 variant on the expected loop trip count. The following condition must hold true:
3020 SIC * niters > VIC * ((niters-PL_ITERS-EP_ITERS)/VF) + VOC + SOC */
3022 if (vec_outside_cost <= 0)
3023 min_profitable_estimate = 1;
3024 else
3026 min_profitable_estimate = ((vec_outside_cost + scalar_outside_cost) * vf
3027 - vec_inside_cost * peel_iters_prologue
3028 - vec_inside_cost * peel_iters_epilogue)
3029 / ((scalar_single_iter_cost * vf)
3030 - vec_inside_cost);
3032 min_profitable_estimate --;
3033 min_profitable_estimate = MAX (min_profitable_estimate, min_profitable_iters);
3034 if (dump_enabled_p ())
3035 dump_printf_loc (MSG_NOTE, vect_location,
3036 " Static estimate profitability threshold = %d\n",
3037 min_profitable_estimate);
3039 *ret_min_profitable_estimate = min_profitable_estimate;
3043 /* TODO: Close dependency between vect_model_*_cost and vectorizable_*
3044 functions. Design better to avoid maintenance issues. */
3046 /* Function vect_model_reduction_cost.
3048 Models cost for a reduction operation, including the vector ops
3049 generated within the strip-mine loop, the initial definition before
3050 the loop, and the epilogue code that must be generated. */
3052 static bool
3053 vect_model_reduction_cost (stmt_vec_info stmt_info, enum tree_code reduc_code,
3054 int ncopies)
3056 int prologue_cost = 0, epilogue_cost = 0;
3057 enum tree_code code;
3058 optab optab;
3059 tree vectype;
3060 gimple stmt, orig_stmt;
3061 tree reduction_op;
3062 enum machine_mode mode;
3063 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
3064 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
3065 void *target_cost_data = LOOP_VINFO_TARGET_COST_DATA (loop_vinfo);
3067 /* Cost of reduction op inside loop. */
3068 unsigned inside_cost = add_stmt_cost (target_cost_data, ncopies, vector_stmt,
3069 stmt_info, 0, vect_body);
3070 stmt = STMT_VINFO_STMT (stmt_info);
3072 switch (get_gimple_rhs_class (gimple_assign_rhs_code (stmt)))
3074 case GIMPLE_SINGLE_RHS:
3075 gcc_assert (TREE_OPERAND_LENGTH (gimple_assign_rhs1 (stmt)) == ternary_op);
3076 reduction_op = TREE_OPERAND (gimple_assign_rhs1 (stmt), 2);
3077 break;
3078 case GIMPLE_UNARY_RHS:
3079 reduction_op = gimple_assign_rhs1 (stmt);
3080 break;
3081 case GIMPLE_BINARY_RHS:
3082 reduction_op = gimple_assign_rhs2 (stmt);
3083 break;
3084 case GIMPLE_TERNARY_RHS:
3085 reduction_op = gimple_assign_rhs3 (stmt);
3086 break;
3087 default:
3088 gcc_unreachable ();
3091 vectype = get_vectype_for_scalar_type (TREE_TYPE (reduction_op));
3092 if (!vectype)
3094 if (dump_enabled_p ())
3096 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3097 "unsupported data-type ");
3098 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
3099 TREE_TYPE (reduction_op));
3100 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
3102 return false;
3105 mode = TYPE_MODE (vectype);
3106 orig_stmt = STMT_VINFO_RELATED_STMT (stmt_info);
3108 if (!orig_stmt)
3109 orig_stmt = STMT_VINFO_STMT (stmt_info);
3111 code = gimple_assign_rhs_code (orig_stmt);
3113 /* Add in cost for initial definition. */
3114 prologue_cost += add_stmt_cost (target_cost_data, 1, scalar_to_vec,
3115 stmt_info, 0, vect_prologue);
3117 /* Determine cost of epilogue code.
3119 We have a reduction operator that will reduce the vector in one statement.
3120 Also requires scalar extract. */
3122 if (!nested_in_vect_loop_p (loop, orig_stmt))
3124 if (reduc_code != ERROR_MARK)
3126 epilogue_cost += add_stmt_cost (target_cost_data, 1, vector_stmt,
3127 stmt_info, 0, vect_epilogue);
3128 epilogue_cost += add_stmt_cost (target_cost_data, 1, vec_to_scalar,
3129 stmt_info, 0, vect_epilogue);
3131 else
3133 int vec_size_in_bits = tree_to_uhwi (TYPE_SIZE (vectype));
3134 tree bitsize =
3135 TYPE_SIZE (TREE_TYPE (gimple_assign_lhs (orig_stmt)));
3136 int element_bitsize = tree_to_uhwi (bitsize);
3137 int nelements = vec_size_in_bits / element_bitsize;
3139 optab = optab_for_tree_code (code, vectype, optab_default);
3141 /* We have a whole vector shift available. */
3142 if (VECTOR_MODE_P (mode)
3143 && optab_handler (optab, mode) != CODE_FOR_nothing
3144 && optab_handler (vec_shr_optab, mode) != CODE_FOR_nothing)
3146 /* Final reduction via vector shifts and the reduction operator.
3147 Also requires scalar extract. */
3148 epilogue_cost += add_stmt_cost (target_cost_data,
3149 exact_log2 (nelements) * 2,
3150 vector_stmt, stmt_info, 0,
3151 vect_epilogue);
3152 epilogue_cost += add_stmt_cost (target_cost_data, 1,
3153 vec_to_scalar, stmt_info, 0,
3154 vect_epilogue);
3156 else
3157 /* Use extracts and reduction op for final reduction. For N
3158 elements, we have N extracts and N-1 reduction ops. */
3159 epilogue_cost += add_stmt_cost (target_cost_data,
3160 nelements + nelements - 1,
3161 vector_stmt, stmt_info, 0,
3162 vect_epilogue);
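/* Illustration: for a V8HI reduction without a REDUC_* epilogue pattern,
   nelements == 8.  If a whole-vector shift is available the epilogue is
   costed as exact_log2 (8) * 2 == 6 vector stmts plus one vec_to_scalar
   extract; otherwise it is costed as 8 + 7 == 15 vector stmts (N extracts
   and N-1 scalar reduction ops).  */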
3166 if (dump_enabled_p ())
3167 dump_printf (MSG_NOTE,
3168 "vect_model_reduction_cost: inside_cost = %d, "
3169 "prologue_cost = %d, epilogue_cost = %d .\n", inside_cost,
3170 prologue_cost, epilogue_cost);
3172 return true;
3176 /* Function vect_model_induction_cost.
3178 Models cost for induction operations. */
3180 static void
3181 vect_model_induction_cost (stmt_vec_info stmt_info, int ncopies)
3183 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
3184 void *target_cost_data = LOOP_VINFO_TARGET_COST_DATA (loop_vinfo);
3185 unsigned inside_cost, prologue_cost;
3187 /* loop cost for vec_loop. */
3188 inside_cost = add_stmt_cost (target_cost_data, ncopies, vector_stmt,
3189 stmt_info, 0, vect_body);
3191 /* prologue cost for vec_init and vec_step. */
3192 prologue_cost = add_stmt_cost (target_cost_data, 2, scalar_to_vec,
3193 stmt_info, 0, vect_prologue);
3195 if (dump_enabled_p ())
3196 dump_printf_loc (MSG_NOTE, vect_location,
3197 "vect_model_induction_cost: inside_cost = %d, "
3198 "prologue_cost = %d .\n", inside_cost, prologue_cost);
3202 /* Function get_initial_def_for_induction
3204 Input:
3205 STMT - a stmt that performs an induction operation in the loop.
3206 IV_PHI - the initial value of the induction variable
3208 Output:
3209 Return a vector variable, initialized with the first VF values of
3210 the induction variable. E.g., for an iv with IV_PHI='X' and
3211 evolution S, for a vector of 4 units, we want to return:
3212 [X, X + S, X + 2*S, X + 3*S]. */
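/* For instance (hypothetical loop, non-nested case, VF == 4):

     for (i = 0; i < n; i++)
       { a[i] = j; j += 3; }

   with initial value j0 the vector returned is [j0, j0+3, j0+6, j0+9] and the
   step vector created below is [12, 12, 12, 12], i.e. VF * S in every lane.  */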
3214 static tree
3215 get_initial_def_for_induction (gimple iv_phi)
3217 stmt_vec_info stmt_vinfo = vinfo_for_stmt (iv_phi);
3218 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_vinfo);
3219 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
3220 tree vectype;
3221 int nunits;
3222 edge pe = loop_preheader_edge (loop);
3223 struct loop *iv_loop;
3224 basic_block new_bb;
3225 tree new_vec, vec_init, vec_step, t;
3226 tree new_var;
3227 tree new_name;
3228 gimple init_stmt, induction_phi, new_stmt;
3229 tree induc_def, vec_def, vec_dest;
3230 tree init_expr, step_expr;
3231 int vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
3232 int i;
3233 int ncopies;
3234 tree expr;
3235 stmt_vec_info phi_info = vinfo_for_stmt (iv_phi);
3236 bool nested_in_vect_loop = false;
3237 gimple_seq stmts = NULL;
3238 imm_use_iterator imm_iter;
3239 use_operand_p use_p;
3240 gimple exit_phi;
3241 edge latch_e;
3242 tree loop_arg;
3243 gimple_stmt_iterator si;
3244 basic_block bb = gimple_bb (iv_phi);
3245 tree stepvectype;
3246 tree resvectype;
3248 /* Is phi in an inner-loop, while vectorizing an enclosing outer-loop? */
3249 if (nested_in_vect_loop_p (loop, iv_phi))
3251 nested_in_vect_loop = true;
3252 iv_loop = loop->inner;
3254 else
3255 iv_loop = loop;
3256 gcc_assert (iv_loop == (gimple_bb (iv_phi))->loop_father);
3258 latch_e = loop_latch_edge (iv_loop);
3259 loop_arg = PHI_ARG_DEF_FROM_EDGE (iv_phi, latch_e);
3261 step_expr = STMT_VINFO_LOOP_PHI_EVOLUTION_PART (phi_info);
3262 gcc_assert (step_expr != NULL_TREE);
3264 pe = loop_preheader_edge (iv_loop);
3265 init_expr = PHI_ARG_DEF_FROM_EDGE (iv_phi,
3266 loop_preheader_edge (iv_loop));
3268 vectype = get_vectype_for_scalar_type (TREE_TYPE (init_expr));
3269 resvectype = get_vectype_for_scalar_type (TREE_TYPE (PHI_RESULT (iv_phi)));
3270 gcc_assert (vectype);
3271 nunits = TYPE_VECTOR_SUBPARTS (vectype);
3272 ncopies = vf / nunits;
3274 gcc_assert (phi_info);
3275 gcc_assert (ncopies >= 1);
3277 /* Convert the step to the desired type. */
3278 step_expr = force_gimple_operand (fold_convert (TREE_TYPE (vectype),
3279 step_expr),
3280 &stmts, true, NULL_TREE);
3281 if (stmts)
3283 new_bb = gsi_insert_seq_on_edge_immediate (pe, stmts);
3284 gcc_assert (!new_bb);
3287 /* Find the first insertion point in the BB. */
3288 si = gsi_after_labels (bb);
3290 /* Create the vector that holds the initial_value of the induction. */
3291 if (nested_in_vect_loop)
3293 /* iv_loop is nested in the loop to be vectorized. init_expr had already
3294 been created during vectorization of previous stmts. We obtain it
3295 from the STMT_VINFO_VEC_STMT of the defining stmt. */
3296 vec_init = vect_get_vec_def_for_operand (init_expr, iv_phi, NULL);
3297 /* If the initial value is not of proper type, convert it. */
3298 if (!useless_type_conversion_p (vectype, TREE_TYPE (vec_init)))
3300 new_stmt = gimple_build_assign_with_ops
3301 (VIEW_CONVERT_EXPR,
3302 vect_get_new_vect_var (vectype, vect_simple_var, "vec_iv_"),
3303 build1 (VIEW_CONVERT_EXPR, vectype, vec_init), NULL_TREE);
3304 vec_init = make_ssa_name (gimple_assign_lhs (new_stmt), new_stmt);
3305 gimple_assign_set_lhs (new_stmt, vec_init);
3306 new_bb = gsi_insert_on_edge_immediate (loop_preheader_edge (iv_loop),
3307 new_stmt);
3308 gcc_assert (!new_bb);
3309 set_vinfo_for_stmt (new_stmt,
3310 new_stmt_vec_info (new_stmt, loop_vinfo, NULL));
3313 else
3315 vec<constructor_elt, va_gc> *v;
3317 /* iv_loop is the loop to be vectorized. Create:
3318 vec_init = [X, X+S, X+2*S, X+3*S] (S = step_expr, X = init_expr) */
3319 new_var = vect_get_new_vect_var (TREE_TYPE (vectype),
3320 vect_scalar_var, "var_");
3321 new_name = force_gimple_operand (fold_convert (TREE_TYPE (vectype),
3322 init_expr),
3323 &stmts, false, new_var);
3324 if (stmts)
3326 new_bb = gsi_insert_seq_on_edge_immediate (pe, stmts);
3327 gcc_assert (!new_bb);
3330 vec_alloc (v, nunits);
3331 bool constant_p = is_gimple_min_invariant (new_name);
3332 CONSTRUCTOR_APPEND_ELT (v, NULL_TREE, new_name);
3333 for (i = 1; i < nunits; i++)
3335 /* Create: new_name_i = new_name + step_expr */
3336 new_name = fold_build2 (PLUS_EXPR, TREE_TYPE (new_name),
3337 new_name, step_expr);
3338 if (!is_gimple_min_invariant (new_name))
3340 init_stmt = gimple_build_assign (new_var, new_name);
3341 new_name = make_ssa_name (new_var, init_stmt);
3342 gimple_assign_set_lhs (init_stmt, new_name);
3343 new_bb = gsi_insert_on_edge_immediate (pe, init_stmt);
3344 gcc_assert (!new_bb);
3345 if (dump_enabled_p ())
3347 dump_printf_loc (MSG_NOTE, vect_location,
3348 "created new init_stmt: ");
3349 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, init_stmt, 0);
3350 dump_printf (MSG_NOTE, "\n");
3352 constant_p = false;
3354 CONSTRUCTOR_APPEND_ELT (v, NULL_TREE, new_name);
3356 /* Create a vector from [new_name_0, new_name_1, ..., new_name_nunits-1] */
3357 if (constant_p)
3358 new_vec = build_vector_from_ctor (vectype, v);
3359 else
3360 new_vec = build_constructor (vectype, v);
3361 vec_init = vect_init_vector (iv_phi, new_vec, vectype, NULL);
3365 /* Create the vector that holds the step of the induction. */
3366 if (nested_in_vect_loop)
3367 /* iv_loop is nested in the loop to be vectorized. Generate:
3368 vec_step = [S, S, S, S] */
3369 new_name = step_expr;
3370 else
3372 /* iv_loop is the loop to be vectorized. Generate:
3373 vec_step = [VF*S, VF*S, VF*S, VF*S] */
3374 if (SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr)))
3376 expr = build_int_cst (integer_type_node, vf);
3377 expr = fold_convert (TREE_TYPE (step_expr), expr);
3379 else
3380 expr = build_int_cst (TREE_TYPE (step_expr), vf);
3381 new_name = fold_build2 (MULT_EXPR, TREE_TYPE (step_expr),
3382 expr, step_expr);
3383 if (TREE_CODE (step_expr) == SSA_NAME)
3384 new_name = vect_init_vector (iv_phi, new_name,
3385 TREE_TYPE (step_expr), NULL);
3388 t = unshare_expr (new_name);
3389 gcc_assert (CONSTANT_CLASS_P (new_name)
3390 || TREE_CODE (new_name) == SSA_NAME);
3391 stepvectype = get_vectype_for_scalar_type (TREE_TYPE (new_name));
3392 gcc_assert (stepvectype);
3393 new_vec = build_vector_from_val (stepvectype, t);
3394 vec_step = vect_init_vector (iv_phi, new_vec, stepvectype, NULL);
3397 /* Create the following def-use cycle:
3398 loop prolog:
3399 vec_init = ...
3400 vec_step = ...
3401 loop:
3402 vec_iv = PHI <vec_init, vec_loop>
3404 STMT
3406 vec_loop = vec_iv + vec_step; */
3408 /* Create the induction-phi that defines the induction-operand. */
3409 vec_dest = vect_get_new_vect_var (vectype, vect_simple_var, "vec_iv_");
3410 induction_phi = create_phi_node (vec_dest, iv_loop->header);
3411 set_vinfo_for_stmt (induction_phi,
3412 new_stmt_vec_info (induction_phi, loop_vinfo, NULL));
3413 induc_def = PHI_RESULT (induction_phi);
3415 /* Create the iv update inside the loop */
3416 new_stmt = gimple_build_assign_with_ops (PLUS_EXPR, vec_dest,
3417 induc_def, vec_step);
3418 vec_def = make_ssa_name (vec_dest, new_stmt);
3419 gimple_assign_set_lhs (new_stmt, vec_def);
3420 gsi_insert_before (&si, new_stmt, GSI_SAME_STMT);
3421 set_vinfo_for_stmt (new_stmt, new_stmt_vec_info (new_stmt, loop_vinfo,
3422 NULL));
3424 /* Set the arguments of the phi node: */
3425 add_phi_arg (induction_phi, vec_init, pe, UNKNOWN_LOCATION);
3426 add_phi_arg (induction_phi, vec_def, loop_latch_edge (iv_loop),
3427 UNKNOWN_LOCATION);
3430 /* In case the vectorization factor (VF) is bigger than the number
3431 of elements that we can fit in a vectype (nunits), we have to generate
3432 more than one vector stmt - i.e., we need to "unroll" the
3433 vector stmt by a factor VF/nunits. For more details see the documentation
3434 in vectorizable_operation. */
3436 if (ncopies > 1)
3438 stmt_vec_info prev_stmt_vinfo;
3439 /* FORNOW. This restriction should be relaxed. */
3440 gcc_assert (!nested_in_vect_loop);
3442 /* Create the vector that holds the step of the induction. */
3443 if (SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr)))
3445 expr = build_int_cst (integer_type_node, nunits);
3446 expr = fold_convert (TREE_TYPE (step_expr), expr);
3448 else
3449 expr = build_int_cst (TREE_TYPE (step_expr), nunits);
3450 new_name = fold_build2 (MULT_EXPR, TREE_TYPE (step_expr),
3451 expr, step_expr);
3452 if (TREE_CODE (step_expr) == SSA_NAME)
3453 new_name = vect_init_vector (iv_phi, new_name,
3454 TREE_TYPE (step_expr), NULL);
3455 t = unshare_expr (new_name);
3456 gcc_assert (CONSTANT_CLASS_P (new_name)
3457 || TREE_CODE (new_name) == SSA_NAME);
3458 new_vec = build_vector_from_val (stepvectype, t);
3459 vec_step = vect_init_vector (iv_phi, new_vec, stepvectype, NULL);
3461 vec_def = induc_def;
3462 prev_stmt_vinfo = vinfo_for_stmt (induction_phi);
3463 for (i = 1; i < ncopies; i++)
3465 /* vec_i = vec_prev + vec_step */
3466 new_stmt = gimple_build_assign_with_ops (PLUS_EXPR, vec_dest,
3467 vec_def, vec_step);
3468 vec_def = make_ssa_name (vec_dest, new_stmt);
3469 gimple_assign_set_lhs (new_stmt, vec_def);
3471 gsi_insert_before (&si, new_stmt, GSI_SAME_STMT);
3472 if (!useless_type_conversion_p (resvectype, vectype))
3474 new_stmt = gimple_build_assign_with_ops
3475 (VIEW_CONVERT_EXPR,
3476 vect_get_new_vect_var (resvectype, vect_simple_var,
3477 "vec_iv_"),
3478 build1 (VIEW_CONVERT_EXPR, resvectype,
3479 gimple_assign_lhs (new_stmt)), NULL_TREE);
3480 gimple_assign_set_lhs (new_stmt,
3481 make_ssa_name
3482 (gimple_assign_lhs (new_stmt), new_stmt));
3483 gsi_insert_before (&si, new_stmt, GSI_SAME_STMT);
3485 set_vinfo_for_stmt (new_stmt,
3486 new_stmt_vec_info (new_stmt, loop_vinfo, NULL));
3487 STMT_VINFO_RELATED_STMT (prev_stmt_vinfo) = new_stmt;
3488 prev_stmt_vinfo = vinfo_for_stmt (new_stmt);
3492 if (nested_in_vect_loop)
3494 /* Find the loop-closed exit-phi of the induction, and record
3495 the final vector of induction results: */
3496 exit_phi = NULL;
3497 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, loop_arg)
3499 if (!flow_bb_inside_loop_p (iv_loop, gimple_bb (USE_STMT (use_p))))
3501 exit_phi = USE_STMT (use_p);
3502 break;
3505 if (exit_phi)
3507 stmt_vec_info stmt_vinfo = vinfo_for_stmt (exit_phi);
3508 /* FORNOW. Currently not supporting the case that an inner-loop induction
3509 is not used in the outer-loop (i.e. only outside the outer-loop). */
3510 gcc_assert (STMT_VINFO_RELEVANT_P (stmt_vinfo)
3511 && !STMT_VINFO_LIVE_P (stmt_vinfo));
3513 STMT_VINFO_VEC_STMT (stmt_vinfo) = new_stmt;
3514 if (dump_enabled_p ())
3516 dump_printf_loc (MSG_NOTE, vect_location,
3517 "vector of inductions after inner-loop:");
3518 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, new_stmt, 0);
3519 dump_printf (MSG_NOTE, "\n");
3525 if (dump_enabled_p ())
3527 dump_printf_loc (MSG_NOTE, vect_location,
3528 "transform induction: created def-use cycle: ");
3529 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, induction_phi, 0);
3530 dump_printf (MSG_NOTE, "\n");
3531 dump_gimple_stmt (MSG_NOTE, TDF_SLIM,
3532 SSA_NAME_DEF_STMT (vec_def), 0);
3533 dump_printf (MSG_NOTE, "\n");
3536 STMT_VINFO_VEC_STMT (phi_info) = induction_phi;
3537 if (!useless_type_conversion_p (resvectype, vectype))
3539 new_stmt = gimple_build_assign_with_ops
3540 (VIEW_CONVERT_EXPR,
3541 vect_get_new_vect_var (resvectype, vect_simple_var, "vec_iv_"),
3542 build1 (VIEW_CONVERT_EXPR, resvectype, induc_def), NULL_TREE);
3543 induc_def = make_ssa_name (gimple_assign_lhs (new_stmt), new_stmt);
3544 gimple_assign_set_lhs (new_stmt, induc_def);
3545 si = gsi_after_labels (bb);
3546 gsi_insert_before (&si, new_stmt, GSI_SAME_STMT);
3547 set_vinfo_for_stmt (new_stmt,
3548 new_stmt_vec_info (new_stmt, loop_vinfo, NULL));
3549 STMT_VINFO_RELATED_STMT (vinfo_for_stmt (new_stmt))
3550 = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (induction_phi));
3553 return induc_def;
3557 /* Function get_initial_def_for_reduction
3559 Input:
3560 STMT - a stmt that performs a reduction operation in the loop.
3561 INIT_VAL - the initial value of the reduction variable
3563 Output:
3564 ADJUSTMENT_DEF - a tree that holds a value to be added to the final result
3565 of the reduction (used for adjusting the epilog - see below).
3566 Return a vector variable, initialized according to the operation that STMT
3567 performs. This vector will be used as the initial value of the
3568 vector of partial results.
3570 Option1 (adjust in epilog): Initialize the vector as follows:
3571 add/bit or/xor: [0,0,...,0,0]
3572 mult/bit and: [1,1,...,1,1]
3573 min/max/cond_expr: [init_val,init_val,..,init_val,init_val]
3574 and when necessary (e.g. add/mult case) let the caller know
3575 that it needs to adjust the result by init_val.
3577 Option2: Initialize the vector as follows:
3578 add/bit or/xor: [init_val,0,0,...,0]
3579 mult/bit and: [init_val,1,1,...,1]
3580 min/max/cond_expr: [init_val,init_val,...,init_val]
3581 and no adjustments are needed.
3583 For example, for the following code:
3585 s = init_val;
3586 for (i=0;i<n;i++)
3587 s = s + a[i];
3589 STMT is 's = s + a[i]', and the reduction variable is 's'.
3590 For a vector of 4 units, we want to return either [0,0,0,init_val],
3591 or [0,0,0,0] and let the caller know that it needs to adjust
3592 the result at the end by 'init_val'.
3594 FORNOW, we use the 'adjust in epilog' scheme (Option1) when ADJUSTMENT_DEF
3595 is not NULL, because this way the initialization vector is simpler (same
3596 element in all entries); otherwise we use Option2.
3598 A cost model should help decide between these two schemes. */
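/* E.g. for a product reduction s = s * a[i] with VF == 4, Option1 builds
   [1,1,1,1] and sets ADJUSTMENT_DEF to init_val so the caller folds the
   initial value back into the final result; Option2 would instead build
   [init_val,1,1,1] with no adjustment needed.  */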
3600 tree
3601 get_initial_def_for_reduction (gimple stmt, tree init_val,
3602 tree *adjustment_def)
3604 stmt_vec_info stmt_vinfo = vinfo_for_stmt (stmt);
3605 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_vinfo);
3606 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
3607 tree scalar_type = TREE_TYPE (init_val);
3608 tree vectype = get_vectype_for_scalar_type (scalar_type);
3609 int nunits;
3610 enum tree_code code = gimple_assign_rhs_code (stmt);
3611 tree def_for_init;
3612 tree init_def;
3613 tree *elts;
3614 int i;
3615 bool nested_in_vect_loop = false;
3616 tree init_value;
3617 REAL_VALUE_TYPE real_init_val = dconst0;
3618 int int_init_val = 0;
3619 gimple def_stmt = NULL;
3621 gcc_assert (vectype);
3622 nunits = TYPE_VECTOR_SUBPARTS (vectype);
3624 gcc_assert (POINTER_TYPE_P (scalar_type) || INTEGRAL_TYPE_P (scalar_type)
3625 || SCALAR_FLOAT_TYPE_P (scalar_type));
3627 if (nested_in_vect_loop_p (loop, stmt))
3628 nested_in_vect_loop = true;
3629 else
3630 gcc_assert (loop == (gimple_bb (stmt))->loop_father);
3632 /* In case of double reduction we only create a vector variable to be put
3633 in the reduction phi node. The actual statement creation is done in
3634 vect_create_epilog_for_reduction. */
3635 if (adjustment_def && nested_in_vect_loop
3636 && TREE_CODE (init_val) == SSA_NAME
3637 && (def_stmt = SSA_NAME_DEF_STMT (init_val))
3638 && gimple_code (def_stmt) == GIMPLE_PHI
3639 && flow_bb_inside_loop_p (loop, gimple_bb (def_stmt))
3640 && vinfo_for_stmt (def_stmt)
3641 && STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def_stmt))
3642 == vect_double_reduction_def)
3644 *adjustment_def = NULL;
3645 return vect_create_destination_var (init_val, vectype);
3648 if (TREE_CONSTANT (init_val))
3650 if (SCALAR_FLOAT_TYPE_P (scalar_type))
3651 init_value = build_real (scalar_type, TREE_REAL_CST (init_val));
3652 else
3653 init_value = build_int_cst (scalar_type, TREE_INT_CST_LOW (init_val));
3655 else
3656 init_value = init_val;
3658 switch (code)
3660 case WIDEN_SUM_EXPR:
3661 case DOT_PROD_EXPR:
3662 case PLUS_EXPR:
3663 case MINUS_EXPR:
3664 case BIT_IOR_EXPR:
3665 case BIT_XOR_EXPR:
3666 case MULT_EXPR:
3667 case BIT_AND_EXPR:
3668 /* ADJUSTMENT_DEF is NULL when called from
3669 vect_create_epilog_for_reduction to vectorize double reduction. */
3670 if (adjustment_def)
3672 if (nested_in_vect_loop)
3673 *adjustment_def = vect_get_vec_def_for_operand (init_val, stmt,
3674 NULL);
3675 else
3676 *adjustment_def = init_val;
3679 if (code == MULT_EXPR)
3681 real_init_val = dconst1;
3682 int_init_val = 1;
3685 if (code == BIT_AND_EXPR)
3686 int_init_val = -1;
3688 if (SCALAR_FLOAT_TYPE_P (scalar_type))
3689 def_for_init = build_real (scalar_type, real_init_val);
3690 else
3691 def_for_init = build_int_cst (scalar_type, int_init_val);
3693 /* Fill all elements but the first with '0' or '1'. */
3694 elts = XALLOCAVEC (tree, nunits);
3695 for (i = nunits - 2; i >= 0; --i)
3696 elts[i + 1] = def_for_init;
3698 /* Option1: the first element is '0' or '1' as well. */
3699 if (adjustment_def)
3701 elts[0] = def_for_init;
3702 init_def = build_vector (vectype, elts);
3703 break;
3706 /* Option2: the first element is INIT_VAL. */
3707 elts[0] = init_val;
3708 if (TREE_CONSTANT (init_val))
3709 init_def = build_vector (vectype, elts);
3710 else
3712 vec<constructor_elt, va_gc> *v;
3713 vec_alloc (v, nunits);
3714 CONSTRUCTOR_APPEND_ELT (v, NULL_TREE, init_val);
3715 for (i = 1; i < nunits; ++i)
3716 CONSTRUCTOR_APPEND_ELT (v, NULL_TREE, elts[i]);
3717 init_def = build_constructor (vectype, v);
3720 break;
3722 case MIN_EXPR:
3723 case MAX_EXPR:
3724 case COND_EXPR:
3725 if (adjustment_def)
3727 *adjustment_def = NULL_TREE;
3728 init_def = vect_get_vec_def_for_operand (init_val, stmt, NULL);
3729 break;
3732 init_def = build_vector_from_val (vectype, init_value);
3733 break;
3735 default:
3736 gcc_unreachable ();
3739 return init_def;
3743 /* Function vect_create_epilog_for_reduction
3745 Create code at the loop-epilog to finalize the result of a reduction
3746 computation.
3748 VECT_DEFS is list of vector of partial results, i.e., the lhs's of vector
3749 reduction statements.
3750 STMT is the scalar reduction stmt that is being vectorized.
3751 NCOPIES is > 1 in case the vectorization factor (VF) is bigger than the
3752 number of elements that we can fit in a vectype (nunits). In this case
3753 we have to generate more than one vector stmt - i.e - we need to "unroll"
3754 the vector stmt by a factor VF/nunits. For more details see documentation
3755 in vectorizable_operation.
3756 REDUC_CODE is the tree-code for the epilog reduction.
3757 REDUCTION_PHIS is a list of the phi-nodes that carry the reduction
3758 computation.
3759 REDUC_INDEX is the index of the operand in the right hand side of the
3760 statement that is defined by REDUCTION_PHI.
3761 DOUBLE_REDUC is TRUE if double reduction phi nodes should be handled.
3762 SLP_NODE is an SLP node containing a group of reduction statements. The
3763 first one in this group is STMT.
3765 This function:
3766 1. Creates the reduction def-use cycles: sets the arguments for
3767 REDUCTION_PHIS:
3768 The loop-entry argument is the vectorized initial-value of the reduction.
3769 The loop-latch argument is taken from VECT_DEFS - the vector of partial
3770 sums.
3771 2. "Reduces" each vector of partial results VECT_DEFS into a single result,
3772 by applying the operation specified by REDUC_CODE if available, or by
3773 other means (whole-vector shifts or a scalar loop).
3774 The function also creates a new phi node at the loop exit to preserve
3775 loop-closed form, as illustrated below.
3777 The flow at the entry to this function:
3779 loop:
3780 vec_def = phi <null, null> # REDUCTION_PHI
3781 VECT_DEF = vector_stmt # vectorized form of STMT
3782 s_loop = scalar_stmt # (scalar) STMT
3783 loop_exit:
3784 s_out0 = phi <s_loop> # (scalar) EXIT_PHI
3785 use <s_out0>
3786 use <s_out0>
3788 The above is transformed by this function into:
3790 loop:
3791 vec_def = phi <vec_init, VECT_DEF> # REDUCTION_PHI
3792 VECT_DEF = vector_stmt # vectorized form of STMT
3793 s_loop = scalar_stmt # (scalar) STMT
3794 loop_exit:
3795 s_out0 = phi <s_loop> # (scalar) EXIT_PHI
3796 v_out1 = phi <VECT_DEF> # NEW_EXIT_PHI
3797 v_out2 = reduce <v_out1>
3798 s_out3 = extract_field <v_out2, 0>
3799 s_out4 = adjust_result <s_out3>
3800 use <s_out4>
3801 use <s_out4>
3804 static void
3805 vect_create_epilog_for_reduction (vec<tree> vect_defs, gimple stmt,
3806 int ncopies, enum tree_code reduc_code,
3807 vec<gimple> reduction_phis,
3808 int reduc_index, bool double_reduc,
3809 slp_tree slp_node)
3811 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
3812 stmt_vec_info prev_phi_info;
3813 tree vectype;
3814 enum machine_mode mode;
3815 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
3816 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo), *outer_loop = NULL;
3817 basic_block exit_bb;
3818 tree scalar_dest;
3819 tree scalar_type;
3820 gimple new_phi = NULL, phi;
3821 gimple_stmt_iterator exit_gsi;
3822 tree vec_dest;
3823 tree new_temp = NULL_TREE, new_dest, new_name, new_scalar_dest;
3824 gimple epilog_stmt = NULL;
3825 enum tree_code code = gimple_assign_rhs_code (stmt);
3826 gimple exit_phi;
3827 tree bitsize, bitpos;
3828 tree adjustment_def = NULL;
3829 tree vec_initial_def = NULL;
3830 tree reduction_op, expr, def;
3831 tree orig_name, scalar_result;
3832 imm_use_iterator imm_iter, phi_imm_iter;
3833 use_operand_p use_p, phi_use_p;
3834 bool extract_scalar_result = false;
3835 gimple use_stmt, orig_stmt, reduction_phi = NULL;
3836 bool nested_in_vect_loop = false;
3837 auto_vec<gimple> new_phis;
3838 auto_vec<gimple> inner_phis;
3839 enum vect_def_type dt = vect_unknown_def_type;
3840 int j, i;
3841 auto_vec<tree> scalar_results;
3842 unsigned int group_size = 1, k, ratio;
3843 auto_vec<tree> vec_initial_defs;
3844 auto_vec<gimple> phis;
3845 bool slp_reduc = false;
3846 tree new_phi_result;
3847 gimple inner_phi = NULL;
3849 if (slp_node)
3850 group_size = SLP_TREE_SCALAR_STMTS (slp_node).length ();
3852 if (nested_in_vect_loop_p (loop, stmt))
3854 outer_loop = loop;
3855 loop = loop->inner;
3856 nested_in_vect_loop = true;
3857 gcc_assert (!slp_node);
3860 switch (get_gimple_rhs_class (gimple_assign_rhs_code (stmt)))
3862 case GIMPLE_SINGLE_RHS:
3863 gcc_assert (TREE_OPERAND_LENGTH (gimple_assign_rhs1 (stmt))
3864 == ternary_op);
3865 reduction_op = TREE_OPERAND (gimple_assign_rhs1 (stmt), reduc_index);
3866 break;
3867 case GIMPLE_UNARY_RHS:
3868 reduction_op = gimple_assign_rhs1 (stmt);
3869 break;
3870 case GIMPLE_BINARY_RHS:
3871 reduction_op = reduc_index ?
3872 gimple_assign_rhs2 (stmt) : gimple_assign_rhs1 (stmt);
3873 break;
3874 case GIMPLE_TERNARY_RHS:
3875 reduction_op = gimple_op (stmt, reduc_index + 1);
3876 break;
3877 default:
3878 gcc_unreachable ();
3881 vectype = get_vectype_for_scalar_type (TREE_TYPE (reduction_op));
3882 gcc_assert (vectype);
3883 mode = TYPE_MODE (vectype);
3885 /* 1. Create the reduction def-use cycle:
3886 Set the arguments of REDUCTION_PHIS, i.e., transform
3888 loop:
3889 vec_def = phi <null, null> # REDUCTION_PHI
3890 VECT_DEF = vector_stmt # vectorized form of STMT
3893 into:
3895 loop:
3896 vec_def = phi <vec_init, VECT_DEF> # REDUCTION_PHI
3897 VECT_DEF = vector_stmt # vectorized form of STMT
3900 (in case of SLP, do it for all the phis). */
3902 /* Get the loop-entry arguments. */
3903 if (slp_node)
3904 vect_get_vec_defs (reduction_op, NULL_TREE, stmt, &vec_initial_defs,
3905 NULL, slp_node, reduc_index);
3906 else
3908 vec_initial_defs.create (1);
3909 /* For the case of reduction, vect_get_vec_def_for_operand returns
3910 the scalar def before the loop, that defines the initial value
3911 of the reduction variable. */
3912 vec_initial_def = vect_get_vec_def_for_operand (reduction_op, stmt,
3913 &adjustment_def);
3914 vec_initial_defs.quick_push (vec_initial_def);
3917 /* Set phi nodes arguments. */
3918 FOR_EACH_VEC_ELT (reduction_phis, i, phi)
3920 tree vec_init_def = vec_initial_defs[i];
3921 tree def = vect_defs[i];
3922 for (j = 0; j < ncopies; j++)
3924 /* Set the loop-entry arg of the reduction-phi. */
3925 add_phi_arg (phi, vec_init_def, loop_preheader_edge (loop),
3926 UNKNOWN_LOCATION);
3928 /* Set the loop-latch arg for the reduction-phi. */
3929 if (j > 0)
3930 def = vect_get_vec_def_for_stmt_copy (vect_unknown_def_type, def);
3932 add_phi_arg (phi, def, loop_latch_edge (loop), UNKNOWN_LOCATION);
3934 if (dump_enabled_p ())
3936 dump_printf_loc (MSG_NOTE, vect_location,
3937 "transform reduction: created def-use cycle: ");
3938 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, phi, 0);
3939 dump_printf (MSG_NOTE, "\n");
3940 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, SSA_NAME_DEF_STMT (def), 0);
3941 dump_printf (MSG_NOTE, "\n");
3944 phi = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (phi));
3948 /* 2. Create epilog code.
3949 The reduction epilog code operates across the elements of the vector
3950 of partial results computed by the vectorized loop.
3951 The reduction epilog code consists of:
3953 step 1: compute the scalar result in a vector (v_out2)
3954 step 2: extract the scalar result (s_out3) from the vector (v_out2)
3955 step 3: adjust the scalar result (s_out3) if needed.
3957 Step 1 can be accomplished using one of the following three schemes:
3958 (scheme 1) using reduc_code, if available.
3959 (scheme 2) using whole-vector shifts, if available.
3960 (scheme 3) using a scalar loop. In this case steps 1+2 above are
3961 combined.
3963 The overall epilog code looks like this:
3965 s_out0 = phi <s_loop> # original EXIT_PHI
3966 v_out1 = phi <VECT_DEF> # NEW_EXIT_PHI
3967 v_out2 = reduce <v_out1> # step 1
3968 s_out3 = extract_field <v_out2, 0> # step 2
3969 s_out4 = adjust_result <s_out3> # step 3
3971 (step 3 is optional, and steps 1 and 2 may be combined).
3972 Lastly, the uses of s_out0 are replaced by s_out4. */
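/* As an illustration (hypothetical values, PLUS reduction, 4 elements,
   scalar initial value 10 kept as the epilog adjustment):
     v_out1 = {1,2,3,4}
     step 1:  v_out2 = reduce <v_out1>               lanes sum to 10
     step 2:  s_out3 = extract_field <v_out2, pos>   -> 10
     step 3:  s_out4 = s_out3 + 10                   -> 20
   where 'pos' is the lane in which the target's reduction leaves the
   result.  */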
3975 /* 2.1 Create new loop-exit-phis to preserve loop-closed form:
3976 v_out1 = phi <VECT_DEF>
3977 Store them in NEW_PHIS. */
3979 exit_bb = single_exit (loop)->dest;
3980 prev_phi_info = NULL;
3981 new_phis.create (vect_defs.length ());
3982 FOR_EACH_VEC_ELT (vect_defs, i, def)
3984 for (j = 0; j < ncopies; j++)
3986 tree new_def = copy_ssa_name (def, NULL);
3987 phi = create_phi_node (new_def, exit_bb);
3988 set_vinfo_for_stmt (phi, new_stmt_vec_info (phi, loop_vinfo, NULL));
3989 if (j == 0)
3990 new_phis.quick_push (phi);
3991 else
3993 def = vect_get_vec_def_for_stmt_copy (dt, def);
3994 STMT_VINFO_RELATED_STMT (prev_phi_info) = phi;
3997 SET_PHI_ARG_DEF (phi, single_exit (loop)->dest_idx, def);
3998 prev_phi_info = vinfo_for_stmt (phi);
4002 /* The epilogue is created for the outer-loop, i.e., for the loop being
4003 vectorized. Create exit phis for the outer loop. */
4004 if (double_reduc)
4006 loop = outer_loop;
4007 exit_bb = single_exit (loop)->dest;
4008 inner_phis.create (vect_defs.length ());
4009 FOR_EACH_VEC_ELT (new_phis, i, phi)
4011 tree new_result = copy_ssa_name (PHI_RESULT (phi), NULL);
4012 gimple outer_phi = create_phi_node (new_result, exit_bb);
4013 SET_PHI_ARG_DEF (outer_phi, single_exit (loop)->dest_idx,
4014 PHI_RESULT (phi));
4015 set_vinfo_for_stmt (outer_phi, new_stmt_vec_info (outer_phi,
4016 loop_vinfo, NULL));
4017 inner_phis.quick_push (phi);
4018 new_phis[i] = outer_phi;
4019 prev_phi_info = vinfo_for_stmt (outer_phi);
4020 while (STMT_VINFO_RELATED_STMT (vinfo_for_stmt (phi)))
4022 phi = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (phi));
4023 new_result = copy_ssa_name (PHI_RESULT (phi), NULL);
4024 outer_phi = create_phi_node (new_result, exit_bb);
4025 SET_PHI_ARG_DEF (outer_phi, single_exit (loop)->dest_idx,
4026 PHI_RESULT (phi));
4027 set_vinfo_for_stmt (outer_phi, new_stmt_vec_info (outer_phi,
4028 loop_vinfo, NULL));
4029 STMT_VINFO_RELATED_STMT (prev_phi_info) = outer_phi;
4030 prev_phi_info = vinfo_for_stmt (outer_phi);
4035 exit_gsi = gsi_after_labels (exit_bb);
4037 /* 2.2 Get the relevant tree-code to use in the epilog for schemes 2,3
4038 (i.e. when reduc_code is not available) and in the final adjustment
4039 code (if needed). Also get the original scalar reduction variable as
4040 defined in the loop. In case STMT is a "pattern-stmt" (i.e. - it
4041 represents a reduction pattern), the tree-code and scalar-def are
4042 taken from the original stmt that the pattern-stmt (STMT) replaces.
4043 Otherwise (it is a regular reduction) - the tree-code and scalar-def
4044 are taken from STMT. */
4046 orig_stmt = STMT_VINFO_RELATED_STMT (stmt_info);
4047 if (!orig_stmt)
4049 /* Regular reduction */
4050 orig_stmt = stmt;
4052 else
4054 /* Reduction pattern */
4055 stmt_vec_info stmt_vinfo = vinfo_for_stmt (orig_stmt);
4056 gcc_assert (STMT_VINFO_IN_PATTERN_P (stmt_vinfo));
4057 gcc_assert (STMT_VINFO_RELATED_STMT (stmt_vinfo) == stmt);
4060 code = gimple_assign_rhs_code (orig_stmt);
4061 /* For MINUS_EXPR the initial vector is [init_val,0,...,0], therefore,
4062 partial results are added and not subtracted. */
4063 if (code == MINUS_EXPR)
4064 code = PLUS_EXPR;
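/* E.g. (illustrative): for 's = s - a[i]' with init_val = 10, VF = 4 and
   a = {1,2,3,4}, the loop computes {10,0,0,0} - {1,2,3,4} = {9,-2,-3,-4};
   adding the lanes with PLUS_EXPR gives 0, i.e. 10 - (1+2+3+4), which is
   why the partial results must be added rather than subtracted here.  */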
4066 scalar_dest = gimple_assign_lhs (orig_stmt);
4067 scalar_type = TREE_TYPE (scalar_dest);
4068 scalar_results.create (group_size);
4069 new_scalar_dest = vect_create_destination_var (scalar_dest, NULL);
4070 bitsize = TYPE_SIZE (scalar_type);
4072 /* In case this is a reduction in an inner-loop while vectorizing an outer
4073 loop - we don't need to extract a single scalar result at the end of the
4074 inner-loop (unless it is double reduction, i.e., the use of reduction is
4075 outside the outer-loop). The final vector of partial results will be used
4076 in the vectorized outer-loop, or reduced to a scalar result at the end of
4077 the outer-loop. */
4078 if (nested_in_vect_loop && !double_reduc)
4079 goto vect_finalize_reduction;
4081 /* SLP reduction without reduction chain, e.g.,
4082 # a1 = phi <a2, a0>
4083 # b1 = phi <b2, b0>
4084 a2 = operation (a1)
4085 b2 = operation (b1) */
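/* Such a group of independent reductions can come, for instance, from a
   scalar loop like (illustrative):
     for (i = 0; i < n; i++)
       {
         suma += a[2*i];
         sumb += a[2*i+1];
       }
   with both suma and sumb live after the loop.  */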
4086 slp_reduc = (slp_node && !GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)));
4088 /* In case of reduction chain, e.g.,
4089 # a1 = phi <a3, a0>
4090 a2 = operation (a1)
4091 a3 = operation (a2),
4093 we may end up with more than one vector result. Here we reduce them to
4094 one vector. */
4095 if (GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)))
4097 tree first_vect = PHI_RESULT (new_phis[0]);
4098 tree tmp;
4099 gimple new_vec_stmt = NULL;
4101 vec_dest = vect_create_destination_var (scalar_dest, vectype);
4102 for (k = 1; k < new_phis.length (); k++)
4104 gimple next_phi = new_phis[k];
4105 tree second_vect = PHI_RESULT (next_phi);
4107 tmp = build2 (code, vectype, first_vect, second_vect);
4108 new_vec_stmt = gimple_build_assign (vec_dest, tmp);
4109 first_vect = make_ssa_name (vec_dest, new_vec_stmt);
4110 gimple_assign_set_lhs (new_vec_stmt, first_vect);
4111 gsi_insert_before (&exit_gsi, new_vec_stmt, GSI_SAME_STMT);
4114 new_phi_result = first_vect;
4115 if (new_vec_stmt)
4117 new_phis.truncate (0);
4118 new_phis.safe_push (new_vec_stmt);
4121 else
4122 new_phi_result = PHI_RESULT (new_phis[0]);
4124 /* 2.3 Create the reduction code, using one of the three schemes described
4125 above. In SLP we simply need to extract all the elements from the
4126 vector (without reducing them), so we use scalar shifts. */
4127 if (reduc_code != ERROR_MARK && !slp_reduc)
4129 tree tmp;
4131 /*** Case 1: Create:
4132 v_out2 = reduc_expr <v_out1> */
4134 if (dump_enabled_p ())
4135 dump_printf_loc (MSG_NOTE, vect_location,
4136 "Reduce using direct vector reduction.\n");
4138 vec_dest = vect_create_destination_var (scalar_dest, vectype);
4139 tmp = build1 (reduc_code, vectype, new_phi_result);
4140 epilog_stmt = gimple_build_assign (vec_dest, tmp);
4141 new_temp = make_ssa_name (vec_dest, epilog_stmt);
4142 gimple_assign_set_lhs (epilog_stmt, new_temp);
4143 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
4145 extract_scalar_result = true;
4147 else
4149 enum tree_code shift_code = ERROR_MARK;
4150 bool have_whole_vector_shift = true;
4151 int bit_offset;
4152 int element_bitsize = tree_to_uhwi (bitsize);
4153 int vec_size_in_bits = tree_to_uhwi (TYPE_SIZE (vectype));
4154 tree vec_temp;
4156 if (optab_handler (vec_shr_optab, mode) != CODE_FOR_nothing)
4157 shift_code = VEC_RSHIFT_EXPR;
4158 else
4159 have_whole_vector_shift = false;
4161 /* Regardless of whether we have a whole vector shift, if we're
4162 emulating the operation via tree-vect-generic, we don't want
4163 to use it. Only the first round of the reduction is likely
4164 to still be profitable via emulation. */
4165 /* ??? It might be better to emit a reduction tree code here, so that
4166 tree-vect-generic can expand the first round via bit tricks. */
4167 if (!VECTOR_MODE_P (mode))
4168 have_whole_vector_shift = false;
4169 else
4171 optab optab = optab_for_tree_code (code, vectype, optab_default);
4172 if (optab_handler (optab, mode) == CODE_FOR_nothing)
4173 have_whole_vector_shift = false;
4176 if (have_whole_vector_shift && !slp_reduc)
4178 /*** Case 2: Create:
4179 for (offset = VS/2; offset >= element_size; offset/=2)
4181 Create: va' = vec_shift <va, offset>
4182 Create: va = vop <va, va'>
4183 } */
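/* Illustration (hypothetical 128-bit vector of four 32-bit elements,
   one possible element ordering):
     va            = {a0, a1, a2, a3}
     offset = 64:  va' = vec_shift <va, 64>;  va = va + va'
                   -> the two halves are combined pairwise
     offset = 32:  va' = vec_shift <va, 32>;  va = va + va'
                   -> one lane now holds a0+a1+a2+a3
   and step 2.4 below extracts that lane as the scalar result.  */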
4185 if (dump_enabled_p ())
4186 dump_printf_loc (MSG_NOTE, vect_location,
4187 "Reduce using vector shifts\n");
4189 vec_dest = vect_create_destination_var (scalar_dest, vectype);
4190 new_temp = new_phi_result;
4191 for (bit_offset = vec_size_in_bits/2;
4192 bit_offset >= element_bitsize;
4193 bit_offset /= 2)
4195 tree bitpos = size_int (bit_offset);
4197 epilog_stmt = gimple_build_assign_with_ops (shift_code,
4198 vec_dest, new_temp, bitpos);
4199 new_name = make_ssa_name (vec_dest, epilog_stmt);
4200 gimple_assign_set_lhs (epilog_stmt, new_name);
4201 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
4203 epilog_stmt = gimple_build_assign_with_ops (code, vec_dest,
4204 new_name, new_temp);
4205 new_temp = make_ssa_name (vec_dest, epilog_stmt);
4206 gimple_assign_set_lhs (epilog_stmt, new_temp);
4207 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
4210 extract_scalar_result = true;
4212 else
4214 tree rhs;
4216 /*** Case 3: Create:
4217 s = extract_field <v_out2, 0>
4218 for (offset = element_size;
4219 offset < vector_size;
4220 offset += element_size)
4222 Create: s' = extract_field <v_out2, offset>
4223 Create: s = op <s, s'> // For non SLP cases
4224 } */
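/* Illustration (hypothetical 4 x 32-bit vector {a0,a1,a2,a3}, non-SLP):
     s  = extract_field <v_out2, 0>                  -> a0
     s' = extract_field <v_out2, 32>;  s = s op s'   -> a0 op a1
     s' = extract_field <v_out2, 64>;  s = s op s'   -> a0 op a1 op a2
     s' = extract_field <v_out2, 96>;  s = s op s'   -> final scalar
   In the SLP case the extracted values are collected into SCALAR_RESULTS
   instead of being combined.  */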
4226 if (dump_enabled_p ())
4227 dump_printf_loc (MSG_NOTE, vect_location,
4228 "Reduce using scalar code.\n");
4230 vec_size_in_bits = tree_to_uhwi (TYPE_SIZE (vectype));
4231 FOR_EACH_VEC_ELT (new_phis, i, new_phi)
4233 if (gimple_code (new_phi) == GIMPLE_PHI)
4234 vec_temp = PHI_RESULT (new_phi);
4235 else
4236 vec_temp = gimple_assign_lhs (new_phi);
4237 rhs = build3 (BIT_FIELD_REF, scalar_type, vec_temp, bitsize,
4238 bitsize_zero_node);
4239 epilog_stmt = gimple_build_assign (new_scalar_dest, rhs);
4240 new_temp = make_ssa_name (new_scalar_dest, epilog_stmt);
4241 gimple_assign_set_lhs (epilog_stmt, new_temp);
4242 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
4244 /* In SLP we don't need to apply reduction operation, so we just
4245 collect s' values in SCALAR_RESULTS. */
4246 if (slp_reduc)
4247 scalar_results.safe_push (new_temp);
4249 for (bit_offset = element_bitsize;
4250 bit_offset < vec_size_in_bits;
4251 bit_offset += element_bitsize)
4253 tree bitpos = bitsize_int (bit_offset);
4254 tree rhs = build3 (BIT_FIELD_REF, scalar_type, vec_temp,
4255 bitsize, bitpos);
4257 epilog_stmt = gimple_build_assign (new_scalar_dest, rhs);
4258 new_name = make_ssa_name (new_scalar_dest, epilog_stmt);
4259 gimple_assign_set_lhs (epilog_stmt, new_name);
4260 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
4262 if (slp_reduc)
4264 /* In SLP we don't need to apply reduction operation, so
4265 we just collect s' values in SCALAR_RESULTS. */
4266 new_temp = new_name;
4267 scalar_results.safe_push (new_name);
4269 else
4271 epilog_stmt = gimple_build_assign_with_ops (code,
4272 new_scalar_dest, new_name, new_temp);
4273 new_temp = make_ssa_name (new_scalar_dest, epilog_stmt);
4274 gimple_assign_set_lhs (epilog_stmt, new_temp);
4275 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
4280 /* The only case where we need to reduce scalar results in SLP is
4281 unrolling. If the size of SCALAR_RESULTS is greater than
4282 GROUP_SIZE, we reduce them by combining elements modulo
4283 GROUP_SIZE. */
4284 if (slp_reduc)
4286 tree res, first_res, new_res;
4287 gimple new_stmt;
4289 /* Reduce multiple scalar results in case of SLP unrolling. */
4290 for (j = group_size; scalar_results.iterate (j, &res);
4291 j++)
4293 first_res = scalar_results[j % group_size];
4294 new_stmt = gimple_build_assign_with_ops (code,
4295 new_scalar_dest, first_res, res);
4296 new_res = make_ssa_name (new_scalar_dest, new_stmt);
4297 gimple_assign_set_lhs (new_stmt, new_res);
4298 gsi_insert_before (&exit_gsi, new_stmt, GSI_SAME_STMT);
4299 scalar_results[j % group_size] = new_res;
4302 else
4303 /* Not SLP - we have one scalar to keep in SCALAR_RESULTS. */
4304 scalar_results.safe_push (new_temp);
4306 extract_scalar_result = false;
4310 /* 2.4 Extract the final scalar result. Create:
4311 s_out3 = extract_field <v_out2, bitpos> */
4313 if (extract_scalar_result)
4315 tree rhs;
4317 if (dump_enabled_p ())
4318 dump_printf_loc (MSG_NOTE, vect_location,
4319 "extract scalar result\n");
4321 if (BYTES_BIG_ENDIAN)
4322 bitpos = size_binop (MULT_EXPR,
4323 bitsize_int (TYPE_VECTOR_SUBPARTS (vectype) - 1),
4324 TYPE_SIZE (scalar_type));
4325 else
4326 bitpos = bitsize_zero_node;
4328 rhs = build3 (BIT_FIELD_REF, scalar_type, new_temp, bitsize, bitpos);
4329 epilog_stmt = gimple_build_assign (new_scalar_dest, rhs);
4330 new_temp = make_ssa_name (new_scalar_dest, epilog_stmt);
4331 gimple_assign_set_lhs (epilog_stmt, new_temp);
4332 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
4333 scalar_results.safe_push (new_temp);
4336 vect_finalize_reduction:
4338 if (double_reduc)
4339 loop = loop->inner;
4341 /* 2.5 Adjust the final result by the initial value of the reduction
4342 variable. (When such adjustment is not needed, then
4343 'adjustment_def' is zero). For example, if code is PLUS we create:
4344 new_temp = loop_exit_def + adjustment_def */
4346 if (adjustment_def)
4348 gcc_assert (!slp_reduc);
4349 if (nested_in_vect_loop)
4351 new_phi = new_phis[0];
4352 gcc_assert (TREE_CODE (TREE_TYPE (adjustment_def)) == VECTOR_TYPE);
4353 expr = build2 (code, vectype, PHI_RESULT (new_phi), adjustment_def);
4354 new_dest = vect_create_destination_var (scalar_dest, vectype);
4356 else
4358 new_temp = scalar_results[0];
4359 gcc_assert (TREE_CODE (TREE_TYPE (adjustment_def)) != VECTOR_TYPE);
4360 expr = build2 (code, scalar_type, new_temp, adjustment_def);
4361 new_dest = vect_create_destination_var (scalar_dest, scalar_type);
4364 epilog_stmt = gimple_build_assign (new_dest, expr);
4365 new_temp = make_ssa_name (new_dest, epilog_stmt);
4366 gimple_assign_set_lhs (epilog_stmt, new_temp);
4367 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
4368 if (nested_in_vect_loop)
4370 set_vinfo_for_stmt (epilog_stmt,
4371 new_stmt_vec_info (epilog_stmt, loop_vinfo,
4372 NULL));
4373 STMT_VINFO_RELATED_STMT (vinfo_for_stmt (epilog_stmt)) =
4374 STMT_VINFO_RELATED_STMT (vinfo_for_stmt (new_phi));
4376 if (!double_reduc)
4377 scalar_results.quick_push (new_temp);
4378 else
4379 scalar_results[0] = new_temp;
4381 else
4382 scalar_results[0] = new_temp;
4384 new_phis[0] = epilog_stmt;
4387 /* 2.6 Handle the loop-exit phis. Replace the uses of scalar loop-exit
4388 phis with new adjusted scalar results, i.e., replace use <s_out0>
4389 with use <s_out4>.
4391 Transform:
4392 loop_exit:
4393 s_out0 = phi <s_loop> # (scalar) EXIT_PHI
4394 v_out1 = phi <VECT_DEF> # NEW_EXIT_PHI
4395 v_out2 = reduce <v_out1>
4396 s_out3 = extract_field <v_out2, 0>
4397 s_out4 = adjust_result <s_out3>
4398 use <s_out0>
4399 use <s_out0>
4401 into:
4403 loop_exit:
4404 s_out0 = phi <s_loop> # (scalar) EXIT_PHI
4405 v_out1 = phi <VECT_DEF> # NEW_EXIT_PHI
4406 v_out2 = reduce <v_out1>
4407 s_out3 = extract_field <v_out2, 0>
4408 s_out4 = adjust_result <s_out3>
4409 use <s_out4>
4410 use <s_out4> */
4413 /* In SLP reduction chain we reduce vector results into one vector if
4414 necessary, hence we set here GROUP_SIZE to 1. SCALAR_DEST is the LHS of
4415 the last stmt in the reduction chain, since we are looking for the loop
4416 exit phi node. */
4417 if (GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)))
4419 scalar_dest = gimple_assign_lhs (
4420 SLP_TREE_SCALAR_STMTS (slp_node)[group_size - 1]);
4421 group_size = 1;
4424 /* In SLP we may have several statements in NEW_PHIS and REDUCTION_PHIS (in
4425 case that GROUP_SIZE is greater than vectorization factor). Therefore, we
4426 need to match SCALAR_RESULTS with corresponding statements. The first
4427 (GROUP_SIZE / number of new vector stmts) scalar results correspond to
4428 the first vector stmt, etc.
4429 (RATIO is equal to (GROUP_SIZE / number of new vector stmts)). */
4430 if (group_size > new_phis.length ())
4432 ratio = group_size / new_phis.length ();
4433 gcc_assert (!(group_size % new_phis.length ()));
4435 else
4436 ratio = 1;
4438 for (k = 0; k < group_size; k++)
4440 if (k % ratio == 0)
4442 epilog_stmt = new_phis[k / ratio];
4443 reduction_phi = reduction_phis[k / ratio];
4444 if (double_reduc)
4445 inner_phi = inner_phis[k / ratio];
4448 if (slp_reduc)
4450 gimple current_stmt = SLP_TREE_SCALAR_STMTS (slp_node)[k];
4452 orig_stmt = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (current_stmt));
4453 /* SLP statements can't participate in patterns. */
4454 gcc_assert (!orig_stmt);
4455 scalar_dest = gimple_assign_lhs (current_stmt);
4458 phis.create (3);
4459 /* Find the loop-closed-use at the loop exit of the original scalar
4460 result. (The reduction result is expected to have two immediate uses -
4461 one at the latch block, and one at the loop exit). */
4462 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, scalar_dest)
4463 if (!flow_bb_inside_loop_p (loop, gimple_bb (USE_STMT (use_p)))
4464 && !is_gimple_debug (USE_STMT (use_p)))
4465 phis.safe_push (USE_STMT (use_p));
4467 /* While we expect to have found an exit_phi because of loop-closed-ssa
4468 form we can end up without one if the scalar cycle is dead. */
4470 FOR_EACH_VEC_ELT (phis, i, exit_phi)
4472 if (outer_loop)
4474 stmt_vec_info exit_phi_vinfo = vinfo_for_stmt (exit_phi);
4475 gimple vect_phi;
4477 /* FORNOW. Currently not supporting the case that an inner-loop
4478 reduction is not used in the outer-loop (but only outside the
4479 outer-loop), unless it is double reduction. */
4480 gcc_assert ((STMT_VINFO_RELEVANT_P (exit_phi_vinfo)
4481 && !STMT_VINFO_LIVE_P (exit_phi_vinfo))
4482 || double_reduc);
4484 STMT_VINFO_VEC_STMT (exit_phi_vinfo) = epilog_stmt;
4485 if (!double_reduc
4486 || STMT_VINFO_DEF_TYPE (exit_phi_vinfo)
4487 != vect_double_reduction_def)
4488 continue;
4490 /* Handle double reduction:
4492 stmt1: s1 = phi <s0, s2> - double reduction phi (outer loop)
4493 stmt2: s3 = phi <s1, s4> - (regular) reduc phi (inner loop)
4494 stmt3: s4 = use (s3) - (regular) reduc stmt (inner loop)
4495 stmt4: s2 = phi <s4> - double reduction stmt (outer loop)
4497 At that point the regular reduction (stmt2 and stmt3) is
4498 already vectorized, as well as the exit phi node, stmt4.
4499 Here we vectorize the phi node of double reduction, stmt1, and
4500 update all relevant statements. */
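/* Such a cycle typically comes from a nest like (illustrative):
     s = s0;
     for (i = 0; i < n; i++)       <-- outer loop (being vectorized)
       for (j = 0; j < m; j++)     <-- inner loop
         s = s + a[i][j];
   where stmt1/stmt4 are the outer-loop phis of s and stmt2/stmt3 are the
   inner-loop reduction phi and reduction statement.  */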
4502 /* Go through all the uses of s2 to find double reduction phi
4503 node, i.e., stmt1 above. */
4504 orig_name = PHI_RESULT (exit_phi);
4505 FOR_EACH_IMM_USE_STMT (use_stmt, imm_iter, orig_name)
4507 stmt_vec_info use_stmt_vinfo;
4508 stmt_vec_info new_phi_vinfo;
4509 tree vect_phi_init, preheader_arg, vect_phi_res, init_def;
4510 basic_block bb = gimple_bb (use_stmt);
4511 gimple use;
4513 /* Check that USE_STMT is really double reduction phi
4514 node. */
4515 if (gimple_code (use_stmt) != GIMPLE_PHI
4516 || gimple_phi_num_args (use_stmt) != 2
4517 || bb->loop_father != outer_loop)
4518 continue;
4519 use_stmt_vinfo = vinfo_for_stmt (use_stmt);
4520 if (!use_stmt_vinfo
4521 || STMT_VINFO_DEF_TYPE (use_stmt_vinfo)
4522 != vect_double_reduction_def)
4523 continue;
4525 /* Create vector phi node for double reduction:
4526 vs1 = phi <vs0, vs2>
4527 vs1 was created previously in this function by a call to
4528 vect_get_vec_def_for_operand and is stored in
4529 vec_initial_def;
4530 vs2 is defined by INNER_PHI, the vectorized EXIT_PHI;
4531 vs0 is created here. */
4533 /* Create vector phi node. */
4534 vect_phi = create_phi_node (vec_initial_def, bb);
4535 new_phi_vinfo = new_stmt_vec_info (vect_phi,
4536 loop_vec_info_for_loop (outer_loop), NULL);
4537 set_vinfo_for_stmt (vect_phi, new_phi_vinfo);
4539 /* Create vs0 - initial def of the double reduction phi. */
4540 preheader_arg = PHI_ARG_DEF_FROM_EDGE (use_stmt,
4541 loop_preheader_edge (outer_loop));
4542 init_def = get_initial_def_for_reduction (stmt,
4543 preheader_arg, NULL);
4544 vect_phi_init = vect_init_vector (use_stmt, init_def,
4545 vectype, NULL);
4547 /* Update phi node arguments with vs0 and vs2. */
4548 add_phi_arg (vect_phi, vect_phi_init,
4549 loop_preheader_edge (outer_loop),
4550 UNKNOWN_LOCATION);
4551 add_phi_arg (vect_phi, PHI_RESULT (inner_phi),
4552 loop_latch_edge (outer_loop), UNKNOWN_LOCATION);
4553 if (dump_enabled_p ())
4555 dump_printf_loc (MSG_NOTE, vect_location,
4556 "created double reduction phi node: ");
4557 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, vect_phi, 0);
4558 dump_printf (MSG_NOTE, "\n");
4561 vect_phi_res = PHI_RESULT (vect_phi);
4563 /* Replace the use, i.e., set the correct vs1 in the regular
4564 reduction phi node. FORNOW, NCOPIES is always 1, so the
4565 loop is redundant. */
4566 use = reduction_phi;
4567 for (j = 0; j < ncopies; j++)
4569 edge pr_edge = loop_preheader_edge (loop);
4570 SET_PHI_ARG_DEF (use, pr_edge->dest_idx, vect_phi_res);
4571 use = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (use));
4577 phis.release ();
4578 if (nested_in_vect_loop)
4580 if (double_reduc)
4581 loop = outer_loop;
4582 else
4583 continue;
4586 phis.create (3);
4587 /* Find the loop-closed-use at the loop exit of the original scalar
4588 result. (The reduction result is expected to have two immediate uses,
4589 one at the latch block, and one at the loop exit). For double
4590 reductions we are looking for exit phis of the outer loop. */
4591 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, scalar_dest)
4593 if (!flow_bb_inside_loop_p (loop, gimple_bb (USE_STMT (use_p))))
4595 if (!is_gimple_debug (USE_STMT (use_p)))
4596 phis.safe_push (USE_STMT (use_p));
4598 else
4600 if (double_reduc && gimple_code (USE_STMT (use_p)) == GIMPLE_PHI)
4602 tree phi_res = PHI_RESULT (USE_STMT (use_p));
4604 FOR_EACH_IMM_USE_FAST (phi_use_p, phi_imm_iter, phi_res)
4606 if (!flow_bb_inside_loop_p (loop,
4607 gimple_bb (USE_STMT (phi_use_p)))
4608 && !is_gimple_debug (USE_STMT (phi_use_p)))
4609 phis.safe_push (USE_STMT (phi_use_p));
4615 FOR_EACH_VEC_ELT (phis, i, exit_phi)
4617 /* Replace the uses: */
4618 orig_name = PHI_RESULT (exit_phi);
4619 scalar_result = scalar_results[k];
4620 FOR_EACH_IMM_USE_STMT (use_stmt, imm_iter, orig_name)
4621 FOR_EACH_IMM_USE_ON_STMT (use_p, imm_iter)
4622 SET_USE (use_p, scalar_result);
4625 phis.release ();
4630 /* Function vectorizable_reduction.
4632 Check if STMT performs a reduction operation that can be vectorized.
4633 If VEC_STMT is also passed, vectorize the STMT: create a vectorized
4634 stmt to replace it, put it in VEC_STMT, and insert it at GSI.
4635 Return FALSE if not a vectorizable STMT, TRUE otherwise.
4637 This function also handles reduction idioms (patterns) that have been
4638 recognized in advance during vect_pattern_recog. In this case, STMT may be
4639 of this form:
4640 X = pattern_expr (arg0, arg1, ..., X)
4641 and its STMT_VINFO_RELATED_STMT points to the last stmt in the original
4642 sequence that had been detected and replaced by the pattern-stmt (STMT).
4644 In some cases of reduction patterns, the type of the reduction variable X is
4645 different than the type of the other arguments of STMT.
4646 In such cases, the vectype that is used when transforming STMT into a vector
4647 stmt is different than the vectype that is used to determine the
4648 vectorization factor, because it consists of a different number of elements
4649 than the actual number of elements that are being operated upon in parallel.
4651 For example, consider an accumulation of shorts into an int accumulator.
4652 On some targets it's possible to vectorize this pattern operating on 8
4653 shorts at a time (hence, the vectype for purposes of determining the
4654 vectorization factor should be V8HI); on the other hand, the vectype that
4655 is used to create the vector form is actually V4SI (the type of the result).
4657 Upon entry to this function, STMT_VINFO_VECTYPE records the vectype that
4658 indicates what is the actual level of parallelism (V8HI in the example), so
4659 that the right vectorization factor would be derived. This vectype
4660 corresponds to the type of arguments to the reduction stmt, and should *NOT*
4661 be used to create the vectorized stmt. The right vectype for the vectorized
4662 stmt is obtained from the type of the result X:
4663 get_vectype_for_scalar_type (TREE_TYPE (X))
4665 This means that, contrary to "regular" reductions (or "regular" stmts in
4666 general), the following equation:
4667 STMT_VINFO_VECTYPE == get_vectype_for_scalar_type (TREE_TYPE (X))
4668 does *NOT* necessarily hold for reduction patterns. */
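/* Concretely (illustrative, assuming 128-bit vectors), for
     short a[N];  int sum = 0;
     for (i = 0; i < N; i++)
       sum += a[i];
   recognized as a widening-sum pattern, STMT_VINFO_VECTYPE is V8HI (8
   shorts are consumed per vector iteration, which determines the
   vectorization factor), while the vectorized stmt itself is created with
   get_vectype_for_scalar_type (int) == V4SI.  */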
4670 bool
4671 vectorizable_reduction (gimple stmt, gimple_stmt_iterator *gsi,
4672 gimple *vec_stmt, slp_tree slp_node)
4674 tree vec_dest;
4675 tree scalar_dest;
4676 tree loop_vec_def0 = NULL_TREE, loop_vec_def1 = NULL_TREE;
4677 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
4678 tree vectype_out = STMT_VINFO_VECTYPE (stmt_info);
4679 tree vectype_in = NULL_TREE;
4680 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
4681 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
4682 enum tree_code code, orig_code, epilog_reduc_code;
4683 enum machine_mode vec_mode;
4684 int op_type;
4685 optab optab, reduc_optab;
4686 tree new_temp = NULL_TREE;
4687 tree def;
4688 gimple def_stmt;
4689 enum vect_def_type dt;
4690 gimple new_phi = NULL;
4691 tree scalar_type;
4692 bool is_simple_use;
4693 gimple orig_stmt;
4694 stmt_vec_info orig_stmt_info;
4695 tree expr = NULL_TREE;
4696 int i;
4697 int ncopies;
4698 int epilog_copies;
4699 stmt_vec_info prev_stmt_info, prev_phi_info;
4700 bool single_defuse_cycle = false;
4701 tree reduc_def = NULL_TREE;
4702 gimple new_stmt = NULL;
4703 int j;
4704 tree ops[3];
4705 bool nested_cycle = false, found_nested_cycle_def = false;
4706 gimple reduc_def_stmt = NULL;
4707 /* The default is that the reduction variable is the last operand in the statement. */
4708 int reduc_index = 2;
4709 bool double_reduc = false, dummy;
4710 basic_block def_bb;
4711 struct loop * def_stmt_loop, *outer_loop = NULL;
4712 tree def_arg;
4713 gimple def_arg_stmt;
4714 auto_vec<tree> vec_oprnds0;
4715 auto_vec<tree> vec_oprnds1;
4716 auto_vec<tree> vect_defs;
4717 auto_vec<gimple> phis;
4718 int vec_num;
4719 tree def0, def1, tem, op0, op1 = NULL_TREE;
4721 /* In case of reduction chain we switch to the first stmt in the chain, but
4722 we don't update STMT_INFO, since only the last stmt is marked as reduction
4723 and has reduction properties. */
4724 if (GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)))
4725 stmt = GROUP_FIRST_ELEMENT (stmt_info);
4727 if (nested_in_vect_loop_p (loop, stmt))
4729 outer_loop = loop;
4730 loop = loop->inner;
4731 nested_cycle = true;
4734 /* 1. Is vectorizable reduction? */
4735 /* Not supportable if the reduction variable is used in the loop, unless
4736 it's a reduction chain. */
4737 if (STMT_VINFO_RELEVANT (stmt_info) > vect_used_in_outer
4738 && !GROUP_FIRST_ELEMENT (stmt_info))
4739 return false;
4741 /* Reductions that are not used even in an enclosing outer-loop,
4742 are expected to be "live" (used out of the loop). */
4743 if (STMT_VINFO_RELEVANT (stmt_info) == vect_unused_in_scope
4744 && !STMT_VINFO_LIVE_P (stmt_info))
4745 return false;
4747 /* Make sure it was already recognized as a reduction computation. */
4748 if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_reduction_def
4749 && STMT_VINFO_DEF_TYPE (stmt_info) != vect_nested_cycle)
4750 return false;
4752 /* 2. Has this been recognized as a reduction pattern?
4754 Check if STMT represents a pattern that has been recognized
4755 in earlier analysis stages. For stmts that represent a pattern,
4756 the STMT_VINFO_RELATED_STMT field records the last stmt in
4757 the original sequence that constitutes the pattern. */
4759 orig_stmt = STMT_VINFO_RELATED_STMT (stmt_info);
4760 if (orig_stmt)
4762 orig_stmt_info = vinfo_for_stmt (orig_stmt);
4763 gcc_assert (STMT_VINFO_IN_PATTERN_P (orig_stmt_info));
4764 gcc_assert (!STMT_VINFO_IN_PATTERN_P (stmt_info));
4767 /* 3. Check the operands of the operation. The first operands are defined
4768 inside the loop body. The last operand is the reduction variable,
4769 which is defined by the loop-header-phi. */
4771 gcc_assert (is_gimple_assign (stmt));
4773 /* Flatten RHS. */
4774 switch (get_gimple_rhs_class (gimple_assign_rhs_code (stmt)))
4776 case GIMPLE_SINGLE_RHS:
4777 op_type = TREE_OPERAND_LENGTH (gimple_assign_rhs1 (stmt));
4778 if (op_type == ternary_op)
4780 tree rhs = gimple_assign_rhs1 (stmt);
4781 ops[0] = TREE_OPERAND (rhs, 0);
4782 ops[1] = TREE_OPERAND (rhs, 1);
4783 ops[2] = TREE_OPERAND (rhs, 2);
4784 code = TREE_CODE (rhs);
4786 else
4787 return false;
4788 break;
4790 case GIMPLE_BINARY_RHS:
4791 code = gimple_assign_rhs_code (stmt);
4792 op_type = TREE_CODE_LENGTH (code);
4793 gcc_assert (op_type == binary_op);
4794 ops[0] = gimple_assign_rhs1 (stmt);
4795 ops[1] = gimple_assign_rhs2 (stmt);
4796 break;
4798 case GIMPLE_TERNARY_RHS:
4799 code = gimple_assign_rhs_code (stmt);
4800 op_type = TREE_CODE_LENGTH (code);
4801 gcc_assert (op_type == ternary_op);
4802 ops[0] = gimple_assign_rhs1 (stmt);
4803 ops[1] = gimple_assign_rhs2 (stmt);
4804 ops[2] = gimple_assign_rhs3 (stmt);
4805 break;
4807 case GIMPLE_UNARY_RHS:
4808 return false;
4810 default:
4811 gcc_unreachable ();
4814 if (code == COND_EXPR && slp_node)
4815 return false;
4817 scalar_dest = gimple_assign_lhs (stmt);
4818 scalar_type = TREE_TYPE (scalar_dest);
4819 if (!POINTER_TYPE_P (scalar_type) && !INTEGRAL_TYPE_P (scalar_type)
4820 && !SCALAR_FLOAT_TYPE_P (scalar_type))
4821 return false;
4823 /* Do not try to vectorize bit-precision reductions. */
4824 if ((TYPE_PRECISION (scalar_type)
4825 != GET_MODE_PRECISION (TYPE_MODE (scalar_type))))
4826 return false;
4828 /* All uses but the last are expected to be defined in the loop.
4829 The last use is the reduction variable. In case of nested cycle this
4830 assumption is not true: we use reduc_index to record the index of the
4831 reduction variable. */
4832 for (i = 0; i < op_type - 1; i++)
4834 /* The condition of COND_EXPR is checked in vectorizable_condition(). */
4835 if (i == 0 && code == COND_EXPR)
4836 continue;
4838 is_simple_use = vect_is_simple_use_1 (ops[i], stmt, loop_vinfo, NULL,
4839 &def_stmt, &def, &dt, &tem);
4840 if (!vectype_in)
4841 vectype_in = tem;
4842 gcc_assert (is_simple_use);
4844 if (dt != vect_internal_def
4845 && dt != vect_external_def
4846 && dt != vect_constant_def
4847 && dt != vect_induction_def
4848 && !(dt == vect_nested_cycle && nested_cycle))
4849 return false;
4851 if (dt == vect_nested_cycle)
4853 found_nested_cycle_def = true;
4854 reduc_def_stmt = def_stmt;
4855 reduc_index = i;
4859 is_simple_use = vect_is_simple_use_1 (ops[i], stmt, loop_vinfo, NULL,
4860 &def_stmt, &def, &dt, &tem);
4861 if (!vectype_in)
4862 vectype_in = tem;
4863 gcc_assert (is_simple_use);
4864 if (!(dt == vect_reduction_def
4865 || dt == vect_nested_cycle
4866 || ((dt == vect_internal_def || dt == vect_external_def
4867 || dt == vect_constant_def || dt == vect_induction_def)
4868 && nested_cycle && found_nested_cycle_def)))
4870 /* For pattern recognized stmts, orig_stmt might be a reduction,
4871 but some helper statements for the pattern might not, or
4872 might be COND_EXPRs with reduction uses in the condition. */
4873 gcc_assert (orig_stmt);
4874 return false;
4876 if (!found_nested_cycle_def)
4877 reduc_def_stmt = def_stmt;
4879 gcc_assert (gimple_code (reduc_def_stmt) == GIMPLE_PHI);
4880 if (orig_stmt)
4881 gcc_assert (orig_stmt == vect_is_simple_reduction (loop_vinfo,
4882 reduc_def_stmt,
4883 !nested_cycle,
4884 &dummy));
4885 else
4887 gimple tmp = vect_is_simple_reduction (loop_vinfo, reduc_def_stmt,
4888 !nested_cycle, &dummy);
4889 /* We changed STMT to be the first stmt in reduction chain, hence we
4890 check that in this case the first element in the chain is STMT. */
4891 gcc_assert (stmt == tmp
4892 || GROUP_FIRST_ELEMENT (vinfo_for_stmt (tmp)) == stmt);
4895 if (STMT_VINFO_LIVE_P (vinfo_for_stmt (reduc_def_stmt)))
4896 return false;
4898 if (slp_node || PURE_SLP_STMT (stmt_info))
4899 ncopies = 1;
4900 else
4901 ncopies = (LOOP_VINFO_VECT_FACTOR (loop_vinfo)
4902 / TYPE_VECTOR_SUBPARTS (vectype_in));
4904 gcc_assert (ncopies >= 1);
4906 vec_mode = TYPE_MODE (vectype_in);
4908 if (code == COND_EXPR)
4910 if (!vectorizable_condition (stmt, gsi, NULL, ops[reduc_index], 0, NULL))
4912 if (dump_enabled_p ())
4913 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4914 "unsupported condition in reduction\n");
4916 return false;
4919 else
4921 /* 4. Supportable by target? */
4923 if (code == LSHIFT_EXPR || code == RSHIFT_EXPR
4924 || code == LROTATE_EXPR || code == RROTATE_EXPR)
4926 /* Shifts and rotates are only supported by vectorizable_shifts,
4927 not vectorizable_reduction. */
4928 if (dump_enabled_p ())
4929 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4930 "unsupported shift or rotation.\n");
4931 return false;
4934 /* 4.1. check support for the operation in the loop */
4935 optab = optab_for_tree_code (code, vectype_in, optab_default);
4936 if (!optab)
4938 if (dump_enabled_p ())
4939 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4940 "no optab.\n");
4942 return false;
4945 if (optab_handler (optab, vec_mode) == CODE_FOR_nothing)
4947 if (dump_enabled_p ())
4948 dump_printf (MSG_NOTE, "op not supported by target.\n");
4950 if (GET_MODE_SIZE (vec_mode) != UNITS_PER_WORD
4951 || LOOP_VINFO_VECT_FACTOR (loop_vinfo)
4952 < vect_min_worthwhile_factor (code))
4953 return false;
4955 if (dump_enabled_p ())
4956 dump_printf (MSG_NOTE, "proceeding using word mode.\n");
4959 /* Worthwhile without SIMD support? */
4960 if (!VECTOR_MODE_P (TYPE_MODE (vectype_in))
4961 && LOOP_VINFO_VECT_FACTOR (loop_vinfo)
4962 < vect_min_worthwhile_factor (code))
4964 if (dump_enabled_p ())
4965 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4966 "not worthwhile without SIMD support.\n");
4968 return false;
4972 /* 4.2. Check support for the epilog operation.
4974 If STMT represents a reduction pattern, then the type of the
4975 reduction variable may be different than the type of the rest
4976 of the arguments. For example, consider the case of accumulation
4977 of shorts into an int accumulator. The original code:
4978 S1: int_a = (int) short_a;
4979 orig_stmt-> S2: int_acc = plus <int_a ,int_acc>;
4981 was replaced with:
4982 STMT: int_acc = widen_sum <short_a, int_acc>
4984 This means that:
4985 1. The tree-code that is used to create the vector operation in the
4986 epilog code (that reduces the partial results) is not the
4987 tree-code of STMT, but is rather the tree-code of the original
4988 stmt from the pattern that STMT is replacing. I.e, in the example
4989 above we want to use 'widen_sum' in the loop, but 'plus' in the
4990 epilog.
4991 2. The type (mode) we use to check available target support
4992 for the vector operation to be created in the *epilog*, is
4993 determined by the type of the reduction variable (in the example
4994 above we'd check this: optab_handler (plus_optab, vect_int_mode])).
4995 However the type (mode) we use to check available target support
4996 for the vector operation to be created *inside the loop*, is
4997 determined by the type of the other arguments to STMT (in the
4998 example we'd check this: optab_handler (widen_sum_optab,
4999 vect_short_mode)).
5001 This is contrary to "regular" reductions, in which the types of all
5002 the arguments are the same as the type of the reduction variable.
5003 For "regular" reductions we can therefore use the same vector type
5004 (and also the same tree-code) when generating the epilog code and
5005 when generating the code inside the loop. */
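/* In the shorts-into-int example above this amounts to checking the
   widen_sum optab for the short vector mode when vectorizing the loop
   body, but the plus/reduction optab for the int vector mode when
   generating the epilog (the modes named here are illustrative).  */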
5007 if (orig_stmt)
5009 /* This is a reduction pattern: get the vectype from the type of the
5010 reduction variable, and get the tree-code from orig_stmt. */
5011 orig_code = gimple_assign_rhs_code (orig_stmt);
5012 gcc_assert (vectype_out);
5013 vec_mode = TYPE_MODE (vectype_out);
5015 else
5017 /* Regular reduction: the same vectype and tree-code that are used for
5018 the vector code inside the loop can also be used for the epilog code. */
5019 orig_code = code;
5022 if (nested_cycle)
5024 def_bb = gimple_bb (reduc_def_stmt);
5025 def_stmt_loop = def_bb->loop_father;
5026 def_arg = PHI_ARG_DEF_FROM_EDGE (reduc_def_stmt,
5027 loop_preheader_edge (def_stmt_loop));
5028 if (TREE_CODE (def_arg) == SSA_NAME
5029 && (def_arg_stmt = SSA_NAME_DEF_STMT (def_arg))
5030 && gimple_code (def_arg_stmt) == GIMPLE_PHI
5031 && flow_bb_inside_loop_p (outer_loop, gimple_bb (def_arg_stmt))
5032 && vinfo_for_stmt (def_arg_stmt)
5033 && STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def_arg_stmt))
5034 == vect_double_reduction_def)
5035 double_reduc = true;
5038 epilog_reduc_code = ERROR_MARK;
5039 if (reduction_code_for_scalar_code (orig_code, &epilog_reduc_code))
5041 reduc_optab = optab_for_tree_code (epilog_reduc_code, vectype_out,
5042 optab_default);
5043 if (!reduc_optab)
5045 if (dump_enabled_p ())
5046 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5047 "no optab for reduction.\n");
5049 epilog_reduc_code = ERROR_MARK;
5052 if (reduc_optab
5053 && optab_handler (reduc_optab, vec_mode) == CODE_FOR_nothing)
5055 if (dump_enabled_p ())
5056 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5057 "reduc op not supported by target.\n");
5059 epilog_reduc_code = ERROR_MARK;
5062 else
5064 if (!nested_cycle || double_reduc)
5066 if (dump_enabled_p ())
5067 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5068 "no reduc code for scalar code.\n");
5070 return false;
5074 if (double_reduc && ncopies > 1)
5076 if (dump_enabled_p ())
5077 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5078 "multiple types in double reduction\n");
5080 return false;
5083 /* In case of widening multiplication by a constant, we update the type
5084 of the constant to be the type of the other operand. We check that the
5085 constant fits the type in the pattern recognition pass. */
5086 if (code == DOT_PROD_EXPR
5087 && !types_compatible_p (TREE_TYPE (ops[0]), TREE_TYPE (ops[1])))
5089 if (TREE_CODE (ops[0]) == INTEGER_CST)
5090 ops[0] = fold_convert (TREE_TYPE (ops[1]), ops[0]);
5091 else if (TREE_CODE (ops[1]) == INTEGER_CST)
5092 ops[1] = fold_convert (TREE_TYPE (ops[0]), ops[1]);
5093 else
5095 if (dump_enabled_p ())
5096 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5097 "invalid types in dot-prod\n");
5099 return false;
5103 if (!vec_stmt) /* transformation not required. */
5105 if (!vect_model_reduction_cost (stmt_info, epilog_reduc_code, ncopies))
5106 return false;
5107 STMT_VINFO_TYPE (stmt_info) = reduc_vec_info_type;
5108 return true;
5111 /** Transform. **/
5113 if (dump_enabled_p ())
5114 dump_printf_loc (MSG_NOTE, vect_location, "transform reduction.\n");
5116 /* FORNOW: Multiple types are not supported for condition. */
5117 if (code == COND_EXPR)
5118 gcc_assert (ncopies == 1);
5120 /* Create the destination vector */
5121 vec_dest = vect_create_destination_var (scalar_dest, vectype_out);
5123 /* In case the vectorization factor (VF) is bigger than the number
5124 of elements that we can fit in a vectype (nunits), we have to generate
5125 more than one vector stmt - i.e - we need to "unroll" the
5126 vector stmt by a factor VF/nunits. For more details see documentation
5127 in vectorizable_operation. */
5129 /* If the reduction is used in an outer loop we need to generate
5130 VF intermediate results, like so (e.g. for ncopies=2):
5131 r0 = phi (init, r0)
5132 r1 = phi (init, r1)
5133 r0 = x0 + r0;
5134 r1 = x1 + r1;
5135 (i.e. we generate VF results in 2 registers).
5136 In this case we have a separate def-use cycle for each copy, and therefore
5137 for each copy we get the vector def for the reduction variable from the
5138 respective phi node created for this copy.
5140 Otherwise (the reduction is unused in the loop nest), we can combine
5141 together intermediate results, like so (e.g. for ncopies=2):
5142 r = phi (init, r)
5143 r = x0 + r;
5144 r = x1 + r;
5145 (i.e. we generate VF/2 results in a single register).
5146 In this case for each copy we get the vector def for the reduction variable
5147 from the vectorized reduction operation generated in the previous iteration.
5150 if (STMT_VINFO_RELEVANT (stmt_info) == vect_unused_in_scope)
5152 single_defuse_cycle = true;
5153 epilog_copies = 1;
5155 else
5156 epilog_copies = ncopies;
5158 prev_stmt_info = NULL;
5159 prev_phi_info = NULL;
5160 if (slp_node)
5162 vec_num = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node);
5163 gcc_assert (TYPE_VECTOR_SUBPARTS (vectype_out)
5164 == TYPE_VECTOR_SUBPARTS (vectype_in));
5166 else
5168 vec_num = 1;
5169 vec_oprnds0.create (1);
5170 if (op_type == ternary_op)
5171 vec_oprnds1.create (1);
5174 phis.create (vec_num);
5175 vect_defs.create (vec_num);
5176 if (!slp_node)
5177 vect_defs.quick_push (NULL_TREE);
5179 for (j = 0; j < ncopies; j++)
5181 if (j == 0 || !single_defuse_cycle)
5183 for (i = 0; i < vec_num; i++)
5185 /* Create the reduction-phi that defines the reduction
5186 operand. */
5187 new_phi = create_phi_node (vec_dest, loop->header);
5188 set_vinfo_for_stmt (new_phi,
5189 new_stmt_vec_info (new_phi, loop_vinfo,
5190 NULL));
5191 if (j == 0 || slp_node)
5192 phis.quick_push (new_phi);
5196 if (code == COND_EXPR)
5198 gcc_assert (!slp_node);
5199 vectorizable_condition (stmt, gsi, vec_stmt,
5200 PHI_RESULT (phis[0]),
5201 reduc_index, NULL);
5202 /* Multiple types are not supported for condition. */
5203 break;
5206 /* Handle uses. */
5207 if (j == 0)
5209 op0 = ops[!reduc_index];
5210 if (op_type == ternary_op)
5212 if (reduc_index == 0)
5213 op1 = ops[2];
5214 else
5215 op1 = ops[1];
5218 if (slp_node)
5219 vect_get_vec_defs (op0, op1, stmt, &vec_oprnds0, &vec_oprnds1,
5220 slp_node, -1);
5221 else
5223 loop_vec_def0 = vect_get_vec_def_for_operand (ops[!reduc_index],
5224 stmt, NULL);
5225 vec_oprnds0.quick_push (loop_vec_def0);
5226 if (op_type == ternary_op)
5228 loop_vec_def1 = vect_get_vec_def_for_operand (op1, stmt,
5229 NULL);
5230 vec_oprnds1.quick_push (loop_vec_def1);
5234 else
5236 if (!slp_node)
5238 enum vect_def_type dt;
5239 gimple dummy_stmt;
5240 tree dummy;
5242 vect_is_simple_use (ops[!reduc_index], stmt, loop_vinfo, NULL,
5243 &dummy_stmt, &dummy, &dt);
5244 loop_vec_def0 = vect_get_vec_def_for_stmt_copy (dt,
5245 loop_vec_def0);
5246 vec_oprnds0[0] = loop_vec_def0;
5247 if (op_type == ternary_op)
5249 vect_is_simple_use (op1, stmt, loop_vinfo, NULL, &dummy_stmt,
5250 &dummy, &dt);
5251 loop_vec_def1 = vect_get_vec_def_for_stmt_copy (dt,
5252 loop_vec_def1);
5253 vec_oprnds1[0] = loop_vec_def1;
5257 if (single_defuse_cycle)
5258 reduc_def = gimple_assign_lhs (new_stmt);
5260 STMT_VINFO_RELATED_STMT (prev_phi_info) = new_phi;
5263 FOR_EACH_VEC_ELT (vec_oprnds0, i, def0)
5265 if (slp_node)
5266 reduc_def = PHI_RESULT (phis[i]);
5267 else
5269 if (!single_defuse_cycle || j == 0)
5270 reduc_def = PHI_RESULT (new_phi);
5273 def1 = ((op_type == ternary_op)
5274 ? vec_oprnds1[i] : NULL);
5275 if (op_type == binary_op)
5277 if (reduc_index == 0)
5278 expr = build2 (code, vectype_out, reduc_def, def0);
5279 else
5280 expr = build2 (code, vectype_out, def0, reduc_def);
5282 else
5284 if (reduc_index == 0)
5285 expr = build3 (code, vectype_out, reduc_def, def0, def1);
5286 else
5288 if (reduc_index == 1)
5289 expr = build3 (code, vectype_out, def0, reduc_def, def1);
5290 else
5291 expr = build3 (code, vectype_out, def0, def1, reduc_def);
5295 new_stmt = gimple_build_assign (vec_dest, expr);
5296 new_temp = make_ssa_name (vec_dest, new_stmt);
5297 gimple_assign_set_lhs (new_stmt, new_temp);
5298 vect_finish_stmt_generation (stmt, new_stmt, gsi);
5300 if (slp_node)
5302 SLP_TREE_VEC_STMTS (slp_node).quick_push (new_stmt);
5303 vect_defs.quick_push (new_temp);
5305 else
5306 vect_defs[0] = new_temp;
5309 if (slp_node)
5310 continue;
5312 if (j == 0)
5313 STMT_VINFO_VEC_STMT (stmt_info) = *vec_stmt = new_stmt;
5314 else
5315 STMT_VINFO_RELATED_STMT (prev_stmt_info) = new_stmt;
5317 prev_stmt_info = vinfo_for_stmt (new_stmt);
5318 prev_phi_info = vinfo_for_stmt (new_phi);
5321 /* Finalize the reduction-phi (set its arguments) and create the
5322 epilog reduction code. */
5323 if ((!single_defuse_cycle || code == COND_EXPR) && !slp_node)
5325 new_temp = gimple_assign_lhs (*vec_stmt);
5326 vect_defs[0] = new_temp;
5329 vect_create_epilog_for_reduction (vect_defs, stmt, epilog_copies,
5330 epilog_reduc_code, phis, reduc_index,
5331 double_reduc, slp_node);
5333 return true;
5336 /* Function vect_min_worthwhile_factor.
5338 For a loop where we could vectorize the operation indicated by CODE,
5339 return the minimum vectorization factor that makes it worthwhile
5340 to use generic vectors. */
5341 int
5342 vect_min_worthwhile_factor (enum tree_code code)
5344 switch (code)
5346 case PLUS_EXPR:
5347 case MINUS_EXPR:
5348 case NEGATE_EXPR:
5349 return 4;
5351 case BIT_AND_EXPR:
5352 case BIT_IOR_EXPR:
5353 case BIT_XOR_EXPR:
5354 case BIT_NOT_EXPR:
5355 return 2;
5357 default:
5358 return INT_MAX;
5363 /* Function vectorizable_induction
5365 Check if PHI performs an induction computation that can be vectorized.
5366 If VEC_STMT is also passed, vectorize the induction PHI: create a vectorized
5367 phi to replace it, put it in VEC_STMT, and add it to the same basic block.
5368 Return FALSE if not a vectorizable STMT, TRUE otherwise. */
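/* For example (illustrative, VF = 4): an induction with initial value i0
   and step 1 is vectorized by get_initial_def_for_induction into a vector
   phi with loop-entry value {i0, i0+1, i0+2, i0+3} and a latch update that
   adds {4,4,4,4} (VF * step) on every vector iteration.  */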
5370 bool
5371 vectorizable_induction (gimple phi, gimple_stmt_iterator *gsi ATTRIBUTE_UNUSED,
5372 gimple *vec_stmt)
5374 stmt_vec_info stmt_info = vinfo_for_stmt (phi);
5375 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
5376 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
5377 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
5378 int nunits = TYPE_VECTOR_SUBPARTS (vectype);
5379 int ncopies = LOOP_VINFO_VECT_FACTOR (loop_vinfo) / nunits;
5380 tree vec_def;
5382 gcc_assert (ncopies >= 1);
5383 /* FORNOW. These restrictions should be relaxed. */
5384 if (nested_in_vect_loop_p (loop, phi))
5386 imm_use_iterator imm_iter;
5387 use_operand_p use_p;
5388 gimple exit_phi;
5389 edge latch_e;
5390 tree loop_arg;
5392 if (ncopies > 1)
5394 if (dump_enabled_p ())
5395 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5396 "multiple types in nested loop.\n");
5397 return false;
5400 exit_phi = NULL;
5401 latch_e = loop_latch_edge (loop->inner);
5402 loop_arg = PHI_ARG_DEF_FROM_EDGE (phi, latch_e);
5403 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, loop_arg)
5405 if (!flow_bb_inside_loop_p (loop->inner,
5406 gimple_bb (USE_STMT (use_p))))
5408 exit_phi = USE_STMT (use_p);
5409 break;
5412 if (exit_phi)
5414 stmt_vec_info exit_phi_vinfo = vinfo_for_stmt (exit_phi);
5415 if (!(STMT_VINFO_RELEVANT_P (exit_phi_vinfo)
5416 && !STMT_VINFO_LIVE_P (exit_phi_vinfo)))
5418 if (dump_enabled_p ())
5419 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5420 "inner-loop induction only used outside "
5421 "of the outer vectorized loop.\n");
5422 return false;
5427 if (!STMT_VINFO_RELEVANT_P (stmt_info))
5428 return false;
5430 /* FORNOW: SLP not supported. */
5431 if (STMT_SLP_TYPE (stmt_info))
5432 return false;
5434 gcc_assert (STMT_VINFO_DEF_TYPE (stmt_info) == vect_induction_def);
5436 if (gimple_code (phi) != GIMPLE_PHI)
5437 return false;
5439 if (!vec_stmt) /* transformation not required. */
5441 STMT_VINFO_TYPE (stmt_info) = induc_vec_info_type;
5442 if (dump_enabled_p ())
5443 dump_printf_loc (MSG_NOTE, vect_location,
5444 "=== vectorizable_induction ===\n");
5445 vect_model_induction_cost (stmt_info, ncopies);
5446 return true;
5449 /** Transform. **/
5451 if (dump_enabled_p ())
5452 dump_printf_loc (MSG_NOTE, vect_location, "transform induction phi.\n");
5454 vec_def = get_initial_def_for_induction (phi);
5455 *vec_stmt = SSA_NAME_DEF_STMT (vec_def);
5456 return true;
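/* Illustrative sketch (not code from this file): for VF = 4 an induction
   i = {0, +, 1} is replaced by a vector phi whose initial value and step
   are built by get_initial_def_for_induction, conceptually:

       v4si vi   = { 0, 1, 2, 3 };     initial def of the vector phi
       v4si step = { 4, 4, 4, 4 };     VF scalar steps per vector copy
       loop:
         ... each lane of vi stands for one scalar value of i ...
         vi = vi + step;
*/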
5459 /* Function vectorizable_live_operation.
5461 STMT computes a value that is used outside the loop. Check if
5462 it can be supported. */
5464 bool
5465 vectorizable_live_operation (gimple stmt,
5466 gimple_stmt_iterator *gsi ATTRIBUTE_UNUSED,
5467 gimple *vec_stmt)
5469 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
5470 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
5471 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
5472 int i;
5473 int op_type;
5474 tree op;
5475 tree def;
5476 gimple def_stmt;
5477 enum vect_def_type dt;
5478 enum tree_code code;
5479 enum gimple_rhs_class rhs_class;
5481 gcc_assert (STMT_VINFO_LIVE_P (stmt_info));
5483 if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_reduction_def)
5484 return false;
5486 if (!is_gimple_assign (stmt))
5488 if (gimple_call_internal_p (stmt)
5489 && gimple_call_internal_fn (stmt) == IFN_GOMP_SIMD_LANE
5490 && gimple_call_lhs (stmt)
5491 && loop->simduid
5492 && TREE_CODE (gimple_call_arg (stmt, 0)) == SSA_NAME
5493 && loop->simduid
5494 == SSA_NAME_VAR (gimple_call_arg (stmt, 0)))
5496 edge e = single_exit (loop);
5497 basic_block merge_bb = e->dest;
5498 imm_use_iterator imm_iter;
5499 use_operand_p use_p;
5500 tree lhs = gimple_call_lhs (stmt);
5502 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, lhs)
5504 gimple use_stmt = USE_STMT (use_p);
5505 if (gimple_code (use_stmt) == GIMPLE_PHI
5506 || gimple_bb (use_stmt) == merge_bb)
5508 if (vec_stmt)
5510 tree vfm1
5511 = build_int_cst (unsigned_type_node,
5512 loop_vinfo->vectorization_factor - 1);
5513 SET_PHI_ARG_DEF (use_stmt, e->dest_idx, vfm1);
5515 return true;
5520 return false;
5523 if (TREE_CODE (gimple_assign_lhs (stmt)) != SSA_NAME)
5524 return false;
5526 /* FORNOW. CHECKME. */
5527 if (nested_in_vect_loop_p (loop, stmt))
5528 return false;
5530 code = gimple_assign_rhs_code (stmt);
5531 op_type = TREE_CODE_LENGTH (code);
5532 rhs_class = get_gimple_rhs_class (code);
5533 gcc_assert (rhs_class != GIMPLE_UNARY_RHS || op_type == unary_op);
5534 gcc_assert (rhs_class != GIMPLE_BINARY_RHS || op_type == binary_op);
5536 /* FORNOW: support only if all uses are invariant. This means
5537 that the scalar operations can remain in place, unvectorized.
5538 The original last scalar value that they compute will be used. */
5540 for (i = 0; i < op_type; i++)
5542 if (rhs_class == GIMPLE_SINGLE_RHS)
5543 op = TREE_OPERAND (gimple_op (stmt, 1), i);
5544 else
5545 op = gimple_op (stmt, i + 1);
5546 if (op
5547 && !vect_is_simple_use (op, stmt, loop_vinfo, NULL, &def_stmt, &def,
5548 &dt))
5550 if (dump_enabled_p ())
5551 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5552 "use not simple.\n");
5553 return false;
5556 if (dt != vect_external_def && dt != vect_constant_def)
5557 return false;
5560 /* No transformation is required for the cases we currently support. */
5561 return true;
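/* Illustrative example (not from this file) of a statement this function
   accepts: T is computed inside the loop but all its uses are outside,
   and its operands are loop-invariant, so the scalar statement can stay
   in place and the value it computes last is what the outside use sees:

       for (i = 0; i < n; i++)
         {
           a[i] = b[i] + c;        vectorized as usual
           t = c * 2;              live: used only after the loop
         }
       use (t);
*/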
5564 /* Kill any debug uses outside LOOP of SSA names defined in STMT. */
5566 static void
5567 vect_loop_kill_debug_uses (struct loop *loop, gimple stmt)
5569 ssa_op_iter op_iter;
5570 imm_use_iterator imm_iter;
5571 def_operand_p def_p;
5572 gimple ustmt;
5574 FOR_EACH_PHI_OR_STMT_DEF (def_p, stmt, op_iter, SSA_OP_DEF)
5576 FOR_EACH_IMM_USE_STMT (ustmt, imm_iter, DEF_FROM_PTR (def_p))
5578 basic_block bb;
5580 if (!is_gimple_debug (ustmt))
5581 continue;
5583 bb = gimple_bb (ustmt);
5585 if (!flow_bb_inside_loop_p (loop, bb))
5587 if (gimple_debug_bind_p (ustmt))
5589 if (dump_enabled_p ())
5590 dump_printf_loc (MSG_NOTE, vect_location,
5591 "killing debug use\n");
5593 gimple_debug_bind_reset_value (ustmt);
5594 update_stmt (ustmt);
5596 else
5597 gcc_unreachable ();
5604 /* This function builds ni_name = number of iterations. Statements
5605 are emitted on the loop preheader edge. */
5607 static tree
5608 vect_build_loop_niters (loop_vec_info loop_vinfo)
5610 tree ni = unshare_expr (LOOP_VINFO_NITERS (loop_vinfo));
5611 if (TREE_CODE (ni) == INTEGER_CST)
5612 return ni;
5613 else
5615 tree ni_name, var;
5616 gimple_seq stmts = NULL;
5617 edge pe = loop_preheader_edge (LOOP_VINFO_LOOP (loop_vinfo));
5619 var = create_tmp_var (TREE_TYPE (ni), "niters");
5620 ni_name = force_gimple_operand (ni, &stmts, false, var);
5621 if (stmts)
5622 gsi_insert_seq_on_edge_immediate (pe, stmts);
5624 return ni_name;
5629 /* This function generates the following statements:
5631 ni_name = number of iterations loop executes
5632 ratio = ni_name / vf
5633 ratio_mult_vf_name = ratio * vf
5635 and places them on the loop preheader edge. */
5637 static void
5638 vect_generate_tmps_on_preheader (loop_vec_info loop_vinfo,
5639 tree ni_name,
5640 tree *ratio_mult_vf_name_ptr,
5641 tree *ratio_name_ptr)
5643 tree ni_minus_gap_name;
5644 tree var;
5645 tree ratio_name;
5646 tree ratio_mult_vf_name;
5647 int vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
5648 edge pe = loop_preheader_edge (LOOP_VINFO_LOOP (loop_vinfo));
5649 tree log_vf;
5651 log_vf = build_int_cst (TREE_TYPE (ni_name), exact_log2 (vf));
5653 /* If epilogue loop is required because of data accesses with gaps, we
5654 subtract one iteration from the total number of iterations here for
5655 correct calculation of RATIO. */
5656 if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo))
5658 ni_minus_gap_name = fold_build2 (MINUS_EXPR, TREE_TYPE (ni_name),
5659 ni_name,
5660 build_one_cst (TREE_TYPE (ni_name)));
5661 if (!is_gimple_val (ni_minus_gap_name))
5663 var = create_tmp_var (TREE_TYPE (ni_name), "ni_gap");
5664 gimple stmts = NULL;
5665 ni_minus_gap_name = force_gimple_operand (ni_minus_gap_name, &stmts,
5666 true, var);
5667 gsi_insert_seq_on_edge_immediate (pe, stmts);
5670 else
5671 ni_minus_gap_name = ni_name;
5673 /* Create: ratio = ni >> log2(vf) */
5674 /* ??? As ni == number of latch executions + 1, ni could have
5675 overflowed to zero.  So avoid computing ratio directly from ni;
5676 instead, using the fact that ratio is known to be at least one,
5677 compute it as (ni - vf) >> log2(vf) + 1 (see the worked example after this function). */
5678 ratio_name
5679 = fold_build2 (PLUS_EXPR, TREE_TYPE (ni_name),
5680 fold_build2 (RSHIFT_EXPR, TREE_TYPE (ni_name),
5681 fold_build2 (MINUS_EXPR, TREE_TYPE (ni_name),
5682 ni_minus_gap_name,
5683 build_int_cst
5684 (TREE_TYPE (ni_name), vf)),
5685 log_vf),
5686 build_int_cst (TREE_TYPE (ni_name), 1));
5687 if (!is_gimple_val (ratio_name))
5689 var = create_tmp_var (TREE_TYPE (ni_name), "bnd");
5690 gimple stmts = NULL;
5691 ratio_name = force_gimple_operand (ratio_name, &stmts, true, var);
5692 gsi_insert_seq_on_edge_immediate (pe, stmts);
5694 *ratio_name_ptr = ratio_name;
5696 /* Create: ratio_mult_vf = ratio << log2 (vf). */
5698 if (ratio_mult_vf_name_ptr)
5700 ratio_mult_vf_name = fold_build2 (LSHIFT_EXPR, TREE_TYPE (ratio_name),
5701 ratio_name, log_vf);
5702 if (!is_gimple_val (ratio_mult_vf_name))
5704 var = create_tmp_var (TREE_TYPE (ni_name), "ratio_mult_vf");
5705 gimple stmts = NULL;
5706 ratio_mult_vf_name = force_gimple_operand (ratio_mult_vf_name, &stmts,
5707 true, var);
5708 gsi_insert_seq_on_edge_immediate (pe, stmts);
5710 *ratio_mult_vf_name_ptr = ratio_mult_vf_name;
5713 return;
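/* Worked example (illustrative only) for the expressions built above,
   assuming vf = 4 (log_vf = 2) and ni = 10:

       ratio         = ((10 - 4) >> 2) + 1 = 2
       ratio_mult_vf = 2 << 2             = 8

   so the vectorized loop runs twice, covering 8 scalar iterations, and
   the remaining 10 - 8 = 2 iterations go to the scalar epilogue.  The
   (ni - vf) >> log2(vf) + 1 form also stays correct if a 32-bit ni
   wrapped around to 0:  (0u - 4) >> 2 == 0x3fffffff, + 1 == 0x40000000,
   which is 2^32 / 4.  */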
5717 /* Function vect_transform_loop.
5719 The analysis phase has determined that the loop is vectorizable.
5720 Vectorize the loop - create vectorized stmts to replace the scalar
5721 stmts in the loop, and update the loop exit condition. */
5723 void
5724 vect_transform_loop (loop_vec_info loop_vinfo)
5726 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
5727 basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
5728 int nbbs = loop->num_nodes;
5729 gimple_stmt_iterator si;
5730 int i;
5731 tree ratio = NULL;
5732 int vectorization_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
5733 bool grouped_store;
5734 bool slp_scheduled = false;
5735 gimple stmt, pattern_stmt;
5736 gimple_seq pattern_def_seq = NULL;
5737 gimple_stmt_iterator pattern_def_si = gsi_none ();
5738 bool transform_pattern_stmt = false;
5739 bool check_profitability = false;
5740 int th;
5741 /* Record number of iterations before we started tampering with the profile. */
5742 gcov_type expected_iterations = expected_loop_iterations_unbounded (loop);
5744 if (dump_enabled_p ())
5745 dump_printf_loc (MSG_NOTE, vect_location, "=== vec_transform_loop ===\n");
5747 /* If the profile is imprecise, we have a chance to fix it up. */
5748 if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
5749 expected_iterations = LOOP_VINFO_INT_NITERS (loop_vinfo);
5751 /* Use the more conservative vectorization threshold.  If the number
5752 of iterations is constant, assume the cost check has been performed
5753 by our caller.  If the threshold makes all loops profitable that
5754 run at least the vectorization factor number of times, checking
5755 is pointless, too. */
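/* Worked example (illustrative numbers): with --param min-vect-loop-bound=0,
   VF = 4 and a cost-model minimum of 7 iterations, the code below computes
   th = MAX (0 * 4 - 1, 7) = 7; since 7 >= VF - 1 and the iteration count is
   not known at compile time, a runtime profitability check is requested.  */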
5756 th = ((PARAM_VALUE (PARAM_MIN_VECT_LOOP_BOUND)
5757 * LOOP_VINFO_VECT_FACTOR (loop_vinfo)) - 1);
5758 th = MAX (th, LOOP_VINFO_COST_MODEL_MIN_ITERS (loop_vinfo));
5759 if (th >= LOOP_VINFO_VECT_FACTOR (loop_vinfo) - 1
5760 && !LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
5762 if (dump_enabled_p ())
5763 dump_printf_loc (MSG_NOTE, vect_location,
5764 "Profitability threshold is %d loop iterations.\n",
5765 th);
5766 check_profitability = true;
5769 /* Version the loop first, if required, so the profitability check
5770 comes first. */
5772 if (LOOP_REQUIRES_VERSIONING_FOR_ALIGNMENT (loop_vinfo)
5773 || LOOP_REQUIRES_VERSIONING_FOR_ALIAS (loop_vinfo))
5775 vect_loop_versioning (loop_vinfo, th, check_profitability);
5776 check_profitability = false;
5779 tree ni_name = vect_build_loop_niters (loop_vinfo);
5780 LOOP_VINFO_NITERS_UNCHANGED (loop_vinfo) = ni_name;
5782 /* Peel the loop if there are data refs with unknown alignment.
5783 Only one data ref with unknown alignment can be forced to be aligned by peeling. */
5785 if (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo))
5787 vect_do_peeling_for_alignment (loop_vinfo, ni_name,
5788 th, check_profitability);
5789 check_profitability = false;
5790 /* The above adjusts LOOP_VINFO_NITERS, so cause ni_name to
5791 be re-computed. */
5792 ni_name = NULL_TREE;
5795 /* If the loop has a symbolic number of iterations 'n' (i.e. it's not a
5796 compile-time constant), or it is a constant that is not divisible by the
5797 vectorization factor, then an epilog loop needs to be created.
5798 We therefore duplicate the loop: the original loop will be vectorized,
5799 and will compute the first (n/VF) iterations.  The second copy of the loop
5800 will remain scalar and will compute the remaining (n%VF) iterations.
5801 (VF is the vectorization factor.) */
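/* Illustrative shape of the result (not code from this file), with VF = 4
   and a symbolic trip count n:

       for (i = 0; i < (n / 4) * 4; i += 4)
         ...vectorized body...              the first n/4 vector iterations
       for (; i < n; i++)
         ...original scalar body...         the remaining n % 4 iterations
*/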
5803 if (LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo)
5804 || LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo))
5806 tree ratio_mult_vf;
5807 if (!ni_name)
5808 ni_name = vect_build_loop_niters (loop_vinfo);
5809 vect_generate_tmps_on_preheader (loop_vinfo, ni_name, &ratio_mult_vf,
5810 &ratio);
5811 vect_do_peeling_for_loop_bound (loop_vinfo, ni_name, ratio_mult_vf,
5812 th, check_profitability);
5814 else if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
5815 ratio = build_int_cst (TREE_TYPE (LOOP_VINFO_NITERS (loop_vinfo)),
5816 LOOP_VINFO_INT_NITERS (loop_vinfo) / vectorization_factor);
5817 else
5819 if (!ni_name)
5820 ni_name = vect_build_loop_niters (loop_vinfo);
5821 vect_generate_tmps_on_preheader (loop_vinfo, ni_name, NULL, &ratio);
5824 /* 1) Make sure the loop header has exactly two entries
5825 2) Make sure we have a preheader basic block. */
5827 gcc_assert (EDGE_COUNT (loop->header->preds) == 2);
5829 split_edge (loop_preheader_edge (loop));
5831 /* FORNOW: the vectorizer supports only loops whose body consists
5832 of one basic block (header + empty latch).  When the vectorizer
5833 supports more involved loop forms, the order in which the BBs are
5834 traversed will need to be reconsidered. */
5836 for (i = 0; i < nbbs; i++)
5838 basic_block bb = bbs[i];
5839 stmt_vec_info stmt_info;
5840 gimple phi;
5842 for (si = gsi_start_phis (bb); !gsi_end_p (si); gsi_next (&si))
5844 phi = gsi_stmt (si);
5845 if (dump_enabled_p ())
5847 dump_printf_loc (MSG_NOTE, vect_location,
5848 "------>vectorizing phi: ");
5849 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, phi, 0);
5850 dump_printf (MSG_NOTE, "\n");
5852 stmt_info = vinfo_for_stmt (phi);
5853 if (!stmt_info)
5854 continue;
5856 if (MAY_HAVE_DEBUG_STMTS && !STMT_VINFO_LIVE_P (stmt_info))
5857 vect_loop_kill_debug_uses (loop, phi);
5859 if (!STMT_VINFO_RELEVANT_P (stmt_info)
5860 && !STMT_VINFO_LIVE_P (stmt_info))
5861 continue;
5863 if ((TYPE_VECTOR_SUBPARTS (STMT_VINFO_VECTYPE (stmt_info))
5864 != (unsigned HOST_WIDE_INT) vectorization_factor)
5865 && dump_enabled_p ())
5866 dump_printf_loc (MSG_NOTE, vect_location, "multiple-types.\n");
5868 if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_induction_def)
5870 if (dump_enabled_p ())
5871 dump_printf_loc (MSG_NOTE, vect_location, "transform phi.\n");
5872 vect_transform_stmt (phi, NULL, NULL, NULL, NULL);
5876 pattern_stmt = NULL;
5877 for (si = gsi_start_bb (bb); !gsi_end_p (si) || transform_pattern_stmt;)
5879 bool is_store;
5881 if (transform_pattern_stmt)
5882 stmt = pattern_stmt;
5883 else
5885 stmt = gsi_stmt (si);
5886 /* During vectorization remove existing clobber stmts. */
5887 if (gimple_clobber_p (stmt))
5889 unlink_stmt_vdef (stmt);
5890 gsi_remove (&si, true);
5891 release_defs (stmt);
5892 continue;
5896 if (dump_enabled_p ())
5898 dump_printf_loc (MSG_NOTE, vect_location,
5899 "------>vectorizing statement: ");
5900 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, stmt, 0);
5901 dump_printf (MSG_NOTE, "\n");
5904 stmt_info = vinfo_for_stmt (stmt);
5906 /* vector stmts created in the outer-loop during vectorization of
5907 stmts in an inner-loop may not have a stmt_info, and do not
5908 need to be vectorized. */
5909 if (!stmt_info)
5911 gsi_next (&si);
5912 continue;
5915 if (MAY_HAVE_DEBUG_STMTS && !STMT_VINFO_LIVE_P (stmt_info))
5916 vect_loop_kill_debug_uses (loop, stmt);
5918 if (!STMT_VINFO_RELEVANT_P (stmt_info)
5919 && !STMT_VINFO_LIVE_P (stmt_info))
5921 if (STMT_VINFO_IN_PATTERN_P (stmt_info)
5922 && (pattern_stmt = STMT_VINFO_RELATED_STMT (stmt_info))
5923 && (STMT_VINFO_RELEVANT_P (vinfo_for_stmt (pattern_stmt))
5924 || STMT_VINFO_LIVE_P (vinfo_for_stmt (pattern_stmt))))
5926 stmt = pattern_stmt;
5927 stmt_info = vinfo_for_stmt (stmt);
5929 else
5931 gsi_next (&si);
5932 continue;
5935 else if (STMT_VINFO_IN_PATTERN_P (stmt_info)
5936 && (pattern_stmt = STMT_VINFO_RELATED_STMT (stmt_info))
5937 && (STMT_VINFO_RELEVANT_P (vinfo_for_stmt (pattern_stmt))
5938 || STMT_VINFO_LIVE_P (vinfo_for_stmt (pattern_stmt))))
5939 transform_pattern_stmt = true;
5941 /* If pattern statement has def stmts, vectorize them too. */
5942 if (is_pattern_stmt_p (stmt_info))
5944 if (pattern_def_seq == NULL)
5946 pattern_def_seq = STMT_VINFO_PATTERN_DEF_SEQ (stmt_info);
5947 pattern_def_si = gsi_start (pattern_def_seq);
5949 else if (!gsi_end_p (pattern_def_si))
5950 gsi_next (&pattern_def_si);
5951 if (pattern_def_seq != NULL)
5953 gimple pattern_def_stmt = NULL;
5954 stmt_vec_info pattern_def_stmt_info = NULL;
5956 while (!gsi_end_p (pattern_def_si))
5958 pattern_def_stmt = gsi_stmt (pattern_def_si);
5959 pattern_def_stmt_info
5960 = vinfo_for_stmt (pattern_def_stmt);
5961 if (STMT_VINFO_RELEVANT_P (pattern_def_stmt_info)
5962 || STMT_VINFO_LIVE_P (pattern_def_stmt_info))
5963 break;
5964 gsi_next (&pattern_def_si);
5967 if (!gsi_end_p (pattern_def_si))
5969 if (dump_enabled_p ())
5971 dump_printf_loc (MSG_NOTE, vect_location,
5972 "==> vectorizing pattern def "
5973 "stmt: ");
5974 dump_gimple_stmt (MSG_NOTE, TDF_SLIM,
5975 pattern_def_stmt, 0);
5976 dump_printf (MSG_NOTE, "\n");
5979 stmt = pattern_def_stmt;
5980 stmt_info = pattern_def_stmt_info;
5982 else
5984 pattern_def_si = gsi_none ();
5985 transform_pattern_stmt = false;
5988 else
5989 transform_pattern_stmt = false;
5992 if (STMT_VINFO_VECTYPE (stmt_info))
5994 unsigned int nunits
5995 = (unsigned int)
5996 TYPE_VECTOR_SUBPARTS (STMT_VINFO_VECTYPE (stmt_info));
5997 if (!STMT_SLP_TYPE (stmt_info)
5998 && nunits != (unsigned int) vectorization_factor
5999 && dump_enabled_p ())
6000 /* For SLP, VF is set according to the unrolling factor and not
6001 to the vector size, hence for SLP this message is not valid. */
6002 dump_printf_loc (MSG_NOTE, vect_location, "multiple-types.\n");
6005 /* SLP. Schedule all the SLP instances when the first SLP stmt is
6006 reached. */
6007 if (STMT_SLP_TYPE (stmt_info))
6009 if (!slp_scheduled)
6011 slp_scheduled = true;
6013 if (dump_enabled_p ())
6014 dump_printf_loc (MSG_NOTE, vect_location,
6015 "=== scheduling SLP instances ===\n");
6017 vect_schedule_slp (loop_vinfo, NULL);
6020 /* Hybrid SLP stmts must be vectorized in addition to SLP. */
6021 if (!vinfo_for_stmt (stmt) || PURE_SLP_STMT (stmt_info))
6023 if (!transform_pattern_stmt && gsi_end_p (pattern_def_si))
6025 pattern_def_seq = NULL;
6026 gsi_next (&si);
6028 continue;
6032 /* -------- vectorize statement ------------ */
6033 if (dump_enabled_p ())
6034 dump_printf_loc (MSG_NOTE, vect_location, "transform statement.\n");
6036 grouped_store = false;
6037 is_store = vect_transform_stmt (stmt, &si, &grouped_store, NULL, NULL);
6038 if (is_store)
6040 if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
6042 /* Interleaving. If IS_STORE is TRUE, the vectorization of the
6043 interleaving chain was completed - free all the stores in
6044 the chain. */
6045 gsi_next (&si);
6046 vect_remove_stores (GROUP_FIRST_ELEMENT (stmt_info));
6047 continue;
6049 else
6051 /* Free the attached stmt_vec_info and remove the stmt. */
6052 gimple store = gsi_stmt (si);
6053 free_stmt_vec_info (store);
6054 unlink_stmt_vdef (store);
6055 gsi_remove (&si, true);
6056 release_defs (store);
6057 continue;
6061 if (!transform_pattern_stmt && gsi_end_p (pattern_def_si))
6063 pattern_def_seq = NULL;
6064 gsi_next (&si);
6066 } /* stmts in BB */
6067 } /* BBs in loop */
6069 slpeel_make_loop_iterate_ntimes (loop, ratio);
6071 /* Reduce loop iterations by the vectorization factor. */
6072 scale_loop_profile (loop, GCOV_COMPUTE_SCALE (1, vectorization_factor),
6073 expected_iterations / vectorization_factor);
6074 loop->nb_iterations_upper_bound
6075 = loop->nb_iterations_upper_bound.udiv (double_int::from_uhwi (vectorization_factor),
6076 FLOOR_DIV_EXPR);
6077 if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo)
6078 && loop->nb_iterations_upper_bound != double_int_zero)
6079 loop->nb_iterations_upper_bound = loop->nb_iterations_upper_bound - double_int_one;
6080 if (loop->any_estimate)
6082 loop->nb_iterations_estimate
6083 = loop->nb_iterations_estimate.udiv (double_int::from_uhwi (vectorization_factor),
6084 FLOOR_DIV_EXPR);
6085 if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo)
6086 && loop->nb_iterations_estimate != double_int_zero)
6087 loop->nb_iterations_estimate = loop->nb_iterations_estimate - double_int_one;
6090 if (dump_enabled_p ())
6092 dump_printf_loc (MSG_NOTE, vect_location,
6093 "LOOP VECTORIZED\n");
6094 if (loop->inner)
6095 dump_printf_loc (MSG_NOTE, vect_location,
6096 "OUTER LOOP VECTORIZED\n");
6097 dump_printf (MSG_NOTE, "\n");