gcc/tree-vect-loop.c
1 /* Loop Vectorization
2 Copyright (C) 2003-2015 Free Software Foundation, Inc.
3 Contributed by Dorit Naishlos <dorit@il.ibm.com> and
4 Ira Rosen <irar@il.ibm.com>
6 This file is part of GCC.
8 GCC is free software; you can redistribute it and/or modify it under
9 the terms of the GNU General Public License as published by the Free
10 Software Foundation; either version 3, or (at your option) any later
11 version.
13 GCC is distributed in the hope that it will be useful, but WITHOUT ANY
14 WARRANTY; without even the implied warranty of MERCHANTABILITY or
15 FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
16 for more details.
18 You should have received a copy of the GNU General Public License
19 along with GCC; see the file COPYING3. If not see
20 <http://www.gnu.org/licenses/>. */
22 #include "config.h"
23 #include "system.h"
24 #include "coretypes.h"
25 #include "dumpfile.h"
26 #include "tm.h"
27 #include "hash-set.h"
28 #include "machmode.h"
29 #include "vec.h"
30 #include "double-int.h"
31 #include "input.h"
32 #include "alias.h"
33 #include "symtab.h"
34 #include "wide-int.h"
35 #include "inchash.h"
36 #include "tree.h"
37 #include "fold-const.h"
38 #include "stor-layout.h"
39 #include "predict.h"
40 #include "hard-reg-set.h"
41 #include "function.h"
42 #include "dominance.h"
43 #include "cfg.h"
44 #include "cfganal.h"
45 #include "basic-block.h"
46 #include "gimple-pretty-print.h"
47 #include "tree-ssa-alias.h"
48 #include "internal-fn.h"
49 #include "gimple-expr.h"
50 #include "is-a.h"
51 #include "gimple.h"
52 #include "gimplify.h"
53 #include "gimple-iterator.h"
54 #include "gimplify-me.h"
55 #include "gimple-ssa.h"
56 #include "tree-phinodes.h"
57 #include "ssa-iterators.h"
58 #include "stringpool.h"
59 #include "tree-ssanames.h"
60 #include "tree-ssa-loop-ivopts.h"
61 #include "tree-ssa-loop-manip.h"
62 #include "tree-ssa-loop-niter.h"
63 #include "tree-pass.h"
64 #include "cfgloop.h"
65 #include "hashtab.h"
66 #include "rtl.h"
67 #include "flags.h"
68 #include "statistics.h"
69 #include "real.h"
70 #include "fixed-value.h"
71 #include "insn-config.h"
72 #include "expmed.h"
73 #include "dojump.h"
74 #include "explow.h"
75 #include "calls.h"
76 #include "emit-rtl.h"
77 #include "varasm.h"
78 #include "stmt.h"
79 #include "expr.h"
80 #include "recog.h"
81 #include "insn-codes.h"
82 #include "optabs.h"
83 #include "params.h"
84 #include "diagnostic-core.h"
85 #include "tree-chrec.h"
86 #include "tree-scalar-evolution.h"
87 #include "tree-vectorizer.h"
88 #include "target.h"
90 /* Loop Vectorization Pass.
92 This pass tries to vectorize loops.
94 For example, the vectorizer transforms the following simple loop:
96 short a[N]; short b[N]; short c[N]; int i;
98 for (i=0; i<N; i++){
99 a[i] = b[i] + c[i];
102 as if it was manually vectorized by rewriting the source code into:
104 typedef int __attribute__((mode(V8HI))) v8hi;
105 short a[N]; short b[N]; short c[N]; int i;
106 v8hi *pa = (v8hi*)a, *pb = (v8hi*)b, *pc = (v8hi*)c;
107 v8hi va, vb, vc;
109 for (i=0; i<N/8; i++){
110 vb = pb[i];
111 vc = pc[i];
112 va = vb + vc;
113 pa[i] = va;
116 The main entry to this pass is vectorize_loops(), in which
117 the vectorizer applies a set of analyses on a given set of loops,
118 followed by the actual vectorization transformation for the loops that
119 had successfully passed the analysis phase.
120 Throughout this pass we make a distinction between two types of
121 data: scalars (which are represented by SSA_NAMES), and memory references
122 ("data-refs"). These two types of data require different handling both
123 during analysis and transformation. The types of data-refs that the
124 vectorizer currently supports are ARRAY_REFS whose base is an array DECL
125 (not a pointer), and INDIRECT_REFS through pointers; both array and pointer
126 accesses are required to have a simple (consecutive) access pattern.
128 Analysis phase:
129 ===============
130 The driver for the analysis phase is vect_analyze_loop().
131 It applies a set of analyses, some of which rely on the scalar evolution
132 analyzer (scev) developed by Sebastian Pop.
134 During the analysis phase the vectorizer records some information
135 per stmt in a "stmt_vec_info" struct which is attached to each stmt in the
136 loop, as well as general information about the loop as a whole, which is
137 recorded in a "loop_vec_info" struct attached to each loop.
139 Transformation phase:
140 =====================
141 The loop transformation phase scans all the stmts in the loop, and
142 creates a vector stmt (or a sequence of stmts) for each scalar stmt S in
143 the loop that needs to be vectorized. It inserts the vector code sequence
144 just before the scalar stmt S, and records a pointer to the vector code
145 in STMT_VINFO_VEC_STMT (stmt_info) (stmt_info is the stmt_vec_info struct
146 attached to S). This pointer will be used for the vectorization of following
147 stmts which use the def of stmt S. Stmt S is removed if it writes to memory;
148 otherwise, we rely on dead code elimination for removing it.
150 For example, say stmt S1 was vectorized into stmt VS1:
152 VS1: vb = px[i];
153 S1: b = x[i]; STMT_VINFO_VEC_STMT (stmt_info (S1)) = VS1
154 S2: a = b;
156 To vectorize stmt S2, the vectorizer first finds the stmt that defines
157 the operand 'b' (S1), and gets the relevant vector def 'vb' from the
158 vector stmt VS1 pointed to by STMT_VINFO_VEC_STMT (stmt_info (S1)). The
159 resulting sequence would be:
161 VS1: vb = px[i];
162 S1: b = x[i]; STMT_VINFO_VEC_STMT (stmt_info (S1)) = VS1
163 VS2: va = vb;
164 S2: a = b; STMT_VINFO_VEC_STMT (stmt_info (S2)) = VS2
166 Operands that are not SSA_NAMEs are data-refs that appear in
167 load/store operations (like 'x[i]' in S1), and are handled differently.
169 Target modeling:
170 =================
171 Currently the only target specific information that is used is the
172 size of the vector (in bytes) - "TARGET_VECTORIZE_UNITS_PER_SIMD_WORD".
173 Targets that can support different sizes of vectors will, for now, need
174 to specify one value for "TARGET_VECTORIZE_UNITS_PER_SIMD_WORD". More
175 flexibility will be added in the future.
177 Since we only vectorize operations whose vector form can be
178 expressed using existing tree codes, to verify that an operation is
179 supported, the vectorizer checks the relevant optab at the relevant
180 machine_mode (e.g., optab_handler (add_optab, V8HImode)). If
181 the value found is CODE_FOR_nothing, then there's no target support, and
182 we can't vectorize the stmt.
184 For additional information on this project see:
185 http://gcc.gnu.org/projects/tree-ssa/vectorization.html
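/* Illustrative sketch only (not part of this file): the kind of optab query
   the "Target modeling" paragraph above describes, assuming the usual optab
   API (optab_handler, CODE_FOR_nothing) and the V8HImode addition from the
   example:

     if (optab_handler (add_optab, V8HImode) == CODE_FOR_nothing)
       return false;   -- no target support, the stmt cannot be vectorized.  */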
188 static void vect_estimate_min_profitable_iters (loop_vec_info, int *, int *);
190 /* Function vect_determine_vectorization_factor
192 Determine the vectorization factor (VF). VF is the number of data elements
193 that are operated upon in parallel in a single iteration of the vectorized
194 loop. For example, when vectorizing a loop that operates on 4-byte elements,
195 on a target with a 16-byte vector size (VS), the VF is set to 4, since 4
196 elements can fit in a single vector register.
198 We currently support vectorization of loops in which all types operated upon
199 are of the same size. Therefore this function currently sets VF according to
200 the size of the types operated upon, and fails if there are multiple sizes
201 in the loop.
203 VF is also the factor by which the loop iterations are strip-mined, e.g.:
204 original loop:
205 for (i=0; i<N; i++){
206 a[i] = b[i] + c[i];
209 vectorized loop:
210 for (i=0; i<N; i+=VF){
211 a[i:VF] = b[i:VF] + c[i:VF];
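/* Worked example of the arithmetic above (illustrative values): with a
   16-byte vector size and 4-byte elements, VF = 16 / 4 = 4, so the loop is
   strip-mined by 4.  With 16-byte vectors and 2-byte shorts, VF = 16 / 2 = 8,
   which matches the v8hi example in the file comment.  */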
215 static bool
216 vect_determine_vectorization_factor (loop_vec_info loop_vinfo)
218 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
219 basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
220 int nbbs = loop->num_nodes;
221 unsigned int vectorization_factor = 0;
222 tree scalar_type;
223 gphi *phi;
224 tree vectype;
225 unsigned int nunits;
226 stmt_vec_info stmt_info;
227 int i;
228 HOST_WIDE_INT dummy;
229 gimple stmt, pattern_stmt = NULL;
230 gimple_seq pattern_def_seq = NULL;
231 gimple_stmt_iterator pattern_def_si = gsi_none ();
232 bool analyze_pattern_stmt = false;
234 if (dump_enabled_p ())
235 dump_printf_loc (MSG_NOTE, vect_location,
236 "=== vect_determine_vectorization_factor ===\n");
238 for (i = 0; i < nbbs; i++)
240 basic_block bb = bbs[i];
242 for (gphi_iterator si = gsi_start_phis (bb); !gsi_end_p (si);
243 gsi_next (&si))
245 phi = si.phi ();
246 stmt_info = vinfo_for_stmt (phi);
247 if (dump_enabled_p ())
249 dump_printf_loc (MSG_NOTE, vect_location, "==> examining phi: ");
250 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, phi, 0);
251 dump_printf (MSG_NOTE, "\n");
254 gcc_assert (stmt_info);
256 if (STMT_VINFO_RELEVANT_P (stmt_info))
258 gcc_assert (!STMT_VINFO_VECTYPE (stmt_info));
259 scalar_type = TREE_TYPE (PHI_RESULT (phi));
261 if (dump_enabled_p ())
263 dump_printf_loc (MSG_NOTE, vect_location,
264 "get vectype for scalar type: ");
265 dump_generic_expr (MSG_NOTE, TDF_SLIM, scalar_type);
266 dump_printf (MSG_NOTE, "\n");
269 vectype = get_vectype_for_scalar_type (scalar_type);
270 if (!vectype)
272 if (dump_enabled_p ())
274 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
275 "not vectorized: unsupported "
276 "data-type ");
277 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
278 scalar_type);
279 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
281 return false;
283 STMT_VINFO_VECTYPE (stmt_info) = vectype;
285 if (dump_enabled_p ())
287 dump_printf_loc (MSG_NOTE, vect_location, "vectype: ");
288 dump_generic_expr (MSG_NOTE, TDF_SLIM, vectype);
289 dump_printf (MSG_NOTE, "\n");
292 nunits = TYPE_VECTOR_SUBPARTS (vectype);
293 if (dump_enabled_p ())
294 dump_printf_loc (MSG_NOTE, vect_location, "nunits = %d\n",
295 nunits);
297 if (!vectorization_factor
298 || (nunits > vectorization_factor))
299 vectorization_factor = nunits;
303 for (gimple_stmt_iterator si = gsi_start_bb (bb);
304 !gsi_end_p (si) || analyze_pattern_stmt;)
306 tree vf_vectype;
308 if (analyze_pattern_stmt)
309 stmt = pattern_stmt;
310 else
311 stmt = gsi_stmt (si);
313 stmt_info = vinfo_for_stmt (stmt);
315 if (dump_enabled_p ())
317 dump_printf_loc (MSG_NOTE, vect_location,
318 "==> examining statement: ");
319 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, stmt, 0);
320 dump_printf (MSG_NOTE, "\n");
323 gcc_assert (stmt_info);
325 /* Skip stmts which do not need to be vectorized. */
326 if ((!STMT_VINFO_RELEVANT_P (stmt_info)
327 && !STMT_VINFO_LIVE_P (stmt_info))
328 || gimple_clobber_p (stmt))
330 if (STMT_VINFO_IN_PATTERN_P (stmt_info)
331 && (pattern_stmt = STMT_VINFO_RELATED_STMT (stmt_info))
332 && (STMT_VINFO_RELEVANT_P (vinfo_for_stmt (pattern_stmt))
333 || STMT_VINFO_LIVE_P (vinfo_for_stmt (pattern_stmt))))
335 stmt = pattern_stmt;
336 stmt_info = vinfo_for_stmt (pattern_stmt);
337 if (dump_enabled_p ())
339 dump_printf_loc (MSG_NOTE, vect_location,
340 "==> examining pattern statement: ");
341 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, stmt, 0);
342 dump_printf (MSG_NOTE, "\n");
345 else
347 if (dump_enabled_p ())
348 dump_printf_loc (MSG_NOTE, vect_location, "skip.\n");
349 gsi_next (&si);
350 continue;
353 else if (STMT_VINFO_IN_PATTERN_P (stmt_info)
354 && (pattern_stmt = STMT_VINFO_RELATED_STMT (stmt_info))
355 && (STMT_VINFO_RELEVANT_P (vinfo_for_stmt (pattern_stmt))
356 || STMT_VINFO_LIVE_P (vinfo_for_stmt (pattern_stmt))))
357 analyze_pattern_stmt = true;
359 /* If a pattern statement has def stmts, analyze them too. */
360 if (is_pattern_stmt_p (stmt_info))
362 if (pattern_def_seq == NULL)
364 pattern_def_seq = STMT_VINFO_PATTERN_DEF_SEQ (stmt_info);
365 pattern_def_si = gsi_start (pattern_def_seq);
367 else if (!gsi_end_p (pattern_def_si))
368 gsi_next (&pattern_def_si);
369 if (pattern_def_seq != NULL)
371 gimple pattern_def_stmt = NULL;
372 stmt_vec_info pattern_def_stmt_info = NULL;
374 while (!gsi_end_p (pattern_def_si))
376 pattern_def_stmt = gsi_stmt (pattern_def_si);
377 pattern_def_stmt_info
378 = vinfo_for_stmt (pattern_def_stmt);
379 if (STMT_VINFO_RELEVANT_P (pattern_def_stmt_info)
380 || STMT_VINFO_LIVE_P (pattern_def_stmt_info))
381 break;
382 gsi_next (&pattern_def_si);
385 if (!gsi_end_p (pattern_def_si))
387 if (dump_enabled_p ())
389 dump_printf_loc (MSG_NOTE, vect_location,
390 "==> examining pattern def stmt: ");
391 dump_gimple_stmt (MSG_NOTE, TDF_SLIM,
392 pattern_def_stmt, 0);
393 dump_printf (MSG_NOTE, "\n");
396 stmt = pattern_def_stmt;
397 stmt_info = pattern_def_stmt_info;
399 else
401 pattern_def_si = gsi_none ();
402 analyze_pattern_stmt = false;
405 else
406 analyze_pattern_stmt = false;
409 if (gimple_get_lhs (stmt) == NULL_TREE
410 /* MASK_STORE has no lhs, but is ok. */
411 && (!is_gimple_call (stmt)
412 || !gimple_call_internal_p (stmt)
413 || gimple_call_internal_fn (stmt) != IFN_MASK_STORE))
415 if (is_gimple_call (stmt))
417 /* Ignore calls with no lhs. These must be calls to
418 #pragma omp simd functions, and the vectorization factor
419 they really need can't be determined until
420 vectorizable_simd_clone_call. */
421 if (!analyze_pattern_stmt && gsi_end_p (pattern_def_si))
423 pattern_def_seq = NULL;
424 gsi_next (&si);
426 continue;
428 if (dump_enabled_p ())
430 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
431 "not vectorized: irregular stmt.");
432 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt,
434 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
436 return false;
439 if (VECTOR_MODE_P (TYPE_MODE (gimple_expr_type (stmt))))
441 if (dump_enabled_p ())
443 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
444 "not vectorized: vector stmt in loop:");
445 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
446 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
448 return false;
451 if (STMT_VINFO_VECTYPE (stmt_info))
453 /* The only case in which a vectype has already been set is for stmts
454 that contain a dataref, or for "pattern-stmts" (stmts
455 generated by the vectorizer to represent/replace a certain
456 idiom). */
457 gcc_assert (STMT_VINFO_DATA_REF (stmt_info)
458 || is_pattern_stmt_p (stmt_info)
459 || !gsi_end_p (pattern_def_si));
460 vectype = STMT_VINFO_VECTYPE (stmt_info);
462 else
464 gcc_assert (!STMT_VINFO_DATA_REF (stmt_info));
465 if (is_gimple_call (stmt)
466 && gimple_call_internal_p (stmt)
467 && gimple_call_internal_fn (stmt) == IFN_MASK_STORE)
468 scalar_type = TREE_TYPE (gimple_call_arg (stmt, 3));
469 else
470 scalar_type = TREE_TYPE (gimple_get_lhs (stmt));
471 if (dump_enabled_p ())
473 dump_printf_loc (MSG_NOTE, vect_location,
474 "get vectype for scalar type: ");
475 dump_generic_expr (MSG_NOTE, TDF_SLIM, scalar_type);
476 dump_printf (MSG_NOTE, "\n");
478 vectype = get_vectype_for_scalar_type (scalar_type);
479 if (!vectype)
481 if (dump_enabled_p ())
483 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
484 "not vectorized: unsupported "
485 "data-type ");
486 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
487 scalar_type);
488 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
490 return false;
493 STMT_VINFO_VECTYPE (stmt_info) = vectype;
495 if (dump_enabled_p ())
497 dump_printf_loc (MSG_NOTE, vect_location, "vectype: ");
498 dump_generic_expr (MSG_NOTE, TDF_SLIM, vectype);
499 dump_printf (MSG_NOTE, "\n");
503 /* The vectorization factor is determined by the smallest
504 scalar type (or the largest vector size, but we only
505 support one vector size per loop). */
506 scalar_type = vect_get_smallest_scalar_type (stmt, &dummy,
507 &dummy);
508 if (dump_enabled_p ())
510 dump_printf_loc (MSG_NOTE, vect_location,
511 "get vectype for scalar type: ");
512 dump_generic_expr (MSG_NOTE, TDF_SLIM, scalar_type);
513 dump_printf (MSG_NOTE, "\n");
515 vf_vectype = get_vectype_for_scalar_type (scalar_type);
516 if (!vf_vectype)
518 if (dump_enabled_p ())
520 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
521 "not vectorized: unsupported data-type ");
522 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
523 scalar_type);
524 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
526 return false;
529 if ((GET_MODE_SIZE (TYPE_MODE (vectype))
530 != GET_MODE_SIZE (TYPE_MODE (vf_vectype))))
532 if (dump_enabled_p ())
534 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
535 "not vectorized: different sized vector "
536 "types in statement, ");
537 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
538 vectype);
539 dump_printf (MSG_MISSED_OPTIMIZATION, " and ");
540 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
541 vf_vectype);
542 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
544 return false;
547 if (dump_enabled_p ())
549 dump_printf_loc (MSG_NOTE, vect_location, "vectype: ");
550 dump_generic_expr (MSG_NOTE, TDF_SLIM, vf_vectype);
551 dump_printf (MSG_NOTE, "\n");
554 nunits = TYPE_VECTOR_SUBPARTS (vf_vectype);
555 if (dump_enabled_p ())
556 dump_printf_loc (MSG_NOTE, vect_location, "nunits = %d\n", nunits);
557 if (!vectorization_factor
558 || (nunits > vectorization_factor))
559 vectorization_factor = nunits;
561 if (!analyze_pattern_stmt && gsi_end_p (pattern_def_si))
563 pattern_def_seq = NULL;
564 gsi_next (&si);
569 /* TODO: Analyze cost. Decide if worth while to vectorize. */
570 if (dump_enabled_p ())
571 dump_printf_loc (MSG_NOTE, vect_location, "vectorization factor = %d\n",
572 vectorization_factor);
573 if (vectorization_factor <= 1)
575 if (dump_enabled_p ())
576 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
577 "not vectorized: unsupported data-type\n");
578 return false;
580 LOOP_VINFO_VECT_FACTOR (loop_vinfo) = vectorization_factor;
582 return true;
586 /* Function vect_is_simple_iv_evolution.
588 FORNOW: A simple evolution of an induction variable in the loop is
589 considered a polynomial evolution. */
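/* Illustrative examples (assuming the usual chrec notation of the scalar
   evolution analyzer): for

     for (i = init; i < n; i += step)

   the evolution of i in the loop is {init, +, step}_loop, whose evolution
   part is STEP; that is "simple".  An evolution part that is itself a chrec,
   e.g. {0, +, {1, +, 1}_1}_1 (a polynomial of degree 2), is rejected by the
   function below.  */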
591 static bool
592 vect_is_simple_iv_evolution (unsigned loop_nb, tree access_fn, tree * init,
593 tree * step)
595 tree init_expr;
596 tree step_expr;
597 tree evolution_part = evolution_part_in_loop_num (access_fn, loop_nb);
598 basic_block bb;
600 /* When there is no evolution in this loop, the evolution function
601 is not "simple". */
602 if (evolution_part == NULL_TREE)
603 return false;
605 /* When the evolution is a polynomial of degree >= 2
606 the evolution function is not "simple". */
607 if (tree_is_chrec (evolution_part))
608 return false;
610 step_expr = evolution_part;
611 init_expr = unshare_expr (initial_condition_in_loop_num (access_fn, loop_nb));
613 if (dump_enabled_p ())
615 dump_printf_loc (MSG_NOTE, vect_location, "step: ");
616 dump_generic_expr (MSG_NOTE, TDF_SLIM, step_expr);
617 dump_printf (MSG_NOTE, ", init: ");
618 dump_generic_expr (MSG_NOTE, TDF_SLIM, init_expr);
619 dump_printf (MSG_NOTE, "\n");
622 *init = init_expr;
623 *step = step_expr;
625 if (TREE_CODE (step_expr) != INTEGER_CST
626 && (TREE_CODE (step_expr) != SSA_NAME
627 || ((bb = gimple_bb (SSA_NAME_DEF_STMT (step_expr)))
628 && flow_bb_inside_loop_p (get_loop (cfun, loop_nb), bb))
629 || (!INTEGRAL_TYPE_P (TREE_TYPE (step_expr))
630 && (!SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr))
631 || !flag_associative_math)))
632 && (TREE_CODE (step_expr) != REAL_CST
633 || !flag_associative_math))
635 if (dump_enabled_p ())
636 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
637 "step unknown.\n");
638 return false;
641 return true;
644 /* Function vect_analyze_scalar_cycles_1.
646 Examine the cross iteration def-use cycles of scalar variables
647 in LOOP. LOOP_VINFO represents the loop that is now being
648 considered for vectorization (can be LOOP, or an outer-loop
649 enclosing LOOP). */
651 static void
652 vect_analyze_scalar_cycles_1 (loop_vec_info loop_vinfo, struct loop *loop)
654 basic_block bb = loop->header;
655 tree init, step;
656 auto_vec<gimple, 64> worklist;
657 gphi_iterator gsi;
658 bool double_reduc;
660 if (dump_enabled_p ())
661 dump_printf_loc (MSG_NOTE, vect_location,
662 "=== vect_analyze_scalar_cycles ===\n");
664 /* First - identify all inductions. Reduction detection assumes that all the
665 inductions have been identified, therefore, this order must not be
666 changed. */
667 for (gsi = gsi_start_phis (bb); !gsi_end_p (gsi); gsi_next (&gsi))
669 gphi *phi = gsi.phi ();
670 tree access_fn = NULL;
671 tree def = PHI_RESULT (phi);
672 stmt_vec_info stmt_vinfo = vinfo_for_stmt (phi);
674 if (dump_enabled_p ())
676 dump_printf_loc (MSG_NOTE, vect_location, "Analyze phi: ");
677 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, phi, 0);
678 dump_printf (MSG_NOTE, "\n");
681 /* Skip virtual phi's. The data dependences that are associated with
682 virtual defs/uses (i.e., memory accesses) are analyzed elsewhere. */
683 if (virtual_operand_p (def))
684 continue;
686 STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_unknown_def_type;
688 /* Analyze the evolution function. */
689 access_fn = analyze_scalar_evolution (loop, def);
690 if (access_fn)
692 STRIP_NOPS (access_fn);
693 if (dump_enabled_p ())
695 dump_printf_loc (MSG_NOTE, vect_location,
696 "Access function of PHI: ");
697 dump_generic_expr (MSG_NOTE, TDF_SLIM, access_fn);
698 dump_printf (MSG_NOTE, "\n");
700 STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_vinfo)
701 = evolution_part_in_loop_num (access_fn, loop->num);
704 if (!access_fn
705 || !vect_is_simple_iv_evolution (loop->num, access_fn, &init, &step)
706 || (LOOP_VINFO_LOOP (loop_vinfo) != loop
707 && TREE_CODE (step) != INTEGER_CST))
709 worklist.safe_push (phi);
710 continue;
713 gcc_assert (STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_vinfo) != NULL_TREE);
715 if (dump_enabled_p ())
716 dump_printf_loc (MSG_NOTE, vect_location, "Detected induction.\n");
717 STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_induction_def;
721 /* Second - identify all reductions and nested cycles. */
722 while (worklist.length () > 0)
724 gimple phi = worklist.pop ();
725 tree def = PHI_RESULT (phi);
726 stmt_vec_info stmt_vinfo = vinfo_for_stmt (phi);
727 gimple reduc_stmt;
728 bool nested_cycle;
730 if (dump_enabled_p ())
732 dump_printf_loc (MSG_NOTE, vect_location, "Analyze phi: ");
733 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, phi, 0);
734 dump_printf (MSG_NOTE, "\n");
737 gcc_assert (!virtual_operand_p (def)
738 && STMT_VINFO_DEF_TYPE (stmt_vinfo) == vect_unknown_def_type);
740 nested_cycle = (loop != LOOP_VINFO_LOOP (loop_vinfo));
741 reduc_stmt = vect_force_simple_reduction (loop_vinfo, phi, !nested_cycle,
742 &double_reduc);
743 if (reduc_stmt)
745 if (double_reduc)
747 if (dump_enabled_p ())
748 dump_printf_loc (MSG_NOTE, vect_location,
749 "Detected double reduction.\n");
751 STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_double_reduction_def;
752 STMT_VINFO_DEF_TYPE (vinfo_for_stmt (reduc_stmt)) =
753 vect_double_reduction_def;
755 else
757 if (nested_cycle)
759 if (dump_enabled_p ())
760 dump_printf_loc (MSG_NOTE, vect_location,
761 "Detected vectorizable nested cycle.\n");
763 STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_nested_cycle;
764 STMT_VINFO_DEF_TYPE (vinfo_for_stmt (reduc_stmt)) =
765 vect_nested_cycle;
767 else
769 if (dump_enabled_p ())
770 dump_printf_loc (MSG_NOTE, vect_location,
771 "Detected reduction.\n");
773 STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_reduction_def;
774 STMT_VINFO_DEF_TYPE (vinfo_for_stmt (reduc_stmt)) =
775 vect_reduction_def;
776 /* Store the reduction cycles for possible vectorization in
777 loop-aware SLP. */
778 LOOP_VINFO_REDUCTIONS (loop_vinfo).safe_push (reduc_stmt);
782 else
783 if (dump_enabled_p ())
784 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
785 "Unknown def-use cycle pattern.\n");
790 /* Function vect_analyze_scalar_cycles.
792 Examine the cross iteration def-use cycles of scalar variables, by
793 analyzing the loop-header PHIs of scalar variables. Classify each
794 cycle as one of the following: invariant, induction, reduction, unknown.
795 We do that for the loop represented by LOOP_VINFO, and also for its
796 inner-loop, if it exists.
797 Examples for scalar cycles:
799 Example1: reduction:
801 loop1:
802 for (i=0; i<N; i++)
803 sum += a[i];
805 Example2: induction:
807 loop2:
808 for (i=0; i<N; i++)
809 a[i] = i; */
811 static void
812 vect_analyze_scalar_cycles (loop_vec_info loop_vinfo)
814 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
816 vect_analyze_scalar_cycles_1 (loop_vinfo, loop);
818 /* When vectorizing an outer-loop, the inner-loop is executed sequentially.
819 Reductions in such inner-loop therefore have different properties than
820 the reductions in the nest that gets vectorized:
821 1. When vectorized, they are executed in the same order as in the original
822 scalar loop, so we can't change the order of computation when
823 vectorizing them.
824 2. FIXME: Inner-loop reductions can be used in the inner-loop, so the
825 current checks are too strict. */
827 if (loop->inner)
828 vect_analyze_scalar_cycles_1 (loop_vinfo, loop->inner);
832 /* Function vect_get_loop_niters.
834 Determine how many iterations the loop is executed and place it
835 in NUMBER_OF_ITERATIONS. Place the number of latch iterations
836 in NUMBER_OF_ITERATIONSM1.
838 Return the loop exit condition. */
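/* Illustrative numbers (assumed): for a do-while form whose body executes 4
   times, the latch edge is taken 3 times, so NUMBER_OF_ITERATIONSM1 is 3 and
   NUMBER_OF_ITERATIONS is 3 + 1 = 4 header executions, computed by the
   PLUS_EXPR folding below.  */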
841 static gcond *
842 vect_get_loop_niters (struct loop *loop, tree *number_of_iterations,
843 tree *number_of_iterationsm1)
845 tree niters;
847 if (dump_enabled_p ())
848 dump_printf_loc (MSG_NOTE, vect_location,
849 "=== get_loop_niters ===\n");
851 niters = number_of_latch_executions (loop);
852 *number_of_iterationsm1 = niters;
854 /* We want the number of loop header executions which is the number
855 of latch executions plus one.
856 ??? For UINT_MAX latch executions this number overflows to zero
857 for loops like do { n++; } while (n != 0); */
858 if (niters && !chrec_contains_undetermined (niters))
859 niters = fold_build2 (PLUS_EXPR, TREE_TYPE (niters), unshare_expr (niters),
860 build_int_cst (TREE_TYPE (niters), 1));
861 *number_of_iterations = niters;
863 return get_loop_exit_condition (loop);
867 /* Function bb_in_loop_p
869 Used as predicate for dfs order traversal of the loop bbs. */
871 static bool
872 bb_in_loop_p (const_basic_block bb, const void *data)
874 const struct loop *const loop = (const struct loop *)data;
875 if (flow_bb_inside_loop_p (loop, bb))
876 return true;
877 return false;
881 /* Function new_loop_vec_info.
883 Create and initialize a new loop_vec_info struct for LOOP, as well as
884 stmt_vec_info structs for all the stmts in LOOP. */
886 static loop_vec_info
887 new_loop_vec_info (struct loop *loop)
889 loop_vec_info res;
890 basic_block *bbs;
891 gimple_stmt_iterator si;
892 unsigned int i, nbbs;
894 res = (loop_vec_info) xcalloc (1, sizeof (struct _loop_vec_info));
895 LOOP_VINFO_LOOP (res) = loop;
897 bbs = get_loop_body (loop);
899 /* Create/Update stmt_info for all stmts in the loop. */
900 for (i = 0; i < loop->num_nodes; i++)
902 basic_block bb = bbs[i];
904 /* BBs in a nested inner-loop will have been already processed (because
905 we will have called vect_analyze_loop_form for any nested inner-loop).
906 Therefore, for stmts in an inner-loop we just want to update the
907 STMT_VINFO_LOOP_VINFO field of their stmt_info to point to the new
908 loop_info of the outer-loop we are currently considering to vectorize
909 (instead of the loop_info of the inner-loop).
910 For stmts in other BBs we need to create a stmt_info from scratch. */
911 if (bb->loop_father != loop)
913 /* Inner-loop bb. */
914 gcc_assert (loop->inner && bb->loop_father == loop->inner);
915 for (si = gsi_start_phis (bb); !gsi_end_p (si); gsi_next (&si))
917 gimple phi = gsi_stmt (si);
918 stmt_vec_info stmt_info = vinfo_for_stmt (phi);
919 loop_vec_info inner_loop_vinfo =
920 STMT_VINFO_LOOP_VINFO (stmt_info);
921 gcc_assert (loop->inner == LOOP_VINFO_LOOP (inner_loop_vinfo));
922 STMT_VINFO_LOOP_VINFO (stmt_info) = res;
924 for (si = gsi_start_bb (bb); !gsi_end_p (si); gsi_next (&si))
926 gimple stmt = gsi_stmt (si);
927 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
928 loop_vec_info inner_loop_vinfo =
929 STMT_VINFO_LOOP_VINFO (stmt_info);
930 gcc_assert (loop->inner == LOOP_VINFO_LOOP (inner_loop_vinfo));
931 STMT_VINFO_LOOP_VINFO (stmt_info) = res;
934 else
936 /* bb in current nest. */
937 for (si = gsi_start_phis (bb); !gsi_end_p (si); gsi_next (&si))
939 gimple phi = gsi_stmt (si);
940 gimple_set_uid (phi, 0);
941 set_vinfo_for_stmt (phi, new_stmt_vec_info (phi, res, NULL));
944 for (si = gsi_start_bb (bb); !gsi_end_p (si); gsi_next (&si))
946 gimple stmt = gsi_stmt (si);
947 gimple_set_uid (stmt, 0);
948 set_vinfo_for_stmt (stmt, new_stmt_vec_info (stmt, res, NULL));
953 /* CHECKME: We want to visit all BBs before their successors (except for
954 latch blocks, for which this assertion wouldn't hold). In the simple
955 case of the loop forms we allow, a dfs order of the BBs would be the same
956 as reversed postorder traversal, so we are safe. */
958 free (bbs);
959 bbs = XCNEWVEC (basic_block, loop->num_nodes);
960 nbbs = dfs_enumerate_from (loop->header, 0, bb_in_loop_p,
961 bbs, loop->num_nodes, loop);
962 gcc_assert (nbbs == loop->num_nodes);
964 LOOP_VINFO_BBS (res) = bbs;
965 LOOP_VINFO_NITERSM1 (res) = NULL;
966 LOOP_VINFO_NITERS (res) = NULL;
967 LOOP_VINFO_NITERS_UNCHANGED (res) = NULL;
968 LOOP_VINFO_COST_MODEL_MIN_ITERS (res) = 0;
969 LOOP_VINFO_COST_MODEL_THRESHOLD (res) = 0;
970 LOOP_VINFO_VECTORIZABLE_P (res) = 0;
971 LOOP_VINFO_PEELING_FOR_ALIGNMENT (res) = 0;
972 LOOP_VINFO_VECT_FACTOR (res) = 0;
973 LOOP_VINFO_LOOP_NEST (res).create (3);
974 LOOP_VINFO_DATAREFS (res).create (10);
975 LOOP_VINFO_DDRS (res).create (10 * 10);
976 LOOP_VINFO_UNALIGNED_DR (res) = NULL;
977 LOOP_VINFO_MAY_MISALIGN_STMTS (res).create (
978 PARAM_VALUE (PARAM_VECT_MAX_VERSION_FOR_ALIGNMENT_CHECKS));
979 LOOP_VINFO_MAY_ALIAS_DDRS (res).create (
980 PARAM_VALUE (PARAM_VECT_MAX_VERSION_FOR_ALIAS_CHECKS));
981 LOOP_VINFO_GROUPED_STORES (res).create (10);
982 LOOP_VINFO_REDUCTIONS (res).create (10);
983 LOOP_VINFO_REDUCTION_CHAINS (res).create (10);
984 LOOP_VINFO_SLP_INSTANCES (res).create (10);
985 LOOP_VINFO_SLP_UNROLLING_FACTOR (res) = 1;
986 LOOP_VINFO_TARGET_COST_DATA (res) = init_cost (loop);
987 LOOP_VINFO_PEELING_FOR_GAPS (res) = false;
988 LOOP_VINFO_PEELING_FOR_NITER (res) = false;
989 LOOP_VINFO_OPERANDS_SWAPPED (res) = false;
991 return res;
995 /* Function destroy_loop_vec_info.
997 Free LOOP_VINFO struct, as well as all the stmt_vec_info structs of all the
998 stmts in the loop. */
1000 void
1001 destroy_loop_vec_info (loop_vec_info loop_vinfo, bool clean_stmts)
1003 struct loop *loop;
1004 basic_block *bbs;
1005 int nbbs;
1006 gimple_stmt_iterator si;
1007 int j;
1008 vec<slp_instance> slp_instances;
1009 slp_instance instance;
1010 bool swapped;
1012 if (!loop_vinfo)
1013 return;
1015 loop = LOOP_VINFO_LOOP (loop_vinfo);
1017 bbs = LOOP_VINFO_BBS (loop_vinfo);
1018 nbbs = clean_stmts ? loop->num_nodes : 0;
1019 swapped = LOOP_VINFO_OPERANDS_SWAPPED (loop_vinfo);
1021 for (j = 0; j < nbbs; j++)
1023 basic_block bb = bbs[j];
1024 for (si = gsi_start_phis (bb); !gsi_end_p (si); gsi_next (&si))
1025 free_stmt_vec_info (gsi_stmt (si));
1027 for (si = gsi_start_bb (bb); !gsi_end_p (si); )
1029 gimple stmt = gsi_stmt (si);
1031 /* We may have broken canonical form by moving a constant
1032 into RHS1 of a commutative op. Fix such occurrences. */
1033 if (swapped && is_gimple_assign (stmt))
1035 enum tree_code code = gimple_assign_rhs_code (stmt);
1037 if ((code == PLUS_EXPR
1038 || code == POINTER_PLUS_EXPR
1039 || code == MULT_EXPR)
1040 && CONSTANT_CLASS_P (gimple_assign_rhs1 (stmt)))
1041 swap_ssa_operands (stmt,
1042 gimple_assign_rhs1_ptr (stmt),
1043 gimple_assign_rhs2_ptr (stmt));
1046 /* Free stmt_vec_info. */
1047 free_stmt_vec_info (stmt);
1048 gsi_next (&si);
1052 free (LOOP_VINFO_BBS (loop_vinfo));
1053 vect_destroy_datarefs (loop_vinfo, NULL);
1054 free_dependence_relations (LOOP_VINFO_DDRS (loop_vinfo));
1055 LOOP_VINFO_LOOP_NEST (loop_vinfo).release ();
1056 LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).release ();
1057 LOOP_VINFO_MAY_ALIAS_DDRS (loop_vinfo).release ();
1058 slp_instances = LOOP_VINFO_SLP_INSTANCES (loop_vinfo);
1059 FOR_EACH_VEC_ELT (slp_instances, j, instance)
1060 vect_free_slp_instance (instance);
1062 LOOP_VINFO_SLP_INSTANCES (loop_vinfo).release ();
1063 LOOP_VINFO_GROUPED_STORES (loop_vinfo).release ();
1064 LOOP_VINFO_REDUCTIONS (loop_vinfo).release ();
1065 LOOP_VINFO_REDUCTION_CHAINS (loop_vinfo).release ();
1067 delete LOOP_VINFO_PEELING_HTAB (loop_vinfo);
1068 LOOP_VINFO_PEELING_HTAB (loop_vinfo) = NULL;
1070 destroy_cost_data (LOOP_VINFO_TARGET_COST_DATA (loop_vinfo));
1072 free (loop_vinfo);
1073 loop->aux = NULL;
1077 /* Function vect_analyze_loop_1.
1079 Apply a set of analyses on LOOP, and create a loop_vec_info struct
1080 for it. The different analyses will record information in the
1081 loop_vec_info struct. This is a subset of the analyses applied in
1082 vect_analyze_loop, to be applied on an inner-loop nested in the loop
1083 that is now considered for (outer-loop) vectorization. */
1085 static loop_vec_info
1086 vect_analyze_loop_1 (struct loop *loop)
1088 loop_vec_info loop_vinfo;
1090 if (dump_enabled_p ())
1091 dump_printf_loc (MSG_NOTE, vect_location,
1092 "===== analyze_loop_nest_1 =====\n");
1094 /* Check the CFG characteristics of the loop (nesting, entry/exit, etc.). */
1096 loop_vinfo = vect_analyze_loop_form (loop);
1097 if (!loop_vinfo)
1099 if (dump_enabled_p ())
1100 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1101 "bad inner-loop form.\n");
1102 return NULL;
1105 return loop_vinfo;
1109 /* Function vect_analyze_loop_form.
1111 Verify that certain CFG restrictions hold, including:
1112 - the loop has a pre-header
1113 - the loop has a single entry and exit
1114 - the loop exit condition is simple enough, and the number of iterations
1115 can be analyzed (a countable loop). */
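/* Illustrative source-level examples (assumed, not taken from this file):
   an inner-most loop of the accepted form has exactly two basic blocks
   (header and latch), e.g.

     for (i = 0; i < n; i++)
       a[i] = b[i] + c[i];

   whereas an if that is still present in the body adds basic blocks and
   trips the "control flow in loop" check below.  */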
1117 loop_vec_info
1118 vect_analyze_loop_form (struct loop *loop)
1120 loop_vec_info loop_vinfo;
1121 gcond *loop_cond;
1122 tree number_of_iterations = NULL, number_of_iterationsm1 = NULL;
1123 loop_vec_info inner_loop_vinfo = NULL;
1125 if (dump_enabled_p ())
1126 dump_printf_loc (MSG_NOTE, vect_location,
1127 "=== vect_analyze_loop_form ===\n");
1129 /* Different restrictions apply when we are considering an inner-most loop,
1130 vs. an outer (nested) loop.
1131 (FORNOW. May want to relax some of these restrictions in the future). */
1133 if (!loop->inner)
1135 /* Inner-most loop. We currently require that the number of BBs is
1136 exactly 2 (the header and latch). Vectorizable inner-most loops
1137 look like this:
1139 (pre-header)
1141 header <--------+
1142 | | |
1143 | +--> latch --+
1145 (exit-bb) */
1147 if (loop->num_nodes != 2)
1149 if (dump_enabled_p ())
1150 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1151 "not vectorized: control flow in loop.\n");
1152 return NULL;
1155 if (empty_block_p (loop->header))
1157 if (dump_enabled_p ())
1158 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1159 "not vectorized: empty loop.\n");
1160 return NULL;
1163 else
1165 struct loop *innerloop = loop->inner;
1166 edge entryedge;
1168 /* Nested loop. We currently require that the loop is doubly-nested,
1169 contains a single inner loop, and the number of BBs is exactly 5.
1170 Vectorizable outer-loops look like this:
1172 (pre-header)
1174 header <---+
1176 inner-loop |
1178 tail ------+
1180 (exit-bb)
1182 The inner-loop has the properties expected of inner-most loops
1183 as described above. */
1185 if ((loop->inner)->inner || (loop->inner)->next)
1187 if (dump_enabled_p ())
1188 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1189 "not vectorized: multiple nested loops.\n");
1190 return NULL;
1193 /* Analyze the inner-loop. */
1194 inner_loop_vinfo = vect_analyze_loop_1 (loop->inner);
1195 if (!inner_loop_vinfo)
1197 if (dump_enabled_p ())
1198 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1199 "not vectorized: Bad inner loop.\n");
1200 return NULL;
1203 if (!expr_invariant_in_loop_p (loop,
1204 LOOP_VINFO_NITERS (inner_loop_vinfo)))
1206 if (dump_enabled_p ())
1207 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1208 "not vectorized: inner-loop count not"
1209 " invariant.\n");
1210 destroy_loop_vec_info (inner_loop_vinfo, true);
1211 return NULL;
1214 if (loop->num_nodes != 5)
1216 if (dump_enabled_p ())
1217 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1218 "not vectorized: control flow in loop.\n");
1219 destroy_loop_vec_info (inner_loop_vinfo, true);
1220 return NULL;
1223 gcc_assert (EDGE_COUNT (innerloop->header->preds) == 2);
1224 entryedge = EDGE_PRED (innerloop->header, 0);
1225 if (EDGE_PRED (innerloop->header, 0)->src == innerloop->latch)
1226 entryedge = EDGE_PRED (innerloop->header, 1);
1228 if (entryedge->src != loop->header
1229 || !single_exit (innerloop)
1230 || single_exit (innerloop)->dest != EDGE_PRED (loop->latch, 0)->src)
1232 if (dump_enabled_p ())
1233 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1234 "not vectorized: unsupported outerloop form.\n");
1235 destroy_loop_vec_info (inner_loop_vinfo, true);
1236 return NULL;
1239 if (dump_enabled_p ())
1240 dump_printf_loc (MSG_NOTE, vect_location,
1241 "Considering outer-loop vectorization.\n");
1244 if (!single_exit (loop)
1245 || EDGE_COUNT (loop->header->preds) != 2)
1247 if (dump_enabled_p ())
1249 if (!single_exit (loop))
1250 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1251 "not vectorized: multiple exits.\n");
1252 else if (EDGE_COUNT (loop->header->preds) != 2)
1253 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1254 "not vectorized: too many incoming edges.\n");
1256 if (inner_loop_vinfo)
1257 destroy_loop_vec_info (inner_loop_vinfo, true);
1258 return NULL;
1261 /* We assume that the loop exit condition is at the end of the loop, i.e.,
1262 that the loop is represented as a do-while (with a proper if-guard
1263 before the loop if needed), where the loop header contains all the
1264 executable statements, and the latch is empty. */
1265 if (!empty_block_p (loop->latch)
1266 || !gimple_seq_empty_p (phi_nodes (loop->latch)))
1268 if (dump_enabled_p ())
1269 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1270 "not vectorized: latch block not empty.\n");
1271 if (inner_loop_vinfo)
1272 destroy_loop_vec_info (inner_loop_vinfo, true);
1273 return NULL;
1276 /* Make sure there exists a single-predecessor exit bb: */
1277 if (!single_pred_p (single_exit (loop)->dest))
1279 edge e = single_exit (loop);
1280 if (!(e->flags & EDGE_ABNORMAL))
1282 split_loop_exit_edge (e);
1283 if (dump_enabled_p ())
1284 dump_printf (MSG_NOTE, "split exit edge.\n");
1286 else
1288 if (dump_enabled_p ())
1289 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1290 "not vectorized: abnormal loop exit edge.\n");
1291 if (inner_loop_vinfo)
1292 destroy_loop_vec_info (inner_loop_vinfo, true);
1293 return NULL;
1297 loop_cond = vect_get_loop_niters (loop, &number_of_iterations,
1298 &number_of_iterationsm1);
1299 if (!loop_cond)
1301 if (dump_enabled_p ())
1302 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1303 "not vectorized: complicated exit condition.\n");
1304 if (inner_loop_vinfo)
1305 destroy_loop_vec_info (inner_loop_vinfo, true);
1306 return NULL;
1309 if (!number_of_iterations
1310 || chrec_contains_undetermined (number_of_iterations))
1312 if (dump_enabled_p ())
1313 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1314 "not vectorized: number of iterations cannot be "
1315 "computed.\n");
1316 if (inner_loop_vinfo)
1317 destroy_loop_vec_info (inner_loop_vinfo, true);
1318 return NULL;
1321 if (integer_zerop (number_of_iterations))
1323 if (dump_enabled_p ())
1324 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1325 "not vectorized: number of iterations = 0.\n");
1326 if (inner_loop_vinfo)
1327 destroy_loop_vec_info (inner_loop_vinfo, true);
1328 return NULL;
1331 loop_vinfo = new_loop_vec_info (loop);
1332 LOOP_VINFO_NITERSM1 (loop_vinfo) = number_of_iterationsm1;
1333 LOOP_VINFO_NITERS (loop_vinfo) = number_of_iterations;
1334 LOOP_VINFO_NITERS_UNCHANGED (loop_vinfo) = number_of_iterations;
1336 if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
1338 if (dump_enabled_p ())
1340 dump_printf_loc (MSG_NOTE, vect_location,
1341 "Symbolic number of iterations is ");
1342 dump_generic_expr (MSG_NOTE, TDF_DETAILS, number_of_iterations);
1343 dump_printf (MSG_NOTE, "\n");
1347 STMT_VINFO_TYPE (vinfo_for_stmt (loop_cond)) = loop_exit_ctrl_vec_info_type;
1349 /* CHECKME: May want to keep it around it in the future. */
1350 if (inner_loop_vinfo)
1351 destroy_loop_vec_info (inner_loop_vinfo, false);
1353 gcc_assert (!loop->aux);
1354 loop->aux = loop_vinfo;
1355 return loop_vinfo;
1358 /* Scan the loop stmts and, depending on whether there are any non-SLP
1359 statements, update the vectorization factor. */
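/* Worked example of the update (illustrative values): if the loop VF
   computed so far is 4 and the SLP instances need an unrolling factor of 2,
   a hybrid loop gets VF = least_common_multiple (4, 2) = 4; with an SLP
   unrolling factor of 3 it would become 12.  A pure-SLP loop simply takes
   the SLP unrolling factor as its VF.  */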
1361 static void
1362 vect_update_vf_for_slp (loop_vec_info loop_vinfo)
1364 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1365 basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
1366 int nbbs = loop->num_nodes;
1367 unsigned int vectorization_factor;
1368 int i;
1370 if (dump_enabled_p ())
1371 dump_printf_loc (MSG_NOTE, vect_location,
1372 "=== vect_update_vf_for_slp ===\n");
1374 vectorization_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
1375 gcc_assert (vectorization_factor != 0);
1377 /* If all the stmts in the loop can be SLPed, we perform only SLP, and
1378 vectorization factor of the loop is the unrolling factor required by
1379 the SLP instances. If that unrolling factor is 1, we say that we
1380 perform pure SLP on the loop; cross-iteration parallelism is not
1381 exploited. */
1382 bool only_slp_in_loop = true;
1383 for (i = 0; i < nbbs; i++)
1385 basic_block bb = bbs[i];
1386 for (gimple_stmt_iterator si = gsi_start_bb (bb); !gsi_end_p (si);
1387 gsi_next (&si))
1389 gimple stmt = gsi_stmt (si);
1390 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
1391 if (STMT_VINFO_IN_PATTERN_P (stmt_info)
1392 && STMT_VINFO_RELATED_STMT (stmt_info))
1394 stmt = STMT_VINFO_RELATED_STMT (stmt_info);
1395 stmt_info = vinfo_for_stmt (stmt);
1397 if ((STMT_VINFO_RELEVANT_P (stmt_info)
1398 || VECTORIZABLE_CYCLE_DEF (STMT_VINFO_DEF_TYPE (stmt_info)))
1399 && !PURE_SLP_STMT (stmt_info))
1400 /* STMT needs both SLP and loop-based vectorization. */
1401 only_slp_in_loop = false;
1405 if (only_slp_in_loop)
1406 vectorization_factor = LOOP_VINFO_SLP_UNROLLING_FACTOR (loop_vinfo);
1407 else
1408 vectorization_factor
1409 = least_common_multiple (vectorization_factor,
1410 LOOP_VINFO_SLP_UNROLLING_FACTOR (loop_vinfo));
1412 LOOP_VINFO_VECT_FACTOR (loop_vinfo) = vectorization_factor;
1413 if (dump_enabled_p ())
1414 dump_printf_loc (MSG_NOTE, vect_location,
1415 "Updating vectorization factor to %d\n",
1416 vectorization_factor);
1419 /* Function vect_analyze_loop_operations.
1421 Scan the loop stmts and make sure they are all vectorizable. */
1423 static bool
1424 vect_analyze_loop_operations (loop_vec_info loop_vinfo)
1426 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1427 basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
1428 int nbbs = loop->num_nodes;
1429 unsigned int vectorization_factor;
1430 int i;
1431 stmt_vec_info stmt_info;
1432 bool need_to_vectorize = false;
1433 int min_profitable_iters;
1434 int min_scalar_loop_bound;
1435 unsigned int th;
1436 bool ok;
1437 HOST_WIDE_INT max_niter;
1438 HOST_WIDE_INT estimated_niter;
1439 int min_profitable_estimate;
1441 if (dump_enabled_p ())
1442 dump_printf_loc (MSG_NOTE, vect_location,
1443 "=== vect_analyze_loop_operations ===\n");
1445 for (i = 0; i < nbbs; i++)
1447 basic_block bb = bbs[i];
1449 for (gphi_iterator si = gsi_start_phis (bb); !gsi_end_p (si);
1450 gsi_next (&si))
1452 gphi *phi = si.phi ();
1453 ok = true;
1455 stmt_info = vinfo_for_stmt (phi);
1456 if (dump_enabled_p ())
1458 dump_printf_loc (MSG_NOTE, vect_location, "examining phi: ");
1459 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, phi, 0);
1460 dump_printf (MSG_NOTE, "\n");
1463 /* Inner-loop loop-closed exit phi in outer-loop vectorization
1464 (i.e., a phi in the tail of the outer-loop). */
1465 if (! is_loop_header_bb_p (bb))
1467 /* FORNOW: we currently don't support the case that these phis
1468 are not used in the outer loop (unless it is a double reduction,
1469 i.e., this phi is vect_reduction_def), because this case
1470 requires actually doing something here. */
1471 if ((!STMT_VINFO_RELEVANT_P (stmt_info)
1472 || STMT_VINFO_LIVE_P (stmt_info))
1473 && STMT_VINFO_DEF_TYPE (stmt_info)
1474 != vect_double_reduction_def)
1476 if (dump_enabled_p ())
1477 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1478 "Unsupported loop-closed phi in "
1479 "outer-loop.\n");
1480 return false;
1483 /* If PHI is used in the outer loop, we check that its operand
1484 is defined in the inner loop. */
1485 if (STMT_VINFO_RELEVANT_P (stmt_info))
1487 tree phi_op;
1488 gimple op_def_stmt;
1490 if (gimple_phi_num_args (phi) != 1)
1491 return false;
1493 phi_op = PHI_ARG_DEF (phi, 0);
1494 if (TREE_CODE (phi_op) != SSA_NAME)
1495 return false;
1497 op_def_stmt = SSA_NAME_DEF_STMT (phi_op);
1498 if (gimple_nop_p (op_def_stmt)
1499 || !flow_bb_inside_loop_p (loop, gimple_bb (op_def_stmt))
1500 || !vinfo_for_stmt (op_def_stmt))
1501 return false;
1503 if (STMT_VINFO_RELEVANT (vinfo_for_stmt (op_def_stmt))
1504 != vect_used_in_outer
1505 && STMT_VINFO_RELEVANT (vinfo_for_stmt (op_def_stmt))
1506 != vect_used_in_outer_by_reduction)
1507 return false;
1510 continue;
1513 gcc_assert (stmt_info);
1515 if (STMT_VINFO_LIVE_P (stmt_info))
1517 /* FORNOW: not yet supported. */
1518 if (dump_enabled_p ())
1519 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1520 "not vectorized: value used after loop.\n");
1521 return false;
1524 if (STMT_VINFO_RELEVANT (stmt_info) == vect_used_in_scope
1525 && STMT_VINFO_DEF_TYPE (stmt_info) != vect_induction_def)
1527 /* A scalar-dependence cycle that we don't support. */
1528 if (dump_enabled_p ())
1529 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1530 "not vectorized: scalar dependence cycle.\n");
1531 return false;
1534 if (STMT_VINFO_RELEVANT_P (stmt_info))
1536 need_to_vectorize = true;
1537 if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_induction_def)
1538 ok = vectorizable_induction (phi, NULL, NULL);
1541 if (!ok)
1543 if (dump_enabled_p ())
1545 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1546 "not vectorized: relevant phi not "
1547 "supported: ");
1548 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, phi, 0);
1549 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
1551 return false;
1555 for (gimple_stmt_iterator si = gsi_start_bb (bb); !gsi_end_p (si);
1556 gsi_next (&si))
1558 gimple stmt = gsi_stmt (si);
1559 if (!gimple_clobber_p (stmt)
1560 && !vect_analyze_stmt (stmt, &need_to_vectorize, NULL))
1561 return false;
1563 } /* bbs */
1565 /* All operations in the loop are either irrelevant (they deal with loop
1566 control, or are dead), or only used outside the loop and can be moved
1567 out of the loop (e.g. invariants, inductions). The loop can be
1568 optimized away by scalar optimizations. We're better off not
1569 touching this loop. */
1570 if (!need_to_vectorize)
1572 if (dump_enabled_p ())
1573 dump_printf_loc (MSG_NOTE, vect_location,
1574 "All the computation can be taken out of the loop.\n");
1575 if (dump_enabled_p ())
1576 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1577 "not vectorized: redundant loop. no profit to "
1578 "vectorize.\n");
1579 return false;
1582 vectorization_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
1583 gcc_assert (vectorization_factor != 0);
1585 if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo) && dump_enabled_p ())
1586 dump_printf_loc (MSG_NOTE, vect_location,
1587 "vectorization_factor = %d, niters = "
1588 HOST_WIDE_INT_PRINT_DEC "\n", vectorization_factor,
1589 LOOP_VINFO_INT_NITERS (loop_vinfo));
1591 if ((LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
1592 && (LOOP_VINFO_INT_NITERS (loop_vinfo) < vectorization_factor))
1593 || ((max_niter = max_stmt_executions_int (loop)) != -1
1594 && (unsigned HOST_WIDE_INT) max_niter < vectorization_factor))
1596 if (dump_enabled_p ())
1597 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1598 "not vectorized: iteration count too small.\n");
1599 if (dump_enabled_p ())
1600 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1601 "not vectorized: iteration count smaller than "
1602 "vectorization factor.\n");
1603 return false;
1606 /* Analyze cost. Decide if worth while to vectorize. */
1608 vect_estimate_min_profitable_iters (loop_vinfo, &min_profitable_iters,
1609 &min_profitable_estimate);
1610 LOOP_VINFO_COST_MODEL_MIN_ITERS (loop_vinfo) = min_profitable_iters;
1612 if (min_profitable_iters < 0)
1614 if (dump_enabled_p ())
1615 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1616 "not vectorized: vectorization not profitable.\n");
1617 if (dump_enabled_p ())
1618 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1619 "not vectorized: vector version will never be "
1620 "profitable.\n");
1621 return false;
1624 min_scalar_loop_bound = ((PARAM_VALUE (PARAM_MIN_VECT_LOOP_BOUND)
1625 * vectorization_factor) - 1);
1628 /* Use the cost model only if it is more conservative than the user-specified
1629 threshold. */
1631 th = (unsigned) min_scalar_loop_bound;
1632 if (min_profitable_iters
1633 && (!min_scalar_loop_bound
1634 || min_profitable_iters > min_scalar_loop_bound))
1635 th = (unsigned) min_profitable_iters;
1637 LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo) = th;
1639 if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
1640 && LOOP_VINFO_INT_NITERS (loop_vinfo) <= th)
1642 if (dump_enabled_p ())
1643 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1644 "not vectorized: vectorization not profitable.\n");
1645 if (dump_enabled_p ())
1646 dump_printf_loc (MSG_NOTE, vect_location,
1647 "not vectorized: iteration count smaller than user "
1648 "specified loop bound parameter or minimum profitable "
1649 "iterations (whichever is more conservative).\n");
1650 return false;
1653 if ((estimated_niter = estimated_stmt_executions_int (loop)) != -1
1654 && ((unsigned HOST_WIDE_INT) estimated_niter
1655 <= MAX (th, (unsigned)min_profitable_estimate)))
1657 if (dump_enabled_p ())
1658 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1659 "not vectorized: estimated iteration count too "
1660 "small.\n");
1661 if (dump_enabled_p ())
1662 dump_printf_loc (MSG_NOTE, vect_location,
1663 "not vectorized: estimated iteration count smaller "
1664 "than specified loop bound parameter or minimum "
1665 "profitable iterations (whichever is more "
1666 "conservative).\n");
1667 return false;
1670 return true;
1674 /* Function vect_analyze_loop_2.
1676 Apply a set of analyses on LOOP, and create a loop_vec_info struct
1677 for it. The different analyses will record information in the
1678 loop_vec_info struct. */
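/* Rough order of the analyses below (summary of the calls that follow):
   data references -> scalar cycles -> pattern recognition -> access
   patterns -> relevance marking -> dependences -> vectorization factor ->
   alignment -> runtime alias pruning -> alignment enhancement -> SLP ->
   remaining loop operations -> epilogue decision.  */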
1679 static bool
1680 vect_analyze_loop_2 (loop_vec_info loop_vinfo)
1682 bool ok;
1683 int max_vf = MAX_VECTORIZATION_FACTOR;
1684 int min_vf = 2;
1685 unsigned int th;
1686 unsigned int n_stmts = 0;
1688 /* Find all data references in the loop (which correspond to vdefs/vuses)
1689 and analyze their evolution in the loop. Also adjust the minimal
1690 vectorization factor according to the loads and stores.
1692 FORNOW: Handle only simple array references whose
1693 alignment can be forced, and aligned pointer references. */
1695 ok = vect_analyze_data_refs (loop_vinfo, NULL, &min_vf, &n_stmts);
1696 if (!ok)
1698 if (dump_enabled_p ())
1699 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1700 "bad data references.\n");
1701 return false;
1704 /* Classify all cross-iteration scalar data-flow cycles.
1705 Cross-iteration cycles caused by virtual phis are analyzed separately. */
1707 vect_analyze_scalar_cycles (loop_vinfo);
1709 vect_pattern_recog (loop_vinfo, NULL);
1711 /* Analyze the access patterns of the data-refs in the loop (consecutive,
1712 complex, etc.). FORNOW: Only handle consecutive access pattern. */
1714 ok = vect_analyze_data_ref_accesses (loop_vinfo, NULL);
1715 if (!ok)
1717 if (dump_enabled_p ())
1718 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1719 "bad data access.\n");
1720 return false;
1723 /* Data-flow analysis to detect stmts that do not need to be vectorized. */
1725 ok = vect_mark_stmts_to_be_vectorized (loop_vinfo);
1726 if (!ok)
1728 if (dump_enabled_p ())
1729 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1730 "unexpected pattern.\n");
1731 return false;
1734 /* Analyze data dependences between the data-refs in the loop
1735 and adjust the maximum vectorization factor according to
1736 the dependences.
1737 FORNOW: fail at the first data dependence that we encounter. */
1739 ok = vect_analyze_data_ref_dependences (loop_vinfo, &max_vf);
1740 if (!ok
1741 || max_vf < min_vf)
1743 if (dump_enabled_p ())
1744 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1745 "bad data dependence.\n");
1746 return false;
1749 ok = vect_determine_vectorization_factor (loop_vinfo);
1750 if (!ok)
1752 if (dump_enabled_p ())
1753 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1754 "can't determine vectorization factor.\n");
1755 return false;
1757 if (max_vf < LOOP_VINFO_VECT_FACTOR (loop_vinfo))
1759 if (dump_enabled_p ())
1760 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1761 "bad data dependence.\n");
1762 return false;
1765 /* Analyze the alignment of the data-refs in the loop.
1766 Fail if a data reference is found that cannot be vectorized. */
1768 ok = vect_analyze_data_refs_alignment (loop_vinfo, NULL);
1769 if (!ok)
1771 if (dump_enabled_p ())
1772 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1773 "bad data alignment.\n");
1774 return false;
1777 /* Prune the list of ddrs to be tested at run-time by versioning for alias.
1778 It is important to call pruning after vect_analyze_data_ref_accesses,
1779 since we use grouping information gathered by interleaving analysis. */
1780 ok = vect_prune_runtime_alias_test_list (loop_vinfo);
1781 if (!ok)
1783 if (dump_enabled_p ())
1784 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1785 "number of versioning for alias "
1786 "run-time tests exceeds %d "
1787 "(--param vect-max-version-for-alias-checks)\n",
1788 PARAM_VALUE (PARAM_VECT_MAX_VERSION_FOR_ALIAS_CHECKS));
1789 return false;
1792 /* This pass will decide on using loop versioning and/or loop peeling in
1793 order to enhance the alignment of data references in the loop. */
1795 ok = vect_enhance_data_refs_alignment (loop_vinfo);
1796 if (!ok)
1798 if (dump_enabled_p ())
1799 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1800 "bad data alignment.\n");
1801 return false;
1804 /* Check the SLP opportunities in the loop, analyze and build SLP trees. */
1805 ok = vect_analyze_slp (loop_vinfo, NULL, n_stmts);
1806 if (ok)
1808 /* If there are any SLP instances mark them as pure_slp. */
1809 if (vect_make_slp_decision (loop_vinfo))
1811 /* Find stmts that need to be both vectorized and SLPed. */
1812 vect_detect_hybrid_slp (loop_vinfo);
1814 /* Update the vectorization factor based on the SLP decision. */
1815 vect_update_vf_for_slp (loop_vinfo);
1817 /* Once VF is set, SLP costs should be updated since the number of
1818 created vector stmts depends on VF. */
1819 vect_update_slp_costs_according_to_vf (loop_vinfo);
1821 /* Analyze operations in the SLP instances. Note this may
1822 remove unsupported SLP instances which makes the above
1823 SLP kind detection invalid. */
1824 unsigned old_size = LOOP_VINFO_SLP_INSTANCES (loop_vinfo).length ();
1825 vect_slp_analyze_operations (LOOP_VINFO_SLP_INSTANCES (loop_vinfo));
1826 if (LOOP_VINFO_SLP_INSTANCES (loop_vinfo).length () != old_size)
1827 return false;
1830 else
1831 return false;
1833 /* Scan all the remaining operations in the loop that are not subject
1834 to SLP and make sure they are vectorizable. */
1835 ok = vect_analyze_loop_operations (loop_vinfo);
1836 if (!ok)
1838 if (dump_enabled_p ())
1839 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1840 "bad operation or unsupported loop bound.\n");
1841 return false;
1844 /* Decide whether we need to create an epilogue loop to handle
1845 remaining scalar iterations. */
1846 th = ((LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo) + 1)
1847 / LOOP_VINFO_VECT_FACTOR (loop_vinfo))
1848 * LOOP_VINFO_VECT_FACTOR (loop_vinfo);
1850 if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
1851 && LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) > 0)
1853 if (ctz_hwi (LOOP_VINFO_INT_NITERS (loop_vinfo)
1854 - LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo))
1855 < exact_log2 (LOOP_VINFO_VECT_FACTOR (loop_vinfo)))
1856 LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo) = true;
1858 else if (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo)
1859 || (tree_ctz (LOOP_VINFO_NITERS (loop_vinfo))
1860 < (unsigned)exact_log2 (LOOP_VINFO_VECT_FACTOR (loop_vinfo))
1861 /* In case of versioning, check if the maximum number of
1862 iterations is greater than th. If they are identical,
1863 the epilogue is unnecessary. */
1864 && ((!LOOP_REQUIRES_VERSIONING_FOR_ALIAS (loop_vinfo)
1865 && !LOOP_REQUIRES_VERSIONING_FOR_ALIGNMENT (loop_vinfo))
1866 || (unsigned HOST_WIDE_INT)max_stmt_executions_int
1867 (LOOP_VINFO_LOOP (loop_vinfo)) > th)))
1868 LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo) = true;
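/* For example, with a known iteration count of 100, 3 iterations peeled
   for alignment and a vectorization factor of 8, 97 iterations remain;
   ctz (97) = 0 is smaller than log2 (8) = 3, so the main vector loop
   covers only 96 of them and an epilogue loop handles the last one.  */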
1870 /* If an epilogue loop is required make sure we can create one. */
1871 if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo)
1872 || LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo))
1874 if (dump_enabled_p ())
1875 dump_printf_loc (MSG_NOTE, vect_location, "epilog loop required\n");
1876 if (!vect_can_advance_ivs_p (loop_vinfo)
1877 || !slpeel_can_duplicate_loop_p (LOOP_VINFO_LOOP (loop_vinfo),
1878 single_exit (LOOP_VINFO_LOOP
1879 (loop_vinfo))))
1881 if (dump_enabled_p ())
1882 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1883 "not vectorized: can't create required "
1884 "epilog loop\n");
1885 return false;
1889 return true;
1892 /* Function vect_analyze_loop.
1894 Apply a set of analyses on LOOP, and create a loop_vec_info struct
1895 for it. The different analyses will record information in the
1896 loop_vec_info struct. */
1897 loop_vec_info
1898 vect_analyze_loop (struct loop *loop)
1900 loop_vec_info loop_vinfo;
1901 unsigned int vector_sizes;
1903 /* Autodetect first vector size we try. */
1904 current_vector_size = 0;
1905 vector_sizes = targetm.vectorize.autovectorize_vector_sizes ();
1907 if (dump_enabled_p ())
1908 dump_printf_loc (MSG_NOTE, vect_location,
1909 "===== analyze_loop_nest =====\n");
1911 if (loop_outer (loop)
1912 && loop_vec_info_for_loop (loop_outer (loop))
1913 && LOOP_VINFO_VECTORIZABLE_P (loop_vec_info_for_loop (loop_outer (loop))))
1915 if (dump_enabled_p ())
1916 dump_printf_loc (MSG_NOTE, vect_location,
1917 "outer-loop already vectorized.\n");
1918 return NULL;
1921 while (1)
1923 /* Check the CFG characteristics of the loop (nesting, entry/exit). */
1924 loop_vinfo = vect_analyze_loop_form (loop);
1925 if (!loop_vinfo)
1927 if (dump_enabled_p ())
1928 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1929 "bad loop form.\n");
1930 return NULL;
1933 if (vect_analyze_loop_2 (loop_vinfo))
1935 LOOP_VINFO_VECTORIZABLE_P (loop_vinfo) = 1;
1937 return loop_vinfo;
1940 destroy_loop_vec_info (loop_vinfo, true);
1942 vector_sizes &= ~current_vector_size;
1943 if (vector_sizes == 0
1944 || current_vector_size == 0)
1945 return NULL;
1947 /* Try the next biggest vector size. */
1948 current_vector_size = 1 << floor_log2 (vector_sizes);
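/* For example, on a hypothetical target that advertises 32-byte and
   16-byte vectors the mask starts as 0x30; after a failed analysis with
   the autodetected 32-byte size the mask becomes 0x10 and the analysis
   is re-run with current_vector_size set to 16.  */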
1949 if (dump_enabled_p ())
1950 dump_printf_loc (MSG_NOTE, vect_location,
1951 "***** Re-trying analysis with "
1952 "vector size %d\n", current_vector_size);
1957 /* Function reduction_code_for_scalar_code
1959 Input:
1960 CODE - tree_code of a reduction operations.
1962 Output:
1963 REDUC_CODE - the corresponding tree-code to be used to reduce the
1964 vector of partial results into a single scalar result, or ERROR_MARK
1965 if the operation is a supported reduction operation, but does not have
1966 such a tree-code.
1968 Return FALSE if CODE currently cannot be vectorized as reduction. */
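/* For example, a MAX_EXPR reduction maps to REDUC_MAX_EXPR, which
   collapses a vector of partial maxima such as {1, 7, 3, 5} into the
   scalar 7.  MULT_EXPR, MINUS_EXPR and the bitwise operations are
   accepted as reductions but have no such tree-code, so ERROR_MARK is
   returned and the epilogue falls back to shifts or extracts.  */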
1970 static bool
1971 reduction_code_for_scalar_code (enum tree_code code,
1972 enum tree_code *reduc_code)
1974 switch (code)
1976 case MAX_EXPR:
1977 *reduc_code = REDUC_MAX_EXPR;
1978 return true;
1980 case MIN_EXPR:
1981 *reduc_code = REDUC_MIN_EXPR;
1982 return true;
1984 case PLUS_EXPR:
1985 *reduc_code = REDUC_PLUS_EXPR;
1986 return true;
1988 case MULT_EXPR:
1989 case MINUS_EXPR:
1990 case BIT_IOR_EXPR:
1991 case BIT_XOR_EXPR:
1992 case BIT_AND_EXPR:
1993 *reduc_code = ERROR_MARK;
1994 return true;
1996 default:
1997 return false;
2002 /* Error reporting helper for vect_is_simple_reduction below. GIMPLE statement
2003 STMT is printed with a message MSG. */
2005 static void
2006 report_vect_op (int msg_type, gimple stmt, const char *msg)
2008 dump_printf_loc (msg_type, vect_location, "%s", msg);
2009 dump_gimple_stmt (msg_type, TDF_SLIM, stmt, 0);
2010 dump_printf (msg_type, "\n");
2014 /* Detect SLP reduction of the form:
2016 #a1 = phi <a5, a0>
2017 a2 = operation (a1)
2018 a3 = operation (a2)
2019 a4 = operation (a3)
2020 a5 = operation (a4)
2022 #a = phi <a5>
2024 PHI is the reduction phi node (#a1 = phi <a5, a0> above)
2025 FIRST_STMT is the first reduction stmt in the chain
2026 (a2 = operation (a1)).
2028 Return TRUE if a reduction chain was detected. */
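/* In source form such a chain typically comes from a loop that updates
   the same reduction variable several times per iteration, e.g.

   for (i = 0; i < n; i++)
     {
       sum += a[2*i];
       sum += a[2*i+1];
     }

   where the two additions correspond to a2 = operation (a1) and
   a3 = operation (a2) above.  */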
2030 static bool
2031 vect_is_slp_reduction (loop_vec_info loop_info, gimple phi, gimple first_stmt)
2033 struct loop *loop = (gimple_bb (phi))->loop_father;
2034 struct loop *vect_loop = LOOP_VINFO_LOOP (loop_info);
2035 enum tree_code code;
2036 gimple current_stmt = NULL, loop_use_stmt = NULL, first, next_stmt;
2037 stmt_vec_info use_stmt_info, current_stmt_info;
2038 tree lhs;
2039 imm_use_iterator imm_iter;
2040 use_operand_p use_p;
2041 int nloop_uses, size = 0, n_out_of_loop_uses;
2042 bool found = false;
2044 if (loop != vect_loop)
2045 return false;
2047 lhs = PHI_RESULT (phi);
2048 code = gimple_assign_rhs_code (first_stmt);
2049 while (1)
2051 nloop_uses = 0;
2052 n_out_of_loop_uses = 0;
2053 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, lhs)
2055 gimple use_stmt = USE_STMT (use_p);
2056 if (is_gimple_debug (use_stmt))
2057 continue;
2059 /* Check if we got back to the reduction phi. */
2060 if (use_stmt == phi)
2062 loop_use_stmt = use_stmt;
2063 found = true;
2064 break;
2067 if (flow_bb_inside_loop_p (loop, gimple_bb (use_stmt)))
2069 loop_use_stmt = use_stmt;
2070 nloop_uses++;
2072 else
2073 n_out_of_loop_uses++;
2075 /* There can be either a single use in the loop or two uses in
2076 phi nodes. */
2077 if (nloop_uses > 1 || (n_out_of_loop_uses && nloop_uses))
2078 return false;
2081 if (found)
2082 break;
2084 /* We reached a statement with no loop uses. */
2085 if (nloop_uses == 0)
2086 return false;
2088 /* This is a loop exit phi, and we haven't reached the reduction phi. */
2089 if (gimple_code (loop_use_stmt) == GIMPLE_PHI)
2090 return false;
2092 if (!is_gimple_assign (loop_use_stmt)
2093 || code != gimple_assign_rhs_code (loop_use_stmt)
2094 || !flow_bb_inside_loop_p (loop, gimple_bb (loop_use_stmt)))
2095 return false;
2097 /* Insert USE_STMT into reduction chain. */
2098 use_stmt_info = vinfo_for_stmt (loop_use_stmt);
2099 if (current_stmt)
2101 current_stmt_info = vinfo_for_stmt (current_stmt);
2102 GROUP_NEXT_ELEMENT (current_stmt_info) = loop_use_stmt;
2103 GROUP_FIRST_ELEMENT (use_stmt_info)
2104 = GROUP_FIRST_ELEMENT (current_stmt_info);
2106 else
2107 GROUP_FIRST_ELEMENT (use_stmt_info) = loop_use_stmt;
2109 lhs = gimple_assign_lhs (loop_use_stmt);
2110 current_stmt = loop_use_stmt;
2111 size++;
2114 if (!found || loop_use_stmt != phi || size < 2)
2115 return false;
2117 /* Swap the operands, if needed, to make the reduction operand be the second
2118 operand. */
2119 lhs = PHI_RESULT (phi);
2120 next_stmt = GROUP_FIRST_ELEMENT (vinfo_for_stmt (current_stmt));
2121 while (next_stmt)
2123 if (gimple_assign_rhs2 (next_stmt) == lhs)
2125 tree op = gimple_assign_rhs1 (next_stmt);
2126 gimple def_stmt = NULL;
2128 if (TREE_CODE (op) == SSA_NAME)
2129 def_stmt = SSA_NAME_DEF_STMT (op);
2131 /* Check that the other def is either defined in the loop
2132 ("vect_internal_def"), or it's an induction (defined by a
2133 loop-header phi-node). */
2134 if (def_stmt
2135 && gimple_bb (def_stmt)
2136 && flow_bb_inside_loop_p (loop, gimple_bb (def_stmt))
2137 && (is_gimple_assign (def_stmt)
2138 || is_gimple_call (def_stmt)
2139 || STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def_stmt))
2140 == vect_induction_def
2141 || (gimple_code (def_stmt) == GIMPLE_PHI
2142 && STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def_stmt))
2143 == vect_internal_def
2144 && !is_loop_header_bb_p (gimple_bb (def_stmt)))))
2146 lhs = gimple_assign_lhs (next_stmt);
2147 next_stmt = GROUP_NEXT_ELEMENT (vinfo_for_stmt (next_stmt));
2148 continue;
2151 return false;
2153 else
2155 tree op = gimple_assign_rhs2 (next_stmt);
2156 gimple def_stmt = NULL;
2158 if (TREE_CODE (op) == SSA_NAME)
2159 def_stmt = SSA_NAME_DEF_STMT (op);
2161 /* Check that the other def is either defined in the loop
2162 ("vect_internal_def"), or it's an induction (defined by a
2163 loop-header phi-node). */
2164 if (def_stmt
2165 && gimple_bb (def_stmt)
2166 && flow_bb_inside_loop_p (loop, gimple_bb (def_stmt))
2167 && (is_gimple_assign (def_stmt)
2168 || is_gimple_call (def_stmt)
2169 || STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def_stmt))
2170 == vect_induction_def
2171 || (gimple_code (def_stmt) == GIMPLE_PHI
2172 && STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def_stmt))
2173 == vect_internal_def
2174 && !is_loop_header_bb_p (gimple_bb (def_stmt)))))
2176 if (dump_enabled_p ())
2178 dump_printf_loc (MSG_NOTE, vect_location, "swapping oprnds: ");
2179 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, next_stmt, 0);
2180 dump_printf (MSG_NOTE, "\n");
2183 swap_ssa_operands (next_stmt,
2184 gimple_assign_rhs1_ptr (next_stmt),
2185 gimple_assign_rhs2_ptr (next_stmt));
2186 update_stmt (next_stmt);
2188 if (CONSTANT_CLASS_P (gimple_assign_rhs1 (next_stmt)))
2189 LOOP_VINFO_OPERANDS_SWAPPED (loop_info) = true;
2191 else
2192 return false;
2195 lhs = gimple_assign_lhs (next_stmt);
2196 next_stmt = GROUP_NEXT_ELEMENT (vinfo_for_stmt (next_stmt));
2199 /* Save the chain for further analysis in SLP detection. */
2200 first = GROUP_FIRST_ELEMENT (vinfo_for_stmt (current_stmt));
2201 LOOP_VINFO_REDUCTION_CHAINS (loop_info).safe_push (first);
2202 GROUP_SIZE (vinfo_for_stmt (first)) = size;
2204 return true;
2208 /* Function vect_is_simple_reduction_1
2210 (1) Detect a cross-iteration def-use cycle that represents a simple
2211 reduction computation. We look for the following pattern:
2213 loop_header:
2214 a1 = phi < a0, a2 >
2215 a3 = ...
2216 a2 = operation (a3, a1)
2220 a3 = ...
2221 loop_header:
2222 a1 = phi < a0, a2 >
2223 a2 = operation (a3, a1)
2225 such that:
2226 1. operation is commutative and associative and it is safe to
2227 change the order of the computation (if CHECK_REDUCTION is true)
2228 2. no uses for a2 in the loop (a2 is used out of the loop)
2229 3. no uses of a1 in the loop besides the reduction operation
2230 4. no uses of a1 outside the loop.
2232 Conditions 1,4 are tested here.
2233 Conditions 2,3 are tested in vect_mark_stmts_to_be_vectorized.
2235 (2) Detect a cross-iteration def-use cycle in nested loops, i.e.,
2236 nested cycles, if CHECK_REDUCTION is false.
2238 (3) Detect cycles of phi nodes in outer-loop vectorization, i.e., double
2239 reductions:
2241 a1 = phi < a0, a2 >
2242 inner loop (def of a3)
2243 a2 = phi < a3 >
2245 If MODIFY is true it tries also to rework the code in-place to enable
2246 detection of more reduction patterns. For the time being we rewrite
2247 "res -= RHS" into "rhs += -RHS" when it seems worthwhile.
2250 static gimple
2251 vect_is_simple_reduction_1 (loop_vec_info loop_info, gimple phi,
2252 bool check_reduction, bool *double_reduc,
2253 bool modify)
2255 struct loop *loop = (gimple_bb (phi))->loop_father;
2256 struct loop *vect_loop = LOOP_VINFO_LOOP (loop_info);
2257 edge latch_e = loop_latch_edge (loop);
2258 tree loop_arg = PHI_ARG_DEF_FROM_EDGE (phi, latch_e);
2259 gimple def_stmt, def1 = NULL, def2 = NULL;
2260 enum tree_code orig_code, code;
2261 tree op1, op2, op3 = NULL_TREE, op4 = NULL_TREE;
2262 tree type;
2263 int nloop_uses;
2264 tree name;
2265 imm_use_iterator imm_iter;
2266 use_operand_p use_p;
2267 bool phi_def;
2269 *double_reduc = false;
2271 /* If CHECK_REDUCTION is true, we assume inner-most loop vectorization,
2272 otherwise, we assume outer loop vectorization. */
2273 gcc_assert ((check_reduction && loop == vect_loop)
2274 || (!check_reduction && flow_loop_nested_p (vect_loop, loop)));
2276 name = PHI_RESULT (phi);
2277 /* ??? If there are no uses of the PHI result the inner loop reduction
2278 won't be detected as possibly double-reduction by vectorizable_reduction
2279 because that tries to walk the PHI arg from the preheader edge which
2280 can be constant. See PR60382. */
2281 if (has_zero_uses (name))
2282 return NULL;
2283 nloop_uses = 0;
2284 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, name)
2286 gimple use_stmt = USE_STMT (use_p);
2287 if (is_gimple_debug (use_stmt))
2288 continue;
2290 if (!flow_bb_inside_loop_p (loop, gimple_bb (use_stmt)))
2292 if (dump_enabled_p ())
2293 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2294 "intermediate value used outside loop.\n");
2296 return NULL;
2299 nloop_uses++;
2300 if (nloop_uses > 1)
2302 if (dump_enabled_p ())
2303 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2304 "reduction used in loop.\n");
2305 return NULL;
2309 if (TREE_CODE (loop_arg) != SSA_NAME)
2311 if (dump_enabled_p ())
2313 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2314 "reduction: not ssa_name: ");
2315 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, loop_arg);
2316 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
2318 return NULL;
2321 def_stmt = SSA_NAME_DEF_STMT (loop_arg);
2322 if (!def_stmt)
2324 if (dump_enabled_p ())
2325 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2326 "reduction: no def_stmt.\n");
2327 return NULL;
2330 if (!is_gimple_assign (def_stmt) && gimple_code (def_stmt) != GIMPLE_PHI)
2332 if (dump_enabled_p ())
2334 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, def_stmt, 0);
2335 dump_printf (MSG_NOTE, "\n");
2337 return NULL;
2340 if (is_gimple_assign (def_stmt))
2342 name = gimple_assign_lhs (def_stmt);
2343 phi_def = false;
2345 else
2347 name = PHI_RESULT (def_stmt);
2348 phi_def = true;
2351 nloop_uses = 0;
2352 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, name)
2354 gimple use_stmt = USE_STMT (use_p);
2355 if (is_gimple_debug (use_stmt))
2356 continue;
2357 if (flow_bb_inside_loop_p (loop, gimple_bb (use_stmt)))
2358 nloop_uses++;
2359 if (nloop_uses > 1)
2361 if (dump_enabled_p ())
2362 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2363 "reduction used in loop.\n");
2364 return NULL;
2368 /* If DEF_STMT is a phi node itself, we expect it to have a single argument
2369 defined in the inner loop. */
2370 if (phi_def)
2372 op1 = PHI_ARG_DEF (def_stmt, 0);
2374 if (gimple_phi_num_args (def_stmt) != 1
2375 || TREE_CODE (op1) != SSA_NAME)
2377 if (dump_enabled_p ())
2378 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2379 "unsupported phi node definition.\n");
2381 return NULL;
2384 def1 = SSA_NAME_DEF_STMT (op1);
2385 if (gimple_bb (def1)
2386 && flow_bb_inside_loop_p (loop, gimple_bb (def_stmt))
2387 && loop->inner
2388 && flow_bb_inside_loop_p (loop->inner, gimple_bb (def1))
2389 && is_gimple_assign (def1))
2391 if (dump_enabled_p ())
2392 report_vect_op (MSG_NOTE, def_stmt,
2393 "detected double reduction: ");
2395 *double_reduc = true;
2396 return def_stmt;
2399 return NULL;
2402 code = orig_code = gimple_assign_rhs_code (def_stmt);
2404 /* We can handle "res -= x[i]", which is non-associative by
2405 simply rewriting this into "res += -x[i]". Avoid changing
2406 gimple instruction for the first simple tests and only do this
2407 if we're allowed to change code at all. */
2408 if (code == MINUS_EXPR
2409 && modify
2410 && (op1 = gimple_assign_rhs1 (def_stmt))
2411 && TREE_CODE (op1) == SSA_NAME
2412 && SSA_NAME_DEF_STMT (op1) == phi)
2413 code = PLUS_EXPR;
2415 if (check_reduction
2416 && (!commutative_tree_code (code) || !associative_tree_code (code)))
2418 if (dump_enabled_p ())
2419 report_vect_op (MSG_MISSED_OPTIMIZATION, def_stmt,
2420 "reduction: not commutative/associative: ");
2421 return NULL;
2424 if (get_gimple_rhs_class (code) != GIMPLE_BINARY_RHS)
2426 if (code != COND_EXPR)
2428 if (dump_enabled_p ())
2429 report_vect_op (MSG_MISSED_OPTIMIZATION, def_stmt,
2430 "reduction: not binary operation: ");
2432 return NULL;
2435 op3 = gimple_assign_rhs1 (def_stmt);
2436 if (COMPARISON_CLASS_P (op3))
2438 op4 = TREE_OPERAND (op3, 1);
2439 op3 = TREE_OPERAND (op3, 0);
2442 op1 = gimple_assign_rhs2 (def_stmt);
2443 op2 = gimple_assign_rhs3 (def_stmt);
2445 if (TREE_CODE (op1) != SSA_NAME && TREE_CODE (op2) != SSA_NAME)
2447 if (dump_enabled_p ())
2448 report_vect_op (MSG_MISSED_OPTIMIZATION, def_stmt,
2449 "reduction: uses not ssa_names: ");
2451 return NULL;
2454 else
2456 op1 = gimple_assign_rhs1 (def_stmt);
2457 op2 = gimple_assign_rhs2 (def_stmt);
2459 if (TREE_CODE (op1) != SSA_NAME && TREE_CODE (op2) != SSA_NAME)
2461 if (dump_enabled_p ())
2462 report_vect_op (MSG_MISSED_OPTIMIZATION, def_stmt,
2463 "reduction: uses not ssa_names: ");
2465 return NULL;
2469 type = TREE_TYPE (gimple_assign_lhs (def_stmt));
2470 if ((TREE_CODE (op1) == SSA_NAME
2471 && !types_compatible_p (type,TREE_TYPE (op1)))
2472 || (TREE_CODE (op2) == SSA_NAME
2473 && !types_compatible_p (type, TREE_TYPE (op2)))
2474 || (op3 && TREE_CODE (op3) == SSA_NAME
2475 && !types_compatible_p (type, TREE_TYPE (op3)))
2476 || (op4 && TREE_CODE (op4) == SSA_NAME
2477 && !types_compatible_p (type, TREE_TYPE (op4))))
2479 if (dump_enabled_p ())
2481 dump_printf_loc (MSG_NOTE, vect_location,
2482 "reduction: multiple types: operation type: ");
2483 dump_generic_expr (MSG_NOTE, TDF_SLIM, type);
2484 dump_printf (MSG_NOTE, ", operands types: ");
2485 dump_generic_expr (MSG_NOTE, TDF_SLIM,
2486 TREE_TYPE (op1));
2487 dump_printf (MSG_NOTE, ",");
2488 dump_generic_expr (MSG_NOTE, TDF_SLIM,
2489 TREE_TYPE (op2));
2490 if (op3)
2492 dump_printf (MSG_NOTE, ",");
2493 dump_generic_expr (MSG_NOTE, TDF_SLIM,
2494 TREE_TYPE (op3));
2497 if (op4)
2499 dump_printf (MSG_NOTE, ",");
2500 dump_generic_expr (MSG_NOTE, TDF_SLIM,
2501 TREE_TYPE (op4));
2503 dump_printf (MSG_NOTE, "\n");
2506 return NULL;
2509 /* Check that it's ok to change the order of the computation.
2510 Generally, when vectorizing a reduction we change the order of the
2511 computation. This may change the behavior of the program in some
2512 cases, so we need to check that this is ok. One exception is when
2513 vectorizing an outer-loop: the inner-loop is executed sequentially,
2514 and therefore vectorizing reductions in the inner-loop during
2515 outer-loop vectorization is safe. */
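/* For example, a float summation vectorized with VF = 4 effectively
   computes four partial sums (a[0] + a[4] + ..., a[1] + a[5] + ..., etc.)
   that are combined only at the end, which can round differently from
   the sequential sum; hence the -fassociative-math requirement for
   floating-point reductions below.  */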
2517 /* CHECKME: check for !flag_finite_math_only too? */
2518 if (SCALAR_FLOAT_TYPE_P (type) && !flag_associative_math
2519 && check_reduction)
2521 /* Changing the order of operations changes the semantics. */
2522 if (dump_enabled_p ())
2523 report_vect_op (MSG_MISSED_OPTIMIZATION, def_stmt,
2524 "reduction: unsafe fp math optimization: ");
2525 return NULL;
2527 else if (INTEGRAL_TYPE_P (type) && TYPE_OVERFLOW_TRAPS (type)
2528 && check_reduction)
2530 /* Changing the order of operations changes the semantics. */
2531 if (dump_enabled_p ())
2532 report_vect_op (MSG_MISSED_OPTIMIZATION, def_stmt,
2533 "reduction: unsafe int math optimization: ");
2534 return NULL;
2536 else if (SAT_FIXED_POINT_TYPE_P (type) && check_reduction)
2538 /* Changing the order of operations changes the semantics. */
2539 if (dump_enabled_p ())
2540 report_vect_op (MSG_MISSED_OPTIMIZATION, def_stmt,
2541 "reduction: unsafe fixed-point math optimization: ");
2542 return NULL;
2545 /* If we detected "res -= x[i]" earlier, rewrite it into
2546 "res += -x[i]" now. If this turns out to be useless reassoc
2547 will clean it up again. */
2548 if (orig_code == MINUS_EXPR)
2550 tree rhs = gimple_assign_rhs2 (def_stmt);
2551 tree negrhs = make_ssa_name (TREE_TYPE (rhs));
2552 gimple negate_stmt = gimple_build_assign (negrhs, NEGATE_EXPR, rhs);
2553 gimple_stmt_iterator gsi = gsi_for_stmt (def_stmt);
2554 set_vinfo_for_stmt (negate_stmt, new_stmt_vec_info (negate_stmt,
2555 loop_info, NULL));
2556 gsi_insert_before (&gsi, negate_stmt, GSI_NEW_STMT);
2557 gimple_assign_set_rhs2 (def_stmt, negrhs);
2558 gimple_assign_set_rhs_code (def_stmt, PLUS_EXPR);
2559 update_stmt (def_stmt);
2562 /* Reduction is safe. We're dealing with one of the following:
2563 1) integer arithmetic and no trapv
2564 2) floating point arithmetic, and special flags permit this optimization
2565 3) nested cycle (i.e., outer loop vectorization). */
2566 if (TREE_CODE (op1) == SSA_NAME)
2567 def1 = SSA_NAME_DEF_STMT (op1);
2569 if (TREE_CODE (op2) == SSA_NAME)
2570 def2 = SSA_NAME_DEF_STMT (op2);
2572 if (code != COND_EXPR
2573 && ((!def1 || gimple_nop_p (def1)) && (!def2 || gimple_nop_p (def2))))
2575 if (dump_enabled_p ())
2576 report_vect_op (MSG_NOTE, def_stmt, "reduction: no defs for operands: ");
2577 return NULL;
2580 /* Check that one def is the reduction def, defined by PHI,
2581 the other def is either defined in the loop ("vect_internal_def"),
2582 or it's an induction (defined by a loop-header phi-node). */
2584 if (def2 && def2 == phi
2585 && (code == COND_EXPR
2586 || !def1 || gimple_nop_p (def1)
2587 || !flow_bb_inside_loop_p (loop, gimple_bb (def1))
2588 || (def1 && flow_bb_inside_loop_p (loop, gimple_bb (def1))
2589 && (is_gimple_assign (def1)
2590 || is_gimple_call (def1)
2591 || STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def1))
2592 == vect_induction_def
2593 || (gimple_code (def1) == GIMPLE_PHI
2594 && STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def1))
2595 == vect_internal_def
2596 && !is_loop_header_bb_p (gimple_bb (def1)))))))
2598 if (dump_enabled_p ())
2599 report_vect_op (MSG_NOTE, def_stmt, "detected reduction: ");
2600 return def_stmt;
2603 if (def1 && def1 == phi
2604 && (code == COND_EXPR
2605 || !def2 || gimple_nop_p (def2)
2606 || !flow_bb_inside_loop_p (loop, gimple_bb (def2))
2607 || (def2 && flow_bb_inside_loop_p (loop, gimple_bb (def2))
2608 && (is_gimple_assign (def2)
2609 || is_gimple_call (def2)
2610 || STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def2))
2611 == vect_induction_def
2612 || (gimple_code (def2) == GIMPLE_PHI
2613 && STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def2))
2614 == vect_internal_def
2615 && !is_loop_header_bb_p (gimple_bb (def2)))))))
2617 if (check_reduction)
2619 /* Swap operands (just for simplicity - so that the rest of the code
2620 can assume that the reduction variable is always the last (second)
2621 argument). */
2622 if (dump_enabled_p ())
2623 report_vect_op (MSG_NOTE, def_stmt,
2624 "detected reduction: need to swap operands: ");
2626 swap_ssa_operands (def_stmt, gimple_assign_rhs1_ptr (def_stmt),
2627 gimple_assign_rhs2_ptr (def_stmt));
2629 if (CONSTANT_CLASS_P (gimple_assign_rhs1 (def_stmt)))
2630 LOOP_VINFO_OPERANDS_SWAPPED (loop_info) = true;
2632 else
2634 if (dump_enabled_p ())
2635 report_vect_op (MSG_NOTE, def_stmt, "detected reduction: ");
2638 return def_stmt;
2641 /* Try to find SLP reduction chain. */
2642 if (check_reduction && vect_is_slp_reduction (loop_info, phi, def_stmt))
2644 if (dump_enabled_p ())
2645 report_vect_op (MSG_NOTE, def_stmt,
2646 "reduction: detected reduction chain: ");
2648 return def_stmt;
2651 if (dump_enabled_p ())
2652 report_vect_op (MSG_MISSED_OPTIMIZATION, def_stmt,
2653 "reduction: unknown pattern: ");
2655 return NULL;
2658 /* Wrapper around vect_is_simple_reduction_1, that won't modify code
2659 in-place. Arguments as there. */
2661 static gimple
2662 vect_is_simple_reduction (loop_vec_info loop_info, gimple phi,
2663 bool check_reduction, bool *double_reduc)
2665 return vect_is_simple_reduction_1 (loop_info, phi, check_reduction,
2666 double_reduc, false);
2669 /* Wrapper around vect_is_simple_reduction_1, which will modify code
2670 in-place if it enables detection of more reductions. Arguments
2671 as there. */
2673 gimple
2674 vect_force_simple_reduction (loop_vec_info loop_info, gimple phi,
2675 bool check_reduction, bool *double_reduc)
2677 return vect_is_simple_reduction_1 (loop_info, phi, check_reduction,
2678 double_reduc, true);
2681 /* Calculate the cost of one scalar iteration of the loop. */
2683 vect_get_single_scalar_iteration_cost (loop_vec_info loop_vinfo,
2684 stmt_vector_for_cost *scalar_cost_vec)
2686 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
2687 basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
2688 int nbbs = loop->num_nodes, factor, scalar_single_iter_cost = 0;
2689 int innerloop_iters, i;
2691 /* Count statements in scalar loop. Using this as scalar cost for a single
2692 iteration for now.
2694 TODO: Add outer loop support.
2696 TODO: Consider assigning different costs to different scalar
2697 statements. */
2699 /* FORNOW. */
2700 innerloop_iters = 1;
2701 if (loop->inner)
2702 innerloop_iters = 50; /* FIXME */
2704 for (i = 0; i < nbbs; i++)
2706 gimple_stmt_iterator si;
2707 basic_block bb = bbs[i];
2709 if (bb->loop_father == loop->inner)
2710 factor = innerloop_iters;
2711 else
2712 factor = 1;
2714 for (si = gsi_start_bb (bb); !gsi_end_p (si); gsi_next (&si))
2716 gimple stmt = gsi_stmt (si);
2717 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
2719 if (!is_gimple_assign (stmt) && !is_gimple_call (stmt))
2720 continue;
2722 /* Skip stmts that are not vectorized inside the loop. */
2723 if (stmt_info
2724 && !STMT_VINFO_RELEVANT_P (stmt_info)
2725 && (!STMT_VINFO_LIVE_P (stmt_info)
2726 || !VECTORIZABLE_CYCLE_DEF (STMT_VINFO_DEF_TYPE (stmt_info)))
2727 && !STMT_VINFO_IN_PATTERN_P (stmt_info))
2728 continue;
2730 vect_cost_for_stmt kind;
2731 if (STMT_VINFO_DATA_REF (vinfo_for_stmt (stmt)))
2733 if (DR_IS_READ (STMT_VINFO_DATA_REF (vinfo_for_stmt (stmt))))
2734 kind = scalar_load;
2735 else
2736 kind = scalar_store;
2738 else
2739 kind = scalar_stmt;
2741 scalar_single_iter_cost
2742 += record_stmt_cost (scalar_cost_vec, factor, kind,
2743 NULL, 0, vect_prologue);
2746 return scalar_single_iter_cost;
2749 /* Calculate cost of peeling the loop PEEL_ITERS_PROLOGUE times. */
2751 vect_get_known_peeling_cost (loop_vec_info loop_vinfo, int peel_iters_prologue,
2752 int *peel_iters_epilogue,
2753 stmt_vector_for_cost *scalar_cost_vec,
2754 stmt_vector_for_cost *prologue_cost_vec,
2755 stmt_vector_for_cost *epilogue_cost_vec)
2757 int retval = 0;
2758 int vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
2760 if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
2762 *peel_iters_epilogue = vf/2;
2763 if (dump_enabled_p ())
2764 dump_printf_loc (MSG_NOTE, vect_location,
2765 "cost model: epilogue peel iters set to vf/2 "
2766 "because loop iterations are unknown .\n");
2768 /* If peeled iterations are known but the number of scalar loop
2769 iterations is unknown, count a taken branch per peeled loop. */
2770 retval = record_stmt_cost (prologue_cost_vec, 1, cond_branch_taken,
2771 NULL, 0, vect_prologue);
2772 retval = record_stmt_cost (prologue_cost_vec, 1, cond_branch_taken,
2773 NULL, 0, vect_epilogue);
2775 else
2777 int niters = LOOP_VINFO_INT_NITERS (loop_vinfo);
2778 peel_iters_prologue = niters < peel_iters_prologue ?
2779 niters : peel_iters_prologue;
2780 *peel_iters_epilogue = (niters - peel_iters_prologue) % vf;
2781 /* If we need to peel for gaps, but no peeling is required, we have to
2782 peel VF iterations. */
2783 if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo) && !*peel_iters_epilogue)
2784 *peel_iters_epilogue = vf;
2787 stmt_info_for_cost *si;
2788 int j;
2789 if (peel_iters_prologue)
2790 FOR_EACH_VEC_ELT (*scalar_cost_vec, j, si)
2791 retval += record_stmt_cost (prologue_cost_vec,
2792 si->count * peel_iters_prologue,
2793 si->kind, NULL, si->misalign,
2794 vect_prologue);
2795 if (*peel_iters_epilogue)
2796 FOR_EACH_VEC_ELT (*scalar_cost_vec, j, si)
2797 retval += record_stmt_cost (epilogue_cost_vec,
2798 si->count * *peel_iters_epilogue,
2799 si->kind, NULL, si->misalign,
2800 vect_epilogue);
2802 return retval;
2805 /* Function vect_estimate_min_profitable_iters
2807 Return the number of iterations required for the vector version of the
2808 loop to be profitable relative to the cost of the scalar version of the
2809 loop. */
2811 static void
2812 vect_estimate_min_profitable_iters (loop_vec_info loop_vinfo,
2813 int *ret_min_profitable_niters,
2814 int *ret_min_profitable_estimate)
2816 int min_profitable_iters;
2817 int min_profitable_estimate;
2818 int peel_iters_prologue;
2819 int peel_iters_epilogue;
2820 unsigned vec_inside_cost = 0;
2821 int vec_outside_cost = 0;
2822 unsigned vec_prologue_cost = 0;
2823 unsigned vec_epilogue_cost = 0;
2824 int scalar_single_iter_cost = 0;
2825 int scalar_outside_cost = 0;
2826 int vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
2827 int npeel = LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo);
2828 void *target_cost_data = LOOP_VINFO_TARGET_COST_DATA (loop_vinfo);
2830 /* Cost model disabled. */
2831 if (unlimited_cost_model (LOOP_VINFO_LOOP (loop_vinfo)))
2833 dump_printf_loc (MSG_NOTE, vect_location, "cost model disabled.\n");
2834 *ret_min_profitable_niters = 0;
2835 *ret_min_profitable_estimate = 0;
2836 return;
2839 /* Requires loop versioning tests to handle misalignment. */
2840 if (LOOP_REQUIRES_VERSIONING_FOR_ALIGNMENT (loop_vinfo))
2842 /* FIXME: Make cost depend on complexity of individual check. */
2843 unsigned len = LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).length ();
2844 (void) add_stmt_cost (target_cost_data, len, vector_stmt, NULL, 0,
2845 vect_prologue);
2846 dump_printf (MSG_NOTE,
2847 "cost model: Adding cost of checks for loop "
2848 "versioning to treat misalignment.\n");
2851 /* Requires loop versioning with alias checks. */
2852 if (LOOP_REQUIRES_VERSIONING_FOR_ALIAS (loop_vinfo))
2854 /* FIXME: Make cost depend on complexity of individual check. */
2855 unsigned len = LOOP_VINFO_MAY_ALIAS_DDRS (loop_vinfo).length ();
2856 (void) add_stmt_cost (target_cost_data, len, vector_stmt, NULL, 0,
2857 vect_prologue);
2858 dump_printf (MSG_NOTE,
2859 "cost model: Adding cost of checks for loop "
2860 "versioning aliasing.\n");
2863 if (LOOP_REQUIRES_VERSIONING_FOR_ALIGNMENT (loop_vinfo)
2864 || LOOP_REQUIRES_VERSIONING_FOR_ALIAS (loop_vinfo))
2865 (void) add_stmt_cost (target_cost_data, 1, cond_branch_taken, NULL, 0,
2866 vect_prologue);
2868 /* Count statements in scalar loop. Using this as scalar cost for a single
2869 iteration for now.
2871 TODO: Add outer loop support.
2873 TODO: Consider assigning different costs to different scalar
2874 statements. */
2876 auto_vec<stmt_info_for_cost> scalar_cost_vec;
2877 scalar_single_iter_cost
2878 = vect_get_single_scalar_iteration_cost (loop_vinfo, &scalar_cost_vec);
2880 /* Add additional cost for the peeled instructions in prologue and epilogue
2881 loop.
2883 FORNOW: If we don't know the value of peel_iters for prologue or epilogue
2884 at compile-time - we assume it's vf/2 (the worst would be vf-1).
2886 TODO: Build an expression that represents peel_iters for prologue and
2887 epilogue to be used in a run-time test. */
2889 if (npeel < 0)
2891 peel_iters_prologue = vf/2;
2892 dump_printf (MSG_NOTE, "cost model: "
2893 "prologue peel iters set to vf/2.\n");
2895 /* If peeling for alignment is unknown, the loop bound of the main loop
2896 becomes unknown. */
2897 peel_iters_epilogue = vf/2;
2898 dump_printf (MSG_NOTE, "cost model: "
2899 "epilogue peel iters set to vf/2 because "
2900 "peeling for alignment is unknown.\n");
2902 /* If peeled iterations are unknown, count a taken branch and a not taken
2903 branch per peeled loop. Even if scalar loop iterations are known,
2904 vector iterations are not known since peeled prologue iterations are
2905 not known. Hence guards remain the same. */
2906 (void) add_stmt_cost (target_cost_data, 1, cond_branch_taken,
2907 NULL, 0, vect_prologue);
2908 (void) add_stmt_cost (target_cost_data, 1, cond_branch_not_taken,
2909 NULL, 0, vect_prologue);
2910 (void) add_stmt_cost (target_cost_data, 1, cond_branch_taken,
2911 NULL, 0, vect_epilogue);
2912 (void) add_stmt_cost (target_cost_data, 1, cond_branch_not_taken,
2913 NULL, 0, vect_epilogue);
2914 stmt_info_for_cost *si;
2915 int j;
2916 FOR_EACH_VEC_ELT (scalar_cost_vec, j, si)
2918 struct _stmt_vec_info *stmt_info
2919 = si->stmt ? vinfo_for_stmt (si->stmt) : NULL;
2920 (void) add_stmt_cost (target_cost_data,
2921 si->count * peel_iters_prologue,
2922 si->kind, stmt_info, si->misalign,
2923 vect_prologue);
2924 (void) add_stmt_cost (target_cost_data,
2925 si->count * peel_iters_epilogue,
2926 si->kind, stmt_info, si->misalign,
2927 vect_epilogue);
2930 else
2932 stmt_vector_for_cost prologue_cost_vec, epilogue_cost_vec;
2933 stmt_info_for_cost *si;
2934 int j;
2935 void *data = LOOP_VINFO_TARGET_COST_DATA (loop_vinfo);
2937 prologue_cost_vec.create (2);
2938 epilogue_cost_vec.create (2);
2939 peel_iters_prologue = npeel;
2941 (void) vect_get_known_peeling_cost (loop_vinfo, peel_iters_prologue,
2942 &peel_iters_epilogue,
2943 &scalar_cost_vec,
2944 &prologue_cost_vec,
2945 &epilogue_cost_vec);
2947 FOR_EACH_VEC_ELT (prologue_cost_vec, j, si)
2949 struct _stmt_vec_info *stmt_info
2950 = si->stmt ? vinfo_for_stmt (si->stmt) : NULL;
2951 (void) add_stmt_cost (data, si->count, si->kind, stmt_info,
2952 si->misalign, vect_prologue);
2955 FOR_EACH_VEC_ELT (epilogue_cost_vec, j, si)
2957 struct _stmt_vec_info *stmt_info
2958 = si->stmt ? vinfo_for_stmt (si->stmt) : NULL;
2959 (void) add_stmt_cost (data, si->count, si->kind, stmt_info,
2960 si->misalign, vect_epilogue);
2963 prologue_cost_vec.release ();
2964 epilogue_cost_vec.release ();
2967 /* FORNOW: The scalar outside cost is incremented in one of the
2968 following ways:
2970 1. The vectorizer checks for alignment and aliasing and generates
2971 a condition that allows dynamic vectorization. A cost model
2972 check is ANDED with the versioning condition. Hence scalar code
2973 path now has the added cost of the versioning check.
2975 if (cost > th & versioning_check)
2976 jmp to vector code
2978 Hence run-time scalar is incremented by not-taken branch cost.
2980 2. The vectorizer then checks if a prologue is required. If the
2981 cost model check was not done before during versioning, it has to
2982 be done before the prologue check.
2984 if (cost <= th)
2985 prologue = scalar_iters
2986 if (prologue == 0)
2987 jmp to vector code
2988 else
2989 execute prologue
2990 if (prologue == num_iters)
2991 go to exit
2993 Hence the run-time scalar cost is incremented by a taken branch,
2994 plus a not-taken branch, plus a taken branch cost.
2996 3. The vectorizer then checks if an epilogue is required. If the
2997 cost model check was not done before during prologue check, it
2998 has to be done with the epilogue check.
3000 if (prologue == 0)
3001 jmp to vector code
3002 else
3003 execute prologue
3004 if (prologue == num_iters)
3005 go to exit
3006 vector code:
3007 if ((cost <= th) | (scalar_iters-prologue-epilogue == 0))
3008 jmp to epilogue
3010 Hence the run-time scalar cost should be incremented by 2 taken
3011 branches.
3013 TODO: The back end may reorder the BBS's differently and reverse
3014 conditions/branch directions. Change the estimates below to
3015 something more reasonable. */
3017 /* If the number of iterations is known and we do not do versioning, we can
3018 decide whether to vectorize at compile time. Hence the scalar version
3019 does not carry cost model guard costs.
3020 if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
3021 || LOOP_REQUIRES_VERSIONING_FOR_ALIGNMENT (loop_vinfo)
3022 || LOOP_REQUIRES_VERSIONING_FOR_ALIAS (loop_vinfo))
3024 /* Cost model check occurs at versioning. */
3025 if (LOOP_REQUIRES_VERSIONING_FOR_ALIGNMENT (loop_vinfo)
3026 || LOOP_REQUIRES_VERSIONING_FOR_ALIAS (loop_vinfo))
3027 scalar_outside_cost += vect_get_stmt_cost (cond_branch_not_taken);
3028 else
3030 /* Cost model check occurs at prologue generation. */
3031 if (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) < 0)
3032 scalar_outside_cost += 2 * vect_get_stmt_cost (cond_branch_taken)
3033 + vect_get_stmt_cost (cond_branch_not_taken);
3034 /* Cost model check occurs at epilogue generation. */
3035 else
3036 scalar_outside_cost += 2 * vect_get_stmt_cost (cond_branch_taken);
3040 /* Complete the target-specific cost calculations. */
3041 finish_cost (LOOP_VINFO_TARGET_COST_DATA (loop_vinfo), &vec_prologue_cost,
3042 &vec_inside_cost, &vec_epilogue_cost);
3044 vec_outside_cost = (int)(vec_prologue_cost + vec_epilogue_cost);
3046 if (dump_enabled_p ())
3048 dump_printf_loc (MSG_NOTE, vect_location, "Cost model analysis: \n");
3049 dump_printf (MSG_NOTE, " Vector inside of loop cost: %d\n",
3050 vec_inside_cost);
3051 dump_printf (MSG_NOTE, " Vector prologue cost: %d\n",
3052 vec_prologue_cost);
3053 dump_printf (MSG_NOTE, " Vector epilogue cost: %d\n",
3054 vec_epilogue_cost);
3055 dump_printf (MSG_NOTE, " Scalar iteration cost: %d\n",
3056 scalar_single_iter_cost);
3057 dump_printf (MSG_NOTE, " Scalar outside cost: %d\n",
3058 scalar_outside_cost);
3059 dump_printf (MSG_NOTE, " Vector outside cost: %d\n",
3060 vec_outside_cost);
3061 dump_printf (MSG_NOTE, " prologue iterations: %d\n",
3062 peel_iters_prologue);
3063 dump_printf (MSG_NOTE, " epilogue iterations: %d\n",
3064 peel_iters_epilogue);
3067 /* Calculate number of iterations required to make the vector version
3068 profitable, relative to the loop bodies only. The following condition
3069 must hold true:
3070 SIC * niters + SOC > VIC * ((niters-PL_ITERS-EP_ITERS)/VF) + VOC
3071 where
3072 SIC = scalar iteration cost, VIC = vector iteration cost,
3073 VOC = vector outside cost, VF = vectorization factor,
3074 PL_ITERS = prologue iterations, EP_ITERS= epilogue iterations
3075 SOC = scalar outside cost for run time cost model check. */
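/* Worked example: with SIC = 4, VIC = 6, VF = 4, VOC = 20, SOC = 0 and
   no peeling, the condition reduces to 4 * niters > 6 * niters / 4 + 20,
   which first holds for niters > 8; the code below computes that
   break-even point, clamps it to at least VF and finally decrements it
   because the runtime guard skips the vector loop when
   niters <= threshold.  */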
3077 if ((scalar_single_iter_cost * vf) > (int) vec_inside_cost)
3079 if (vec_outside_cost <= 0)
3080 min_profitable_iters = 1;
3081 else
3083 min_profitable_iters = ((vec_outside_cost - scalar_outside_cost) * vf
3084 - vec_inside_cost * peel_iters_prologue
3085 - vec_inside_cost * peel_iters_epilogue)
3086 / ((scalar_single_iter_cost * vf)
3087 - vec_inside_cost);
3089 if ((scalar_single_iter_cost * vf * min_profitable_iters)
3090 <= (((int) vec_inside_cost * min_profitable_iters)
3091 + (((int) vec_outside_cost - scalar_outside_cost) * vf)))
3092 min_profitable_iters++;
3095 /* vector version will never be profitable. */
3096 else
3098 if (LOOP_VINFO_LOOP (loop_vinfo)->force_vectorize)
3099 warning_at (vect_location, OPT_Wopenmp_simd, "vectorization "
3100 "did not happen for a simd loop");
3102 if (dump_enabled_p ())
3103 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3104 "cost model: the vector iteration cost = %d "
3105 "divided by the scalar iteration cost = %d "
3106 "is greater or equal to the vectorization factor = %d"
3107 ".\n",
3108 vec_inside_cost, scalar_single_iter_cost, vf);
3109 *ret_min_profitable_niters = -1;
3110 *ret_min_profitable_estimate = -1;
3111 return;
3114 dump_printf (MSG_NOTE,
3115 " Calculated minimum iters for profitability: %d\n",
3116 min_profitable_iters);
3118 min_profitable_iters =
3119 min_profitable_iters < vf ? vf : min_profitable_iters;
3121 /* Because the condition we create is:
3122 if (niters <= min_profitable_iters)
3123 then skip the vectorized loop. */
3124 min_profitable_iters--;
3126 if (dump_enabled_p ())
3127 dump_printf_loc (MSG_NOTE, vect_location,
3128 " Runtime profitability threshold = %d\n",
3129 min_profitable_iters);
3131 *ret_min_profitable_niters = min_profitable_iters;
3133 /* Calculate number of iterations required to make the vector version
3134 profitable, relative to the loop bodies only.
3136 Non-vectorized variant is SIC * niters and it must win over vector
3137 variant on the expected loop trip count. The following condition must hold true:
3138 SIC * niters > VIC * ((niters-PL_ITERS-EP_ITERS)/VF) + VOC + SOC */
3140 if (vec_outside_cost <= 0)
3141 min_profitable_estimate = 1;
3142 else
3144 min_profitable_estimate = ((vec_outside_cost + scalar_outside_cost) * vf
3145 - vec_inside_cost * peel_iters_prologue
3146 - vec_inside_cost * peel_iters_epilogue)
3147 / ((scalar_single_iter_cost * vf)
3148 - vec_inside_cost);
3150 min_profitable_estimate --;
3151 min_profitable_estimate = MAX (min_profitable_estimate, min_profitable_iters);
3152 if (dump_enabled_p ())
3153 dump_printf_loc (MSG_NOTE, vect_location,
3154 " Static estimate profitability threshold = %d\n",
3155 min_profitable_estimate);
3157 *ret_min_profitable_estimate = min_profitable_estimate;
3160 /* Writes into SEL a mask for a vec_perm, equivalent to a vec_shr by OFFSET
3161 vector elements (not bits) for a vector of mode MODE. */
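/* For example, for a 4-element vector and OFFSET = 2 the selector is
   {2, 3, 4, 5}: the two high elements of the first operand followed by
   the two low elements of the second operand, i.e. a shift by two
   elements.  */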
3162 static void
3163 calc_vec_perm_mask_for_shift (enum machine_mode mode, unsigned int offset,
3164 unsigned char *sel)
3166 unsigned int i, nelt = GET_MODE_NUNITS (mode);
3168 for (i = 0; i < nelt; i++)
3169 sel[i] = (i + offset) & (2*nelt - 1);
3172 /* Checks whether the target supports whole-vector shifts for vectors of mode
3173 MODE. This is the case if _either_ the platform handles vec_shr_optab, _or_
3174 it supports vec_perm_const with masks for all necessary shift amounts. */
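/* For example, for an 8-element mode the loop below verifies that shifts
   by 4, 2 and 1 elements can all be emulated with vec_perm_const, since
   those are the amounts the log2-style reduction epilogue uses.  */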
3175 static bool
3176 have_whole_vector_shift (enum machine_mode mode)
3178 if (optab_handler (vec_shr_optab, mode) != CODE_FOR_nothing)
3179 return true;
3181 if (direct_optab_handler (vec_perm_const_optab, mode) == CODE_FOR_nothing)
3182 return false;
3184 unsigned int i, nelt = GET_MODE_NUNITS (mode);
3185 unsigned char *sel = XALLOCAVEC (unsigned char, nelt);
3187 for (i = nelt/2; i >= 1; i/=2)
3189 calc_vec_perm_mask_for_shift (mode, i, sel);
3190 if (!can_vec_perm_p (mode, false, sel))
3191 return false;
3193 return true;
3196 /* Return the reduction operand (with index REDUC_INDEX) of STMT. */
3198 static tree
3199 get_reduction_op (gimple stmt, int reduc_index)
3201 switch (get_gimple_rhs_class (gimple_assign_rhs_code (stmt)))
3203 case GIMPLE_SINGLE_RHS:
3204 gcc_assert (TREE_OPERAND_LENGTH (gimple_assign_rhs1 (stmt))
3205 == ternary_op);
3206 return TREE_OPERAND (gimple_assign_rhs1 (stmt), reduc_index);
3207 case GIMPLE_UNARY_RHS:
3208 return gimple_assign_rhs1 (stmt);
3209 case GIMPLE_BINARY_RHS:
3210 return (reduc_index
3211 ? gimple_assign_rhs2 (stmt) : gimple_assign_rhs1 (stmt));
3212 case GIMPLE_TERNARY_RHS:
3213 return gimple_op (stmt, reduc_index + 1);
3214 default:
3215 gcc_unreachable ();
3219 /* TODO: Close dependency between vect_model_*_cost and vectorizable_*
3220 functions. Design better to avoid maintenance issues. */
3222 /* Function vect_model_reduction_cost.
3224 Models cost for a reduction operation, including the vector ops
3225 generated within the strip-mine loop, the initial definition before
3226 the loop, and the epilogue code that must be generated. */
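/* For example, for a 4-element vector type whose reduction has no
   single-statement REDUC_* code, the shift-based epilogue below is
   costed as exact_log2 (4) * 2 = 4 vector statements plus one
   vec_to_scalar extract; without whole-vector shifts it is costed as
   4 + 3 = 7 statements (N extracts and N-1 reduction ops).  */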
3228 static bool
3229 vect_model_reduction_cost (stmt_vec_info stmt_info, enum tree_code reduc_code,
3230 int ncopies, int reduc_index)
3232 int prologue_cost = 0, epilogue_cost = 0;
3233 enum tree_code code;
3234 optab optab;
3235 tree vectype;
3236 gimple stmt, orig_stmt;
3237 tree reduction_op;
3238 machine_mode mode;
3239 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
3240 struct loop *loop = NULL;
3241 void *target_cost_data;
3243 if (loop_vinfo)
3245 loop = LOOP_VINFO_LOOP (loop_vinfo);
3246 target_cost_data = LOOP_VINFO_TARGET_COST_DATA (loop_vinfo);
3248 else
3249 target_cost_data = BB_VINFO_TARGET_COST_DATA (STMT_VINFO_BB_VINFO (stmt_info));
3251 /* Cost of reduction op inside loop. */
3252 unsigned inside_cost = add_stmt_cost (target_cost_data, ncopies, vector_stmt,
3253 stmt_info, 0, vect_body);
3254 stmt = STMT_VINFO_STMT (stmt_info);
3256 reduction_op = get_reduction_op (stmt, reduc_index);
3258 vectype = get_vectype_for_scalar_type (TREE_TYPE (reduction_op));
3259 if (!vectype)
3261 if (dump_enabled_p ())
3263 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3264 "unsupported data-type ");
3265 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
3266 TREE_TYPE (reduction_op));
3267 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
3269 return false;
3272 mode = TYPE_MODE (vectype);
3273 orig_stmt = STMT_VINFO_RELATED_STMT (stmt_info);
3275 if (!orig_stmt)
3276 orig_stmt = STMT_VINFO_STMT (stmt_info);
3278 code = gimple_assign_rhs_code (orig_stmt);
3280 /* Add in cost for initial definition. */
3281 prologue_cost += add_stmt_cost (target_cost_data, 1, scalar_to_vec,
3282 stmt_info, 0, vect_prologue);
3284 /* Determine cost of epilogue code.
3286 We have a reduction operator that will reduce the vector in one statement.
3287 Also requires scalar extract. */
3289 if (!loop || !nested_in_vect_loop_p (loop, orig_stmt))
3291 if (reduc_code != ERROR_MARK)
3293 epilogue_cost += add_stmt_cost (target_cost_data, 1, vector_stmt,
3294 stmt_info, 0, vect_epilogue);
3295 epilogue_cost += add_stmt_cost (target_cost_data, 1, vec_to_scalar,
3296 stmt_info, 0, vect_epilogue);
3298 else
3300 int vec_size_in_bits = tree_to_uhwi (TYPE_SIZE (vectype));
3301 tree bitsize =
3302 TYPE_SIZE (TREE_TYPE (gimple_assign_lhs (orig_stmt)));
3303 int element_bitsize = tree_to_uhwi (bitsize);
3304 int nelements = vec_size_in_bits / element_bitsize;
3306 optab = optab_for_tree_code (code, vectype, optab_default);
3308 /* We have a whole vector shift available. */
3309 if (VECTOR_MODE_P (mode)
3310 && optab_handler (optab, mode) != CODE_FOR_nothing
3311 && have_whole_vector_shift (mode))
3313 /* Final reduction via vector shifts and the reduction operator.
3314 Also requires scalar extract. */
3315 epilogue_cost += add_stmt_cost (target_cost_data,
3316 exact_log2 (nelements) * 2,
3317 vector_stmt, stmt_info, 0,
3318 vect_epilogue);
3319 epilogue_cost += add_stmt_cost (target_cost_data, 1,
3320 vec_to_scalar, stmt_info, 0,
3321 vect_epilogue);
3323 else
3324 /* Use extracts and reduction op for final reduction. For N
3325 elements, we have N extracts and N-1 reduction ops. */
3326 epilogue_cost += add_stmt_cost (target_cost_data,
3327 nelements + nelements - 1,
3328 vector_stmt, stmt_info, 0,
3329 vect_epilogue);
3333 if (dump_enabled_p ())
3334 dump_printf (MSG_NOTE,
3335 "vect_model_reduction_cost: inside_cost = %d, "
3336 "prologue_cost = %d, epilogue_cost = %d .\n", inside_cost,
3337 prologue_cost, epilogue_cost);
3339 return true;
3343 /* Function vect_model_induction_cost.
3345 Models cost for induction operations. */
3347 static void
3348 vect_model_induction_cost (stmt_vec_info stmt_info, int ncopies)
3350 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
3351 void *target_cost_data = LOOP_VINFO_TARGET_COST_DATA (loop_vinfo);
3352 unsigned inside_cost, prologue_cost;
3354 /* loop cost for vec_loop. */
3355 inside_cost = add_stmt_cost (target_cost_data, ncopies, vector_stmt,
3356 stmt_info, 0, vect_body);
3358 /* prologue cost for vec_init and vec_step. */
3359 prologue_cost = add_stmt_cost (target_cost_data, 2, scalar_to_vec,
3360 stmt_info, 0, vect_prologue);
3362 if (dump_enabled_p ())
3363 dump_printf_loc (MSG_NOTE, vect_location,
3364 "vect_model_induction_cost: inside_cost = %d, "
3365 "prologue_cost = %d .\n", inside_cost, prologue_cost);
3369 /* Function get_initial_def_for_induction
3371 Input:
3372 STMT - a stmt that performs an induction operation in the loop.
3373 IV_PHI - the initial value of the induction variable
3375 Output:
3376 Return a vector variable, initialized with the first VF values of
3377 the induction variable. E.g., for an iv with IV_PHI='X' and
3378 evolution S, for a vector of 4 units, we want to return:
3379 [X, X + S, X + 2*S, X + 3*S]. */
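/* For example, for an induction with initial value 0, step 1 and a
   4-unit vector type this returns {0, 1, 2, 3}, and the step vector
   used inside the loop body is {4, 4, 4, 4} (VF * S per vector
   iteration).  */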
3381 static tree
3382 get_initial_def_for_induction (gimple iv_phi)
3384 stmt_vec_info stmt_vinfo = vinfo_for_stmt (iv_phi);
3385 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_vinfo);
3386 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
3387 tree vectype;
3388 int nunits;
3389 edge pe = loop_preheader_edge (loop);
3390 struct loop *iv_loop;
3391 basic_block new_bb;
3392 tree new_vec, vec_init, vec_step, t;
3393 tree new_var;
3394 tree new_name;
3395 gimple init_stmt, new_stmt;
3396 gphi *induction_phi;
3397 tree induc_def, vec_def, vec_dest;
3398 tree init_expr, step_expr;
3399 int vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
3400 int i;
3401 int ncopies;
3402 tree expr;
3403 stmt_vec_info phi_info = vinfo_for_stmt (iv_phi);
3404 bool nested_in_vect_loop = false;
3405 gimple_seq stmts = NULL;
3406 imm_use_iterator imm_iter;
3407 use_operand_p use_p;
3408 gimple exit_phi;
3409 edge latch_e;
3410 tree loop_arg;
3411 gimple_stmt_iterator si;
3412 basic_block bb = gimple_bb (iv_phi);
3413 tree stepvectype;
3414 tree resvectype;
3416 /* Is phi in an inner-loop, while vectorizing an enclosing outer-loop? */
3417 if (nested_in_vect_loop_p (loop, iv_phi))
3419 nested_in_vect_loop = true;
3420 iv_loop = loop->inner;
3422 else
3423 iv_loop = loop;
3424 gcc_assert (iv_loop == (gimple_bb (iv_phi))->loop_father);
3426 latch_e = loop_latch_edge (iv_loop);
3427 loop_arg = PHI_ARG_DEF_FROM_EDGE (iv_phi, latch_e);
3429 step_expr = STMT_VINFO_LOOP_PHI_EVOLUTION_PART (phi_info);
3430 gcc_assert (step_expr != NULL_TREE);
3432 pe = loop_preheader_edge (iv_loop);
3433 init_expr = PHI_ARG_DEF_FROM_EDGE (iv_phi,
3434 loop_preheader_edge (iv_loop));
3436 vectype = get_vectype_for_scalar_type (TREE_TYPE (init_expr));
3437 resvectype = get_vectype_for_scalar_type (TREE_TYPE (PHI_RESULT (iv_phi)));
3438 gcc_assert (vectype);
3439 nunits = TYPE_VECTOR_SUBPARTS (vectype);
3440 ncopies = vf / nunits;
3442 gcc_assert (phi_info);
3443 gcc_assert (ncopies >= 1);
3445 /* Convert the step to the desired type. */
3446 step_expr = force_gimple_operand (fold_convert (TREE_TYPE (vectype),
3447 step_expr),
3448 &stmts, true, NULL_TREE);
3449 if (stmts)
3451 new_bb = gsi_insert_seq_on_edge_immediate (pe, stmts);
3452 gcc_assert (!new_bb);
3455 /* Find the first insertion point in the BB. */
3456 si = gsi_after_labels (bb);
3458 /* Create the vector that holds the initial_value of the induction. */
3459 if (nested_in_vect_loop)
3461 /* iv_loop is nested in the loop to be vectorized. init_expr had already
3462 been created during vectorization of previous stmts. We obtain it
3463 from the STMT_VINFO_VEC_STMT of the defining stmt. */
3464 vec_init = vect_get_vec_def_for_operand (init_expr, iv_phi, NULL);
3465 /* If the initial value is not of proper type, convert it. */
3466 if (!useless_type_conversion_p (vectype, TREE_TYPE (vec_init)))
3468 new_stmt
3469 = gimple_build_assign (vect_get_new_vect_var (vectype,
3470 vect_simple_var,
3471 "vec_iv_"),
3472 VIEW_CONVERT_EXPR,
3473 build1 (VIEW_CONVERT_EXPR, vectype,
3474 vec_init));
3475 vec_init = make_ssa_name (gimple_assign_lhs (new_stmt), new_stmt);
3476 gimple_assign_set_lhs (new_stmt, vec_init);
3477 new_bb = gsi_insert_on_edge_immediate (loop_preheader_edge (iv_loop),
3478 new_stmt);
3479 gcc_assert (!new_bb);
3480 set_vinfo_for_stmt (new_stmt,
3481 new_stmt_vec_info (new_stmt, loop_vinfo, NULL));
3484 else
3486 vec<constructor_elt, va_gc> *v;
3488 /* iv_loop is the loop to be vectorized. Create:
3489 vec_init = [X, X+S, X+2*S, X+3*S] (S = step_expr, X = init_expr) */
3490 new_var = vect_get_new_vect_var (TREE_TYPE (vectype),
3491 vect_scalar_var, "var_");
3492 new_name = force_gimple_operand (fold_convert (TREE_TYPE (vectype),
3493 init_expr),
3494 &stmts, false, new_var);
3495 if (stmts)
3497 new_bb = gsi_insert_seq_on_edge_immediate (pe, stmts);
3498 gcc_assert (!new_bb);
3501 vec_alloc (v, nunits);
3502 bool constant_p = is_gimple_min_invariant (new_name);
3503 CONSTRUCTOR_APPEND_ELT (v, NULL_TREE, new_name);
3504 for (i = 1; i < nunits; i++)
3506 /* Create: new_name_i = new_name + step_expr */
3507 new_name = fold_build2 (PLUS_EXPR, TREE_TYPE (new_name),
3508 new_name, step_expr);
3509 if (!is_gimple_min_invariant (new_name))
3511 init_stmt = gimple_build_assign (new_var, new_name);
3512 new_name = make_ssa_name (new_var, init_stmt);
3513 gimple_assign_set_lhs (init_stmt, new_name);
3514 new_bb = gsi_insert_on_edge_immediate (pe, init_stmt);
3515 gcc_assert (!new_bb);
3516 if (dump_enabled_p ())
3518 dump_printf_loc (MSG_NOTE, vect_location,
3519 "created new init_stmt: ");
3520 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, init_stmt, 0);
3521 dump_printf (MSG_NOTE, "\n");
3523 constant_p = false;
3525 CONSTRUCTOR_APPEND_ELT (v, NULL_TREE, new_name);
3527 /* Create a vector from [new_name_0, new_name_1, ..., new_name_nunits-1] */
3528 if (constant_p)
3529 new_vec = build_vector_from_ctor (vectype, v);
3530 else
3531 new_vec = build_constructor (vectype, v);
3532 vec_init = vect_init_vector (iv_phi, new_vec, vectype, NULL);
3536 /* Create the vector that holds the step of the induction. */
3537 if (nested_in_vect_loop)
3538 /* iv_loop is nested in the loop to be vectorized. Generate:
3539 vec_step = [S, S, S, S] */
3540 new_name = step_expr;
3541 else
3543 /* iv_loop is the loop to be vectorized. Generate:
3544 vec_step = [VF*S, VF*S, VF*S, VF*S] */
3545 if (SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr)))
3547 expr = build_int_cst (integer_type_node, vf);
3548 expr = fold_convert (TREE_TYPE (step_expr), expr);
3550 else
3551 expr = build_int_cst (TREE_TYPE (step_expr), vf);
3552 new_name = fold_build2 (MULT_EXPR, TREE_TYPE (step_expr),
3553 expr, step_expr);
3554 if (TREE_CODE (step_expr) == SSA_NAME)
3555 new_name = vect_init_vector (iv_phi, new_name,
3556 TREE_TYPE (step_expr), NULL);
3559 t = unshare_expr (new_name);
3560 gcc_assert (CONSTANT_CLASS_P (new_name)
3561 || TREE_CODE (new_name) == SSA_NAME);
3562 stepvectype = get_vectype_for_scalar_type (TREE_TYPE (new_name));
3563 gcc_assert (stepvectype);
3564 new_vec = build_vector_from_val (stepvectype, t);
3565 vec_step = vect_init_vector (iv_phi, new_vec, stepvectype, NULL);
3568 /* Create the following def-use cycle:
3569 loop prolog:
3570 vec_init = ...
3571 vec_step = ...
3572 loop:
3573 vec_iv = PHI <vec_init, vec_loop>
3575 STMT
3577 vec_loop = vec_iv + vec_step; */
3579 /* Create the induction-phi that defines the induction-operand. */
3580 vec_dest = vect_get_new_vect_var (vectype, vect_simple_var, "vec_iv_");
3581 induction_phi = create_phi_node (vec_dest, iv_loop->header);
3582 set_vinfo_for_stmt (induction_phi,
3583 new_stmt_vec_info (induction_phi, loop_vinfo, NULL));
3584 induc_def = PHI_RESULT (induction_phi);
3586 /* Create the iv update inside the loop */
3587 new_stmt = gimple_build_assign (vec_dest, PLUS_EXPR, induc_def, vec_step);
3588 vec_def = make_ssa_name (vec_dest, new_stmt);
3589 gimple_assign_set_lhs (new_stmt, vec_def);
3590 gsi_insert_before (&si, new_stmt, GSI_SAME_STMT);
3591 set_vinfo_for_stmt (new_stmt, new_stmt_vec_info (new_stmt, loop_vinfo,
3592 NULL));
3594 /* Set the arguments of the phi node: */
3595 add_phi_arg (induction_phi, vec_init, pe, UNKNOWN_LOCATION);
3596 add_phi_arg (induction_phi, vec_def, loop_latch_edge (iv_loop),
3597 UNKNOWN_LOCATION);
3600 /* In case that vectorization factor (VF) is bigger than the number
3601 of elements that we can fit in a vectype (nunits), we have to generate
3602 more than one vector stmt - i.e - we need to "unroll" the
3603 vector stmt by a factor VF/nunits. For more details see documentation
3604 in vectorizable_operation. */
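/* For example, VF = 8 with a 4-element vector type gives ncopies = 2:
   the second copy of the induction vector is the first one advanced by
   {4*S, 4*S, 4*S, 4*S}, i.e. nunits * S per copy.  */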
3606 if (ncopies > 1)
3608 stmt_vec_info prev_stmt_vinfo;
3609 /* FORNOW. This restriction should be relaxed. */
3610 gcc_assert (!nested_in_vect_loop);
3612 /* Create the vector that holds the step of the induction. */
3613 if (SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr)))
3615 expr = build_int_cst (integer_type_node, nunits);
3616 expr = fold_convert (TREE_TYPE (step_expr), expr);
3618 else
3619 expr = build_int_cst (TREE_TYPE (step_expr), nunits);
3620 new_name = fold_build2 (MULT_EXPR, TREE_TYPE (step_expr),
3621 expr, step_expr);
3622 if (TREE_CODE (step_expr) == SSA_NAME)
3623 new_name = vect_init_vector (iv_phi, new_name,
3624 TREE_TYPE (step_expr), NULL);
3625 t = unshare_expr (new_name);
3626 gcc_assert (CONSTANT_CLASS_P (new_name)
3627 || TREE_CODE (new_name) == SSA_NAME);
3628 new_vec = build_vector_from_val (stepvectype, t);
3629 vec_step = vect_init_vector (iv_phi, new_vec, stepvectype, NULL);
3631 vec_def = induc_def;
3632 prev_stmt_vinfo = vinfo_for_stmt (induction_phi);
3633 for (i = 1; i < ncopies; i++)
3635 /* vec_i = vec_prev + vec_step */
3636 new_stmt = gimple_build_assign (vec_dest, PLUS_EXPR,
3637 vec_def, vec_step);
3638 vec_def = make_ssa_name (vec_dest, new_stmt);
3639 gimple_assign_set_lhs (new_stmt, vec_def);
3641 gsi_insert_before (&si, new_stmt, GSI_SAME_STMT);
3642 if (!useless_type_conversion_p (resvectype, vectype))
3644 new_stmt
3645 = gimple_build_assign
3646 (vect_get_new_vect_var (resvectype, vect_simple_var,
3647 "vec_iv_"),
3648 VIEW_CONVERT_EXPR,
3649 build1 (VIEW_CONVERT_EXPR, resvectype,
3650 gimple_assign_lhs (new_stmt)));
3651 gimple_assign_set_lhs (new_stmt,
3652 make_ssa_name
3653 (gimple_assign_lhs (new_stmt), new_stmt));
3654 gsi_insert_before (&si, new_stmt, GSI_SAME_STMT);
3656 set_vinfo_for_stmt (new_stmt,
3657 new_stmt_vec_info (new_stmt, loop_vinfo, NULL));
3658 STMT_VINFO_RELATED_STMT (prev_stmt_vinfo) = new_stmt;
3659 prev_stmt_vinfo = vinfo_for_stmt (new_stmt);
3663 if (nested_in_vect_loop)
3665 /* Find the loop-closed exit-phi of the induction, and record
3666 the final vector of induction results: */
3667 exit_phi = NULL;
3668 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, loop_arg)
3670 gimple use_stmt = USE_STMT (use_p);
3671 if (is_gimple_debug (use_stmt))
3672 continue;
3674 if (!flow_bb_inside_loop_p (iv_loop, gimple_bb (use_stmt)))
3676 exit_phi = use_stmt;
3677 break;
3680 if (exit_phi)
3682 stmt_vec_info stmt_vinfo = vinfo_for_stmt (exit_phi);
3683 /* FORNOW. Currently not supporting the case that an inner-loop induction
3684 is not used in the outer-loop (i.e. only outside the outer-loop). */
3685 gcc_assert (STMT_VINFO_RELEVANT_P (stmt_vinfo)
3686 && !STMT_VINFO_LIVE_P (stmt_vinfo));
3688 STMT_VINFO_VEC_STMT (stmt_vinfo) = new_stmt;
3689 if (dump_enabled_p ())
3691 dump_printf_loc (MSG_NOTE, vect_location,
3692 "vector of inductions after inner-loop:");
3693 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, new_stmt, 0);
3694 dump_printf (MSG_NOTE, "\n");
3700 if (dump_enabled_p ())
3702 dump_printf_loc (MSG_NOTE, vect_location,
3703 "transform induction: created def-use cycle: ");
3704 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, induction_phi, 0);
3705 dump_printf (MSG_NOTE, "\n");
3706 dump_gimple_stmt (MSG_NOTE, TDF_SLIM,
3707 SSA_NAME_DEF_STMT (vec_def), 0);
3708 dump_printf (MSG_NOTE, "\n");
3711 STMT_VINFO_VEC_STMT (phi_info) = induction_phi;
3712 if (!useless_type_conversion_p (resvectype, vectype))
3714 new_stmt = gimple_build_assign (vect_get_new_vect_var (resvectype,
3715 vect_simple_var,
3716 "vec_iv_"),
3717 VIEW_CONVERT_EXPR,
3718 build1 (VIEW_CONVERT_EXPR, resvectype,
3719 induc_def));
3720 induc_def = make_ssa_name (gimple_assign_lhs (new_stmt), new_stmt);
3721 gimple_assign_set_lhs (new_stmt, induc_def);
3722 si = gsi_after_labels (bb);
3723 gsi_insert_before (&si, new_stmt, GSI_SAME_STMT);
3724 set_vinfo_for_stmt (new_stmt,
3725 new_stmt_vec_info (new_stmt, loop_vinfo, NULL));
3726 STMT_VINFO_RELATED_STMT (vinfo_for_stmt (new_stmt))
3727 = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (induction_phi));
3730 return induc_def;
3734 /* Function get_initial_def_for_reduction
3736 Input:
3737 STMT - a stmt that performs a reduction operation in the loop.
3738 INIT_VAL - the initial value of the reduction variable
3740 Output:
3741 ADJUSTMENT_DEF - a tree that holds a value to be added to the final result
3742 of the reduction (used for adjusting the epilog - see below).
3743 Return a vector variable, initialized according to the operation that STMT
3744 performs. This vector will be used as the initial value of the
3745 vector of partial results.
3747 Option1 (adjust in epilog): Initialize the vector as follows:
3748 add/bit or/xor: [0,0,...,0,0]
3749 mult/bit and: [1,1,...,1,1]
3750 min/max/cond_expr: [init_val,init_val,..,init_val,init_val]
3751 and when necessary (e.g. add/mult case) let the caller know
3752 that it needs to adjust the result by init_val.
3754 Option2: Initialize the vector as follows:
3755 add/bit or/xor: [init_val,0,0,...,0]
3756 mult/bit and: [init_val,1,1,...,1]
3757 min/max/cond_expr: [init_val,init_val,...,init_val]
3758 and no adjustments are needed.
3760 For example, for the following code:
3762 s = init_val;
3763 for (i=0;i<n;i++)
3764 s = s + a[i];
3766 STMT is 's = s + a[i]', and the reduction variable is 's'.
3767 For a vector of 4 units, we want to return either [0,0,0,init_val],
3768 or [0,0,0,0] and let the caller know that it needs to adjust
3769 the result at the end by 'init_val'.
3771 FORNOW, we use the 'adjust in epilog' scheme (Option1) when ADJUSTMENT_DEF
3772 is not NULL, because this way the initialization vector is simpler (the
3773 same element in all entries), and Option2 otherwise.
3775 A cost model should help decide between these two schemes. */
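/* Editor's example (hypothetical values): for a product reduction

     s = init_val;  for (i = 0; i < n; i++)  s *= a[i];

   with init_val == 5 and a 4-element vectype, Option1 returns
   { 1, 1, 1, 1 } and sets *ADJUSTMENT_DEF to 5, so the epilog multiplies
   the reduced result by 5; Option2 returns { 5, 1, 1, 1 } and needs no
   adjustment.  */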
3777 tree
3778 get_initial_def_for_reduction (gimple stmt, tree init_val,
3779 tree *adjustment_def)
3781 stmt_vec_info stmt_vinfo = vinfo_for_stmt (stmt);
3782 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_vinfo);
3783 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
3784 tree scalar_type = TREE_TYPE (init_val);
3785 tree vectype = get_vectype_for_scalar_type (scalar_type);
3786 int nunits;
3787 enum tree_code code = gimple_assign_rhs_code (stmt);
3788 tree def_for_init;
3789 tree init_def;
3790 tree *elts;
3791 int i;
3792 bool nested_in_vect_loop = false;
3793 tree init_value;
3794 REAL_VALUE_TYPE real_init_val = dconst0;
3795 int int_init_val = 0;
3796 gimple def_stmt = NULL;
3798 gcc_assert (vectype);
3799 nunits = TYPE_VECTOR_SUBPARTS (vectype);
3801 gcc_assert (POINTER_TYPE_P (scalar_type) || INTEGRAL_TYPE_P (scalar_type)
3802 || SCALAR_FLOAT_TYPE_P (scalar_type));
3804 if (nested_in_vect_loop_p (loop, stmt))
3805 nested_in_vect_loop = true;
3806 else
3807 gcc_assert (loop == (gimple_bb (stmt))->loop_father);
3809 /* In case of double reduction we only create a vector variable to be put
3810 in the reduction phi node. The actual statement creation is done in
3811 vect_create_epilog_for_reduction. */
3812 if (adjustment_def && nested_in_vect_loop
3813 && TREE_CODE (init_val) == SSA_NAME
3814 && (def_stmt = SSA_NAME_DEF_STMT (init_val))
3815 && gimple_code (def_stmt) == GIMPLE_PHI
3816 && flow_bb_inside_loop_p (loop, gimple_bb (def_stmt))
3817 && vinfo_for_stmt (def_stmt)
3818 && STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def_stmt))
3819 == vect_double_reduction_def)
3821 *adjustment_def = NULL;
3822 return vect_create_destination_var (init_val, vectype);
3825 if (TREE_CONSTANT (init_val))
3827 if (SCALAR_FLOAT_TYPE_P (scalar_type))
3828 init_value = build_real (scalar_type, TREE_REAL_CST (init_val));
3829 else
3830 init_value = build_int_cst (scalar_type, TREE_INT_CST_LOW (init_val));
3832 else
3833 init_value = init_val;
3835 switch (code)
3837 case WIDEN_SUM_EXPR:
3838 case DOT_PROD_EXPR:
3839 case SAD_EXPR:
3840 case PLUS_EXPR:
3841 case MINUS_EXPR:
3842 case BIT_IOR_EXPR:
3843 case BIT_XOR_EXPR:
3844 case MULT_EXPR:
3845 case BIT_AND_EXPR:
3846 /* ADJUSTMENT_DEF is NULL when called from
3847 vect_create_epilog_for_reduction to vectorize double reduction. */
3848 if (adjustment_def)
3850 if (nested_in_vect_loop)
3851 *adjustment_def = vect_get_vec_def_for_operand (init_val, stmt,
3852 NULL);
3853 else
3854 *adjustment_def = init_val;
3857 if (code == MULT_EXPR)
3859 real_init_val = dconst1;
3860 int_init_val = 1;
3863 if (code == BIT_AND_EXPR)
3864 int_init_val = -1;
3866 if (SCALAR_FLOAT_TYPE_P (scalar_type))
3867 def_for_init = build_real (scalar_type, real_init_val);
3868 else
3869 def_for_init = build_int_cst (scalar_type, int_init_val);
3871 /* Create a vector of '0' or '1' in all elements except the first. */
3872 elts = XALLOCAVEC (tree, nunits);
3873 for (i = nunits - 2; i >= 0; --i)
3874 elts[i + 1] = def_for_init;
3876 /* Option1: the first element is '0' or '1' as well. */
3877 if (adjustment_def)
3879 elts[0] = def_for_init;
3880 init_def = build_vector (vectype, elts);
3881 break;
3884 /* Option2: the first element is INIT_VAL. */
3885 elts[0] = init_val;
3886 if (TREE_CONSTANT (init_val))
3887 init_def = build_vector (vectype, elts);
3888 else
3890 vec<constructor_elt, va_gc> *v;
3891 vec_alloc (v, nunits);
3892 CONSTRUCTOR_APPEND_ELT (v, NULL_TREE, init_val);
3893 for (i = 1; i < nunits; ++i)
3894 CONSTRUCTOR_APPEND_ELT (v, NULL_TREE, elts[i]);
3895 init_def = build_constructor (vectype, v);
3898 break;
3900 case MIN_EXPR:
3901 case MAX_EXPR:
3902 case COND_EXPR:
3903 if (adjustment_def)
3905 *adjustment_def = NULL_TREE;
3906 init_def = vect_get_vec_def_for_operand (init_val, stmt, NULL);
3907 break;
3910 init_def = build_vector_from_val (vectype, init_value);
3911 break;
3913 default:
3914 gcc_unreachable ();
3917 return init_def;
3920 /* Function vect_create_epilog_for_reduction
3922 Create code at the loop-epilog to finalize the result of a reduction
3923 computation.
3925 VECT_DEFS is list of vector of partial results, i.e., the lhs's of vector
3926 reduction statements.
3927 STMT is the scalar reduction stmt that is being vectorized.
3928 NCOPIES is > 1 in case the vectorization factor (VF) is bigger than the
3929 number of elements that we can fit in a vectype (nunits). In this case
3930 we have to generate more than one vector stmt - i.e - we need to "unroll"
3931 the vector stmt by a factor VF/nunits. For more details see documentation
3932 in vectorizable_operation.
3933 REDUC_CODE is the tree-code for the epilog reduction.
3934 REDUCTION_PHIS is a list of the phi-nodes that carry the reduction
3935 computation.
3936 REDUC_INDEX is the index of the operand in the right hand side of the
3937 statement that is defined by REDUCTION_PHI.
3938 DOUBLE_REDUC is TRUE if double reduction phi nodes should be handled.
3939 SLP_NODE is an SLP node containing a group of reduction statements. The
3940 first one in this group is STMT.
3942 This function:
3943 1. Creates the reduction def-use cycles: sets the arguments for
3944 REDUCTION_PHIS:
3945 The loop-entry argument is the vectorized initial-value of the reduction.
3946 The loop-latch argument is taken from VECT_DEFS - the vector of partial
3947 sums.
3948 2. "Reduces" each vector of partial results VECT_DEFS into a single result,
3949 by applying the operation specified by REDUC_CODE if available, or by
3950 other means (whole-vector shifts or a scalar loop).
3951 The function also creates a new phi node at the loop exit to preserve
3952 loop-closed form, as illustrated below.
3954 The flow at the entry to this function:
3956 loop:
3957 vec_def = phi <null, null> # REDUCTION_PHI
3958 VECT_DEF = vector_stmt # vectorized form of STMT
3959 s_loop = scalar_stmt # (scalar) STMT
3960 loop_exit:
3961 s_out0 = phi <s_loop> # (scalar) EXIT_PHI
3962 use <s_out0>
3963 use <s_out0>
3965 The above is transformed by this function into:
3967 loop:
3968 vec_def = phi <vec_init, VECT_DEF> # REDUCTION_PHI
3969 VECT_DEF = vector_stmt # vectorized form of STMT
3970 s_loop = scalar_stmt # (scalar) STMT
3971 loop_exit:
3972 s_out0 = phi <s_loop> # (scalar) EXIT_PHI
3973 v_out1 = phi <VECT_DEF> # NEW_EXIT_PHI
3974 v_out2 = reduce <v_out1>
3975 s_out3 = extract_field <v_out2, 0>
3976 s_out4 = adjust_result <s_out3>
3977 use <s_out4>
3978 use <s_out4>  */
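/* Editor's sketch (made-up values): if the loop produced partial sums
   VECT_DEF = { 10, 20, 30, 40 }, the epilog reduces them to 100
   (v_out2 / s_out3) and, when the Option1 initial vector was used,
   computes s_out4 = 100 + init_val; every former use of s_out0 then
   reads s_out4 instead.  */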
3981 static void
3982 vect_create_epilog_for_reduction (vec<tree> vect_defs, gimple stmt,
3983 int ncopies, enum tree_code reduc_code,
3984 vec<gimple> reduction_phis,
3985 int reduc_index, bool double_reduc,
3986 slp_tree slp_node)
3988 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
3989 stmt_vec_info prev_phi_info;
3990 tree vectype;
3991 machine_mode mode;
3992 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
3993 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo), *outer_loop = NULL;
3994 basic_block exit_bb;
3995 tree scalar_dest;
3996 tree scalar_type;
3997 gimple new_phi = NULL, phi;
3998 gimple_stmt_iterator exit_gsi;
3999 tree vec_dest;
4000 tree new_temp = NULL_TREE, new_dest, new_name, new_scalar_dest;
4001 gimple epilog_stmt = NULL;
4002 enum tree_code code = gimple_assign_rhs_code (stmt);
4003 gimple exit_phi;
4004 tree bitsize;
4005 tree adjustment_def = NULL;
4006 tree vec_initial_def = NULL;
4007 tree reduction_op, expr, def;
4008 tree orig_name, scalar_result;
4009 imm_use_iterator imm_iter, phi_imm_iter;
4010 use_operand_p use_p, phi_use_p;
4011 gimple use_stmt, orig_stmt, reduction_phi = NULL;
4012 bool nested_in_vect_loop = false;
4013 auto_vec<gimple> new_phis;
4014 auto_vec<gimple> inner_phis;
4015 enum vect_def_type dt = vect_unknown_def_type;
4016 int j, i;
4017 auto_vec<tree> scalar_results;
4018 unsigned int group_size = 1, k, ratio;
4019 auto_vec<tree> vec_initial_defs;
4020 auto_vec<gimple> phis;
4021 bool slp_reduc = false;
4022 tree new_phi_result;
4023 gimple inner_phi = NULL;
4025 if (slp_node)
4026 group_size = SLP_TREE_SCALAR_STMTS (slp_node).length ();
4028 if (nested_in_vect_loop_p (loop, stmt))
4030 outer_loop = loop;
4031 loop = loop->inner;
4032 nested_in_vect_loop = true;
4033 gcc_assert (!slp_node);
4036 reduction_op = get_reduction_op (stmt, reduc_index);
4038 vectype = get_vectype_for_scalar_type (TREE_TYPE (reduction_op));
4039 gcc_assert (vectype);
4040 mode = TYPE_MODE (vectype);
4042 /* 1. Create the reduction def-use cycle:
4043 Set the arguments of REDUCTION_PHIS, i.e., transform
4045 loop:
4046 vec_def = phi <null, null> # REDUCTION_PHI
4047 VECT_DEF = vector_stmt # vectorized form of STMT
4050 into:
4052 loop:
4053 vec_def = phi <vec_init, VECT_DEF> # REDUCTION_PHI
4054 VECT_DEF = vector_stmt # vectorized form of STMT
4057 (in case of SLP, do it for all the phis). */
4059 /* Get the loop-entry arguments. */
4060 if (slp_node)
4061 vect_get_vec_defs (reduction_op, NULL_TREE, stmt, &vec_initial_defs,
4062 NULL, slp_node, reduc_index);
4063 else
4065 vec_initial_defs.create (1);
4066 /* For the case of reduction, vect_get_vec_def_for_operand returns
4067 the scalar def before the loop, that defines the initial value
4068 of the reduction variable. */
4069 vec_initial_def = vect_get_vec_def_for_operand (reduction_op, stmt,
4070 &adjustment_def);
4071 vec_initial_defs.quick_push (vec_initial_def);
4074 /* Set phi nodes arguments. */
4075 FOR_EACH_VEC_ELT (reduction_phis, i, phi)
4077 tree vec_init_def, def;
4078 gimple_seq stmts;
4079 vec_init_def = force_gimple_operand (vec_initial_defs[i], &stmts,
4080 true, NULL_TREE);
4081 gsi_insert_seq_on_edge_immediate (loop_preheader_edge (loop), stmts);
4082 def = vect_defs[i];
4083 for (j = 0; j < ncopies; j++)
4085 /* Set the loop-entry arg of the reduction-phi. */
4086 add_phi_arg (as_a <gphi *> (phi), vec_init_def,
4087 loop_preheader_edge (loop), UNKNOWN_LOCATION);
4089 /* Set the loop-latch arg for the reduction-phi. */
4090 if (j > 0)
4091 def = vect_get_vec_def_for_stmt_copy (vect_unknown_def_type, def);
4093 add_phi_arg (as_a <gphi *> (phi), def, loop_latch_edge (loop),
4094 UNKNOWN_LOCATION);
4096 if (dump_enabled_p ())
4098 dump_printf_loc (MSG_NOTE, vect_location,
4099 "transform reduction: created def-use cycle: ");
4100 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, phi, 0);
4101 dump_printf (MSG_NOTE, "\n");
4102 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, SSA_NAME_DEF_STMT (def), 0);
4103 dump_printf (MSG_NOTE, "\n");
4106 phi = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (phi));
4110 /* 2. Create epilog code.
4111 The reduction epilog code operates across the elements of the vector
4112 of partial results computed by the vectorized loop.
4113 The reduction epilog code consists of:
4115 step 1: compute the scalar result in a vector (v_out2)
4116 step 2: extract the scalar result (s_out3) from the vector (v_out2)
4117 step 3: adjust the scalar result (s_out3) if needed.
4119 Step 1 can be accomplished using one of the following three schemes:
4120 (scheme 1) using reduc_code, if available.
4121 (scheme 2) using whole-vector shifts, if available.
4122 (scheme 3) using a scalar loop. In this case steps 1+2 above are
4123 combined.
4125 The overall epilog code looks like this:
4127 s_out0 = phi <s_loop> # original EXIT_PHI
4128 v_out1 = phi <VECT_DEF> # NEW_EXIT_PHI
4129 v_out2 = reduce <v_out1> # step 1
4130 s_out3 = extract_field <v_out2, 0> # step 2
4131 s_out4 = adjust_result <s_out3> # step 3
4133 (step 3 is optional, and steps 1 and 2 may be combined).
4134 Lastly, the uses of s_out0 are replaced by s_out4. */
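  /* Editor's note on the code below: scheme 1 is used when REDUC_CODE is
     not ERROR_MARK (and this is not an SLP reduction); scheme 2 when the
     target has a whole-vector shift for the mode and a vector handler for
     CODE (again non-SLP); and scheme 3, the scalar extraction loop,
     otherwise.  */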
4137 /* 2.1 Create new loop-exit-phis to preserve loop-closed form:
4138 v_out1 = phi <VECT_DEF>
4139 Store them in NEW_PHIS. */
4141 exit_bb = single_exit (loop)->dest;
4142 prev_phi_info = NULL;
4143 new_phis.create (vect_defs.length ());
4144 FOR_EACH_VEC_ELT (vect_defs, i, def)
4146 for (j = 0; j < ncopies; j++)
4148 tree new_def = copy_ssa_name (def);
4149 phi = create_phi_node (new_def, exit_bb);
4150 set_vinfo_for_stmt (phi, new_stmt_vec_info (phi, loop_vinfo, NULL));
4151 if (j == 0)
4152 new_phis.quick_push (phi);
4153 else
4155 def = vect_get_vec_def_for_stmt_copy (dt, def);
4156 STMT_VINFO_RELATED_STMT (prev_phi_info) = phi;
4159 SET_PHI_ARG_DEF (phi, single_exit (loop)->dest_idx, def);
4160 prev_phi_info = vinfo_for_stmt (phi);
4164 /* The epilogue is created for the outer-loop, i.e., for the loop being
4165 vectorized. Create exit phis for the outer loop. */
4166 if (double_reduc)
4168 loop = outer_loop;
4169 exit_bb = single_exit (loop)->dest;
4170 inner_phis.create (vect_defs.length ());
4171 FOR_EACH_VEC_ELT (new_phis, i, phi)
4173 tree new_result = copy_ssa_name (PHI_RESULT (phi));
4174 gphi *outer_phi = create_phi_node (new_result, exit_bb);
4175 SET_PHI_ARG_DEF (outer_phi, single_exit (loop)->dest_idx,
4176 PHI_RESULT (phi));
4177 set_vinfo_for_stmt (outer_phi, new_stmt_vec_info (outer_phi,
4178 loop_vinfo, NULL));
4179 inner_phis.quick_push (phi);
4180 new_phis[i] = outer_phi;
4181 prev_phi_info = vinfo_for_stmt (outer_phi);
4182 while (STMT_VINFO_RELATED_STMT (vinfo_for_stmt (phi)))
4184 phi = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (phi));
4185 new_result = copy_ssa_name (PHI_RESULT (phi));
4186 outer_phi = create_phi_node (new_result, exit_bb);
4187 SET_PHI_ARG_DEF (outer_phi, single_exit (loop)->dest_idx,
4188 PHI_RESULT (phi));
4189 set_vinfo_for_stmt (outer_phi, new_stmt_vec_info (outer_phi,
4190 loop_vinfo, NULL));
4191 STMT_VINFO_RELATED_STMT (prev_phi_info) = outer_phi;
4192 prev_phi_info = vinfo_for_stmt (outer_phi);
4197 exit_gsi = gsi_after_labels (exit_bb);
4199 /* 2.2 Get the relevant tree-code to use in the epilog for schemes 2,3
4200 (i.e. when reduc_code is not available) and in the final adjustment
4201 code (if needed). Also get the original scalar reduction variable as
4202 defined in the loop. In case STMT is a "pattern-stmt" (i.e. - it
4203 represents a reduction pattern), the tree-code and scalar-def are
4204 taken from the original stmt that the pattern-stmt (STMT) replaces.
4205 Otherwise (it is a regular reduction) - the tree-code and scalar-def
4206 are taken from STMT. */
4208 orig_stmt = STMT_VINFO_RELATED_STMT (stmt_info);
4209 if (!orig_stmt)
4211 /* Regular reduction */
4212 orig_stmt = stmt;
4214 else
4216 /* Reduction pattern */
4217 stmt_vec_info stmt_vinfo = vinfo_for_stmt (orig_stmt);
4218 gcc_assert (STMT_VINFO_IN_PATTERN_P (stmt_vinfo));
4219 gcc_assert (STMT_VINFO_RELATED_STMT (stmt_vinfo) == stmt);
4222 code = gimple_assign_rhs_code (orig_stmt);
4223 /* For MINUS_EXPR the initial vector is [init_val,0,...,0], therefore,
4224 partial results are added and not subtracted. */
4225 if (code == MINUS_EXPR)
4226 code = PLUS_EXPR;
4228 scalar_dest = gimple_assign_lhs (orig_stmt);
4229 scalar_type = TREE_TYPE (scalar_dest);
4230 scalar_results.create (group_size);
4231 new_scalar_dest = vect_create_destination_var (scalar_dest, NULL);
4232 bitsize = TYPE_SIZE (scalar_type);
4234 /* In case this is a reduction in an inner-loop while vectorizing an outer
4235 loop - we don't need to extract a single scalar result at the end of the
4236 inner-loop (unless it is double reduction, i.e., the use of reduction is
4237 outside the outer-loop). The final vector of partial results will be used
4238 in the vectorized outer-loop, or reduced to a scalar result at the end of
4239 the outer-loop. */
4240 if (nested_in_vect_loop && !double_reduc)
4241 goto vect_finalize_reduction;
4243 /* SLP reduction without reduction chain, e.g.,
4244 # a1 = phi <a2, a0>
4245 # b1 = phi <b2, b0>
4246 a2 = operation (a1)
4247 b2 = operation (b1) */
4248 slp_reduc = (slp_node && !GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)));
4250 /* In case of reduction chain, e.g.,
4251 # a1 = phi <a3, a0>
4252 a2 = operation (a1)
4253 a3 = operation (a2),
4255 we may end up with more than one vector result. Here we reduce them to
4256 one vector. */
4257 if (GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)))
4259 tree first_vect = PHI_RESULT (new_phis[0]);
4260 tree tmp;
4261 gassign *new_vec_stmt = NULL;
4263 vec_dest = vect_create_destination_var (scalar_dest, vectype);
4264 for (k = 1; k < new_phis.length (); k++)
4266 gimple next_phi = new_phis[k];
4267 tree second_vect = PHI_RESULT (next_phi);
4269 tmp = build2 (code, vectype, first_vect, second_vect);
4270 new_vec_stmt = gimple_build_assign (vec_dest, tmp);
4271 first_vect = make_ssa_name (vec_dest, new_vec_stmt);
4272 gimple_assign_set_lhs (new_vec_stmt, first_vect);
4273 gsi_insert_before (&exit_gsi, new_vec_stmt, GSI_SAME_STMT);
4276 new_phi_result = first_vect;
4277 if (new_vec_stmt)
4279 new_phis.truncate (0);
4280 new_phis.safe_push (new_vec_stmt);
4283 else
4284 new_phi_result = PHI_RESULT (new_phis[0]);
4286 /* 2.3 Create the reduction code, using one of the three schemes described
4287 above. In SLP we simply need to extract all the elements from the
4288 vector (without reducing them), so we use scalar shifts. */
4289 if (reduc_code != ERROR_MARK && !slp_reduc)
4291 tree tmp;
4292 tree vec_elem_type;
4294 /*** Case 1: Create:
4295 v_out2 = reduc_expr <v_out1> */
4297 if (dump_enabled_p ())
4298 dump_printf_loc (MSG_NOTE, vect_location,
4299 "Reduce using direct vector reduction.\n");
4301 vec_elem_type = TREE_TYPE (TREE_TYPE (new_phi_result));
4302 if (!useless_type_conversion_p (scalar_type, vec_elem_type))
4304 tree tmp_dest =
4305 vect_create_destination_var (scalar_dest, vec_elem_type);
4306 tmp = build1 (reduc_code, vec_elem_type, new_phi_result);
4307 epilog_stmt = gimple_build_assign (tmp_dest, tmp);
4308 new_temp = make_ssa_name (tmp_dest, epilog_stmt);
4309 gimple_assign_set_lhs (epilog_stmt, new_temp);
4310 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
4312 tmp = build1 (NOP_EXPR, scalar_type, new_temp);
4314 else
4315 tmp = build1 (reduc_code, scalar_type, new_phi_result);
4316 epilog_stmt = gimple_build_assign (new_scalar_dest, tmp);
4317 new_temp = make_ssa_name (new_scalar_dest, epilog_stmt);
4318 gimple_assign_set_lhs (epilog_stmt, new_temp);
4319 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
4320 scalar_results.safe_push (new_temp);
4322 else
4324 bool reduce_with_shift = have_whole_vector_shift (mode);
4325 int element_bitsize = tree_to_uhwi (bitsize);
4326 int vec_size_in_bits = tree_to_uhwi (TYPE_SIZE (vectype));
4327 tree vec_temp;
4329 /* Regardless of whether we have a whole vector shift, if we're
4330 emulating the operation via tree-vect-generic, we don't want
4331 to use it. Only the first round of the reduction is likely
4332 to still be profitable via emulation. */
4333 /* ??? It might be better to emit a reduction tree code here, so that
4334 tree-vect-generic can expand the first round via bit tricks. */
4335 if (!VECTOR_MODE_P (mode))
4336 reduce_with_shift = false;
4337 else
4339 optab optab = optab_for_tree_code (code, vectype, optab_default);
4340 if (optab_handler (optab, mode) == CODE_FOR_nothing)
4341 reduce_with_shift = false;
4344 if (reduce_with_shift && !slp_reduc)
4346 int nelements = vec_size_in_bits / element_bitsize;
4347 unsigned char *sel = XALLOCAVEC (unsigned char, nelements);
4349 int elt_offset;
4351 tree zero_vec = build_zero_cst (vectype);
4352 /*** Case 2: Create:
4353 for (offset = nelements/2; offset >= 1; offset/=2)
4355 Create: va' = vec_shift <va, offset>
4356 Create: va = vop <va, va'>
4357 } */
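          /* Editor's example (made-up values, assuming the permutation
             shifts elements toward element 0, where the result is
             extracted below): reducing va = { 1, 2, 3, 4 } with PLUS_EXPR
             takes two rounds:
               offset 2:  va' = { 3, 4, 0, 0 }   va = { 4, 6, 3, 4 }
               offset 1:  va' = { 6, 3, 4, 0 }   va = { 10, 9, 7, 4 }
             and the sum 10 ends up in element 0.  */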
4359 tree rhs;
4361 if (dump_enabled_p ())
4362 dump_printf_loc (MSG_NOTE, vect_location,
4363 "Reduce using vector shifts\n");
4365 vec_dest = vect_create_destination_var (scalar_dest, vectype);
4366 new_temp = new_phi_result;
4367 for (elt_offset = nelements / 2;
4368 elt_offset >= 1;
4369 elt_offset /= 2)
4371 calc_vec_perm_mask_for_shift (mode, elt_offset, sel);
4372 tree mask = vect_gen_perm_mask_any (vectype, sel);
4373 epilog_stmt = gimple_build_assign (vec_dest, VEC_PERM_EXPR,
4374 new_temp, zero_vec, mask);
4375 new_name = make_ssa_name (vec_dest, epilog_stmt);
4376 gimple_assign_set_lhs (epilog_stmt, new_name);
4377 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
4379 epilog_stmt = gimple_build_assign (vec_dest, code, new_name,
4380 new_temp);
4381 new_temp = make_ssa_name (vec_dest, epilog_stmt);
4382 gimple_assign_set_lhs (epilog_stmt, new_temp);
4383 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
4386 /* 2.4 Extract the final scalar result. Create:
4387 s_out3 = extract_field <v_out2, bitpos> */
4389 if (dump_enabled_p ())
4390 dump_printf_loc (MSG_NOTE, vect_location,
4391 "extract scalar result\n");
4393 rhs = build3 (BIT_FIELD_REF, scalar_type, new_temp,
4394 bitsize, bitsize_zero_node);
4395 epilog_stmt = gimple_build_assign (new_scalar_dest, rhs);
4396 new_temp = make_ssa_name (new_scalar_dest, epilog_stmt);
4397 gimple_assign_set_lhs (epilog_stmt, new_temp);
4398 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
4399 scalar_results.safe_push (new_temp);
4401 else
4403 /*** Case 3: Create:
4404 s = extract_field <v_out2, 0>
4405 for (offset = element_size;
4406 offset < vector_size;
4407 offset += element_size;)
4409 Create: s' = extract_field <v_out2, offset>
4410 Create: s = op <s, s'> // For non SLP cases
4411 } */
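          /* Editor's example (made-up values): for vec_temp = { 1, 2, 3, 4 }
             and PLUS_EXPR the non-SLP path below computes
               s = 1;  s = s + 2;  s = s + 3;  s = s + 4;   /+ s == 10 +/
             extracting each element with BIT_FIELD_REF, while the SLP path
             only pushes the four extracted values into SCALAR_RESULTS.  */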
4413 if (dump_enabled_p ())
4414 dump_printf_loc (MSG_NOTE, vect_location,
4415 "Reduce using scalar code.\n");
4417 vec_size_in_bits = tree_to_uhwi (TYPE_SIZE (vectype));
4418 FOR_EACH_VEC_ELT (new_phis, i, new_phi)
4420 int bit_offset;
4421 if (gimple_code (new_phi) == GIMPLE_PHI)
4422 vec_temp = PHI_RESULT (new_phi);
4423 else
4424 vec_temp = gimple_assign_lhs (new_phi);
4425 tree rhs = build3 (BIT_FIELD_REF, scalar_type, vec_temp, bitsize,
4426 bitsize_zero_node);
4427 epilog_stmt = gimple_build_assign (new_scalar_dest, rhs);
4428 new_temp = make_ssa_name (new_scalar_dest, epilog_stmt);
4429 gimple_assign_set_lhs (epilog_stmt, new_temp);
4430 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
4432 /* In SLP we don't need to apply reduction operation, so we just
4433 collect s' values in SCALAR_RESULTS. */
4434 if (slp_reduc)
4435 scalar_results.safe_push (new_temp);
4437 for (bit_offset = element_bitsize;
4438 bit_offset < vec_size_in_bits;
4439 bit_offset += element_bitsize)
4441 tree bitpos = bitsize_int (bit_offset);
4442 tree rhs = build3 (BIT_FIELD_REF, scalar_type, vec_temp,
4443 bitsize, bitpos);
4445 epilog_stmt = gimple_build_assign (new_scalar_dest, rhs);
4446 new_name = make_ssa_name (new_scalar_dest, epilog_stmt);
4447 gimple_assign_set_lhs (epilog_stmt, new_name);
4448 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
4450 if (slp_reduc)
4452 /* In SLP we don't need to apply reduction operation, so
4453 we just collect s' values in SCALAR_RESULTS. */
4454 new_temp = new_name;
4455 scalar_results.safe_push (new_name);
4457 else
4459 epilog_stmt = gimple_build_assign (new_scalar_dest, code,
4460 new_name, new_temp);
4461 new_temp = make_ssa_name (new_scalar_dest, epilog_stmt);
4462 gimple_assign_set_lhs (epilog_stmt, new_temp);
4463 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
4468 /* The only case where we need to reduce scalar results in SLP is
4469 unrolling. If the size of SCALAR_RESULTS is greater than
4470 GROUP_SIZE, we reduce them by combining elements modulo
4471 GROUP_SIZE. */
4472 if (slp_reduc)
4474 tree res, first_res, new_res;
4475 gimple new_stmt;
4477 /* Reduce multiple scalar results in case of SLP unrolling. */
4478 for (j = group_size; scalar_results.iterate (j, &res);
4479 j++)
4481 first_res = scalar_results[j % group_size];
4482 new_stmt = gimple_build_assign (new_scalar_dest, code,
4483 first_res, res);
4484 new_res = make_ssa_name (new_scalar_dest, new_stmt);
4485 gimple_assign_set_lhs (new_stmt, new_res);
4486 gsi_insert_before (&exit_gsi, new_stmt, GSI_SAME_STMT);
4487 scalar_results[j % group_size] = new_res;
4490 else
4491 /* Not SLP - we have one scalar to keep in SCALAR_RESULTS. */
4492 scalar_results.safe_push (new_temp);
4496 vect_finalize_reduction:
4498 if (double_reduc)
4499 loop = loop->inner;
4501 /* 2.5 Adjust the final result by the initial value of the reduction
4502 variable. (When such adjustment is not needed, then
4503 'adjustment_def' is zero). For example, if code is PLUS we create:
4504 new_temp = loop_exit_def + adjustment_def */
4506 if (adjustment_def)
4508 gcc_assert (!slp_reduc);
4509 if (nested_in_vect_loop)
4511 new_phi = new_phis[0];
4512 gcc_assert (TREE_CODE (TREE_TYPE (adjustment_def)) == VECTOR_TYPE);
4513 expr = build2 (code, vectype, PHI_RESULT (new_phi), adjustment_def);
4514 new_dest = vect_create_destination_var (scalar_dest, vectype);
4516 else
4518 new_temp = scalar_results[0];
4519 gcc_assert (TREE_CODE (TREE_TYPE (adjustment_def)) != VECTOR_TYPE);
4520 expr = build2 (code, scalar_type, new_temp, adjustment_def);
4521 new_dest = vect_create_destination_var (scalar_dest, scalar_type);
4524 epilog_stmt = gimple_build_assign (new_dest, expr);
4525 new_temp = make_ssa_name (new_dest, epilog_stmt);
4526 gimple_assign_set_lhs (epilog_stmt, new_temp);
4527 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
4528 if (nested_in_vect_loop)
4530 set_vinfo_for_stmt (epilog_stmt,
4531 new_stmt_vec_info (epilog_stmt, loop_vinfo,
4532 NULL));
4533 STMT_VINFO_RELATED_STMT (vinfo_for_stmt (epilog_stmt)) =
4534 STMT_VINFO_RELATED_STMT (vinfo_for_stmt (new_phi));
4536 if (!double_reduc)
4537 scalar_results.quick_push (new_temp);
4538 else
4539 scalar_results[0] = new_temp;
4541 else
4542 scalar_results[0] = new_temp;
4544 new_phis[0] = epilog_stmt;
4547 /* 2.6 Handle the loop-exit phis. Replace the uses of scalar loop-exit
4548 phis with new adjusted scalar results, i.e., replace use <s_out0>
4549 with use <s_out4>.
4551 Transform:
4552 loop_exit:
4553 s_out0 = phi <s_loop> # (scalar) EXIT_PHI
4554 v_out1 = phi <VECT_DEF> # NEW_EXIT_PHI
4555 v_out2 = reduce <v_out1>
4556 s_out3 = extract_field <v_out2, 0>
4557 s_out4 = adjust_result <s_out3>
4558 use <s_out0>
4559 use <s_out0>
4561 into:
4563 loop_exit:
4564 s_out0 = phi <s_loop> # (scalar) EXIT_PHI
4565 v_out1 = phi <VECT_DEF> # NEW_EXIT_PHI
4566 v_out2 = reduce <v_out1>
4567 s_out3 = extract_field <v_out2, 0>
4568 s_out4 = adjust_result <s_out3>
4569 use <s_out4>
4570 use <s_out4> */
4573 /* In SLP reduction chain we reduce vector results into one vector if
4574 necessary, hence we set here GROUP_SIZE to 1. SCALAR_DEST is the LHS of
4575 the last stmt in the reduction chain, since we are looking for the loop
4576 exit phi node. */
4577 if (GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)))
4579 scalar_dest = gimple_assign_lhs (
4580 SLP_TREE_SCALAR_STMTS (slp_node)[group_size - 1]);
4581 group_size = 1;
4584 /* In SLP we may have several statements in NEW_PHIS and REDUCTION_PHIS (in
4585 case GROUP_SIZE is greater than the vectorization factor). Therefore, we
4586 need to match SCALAR_RESULTS with corresponding statements. The first
4587 (GROUP_SIZE / number of new vector stmts) scalar results correspond to
4588 the first vector stmt, etc.
4589 (RATIO is equal to (GROUP_SIZE / number of new vector stmts)). */
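  /* Editor's example (made-up numbers): with GROUP_SIZE == 4 and two
     vector stmts in NEW_PHIS, RATIO == 2, so scalar results 0 and 1 are
     matched with the first vector stmt/phi and results 2 and 3 with the
     second one in the loop below.  */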
4590 if (group_size > new_phis.length ())
4592 ratio = group_size / new_phis.length ();
4593 gcc_assert (!(group_size % new_phis.length ()));
4595 else
4596 ratio = 1;
4598 for (k = 0; k < group_size; k++)
4600 if (k % ratio == 0)
4602 epilog_stmt = new_phis[k / ratio];
4603 reduction_phi = reduction_phis[k / ratio];
4604 if (double_reduc)
4605 inner_phi = inner_phis[k / ratio];
4608 if (slp_reduc)
4610 gimple current_stmt = SLP_TREE_SCALAR_STMTS (slp_node)[k];
4612 orig_stmt = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (current_stmt));
4613 /* SLP statements can't participate in patterns. */
4614 gcc_assert (!orig_stmt);
4615 scalar_dest = gimple_assign_lhs (current_stmt);
4618 phis.create (3);
4619 /* Find the loop-closed-use at the loop exit of the original scalar
4620 result. (The reduction result is expected to have two immediate uses -
4621 one at the latch block, and one at the loop exit). */
4622 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, scalar_dest)
4623 if (!flow_bb_inside_loop_p (loop, gimple_bb (USE_STMT (use_p)))
4624 && !is_gimple_debug (USE_STMT (use_p)))
4625 phis.safe_push (USE_STMT (use_p));
4627 /* While we expect to have found an exit_phi because of loop-closed-ssa
4628 form we can end up without one if the scalar cycle is dead. */
4630 FOR_EACH_VEC_ELT (phis, i, exit_phi)
4632 if (outer_loop)
4634 stmt_vec_info exit_phi_vinfo = vinfo_for_stmt (exit_phi);
4635 gphi *vect_phi;
4637 /* FORNOW. Currently not supporting the case that an inner-loop
4638 reduction is not used in the outer-loop (but only outside the
4639 outer-loop), unless it is double reduction. */
4640 gcc_assert ((STMT_VINFO_RELEVANT_P (exit_phi_vinfo)
4641 && !STMT_VINFO_LIVE_P (exit_phi_vinfo))
4642 || double_reduc);
4644 if (double_reduc)
4645 STMT_VINFO_VEC_STMT (exit_phi_vinfo) = inner_phi;
4646 else
4647 STMT_VINFO_VEC_STMT (exit_phi_vinfo) = epilog_stmt;
4648 if (!double_reduc
4649 || STMT_VINFO_DEF_TYPE (exit_phi_vinfo)
4650 != vect_double_reduction_def)
4651 continue;
4653 /* Handle double reduction:
4655 stmt1: s1 = phi <s0, s2> - double reduction phi (outer loop)
4656 stmt2: s3 = phi <s1, s4> - (regular) reduc phi (inner loop)
4657 stmt3: s4 = use (s3) - (regular) reduc stmt (inner loop)
4658 stmt4: s2 = phi <s4> - double reduction stmt (outer loop)
4660 At that point the regular reduction (stmt2 and stmt3) is
4661 already vectorized, as well as the exit phi node, stmt4.
4662 Here we vectorize the phi node of double reduction, stmt1, and
4663 update all relevant statements. */
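          /* Editor's example of a double reduction at the source level
             (a sketch, not taken from the sources):

               int s = 0;
               for (i = 0; i < n; i++)        <- outer loop: stmt1, stmt4
                 for (j = 0; j < m; j++)      <- inner loop: stmt2, stmt3
                   s += a[i][j];                                          */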
4665 /* Go through all the uses of s2 to find double reduction phi
4666 node, i.e., stmt1 above. */
4667 orig_name = PHI_RESULT (exit_phi);
4668 FOR_EACH_IMM_USE_STMT (use_stmt, imm_iter, orig_name)
4670 stmt_vec_info use_stmt_vinfo;
4671 stmt_vec_info new_phi_vinfo;
4672 tree vect_phi_init, preheader_arg, vect_phi_res, init_def;
4673 basic_block bb = gimple_bb (use_stmt);
4674 gimple use;
4676 /* Check that USE_STMT is really a double reduction phi
4677 node. */
4678 if (gimple_code (use_stmt) != GIMPLE_PHI
4679 || gimple_phi_num_args (use_stmt) != 2
4680 || bb->loop_father != outer_loop)
4681 continue;
4682 use_stmt_vinfo = vinfo_for_stmt (use_stmt);
4683 if (!use_stmt_vinfo
4684 || STMT_VINFO_DEF_TYPE (use_stmt_vinfo)
4685 != vect_double_reduction_def)
4686 continue;
4688 /* Create vector phi node for double reduction:
4689 vs1 = phi <vs0, vs2>
4690 vs1 was created previously in this function by a call to
4691 vect_get_vec_def_for_operand and is stored in
4692 vec_initial_def;
4693 vs2 is defined by INNER_PHI, the vectorized EXIT_PHI;
4694 vs0 is created here. */
4696 /* Create vector phi node. */
4697 vect_phi = create_phi_node (vec_initial_def, bb);
4698 new_phi_vinfo = new_stmt_vec_info (vect_phi,
4699 loop_vec_info_for_loop (outer_loop), NULL);
4700 set_vinfo_for_stmt (vect_phi, new_phi_vinfo);
4702 /* Create vs0 - initial def of the double reduction phi. */
4703 preheader_arg = PHI_ARG_DEF_FROM_EDGE (use_stmt,
4704 loop_preheader_edge (outer_loop));
4705 init_def = get_initial_def_for_reduction (stmt,
4706 preheader_arg, NULL);
4707 vect_phi_init = vect_init_vector (use_stmt, init_def,
4708 vectype, NULL);
4710 /* Update phi node arguments with vs0 and vs2. */
4711 add_phi_arg (vect_phi, vect_phi_init,
4712 loop_preheader_edge (outer_loop),
4713 UNKNOWN_LOCATION);
4714 add_phi_arg (vect_phi, PHI_RESULT (inner_phi),
4715 loop_latch_edge (outer_loop), UNKNOWN_LOCATION);
4716 if (dump_enabled_p ())
4718 dump_printf_loc (MSG_NOTE, vect_location,
4719 "created double reduction phi node: ");
4720 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, vect_phi, 0);
4721 dump_printf (MSG_NOTE, "\n");
4724 vect_phi_res = PHI_RESULT (vect_phi);
4726 /* Replace the use, i.e., set the correct vs1 in the regular
4727 reduction phi node. FORNOW, NCOPIES is always 1, so the
4728 loop is redundant. */
4729 use = reduction_phi;
4730 for (j = 0; j < ncopies; j++)
4732 edge pr_edge = loop_preheader_edge (loop);
4733 SET_PHI_ARG_DEF (use, pr_edge->dest_idx, vect_phi_res);
4734 use = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (use));
4740 phis.release ();
4741 if (nested_in_vect_loop)
4743 if (double_reduc)
4744 loop = outer_loop;
4745 else
4746 continue;
4749 phis.create (3);
4750 /* Find the loop-closed-use at the loop exit of the original scalar
4751 result. (The reduction result is expected to have two immediate uses,
4752 one at the latch block, and one at the loop exit). For double
4753 reductions we are looking for exit phis of the outer loop. */
4754 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, scalar_dest)
4756 if (!flow_bb_inside_loop_p (loop, gimple_bb (USE_STMT (use_p))))
4758 if (!is_gimple_debug (USE_STMT (use_p)))
4759 phis.safe_push (USE_STMT (use_p));
4761 else
4763 if (double_reduc && gimple_code (USE_STMT (use_p)) == GIMPLE_PHI)
4765 tree phi_res = PHI_RESULT (USE_STMT (use_p));
4767 FOR_EACH_IMM_USE_FAST (phi_use_p, phi_imm_iter, phi_res)
4769 if (!flow_bb_inside_loop_p (loop,
4770 gimple_bb (USE_STMT (phi_use_p)))
4771 && !is_gimple_debug (USE_STMT (phi_use_p)))
4772 phis.safe_push (USE_STMT (phi_use_p));
4778 FOR_EACH_VEC_ELT (phis, i, exit_phi)
4780 /* Replace the uses: */
4781 orig_name = PHI_RESULT (exit_phi);
4782 scalar_result = scalar_results[k];
4783 FOR_EACH_IMM_USE_STMT (use_stmt, imm_iter, orig_name)
4784 FOR_EACH_IMM_USE_ON_STMT (use_p, imm_iter)
4785 SET_USE (use_p, scalar_result);
4788 phis.release ();
4793 /* Function vectorizable_reduction.
4795 Check if STMT performs a reduction operation that can be vectorized.
4796 If VEC_STMT is also passed, vectorize the STMT: create a vectorized
4797 stmt to replace it, put it in VEC_STMT, and insert it at GSI.
4798 Return FALSE if not a vectorizable STMT, TRUE otherwise.
4800 This function also handles reduction idioms (patterns) that have been
4801 recognized in advance during vect_pattern_recog. In this case, STMT may be
4802 of this form:
4803 X = pattern_expr (arg0, arg1, ..., X)
4804 and its STMT_VINFO_RELATED_STMT points to the last stmt in the original
4805 sequence that had been detected and replaced by the pattern-stmt (STMT).
4807 In some cases of reduction patterns, the type of the reduction variable X is
4808 different than the type of the other arguments of STMT.
4809 In such cases, the vectype that is used when transforming STMT into a vector
4810 stmt is different than the vectype that is used to determine the
4811 vectorization factor, because it consists of a different number of elements
4812 than the actual number of elements that are being operated upon in parallel.
4814 For example, consider an accumulation of shorts into an int accumulator.
4815 On some targets it's possible to vectorize this pattern operating on 8
4816 shorts at a time (hence, the vectype for purposes of determining the
4817 vectorization factor should be V8HI); on the other hand, the vectype that
4818 is used to create the vector form is actually V4SI (the type of the result).
4820 Upon entry to this function, STMT_VINFO_VECTYPE records the vectype that
4821 indicates the actual level of parallelism (V8HI in the example), so
4822 that the right vectorization factor is derived. This vectype
4823 corresponds to the type of arguments to the reduction stmt, and should *NOT*
4824 be used to create the vectorized stmt. The right vectype for the vectorized
4825 stmt is obtained from the type of the result X:
4826 get_vectype_for_scalar_type (TREE_TYPE (X))
4828 This means that, contrary to "regular" reductions (or "regular" stmts in
4829 general), the following equation:
4830 STMT_VINFO_VECTYPE == get_vectype_for_scalar_type (TREE_TYPE (X))
4831 does *NOT* necessarily hold for reduction patterns. */
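/* Editor's restatement of the example above as source code (a sketch):

     short a[N];  int sum = 0;
     for (i = 0; i < N; i++)
       sum += a[i];

   The pattern stmt is  sum_1 = WIDEN_SUM_EXPR <a_elt, sum_0>;
   STMT_VINFO_VECTYPE is V8HI (it drives the vectorization factor), while
   the vectorized stmt itself is created with
   get_vectype_for_scalar_type (TREE_TYPE (sum_1)), i.e. V4SI.  */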
4833 bool
4834 vectorizable_reduction (gimple stmt, gimple_stmt_iterator *gsi,
4835 gimple *vec_stmt, slp_tree slp_node)
4837 tree vec_dest;
4838 tree scalar_dest;
4839 tree loop_vec_def0 = NULL_TREE, loop_vec_def1 = NULL_TREE;
4840 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
4841 tree vectype_out = STMT_VINFO_VECTYPE (stmt_info);
4842 tree vectype_in = NULL_TREE;
4843 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
4844 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
4845 enum tree_code code, orig_code, epilog_reduc_code;
4846 machine_mode vec_mode;
4847 int op_type;
4848 optab optab, reduc_optab;
4849 tree new_temp = NULL_TREE;
4850 tree def;
4851 gimple def_stmt;
4852 enum vect_def_type dt;
4853 gphi *new_phi = NULL;
4854 tree scalar_type;
4855 bool is_simple_use;
4856 gimple orig_stmt;
4857 stmt_vec_info orig_stmt_info;
4858 tree expr = NULL_TREE;
4859 int i;
4860 int ncopies;
4861 int epilog_copies;
4862 stmt_vec_info prev_stmt_info, prev_phi_info;
4863 bool single_defuse_cycle = false;
4864 tree reduc_def = NULL_TREE;
4865 gimple new_stmt = NULL;
4866 int j;
4867 tree ops[3];
4868 bool nested_cycle = false, found_nested_cycle_def = false;
4869 gimple reduc_def_stmt = NULL;
4870 bool double_reduc = false, dummy;
4871 basic_block def_bb;
4872 struct loop * def_stmt_loop, *outer_loop = NULL;
4873 tree def_arg;
4874 gimple def_arg_stmt;
4875 auto_vec<tree> vec_oprnds0;
4876 auto_vec<tree> vec_oprnds1;
4877 auto_vec<tree> vect_defs;
4878 auto_vec<gimple> phis;
4879 int vec_num;
4880 tree def0, def1, tem, op0, op1 = NULL_TREE;
4882 /* In case of reduction chain we switch to the first stmt in the chain, but
4883 we don't update STMT_INFO, since only the last stmt is marked as reduction
4884 and has reduction properties. */
4885 if (GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)))
4886 stmt = GROUP_FIRST_ELEMENT (stmt_info);
4888 if (nested_in_vect_loop_p (loop, stmt))
4890 outer_loop = loop;
4891 loop = loop->inner;
4892 nested_cycle = true;
4895 /* 1. Is vectorizable reduction? */
4896 /* Not supportable if the reduction variable is used in the loop, unless
4897 it's a reduction chain. */
4898 if (STMT_VINFO_RELEVANT (stmt_info) > vect_used_in_outer
4899 && !GROUP_FIRST_ELEMENT (stmt_info))
4900 return false;
4902 /* Reductions that are not used even in an enclosing outer-loop,
4903 are expected to be "live" (used out of the loop). */
4904 if (STMT_VINFO_RELEVANT (stmt_info) == vect_unused_in_scope
4905 && !STMT_VINFO_LIVE_P (stmt_info))
4906 return false;
4908 /* Make sure it was already recognized as a reduction computation. */
4909 if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_reduction_def
4910 && STMT_VINFO_DEF_TYPE (stmt_info) != vect_nested_cycle)
4911 return false;
4913 /* 2. Has this been recognized as a reduction pattern?
4915 Check if STMT represents a pattern that has been recognized
4916 in earlier analysis stages. For stmts that represent a pattern,
4917 the STMT_VINFO_RELATED_STMT field records the last stmt in
4918 the original sequence that constitutes the pattern. */
4920 orig_stmt = STMT_VINFO_RELATED_STMT (stmt_info);
4921 if (orig_stmt)
4923 orig_stmt_info = vinfo_for_stmt (orig_stmt);
4924 gcc_assert (STMT_VINFO_IN_PATTERN_P (orig_stmt_info));
4925 gcc_assert (!STMT_VINFO_IN_PATTERN_P (stmt_info));
4928 /* 3. Check the operands of the operation. The first operands are defined
4929 inside the loop body. The last operand is the reduction variable,
4930 which is defined by the loop-header-phi. */
4932 gcc_assert (is_gimple_assign (stmt));
4934 /* Flatten RHS. */
4935 switch (get_gimple_rhs_class (gimple_assign_rhs_code (stmt)))
4937 case GIMPLE_SINGLE_RHS:
4938 op_type = TREE_OPERAND_LENGTH (gimple_assign_rhs1 (stmt));
4939 if (op_type == ternary_op)
4941 tree rhs = gimple_assign_rhs1 (stmt);
4942 ops[0] = TREE_OPERAND (rhs, 0);
4943 ops[1] = TREE_OPERAND (rhs, 1);
4944 ops[2] = TREE_OPERAND (rhs, 2);
4945 code = TREE_CODE (rhs);
4947 else
4948 return false;
4949 break;
4951 case GIMPLE_BINARY_RHS:
4952 code = gimple_assign_rhs_code (stmt);
4953 op_type = TREE_CODE_LENGTH (code);
4954 gcc_assert (op_type == binary_op);
4955 ops[0] = gimple_assign_rhs1 (stmt);
4956 ops[1] = gimple_assign_rhs2 (stmt);
4957 break;
4959 case GIMPLE_TERNARY_RHS:
4960 code = gimple_assign_rhs_code (stmt);
4961 op_type = TREE_CODE_LENGTH (code);
4962 gcc_assert (op_type == ternary_op);
4963 ops[0] = gimple_assign_rhs1 (stmt);
4964 ops[1] = gimple_assign_rhs2 (stmt);
4965 ops[2] = gimple_assign_rhs3 (stmt);
4966 break;
4968 case GIMPLE_UNARY_RHS:
4969 return false;
4971 default:
4972 gcc_unreachable ();
4974 /* The default is that the reduction variable is the last in statement. */
4975 int reduc_index = op_type - 1;
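  /* Editor's example: for  s_1 = x_2 + s_0  ops[] is { x_2, s_0 }, so
     reduc_index starts as 1; the operand scan below may override it when
     a vect_nested_cycle definition is found.  */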
4977 if (code == COND_EXPR && slp_node)
4978 return false;
4980 scalar_dest = gimple_assign_lhs (stmt);
4981 scalar_type = TREE_TYPE (scalar_dest);
4982 if (!POINTER_TYPE_P (scalar_type) && !INTEGRAL_TYPE_P (scalar_type)
4983 && !SCALAR_FLOAT_TYPE_P (scalar_type))
4984 return false;
4986 /* Do not try to vectorize bit-precision reductions. */
4987 if ((TYPE_PRECISION (scalar_type)
4988 != GET_MODE_PRECISION (TYPE_MODE (scalar_type))))
4989 return false;
4991 /* All uses but the last are expected to be defined in the loop.
4992 The last use is the reduction variable. In case of nested cycle this
4993 assumption is not true: we use reduc_index to record the index of the
4994 reduction variable. */
4995 for (i = 0; i < op_type - 1; i++)
4997 /* The condition of COND_EXPR is checked in vectorizable_condition(). */
4998 if (i == 0 && code == COND_EXPR)
4999 continue;
5001 is_simple_use = vect_is_simple_use_1 (ops[i], stmt, loop_vinfo, NULL,
5002 &def_stmt, &def, &dt, &tem);
5003 if (!vectype_in)
5004 vectype_in = tem;
5005 gcc_assert (is_simple_use);
5007 if (dt != vect_internal_def
5008 && dt != vect_external_def
5009 && dt != vect_constant_def
5010 && dt != vect_induction_def
5011 && !(dt == vect_nested_cycle && nested_cycle))
5012 return false;
5014 if (dt == vect_nested_cycle)
5016 found_nested_cycle_def = true;
5017 reduc_def_stmt = def_stmt;
5018 reduc_index = i;
5022 is_simple_use = vect_is_simple_use_1 (ops[i], stmt, loop_vinfo, NULL,
5023 &def_stmt, &def, &dt, &tem);
5024 if (!vectype_in)
5025 vectype_in = tem;
5026 gcc_assert (is_simple_use);
5027 if (!found_nested_cycle_def)
5028 reduc_def_stmt = def_stmt;
5030 if (reduc_def_stmt && gimple_code (reduc_def_stmt) != GIMPLE_PHI)
5031 return false;
5033 if (!(dt == vect_reduction_def
5034 || dt == vect_nested_cycle
5035 || ((dt == vect_internal_def || dt == vect_external_def
5036 || dt == vect_constant_def || dt == vect_induction_def)
5037 && nested_cycle && found_nested_cycle_def)))
5039 /* For pattern recognized stmts, orig_stmt might be a reduction,
5040 but some helper statements for the pattern might not, or
5041 might be COND_EXPRs with reduction uses in the condition. */
5042 gcc_assert (orig_stmt);
5043 return false;
5046 if (orig_stmt)
5047 gcc_assert (orig_stmt == vect_is_simple_reduction (loop_vinfo,
5048 reduc_def_stmt,
5049 !nested_cycle,
5050 &dummy));
5051 else
5053 gimple tmp = vect_is_simple_reduction (loop_vinfo, reduc_def_stmt,
5054 !nested_cycle, &dummy);
5055 /* We changed STMT to be the first stmt in reduction chain, hence we
5056 check that in this case the first element in the chain is STMT. */
5057 gcc_assert (stmt == tmp
5058 || GROUP_FIRST_ELEMENT (vinfo_for_stmt (tmp)) == stmt);
5061 if (STMT_VINFO_LIVE_P (vinfo_for_stmt (reduc_def_stmt)))
5062 return false;
5064 if (slp_node || PURE_SLP_STMT (stmt_info))
5065 ncopies = 1;
5066 else
5067 ncopies = (LOOP_VINFO_VECT_FACTOR (loop_vinfo)
5068 / TYPE_VECTOR_SUBPARTS (vectype_in));
5070 gcc_assert (ncopies >= 1);
5072 vec_mode = TYPE_MODE (vectype_in);
5074 if (code == COND_EXPR)
5076 if (!vectorizable_condition (stmt, gsi, NULL, ops[reduc_index], 0, NULL))
5078 if (dump_enabled_p ())
5079 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5080 "unsupported condition in reduction\n");
5082 return false;
5085 else
5087 /* 4. Supportable by target? */
5089 if (code == LSHIFT_EXPR || code == RSHIFT_EXPR
5090 || code == LROTATE_EXPR || code == RROTATE_EXPR)
5092 /* Shifts and rotates are only supported by vectorizable_shifts,
5093 not vectorizable_reduction. */
5094 if (dump_enabled_p ())
5095 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5096 "unsupported shift or rotation.\n");
5097 return false;
5100 /* 4.1. check support for the operation in the loop */
5101 optab = optab_for_tree_code (code, vectype_in, optab_default);
5102 if (!optab)
5104 if (dump_enabled_p ())
5105 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5106 "no optab.\n");
5108 return false;
5111 if (optab_handler (optab, vec_mode) == CODE_FOR_nothing)
5113 if (dump_enabled_p ())
5114 dump_printf (MSG_NOTE, "op not supported by target.\n");
5116 if (GET_MODE_SIZE (vec_mode) != UNITS_PER_WORD
5117 || LOOP_VINFO_VECT_FACTOR (loop_vinfo)
5118 < vect_min_worthwhile_factor (code))
5119 return false;
5121 if (dump_enabled_p ())
5122 dump_printf (MSG_NOTE, "proceeding using word mode.\n");
5125 /* Worthwhile without SIMD support? */
5126 if (!VECTOR_MODE_P (TYPE_MODE (vectype_in))
5127 && LOOP_VINFO_VECT_FACTOR (loop_vinfo)
5128 < vect_min_worthwhile_factor (code))
5130 if (dump_enabled_p ())
5131 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5132 "not worthwhile without SIMD support.\n");
5134 return false;
5138 /* 4.2. Check support for the epilog operation.
5140 If STMT represents a reduction pattern, then the type of the
5141 reduction variable may be different than the type of the rest
5142 of the arguments. For example, consider the case of accumulation
5143 of shorts into an int accumulator. The original code:
5144 S1: int_a = (int) short_a;
5145 orig_stmt-> S2: int_acc = plus <int_a ,int_acc>;
5147 was replaced with:
5148 STMT: int_acc = widen_sum <short_a, int_acc>
5150 This means that:
5151 1. The tree-code that is used to create the vector operation in the
5152 epilog code (that reduces the partial results) is not the
5153 tree-code of STMT, but is rather the tree-code of the original
5154 stmt from the pattern that STMT is replacing. I.e., in the example
5155 above we want to use 'widen_sum' in the loop, but 'plus' in the
5156 epilog.
5157 2. The type (mode) we use to check available target support
5158 for the vector operation to be created in the *epilog*, is
5159 determined by the type of the reduction variable (in the example
5160 above we'd check this: optab_handler (plus_optab, vect_int_mode])).
5161 However the type (mode) we use to check available target support
5162 for the vector operation to be created *inside the loop*, is
5163 determined by the type of the other arguments to STMT (in the
5164 example we'd check this: optab_handler (widen_sum_optab,
5165 vect_short_mode)).
5167 This is contrary to "regular" reductions, in which the types of all
5168 the arguments are the same as the type of the reduction variable.
5169 For "regular" reductions we can therefore use the same vector type
5170 (and also the same tree-code) when generating the epilog code and
5171 when generating the code inside the loop. */
5173 if (orig_stmt)
5175 /* This is a reduction pattern: get the vectype from the type of the
5176 reduction variable, and get the tree-code from orig_stmt. */
5177 orig_code = gimple_assign_rhs_code (orig_stmt);
5178 gcc_assert (vectype_out);
5179 vec_mode = TYPE_MODE (vectype_out);
5181 else
5183 /* Regular reduction: the same vectype and tree-code that are used for
5184 the vector code inside the loop can also be used for the epilog code. */
5185 orig_code = code;
5188 if (nested_cycle)
5190 def_bb = gimple_bb (reduc_def_stmt);
5191 def_stmt_loop = def_bb->loop_father;
5192 def_arg = PHI_ARG_DEF_FROM_EDGE (reduc_def_stmt,
5193 loop_preheader_edge (def_stmt_loop));
5194 if (TREE_CODE (def_arg) == SSA_NAME
5195 && (def_arg_stmt = SSA_NAME_DEF_STMT (def_arg))
5196 && gimple_code (def_arg_stmt) == GIMPLE_PHI
5197 && flow_bb_inside_loop_p (outer_loop, gimple_bb (def_arg_stmt))
5198 && vinfo_for_stmt (def_arg_stmt)
5199 && STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def_arg_stmt))
5200 == vect_double_reduction_def)
5201 double_reduc = true;
5204 epilog_reduc_code = ERROR_MARK;
5205 if (reduction_code_for_scalar_code (orig_code, &epilog_reduc_code))
5207 reduc_optab = optab_for_tree_code (epilog_reduc_code, vectype_out,
5208 optab_default);
5209 if (!reduc_optab)
5211 if (dump_enabled_p ())
5212 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5213 "no optab for reduction.\n");
5215 epilog_reduc_code = ERROR_MARK;
5217 else if (optab_handler (reduc_optab, vec_mode) == CODE_FOR_nothing)
5219 optab = scalar_reduc_to_vector (reduc_optab, vectype_out);
5220 if (optab_handler (optab, vec_mode) == CODE_FOR_nothing)
5222 if (dump_enabled_p ())
5223 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5224 "reduc op not supported by target.\n");
5226 epilog_reduc_code = ERROR_MARK;
5230 else
5232 if (!nested_cycle || double_reduc)
5234 if (dump_enabled_p ())
5235 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5236 "no reduc code for scalar code.\n");
5238 return false;
5242 if (double_reduc && ncopies > 1)
5244 if (dump_enabled_p ())
5245 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5246 "multiple types in double reduction\n");
5248 return false;
5251 /* In case of widening multiplication by a constant, we update the type
5252 of the constant to be the type of the other operand. We check that the
5253 constant fits the type in the pattern recognition pass. */
5254 if (code == DOT_PROD_EXPR
5255 && !types_compatible_p (TREE_TYPE (ops[0]), TREE_TYPE (ops[1])))
5257 if (TREE_CODE (ops[0]) == INTEGER_CST)
5258 ops[0] = fold_convert (TREE_TYPE (ops[1]), ops[0]);
5259 else if (TREE_CODE (ops[1]) == INTEGER_CST)
5260 ops[1] = fold_convert (TREE_TYPE (ops[0]), ops[1]);
5261 else
5263 if (dump_enabled_p ())
5264 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5265 "invalid types in dot-prod\n");
5267 return false;
5271 if (!vec_stmt) /* transformation not required. */
5273 if (!vect_model_reduction_cost (stmt_info, epilog_reduc_code, ncopies,
5274 reduc_index))
5275 return false;
5276 STMT_VINFO_TYPE (stmt_info) = reduc_vec_info_type;
5277 return true;
5280 /** Transform. **/
5282 if (dump_enabled_p ())
5283 dump_printf_loc (MSG_NOTE, vect_location, "transform reduction.\n");
5285 /* FORNOW: Multiple types are not supported for condition. */
5286 if (code == COND_EXPR)
5287 gcc_assert (ncopies == 1);
5289 /* Create the destination vector */
5290 vec_dest = vect_create_destination_var (scalar_dest, vectype_out);
5292 /* In case the vectorization factor (VF) is bigger than the number
5293 of elements that we can fit in a vectype (nunits), we have to generate
5294 more than one vector stmt - i.e - we need to "unroll" the
5295 vector stmt by a factor VF/nunits. For more details see documentation
5296 in vectorizable_operation. */
5298 /* If the reduction is used in an outer loop we need to generate
5299 VF intermediate results, like so (e.g. for ncopies=2):
5300 r0 = phi (init, r0)
5301 r1 = phi (init, r1)
5302 r0 = x0 + r0;
5303 r1 = x1 + r1;
5304 (i.e. we generate VF results in 2 registers).
5305 In this case we have a separate def-use cycle for each copy, and therefore
5306 for each copy we get the vector def for the reduction variable from the
5307 respective phi node created for this copy.
5309 Otherwise (the reduction is unused in the loop nest), we can combine
5310 together intermediate results, like so (e.g. for ncopies=2):
5311 r = phi (init, r)
5312 r = x0 + r;
5313 r = x1 + r;
5314 (i.e. we generate VF/2 results in a single register).
5315 In this case for each copy we get the vector def for the reduction variable
5316 from the vectorized reduction operation generated in the previous iteration. */
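  /* Editor's note: the test below keys off STMT_VINFO_RELEVANT; a
     reduction that is vect_unused_in_scope uses the single def-use cycle
     (epilog_copies == 1), otherwise epilog_copies == ncopies.  */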
5319 if (STMT_VINFO_RELEVANT (stmt_info) == vect_unused_in_scope)
5321 single_defuse_cycle = true;
5322 epilog_copies = 1;
5324 else
5325 epilog_copies = ncopies;
5327 prev_stmt_info = NULL;
5328 prev_phi_info = NULL;
5329 if (slp_node)
5331 vec_num = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node);
5332 gcc_assert (TYPE_VECTOR_SUBPARTS (vectype_out)
5333 == TYPE_VECTOR_SUBPARTS (vectype_in));
5335 else
5337 vec_num = 1;
5338 vec_oprnds0.create (1);
5339 if (op_type == ternary_op)
5340 vec_oprnds1.create (1);
5343 phis.create (vec_num);
5344 vect_defs.create (vec_num);
5345 if (!slp_node)
5346 vect_defs.quick_push (NULL_TREE);
5348 for (j = 0; j < ncopies; j++)
5350 if (j == 0 || !single_defuse_cycle)
5352 for (i = 0; i < vec_num; i++)
5354 /* Create the reduction-phi that defines the reduction
5355 operand. */
5356 new_phi = create_phi_node (vec_dest, loop->header);
5357 set_vinfo_for_stmt (new_phi,
5358 new_stmt_vec_info (new_phi, loop_vinfo,
5359 NULL));
5360 if (j == 0 || slp_node)
5361 phis.quick_push (new_phi);
5365 if (code == COND_EXPR)
5367 gcc_assert (!slp_node);
5368 vectorizable_condition (stmt, gsi, vec_stmt,
5369 PHI_RESULT (phis[0]),
5370 reduc_index, NULL);
5371 /* Multiple types are not supported for condition. */
5372 break;
5375 /* Handle uses. */
5376 if (j == 0)
5378 op0 = ops[!reduc_index];
5379 if (op_type == ternary_op)
5381 if (reduc_index == 0)
5382 op1 = ops[2];
5383 else
5384 op1 = ops[1];
5387 if (slp_node)
5388 vect_get_vec_defs (op0, op1, stmt, &vec_oprnds0, &vec_oprnds1,
5389 slp_node, -1);
5390 else
5392 loop_vec_def0 = vect_get_vec_def_for_operand (ops[!reduc_index],
5393 stmt, NULL);
5394 vec_oprnds0.quick_push (loop_vec_def0);
5395 if (op_type == ternary_op)
5397 loop_vec_def1 = vect_get_vec_def_for_operand (op1, stmt,
5398 NULL);
5399 vec_oprnds1.quick_push (loop_vec_def1);
5403 else
5405 if (!slp_node)
5407 enum vect_def_type dt;
5408 gimple dummy_stmt;
5409 tree dummy;
5411 vect_is_simple_use (ops[!reduc_index], stmt, loop_vinfo, NULL,
5412 &dummy_stmt, &dummy, &dt);
5413 loop_vec_def0 = vect_get_vec_def_for_stmt_copy (dt,
5414 loop_vec_def0);
5415 vec_oprnds0[0] = loop_vec_def0;
5416 if (op_type == ternary_op)
5418 vect_is_simple_use (op1, stmt, loop_vinfo, NULL, &dummy_stmt,
5419 &dummy, &dt);
5420 loop_vec_def1 = vect_get_vec_def_for_stmt_copy (dt,
5421 loop_vec_def1);
5422 vec_oprnds1[0] = loop_vec_def1;
5426 if (single_defuse_cycle)
5427 reduc_def = gimple_assign_lhs (new_stmt);
5429 STMT_VINFO_RELATED_STMT (prev_phi_info) = new_phi;
5432 FOR_EACH_VEC_ELT (vec_oprnds0, i, def0)
5434 if (slp_node)
5435 reduc_def = PHI_RESULT (phis[i]);
5436 else
5438 if (!single_defuse_cycle || j == 0)
5439 reduc_def = PHI_RESULT (new_phi);
5442 def1 = ((op_type == ternary_op)
5443 ? vec_oprnds1[i] : NULL);
5444 if (op_type == binary_op)
5446 if (reduc_index == 0)
5447 expr = build2 (code, vectype_out, reduc_def, def0);
5448 else
5449 expr = build2 (code, vectype_out, def0, reduc_def);
5451 else
5453 if (reduc_index == 0)
5454 expr = build3 (code, vectype_out, reduc_def, def0, def1);
5455 else
5457 if (reduc_index == 1)
5458 expr = build3 (code, vectype_out, def0, reduc_def, def1);
5459 else
5460 expr = build3 (code, vectype_out, def0, def1, reduc_def);
5464 new_stmt = gimple_build_assign (vec_dest, expr);
5465 new_temp = make_ssa_name (vec_dest, new_stmt);
5466 gimple_assign_set_lhs (new_stmt, new_temp);
5467 vect_finish_stmt_generation (stmt, new_stmt, gsi);
5469 if (slp_node)
5471 SLP_TREE_VEC_STMTS (slp_node).quick_push (new_stmt);
5472 vect_defs.quick_push (new_temp);
5474 else
5475 vect_defs[0] = new_temp;
5478 if (slp_node)
5479 continue;
5481 if (j == 0)
5482 STMT_VINFO_VEC_STMT (stmt_info) = *vec_stmt = new_stmt;
5483 else
5484 STMT_VINFO_RELATED_STMT (prev_stmt_info) = new_stmt;
5486 prev_stmt_info = vinfo_for_stmt (new_stmt);
5487 prev_phi_info = vinfo_for_stmt (new_phi);
5490 /* Finalize the reduction-phi (set its arguments) and create the
5491 epilog reduction code. */
5492 if ((!single_defuse_cycle || code == COND_EXPR) && !slp_node)
5494 new_temp = gimple_assign_lhs (*vec_stmt);
5495 vect_defs[0] = new_temp;
5498 vect_create_epilog_for_reduction (vect_defs, stmt, epilog_copies,
5499 epilog_reduc_code, phis, reduc_index,
5500 double_reduc, slp_node);
5502 return true;
5505 /* Function vect_min_worthwhile_factor.
5507 For a loop where we could vectorize the operation indicated by CODE,
5508 return the minimum vectorization factor that makes it worthwhile
5509 to use generic vectors. */
5511 vect_min_worthwhile_factor (enum tree_code code)
5513 switch (code)
5515 case PLUS_EXPR:
5516 case MINUS_EXPR:
5517 case NEGATE_EXPR:
5518 return 4;
5520 case BIT_AND_EXPR:
5521 case BIT_IOR_EXPR:
5522 case BIT_XOR_EXPR:
5523 case BIT_NOT_EXPR:
5524 return 2;
5526 default:
5527 return INT_MAX;
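/* A sketch of the intended use of the factor above (hypothetical caller
   code, for illustration only):

     if (!VECTOR_MODE_P (TYPE_MODE (vectype))
         && LOOP_VINFO_VECT_FACTOR (loop_vinfo)
            < vect_min_worthwhile_factor (code))
       return false;

   i.e. when the operation would have to be emulated with generic vectors,
   vectorization is rejected unless the vectorization factor reaches the
   value returned here.  */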
5532 /* Function vectorizable_induction
5534 Check if PHI performs an induction computation that can be vectorized.
5535 If VEC_STMT is also passed, vectorize the induction PHI: create a vectorized
5536 phi to replace it, put it in VEC_STMT, and add it to the same basic block.
5537 Return FALSE if not a vectorizable STMT, TRUE otherwise. */
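/* An illustrative example (a sketch, assuming a 4-lane vector type): the
   induction variable in

     for (i = 0; i < n; i++)
       a[i] = i;

   is vectorized by replacing the scalar PHI for I with a vector PHI whose
   initial value is { 0, 1, 2, 3 } and which is advanced by { 4, 4, 4, 4 }
   in every vector iteration; get_initial_def_for_induction (used in the
   transform code below) builds these defs.  */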
5539 bool
5540 vectorizable_induction (gimple phi, gimple_stmt_iterator *gsi ATTRIBUTE_UNUSED,
5541 gimple *vec_stmt)
5543 stmt_vec_info stmt_info = vinfo_for_stmt (phi);
5544 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
5545 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
5546 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
5547 int nunits = TYPE_VECTOR_SUBPARTS (vectype);
5548 int ncopies = LOOP_VINFO_VECT_FACTOR (loop_vinfo) / nunits;
5549 tree vec_def;
5551 gcc_assert (ncopies >= 1);
5552 /* FORNOW. These restrictions should be relaxed. */
5553 if (nested_in_vect_loop_p (loop, phi))
5555 imm_use_iterator imm_iter;
5556 use_operand_p use_p;
5557 gimple exit_phi;
5558 edge latch_e;
5559 tree loop_arg;
5561 if (ncopies > 1)
5563 if (dump_enabled_p ())
5564 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5565 "multiple types in nested loop.\n");
5566 return false;
5569 exit_phi = NULL;
5570 latch_e = loop_latch_edge (loop->inner);
5571 loop_arg = PHI_ARG_DEF_FROM_EDGE (phi, latch_e);
5572 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, loop_arg)
5574 gimple use_stmt = USE_STMT (use_p);
5575 if (is_gimple_debug (use_stmt))
5576 continue;
5578 if (!flow_bb_inside_loop_p (loop->inner, gimple_bb (use_stmt)))
5580 exit_phi = use_stmt;
5581 break;
5584 if (exit_phi)
5586 stmt_vec_info exit_phi_vinfo = vinfo_for_stmt (exit_phi);
5587 if (!(STMT_VINFO_RELEVANT_P (exit_phi_vinfo)
5588 && !STMT_VINFO_LIVE_P (exit_phi_vinfo)))
5590 if (dump_enabled_p ())
5591 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5592 "inner-loop induction only used outside "
5593 "of the outer vectorized loop.\n");
5594 return false;
5599 if (!STMT_VINFO_RELEVANT_P (stmt_info))
5600 return false;
5602 /* FORNOW: SLP not supported. */
5603 if (STMT_SLP_TYPE (stmt_info))
5604 return false;
5606 gcc_assert (STMT_VINFO_DEF_TYPE (stmt_info) == vect_induction_def);
5608 if (gimple_code (phi) != GIMPLE_PHI)
5609 return false;
5611 if (!vec_stmt) /* transformation not required. */
5613 STMT_VINFO_TYPE (stmt_info) = induc_vec_info_type;
5614 if (dump_enabled_p ())
5615 dump_printf_loc (MSG_NOTE, vect_location,
5616 "=== vectorizable_induction ===\n");
5617 vect_model_induction_cost (stmt_info, ncopies);
5618 return true;
5621 /** Transform. **/
5623 if (dump_enabled_p ())
5624 dump_printf_loc (MSG_NOTE, vect_location, "transform induction phi.\n");
5626 vec_def = get_initial_def_for_induction (phi);
5627 *vec_stmt = SSA_NAME_DEF_STMT (vec_def);
5628 return true;
5631 /* Function vectorizable_live_operation.
5633 STMT computes a value that is used outside the loop. Check if
5634 it can be supported. */
5636 bool
5637 vectorizable_live_operation (gimple stmt,
5638 gimple_stmt_iterator *gsi ATTRIBUTE_UNUSED,
5639 gimple *vec_stmt)
5641 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
5642 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
5643 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
5644 int i;
5645 int op_type;
5646 tree op;
5647 tree def;
5648 gimple def_stmt;
5649 enum vect_def_type dt;
5650 enum tree_code code;
5651 enum gimple_rhs_class rhs_class;
5653 gcc_assert (STMT_VINFO_LIVE_P (stmt_info));
5655 if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_reduction_def)
5656 return false;
5658 if (!is_gimple_assign (stmt))
5660 if (gimple_call_internal_p (stmt)
5661 && gimple_call_internal_fn (stmt) == IFN_GOMP_SIMD_LANE
5662 && gimple_call_lhs (stmt)
5663 && loop->simduid
5664 && TREE_CODE (gimple_call_arg (stmt, 0)) == SSA_NAME
5665 && loop->simduid
5666 == SSA_NAME_VAR (gimple_call_arg (stmt, 0)))
5668 edge e = single_exit (loop);
5669 basic_block merge_bb = e->dest;
5670 imm_use_iterator imm_iter;
5671 use_operand_p use_p;
5672 tree lhs = gimple_call_lhs (stmt);
5674 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, lhs)
5676 gimple use_stmt = USE_STMT (use_p);
5677 if (gimple_code (use_stmt) == GIMPLE_PHI
5678 && gimple_bb (use_stmt) == merge_bb)
5680 if (vec_stmt)
5682 tree vfm1
5683 = build_int_cst (unsigned_type_node,
5684 loop_vinfo->vectorization_factor - 1);
5685 SET_PHI_ARG_DEF (use_stmt, e->dest_idx, vfm1);
5687 return true;
5692 return false;
5695 if (TREE_CODE (gimple_assign_lhs (stmt)) != SSA_NAME)
5696 return false;
5698 /* FORNOW. CHECKME. */
5699 if (nested_in_vect_loop_p (loop, stmt))
5700 return false;
5702 code = gimple_assign_rhs_code (stmt);
5703 op_type = TREE_CODE_LENGTH (code);
5704 rhs_class = get_gimple_rhs_class (code);
5705 gcc_assert (rhs_class != GIMPLE_UNARY_RHS || op_type == unary_op);
5706 gcc_assert (rhs_class != GIMPLE_BINARY_RHS || op_type == binary_op);
5708 /* FORNOW: support only if all uses are invariant. This means
5709 that the scalar operations can remain in place, unvectorized.
5710 The original last scalar value that they compute will be used. */
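   /* For example (an illustrative sketch), in

        for (i = 0; i < n; i++)
          last = x + y;
        ... = last;

      where X and Y are defined before the loop, the statement can stay
      scalar: the value it computes in the final iteration is exactly what
      the code after the loop consumes.  */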
5712 for (i = 0; i < op_type; i++)
5714 if (rhs_class == GIMPLE_SINGLE_RHS)
5715 op = TREE_OPERAND (gimple_op (stmt, 1), i);
5716 else
5717 op = gimple_op (stmt, i + 1);
5718 if (op
5719 && !vect_is_simple_use (op, stmt, loop_vinfo, NULL, &def_stmt, &def,
5720 &dt))
5722 if (dump_enabled_p ())
5723 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5724 "use not simple.\n");
5725 return false;
5728 if (dt != vect_external_def && dt != vect_constant_def)
5729 return false;
5732 /* No transformation is required for the cases we currently support. */
5733 return true;
5736 /* Kill any debug uses outside LOOP of SSA names defined in STMT. */
5738 static void
5739 vect_loop_kill_debug_uses (struct loop *loop, gimple stmt)
5741 ssa_op_iter op_iter;
5742 imm_use_iterator imm_iter;
5743 def_operand_p def_p;
5744 gimple ustmt;
5746 FOR_EACH_PHI_OR_STMT_DEF (def_p, stmt, op_iter, SSA_OP_DEF)
5748 FOR_EACH_IMM_USE_STMT (ustmt, imm_iter, DEF_FROM_PTR (def_p))
5750 basic_block bb;
5752 if (!is_gimple_debug (ustmt))
5753 continue;
5755 bb = gimple_bb (ustmt);
5757 if (!flow_bb_inside_loop_p (loop, bb))
5759 if (gimple_debug_bind_p (ustmt))
5761 if (dump_enabled_p ())
5762 dump_printf_loc (MSG_NOTE, vect_location,
5763 "killing debug use\n");
5765 gimple_debug_bind_reset_value (ustmt);
5766 update_stmt (ustmt);
5768 else
5769 gcc_unreachable ();
5776 /* This function builds ni_name = number of iterations. Statements
5777 are emitted on the loop preheader edge. */
5779 static tree
5780 vect_build_loop_niters (loop_vec_info loop_vinfo)
5782 tree ni = unshare_expr (LOOP_VINFO_NITERS (loop_vinfo));
5783 if (TREE_CODE (ni) == INTEGER_CST)
5784 return ni;
5785 else
5787 tree ni_name, var;
5788 gimple_seq stmts = NULL;
5789 edge pe = loop_preheader_edge (LOOP_VINFO_LOOP (loop_vinfo));
5791 var = create_tmp_var (TREE_TYPE (ni), "niters");
5792 ni_name = force_gimple_operand (ni, &stmts, false, var);
5793 if (stmts)
5794 gsi_insert_seq_on_edge_immediate (pe, stmts);
5796 return ni_name;
5801 /* This function generates the following statements:
5803 ni_name = number of iterations loop executes
5804 ratio = ni_name / vf
5805 ratio_mult_vf_name = ratio * vf
5807 and places them on the loop preheader edge. */
5809 static void
5810 vect_generate_tmps_on_preheader (loop_vec_info loop_vinfo,
5811 tree ni_name,
5812 tree *ratio_mult_vf_name_ptr,
5813 tree *ratio_name_ptr)
5815 tree ni_minus_gap_name;
5816 tree var;
5817 tree ratio_name;
5818 tree ratio_mult_vf_name;
5819 int vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
5820 edge pe = loop_preheader_edge (LOOP_VINFO_LOOP (loop_vinfo));
5821 tree log_vf;
5823 log_vf = build_int_cst (TREE_TYPE (ni_name), exact_log2 (vf));
5825 /* If an epilogue loop is required because of data accesses with gaps, we
5826 subtract one iteration from the total number of iterations here for
5827 correct calculation of RATIO. */
5828 if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo))
5830 ni_minus_gap_name = fold_build2 (MINUS_EXPR, TREE_TYPE (ni_name),
5831 ni_name,
5832 build_one_cst (TREE_TYPE (ni_name)));
5833 if (!is_gimple_val (ni_minus_gap_name))
5835 var = create_tmp_var (TREE_TYPE (ni_name), "ni_gap");
5836 gimple stmts = NULL;
5837 ni_minus_gap_name = force_gimple_operand (ni_minus_gap_name, &stmts,
5838 true, var);
5839 gsi_insert_seq_on_edge_immediate (pe, stmts);
5842 else
5843 ni_minus_gap_name = ni_name;
5845 /* Create: ratio = ni >> log2(vf) */
5846 /* ??? As we have ni == number of latch executions + 1, ni could
5847 have overflowed to zero. So avoid computing the ratio based on ni
5848 but compute it using the fact that we know ratio will be at least
5849 one, thus via (ni - vf) >> log2(vf) + 1. */
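  /* Worked example (illustrative numbers): with vf == 4 and ni == 10,
     log_vf == 2 and

       ratio         = ((10 - 4) >> 2) + 1 = 2
       ratio_mult_vf = 2 << 2 = 8

     so the vectorized loop executes 2 iterations covering 8 scalar
     iterations and the epilogue loop handles the remaining 2.  */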
5850 ratio_name
5851 = fold_build2 (PLUS_EXPR, TREE_TYPE (ni_name),
5852 fold_build2 (RSHIFT_EXPR, TREE_TYPE (ni_name),
5853 fold_build2 (MINUS_EXPR, TREE_TYPE (ni_name),
5854 ni_minus_gap_name,
5855 build_int_cst
5856 (TREE_TYPE (ni_name), vf)),
5857 log_vf),
5858 build_int_cst (TREE_TYPE (ni_name), 1));
5859 if (!is_gimple_val (ratio_name))
5861 var = create_tmp_var (TREE_TYPE (ni_name), "bnd");
5862 gimple stmts = NULL;
5863 ratio_name = force_gimple_operand (ratio_name, &stmts, true, var);
5864 gsi_insert_seq_on_edge_immediate (pe, stmts);
5866 *ratio_name_ptr = ratio_name;
5868 /* Create: ratio_mult_vf = ratio << log2 (vf). */
5870 if (ratio_mult_vf_name_ptr)
5872 ratio_mult_vf_name = fold_build2 (LSHIFT_EXPR, TREE_TYPE (ratio_name),
5873 ratio_name, log_vf);
5874 if (!is_gimple_val (ratio_mult_vf_name))
5876 var = create_tmp_var (TREE_TYPE (ni_name), "ratio_mult_vf");
5877 gimple stmts = NULL;
5878 ratio_mult_vf_name = force_gimple_operand (ratio_mult_vf_name, &stmts,
5879 true, var);
5880 gsi_insert_seq_on_edge_immediate (pe, stmts);
5882 *ratio_mult_vf_name_ptr = ratio_mult_vf_name;
5885 return;
5889 /* Function vect_transform_loop.
5891 The analysis phase has determined that the loop is vectorizable.
5892 Vectorize the loop - create vectorized stmts to replace the scalar
5893 stmts in the loop, and update the loop exit condition. */
5895 void
5896 vect_transform_loop (loop_vec_info loop_vinfo)
5898 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
5899 basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
5900 int nbbs = loop->num_nodes;
5901 int i;
5902 tree ratio = NULL;
5903 int vectorization_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
5904 bool grouped_store;
5905 bool slp_scheduled = false;
5906 gimple stmt, pattern_stmt;
5907 gimple_seq pattern_def_seq = NULL;
5908 gimple_stmt_iterator pattern_def_si = gsi_none ();
5909 bool transform_pattern_stmt = false;
5910 bool check_profitability = false;
5911 int th;
5912 /* Record the number of iterations before we start tampering with the profile. */
5913 gcov_type expected_iterations = expected_loop_iterations_unbounded (loop);
5915 if (dump_enabled_p ())
5916 dump_printf_loc (MSG_NOTE, vect_location, "=== vec_transform_loop ===\n");
5918 /* If the profile is imprecise, we have a chance to fix it up. */
5919 if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
5920 expected_iterations = LOOP_VINFO_INT_NITERS (loop_vinfo);
5922 /* Use the more conservative vectorization threshold. If the number
5923 of iterations is constant, assume the cost check has been performed
5924 by our caller. If the threshold makes all loops profitable that
5925 run at least the vectorization factor number of times, checking
5926 is pointless, too. */
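  /* Illustrative numbers (a sketch): with a vectorization factor of 8,
     th == 20 means a loop with an unknown iteration count may enter the
     vectorized code yet run fewer than 20 iterations, so a runtime check
     is emitted; with th == 6 (< VF - 1) every loop that runs at least VF
     iterations already meets the threshold and the check is skipped.  */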
5927 th = LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo);
5928 if (th >= LOOP_VINFO_VECT_FACTOR (loop_vinfo) - 1
5929 && !LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
5931 if (dump_enabled_p ())
5932 dump_printf_loc (MSG_NOTE, vect_location,
5933 "Profitability threshold is %d loop iterations.\n",
5934 th);
5935 check_profitability = true;
5938 /* Version the loop first, if required, so the profitability check
5939 comes first. */
5941 if (LOOP_REQUIRES_VERSIONING_FOR_ALIGNMENT (loop_vinfo)
5942 || LOOP_REQUIRES_VERSIONING_FOR_ALIAS (loop_vinfo))
5944 vect_loop_versioning (loop_vinfo, th, check_profitability);
5945 check_profitability = false;
5948 tree ni_name = vect_build_loop_niters (loop_vinfo);
5949 LOOP_VINFO_NITERS_UNCHANGED (loop_vinfo) = ni_name;
5951 /* Peel the loop if there are data refs with unknown alignment.
5952 Only one data ref with unknown alignment is allowed. */
5954 if (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo))
5956 vect_do_peeling_for_alignment (loop_vinfo, ni_name,
5957 th, check_profitability);
5958 check_profitability = false;
5959 /* The above adjusts LOOP_VINFO_NITERS, so cause ni_name to
5960 be re-computed. */
5961 ni_name = NULL_TREE;
5964 /* If the loop has a symbolic number of iterations 'n' (i.e. it's not a
5965 compile-time constant), or it is a constant that is not divisible by the
5966 vectorization factor, then an epilog loop needs to be created.
5967 We therefore duplicate the loop: the original loop will be vectorized,
5968 and will compute the first (n/VF) iterations. The second copy of the loop
5969 will remain scalar and will compute the remaining (n%VF) iterations.
5970 (VF is the vectorization factor). */
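  /* For instance (illustrative numbers), with n == 103 and VF == 8 the
     vectorized copy executes 103 / 8 == 12 iterations, covering 96 scalar
     iterations, and the scalar epilogue loop executes the remaining
     103 % 8 == 7 iterations.  */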
5972 if (LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo)
5973 || LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo))
5975 tree ratio_mult_vf;
5976 if (!ni_name)
5977 ni_name = vect_build_loop_niters (loop_vinfo);
5978 vect_generate_tmps_on_preheader (loop_vinfo, ni_name, &ratio_mult_vf,
5979 &ratio);
5980 vect_do_peeling_for_loop_bound (loop_vinfo, ni_name, ratio_mult_vf,
5981 th, check_profitability);
5983 else if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
5984 ratio = build_int_cst (TREE_TYPE (LOOP_VINFO_NITERS (loop_vinfo)),
5985 LOOP_VINFO_INT_NITERS (loop_vinfo) / vectorization_factor);
5986 else
5988 if (!ni_name)
5989 ni_name = vect_build_loop_niters (loop_vinfo);
5990 vect_generate_tmps_on_preheader (loop_vinfo, ni_name, NULL, &ratio);
5993 /* 1) Make sure the loop header has exactly two entries
5994 2) Make sure we have a preheader basic block. */
5996 gcc_assert (EDGE_COUNT (loop->header->preds) == 2);
5998 split_edge (loop_preheader_edge (loop));
6000 /* FORNOW: the vectorizer supports only loops whose body consists
6001 of one basic block (header + empty latch). When the vectorizer
6002 supports more involved loop forms, the order in which the BBs are
6003 traversed will need to be reconsidered. */
6005 for (i = 0; i < nbbs; i++)
6007 basic_block bb = bbs[i];
6008 stmt_vec_info stmt_info;
6010 for (gphi_iterator si = gsi_start_phis (bb); !gsi_end_p (si);
6011 gsi_next (&si))
6013 gphi *phi = si.phi ();
6014 if (dump_enabled_p ())
6016 dump_printf_loc (MSG_NOTE, vect_location,
6017 "------>vectorizing phi: ");
6018 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, phi, 0);
6019 dump_printf (MSG_NOTE, "\n");
6021 stmt_info = vinfo_for_stmt (phi);
6022 if (!stmt_info)
6023 continue;
6025 if (MAY_HAVE_DEBUG_STMTS && !STMT_VINFO_LIVE_P (stmt_info))
6026 vect_loop_kill_debug_uses (loop, phi);
6028 if (!STMT_VINFO_RELEVANT_P (stmt_info)
6029 && !STMT_VINFO_LIVE_P (stmt_info))
6030 continue;
6032 if (STMT_VINFO_VECTYPE (stmt_info)
6033 && (TYPE_VECTOR_SUBPARTS (STMT_VINFO_VECTYPE (stmt_info))
6034 != (unsigned HOST_WIDE_INT) vectorization_factor)
6035 && dump_enabled_p ())
6036 dump_printf_loc (MSG_NOTE, vect_location, "multiple-types.\n");
6038 if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_induction_def)
6040 if (dump_enabled_p ())
6041 dump_printf_loc (MSG_NOTE, vect_location, "transform phi.\n");
6042 vect_transform_stmt (phi, NULL, NULL, NULL, NULL);
6046 pattern_stmt = NULL;
6047 for (gimple_stmt_iterator si = gsi_start_bb (bb);
6048 !gsi_end_p (si) || transform_pattern_stmt;)
6050 bool is_store;
6052 if (transform_pattern_stmt)
6053 stmt = pattern_stmt;
6054 else
6056 stmt = gsi_stmt (si);
6057 /* During vectorization remove existing clobber stmts. */
6058 if (gimple_clobber_p (stmt))
6060 unlink_stmt_vdef (stmt);
6061 gsi_remove (&si, true);
6062 release_defs (stmt);
6063 continue;
6067 if (dump_enabled_p ())
6069 dump_printf_loc (MSG_NOTE, vect_location,
6070 "------>vectorizing statement: ");
6071 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, stmt, 0);
6072 dump_printf (MSG_NOTE, "\n");
6075 stmt_info = vinfo_for_stmt (stmt);
6077 /* vector stmts created in the outer-loop during vectorization of
6078 stmts in an inner-loop may not have a stmt_info, and do not
6079 need to be vectorized. */
6080 if (!stmt_info)
6082 gsi_next (&si);
6083 continue;
6086 if (MAY_HAVE_DEBUG_STMTS && !STMT_VINFO_LIVE_P (stmt_info))
6087 vect_loop_kill_debug_uses (loop, stmt);
6089 if (!STMT_VINFO_RELEVANT_P (stmt_info)
6090 && !STMT_VINFO_LIVE_P (stmt_info))
6092 if (STMT_VINFO_IN_PATTERN_P (stmt_info)
6093 && (pattern_stmt = STMT_VINFO_RELATED_STMT (stmt_info))
6094 && (STMT_VINFO_RELEVANT_P (vinfo_for_stmt (pattern_stmt))
6095 || STMT_VINFO_LIVE_P (vinfo_for_stmt (pattern_stmt))))
6097 stmt = pattern_stmt;
6098 stmt_info = vinfo_for_stmt (stmt);
6100 else
6102 gsi_next (&si);
6103 continue;
6106 else if (STMT_VINFO_IN_PATTERN_P (stmt_info)
6107 && (pattern_stmt = STMT_VINFO_RELATED_STMT (stmt_info))
6108 && (STMT_VINFO_RELEVANT_P (vinfo_for_stmt (pattern_stmt))
6109 || STMT_VINFO_LIVE_P (vinfo_for_stmt (pattern_stmt))))
6110 transform_pattern_stmt = true;
6112 /* If pattern statement has def stmts, vectorize them too. */
6113 if (is_pattern_stmt_p (stmt_info))
6115 if (pattern_def_seq == NULL)
6117 pattern_def_seq = STMT_VINFO_PATTERN_DEF_SEQ (stmt_info);
6118 pattern_def_si = gsi_start (pattern_def_seq);
6120 else if (!gsi_end_p (pattern_def_si))
6121 gsi_next (&pattern_def_si);
6122 if (pattern_def_seq != NULL)
6124 gimple pattern_def_stmt = NULL;
6125 stmt_vec_info pattern_def_stmt_info = NULL;
6127 while (!gsi_end_p (pattern_def_si))
6129 pattern_def_stmt = gsi_stmt (pattern_def_si);
6130 pattern_def_stmt_info
6131 = vinfo_for_stmt (pattern_def_stmt);
6132 if (STMT_VINFO_RELEVANT_P (pattern_def_stmt_info)
6133 || STMT_VINFO_LIVE_P (pattern_def_stmt_info))
6134 break;
6135 gsi_next (&pattern_def_si);
6138 if (!gsi_end_p (pattern_def_si))
6140 if (dump_enabled_p ())
6142 dump_printf_loc (MSG_NOTE, vect_location,
6143 "==> vectorizing pattern def "
6144 "stmt: ");
6145 dump_gimple_stmt (MSG_NOTE, TDF_SLIM,
6146 pattern_def_stmt, 0);
6147 dump_printf (MSG_NOTE, "\n");
6150 stmt = pattern_def_stmt;
6151 stmt_info = pattern_def_stmt_info;
6153 else
6155 pattern_def_si = gsi_none ();
6156 transform_pattern_stmt = false;
6159 else
6160 transform_pattern_stmt = false;
6163 if (STMT_VINFO_VECTYPE (stmt_info))
6165 unsigned int nunits
6166 = (unsigned int)
6167 TYPE_VECTOR_SUBPARTS (STMT_VINFO_VECTYPE (stmt_info));
6168 if (!STMT_SLP_TYPE (stmt_info)
6169 && nunits != (unsigned int) vectorization_factor
6170 && dump_enabled_p ())
6171 /* For SLP, VF is set according to the unrolling factor, and not
6172 to the vector size, hence for SLP this print is not valid. */
6173 dump_printf_loc (MSG_NOTE, vect_location, "multiple-types.\n");
6176 /* SLP. Schedule all the SLP instances when the first SLP stmt is
6177 reached. */
6178 if (STMT_SLP_TYPE (stmt_info))
6180 if (!slp_scheduled)
6182 slp_scheduled = true;
6184 if (dump_enabled_p ())
6185 dump_printf_loc (MSG_NOTE, vect_location,
6186 "=== scheduling SLP instances ===\n");
6188 vect_schedule_slp (loop_vinfo, NULL);
6191 /* Hybrid SLP stmts must be vectorized in addition to SLP. */
6192 if (!vinfo_for_stmt (stmt) || PURE_SLP_STMT (stmt_info))
6194 if (!transform_pattern_stmt && gsi_end_p (pattern_def_si))
6196 pattern_def_seq = NULL;
6197 gsi_next (&si);
6199 continue;
6203 /* -------- vectorize statement ------------ */
6204 if (dump_enabled_p ())
6205 dump_printf_loc (MSG_NOTE, vect_location, "transform statement.\n");
6207 grouped_store = false;
6208 is_store = vect_transform_stmt (stmt, &si, &grouped_store, NULL, NULL);
6209 if (is_store)
6211 if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
6213 /* Interleaving. If IS_STORE is TRUE, the vectorization of the
6214 interleaving chain was completed - free all the stores in
6215 the chain. */
6216 gsi_next (&si);
6217 vect_remove_stores (GROUP_FIRST_ELEMENT (stmt_info));
6219 else
6221 /* Free the attached stmt_vec_info and remove the stmt. */
6222 gimple store = gsi_stmt (si);
6223 free_stmt_vec_info (store);
6224 unlink_stmt_vdef (store);
6225 gsi_remove (&si, true);
6226 release_defs (store);
6229 /* Stores can only appear at the end of pattern statements. */
6230 gcc_assert (!transform_pattern_stmt);
6231 pattern_def_seq = NULL;
6233 else if (!transform_pattern_stmt && gsi_end_p (pattern_def_si))
6235 pattern_def_seq = NULL;
6236 gsi_next (&si);
6238 } /* stmts in BB */
6239 } /* BBs in loop */
6241 slpeel_make_loop_iterate_ntimes (loop, ratio);
6243 /* Reduce loop iterations by the vectorization factor. */
6244 scale_loop_profile (loop, GCOV_COMPUTE_SCALE (1, vectorization_factor),
6245 expected_iterations / vectorization_factor);
6246 loop->nb_iterations_upper_bound
6247 = wi::udiv_floor (loop->nb_iterations_upper_bound, vectorization_factor);
6248 if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo)
6249 && loop->nb_iterations_upper_bound != 0)
6250 loop->nb_iterations_upper_bound = loop->nb_iterations_upper_bound - 1;
6251 if (loop->any_estimate)
6253 loop->nb_iterations_estimate
6254 = wi::udiv_floor (loop->nb_iterations_estimate, vectorization_factor);
6255 if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo)
6256 && loop->nb_iterations_estimate != 0)
6257 loop->nb_iterations_estimate = loop->nb_iterations_estimate - 1;
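  /* E.g. (illustrative numbers), a loop with an estimated 100 iterations
     vectorized with a factor of 4 has its profile scaled by 1/4 and its
     iteration estimate and upper bound reduced to 25, minus one more when
     an epilogue is required because of gaps.  */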
6260 if (dump_enabled_p ())
6262 dump_printf_loc (MSG_NOTE, vect_location,
6263 "LOOP VECTORIZED\n");
6264 if (loop->inner)
6265 dump_printf_loc (MSG_NOTE, vect_location,
6266 "OUTER LOOP VECTORIZED\n");
6267 dump_printf (MSG_NOTE, "\n");