gcc/tree-vect-loop.c
1 /* Loop Vectorization
2 Copyright (C) 2003-2015 Free Software Foundation, Inc.
3 Contributed by Dorit Naishlos <dorit@il.ibm.com> and
4 Ira Rosen <irar@il.ibm.com>
6 This file is part of GCC.
8 GCC is free software; you can redistribute it and/or modify it under
9 the terms of the GNU General Public License as published by the Free
10 Software Foundation; either version 3, or (at your option) any later
11 version.
13 GCC is distributed in the hope that it will be useful, but WITHOUT ANY
14 WARRANTY; without even the implied warranty of MERCHANTABILITY or
15 FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
16 for more details.
18 You should have received a copy of the GNU General Public License
19 along with GCC; see the file COPYING3. If not see
20 <http://www.gnu.org/licenses/>. */
22 #include "config.h"
23 #include "system.h"
24 #include "coretypes.h"
25 #include "dumpfile.h"
26 #include "tm.h"
27 #include "hash-set.h"
28 #include "machmode.h"
29 #include "vec.h"
30 #include "double-int.h"
31 #include "input.h"
32 #include "alias.h"
33 #include "symtab.h"
34 #include "wide-int.h"
35 #include "inchash.h"
36 #include "tree.h"
37 #include "fold-const.h"
38 #include "stor-layout.h"
39 #include "predict.h"
40 #include "hard-reg-set.h"
41 #include "function.h"
42 #include "dominance.h"
43 #include "cfg.h"
44 #include "cfganal.h"
45 #include "basic-block.h"
46 #include "gimple-pretty-print.h"
47 #include "tree-ssa-alias.h"
48 #include "internal-fn.h"
49 #include "gimple-expr.h"
50 #include "is-a.h"
51 #include "gimple.h"
52 #include "gimplify.h"
53 #include "gimple-iterator.h"
54 #include "gimplify-me.h"
55 #include "gimple-ssa.h"
56 #include "tree-phinodes.h"
57 #include "ssa-iterators.h"
58 #include "stringpool.h"
59 #include "tree-ssanames.h"
60 #include "tree-ssa-loop-ivopts.h"
61 #include "tree-ssa-loop-manip.h"
62 #include "tree-ssa-loop-niter.h"
63 #include "tree-pass.h"
64 #include "cfgloop.h"
65 #include "hashtab.h"
66 #include "rtl.h"
67 #include "flags.h"
68 #include "statistics.h"
69 #include "real.h"
70 #include "fixed-value.h"
71 #include "insn-config.h"
72 #include "expmed.h"
73 #include "dojump.h"
74 #include "explow.h"
75 #include "calls.h"
76 #include "emit-rtl.h"
77 #include "varasm.h"
78 #include "stmt.h"
79 #include "expr.h"
80 #include "recog.h"
81 #include "insn-codes.h"
82 #include "optabs.h"
83 #include "params.h"
84 #include "diagnostic-core.h"
85 #include "tree-chrec.h"
86 #include "tree-scalar-evolution.h"
87 #include "tree-vectorizer.h"
88 #include "target.h"
90 /* Loop Vectorization Pass.
92 This pass tries to vectorize loops.
94 For example, the vectorizer transforms the following simple loop:
96 short a[N]; short b[N]; short c[N]; int i;
98 for (i=0; i<N; i++){
99 a[i] = b[i] + c[i];
102 as if it had been manually vectorized by rewriting the source code into:
104 typedef int __attribute__((mode(V8HI))) v8hi;
105 short a[N]; short b[N]; short c[N]; int i;
106 v8hi *pa = (v8hi*)a, *pb = (v8hi*)b, *pc = (v8hi*)c;
107 v8hi va, vb, vc;
109 for (i=0; i<N/8; i++){
110 vb = pb[i];
111 vc = pc[i];
112 va = vb + vc;
113 pa[i] = va;
116 The main entry to this pass is vectorize_loops(), in which
117 the vectorizer applies a set of analyses on a given set of loops,
118 followed by the actual vectorization transformation for the loops that
119 had successfully passed the analysis phase.
120 Throughout this pass we make a distinction between two types of
121 data: scalars (which are represented by SSA_NAMES), and memory references
122 ("data-refs"). These two types of data require different handling both
123 during analysis and transformation. The types of data-refs that the
124 vectorizer currently supports are ARRAY_REFS whose base is an array DECL
125 (not a pointer), and INDIRECT_REFS through pointers; both array and pointer
126 accesses are required to have a simple (consecutive) access pattern.
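     As an illustrative sketch of that restriction (not an exhaustive
     characterization), in

       for (i=0; i<N; i++)
         a[i] = b[i];        <-- consecutive accesses: the supported pattern

       for (i=0; i<N; i++)
         a[i] = b[2*i];      <-- b[2*i] is strided, not a simple consecutive
                                 access in the sense described above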
128 Analysis phase:
129 ===============
130 The driver for the analysis phase is vect_analyze_loop().
131 It applies a set of analyses, some of which rely on the scalar evolution
132 analyzer (scev) developed by Sebastian Pop.
134 During the analysis phase the vectorizer records some information
135 per stmt in a "stmt_vec_info" struct which is attached to each stmt in the
136 loop, as well as general information about the loop as a whole, which is
137 recorded in a "loop_vec_info" struct attached to each loop.
139 Transformation phase:
140 =====================
141 The loop transformation phase scans all the stmts in the loop, and
142 creates a vector stmt (or a sequence of stmts) for each scalar stmt S in
143 the loop that needs to be vectorized. It inserts the vector code sequence
144 just before the scalar stmt S, and records a pointer to the vector code
145 in STMT_VINFO_VEC_STMT (stmt_info) (stmt_info is the stmt_vec_info struct
146 attached to S). This pointer will be used for the vectorization of following
147 stmts which use the def of stmt S. Stmt S is removed if it writes to memory;
148 otherwise, we rely on dead code elimination for removing it.
150 For example, say stmt S1 was vectorized into stmt VS1:
152 VS1: vb = px[i];
153 S1: b = x[i]; STMT_VINFO_VEC_STMT (stmt_info (S1)) = VS1
154 S2: a = b;
156 To vectorize stmt S2, the vectorizer first finds the stmt that defines
157 the operand 'b' (S1), and gets the relevant vector def 'vb' from the
158 vector stmt VS1 pointed to by STMT_VINFO_VEC_STMT (stmt_info (S1)). The
159 resulting sequence would be:
161 VS1: vb = px[i];
162 S1: b = x[i]; STMT_VINFO_VEC_STMT (stmt_info (S1)) = VS1
163 VS2: va = vb;
164 S2: a = b; STMT_VINFO_VEC_STMT (stmt_info (S2)) = VS2
166 Operands that are not SSA_NAMEs are data-refs that appear in
167 load/store operations (like 'x[i]' in S1), and are handled differently.
169 Target modeling:
170 =================
171 Currently the only target specific information that is used is the
172 size of the vector (in bytes) - "TARGET_VECTORIZE_UNITS_PER_SIMD_WORD".
173 Targets that can support different sizes of vectors will, for now, need
174 to specify one value for "TARGET_VECTORIZE_UNITS_PER_SIMD_WORD". More
175 flexibility will be added in the future.
177 Since we only vectorize operations whose vector form can be
178 expressed using existing tree codes, to verify that an operation is
179 supported, the vectorizer checks the relevant optab at the relevant
180 machine_mode (e.g., optab_handler (add_optab, V8HImode)). If
181 the value found is CODE_FOR_nothing, then there's no target support, and
182 we can't vectorize the stmt.
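     As a small sketch of that check (using the names mentioned above):

       optab_handler (add_optab, V8HImode) != CODE_FOR_nothing
         => the target has an instruction for V8HImode addition, so an
            addition on eight shorts can be vectorized;
       optab_handler (add_optab, V8HImode) == CODE_FOR_nothing
         => no target support, and the stmt is not vectorized.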
184 For additional information on this project see:
185 http://gcc.gnu.org/projects/tree-ssa/vectorization.html
188 static void vect_estimate_min_profitable_iters (loop_vec_info, int *, int *);
190 /* Function vect_determine_vectorization_factor
192 Determine the vectorization factor (VF). VF is the number of data elements
193 that are operated upon in parallel in a single iteration of the vectorized
194 loop. For example, when vectorizing a loop that operates on 4-byte elements,
195 on a target with a vector size (VS) of 16 bytes, the VF is set to 4, since 4
196 elements can fit in a single vector register.
198 We currently support vectorization of loops in which all types operated upon
199 are of the same size. Therefore this function currently sets VF according to
200 the size of the types operated upon, and fails if there are multiple sizes
201 in the loop.
203 VF is also the factor by which the loop iterations are strip-mined, e.g.:
204 original loop:
205 for (i=0; i<N; i++){
206 a[i] = b[i] + c[i];
209 vectorized loop:
210 for (i=0; i<N; i+=VF){
211 a[i:VF] = b[i:VF] + c[i:VF];
215 static bool
216 vect_determine_vectorization_factor (loop_vec_info loop_vinfo)
218 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
219 basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
220 int nbbs = loop->num_nodes;
221 unsigned int vectorization_factor = 0;
222 tree scalar_type;
223 gphi *phi;
224 tree vectype;
225 unsigned int nunits;
226 stmt_vec_info stmt_info;
227 int i;
228 HOST_WIDE_INT dummy;
229 gimple stmt, pattern_stmt = NULL;
230 gimple_seq pattern_def_seq = NULL;
231 gimple_stmt_iterator pattern_def_si = gsi_none ();
232 bool analyze_pattern_stmt = false;
234 if (dump_enabled_p ())
235 dump_printf_loc (MSG_NOTE, vect_location,
236 "=== vect_determine_vectorization_factor ===\n");
238 for (i = 0; i < nbbs; i++)
240 basic_block bb = bbs[i];
242 for (gphi_iterator si = gsi_start_phis (bb); !gsi_end_p (si);
243 gsi_next (&si))
245 phi = si.phi ();
246 stmt_info = vinfo_for_stmt (phi);
247 if (dump_enabled_p ())
249 dump_printf_loc (MSG_NOTE, vect_location, "==> examining phi: ");
250 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, phi, 0);
251 dump_printf (MSG_NOTE, "\n");
254 gcc_assert (stmt_info);
256 if (STMT_VINFO_RELEVANT_P (stmt_info))
258 gcc_assert (!STMT_VINFO_VECTYPE (stmt_info));
259 scalar_type = TREE_TYPE (PHI_RESULT (phi));
261 if (dump_enabled_p ())
263 dump_printf_loc (MSG_NOTE, vect_location,
264 "get vectype for scalar type: ");
265 dump_generic_expr (MSG_NOTE, TDF_SLIM, scalar_type);
266 dump_printf (MSG_NOTE, "\n");
269 vectype = get_vectype_for_scalar_type (scalar_type);
270 if (!vectype)
272 if (dump_enabled_p ())
274 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
275 "not vectorized: unsupported "
276 "data-type ");
277 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
278 scalar_type);
279 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
281 return false;
283 STMT_VINFO_VECTYPE (stmt_info) = vectype;
285 if (dump_enabled_p ())
287 dump_printf_loc (MSG_NOTE, vect_location, "vectype: ");
288 dump_generic_expr (MSG_NOTE, TDF_SLIM, vectype);
289 dump_printf (MSG_NOTE, "\n");
292 nunits = TYPE_VECTOR_SUBPARTS (vectype);
293 if (dump_enabled_p ())
294 dump_printf_loc (MSG_NOTE, vect_location, "nunits = %d\n",
295 nunits);
297 if (!vectorization_factor
298 || (nunits > vectorization_factor))
299 vectorization_factor = nunits;
303 for (gimple_stmt_iterator si = gsi_start_bb (bb);
304 !gsi_end_p (si) || analyze_pattern_stmt;)
306 tree vf_vectype;
308 if (analyze_pattern_stmt)
309 stmt = pattern_stmt;
310 else
311 stmt = gsi_stmt (si);
313 stmt_info = vinfo_for_stmt (stmt);
315 if (dump_enabled_p ())
317 dump_printf_loc (MSG_NOTE, vect_location,
318 "==> examining statement: ");
319 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, stmt, 0);
320 dump_printf (MSG_NOTE, "\n");
323 gcc_assert (stmt_info);
325 /* Skip stmts which do not need to be vectorized. */
326 if ((!STMT_VINFO_RELEVANT_P (stmt_info)
327 && !STMT_VINFO_LIVE_P (stmt_info))
328 || gimple_clobber_p (stmt))
330 if (STMT_VINFO_IN_PATTERN_P (stmt_info)
331 && (pattern_stmt = STMT_VINFO_RELATED_STMT (stmt_info))
332 && (STMT_VINFO_RELEVANT_P (vinfo_for_stmt (pattern_stmt))
333 || STMT_VINFO_LIVE_P (vinfo_for_stmt (pattern_stmt))))
335 stmt = pattern_stmt;
336 stmt_info = vinfo_for_stmt (pattern_stmt);
337 if (dump_enabled_p ())
339 dump_printf_loc (MSG_NOTE, vect_location,
340 "==> examining pattern statement: ");
341 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, stmt, 0);
342 dump_printf (MSG_NOTE, "\n");
345 else
347 if (dump_enabled_p ())
348 dump_printf_loc (MSG_NOTE, vect_location, "skip.\n");
349 gsi_next (&si);
350 continue;
353 else if (STMT_VINFO_IN_PATTERN_P (stmt_info)
354 && (pattern_stmt = STMT_VINFO_RELATED_STMT (stmt_info))
355 && (STMT_VINFO_RELEVANT_P (vinfo_for_stmt (pattern_stmt))
356 || STMT_VINFO_LIVE_P (vinfo_for_stmt (pattern_stmt))))
357 analyze_pattern_stmt = true;
359 /* If a pattern statement has def stmts, analyze them too. */
360 if (is_pattern_stmt_p (stmt_info))
362 if (pattern_def_seq == NULL)
364 pattern_def_seq = STMT_VINFO_PATTERN_DEF_SEQ (stmt_info);
365 pattern_def_si = gsi_start (pattern_def_seq);
367 else if (!gsi_end_p (pattern_def_si))
368 gsi_next (&pattern_def_si);
369 if (pattern_def_seq != NULL)
371 gimple pattern_def_stmt = NULL;
372 stmt_vec_info pattern_def_stmt_info = NULL;
374 while (!gsi_end_p (pattern_def_si))
376 pattern_def_stmt = gsi_stmt (pattern_def_si);
377 pattern_def_stmt_info
378 = vinfo_for_stmt (pattern_def_stmt);
379 if (STMT_VINFO_RELEVANT_P (pattern_def_stmt_info)
380 || STMT_VINFO_LIVE_P (pattern_def_stmt_info))
381 break;
382 gsi_next (&pattern_def_si);
385 if (!gsi_end_p (pattern_def_si))
387 if (dump_enabled_p ())
389 dump_printf_loc (MSG_NOTE, vect_location,
390 "==> examining pattern def stmt: ");
391 dump_gimple_stmt (MSG_NOTE, TDF_SLIM,
392 pattern_def_stmt, 0);
393 dump_printf (MSG_NOTE, "\n");
396 stmt = pattern_def_stmt;
397 stmt_info = pattern_def_stmt_info;
399 else
401 pattern_def_si = gsi_none ();
402 analyze_pattern_stmt = false;
405 else
406 analyze_pattern_stmt = false;
409 if (gimple_get_lhs (stmt) == NULL_TREE
410 /* MASK_STORE has no lhs, but is ok. */
411 && (!is_gimple_call (stmt)
412 || !gimple_call_internal_p (stmt)
413 || gimple_call_internal_fn (stmt) != IFN_MASK_STORE))
415 if (is_gimple_call (stmt))
417 /* Ignore calls with no lhs. These must be calls to
418 #pragma omp simd functions, and the vectorization factor
419 they really need can't be determined until
420 vectorizable_simd_clone_call. */
421 if (!analyze_pattern_stmt && gsi_end_p (pattern_def_si))
423 pattern_def_seq = NULL;
424 gsi_next (&si);
426 continue;
428 if (dump_enabled_p ())
430 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
431 "not vectorized: irregular stmt.");
432 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt,
434 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
436 return false;
439 if (VECTOR_MODE_P (TYPE_MODE (gimple_expr_type (stmt))))
441 if (dump_enabled_p ())
443 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
444 "not vectorized: vector stmt in loop:");
445 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
446 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
448 return false;
451 if (STMT_VINFO_VECTYPE (stmt_info))
453 /* The only case in which a vectype has already been set is for stmts
454 that contain a dataref, or for "pattern-stmts" (stmts
455 generated by the vectorizer to represent/replace a certain
456 idiom). */
457 gcc_assert (STMT_VINFO_DATA_REF (stmt_info)
458 || is_pattern_stmt_p (stmt_info)
459 || !gsi_end_p (pattern_def_si));
460 vectype = STMT_VINFO_VECTYPE (stmt_info);
462 else
464 gcc_assert (!STMT_VINFO_DATA_REF (stmt_info));
465 if (is_gimple_call (stmt)
466 && gimple_call_internal_p (stmt)
467 && gimple_call_internal_fn (stmt) == IFN_MASK_STORE)
468 scalar_type = TREE_TYPE (gimple_call_arg (stmt, 3));
469 else
470 scalar_type = TREE_TYPE (gimple_get_lhs (stmt));
471 if (dump_enabled_p ())
473 dump_printf_loc (MSG_NOTE, vect_location,
474 "get vectype for scalar type: ");
475 dump_generic_expr (MSG_NOTE, TDF_SLIM, scalar_type);
476 dump_printf (MSG_NOTE, "\n");
478 vectype = get_vectype_for_scalar_type (scalar_type);
479 if (!vectype)
481 if (dump_enabled_p ())
483 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
484 "not vectorized: unsupported "
485 "data-type ");
486 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
487 scalar_type);
488 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
490 return false;
493 STMT_VINFO_VECTYPE (stmt_info) = vectype;
495 if (dump_enabled_p ())
497 dump_printf_loc (MSG_NOTE, vect_location, "vectype: ");
498 dump_generic_expr (MSG_NOTE, TDF_SLIM, vectype);
499 dump_printf (MSG_NOTE, "\n");
503 /* The vectorization factor is determined by the smallest
504 scalar type (or the largest vector size, but we only
505 support one vector size per loop). */
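          /* Illustrative sketch: for a widening stmt such as int_x = (int) short_y
             the smallest scalar type is "short"; with 16-byte vectors that gives
             nunits = 8 rather than the 4 suggested by the int result type, and the
             vectorization factor is raised accordingly.  */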
506 scalar_type = vect_get_smallest_scalar_type (stmt, &dummy,
507 &dummy);
508 if (dump_enabled_p ())
510 dump_printf_loc (MSG_NOTE, vect_location,
511 "get vectype for scalar type: ");
512 dump_generic_expr (MSG_NOTE, TDF_SLIM, scalar_type);
513 dump_printf (MSG_NOTE, "\n");
515 vf_vectype = get_vectype_for_scalar_type (scalar_type);
516 if (!vf_vectype)
518 if (dump_enabled_p ())
520 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
521 "not vectorized: unsupported data-type ");
522 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
523 scalar_type);
524 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
526 return false;
529 if ((GET_MODE_SIZE (TYPE_MODE (vectype))
530 != GET_MODE_SIZE (TYPE_MODE (vf_vectype))))
532 if (dump_enabled_p ())
534 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
535 "not vectorized: different sized vector "
536 "types in statement, ");
537 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
538 vectype);
539 dump_printf (MSG_MISSED_OPTIMIZATION, " and ");
540 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
541 vf_vectype);
542 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
544 return false;
547 if (dump_enabled_p ())
549 dump_printf_loc (MSG_NOTE, vect_location, "vectype: ");
550 dump_generic_expr (MSG_NOTE, TDF_SLIM, vf_vectype);
551 dump_printf (MSG_NOTE, "\n");
554 nunits = TYPE_VECTOR_SUBPARTS (vf_vectype);
555 if (dump_enabled_p ())
556 dump_printf_loc (MSG_NOTE, vect_location, "nunits = %d\n", nunits);
557 if (!vectorization_factor
558 || (nunits > vectorization_factor))
559 vectorization_factor = nunits;
561 if (!analyze_pattern_stmt && gsi_end_p (pattern_def_si))
563 pattern_def_seq = NULL;
564 gsi_next (&si);
569 /* TODO: Analyze cost. Decide if worth while to vectorize. */
570 if (dump_enabled_p ())
571 dump_printf_loc (MSG_NOTE, vect_location, "vectorization factor = %d\n",
572 vectorization_factor);
573 if (vectorization_factor <= 1)
575 if (dump_enabled_p ())
576 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
577 "not vectorized: unsupported data-type\n");
578 return false;
580 LOOP_VINFO_VECT_FACTOR (loop_vinfo) = vectorization_factor;
582 return true;
586 /* Function vect_is_simple_iv_evolution.
588 FORNOW: A simple evolution of an induction variable in the loop is
589 considered a polynomial evolution. */
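   /* Illustrative example (a sketch, not part of the original comment):
      for an IV updated as  i = i + 4  with initial value 0, the access
      function is the chrec {0, +, 4}_n (n = loop number); INIT becomes 0,
      STEP becomes 4, and the evolution is "simple".  If the step itself
      varies inside the loop, the evolution part is again a chrec and the
      function returns false.  */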
591 static bool
592 vect_is_simple_iv_evolution (unsigned loop_nb, tree access_fn, tree * init,
593 tree * step)
595 tree init_expr;
596 tree step_expr;
597 tree evolution_part = evolution_part_in_loop_num (access_fn, loop_nb);
598 basic_block bb;
600 /* When there is no evolution in this loop, the evolution function
601 is not "simple". */
602 if (evolution_part == NULL_TREE)
603 return false;
605 /* When the evolution is a polynomial of degree >= 2
606 the evolution function is not "simple". */
607 if (tree_is_chrec (evolution_part))
608 return false;
610 step_expr = evolution_part;
611 init_expr = unshare_expr (initial_condition_in_loop_num (access_fn, loop_nb));
613 if (dump_enabled_p ())
615 dump_printf_loc (MSG_NOTE, vect_location, "step: ");
616 dump_generic_expr (MSG_NOTE, TDF_SLIM, step_expr);
617 dump_printf (MSG_NOTE, ", init: ");
618 dump_generic_expr (MSG_NOTE, TDF_SLIM, init_expr);
619 dump_printf (MSG_NOTE, "\n");
622 *init = init_expr;
623 *step = step_expr;
625 if (TREE_CODE (step_expr) != INTEGER_CST
626 && (TREE_CODE (step_expr) != SSA_NAME
627 || ((bb = gimple_bb (SSA_NAME_DEF_STMT (step_expr)))
628 && flow_bb_inside_loop_p (get_loop (cfun, loop_nb), bb))
629 || (!INTEGRAL_TYPE_P (TREE_TYPE (step_expr))
630 && (!SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr))
631 || !flag_associative_math)))
632 && (TREE_CODE (step_expr) != REAL_CST
633 || !flag_associative_math))
635 if (dump_enabled_p ())
636 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
637 "step unknown.\n");
638 return false;
641 return true;
644 /* Function vect_analyze_scalar_cycles_1.
646 Examine the cross iteration def-use cycles of scalar variables
647 in LOOP. LOOP_VINFO represents the loop that is now being
648 considered for vectorization (can be LOOP, or an outer-loop
649 enclosing LOOP). */
651 static void
652 vect_analyze_scalar_cycles_1 (loop_vec_info loop_vinfo, struct loop *loop)
654 basic_block bb = loop->header;
655 tree init, step;
656 auto_vec<gimple, 64> worklist;
657 gphi_iterator gsi;
658 bool double_reduc;
660 if (dump_enabled_p ())
661 dump_printf_loc (MSG_NOTE, vect_location,
662 "=== vect_analyze_scalar_cycles ===\n");
664 /* First - identify all inductions. Reduction detection assumes that all the
665 inductions have been identified; therefore, this order must not be
666 changed. */
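  /* For instance, in  for (i=0; i<N; i++) sum += a[i];  the header phi for
     "i" is classified as an induction here, while the phi for "sum" has no
     simple IV evolution, is pushed onto the worklist, and is recognized as
     a reduction in the second pass below.  */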
667 for (gsi = gsi_start_phis (bb); !gsi_end_p (gsi); gsi_next (&gsi))
669 gphi *phi = gsi.phi ();
670 tree access_fn = NULL;
671 tree def = PHI_RESULT (phi);
672 stmt_vec_info stmt_vinfo = vinfo_for_stmt (phi);
674 if (dump_enabled_p ())
676 dump_printf_loc (MSG_NOTE, vect_location, "Analyze phi: ");
677 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, phi, 0);
678 dump_printf (MSG_NOTE, "\n");
681 /* Skip virtual phis. The data dependences that are associated with
682 virtual defs/uses (i.e., memory accesses) are analyzed elsewhere. */
683 if (virtual_operand_p (def))
684 continue;
686 STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_unknown_def_type;
688 /* Analyze the evolution function. */
689 access_fn = analyze_scalar_evolution (loop, def);
690 if (access_fn)
692 STRIP_NOPS (access_fn);
693 if (dump_enabled_p ())
695 dump_printf_loc (MSG_NOTE, vect_location,
696 "Access function of PHI: ");
697 dump_generic_expr (MSG_NOTE, TDF_SLIM, access_fn);
698 dump_printf (MSG_NOTE, "\n");
700 STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_vinfo)
701 = evolution_part_in_loop_num (access_fn, loop->num);
704 if (!access_fn
705 || !vect_is_simple_iv_evolution (loop->num, access_fn, &init, &step)
706 || (LOOP_VINFO_LOOP (loop_vinfo) != loop
707 && TREE_CODE (step) != INTEGER_CST))
709 worklist.safe_push (phi);
710 continue;
713 gcc_assert (STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_vinfo) != NULL_TREE);
715 if (dump_enabled_p ())
716 dump_printf_loc (MSG_NOTE, vect_location, "Detected induction.\n");
717 STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_induction_def;
721 /* Second - identify all reductions and nested cycles. */
722 while (worklist.length () > 0)
724 gimple phi = worklist.pop ();
725 tree def = PHI_RESULT (phi);
726 stmt_vec_info stmt_vinfo = vinfo_for_stmt (phi);
727 gimple reduc_stmt;
728 bool nested_cycle;
730 if (dump_enabled_p ())
732 dump_printf_loc (MSG_NOTE, vect_location, "Analyze phi: ");
733 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, phi, 0);
734 dump_printf (MSG_NOTE, "\n");
737 gcc_assert (!virtual_operand_p (def)
738 && STMT_VINFO_DEF_TYPE (stmt_vinfo) == vect_unknown_def_type);
740 nested_cycle = (loop != LOOP_VINFO_LOOP (loop_vinfo));
741 reduc_stmt = vect_force_simple_reduction (loop_vinfo, phi, !nested_cycle,
742 &double_reduc);
743 if (reduc_stmt)
745 if (double_reduc)
747 if (dump_enabled_p ())
748 dump_printf_loc (MSG_NOTE, vect_location,
749 "Detected double reduction.\n");
751 STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_double_reduction_def;
752 STMT_VINFO_DEF_TYPE (vinfo_for_stmt (reduc_stmt)) =
753 vect_double_reduction_def;
755 else
757 if (nested_cycle)
759 if (dump_enabled_p ())
760 dump_printf_loc (MSG_NOTE, vect_location,
761 "Detected vectorizable nested cycle.\n");
763 STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_nested_cycle;
764 STMT_VINFO_DEF_TYPE (vinfo_for_stmt (reduc_stmt)) =
765 vect_nested_cycle;
767 else
769 if (dump_enabled_p ())
770 dump_printf_loc (MSG_NOTE, vect_location,
771 "Detected reduction.\n");
773 STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_reduction_def;
774 STMT_VINFO_DEF_TYPE (vinfo_for_stmt (reduc_stmt)) =
775 vect_reduction_def;
776 /* Store the reduction cycles for possible vectorization in
777 loop-aware SLP. */
778 LOOP_VINFO_REDUCTIONS (loop_vinfo).safe_push (reduc_stmt);
782 else
783 if (dump_enabled_p ())
784 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
785 "Unknown def-use cycle pattern.\n");
790 /* Function vect_analyze_scalar_cycles.
792 Examine the cross iteration def-use cycles of scalar variables, by
793 analyzing the loop-header PHIs of scalar variables. Classify each
794 cycle as one of the following: invariant, induction, reduction, unknown.
795 We do that for the loop represented by LOOP_VINFO, and also for its
796 inner-loop, if it exists.
797 Examples for scalar cycles:
799 Example1: reduction:
801 loop1:
802 for (i=0; i<N; i++)
803 sum += a[i];
805 Example2: induction:
807 loop2:
808 for (i=0; i<N; i++)
809 a[i] = i; */
811 static void
812 vect_analyze_scalar_cycles (loop_vec_info loop_vinfo)
814 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
816 vect_analyze_scalar_cycles_1 (loop_vinfo, loop);
818 /* When vectorizing an outer-loop, the inner-loop is executed sequentially.
819 Reductions in such an inner-loop therefore have different properties than
820 the reductions in the nest that gets vectorized:
821 1. When vectorized, they are executed in the same order as in the original
822 scalar loop, so we can't change the order of computation when
823 vectorizing them.
824 2. FIXME: Inner-loop reductions can be used in the inner-loop, so the
825 current checks are too strict. */
827 if (loop->inner)
828 vect_analyze_scalar_cycles_1 (loop_vinfo, loop->inner);
832 /* Function vect_get_loop_niters.
834 Determine how many iterations the loop executes and place it
835 in NUMBER_OF_ITERATIONS. Place the number of latch iterations
836 in NUMBER_OF_ITERATIONSM1.
838 Return the loop exit condition. */
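/* For example (a sketch): for a countable loop whose body executes N > 0
   times, the latch executes N - 1 times, so NUMBER_OF_ITERATIONSM1 is set
   to N - 1 and NUMBER_OF_ITERATIONS to N.  */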
841 static gcond *
842 vect_get_loop_niters (struct loop *loop, tree *number_of_iterations,
843 tree *number_of_iterationsm1)
845 tree niters;
847 if (dump_enabled_p ())
848 dump_printf_loc (MSG_NOTE, vect_location,
849 "=== get_loop_niters ===\n");
851 niters = number_of_latch_executions (loop);
852 *number_of_iterationsm1 = niters;
854 /* We want the number of loop header executions which is the number
855 of latch executions plus one.
856 ??? For UINT_MAX latch executions this number overflows to zero
857 for loops like do { n++; } while (n != 0); */
858 if (niters && !chrec_contains_undetermined (niters))
859 niters = fold_build2 (PLUS_EXPR, TREE_TYPE (niters), unshare_expr (niters),
860 build_int_cst (TREE_TYPE (niters), 1));
861 *number_of_iterations = niters;
863 return get_loop_exit_condition (loop);
867 /* Function bb_in_loop_p
869 Used as predicate for dfs order traversal of the loop bbs. */
871 static bool
872 bb_in_loop_p (const_basic_block bb, const void *data)
874 const struct loop *const loop = (const struct loop *)data;
875 if (flow_bb_inside_loop_p (loop, bb))
876 return true;
877 return false;
881 /* Function new_loop_vec_info.
883 Create and initialize a new loop_vec_info struct for LOOP, as well as
884 stmt_vec_info structs for all the stmts in LOOP. */
886 static loop_vec_info
887 new_loop_vec_info (struct loop *loop)
889 loop_vec_info res;
890 basic_block *bbs;
891 gimple_stmt_iterator si;
892 unsigned int i, nbbs;
894 res = (loop_vec_info) xcalloc (1, sizeof (struct _loop_vec_info));
895 LOOP_VINFO_LOOP (res) = loop;
897 bbs = get_loop_body (loop);
899 /* Create/Update stmt_info for all stmts in the loop. */
900 for (i = 0; i < loop->num_nodes; i++)
902 basic_block bb = bbs[i];
904 /* BBs in a nested inner-loop will have already been processed (because
905 we will have called vect_analyze_loop_form for any nested inner-loop).
906 Therefore, for stmts in an inner-loop we just want to update the
907 STMT_VINFO_LOOP_VINFO field of their stmt_info to point to the new
908 loop_info of the outer-loop we are currently considering to vectorize
909 (instead of the loop_info of the inner-loop).
910 For stmts in other BBs we need to create a stmt_info from scratch. */
911 if (bb->loop_father != loop)
913 /* Inner-loop bb. */
914 gcc_assert (loop->inner && bb->loop_father == loop->inner);
915 for (si = gsi_start_phis (bb); !gsi_end_p (si); gsi_next (&si))
917 gimple phi = gsi_stmt (si);
918 stmt_vec_info stmt_info = vinfo_for_stmt (phi);
919 loop_vec_info inner_loop_vinfo =
920 STMT_VINFO_LOOP_VINFO (stmt_info);
921 gcc_assert (loop->inner == LOOP_VINFO_LOOP (inner_loop_vinfo));
922 STMT_VINFO_LOOP_VINFO (stmt_info) = res;
924 for (si = gsi_start_bb (bb); !gsi_end_p (si); gsi_next (&si))
926 gimple stmt = gsi_stmt (si);
927 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
928 loop_vec_info inner_loop_vinfo =
929 STMT_VINFO_LOOP_VINFO (stmt_info);
930 gcc_assert (loop->inner == LOOP_VINFO_LOOP (inner_loop_vinfo));
931 STMT_VINFO_LOOP_VINFO (stmt_info) = res;
934 else
936 /* bb in current nest. */
937 for (si = gsi_start_phis (bb); !gsi_end_p (si); gsi_next (&si))
939 gimple phi = gsi_stmt (si);
940 gimple_set_uid (phi, 0);
941 set_vinfo_for_stmt (phi, new_stmt_vec_info (phi, res, NULL));
944 for (si = gsi_start_bb (bb); !gsi_end_p (si); gsi_next (&si))
946 gimple stmt = gsi_stmt (si);
947 gimple_set_uid (stmt, 0);
948 set_vinfo_for_stmt (stmt, new_stmt_vec_info (stmt, res, NULL));
953 /* CHECKME: We want to visit all BBs before their successors (except for
954 latch blocks, for which this assertion wouldn't hold). In the simple
955 case of the loop forms we allow, a dfs order of the BBs would be the same
956 as reversed postorder traversal, so we are safe. */
958 free (bbs);
959 bbs = XCNEWVEC (basic_block, loop->num_nodes);
960 nbbs = dfs_enumerate_from (loop->header, 0, bb_in_loop_p,
961 bbs, loop->num_nodes, loop);
962 gcc_assert (nbbs == loop->num_nodes);
964 LOOP_VINFO_BBS (res) = bbs;
965 LOOP_VINFO_NITERSM1 (res) = NULL;
966 LOOP_VINFO_NITERS (res) = NULL;
967 LOOP_VINFO_NITERS_UNCHANGED (res) = NULL;
968 LOOP_VINFO_COST_MODEL_MIN_ITERS (res) = 0;
969 LOOP_VINFO_COST_MODEL_THRESHOLD (res) = 0;
970 LOOP_VINFO_VECTORIZABLE_P (res) = 0;
971 LOOP_VINFO_PEELING_FOR_ALIGNMENT (res) = 0;
972 LOOP_VINFO_VECT_FACTOR (res) = 0;
973 LOOP_VINFO_LOOP_NEST (res).create (3);
974 LOOP_VINFO_DATAREFS (res).create (10);
975 LOOP_VINFO_DDRS (res).create (10 * 10);
976 LOOP_VINFO_UNALIGNED_DR (res) = NULL;
977 LOOP_VINFO_MAY_MISALIGN_STMTS (res).create (
978 PARAM_VALUE (PARAM_VECT_MAX_VERSION_FOR_ALIGNMENT_CHECKS));
979 LOOP_VINFO_MAY_ALIAS_DDRS (res).create (
980 PARAM_VALUE (PARAM_VECT_MAX_VERSION_FOR_ALIAS_CHECKS));
981 LOOP_VINFO_GROUPED_STORES (res).create (10);
982 LOOP_VINFO_REDUCTIONS (res).create (10);
983 LOOP_VINFO_REDUCTION_CHAINS (res).create (10);
984 LOOP_VINFO_SLP_INSTANCES (res).create (10);
985 LOOP_VINFO_SLP_UNROLLING_FACTOR (res) = 1;
986 LOOP_VINFO_TARGET_COST_DATA (res) = init_cost (loop);
987 LOOP_VINFO_PEELING_FOR_GAPS (res) = false;
988 LOOP_VINFO_PEELING_FOR_NITER (res) = false;
989 LOOP_VINFO_OPERANDS_SWAPPED (res) = false;
991 return res;
995 /* Function destroy_loop_vec_info.
997 Free LOOP_VINFO struct, as well as all the stmt_vec_info structs of all the
998 stmts in the loop. */
1000 void
1001 destroy_loop_vec_info (loop_vec_info loop_vinfo, bool clean_stmts)
1003 struct loop *loop;
1004 basic_block *bbs;
1005 int nbbs;
1006 gimple_stmt_iterator si;
1007 int j;
1008 vec<slp_instance> slp_instances;
1009 slp_instance instance;
1010 bool swapped;
1012 if (!loop_vinfo)
1013 return;
1015 loop = LOOP_VINFO_LOOP (loop_vinfo);
1017 bbs = LOOP_VINFO_BBS (loop_vinfo);
1018 nbbs = clean_stmts ? loop->num_nodes : 0;
1019 swapped = LOOP_VINFO_OPERANDS_SWAPPED (loop_vinfo);
1021 for (j = 0; j < nbbs; j++)
1023 basic_block bb = bbs[j];
1024 for (si = gsi_start_phis (bb); !gsi_end_p (si); gsi_next (&si))
1025 free_stmt_vec_info (gsi_stmt (si));
1027 for (si = gsi_start_bb (bb); !gsi_end_p (si); )
1029 gimple stmt = gsi_stmt (si);
1031 /* We may have broken canonical form by moving a constant
1032 into RHS1 of a commutative op. Fix such occurrences. */
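          /* For example (a sketch): if analysis-time operand swapping left a
             stmt as  _1 = 3 + _2,  it is swapped back here to the canonical
             _1 = _2 + 3.  */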
1033 if (swapped && is_gimple_assign (stmt))
1035 enum tree_code code = gimple_assign_rhs_code (stmt);
1037 if ((code == PLUS_EXPR
1038 || code == POINTER_PLUS_EXPR
1039 || code == MULT_EXPR)
1040 && CONSTANT_CLASS_P (gimple_assign_rhs1 (stmt)))
1041 swap_ssa_operands (stmt,
1042 gimple_assign_rhs1_ptr (stmt),
1043 gimple_assign_rhs2_ptr (stmt));
1046 /* Free stmt_vec_info. */
1047 free_stmt_vec_info (stmt);
1048 gsi_next (&si);
1052 free (LOOP_VINFO_BBS (loop_vinfo));
1053 vect_destroy_datarefs (loop_vinfo, NULL);
1054 free_dependence_relations (LOOP_VINFO_DDRS (loop_vinfo));
1055 LOOP_VINFO_LOOP_NEST (loop_vinfo).release ();
1056 LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).release ();
1057 LOOP_VINFO_MAY_ALIAS_DDRS (loop_vinfo).release ();
1058 slp_instances = LOOP_VINFO_SLP_INSTANCES (loop_vinfo);
1059 FOR_EACH_VEC_ELT (slp_instances, j, instance)
1060 vect_free_slp_instance (instance);
1062 LOOP_VINFO_SLP_INSTANCES (loop_vinfo).release ();
1063 LOOP_VINFO_GROUPED_STORES (loop_vinfo).release ();
1064 LOOP_VINFO_REDUCTIONS (loop_vinfo).release ();
1065 LOOP_VINFO_REDUCTION_CHAINS (loop_vinfo).release ();
1067 delete LOOP_VINFO_PEELING_HTAB (loop_vinfo);
1068 LOOP_VINFO_PEELING_HTAB (loop_vinfo) = NULL;
1070 destroy_cost_data (LOOP_VINFO_TARGET_COST_DATA (loop_vinfo));
1072 free (loop_vinfo);
1073 loop->aux = NULL;
1077 /* Function vect_analyze_loop_1.
1079 Apply a set of analyses on LOOP, and create a loop_vec_info struct
1080 for it. The different analyses will record information in the
1081 loop_vec_info struct. This is a subset of the analyses applied in
1082 vect_analyze_loop, to be applied on an inner-loop nested in the loop
1083 that is now considered for (outer-loop) vectorization. */
1085 static loop_vec_info
1086 vect_analyze_loop_1 (struct loop *loop)
1088 loop_vec_info loop_vinfo;
1090 if (dump_enabled_p ())
1091 dump_printf_loc (MSG_NOTE, vect_location,
1092 "===== analyze_loop_nest_1 =====\n");
1094 /* Check the CFG characteristics of the loop (nesting, entry/exit, etc.). */
1096 loop_vinfo = vect_analyze_loop_form (loop);
1097 if (!loop_vinfo)
1099 if (dump_enabled_p ())
1100 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1101 "bad inner-loop form.\n");
1102 return NULL;
1105 return loop_vinfo;
1109 /* Function vect_analyze_loop_form.
1111 Verify that certain CFG restrictions hold, including:
1112 - the loop has a pre-header
1113 - the loop has a single entry and exit
1114 - the loop exit condition is simple enough, and the number of iterations
1115 can be analyzed (a countable loop). */
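/* Illustrative sketch of these restrictions (an assumption about typical
   cases, not an exhaustive list): for the inner-most case the code below
   requires exactly two basic blocks (header and latch), so a loop whose
   body still contains branching at this point, e.g. an un-if-converted

     for (i = 0; i < n; i++)
       if (c[i])
         a[i] = b[i];

   has more blocks and is rejected with "control flow in loop", while a
   straight-line body in do-while form passes these checks.  */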
1117 loop_vec_info
1118 vect_analyze_loop_form (struct loop *loop)
1120 loop_vec_info loop_vinfo;
1121 gcond *loop_cond;
1122 tree number_of_iterations = NULL, number_of_iterationsm1 = NULL;
1123 loop_vec_info inner_loop_vinfo = NULL;
1125 if (dump_enabled_p ())
1126 dump_printf_loc (MSG_NOTE, vect_location,
1127 "=== vect_analyze_loop_form ===\n");
1129 /* Different restrictions apply when we are considering an inner-most loop,
1130 vs. an outer (nested) loop.
1131 (FORNOW. May want to relax some of these restrictions in the future). */
1133 if (!loop->inner)
1135 /* Inner-most loop. We currently require that the number of BBs is
1136 exactly 2 (the header and latch). Vectorizable inner-most loops
1137 look like this:
1139 (pre-header)
1141 header <--------+
1142 | | |
1143 | +--> latch --+
1145 (exit-bb) */
1147 if (loop->num_nodes != 2)
1149 if (dump_enabled_p ())
1150 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1151 "not vectorized: control flow in loop.\n");
1152 return NULL;
1155 if (empty_block_p (loop->header))
1157 if (dump_enabled_p ())
1158 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1159 "not vectorized: empty loop.\n");
1160 return NULL;
1163 else
1165 struct loop *innerloop = loop->inner;
1166 edge entryedge;
1168 /* Nested loop. We currently require that the loop is doubly-nested,
1169 contains a single inner loop, and the number of BBs is exactly 5.
1170 Vectorizable outer-loops look like this:
1172 (pre-header)
1174 header <---+
1176 inner-loop |
1178 tail ------+
1180 (exit-bb)
1182 The inner-loop has the properties expected of inner-most loops
1183 as described above. */
1185 if ((loop->inner)->inner || (loop->inner)->next)
1187 if (dump_enabled_p ())
1188 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1189 "not vectorized: multiple nested loops.\n");
1190 return NULL;
1193 /* Analyze the inner-loop. */
1194 inner_loop_vinfo = vect_analyze_loop_1 (loop->inner);
1195 if (!inner_loop_vinfo)
1197 if (dump_enabled_p ())
1198 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1199 "not vectorized: Bad inner loop.\n");
1200 return NULL;
1203 if (!expr_invariant_in_loop_p (loop,
1204 LOOP_VINFO_NITERS (inner_loop_vinfo)))
1206 if (dump_enabled_p ())
1207 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1208 "not vectorized: inner-loop count not"
1209 " invariant.\n");
1210 destroy_loop_vec_info (inner_loop_vinfo, true);
1211 return NULL;
1214 if (loop->num_nodes != 5)
1216 if (dump_enabled_p ())
1217 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1218 "not vectorized: control flow in loop.\n");
1219 destroy_loop_vec_info (inner_loop_vinfo, true);
1220 return NULL;
1223 gcc_assert (EDGE_COUNT (innerloop->header->preds) == 2);
1224 entryedge = EDGE_PRED (innerloop->header, 0);
1225 if (EDGE_PRED (innerloop->header, 0)->src == innerloop->latch)
1226 entryedge = EDGE_PRED (innerloop->header, 1);
1228 if (entryedge->src != loop->header
1229 || !single_exit (innerloop)
1230 || single_exit (innerloop)->dest != EDGE_PRED (loop->latch, 0)->src)
1232 if (dump_enabled_p ())
1233 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1234 "not vectorized: unsupported outerloop form.\n");
1235 destroy_loop_vec_info (inner_loop_vinfo, true);
1236 return NULL;
1239 if (dump_enabled_p ())
1240 dump_printf_loc (MSG_NOTE, vect_location,
1241 "Considering outer-loop vectorization.\n");
1244 if (!single_exit (loop)
1245 || EDGE_COUNT (loop->header->preds) != 2)
1247 if (dump_enabled_p ())
1249 if (!single_exit (loop))
1250 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1251 "not vectorized: multiple exits.\n");
1252 else if (EDGE_COUNT (loop->header->preds) != 2)
1253 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1254 "not vectorized: too many incoming edges.\n");
1256 if (inner_loop_vinfo)
1257 destroy_loop_vec_info (inner_loop_vinfo, true);
1258 return NULL;
1261 /* We assume that the loop exit condition is at the end of the loop, i.e.,
1262 that the loop is represented as a do-while (with a proper if-guard
1263 before the loop if needed), where the loop header contains all the
1264 executable statements, and the latch is empty. */
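  /* Sketch of the expected shape:

       header:
         ... all executable stmts ...
         i_next = i + 1;
         if (i_next < n) goto latch; else goto exit;
       latch:
         goto header;

     i.e. the exit test is the last stmt of the header and the latch
     contains no stmts or phis.  */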
1265 if (!empty_block_p (loop->latch)
1266 || !gimple_seq_empty_p (phi_nodes (loop->latch)))
1268 if (dump_enabled_p ())
1269 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1270 "not vectorized: latch block not empty.\n");
1271 if (inner_loop_vinfo)
1272 destroy_loop_vec_info (inner_loop_vinfo, true);
1273 return NULL;
1276 /* Make sure there exists a single-predecessor exit bb: */
1277 if (!single_pred_p (single_exit (loop)->dest))
1279 edge e = single_exit (loop);
1280 if (!(e->flags & EDGE_ABNORMAL))
1282 split_loop_exit_edge (e);
1283 if (dump_enabled_p ())
1284 dump_printf (MSG_NOTE, "split exit edge.\n");
1286 else
1288 if (dump_enabled_p ())
1289 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1290 "not vectorized: abnormal loop exit edge.\n");
1291 if (inner_loop_vinfo)
1292 destroy_loop_vec_info (inner_loop_vinfo, true);
1293 return NULL;
1297 loop_cond = vect_get_loop_niters (loop, &number_of_iterations,
1298 &number_of_iterationsm1);
1299 if (!loop_cond)
1301 if (dump_enabled_p ())
1302 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1303 "not vectorized: complicated exit condition.\n");
1304 if (inner_loop_vinfo)
1305 destroy_loop_vec_info (inner_loop_vinfo, true);
1306 return NULL;
1309 if (!number_of_iterations
1310 || chrec_contains_undetermined (number_of_iterations))
1312 if (dump_enabled_p ())
1313 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1314 "not vectorized: number of iterations cannot be "
1315 "computed.\n");
1316 if (inner_loop_vinfo)
1317 destroy_loop_vec_info (inner_loop_vinfo, true);
1318 return NULL;
1321 if (integer_zerop (number_of_iterations))
1323 if (dump_enabled_p ())
1324 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1325 "not vectorized: number of iterations = 0.\n");
1326 if (inner_loop_vinfo)
1327 destroy_loop_vec_info (inner_loop_vinfo, true);
1328 return NULL;
1331 loop_vinfo = new_loop_vec_info (loop);
1332 LOOP_VINFO_NITERSM1 (loop_vinfo) = number_of_iterationsm1;
1333 LOOP_VINFO_NITERS (loop_vinfo) = number_of_iterations;
1334 LOOP_VINFO_NITERS_UNCHANGED (loop_vinfo) = number_of_iterations;
1336 if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
1338 if (dump_enabled_p ())
1340 dump_printf_loc (MSG_NOTE, vect_location,
1341 "Symbolic number of iterations is ");
1342 dump_generic_expr (MSG_NOTE, TDF_DETAILS, number_of_iterations);
1343 dump_printf (MSG_NOTE, "\n");
1347 STMT_VINFO_TYPE (vinfo_for_stmt (loop_cond)) = loop_exit_ctrl_vec_info_type;
1349 /* CHECKME: May want to keep it around in the future. */
1350 if (inner_loop_vinfo)
1351 destroy_loop_vec_info (inner_loop_vinfo, false);
1353 gcc_assert (!loop->aux);
1354 loop->aux = loop_vinfo;
1355 return loop_vinfo;
1359 /* Function vect_analyze_loop_operations.
1361 Scan the loop stmts and make sure they are all vectorizable. */
1363 static bool
1364 vect_analyze_loop_operations (loop_vec_info loop_vinfo, bool slp)
1366 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1367 basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
1368 int nbbs = loop->num_nodes;
1369 unsigned int vectorization_factor = 0;
1370 int i;
1371 stmt_vec_info stmt_info;
1372 bool need_to_vectorize = false;
1373 int min_profitable_iters;
1374 int min_scalar_loop_bound;
1375 unsigned int th;
1376 bool only_slp_in_loop = true, ok;
1377 HOST_WIDE_INT max_niter;
1378 HOST_WIDE_INT estimated_niter;
1379 int min_profitable_estimate;
1381 if (dump_enabled_p ())
1382 dump_printf_loc (MSG_NOTE, vect_location,
1383 "=== vect_analyze_loop_operations ===\n");
1385 gcc_assert (LOOP_VINFO_VECT_FACTOR (loop_vinfo));
1386 vectorization_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
1387 if (slp)
1389 /* If all the stmts in the loop can be SLPed, we perform only SLP, and
1390 the vectorization factor of the loop is the unrolling factor required by
1391 the SLP instances. If that unrolling factor is 1, we say that we
1392 perform pure SLP on the loop - cross-iteration parallelism is not
1393 exploited. */
1394 for (i = 0; i < nbbs; i++)
1396 basic_block bb = bbs[i];
1397 for (gimple_stmt_iterator si = gsi_start_bb (bb); !gsi_end_p (si);
1398 gsi_next (&si))
1400 gimple stmt = gsi_stmt (si);
1401 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
1402 gcc_assert (stmt_info);
1403 if ((STMT_VINFO_RELEVANT_P (stmt_info)
1404 || VECTORIZABLE_CYCLE_DEF (STMT_VINFO_DEF_TYPE (stmt_info)))
1405 && !PURE_SLP_STMT (stmt_info))
1406 /* STMT needs both SLP and loop-based vectorization. */
1407 only_slp_in_loop = false;
1411 if (only_slp_in_loop)
1412 vectorization_factor = LOOP_VINFO_SLP_UNROLLING_FACTOR (loop_vinfo);
1413 else
1414 vectorization_factor = least_common_multiple (vectorization_factor,
1415 LOOP_VINFO_SLP_UNROLLING_FACTOR (loop_vinfo));
1417 LOOP_VINFO_VECT_FACTOR (loop_vinfo) = vectorization_factor;
1418 if (dump_enabled_p ())
1419 dump_printf_loc (MSG_NOTE, vect_location,
1420 "Updating vectorization factor to %d\n",
1421 vectorization_factor);
1424 for (i = 0; i < nbbs; i++)
1426 basic_block bb = bbs[i];
1428 for (gphi_iterator si = gsi_start_phis (bb); !gsi_end_p (si);
1429 gsi_next (&si))
1431 gphi *phi = si.phi ();
1432 ok = true;
1434 stmt_info = vinfo_for_stmt (phi);
1435 if (dump_enabled_p ())
1437 dump_printf_loc (MSG_NOTE, vect_location, "examining phi: ");
1438 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, phi, 0);
1439 dump_printf (MSG_NOTE, "\n");
1442 /* Inner-loop loop-closed exit phi in outer-loop vectorization
1443 (i.e., a phi in the tail of the outer-loop). */
1444 if (! is_loop_header_bb_p (bb))
1446 /* FORNOW: we currently don't support the case that these phis
1447 are not used in the outer loop (unless it is a double reduction,
1448 i.e., this phi is vect_reduction_def), because this case
1449 requires us to actually do something here. */
1450 if ((!STMT_VINFO_RELEVANT_P (stmt_info)
1451 || STMT_VINFO_LIVE_P (stmt_info))
1452 && STMT_VINFO_DEF_TYPE (stmt_info)
1453 != vect_double_reduction_def)
1455 if (dump_enabled_p ())
1456 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1457 "Unsupported loop-closed phi in "
1458 "outer-loop.\n");
1459 return false;
1462 /* If PHI is used in the outer loop, we check that its operand
1463 is defined in the inner loop. */
1464 if (STMT_VINFO_RELEVANT_P (stmt_info))
1466 tree phi_op;
1467 gimple op_def_stmt;
1469 if (gimple_phi_num_args (phi) != 1)
1470 return false;
1472 phi_op = PHI_ARG_DEF (phi, 0);
1473 if (TREE_CODE (phi_op) != SSA_NAME)
1474 return false;
1476 op_def_stmt = SSA_NAME_DEF_STMT (phi_op);
1477 if (gimple_nop_p (op_def_stmt)
1478 || !flow_bb_inside_loop_p (loop, gimple_bb (op_def_stmt))
1479 || !vinfo_for_stmt (op_def_stmt))
1480 return false;
1482 if (STMT_VINFO_RELEVANT (vinfo_for_stmt (op_def_stmt))
1483 != vect_used_in_outer
1484 && STMT_VINFO_RELEVANT (vinfo_for_stmt (op_def_stmt))
1485 != vect_used_in_outer_by_reduction)
1486 return false;
1489 continue;
1492 gcc_assert (stmt_info);
1494 if (STMT_VINFO_LIVE_P (stmt_info))
1496 /* FORNOW: not yet supported. */
1497 if (dump_enabled_p ())
1498 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1499 "not vectorized: value used after loop.\n");
1500 return false;
1503 if (STMT_VINFO_RELEVANT (stmt_info) == vect_used_in_scope
1504 && STMT_VINFO_DEF_TYPE (stmt_info) != vect_induction_def)
1506 /* A scalar-dependence cycle that we don't support. */
1507 if (dump_enabled_p ())
1508 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1509 "not vectorized: scalar dependence cycle.\n");
1510 return false;
1513 if (STMT_VINFO_RELEVANT_P (stmt_info))
1515 need_to_vectorize = true;
1516 if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_induction_def)
1517 ok = vectorizable_induction (phi, NULL, NULL);
1520 if (!ok)
1522 if (dump_enabled_p ())
1524 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1525 "not vectorized: relevant phi not "
1526 "supported: ");
1527 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, phi, 0);
1528 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
1530 return false;
1534 for (gimple_stmt_iterator si = gsi_start_bb (bb); !gsi_end_p (si);
1535 gsi_next (&si))
1537 gimple stmt = gsi_stmt (si);
1538 if (!gimple_clobber_p (stmt)
1539 && !vect_analyze_stmt (stmt, &need_to_vectorize, NULL))
1540 return false;
1542 } /* bbs */
1544 /* All operations in the loop are either irrelevant (deal with loop
1545 control, or dead), or only used outside the loop and can be moved
1546 out of the loop (e.g. invariants, inductions). The loop can be
1547 optimized away by scalar optimizations. We're better off not
1548 touching this loop. */
1549 if (!need_to_vectorize)
1551 if (dump_enabled_p ())
1552 dump_printf_loc (MSG_NOTE, vect_location,
1553 "All the computation can be taken out of the loop.\n");
1554 if (dump_enabled_p ())
1555 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1556 "not vectorized: redundant loop. no profit to "
1557 "vectorize.\n");
1558 return false;
1561 if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo) && dump_enabled_p ())
1562 dump_printf_loc (MSG_NOTE, vect_location,
1563 "vectorization_factor = %d, niters = "
1564 HOST_WIDE_INT_PRINT_DEC "\n", vectorization_factor,
1565 LOOP_VINFO_INT_NITERS (loop_vinfo));
1567 if ((LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
1568 && (LOOP_VINFO_INT_NITERS (loop_vinfo) < vectorization_factor))
1569 || ((max_niter = max_stmt_executions_int (loop)) != -1
1570 && (unsigned HOST_WIDE_INT) max_niter < vectorization_factor))
1572 if (dump_enabled_p ())
1573 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1574 "not vectorized: iteration count too small.\n");
1575 if (dump_enabled_p ())
1576 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1577 "not vectorized: iteration count smaller than "
1578 "vectorization factor.\n");
1579 return false;
1582 /* Analyze cost. Decide if worth while to vectorize. */
1584 /* Once VF is set, SLP costs should be updated since the number of created
1585 vector stmts depends on VF. */
1586 vect_update_slp_costs_according_to_vf (loop_vinfo);
1588 vect_estimate_min_profitable_iters (loop_vinfo, &min_profitable_iters,
1589 &min_profitable_estimate);
1590 LOOP_VINFO_COST_MODEL_MIN_ITERS (loop_vinfo) = min_profitable_iters;
1592 if (min_profitable_iters < 0)
1594 if (dump_enabled_p ())
1595 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1596 "not vectorized: vectorization not profitable.\n");
1597 if (dump_enabled_p ())
1598 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1599 "not vectorized: vector version will never be "
1600 "profitable.\n");
1601 return false;
1604 min_scalar_loop_bound = ((PARAM_VALUE (PARAM_MIN_VECT_LOOP_BOUND)
1605 * vectorization_factor) - 1);
1608 /* Use the cost model only if it is more conservative than the
1609 user-specified threshold. */
1611 th = (unsigned) min_scalar_loop_bound;
1612 if (min_profitable_iters
1613 && (!min_scalar_loop_bound
1614 || min_profitable_iters > min_scalar_loop_bound))
1615 th = (unsigned) min_profitable_iters;
1617 LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo) = th;
1619 if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
1620 && LOOP_VINFO_INT_NITERS (loop_vinfo) <= th)
1622 if (dump_enabled_p ())
1623 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1624 "not vectorized: vectorization not profitable.\n");
1625 if (dump_enabled_p ())
1626 dump_printf_loc (MSG_NOTE, vect_location,
1627 "not vectorized: iteration count smaller than user "
1628 "specified loop bound parameter or minimum profitable "
1629 "iterations (whichever is more conservative).\n");
1630 return false;
1633 if ((estimated_niter = estimated_stmt_executions_int (loop)) != -1
1634 && ((unsigned HOST_WIDE_INT) estimated_niter
1635 <= MAX (th, (unsigned)min_profitable_estimate)))
1637 if (dump_enabled_p ())
1638 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1639 "not vectorized: estimated iteration count too "
1640 "small.\n");
1641 if (dump_enabled_p ())
1642 dump_printf_loc (MSG_NOTE, vect_location,
1643 "not vectorized: estimated iteration count smaller "
1644 "than specified loop bound parameter or minimum "
1645 "profitable iterations (whichever is more "
1646 "conservative).\n");
1647 return false;
1650 return true;
1654 /* Function vect_analyze_loop_2.
1656 Apply a set of analyses on LOOP, and create a loop_vec_info struct
1657 for it. The different analyses will record information in the
1658 loop_vec_info struct. */
1659 static bool
1660 vect_analyze_loop_2 (loop_vec_info loop_vinfo)
1662 bool ok, slp = false;
1663 int max_vf = MAX_VECTORIZATION_FACTOR;
1664 int min_vf = 2;
1665 unsigned int th;
1666 unsigned int n_stmts = 0;
1668 /* Find all data references in the loop (which correspond to vdefs/vuses)
1669 and analyze their evolution in the loop. Also adjust the minimal
1670 vectorization factor according to the loads and stores.
1672 FORNOW: Handle only simple array references whose
1673 alignment can be forced, and aligned pointer-references. */
1675 ok = vect_analyze_data_refs (loop_vinfo, NULL, &min_vf, &n_stmts);
1676 if (!ok)
1678 if (dump_enabled_p ())
1679 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1680 "bad data references.\n");
1681 return false;
1684 /* Classify all cross-iteration scalar data-flow cycles.
1685 Cross-iteration cycles caused by virtual phis are analyzed separately. */
1687 vect_analyze_scalar_cycles (loop_vinfo);
1689 vect_pattern_recog (loop_vinfo, NULL);
1691 /* Analyze the access patterns of the data-refs in the loop (consecutive,
1692 complex, etc.). FORNOW: Only handle consecutive access pattern. */
1694 ok = vect_analyze_data_ref_accesses (loop_vinfo, NULL);
1695 if (!ok)
1697 if (dump_enabled_p ())
1698 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1699 "bad data access.\n");
1700 return false;
1703 /* Data-flow analysis to detect stmts that do not need to be vectorized. */
1705 ok = vect_mark_stmts_to_be_vectorized (loop_vinfo);
1706 if (!ok)
1708 if (dump_enabled_p ())
1709 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1710 "unexpected pattern.\n");
1711 return false;
1714 /* Analyze data dependences between the data-refs in the loop
1715 and adjust the maximum vectorization factor according to
1716 the dependences.
1717 FORNOW: fail at the first data dependence that we encounter. */
1719 ok = vect_analyze_data_ref_dependences (loop_vinfo, &max_vf);
1720 if (!ok
1721 || max_vf < min_vf)
1723 if (dump_enabled_p ())
1724 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1725 "bad data dependence.\n");
1726 return false;
1729 ok = vect_determine_vectorization_factor (loop_vinfo);
1730 if (!ok)
1732 if (dump_enabled_p ())
1733 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1734 "can't determine vectorization factor.\n");
1735 return false;
1737 if (max_vf < LOOP_VINFO_VECT_FACTOR (loop_vinfo))
1739 if (dump_enabled_p ())
1740 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1741 "bad data dependence.\n");
1742 return false;
1745 /* Analyze the alignment of the data-refs in the loop.
1746 Fail if a data reference is found that cannot be vectorized. */
1748 ok = vect_analyze_data_refs_alignment (loop_vinfo, NULL);
1749 if (!ok)
1751 if (dump_enabled_p ())
1752 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1753 "bad data alignment.\n");
1754 return false;
1757 /* Prune the list of ddrs to be tested at run-time by versioning for alias.
1758 It is important to call pruning after vect_analyze_data_ref_accesses,
1759 since we use grouping information gathered by interleaving analysis. */
1760 ok = vect_prune_runtime_alias_test_list (loop_vinfo);
1761 if (!ok)
1763 if (dump_enabled_p ())
1764 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1765 "number of versioning for alias "
1766 "run-time tests exceeds %d "
1767 "(--param vect-max-version-for-alias-checks)\n",
1768 PARAM_VALUE (PARAM_VECT_MAX_VERSION_FOR_ALIAS_CHECKS));
1769 return false;
1772 /* This pass will decide on using loop versioning and/or loop peeling in
1773 order to enhance the alignment of data references in the loop. */
1775 ok = vect_enhance_data_refs_alignment (loop_vinfo);
1776 if (!ok)
1778 if (dump_enabled_p ())
1779 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1780 "bad data alignment.\n");
1781 return false;
1784 /* Check the SLP opportunities in the loop, analyze and build SLP trees. */
1785 ok = vect_analyze_slp (loop_vinfo, NULL, n_stmts);
1786 if (ok)
1788 /* Decide which possible SLP instances to SLP. */
1789 slp = vect_make_slp_decision (loop_vinfo);
1791 /* Find stmts that need to be both vectorized and SLPed. */
1792 vect_detect_hybrid_slp (loop_vinfo);
1794 else
1795 return false;
1797 /* Scan all the operations in the loop and make sure they are
1798 vectorizable. */
1800 ok = vect_analyze_loop_operations (loop_vinfo, slp);
1801 if (!ok)
1803 if (dump_enabled_p ())
1804 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1805 "bad operation or unsupported loop bound.\n");
1806 return false;
1809 /* Decide whether we need to create an epilogue loop to handle
1810 remaining scalar iterations. */
1811 th = ((LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo) + 1)
1812 / LOOP_VINFO_VECT_FACTOR (loop_vinfo))
1813 * LOOP_VINFO_VECT_FACTOR (loop_vinfo);
1815 if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
1816 && LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) > 0)
1818 if (ctz_hwi (LOOP_VINFO_INT_NITERS (loop_vinfo)
1819 - LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo))
1820 < exact_log2 (LOOP_VINFO_VECT_FACTOR (loop_vinfo)))
1821 LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo) = true;
1823 else if (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo)
1824 || (tree_ctz (LOOP_VINFO_NITERS (loop_vinfo))
1825 < (unsigned)exact_log2 (LOOP_VINFO_VECT_FACTOR (loop_vinfo))
1826 /* In case of versioning, check if the maximum number of
1827 iterations is greater than th. If they are identical,
1828 the epilogue is unnecessary. */
1829 && ((!LOOP_REQUIRES_VERSIONING_FOR_ALIAS (loop_vinfo)
1830 && !LOOP_REQUIRES_VERSIONING_FOR_ALIGNMENT (loop_vinfo))
1831 || (unsigned HOST_WIDE_INT)max_stmt_executions_int
1832 (LOOP_VINFO_LOOP (loop_vinfo)) > th)))
1833 LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo) = true;
1835 /* If an epilogue loop is required make sure we can create one. */
1836 if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo)
1837 || LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo))
1839 if (dump_enabled_p ())
1840 dump_printf_loc (MSG_NOTE, vect_location, "epilog loop required\n");
1841 if (!vect_can_advance_ivs_p (loop_vinfo)
1842 || !slpeel_can_duplicate_loop_p (LOOP_VINFO_LOOP (loop_vinfo),
1843 single_exit (LOOP_VINFO_LOOP
1844 (loop_vinfo))))
1846 if (dump_enabled_p ())
1847 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1848 "not vectorized: can't create required "
1849 "epilog loop\n");
1850 return false;
1854 return true;
1857 /* Function vect_analyze_loop.
1859 Apply a set of analyses on LOOP, and create a loop_vec_info struct
1860 for it. The different analyses will record information in the
1861 loop_vec_info struct. */
1862 loop_vec_info
1863 vect_analyze_loop (struct loop *loop)
1865 loop_vec_info loop_vinfo;
1866 unsigned int vector_sizes;
1868 /* Autodetect first vector size we try. */
1869 current_vector_size = 0;
1870 vector_sizes = targetm.vectorize.autovectorize_vector_sizes ();
1872 if (dump_enabled_p ())
1873 dump_printf_loc (MSG_NOTE, vect_location,
1874 "===== analyze_loop_nest =====\n");
1876 if (loop_outer (loop)
1877 && loop_vec_info_for_loop (loop_outer (loop))
1878 && LOOP_VINFO_VECTORIZABLE_P (loop_vec_info_for_loop (loop_outer (loop))))
1880 if (dump_enabled_p ())
1881 dump_printf_loc (MSG_NOTE, vect_location,
1882 "outer-loop already vectorized.\n");
1883 return NULL;
1886 while (1)
1888 /* Check the CFG characteristics of the loop (nesting, entry/exit). */
1889 loop_vinfo = vect_analyze_loop_form (loop);
1890 if (!loop_vinfo)
1892 if (dump_enabled_p ())
1893 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1894 "bad loop form.\n");
1895 return NULL;
1898 if (vect_analyze_loop_2 (loop_vinfo))
1900 LOOP_VINFO_VECTORIZABLE_P (loop_vinfo) = 1;
1902 return loop_vinfo;
1905 destroy_loop_vec_info (loop_vinfo, true);
1907 vector_sizes &= ~current_vector_size;
1908 if (vector_sizes == 0
1909 || current_vector_size == 0)
1910 return NULL;
1912 /* Try the next biggest vector size. */
1913 current_vector_size = 1 << floor_log2 (vector_sizes);
1914 if (dump_enabled_p ())
1915 dump_printf_loc (MSG_NOTE, vect_location,
1916 "***** Re-trying analysis with "
1917 "vector size %d\n", current_vector_size);
1922 /* Function reduction_code_for_scalar_code
1924 Input:
1925 CODE - tree_code of a reduction operation.
1927 Output:
1928 REDUC_CODE - the corresponding tree-code to be used to reduce the
1929 vector of partial results into a single scalar result, or ERROR_MARK
1930 if the operation is a supported reduction operation, but does not have
1931 such a tree-code.
1933 Return FALSE if CODE currently cannot be vectorized as reduction. */
1935 static bool
1936 reduction_code_for_scalar_code (enum tree_code code,
1937 enum tree_code *reduc_code)
1939 switch (code)
1941 case MAX_EXPR:
1942 *reduc_code = REDUC_MAX_EXPR;
1943 return true;
1945 case MIN_EXPR:
1946 *reduc_code = REDUC_MIN_EXPR;
1947 return true;
1949 case PLUS_EXPR:
1950 *reduc_code = REDUC_PLUS_EXPR;
1951 return true;
1953 case MULT_EXPR:
1954 case MINUS_EXPR:
1955 case BIT_IOR_EXPR:
1956 case BIT_XOR_EXPR:
1957 case BIT_AND_EXPR:
1958 *reduc_code = ERROR_MARK;
1959 return true;
1961 default:
1962 return false;
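/* A sketch of how this mapping is used (illustrative source only):

s = INT_MIN;
for (i = 0; i < n; i++)
s = s > a[i] ? s : a[i];

The loop body is vectorized with MAX_EXPR on a vector accumulator of
partial maxima; the corresponding REDUC_MAX_EXPR is then emitted once in
the epilogue to collapse that vector into the final scalar.  For the
codes mapped to ERROR_MARK above the epilogue is instead generated with
whole-vector shifts or element extracts (see vect_model_reduction_cost
below).  */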
1967 /* Error reporting helper for vect_is_simple_reduction below. GIMPLE statement
1968 STMT is printed with a message MSG. */
1970 static void
1971 report_vect_op (int msg_type, gimple stmt, const char *msg)
1973 dump_printf_loc (msg_type, vect_location, "%s", msg);
1974 dump_gimple_stmt (msg_type, TDF_SLIM, stmt, 0);
1975 dump_printf (msg_type, "\n");
1979 /* Detect SLP reduction of the form:
1981 #a1 = phi <a5, a0>
1982 a2 = operation (a1)
1983 a3 = operation (a2)
1984 a4 = operation (a3)
1985 a5 = operation (a4)
1987 #a = phi <a5>
1989 PHI is the reduction phi node (#a1 = phi <a5, a0> above)
1990 FIRST_STMT is the first reduction stmt in the chain
1991 (a2 = operation (a1)).
1993 Return TRUE if a reduction chain was detected. */
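/* Such a chain typically comes from a manually unrolled sum
(hypothetical source):

for (i = 0; i < n; i += 4)
{
s += a[i];
s += a[i + 1];
s += a[i + 2];
s += a[i + 3];
}

The four additions form the statements a2..a5 above, all with the same
rhs code, and the loop-header phi for s closes the cycle.  */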
1995 static bool
1996 vect_is_slp_reduction (loop_vec_info loop_info, gimple phi, gimple first_stmt)
1998 struct loop *loop = (gimple_bb (phi))->loop_father;
1999 struct loop *vect_loop = LOOP_VINFO_LOOP (loop_info);
2000 enum tree_code code;
2001 gimple current_stmt = NULL, loop_use_stmt = NULL, first, next_stmt;
2002 stmt_vec_info use_stmt_info, current_stmt_info;
2003 tree lhs;
2004 imm_use_iterator imm_iter;
2005 use_operand_p use_p;
2006 int nloop_uses, size = 0, n_out_of_loop_uses;
2007 bool found = false;
2009 if (loop != vect_loop)
2010 return false;
2012 lhs = PHI_RESULT (phi);
2013 code = gimple_assign_rhs_code (first_stmt);
2014 while (1)
2016 nloop_uses = 0;
2017 n_out_of_loop_uses = 0;
2018 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, lhs)
2020 gimple use_stmt = USE_STMT (use_p);
2021 if (is_gimple_debug (use_stmt))
2022 continue;
2024 /* Check if we got back to the reduction phi. */
2025 if (use_stmt == phi)
2027 loop_use_stmt = use_stmt;
2028 found = true;
2029 break;
2032 if (flow_bb_inside_loop_p (loop, gimple_bb (use_stmt)))
2034 if (vinfo_for_stmt (use_stmt)
2035 && !STMT_VINFO_IN_PATTERN_P (vinfo_for_stmt (use_stmt)))
2037 loop_use_stmt = use_stmt;
2038 nloop_uses++;
2041 else
2042 n_out_of_loop_uses++;
2044 /* There can be either a single use in the loop or two uses in
2045 phi nodes. */
2046 if (nloop_uses > 1 || (n_out_of_loop_uses && nloop_uses))
2047 return false;
2050 if (found)
2051 break;
2053 /* We reached a statement with no loop uses. */
2054 if (nloop_uses == 0)
2055 return false;
2057 /* This is a loop exit phi, and we haven't reached the reduction phi. */
2058 if (gimple_code (loop_use_stmt) == GIMPLE_PHI)
2059 return false;
2061 if (!is_gimple_assign (loop_use_stmt)
2062 || code != gimple_assign_rhs_code (loop_use_stmt)
2063 || !flow_bb_inside_loop_p (loop, gimple_bb (loop_use_stmt)))
2064 return false;
2066 /* Insert USE_STMT into reduction chain. */
2067 use_stmt_info = vinfo_for_stmt (loop_use_stmt);
2068 if (current_stmt)
2070 current_stmt_info = vinfo_for_stmt (current_stmt);
2071 GROUP_NEXT_ELEMENT (current_stmt_info) = loop_use_stmt;
2072 GROUP_FIRST_ELEMENT (use_stmt_info)
2073 = GROUP_FIRST_ELEMENT (current_stmt_info);
2075 else
2076 GROUP_FIRST_ELEMENT (use_stmt_info) = loop_use_stmt;
2078 lhs = gimple_assign_lhs (loop_use_stmt);
2079 current_stmt = loop_use_stmt;
2080 size++;
2083 if (!found || loop_use_stmt != phi || size < 2)
2084 return false;
2086 /* Swap the operands, if needed, to make the reduction operand be the second
2087 operand. */
2088 lhs = PHI_RESULT (phi);
2089 next_stmt = GROUP_FIRST_ELEMENT (vinfo_for_stmt (current_stmt));
2090 while (next_stmt)
2092 if (gimple_assign_rhs2 (next_stmt) == lhs)
2094 tree op = gimple_assign_rhs1 (next_stmt);
2095 gimple def_stmt = NULL;
2097 if (TREE_CODE (op) == SSA_NAME)
2098 def_stmt = SSA_NAME_DEF_STMT (op);
2100 /* Check that the other def is either defined in the loop
2101 ("vect_internal_def"), or it's an induction (defined by a
2102 loop-header phi-node). */
2103 if (def_stmt
2104 && gimple_bb (def_stmt)
2105 && flow_bb_inside_loop_p (loop, gimple_bb (def_stmt))
2106 && (is_gimple_assign (def_stmt)
2107 || is_gimple_call (def_stmt)
2108 || STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def_stmt))
2109 == vect_induction_def
2110 || (gimple_code (def_stmt) == GIMPLE_PHI
2111 && STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def_stmt))
2112 == vect_internal_def
2113 && !is_loop_header_bb_p (gimple_bb (def_stmt)))))
2115 lhs = gimple_assign_lhs (next_stmt);
2116 next_stmt = GROUP_NEXT_ELEMENT (vinfo_for_stmt (next_stmt));
2117 continue;
2120 return false;
2122 else
2124 tree op = gimple_assign_rhs2 (next_stmt);
2125 gimple def_stmt = NULL;
2127 if (TREE_CODE (op) == SSA_NAME)
2128 def_stmt = SSA_NAME_DEF_STMT (op);
2130 /* Check that the other def is either defined in the loop
2131 ("vect_internal_def"), or it's an induction (defined by a
2132 loop-header phi-node). */
2133 if (def_stmt
2134 && gimple_bb (def_stmt)
2135 && flow_bb_inside_loop_p (loop, gimple_bb (def_stmt))
2136 && (is_gimple_assign (def_stmt)
2137 || is_gimple_call (def_stmt)
2138 || STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def_stmt))
2139 == vect_induction_def
2140 || (gimple_code (def_stmt) == GIMPLE_PHI
2141 && STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def_stmt))
2142 == vect_internal_def
2143 && !is_loop_header_bb_p (gimple_bb (def_stmt)))))
2145 if (dump_enabled_p ())
2147 dump_printf_loc (MSG_NOTE, vect_location, "swapping oprnds: ");
2148 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, next_stmt, 0);
2149 dump_printf (MSG_NOTE, "\n");
2152 swap_ssa_operands (next_stmt,
2153 gimple_assign_rhs1_ptr (next_stmt),
2154 gimple_assign_rhs2_ptr (next_stmt));
2155 update_stmt (next_stmt);
2157 if (CONSTANT_CLASS_P (gimple_assign_rhs1 (next_stmt)))
2158 LOOP_VINFO_OPERANDS_SWAPPED (loop_info) = true;
2160 else
2161 return false;
2164 lhs = gimple_assign_lhs (next_stmt);
2165 next_stmt = GROUP_NEXT_ELEMENT (vinfo_for_stmt (next_stmt));
2168 /* Save the chain for further analysis in SLP detection. */
2169 first = GROUP_FIRST_ELEMENT (vinfo_for_stmt (current_stmt));
2170 LOOP_VINFO_REDUCTION_CHAINS (loop_info).safe_push (first);
2171 GROUP_SIZE (vinfo_for_stmt (first)) = size;
2173 return true;
2177 /* Function vect_is_simple_reduction_1
2179 (1) Detect a cross-iteration def-use cycle that represents a simple
2180 reduction computation. We look for the following pattern:
2182 loop_header:
2183 a1 = phi < a0, a2 >
2184 a3 = ...
2185 a2 = operation (a3, a1)
2187 or
2189 a3 = ...
2190 loop_header:
2191 a1 = phi < a0, a2 >
2192 a2 = operation (a3, a1)
2194 such that:
2195 1. operation is commutative and associative and it is safe to
2196 change the order of the computation (if CHECK_REDUCTION is true)
2197 2. no uses for a2 in the loop (a2 is used out of the loop)
2198 3. no uses of a1 in the loop besides the reduction operation
2199 4. no uses of a1 outside the loop.
2201 Conditions 1,4 are tested here.
2202 Conditions 2,3 are tested in vect_mark_stmts_to_be_vectorized.
2204 (2) Detect a cross-iteration def-use cycle in nested loops, i.e.,
2205 nested cycles, if CHECK_REDUCTION is false.
2207 (3) Detect cycles of phi nodes in outer-loop vectorization, i.e., double
2208 reductions:
2210 a1 = phi < a0, a2 >
2211 inner loop (def of a3)
2212 a2 = phi < a3 >
2214 If MODIFY is true it tries also to rework the code in-place to enable
2215 detection of more reduction patterns. For the time being we rewrite
2216 "res -= RHS" into "rhs += -RHS" when it seems worthwhile.
2219 static gimple
2220 vect_is_simple_reduction_1 (loop_vec_info loop_info, gimple phi,
2221 bool check_reduction, bool *double_reduc,
2222 bool modify)
2224 struct loop *loop = (gimple_bb (phi))->loop_father;
2225 struct loop *vect_loop = LOOP_VINFO_LOOP (loop_info);
2226 edge latch_e = loop_latch_edge (loop);
2227 tree loop_arg = PHI_ARG_DEF_FROM_EDGE (phi, latch_e);
2228 gimple def_stmt, def1 = NULL, def2 = NULL;
2229 enum tree_code orig_code, code;
2230 tree op1, op2, op3 = NULL_TREE, op4 = NULL_TREE;
2231 tree type;
2232 int nloop_uses;
2233 tree name;
2234 imm_use_iterator imm_iter;
2235 use_operand_p use_p;
2236 bool phi_def;
2238 *double_reduc = false;
2240 /* If CHECK_REDUCTION is true, we assume inner-most loop vectorization,
2241 otherwise, we assume outer loop vectorization. */
2242 gcc_assert ((check_reduction && loop == vect_loop)
2243 || (!check_reduction && flow_loop_nested_p (vect_loop, loop)));
2245 name = PHI_RESULT (phi);
2246 /* ??? If there are no uses of the PHI result the inner loop reduction
2247 won't be detected as possibly double-reduction by vectorizable_reduction
2248 because that tries to walk the PHI arg from the preheader edge which
2249 can be constant. See PR60382. */
2250 if (has_zero_uses (name))
2251 return NULL;
2252 nloop_uses = 0;
2253 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, name)
2255 gimple use_stmt = USE_STMT (use_p);
2256 if (is_gimple_debug (use_stmt))
2257 continue;
2259 if (!flow_bb_inside_loop_p (loop, gimple_bb (use_stmt)))
2261 if (dump_enabled_p ())
2262 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2263 "intermediate value used outside loop.\n");
2265 return NULL;
2268 if (vinfo_for_stmt (use_stmt)
2269 && !is_pattern_stmt_p (vinfo_for_stmt (use_stmt)))
2270 nloop_uses++;
2271 if (nloop_uses > 1)
2273 if (dump_enabled_p ())
2274 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2275 "reduction used in loop.\n");
2276 return NULL;
2280 if (TREE_CODE (loop_arg) != SSA_NAME)
2282 if (dump_enabled_p ())
2284 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2285 "reduction: not ssa_name: ");
2286 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, loop_arg);
2287 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
2289 return NULL;
2292 def_stmt = SSA_NAME_DEF_STMT (loop_arg);
2293 if (!def_stmt)
2295 if (dump_enabled_p ())
2296 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2297 "reduction: no def_stmt.\n");
2298 return NULL;
2301 if (!is_gimple_assign (def_stmt) && gimple_code (def_stmt) != GIMPLE_PHI)
2303 if (dump_enabled_p ())
2305 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, def_stmt, 0);
2306 dump_printf (MSG_NOTE, "\n");
2308 return NULL;
2311 if (is_gimple_assign (def_stmt))
2313 name = gimple_assign_lhs (def_stmt);
2314 phi_def = false;
2316 else
2318 name = PHI_RESULT (def_stmt);
2319 phi_def = true;
2322 nloop_uses = 0;
2323 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, name)
2325 gimple use_stmt = USE_STMT (use_p);
2326 if (is_gimple_debug (use_stmt))
2327 continue;
2328 if (flow_bb_inside_loop_p (loop, gimple_bb (use_stmt))
2329 && vinfo_for_stmt (use_stmt)
2330 && !is_pattern_stmt_p (vinfo_for_stmt (use_stmt)))
2331 nloop_uses++;
2332 if (nloop_uses > 1)
2334 if (dump_enabled_p ())
2335 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2336 "reduction used in loop.\n");
2337 return NULL;
2341 /* If DEF_STMT is a phi node itself, we expect it to have a single argument
2342 defined in the inner loop. */
2343 if (phi_def)
2345 op1 = PHI_ARG_DEF (def_stmt, 0);
2347 if (gimple_phi_num_args (def_stmt) != 1
2348 || TREE_CODE (op1) != SSA_NAME)
2350 if (dump_enabled_p ())
2351 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2352 "unsupported phi node definition.\n");
2354 return NULL;
2357 def1 = SSA_NAME_DEF_STMT (op1);
2358 if (gimple_bb (def1)
2359 && flow_bb_inside_loop_p (loop, gimple_bb (def_stmt))
2360 && loop->inner
2361 && flow_bb_inside_loop_p (loop->inner, gimple_bb (def1))
2362 && is_gimple_assign (def1))
2364 if (dump_enabled_p ())
2365 report_vect_op (MSG_NOTE, def_stmt,
2366 "detected double reduction: ");
2368 *double_reduc = true;
2369 return def_stmt;
2372 return NULL;
2375 code = orig_code = gimple_assign_rhs_code (def_stmt);
2377 /* We can handle "res -= x[i]", which is non-associative, by
2378 simply rewriting this into "res += -x[i]".  Avoid changing the
2379 gimple instruction during the first simple tests and only do this
2380 if we're allowed to change the code at all. */
2381 if (code == MINUS_EXPR
2382 && modify
2383 && (op1 = gimple_assign_rhs1 (def_stmt))
2384 && TREE_CODE (op1) == SSA_NAME
2385 && SSA_NAME_DEF_STMT (op1) == phi)
2386 code = PLUS_EXPR;
2388 if (check_reduction
2389 && (!commutative_tree_code (code) || !associative_tree_code (code)))
2391 if (dump_enabled_p ())
2392 report_vect_op (MSG_MISSED_OPTIMIZATION, def_stmt,
2393 "reduction: not commutative/associative: ");
2394 return NULL;
2397 if (get_gimple_rhs_class (code) != GIMPLE_BINARY_RHS)
2399 if (code != COND_EXPR)
2401 if (dump_enabled_p ())
2402 report_vect_op (MSG_MISSED_OPTIMIZATION, def_stmt,
2403 "reduction: not binary operation: ");
2405 return NULL;
2408 op3 = gimple_assign_rhs1 (def_stmt);
2409 if (COMPARISON_CLASS_P (op3))
2411 op4 = TREE_OPERAND (op3, 1);
2412 op3 = TREE_OPERAND (op3, 0);
2415 op1 = gimple_assign_rhs2 (def_stmt);
2416 op2 = gimple_assign_rhs3 (def_stmt);
2418 if (TREE_CODE (op1) != SSA_NAME && TREE_CODE (op2) != SSA_NAME)
2420 if (dump_enabled_p ())
2421 report_vect_op (MSG_MISSED_OPTIMIZATION, def_stmt,
2422 "reduction: uses not ssa_names: ");
2424 return NULL;
2427 else
2429 op1 = gimple_assign_rhs1 (def_stmt);
2430 op2 = gimple_assign_rhs2 (def_stmt);
2432 if (TREE_CODE (op1) != SSA_NAME && TREE_CODE (op2) != SSA_NAME)
2434 if (dump_enabled_p ())
2435 report_vect_op (MSG_MISSED_OPTIMIZATION, def_stmt,
2436 "reduction: uses not ssa_names: ");
2438 return NULL;
2442 type = TREE_TYPE (gimple_assign_lhs (def_stmt));
2443 if ((TREE_CODE (op1) == SSA_NAME
2444 && !types_compatible_p (type,TREE_TYPE (op1)))
2445 || (TREE_CODE (op2) == SSA_NAME
2446 && !types_compatible_p (type, TREE_TYPE (op2)))
2447 || (op3 && TREE_CODE (op3) == SSA_NAME
2448 && !types_compatible_p (type, TREE_TYPE (op3)))
2449 || (op4 && TREE_CODE (op4) == SSA_NAME
2450 && !types_compatible_p (type, TREE_TYPE (op4))))
2452 if (dump_enabled_p ())
2454 dump_printf_loc (MSG_NOTE, vect_location,
2455 "reduction: multiple types: operation type: ");
2456 dump_generic_expr (MSG_NOTE, TDF_SLIM, type);
2457 dump_printf (MSG_NOTE, ", operands types: ");
2458 dump_generic_expr (MSG_NOTE, TDF_SLIM,
2459 TREE_TYPE (op1));
2460 dump_printf (MSG_NOTE, ",");
2461 dump_generic_expr (MSG_NOTE, TDF_SLIM,
2462 TREE_TYPE (op2));
2463 if (op3)
2465 dump_printf (MSG_NOTE, ",");
2466 dump_generic_expr (MSG_NOTE, TDF_SLIM,
2467 TREE_TYPE (op3));
2470 if (op4)
2472 dump_printf (MSG_NOTE, ",");
2473 dump_generic_expr (MSG_NOTE, TDF_SLIM,
2474 TREE_TYPE (op4));
2476 dump_printf (MSG_NOTE, "\n");
2479 return NULL;
2482 /* Check that it's ok to change the order of the computation.
2483 Generally, when vectorizing a reduction we change the order of the
2484 computation. This may change the behavior of the program in some
2485 cases, so we need to check that this is ok. One exception is when
2486 vectorizing an outer-loop: the inner-loop is executed sequentially,
2487 and therefore vectorizing reductions in the inner-loop during
2488 outer-loop vectorization is safe. */
2490 /* CHECKME: check for !flag_finite_math_only too? */
2491 if (SCALAR_FLOAT_TYPE_P (type) && !flag_associative_math
2492 && check_reduction)
2494 /* Changing the order of operations changes the semantics. */
2495 if (dump_enabled_p ())
2496 report_vect_op (MSG_MISSED_OPTIMIZATION, def_stmt,
2497 "reduction: unsafe fp math optimization: ");
2498 return NULL;
2500 else if (INTEGRAL_TYPE_P (type) && TYPE_OVERFLOW_TRAPS (type)
2501 && check_reduction)
2503 /* Changing the order of operations changes the semantics. */
2504 if (dump_enabled_p ())
2505 report_vect_op (MSG_MISSED_OPTIMIZATION, def_stmt,
2506 "reduction: unsafe int math optimization: ");
2507 return NULL;
2509 else if (SAT_FIXED_POINT_TYPE_P (type) && check_reduction)
2511 /* Changing the order of operations changes the semantics. */
2512 if (dump_enabled_p ())
2513 report_vect_op (MSG_MISSED_OPTIMIZATION, def_stmt,
2514 "reduction: unsafe fixed-point math optimization: ");
2515 return NULL;
2518 /* If we detected "res -= x[i]" earlier, rewrite it into
2519 "res += -x[i]" now. If this turns out to be useless reassoc
2520 will clean it up again. */
2521 if (orig_code == MINUS_EXPR)
2523 tree rhs = gimple_assign_rhs2 (def_stmt);
2524 tree negrhs = make_ssa_name (TREE_TYPE (rhs));
2525 gimple negate_stmt = gimple_build_assign (negrhs, NEGATE_EXPR, rhs);
2526 gimple_stmt_iterator gsi = gsi_for_stmt (def_stmt);
2527 set_vinfo_for_stmt (negate_stmt, new_stmt_vec_info (negate_stmt,
2528 loop_info, NULL));
2529 gsi_insert_before (&gsi, negate_stmt, GSI_NEW_STMT);
2530 gimple_assign_set_rhs2 (def_stmt, negrhs);
2531 gimple_assign_set_rhs_code (def_stmt, PLUS_EXPR);
2532 update_stmt (def_stmt);
2535 /* Reduction is safe. We're dealing with one of the following:
2536 1) integer arithmetic and no trapv
2537 2) floating point arithmetic, and special flags permit this optimization
2538 3) nested cycle (i.e., outer loop vectorization). */
2539 if (TREE_CODE (op1) == SSA_NAME)
2540 def1 = SSA_NAME_DEF_STMT (op1);
2542 if (TREE_CODE (op2) == SSA_NAME)
2543 def2 = SSA_NAME_DEF_STMT (op2);
2545 if (code != COND_EXPR
2546 && ((!def1 || gimple_nop_p (def1)) && (!def2 || gimple_nop_p (def2))))
2548 if (dump_enabled_p ())
2549 report_vect_op (MSG_NOTE, def_stmt, "reduction: no defs for operands: ");
2550 return NULL;
2553 /* Check that one def is the reduction def, defined by PHI,
2554 the other def is either defined in the loop ("vect_internal_def"),
2555 or it's an induction (defined by a loop-header phi-node). */
2557 if (def2 && def2 == phi
2558 && (code == COND_EXPR
2559 || !def1 || gimple_nop_p (def1)
2560 || !flow_bb_inside_loop_p (loop, gimple_bb (def1))
2561 || (def1 && flow_bb_inside_loop_p (loop, gimple_bb (def1))
2562 && (is_gimple_assign (def1)
2563 || is_gimple_call (def1)
2564 || STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def1))
2565 == vect_induction_def
2566 || (gimple_code (def1) == GIMPLE_PHI
2567 && STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def1))
2568 == vect_internal_def
2569 && !is_loop_header_bb_p (gimple_bb (def1)))))))
2571 if (dump_enabled_p ())
2572 report_vect_op (MSG_NOTE, def_stmt, "detected reduction: ");
2573 return def_stmt;
2576 if (def1 && def1 == phi
2577 && (code == COND_EXPR
2578 || !def2 || gimple_nop_p (def2)
2579 || !flow_bb_inside_loop_p (loop, gimple_bb (def2))
2580 || (def2 && flow_bb_inside_loop_p (loop, gimple_bb (def2))
2581 && (is_gimple_assign (def2)
2582 || is_gimple_call (def2)
2583 || STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def2))
2584 == vect_induction_def
2585 || (gimple_code (def2) == GIMPLE_PHI
2586 && STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def2))
2587 == vect_internal_def
2588 && !is_loop_header_bb_p (gimple_bb (def2)))))))
2590 if (check_reduction)
2592 /* Swap operands (just for simplicity - so that the rest of the code
2593 can assume that the reduction variable is always the last (second)
2594 argument). */
2595 if (dump_enabled_p ())
2596 report_vect_op (MSG_NOTE, def_stmt,
2597 "detected reduction: need to swap operands: ");
2599 swap_ssa_operands (def_stmt, gimple_assign_rhs1_ptr (def_stmt),
2600 gimple_assign_rhs2_ptr (def_stmt));
2602 if (CONSTANT_CLASS_P (gimple_assign_rhs1 (def_stmt)))
2603 LOOP_VINFO_OPERANDS_SWAPPED (loop_info) = true;
2605 else
2607 if (dump_enabled_p ())
2608 report_vect_op (MSG_NOTE, def_stmt, "detected reduction: ");
2611 return def_stmt;
2614 /* Try to find SLP reduction chain. */
2615 if (check_reduction && vect_is_slp_reduction (loop_info, phi, def_stmt))
2617 if (dump_enabled_p ())
2618 report_vect_op (MSG_NOTE, def_stmt,
2619 "reduction: detected reduction chain: ");
2621 return def_stmt;
2624 if (dump_enabled_p ())
2625 report_vect_op (MSG_MISSED_OPTIMIZATION, def_stmt,
2626 "reduction: unknown pattern: ");
2628 return NULL;
2631 /* Wrapper around vect_is_simple_reduction_1, that won't modify code
2632 in-place. Arguments as there. */
2634 static gimple
2635 vect_is_simple_reduction (loop_vec_info loop_info, gimple phi,
2636 bool check_reduction, bool *double_reduc)
2638 return vect_is_simple_reduction_1 (loop_info, phi, check_reduction,
2639 double_reduc, false);
2642 /* Wrapper around vect_is_simple_reduction_1, which will modify code
2643 in-place if it enables detection of more reductions. Arguments
2644 as there. */
2646 gimple
2647 vect_force_simple_reduction (loop_vec_info loop_info, gimple phi,
2648 bool check_reduction, bool *double_reduc)
2650 return vect_is_simple_reduction_1 (loop_info, phi, check_reduction,
2651 double_reduc, true);
2654 /* Calculate the cost of one scalar iteration of the loop. */
2655 int
2656 vect_get_single_scalar_iteration_cost (loop_vec_info loop_vinfo)
2658 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
2659 basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
2660 int nbbs = loop->num_nodes, factor, scalar_single_iter_cost = 0;
2661 int innerloop_iters, i, stmt_cost;
2663 /* Count statements in scalar loop. Using this as scalar cost for a single
2664 iteration for now.
2666 TODO: Add outer loop support.
2668 TODO: Consider assigning different costs to different scalar
2669 statements. */
2671 /* FORNOW. */
2672 innerloop_iters = 1;
2673 if (loop->inner)
2674 innerloop_iters = 50; /* FIXME */
2676 for (i = 0; i < nbbs; i++)
2678 gimple_stmt_iterator si;
2679 basic_block bb = bbs[i];
2681 if (bb->loop_father == loop->inner)
2682 factor = innerloop_iters;
2683 else
2684 factor = 1;
2686 for (si = gsi_start_bb (bb); !gsi_end_p (si); gsi_next (&si))
2688 gimple stmt = gsi_stmt (si);
2689 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
2691 if (!is_gimple_assign (stmt) && !is_gimple_call (stmt))
2692 continue;
2694 /* Skip stmts that are not vectorized inside the loop. */
2695 if (stmt_info
2696 && !STMT_VINFO_RELEVANT_P (stmt_info)
2697 && (!STMT_VINFO_LIVE_P (stmt_info)
2698 || !VECTORIZABLE_CYCLE_DEF (STMT_VINFO_DEF_TYPE (stmt_info)))
2699 && !STMT_VINFO_IN_PATTERN_P (stmt_info))
2700 continue;
2702 if (STMT_VINFO_DATA_REF (vinfo_for_stmt (stmt)))
2704 if (DR_IS_READ (STMT_VINFO_DATA_REF (vinfo_for_stmt (stmt))))
2705 stmt_cost = vect_get_stmt_cost (scalar_load);
2706 else
2707 stmt_cost = vect_get_stmt_cost (scalar_store);
2709 else
2710 stmt_cost = vect_get_stmt_cost (scalar_stmt);
2712 scalar_single_iter_cost += stmt_cost * factor;
2715 return scalar_single_iter_cost;
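/* As a rough illustration (the per-kind costs are target-specific): a
body computing a[i] = b[i] + c[i] contributes two scalar_load, one
scalar_stmt and one scalar_store per iteration, i.e. a single-iteration
cost of 4 with the default cost of 1 for each kind.  */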
2718 /* Calculate cost of peeling the loop PEEL_ITERS_PROLOGUE times. */
2719 int
2720 vect_get_known_peeling_cost (loop_vec_info loop_vinfo, int peel_iters_prologue,
2721 int *peel_iters_epilogue,
2722 int scalar_single_iter_cost,
2723 stmt_vector_for_cost *prologue_cost_vec,
2724 stmt_vector_for_cost *epilogue_cost_vec)
2726 int retval = 0;
2727 int vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
2729 if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
2731 *peel_iters_epilogue = vf/2;
2732 if (dump_enabled_p ())
2733 dump_printf_loc (MSG_NOTE, vect_location,
2734 "cost model: epilogue peel iters set to vf/2 "
2735 "because loop iterations are unknown .\n");
2737 /* If peeled iterations are known but number of scalar loop
2738 iterations are unknown, count a taken branch per peeled loop. */
2739 retval = record_stmt_cost (prologue_cost_vec, 2, cond_branch_taken,
2740 NULL, 0, vect_prologue);
2742 else
2744 int niters = LOOP_VINFO_INT_NITERS (loop_vinfo);
2745 peel_iters_prologue = niters < peel_iters_prologue ?
2746 niters : peel_iters_prologue;
2747 *peel_iters_epilogue = (niters - peel_iters_prologue) % vf;
2748 /* If we need to peel for gaps, but no peeling is required, we have to
2749 peel VF iterations. */
2750 if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo) && !*peel_iters_epilogue)
2751 *peel_iters_epilogue = vf;
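/* Illustrative numbers: with vf = 8, niters = 100 and
peel_iters_prologue = 3 the epilogue gets (100 - 3) % 8 = 1 iteration;
if peeling for gaps is required and that remainder happens to be 0, a
full vf = 8 epilogue iterations are assumed instead.  */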
2754 if (peel_iters_prologue)
2755 retval += record_stmt_cost (prologue_cost_vec,
2756 peel_iters_prologue * scalar_single_iter_cost,
2757 scalar_stmt, NULL, 0, vect_prologue);
2758 if (*peel_iters_epilogue)
2759 retval += record_stmt_cost (epilogue_cost_vec,
2760 *peel_iters_epilogue * scalar_single_iter_cost,
2761 scalar_stmt, NULL, 0, vect_epilogue);
2762 return retval;
2765 /* Function vect_estimate_min_profitable_iters
2767 Return the number of iterations required for the vector version of the
2768 loop to be profitable relative to the cost of the scalar version of the
2769 loop. */
2771 static void
2772 vect_estimate_min_profitable_iters (loop_vec_info loop_vinfo,
2773 int *ret_min_profitable_niters,
2774 int *ret_min_profitable_estimate)
2776 int min_profitable_iters;
2777 int min_profitable_estimate;
2778 int peel_iters_prologue;
2779 int peel_iters_epilogue;
2780 unsigned vec_inside_cost = 0;
2781 int vec_outside_cost = 0;
2782 unsigned vec_prologue_cost = 0;
2783 unsigned vec_epilogue_cost = 0;
2784 int scalar_single_iter_cost = 0;
2785 int scalar_outside_cost = 0;
2786 int vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
2787 int npeel = LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo);
2788 void *target_cost_data = LOOP_VINFO_TARGET_COST_DATA (loop_vinfo);
2790 /* Cost model disabled. */
2791 if (unlimited_cost_model (LOOP_VINFO_LOOP (loop_vinfo)))
2793 dump_printf_loc (MSG_NOTE, vect_location, "cost model disabled.\n");
2794 *ret_min_profitable_niters = 0;
2795 *ret_min_profitable_estimate = 0;
2796 return;
2799 /* Requires loop versioning tests to handle misalignment. */
2800 if (LOOP_REQUIRES_VERSIONING_FOR_ALIGNMENT (loop_vinfo))
2802 /* FIXME: Make cost depend on complexity of individual check. */
2803 unsigned len = LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).length ();
2804 (void) add_stmt_cost (target_cost_data, len, vector_stmt, NULL, 0,
2805 vect_prologue);
2806 dump_printf (MSG_NOTE,
2807 "cost model: Adding cost of checks for loop "
2808 "versioning to treat misalignment.\n");
2811 /* Requires loop versioning with alias checks. */
2812 if (LOOP_REQUIRES_VERSIONING_FOR_ALIAS (loop_vinfo))
2814 /* FIXME: Make cost depend on complexity of individual check. */
2815 unsigned len = LOOP_VINFO_MAY_ALIAS_DDRS (loop_vinfo).length ();
2816 (void) add_stmt_cost (target_cost_data, len, vector_stmt, NULL, 0,
2817 vect_prologue);
2818 dump_printf (MSG_NOTE,
2819 "cost model: Adding cost of checks for loop "
2820 "versioning aliasing.\n");
2823 if (LOOP_REQUIRES_VERSIONING_FOR_ALIGNMENT (loop_vinfo)
2824 || LOOP_REQUIRES_VERSIONING_FOR_ALIAS (loop_vinfo))
2825 (void) add_stmt_cost (target_cost_data, 1, cond_branch_taken, NULL, 0,
2826 vect_prologue);
2828 /* Count statements in scalar loop. Using this as scalar cost for a single
2829 iteration for now.
2831 TODO: Add outer loop support.
2833 TODO: Consider assigning different costs to different scalar
2834 statements. */
2836 scalar_single_iter_cost = vect_get_single_scalar_iteration_cost (loop_vinfo);
2837 /* ??? Below we use this cost as the number of stmts with scalar_stmt cost,
2838 and divide by that.  This introduces rounding errors; it would be better
2839 to introduce a new cost kind (raw_cost?  scalar_iter_cost?). */
2840 int scalar_single_iter_stmts
2841 = scalar_single_iter_cost / vect_get_stmt_cost (scalar_stmt);
2843 /* Add additional cost for the peeled instructions in prologue and epilogue
2844 loop.
2846 FORNOW: If we don't know the value of peel_iters for prologue or epilogue
2847 at compile-time - we assume it's vf/2 (the worst would be vf-1).
2849 TODO: Build an expression that represents peel_iters for prologue and
2850 epilogue to be used in a run-time test. */
2852 if (npeel < 0)
2854 peel_iters_prologue = vf/2;
2855 dump_printf (MSG_NOTE, "cost model: "
2856 "prologue peel iters set to vf/2.\n");
2858 /* If peeling for alignment is unknown, the loop bound of the main loop becomes
2859 unknown. */
2860 peel_iters_epilogue = vf/2;
2861 dump_printf (MSG_NOTE, "cost model: "
2862 "epilogue peel iters set to vf/2 because "
2863 "peeling for alignment is unknown.\n");
2865 /* If peeled iterations are unknown, count a taken branch and a not taken
2866 branch per peeled loop. Even if scalar loop iterations are known,
2867 vector iterations are not known since peeled prologue iterations are
2868 not known. Hence guards remain the same. */
2869 (void) add_stmt_cost (target_cost_data, 2, cond_branch_taken,
2870 NULL, 0, vect_prologue);
2871 (void) add_stmt_cost (target_cost_data, 2, cond_branch_not_taken,
2872 NULL, 0, vect_prologue);
2873 /* FORNOW: Don't attempt to pass individual scalar instructions to
2874 the model; just assume linear cost for scalar iterations. */
2875 (void) add_stmt_cost (target_cost_data,
2876 peel_iters_prologue * scalar_single_iter_stmts,
2877 scalar_stmt, NULL, 0, vect_prologue);
2878 (void) add_stmt_cost (target_cost_data,
2879 peel_iters_epilogue * scalar_single_iter_stmts,
2880 scalar_stmt, NULL, 0, vect_epilogue);
2882 else
2884 stmt_vector_for_cost prologue_cost_vec, epilogue_cost_vec;
2885 stmt_info_for_cost *si;
2886 int j;
2887 void *data = LOOP_VINFO_TARGET_COST_DATA (loop_vinfo);
2889 prologue_cost_vec.create (2);
2890 epilogue_cost_vec.create (2);
2891 peel_iters_prologue = npeel;
2893 (void) vect_get_known_peeling_cost (loop_vinfo, peel_iters_prologue,
2894 &peel_iters_epilogue,
2895 scalar_single_iter_stmts,
2896 &prologue_cost_vec,
2897 &epilogue_cost_vec);
2899 FOR_EACH_VEC_ELT (prologue_cost_vec, j, si)
2901 struct _stmt_vec_info *stmt_info
2902 = si->stmt ? vinfo_for_stmt (si->stmt) : NULL;
2903 (void) add_stmt_cost (data, si->count, si->kind, stmt_info,
2904 si->misalign, vect_prologue);
2907 FOR_EACH_VEC_ELT (epilogue_cost_vec, j, si)
2909 struct _stmt_vec_info *stmt_info
2910 = si->stmt ? vinfo_for_stmt (si->stmt) : NULL;
2911 (void) add_stmt_cost (data, si->count, si->kind, stmt_info,
2912 si->misalign, vect_epilogue);
2915 prologue_cost_vec.release ();
2916 epilogue_cost_vec.release ();
2919 /* FORNOW: The scalar outside cost is incremented in one of the
2920 following ways:
2922 1. The vectorizer checks for alignment and aliasing and generates
2923 a condition that allows dynamic vectorization. A cost model
2924 check is ANDED with the versioning condition. Hence scalar code
2925 path now has the added cost of the versioning check.
2927 if (cost > th & versioning_check)
2928 jmp to vector code
2930 Hence run-time scalar is incremented by not-taken branch cost.
2932 2. The vectorizer then checks if a prologue is required. If the
2933 cost model check was not done before during versioning, it has to
2934 be done before the prologue check.
2936 if (cost <= th)
2937 prologue = scalar_iters
2938 if (prologue == 0)
2939 jmp to vector code
2940 else
2941 execute prologue
2942 if (prologue == num_iters)
2943 go to exit
2945 Hence the run-time scalar cost is incremented by a taken branch,
2946 plus a not-taken branch, plus a taken branch cost.
2948 3. The vectorizer then checks if an epilogue is required. If the
2949 cost model check was not done before during prologue check, it
2950 has to be done with the epilogue check.
2952 if (prologue == 0)
2953 jmp to vector code
2954 else
2955 execute prologue
2956 if (prologue == num_iters)
2957 go to exit
2958 vector code:
2959 if ((cost <= th) | (scalar_iters-prologue-epilogue == 0))
2960 jmp to epilogue
2962 Hence the run-time scalar cost should be incremented by 2 taken
2963 branches.
2965 TODO: The back end may reorder the BBs differently and reverse
2966 conditions/branch directions. Change the estimates below to
2967 something more reasonable. */
2969 /* If the number of iterations is known and we do not do versioning, we can
2970 decide whether to vectorize at compile time. Hence the scalar version
2971 does not carry cost model guard costs. */
2972 if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
2973 || LOOP_REQUIRES_VERSIONING_FOR_ALIGNMENT (loop_vinfo)
2974 || LOOP_REQUIRES_VERSIONING_FOR_ALIAS (loop_vinfo))
2976 /* Cost model check occurs at versioning. */
2977 if (LOOP_REQUIRES_VERSIONING_FOR_ALIGNMENT (loop_vinfo)
2978 || LOOP_REQUIRES_VERSIONING_FOR_ALIAS (loop_vinfo))
2979 scalar_outside_cost += vect_get_stmt_cost (cond_branch_not_taken);
2980 else
2982 /* Cost model check occurs at prologue generation. */
2983 if (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) < 0)
2984 scalar_outside_cost += 2 * vect_get_stmt_cost (cond_branch_taken)
2985 + vect_get_stmt_cost (cond_branch_not_taken);
2986 /* Cost model check occurs at epilogue generation. */
2987 else
2988 scalar_outside_cost += 2 * vect_get_stmt_cost (cond_branch_taken);
2992 /* Complete the target-specific cost calculations. */
2993 finish_cost (LOOP_VINFO_TARGET_COST_DATA (loop_vinfo), &vec_prologue_cost,
2994 &vec_inside_cost, &vec_epilogue_cost);
2996 vec_outside_cost = (int)(vec_prologue_cost + vec_epilogue_cost);
2998 if (dump_enabled_p ())
3000 dump_printf_loc (MSG_NOTE, vect_location, "Cost model analysis: \n");
3001 dump_printf (MSG_NOTE, " Vector inside of loop cost: %d\n",
3002 vec_inside_cost);
3003 dump_printf (MSG_NOTE, " Vector prologue cost: %d\n",
3004 vec_prologue_cost);
3005 dump_printf (MSG_NOTE, " Vector epilogue cost: %d\n",
3006 vec_epilogue_cost);
3007 dump_printf (MSG_NOTE, " Scalar iteration cost: %d\n",
3008 scalar_single_iter_cost);
3009 dump_printf (MSG_NOTE, " Scalar outside cost: %d\n",
3010 scalar_outside_cost);
3011 dump_printf (MSG_NOTE, " Vector outside cost: %d\n",
3012 vec_outside_cost);
3013 dump_printf (MSG_NOTE, " prologue iterations: %d\n",
3014 peel_iters_prologue);
3015 dump_printf (MSG_NOTE, " epilogue iterations: %d\n",
3016 peel_iters_epilogue);
3019 /* Calculate number of iterations required to make the vector version
3020 profitable, relative to the loop bodies only. The following condition
3021 must hold true:
3022 SIC * niters + SOC > VIC * ((niters-PL_ITERS-EP_ITERS)/VF) + VOC
3023 where
3024 SIC = scalar iteration cost, VIC = vector iteration cost,
3025 VOC = vector outside cost, VF = vectorization factor,
3026 PL_ITERS = prologue iterations, EP_ITERS= epilogue iterations
3027 SOC = scalar outside cost for run time cost model check. */
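/* A worked example with made-up costs: SIC = 4, VIC = 6, VF = 4,
VOC = 20, SOC = 6 and no peeling gives
4 * niters + 6 > 6 * (niters / 4) + 20,
which first holds at niters = 6; after being raised to at least VF and
decremented for the "niters <= threshold" runtime test below, this
becomes the profitability threshold.  */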
3029 if ((scalar_single_iter_cost * vf) > (int) vec_inside_cost)
3031 if (vec_outside_cost <= 0)
3032 min_profitable_iters = 1;
3033 else
3035 min_profitable_iters = ((vec_outside_cost - scalar_outside_cost) * vf
3036 - vec_inside_cost * peel_iters_prologue
3037 - vec_inside_cost * peel_iters_epilogue)
3038 / ((scalar_single_iter_cost * vf)
3039 - vec_inside_cost);
3041 if ((scalar_single_iter_cost * vf * min_profitable_iters)
3042 <= (((int) vec_inside_cost * min_profitable_iters)
3043 + (((int) vec_outside_cost - scalar_outside_cost) * vf)))
3044 min_profitable_iters++;
3047 /* vector version will never be profitable. */
3048 else
3050 if (LOOP_VINFO_LOOP (loop_vinfo)->force_vectorize)
3051 warning_at (vect_location, OPT_Wopenmp_simd, "vectorization "
3052 "did not happen for a simd loop");
3054 if (dump_enabled_p ())
3055 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3056 "cost model: the vector iteration cost = %d "
3057 "divided by the scalar iteration cost = %d "
3058 "is greater or equal to the vectorization factor = %d"
3059 ".\n",
3060 vec_inside_cost, scalar_single_iter_cost, vf);
3061 *ret_min_profitable_niters = -1;
3062 *ret_min_profitable_estimate = -1;
3063 return;
3066 dump_printf (MSG_NOTE,
3067 " Calculated minimum iters for profitability: %d\n",
3068 min_profitable_iters);
3070 min_profitable_iters =
3071 min_profitable_iters < vf ? vf : min_profitable_iters;
3073 /* Because the condition we create is:
3074 if (niters <= min_profitable_iters)
3075 then skip the vectorized loop. */
3076 min_profitable_iters--;
3078 if (dump_enabled_p ())
3079 dump_printf_loc (MSG_NOTE, vect_location,
3080 " Runtime profitability threshold = %d\n",
3081 min_profitable_iters);
3083 *ret_min_profitable_niters = min_profitable_iters;
3085 /* Calculate number of iterations required to make the vector version
3086 profitable, relative to the loop bodies only.
3088 The non-vectorized variant costs SIC * niters and must win over the vector
3089 variant on the expected loop trip count.  The following condition must hold true:
3090 SIC * niters > VIC * ((niters-PL_ITERS-EP_ITERS)/VF) + VOC + SOC */
3092 if (vec_outside_cost <= 0)
3093 min_profitable_estimate = 1;
3094 else
3096 min_profitable_estimate = ((vec_outside_cost + scalar_outside_cost) * vf
3097 - vec_inside_cost * peel_iters_prologue
3098 - vec_inside_cost * peel_iters_epilogue)
3099 / ((scalar_single_iter_cost * vf)
3100 - vec_inside_cost);
3102 min_profitable_estimate --;
3103 min_profitable_estimate = MAX (min_profitable_estimate, min_profitable_iters);
3104 if (dump_enabled_p ())
3105 dump_printf_loc (MSG_NOTE, vect_location,
3106 " Static estimate profitability threshold = %d\n",
3107 min_profitable_estimate);
3109 *ret_min_profitable_estimate = min_profitable_estimate;
3112 /* Writes into SEL a mask for a vec_perm, equivalent to a vec_shr by OFFSET
3113 vector elements (not bits) for a vector of mode MODE. */
3114 static void
3115 calc_vec_perm_mask_for_shift (enum machine_mode mode, unsigned int offset,
3116 unsigned char *sel)
3118 unsigned int i, nelt = GET_MODE_NUNITS (mode);
3120 for (i = 0; i < nelt; i++)
3121 sel[i] = (i + offset) & (2*nelt - 1);
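/* For example, for a four-element mode (nelt = 4) and OFFSET = 2 this
yields the selector {2, 3, 4, 5}: the last two elements of the first
input followed by the first two of the second input, i.e. a shift by
two elements.  */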
3124 /* Checks whether the target supports whole-vector shifts for vectors of mode
3125 MODE. This is the case if _either_ the platform handles vec_shr_optab, _or_
3126 it supports vec_perm_const with masks for all necessary shift amounts. */
3127 static bool
3128 have_whole_vector_shift (enum machine_mode mode)
3130 if (optab_handler (vec_shr_optab, mode) != CODE_FOR_nothing)
3131 return true;
3133 if (direct_optab_handler (vec_perm_const_optab, mode) == CODE_FOR_nothing)
3134 return false;
3136 unsigned int i, nelt = GET_MODE_NUNITS (mode);
3137 unsigned char *sel = XALLOCAVEC (unsigned char, nelt);
3139 for (i = nelt/2; i >= 1; i/=2)
3141 calc_vec_perm_mask_for_shift (mode, i, sel);
3142 if (!can_vec_perm_p (mode, false, sel))
3143 return false;
3145 return true;
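/* E.g. for an eight-element mode the loop above checks the shifts by
4, 2 and 1 elements, which are the successive halvings used when a
reduction epilogue is implemented with whole-vector shifts.  */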
3148 /* TODO: There is a close dependency between the vect_model_*_cost and
3149 vectorizable_* functions.  Design this better to avoid maintenance issues. */
3151 /* Function vect_model_reduction_cost.
3153 Models cost for a reduction operation, including the vector ops
3154 generated within the strip-mine loop, the initial definition before
3155 the loop, and the epilogue code that must be generated. */
3157 static bool
3158 vect_model_reduction_cost (stmt_vec_info stmt_info, enum tree_code reduc_code,
3159 int ncopies)
3161 int prologue_cost = 0, epilogue_cost = 0;
3162 enum tree_code code;
3163 optab optab;
3164 tree vectype;
3165 gimple stmt, orig_stmt;
3166 tree reduction_op;
3167 machine_mode mode;
3168 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
3169 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
3170 void *target_cost_data = LOOP_VINFO_TARGET_COST_DATA (loop_vinfo);
3172 /* Cost of reduction op inside loop. */
3173 unsigned inside_cost = add_stmt_cost (target_cost_data, ncopies, vector_stmt,
3174 stmt_info, 0, vect_body);
3175 stmt = STMT_VINFO_STMT (stmt_info);
3177 switch (get_gimple_rhs_class (gimple_assign_rhs_code (stmt)))
3179 case GIMPLE_SINGLE_RHS:
3180 gcc_assert (TREE_OPERAND_LENGTH (gimple_assign_rhs1 (stmt)) == ternary_op);
3181 reduction_op = TREE_OPERAND (gimple_assign_rhs1 (stmt), 2);
3182 break;
3183 case GIMPLE_UNARY_RHS:
3184 reduction_op = gimple_assign_rhs1 (stmt);
3185 break;
3186 case GIMPLE_BINARY_RHS:
3187 reduction_op = gimple_assign_rhs2 (stmt);
3188 break;
3189 case GIMPLE_TERNARY_RHS:
3190 reduction_op = gimple_assign_rhs3 (stmt);
3191 break;
3192 default:
3193 gcc_unreachable ();
3196 vectype = get_vectype_for_scalar_type (TREE_TYPE (reduction_op));
3197 if (!vectype)
3199 if (dump_enabled_p ())
3201 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3202 "unsupported data-type ");
3203 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
3204 TREE_TYPE (reduction_op));
3205 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
3207 return false;
3210 mode = TYPE_MODE (vectype);
3211 orig_stmt = STMT_VINFO_RELATED_STMT (stmt_info);
3213 if (!orig_stmt)
3214 orig_stmt = STMT_VINFO_STMT (stmt_info);
3216 code = gimple_assign_rhs_code (orig_stmt);
3218 /* Add in cost for initial definition. */
3219 prologue_cost += add_stmt_cost (target_cost_data, 1, scalar_to_vec,
3220 stmt_info, 0, vect_prologue);
3222 /* Determine cost of epilogue code.
3224 We have a reduction operator that will reduce the vector in one statement.
3225 Also requires scalar extract. */
3227 if (!nested_in_vect_loop_p (loop, orig_stmt))
3229 if (reduc_code != ERROR_MARK)
3231 epilogue_cost += add_stmt_cost (target_cost_data, 1, vector_stmt,
3232 stmt_info, 0, vect_epilogue);
3233 epilogue_cost += add_stmt_cost (target_cost_data, 1, vec_to_scalar,
3234 stmt_info, 0, vect_epilogue);
3236 else
3238 int vec_size_in_bits = tree_to_uhwi (TYPE_SIZE (vectype));
3239 tree bitsize =
3240 TYPE_SIZE (TREE_TYPE (gimple_assign_lhs (orig_stmt)));
3241 int element_bitsize = tree_to_uhwi (bitsize);
3242 int nelements = vec_size_in_bits / element_bitsize;
3244 optab = optab_for_tree_code (code, vectype, optab_default);
3246 /* We have a whole vector shift available. */
3247 if (VECTOR_MODE_P (mode)
3248 && optab_handler (optab, mode) != CODE_FOR_nothing
3249 && have_whole_vector_shift (mode))
3251 /* Final reduction via vector shifts and the reduction operator.
3252 Also requires scalar extract. */
3253 epilogue_cost += add_stmt_cost (target_cost_data,
3254 exact_log2 (nelements) * 2,
3255 vector_stmt, stmt_info, 0,
3256 vect_epilogue);
3257 epilogue_cost += add_stmt_cost (target_cost_data, 1,
3258 vec_to_scalar, stmt_info, 0,
3259 vect_epilogue);
3261 else
3262 /* Use extracts and reduction op for final reduction. For N
3263 elements, we have N extracts and N-1 reduction ops. */
3264 epilogue_cost += add_stmt_cost (target_cost_data,
3265 nelements + nelements - 1,
3266 vector_stmt, stmt_info, 0,
3267 vect_epilogue);
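/* For example, with eight elements per vector and no reduc_code: a
whole-vector shift sequence is costed as exact_log2 (8) * 2 = 6 vector
stmts plus one vec_to_scalar extract, while the extract-based fallback
is costed as 8 + 8 - 1 = 15 vector stmts.  */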
3271 if (dump_enabled_p ())
3272 dump_printf (MSG_NOTE,
3273 "vect_model_reduction_cost: inside_cost = %d, "
3274 "prologue_cost = %d, epilogue_cost = %d .\n", inside_cost,
3275 prologue_cost, epilogue_cost);
3277 return true;
3281 /* Function vect_model_induction_cost.
3283 Models cost for induction operations. */
3285 static void
3286 vect_model_induction_cost (stmt_vec_info stmt_info, int ncopies)
3288 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
3289 void *target_cost_data = LOOP_VINFO_TARGET_COST_DATA (loop_vinfo);
3290 unsigned inside_cost, prologue_cost;
3292 /* loop cost for vec_loop. */
3293 inside_cost = add_stmt_cost (target_cost_data, ncopies, vector_stmt,
3294 stmt_info, 0, vect_body);
3296 /* prologue cost for vec_init and vec_step. */
3297 prologue_cost = add_stmt_cost (target_cost_data, 2, scalar_to_vec,
3298 stmt_info, 0, vect_prologue);
3300 if (dump_enabled_p ())
3301 dump_printf_loc (MSG_NOTE, vect_location,
3302 "vect_model_induction_cost: inside_cost = %d, "
3303 "prologue_cost = %d .\n", inside_cost, prologue_cost);
3307 /* Function get_initial_def_for_induction
3309 Input:
3310 STMT - a stmt that performs an induction operation in the loop.
3311 IV_PHI - the initial value of the induction variable
3313 Output:
3314 Return a vector variable, initialized with the first VF values of
3315 the induction variable. E.g., for an iv with IV_PHI='X' and
3316 evolution S, for a vector of 4 units, we want to return:
3317 [X, X + S, X + 2*S, X + 3*S]. */
3319 static tree
3320 get_initial_def_for_induction (gimple iv_phi)
3322 stmt_vec_info stmt_vinfo = vinfo_for_stmt (iv_phi);
3323 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_vinfo);
3324 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
3325 tree vectype;
3326 int nunits;
3327 edge pe = loop_preheader_edge (loop);
3328 struct loop *iv_loop;
3329 basic_block new_bb;
3330 tree new_vec, vec_init, vec_step, t;
3331 tree new_var;
3332 tree new_name;
3333 gimple init_stmt, new_stmt;
3334 gphi *induction_phi;
3335 tree induc_def, vec_def, vec_dest;
3336 tree init_expr, step_expr;
3337 int vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
3338 int i;
3339 int ncopies;
3340 tree expr;
3341 stmt_vec_info phi_info = vinfo_for_stmt (iv_phi);
3342 bool nested_in_vect_loop = false;
3343 gimple_seq stmts = NULL;
3344 imm_use_iterator imm_iter;
3345 use_operand_p use_p;
3346 gimple exit_phi;
3347 edge latch_e;
3348 tree loop_arg;
3349 gimple_stmt_iterator si;
3350 basic_block bb = gimple_bb (iv_phi);
3351 tree stepvectype;
3352 tree resvectype;
3354 /* Is phi in an inner-loop, while vectorizing an enclosing outer-loop? */
3355 if (nested_in_vect_loop_p (loop, iv_phi))
3357 nested_in_vect_loop = true;
3358 iv_loop = loop->inner;
3360 else
3361 iv_loop = loop;
3362 gcc_assert (iv_loop == (gimple_bb (iv_phi))->loop_father);
3364 latch_e = loop_latch_edge (iv_loop);
3365 loop_arg = PHI_ARG_DEF_FROM_EDGE (iv_phi, latch_e);
3367 step_expr = STMT_VINFO_LOOP_PHI_EVOLUTION_PART (phi_info);
3368 gcc_assert (step_expr != NULL_TREE);
3370 pe = loop_preheader_edge (iv_loop);
3371 init_expr = PHI_ARG_DEF_FROM_EDGE (iv_phi,
3372 loop_preheader_edge (iv_loop));
3374 vectype = get_vectype_for_scalar_type (TREE_TYPE (init_expr));
3375 resvectype = get_vectype_for_scalar_type (TREE_TYPE (PHI_RESULT (iv_phi)));
3376 gcc_assert (vectype);
3377 nunits = TYPE_VECTOR_SUBPARTS (vectype);
3378 ncopies = vf / nunits;
3380 gcc_assert (phi_info);
3381 gcc_assert (ncopies >= 1);
3383 /* Convert the step to the desired type. */
3384 step_expr = force_gimple_operand (fold_convert (TREE_TYPE (vectype),
3385 step_expr),
3386 &stmts, true, NULL_TREE);
3387 if (stmts)
3389 new_bb = gsi_insert_seq_on_edge_immediate (pe, stmts);
3390 gcc_assert (!new_bb);
3393 /* Find the first insertion point in the BB. */
3394 si = gsi_after_labels (bb);
3396 /* Create the vector that holds the initial_value of the induction. */
3397 if (nested_in_vect_loop)
3399 /* iv_loop is nested in the loop to be vectorized. init_expr had already
3400 been created during vectorization of previous stmts. We obtain it
3401 from the STMT_VINFO_VEC_STMT of the defining stmt. */
3402 vec_init = vect_get_vec_def_for_operand (init_expr, iv_phi, NULL);
3403 /* If the initial value is not of proper type, convert it. */
3404 if (!useless_type_conversion_p (vectype, TREE_TYPE (vec_init)))
3406 new_stmt
3407 = gimple_build_assign (vect_get_new_vect_var (vectype,
3408 vect_simple_var,
3409 "vec_iv_"),
3410 VIEW_CONVERT_EXPR,
3411 build1 (VIEW_CONVERT_EXPR, vectype,
3412 vec_init));
3413 vec_init = make_ssa_name (gimple_assign_lhs (new_stmt), new_stmt);
3414 gimple_assign_set_lhs (new_stmt, vec_init);
3415 new_bb = gsi_insert_on_edge_immediate (loop_preheader_edge (iv_loop),
3416 new_stmt);
3417 gcc_assert (!new_bb);
3418 set_vinfo_for_stmt (new_stmt,
3419 new_stmt_vec_info (new_stmt, loop_vinfo, NULL));
3422 else
3424 vec<constructor_elt, va_gc> *v;
3426 /* iv_loop is the loop to be vectorized. Create:
3427 vec_init = [X, X+S, X+2*S, X+3*S] (S = step_expr, X = init_expr) */
3428 new_var = vect_get_new_vect_var (TREE_TYPE (vectype),
3429 vect_scalar_var, "var_");
3430 new_name = force_gimple_operand (fold_convert (TREE_TYPE (vectype),
3431 init_expr),
3432 &stmts, false, new_var);
3433 if (stmts)
3435 new_bb = gsi_insert_seq_on_edge_immediate (pe, stmts);
3436 gcc_assert (!new_bb);
3439 vec_alloc (v, nunits);
3440 bool constant_p = is_gimple_min_invariant (new_name);
3441 CONSTRUCTOR_APPEND_ELT (v, NULL_TREE, new_name);
3442 for (i = 1; i < nunits; i++)
3444 /* Create: new_name_i = new_name + step_expr */
3445 new_name = fold_build2 (PLUS_EXPR, TREE_TYPE (new_name),
3446 new_name, step_expr);
3447 if (!is_gimple_min_invariant (new_name))
3449 init_stmt = gimple_build_assign (new_var, new_name);
3450 new_name = make_ssa_name (new_var, init_stmt);
3451 gimple_assign_set_lhs (init_stmt, new_name);
3452 new_bb = gsi_insert_on_edge_immediate (pe, init_stmt);
3453 gcc_assert (!new_bb);
3454 if (dump_enabled_p ())
3456 dump_printf_loc (MSG_NOTE, vect_location,
3457 "created new init_stmt: ");
3458 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, init_stmt, 0);
3459 dump_printf (MSG_NOTE, "\n");
3461 constant_p = false;
3463 CONSTRUCTOR_APPEND_ELT (v, NULL_TREE, new_name);
3465 /* Create a vector from [new_name_0, new_name_1, ..., new_name_nunits-1] */
3466 if (constant_p)
3467 new_vec = build_vector_from_ctor (vectype, v);
3468 else
3469 new_vec = build_constructor (vectype, v);
3470 vec_init = vect_init_vector (iv_phi, new_vec, vectype, NULL);
3474 /* Create the vector that holds the step of the induction. */
3475 if (nested_in_vect_loop)
3476 /* iv_loop is nested in the loop to be vectorized. Generate:
3477 vec_step = [S, S, S, S] */
3478 new_name = step_expr;
3479 else
3481 /* iv_loop is the loop to be vectorized. Generate:
3482 vec_step = [VF*S, VF*S, VF*S, VF*S] */
3483 if (SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr)))
3485 expr = build_int_cst (integer_type_node, vf);
3486 expr = fold_convert (TREE_TYPE (step_expr), expr);
3488 else
3489 expr = build_int_cst (TREE_TYPE (step_expr), vf);
3490 new_name = fold_build2 (MULT_EXPR, TREE_TYPE (step_expr),
3491 expr, step_expr);
3492 if (TREE_CODE (step_expr) == SSA_NAME)
3493 new_name = vect_init_vector (iv_phi, new_name,
3494 TREE_TYPE (step_expr), NULL);
3497 t = unshare_expr (new_name);
3498 gcc_assert (CONSTANT_CLASS_P (new_name)
3499 || TREE_CODE (new_name) == SSA_NAME);
3500 stepvectype = get_vectype_for_scalar_type (TREE_TYPE (new_name));
3501 gcc_assert (stepvectype);
3502 new_vec = build_vector_from_val (stepvectype, t);
3503 vec_step = vect_init_vector (iv_phi, new_vec, stepvectype, NULL);
3506 /* Create the following def-use cycle:
3507 loop prolog:
3508 vec_init = ...
3509 vec_step = ...
3510 loop:
3511 vec_iv = PHI <vec_init, vec_loop>
3513 STMT
3515 vec_loop = vec_iv + vec_step; */
3517 /* Create the induction-phi that defines the induction-operand. */
3518 vec_dest = vect_get_new_vect_var (vectype, vect_simple_var, "vec_iv_");
3519 induction_phi = create_phi_node (vec_dest, iv_loop->header);
3520 set_vinfo_for_stmt (induction_phi,
3521 new_stmt_vec_info (induction_phi, loop_vinfo, NULL));
3522 induc_def = PHI_RESULT (induction_phi);
3524 /* Create the iv update inside the loop */
3525 new_stmt = gimple_build_assign (vec_dest, PLUS_EXPR, induc_def, vec_step);
3526 vec_def = make_ssa_name (vec_dest, new_stmt);
3527 gimple_assign_set_lhs (new_stmt, vec_def);
3528 gsi_insert_before (&si, new_stmt, GSI_SAME_STMT);
3529 set_vinfo_for_stmt (new_stmt, new_stmt_vec_info (new_stmt, loop_vinfo,
3530 NULL));
3532 /* Set the arguments of the phi node: */
3533 add_phi_arg (induction_phi, vec_init, pe, UNKNOWN_LOCATION);
3534 add_phi_arg (induction_phi, vec_def, loop_latch_edge (iv_loop),
3535 UNKNOWN_LOCATION);
3538 /* In case the vectorization factor (VF) is bigger than the number
3539 of elements that we can fit in a vectype (nunits), we have to generate
3540 more than one vector stmt - i.e. - we need to "unroll" the
3541 vector stmt by a factor of VF/nunits.  For more details see the documentation
3542 in vectorizable_operation. */
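/* A small illustration of that unrolling (hypothetical numbers): with
VF = 8 and nunits = 4 we have ncopies = 2; the first copy holds
[X, X+S, X+2*S, X+3*S], the second is obtained by adding the per-copy
step [4*S, 4*S, 4*S, 4*S], giving [X+4*S, ..., X+7*S], while the
loop-carried update still adds the [8*S, ...] step vector created
above.  */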
3544 if (ncopies > 1)
3546 stmt_vec_info prev_stmt_vinfo;
3547 /* FORNOW. This restriction should be relaxed. */
3548 gcc_assert (!nested_in_vect_loop);
3550 /* Create the vector that holds the step of the induction. */
3551 if (SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr)))
3553 expr = build_int_cst (integer_type_node, nunits);
3554 expr = fold_convert (TREE_TYPE (step_expr), expr);
3556 else
3557 expr = build_int_cst (TREE_TYPE (step_expr), nunits);
3558 new_name = fold_build2 (MULT_EXPR, TREE_TYPE (step_expr),
3559 expr, step_expr);
3560 if (TREE_CODE (step_expr) == SSA_NAME)
3561 new_name = vect_init_vector (iv_phi, new_name,
3562 TREE_TYPE (step_expr), NULL);
3563 t = unshare_expr (new_name);
3564 gcc_assert (CONSTANT_CLASS_P (new_name)
3565 || TREE_CODE (new_name) == SSA_NAME);
3566 new_vec = build_vector_from_val (stepvectype, t);
3567 vec_step = vect_init_vector (iv_phi, new_vec, stepvectype, NULL);
3569 vec_def = induc_def;
3570 prev_stmt_vinfo = vinfo_for_stmt (induction_phi);
3571 for (i = 1; i < ncopies; i++)
3573 /* vec_i = vec_prev + vec_step */
3574 new_stmt = gimple_build_assign (vec_dest, PLUS_EXPR,
3575 vec_def, vec_step);
3576 vec_def = make_ssa_name (vec_dest, new_stmt);
3577 gimple_assign_set_lhs (new_stmt, vec_def);
3579 gsi_insert_before (&si, new_stmt, GSI_SAME_STMT);
3580 if (!useless_type_conversion_p (resvectype, vectype))
3582 new_stmt
3583 = gimple_build_assign
3584 (vect_get_new_vect_var (resvectype, vect_simple_var,
3585 "vec_iv_"),
3586 VIEW_CONVERT_EXPR,
3587 build1 (VIEW_CONVERT_EXPR, resvectype,
3588 gimple_assign_lhs (new_stmt)));
3589 gimple_assign_set_lhs (new_stmt,
3590 make_ssa_name
3591 (gimple_assign_lhs (new_stmt), new_stmt));
3592 gsi_insert_before (&si, new_stmt, GSI_SAME_STMT);
3594 set_vinfo_for_stmt (new_stmt,
3595 new_stmt_vec_info (new_stmt, loop_vinfo, NULL));
3596 STMT_VINFO_RELATED_STMT (prev_stmt_vinfo) = new_stmt;
3597 prev_stmt_vinfo = vinfo_for_stmt (new_stmt);
3601 if (nested_in_vect_loop)
3603 /* Find the loop-closed exit-phi of the induction, and record
3604 the final vector of induction results: */
3605 exit_phi = NULL;
3606 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, loop_arg)
3608 gimple use_stmt = USE_STMT (use_p);
3609 if (is_gimple_debug (use_stmt))
3610 continue;
3612 if (!flow_bb_inside_loop_p (iv_loop, gimple_bb (use_stmt)))
3614 exit_phi = use_stmt;
3615 break;
3618 if (exit_phi)
3620 stmt_vec_info stmt_vinfo = vinfo_for_stmt (exit_phi);
3621 /* FORNOW. Currently not supporting the case that an inner-loop induction
3622 is not used in the outer-loop (i.e. only outside the outer-loop). */
3623 gcc_assert (STMT_VINFO_RELEVANT_P (stmt_vinfo)
3624 && !STMT_VINFO_LIVE_P (stmt_vinfo));
3626 STMT_VINFO_VEC_STMT (stmt_vinfo) = new_stmt;
3627 if (dump_enabled_p ())
3629 dump_printf_loc (MSG_NOTE, vect_location,
3630 "vector of inductions after inner-loop:");
3631 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, new_stmt, 0);
3632 dump_printf (MSG_NOTE, "\n");
3638 if (dump_enabled_p ())
3640 dump_printf_loc (MSG_NOTE, vect_location,
3641 "transform induction: created def-use cycle: ");
3642 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, induction_phi, 0);
3643 dump_printf (MSG_NOTE, "\n");
3644 dump_gimple_stmt (MSG_NOTE, TDF_SLIM,
3645 SSA_NAME_DEF_STMT (vec_def), 0);
3646 dump_printf (MSG_NOTE, "\n");
3649 STMT_VINFO_VEC_STMT (phi_info) = induction_phi;
3650 if (!useless_type_conversion_p (resvectype, vectype))
3652 new_stmt = gimple_build_assign (vect_get_new_vect_var (resvectype,
3653 vect_simple_var,
3654 "vec_iv_"),
3655 VIEW_CONVERT_EXPR,
3656 build1 (VIEW_CONVERT_EXPR, resvectype,
3657 induc_def));
3658 induc_def = make_ssa_name (gimple_assign_lhs (new_stmt), new_stmt);
3659 gimple_assign_set_lhs (new_stmt, induc_def);
3660 si = gsi_after_labels (bb);
3661 gsi_insert_before (&si, new_stmt, GSI_SAME_STMT);
3662 set_vinfo_for_stmt (new_stmt,
3663 new_stmt_vec_info (new_stmt, loop_vinfo, NULL));
3664 STMT_VINFO_RELATED_STMT (vinfo_for_stmt (new_stmt))
3665 = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (induction_phi));
3668 return induc_def;
3672 /* Function get_initial_def_for_reduction
3674 Input:
3675 STMT - a stmt that performs a reduction operation in the loop.
3676 INIT_VAL - the initial value of the reduction variable
3678 Output:
3679 ADJUSTMENT_DEF - a tree that holds a value to be added to the final result
3680 of the reduction (used for adjusting the epilog - see below).
3681 Return a vector variable, initialized according to the operation that STMT
3682 performs. This vector will be used as the initial value of the
3683 vector of partial results.
3685 Option1 (adjust in epilog): Initialize the vector as follows:
3686 add/bit or/xor: [0,0,...,0,0]
3687 mult/bit and: [1,1,...,1,1]
3688 min/max/cond_expr: [init_val,init_val,..,init_val,init_val]
3689 and when necessary (e.g. add/mult case) let the caller know
3690 that it needs to adjust the result by init_val.
3692 Option2: Initialize the vector as follows:
3693 add/bit or/xor: [init_val,0,0,...,0]
3694 mult/bit and: [init_val,1,1,...,1]
3695 min/max/cond_expr: [init_val,init_val,...,init_val]
3696 and no adjustments are needed.
3698 For example, for the following code:
3700 s = init_val;
3701 for (i=0;i<n;i++)
3702 s = s + a[i];
3704 STMT is 's = s + a[i]', and the reduction variable is 's'.
3705 For a vector of 4 units, we want to return either [0,0,0,init_val],
3706 or [0,0,0,0] and let the caller know that it needs to adjust
3707 the result at the end by 'init_val'.
3709 FORNOW, we use Option1 (the 'adjust in epilog' scheme) when ADJUSTMENT_DEF
3710 is not NULL, because its initialization vector is simpler (the same element
3711 in all entries), and Option2 otherwise.
3713 A cost model should help decide between these two schemes. */
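/* Editorial sketch, not part of the original file: a plain-C rendering of
   the two initialization schemes above for a 4-wide integer sum reduction.
   The array contents, INIT_VAL and the function name are made up for
   illustration.  Option1 starts from the neutral element and adds INIT_VAL
   once in the epilog; Option2 folds INIT_VAL into lane 0 up front.  */

static int
option1_vs_option2_sketch (void)
{
  int a[8] = { 1, 2, 3, 4, 5, 6, 7, 8 };
  int init_val = 10;

  int v1[4] = { 0, 0, 0, 0 };         /* Option1: neutral element only.  */
  int v2[4] = { init_val, 0, 0, 0 };  /* Option2: init_val in lane 0.  */

  for (int i = 0; i < 8; i += 4)
    for (int k = 0; k < 4; k++)
      {
        v1[k] += a[i + k];
        v2[k] += a[i + k];
      }

  int s1 = v1[0] + v1[1] + v1[2] + v1[3] + init_val;  /* adjust in epilog */
  int s2 = v2[0] + v2[1] + v2[2] + v2[3];             /* no adjustment */
  return s1 == s2;  /* both are 46 for this input, so this returns 1 */
}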
3715 tree
3716 get_initial_def_for_reduction (gimple stmt, tree init_val,
3717 tree *adjustment_def)
3719 stmt_vec_info stmt_vinfo = vinfo_for_stmt (stmt);
3720 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_vinfo);
3721 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
3722 tree scalar_type = TREE_TYPE (init_val);
3723 tree vectype = get_vectype_for_scalar_type (scalar_type);
3724 int nunits;
3725 enum tree_code code = gimple_assign_rhs_code (stmt);
3726 tree def_for_init;
3727 tree init_def;
3728 tree *elts;
3729 int i;
3730 bool nested_in_vect_loop = false;
3731 tree init_value;
3732 REAL_VALUE_TYPE real_init_val = dconst0;
3733 int int_init_val = 0;
3734 gimple def_stmt = NULL;
3736 gcc_assert (vectype);
3737 nunits = TYPE_VECTOR_SUBPARTS (vectype);
3739 gcc_assert (POINTER_TYPE_P (scalar_type) || INTEGRAL_TYPE_P (scalar_type)
3740 || SCALAR_FLOAT_TYPE_P (scalar_type));
3742 if (nested_in_vect_loop_p (loop, stmt))
3743 nested_in_vect_loop = true;
3744 else
3745 gcc_assert (loop == (gimple_bb (stmt))->loop_father);
3747 /* In case of double reduction we only create a vector variable to be put
3748 in the reduction phi node. The actual statement creation is done in
3749 vect_create_epilog_for_reduction. */
3750 if (adjustment_def && nested_in_vect_loop
3751 && TREE_CODE (init_val) == SSA_NAME
3752 && (def_stmt = SSA_NAME_DEF_STMT (init_val))
3753 && gimple_code (def_stmt) == GIMPLE_PHI
3754 && flow_bb_inside_loop_p (loop, gimple_bb (def_stmt))
3755 && vinfo_for_stmt (def_stmt)
3756 && STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def_stmt))
3757 == vect_double_reduction_def)
3759 *adjustment_def = NULL;
3760 return vect_create_destination_var (init_val, vectype);
3763 if (TREE_CONSTANT (init_val))
3765 if (SCALAR_FLOAT_TYPE_P (scalar_type))
3766 init_value = build_real (scalar_type, TREE_REAL_CST (init_val));
3767 else
3768 init_value = build_int_cst (scalar_type, TREE_INT_CST_LOW (init_val));
3770 else
3771 init_value = init_val;
3773 switch (code)
3775 case WIDEN_SUM_EXPR:
3776 case DOT_PROD_EXPR:
3777 case SAD_EXPR:
3778 case PLUS_EXPR:
3779 case MINUS_EXPR:
3780 case BIT_IOR_EXPR:
3781 case BIT_XOR_EXPR:
3782 case MULT_EXPR:
3783 case BIT_AND_EXPR:
3784 /* ADJUSTMENT_DEF is NULL when called from
3785 vect_create_epilog_for_reduction to vectorize double reduction. */
3786 if (adjustment_def)
3788 if (nested_in_vect_loop)
3789 *adjustment_def = vect_get_vec_def_for_operand (init_val, stmt,
3790 NULL);
3791 else
3792 *adjustment_def = init_val;
3795 if (code == MULT_EXPR)
3797 real_init_val = dconst1;
3798 int_init_val = 1;
3801 if (code == BIT_AND_EXPR)
3802 int_init_val = -1;
3804 if (SCALAR_FLOAT_TYPE_P (scalar_type))
3805 def_for_init = build_real (scalar_type, real_init_val);
3806 else
3807 def_for_init = build_int_cst (scalar_type, int_init_val);
3809 /* Create a vector of '0' or '1' in all elements but the first. */
3810 elts = XALLOCAVEC (tree, nunits);
3811 for (i = nunits - 2; i >= 0; --i)
3812 elts[i + 1] = def_for_init;
3814 /* Option1: the first element is '0' or '1' as well. */
3815 if (adjustment_def)
3817 elts[0] = def_for_init;
3818 init_def = build_vector (vectype, elts);
3819 break;
3822 /* Option2: the first element is INIT_VAL. */
3823 elts[0] = init_val;
3824 if (TREE_CONSTANT (init_val))
3825 init_def = build_vector (vectype, elts);
3826 else
3828 vec<constructor_elt, va_gc> *v;
3829 vec_alloc (v, nunits);
3830 CONSTRUCTOR_APPEND_ELT (v, NULL_TREE, init_val);
3831 for (i = 1; i < nunits; ++i)
3832 CONSTRUCTOR_APPEND_ELT (v, NULL_TREE, elts[i]);
3833 init_def = build_constructor (vectype, v);
3836 break;
3838 case MIN_EXPR:
3839 case MAX_EXPR:
3840 case COND_EXPR:
3841 if (adjustment_def)
3843 *adjustment_def = NULL_TREE;
3844 init_def = vect_get_vec_def_for_operand (init_val, stmt, NULL);
3845 break;
3848 init_def = build_vector_from_val (vectype, init_value);
3849 break;
3851 default:
3852 gcc_unreachable ();
3855 return init_def;
3858 /* Function vect_create_epilog_for_reduction
3860 Create code at the loop-epilog to finalize the result of a reduction
3861 computation.
3863 VECT_DEFS is list of vector of partial results, i.e., the lhs's of vector
3864 reduction statements.
3865 STMT is the scalar reduction stmt that is being vectorized.
3866 NCOPIES is > 1 in case the vectorization factor (VF) is bigger than the
3867 number of elements that we can fit in a vectype (nunits). In this case
3868 we have to generate more than one vector stmt, i.e., we need to "unroll"
3869 the vector stmt by a factor VF/nunits. For more details see documentation
3870 in vectorizable_operation.
3871 REDUC_CODE is the tree-code for the epilog reduction.
3872 REDUCTION_PHIS is a list of the phi-nodes that carry the reduction
3873 computation.
3874 REDUC_INDEX is the index of the operand in the right hand side of the
3875 statement that is defined by REDUCTION_PHI.
3876 DOUBLE_REDUC is TRUE if double reduction phi nodes should be handled.
3877 SLP_NODE is an SLP node containing a group of reduction statements. The
3878 first one in this group is STMT.
3880 This function:
3881 1. Creates the reduction def-use cycles: sets the arguments for
3882 REDUCTION_PHIS:
3883 The loop-entry argument is the vectorized initial-value of the reduction.
3884 The loop-latch argument is taken from VECT_DEFS - the vector of partial
3885 sums.
3886 2. "Reduces" each vector of partial results VECT_DEFS into a single result,
3887 by applying the operation specified by REDUC_CODE if available, or by
3888 other means (whole-vector shifts or a scalar loop).
3889 The function also creates a new phi node at the loop exit to preserve
3890 loop-closed form, as illustrated below.
3892 The flow at the entry to this function:
3894 loop:
3895 vec_def = phi <null, null> # REDUCTION_PHI
3896 VECT_DEF = vector_stmt # vectorized form of STMT
3897 s_loop = scalar_stmt # (scalar) STMT
3898 loop_exit:
3899 s_out0 = phi <s_loop> # (scalar) EXIT_PHI
3900 use <s_out0>
3901 use <s_out0>
3903 The above is transformed by this function into:
3905 loop:
3906 vec_def = phi <vec_init, VECT_DEF> # REDUCTION_PHI
3907 VECT_DEF = vector_stmt # vectorized form of STMT
3908 s_loop = scalar_stmt # (scalar) STMT
3909 loop_exit:
3910 s_out0 = phi <s_loop> # (scalar) EXIT_PHI
3911 v_out1 = phi <VECT_DEF> # NEW_EXIT_PHI
3912 v_out2 = reduce <v_out1>
3913 s_out3 = extract_field <v_out2, 0>
3914 s_out4 = adjust_result <s_out3>
3915 use <s_out4>
3916 use <s_out4>
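/* Editorial sketch, not part of the original file: the three epilog steps
   listed above written as scalar C for a 4-lane integer add reduction.
   The parameter names mirror the pseudo-code (v_out1, s_out3, s_out4) and
   are purely illustrative.  */

static int
reduction_epilog_sketch (const int v_out1[4], int adjustment_def)
{
  /* Steps 1+2: reduce the loop-closed vector of partial sums and extract
     the scalar result.  */
  int s_out3 = v_out1[0] + v_out1[1] + v_out1[2] + v_out1[3];
  /* Step 3 (optional): adjust by the initial value of the reduction.  */
  int s_out4 = s_out3 + adjustment_def;
  return s_out4;  /* every use of the original s_out0 is replaced by this */
}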
3919 static void
3920 vect_create_epilog_for_reduction (vec<tree> vect_defs, gimple stmt,
3921 int ncopies, enum tree_code reduc_code,
3922 vec<gimple> reduction_phis,
3923 int reduc_index, bool double_reduc,
3924 slp_tree slp_node)
3926 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
3927 stmt_vec_info prev_phi_info;
3928 tree vectype;
3929 machine_mode mode;
3930 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
3931 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo), *outer_loop = NULL;
3932 basic_block exit_bb;
3933 tree scalar_dest;
3934 tree scalar_type;
3935 gimple new_phi = NULL, phi;
3936 gimple_stmt_iterator exit_gsi;
3937 tree vec_dest;
3938 tree new_temp = NULL_TREE, new_dest, new_name, new_scalar_dest;
3939 gimple epilog_stmt = NULL;
3940 enum tree_code code = gimple_assign_rhs_code (stmt);
3941 gimple exit_phi;
3942 tree bitsize;
3943 tree adjustment_def = NULL;
3944 tree vec_initial_def = NULL;
3945 tree reduction_op, expr, def;
3946 tree orig_name, scalar_result;
3947 imm_use_iterator imm_iter, phi_imm_iter;
3948 use_operand_p use_p, phi_use_p;
3949 gimple use_stmt, orig_stmt, reduction_phi = NULL;
3950 bool nested_in_vect_loop = false;
3951 auto_vec<gimple> new_phis;
3952 auto_vec<gimple> inner_phis;
3953 enum vect_def_type dt = vect_unknown_def_type;
3954 int j, i;
3955 auto_vec<tree> scalar_results;
3956 unsigned int group_size = 1, k, ratio;
3957 auto_vec<tree> vec_initial_defs;
3958 auto_vec<gimple> phis;
3959 bool slp_reduc = false;
3960 tree new_phi_result;
3961 gimple inner_phi = NULL;
3963 if (slp_node)
3964 group_size = SLP_TREE_SCALAR_STMTS (slp_node).length ();
3966 if (nested_in_vect_loop_p (loop, stmt))
3968 outer_loop = loop;
3969 loop = loop->inner;
3970 nested_in_vect_loop = true;
3971 gcc_assert (!slp_node);
3974 switch (get_gimple_rhs_class (gimple_assign_rhs_code (stmt)))
3976 case GIMPLE_SINGLE_RHS:
3977 gcc_assert (TREE_OPERAND_LENGTH (gimple_assign_rhs1 (stmt))
3978 == ternary_op);
3979 reduction_op = TREE_OPERAND (gimple_assign_rhs1 (stmt), reduc_index);
3980 break;
3981 case GIMPLE_UNARY_RHS:
3982 reduction_op = gimple_assign_rhs1 (stmt);
3983 break;
3984 case GIMPLE_BINARY_RHS:
3985 reduction_op = reduc_index ?
3986 gimple_assign_rhs2 (stmt) : gimple_assign_rhs1 (stmt);
3987 break;
3988 case GIMPLE_TERNARY_RHS:
3989 reduction_op = gimple_op (stmt, reduc_index + 1);
3990 break;
3991 default:
3992 gcc_unreachable ();
3995 vectype = get_vectype_for_scalar_type (TREE_TYPE (reduction_op));
3996 gcc_assert (vectype);
3997 mode = TYPE_MODE (vectype);
3999 /* 1. Create the reduction def-use cycle:
4000 Set the arguments of REDUCTION_PHIS, i.e., transform
4002 loop:
4003 vec_def = phi <null, null> # REDUCTION_PHI
4004 VECT_DEF = vector_stmt # vectorized form of STMT
4007 into:
4009 loop:
4010 vec_def = phi <vec_init, VECT_DEF> # REDUCTION_PHI
4011 VECT_DEF = vector_stmt # vectorized form of STMT
4014 (in case of SLP, do it for all the phis). */
4016 /* Get the loop-entry arguments. */
4017 if (slp_node)
4018 vect_get_vec_defs (reduction_op, NULL_TREE, stmt, &vec_initial_defs,
4019 NULL, slp_node, reduc_index);
4020 else
4022 vec_initial_defs.create (1);
4023 /* For the case of reduction, vect_get_vec_def_for_operand returns
4024 the scalar def before the loop that defines the initial value
4025 of the reduction variable. */
4026 vec_initial_def = vect_get_vec_def_for_operand (reduction_op, stmt,
4027 &adjustment_def);
4028 vec_initial_defs.quick_push (vec_initial_def);
4031 /* Set phi nodes arguments. */
4032 FOR_EACH_VEC_ELT (reduction_phis, i, phi)
4034 tree vec_init_def, def;
4035 gimple_seq stmts;
4036 vec_init_def = force_gimple_operand (vec_initial_defs[i], &stmts,
4037 true, NULL_TREE);
4038 gsi_insert_seq_on_edge_immediate (loop_preheader_edge (loop), stmts);
4039 def = vect_defs[i];
4040 for (j = 0; j < ncopies; j++)
4042 /* Set the loop-entry arg of the reduction-phi. */
4043 add_phi_arg (as_a <gphi *> (phi), vec_init_def,
4044 loop_preheader_edge (loop), UNKNOWN_LOCATION);
4046 /* Set the loop-latch arg for the reduction-phi. */
4047 if (j > 0)
4048 def = vect_get_vec_def_for_stmt_copy (vect_unknown_def_type, def);
4050 add_phi_arg (as_a <gphi *> (phi), def, loop_latch_edge (loop),
4051 UNKNOWN_LOCATION);
4053 if (dump_enabled_p ())
4055 dump_printf_loc (MSG_NOTE, vect_location,
4056 "transform reduction: created def-use cycle: ");
4057 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, phi, 0);
4058 dump_printf (MSG_NOTE, "\n");
4059 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, SSA_NAME_DEF_STMT (def), 0);
4060 dump_printf (MSG_NOTE, "\n");
4063 phi = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (phi));
4067 /* 2. Create epilog code.
4068 The reduction epilog code operates across the elements of the vector
4069 of partial results computed by the vectorized loop.
4070 The reduction epilog code consists of:
4072 step 1: compute the scalar result in a vector (v_out2)
4073 step 2: extract the scalar result (s_out3) from the vector (v_out2)
4074 step 3: adjust the scalar result (s_out3) if needed.
4076 Step 1 can be accomplished using one of the following three schemes:
4077 (scheme 1) using reduc_code, if available.
4078 (scheme 2) using whole-vector shifts, if available.
4079 (scheme 3) using a scalar loop. In this case steps 1+2 above are
4080 combined.
4082 The overall epilog code looks like this:
4084 s_out0 = phi <s_loop> # original EXIT_PHI
4085 v_out1 = phi <VECT_DEF> # NEW_EXIT_PHI
4086 v_out2 = reduce <v_out1> # step 1
4087 s_out3 = extract_field <v_out2, 0> # step 2
4088 s_out4 = adjust_result <s_out3> # step 3
4090 (step 3 is optional, and steps 1 and 2 may be combined).
4091 Lastly, the uses of s_out0 are replaced by s_out4. */
4094 /* 2.1 Create new loop-exit-phis to preserve loop-closed form:
4095 v_out1 = phi <VECT_DEF>
4096 Store them in NEW_PHIS. */
4098 exit_bb = single_exit (loop)->dest;
4099 prev_phi_info = NULL;
4100 new_phis.create (vect_defs.length ());
4101 FOR_EACH_VEC_ELT (vect_defs, i, def)
4103 for (j = 0; j < ncopies; j++)
4105 tree new_def = copy_ssa_name (def);
4106 phi = create_phi_node (new_def, exit_bb);
4107 set_vinfo_for_stmt (phi, new_stmt_vec_info (phi, loop_vinfo, NULL));
4108 if (j == 0)
4109 new_phis.quick_push (phi);
4110 else
4112 def = vect_get_vec_def_for_stmt_copy (dt, def);
4113 STMT_VINFO_RELATED_STMT (prev_phi_info) = phi;
4116 SET_PHI_ARG_DEF (phi, single_exit (loop)->dest_idx, def);
4117 prev_phi_info = vinfo_for_stmt (phi);
4121 /* The epilogue is created for the outer-loop, i.e., for the loop being
4122 vectorized. Create exit phis for the outer loop. */
4123 if (double_reduc)
4125 loop = outer_loop;
4126 exit_bb = single_exit (loop)->dest;
4127 inner_phis.create (vect_defs.length ());
4128 FOR_EACH_VEC_ELT (new_phis, i, phi)
4130 tree new_result = copy_ssa_name (PHI_RESULT (phi));
4131 gphi *outer_phi = create_phi_node (new_result, exit_bb);
4132 SET_PHI_ARG_DEF (outer_phi, single_exit (loop)->dest_idx,
4133 PHI_RESULT (phi));
4134 set_vinfo_for_stmt (outer_phi, new_stmt_vec_info (outer_phi,
4135 loop_vinfo, NULL));
4136 inner_phis.quick_push (phi);
4137 new_phis[i] = outer_phi;
4138 prev_phi_info = vinfo_for_stmt (outer_phi);
4139 while (STMT_VINFO_RELATED_STMT (vinfo_for_stmt (phi)))
4141 phi = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (phi));
4142 new_result = copy_ssa_name (PHI_RESULT (phi));
4143 outer_phi = create_phi_node (new_result, exit_bb);
4144 SET_PHI_ARG_DEF (outer_phi, single_exit (loop)->dest_idx,
4145 PHI_RESULT (phi));
4146 set_vinfo_for_stmt (outer_phi, new_stmt_vec_info (outer_phi,
4147 loop_vinfo, NULL));
4148 STMT_VINFO_RELATED_STMT (prev_phi_info) = outer_phi;
4149 prev_phi_info = vinfo_for_stmt (outer_phi);
4154 exit_gsi = gsi_after_labels (exit_bb);
4156 /* 2.2 Get the relevant tree-code to use in the epilog for schemes 2,3
4157 (i.e. when reduc_code is not available) and in the final adjustment
4158 code (if needed). Also get the original scalar reduction variable as
4159 defined in the loop. In case STMT is a "pattern-stmt" (i.e., it
4160 represents a reduction pattern), the tree-code and scalar-def are
4161 taken from the original stmt that the pattern-stmt (STMT) replaces.
4162 Otherwise (it is a regular reduction) - the tree-code and scalar-def
4163 are taken from STMT. */
4165 orig_stmt = STMT_VINFO_RELATED_STMT (stmt_info);
4166 if (!orig_stmt)
4168 /* Regular reduction */
4169 orig_stmt = stmt;
4171 else
4173 /* Reduction pattern */
4174 stmt_vec_info stmt_vinfo = vinfo_for_stmt (orig_stmt);
4175 gcc_assert (STMT_VINFO_IN_PATTERN_P (stmt_vinfo));
4176 gcc_assert (STMT_VINFO_RELATED_STMT (stmt_vinfo) == stmt);
4179 code = gimple_assign_rhs_code (orig_stmt);
4180 /* For MINUS_EXPR the initial vector is [init_val,0,...,0], therefore,
4181 partial results are added and not subtracted. */
4182 if (code == MINUS_EXPR)
4183 code = PLUS_EXPR;
4185 scalar_dest = gimple_assign_lhs (orig_stmt);
4186 scalar_type = TREE_TYPE (scalar_dest);
4187 scalar_results.create (group_size);
4188 new_scalar_dest = vect_create_destination_var (scalar_dest, NULL);
4189 bitsize = TYPE_SIZE (scalar_type);
4191 /* In case this is a reduction in an inner-loop while vectorizing an outer
4192 loop - we don't need to extract a single scalar result at the end of the
4193 inner-loop (unless it is double reduction, i.e., the use of reduction is
4194 outside the outer-loop). The final vector of partial results will be used
4195 in the vectorized outer-loop, or reduced to a scalar result at the end of
4196 the outer-loop. */
4197 if (nested_in_vect_loop && !double_reduc)
4198 goto vect_finalize_reduction;
4200 /* SLP reduction without reduction chain, e.g.,
4201 # a1 = phi <a2, a0>
4202 # b1 = phi <b2, b0>
4203 a2 = operation (a1)
4204 b2 = operation (b1) */
4205 slp_reduc = (slp_node && !GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)));
4207 /* In case of reduction chain, e.g.,
4208 # a1 = phi <a3, a0>
4209 a2 = operation (a1)
4210 a3 = operation (a2),
4212 we may end up with more than one vector result. Here we reduce them to
4213 one vector. */
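/* Editorial sketch, not part of the original file: for a reduction chain
   the vectors of partial results are first combined lane by lane into a
   single vector, which the scalar reduction below then consumes.  A fixed
   width of 4 is assumed for illustration.  */

static void
reduce_chain_vectors_sketch (int first_vect[4], const int second_vect[4])
{
  for (int k = 0; k < 4; k++)
    first_vect[k] = first_vect[k] + second_vect[k];  /* tmp = code <first, second> */
}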
4214 if (GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)))
4216 tree first_vect = PHI_RESULT (new_phis[0]);
4217 tree tmp;
4218 gassign *new_vec_stmt = NULL;
4220 vec_dest = vect_create_destination_var (scalar_dest, vectype);
4221 for (k = 1; k < new_phis.length (); k++)
4223 gimple next_phi = new_phis[k];
4224 tree second_vect = PHI_RESULT (next_phi);
4226 tmp = build2 (code, vectype, first_vect, second_vect);
4227 new_vec_stmt = gimple_build_assign (vec_dest, tmp);
4228 first_vect = make_ssa_name (vec_dest, new_vec_stmt);
4229 gimple_assign_set_lhs (new_vec_stmt, first_vect);
4230 gsi_insert_before (&exit_gsi, new_vec_stmt, GSI_SAME_STMT);
4233 new_phi_result = first_vect;
4234 if (new_vec_stmt)
4236 new_phis.truncate (0);
4237 new_phis.safe_push (new_vec_stmt);
4240 else
4241 new_phi_result = PHI_RESULT (new_phis[0]);
4243 /* 2.3 Create the reduction code, using one of the three schemes described
4244 above. In SLP we simply need to extract all the elements from the
4245 vector (without reducing them), so we use scalar shifts. */
4246 if (reduc_code != ERROR_MARK && !slp_reduc)
4248 tree tmp;
4249 tree vec_elem_type;
4251 /*** Case 1: Create:
4252 v_out2 = reduc_expr <v_out1> */
4254 if (dump_enabled_p ())
4255 dump_printf_loc (MSG_NOTE, vect_location,
4256 "Reduce using direct vector reduction.\n");
4258 vec_elem_type = TREE_TYPE (TREE_TYPE (new_phi_result));
4259 if (!useless_type_conversion_p (scalar_type, vec_elem_type))
4261 tree tmp_dest =
4262 vect_create_destination_var (scalar_dest, vec_elem_type);
4263 tmp = build1 (reduc_code, vec_elem_type, new_phi_result);
4264 epilog_stmt = gimple_build_assign (tmp_dest, tmp);
4265 new_temp = make_ssa_name (tmp_dest, epilog_stmt);
4266 gimple_assign_set_lhs (epilog_stmt, new_temp);
4267 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
4269 tmp = build1 (NOP_EXPR, scalar_type, new_temp);
4271 else
4272 tmp = build1 (reduc_code, scalar_type, new_phi_result);
4273 epilog_stmt = gimple_build_assign (new_scalar_dest, tmp);
4274 new_temp = make_ssa_name (new_scalar_dest, epilog_stmt);
4275 gimple_assign_set_lhs (epilog_stmt, new_temp);
4276 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
4277 scalar_results.safe_push (new_temp);
4279 else
4281 bool reduce_with_shift = have_whole_vector_shift (mode);
4282 int element_bitsize = tree_to_uhwi (bitsize);
4283 int vec_size_in_bits = tree_to_uhwi (TYPE_SIZE (vectype));
4284 tree vec_temp;
4286 /* Regardless of whether we have a whole vector shift, if we're
4287 emulating the operation via tree-vect-generic, we don't want
4288 to use it. Only the first round of the reduction is likely
4289 to still be profitable via emulation. */
4290 /* ??? It might be better to emit a reduction tree code here, so that
4291 tree-vect-generic can expand the first round via bit tricks. */
4292 if (!VECTOR_MODE_P (mode))
4293 reduce_with_shift = false;
4294 else
4296 optab optab = optab_for_tree_code (code, vectype, optab_default);
4297 if (optab_handler (optab, mode) == CODE_FOR_nothing)
4298 reduce_with_shift = false;
4301 if (reduce_with_shift && !slp_reduc)
4303 int nelements = vec_size_in_bits / element_bitsize;
4304 unsigned char *sel = XALLOCAVEC (unsigned char, nelements);
4306 int elt_offset;
4308 tree zero_vec = build_zero_cst (vectype);
4309 /*** Case 2: Create:
4310 for (offset = nelements/2; offset >= 1; offset/=2)
4312 Create: va' = vec_shift <va, offset>
4313 Create: va = vop <va, va'>
4314 } */
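/* Editorial sketch, not part of the original file: the effect of the
   shift-and-op sequence described above, written as scalar C for an 8-lane
   integer add.  Each round shifts the "vector" down by OFFSET lanes
   (shifting in zeros, as the VEC_PERM with the zero vector does) and adds;
   after log2(8) rounds lane 0 holds the full sum.  */

static int
shift_reduce_sketch (const int v[8])
{
  int va[8], shifted[8];
  for (int i = 0; i < 8; i++)
    va[i] = v[i];
  for (int offset = 4; offset >= 1; offset /= 2)
    {
      for (int i = 0; i < 8; i++)
        shifted[i] = i + offset < 8 ? va[i + offset] : 0;  /* va' = vec_shift <va, offset> */
      for (int i = 0; i < 8; i++)
        va[i] += shifted[i];                               /* va = vop <va, va'> */
    }
  return va[0];  /* the scalar result is then extracted from element 0 */
}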
4316 tree rhs;
4318 if (dump_enabled_p ())
4319 dump_printf_loc (MSG_NOTE, vect_location,
4320 "Reduce using vector shifts\n");
4322 vec_dest = vect_create_destination_var (scalar_dest, vectype);
4323 new_temp = new_phi_result;
4324 for (elt_offset = nelements / 2;
4325 elt_offset >= 1;
4326 elt_offset /= 2)
4328 calc_vec_perm_mask_for_shift (mode, elt_offset, sel);
4329 tree mask = vect_gen_perm_mask_any (vectype, sel);
4330 epilog_stmt = gimple_build_assign (vec_dest, VEC_PERM_EXPR,
4331 new_temp, zero_vec, mask);
4332 new_name = make_ssa_name (vec_dest, epilog_stmt);
4333 gimple_assign_set_lhs (epilog_stmt, new_name);
4334 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
4336 epilog_stmt = gimple_build_assign (vec_dest, code, new_name,
4337 new_temp);
4338 new_temp = make_ssa_name (vec_dest, epilog_stmt);
4339 gimple_assign_set_lhs (epilog_stmt, new_temp);
4340 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
4343 /* 2.4 Extract the final scalar result. Create:
4344 s_out3 = extract_field <v_out2, bitpos> */
4346 if (dump_enabled_p ())
4347 dump_printf_loc (MSG_NOTE, vect_location,
4348 "extract scalar result\n");
4350 rhs = build3 (BIT_FIELD_REF, scalar_type, new_temp,
4351 bitsize, bitsize_zero_node);
4352 epilog_stmt = gimple_build_assign (new_scalar_dest, rhs);
4353 new_temp = make_ssa_name (new_scalar_dest, epilog_stmt);
4354 gimple_assign_set_lhs (epilog_stmt, new_temp);
4355 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
4356 scalar_results.safe_push (new_temp);
4358 else
4360 /*** Case 3: Create:
4361 s = extract_field <v_out2, 0>
4362 for (offset = element_size;
4363 offset < vector_size;
4364 offset += element_size;)
4366 Create: s' = extract_field <v_out2, offset>
4367 Create: s = op <s, s'> // For non SLP cases
4368 } */
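/* Editorial sketch, not part of the original file: Case 3 in scalar form
   for a 4-lane vector.  Each lane is extracted (the BIT_FIELD_REFs below)
   and folded into the running scalar; in the SLP case the extracted values
   are kept separate instead of being combined.  */

static int
scalar_reduce_sketch (const int v_out2[4])
{
  int s = v_out2[0];          /* s = extract_field <v_out2, 0> */
  for (int i = 1; i < 4; i++)
    s = s + v_out2[i];        /* s' = extract_field <v_out2, offset>; s = op <s, s'> */
  return s;
}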
4370 if (dump_enabled_p ())
4371 dump_printf_loc (MSG_NOTE, vect_location,
4372 "Reduce using scalar code.\n");
4374 vec_size_in_bits = tree_to_uhwi (TYPE_SIZE (vectype));
4375 FOR_EACH_VEC_ELT (new_phis, i, new_phi)
4377 int bit_offset;
4378 if (gimple_code (new_phi) == GIMPLE_PHI)
4379 vec_temp = PHI_RESULT (new_phi);
4380 else
4381 vec_temp = gimple_assign_lhs (new_phi);
4382 tree rhs = build3 (BIT_FIELD_REF, scalar_type, vec_temp, bitsize,
4383 bitsize_zero_node);
4384 epilog_stmt = gimple_build_assign (new_scalar_dest, rhs);
4385 new_temp = make_ssa_name (new_scalar_dest, epilog_stmt);
4386 gimple_assign_set_lhs (epilog_stmt, new_temp);
4387 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
4389 /* In SLP we don't need to apply reduction operation, so we just
4390 collect s' values in SCALAR_RESULTS. */
4391 if (slp_reduc)
4392 scalar_results.safe_push (new_temp);
4394 for (bit_offset = element_bitsize;
4395 bit_offset < vec_size_in_bits;
4396 bit_offset += element_bitsize)
4398 tree bitpos = bitsize_int (bit_offset);
4399 tree rhs = build3 (BIT_FIELD_REF, scalar_type, vec_temp,
4400 bitsize, bitpos);
4402 epilog_stmt = gimple_build_assign (new_scalar_dest, rhs);
4403 new_name = make_ssa_name (new_scalar_dest, epilog_stmt);
4404 gimple_assign_set_lhs (epilog_stmt, new_name);
4405 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
4407 if (slp_reduc)
4409 /* In SLP we don't need to apply reduction operation, so
4410 we just collect s' values in SCALAR_RESULTS. */
4411 new_temp = new_name;
4412 scalar_results.safe_push (new_name);
4414 else
4416 epilog_stmt = gimple_build_assign (new_scalar_dest, code,
4417 new_name, new_temp);
4418 new_temp = make_ssa_name (new_scalar_dest, epilog_stmt);
4419 gimple_assign_set_lhs (epilog_stmt, new_temp);
4420 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
4425 /* The only case where we need to reduce scalar results in SLP is
4426 unrolling. If the size of SCALAR_RESULTS is greater than
4427 GROUP_SIZE, we reduce them combining elements modulo
4428 GROUP_SIZE. */
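/* Editorial sketch, not part of the original file: combining unrolled SLP
   scalar results modulo GROUP_SIZE, as the loop below does.  With
   GROUP_SIZE = 2 and four extracted scalars, results 0 and 2 belong to the
   first reduction and results 1 and 3 to the second.  */

static void
slp_unroll_combine_sketch (const int scalar_results[4], int combined[2])
{
  const int group_size = 2;
  combined[0] = scalar_results[0];
  combined[1] = scalar_results[1];
  for (int j = group_size; j < 4; j++)
    combined[j % group_size] += scalar_results[j];
}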
4429 if (slp_reduc)
4431 tree res, first_res, new_res;
4432 gimple new_stmt;
4434 /* Reduce multiple scalar results in case of SLP unrolling. */
4435 for (j = group_size; scalar_results.iterate (j, &res);
4436 j++)
4438 first_res = scalar_results[j % group_size];
4439 new_stmt = gimple_build_assign (new_scalar_dest, code,
4440 first_res, res);
4441 new_res = make_ssa_name (new_scalar_dest, new_stmt);
4442 gimple_assign_set_lhs (new_stmt, new_res);
4443 gsi_insert_before (&exit_gsi, new_stmt, GSI_SAME_STMT);
4444 scalar_results[j % group_size] = new_res;
4447 else
4448 /* Not SLP - we have one scalar to keep in SCALAR_RESULTS. */
4449 scalar_results.safe_push (new_temp);
4453 vect_finalize_reduction:
4455 if (double_reduc)
4456 loop = loop->inner;
4458 /* 2.5 Adjust the final result by the initial value of the reduction
4459 variable. (When such adjustment is not needed, then
4460 'adjustment_def' is zero). For example, if code is PLUS we create:
4461 new_temp = loop_exit_def + adjustment_def */
4463 if (adjustment_def)
4465 gcc_assert (!slp_reduc);
4466 if (nested_in_vect_loop)
4468 new_phi = new_phis[0];
4469 gcc_assert (TREE_CODE (TREE_TYPE (adjustment_def)) == VECTOR_TYPE);
4470 expr = build2 (code, vectype, PHI_RESULT (new_phi), adjustment_def);
4471 new_dest = vect_create_destination_var (scalar_dest, vectype);
4473 else
4475 new_temp = scalar_results[0];
4476 gcc_assert (TREE_CODE (TREE_TYPE (adjustment_def)) != VECTOR_TYPE);
4477 expr = build2 (code, scalar_type, new_temp, adjustment_def);
4478 new_dest = vect_create_destination_var (scalar_dest, scalar_type);
4481 epilog_stmt = gimple_build_assign (new_dest, expr);
4482 new_temp = make_ssa_name (new_dest, epilog_stmt);
4483 gimple_assign_set_lhs (epilog_stmt, new_temp);
4484 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
4485 if (nested_in_vect_loop)
4487 set_vinfo_for_stmt (epilog_stmt,
4488 new_stmt_vec_info (epilog_stmt, loop_vinfo,
4489 NULL));
4490 STMT_VINFO_RELATED_STMT (vinfo_for_stmt (epilog_stmt)) =
4491 STMT_VINFO_RELATED_STMT (vinfo_for_stmt (new_phi));
4493 if (!double_reduc)
4494 scalar_results.quick_push (new_temp);
4495 else
4496 scalar_results[0] = new_temp;
4498 else
4499 scalar_results[0] = new_temp;
4501 new_phis[0] = epilog_stmt;
4504 /* 2.6 Handle the loop-exit phis. Replace the uses of scalar loop-exit
4505 phis with new adjusted scalar results, i.e., replace use <s_out0>
4506 with use <s_out4>.
4508 Transform:
4509 loop_exit:
4510 s_out0 = phi <s_loop> # (scalar) EXIT_PHI
4511 v_out1 = phi <VECT_DEF> # NEW_EXIT_PHI
4512 v_out2 = reduce <v_out1>
4513 s_out3 = extract_field <v_out2, 0>
4514 s_out4 = adjust_result <s_out3>
4515 use <s_out0>
4516 use <s_out0>
4518 into:
4520 loop_exit:
4521 s_out0 = phi <s_loop> # (scalar) EXIT_PHI
4522 v_out1 = phi <VECT_DEF> # NEW_EXIT_PHI
4523 v_out2 = reduce <v_out1>
4524 s_out3 = extract_field <v_out2, 0>
4525 s_out4 = adjust_result <s_out3>
4526 use <s_out4>
4527 use <s_out4> */
4530 /* In an SLP reduction chain we reduce the vector results into one vector
4531 if necessary; hence we set GROUP_SIZE to 1 here. SCALAR_DEST is the LHS
4532 of the last stmt in the reduction chain, since we are looking for the
4533 loop exit phi node. */
4534 if (GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)))
4536 scalar_dest = gimple_assign_lhs (
4537 SLP_TREE_SCALAR_STMTS (slp_node)[group_size - 1]);
4538 group_size = 1;
4541 /* In SLP we may have several statements in NEW_PHIS and REDUCTION_PHIS (in
4542 case GROUP_SIZE is greater than the vectorization factor). Therefore, we
4543 need to match SCALAR_RESULTS with corresponding statements. The first
4544 (GROUP_SIZE / number of new vector stmts) scalar results correspond to
4545 the first vector stmt, etc.
4546 (RATIO is equal to (GROUP_SIZE / number of new vector stmts)). */
4547 if (group_size > new_phis.length ())
4549 ratio = group_size / new_phis.length ();
4550 gcc_assert (!(group_size % new_phis.length ()));
4552 else
4553 ratio = 1;
4555 for (k = 0; k < group_size; k++)
4557 if (k % ratio == 0)
4559 epilog_stmt = new_phis[k / ratio];
4560 reduction_phi = reduction_phis[k / ratio];
4561 if (double_reduc)
4562 inner_phi = inner_phis[k / ratio];
4565 if (slp_reduc)
4567 gimple current_stmt = SLP_TREE_SCALAR_STMTS (slp_node)[k];
4569 orig_stmt = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (current_stmt));
4570 /* SLP statements can't participate in patterns. */
4571 gcc_assert (!orig_stmt);
4572 scalar_dest = gimple_assign_lhs (current_stmt);
4575 phis.create (3);
4576 /* Find the loop-closed-use at the loop exit of the original scalar
4577 result. (The reduction result is expected to have two immediate uses -
4578 one at the latch block, and one at the loop exit). */
4579 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, scalar_dest)
4580 if (!flow_bb_inside_loop_p (loop, gimple_bb (USE_STMT (use_p)))
4581 && !is_gimple_debug (USE_STMT (use_p)))
4582 phis.safe_push (USE_STMT (use_p));
4584 /* While we expect to have found an exit_phi because of loop-closed-ssa
4585 form we can end up without one if the scalar cycle is dead. */
4587 FOR_EACH_VEC_ELT (phis, i, exit_phi)
4589 if (outer_loop)
4591 stmt_vec_info exit_phi_vinfo = vinfo_for_stmt (exit_phi);
4592 gphi *vect_phi;
4594 /* FORNOW. Currently not supporting the case that an inner-loop
4595 reduction is not used in the outer-loop (but only outside the
4596 outer-loop), unless it is double reduction. */
4597 gcc_assert ((STMT_VINFO_RELEVANT_P (exit_phi_vinfo)
4598 && !STMT_VINFO_LIVE_P (exit_phi_vinfo))
4599 || double_reduc);
4601 if (double_reduc)
4602 STMT_VINFO_VEC_STMT (exit_phi_vinfo) = inner_phi;
4603 else
4604 STMT_VINFO_VEC_STMT (exit_phi_vinfo) = epilog_stmt;
4605 if (!double_reduc
4606 || STMT_VINFO_DEF_TYPE (exit_phi_vinfo)
4607 != vect_double_reduction_def)
4608 continue;
4610 /* Handle double reduction:
4612 stmt1: s1 = phi <s0, s2> - double reduction phi (outer loop)
4613 stmt2: s3 = phi <s1, s4> - (regular) reduc phi (inner loop)
4614 stmt3: s4 = use (s3) - (regular) reduc stmt (inner loop)
4615 stmt4: s2 = phi <s4> - double reduction stmt (outer loop)
4617 At that point the regular reduction (stmt2 and stmt3) is
4618 already vectorized, as well as the exit phi node, stmt4.
4619 Here we vectorize the phi node of double reduction, stmt1, and
4620 update all relevant statements. */
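/* Editorial sketch, not part of the original file: the scalar shape of a
   double reduction as described above.  The s/t names map onto stmt1-stmt4:
   s carries the outer-loop cycle (stmt1/stmt4), t the inner-loop cycle
   (stmt2/stmt3).  Array bounds are illustrative.  */

static int
double_reduction_sketch (int a[4][8])
{
  int s = 0;                    /* stmt1: s1 = phi <s0, s2>  (outer loop) */
  for (int i = 0; i < 4; i++)
    {
      int t = s;                /* stmt2: s3 = phi <s1, s4>  (inner loop) */
      for (int j = 0; j < 8; j++)
        t = t + a[i][j];        /* stmt3: s4 = use (s3) */
      s = t;                    /* stmt4: s2 = phi <s4>  (outer loop) */
    }
  return s;
}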
4622 /* Go through all the uses of s2 to find double reduction phi
4623 node, i.e., stmt1 above. */
4624 orig_name = PHI_RESULT (exit_phi);
4625 FOR_EACH_IMM_USE_STMT (use_stmt, imm_iter, orig_name)
4627 stmt_vec_info use_stmt_vinfo;
4628 stmt_vec_info new_phi_vinfo;
4629 tree vect_phi_init, preheader_arg, vect_phi_res, init_def;
4630 basic_block bb = gimple_bb (use_stmt);
4631 gimple use;
4633 /* Check that USE_STMT is really double reduction phi
4634 node. */
4635 if (gimple_code (use_stmt) != GIMPLE_PHI
4636 || gimple_phi_num_args (use_stmt) != 2
4637 || bb->loop_father != outer_loop)
4638 continue;
4639 use_stmt_vinfo = vinfo_for_stmt (use_stmt);
4640 if (!use_stmt_vinfo
4641 || STMT_VINFO_DEF_TYPE (use_stmt_vinfo)
4642 != vect_double_reduction_def)
4643 continue;
4645 /* Create vector phi node for double reduction:
4646 vs1 = phi <vs0, vs2>
4647 vs1 was created previously in this function by a call to
4648 vect_get_vec_def_for_operand and is stored in
4649 vec_initial_def;
4650 vs2 is defined by INNER_PHI, the vectorized EXIT_PHI;
4651 vs0 is created here. */
4653 /* Create vector phi node. */
4654 vect_phi = create_phi_node (vec_initial_def, bb);
4655 new_phi_vinfo = new_stmt_vec_info (vect_phi,
4656 loop_vec_info_for_loop (outer_loop), NULL);
4657 set_vinfo_for_stmt (vect_phi, new_phi_vinfo);
4659 /* Create vs0 - initial def of the double reduction phi. */
4660 preheader_arg = PHI_ARG_DEF_FROM_EDGE (use_stmt,
4661 loop_preheader_edge (outer_loop));
4662 init_def = get_initial_def_for_reduction (stmt,
4663 preheader_arg, NULL);
4664 vect_phi_init = vect_init_vector (use_stmt, init_def,
4665 vectype, NULL);
4667 /* Update phi node arguments with vs0 and vs2. */
4668 add_phi_arg (vect_phi, vect_phi_init,
4669 loop_preheader_edge (outer_loop),
4670 UNKNOWN_LOCATION);
4671 add_phi_arg (vect_phi, PHI_RESULT (inner_phi),
4672 loop_latch_edge (outer_loop), UNKNOWN_LOCATION);
4673 if (dump_enabled_p ())
4675 dump_printf_loc (MSG_NOTE, vect_location,
4676 "created double reduction phi node: ");
4677 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, vect_phi, 0);
4678 dump_printf (MSG_NOTE, "\n");
4681 vect_phi_res = PHI_RESULT (vect_phi);
4683 /* Replace the use, i.e., set the correct vs1 in the regular
4684 reduction phi node. FORNOW, NCOPIES is always 1, so the
4685 loop is redundant. */
4686 use = reduction_phi;
4687 for (j = 0; j < ncopies; j++)
4689 edge pr_edge = loop_preheader_edge (loop);
4690 SET_PHI_ARG_DEF (use, pr_edge->dest_idx, vect_phi_res);
4691 use = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (use));
4697 phis.release ();
4698 if (nested_in_vect_loop)
4700 if (double_reduc)
4701 loop = outer_loop;
4702 else
4703 continue;
4706 phis.create (3);
4707 /* Find the loop-closed-use at the loop exit of the original scalar
4708 result. (The reduction result is expected to have two immediate uses,
4709 one at the latch block, and one at the loop exit). For double
4710 reductions we are looking for exit phis of the outer loop. */
4711 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, scalar_dest)
4713 if (!flow_bb_inside_loop_p (loop, gimple_bb (USE_STMT (use_p))))
4715 if (!is_gimple_debug (USE_STMT (use_p)))
4716 phis.safe_push (USE_STMT (use_p));
4718 else
4720 if (double_reduc && gimple_code (USE_STMT (use_p)) == GIMPLE_PHI)
4722 tree phi_res = PHI_RESULT (USE_STMT (use_p));
4724 FOR_EACH_IMM_USE_FAST (phi_use_p, phi_imm_iter, phi_res)
4726 if (!flow_bb_inside_loop_p (loop,
4727 gimple_bb (USE_STMT (phi_use_p)))
4728 && !is_gimple_debug (USE_STMT (phi_use_p)))
4729 phis.safe_push (USE_STMT (phi_use_p));
4735 FOR_EACH_VEC_ELT (phis, i, exit_phi)
4737 /* Replace the uses: */
4738 orig_name = PHI_RESULT (exit_phi);
4739 scalar_result = scalar_results[k];
4740 FOR_EACH_IMM_USE_STMT (use_stmt, imm_iter, orig_name)
4741 FOR_EACH_IMM_USE_ON_STMT (use_p, imm_iter)
4742 SET_USE (use_p, scalar_result);
4745 phis.release ();
4750 /* Function vectorizable_reduction.
4752 Check if STMT performs a reduction operation that can be vectorized.
4753 If VEC_STMT is also passed, vectorize the STMT: create a vectorized
4754 stmt to replace it, put it in VEC_STMT, and insert it at GSI.
4755 Return FALSE if not a vectorizable STMT, TRUE otherwise.
4757 This function also handles reduction idioms (patterns) that have been
4758 recognized in advance during vect_pattern_recog. In this case, STMT may be
4759 of this form:
4760 X = pattern_expr (arg0, arg1, ..., X)
4761 and it's STMT_VINFO_RELATED_STMT points to the last stmt in the original
4762 sequence that had been detected and replaced by the pattern-stmt (STMT).
4764 In some cases of reduction patterns, the type of the reduction variable X is
4765 different than the type of the other arguments of STMT.
4766 In such cases, the vectype that is used when transforming STMT into a vector
4767 stmt is different than the vectype that is used to determine the
4768 vectorization factor, because it consists of a different number of elements
4769 than the actual number of elements that are being operated upon in parallel.
4771 For example, consider an accumulation of shorts into an int accumulator.
4772 On some targets it's possible to vectorize this pattern operating on 8
4773 shorts at a time (hence, the vectype for purposes of determining the
4774 vectorization factor should be V8HI); on the other hand, the vectype that
4775 is used to create the vector form is actually V4SI (the type of the result).
4777 Upon entry to this function, STMT_VINFO_VECTYPE records the vectype that
4778 indicates what is the actual level of parallelism (V8HI in the example), so
4779 that the right vectorization factor would be derived. This vectype
4780 corresponds to the type of arguments to the reduction stmt, and should *NOT*
4781 be used to create the vectorized stmt. The right vectype for the vectorized
4782 stmt is obtained from the type of the result X:
4783 get_vectype_for_scalar_type (TREE_TYPE (X))
4785 This means that, contrary to "regular" reductions (or "regular" stmts in
4786 general), the following equation:
4787 STMT_VINFO_VECTYPE == get_vectype_for_scalar_type (TREE_TYPE (X))
4788 does *NOT* necessarily hold for reduction patterns. */
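/* Editorial sketch, not part of the original file: the scalar idiom behind
   the short-into-int accumulation discussed above.  The loop consumes 8
   shorts per vector statement (so vectype_in V8HI determines the
   vectorization factor), while the vectorized statement produces ints
   (vectype_out V4SI).  The function name is illustrative.  */

static int
widen_sum_pattern_sketch (const short *a, int n)
{
  int acc = 0;
  for (int i = 0; i < n; i++)
    acc += a[i];   /* recognized as acc = widen_sum <a[i], acc> */
  return acc;
}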
4790 bool
4791 vectorizable_reduction (gimple stmt, gimple_stmt_iterator *gsi,
4792 gimple *vec_stmt, slp_tree slp_node)
4794 tree vec_dest;
4795 tree scalar_dest;
4796 tree loop_vec_def0 = NULL_TREE, loop_vec_def1 = NULL_TREE;
4797 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
4798 tree vectype_out = STMT_VINFO_VECTYPE (stmt_info);
4799 tree vectype_in = NULL_TREE;
4800 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
4801 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
4802 enum tree_code code, orig_code, epilog_reduc_code;
4803 machine_mode vec_mode;
4804 int op_type;
4805 optab optab, reduc_optab;
4806 tree new_temp = NULL_TREE;
4807 tree def;
4808 gimple def_stmt;
4809 enum vect_def_type dt;
4810 gphi *new_phi = NULL;
4811 tree scalar_type;
4812 bool is_simple_use;
4813 gimple orig_stmt;
4814 stmt_vec_info orig_stmt_info;
4815 tree expr = NULL_TREE;
4816 int i;
4817 int ncopies;
4818 int epilog_copies;
4819 stmt_vec_info prev_stmt_info, prev_phi_info;
4820 bool single_defuse_cycle = false;
4821 tree reduc_def = NULL_TREE;
4822 gimple new_stmt = NULL;
4823 int j;
4824 tree ops[3];
4825 bool nested_cycle = false, found_nested_cycle_def = false;
4826 gimple reduc_def_stmt = NULL;
4827 /* The default is that the reduction variable is the last in statement. */
4828 int reduc_index = 2;
4829 bool double_reduc = false, dummy;
4830 basic_block def_bb;
4831 struct loop * def_stmt_loop, *outer_loop = NULL;
4832 tree def_arg;
4833 gimple def_arg_stmt;
4834 auto_vec<tree> vec_oprnds0;
4835 auto_vec<tree> vec_oprnds1;
4836 auto_vec<tree> vect_defs;
4837 auto_vec<gimple> phis;
4838 int vec_num;
4839 tree def0, def1, tem, op0, op1 = NULL_TREE;
4841 /* In case of reduction chain we switch to the first stmt in the chain, but
4842 we don't update STMT_INFO, since only the last stmt is marked as reduction
4843 and has reduction properties. */
4844 if (GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)))
4845 stmt = GROUP_FIRST_ELEMENT (stmt_info);
4847 if (nested_in_vect_loop_p (loop, stmt))
4849 outer_loop = loop;
4850 loop = loop->inner;
4851 nested_cycle = true;
4854 /* 1. Is vectorizable reduction? */
4855 /* Not supportable if the reduction variable is used in the loop, unless
4856 it's a reduction chain. */
4857 if (STMT_VINFO_RELEVANT (stmt_info) > vect_used_in_outer
4858 && !GROUP_FIRST_ELEMENT (stmt_info))
4859 return false;
4861 /* Reductions that are not used even in an enclosing outer-loop
4862 are expected to be "live" (used out of the loop). */
4863 if (STMT_VINFO_RELEVANT (stmt_info) == vect_unused_in_scope
4864 && !STMT_VINFO_LIVE_P (stmt_info))
4865 return false;
4867 /* Make sure it was already recognized as a reduction computation. */
4868 if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_reduction_def
4869 && STMT_VINFO_DEF_TYPE (stmt_info) != vect_nested_cycle)
4870 return false;
4872 /* 2. Has this been recognized as a reduction pattern?
4874 Check if STMT represents a pattern that has been recognized
4875 in earlier analysis stages. For stmts that represent a pattern,
4876 the STMT_VINFO_RELATED_STMT field records the last stmt in
4877 the original sequence that constitutes the pattern. */
4879 orig_stmt = STMT_VINFO_RELATED_STMT (stmt_info);
4880 if (orig_stmt)
4882 orig_stmt_info = vinfo_for_stmt (orig_stmt);
4883 gcc_assert (STMT_VINFO_IN_PATTERN_P (orig_stmt_info));
4884 gcc_assert (!STMT_VINFO_IN_PATTERN_P (stmt_info));
4887 /* 3. Check the operands of the operation. The first operands are defined
4888 inside the loop body. The last operand is the reduction variable,
4889 which is defined by the loop-header-phi. */
4891 gcc_assert (is_gimple_assign (stmt));
4893 /* Flatten RHS. */
4894 switch (get_gimple_rhs_class (gimple_assign_rhs_code (stmt)))
4896 case GIMPLE_SINGLE_RHS:
4897 op_type = TREE_OPERAND_LENGTH (gimple_assign_rhs1 (stmt));
4898 if (op_type == ternary_op)
4900 tree rhs = gimple_assign_rhs1 (stmt);
4901 ops[0] = TREE_OPERAND (rhs, 0);
4902 ops[1] = TREE_OPERAND (rhs, 1);
4903 ops[2] = TREE_OPERAND (rhs, 2);
4904 code = TREE_CODE (rhs);
4906 else
4907 return false;
4908 break;
4910 case GIMPLE_BINARY_RHS:
4911 code = gimple_assign_rhs_code (stmt);
4912 op_type = TREE_CODE_LENGTH (code);
4913 gcc_assert (op_type == binary_op);
4914 ops[0] = gimple_assign_rhs1 (stmt);
4915 ops[1] = gimple_assign_rhs2 (stmt);
4916 break;
4918 case GIMPLE_TERNARY_RHS:
4919 code = gimple_assign_rhs_code (stmt);
4920 op_type = TREE_CODE_LENGTH (code);
4921 gcc_assert (op_type == ternary_op);
4922 ops[0] = gimple_assign_rhs1 (stmt);
4923 ops[1] = gimple_assign_rhs2 (stmt);
4924 ops[2] = gimple_assign_rhs3 (stmt);
4925 break;
4927 case GIMPLE_UNARY_RHS:
4928 return false;
4930 default:
4931 gcc_unreachable ();
4934 if (code == COND_EXPR && slp_node)
4935 return false;
4937 scalar_dest = gimple_assign_lhs (stmt);
4938 scalar_type = TREE_TYPE (scalar_dest);
4939 if (!POINTER_TYPE_P (scalar_type) && !INTEGRAL_TYPE_P (scalar_type)
4940 && !SCALAR_FLOAT_TYPE_P (scalar_type))
4941 return false;
4943 /* Do not try to vectorize bit-precision reductions. */
4944 if ((TYPE_PRECISION (scalar_type)
4945 != GET_MODE_PRECISION (TYPE_MODE (scalar_type))))
4946 return false;
4948 /* All uses but the last are expected to be defined in the loop.
4949 The last use is the reduction variable. In case of nested cycle this
4950 assumption is not true: we use reduc_index to record the index of the
4951 reduction variable. */
4952 for (i = 0; i < op_type - 1; i++)
4954 /* The condition of COND_EXPR is checked in vectorizable_condition(). */
4955 if (i == 0 && code == COND_EXPR)
4956 continue;
4958 is_simple_use = vect_is_simple_use_1 (ops[i], stmt, loop_vinfo, NULL,
4959 &def_stmt, &def, &dt, &tem);
4960 if (!vectype_in)
4961 vectype_in = tem;
4962 gcc_assert (is_simple_use);
4964 if (dt != vect_internal_def
4965 && dt != vect_external_def
4966 && dt != vect_constant_def
4967 && dt != vect_induction_def
4968 && !(dt == vect_nested_cycle && nested_cycle))
4969 return false;
4971 if (dt == vect_nested_cycle)
4973 found_nested_cycle_def = true;
4974 reduc_def_stmt = def_stmt;
4975 reduc_index = i;
4979 is_simple_use = vect_is_simple_use_1 (ops[i], stmt, loop_vinfo, NULL,
4980 &def_stmt, &def, &dt, &tem);
4981 if (!vectype_in)
4982 vectype_in = tem;
4983 gcc_assert (is_simple_use);
4984 if (!(dt == vect_reduction_def
4985 || dt == vect_nested_cycle
4986 || ((dt == vect_internal_def || dt == vect_external_def
4987 || dt == vect_constant_def || dt == vect_induction_def)
4988 && nested_cycle && found_nested_cycle_def)))
4990 /* For pattern recognized stmts, orig_stmt might be a reduction,
4991 but some helper statements for the pattern might not, or
4992 might be COND_EXPRs with reduction uses in the condition. */
4993 gcc_assert (orig_stmt);
4994 return false;
4996 if (!found_nested_cycle_def)
4997 reduc_def_stmt = def_stmt;
4999 gcc_assert (gimple_code (reduc_def_stmt) == GIMPLE_PHI);
5000 if (orig_stmt)
5001 gcc_assert (orig_stmt == vect_is_simple_reduction (loop_vinfo,
5002 reduc_def_stmt,
5003 !nested_cycle,
5004 &dummy));
5005 else
5007 gimple tmp = vect_is_simple_reduction (loop_vinfo, reduc_def_stmt,
5008 !nested_cycle, &dummy);
5009 /* We changed STMT to be the first stmt in reduction chain, hence we
5010 check that in this case the first element in the chain is STMT. */
5011 gcc_assert (stmt == tmp
5012 || GROUP_FIRST_ELEMENT (vinfo_for_stmt (tmp)) == stmt);
5015 if (STMT_VINFO_LIVE_P (vinfo_for_stmt (reduc_def_stmt)))
5016 return false;
5018 if (slp_node || PURE_SLP_STMT (stmt_info))
5019 ncopies = 1;
5020 else
5021 ncopies = (LOOP_VINFO_VECT_FACTOR (loop_vinfo)
5022 / TYPE_VECTOR_SUBPARTS (vectype_in));
5024 gcc_assert (ncopies >= 1);
5026 vec_mode = TYPE_MODE (vectype_in);
5028 if (code == COND_EXPR)
5030 if (!vectorizable_condition (stmt, gsi, NULL, ops[reduc_index], 0, NULL))
5032 if (dump_enabled_p ())
5033 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5034 "unsupported condition in reduction\n");
5036 return false;
5039 else
5041 /* 4. Supportable by target? */
5043 if (code == LSHIFT_EXPR || code == RSHIFT_EXPR
5044 || code == LROTATE_EXPR || code == RROTATE_EXPR)
5046 /* Shifts and rotates are only supported by vectorizable_shift,
5047 not by vectorizable_reduction. */
5048 if (dump_enabled_p ())
5049 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5050 "unsupported shift or rotation.\n");
5051 return false;
5054 /* 4.1. check support for the operation in the loop */
5055 optab = optab_for_tree_code (code, vectype_in, optab_default);
5056 if (!optab)
5058 if (dump_enabled_p ())
5059 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5060 "no optab.\n");
5062 return false;
5065 if (optab_handler (optab, vec_mode) == CODE_FOR_nothing)
5067 if (dump_enabled_p ())
5068 dump_printf (MSG_NOTE, "op not supported by target.\n");
5070 if (GET_MODE_SIZE (vec_mode) != UNITS_PER_WORD
5071 || LOOP_VINFO_VECT_FACTOR (loop_vinfo)
5072 < vect_min_worthwhile_factor (code))
5073 return false;
5075 if (dump_enabled_p ())
5076 dump_printf (MSG_NOTE, "proceeding using word mode.\n");
5079 /* Worthwhile without SIMD support? */
5080 if (!VECTOR_MODE_P (TYPE_MODE (vectype_in))
5081 && LOOP_VINFO_VECT_FACTOR (loop_vinfo)
5082 < vect_min_worthwhile_factor (code))
5084 if (dump_enabled_p ())
5085 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5086 "not worthwhile without SIMD support.\n");
5088 return false;
5092 /* 4.2. Check support for the epilog operation.
5094 If STMT represents a reduction pattern, then the type of the
5095 reduction variable may be different than the type of the rest
5096 of the arguments. For example, consider the case of accumulation
5097 of shorts into an int accumulator. The original code:
5098 S1: int_a = (int) short_a;
5099 orig_stmt-> S2: int_acc = plus <int_a ,int_acc>;
5101 was replaced with:
5102 STMT: int_acc = widen_sum <short_a, int_acc>
5104 This means that:
5105 1. The tree-code that is used to create the vector operation in the
5106 epilog code (that reduces the partial results) is not the
5107 tree-code of STMT, but is rather the tree-code of the original
5108 stmt from the pattern that STMT is replacing. I.e, in the example
5109 above we want to use 'widen_sum' in the loop, but 'plus' in the
5110 epilog.
5111 2. The type (mode) we use to check available target support
5112 for the vector operation to be created in the *epilog*, is
5113 determined by the type of the reduction variable (in the example
5114 above we'd check this: optab_handler (plus_optab, vect_int_mode])).
5115 However the type (mode) we use to check available target support
5116 for the vector operation to be created *inside the loop*, is
5117 determined by the type of the other arguments to STMT (in the
5118 example we'd check this: optab_handler (widen_sum_optab,
5119 vect_short_mode)).
5121 This is contrary to "regular" reductions, in which the types of all
5122 the arguments are the same as the type of the reduction variable.
5123 For "regular" reductions we can therefore use the same vector type
5124 (and also the same tree-code) when generating the epilog code and
5125 when generating the code inside the loop. */
5127 if (orig_stmt)
5129 /* This is a reduction pattern: get the vectype from the type of the
5130 reduction variable, and get the tree-code from orig_stmt. */
5131 orig_code = gimple_assign_rhs_code (orig_stmt);
5132 gcc_assert (vectype_out);
5133 vec_mode = TYPE_MODE (vectype_out);
5135 else
5137 /* Regular reduction: the same vectype and tree-code as used for
5138 the vector code inside the loop can be used for the epilog code. */
5139 orig_code = code;
5142 if (nested_cycle)
5144 def_bb = gimple_bb (reduc_def_stmt);
5145 def_stmt_loop = def_bb->loop_father;
5146 def_arg = PHI_ARG_DEF_FROM_EDGE (reduc_def_stmt,
5147 loop_preheader_edge (def_stmt_loop));
5148 if (TREE_CODE (def_arg) == SSA_NAME
5149 && (def_arg_stmt = SSA_NAME_DEF_STMT (def_arg))
5150 && gimple_code (def_arg_stmt) == GIMPLE_PHI
5151 && flow_bb_inside_loop_p (outer_loop, gimple_bb (def_arg_stmt))
5152 && vinfo_for_stmt (def_arg_stmt)
5153 && STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def_arg_stmt))
5154 == vect_double_reduction_def)
5155 double_reduc = true;
5158 epilog_reduc_code = ERROR_MARK;
5159 if (reduction_code_for_scalar_code (orig_code, &epilog_reduc_code))
5161 reduc_optab = optab_for_tree_code (epilog_reduc_code, vectype_out,
5162 optab_default);
5163 if (!reduc_optab)
5165 if (dump_enabled_p ())
5166 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5167 "no optab for reduction.\n");
5169 epilog_reduc_code = ERROR_MARK;
5171 else if (optab_handler (reduc_optab, vec_mode) == CODE_FOR_nothing)
5173 optab = scalar_reduc_to_vector (reduc_optab, vectype_out);
5174 if (optab_handler (optab, vec_mode) == CODE_FOR_nothing)
5176 if (dump_enabled_p ())
5177 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5178 "reduc op not supported by target.\n");
5180 epilog_reduc_code = ERROR_MARK;
5184 else
5186 if (!nested_cycle || double_reduc)
5188 if (dump_enabled_p ())
5189 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5190 "no reduc code for scalar code.\n");
5192 return false;
5196 if (double_reduc && ncopies > 1)
5198 if (dump_enabled_p ())
5199 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5200 "multiple types in double reduction\n");
5202 return false;
5205 /* In case of widening multiplication by a constant, we update the type
5206 of the constant to be the type of the other operand. We check that the
5207 constant fits the type in the pattern recognition pass. */
5208 if (code == DOT_PROD_EXPR
5209 && !types_compatible_p (TREE_TYPE (ops[0]), TREE_TYPE (ops[1])))
5211 if (TREE_CODE (ops[0]) == INTEGER_CST)
5212 ops[0] = fold_convert (TREE_TYPE (ops[1]), ops[0]);
5213 else if (TREE_CODE (ops[1]) == INTEGER_CST)
5214 ops[1] = fold_convert (TREE_TYPE (ops[0]), ops[1]);
5215 else
5217 if (dump_enabled_p ())
5218 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5219 "invalid types in dot-prod\n");
5221 return false;
5225 if (!vec_stmt) /* transformation not required. */
5227 if (!vect_model_reduction_cost (stmt_info, epilog_reduc_code, ncopies))
5228 return false;
5229 STMT_VINFO_TYPE (stmt_info) = reduc_vec_info_type;
5230 return true;
5233 /** Transform. **/
5235 if (dump_enabled_p ())
5236 dump_printf_loc (MSG_NOTE, vect_location, "transform reduction.\n");
5238 /* FORNOW: Multiple types are not supported for condition. */
5239 if (code == COND_EXPR)
5240 gcc_assert (ncopies == 1);
5242 /* Create the destination vector */
5243 vec_dest = vect_create_destination_var (scalar_dest, vectype_out);
5245 /* In case the vectorization factor (VF) is bigger than the number
5246 of elements that we can fit in a vectype (nunits), we have to generate
5247 more than one vector stmt, i.e., we need to "unroll" the
5248 vector stmt by a factor VF/nunits. For more details see documentation
5249 in vectorizable_operation. */
5251 /* If the reduction is used in an outer loop we need to generate
5252 VF intermediate results, like so (e.g. for ncopies=2):
5253 r0 = phi (init, r0)
5254 r1 = phi (init, r1)
5255 r0 = x0 + r0;
5256 r1 = x1 + r1;
5257 (i.e. we generate VF results in 2 registers).
5258 In this case we have a separate def-use cycle for each copy, and therefore
5259 for each copy we get the vector def for the reduction variable from the
5260 respective phi node created for this copy.
5262 Otherwise (the reduction is unused in the loop nest), we can combine
5263 together intermediate results, like so (e.g. for ncopies=2):
5264 r = phi (init, r)
5265 r = x0 + r;
5266 r = x1 + r;
5267 (i.e. we generate VF/2 results in a single register).
5268 In this case for each copy we get the vector def for the reduction variable
5269 from the vectorized reduction operation generated in the previous iteration.
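/* Editorial sketch, not part of the original file: the two unrolling shapes
   described above for ncopies=2, with scalar stand-ins x0/x1 for the two
   vector defs.  In the outer-loop case the second accumulator starts from
   the neutral element here so that both variants total the same; the real
   code leaves that split to the initial-value and epilog handling.  */

static void
reduction_copies_sketch (int init, int x0, int x1, int result[2])
{
  /* Used in an outer loop: a separate def-use cycle per copy.  */
  int r0 = init + x0;
  int r1 = 0 + x1;
  result[0] = r0 + r1;          /* combined later, in the epilog */

  /* Unused in the loop nest: one def-use cycle chaining both copies.  */
  int r = init;
  r = x0 + r;
  r = x1 + r;
  result[1] = r;                /* equals result[0] for this split */
}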
5272 if (STMT_VINFO_RELEVANT (stmt_info) == vect_unused_in_scope)
5274 single_defuse_cycle = true;
5275 epilog_copies = 1;
5277 else
5278 epilog_copies = ncopies;
5280 prev_stmt_info = NULL;
5281 prev_phi_info = NULL;
5282 if (slp_node)
5284 vec_num = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node);
5285 gcc_assert (TYPE_VECTOR_SUBPARTS (vectype_out)
5286 == TYPE_VECTOR_SUBPARTS (vectype_in));
5288 else
5290 vec_num = 1;
5291 vec_oprnds0.create (1);
5292 if (op_type == ternary_op)
5293 vec_oprnds1.create (1);
5296 phis.create (vec_num);
5297 vect_defs.create (vec_num);
5298 if (!slp_node)
5299 vect_defs.quick_push (NULL_TREE);
5301 for (j = 0; j < ncopies; j++)
5303 if (j == 0 || !single_defuse_cycle)
5305 for (i = 0; i < vec_num; i++)
5307 /* Create the reduction-phi that defines the reduction
5308 operand. */
5309 new_phi = create_phi_node (vec_dest, loop->header);
5310 set_vinfo_for_stmt (new_phi,
5311 new_stmt_vec_info (new_phi, loop_vinfo,
5312 NULL));
5313 if (j == 0 || slp_node)
5314 phis.quick_push (new_phi);
5318 if (code == COND_EXPR)
5320 gcc_assert (!slp_node);
5321 vectorizable_condition (stmt, gsi, vec_stmt,
5322 PHI_RESULT (phis[0]),
5323 reduc_index, NULL);
5324 /* Multiple types are not supported for condition. */
5325 break;
5328 /* Handle uses. */
5329 if (j == 0)
5331 op0 = ops[!reduc_index];
5332 if (op_type == ternary_op)
5334 if (reduc_index == 0)
5335 op1 = ops[2];
5336 else
5337 op1 = ops[1];
5340 if (slp_node)
5341 vect_get_vec_defs (op0, op1, stmt, &vec_oprnds0, &vec_oprnds1,
5342 slp_node, -1);
5343 else
5345 loop_vec_def0 = vect_get_vec_def_for_operand (ops[!reduc_index],
5346 stmt, NULL);
5347 vec_oprnds0.quick_push (loop_vec_def0);
5348 if (op_type == ternary_op)
5350 loop_vec_def1 = vect_get_vec_def_for_operand (op1, stmt,
5351 NULL);
5352 vec_oprnds1.quick_push (loop_vec_def1);
5356 else
5358 if (!slp_node)
5360 enum vect_def_type dt;
5361 gimple dummy_stmt;
5362 tree dummy;
5364 vect_is_simple_use (ops[!reduc_index], stmt, loop_vinfo, NULL,
5365 &dummy_stmt, &dummy, &dt);
5366 loop_vec_def0 = vect_get_vec_def_for_stmt_copy (dt,
5367 loop_vec_def0);
5368 vec_oprnds0[0] = loop_vec_def0;
5369 if (op_type == ternary_op)
5371 vect_is_simple_use (op1, stmt, loop_vinfo, NULL, &dummy_stmt,
5372 &dummy, &dt);
5373 loop_vec_def1 = vect_get_vec_def_for_stmt_copy (dt,
5374 loop_vec_def1);
5375 vec_oprnds1[0] = loop_vec_def1;
5379 if (single_defuse_cycle)
5380 reduc_def = gimple_assign_lhs (new_stmt);
5382 STMT_VINFO_RELATED_STMT (prev_phi_info) = new_phi;
5385 FOR_EACH_VEC_ELT (vec_oprnds0, i, def0)
5387 if (slp_node)
5388 reduc_def = PHI_RESULT (phis[i]);
5389 else
5391 if (!single_defuse_cycle || j == 0)
5392 reduc_def = PHI_RESULT (new_phi);
5395 def1 = ((op_type == ternary_op)
5396 ? vec_oprnds1[i] : NULL);
5397 if (op_type == binary_op)
5399 if (reduc_index == 0)
5400 expr = build2 (code, vectype_out, reduc_def, def0);
5401 else
5402 expr = build2 (code, vectype_out, def0, reduc_def);
5404 else
5406 if (reduc_index == 0)
5407 expr = build3 (code, vectype_out, reduc_def, def0, def1);
5408 else
5410 if (reduc_index == 1)
5411 expr = build3 (code, vectype_out, def0, reduc_def, def1);
5412 else
5413 expr = build3 (code, vectype_out, def0, def1, reduc_def);
5417 new_stmt = gimple_build_assign (vec_dest, expr);
5418 new_temp = make_ssa_name (vec_dest, new_stmt);
5419 gimple_assign_set_lhs (new_stmt, new_temp);
5420 vect_finish_stmt_generation (stmt, new_stmt, gsi);
5422 if (slp_node)
5424 SLP_TREE_VEC_STMTS (slp_node).quick_push (new_stmt);
5425 vect_defs.quick_push (new_temp);
5427 else
5428 vect_defs[0] = new_temp;
5431 if (slp_node)
5432 continue;
5434 if (j == 0)
5435 STMT_VINFO_VEC_STMT (stmt_info) = *vec_stmt = new_stmt;
5436 else
5437 STMT_VINFO_RELATED_STMT (prev_stmt_info) = new_stmt;
5439 prev_stmt_info = vinfo_for_stmt (new_stmt);
5440 prev_phi_info = vinfo_for_stmt (new_phi);
5443 /* Finalize the reduction-phi (set its arguments) and create the
5444 epilog reduction code. */
5445 if ((!single_defuse_cycle || code == COND_EXPR) && !slp_node)
5447 new_temp = gimple_assign_lhs (*vec_stmt);
5448 vect_defs[0] = new_temp;
5451 vect_create_epilog_for_reduction (vect_defs, stmt, epilog_copies,
5452 epilog_reduc_code, phis, reduc_index,
5453 double_reduc, slp_node);
5455 return true;
5458 /* Function vect_min_worthwhile_factor.
5460 For a loop where we could vectorize the operation indicated by CODE,
5461 return the minimum vectorization factor that makes it worthwhile
5462 to use generic vectors. */
5464 vect_min_worthwhile_factor (enum tree_code code)
5466 switch (code)
5468 case PLUS_EXPR:
5469 case MINUS_EXPR:
5470 case NEGATE_EXPR:
5471 return 4;
5473 case BIT_AND_EXPR:
5474 case BIT_IOR_EXPR:
5475 case BIT_XOR_EXPR:
5476 case BIT_NOT_EXPR:
5477 return 2;
5479 default:
5480 return INT_MAX;
5485 /* Function vectorizable_induction
5487 Check if PHI performs an induction computation that can be vectorized.
5488 If VEC_STMT is also passed, vectorize the induction PHI: create a vectorized
5489 phi to replace it, put it in VEC_STMT, and add it to the same basic block.
5490 Return FALSE if not a vectorizable STMT, TRUE otherwise. */
5492 bool
5493 vectorizable_induction (gimple phi, gimple_stmt_iterator *gsi ATTRIBUTE_UNUSED,
5494 gimple *vec_stmt)
5496 stmt_vec_info stmt_info = vinfo_for_stmt (phi);
5497 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
5498 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
5499 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
5500 int nunits = TYPE_VECTOR_SUBPARTS (vectype);
5501 int ncopies = LOOP_VINFO_VECT_FACTOR (loop_vinfo) / nunits;
5502 tree vec_def;
5504 gcc_assert (ncopies >= 1);
5505 /* FORNOW. These restrictions should be relaxed. */
5506 if (nested_in_vect_loop_p (loop, phi))
5508 imm_use_iterator imm_iter;
5509 use_operand_p use_p;
5510 gimple exit_phi;
5511 edge latch_e;
5512 tree loop_arg;
5514 if (ncopies > 1)
5516 if (dump_enabled_p ())
5517 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5518 "multiple types in nested loop.\n");
5519 return false;
5522 exit_phi = NULL;
5523 latch_e = loop_latch_edge (loop->inner);
5524 loop_arg = PHI_ARG_DEF_FROM_EDGE (phi, latch_e);
5525 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, loop_arg)
5527 gimple use_stmt = USE_STMT (use_p);
5528 if (is_gimple_debug (use_stmt))
5529 continue;
5531 if (!flow_bb_inside_loop_p (loop->inner, gimple_bb (use_stmt)))
5533 exit_phi = use_stmt;
5534 break;
5537 if (exit_phi)
5539 stmt_vec_info exit_phi_vinfo = vinfo_for_stmt (exit_phi);
5540 if (!(STMT_VINFO_RELEVANT_P (exit_phi_vinfo)
5541 && !STMT_VINFO_LIVE_P (exit_phi_vinfo)))
5543 if (dump_enabled_p ())
5544 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5545 "inner-loop induction only used outside "
5546 "of the outer vectorized loop.\n");
5547 return false;
5552 if (!STMT_VINFO_RELEVANT_P (stmt_info))
5553 return false;
5555 /* FORNOW: SLP not supported. */
5556 if (STMT_SLP_TYPE (stmt_info))
5557 return false;
5559 gcc_assert (STMT_VINFO_DEF_TYPE (stmt_info) == vect_induction_def);
5561 if (gimple_code (phi) != GIMPLE_PHI)
5562 return false;
5564 if (!vec_stmt) /* transformation not required. */
5566 STMT_VINFO_TYPE (stmt_info) = induc_vec_info_type;
5567 if (dump_enabled_p ())
5568 dump_printf_loc (MSG_NOTE, vect_location,
5569 "=== vectorizable_induction ===\n");
5570 vect_model_induction_cost (stmt_info, ncopies);
5571 return true;
5574 /** Transform. **/
5576 if (dump_enabled_p ())
5577 dump_printf_loc (MSG_NOTE, vect_location, "transform induction phi.\n");
5579 vec_def = get_initial_def_for_induction (phi);
5580 *vec_stmt = SSA_NAME_DEF_STMT (vec_def);
5581 return true;
5584 /* Function vectorizable_live_operation.
5586 STMT computes a value that is used outside the loop. Check if
5587 it can be supported. */
5589 bool
5590 vectorizable_live_operation (gimple stmt,
5591 gimple_stmt_iterator *gsi ATTRIBUTE_UNUSED,
5592 gimple *vec_stmt)
5594 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
5595 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
5596 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
5597 int i;
5598 int op_type;
5599 tree op;
5600 tree def;
5601 gimple def_stmt;
5602 enum vect_def_type dt;
5603 enum tree_code code;
5604 enum gimple_rhs_class rhs_class;
5606 gcc_assert (STMT_VINFO_LIVE_P (stmt_info));
5608 if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_reduction_def)
5609 return false;
5611 if (!is_gimple_assign (stmt))
5613 if (gimple_call_internal_p (stmt)
5614 && gimple_call_internal_fn (stmt) == IFN_GOMP_SIMD_LANE
5615 && gimple_call_lhs (stmt)
5616 && loop->simduid
5617 && TREE_CODE (gimple_call_arg (stmt, 0)) == SSA_NAME
5618 && loop->simduid
5619 == SSA_NAME_VAR (gimple_call_arg (stmt, 0)))
5621 edge e = single_exit (loop);
5622 basic_block merge_bb = e->dest;
5623 imm_use_iterator imm_iter;
5624 use_operand_p use_p;
5625 tree lhs = gimple_call_lhs (stmt);
5627 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, lhs)
5629 gimple use_stmt = USE_STMT (use_p);
5630 if (gimple_code (use_stmt) == GIMPLE_PHI
5631 && gimple_bb (use_stmt) == merge_bb)
5633 if (vec_stmt)
5635 tree vfm1
5636 = build_int_cst (unsigned_type_node,
5637 loop_vinfo->vectorization_factor - 1);
5638 SET_PHI_ARG_DEF (use_stmt, e->dest_idx, vfm1);
5640 return true;
5645 return false;
5648 if (TREE_CODE (gimple_assign_lhs (stmt)) != SSA_NAME)
5649 return false;
5651 /* FORNOW. CHECKME. */
5652 if (nested_in_vect_loop_p (loop, stmt))
5653 return false;
5655 code = gimple_assign_rhs_code (stmt);
5656 op_type = TREE_CODE_LENGTH (code);
5657 rhs_class = get_gimple_rhs_class (code);
5658 gcc_assert (rhs_class != GIMPLE_UNARY_RHS || op_type == unary_op);
5659 gcc_assert (rhs_class != GIMPLE_BINARY_RHS || op_type == binary_op);
5661 /* FORNOW: support only if all uses are invariant. This means
5662 that the scalar operations can remain in place, unvectorized.
5663 The original last scalar value that they compute will be used. */
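/* For example (illustrative, not taken from this function): in

       for (i = 0; i < n; i++)
         {
           t = x + y;
           a[i] = i;
         }
       ... = t;

   where x and y are defined before the loop, the statement computing t is
   live (t is used after the loop) but both of its operands are invariant,
   so it produces the same value in every iteration and can simply be left
   in place as a scalar statement.  */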
5665 for (i = 0; i < op_type; i++)
5667 if (rhs_class == GIMPLE_SINGLE_RHS)
5668 op = TREE_OPERAND (gimple_op (stmt, 1), i);
5669 else
5670 op = gimple_op (stmt, i + 1);
5671 if (op
5672 && !vect_is_simple_use (op, stmt, loop_vinfo, NULL, &def_stmt, &def,
5673 &dt))
5675 if (dump_enabled_p ())
5676 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5677 "use not simple.\n");
5678 return false;
5681 if (dt != vect_external_def && dt != vect_constant_def)
5682 return false;
5685 /* No transformation is required for the cases we currently support. */
5686 return true;
5689 /* Kill any debug uses outside LOOP of SSA names defined in STMT. */
5691 static void
5692 vect_loop_kill_debug_uses (struct loop *loop, gimple stmt)
5694 ssa_op_iter op_iter;
5695 imm_use_iterator imm_iter;
5696 def_operand_p def_p;
5697 gimple ustmt;
5699 FOR_EACH_PHI_OR_STMT_DEF (def_p, stmt, op_iter, SSA_OP_DEF)
5701 FOR_EACH_IMM_USE_STMT (ustmt, imm_iter, DEF_FROM_PTR (def_p))
5703 basic_block bb;
5705 if (!is_gimple_debug (ustmt))
5706 continue;
5708 bb = gimple_bb (ustmt);
5710 if (!flow_bb_inside_loop_p (loop, bb))
5712 if (gimple_debug_bind_p (ustmt))
5714 if (dump_enabled_p ())
5715 dump_printf_loc (MSG_NOTE, vect_location,
5716 "killing debug use\n");
5718 gimple_debug_bind_reset_value (ustmt);
5719 update_stmt (ustmt);
5721 else
5722 gcc_unreachable ();
5729 /* This function builds ni_name = number of iterations. Statements
5730 are emitted on the loop preheader edge. */
5732 static tree
5733 vect_build_loop_niters (loop_vec_info loop_vinfo)
5735 tree ni = unshare_expr (LOOP_VINFO_NITERS (loop_vinfo));
5736 if (TREE_CODE (ni) == INTEGER_CST)
5737 return ni;
5738 else
5740 tree ni_name, var;
5741 gimple_seq stmts = NULL;
5742 edge pe = loop_preheader_edge (LOOP_VINFO_LOOP (loop_vinfo));
5744 var = create_tmp_var (TREE_TYPE (ni), "niters");
5745 ni_name = force_gimple_operand (ni, &stmts, false, var);
5746 if (stmts)
5747 gsi_insert_seq_on_edge_immediate (pe, stmts);
5749 return ni_name;
5754 /* This function generates the following statements:
5756 ni_name = number of iterations loop executes
5757 ratio = ni_name / vf
5758 ratio_mult_vf_name = ratio * vf
5760 and places them on the loop preheader edge. */
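/* Worked example (illustrative numbers): with vf == 4 and ni_name == 1003
   the generated statements compute ratio == 250 and ratio_mult_vf == 1000;
   the remaining 3 iterations are left for the scalar epilog loop.  */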
5762 static void
5763 vect_generate_tmps_on_preheader (loop_vec_info loop_vinfo,
5764 tree ni_name,
5765 tree *ratio_mult_vf_name_ptr,
5766 tree *ratio_name_ptr)
5768 tree ni_minus_gap_name;
5769 tree var;
5770 tree ratio_name;
5771 tree ratio_mult_vf_name;
5772 int vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
5773 edge pe = loop_preheader_edge (LOOP_VINFO_LOOP (loop_vinfo));
5774 tree log_vf;
5776 log_vf = build_int_cst (TREE_TYPE (ni_name), exact_log2 (vf));
5778 /* If epilogue loop is required because of data accesses with gaps, we
5779 subtract one iteration from the total number of iterations here for
5780 correct calculation of RATIO. */
5781 if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo))
5783 ni_minus_gap_name = fold_build2 (MINUS_EXPR, TREE_TYPE (ni_name),
5784 ni_name,
5785 build_one_cst (TREE_TYPE (ni_name)));
5786 if (!is_gimple_val (ni_minus_gap_name))
5788 var = create_tmp_var (TREE_TYPE (ni_name), "ni_gap");
5789 gimple stmts = NULL;
5790 ni_minus_gap_name = force_gimple_operand (ni_minus_gap_name, &stmts,
5791 true, var);
5792 gsi_insert_seq_on_edge_immediate (pe, stmts);
5795 else
5796 ni_minus_gap_name = ni_name;
5798 /* Create: ratio = ni >> log2(vf) */
5799 /* ??? As ni == number of latch executions + 1, ni could have
5800 overflowed to zero. So avoid computing the ratio directly from ni;
5801 instead use the fact that the ratio is known to be at least
5802 one and compute it as (ni - vf) >> log2(vf) + 1. */
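/* A quick sanity check of the formula (illustrative numbers): with vf == 4
   and ni == 1003, ((1003 - 4) >> 2) + 1 == 249 + 1 == 250, i.e. 1003 / 4
   rounded down.  If ni has wrapped to zero in a 32-bit unsigned type (the
   latch executes 2^32 - 1 times), ((0 - 4) >> 2) + 1 == (2^30 - 1) + 1
   == 2^30, which is the correct ratio, whereas computing ni >> log2(vf)
   directly would have produced 0.  */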
5803 ratio_name
5804 = fold_build2 (PLUS_EXPR, TREE_TYPE (ni_name),
5805 fold_build2 (RSHIFT_EXPR, TREE_TYPE (ni_name),
5806 fold_build2 (MINUS_EXPR, TREE_TYPE (ni_name),
5807 ni_minus_gap_name,
5808 build_int_cst
5809 (TREE_TYPE (ni_name), vf)),
5810 log_vf),
5811 build_int_cst (TREE_TYPE (ni_name), 1));
5812 if (!is_gimple_val (ratio_name))
5814 var = create_tmp_var (TREE_TYPE (ni_name), "bnd");
5815 gimple stmts = NULL;
5816 ratio_name = force_gimple_operand (ratio_name, &stmts, true, var);
5817 gsi_insert_seq_on_edge_immediate (pe, stmts);
5819 *ratio_name_ptr = ratio_name;
5821 /* Create: ratio_mult_vf = ratio << log2 (vf). */
5823 if (ratio_mult_vf_name_ptr)
5825 ratio_mult_vf_name = fold_build2 (LSHIFT_EXPR, TREE_TYPE (ratio_name),
5826 ratio_name, log_vf);
5827 if (!is_gimple_val (ratio_mult_vf_name))
5829 var = create_tmp_var (TREE_TYPE (ni_name), "ratio_mult_vf");
5830 gimple stmts = NULL;
5831 ratio_mult_vf_name = force_gimple_operand (ratio_mult_vf_name, &stmts,
5832 true, var);
5833 gsi_insert_seq_on_edge_immediate (pe, stmts);
5835 *ratio_mult_vf_name_ptr = ratio_mult_vf_name;
5838 return;
5842 /* Function vect_transform_loop.
5844 The analysis phase has determined that the loop is vectorizable.
5845 Vectorize the loop: create vectorized stmts to replace the scalar
5846 stmts in the loop, and update the loop exit condition. */
5848 void
5849 vect_transform_loop (loop_vec_info loop_vinfo)
5851 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
5852 basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
5853 int nbbs = loop->num_nodes;
5854 int i;
5855 tree ratio = NULL;
5856 int vectorization_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
5857 bool grouped_store;
5858 bool slp_scheduled = false;
5859 gimple stmt, pattern_stmt;
5860 gimple_seq pattern_def_seq = NULL;
5861 gimple_stmt_iterator pattern_def_si = gsi_none ();
5862 bool transform_pattern_stmt = false;
5863 bool check_profitability = false;
5864 int th;
5865 /* Record number of iterations before we started tampering with the profile. */
5866 gcov_type expected_iterations = expected_loop_iterations_unbounded (loop);
5868 if (dump_enabled_p ())
5869 dump_printf_loc (MSG_NOTE, vect_location, "=== vec_transform_loop ===\n");
5872 /* If the profile is imprecise, we have a chance to fix it up. */
5872 if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
5873 expected_iterations = LOOP_VINFO_INT_NITERS (loop_vinfo);
5875 /* Use the more conservative vectorization threshold. If the number
5876 of iterations is constant, assume the cost check has been performed
5877 by our caller. If the threshold is such that every loop that runs
5878 at least the vectorization factor number of times is profitable,
5879 checking is pointless, too. */
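/* E.g. (illustration only): with a vectorization factor of 4 and a
   threshold of 2, any loop that reaches the vector code at all runs at
   least 4 iterations and therefore already meets the threshold, so no
   runtime check is needed; with a threshold of 10 and an unknown
   iteration count, the check is emitted.  */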
5880 th = LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo);
5881 if (th >= LOOP_VINFO_VECT_FACTOR (loop_vinfo) - 1
5882 && !LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
5884 if (dump_enabled_p ())
5885 dump_printf_loc (MSG_NOTE, vect_location,
5886 "Profitability threshold is %d loop iterations.\n",
5887 th);
5888 check_profitability = true;
5891 /* Version the loop first, if required, so the profitability check
5892 comes first. */
5894 if (LOOP_REQUIRES_VERSIONING_FOR_ALIGNMENT (loop_vinfo)
5895 || LOOP_REQUIRES_VERSIONING_FOR_ALIAS (loop_vinfo))
5897 vect_loop_versioning (loop_vinfo, th, check_profitability);
5898 check_profitability = false;
5901 tree ni_name = vect_build_loop_niters (loop_vinfo);
5902 LOOP_VINFO_NITERS_UNCHANGED (loop_vinfo) = ni_name;
5904 /* Peel the loop if there are data refs with unknown alignment.
5905 Only one data ref with unknown alignment is allowed. */
5907 if (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo))
5909 vect_do_peeling_for_alignment (loop_vinfo, ni_name,
5910 th, check_profitability);
5911 check_profitability = false;
5912 /* The above adjusts LOOP_VINFO_NITERS, so force ni_name to
5913 be recomputed. */
5914 ni_name = NULL_TREE;
5917 /* If the loop has a symbolic number of iterations 'n' (i.e. it's not a
5918 compile-time constant), or it is a constant that is not a multiple of the
5919 vectorization factor, then an epilog loop needs to be created.
5920 We therefore duplicate the loop: the original loop will be vectorized,
5921 and will compute the first (n/VF) iterations. The second copy of the loop
5922 will remain scalar and will compute the remaining (n%VF) iterations.
5923 (VF is the vectorization factor). */
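/* For instance (illustrative numbers): with n == 100 and VF == 8 the
   vectorized loop executes 12 iterations covering the first 96 scalar
   iterations, and the scalar epilog loop executes the remaining 4.  */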
5925 if (LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo)
5926 || LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo))
5928 tree ratio_mult_vf;
5929 if (!ni_name)
5930 ni_name = vect_build_loop_niters (loop_vinfo);
5931 vect_generate_tmps_on_preheader (loop_vinfo, ni_name, &ratio_mult_vf,
5932 &ratio);
5933 vect_do_peeling_for_loop_bound (loop_vinfo, ni_name, ratio_mult_vf,
5934 th, check_profitability);
5936 else if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
5937 ratio = build_int_cst (TREE_TYPE (LOOP_VINFO_NITERS (loop_vinfo)),
5938 LOOP_VINFO_INT_NITERS (loop_vinfo) / vectorization_factor);
5939 else
5941 if (!ni_name)
5942 ni_name = vect_build_loop_niters (loop_vinfo);
5943 vect_generate_tmps_on_preheader (loop_vinfo, ni_name, NULL, &ratio);
5946 /* 1) Make sure the loop header has exactly two entries
5947 2) Make sure we have a preheader basic block. */
5949 gcc_assert (EDGE_COUNT (loop->header->preds) == 2);
5951 split_edge (loop_preheader_edge (loop));
5953 /* FORNOW: the vectorizer supports only loops whose body consists
5954 of one basic block (header + empty latch). When the vectorizer
5955 supports more involved loop forms, the order in which the BBs are
5956 traversed will need to be reconsidered. */
5958 for (i = 0; i < nbbs; i++)
5960 basic_block bb = bbs[i];
5961 stmt_vec_info stmt_info;
5963 for (gphi_iterator si = gsi_start_phis (bb); !gsi_end_p (si);
5964 gsi_next (&si))
5966 gphi *phi = si.phi ();
5967 if (dump_enabled_p ())
5969 dump_printf_loc (MSG_NOTE, vect_location,
5970 "------>vectorizing phi: ");
5971 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, phi, 0);
5972 dump_printf (MSG_NOTE, "\n");
5974 stmt_info = vinfo_for_stmt (phi);
5975 if (!stmt_info)
5976 continue;
5978 if (MAY_HAVE_DEBUG_STMTS && !STMT_VINFO_LIVE_P (stmt_info))
5979 vect_loop_kill_debug_uses (loop, phi);
5981 if (!STMT_VINFO_RELEVANT_P (stmt_info)
5982 && !STMT_VINFO_LIVE_P (stmt_info))
5983 continue;
5985 if (STMT_VINFO_VECTYPE (stmt_info)
5986 && (TYPE_VECTOR_SUBPARTS (STMT_VINFO_VECTYPE (stmt_info))
5987 != (unsigned HOST_WIDE_INT) vectorization_factor)
5988 && dump_enabled_p ())
5989 dump_printf_loc (MSG_NOTE, vect_location, "multiple-types.\n");
5991 if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_induction_def)
5993 if (dump_enabled_p ())
5994 dump_printf_loc (MSG_NOTE, vect_location, "transform phi.\n");
5995 vect_transform_stmt (phi, NULL, NULL, NULL, NULL);
5999 pattern_stmt = NULL;
6000 for (gimple_stmt_iterator si = gsi_start_bb (bb);
6001 !gsi_end_p (si) || transform_pattern_stmt;)
6003 bool is_store;
6005 if (transform_pattern_stmt)
6006 stmt = pattern_stmt;
6007 else
6009 stmt = gsi_stmt (si);
6010 /* During vectorization remove existing clobber stmts. */
6011 if (gimple_clobber_p (stmt))
6013 unlink_stmt_vdef (stmt);
6014 gsi_remove (&si, true);
6015 release_defs (stmt);
6016 continue;
6020 if (dump_enabled_p ())
6022 dump_printf_loc (MSG_NOTE, vect_location,
6023 "------>vectorizing statement: ");
6024 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, stmt, 0);
6025 dump_printf (MSG_NOTE, "\n");
6028 stmt_info = vinfo_for_stmt (stmt);
6030 /* vector stmts created in the outer-loop during vectorization of
6031 stmts in an inner-loop may not have a stmt_info, and do not
6032 need to be vectorized. */
6033 if (!stmt_info)
6035 gsi_next (&si);
6036 continue;
6039 if (MAY_HAVE_DEBUG_STMTS && !STMT_VINFO_LIVE_P (stmt_info))
6040 vect_loop_kill_debug_uses (loop, stmt);
6042 if (!STMT_VINFO_RELEVANT_P (stmt_info)
6043 && !STMT_VINFO_LIVE_P (stmt_info))
6045 if (STMT_VINFO_IN_PATTERN_P (stmt_info)
6046 && (pattern_stmt = STMT_VINFO_RELATED_STMT (stmt_info))
6047 && (STMT_VINFO_RELEVANT_P (vinfo_for_stmt (pattern_stmt))
6048 || STMT_VINFO_LIVE_P (vinfo_for_stmt (pattern_stmt))))
6050 stmt = pattern_stmt;
6051 stmt_info = vinfo_for_stmt (stmt);
6053 else
6055 gsi_next (&si);
6056 continue;
6059 else if (STMT_VINFO_IN_PATTERN_P (stmt_info)
6060 && (pattern_stmt = STMT_VINFO_RELATED_STMT (stmt_info))
6061 && (STMT_VINFO_RELEVANT_P (vinfo_for_stmt (pattern_stmt))
6062 || STMT_VINFO_LIVE_P (vinfo_for_stmt (pattern_stmt))))
6063 transform_pattern_stmt = true;
6065 /* If the pattern statement has def stmts, vectorize them too. */
6066 if (is_pattern_stmt_p (stmt_info))
6068 if (pattern_def_seq == NULL)
6070 pattern_def_seq = STMT_VINFO_PATTERN_DEF_SEQ (stmt_info);
6071 pattern_def_si = gsi_start (pattern_def_seq);
6073 else if (!gsi_end_p (pattern_def_si))
6074 gsi_next (&pattern_def_si);
6075 if (pattern_def_seq != NULL)
6077 gimple pattern_def_stmt = NULL;
6078 stmt_vec_info pattern_def_stmt_info = NULL;
6080 while (!gsi_end_p (pattern_def_si))
6082 pattern_def_stmt = gsi_stmt (pattern_def_si);
6083 pattern_def_stmt_info
6084 = vinfo_for_stmt (pattern_def_stmt);
6085 if (STMT_VINFO_RELEVANT_P (pattern_def_stmt_info)
6086 || STMT_VINFO_LIVE_P (pattern_def_stmt_info))
6087 break;
6088 gsi_next (&pattern_def_si);
6091 if (!gsi_end_p (pattern_def_si))
6093 if (dump_enabled_p ())
6095 dump_printf_loc (MSG_NOTE, vect_location,
6096 "==> vectorizing pattern def "
6097 "stmt: ");
6098 dump_gimple_stmt (MSG_NOTE, TDF_SLIM,
6099 pattern_def_stmt, 0);
6100 dump_printf (MSG_NOTE, "\n");
6103 stmt = pattern_def_stmt;
6104 stmt_info = pattern_def_stmt_info;
6106 else
6108 pattern_def_si = gsi_none ();
6109 transform_pattern_stmt = false;
6112 else
6113 transform_pattern_stmt = false;
6116 if (STMT_VINFO_VECTYPE (stmt_info))
6118 unsigned int nunits
6119 = (unsigned int)
6120 TYPE_VECTOR_SUBPARTS (STMT_VINFO_VECTYPE (stmt_info));
6121 if (!STMT_SLP_TYPE (stmt_info)
6122 && nunits != (unsigned int) vectorization_factor
6123 && dump_enabled_p ())
6124 /* For SLP, VF is set according to the unrolling factor, not
6125 the vector size, so this message is not valid for SLP. */
6126 dump_printf_loc (MSG_NOTE, vect_location, "multiple-types.\n");
6129 /* SLP. Schedule all the SLP instances when the first SLP stmt is
6130 reached. */
6131 if (STMT_SLP_TYPE (stmt_info))
6133 if (!slp_scheduled)
6135 slp_scheduled = true;
6137 if (dump_enabled_p ())
6138 dump_printf_loc (MSG_NOTE, vect_location,
6139 "=== scheduling SLP instances ===\n");
6141 vect_schedule_slp (loop_vinfo, NULL);
6144 /* Hybrid SLP stmts must be vectorized in addition to SLP. */
6145 if (!vinfo_for_stmt (stmt) || PURE_SLP_STMT (stmt_info))
6147 if (!transform_pattern_stmt && gsi_end_p (pattern_def_si))
6149 pattern_def_seq = NULL;
6150 gsi_next (&si);
6152 continue;
6156 /* -------- vectorize statement ------------ */
6157 if (dump_enabled_p ())
6158 dump_printf_loc (MSG_NOTE, vect_location, "transform statement.\n");
6160 grouped_store = false;
6161 is_store = vect_transform_stmt (stmt, &si, &grouped_store, NULL, NULL);
6162 if (is_store)
6164 if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
6166 /* Interleaving. If IS_STORE is TRUE, the vectorization of the
6167 interleaving chain was completed; free all the stores in
6168 the chain. */
6169 gsi_next (&si);
6170 vect_remove_stores (GROUP_FIRST_ELEMENT (stmt_info));
6172 else
6174 /* Free the attached stmt_vec_info and remove the stmt. */
6175 gimple store = gsi_stmt (si);
6176 free_stmt_vec_info (store);
6177 unlink_stmt_vdef (store);
6178 gsi_remove (&si, true);
6179 release_defs (store);
6182 /* Stores can only appear at the end of pattern statements. */
6183 gcc_assert (!transform_pattern_stmt);
6184 pattern_def_seq = NULL;
6186 else if (!transform_pattern_stmt && gsi_end_p (pattern_def_si))
6188 pattern_def_seq = NULL;
6189 gsi_next (&si);
6191 } /* stmts in BB */
6192 } /* BBs in loop */
6194 slpeel_make_loop_iterate_ntimes (loop, ratio);
6196 /* Reduce loop iterations by the vectorization factor. */
6197 scale_loop_profile (loop, GCOV_COMPUTE_SCALE (1, vectorization_factor),
6198 expected_iterations / vectorization_factor);
6199 loop->nb_iterations_upper_bound
6200 = wi::udiv_floor (loop->nb_iterations_upper_bound, vectorization_factor);
6201 if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo)
6202 && loop->nb_iterations_upper_bound != 0)
6203 loop->nb_iterations_upper_bound = loop->nb_iterations_upper_bound - 1;
6204 if (loop->any_estimate)
6206 loop->nb_iterations_estimate
6207 = wi::udiv_floor (loop->nb_iterations_estimate, vectorization_factor);
6208 if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo)
6209 && loop->nb_iterations_estimate != 0)
6210 loop->nb_iterations_estimate = loop->nb_iterations_estimate - 1;
6213 if (dump_enabled_p ())
6215 dump_printf_loc (MSG_NOTE, vect_location,
6216 "LOOP VECTORIZED\n");
6217 if (loop->inner)
6218 dump_printf_loc (MSG_NOTE, vect_location,
6219 "OUTER LOOP VECTORIZED\n");
6220 dump_printf (MSG_NOTE, "\n");