gcc/tree-vect-loop.c
1 /* Loop Vectorization
2 Copyright (C) 2003-2015 Free Software Foundation, Inc.
3 Contributed by Dorit Naishlos <dorit@il.ibm.com> and
4 Ira Rosen <irar@il.ibm.com>
6 This file is part of GCC.
8 GCC is free software; you can redistribute it and/or modify it under
9 the terms of the GNU General Public License as published by the Free
10 Software Foundation; either version 3, or (at your option) any later
11 version.
13 GCC is distributed in the hope that it will be useful, but WITHOUT ANY
14 WARRANTY; without even the implied warranty of MERCHANTABILITY or
15 FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
16 for more details.
18 You should have received a copy of the GNU General Public License
19 along with GCC; see the file COPYING3. If not see
20 <http://www.gnu.org/licenses/>. */
22 #include "config.h"
23 #include "system.h"
24 #include "coretypes.h"
25 #include "dumpfile.h"
26 #include "tm.h"
27 #include "hash-set.h"
28 #include "machmode.h"
29 #include "vec.h"
30 #include "double-int.h"
31 #include "input.h"
32 #include "alias.h"
33 #include "symtab.h"
34 #include "wide-int.h"
35 #include "inchash.h"
36 #include "tree.h"
37 #include "fold-const.h"
38 #include "stor-layout.h"
39 #include "predict.h"
40 #include "hard-reg-set.h"
41 #include "function.h"
42 #include "dominance.h"
43 #include "cfg.h"
44 #include "cfganal.h"
45 #include "basic-block.h"
46 #include "gimple-pretty-print.h"
47 #include "tree-ssa-alias.h"
48 #include "internal-fn.h"
49 #include "gimple-expr.h"
50 #include "is-a.h"
51 #include "gimple.h"
52 #include "gimplify.h"
53 #include "gimple-iterator.h"
54 #include "gimplify-me.h"
55 #include "gimple-ssa.h"
56 #include "tree-phinodes.h"
57 #include "ssa-iterators.h"
58 #include "stringpool.h"
59 #include "tree-ssanames.h"
60 #include "tree-ssa-loop-ivopts.h"
61 #include "tree-ssa-loop-manip.h"
62 #include "tree-ssa-loop-niter.h"
63 #include "tree-pass.h"
64 #include "cfgloop.h"
65 #include "hashtab.h"
66 #include "rtl.h"
67 #include "flags.h"
68 #include "statistics.h"
69 #include "real.h"
70 #include "fixed-value.h"
71 #include "insn-config.h"
72 #include "expmed.h"
73 #include "dojump.h"
74 #include "explow.h"
75 #include "calls.h"
76 #include "emit-rtl.h"
77 #include "varasm.h"
78 #include "stmt.h"
79 #include "expr.h"
80 #include "recog.h"
81 #include "insn-codes.h"
82 #include "optabs.h"
83 #include "params.h"
84 #include "diagnostic-core.h"
85 #include "tree-chrec.h"
86 #include "tree-scalar-evolution.h"
87 #include "tree-vectorizer.h"
88 #include "target.h"
90 /* Loop Vectorization Pass.
92 This pass tries to vectorize loops.
94 For example, the vectorizer transforms the following simple loop:
96 short a[N]; short b[N]; short c[N]; int i;
98 for (i=0; i<N; i++){
99 a[i] = b[i] + c[i];
102 as if it were manually vectorized by rewriting the source code into:
104 typedef int __attribute__((mode(V8HI))) v8hi;
105 short a[N]; short b[N]; short c[N]; int i;
106 v8hi *pa = (v8hi*)a, *pb = (v8hi*)b, *pc = (v8hi*)c;
107 v8hi va, vb, vc;
109 for (i=0; i<N/8; i++){
110 vb = pb[i];
111 vc = pc[i];
112 va = vb + vc;
113 pa[i] = va;
116 The main entry to this pass is vectorize_loops(), in which
117 the vectorizer applies a set of analyses on a given set of loops,
118 followed by the actual vectorization transformation for the loops that
119 had successfully passed the analysis phase.
120 Throughout this pass we make a distinction between two types of
121 data: scalars (which are represented by SSA_NAMES), and memory references
122 ("data-refs"). These two types of data require different handling both
123 during analysis and transformation. The types of data-refs that the
124 vectorizer currently supports are ARRAY_REFS whose base is an array DECL
125 (not a pointer), and INDIRECT_REFS through pointers; both array and pointer
126 accesses are required to have a simple (consecutive) access pattern.
128 Analysis phase:
129 ===============
130 The driver for the analysis phase is vect_analyze_loop().
131 It applies a set of analyses, some of which rely on the scalar evolution
132 analyzer (scev) developed by Sebastian Pop.
134 During the analysis phase the vectorizer records some information
135 per stmt in a "stmt_vec_info" struct which is attached to each stmt in the
136 loop, as well as general information about the loop as a whole, which is
137 recorded in a "loop_vec_info" struct attached to each loop.
139 Transformation phase:
140 =====================
141 The loop transformation phase scans all the stmts in the loop, and
142 creates a vector stmt (or a sequence of stmts) for each scalar stmt S in
143 the loop that needs to be vectorized. It inserts the vector code sequence
144 just before the scalar stmt S, and records a pointer to the vector code
145 in STMT_VINFO_VEC_STMT (stmt_info) (stmt_info is the stmt_vec_info struct
146 attached to S). This pointer will be used for the vectorization of following
147 stmts which use the def of stmt S. Stmt S is removed if it writes to memory;
148 otherwise, we rely on dead code elimination for removing it.
150 For example, say stmt S1 was vectorized into stmt VS1:
152 VS1: vb = px[i];
153 S1: b = x[i]; STMT_VINFO_VEC_STMT (stmt_info (S1)) = VS1
154 S2: a = b;
156 To vectorize stmt S2, the vectorizer first finds the stmt that defines
157 the operand 'b' (S1), and gets the relevant vector def 'vb' from the
158 vector stmt VS1 pointed to by STMT_VINFO_VEC_STMT (stmt_info (S1)). The
159 resulting sequence would be:
161 VS1: vb = px[i];
162 S1: b = x[i]; STMT_VINFO_VEC_STMT (stmt_info (S1)) = VS1
163 VS2: va = vb;
164 S2: a = b; STMT_VINFO_VEC_STMT (stmt_info (S2)) = VS2
166 Operands that are not SSA_NAMEs are data-refs that appear in
167 load/store operations (like 'x[i]' in S1), and are handled differently.
169 Target modeling:
170 =================
171 Currently the only target-specific information that is used is the
172 size of the vector (in bytes) - "TARGET_VECTORIZE_UNITS_PER_SIMD_WORD".
173 Targets that can support different sizes of vectors will, for now, need
174 to specify one value for "TARGET_VECTORIZE_UNITS_PER_SIMD_WORD". More
175 flexibility will be added in the future.
177 Since we only vectorize operations whose vector form can be
178 expressed using existing tree codes, to verify that an operation is
179 supported, the vectorizer checks the relevant optab at the relevant
180 machine_mode (e.g., optab_handler (add_optab, V8HImode)). If
181 the value found is CODE_FOR_nothing, then there's no target support, and
182 we can't vectorize the stmt (see the sketch after this comment).
184 For additional information on this project see:
185 http://gcc.gnu.org/projects/tree-ssa/vectorization.html
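/* Illustrative sketch only (not part of the original file): roughly what
   the optab check described in the comment above looks like for a vector
   addition.  The helper name is made up for this example; the real checks
   of this kind live in the vectorizable_* routines in tree-vect-stmts.c.  */

static bool
example_target_supports_vector_plus (tree vectype)
{
  /* Map the tree code to an optab and query its handler for the vector
     mode; CODE_FOR_nothing means the target has no matching instruction,
     so the stmt cannot be vectorized with this vector type.  */
  optab op = optab_for_tree_code (PLUS_EXPR, vectype, optab_default);
  if (!op)
    return false;
  return optab_handler (op, TYPE_MODE (vectype)) != CODE_FOR_nothing;
}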
188 static void vect_estimate_min_profitable_iters (loop_vec_info, int *, int *);
190 /* Function vect_determine_vectorization_factor
192 Determine the vectorization factor (VF). VF is the number of data elements
193 that are operated upon in parallel in a single iteration of the vectorized
194 loop. For example, when vectorizing a loop that operates on 4-byte elements,
195 on a target with a vector size (VS) of 16 bytes, the VF is set to 4, since 4
196 elements can fit in a single vector register.
198 We currently support vectorization of loops in which all types operated upon
199 are of the same size. Therefore this function currently sets VF according to
200 the size of the types operated upon, and fails if there are multiple sizes
201 in the loop.
203 VF is also the factor by which the loop iterations are strip-mined, e.g.:
204 original loop:
205 for (i=0; i<N; i++){
206 a[i] = b[i] + c[i];
209 vectorized loop:
210 for (i=0; i<N; i+=VF){
211 a[i:VF] = b[i:VF] + c[i:VF];
215 static bool
216 vect_determine_vectorization_factor (loop_vec_info loop_vinfo)
218 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
219 basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
220 int nbbs = loop->num_nodes;
221 unsigned int vectorization_factor = 0;
222 tree scalar_type;
223 gphi *phi;
224 tree vectype;
225 unsigned int nunits;
226 stmt_vec_info stmt_info;
227 int i;
228 HOST_WIDE_INT dummy;
229 gimple stmt, pattern_stmt = NULL;
230 gimple_seq pattern_def_seq = NULL;
231 gimple_stmt_iterator pattern_def_si = gsi_none ();
232 bool analyze_pattern_stmt = false;
234 if (dump_enabled_p ())
235 dump_printf_loc (MSG_NOTE, vect_location,
236 "=== vect_determine_vectorization_factor ===\n");
238 for (i = 0; i < nbbs; i++)
240 basic_block bb = bbs[i];
242 for (gphi_iterator si = gsi_start_phis (bb); !gsi_end_p (si);
243 gsi_next (&si))
245 phi = si.phi ();
246 stmt_info = vinfo_for_stmt (phi);
247 if (dump_enabled_p ())
249 dump_printf_loc (MSG_NOTE, vect_location, "==> examining phi: ");
250 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, phi, 0);
251 dump_printf (MSG_NOTE, "\n");
254 gcc_assert (stmt_info);
256 if (STMT_VINFO_RELEVANT_P (stmt_info))
258 gcc_assert (!STMT_VINFO_VECTYPE (stmt_info));
259 scalar_type = TREE_TYPE (PHI_RESULT (phi));
261 if (dump_enabled_p ())
263 dump_printf_loc (MSG_NOTE, vect_location,
264 "get vectype for scalar type: ");
265 dump_generic_expr (MSG_NOTE, TDF_SLIM, scalar_type);
266 dump_printf (MSG_NOTE, "\n");
269 vectype = get_vectype_for_scalar_type (scalar_type);
270 if (!vectype)
272 if (dump_enabled_p ())
274 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
275 "not vectorized: unsupported "
276 "data-type ");
277 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
278 scalar_type);
279 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
281 return false;
283 STMT_VINFO_VECTYPE (stmt_info) = vectype;
285 if (dump_enabled_p ())
287 dump_printf_loc (MSG_NOTE, vect_location, "vectype: ");
288 dump_generic_expr (MSG_NOTE, TDF_SLIM, vectype);
289 dump_printf (MSG_NOTE, "\n");
292 nunits = TYPE_VECTOR_SUBPARTS (vectype);
293 if (dump_enabled_p ())
294 dump_printf_loc (MSG_NOTE, vect_location, "nunits = %d\n",
295 nunits);
297 if (!vectorization_factor
298 || (nunits > vectorization_factor))
299 vectorization_factor = nunits;
303 for (gimple_stmt_iterator si = gsi_start_bb (bb);
304 !gsi_end_p (si) || analyze_pattern_stmt;)
306 tree vf_vectype;
308 if (analyze_pattern_stmt)
309 stmt = pattern_stmt;
310 else
311 stmt = gsi_stmt (si);
313 stmt_info = vinfo_for_stmt (stmt);
315 if (dump_enabled_p ())
317 dump_printf_loc (MSG_NOTE, vect_location,
318 "==> examining statement: ");
319 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, stmt, 0);
320 dump_printf (MSG_NOTE, "\n");
323 gcc_assert (stmt_info);
325 /* Skip stmts which do not need to be vectorized. */
326 if ((!STMT_VINFO_RELEVANT_P (stmt_info)
327 && !STMT_VINFO_LIVE_P (stmt_info))
328 || gimple_clobber_p (stmt))
330 if (STMT_VINFO_IN_PATTERN_P (stmt_info)
331 && (pattern_stmt = STMT_VINFO_RELATED_STMT (stmt_info))
332 && (STMT_VINFO_RELEVANT_P (vinfo_for_stmt (pattern_stmt))
333 || STMT_VINFO_LIVE_P (vinfo_for_stmt (pattern_stmt))))
335 stmt = pattern_stmt;
336 stmt_info = vinfo_for_stmt (pattern_stmt);
337 if (dump_enabled_p ())
339 dump_printf_loc (MSG_NOTE, vect_location,
340 "==> examining pattern statement: ");
341 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, stmt, 0);
342 dump_printf (MSG_NOTE, "\n");
345 else
347 if (dump_enabled_p ())
348 dump_printf_loc (MSG_NOTE, vect_location, "skip.\n");
349 gsi_next (&si);
350 continue;
353 else if (STMT_VINFO_IN_PATTERN_P (stmt_info)
354 && (pattern_stmt = STMT_VINFO_RELATED_STMT (stmt_info))
355 && (STMT_VINFO_RELEVANT_P (vinfo_for_stmt (pattern_stmt))
356 || STMT_VINFO_LIVE_P (vinfo_for_stmt (pattern_stmt))))
357 analyze_pattern_stmt = true;
359 /* If a pattern statement has def stmts, analyze them too. */
360 if (is_pattern_stmt_p (stmt_info))
362 if (pattern_def_seq == NULL)
364 pattern_def_seq = STMT_VINFO_PATTERN_DEF_SEQ (stmt_info);
365 pattern_def_si = gsi_start (pattern_def_seq);
367 else if (!gsi_end_p (pattern_def_si))
368 gsi_next (&pattern_def_si);
369 if (pattern_def_seq != NULL)
371 gimple pattern_def_stmt = NULL;
372 stmt_vec_info pattern_def_stmt_info = NULL;
374 while (!gsi_end_p (pattern_def_si))
376 pattern_def_stmt = gsi_stmt (pattern_def_si);
377 pattern_def_stmt_info
378 = vinfo_for_stmt (pattern_def_stmt);
379 if (STMT_VINFO_RELEVANT_P (pattern_def_stmt_info)
380 || STMT_VINFO_LIVE_P (pattern_def_stmt_info))
381 break;
382 gsi_next (&pattern_def_si);
385 if (!gsi_end_p (pattern_def_si))
387 if (dump_enabled_p ())
389 dump_printf_loc (MSG_NOTE, vect_location,
390 "==> examining pattern def stmt: ");
391 dump_gimple_stmt (MSG_NOTE, TDF_SLIM,
392 pattern_def_stmt, 0);
393 dump_printf (MSG_NOTE, "\n");
396 stmt = pattern_def_stmt;
397 stmt_info = pattern_def_stmt_info;
399 else
401 pattern_def_si = gsi_none ();
402 analyze_pattern_stmt = false;
405 else
406 analyze_pattern_stmt = false;
409 if (gimple_get_lhs (stmt) == NULL_TREE
410 /* MASK_STORE has no lhs, but is ok. */
411 && (!is_gimple_call (stmt)
412 || !gimple_call_internal_p (stmt)
413 || gimple_call_internal_fn (stmt) != IFN_MASK_STORE))
415 if (is_gimple_call (stmt))
417 /* Ignore calls with no lhs. These must be calls to
418 #pragma omp simd functions, and the vectorization factor
419 they really need can't be determined until
420 vectorizable_simd_clone_call. */
421 if (!analyze_pattern_stmt && gsi_end_p (pattern_def_si))
423 pattern_def_seq = NULL;
424 gsi_next (&si);
426 continue;
428 if (dump_enabled_p ())
430 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
431 "not vectorized: irregular stmt.");
432 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt,
434 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
436 return false;
439 if (VECTOR_MODE_P (TYPE_MODE (gimple_expr_type (stmt))))
441 if (dump_enabled_p ())
443 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
444 "not vectorized: vector stmt in loop:");
445 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
446 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
448 return false;
451 if (STMT_VINFO_VECTYPE (stmt_info))
453 /* The only case when a vectype has already been set is for stmts
454 that contain a dataref, or for "pattern-stmts" (stmts
455 generated by the vectorizer to represent/replace a certain
456 idiom). */
457 gcc_assert (STMT_VINFO_DATA_REF (stmt_info)
458 || is_pattern_stmt_p (stmt_info)
459 || !gsi_end_p (pattern_def_si));
460 vectype = STMT_VINFO_VECTYPE (stmt_info);
462 else
464 gcc_assert (!STMT_VINFO_DATA_REF (stmt_info));
465 if (is_gimple_call (stmt)
466 && gimple_call_internal_p (stmt)
467 && gimple_call_internal_fn (stmt) == IFN_MASK_STORE)
468 scalar_type = TREE_TYPE (gimple_call_arg (stmt, 3));
469 else
470 scalar_type = TREE_TYPE (gimple_get_lhs (stmt));
471 if (dump_enabled_p ())
473 dump_printf_loc (MSG_NOTE, vect_location,
474 "get vectype for scalar type: ");
475 dump_generic_expr (MSG_NOTE, TDF_SLIM, scalar_type);
476 dump_printf (MSG_NOTE, "\n");
478 vectype = get_vectype_for_scalar_type (scalar_type);
479 if (!vectype)
481 if (dump_enabled_p ())
483 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
484 "not vectorized: unsupported "
485 "data-type ");
486 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
487 scalar_type);
488 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
490 return false;
493 STMT_VINFO_VECTYPE (stmt_info) = vectype;
495 if (dump_enabled_p ())
497 dump_printf_loc (MSG_NOTE, vect_location, "vectype: ");
498 dump_generic_expr (MSG_NOTE, TDF_SLIM, vectype);
499 dump_printf (MSG_NOTE, "\n");
503 /* The vectorization factor is determined according to the smallest
504 scalar type (or the largest vector size, but we only
505 support one vector size per loop). */
506 scalar_type = vect_get_smallest_scalar_type (stmt, &dummy,
507 &dummy);
508 if (dump_enabled_p ())
510 dump_printf_loc (MSG_NOTE, vect_location,
511 "get vectype for scalar type: ");
512 dump_generic_expr (MSG_NOTE, TDF_SLIM, scalar_type);
513 dump_printf (MSG_NOTE, "\n");
515 vf_vectype = get_vectype_for_scalar_type (scalar_type);
516 if (!vf_vectype)
518 if (dump_enabled_p ())
520 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
521 "not vectorized: unsupported data-type ");
522 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
523 scalar_type);
524 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
526 return false;
529 if ((GET_MODE_SIZE (TYPE_MODE (vectype))
530 != GET_MODE_SIZE (TYPE_MODE (vf_vectype))))
532 if (dump_enabled_p ())
534 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
535 "not vectorized: different sized vector "
536 "types in statement, ");
537 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
538 vectype);
539 dump_printf (MSG_MISSED_OPTIMIZATION, " and ");
540 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
541 vf_vectype);
542 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
544 return false;
547 if (dump_enabled_p ())
549 dump_printf_loc (MSG_NOTE, vect_location, "vectype: ");
550 dump_generic_expr (MSG_NOTE, TDF_SLIM, vf_vectype);
551 dump_printf (MSG_NOTE, "\n");
554 nunits = TYPE_VECTOR_SUBPARTS (vf_vectype);
555 if (dump_enabled_p ())
556 dump_printf_loc (MSG_NOTE, vect_location, "nunits = %d\n", nunits);
557 if (!vectorization_factor
558 || (nunits > vectorization_factor))
559 vectorization_factor = nunits;
561 if (!analyze_pattern_stmt && gsi_end_p (pattern_def_si))
563 pattern_def_seq = NULL;
564 gsi_next (&si);
569 /* TODO: Analyze cost. Decide if worth while to vectorize. */
570 if (dump_enabled_p ())
571 dump_printf_loc (MSG_NOTE, vect_location, "vectorization factor = %d\n",
572 vectorization_factor);
573 if (vectorization_factor <= 1)
575 if (dump_enabled_p ())
576 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
577 "not vectorized: unsupported data-type\n");
578 return false;
580 LOOP_VINFO_VECT_FACTOR (loop_vinfo) = vectorization_factor;
582 return true;
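/* Worked example for the function above (illustrative only): on a target
   with 16-byte vectors, a loop whose statements operate on 'short'
   (2-byte) elements gets a V8HI vector type, so nunits is 8 and the loop
   is strip-mined by VF = 8.  The helper below is hypothetical and only
   shows the relation between a scalar type and nunits.  */

static unsigned int
example_nunits_for_scalar_type (tree scalar_type)
{
  /* get_vectype_for_scalar_type picks the vector type for the current
     vector size; its number of elements is what feeds the VF above.  */
  tree vectype = get_vectype_for_scalar_type (scalar_type);
  return vectype ? TYPE_VECTOR_SUBPARTS (vectype) : 0;
}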
586 /* Function vect_is_simple_iv_evolution.
588 FORNOW: A simple evolution of an induction variable in the loop is
589 considered a polynomial evolution. */
591 static bool
592 vect_is_simple_iv_evolution (unsigned loop_nb, tree access_fn, tree * init,
593 tree * step)
595 tree init_expr;
596 tree step_expr;
597 tree evolution_part = evolution_part_in_loop_num (access_fn, loop_nb);
598 basic_block bb;
600 /* When there is no evolution in this loop, the evolution function
601 is not "simple". */
602 if (evolution_part == NULL_TREE)
603 return false;
605 /* When the evolution is a polynomial of degree >= 2
606 the evolution function is not "simple". */
607 if (tree_is_chrec (evolution_part))
608 return false;
610 step_expr = evolution_part;
611 init_expr = unshare_expr (initial_condition_in_loop_num (access_fn, loop_nb));
613 if (dump_enabled_p ())
615 dump_printf_loc (MSG_NOTE, vect_location, "step: ");
616 dump_generic_expr (MSG_NOTE, TDF_SLIM, step_expr);
617 dump_printf (MSG_NOTE, ", init: ");
618 dump_generic_expr (MSG_NOTE, TDF_SLIM, init_expr);
619 dump_printf (MSG_NOTE, "\n");
622 *init = init_expr;
623 *step = step_expr;
625 if (TREE_CODE (step_expr) != INTEGER_CST
626 && (TREE_CODE (step_expr) != SSA_NAME
627 || ((bb = gimple_bb (SSA_NAME_DEF_STMT (step_expr)))
628 && flow_bb_inside_loop_p (get_loop (cfun, loop_nb), bb))
629 || (!INTEGRAL_TYPE_P (TREE_TYPE (step_expr))
630 && (!SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr))
631 || !flag_associative_math)))
632 && (TREE_CODE (step_expr) != REAL_CST
633 || !flag_associative_math))
635 if (dump_enabled_p ())
636 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
637 "step unknown.\n");
638 return false;
641 return true;
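/* Example for the function above (illustrative): for a loop such as

     for (i = 0; i < n; i++)
       p = p + 4;

   the access function of 'p' is the chrec {p_0, +, 4}_loop, so the
   evolution part is the INTEGER_CST 4 (the step) and the initial
   condition is p_0; such an evolution is "simple".  A step that itself
   varies inside the loop (a chrec of degree >= 2) is rejected.  */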
644 /* Function vect_analyze_scalar_cycles_1.
646 Examine the cross iteration def-use cycles of scalar variables
647 in LOOP. LOOP_VINFO represents the loop that is now being
648 considered for vectorization (can be LOOP, or an outer-loop
649 enclosing LOOP). */
651 static void
652 vect_analyze_scalar_cycles_1 (loop_vec_info loop_vinfo, struct loop *loop)
654 basic_block bb = loop->header;
655 tree init, step;
656 auto_vec<gimple, 64> worklist;
657 gphi_iterator gsi;
658 bool double_reduc;
660 if (dump_enabled_p ())
661 dump_printf_loc (MSG_NOTE, vect_location,
662 "=== vect_analyze_scalar_cycles ===\n");
664 /* First - identify all inductions. Reduction detection assumes that all the
665 inductions have been identified, therefore, this order must not be
666 changed. */
667 for (gsi = gsi_start_phis (bb); !gsi_end_p (gsi); gsi_next (&gsi))
669 gphi *phi = gsi.phi ();
670 tree access_fn = NULL;
671 tree def = PHI_RESULT (phi);
672 stmt_vec_info stmt_vinfo = vinfo_for_stmt (phi);
674 if (dump_enabled_p ())
676 dump_printf_loc (MSG_NOTE, vect_location, "Analyze phi: ");
677 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, phi, 0);
678 dump_printf (MSG_NOTE, "\n");
681 /* Skip virtual phi's. The data dependences that are associated with
682 virtual defs/uses (i.e., memory accesses) are analyzed elsewhere. */
683 if (virtual_operand_p (def))
684 continue;
686 STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_unknown_def_type;
688 /* Analyze the evolution function. */
689 access_fn = analyze_scalar_evolution (loop, def);
690 if (access_fn)
692 STRIP_NOPS (access_fn);
693 if (dump_enabled_p ())
695 dump_printf_loc (MSG_NOTE, vect_location,
696 "Access function of PHI: ");
697 dump_generic_expr (MSG_NOTE, TDF_SLIM, access_fn);
698 dump_printf (MSG_NOTE, "\n");
700 STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_vinfo)
701 = evolution_part_in_loop_num (access_fn, loop->num);
704 if (!access_fn
705 || !vect_is_simple_iv_evolution (loop->num, access_fn, &init, &step)
706 || (LOOP_VINFO_LOOP (loop_vinfo) != loop
707 && TREE_CODE (step) != INTEGER_CST))
709 worklist.safe_push (phi);
710 continue;
713 gcc_assert (STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_vinfo) != NULL_TREE);
715 if (dump_enabled_p ())
716 dump_printf_loc (MSG_NOTE, vect_location, "Detected induction.\n");
717 STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_induction_def;
721 /* Second - identify all reductions and nested cycles. */
722 while (worklist.length () > 0)
724 gimple phi = worklist.pop ();
725 tree def = PHI_RESULT (phi);
726 stmt_vec_info stmt_vinfo = vinfo_for_stmt (phi);
727 gimple reduc_stmt;
728 bool nested_cycle;
730 if (dump_enabled_p ())
732 dump_printf_loc (MSG_NOTE, vect_location, "Analyze phi: ");
733 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, phi, 0);
734 dump_printf (MSG_NOTE, "\n");
737 gcc_assert (!virtual_operand_p (def)
738 && STMT_VINFO_DEF_TYPE (stmt_vinfo) == vect_unknown_def_type);
740 nested_cycle = (loop != LOOP_VINFO_LOOP (loop_vinfo));
741 reduc_stmt = vect_force_simple_reduction (loop_vinfo, phi, !nested_cycle,
742 &double_reduc);
743 if (reduc_stmt)
745 if (double_reduc)
747 if (dump_enabled_p ())
748 dump_printf_loc (MSG_NOTE, vect_location,
749 "Detected double reduction.\n");
751 STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_double_reduction_def;
752 STMT_VINFO_DEF_TYPE (vinfo_for_stmt (reduc_stmt)) =
753 vect_double_reduction_def;
755 else
757 if (nested_cycle)
759 if (dump_enabled_p ())
760 dump_printf_loc (MSG_NOTE, vect_location,
761 "Detected vectorizable nested cycle.\n");
763 STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_nested_cycle;
764 STMT_VINFO_DEF_TYPE (vinfo_for_stmt (reduc_stmt)) =
765 vect_nested_cycle;
767 else
769 if (dump_enabled_p ())
770 dump_printf_loc (MSG_NOTE, vect_location,
771 "Detected reduction.\n");
773 STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_reduction_def;
774 STMT_VINFO_DEF_TYPE (vinfo_for_stmt (reduc_stmt)) =
775 vect_reduction_def;
776 /* Store the reduction cycles for possible vectorization in
777 loop-aware SLP. */
778 LOOP_VINFO_REDUCTIONS (loop_vinfo).safe_push (reduc_stmt);
782 else
783 if (dump_enabled_p ())
784 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
785 "Unknown def-use cycle pattern.\n");
790 /* Function vect_analyze_scalar_cycles.
792 Examine the cross iteration def-use cycles of scalar variables, by
793 analyzing the loop-header PHIs of scalar variables. Classify each
794 cycle as one of the following: invariant, induction, reduction, unknown.
795 We do that for the loop represented by LOOP_VINFO, and also for its
796 inner-loop, if it exists.
797 Examples for scalar cycles:
799 Example1: reduction:
801 loop1:
802 for (i=0; i<N; i++)
803 sum += a[i];
805 Example2: induction:
807 loop2:
808 for (i=0; i<N; i++)
809 a[i] = i; */
811 static void
812 vect_analyze_scalar_cycles (loop_vec_info loop_vinfo)
814 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
816 vect_analyze_scalar_cycles_1 (loop_vinfo, loop);
818 /* When vectorizing an outer-loop, the inner-loop is executed sequentially.
819 Reductions in such inner-loop therefore have different properties than
820 the reductions in the nest that gets vectorized:
821 1. When vectorized, they are executed in the same order as in the original
822 scalar loop, so we can't change the order of computation when
823 vectorizing them.
824 2. FIXME: Inner-loop reductions can be used in the inner-loop, so the
825 current checks are too strict. */
827 if (loop->inner)
828 vect_analyze_scalar_cycles_1 (loop_vinfo, loop->inner);
832 /* Function vect_get_loop_niters.
834 Determine how many iterations the loop is executed and place it
835 in NUMBER_OF_ITERATIONS. Place the number of latch iterations
836 in NUMBER_OF_ITERATIONSM1.
838 Return the loop exit condition. */
841 static gcond *
842 vect_get_loop_niters (struct loop *loop, tree *number_of_iterations,
843 tree *number_of_iterationsm1)
845 tree niters;
847 if (dump_enabled_p ())
848 dump_printf_loc (MSG_NOTE, vect_location,
849 "=== get_loop_niters ===\n");
851 niters = number_of_latch_executions (loop);
852 *number_of_iterationsm1 = niters;
854 /* We want the number of loop header executions which is the number
855 of latch executions plus one.
856 ??? For UINT_MAX latch executions this number overflows to zero
857 for loops like do { n++; } while (n != 0); */
858 if (niters && !chrec_contains_undetermined (niters))
859 niters = fold_build2 (PLUS_EXPR, TREE_TYPE (niters), unshare_expr (niters),
860 build_int_cst (TREE_TYPE (niters), 1));
861 *number_of_iterations = niters;
863 return get_loop_exit_condition (loop);
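/* Example for the function above (illustrative): for the do-while form
   the vectorizer requires,

     i = 0;
     do { a[i] = b[i]; i++; } while (i < n);      with n > 0,

   the latch edge is taken n - 1 times, so NUMBER_OF_ITERATIONSM1 is
   n - 1 and NUMBER_OF_ITERATIONS (the number of header executions)
   is n.  */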
867 /* Function bb_in_loop_p
869 Used as predicate for dfs order traversal of the loop bbs. */
871 static bool
872 bb_in_loop_p (const_basic_block bb, const void *data)
874 const struct loop *const loop = (const struct loop *)data;
875 if (flow_bb_inside_loop_p (loop, bb))
876 return true;
877 return false;
881 /* Function new_loop_vec_info.
883 Create and initialize a new loop_vec_info struct for LOOP, as well as
884 stmt_vec_info structs for all the stmts in LOOP. */
886 static loop_vec_info
887 new_loop_vec_info (struct loop *loop)
889 loop_vec_info res;
890 basic_block *bbs;
891 gimple_stmt_iterator si;
892 unsigned int i, nbbs;
894 res = (loop_vec_info) xcalloc (1, sizeof (struct _loop_vec_info));
895 LOOP_VINFO_LOOP (res) = loop;
897 bbs = get_loop_body (loop);
899 /* Create/Update stmt_info for all stmts in the loop. */
900 for (i = 0; i < loop->num_nodes; i++)
902 basic_block bb = bbs[i];
904 /* BBs in a nested inner-loop will have been already processed (because
905 we will have called vect_analyze_loop_form for any nested inner-loop).
906 Therefore, for stmts in an inner-loop we just want to update the
907 STMT_VINFO_LOOP_VINFO field of their stmt_info to point to the new
908 loop_info of the outer-loop we are currently considering to vectorize
909 (instead of the loop_info of the inner-loop).
910 For stmts in other BBs we need to create a stmt_info from scratch. */
911 if (bb->loop_father != loop)
913 /* Inner-loop bb. */
914 gcc_assert (loop->inner && bb->loop_father == loop->inner);
915 for (si = gsi_start_phis (bb); !gsi_end_p (si); gsi_next (&si))
917 gimple phi = gsi_stmt (si);
918 stmt_vec_info stmt_info = vinfo_for_stmt (phi);
919 loop_vec_info inner_loop_vinfo =
920 STMT_VINFO_LOOP_VINFO (stmt_info);
921 gcc_assert (loop->inner == LOOP_VINFO_LOOP (inner_loop_vinfo));
922 STMT_VINFO_LOOP_VINFO (stmt_info) = res;
924 for (si = gsi_start_bb (bb); !gsi_end_p (si); gsi_next (&si))
926 gimple stmt = gsi_stmt (si);
927 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
928 loop_vec_info inner_loop_vinfo =
929 STMT_VINFO_LOOP_VINFO (stmt_info);
930 gcc_assert (loop->inner == LOOP_VINFO_LOOP (inner_loop_vinfo));
931 STMT_VINFO_LOOP_VINFO (stmt_info) = res;
934 else
936 /* bb in current nest. */
937 for (si = gsi_start_phis (bb); !gsi_end_p (si); gsi_next (&si))
939 gimple phi = gsi_stmt (si);
940 gimple_set_uid (phi, 0);
941 set_vinfo_for_stmt (phi, new_stmt_vec_info (phi, res, NULL));
944 for (si = gsi_start_bb (bb); !gsi_end_p (si); gsi_next (&si))
946 gimple stmt = gsi_stmt (si);
947 gimple_set_uid (stmt, 0);
948 set_vinfo_for_stmt (stmt, new_stmt_vec_info (stmt, res, NULL));
953 /* CHECKME: We want to visit all BBs before their successors (except for
954 latch blocks, for which this assertion wouldn't hold). In the simple
955 case of the loop forms we allow, a dfs order of the BBs would be the same
956 as reversed postorder traversal, so we are safe. */
958 free (bbs);
959 bbs = XCNEWVEC (basic_block, loop->num_nodes);
960 nbbs = dfs_enumerate_from (loop->header, 0, bb_in_loop_p,
961 bbs, loop->num_nodes, loop);
962 gcc_assert (nbbs == loop->num_nodes);
964 LOOP_VINFO_BBS (res) = bbs;
965 LOOP_VINFO_NITERSM1 (res) = NULL;
966 LOOP_VINFO_NITERS (res) = NULL;
967 LOOP_VINFO_NITERS_UNCHANGED (res) = NULL;
968 LOOP_VINFO_COST_MODEL_MIN_ITERS (res) = 0;
969 LOOP_VINFO_COST_MODEL_THRESHOLD (res) = 0;
970 LOOP_VINFO_VECTORIZABLE_P (res) = 0;
971 LOOP_VINFO_PEELING_FOR_ALIGNMENT (res) = 0;
972 LOOP_VINFO_VECT_FACTOR (res) = 0;
973 LOOP_VINFO_LOOP_NEST (res).create (3);
974 LOOP_VINFO_DATAREFS (res).create (10);
975 LOOP_VINFO_DDRS (res).create (10 * 10);
976 LOOP_VINFO_UNALIGNED_DR (res) = NULL;
977 LOOP_VINFO_MAY_MISALIGN_STMTS (res).create (
978 PARAM_VALUE (PARAM_VECT_MAX_VERSION_FOR_ALIGNMENT_CHECKS));
979 LOOP_VINFO_MAY_ALIAS_DDRS (res).create (
980 PARAM_VALUE (PARAM_VECT_MAX_VERSION_FOR_ALIAS_CHECKS));
981 LOOP_VINFO_GROUPED_STORES (res).create (10);
982 LOOP_VINFO_REDUCTIONS (res).create (10);
983 LOOP_VINFO_REDUCTION_CHAINS (res).create (10);
984 LOOP_VINFO_SLP_INSTANCES (res).create (10);
985 LOOP_VINFO_SLP_UNROLLING_FACTOR (res) = 1;
986 LOOP_VINFO_TARGET_COST_DATA (res) = init_cost (loop);
987 LOOP_VINFO_PEELING_FOR_GAPS (res) = false;
988 LOOP_VINFO_PEELING_FOR_NITER (res) = false;
989 LOOP_VINFO_OPERANDS_SWAPPED (res) = false;
991 return res;
995 /* Function destroy_loop_vec_info.
997 Free LOOP_VINFO struct, as well as all the stmt_vec_info structs of all the
998 stmts in the loop. */
1000 void
1001 destroy_loop_vec_info (loop_vec_info loop_vinfo, bool clean_stmts)
1003 struct loop *loop;
1004 basic_block *bbs;
1005 int nbbs;
1006 gimple_stmt_iterator si;
1007 int j;
1008 vec<slp_instance> slp_instances;
1009 slp_instance instance;
1010 bool swapped;
1012 if (!loop_vinfo)
1013 return;
1015 loop = LOOP_VINFO_LOOP (loop_vinfo);
1017 bbs = LOOP_VINFO_BBS (loop_vinfo);
1018 nbbs = clean_stmts ? loop->num_nodes : 0;
1019 swapped = LOOP_VINFO_OPERANDS_SWAPPED (loop_vinfo);
1021 for (j = 0; j < nbbs; j++)
1023 basic_block bb = bbs[j];
1024 for (si = gsi_start_phis (bb); !gsi_end_p (si); gsi_next (&si))
1025 free_stmt_vec_info (gsi_stmt (si));
1027 for (si = gsi_start_bb (bb); !gsi_end_p (si); )
1029 gimple stmt = gsi_stmt (si);
1031 /* We may have broken canonical form by moving a constant
1032 into RHS1 of a commutative op. Fix such occurrences. */
1033 if (swapped && is_gimple_assign (stmt))
1035 enum tree_code code = gimple_assign_rhs_code (stmt);
1037 if ((code == PLUS_EXPR
1038 || code == POINTER_PLUS_EXPR
1039 || code == MULT_EXPR)
1040 && CONSTANT_CLASS_P (gimple_assign_rhs1 (stmt)))
1041 swap_ssa_operands (stmt,
1042 gimple_assign_rhs1_ptr (stmt),
1043 gimple_assign_rhs2_ptr (stmt));
1046 /* Free stmt_vec_info. */
1047 free_stmt_vec_info (stmt);
1048 gsi_next (&si);
1052 free (LOOP_VINFO_BBS (loop_vinfo));
1053 vect_destroy_datarefs (loop_vinfo, NULL);
1054 free_dependence_relations (LOOP_VINFO_DDRS (loop_vinfo));
1055 LOOP_VINFO_LOOP_NEST (loop_vinfo).release ();
1056 LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).release ();
1057 LOOP_VINFO_MAY_ALIAS_DDRS (loop_vinfo).release ();
1058 slp_instances = LOOP_VINFO_SLP_INSTANCES (loop_vinfo);
1059 FOR_EACH_VEC_ELT (slp_instances, j, instance)
1060 vect_free_slp_instance (instance);
1062 LOOP_VINFO_SLP_INSTANCES (loop_vinfo).release ();
1063 LOOP_VINFO_GROUPED_STORES (loop_vinfo).release ();
1064 LOOP_VINFO_REDUCTIONS (loop_vinfo).release ();
1065 LOOP_VINFO_REDUCTION_CHAINS (loop_vinfo).release ();
1067 delete LOOP_VINFO_PEELING_HTAB (loop_vinfo);
1068 LOOP_VINFO_PEELING_HTAB (loop_vinfo) = NULL;
1070 destroy_cost_data (LOOP_VINFO_TARGET_COST_DATA (loop_vinfo));
1072 free (loop_vinfo);
1073 loop->aux = NULL;
1077 /* Function vect_analyze_loop_1.
1079 Apply a set of analyses on LOOP, and create a loop_vec_info struct
1080 for it. The different analyses will record information in the
1081 loop_vec_info struct. This is a subset of the analyses applied in
1082 vect_analyze_loop, to be applied on an inner-loop nested in the loop
1083 that is now considered for (outer-loop) vectorization. */
1085 static loop_vec_info
1086 vect_analyze_loop_1 (struct loop *loop)
1088 loop_vec_info loop_vinfo;
1090 if (dump_enabled_p ())
1091 dump_printf_loc (MSG_NOTE, vect_location,
1092 "===== analyze_loop_nest_1 =====\n");
1094 /* Check the CFG characteristics of the loop (nesting, entry/exit, etc.). */
1096 loop_vinfo = vect_analyze_loop_form (loop);
1097 if (!loop_vinfo)
1099 if (dump_enabled_p ())
1100 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1101 "bad inner-loop form.\n");
1102 return NULL;
1105 return loop_vinfo;
1109 /* Function vect_analyze_loop_form.
1111 Verify that certain CFG restrictions hold, including:
1112 - the loop has a pre-header
1113 - the loop has a single entry and exit
1114 - the loop exit condition is simple enough, and the number of iterations
1115 can be analyzed (a countable loop). */
1117 loop_vec_info
1118 vect_analyze_loop_form (struct loop *loop)
1120 loop_vec_info loop_vinfo;
1121 gcond *loop_cond;
1122 tree number_of_iterations = NULL, number_of_iterationsm1 = NULL;
1123 loop_vec_info inner_loop_vinfo = NULL;
1125 if (dump_enabled_p ())
1126 dump_printf_loc (MSG_NOTE, vect_location,
1127 "=== vect_analyze_loop_form ===\n");
1129 /* Different restrictions apply when we are considering an inner-most loop,
1130 vs. an outer (nested) loop.
1131 (FORNOW. May want to relax some of these restrictions in the future). */
1133 if (!loop->inner)
1135 /* Inner-most loop. We currently require that the number of BBs is
1136 exactly 2 (the header and latch). Vectorizable inner-most loops
1137 look like this:
1139 (pre-header)
1141 header <--------+
1142 | | |
1143 | +--> latch --+
1145 (exit-bb) */
1147 if (loop->num_nodes != 2)
1149 if (dump_enabled_p ())
1150 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1151 "not vectorized: control flow in loop.\n");
1152 return NULL;
1155 if (empty_block_p (loop->header))
1157 if (dump_enabled_p ())
1158 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1159 "not vectorized: empty loop.\n");
1160 return NULL;
1163 else
1165 struct loop *innerloop = loop->inner;
1166 edge entryedge;
1168 /* Nested loop. We currently require that the loop is doubly-nested,
1169 contains a single inner loop, and the number of BBs is exactly 5.
1170 Vectorizable outer-loops look like this:
1172 (pre-header)
1174 header <---+
1176 inner-loop |
1178 tail ------+
1180 (exit-bb)
1182 The inner-loop has the properties expected of inner-most loops
1183 as described above. */
1185 if ((loop->inner)->inner || (loop->inner)->next)
1187 if (dump_enabled_p ())
1188 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1189 "not vectorized: multiple nested loops.\n");
1190 return NULL;
1193 /* Analyze the inner-loop. */
1194 inner_loop_vinfo = vect_analyze_loop_1 (loop->inner);
1195 if (!inner_loop_vinfo)
1197 if (dump_enabled_p ())
1198 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1199 "not vectorized: Bad inner loop.\n");
1200 return NULL;
1203 if (!expr_invariant_in_loop_p (loop,
1204 LOOP_VINFO_NITERS (inner_loop_vinfo)))
1206 if (dump_enabled_p ())
1207 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1208 "not vectorized: inner-loop count not"
1209 " invariant.\n");
1210 destroy_loop_vec_info (inner_loop_vinfo, true);
1211 return NULL;
1214 if (loop->num_nodes != 5)
1216 if (dump_enabled_p ())
1217 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1218 "not vectorized: control flow in loop.\n");
1219 destroy_loop_vec_info (inner_loop_vinfo, true);
1220 return NULL;
1223 gcc_assert (EDGE_COUNT (innerloop->header->preds) == 2);
1224 entryedge = EDGE_PRED (innerloop->header, 0);
1225 if (EDGE_PRED (innerloop->header, 0)->src == innerloop->latch)
1226 entryedge = EDGE_PRED (innerloop->header, 1);
1228 if (entryedge->src != loop->header
1229 || !single_exit (innerloop)
1230 || single_exit (innerloop)->dest != EDGE_PRED (loop->latch, 0)->src)
1232 if (dump_enabled_p ())
1233 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1234 "not vectorized: unsupported outerloop form.\n");
1235 destroy_loop_vec_info (inner_loop_vinfo, true);
1236 return NULL;
1239 if (dump_enabled_p ())
1240 dump_printf_loc (MSG_NOTE, vect_location,
1241 "Considering outer-loop vectorization.\n");
1244 if (!single_exit (loop)
1245 || EDGE_COUNT (loop->header->preds) != 2)
1247 if (dump_enabled_p ())
1249 if (!single_exit (loop))
1250 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1251 "not vectorized: multiple exits.\n");
1252 else if (EDGE_COUNT (loop->header->preds) != 2)
1253 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1254 "not vectorized: too many incoming edges.\n");
1256 if (inner_loop_vinfo)
1257 destroy_loop_vec_info (inner_loop_vinfo, true);
1258 return NULL;
1261 /* We assume that the loop exit condition is at the end of the loop, i.e.,
1262 that the loop is represented as a do-while (with a proper if-guard
1263 before the loop if needed), where the loop header contains all the
1264 executable statements, and the latch is empty. */
1265 if (!empty_block_p (loop->latch)
1266 || !gimple_seq_empty_p (phi_nodes (loop->latch)))
1268 if (dump_enabled_p ())
1269 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1270 "not vectorized: latch block not empty.\n");
1271 if (inner_loop_vinfo)
1272 destroy_loop_vec_info (inner_loop_vinfo, true);
1273 return NULL;
1276 /* Make sure there exists a single-predecessor exit bb: */
1277 if (!single_pred_p (single_exit (loop)->dest))
1279 edge e = single_exit (loop);
1280 if (!(e->flags & EDGE_ABNORMAL))
1282 split_loop_exit_edge (e);
1283 if (dump_enabled_p ())
1284 dump_printf (MSG_NOTE, "split exit edge.\n");
1286 else
1288 if (dump_enabled_p ())
1289 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1290 "not vectorized: abnormal loop exit edge.\n");
1291 if (inner_loop_vinfo)
1292 destroy_loop_vec_info (inner_loop_vinfo, true);
1293 return NULL;
1297 loop_cond = vect_get_loop_niters (loop, &number_of_iterations,
1298 &number_of_iterationsm1);
1299 if (!loop_cond)
1301 if (dump_enabled_p ())
1302 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1303 "not vectorized: complicated exit condition.\n");
1304 if (inner_loop_vinfo)
1305 destroy_loop_vec_info (inner_loop_vinfo, true);
1306 return NULL;
1309 if (!number_of_iterations
1310 || chrec_contains_undetermined (number_of_iterations))
1312 if (dump_enabled_p ())
1313 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1314 "not vectorized: number of iterations cannot be "
1315 "computed.\n");
1316 if (inner_loop_vinfo)
1317 destroy_loop_vec_info (inner_loop_vinfo, true);
1318 return NULL;
1321 if (integer_zerop (number_of_iterations))
1323 if (dump_enabled_p ())
1324 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1325 "not vectorized: number of iterations = 0.\n");
1326 if (inner_loop_vinfo)
1327 destroy_loop_vec_info (inner_loop_vinfo, true);
1328 return NULL;
1331 loop_vinfo = new_loop_vec_info (loop);
1332 LOOP_VINFO_NITERSM1 (loop_vinfo) = number_of_iterationsm1;
1333 LOOP_VINFO_NITERS (loop_vinfo) = number_of_iterations;
1334 LOOP_VINFO_NITERS_UNCHANGED (loop_vinfo) = number_of_iterations;
1336 if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
1338 if (dump_enabled_p ())
1340 dump_printf_loc (MSG_NOTE, vect_location,
1341 "Symbolic number of iterations is ");
1342 dump_generic_expr (MSG_NOTE, TDF_DETAILS, number_of_iterations);
1343 dump_printf (MSG_NOTE, "\n");
1347 STMT_VINFO_TYPE (vinfo_for_stmt (loop_cond)) = loop_exit_ctrl_vec_info_type;
1349 /* CHECKME: May want to keep it around it in the future. */
1350 if (inner_loop_vinfo)
1351 destroy_loop_vec_info (inner_loop_vinfo, false);
1353 gcc_assert (!loop->aux);
1354 loop->aux = loop_vinfo;
1355 return loop_vinfo;
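/* Example for the checks above (illustrative): a counted loop such as

     for (i = 0; i < n; i++)
       a[i] = b[i] + c[i];

   has a computable number of iterations (n) and an analyzable exit
   condition, whereas a pointer-chasing loop like

     while (p)
       p = p->next;

   has no computable iteration count and is rejected with "number of
   iterations cannot be computed".  */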
1359 /* Function vect_analyze_loop_operations.
1361 Scan the loop stmts and make sure they are all vectorizable. */
1363 static bool
1364 vect_analyze_loop_operations (loop_vec_info loop_vinfo, bool slp)
1366 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1367 basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
1368 int nbbs = loop->num_nodes;
1369 unsigned int vectorization_factor = 0;
1370 int i;
1371 stmt_vec_info stmt_info;
1372 bool need_to_vectorize = false;
1373 int min_profitable_iters;
1374 int min_scalar_loop_bound;
1375 unsigned int th;
1376 bool only_slp_in_loop = true, ok;
1377 HOST_WIDE_INT max_niter;
1378 HOST_WIDE_INT estimated_niter;
1379 int min_profitable_estimate;
1381 if (dump_enabled_p ())
1382 dump_printf_loc (MSG_NOTE, vect_location,
1383 "=== vect_analyze_loop_operations ===\n");
1385 gcc_assert (LOOP_VINFO_VECT_FACTOR (loop_vinfo));
1386 vectorization_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
1387 if (slp)
1389 /* If all the stmts in the loop can be SLPed, we perform only SLP, and
1390 vectorization factor of the loop is the unrolling factor required by
1391 the SLP instances. If that unrolling factor is 1, we say, that we
1392 perform pure SLP on loop - cross iteration parallelism is not
1393 exploited. */
1394 for (i = 0; i < nbbs; i++)
1396 basic_block bb = bbs[i];
1397 for (gimple_stmt_iterator si = gsi_start_bb (bb); !gsi_end_p (si);
1398 gsi_next (&si))
1400 gimple stmt = gsi_stmt (si);
1401 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
1402 if (STMT_VINFO_IN_PATTERN_P (stmt_info)
1403 && STMT_VINFO_RELATED_STMT (stmt_info))
1405 stmt = STMT_VINFO_RELATED_STMT (stmt_info);
1406 stmt_info = vinfo_for_stmt (stmt);
1408 if ((STMT_VINFO_RELEVANT_P (stmt_info)
1409 || VECTORIZABLE_CYCLE_DEF (STMT_VINFO_DEF_TYPE (stmt_info)))
1410 && !PURE_SLP_STMT (stmt_info))
1411 /* STMT needs both SLP and loop-based vectorization. */
1412 only_slp_in_loop = false;
1416 if (only_slp_in_loop)
1417 vectorization_factor = LOOP_VINFO_SLP_UNROLLING_FACTOR (loop_vinfo);
1418 else
1419 vectorization_factor = least_common_multiple (vectorization_factor,
1420 LOOP_VINFO_SLP_UNROLLING_FACTOR (loop_vinfo));
1422 LOOP_VINFO_VECT_FACTOR (loop_vinfo) = vectorization_factor;
1423 if (dump_enabled_p ())
1424 dump_printf_loc (MSG_NOTE, vect_location,
1425 "Updating vectorization factor to %d\n",
1426 vectorization_factor);
1429 for (i = 0; i < nbbs; i++)
1431 basic_block bb = bbs[i];
1433 for (gphi_iterator si = gsi_start_phis (bb); !gsi_end_p (si);
1434 gsi_next (&si))
1436 gphi *phi = si.phi ();
1437 ok = true;
1439 stmt_info = vinfo_for_stmt (phi);
1440 if (dump_enabled_p ())
1442 dump_printf_loc (MSG_NOTE, vect_location, "examining phi: ");
1443 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, phi, 0);
1444 dump_printf (MSG_NOTE, "\n");
1447 /* Inner-loop loop-closed exit phi in outer-loop vectorization
1448 (i.e., a phi in the tail of the outer-loop). */
1449 if (! is_loop_header_bb_p (bb))
1451 /* FORNOW: we currently don't support the case that these phis
1452 are not used in the outer-loop (unless it is a double reduction,
1453 i.e., this phi is vect_reduction_def), because this case
1454 would require us to actually do something here.
1455 if ((!STMT_VINFO_RELEVANT_P (stmt_info)
1456 || STMT_VINFO_LIVE_P (stmt_info))
1457 && STMT_VINFO_DEF_TYPE (stmt_info)
1458 != vect_double_reduction_def)
1460 if (dump_enabled_p ())
1461 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1462 "Unsupported loop-closed phi in "
1463 "outer-loop.\n");
1464 return false;
1467 /* If PHI is used in the outer loop, we check that its operand
1468 is defined in the inner loop. */
1469 if (STMT_VINFO_RELEVANT_P (stmt_info))
1471 tree phi_op;
1472 gimple op_def_stmt;
1474 if (gimple_phi_num_args (phi) != 1)
1475 return false;
1477 phi_op = PHI_ARG_DEF (phi, 0);
1478 if (TREE_CODE (phi_op) != SSA_NAME)
1479 return false;
1481 op_def_stmt = SSA_NAME_DEF_STMT (phi_op);
1482 if (gimple_nop_p (op_def_stmt)
1483 || !flow_bb_inside_loop_p (loop, gimple_bb (op_def_stmt))
1484 || !vinfo_for_stmt (op_def_stmt))
1485 return false;
1487 if (STMT_VINFO_RELEVANT (vinfo_for_stmt (op_def_stmt))
1488 != vect_used_in_outer
1489 && STMT_VINFO_RELEVANT (vinfo_for_stmt (op_def_stmt))
1490 != vect_used_in_outer_by_reduction)
1491 return false;
1494 continue;
1497 gcc_assert (stmt_info);
1499 if (STMT_VINFO_LIVE_P (stmt_info))
1501 /* FORNOW: not yet supported. */
1502 if (dump_enabled_p ())
1503 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1504 "not vectorized: value used after loop.\n");
1505 return false;
1508 if (STMT_VINFO_RELEVANT (stmt_info) == vect_used_in_scope
1509 && STMT_VINFO_DEF_TYPE (stmt_info) != vect_induction_def)
1511 /* A scalar-dependence cycle that we don't support. */
1512 if (dump_enabled_p ())
1513 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1514 "not vectorized: scalar dependence cycle.\n");
1515 return false;
1518 if (STMT_VINFO_RELEVANT_P (stmt_info))
1520 need_to_vectorize = true;
1521 if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_induction_def)
1522 ok = vectorizable_induction (phi, NULL, NULL);
1525 if (!ok)
1527 if (dump_enabled_p ())
1529 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1530 "not vectorized: relevant phi not "
1531 "supported: ");
1532 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, phi, 0);
1533 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
1535 return false;
1539 for (gimple_stmt_iterator si = gsi_start_bb (bb); !gsi_end_p (si);
1540 gsi_next (&si))
1542 gimple stmt = gsi_stmt (si);
1543 if (!gimple_clobber_p (stmt)
1544 && !vect_analyze_stmt (stmt, &need_to_vectorize, NULL))
1545 return false;
1547 } /* bbs */
1549 /* All operations in the loop are either irrelevant (deal with loop
1550 control, or dead), or only used outside the loop and can be moved
1551 out of the loop (e.g. invariants, inductions). The loop can be
1552 optimized away by scalar optimizations. We're better off not
1553 touching this loop. */
1554 if (!need_to_vectorize)
1556 if (dump_enabled_p ())
1557 dump_printf_loc (MSG_NOTE, vect_location,
1558 "All the computation can be taken out of the loop.\n");
1559 if (dump_enabled_p ())
1560 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1561 "not vectorized: redundant loop. no profit to "
1562 "vectorize.\n");
1563 return false;
1566 if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo) && dump_enabled_p ())
1567 dump_printf_loc (MSG_NOTE, vect_location,
1568 "vectorization_factor = %d, niters = "
1569 HOST_WIDE_INT_PRINT_DEC "\n", vectorization_factor,
1570 LOOP_VINFO_INT_NITERS (loop_vinfo));
1572 if ((LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
1573 && (LOOP_VINFO_INT_NITERS (loop_vinfo) < vectorization_factor))
1574 || ((max_niter = max_stmt_executions_int (loop)) != -1
1575 && (unsigned HOST_WIDE_INT) max_niter < vectorization_factor))
1577 if (dump_enabled_p ())
1578 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1579 "not vectorized: iteration count too small.\n");
1580 if (dump_enabled_p ())
1581 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1582 "not vectorized: iteration count smaller than "
1583 "vectorization factor.\n");
1584 return false;
1587 /* Analyze cost. Decide if worth while to vectorize. */
1589 /* Once VF is set, SLP costs should be updated since the number of created
1590 vector stmts depends on VF. */
1591 vect_update_slp_costs_according_to_vf (loop_vinfo);
1593 vect_estimate_min_profitable_iters (loop_vinfo, &min_profitable_iters,
1594 &min_profitable_estimate);
1595 LOOP_VINFO_COST_MODEL_MIN_ITERS (loop_vinfo) = min_profitable_iters;
1597 if (min_profitable_iters < 0)
1599 if (dump_enabled_p ())
1600 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1601 "not vectorized: vectorization not profitable.\n");
1602 if (dump_enabled_p ())
1603 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1604 "not vectorized: vector version will never be "
1605 "profitable.\n");
1606 return false;
1609 min_scalar_loop_bound = ((PARAM_VALUE (PARAM_MIN_VECT_LOOP_BOUND)
1610 * vectorization_factor) - 1);
1613 /* Use the cost model only if it is more conservative than the user-specified
1614 threshold. */
1616 th = (unsigned) min_scalar_loop_bound;
1617 if (min_profitable_iters
1618 && (!min_scalar_loop_bound
1619 || min_profitable_iters > min_scalar_loop_bound))
1620 th = (unsigned) min_profitable_iters;
1622 LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo) = th;
1624 if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
1625 && LOOP_VINFO_INT_NITERS (loop_vinfo) <= th)
1627 if (dump_enabled_p ())
1628 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1629 "not vectorized: vectorization not profitable.\n");
1630 if (dump_enabled_p ())
1631 dump_printf_loc (MSG_NOTE, vect_location,
1632 "not vectorized: iteration count smaller than user "
1633 "specified loop bound parameter or minimum profitable "
1634 "iterations (whichever is more conservative).\n");
1635 return false;
1638 if ((estimated_niter = estimated_stmt_executions_int (loop)) != -1
1639 && ((unsigned HOST_WIDE_INT) estimated_niter
1640 <= MAX (th, (unsigned)min_profitable_estimate)))
1642 if (dump_enabled_p ())
1643 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1644 "not vectorized: estimated iteration count too "
1645 "small.\n");
1646 if (dump_enabled_p ())
1647 dump_printf_loc (MSG_NOTE, vect_location,
1648 "not vectorized: estimated iteration count smaller "
1649 "than specified loop bound parameter or minimum "
1650 "profitable iterations (whichever is more "
1651 "conservative).\n");
1652 return false;
1655 return true;
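/* Worked example for the threshold logic above (illustrative, with made-up
   numbers): with --param min-vect-loop-bound=2 and a vectorization factor
   of 4, min_scalar_loop_bound is 2 * 4 - 1 = 7.  If the cost model
   computes min_profitable_iters = 10, the more conservative value wins,
   so th = 10 and a loop known to execute at most 10 iterations is
   rejected as not profitable to vectorize.  */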
1659 /* Function vect_analyze_loop_2.
1661 Apply a set of analyses on LOOP, and create a loop_vec_info struct
1662 for it. The different analyses will record information in the
1663 loop_vec_info struct. */
1664 static bool
1665 vect_analyze_loop_2 (loop_vec_info loop_vinfo)
1667 bool ok, slp = false;
1668 int max_vf = MAX_VECTORIZATION_FACTOR;
1669 int min_vf = 2;
1670 unsigned int th;
1671 unsigned int n_stmts = 0;
1673 /* Find all data references in the loop (which correspond to vdefs/vuses)
1674 and analyze their evolution in the loop. Also adjust the minimal
1675 vectorization factor according to the loads and stores.
1677 FORNOW: Handle only simple array references whose
1678 alignment can be forced, and aligned pointer-references. */
1680 ok = vect_analyze_data_refs (loop_vinfo, NULL, &min_vf, &n_stmts);
1681 if (!ok)
1683 if (dump_enabled_p ())
1684 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1685 "bad data references.\n");
1686 return false;
1689 /* Classify all cross-iteration scalar data-flow cycles.
1690 Cross-iteration cycles caused by virtual phis are analyzed separately. */
1692 vect_analyze_scalar_cycles (loop_vinfo);
1694 vect_pattern_recog (loop_vinfo, NULL);
1696 /* Analyze the access patterns of the data-refs in the loop (consecutive,
1697 complex, etc.). FORNOW: Only handle consecutive access pattern. */
1699 ok = vect_analyze_data_ref_accesses (loop_vinfo, NULL);
1700 if (!ok)
1702 if (dump_enabled_p ())
1703 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1704 "bad data access.\n");
1705 return false;
1708 /* Data-flow analysis to detect stmts that do not need to be vectorized. */
1710 ok = vect_mark_stmts_to_be_vectorized (loop_vinfo);
1711 if (!ok)
1713 if (dump_enabled_p ())
1714 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1715 "unexpected pattern.\n");
1716 return false;
1719 /* Analyze data dependences between the data-refs in the loop
1720 and adjust the maximum vectorization factor according to
1721 the dependences.
1722 FORNOW: fail at the first data dependence that we encounter. */
1724 ok = vect_analyze_data_ref_dependences (loop_vinfo, &max_vf);
1725 if (!ok
1726 || max_vf < min_vf)
1728 if (dump_enabled_p ())
1729 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1730 "bad data dependence.\n");
1731 return false;
1734 ok = vect_determine_vectorization_factor (loop_vinfo);
1735 if (!ok)
1737 if (dump_enabled_p ())
1738 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1739 "can't determine vectorization factor.\n");
1740 return false;
1742 if (max_vf < LOOP_VINFO_VECT_FACTOR (loop_vinfo))
1744 if (dump_enabled_p ())
1745 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1746 "bad data dependence.\n");
1747 return false;
1750 /* Analyze the alignment of the data-refs in the loop.
1751 Fail if a data reference is found that cannot be vectorized. */
1753 ok = vect_analyze_data_refs_alignment (loop_vinfo, NULL);
1754 if (!ok)
1756 if (dump_enabled_p ())
1757 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1758 "bad data alignment.\n");
1759 return false;
1762 /* Prune the list of ddrs to be tested at run-time by versioning for alias.
1763 It is important to call pruning after vect_analyze_data_ref_accesses,
1764 since we use grouping information gathered by interleaving analysis. */
1765 ok = vect_prune_runtime_alias_test_list (loop_vinfo);
1766 if (!ok)
1768 if (dump_enabled_p ())
1769 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1770 "number of versioning for alias "
1771 "run-time tests exceeds %d "
1772 "(--param vect-max-version-for-alias-checks)\n",
1773 PARAM_VALUE (PARAM_VECT_MAX_VERSION_FOR_ALIAS_CHECKS));
1774 return false;
1777 /* This pass will decide on using loop versioning and/or loop peeling in
1778 order to enhance the alignment of data references in the loop. */
1780 ok = vect_enhance_data_refs_alignment (loop_vinfo);
1781 if (!ok)
1783 if (dump_enabled_p ())
1784 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1785 "bad data alignment.\n");
1786 return false;
1789 /* Check the SLP opportunities in the loop, analyze and build SLP trees. */
1790 ok = vect_analyze_slp (loop_vinfo, NULL, n_stmts);
1791 if (ok)
1793 /* Decide which possible SLP instances to SLP. */
1794 slp = vect_make_slp_decision (loop_vinfo);
1796 /* Find stmts that need to be both vectorized and SLPed. */
1797 vect_detect_hybrid_slp (loop_vinfo);
1799 else
1800 return false;
1802 /* Scan all the operations in the loop and make sure they are
1803 vectorizable. */
1805 ok = vect_analyze_loop_operations (loop_vinfo, slp);
1806 if (!ok)
1808 if (dump_enabled_p ())
1809 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1810 "bad operation or unsupported loop bound.\n");
1811 return false;
1814 /* Decide whether we need to create an epilogue loop to handle
1815 remaining scalar iterations. */
1816 th = ((LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo) + 1)
1817 / LOOP_VINFO_VECT_FACTOR (loop_vinfo))
1818 * LOOP_VINFO_VECT_FACTOR (loop_vinfo);
1820 if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
1821 && LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) > 0)
1823 if (ctz_hwi (LOOP_VINFO_INT_NITERS (loop_vinfo)
1824 - LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo))
1825 < exact_log2 (LOOP_VINFO_VECT_FACTOR (loop_vinfo)))
1826 LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo) = true;
1828 else if (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo)
1829 || (tree_ctz (LOOP_VINFO_NITERS (loop_vinfo))
1830 < (unsigned)exact_log2 (LOOP_VINFO_VECT_FACTOR (loop_vinfo))
1831 /* In case of versioning, check if the maximum number of
1832 iterations is greater than th. If they are identical,
1833 the epilogue is unnecessary. */
1834 && ((!LOOP_REQUIRES_VERSIONING_FOR_ALIAS (loop_vinfo)
1835 && !LOOP_REQUIRES_VERSIONING_FOR_ALIGNMENT (loop_vinfo))
1836 || (unsigned HOST_WIDE_INT)max_stmt_executions_int
1837 (LOOP_VINFO_LOOP (loop_vinfo)) > th)))
1838 LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo) = true;
1840 /* If an epilogue loop is required make sure we can create one. */
1841 if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo)
1842 || LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo))
1844 if (dump_enabled_p ())
1845 dump_printf_loc (MSG_NOTE, vect_location, "epilog loop required\n");
1846 if (!vect_can_advance_ivs_p (loop_vinfo)
1847 || !slpeel_can_duplicate_loop_p (LOOP_VINFO_LOOP (loop_vinfo),
1848 single_exit (LOOP_VINFO_LOOP
1849 (loop_vinfo))))
1851 if (dump_enabled_p ())
1852 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1853 "not vectorized: can't create required "
1854 "epilog loop\n");
1855 return false;
1859 return true;
1862 /* Function vect_analyze_loop.
1864 Apply a set of analyses on LOOP, and create a loop_vec_info struct
1865 for it. The different analyses will record information in the
1866 loop_vec_info struct. */
1867 loop_vec_info
1868 vect_analyze_loop (struct loop *loop)
1870 loop_vec_info loop_vinfo;
1871 unsigned int vector_sizes;
1873 /* Autodetect first vector size we try. */
1874 current_vector_size = 0;
1875 vector_sizes = targetm.vectorize.autovectorize_vector_sizes ();
1877 if (dump_enabled_p ())
1878 dump_printf_loc (MSG_NOTE, vect_location,
1879 "===== analyze_loop_nest =====\n");
1881 if (loop_outer (loop)
1882 && loop_vec_info_for_loop (loop_outer (loop))
1883 && LOOP_VINFO_VECTORIZABLE_P (loop_vec_info_for_loop (loop_outer (loop))))
1885 if (dump_enabled_p ())
1886 dump_printf_loc (MSG_NOTE, vect_location,
1887 "outer-loop already vectorized.\n");
1888 return NULL;
1891 while (1)
1893 /* Check the CFG characteristics of the loop (nesting, entry/exit). */
1894 loop_vinfo = vect_analyze_loop_form (loop);
1895 if (!loop_vinfo)
1897 if (dump_enabled_p ())
1898 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1899 "bad loop form.\n");
1900 return NULL;
1903 if (vect_analyze_loop_2 (loop_vinfo))
1905 LOOP_VINFO_VECTORIZABLE_P (loop_vinfo) = 1;
1907 return loop_vinfo;
1910 destroy_loop_vec_info (loop_vinfo, true);
1912 vector_sizes &= ~current_vector_size;
1913 if (vector_sizes == 0
1914 || current_vector_size == 0)
1915 return NULL;
1917 /* Try the next biggest vector size. */
1918 current_vector_size = 1 << floor_log2 (vector_sizes);
1919 if (dump_enabled_p ())
1920 dump_printf_loc (MSG_NOTE, vect_location,
1921 "***** Re-trying analysis with "
1922 "vector size %d\n", current_vector_size);
1927 /* Function reduction_code_for_scalar_code
1929 Input:
1930 CODE - tree_code of a reduction operation.
1932 Output:
1933 REDUC_CODE - the corresponding tree-code to be used to reduce the
1934 vector of partial results into a single scalar result, or ERROR_MARK
1935 if the operation is a supported reduction operation, but does not have
1936 such a tree-code.
1938 Return FALSE if CODE currently cannot be vectorized as reduction. */
1940 static bool
1941 reduction_code_for_scalar_code (enum tree_code code,
1942 enum tree_code *reduc_code)
1944 switch (code)
1946 case MAX_EXPR:
1947 *reduc_code = REDUC_MAX_EXPR;
1948 return true;
1950 case MIN_EXPR:
1951 *reduc_code = REDUC_MIN_EXPR;
1952 return true;
1954 case PLUS_EXPR:
1955 *reduc_code = REDUC_PLUS_EXPR;
1956 return true;
1958 case MULT_EXPR:
1959 case MINUS_EXPR:
1960 case BIT_IOR_EXPR:
1961 case BIT_XOR_EXPR:
1962 case BIT_AND_EXPR:
1963 *reduc_code = ERROR_MARK;
1964 return true;
1966 default:
1967 return false;
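/* Example (illustrative): a sum reduction uses PLUS_EXPR in the loop body
   and maps to REDUC_PLUS_EXPR here, which the epilogue uses to fold the
   vector of partial sums into one scalar; MULT_EXPR is accepted but gets
   ERROR_MARK, so its epilogue falls back to shifts or extracts.  */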
1972 /* Error reporting helper for vect_is_simple_reduction below. GIMPLE statement
1973 STMT is printed with a message MSG. */
1975 static void
1976 report_vect_op (int msg_type, gimple stmt, const char *msg)
1978 dump_printf_loc (msg_type, vect_location, "%s", msg);
1979 dump_gimple_stmt (msg_type, TDF_SLIM, stmt, 0);
1980 dump_printf (msg_type, "\n");
1984 /* Detect SLP reduction of the form:
1986 #a1 = phi <a5, a0>
1987 a2 = operation (a1)
1988 a3 = operation (a2)
1989 a4 = operation (a3)
1990 a5 = operation (a4)
1992 #a = phi <a5>
1994 PHI is the reduction phi node (#a1 = phi <a5, a0> above)
1995 FIRST_STMT is the first reduction stmt in the chain
1996 (a2 = operation (a1)).
1998 Return TRUE if a reduction chain was detected. */
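/* A source-level example of such a chain (illustrative only):

     for (i = 0; i < n; i++)
       s = s + a[4*i] + a[4*i+1] + a[4*i+2] + a[4*i+3];

   which gimplifies into four dependent additions per iteration feeding the
   reduction PHI.  */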
2000 static bool
2001 vect_is_slp_reduction (loop_vec_info loop_info, gimple phi, gimple first_stmt)
2003 struct loop *loop = (gimple_bb (phi))->loop_father;
2004 struct loop *vect_loop = LOOP_VINFO_LOOP (loop_info);
2005 enum tree_code code;
2006 gimple current_stmt = NULL, loop_use_stmt = NULL, first, next_stmt;
2007 stmt_vec_info use_stmt_info, current_stmt_info;
2008 tree lhs;
2009 imm_use_iterator imm_iter;
2010 use_operand_p use_p;
2011 int nloop_uses, size = 0, n_out_of_loop_uses;
2012 bool found = false;
2014 if (loop != vect_loop)
2015 return false;
2017 lhs = PHI_RESULT (phi);
2018 code = gimple_assign_rhs_code (first_stmt);
2019 while (1)
2021 nloop_uses = 0;
2022 n_out_of_loop_uses = 0;
2023 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, lhs)
2025 gimple use_stmt = USE_STMT (use_p);
2026 if (is_gimple_debug (use_stmt))
2027 continue;
2029 /* Check if we got back to the reduction phi. */
2030 if (use_stmt == phi)
2032 loop_use_stmt = use_stmt;
2033 found = true;
2034 break;
2037 if (flow_bb_inside_loop_p (loop, gimple_bb (use_stmt)))
2039 loop_use_stmt = use_stmt;
2040 nloop_uses++;
2042 else
2043 n_out_of_loop_uses++;
2045 /* There can be either a single use in the loop or two uses in
2046 phi nodes. */
2047 if (nloop_uses > 1 || (n_out_of_loop_uses && nloop_uses))
2048 return false;
2051 if (found)
2052 break;
2054 /* We reached a statement with no loop uses. */
2055 if (nloop_uses == 0)
2056 return false;
2058 /* This is a loop exit phi, and we haven't reached the reduction phi. */
2059 if (gimple_code (loop_use_stmt) == GIMPLE_PHI)
2060 return false;
2062 if (!is_gimple_assign (loop_use_stmt)
2063 || code != gimple_assign_rhs_code (loop_use_stmt)
2064 || !flow_bb_inside_loop_p (loop, gimple_bb (loop_use_stmt)))
2065 return false;
2067 /* Insert USE_STMT into reduction chain. */
2068 use_stmt_info = vinfo_for_stmt (loop_use_stmt);
2069 if (current_stmt)
2071 current_stmt_info = vinfo_for_stmt (current_stmt);
2072 GROUP_NEXT_ELEMENT (current_stmt_info) = loop_use_stmt;
2073 GROUP_FIRST_ELEMENT (use_stmt_info)
2074 = GROUP_FIRST_ELEMENT (current_stmt_info);
2076 else
2077 GROUP_FIRST_ELEMENT (use_stmt_info) = loop_use_stmt;
2079 lhs = gimple_assign_lhs (loop_use_stmt);
2080 current_stmt = loop_use_stmt;
2081 size++;
2084 if (!found || loop_use_stmt != phi || size < 2)
2085 return false;
2087 /* Swap the operands, if needed, to make the reduction operand be the second
2088 operand. */
2089 lhs = PHI_RESULT (phi);
2090 next_stmt = GROUP_FIRST_ELEMENT (vinfo_for_stmt (current_stmt));
2091 while (next_stmt)
2093 if (gimple_assign_rhs2 (next_stmt) == lhs)
2095 tree op = gimple_assign_rhs1 (next_stmt);
2096 gimple def_stmt = NULL;
2098 if (TREE_CODE (op) == SSA_NAME)
2099 def_stmt = SSA_NAME_DEF_STMT (op);
2101 /* Check that the other def is either defined in the loop
2102 ("vect_internal_def"), or it's an induction (defined by a
2103 loop-header phi-node). */
2104 if (def_stmt
2105 && gimple_bb (def_stmt)
2106 && flow_bb_inside_loop_p (loop, gimple_bb (def_stmt))
2107 && (is_gimple_assign (def_stmt)
2108 || is_gimple_call (def_stmt)
2109 || STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def_stmt))
2110 == vect_induction_def
2111 || (gimple_code (def_stmt) == GIMPLE_PHI
2112 && STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def_stmt))
2113 == vect_internal_def
2114 && !is_loop_header_bb_p (gimple_bb (def_stmt)))))
2116 lhs = gimple_assign_lhs (next_stmt);
2117 next_stmt = GROUP_NEXT_ELEMENT (vinfo_for_stmt (next_stmt));
2118 continue;
2121 return false;
2123 else
2125 tree op = gimple_assign_rhs2 (next_stmt);
2126 gimple def_stmt = NULL;
2128 if (TREE_CODE (op) == SSA_NAME)
2129 def_stmt = SSA_NAME_DEF_STMT (op);
2131 /* Check that the other def is either defined in the loop
2132 ("vect_internal_def"), or it's an induction (defined by a
2133 loop-header phi-node). */
2134 if (def_stmt
2135 && gimple_bb (def_stmt)
2136 && flow_bb_inside_loop_p (loop, gimple_bb (def_stmt))
2137 && (is_gimple_assign (def_stmt)
2138 || is_gimple_call (def_stmt)
2139 || STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def_stmt))
2140 == vect_induction_def
2141 || (gimple_code (def_stmt) == GIMPLE_PHI
2142 && STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def_stmt))
2143 == vect_internal_def
2144 && !is_loop_header_bb_p (gimple_bb (def_stmt)))))
2146 if (dump_enabled_p ())
2148 dump_printf_loc (MSG_NOTE, vect_location, "swapping oprnds: ");
2149 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, next_stmt, 0);
2150 dump_printf (MSG_NOTE, "\n");
2153 swap_ssa_operands (next_stmt,
2154 gimple_assign_rhs1_ptr (next_stmt),
2155 gimple_assign_rhs2_ptr (next_stmt));
2156 update_stmt (next_stmt);
2158 if (CONSTANT_CLASS_P (gimple_assign_rhs1 (next_stmt)))
2159 LOOP_VINFO_OPERANDS_SWAPPED (loop_info) = true;
2161 else
2162 return false;
2165 lhs = gimple_assign_lhs (next_stmt);
2166 next_stmt = GROUP_NEXT_ELEMENT (vinfo_for_stmt (next_stmt));
2169 /* Save the chain for further analysis in SLP detection. */
2170 first = GROUP_FIRST_ELEMENT (vinfo_for_stmt (current_stmt));
2171 LOOP_VINFO_REDUCTION_CHAINS (loop_info).safe_push (first);
2172 GROUP_SIZE (vinfo_for_stmt (first)) = size;
2174 return true;
2178 /* Function vect_is_simple_reduction_1
2180 (1) Detect a cross-iteration def-use cycle that represents a simple
2181 reduction computation. We look for the following pattern:
2183 loop_header:
2184 a1 = phi < a0, a2 >
2185 a3 = ...
2186 a2 = operation (a3, a1)
2190 a3 = ...
2191 loop_header:
2192 a1 = phi < a0, a2 >
2193 a2 = operation (a3, a1)
2195 such that:
2196 1. operation is commutative and associative and it is safe to
2197 change the order of the computation (if CHECK_REDUCTION is true)
2198 2. no uses for a2 in the loop (a2 is used out of the loop)
2199 3. no uses of a1 in the loop besides the reduction operation
2200 4. no uses of a1 outside the loop.
2202 Conditions 1,4 are tested here.
2203 Conditions 2,3 are tested in vect_mark_stmts_to_be_vectorized.
2205 (2) Detect a cross-iteration def-use cycle in nested loops, i.e.,
2206 nested cycles, if CHECK_REDUCTION is false.
2208 (3) Detect cycles of phi nodes in outer-loop vectorization, i.e., double
2209 reductions:
2211 a1 = phi < a0, a2 >
2212 inner loop (def of a3)
2213 a2 = phi < a3 >
2215 If MODIFY is true it tries also to rework the code in-place to enable
2216 detection of more reduction patterns. For the time being we rewrite
2217 "res -= RHS" into "rhs += -RHS" when it seems worthwhile.
2220 static gimple
2221 vect_is_simple_reduction_1 (loop_vec_info loop_info, gimple phi,
2222 bool check_reduction, bool *double_reduc,
2223 bool modify)
2225 struct loop *loop = (gimple_bb (phi))->loop_father;
2226 struct loop *vect_loop = LOOP_VINFO_LOOP (loop_info);
2227 edge latch_e = loop_latch_edge (loop);
2228 tree loop_arg = PHI_ARG_DEF_FROM_EDGE (phi, latch_e);
2229 gimple def_stmt, def1 = NULL, def2 = NULL;
2230 enum tree_code orig_code, code;
2231 tree op1, op2, op3 = NULL_TREE, op4 = NULL_TREE;
2232 tree type;
2233 int nloop_uses;
2234 tree name;
2235 imm_use_iterator imm_iter;
2236 use_operand_p use_p;
2237 bool phi_def;
2239 *double_reduc = false;
2241 /* If CHECK_REDUCTION is true, we assume inner-most loop vectorization,
2242 otherwise, we assume outer loop vectorization. */
2243 gcc_assert ((check_reduction && loop == vect_loop)
2244 || (!check_reduction && flow_loop_nested_p (vect_loop, loop)));
2246 name = PHI_RESULT (phi);
2247 /* ??? If there are no uses of the PHI result the inner loop reduction
2248 won't be detected as possibly double-reduction by vectorizable_reduction
2249 because that tries to walk the PHI arg from the preheader edge which
2250 can be constant. See PR60382. */
2251 if (has_zero_uses (name))
2252 return NULL;
2253 nloop_uses = 0;
2254 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, name)
2256 gimple use_stmt = USE_STMT (use_p);
2257 if (is_gimple_debug (use_stmt))
2258 continue;
2260 if (!flow_bb_inside_loop_p (loop, gimple_bb (use_stmt)))
2262 if (dump_enabled_p ())
2263 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2264 "intermediate value used outside loop.\n");
2266 return NULL;
2269 nloop_uses++;
2270 if (nloop_uses > 1)
2272 if (dump_enabled_p ())
2273 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2274 "reduction used in loop.\n");
2275 return NULL;
2279 if (TREE_CODE (loop_arg) != SSA_NAME)
2281 if (dump_enabled_p ())
2283 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2284 "reduction: not ssa_name: ");
2285 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, loop_arg);
2286 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
2288 return NULL;
2291 def_stmt = SSA_NAME_DEF_STMT (loop_arg);
2292 if (!def_stmt)
2294 if (dump_enabled_p ())
2295 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2296 "reduction: no def_stmt.\n");
2297 return NULL;
2300 if (!is_gimple_assign (def_stmt) && gimple_code (def_stmt) != GIMPLE_PHI)
2302 if (dump_enabled_p ())
2304 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, def_stmt, 0);
2305 dump_printf (MSG_NOTE, "\n");
2307 return NULL;
2310 if (is_gimple_assign (def_stmt))
2312 name = gimple_assign_lhs (def_stmt);
2313 phi_def = false;
2315 else
2317 name = PHI_RESULT (def_stmt);
2318 phi_def = true;
2321 nloop_uses = 0;
2322 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, name)
2324 gimple use_stmt = USE_STMT (use_p);
2325 if (is_gimple_debug (use_stmt))
2326 continue;
2327 if (flow_bb_inside_loop_p (loop, gimple_bb (use_stmt)))
2328 nloop_uses++;
2329 if (nloop_uses > 1)
2331 if (dump_enabled_p ())
2332 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2333 "reduction used in loop.\n");
2334 return NULL;
2338 /* If DEF_STMT is a phi node itself, we expect it to have a single argument
2339 defined in the inner loop. */
2340 if (phi_def)
2342 op1 = PHI_ARG_DEF (def_stmt, 0);
2344 if (gimple_phi_num_args (def_stmt) != 1
2345 || TREE_CODE (op1) != SSA_NAME)
2347 if (dump_enabled_p ())
2348 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2349 "unsupported phi node definition.\n");
2351 return NULL;
2354 def1 = SSA_NAME_DEF_STMT (op1);
2355 if (gimple_bb (def1)
2356 && flow_bb_inside_loop_p (loop, gimple_bb (def_stmt))
2357 && loop->inner
2358 && flow_bb_inside_loop_p (loop->inner, gimple_bb (def1))
2359 && is_gimple_assign (def1))
2361 if (dump_enabled_p ())
2362 report_vect_op (MSG_NOTE, def_stmt,
2363 "detected double reduction: ");
2365 *double_reduc = true;
2366 return def_stmt;
2369 return NULL;
2372 code = orig_code = gimple_assign_rhs_code (def_stmt);
2374 /* We can handle "res -= x[i]", which is non-associative by
2375 simply rewriting this into "res += -x[i]". Avoid changing
2376 gimple instruction for the first simple tests and only do this
2377 if we're allowed to change code at all. */
2378 if (code == MINUS_EXPR
2379 && modify
2380 && (op1 = gimple_assign_rhs1 (def_stmt))
2381 && TREE_CODE (op1) == SSA_NAME
2382 && SSA_NAME_DEF_STMT (op1) == phi)
2383 code = PLUS_EXPR;
2385 if (check_reduction
2386 && (!commutative_tree_code (code) || !associative_tree_code (code)))
2388 if (dump_enabled_p ())
2389 report_vect_op (MSG_MISSED_OPTIMIZATION, def_stmt,
2390 "reduction: not commutative/associative: ");
2391 return NULL;
2394 if (get_gimple_rhs_class (code) != GIMPLE_BINARY_RHS)
2396 if (code != COND_EXPR)
2398 if (dump_enabled_p ())
2399 report_vect_op (MSG_MISSED_OPTIMIZATION, def_stmt,
2400 "reduction: not binary operation: ");
2402 return NULL;
2405 op3 = gimple_assign_rhs1 (def_stmt);
2406 if (COMPARISON_CLASS_P (op3))
2408 op4 = TREE_OPERAND (op3, 1);
2409 op3 = TREE_OPERAND (op3, 0);
2412 op1 = gimple_assign_rhs2 (def_stmt);
2413 op2 = gimple_assign_rhs3 (def_stmt);
2415 if (TREE_CODE (op1) != SSA_NAME && TREE_CODE (op2) != SSA_NAME)
2417 if (dump_enabled_p ())
2418 report_vect_op (MSG_MISSED_OPTIMIZATION, def_stmt,
2419 "reduction: uses not ssa_names: ");
2421 return NULL;
2424 else
2426 op1 = gimple_assign_rhs1 (def_stmt);
2427 op2 = gimple_assign_rhs2 (def_stmt);
2429 if (TREE_CODE (op1) != SSA_NAME && TREE_CODE (op2) != SSA_NAME)
2431 if (dump_enabled_p ())
2432 report_vect_op (MSG_MISSED_OPTIMIZATION, def_stmt,
2433 "reduction: uses not ssa_names: ");
2435 return NULL;
2439 type = TREE_TYPE (gimple_assign_lhs (def_stmt));
2440 if ((TREE_CODE (op1) == SSA_NAME
2441 && !types_compatible_p (type,TREE_TYPE (op1)))
2442 || (TREE_CODE (op2) == SSA_NAME
2443 && !types_compatible_p (type, TREE_TYPE (op2)))
2444 || (op3 && TREE_CODE (op3) == SSA_NAME
2445 && !types_compatible_p (type, TREE_TYPE (op3)))
2446 || (op4 && TREE_CODE (op4) == SSA_NAME
2447 && !types_compatible_p (type, TREE_TYPE (op4))))
2449 if (dump_enabled_p ())
2451 dump_printf_loc (MSG_NOTE, vect_location,
2452 "reduction: multiple types: operation type: ");
2453 dump_generic_expr (MSG_NOTE, TDF_SLIM, type);
2454 dump_printf (MSG_NOTE, ", operands types: ");
2455 dump_generic_expr (MSG_NOTE, TDF_SLIM,
2456 TREE_TYPE (op1));
2457 dump_printf (MSG_NOTE, ",");
2458 dump_generic_expr (MSG_NOTE, TDF_SLIM,
2459 TREE_TYPE (op2));
2460 if (op3)
2462 dump_printf (MSG_NOTE, ",");
2463 dump_generic_expr (MSG_NOTE, TDF_SLIM,
2464 TREE_TYPE (op3));
2467 if (op4)
2469 dump_printf (MSG_NOTE, ",");
2470 dump_generic_expr (MSG_NOTE, TDF_SLIM,
2471 TREE_TYPE (op4));
2473 dump_printf (MSG_NOTE, "\n");
2476 return NULL;
2479 /* Check that it's ok to change the order of the computation.
2480 Generally, when vectorizing a reduction we change the order of the
2481 computation. This may change the behavior of the program in some
2482 cases, so we need to check that this is ok. One exception is when
2483 vectorizing an outer-loop: the inner-loop is executed sequentially,
2484 and therefore vectorizing reductions in the inner-loop during
2485 outer-loop vectorization is safe. */
2487 /* CHECKME: check for !flag_finite_math_only too? */
2488 if (SCALAR_FLOAT_TYPE_P (type) && !flag_associative_math
2489 && check_reduction)
2491 /* Changing the order of operations changes the semantics. */
2492 if (dump_enabled_p ())
2493 report_vect_op (MSG_MISSED_OPTIMIZATION, def_stmt,
2494 "reduction: unsafe fp math optimization: ");
2495 return NULL;
2497 else if (INTEGRAL_TYPE_P (type) && TYPE_OVERFLOW_TRAPS (type)
2498 && check_reduction)
2500 /* Changing the order of operations changes the semantics. */
2501 if (dump_enabled_p ())
2502 report_vect_op (MSG_MISSED_OPTIMIZATION, def_stmt,
2503 "reduction: unsafe int math optimization: ");
2504 return NULL;
2506 else if (SAT_FIXED_POINT_TYPE_P (type) && check_reduction)
2508 /* Changing the order of operations changes the semantics. */
2509 if (dump_enabled_p ())
2510 report_vect_op (MSG_MISSED_OPTIMIZATION, def_stmt,
2511 "reduction: unsafe fixed-point math optimization: ");
2512 return NULL;
2515 /* If we detected "res -= x[i]" earlier, rewrite it into
2516 "res += -x[i]" now. If this turns out to be useless reassoc
2517 will clean it up again. */
2518 if (orig_code == MINUS_EXPR)
2520 tree rhs = gimple_assign_rhs2 (def_stmt);
2521 tree negrhs = make_ssa_name (TREE_TYPE (rhs));
2522 gimple negate_stmt = gimple_build_assign (negrhs, NEGATE_EXPR, rhs);
2523 gimple_stmt_iterator gsi = gsi_for_stmt (def_stmt);
2524 set_vinfo_for_stmt (negate_stmt, new_stmt_vec_info (negate_stmt,
2525 loop_info, NULL));
2526 gsi_insert_before (&gsi, negate_stmt, GSI_NEW_STMT);
2527 gimple_assign_set_rhs2 (def_stmt, negrhs);
2528 gimple_assign_set_rhs_code (def_stmt, PLUS_EXPR);
2529 update_stmt (def_stmt);
2532 /* Reduction is safe. We're dealing with one of the following:
2533 1) integer arithmetic and no trapv
2534 2) floating point arithmetic, and special flags permit this optimization
2535 3) nested cycle (i.e., outer loop vectorization). */
2536 if (TREE_CODE (op1) == SSA_NAME)
2537 def1 = SSA_NAME_DEF_STMT (op1);
2539 if (TREE_CODE (op2) == SSA_NAME)
2540 def2 = SSA_NAME_DEF_STMT (op2);
2542 if (code != COND_EXPR
2543 && ((!def1 || gimple_nop_p (def1)) && (!def2 || gimple_nop_p (def2))))
2545 if (dump_enabled_p ())
2546 report_vect_op (MSG_NOTE, def_stmt, "reduction: no defs for operands: ");
2547 return NULL;
2550 /* Check that one def is the reduction def, defined by PHI,
2551 the other def is either defined in the loop ("vect_internal_def"),
2552 or it's an induction (defined by a loop-header phi-node). */
2554 if (def2 && def2 == phi
2555 && (code == COND_EXPR
2556 || !def1 || gimple_nop_p (def1)
2557 || !flow_bb_inside_loop_p (loop, gimple_bb (def1))
2558 || (def1 && flow_bb_inside_loop_p (loop, gimple_bb (def1))
2559 && (is_gimple_assign (def1)
2560 || is_gimple_call (def1)
2561 || STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def1))
2562 == vect_induction_def
2563 || (gimple_code (def1) == GIMPLE_PHI
2564 && STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def1))
2565 == vect_internal_def
2566 && !is_loop_header_bb_p (gimple_bb (def1)))))))
2568 if (dump_enabled_p ())
2569 report_vect_op (MSG_NOTE, def_stmt, "detected reduction: ");
2570 return def_stmt;
2573 if (def1 && def1 == phi
2574 && (code == COND_EXPR
2575 || !def2 || gimple_nop_p (def2)
2576 || !flow_bb_inside_loop_p (loop, gimple_bb (def2))
2577 || (def2 && flow_bb_inside_loop_p (loop, gimple_bb (def2))
2578 && (is_gimple_assign (def2)
2579 || is_gimple_call (def2)
2580 || STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def2))
2581 == vect_induction_def
2582 || (gimple_code (def2) == GIMPLE_PHI
2583 && STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def2))
2584 == vect_internal_def
2585 && !is_loop_header_bb_p (gimple_bb (def2)))))))
2587 if (check_reduction)
2589 /* Swap operands (just for simplicity - so that the rest of the code
2590 can assume that the reduction variable is always the last (second)
2591 argument). */
2592 if (dump_enabled_p ())
2593 report_vect_op (MSG_NOTE, def_stmt,
2594 "detected reduction: need to swap operands: ");
2596 swap_ssa_operands (def_stmt, gimple_assign_rhs1_ptr (def_stmt),
2597 gimple_assign_rhs2_ptr (def_stmt));
2599 if (CONSTANT_CLASS_P (gimple_assign_rhs1 (def_stmt)))
2600 LOOP_VINFO_OPERANDS_SWAPPED (loop_info) = true;
2602 else
2604 if (dump_enabled_p ())
2605 report_vect_op (MSG_NOTE, def_stmt, "detected reduction: ");
2608 return def_stmt;
2611 /* Try to find SLP reduction chain. */
2612 if (check_reduction && vect_is_slp_reduction (loop_info, phi, def_stmt))
2614 if (dump_enabled_p ())
2615 report_vect_op (MSG_NOTE, def_stmt,
2616 "reduction: detected reduction chain: ");
2618 return def_stmt;
2621 if (dump_enabled_p ())
2622 report_vect_op (MSG_MISSED_OPTIMIZATION, def_stmt,
2623 "reduction: unknown pattern: ");
2625 return NULL;
2628 /* Wrapper around vect_is_simple_reduction_1, that won't modify code
2629 in-place. Arguments as there. */
2631 static gimple
2632 vect_is_simple_reduction (loop_vec_info loop_info, gimple phi,
2633 bool check_reduction, bool *double_reduc)
2635 return vect_is_simple_reduction_1 (loop_info, phi, check_reduction,
2636 double_reduc, false);
2639 /* Wrapper around vect_is_simple_reduction_1, which will modify code
2640 in-place if it enables detection of more reductions. Arguments
2641 as there. */
2643 gimple
2644 vect_force_simple_reduction (loop_vec_info loop_info, gimple phi,
2645 bool check_reduction, bool *double_reduc)
2647 return vect_is_simple_reduction_1 (loop_info, phi, check_reduction,
2648 double_reduc, true);
2651 /* Calculate the cost of one scalar iteration of the loop. */
2652 static int
2653 vect_get_single_scalar_iteration_cost (loop_vec_info loop_vinfo,
2654 stmt_vector_for_cost *scalar_cost_vec)
2656 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
2657 basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
2658 int nbbs = loop->num_nodes, factor, scalar_single_iter_cost = 0;
2659 int innerloop_iters, i;
2661 /* Count statements in scalar loop. Using this as scalar cost for a single
2662 iteration for now.
2664 TODO: Add outer loop support.
2666 TODO: Consider assigning different costs to different scalar
2667 statements. */
2669 /* FORNOW. */
2670 innerloop_iters = 1;
2671 if (loop->inner)
2672 innerloop_iters = 50; /* FIXME */
2674 for (i = 0; i < nbbs; i++)
2676 gimple_stmt_iterator si;
2677 basic_block bb = bbs[i];
2679 if (bb->loop_father == loop->inner)
2680 factor = innerloop_iters;
2681 else
2682 factor = 1;
2684 for (si = gsi_start_bb (bb); !gsi_end_p (si); gsi_next (&si))
2686 gimple stmt = gsi_stmt (si);
2687 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
2689 if (!is_gimple_assign (stmt) && !is_gimple_call (stmt))
2690 continue;
2692 /* Skip stmts that are not vectorized inside the loop. */
2693 if (stmt_info
2694 && !STMT_VINFO_RELEVANT_P (stmt_info)
2695 && (!STMT_VINFO_LIVE_P (stmt_info)
2696 || !VECTORIZABLE_CYCLE_DEF (STMT_VINFO_DEF_TYPE (stmt_info)))
2697 && !STMT_VINFO_IN_PATTERN_P (stmt_info))
2698 continue;
2700 vect_cost_for_stmt kind;
2701 if (STMT_VINFO_DATA_REF (vinfo_for_stmt (stmt)))
2703 if (DR_IS_READ (STMT_VINFO_DATA_REF (vinfo_for_stmt (stmt))))
2704 kind = scalar_load;
2705 else
2706 kind = scalar_store;
2708 else
2709 kind = scalar_stmt;
2711 scalar_single_iter_cost
2712 += record_stmt_cost (scalar_cost_vec, factor, kind,
2713 NULL, 0, vect_prologue);
2716 return scalar_single_iter_cost;
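/* Illustrative costing (assumed loop body): one load, one store and two
   arithmetic statements with factor 1 are recorded as one scalar_load, one
   scalar_store and two scalar_stmt entries in SCALAR_COST_VEC, and the sum
   of their costs is returned.  */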
2719 /* Calculate cost of peeling the loop PEEL_ITERS_PROLOGUE times. */
2720 int
2721 vect_get_known_peeling_cost (loop_vec_info loop_vinfo, int peel_iters_prologue,
2722 int *peel_iters_epilogue,
2723 stmt_vector_for_cost *scalar_cost_vec,
2724 stmt_vector_for_cost *prologue_cost_vec,
2725 stmt_vector_for_cost *epilogue_cost_vec)
2727 int retval = 0;
2728 int vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
2730 if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
2732 *peel_iters_epilogue = vf/2;
2733 if (dump_enabled_p ())
2734 dump_printf_loc (MSG_NOTE, vect_location,
2735 "cost model: epilogue peel iters set to vf/2 "
2736 "because loop iterations are unknown .\n");
2738 /* If peeled iterations are known but the number of scalar loop
2739 iterations is unknown, count a taken branch per peeled loop.  */
2740 retval = record_stmt_cost (prologue_cost_vec, 1, cond_branch_taken,
2741 NULL, 0, vect_prologue);
2742 retval = record_stmt_cost (prologue_cost_vec, 1, cond_branch_taken,
2743 NULL, 0, vect_epilogue);
2745 else
2747 int niters = LOOP_VINFO_INT_NITERS (loop_vinfo);
2748 peel_iters_prologue = niters < peel_iters_prologue ?
2749 niters : peel_iters_prologue;
2750 *peel_iters_epilogue = (niters - peel_iters_prologue) % vf;
2751 /* If we need to peel for gaps, but no peeling is required, we have to
2752 peel VF iterations. */
2753 if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo) && !*peel_iters_epilogue)
2754 *peel_iters_epilogue = vf;
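/* Illustrative arithmetic (assumed values): niters = 100,
   peel_iters_prologue = 3 and vf = 4 give *peel_iters_epilogue
   = (100 - 3) % 4 = 1; had that been 0 while peeling for gaps is
   required, it would be bumped to vf.  */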
2757 stmt_info_for_cost *si;
2758 int j;
2759 if (peel_iters_prologue)
2760 FOR_EACH_VEC_ELT (*scalar_cost_vec, j, si)
2761 retval += record_stmt_cost (prologue_cost_vec,
2762 si->count * peel_iters_prologue,
2763 si->kind, NULL, si->misalign,
2764 vect_prologue);
2765 if (*peel_iters_epilogue)
2766 FOR_EACH_VEC_ELT (*scalar_cost_vec, j, si)
2767 retval += record_stmt_cost (epilogue_cost_vec,
2768 si->count * *peel_iters_epilogue,
2769 si->kind, NULL, si->misalign,
2770 vect_epilogue);
2772 return retval;
2775 /* Function vect_estimate_min_profitable_iters
2777 Return the number of iterations required for the vector version of the
2778 loop to be profitable relative to the cost of the scalar version of the
2779 loop. */
2781 static void
2782 vect_estimate_min_profitable_iters (loop_vec_info loop_vinfo,
2783 int *ret_min_profitable_niters,
2784 int *ret_min_profitable_estimate)
2786 int min_profitable_iters;
2787 int min_profitable_estimate;
2788 int peel_iters_prologue;
2789 int peel_iters_epilogue;
2790 unsigned vec_inside_cost = 0;
2791 int vec_outside_cost = 0;
2792 unsigned vec_prologue_cost = 0;
2793 unsigned vec_epilogue_cost = 0;
2794 int scalar_single_iter_cost = 0;
2795 int scalar_outside_cost = 0;
2796 int vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
2797 int npeel = LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo);
2798 void *target_cost_data = LOOP_VINFO_TARGET_COST_DATA (loop_vinfo);
2800 /* Cost model disabled. */
2801 if (unlimited_cost_model (LOOP_VINFO_LOOP (loop_vinfo)))
2803 dump_printf_loc (MSG_NOTE, vect_location, "cost model disabled.\n");
2804 *ret_min_profitable_niters = 0;
2805 *ret_min_profitable_estimate = 0;
2806 return;
2809 /* Requires loop versioning tests to handle misalignment. */
2810 if (LOOP_REQUIRES_VERSIONING_FOR_ALIGNMENT (loop_vinfo))
2812 /* FIXME: Make cost depend on complexity of individual check. */
2813 unsigned len = LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).length ();
2814 (void) add_stmt_cost (target_cost_data, len, vector_stmt, NULL, 0,
2815 vect_prologue);
2816 dump_printf (MSG_NOTE,
2817 "cost model: Adding cost of checks for loop "
2818 "versioning to treat misalignment.\n");
2821 /* Requires loop versioning with alias checks. */
2822 if (LOOP_REQUIRES_VERSIONING_FOR_ALIAS (loop_vinfo))
2824 /* FIXME: Make cost depend on complexity of individual check. */
2825 unsigned len = LOOP_VINFO_MAY_ALIAS_DDRS (loop_vinfo).length ();
2826 (void) add_stmt_cost (target_cost_data, len, vector_stmt, NULL, 0,
2827 vect_prologue);
2828 dump_printf (MSG_NOTE,
2829 "cost model: Adding cost of checks for loop "
2830 "versioning aliasing.\n");
2833 if (LOOP_REQUIRES_VERSIONING_FOR_ALIGNMENT (loop_vinfo)
2834 || LOOP_REQUIRES_VERSIONING_FOR_ALIAS (loop_vinfo))
2835 (void) add_stmt_cost (target_cost_data, 1, cond_branch_taken, NULL, 0,
2836 vect_prologue);
2838 /* Count statements in scalar loop. Using this as scalar cost for a single
2839 iteration for now.
2841 TODO: Add outer loop support.
2843 TODO: Consider assigning different costs to different scalar
2844 statements. */
2846 auto_vec<stmt_info_for_cost> scalar_cost_vec;
2847 scalar_single_iter_cost
2848 = vect_get_single_scalar_iteration_cost (loop_vinfo, &scalar_cost_vec);
2850 /* Add additional cost for the peeled instructions in prologue and epilogue
2851 loop.
2853 FORNOW: If we don't know the value of peel_iters for prologue or epilogue
2854 at compile-time - we assume it's vf/2 (the worst would be vf-1).
2856 TODO: Build an expression that represents peel_iters for prologue and
2857 epilogue to be used in a run-time test. */
2859 if (npeel < 0)
2861 peel_iters_prologue = vf/2;
2862 dump_printf (MSG_NOTE, "cost model: "
2863 "prologue peel iters set to vf/2.\n");
2865 /* If peeling for alignment is unknown, loop bound of main loop becomes
2866 unknown. */
2867 peel_iters_epilogue = vf/2;
2868 dump_printf (MSG_NOTE, "cost model: "
2869 "epilogue peel iters set to vf/2 because "
2870 "peeling for alignment is unknown.\n");
2872 /* If peeled iterations are unknown, count a taken branch and a not taken
2873 branch per peeled loop. Even if scalar loop iterations are known,
2874 vector iterations are not known since peeled prologue iterations are
2875 not known. Hence guards remain the same. */
2876 (void) add_stmt_cost (target_cost_data, 1, cond_branch_taken,
2877 NULL, 0, vect_prologue);
2878 (void) add_stmt_cost (target_cost_data, 1, cond_branch_not_taken,
2879 NULL, 0, vect_prologue);
2880 (void) add_stmt_cost (target_cost_data, 1, cond_branch_taken,
2881 NULL, 0, vect_epilogue);
2882 (void) add_stmt_cost (target_cost_data, 1, cond_branch_not_taken,
2883 NULL, 0, vect_epilogue);
2884 stmt_info_for_cost *si;
2885 int j;
2886 FOR_EACH_VEC_ELT (scalar_cost_vec, j, si)
2888 struct _stmt_vec_info *stmt_info
2889 = si->stmt ? vinfo_for_stmt (si->stmt) : NULL;
2890 (void) add_stmt_cost (target_cost_data,
2891 si->count * peel_iters_prologue,
2892 si->kind, stmt_info, si->misalign,
2893 vect_prologue);
2894 (void) add_stmt_cost (target_cost_data,
2895 si->count * peel_iters_epilogue,
2896 si->kind, stmt_info, si->misalign,
2897 vect_epilogue);
2900 else
2902 stmt_vector_for_cost prologue_cost_vec, epilogue_cost_vec;
2903 stmt_info_for_cost *si;
2904 int j;
2905 void *data = LOOP_VINFO_TARGET_COST_DATA (loop_vinfo);
2907 prologue_cost_vec.create (2);
2908 epilogue_cost_vec.create (2);
2909 peel_iters_prologue = npeel;
2911 (void) vect_get_known_peeling_cost (loop_vinfo, peel_iters_prologue,
2912 &peel_iters_epilogue,
2913 &scalar_cost_vec,
2914 &prologue_cost_vec,
2915 &epilogue_cost_vec);
2917 FOR_EACH_VEC_ELT (prologue_cost_vec, j, si)
2919 struct _stmt_vec_info *stmt_info
2920 = si->stmt ? vinfo_for_stmt (si->stmt) : NULL;
2921 (void) add_stmt_cost (data, si->count, si->kind, stmt_info,
2922 si->misalign, vect_prologue);
2925 FOR_EACH_VEC_ELT (epilogue_cost_vec, j, si)
2927 struct _stmt_vec_info *stmt_info
2928 = si->stmt ? vinfo_for_stmt (si->stmt) : NULL;
2929 (void) add_stmt_cost (data, si->count, si->kind, stmt_info,
2930 si->misalign, vect_epilogue);
2933 prologue_cost_vec.release ();
2934 epilogue_cost_vec.release ();
2937 /* FORNOW: The scalar outside cost is incremented in one of the
2938 following ways:
2940 1. The vectorizer checks for alignment and aliasing and generates
2941 a condition that allows dynamic vectorization. A cost model
2942 check is ANDED with the versioning condition. Hence scalar code
2943 path now has the added cost of the versioning check.
2945 if (cost > th & versioning_check)
2946 jmp to vector code
2948 Hence run-time scalar is incremented by not-taken branch cost.
2950 2. The vectorizer then checks if a prologue is required. If the
2951 cost model check was not done before during versioning, it has to
2952 be done before the prologue check.
2954 if (cost <= th)
2955 prologue = scalar_iters
2956 if (prologue == 0)
2957 jmp to vector code
2958 else
2959 execute prologue
2960 if (prologue == num_iters)
2961 go to exit
2963 Hence the run-time scalar cost is incremented by a taken branch,
2964 plus a not-taken branch, plus a taken branch cost.
2966 3. The vectorizer then checks if an epilogue is required. If the
2967 cost model check was not done before during prologue check, it
2968 has to be done with the epilogue check.
2970 if (prologue == 0)
2971 jmp to vector code
2972 else
2973 execute prologue
2974 if (prologue == num_iters)
2975 go to exit
2976 vector code:
2977 if ((cost <= th) | (scalar_iters-prologue-epilogue == 0))
2978 jmp to epilogue
2980 Hence the run-time scalar cost should be incremented by 2 taken
2981 branches.
2983 TODO: The back end may reorder the BBs differently and reverse
2984 conditions/branch directions. Change the estimates below to
2985 something more reasonable. */
2987 /* If the number of iterations is known and we do not do versioning, we can
2988 decide whether to vectorize at compile time. Hence the scalar version
2989 does not carry cost model guard costs.  */
2990 if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
2991 || LOOP_REQUIRES_VERSIONING_FOR_ALIGNMENT (loop_vinfo)
2992 || LOOP_REQUIRES_VERSIONING_FOR_ALIAS (loop_vinfo))
2994 /* Cost model check occurs at versioning. */
2995 if (LOOP_REQUIRES_VERSIONING_FOR_ALIGNMENT (loop_vinfo)
2996 || LOOP_REQUIRES_VERSIONING_FOR_ALIAS (loop_vinfo))
2997 scalar_outside_cost += vect_get_stmt_cost (cond_branch_not_taken);
2998 else
3000 /* Cost model check occurs at prologue generation. */
3001 if (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) < 0)
3002 scalar_outside_cost += 2 * vect_get_stmt_cost (cond_branch_taken)
3003 + vect_get_stmt_cost (cond_branch_not_taken);
3004 /* Cost model check occurs at epilogue generation. */
3005 else
3006 scalar_outside_cost += 2 * vect_get_stmt_cost (cond_branch_taken);
3010 /* Complete the target-specific cost calculations. */
3011 finish_cost (LOOP_VINFO_TARGET_COST_DATA (loop_vinfo), &vec_prologue_cost,
3012 &vec_inside_cost, &vec_epilogue_cost);
3014 vec_outside_cost = (int)(vec_prologue_cost + vec_epilogue_cost);
3016 if (dump_enabled_p ())
3018 dump_printf_loc (MSG_NOTE, vect_location, "Cost model analysis: \n");
3019 dump_printf (MSG_NOTE, " Vector inside of loop cost: %d\n",
3020 vec_inside_cost);
3021 dump_printf (MSG_NOTE, " Vector prologue cost: %d\n",
3022 vec_prologue_cost);
3023 dump_printf (MSG_NOTE, " Vector epilogue cost: %d\n",
3024 vec_epilogue_cost);
3025 dump_printf (MSG_NOTE, " Scalar iteration cost: %d\n",
3026 scalar_single_iter_cost);
3027 dump_printf (MSG_NOTE, " Scalar outside cost: %d\n",
3028 scalar_outside_cost);
3029 dump_printf (MSG_NOTE, " Vector outside cost: %d\n",
3030 vec_outside_cost);
3031 dump_printf (MSG_NOTE, " prologue iterations: %d\n",
3032 peel_iters_prologue);
3033 dump_printf (MSG_NOTE, " epilogue iterations: %d\n",
3034 peel_iters_epilogue);
3037 /* Calculate number of iterations required to make the vector version
3038 profitable, relative to the loop bodies only. The following condition
3039 must hold true:
3040 SIC * niters + SOC > VIC * ((niters-PL_ITERS-EP_ITERS)/VF) + VOC
3041 where
3042 SIC = scalar iteration cost, VIC = vector iteration cost,
3043 VOC = vector outside cost, VF = vectorization factor,
3044 PL_ITERS = prologue iterations, EP_ITERS = epilogue iterations
3045 SOC = scalar outside cost for run time cost model check. */
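/* Worked example (assumed costs, not from the source): SIC = 1, VIC = 2,
   VOC = 20, SOC = 0, VF = 4 and no peeling give
   (20 * 4 - 0) / (1 * 4 - 2) = 40, and the <= re-check below bumps the
   result to 41 before it is clamped to at least VF and decremented.  */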
3047 if ((scalar_single_iter_cost * vf) > (int) vec_inside_cost)
3049 if (vec_outside_cost <= 0)
3050 min_profitable_iters = 1;
3051 else
3053 min_profitable_iters = ((vec_outside_cost - scalar_outside_cost) * vf
3054 - vec_inside_cost * peel_iters_prologue
3055 - vec_inside_cost * peel_iters_epilogue)
3056 / ((scalar_single_iter_cost * vf)
3057 - vec_inside_cost);
3059 if ((scalar_single_iter_cost * vf * min_profitable_iters)
3060 <= (((int) vec_inside_cost * min_profitable_iters)
3061 + (((int) vec_outside_cost - scalar_outside_cost) * vf)))
3062 min_profitable_iters++;
3065 /* vector version will never be profitable. */
3066 else
3068 if (LOOP_VINFO_LOOP (loop_vinfo)->force_vectorize)
3069 warning_at (vect_location, OPT_Wopenmp_simd, "vectorization "
3070 "did not happen for a simd loop");
3072 if (dump_enabled_p ())
3073 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3074 "cost model: the vector iteration cost = %d "
3075 "divided by the scalar iteration cost = %d "
3076 "is greater or equal to the vectorization factor = %d"
3077 ".\n",
3078 vec_inside_cost, scalar_single_iter_cost, vf);
3079 *ret_min_profitable_niters = -1;
3080 *ret_min_profitable_estimate = -1;
3081 return;
3084 dump_printf (MSG_NOTE,
3085 " Calculated minimum iters for profitability: %d\n",
3086 min_profitable_iters);
3088 min_profitable_iters =
3089 min_profitable_iters < vf ? vf : min_profitable_iters;
3091 /* Because the condition we create is:
3092 if (niters <= min_profitable_iters)
3093 then skip the vectorized loop. */
3094 min_profitable_iters--;
3096 if (dump_enabled_p ())
3097 dump_printf_loc (MSG_NOTE, vect_location,
3098 " Runtime profitability threshold = %d\n",
3099 min_profitable_iters);
3101 *ret_min_profitable_niters = min_profitable_iters;
3103 /* Calculate number of iterations required to make the vector version
3104 profitable, relative to the loop bodies only.
3106 Non-vectorized variant is SIC * niters and it must win over vector
3107 variant on the expected loop trip count. The following condition must hold true:
3108 SIC * niters > VIC * ((niters-PL_ITERS-EP_ITERS)/VF) + VOC + SOC */
3110 if (vec_outside_cost <= 0)
3111 min_profitable_estimate = 1;
3112 else
3114 min_profitable_estimate = ((vec_outside_cost + scalar_outside_cost) * vf
3115 - vec_inside_cost * peel_iters_prologue
3116 - vec_inside_cost * peel_iters_epilogue)
3117 / ((scalar_single_iter_cost * vf)
3118 - vec_inside_cost);
3120 min_profitable_estimate --;
3121 min_profitable_estimate = MAX (min_profitable_estimate, min_profitable_iters);
3122 if (dump_enabled_p ())
3123 dump_printf_loc (MSG_NOTE, vect_location,
3124 " Static estimate profitability threshold = %d\n",
3125 min_profitable_estimate);
3127 *ret_min_profitable_estimate = min_profitable_estimate;
3130 /* Writes into SEL a mask for a vec_perm, equivalent to a vec_shr by OFFSET
3131 vector elements (not bits) for a vector of mode MODE. */
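/* For example (illustrative): an 8-element vector mode with OFFSET 2
   yields SEL = {2, 3, 4, 5, 6, 7, 8, 9}, i.e. every lane moves down by two
   and the top two lanes come from the second permute input.  */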
3132 static void
3133 calc_vec_perm_mask_for_shift (enum machine_mode mode, unsigned int offset,
3134 unsigned char *sel)
3136 unsigned int i, nelt = GET_MODE_NUNITS (mode);
3138 for (i = 0; i < nelt; i++)
3139 sel[i] = (i + offset) & (2*nelt - 1);
3142 /* Checks whether the target supports whole-vector shifts for vectors of mode
3143 MODE. This is the case if _either_ the platform handles vec_shr_optab, _or_
3144 it supports vec_perm_const with masks for all necessary shift amounts. */
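/* Illustratively, for an 8-element mode the loop below probes shifts by 4,
   2 and 1 elements, exactly the halving steps a log2-style reduction
   epilogue uses.  */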
3145 static bool
3146 have_whole_vector_shift (enum machine_mode mode)
3148 if (optab_handler (vec_shr_optab, mode) != CODE_FOR_nothing)
3149 return true;
3151 if (direct_optab_handler (vec_perm_const_optab, mode) == CODE_FOR_nothing)
3152 return false;
3154 unsigned int i, nelt = GET_MODE_NUNITS (mode);
3155 unsigned char *sel = XALLOCAVEC (unsigned char, nelt);
3157 for (i = nelt/2; i >= 1; i/=2)
3159 calc_vec_perm_mask_for_shift (mode, i, sel);
3160 if (!can_vec_perm_p (mode, false, sel))
3161 return false;
3163 return true;
3166 /* Return the reduction operand (with index REDUC_INDEX) of STMT. */
3168 static tree
3169 get_reduction_op (gimple stmt, int reduc_index)
3171 switch (get_gimple_rhs_class (gimple_assign_rhs_code (stmt)))
3173 case GIMPLE_SINGLE_RHS:
3174 gcc_assert (TREE_OPERAND_LENGTH (gimple_assign_rhs1 (stmt))
3175 == ternary_op);
3176 return TREE_OPERAND (gimple_assign_rhs1 (stmt), reduc_index);
3177 case GIMPLE_UNARY_RHS:
3178 return gimple_assign_rhs1 (stmt);
3179 case GIMPLE_BINARY_RHS:
3180 return (reduc_index
3181 ? gimple_assign_rhs2 (stmt) : gimple_assign_rhs1 (stmt));
3182 case GIMPLE_TERNARY_RHS:
3183 return gimple_op (stmt, reduc_index + 1);
3184 default:
3185 gcc_unreachable ();
3189 /* TODO: Close dependency between vect_model_*_cost and vectorizable_*
3190 functions. Design better to avoid maintenance issues. */
3192 /* Function vect_model_reduction_cost.
3194 Models cost for a reduction operation, including the vector ops
3195 generated within the strip-mine loop, the initial definition before
3196 the loop, and the epilogue code that must be generated. */
3198 static bool
3199 vect_model_reduction_cost (stmt_vec_info stmt_info, enum tree_code reduc_code,
3200 int ncopies, int reduc_index)
3202 int prologue_cost = 0, epilogue_cost = 0;
3203 enum tree_code code;
3204 optab optab;
3205 tree vectype;
3206 gimple stmt, orig_stmt;
3207 tree reduction_op;
3208 machine_mode mode;
3209 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
3210 struct loop *loop = NULL;
3211 void *target_cost_data;
3213 if (loop_vinfo)
3215 loop = LOOP_VINFO_LOOP (loop_vinfo);
3216 target_cost_data = LOOP_VINFO_TARGET_COST_DATA (loop_vinfo);
3218 else
3219 target_cost_data = BB_VINFO_TARGET_COST_DATA (STMT_VINFO_BB_VINFO (stmt_info));
3221 /* Cost of reduction op inside loop. */
3222 unsigned inside_cost = add_stmt_cost (target_cost_data, ncopies, vector_stmt,
3223 stmt_info, 0, vect_body);
3224 stmt = STMT_VINFO_STMT (stmt_info);
3226 reduction_op = get_reduction_op (stmt, reduc_index);
3228 vectype = get_vectype_for_scalar_type (TREE_TYPE (reduction_op));
3229 if (!vectype)
3231 if (dump_enabled_p ())
3233 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3234 "unsupported data-type ");
3235 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
3236 TREE_TYPE (reduction_op));
3237 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
3239 return false;
3242 mode = TYPE_MODE (vectype);
3243 orig_stmt = STMT_VINFO_RELATED_STMT (stmt_info);
3245 if (!orig_stmt)
3246 orig_stmt = STMT_VINFO_STMT (stmt_info);
3248 code = gimple_assign_rhs_code (orig_stmt);
3250 /* Add in cost for initial definition. */
3251 prologue_cost += add_stmt_cost (target_cost_data, 1, scalar_to_vec,
3252 stmt_info, 0, vect_prologue);
3254 /* Determine cost of epilogue code.
3256 We have a reduction operator that will reduce the vector in one statement.
3257 Also requires scalar extract. */
3259 if (!loop || !nested_in_vect_loop_p (loop, orig_stmt))
3261 if (reduc_code != ERROR_MARK)
3263 epilogue_cost += add_stmt_cost (target_cost_data, 1, vector_stmt,
3264 stmt_info, 0, vect_epilogue);
3265 epilogue_cost += add_stmt_cost (target_cost_data, 1, vec_to_scalar,
3266 stmt_info, 0, vect_epilogue);
3268 else
3270 int vec_size_in_bits = tree_to_uhwi (TYPE_SIZE (vectype));
3271 tree bitsize =
3272 TYPE_SIZE (TREE_TYPE (gimple_assign_lhs (orig_stmt)));
3273 int element_bitsize = tree_to_uhwi (bitsize);
3274 int nelements = vec_size_in_bits / element_bitsize;
3276 optab = optab_for_tree_code (code, vectype, optab_default);
3278 /* We have a whole vector shift available. */
3279 if (VECTOR_MODE_P (mode)
3280 && optab_handler (optab, mode) != CODE_FOR_nothing
3281 && have_whole_vector_shift (mode))
3283 /* Final reduction via vector shifts and the reduction operator.
3284 Also requires scalar extract. */
3285 epilogue_cost += add_stmt_cost (target_cost_data,
3286 exact_log2 (nelements) * 2,
3287 vector_stmt, stmt_info, 0,
3288 vect_epilogue);
3289 epilogue_cost += add_stmt_cost (target_cost_data, 1,
3290 vec_to_scalar, stmt_info, 0,
3291 vect_epilogue);
3293 else
3294 /* Use extracts and reduction op for final reduction. For N
3295 elements, we have N extracts and N-1 reduction ops. */
3296 epilogue_cost += add_stmt_cost (target_cost_data,
3297 nelements + nelements - 1,
3298 vector_stmt, stmt_info, 0,
3299 vect_epilogue);
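/* Illustrative comparison (assumed 4-element vector): the shift-based path
   above is charged exact_log2 (4) * 2 = 4 vector_stmt's plus one
   vec_to_scalar, whereas this fallback is charged 4 + 3 = 7
   vector_stmt's.  */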
3303 if (dump_enabled_p ())
3304 dump_printf (MSG_NOTE,
3305 "vect_model_reduction_cost: inside_cost = %d, "
3306 "prologue_cost = %d, epilogue_cost = %d .\n", inside_cost,
3307 prologue_cost, epilogue_cost);
3309 return true;
3313 /* Function vect_model_induction_cost.
3315 Models cost for induction operations. */
3317 static void
3318 vect_model_induction_cost (stmt_vec_info stmt_info, int ncopies)
3320 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
3321 void *target_cost_data = LOOP_VINFO_TARGET_COST_DATA (loop_vinfo);
3322 unsigned inside_cost, prologue_cost;
3324 /* loop cost for vec_loop. */
3325 inside_cost = add_stmt_cost (target_cost_data, ncopies, vector_stmt,
3326 stmt_info, 0, vect_body);
3328 /* prologue cost for vec_init and vec_step. */
3329 prologue_cost = add_stmt_cost (target_cost_data, 2, scalar_to_vec,
3330 stmt_info, 0, vect_prologue);
3332 if (dump_enabled_p ())
3333 dump_printf_loc (MSG_NOTE, vect_location,
3334 "vect_model_induction_cost: inside_cost = %d, "
3335 "prologue_cost = %d .\n", inside_cost, prologue_cost);
3339 /* Function get_initial_def_for_induction
3341 Input:
3342 STMT - a stmt that performs an induction operation in the loop.
3343 IV_PHI - the initial value of the induction variable
3345 Output:
3346 Return a vector variable, initialized with the first VF values of
3347 the induction variable. E.g., for an iv with IV_PHI='X' and
3348 evolution S, for a vector of 4 units, we want to return:
3349 [X, X + S, X + 2*S, X + 3*S]. */
3351 static tree
3352 get_initial_def_for_induction (gimple iv_phi)
3354 stmt_vec_info stmt_vinfo = vinfo_for_stmt (iv_phi);
3355 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_vinfo);
3356 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
3357 tree vectype;
3358 int nunits;
3359 edge pe = loop_preheader_edge (loop);
3360 struct loop *iv_loop;
3361 basic_block new_bb;
3362 tree new_vec, vec_init, vec_step, t;
3363 tree new_var;
3364 tree new_name;
3365 gimple init_stmt, new_stmt;
3366 gphi *induction_phi;
3367 tree induc_def, vec_def, vec_dest;
3368 tree init_expr, step_expr;
3369 int vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
3370 int i;
3371 int ncopies;
3372 tree expr;
3373 stmt_vec_info phi_info = vinfo_for_stmt (iv_phi);
3374 bool nested_in_vect_loop = false;
3375 gimple_seq stmts = NULL;
3376 imm_use_iterator imm_iter;
3377 use_operand_p use_p;
3378 gimple exit_phi;
3379 edge latch_e;
3380 tree loop_arg;
3381 gimple_stmt_iterator si;
3382 basic_block bb = gimple_bb (iv_phi);
3383 tree stepvectype;
3384 tree resvectype;
3386 /* Is phi in an inner-loop, while vectorizing an enclosing outer-loop? */
3387 if (nested_in_vect_loop_p (loop, iv_phi))
3389 nested_in_vect_loop = true;
3390 iv_loop = loop->inner;
3392 else
3393 iv_loop = loop;
3394 gcc_assert (iv_loop == (gimple_bb (iv_phi))->loop_father);
3396 latch_e = loop_latch_edge (iv_loop);
3397 loop_arg = PHI_ARG_DEF_FROM_EDGE (iv_phi, latch_e);
3399 step_expr = STMT_VINFO_LOOP_PHI_EVOLUTION_PART (phi_info);
3400 gcc_assert (step_expr != NULL_TREE);
3402 pe = loop_preheader_edge (iv_loop);
3403 init_expr = PHI_ARG_DEF_FROM_EDGE (iv_phi,
3404 loop_preheader_edge (iv_loop));
3406 vectype = get_vectype_for_scalar_type (TREE_TYPE (init_expr));
3407 resvectype = get_vectype_for_scalar_type (TREE_TYPE (PHI_RESULT (iv_phi)));
3408 gcc_assert (vectype);
3409 nunits = TYPE_VECTOR_SUBPARTS (vectype);
3410 ncopies = vf / nunits;
3412 gcc_assert (phi_info);
3413 gcc_assert (ncopies >= 1);
3415 /* Convert the step to the desired type. */
3416 step_expr = force_gimple_operand (fold_convert (TREE_TYPE (vectype),
3417 step_expr),
3418 &stmts, true, NULL_TREE);
3419 if (stmts)
3421 new_bb = gsi_insert_seq_on_edge_immediate (pe, stmts);
3422 gcc_assert (!new_bb);
3425 /* Find the first insertion point in the BB. */
3426 si = gsi_after_labels (bb);
3428 /* Create the vector that holds the initial_value of the induction. */
3429 if (nested_in_vect_loop)
3431 /* iv_loop is nested in the loop to be vectorized. init_expr had already
3432 been created during vectorization of previous stmts. We obtain it
3433 from the STMT_VINFO_VEC_STMT of the defining stmt. */
3434 vec_init = vect_get_vec_def_for_operand (init_expr, iv_phi, NULL);
3435 /* If the initial value is not of proper type, convert it. */
3436 if (!useless_type_conversion_p (vectype, TREE_TYPE (vec_init)))
3438 new_stmt
3439 = gimple_build_assign (vect_get_new_vect_var (vectype,
3440 vect_simple_var,
3441 "vec_iv_"),
3442 VIEW_CONVERT_EXPR,
3443 build1 (VIEW_CONVERT_EXPR, vectype,
3444 vec_init));
3445 vec_init = make_ssa_name (gimple_assign_lhs (new_stmt), new_stmt);
3446 gimple_assign_set_lhs (new_stmt, vec_init);
3447 new_bb = gsi_insert_on_edge_immediate (loop_preheader_edge (iv_loop),
3448 new_stmt);
3449 gcc_assert (!new_bb);
3450 set_vinfo_for_stmt (new_stmt,
3451 new_stmt_vec_info (new_stmt, loop_vinfo, NULL));
3454 else
3456 vec<constructor_elt, va_gc> *v;
3458 /* iv_loop is the loop to be vectorized. Create:
3459 vec_init = [X, X+S, X+2*S, X+3*S] (S = step_expr, X = init_expr) */
3460 new_var = vect_get_new_vect_var (TREE_TYPE (vectype),
3461 vect_scalar_var, "var_");
3462 new_name = force_gimple_operand (fold_convert (TREE_TYPE (vectype),
3463 init_expr),
3464 &stmts, false, new_var);
3465 if (stmts)
3467 new_bb = gsi_insert_seq_on_edge_immediate (pe, stmts);
3468 gcc_assert (!new_bb);
3471 vec_alloc (v, nunits);
3472 bool constant_p = is_gimple_min_invariant (new_name);
3473 CONSTRUCTOR_APPEND_ELT (v, NULL_TREE, new_name);
3474 for (i = 1; i < nunits; i++)
3476 /* Create: new_name_i = new_name + step_expr */
3477 new_name = fold_build2 (PLUS_EXPR, TREE_TYPE (new_name),
3478 new_name, step_expr);
3479 if (!is_gimple_min_invariant (new_name))
3481 init_stmt = gimple_build_assign (new_var, new_name);
3482 new_name = make_ssa_name (new_var, init_stmt);
3483 gimple_assign_set_lhs (init_stmt, new_name);
3484 new_bb = gsi_insert_on_edge_immediate (pe, init_stmt);
3485 gcc_assert (!new_bb);
3486 if (dump_enabled_p ())
3488 dump_printf_loc (MSG_NOTE, vect_location,
3489 "created new init_stmt: ");
3490 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, init_stmt, 0);
3491 dump_printf (MSG_NOTE, "\n");
3493 constant_p = false;
3495 CONSTRUCTOR_APPEND_ELT (v, NULL_TREE, new_name);
3497 /* Create a vector from [new_name_0, new_name_1, ..., new_name_nunits-1] */
3498 if (constant_p)
3499 new_vec = build_vector_from_ctor (vectype, v);
3500 else
3501 new_vec = build_constructor (vectype, v);
3502 vec_init = vect_init_vector (iv_phi, new_vec, vectype, NULL);
3506 /* Create the vector that holds the step of the induction. */
3507 if (nested_in_vect_loop)
3508 /* iv_loop is nested in the loop to be vectorized. Generate:
3509 vec_step = [S, S, S, S] */
3510 new_name = step_expr;
3511 else
3513 /* iv_loop is the loop to be vectorized. Generate:
3514 vec_step = [VF*S, VF*S, VF*S, VF*S] */
3515 if (SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr)))
3517 expr = build_int_cst (integer_type_node, vf);
3518 expr = fold_convert (TREE_TYPE (step_expr), expr);
3520 else
3521 expr = build_int_cst (TREE_TYPE (step_expr), vf);
3522 new_name = fold_build2 (MULT_EXPR, TREE_TYPE (step_expr),
3523 expr, step_expr);
3524 if (TREE_CODE (step_expr) == SSA_NAME)
3525 new_name = vect_init_vector (iv_phi, new_name,
3526 TREE_TYPE (step_expr), NULL);
3529 t = unshare_expr (new_name);
3530 gcc_assert (CONSTANT_CLASS_P (new_name)
3531 || TREE_CODE (new_name) == SSA_NAME);
3532 stepvectype = get_vectype_for_scalar_type (TREE_TYPE (new_name));
3533 gcc_assert (stepvectype);
3534 new_vec = build_vector_from_val (stepvectype, t);
3535 vec_step = vect_init_vector (iv_phi, new_vec, stepvectype, NULL);
3538 /* Create the following def-use cycle:
3539 loop prolog:
3540 vec_init = ...
3541 vec_step = ...
3542 loop:
3543 vec_iv = PHI <vec_init, vec_loop>
3545 STMT
3547 vec_loop = vec_iv + vec_step; */
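/* Concretely (illustrative): for an integer IV with initial value 0, step
   S = 1 and VF = 4, vec_init is {0, 1, 2, 3} and vec_step is {4, 4, 4, 4}
   (VF * S), so every lane advances by four per vector iteration.  */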
3549 /* Create the induction-phi that defines the induction-operand. */
3550 vec_dest = vect_get_new_vect_var (vectype, vect_simple_var, "vec_iv_");
3551 induction_phi = create_phi_node (vec_dest, iv_loop->header);
3552 set_vinfo_for_stmt (induction_phi,
3553 new_stmt_vec_info (induction_phi, loop_vinfo, NULL));
3554 induc_def = PHI_RESULT (induction_phi);
3556 /* Create the iv update inside the loop */
3557 new_stmt = gimple_build_assign (vec_dest, PLUS_EXPR, induc_def, vec_step);
3558 vec_def = make_ssa_name (vec_dest, new_stmt);
3559 gimple_assign_set_lhs (new_stmt, vec_def);
3560 gsi_insert_before (&si, new_stmt, GSI_SAME_STMT);
3561 set_vinfo_for_stmt (new_stmt, new_stmt_vec_info (new_stmt, loop_vinfo,
3562 NULL));
3564 /* Set the arguments of the phi node: */
3565 add_phi_arg (induction_phi, vec_init, pe, UNKNOWN_LOCATION);
3566 add_phi_arg (induction_phi, vec_def, loop_latch_edge (iv_loop),
3567 UNKNOWN_LOCATION);
3570 /* In case that vectorization factor (VF) is bigger than the number
3571 of elements that we can fit in a vectype (nunits), we have to generate
3572 more than one vector stmt - i.e., we need to "unroll" the
3573 vector stmt by a factor VF/nunits. For more details see documentation
3574 in vectorizable_operation. */
3576 if (ncopies > 1)
3578 stmt_vec_info prev_stmt_vinfo;
3579 /* FORNOW. This restriction should be relaxed. */
3580 gcc_assert (!nested_in_vect_loop);
3582 /* Create the vector that holds the step of the induction. */
3583 if (SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr)))
3585 expr = build_int_cst (integer_type_node, nunits);
3586 expr = fold_convert (TREE_TYPE (step_expr), expr);
3588 else
3589 expr = build_int_cst (TREE_TYPE (step_expr), nunits);
3590 new_name = fold_build2 (MULT_EXPR, TREE_TYPE (step_expr),
3591 expr, step_expr);
3592 if (TREE_CODE (step_expr) == SSA_NAME)
3593 new_name = vect_init_vector (iv_phi, new_name,
3594 TREE_TYPE (step_expr), NULL);
3595 t = unshare_expr (new_name);
3596 gcc_assert (CONSTANT_CLASS_P (new_name)
3597 || TREE_CODE (new_name) == SSA_NAME);
3598 new_vec = build_vector_from_val (stepvectype, t);
3599 vec_step = vect_init_vector (iv_phi, new_vec, stepvectype, NULL);
3601 vec_def = induc_def;
3602 prev_stmt_vinfo = vinfo_for_stmt (induction_phi);
3603 for (i = 1; i < ncopies; i++)
3605 /* vec_i = vec_prev + vec_step */
3606 new_stmt = gimple_build_assign (vec_dest, PLUS_EXPR,
3607 vec_def, vec_step);
3608 vec_def = make_ssa_name (vec_dest, new_stmt);
3609 gimple_assign_set_lhs (new_stmt, vec_def);
3611 gsi_insert_before (&si, new_stmt, GSI_SAME_STMT);
3612 if (!useless_type_conversion_p (resvectype, vectype))
3614 new_stmt
3615 = gimple_build_assign
3616 (vect_get_new_vect_var (resvectype, vect_simple_var,
3617 "vec_iv_"),
3618 VIEW_CONVERT_EXPR,
3619 build1 (VIEW_CONVERT_EXPR, resvectype,
3620 gimple_assign_lhs (new_stmt)));
3621 gimple_assign_set_lhs (new_stmt,
3622 make_ssa_name
3623 (gimple_assign_lhs (new_stmt), new_stmt));
3624 gsi_insert_before (&si, new_stmt, GSI_SAME_STMT);
3626 set_vinfo_for_stmt (new_stmt,
3627 new_stmt_vec_info (new_stmt, loop_vinfo, NULL));
3628 STMT_VINFO_RELATED_STMT (prev_stmt_vinfo) = new_stmt;
3629 prev_stmt_vinfo = vinfo_for_stmt (new_stmt);
3633 if (nested_in_vect_loop)
3635 /* Find the loop-closed exit-phi of the induction, and record
3636 the final vector of induction results: */
3637 exit_phi = NULL;
3638 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, loop_arg)
3640 gimple use_stmt = USE_STMT (use_p);
3641 if (is_gimple_debug (use_stmt))
3642 continue;
3644 if (!flow_bb_inside_loop_p (iv_loop, gimple_bb (use_stmt)))
3646 exit_phi = use_stmt;
3647 break;
3650 if (exit_phi)
3652 stmt_vec_info stmt_vinfo = vinfo_for_stmt (exit_phi);
3653 /* FORNOW. Currently not supporting the case that an inner-loop induction
3654 is not used in the outer-loop (i.e. only outside the outer-loop). */
3655 gcc_assert (STMT_VINFO_RELEVANT_P (stmt_vinfo)
3656 && !STMT_VINFO_LIVE_P (stmt_vinfo));
3658 STMT_VINFO_VEC_STMT (stmt_vinfo) = new_stmt;
3659 if (dump_enabled_p ())
3661 dump_printf_loc (MSG_NOTE, vect_location,
3662 "vector of inductions after inner-loop:");
3663 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, new_stmt, 0);
3664 dump_printf (MSG_NOTE, "\n");
3670 if (dump_enabled_p ())
3672 dump_printf_loc (MSG_NOTE, vect_location,
3673 "transform induction: created def-use cycle: ");
3674 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, induction_phi, 0);
3675 dump_printf (MSG_NOTE, "\n");
3676 dump_gimple_stmt (MSG_NOTE, TDF_SLIM,
3677 SSA_NAME_DEF_STMT (vec_def), 0);
3678 dump_printf (MSG_NOTE, "\n");
3681 STMT_VINFO_VEC_STMT (phi_info) = induction_phi;
3682 if (!useless_type_conversion_p (resvectype, vectype))
3684 new_stmt = gimple_build_assign (vect_get_new_vect_var (resvectype,
3685 vect_simple_var,
3686 "vec_iv_"),
3687 VIEW_CONVERT_EXPR,
3688 build1 (VIEW_CONVERT_EXPR, resvectype,
3689 induc_def));
3690 induc_def = make_ssa_name (gimple_assign_lhs (new_stmt), new_stmt);
3691 gimple_assign_set_lhs (new_stmt, induc_def);
3692 si = gsi_after_labels (bb);
3693 gsi_insert_before (&si, new_stmt, GSI_SAME_STMT);
3694 set_vinfo_for_stmt (new_stmt,
3695 new_stmt_vec_info (new_stmt, loop_vinfo, NULL));
3696 STMT_VINFO_RELATED_STMT (vinfo_for_stmt (new_stmt))
3697 = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (induction_phi));
3700 return induc_def;
3704 /* Function get_initial_def_for_reduction
3706 Input:
3707 STMT - a stmt that performs a reduction operation in the loop.
3708 INIT_VAL - the initial value of the reduction variable
3710 Output:
3711 ADJUSTMENT_DEF - a tree that holds a value to be added to the final result
3712 of the reduction (used for adjusting the epilog - see below).
3713 Return a vector variable, initialized according to the operation that STMT
3714 performs. This vector will be used as the initial value of the
3715 vector of partial results.
3717 Option1 (adjust in epilog): Initialize the vector as follows:
3718 add/bit or/xor: [0,0,...,0,0]
3719 mult/bit and: [1,1,...,1,1]
3720 min/max/cond_expr: [init_val,init_val,..,init_val,init_val]
3721 and when necessary (e.g. add/mult case) let the caller know
3722 that it needs to adjust the result by init_val.
3724 Option2: Initialize the vector as follows:
3725 add/bit or/xor: [init_val,0,0,...,0]
3726 mult/bit and: [init_val,1,1,...,1]
3727 min/max/cond_expr: [init_val,init_val,...,init_val]
3728 and no adjustments are needed.
3730 For example, for the following code:
3732 s = init_val;
3733 for (i=0;i<n;i++)
3734 s = s + a[i];
3736 STMT is 's = s + a[i]', and the reduction variable is 's'.
3737 For a vector of 4 units, we want to return either [0,0,0,init_val],
3738 or [0,0,0,0] and let the caller know that it needs to adjust
3739 the result at the end by 'init_val'.
3741 FORNOW, we use Option1 (the 'adjust in epilog' scheme) when ADJUSTMENT_DEF
3742 is not NULL, because this way the initialization vector is simpler (same
3743 element in all entries), and Option2 otherwise.
3745 A cost model should help decide between these two schemes. */
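/* An illustrative example (values are hypothetical): for

     int s = 5;
     for (i = 0; i < n; i++)
       s += a[i];

   and a 4-element vector, Option1 returns [0,0,0,0] and sets
   *ADJUSTMENT_DEF to 5, so the epilog adds 5 to the reduced result,
   whereas Option2 would return [5,0,0,0] with no adjustment needed.  */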
3747 tree
3748 get_initial_def_for_reduction (gimple stmt, tree init_val,
3749 tree *adjustment_def)
3751 stmt_vec_info stmt_vinfo = vinfo_for_stmt (stmt);
3752 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_vinfo);
3753 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
3754 tree scalar_type = TREE_TYPE (init_val);
3755 tree vectype = get_vectype_for_scalar_type (scalar_type);
3756 int nunits;
3757 enum tree_code code = gimple_assign_rhs_code (stmt);
3758 tree def_for_init;
3759 tree init_def;
3760 tree *elts;
3761 int i;
3762 bool nested_in_vect_loop = false;
3763 tree init_value;
3764 REAL_VALUE_TYPE real_init_val = dconst0;
3765 int int_init_val = 0;
3766 gimple def_stmt = NULL;
3768 gcc_assert (vectype);
3769 nunits = TYPE_VECTOR_SUBPARTS (vectype);
3771 gcc_assert (POINTER_TYPE_P (scalar_type) || INTEGRAL_TYPE_P (scalar_type)
3772 || SCALAR_FLOAT_TYPE_P (scalar_type));
3774 if (nested_in_vect_loop_p (loop, stmt))
3775 nested_in_vect_loop = true;
3776 else
3777 gcc_assert (loop == (gimple_bb (stmt))->loop_father);
3779 /* In case of double reduction we only create a vector variable to be put
3780 in the reduction phi node. The actual statement creation is done in
3781 vect_create_epilog_for_reduction. */
3782 if (adjustment_def && nested_in_vect_loop
3783 && TREE_CODE (init_val) == SSA_NAME
3784 && (def_stmt = SSA_NAME_DEF_STMT (init_val))
3785 && gimple_code (def_stmt) == GIMPLE_PHI
3786 && flow_bb_inside_loop_p (loop, gimple_bb (def_stmt))
3787 && vinfo_for_stmt (def_stmt)
3788 && STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def_stmt))
3789 == vect_double_reduction_def)
3791 *adjustment_def = NULL;
3792 return vect_create_destination_var (init_val, vectype);
3795 if (TREE_CONSTANT (init_val))
3797 if (SCALAR_FLOAT_TYPE_P (scalar_type))
3798 init_value = build_real (scalar_type, TREE_REAL_CST (init_val));
3799 else
3800 init_value = build_int_cst (scalar_type, TREE_INT_CST_LOW (init_val));
3802 else
3803 init_value = init_val;
3805 switch (code)
3807 case WIDEN_SUM_EXPR:
3808 case DOT_PROD_EXPR:
3809 case SAD_EXPR:
3810 case PLUS_EXPR:
3811 case MINUS_EXPR:
3812 case BIT_IOR_EXPR:
3813 case BIT_XOR_EXPR:
3814 case MULT_EXPR:
3815 case BIT_AND_EXPR:
3816 /* ADJUSTMENT_DEF is NULL when called from
3817 vect_create_epilog_for_reduction to vectorize double reduction. */
3818 if (adjustment_def)
3820 if (nested_in_vect_loop)
3821 *adjustment_def = vect_get_vec_def_for_operand (init_val, stmt,
3822 NULL);
3823 else
3824 *adjustment_def = init_val;
3827 if (code == MULT_EXPR)
3829 real_init_val = dconst1;
3830 int_init_val = 1;
3833 if (code == BIT_AND_EXPR)
3834 int_init_val = -1;
3836 if (SCALAR_FLOAT_TYPE_P (scalar_type))
3837 def_for_init = build_real (scalar_type, real_init_val);
3838 else
3839 def_for_init = build_int_cst (scalar_type, int_init_val);
3841 /* Create a vector of '0's or '1's in all elements except the first. */
3842 elts = XALLOCAVEC (tree, nunits);
3843 for (i = nunits - 2; i >= 0; --i)
3844 elts[i + 1] = def_for_init;
3846 /* Option1: the first element is '0' or '1' as well. */
3847 if (adjustment_def)
3849 elts[0] = def_for_init;
3850 init_def = build_vector (vectype, elts);
3851 break;
3854 /* Option2: the first element is INIT_VAL. */
3855 elts[0] = init_val;
3856 if (TREE_CONSTANT (init_val))
3857 init_def = build_vector (vectype, elts);
3858 else
3860 vec<constructor_elt, va_gc> *v;
3861 vec_alloc (v, nunits);
3862 CONSTRUCTOR_APPEND_ELT (v, NULL_TREE, init_val);
3863 for (i = 1; i < nunits; ++i)
3864 CONSTRUCTOR_APPEND_ELT (v, NULL_TREE, elts[i]);
3865 init_def = build_constructor (vectype, v);
3868 break;
3870 case MIN_EXPR:
3871 case MAX_EXPR:
3872 case COND_EXPR:
3873 if (adjustment_def)
3875 *adjustment_def = NULL_TREE;
3876 init_def = vect_get_vec_def_for_operand (init_val, stmt, NULL);
3877 break;
3880 init_def = build_vector_from_val (vectype, init_value);
3881 break;
3883 default:
3884 gcc_unreachable ();
3887 return init_def;
3890 /* Function vect_create_epilog_for_reduction
3892 Create code at the loop-epilog to finalize the result of a reduction
3893 computation.
3895 VECT_DEFS is the list of vectors of partial results, i.e., the lhs's of vector
3896 reduction statements.
3897 STMT is the scalar reduction stmt that is being vectorized.
3898 NCOPIES is > 1 in case the vectorization factor (VF) is bigger than the
3899 number of elements that we can fit in a vectype (nunits). In this case
3900 we have to generate more than one vector stmt - i.e - we need to "unroll"
3901 the vector stmt by a factor VF/nunits. For more details see documentation
3902 in vectorizable_operation.
3903 REDUC_CODE is the tree-code for the epilog reduction.
3904 REDUCTION_PHIS is a list of the phi-nodes that carry the reduction
3905 computation.
3906 REDUC_INDEX is the index of the operand in the right hand side of the
3907 statement that is defined by REDUCTION_PHI.
3908 DOUBLE_REDUC is TRUE if double reduction phi nodes should be handled.
3909 SLP_NODE is an SLP node containing a group of reduction statements. The
3910 first one in this group is STMT.
3912 This function:
3913 1. Creates the reduction def-use cycles: sets the arguments for
3914 REDUCTION_PHIS:
3915 The loop-entry argument is the vectorized initial-value of the reduction.
3916 The loop-latch argument is taken from VECT_DEFS - the vector of partial
3917 sums.
3918 2. "Reduces" each vector of partial results VECT_DEFS into a single result,
3919 by applying the operation specified by REDUC_CODE if available, or by
3920 other means (whole-vector shifts or a scalar loop).
3921 The function also creates a new phi node at the loop exit to preserve
3922 loop-closed form, as illustrated below.
3924 The flow at the entry to this function:
3926 loop:
3927 vec_def = phi <null, null> # REDUCTION_PHI
3928 VECT_DEF = vector_stmt # vectorized form of STMT
3929 s_loop = scalar_stmt # (scalar) STMT
3930 loop_exit:
3931 s_out0 = phi <s_loop> # (scalar) EXIT_PHI
3932 use <s_out0>
3933 use <s_out0>
3935 The above is transformed by this function into:
3937 loop:
3938 vec_def = phi <vec_init, VECT_DEF> # REDUCTION_PHI
3939 VECT_DEF = vector_stmt # vectorized form of STMT
3940 s_loop = scalar_stmt # (scalar) STMT
3941 loop_exit:
3942 s_out0 = phi <s_loop> # (scalar) EXIT_PHI
3943 v_out1 = phi <VECT_DEF> # NEW_EXIT_PHI
3944 v_out2 = reduce <v_out1>
3945 s_out3 = extract_field <v_out2, 0>
3946 s_out4 = adjust_result <s_out3>
3947 use <s_out4>
3948 use <s_out4>
3951 static void
3952 vect_create_epilog_for_reduction (vec<tree> vect_defs, gimple stmt,
3953 int ncopies, enum tree_code reduc_code,
3954 vec<gimple> reduction_phis,
3955 int reduc_index, bool double_reduc,
3956 slp_tree slp_node)
3958 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
3959 stmt_vec_info prev_phi_info;
3960 tree vectype;
3961 machine_mode mode;
3962 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
3963 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo), *outer_loop = NULL;
3964 basic_block exit_bb;
3965 tree scalar_dest;
3966 tree scalar_type;
3967 gimple new_phi = NULL, phi;
3968 gimple_stmt_iterator exit_gsi;
3969 tree vec_dest;
3970 tree new_temp = NULL_TREE, new_dest, new_name, new_scalar_dest;
3971 gimple epilog_stmt = NULL;
3972 enum tree_code code = gimple_assign_rhs_code (stmt);
3973 gimple exit_phi;
3974 tree bitsize;
3975 tree adjustment_def = NULL;
3976 tree vec_initial_def = NULL;
3977 tree reduction_op, expr, def;
3978 tree orig_name, scalar_result;
3979 imm_use_iterator imm_iter, phi_imm_iter;
3980 use_operand_p use_p, phi_use_p;
3981 gimple use_stmt, orig_stmt, reduction_phi = NULL;
3982 bool nested_in_vect_loop = false;
3983 auto_vec<gimple> new_phis;
3984 auto_vec<gimple> inner_phis;
3985 enum vect_def_type dt = vect_unknown_def_type;
3986 int j, i;
3987 auto_vec<tree> scalar_results;
3988 unsigned int group_size = 1, k, ratio;
3989 auto_vec<tree> vec_initial_defs;
3990 auto_vec<gimple> phis;
3991 bool slp_reduc = false;
3992 tree new_phi_result;
3993 gimple inner_phi = NULL;
3995 if (slp_node)
3996 group_size = SLP_TREE_SCALAR_STMTS (slp_node).length ();
3998 if (nested_in_vect_loop_p (loop, stmt))
4000 outer_loop = loop;
4001 loop = loop->inner;
4002 nested_in_vect_loop = true;
4003 gcc_assert (!slp_node);
4006 reduction_op = get_reduction_op (stmt, reduc_index);
4008 vectype = get_vectype_for_scalar_type (TREE_TYPE (reduction_op));
4009 gcc_assert (vectype);
4010 mode = TYPE_MODE (vectype);
4012 /* 1. Create the reduction def-use cycle:
4013 Set the arguments of REDUCTION_PHIS, i.e., transform
4015 loop:
4016 vec_def = phi <null, null> # REDUCTION_PHI
4017 VECT_DEF = vector_stmt # vectorized form of STMT
4020 into:
4022 loop:
4023 vec_def = phi <vec_init, VECT_DEF> # REDUCTION_PHI
4024 VECT_DEF = vector_stmt # vectorized form of STMT
4027 (in case of SLP, do it for all the phis). */
4029 /* Get the loop-entry arguments. */
4030 if (slp_node)
4031 vect_get_vec_defs (reduction_op, NULL_TREE, stmt, &vec_initial_defs,
4032 NULL, slp_node, reduc_index);
4033 else
4035 vec_initial_defs.create (1);
4036 /* For the case of reduction, vect_get_vec_def_for_operand returns
4037 the scalar def before the loop, that defines the initial value
4038 of the reduction variable. */
4039 vec_initial_def = vect_get_vec_def_for_operand (reduction_op, stmt,
4040 &adjustment_def);
4041 vec_initial_defs.quick_push (vec_initial_def);
4044 /* Set phi nodes arguments. */
4045 FOR_EACH_VEC_ELT (reduction_phis, i, phi)
4047 tree vec_init_def, def;
4048 gimple_seq stmts;
4049 vec_init_def = force_gimple_operand (vec_initial_defs[i], &stmts,
4050 true, NULL_TREE);
4051 gsi_insert_seq_on_edge_immediate (loop_preheader_edge (loop), stmts);
4052 def = vect_defs[i];
4053 for (j = 0; j < ncopies; j++)
4055 /* Set the loop-entry arg of the reduction-phi. */
4056 add_phi_arg (as_a <gphi *> (phi), vec_init_def,
4057 loop_preheader_edge (loop), UNKNOWN_LOCATION);
4059 /* Set the loop-latch arg for the reduction-phi. */
4060 if (j > 0)
4061 def = vect_get_vec_def_for_stmt_copy (vect_unknown_def_type, def);
4063 add_phi_arg (as_a <gphi *> (phi), def, loop_latch_edge (loop),
4064 UNKNOWN_LOCATION);
4066 if (dump_enabled_p ())
4068 dump_printf_loc (MSG_NOTE, vect_location,
4069 "transform reduction: created def-use cycle: ");
4070 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, phi, 0);
4071 dump_printf (MSG_NOTE, "\n");
4072 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, SSA_NAME_DEF_STMT (def), 0);
4073 dump_printf (MSG_NOTE, "\n");
4076 phi = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (phi));
4080 /* 2. Create epilog code.
4081 The reduction epilog code operates across the elements of the vector
4082 of partial results computed by the vectorized loop.
4083 The reduction epilog code consists of:
4085 step 1: compute the scalar result in a vector (v_out2)
4086 step 2: extract the scalar result (s_out3) from the vector (v_out2)
4087 step 3: adjust the scalar result (s_out3) if needed.
4089 Step 1 can be accomplished using one of the following three schemes:
4090 (scheme 1) using reduc_code, if available.
4091 (scheme 2) using whole-vector shifts, if available.
4092 (scheme 3) using a scalar loop. In this case steps 1+2 above are
4093 combined.
4095 The overall epilog code looks like this:
4097 s_out0 = phi <s_loop> # original EXIT_PHI
4098 v_out1 = phi <VECT_DEF> # NEW_EXIT_PHI
4099 v_out2 = reduce <v_out1> # step 1
4100 s_out3 = extract_field <v_out2, 0> # step 2
4101 s_out4 = adjust_result <s_out3> # step 3
4103 (step 3 is optional, and steps 1 and 2 may be combined).
4104 Lastly, the uses of s_out0 are replaced by s_out4. */
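/* A concrete sketch of the three steps (hypothetical values), for a PLUS
   reduction over a 4-element vector with initial value 5:

     v_out1 = {1,2,3,4}
     v_out2 = reduce <v_out1>              # step 1
     s_out3 = extract_field <v_out2, 0>    # step 2, yields 1+2+3+4 = 10
     s_out4 = s_out3 + 5                   # step 3, final result 15  */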
4107 /* 2.1 Create new loop-exit-phis to preserve loop-closed form:
4108 v_out1 = phi <VECT_DEF>
4109 Store them in NEW_PHIS. */
4111 exit_bb = single_exit (loop)->dest;
4112 prev_phi_info = NULL;
4113 new_phis.create (vect_defs.length ());
4114 FOR_EACH_VEC_ELT (vect_defs, i, def)
4116 for (j = 0; j < ncopies; j++)
4118 tree new_def = copy_ssa_name (def);
4119 phi = create_phi_node (new_def, exit_bb);
4120 set_vinfo_for_stmt (phi, new_stmt_vec_info (phi, loop_vinfo, NULL));
4121 if (j == 0)
4122 new_phis.quick_push (phi);
4123 else
4125 def = vect_get_vec_def_for_stmt_copy (dt, def);
4126 STMT_VINFO_RELATED_STMT (prev_phi_info) = phi;
4129 SET_PHI_ARG_DEF (phi, single_exit (loop)->dest_idx, def);
4130 prev_phi_info = vinfo_for_stmt (phi);
4134 /* The epilogue is created for the outer-loop, i.e., for the loop being
4135 vectorized. Create exit phis for the outer loop. */
4136 if (double_reduc)
4138 loop = outer_loop;
4139 exit_bb = single_exit (loop)->dest;
4140 inner_phis.create (vect_defs.length ());
4141 FOR_EACH_VEC_ELT (new_phis, i, phi)
4143 tree new_result = copy_ssa_name (PHI_RESULT (phi));
4144 gphi *outer_phi = create_phi_node (new_result, exit_bb);
4145 SET_PHI_ARG_DEF (outer_phi, single_exit (loop)->dest_idx,
4146 PHI_RESULT (phi));
4147 set_vinfo_for_stmt (outer_phi, new_stmt_vec_info (outer_phi,
4148 loop_vinfo, NULL));
4149 inner_phis.quick_push (phi);
4150 new_phis[i] = outer_phi;
4151 prev_phi_info = vinfo_for_stmt (outer_phi);
4152 while (STMT_VINFO_RELATED_STMT (vinfo_for_stmt (phi)))
4154 phi = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (phi));
4155 new_result = copy_ssa_name (PHI_RESULT (phi));
4156 outer_phi = create_phi_node (new_result, exit_bb);
4157 SET_PHI_ARG_DEF (outer_phi, single_exit (loop)->dest_idx,
4158 PHI_RESULT (phi));
4159 set_vinfo_for_stmt (outer_phi, new_stmt_vec_info (outer_phi,
4160 loop_vinfo, NULL));
4161 STMT_VINFO_RELATED_STMT (prev_phi_info) = outer_phi;
4162 prev_phi_info = vinfo_for_stmt (outer_phi);
4167 exit_gsi = gsi_after_labels (exit_bb);
4169 /* 2.2 Get the relevant tree-code to use in the epilog for schemes 2,3
4170 (i.e. when reduc_code is not available) and in the final adjustment
4171 code (if needed). Also get the original scalar reduction variable as
4172 defined in the loop. In case STMT is a "pattern-stmt" (i.e. - it
4173 represents a reduction pattern), the tree-code and scalar-def are
4174 taken from the original stmt that the pattern-stmt (STMT) replaces.
4175 Otherwise (it is a regular reduction) - the tree-code and scalar-def
4176 are taken from STMT. */
4178 orig_stmt = STMT_VINFO_RELATED_STMT (stmt_info);
4179 if (!orig_stmt)
4181 /* Regular reduction */
4182 orig_stmt = stmt;
4184 else
4186 /* Reduction pattern */
4187 stmt_vec_info stmt_vinfo = vinfo_for_stmt (orig_stmt);
4188 gcc_assert (STMT_VINFO_IN_PATTERN_P (stmt_vinfo));
4189 gcc_assert (STMT_VINFO_RELATED_STMT (stmt_vinfo) == stmt);
4192 code = gimple_assign_rhs_code (orig_stmt);
4193 /* For MINUS_EXPR the initial vector is [init_val,0,...,0], therefore,
4194 partial results are added and not subtracted. */
4195 if (code == MINUS_EXPR)
4196 code = PLUS_EXPR;
4198 scalar_dest = gimple_assign_lhs (orig_stmt);
4199 scalar_type = TREE_TYPE (scalar_dest);
4200 scalar_results.create (group_size);
4201 new_scalar_dest = vect_create_destination_var (scalar_dest, NULL);
4202 bitsize = TYPE_SIZE (scalar_type);
4204 /* In case this is a reduction in an inner-loop while vectorizing an outer
4205 loop - we don't need to extract a single scalar result at the end of the
4206 inner-loop (unless it is double reduction, i.e., the use of reduction is
4207 outside the outer-loop). The final vector of partial results will be used
4208 in the vectorized outer-loop, or reduced to a scalar result at the end of
4209 the outer-loop. */
4210 if (nested_in_vect_loop && !double_reduc)
4211 goto vect_finalize_reduction;
4213 /* SLP reduction without reduction chain, e.g.,
4214 # a1 = phi <a2, a0>
4215 # b1 = phi <b2, b0>
4216 a2 = operation (a1)
4217 b2 = operation (b1) */
4218 slp_reduc = (slp_node && !GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)));
4220 /* In case of reduction chain, e.g.,
4221 # a1 = phi <a3, a0>
4222 a2 = operation (a1)
4223 a3 = operation (a2),
4225 we may end up with more than one vector result. Here we reduce them to
4226 one vector. */
4227 if (GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)))
4229 tree first_vect = PHI_RESULT (new_phis[0]);
4230 tree tmp;
4231 gassign *new_vec_stmt = NULL;
4233 vec_dest = vect_create_destination_var (scalar_dest, vectype);
4234 for (k = 1; k < new_phis.length (); k++)
4236 gimple next_phi = new_phis[k];
4237 tree second_vect = PHI_RESULT (next_phi);
4239 tmp = build2 (code, vectype, first_vect, second_vect);
4240 new_vec_stmt = gimple_build_assign (vec_dest, tmp);
4241 first_vect = make_ssa_name (vec_dest, new_vec_stmt);
4242 gimple_assign_set_lhs (new_vec_stmt, first_vect);
4243 gsi_insert_before (&exit_gsi, new_vec_stmt, GSI_SAME_STMT);
4246 new_phi_result = first_vect;
4247 if (new_vec_stmt)
4249 new_phis.truncate (0);
4250 new_phis.safe_push (new_vec_stmt);
4253 else
4254 new_phi_result = PHI_RESULT (new_phis[0]);
4256 /* 2.3 Create the reduction code, using one of the three schemes described
4257 above. In SLP we simply need to extract all the elements from the
4258 vector (without reducing them), so we use scalar shifts. */
4259 if (reduc_code != ERROR_MARK && !slp_reduc)
4261 tree tmp;
4262 tree vec_elem_type;
4264 /*** Case 1: Create:
4265 v_out2 = reduc_expr <v_out1> */
4267 if (dump_enabled_p ())
4268 dump_printf_loc (MSG_NOTE, vect_location,
4269 "Reduce using direct vector reduction.\n");
4271 vec_elem_type = TREE_TYPE (TREE_TYPE (new_phi_result));
4272 if (!useless_type_conversion_p (scalar_type, vec_elem_type))
4274 tree tmp_dest =
4275 vect_create_destination_var (scalar_dest, vec_elem_type);
4276 tmp = build1 (reduc_code, vec_elem_type, new_phi_result);
4277 epilog_stmt = gimple_build_assign (tmp_dest, tmp);
4278 new_temp = make_ssa_name (tmp_dest, epilog_stmt);
4279 gimple_assign_set_lhs (epilog_stmt, new_temp);
4280 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
4282 tmp = build1 (NOP_EXPR, scalar_type, new_temp);
4284 else
4285 tmp = build1 (reduc_code, scalar_type, new_phi_result);
4286 epilog_stmt = gimple_build_assign (new_scalar_dest, tmp);
4287 new_temp = make_ssa_name (new_scalar_dest, epilog_stmt);
4288 gimple_assign_set_lhs (epilog_stmt, new_temp);
4289 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
4290 scalar_results.safe_push (new_temp);
4292 else
4294 bool reduce_with_shift = have_whole_vector_shift (mode);
4295 int element_bitsize = tree_to_uhwi (bitsize);
4296 int vec_size_in_bits = tree_to_uhwi (TYPE_SIZE (vectype));
4297 tree vec_temp;
4299 /* Regardless of whether we have a whole vector shift, if we're
4300 emulating the operation via tree-vect-generic, we don't want
4301 to use it. Only the first round of the reduction is likely
4302 to still be profitable via emulation. */
4303 /* ??? It might be better to emit a reduction tree code here, so that
4304 tree-vect-generic can expand the first round via bit tricks. */
4305 if (!VECTOR_MODE_P (mode))
4306 reduce_with_shift = false;
4307 else
4309 optab optab = optab_for_tree_code (code, vectype, optab_default);
4310 if (optab_handler (optab, mode) == CODE_FOR_nothing)
4311 reduce_with_shift = false;
4314 if (reduce_with_shift && !slp_reduc)
4316 int nelements = vec_size_in_bits / element_bitsize;
4317 unsigned char *sel = XALLOCAVEC (unsigned char, nelements);
4319 int elt_offset;
4321 tree zero_vec = build_zero_cst (vectype);
4322 /*** Case 2: Create:
4323 for (offset = nelements/2; offset >= 1; offset/=2)
4325 Create: va' = vec_shift <va, offset>
4326 Create: va = vop <va, va'>
4327 } */
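/* Conceptually (illustration only, the exact element order of the shift is
   target dependent), reducing va = {1,2,3,4} with PLUS_EXPR:

     offset == 2:  va' = {3,4,0,0}   va = {4,6,3,4}
     offset == 1:  va' = {6,3,4,0}   va = {10,9,7,4}

   after which element 0 holds the full sum and is extracted below
   with a BIT_FIELD_REF.  */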
4329 tree rhs;
4331 if (dump_enabled_p ())
4332 dump_printf_loc (MSG_NOTE, vect_location,
4333 "Reduce using vector shifts\n");
4335 vec_dest = vect_create_destination_var (scalar_dest, vectype);
4336 new_temp = new_phi_result;
4337 for (elt_offset = nelements / 2;
4338 elt_offset >= 1;
4339 elt_offset /= 2)
4341 calc_vec_perm_mask_for_shift (mode, elt_offset, sel);
4342 tree mask = vect_gen_perm_mask_any (vectype, sel);
4343 epilog_stmt = gimple_build_assign (vec_dest, VEC_PERM_EXPR,
4344 new_temp, zero_vec, mask);
4345 new_name = make_ssa_name (vec_dest, epilog_stmt);
4346 gimple_assign_set_lhs (epilog_stmt, new_name);
4347 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
4349 epilog_stmt = gimple_build_assign (vec_dest, code, new_name,
4350 new_temp);
4351 new_temp = make_ssa_name (vec_dest, epilog_stmt);
4352 gimple_assign_set_lhs (epilog_stmt, new_temp);
4353 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
4356 /* 2.4 Extract the final scalar result. Create:
4357 s_out3 = extract_field <v_out2, bitpos> */
4359 if (dump_enabled_p ())
4360 dump_printf_loc (MSG_NOTE, vect_location,
4361 "extract scalar result\n");
4363 rhs = build3 (BIT_FIELD_REF, scalar_type, new_temp,
4364 bitsize, bitsize_zero_node);
4365 epilog_stmt = gimple_build_assign (new_scalar_dest, rhs);
4366 new_temp = make_ssa_name (new_scalar_dest, epilog_stmt);
4367 gimple_assign_set_lhs (epilog_stmt, new_temp);
4368 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
4369 scalar_results.safe_push (new_temp);
4371 else
4373 /*** Case 3: Create:
4374 s = extract_field <v_out2, 0>
4375 for (offset = element_size;
4376 offset < vector_size;
4377 offset += element_size)
4379 Create: s' = extract_field <v_out2, offset>
4380 Create: s = op <s, s'> // For non SLP cases
4381 } */
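/* For illustration (hypothetical values): with v_out2 = {1,2,3,4},
   a 32-bit element type and PLUS_EXPR, the loop below generates

     s  = extract_field <v_out2, 0>               # s = 1
     s' = extract_field <v_out2, 32>; s = s + s'  # s = 3
     s' = extract_field <v_out2, 64>; s = s + s'  # s = 6
     s' = extract_field <v_out2, 96>; s = s + s'  # s = 10

   In the SLP case the extracted values are instead collected
   individually in SCALAR_RESULTS.  */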
4383 if (dump_enabled_p ())
4384 dump_printf_loc (MSG_NOTE, vect_location,
4385 "Reduce using scalar code.\n");
4387 vec_size_in_bits = tree_to_uhwi (TYPE_SIZE (vectype));
4388 FOR_EACH_VEC_ELT (new_phis, i, new_phi)
4390 int bit_offset;
4391 if (gimple_code (new_phi) == GIMPLE_PHI)
4392 vec_temp = PHI_RESULT (new_phi);
4393 else
4394 vec_temp = gimple_assign_lhs (new_phi);
4395 tree rhs = build3 (BIT_FIELD_REF, scalar_type, vec_temp, bitsize,
4396 bitsize_zero_node);
4397 epilog_stmt = gimple_build_assign (new_scalar_dest, rhs);
4398 new_temp = make_ssa_name (new_scalar_dest, epilog_stmt);
4399 gimple_assign_set_lhs (epilog_stmt, new_temp);
4400 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
4402 /* In SLP we don't need to apply the reduction operation, so we just
4403 collect s' values in SCALAR_RESULTS. */
4404 if (slp_reduc)
4405 scalar_results.safe_push (new_temp);
4407 for (bit_offset = element_bitsize;
4408 bit_offset < vec_size_in_bits;
4409 bit_offset += element_bitsize)
4411 tree bitpos = bitsize_int (bit_offset);
4412 tree rhs = build3 (BIT_FIELD_REF, scalar_type, vec_temp,
4413 bitsize, bitpos);
4415 epilog_stmt = gimple_build_assign (new_scalar_dest, rhs);
4416 new_name = make_ssa_name (new_scalar_dest, epilog_stmt);
4417 gimple_assign_set_lhs (epilog_stmt, new_name);
4418 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
4420 if (slp_reduc)
4422 /* In SLP we don't need to apply the reduction operation, so
4423 we just collect s' values in SCALAR_RESULTS. */
4424 new_temp = new_name;
4425 scalar_results.safe_push (new_name);
4427 else
4429 epilog_stmt = gimple_build_assign (new_scalar_dest, code,
4430 new_name, new_temp);
4431 new_temp = make_ssa_name (new_scalar_dest, epilog_stmt);
4432 gimple_assign_set_lhs (epilog_stmt, new_temp);
4433 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
4438 /* The only case where we need to reduce scalar results in SLP is
4439 unrolling. If the size of SCALAR_RESULTS is greater than
4440 GROUP_SIZE, we reduce them by combining elements modulo
4441 GROUP_SIZE. */
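/* E.g. (illustration): with GROUP_SIZE == 2 and SCALAR_RESULTS ==
   {s0, s1, s2, s3} from an unrolled SLP reduction, the loop below
   computes s0 = s0 op s2 and s1 = s1 op s3, leaving the two final
   results in SCALAR_RESULTS[0] and SCALAR_RESULTS[1].  */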
4442 if (slp_reduc)
4444 tree res, first_res, new_res;
4445 gimple new_stmt;
4447 /* Reduce multiple scalar results in case of SLP unrolling. */
4448 for (j = group_size; scalar_results.iterate (j, &res);
4449 j++)
4451 first_res = scalar_results[j % group_size];
4452 new_stmt = gimple_build_assign (new_scalar_dest, code,
4453 first_res, res);
4454 new_res = make_ssa_name (new_scalar_dest, new_stmt);
4455 gimple_assign_set_lhs (new_stmt, new_res);
4456 gsi_insert_before (&exit_gsi, new_stmt, GSI_SAME_STMT);
4457 scalar_results[j % group_size] = new_res;
4460 else
4461 /* Not SLP - we have one scalar to keep in SCALAR_RESULTS. */
4462 scalar_results.safe_push (new_temp);
4466 vect_finalize_reduction:
4468 if (double_reduc)
4469 loop = loop->inner;
4471 /* 2.5 Adjust the final result by the initial value of the reduction
4472 variable. (When such adjustment is not needed, then
4473 'adjustment_def' is zero). For example, if code is PLUS we create:
4474 new_temp = loop_exit_def + adjustment_def */
4476 if (adjustment_def)
4478 gcc_assert (!slp_reduc);
4479 if (nested_in_vect_loop)
4481 new_phi = new_phis[0];
4482 gcc_assert (TREE_CODE (TREE_TYPE (adjustment_def)) == VECTOR_TYPE);
4483 expr = build2 (code, vectype, PHI_RESULT (new_phi), adjustment_def);
4484 new_dest = vect_create_destination_var (scalar_dest, vectype);
4486 else
4488 new_temp = scalar_results[0];
4489 gcc_assert (TREE_CODE (TREE_TYPE (adjustment_def)) != VECTOR_TYPE);
4490 expr = build2 (code, scalar_type, new_temp, adjustment_def);
4491 new_dest = vect_create_destination_var (scalar_dest, scalar_type);
4494 epilog_stmt = gimple_build_assign (new_dest, expr);
4495 new_temp = make_ssa_name (new_dest, epilog_stmt);
4496 gimple_assign_set_lhs (epilog_stmt, new_temp);
4497 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
4498 if (nested_in_vect_loop)
4500 set_vinfo_for_stmt (epilog_stmt,
4501 new_stmt_vec_info (epilog_stmt, loop_vinfo,
4502 NULL));
4503 STMT_VINFO_RELATED_STMT (vinfo_for_stmt (epilog_stmt)) =
4504 STMT_VINFO_RELATED_STMT (vinfo_for_stmt (new_phi));
4506 if (!double_reduc)
4507 scalar_results.quick_push (new_temp);
4508 else
4509 scalar_results[0] = new_temp;
4511 else
4512 scalar_results[0] = new_temp;
4514 new_phis[0] = epilog_stmt;
4517 /* 2.6 Handle the loop-exit phis. Replace the uses of scalar loop-exit
4518 phis with new adjusted scalar results, i.e., replace use <s_out0>
4519 with use <s_out4>.
4521 Transform:
4522 loop_exit:
4523 s_out0 = phi <s_loop> # (scalar) EXIT_PHI
4524 v_out1 = phi <VECT_DEF> # NEW_EXIT_PHI
4525 v_out2 = reduce <v_out1>
4526 s_out3 = extract_field <v_out2, 0>
4527 s_out4 = adjust_result <s_out3>
4528 use <s_out0>
4529 use <s_out0>
4531 into:
4533 loop_exit:
4534 s_out0 = phi <s_loop> # (scalar) EXIT_PHI
4535 v_out1 = phi <VECT_DEF> # NEW_EXIT_PHI
4536 v_out2 = reduce <v_out1>
4537 s_out3 = extract_field <v_out2, 0>
4538 s_out4 = adjust_result <s_out3>
4539 use <s_out4>
4540 use <s_out4> */
4543 /* In an SLP reduction chain we reduce the vector results into one vector
4544 if necessary, hence we set GROUP_SIZE to 1 here. SCALAR_DEST is the LHS of
4545 the last stmt in the reduction chain, since we are looking for the loop
4546 exit phi node. */
4547 if (GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)))
4549 scalar_dest = gimple_assign_lhs (
4550 SLP_TREE_SCALAR_STMTS (slp_node)[group_size - 1]);
4551 group_size = 1;
4554 /* In SLP we may have several statements in NEW_PHIS and REDUCTION_PHIS (in
4555 case GROUP_SIZE is greater than the vectorization factor). Therefore, we
4556 need to match SCALAR_RESULTS with corresponding statements. The first
4557 (GROUP_SIZE / number of new vector stmts) scalar results correspond to
4558 the first vector stmt, etc.
4559 (RATIO is equal to (GROUP_SIZE / number of new vector stmts)). */
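/* For example (hypothetical sizes): with GROUP_SIZE == 4 and two new
   vector stmts, RATIO == 2, so scalar results 0 and 1 are matched with
   the first vector stmt and scalar results 2 and 3 with the second.  */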
4560 if (group_size > new_phis.length ())
4562 ratio = group_size / new_phis.length ();
4563 gcc_assert (!(group_size % new_phis.length ()));
4565 else
4566 ratio = 1;
4568 for (k = 0; k < group_size; k++)
4570 if (k % ratio == 0)
4572 epilog_stmt = new_phis[k / ratio];
4573 reduction_phi = reduction_phis[k / ratio];
4574 if (double_reduc)
4575 inner_phi = inner_phis[k / ratio];
4578 if (slp_reduc)
4580 gimple current_stmt = SLP_TREE_SCALAR_STMTS (slp_node)[k];
4582 orig_stmt = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (current_stmt));
4583 /* SLP statements can't participate in patterns. */
4584 gcc_assert (!orig_stmt);
4585 scalar_dest = gimple_assign_lhs (current_stmt);
4588 phis.create (3);
4589 /* Find the loop-closed-use at the loop exit of the original scalar
4590 result. (The reduction result is expected to have two immediate uses -
4591 one at the latch block, and one at the loop exit). */
4592 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, scalar_dest)
4593 if (!flow_bb_inside_loop_p (loop, gimple_bb (USE_STMT (use_p)))
4594 && !is_gimple_debug (USE_STMT (use_p)))
4595 phis.safe_push (USE_STMT (use_p));
4597 /* While we expect to have found an exit_phi because of loop-closed-ssa
4598 form, we can end up without one if the scalar cycle is dead.
4600 FOR_EACH_VEC_ELT (phis, i, exit_phi)
4602 if (outer_loop)
4604 stmt_vec_info exit_phi_vinfo = vinfo_for_stmt (exit_phi);
4605 gphi *vect_phi;
4607 /* FORNOW. Currently not supporting the case that an inner-loop
4608 reduction is not used in the outer-loop (but only outside the
4609 outer-loop), unless it is double reduction. */
4610 gcc_assert ((STMT_VINFO_RELEVANT_P (exit_phi_vinfo)
4611 && !STMT_VINFO_LIVE_P (exit_phi_vinfo))
4612 || double_reduc);
4614 if (double_reduc)
4615 STMT_VINFO_VEC_STMT (exit_phi_vinfo) = inner_phi;
4616 else
4617 STMT_VINFO_VEC_STMT (exit_phi_vinfo) = epilog_stmt;
4618 if (!double_reduc
4619 || STMT_VINFO_DEF_TYPE (exit_phi_vinfo)
4620 != vect_double_reduction_def)
4621 continue;
4623 /* Handle double reduction:
4625 stmt1: s1 = phi <s0, s2> - double reduction phi (outer loop)
4626 stmt2: s3 = phi <s1, s4> - (regular) reduc phi (inner loop)
4627 stmt3: s4 = use (s3) - (regular) reduc stmt (inner loop)
4628 stmt4: s2 = phi <s4> - double reduction stmt (outer loop)
4630 At that point the regular reduction (stmt2 and stmt3) is
4631 already vectorized, as well as the exit phi node, stmt4.
4632 Here we vectorize the phi node of double reduction, stmt1, and
4633 update all relevant statements. */
4635 /* Go through all the uses of s2 to find double reduction phi
4636 node, i.e., stmt1 above. */
4637 orig_name = PHI_RESULT (exit_phi);
4638 FOR_EACH_IMM_USE_STMT (use_stmt, imm_iter, orig_name)
4640 stmt_vec_info use_stmt_vinfo;
4641 stmt_vec_info new_phi_vinfo;
4642 tree vect_phi_init, preheader_arg, vect_phi_res, init_def;
4643 basic_block bb = gimple_bb (use_stmt);
4644 gimple use;
4646 /* Check that USE_STMT is really double reduction phi
4647 node. */
4648 if (gimple_code (use_stmt) != GIMPLE_PHI
4649 || gimple_phi_num_args (use_stmt) != 2
4650 || bb->loop_father != outer_loop)
4651 continue;
4652 use_stmt_vinfo = vinfo_for_stmt (use_stmt);
4653 if (!use_stmt_vinfo
4654 || STMT_VINFO_DEF_TYPE (use_stmt_vinfo)
4655 != vect_double_reduction_def)
4656 continue;
4658 /* Create vector phi node for double reduction:
4659 vs1 = phi <vs0, vs2>
4660 vs1 was created previously in this function by a call to
4661 vect_get_vec_def_for_operand and is stored in
4662 vec_initial_def;
4663 vs2 is defined by INNER_PHI, the vectorized EXIT_PHI;
4664 vs0 is created here. */
4666 /* Create vector phi node. */
4667 vect_phi = create_phi_node (vec_initial_def, bb);
4668 new_phi_vinfo = new_stmt_vec_info (vect_phi,
4669 loop_vec_info_for_loop (outer_loop), NULL);
4670 set_vinfo_for_stmt (vect_phi, new_phi_vinfo);
4672 /* Create vs0 - initial def of the double reduction phi. */
4673 preheader_arg = PHI_ARG_DEF_FROM_EDGE (use_stmt,
4674 loop_preheader_edge (outer_loop));
4675 init_def = get_initial_def_for_reduction (stmt,
4676 preheader_arg, NULL);
4677 vect_phi_init = vect_init_vector (use_stmt, init_def,
4678 vectype, NULL);
4680 /* Update phi node arguments with vs0 and vs2. */
4681 add_phi_arg (vect_phi, vect_phi_init,
4682 loop_preheader_edge (outer_loop),
4683 UNKNOWN_LOCATION);
4684 add_phi_arg (vect_phi, PHI_RESULT (inner_phi),
4685 loop_latch_edge (outer_loop), UNKNOWN_LOCATION);
4686 if (dump_enabled_p ())
4688 dump_printf_loc (MSG_NOTE, vect_location,
4689 "created double reduction phi node: ");
4690 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, vect_phi, 0);
4691 dump_printf (MSG_NOTE, "\n");
4694 vect_phi_res = PHI_RESULT (vect_phi);
4696 /* Replace the use, i.e., set the correct vs1 in the regular
4697 reduction phi node. FORNOW, NCOPIES is always 1, so the
4698 loop is redundant. */
4699 use = reduction_phi;
4700 for (j = 0; j < ncopies; j++)
4702 edge pr_edge = loop_preheader_edge (loop);
4703 SET_PHI_ARG_DEF (use, pr_edge->dest_idx, vect_phi_res);
4704 use = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (use));
4710 phis.release ();
4711 if (nested_in_vect_loop)
4713 if (double_reduc)
4714 loop = outer_loop;
4715 else
4716 continue;
4719 phis.create (3);
4720 /* Find the loop-closed-use at the loop exit of the original scalar
4721 result. (The reduction result is expected to have two immediate uses,
4722 one at the latch block, and one at the loop exit). For double
4723 reductions we are looking for exit phis of the outer loop. */
4724 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, scalar_dest)
4726 if (!flow_bb_inside_loop_p (loop, gimple_bb (USE_STMT (use_p))))
4728 if (!is_gimple_debug (USE_STMT (use_p)))
4729 phis.safe_push (USE_STMT (use_p));
4731 else
4733 if (double_reduc && gimple_code (USE_STMT (use_p)) == GIMPLE_PHI)
4735 tree phi_res = PHI_RESULT (USE_STMT (use_p));
4737 FOR_EACH_IMM_USE_FAST (phi_use_p, phi_imm_iter, phi_res)
4739 if (!flow_bb_inside_loop_p (loop,
4740 gimple_bb (USE_STMT (phi_use_p)))
4741 && !is_gimple_debug (USE_STMT (phi_use_p)))
4742 phis.safe_push (USE_STMT (phi_use_p));
4748 FOR_EACH_VEC_ELT (phis, i, exit_phi)
4750 /* Replace the uses: */
4751 orig_name = PHI_RESULT (exit_phi);
4752 scalar_result = scalar_results[k];
4753 FOR_EACH_IMM_USE_STMT (use_stmt, imm_iter, orig_name)
4754 FOR_EACH_IMM_USE_ON_STMT (use_p, imm_iter)
4755 SET_USE (use_p, scalar_result);
4758 phis.release ();
4763 /* Function vectorizable_reduction.
4765 Check if STMT performs a reduction operation that can be vectorized.
4766 If VEC_STMT is also passed, vectorize the STMT: create a vectorized
4767 stmt to replace it, put it in VEC_STMT, and insert it at GSI.
4768 Return FALSE if not a vectorizable STMT, TRUE otherwise.
4770 This function also handles reduction idioms (patterns) that have been
4771 recognized in advance during vect_pattern_recog. In this case, STMT may be
4772 of this form:
4773 X = pattern_expr (arg0, arg1, ..., X)
4774 and its STMT_VINFO_RELATED_STMT points to the last stmt in the original
4775 sequence that had been detected and replaced by the pattern-stmt (STMT).
4777 In some cases of reduction patterns, the type of the reduction variable X is
4778 different than the type of the other arguments of STMT.
4779 In such cases, the vectype that is used when transforming STMT into a vector
4780 stmt is different than the vectype that is used to determine the
4781 vectorization factor, because it consists of a different number of elements
4782 than the actual number of elements that are being operated upon in parallel.
4784 For example, consider an accumulation of shorts into an int accumulator.
4785 On some targets it's possible to vectorize this pattern operating on 8
4786 shorts at a time (hence, the vectype for purposes of determining the
4787 vectorization factor should be V8HI); on the other hand, the vectype that
4788 is used to create the vector form is actually V4SI (the type of the result).
4790 Upon entry to this function, STMT_VINFO_VECTYPE records the vectype that
4791 indicates the actual level of parallelism (V8HI in the example), so
4792 that the right vectorization factor would be derived. This vectype
4793 corresponds to the type of arguments to the reduction stmt, and should *NOT*
4794 be used to create the vectorized stmt. The right vectype for the vectorized
4795 stmt is obtained from the type of the result X:
4796 get_vectype_for_scalar_type (TREE_TYPE (X))
4798 This means that, contrary to "regular" reductions (or "regular" stmts in
4799 general), the following equation:
4800 STMT_VINFO_VECTYPE == get_vectype_for_scalar_type (TREE_TYPE (X))
4801 does *NOT* necessarily hold for reduction patterns. */
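/* For illustration, the scalar source of the accumulation example above
   might look like

     short a[N]; int sum = 0;
     for (i = 0; i < N; i++)
       sum += a[i];

   Here X is 'sum', STMT could be 'sum_1 = WIDEN_SUM_EXPR <a_i, sum_0>',
   STMT_VINFO_VECTYPE is V8HI (it drives the vectorization factor), while
   the vectorized stmt itself is created with V4SI, the vectype of
   TREE_TYPE (sum).  */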
4803 bool
4804 vectorizable_reduction (gimple stmt, gimple_stmt_iterator *gsi,
4805 gimple *vec_stmt, slp_tree slp_node)
4807 tree vec_dest;
4808 tree scalar_dest;
4809 tree loop_vec_def0 = NULL_TREE, loop_vec_def1 = NULL_TREE;
4810 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
4811 tree vectype_out = STMT_VINFO_VECTYPE (stmt_info);
4812 tree vectype_in = NULL_TREE;
4813 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
4814 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
4815 enum tree_code code, orig_code, epilog_reduc_code;
4816 machine_mode vec_mode;
4817 int op_type;
4818 optab optab, reduc_optab;
4819 tree new_temp = NULL_TREE;
4820 tree def;
4821 gimple def_stmt;
4822 enum vect_def_type dt;
4823 gphi *new_phi = NULL;
4824 tree scalar_type;
4825 bool is_simple_use;
4826 gimple orig_stmt;
4827 stmt_vec_info orig_stmt_info;
4828 tree expr = NULL_TREE;
4829 int i;
4830 int ncopies;
4831 int epilog_copies;
4832 stmt_vec_info prev_stmt_info, prev_phi_info;
4833 bool single_defuse_cycle = false;
4834 tree reduc_def = NULL_TREE;
4835 gimple new_stmt = NULL;
4836 int j;
4837 tree ops[3];
4838 bool nested_cycle = false, found_nested_cycle_def = false;
4839 gimple reduc_def_stmt = NULL;
4840 bool double_reduc = false, dummy;
4841 basic_block def_bb;
4842 struct loop * def_stmt_loop, *outer_loop = NULL;
4843 tree def_arg;
4844 gimple def_arg_stmt;
4845 auto_vec<tree> vec_oprnds0;
4846 auto_vec<tree> vec_oprnds1;
4847 auto_vec<tree> vect_defs;
4848 auto_vec<gimple> phis;
4849 int vec_num;
4850 tree def0, def1, tem, op0, op1 = NULL_TREE;
4852 /* In case of reduction chain we switch to the first stmt in the chain, but
4853 we don't update STMT_INFO, since only the last stmt is marked as reduction
4854 and has reduction properties. */
4855 if (GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)))
4856 stmt = GROUP_FIRST_ELEMENT (stmt_info);
4858 if (nested_in_vect_loop_p (loop, stmt))
4860 outer_loop = loop;
4861 loop = loop->inner;
4862 nested_cycle = true;
4865 /* 1. Is vectorizable reduction? */
4866 /* Not supportable if the reduction variable is used in the loop, unless
4867 it's a reduction chain. */
4868 if (STMT_VINFO_RELEVANT (stmt_info) > vect_used_in_outer
4869 && !GROUP_FIRST_ELEMENT (stmt_info))
4870 return false;
4872 /* Reductions that are not used even in an enclosing outer-loop,
4873 are expected to be "live" (used out of the loop). */
4874 if (STMT_VINFO_RELEVANT (stmt_info) == vect_unused_in_scope
4875 && !STMT_VINFO_LIVE_P (stmt_info))
4876 return false;
4878 /* Make sure it was already recognized as a reduction computation. */
4879 if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_reduction_def
4880 && STMT_VINFO_DEF_TYPE (stmt_info) != vect_nested_cycle)
4881 return false;
4883 /* 2. Has this been recognized as a reduction pattern?
4885 Check if STMT represents a pattern that has been recognized
4886 in earlier analysis stages. For stmts that represent a pattern,
4887 the STMT_VINFO_RELATED_STMT field records the last stmt in
4888 the original sequence that constitutes the pattern. */
4890 orig_stmt = STMT_VINFO_RELATED_STMT (stmt_info);
4891 if (orig_stmt)
4893 orig_stmt_info = vinfo_for_stmt (orig_stmt);
4894 gcc_assert (STMT_VINFO_IN_PATTERN_P (orig_stmt_info));
4895 gcc_assert (!STMT_VINFO_IN_PATTERN_P (stmt_info));
4898 /* 3. Check the operands of the operation. The first operands are defined
4899 inside the loop body. The last operand is the reduction variable,
4900 which is defined by the loop-header-phi. */
4902 gcc_assert (is_gimple_assign (stmt));
4904 /* Flatten RHS. */
4905 switch (get_gimple_rhs_class (gimple_assign_rhs_code (stmt)))
4907 case GIMPLE_SINGLE_RHS:
4908 op_type = TREE_OPERAND_LENGTH (gimple_assign_rhs1 (stmt));
4909 if (op_type == ternary_op)
4911 tree rhs = gimple_assign_rhs1 (stmt);
4912 ops[0] = TREE_OPERAND (rhs, 0);
4913 ops[1] = TREE_OPERAND (rhs, 1);
4914 ops[2] = TREE_OPERAND (rhs, 2);
4915 code = TREE_CODE (rhs);
4917 else
4918 return false;
4919 break;
4921 case GIMPLE_BINARY_RHS:
4922 code = gimple_assign_rhs_code (stmt);
4923 op_type = TREE_CODE_LENGTH (code);
4924 gcc_assert (op_type == binary_op);
4925 ops[0] = gimple_assign_rhs1 (stmt);
4926 ops[1] = gimple_assign_rhs2 (stmt);
4927 break;
4929 case GIMPLE_TERNARY_RHS:
4930 code = gimple_assign_rhs_code (stmt);
4931 op_type = TREE_CODE_LENGTH (code);
4932 gcc_assert (op_type == ternary_op);
4933 ops[0] = gimple_assign_rhs1 (stmt);
4934 ops[1] = gimple_assign_rhs2 (stmt);
4935 ops[2] = gimple_assign_rhs3 (stmt);
4936 break;
4938 case GIMPLE_UNARY_RHS:
4939 return false;
4941 default:
4942 gcc_unreachable ();
4944 /* The default is that the reduction variable is the last operand in the statement. */
4945 int reduc_index = op_type - 1;
4947 if (code == COND_EXPR && slp_node)
4948 return false;
4950 scalar_dest = gimple_assign_lhs (stmt);
4951 scalar_type = TREE_TYPE (scalar_dest);
4952 if (!POINTER_TYPE_P (scalar_type) && !INTEGRAL_TYPE_P (scalar_type)
4953 && !SCALAR_FLOAT_TYPE_P (scalar_type))
4954 return false;
4956 /* Do not try to vectorize bit-precision reductions. */
4957 if ((TYPE_PRECISION (scalar_type)
4958 != GET_MODE_PRECISION (TYPE_MODE (scalar_type))))
4959 return false;
4961 /* All uses but the last are expected to be defined in the loop.
4962 The last use is the reduction variable. In case of nested cycle this
4963 assumption is not true: we use reduc_index to record the index of the
4964 reduction variable. */
4965 for (i = 0; i < op_type - 1; i++)
4967 /* The condition of COND_EXPR is checked in vectorizable_condition(). */
4968 if (i == 0 && code == COND_EXPR)
4969 continue;
4971 is_simple_use = vect_is_simple_use_1 (ops[i], stmt, loop_vinfo, NULL,
4972 &def_stmt, &def, &dt, &tem);
4973 if (!vectype_in)
4974 vectype_in = tem;
4975 gcc_assert (is_simple_use);
4977 if (dt != vect_internal_def
4978 && dt != vect_external_def
4979 && dt != vect_constant_def
4980 && dt != vect_induction_def
4981 && !(dt == vect_nested_cycle && nested_cycle))
4982 return false;
4984 if (dt == vect_nested_cycle)
4986 found_nested_cycle_def = true;
4987 reduc_def_stmt = def_stmt;
4988 reduc_index = i;
4992 is_simple_use = vect_is_simple_use_1 (ops[i], stmt, loop_vinfo, NULL,
4993 &def_stmt, &def, &dt, &tem);
4994 if (!vectype_in)
4995 vectype_in = tem;
4996 gcc_assert (is_simple_use);
4997 if (!found_nested_cycle_def)
4998 reduc_def_stmt = def_stmt;
5000 if (reduc_def_stmt && gimple_code (reduc_def_stmt) != GIMPLE_PHI)
5001 return false;
5003 if (!(dt == vect_reduction_def
5004 || dt == vect_nested_cycle
5005 || ((dt == vect_internal_def || dt == vect_external_def
5006 || dt == vect_constant_def || dt == vect_induction_def)
5007 && nested_cycle && found_nested_cycle_def)))
5009 /* For pattern recognized stmts, orig_stmt might be a reduction,
5010 but some helper statements for the pattern might not, or
5011 might be COND_EXPRs with reduction uses in the condition. */
5012 gcc_assert (orig_stmt);
5013 return false;
5016 if (orig_stmt)
5017 gcc_assert (orig_stmt == vect_is_simple_reduction (loop_vinfo,
5018 reduc_def_stmt,
5019 !nested_cycle,
5020 &dummy));
5021 else
5023 gimple tmp = vect_is_simple_reduction (loop_vinfo, reduc_def_stmt,
5024 !nested_cycle, &dummy);
5025 /* We changed STMT to be the first stmt in the reduction chain, hence we
5026 check that in this case the first element in the chain is STMT. */
5027 gcc_assert (stmt == tmp
5028 || GROUP_FIRST_ELEMENT (vinfo_for_stmt (tmp)) == stmt);
5031 if (STMT_VINFO_LIVE_P (vinfo_for_stmt (reduc_def_stmt)))
5032 return false;
5034 if (slp_node || PURE_SLP_STMT (stmt_info))
5035 ncopies = 1;
5036 else
5037 ncopies = (LOOP_VINFO_VECT_FACTOR (loop_vinfo)
5038 / TYPE_VECTOR_SUBPARTS (vectype_in));
5040 gcc_assert (ncopies >= 1);
5042 vec_mode = TYPE_MODE (vectype_in);
5044 if (code == COND_EXPR)
5046 if (!vectorizable_condition (stmt, gsi, NULL, ops[reduc_index], 0, NULL))
5048 if (dump_enabled_p ())
5049 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5050 "unsupported condition in reduction\n");
5052 return false;
5055 else
5057 /* 4. Supportable by target? */
5059 if (code == LSHIFT_EXPR || code == RSHIFT_EXPR
5060 || code == LROTATE_EXPR || code == RROTATE_EXPR)
5062 /* Shifts and rotates are only supported by vectorizable_shifts,
5063 not vectorizable_reduction. */
5064 if (dump_enabled_p ())
5065 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5066 "unsupported shift or rotation.\n");
5067 return false;
5070 /* 4.1. check support for the operation in the loop */
5071 optab = optab_for_tree_code (code, vectype_in, optab_default);
5072 if (!optab)
5074 if (dump_enabled_p ())
5075 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5076 "no optab.\n");
5078 return false;
5081 if (optab_handler (optab, vec_mode) == CODE_FOR_nothing)
5083 if (dump_enabled_p ())
5084 dump_printf (MSG_NOTE, "op not supported by target.\n");
5086 if (GET_MODE_SIZE (vec_mode) != UNITS_PER_WORD
5087 || LOOP_VINFO_VECT_FACTOR (loop_vinfo)
5088 < vect_min_worthwhile_factor (code))
5089 return false;
5091 if (dump_enabled_p ())
5092 dump_printf (MSG_NOTE, "proceeding using word mode.\n");
5095 /* Worthwhile without SIMD support? */
5096 if (!VECTOR_MODE_P (TYPE_MODE (vectype_in))
5097 && LOOP_VINFO_VECT_FACTOR (loop_vinfo)
5098 < vect_min_worthwhile_factor (code))
5100 if (dump_enabled_p ())
5101 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5102 "not worthwhile without SIMD support.\n");
5104 return false;
5108 /* 4.2. Check support for the epilog operation.
5110 If STMT represents a reduction pattern, then the type of the
5111 reduction variable may be different than the type of the rest
5112 of the arguments. For example, consider the case of accumulation
5113 of shorts into an int accumulator; the original code:
5114 S1: int_a = (int) short_a;
5115 orig_stmt-> S2: int_acc = plus <int_a ,int_acc>;
5117 was replaced with:
5118 STMT: int_acc = widen_sum <short_a, int_acc>
5120 This means that:
5121 1. The tree-code that is used to create the vector operation in the
5122 epilog code (that reduces the partial results) is not the
5123 tree-code of STMT, but is rather the tree-code of the original
5124 stmt from the pattern that STMT is replacing. I.e, in the example
5125 above we want to use 'widen_sum' in the loop, but 'plus' in the
5126 epilog.
5127 2. The type (mode) we use to check available target support
5128 for the vector operation to be created in the *epilog*, is
5129 determined by the type of the reduction variable (in the example
5130 above we'd check this: optab_handler (plus_optab, vect_int_mode)).
5131 However the type (mode) we use to check available target support
5132 for the vector operation to be created *inside the loop*, is
5133 determined by the type of the other arguments to STMT (in the
5134 example we'd check this: optab_handler (widen_sum_optab,
5135 vect_short_mode)).
5137 This is contrary to "regular" reductions, in which the types of all
5138 the arguments are the same as the type of the reduction variable.
5139 For "regular" reductions we can therefore use the same vector type
5140 (and also the same tree-code) when generating the epilog code and
5141 when generating the code inside the loop. */
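/* Continuing the illustration above (the optab names follow the comment
   and are only illustrative): with V8HI arguments and a V4SI accumulator,
   the in-loop operation is checked roughly as
     optab_handler (widen_sum_optab, V8HImode)
   whereas the epilog reduction is checked against the accumulator type,
     optab_handler (plus_optab, V4SImode).  */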
5143 if (orig_stmt)
5145 /* This is a reduction pattern: get the vectype from the type of the
5146 reduction variable, and get the tree-code from orig_stmt. */
5147 orig_code = gimple_assign_rhs_code (orig_stmt);
5148 gcc_assert (vectype_out);
5149 vec_mode = TYPE_MODE (vectype_out);
5151 else
5153 /* Regular reduction: the same vectype and tree-code that are used for
5154 the vector code inside the loop can be used for the epilog code. */
5155 orig_code = code;
5158 if (nested_cycle)
5160 def_bb = gimple_bb (reduc_def_stmt);
5161 def_stmt_loop = def_bb->loop_father;
5162 def_arg = PHI_ARG_DEF_FROM_EDGE (reduc_def_stmt,
5163 loop_preheader_edge (def_stmt_loop));
5164 if (TREE_CODE (def_arg) == SSA_NAME
5165 && (def_arg_stmt = SSA_NAME_DEF_STMT (def_arg))
5166 && gimple_code (def_arg_stmt) == GIMPLE_PHI
5167 && flow_bb_inside_loop_p (outer_loop, gimple_bb (def_arg_stmt))
5168 && vinfo_for_stmt (def_arg_stmt)
5169 && STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def_arg_stmt))
5170 == vect_double_reduction_def)
5171 double_reduc = true;
5174 epilog_reduc_code = ERROR_MARK;
5175 if (reduction_code_for_scalar_code (orig_code, &epilog_reduc_code))
5177 reduc_optab = optab_for_tree_code (epilog_reduc_code, vectype_out,
5178 optab_default);
5179 if (!reduc_optab)
5181 if (dump_enabled_p ())
5182 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5183 "no optab for reduction.\n");
5185 epilog_reduc_code = ERROR_MARK;
5187 else if (optab_handler (reduc_optab, vec_mode) == CODE_FOR_nothing)
5189 optab = scalar_reduc_to_vector (reduc_optab, vectype_out);
5190 if (optab_handler (optab, vec_mode) == CODE_FOR_nothing)
5192 if (dump_enabled_p ())
5193 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5194 "reduc op not supported by target.\n");
5196 epilog_reduc_code = ERROR_MARK;
5200 else
5202 if (!nested_cycle || double_reduc)
5204 if (dump_enabled_p ())
5205 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5206 "no reduc code for scalar code.\n");
5208 return false;
5212 if (double_reduc && ncopies > 1)
5214 if (dump_enabled_p ())
5215 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5216 "multiple types in double reduction\n");
5218 return false;
5221 /* In case of widening multiplication by a constant, we update the type
5222 of the constant to be the type of the other operand. We check that the
5223 constant fits the type in the pattern recognition pass. */
5224 if (code == DOT_PROD_EXPR
5225 && !types_compatible_p (TREE_TYPE (ops[0]), TREE_TYPE (ops[1])))
5227 if (TREE_CODE (ops[0]) == INTEGER_CST)
5228 ops[0] = fold_convert (TREE_TYPE (ops[1]), ops[0]);
5229 else if (TREE_CODE (ops[1]) == INTEGER_CST)
5230 ops[1] = fold_convert (TREE_TYPE (ops[0]), ops[1]);
5231 else
5233 if (dump_enabled_p ())
5234 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5235 "invalid types in dot-prod\n");
5237 return false;
5241 if (!vec_stmt) /* transformation not required. */
5243 if (!vect_model_reduction_cost (stmt_info, epilog_reduc_code, ncopies,
5244 reduc_index))
5245 return false;
5246 STMT_VINFO_TYPE (stmt_info) = reduc_vec_info_type;
5247 return true;
5250 /** Transform. **/
5252 if (dump_enabled_p ())
5253 dump_printf_loc (MSG_NOTE, vect_location, "transform reduction.\n");
5255 /* FORNOW: Multiple types are not supported for condition. */
5256 if (code == COND_EXPR)
5257 gcc_assert (ncopies == 1);
5259 /* Create the destination vector */
5260 vec_dest = vect_create_destination_var (scalar_dest, vectype_out);
5262 /* In case the vectorization factor (VF) is bigger than the number
5263 of elements that we can fit in a vectype (nunits), we have to generate
5264 more than one vector stmt - i.e. - we need to "unroll" the
5265 vector stmt by a factor VF/nunits. For more details see documentation
5266 in vectorizable_operation. */
5268 /* If the reduction is used in an outer loop we need to generate
5269 VF intermediate results, like so (e.g. for ncopies=2):
5270 r0 = phi (init, r0)
5271 r1 = phi (init, r1)
5272 r0 = x0 + r0;
5273 r1 = x1 + r1;
5274 (i.e. we generate VF results in 2 registers).
5275 In this case we have a separate def-use cycle for each copy, and therefore
5276 for each copy we get the vector def for the reduction variable from the
5277 respective phi node created for this copy.
5279 Otherwise (the reduction is unused in the loop nest), we can combine
5280 together intermediate results, like so (e.g. for ncopies=2):
5281 r = phi (init, r)
5282 r = x0 + r;
5283 r = x1 + r;
5284 (i.e. we generate VF/2 results in a single register).
5285 In this case for each copy we get the vector def for the reduction variable
5286 from the vectorized reduction operation generated in the previous iteration.
5289 if (STMT_VINFO_RELEVANT (stmt_info) == vect_unused_in_scope)
5291 single_defuse_cycle = true;
5292 epilog_copies = 1;
5294 else
5295 epilog_copies = ncopies;
5297 prev_stmt_info = NULL;
5298 prev_phi_info = NULL;
5299 if (slp_node)
5301 vec_num = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node);
5302 gcc_assert (TYPE_VECTOR_SUBPARTS (vectype_out)
5303 == TYPE_VECTOR_SUBPARTS (vectype_in));
5305 else
5307 vec_num = 1;
5308 vec_oprnds0.create (1);
5309 if (op_type == ternary_op)
5310 vec_oprnds1.create (1);
5313 phis.create (vec_num);
5314 vect_defs.create (vec_num);
5315 if (!slp_node)
5316 vect_defs.quick_push (NULL_TREE);
5318 for (j = 0; j < ncopies; j++)
5320 if (j == 0 || !single_defuse_cycle)
5322 for (i = 0; i < vec_num; i++)
5324 /* Create the reduction-phi that defines the reduction
5325 operand. */
5326 new_phi = create_phi_node (vec_dest, loop->header);
5327 set_vinfo_for_stmt (new_phi,
5328 new_stmt_vec_info (new_phi, loop_vinfo,
5329 NULL));
5330 if (j == 0 || slp_node)
5331 phis.quick_push (new_phi);
5335 if (code == COND_EXPR)
5337 gcc_assert (!slp_node);
5338 vectorizable_condition (stmt, gsi, vec_stmt,
5339 PHI_RESULT (phis[0]),
5340 reduc_index, NULL);
5341 /* Multiple types are not supported for condition. */
5342 break;
5345 /* Handle uses. */
5346 if (j == 0)
5348 op0 = ops[!reduc_index];
5349 if (op_type == ternary_op)
5351 if (reduc_index == 0)
5352 op1 = ops[2];
5353 else
5354 op1 = ops[1];
5357 if (slp_node)
5358 vect_get_vec_defs (op0, op1, stmt, &vec_oprnds0, &vec_oprnds1,
5359 slp_node, -1);
5360 else
5362 loop_vec_def0 = vect_get_vec_def_for_operand (ops[!reduc_index],
5363 stmt, NULL);
5364 vec_oprnds0.quick_push (loop_vec_def0);
5365 if (op_type == ternary_op)
5367 loop_vec_def1 = vect_get_vec_def_for_operand (op1, stmt,
5368 NULL);
5369 vec_oprnds1.quick_push (loop_vec_def1);
5373 else
5375 if (!slp_node)
5377 enum vect_def_type dt;
5378 gimple dummy_stmt;
5379 tree dummy;
5381 vect_is_simple_use (ops[!reduc_index], stmt, loop_vinfo, NULL,
5382 &dummy_stmt, &dummy, &dt);
5383 loop_vec_def0 = vect_get_vec_def_for_stmt_copy (dt,
5384 loop_vec_def0);
5385 vec_oprnds0[0] = loop_vec_def0;
5386 if (op_type == ternary_op)
5388 vect_is_simple_use (op1, stmt, loop_vinfo, NULL, &dummy_stmt,
5389 &dummy, &dt);
5390 loop_vec_def1 = vect_get_vec_def_for_stmt_copy (dt,
5391 loop_vec_def1);
5392 vec_oprnds1[0] = loop_vec_def1;
5396 if (single_defuse_cycle)
5397 reduc_def = gimple_assign_lhs (new_stmt);
5399 STMT_VINFO_RELATED_STMT (prev_phi_info) = new_phi;
5402 FOR_EACH_VEC_ELT (vec_oprnds0, i, def0)
5404 if (slp_node)
5405 reduc_def = PHI_RESULT (phis[i]);
5406 else
5408 if (!single_defuse_cycle || j == 0)
5409 reduc_def = PHI_RESULT (new_phi);
5412 def1 = ((op_type == ternary_op)
5413 ? vec_oprnds1[i] : NULL);
5414 if (op_type == binary_op)
5416 if (reduc_index == 0)
5417 expr = build2 (code, vectype_out, reduc_def, def0);
5418 else
5419 expr = build2 (code, vectype_out, def0, reduc_def);
5421 else
5423 if (reduc_index == 0)
5424 expr = build3 (code, vectype_out, reduc_def, def0, def1);
5425 else
5427 if (reduc_index == 1)
5428 expr = build3 (code, vectype_out, def0, reduc_def, def1);
5429 else
5430 expr = build3 (code, vectype_out, def0, def1, reduc_def);
5434 new_stmt = gimple_build_assign (vec_dest, expr);
5435 new_temp = make_ssa_name (vec_dest, new_stmt);
5436 gimple_assign_set_lhs (new_stmt, new_temp);
5437 vect_finish_stmt_generation (stmt, new_stmt, gsi);
5439 if (slp_node)
5441 SLP_TREE_VEC_STMTS (slp_node).quick_push (new_stmt);
5442 vect_defs.quick_push (new_temp);
5444 else
5445 vect_defs[0] = new_temp;
5448 if (slp_node)
5449 continue;
5451 if (j == 0)
5452 STMT_VINFO_VEC_STMT (stmt_info) = *vec_stmt = new_stmt;
5453 else
5454 STMT_VINFO_RELATED_STMT (prev_stmt_info) = new_stmt;
5456 prev_stmt_info = vinfo_for_stmt (new_stmt);
5457 prev_phi_info = vinfo_for_stmt (new_phi);
5460 /* Finalize the reduction-phi (set its arguments) and create the
5461 epilog reduction code. */
5462 if ((!single_defuse_cycle || code == COND_EXPR) && !slp_node)
5464 new_temp = gimple_assign_lhs (*vec_stmt);
5465 vect_defs[0] = new_temp;
5468 vect_create_epilog_for_reduction (vect_defs, stmt, epilog_copies,
5469 epilog_reduc_code, phis, reduc_index,
5470 double_reduc, slp_node);
5472 return true;
5475 /* Function vect_min_worthwhile_factor.
5477 For a loop where we could vectorize the operation indicated by CODE,
5478 return the minimum vectorization factor that makes it worthwhile
5479 to use generic vectors. */
5480 int
5481 vect_min_worthwhile_factor (enum tree_code code)
5483 switch (code)
5485 case PLUS_EXPR:
5486 case MINUS_EXPR:
5487 case NEGATE_EXPR:
5488 return 4;
5490 case BIT_AND_EXPR:
5491 case BIT_IOR_EXPR:
5492 case BIT_XOR_EXPR:
5493 case BIT_NOT_EXPR:
5494 return 2;
5496 default:
5497 return INT_MAX;
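/* A minimal usage sketch (hypothetical caller, for illustration only):
   analysis code would compare the loop's vectorization factor against
   this minimum before resorting to generic (emulated) vector
   operations, along the lines of

       if (!VECTOR_MODE_P (TYPE_MODE (vectype))
           && LOOP_VINFO_VECT_FACTOR (loop_vinfo)
              < vect_min_worthwhile_factor (code))
         return false;

   so e.g. an emulated PLUS_EXPR is only considered worthwhile for
   VF >= 4, the bitwise operations for VF >= 2, and everything else
   never (INT_MAX).  */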
5502 /* Function vectorizable_induction
5504 Check if PHI performs an induction computation that can be vectorized.
5505 If VEC_STMT is also passed, vectorize the induction PHI: create a vectorized
5506 phi to replace it, put it in VEC_STMT, and add it to the same basic block.
5507 Return FALSE if not a vectorizable STMT, TRUE otherwise. */
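/* Illustrative sketch (editor's example, VF = 4): a scalar induction
   i = phi (0, i + 1) is vectorized by giving the new phi the initial
   vector def {0, 1, 2, 3} and updating it on the latch edge with the
   scalar step multiplied by VF, i.e. vi = vi + {4, 4, 4, 4}, so that
   each vector lane tracks one scalar iteration.  */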
5509 bool
5510 vectorizable_induction (gimple phi, gimple_stmt_iterator *gsi ATTRIBUTE_UNUSED,
5511 gimple *vec_stmt)
5513 stmt_vec_info stmt_info = vinfo_for_stmt (phi);
5514 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
5515 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
5516 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
5517 int nunits = TYPE_VECTOR_SUBPARTS (vectype);
5518 int ncopies = LOOP_VINFO_VECT_FACTOR (loop_vinfo) / nunits;
5519 tree vec_def;
5521 gcc_assert (ncopies >= 1);
5522 /* FORNOW. These restrictions should be relaxed. */
5523 if (nested_in_vect_loop_p (loop, phi))
5525 imm_use_iterator imm_iter;
5526 use_operand_p use_p;
5527 gimple exit_phi;
5528 edge latch_e;
5529 tree loop_arg;
5531 if (ncopies > 1)
5533 if (dump_enabled_p ())
5534 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5535 "multiple types in nested loop.\n");
5536 return false;
5539 exit_phi = NULL;
5540 latch_e = loop_latch_edge (loop->inner);
5541 loop_arg = PHI_ARG_DEF_FROM_EDGE (phi, latch_e);
5542 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, loop_arg)
5544 gimple use_stmt = USE_STMT (use_p);
5545 if (is_gimple_debug (use_stmt))
5546 continue;
5548 if (!flow_bb_inside_loop_p (loop->inner, gimple_bb (use_stmt)))
5550 exit_phi = use_stmt;
5551 break;
5554 if (exit_phi)
5556 stmt_vec_info exit_phi_vinfo = vinfo_for_stmt (exit_phi);
5557 if (!(STMT_VINFO_RELEVANT_P (exit_phi_vinfo)
5558 && !STMT_VINFO_LIVE_P (exit_phi_vinfo)))
5560 if (dump_enabled_p ())
5561 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5562 "inner-loop induction only used outside "
5563 "of the outer vectorized loop.\n");
5564 return false;
5569 if (!STMT_VINFO_RELEVANT_P (stmt_info))
5570 return false;
5572 /* FORNOW: SLP not supported. */
5573 if (STMT_SLP_TYPE (stmt_info))
5574 return false;
5576 gcc_assert (STMT_VINFO_DEF_TYPE (stmt_info) == vect_induction_def);
5578 if (gimple_code (phi) != GIMPLE_PHI)
5579 return false;
5581 if (!vec_stmt) /* transformation not required. */
5583 STMT_VINFO_TYPE (stmt_info) = induc_vec_info_type;
5584 if (dump_enabled_p ())
5585 dump_printf_loc (MSG_NOTE, vect_location,
5586 "=== vectorizable_induction ===\n");
5587 vect_model_induction_cost (stmt_info, ncopies);
5588 return true;
5591 /** Transform. **/
5593 if (dump_enabled_p ())
5594 dump_printf_loc (MSG_NOTE, vect_location, "transform induction phi.\n");
5596 vec_def = get_initial_def_for_induction (phi);
5597 *vec_stmt = SSA_NAME_DEF_STMT (vec_def);
5598 return true;
5601 /* Function vectorizable_live_operation.
5603 STMT computes a value that is used outside the loop. Check if
5604 it can be supported. */
5606 bool
5607 vectorizable_live_operation (gimple stmt,
5608 gimple_stmt_iterator *gsi ATTRIBUTE_UNUSED,
5609 gimple *vec_stmt)
5611 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
5612 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
5613 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
5614 int i;
5615 int op_type;
5616 tree op;
5617 tree def;
5618 gimple def_stmt;
5619 enum vect_def_type dt;
5620 enum tree_code code;
5621 enum gimple_rhs_class rhs_class;
5623 gcc_assert (STMT_VINFO_LIVE_P (stmt_info));
5625 if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_reduction_def)
5626 return false;
5628 if (!is_gimple_assign (stmt))
5630 if (gimple_call_internal_p (stmt)
5631 && gimple_call_internal_fn (stmt) == IFN_GOMP_SIMD_LANE
5632 && gimple_call_lhs (stmt)
5633 && loop->simduid
5634 && TREE_CODE (gimple_call_arg (stmt, 0)) == SSA_NAME
5635 && loop->simduid
5636 == SSA_NAME_VAR (gimple_call_arg (stmt, 0)))
5638 edge e = single_exit (loop);
5639 basic_block merge_bb = e->dest;
5640 imm_use_iterator imm_iter;
5641 use_operand_p use_p;
5642 tree lhs = gimple_call_lhs (stmt);
5644 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, lhs)
5646 gimple use_stmt = USE_STMT (use_p);
5647 if (gimple_code (use_stmt) == GIMPLE_PHI
5648 && gimple_bb (use_stmt) == merge_bb)
5650 if (vec_stmt)
5652 tree vfm1
5653 = build_int_cst (unsigned_type_node,
5654 loop_vinfo->vectorization_factor - 1);
5655 SET_PHI_ARG_DEF (use_stmt, e->dest_idx, vfm1);
5657 return true;
5662 return false;
5665 if (TREE_CODE (gimple_assign_lhs (stmt)) != SSA_NAME)
5666 return false;
5668 /* FORNOW. CHECKME. */
5669 if (nested_in_vect_loop_p (loop, stmt))
5670 return false;
5672 code = gimple_assign_rhs_code (stmt);
5673 op_type = TREE_CODE_LENGTH (code);
5674 rhs_class = get_gimple_rhs_class (code);
5675 gcc_assert (rhs_class != GIMPLE_UNARY_RHS || op_type == unary_op);
5676 gcc_assert (rhs_class != GIMPLE_BINARY_RHS || op_type == binary_op);
5678 /* FORNOW: support only if all uses are invariant. This means
5679 that the scalar operations can remain in place, unvectorized.
5680 The original last scalar value that they compute will be used. */
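  /* For instance (illustrative): a live statement t = x * 3, where x
     is defined before the loop and t is also used after the loop, has
     only invariant operands; the scalar statement can stay in place
     and the value it computes in the last iteration (the same in every
     iteration) is what the out-of-loop uses see.  */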
5682 for (i = 0; i < op_type; i++)
5684 if (rhs_class == GIMPLE_SINGLE_RHS)
5685 op = TREE_OPERAND (gimple_op (stmt, 1), i);
5686 else
5687 op = gimple_op (stmt, i + 1);
5688 if (op
5689 && !vect_is_simple_use (op, stmt, loop_vinfo, NULL, &def_stmt, &def,
5690 &dt))
5692 if (dump_enabled_p ())
5693 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5694 "use not simple.\n");
5695 return false;
5698 if (dt != vect_external_def && dt != vect_constant_def)
5699 return false;
5702 /* No transformation is required for the cases we currently support. */
5703 return true;
5706 /* Kill any debug uses outside LOOP of SSA names defined in STMT. */
5708 static void
5709 vect_loop_kill_debug_uses (struct loop *loop, gimple stmt)
5711 ssa_op_iter op_iter;
5712 imm_use_iterator imm_iter;
5713 def_operand_p def_p;
5714 gimple ustmt;
5716 FOR_EACH_PHI_OR_STMT_DEF (def_p, stmt, op_iter, SSA_OP_DEF)
5718 FOR_EACH_IMM_USE_STMT (ustmt, imm_iter, DEF_FROM_PTR (def_p))
5720 basic_block bb;
5722 if (!is_gimple_debug (ustmt))
5723 continue;
5725 bb = gimple_bb (ustmt);
5727 if (!flow_bb_inside_loop_p (loop, bb))
5729 if (gimple_debug_bind_p (ustmt))
5731 if (dump_enabled_p ())
5732 dump_printf_loc (MSG_NOTE, vect_location,
5733 "killing debug use\n");
5735 gimple_debug_bind_reset_value (ustmt);
5736 update_stmt (ustmt);
5738 else
5739 gcc_unreachable ();
5746 /* This function builds ni_name = number of iterations. Statements
5747 are emitted on the loop preheader edge. */
5749 static tree
5750 vect_build_loop_niters (loop_vec_info loop_vinfo)
5752 tree ni = unshare_expr (LOOP_VINFO_NITERS (loop_vinfo));
5753 if (TREE_CODE (ni) == INTEGER_CST)
5754 return ni;
5755 else
5757 tree ni_name, var;
5758 gimple_seq stmts = NULL;
5759 edge pe = loop_preheader_edge (LOOP_VINFO_LOOP (loop_vinfo));
5761 var = create_tmp_var (TREE_TYPE (ni), "niters");
5762 ni_name = force_gimple_operand (ni, &stmts, false, var);
5763 if (stmts)
5764 gsi_insert_seq_on_edge_immediate (pe, stmts);
5766 return ni_name;
5771 /* This function generates the following statements:
5773 ni_name = number of iterations the loop executes
5774 ratio = ni_name / vf
5775 ratio_mult_vf_name = ratio * vf
5777 and places them on the loop preheader edge. */
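/* Worked example (illustrative, assuming vf = 4): for ni_name = 10 the
   generated statements compute ratio = 10 / 4 = 2 vector iterations and
   ratio_mult_vf_name = 2 * 4 = 8, leaving 10 - 8 = 2 iterations for the
   scalar epilog loop.  */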
5779 static void
5780 vect_generate_tmps_on_preheader (loop_vec_info loop_vinfo,
5781 tree ni_name,
5782 tree *ratio_mult_vf_name_ptr,
5783 tree *ratio_name_ptr)
5785 tree ni_minus_gap_name;
5786 tree var;
5787 tree ratio_name;
5788 tree ratio_mult_vf_name;
5789 int vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
5790 edge pe = loop_preheader_edge (LOOP_VINFO_LOOP (loop_vinfo));
5791 tree log_vf;
5793 log_vf = build_int_cst (TREE_TYPE (ni_name), exact_log2 (vf));
5795 /* If an epilogue loop is required because of data accesses with gaps, we
5796 subtract one iteration from the total number of iterations here for
5797 correct calculation of RATIO. */
5798 if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo))
5800 ni_minus_gap_name = fold_build2 (MINUS_EXPR, TREE_TYPE (ni_name),
5801 ni_name,
5802 build_one_cst (TREE_TYPE (ni_name)));
5803 if (!is_gimple_val (ni_minus_gap_name))
5805 var = create_tmp_var (TREE_TYPE (ni_name), "ni_gap");
5806 gimple stmts = NULL;
5807 ni_minus_gap_name = force_gimple_operand (ni_minus_gap_name, &stmts,
5808 true, var);
5809 gsi_insert_seq_on_edge_immediate (pe, stmts);
5812 else
5813 ni_minus_gap_name = ni_name;
5815 /* Create: ratio = ni >> log2(vf) */
5816 /* ??? As we have ni == number of latch executions + 1, ni could
5817 have overflowed to zero.  So avoid computing the ratio directly
5818 from ni; instead use the fact that the ratio will be at least one
5819 and compute it as ((ni - vf) >> log2 (vf)) + 1. */
5820 ratio_name
5821 = fold_build2 (PLUS_EXPR, TREE_TYPE (ni_name),
5822 fold_build2 (RSHIFT_EXPR, TREE_TYPE (ni_name),
5823 fold_build2 (MINUS_EXPR, TREE_TYPE (ni_name),
5824 ni_minus_gap_name,
5825 build_int_cst
5826 (TREE_TYPE (ni_name), vf)),
5827 log_vf),
5828 build_int_cst (TREE_TYPE (ni_name), 1));
5829 if (!is_gimple_val (ratio_name))
5831 var = create_tmp_var (TREE_TYPE (ni_name), "bnd");
5832 gimple stmts = NULL;
5833 ratio_name = force_gimple_operand (ratio_name, &stmts, true, var);
5834 gsi_insert_seq_on_edge_immediate (pe, stmts);
5836 *ratio_name_ptr = ratio_name;
5838 /* Create: ratio_mult_vf = ratio << log2 (vf). */
5840 if (ratio_mult_vf_name_ptr)
5842 ratio_mult_vf_name = fold_build2 (LSHIFT_EXPR, TREE_TYPE (ratio_name),
5843 ratio_name, log_vf);
5844 if (!is_gimple_val (ratio_mult_vf_name))
5846 var = create_tmp_var (TREE_TYPE (ni_name), "ratio_mult_vf");
5847 gimple stmts = NULL;
5848 ratio_mult_vf_name = force_gimple_operand (ratio_mult_vf_name, &stmts,
5849 true, var);
5850 gsi_insert_seq_on_edge_immediate (pe, stmts);
5852 *ratio_mult_vf_name_ptr = ratio_mult_vf_name;
5855 return;
5859 /* Function vect_transform_loop.
5861 The analysis phase has determined that the loop is vectorizable.
5862 Vectorize the loop - create vectorized stmts to replace the scalar
5863 stmts in the loop, and update the loop exit condition. */
5865 void
5866 vect_transform_loop (loop_vec_info loop_vinfo)
5868 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
5869 basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
5870 int nbbs = loop->num_nodes;
5871 int i;
5872 tree ratio = NULL;
5873 int vectorization_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
5874 bool grouped_store;
5875 bool slp_scheduled = false;
5876 gimple stmt, pattern_stmt;
5877 gimple_seq pattern_def_seq = NULL;
5878 gimple_stmt_iterator pattern_def_si = gsi_none ();
5879 bool transform_pattern_stmt = false;
5880 bool check_profitability = false;
5881 int th;
5882 /* Record number of iterations before we started tampering with the profile. */
5883 gcov_type expected_iterations = expected_loop_iterations_unbounded (loop);
5885 if (dump_enabled_p ())
5886 dump_printf_loc (MSG_NOTE, vect_location, "=== vec_transform_loop ===\n");
5888 /* If the profile is imprecise, we have a chance to fix it up. */
5889 if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
5890 expected_iterations = LOOP_VINFO_INT_NITERS (loop_vinfo);
5892 /* Use the more conservative vectorization threshold. If the number
5893 of iterations is constant, assume the cost check has been performed
5894 by our caller.  If the threshold makes all loops profitable that
5895 run at least the vectorization factor number of times, checking
5896 is pointless, too. */
5897 th = LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo);
5898 if (th >= LOOP_VINFO_VECT_FACTOR (loop_vinfo) - 1
5899 && !LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
5901 if (dump_enabled_p ())
5902 dump_printf_loc (MSG_NOTE, vect_location,
5903 "Profitability threshold is %d loop iterations.\n",
5904 th);
5905 check_profitability = true;
5908 /* Version the loop first, if required, so the profitability check
5909 comes first. */
5911 if (LOOP_REQUIRES_VERSIONING_FOR_ALIGNMENT (loop_vinfo)
5912 || LOOP_REQUIRES_VERSIONING_FOR_ALIAS (loop_vinfo))
5914 vect_loop_versioning (loop_vinfo, th, check_profitability);
5915 check_profitability = false;
5918 tree ni_name = vect_build_loop_niters (loop_vinfo);
5919 LOOP_VINFO_NITERS_UNCHANGED (loop_vinfo) = ni_name;
5921 /* Peel the loop if there are data refs with unknown alignment.
5922 Only one data ref with unknown alignment is allowed. */
5924 if (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo))
5926 vect_do_peeling_for_alignment (loop_vinfo, ni_name,
5927 th, check_profitability);
5928 check_profitability = false;
5929 /* The above adjusts LOOP_VINFO_NITERS, so ni_name must be
5930 re-computed. */
5931 ni_name = NULL_TREE;
5934 /* If the loop has a symbolic number of iterations 'n' (i.e. it's not a
5935 compile-time constant), or it is a constant that is not a multiple of the
5936 vectorization factor, then an epilog loop needs to be created.
5937 We therefore duplicate the loop: the original loop will be vectorized,
5938 and will compute the first (n/VF) iterations. The second copy of the loop
5939 will remain scalar and will compute the remaining (n%VF) iterations.
5940 (VF is the vectorization factor). */
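   /* For example (illustrative): with n = 103 and VF = 8 the
      vectorized loop executes 103 / 8 = 12 vector iterations, covering
      the first 96 scalar iterations, and the scalar epilog loop
      executes the remaining 103 % 8 = 7 iterations.  */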
5942 if (LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo)
5943 || LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo))
5945 tree ratio_mult_vf;
5946 if (!ni_name)
5947 ni_name = vect_build_loop_niters (loop_vinfo);
5948 vect_generate_tmps_on_preheader (loop_vinfo, ni_name, &ratio_mult_vf,
5949 &ratio);
5950 vect_do_peeling_for_loop_bound (loop_vinfo, ni_name, ratio_mult_vf,
5951 th, check_profitability);
5953 else if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
5954 ratio = build_int_cst (TREE_TYPE (LOOP_VINFO_NITERS (loop_vinfo)),
5955 LOOP_VINFO_INT_NITERS (loop_vinfo) / vectorization_factor);
5956 else
5958 if (!ni_name)
5959 ni_name = vect_build_loop_niters (loop_vinfo);
5960 vect_generate_tmps_on_preheader (loop_vinfo, ni_name, NULL, &ratio);
5963 /* 1) Make sure the loop header has exactly two entries
5964 2) Make sure we have a preheader basic block. */
5966 gcc_assert (EDGE_COUNT (loop->header->preds) == 2);
5968 split_edge (loop_preheader_edge (loop));
5970 /* FORNOW: the vectorizer supports only loops whose body consists
5971 of one basic block (header + empty latch).  When the vectorizer
5972 supports more involved loop forms, the order in which the BBs are
5973 traversed will need to be reconsidered. */
5975 for (i = 0; i < nbbs; i++)
5977 basic_block bb = bbs[i];
5978 stmt_vec_info stmt_info;
5980 for (gphi_iterator si = gsi_start_phis (bb); !gsi_end_p (si);
5981 gsi_next (&si))
5983 gphi *phi = si.phi ();
5984 if (dump_enabled_p ())
5986 dump_printf_loc (MSG_NOTE, vect_location,
5987 "------>vectorizing phi: ");
5988 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, phi, 0);
5989 dump_printf (MSG_NOTE, "\n");
5991 stmt_info = vinfo_for_stmt (phi);
5992 if (!stmt_info)
5993 continue;
5995 if (MAY_HAVE_DEBUG_STMTS && !STMT_VINFO_LIVE_P (stmt_info))
5996 vect_loop_kill_debug_uses (loop, phi);
5998 if (!STMT_VINFO_RELEVANT_P (stmt_info)
5999 && !STMT_VINFO_LIVE_P (stmt_info))
6000 continue;
6002 if (STMT_VINFO_VECTYPE (stmt_info)
6003 && (TYPE_VECTOR_SUBPARTS (STMT_VINFO_VECTYPE (stmt_info))
6004 != (unsigned HOST_WIDE_INT) vectorization_factor)
6005 && dump_enabled_p ())
6006 dump_printf_loc (MSG_NOTE, vect_location, "multiple-types.\n");
6008 if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_induction_def)
6010 if (dump_enabled_p ())
6011 dump_printf_loc (MSG_NOTE, vect_location, "transform phi.\n");
6012 vect_transform_stmt (phi, NULL, NULL, NULL, NULL);
6016 pattern_stmt = NULL;
6017 for (gimple_stmt_iterator si = gsi_start_bb (bb);
6018 !gsi_end_p (si) || transform_pattern_stmt;)
6020 bool is_store;
6022 if (transform_pattern_stmt)
6023 stmt = pattern_stmt;
6024 else
6026 stmt = gsi_stmt (si);
6027 /* During vectorization remove existing clobber stmts. */
6028 if (gimple_clobber_p (stmt))
6030 unlink_stmt_vdef (stmt);
6031 gsi_remove (&si, true);
6032 release_defs (stmt);
6033 continue;
6037 if (dump_enabled_p ())
6039 dump_printf_loc (MSG_NOTE, vect_location,
6040 "------>vectorizing statement: ");
6041 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, stmt, 0);
6042 dump_printf (MSG_NOTE, "\n");
6045 stmt_info = vinfo_for_stmt (stmt);
6047 /* vector stmts created in the outer-loop during vectorization of
6048 stmts in an inner-loop may not have a stmt_info, and do not
6049 need to be vectorized. */
6050 if (!stmt_info)
6052 gsi_next (&si);
6053 continue;
6056 if (MAY_HAVE_DEBUG_STMTS && !STMT_VINFO_LIVE_P (stmt_info))
6057 vect_loop_kill_debug_uses (loop, stmt);
6059 if (!STMT_VINFO_RELEVANT_P (stmt_info)
6060 && !STMT_VINFO_LIVE_P (stmt_info))
6062 if (STMT_VINFO_IN_PATTERN_P (stmt_info)
6063 && (pattern_stmt = STMT_VINFO_RELATED_STMT (stmt_info))
6064 && (STMT_VINFO_RELEVANT_P (vinfo_for_stmt (pattern_stmt))
6065 || STMT_VINFO_LIVE_P (vinfo_for_stmt (pattern_stmt))))
6067 stmt = pattern_stmt;
6068 stmt_info = vinfo_for_stmt (stmt);
6070 else
6072 gsi_next (&si);
6073 continue;
6076 else if (STMT_VINFO_IN_PATTERN_P (stmt_info)
6077 && (pattern_stmt = STMT_VINFO_RELATED_STMT (stmt_info))
6078 && (STMT_VINFO_RELEVANT_P (vinfo_for_stmt (pattern_stmt))
6079 || STMT_VINFO_LIVE_P (vinfo_for_stmt (pattern_stmt))))
6080 transform_pattern_stmt = true;
6082 /* If pattern statement has def stmts, vectorize them too. */
6083 if (is_pattern_stmt_p (stmt_info))
6085 if (pattern_def_seq == NULL)
6087 pattern_def_seq = STMT_VINFO_PATTERN_DEF_SEQ (stmt_info);
6088 pattern_def_si = gsi_start (pattern_def_seq);
6090 else if (!gsi_end_p (pattern_def_si))
6091 gsi_next (&pattern_def_si);
6092 if (pattern_def_seq != NULL)
6094 gimple pattern_def_stmt = NULL;
6095 stmt_vec_info pattern_def_stmt_info = NULL;
6097 while (!gsi_end_p (pattern_def_si))
6099 pattern_def_stmt = gsi_stmt (pattern_def_si);
6100 pattern_def_stmt_info
6101 = vinfo_for_stmt (pattern_def_stmt);
6102 if (STMT_VINFO_RELEVANT_P (pattern_def_stmt_info)
6103 || STMT_VINFO_LIVE_P (pattern_def_stmt_info))
6104 break;
6105 gsi_next (&pattern_def_si);
6108 if (!gsi_end_p (pattern_def_si))
6110 if (dump_enabled_p ())
6112 dump_printf_loc (MSG_NOTE, vect_location,
6113 "==> vectorizing pattern def "
6114 "stmt: ");
6115 dump_gimple_stmt (MSG_NOTE, TDF_SLIM,
6116 pattern_def_stmt, 0);
6117 dump_printf (MSG_NOTE, "\n");
6120 stmt = pattern_def_stmt;
6121 stmt_info = pattern_def_stmt_info;
6123 else
6125 pattern_def_si = gsi_none ();
6126 transform_pattern_stmt = false;
6129 else
6130 transform_pattern_stmt = false;
6133 if (STMT_VINFO_VECTYPE (stmt_info))
6135 unsigned int nunits
6136 = (unsigned int)
6137 TYPE_VECTOR_SUBPARTS (STMT_VINFO_VECTYPE (stmt_info));
6138 if (!STMT_SLP_TYPE (stmt_info)
6139 && nunits != (unsigned int) vectorization_factor
6140 && dump_enabled_p ())
6141 /* For SLP, VF is set according to the unrolling factor, and not
6142 to the vector size, hence this message is not valid for SLP. */
6143 dump_printf_loc (MSG_NOTE, vect_location, "multiple-types.\n");
6146 /* SLP. Schedule all the SLP instances when the first SLP stmt is
6147 reached. */
6148 if (STMT_SLP_TYPE (stmt_info))
6150 if (!slp_scheduled)
6152 slp_scheduled = true;
6154 if (dump_enabled_p ())
6155 dump_printf_loc (MSG_NOTE, vect_location,
6156 "=== scheduling SLP instances ===\n");
6158 vect_schedule_slp (loop_vinfo, NULL);
6161 /* Hybrid SLP stmts must be vectorized in addition to SLP. */
6162 if (!vinfo_for_stmt (stmt) || PURE_SLP_STMT (stmt_info))
6164 if (!transform_pattern_stmt && gsi_end_p (pattern_def_si))
6166 pattern_def_seq = NULL;
6167 gsi_next (&si);
6169 continue;
6173 /* -------- vectorize statement ------------ */
6174 if (dump_enabled_p ())
6175 dump_printf_loc (MSG_NOTE, vect_location, "transform statement.\n");
6177 grouped_store = false;
6178 is_store = vect_transform_stmt (stmt, &si, &grouped_store, NULL, NULL);
6179 if (is_store)
6181 if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
6183 /* Interleaving. If IS_STORE is TRUE, the vectorization of the
6184 interleaving chain was completed - free all the stores in
6185 the chain. */
6186 gsi_next (&si);
6187 vect_remove_stores (GROUP_FIRST_ELEMENT (stmt_info));
6189 else
6191 /* Free the attached stmt_vec_info and remove the stmt. */
6192 gimple store = gsi_stmt (si);
6193 free_stmt_vec_info (store);
6194 unlink_stmt_vdef (store);
6195 gsi_remove (&si, true);
6196 release_defs (store);
6199 /* Stores can only appear at the end of pattern statements. */
6200 gcc_assert (!transform_pattern_stmt);
6201 pattern_def_seq = NULL;
6203 else if (!transform_pattern_stmt && gsi_end_p (pattern_def_si))
6205 pattern_def_seq = NULL;
6206 gsi_next (&si);
6208 } /* stmts in BB */
6209 } /* BBs in loop */
6211 slpeel_make_loop_iterate_ntimes (loop, ratio);
6213 /* Reduce loop iterations by the vectorization factor. */
6214 scale_loop_profile (loop, GCOV_COMPUTE_SCALE (1, vectorization_factor),
6215 expected_iterations / vectorization_factor);
6216 loop->nb_iterations_upper_bound
6217 = wi::udiv_floor (loop->nb_iterations_upper_bound, vectorization_factor);
6218 if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo)
6219 && loop->nb_iterations_upper_bound != 0)
6220 loop->nb_iterations_upper_bound = loop->nb_iterations_upper_bound - 1;
6221 if (loop->any_estimate)
6223 loop->nb_iterations_estimate
6224 = wi::udiv_floor (loop->nb_iterations_estimate, vectorization_factor);
6225 if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo)
6226 && loop->nb_iterations_estimate != 0)
6227 loop->nb_iterations_estimate = loop->nb_iterations_estimate - 1;
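  /* Illustrative numbers (editor's example): if the scalar loop was
     estimated to iterate 100 times and VF = 4, the estimate for the
     vectorized loop becomes 100 / 4 = 25; the upper bound is divided
     the same way, and when peeling for gaps one further iteration is
     subtracted because the vectorized loop then executes one iteration
     fewer, leaving it to the scalar epilog.  */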
6230 if (dump_enabled_p ())
6232 dump_printf_loc (MSG_NOTE, vect_location,
6233 "LOOP VECTORIZED\n");
6234 if (loop->inner)
6235 dump_printf_loc (MSG_NOTE, vect_location,
6236 "OUTER LOOP VECTORIZED\n");
6237 dump_printf (MSG_NOTE, "\n");