gcc/tree-vect-loop.c
1 /* Loop Vectorization
2 Copyright (C) 2003-2014 Free Software Foundation, Inc.
3 Contributed by Dorit Naishlos <dorit@il.ibm.com> and
4 Ira Rosen <irar@il.ibm.com>
6 This file is part of GCC.
8 GCC is free software; you can redistribute it and/or modify it under
9 the terms of the GNU General Public License as published by the Free
10 Software Foundation; either version 3, or (at your option) any later
11 version.
13 GCC is distributed in the hope that it will be useful, but WITHOUT ANY
14 WARRANTY; without even the implied warranty of MERCHANTABILITY or
15 FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
16 for more details.
18 You should have received a copy of the GNU General Public License
19 along with GCC; see the file COPYING3. If not see
20 <http://www.gnu.org/licenses/>. */
22 #include "config.h"
23 #include "system.h"
24 #include "coretypes.h"
25 #include "dumpfile.h"
26 #include "tm.h"
27 #include "tree.h"
28 #include "stor-layout.h"
29 #include "basic-block.h"
30 #include "gimple-pretty-print.h"
31 #include "tree-ssa-alias.h"
32 #include "internal-fn.h"
33 #include "gimple-expr.h"
34 #include "is-a.h"
35 #include "gimple.h"
36 #include "gimplify.h"
37 #include "gimple-iterator.h"
38 #include "gimplify-me.h"
39 #include "gimple-ssa.h"
40 #include "tree-phinodes.h"
41 #include "ssa-iterators.h"
42 #include "stringpool.h"
43 #include "tree-ssanames.h"
44 #include "tree-ssa-loop-ivopts.h"
45 #include "tree-ssa-loop-manip.h"
46 #include "tree-ssa-loop-niter.h"
47 #include "tree-pass.h"
48 #include "cfgloop.h"
49 #include "expr.h"
50 #include "recog.h"
51 #include "optabs.h"
52 #include "params.h"
53 #include "diagnostic-core.h"
54 #include "tree-chrec.h"
55 #include "tree-scalar-evolution.h"
56 #include "tree-vectorizer.h"
57 #include "target.h"
59 /* Loop Vectorization Pass.
61 This pass tries to vectorize loops.
63 For example, the vectorizer transforms the following simple loop:
65 short a[N]; short b[N]; short c[N]; int i;
67 for (i=0; i<N; i++){
68 a[i] = b[i] + c[i];
71 as if it was manually vectorized by rewriting the source code into:
73 typedef int __attribute__((mode(V8HI))) v8hi;
74 short a[N]; short b[N]; short c[N]; int i;
75 v8hi *pa = (v8hi*)a, *pb = (v8hi*)b, *pc = (v8hi*)c;
76 v8hi va, vb, vc;
78 for (i=0; i<N/8; i++){
79 vb = pb[i];
80 vc = pc[i];
81 va = vb + vc;
82 pa[i] = va;
85 The main entry to this pass is vectorize_loops(), in which
86 the vectorizer applies a set of analyses on a given set of loops,
87 followed by the actual vectorization transformation for the loops that
88 had successfully passed the analysis phase.
89 Throughout this pass we make a distinction between two types of
90 data: scalars (which are represented by SSA_NAMES), and memory references
91 ("data-refs"). These two types of data require different handling both
92 during analysis and transformation. The types of data-refs that the
93 vectorizer currently supports are ARRAY_REFS whose base is an array DECL
94 (not a pointer), and INDIRECT_REFS through pointers; both array and pointer
95 accesses are required to have a simple (consecutive) access pattern.
97 Analysis phase:
98 ===============
99 The driver for the analysis phase is vect_analyze_loop().
100 It applies a set of analyses, some of which rely on the scalar evolution
101 analyzer (scev) developed by Sebastian Pop.
103 During the analysis phase the vectorizer records some information
104 per stmt in a "stmt_vec_info" struct which is attached to each stmt in the
105 loop, as well as general information about the loop as a whole, which is
106 recorded in a "loop_vec_info" struct attached to each loop.
108 Transformation phase:
109 =====================
110 The loop transformation phase scans all the stmts in the loop, and
111 creates a vector stmt (or a sequence of stmts) for each scalar stmt S in
112 the loop that needs to be vectorized. It inserts the vector code sequence
113 just before the scalar stmt S, and records a pointer to the vector code
114 in STMT_VINFO_VEC_STMT (stmt_info) (stmt_info is the stmt_vec_info struct
115 attached to S). This pointer will be used for the vectorization of following
116 stmts which use the def of stmt S. Stmt S is removed if it writes to memory;
117 otherwise, we rely on dead code elimination for removing it.
119 For example, say stmt S1 was vectorized into stmt VS1:
121 VS1: vb = px[i];
122 S1: b = x[i]; STMT_VINFO_VEC_STMT (stmt_info (S1)) = VS1
123 S2: a = b;
125 To vectorize stmt S2, the vectorizer first finds the stmt that defines
126 the operand 'b' (S1), and gets the relevant vector def 'vb' from the
127 vector stmt VS1 pointed to by STMT_VINFO_VEC_STMT (stmt_info (S1)). The
128 resulting sequence would be:
130 VS1: vb = px[i];
131 S1: b = x[i]; STMT_VINFO_VEC_STMT (stmt_info (S1)) = VS1
132 VS2: va = vb;
133 S2: a = b; STMT_VINFO_VEC_STMT (stmt_info (S2)) = VS2
135 Operands that are not SSA_NAMEs, are data-refs that appear in
136 load/store operations (like 'x[i]' in S1), and are handled differently.
138 Target modeling:
139 =================
140 Currently the only target specific information that is used is the
141 size of the vector (in bytes) - "TARGET_VECTORIZE_UNITS_PER_SIMD_WORD".
142 Targets that can support different sizes of vectors will, for now, need
143 to specify one value for "TARGET_VECTORIZE_UNITS_PER_SIMD_WORD". More
144 flexibility will be added in the future.
146 Since we only vectorize operations whose vector form can be
147 expressed using existing tree codes, to verify that an operation is
148 supported, the vectorizer checks the relevant optab at the relevant
149 machine_mode (e.g., optab_handler (add_optab, V8HImode)). If
150 the value found is CODE_FOR_nothing, then there's no target support, and
151 we can't vectorize the stmt.
153 For additional information on this project see:
154 http://gcc.gnu.org/projects/tree-ssa/vectorization.html
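/* Illustrative sketch (not part of the original source): the kind of target
   support check described above, assuming the usual optab query API named in
   the comment.  For an addition on a vector of eight shorts:

     machine_mode vmode = V8HImode;
     if (optab_handler (add_optab, vmode) == CODE_FOR_nothing)
       return false;     -- no target support; the stmt cannot be vectorized

   Any statement whose vector form has no such handler is rejected.  */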
157 static void vect_estimate_min_profitable_iters (loop_vec_info, int *, int *);
159 /* Function vect_determine_vectorization_factor
161 Determine the vectorization factor (VF). VF is the number of data elements
162 that are operated upon in parallel in a single iteration of the vectorized
163 loop. For example, when vectorizing a loop that operates on 4-byte elements,
164 on a target with a vector size (VS) of 16 bytes, the VF is set to 4, since 4
165 elements can fit in a single vector register.
167 We currently support vectorization of loops in which all types operated upon
168 are of the same size. Therefore this function currently sets VF according to
169 the size of the types operated upon, and fails if there are multiple sizes
170 in the loop.
172 VF is also the factor by which the loop iterations are strip-mined, e.g.:
173 original loop:
174 for (i=0; i<N; i++){
175 a[i] = b[i] + c[i];
178 vectorized loop:
179 for (i=0; i<N; i+=VF){
180 a[i:VF] = b[i:VF] + c[i:VF];
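/* A worked example (assuming a 16-byte vector size): if every statement in
   the loop operates on 2-byte 'short' elements, the vectype is a vector of
   eight shorts, so nunits = 8 and VF = 8; with 4-byte 'int' elements it
   would be 4.  The factor recorded below follows the smallest scalar type
   that appears, since that type yields the largest nunits.  */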
184 static bool
185 vect_determine_vectorization_factor (loop_vec_info loop_vinfo)
187 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
188 basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
189 int nbbs = loop->num_nodes;
190 gimple_stmt_iterator si;
191 unsigned int vectorization_factor = 0;
192 tree scalar_type;
193 gimple phi;
194 tree vectype;
195 unsigned int nunits;
196 stmt_vec_info stmt_info;
197 int i;
198 HOST_WIDE_INT dummy;
199 gimple stmt, pattern_stmt = NULL;
200 gimple_seq pattern_def_seq = NULL;
201 gimple_stmt_iterator pattern_def_si = gsi_none ();
202 bool analyze_pattern_stmt = false;
204 if (dump_enabled_p ())
205 dump_printf_loc (MSG_NOTE, vect_location,
206 "=== vect_determine_vectorization_factor ===\n");
208 for (i = 0; i < nbbs; i++)
210 basic_block bb = bbs[i];
212 for (si = gsi_start_phis (bb); !gsi_end_p (si); gsi_next (&si))
214 phi = gsi_stmt (si);
215 stmt_info = vinfo_for_stmt (phi);
216 if (dump_enabled_p ())
218 dump_printf_loc (MSG_NOTE, vect_location, "==> examining phi: ");
219 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, phi, 0);
220 dump_printf (MSG_NOTE, "\n");
223 gcc_assert (stmt_info);
225 if (STMT_VINFO_RELEVANT_P (stmt_info))
227 gcc_assert (!STMT_VINFO_VECTYPE (stmt_info));
228 scalar_type = TREE_TYPE (PHI_RESULT (phi));
230 if (dump_enabled_p ())
232 dump_printf_loc (MSG_NOTE, vect_location,
233 "get vectype for scalar type: ");
234 dump_generic_expr (MSG_NOTE, TDF_SLIM, scalar_type);
235 dump_printf (MSG_NOTE, "\n");
238 vectype = get_vectype_for_scalar_type (scalar_type);
239 if (!vectype)
241 if (dump_enabled_p ())
243 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
244 "not vectorized: unsupported "
245 "data-type ");
246 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
247 scalar_type);
248 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
250 return false;
252 STMT_VINFO_VECTYPE (stmt_info) = vectype;
254 if (dump_enabled_p ())
256 dump_printf_loc (MSG_NOTE, vect_location, "vectype: ");
257 dump_generic_expr (MSG_NOTE, TDF_SLIM, vectype);
258 dump_printf (MSG_NOTE, "\n");
261 nunits = TYPE_VECTOR_SUBPARTS (vectype);
262 if (dump_enabled_p ())
263 dump_printf_loc (MSG_NOTE, vect_location, "nunits = %d\n",
264 nunits);
266 if (!vectorization_factor
267 || (nunits > vectorization_factor))
268 vectorization_factor = nunits;
272 for (si = gsi_start_bb (bb); !gsi_end_p (si) || analyze_pattern_stmt;)
274 tree vf_vectype;
276 if (analyze_pattern_stmt)
277 stmt = pattern_stmt;
278 else
279 stmt = gsi_stmt (si);
281 stmt_info = vinfo_for_stmt (stmt);
283 if (dump_enabled_p ())
285 dump_printf_loc (MSG_NOTE, vect_location,
286 "==> examining statement: ");
287 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, stmt, 0);
288 dump_printf (MSG_NOTE, "\n");
291 gcc_assert (stmt_info);
293 /* Skip stmts which do not need to be vectorized. */
294 if ((!STMT_VINFO_RELEVANT_P (stmt_info)
295 && !STMT_VINFO_LIVE_P (stmt_info))
296 || gimple_clobber_p (stmt))
298 if (STMT_VINFO_IN_PATTERN_P (stmt_info)
299 && (pattern_stmt = STMT_VINFO_RELATED_STMT (stmt_info))
300 && (STMT_VINFO_RELEVANT_P (vinfo_for_stmt (pattern_stmt))
301 || STMT_VINFO_LIVE_P (vinfo_for_stmt (pattern_stmt))))
303 stmt = pattern_stmt;
304 stmt_info = vinfo_for_stmt (pattern_stmt);
305 if (dump_enabled_p ())
307 dump_printf_loc (MSG_NOTE, vect_location,
308 "==> examining pattern statement: ");
309 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, stmt, 0);
310 dump_printf (MSG_NOTE, "\n");
313 else
315 if (dump_enabled_p ())
316 dump_printf_loc (MSG_NOTE, vect_location, "skip.\n");
317 gsi_next (&si);
318 continue;
321 else if (STMT_VINFO_IN_PATTERN_P (stmt_info)
322 && (pattern_stmt = STMT_VINFO_RELATED_STMT (stmt_info))
323 && (STMT_VINFO_RELEVANT_P (vinfo_for_stmt (pattern_stmt))
324 || STMT_VINFO_LIVE_P (vinfo_for_stmt (pattern_stmt))))
325 analyze_pattern_stmt = true;
327 /* If a pattern statement has def stmts, analyze them too. */
328 if (is_pattern_stmt_p (stmt_info))
330 if (pattern_def_seq == NULL)
332 pattern_def_seq = STMT_VINFO_PATTERN_DEF_SEQ (stmt_info);
333 pattern_def_si = gsi_start (pattern_def_seq);
335 else if (!gsi_end_p (pattern_def_si))
336 gsi_next (&pattern_def_si);
337 if (pattern_def_seq != NULL)
339 gimple pattern_def_stmt = NULL;
340 stmt_vec_info pattern_def_stmt_info = NULL;
342 while (!gsi_end_p (pattern_def_si))
344 pattern_def_stmt = gsi_stmt (pattern_def_si);
345 pattern_def_stmt_info
346 = vinfo_for_stmt (pattern_def_stmt);
347 if (STMT_VINFO_RELEVANT_P (pattern_def_stmt_info)
348 || STMT_VINFO_LIVE_P (pattern_def_stmt_info))
349 break;
350 gsi_next (&pattern_def_si);
353 if (!gsi_end_p (pattern_def_si))
355 if (dump_enabled_p ())
357 dump_printf_loc (MSG_NOTE, vect_location,
358 "==> examining pattern def stmt: ");
359 dump_gimple_stmt (MSG_NOTE, TDF_SLIM,
360 pattern_def_stmt, 0);
361 dump_printf (MSG_NOTE, "\n");
364 stmt = pattern_def_stmt;
365 stmt_info = pattern_def_stmt_info;
367 else
369 pattern_def_si = gsi_none ();
370 analyze_pattern_stmt = false;
373 else
374 analyze_pattern_stmt = false;
377 if (gimple_get_lhs (stmt) == NULL_TREE
378 /* MASK_STORE has no lhs, but is ok. */
379 && (!is_gimple_call (stmt)
380 || !gimple_call_internal_p (stmt)
381 || gimple_call_internal_fn (stmt) != IFN_MASK_STORE))
383 if (is_gimple_call (stmt))
385 /* Ignore calls with no lhs. These must be calls to
386 #pragma omp simd functions, and the vectorization factor
387 they really need can't be determined until
388 vectorizable_simd_clone_call. */
389 if (!analyze_pattern_stmt && gsi_end_p (pattern_def_si))
391 pattern_def_seq = NULL;
392 gsi_next (&si);
394 continue;
396 if (dump_enabled_p ())
398 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
399 "not vectorized: irregular stmt.");
400 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt,
402 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
404 return false;
407 if (VECTOR_MODE_P (TYPE_MODE (gimple_expr_type (stmt))))
409 if (dump_enabled_p ())
411 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
412 "not vectorized: vector stmt in loop:");
413 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
414 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
416 return false;
419 if (STMT_VINFO_VECTYPE (stmt_info))
421 /* The only case when a vectype had been already set is for stmts
422 that contain a dataref, or for "pattern-stmts" (stmts
423 generated by the vectorizer to represent/replace a certain
424 idiom). */
425 gcc_assert (STMT_VINFO_DATA_REF (stmt_info)
426 || is_pattern_stmt_p (stmt_info)
427 || !gsi_end_p (pattern_def_si));
428 vectype = STMT_VINFO_VECTYPE (stmt_info);
430 else
432 gcc_assert (!STMT_VINFO_DATA_REF (stmt_info));
433 if (is_gimple_call (stmt)
434 && gimple_call_internal_p (stmt)
435 && gimple_call_internal_fn (stmt) == IFN_MASK_STORE)
436 scalar_type = TREE_TYPE (gimple_call_arg (stmt, 3));
437 else
438 scalar_type = TREE_TYPE (gimple_get_lhs (stmt));
439 if (dump_enabled_p ())
441 dump_printf_loc (MSG_NOTE, vect_location,
442 "get vectype for scalar type: ");
443 dump_generic_expr (MSG_NOTE, TDF_SLIM, scalar_type);
444 dump_printf (MSG_NOTE, "\n");
446 vectype = get_vectype_for_scalar_type (scalar_type);
447 if (!vectype)
449 if (dump_enabled_p ())
451 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
452 "not vectorized: unsupported "
453 "data-type ");
454 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
455 scalar_type);
456 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
458 return false;
461 STMT_VINFO_VECTYPE (stmt_info) = vectype;
463 if (dump_enabled_p ())
465 dump_printf_loc (MSG_NOTE, vect_location, "vectype: ");
466 dump_generic_expr (MSG_NOTE, TDF_SLIM, vectype);
467 dump_printf (MSG_NOTE, "\n");
471 /* The vectorization factor is according to the smallest
472 scalar type (or the largest vector size, but we only
473 support one vector size per loop). */
474 scalar_type = vect_get_smallest_scalar_type (stmt, &dummy,
475 &dummy);
476 if (dump_enabled_p ())
478 dump_printf_loc (MSG_NOTE, vect_location,
479 "get vectype for scalar type: ");
480 dump_generic_expr (MSG_NOTE, TDF_SLIM, scalar_type);
481 dump_printf (MSG_NOTE, "\n");
483 vf_vectype = get_vectype_for_scalar_type (scalar_type);
484 if (!vf_vectype)
486 if (dump_enabled_p ())
488 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
489 "not vectorized: unsupported data-type ");
490 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
491 scalar_type);
492 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
494 return false;
497 if ((GET_MODE_SIZE (TYPE_MODE (vectype))
498 != GET_MODE_SIZE (TYPE_MODE (vf_vectype))))
500 if (dump_enabled_p ())
502 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
503 "not vectorized: different sized vector "
504 "types in statement, ");
505 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
506 vectype);
507 dump_printf (MSG_MISSED_OPTIMIZATION, " and ");
508 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
509 vf_vectype);
510 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
512 return false;
515 if (dump_enabled_p ())
517 dump_printf_loc (MSG_NOTE, vect_location, "vectype: ");
518 dump_generic_expr (MSG_NOTE, TDF_SLIM, vf_vectype);
519 dump_printf (MSG_NOTE, "\n");
522 nunits = TYPE_VECTOR_SUBPARTS (vf_vectype);
523 if (dump_enabled_p ())
524 dump_printf_loc (MSG_NOTE, vect_location, "nunits = %d\n", nunits);
525 if (!vectorization_factor
526 || (nunits > vectorization_factor))
527 vectorization_factor = nunits;
529 if (!analyze_pattern_stmt && gsi_end_p (pattern_def_si))
531 pattern_def_seq = NULL;
532 gsi_next (&si);
537 /* TODO: Analyze cost. Decide if worth while to vectorize. */
538 if (dump_enabled_p ())
539 dump_printf_loc (MSG_NOTE, vect_location, "vectorization factor = %d\n",
540 vectorization_factor);
541 if (vectorization_factor <= 1)
543 if (dump_enabled_p ())
544 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
545 "not vectorized: unsupported data-type\n");
546 return false;
548 LOOP_VINFO_VECT_FACTOR (loop_vinfo) = vectorization_factor;
550 return true;
554 /* Function vect_is_simple_iv_evolution.
556 FORNOW: A simple evolution of an induction variable in the loop is
557 considered a polynomial evolution. */
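/* For illustration (hypothetical loop): scalar evolution describes a simple
   IV as a chrec of the form {init, +, step}_n.  For

     for (i = 0; i < n; i++)
       ...

   the access function of 'i' is {0, +, 1}_1 if the loop number is 1;
   evolution_part_in_loop_num then yields the step '1' and
   initial_condition_in_loop_num yields '0'.  A step that is itself a chrec
   (degree >= 2) is rejected by the check below.  */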
559 static bool
560 vect_is_simple_iv_evolution (unsigned loop_nb, tree access_fn, tree * init,
561 tree * step)
563 tree init_expr;
564 tree step_expr;
565 tree evolution_part = evolution_part_in_loop_num (access_fn, loop_nb);
566 basic_block bb;
568 /* When there is no evolution in this loop, the evolution function
569 is not "simple". */
570 if (evolution_part == NULL_TREE)
571 return false;
573 /* When the evolution is a polynomial of degree >= 2
574 the evolution function is not "simple". */
575 if (tree_is_chrec (evolution_part))
576 return false;
578 step_expr = evolution_part;
579 init_expr = unshare_expr (initial_condition_in_loop_num (access_fn, loop_nb));
581 if (dump_enabled_p ())
583 dump_printf_loc (MSG_NOTE, vect_location, "step: ");
584 dump_generic_expr (MSG_NOTE, TDF_SLIM, step_expr);
585 dump_printf (MSG_NOTE, ", init: ");
586 dump_generic_expr (MSG_NOTE, TDF_SLIM, init_expr);
587 dump_printf (MSG_NOTE, "\n");
590 *init = init_expr;
591 *step = step_expr;
593 if (TREE_CODE (step_expr) != INTEGER_CST
594 && (TREE_CODE (step_expr) != SSA_NAME
595 || ((bb = gimple_bb (SSA_NAME_DEF_STMT (step_expr)))
596 && flow_bb_inside_loop_p (get_loop (cfun, loop_nb), bb))
597 || (!INTEGRAL_TYPE_P (TREE_TYPE (step_expr))
598 && (!SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr))
599 || !flag_associative_math)))
600 && (TREE_CODE (step_expr) != REAL_CST
601 || !flag_associative_math))
603 if (dump_enabled_p ())
604 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
605 "step unknown.\n");
606 return false;
609 return true;
612 /* Function vect_analyze_scalar_cycles_1.
614 Examine the cross iteration def-use cycles of scalar variables
615 in LOOP. LOOP_VINFO represents the loop that is now being
616 considered for vectorization (can be LOOP, or an outer-loop
617 enclosing LOOP). */
619 static void
620 vect_analyze_scalar_cycles_1 (loop_vec_info loop_vinfo, struct loop *loop)
622 basic_block bb = loop->header;
623 tree init, step;
624 auto_vec<gimple, 64> worklist;
625 gimple_phi_iterator gsi;
626 bool double_reduc;
628 if (dump_enabled_p ())
629 dump_printf_loc (MSG_NOTE, vect_location,
630 "=== vect_analyze_scalar_cycles ===\n");
632 /* First - identify all inductions. Reduction detection assumes that all the
633 inductions have been identified, therefore, this order must not be
634 changed. */
635 for (gsi = gsi_start_phis (bb); !gsi_end_p (gsi); gsi_next (&gsi))
637 gimple_phi phi = gsi.phi ();
638 tree access_fn = NULL;
639 tree def = PHI_RESULT (phi);
640 stmt_vec_info stmt_vinfo = vinfo_for_stmt (phi);
642 if (dump_enabled_p ())
644 dump_printf_loc (MSG_NOTE, vect_location, "Analyze phi: ");
645 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, phi, 0);
646 dump_printf (MSG_NOTE, "\n");
649 /* Skip virtual phi's. The data dependences that are associated with
650 virtual defs/uses (i.e., memory accesses) are analyzed elsewhere. */
651 if (virtual_operand_p (def))
652 continue;
654 STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_unknown_def_type;
656 /* Analyze the evolution function. */
657 access_fn = analyze_scalar_evolution (loop, def);
658 if (access_fn)
660 STRIP_NOPS (access_fn);
661 if (dump_enabled_p ())
663 dump_printf_loc (MSG_NOTE, vect_location,
664 "Access function of PHI: ");
665 dump_generic_expr (MSG_NOTE, TDF_SLIM, access_fn);
666 dump_printf (MSG_NOTE, "\n");
668 STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_vinfo)
669 = evolution_part_in_loop_num (access_fn, loop->num);
672 if (!access_fn
673 || !vect_is_simple_iv_evolution (loop->num, access_fn, &init, &step)
674 || (LOOP_VINFO_LOOP (loop_vinfo) != loop
675 && TREE_CODE (step) != INTEGER_CST))
677 worklist.safe_push (phi);
678 continue;
681 gcc_assert (STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_vinfo) != NULL_TREE);
683 if (dump_enabled_p ())
684 dump_printf_loc (MSG_NOTE, vect_location, "Detected induction.\n");
685 STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_induction_def;
689 /* Second - identify all reductions and nested cycles. */
690 while (worklist.length () > 0)
692 gimple phi = worklist.pop ();
693 tree def = PHI_RESULT (phi);
694 stmt_vec_info stmt_vinfo = vinfo_for_stmt (phi);
695 gimple reduc_stmt;
696 bool nested_cycle;
698 if (dump_enabled_p ())
700 dump_printf_loc (MSG_NOTE, vect_location, "Analyze phi: ");
701 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, phi, 0);
702 dump_printf (MSG_NOTE, "\n");
705 gcc_assert (!virtual_operand_p (def)
706 && STMT_VINFO_DEF_TYPE (stmt_vinfo) == vect_unknown_def_type);
708 nested_cycle = (loop != LOOP_VINFO_LOOP (loop_vinfo));
709 reduc_stmt = vect_force_simple_reduction (loop_vinfo, phi, !nested_cycle,
710 &double_reduc);
711 if (reduc_stmt)
713 if (double_reduc)
715 if (dump_enabled_p ())
716 dump_printf_loc (MSG_NOTE, vect_location,
717 "Detected double reduction.\n");
719 STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_double_reduction_def;
720 STMT_VINFO_DEF_TYPE (vinfo_for_stmt (reduc_stmt)) =
721 vect_double_reduction_def;
723 else
725 if (nested_cycle)
727 if (dump_enabled_p ())
728 dump_printf_loc (MSG_NOTE, vect_location,
729 "Detected vectorizable nested cycle.\n");
731 STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_nested_cycle;
732 STMT_VINFO_DEF_TYPE (vinfo_for_stmt (reduc_stmt)) =
733 vect_nested_cycle;
735 else
737 if (dump_enabled_p ())
738 dump_printf_loc (MSG_NOTE, vect_location,
739 "Detected reduction.\n");
741 STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_reduction_def;
742 STMT_VINFO_DEF_TYPE (vinfo_for_stmt (reduc_stmt)) =
743 vect_reduction_def;
744 /* Store the reduction cycles for possible vectorization in
745 loop-aware SLP. */
746 LOOP_VINFO_REDUCTIONS (loop_vinfo).safe_push (reduc_stmt);
750 else
751 if (dump_enabled_p ())
752 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
753 "Unknown def-use cycle pattern.\n");
758 /* Function vect_analyze_scalar_cycles.
760 Examine the cross iteration def-use cycles of scalar variables, by
761 analyzing the loop-header PHIs of scalar variables. Classify each
762 cycle as one of the following: invariant, induction, reduction, unknown.
763 We do that for the loop represented by LOOP_VINFO, and also for its
764 inner-loop, if it exists.
765 Examples for scalar cycles:
767 Example1: reduction:
769 loop1:
770 for (i=0; i<N; i++)
771 sum += a[i];
773 Example2: induction:
775 loop2:
776 for (i=0; i<N; i++)
777 a[i] = i; */
779 static void
780 vect_analyze_scalar_cycles (loop_vec_info loop_vinfo)
782 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
784 vect_analyze_scalar_cycles_1 (loop_vinfo, loop);
786 /* When vectorizing an outer-loop, the inner-loop is executed sequentially.
787 Reductions in such inner-loop therefore have different properties than
788 the reductions in the nest that gets vectorized:
789 1. When vectorized, they are executed in the same order as in the original
790 scalar loop, so we can't change the order of computation when
791 vectorizing them.
792 2. FIXME: Inner-loop reductions can be used in the inner-loop, so the
793 current checks are too strict. */
795 if (loop->inner)
796 vect_analyze_scalar_cycles_1 (loop_vinfo, loop->inner);
800 /* Function vect_get_loop_niters.
802 Determine how many iterations the loop executes and place it
803 in NUMBER_OF_ITERATIONS. Place the number of latch iterations
804 in NUMBER_OF_ITERATIONSM1.
806 Return the loop exit condition. */
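/* For example, for 'for (i = 0; i < n; i++)' with n > 0 the latch executes
   n - 1 times, so NUMBER_OF_ITERATIONSM1 is n - 1 and NUMBER_OF_ITERATIONS
   (the number of header executions) is n.  As the ??? note below says, the
   "+ 1" wraps to zero when the latch executes UINT_MAX times.  */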
809 static gimple_cond
810 vect_get_loop_niters (struct loop *loop, tree *number_of_iterations,
811 tree *number_of_iterationsm1)
813 tree niters;
815 if (dump_enabled_p ())
816 dump_printf_loc (MSG_NOTE, vect_location,
817 "=== get_loop_niters ===\n");
819 niters = number_of_latch_executions (loop);
820 *number_of_iterationsm1 = niters;
822 /* We want the number of loop header executions which is the number
823 of latch executions plus one.
824 ??? For UINT_MAX latch executions this number overflows to zero
825 for loops like do { n++; } while (n != 0); */
826 if (niters && !chrec_contains_undetermined (niters))
827 niters = fold_build2 (PLUS_EXPR, TREE_TYPE (niters), unshare_expr (niters),
828 build_int_cst (TREE_TYPE (niters), 1));
829 *number_of_iterations = niters;
831 return get_loop_exit_condition (loop);
835 /* Function bb_in_loop_p
837 Used as predicate for dfs order traversal of the loop bbs. */
839 static bool
840 bb_in_loop_p (const_basic_block bb, const void *data)
842 const struct loop *const loop = (const struct loop *)data;
843 if (flow_bb_inside_loop_p (loop, bb))
844 return true;
845 return false;
849 /* Function new_loop_vec_info.
851 Create and initialize a new loop_vec_info struct for LOOP, as well as
852 stmt_vec_info structs for all the stmts in LOOP. */
854 static loop_vec_info
855 new_loop_vec_info (struct loop *loop)
857 loop_vec_info res;
858 basic_block *bbs;
859 gimple_stmt_iterator si;
860 unsigned int i, nbbs;
862 res = (loop_vec_info) xcalloc (1, sizeof (struct _loop_vec_info));
863 LOOP_VINFO_LOOP (res) = loop;
865 bbs = get_loop_body (loop);
867 /* Create/Update stmt_info for all stmts in the loop. */
868 for (i = 0; i < loop->num_nodes; i++)
870 basic_block bb = bbs[i];
872 /* BBs in a nested inner-loop will have been already processed (because
873 we will have called vect_analyze_loop_form for any nested inner-loop).
874 Therefore, for stmts in an inner-loop we just want to update the
875 STMT_VINFO_LOOP_VINFO field of their stmt_info to point to the new
876 loop_info of the outer-loop we are currently considering to vectorize
877 (instead of the loop_info of the inner-loop).
878 For stmts in other BBs we need to create a stmt_info from scratch. */
879 if (bb->loop_father != loop)
881 /* Inner-loop bb. */
882 gcc_assert (loop->inner && bb->loop_father == loop->inner);
883 for (si = gsi_start_phis (bb); !gsi_end_p (si); gsi_next (&si))
885 gimple phi = gsi_stmt (si);
886 stmt_vec_info stmt_info = vinfo_for_stmt (phi);
887 loop_vec_info inner_loop_vinfo =
888 STMT_VINFO_LOOP_VINFO (stmt_info);
889 gcc_assert (loop->inner == LOOP_VINFO_LOOP (inner_loop_vinfo));
890 STMT_VINFO_LOOP_VINFO (stmt_info) = res;
892 for (si = gsi_start_bb (bb); !gsi_end_p (si); gsi_next (&si))
894 gimple stmt = gsi_stmt (si);
895 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
896 loop_vec_info inner_loop_vinfo =
897 STMT_VINFO_LOOP_VINFO (stmt_info);
898 gcc_assert (loop->inner == LOOP_VINFO_LOOP (inner_loop_vinfo));
899 STMT_VINFO_LOOP_VINFO (stmt_info) = res;
902 else
904 /* bb in current nest. */
905 for (si = gsi_start_phis (bb); !gsi_end_p (si); gsi_next (&si))
907 gimple phi = gsi_stmt (si);
908 gimple_set_uid (phi, 0);
909 set_vinfo_for_stmt (phi, new_stmt_vec_info (phi, res, NULL));
912 for (si = gsi_start_bb (bb); !gsi_end_p (si); gsi_next (&si))
914 gimple stmt = gsi_stmt (si);
915 gimple_set_uid (stmt, 0);
916 set_vinfo_for_stmt (stmt, new_stmt_vec_info (stmt, res, NULL));
921 /* CHECKME: We want to visit all BBs before their successors (except for
922 latch blocks, for which this assertion wouldn't hold). In the simple
923 case of the loop forms we allow, a dfs order of the BBs would be the same
924 as reversed postorder traversal, so we are safe. */
926 free (bbs);
927 bbs = XCNEWVEC (basic_block, loop->num_nodes);
928 nbbs = dfs_enumerate_from (loop->header, 0, bb_in_loop_p,
929 bbs, loop->num_nodes, loop);
930 gcc_assert (nbbs == loop->num_nodes);
932 LOOP_VINFO_BBS (res) = bbs;
933 LOOP_VINFO_NITERSM1 (res) = NULL;
934 LOOP_VINFO_NITERS (res) = NULL;
935 LOOP_VINFO_NITERS_UNCHANGED (res) = NULL;
936 LOOP_VINFO_COST_MODEL_MIN_ITERS (res) = 0;
937 LOOP_VINFO_COST_MODEL_THRESHOLD (res) = 0;
938 LOOP_VINFO_VECTORIZABLE_P (res) = 0;
939 LOOP_VINFO_PEELING_FOR_ALIGNMENT (res) = 0;
940 LOOP_VINFO_VECT_FACTOR (res) = 0;
941 LOOP_VINFO_LOOP_NEST (res).create (3);
942 LOOP_VINFO_DATAREFS (res).create (10);
943 LOOP_VINFO_DDRS (res).create (10 * 10);
944 LOOP_VINFO_UNALIGNED_DR (res) = NULL;
945 LOOP_VINFO_MAY_MISALIGN_STMTS (res).create (
946 PARAM_VALUE (PARAM_VECT_MAX_VERSION_FOR_ALIGNMENT_CHECKS));
947 LOOP_VINFO_MAY_ALIAS_DDRS (res).create (
948 PARAM_VALUE (PARAM_VECT_MAX_VERSION_FOR_ALIAS_CHECKS));
949 LOOP_VINFO_GROUPED_STORES (res).create (10);
950 LOOP_VINFO_REDUCTIONS (res).create (10);
951 LOOP_VINFO_REDUCTION_CHAINS (res).create (10);
952 LOOP_VINFO_SLP_INSTANCES (res).create (10);
953 LOOP_VINFO_SLP_UNROLLING_FACTOR (res) = 1;
954 LOOP_VINFO_TARGET_COST_DATA (res) = init_cost (loop);
955 LOOP_VINFO_PEELING_FOR_GAPS (res) = false;
956 LOOP_VINFO_PEELING_FOR_NITER (res) = false;
957 LOOP_VINFO_OPERANDS_SWAPPED (res) = false;
959 return res;
963 /* Function destroy_loop_vec_info.
965 Free LOOP_VINFO struct, as well as all the stmt_vec_info structs of all the
966 stmts in the loop. */
968 void
969 destroy_loop_vec_info (loop_vec_info loop_vinfo, bool clean_stmts)
971 struct loop *loop;
972 basic_block *bbs;
973 int nbbs;
974 gimple_stmt_iterator si;
975 int j;
976 vec<slp_instance> slp_instances;
977 slp_instance instance;
978 bool swapped;
980 if (!loop_vinfo)
981 return;
983 loop = LOOP_VINFO_LOOP (loop_vinfo);
985 bbs = LOOP_VINFO_BBS (loop_vinfo);
986 nbbs = clean_stmts ? loop->num_nodes : 0;
987 swapped = LOOP_VINFO_OPERANDS_SWAPPED (loop_vinfo);
989 for (j = 0; j < nbbs; j++)
991 basic_block bb = bbs[j];
992 for (si = gsi_start_phis (bb); !gsi_end_p (si); gsi_next (&si))
993 free_stmt_vec_info (gsi_stmt (si));
995 for (si = gsi_start_bb (bb); !gsi_end_p (si); )
997 gimple stmt = gsi_stmt (si);
999 /* We may have broken canonical form by moving a constant
1000 into RHS1 of a commutative op. Fix such occurrences. */
1001 if (swapped && is_gimple_assign (stmt))
1003 enum tree_code code = gimple_assign_rhs_code (stmt);
1005 if ((code == PLUS_EXPR
1006 || code == POINTER_PLUS_EXPR
1007 || code == MULT_EXPR)
1008 && CONSTANT_CLASS_P (gimple_assign_rhs1 (stmt)))
1009 swap_ssa_operands (stmt,
1010 gimple_assign_rhs1_ptr (stmt),
1011 gimple_assign_rhs2_ptr (stmt));
1014 /* Free stmt_vec_info. */
1015 free_stmt_vec_info (stmt);
1016 gsi_next (&si);
1020 free (LOOP_VINFO_BBS (loop_vinfo));
1021 vect_destroy_datarefs (loop_vinfo, NULL);
1022 free_dependence_relations (LOOP_VINFO_DDRS (loop_vinfo));
1023 LOOP_VINFO_LOOP_NEST (loop_vinfo).release ();
1024 LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).release ();
1025 LOOP_VINFO_MAY_ALIAS_DDRS (loop_vinfo).release ();
1026 slp_instances = LOOP_VINFO_SLP_INSTANCES (loop_vinfo);
1027 FOR_EACH_VEC_ELT (slp_instances, j, instance)
1028 vect_free_slp_instance (instance);
1030 LOOP_VINFO_SLP_INSTANCES (loop_vinfo).release ();
1031 LOOP_VINFO_GROUPED_STORES (loop_vinfo).release ();
1032 LOOP_VINFO_REDUCTIONS (loop_vinfo).release ();
1033 LOOP_VINFO_REDUCTION_CHAINS (loop_vinfo).release ();
1035 delete LOOP_VINFO_PEELING_HTAB (loop_vinfo);
1036 LOOP_VINFO_PEELING_HTAB (loop_vinfo) = NULL;
1038 destroy_cost_data (LOOP_VINFO_TARGET_COST_DATA (loop_vinfo));
1040 free (loop_vinfo);
1041 loop->aux = NULL;
1045 /* Function vect_analyze_loop_1.
1047 Apply a set of analyses on LOOP, and create a loop_vec_info struct
1048 for it. The different analyses will record information in the
1049 loop_vec_info struct. This is a subset of the analyses applied in
1050 vect_analyze_loop, to be applied on an inner-loop nested in the loop
1051 that is now considered for (outer-loop) vectorization. */
1053 static loop_vec_info
1054 vect_analyze_loop_1 (struct loop *loop)
1056 loop_vec_info loop_vinfo;
1058 if (dump_enabled_p ())
1059 dump_printf_loc (MSG_NOTE, vect_location,
1060 "===== analyze_loop_nest_1 =====\n");
1062 /* Check the CFG characteristics of the loop (nesting, entry/exit, etc.). */
1064 loop_vinfo = vect_analyze_loop_form (loop);
1065 if (!loop_vinfo)
1067 if (dump_enabled_p ())
1068 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1069 "bad inner-loop form.\n");
1070 return NULL;
1073 return loop_vinfo;
1077 /* Function vect_analyze_loop_form.
1079 Verify that certain CFG restrictions hold, including:
1080 - the loop has a pre-header
1081 - the loop has a single entry and exit
1082 - the loop exit condition is simple enough, and the number of iterations
1083 can be analyzed (a countable loop). */
1085 loop_vec_info
1086 vect_analyze_loop_form (struct loop *loop)
1088 loop_vec_info loop_vinfo;
1089 gimple_cond loop_cond;
1090 tree number_of_iterations = NULL, number_of_iterationsm1 = NULL;
1091 loop_vec_info inner_loop_vinfo = NULL;
1093 if (dump_enabled_p ())
1094 dump_printf_loc (MSG_NOTE, vect_location,
1095 "=== vect_analyze_loop_form ===\n");
1097 /* Different restrictions apply when we are considering an inner-most loop,
1098 vs. an outer (nested) loop.
1099 (FORNOW. May want to relax some of these restrictions in the future). */
1101 if (!loop->inner)
1103 /* Inner-most loop. We currently require that the number of BBs is
1104 exactly 2 (the header and latch). Vectorizable inner-most loops
1105 look like this:
1107 (pre-header)
1109 header <--------+
1110 | | |
1111 | +--> latch --+
1113 (exit-bb) */
1115 if (loop->num_nodes != 2)
1117 if (dump_enabled_p ())
1118 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1119 "not vectorized: control flow in loop.\n");
1120 return NULL;
1123 if (empty_block_p (loop->header))
1125 if (dump_enabled_p ())
1126 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1127 "not vectorized: empty loop.\n");
1128 return NULL;
1131 else
1133 struct loop *innerloop = loop->inner;
1134 edge entryedge;
1136 /* Nested loop. We currently require that the loop is doubly-nested,
1137 contains a single inner loop, and the number of BBs is exactly 5.
1138 Vectorizable outer-loops look like this:
1140 (pre-header)
1142 header <---+
1144 inner-loop |
1146 tail ------+
1148 (exit-bb)
1150 The inner-loop has the properties expected of inner-most loops
1151 as described above. */
1153 if ((loop->inner)->inner || (loop->inner)->next)
1155 if (dump_enabled_p ())
1156 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1157 "not vectorized: multiple nested loops.\n");
1158 return NULL;
1161 /* Analyze the inner-loop. */
1162 inner_loop_vinfo = vect_analyze_loop_1 (loop->inner);
1163 if (!inner_loop_vinfo)
1165 if (dump_enabled_p ())
1166 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1167 "not vectorized: Bad inner loop.\n");
1168 return NULL;
1171 if (!expr_invariant_in_loop_p (loop,
1172 LOOP_VINFO_NITERS (inner_loop_vinfo)))
1174 if (dump_enabled_p ())
1175 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1176 "not vectorized: inner-loop count not"
1177 " invariant.\n");
1178 destroy_loop_vec_info (inner_loop_vinfo, true);
1179 return NULL;
1182 if (loop->num_nodes != 5)
1184 if (dump_enabled_p ())
1185 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1186 "not vectorized: control flow in loop.\n");
1187 destroy_loop_vec_info (inner_loop_vinfo, true);
1188 return NULL;
1191 gcc_assert (EDGE_COUNT (innerloop->header->preds) == 2);
1192 entryedge = EDGE_PRED (innerloop->header, 0);
1193 if (EDGE_PRED (innerloop->header, 0)->src == innerloop->latch)
1194 entryedge = EDGE_PRED (innerloop->header, 1);
1196 if (entryedge->src != loop->header
1197 || !single_exit (innerloop)
1198 || single_exit (innerloop)->dest != EDGE_PRED (loop->latch, 0)->src)
1200 if (dump_enabled_p ())
1201 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1202 "not vectorized: unsupported outerloop form.\n");
1203 destroy_loop_vec_info (inner_loop_vinfo, true);
1204 return NULL;
1207 if (dump_enabled_p ())
1208 dump_printf_loc (MSG_NOTE, vect_location,
1209 "Considering outer-loop vectorization.\n");
1212 if (!single_exit (loop)
1213 || EDGE_COUNT (loop->header->preds) != 2)
1215 if (dump_enabled_p ())
1217 if (!single_exit (loop))
1218 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1219 "not vectorized: multiple exits.\n");
1220 else if (EDGE_COUNT (loop->header->preds) != 2)
1221 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1222 "not vectorized: too many incoming edges.\n");
1224 if (inner_loop_vinfo)
1225 destroy_loop_vec_info (inner_loop_vinfo, true);
1226 return NULL;
1229 /* We assume that the loop exit condition is at the end of the loop, i.e.,
1230 that the loop is represented as a do-while (with a proper if-guard
1231 before the loop if needed), where the loop header contains all the
1232 executable statements, and the latch is empty. */
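/* A sketch of the expected shape (illustrative only):

     if (n > 0)                -- guard placed before the loop if needed
       do
         {
           ...body...;         -- all executable stmts sit in the header
           i++;
         }
       while (i < n);          -- exit test at the end; the latch stays empty

   Loops not in this form are rejected just below.  */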
1233 if (!empty_block_p (loop->latch)
1234 || !gimple_seq_empty_p (phi_nodes (loop->latch)))
1236 if (dump_enabled_p ())
1237 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1238 "not vectorized: latch block not empty.\n");
1239 if (inner_loop_vinfo)
1240 destroy_loop_vec_info (inner_loop_vinfo, true);
1241 return NULL;
1244 /* Make sure there exists a single-predecessor exit bb: */
1245 if (!single_pred_p (single_exit (loop)->dest))
1247 edge e = single_exit (loop);
1248 if (!(e->flags & EDGE_ABNORMAL))
1250 split_loop_exit_edge (e);
1251 if (dump_enabled_p ())
1252 dump_printf (MSG_NOTE, "split exit edge.\n");
1254 else
1256 if (dump_enabled_p ())
1257 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1258 "not vectorized: abnormal loop exit edge.\n");
1259 if (inner_loop_vinfo)
1260 destroy_loop_vec_info (inner_loop_vinfo, true);
1261 return NULL;
1265 loop_cond = vect_get_loop_niters (loop, &number_of_iterations,
1266 &number_of_iterationsm1);
1267 if (!loop_cond)
1269 if (dump_enabled_p ())
1270 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1271 "not vectorized: complicated exit condition.\n");
1272 if (inner_loop_vinfo)
1273 destroy_loop_vec_info (inner_loop_vinfo, true);
1274 return NULL;
1277 if (!number_of_iterations
1278 || chrec_contains_undetermined (number_of_iterations))
1280 if (dump_enabled_p ())
1281 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1282 "not vectorized: number of iterations cannot be "
1283 "computed.\n");
1284 if (inner_loop_vinfo)
1285 destroy_loop_vec_info (inner_loop_vinfo, true);
1286 return NULL;
1289 if (integer_zerop (number_of_iterations))
1291 if (dump_enabled_p ())
1292 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1293 "not vectorized: number of iterations = 0.\n");
1294 if (inner_loop_vinfo)
1295 destroy_loop_vec_info (inner_loop_vinfo, true);
1296 return NULL;
1299 loop_vinfo = new_loop_vec_info (loop);
1300 LOOP_VINFO_NITERSM1 (loop_vinfo) = number_of_iterationsm1;
1301 LOOP_VINFO_NITERS (loop_vinfo) = number_of_iterations;
1302 LOOP_VINFO_NITERS_UNCHANGED (loop_vinfo) = number_of_iterations;
1304 if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
1306 if (dump_enabled_p ())
1308 dump_printf_loc (MSG_NOTE, vect_location,
1309 "Symbolic number of iterations is ");
1310 dump_generic_expr (MSG_NOTE, TDF_DETAILS, number_of_iterations);
1311 dump_printf (MSG_NOTE, "\n");
1315 STMT_VINFO_TYPE (vinfo_for_stmt (loop_cond)) = loop_exit_ctrl_vec_info_type;
1317 /* CHECKME: May want to keep it around in the future. */
1318 if (inner_loop_vinfo)
1319 destroy_loop_vec_info (inner_loop_vinfo, false);
1321 gcc_assert (!loop->aux);
1322 loop->aux = loop_vinfo;
1323 return loop_vinfo;
1327 /* Function vect_analyze_loop_operations.
1329 Scan the loop stmts and make sure they are all vectorizable. */
1331 static bool
1332 vect_analyze_loop_operations (loop_vec_info loop_vinfo, bool slp)
1334 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1335 basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
1336 int nbbs = loop->num_nodes;
1337 gimple_stmt_iterator si;
1338 unsigned int vectorization_factor = 0;
1339 int i;
1340 gimple phi;
1341 stmt_vec_info stmt_info;
1342 bool need_to_vectorize = false;
1343 int min_profitable_iters;
1344 int min_scalar_loop_bound;
1345 unsigned int th;
1346 bool only_slp_in_loop = true, ok;
1347 HOST_WIDE_INT max_niter;
1348 HOST_WIDE_INT estimated_niter;
1349 int min_profitable_estimate;
1351 if (dump_enabled_p ())
1352 dump_printf_loc (MSG_NOTE, vect_location,
1353 "=== vect_analyze_loop_operations ===\n");
1355 gcc_assert (LOOP_VINFO_VECT_FACTOR (loop_vinfo));
1356 vectorization_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
1357 if (slp)
1359 /* If all the stmts in the loop can be SLPed, we perform only SLP, and
1360 vectorization factor of the loop is the unrolling factor required by
1361 the SLP instances. If that unrolling factor is 1, we say that we
1362 perform pure SLP on the loop; cross-iteration parallelism is not
1363 exploited.
1364 for (i = 0; i < nbbs; i++)
1366 basic_block bb = bbs[i];
1367 for (si = gsi_start_bb (bb); !gsi_end_p (si); gsi_next (&si))
1369 gimple stmt = gsi_stmt (si);
1370 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
1371 gcc_assert (stmt_info);
1372 if ((STMT_VINFO_RELEVANT_P (stmt_info)
1373 || VECTORIZABLE_CYCLE_DEF (STMT_VINFO_DEF_TYPE (stmt_info)))
1374 && !PURE_SLP_STMT (stmt_info))
1375 /* STMT needs both SLP and loop-based vectorization. */
1376 only_slp_in_loop = false;
1380 if (only_slp_in_loop)
1381 vectorization_factor = LOOP_VINFO_SLP_UNROLLING_FACTOR (loop_vinfo);
1382 else
1383 vectorization_factor = least_common_multiple (vectorization_factor,
1384 LOOP_VINFO_SLP_UNROLLING_FACTOR (loop_vinfo));
1386 LOOP_VINFO_VECT_FACTOR (loop_vinfo) = vectorization_factor;
1387 if (dump_enabled_p ())
1388 dump_printf_loc (MSG_NOTE, vect_location,
1389 "Updating vectorization factor to %d\n",
1390 vectorization_factor);
1393 for (i = 0; i < nbbs; i++)
1395 basic_block bb = bbs[i];
1397 for (si = gsi_start_phis (bb); !gsi_end_p (si); gsi_next (&si))
1399 phi = gsi_stmt (si);
1400 ok = true;
1402 stmt_info = vinfo_for_stmt (phi);
1403 if (dump_enabled_p ())
1405 dump_printf_loc (MSG_NOTE, vect_location, "examining phi: ");
1406 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, phi, 0);
1407 dump_printf (MSG_NOTE, "\n");
1410 /* Inner-loop loop-closed exit phi in outer-loop vectorization
1411 (i.e., a phi in the tail of the outer-loop). */
1412 if (! is_loop_header_bb_p (bb))
1414 /* FORNOW: we currently don't support the case that these phis
1415 are not used in the outer loop (unless it is double reduction,
1416 i.e., this phi is vect_reduction_def), because this case
1417 requires us to actually do something here. */
1418 if ((!STMT_VINFO_RELEVANT_P (stmt_info)
1419 || STMT_VINFO_LIVE_P (stmt_info))
1420 && STMT_VINFO_DEF_TYPE (stmt_info)
1421 != vect_double_reduction_def)
1423 if (dump_enabled_p ())
1424 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1425 "Unsupported loop-closed phi in "
1426 "outer-loop.\n");
1427 return false;
1430 /* If PHI is used in the outer loop, we check that its operand
1431 is defined in the inner loop. */
1432 if (STMT_VINFO_RELEVANT_P (stmt_info))
1434 tree phi_op;
1435 gimple op_def_stmt;
1437 if (gimple_phi_num_args (phi) != 1)
1438 return false;
1440 phi_op = PHI_ARG_DEF (phi, 0);
1441 if (TREE_CODE (phi_op) != SSA_NAME)
1442 return false;
1444 op_def_stmt = SSA_NAME_DEF_STMT (phi_op);
1445 if (gimple_nop_p (op_def_stmt)
1446 || !flow_bb_inside_loop_p (loop, gimple_bb (op_def_stmt))
1447 || !vinfo_for_stmt (op_def_stmt))
1448 return false;
1450 if (STMT_VINFO_RELEVANT (vinfo_for_stmt (op_def_stmt))
1451 != vect_used_in_outer
1452 && STMT_VINFO_RELEVANT (vinfo_for_stmt (op_def_stmt))
1453 != vect_used_in_outer_by_reduction)
1454 return false;
1457 continue;
1460 gcc_assert (stmt_info);
1462 if (STMT_VINFO_LIVE_P (stmt_info))
1464 /* FORNOW: not yet supported. */
1465 if (dump_enabled_p ())
1466 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1467 "not vectorized: value used after loop.\n");
1468 return false;
1471 if (STMT_VINFO_RELEVANT (stmt_info) == vect_used_in_scope
1472 && STMT_VINFO_DEF_TYPE (stmt_info) != vect_induction_def)
1474 /* A scalar-dependence cycle that we don't support. */
1475 if (dump_enabled_p ())
1476 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1477 "not vectorized: scalar dependence cycle.\n");
1478 return false;
1481 if (STMT_VINFO_RELEVANT_P (stmt_info))
1483 need_to_vectorize = true;
1484 if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_induction_def)
1485 ok = vectorizable_induction (phi, NULL, NULL);
1488 if (!ok)
1490 if (dump_enabled_p ())
1492 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1493 "not vectorized: relevant phi not "
1494 "supported: ");
1495 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, phi, 0);
1496 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
1498 return false;
1502 for (si = gsi_start_bb (bb); !gsi_end_p (si); gsi_next (&si))
1504 gimple stmt = gsi_stmt (si);
1505 if (!gimple_clobber_p (stmt)
1506 && !vect_analyze_stmt (stmt, &need_to_vectorize, NULL))
1507 return false;
1509 } /* bbs */
1511 /* All operations in the loop are either irrelevant (deal with loop
1512 control, or dead), or only used outside the loop and can be moved
1513 out of the loop (e.g. invariants, inductions). The loop can be
1514 optimized away by scalar optimizations. We're better off not
1515 touching this loop. */
1516 if (!need_to_vectorize)
1518 if (dump_enabled_p ())
1519 dump_printf_loc (MSG_NOTE, vect_location,
1520 "All the computation can be taken out of the loop.\n");
1521 if (dump_enabled_p ())
1522 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1523 "not vectorized: redundant loop. no profit to "
1524 "vectorize.\n");
1525 return false;
1528 if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo) && dump_enabled_p ())
1529 dump_printf_loc (MSG_NOTE, vect_location,
1530 "vectorization_factor = %d, niters = "
1531 HOST_WIDE_INT_PRINT_DEC "\n", vectorization_factor,
1532 LOOP_VINFO_INT_NITERS (loop_vinfo));
1534 if ((LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
1535 && (LOOP_VINFO_INT_NITERS (loop_vinfo) < vectorization_factor))
1536 || ((max_niter = max_stmt_executions_int (loop)) != -1
1537 && (unsigned HOST_WIDE_INT) max_niter < vectorization_factor))
1539 if (dump_enabled_p ())
1540 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1541 "not vectorized: iteration count too small.\n");
1542 if (dump_enabled_p ())
1543 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1544 "not vectorized: iteration count smaller than "
1545 "vectorization factor.\n");
1546 return false;
1549 /* Analyze cost. Decide if worth while to vectorize. */
1551 /* Once VF is set, SLP costs should be updated since the number of created
1552 vector stmts depends on VF. */
1553 vect_update_slp_costs_according_to_vf (loop_vinfo);
1555 vect_estimate_min_profitable_iters (loop_vinfo, &min_profitable_iters,
1556 &min_profitable_estimate);
1557 LOOP_VINFO_COST_MODEL_MIN_ITERS (loop_vinfo) = min_profitable_iters;
1559 if (min_profitable_iters < 0)
1561 if (dump_enabled_p ())
1562 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1563 "not vectorized: vectorization not profitable.\n");
1564 if (dump_enabled_p ())
1565 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1566 "not vectorized: vector version will never be "
1567 "profitable.\n");
1568 return false;
1571 min_scalar_loop_bound = ((PARAM_VALUE (PARAM_MIN_VECT_LOOP_BOUND)
1572 * vectorization_factor) - 1);
1575 /* Use the cost model only if it is more conservative than the user-specified
1576 threshold. */
1578 th = (unsigned) min_scalar_loop_bound;
1579 if (min_profitable_iters
1580 && (!min_scalar_loop_bound
1581 || min_profitable_iters > min_scalar_loop_bound))
1582 th = (unsigned) min_profitable_iters;
1584 LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo) = th;
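/* Worked example with hypothetical numbers: with --param min-vect-loop-bound=2
   and VF = 4, min_scalar_loop_bound is 2 * 4 - 1 = 7; if the cost model
   returned min_profitable_iters = 10, the more conservative value wins, th
   becomes 10, and a known iteration count of 10 or fewer is rejected below.  */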
1586 if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
1587 && LOOP_VINFO_INT_NITERS (loop_vinfo) <= th)
1589 if (dump_enabled_p ())
1590 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1591 "not vectorized: vectorization not profitable.\n");
1592 if (dump_enabled_p ())
1593 dump_printf_loc (MSG_NOTE, vect_location,
1594 "not vectorized: iteration count smaller than user "
1595 "specified loop bound parameter or minimum profitable "
1596 "iterations (whichever is more conservative).\n");
1597 return false;
1600 if ((estimated_niter = estimated_stmt_executions_int (loop)) != -1
1601 && ((unsigned HOST_WIDE_INT) estimated_niter
1602 <= MAX (th, (unsigned)min_profitable_estimate)))
1604 if (dump_enabled_p ())
1605 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1606 "not vectorized: estimated iteration count too "
1607 "small.\n");
1608 if (dump_enabled_p ())
1609 dump_printf_loc (MSG_NOTE, vect_location,
1610 "not vectorized: estimated iteration count smaller "
1611 "than specified loop bound parameter or minimum "
1612 "profitable iterations (whichever is more "
1613 "conservative).\n");
1614 return false;
1617 return true;
1621 /* Function vect_analyze_loop_2.
1623 Apply a set of analyses on LOOP, and create a loop_vec_info struct
1624 for it. The different analyses will record information in the
1625 loop_vec_info struct. */
1626 static bool
1627 vect_analyze_loop_2 (loop_vec_info loop_vinfo)
1629 bool ok, slp = false;
1630 int max_vf = MAX_VECTORIZATION_FACTOR;
1631 int min_vf = 2;
1632 unsigned int th;
1633 unsigned int n_stmts = 0;
1635 /* Find all data references in the loop (which correspond to vdefs/vuses)
1636 and analyze their evolution in the loop. Also adjust the minimal
1637 vectorization factor according to the loads and stores.
1639 FORNOW: Handle only simple array references whose
1640 alignment can be forced, and aligned pointer references. */
1642 ok = vect_analyze_data_refs (loop_vinfo, NULL, &min_vf, &n_stmts);
1643 if (!ok)
1645 if (dump_enabled_p ())
1646 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1647 "bad data references.\n");
1648 return false;
1651 /* Analyze the access patterns of the data-refs in the loop (consecutive,
1652 complex, etc.). FORNOW: Only handle consecutive access pattern. */
1654 ok = vect_analyze_data_ref_accesses (loop_vinfo, NULL);
1655 if (!ok)
1657 if (dump_enabled_p ())
1658 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1659 "bad data access.\n");
1660 return false;
1663 /* Classify all cross-iteration scalar data-flow cycles.
1664 Cross-iteration cycles caused by virtual phis are analyzed separately. */
1666 vect_analyze_scalar_cycles (loop_vinfo);
1668 vect_pattern_recog (loop_vinfo, NULL);
1670 /* Data-flow analysis to detect stmts that do not need to be vectorized. */
1672 ok = vect_mark_stmts_to_be_vectorized (loop_vinfo);
1673 if (!ok)
1675 if (dump_enabled_p ())
1676 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1677 "unexpected pattern.\n");
1678 return false;
1681 /* Analyze data dependences between the data-refs in the loop
1682 and adjust the maximum vectorization factor according to
1683 the dependences.
1684 FORNOW: fail at the first data dependence that we encounter. */
1686 ok = vect_analyze_data_ref_dependences (loop_vinfo, &max_vf);
1687 if (!ok
1688 || max_vf < min_vf)
1690 if (dump_enabled_p ())
1691 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1692 "bad data dependence.\n");
1693 return false;
1696 ok = vect_determine_vectorization_factor (loop_vinfo);
1697 if (!ok)
1699 if (dump_enabled_p ())
1700 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1701 "can't determine vectorization factor.\n");
1702 return false;
1704 if (max_vf < LOOP_VINFO_VECT_FACTOR (loop_vinfo))
1706 if (dump_enabled_p ())
1707 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1708 "bad data dependence.\n");
1709 return false;
1712 /* Analyze the alignment of the data-refs in the loop.
1713 Fail if a data reference is found that cannot be vectorized. */
1715 ok = vect_analyze_data_refs_alignment (loop_vinfo, NULL);
1716 if (!ok)
1718 if (dump_enabled_p ())
1719 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1720 "bad data alignment.\n");
1721 return false;
1724 /* Prune the list of ddrs to be tested at run-time by versioning for alias.
1725 It is important to call pruning after vect_analyze_data_ref_accesses,
1726 since we use grouping information gathered by interleaving analysis. */
1727 ok = vect_prune_runtime_alias_test_list (loop_vinfo);
1728 if (!ok)
1730 if (dump_enabled_p ())
1731 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1732 "number of versioning for alias "
1733 "run-time tests exceeds %d "
1734 "(--param vect-max-version-for-alias-checks)\n",
1735 PARAM_VALUE (PARAM_VECT_MAX_VERSION_FOR_ALIAS_CHECKS));
1736 return false;
1739 /* This pass will decide on using loop versioning and/or loop peeling in
1740 order to enhance the alignment of data references in the loop. */
1742 ok = vect_enhance_data_refs_alignment (loop_vinfo);
1743 if (!ok)
1745 if (dump_enabled_p ())
1746 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1747 "bad data alignment.\n");
1748 return false;
1751 /* Check the SLP opportunities in the loop, analyze and build SLP trees. */
1752 ok = vect_analyze_slp (loop_vinfo, NULL, n_stmts);
1753 if (ok)
1755 /* Decide which possible SLP instances to SLP. */
1756 slp = vect_make_slp_decision (loop_vinfo);
1758 /* Find stmts that need to be both vectorized and SLPed. */
1759 vect_detect_hybrid_slp (loop_vinfo);
1761 else
1762 return false;
1764 /* Scan all the operations in the loop and make sure they are
1765 vectorizable. */
1767 ok = vect_analyze_loop_operations (loop_vinfo, slp);
1768 if (!ok)
1770 if (dump_enabled_p ())
1771 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1772 "bad operation or unsupported loop bound.\n");
1773 return false;
1776 /* Decide whether we need to create an epilogue loop to handle
1777 remaining scalar iterations. */
1778 th = ((LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo) + 1)
1779 / LOOP_VINFO_VECT_FACTOR (loop_vinfo))
1780 * LOOP_VINFO_VECT_FACTOR (loop_vinfo);
1782 if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
1783 && LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) > 0)
1785 if (ctz_hwi (LOOP_VINFO_INT_NITERS (loop_vinfo)
1786 - LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo))
1787 < exact_log2 (LOOP_VINFO_VECT_FACTOR (loop_vinfo)))
1788 LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo) = true;
1790 else if (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo)
1791 || (tree_ctz (LOOP_VINFO_NITERS (loop_vinfo))
1792 < (unsigned)exact_log2 (LOOP_VINFO_VECT_FACTOR (loop_vinfo))
1793 /* In case of versioning, check if the maximum number of
1794 iterations is greater than th. If they are identical,
1795 the epilogue is unnecessary. */
1796 && ((!LOOP_REQUIRES_VERSIONING_FOR_ALIAS (loop_vinfo)
1797 && !LOOP_REQUIRES_VERSIONING_FOR_ALIGNMENT (loop_vinfo))
1798 || (unsigned HOST_WIDE_INT)max_stmt_executions_int
1799 (LOOP_VINFO_LOOP (loop_vinfo)) > th)))
1800 LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo) = true;
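/* E.g. (illustrative numbers): with VF = 4, no peeling for alignment and a
   known iteration count of 10, ctz (10) = 1 is less than log2 (4) = 2, so
   the count is not a multiple of VF and an epilogue loop must handle the
   remaining 10 % 4 = 2 scalar iterations.  */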
1802 /* If an epilogue loop is required make sure we can create one. */
1803 if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo)
1804 || LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo))
1806 if (dump_enabled_p ())
1807 dump_printf_loc (MSG_NOTE, vect_location, "epilog loop required\n");
1808 if (!vect_can_advance_ivs_p (loop_vinfo)
1809 || !slpeel_can_duplicate_loop_p (LOOP_VINFO_LOOP (loop_vinfo),
1810 single_exit (LOOP_VINFO_LOOP
1811 (loop_vinfo))))
1813 if (dump_enabled_p ())
1814 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1815 "not vectorized: can't create required "
1816 "epilog loop\n");
1817 return false;
1821 return true;
1824 /* Function vect_analyze_loop.
1826 Apply a set of analyses on LOOP, and create a loop_vec_info struct
1827 for it. The different analyses will record information in the
1828 loop_vec_info struct. */
1829 loop_vec_info
1830 vect_analyze_loop (struct loop *loop)
1832 loop_vec_info loop_vinfo;
1833 unsigned int vector_sizes;
1835 /* Autodetect first vector size we try. */
1836 current_vector_size = 0;
1837 vector_sizes = targetm.vectorize.autovectorize_vector_sizes ();
1839 if (dump_enabled_p ())
1840 dump_printf_loc (MSG_NOTE, vect_location,
1841 "===== analyze_loop_nest =====\n");
1843 if (loop_outer (loop)
1844 && loop_vec_info_for_loop (loop_outer (loop))
1845 && LOOP_VINFO_VECTORIZABLE_P (loop_vec_info_for_loop (loop_outer (loop))))
1847 if (dump_enabled_p ())
1848 dump_printf_loc (MSG_NOTE, vect_location,
1849 "outer-loop already vectorized.\n");
1850 return NULL;
1853 while (1)
1855 /* Check the CFG characteristics of the loop (nesting, entry/exit). */
1856 loop_vinfo = vect_analyze_loop_form (loop);
1857 if (!loop_vinfo)
1859 if (dump_enabled_p ())
1860 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1861 "bad loop form.\n");
1862 return NULL;
1865 if (vect_analyze_loop_2 (loop_vinfo))
1867 LOOP_VINFO_VECTORIZABLE_P (loop_vinfo) = 1;
1869 return loop_vinfo;
1872 destroy_loop_vec_info (loop_vinfo, true);
1874 vector_sizes &= ~current_vector_size;
1875 if (vector_sizes == 0
1876 || current_vector_size == 0)
1877 return NULL;
1879 /* Try the next biggest vector size. */
1880 current_vector_size = 1 << floor_log2 (vector_sizes);
1881 if (dump_enabled_p ())
1882 dump_printf_loc (MSG_NOTE, vect_location,
1883 "***** Re-trying analysis with "
1884 "vector size %d\n", current_vector_size);
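      /* Worked example of the retry above (the sizes are hypothetical, not a
	 statement about any particular target): if
	 autovectorize_vector_sizes () returned 16 | 8 == 24 and analysis with
	 the autodetected 16-byte vectors failed, vector_sizes &= ~16 leaves 8
	 and 1 << floor_log2 (8) == 8 selects 8-byte vectors for the next
	 attempt; once vector_sizes reaches 0 we give up and return NULL.  */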
1889 /* Function reduction_code_for_scalar_code
1891 Input:
1892    CODE - tree_code of a reduction operation.
1894 Output:
1895 REDUC_CODE - the corresponding tree-code to be used to reduce the
1896 vector of partial results into a single scalar result (which
1897 will also reside in a vector) or ERROR_MARK if the operation is
1898         a supported reduction operation, but does not have such a tree-code.
1900 Return FALSE if CODE currently cannot be vectorized as reduction. */
1902 static bool
1903 reduction_code_for_scalar_code (enum tree_code code,
1904 enum tree_code *reduc_code)
1906 switch (code)
1908 case MAX_EXPR:
1909 *reduc_code = REDUC_MAX_EXPR;
1910 return true;
1912 case MIN_EXPR:
1913 *reduc_code = REDUC_MIN_EXPR;
1914 return true;
1916 case PLUS_EXPR:
1917 *reduc_code = REDUC_PLUS_EXPR;
1918 return true;
1920 case MULT_EXPR:
1921 case MINUS_EXPR:
1922 case BIT_IOR_EXPR:
1923 case BIT_XOR_EXPR:
1924 case BIT_AND_EXPR:
1925 *reduc_code = ERROR_MARK;
1926 return true;
1928 default:
1929 return false;
1934 /* Error reporting helper for vect_is_simple_reduction below. GIMPLE statement
1935 STMT is printed with a message MSG. */
1937 static void
1938 report_vect_op (int msg_type, gimple stmt, const char *msg)
1940 dump_printf_loc (msg_type, vect_location, "%s", msg);
1941 dump_gimple_stmt (msg_type, TDF_SLIM, stmt, 0);
1942 dump_printf (msg_type, "\n");
1946 /* Detect SLP reduction of the form:
1948 #a1 = phi <a5, a0>
1949 a2 = operation (a1)
1950 a3 = operation (a2)
1951 a4 = operation (a3)
1952 a5 = operation (a4)
1954 #a = phi <a5>
1956 PHI is the reduction phi node (#a1 = phi <a5, a0> above)
1957 FIRST_STMT is the first reduction stmt in the chain
1958 (a2 = operation (a1)).
1960 Return TRUE if a reduction chain was detected. */
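/* One source form that typically gives rise to the shape above (an
   illustrative assumption, not an example taken from this file):

     int sum = 0;
     for (i = 0; i < n; i++)
       {
         sum += a[4*i];
         sum += a[4*i + 1];
         sum += a[4*i + 2];
         sum += a[4*i + 3];
       }

   Each add in the body becomes one link a2..a5 of the chain feeding the
   reduction phi.  */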
1962 static bool
1963 vect_is_slp_reduction (loop_vec_info loop_info, gimple phi, gimple first_stmt)
1965 struct loop *loop = (gimple_bb (phi))->loop_father;
1966 struct loop *vect_loop = LOOP_VINFO_LOOP (loop_info);
1967 enum tree_code code;
1968 gimple current_stmt = NULL, loop_use_stmt = NULL, first, next_stmt;
1969 stmt_vec_info use_stmt_info, current_stmt_info;
1970 tree lhs;
1971 imm_use_iterator imm_iter;
1972 use_operand_p use_p;
1973 int nloop_uses, size = 0, n_out_of_loop_uses;
1974 bool found = false;
1976 if (loop != vect_loop)
1977 return false;
1979 lhs = PHI_RESULT (phi);
1980 code = gimple_assign_rhs_code (first_stmt);
1981 while (1)
1983 nloop_uses = 0;
1984 n_out_of_loop_uses = 0;
1985 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, lhs)
1987 gimple use_stmt = USE_STMT (use_p);
1988 if (is_gimple_debug (use_stmt))
1989 continue;
1991 /* Check if we got back to the reduction phi. */
1992 if (use_stmt == phi)
1994 loop_use_stmt = use_stmt;
1995 found = true;
1996 break;
1999 if (flow_bb_inside_loop_p (loop, gimple_bb (use_stmt)))
2001 if (vinfo_for_stmt (use_stmt)
2002 && !STMT_VINFO_IN_PATTERN_P (vinfo_for_stmt (use_stmt)))
2004 loop_use_stmt = use_stmt;
2005 nloop_uses++;
2008 else
2009 n_out_of_loop_uses++;
2011 	  /* There can be either a single use in the loop or two uses in
2012 phi nodes. */
2013 if (nloop_uses > 1 || (n_out_of_loop_uses && nloop_uses))
2014 return false;
2017 if (found)
2018 break;
2020 /* We reached a statement with no loop uses. */
2021 if (nloop_uses == 0)
2022 return false;
2024 /* This is a loop exit phi, and we haven't reached the reduction phi. */
2025 if (gimple_code (loop_use_stmt) == GIMPLE_PHI)
2026 return false;
2028 if (!is_gimple_assign (loop_use_stmt)
2029 || code != gimple_assign_rhs_code (loop_use_stmt)
2030 || !flow_bb_inside_loop_p (loop, gimple_bb (loop_use_stmt)))
2031 return false;
2033 /* Insert USE_STMT into reduction chain. */
2034 use_stmt_info = vinfo_for_stmt (loop_use_stmt);
2035 if (current_stmt)
2037 current_stmt_info = vinfo_for_stmt (current_stmt);
2038 GROUP_NEXT_ELEMENT (current_stmt_info) = loop_use_stmt;
2039 GROUP_FIRST_ELEMENT (use_stmt_info)
2040 = GROUP_FIRST_ELEMENT (current_stmt_info);
2042 else
2043 GROUP_FIRST_ELEMENT (use_stmt_info) = loop_use_stmt;
2045 lhs = gimple_assign_lhs (loop_use_stmt);
2046 current_stmt = loop_use_stmt;
2047 size++;
2050 if (!found || loop_use_stmt != phi || size < 2)
2051 return false;
2053 /* Swap the operands, if needed, to make the reduction operand be the second
2054 operand. */
2055 lhs = PHI_RESULT (phi);
2056 next_stmt = GROUP_FIRST_ELEMENT (vinfo_for_stmt (current_stmt));
2057 while (next_stmt)
2059 if (gimple_assign_rhs2 (next_stmt) == lhs)
2061 tree op = gimple_assign_rhs1 (next_stmt);
2062 gimple def_stmt = NULL;
2064 if (TREE_CODE (op) == SSA_NAME)
2065 def_stmt = SSA_NAME_DEF_STMT (op);
2067 /* Check that the other def is either defined in the loop
2068 ("vect_internal_def"), or it's an induction (defined by a
2069 loop-header phi-node). */
2070 if (def_stmt
2071 && gimple_bb (def_stmt)
2072 && flow_bb_inside_loop_p (loop, gimple_bb (def_stmt))
2073 && (is_gimple_assign (def_stmt)
2074 || is_gimple_call (def_stmt)
2075 || STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def_stmt))
2076 == vect_induction_def
2077 || (gimple_code (def_stmt) == GIMPLE_PHI
2078 && STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def_stmt))
2079 == vect_internal_def
2080 && !is_loop_header_bb_p (gimple_bb (def_stmt)))))
2082 lhs = gimple_assign_lhs (next_stmt);
2083 next_stmt = GROUP_NEXT_ELEMENT (vinfo_for_stmt (next_stmt));
2084 continue;
2087 return false;
2089 else
2091 tree op = gimple_assign_rhs2 (next_stmt);
2092 gimple def_stmt = NULL;
2094 if (TREE_CODE (op) == SSA_NAME)
2095 def_stmt = SSA_NAME_DEF_STMT (op);
2097 /* Check that the other def is either defined in the loop
2098 ("vect_internal_def"), or it's an induction (defined by a
2099 loop-header phi-node). */
2100 if (def_stmt
2101 && gimple_bb (def_stmt)
2102 && flow_bb_inside_loop_p (loop, gimple_bb (def_stmt))
2103 && (is_gimple_assign (def_stmt)
2104 || is_gimple_call (def_stmt)
2105 || STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def_stmt))
2106 == vect_induction_def
2107 || (gimple_code (def_stmt) == GIMPLE_PHI
2108 && STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def_stmt))
2109 == vect_internal_def
2110 && !is_loop_header_bb_p (gimple_bb (def_stmt)))))
2112 if (dump_enabled_p ())
2114 dump_printf_loc (MSG_NOTE, vect_location, "swapping oprnds: ");
2115 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, next_stmt, 0);
2116 dump_printf (MSG_NOTE, "\n");
2119 swap_ssa_operands (next_stmt,
2120 gimple_assign_rhs1_ptr (next_stmt),
2121 gimple_assign_rhs2_ptr (next_stmt));
2122 update_stmt (next_stmt);
2124 if (CONSTANT_CLASS_P (gimple_assign_rhs1 (next_stmt)))
2125 LOOP_VINFO_OPERANDS_SWAPPED (loop_info) = true;
2127 else
2128 return false;
2131 lhs = gimple_assign_lhs (next_stmt);
2132 next_stmt = GROUP_NEXT_ELEMENT (vinfo_for_stmt (next_stmt));
2135 /* Save the chain for further analysis in SLP detection. */
2136 first = GROUP_FIRST_ELEMENT (vinfo_for_stmt (current_stmt));
2137 LOOP_VINFO_REDUCTION_CHAINS (loop_info).safe_push (first);
2138 GROUP_SIZE (vinfo_for_stmt (first)) = size;
2140 return true;
2144 /* Function vect_is_simple_reduction_1
2146 (1) Detect a cross-iteration def-use cycle that represents a simple
2147 reduction computation. We look for the following pattern:
2149 loop_header:
2150 a1 = phi < a0, a2 >
2151 a3 = ...
2152 a2 = operation (a3, a1)
2156 a3 = ...
2157 loop_header:
2158 a1 = phi < a0, a2 >
2159 a2 = operation (a3, a1)
2161 such that:
2162 1. operation is commutative and associative and it is safe to
2163 change the order of the computation (if CHECK_REDUCTION is true)
2164 2. no uses for a2 in the loop (a2 is used out of the loop)
2165 3. no uses of a1 in the loop besides the reduction operation
2166 4. no uses of a1 outside the loop.
2168 Conditions 1,4 are tested here.
2169 Conditions 2,3 are tested in vect_mark_stmts_to_be_vectorized.
2171 (2) Detect a cross-iteration def-use cycle in nested loops, i.e.,
2172 nested cycles, if CHECK_REDUCTION is false.
2174 (3) Detect cycles of phi nodes in outer-loop vectorization, i.e., double
2175 reductions:
2177 a1 = phi < a0, a2 >
2178 inner loop (def of a3)
2179 a2 = phi < a3 >
2181 If MODIFY is true it tries also to rework the code in-place to enable
2182 detection of more reduction patterns. For the time being we rewrite
2183    "res -= RHS" into "res += -RHS" when it seems worthwhile.
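/* For instance (illustrative only), the simple sum

     s = 0.0;
     for (i = 0; i < n; i++)
       s += a[i];

   matches pattern (1): a1 is the loop-header phi for s, the operation is
   PLUS_EXPR with a3 holding the loaded a[i], and a2 = a3 + a1 has no other
   use inside the loop.  */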
2186 static gimple
2187 vect_is_simple_reduction_1 (loop_vec_info loop_info, gimple phi,
2188 bool check_reduction, bool *double_reduc,
2189 bool modify)
2191 struct loop *loop = (gimple_bb (phi))->loop_father;
2192 struct loop *vect_loop = LOOP_VINFO_LOOP (loop_info);
2193 edge latch_e = loop_latch_edge (loop);
2194 tree loop_arg = PHI_ARG_DEF_FROM_EDGE (phi, latch_e);
2195 gimple def_stmt, def1 = NULL, def2 = NULL;
2196 enum tree_code orig_code, code;
2197 tree op1, op2, op3 = NULL_TREE, op4 = NULL_TREE;
2198 tree type;
2199 int nloop_uses;
2200 tree name;
2201 imm_use_iterator imm_iter;
2202 use_operand_p use_p;
2203 bool phi_def;
2205 *double_reduc = false;
2207 /* If CHECK_REDUCTION is true, we assume inner-most loop vectorization,
2208 otherwise, we assume outer loop vectorization. */
2209 gcc_assert ((check_reduction && loop == vect_loop)
2210 || (!check_reduction && flow_loop_nested_p (vect_loop, loop)));
2212 name = PHI_RESULT (phi);
2213 /* ??? If there are no uses of the PHI result the inner loop reduction
2214 won't be detected as possibly double-reduction by vectorizable_reduction
2215 because that tries to walk the PHI arg from the preheader edge which
2216 can be constant. See PR60382. */
2217 if (has_zero_uses (name))
2218 return NULL;
2219 nloop_uses = 0;
2220 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, name)
2222 gimple use_stmt = USE_STMT (use_p);
2223 if (is_gimple_debug (use_stmt))
2224 continue;
2226 if (!flow_bb_inside_loop_p (loop, gimple_bb (use_stmt)))
2228 if (dump_enabled_p ())
2229 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2230 "intermediate value used outside loop.\n");
2232 return NULL;
2235 if (vinfo_for_stmt (use_stmt)
2236 && !is_pattern_stmt_p (vinfo_for_stmt (use_stmt)))
2237 nloop_uses++;
2238 if (nloop_uses > 1)
2240 if (dump_enabled_p ())
2241 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2242 "reduction used in loop.\n");
2243 return NULL;
2247 if (TREE_CODE (loop_arg) != SSA_NAME)
2249 if (dump_enabled_p ())
2251 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2252 "reduction: not ssa_name: ");
2253 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, loop_arg);
2254 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
2256 return NULL;
2259 def_stmt = SSA_NAME_DEF_STMT (loop_arg);
2260 if (!def_stmt)
2262 if (dump_enabled_p ())
2263 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2264 "reduction: no def_stmt.\n");
2265 return NULL;
2268 if (!is_gimple_assign (def_stmt) && gimple_code (def_stmt) != GIMPLE_PHI)
2270 if (dump_enabled_p ())
2272 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, def_stmt, 0);
2273 dump_printf (MSG_NOTE, "\n");
2275 return NULL;
2278 if (is_gimple_assign (def_stmt))
2280 name = gimple_assign_lhs (def_stmt);
2281 phi_def = false;
2283 else
2285 name = PHI_RESULT (def_stmt);
2286 phi_def = true;
2289 nloop_uses = 0;
2290 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, name)
2292 gimple use_stmt = USE_STMT (use_p);
2293 if (is_gimple_debug (use_stmt))
2294 continue;
2295 if (flow_bb_inside_loop_p (loop, gimple_bb (use_stmt))
2296 && vinfo_for_stmt (use_stmt)
2297 && !is_pattern_stmt_p (vinfo_for_stmt (use_stmt)))
2298 nloop_uses++;
2299 if (nloop_uses > 1)
2301 if (dump_enabled_p ())
2302 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2303 "reduction used in loop.\n");
2304 return NULL;
2308 /* If DEF_STMT is a phi node itself, we expect it to have a single argument
2309 defined in the inner loop. */
2310 if (phi_def)
2312 op1 = PHI_ARG_DEF (def_stmt, 0);
2314 if (gimple_phi_num_args (def_stmt) != 1
2315 || TREE_CODE (op1) != SSA_NAME)
2317 if (dump_enabled_p ())
2318 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2319 "unsupported phi node definition.\n");
2321 return NULL;
2324 def1 = SSA_NAME_DEF_STMT (op1);
2325 if (gimple_bb (def1)
2326 && flow_bb_inside_loop_p (loop, gimple_bb (def_stmt))
2327 && loop->inner
2328 && flow_bb_inside_loop_p (loop->inner, gimple_bb (def1))
2329 && is_gimple_assign (def1))
2331 if (dump_enabled_p ())
2332 report_vect_op (MSG_NOTE, def_stmt,
2333 "detected double reduction: ");
2335 *double_reduc = true;
2336 return def_stmt;
2339 return NULL;
2342 code = orig_code = gimple_assign_rhs_code (def_stmt);
2344 /* We can handle "res -= x[i]", which is non-associative by
2345 simply rewriting this into "res += -x[i]". Avoid changing
2346 gimple instruction for the first simple tests and only do this
2347 if we're allowed to change code at all. */
2348 if (code == MINUS_EXPR
2349 && modify
2350 && (op1 = gimple_assign_rhs1 (def_stmt))
2351 && TREE_CODE (op1) == SSA_NAME
2352 && SSA_NAME_DEF_STMT (op1) == phi)
2353 code = PLUS_EXPR;
2355 if (check_reduction
2356 && (!commutative_tree_code (code) || !associative_tree_code (code)))
2358 if (dump_enabled_p ())
2359 report_vect_op (MSG_MISSED_OPTIMIZATION, def_stmt,
2360 "reduction: not commutative/associative: ");
2361 return NULL;
2364 if (get_gimple_rhs_class (code) != GIMPLE_BINARY_RHS)
2366 if (code != COND_EXPR)
2368 if (dump_enabled_p ())
2369 report_vect_op (MSG_MISSED_OPTIMIZATION, def_stmt,
2370 "reduction: not binary operation: ");
2372 return NULL;
2375 op3 = gimple_assign_rhs1 (def_stmt);
2376 if (COMPARISON_CLASS_P (op3))
2378 op4 = TREE_OPERAND (op3, 1);
2379 op3 = TREE_OPERAND (op3, 0);
2382 op1 = gimple_assign_rhs2 (def_stmt);
2383 op2 = gimple_assign_rhs3 (def_stmt);
2385 if (TREE_CODE (op1) != SSA_NAME && TREE_CODE (op2) != SSA_NAME)
2387 if (dump_enabled_p ())
2388 report_vect_op (MSG_MISSED_OPTIMIZATION, def_stmt,
2389 "reduction: uses not ssa_names: ");
2391 return NULL;
2394 else
2396 op1 = gimple_assign_rhs1 (def_stmt);
2397 op2 = gimple_assign_rhs2 (def_stmt);
2399 if (TREE_CODE (op1) != SSA_NAME && TREE_CODE (op2) != SSA_NAME)
2401 if (dump_enabled_p ())
2402 report_vect_op (MSG_MISSED_OPTIMIZATION, def_stmt,
2403 "reduction: uses not ssa_names: ");
2405 return NULL;
2409 type = TREE_TYPE (gimple_assign_lhs (def_stmt));
2410 if ((TREE_CODE (op1) == SSA_NAME
2411 && !types_compatible_p (type,TREE_TYPE (op1)))
2412 || (TREE_CODE (op2) == SSA_NAME
2413 && !types_compatible_p (type, TREE_TYPE (op2)))
2414 || (op3 && TREE_CODE (op3) == SSA_NAME
2415 && !types_compatible_p (type, TREE_TYPE (op3)))
2416 || (op4 && TREE_CODE (op4) == SSA_NAME
2417 && !types_compatible_p (type, TREE_TYPE (op4))))
2419 if (dump_enabled_p ())
2421 dump_printf_loc (MSG_NOTE, vect_location,
2422 "reduction: multiple types: operation type: ");
2423 dump_generic_expr (MSG_NOTE, TDF_SLIM, type);
2424 dump_printf (MSG_NOTE, ", operands types: ");
2425 dump_generic_expr (MSG_NOTE, TDF_SLIM,
2426 TREE_TYPE (op1));
2427 dump_printf (MSG_NOTE, ",");
2428 dump_generic_expr (MSG_NOTE, TDF_SLIM,
2429 TREE_TYPE (op2));
2430 if (op3)
2432 dump_printf (MSG_NOTE, ",");
2433 dump_generic_expr (MSG_NOTE, TDF_SLIM,
2434 TREE_TYPE (op3));
2437 if (op4)
2439 dump_printf (MSG_NOTE, ",");
2440 dump_generic_expr (MSG_NOTE, TDF_SLIM,
2441 TREE_TYPE (op4));
2443 dump_printf (MSG_NOTE, "\n");
2446 return NULL;
2449 /* Check that it's ok to change the order of the computation.
2450 Generally, when vectorizing a reduction we change the order of the
2451 computation. This may change the behavior of the program in some
2452 cases, so we need to check that this is ok. One exception is when
2453 vectorizing an outer-loop: the inner-loop is executed sequentially,
2454 and therefore vectorizing reductions in the inner-loop during
2455 outer-loop vectorization is safe. */
2457 /* CHECKME: check for !flag_finite_math_only too? */
2458 if (SCALAR_FLOAT_TYPE_P (type) && !flag_associative_math
2459 && check_reduction)
2461 /* Changing the order of operations changes the semantics. */
2462 if (dump_enabled_p ())
2463 report_vect_op (MSG_MISSED_OPTIMIZATION, def_stmt,
2464 "reduction: unsafe fp math optimization: ");
2465 return NULL;
2467 else if (INTEGRAL_TYPE_P (type) && TYPE_OVERFLOW_TRAPS (type)
2468 && check_reduction)
2470 /* Changing the order of operations changes the semantics. */
2471 if (dump_enabled_p ())
2472 report_vect_op (MSG_MISSED_OPTIMIZATION, def_stmt,
2473 "reduction: unsafe int math optimization: ");
2474 return NULL;
2476 else if (SAT_FIXED_POINT_TYPE_P (type) && check_reduction)
2478 /* Changing the order of operations changes the semantics. */
2479 if (dump_enabled_p ())
2480 report_vect_op (MSG_MISSED_OPTIMIZATION, def_stmt,
2481 "reduction: unsafe fixed-point math optimization: ");
2482 return NULL;
2485 /* If we detected "res -= x[i]" earlier, rewrite it into
2486 "res += -x[i]" now. If this turns out to be useless reassoc
2487 will clean it up again. */
2488 if (orig_code == MINUS_EXPR)
2490 tree rhs = gimple_assign_rhs2 (def_stmt);
2491 tree negrhs = make_ssa_name (TREE_TYPE (rhs), NULL);
2492 gimple negate_stmt = gimple_build_assign_with_ops (NEGATE_EXPR, negrhs,
2493 rhs, NULL);
2494 gimple_stmt_iterator gsi = gsi_for_stmt (def_stmt);
2495 set_vinfo_for_stmt (negate_stmt, new_stmt_vec_info (negate_stmt,
2496 loop_info, NULL));
2497 gsi_insert_before (&gsi, negate_stmt, GSI_NEW_STMT);
2498 gimple_assign_set_rhs2 (def_stmt, negrhs);
2499 gimple_assign_set_rhs_code (def_stmt, PLUS_EXPR);
2500 update_stmt (def_stmt);
2503 /* Reduction is safe. We're dealing with one of the following:
2504 1) integer arithmetic and no trapv
2505 2) floating point arithmetic, and special flags permit this optimization
2506 3) nested cycle (i.e., outer loop vectorization). */
2507 if (TREE_CODE (op1) == SSA_NAME)
2508 def1 = SSA_NAME_DEF_STMT (op1);
2510 if (TREE_CODE (op2) == SSA_NAME)
2511 def2 = SSA_NAME_DEF_STMT (op2);
2513 if (code != COND_EXPR
2514 && ((!def1 || gimple_nop_p (def1)) && (!def2 || gimple_nop_p (def2))))
2516 if (dump_enabled_p ())
2517 report_vect_op (MSG_NOTE, def_stmt, "reduction: no defs for operands: ");
2518 return NULL;
2521 /* Check that one def is the reduction def, defined by PHI,
2522 the other def is either defined in the loop ("vect_internal_def"),
2523 or it's an induction (defined by a loop-header phi-node). */
2525 if (def2 && def2 == phi
2526 && (code == COND_EXPR
2527 || !def1 || gimple_nop_p (def1)
2528 || !flow_bb_inside_loop_p (loop, gimple_bb (def1))
2529 || (def1 && flow_bb_inside_loop_p (loop, gimple_bb (def1))
2530 && (is_gimple_assign (def1)
2531 || is_gimple_call (def1)
2532 || STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def1))
2533 == vect_induction_def
2534 || (gimple_code (def1) == GIMPLE_PHI
2535 && STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def1))
2536 == vect_internal_def
2537 && !is_loop_header_bb_p (gimple_bb (def1)))))))
2539 if (dump_enabled_p ())
2540 report_vect_op (MSG_NOTE, def_stmt, "detected reduction: ");
2541 return def_stmt;
2544 if (def1 && def1 == phi
2545 && (code == COND_EXPR
2546 || !def2 || gimple_nop_p (def2)
2547 || !flow_bb_inside_loop_p (loop, gimple_bb (def2))
2548 || (def2 && flow_bb_inside_loop_p (loop, gimple_bb (def2))
2549 && (is_gimple_assign (def2)
2550 || is_gimple_call (def2)
2551 || STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def2))
2552 == vect_induction_def
2553 || (gimple_code (def2) == GIMPLE_PHI
2554 && STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def2))
2555 == vect_internal_def
2556 && !is_loop_header_bb_p (gimple_bb (def2)))))))
2558 if (check_reduction)
2560 /* Swap operands (just for simplicity - so that the rest of the code
2561 can assume that the reduction variable is always the last (second)
2562 argument). */
2563 if (dump_enabled_p ())
2564 report_vect_op (MSG_NOTE, def_stmt,
2565 "detected reduction: need to swap operands: ");
2567 swap_ssa_operands (def_stmt, gimple_assign_rhs1_ptr (def_stmt),
2568 gimple_assign_rhs2_ptr (def_stmt));
2570 if (CONSTANT_CLASS_P (gimple_assign_rhs1 (def_stmt)))
2571 LOOP_VINFO_OPERANDS_SWAPPED (loop_info) = true;
2573 else
2575 if (dump_enabled_p ())
2576 report_vect_op (MSG_NOTE, def_stmt, "detected reduction: ");
2579 return def_stmt;
2582 /* Try to find SLP reduction chain. */
2583 if (check_reduction && vect_is_slp_reduction (loop_info, phi, def_stmt))
2585 if (dump_enabled_p ())
2586 report_vect_op (MSG_NOTE, def_stmt,
2587 "reduction: detected reduction chain: ");
2589 return def_stmt;
2592 if (dump_enabled_p ())
2593 report_vect_op (MSG_MISSED_OPTIMIZATION, def_stmt,
2594 "reduction: unknown pattern: ");
2596 return NULL;
2599 /* Wrapper around vect_is_simple_reduction_1, that won't modify code
2600 in-place. Arguments as there. */
2602 static gimple
2603 vect_is_simple_reduction (loop_vec_info loop_info, gimple phi,
2604 bool check_reduction, bool *double_reduc)
2606 return vect_is_simple_reduction_1 (loop_info, phi, check_reduction,
2607 double_reduc, false);
2610 /* Wrapper around vect_is_simple_reduction_1, which will modify code
2611 in-place if it enables detection of more reductions. Arguments
2612 as there. */
2614 gimple
2615 vect_force_simple_reduction (loop_vec_info loop_info, gimple phi,
2616 bool check_reduction, bool *double_reduc)
2618 return vect_is_simple_reduction_1 (loop_info, phi, check_reduction,
2619 double_reduc, true);
2622 /* Calculate the cost of one scalar iteration of the loop. */
2624 vect_get_single_scalar_iteration_cost (loop_vec_info loop_vinfo)
2626 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
2627 basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
2628 int nbbs = loop->num_nodes, factor, scalar_single_iter_cost = 0;
2629 int innerloop_iters, i, stmt_cost;
2631 /* Count statements in scalar loop. Using this as scalar cost for a single
2632 iteration for now.
2634 TODO: Add outer loop support.
2636 TODO: Consider assigning different costs to different scalar
2637 statements. */
2639 /* FORNOW. */
2640 innerloop_iters = 1;
2641 if (loop->inner)
2642 innerloop_iters = 50; /* FIXME */
2644 for (i = 0; i < nbbs; i++)
2646 gimple_stmt_iterator si;
2647 basic_block bb = bbs[i];
2649 if (bb->loop_father == loop->inner)
2650 factor = innerloop_iters;
2651 else
2652 factor = 1;
2654 for (si = gsi_start_bb (bb); !gsi_end_p (si); gsi_next (&si))
2656 gimple stmt = gsi_stmt (si);
2657 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
2659 if (!is_gimple_assign (stmt) && !is_gimple_call (stmt))
2660 continue;
2662 /* Skip stmts that are not vectorized inside the loop. */
2663 if (stmt_info
2664 && !STMT_VINFO_RELEVANT_P (stmt_info)
2665 && (!STMT_VINFO_LIVE_P (stmt_info)
2666 || !VECTORIZABLE_CYCLE_DEF (STMT_VINFO_DEF_TYPE (stmt_info)))
2667 && !STMT_VINFO_IN_PATTERN_P (stmt_info))
2668 continue;
2670 if (STMT_VINFO_DATA_REF (vinfo_for_stmt (stmt)))
2672 if (DR_IS_READ (STMT_VINFO_DATA_REF (vinfo_for_stmt (stmt))))
2673 stmt_cost = vect_get_stmt_cost (scalar_load);
2674 else
2675 stmt_cost = vect_get_stmt_cost (scalar_store);
2677 else
2678 stmt_cost = vect_get_stmt_cost (scalar_stmt);
2680 scalar_single_iter_cost += stmt_cost * factor;
2683 return scalar_single_iter_cost;
2686 /* Calculate cost of peeling the loop PEEL_ITERS_PROLOGUE times. */
2688 vect_get_known_peeling_cost (loop_vec_info loop_vinfo, int peel_iters_prologue,
2689 int *peel_iters_epilogue,
2690 int scalar_single_iter_cost,
2691 stmt_vector_for_cost *prologue_cost_vec,
2692 stmt_vector_for_cost *epilogue_cost_vec)
2694 int retval = 0;
2695 int vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
2697 if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
2699 *peel_iters_epilogue = vf/2;
2700 if (dump_enabled_p ())
2701 dump_printf_loc (MSG_NOTE, vect_location,
2702 "cost model: epilogue peel iters set to vf/2 "
2703 			 "cost model: epilogue peel iters set to vf/2 "
2703 			 "because loop iterations are unknown.\n");
2705 /* If peeled iterations are known but number of scalar loop
2706 iterations are unknown, count a taken branch per peeled loop. */
2707 retval = record_stmt_cost (prologue_cost_vec, 2, cond_branch_taken,
2708 NULL, 0, vect_prologue);
2710 else
2712 int niters = LOOP_VINFO_INT_NITERS (loop_vinfo);
2713 peel_iters_prologue = niters < peel_iters_prologue ?
2714 niters : peel_iters_prologue;
2715 *peel_iters_epilogue = (niters - peel_iters_prologue) % vf;
2716 /* If we need to peel for gaps, but no peeling is required, we have to
2717 peel VF iterations. */
2718 if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo) && !*peel_iters_epilogue)
2719 *peel_iters_epilogue = vf;
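      /* Worked example (the numbers are illustrative): with niters == 100,
	 peel_iters_prologue == 3 and vf == 8, the epilogue gets
	 (100 - 3) % 8 == 1 iteration; had that remainder been 0 while peeling
	 for gaps was required, it would be bumped to vf == 8.  */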
2722 if (peel_iters_prologue)
2723 retval += record_stmt_cost (prologue_cost_vec,
2724 peel_iters_prologue * scalar_single_iter_cost,
2725 scalar_stmt, NULL, 0, vect_prologue);
2726 if (*peel_iters_epilogue)
2727 retval += record_stmt_cost (epilogue_cost_vec,
2728 *peel_iters_epilogue * scalar_single_iter_cost,
2729 scalar_stmt, NULL, 0, vect_epilogue);
2730 return retval;
2733 /* Function vect_estimate_min_profitable_iters
2735 Return the number of iterations required for the vector version of the
2736 loop to be profitable relative to the cost of the scalar version of the
2737 loop. */
2739 static void
2740 vect_estimate_min_profitable_iters (loop_vec_info loop_vinfo,
2741 int *ret_min_profitable_niters,
2742 int *ret_min_profitable_estimate)
2744 int min_profitable_iters;
2745 int min_profitable_estimate;
2746 int peel_iters_prologue;
2747 int peel_iters_epilogue;
2748 unsigned vec_inside_cost = 0;
2749 int vec_outside_cost = 0;
2750 unsigned vec_prologue_cost = 0;
2751 unsigned vec_epilogue_cost = 0;
2752 int scalar_single_iter_cost = 0;
2753 int scalar_outside_cost = 0;
2754 int vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
2755 int npeel = LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo);
2756 void *target_cost_data = LOOP_VINFO_TARGET_COST_DATA (loop_vinfo);
2758 /* Cost model disabled. */
2759 if (unlimited_cost_model (LOOP_VINFO_LOOP (loop_vinfo)))
2761 dump_printf_loc (MSG_NOTE, vect_location, "cost model disabled.\n");
2762 *ret_min_profitable_niters = 0;
2763 *ret_min_profitable_estimate = 0;
2764 return;
2767 /* Requires loop versioning tests to handle misalignment. */
2768 if (LOOP_REQUIRES_VERSIONING_FOR_ALIGNMENT (loop_vinfo))
2770 /* FIXME: Make cost depend on complexity of individual check. */
2771 unsigned len = LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).length ();
2772 (void) add_stmt_cost (target_cost_data, len, vector_stmt, NULL, 0,
2773 vect_prologue);
2774 dump_printf (MSG_NOTE,
2775 "cost model: Adding cost of checks for loop "
2776 "versioning to treat misalignment.\n");
2779 /* Requires loop versioning with alias checks. */
2780 if (LOOP_REQUIRES_VERSIONING_FOR_ALIAS (loop_vinfo))
2782 /* FIXME: Make cost depend on complexity of individual check. */
2783 unsigned len = LOOP_VINFO_MAY_ALIAS_DDRS (loop_vinfo).length ();
2784 (void) add_stmt_cost (target_cost_data, len, vector_stmt, NULL, 0,
2785 vect_prologue);
2786 dump_printf (MSG_NOTE,
2787 "cost model: Adding cost of checks for loop "
2788 "versioning aliasing.\n");
2791 if (LOOP_REQUIRES_VERSIONING_FOR_ALIGNMENT (loop_vinfo)
2792 || LOOP_REQUIRES_VERSIONING_FOR_ALIAS (loop_vinfo))
2793 (void) add_stmt_cost (target_cost_data, 1, cond_branch_taken, NULL, 0,
2794 vect_prologue);
2796 /* Count statements in scalar loop. Using this as scalar cost for a single
2797 iteration for now.
2799 TODO: Add outer loop support.
2801 TODO: Consider assigning different costs to different scalar
2802 statements. */
2804 scalar_single_iter_cost = vect_get_single_scalar_iteration_cost (loop_vinfo);
2806 /* Add additional cost for the peeled instructions in prologue and epilogue
2807 loop.
2809 FORNOW: If we don't know the value of peel_iters for prologue or epilogue
2810 at compile-time - we assume it's vf/2 (the worst would be vf-1).
2812 TODO: Build an expression that represents peel_iters for prologue and
2813 epilogue to be used in a run-time test. */
2815 if (npeel < 0)
2817 peel_iters_prologue = vf/2;
2818 dump_printf (MSG_NOTE, "cost model: "
2819 "prologue peel iters set to vf/2.\n");
2821 /* If peeling for alignment is unknown, loop bound of main loop becomes
2822 unknown. */
2823 peel_iters_epilogue = vf/2;
2824 dump_printf (MSG_NOTE, "cost model: "
2825 "epilogue peel iters set to vf/2 because "
2826 "peeling for alignment is unknown.\n");
2828 /* If peeled iterations are unknown, count a taken branch and a not taken
2829 branch per peeled loop. Even if scalar loop iterations are known,
2830 vector iterations are not known since peeled prologue iterations are
2831 not known. Hence guards remain the same. */
2832 (void) add_stmt_cost (target_cost_data, 2, cond_branch_taken,
2833 NULL, 0, vect_prologue);
2834 (void) add_stmt_cost (target_cost_data, 2, cond_branch_not_taken,
2835 NULL, 0, vect_prologue);
2836 /* FORNOW: Don't attempt to pass individual scalar instructions to
2837 the model; just assume linear cost for scalar iterations. */
2838 (void) add_stmt_cost (target_cost_data,
2839 peel_iters_prologue * scalar_single_iter_cost,
2840 scalar_stmt, NULL, 0, vect_prologue);
2841 (void) add_stmt_cost (target_cost_data,
2842 peel_iters_epilogue * scalar_single_iter_cost,
2843 scalar_stmt, NULL, 0, vect_epilogue);
2845 else
2847 stmt_vector_for_cost prologue_cost_vec, epilogue_cost_vec;
2848 stmt_info_for_cost *si;
2849 int j;
2850 void *data = LOOP_VINFO_TARGET_COST_DATA (loop_vinfo);
2852 prologue_cost_vec.create (2);
2853 epilogue_cost_vec.create (2);
2854 peel_iters_prologue = npeel;
2856 (void) vect_get_known_peeling_cost (loop_vinfo, peel_iters_prologue,
2857 &peel_iters_epilogue,
2858 scalar_single_iter_cost,
2859 &prologue_cost_vec,
2860 &epilogue_cost_vec);
2862 FOR_EACH_VEC_ELT (prologue_cost_vec, j, si)
2864 struct _stmt_vec_info *stmt_info
2865 = si->stmt ? vinfo_for_stmt (si->stmt) : NULL;
2866 (void) add_stmt_cost (data, si->count, si->kind, stmt_info,
2867 si->misalign, vect_prologue);
2870 FOR_EACH_VEC_ELT (epilogue_cost_vec, j, si)
2872 struct _stmt_vec_info *stmt_info
2873 = si->stmt ? vinfo_for_stmt (si->stmt) : NULL;
2874 (void) add_stmt_cost (data, si->count, si->kind, stmt_info,
2875 si->misalign, vect_epilogue);
2878 prologue_cost_vec.release ();
2879 epilogue_cost_vec.release ();
2882 /* FORNOW: The scalar outside cost is incremented in one of the
2883 following ways:
2885 1. The vectorizer checks for alignment and aliasing and generates
2886 a condition that allows dynamic vectorization. A cost model
2887 check is ANDED with the versioning condition. Hence scalar code
2888 path now has the added cost of the versioning check.
2890 if (cost > th & versioning_check)
2891 jmp to vector code
2893      Hence the run-time scalar cost is incremented by a not-taken branch cost.
2895 2. The vectorizer then checks if a prologue is required. If the
2896 cost model check was not done before during versioning, it has to
2897 be done before the prologue check.
2899 if (cost <= th)
2900 prologue = scalar_iters
2901 if (prologue == 0)
2902 jmp to vector code
2903 else
2904 execute prologue
2905 if (prologue == num_iters)
2906 go to exit
2908 Hence the run-time scalar cost is incremented by a taken branch,
2909 plus a not-taken branch, plus a taken branch cost.
2911 3. The vectorizer then checks if an epilogue is required. If the
2912 cost model check was not done before during prologue check, it
2913 has to be done with the epilogue check.
2915 if (prologue == 0)
2916 jmp to vector code
2917 else
2918 execute prologue
2919 if (prologue == num_iters)
2920 go to exit
2921 vector code:
2922 if ((cost <= th) | (scalar_iters-prologue-epilogue == 0))
2923 jmp to epilogue
2925 Hence the run-time scalar cost should be incremented by 2 taken
2926 branches.
2928 TODO: The back end may reorder the BBS's differently and reverse
2929 conditions/branch directions. Change the estimates below to
2930 something more reasonable. */
2932 /* If the number of iterations is known and we do not do versioning, we can
2933 decide whether to vectorize at compile time. Hence the scalar version
2934      does not carry cost model guard costs.  */
2935 if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
2936 || LOOP_REQUIRES_VERSIONING_FOR_ALIGNMENT (loop_vinfo)
2937 || LOOP_REQUIRES_VERSIONING_FOR_ALIAS (loop_vinfo))
2939 /* Cost model check occurs at versioning. */
2940 if (LOOP_REQUIRES_VERSIONING_FOR_ALIGNMENT (loop_vinfo)
2941 || LOOP_REQUIRES_VERSIONING_FOR_ALIAS (loop_vinfo))
2942 scalar_outside_cost += vect_get_stmt_cost (cond_branch_not_taken);
2943 else
2945 /* Cost model check occurs at prologue generation. */
2946 if (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) < 0)
2947 scalar_outside_cost += 2 * vect_get_stmt_cost (cond_branch_taken)
2948 + vect_get_stmt_cost (cond_branch_not_taken);
2949 /* Cost model check occurs at epilogue generation. */
2950 else
2951 scalar_outside_cost += 2 * vect_get_stmt_cost (cond_branch_taken);
2955 /* Complete the target-specific cost calculations. */
2956 finish_cost (LOOP_VINFO_TARGET_COST_DATA (loop_vinfo), &vec_prologue_cost,
2957 &vec_inside_cost, &vec_epilogue_cost);
2959 vec_outside_cost = (int)(vec_prologue_cost + vec_epilogue_cost);
2961 /* Calculate number of iterations required to make the vector version
2962 profitable, relative to the loop bodies only. The following condition
2963 must hold true:
2964 SIC * niters + SOC > VIC * ((niters-PL_ITERS-EP_ITERS)/VF) + VOC
2965 where
2966 SIC = scalar iteration cost, VIC = vector iteration cost,
2967 VOC = vector outside cost, VF = vectorization factor,
2968 PL_ITERS = prologue iterations, EP_ITERS= epilogue iterations
2969 SOC = scalar outside cost for run time cost model check. */
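  /* A purely illustrative instance of the formula (the costs are made up):
     with SIC = 4, VIC = 6, VF = 4, VOC = 20, SOC = 6 and no peeling, the
     quotient below is ((20 - 6) * 4) / (4 * 4 - 6) = 56 / 10 = 5; since
     4 * 4 * 5 = 80 <= 6 * 5 + (20 - 6) * 4 = 86 it is then bumped to 6,
     before the later clamp to VF and the final decrement.  */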
2971 if ((scalar_single_iter_cost * vf) > (int) vec_inside_cost)
2973 if (vec_outside_cost <= 0)
2974 min_profitable_iters = 1;
2975 else
2977 min_profitable_iters = ((vec_outside_cost - scalar_outside_cost) * vf
2978 - vec_inside_cost * peel_iters_prologue
2979 - vec_inside_cost * peel_iters_epilogue)
2980 / ((scalar_single_iter_cost * vf)
2981 - vec_inside_cost);
2983 if ((scalar_single_iter_cost * vf * min_profitable_iters)
2984 <= (((int) vec_inside_cost * min_profitable_iters)
2985 + (((int) vec_outside_cost - scalar_outside_cost) * vf)))
2986 min_profitable_iters++;
2989 /* vector version will never be profitable. */
2990 else
2992 if (LOOP_VINFO_LOOP (loop_vinfo)->force_vectorize)
2993 warning_at (vect_location, OPT_Wopenmp_simd, "vectorization "
2994 "did not happen for a simd loop");
2996 if (dump_enabled_p ())
2997 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2998 "cost model: the vector iteration cost = %d "
2999 "divided by the scalar iteration cost = %d "
3000 			 "is greater than or equal to the vectorization factor = %d"
3001 ".\n",
3002 vec_inside_cost, scalar_single_iter_cost, vf);
3003 *ret_min_profitable_niters = -1;
3004 *ret_min_profitable_estimate = -1;
3005 return;
3008 if (dump_enabled_p ())
3010 dump_printf_loc (MSG_NOTE, vect_location, "Cost model analysis: \n");
3011 dump_printf (MSG_NOTE, " Vector inside of loop cost: %d\n",
3012 vec_inside_cost);
3013 dump_printf (MSG_NOTE, " Vector prologue cost: %d\n",
3014 vec_prologue_cost);
3015 dump_printf (MSG_NOTE, " Vector epilogue cost: %d\n",
3016 vec_epilogue_cost);
3017 dump_printf (MSG_NOTE, " Scalar iteration cost: %d\n",
3018 scalar_single_iter_cost);
3019 dump_printf (MSG_NOTE, " Scalar outside cost: %d\n",
3020 scalar_outside_cost);
3021 dump_printf (MSG_NOTE, " Vector outside cost: %d\n",
3022 vec_outside_cost);
3023 dump_printf (MSG_NOTE, " prologue iterations: %d\n",
3024 peel_iters_prologue);
3025 dump_printf (MSG_NOTE, " epilogue iterations: %d\n",
3026 peel_iters_epilogue);
3027 dump_printf (MSG_NOTE,
3028 " Calculated minimum iters for profitability: %d\n",
3029 min_profitable_iters);
3030 dump_printf (MSG_NOTE, "\n");
3033 min_profitable_iters =
3034 min_profitable_iters < vf ? vf : min_profitable_iters;
3036 /* Because the condition we create is:
3037 if (niters <= min_profitable_iters)
3038 then skip the vectorized loop. */
3039 min_profitable_iters--;
3041 if (dump_enabled_p ())
3042 dump_printf_loc (MSG_NOTE, vect_location,
3043 " Runtime profitability threshold = %d\n",
3044 min_profitable_iters);
3046 *ret_min_profitable_niters = min_profitable_iters;
3048 /* Calculate number of iterations required to make the vector version
3049 profitable, relative to the loop bodies only.
3051 Non-vectorized variant is SIC * niters and it must win over vector
3052 variant on the expected loop trip count. The following condition must hold true:
3053 SIC * niters > VIC * ((niters-PL_ITERS-EP_ITERS)/VF) + VOC + SOC */
3055 if (vec_outside_cost <= 0)
3056 min_profitable_estimate = 1;
3057 else
3059 min_profitable_estimate = ((vec_outside_cost + scalar_outside_cost) * vf
3060 - vec_inside_cost * peel_iters_prologue
3061 - vec_inside_cost * peel_iters_epilogue)
3062 / ((scalar_single_iter_cost * vf)
3063 - vec_inside_cost);
3065   min_profitable_estimate--;
3066 min_profitable_estimate = MAX (min_profitable_estimate, min_profitable_iters);
3067 if (dump_enabled_p ())
3068 dump_printf_loc (MSG_NOTE, vect_location,
3069 " Static estimate profitability threshold = %d\n",
3070 		     min_profitable_estimate);
3072 *ret_min_profitable_estimate = min_profitable_estimate;
3076 /* TODO: Close dependency between vect_model_*_cost and vectorizable_*
3077 functions. Design better to avoid maintenance issues. */
3079 /* Function vect_model_reduction_cost.
3081 Models cost for a reduction operation, including the vector ops
3082 generated within the strip-mine loop, the initial definition before
3083 the loop, and the epilogue code that must be generated. */
3085 static bool
3086 vect_model_reduction_cost (stmt_vec_info stmt_info, enum tree_code reduc_code,
3087 int ncopies)
3089 int prologue_cost = 0, epilogue_cost = 0;
3090 enum tree_code code;
3091 optab optab;
3092 tree vectype;
3093 gimple stmt, orig_stmt;
3094 tree reduction_op;
3095 enum machine_mode mode;
3096 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
3097 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
3098 void *target_cost_data = LOOP_VINFO_TARGET_COST_DATA (loop_vinfo);
3100 /* Cost of reduction op inside loop. */
3101 unsigned inside_cost = add_stmt_cost (target_cost_data, ncopies, vector_stmt,
3102 stmt_info, 0, vect_body);
3103 stmt = STMT_VINFO_STMT (stmt_info);
3105 switch (get_gimple_rhs_class (gimple_assign_rhs_code (stmt)))
3107 case GIMPLE_SINGLE_RHS:
3108 gcc_assert (TREE_OPERAND_LENGTH (gimple_assign_rhs1 (stmt)) == ternary_op);
3109 reduction_op = TREE_OPERAND (gimple_assign_rhs1 (stmt), 2);
3110 break;
3111 case GIMPLE_UNARY_RHS:
3112 reduction_op = gimple_assign_rhs1 (stmt);
3113 break;
3114 case GIMPLE_BINARY_RHS:
3115 reduction_op = gimple_assign_rhs2 (stmt);
3116 break;
3117 case GIMPLE_TERNARY_RHS:
3118 reduction_op = gimple_assign_rhs3 (stmt);
3119 break;
3120 default:
3121 gcc_unreachable ();
3124 vectype = get_vectype_for_scalar_type (TREE_TYPE (reduction_op));
3125 if (!vectype)
3127 if (dump_enabled_p ())
3129 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3130 "unsupported data-type ");
3131 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
3132 TREE_TYPE (reduction_op));
3133 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
3135 return false;
3138 mode = TYPE_MODE (vectype);
3139 orig_stmt = STMT_VINFO_RELATED_STMT (stmt_info);
3141 if (!orig_stmt)
3142 orig_stmt = STMT_VINFO_STMT (stmt_info);
3144 code = gimple_assign_rhs_code (orig_stmt);
3146 /* Add in cost for initial definition. */
3147 prologue_cost += add_stmt_cost (target_cost_data, 1, scalar_to_vec,
3148 stmt_info, 0, vect_prologue);
3150 /* Determine cost of epilogue code.
3152 We have a reduction operator that will reduce the vector in one statement.
3153 Also requires scalar extract. */
3155 if (!nested_in_vect_loop_p (loop, orig_stmt))
3157 if (reduc_code != ERROR_MARK)
3159 epilogue_cost += add_stmt_cost (target_cost_data, 1, vector_stmt,
3160 stmt_info, 0, vect_epilogue);
3161 epilogue_cost += add_stmt_cost (target_cost_data, 1, vec_to_scalar,
3162 stmt_info, 0, vect_epilogue);
3164 else
3166 int vec_size_in_bits = tree_to_uhwi (TYPE_SIZE (vectype));
3167 tree bitsize =
3168 TYPE_SIZE (TREE_TYPE (gimple_assign_lhs (orig_stmt)));
3169 int element_bitsize = tree_to_uhwi (bitsize);
3170 int nelements = vec_size_in_bits / element_bitsize;
3172 optab = optab_for_tree_code (code, vectype, optab_default);
3174 /* We have a whole vector shift available. */
3175 if (VECTOR_MODE_P (mode)
3176 && optab_handler (optab, mode) != CODE_FOR_nothing
3177 && optab_handler (vec_shr_optab, mode) != CODE_FOR_nothing)
3179 /* Final reduction via vector shifts and the reduction operator.
3180 Also requires scalar extract. */
3181 epilogue_cost += add_stmt_cost (target_cost_data,
3182 exact_log2 (nelements) * 2,
3183 vector_stmt, stmt_info, 0,
3184 vect_epilogue);
3185 epilogue_cost += add_stmt_cost (target_cost_data, 1,
3186 vec_to_scalar, stmt_info, 0,
3187 vect_epilogue);
3189 else
3190 /* Use extracts and reduction op for final reduction. For N
3191 elements, we have N extracts and N-1 reduction ops. */
3192 epilogue_cost += add_stmt_cost (target_cost_data,
3193 nelements + nelements - 1,
3194 vector_stmt, stmt_info, 0,
3195 vect_epilogue);
3199 if (dump_enabled_p ())
3200 dump_printf (MSG_NOTE,
3201 "vect_model_reduction_cost: inside_cost = %d, "
3202 		 "prologue_cost = %d, epilogue_cost = %d.\n", inside_cost,
3203 prologue_cost, epilogue_cost);
3205 return true;
3209 /* Function vect_model_induction_cost.
3211 Models cost for induction operations. */
3213 static void
3214 vect_model_induction_cost (stmt_vec_info stmt_info, int ncopies)
3216 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
3217 void *target_cost_data = LOOP_VINFO_TARGET_COST_DATA (loop_vinfo);
3218 unsigned inside_cost, prologue_cost;
3220 /* loop cost for vec_loop. */
3221 inside_cost = add_stmt_cost (target_cost_data, ncopies, vector_stmt,
3222 stmt_info, 0, vect_body);
3224 /* prologue cost for vec_init and vec_step. */
3225 prologue_cost = add_stmt_cost (target_cost_data, 2, scalar_to_vec,
3226 stmt_info, 0, vect_prologue);
3228 if (dump_enabled_p ())
3229 dump_printf_loc (MSG_NOTE, vect_location,
3230 "vect_model_induction_cost: inside_cost = %d, "
3231 		   "prologue_cost = %d.\n", inside_cost, prologue_cost);
3235 /* Function get_initial_def_for_induction
3237 Input:
3238 STMT - a stmt that performs an induction operation in the loop.
3239 IV_PHI - the initial value of the induction variable
3241 Output:
3242 Return a vector variable, initialized with the first VF values of
3243 the induction variable. E.g., for an iv with IV_PHI='X' and
3244 evolution S, for a vector of 4 units, we want to return:
3245 [X, X + S, X + 2*S, X + 3*S]. */
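/* A concrete instance (illustrative): with X = 0, S = 1, a vectorization
   factor of 4 and 4 units per vector, the returned vector is [0, 1, 2, 3]
   and the loop step vector built below is
   [VF*S, VF*S, VF*S, VF*S] = [4, 4, 4, 4].  */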
3247 static tree
3248 get_initial_def_for_induction (gimple iv_phi)
3250 stmt_vec_info stmt_vinfo = vinfo_for_stmt (iv_phi);
3251 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_vinfo);
3252 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
3253 tree vectype;
3254 int nunits;
3255 edge pe = loop_preheader_edge (loop);
3256 struct loop *iv_loop;
3257 basic_block new_bb;
3258 tree new_vec, vec_init, vec_step, t;
3259 tree new_var;
3260 tree new_name;
3261 gimple init_stmt, new_stmt;
3262 gimple_phi induction_phi;
3263 tree induc_def, vec_def, vec_dest;
3264 tree init_expr, step_expr;
3265 int vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
3266 int i;
3267 int ncopies;
3268 tree expr;
3269 stmt_vec_info phi_info = vinfo_for_stmt (iv_phi);
3270 bool nested_in_vect_loop = false;
3271 gimple_seq stmts = NULL;
3272 imm_use_iterator imm_iter;
3273 use_operand_p use_p;
3274 gimple exit_phi;
3275 edge latch_e;
3276 tree loop_arg;
3277 gimple_stmt_iterator si;
3278 basic_block bb = gimple_bb (iv_phi);
3279 tree stepvectype;
3280 tree resvectype;
3282 /* Is phi in an inner-loop, while vectorizing an enclosing outer-loop? */
3283 if (nested_in_vect_loop_p (loop, iv_phi))
3285 nested_in_vect_loop = true;
3286 iv_loop = loop->inner;
3288 else
3289 iv_loop = loop;
3290 gcc_assert (iv_loop == (gimple_bb (iv_phi))->loop_father);
3292 latch_e = loop_latch_edge (iv_loop);
3293 loop_arg = PHI_ARG_DEF_FROM_EDGE (iv_phi, latch_e);
3295 step_expr = STMT_VINFO_LOOP_PHI_EVOLUTION_PART (phi_info);
3296 gcc_assert (step_expr != NULL_TREE);
3298 pe = loop_preheader_edge (iv_loop);
3299 init_expr = PHI_ARG_DEF_FROM_EDGE (iv_phi,
3300 loop_preheader_edge (iv_loop));
3302 vectype = get_vectype_for_scalar_type (TREE_TYPE (init_expr));
3303 resvectype = get_vectype_for_scalar_type (TREE_TYPE (PHI_RESULT (iv_phi)));
3304 gcc_assert (vectype);
3305 nunits = TYPE_VECTOR_SUBPARTS (vectype);
3306 ncopies = vf / nunits;
3308 gcc_assert (phi_info);
3309 gcc_assert (ncopies >= 1);
3311 /* Convert the step to the desired type. */
3312 step_expr = force_gimple_operand (fold_convert (TREE_TYPE (vectype),
3313 step_expr),
3314 &stmts, true, NULL_TREE);
3315 if (stmts)
3317 new_bb = gsi_insert_seq_on_edge_immediate (pe, stmts);
3318 gcc_assert (!new_bb);
3321 /* Find the first insertion point in the BB. */
3322 si = gsi_after_labels (bb);
3324 /* Create the vector that holds the initial_value of the induction. */
3325 if (nested_in_vect_loop)
3327 /* iv_loop is nested in the loop to be vectorized. init_expr had already
3328 been created during vectorization of previous stmts. We obtain it
3329 from the STMT_VINFO_VEC_STMT of the defining stmt. */
3330 vec_init = vect_get_vec_def_for_operand (init_expr, iv_phi, NULL);
3331 /* If the initial value is not of proper type, convert it. */
3332 if (!useless_type_conversion_p (vectype, TREE_TYPE (vec_init)))
3334 new_stmt = gimple_build_assign_with_ops
3335 (VIEW_CONVERT_EXPR,
3336 vect_get_new_vect_var (vectype, vect_simple_var, "vec_iv_"),
3337 build1 (VIEW_CONVERT_EXPR, vectype, vec_init), NULL_TREE);
3338 vec_init = make_ssa_name (gimple_assign_lhs (new_stmt), new_stmt);
3339 gimple_assign_set_lhs (new_stmt, vec_init);
3340 new_bb = gsi_insert_on_edge_immediate (loop_preheader_edge (iv_loop),
3341 new_stmt);
3342 gcc_assert (!new_bb);
3343 set_vinfo_for_stmt (new_stmt,
3344 new_stmt_vec_info (new_stmt, loop_vinfo, NULL));
3347 else
3349 vec<constructor_elt, va_gc> *v;
3351 /* iv_loop is the loop to be vectorized. Create:
3352 vec_init = [X, X+S, X+2*S, X+3*S] (S = step_expr, X = init_expr) */
3353 new_var = vect_get_new_vect_var (TREE_TYPE (vectype),
3354 vect_scalar_var, "var_");
3355 new_name = force_gimple_operand (fold_convert (TREE_TYPE (vectype),
3356 init_expr),
3357 &stmts, false, new_var);
3358 if (stmts)
3360 new_bb = gsi_insert_seq_on_edge_immediate (pe, stmts);
3361 gcc_assert (!new_bb);
3364 vec_alloc (v, nunits);
3365 bool constant_p = is_gimple_min_invariant (new_name);
3366 CONSTRUCTOR_APPEND_ELT (v, NULL_TREE, new_name);
3367 for (i = 1; i < nunits; i++)
3369 /* Create: new_name_i = new_name + step_expr */
3370 new_name = fold_build2 (PLUS_EXPR, TREE_TYPE (new_name),
3371 new_name, step_expr);
3372 if (!is_gimple_min_invariant (new_name))
3374 init_stmt = gimple_build_assign (new_var, new_name);
3375 new_name = make_ssa_name (new_var, init_stmt);
3376 gimple_assign_set_lhs (init_stmt, new_name);
3377 new_bb = gsi_insert_on_edge_immediate (pe, init_stmt);
3378 gcc_assert (!new_bb);
3379 if (dump_enabled_p ())
3381 dump_printf_loc (MSG_NOTE, vect_location,
3382 "created new init_stmt: ");
3383 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, init_stmt, 0);
3384 dump_printf (MSG_NOTE, "\n");
3386 constant_p = false;
3388 CONSTRUCTOR_APPEND_ELT (v, NULL_TREE, new_name);
3390 /* Create a vector from [new_name_0, new_name_1, ..., new_name_nunits-1] */
3391 if (constant_p)
3392 new_vec = build_vector_from_ctor (vectype, v);
3393 else
3394 new_vec = build_constructor (vectype, v);
3395 vec_init = vect_init_vector (iv_phi, new_vec, vectype, NULL);
3399 /* Create the vector that holds the step of the induction. */
3400 if (nested_in_vect_loop)
3401 /* iv_loop is nested in the loop to be vectorized. Generate:
3402 vec_step = [S, S, S, S] */
3403 new_name = step_expr;
3404 else
3406 /* iv_loop is the loop to be vectorized. Generate:
3407 vec_step = [VF*S, VF*S, VF*S, VF*S] */
3408 if (SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr)))
3410 expr = build_int_cst (integer_type_node, vf);
3411 expr = fold_convert (TREE_TYPE (step_expr), expr);
3413 else
3414 expr = build_int_cst (TREE_TYPE (step_expr), vf);
3415 new_name = fold_build2 (MULT_EXPR, TREE_TYPE (step_expr),
3416 expr, step_expr);
3417 if (TREE_CODE (step_expr) == SSA_NAME)
3418 new_name = vect_init_vector (iv_phi, new_name,
3419 TREE_TYPE (step_expr), NULL);
3422 t = unshare_expr (new_name);
3423 gcc_assert (CONSTANT_CLASS_P (new_name)
3424 || TREE_CODE (new_name) == SSA_NAME);
3425 stepvectype = get_vectype_for_scalar_type (TREE_TYPE (new_name));
3426 gcc_assert (stepvectype);
3427 new_vec = build_vector_from_val (stepvectype, t);
3428 vec_step = vect_init_vector (iv_phi, new_vec, stepvectype, NULL);
3431 /* Create the following def-use cycle:
3432 loop prolog:
3433 vec_init = ...
3434 vec_step = ...
3435 loop:
3436 vec_iv = PHI <vec_init, vec_loop>
3438 STMT
3440 vec_loop = vec_iv + vec_step; */
3442 /* Create the induction-phi that defines the induction-operand. */
3443 vec_dest = vect_get_new_vect_var (vectype, vect_simple_var, "vec_iv_");
3444 induction_phi = create_phi_node (vec_dest, iv_loop->header);
3445 set_vinfo_for_stmt (induction_phi,
3446 new_stmt_vec_info (induction_phi, loop_vinfo, NULL));
3447 induc_def = PHI_RESULT (induction_phi);
3449 /* Create the iv update inside the loop */
3450 new_stmt = gimple_build_assign_with_ops (PLUS_EXPR, vec_dest,
3451 induc_def, vec_step);
3452 vec_def = make_ssa_name (vec_dest, new_stmt);
3453 gimple_assign_set_lhs (new_stmt, vec_def);
3454 gsi_insert_before (&si, new_stmt, GSI_SAME_STMT);
3455 set_vinfo_for_stmt (new_stmt, new_stmt_vec_info (new_stmt, loop_vinfo,
3456 NULL));
3458 /* Set the arguments of the phi node: */
3459 add_phi_arg (induction_phi, vec_init, pe, UNKNOWN_LOCATION);
3460 add_phi_arg (induction_phi, vec_def, loop_latch_edge (iv_loop),
3461 UNKNOWN_LOCATION);
3464 /* In case that vectorization factor (VF) is bigger than the number
3465 of elements that we can fit in a vectype (nunits), we have to generate
3466 more than one vector stmt - i.e - we need to "unroll" the
3467 vector stmt by a factor VF/nunits. For more details see documentation
3468 in vectorizable_operation. */
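   /* Illustrative case (our numbers): with VF = 8 and nunits = 4 we need
      ncopies = 8 / 4 = 2 vector stmts, and the extra copy advances by a step
      vector of [nunits*S, ...] = [4*S, 4*S, 4*S, 4*S], which is what the
      code below builds.  */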
3470 if (ncopies > 1)
3472 stmt_vec_info prev_stmt_vinfo;
3473 /* FORNOW. This restriction should be relaxed. */
3474 gcc_assert (!nested_in_vect_loop);
3476 /* Create the vector that holds the step of the induction. */
3477 if (SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr)))
3479 expr = build_int_cst (integer_type_node, nunits);
3480 expr = fold_convert (TREE_TYPE (step_expr), expr);
3482 else
3483 expr = build_int_cst (TREE_TYPE (step_expr), nunits);
3484 new_name = fold_build2 (MULT_EXPR, TREE_TYPE (step_expr),
3485 expr, step_expr);
3486 if (TREE_CODE (step_expr) == SSA_NAME)
3487 new_name = vect_init_vector (iv_phi, new_name,
3488 TREE_TYPE (step_expr), NULL);
3489 t = unshare_expr (new_name);
3490 gcc_assert (CONSTANT_CLASS_P (new_name)
3491 || TREE_CODE (new_name) == SSA_NAME);
3492 new_vec = build_vector_from_val (stepvectype, t);
3493 vec_step = vect_init_vector (iv_phi, new_vec, stepvectype, NULL);
3495 vec_def = induc_def;
3496 prev_stmt_vinfo = vinfo_for_stmt (induction_phi);
3497 for (i = 1; i < ncopies; i++)
3499 /* vec_i = vec_prev + vec_step */
3500 new_stmt = gimple_build_assign_with_ops (PLUS_EXPR, vec_dest,
3501 vec_def, vec_step);
3502 vec_def = make_ssa_name (vec_dest, new_stmt);
3503 gimple_assign_set_lhs (new_stmt, vec_def);
3505 gsi_insert_before (&si, new_stmt, GSI_SAME_STMT);
3506 if (!useless_type_conversion_p (resvectype, vectype))
3508 new_stmt = gimple_build_assign_with_ops
3509 (VIEW_CONVERT_EXPR,
3510 vect_get_new_vect_var (resvectype, vect_simple_var,
3511 "vec_iv_"),
3512 build1 (VIEW_CONVERT_EXPR, resvectype,
3513 gimple_assign_lhs (new_stmt)), NULL_TREE);
3514 gimple_assign_set_lhs (new_stmt,
3515 make_ssa_name
3516 (gimple_assign_lhs (new_stmt), new_stmt));
3517 gsi_insert_before (&si, new_stmt, GSI_SAME_STMT);
3519 set_vinfo_for_stmt (new_stmt,
3520 new_stmt_vec_info (new_stmt, loop_vinfo, NULL));
3521 STMT_VINFO_RELATED_STMT (prev_stmt_vinfo) = new_stmt;
3522 prev_stmt_vinfo = vinfo_for_stmt (new_stmt);
3526 if (nested_in_vect_loop)
3528 /* Find the loop-closed exit-phi of the induction, and record
3529 the final vector of induction results: */
3530 exit_phi = NULL;
3531 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, loop_arg)
3533 gimple use_stmt = USE_STMT (use_p);
3534 if (is_gimple_debug (use_stmt))
3535 continue;
3537 if (!flow_bb_inside_loop_p (iv_loop, gimple_bb (use_stmt)))
3539 exit_phi = use_stmt;
3540 break;
3543 if (exit_phi)
3545 stmt_vec_info stmt_vinfo = vinfo_for_stmt (exit_phi);
3546 /* FORNOW. Currently not supporting the case that an inner-loop induction
3547 is not used in the outer-loop (i.e. only outside the outer-loop). */
3548 gcc_assert (STMT_VINFO_RELEVANT_P (stmt_vinfo)
3549 && !STMT_VINFO_LIVE_P (stmt_vinfo));
3551 STMT_VINFO_VEC_STMT (stmt_vinfo) = new_stmt;
3552 if (dump_enabled_p ())
3554 dump_printf_loc (MSG_NOTE, vect_location,
3555 "vector of inductions after inner-loop:");
3556 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, new_stmt, 0);
3557 dump_printf (MSG_NOTE, "\n");
3563 if (dump_enabled_p ())
3565 dump_printf_loc (MSG_NOTE, vect_location,
3566 "transform induction: created def-use cycle: ");
3567 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, induction_phi, 0);
3568 dump_printf (MSG_NOTE, "\n");
3569 dump_gimple_stmt (MSG_NOTE, TDF_SLIM,
3570 SSA_NAME_DEF_STMT (vec_def), 0);
3571 dump_printf (MSG_NOTE, "\n");
3574 STMT_VINFO_VEC_STMT (phi_info) = induction_phi;
3575 if (!useless_type_conversion_p (resvectype, vectype))
3577 new_stmt = gimple_build_assign_with_ops
3578 (VIEW_CONVERT_EXPR,
3579 vect_get_new_vect_var (resvectype, vect_simple_var, "vec_iv_"),
3580 build1 (VIEW_CONVERT_EXPR, resvectype, induc_def), NULL_TREE);
3581 induc_def = make_ssa_name (gimple_assign_lhs (new_stmt), new_stmt);
3582 gimple_assign_set_lhs (new_stmt, induc_def);
3583 si = gsi_after_labels (bb);
3584 gsi_insert_before (&si, new_stmt, GSI_SAME_STMT);
3585 set_vinfo_for_stmt (new_stmt,
3586 new_stmt_vec_info (new_stmt, loop_vinfo, NULL));
3587 STMT_VINFO_RELATED_STMT (vinfo_for_stmt (new_stmt))
3588 = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (induction_phi));
3591 return induc_def;
3595 /* Function get_initial_def_for_reduction
3597 Input:
3598 STMT - a stmt that performs a reduction operation in the loop.
3599 INIT_VAL - the initial value of the reduction variable
3601 Output:
3602 ADJUSTMENT_DEF - a tree that holds a value to be added to the final result
3603 of the reduction (used for adjusting the epilog - see below).
3604 Return a vector variable, initialized according to the operation that STMT
3605 performs. This vector will be used as the initial value of the
3606 vector of partial results.
3608 Option1 (adjust in epilog): Initialize the vector as follows:
3609 add/bit or/xor: [0,0,...,0,0]
3610 mult/bit and: [1,1,...,1,1]
3611 min/max/cond_expr: [init_val,init_val,..,init_val,init_val]
3612 and when necessary (e.g. add/mult case) let the caller know
3613 that it needs to adjust the result by init_val.
3615 Option2: Initialize the vector as follows:
3616 add/bit or/xor: [init_val,0,0,...,0]
3617 mult/bit and: [init_val,1,1,...,1]
3618 min/max/cond_expr: [init_val,init_val,...,init_val]
3619 and no adjustments are needed.
3621 For example, for the following code:
3623 s = init_val;
3624 for (i=0;i<n;i++)
3625 s = s + a[i];
3627 STMT is 's = s + a[i]', and the reduction variable is 's'.
3628 For a vector of 4 units, we want to return either [0,0,0,init_val],
3629 or [0,0,0,0] and let the caller know that it needs to adjust
3630 the result at the end by 'init_val'.
3632    FORNOW, we use Option1 (the 'adjust in epilog' scheme) when ADJUSTMENT_DEF
3633    is not NULL, because its initialization vector is simpler (the same element
3634    in all entries), and Option2 otherwise.
3636 A cost model should help decide between these two schemes. */
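/* Concrete instance (illustrative): for the sum above with init_val = 5 and
   a vector of 4 units, Option1 returns [0,0,0,0] and reports an adjustment
   of 5 for the epilogue, while Option2 returns a vector with init_val = 5
   in one lane and the neutral value 0 in the others, needing no
   adjustment.  */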
3638 tree
3639 get_initial_def_for_reduction (gimple stmt, tree init_val,
3640 tree *adjustment_def)
3642 stmt_vec_info stmt_vinfo = vinfo_for_stmt (stmt);
3643 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_vinfo);
3644 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
3645 tree scalar_type = TREE_TYPE (init_val);
3646 tree vectype = get_vectype_for_scalar_type (scalar_type);
3647 int nunits;
3648 enum tree_code code = gimple_assign_rhs_code (stmt);
3649 tree def_for_init;
3650 tree init_def;
3651 tree *elts;
3652 int i;
3653 bool nested_in_vect_loop = false;
3654 tree init_value;
3655 REAL_VALUE_TYPE real_init_val = dconst0;
3656 int int_init_val = 0;
3657 gimple def_stmt = NULL;
3659 gcc_assert (vectype);
3660 nunits = TYPE_VECTOR_SUBPARTS (vectype);
3662 gcc_assert (POINTER_TYPE_P (scalar_type) || INTEGRAL_TYPE_P (scalar_type)
3663 || SCALAR_FLOAT_TYPE_P (scalar_type));
3665 if (nested_in_vect_loop_p (loop, stmt))
3666 nested_in_vect_loop = true;
3667 else
3668 gcc_assert (loop == (gimple_bb (stmt))->loop_father);
3670 /* In case of double reduction we only create a vector variable to be put
3671 in the reduction phi node. The actual statement creation is done in
3672 vect_create_epilog_for_reduction. */
3673 if (adjustment_def && nested_in_vect_loop
3674 && TREE_CODE (init_val) == SSA_NAME
3675 && (def_stmt = SSA_NAME_DEF_STMT (init_val))
3676 && gimple_code (def_stmt) == GIMPLE_PHI
3677 && flow_bb_inside_loop_p (loop, gimple_bb (def_stmt))
3678 && vinfo_for_stmt (def_stmt)
3679 && STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def_stmt))
3680 == vect_double_reduction_def)
3682 *adjustment_def = NULL;
3683 return vect_create_destination_var (init_val, vectype);
3686 if (TREE_CONSTANT (init_val))
3688 if (SCALAR_FLOAT_TYPE_P (scalar_type))
3689 init_value = build_real (scalar_type, TREE_REAL_CST (init_val));
3690 else
3691 init_value = build_int_cst (scalar_type, TREE_INT_CST_LOW (init_val));
3693 else
3694 init_value = init_val;
3696 switch (code)
3698 case WIDEN_SUM_EXPR:
3699 case DOT_PROD_EXPR:
3700 case SAD_EXPR:
3701 case PLUS_EXPR:
3702 case MINUS_EXPR:
3703 case BIT_IOR_EXPR:
3704 case BIT_XOR_EXPR:
3705 case MULT_EXPR:
3706 case BIT_AND_EXPR:
3707 /* ADJUSTMENT_DEF is NULL when called from
3708 vect_create_epilog_for_reduction to vectorize double reduction. */
3709 if (adjustment_def)
3711 if (nested_in_vect_loop)
3712 *adjustment_def = vect_get_vec_def_for_operand (init_val, stmt,
3713 NULL);
3714 else
3715 *adjustment_def = init_val;
3718 if (code == MULT_EXPR)
3720 real_init_val = dconst1;
3721 int_init_val = 1;
3724 if (code == BIT_AND_EXPR)
3725 int_init_val = -1;
3727 if (SCALAR_FLOAT_TYPE_P (scalar_type))
3728 def_for_init = build_real (scalar_type, real_init_val);
3729 else
3730 def_for_init = build_int_cst (scalar_type, int_init_val);
3732 /* Create a vector of '0' or '1' for all elements except the first. */
3733 elts = XALLOCAVEC (tree, nunits);
3734 for (i = nunits - 2; i >= 0; --i)
3735 elts[i + 1] = def_for_init;
3737 /* Option1: the first element is '0' or '1' as well. */
3738 if (adjustment_def)
3740 elts[0] = def_for_init;
3741 init_def = build_vector (vectype, elts);
3742 break;
3745 /* Option2: the first element is INIT_VAL. */
3746 elts[0] = init_val;
3747 if (TREE_CONSTANT (init_val))
3748 init_def = build_vector (vectype, elts);
3749 else
3751 vec<constructor_elt, va_gc> *v;
3752 vec_alloc (v, nunits);
3753 CONSTRUCTOR_APPEND_ELT (v, NULL_TREE, init_val);
3754 for (i = 1; i < nunits; ++i)
3755 CONSTRUCTOR_APPEND_ELT (v, NULL_TREE, elts[i]);
3756 init_def = build_constructor (vectype, v);
3759 break;
3761 case MIN_EXPR:
3762 case MAX_EXPR:
3763 case COND_EXPR:
3764 if (adjustment_def)
3766 *adjustment_def = NULL_TREE;
3767 init_def = vect_get_vec_def_for_operand (init_val, stmt, NULL);
3768 break;
3771 init_def = build_vector_from_val (vectype, init_value);
3772 break;
3774 default:
3775 gcc_unreachable ();
3778 return init_def;
3782 /* Function vect_create_epilog_for_reduction
3784 Create code at the loop-epilog to finalize the result of a reduction
3785 computation.
3787 VECT_DEFS is list of vector of partial results, i.e., the lhs's of vector
3788 reduction statements.
3789 STMT is the scalar reduction stmt that is being vectorized.
3790 NCOPIES is > 1 in case the vectorization factor (VF) is bigger than the
3791 number of elements that we can fit in a vectype (nunits). In this case
3792 we have to generate more than one vector stmt - i.e., we need to "unroll"
3793 the vector stmt by a factor VF/nunits. For more details see documentation
3794 in vectorizable_operation.
3795 REDUC_CODE is the tree-code for the epilog reduction.
3796 REDUCTION_PHIS is a list of the phi-nodes that carry the reduction
3797 computation.
3798 REDUC_INDEX is the index of the operand in the right hand side of the
3799 statement that is defined by REDUCTION_PHI.
3800 DOUBLE_REDUC is TRUE if double reduction phi nodes should be handled.
3801 SLP_NODE is an SLP node containing a group of reduction statements. The
3802 first one in this group is STMT.
3804 This function:
3805 1. Creates the reduction def-use cycles: sets the arguments for
3806 REDUCTION_PHIS:
3807 The loop-entry argument is the vectorized initial-value of the reduction.
3808 The loop-latch argument is taken from VECT_DEFS - the vector of partial
3809 sums.
3810 2. "Reduces" each vector of partial results VECT_DEFS into a single result,
3811 by applying the operation specified by REDUC_CODE if available, or by
3812 other means (whole-vector shifts or a scalar loop).
3813 The function also creates a new phi node at the loop exit to preserve
3814 loop-closed form, as illustrated below.
3816 The flow at the entry to this function:
3818 loop:
3819 vec_def = phi <null, null> # REDUCTION_PHI
3820 VECT_DEF = vector_stmt # vectorized form of STMT
3821 s_loop = scalar_stmt # (scalar) STMT
3822 loop_exit:
3823 s_out0 = phi <s_loop> # (scalar) EXIT_PHI
3824 use <s_out0>
3825 use <s_out0>
3827 The above is transformed by this function into:
3829 loop:
3830 vec_def = phi <vec_init, VECT_DEF> # REDUCTION_PHI
3831 VECT_DEF = vector_stmt # vectorized form of STMT
3832 s_loop = scalar_stmt # (scalar) STMT
3833 loop_exit:
3834 s_out0 = phi <s_loop> # (scalar) EXIT_PHI
3835 v_out1 = phi <VECT_DEF> # NEW_EXIT_PHI
3836 v_out2 = reduce <v_out1>
3837 s_out3 = extract_field <v_out2, 0>
3838 s_out4 = adjust_result <s_out3>
3839 use <s_out4>
3840 use <s_out4>
3843 static void
3844 vect_create_epilog_for_reduction (vec<tree> vect_defs, gimple stmt,
3845 int ncopies, enum tree_code reduc_code,
3846 vec<gimple> reduction_phis,
3847 int reduc_index, bool double_reduc,
3848 slp_tree slp_node)
3850 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
3851 stmt_vec_info prev_phi_info;
3852 tree vectype;
3853 enum machine_mode mode;
3854 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
3855 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo), *outer_loop = NULL;
3856 basic_block exit_bb;
3857 tree scalar_dest;
3858 tree scalar_type;
3859 gimple new_phi = NULL, phi;
3860 gimple_stmt_iterator exit_gsi;
3861 tree vec_dest;
3862 tree new_temp = NULL_TREE, new_dest, new_name, new_scalar_dest;
3863 gimple epilog_stmt = NULL;
3864 enum tree_code code = gimple_assign_rhs_code (stmt);
3865 gimple exit_phi;
3866 tree bitsize, bitpos;
3867 tree adjustment_def = NULL;
3868 tree vec_initial_def = NULL;
3869 tree reduction_op, expr, def;
3870 tree orig_name, scalar_result;
3871 imm_use_iterator imm_iter, phi_imm_iter;
3872 use_operand_p use_p, phi_use_p;
3873 bool extract_scalar_result = false;
3874 gimple use_stmt, orig_stmt, reduction_phi = NULL;
3875 bool nested_in_vect_loop = false;
3876 auto_vec<gimple> new_phis;
3877 auto_vec<gimple> inner_phis;
3878 enum vect_def_type dt = vect_unknown_def_type;
3879 int j, i;
3880 auto_vec<tree> scalar_results;
3881 unsigned int group_size = 1, k, ratio;
3882 auto_vec<tree> vec_initial_defs;
3883 auto_vec<gimple> phis;
3884 bool slp_reduc = false;
3885 tree new_phi_result;
3886 gimple inner_phi = NULL;
3888 if (slp_node)
3889 group_size = SLP_TREE_SCALAR_STMTS (slp_node).length ();
3891 if (nested_in_vect_loop_p (loop, stmt))
3893 outer_loop = loop;
3894 loop = loop->inner;
3895 nested_in_vect_loop = true;
3896 gcc_assert (!slp_node);
3899 switch (get_gimple_rhs_class (gimple_assign_rhs_code (stmt)))
3901 case GIMPLE_SINGLE_RHS:
3902 gcc_assert (TREE_OPERAND_LENGTH (gimple_assign_rhs1 (stmt))
3903 == ternary_op);
3904 reduction_op = TREE_OPERAND (gimple_assign_rhs1 (stmt), reduc_index);
3905 break;
3906 case GIMPLE_UNARY_RHS:
3907 reduction_op = gimple_assign_rhs1 (stmt);
3908 break;
3909 case GIMPLE_BINARY_RHS:
3910 reduction_op = reduc_index ?
3911 gimple_assign_rhs2 (stmt) : gimple_assign_rhs1 (stmt);
3912 break;
3913 case GIMPLE_TERNARY_RHS:
3914 reduction_op = gimple_op (stmt, reduc_index + 1);
3915 break;
3916 default:
3917 gcc_unreachable ();
3920 vectype = get_vectype_for_scalar_type (TREE_TYPE (reduction_op));
3921 gcc_assert (vectype);
3922 mode = TYPE_MODE (vectype);
3924 /* 1. Create the reduction def-use cycle:
3925 Set the arguments of REDUCTION_PHIS, i.e., transform
3927 loop:
3928 vec_def = phi <null, null> # REDUCTION_PHI
3929 VECT_DEF = vector_stmt # vectorized form of STMT
3932 into:
3934 loop:
3935 vec_def = phi <vec_init, VECT_DEF> # REDUCTION_PHI
3936 VECT_DEF = vector_stmt # vectorized form of STMT
3939 (in case of SLP, do it for all the phis). */
3941 /* Get the loop-entry arguments. */
3942 if (slp_node)
3943 vect_get_vec_defs (reduction_op, NULL_TREE, stmt, &vec_initial_defs,
3944 NULL, slp_node, reduc_index);
3945 else
3947 vec_initial_defs.create (1);
3948 /* For the case of reduction, vect_get_vec_def_for_operand returns
3949 the scalar def before the loop, that defines the initial value
3950 of the reduction variable. */
3951 vec_initial_def = vect_get_vec_def_for_operand (reduction_op, stmt,
3952 &adjustment_def);
3953 vec_initial_defs.quick_push (vec_initial_def);
3956 /* Set phi nodes arguments. */
3957 FOR_EACH_VEC_ELT (reduction_phis, i, phi)
3959 tree vec_init_def, def;
3960 gimple_seq stmts;
3961 vec_init_def = force_gimple_operand (vec_initial_defs[i], &stmts,
3962 true, NULL_TREE);
3963 gsi_insert_seq_on_edge_immediate (loop_preheader_edge (loop), stmts);
3964 def = vect_defs[i];
3965 for (j = 0; j < ncopies; j++)
3967 /* Set the loop-entry arg of the reduction-phi. */
3968 add_phi_arg (as_a <gimple_phi> (phi), vec_init_def,
3969 loop_preheader_edge (loop), UNKNOWN_LOCATION);
3971 /* Set the loop-latch arg for the reduction-phi. */
3972 if (j > 0)
3973 def = vect_get_vec_def_for_stmt_copy (vect_unknown_def_type, def);
3975 add_phi_arg (as_a <gimple_phi> (phi), def, loop_latch_edge (loop),
3976 UNKNOWN_LOCATION);
3978 if (dump_enabled_p ())
3980 dump_printf_loc (MSG_NOTE, vect_location,
3981 "transform reduction: created def-use cycle: ");
3982 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, phi, 0);
3983 dump_printf (MSG_NOTE, "\n");
3984 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, SSA_NAME_DEF_STMT (def), 0);
3985 dump_printf (MSG_NOTE, "\n");
3988 phi = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (phi));
3992 /* 2. Create epilog code.
3993 The reduction epilog code operates across the elements of the vector
3994 of partial results computed by the vectorized loop.
3995 The reduction epilog code consists of:
3997 step 1: compute the scalar result in a vector (v_out2)
3998 step 2: extract the scalar result (s_out3) from the vector (v_out2)
3999 step 3: adjust the scalar result (s_out3) if needed.
4001 Step 1 can be accomplished using one of the following three schemes:
4002 (scheme 1) using reduc_code, if available.
4003 (scheme 2) using whole-vector shifts, if available.
4004 (scheme 3) using a scalar loop. In this case steps 1+2 above are
4005 combined.
4007 The overall epilog code looks like this:
4009 s_out0 = phi <s_loop> # original EXIT_PHI
4010 v_out1 = phi <VECT_DEF> # NEW_EXIT_PHI
4011 v_out2 = reduce <v_out1> # step 1
4012 s_out3 = extract_field <v_out2, 0> # step 2
4013 s_out4 = adjust_result <s_out3> # step 3
4015 (step 3 is optional, and steps 1 and 2 may be combined).
4016 Lastly, the uses of s_out0 are replaced by s_out4. */
4019 /* 2.1 Create new loop-exit-phis to preserve loop-closed form:
4020 v_out1 = phi <VECT_DEF>
4021 Store them in NEW_PHIS. */
4023 exit_bb = single_exit (loop)->dest;
4024 prev_phi_info = NULL;
4025 new_phis.create (vect_defs.length ());
4026 FOR_EACH_VEC_ELT (vect_defs, i, def)
4028 for (j = 0; j < ncopies; j++)
4030 tree new_def = copy_ssa_name (def, NULL);
4031 phi = create_phi_node (new_def, exit_bb);
4032 set_vinfo_for_stmt (phi, new_stmt_vec_info (phi, loop_vinfo, NULL));
4033 if (j == 0)
4034 new_phis.quick_push (phi);
4035 else
4037 def = vect_get_vec_def_for_stmt_copy (dt, def);
4038 STMT_VINFO_RELATED_STMT (prev_phi_info) = phi;
4041 SET_PHI_ARG_DEF (phi, single_exit (loop)->dest_idx, def);
4042 prev_phi_info = vinfo_for_stmt (phi);
4046 /* The epilogue is created for the outer-loop, i.e., for the loop being
4047 vectorized. Create exit phis for the outer loop. */
4048 if (double_reduc)
4050 loop = outer_loop;
4051 exit_bb = single_exit (loop)->dest;
4052 inner_phis.create (vect_defs.length ());
4053 FOR_EACH_VEC_ELT (new_phis, i, phi)
4055 tree new_result = copy_ssa_name (PHI_RESULT (phi), NULL);
4056 gimple_phi outer_phi = create_phi_node (new_result, exit_bb);
4057 SET_PHI_ARG_DEF (outer_phi, single_exit (loop)->dest_idx,
4058 PHI_RESULT (phi));
4059 set_vinfo_for_stmt (outer_phi, new_stmt_vec_info (outer_phi,
4060 loop_vinfo, NULL));
4061 inner_phis.quick_push (phi);
4062 new_phis[i] = outer_phi;
4063 prev_phi_info = vinfo_for_stmt (outer_phi);
4064 while (STMT_VINFO_RELATED_STMT (vinfo_for_stmt (phi)))
4066 phi = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (phi));
4067 new_result = copy_ssa_name (PHI_RESULT (phi), NULL);
4068 outer_phi = create_phi_node (new_result, exit_bb);
4069 SET_PHI_ARG_DEF (outer_phi, single_exit (loop)->dest_idx,
4070 PHI_RESULT (phi));
4071 set_vinfo_for_stmt (outer_phi, new_stmt_vec_info (outer_phi,
4072 loop_vinfo, NULL));
4073 STMT_VINFO_RELATED_STMT (prev_phi_info) = outer_phi;
4074 prev_phi_info = vinfo_for_stmt (outer_phi);
4079 exit_gsi = gsi_after_labels (exit_bb);
4081 /* 2.2 Get the relevant tree-code to use in the epilog for schemes 2,3
4082 (i.e. when reduc_code is not available) and in the final adjustment
4083 code (if needed). Also get the original scalar reduction variable as
4084 defined in the loop. In case STMT is a "pattern-stmt" (i.e., it
4085 represents a reduction pattern), the tree-code and scalar-def are
4086 taken from the original stmt that the pattern-stmt (STMT) replaces.
4087 Otherwise (it is a regular reduction) - the tree-code and scalar-def
4088 are taken from STMT. */
4090 orig_stmt = STMT_VINFO_RELATED_STMT (stmt_info);
4091 if (!orig_stmt)
4093 /* Regular reduction */
4094 orig_stmt = stmt;
4096 else
4098 /* Reduction pattern */
4099 stmt_vec_info stmt_vinfo = vinfo_for_stmt (orig_stmt);
4100 gcc_assert (STMT_VINFO_IN_PATTERN_P (stmt_vinfo));
4101 gcc_assert (STMT_VINFO_RELATED_STMT (stmt_vinfo) == stmt);
4104 code = gimple_assign_rhs_code (orig_stmt);
4105 /* For MINUS_EXPR the initial vector is [init_val,0,...,0], therefore,
4106 partial results are added and not subtracted. */
4107 if (code == MINUS_EXPR)
4108 code = PLUS_EXPR;
4110 scalar_dest = gimple_assign_lhs (orig_stmt);
4111 scalar_type = TREE_TYPE (scalar_dest);
4112 scalar_results.create (group_size);
4113 new_scalar_dest = vect_create_destination_var (scalar_dest, NULL);
4114 bitsize = TYPE_SIZE (scalar_type);
4116 /* In case this is a reduction in an inner-loop while vectorizing an outer
4117 loop - we don't need to extract a single scalar result at the end of the
4118 inner-loop (unless it is double reduction, i.e., the use of reduction is
4119 outside the outer-loop). The final vector of partial results will be used
4120 in the vectorized outer-loop, or reduced to a scalar result at the end of
4121 the outer-loop. */
4122 if (nested_in_vect_loop && !double_reduc)
4123 goto vect_finalize_reduction;
4125 /* SLP reduction without reduction chain, e.g.,
4126 # a1 = phi <a2, a0>
4127 # b1 = phi <b2, b0>
4128 a2 = operation (a1)
4129 b2 = operation (b1) */
4130 slp_reduc = (slp_node && !GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)));
4132 /* In case of reduction chain, e.g.,
4133 # a1 = phi <a3, a0>
4134 a2 = operation (a1)
4135 a3 = operation (a2),
4137 we may end up with more than one vector result. Here we reduce them to
4138 one vector. */
4139 if (GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)))
4141 tree first_vect = PHI_RESULT (new_phis[0]);
4142 tree tmp;
4143 gimple_assign new_vec_stmt = NULL;
4145 vec_dest = vect_create_destination_var (scalar_dest, vectype);
4146 for (k = 1; k < new_phis.length (); k++)
4148 gimple next_phi = new_phis[k];
4149 tree second_vect = PHI_RESULT (next_phi);
4151 tmp = build2 (code, vectype, first_vect, second_vect);
4152 new_vec_stmt = gimple_build_assign (vec_dest, tmp);
4153 first_vect = make_ssa_name (vec_dest, new_vec_stmt);
4154 gimple_assign_set_lhs (new_vec_stmt, first_vect);
4155 gsi_insert_before (&exit_gsi, new_vec_stmt, GSI_SAME_STMT);
4158 new_phi_result = first_vect;
4159 if (new_vec_stmt)
4161 new_phis.truncate (0);
4162 new_phis.safe_push (new_vec_stmt);
4165 else
4166 new_phi_result = PHI_RESULT (new_phis[0]);
4168 /* 2.3 Create the reduction code, using one of the three schemes described
4169 above. In SLP we simply need to extract all the elements from the
4170 vector (without reducing them), so we use scalar shifts. */
4171 if (reduc_code != ERROR_MARK && !slp_reduc)
4173 tree tmp;
4175 /*** Case 1: Create:
4176 v_out2 = reduc_expr <v_out1> */
4178 if (dump_enabled_p ())
4179 dump_printf_loc (MSG_NOTE, vect_location,
4180 "Reduce using direct vector reduction.\n");
4182 vec_dest = vect_create_destination_var (scalar_dest, vectype);
4183 tmp = build1 (reduc_code, vectype, new_phi_result);
4184 epilog_stmt = gimple_build_assign (vec_dest, tmp);
4185 new_temp = make_ssa_name (vec_dest, epilog_stmt);
4186 gimple_assign_set_lhs (epilog_stmt, new_temp);
4187 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
4189 extract_scalar_result = true;
4191 else
4193 enum tree_code shift_code = ERROR_MARK;
4194 bool have_whole_vector_shift = true;
4195 int bit_offset;
4196 int element_bitsize = tree_to_uhwi (bitsize);
4197 int vec_size_in_bits = tree_to_uhwi (TYPE_SIZE (vectype));
4198 tree vec_temp;
4200 if (optab_handler (vec_shr_optab, mode) != CODE_FOR_nothing)
4201 shift_code = VEC_RSHIFT_EXPR;
4202 else
4203 have_whole_vector_shift = false;
4205 /* Regardless of whether we have a whole vector shift, if we're
4206 emulating the operation via tree-vect-generic, we don't want
4207 to use it. Only the first round of the reduction is likely
4208 to still be profitable via emulation. */
4209 /* ??? It might be better to emit a reduction tree code here, so that
4210 tree-vect-generic can expand the first round via bit tricks. */
4211 if (!VECTOR_MODE_P (mode))
4212 have_whole_vector_shift = false;
4213 else
4215 optab optab = optab_for_tree_code (code, vectype, optab_default);
4216 if (optab_handler (optab, mode) == CODE_FOR_nothing)
4217 have_whole_vector_shift = false;
4220 if (have_whole_vector_shift && !slp_reduc)
4222 /*** Case 2: Create:
4223 for (offset = VS/2; offset >= element_size; offset/=2)
4225 Create: va' = vec_shift <va, offset>
4226 Create: va = vop <va, va'>
4227 } */
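/* Illustration (hypothetical V4SI input for a sum reduction; element order
   depends on the target's extraction position): starting from {a0,a1,a2,a3},
   the first iteration shifts by two elements and adds, producing pairwise
   partial sums such as {a0+a2, a1+a3, ...}; the next iteration shifts by one
   element and adds, so one element then holds a0+a1+a2+a3, which step 2
   extracts.  */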
4229 if (dump_enabled_p ())
4230 dump_printf_loc (MSG_NOTE, vect_location,
4231 "Reduce using vector shifts\n");
4233 vec_dest = vect_create_destination_var (scalar_dest, vectype);
4234 new_temp = new_phi_result;
4235 for (bit_offset = vec_size_in_bits/2;
4236 bit_offset >= element_bitsize;
4237 bit_offset /= 2)
4239 tree bitpos = size_int (bit_offset);
4241 epilog_stmt = gimple_build_assign_with_ops (shift_code,
4242 vec_dest, new_temp, bitpos);
4243 new_name = make_ssa_name (vec_dest, epilog_stmt);
4244 gimple_assign_set_lhs (epilog_stmt, new_name);
4245 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
4247 epilog_stmt = gimple_build_assign_with_ops (code, vec_dest,
4248 new_name, new_temp);
4249 new_temp = make_ssa_name (vec_dest, epilog_stmt);
4250 gimple_assign_set_lhs (epilog_stmt, new_temp);
4251 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
4254 extract_scalar_result = true;
4256 else
4258 tree rhs;
4260 /*** Case 3: Create:
4261 s = extract_field <v_out2, 0>
4262 for (offset = element_size;
4263 offset < vector_size;
4264 offset += element_size;)
4266 Create: s' = extract_field <v_out2, offset>
4267 Create: s = op <s, s'> // For non SLP cases
4268 } */
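/* Illustration (hypothetical V4SI vector {a0,a1,a2,a3}): in the non-SLP case
   the code below emits s = a0; s = s op a1; s = s op a2; s = s op a3, while
   in the SLP case each extracted element is simply pushed onto
   SCALAR_RESULTS without being combined.  */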
4270 if (dump_enabled_p ())
4271 dump_printf_loc (MSG_NOTE, vect_location,
4272 "Reduce using scalar code.\n");
4274 vec_size_in_bits = tree_to_uhwi (TYPE_SIZE (vectype));
4275 FOR_EACH_VEC_ELT (new_phis, i, new_phi)
4277 if (gimple_code (new_phi) == GIMPLE_PHI)
4278 vec_temp = PHI_RESULT (new_phi);
4279 else
4280 vec_temp = gimple_assign_lhs (new_phi);
4281 rhs = build3 (BIT_FIELD_REF, scalar_type, vec_temp, bitsize,
4282 bitsize_zero_node);
4283 epilog_stmt = gimple_build_assign (new_scalar_dest, rhs);
4284 new_temp = make_ssa_name (new_scalar_dest, epilog_stmt);
4285 gimple_assign_set_lhs (epilog_stmt, new_temp);
4286 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
4288 /* In SLP we don't need to apply reduction operation, so we just
4289 collect s' values in SCALAR_RESULTS. */
4290 if (slp_reduc)
4291 scalar_results.safe_push (new_temp);
4293 for (bit_offset = element_bitsize;
4294 bit_offset < vec_size_in_bits;
4295 bit_offset += element_bitsize)
4297 tree bitpos = bitsize_int (bit_offset);
4298 tree rhs = build3 (BIT_FIELD_REF, scalar_type, vec_temp,
4299 bitsize, bitpos);
4301 epilog_stmt = gimple_build_assign (new_scalar_dest, rhs);
4302 new_name = make_ssa_name (new_scalar_dest, epilog_stmt);
4303 gimple_assign_set_lhs (epilog_stmt, new_name);
4304 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
4306 if (slp_reduc)
4308 /* In SLP we don't need to apply reduction operation, so
4309 we just collect s' values in SCALAR_RESULTS. */
4310 new_temp = new_name;
4311 scalar_results.safe_push (new_name);
4313 else
4315 epilog_stmt = gimple_build_assign_with_ops (code,
4316 new_scalar_dest, new_name, new_temp);
4317 new_temp = make_ssa_name (new_scalar_dest, epilog_stmt);
4318 gimple_assign_set_lhs (epilog_stmt, new_temp);
4319 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
4324 /* The only case where we need to reduce scalar results in SLP is
4325 unrolling. If the size of SCALAR_RESULTS is greater than
4326 GROUP_SIZE, we reduce them by combining elements modulo
4327 GROUP_SIZE. */
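/* Illustration (hypothetical GROUP_SIZE == 2 with four scalar results
   r0,r1,r2,r3 produced by unrolling): the loop below combines r0 with r2
   and r1 with r3, leaving one result per group element.  */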
4328 if (slp_reduc)
4330 tree res, first_res, new_res;
4331 gimple new_stmt;
4333 /* Reduce multiple scalar results in case of SLP unrolling. */
4334 for (j = group_size; scalar_results.iterate (j, &res);
4335 j++)
4337 first_res = scalar_results[j % group_size];
4338 new_stmt = gimple_build_assign_with_ops (code,
4339 new_scalar_dest, first_res, res);
4340 new_res = make_ssa_name (new_scalar_dest, new_stmt);
4341 gimple_assign_set_lhs (new_stmt, new_res);
4342 gsi_insert_before (&exit_gsi, new_stmt, GSI_SAME_STMT);
4343 scalar_results[j % group_size] = new_res;
4346 else
4347 /* Not SLP - we have one scalar to keep in SCALAR_RESULTS. */
4348 scalar_results.safe_push (new_temp);
4350 extract_scalar_result = false;
4354 /* 2.4 Extract the final scalar result. Create:
4355 s_out3 = extract_field <v_out2, bitpos> */
4357 if (extract_scalar_result)
4359 tree rhs;
4361 if (dump_enabled_p ())
4362 dump_printf_loc (MSG_NOTE, vect_location,
4363 "extract scalar result\n");
4365 if (BYTES_BIG_ENDIAN)
4366 bitpos = size_binop (MULT_EXPR,
4367 bitsize_int (TYPE_VECTOR_SUBPARTS (vectype) - 1),
4368 TYPE_SIZE (scalar_type));
4369 else
4370 bitpos = bitsize_zero_node;
4372 rhs = build3 (BIT_FIELD_REF, scalar_type, new_temp, bitsize, bitpos);
4373 epilog_stmt = gimple_build_assign (new_scalar_dest, rhs);
4374 new_temp = make_ssa_name (new_scalar_dest, epilog_stmt);
4375 gimple_assign_set_lhs (epilog_stmt, new_temp);
4376 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
4377 scalar_results.safe_push (new_temp);
4380 vect_finalize_reduction:
4382 if (double_reduc)
4383 loop = loop->inner;
4385 /* 2.5 Adjust the final result by the initial value of the reduction
4386 variable. (When such adjustment is not needed, then
4387 'adjustment_def' is zero). For example, if code is PLUS we create:
4388 new_temp = loop_exit_def + adjustment_def */
4390 if (adjustment_def)
4392 gcc_assert (!slp_reduc);
4393 if (nested_in_vect_loop)
4395 new_phi = new_phis[0];
4396 gcc_assert (TREE_CODE (TREE_TYPE (adjustment_def)) == VECTOR_TYPE);
4397 expr = build2 (code, vectype, PHI_RESULT (new_phi), adjustment_def);
4398 new_dest = vect_create_destination_var (scalar_dest, vectype);
4400 else
4402 new_temp = scalar_results[0];
4403 gcc_assert (TREE_CODE (TREE_TYPE (adjustment_def)) != VECTOR_TYPE);
4404 expr = build2 (code, scalar_type, new_temp, adjustment_def);
4405 new_dest = vect_create_destination_var (scalar_dest, scalar_type);
4408 epilog_stmt = gimple_build_assign (new_dest, expr);
4409 new_temp = make_ssa_name (new_dest, epilog_stmt);
4410 gimple_assign_set_lhs (epilog_stmt, new_temp);
4411 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
4412 if (nested_in_vect_loop)
4414 set_vinfo_for_stmt (epilog_stmt,
4415 new_stmt_vec_info (epilog_stmt, loop_vinfo,
4416 NULL));
4417 STMT_VINFO_RELATED_STMT (vinfo_for_stmt (epilog_stmt)) =
4418 STMT_VINFO_RELATED_STMT (vinfo_for_stmt (new_phi));
4420 if (!double_reduc)
4421 scalar_results.quick_push (new_temp);
4422 else
4423 scalar_results[0] = new_temp;
4425 else
4426 scalar_results[0] = new_temp;
4428 new_phis[0] = epilog_stmt;
4431 /* 2.6 Handle the loop-exit phis. Replace the uses of scalar loop-exit
4432 phis with new adjusted scalar results, i.e., replace use <s_out0>
4433 with use <s_out4>.
4435 Transform:
4436 loop_exit:
4437 s_out0 = phi <s_loop> # (scalar) EXIT_PHI
4438 v_out1 = phi <VECT_DEF> # NEW_EXIT_PHI
4439 v_out2 = reduce <v_out1>
4440 s_out3 = extract_field <v_out2, 0>
4441 s_out4 = adjust_result <s_out3>
4442 use <s_out0>
4443 use <s_out0>
4445 into:
4447 loop_exit:
4448 s_out0 = phi <s_loop> # (scalar) EXIT_PHI
4449 v_out1 = phi <VECT_DEF> # NEW_EXIT_PHI
4450 v_out2 = reduce <v_out1>
4451 s_out3 = extract_field <v_out2, 0>
4452 s_out4 = adjust_result <s_out3>
4453 use <s_out4>
4454 use <s_out4> */
4457 /* In SLP reduction chain we reduce vector results into one vector if
4458 necessary, hence we set here GROUP_SIZE to 1. SCALAR_DEST is the LHS of
4459 the last stmt in the reduction chain, since we are looking for the loop
4460 exit phi node. */
4461 if (GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)))
4463 scalar_dest = gimple_assign_lhs (
4464 SLP_TREE_SCALAR_STMTS (slp_node)[group_size - 1]);
4465 group_size = 1;
4468 /* In SLP we may have several statements in NEW_PHIS and REDUCTION_PHIS (in
4469 case GROUP_SIZE is greater than the vectorization factor). Therefore, we
4470 need to match SCALAR_RESULTS with corresponding statements. The first
4471 (GROUP_SIZE / number of new vector stmts) scalar results correspond to
4472 the first vector stmt, etc.
4473 (RATIO is equal to (GROUP_SIZE / number of new vector stmts)). */
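/* Illustration (hypothetical sizes): with GROUP_SIZE == 4 and two entries in
   NEW_PHIS, RATIO is 2, so scalar results 0-1 are matched with the first
   vector stmt/reduction phi and scalar results 2-3 with the second.  */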
4474 if (group_size > new_phis.length ())
4476 ratio = group_size / new_phis.length ();
4477 gcc_assert (!(group_size % new_phis.length ()));
4479 else
4480 ratio = 1;
4482 for (k = 0; k < group_size; k++)
4484 if (k % ratio == 0)
4486 epilog_stmt = new_phis[k / ratio];
4487 reduction_phi = reduction_phis[k / ratio];
4488 if (double_reduc)
4489 inner_phi = inner_phis[k / ratio];
4492 if (slp_reduc)
4494 gimple current_stmt = SLP_TREE_SCALAR_STMTS (slp_node)[k];
4496 orig_stmt = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (current_stmt));
4497 /* SLP statements can't participate in patterns. */
4498 gcc_assert (!orig_stmt);
4499 scalar_dest = gimple_assign_lhs (current_stmt);
4502 phis.create (3);
4503 /* Find the loop-closed-use at the loop exit of the original scalar
4504 result. (The reduction result is expected to have two immediate uses -
4505 one at the latch block, and one at the loop exit). */
4506 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, scalar_dest)
4507 if (!flow_bb_inside_loop_p (loop, gimple_bb (USE_STMT (use_p)))
4508 && !is_gimple_debug (USE_STMT (use_p)))
4509 phis.safe_push (USE_STMT (use_p));
4511 /* While we expect to have found an exit_phi because of loop-closed-ssa
4512 form we can end up without one if the scalar cycle is dead. */
4514 FOR_EACH_VEC_ELT (phis, i, exit_phi)
4516 if (outer_loop)
4518 stmt_vec_info exit_phi_vinfo = vinfo_for_stmt (exit_phi);
4519 gimple_phi vect_phi;
4521 /* FORNOW. Currently not supporting the case that an inner-loop
4522 reduction is not used in the outer-loop (but only outside the
4523 outer-loop), unless it is double reduction. */
4524 gcc_assert ((STMT_VINFO_RELEVANT_P (exit_phi_vinfo)
4525 && !STMT_VINFO_LIVE_P (exit_phi_vinfo))
4526 || double_reduc);
4528 STMT_VINFO_VEC_STMT (exit_phi_vinfo) = epilog_stmt;
4529 if (!double_reduc
4530 || STMT_VINFO_DEF_TYPE (exit_phi_vinfo)
4531 != vect_double_reduction_def)
4532 continue;
4534 /* Handle double reduction:
4536 stmt1: s1 = phi <s0, s2> - double reduction phi (outer loop)
4537 stmt2: s3 = phi <s1, s4> - (regular) reduc phi (inner loop)
4538 stmt3: s4 = use (s3) - (regular) reduc stmt (inner loop)
4539 stmt4: s2 = phi <s4> - double reduction stmt (outer loop)
4541 At that point the regular reduction (stmt2 and stmt3) is
4542 already vectorized, as well as the exit phi node, stmt4.
4543 Here we vectorize the phi node of double reduction, stmt1, and
4544 update all relevant statements. */
4546 /* Go through all the uses of s2 to find double reduction phi
4547 node, i.e., stmt1 above. */
4548 orig_name = PHI_RESULT (exit_phi);
4549 FOR_EACH_IMM_USE_STMT (use_stmt, imm_iter, orig_name)
4551 stmt_vec_info use_stmt_vinfo;
4552 stmt_vec_info new_phi_vinfo;
4553 tree vect_phi_init, preheader_arg, vect_phi_res, init_def;
4554 basic_block bb = gimple_bb (use_stmt);
4555 gimple use;
4557 /* Check that USE_STMT is really double reduction phi
4558 node. */
4559 if (gimple_code (use_stmt) != GIMPLE_PHI
4560 || gimple_phi_num_args (use_stmt) != 2
4561 || bb->loop_father != outer_loop)
4562 continue;
4563 use_stmt_vinfo = vinfo_for_stmt (use_stmt);
4564 if (!use_stmt_vinfo
4565 || STMT_VINFO_DEF_TYPE (use_stmt_vinfo)
4566 != vect_double_reduction_def)
4567 continue;
4569 /* Create vector phi node for double reduction:
4570 vs1 = phi <vs0, vs2>
4571 vs1 was created previously in this function by a call to
4572 vect_get_vec_def_for_operand and is stored in
4573 vec_initial_def;
4574 vs2 is defined by INNER_PHI, the vectorized EXIT_PHI;
4575 vs0 is created here. */
4577 /* Create vector phi node. */
4578 vect_phi = create_phi_node (vec_initial_def, bb);
4579 new_phi_vinfo = new_stmt_vec_info (vect_phi,
4580 loop_vec_info_for_loop (outer_loop), NULL);
4581 set_vinfo_for_stmt (vect_phi, new_phi_vinfo);
4583 /* Create vs0 - initial def of the double reduction phi. */
4584 preheader_arg = PHI_ARG_DEF_FROM_EDGE (use_stmt,
4585 loop_preheader_edge (outer_loop));
4586 init_def = get_initial_def_for_reduction (stmt,
4587 preheader_arg, NULL);
4588 vect_phi_init = vect_init_vector (use_stmt, init_def,
4589 vectype, NULL);
4591 /* Update phi node arguments with vs0 and vs2. */
4592 add_phi_arg (vect_phi, vect_phi_init,
4593 loop_preheader_edge (outer_loop),
4594 UNKNOWN_LOCATION);
4595 add_phi_arg (vect_phi, PHI_RESULT (inner_phi),
4596 loop_latch_edge (outer_loop), UNKNOWN_LOCATION);
4597 if (dump_enabled_p ())
4599 dump_printf_loc (MSG_NOTE, vect_location,
4600 "created double reduction phi node: ");
4601 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, vect_phi, 0);
4602 dump_printf (MSG_NOTE, "\n");
4605 vect_phi_res = PHI_RESULT (vect_phi);
4607 /* Replace the use, i.e., set the correct vs1 in the regular
4608 reduction phi node. FORNOW, NCOPIES is always 1, so the
4609 loop is redundant. */
4610 use = reduction_phi;
4611 for (j = 0; j < ncopies; j++)
4613 edge pr_edge = loop_preheader_edge (loop);
4614 SET_PHI_ARG_DEF (use, pr_edge->dest_idx, vect_phi_res);
4615 use = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (use));
4621 phis.release ();
4622 if (nested_in_vect_loop)
4624 if (double_reduc)
4625 loop = outer_loop;
4626 else
4627 continue;
4630 phis.create (3);
4631 /* Find the loop-closed-use at the loop exit of the original scalar
4632 result. (The reduction result is expected to have two immediate uses,
4633 one at the latch block, and one at the loop exit). For double
4634 reductions we are looking for exit phis of the outer loop. */
4635 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, scalar_dest)
4637 if (!flow_bb_inside_loop_p (loop, gimple_bb (USE_STMT (use_p))))
4639 if (!is_gimple_debug (USE_STMT (use_p)))
4640 phis.safe_push (USE_STMT (use_p));
4642 else
4644 if (double_reduc && gimple_code (USE_STMT (use_p)) == GIMPLE_PHI)
4646 tree phi_res = PHI_RESULT (USE_STMT (use_p));
4648 FOR_EACH_IMM_USE_FAST (phi_use_p, phi_imm_iter, phi_res)
4650 if (!flow_bb_inside_loop_p (loop,
4651 gimple_bb (USE_STMT (phi_use_p)))
4652 && !is_gimple_debug (USE_STMT (phi_use_p)))
4653 phis.safe_push (USE_STMT (phi_use_p));
4659 FOR_EACH_VEC_ELT (phis, i, exit_phi)
4661 /* Replace the uses: */
4662 orig_name = PHI_RESULT (exit_phi);
4663 scalar_result = scalar_results[k];
4664 FOR_EACH_IMM_USE_STMT (use_stmt, imm_iter, orig_name)
4665 FOR_EACH_IMM_USE_ON_STMT (use_p, imm_iter)
4666 SET_USE (use_p, scalar_result);
4669 phis.release ();
4674 /* Function vectorizable_reduction.
4676 Check if STMT performs a reduction operation that can be vectorized.
4677 If VEC_STMT is also passed, vectorize the STMT: create a vectorized
4678 stmt to replace it, put it in VEC_STMT, and insert it at GSI.
4679 Return FALSE if not a vectorizable STMT, TRUE otherwise.
4681 This function also handles reduction idioms (patterns) that have been
4682 recognized in advance during vect_pattern_recog. In this case, STMT may be
4683 of this form:
4684 X = pattern_expr (arg0, arg1, ..., X)
4685 and its STMT_VINFO_RELATED_STMT points to the last stmt in the original
4686 sequence that had been detected and replaced by the pattern-stmt (STMT).
4688 In some cases of reduction patterns, the type of the reduction variable X is
4689 different than the type of the other arguments of STMT.
4690 In such cases, the vectype that is used when transforming STMT into a vector
4691 stmt is different than the vectype that is used to determine the
4692 vectorization factor, because it consists of a different number of elements
4693 than the actual number of elements that are being operated upon in parallel.
4695 For example, consider an accumulation of shorts into an int accumulator.
4696 On some targets it's possible to vectorize this pattern operating on 8
4697 shorts at a time (hence, the vectype for purposes of determining the
4698 vectorization factor should be V8HI); on the other hand, the vectype that
4699 is used to create the vector form is actually V4SI (the type of the result).
4701 Upon entry to this function, STMT_VINFO_VECTYPE records the vectype that
4702 indicates what is the actual level of parallelism (V8HI in the example), so
4703 that the right vectorization factor would be derived. This vectype
4704 corresponds to the type of arguments to the reduction stmt, and should *NOT*
4705 be used to create the vectorized stmt. The right vectype for the vectorized
4706 stmt is obtained from the type of the result X:
4707 get_vectype_for_scalar_type (TREE_TYPE (X))
4709 This means that, contrary to "regular" reductions (or "regular" stmts in
4710 general), the following equation:
4711 STMT_VINFO_VECTYPE == get_vectype_for_scalar_type (TREE_TYPE (X))
4712 does *NOT* necessarily hold for reduction patterns. */
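/* A sketch of such a pattern (target-dependent; shown only for
   illustration):

     short b[N]; int sum = 0;
     for (i = 0; i < N; i++)
       sum += b[i];

   may be recognized as sum_1 = WIDEN_SUM_EXPR <b[i], sum_0>, where the
   vectype recorded for computing the VF is V8HI (the type of b[i]) while
   the vectorized statement itself is created with V4SI (the type of sum).  */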
4714 bool
4715 vectorizable_reduction (gimple stmt, gimple_stmt_iterator *gsi,
4716 gimple *vec_stmt, slp_tree slp_node)
4718 tree vec_dest;
4719 tree scalar_dest;
4720 tree loop_vec_def0 = NULL_TREE, loop_vec_def1 = NULL_TREE;
4721 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
4722 tree vectype_out = STMT_VINFO_VECTYPE (stmt_info);
4723 tree vectype_in = NULL_TREE;
4724 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
4725 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
4726 enum tree_code code, orig_code, epilog_reduc_code;
4727 enum machine_mode vec_mode;
4728 int op_type;
4729 optab optab, reduc_optab;
4730 tree new_temp = NULL_TREE;
4731 tree def;
4732 gimple def_stmt;
4733 enum vect_def_type dt;
4734 gimple_phi new_phi = NULL;
4735 tree scalar_type;
4736 bool is_simple_use;
4737 gimple orig_stmt;
4738 stmt_vec_info orig_stmt_info;
4739 tree expr = NULL_TREE;
4740 int i;
4741 int ncopies;
4742 int epilog_copies;
4743 stmt_vec_info prev_stmt_info, prev_phi_info;
4744 bool single_defuse_cycle = false;
4745 tree reduc_def = NULL_TREE;
4746 gimple new_stmt = NULL;
4747 int j;
4748 tree ops[3];
4749 bool nested_cycle = false, found_nested_cycle_def = false;
4750 gimple reduc_def_stmt = NULL;
4751 /* The default is that the reduction variable is the last in statement. */
4752 int reduc_index = 2;
4753 bool double_reduc = false, dummy;
4754 basic_block def_bb;
4755 struct loop * def_stmt_loop, *outer_loop = NULL;
4756 tree def_arg;
4757 gimple def_arg_stmt;
4758 auto_vec<tree> vec_oprnds0;
4759 auto_vec<tree> vec_oprnds1;
4760 auto_vec<tree> vect_defs;
4761 auto_vec<gimple> phis;
4762 int vec_num;
4763 tree def0, def1, tem, op0, op1 = NULL_TREE;
4765 /* In case of reduction chain we switch to the first stmt in the chain, but
4766 we don't update STMT_INFO, since only the last stmt is marked as reduction
4767 and has reduction properties. */
4768 if (GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)))
4769 stmt = GROUP_FIRST_ELEMENT (stmt_info);
4771 if (nested_in_vect_loop_p (loop, stmt))
4773 outer_loop = loop;
4774 loop = loop->inner;
4775 nested_cycle = true;
4778 /* 1. Is vectorizable reduction? */
4779 /* Not supportable if the reduction variable is used in the loop, unless
4780 it's a reduction chain. */
4781 if (STMT_VINFO_RELEVANT (stmt_info) > vect_used_in_outer
4782 && !GROUP_FIRST_ELEMENT (stmt_info))
4783 return false;
4785 /* Reductions that are not used even in an enclosing outer-loop,
4786 are expected to be "live" (used out of the loop). */
4787 if (STMT_VINFO_RELEVANT (stmt_info) == vect_unused_in_scope
4788 && !STMT_VINFO_LIVE_P (stmt_info))
4789 return false;
4791 /* Make sure it was already recognized as a reduction computation. */
4792 if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_reduction_def
4793 && STMT_VINFO_DEF_TYPE (stmt_info) != vect_nested_cycle)
4794 return false;
4796 /* 2. Has this been recognized as a reduction pattern?
4798 Check if STMT represents a pattern that has been recognized
4799 in earlier analysis stages. For stmts that represent a pattern,
4800 the STMT_VINFO_RELATED_STMT field records the last stmt in
4801 the original sequence that constitutes the pattern. */
4803 orig_stmt = STMT_VINFO_RELATED_STMT (stmt_info);
4804 if (orig_stmt)
4806 orig_stmt_info = vinfo_for_stmt (orig_stmt);
4807 gcc_assert (STMT_VINFO_IN_PATTERN_P (orig_stmt_info));
4808 gcc_assert (!STMT_VINFO_IN_PATTERN_P (stmt_info));
4811 /* 3. Check the operands of the operation. The first operands are defined
4812 inside the loop body. The last operand is the reduction variable,
4813 which is defined by the loop-header-phi. */
4815 gcc_assert (is_gimple_assign (stmt));
4817 /* Flatten RHS. */
4818 switch (get_gimple_rhs_class (gimple_assign_rhs_code (stmt)))
4820 case GIMPLE_SINGLE_RHS:
4821 op_type = TREE_OPERAND_LENGTH (gimple_assign_rhs1 (stmt));
4822 if (op_type == ternary_op)
4824 tree rhs = gimple_assign_rhs1 (stmt);
4825 ops[0] = TREE_OPERAND (rhs, 0);
4826 ops[1] = TREE_OPERAND (rhs, 1);
4827 ops[2] = TREE_OPERAND (rhs, 2);
4828 code = TREE_CODE (rhs);
4830 else
4831 return false;
4832 break;
4834 case GIMPLE_BINARY_RHS:
4835 code = gimple_assign_rhs_code (stmt);
4836 op_type = TREE_CODE_LENGTH (code);
4837 gcc_assert (op_type == binary_op);
4838 ops[0] = gimple_assign_rhs1 (stmt);
4839 ops[1] = gimple_assign_rhs2 (stmt);
4840 break;
4842 case GIMPLE_TERNARY_RHS:
4843 code = gimple_assign_rhs_code (stmt);
4844 op_type = TREE_CODE_LENGTH (code);
4845 gcc_assert (op_type == ternary_op);
4846 ops[0] = gimple_assign_rhs1 (stmt);
4847 ops[1] = gimple_assign_rhs2 (stmt);
4848 ops[2] = gimple_assign_rhs3 (stmt);
4849 break;
4851 case GIMPLE_UNARY_RHS:
4852 return false;
4854 default:
4855 gcc_unreachable ();
4858 if (code == COND_EXPR && slp_node)
4859 return false;
4861 scalar_dest = gimple_assign_lhs (stmt);
4862 scalar_type = TREE_TYPE (scalar_dest);
4863 if (!POINTER_TYPE_P (scalar_type) && !INTEGRAL_TYPE_P (scalar_type)
4864 && !SCALAR_FLOAT_TYPE_P (scalar_type))
4865 return false;
4867 /* Do not try to vectorize bit-precision reductions. */
4868 if ((TYPE_PRECISION (scalar_type)
4869 != GET_MODE_PRECISION (TYPE_MODE (scalar_type))))
4870 return false;
4872 /* All uses but the last are expected to be defined in the loop.
4873 The last use is the reduction variable. In case of nested cycle this
4874 assumption is not true: we use reduc_index to record the index of the
4875 reduction variable. */
4876 for (i = 0; i < op_type - 1; i++)
4878 /* The condition of COND_EXPR is checked in vectorizable_condition(). */
4879 if (i == 0 && code == COND_EXPR)
4880 continue;
4882 is_simple_use = vect_is_simple_use_1 (ops[i], stmt, loop_vinfo, NULL,
4883 &def_stmt, &def, &dt, &tem);
4884 if (!vectype_in)
4885 vectype_in = tem;
4886 gcc_assert (is_simple_use);
4888 if (dt != vect_internal_def
4889 && dt != vect_external_def
4890 && dt != vect_constant_def
4891 && dt != vect_induction_def
4892 && !(dt == vect_nested_cycle && nested_cycle))
4893 return false;
4895 if (dt == vect_nested_cycle)
4897 found_nested_cycle_def = true;
4898 reduc_def_stmt = def_stmt;
4899 reduc_index = i;
4903 is_simple_use = vect_is_simple_use_1 (ops[i], stmt, loop_vinfo, NULL,
4904 &def_stmt, &def, &dt, &tem);
4905 if (!vectype_in)
4906 vectype_in = tem;
4907 gcc_assert (is_simple_use);
4908 if (!(dt == vect_reduction_def
4909 || dt == vect_nested_cycle
4910 || ((dt == vect_internal_def || dt == vect_external_def
4911 || dt == vect_constant_def || dt == vect_induction_def)
4912 && nested_cycle && found_nested_cycle_def)))
4914 /* For pattern recognized stmts, orig_stmt might be a reduction,
4915 but some helper statements for the pattern might not, or
4916 might be COND_EXPRs with reduction uses in the condition. */
4917 gcc_assert (orig_stmt);
4918 return false;
4920 if (!found_nested_cycle_def)
4921 reduc_def_stmt = def_stmt;
4923 gcc_assert (gimple_code (reduc_def_stmt) == GIMPLE_PHI);
4924 if (orig_stmt)
4925 gcc_assert (orig_stmt == vect_is_simple_reduction (loop_vinfo,
4926 reduc_def_stmt,
4927 !nested_cycle,
4928 &dummy));
4929 else
4931 gimple tmp = vect_is_simple_reduction (loop_vinfo, reduc_def_stmt,
4932 !nested_cycle, &dummy);
4933 /* We changed STMT to be the first stmt in reduction chain, hence we
4934 check that in this case the first element in the chain is STMT. */
4935 gcc_assert (stmt == tmp
4936 || GROUP_FIRST_ELEMENT (vinfo_for_stmt (tmp)) == stmt);
4939 if (STMT_VINFO_LIVE_P (vinfo_for_stmt (reduc_def_stmt)))
4940 return false;
4942 if (slp_node || PURE_SLP_STMT (stmt_info))
4943 ncopies = 1;
4944 else
4945 ncopies = (LOOP_VINFO_VECT_FACTOR (loop_vinfo)
4946 / TYPE_VECTOR_SUBPARTS (vectype_in));
4948 gcc_assert (ncopies >= 1);
4950 vec_mode = TYPE_MODE (vectype_in);
4952 if (code == COND_EXPR)
4954 if (!vectorizable_condition (stmt, gsi, NULL, ops[reduc_index], 0, NULL))
4956 if (dump_enabled_p ())
4957 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4958 "unsupported condition in reduction\n");
4960 return false;
4963 else
4965 /* 4. Supportable by target? */
4967 if (code == LSHIFT_EXPR || code == RSHIFT_EXPR
4968 || code == LROTATE_EXPR || code == RROTATE_EXPR)
4970 /* Shifts and rotates are only supported by vectorizable_shifts,
4971 not vectorizable_reduction. */
4972 if (dump_enabled_p ())
4973 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4974 "unsupported shift or rotation.\n");
4975 return false;
4978 /* 4.1. check support for the operation in the loop */
4979 optab = optab_for_tree_code (code, vectype_in, optab_default);
4980 if (!optab)
4982 if (dump_enabled_p ())
4983 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4984 "no optab.\n");
4986 return false;
4989 if (optab_handler (optab, vec_mode) == CODE_FOR_nothing)
4991 if (dump_enabled_p ())
4992 dump_printf (MSG_NOTE, "op not supported by target.\n");
4994 if (GET_MODE_SIZE (vec_mode) != UNITS_PER_WORD
4995 || LOOP_VINFO_VECT_FACTOR (loop_vinfo)
4996 < vect_min_worthwhile_factor (code))
4997 return false;
4999 if (dump_enabled_p ())
5000 dump_printf (MSG_NOTE, "proceeding using word mode.\n");
5003 /* Worthwhile without SIMD support? */
5004 if (!VECTOR_MODE_P (TYPE_MODE (vectype_in))
5005 && LOOP_VINFO_VECT_FACTOR (loop_vinfo)
5006 < vect_min_worthwhile_factor (code))
5008 if (dump_enabled_p ())
5009 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5010 "not worthwhile without SIMD support.\n");
5012 return false;
5016 /* 4.2. Check support for the epilog operation.
5018 If STMT represents a reduction pattern, then the type of the
5019 reduction variable may be different than the type of the rest
5020 of the arguments. For example, consider the case of accumulation
5021 of shorts into an int accumulator; The original code:
5022 S1: int_a = (int) short_a;
5023 orig_stmt-> S2: int_acc = plus <int_a ,int_acc>;
5025 was replaced with:
5026 STMT: int_acc = widen_sum <short_a, int_acc>
5028 This means that:
5029 1. The tree-code that is used to create the vector operation in the
5030 epilog code (that reduces the partial results) is not the
5031 tree-code of STMT, but is rather the tree-code of the original
5032 stmt from the pattern that STMT is replacing. I.e, in the example
5033 above we want to use 'widen_sum' in the loop, but 'plus' in the
5034 epilog.
5035 2. The type (mode) we use to check available target support
5036 for the vector operation to be created in the *epilog*, is
5037 determined by the type of the reduction variable (in the example
5038 above we'd check this: optab_handler (plus_optab, vect_int_mode])).
5039 However the type (mode) we use to check available target support
5040 for the vector operation to be created *inside the loop*, is
5041 determined by the type of the other arguments to STMT (in the
5042 example we'd check this: optab_handler (widen_sum_optab,
5043 vect_short_mode)).
5045 This is contrary to "regular" reductions, in which the types of all
5046 the arguments are the same as the type of the reduction variable.
5047 For "regular" reductions we can therefore use the same vector type
5048 (and also the same tree-code) when generating the epilog code and
5049 when generating the code inside the loop. */
5051 if (orig_stmt)
5053 /* This is a reduction pattern: get the vectype from the type of the
5054 reduction variable, and get the tree-code from orig_stmt. */
5055 orig_code = gimple_assign_rhs_code (orig_stmt);
5056 gcc_assert (vectype_out);
5057 vec_mode = TYPE_MODE (vectype_out);
5059 else
5061 /* Regular reduction: use the same vectype and tree-code as used for
5062 the vector code inside the loop can be used for the epilog code. */
5063 orig_code = code;
5066 if (nested_cycle)
5068 def_bb = gimple_bb (reduc_def_stmt);
5069 def_stmt_loop = def_bb->loop_father;
5070 def_arg = PHI_ARG_DEF_FROM_EDGE (reduc_def_stmt,
5071 loop_preheader_edge (def_stmt_loop));
5072 if (TREE_CODE (def_arg) == SSA_NAME
5073 && (def_arg_stmt = SSA_NAME_DEF_STMT (def_arg))
5074 && gimple_code (def_arg_stmt) == GIMPLE_PHI
5075 && flow_bb_inside_loop_p (outer_loop, gimple_bb (def_arg_stmt))
5076 && vinfo_for_stmt (def_arg_stmt)
5077 && STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def_arg_stmt))
5078 == vect_double_reduction_def)
5079 double_reduc = true;
5082 epilog_reduc_code = ERROR_MARK;
5083 if (reduction_code_for_scalar_code (orig_code, &epilog_reduc_code))
5085 reduc_optab = optab_for_tree_code (epilog_reduc_code, vectype_out,
5086 optab_default);
5087 if (!reduc_optab)
5089 if (dump_enabled_p ())
5090 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5091 "no optab for reduction.\n");
5093 epilog_reduc_code = ERROR_MARK;
5096 if (reduc_optab
5097 && optab_handler (reduc_optab, vec_mode) == CODE_FOR_nothing)
5099 if (dump_enabled_p ())
5100 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5101 "reduc op not supported by target.\n");
5103 epilog_reduc_code = ERROR_MARK;
5106 else
5108 if (!nested_cycle || double_reduc)
5110 if (dump_enabled_p ())
5111 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5112 "no reduc code for scalar code.\n");
5114 return false;
5118 if (double_reduc && ncopies > 1)
5120 if (dump_enabled_p ())
5121 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5122 "multiple types in double reduction\n");
5124 return false;
5127 /* In case of widening multiplication by a constant, we update the type
5128 of the constant to be the type of the other operand. We check that the
5129 constant fits the type in the pattern recognition pass. */
5130 if (code == DOT_PROD_EXPR
5131 && !types_compatible_p (TREE_TYPE (ops[0]), TREE_TYPE (ops[1])))
5133 if (TREE_CODE (ops[0]) == INTEGER_CST)
5134 ops[0] = fold_convert (TREE_TYPE (ops[1]), ops[0]);
5135 else if (TREE_CODE (ops[1]) == INTEGER_CST)
5136 ops[1] = fold_convert (TREE_TYPE (ops[0]), ops[1]);
5137 else
5139 if (dump_enabled_p ())
5140 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5141 "invalid types in dot-prod\n");
5143 return false;
5147 if (!vec_stmt) /* transformation not required. */
5149 if (!vect_model_reduction_cost (stmt_info, epilog_reduc_code, ncopies))
5150 return false;
5151 STMT_VINFO_TYPE (stmt_info) = reduc_vec_info_type;
5152 return true;
5155 /** Transform. **/
5157 if (dump_enabled_p ())
5158 dump_printf_loc (MSG_NOTE, vect_location, "transform reduction.\n");
5160 /* FORNOW: Multiple types are not supported for condition. */
5161 if (code == COND_EXPR)
5162 gcc_assert (ncopies == 1);
5164 /* Create the destination vector */
5165 vec_dest = vect_create_destination_var (scalar_dest, vectype_out);
5167 /* In case the vectorization factor (VF) is bigger than the number
5168 of elements that we can fit in a vectype (nunits), we have to generate
5169 more than one vector stmt - i.e., we need to "unroll" the
5170 vector stmt by a factor VF/nunits. For more details see documentation
5171 in vectorizable_operation. */
5173 /* If the reduction is used in an outer loop we need to generate
5174 VF intermediate results, like so (e.g. for ncopies=2):
5175 r0 = phi (init, r0)
5176 r1 = phi (init, r1)
5177 r0 = x0 + r0;
5178 r1 = x1 + r1;
5179 (i.e. we generate VF results in 2 registers).
5180 In this case we have a separate def-use cycle for each copy, and therefore
5181 for each copy we get the vector def for the reduction variable from the
5182 respective phi node created for this copy.
5184 Otherwise (the reduction is unused in the loop nest), we can combine
5185 together intermediate results, like so (e.g. for ncopies=2):
5186 r = phi (init, r)
5187 r = x0 + r;
5188 r = x1 + r;
5189 (i.e. we generate VF/2 results in a single register).
5190 In this case for each copy we get the vector def for the reduction variable
5191 from the vectorized reduction operation generated in the previous iteration.
5194 if (STMT_VINFO_RELEVANT (stmt_info) == vect_unused_in_scope)
5196 single_defuse_cycle = true;
5197 epilog_copies = 1;
5199 else
5200 epilog_copies = ncopies;
5202 prev_stmt_info = NULL;
5203 prev_phi_info = NULL;
5204 if (slp_node)
5206 vec_num = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node);
5207 gcc_assert (TYPE_VECTOR_SUBPARTS (vectype_out)
5208 == TYPE_VECTOR_SUBPARTS (vectype_in));
5210 else
5212 vec_num = 1;
5213 vec_oprnds0.create (1);
5214 if (op_type == ternary_op)
5215 vec_oprnds1.create (1);
5218 phis.create (vec_num);
5219 vect_defs.create (vec_num);
5220 if (!slp_node)
5221 vect_defs.quick_push (NULL_TREE);
5223 for (j = 0; j < ncopies; j++)
5225 if (j == 0 || !single_defuse_cycle)
5227 for (i = 0; i < vec_num; i++)
5229 /* Create the reduction-phi that defines the reduction
5230 operand. */
5231 new_phi = create_phi_node (vec_dest, loop->header);
5232 set_vinfo_for_stmt (new_phi,
5233 new_stmt_vec_info (new_phi, loop_vinfo,
5234 NULL));
5235 if (j == 0 || slp_node)
5236 phis.quick_push (new_phi);
5240 if (code == COND_EXPR)
5242 gcc_assert (!slp_node);
5243 vectorizable_condition (stmt, gsi, vec_stmt,
5244 PHI_RESULT (phis[0]),
5245 reduc_index, NULL);
5246 /* Multiple types are not supported for condition. */
5247 break;
5250 /* Handle uses. */
5251 if (j == 0)
5253 op0 = ops[!reduc_index];
5254 if (op_type == ternary_op)
5256 if (reduc_index == 0)
5257 op1 = ops[2];
5258 else
5259 op1 = ops[1];
5262 if (slp_node)
5263 vect_get_vec_defs (op0, op1, stmt, &vec_oprnds0, &vec_oprnds1,
5264 slp_node, -1);
5265 else
5267 loop_vec_def0 = vect_get_vec_def_for_operand (ops[!reduc_index],
5268 stmt, NULL);
5269 vec_oprnds0.quick_push (loop_vec_def0);
5270 if (op_type == ternary_op)
5272 loop_vec_def1 = vect_get_vec_def_for_operand (op1, stmt,
5273 NULL);
5274 vec_oprnds1.quick_push (loop_vec_def1);
5278 else
5280 if (!slp_node)
5282 enum vect_def_type dt;
5283 gimple dummy_stmt;
5284 tree dummy;
5286 vect_is_simple_use (ops[!reduc_index], stmt, loop_vinfo, NULL,
5287 &dummy_stmt, &dummy, &dt);
5288 loop_vec_def0 = vect_get_vec_def_for_stmt_copy (dt,
5289 loop_vec_def0);
5290 vec_oprnds0[0] = loop_vec_def0;
5291 if (op_type == ternary_op)
5293 vect_is_simple_use (op1, stmt, loop_vinfo, NULL, &dummy_stmt,
5294 &dummy, &dt);
5295 loop_vec_def1 = vect_get_vec_def_for_stmt_copy (dt,
5296 loop_vec_def1);
5297 vec_oprnds1[0] = loop_vec_def1;
5301 if (single_defuse_cycle)
5302 reduc_def = gimple_assign_lhs (new_stmt);
5304 STMT_VINFO_RELATED_STMT (prev_phi_info) = new_phi;
5307 FOR_EACH_VEC_ELT (vec_oprnds0, i, def0)
5309 if (slp_node)
5310 reduc_def = PHI_RESULT (phis[i]);
5311 else
5313 if (!single_defuse_cycle || j == 0)
5314 reduc_def = PHI_RESULT (new_phi);
5317 def1 = ((op_type == ternary_op)
5318 ? vec_oprnds1[i] : NULL);
5319 if (op_type == binary_op)
5321 if (reduc_index == 0)
5322 expr = build2 (code, vectype_out, reduc_def, def0);
5323 else
5324 expr = build2 (code, vectype_out, def0, reduc_def);
5326 else
5328 if (reduc_index == 0)
5329 expr = build3 (code, vectype_out, reduc_def, def0, def1);
5330 else
5332 if (reduc_index == 1)
5333 expr = build3 (code, vectype_out, def0, reduc_def, def1);
5334 else
5335 expr = build3 (code, vectype_out, def0, def1, reduc_def);
5339 new_stmt = gimple_build_assign (vec_dest, expr);
5340 new_temp = make_ssa_name (vec_dest, new_stmt);
5341 gimple_assign_set_lhs (new_stmt, new_temp);
5342 vect_finish_stmt_generation (stmt, new_stmt, gsi);
5344 if (slp_node)
5346 SLP_TREE_VEC_STMTS (slp_node).quick_push (new_stmt);
5347 vect_defs.quick_push (new_temp);
5349 else
5350 vect_defs[0] = new_temp;
5353 if (slp_node)
5354 continue;
5356 if (j == 0)
5357 STMT_VINFO_VEC_STMT (stmt_info) = *vec_stmt = new_stmt;
5358 else
5359 STMT_VINFO_RELATED_STMT (prev_stmt_info) = new_stmt;
5361 prev_stmt_info = vinfo_for_stmt (new_stmt);
5362 prev_phi_info = vinfo_for_stmt (new_phi);
5365 /* Finalize the reduction-phi (set its arguments) and create the
5366 epilog reduction code. */
5367 if ((!single_defuse_cycle || code == COND_EXPR) && !slp_node)
5369 new_temp = gimple_assign_lhs (*vec_stmt);
5370 vect_defs[0] = new_temp;
5373 vect_create_epilog_for_reduction (vect_defs, stmt, epilog_copies,
5374 epilog_reduc_code, phis, reduc_index,
5375 double_reduc, slp_node);
5377 return true;
5380 /* Function vect_min_worthwhile_factor.
5382 For a loop where we could vectorize the operation indicated by CODE,
5383 return the minimum vectorization factor that makes it worthwhile
5384 to use generic vectors. */
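/* Illustration of how callers use this (see vectorizable_reduction above):
   when the target lacks a vector instruction and the operation would be
   emulated, a PLUS_EXPR reduction is only kept if the vectorization factor
   is at least 4, while a BIT_AND_EXPR reduction is kept from a factor of 2.  */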
5385 int
5386 vect_min_worthwhile_factor (enum tree_code code)
5388 switch (code)
5390 case PLUS_EXPR:
5391 case MINUS_EXPR:
5392 case NEGATE_EXPR:
5393 return 4;
5395 case BIT_AND_EXPR:
5396 case BIT_IOR_EXPR:
5397 case BIT_XOR_EXPR:
5398 case BIT_NOT_EXPR:
5399 return 2;
5401 default:
5402 return INT_MAX;
5407 /* Function vectorizable_induction
5409 Check if PHI performs an induction computation that can be vectorized.
5410 If VEC_STMT is also passed, vectorize the induction PHI: create a vectorized
5411 phi to replace it, put it in VEC_STMT, and add it to the same basic block.
5412 Return FALSE if not a vectorizable STMT, TRUE otherwise. */
5414 bool
5415 vectorizable_induction (gimple phi, gimple_stmt_iterator *gsi ATTRIBUTE_UNUSED,
5416 gimple *vec_stmt)
5418 stmt_vec_info stmt_info = vinfo_for_stmt (phi);
5419 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
5420 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
5421 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
5422 int nunits = TYPE_VECTOR_SUBPARTS (vectype);
5423 int ncopies = LOOP_VINFO_VECT_FACTOR (loop_vinfo) / nunits;
5424 tree vec_def;
5426 gcc_assert (ncopies >= 1);
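/* Worked arithmetic (not in the original source): with a vectorization
   factor of 8 (e.g. because the loop also operates on narrower types) and
   a V4SI vectype (nunits == 4), ncopies is 8 / 4 == 2, i.e. two vector
   induction results have to be generated per scalar induction PHI.  */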
5427 /* FORNOW. These restrictions should be relaxed. */
5428 if (nested_in_vect_loop_p (loop, phi))
5430 imm_use_iterator imm_iter;
5431 use_operand_p use_p;
5432 gimple exit_phi;
5433 edge latch_e;
5434 tree loop_arg;
5436 if (ncopies > 1)
5438 if (dump_enabled_p ())
5439 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5440 "multiple types in nested loop.\n");
5441 return false;
5444 exit_phi = NULL;
5445 latch_e = loop_latch_edge (loop->inner);
5446 loop_arg = PHI_ARG_DEF_FROM_EDGE (phi, latch_e);
5447 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, loop_arg)
5449 gimple use_stmt = USE_STMT (use_p);
5450 if (is_gimple_debug (use_stmt))
5451 continue;
5453 if (!flow_bb_inside_loop_p (loop->inner, gimple_bb (use_stmt)))
5455 exit_phi = use_stmt;
5456 break;
5459 if (exit_phi)
5461 stmt_vec_info exit_phi_vinfo = vinfo_for_stmt (exit_phi);
5462 if (!(STMT_VINFO_RELEVANT_P (exit_phi_vinfo)
5463 && !STMT_VINFO_LIVE_P (exit_phi_vinfo)))
5465 if (dump_enabled_p ())
5466 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5467 "inner-loop induction only used outside "
5468 "of the outer vectorized loop.\n");
5469 return false;
5474 if (!STMT_VINFO_RELEVANT_P (stmt_info))
5475 return false;
5477 /* FORNOW: SLP not supported. */
5478 if (STMT_SLP_TYPE (stmt_info))
5479 return false;
5481 gcc_assert (STMT_VINFO_DEF_TYPE (stmt_info) == vect_induction_def);
5483 if (gimple_code (phi) != GIMPLE_PHI)
5484 return false;
5486 if (!vec_stmt) /* transformation not required. */
5488 STMT_VINFO_TYPE (stmt_info) = induc_vec_info_type;
5489 if (dump_enabled_p ())
5490 dump_printf_loc (MSG_NOTE, vect_location,
5491 "=== vectorizable_induction ===\n");
5492 vect_model_induction_cost (stmt_info, ncopies);
5493 return true;
5496 /** Transform. **/
5498 if (dump_enabled_p ())
5499 dump_printf_loc (MSG_NOTE, vect_location, "transform induction phi.\n");
5501 vec_def = get_initial_def_for_induction (phi);
5502 *vec_stmt = SSA_NAME_DEF_STMT (vec_def);
5503 return true;
5506 /* Function vectorizable_live_operation.
5508 STMT computes a value that is used outside the loop. Check if
5509 it can be supported. */
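/* Illustrative example (not in the original source): a "live" operation is
   one whose scalar result is used after the loop, e.g.

     for (i = 0; i < n; i++)
       {
         b[i] = a[i] * c;
         last = c + 1;        <-- uses only the invariant `c'
       }
     ... = last;              <-- used after the loop

   The checks below only accept the case where every operand of the live
   statement is loop-invariant (an external or constant def), so the scalar
   statement can simply stay in place and its final value be used as is.  */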
5511 bool
5512 vectorizable_live_operation (gimple stmt,
5513 gimple_stmt_iterator *gsi ATTRIBUTE_UNUSED,
5514 gimple *vec_stmt)
5516 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
5517 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
5518 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
5519 int i;
5520 int op_type;
5521 tree op;
5522 tree def;
5523 gimple def_stmt;
5524 enum vect_def_type dt;
5525 enum tree_code code;
5526 enum gimple_rhs_class rhs_class;
5528 gcc_assert (STMT_VINFO_LIVE_P (stmt_info));
5530 if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_reduction_def)
5531 return false;
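/* Descriptive note (not in the original source): the block below appears to
   handle the special case of a live IFN_GOMP_SIMD_LANE call for this
   loop's simduid.  After vectorization the last lane executed in a chunk
   is vectorization_factor - 1, so any PHI in the exit merge block that
   uses the call's lhs simply receives the constant vf - 1 as its argument
   on the exit edge.  */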
5533 if (!is_gimple_assign (stmt))
5535 if (gimple_call_internal_p (stmt)
5536 && gimple_call_internal_fn (stmt) == IFN_GOMP_SIMD_LANE
5537 && gimple_call_lhs (stmt)
5538 && loop->simduid
5539 && TREE_CODE (gimple_call_arg (stmt, 0)) == SSA_NAME
5540 && loop->simduid
5541 == SSA_NAME_VAR (gimple_call_arg (stmt, 0)))
5543 edge e = single_exit (loop);
5544 basic_block merge_bb = e->dest;
5545 imm_use_iterator imm_iter;
5546 use_operand_p use_p;
5547 tree lhs = gimple_call_lhs (stmt);
5549 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, lhs)
5551 gimple use_stmt = USE_STMT (use_p);
5552 if (gimple_code (use_stmt) == GIMPLE_PHI
5553 && gimple_bb (use_stmt) == merge_bb)
5555 if (vec_stmt)
5557 tree vfm1
5558 = build_int_cst (unsigned_type_node,
5559 loop_vinfo->vectorization_factor - 1);
5560 SET_PHI_ARG_DEF (use_stmt, e->dest_idx, vfm1);
5562 return true;
5567 return false;
5570 if (TREE_CODE (gimple_assign_lhs (stmt)) != SSA_NAME)
5571 return false;
5573 /* FORNOW. CHECKME. */
5574 if (nested_in_vect_loop_p (loop, stmt))
5575 return false;
5577 code = gimple_assign_rhs_code (stmt);
5578 op_type = TREE_CODE_LENGTH (code);
5579 rhs_class = get_gimple_rhs_class (code);
5580 gcc_assert (rhs_class != GIMPLE_UNARY_RHS || op_type == unary_op);
5581 gcc_assert (rhs_class != GIMPLE_BINARY_RHS || op_type == binary_op);
5583 /* FORNOW: support only if all uses are invariant. This means
5584 that the scalar operations can remain in place, unvectorized.
5585 The original last scalar value that they compute will be used. */
5587 for (i = 0; i < op_type; i++)
5589 if (rhs_class == GIMPLE_SINGLE_RHS)
5590 op = TREE_OPERAND (gimple_op (stmt, 1), i);
5591 else
5592 op = gimple_op (stmt, i + 1);
5593 if (op
5594 && !vect_is_simple_use (op, stmt, loop_vinfo, NULL, &def_stmt, &def,
5595 &dt))
5597 if (dump_enabled_p ())
5598 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5599 "use not simple.\n");
5600 return false;
5603 if (dt != vect_external_def && dt != vect_constant_def)
5604 return false;
5607 /* No transformation is required for the cases we currently support. */
5608 return true;
5611 /* Kill any debug uses outside LOOP of SSA names defined in STMT. */
5613 static void
5614 vect_loop_kill_debug_uses (struct loop *loop, gimple stmt)
5616 ssa_op_iter op_iter;
5617 imm_use_iterator imm_iter;
5618 def_operand_p def_p;
5619 gimple ustmt;
5621 FOR_EACH_PHI_OR_STMT_DEF (def_p, stmt, op_iter, SSA_OP_DEF)
5623 FOR_EACH_IMM_USE_STMT (ustmt, imm_iter, DEF_FROM_PTR (def_p))
5625 basic_block bb;
5627 if (!is_gimple_debug (ustmt))
5628 continue;
5630 bb = gimple_bb (ustmt);
5632 if (!flow_bb_inside_loop_p (loop, bb))
5634 if (gimple_debug_bind_p (ustmt))
5636 if (dump_enabled_p ())
5637 dump_printf_loc (MSG_NOTE, vect_location,
5638 "killing debug use\n");
5640 gimple_debug_bind_reset_value (ustmt);
5641 update_stmt (ustmt);
5643 else
5644 gcc_unreachable ();
5651 /* This function builds ni_name = number of iterations. Statements
5652 are emitted on the loop preheader edge. */
5654 static tree
5655 vect_build_loop_niters (loop_vec_info loop_vinfo)
5657 tree ni = unshare_expr (LOOP_VINFO_NITERS (loop_vinfo));
5658 if (TREE_CODE (ni) == INTEGER_CST)
5659 return ni;
5660 else
5662 tree ni_name, var;
5663 gimple_seq stmts = NULL;
5664 edge pe = loop_preheader_edge (LOOP_VINFO_LOOP (loop_vinfo));
5666 var = create_tmp_var (TREE_TYPE (ni), "niters");
5667 ni_name = force_gimple_operand (ni, &stmts, false, var);
5668 if (stmts)
5669 gsi_insert_seq_on_edge_immediate (pe, stmts);
5671 return ni_name;
5676 /* This function generates the following statements:
5678 ni_name = number of iterations the loop executes
5679 ratio = ni_name / vf
5680 ratio_mult_vf_name = ratio * vf
5682 and places them on the loop preheader edge. */
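/* Worked example (not in the original source): with ni_name == 103 and a
   vectorization factor vf == 8, the statements emitted on the preheader
   compute ratio == 103 / 8 == 12 and ratio_mult_vf == 12 * 8 == 96; the
   vectorized loop then executes 12 iterations covering the first 96
   scalar iterations and the epilogue loop handles the remaining 7.  */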
5684 static void
5685 vect_generate_tmps_on_preheader (loop_vec_info loop_vinfo,
5686 tree ni_name,
5687 tree *ratio_mult_vf_name_ptr,
5688 tree *ratio_name_ptr)
5690 tree ni_minus_gap_name;
5691 tree var;
5692 tree ratio_name;
5693 tree ratio_mult_vf_name;
5694 int vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
5695 edge pe = loop_preheader_edge (LOOP_VINFO_LOOP (loop_vinfo));
5696 tree log_vf;
5698 log_vf = build_int_cst (TREE_TYPE (ni_name), exact_log2 (vf));
5700 /* If an epilogue loop is required because of data accesses with gaps,
5701 subtract one iteration from the total number of iterations here so
5702 that RATIO is calculated correctly. */
5703 if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo))
5705 ni_minus_gap_name = fold_build2 (MINUS_EXPR, TREE_TYPE (ni_name),
5706 ni_name,
5707 build_one_cst (TREE_TYPE (ni_name)));
5708 if (!is_gimple_val (ni_minus_gap_name))
5710 var = create_tmp_var (TREE_TYPE (ni_name), "ni_gap");
5711 gimple stmts = NULL;
5712 ni_minus_gap_name = force_gimple_operand (ni_minus_gap_name, &stmts,
5713 true, var);
5714 gsi_insert_seq_on_edge_immediate (pe, stmts);
5717 else
5718 ni_minus_gap_name = ni_name;
5720 /* Create: ratio = ni >> log2(vf) */
5721 /* ??? As we have ni == number of latch executions + 1, ni could
5722 have overflowed to zero. So avoid computing the ratio directly
5723 from ni; instead use the fact that the ratio is known to be at
5724 least one and compute it as ((ni - vf) >> log2(vf)) + 1. */
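/* Worked illustration of the trick above (not in the original source):
   for ni == 103 and vf == 8, ((103 - 8) >> 3) + 1 == 11 + 1 == 12, the
   same as the naive 103 >> 3.  If instead the latch executed exactly
   TYPE_MAX times so that ni wrapped to 0, ((0 - 8) >> 3) + 1 in the
   unsigned type still yields the correct ratio, whereas 0 >> 3 would
   give 0.  */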
5725 ratio_name
5726 = fold_build2 (PLUS_EXPR, TREE_TYPE (ni_name),
5727 fold_build2 (RSHIFT_EXPR, TREE_TYPE (ni_name),
5728 fold_build2 (MINUS_EXPR, TREE_TYPE (ni_name),
5729 ni_minus_gap_name,
5730 build_int_cst
5731 (TREE_TYPE (ni_name), vf)),
5732 log_vf),
5733 build_int_cst (TREE_TYPE (ni_name), 1));
5734 if (!is_gimple_val (ratio_name))
5736 var = create_tmp_var (TREE_TYPE (ni_name), "bnd");
5737 gimple stmts = NULL;
5738 ratio_name = force_gimple_operand (ratio_name, &stmts, true, var);
5739 gsi_insert_seq_on_edge_immediate (pe, stmts);
5741 *ratio_name_ptr = ratio_name;
5743 /* Create: ratio_mult_vf = ratio << log2 (vf). */
5745 if (ratio_mult_vf_name_ptr)
5747 ratio_mult_vf_name = fold_build2 (LSHIFT_EXPR, TREE_TYPE (ratio_name),
5748 ratio_name, log_vf);
5749 if (!is_gimple_val (ratio_mult_vf_name))
5751 var = create_tmp_var (TREE_TYPE (ni_name), "ratio_mult_vf");
5752 gimple stmts = NULL;
5753 ratio_mult_vf_name = force_gimple_operand (ratio_mult_vf_name, &stmts,
5754 true, var);
5755 gsi_insert_seq_on_edge_immediate (pe, stmts);
5757 *ratio_mult_vf_name_ptr = ratio_mult_vf_name;
5760 return;
5764 /* Function vect_transform_loop.
5766 The analysis phase has determined that the loop is vectorizable.
5767 Vectorize the loop - create vectorized stmts to replace the scalar
5768 stmts in the loop, and update the loop exit condition. */
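/* Descriptive overview (not in the original source): the transformation
   below proceeds roughly as follows: (1) version the loop if runtime
   alignment or alias checks are required, (2) peel a prologue for
   alignment if needed, (3) peel a scalar epilogue when the iteration
   count is unknown or is not a multiple of VF, (4) walk every basic block
   and replace each relevant scalar PHI and stmt by its vector form, and
   (5) make the vector loop iterate RATIO times and scale the profile.  */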
5770 void
5771 vect_transform_loop (loop_vec_info loop_vinfo)
5773 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
5774 basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
5775 int nbbs = loop->num_nodes;
5776 gimple_stmt_iterator si;
5777 int i;
5778 tree ratio = NULL;
5779 int vectorization_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
5780 bool grouped_store;
5781 bool slp_scheduled = false;
5782 gimple stmt, pattern_stmt;
5783 gimple_seq pattern_def_seq = NULL;
5784 gimple_stmt_iterator pattern_def_si = gsi_none ();
5785 bool transform_pattern_stmt = false;
5786 bool check_profitability = false;
5787 int th;
5788 /* Record the number of iterations before we start tampering with the profile. */
5789 gcov_type expected_iterations = expected_loop_iterations_unbounded (loop);
5791 if (dump_enabled_p ())
5792 dump_printf_loc (MSG_NOTE, vect_location, "=== vec_transform_loop ===\n");
5794 /* If the profile is imprecise, we have a chance to fix it up. */
5795 if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
5796 expected_iterations = LOOP_VINFO_INT_NITERS (loop_vinfo);
5798 /* Use the more conservative vectorization threshold. If the number
5799 of iterations is constant, assume the cost check has been performed
5800 by our caller. If the threshold makes all loops profitable that
5801 run at least the vectorization factor number of times, checking
5802 is pointless, too. */
5803 th = LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo);
5804 if (th >= LOOP_VINFO_VECT_FACTOR (loop_vinfo) - 1
5805 && !LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
5807 if (dump_enabled_p ())
5808 dump_printf_loc (MSG_NOTE, vect_location,
5809 "Profitability threshold is %d loop iterations.\n",
5810 th);
5811 check_profitability = true;
5814 /* Version the loop first, if required, so the profitability check
5815 comes first. */
5817 if (LOOP_REQUIRES_VERSIONING_FOR_ALIGNMENT (loop_vinfo)
5818 || LOOP_REQUIRES_VERSIONING_FOR_ALIAS (loop_vinfo))
5820 vect_loop_versioning (loop_vinfo, th, check_profitability);
5821 check_profitability = false;
5824 tree ni_name = vect_build_loop_niters (loop_vinfo);
5825 LOOP_VINFO_NITERS_UNCHANGED (loop_vinfo) = ni_name;
5827 /* Peel the loop if there are data refs with unknown alignment.
5828 Only one data ref with unknown alignment is allowed. */
5830 if (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo))
5832 vect_do_peeling_for_alignment (loop_vinfo, ni_name,
5833 th, check_profitability);
5834 check_profitability = false;
5835 /* The above adjusts LOOP_VINFO_NITERS, so force ni_name to
5836 be re-computed. */
5837 ni_name = NULL_TREE;
5840 /* If the loop has a symbolic number of iterations 'n' (i.e. it's not a
5841 compile-time constant), or it is a constant that is not divisible by the
5842 vectorization factor, then an epilog loop needs to be created.
5843 We therefore duplicate the loop: the original loop will be vectorized,
5844 and will compute the first (n/VF) iterations. The second copy of the loop
5845 will remain scalar and will compute the remaining (n%VF) iterations.
5846 (VF is the vectorization factor). */
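/* Illustrative sketch of the result (not in the original source), assuming
   VF == 4, a symbolic bound n, and ignoring the peeling-for-gaps
   adjustment:

     for (i = 0; i < (n / 4) * 4; i += 4)     vectorized copy, n/4 iterations
       ... vector stmts ...
     for (; i < n; i++)                       scalar epilogue, n%4 iterations
       ... original scalar stmts ...
*/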
5848 if (LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo)
5849 || LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo))
5851 tree ratio_mult_vf;
5852 if (!ni_name)
5853 ni_name = vect_build_loop_niters (loop_vinfo);
5854 vect_generate_tmps_on_preheader (loop_vinfo, ni_name, &ratio_mult_vf,
5855 &ratio);
5856 vect_do_peeling_for_loop_bound (loop_vinfo, ni_name, ratio_mult_vf,
5857 th, check_profitability);
5859 else if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
5860 ratio = build_int_cst (TREE_TYPE (LOOP_VINFO_NITERS (loop_vinfo)),
5861 LOOP_VINFO_INT_NITERS (loop_vinfo) / vectorization_factor);
5862 else
5864 if (!ni_name)
5865 ni_name = vect_build_loop_niters (loop_vinfo);
5866 vect_generate_tmps_on_preheader (loop_vinfo, ni_name, NULL, &ratio);
5869 /* 1) Make sure the loop header has exactly two entries
5870 2) Make sure we have a preheader basic block. */
5872 gcc_assert (EDGE_COUNT (loop->header->preds) == 2);
5874 split_edge (loop_preheader_edge (loop));
5876 /* FORNOW: the vectorizer supports only loops whose body consists
5877 of one basic block (header + empty latch). When the vectorizer
5878 supports more involved loop forms, the order in which the BBs are
5879 traversed needs to be reconsidered. */
5881 for (i = 0; i < nbbs; i++)
5883 basic_block bb = bbs[i];
5884 stmt_vec_info stmt_info;
5885 gimple phi;
5887 for (si = gsi_start_phis (bb); !gsi_end_p (si); gsi_next (&si))
5889 phi = gsi_stmt (si);
5890 if (dump_enabled_p ())
5892 dump_printf_loc (MSG_NOTE, vect_location,
5893 "------>vectorizing phi: ");
5894 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, phi, 0);
5895 dump_printf (MSG_NOTE, "\n");
5897 stmt_info = vinfo_for_stmt (phi);
5898 if (!stmt_info)
5899 continue;
5901 if (MAY_HAVE_DEBUG_STMTS && !STMT_VINFO_LIVE_P (stmt_info))
5902 vect_loop_kill_debug_uses (loop, phi);
5904 if (!STMT_VINFO_RELEVANT_P (stmt_info)
5905 && !STMT_VINFO_LIVE_P (stmt_info))
5906 continue;
5908 if (STMT_VINFO_VECTYPE (stmt_info)
5909 && (TYPE_VECTOR_SUBPARTS (STMT_VINFO_VECTYPE (stmt_info))
5910 != (unsigned HOST_WIDE_INT) vectorization_factor)
5911 && dump_enabled_p ())
5912 dump_printf_loc (MSG_NOTE, vect_location, "multiple-types.\n");
5914 if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_induction_def)
5916 if (dump_enabled_p ())
5917 dump_printf_loc (MSG_NOTE, vect_location, "transform phi.\n");
5918 vect_transform_stmt (phi, NULL, NULL, NULL, NULL);
5922 pattern_stmt = NULL;
5923 for (si = gsi_start_bb (bb); !gsi_end_p (si) || transform_pattern_stmt;)
5925 bool is_store;
5927 if (transform_pattern_stmt)
5928 stmt = pattern_stmt;
5929 else
5931 stmt = gsi_stmt (si);
5932 /* During vectorization remove existing clobber stmts. */
5933 if (gimple_clobber_p (stmt))
5935 unlink_stmt_vdef (stmt);
5936 gsi_remove (&si, true);
5937 release_defs (stmt);
5938 continue;
5942 if (dump_enabled_p ())
5944 dump_printf_loc (MSG_NOTE, vect_location,
5945 "------>vectorizing statement: ");
5946 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, stmt, 0);
5947 dump_printf (MSG_NOTE, "\n");
5950 stmt_info = vinfo_for_stmt (stmt);
5952 /* vector stmts created in the outer-loop during vectorization of
5953 stmts in an inner-loop may not have a stmt_info, and do not
5954 need to be vectorized. */
5955 if (!stmt_info)
5957 gsi_next (&si);
5958 continue;
5961 if (MAY_HAVE_DEBUG_STMTS && !STMT_VINFO_LIVE_P (stmt_info))
5962 vect_loop_kill_debug_uses (loop, stmt);
5964 if (!STMT_VINFO_RELEVANT_P (stmt_info)
5965 && !STMT_VINFO_LIVE_P (stmt_info))
5967 if (STMT_VINFO_IN_PATTERN_P (stmt_info)
5968 && (pattern_stmt = STMT_VINFO_RELATED_STMT (stmt_info))
5969 && (STMT_VINFO_RELEVANT_P (vinfo_for_stmt (pattern_stmt))
5970 || STMT_VINFO_LIVE_P (vinfo_for_stmt (pattern_stmt))))
5972 stmt = pattern_stmt;
5973 stmt_info = vinfo_for_stmt (stmt);
5975 else
5977 gsi_next (&si);
5978 continue;
5981 else if (STMT_VINFO_IN_PATTERN_P (stmt_info)
5982 && (pattern_stmt = STMT_VINFO_RELATED_STMT (stmt_info))
5983 && (STMT_VINFO_RELEVANT_P (vinfo_for_stmt (pattern_stmt))
5984 || STMT_VINFO_LIVE_P (vinfo_for_stmt (pattern_stmt))))
5985 transform_pattern_stmt = true;
5987 /* If pattern statement has def stmts, vectorize them too. */
5988 if (is_pattern_stmt_p (stmt_info))
5990 if (pattern_def_seq == NULL)
5992 pattern_def_seq = STMT_VINFO_PATTERN_DEF_SEQ (stmt_info);
5993 pattern_def_si = gsi_start (pattern_def_seq);
5995 else if (!gsi_end_p (pattern_def_si))
5996 gsi_next (&pattern_def_si);
5997 if (pattern_def_seq != NULL)
5999 gimple pattern_def_stmt = NULL;
6000 stmt_vec_info pattern_def_stmt_info = NULL;
6002 while (!gsi_end_p (pattern_def_si))
6004 pattern_def_stmt = gsi_stmt (pattern_def_si);
6005 pattern_def_stmt_info
6006 = vinfo_for_stmt (pattern_def_stmt);
6007 if (STMT_VINFO_RELEVANT_P (pattern_def_stmt_info)
6008 || STMT_VINFO_LIVE_P (pattern_def_stmt_info))
6009 break;
6010 gsi_next (&pattern_def_si);
6013 if (!gsi_end_p (pattern_def_si))
6015 if (dump_enabled_p ())
6017 dump_printf_loc (MSG_NOTE, vect_location,
6018 "==> vectorizing pattern def "
6019 "stmt: ");
6020 dump_gimple_stmt (MSG_NOTE, TDF_SLIM,
6021 pattern_def_stmt, 0);
6022 dump_printf (MSG_NOTE, "\n");
6025 stmt = pattern_def_stmt;
6026 stmt_info = pattern_def_stmt_info;
6028 else
6030 pattern_def_si = gsi_none ();
6031 transform_pattern_stmt = false;
6034 else
6035 transform_pattern_stmt = false;
6038 if (STMT_VINFO_VECTYPE (stmt_info))
6040 unsigned int nunits
6041 = (unsigned int)
6042 TYPE_VECTOR_SUBPARTS (STMT_VINFO_VECTYPE (stmt_info));
6043 if (!STMT_SLP_TYPE (stmt_info)
6044 && nunits != (unsigned int) vectorization_factor
6045 && dump_enabled_p ())
6046 /* For SLP, VF is set according to the unrolling factor and not
6047 to the vector size, hence for SLP this diagnostic is not valid. */
6048 dump_printf_loc (MSG_NOTE, vect_location, "multiple-types.\n");
6051 /* SLP. Schedule all the SLP instances when the first SLP stmt is
6052 reached. */
6053 if (STMT_SLP_TYPE (stmt_info))
6055 if (!slp_scheduled)
6057 slp_scheduled = true;
6059 if (dump_enabled_p ())
6060 dump_printf_loc (MSG_NOTE, vect_location,
6061 "=== scheduling SLP instances ===\n");
6063 vect_schedule_slp (loop_vinfo, NULL);
6066 /* Hybrid SLP stmts must be vectorized in addition to SLP. */
6067 if (!vinfo_for_stmt (stmt) || PURE_SLP_STMT (stmt_info))
6069 if (!transform_pattern_stmt && gsi_end_p (pattern_def_si))
6071 pattern_def_seq = NULL;
6072 gsi_next (&si);
6074 continue;
6078 /* -------- vectorize statement ------------ */
6079 if (dump_enabled_p ())
6080 dump_printf_loc (MSG_NOTE, vect_location, "transform statement.\n");
6082 grouped_store = false;
6083 is_store = vect_transform_stmt (stmt, &si, &grouped_store, NULL, NULL);
6084 if (is_store)
6086 if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
6088 /* Interleaving. If IS_STORE is TRUE, the vectorization of the
6089 interleaving chain was completed - free all the stores in
6090 the chain. */
6091 gsi_next (&si);
6092 vect_remove_stores (GROUP_FIRST_ELEMENT (stmt_info));
6094 else
6096 /* Free the attached stmt_vec_info and remove the stmt. */
6097 gimple store = gsi_stmt (si);
6098 free_stmt_vec_info (store);
6099 unlink_stmt_vdef (store);
6100 gsi_remove (&si, true);
6101 release_defs (store);
6104 /* Stores can only appear at the end of pattern statements. */
6105 gcc_assert (!transform_pattern_stmt);
6106 pattern_def_seq = NULL;
6108 else if (!transform_pattern_stmt && gsi_end_p (pattern_def_si))
6110 pattern_def_seq = NULL;
6111 gsi_next (&si);
6113 } /* stmts in BB */
6114 } /* BBs in loop */
6116 slpeel_make_loop_iterate_ntimes (loop, ratio);
6118 /* Reduce loop iterations by the vectorization factor. */
6119 scale_loop_profile (loop, GCOV_COMPUTE_SCALE (1, vectorization_factor),
6120 expected_iterations / vectorization_factor);
6121 loop->nb_iterations_upper_bound
6122 = wi::udiv_floor (loop->nb_iterations_upper_bound, vectorization_factor);
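/* Worked example (not in the original source): if the recorded upper bound
   on latch executions was 103 and the vectorization factor is 4, the
   bound for the vector loop becomes floor (103 / 4) == 25; the profile is
   likewise scaled down by a factor of 4 above.  */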
6123 if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo)
6124 && loop->nb_iterations_upper_bound != 0)
6125 loop->nb_iterations_upper_bound = loop->nb_iterations_upper_bound - 1;
6126 if (loop->any_estimate)
6128 loop->nb_iterations_estimate
6129 = wi::udiv_floor (loop->nb_iterations_estimate, vectorization_factor);
6130 if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo)
6131 && loop->nb_iterations_estimate != 0)
6132 loop->nb_iterations_estimate = loop->nb_iterations_estimate - 1;
6135 if (dump_enabled_p ())
6137 dump_printf_loc (MSG_NOTE, vect_location,
6138 "LOOP VECTORIZED\n");
6139 if (loop->inner)
6140 dump_printf_loc (MSG_NOTE, vect_location,
6141 "OUTER LOOP VECTORIZED\n");
6142 dump_printf (MSG_NOTE, "\n");