gcc/tree-vect-loop.c
1 /* Loop Vectorization
2 Copyright (C) 2003-2015 Free Software Foundation, Inc.
3 Contributed by Dorit Naishlos <dorit@il.ibm.com> and
4 Ira Rosen <irar@il.ibm.com>
6 This file is part of GCC.
8 GCC is free software; you can redistribute it and/or modify it under
9 the terms of the GNU General Public License as published by the Free
10 Software Foundation; either version 3, or (at your option) any later
11 version.
13 GCC is distributed in the hope that it will be useful, but WITHOUT ANY
14 WARRANTY; without even the implied warranty of MERCHANTABILITY or
15 FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
16 for more details.
18 You should have received a copy of the GNU General Public License
19 along with GCC; see the file COPYING3. If not see
20 <http://www.gnu.org/licenses/>. */
22 #include "config.h"
23 #include "system.h"
24 #include "coretypes.h"
25 #include "dumpfile.h"
26 #include "backend.h"
27 #include "cfghooks.h"
28 #include "tree.h"
29 #include "gimple.h"
30 #include "rtl.h"
31 #include "ssa.h"
32 #include "alias.h"
33 #include "fold-const.h"
34 #include "stor-layout.h"
35 #include "cfganal.h"
36 #include "gimple-pretty-print.h"
37 #include "internal-fn.h"
38 #include "gimplify.h"
39 #include "gimple-iterator.h"
40 #include "gimplify-me.h"
41 #include "tree-ssa-loop-ivopts.h"
42 #include "tree-ssa-loop-manip.h"
43 #include "tree-ssa-loop-niter.h"
44 #include "tree-pass.h"
45 #include "cfgloop.h"
46 #include "flags.h"
47 #include "insn-codes.h"
48 #include "optabs-tree.h"
49 #include "params.h"
50 #include "diagnostic-core.h"
51 #include "tree-chrec.h"
52 #include "tree-scalar-evolution.h"
53 #include "tree-vectorizer.h"
54 #include "target.h"
55 #include "gimple-fold.h"
57 /* Loop Vectorization Pass.
59 This pass tries to vectorize loops.
61 For example, the vectorizer transforms the following simple loop:
63 short a[N]; short b[N]; short c[N]; int i;
65 for (i=0; i<N; i++){
66 a[i] = b[i] + c[i];
69 as if it was manually vectorized by rewriting the source code into:
71 typedef int __attribute__((mode(V8HI))) v8hi;
72 short a[N]; short b[N]; short c[N]; int i;
73 v8hi *pa = (v8hi*)a, *pb = (v8hi*)b, *pc = (v8hi*)c;
74 v8hi va, vb, vc;
76 for (i=0; i<N/8; i++){
77 vb = pb[i];
78 vc = pc[i];
79 va = vb + vc;
80 pa[i] = va;
83 The main entry to this pass is vectorize_loops(), in which
84 the vectorizer applies a set of analyses on a given set of loops,
85 followed by the actual vectorization transformation for the loops that
86 had successfully passed the analysis phase.
87 Throughout this pass we make a distinction between two types of
88 data: scalars (which are represented by SSA_NAMES), and memory references
89 ("data-refs"). These two types of data require different handling both
90 during analysis and transformation. The types of data-refs that the
91 vectorizer currently supports are ARRAY_REFS whose base is an array DECL
92 (not a pointer), and INDIRECT_REFS through pointers; both array and pointer
93 accesses are required to have a simple (consecutive) access pattern.
95 Analysis phase:
96 ===============
97 The driver for the analysis phase is vect_analyze_loop().
98 It applies a set of analyses, some of which rely on the scalar evolution
99 analyzer (scev) developed by Sebastian Pop.
101 During the analysis phase the vectorizer records some information
102 per stmt in a "stmt_vec_info" struct which is attached to each stmt in the
103 loop, as well as general information about the loop as a whole, which is
104 recorded in a "loop_vec_info" struct attached to each loop.
106 Transformation phase:
107 =====================
108 The loop transformation phase scans all the stmts in the loop, and
109 creates a vector stmt (or a sequence of stmts) for each scalar stmt S in
110 the loop that needs to be vectorized. It inserts the vector code sequence
111 just before the scalar stmt S, and records a pointer to the vector code
112 in STMT_VINFO_VEC_STMT (stmt_info) (stmt_info is the stmt_vec_info struct
113 attached to S). This pointer will be used for the vectorization of following
114 stmts which use the def of stmt S. Stmt S is removed if it writes to memory;
115 otherwise, we rely on dead code elimination for removing it.
117 For example, say stmt S1 was vectorized into stmt VS1:
119 VS1: vb = px[i];
120 S1: b = x[i]; STMT_VINFO_VEC_STMT (stmt_info (S1)) = VS1
121 S2: a = b;
123 To vectorize stmt S2, the vectorizer first finds the stmt that defines
124 the operand 'b' (S1), and gets the relevant vector def 'vb' from the
125 vector stmt VS1 pointed to by STMT_VINFO_VEC_STMT (stmt_info (S1)). The
126 resulting sequence would be:
128 VS1: vb = px[i];
129 S1: b = x[i]; STMT_VINFO_VEC_STMT (stmt_info (S1)) = VS1
130 VS2: va = vb;
131 S2: a = b; STMT_VINFO_VEC_STMT (stmt_info (S2)) = VS2
133 Operands that are not SSA_NAMEs are data-refs that appear in
134 load/store operations (like 'x[i]' in S1), and are handled differently.
136 Target modeling:
137 =================
138 Currently the only target specific information that is used is the
139 size of the vector (in bytes) - "TARGET_VECTORIZE_UNITS_PER_SIMD_WORD".
140 Targets that can support different sizes of vectors will, for now, need
141 to specify one value for "TARGET_VECTORIZE_UNITS_PER_SIMD_WORD". More
142 flexibility will be added in the future.
144 Since we only vectorize operations whose vector form can be
145 expressed using existing tree codes, to verify that an operation is
146 supported, the vectorizer checks the relevant optab at the relevant
147 machine_mode (e.g., optab_handler (add_optab, V8HImode)). If
148 the value found is CODE_FOR_nothing, then there's no target support, and
149 we can't vectorize the stmt.
151 For additional information on this project see:
152 http://gcc.gnu.org/projects/tree-ssa/vectorization.html
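/* Editorial sketch, not part of the original file: a minimal helper
   (hypothetical name) showing the optab query described above.
   optab_handler, add_optab, V8HImode and CODE_FOR_nothing are the real
   interfaces already named in this overview, provided by the file's
   existing includes ("insn-codes.h", "optabs-tree.h").  */

static bool
example_target_supports_v8hi_add (void)
{
  /* The same kind of query the vectorizer performs: look up the add
     optab for the V8HImode vector mode.  CODE_FOR_nothing means the
     target has no instruction for it, so the stmt cannot be vectorized.  */
  return optab_handler (add_optab, V8HImode) != CODE_FOR_nothing;
}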
155 static void vect_estimate_min_profitable_iters (loop_vec_info, int *, int *);
157 /* Function vect_determine_vectorization_factor
159 Determine the vectorization factor (VF). VF is the number of data elements
160 that are operated upon in parallel in a single iteration of the vectorized
161 loop. For example, when vectorizing a loop that operates on 4-byte elements,
162 on a target with a vector size (VS) of 16 bytes, the VF is set to 4, since 4
163 elements can fit in a single vector register.
165 We currently support vectorization of loops in which all types operated upon
166 are of the same size. Therefore this function currently sets VF according to
167 the size of the types operated upon, and fails if there are multiple sizes
168 in the loop.
170 VF is also the factor by which the loop iterations are strip-mined, e.g.:
171 original loop:
172 for (i=0; i<N; i++){
173 a[i] = b[i] + c[i];
176 vectorized loop:
177 for (i=0; i<N; i+=VF){
178 a[i:VF] = b[i:VF] + c[i:VF];
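/* Editorial sketch, not part of the original file: a worked instance of
   the VF arithmetic described in the comment above.  The helper and its
   parameters are hypothetical; only the size relationship comes from the
   original text.  */

static unsigned int
example_vf_for_element_size (unsigned int vector_size_bytes,
			     unsigned int element_size_bytes)
{
  /* E.g. 16-byte vectors of 4-byte ints give 16 / 4 = 4, and 2-byte
     shorts give 16 / 2 = 8.  With mixed element sizes the smallest
     scalar type is used, yielding the largest of these values.  */
  return vector_size_bytes / element_size_bytes;
}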
182 static bool
183 vect_determine_vectorization_factor (loop_vec_info loop_vinfo)
185 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
186 basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
187 int nbbs = loop->num_nodes;
188 unsigned int vectorization_factor = 0;
189 tree scalar_type;
190 gphi *phi;
191 tree vectype;
192 unsigned int nunits;
193 stmt_vec_info stmt_info;
194 int i;
195 HOST_WIDE_INT dummy;
196 gimple *stmt, *pattern_stmt = NULL;
197 gimple_seq pattern_def_seq = NULL;
198 gimple_stmt_iterator pattern_def_si = gsi_none ();
199 bool analyze_pattern_stmt = false;
201 if (dump_enabled_p ())
202 dump_printf_loc (MSG_NOTE, vect_location,
203 "=== vect_determine_vectorization_factor ===\n");
205 for (i = 0; i < nbbs; i++)
207 basic_block bb = bbs[i];
209 for (gphi_iterator si = gsi_start_phis (bb); !gsi_end_p (si);
210 gsi_next (&si))
212 phi = si.phi ();
213 stmt_info = vinfo_for_stmt (phi);
214 if (dump_enabled_p ())
216 dump_printf_loc (MSG_NOTE, vect_location, "==> examining phi: ");
217 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, phi, 0);
218 dump_printf (MSG_NOTE, "\n");
221 gcc_assert (stmt_info);
223 if (STMT_VINFO_RELEVANT_P (stmt_info))
225 gcc_assert (!STMT_VINFO_VECTYPE (stmt_info));
226 scalar_type = TREE_TYPE (PHI_RESULT (phi));
228 if (dump_enabled_p ())
230 dump_printf_loc (MSG_NOTE, vect_location,
231 "get vectype for scalar type: ");
232 dump_generic_expr (MSG_NOTE, TDF_SLIM, scalar_type);
233 dump_printf (MSG_NOTE, "\n");
236 vectype = get_vectype_for_scalar_type (scalar_type);
237 if (!vectype)
239 if (dump_enabled_p ())
241 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
242 "not vectorized: unsupported "
243 "data-type ");
244 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
245 scalar_type);
246 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
248 return false;
250 STMT_VINFO_VECTYPE (stmt_info) = vectype;
252 if (dump_enabled_p ())
254 dump_printf_loc (MSG_NOTE, vect_location, "vectype: ");
255 dump_generic_expr (MSG_NOTE, TDF_SLIM, vectype);
256 dump_printf (MSG_NOTE, "\n");
259 nunits = TYPE_VECTOR_SUBPARTS (vectype);
260 if (dump_enabled_p ())
261 dump_printf_loc (MSG_NOTE, vect_location, "nunits = %d\n",
262 nunits);
264 if (!vectorization_factor
265 || (nunits > vectorization_factor))
266 vectorization_factor = nunits;
270 for (gimple_stmt_iterator si = gsi_start_bb (bb);
271 !gsi_end_p (si) || analyze_pattern_stmt;)
273 tree vf_vectype;
275 if (analyze_pattern_stmt)
276 stmt = pattern_stmt;
277 else
278 stmt = gsi_stmt (si);
280 stmt_info = vinfo_for_stmt (stmt);
282 if (dump_enabled_p ())
284 dump_printf_loc (MSG_NOTE, vect_location,
285 "==> examining statement: ");
286 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, stmt, 0);
287 dump_printf (MSG_NOTE, "\n");
290 gcc_assert (stmt_info);
292 /* Skip stmts which do not need to be vectorized. */
293 if ((!STMT_VINFO_RELEVANT_P (stmt_info)
294 && !STMT_VINFO_LIVE_P (stmt_info))
295 || gimple_clobber_p (stmt))
297 if (STMT_VINFO_IN_PATTERN_P (stmt_info)
298 && (pattern_stmt = STMT_VINFO_RELATED_STMT (stmt_info))
299 && (STMT_VINFO_RELEVANT_P (vinfo_for_stmt (pattern_stmt))
300 || STMT_VINFO_LIVE_P (vinfo_for_stmt (pattern_stmt))))
302 stmt = pattern_stmt;
303 stmt_info = vinfo_for_stmt (pattern_stmt);
304 if (dump_enabled_p ())
306 dump_printf_loc (MSG_NOTE, vect_location,
307 "==> examining pattern statement: ");
308 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, stmt, 0);
309 dump_printf (MSG_NOTE, "\n");
312 else
314 if (dump_enabled_p ())
315 dump_printf_loc (MSG_NOTE, vect_location, "skip.\n");
316 gsi_next (&si);
317 continue;
320 else if (STMT_VINFO_IN_PATTERN_P (stmt_info)
321 && (pattern_stmt = STMT_VINFO_RELATED_STMT (stmt_info))
322 && (STMT_VINFO_RELEVANT_P (vinfo_for_stmt (pattern_stmt))
323 || STMT_VINFO_LIVE_P (vinfo_for_stmt (pattern_stmt))))
324 analyze_pattern_stmt = true;
326 /* If a pattern statement has def stmts, analyze them too. */
327 if (is_pattern_stmt_p (stmt_info))
329 if (pattern_def_seq == NULL)
331 pattern_def_seq = STMT_VINFO_PATTERN_DEF_SEQ (stmt_info);
332 pattern_def_si = gsi_start (pattern_def_seq);
334 else if (!gsi_end_p (pattern_def_si))
335 gsi_next (&pattern_def_si);
336 if (pattern_def_seq != NULL)
338 gimple *pattern_def_stmt = NULL;
339 stmt_vec_info pattern_def_stmt_info = NULL;
341 while (!gsi_end_p (pattern_def_si))
343 pattern_def_stmt = gsi_stmt (pattern_def_si);
344 pattern_def_stmt_info
345 = vinfo_for_stmt (pattern_def_stmt);
346 if (STMT_VINFO_RELEVANT_P (pattern_def_stmt_info)
347 || STMT_VINFO_LIVE_P (pattern_def_stmt_info))
348 break;
349 gsi_next (&pattern_def_si);
352 if (!gsi_end_p (pattern_def_si))
354 if (dump_enabled_p ())
356 dump_printf_loc (MSG_NOTE, vect_location,
357 "==> examining pattern def stmt: ");
358 dump_gimple_stmt (MSG_NOTE, TDF_SLIM,
359 pattern_def_stmt, 0);
360 dump_printf (MSG_NOTE, "\n");
363 stmt = pattern_def_stmt;
364 stmt_info = pattern_def_stmt_info;
366 else
368 pattern_def_si = gsi_none ();
369 analyze_pattern_stmt = false;
372 else
373 analyze_pattern_stmt = false;
376 if (gimple_get_lhs (stmt) == NULL_TREE
377 /* MASK_STORE has no lhs, but is ok. */
378 && (!is_gimple_call (stmt)
379 || !gimple_call_internal_p (stmt)
380 || gimple_call_internal_fn (stmt) != IFN_MASK_STORE))
382 if (is_gimple_call (stmt))
384 /* Ignore calls with no lhs. These must be calls to
385 #pragma omp simd functions, and what vectorization factor
386 it really needs can't be determined until
387 vectorizable_simd_clone_call. */
388 if (!analyze_pattern_stmt && gsi_end_p (pattern_def_si))
390 pattern_def_seq = NULL;
391 gsi_next (&si);
393 continue;
395 if (dump_enabled_p ())
397 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
398 "not vectorized: irregular stmt.");
399 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt,
401 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
403 return false;
406 if (VECTOR_MODE_P (TYPE_MODE (gimple_expr_type (stmt))))
408 if (dump_enabled_p ())
410 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
411 "not vectorized: vector stmt in loop:");
412 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
413 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
415 return false;
418 if (STMT_VINFO_VECTYPE (stmt_info))
420 /* The only case when a vectype has already been set is for stmts
421 that contain a dataref, or for "pattern-stmts" (stmts
422 generated by the vectorizer to represent/replace a certain
423 idiom). */
424 gcc_assert (STMT_VINFO_DATA_REF (stmt_info)
425 || is_pattern_stmt_p (stmt_info)
426 || !gsi_end_p (pattern_def_si));
427 vectype = STMT_VINFO_VECTYPE (stmt_info);
429 else
431 gcc_assert (!STMT_VINFO_DATA_REF (stmt_info));
432 if (is_gimple_call (stmt)
433 && gimple_call_internal_p (stmt)
434 && gimple_call_internal_fn (stmt) == IFN_MASK_STORE)
435 scalar_type = TREE_TYPE (gimple_call_arg (stmt, 3));
436 else
437 scalar_type = TREE_TYPE (gimple_get_lhs (stmt));
438 if (dump_enabled_p ())
440 dump_printf_loc (MSG_NOTE, vect_location,
441 "get vectype for scalar type: ");
442 dump_generic_expr (MSG_NOTE, TDF_SLIM, scalar_type);
443 dump_printf (MSG_NOTE, "\n");
445 vectype = get_vectype_for_scalar_type (scalar_type);
446 if (!vectype)
448 if (dump_enabled_p ())
450 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
451 "not vectorized: unsupported "
452 "data-type ");
453 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
454 scalar_type);
455 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
457 return false;
460 STMT_VINFO_VECTYPE (stmt_info) = vectype;
462 if (dump_enabled_p ())
464 dump_printf_loc (MSG_NOTE, vect_location, "vectype: ");
465 dump_generic_expr (MSG_NOTE, TDF_SLIM, vectype);
466 dump_printf (MSG_NOTE, "\n");
470 /* The vectorization factor is determined by the smallest
471 scalar type (or the largest vector size, but we only
472 support one vector size per loop). */
473 scalar_type = vect_get_smallest_scalar_type (stmt, &dummy,
474 &dummy);
475 if (dump_enabled_p ())
477 dump_printf_loc (MSG_NOTE, vect_location,
478 "get vectype for scalar type: ");
479 dump_generic_expr (MSG_NOTE, TDF_SLIM, scalar_type);
480 dump_printf (MSG_NOTE, "\n");
482 vf_vectype = get_vectype_for_scalar_type (scalar_type);
483 if (!vf_vectype)
485 if (dump_enabled_p ())
487 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
488 "not vectorized: unsupported data-type ");
489 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
490 scalar_type);
491 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
493 return false;
496 if ((GET_MODE_SIZE (TYPE_MODE (vectype))
497 != GET_MODE_SIZE (TYPE_MODE (vf_vectype))))
499 if (dump_enabled_p ())
501 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
502 "not vectorized: different sized vector "
503 "types in statement, ");
504 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
505 vectype);
506 dump_printf (MSG_MISSED_OPTIMIZATION, " and ");
507 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
508 vf_vectype);
509 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
511 return false;
514 if (dump_enabled_p ())
516 dump_printf_loc (MSG_NOTE, vect_location, "vectype: ");
517 dump_generic_expr (MSG_NOTE, TDF_SLIM, vf_vectype);
518 dump_printf (MSG_NOTE, "\n");
521 nunits = TYPE_VECTOR_SUBPARTS (vf_vectype);
522 if (dump_enabled_p ())
523 dump_printf_loc (MSG_NOTE, vect_location, "nunits = %d\n", nunits);
524 if (!vectorization_factor
525 || (nunits > vectorization_factor))
526 vectorization_factor = nunits;
528 if (!analyze_pattern_stmt && gsi_end_p (pattern_def_si))
530 pattern_def_seq = NULL;
531 gsi_next (&si);
536 /* TODO: Analyze cost. Decide if worth while to vectorize. */
537 if (dump_enabled_p ())
538 dump_printf_loc (MSG_NOTE, vect_location, "vectorization factor = %d\n",
539 vectorization_factor);
540 if (vectorization_factor <= 1)
542 if (dump_enabled_p ())
543 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
544 "not vectorized: unsupported data-type\n");
545 return false;
547 LOOP_VINFO_VECT_FACTOR (loop_vinfo) = vectorization_factor;
549 return true;
553 /* Function vect_is_simple_iv_evolution.
555 FORNOW: A simple evolution of an induction variable in the loop is
556 considered a polynomial evolution. */
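/* Editorial example, not in the original file: for a loop-header PHI

	i_1 = PHI <0(preheader), i_7(latch)>
	...
	i_7 = i_1 + 4;

   analyze_scalar_evolution returns the chrec {0, +, 4}_1, so the initial
   condition is 0 and the evolution part, i.e. the step, is 4.  A step
   that is itself a chrec (a polynomial of degree >= 2) is rejected by
   the check below.  */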
558 static bool
559 vect_is_simple_iv_evolution (unsigned loop_nb, tree access_fn, tree * init,
560 tree * step)
562 tree init_expr;
563 tree step_expr;
564 tree evolution_part = evolution_part_in_loop_num (access_fn, loop_nb);
565 basic_block bb;
567 /* When there is no evolution in this loop, the evolution function
568 is not "simple". */
569 if (evolution_part == NULL_TREE)
570 return false;
572 /* When the evolution is a polynomial of degree >= 2
573 the evolution function is not "simple". */
574 if (tree_is_chrec (evolution_part))
575 return false;
577 step_expr = evolution_part;
578 init_expr = unshare_expr (initial_condition_in_loop_num (access_fn, loop_nb));
580 if (dump_enabled_p ())
582 dump_printf_loc (MSG_NOTE, vect_location, "step: ");
583 dump_generic_expr (MSG_NOTE, TDF_SLIM, step_expr);
584 dump_printf (MSG_NOTE, ", init: ");
585 dump_generic_expr (MSG_NOTE, TDF_SLIM, init_expr);
586 dump_printf (MSG_NOTE, "\n");
589 *init = init_expr;
590 *step = step_expr;
592 if (TREE_CODE (step_expr) != INTEGER_CST
593 && (TREE_CODE (step_expr) != SSA_NAME
594 || ((bb = gimple_bb (SSA_NAME_DEF_STMT (step_expr)))
595 && flow_bb_inside_loop_p (get_loop (cfun, loop_nb), bb))
596 || (!INTEGRAL_TYPE_P (TREE_TYPE (step_expr))
597 && (!SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr))
598 || !flag_associative_math)))
599 && (TREE_CODE (step_expr) != REAL_CST
600 || !flag_associative_math))
602 if (dump_enabled_p ())
603 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
604 "step unknown.\n");
605 return false;
608 return true;
611 /* Function vect_analyze_scalar_cycles_1.
613 Examine the cross iteration def-use cycles of scalar variables
614 in LOOP. LOOP_VINFO represents the loop that is now being
615 considered for vectorization (can be LOOP, or an outer-loop
616 enclosing LOOP). */
618 static void
619 vect_analyze_scalar_cycles_1 (loop_vec_info loop_vinfo, struct loop *loop)
621 basic_block bb = loop->header;
622 tree init, step;
623 auto_vec<gimple *, 64> worklist;
624 gphi_iterator gsi;
625 bool double_reduc;
627 if (dump_enabled_p ())
628 dump_printf_loc (MSG_NOTE, vect_location,
629 "=== vect_analyze_scalar_cycles ===\n");
631 /* First - identify all inductions. Reduction detection assumes that all the
632 inductions have been identified, therefore, this order must not be
633 changed. */
634 for (gsi = gsi_start_phis (bb); !gsi_end_p (gsi); gsi_next (&gsi))
636 gphi *phi = gsi.phi ();
637 tree access_fn = NULL;
638 tree def = PHI_RESULT (phi);
639 stmt_vec_info stmt_vinfo = vinfo_for_stmt (phi);
641 if (dump_enabled_p ())
643 dump_printf_loc (MSG_NOTE, vect_location, "Analyze phi: ");
644 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, phi, 0);
645 dump_printf (MSG_NOTE, "\n");
648 /* Skip virtual phi's. The data dependences that are associated with
649 virtual defs/uses (i.e., memory accesses) are analyzed elsewhere. */
650 if (virtual_operand_p (def))
651 continue;
653 STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_unknown_def_type;
655 /* Analyze the evolution function. */
656 access_fn = analyze_scalar_evolution (loop, def);
657 if (access_fn)
659 STRIP_NOPS (access_fn);
660 if (dump_enabled_p ())
662 dump_printf_loc (MSG_NOTE, vect_location,
663 "Access function of PHI: ");
664 dump_generic_expr (MSG_NOTE, TDF_SLIM, access_fn);
665 dump_printf (MSG_NOTE, "\n");
667 STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_vinfo)
668 = evolution_part_in_loop_num (access_fn, loop->num);
671 if (!access_fn
672 || !vect_is_simple_iv_evolution (loop->num, access_fn, &init, &step)
673 || (LOOP_VINFO_LOOP (loop_vinfo) != loop
674 && TREE_CODE (step) != INTEGER_CST))
676 worklist.safe_push (phi);
677 continue;
680 gcc_assert (STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_vinfo) != NULL_TREE);
682 if (dump_enabled_p ())
683 dump_printf_loc (MSG_NOTE, vect_location, "Detected induction.\n");
684 STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_induction_def;
688 /* Second - identify all reductions and nested cycles. */
689 while (worklist.length () > 0)
691 gimple *phi = worklist.pop ();
692 tree def = PHI_RESULT (phi);
693 stmt_vec_info stmt_vinfo = vinfo_for_stmt (phi);
694 gimple *reduc_stmt;
695 bool nested_cycle;
697 if (dump_enabled_p ())
699 dump_printf_loc (MSG_NOTE, vect_location, "Analyze phi: ");
700 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, phi, 0);
701 dump_printf (MSG_NOTE, "\n");
704 gcc_assert (!virtual_operand_p (def)
705 && STMT_VINFO_DEF_TYPE (stmt_vinfo) == vect_unknown_def_type);
707 nested_cycle = (loop != LOOP_VINFO_LOOP (loop_vinfo));
708 reduc_stmt = vect_force_simple_reduction (loop_vinfo, phi, !nested_cycle,
709 &double_reduc, false);
710 if (reduc_stmt)
712 if (double_reduc)
714 if (dump_enabled_p ())
715 dump_printf_loc (MSG_NOTE, vect_location,
716 "Detected double reduction.\n");
718 STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_double_reduction_def;
719 STMT_VINFO_DEF_TYPE (vinfo_for_stmt (reduc_stmt)) =
720 vect_double_reduction_def;
722 else
724 if (nested_cycle)
726 if (dump_enabled_p ())
727 dump_printf_loc (MSG_NOTE, vect_location,
728 "Detected vectorizable nested cycle.\n");
730 STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_nested_cycle;
731 STMT_VINFO_DEF_TYPE (vinfo_for_stmt (reduc_stmt)) =
732 vect_nested_cycle;
734 else
736 if (dump_enabled_p ())
737 dump_printf_loc (MSG_NOTE, vect_location,
738 "Detected reduction.\n");
740 STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_reduction_def;
741 STMT_VINFO_DEF_TYPE (vinfo_for_stmt (reduc_stmt)) =
742 vect_reduction_def;
743 /* Store the reduction cycles for possible vectorization in
744 loop-aware SLP. */
745 LOOP_VINFO_REDUCTIONS (loop_vinfo).safe_push (reduc_stmt);
749 else
750 if (dump_enabled_p ())
751 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
752 "Unknown def-use cycle pattern.\n");
757 /* Function vect_analyze_scalar_cycles.
759 Examine the cross iteration def-use cycles of scalar variables, by
760 analyzing the loop-header PHIs of scalar variables. Classify each
761 cycle as one of the following: invariant, induction, reduction, unknown.
762 We do that for the loop represented by LOOP_VINFO, and also for its
763 inner-loop, if it exists.
764 Examples for scalar cycles:
766 Example1: reduction:
768 loop1:
769 for (i=0; i<N; i++)
770 sum += a[i];
772 Example2: induction:
774 loop2:
775 for (i=0; i<N; i++)
776 a[i] = i; */
778 static void
779 vect_analyze_scalar_cycles (loop_vec_info loop_vinfo)
781 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
783 vect_analyze_scalar_cycles_1 (loop_vinfo, loop);
785 /* When vectorizing an outer-loop, the inner-loop is executed sequentially.
786 Reductions in such inner-loop therefore have different properties than
787 the reductions in the nest that gets vectorized:
788 1. When vectorized, they are executed in the same order as in the original
789 scalar loop, so we can't change the order of computation when
790 vectorizing them.
791 2. FIXME: Inner-loop reductions can be used in the inner-loop, so the
792 current checks are too strict. */
794 if (loop->inner)
795 vect_analyze_scalar_cycles_1 (loop_vinfo, loop->inner);
798 /* Transfer group and reduction information from STMT to its pattern stmt. */
800 static void
801 vect_fixup_reduc_chain (gimple *stmt)
803 gimple *firstp = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (stmt));
804 gimple *stmtp;
805 gcc_assert (!GROUP_FIRST_ELEMENT (vinfo_for_stmt (firstp))
806 && GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)));
807 GROUP_SIZE (vinfo_for_stmt (firstp)) = GROUP_SIZE (vinfo_for_stmt (stmt));
810 stmtp = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (stmt));
811 GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmtp)) = firstp;
812 stmt = GROUP_NEXT_ELEMENT (vinfo_for_stmt (stmt));
813 if (stmt)
814 GROUP_NEXT_ELEMENT (vinfo_for_stmt (stmtp))
815 = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (stmt));
817 while (stmt);
818 STMT_VINFO_DEF_TYPE (vinfo_for_stmt (stmtp)) = vect_reduction_def;
821 /* Fixup scalar cycles that now have their stmts detected as patterns. */
823 static void
824 vect_fixup_scalar_cycles_with_patterns (loop_vec_info loop_vinfo)
826 gimple *first;
827 unsigned i;
829 FOR_EACH_VEC_ELT (LOOP_VINFO_REDUCTION_CHAINS (loop_vinfo), i, first)
830 if (STMT_VINFO_IN_PATTERN_P (vinfo_for_stmt (first)))
832 vect_fixup_reduc_chain (first);
833 LOOP_VINFO_REDUCTION_CHAINS (loop_vinfo)[i]
834 = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (first));
838 /* Function vect_get_loop_niters.
840 Determine how many iterations the loop executes and place the result
841 in NUMBER_OF_ITERATIONS. Place the number of latch iterations
842 in NUMBER_OF_ITERATIONSM1.
844 Return the loop exit condition. */
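/* Editorial example, not in the original file: for a counted loop such
   as "for (i = 0; i < n; i++)" that is entered at least once, the latch
   runs n - 1 times and the header n times, so *NUMBER_OF_ITERATIONSM1
   becomes n - 1 and *NUMBER_OF_ITERATIONS becomes (n - 1) + 1 == n.  */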
847 static gcond *
848 vect_get_loop_niters (struct loop *loop, tree *number_of_iterations,
849 tree *number_of_iterationsm1)
851 tree niters;
853 if (dump_enabled_p ())
854 dump_printf_loc (MSG_NOTE, vect_location,
855 "=== get_loop_niters ===\n");
857 niters = number_of_latch_executions (loop);
858 *number_of_iterationsm1 = niters;
860 /* We want the number of loop header executions which is the number
861 of latch executions plus one.
862 ??? For UINT_MAX latch executions this number overflows to zero
863 for loops like do { n++; } while (n != 0); */
864 if (niters && !chrec_contains_undetermined (niters))
865 niters = fold_build2 (PLUS_EXPR, TREE_TYPE (niters), unshare_expr (niters),
866 build_int_cst (TREE_TYPE (niters), 1));
867 *number_of_iterations = niters;
869 return get_loop_exit_condition (loop);
873 /* Function bb_in_loop_p
875 Used as predicate for dfs order traversal of the loop bbs. */
877 static bool
878 bb_in_loop_p (const_basic_block bb, const void *data)
880 const struct loop *const loop = (const struct loop *)data;
881 if (flow_bb_inside_loop_p (loop, bb))
882 return true;
883 return false;
887 /* Function new_loop_vec_info.
889 Create and initialize a new loop_vec_info struct for LOOP, as well as
890 stmt_vec_info structs for all the stmts in LOOP. */
892 static loop_vec_info
893 new_loop_vec_info (struct loop *loop)
895 loop_vec_info res;
896 basic_block *bbs;
897 gimple_stmt_iterator si;
898 unsigned int i, nbbs;
900 res = (loop_vec_info) xcalloc (1, sizeof (struct _loop_vec_info));
901 res->kind = vec_info::loop;
902 LOOP_VINFO_LOOP (res) = loop;
904 bbs = get_loop_body (loop);
906 /* Create/Update stmt_info for all stmts in the loop. */
907 for (i = 0; i < loop->num_nodes; i++)
909 basic_block bb = bbs[i];
911 for (si = gsi_start_phis (bb); !gsi_end_p (si); gsi_next (&si))
913 gimple *phi = gsi_stmt (si);
914 gimple_set_uid (phi, 0);
915 set_vinfo_for_stmt (phi, new_stmt_vec_info (phi, res));
918 for (si = gsi_start_bb (bb); !gsi_end_p (si); gsi_next (&si))
920 gimple *stmt = gsi_stmt (si);
921 gimple_set_uid (stmt, 0);
922 set_vinfo_for_stmt (stmt, new_stmt_vec_info (stmt, res));
926 /* CHECKME: We want to visit all BBs before their successors (except for
927 latch blocks, for which this assertion wouldn't hold). In the simple
928 case of the loop forms we allow, a dfs order of the BBs would be the same
929 as reversed postorder traversal, so we are safe. */
931 free (bbs);
932 bbs = XCNEWVEC (basic_block, loop->num_nodes);
933 nbbs = dfs_enumerate_from (loop->header, 0, bb_in_loop_p,
934 bbs, loop->num_nodes, loop);
935 gcc_assert (nbbs == loop->num_nodes);
937 LOOP_VINFO_BBS (res) = bbs;
938 LOOP_VINFO_NITERSM1 (res) = NULL;
939 LOOP_VINFO_NITERS (res) = NULL;
940 LOOP_VINFO_NITERS_UNCHANGED (res) = NULL;
941 LOOP_VINFO_COST_MODEL_THRESHOLD (res) = 0;
942 LOOP_VINFO_VECTORIZABLE_P (res) = 0;
943 LOOP_VINFO_PEELING_FOR_ALIGNMENT (res) = 0;
944 LOOP_VINFO_VECT_FACTOR (res) = 0;
945 LOOP_VINFO_LOOP_NEST (res) = vNULL;
946 LOOP_VINFO_DATAREFS (res) = vNULL;
947 LOOP_VINFO_DDRS (res) = vNULL;
948 LOOP_VINFO_UNALIGNED_DR (res) = NULL;
949 LOOP_VINFO_MAY_MISALIGN_STMTS (res) = vNULL;
950 LOOP_VINFO_MAY_ALIAS_DDRS (res) = vNULL;
951 LOOP_VINFO_GROUPED_STORES (res) = vNULL;
952 LOOP_VINFO_REDUCTIONS (res) = vNULL;
953 LOOP_VINFO_REDUCTION_CHAINS (res) = vNULL;
954 LOOP_VINFO_SLP_INSTANCES (res) = vNULL;
955 LOOP_VINFO_SLP_UNROLLING_FACTOR (res) = 1;
956 LOOP_VINFO_TARGET_COST_DATA (res) = init_cost (loop);
957 LOOP_VINFO_PEELING_FOR_GAPS (res) = false;
958 LOOP_VINFO_PEELING_FOR_NITER (res) = false;
959 LOOP_VINFO_OPERANDS_SWAPPED (res) = false;
961 return res;
965 /* Function destroy_loop_vec_info.
967 Free LOOP_VINFO struct, as well as all the stmt_vec_info structs of all the
968 stmts in the loop. */
970 void
971 destroy_loop_vec_info (loop_vec_info loop_vinfo, bool clean_stmts)
973 struct loop *loop;
974 basic_block *bbs;
975 int nbbs;
976 gimple_stmt_iterator si;
977 int j;
978 vec<slp_instance> slp_instances;
979 slp_instance instance;
980 bool swapped;
982 if (!loop_vinfo)
983 return;
985 loop = LOOP_VINFO_LOOP (loop_vinfo);
987 bbs = LOOP_VINFO_BBS (loop_vinfo);
988 nbbs = clean_stmts ? loop->num_nodes : 0;
989 swapped = LOOP_VINFO_OPERANDS_SWAPPED (loop_vinfo);
991 for (j = 0; j < nbbs; j++)
993 basic_block bb = bbs[j];
994 for (si = gsi_start_phis (bb); !gsi_end_p (si); gsi_next (&si))
995 free_stmt_vec_info (gsi_stmt (si));
997 for (si = gsi_start_bb (bb); !gsi_end_p (si); )
999 gimple *stmt = gsi_stmt (si);
1001 /* We may have broken canonical form by moving a constant
1002 into RHS1 of a commutative op. Fix such occurrences. */
1003 if (swapped && is_gimple_assign (stmt))
1005 enum tree_code code = gimple_assign_rhs_code (stmt);
1007 if ((code == PLUS_EXPR
1008 || code == POINTER_PLUS_EXPR
1009 || code == MULT_EXPR)
1010 && CONSTANT_CLASS_P (gimple_assign_rhs1 (stmt)))
1011 swap_ssa_operands (stmt,
1012 gimple_assign_rhs1_ptr (stmt),
1013 gimple_assign_rhs2_ptr (stmt));
1016 /* Free stmt_vec_info. */
1017 free_stmt_vec_info (stmt);
1018 gsi_next (&si);
1022 free (LOOP_VINFO_BBS (loop_vinfo));
1023 vect_destroy_datarefs (loop_vinfo);
1024 free_dependence_relations (LOOP_VINFO_DDRS (loop_vinfo));
1025 LOOP_VINFO_LOOP_NEST (loop_vinfo).release ();
1026 LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).release ();
1027 LOOP_VINFO_MAY_ALIAS_DDRS (loop_vinfo).release ();
1028 slp_instances = LOOP_VINFO_SLP_INSTANCES (loop_vinfo);
1029 FOR_EACH_VEC_ELT (slp_instances, j, instance)
1030 vect_free_slp_instance (instance);
1032 LOOP_VINFO_SLP_INSTANCES (loop_vinfo).release ();
1033 LOOP_VINFO_GROUPED_STORES (loop_vinfo).release ();
1034 LOOP_VINFO_REDUCTIONS (loop_vinfo).release ();
1035 LOOP_VINFO_REDUCTION_CHAINS (loop_vinfo).release ();
1037 destroy_cost_data (LOOP_VINFO_TARGET_COST_DATA (loop_vinfo));
1038 loop_vinfo->scalar_cost_vec.release ();
1040 free (loop_vinfo);
1041 loop->aux = NULL;
1045 /* Calculate the cost of one scalar iteration of the loop. */
1046 static void
1047 vect_compute_single_scalar_iteration_cost (loop_vec_info loop_vinfo)
1049 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1050 basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
1051 int nbbs = loop->num_nodes, factor, scalar_single_iter_cost = 0;
1052 int innerloop_iters, i;
1054 /* Count statements in scalar loop. Using this as scalar cost for a single
1055 iteration for now.
1057 TODO: Add outer loop support.
1059 TODO: Consider assigning different costs to different scalar
1060 statements. */
1062 /* FORNOW. */
1063 innerloop_iters = 1;
1064 if (loop->inner)
1065 innerloop_iters = 50; /* FIXME */
1067 for (i = 0; i < nbbs; i++)
1069 gimple_stmt_iterator si;
1070 basic_block bb = bbs[i];
1072 if (bb->loop_father == loop->inner)
1073 factor = innerloop_iters;
1074 else
1075 factor = 1;
1077 for (si = gsi_start_bb (bb); !gsi_end_p (si); gsi_next (&si))
1079 gimple *stmt = gsi_stmt (si);
1080 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
1082 if (!is_gimple_assign (stmt) && !is_gimple_call (stmt))
1083 continue;
1085 /* Skip stmts that are not vectorized inside the loop. */
1086 if (stmt_info
1087 && !STMT_VINFO_RELEVANT_P (stmt_info)
1088 && (!STMT_VINFO_LIVE_P (stmt_info)
1089 || !VECTORIZABLE_CYCLE_DEF (STMT_VINFO_DEF_TYPE (stmt_info)))
1090 && !STMT_VINFO_IN_PATTERN_P (stmt_info))
1091 continue;
1093 vect_cost_for_stmt kind;
1094 if (STMT_VINFO_DATA_REF (vinfo_for_stmt (stmt)))
1096 if (DR_IS_READ (STMT_VINFO_DATA_REF (vinfo_for_stmt (stmt))))
1097 kind = scalar_load;
1098 else
1099 kind = scalar_store;
1101 else
1102 kind = scalar_stmt;
1104 scalar_single_iter_cost
1105 += record_stmt_cost (&LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo),
1106 factor, kind, NULL, 0, vect_prologue);
1109 LOOP_VINFO_SINGLE_SCALAR_ITERATION_COST (loop_vinfo)
1110 = scalar_single_iter_cost;
1114 /* Function vect_analyze_loop_form_1.
1116 Verify that certain CFG restrictions hold, including:
1117 - the loop has a pre-header
1118 - the loop has a single entry and exit
1119 - the loop exit condition is simple enough, and the number of iterations
1120 can be analyzed (a countable loop). */
1122 bool
1123 vect_analyze_loop_form_1 (struct loop *loop, gcond **loop_cond,
1124 tree *number_of_iterationsm1,
1125 tree *number_of_iterations, gcond **inner_loop_cond)
1127 if (dump_enabled_p ())
1128 dump_printf_loc (MSG_NOTE, vect_location,
1129 "=== vect_analyze_loop_form ===\n");
1131 /* Different restrictions apply when we are considering an inner-most loop,
1132 vs. an outer (nested) loop.
1133 (FORNOW. May want to relax some of these restrictions in the future). */
1135 if (!loop->inner)
1137 /* Inner-most loop. We currently require that the number of BBs is
1138 exactly 2 (the header and latch). Vectorizable inner-most loops
1139 look like this:
1141 (pre-header)
1143 header <--------+
1144 | | |
1145 | +--> latch --+
1147 (exit-bb) */
1149 if (loop->num_nodes != 2)
1151 if (dump_enabled_p ())
1152 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1153 "not vectorized: control flow in loop.\n");
1154 return false;
1157 if (empty_block_p (loop->header))
1159 if (dump_enabled_p ())
1160 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1161 "not vectorized: empty loop.\n");
1162 return false;
1165 else
1167 struct loop *innerloop = loop->inner;
1168 edge entryedge;
1170 /* Nested loop. We currently require that the loop is doubly-nested,
1171 contains a single inner loop, and the number of BBs is exactly 5.
1172 Vectorizable outer-loops look like this:
1174 (pre-header)
1176 header <---+
1178 inner-loop |
1180 tail ------+
1182 (exit-bb)
1184 The inner-loop has the properties expected of inner-most loops
1185 as described above. */
1187 if ((loop->inner)->inner || (loop->inner)->next)
1189 if (dump_enabled_p ())
1190 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1191 "not vectorized: multiple nested loops.\n");
1192 return false;
1195 if (loop->num_nodes != 5)
1197 if (dump_enabled_p ())
1198 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1199 "not vectorized: control flow in loop.\n");
1200 return false;
1203 entryedge = loop_preheader_edge (innerloop);
1204 if (entryedge->src != loop->header
1205 || !single_exit (innerloop)
1206 || single_exit (innerloop)->dest != EDGE_PRED (loop->latch, 0)->src)
1208 if (dump_enabled_p ())
1209 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1210 "not vectorized: unsupported outerloop form.\n");
1211 return false;
1214 /* Analyze the inner-loop. */
1215 tree inner_niterm1, inner_niter;
1216 if (! vect_analyze_loop_form_1 (loop->inner, inner_loop_cond,
1217 &inner_niterm1, &inner_niter, NULL))
1219 if (dump_enabled_p ())
1220 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1221 "not vectorized: Bad inner loop.\n");
1222 return false;
1225 if (!expr_invariant_in_loop_p (loop, inner_niter))
1227 if (dump_enabled_p ())
1228 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1229 "not vectorized: inner-loop count not"
1230 " invariant.\n");
1231 return false;
1234 if (dump_enabled_p ())
1235 dump_printf_loc (MSG_NOTE, vect_location,
1236 "Considering outer-loop vectorization.\n");
1239 if (!single_exit (loop)
1240 || EDGE_COUNT (loop->header->preds) != 2)
1242 if (dump_enabled_p ())
1244 if (!single_exit (loop))
1245 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1246 "not vectorized: multiple exits.\n");
1247 else if (EDGE_COUNT (loop->header->preds) != 2)
1248 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1249 "not vectorized: too many incoming edges.\n");
1251 return false;
1254 /* We assume that the loop exit condition is at the end of the loop, i.e.,
1255 that the loop is represented as a do-while (with a proper if-guard
1256 before the loop if needed), where the loop header contains all the
1257 executable statements, and the latch is empty. */
1258 if (!empty_block_p (loop->latch)
1259 || !gimple_seq_empty_p (phi_nodes (loop->latch)))
1261 if (dump_enabled_p ())
1262 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1263 "not vectorized: latch block not empty.\n");
1264 return false;
1267 /* Make sure there exists a single-predecessor exit bb: */
1268 if (!single_pred_p (single_exit (loop)->dest))
1270 edge e = single_exit (loop);
1271 if (!(e->flags & EDGE_ABNORMAL))
1273 split_loop_exit_edge (e);
1274 if (dump_enabled_p ())
1275 dump_printf (MSG_NOTE, "split exit edge.\n");
1277 else
1279 if (dump_enabled_p ())
1280 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1281 "not vectorized: abnormal loop exit edge.\n");
1282 return false;
1286 *loop_cond = vect_get_loop_niters (loop, number_of_iterations,
1287 number_of_iterationsm1);
1288 if (!*loop_cond)
1290 if (dump_enabled_p ())
1291 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1292 "not vectorized: complicated exit condition.\n");
1293 return false;
1296 if (!*number_of_iterations
1297 || chrec_contains_undetermined (*number_of_iterations))
1299 if (dump_enabled_p ())
1300 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1301 "not vectorized: number of iterations cannot be "
1302 "computed.\n");
1303 return false;
1306 if (integer_zerop (*number_of_iterations))
1308 if (dump_enabled_p ())
1309 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1310 "not vectorized: number of iterations = 0.\n");
1311 return false;
1314 return true;
1317 /* Analyze LOOP form and return a loop_vec_info if it is of suitable form. */
1319 loop_vec_info
1320 vect_analyze_loop_form (struct loop *loop)
1322 tree number_of_iterations, number_of_iterationsm1;
1323 gcond *loop_cond, *inner_loop_cond = NULL;
1325 if (! vect_analyze_loop_form_1 (loop, &loop_cond, &number_of_iterationsm1,
1326 &number_of_iterations, &inner_loop_cond))
1327 return NULL;
1329 loop_vec_info loop_vinfo = new_loop_vec_info (loop);
1330 LOOP_VINFO_NITERSM1 (loop_vinfo) = number_of_iterationsm1;
1331 LOOP_VINFO_NITERS (loop_vinfo) = number_of_iterations;
1332 LOOP_VINFO_NITERS_UNCHANGED (loop_vinfo) = number_of_iterations;
1334 if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
1336 if (dump_enabled_p ())
1338 dump_printf_loc (MSG_NOTE, vect_location,
1339 "Symbolic number of iterations is ");
1340 dump_generic_expr (MSG_NOTE, TDF_DETAILS, number_of_iterations);
1341 dump_printf (MSG_NOTE, "\n");
1345 STMT_VINFO_TYPE (vinfo_for_stmt (loop_cond)) = loop_exit_ctrl_vec_info_type;
1346 if (inner_loop_cond)
1347 STMT_VINFO_TYPE (vinfo_for_stmt (inner_loop_cond))
1348 = loop_exit_ctrl_vec_info_type;
1350 gcc_assert (!loop->aux);
1351 loop->aux = loop_vinfo;
1352 return loop_vinfo;
1357 /* Scan the loop stmts and, depending on whether there are any (non-)SLP
1358 statements, update the vectorization factor.
1360 static void
1361 vect_update_vf_for_slp (loop_vec_info loop_vinfo)
1363 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1364 basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
1365 int nbbs = loop->num_nodes;
1366 unsigned int vectorization_factor;
1367 int i;
1369 if (dump_enabled_p ())
1370 dump_printf_loc (MSG_NOTE, vect_location,
1371 "=== vect_update_vf_for_slp ===\n");
1373 vectorization_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
1374 gcc_assert (vectorization_factor != 0);
1376 /* If all the stmts in the loop can be SLPed, we perform only SLP, and
1377 the vectorization factor of the loop is the unrolling factor required by
1378 the SLP instances. If that unrolling factor is 1, we say that we
1379 perform pure SLP on the loop; cross-iteration parallelism is not
1380 exploited. */
1381 bool only_slp_in_loop = true;
1382 for (i = 0; i < nbbs; i++)
1384 basic_block bb = bbs[i];
1385 for (gimple_stmt_iterator si = gsi_start_bb (bb); !gsi_end_p (si);
1386 gsi_next (&si))
1388 gimple *stmt = gsi_stmt (si);
1389 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
1390 if (STMT_VINFO_IN_PATTERN_P (stmt_info)
1391 && STMT_VINFO_RELATED_STMT (stmt_info))
1393 stmt = STMT_VINFO_RELATED_STMT (stmt_info);
1394 stmt_info = vinfo_for_stmt (stmt);
1396 if ((STMT_VINFO_RELEVANT_P (stmt_info)
1397 || VECTORIZABLE_CYCLE_DEF (STMT_VINFO_DEF_TYPE (stmt_info)))
1398 && !PURE_SLP_STMT (stmt_info))
1399 /* STMT needs both SLP and loop-based vectorization. */
1400 only_slp_in_loop = false;
1404 if (only_slp_in_loop)
1405 vectorization_factor = LOOP_VINFO_SLP_UNROLLING_FACTOR (loop_vinfo);
1406 else
1407 vectorization_factor
1408 = least_common_multiple (vectorization_factor,
1409 LOOP_VINFO_SLP_UNROLLING_FACTOR (loop_vinfo));
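  /* Editorial illustration (not in the original file): with a loop VF of 4
     and an SLP unrolling factor of 2, least_common_multiple (4, 2) keeps
     the factor at 4, while an SLP unrolling factor of 8 would raise it to
     8; in the pure-SLP case above the factor simply becomes the SLP
     unrolling factor.  */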
1411 LOOP_VINFO_VECT_FACTOR (loop_vinfo) = vectorization_factor;
1412 if (dump_enabled_p ())
1413 dump_printf_loc (MSG_NOTE, vect_location,
1414 "Updating vectorization factor to %d\n",
1415 vectorization_factor);
1418 /* Function vect_analyze_loop_operations.
1420 Scan the loop stmts and make sure they are all vectorizable. */
1422 static bool
1423 vect_analyze_loop_operations (loop_vec_info loop_vinfo)
1425 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1426 basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
1427 int nbbs = loop->num_nodes;
1428 int i;
1429 stmt_vec_info stmt_info;
1430 bool need_to_vectorize = false;
1431 bool ok;
1433 if (dump_enabled_p ())
1434 dump_printf_loc (MSG_NOTE, vect_location,
1435 "=== vect_analyze_loop_operations ===\n");
1437 for (i = 0; i < nbbs; i++)
1439 basic_block bb = bbs[i];
1441 for (gphi_iterator si = gsi_start_phis (bb); !gsi_end_p (si);
1442 gsi_next (&si))
1444 gphi *phi = si.phi ();
1445 ok = true;
1447 stmt_info = vinfo_for_stmt (phi);
1448 if (dump_enabled_p ())
1450 dump_printf_loc (MSG_NOTE, vect_location, "examining phi: ");
1451 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, phi, 0);
1452 dump_printf (MSG_NOTE, "\n");
1454 if (virtual_operand_p (gimple_phi_result (phi)))
1455 continue;
1457 /* Inner-loop loop-closed exit phi in outer-loop vectorization
1458 (i.e., a phi in the tail of the outer-loop). */
1459 if (! is_loop_header_bb_p (bb))
1461 /* FORNOW: we currently don't support the case that these phis
1462 are not used in the outerloop (unless it is a double reduction,
1463 i.e., this phi is vect_reduction_def), because this case
1464 requires us to actually do something here.
1465 if ((!STMT_VINFO_RELEVANT_P (stmt_info)
1466 || STMT_VINFO_LIVE_P (stmt_info))
1467 && STMT_VINFO_DEF_TYPE (stmt_info)
1468 != vect_double_reduction_def)
1470 if (dump_enabled_p ())
1471 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1472 "Unsupported loop-closed phi in "
1473 "outer-loop.\n");
1474 return false;
1477 /* If PHI is used in the outer loop, we check that its operand
1478 is defined in the inner loop. */
1479 if (STMT_VINFO_RELEVANT_P (stmt_info))
1481 tree phi_op;
1482 gimple *op_def_stmt;
1484 if (gimple_phi_num_args (phi) != 1)
1485 return false;
1487 phi_op = PHI_ARG_DEF (phi, 0);
1488 if (TREE_CODE (phi_op) != SSA_NAME)
1489 return false;
1491 op_def_stmt = SSA_NAME_DEF_STMT (phi_op);
1492 if (gimple_nop_p (op_def_stmt)
1493 || !flow_bb_inside_loop_p (loop, gimple_bb (op_def_stmt))
1494 || !vinfo_for_stmt (op_def_stmt))
1495 return false;
1497 if (STMT_VINFO_RELEVANT (vinfo_for_stmt (op_def_stmt))
1498 != vect_used_in_outer
1499 && STMT_VINFO_RELEVANT (vinfo_for_stmt (op_def_stmt))
1500 != vect_used_in_outer_by_reduction)
1501 return false;
1504 continue;
1507 gcc_assert (stmt_info);
1509 if (STMT_VINFO_LIVE_P (stmt_info))
1511 /* FORNOW: not yet supported. */
1512 if (dump_enabled_p ())
1513 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1514 "not vectorized: value used after loop.\n");
1515 return false;
1518 if (STMT_VINFO_RELEVANT (stmt_info) == vect_used_in_scope
1519 && STMT_VINFO_DEF_TYPE (stmt_info) != vect_induction_def)
1521 /* A scalar-dependence cycle that we don't support. */
1522 if (dump_enabled_p ())
1523 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1524 "not vectorized: scalar dependence cycle.\n");
1525 return false;
1528 if (STMT_VINFO_RELEVANT_P (stmt_info))
1530 need_to_vectorize = true;
1531 if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_induction_def)
1532 ok = vectorizable_induction (phi, NULL, NULL);
1535 if (!ok)
1537 if (dump_enabled_p ())
1539 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1540 "not vectorized: relevant phi not "
1541 "supported: ");
1542 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, phi, 0);
1543 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
1545 return false;
1549 for (gimple_stmt_iterator si = gsi_start_bb (bb); !gsi_end_p (si);
1550 gsi_next (&si))
1552 gimple *stmt = gsi_stmt (si);
1553 if (!gimple_clobber_p (stmt)
1554 && !vect_analyze_stmt (stmt, &need_to_vectorize, NULL))
1555 return false;
1557 } /* bbs */
1559 /* All operations in the loop are either irrelevant (deal with loop
1560 control, or dead), or only used outside the loop and can be moved
1561 out of the loop (e.g. invariants, inductions). The loop can be
1562 optimized away by scalar optimizations. We're better off not
1563 touching this loop. */
1564 if (!need_to_vectorize)
1566 if (dump_enabled_p ())
1567 dump_printf_loc (MSG_NOTE, vect_location,
1568 "All the computation can be taken out of the loop.\n");
1569 if (dump_enabled_p ())
1570 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1571 "not vectorized: redundant loop. no profit to "
1572 "vectorize.\n");
1573 return false;
1576 return true;
1580 /* Function vect_analyze_loop_2.
1582 Apply a set of analyses on LOOP, and create a loop_vec_info struct
1583 for it. The different analyses will record information in the
1584 loop_vec_info struct. */
1585 static bool
1586 vect_analyze_loop_2 (loop_vec_info loop_vinfo)
1588 bool ok;
1589 int max_vf = MAX_VECTORIZATION_FACTOR;
1590 int min_vf = 2;
1591 unsigned int n_stmts = 0;
1593 /* Find all data references in the loop (which correspond to vdefs/vuses)
1594 and analyze their evolution in the loop. Also adjust the minimal
1595 vectorization factor according to the loads and stores.
1597 FORNOW: Handle only simple array references whose
1598 alignment can be forced, and aligned pointer-references. */
1600 ok = vect_analyze_data_refs (loop_vinfo, &min_vf, &n_stmts);
1601 if (!ok)
1603 if (dump_enabled_p ())
1604 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1605 "bad data references.\n");
1606 return false;
1609 /* Classify all cross-iteration scalar data-flow cycles.
1610 Cross-iteration cycles caused by virtual phis are analyzed separately. */
1612 vect_analyze_scalar_cycles (loop_vinfo);
1614 vect_pattern_recog (loop_vinfo);
1616 vect_fixup_scalar_cycles_with_patterns (loop_vinfo);
1618 /* Analyze the access patterns of the data-refs in the loop (consecutive,
1619 complex, etc.). FORNOW: Only handle consecutive access pattern. */
1621 ok = vect_analyze_data_ref_accesses (loop_vinfo);
1622 if (!ok)
1624 if (dump_enabled_p ())
1625 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1626 "bad data access.\n");
1627 return false;
1630 /* Data-flow analysis to detect stmts that do not need to be vectorized. */
1632 ok = vect_mark_stmts_to_be_vectorized (loop_vinfo);
1633 if (!ok)
1635 if (dump_enabled_p ())
1636 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1637 "unexpected pattern.\n");
1638 return false;
1641 /* Analyze data dependences between the data-refs in the loop
1642 and adjust the maximum vectorization factor according to
1643 the dependences.
1644 FORNOW: fail at the first data dependence that we encounter. */
1646 ok = vect_analyze_data_ref_dependences (loop_vinfo, &max_vf);
1647 if (!ok
1648 || max_vf < min_vf)
1650 if (dump_enabled_p ())
1651 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1652 "bad data dependence.\n");
1653 return false;
1656 ok = vect_determine_vectorization_factor (loop_vinfo);
1657 if (!ok)
1659 if (dump_enabled_p ())
1660 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1661 "can't determine vectorization factor.\n");
1662 return false;
1664 if (max_vf < LOOP_VINFO_VECT_FACTOR (loop_vinfo))
1666 if (dump_enabled_p ())
1667 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1668 "bad data dependence.\n");
1669 return false;
1672 /* Check the SLP opportunities in the loop, analyze and build SLP trees. */
1673 ok = vect_analyze_slp (loop_vinfo, n_stmts);
1674 if (!ok)
1675 return false;
1677 /* If there are any SLP instances mark them as pure_slp. */
1678 bool slp = vect_make_slp_decision (loop_vinfo);
1679 if (slp)
1681 /* Find stmts that need to be both vectorized and SLPed. */
1682 vect_detect_hybrid_slp (loop_vinfo);
1684 /* Update the vectorization factor based on the SLP decision. */
1685 vect_update_vf_for_slp (loop_vinfo);
1688 /* Now the vectorization factor is final. */
1689 unsigned vectorization_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
1690 gcc_assert (vectorization_factor != 0);
1692 if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo) && dump_enabled_p ())
1693 dump_printf_loc (MSG_NOTE, vect_location,
1694 "vectorization_factor = %d, niters = "
1695 HOST_WIDE_INT_PRINT_DEC "\n", vectorization_factor,
1696 LOOP_VINFO_INT_NITERS (loop_vinfo));
1698 HOST_WIDE_INT max_niter
1699 = max_stmt_executions_int (LOOP_VINFO_LOOP (loop_vinfo));
1700 if ((LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
1701 && (LOOP_VINFO_INT_NITERS (loop_vinfo) < vectorization_factor))
1702 || (max_niter != -1
1703 && (unsigned HOST_WIDE_INT) max_niter < vectorization_factor))
1705 if (dump_enabled_p ())
1706 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1707 "not vectorized: iteration count too small.\n");
1708 if (dump_enabled_p ())
1709 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1710 "not vectorized: iteration count smaller than "
1711 "vectorization factor.\n");
1712 return false;
1715 /* Analyze the alignment of the data-refs in the loop.
1716 Fail if a data reference is found that cannot be vectorized. */
1718 ok = vect_analyze_data_refs_alignment (loop_vinfo);
1719 if (!ok)
1721 if (dump_enabled_p ())
1722 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1723 "bad data alignment.\n");
1724 return false;
1727 /* Prune the list of ddrs to be tested at run-time by versioning for alias.
1728 It is important to call pruning after vect_analyze_data_ref_accesses,
1729 since we use grouping information gathered by interleaving analysis. */
1730 ok = vect_prune_runtime_alias_test_list (loop_vinfo);
1731 if (!ok)
1733 if (dump_enabled_p ())
1734 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1735 "number of versioning for alias "
1736 "run-time tests exceeds %d "
1737 "(--param vect-max-version-for-alias-checks)\n",
1738 PARAM_VALUE (PARAM_VECT_MAX_VERSION_FOR_ALIAS_CHECKS));
1739 return false;
1742 /* Compute the scalar iteration cost. */
1743 vect_compute_single_scalar_iteration_cost (loop_vinfo);
1745 /* This pass will decide on using loop versioning and/or loop peeling in
1746 order to enhance the alignment of data references in the loop. */
1748 ok = vect_enhance_data_refs_alignment (loop_vinfo);
1749 if (!ok)
1751 if (dump_enabled_p ())
1752 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1753 "bad data alignment.\n");
1754 return false;
1757 if (slp)
1759 /* Analyze operations in the SLP instances. Note this may
1760 remove unsupported SLP instances which makes the above
1761 SLP kind detection invalid. */
1762 unsigned old_size = LOOP_VINFO_SLP_INSTANCES (loop_vinfo).length ();
1763 vect_slp_analyze_operations (LOOP_VINFO_SLP_INSTANCES (loop_vinfo),
1764 LOOP_VINFO_TARGET_COST_DATA (loop_vinfo));
1765 if (LOOP_VINFO_SLP_INSTANCES (loop_vinfo).length () != old_size)
1766 return false;
1769 /* Scan all the remaining operations in the loop that are not subject
1770 to SLP and make sure they are vectorizable. */
1771 ok = vect_analyze_loop_operations (loop_vinfo);
1772 if (!ok)
1774 if (dump_enabled_p ())
1775 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1776 "bad operation or unsupported loop bound.\n");
1777 return false;
1780 /* Analyze cost. Decide if worth while to vectorize. */
1781 int min_profitable_estimate, min_profitable_iters;
1782 vect_estimate_min_profitable_iters (loop_vinfo, &min_profitable_iters,
1783 &min_profitable_estimate);
1785 if (min_profitable_iters < 0)
1787 if (dump_enabled_p ())
1788 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1789 "not vectorized: vectorization not profitable.\n");
1790 if (dump_enabled_p ())
1791 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1792 "not vectorized: vector version will never be "
1793 "profitable.\n");
1794 return false;
1797 int min_scalar_loop_bound = ((PARAM_VALUE (PARAM_MIN_VECT_LOOP_BOUND)
1798 * vectorization_factor) - 1);
1800 /* Use the cost model only if it is more conservative than user specified
1801 threshold. */
1802 unsigned th = (unsigned) min_scalar_loop_bound;
1803 if (min_profitable_iters
1804 && (!min_scalar_loop_bound
1805 || min_profitable_iters > min_scalar_loop_bound))
1806 th = (unsigned) min_profitable_iters;
1808 LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo) = th;
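  /* Editorial illustration with made-up numbers (not in the original
     file): if PARAM_MIN_VECT_LOOP_BOUND is 3 and the vectorization
     factor is 4, min_scalar_loop_bound is 3 * 4 - 1 = 11; a cost-model
     result of min_profitable_iters = 8 then leaves th at 11, whereas a
     result of 20 would raise th to 20.  */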
1810 if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
1811 && LOOP_VINFO_INT_NITERS (loop_vinfo) <= th)
1813 if (dump_enabled_p ())
1814 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1815 "not vectorized: vectorization not profitable.\n");
1816 if (dump_enabled_p ())
1817 dump_printf_loc (MSG_NOTE, vect_location,
1818 "not vectorized: iteration count smaller than user "
1819 "specified loop bound parameter or minimum profitable "
1820 "iterations (whichever is more conservative).\n");
1821 return false;
1824 HOST_WIDE_INT estimated_niter
1825 = estimated_stmt_executions_int (LOOP_VINFO_LOOP (loop_vinfo));
1826 if (estimated_niter != -1
1827 && ((unsigned HOST_WIDE_INT) estimated_niter
1828 <= MAX (th, (unsigned)min_profitable_estimate)))
1830 if (dump_enabled_p ())
1831 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1832 "not vectorized: estimated iteration count too "
1833 "small.\n");
1834 if (dump_enabled_p ())
1835 dump_printf_loc (MSG_NOTE, vect_location,
1836 "not vectorized: estimated iteration count smaller "
1837 "than specified loop bound parameter or minimum "
1838 "profitable iterations (whichever is more "
1839 "conservative).\n");
1840 return false;
1843 /* Decide whether we need to create an epilogue loop to handle
1844 remaining scalar iterations. */
1845 th = ((LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo) + 1)
1846 / LOOP_VINFO_VECT_FACTOR (loop_vinfo))
1847 * LOOP_VINFO_VECT_FACTOR (loop_vinfo);
1849 if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
1850 && LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) > 0)
1852 if (ctz_hwi (LOOP_VINFO_INT_NITERS (loop_vinfo)
1853 - LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo))
1854 < exact_log2 (LOOP_VINFO_VECT_FACTOR (loop_vinfo)))
1855 LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo) = true;
1857 else if (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo)
1858 || (tree_ctz (LOOP_VINFO_NITERS (loop_vinfo))
1859 < (unsigned)exact_log2 (LOOP_VINFO_VECT_FACTOR (loop_vinfo))
1860 /* In case of versioning, check if the maximum number of
1861 iterations is greater than th. If they are identical,
1862 the epilogue is unnecessary. */
1863 && ((!LOOP_REQUIRES_VERSIONING_FOR_ALIAS (loop_vinfo)
1864 && !LOOP_REQUIRES_VERSIONING_FOR_ALIGNMENT (loop_vinfo))
1865 || (unsigned HOST_WIDE_INT) max_niter > th)))
1866 LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo) = true;
1868 /* If an epilogue loop is required make sure we can create one. */
1869 if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo)
1870 || LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo))
1872 if (dump_enabled_p ())
1873 dump_printf_loc (MSG_NOTE, vect_location, "epilog loop required\n");
1874 if (!vect_can_advance_ivs_p (loop_vinfo)
1875 || !slpeel_can_duplicate_loop_p (LOOP_VINFO_LOOP (loop_vinfo),
1876 single_exit (LOOP_VINFO_LOOP
1877 (loop_vinfo))))
1879 if (dump_enabled_p ())
1880 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1881 "not vectorized: can't create required "
1882 "epilog loop\n");
1883 return false;
1887 gcc_assert (vectorization_factor
1888 == (unsigned)LOOP_VINFO_VECT_FACTOR (loop_vinfo));
1890 return true;
1893 /* Function vect_analyze_loop.
1895 Apply a set of analyses on LOOP, and create a loop_vec_info struct
1896 for it. The different analyses will record information in the
1897 loop_vec_info struct. */
1898 loop_vec_info
1899 vect_analyze_loop (struct loop *loop)
1901 loop_vec_info loop_vinfo;
1902 unsigned int vector_sizes;
1904 /* Autodetect first vector size we try. */
1905 current_vector_size = 0;
1906 vector_sizes = targetm.vectorize.autovectorize_vector_sizes ();
1908 if (dump_enabled_p ())
1909 dump_printf_loc (MSG_NOTE, vect_location,
1910 "===== analyze_loop_nest =====\n");
1912 if (loop_outer (loop)
1913 && loop_vec_info_for_loop (loop_outer (loop))
1914 && LOOP_VINFO_VECTORIZABLE_P (loop_vec_info_for_loop (loop_outer (loop))))
1916 if (dump_enabled_p ())
1917 dump_printf_loc (MSG_NOTE, vect_location,
1918 "outer-loop already vectorized.\n");
1919 return NULL;
1922 while (1)
1924 /* Check the CFG characteristics of the loop (nesting, entry/exit). */
1925 loop_vinfo = vect_analyze_loop_form (loop);
1926 if (!loop_vinfo)
1928 if (dump_enabled_p ())
1929 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1930 "bad loop form.\n");
1931 return NULL;
1934 if (vect_analyze_loop_2 (loop_vinfo))
1936 LOOP_VINFO_VECTORIZABLE_P (loop_vinfo) = 1;
1938 return loop_vinfo;
1941 destroy_loop_vec_info (loop_vinfo, true);
1943 vector_sizes &= ~current_vector_size;
1944 if (vector_sizes == 0
1945 || current_vector_size == 0)
1946 return NULL;
1948 /* Try the largest remaining vector size. */
1949 current_vector_size = 1 << floor_log2 (vector_sizes);
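/* For example, on a hypothetical target whose autovectorize_vector_sizes
   hook returns 16 | 8 | 4 = 28: after a failed attempt with 16-byte
   vectors the mask becomes 12, floor_log2 (12) is 3, and the next
   attempt is made with 1 << 3 = 8-byte vectors.  */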
1950 if (dump_enabled_p ())
1951 dump_printf_loc (MSG_NOTE, vect_location,
1952 "***** Re-trying analysis with "
1953 "vector size %d\n", current_vector_size);
1958 /* Function reduction_code_for_scalar_code
1960 Input:
1961 CODE - tree_code of a reduction operation.
1963 Output:
1964 REDUC_CODE - the corresponding tree-code to be used to reduce the
1965 vector of partial results into a single scalar result, or ERROR_MARK
1966 if the operation is a supported reduction operation, but does not have
1967 such a tree-code.
1969 Return FALSE if CODE currently cannot be vectorized as a reduction. */
1971 static bool
1972 reduction_code_for_scalar_code (enum tree_code code,
1973 enum tree_code *reduc_code)
1975 switch (code)
1977 case MAX_EXPR:
1978 *reduc_code = REDUC_MAX_EXPR;
1979 return true;
1981 case MIN_EXPR:
1982 *reduc_code = REDUC_MIN_EXPR;
1983 return true;
1985 case PLUS_EXPR:
1986 *reduc_code = REDUC_PLUS_EXPR;
1987 return true;
1989 case MULT_EXPR:
1990 case MINUS_EXPR:
1991 case BIT_IOR_EXPR:
1992 case BIT_XOR_EXPR:
1993 case BIT_AND_EXPR:
1994 *reduc_code = ERROR_MARK;
1995 return true;
1997 default:
1998 return false;
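/* For example, a scalar loop accumulating m = MAX (m, a[i]) reduces with
   MAX_EXPR, for which the mapping above yields REDUC_MAX_EXPR; the epilogue
   then folds the vector of partial maxima into a single scalar with one
   statement.  MULT_EXPR, MINUS_EXPR and the bitwise codes have no such
   tree-code, so they get ERROR_MARK and the epilogue falls back to shifts
   and extracts.  */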
2003 /* Error reporting helper for vect_is_simple_reduction below. GIMPLE statement
2004 STMT is printed with a message MSG. */
2006 static void
2007 report_vect_op (int msg_type, gimple *stmt, const char *msg)
2009 dump_printf_loc (msg_type, vect_location, "%s", msg);
2010 dump_gimple_stmt (msg_type, TDF_SLIM, stmt, 0);
2011 dump_printf (msg_type, "\n");
2015 /* Detect SLP reduction of the form:
2017 #a1 = phi <a5, a0>
2018 a2 = operation (a1)
2019 a3 = operation (a2)
2020 a4 = operation (a3)
2021 a5 = operation (a4)
2023 #a = phi <a5>
2025 PHI is the reduction phi node (#a1 = phi <a5, a0> above)
2026 FIRST_STMT is the first reduction stmt in the chain
2027 (a2 = operation (a1)).
2029 Return TRUE if a reduction chain was detected. */
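/* For illustration, a manually unrolled accumulation such as

     for (i = 0; i < n; i += 4)
       {
         s = s + a[i];
         s = s + a[i + 1];
         s = s + a[i + 2];
         s = s + a[i + 3];
       }

   has exactly this shape in SSA: the loop-header phi feeds the first add,
   each add feeds the next one, and only the final sum flows back to the
   phi and out of the loop.  */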
2031 static bool
2032 vect_is_slp_reduction (loop_vec_info loop_info, gimple *phi,
2033 gimple *first_stmt)
2035 struct loop *loop = (gimple_bb (phi))->loop_father;
2036 struct loop *vect_loop = LOOP_VINFO_LOOP (loop_info);
2037 enum tree_code code;
2038 gimple *current_stmt = NULL, *loop_use_stmt = NULL, *first, *next_stmt;
2039 stmt_vec_info use_stmt_info, current_stmt_info;
2040 tree lhs;
2041 imm_use_iterator imm_iter;
2042 use_operand_p use_p;
2043 int nloop_uses, size = 0, n_out_of_loop_uses;
2044 bool found = false;
2046 if (loop != vect_loop)
2047 return false;
2049 lhs = PHI_RESULT (phi);
2050 code = gimple_assign_rhs_code (first_stmt);
2051 while (1)
2053 nloop_uses = 0;
2054 n_out_of_loop_uses = 0;
2055 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, lhs)
2057 gimple *use_stmt = USE_STMT (use_p);
2058 if (is_gimple_debug (use_stmt))
2059 continue;
2061 /* Check if we got back to the reduction phi. */
2062 if (use_stmt == phi)
2064 loop_use_stmt = use_stmt;
2065 found = true;
2066 break;
2069 if (flow_bb_inside_loop_p (loop, gimple_bb (use_stmt)))
2071 loop_use_stmt = use_stmt;
2072 nloop_uses++;
2074 else
2075 n_out_of_loop_uses++;
2077 /* There can be either a single use in the loop or two uses in
2078 phi nodes. */
2079 if (nloop_uses > 1 || (n_out_of_loop_uses && nloop_uses))
2080 return false;
2083 if (found)
2084 break;
2086 /* We reached a statement with no loop uses. */
2087 if (nloop_uses == 0)
2088 return false;
2090 /* This is a loop exit phi, and we haven't reached the reduction phi. */
2091 if (gimple_code (loop_use_stmt) == GIMPLE_PHI)
2092 return false;
2094 if (!is_gimple_assign (loop_use_stmt)
2095 || code != gimple_assign_rhs_code (loop_use_stmt)
2096 || !flow_bb_inside_loop_p (loop, gimple_bb (loop_use_stmt)))
2097 return false;
2099 /* Insert USE_STMT into reduction chain. */
2100 use_stmt_info = vinfo_for_stmt (loop_use_stmt);
2101 if (current_stmt)
2103 current_stmt_info = vinfo_for_stmt (current_stmt);
2104 GROUP_NEXT_ELEMENT (current_stmt_info) = loop_use_stmt;
2105 GROUP_FIRST_ELEMENT (use_stmt_info)
2106 = GROUP_FIRST_ELEMENT (current_stmt_info);
2108 else
2109 GROUP_FIRST_ELEMENT (use_stmt_info) = loop_use_stmt;
2111 lhs = gimple_assign_lhs (loop_use_stmt);
2112 current_stmt = loop_use_stmt;
2113 size++;
2116 if (!found || loop_use_stmt != phi || size < 2)
2117 return false;
2119 /* Swap the operands, if needed, to make the reduction operand be the second
2120 operand. */
2121 lhs = PHI_RESULT (phi);
2122 next_stmt = GROUP_FIRST_ELEMENT (vinfo_for_stmt (current_stmt));
2123 while (next_stmt)
2125 if (gimple_assign_rhs2 (next_stmt) == lhs)
2127 tree op = gimple_assign_rhs1 (next_stmt);
2128 gimple *def_stmt = NULL;
2130 if (TREE_CODE (op) == SSA_NAME)
2131 def_stmt = SSA_NAME_DEF_STMT (op);
2133 /* Check that the other def is either defined in the loop
2134 ("vect_internal_def"), or it's an induction (defined by a
2135 loop-header phi-node). */
2136 if (def_stmt
2137 && gimple_bb (def_stmt)
2138 && flow_bb_inside_loop_p (loop, gimple_bb (def_stmt))
2139 && (is_gimple_assign (def_stmt)
2140 || is_gimple_call (def_stmt)
2141 || STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def_stmt))
2142 == vect_induction_def
2143 || (gimple_code (def_stmt) == GIMPLE_PHI
2144 && STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def_stmt))
2145 == vect_internal_def
2146 && !is_loop_header_bb_p (gimple_bb (def_stmt)))))
2148 lhs = gimple_assign_lhs (next_stmt);
2149 next_stmt = GROUP_NEXT_ELEMENT (vinfo_for_stmt (next_stmt));
2150 continue;
2153 return false;
2155 else
2157 tree op = gimple_assign_rhs2 (next_stmt);
2158 gimple *def_stmt = NULL;
2160 if (TREE_CODE (op) == SSA_NAME)
2161 def_stmt = SSA_NAME_DEF_STMT (op);
2163 /* Check that the other def is either defined in the loop
2164 ("vect_internal_def"), or it's an induction (defined by a
2165 loop-header phi-node). */
2166 if (def_stmt
2167 && gimple_bb (def_stmt)
2168 && flow_bb_inside_loop_p (loop, gimple_bb (def_stmt))
2169 && (is_gimple_assign (def_stmt)
2170 || is_gimple_call (def_stmt)
2171 || STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def_stmt))
2172 == vect_induction_def
2173 || (gimple_code (def_stmt) == GIMPLE_PHI
2174 && STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def_stmt))
2175 == vect_internal_def
2176 && !is_loop_header_bb_p (gimple_bb (def_stmt)))))
2178 if (dump_enabled_p ())
2180 dump_printf_loc (MSG_NOTE, vect_location, "swapping oprnds: ");
2181 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, next_stmt, 0);
2182 dump_printf (MSG_NOTE, "\n");
2185 swap_ssa_operands (next_stmt,
2186 gimple_assign_rhs1_ptr (next_stmt),
2187 gimple_assign_rhs2_ptr (next_stmt));
2188 update_stmt (next_stmt);
2190 if (CONSTANT_CLASS_P (gimple_assign_rhs1 (next_stmt)))
2191 LOOP_VINFO_OPERANDS_SWAPPED (loop_info) = true;
2193 else
2194 return false;
2197 lhs = gimple_assign_lhs (next_stmt);
2198 next_stmt = GROUP_NEXT_ELEMENT (vinfo_for_stmt (next_stmt));
2201 /* Save the chain for further analysis in SLP detection. */
2202 first = GROUP_FIRST_ELEMENT (vinfo_for_stmt (current_stmt));
2203 LOOP_VINFO_REDUCTION_CHAINS (loop_info).safe_push (first);
2204 GROUP_SIZE (vinfo_for_stmt (first)) = size;
2206 return true;
2210 /* Function vect_is_simple_reduction_1
2212 (1) Detect a cross-iteration def-use cycle that represents a simple
2213 reduction computation. We look for the following pattern:
2215 loop_header:
2216 a1 = phi < a0, a2 >
2217 a3 = ...
2218 a2 = operation (a3, a1)
2220 or
2222 a3 = ...
2223 loop_header:
2224 a1 = phi < a0, a2 >
2225 a2 = operation (a3, a1)
2227 such that:
2228 1. operation is commutative and associative and it is safe to
2229 change the order of the computation (if CHECK_REDUCTION is true)
2230 2. no uses of a2 in the loop (a2 is only used outside the loop)
2231 3. no uses of a1 in the loop besides the reduction operation
2232 4. no uses of a1 outside the loop.
2234 Conditions 1,4 are tested here.
2235 Conditions 2,3 are tested in vect_mark_stmts_to_be_vectorized.
2237 (2) Detect a cross-iteration def-use cycle in nested loops, i.e.,
2238 nested cycles, if CHECK_REDUCTION is false.
2240 (3) Detect cycles of phi nodes in outer-loop vectorization, i.e., double
2241 reductions:
2243 a1 = phi < a0, a2 >
2244 inner loop (def of a3)
2245 a2 = phi < a3 >
2247 If MODIFY is true it also tries to rework the code in-place to enable
2248 detection of more reduction patterns. For the time being we rewrite
2249 "res -= RHS" into "res += -RHS" when it seems worthwhile.
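/* For example, the plain sum

     s = s_init;
     for (i = 0; i < n; i++)
       s += a[i];

   matches pattern (1): a1 = phi <s_init, a2>, a3 = a[i], a2 = a3 + a1,
   with the accumulated value only consumed after the loop.  A loop doing
   "s -= a[i]" instead is caught via the MINUS_EXPR rewrite described
   above.  */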
2252 static gimple *
2253 vect_is_simple_reduction_1 (loop_vec_info loop_info, gimple *phi,
2254 bool check_reduction, bool *double_reduc,
2255 bool modify, bool need_wrapping_integral_overflow)
2257 struct loop *loop = (gimple_bb (phi))->loop_father;
2258 struct loop *vect_loop = LOOP_VINFO_LOOP (loop_info);
2259 edge latch_e = loop_latch_edge (loop);
2260 tree loop_arg = PHI_ARG_DEF_FROM_EDGE (phi, latch_e);
2261 gimple *def_stmt, *def1 = NULL, *def2 = NULL;
2262 enum tree_code orig_code, code;
2263 tree op1, op2, op3 = NULL_TREE, op4 = NULL_TREE;
2264 tree type;
2265 int nloop_uses;
2266 tree name;
2267 imm_use_iterator imm_iter;
2268 use_operand_p use_p;
2269 bool phi_def;
2271 *double_reduc = false;
2273 /* If CHECK_REDUCTION is true, we assume inner-most loop vectorization,
2274 otherwise, we assume outer loop vectorization. */
2275 gcc_assert ((check_reduction && loop == vect_loop)
2276 || (!check_reduction && flow_loop_nested_p (vect_loop, loop)));
2278 name = PHI_RESULT (phi);
2279 /* ??? If there are no uses of the PHI result the inner loop reduction
2280 won't be detected as possibly double-reduction by vectorizable_reduction
2281 because that tries to walk the PHI arg from the preheader edge which
2282 can be constant. See PR60382. */
2283 if (has_zero_uses (name))
2284 return NULL;
2285 nloop_uses = 0;
2286 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, name)
2288 gimple *use_stmt = USE_STMT (use_p);
2289 if (is_gimple_debug (use_stmt))
2290 continue;
2292 if (!flow_bb_inside_loop_p (loop, gimple_bb (use_stmt)))
2294 if (dump_enabled_p ())
2295 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2296 "intermediate value used outside loop.\n");
2298 return NULL;
2301 nloop_uses++;
2302 if (nloop_uses > 1)
2304 if (dump_enabled_p ())
2305 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2306 "reduction used in loop.\n");
2307 return NULL;
2311 if (TREE_CODE (loop_arg) != SSA_NAME)
2313 if (dump_enabled_p ())
2315 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2316 "reduction: not ssa_name: ");
2317 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, loop_arg);
2318 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
2320 return NULL;
2323 def_stmt = SSA_NAME_DEF_STMT (loop_arg);
2324 if (!def_stmt)
2326 if (dump_enabled_p ())
2327 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2328 "reduction: no def_stmt.\n");
2329 return NULL;
2332 if (!is_gimple_assign (def_stmt) && gimple_code (def_stmt) != GIMPLE_PHI)
2334 if (dump_enabled_p ())
2336 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, def_stmt, 0);
2337 dump_printf (MSG_NOTE, "\n");
2339 return NULL;
2342 if (is_gimple_assign (def_stmt))
2344 name = gimple_assign_lhs (def_stmt);
2345 phi_def = false;
2347 else
2349 name = PHI_RESULT (def_stmt);
2350 phi_def = true;
2353 nloop_uses = 0;
2354 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, name)
2356 gimple *use_stmt = USE_STMT (use_p);
2357 if (is_gimple_debug (use_stmt))
2358 continue;
2359 if (flow_bb_inside_loop_p (loop, gimple_bb (use_stmt)))
2360 nloop_uses++;
2361 if (nloop_uses > 1)
2363 if (dump_enabled_p ())
2364 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2365 "reduction used in loop.\n");
2366 return NULL;
2370 /* If DEF_STMT is a phi node itself, we expect it to have a single argument
2371 defined in the inner loop. */
2372 if (phi_def)
2374 op1 = PHI_ARG_DEF (def_stmt, 0);
2376 if (gimple_phi_num_args (def_stmt) != 1
2377 || TREE_CODE (op1) != SSA_NAME)
2379 if (dump_enabled_p ())
2380 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2381 "unsupported phi node definition.\n");
2383 return NULL;
2386 def1 = SSA_NAME_DEF_STMT (op1);
2387 if (gimple_bb (def1)
2388 && flow_bb_inside_loop_p (loop, gimple_bb (def_stmt))
2389 && loop->inner
2390 && flow_bb_inside_loop_p (loop->inner, gimple_bb (def1))
2391 && is_gimple_assign (def1))
2393 if (dump_enabled_p ())
2394 report_vect_op (MSG_NOTE, def_stmt,
2395 "detected double reduction: ");
2397 *double_reduc = true;
2398 return def_stmt;
2401 return NULL;
2404 code = orig_code = gimple_assign_rhs_code (def_stmt);
2406 /* We can handle "res -= x[i]", which is non-associative, by
2407 simply rewriting this into "res += -x[i]". Avoid changing the
2408 gimple instruction for the first simple tests and only do this
2409 if we're allowed to change code at all. */
2410 if (code == MINUS_EXPR
2411 && modify
2412 && (op1 = gimple_assign_rhs1 (def_stmt))
2413 && TREE_CODE (op1) == SSA_NAME
2414 && SSA_NAME_DEF_STMT (op1) == phi)
2415 code = PLUS_EXPR;
2417 if (check_reduction
2418 && (!commutative_tree_code (code) || !associative_tree_code (code)))
2420 if (dump_enabled_p ())
2421 report_vect_op (MSG_MISSED_OPTIMIZATION, def_stmt,
2422 "reduction: not commutative/associative: ");
2423 return NULL;
2426 if (get_gimple_rhs_class (code) != GIMPLE_BINARY_RHS)
2428 if (code != COND_EXPR)
2430 if (dump_enabled_p ())
2431 report_vect_op (MSG_MISSED_OPTIMIZATION, def_stmt,
2432 "reduction: not binary operation: ");
2434 return NULL;
2437 op3 = gimple_assign_rhs1 (def_stmt);
2438 if (COMPARISON_CLASS_P (op3))
2440 op4 = TREE_OPERAND (op3, 1);
2441 op3 = TREE_OPERAND (op3, 0);
2444 op1 = gimple_assign_rhs2 (def_stmt);
2445 op2 = gimple_assign_rhs3 (def_stmt);
2447 if (TREE_CODE (op1) != SSA_NAME && TREE_CODE (op2) != SSA_NAME)
2449 if (dump_enabled_p ())
2450 report_vect_op (MSG_MISSED_OPTIMIZATION, def_stmt,
2451 "reduction: uses not ssa_names: ");
2453 return NULL;
2456 else
2458 op1 = gimple_assign_rhs1 (def_stmt);
2459 op2 = gimple_assign_rhs2 (def_stmt);
2461 if (TREE_CODE (op1) != SSA_NAME && TREE_CODE (op2) != SSA_NAME)
2463 if (dump_enabled_p ())
2464 report_vect_op (MSG_MISSED_OPTIMIZATION, def_stmt,
2465 "reduction: uses not ssa_names: ");
2467 return NULL;
2471 type = TREE_TYPE (gimple_assign_lhs (def_stmt));
2472 if ((TREE_CODE (op1) == SSA_NAME
2473 && !types_compatible_p (type,TREE_TYPE (op1)))
2474 || (TREE_CODE (op2) == SSA_NAME
2475 && !types_compatible_p (type, TREE_TYPE (op2)))
2476 || (op3 && TREE_CODE (op3) == SSA_NAME
2477 && !types_compatible_p (type, TREE_TYPE (op3)))
2478 || (op4 && TREE_CODE (op4) == SSA_NAME
2479 && !types_compatible_p (type, TREE_TYPE (op4))))
2481 if (dump_enabled_p ())
2483 dump_printf_loc (MSG_NOTE, vect_location,
2484 "reduction: multiple types: operation type: ");
2485 dump_generic_expr (MSG_NOTE, TDF_SLIM, type);
2486 dump_printf (MSG_NOTE, ", operands types: ");
2487 dump_generic_expr (MSG_NOTE, TDF_SLIM,
2488 TREE_TYPE (op1));
2489 dump_printf (MSG_NOTE, ",");
2490 dump_generic_expr (MSG_NOTE, TDF_SLIM,
2491 TREE_TYPE (op2));
2492 if (op3)
2494 dump_printf (MSG_NOTE, ",");
2495 dump_generic_expr (MSG_NOTE, TDF_SLIM,
2496 TREE_TYPE (op3));
2499 if (op4)
2501 dump_printf (MSG_NOTE, ",");
2502 dump_generic_expr (MSG_NOTE, TDF_SLIM,
2503 TREE_TYPE (op4));
2505 dump_printf (MSG_NOTE, "\n");
2508 return NULL;
2511 /* Check that it's ok to change the order of the computation.
2512 Generally, when vectorizing a reduction we change the order of the
2513 computation. This may change the behavior of the program in some
2514 cases, so we need to check that this is ok. One exception is when
2515 vectorizing an outer-loop: the inner-loop is executed sequentially,
2516 and therefore vectorizing reductions in the inner-loop during
2517 outer-loop vectorization is safe. */
2519 /* CHECKME: check for !flag_finite_math_only too? */
2520 if (SCALAR_FLOAT_TYPE_P (type) && !flag_associative_math
2521 && check_reduction)
2523 /* Changing the order of operations changes the semantics. */
2524 if (dump_enabled_p ())
2525 report_vect_op (MSG_MISSED_OPTIMIZATION, def_stmt,
2526 "reduction: unsafe fp math optimization: ");
2527 return NULL;
2529 else if (INTEGRAL_TYPE_P (type) && check_reduction)
2531 if (!operation_no_trapping_overflow (type, code))
2533 /* Changing the order of operations changes the semantics. */
2534 if (dump_enabled_p ())
2535 report_vect_op (MSG_MISSED_OPTIMIZATION, def_stmt,
2536 "reduction: unsafe int math optimization"
2537 " (overflow traps): ");
2538 return NULL;
2540 if (need_wrapping_integral_overflow
2541 && !TYPE_OVERFLOW_WRAPS (type)
2542 && operation_can_overflow (code))
2544 /* Changing the order of operations changes the semantics. */
2545 if (dump_enabled_p ())
2546 report_vect_op (MSG_MISSED_OPTIMIZATION, def_stmt,
2547 "reduction: unsafe int math optimization"
2548 " (overflow doesn't wrap): ");
2549 return NULL;
2552 else if (SAT_FIXED_POINT_TYPE_P (type) && check_reduction)
2554 /* Changing the order of operations changes the semantics. */
2555 if (dump_enabled_p ())
2556 report_vect_op (MSG_MISSED_OPTIMIZATION, def_stmt,
2557 "reduction: unsafe fixed-point math optimization: ");
2558 return NULL;
2561 /* If we detected "res -= x[i]" earlier, rewrite it into
2562 "res += -x[i]" now. If this turns out to be useless, reassoc
2563 will clean it up again. */
2564 if (orig_code == MINUS_EXPR)
2566 tree rhs = gimple_assign_rhs2 (def_stmt);
2567 tree negrhs = make_ssa_name (TREE_TYPE (rhs));
2568 gimple *negate_stmt = gimple_build_assign (negrhs, NEGATE_EXPR, rhs);
2569 gimple_stmt_iterator gsi = gsi_for_stmt (def_stmt);
2570 set_vinfo_for_stmt (negate_stmt, new_stmt_vec_info (negate_stmt,
2571 loop_info));
2572 gsi_insert_before (&gsi, negate_stmt, GSI_NEW_STMT);
2573 gimple_assign_set_rhs2 (def_stmt, negrhs);
2574 gimple_assign_set_rhs_code (def_stmt, PLUS_EXPR);
2575 update_stmt (def_stmt);
2578 /* Reduction is safe. We're dealing with one of the following:
2579 1) integer arithmetic and no trapv
2580 2) floating point arithmetic, and special flags permit this optimization
2581 3) nested cycle (i.e., outer loop vectorization). */
2582 if (TREE_CODE (op1) == SSA_NAME)
2583 def1 = SSA_NAME_DEF_STMT (op1);
2585 if (TREE_CODE (op2) == SSA_NAME)
2586 def2 = SSA_NAME_DEF_STMT (op2);
2588 if (code != COND_EXPR
2589 && ((!def1 || gimple_nop_p (def1)) && (!def2 || gimple_nop_p (def2))))
2591 if (dump_enabled_p ())
2592 report_vect_op (MSG_NOTE, def_stmt, "reduction: no defs for operands: ");
2593 return NULL;
2596 /* Check that one def is the reduction def, defined by PHI,
2597 the other def is either defined in the loop ("vect_internal_def"),
2598 or it's an induction (defined by a loop-header phi-node). */
2600 if (def2 && def2 == phi
2601 && (code == COND_EXPR
2602 || !def1 || gimple_nop_p (def1)
2603 || !flow_bb_inside_loop_p (loop, gimple_bb (def1))
2604 || (def1 && flow_bb_inside_loop_p (loop, gimple_bb (def1))
2605 && (is_gimple_assign (def1)
2606 || is_gimple_call (def1)
2607 || STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def1))
2608 == vect_induction_def
2609 || (gimple_code (def1) == GIMPLE_PHI
2610 && STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def1))
2611 == vect_internal_def
2612 && !is_loop_header_bb_p (gimple_bb (def1)))))))
2614 if (dump_enabled_p ())
2615 report_vect_op (MSG_NOTE, def_stmt, "detected reduction: ");
2616 return def_stmt;
2619 if (def1 && def1 == phi
2620 && (code == COND_EXPR
2621 || !def2 || gimple_nop_p (def2)
2622 || !flow_bb_inside_loop_p (loop, gimple_bb (def2))
2623 || (def2 && flow_bb_inside_loop_p (loop, gimple_bb (def2))
2624 && (is_gimple_assign (def2)
2625 || is_gimple_call (def2)
2626 || STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def2))
2627 == vect_induction_def
2628 || (gimple_code (def2) == GIMPLE_PHI
2629 && STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def2))
2630 == vect_internal_def
2631 && !is_loop_header_bb_p (gimple_bb (def2)))))))
2633 if (check_reduction)
2635 /* Swap operands (just for simplicity - so that the rest of the code
2636 can assume that the reduction variable is always the last (second)
2637 argument). */
2638 if (dump_enabled_p ())
2639 report_vect_op (MSG_NOTE, def_stmt,
2640 "detected reduction: need to swap operands: ");
2642 swap_ssa_operands (def_stmt, gimple_assign_rhs1_ptr (def_stmt),
2643 gimple_assign_rhs2_ptr (def_stmt));
2645 if (CONSTANT_CLASS_P (gimple_assign_rhs1 (def_stmt)))
2646 LOOP_VINFO_OPERANDS_SWAPPED (loop_info) = true;
2648 else
2650 if (dump_enabled_p ())
2651 report_vect_op (MSG_NOTE, def_stmt, "detected reduction: ");
2654 return def_stmt;
2657 /* Try to find SLP reduction chain. */
2658 if (check_reduction && vect_is_slp_reduction (loop_info, phi, def_stmt))
2660 if (dump_enabled_p ())
2661 report_vect_op (MSG_NOTE, def_stmt,
2662 "reduction: detected reduction chain: ");
2664 return def_stmt;
2667 if (dump_enabled_p ())
2668 report_vect_op (MSG_MISSED_OPTIMIZATION, def_stmt,
2669 "reduction: unknown pattern: ");
2671 return NULL;
2674 /* Wrapper around vect_is_simple_reduction_1, that won't modify code
2675 in-place. Arguments as there. */
2677 static gimple *
2678 vect_is_simple_reduction (loop_vec_info loop_info, gimple *phi,
2679 bool check_reduction, bool *double_reduc,
2680 bool need_wrapping_integral_overflow)
2682 return vect_is_simple_reduction_1 (loop_info, phi, check_reduction,
2683 double_reduc, false,
2684 need_wrapping_integral_overflow);
2687 /* Wrapper around vect_is_simple_reduction_1, which will modify code
2688 in-place if it enables detection of more reductions. Arguments
2689 as there. */
2691 gimple *
2692 vect_force_simple_reduction (loop_vec_info loop_info, gimple *phi,
2693 bool check_reduction, bool *double_reduc,
2694 bool need_wrapping_integral_overflow)
2696 return vect_is_simple_reduction_1 (loop_info, phi, check_reduction,
2697 double_reduc, true,
2698 need_wrapping_integral_overflow);
2701 /* Calculate cost of peeling the loop PEEL_ITERS_PROLOGUE times. */
2702 int
2703 vect_get_known_peeling_cost (loop_vec_info loop_vinfo, int peel_iters_prologue,
2704 int *peel_iters_epilogue,
2705 stmt_vector_for_cost *scalar_cost_vec,
2706 stmt_vector_for_cost *prologue_cost_vec,
2707 stmt_vector_for_cost *epilogue_cost_vec)
2709 int retval = 0;
2710 int vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
2712 if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
2714 *peel_iters_epilogue = vf/2;
2715 if (dump_enabled_p ())
2716 dump_printf_loc (MSG_NOTE, vect_location,
2717 "cost model: epilogue peel iters set to vf/2 "
2718 "because loop iterations are unknown.\n");
2720 /* If peeled iterations are known but the number of scalar loop
2721 iterations is unknown, count a taken branch per peeled loop. */
2722 retval = record_stmt_cost (prologue_cost_vec, 1, cond_branch_taken,
2723 NULL, 0, vect_prologue);
2724 retval = record_stmt_cost (prologue_cost_vec, 1, cond_branch_taken,
2725 NULL, 0, vect_epilogue);
2727 else
2729 int niters = LOOP_VINFO_INT_NITERS (loop_vinfo);
2730 peel_iters_prologue = niters < peel_iters_prologue ?
2731 niters : peel_iters_prologue;
2732 *peel_iters_epilogue = (niters - peel_iters_prologue) % vf;
2733 /* If we need to peel for gaps, but no epilogue peeling would otherwise
2734 be required, we have to peel VF iterations. */
2735 if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo) && !*peel_iters_epilogue)
2736 *peel_iters_epilogue = vf;
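/* E.g., with made-up numbers: 103 known iterations, VF = 8 and a prologue
   peel of 3 leave (103 - 3) % 8 = 4 epilogue iterations; had that
   remainder been 0 while peeling for gaps, a full 8 iterations would be
   peeled instead.  */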
2739 stmt_info_for_cost *si;
2740 int j;
2741 if (peel_iters_prologue)
2742 FOR_EACH_VEC_ELT (*scalar_cost_vec, j, si)
2743 retval += record_stmt_cost (prologue_cost_vec,
2744 si->count * peel_iters_prologue,
2745 si->kind, NULL, si->misalign,
2746 vect_prologue);
2747 if (*peel_iters_epilogue)
2748 FOR_EACH_VEC_ELT (*scalar_cost_vec, j, si)
2749 retval += record_stmt_cost (epilogue_cost_vec,
2750 si->count * *peel_iters_epilogue,
2751 si->kind, NULL, si->misalign,
2752 vect_epilogue);
2754 return retval;
2757 /* Function vect_estimate_min_profitable_iters
2759 Return the number of iterations required for the vector version of the
2760 loop to be profitable relative to the cost of the scalar version of the
2761 loop. */
2763 static void
2764 vect_estimate_min_profitable_iters (loop_vec_info loop_vinfo,
2765 int *ret_min_profitable_niters,
2766 int *ret_min_profitable_estimate)
2768 int min_profitable_iters;
2769 int min_profitable_estimate;
2770 int peel_iters_prologue;
2771 int peel_iters_epilogue;
2772 unsigned vec_inside_cost = 0;
2773 int vec_outside_cost = 0;
2774 unsigned vec_prologue_cost = 0;
2775 unsigned vec_epilogue_cost = 0;
2776 int scalar_single_iter_cost = 0;
2777 int scalar_outside_cost = 0;
2778 int vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
2779 int npeel = LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo);
2780 void *target_cost_data = LOOP_VINFO_TARGET_COST_DATA (loop_vinfo);
2782 /* Cost model disabled. */
2783 if (unlimited_cost_model (LOOP_VINFO_LOOP (loop_vinfo)))
2785 dump_printf_loc (MSG_NOTE, vect_location, "cost model disabled.\n");
2786 *ret_min_profitable_niters = 0;
2787 *ret_min_profitable_estimate = 0;
2788 return;
2791 /* Requires loop versioning tests to handle misalignment. */
2792 if (LOOP_REQUIRES_VERSIONING_FOR_ALIGNMENT (loop_vinfo))
2794 /* FIXME: Make cost depend on complexity of individual check. */
2795 unsigned len = LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).length ();
2796 (void) add_stmt_cost (target_cost_data, len, vector_stmt, NULL, 0,
2797 vect_prologue);
2798 dump_printf (MSG_NOTE,
2799 "cost model: Adding cost of checks for loop "
2800 "versioning to treat misalignment.\n");
2803 /* Requires loop versioning with alias checks. */
2804 if (LOOP_REQUIRES_VERSIONING_FOR_ALIAS (loop_vinfo))
2806 /* FIXME: Make cost depend on complexity of individual check. */
2807 unsigned len = LOOP_VINFO_COMP_ALIAS_DDRS (loop_vinfo).length ();
2808 (void) add_stmt_cost (target_cost_data, len, vector_stmt, NULL, 0,
2809 vect_prologue);
2810 dump_printf (MSG_NOTE,
2811 "cost model: Adding cost of checks for loop "
2812 "versioning aliasing.\n");
2815 if (LOOP_REQUIRES_VERSIONING_FOR_ALIGNMENT (loop_vinfo)
2816 || LOOP_REQUIRES_VERSIONING_FOR_ALIAS (loop_vinfo))
2817 (void) add_stmt_cost (target_cost_data, 1, cond_branch_taken, NULL, 0,
2818 vect_prologue);
2820 /* Count statements in scalar loop. Using this as scalar cost for a single
2821 iteration for now.
2823 TODO: Add outer loop support.
2825 TODO: Consider assigning different costs to different scalar
2826 statements. */
2828 scalar_single_iter_cost
2829 = LOOP_VINFO_SINGLE_SCALAR_ITERATION_COST (loop_vinfo);
2831 /* Add additional cost for the peeled instructions in prologue and epilogue
2832 loop.
2834 FORNOW: If we don't know the value of peel_iters for prologue or epilogue
2835 at compile-time - we assume it's vf/2 (the worst would be vf-1).
2837 TODO: Build an expression that represents peel_iters for prologue and
2838 epilogue to be used in a run-time test. */
2840 if (npeel < 0)
2842 peel_iters_prologue = vf/2;
2843 dump_printf (MSG_NOTE, "cost model: "
2844 "prologue peel iters set to vf/2.\n");
2846 /* If peeling for alignment is unknown, the loop bound of the main loop
2847 becomes unknown. */
2848 peel_iters_epilogue = vf/2;
2849 dump_printf (MSG_NOTE, "cost model: "
2850 "epilogue peel iters set to vf/2 because "
2851 "peeling for alignment is unknown.\n");
2853 /* If peeled iterations are unknown, count a taken branch and a not taken
2854 branch per peeled loop. Even if scalar loop iterations are known,
2855 vector iterations are not known since peeled prologue iterations are
2856 not known. Hence guards remain the same. */
2857 (void) add_stmt_cost (target_cost_data, 1, cond_branch_taken,
2858 NULL, 0, vect_prologue);
2859 (void) add_stmt_cost (target_cost_data, 1, cond_branch_not_taken,
2860 NULL, 0, vect_prologue);
2861 (void) add_stmt_cost (target_cost_data, 1, cond_branch_taken,
2862 NULL, 0, vect_epilogue);
2863 (void) add_stmt_cost (target_cost_data, 1, cond_branch_not_taken,
2864 NULL, 0, vect_epilogue);
2865 stmt_info_for_cost *si;
2866 int j;
2867 FOR_EACH_VEC_ELT (LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo), j, si)
2869 struct _stmt_vec_info *stmt_info
2870 = si->stmt ? vinfo_for_stmt (si->stmt) : NULL;
2871 (void) add_stmt_cost (target_cost_data,
2872 si->count * peel_iters_prologue,
2873 si->kind, stmt_info, si->misalign,
2874 vect_prologue);
2875 (void) add_stmt_cost (target_cost_data,
2876 si->count * peel_iters_epilogue,
2877 si->kind, stmt_info, si->misalign,
2878 vect_epilogue);
2881 else
2883 stmt_vector_for_cost prologue_cost_vec, epilogue_cost_vec;
2884 stmt_info_for_cost *si;
2885 int j;
2886 void *data = LOOP_VINFO_TARGET_COST_DATA (loop_vinfo);
2888 prologue_cost_vec.create (2);
2889 epilogue_cost_vec.create (2);
2890 peel_iters_prologue = npeel;
2892 (void) vect_get_known_peeling_cost (loop_vinfo, peel_iters_prologue,
2893 &peel_iters_epilogue,
2894 &LOOP_VINFO_SCALAR_ITERATION_COST
2895 (loop_vinfo),
2896 &prologue_cost_vec,
2897 &epilogue_cost_vec);
2899 FOR_EACH_VEC_ELT (prologue_cost_vec, j, si)
2901 struct _stmt_vec_info *stmt_info
2902 = si->stmt ? vinfo_for_stmt (si->stmt) : NULL;
2903 (void) add_stmt_cost (data, si->count, si->kind, stmt_info,
2904 si->misalign, vect_prologue);
2907 FOR_EACH_VEC_ELT (epilogue_cost_vec, j, si)
2909 struct _stmt_vec_info *stmt_info
2910 = si->stmt ? vinfo_for_stmt (si->stmt) : NULL;
2911 (void) add_stmt_cost (data, si->count, si->kind, stmt_info,
2912 si->misalign, vect_epilogue);
2915 prologue_cost_vec.release ();
2916 epilogue_cost_vec.release ();
2919 /* FORNOW: The scalar outside cost is incremented in one of the
2920 following ways:
2922 1. The vectorizer checks for alignment and aliasing and generates
2923 a condition that allows dynamic vectorization. A cost model
2924 check is ANDED with the versioning condition. Hence scalar code
2925 path now has the added cost of the versioning check.
2927 if (cost > th & versioning_check)
2928 jmp to vector code
2930 Hence run-time scalar is incremented by not-taken branch cost.
2932 2. The vectorizer then checks if a prologue is required. If the
2933 cost model check was not done before during versioning, it has to
2934 be done before the prologue check.
2936 if (cost <= th)
2937 prologue = scalar_iters
2938 if (prologue == 0)
2939 jmp to vector code
2940 else
2941 execute prologue
2942 if (prologue == num_iters)
2943 go to exit
2945 Hence the run-time scalar cost is incremented by a taken branch,
2946 plus a not-taken branch, plus a taken branch cost.
2948 3. The vectorizer then checks if an epilogue is required. If the
2949 cost model check was not done before during prologue check, it
2950 has to be done with the epilogue check.
2952 if (prologue == 0)
2953 jmp to vector code
2954 else
2955 execute prologue
2956 if (prologue == num_iters)
2957 go to exit
2958 vector code:
2959 if ((cost <= th) | (scalar_iters-prologue-epilogue == 0))
2960 jmp to epilogue
2962 Hence the run-time scalar cost should be incremented by 2 taken
2963 branches.
2965 TODO: The back end may reorder the BBs differently and reverse
2966 conditions/branch directions. Change the estimates below to
2967 something more reasonable. */
2969 /* If the number of iterations is known and we do not do versioning, we can
2970 decide whether to vectorize at compile time. Hence the scalar version
2971 does not carry cost model guard costs. */
2972 if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
2973 || LOOP_REQUIRES_VERSIONING_FOR_ALIGNMENT (loop_vinfo)
2974 || LOOP_REQUIRES_VERSIONING_FOR_ALIAS (loop_vinfo))
2976 /* Cost model check occurs at versioning. */
2977 if (LOOP_REQUIRES_VERSIONING_FOR_ALIGNMENT (loop_vinfo)
2978 || LOOP_REQUIRES_VERSIONING_FOR_ALIAS (loop_vinfo))
2979 scalar_outside_cost += vect_get_stmt_cost (cond_branch_not_taken);
2980 else
2982 /* Cost model check occurs at prologue generation. */
2983 if (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) < 0)
2984 scalar_outside_cost += 2 * vect_get_stmt_cost (cond_branch_taken)
2985 + vect_get_stmt_cost (cond_branch_not_taken);
2986 /* Cost model check occurs at epilogue generation. */
2987 else
2988 scalar_outside_cost += 2 * vect_get_stmt_cost (cond_branch_taken);
2992 /* Complete the target-specific cost calculations. */
2993 finish_cost (LOOP_VINFO_TARGET_COST_DATA (loop_vinfo), &vec_prologue_cost,
2994 &vec_inside_cost, &vec_epilogue_cost);
2996 vec_outside_cost = (int)(vec_prologue_cost + vec_epilogue_cost);
2998 if (dump_enabled_p ())
3000 dump_printf_loc (MSG_NOTE, vect_location, "Cost model analysis: \n");
3001 dump_printf (MSG_NOTE, " Vector inside of loop cost: %d\n",
3002 vec_inside_cost);
3003 dump_printf (MSG_NOTE, " Vector prologue cost: %d\n",
3004 vec_prologue_cost);
3005 dump_printf (MSG_NOTE, " Vector epilogue cost: %d\n",
3006 vec_epilogue_cost);
3007 dump_printf (MSG_NOTE, " Scalar iteration cost: %d\n",
3008 scalar_single_iter_cost);
3009 dump_printf (MSG_NOTE, " Scalar outside cost: %d\n",
3010 scalar_outside_cost);
3011 dump_printf (MSG_NOTE, " Vector outside cost: %d\n",
3012 vec_outside_cost);
3013 dump_printf (MSG_NOTE, " prologue iterations: %d\n",
3014 peel_iters_prologue);
3015 dump_printf (MSG_NOTE, " epilogue iterations: %d\n",
3016 peel_iters_epilogue);
3019 /* Calculate number of iterations required to make the vector version
3020 profitable, relative to the loop bodies only. The following condition
3021 must hold true:
3022 SIC * niters + SOC > VIC * ((niters-PL_ITERS-EP_ITERS)/VF) + VOC
3023 where
3024 SIC = scalar iteration cost, VIC = vector iteration cost,
3025 VOC = vector outside cost, VF = vectorization factor,
3026 PL_ITERS = prologue iterations, EP_ITERS= epilogue iterations
3027 SOC = scalar outside cost for run time cost model check. */
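/* A worked example with made-up costs: SIC = 4, VIC = 8, VF = 4, VOC = 20,
   SOC = 0 and no peeling give
     min_profitable_iters = (20 * 4) / (4 * 4 - 8) = 10,
   and since 4 * 4 * 10 == 8 * 10 + 20 * 4 the value is bumped to 11:
   at 10 iterations scalar and vector cost are both 40, from 11 onwards
   the vector version wins.  */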
3029 if ((scalar_single_iter_cost * vf) > (int) vec_inside_cost)
3031 if (vec_outside_cost <= 0)
3032 min_profitable_iters = 1;
3033 else
3035 min_profitable_iters = ((vec_outside_cost - scalar_outside_cost) * vf
3036 - vec_inside_cost * peel_iters_prologue
3037 - vec_inside_cost * peel_iters_epilogue)
3038 / ((scalar_single_iter_cost * vf)
3039 - vec_inside_cost);
3041 if ((scalar_single_iter_cost * vf * min_profitable_iters)
3042 <= (((int) vec_inside_cost * min_profitable_iters)
3043 + (((int) vec_outside_cost - scalar_outside_cost) * vf)))
3044 min_profitable_iters++;
3047 /* vector version will never be profitable. */
3048 else
3050 if (LOOP_VINFO_LOOP (loop_vinfo)->force_vectorize)
3051 warning_at (vect_location, OPT_Wopenmp_simd, "vectorization "
3052 "did not happen for a simd loop");
3054 if (dump_enabled_p ())
3055 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3056 "cost model: the vector iteration cost = %d "
3057 "divided by the scalar iteration cost = %d "
3058 "is greater or equal to the vectorization factor = %d"
3059 ".\n",
3060 vec_inside_cost, scalar_single_iter_cost, vf);
3061 *ret_min_profitable_niters = -1;
3062 *ret_min_profitable_estimate = -1;
3063 return;
3066 dump_printf (MSG_NOTE,
3067 " Calculated minimum iters for profitability: %d\n",
3068 min_profitable_iters);
3070 min_profitable_iters =
3071 min_profitable_iters < vf ? vf : min_profitable_iters;
3073 /* Because the condition we create is:
3074 if (niters <= min_profitable_iters)
3075 then skip the vectorized loop. */
3076 min_profitable_iters--;
3078 if (dump_enabled_p ())
3079 dump_printf_loc (MSG_NOTE, vect_location,
3080 " Runtime profitability threshold = %d\n",
3081 min_profitable_iters);
3083 *ret_min_profitable_niters = min_profitable_iters;
3085 /* Calculate number of iterations required to make the vector version
3086 profitable, relative to the loop bodies only.
3088 Non-vectorized variant is SIC * niters and it must win over vector
3089 variant on the expected loop trip count. The following condition must hold true:
3090 SIC * niters > VIC * ((niters-PL_ITERS-EP_ITERS)/VF) + VOC + SOC */
3092 if (vec_outside_cost <= 0)
3093 min_profitable_estimate = 1;
3094 else
3096 min_profitable_estimate = ((vec_outside_cost + scalar_outside_cost) * vf
3097 - vec_inside_cost * peel_iters_prologue
3098 - vec_inside_cost * peel_iters_epilogue)
3099 / ((scalar_single_iter_cost * vf)
3100 - vec_inside_cost);
3102 min_profitable_estimate --;
3103 min_profitable_estimate = MAX (min_profitable_estimate, min_profitable_iters);
3104 if (dump_enabled_p ())
3105 dump_printf_loc (MSG_NOTE, vect_location,
3106 " Static estimate profitability threshold = %d\n",
3107 min_profitable_estimate);
3109 *ret_min_profitable_estimate = min_profitable_estimate;
3112 /* Writes into SEL a mask for a vec_perm, equivalent to a vec_shr by OFFSET
3113 vector elements (not bits) for a vector of mode MODE. */
3114 static void
3115 calc_vec_perm_mask_for_shift (enum machine_mode mode, unsigned int offset,
3116 unsigned char *sel)
3118 unsigned int i, nelt = GET_MODE_NUNITS (mode);
3120 for (i = 0; i < nelt; i++)
3121 sel[i] = (i + offset) & (2*nelt - 1);
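/* For instance, for a 4-element mode and OFFSET = 2 this yields
   sel = { 2, 3, 4, 5 }: lanes 2 and 3 of the first input move to lanes
   0 and 1, and lanes 0 and 1 of the second input fill the tail, i.e. a
   whole-vector shift by two elements.  */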
3124 /* Checks whether the target supports whole-vector shifts for vectors of mode
3125 MODE. This is the case if _either_ the platform handles vec_shr_optab, _or_
3126 it supports vec_perm_const with masks for all necessary shift amounts. */
3127 static bool
3128 have_whole_vector_shift (enum machine_mode mode)
3130 if (optab_handler (vec_shr_optab, mode) != CODE_FOR_nothing)
3131 return true;
3133 if (direct_optab_handler (vec_perm_const_optab, mode) == CODE_FOR_nothing)
3134 return false;
3136 unsigned int i, nelt = GET_MODE_NUNITS (mode);
3137 unsigned char *sel = XALLOCAVEC (unsigned char, nelt);
3139 for (i = nelt/2; i >= 1; i/=2)
3141 calc_vec_perm_mask_for_shift (mode, i, sel);
3142 if (!can_vec_perm_p (mode, false, sel))
3143 return false;
3145 return true;
3148 /* Return the reduction operand (with index REDUC_INDEX) of STMT. */
3150 static tree
3151 get_reduction_op (gimple *stmt, int reduc_index)
3153 switch (get_gimple_rhs_class (gimple_assign_rhs_code (stmt)))
3155 case GIMPLE_SINGLE_RHS:
3156 gcc_assert (TREE_OPERAND_LENGTH (gimple_assign_rhs1 (stmt))
3157 == ternary_op);
3158 return TREE_OPERAND (gimple_assign_rhs1 (stmt), reduc_index);
3159 case GIMPLE_UNARY_RHS:
3160 return gimple_assign_rhs1 (stmt);
3161 case GIMPLE_BINARY_RHS:
3162 return (reduc_index
3163 ? gimple_assign_rhs2 (stmt) : gimple_assign_rhs1 (stmt));
3164 case GIMPLE_TERNARY_RHS:
3165 return gimple_op (stmt, reduc_index + 1);
3166 default:
3167 gcc_unreachable ();
3171 /* TODO: Close dependency between vect_model_*_cost and vectorizable_*
3172 functions. Design better to avoid maintenance issues. */
3174 /* Function vect_model_reduction_cost.
3176 Models cost for a reduction operation, including the vector ops
3177 generated within the strip-mine loop, the initial definition before
3178 the loop, and the epilogue code that must be generated. */
3180 static bool
3181 vect_model_reduction_cost (stmt_vec_info stmt_info, enum tree_code reduc_code,
3182 int ncopies, int reduc_index)
3184 int prologue_cost = 0, epilogue_cost = 0;
3185 enum tree_code code;
3186 optab optab;
3187 tree vectype;
3188 gimple *stmt, *orig_stmt;
3189 tree reduction_op;
3190 machine_mode mode;
3191 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
3192 struct loop *loop = NULL;
3193 void *target_cost_data;
3195 if (loop_vinfo)
3197 loop = LOOP_VINFO_LOOP (loop_vinfo);
3198 target_cost_data = LOOP_VINFO_TARGET_COST_DATA (loop_vinfo);
3200 else
3201 target_cost_data = BB_VINFO_TARGET_COST_DATA (STMT_VINFO_BB_VINFO (stmt_info));
3203 /* Cost of reduction op inside loop. */
3204 unsigned inside_cost = add_stmt_cost (target_cost_data, ncopies, vector_stmt,
3205 stmt_info, 0, vect_body);
3206 stmt = STMT_VINFO_STMT (stmt_info);
3208 reduction_op = get_reduction_op (stmt, reduc_index);
3210 vectype = get_vectype_for_scalar_type (TREE_TYPE (reduction_op));
3211 if (!vectype)
3213 if (dump_enabled_p ())
3215 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3216 "unsupported data-type ");
3217 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
3218 TREE_TYPE (reduction_op));
3219 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
3221 return false;
3224 mode = TYPE_MODE (vectype);
3225 orig_stmt = STMT_VINFO_RELATED_STMT (stmt_info);
3227 if (!orig_stmt)
3228 orig_stmt = STMT_VINFO_STMT (stmt_info);
3230 code = gimple_assign_rhs_code (orig_stmt);
3232 /* Add in cost for initial definition. */
3233 prologue_cost += add_stmt_cost (target_cost_data, 1, scalar_to_vec,
3234 stmt_info, 0, vect_prologue);
3236 /* Determine cost of epilogue code.
3238 We have a reduction operator that will reduce the vector in one statement.
3239 Also requires scalar extract. */
3241 if (!loop || !nested_in_vect_loop_p (loop, orig_stmt))
3243 if (reduc_code != ERROR_MARK)
3245 epilogue_cost += add_stmt_cost (target_cost_data, 1, vector_stmt,
3246 stmt_info, 0, vect_epilogue);
3247 epilogue_cost += add_stmt_cost (target_cost_data, 1, vec_to_scalar,
3248 stmt_info, 0, vect_epilogue);
3250 else
3252 int vec_size_in_bits = tree_to_uhwi (TYPE_SIZE (vectype));
3253 tree bitsize =
3254 TYPE_SIZE (TREE_TYPE (gimple_assign_lhs (orig_stmt)));
3255 int element_bitsize = tree_to_uhwi (bitsize);
3256 int nelements = vec_size_in_bits / element_bitsize;
3258 optab = optab_for_tree_code (code, vectype, optab_default);
3260 /* We have a whole vector shift available. */
3261 if (VECTOR_MODE_P (mode)
3262 && optab_handler (optab, mode) != CODE_FOR_nothing
3263 && have_whole_vector_shift (mode))
3265 /* Final reduction via vector shifts and the reduction operator.
3266 Also requires scalar extract. */
3267 epilogue_cost += add_stmt_cost (target_cost_data,
3268 exact_log2 (nelements) * 2,
3269 vector_stmt, stmt_info, 0,
3270 vect_epilogue);
3271 epilogue_cost += add_stmt_cost (target_cost_data, 1,
3272 vec_to_scalar, stmt_info, 0,
3273 vect_epilogue);
3275 else
3276 /* Use extracts and reduction op for final reduction. For N
3277 elements, we have N extracts and N-1 reduction ops. */
3278 epilogue_cost += add_stmt_cost (target_cost_data,
3279 nelements + nelements - 1,
3280 vector_stmt, stmt_info, 0,
3281 vect_epilogue);
3285 if (dump_enabled_p ())
3286 dump_printf (MSG_NOTE,
3287 "vect_model_reduction_cost: inside_cost = %d, "
3288 "prologue_cost = %d, epilogue_cost = %d .\n", inside_cost,
3289 prologue_cost, epilogue_cost);
3291 return true;
3295 /* Function vect_model_induction_cost.
3297 Models cost for induction operations. */
3299 static void
3300 vect_model_induction_cost (stmt_vec_info stmt_info, int ncopies)
3302 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
3303 void *target_cost_data = LOOP_VINFO_TARGET_COST_DATA (loop_vinfo);
3304 unsigned inside_cost, prologue_cost;
3306 /* loop cost for vec_loop. */
3307 inside_cost = add_stmt_cost (target_cost_data, ncopies, vector_stmt,
3308 stmt_info, 0, vect_body);
3310 /* prologue cost for vec_init and vec_step. */
3311 prologue_cost = add_stmt_cost (target_cost_data, 2, scalar_to_vec,
3312 stmt_info, 0, vect_prologue);
3314 if (dump_enabled_p ())
3315 dump_printf_loc (MSG_NOTE, vect_location,
3316 "vect_model_induction_cost: inside_cost = %d, "
3317 "prologue_cost = %d .\n", inside_cost, prologue_cost);
3321 /* Function get_initial_def_for_induction
3323 Input:
3324 STMT - a stmt that performs an induction operation in the loop.
3325 IV_PHI - the initial value of the induction variable
3327 Output:
3328 Return a vector variable, initialized with the first VF values of
3329 the induction variable. E.g., for an iv with IV_PHI='X' and
3330 evolution S, for a vector of 4 units, we want to return:
3331 [X, X + S, X + 2*S, X + 3*S]. */
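/* For example, with a made-up induction k starting at X with step S = 3
   and a 4-unit vector, the initial vector is { X, X+3, X+6, X+9 } and the
   step vector built further below is { 12, 12, 12, 12 } (VF * S), which
   advances the whole vector by one vector iteration.  */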
3333 static tree
3334 get_initial_def_for_induction (gimple *iv_phi)
3336 stmt_vec_info stmt_vinfo = vinfo_for_stmt (iv_phi);
3337 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_vinfo);
3338 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
3339 tree vectype;
3340 int nunits;
3341 edge pe = loop_preheader_edge (loop);
3342 struct loop *iv_loop;
3343 basic_block new_bb;
3344 tree new_vec, vec_init, vec_step, t;
3345 tree new_name;
3346 gimple *new_stmt;
3347 gphi *induction_phi;
3348 tree induc_def, vec_def, vec_dest;
3349 tree init_expr, step_expr;
3350 int vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
3351 int i;
3352 int ncopies;
3353 tree expr;
3354 stmt_vec_info phi_info = vinfo_for_stmt (iv_phi);
3355 bool nested_in_vect_loop = false;
3356 gimple_seq stmts;
3357 imm_use_iterator imm_iter;
3358 use_operand_p use_p;
3359 gimple *exit_phi;
3360 edge latch_e;
3361 tree loop_arg;
3362 gimple_stmt_iterator si;
3363 basic_block bb = gimple_bb (iv_phi);
3364 tree stepvectype;
3365 tree resvectype;
3367 /* Is phi in an inner-loop, while vectorizing an enclosing outer-loop? */
3368 if (nested_in_vect_loop_p (loop, iv_phi))
3370 nested_in_vect_loop = true;
3371 iv_loop = loop->inner;
3373 else
3374 iv_loop = loop;
3375 gcc_assert (iv_loop == (gimple_bb (iv_phi))->loop_father);
3377 latch_e = loop_latch_edge (iv_loop);
3378 loop_arg = PHI_ARG_DEF_FROM_EDGE (iv_phi, latch_e);
3380 step_expr = STMT_VINFO_LOOP_PHI_EVOLUTION_PART (phi_info);
3381 gcc_assert (step_expr != NULL_TREE);
3383 pe = loop_preheader_edge (iv_loop);
3384 init_expr = PHI_ARG_DEF_FROM_EDGE (iv_phi,
3385 loop_preheader_edge (iv_loop));
3387 vectype = get_vectype_for_scalar_type (TREE_TYPE (init_expr));
3388 resvectype = get_vectype_for_scalar_type (TREE_TYPE (PHI_RESULT (iv_phi)));
3389 gcc_assert (vectype);
3390 nunits = TYPE_VECTOR_SUBPARTS (vectype);
3391 ncopies = vf / nunits;
3393 gcc_assert (phi_info);
3394 gcc_assert (ncopies >= 1);
3396 /* Convert the step to the desired type. */
3397 stmts = NULL;
3398 step_expr = gimple_convert (&stmts, TREE_TYPE (vectype), step_expr);
3399 if (stmts)
3401 new_bb = gsi_insert_seq_on_edge_immediate (pe, stmts);
3402 gcc_assert (!new_bb);
3405 /* Find the first insertion point in the BB. */
3406 si = gsi_after_labels (bb);
3408 /* Create the vector that holds the initial_value of the induction. */
3409 if (nested_in_vect_loop)
3411 /* iv_loop is nested in the loop to be vectorized. init_expr had already
3412 been created during vectorization of previous stmts. We obtain it
3413 from the STMT_VINFO_VEC_STMT of the defining stmt. */
3414 vec_init = vect_get_vec_def_for_operand (init_expr, iv_phi);
3415 /* If the initial value is not of proper type, convert it. */
3416 if (!useless_type_conversion_p (vectype, TREE_TYPE (vec_init)))
3418 new_stmt
3419 = gimple_build_assign (vect_get_new_ssa_name (vectype,
3420 vect_simple_var,
3421 "vec_iv_"),
3422 VIEW_CONVERT_EXPR,
3423 build1 (VIEW_CONVERT_EXPR, vectype,
3424 vec_init));
3425 vec_init = gimple_assign_lhs (new_stmt);
3426 new_bb = gsi_insert_on_edge_immediate (loop_preheader_edge (iv_loop),
3427 new_stmt);
3428 gcc_assert (!new_bb);
3429 set_vinfo_for_stmt (new_stmt,
3430 new_stmt_vec_info (new_stmt, loop_vinfo));
3433 else
3435 vec<constructor_elt, va_gc> *v;
3437 /* iv_loop is the loop to be vectorized. Create:
3438 vec_init = [X, X+S, X+2*S, X+3*S] (S = step_expr, X = init_expr) */
3439 stmts = NULL;
3440 new_name = gimple_convert (&stmts, TREE_TYPE (vectype), init_expr);
3442 vec_alloc (v, nunits);
3443 bool constant_p = is_gimple_min_invariant (new_name);
3444 CONSTRUCTOR_APPEND_ELT (v, NULL_TREE, new_name);
3445 for (i = 1; i < nunits; i++)
3447 /* Create: new_name_i = new_name + step_expr */
3448 new_name = gimple_build (&stmts, PLUS_EXPR, TREE_TYPE (new_name),
3449 new_name, step_expr);
3450 if (!is_gimple_min_invariant (new_name))
3451 constant_p = false;
3452 CONSTRUCTOR_APPEND_ELT (v, NULL_TREE, new_name);
3454 if (stmts)
3456 new_bb = gsi_insert_seq_on_edge_immediate (pe, stmts);
3457 gcc_assert (!new_bb);
3460 /* Create a vector from [new_name_0, new_name_1, ..., new_name_nunits-1] */
3461 if (constant_p)
3462 new_vec = build_vector_from_ctor (vectype, v);
3463 else
3464 new_vec = build_constructor (vectype, v);
3465 vec_init = vect_init_vector (iv_phi, new_vec, vectype, NULL);
3469 /* Create the vector that holds the step of the induction. */
3470 if (nested_in_vect_loop)
3471 /* iv_loop is nested in the loop to be vectorized. Generate:
3472 vec_step = [S, S, S, S] */
3473 new_name = step_expr;
3474 else
3476 /* iv_loop is the loop to be vectorized. Generate:
3477 vec_step = [VF*S, VF*S, VF*S, VF*S] */
3478 if (SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr)))
3480 expr = build_int_cst (integer_type_node, vf);
3481 expr = fold_convert (TREE_TYPE (step_expr), expr);
3483 else
3484 expr = build_int_cst (TREE_TYPE (step_expr), vf);
3485 new_name = fold_build2 (MULT_EXPR, TREE_TYPE (step_expr),
3486 expr, step_expr);
3487 if (TREE_CODE (step_expr) == SSA_NAME)
3488 new_name = vect_init_vector (iv_phi, new_name,
3489 TREE_TYPE (step_expr), NULL);
3492 t = unshare_expr (new_name);
3493 gcc_assert (CONSTANT_CLASS_P (new_name)
3494 || TREE_CODE (new_name) == SSA_NAME);
3495 stepvectype = get_vectype_for_scalar_type (TREE_TYPE (new_name));
3496 gcc_assert (stepvectype);
3497 new_vec = build_vector_from_val (stepvectype, t);
3498 vec_step = vect_init_vector (iv_phi, new_vec, stepvectype, NULL);
3501 /* Create the following def-use cycle:
3502 loop prolog:
3503 vec_init = ...
3504 vec_step = ...
3505 loop:
3506 vec_iv = PHI <vec_init, vec_loop>
3508 STMT
3510 vec_loop = vec_iv + vec_step; */
3512 /* Create the induction-phi that defines the induction-operand. */
3513 vec_dest = vect_get_new_vect_var (vectype, vect_simple_var, "vec_iv_");
3514 induction_phi = create_phi_node (vec_dest, iv_loop->header);
3515 set_vinfo_for_stmt (induction_phi,
3516 new_stmt_vec_info (induction_phi, loop_vinfo));
3517 induc_def = PHI_RESULT (induction_phi);
3519 /* Create the iv update inside the loop */
3520 new_stmt = gimple_build_assign (vec_dest, PLUS_EXPR, induc_def, vec_step);
3521 vec_def = make_ssa_name (vec_dest, new_stmt);
3522 gimple_assign_set_lhs (new_stmt, vec_def);
3523 gsi_insert_before (&si, new_stmt, GSI_SAME_STMT);
3524 set_vinfo_for_stmt (new_stmt, new_stmt_vec_info (new_stmt, loop_vinfo));
3526 /* Set the arguments of the phi node: */
3527 add_phi_arg (induction_phi, vec_init, pe, UNKNOWN_LOCATION);
3528 add_phi_arg (induction_phi, vec_def, loop_latch_edge (iv_loop),
3529 UNKNOWN_LOCATION);
3532 /* In case the vectorization factor (VF) is bigger than the number
3533 of elements that we can fit in a vectype (nunits), we have to generate
3534 more than one vector stmt, i.e. we need to "unroll" the
3535 vector stmt by a factor VF/nunits. For more details see documentation
3536 in vectorizable_operation. */
3538 if (ncopies > 1)
3540 stmt_vec_info prev_stmt_vinfo;
3541 /* FORNOW. This restriction should be relaxed. */
3542 gcc_assert (!nested_in_vect_loop);
3544 /* Create the vector that holds the step of the induction. */
3545 if (SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr)))
3547 expr = build_int_cst (integer_type_node, nunits);
3548 expr = fold_convert (TREE_TYPE (step_expr), expr);
3550 else
3551 expr = build_int_cst (TREE_TYPE (step_expr), nunits);
3552 new_name = fold_build2 (MULT_EXPR, TREE_TYPE (step_expr),
3553 expr, step_expr);
3554 if (TREE_CODE (step_expr) == SSA_NAME)
3555 new_name = vect_init_vector (iv_phi, new_name,
3556 TREE_TYPE (step_expr), NULL);
3557 t = unshare_expr (new_name);
3558 gcc_assert (CONSTANT_CLASS_P (new_name)
3559 || TREE_CODE (new_name) == SSA_NAME);
3560 new_vec = build_vector_from_val (stepvectype, t);
3561 vec_step = vect_init_vector (iv_phi, new_vec, stepvectype, NULL);
3563 vec_def = induc_def;
3564 prev_stmt_vinfo = vinfo_for_stmt (induction_phi);
3565 for (i = 1; i < ncopies; i++)
3567 /* vec_i = vec_prev + vec_step */
3568 new_stmt = gimple_build_assign (vec_dest, PLUS_EXPR,
3569 vec_def, vec_step);
3570 vec_def = make_ssa_name (vec_dest, new_stmt);
3571 gimple_assign_set_lhs (new_stmt, vec_def);
3573 gsi_insert_before (&si, new_stmt, GSI_SAME_STMT);
3574 if (!useless_type_conversion_p (resvectype, vectype))
3576 new_stmt
3577 = gimple_build_assign
3578 (vect_get_new_vect_var (resvectype, vect_simple_var,
3579 "vec_iv_"),
3580 VIEW_CONVERT_EXPR,
3581 build1 (VIEW_CONVERT_EXPR, resvectype,
3582 gimple_assign_lhs (new_stmt)));
3583 gimple_assign_set_lhs (new_stmt,
3584 make_ssa_name
3585 (gimple_assign_lhs (new_stmt), new_stmt));
3586 gsi_insert_before (&si, new_stmt, GSI_SAME_STMT);
3588 set_vinfo_for_stmt (new_stmt,
3589 new_stmt_vec_info (new_stmt, loop_vinfo));
3590 STMT_VINFO_RELATED_STMT (prev_stmt_vinfo) = new_stmt;
3591 prev_stmt_vinfo = vinfo_for_stmt (new_stmt);
3595 if (nested_in_vect_loop)
3597 /* Find the loop-closed exit-phi of the induction, and record
3598 the final vector of induction results: */
3599 exit_phi = NULL;
3600 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, loop_arg)
3602 gimple *use_stmt = USE_STMT (use_p);
3603 if (is_gimple_debug (use_stmt))
3604 continue;
3606 if (!flow_bb_inside_loop_p (iv_loop, gimple_bb (use_stmt)))
3608 exit_phi = use_stmt;
3609 break;
3612 if (exit_phi)
3614 stmt_vec_info stmt_vinfo = vinfo_for_stmt (exit_phi);
3615 /* FORNOW. Currently not supporting the case that an inner-loop induction
3616 is not used in the outer-loop (i.e. only outside the outer-loop). */
3617 gcc_assert (STMT_VINFO_RELEVANT_P (stmt_vinfo)
3618 && !STMT_VINFO_LIVE_P (stmt_vinfo));
3620 STMT_VINFO_VEC_STMT (stmt_vinfo) = new_stmt;
3621 if (dump_enabled_p ())
3623 dump_printf_loc (MSG_NOTE, vect_location,
3624 "vector of inductions after inner-loop:");
3625 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, new_stmt, 0);
3626 dump_printf (MSG_NOTE, "\n");
3632 if (dump_enabled_p ())
3634 dump_printf_loc (MSG_NOTE, vect_location,
3635 "transform induction: created def-use cycle: ");
3636 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, induction_phi, 0);
3637 dump_printf (MSG_NOTE, "\n");
3638 dump_gimple_stmt (MSG_NOTE, TDF_SLIM,
3639 SSA_NAME_DEF_STMT (vec_def), 0);
3640 dump_printf (MSG_NOTE, "\n");
3643 STMT_VINFO_VEC_STMT (phi_info) = induction_phi;
3644 if (!useless_type_conversion_p (resvectype, vectype))
3646 new_stmt = gimple_build_assign (vect_get_new_vect_var (resvectype,
3647 vect_simple_var,
3648 "vec_iv_"),
3649 VIEW_CONVERT_EXPR,
3650 build1 (VIEW_CONVERT_EXPR, resvectype,
3651 induc_def));
3652 induc_def = make_ssa_name (gimple_assign_lhs (new_stmt), new_stmt);
3653 gimple_assign_set_lhs (new_stmt, induc_def);
3654 si = gsi_after_labels (bb);
3655 gsi_insert_before (&si, new_stmt, GSI_SAME_STMT);
3656 set_vinfo_for_stmt (new_stmt,
3657 new_stmt_vec_info (new_stmt, loop_vinfo));
3658 STMT_VINFO_RELATED_STMT (vinfo_for_stmt (new_stmt))
3659 = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (induction_phi));
3662 return induc_def;
3666 /* Function get_initial_def_for_reduction
3668 Input:
3669 STMT - a stmt that performs a reduction operation in the loop.
3670 INIT_VAL - the initial value of the reduction variable
3672 Output:
3673 ADJUSTMENT_DEF - a tree that holds a value to be added to the final result
3674 of the reduction (used for adjusting the epilog - see below).
3675 Return a vector variable, initialized according to the operation that STMT
3676 performs. This vector will be used as the initial value of the
3677 vector of partial results.
3679 Option1 (adjust in epilog): Initialize the vector as follows:
3680 add/bit or/xor: [0,0,...,0,0]
3681 mult/bit and: [1,1,...,1,1]
3682 min/max/cond_expr: [init_val,init_val,..,init_val,init_val]
3683 and when necessary (e.g. add/mult case) let the caller know
3684 that it needs to adjust the result by init_val.
3686 Option2: Initialize the vector as follows:
3687 add/bit or/xor: [init_val,0,0,...,0]
3688 mult/bit and: [init_val,1,1,...,1]
3689 min/max/cond_expr: [init_val,init_val,...,init_val]
3690 and no adjustments are needed.
3692 For example, for the following code:
3694 s = init_val;
3695 for (i=0;i<n;i++)
3696 s = s + a[i];
3698 STMT is 's = s + a[i]', and the reduction variable is 's'.
3699 For a vector of 4 units, we want to return either [0,0,0,init_val],
3700 or [0,0,0,0] and let the caller know that it needs to adjust
3701 the result at the end by 'init_val'.
3703 FORNOW, we use the 'adjust in epilog' scheme (Option1) when ADJUSTMENT_DEF
3704 is not NULL, because this way the initialization vector is simpler (same
3705 element in all entries); otherwise we use Option2.
3707 A cost model should help decide between these two schemes. */
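/* A worked example with hypothetical numbers (illustration only): for the
   sum reduction above with init_val = 10, a = {1,2,3,4} and a 4-element
   vector type, both schemes yield the same scalar result:

     Option1:  vinit = {0,0,0,0}
               the partial sums reduce to 1+2+3+4 = 10, and the caller
               adjusts by init_val:  10 + 10 = 20.

     Option2:  vinit = {10,0,0,0}
               the partial sums reduce directly to 20; no adjustment.  */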
3709 tree
3710 get_initial_def_for_reduction (gimple *stmt, tree init_val,
3711 tree *adjustment_def)
3713 stmt_vec_info stmt_vinfo = vinfo_for_stmt (stmt);
3714 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_vinfo);
3715 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
3716 tree scalar_type = TREE_TYPE (init_val);
3717 tree vectype = get_vectype_for_scalar_type (scalar_type);
3718 int nunits;
3719 enum tree_code code = gimple_assign_rhs_code (stmt);
3720 tree def_for_init;
3721 tree init_def;
3722 tree *elts;
3723 int i;
3724 bool nested_in_vect_loop = false;
3725 tree init_value;
3726 REAL_VALUE_TYPE real_init_val = dconst0;
3727 int int_init_val = 0;
3728 gimple *def_stmt = NULL;
3730 gcc_assert (vectype);
3731 nunits = TYPE_VECTOR_SUBPARTS (vectype);
3733 gcc_assert (POINTER_TYPE_P (scalar_type) || INTEGRAL_TYPE_P (scalar_type)
3734 || SCALAR_FLOAT_TYPE_P (scalar_type));
3736 if (nested_in_vect_loop_p (loop, stmt))
3737 nested_in_vect_loop = true;
3738 else
3739 gcc_assert (loop == (gimple_bb (stmt))->loop_father);
3741 /* In case of double reduction we only create a vector variable to be put
3742 in the reduction phi node. The actual statement creation is done in
3743 vect_create_epilog_for_reduction. */
3744 if (adjustment_def && nested_in_vect_loop
3745 && TREE_CODE (init_val) == SSA_NAME
3746 && (def_stmt = SSA_NAME_DEF_STMT (init_val))
3747 && gimple_code (def_stmt) == GIMPLE_PHI
3748 && flow_bb_inside_loop_p (loop, gimple_bb (def_stmt))
3749 && vinfo_for_stmt (def_stmt)
3750 && STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def_stmt))
3751 == vect_double_reduction_def)
3753 *adjustment_def = NULL;
3754 return vect_create_destination_var (init_val, vectype);
3757 if (TREE_CONSTANT (init_val))
3759 if (SCALAR_FLOAT_TYPE_P (scalar_type))
3760 init_value = build_real (scalar_type, TREE_REAL_CST (init_val));
3761 else
3762 init_value = build_int_cst (scalar_type, TREE_INT_CST_LOW (init_val));
3764 else
3765 init_value = init_val;
3767 switch (code)
3769 case WIDEN_SUM_EXPR:
3770 case DOT_PROD_EXPR:
3771 case SAD_EXPR:
3772 case PLUS_EXPR:
3773 case MINUS_EXPR:
3774 case BIT_IOR_EXPR:
3775 case BIT_XOR_EXPR:
3776 case MULT_EXPR:
3777 case BIT_AND_EXPR:
3778 /* ADJUSTMENT_DEF is NULL when called from
3779 vect_create_epilog_for_reduction to vectorize double reduction. */
3780 if (adjustment_def)
3782 if (nested_in_vect_loop)
3783 *adjustment_def = vect_get_vec_def_for_operand (init_val, stmt);
3784 else
3785 *adjustment_def = init_val;
3788 if (code == MULT_EXPR)
3790 real_init_val = dconst1;
3791 int_init_val = 1;
3794 if (code == BIT_AND_EXPR)
3795 int_init_val = -1;
3797 if (SCALAR_FLOAT_TYPE_P (scalar_type))
3798 def_for_init = build_real (scalar_type, real_init_val);
3799 else
3800 def_for_init = build_int_cst (scalar_type, int_init_val);
3802 /* Create a vector of '0' or '1' in all elements except the first. */
3803 elts = XALLOCAVEC (tree, nunits);
3804 for (i = nunits - 2; i >= 0; --i)
3805 elts[i + 1] = def_for_init;
3807 /* Option1: the first element is '0' or '1' as well. */
3808 if (adjustment_def)
3810 elts[0] = def_for_init;
3811 init_def = build_vector (vectype, elts);
3812 break;
3815 /* Option2: the first element is INIT_VAL. */
3816 elts[0] = init_val;
3817 if (TREE_CONSTANT (init_val))
3818 init_def = build_vector (vectype, elts);
3819 else
3821 vec<constructor_elt, va_gc> *v;
3822 vec_alloc (v, nunits);
3823 CONSTRUCTOR_APPEND_ELT (v, NULL_TREE, init_val);
3824 for (i = 1; i < nunits; ++i)
3825 CONSTRUCTOR_APPEND_ELT (v, NULL_TREE, elts[i]);
3826 init_def = build_constructor (vectype, v);
3829 break;
3831 case MIN_EXPR:
3832 case MAX_EXPR:
3833 case COND_EXPR:
3834 if (adjustment_def)
3836 *adjustment_def = NULL_TREE;
3837 init_def = vect_get_vec_def_for_operand (init_val, stmt);
3838 break;
3841 init_def = build_vector_from_val (vectype, init_value);
3842 break;
3844 default:
3845 gcc_unreachable ();
3848 return init_def;
3851 /* Function vect_create_epilog_for_reduction
3853 Create code at the loop-epilog to finalize the result of a reduction
3854 computation.
3856 VECT_DEFS is list of vector of partial results, i.e., the lhs's of vector
3857 reduction statements.
3858 STMT is the scalar reduction stmt that is being vectorized.
3859 NCOPIES is > 1 in case the vectorization factor (VF) is bigger than the
3860 number of elements that we can fit in a vectype (nunits). In this case
3861 we have to generate more than one vector stmt - i.e - we need to "unroll"
3862 the vector stmt by a factor VF/nunits. For more details see documentation
3863 in vectorizable_operation.
3864 REDUC_CODE is the tree-code for the epilog reduction.
3865 REDUCTION_PHIS is a list of the phi-nodes that carry the reduction
3866 computation.
3867 REDUC_INDEX is the index of the operand in the right hand side of the
3868 statement that is defined by REDUCTION_PHI.
3869 DOUBLE_REDUC is TRUE if double reduction phi nodes should be handled.
3870 SLP_NODE is an SLP node containing a group of reduction statements. The
3871 first one in this group is STMT.
3873 This function:
3874 1. Creates the reduction def-use cycles: sets the arguments for
3875 REDUCTION_PHIS:
3876 The loop-entry argument is the vectorized initial-value of the reduction.
3877 The loop-latch argument is taken from VECT_DEFS - the vector of partial
3878 sums.
3879 2. "Reduces" each vector of partial results VECT_DEFS into a single result,
3880 by applying the operation specified by REDUC_CODE if available, or by
3881 other means (whole-vector shifts or a scalar loop).
3882 The function also creates a new phi node at the loop exit to preserve
3883 loop-closed form, as illustrated below.
3885 The flow at the entry to this function:
3887 loop:
3888 vec_def = phi <null, null> # REDUCTION_PHI
3889 VECT_DEF = vector_stmt # vectorized form of STMT
3890 s_loop = scalar_stmt # (scalar) STMT
3891 loop_exit:
3892 s_out0 = phi <s_loop> # (scalar) EXIT_PHI
3893 use <s_out0>
3894 use <s_out0>
3896 The above is transformed by this function into:
3898 loop:
3899 vec_def = phi <vec_init, VECT_DEF> # REDUCTION_PHI
3900 VECT_DEF = vector_stmt # vectorized form of STMT
3901 s_loop = scalar_stmt # (scalar) STMT
3902 loop_exit:
3903 s_out0 = phi <s_loop> # (scalar) EXIT_PHI
3904 v_out1 = phi <VECT_DEF> # NEW_EXIT_PHI
3905 v_out2 = reduce <v_out1>
3906 s_out3 = extract_field <v_out2, 0>
3907 s_out4 = adjust_result <s_out3>
3908 use <s_out4>
3909 use <s_out4>
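/* A scalar model of the three epilog steps for a PLUS reduction
   (illustration only; the 4-lane width and variable names are assumed):

     int v_out1[4];                    # vector of partial sums (NEW_EXIT_PHI)
     int s_out3 = 0;
     for (int k = 0; k < 4; k++)       # step 1: v_out2 = reduce <v_out1>
       s_out3 += v_out1[k];            # step 2: s_out3 = extract_field <v_out2, 0>
     int s_out4 = s_out3 + init_val;   # step 3: adjust_result, only needed for
                                       # the Option1 initial vector

   All uses of the original scalar result s_out0 are then rewritten to use
   s_out4.  */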
3912 static void
3913 vect_create_epilog_for_reduction (vec<tree> vect_defs, gimple *stmt,
3914 int ncopies, enum tree_code reduc_code,
3915 vec<gimple *> reduction_phis,
3916 int reduc_index, bool double_reduc,
3917 slp_tree slp_node)
3919 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
3920 stmt_vec_info prev_phi_info;
3921 tree vectype;
3922 machine_mode mode;
3923 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
3924 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo), *outer_loop = NULL;
3925 basic_block exit_bb;
3926 tree scalar_dest;
3927 tree scalar_type;
3928 gimple *new_phi = NULL, *phi;
3929 gimple_stmt_iterator exit_gsi;
3930 tree vec_dest;
3931 tree new_temp = NULL_TREE, new_dest, new_name, new_scalar_dest;
3932 gimple *epilog_stmt = NULL;
3933 enum tree_code code = gimple_assign_rhs_code (stmt);
3934 gimple *exit_phi;
3935 tree bitsize;
3936 tree adjustment_def = NULL;
3937 tree vec_initial_def = NULL;
3938 tree reduction_op, expr, def;
3939 tree orig_name, scalar_result;
3940 imm_use_iterator imm_iter, phi_imm_iter;
3941 use_operand_p use_p, phi_use_p;
3942 gimple *use_stmt, *orig_stmt, *reduction_phi = NULL;
3943 bool nested_in_vect_loop = false;
3944 auto_vec<gimple *> new_phis;
3945 auto_vec<gimple *> inner_phis;
3946 enum vect_def_type dt = vect_unknown_def_type;
3947 int j, i;
3948 auto_vec<tree> scalar_results;
3949 unsigned int group_size = 1, k, ratio;
3950 auto_vec<tree> vec_initial_defs;
3951 auto_vec<gimple *> phis;
3952 bool slp_reduc = false;
3953 tree new_phi_result;
3954 gimple *inner_phi = NULL;
3956 if (slp_node)
3957 group_size = SLP_TREE_SCALAR_STMTS (slp_node).length ();
3959 if (nested_in_vect_loop_p (loop, stmt))
3961 outer_loop = loop;
3962 loop = loop->inner;
3963 nested_in_vect_loop = true;
3964 gcc_assert (!slp_node);
3967 reduction_op = get_reduction_op (stmt, reduc_index);
3969 vectype = get_vectype_for_scalar_type (TREE_TYPE (reduction_op));
3970 gcc_assert (vectype);
3971 mode = TYPE_MODE (vectype);
3973 /* 1. Create the reduction def-use cycle:
3974 Set the arguments of REDUCTION_PHIS, i.e., transform
3976 loop:
3977 vec_def = phi <null, null> # REDUCTION_PHI
3978 VECT_DEF = vector_stmt # vectorized form of STMT
3981 into:
3983 loop:
3984 vec_def = phi <vec_init, VECT_DEF> # REDUCTION_PHI
3985 VECT_DEF = vector_stmt # vectorized form of STMT
3988 (in case of SLP, do it for all the phis). */
3990 /* Get the loop-entry arguments. */
3991 if (slp_node)
3992 vect_get_vec_defs (reduction_op, NULL_TREE, stmt, &vec_initial_defs,
3993 NULL, slp_node, reduc_index);
3994 else
3996 /* Get at the scalar def before the loop, that defines the initial value
3997 of the reduction variable. */
3998 gimple *def_stmt = SSA_NAME_DEF_STMT (reduction_op);
3999 tree op = PHI_ARG_DEF_FROM_EDGE (def_stmt, loop_preheader_edge (loop));
4000 vec_initial_defs.create (1);
4001 vec_initial_def = get_initial_def_for_reduction (stmt, op,
4002 &adjustment_def);
4003 vec_initial_defs.quick_push (vec_initial_def);
4006 /* Set phi nodes arguments. */
4007 FOR_EACH_VEC_ELT (reduction_phis, i, phi)
4009 tree vec_init_def, def;
4010 gimple_seq stmts;
4011 vec_init_def = force_gimple_operand (vec_initial_defs[i], &stmts,
4012 true, NULL_TREE);
4013 gsi_insert_seq_on_edge_immediate (loop_preheader_edge (loop), stmts);
4014 def = vect_defs[i];
4015 for (j = 0; j < ncopies; j++)
4017 /* Set the loop-entry arg of the reduction-phi. */
4018 add_phi_arg (as_a <gphi *> (phi), vec_init_def,
4019 loop_preheader_edge (loop), UNKNOWN_LOCATION);
4021 /* Set the loop-latch arg for the reduction-phi. */
4022 if (j > 0)
4023 def = vect_get_vec_def_for_stmt_copy (vect_unknown_def_type, def);
4025 add_phi_arg (as_a <gphi *> (phi), def, loop_latch_edge (loop),
4026 UNKNOWN_LOCATION);
4028 if (dump_enabled_p ())
4030 dump_printf_loc (MSG_NOTE, vect_location,
4031 "transform reduction: created def-use cycle: ");
4032 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, phi, 0);
4033 dump_printf (MSG_NOTE, "\n");
4034 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, SSA_NAME_DEF_STMT (def), 0);
4035 dump_printf (MSG_NOTE, "\n");
4038 phi = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (phi));
4042 /* 2. Create epilog code.
4043 The reduction epilog code operates across the elements of the vector
4044 of partial results computed by the vectorized loop.
4045 The reduction epilog code consists of:
4047 step 1: compute the scalar result in a vector (v_out2)
4048 step 2: extract the scalar result (s_out3) from the vector (v_out2)
4049 step 3: adjust the scalar result (s_out3) if needed.
4051 Step 1 can be accomplished using one of the following three schemes:
4052 (scheme 1) using reduc_code, if available.
4053 (scheme 2) using whole-vector shifts, if available.
4054 (scheme 3) using a scalar loop. In this case steps 1+2 above are
4055 combined.
4057 The overall epilog code looks like this:
4059 s_out0 = phi <s_loop> # original EXIT_PHI
4060 v_out1 = phi <VECT_DEF> # NEW_EXIT_PHI
4061 v_out2 = reduce <v_out1> # step 1
4062 s_out3 = extract_field <v_out2, 0> # step 2
4063 s_out4 = adjust_result <s_out3> # step 3
4065 (step 3 is optional, and steps 1 and 2 may be combined).
4066 Lastly, the uses of s_out0 are replaced by s_out4. */
4069 /* 2.1 Create new loop-exit-phis to preserve loop-closed form:
4070 v_out1 = phi <VECT_DEF>
4071 Store them in NEW_PHIS. */
4073 exit_bb = single_exit (loop)->dest;
4074 prev_phi_info = NULL;
4075 new_phis.create (vect_defs.length ());
4076 FOR_EACH_VEC_ELT (vect_defs, i, def)
4078 for (j = 0; j < ncopies; j++)
4080 tree new_def = copy_ssa_name (def);
4081 phi = create_phi_node (new_def, exit_bb);
4082 set_vinfo_for_stmt (phi, new_stmt_vec_info (phi, loop_vinfo));
4083 if (j == 0)
4084 new_phis.quick_push (phi);
4085 else
4087 def = vect_get_vec_def_for_stmt_copy (dt, def);
4088 STMT_VINFO_RELATED_STMT (prev_phi_info) = phi;
4091 SET_PHI_ARG_DEF (phi, single_exit (loop)->dest_idx, def);
4092 prev_phi_info = vinfo_for_stmt (phi);
4096 /* The epilogue is created for the outer-loop, i.e., for the loop being
4097 vectorized. Create exit phis for the outer loop. */
4098 if (double_reduc)
4100 loop = outer_loop;
4101 exit_bb = single_exit (loop)->dest;
4102 inner_phis.create (vect_defs.length ());
4103 FOR_EACH_VEC_ELT (new_phis, i, phi)
4105 tree new_result = copy_ssa_name (PHI_RESULT (phi));
4106 gphi *outer_phi = create_phi_node (new_result, exit_bb);
4107 SET_PHI_ARG_DEF (outer_phi, single_exit (loop)->dest_idx,
4108 PHI_RESULT (phi));
4109 set_vinfo_for_stmt (outer_phi, new_stmt_vec_info (outer_phi,
4110 loop_vinfo));
4111 inner_phis.quick_push (phi);
4112 new_phis[i] = outer_phi;
4113 prev_phi_info = vinfo_for_stmt (outer_phi);
4114 while (STMT_VINFO_RELATED_STMT (vinfo_for_stmt (phi)))
4116 phi = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (phi));
4117 new_result = copy_ssa_name (PHI_RESULT (phi));
4118 outer_phi = create_phi_node (new_result, exit_bb);
4119 SET_PHI_ARG_DEF (outer_phi, single_exit (loop)->dest_idx,
4120 PHI_RESULT (phi));
4121 set_vinfo_for_stmt (outer_phi, new_stmt_vec_info (outer_phi,
4122 loop_vinfo));
4123 STMT_VINFO_RELATED_STMT (prev_phi_info) = outer_phi;
4124 prev_phi_info = vinfo_for_stmt (outer_phi);
4129 exit_gsi = gsi_after_labels (exit_bb);
4131 /* 2.2 Get the relevant tree-code to use in the epilog for schemes 2,3
4132 (i.e. when reduc_code is not available) and in the final adjustment
4133 code (if needed). Also get the original scalar reduction variable as
4134 defined in the loop. In case STMT is a "pattern-stmt" (i.e. - it
4135 represents a reduction pattern), the tree-code and scalar-def are
4136 taken from the original stmt that the pattern-stmt (STMT) replaces.
4137 Otherwise (it is a regular reduction) - the tree-code and scalar-def
4138 are taken from STMT. */
4140 orig_stmt = STMT_VINFO_RELATED_STMT (stmt_info);
4141 if (!orig_stmt)
4143 /* Regular reduction */
4144 orig_stmt = stmt;
4146 else
4148 /* Reduction pattern */
4149 stmt_vec_info stmt_vinfo = vinfo_for_stmt (orig_stmt);
4150 gcc_assert (STMT_VINFO_IN_PATTERN_P (stmt_vinfo));
4151 gcc_assert (STMT_VINFO_RELATED_STMT (stmt_vinfo) == stmt);
4154 code = gimple_assign_rhs_code (orig_stmt);
4155 /* For MINUS_EXPR the initial vector is [init_val,0,...,0], therefore,
4156 partial results are added and not subtracted. */
4157 if (code == MINUS_EXPR)
4158 code = PLUS_EXPR;
4160 scalar_dest = gimple_assign_lhs (orig_stmt);
4161 scalar_type = TREE_TYPE (scalar_dest);
4162 scalar_results.create (group_size);
4163 new_scalar_dest = vect_create_destination_var (scalar_dest, NULL);
4164 bitsize = TYPE_SIZE (scalar_type);
4166 /* In case this is a reduction in an inner-loop while vectorizing an outer
4167 loop - we don't need to extract a single scalar result at the end of the
4168 inner-loop (unless it is a double reduction, i.e., the use of the reduction is
4169 outside the outer-loop). The final vector of partial results will be used
4170 in the vectorized outer-loop, or reduced to a scalar result at the end of
4171 the outer-loop. */
4172 if (nested_in_vect_loop && !double_reduc)
4173 goto vect_finalize_reduction;
4175 /* SLP reduction without reduction chain, e.g.,
4176 # a1 = phi <a2, a0>
4177 # b1 = phi <b2, b0>
4178 a2 = operation (a1)
4179 b2 = operation (b1) */
4180 slp_reduc = (slp_node && !GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)));
4182 /* In case of reduction chain, e.g.,
4183 # a1 = phi <a3, a0>
4184 a2 = operation (a1)
4185 a3 = operation (a2),
4187 we may end up with more than one vector result. Here we reduce them to
4188 one vector. */
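/* Source-level illustration (hypothetical code, not taken from this file):

     # SLP reduction: two independent accumulators in one loop.
     for (i = 0; i < n; i++)
       { suma += a[2*i]; sumb += a[2*i+1]; }

     # Reduction chain: the same accumulator updated by several statements
     # in each iteration (a1 -> a2 -> a3 above).
     for (i = 0; i < n; i++)
       { sum += a[2*i]; sum += a[2*i+1]; }

   In the reduction-chain case the vectorized loop may leave more than one
   vector of partial sums, which the code below first combines into a
   single vector.  */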
4189 if (GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)))
4191 tree first_vect = PHI_RESULT (new_phis[0]);
4192 tree tmp;
4193 gassign *new_vec_stmt = NULL;
4195 vec_dest = vect_create_destination_var (scalar_dest, vectype);
4196 for (k = 1; k < new_phis.length (); k++)
4198 gimple *next_phi = new_phis[k];
4199 tree second_vect = PHI_RESULT (next_phi);
4201 tmp = build2 (code, vectype, first_vect, second_vect);
4202 new_vec_stmt = gimple_build_assign (vec_dest, tmp);
4203 first_vect = make_ssa_name (vec_dest, new_vec_stmt);
4204 gimple_assign_set_lhs (new_vec_stmt, first_vect);
4205 gsi_insert_before (&exit_gsi, new_vec_stmt, GSI_SAME_STMT);
4208 new_phi_result = first_vect;
4209 if (new_vec_stmt)
4211 new_phis.truncate (0);
4212 new_phis.safe_push (new_vec_stmt);
4215 else
4216 new_phi_result = PHI_RESULT (new_phis[0]);
4218 /* 2.3 Create the reduction code, using one of the three schemes described
4219 above. In SLP we simply need to extract all the elements from the
4220 vector (without reducing them), so we use scalar shifts. */
4221 if (reduc_code != ERROR_MARK && !slp_reduc)
4223 tree tmp;
4224 tree vec_elem_type;
4226 /*** Case 1: Create:
4227 v_out2 = reduc_expr <v_out1> */
4229 if (dump_enabled_p ())
4230 dump_printf_loc (MSG_NOTE, vect_location,
4231 "Reduce using direct vector reduction.\n");
4233 vec_elem_type = TREE_TYPE (TREE_TYPE (new_phi_result));
4234 if (!useless_type_conversion_p (scalar_type, vec_elem_type))
4236 tree tmp_dest =
4237 vect_create_destination_var (scalar_dest, vec_elem_type);
4238 tmp = build1 (reduc_code, vec_elem_type, new_phi_result);
4239 epilog_stmt = gimple_build_assign (tmp_dest, tmp);
4240 new_temp = make_ssa_name (tmp_dest, epilog_stmt);
4241 gimple_assign_set_lhs (epilog_stmt, new_temp);
4242 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
4244 tmp = build1 (NOP_EXPR, scalar_type, new_temp);
4246 else
4247 tmp = build1 (reduc_code, scalar_type, new_phi_result);
4248 epilog_stmt = gimple_build_assign (new_scalar_dest, tmp);
4249 new_temp = make_ssa_name (new_scalar_dest, epilog_stmt);
4250 gimple_assign_set_lhs (epilog_stmt, new_temp);
4251 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
4252 scalar_results.safe_push (new_temp);
4254 else
4256 bool reduce_with_shift = have_whole_vector_shift (mode);
4257 int element_bitsize = tree_to_uhwi (bitsize);
4258 int vec_size_in_bits = tree_to_uhwi (TYPE_SIZE (vectype));
4259 tree vec_temp;
4261 /* Regardless of whether we have a whole vector shift, if we're
4262 emulating the operation via tree-vect-generic, we don't want
4263 to use it. Only the first round of the reduction is likely
4264 to still be profitable via emulation. */
4265 /* ??? It might be better to emit a reduction tree code here, so that
4266 tree-vect-generic can expand the first round via bit tricks. */
4267 if (!VECTOR_MODE_P (mode))
4268 reduce_with_shift = false;
4269 else
4271 optab optab = optab_for_tree_code (code, vectype, optab_default);
4272 if (optab_handler (optab, mode) == CODE_FOR_nothing)
4273 reduce_with_shift = false;
4276 if (reduce_with_shift && !slp_reduc)
4278 int nelements = vec_size_in_bits / element_bitsize;
4279 unsigned char *sel = XALLOCAVEC (unsigned char, nelements);
4281 int elt_offset;
4283 tree zero_vec = build_zero_cst (vectype);
4284 /*** Case 2: Create:
4285 for (offset = nelements/2; offset >= 1; offset/=2)
4287 Create: va' = vec_shift <va, offset>
4288 Create: va = vop <va, va'>
4289 } */
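/* A concrete trace for a hypothetical 4-element vector va = {a0,a1,a2,a3}
   and vop = PLUS (lanes shifted towards element 0, zero-filled):

     offset = 2:  va' = {a2, a3, 0, 0}        va = {a0+a2, a1+a3, a2, a3}
     offset = 1:  va' = {a1+a3, a2, a3, 0}    va = {a0+a1+a2+a3, ...}

   After the loop the full result sits in the first element; the remaining
   lanes hold don't-care values and are ignored by the extraction below.  */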
4291 tree rhs;
4293 if (dump_enabled_p ())
4294 dump_printf_loc (MSG_NOTE, vect_location,
4295 "Reduce using vector shifts\n");
4297 vec_dest = vect_create_destination_var (scalar_dest, vectype);
4298 new_temp = new_phi_result;
4299 for (elt_offset = nelements / 2;
4300 elt_offset >= 1;
4301 elt_offset /= 2)
4303 calc_vec_perm_mask_for_shift (mode, elt_offset, sel);
4304 tree mask = vect_gen_perm_mask_any (vectype, sel);
4305 epilog_stmt = gimple_build_assign (vec_dest, VEC_PERM_EXPR,
4306 new_temp, zero_vec, mask);
4307 new_name = make_ssa_name (vec_dest, epilog_stmt);
4308 gimple_assign_set_lhs (epilog_stmt, new_name);
4309 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
4311 epilog_stmt = gimple_build_assign (vec_dest, code, new_name,
4312 new_temp);
4313 new_temp = make_ssa_name (vec_dest, epilog_stmt);
4314 gimple_assign_set_lhs (epilog_stmt, new_temp);
4315 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
4318 /* 2.4 Extract the final scalar result. Create:
4319 s_out3 = extract_field <v_out2, bitpos> */
4321 if (dump_enabled_p ())
4322 dump_printf_loc (MSG_NOTE, vect_location,
4323 "extract scalar result\n");
4325 rhs = build3 (BIT_FIELD_REF, scalar_type, new_temp,
4326 bitsize, bitsize_zero_node);
4327 epilog_stmt = gimple_build_assign (new_scalar_dest, rhs);
4328 new_temp = make_ssa_name (new_scalar_dest, epilog_stmt);
4329 gimple_assign_set_lhs (epilog_stmt, new_temp);
4330 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
4331 scalar_results.safe_push (new_temp);
4333 else
4335 /*** Case 3: Create:
4336 s = extract_field <v_out2, 0>
4337 for (offset = element_size;
4338 offset < vector_size;
4339 offset += element_size;)
4341 Create: s' = extract_field <v_out2, offset>
4342 Create: s = op <s, s'> // For non SLP cases
4343 } */
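/* A scalar model of this case for one vector of partial results
   (illustration only; 4 lanes assumed):

     s = v_out2[0];            # extract_field <v_out2, 0>
     s = op (s, v_out2[1]);    # one extract plus one scalar op per element
     s = op (s, v_out2[2]);
     s = op (s, v_out2[3]);

   For SLP the scalar op is skipped and the extracted elements are simply
   collected in SCALAR_RESULTS.  */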
4345 if (dump_enabled_p ())
4346 dump_printf_loc (MSG_NOTE, vect_location,
4347 "Reduce using scalar code.\n");
4349 vec_size_in_bits = tree_to_uhwi (TYPE_SIZE (vectype));
4350 FOR_EACH_VEC_ELT (new_phis, i, new_phi)
4352 int bit_offset;
4353 if (gimple_code (new_phi) == GIMPLE_PHI)
4354 vec_temp = PHI_RESULT (new_phi);
4355 else
4356 vec_temp = gimple_assign_lhs (new_phi);
4357 tree rhs = build3 (BIT_FIELD_REF, scalar_type, vec_temp, bitsize,
4358 bitsize_zero_node);
4359 epilog_stmt = gimple_build_assign (new_scalar_dest, rhs);
4360 new_temp = make_ssa_name (new_scalar_dest, epilog_stmt);
4361 gimple_assign_set_lhs (epilog_stmt, new_temp);
4362 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
4364 /* In SLP we don't need to apply reduction operation, so we just
4365 collect s' values in SCALAR_RESULTS. */
4366 if (slp_reduc)
4367 scalar_results.safe_push (new_temp);
4369 for (bit_offset = element_bitsize;
4370 bit_offset < vec_size_in_bits;
4371 bit_offset += element_bitsize)
4373 tree bitpos = bitsize_int (bit_offset);
4374 tree rhs = build3 (BIT_FIELD_REF, scalar_type, vec_temp,
4375 bitsize, bitpos);
4377 epilog_stmt = gimple_build_assign (new_scalar_dest, rhs);
4378 new_name = make_ssa_name (new_scalar_dest, epilog_stmt);
4379 gimple_assign_set_lhs (epilog_stmt, new_name);
4380 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
4382 if (slp_reduc)
4384 /* In SLP we don't need to apply reduction operation, so
4385 we just collect s' values in SCALAR_RESULTS. */
4386 new_temp = new_name;
4387 scalar_results.safe_push (new_name);
4389 else
4391 epilog_stmt = gimple_build_assign (new_scalar_dest, code,
4392 new_name, new_temp);
4393 new_temp = make_ssa_name (new_scalar_dest, epilog_stmt);
4394 gimple_assign_set_lhs (epilog_stmt, new_temp);
4395 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
4400 /* The only case where we need to reduce scalar results in SLP is
4401 unrolling. If the size of SCALAR_RESULTS is greater than
4402 GROUP_SIZE, we reduce them combining elements modulo
4403 GROUP_SIZE. */
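/* Example of the modulo-GROUP_SIZE combination (hypothetical sizes): with
   GROUP_SIZE = 2 and four extracted scalars r0,r1,r2,r3 from an unrolled
   SLP reduction, the loop below computes

     scalar_results[0] = r0 op r2
     scalar_results[1] = r1 op r3

   i.e. result J of the group accumulates every scalar whose index is
   congruent to J modulo GROUP_SIZE.  */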
4404 if (slp_reduc)
4406 tree res, first_res, new_res;
4407 gimple *new_stmt;
4409 /* Reduce multiple scalar results in case of SLP unrolling. */
4410 for (j = group_size; scalar_results.iterate (j, &res);
4411 j++)
4413 first_res = scalar_results[j % group_size];
4414 new_stmt = gimple_build_assign (new_scalar_dest, code,
4415 first_res, res);
4416 new_res = make_ssa_name (new_scalar_dest, new_stmt);
4417 gimple_assign_set_lhs (new_stmt, new_res);
4418 gsi_insert_before (&exit_gsi, new_stmt, GSI_SAME_STMT);
4419 scalar_results[j % group_size] = new_res;
4422 else
4423 /* Not SLP - we have one scalar to keep in SCALAR_RESULTS. */
4424 scalar_results.safe_push (new_temp);
4428 vect_finalize_reduction:
4430 if (double_reduc)
4431 loop = loop->inner;
4433 /* 2.5 Adjust the final result by the initial value of the reduction
4434 variable. (When such adjustment is not needed, then
4435 'adjustment_def' is zero). For example, if code is PLUS we create:
4436 new_temp = loop_exit_def + adjustment_def */
4438 if (adjustment_def)
4440 gcc_assert (!slp_reduc);
4441 if (nested_in_vect_loop)
4443 new_phi = new_phis[0];
4444 gcc_assert (TREE_CODE (TREE_TYPE (adjustment_def)) == VECTOR_TYPE);
4445 expr = build2 (code, vectype, PHI_RESULT (new_phi), adjustment_def);
4446 new_dest = vect_create_destination_var (scalar_dest, vectype);
4448 else
4450 new_temp = scalar_results[0];
4451 gcc_assert (TREE_CODE (TREE_TYPE (adjustment_def)) != VECTOR_TYPE);
4452 expr = build2 (code, scalar_type, new_temp, adjustment_def);
4453 new_dest = vect_create_destination_var (scalar_dest, scalar_type);
4456 epilog_stmt = gimple_build_assign (new_dest, expr);
4457 new_temp = make_ssa_name (new_dest, epilog_stmt);
4458 gimple_assign_set_lhs (epilog_stmt, new_temp);
4459 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
4460 if (nested_in_vect_loop)
4462 set_vinfo_for_stmt (epilog_stmt,
4463 new_stmt_vec_info (epilog_stmt, loop_vinfo));
4464 STMT_VINFO_RELATED_STMT (vinfo_for_stmt (epilog_stmt)) =
4465 STMT_VINFO_RELATED_STMT (vinfo_for_stmt (new_phi));
4467 if (!double_reduc)
4468 scalar_results.quick_push (new_temp);
4469 else
4470 scalar_results[0] = new_temp;
4472 else
4473 scalar_results[0] = new_temp;
4475 new_phis[0] = epilog_stmt;
4478 /* 2.6 Handle the loop-exit phis. Replace the uses of scalar loop-exit
4479 phis with new adjusted scalar results, i.e., replace use <s_out0>
4480 with use <s_out4>.
4482 Transform:
4483 loop_exit:
4484 s_out0 = phi <s_loop> # (scalar) EXIT_PHI
4485 v_out1 = phi <VECT_DEF> # NEW_EXIT_PHI
4486 v_out2 = reduce <v_out1>
4487 s_out3 = extract_field <v_out2, 0>
4488 s_out4 = adjust_result <s_out3>
4489 use <s_out0>
4490 use <s_out0>
4492 into:
4494 loop_exit:
4495 s_out0 = phi <s_loop> # (scalar) EXIT_PHI
4496 v_out1 = phi <VECT_DEF> # NEW_EXIT_PHI
4497 v_out2 = reduce <v_out1>
4498 s_out3 = extract_field <v_out2, 0>
4499 s_out4 = adjust_result <s_out3>
4500 use <s_out4>
4501 use <s_out4> */
4504 /* In an SLP reduction chain we reduce the vector results into one vector if
4505 necessary, hence we set GROUP_SIZE to 1 here. SCALAR_DEST is the LHS of
4506 the last stmt in the reduction chain, since we are looking for the loop
4507 exit phi node. */
4508 if (GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)))
4510 gimple *dest_stmt = SLP_TREE_SCALAR_STMTS (slp_node)[group_size - 1];
4511 /* Handle reduction patterns. */
4512 if (STMT_VINFO_RELATED_STMT (vinfo_for_stmt (dest_stmt)))
4513 dest_stmt = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (dest_stmt));
4515 scalar_dest = gimple_assign_lhs (dest_stmt);
4516 group_size = 1;
4519 /* In SLP we may have several statements in NEW_PHIS and REDUCTION_PHIS (in
4520 case that GROUP_SIZE is greater than vectorization factor). Therefore, we
4521 need to match SCALAR_RESULTS with corresponding statements. The first
4522 (GROUP_SIZE / number of new vector stmts) scalar results correspond to
4523 the first vector stmt, etc.
4524 (RATIO is equal to (GROUP_SIZE / number of new vector stmts)). */
4525 if (group_size > new_phis.length ())
4527 ratio = group_size / new_phis.length ();
4528 gcc_assert (!(group_size % new_phis.length ()));
4530 else
4531 ratio = 1;
4533 for (k = 0; k < group_size; k++)
4535 if (k % ratio == 0)
4537 epilog_stmt = new_phis[k / ratio];
4538 reduction_phi = reduction_phis[k / ratio];
4539 if (double_reduc)
4540 inner_phi = inner_phis[k / ratio];
4543 if (slp_reduc)
4545 gimple *current_stmt = SLP_TREE_SCALAR_STMTS (slp_node)[k];
4547 orig_stmt = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (current_stmt));
4548 /* SLP statements can't participate in patterns. */
4549 gcc_assert (!orig_stmt);
4550 scalar_dest = gimple_assign_lhs (current_stmt);
4553 phis.create (3);
4554 /* Find the loop-closed-use at the loop exit of the original scalar
4555 result. (The reduction result is expected to have two immediate uses -
4556 one at the latch block, and one at the loop exit). */
4557 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, scalar_dest)
4558 if (!flow_bb_inside_loop_p (loop, gimple_bb (USE_STMT (use_p)))
4559 && !is_gimple_debug (USE_STMT (use_p)))
4560 phis.safe_push (USE_STMT (use_p));
4562 /* While we expect to have found an exit_phi because of loop-closed-ssa
4563 form we can end up without one if the scalar cycle is dead. */
4565 FOR_EACH_VEC_ELT (phis, i, exit_phi)
4567 if (outer_loop)
4569 stmt_vec_info exit_phi_vinfo = vinfo_for_stmt (exit_phi);
4570 gphi *vect_phi;
4572 /* FORNOW. Currently not supporting the case that an inner-loop
4573 reduction is not used in the outer-loop (but only outside the
4574 outer-loop), unless it is double reduction. */
4575 gcc_assert ((STMT_VINFO_RELEVANT_P (exit_phi_vinfo)
4576 && !STMT_VINFO_LIVE_P (exit_phi_vinfo))
4577 || double_reduc);
4579 if (double_reduc)
4580 STMT_VINFO_VEC_STMT (exit_phi_vinfo) = inner_phi;
4581 else
4582 STMT_VINFO_VEC_STMT (exit_phi_vinfo) = epilog_stmt;
4583 if (!double_reduc
4584 || STMT_VINFO_DEF_TYPE (exit_phi_vinfo)
4585 != vect_double_reduction_def)
4586 continue;
4588 /* Handle double reduction:
4590 stmt1: s1 = phi <s0, s2> - double reduction phi (outer loop)
4591 stmt2: s3 = phi <s1, s4> - (regular) reduc phi (inner loop)
4592 stmt3: s4 = use (s3) - (regular) reduc stmt (inner loop)
4593 stmt4: s2 = phi <s4> - double reduction stmt (outer loop)
4595 At that point the regular reduction (stmt2 and stmt3) is
4596 already vectorized, as well as the exit phi node, stmt4.
4597 Here we vectorize the phi node of double reduction, stmt1, and
4598 update all relevant statements. */
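/* A source-level example of a double reduction (hypothetical code):

     s = init;                    # outer-loop cycle: stmt1 / stmt4
     for (i = 0; i < n; i++)      # outer loop
       for (j = 0; j < m; j++)    # inner loop
         s += a[i][j];            # inner reduction: stmt2 / stmt3

   The inner-loop reduction of s has already been vectorized; what is done
   here is to create and wire up the vector phi for the outer-loop cycle
   (the vectorized form of stmt1).  */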
4600 /* Go through all the uses of s2 to find double reduction phi
4601 node, i.e., stmt1 above. */
4602 orig_name = PHI_RESULT (exit_phi);
4603 FOR_EACH_IMM_USE_STMT (use_stmt, imm_iter, orig_name)
4605 stmt_vec_info use_stmt_vinfo;
4606 stmt_vec_info new_phi_vinfo;
4607 tree vect_phi_init, preheader_arg, vect_phi_res, init_def;
4608 basic_block bb = gimple_bb (use_stmt);
4609 gimple *use;
4611 /* Check that USE_STMT is really double reduction phi
4612 node. */
4613 if (gimple_code (use_stmt) != GIMPLE_PHI
4614 || gimple_phi_num_args (use_stmt) != 2
4615 || bb->loop_father != outer_loop)
4616 continue;
4617 use_stmt_vinfo = vinfo_for_stmt (use_stmt);
4618 if (!use_stmt_vinfo
4619 || STMT_VINFO_DEF_TYPE (use_stmt_vinfo)
4620 != vect_double_reduction_def)
4621 continue;
4623 /* Create vector phi node for double reduction:
4624 vs1 = phi <vs0, vs2>
4625 vs1 was created previously in this function by a call to
4626 vect_get_vec_def_for_operand and is stored in
4627 vec_initial_def;
4628 vs2 is defined by INNER_PHI, the vectorized EXIT_PHI;
4629 vs0 is created here. */
4631 /* Create vector phi node. */
4632 vect_phi = create_phi_node (vec_initial_def, bb);
4633 new_phi_vinfo = new_stmt_vec_info (vect_phi,
4634 loop_vec_info_for_loop (outer_loop));
4635 set_vinfo_for_stmt (vect_phi, new_phi_vinfo);
4637 /* Create vs0 - initial def of the double reduction phi. */
4638 preheader_arg = PHI_ARG_DEF_FROM_EDGE (use_stmt,
4639 loop_preheader_edge (outer_loop));
4640 init_def = get_initial_def_for_reduction (stmt,
4641 preheader_arg, NULL);
4642 vect_phi_init = vect_init_vector (use_stmt, init_def,
4643 vectype, NULL);
4645 /* Update phi node arguments with vs0 and vs2. */
4646 add_phi_arg (vect_phi, vect_phi_init,
4647 loop_preheader_edge (outer_loop),
4648 UNKNOWN_LOCATION);
4649 add_phi_arg (vect_phi, PHI_RESULT (inner_phi),
4650 loop_latch_edge (outer_loop), UNKNOWN_LOCATION);
4651 if (dump_enabled_p ())
4653 dump_printf_loc (MSG_NOTE, vect_location,
4654 "created double reduction phi node: ");
4655 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, vect_phi, 0);
4656 dump_printf (MSG_NOTE, "\n");
4659 vect_phi_res = PHI_RESULT (vect_phi);
4661 /* Replace the use, i.e., set the correct vs1 in the regular
4662 reduction phi node. FORNOW, NCOPIES is always 1, so the
4663 loop is redundant. */
4664 use = reduction_phi;
4665 for (j = 0; j < ncopies; j++)
4667 edge pr_edge = loop_preheader_edge (loop);
4668 SET_PHI_ARG_DEF (use, pr_edge->dest_idx, vect_phi_res);
4669 use = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (use));
4675 phis.release ();
4676 if (nested_in_vect_loop)
4678 if (double_reduc)
4679 loop = outer_loop;
4680 else
4681 continue;
4684 phis.create (3);
4685 /* Find the loop-closed-use at the loop exit of the original scalar
4686 result. (The reduction result is expected to have two immediate uses,
4687 one at the latch block, and one at the loop exit). For double
4688 reductions we are looking for exit phis of the outer loop. */
4689 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, scalar_dest)
4691 if (!flow_bb_inside_loop_p (loop, gimple_bb (USE_STMT (use_p))))
4693 if (!is_gimple_debug (USE_STMT (use_p)))
4694 phis.safe_push (USE_STMT (use_p));
4696 else
4698 if (double_reduc && gimple_code (USE_STMT (use_p)) == GIMPLE_PHI)
4700 tree phi_res = PHI_RESULT (USE_STMT (use_p));
4702 FOR_EACH_IMM_USE_FAST (phi_use_p, phi_imm_iter, phi_res)
4704 if (!flow_bb_inside_loop_p (loop,
4705 gimple_bb (USE_STMT (phi_use_p)))
4706 && !is_gimple_debug (USE_STMT (phi_use_p)))
4707 phis.safe_push (USE_STMT (phi_use_p));
4713 FOR_EACH_VEC_ELT (phis, i, exit_phi)
4715 /* Replace the uses: */
4716 orig_name = PHI_RESULT (exit_phi);
4717 scalar_result = scalar_results[k];
4718 FOR_EACH_IMM_USE_STMT (use_stmt, imm_iter, orig_name)
4719 FOR_EACH_IMM_USE_ON_STMT (use_p, imm_iter)
4720 SET_USE (use_p, scalar_result);
4723 phis.release ();
4728 /* Function vectorizable_reduction.
4730 Check if STMT performs a reduction operation that can be vectorized.
4731 If VEC_STMT is also passed, vectorize the STMT: create a vectorized
4732 stmt to replace it, put it in VEC_STMT, and insert it at GSI.
4733 Return FALSE if not a vectorizable STMT, TRUE otherwise.
4735 This function also handles reduction idioms (patterns) that have been
4736 recognized in advance during vect_pattern_recog. In this case, STMT may be
4737 of this form:
4738 X = pattern_expr (arg0, arg1, ..., X)
4739 and its STMT_VINFO_RELATED_STMT points to the last stmt in the original
4740 sequence that had been detected and replaced by the pattern-stmt (STMT).
4742 In some cases of reduction patterns, the type of the reduction variable X is
4743 different than the type of the other arguments of STMT.
4744 In such cases, the vectype that is used when transforming STMT into a vector
4745 stmt is different than the vectype that is used to determine the
4746 vectorization factor, because it consists of a different number of elements
4747 than the actual number of elements that are being operated upon in parallel.
4749 For example, consider an accumulation of shorts into an int accumulator.
4750 On some targets it's possible to vectorize this pattern operating on 8
4751 shorts at a time (hence, the vectype for purposes of determining the
4752 vectorization factor should be V8HI); on the other hand, the vectype that
4753 is used to create the vector form is actually V4SI (the type of the result).
4755 Upon entry to this function, STMT_VINFO_VECTYPE records the vectype that
4756 indicates what is the actual level of parallelism (V8HI in the example), so
4757 that the right vectorization factor would be derived. This vectype
4758 corresponds to the type of arguments to the reduction stmt, and should *NOT*
4759 be used to create the vectorized stmt. The right vectype for the vectorized
4760 stmt is obtained from the type of the result X:
4761 get_vectype_for_scalar_type (TREE_TYPE (X))
4763 This means that, contrary to "regular" reductions (or "regular" stmts in
4764 general), the following equation:
4765 STMT_VINFO_VECTYPE == get_vectype_for_scalar_type (TREE_TYPE (X))
4766 does *NOT* necessarily hold for reduction patterns. */
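/* Illustration, assuming a target with 128-bit vectors (V8HI and V4SI are
   examples only):

     short a[N];  int sum = 0;
     for (i = 0; i < n; i++)
       sum += a[i];               # recognized as a widen_sum pattern

   Here STMT_VINFO_VECTYPE is V8HI (eight shorts determine the
   vectorization factor), while the vectorized statement itself produces
   V4SI values, the type given by
   get_vectype_for_scalar_type (TREE_TYPE (sum)).  */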
4768 bool
4769 vectorizable_reduction (gimple *stmt, gimple_stmt_iterator *gsi,
4770 gimple **vec_stmt, slp_tree slp_node)
4772 tree vec_dest;
4773 tree scalar_dest;
4774 tree loop_vec_def0 = NULL_TREE, loop_vec_def1 = NULL_TREE;
4775 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
4776 tree vectype_out = STMT_VINFO_VECTYPE (stmt_info);
4777 tree vectype_in = NULL_TREE;
4778 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
4779 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
4780 enum tree_code code, orig_code, epilog_reduc_code;
4781 machine_mode vec_mode;
4782 int op_type;
4783 optab optab, reduc_optab;
4784 tree new_temp = NULL_TREE;
4785 gimple *def_stmt;
4786 enum vect_def_type dt;
4787 gphi *new_phi = NULL;
4788 tree scalar_type;
4789 bool is_simple_use;
4790 gimple *orig_stmt;
4791 stmt_vec_info orig_stmt_info;
4792 tree expr = NULL_TREE;
4793 int i;
4794 int ncopies;
4795 int epilog_copies;
4796 stmt_vec_info prev_stmt_info, prev_phi_info;
4797 bool single_defuse_cycle = false;
4798 tree reduc_def = NULL_TREE;
4799 gimple *new_stmt = NULL;
4800 int j;
4801 tree ops[3];
4802 bool nested_cycle = false, found_nested_cycle_def = false;
4803 gimple *reduc_def_stmt = NULL;
4804 bool double_reduc = false, dummy;
4805 basic_block def_bb;
4806 struct loop * def_stmt_loop, *outer_loop = NULL;
4807 tree def_arg;
4808 gimple *def_arg_stmt;
4809 auto_vec<tree> vec_oprnds0;
4810 auto_vec<tree> vec_oprnds1;
4811 auto_vec<tree> vect_defs;
4812 auto_vec<gimple *> phis;
4813 int vec_num;
4814 tree def0, def1, tem, op0, op1 = NULL_TREE;
4815 bool first_p = true;
4817 /* In case of reduction chain we switch to the first stmt in the chain, but
4818 we don't update STMT_INFO, since only the last stmt is marked as reduction
4819 and has reduction properties. */
4820 if (GROUP_FIRST_ELEMENT (stmt_info)
4821 && GROUP_FIRST_ELEMENT (stmt_info) != stmt)
4823 stmt = GROUP_FIRST_ELEMENT (stmt_info);
4824 first_p = false;
4827 if (nested_in_vect_loop_p (loop, stmt))
4829 outer_loop = loop;
4830 loop = loop->inner;
4831 nested_cycle = true;
4834 /* 1. Is vectorizable reduction? */
4835 /* Not supportable if the reduction variable is used in the loop, unless
4836 it's a reduction chain. */
4837 if (STMT_VINFO_RELEVANT (stmt_info) > vect_used_in_outer
4838 && !GROUP_FIRST_ELEMENT (stmt_info))
4839 return false;
4841 /* Reductions that are not used even in an enclosing outer-loop,
4842 are expected to be "live" (used out of the loop). */
4843 if (STMT_VINFO_RELEVANT (stmt_info) == vect_unused_in_scope
4844 && !STMT_VINFO_LIVE_P (stmt_info))
4845 return false;
4847 /* Make sure it was already recognized as a reduction computation. */
4848 if (STMT_VINFO_DEF_TYPE (vinfo_for_stmt (stmt)) != vect_reduction_def
4849 && STMT_VINFO_DEF_TYPE (vinfo_for_stmt (stmt)) != vect_nested_cycle)
4850 return false;
4852 /* 2. Has this been recognized as a reduction pattern?
4854 Check if STMT represents a pattern that has been recognized
4855 in earlier analysis stages. For stmts that represent a pattern,
4856 the STMT_VINFO_RELATED_STMT field records the last stmt in
4857 the original sequence that constitutes the pattern. */
4859 orig_stmt = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (stmt));
4860 if (orig_stmt)
4862 orig_stmt_info = vinfo_for_stmt (orig_stmt);
4863 gcc_assert (STMT_VINFO_IN_PATTERN_P (orig_stmt_info));
4864 gcc_assert (!STMT_VINFO_IN_PATTERN_P (stmt_info));
4867 /* 3. Check the operands of the operation. The first operands are defined
4868 inside the loop body. The last operand is the reduction variable,
4869 which is defined by the loop-header-phi. */
4871 gcc_assert (is_gimple_assign (stmt));
4873 /* Flatten RHS. */
4874 switch (get_gimple_rhs_class (gimple_assign_rhs_code (stmt)))
4876 case GIMPLE_SINGLE_RHS:
4877 op_type = TREE_OPERAND_LENGTH (gimple_assign_rhs1 (stmt));
4878 if (op_type == ternary_op)
4880 tree rhs = gimple_assign_rhs1 (stmt);
4881 ops[0] = TREE_OPERAND (rhs, 0);
4882 ops[1] = TREE_OPERAND (rhs, 1);
4883 ops[2] = TREE_OPERAND (rhs, 2);
4884 code = TREE_CODE (rhs);
4886 else
4887 return false;
4888 break;
4890 case GIMPLE_BINARY_RHS:
4891 code = gimple_assign_rhs_code (stmt);
4892 op_type = TREE_CODE_LENGTH (code);
4893 gcc_assert (op_type == binary_op);
4894 ops[0] = gimple_assign_rhs1 (stmt);
4895 ops[1] = gimple_assign_rhs2 (stmt);
4896 break;
4898 case GIMPLE_TERNARY_RHS:
4899 code = gimple_assign_rhs_code (stmt);
4900 op_type = TREE_CODE_LENGTH (code);
4901 gcc_assert (op_type == ternary_op);
4902 ops[0] = gimple_assign_rhs1 (stmt);
4903 ops[1] = gimple_assign_rhs2 (stmt);
4904 ops[2] = gimple_assign_rhs3 (stmt);
4905 break;
4907 case GIMPLE_UNARY_RHS:
4908 return false;
4910 default:
4911 gcc_unreachable ();
4913 /* The default is that the reduction variable is the last operand in the statement. */
4914 int reduc_index = op_type - 1;
4916 if (code == COND_EXPR && slp_node)
4917 return false;
4919 scalar_dest = gimple_assign_lhs (stmt);
4920 scalar_type = TREE_TYPE (scalar_dest);
4921 if (!POINTER_TYPE_P (scalar_type) && !INTEGRAL_TYPE_P (scalar_type)
4922 && !SCALAR_FLOAT_TYPE_P (scalar_type))
4923 return false;
4925 /* Do not try to vectorize bit-precision reductions. */
4926 if ((TYPE_PRECISION (scalar_type)
4927 != GET_MODE_PRECISION (TYPE_MODE (scalar_type))))
4928 return false;
4930 /* All uses but the last are expected to be defined in the loop.
4931 The last use is the reduction variable. In case of nested cycle this
4932 assumption is not true: we use reduc_index to record the index of the
4933 reduction variable. */
4934 for (i = 0; i < op_type - 1; i++)
4936 /* The condition of COND_EXPR is checked in vectorizable_condition(). */
4937 if (i == 0 && code == COND_EXPR)
4938 continue;
4940 is_simple_use = vect_is_simple_use (ops[i], loop_vinfo,
4941 &def_stmt, &dt, &tem);
4942 if (!vectype_in)
4943 vectype_in = tem;
4944 gcc_assert (is_simple_use);
4946 if (dt != vect_internal_def
4947 && dt != vect_external_def
4948 && dt != vect_constant_def
4949 && dt != vect_induction_def
4950 && !(dt == vect_nested_cycle && nested_cycle))
4951 return false;
4953 if (dt == vect_nested_cycle)
4955 found_nested_cycle_def = true;
4956 reduc_def_stmt = def_stmt;
4957 reduc_index = i;
4961 is_simple_use = vect_is_simple_use (ops[i], loop_vinfo, &def_stmt, &dt, &tem);
4962 if (!vectype_in)
4963 vectype_in = tem;
4964 gcc_assert (is_simple_use);
4965 if (!found_nested_cycle_def)
4966 reduc_def_stmt = def_stmt;
4968 if (reduc_def_stmt && gimple_code (reduc_def_stmt) != GIMPLE_PHI)
4969 return false;
4971 if (!(dt == vect_reduction_def
4972 || dt == vect_nested_cycle
4973 || ((dt == vect_internal_def || dt == vect_external_def
4974 || dt == vect_constant_def || dt == vect_induction_def)
4975 && nested_cycle && found_nested_cycle_def)))
4977 /* For pattern recognized stmts, orig_stmt might be a reduction,
4978 but some helper statements for the pattern might not, or
4979 might be COND_EXPRs with reduction uses in the condition. */
4980 gcc_assert (orig_stmt);
4981 return false;
4984 gimple *tmp = vect_is_simple_reduction (loop_vinfo, reduc_def_stmt,
4985 !nested_cycle, &dummy, false);
4986 if (orig_stmt)
4987 gcc_assert (tmp == orig_stmt
4988 || GROUP_FIRST_ELEMENT (vinfo_for_stmt (tmp)) == orig_stmt);
4989 else
4990 /* We changed STMT to be the first stmt in reduction chain, hence we
4991 check that in this case the first element in the chain is STMT. */
4992 gcc_assert (stmt == tmp
4993 || GROUP_FIRST_ELEMENT (vinfo_for_stmt (tmp)) == stmt);
4995 if (STMT_VINFO_LIVE_P (vinfo_for_stmt (reduc_def_stmt)))
4996 return false;
4998 if (slp_node || PURE_SLP_STMT (stmt_info))
4999 ncopies = 1;
5000 else
5001 ncopies = (LOOP_VINFO_VECT_FACTOR (loop_vinfo)
5002 / TYPE_VECTOR_SUBPARTS (vectype_in));
5004 gcc_assert (ncopies >= 1);
5006 vec_mode = TYPE_MODE (vectype_in);
5008 if (code == COND_EXPR)
5010 if (!vectorizable_condition (stmt, gsi, NULL, ops[reduc_index], 0, NULL))
5012 if (dump_enabled_p ())
5013 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5014 "unsupported condition in reduction\n");
5016 return false;
5019 else
5021 /* 4. Supportable by target? */
5023 if (code == LSHIFT_EXPR || code == RSHIFT_EXPR
5024 || code == LROTATE_EXPR || code == RROTATE_EXPR)
5026 /* Shifts and rotates are only supported by vectorizable_shifts,
5027 not vectorizable_reduction. */
5028 if (dump_enabled_p ())
5029 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5030 "unsupported shift or rotation.\n");
5031 return false;
5034 /* 4.1. check support for the operation in the loop */
5035 optab = optab_for_tree_code (code, vectype_in, optab_default);
5036 if (!optab)
5038 if (dump_enabled_p ())
5039 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5040 "no optab.\n");
5042 return false;
5045 if (optab_handler (optab, vec_mode) == CODE_FOR_nothing)
5047 if (dump_enabled_p ())
5048 dump_printf (MSG_NOTE, "op not supported by target.\n");
5050 if (GET_MODE_SIZE (vec_mode) != UNITS_PER_WORD
5051 || LOOP_VINFO_VECT_FACTOR (loop_vinfo)
5052 < vect_min_worthwhile_factor (code))
5053 return false;
5055 if (dump_enabled_p ())
5056 dump_printf (MSG_NOTE, "proceeding using word mode.\n");
5059 /* Worthwhile without SIMD support? */
5060 if (!VECTOR_MODE_P (TYPE_MODE (vectype_in))
5061 && LOOP_VINFO_VECT_FACTOR (loop_vinfo)
5062 < vect_min_worthwhile_factor (code))
5064 if (dump_enabled_p ())
5065 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5066 "not worthwhile without SIMD support.\n");
5068 return false;
5072 /* 4.2. Check support for the epilog operation.
5074 If STMT represents a reduction pattern, then the type of the
5075 reduction variable may be different than the type of the rest
5076 of the arguments. For example, consider the case of accumulation
5077 of shorts into an int accumulator. The original code:
5078 S1: int_a = (int) short_a;
5079 orig_stmt-> S2: int_acc = plus <int_a ,int_acc>;
5081 was replaced with:
5082 STMT: int_acc = widen_sum <short_a, int_acc>
5084 This means that:
5085 1. The tree-code that is used to create the vector operation in the
5086 epilog code (that reduces the partial results) is not the
5087 tree-code of STMT, but is rather the tree-code of the original
5088 stmt from the pattern that STMT is replacing. I.e, in the example
5089 above we want to use 'widen_sum' in the loop, but 'plus' in the
5090 epilog.
5091 2. The type (mode) we use to check available target support
5092 for the vector operation to be created in the *epilog*, is
5093 determined by the type of the reduction variable (in the example
5094 above we'd check this: optab_handler (plus_optab, vect_int_mode)).
5095 However the type (mode) we use to check available target support
5096 for the vector operation to be created *inside the loop*, is
5097 determined by the type of the other arguments to STMT (in the
5098 example we'd check this: optab_handler (widen_sum_optab,
5099 vect_short_mode)).
5101 This is contrary to "regular" reductions, in which the types of all
5102 the arguments are the same as the type of the reduction variable.
5103 For "regular" reductions we can therefore use the same vector type
5104 (and also the same tree-code) when generating the epilog code and
5105 when generating the code inside the loop. */
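/* For the widen_sum example this amounts to two separate support checks
   (sketch only; the modes shown are illustrative):

     optab_handler (widen_sum_optab, V8HImode)   # op inside the loop, keyed
                                                 # to the arguments' type
     optab_handler (plus_optab, V4SImode)        # epilog reduction, keyed to
                                                 # the reduction variable  */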
5107 if (orig_stmt)
5109 /* This is a reduction pattern: get the vectype from the type of the
5110 reduction variable, and get the tree-code from orig_stmt. */
5111 orig_code = gimple_assign_rhs_code (orig_stmt);
5112 gcc_assert (vectype_out);
5113 vec_mode = TYPE_MODE (vectype_out);
5115 else
5117 /* Regular reduction: the same vectype and tree-code that are used for
5118 the vector code inside the loop can also be used for the epilog code. */
5119 orig_code = code;
5122 if (nested_cycle)
5124 def_bb = gimple_bb (reduc_def_stmt);
5125 def_stmt_loop = def_bb->loop_father;
5126 def_arg = PHI_ARG_DEF_FROM_EDGE (reduc_def_stmt,
5127 loop_preheader_edge (def_stmt_loop));
5128 if (TREE_CODE (def_arg) == SSA_NAME
5129 && (def_arg_stmt = SSA_NAME_DEF_STMT (def_arg))
5130 && gimple_code (def_arg_stmt) == GIMPLE_PHI
5131 && flow_bb_inside_loop_p (outer_loop, gimple_bb (def_arg_stmt))
5132 && vinfo_for_stmt (def_arg_stmt)
5133 && STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def_arg_stmt))
5134 == vect_double_reduction_def)
5135 double_reduc = true;
5138 epilog_reduc_code = ERROR_MARK;
5139 if (reduction_code_for_scalar_code (orig_code, &epilog_reduc_code))
5141 reduc_optab = optab_for_tree_code (epilog_reduc_code, vectype_out,
5142 optab_default);
5143 if (!reduc_optab)
5145 if (dump_enabled_p ())
5146 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5147 "no optab for reduction.\n");
5149 epilog_reduc_code = ERROR_MARK;
5151 else if (optab_handler (reduc_optab, vec_mode) == CODE_FOR_nothing)
5153 optab = scalar_reduc_to_vector (reduc_optab, vectype_out);
5154 if (optab_handler (optab, vec_mode) == CODE_FOR_nothing)
5156 if (dump_enabled_p ())
5157 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5158 "reduc op not supported by target.\n");
5160 epilog_reduc_code = ERROR_MARK;
5164 else
5166 if (!nested_cycle || double_reduc)
5168 if (dump_enabled_p ())
5169 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5170 "no reduc code for scalar code.\n");
5172 return false;
5176 if (double_reduc && ncopies > 1)
5178 if (dump_enabled_p ())
5179 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5180 "multiple types in double reduction\n");
5182 return false;
5185 /* In case of widening multiplication by a constant, we update the type
5186 of the constant to be the type of the other operand. We check that the
5187 constant fits the type in the pattern recognition pass. */
5188 if (code == DOT_PROD_EXPR
5189 && !types_compatible_p (TREE_TYPE (ops[0]), TREE_TYPE (ops[1])))
5191 if (TREE_CODE (ops[0]) == INTEGER_CST)
5192 ops[0] = fold_convert (TREE_TYPE (ops[1]), ops[0]);
5193 else if (TREE_CODE (ops[1]) == INTEGER_CST)
5194 ops[1] = fold_convert (TREE_TYPE (ops[0]), ops[1]);
5195 else
5197 if (dump_enabled_p ())
5198 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5199 "invalid types in dot-prod\n");
5201 return false;
5205 if (!vec_stmt) /* transformation not required. */
5207 if (first_p
5208 && !vect_model_reduction_cost (stmt_info, epilog_reduc_code, ncopies,
5209 reduc_index))
5210 return false;
5211 STMT_VINFO_TYPE (stmt_info) = reduc_vec_info_type;
5212 return true;
5215 /** Transform. **/
5217 if (dump_enabled_p ())
5218 dump_printf_loc (MSG_NOTE, vect_location, "transform reduction.\n");
5220 /* FORNOW: Multiple types are not supported for condition. */
5221 if (code == COND_EXPR)
5222 gcc_assert (ncopies == 1);
5224 /* Create the destination vector */
5225 vec_dest = vect_create_destination_var (scalar_dest, vectype_out);
5227 /* In case the vectorization factor (VF) is bigger than the number
5228 of elements that we can fit in a vectype (nunits), we have to generate
5229 more than one vector stmt - i.e - we need to "unroll" the
5230 vector stmt by a factor VF/nunits. For more details see documentation
5231 in vectorizable_operation. */
5233 /* If the reduction is used in an outer loop we need to generate
5234 VF intermediate results, like so (e.g. for ncopies=2):
5235 r0 = phi (init, r0)
5236 r1 = phi (init, r1)
5237 r0 = x0 + r0;
5238 r1 = x1 + r1;
5239 (i.e. we generate VF results in 2 registers).
5240 In this case we have a separate def-use cycle for each copy, and therefore
5241 for each copy we get the vector def for the reduction variable from the
5242 respective phi node created for this copy.
5244 Otherwise (the reduction is unused in the loop nest), we can combine
5245 together intermediate results, like so (e.g. for ncopies=2):
5246 r = phi (init, r)
5247 r = x0 + r;
5248 r = x1 + r;
5249 (i.e. we generate VF/2 results in a single register).
5250 In this case for each copy we get the vector def for the reduction variable
5251 from the vectorized reduction operation generated in the previous iteration.
5254 if (STMT_VINFO_RELEVANT (stmt_info) == vect_unused_in_scope)
5256 single_defuse_cycle = true;
5257 epilog_copies = 1;
5259 else
5260 epilog_copies = ncopies;
5262 prev_stmt_info = NULL;
5263 prev_phi_info = NULL;
5264 if (slp_node)
5265 vec_num = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node);
5266 else
5268 vec_num = 1;
5269 vec_oprnds0.create (1);
5270 if (op_type == ternary_op)
5271 vec_oprnds1.create (1);
5274 phis.create (vec_num);
5275 vect_defs.create (vec_num);
5276 if (!slp_node)
5277 vect_defs.quick_push (NULL_TREE);
5279 for (j = 0; j < ncopies; j++)
5281 if (j == 0 || !single_defuse_cycle)
5283 for (i = 0; i < vec_num; i++)
5285 /* Create the reduction-phi that defines the reduction
5286 operand. */
5287 new_phi = create_phi_node (vec_dest, loop->header);
5288 set_vinfo_for_stmt (new_phi,
5289 new_stmt_vec_info (new_phi, loop_vinfo));
5290 if (j == 0 || slp_node)
5291 phis.quick_push (new_phi);
5295 if (code == COND_EXPR)
5297 gcc_assert (!slp_node);
5298 vectorizable_condition (stmt, gsi, vec_stmt,
5299 PHI_RESULT (phis[0]),
5300 reduc_index, NULL);
5301 /* Multiple types are not supported for condition. */
5302 break;
5305 /* Handle uses. */
5306 if (j == 0)
5308 op0 = ops[!reduc_index];
5309 if (op_type == ternary_op)
5311 if (reduc_index == 0)
5312 op1 = ops[2];
5313 else
5314 op1 = ops[1];
5317 if (slp_node)
5318 vect_get_vec_defs (op0, op1, stmt, &vec_oprnds0, &vec_oprnds1,
5319 slp_node, -1);
5320 else
5322 loop_vec_def0 = vect_get_vec_def_for_operand (ops[!reduc_index],
5323 stmt);
5324 vec_oprnds0.quick_push (loop_vec_def0);
5325 if (op_type == ternary_op)
5327 loop_vec_def1 = vect_get_vec_def_for_operand (op1, stmt);
5328 vec_oprnds1.quick_push (loop_vec_def1);
5332 else
5334 if (!slp_node)
5336 enum vect_def_type dt;
5337 gimple *dummy_stmt;
5339 vect_is_simple_use (ops[!reduc_index], loop_vinfo,
5340 &dummy_stmt, &dt);
5341 loop_vec_def0 = vect_get_vec_def_for_stmt_copy (dt,
5342 loop_vec_def0);
5343 vec_oprnds0[0] = loop_vec_def0;
5344 if (op_type == ternary_op)
5346 vect_is_simple_use (op1, loop_vinfo, &dummy_stmt, &dt);
5347 loop_vec_def1 = vect_get_vec_def_for_stmt_copy (dt,
5348 loop_vec_def1);
5349 vec_oprnds1[0] = loop_vec_def1;
5353 if (single_defuse_cycle)
5354 reduc_def = gimple_assign_lhs (new_stmt);
5356 STMT_VINFO_RELATED_STMT (prev_phi_info) = new_phi;
5359 FOR_EACH_VEC_ELT (vec_oprnds0, i, def0)
5361 if (slp_node)
5362 reduc_def = PHI_RESULT (phis[i]);
5363 else
5365 if (!single_defuse_cycle || j == 0)
5366 reduc_def = PHI_RESULT (new_phi);
5369 def1 = ((op_type == ternary_op)
5370 ? vec_oprnds1[i] : NULL);
5371 if (op_type == binary_op)
5373 if (reduc_index == 0)
5374 expr = build2 (code, vectype_out, reduc_def, def0);
5375 else
5376 expr = build2 (code, vectype_out, def0, reduc_def);
5378 else
5380 if (reduc_index == 0)
5381 expr = build3 (code, vectype_out, reduc_def, def0, def1);
5382 else
5384 if (reduc_index == 1)
5385 expr = build3 (code, vectype_out, def0, reduc_def, def1);
5386 else
5387 expr = build3 (code, vectype_out, def0, def1, reduc_def);
5391 new_stmt = gimple_build_assign (vec_dest, expr);
5392 new_temp = make_ssa_name (vec_dest, new_stmt);
5393 gimple_assign_set_lhs (new_stmt, new_temp);
5394 vect_finish_stmt_generation (stmt, new_stmt, gsi);
5396 if (slp_node)
5398 SLP_TREE_VEC_STMTS (slp_node).quick_push (new_stmt);
5399 vect_defs.quick_push (new_temp);
5401 else
5402 vect_defs[0] = new_temp;
5405 if (slp_node)
5406 continue;
5408 if (j == 0)
5409 STMT_VINFO_VEC_STMT (stmt_info) = *vec_stmt = new_stmt;
5410 else
5411 STMT_VINFO_RELATED_STMT (prev_stmt_info) = new_stmt;
5413 prev_stmt_info = vinfo_for_stmt (new_stmt);
5414 prev_phi_info = vinfo_for_stmt (new_phi);
5417 /* Finalize the reduction-phi (set its arguments) and create the
5418 epilog reduction code. */
5419 if ((!single_defuse_cycle || code == COND_EXPR) && !slp_node)
5421 new_temp = gimple_assign_lhs (*vec_stmt);
5422 vect_defs[0] = new_temp;
5425 vect_create_epilog_for_reduction (vect_defs, stmt, epilog_copies,
5426 epilog_reduc_code, phis, reduc_index,
5427 double_reduc, slp_node);
5429 return true;
5432 /* Function vect_min_worthwhile_factor.
5434 For a loop where we could vectorize the operation indicated by CODE,
5435 return the minimum vectorization factor that makes it worthwhile
5436 to use generic vectors. */
5438 vect_min_worthwhile_factor (enum tree_code code)
5440 switch (code)
5442 case PLUS_EXPR:
5443 case MINUS_EXPR:
5444 case NEGATE_EXPR:
5445 return 4;
5447 case BIT_AND_EXPR:
5448 case BIT_IOR_EXPR:
5449 case BIT_XOR_EXPR:
5450 case BIT_NOT_EXPR:
5451 return 2;
5453 default:
5454 return INT_MAX;
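/* A typical caller (sketch only; the exact guard varies) compares the
   loop's vectorization factor against this minimum before committing to
   generic, non-SIMD vectors:

     if (!VECTOR_MODE_P (TYPE_MODE (vectype))
	 && LOOP_VINFO_VECT_FACTOR (loop_vinfo)
	    < vect_min_worthwhile_factor (code))
       return false;

   so e.g. a PLUS_EXPR needs a factor of at least 4, while the bitwise
   operations above are considered worthwhile already at a factor of 2.  */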
5459 /* Function vectorizable_induction
5461 Check if PHI performs an induction computation that can be vectorized.
5462 If VEC_STMT is also passed, vectorize the induction PHI: create a vectorized
5463 phi to replace it, put it in VEC_STMT, and add it to the same basic block.
5464 Return FALSE if not a vectorizable STMT, TRUE otherwise. */
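/* For illustration (assuming a vectorization factor of 4 and eliding
   types), the scalar induction

     i_1 = PHI <0 (preheader), i_2 (latch)>
     i_2 = i_1 + 1;

   is replaced by a vector phi whose preheader value is the vector
   { 0, 1, 2, 3 } and whose latch value is incremented by the step
   vector { 4, 4, 4, 4 } in every vector iteration.  */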
5466 bool
5467 vectorizable_induction (gimple *phi,
5468 gimple_stmt_iterator *gsi ATTRIBUTE_UNUSED,
5469 gimple **vec_stmt)
5471 stmt_vec_info stmt_info = vinfo_for_stmt (phi);
5472 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
5473 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
5474 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
5475 int nunits = TYPE_VECTOR_SUBPARTS (vectype);
5476 int ncopies = LOOP_VINFO_VECT_FACTOR (loop_vinfo) / nunits;
5477 tree vec_def;
5479 gcc_assert (ncopies >= 1);
5480 /* FORNOW. These restrictions should be relaxed. */
5481 if (nested_in_vect_loop_p (loop, phi))
5483 imm_use_iterator imm_iter;
5484 use_operand_p use_p;
5485 gimple *exit_phi;
5486 edge latch_e;
5487 tree loop_arg;
5489 if (ncopies > 1)
5491 if (dump_enabled_p ())
5492 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5493 "multiple types in nested loop.\n");
5494 return false;
5497 exit_phi = NULL;
5498 latch_e = loop_latch_edge (loop->inner);
5499 loop_arg = PHI_ARG_DEF_FROM_EDGE (phi, latch_e);
5500 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, loop_arg)
5502 gimple *use_stmt = USE_STMT (use_p);
5503 if (is_gimple_debug (use_stmt))
5504 continue;
5506 if (!flow_bb_inside_loop_p (loop->inner, gimple_bb (use_stmt)))
5508 exit_phi = use_stmt;
5509 break;
5512 if (exit_phi)
5514 stmt_vec_info exit_phi_vinfo = vinfo_for_stmt (exit_phi);
5515 if (!(STMT_VINFO_RELEVANT_P (exit_phi_vinfo)
5516 && !STMT_VINFO_LIVE_P (exit_phi_vinfo)))
5518 if (dump_enabled_p ())
5519 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5520 "inner-loop induction only used outside "
5521 "of the outer vectorized loop.\n");
5522 return false;
5527 if (!STMT_VINFO_RELEVANT_P (stmt_info))
5528 return false;
5530 /* FORNOW: SLP not supported. */
5531 if (STMT_SLP_TYPE (stmt_info))
5532 return false;
5534 gcc_assert (STMT_VINFO_DEF_TYPE (stmt_info) == vect_induction_def);
5536 if (gimple_code (phi) != GIMPLE_PHI)
5537 return false;
5539 if (!vec_stmt) /* transformation not required. */
5541 STMT_VINFO_TYPE (stmt_info) = induc_vec_info_type;
5542 if (dump_enabled_p ())
5543 dump_printf_loc (MSG_NOTE, vect_location,
5544 "=== vectorizable_induction ===\n");
5545 vect_model_induction_cost (stmt_info, ncopies);
5546 return true;
5549 /** Transform. **/
5551 if (dump_enabled_p ())
5552 dump_printf_loc (MSG_NOTE, vect_location, "transform induction phi.\n");
5554 vec_def = get_initial_def_for_induction (phi);
5555 *vec_stmt = SSA_NAME_DEF_STMT (vec_def);
5556 return true;
5559 /* Function vectorizable_live_operation.
5561 STMT computes a value that is used outside the loop. Check if
5562 it can be supported. */
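/* For illustration: a statement such as

     last_5 = x_2 + y_3;

   inside the loop, where x_2 and y_3 are defined before the loop and
   last_5 is also used after the loop, is a "live" operation.  When all
   its operands are invariant (the only case supported below) it can be
   left in place unvectorized and its final scalar value used.  */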
5564 bool
5565 vectorizable_live_operation (gimple *stmt,
5566 gimple_stmt_iterator *gsi ATTRIBUTE_UNUSED,
5567 gimple **vec_stmt)
5569 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
5570 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
5571 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
5572 int i;
5573 int op_type;
5574 tree op;
5575 gimple *def_stmt;
5576 enum vect_def_type dt;
5577 enum tree_code code;
5578 enum gimple_rhs_class rhs_class;
5580 gcc_assert (STMT_VINFO_LIVE_P (stmt_info));
5582 if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_reduction_def)
5583 return false;
5585 if (!is_gimple_assign (stmt))
5587 if (gimple_call_internal_p (stmt)
5588 && gimple_call_internal_fn (stmt) == IFN_GOMP_SIMD_LANE
5589 && gimple_call_lhs (stmt)
5590 && loop->simduid
5591 && TREE_CODE (gimple_call_arg (stmt, 0)) == SSA_NAME
5592 && loop->simduid
5593 == SSA_NAME_VAR (gimple_call_arg (stmt, 0)))
5595 edge e = single_exit (loop);
5596 basic_block merge_bb = e->dest;
5597 imm_use_iterator imm_iter;
5598 use_operand_p use_p;
5599 tree lhs = gimple_call_lhs (stmt);
5601 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, lhs)
5603 gimple *use_stmt = USE_STMT (use_p);
5604 if (gimple_code (use_stmt) == GIMPLE_PHI
5605 && gimple_bb (use_stmt) == merge_bb)
5607 if (vec_stmt)
5609 tree vfm1
5610 = build_int_cst (unsigned_type_node,
5611 loop_vinfo->vectorization_factor - 1);
5612 SET_PHI_ARG_DEF (use_stmt, e->dest_idx, vfm1);
5614 return true;
5619 return false;
5622 if (TREE_CODE (gimple_assign_lhs (stmt)) != SSA_NAME)
5623 return false;
5625 /* FORNOW. CHECKME. */
5626 if (nested_in_vect_loop_p (loop, stmt))
5627 return false;
5629 code = gimple_assign_rhs_code (stmt);
5630 op_type = TREE_CODE_LENGTH (code);
5631 rhs_class = get_gimple_rhs_class (code);
5632 gcc_assert (rhs_class != GIMPLE_UNARY_RHS || op_type == unary_op);
5633 gcc_assert (rhs_class != GIMPLE_BINARY_RHS || op_type == binary_op);
5635 /* FORNOW: support only if all uses are invariant. This means
5636 that the scalar operations can remain in place, unvectorized.
5637 The original last scalar value that they compute will be used. */
5639 for (i = 0; i < op_type; i++)
5641 if (rhs_class == GIMPLE_SINGLE_RHS)
5642 op = TREE_OPERAND (gimple_op (stmt, 1), i);
5643 else
5644 op = gimple_op (stmt, i + 1);
5645 if (op
5646 && !vect_is_simple_use (op, loop_vinfo, &def_stmt, &dt))
5648 if (dump_enabled_p ())
5649 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5650 "use not simple.\n");
5651 return false;
5654 if (dt != vect_external_def && dt != vect_constant_def)
5655 return false;
5658 /* No transformation is required for the cases we currently support. */
5659 return true;
5662 /* Kill any debug uses outside LOOP of SSA names defined in STMT. */
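/* For illustration: a debug bind such as

     # DEBUG x => i_7

   in a basic block after LOOP, where i_7 is defined by STMT inside the
   loop, gets its value reset so it no longer refers to a name the
   transformation may invalidate.  */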
5664 static void
5665 vect_loop_kill_debug_uses (struct loop *loop, gimple *stmt)
5667 ssa_op_iter op_iter;
5668 imm_use_iterator imm_iter;
5669 def_operand_p def_p;
5670 gimple *ustmt;
5672 FOR_EACH_PHI_OR_STMT_DEF (def_p, stmt, op_iter, SSA_OP_DEF)
5674 FOR_EACH_IMM_USE_STMT (ustmt, imm_iter, DEF_FROM_PTR (def_p))
5676 basic_block bb;
5678 if (!is_gimple_debug (ustmt))
5679 continue;
5681 bb = gimple_bb (ustmt);
5683 if (!flow_bb_inside_loop_p (loop, bb))
5685 if (gimple_debug_bind_p (ustmt))
5687 if (dump_enabled_p ())
5688 dump_printf_loc (MSG_NOTE, vect_location,
5689 "killing debug use\n");
5691 gimple_debug_bind_reset_value (ustmt);
5692 update_stmt (ustmt);
5694 else
5695 gcc_unreachable ();
5702 /* This function builds ni_name = number of iterations. Statements
5703 are emitted on the loop preheader edge. */
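/* For example (illustrative SSA names): for a loop whose iteration count
   is the symbolic expression n_5 + 1 this emits

     niters_9 = n_5 + 1;

   on the preheader edge and returns niters_9, whereas a compile-time
   constant count is simply returned unchanged as an INTEGER_CST.  */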
5705 static tree
5706 vect_build_loop_niters (loop_vec_info loop_vinfo)
5708 tree ni = unshare_expr (LOOP_VINFO_NITERS (loop_vinfo));
5709 if (TREE_CODE (ni) == INTEGER_CST)
5710 return ni;
5711 else
5713 tree ni_name, var;
5714 gimple_seq stmts = NULL;
5715 edge pe = loop_preheader_edge (LOOP_VINFO_LOOP (loop_vinfo));
5717 var = create_tmp_var (TREE_TYPE (ni), "niters");
5718 ni_name = force_gimple_operand (ni, &stmts, false, var);
5719 if (stmts)
5720 gsi_insert_seq_on_edge_immediate (pe, stmts);
5722 return ni_name;
5727 /* This function generates the following statements:
5729 ni_name = number of iterations the loop executes
5730 ratio = ni_name / vf
5731 ratio_mult_vf_name = ratio * vf
5733 and places them on the loop preheader edge. */
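/* A worked example (illustrative numbers): with ni_name == 100 and
   vf == 8 the values computed on the preheader edge are

     ratio         = 100 / 8 = 12   (rounded down)
     ratio_mult_vf = 12 * 8  = 96

   so the vector loop executes 12 iterations covering 96 scalar
   iterations, and 4 iterations are left for the scalar epilogue.  */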
5735 static void
5736 vect_generate_tmps_on_preheader (loop_vec_info loop_vinfo,
5737 tree ni_name,
5738 tree *ratio_mult_vf_name_ptr,
5739 tree *ratio_name_ptr)
5741 tree ni_minus_gap_name;
5742 tree var;
5743 tree ratio_name;
5744 tree ratio_mult_vf_name;
5745 int vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
5746 edge pe = loop_preheader_edge (LOOP_VINFO_LOOP (loop_vinfo));
5747 tree log_vf;
5749 log_vf = build_int_cst (TREE_TYPE (ni_name), exact_log2 (vf));
5751 /* If an epilogue loop is required because of data accesses with gaps, we
5752 subtract one iteration from the total number of iterations here for
5753 correct calculation of RATIO. */
5754 if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo))
5756 ni_minus_gap_name = fold_build2 (MINUS_EXPR, TREE_TYPE (ni_name),
5757 ni_name,
5758 build_one_cst (TREE_TYPE (ni_name)));
5759 if (!is_gimple_val (ni_minus_gap_name))
5761 var = create_tmp_var (TREE_TYPE (ni_name), "ni_gap");
5762 gimple *stmts = NULL;
5763 ni_minus_gap_name = force_gimple_operand (ni_minus_gap_name, &stmts,
5764 true, var);
5765 gsi_insert_seq_on_edge_immediate (pe, stmts);
5768 else
5769 ni_minus_gap_name = ni_name;
5771 /* Create: ratio = ni >> log2(vf) */
5772 /* ??? As we have ni == number of latch executions + 1, ni could
5773 have overflowed to zero. So avoid computing ratio directly from ni;
5774 instead use the fact that ratio is known to be at least one and
5775 compute it as (ni - vf) >> log2(vf) + 1. */
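/* Worked example of the overflow case (32-bit unsigned niter type,
   vf == 8): if the latch executes 0xffffffff times then ni wraps to 0,
   and ni >> 3 would yield the bogus ratio 0.  The expression below
   instead computes ((0 - 8) >> 3) + 1 == 0x1fffffff + 1 == 0x20000000,
   which is exactly 2^32 / 8, the ratio we want.  */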
5776 ratio_name
5777 = fold_build2 (PLUS_EXPR, TREE_TYPE (ni_name),
5778 fold_build2 (RSHIFT_EXPR, TREE_TYPE (ni_name),
5779 fold_build2 (MINUS_EXPR, TREE_TYPE (ni_name),
5780 ni_minus_gap_name,
5781 build_int_cst
5782 (TREE_TYPE (ni_name), vf)),
5783 log_vf),
5784 build_int_cst (TREE_TYPE (ni_name), 1));
5785 if (!is_gimple_val (ratio_name))
5787 var = create_tmp_var (TREE_TYPE (ni_name), "bnd");
5788 gimple *stmts = NULL;
5789 ratio_name = force_gimple_operand (ratio_name, &stmts, true, var);
5790 gsi_insert_seq_on_edge_immediate (pe, stmts);
5792 *ratio_name_ptr = ratio_name;
5794 /* Create: ratio_mult_vf = ratio << log2 (vf). */
5796 if (ratio_mult_vf_name_ptr)
5798 ratio_mult_vf_name = fold_build2 (LSHIFT_EXPR, TREE_TYPE (ratio_name),
5799 ratio_name, log_vf);
5800 if (!is_gimple_val (ratio_mult_vf_name))
5802 var = create_tmp_var (TREE_TYPE (ni_name), "ratio_mult_vf");
5803 gimple *stmts = NULL;
5804 ratio_mult_vf_name = force_gimple_operand (ratio_mult_vf_name, &stmts,
5805 true, var);
5806 gsi_insert_seq_on_edge_immediate (pe, stmts);
5808 *ratio_mult_vf_name_ptr = ratio_mult_vf_name;
5811 return;
5815 /* Function vect_transform_loop.
5817 The analysis phase has determined that the loop is vectorizable.
5818 Vectorize the loop: create vectorized stmts to replace the scalar
5819 stmts in the loop, and update the loop exit condition. */
5821 void
5822 vect_transform_loop (loop_vec_info loop_vinfo)
5824 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
5825 basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
5826 int nbbs = loop->num_nodes;
5827 int i;
5828 tree ratio = NULL;
5829 int vectorization_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
5830 bool grouped_store;
5831 bool slp_scheduled = false;
5832 gimple *stmt, *pattern_stmt;
5833 gimple_seq pattern_def_seq = NULL;
5834 gimple_stmt_iterator pattern_def_si = gsi_none ();
5835 bool transform_pattern_stmt = false;
5836 bool check_profitability = false;
5837 int th;
5838 /* Record number of iterations before we started tampering with the profile. */
5839 gcov_type expected_iterations = expected_loop_iterations_unbounded (loop);
5841 if (dump_enabled_p ())
5842 dump_printf_loc (MSG_NOTE, vect_location, "=== vect_transform_loop ===\n");
5844 /* If the profile is imprecise, we have a chance to fix it up. */
5845 if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
5846 expected_iterations = LOOP_VINFO_INT_NITERS (loop_vinfo);
5848 /* Use the more conservative vectorization threshold. If the number
5849 of iterations is constant, assume the cost check has been performed
5850 by our caller. If the threshold makes all loops profitable that
5851 run at least the vectorization factor number of times, checking
5852 is pointless, too. */
5853 th = LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo);
5854 if (th >= LOOP_VINFO_VECT_FACTOR (loop_vinfo) - 1
5855 && !LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
5857 if (dump_enabled_p ())
5858 dump_printf_loc (MSG_NOTE, vect_location,
5859 "Profitability threshold is %d loop iterations.\n",
5860 th);
5861 check_profitability = true;
5864 /* Version the loop first, if required, so the profitability check
5865 comes first. */
5867 if (LOOP_REQUIRES_VERSIONING_FOR_ALIGNMENT (loop_vinfo)
5868 || LOOP_REQUIRES_VERSIONING_FOR_ALIAS (loop_vinfo))
5870 vect_loop_versioning (loop_vinfo, th, check_profitability);
5871 check_profitability = false;
5874 tree ni_name = vect_build_loop_niters (loop_vinfo);
5875 LOOP_VINFO_NITERS_UNCHANGED (loop_vinfo) = ni_name;
5877 /* Peel the loop if there are data refs with unknown alignment.
5878 Only one data ref with unknown alignment can be aligned this way. */
5880 if (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo))
5882 vect_do_peeling_for_alignment (loop_vinfo, ni_name,
5883 th, check_profitability);
5884 check_profitability = false;
5885 /* The above adjusts LOOP_VINFO_NITERS, so ni_name has to
5886 be re-computed. */
5887 ni_name = NULL_TREE;
5890 /* If the loop has a symbolic number of iterations 'n' (i.e. it's not a
5891 compile time constant), or it is a constant that is not a multiple of the
5892 vectorization factor, then an epilog loop needs to be created.
5893 We therefore duplicate the loop: the original loop will be vectorized,
5894 and will compute the first (n/VF) iterations. The second copy of the loop
5895 will remain scalar and will compute the remaining (n%VF) iterations.
5896 (VF is the vectorization factor). */
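/* For illustration: with n == 103 and VF == 8 the vectorized copy runs
   floor (103/8) == 12 iterations, covering the first 96 scalar
   iterations, and the scalar epilog loop runs the remaining
   103 % 8 == 7 iterations.  */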
5898 if (LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo)
5899 || LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo))
5901 tree ratio_mult_vf;
5902 if (!ni_name)
5903 ni_name = vect_build_loop_niters (loop_vinfo);
5904 vect_generate_tmps_on_preheader (loop_vinfo, ni_name, &ratio_mult_vf,
5905 &ratio);
5906 vect_do_peeling_for_loop_bound (loop_vinfo, ni_name, ratio_mult_vf,
5907 th, check_profitability);
5909 else if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
5910 ratio = build_int_cst (TREE_TYPE (LOOP_VINFO_NITERS (loop_vinfo)),
5911 LOOP_VINFO_INT_NITERS (loop_vinfo) / vectorization_factor);
5912 else
5914 if (!ni_name)
5915 ni_name = vect_build_loop_niters (loop_vinfo);
5916 vect_generate_tmps_on_preheader (loop_vinfo, ni_name, NULL, &ratio);
5919 /* 1) Make sure the loop header has exactly two entries
5920 2) Make sure we have a preheader basic block. */
5922 gcc_assert (EDGE_COUNT (loop->header->preds) == 2);
5924 split_edge (loop_preheader_edge (loop));
5926 /* FORNOW: the vectorizer supports only loops whose body consists
5927 of one basic block (header + empty latch). When the vectorizer
5928 supports more involved loop forms, the order in which the BBs are
5929 traversed will need to be reconsidered. */
5931 for (i = 0; i < nbbs; i++)
5933 basic_block bb = bbs[i];
5934 stmt_vec_info stmt_info;
5936 for (gphi_iterator si = gsi_start_phis (bb); !gsi_end_p (si);
5937 gsi_next (&si))
5939 gphi *phi = si.phi ();
5940 if (dump_enabled_p ())
5942 dump_printf_loc (MSG_NOTE, vect_location,
5943 "------>vectorizing phi: ");
5944 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, phi, 0);
5945 dump_printf (MSG_NOTE, "\n");
5947 stmt_info = vinfo_for_stmt (phi);
5948 if (!stmt_info)
5949 continue;
5951 if (MAY_HAVE_DEBUG_STMTS && !STMT_VINFO_LIVE_P (stmt_info))
5952 vect_loop_kill_debug_uses (loop, phi);
5954 if (!STMT_VINFO_RELEVANT_P (stmt_info)
5955 && !STMT_VINFO_LIVE_P (stmt_info))
5956 continue;
5958 if (STMT_VINFO_VECTYPE (stmt_info)
5959 && (TYPE_VECTOR_SUBPARTS (STMT_VINFO_VECTYPE (stmt_info))
5960 != (unsigned HOST_WIDE_INT) vectorization_factor)
5961 && dump_enabled_p ())
5962 dump_printf_loc (MSG_NOTE, vect_location, "multiple-types.\n");
5964 if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_induction_def)
5966 if (dump_enabled_p ())
5967 dump_printf_loc (MSG_NOTE, vect_location, "transform phi.\n");
5968 vect_transform_stmt (phi, NULL, NULL, NULL, NULL);
5972 pattern_stmt = NULL;
5973 for (gimple_stmt_iterator si = gsi_start_bb (bb);
5974 !gsi_end_p (si) || transform_pattern_stmt;)
5976 bool is_store;
5978 if (transform_pattern_stmt)
5979 stmt = pattern_stmt;
5980 else
5982 stmt = gsi_stmt (si);
5983 /* During vectorization remove existing clobber stmts. */
5984 if (gimple_clobber_p (stmt))
5986 unlink_stmt_vdef (stmt);
5987 gsi_remove (&si, true);
5988 release_defs (stmt);
5989 continue;
5993 if (dump_enabled_p ())
5995 dump_printf_loc (MSG_NOTE, vect_location,
5996 "------>vectorizing statement: ");
5997 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, stmt, 0);
5998 dump_printf (MSG_NOTE, "\n");
6001 stmt_info = vinfo_for_stmt (stmt);
6003 /* vector stmts created in the outer-loop during vectorization of
6004 stmts in an inner-loop may not have a stmt_info, and do not
6005 need to be vectorized. */
6006 if (!stmt_info)
6008 gsi_next (&si);
6009 continue;
6012 if (MAY_HAVE_DEBUG_STMTS && !STMT_VINFO_LIVE_P (stmt_info))
6013 vect_loop_kill_debug_uses (loop, stmt);
6015 if (!STMT_VINFO_RELEVANT_P (stmt_info)
6016 && !STMT_VINFO_LIVE_P (stmt_info))
6018 if (STMT_VINFO_IN_PATTERN_P (stmt_info)
6019 && (pattern_stmt = STMT_VINFO_RELATED_STMT (stmt_info))
6020 && (STMT_VINFO_RELEVANT_P (vinfo_for_stmt (pattern_stmt))
6021 || STMT_VINFO_LIVE_P (vinfo_for_stmt (pattern_stmt))))
6023 stmt = pattern_stmt;
6024 stmt_info = vinfo_for_stmt (stmt);
6026 else
6028 gsi_next (&si);
6029 continue;
6032 else if (STMT_VINFO_IN_PATTERN_P (stmt_info)
6033 && (pattern_stmt = STMT_VINFO_RELATED_STMT (stmt_info))
6034 && (STMT_VINFO_RELEVANT_P (vinfo_for_stmt (pattern_stmt))
6035 || STMT_VINFO_LIVE_P (vinfo_for_stmt (pattern_stmt))))
6036 transform_pattern_stmt = true;
6038 /* If pattern statement has def stmts, vectorize them too. */
6039 if (is_pattern_stmt_p (stmt_info))
6041 if (pattern_def_seq == NULL)
6043 pattern_def_seq = STMT_VINFO_PATTERN_DEF_SEQ (stmt_info);
6044 pattern_def_si = gsi_start (pattern_def_seq);
6046 else if (!gsi_end_p (pattern_def_si))
6047 gsi_next (&pattern_def_si);
6048 if (pattern_def_seq != NULL)
6050 gimple *pattern_def_stmt = NULL;
6051 stmt_vec_info pattern_def_stmt_info = NULL;
6053 while (!gsi_end_p (pattern_def_si))
6055 pattern_def_stmt = gsi_stmt (pattern_def_si);
6056 pattern_def_stmt_info
6057 = vinfo_for_stmt (pattern_def_stmt);
6058 if (STMT_VINFO_RELEVANT_P (pattern_def_stmt_info)
6059 || STMT_VINFO_LIVE_P (pattern_def_stmt_info))
6060 break;
6061 gsi_next (&pattern_def_si);
6064 if (!gsi_end_p (pattern_def_si))
6066 if (dump_enabled_p ())
6068 dump_printf_loc (MSG_NOTE, vect_location,
6069 "==> vectorizing pattern def "
6070 "stmt: ");
6071 dump_gimple_stmt (MSG_NOTE, TDF_SLIM,
6072 pattern_def_stmt, 0);
6073 dump_printf (MSG_NOTE, "\n");
6076 stmt = pattern_def_stmt;
6077 stmt_info = pattern_def_stmt_info;
6079 else
6081 pattern_def_si = gsi_none ();
6082 transform_pattern_stmt = false;
6085 else
6086 transform_pattern_stmt = false;
6089 if (STMT_VINFO_VECTYPE (stmt_info))
6091 unsigned int nunits
6092 = (unsigned int)
6093 TYPE_VECTOR_SUBPARTS (STMT_VINFO_VECTYPE (stmt_info));
6094 if (!STMT_SLP_TYPE (stmt_info)
6095 && nunits != (unsigned int) vectorization_factor
6096 && dump_enabled_p ())
6097 /* For SLP, VF is set according to the unrolling factor, not to
6098 the vector size, hence for SLP this dump message is not valid. */
6099 dump_printf_loc (MSG_NOTE, vect_location, "multiple-types.\n");
6102 /* SLP. Schedule all the SLP instances when the first SLP stmt is
6103 reached. */
6104 if (STMT_SLP_TYPE (stmt_info))
6106 if (!slp_scheduled)
6108 slp_scheduled = true;
6110 if (dump_enabled_p ())
6111 dump_printf_loc (MSG_NOTE, vect_location,
6112 "=== scheduling SLP instances ===\n");
6114 vect_schedule_slp (loop_vinfo);
6117 /* Hybrid SLP stmts must be vectorized in addition to SLP. */
6118 if (!vinfo_for_stmt (stmt) || PURE_SLP_STMT (stmt_info))
6120 if (!transform_pattern_stmt && gsi_end_p (pattern_def_si))
6122 pattern_def_seq = NULL;
6123 gsi_next (&si);
6125 continue;
6129 /* -------- vectorize statement ------------ */
6130 if (dump_enabled_p ())
6131 dump_printf_loc (MSG_NOTE, vect_location, "transform statement.\n");
6133 grouped_store = false;
6134 is_store = vect_transform_stmt (stmt, &si, &grouped_store, NULL, NULL);
6135 if (is_store)
6137 if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
6139 /* Interleaving. If IS_STORE is TRUE, the vectorization of the
6140 interleaving chain was completed - free all the stores in
6141 the chain. */
6142 gsi_next (&si);
6143 vect_remove_stores (GROUP_FIRST_ELEMENT (stmt_info));
6145 else
6147 /* Free the attached stmt_vec_info and remove the stmt. */
6148 gimple *store = gsi_stmt (si);
6149 free_stmt_vec_info (store);
6150 unlink_stmt_vdef (store);
6151 gsi_remove (&si, true);
6152 release_defs (store);
6155 /* Stores can only appear at the end of pattern statements. */
6156 gcc_assert (!transform_pattern_stmt);
6157 pattern_def_seq = NULL;
6159 else if (!transform_pattern_stmt && gsi_end_p (pattern_def_si))
6161 pattern_def_seq = NULL;
6162 gsi_next (&si);
6164 } /* stmts in BB */
6165 } /* BBs in loop */
6167 slpeel_make_loop_iterate_ntimes (loop, ratio);
6169 /* Reduce loop iterations by the vectorization factor. */
6170 scale_loop_profile (loop, GCOV_COMPUTE_SCALE (1, vectorization_factor),
6171 expected_iterations / vectorization_factor);
6172 loop->nb_iterations_upper_bound
6173 = wi::udiv_floor (loop->nb_iterations_upper_bound, vectorization_factor);
6174 if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo)
6175 && loop->nb_iterations_upper_bound != 0)
6176 loop->nb_iterations_upper_bound = loop->nb_iterations_upper_bound - 1;
6177 if (loop->any_estimate)
6179 loop->nb_iterations_estimate
6180 = wi::udiv_floor (loop->nb_iterations_estimate, vectorization_factor);
6181 if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo)
6182 && loop->nb_iterations_estimate != 0)
6183 loop->nb_iterations_estimate = loop->nb_iterations_estimate - 1;
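/* E.g. (illustrative numbers) an upper bound of 1000 iterations with a
   vectorization factor of 4 becomes 1000 / 4 == 250 vector iterations,
   or 249 when an epilogue is peeled for gaps; the recorded iteration
   estimate is scaled down the same way.  */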
6186 if (dump_enabled_p ())
6188 dump_printf_loc (MSG_NOTE, vect_location,
6189 "LOOP VECTORIZED\n");
6190 if (loop->inner)
6191 dump_printf_loc (MSG_NOTE, vect_location,
6192 "OUTER LOOP VECTORIZED\n");
6193 dump_printf (MSG_NOTE, "\n");