* gcc.dg/store-motion-fgcse-sm.c (dg-final): Cleanup
[official-gcc.git] / gcc / tree-vect-loop.c
bloba4f9501b29e079124d6ada5cd7ba8084f89422d8
1 /* Loop Vectorization
2 Copyright (C) 2003-2014 Free Software Foundation, Inc.
3 Contributed by Dorit Naishlos <dorit@il.ibm.com> and
4 Ira Rosen <irar@il.ibm.com>
6 This file is part of GCC.
8 GCC is free software; you can redistribute it and/or modify it under
9 the terms of the GNU General Public License as published by the Free
10 Software Foundation; either version 3, or (at your option) any later
11 version.
13 GCC is distributed in the hope that it will be useful, but WITHOUT ANY
14 WARRANTY; without even the implied warranty of MERCHANTABILITY or
15 FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
16 for more details.
18 You should have received a copy of the GNU General Public License
19 along with GCC; see the file COPYING3. If not see
20 <http://www.gnu.org/licenses/>. */
22 #include "config.h"
23 #include "system.h"
24 #include "coretypes.h"
25 #include "dumpfile.h"
26 #include "tm.h"
27 #include "tree.h"
28 #include "stor-layout.h"
29 #include "predict.h"
30 #include "vec.h"
31 #include "hashtab.h"
32 #include "hash-set.h"
33 #include "machmode.h"
34 #include "hard-reg-set.h"
35 #include "input.h"
36 #include "function.h"
37 #include "dominance.h"
38 #include "cfg.h"
39 #include "cfganal.h"
40 #include "basic-block.h"
41 #include "gimple-pretty-print.h"
42 #include "tree-ssa-alias.h"
43 #include "internal-fn.h"
44 #include "gimple-expr.h"
45 #include "is-a.h"
46 #include "gimple.h"
47 #include "gimplify.h"
48 #include "gimple-iterator.h"
49 #include "gimplify-me.h"
50 #include "gimple-ssa.h"
51 #include "tree-phinodes.h"
52 #include "ssa-iterators.h"
53 #include "stringpool.h"
54 #include "tree-ssanames.h"
55 #include "tree-ssa-loop-ivopts.h"
56 #include "tree-ssa-loop-manip.h"
57 #include "tree-ssa-loop-niter.h"
58 #include "tree-pass.h"
59 #include "cfgloop.h"
60 #include "expr.h"
61 #include "recog.h"
62 #include "insn-codes.h"
63 #include "optabs.h"
64 #include "params.h"
65 #include "diagnostic-core.h"
66 #include "tree-chrec.h"
67 #include "tree-scalar-evolution.h"
68 #include "tree-vectorizer.h"
69 #include "target.h"
71 /* Loop Vectorization Pass.
73 This pass tries to vectorize loops.
75 For example, the vectorizer transforms the following simple loop:
77 short a[N]; short b[N]; short c[N]; int i;
79 for (i=0; i<N; i++){
80 a[i] = b[i] + c[i];
83 as if it was manually vectorized by rewriting the source code into:
85 typedef int __attribute__((mode(V8HI))) v8hi;
86 short a[N]; short b[N]; short c[N]; int i;
87 v8hi *pa = (v8hi*)a, *pb = (v8hi*)b, *pc = (v8hi*)c;
88 v8hi va, vb, vc;
90 for (i=0; i<N/8; i++){
91 vb = pb[i];
92 vc = pc[i];
93 va = vb + vc;
94 pa[i] = va;
97 The main entry to this pass is vectorize_loops(), in which
98 the vectorizer applies a set of analyses on a given set of loops,
99 followed by the actual vectorization transformation for the loops that
100 had successfully passed the analysis phase.
101 Throughout this pass we make a distinction between two types of
102 data: scalars (which are represented by SSA_NAMES), and memory references
103 ("data-refs"). These two types of data require different handling both
104 during analysis and transformation. The types of data-refs that the
105 vectorizer currently supports are ARRAY_REFS which base is an array DECL
106 (not a pointer), and INDIRECT_REFS through pointers; both array and pointer
107 accesses are required to have a simple (consecutive) access pattern.
109 Analysis phase:
110 ===============
111 The driver for the analysis phase is vect_analyze_loop().
112 It applies a set of analyses, some of which rely on the scalar evolution
113 analyzer (scev) developed by Sebastian Pop.
115 During the analysis phase the vectorizer records some information
116 per stmt in a "stmt_vec_info" struct which is attached to each stmt in the
117 loop, as well as general information about the loop as a whole, which is
118 recorded in a "loop_vec_info" struct attached to each loop.
120 Transformation phase:
121 =====================
122 The loop transformation phase scans all the stmts in the loop, and
123 creates a vector stmt (or a sequence of stmts) for each scalar stmt S in
124 the loop that needs to be vectorized. It inserts the vector code sequence
125 just before the scalar stmt S, and records a pointer to the vector code
126 in STMT_VINFO_VEC_STMT (stmt_info) (stmt_info is the stmt_vec_info struct
127 attached to S). This pointer will be used for the vectorization of following
128 stmts which use the def of stmt S. Stmt S is removed if it writes to memory;
129 otherwise, we rely on dead code elimination for removing it.
131 For example, say stmt S1 was vectorized into stmt VS1:
133 VS1: vb = px[i];
134 S1: b = x[i]; STMT_VINFO_VEC_STMT (stmt_info (S1)) = VS1
135 S2: a = b;
137 To vectorize stmt S2, the vectorizer first finds the stmt that defines
138 the operand 'b' (S1), and gets the relevant vector def 'vb' from the
139 vector stmt VS1 pointed to by STMT_VINFO_VEC_STMT (stmt_info (S1)). The
140 resulting sequence would be:
142 VS1: vb = px[i];
143 S1: b = x[i]; STMT_VINFO_VEC_STMT (stmt_info (S1)) = VS1
144 VS2: va = vb;
145 S2: a = b; STMT_VINFO_VEC_STMT (stmt_info (S2)) = VS2
147 Operands that are not SSA_NAMEs, are data-refs that appear in
148 load/store operations (like 'x[i]' in S1), and are handled differently.
150 Target modeling:
151 =================
152 Currently the only target specific information that is used is the
153 size of the vector (in bytes) - "TARGET_VECTORIZE_UNITS_PER_SIMD_WORD".
154 Targets that can support different sizes of vectors, for now will need
155 to specify one value for "TARGET_VECTORIZE_UNITS_PER_SIMD_WORD". More
156 flexibility will be added in the future.
158 Since we only vectorize operations which vector form can be
159 expressed using existing tree codes, to verify that an operation is
160 supported, the vectorizer checks the relevant optab at the relevant
161 machine_mode (e.g, optab_handler (add_optab, V8HImode)). If
162 the value found is CODE_FOR_nothing, then there's no target support, and
163 we can't vectorize the stmt.
165 For additional information on this project see:
166 http://gcc.gnu.org/projects/tree-ssa/vectorization.html
169 static void vect_estimate_min_profitable_iters (loop_vec_info, int *, int *);
171 /* Function vect_determine_vectorization_factor
173 Determine the vectorization factor (VF). VF is the number of data elements
174 that are operated upon in parallel in a single iteration of the vectorized
175 loop. For example, when vectorizing a loop that operates on 4byte elements,
176 on a target with vector size (VS) 16byte, the VF is set to 4, since 4
177 elements can fit in a single vector register.
179 We currently support vectorization of loops in which all types operated upon
180 are of the same size. Therefore this function currently sets VF according to
181 the size of the types operated upon, and fails if there are multiple sizes
182 in the loop.
184 VF is also the factor by which the loop iterations are strip-mined, e.g.:
185 original loop:
186 for (i=0; i<N; i++){
187 a[i] = b[i] + c[i];
190 vectorized loop:
191 for (i=0; i<N; i+=VF){
192 a[i:VF] = b[i:VF] + c[i:VF];
196 static bool
197 vect_determine_vectorization_factor (loop_vec_info loop_vinfo)
199 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
200 basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
201 int nbbs = loop->num_nodes;
202 unsigned int vectorization_factor = 0;
203 tree scalar_type;
204 gphi *phi;
205 tree vectype;
206 unsigned int nunits;
207 stmt_vec_info stmt_info;
208 int i;
209 HOST_WIDE_INT dummy;
210 gimple stmt, pattern_stmt = NULL;
211 gimple_seq pattern_def_seq = NULL;
212 gimple_stmt_iterator pattern_def_si = gsi_none ();
213 bool analyze_pattern_stmt = false;
215 if (dump_enabled_p ())
216 dump_printf_loc (MSG_NOTE, vect_location,
217 "=== vect_determine_vectorization_factor ===\n");
219 for (i = 0; i < nbbs; i++)
221 basic_block bb = bbs[i];
223 for (gphi_iterator si = gsi_start_phis (bb); !gsi_end_p (si);
224 gsi_next (&si))
226 phi = si.phi ();
227 stmt_info = vinfo_for_stmt (phi);
228 if (dump_enabled_p ())
230 dump_printf_loc (MSG_NOTE, vect_location, "==> examining phi: ");
231 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, phi, 0);
232 dump_printf (MSG_NOTE, "\n");
235 gcc_assert (stmt_info);
237 if (STMT_VINFO_RELEVANT_P (stmt_info))
239 gcc_assert (!STMT_VINFO_VECTYPE (stmt_info));
240 scalar_type = TREE_TYPE (PHI_RESULT (phi));
242 if (dump_enabled_p ())
244 dump_printf_loc (MSG_NOTE, vect_location,
245 "get vectype for scalar type: ");
246 dump_generic_expr (MSG_NOTE, TDF_SLIM, scalar_type);
247 dump_printf (MSG_NOTE, "\n");
250 vectype = get_vectype_for_scalar_type (scalar_type);
251 if (!vectype)
253 if (dump_enabled_p ())
255 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
256 "not vectorized: unsupported "
257 "data-type ");
258 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
259 scalar_type);
260 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
262 return false;
264 STMT_VINFO_VECTYPE (stmt_info) = vectype;
266 if (dump_enabled_p ())
268 dump_printf_loc (MSG_NOTE, vect_location, "vectype: ");
269 dump_generic_expr (MSG_NOTE, TDF_SLIM, vectype);
270 dump_printf (MSG_NOTE, "\n");
273 nunits = TYPE_VECTOR_SUBPARTS (vectype);
274 if (dump_enabled_p ())
275 dump_printf_loc (MSG_NOTE, vect_location, "nunits = %d\n",
276 nunits);
278 if (!vectorization_factor
279 || (nunits > vectorization_factor))
280 vectorization_factor = nunits;
284 for (gimple_stmt_iterator si = gsi_start_bb (bb);
285 !gsi_end_p (si) || analyze_pattern_stmt;)
287 tree vf_vectype;
289 if (analyze_pattern_stmt)
290 stmt = pattern_stmt;
291 else
292 stmt = gsi_stmt (si);
294 stmt_info = vinfo_for_stmt (stmt);
296 if (dump_enabled_p ())
298 dump_printf_loc (MSG_NOTE, vect_location,
299 "==> examining statement: ");
300 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, stmt, 0);
301 dump_printf (MSG_NOTE, "\n");
304 gcc_assert (stmt_info);
306 /* Skip stmts which do not need to be vectorized. */
307 if ((!STMT_VINFO_RELEVANT_P (stmt_info)
308 && !STMT_VINFO_LIVE_P (stmt_info))
309 || gimple_clobber_p (stmt))
311 if (STMT_VINFO_IN_PATTERN_P (stmt_info)
312 && (pattern_stmt = STMT_VINFO_RELATED_STMT (stmt_info))
313 && (STMT_VINFO_RELEVANT_P (vinfo_for_stmt (pattern_stmt))
314 || STMT_VINFO_LIVE_P (vinfo_for_stmt (pattern_stmt))))
316 stmt = pattern_stmt;
317 stmt_info = vinfo_for_stmt (pattern_stmt);
318 if (dump_enabled_p ())
320 dump_printf_loc (MSG_NOTE, vect_location,
321 "==> examining pattern statement: ");
322 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, stmt, 0);
323 dump_printf (MSG_NOTE, "\n");
326 else
328 if (dump_enabled_p ())
329 dump_printf_loc (MSG_NOTE, vect_location, "skip.\n");
330 gsi_next (&si);
331 continue;
334 else if (STMT_VINFO_IN_PATTERN_P (stmt_info)
335 && (pattern_stmt = STMT_VINFO_RELATED_STMT (stmt_info))
336 && (STMT_VINFO_RELEVANT_P (vinfo_for_stmt (pattern_stmt))
337 || STMT_VINFO_LIVE_P (vinfo_for_stmt (pattern_stmt))))
338 analyze_pattern_stmt = true;
340 /* If a pattern statement has def stmts, analyze them too. */
341 if (is_pattern_stmt_p (stmt_info))
343 if (pattern_def_seq == NULL)
345 pattern_def_seq = STMT_VINFO_PATTERN_DEF_SEQ (stmt_info);
346 pattern_def_si = gsi_start (pattern_def_seq);
348 else if (!gsi_end_p (pattern_def_si))
349 gsi_next (&pattern_def_si);
350 if (pattern_def_seq != NULL)
352 gimple pattern_def_stmt = NULL;
353 stmt_vec_info pattern_def_stmt_info = NULL;
355 while (!gsi_end_p (pattern_def_si))
357 pattern_def_stmt = gsi_stmt (pattern_def_si);
358 pattern_def_stmt_info
359 = vinfo_for_stmt (pattern_def_stmt);
360 if (STMT_VINFO_RELEVANT_P (pattern_def_stmt_info)
361 || STMT_VINFO_LIVE_P (pattern_def_stmt_info))
362 break;
363 gsi_next (&pattern_def_si);
366 if (!gsi_end_p (pattern_def_si))
368 if (dump_enabled_p ())
370 dump_printf_loc (MSG_NOTE, vect_location,
371 "==> examining pattern def stmt: ");
372 dump_gimple_stmt (MSG_NOTE, TDF_SLIM,
373 pattern_def_stmt, 0);
374 dump_printf (MSG_NOTE, "\n");
377 stmt = pattern_def_stmt;
378 stmt_info = pattern_def_stmt_info;
380 else
382 pattern_def_si = gsi_none ();
383 analyze_pattern_stmt = false;
386 else
387 analyze_pattern_stmt = false;
390 if (gimple_get_lhs (stmt) == NULL_TREE
391 /* MASK_STORE has no lhs, but is ok. */
392 && (!is_gimple_call (stmt)
393 || !gimple_call_internal_p (stmt)
394 || gimple_call_internal_fn (stmt) != IFN_MASK_STORE))
396 if (is_gimple_call (stmt))
398 /* Ignore calls with no lhs. These must be calls to
399 #pragma omp simd functions, and what vectorization factor
400 it really needs can't be determined until
401 vectorizable_simd_clone_call. */
402 if (!analyze_pattern_stmt && gsi_end_p (pattern_def_si))
404 pattern_def_seq = NULL;
405 gsi_next (&si);
407 continue;
409 if (dump_enabled_p ())
411 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
412 "not vectorized: irregular stmt.");
413 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt,
415 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
417 return false;
420 if (VECTOR_MODE_P (TYPE_MODE (gimple_expr_type (stmt))))
422 if (dump_enabled_p ())
424 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
425 "not vectorized: vector stmt in loop:");
426 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
427 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
429 return false;
432 if (STMT_VINFO_VECTYPE (stmt_info))
434 /* The only case when a vectype had been already set is for stmts
435 that contain a dataref, or for "pattern-stmts" (stmts
436 generated by the vectorizer to represent/replace a certain
437 idiom). */
438 gcc_assert (STMT_VINFO_DATA_REF (stmt_info)
439 || is_pattern_stmt_p (stmt_info)
440 || !gsi_end_p (pattern_def_si));
441 vectype = STMT_VINFO_VECTYPE (stmt_info);
443 else
445 gcc_assert (!STMT_VINFO_DATA_REF (stmt_info));
446 if (is_gimple_call (stmt)
447 && gimple_call_internal_p (stmt)
448 && gimple_call_internal_fn (stmt) == IFN_MASK_STORE)
449 scalar_type = TREE_TYPE (gimple_call_arg (stmt, 3));
450 else
451 scalar_type = TREE_TYPE (gimple_get_lhs (stmt));
452 if (dump_enabled_p ())
454 dump_printf_loc (MSG_NOTE, vect_location,
455 "get vectype for scalar type: ");
456 dump_generic_expr (MSG_NOTE, TDF_SLIM, scalar_type);
457 dump_printf (MSG_NOTE, "\n");
459 vectype = get_vectype_for_scalar_type (scalar_type);
460 if (!vectype)
462 if (dump_enabled_p ())
464 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
465 "not vectorized: unsupported "
466 "data-type ");
467 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
468 scalar_type);
469 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
471 return false;
474 STMT_VINFO_VECTYPE (stmt_info) = vectype;
476 if (dump_enabled_p ())
478 dump_printf_loc (MSG_NOTE, vect_location, "vectype: ");
479 dump_generic_expr (MSG_NOTE, TDF_SLIM, vectype);
480 dump_printf (MSG_NOTE, "\n");
484 /* The vectorization factor is according to the smallest
485 scalar type (or the largest vector size, but we only
486 support one vector size per loop). */
487 scalar_type = vect_get_smallest_scalar_type (stmt, &dummy,
488 &dummy);
489 if (dump_enabled_p ())
491 dump_printf_loc (MSG_NOTE, vect_location,
492 "get vectype for scalar type: ");
493 dump_generic_expr (MSG_NOTE, TDF_SLIM, scalar_type);
494 dump_printf (MSG_NOTE, "\n");
496 vf_vectype = get_vectype_for_scalar_type (scalar_type);
497 if (!vf_vectype)
499 if (dump_enabled_p ())
501 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
502 "not vectorized: unsupported data-type ");
503 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
504 scalar_type);
505 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
507 return false;
510 if ((GET_MODE_SIZE (TYPE_MODE (vectype))
511 != GET_MODE_SIZE (TYPE_MODE (vf_vectype))))
513 if (dump_enabled_p ())
515 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
516 "not vectorized: different sized vector "
517 "types in statement, ");
518 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
519 vectype);
520 dump_printf (MSG_MISSED_OPTIMIZATION, " and ");
521 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
522 vf_vectype);
523 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
525 return false;
528 if (dump_enabled_p ())
530 dump_printf_loc (MSG_NOTE, vect_location, "vectype: ");
531 dump_generic_expr (MSG_NOTE, TDF_SLIM, vf_vectype);
532 dump_printf (MSG_NOTE, "\n");
535 nunits = TYPE_VECTOR_SUBPARTS (vf_vectype);
536 if (dump_enabled_p ())
537 dump_printf_loc (MSG_NOTE, vect_location, "nunits = %d\n", nunits);
538 if (!vectorization_factor
539 || (nunits > vectorization_factor))
540 vectorization_factor = nunits;
542 if (!analyze_pattern_stmt && gsi_end_p (pattern_def_si))
544 pattern_def_seq = NULL;
545 gsi_next (&si);
550 /* TODO: Analyze cost. Decide if worth while to vectorize. */
551 if (dump_enabled_p ())
552 dump_printf_loc (MSG_NOTE, vect_location, "vectorization factor = %d\n",
553 vectorization_factor);
554 if (vectorization_factor <= 1)
556 if (dump_enabled_p ())
557 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
558 "not vectorized: unsupported data-type\n");
559 return false;
561 LOOP_VINFO_VECT_FACTOR (loop_vinfo) = vectorization_factor;
563 return true;
567 /* Function vect_is_simple_iv_evolution.
569 FORNOW: A simple evolution of an induction variables in the loop is
570 considered a polynomial evolution. */
572 static bool
573 vect_is_simple_iv_evolution (unsigned loop_nb, tree access_fn, tree * init,
574 tree * step)
576 tree init_expr;
577 tree step_expr;
578 tree evolution_part = evolution_part_in_loop_num (access_fn, loop_nb);
579 basic_block bb;
581 /* When there is no evolution in this loop, the evolution function
582 is not "simple". */
583 if (evolution_part == NULL_TREE)
584 return false;
586 /* When the evolution is a polynomial of degree >= 2
587 the evolution function is not "simple". */
588 if (tree_is_chrec (evolution_part))
589 return false;
591 step_expr = evolution_part;
592 init_expr = unshare_expr (initial_condition_in_loop_num (access_fn, loop_nb));
594 if (dump_enabled_p ())
596 dump_printf_loc (MSG_NOTE, vect_location, "step: ");
597 dump_generic_expr (MSG_NOTE, TDF_SLIM, step_expr);
598 dump_printf (MSG_NOTE, ", init: ");
599 dump_generic_expr (MSG_NOTE, TDF_SLIM, init_expr);
600 dump_printf (MSG_NOTE, "\n");
603 *init = init_expr;
604 *step = step_expr;
606 if (TREE_CODE (step_expr) != INTEGER_CST
607 && (TREE_CODE (step_expr) != SSA_NAME
608 || ((bb = gimple_bb (SSA_NAME_DEF_STMT (step_expr)))
609 && flow_bb_inside_loop_p (get_loop (cfun, loop_nb), bb))
610 || (!INTEGRAL_TYPE_P (TREE_TYPE (step_expr))
611 && (!SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr))
612 || !flag_associative_math)))
613 && (TREE_CODE (step_expr) != REAL_CST
614 || !flag_associative_math))
616 if (dump_enabled_p ())
617 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
618 "step unknown.\n");
619 return false;
622 return true;
625 /* Function vect_analyze_scalar_cycles_1.
627 Examine the cross iteration def-use cycles of scalar variables
628 in LOOP. LOOP_VINFO represents the loop that is now being
629 considered for vectorization (can be LOOP, or an outer-loop
630 enclosing LOOP). */
632 static void
633 vect_analyze_scalar_cycles_1 (loop_vec_info loop_vinfo, struct loop *loop)
635 basic_block bb = loop->header;
636 tree init, step;
637 auto_vec<gimple, 64> worklist;
638 gphi_iterator gsi;
639 bool double_reduc;
641 if (dump_enabled_p ())
642 dump_printf_loc (MSG_NOTE, vect_location,
643 "=== vect_analyze_scalar_cycles ===\n");
645 /* First - identify all inductions. Reduction detection assumes that all the
646 inductions have been identified, therefore, this order must not be
647 changed. */
648 for (gsi = gsi_start_phis (bb); !gsi_end_p (gsi); gsi_next (&gsi))
650 gphi *phi = gsi.phi ();
651 tree access_fn = NULL;
652 tree def = PHI_RESULT (phi);
653 stmt_vec_info stmt_vinfo = vinfo_for_stmt (phi);
655 if (dump_enabled_p ())
657 dump_printf_loc (MSG_NOTE, vect_location, "Analyze phi: ");
658 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, phi, 0);
659 dump_printf (MSG_NOTE, "\n");
662 /* Skip virtual phi's. The data dependences that are associated with
663 virtual defs/uses (i.e., memory accesses) are analyzed elsewhere. */
664 if (virtual_operand_p (def))
665 continue;
667 STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_unknown_def_type;
669 /* Analyze the evolution function. */
670 access_fn = analyze_scalar_evolution (loop, def);
671 if (access_fn)
673 STRIP_NOPS (access_fn);
674 if (dump_enabled_p ())
676 dump_printf_loc (MSG_NOTE, vect_location,
677 "Access function of PHI: ");
678 dump_generic_expr (MSG_NOTE, TDF_SLIM, access_fn);
679 dump_printf (MSG_NOTE, "\n");
681 STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_vinfo)
682 = evolution_part_in_loop_num (access_fn, loop->num);
685 if (!access_fn
686 || !vect_is_simple_iv_evolution (loop->num, access_fn, &init, &step)
687 || (LOOP_VINFO_LOOP (loop_vinfo) != loop
688 && TREE_CODE (step) != INTEGER_CST))
690 worklist.safe_push (phi);
691 continue;
694 gcc_assert (STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_vinfo) != NULL_TREE);
696 if (dump_enabled_p ())
697 dump_printf_loc (MSG_NOTE, vect_location, "Detected induction.\n");
698 STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_induction_def;
702 /* Second - identify all reductions and nested cycles. */
703 while (worklist.length () > 0)
705 gimple phi = worklist.pop ();
706 tree def = PHI_RESULT (phi);
707 stmt_vec_info stmt_vinfo = vinfo_for_stmt (phi);
708 gimple reduc_stmt;
709 bool nested_cycle;
711 if (dump_enabled_p ())
713 dump_printf_loc (MSG_NOTE, vect_location, "Analyze phi: ");
714 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, phi, 0);
715 dump_printf (MSG_NOTE, "\n");
718 gcc_assert (!virtual_operand_p (def)
719 && STMT_VINFO_DEF_TYPE (stmt_vinfo) == vect_unknown_def_type);
721 nested_cycle = (loop != LOOP_VINFO_LOOP (loop_vinfo));
722 reduc_stmt = vect_force_simple_reduction (loop_vinfo, phi, !nested_cycle,
723 &double_reduc);
724 if (reduc_stmt)
726 if (double_reduc)
728 if (dump_enabled_p ())
729 dump_printf_loc (MSG_NOTE, vect_location,
730 "Detected double reduction.\n");
732 STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_double_reduction_def;
733 STMT_VINFO_DEF_TYPE (vinfo_for_stmt (reduc_stmt)) =
734 vect_double_reduction_def;
736 else
738 if (nested_cycle)
740 if (dump_enabled_p ())
741 dump_printf_loc (MSG_NOTE, vect_location,
742 "Detected vectorizable nested cycle.\n");
744 STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_nested_cycle;
745 STMT_VINFO_DEF_TYPE (vinfo_for_stmt (reduc_stmt)) =
746 vect_nested_cycle;
748 else
750 if (dump_enabled_p ())
751 dump_printf_loc (MSG_NOTE, vect_location,
752 "Detected reduction.\n");
754 STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_reduction_def;
755 STMT_VINFO_DEF_TYPE (vinfo_for_stmt (reduc_stmt)) =
756 vect_reduction_def;
757 /* Store the reduction cycles for possible vectorization in
758 loop-aware SLP. */
759 LOOP_VINFO_REDUCTIONS (loop_vinfo).safe_push (reduc_stmt);
763 else
764 if (dump_enabled_p ())
765 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
766 "Unknown def-use cycle pattern.\n");
771 /* Function vect_analyze_scalar_cycles.
773 Examine the cross iteration def-use cycles of scalar variables, by
774 analyzing the loop-header PHIs of scalar variables. Classify each
775 cycle as one of the following: invariant, induction, reduction, unknown.
776 We do that for the loop represented by LOOP_VINFO, and also to its
777 inner-loop, if exists.
778 Examples for scalar cycles:
780 Example1: reduction:
782 loop1:
783 for (i=0; i<N; i++)
784 sum += a[i];
786 Example2: induction:
788 loop2:
789 for (i=0; i<N; i++)
790 a[i] = i; */
792 static void
793 vect_analyze_scalar_cycles (loop_vec_info loop_vinfo)
795 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
797 vect_analyze_scalar_cycles_1 (loop_vinfo, loop);
799 /* When vectorizing an outer-loop, the inner-loop is executed sequentially.
800 Reductions in such inner-loop therefore have different properties than
801 the reductions in the nest that gets vectorized:
802 1. When vectorized, they are executed in the same order as in the original
803 scalar loop, so we can't change the order of computation when
804 vectorizing them.
805 2. FIXME: Inner-loop reductions can be used in the inner-loop, so the
806 current checks are too strict. */
808 if (loop->inner)
809 vect_analyze_scalar_cycles_1 (loop_vinfo, loop->inner);
813 /* Function vect_get_loop_niters.
815 Determine how many iterations the loop is executed and place it
816 in NUMBER_OF_ITERATIONS. Place the number of latch iterations
817 in NUMBER_OF_ITERATIONSM1.
819 Return the loop exit condition. */
822 static gcond *
823 vect_get_loop_niters (struct loop *loop, tree *number_of_iterations,
824 tree *number_of_iterationsm1)
826 tree niters;
828 if (dump_enabled_p ())
829 dump_printf_loc (MSG_NOTE, vect_location,
830 "=== get_loop_niters ===\n");
832 niters = number_of_latch_executions (loop);
833 *number_of_iterationsm1 = niters;
835 /* We want the number of loop header executions which is the number
836 of latch executions plus one.
837 ??? For UINT_MAX latch executions this number overflows to zero
838 for loops like do { n++; } while (n != 0); */
839 if (niters && !chrec_contains_undetermined (niters))
840 niters = fold_build2 (PLUS_EXPR, TREE_TYPE (niters), unshare_expr (niters),
841 build_int_cst (TREE_TYPE (niters), 1));
842 *number_of_iterations = niters;
844 return get_loop_exit_condition (loop);
848 /* Function bb_in_loop_p
850 Used as predicate for dfs order traversal of the loop bbs. */
852 static bool
853 bb_in_loop_p (const_basic_block bb, const void *data)
855 const struct loop *const loop = (const struct loop *)data;
856 if (flow_bb_inside_loop_p (loop, bb))
857 return true;
858 return false;
862 /* Function new_loop_vec_info.
864 Create and initialize a new loop_vec_info struct for LOOP, as well as
865 stmt_vec_info structs for all the stmts in LOOP. */
867 static loop_vec_info
868 new_loop_vec_info (struct loop *loop)
870 loop_vec_info res;
871 basic_block *bbs;
872 gimple_stmt_iterator si;
873 unsigned int i, nbbs;
875 res = (loop_vec_info) xcalloc (1, sizeof (struct _loop_vec_info));
876 LOOP_VINFO_LOOP (res) = loop;
878 bbs = get_loop_body (loop);
880 /* Create/Update stmt_info for all stmts in the loop. */
881 for (i = 0; i < loop->num_nodes; i++)
883 basic_block bb = bbs[i];
885 /* BBs in a nested inner-loop will have been already processed (because
886 we will have called vect_analyze_loop_form for any nested inner-loop).
887 Therefore, for stmts in an inner-loop we just want to update the
888 STMT_VINFO_LOOP_VINFO field of their stmt_info to point to the new
889 loop_info of the outer-loop we are currently considering to vectorize
890 (instead of the loop_info of the inner-loop).
891 For stmts in other BBs we need to create a stmt_info from scratch. */
892 if (bb->loop_father != loop)
894 /* Inner-loop bb. */
895 gcc_assert (loop->inner && bb->loop_father == loop->inner);
896 for (si = gsi_start_phis (bb); !gsi_end_p (si); gsi_next (&si))
898 gimple phi = gsi_stmt (si);
899 stmt_vec_info stmt_info = vinfo_for_stmt (phi);
900 loop_vec_info inner_loop_vinfo =
901 STMT_VINFO_LOOP_VINFO (stmt_info);
902 gcc_assert (loop->inner == LOOP_VINFO_LOOP (inner_loop_vinfo));
903 STMT_VINFO_LOOP_VINFO (stmt_info) = res;
905 for (si = gsi_start_bb (bb); !gsi_end_p (si); gsi_next (&si))
907 gimple stmt = gsi_stmt (si);
908 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
909 loop_vec_info inner_loop_vinfo =
910 STMT_VINFO_LOOP_VINFO (stmt_info);
911 gcc_assert (loop->inner == LOOP_VINFO_LOOP (inner_loop_vinfo));
912 STMT_VINFO_LOOP_VINFO (stmt_info) = res;
915 else
917 /* bb in current nest. */
918 for (si = gsi_start_phis (bb); !gsi_end_p (si); gsi_next (&si))
920 gimple phi = gsi_stmt (si);
921 gimple_set_uid (phi, 0);
922 set_vinfo_for_stmt (phi, new_stmt_vec_info (phi, res, NULL));
925 for (si = gsi_start_bb (bb); !gsi_end_p (si); gsi_next (&si))
927 gimple stmt = gsi_stmt (si);
928 gimple_set_uid (stmt, 0);
929 set_vinfo_for_stmt (stmt, new_stmt_vec_info (stmt, res, NULL));
934 /* CHECKME: We want to visit all BBs before their successors (except for
935 latch blocks, for which this assertion wouldn't hold). In the simple
936 case of the loop forms we allow, a dfs order of the BBs would the same
937 as reversed postorder traversal, so we are safe. */
939 free (bbs);
940 bbs = XCNEWVEC (basic_block, loop->num_nodes);
941 nbbs = dfs_enumerate_from (loop->header, 0, bb_in_loop_p,
942 bbs, loop->num_nodes, loop);
943 gcc_assert (nbbs == loop->num_nodes);
945 LOOP_VINFO_BBS (res) = bbs;
946 LOOP_VINFO_NITERSM1 (res) = NULL;
947 LOOP_VINFO_NITERS (res) = NULL;
948 LOOP_VINFO_NITERS_UNCHANGED (res) = NULL;
949 LOOP_VINFO_COST_MODEL_MIN_ITERS (res) = 0;
950 LOOP_VINFO_COST_MODEL_THRESHOLD (res) = 0;
951 LOOP_VINFO_VECTORIZABLE_P (res) = 0;
952 LOOP_VINFO_PEELING_FOR_ALIGNMENT (res) = 0;
953 LOOP_VINFO_VECT_FACTOR (res) = 0;
954 LOOP_VINFO_LOOP_NEST (res).create (3);
955 LOOP_VINFO_DATAREFS (res).create (10);
956 LOOP_VINFO_DDRS (res).create (10 * 10);
957 LOOP_VINFO_UNALIGNED_DR (res) = NULL;
958 LOOP_VINFO_MAY_MISALIGN_STMTS (res).create (
959 PARAM_VALUE (PARAM_VECT_MAX_VERSION_FOR_ALIGNMENT_CHECKS));
960 LOOP_VINFO_MAY_ALIAS_DDRS (res).create (
961 PARAM_VALUE (PARAM_VECT_MAX_VERSION_FOR_ALIAS_CHECKS));
962 LOOP_VINFO_GROUPED_STORES (res).create (10);
963 LOOP_VINFO_REDUCTIONS (res).create (10);
964 LOOP_VINFO_REDUCTION_CHAINS (res).create (10);
965 LOOP_VINFO_SLP_INSTANCES (res).create (10);
966 LOOP_VINFO_SLP_UNROLLING_FACTOR (res) = 1;
967 LOOP_VINFO_TARGET_COST_DATA (res) = init_cost (loop);
968 LOOP_VINFO_PEELING_FOR_GAPS (res) = false;
969 LOOP_VINFO_PEELING_FOR_NITER (res) = false;
970 LOOP_VINFO_OPERANDS_SWAPPED (res) = false;
972 return res;
976 /* Function destroy_loop_vec_info.
978 Free LOOP_VINFO struct, as well as all the stmt_vec_info structs of all the
979 stmts in the loop. */
981 void
982 destroy_loop_vec_info (loop_vec_info loop_vinfo, bool clean_stmts)
984 struct loop *loop;
985 basic_block *bbs;
986 int nbbs;
987 gimple_stmt_iterator si;
988 int j;
989 vec<slp_instance> slp_instances;
990 slp_instance instance;
991 bool swapped;
993 if (!loop_vinfo)
994 return;
996 loop = LOOP_VINFO_LOOP (loop_vinfo);
998 bbs = LOOP_VINFO_BBS (loop_vinfo);
999 nbbs = clean_stmts ? loop->num_nodes : 0;
1000 swapped = LOOP_VINFO_OPERANDS_SWAPPED (loop_vinfo);
1002 for (j = 0; j < nbbs; j++)
1004 basic_block bb = bbs[j];
1005 for (si = gsi_start_phis (bb); !gsi_end_p (si); gsi_next (&si))
1006 free_stmt_vec_info (gsi_stmt (si));
1008 for (si = gsi_start_bb (bb); !gsi_end_p (si); )
1010 gimple stmt = gsi_stmt (si);
1012 /* We may have broken canonical form by moving a constant
1013 into RHS1 of a commutative op. Fix such occurrences. */
1014 if (swapped && is_gimple_assign (stmt))
1016 enum tree_code code = gimple_assign_rhs_code (stmt);
1018 if ((code == PLUS_EXPR
1019 || code == POINTER_PLUS_EXPR
1020 || code == MULT_EXPR)
1021 && CONSTANT_CLASS_P (gimple_assign_rhs1 (stmt)))
1022 swap_ssa_operands (stmt,
1023 gimple_assign_rhs1_ptr (stmt),
1024 gimple_assign_rhs2_ptr (stmt));
1027 /* Free stmt_vec_info. */
1028 free_stmt_vec_info (stmt);
1029 gsi_next (&si);
1033 free (LOOP_VINFO_BBS (loop_vinfo));
1034 vect_destroy_datarefs (loop_vinfo, NULL);
1035 free_dependence_relations (LOOP_VINFO_DDRS (loop_vinfo));
1036 LOOP_VINFO_LOOP_NEST (loop_vinfo).release ();
1037 LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).release ();
1038 LOOP_VINFO_MAY_ALIAS_DDRS (loop_vinfo).release ();
1039 slp_instances = LOOP_VINFO_SLP_INSTANCES (loop_vinfo);
1040 FOR_EACH_VEC_ELT (slp_instances, j, instance)
1041 vect_free_slp_instance (instance);
1043 LOOP_VINFO_SLP_INSTANCES (loop_vinfo).release ();
1044 LOOP_VINFO_GROUPED_STORES (loop_vinfo).release ();
1045 LOOP_VINFO_REDUCTIONS (loop_vinfo).release ();
1046 LOOP_VINFO_REDUCTION_CHAINS (loop_vinfo).release ();
1048 delete LOOP_VINFO_PEELING_HTAB (loop_vinfo);
1049 LOOP_VINFO_PEELING_HTAB (loop_vinfo) = NULL;
1051 destroy_cost_data (LOOP_VINFO_TARGET_COST_DATA (loop_vinfo));
1053 free (loop_vinfo);
1054 loop->aux = NULL;
1058 /* Function vect_analyze_loop_1.
1060 Apply a set of analyses on LOOP, and create a loop_vec_info struct
1061 for it. The different analyses will record information in the
1062 loop_vec_info struct. This is a subset of the analyses applied in
1063 vect_analyze_loop, to be applied on an inner-loop nested in the loop
1064 that is now considered for (outer-loop) vectorization. */
1066 static loop_vec_info
1067 vect_analyze_loop_1 (struct loop *loop)
1069 loop_vec_info loop_vinfo;
1071 if (dump_enabled_p ())
1072 dump_printf_loc (MSG_NOTE, vect_location,
1073 "===== analyze_loop_nest_1 =====\n");
1075 /* Check the CFG characteristics of the loop (nesting, entry/exit, etc. */
1077 loop_vinfo = vect_analyze_loop_form (loop);
1078 if (!loop_vinfo)
1080 if (dump_enabled_p ())
1081 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1082 "bad inner-loop form.\n");
1083 return NULL;
1086 return loop_vinfo;
1090 /* Function vect_analyze_loop_form.
1092 Verify that certain CFG restrictions hold, including:
1093 - the loop has a pre-header
1094 - the loop has a single entry and exit
1095 - the loop exit condition is simple enough, and the number of iterations
1096 can be analyzed (a countable loop). */
1098 loop_vec_info
1099 vect_analyze_loop_form (struct loop *loop)
1101 loop_vec_info loop_vinfo;
1102 gcond *loop_cond;
1103 tree number_of_iterations = NULL, number_of_iterationsm1 = NULL;
1104 loop_vec_info inner_loop_vinfo = NULL;
1106 if (dump_enabled_p ())
1107 dump_printf_loc (MSG_NOTE, vect_location,
1108 "=== vect_analyze_loop_form ===\n");
1110 /* Different restrictions apply when we are considering an inner-most loop,
1111 vs. an outer (nested) loop.
1112 (FORNOW. May want to relax some of these restrictions in the future). */
1114 if (!loop->inner)
1116 /* Inner-most loop. We currently require that the number of BBs is
1117 exactly 2 (the header and latch). Vectorizable inner-most loops
1118 look like this:
1120 (pre-header)
1122 header <--------+
1123 | | |
1124 | +--> latch --+
1126 (exit-bb) */
1128 if (loop->num_nodes != 2)
1130 if (dump_enabled_p ())
1131 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1132 "not vectorized: control flow in loop.\n");
1133 return NULL;
1136 if (empty_block_p (loop->header))
1138 if (dump_enabled_p ())
1139 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1140 "not vectorized: empty loop.\n");
1141 return NULL;
1144 else
1146 struct loop *innerloop = loop->inner;
1147 edge entryedge;
1149 /* Nested loop. We currently require that the loop is doubly-nested,
1150 contains a single inner loop, and the number of BBs is exactly 5.
1151 Vectorizable outer-loops look like this:
1153 (pre-header)
1155 header <---+
1157 inner-loop |
1159 tail ------+
1161 (exit-bb)
1163 The inner-loop has the properties expected of inner-most loops
1164 as described above. */
1166 if ((loop->inner)->inner || (loop->inner)->next)
1168 if (dump_enabled_p ())
1169 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1170 "not vectorized: multiple nested loops.\n");
1171 return NULL;
1174 /* Analyze the inner-loop. */
1175 inner_loop_vinfo = vect_analyze_loop_1 (loop->inner);
1176 if (!inner_loop_vinfo)
1178 if (dump_enabled_p ())
1179 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1180 "not vectorized: Bad inner loop.\n");
1181 return NULL;
1184 if (!expr_invariant_in_loop_p (loop,
1185 LOOP_VINFO_NITERS (inner_loop_vinfo)))
1187 if (dump_enabled_p ())
1188 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1189 "not vectorized: inner-loop count not"
1190 " invariant.\n");
1191 destroy_loop_vec_info (inner_loop_vinfo, true);
1192 return NULL;
1195 if (loop->num_nodes != 5)
1197 if (dump_enabled_p ())
1198 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1199 "not vectorized: control flow in loop.\n");
1200 destroy_loop_vec_info (inner_loop_vinfo, true);
1201 return NULL;
1204 gcc_assert (EDGE_COUNT (innerloop->header->preds) == 2);
1205 entryedge = EDGE_PRED (innerloop->header, 0);
1206 if (EDGE_PRED (innerloop->header, 0)->src == innerloop->latch)
1207 entryedge = EDGE_PRED (innerloop->header, 1);
1209 if (entryedge->src != loop->header
1210 || !single_exit (innerloop)
1211 || single_exit (innerloop)->dest != EDGE_PRED (loop->latch, 0)->src)
1213 if (dump_enabled_p ())
1214 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1215 "not vectorized: unsupported outerloop form.\n");
1216 destroy_loop_vec_info (inner_loop_vinfo, true);
1217 return NULL;
1220 if (dump_enabled_p ())
1221 dump_printf_loc (MSG_NOTE, vect_location,
1222 "Considering outer-loop vectorization.\n");
1225 if (!single_exit (loop)
1226 || EDGE_COUNT (loop->header->preds) != 2)
1228 if (dump_enabled_p ())
1230 if (!single_exit (loop))
1231 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1232 "not vectorized: multiple exits.\n");
1233 else if (EDGE_COUNT (loop->header->preds) != 2)
1234 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1235 "not vectorized: too many incoming edges.\n");
1237 if (inner_loop_vinfo)
1238 destroy_loop_vec_info (inner_loop_vinfo, true);
1239 return NULL;
1242 /* We assume that the loop exit condition is at the end of the loop. i.e,
1243 that the loop is represented as a do-while (with a proper if-guard
1244 before the loop if needed), where the loop header contains all the
1245 executable statements, and the latch is empty. */
1246 if (!empty_block_p (loop->latch)
1247 || !gimple_seq_empty_p (phi_nodes (loop->latch)))
1249 if (dump_enabled_p ())
1250 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1251 "not vectorized: latch block not empty.\n");
1252 if (inner_loop_vinfo)
1253 destroy_loop_vec_info (inner_loop_vinfo, true);
1254 return NULL;
1257 /* Make sure there exists a single-predecessor exit bb: */
1258 if (!single_pred_p (single_exit (loop)->dest))
1260 edge e = single_exit (loop);
1261 if (!(e->flags & EDGE_ABNORMAL))
1263 split_loop_exit_edge (e);
1264 if (dump_enabled_p ())
1265 dump_printf (MSG_NOTE, "split exit edge.\n");
1267 else
1269 if (dump_enabled_p ())
1270 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1271 "not vectorized: abnormal loop exit edge.\n");
1272 if (inner_loop_vinfo)
1273 destroy_loop_vec_info (inner_loop_vinfo, true);
1274 return NULL;
1278 loop_cond = vect_get_loop_niters (loop, &number_of_iterations,
1279 &number_of_iterationsm1);
1280 if (!loop_cond)
1282 if (dump_enabled_p ())
1283 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1284 "not vectorized: complicated exit condition.\n");
1285 if (inner_loop_vinfo)
1286 destroy_loop_vec_info (inner_loop_vinfo, true);
1287 return NULL;
1290 if (!number_of_iterations
1291 || chrec_contains_undetermined (number_of_iterations))
1293 if (dump_enabled_p ())
1294 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1295 "not vectorized: number of iterations cannot be "
1296 "computed.\n");
1297 if (inner_loop_vinfo)
1298 destroy_loop_vec_info (inner_loop_vinfo, true);
1299 return NULL;
1302 if (integer_zerop (number_of_iterations))
1304 if (dump_enabled_p ())
1305 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1306 "not vectorized: number of iterations = 0.\n");
1307 if (inner_loop_vinfo)
1308 destroy_loop_vec_info (inner_loop_vinfo, true);
1309 return NULL;
1312 loop_vinfo = new_loop_vec_info (loop);
1313 LOOP_VINFO_NITERSM1 (loop_vinfo) = number_of_iterationsm1;
1314 LOOP_VINFO_NITERS (loop_vinfo) = number_of_iterations;
1315 LOOP_VINFO_NITERS_UNCHANGED (loop_vinfo) = number_of_iterations;
1317 if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
1319 if (dump_enabled_p ())
1321 dump_printf_loc (MSG_NOTE, vect_location,
1322 "Symbolic number of iterations is ");
1323 dump_generic_expr (MSG_NOTE, TDF_DETAILS, number_of_iterations);
1324 dump_printf (MSG_NOTE, "\n");
1328 STMT_VINFO_TYPE (vinfo_for_stmt (loop_cond)) = loop_exit_ctrl_vec_info_type;
1330 /* CHECKME: May want to keep it around it in the future. */
1331 if (inner_loop_vinfo)
1332 destroy_loop_vec_info (inner_loop_vinfo, false);
1334 gcc_assert (!loop->aux);
1335 loop->aux = loop_vinfo;
1336 return loop_vinfo;
1340 /* Function vect_analyze_loop_operations.
1342 Scan the loop stmts and make sure they are all vectorizable. */
1344 static bool
1345 vect_analyze_loop_operations (loop_vec_info loop_vinfo, bool slp)
1347 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1348 basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
1349 int nbbs = loop->num_nodes;
1350 unsigned int vectorization_factor = 0;
1351 int i;
1352 stmt_vec_info stmt_info;
1353 bool need_to_vectorize = false;
1354 int min_profitable_iters;
1355 int min_scalar_loop_bound;
1356 unsigned int th;
1357 bool only_slp_in_loop = true, ok;
1358 HOST_WIDE_INT max_niter;
1359 HOST_WIDE_INT estimated_niter;
1360 int min_profitable_estimate;
1362 if (dump_enabled_p ())
1363 dump_printf_loc (MSG_NOTE, vect_location,
1364 "=== vect_analyze_loop_operations ===\n");
1366 gcc_assert (LOOP_VINFO_VECT_FACTOR (loop_vinfo));
1367 vectorization_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
1368 if (slp)
1370 /* If all the stmts in the loop can be SLPed, we perform only SLP, and
1371 vectorization factor of the loop is the unrolling factor required by
1372 the SLP instances. If that unrolling factor is 1, we say, that we
1373 perform pure SLP on loop - cross iteration parallelism is not
1374 exploited. */
1375 for (i = 0; i < nbbs; i++)
1377 basic_block bb = bbs[i];
1378 for (gimple_stmt_iterator si = gsi_start_bb (bb); !gsi_end_p (si);
1379 gsi_next (&si))
1381 gimple stmt = gsi_stmt (si);
1382 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
1383 gcc_assert (stmt_info);
1384 if ((STMT_VINFO_RELEVANT_P (stmt_info)
1385 || VECTORIZABLE_CYCLE_DEF (STMT_VINFO_DEF_TYPE (stmt_info)))
1386 && !PURE_SLP_STMT (stmt_info))
1387 /* STMT needs both SLP and loop-based vectorization. */
1388 only_slp_in_loop = false;
1392 if (only_slp_in_loop)
1393 vectorization_factor = LOOP_VINFO_SLP_UNROLLING_FACTOR (loop_vinfo);
1394 else
1395 vectorization_factor = least_common_multiple (vectorization_factor,
1396 LOOP_VINFO_SLP_UNROLLING_FACTOR (loop_vinfo));
1398 LOOP_VINFO_VECT_FACTOR (loop_vinfo) = vectorization_factor;
1399 if (dump_enabled_p ())
1400 dump_printf_loc (MSG_NOTE, vect_location,
1401 "Updating vectorization factor to %d\n",
1402 vectorization_factor);
1405 for (i = 0; i < nbbs; i++)
1407 basic_block bb = bbs[i];
1409 for (gphi_iterator si = gsi_start_phis (bb); !gsi_end_p (si);
1410 gsi_next (&si))
1412 gphi *phi = si.phi ();
1413 ok = true;
1415 stmt_info = vinfo_for_stmt (phi);
1416 if (dump_enabled_p ())
1418 dump_printf_loc (MSG_NOTE, vect_location, "examining phi: ");
1419 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, phi, 0);
1420 dump_printf (MSG_NOTE, "\n");
1423 /* Inner-loop loop-closed exit phi in outer-loop vectorization
1424 (i.e., a phi in the tail of the outer-loop). */
1425 if (! is_loop_header_bb_p (bb))
1427 /* FORNOW: we currently don't support the case that these phis
1428 are not used in the outerloop (unless it is double reduction,
1429 i.e., this phi is vect_reduction_def), cause this case
1430 requires to actually do something here. */
1431 if ((!STMT_VINFO_RELEVANT_P (stmt_info)
1432 || STMT_VINFO_LIVE_P (stmt_info))
1433 && STMT_VINFO_DEF_TYPE (stmt_info)
1434 != vect_double_reduction_def)
1436 if (dump_enabled_p ())
1437 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1438 "Unsupported loop-closed phi in "
1439 "outer-loop.\n");
1440 return false;
1443 /* If PHI is used in the outer loop, we check that its operand
1444 is defined in the inner loop. */
1445 if (STMT_VINFO_RELEVANT_P (stmt_info))
1447 tree phi_op;
1448 gimple op_def_stmt;
1450 if (gimple_phi_num_args (phi) != 1)
1451 return false;
1453 phi_op = PHI_ARG_DEF (phi, 0);
1454 if (TREE_CODE (phi_op) != SSA_NAME)
1455 return false;
1457 op_def_stmt = SSA_NAME_DEF_STMT (phi_op);
1458 if (gimple_nop_p (op_def_stmt)
1459 || !flow_bb_inside_loop_p (loop, gimple_bb (op_def_stmt))
1460 || !vinfo_for_stmt (op_def_stmt))
1461 return false;
1463 if (STMT_VINFO_RELEVANT (vinfo_for_stmt (op_def_stmt))
1464 != vect_used_in_outer
1465 && STMT_VINFO_RELEVANT (vinfo_for_stmt (op_def_stmt))
1466 != vect_used_in_outer_by_reduction)
1467 return false;
1470 continue;
1473 gcc_assert (stmt_info);
1475 if (STMT_VINFO_LIVE_P (stmt_info))
1477 /* FORNOW: not yet supported. */
1478 if (dump_enabled_p ())
1479 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1480 "not vectorized: value used after loop.\n");
1481 return false;
1484 if (STMT_VINFO_RELEVANT (stmt_info) == vect_used_in_scope
1485 && STMT_VINFO_DEF_TYPE (stmt_info) != vect_induction_def)
1487 /* A scalar-dependence cycle that we don't support. */
1488 if (dump_enabled_p ())
1489 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1490 "not vectorized: scalar dependence cycle.\n");
1491 return false;
1494 if (STMT_VINFO_RELEVANT_P (stmt_info))
1496 need_to_vectorize = true;
1497 if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_induction_def)
1498 ok = vectorizable_induction (phi, NULL, NULL);
1501 if (!ok)
1503 if (dump_enabled_p ())
1505 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1506 "not vectorized: relevant phi not "
1507 "supported: ");
1508 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, phi, 0);
1509 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
1511 return false;
1515 for (gimple_stmt_iterator si = gsi_start_bb (bb); !gsi_end_p (si);
1516 gsi_next (&si))
1518 gimple stmt = gsi_stmt (si);
1519 if (!gimple_clobber_p (stmt)
1520 && !vect_analyze_stmt (stmt, &need_to_vectorize, NULL))
1521 return false;
1523 } /* bbs */
1525 /* All operations in the loop are either irrelevant (deal with loop
1526 control, or dead), or only used outside the loop and can be moved
1527 out of the loop (e.g. invariants, inductions). The loop can be
1528 optimized away by scalar optimizations. We're better off not
1529 touching this loop. */
1530 if (!need_to_vectorize)
1532 if (dump_enabled_p ())
1533 dump_printf_loc (MSG_NOTE, vect_location,
1534 "All the computation can be taken out of the loop.\n");
1535 if (dump_enabled_p ())
1536 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1537 "not vectorized: redundant loop. no profit to "
1538 "vectorize.\n");
1539 return false;
1542 if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo) && dump_enabled_p ())
1543 dump_printf_loc (MSG_NOTE, vect_location,
1544 "vectorization_factor = %d, niters = "
1545 HOST_WIDE_INT_PRINT_DEC "\n", vectorization_factor,
1546 LOOP_VINFO_INT_NITERS (loop_vinfo));
1548 if ((LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
1549 && (LOOP_VINFO_INT_NITERS (loop_vinfo) < vectorization_factor))
1550 || ((max_niter = max_stmt_executions_int (loop)) != -1
1551 && (unsigned HOST_WIDE_INT) max_niter < vectorization_factor))
1553 if (dump_enabled_p ())
1554 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1555 "not vectorized: iteration count too small.\n");
1556 if (dump_enabled_p ())
1557 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1558 "not vectorized: iteration count smaller than "
1559 "vectorization factor.\n");
1560 return false;
1563 /* Analyze cost. Decide if worth while to vectorize. */
1565 /* Once VF is set, SLP costs should be updated since the number of created
1566 vector stmts depends on VF. */
1567 vect_update_slp_costs_according_to_vf (loop_vinfo);
1569 vect_estimate_min_profitable_iters (loop_vinfo, &min_profitable_iters,
1570 &min_profitable_estimate);
1571 LOOP_VINFO_COST_MODEL_MIN_ITERS (loop_vinfo) = min_profitable_iters;
1573 if (min_profitable_iters < 0)
1575 if (dump_enabled_p ())
1576 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1577 "not vectorized: vectorization not profitable.\n");
1578 if (dump_enabled_p ())
1579 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1580 "not vectorized: vector version will never be "
1581 "profitable.\n");
1582 return false;
1585 min_scalar_loop_bound = ((PARAM_VALUE (PARAM_MIN_VECT_LOOP_BOUND)
1586 * vectorization_factor) - 1);
1589 /* Use the cost model only if it is more conservative than user specified
1590 threshold. */
1592 th = (unsigned) min_scalar_loop_bound;
1593 if (min_profitable_iters
1594 && (!min_scalar_loop_bound
1595 || min_profitable_iters > min_scalar_loop_bound))
1596 th = (unsigned) min_profitable_iters;
1598 LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo) = th;
1600 if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
1601 && LOOP_VINFO_INT_NITERS (loop_vinfo) <= th)
1603 if (dump_enabled_p ())
1604 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1605 "not vectorized: vectorization not profitable.\n");
1606 if (dump_enabled_p ())
1607 dump_printf_loc (MSG_NOTE, vect_location,
1608 "not vectorized: iteration count smaller than user "
1609 "specified loop bound parameter or minimum profitable "
1610 "iterations (whichever is more conservative).\n");
1611 return false;
1614 if ((estimated_niter = estimated_stmt_executions_int (loop)) != -1
1615 && ((unsigned HOST_WIDE_INT) estimated_niter
1616 <= MAX (th, (unsigned)min_profitable_estimate)))
1618 if (dump_enabled_p ())
1619 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1620 "not vectorized: estimated iteration count too "
1621 "small.\n");
1622 if (dump_enabled_p ())
1623 dump_printf_loc (MSG_NOTE, vect_location,
1624 "not vectorized: estimated iteration count smaller "
1625 "than specified loop bound parameter or minimum "
1626 "profitable iterations (whichever is more "
1627 "conservative).\n");
1628 return false;
1631 return true;
1635 /* Function vect_analyze_loop_2.
1637 Apply a set of analyses on LOOP, and create a loop_vec_info struct
1638 for it. The different analyses will record information in the
1639 loop_vec_info struct. */
1640 static bool
1641 vect_analyze_loop_2 (loop_vec_info loop_vinfo)
1643 bool ok, slp = false;
1644 int max_vf = MAX_VECTORIZATION_FACTOR;
1645 int min_vf = 2;
1646 unsigned int th;
1647 unsigned int n_stmts = 0;
1649 /* Find all data references in the loop (which correspond to vdefs/vuses)
1650 and analyze their evolution in the loop. Also adjust the minimal
1651 vectorization factor according to the loads and stores.
1653 FORNOW: Handle only simple, array references, which
1654 alignment can be forced, and aligned pointer-references. */
1656 ok = vect_analyze_data_refs (loop_vinfo, NULL, &min_vf, &n_stmts);
1657 if (!ok)
1659 if (dump_enabled_p ())
1660 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1661 "bad data references.\n");
1662 return false;
1665 /* Analyze the access patterns of the data-refs in the loop (consecutive,
1666 complex, etc.). FORNOW: Only handle consecutive access pattern. */
1668 ok = vect_analyze_data_ref_accesses (loop_vinfo, NULL);
1669 if (!ok)
1671 if (dump_enabled_p ())
1672 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1673 "bad data access.\n");
1674 return false;
1677 /* Classify all cross-iteration scalar data-flow cycles.
1678 Cross-iteration cycles caused by virtual phis are analyzed separately. */
1680 vect_analyze_scalar_cycles (loop_vinfo);
1682 vect_pattern_recog (loop_vinfo, NULL);
1684 /* Data-flow analysis to detect stmts that do not need to be vectorized. */
1686 ok = vect_mark_stmts_to_be_vectorized (loop_vinfo);
1687 if (!ok)
1689 if (dump_enabled_p ())
1690 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1691 "unexpected pattern.\n");
1692 return false;
1695 /* Analyze data dependences between the data-refs in the loop
1696 and adjust the maximum vectorization factor according to
1697 the dependences.
1698 FORNOW: fail at the first data dependence that we encounter. */
1700 ok = vect_analyze_data_ref_dependences (loop_vinfo, &max_vf);
1701 if (!ok
1702 || max_vf < min_vf)
1704 if (dump_enabled_p ())
1705 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1706 "bad data dependence.\n");
1707 return false;
1710 ok = vect_determine_vectorization_factor (loop_vinfo);
1711 if (!ok)
1713 if (dump_enabled_p ())
1714 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1715 "can't determine vectorization factor.\n");
1716 return false;
1718 if (max_vf < LOOP_VINFO_VECT_FACTOR (loop_vinfo))
1720 if (dump_enabled_p ())
1721 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1722 "bad data dependence.\n");
1723 return false;
1726 /* Analyze the alignment of the data-refs in the loop.
1727 Fail if a data reference is found that cannot be vectorized. */
1729 ok = vect_analyze_data_refs_alignment (loop_vinfo, NULL);
1730 if (!ok)
1732 if (dump_enabled_p ())
1733 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1734 "bad data alignment.\n");
1735 return false;
1738 /* Prune the list of ddrs to be tested at run-time by versioning for alias.
1739 It is important to call pruning after vect_analyze_data_ref_accesses,
1740 since we use grouping information gathered by interleaving analysis. */
1741 ok = vect_prune_runtime_alias_test_list (loop_vinfo);
1742 if (!ok)
1744 if (dump_enabled_p ())
1745 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1746 "number of versioning for alias "
1747 "run-time tests exceeds %d "
1748 "(--param vect-max-version-for-alias-checks)\n",
1749 PARAM_VALUE (PARAM_VECT_MAX_VERSION_FOR_ALIAS_CHECKS));
1750 return false;
1753 /* This pass will decide on using loop versioning and/or loop peeling in
1754 order to enhance the alignment of data references in the loop. */
1756 ok = vect_enhance_data_refs_alignment (loop_vinfo);
1757 if (!ok)
1759 if (dump_enabled_p ())
1760 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1761 "bad data alignment.\n");
1762 return false;
1765 /* Check the SLP opportunities in the loop, analyze and build SLP trees. */
1766 ok = vect_analyze_slp (loop_vinfo, NULL, n_stmts);
1767 if (ok)
1769 /* Decide which possible SLP instances to SLP. */
1770 slp = vect_make_slp_decision (loop_vinfo);
1772 /* Find stmts that need to be both vectorized and SLPed. */
1773 vect_detect_hybrid_slp (loop_vinfo);
1775 else
1776 return false;
1778 /* Scan all the operations in the loop and make sure they are
1779 vectorizable. */
1781 ok = vect_analyze_loop_operations (loop_vinfo, slp);
1782 if (!ok)
1784 if (dump_enabled_p ())
1785 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1786 "bad operation or unsupported loop bound.\n");
1787 return false;
1790 /* Decide whether we need to create an epilogue loop to handle
1791 remaining scalar iterations. */
1792 th = ((LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo) + 1)
1793 / LOOP_VINFO_VECT_FACTOR (loop_vinfo))
1794 * LOOP_VINFO_VECT_FACTOR (loop_vinfo);
1796 if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
1797 && LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) > 0)
1799 if (ctz_hwi (LOOP_VINFO_INT_NITERS (loop_vinfo)
1800 - LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo))
1801 < exact_log2 (LOOP_VINFO_VECT_FACTOR (loop_vinfo)))
1802 LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo) = true;
1804 else if (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo)
1805 || (tree_ctz (LOOP_VINFO_NITERS (loop_vinfo))
1806 < (unsigned)exact_log2 (LOOP_VINFO_VECT_FACTOR (loop_vinfo))
1807 /* In case of versioning, check if the maximum number of
1808 iterations is greater than th. If they are identical,
1809 the epilogue is unnecessary. */
1810 && ((!LOOP_REQUIRES_VERSIONING_FOR_ALIAS (loop_vinfo)
1811 && !LOOP_REQUIRES_VERSIONING_FOR_ALIGNMENT (loop_vinfo))
1812 || (unsigned HOST_WIDE_INT)max_stmt_executions_int
1813 (LOOP_VINFO_LOOP (loop_vinfo)) > th)))
1814 LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo) = true;
1816 /* If an epilogue loop is required make sure we can create one. */
1817 if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo)
1818 || LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo))
1820 if (dump_enabled_p ())
1821 dump_printf_loc (MSG_NOTE, vect_location, "epilog loop required\n");
1822 if (!vect_can_advance_ivs_p (loop_vinfo)
1823 || !slpeel_can_duplicate_loop_p (LOOP_VINFO_LOOP (loop_vinfo),
1824 single_exit (LOOP_VINFO_LOOP
1825 (loop_vinfo))))
1827 if (dump_enabled_p ())
1828 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1829 "not vectorized: can't create required "
1830 "epilog loop\n");
1831 return false;
1835 return true;
1838 /* Function vect_analyze_loop.
1840 Apply a set of analyses on LOOP, and create a loop_vec_info struct
1841 for it. The different analyses will record information in the
1842 loop_vec_info struct. */
1843 loop_vec_info
1844 vect_analyze_loop (struct loop *loop)
1846 loop_vec_info loop_vinfo;
1847 unsigned int vector_sizes;
1849 /* Autodetect first vector size we try. */
1850 current_vector_size = 0;
1851 vector_sizes = targetm.vectorize.autovectorize_vector_sizes ();
1853 if (dump_enabled_p ())
1854 dump_printf_loc (MSG_NOTE, vect_location,
1855 "===== analyze_loop_nest =====\n");
1857 if (loop_outer (loop)
1858 && loop_vec_info_for_loop (loop_outer (loop))
1859 && LOOP_VINFO_VECTORIZABLE_P (loop_vec_info_for_loop (loop_outer (loop))))
1861 if (dump_enabled_p ())
1862 dump_printf_loc (MSG_NOTE, vect_location,
1863 "outer-loop already vectorized.\n");
1864 return NULL;
1867 while (1)
1869 /* Check the CFG characteristics of the loop (nesting, entry/exit). */
1870 loop_vinfo = vect_analyze_loop_form (loop);
1871 if (!loop_vinfo)
1873 if (dump_enabled_p ())
1874 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1875 "bad loop form.\n");
1876 return NULL;
1879 if (vect_analyze_loop_2 (loop_vinfo))
1881 LOOP_VINFO_VECTORIZABLE_P (loop_vinfo) = 1;
1883 return loop_vinfo;
1886 destroy_loop_vec_info (loop_vinfo, true);
1888 vector_sizes &= ~current_vector_size;
1889 if (vector_sizes == 0
1890 || current_vector_size == 0)
1891 return NULL;
1893 /* Try the next biggest vector size. */
1894 current_vector_size = 1 << floor_log2 (vector_sizes);
1895 if (dump_enabled_p ())
1896 dump_printf_loc (MSG_NOTE, vect_location,
1897 "***** Re-trying analysis with "
1898 "vector size %d\n", current_vector_size);
1903 /* Function reduction_code_for_scalar_code
1905 Input:
1906 CODE - tree_code of a reduction operations.
1908 Output:
1909 REDUC_CODE - the corresponding tree-code to be used to reduce the
1910 vector of partial results into a single scalar result, or ERROR_MARK
1911 if the operation is a supported reduction operation, but does not have
1912 such a tree-code.
1914 Return FALSE if CODE currently cannot be vectorized as reduction. */
1916 static bool
1917 reduction_code_for_scalar_code (enum tree_code code,
1918 enum tree_code *reduc_code)
1920 switch (code)
1922 case MAX_EXPR:
1923 *reduc_code = REDUC_MAX_EXPR;
1924 return true;
1926 case MIN_EXPR:
1927 *reduc_code = REDUC_MIN_EXPR;
1928 return true;
1930 case PLUS_EXPR:
1931 *reduc_code = REDUC_PLUS_EXPR;
1932 return true;
1934 case MULT_EXPR:
1935 case MINUS_EXPR:
1936 case BIT_IOR_EXPR:
1937 case BIT_XOR_EXPR:
1938 case BIT_AND_EXPR:
1939 *reduc_code = ERROR_MARK;
1940 return true;
1942 default:
1943 return false;
1948 /* Error reporting helper for vect_is_simple_reduction below. GIMPLE statement
1949 STMT is printed with a message MSG. */
1951 static void
1952 report_vect_op (int msg_type, gimple stmt, const char *msg)
1954 dump_printf_loc (msg_type, vect_location, "%s", msg);
1955 dump_gimple_stmt (msg_type, TDF_SLIM, stmt, 0);
1956 dump_printf (msg_type, "\n");
1960 /* Detect SLP reduction of the form:
1962 #a1 = phi <a5, a0>
1963 a2 = operation (a1)
1964 a3 = operation (a2)
1965 a4 = operation (a3)
1966 a5 = operation (a4)
1968 #a = phi <a5>
1970 PHI is the reduction phi node (#a1 = phi <a5, a0> above)
1971 FIRST_STMT is the first reduction stmt in the chain
1972 (a2 = operation (a1)).
1974 Return TRUE if a reduction chain was detected. */
1976 static bool
1977 vect_is_slp_reduction (loop_vec_info loop_info, gimple phi, gimple first_stmt)
1979 struct loop *loop = (gimple_bb (phi))->loop_father;
1980 struct loop *vect_loop = LOOP_VINFO_LOOP (loop_info);
1981 enum tree_code code;
1982 gimple current_stmt = NULL, loop_use_stmt = NULL, first, next_stmt;
1983 stmt_vec_info use_stmt_info, current_stmt_info;
1984 tree lhs;
1985 imm_use_iterator imm_iter;
1986 use_operand_p use_p;
1987 int nloop_uses, size = 0, n_out_of_loop_uses;
1988 bool found = false;
1990 if (loop != vect_loop)
1991 return false;
1993 lhs = PHI_RESULT (phi);
1994 code = gimple_assign_rhs_code (first_stmt);
1995 while (1)
1997 nloop_uses = 0;
1998 n_out_of_loop_uses = 0;
1999 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, lhs)
2001 gimple use_stmt = USE_STMT (use_p);
2002 if (is_gimple_debug (use_stmt))
2003 continue;
2005 /* Check if we got back to the reduction phi. */
2006 if (use_stmt == phi)
2008 loop_use_stmt = use_stmt;
2009 found = true;
2010 break;
2013 if (flow_bb_inside_loop_p (loop, gimple_bb (use_stmt)))
2015 if (vinfo_for_stmt (use_stmt)
2016 && !STMT_VINFO_IN_PATTERN_P (vinfo_for_stmt (use_stmt)))
2018 loop_use_stmt = use_stmt;
2019 nloop_uses++;
2022 else
2023 n_out_of_loop_uses++;
2025 /* There are can be either a single use in the loop or two uses in
2026 phi nodes. */
2027 if (nloop_uses > 1 || (n_out_of_loop_uses && nloop_uses))
2028 return false;
2031 if (found)
2032 break;
2034 /* We reached a statement with no loop uses. */
2035 if (nloop_uses == 0)
2036 return false;
2038 /* This is a loop exit phi, and we haven't reached the reduction phi. */
2039 if (gimple_code (loop_use_stmt) == GIMPLE_PHI)
2040 return false;
2042 if (!is_gimple_assign (loop_use_stmt)
2043 || code != gimple_assign_rhs_code (loop_use_stmt)
2044 || !flow_bb_inside_loop_p (loop, gimple_bb (loop_use_stmt)))
2045 return false;
2047 /* Insert USE_STMT into reduction chain. */
2048 use_stmt_info = vinfo_for_stmt (loop_use_stmt);
2049 if (current_stmt)
2051 current_stmt_info = vinfo_for_stmt (current_stmt);
2052 GROUP_NEXT_ELEMENT (current_stmt_info) = loop_use_stmt;
2053 GROUP_FIRST_ELEMENT (use_stmt_info)
2054 = GROUP_FIRST_ELEMENT (current_stmt_info);
2056 else
2057 GROUP_FIRST_ELEMENT (use_stmt_info) = loop_use_stmt;
2059 lhs = gimple_assign_lhs (loop_use_stmt);
2060 current_stmt = loop_use_stmt;
2061 size++;
2064 if (!found || loop_use_stmt != phi || size < 2)
2065 return false;
2067 /* Swap the operands, if needed, to make the reduction operand be the second
2068 operand. */
2069 lhs = PHI_RESULT (phi);
2070 next_stmt = GROUP_FIRST_ELEMENT (vinfo_for_stmt (current_stmt));
2071 while (next_stmt)
2073 if (gimple_assign_rhs2 (next_stmt) == lhs)
2075 tree op = gimple_assign_rhs1 (next_stmt);
2076 gimple def_stmt = NULL;
2078 if (TREE_CODE (op) == SSA_NAME)
2079 def_stmt = SSA_NAME_DEF_STMT (op);
2081 /* Check that the other def is either defined in the loop
2082 ("vect_internal_def"), or it's an induction (defined by a
2083 loop-header phi-node). */
2084 if (def_stmt
2085 && gimple_bb (def_stmt)
2086 && flow_bb_inside_loop_p (loop, gimple_bb (def_stmt))
2087 && (is_gimple_assign (def_stmt)
2088 || is_gimple_call (def_stmt)
2089 || STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def_stmt))
2090 == vect_induction_def
2091 || (gimple_code (def_stmt) == GIMPLE_PHI
2092 && STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def_stmt))
2093 == vect_internal_def
2094 && !is_loop_header_bb_p (gimple_bb (def_stmt)))))
2096 lhs = gimple_assign_lhs (next_stmt);
2097 next_stmt = GROUP_NEXT_ELEMENT (vinfo_for_stmt (next_stmt));
2098 continue;
2101 return false;
2103 else
2105 tree op = gimple_assign_rhs2 (next_stmt);
2106 gimple def_stmt = NULL;
2108 if (TREE_CODE (op) == SSA_NAME)
2109 def_stmt = SSA_NAME_DEF_STMT (op);
2111 /* Check that the other def is either defined in the loop
2112 ("vect_internal_def"), or it's an induction (defined by a
2113 loop-header phi-node). */
2114 if (def_stmt
2115 && gimple_bb (def_stmt)
2116 && flow_bb_inside_loop_p (loop, gimple_bb (def_stmt))
2117 && (is_gimple_assign (def_stmt)
2118 || is_gimple_call (def_stmt)
2119 || STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def_stmt))
2120 == vect_induction_def
2121 || (gimple_code (def_stmt) == GIMPLE_PHI
2122 && STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def_stmt))
2123 == vect_internal_def
2124 && !is_loop_header_bb_p (gimple_bb (def_stmt)))))
2126 if (dump_enabled_p ())
2128 dump_printf_loc (MSG_NOTE, vect_location, "swapping oprnds: ");
2129 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, next_stmt, 0);
2130 dump_printf (MSG_NOTE, "\n");
2133 swap_ssa_operands (next_stmt,
2134 gimple_assign_rhs1_ptr (next_stmt),
2135 gimple_assign_rhs2_ptr (next_stmt));
2136 update_stmt (next_stmt);
2138 if (CONSTANT_CLASS_P (gimple_assign_rhs1 (next_stmt)))
2139 LOOP_VINFO_OPERANDS_SWAPPED (loop_info) = true;
2141 else
2142 return false;
2145 lhs = gimple_assign_lhs (next_stmt);
2146 next_stmt = GROUP_NEXT_ELEMENT (vinfo_for_stmt (next_stmt));
2149 /* Save the chain for further analysis in SLP detection. */
2150 first = GROUP_FIRST_ELEMENT (vinfo_for_stmt (current_stmt));
2151 LOOP_VINFO_REDUCTION_CHAINS (loop_info).safe_push (first);
2152 GROUP_SIZE (vinfo_for_stmt (first)) = size;
2154 return true;
2158 /* Function vect_is_simple_reduction_1
2160 (1) Detect a cross-iteration def-use cycle that represents a simple
2161 reduction computation. We look for the following pattern:
2163 loop_header:
2164 a1 = phi < a0, a2 >
2165 a3 = ...
2166 a2 = operation (a3, a1)
2170 a3 = ...
2171 loop_header:
2172 a1 = phi < a0, a2 >
2173 a2 = operation (a3, a1)
2175 such that:
2176 1. operation is commutative and associative and it is safe to
2177 change the order of the computation (if CHECK_REDUCTION is true)
2178 2. no uses for a2 in the loop (a2 is used out of the loop)
2179 3. no uses of a1 in the loop besides the reduction operation
2180 4. no uses of a1 outside the loop.
2182 Conditions 1,4 are tested here.
2183 Conditions 2,3 are tested in vect_mark_stmts_to_be_vectorized.
2185 (2) Detect a cross-iteration def-use cycle in nested loops, i.e.,
2186 nested cycles, if CHECK_REDUCTION is false.
2188 (3) Detect cycles of phi nodes in outer-loop vectorization, i.e., double
2189 reductions:
2191 a1 = phi < a0, a2 >
2192 inner loop (def of a3)
2193 a2 = phi < a3 >
2195 If MODIFY is true it tries also to rework the code in-place to enable
2196 detection of more reduction patterns. For the time being we rewrite
2197 "res -= RHS" into "rhs += -RHS" when it seems worthwhile.
2200 static gimple
2201 vect_is_simple_reduction_1 (loop_vec_info loop_info, gimple phi,
2202 bool check_reduction, bool *double_reduc,
2203 bool modify)
2205 struct loop *loop = (gimple_bb (phi))->loop_father;
2206 struct loop *vect_loop = LOOP_VINFO_LOOP (loop_info);
2207 edge latch_e = loop_latch_edge (loop);
2208 tree loop_arg = PHI_ARG_DEF_FROM_EDGE (phi, latch_e);
2209 gimple def_stmt, def1 = NULL, def2 = NULL;
2210 enum tree_code orig_code, code;
2211 tree op1, op2, op3 = NULL_TREE, op4 = NULL_TREE;
2212 tree type;
2213 int nloop_uses;
2214 tree name;
2215 imm_use_iterator imm_iter;
2216 use_operand_p use_p;
2217 bool phi_def;
2219 *double_reduc = false;
2221 /* If CHECK_REDUCTION is true, we assume inner-most loop vectorization,
2222 otherwise, we assume outer loop vectorization. */
2223 gcc_assert ((check_reduction && loop == vect_loop)
2224 || (!check_reduction && flow_loop_nested_p (vect_loop, loop)));
2226 name = PHI_RESULT (phi);
2227 /* ??? If there are no uses of the PHI result the inner loop reduction
2228 won't be detected as possibly double-reduction by vectorizable_reduction
2229 because that tries to walk the PHI arg from the preheader edge which
2230 can be constant. See PR60382. */
2231 if (has_zero_uses (name))
2232 return NULL;
2233 nloop_uses = 0;
2234 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, name)
2236 gimple use_stmt = USE_STMT (use_p);
2237 if (is_gimple_debug (use_stmt))
2238 continue;
2240 if (!flow_bb_inside_loop_p (loop, gimple_bb (use_stmt)))
2242 if (dump_enabled_p ())
2243 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2244 "intermediate value used outside loop.\n");
2246 return NULL;
2249 if (vinfo_for_stmt (use_stmt)
2250 && !is_pattern_stmt_p (vinfo_for_stmt (use_stmt)))
2251 nloop_uses++;
2252 if (nloop_uses > 1)
2254 if (dump_enabled_p ())
2255 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2256 "reduction used in loop.\n");
2257 return NULL;
2261 if (TREE_CODE (loop_arg) != SSA_NAME)
2263 if (dump_enabled_p ())
2265 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2266 "reduction: not ssa_name: ");
2267 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, loop_arg);
2268 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
2270 return NULL;
2273 def_stmt = SSA_NAME_DEF_STMT (loop_arg);
2274 if (!def_stmt)
2276 if (dump_enabled_p ())
2277 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2278 "reduction: no def_stmt.\n");
2279 return NULL;
2282 if (!is_gimple_assign (def_stmt) && gimple_code (def_stmt) != GIMPLE_PHI)
2284 if (dump_enabled_p ())
2286 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, def_stmt, 0);
2287 dump_printf (MSG_NOTE, "\n");
2289 return NULL;
2292 if (is_gimple_assign (def_stmt))
2294 name = gimple_assign_lhs (def_stmt);
2295 phi_def = false;
2297 else
2299 name = PHI_RESULT (def_stmt);
2300 phi_def = true;
2303 nloop_uses = 0;
2304 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, name)
2306 gimple use_stmt = USE_STMT (use_p);
2307 if (is_gimple_debug (use_stmt))
2308 continue;
2309 if (flow_bb_inside_loop_p (loop, gimple_bb (use_stmt))
2310 && vinfo_for_stmt (use_stmt)
2311 && !is_pattern_stmt_p (vinfo_for_stmt (use_stmt)))
2312 nloop_uses++;
2313 if (nloop_uses > 1)
2315 if (dump_enabled_p ())
2316 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2317 "reduction used in loop.\n");
2318 return NULL;
2322 /* If DEF_STMT is a phi node itself, we expect it to have a single argument
2323 defined in the inner loop. */
2324 if (phi_def)
2326 op1 = PHI_ARG_DEF (def_stmt, 0);
2328 if (gimple_phi_num_args (def_stmt) != 1
2329 || TREE_CODE (op1) != SSA_NAME)
2331 if (dump_enabled_p ())
2332 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2333 "unsupported phi node definition.\n");
2335 return NULL;
2338 def1 = SSA_NAME_DEF_STMT (op1);
2339 if (gimple_bb (def1)
2340 && flow_bb_inside_loop_p (loop, gimple_bb (def_stmt))
2341 && loop->inner
2342 && flow_bb_inside_loop_p (loop->inner, gimple_bb (def1))
2343 && is_gimple_assign (def1))
2345 if (dump_enabled_p ())
2346 report_vect_op (MSG_NOTE, def_stmt,
2347 "detected double reduction: ");
2349 *double_reduc = true;
2350 return def_stmt;
2353 return NULL;
2356 code = orig_code = gimple_assign_rhs_code (def_stmt);
2358 /* We can handle "res -= x[i]", which is non-associative by
2359 simply rewriting this into "res += -x[i]". Avoid changing
2360 gimple instruction for the first simple tests and only do this
2361 if we're allowed to change code at all. */
2362 if (code == MINUS_EXPR
2363 && modify
2364 && (op1 = gimple_assign_rhs1 (def_stmt))
2365 && TREE_CODE (op1) == SSA_NAME
2366 && SSA_NAME_DEF_STMT (op1) == phi)
2367 code = PLUS_EXPR;
2369 if (check_reduction
2370 && (!commutative_tree_code (code) || !associative_tree_code (code)))
2372 if (dump_enabled_p ())
2373 report_vect_op (MSG_MISSED_OPTIMIZATION, def_stmt,
2374 "reduction: not commutative/associative: ");
2375 return NULL;
2378 if (get_gimple_rhs_class (code) != GIMPLE_BINARY_RHS)
2380 if (code != COND_EXPR)
2382 if (dump_enabled_p ())
2383 report_vect_op (MSG_MISSED_OPTIMIZATION, def_stmt,
2384 "reduction: not binary operation: ");
2386 return NULL;
2389 op3 = gimple_assign_rhs1 (def_stmt);
2390 if (COMPARISON_CLASS_P (op3))
2392 op4 = TREE_OPERAND (op3, 1);
2393 op3 = TREE_OPERAND (op3, 0);
2396 op1 = gimple_assign_rhs2 (def_stmt);
2397 op2 = gimple_assign_rhs3 (def_stmt);
2399 if (TREE_CODE (op1) != SSA_NAME && TREE_CODE (op2) != SSA_NAME)
2401 if (dump_enabled_p ())
2402 report_vect_op (MSG_MISSED_OPTIMIZATION, def_stmt,
2403 "reduction: uses not ssa_names: ");
2405 return NULL;
2408 else
2410 op1 = gimple_assign_rhs1 (def_stmt);
2411 op2 = gimple_assign_rhs2 (def_stmt);
2413 if (TREE_CODE (op1) != SSA_NAME && TREE_CODE (op2) != SSA_NAME)
2415 if (dump_enabled_p ())
2416 report_vect_op (MSG_MISSED_OPTIMIZATION, def_stmt,
2417 "reduction: uses not ssa_names: ");
2419 return NULL;
2423 type = TREE_TYPE (gimple_assign_lhs (def_stmt));
2424 if ((TREE_CODE (op1) == SSA_NAME
2425 && !types_compatible_p (type,TREE_TYPE (op1)))
2426 || (TREE_CODE (op2) == SSA_NAME
2427 && !types_compatible_p (type, TREE_TYPE (op2)))
2428 || (op3 && TREE_CODE (op3) == SSA_NAME
2429 && !types_compatible_p (type, TREE_TYPE (op3)))
2430 || (op4 && TREE_CODE (op4) == SSA_NAME
2431 && !types_compatible_p (type, TREE_TYPE (op4))))
2433 if (dump_enabled_p ())
2435 dump_printf_loc (MSG_NOTE, vect_location,
2436 "reduction: multiple types: operation type: ");
2437 dump_generic_expr (MSG_NOTE, TDF_SLIM, type);
2438 dump_printf (MSG_NOTE, ", operands types: ");
2439 dump_generic_expr (MSG_NOTE, TDF_SLIM,
2440 TREE_TYPE (op1));
2441 dump_printf (MSG_NOTE, ",");
2442 dump_generic_expr (MSG_NOTE, TDF_SLIM,
2443 TREE_TYPE (op2));
2444 if (op3)
2446 dump_printf (MSG_NOTE, ",");
2447 dump_generic_expr (MSG_NOTE, TDF_SLIM,
2448 TREE_TYPE (op3));
2451 if (op4)
2453 dump_printf (MSG_NOTE, ",");
2454 dump_generic_expr (MSG_NOTE, TDF_SLIM,
2455 TREE_TYPE (op4));
2457 dump_printf (MSG_NOTE, "\n");
2460 return NULL;
2463 /* Check that it's ok to change the order of the computation.
2464 Generally, when vectorizing a reduction we change the order of the
2465 computation. This may change the behavior of the program in some
2466 cases, so we need to check that this is ok. One exception is when
2467 vectorizing an outer-loop: the inner-loop is executed sequentially,
2468 and therefore vectorizing reductions in the inner-loop during
2469 outer-loop vectorization is safe. */
2471 /* CHECKME: check for !flag_finite_math_only too? */
2472 if (SCALAR_FLOAT_TYPE_P (type) && !flag_associative_math
2473 && check_reduction)
2475 /* Changing the order of operations changes the semantics. */
2476 if (dump_enabled_p ())
2477 report_vect_op (MSG_MISSED_OPTIMIZATION, def_stmt,
2478 "reduction: unsafe fp math optimization: ");
2479 return NULL;
2481 else if (INTEGRAL_TYPE_P (type) && TYPE_OVERFLOW_TRAPS (type)
2482 && check_reduction)
2484 /* Changing the order of operations changes the semantics. */
2485 if (dump_enabled_p ())
2486 report_vect_op (MSG_MISSED_OPTIMIZATION, def_stmt,
2487 "reduction: unsafe int math optimization: ");
2488 return NULL;
2490 else if (SAT_FIXED_POINT_TYPE_P (type) && check_reduction)
2492 /* Changing the order of operations changes the semantics. */
2493 if (dump_enabled_p ())
2494 report_vect_op (MSG_MISSED_OPTIMIZATION, def_stmt,
2495 "reduction: unsafe fixed-point math optimization: ");
2496 return NULL;
2499 /* If we detected "res -= x[i]" earlier, rewrite it into
2500 "res += -x[i]" now. If this turns out to be useless reassoc
2501 will clean it up again. */
2502 if (orig_code == MINUS_EXPR)
2504 tree rhs = gimple_assign_rhs2 (def_stmt);
2505 tree negrhs = make_ssa_name (TREE_TYPE (rhs), NULL);
2506 gimple negate_stmt = gimple_build_assign_with_ops (NEGATE_EXPR, negrhs,
2507 rhs);
2508 gimple_stmt_iterator gsi = gsi_for_stmt (def_stmt);
2509 set_vinfo_for_stmt (negate_stmt, new_stmt_vec_info (negate_stmt,
2510 loop_info, NULL));
2511 gsi_insert_before (&gsi, negate_stmt, GSI_NEW_STMT);
2512 gimple_assign_set_rhs2 (def_stmt, negrhs);
2513 gimple_assign_set_rhs_code (def_stmt, PLUS_EXPR);
2514 update_stmt (def_stmt);
2517 /* Reduction is safe. We're dealing with one of the following:
2518 1) integer arithmetic and no trapv
2519 2) floating point arithmetic, and special flags permit this optimization
2520 3) nested cycle (i.e., outer loop vectorization). */
2521 if (TREE_CODE (op1) == SSA_NAME)
2522 def1 = SSA_NAME_DEF_STMT (op1);
2524 if (TREE_CODE (op2) == SSA_NAME)
2525 def2 = SSA_NAME_DEF_STMT (op2);
2527 if (code != COND_EXPR
2528 && ((!def1 || gimple_nop_p (def1)) && (!def2 || gimple_nop_p (def2))))
2530 if (dump_enabled_p ())
2531 report_vect_op (MSG_NOTE, def_stmt, "reduction: no defs for operands: ");
2532 return NULL;
2535 /* Check that one def is the reduction def, defined by PHI,
2536 the other def is either defined in the loop ("vect_internal_def"),
2537 or it's an induction (defined by a loop-header phi-node). */
2539 if (def2 && def2 == phi
2540 && (code == COND_EXPR
2541 || !def1 || gimple_nop_p (def1)
2542 || !flow_bb_inside_loop_p (loop, gimple_bb (def1))
2543 || (def1 && flow_bb_inside_loop_p (loop, gimple_bb (def1))
2544 && (is_gimple_assign (def1)
2545 || is_gimple_call (def1)
2546 || STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def1))
2547 == vect_induction_def
2548 || (gimple_code (def1) == GIMPLE_PHI
2549 && STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def1))
2550 == vect_internal_def
2551 && !is_loop_header_bb_p (gimple_bb (def1)))))))
2553 if (dump_enabled_p ())
2554 report_vect_op (MSG_NOTE, def_stmt, "detected reduction: ");
2555 return def_stmt;
2558 if (def1 && def1 == phi
2559 && (code == COND_EXPR
2560 || !def2 || gimple_nop_p (def2)
2561 || !flow_bb_inside_loop_p (loop, gimple_bb (def2))
2562 || (def2 && flow_bb_inside_loop_p (loop, gimple_bb (def2))
2563 && (is_gimple_assign (def2)
2564 || is_gimple_call (def2)
2565 || STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def2))
2566 == vect_induction_def
2567 || (gimple_code (def2) == GIMPLE_PHI
2568 && STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def2))
2569 == vect_internal_def
2570 && !is_loop_header_bb_p (gimple_bb (def2)))))))
2572 if (check_reduction)
2574 /* Swap operands (just for simplicity - so that the rest of the code
2575 can assume that the reduction variable is always the last (second)
2576 argument). */
2577 if (dump_enabled_p ())
2578 report_vect_op (MSG_NOTE, def_stmt,
2579 "detected reduction: need to swap operands: ");
2581 swap_ssa_operands (def_stmt, gimple_assign_rhs1_ptr (def_stmt),
2582 gimple_assign_rhs2_ptr (def_stmt));
2584 if (CONSTANT_CLASS_P (gimple_assign_rhs1 (def_stmt)))
2585 LOOP_VINFO_OPERANDS_SWAPPED (loop_info) = true;
2587 else
2589 if (dump_enabled_p ())
2590 report_vect_op (MSG_NOTE, def_stmt, "detected reduction: ");
2593 return def_stmt;
2596 /* Try to find SLP reduction chain. */
2597 if (check_reduction && vect_is_slp_reduction (loop_info, phi, def_stmt))
2599 if (dump_enabled_p ())
2600 report_vect_op (MSG_NOTE, def_stmt,
2601 "reduction: detected reduction chain: ");
2603 return def_stmt;
2606 if (dump_enabled_p ())
2607 report_vect_op (MSG_MISSED_OPTIMIZATION, def_stmt,
2608 "reduction: unknown pattern: ");
2610 return NULL;
2613 /* Wrapper around vect_is_simple_reduction_1, that won't modify code
2614 in-place. Arguments as there. */
2616 static gimple
2617 vect_is_simple_reduction (loop_vec_info loop_info, gimple phi,
2618 bool check_reduction, bool *double_reduc)
2620 return vect_is_simple_reduction_1 (loop_info, phi, check_reduction,
2621 double_reduc, false);
2624 /* Wrapper around vect_is_simple_reduction_1, which will modify code
2625 in-place if it enables detection of more reductions. Arguments
2626 as there. */
2628 gimple
2629 vect_force_simple_reduction (loop_vec_info loop_info, gimple phi,
2630 bool check_reduction, bool *double_reduc)
2632 return vect_is_simple_reduction_1 (loop_info, phi, check_reduction,
2633 double_reduc, true);
2636 /* Calculate the cost of one scalar iteration of the loop. */
2638 vect_get_single_scalar_iteration_cost (loop_vec_info loop_vinfo)
2640 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
2641 basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
2642 int nbbs = loop->num_nodes, factor, scalar_single_iter_cost = 0;
2643 int innerloop_iters, i, stmt_cost;
2645 /* Count statements in scalar loop. Using this as scalar cost for a single
2646 iteration for now.
2648 TODO: Add outer loop support.
2650 TODO: Consider assigning different costs to different scalar
2651 statements. */
2653 /* FORNOW. */
2654 innerloop_iters = 1;
2655 if (loop->inner)
2656 innerloop_iters = 50; /* FIXME */
2658 for (i = 0; i < nbbs; i++)
2660 gimple_stmt_iterator si;
2661 basic_block bb = bbs[i];
2663 if (bb->loop_father == loop->inner)
2664 factor = innerloop_iters;
2665 else
2666 factor = 1;
2668 for (si = gsi_start_bb (bb); !gsi_end_p (si); gsi_next (&si))
2670 gimple stmt = gsi_stmt (si);
2671 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
2673 if (!is_gimple_assign (stmt) && !is_gimple_call (stmt))
2674 continue;
2676 /* Skip stmts that are not vectorized inside the loop. */
2677 if (stmt_info
2678 && !STMT_VINFO_RELEVANT_P (stmt_info)
2679 && (!STMT_VINFO_LIVE_P (stmt_info)
2680 || !VECTORIZABLE_CYCLE_DEF (STMT_VINFO_DEF_TYPE (stmt_info)))
2681 && !STMT_VINFO_IN_PATTERN_P (stmt_info))
2682 continue;
2684 if (STMT_VINFO_DATA_REF (vinfo_for_stmt (stmt)))
2686 if (DR_IS_READ (STMT_VINFO_DATA_REF (vinfo_for_stmt (stmt))))
2687 stmt_cost = vect_get_stmt_cost (scalar_load);
2688 else
2689 stmt_cost = vect_get_stmt_cost (scalar_store);
2691 else
2692 stmt_cost = vect_get_stmt_cost (scalar_stmt);
2694 scalar_single_iter_cost += stmt_cost * factor;
2697 return scalar_single_iter_cost;
2700 /* Calculate cost of peeling the loop PEEL_ITERS_PROLOGUE times. */
2702 vect_get_known_peeling_cost (loop_vec_info loop_vinfo, int peel_iters_prologue,
2703 int *peel_iters_epilogue,
2704 int scalar_single_iter_cost,
2705 stmt_vector_for_cost *prologue_cost_vec,
2706 stmt_vector_for_cost *epilogue_cost_vec)
2708 int retval = 0;
2709 int vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
2711 if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
2713 *peel_iters_epilogue = vf/2;
2714 if (dump_enabled_p ())
2715 dump_printf_loc (MSG_NOTE, vect_location,
2716 "cost model: epilogue peel iters set to vf/2 "
2717 "because loop iterations are unknown .\n");
2719 /* If peeled iterations are known but number of scalar loop
2720 iterations are unknown, count a taken branch per peeled loop. */
2721 retval = record_stmt_cost (prologue_cost_vec, 2, cond_branch_taken,
2722 NULL, 0, vect_prologue);
2724 else
2726 int niters = LOOP_VINFO_INT_NITERS (loop_vinfo);
2727 peel_iters_prologue = niters < peel_iters_prologue ?
2728 niters : peel_iters_prologue;
2729 *peel_iters_epilogue = (niters - peel_iters_prologue) % vf;
2730 /* If we need to peel for gaps, but no peeling is required, we have to
2731 peel VF iterations. */
2732 if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo) && !*peel_iters_epilogue)
2733 *peel_iters_epilogue = vf;
2736 if (peel_iters_prologue)
2737 retval += record_stmt_cost (prologue_cost_vec,
2738 peel_iters_prologue * scalar_single_iter_cost,
2739 scalar_stmt, NULL, 0, vect_prologue);
2740 if (*peel_iters_epilogue)
2741 retval += record_stmt_cost (epilogue_cost_vec,
2742 *peel_iters_epilogue * scalar_single_iter_cost,
2743 scalar_stmt, NULL, 0, vect_epilogue);
2744 return retval;
2747 /* Function vect_estimate_min_profitable_iters
2749 Return the number of iterations required for the vector version of the
2750 loop to be profitable relative to the cost of the scalar version of the
2751 loop. */
2753 static void
2754 vect_estimate_min_profitable_iters (loop_vec_info loop_vinfo,
2755 int *ret_min_profitable_niters,
2756 int *ret_min_profitable_estimate)
2758 int min_profitable_iters;
2759 int min_profitable_estimate;
2760 int peel_iters_prologue;
2761 int peel_iters_epilogue;
2762 unsigned vec_inside_cost = 0;
2763 int vec_outside_cost = 0;
2764 unsigned vec_prologue_cost = 0;
2765 unsigned vec_epilogue_cost = 0;
2766 int scalar_single_iter_cost = 0;
2767 int scalar_outside_cost = 0;
2768 int vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
2769 int npeel = LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo);
2770 void *target_cost_data = LOOP_VINFO_TARGET_COST_DATA (loop_vinfo);
2772 /* Cost model disabled. */
2773 if (unlimited_cost_model (LOOP_VINFO_LOOP (loop_vinfo)))
2775 dump_printf_loc (MSG_NOTE, vect_location, "cost model disabled.\n");
2776 *ret_min_profitable_niters = 0;
2777 *ret_min_profitable_estimate = 0;
2778 return;
2781 /* Requires loop versioning tests to handle misalignment. */
2782 if (LOOP_REQUIRES_VERSIONING_FOR_ALIGNMENT (loop_vinfo))
2784 /* FIXME: Make cost depend on complexity of individual check. */
2785 unsigned len = LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).length ();
2786 (void) add_stmt_cost (target_cost_data, len, vector_stmt, NULL, 0,
2787 vect_prologue);
2788 dump_printf (MSG_NOTE,
2789 "cost model: Adding cost of checks for loop "
2790 "versioning to treat misalignment.\n");
2793 /* Requires loop versioning with alias checks. */
2794 if (LOOP_REQUIRES_VERSIONING_FOR_ALIAS (loop_vinfo))
2796 /* FIXME: Make cost depend on complexity of individual check. */
2797 unsigned len = LOOP_VINFO_MAY_ALIAS_DDRS (loop_vinfo).length ();
2798 (void) add_stmt_cost (target_cost_data, len, vector_stmt, NULL, 0,
2799 vect_prologue);
2800 dump_printf (MSG_NOTE,
2801 "cost model: Adding cost of checks for loop "
2802 "versioning aliasing.\n");
2805 if (LOOP_REQUIRES_VERSIONING_FOR_ALIGNMENT (loop_vinfo)
2806 || LOOP_REQUIRES_VERSIONING_FOR_ALIAS (loop_vinfo))
2807 (void) add_stmt_cost (target_cost_data, 1, cond_branch_taken, NULL, 0,
2808 vect_prologue);
2810 /* Count statements in scalar loop. Using this as scalar cost for a single
2811 iteration for now.
2813 TODO: Add outer loop support.
2815 TODO: Consider assigning different costs to different scalar
2816 statements. */
2818 scalar_single_iter_cost = vect_get_single_scalar_iteration_cost (loop_vinfo);
2820 /* Add additional cost for the peeled instructions in prologue and epilogue
2821 loop.
2823 FORNOW: If we don't know the value of peel_iters for prologue or epilogue
2824 at compile-time - we assume it's vf/2 (the worst would be vf-1).
2826 TODO: Build an expression that represents peel_iters for prologue and
2827 epilogue to be used in a run-time test. */
2829 if (npeel < 0)
2831 peel_iters_prologue = vf/2;
2832 dump_printf (MSG_NOTE, "cost model: "
2833 "prologue peel iters set to vf/2.\n");
2835 /* If peeling for alignment is unknown, loop bound of main loop becomes
2836 unknown. */
2837 peel_iters_epilogue = vf/2;
2838 dump_printf (MSG_NOTE, "cost model: "
2839 "epilogue peel iters set to vf/2 because "
2840 "peeling for alignment is unknown.\n");
2842 /* If peeled iterations are unknown, count a taken branch and a not taken
2843 branch per peeled loop. Even if scalar loop iterations are known,
2844 vector iterations are not known since peeled prologue iterations are
2845 not known. Hence guards remain the same. */
2846 (void) add_stmt_cost (target_cost_data, 2, cond_branch_taken,
2847 NULL, 0, vect_prologue);
2848 (void) add_stmt_cost (target_cost_data, 2, cond_branch_not_taken,
2849 NULL, 0, vect_prologue);
2850 /* FORNOW: Don't attempt to pass individual scalar instructions to
2851 the model; just assume linear cost for scalar iterations. */
2852 (void) add_stmt_cost (target_cost_data,
2853 peel_iters_prologue * scalar_single_iter_cost,
2854 scalar_stmt, NULL, 0, vect_prologue);
2855 (void) add_stmt_cost (target_cost_data,
2856 peel_iters_epilogue * scalar_single_iter_cost,
2857 scalar_stmt, NULL, 0, vect_epilogue);
2859 else
2861 stmt_vector_for_cost prologue_cost_vec, epilogue_cost_vec;
2862 stmt_info_for_cost *si;
2863 int j;
2864 void *data = LOOP_VINFO_TARGET_COST_DATA (loop_vinfo);
2866 prologue_cost_vec.create (2);
2867 epilogue_cost_vec.create (2);
2868 peel_iters_prologue = npeel;
2870 (void) vect_get_known_peeling_cost (loop_vinfo, peel_iters_prologue,
2871 &peel_iters_epilogue,
2872 scalar_single_iter_cost,
2873 &prologue_cost_vec,
2874 &epilogue_cost_vec);
2876 FOR_EACH_VEC_ELT (prologue_cost_vec, j, si)
2878 struct _stmt_vec_info *stmt_info
2879 = si->stmt ? vinfo_for_stmt (si->stmt) : NULL;
2880 (void) add_stmt_cost (data, si->count, si->kind, stmt_info,
2881 si->misalign, vect_prologue);
2884 FOR_EACH_VEC_ELT (epilogue_cost_vec, j, si)
2886 struct _stmt_vec_info *stmt_info
2887 = si->stmt ? vinfo_for_stmt (si->stmt) : NULL;
2888 (void) add_stmt_cost (data, si->count, si->kind, stmt_info,
2889 si->misalign, vect_epilogue);
2892 prologue_cost_vec.release ();
2893 epilogue_cost_vec.release ();
2896 /* FORNOW: The scalar outside cost is incremented in one of the
2897 following ways:
2899 1. The vectorizer checks for alignment and aliasing and generates
2900 a condition that allows dynamic vectorization. A cost model
2901 check is ANDED with the versioning condition. Hence scalar code
2902 path now has the added cost of the versioning check.
2904 if (cost > th & versioning_check)
2905 jmp to vector code
2907 Hence run-time scalar is incremented by not-taken branch cost.
2909 2. The vectorizer then checks if a prologue is required. If the
2910 cost model check was not done before during versioning, it has to
2911 be done before the prologue check.
2913 if (cost <= th)
2914 prologue = scalar_iters
2915 if (prologue == 0)
2916 jmp to vector code
2917 else
2918 execute prologue
2919 if (prologue == num_iters)
2920 go to exit
2922 Hence the run-time scalar cost is incremented by a taken branch,
2923 plus a not-taken branch, plus a taken branch cost.
2925 3. The vectorizer then checks if an epilogue is required. If the
2926 cost model check was not done before during prologue check, it
2927 has to be done with the epilogue check.
2929 if (prologue == 0)
2930 jmp to vector code
2931 else
2932 execute prologue
2933 if (prologue == num_iters)
2934 go to exit
2935 vector code:
2936 if ((cost <= th) | (scalar_iters-prologue-epilogue == 0))
2937 jmp to epilogue
2939 Hence the run-time scalar cost should be incremented by 2 taken
2940 branches.
2942 TODO: The back end may reorder the BBS's differently and reverse
2943 conditions/branch directions. Change the estimates below to
2944 something more reasonable. */
2946 /* If the number of iterations is known and we do not do versioning, we can
2947 decide whether to vectorize at compile time. Hence the scalar version
2948 do not carry cost model guard costs. */
2949 if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
2950 || LOOP_REQUIRES_VERSIONING_FOR_ALIGNMENT (loop_vinfo)
2951 || LOOP_REQUIRES_VERSIONING_FOR_ALIAS (loop_vinfo))
2953 /* Cost model check occurs at versioning. */
2954 if (LOOP_REQUIRES_VERSIONING_FOR_ALIGNMENT (loop_vinfo)
2955 || LOOP_REQUIRES_VERSIONING_FOR_ALIAS (loop_vinfo))
2956 scalar_outside_cost += vect_get_stmt_cost (cond_branch_not_taken);
2957 else
2959 /* Cost model check occurs at prologue generation. */
2960 if (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) < 0)
2961 scalar_outside_cost += 2 * vect_get_stmt_cost (cond_branch_taken)
2962 + vect_get_stmt_cost (cond_branch_not_taken);
2963 /* Cost model check occurs at epilogue generation. */
2964 else
2965 scalar_outside_cost += 2 * vect_get_stmt_cost (cond_branch_taken);
2969 /* Complete the target-specific cost calculations. */
2970 finish_cost (LOOP_VINFO_TARGET_COST_DATA (loop_vinfo), &vec_prologue_cost,
2971 &vec_inside_cost, &vec_epilogue_cost);
2973 vec_outside_cost = (int)(vec_prologue_cost + vec_epilogue_cost);
2975 /* Calculate number of iterations required to make the vector version
2976 profitable, relative to the loop bodies only. The following condition
2977 must hold true:
2978 SIC * niters + SOC > VIC * ((niters-PL_ITERS-EP_ITERS)/VF) + VOC
2979 where
2980 SIC = scalar iteration cost, VIC = vector iteration cost,
2981 VOC = vector outside cost, VF = vectorization factor,
2982 PL_ITERS = prologue iterations, EP_ITERS= epilogue iterations
2983 SOC = scalar outside cost for run time cost model check. */
2985 if ((scalar_single_iter_cost * vf) > (int) vec_inside_cost)
2987 if (vec_outside_cost <= 0)
2988 min_profitable_iters = 1;
2989 else
2991 min_profitable_iters = ((vec_outside_cost - scalar_outside_cost) * vf
2992 - vec_inside_cost * peel_iters_prologue
2993 - vec_inside_cost * peel_iters_epilogue)
2994 / ((scalar_single_iter_cost * vf)
2995 - vec_inside_cost);
2997 if ((scalar_single_iter_cost * vf * min_profitable_iters)
2998 <= (((int) vec_inside_cost * min_profitable_iters)
2999 + (((int) vec_outside_cost - scalar_outside_cost) * vf)))
3000 min_profitable_iters++;
3003 /* vector version will never be profitable. */
3004 else
3006 if (LOOP_VINFO_LOOP (loop_vinfo)->force_vectorize)
3007 warning_at (vect_location, OPT_Wopenmp_simd, "vectorization "
3008 "did not happen for a simd loop");
3010 if (dump_enabled_p ())
3011 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3012 "cost model: the vector iteration cost = %d "
3013 "divided by the scalar iteration cost = %d "
3014 "is greater or equal to the vectorization factor = %d"
3015 ".\n",
3016 vec_inside_cost, scalar_single_iter_cost, vf);
3017 *ret_min_profitable_niters = -1;
3018 *ret_min_profitable_estimate = -1;
3019 return;
3022 if (dump_enabled_p ())
3024 dump_printf_loc (MSG_NOTE, vect_location, "Cost model analysis: \n");
3025 dump_printf (MSG_NOTE, " Vector inside of loop cost: %d\n",
3026 vec_inside_cost);
3027 dump_printf (MSG_NOTE, " Vector prologue cost: %d\n",
3028 vec_prologue_cost);
3029 dump_printf (MSG_NOTE, " Vector epilogue cost: %d\n",
3030 vec_epilogue_cost);
3031 dump_printf (MSG_NOTE, " Scalar iteration cost: %d\n",
3032 scalar_single_iter_cost);
3033 dump_printf (MSG_NOTE, " Scalar outside cost: %d\n",
3034 scalar_outside_cost);
3035 dump_printf (MSG_NOTE, " Vector outside cost: %d\n",
3036 vec_outside_cost);
3037 dump_printf (MSG_NOTE, " prologue iterations: %d\n",
3038 peel_iters_prologue);
3039 dump_printf (MSG_NOTE, " epilogue iterations: %d\n",
3040 peel_iters_epilogue);
3041 dump_printf (MSG_NOTE,
3042 " Calculated minimum iters for profitability: %d\n",
3043 min_profitable_iters);
3044 dump_printf (MSG_NOTE, "\n");
3047 min_profitable_iters =
3048 min_profitable_iters < vf ? vf : min_profitable_iters;
3050 /* Because the condition we create is:
3051 if (niters <= min_profitable_iters)
3052 then skip the vectorized loop. */
3053 min_profitable_iters--;
3055 if (dump_enabled_p ())
3056 dump_printf_loc (MSG_NOTE, vect_location,
3057 " Runtime profitability threshold = %d\n",
3058 min_profitable_iters);
3060 *ret_min_profitable_niters = min_profitable_iters;
3062 /* Calculate number of iterations required to make the vector version
3063 profitable, relative to the loop bodies only.
3065 Non-vectorized variant is SIC * niters and it must win over vector
3066 variant on the expected loop trip count. The following condition must hold true:
3067 SIC * niters > VIC * ((niters-PL_ITERS-EP_ITERS)/VF) + VOC + SOC */
3069 if (vec_outside_cost <= 0)
3070 min_profitable_estimate = 1;
3071 else
3073 min_profitable_estimate = ((vec_outside_cost + scalar_outside_cost) * vf
3074 - vec_inside_cost * peel_iters_prologue
3075 - vec_inside_cost * peel_iters_epilogue)
3076 / ((scalar_single_iter_cost * vf)
3077 - vec_inside_cost);
3079 min_profitable_estimate --;
3080 min_profitable_estimate = MAX (min_profitable_estimate, min_profitable_iters);
3081 if (dump_enabled_p ())
3082 dump_printf_loc (MSG_NOTE, vect_location,
3083 " Static estimate profitability threshold = %d\n",
3084 min_profitable_iters);
3086 *ret_min_profitable_estimate = min_profitable_estimate;
3089 /* Writes into SEL a mask for a vec_perm, equivalent to a vec_shr by OFFSET
3090 vector elements (not bits) for a vector of mode MODE. */
3091 static void
3092 calc_vec_perm_mask_for_shift (enum machine_mode mode, unsigned int offset,
3093 unsigned char *sel)
3095 unsigned int i, nelt = GET_MODE_NUNITS (mode);
3097 for (i = 0; i < nelt; i++)
3098 sel[i] = (i + offset) & (2*nelt - 1);
3101 /* Checks whether the target supports whole-vector shifts for vectors of mode
3102 MODE. This is the case if _either_ the platform handles vec_shr_optab, _or_
3103 it supports vec_perm_const with masks for all necessary shift amounts. */
3104 static bool
3105 have_whole_vector_shift (enum machine_mode mode)
3107 if (optab_handler (vec_shr_optab, mode) != CODE_FOR_nothing)
3108 return true;
3110 if (direct_optab_handler (vec_perm_const_optab, mode) == CODE_FOR_nothing)
3111 return false;
3113 unsigned int i, nelt = GET_MODE_NUNITS (mode);
3114 unsigned char *sel = XALLOCAVEC (unsigned char, nelt);
3116 for (i = nelt/2; i >= 1; i/=2)
3118 calc_vec_perm_mask_for_shift (mode, i, sel);
3119 if (!can_vec_perm_p (mode, false, sel))
3120 return false;
3122 return true;
3125 /* TODO: Close dependency between vect_model_*_cost and vectorizable_*
3126 functions. Design better to avoid maintenance issues. */
3128 /* Function vect_model_reduction_cost.
3130 Models cost for a reduction operation, including the vector ops
3131 generated within the strip-mine loop, the initial definition before
3132 the loop, and the epilogue code that must be generated. */
3134 static bool
3135 vect_model_reduction_cost (stmt_vec_info stmt_info, enum tree_code reduc_code,
3136 int ncopies)
3138 int prologue_cost = 0, epilogue_cost = 0;
3139 enum tree_code code;
3140 optab optab;
3141 tree vectype;
3142 gimple stmt, orig_stmt;
3143 tree reduction_op;
3144 machine_mode mode;
3145 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
3146 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
3147 void *target_cost_data = LOOP_VINFO_TARGET_COST_DATA (loop_vinfo);
3149 /* Cost of reduction op inside loop. */
3150 unsigned inside_cost = add_stmt_cost (target_cost_data, ncopies, vector_stmt,
3151 stmt_info, 0, vect_body);
3152 stmt = STMT_VINFO_STMT (stmt_info);
3154 switch (get_gimple_rhs_class (gimple_assign_rhs_code (stmt)))
3156 case GIMPLE_SINGLE_RHS:
3157 gcc_assert (TREE_OPERAND_LENGTH (gimple_assign_rhs1 (stmt)) == ternary_op);
3158 reduction_op = TREE_OPERAND (gimple_assign_rhs1 (stmt), 2);
3159 break;
3160 case GIMPLE_UNARY_RHS:
3161 reduction_op = gimple_assign_rhs1 (stmt);
3162 break;
3163 case GIMPLE_BINARY_RHS:
3164 reduction_op = gimple_assign_rhs2 (stmt);
3165 break;
3166 case GIMPLE_TERNARY_RHS:
3167 reduction_op = gimple_assign_rhs3 (stmt);
3168 break;
3169 default:
3170 gcc_unreachable ();
3173 vectype = get_vectype_for_scalar_type (TREE_TYPE (reduction_op));
3174 if (!vectype)
3176 if (dump_enabled_p ())
3178 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3179 "unsupported data-type ");
3180 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
3181 TREE_TYPE (reduction_op));
3182 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
3184 return false;
3187 mode = TYPE_MODE (vectype);
3188 orig_stmt = STMT_VINFO_RELATED_STMT (stmt_info);
3190 if (!orig_stmt)
3191 orig_stmt = STMT_VINFO_STMT (stmt_info);
3193 code = gimple_assign_rhs_code (orig_stmt);
3195 /* Add in cost for initial definition. */
3196 prologue_cost += add_stmt_cost (target_cost_data, 1, scalar_to_vec,
3197 stmt_info, 0, vect_prologue);
3199 /* Determine cost of epilogue code.
3201 We have a reduction operator that will reduce the vector in one statement.
3202 Also requires scalar extract. */
3204 if (!nested_in_vect_loop_p (loop, orig_stmt))
3206 if (reduc_code != ERROR_MARK)
3208 epilogue_cost += add_stmt_cost (target_cost_data, 1, vector_stmt,
3209 stmt_info, 0, vect_epilogue);
3210 epilogue_cost += add_stmt_cost (target_cost_data, 1, vec_to_scalar,
3211 stmt_info, 0, vect_epilogue);
3213 else
3215 int vec_size_in_bits = tree_to_uhwi (TYPE_SIZE (vectype));
3216 tree bitsize =
3217 TYPE_SIZE (TREE_TYPE (gimple_assign_lhs (orig_stmt)));
3218 int element_bitsize = tree_to_uhwi (bitsize);
3219 int nelements = vec_size_in_bits / element_bitsize;
3221 optab = optab_for_tree_code (code, vectype, optab_default);
3223 /* We have a whole vector shift available. */
3224 if (VECTOR_MODE_P (mode)
3225 && optab_handler (optab, mode) != CODE_FOR_nothing
3226 && have_whole_vector_shift (mode))
3228 /* Final reduction via vector shifts and the reduction operator.
3229 Also requires scalar extract. */
3230 epilogue_cost += add_stmt_cost (target_cost_data,
3231 exact_log2 (nelements) * 2,
3232 vector_stmt, stmt_info, 0,
3233 vect_epilogue);
3234 epilogue_cost += add_stmt_cost (target_cost_data, 1,
3235 vec_to_scalar, stmt_info, 0,
3236 vect_epilogue);
3238 else
3239 /* Use extracts and reduction op for final reduction. For N
3240 elements, we have N extracts and N-1 reduction ops. */
3241 epilogue_cost += add_stmt_cost (target_cost_data,
3242 nelements + nelements - 1,
3243 vector_stmt, stmt_info, 0,
3244 vect_epilogue);
3248 if (dump_enabled_p ())
3249 dump_printf (MSG_NOTE,
3250 "vect_model_reduction_cost: inside_cost = %d, "
3251 "prologue_cost = %d, epilogue_cost = %d .\n", inside_cost,
3252 prologue_cost, epilogue_cost);
3254 return true;
3258 /* Function vect_model_induction_cost.
3260 Models cost for induction operations. */
3262 static void
3263 vect_model_induction_cost (stmt_vec_info stmt_info, int ncopies)
3265 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
3266 void *target_cost_data = LOOP_VINFO_TARGET_COST_DATA (loop_vinfo);
3267 unsigned inside_cost, prologue_cost;
3269 /* loop cost for vec_loop. */
3270 inside_cost = add_stmt_cost (target_cost_data, ncopies, vector_stmt,
3271 stmt_info, 0, vect_body);
3273 /* prologue cost for vec_init and vec_step. */
3274 prologue_cost = add_stmt_cost (target_cost_data, 2, scalar_to_vec,
3275 stmt_info, 0, vect_prologue);
3277 if (dump_enabled_p ())
3278 dump_printf_loc (MSG_NOTE, vect_location,
3279 "vect_model_induction_cost: inside_cost = %d, "
3280 "prologue_cost = %d .\n", inside_cost, prologue_cost);
3284 /* Function get_initial_def_for_induction
3286 Input:
3287 STMT - a stmt that performs an induction operation in the loop.
3288 IV_PHI - the initial value of the induction variable
3290 Output:
3291 Return a vector variable, initialized with the first VF values of
3292 the induction variable. E.g., for an iv with IV_PHI='X' and
3293 evolution S, for a vector of 4 units, we want to return:
3294 [X, X + S, X + 2*S, X + 3*S]. */
3296 static tree
3297 get_initial_def_for_induction (gimple iv_phi)
3299 stmt_vec_info stmt_vinfo = vinfo_for_stmt (iv_phi);
3300 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_vinfo);
3301 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
3302 tree vectype;
3303 int nunits;
3304 edge pe = loop_preheader_edge (loop);
3305 struct loop *iv_loop;
3306 basic_block new_bb;
3307 tree new_vec, vec_init, vec_step, t;
3308 tree new_var;
3309 tree new_name;
3310 gimple init_stmt, new_stmt;
3311 gphi *induction_phi;
3312 tree induc_def, vec_def, vec_dest;
3313 tree init_expr, step_expr;
3314 int vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
3315 int i;
3316 int ncopies;
3317 tree expr;
3318 stmt_vec_info phi_info = vinfo_for_stmt (iv_phi);
3319 bool nested_in_vect_loop = false;
3320 gimple_seq stmts = NULL;
3321 imm_use_iterator imm_iter;
3322 use_operand_p use_p;
3323 gimple exit_phi;
3324 edge latch_e;
3325 tree loop_arg;
3326 gimple_stmt_iterator si;
3327 basic_block bb = gimple_bb (iv_phi);
3328 tree stepvectype;
3329 tree resvectype;
3331 /* Is phi in an inner-loop, while vectorizing an enclosing outer-loop? */
3332 if (nested_in_vect_loop_p (loop, iv_phi))
3334 nested_in_vect_loop = true;
3335 iv_loop = loop->inner;
3337 else
3338 iv_loop = loop;
3339 gcc_assert (iv_loop == (gimple_bb (iv_phi))->loop_father);
3341 latch_e = loop_latch_edge (iv_loop);
3342 loop_arg = PHI_ARG_DEF_FROM_EDGE (iv_phi, latch_e);
3344 step_expr = STMT_VINFO_LOOP_PHI_EVOLUTION_PART (phi_info);
3345 gcc_assert (step_expr != NULL_TREE);
3347 pe = loop_preheader_edge (iv_loop);
3348 init_expr = PHI_ARG_DEF_FROM_EDGE (iv_phi,
3349 loop_preheader_edge (iv_loop));
3351 vectype = get_vectype_for_scalar_type (TREE_TYPE (init_expr));
3352 resvectype = get_vectype_for_scalar_type (TREE_TYPE (PHI_RESULT (iv_phi)));
3353 gcc_assert (vectype);
3354 nunits = TYPE_VECTOR_SUBPARTS (vectype);
3355 ncopies = vf / nunits;
3357 gcc_assert (phi_info);
3358 gcc_assert (ncopies >= 1);
3360 /* Convert the step to the desired type. */
3361 step_expr = force_gimple_operand (fold_convert (TREE_TYPE (vectype),
3362 step_expr),
3363 &stmts, true, NULL_TREE);
3364 if (stmts)
3366 new_bb = gsi_insert_seq_on_edge_immediate (pe, stmts);
3367 gcc_assert (!new_bb);
3370 /* Find the first insertion point in the BB. */
3371 si = gsi_after_labels (bb);
3373 /* Create the vector that holds the initial_value of the induction. */
3374 if (nested_in_vect_loop)
3376 /* iv_loop is nested in the loop to be vectorized. init_expr had already
3377 been created during vectorization of previous stmts. We obtain it
3378 from the STMT_VINFO_VEC_STMT of the defining stmt. */
3379 vec_init = vect_get_vec_def_for_operand (init_expr, iv_phi, NULL);
3380 /* If the initial value is not of proper type, convert it. */
3381 if (!useless_type_conversion_p (vectype, TREE_TYPE (vec_init)))
3383 new_stmt = gimple_build_assign_with_ops
3384 (VIEW_CONVERT_EXPR,
3385 vect_get_new_vect_var (vectype, vect_simple_var, "vec_iv_"),
3386 build1 (VIEW_CONVERT_EXPR, vectype, vec_init));
3387 vec_init = make_ssa_name (gimple_assign_lhs (new_stmt), new_stmt);
3388 gimple_assign_set_lhs (new_stmt, vec_init);
3389 new_bb = gsi_insert_on_edge_immediate (loop_preheader_edge (iv_loop),
3390 new_stmt);
3391 gcc_assert (!new_bb);
3392 set_vinfo_for_stmt (new_stmt,
3393 new_stmt_vec_info (new_stmt, loop_vinfo, NULL));
3396 else
3398 vec<constructor_elt, va_gc> *v;
3400 /* iv_loop is the loop to be vectorized. Create:
3401 vec_init = [X, X+S, X+2*S, X+3*S] (S = step_expr, X = init_expr) */
3402 new_var = vect_get_new_vect_var (TREE_TYPE (vectype),
3403 vect_scalar_var, "var_");
3404 new_name = force_gimple_operand (fold_convert (TREE_TYPE (vectype),
3405 init_expr),
3406 &stmts, false, new_var);
3407 if (stmts)
3409 new_bb = gsi_insert_seq_on_edge_immediate (pe, stmts);
3410 gcc_assert (!new_bb);
3413 vec_alloc (v, nunits);
3414 bool constant_p = is_gimple_min_invariant (new_name);
3415 CONSTRUCTOR_APPEND_ELT (v, NULL_TREE, new_name);
3416 for (i = 1; i < nunits; i++)
3418 /* Create: new_name_i = new_name + step_expr */
3419 new_name = fold_build2 (PLUS_EXPR, TREE_TYPE (new_name),
3420 new_name, step_expr);
3421 if (!is_gimple_min_invariant (new_name))
3423 init_stmt = gimple_build_assign (new_var, new_name);
3424 new_name = make_ssa_name (new_var, init_stmt);
3425 gimple_assign_set_lhs (init_stmt, new_name);
3426 new_bb = gsi_insert_on_edge_immediate (pe, init_stmt);
3427 gcc_assert (!new_bb);
3428 if (dump_enabled_p ())
3430 dump_printf_loc (MSG_NOTE, vect_location,
3431 "created new init_stmt: ");
3432 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, init_stmt, 0);
3433 dump_printf (MSG_NOTE, "\n");
3435 constant_p = false;
3437 CONSTRUCTOR_APPEND_ELT (v, NULL_TREE, new_name);
3439 /* Create a vector from [new_name_0, new_name_1, ..., new_name_nunits-1] */
3440 if (constant_p)
3441 new_vec = build_vector_from_ctor (vectype, v);
3442 else
3443 new_vec = build_constructor (vectype, v);
3444 vec_init = vect_init_vector (iv_phi, new_vec, vectype, NULL);
3448 /* Create the vector that holds the step of the induction. */
3449 if (nested_in_vect_loop)
3450 /* iv_loop is nested in the loop to be vectorized. Generate:
3451 vec_step = [S, S, S, S] */
3452 new_name = step_expr;
3453 else
3455 /* iv_loop is the loop to be vectorized. Generate:
3456 vec_step = [VF*S, VF*S, VF*S, VF*S] */
3457 if (SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr)))
3459 expr = build_int_cst (integer_type_node, vf);
3460 expr = fold_convert (TREE_TYPE (step_expr), expr);
3462 else
3463 expr = build_int_cst (TREE_TYPE (step_expr), vf);
3464 new_name = fold_build2 (MULT_EXPR, TREE_TYPE (step_expr),
3465 expr, step_expr);
3466 if (TREE_CODE (step_expr) == SSA_NAME)
3467 new_name = vect_init_vector (iv_phi, new_name,
3468 TREE_TYPE (step_expr), NULL);
3471 t = unshare_expr (new_name);
3472 gcc_assert (CONSTANT_CLASS_P (new_name)
3473 || TREE_CODE (new_name) == SSA_NAME);
3474 stepvectype = get_vectype_for_scalar_type (TREE_TYPE (new_name));
3475 gcc_assert (stepvectype);
3476 new_vec = build_vector_from_val (stepvectype, t);
3477 vec_step = vect_init_vector (iv_phi, new_vec, stepvectype, NULL);
3480 /* Create the following def-use cycle:
3481 loop prolog:
3482 vec_init = ...
3483 vec_step = ...
3484 loop:
3485 vec_iv = PHI <vec_init, vec_loop>
3487 STMT
3489 vec_loop = vec_iv + vec_step; */
3491 /* Create the induction-phi that defines the induction-operand. */
3492 vec_dest = vect_get_new_vect_var (vectype, vect_simple_var, "vec_iv_");
3493 induction_phi = create_phi_node (vec_dest, iv_loop->header);
3494 set_vinfo_for_stmt (induction_phi,
3495 new_stmt_vec_info (induction_phi, loop_vinfo, NULL));
3496 induc_def = PHI_RESULT (induction_phi);
3498 /* Create the iv update inside the loop */
3499 new_stmt = gimple_build_assign_with_ops (PLUS_EXPR, vec_dest,
3500 induc_def, vec_step);
3501 vec_def = make_ssa_name (vec_dest, new_stmt);
3502 gimple_assign_set_lhs (new_stmt, vec_def);
3503 gsi_insert_before (&si, new_stmt, GSI_SAME_STMT);
3504 set_vinfo_for_stmt (new_stmt, new_stmt_vec_info (new_stmt, loop_vinfo,
3505 NULL));
3507 /* Set the arguments of the phi node: */
3508 add_phi_arg (induction_phi, vec_init, pe, UNKNOWN_LOCATION);
3509 add_phi_arg (induction_phi, vec_def, loop_latch_edge (iv_loop),
3510 UNKNOWN_LOCATION);
3513 /* In case that vectorization factor (VF) is bigger than the number
3514 of elements that we can fit in a vectype (nunits), we have to generate
3515 more than one vector stmt - i.e - we need to "unroll" the
3516 vector stmt by a factor VF/nunits. For more details see documentation
3517 in vectorizable_operation. */
3519 if (ncopies > 1)
3521 stmt_vec_info prev_stmt_vinfo;
3522 /* FORNOW. This restriction should be relaxed. */
3523 gcc_assert (!nested_in_vect_loop);
3525 /* Create the vector that holds the step of the induction. */
3526 if (SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr)))
3528 expr = build_int_cst (integer_type_node, nunits);
3529 expr = fold_convert (TREE_TYPE (step_expr), expr);
3531 else
3532 expr = build_int_cst (TREE_TYPE (step_expr), nunits);
3533 new_name = fold_build2 (MULT_EXPR, TREE_TYPE (step_expr),
3534 expr, step_expr);
3535 if (TREE_CODE (step_expr) == SSA_NAME)
3536 new_name = vect_init_vector (iv_phi, new_name,
3537 TREE_TYPE (step_expr), NULL);
3538 t = unshare_expr (new_name);
3539 gcc_assert (CONSTANT_CLASS_P (new_name)
3540 || TREE_CODE (new_name) == SSA_NAME);
3541 new_vec = build_vector_from_val (stepvectype, t);
3542 vec_step = vect_init_vector (iv_phi, new_vec, stepvectype, NULL);
3544 vec_def = induc_def;
3545 prev_stmt_vinfo = vinfo_for_stmt (induction_phi);
3546 for (i = 1; i < ncopies; i++)
3548 /* vec_i = vec_prev + vec_step */
3549 new_stmt = gimple_build_assign_with_ops (PLUS_EXPR, vec_dest,
3550 vec_def, vec_step);
3551 vec_def = make_ssa_name (vec_dest, new_stmt);
3552 gimple_assign_set_lhs (new_stmt, vec_def);
3554 gsi_insert_before (&si, new_stmt, GSI_SAME_STMT);
3555 if (!useless_type_conversion_p (resvectype, vectype))
3557 new_stmt = gimple_build_assign_with_ops
3558 (VIEW_CONVERT_EXPR,
3559 vect_get_new_vect_var (resvectype, vect_simple_var,
3560 "vec_iv_"),
3561 build1 (VIEW_CONVERT_EXPR, resvectype,
3562 gimple_assign_lhs (new_stmt)));
3563 gimple_assign_set_lhs (new_stmt,
3564 make_ssa_name
3565 (gimple_assign_lhs (new_stmt), new_stmt));
3566 gsi_insert_before (&si, new_stmt, GSI_SAME_STMT);
3568 set_vinfo_for_stmt (new_stmt,
3569 new_stmt_vec_info (new_stmt, loop_vinfo, NULL));
3570 STMT_VINFO_RELATED_STMT (prev_stmt_vinfo) = new_stmt;
3571 prev_stmt_vinfo = vinfo_for_stmt (new_stmt);
3575 if (nested_in_vect_loop)
3577 /* Find the loop-closed exit-phi of the induction, and record
3578 the final vector of induction results: */
3579 exit_phi = NULL;
3580 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, loop_arg)
3582 gimple use_stmt = USE_STMT (use_p);
3583 if (is_gimple_debug (use_stmt))
3584 continue;
3586 if (!flow_bb_inside_loop_p (iv_loop, gimple_bb (use_stmt)))
3588 exit_phi = use_stmt;
3589 break;
3592 if (exit_phi)
3594 stmt_vec_info stmt_vinfo = vinfo_for_stmt (exit_phi);
3595 /* FORNOW. Currently not supporting the case that an inner-loop induction
3596 is not used in the outer-loop (i.e. only outside the outer-loop). */
3597 gcc_assert (STMT_VINFO_RELEVANT_P (stmt_vinfo)
3598 && !STMT_VINFO_LIVE_P (stmt_vinfo));
3600 STMT_VINFO_VEC_STMT (stmt_vinfo) = new_stmt;
3601 if (dump_enabled_p ())
3603 dump_printf_loc (MSG_NOTE, vect_location,
3604 "vector of inductions after inner-loop:");
3605 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, new_stmt, 0);
3606 dump_printf (MSG_NOTE, "\n");
3612 if (dump_enabled_p ())
3614 dump_printf_loc (MSG_NOTE, vect_location,
3615 "transform induction: created def-use cycle: ");
3616 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, induction_phi, 0);
3617 dump_printf (MSG_NOTE, "\n");
3618 dump_gimple_stmt (MSG_NOTE, TDF_SLIM,
3619 SSA_NAME_DEF_STMT (vec_def), 0);
3620 dump_printf (MSG_NOTE, "\n");
3623 STMT_VINFO_VEC_STMT (phi_info) = induction_phi;
3624 if (!useless_type_conversion_p (resvectype, vectype))
3626 new_stmt = gimple_build_assign_with_ops
3627 (VIEW_CONVERT_EXPR,
3628 vect_get_new_vect_var (resvectype, vect_simple_var, "vec_iv_"),
3629 build1 (VIEW_CONVERT_EXPR, resvectype, induc_def));
3630 induc_def = make_ssa_name (gimple_assign_lhs (new_stmt), new_stmt);
3631 gimple_assign_set_lhs (new_stmt, induc_def);
3632 si = gsi_after_labels (bb);
3633 gsi_insert_before (&si, new_stmt, GSI_SAME_STMT);
3634 set_vinfo_for_stmt (new_stmt,
3635 new_stmt_vec_info (new_stmt, loop_vinfo, NULL));
3636 STMT_VINFO_RELATED_STMT (vinfo_for_stmt (new_stmt))
3637 = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (induction_phi));
3640 return induc_def;
3644 /* Function get_initial_def_for_reduction
3646 Input:
3647 STMT - a stmt that performs a reduction operation in the loop.
3648 INIT_VAL - the initial value of the reduction variable
3650 Output:
3651 ADJUSTMENT_DEF - a tree that holds a value to be added to the final result
3652 of the reduction (used for adjusting the epilog - see below).
3653 Return a vector variable, initialized according to the operation that STMT
3654 performs. This vector will be used as the initial value of the
3655 vector of partial results.
3657 Option1 (adjust in epilog): Initialize the vector as follows:
3658 add/bit or/xor: [0,0,...,0,0]
3659 mult/bit and: [1,1,...,1,1]
3660 min/max/cond_expr: [init_val,init_val,..,init_val,init_val]
3661 and when necessary (e.g. add/mult case) let the caller know
3662 that it needs to adjust the result by init_val.
3664 Option2: Initialize the vector as follows:
3665 add/bit or/xor: [init_val,0,0,...,0]
3666 mult/bit and: [init_val,1,1,...,1]
3667 min/max/cond_expr: [init_val,init_val,...,init_val]
3668 and no adjustments are needed.
3670 For example, for the following code:
3672 s = init_val;
3673 for (i=0;i<n;i++)
3674 s = s + a[i];
3676 STMT is 's = s + a[i]', and the reduction variable is 's'.
3677 For a vector of 4 units, we want to return either [0,0,0,init_val],
3678 or [0,0,0,0] and let the caller know that it needs to adjust
3679 the result at the end by 'init_val'.
3681 FORNOW, we are using the 'adjust in epilog' scheme, because this way the
3682 initialization vector is simpler (same element in all entries), if
3683 ADJUSTMENT_DEF is not NULL, and Option2 otherwise.
3685 A cost model should help decide between these two schemes. */
3687 tree
3688 get_initial_def_for_reduction (gimple stmt, tree init_val,
3689 tree *adjustment_def)
3691 stmt_vec_info stmt_vinfo = vinfo_for_stmt (stmt);
3692 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_vinfo);
3693 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
3694 tree scalar_type = TREE_TYPE (init_val);
3695 tree vectype = get_vectype_for_scalar_type (scalar_type);
3696 int nunits;
3697 enum tree_code code = gimple_assign_rhs_code (stmt);
3698 tree def_for_init;
3699 tree init_def;
3700 tree *elts;
3701 int i;
3702 bool nested_in_vect_loop = false;
3703 tree init_value;
3704 REAL_VALUE_TYPE real_init_val = dconst0;
3705 int int_init_val = 0;
3706 gimple def_stmt = NULL;
3708 gcc_assert (vectype);
3709 nunits = TYPE_VECTOR_SUBPARTS (vectype);
3711 gcc_assert (POINTER_TYPE_P (scalar_type) || INTEGRAL_TYPE_P (scalar_type)
3712 || SCALAR_FLOAT_TYPE_P (scalar_type));
3714 if (nested_in_vect_loop_p (loop, stmt))
3715 nested_in_vect_loop = true;
3716 else
3717 gcc_assert (loop == (gimple_bb (stmt))->loop_father);
3719 /* In case of double reduction we only create a vector variable to be put
3720 in the reduction phi node. The actual statement creation is done in
3721 vect_create_epilog_for_reduction. */
3722 if (adjustment_def && nested_in_vect_loop
3723 && TREE_CODE (init_val) == SSA_NAME
3724 && (def_stmt = SSA_NAME_DEF_STMT (init_val))
3725 && gimple_code (def_stmt) == GIMPLE_PHI
3726 && flow_bb_inside_loop_p (loop, gimple_bb (def_stmt))
3727 && vinfo_for_stmt (def_stmt)
3728 && STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def_stmt))
3729 == vect_double_reduction_def)
3731 *adjustment_def = NULL;
3732 return vect_create_destination_var (init_val, vectype);
3735 if (TREE_CONSTANT (init_val))
3737 if (SCALAR_FLOAT_TYPE_P (scalar_type))
3738 init_value = build_real (scalar_type, TREE_REAL_CST (init_val));
3739 else
3740 init_value = build_int_cst (scalar_type, TREE_INT_CST_LOW (init_val));
3742 else
3743 init_value = init_val;
3745 switch (code)
3747 case WIDEN_SUM_EXPR:
3748 case DOT_PROD_EXPR:
3749 case SAD_EXPR:
3750 case PLUS_EXPR:
3751 case MINUS_EXPR:
3752 case BIT_IOR_EXPR:
3753 case BIT_XOR_EXPR:
3754 case MULT_EXPR:
3755 case BIT_AND_EXPR:
3756 /* ADJUSMENT_DEF is NULL when called from
3757 vect_create_epilog_for_reduction to vectorize double reduction. */
3758 if (adjustment_def)
3760 if (nested_in_vect_loop)
3761 *adjustment_def = vect_get_vec_def_for_operand (init_val, stmt,
3762 NULL);
3763 else
3764 *adjustment_def = init_val;
3767 if (code == MULT_EXPR)
3769 real_init_val = dconst1;
3770 int_init_val = 1;
3773 if (code == BIT_AND_EXPR)
3774 int_init_val = -1;
3776 if (SCALAR_FLOAT_TYPE_P (scalar_type))
3777 def_for_init = build_real (scalar_type, real_init_val);
3778 else
3779 def_for_init = build_int_cst (scalar_type, int_init_val);
3781 /* Create a vector of '0' or '1' except the first element. */
3782 elts = XALLOCAVEC (tree, nunits);
3783 for (i = nunits - 2; i >= 0; --i)
3784 elts[i + 1] = def_for_init;
3786 /* Option1: the first element is '0' or '1' as well. */
3787 if (adjustment_def)
3789 elts[0] = def_for_init;
3790 init_def = build_vector (vectype, elts);
3791 break;
3794 /* Option2: the first element is INIT_VAL. */
3795 elts[0] = init_val;
3796 if (TREE_CONSTANT (init_val))
3797 init_def = build_vector (vectype, elts);
3798 else
3800 vec<constructor_elt, va_gc> *v;
3801 vec_alloc (v, nunits);
3802 CONSTRUCTOR_APPEND_ELT (v, NULL_TREE, init_val);
3803 for (i = 1; i < nunits; ++i)
3804 CONSTRUCTOR_APPEND_ELT (v, NULL_TREE, elts[i]);
3805 init_def = build_constructor (vectype, v);
3808 break;
3810 case MIN_EXPR:
3811 case MAX_EXPR:
3812 case COND_EXPR:
3813 if (adjustment_def)
3815 *adjustment_def = NULL_TREE;
3816 init_def = vect_get_vec_def_for_operand (init_val, stmt, NULL);
3817 break;
3820 init_def = build_vector_from_val (vectype, init_value);
3821 break;
3823 default:
3824 gcc_unreachable ();
3827 return init_def;
3830 /* Function vect_create_epilog_for_reduction
3832 Create code at the loop-epilog to finalize the result of a reduction
3833 computation.
3835 VECT_DEFS is list of vector of partial results, i.e., the lhs's of vector
3836 reduction statements.
3837 STMT is the scalar reduction stmt that is being vectorized.
3838 NCOPIES is > 1 in case the vectorization factor (VF) is bigger than the
3839 number of elements that we can fit in a vectype (nunits). In this case
3840 we have to generate more than one vector stmt - i.e - we need to "unroll"
3841 the vector stmt by a factor VF/nunits. For more details see documentation
3842 in vectorizable_operation.
3843 REDUC_CODE is the tree-code for the epilog reduction.
3844 REDUCTION_PHIS is a list of the phi-nodes that carry the reduction
3845 computation.
3846 REDUC_INDEX is the index of the operand in the right hand side of the
3847 statement that is defined by REDUCTION_PHI.
3848 DOUBLE_REDUC is TRUE if double reduction phi nodes should be handled.
3849 SLP_NODE is an SLP node containing a group of reduction statements. The
3850 first one in this group is STMT.
3852 This function:
3853 1. Creates the reduction def-use cycles: sets the arguments for
3854 REDUCTION_PHIS:
3855 The loop-entry argument is the vectorized initial-value of the reduction.
3856 The loop-latch argument is taken from VECT_DEFS - the vector of partial
3857 sums.
3858 2. "Reduces" each vector of partial results VECT_DEFS into a single result,
3859 by applying the operation specified by REDUC_CODE if available, or by
3860 other means (whole-vector shifts or a scalar loop).
3861 The function also creates a new phi node at the loop exit to preserve
3862 loop-closed form, as illustrated below.
3864 The flow at the entry to this function:
3866 loop:
3867 vec_def = phi <null, null> # REDUCTION_PHI
3868 VECT_DEF = vector_stmt # vectorized form of STMT
3869 s_loop = scalar_stmt # (scalar) STMT
3870 loop_exit:
3871 s_out0 = phi <s_loop> # (scalar) EXIT_PHI
3872 use <s_out0>
3873 use <s_out0>
3875 The above is transformed by this function into:
3877 loop:
3878 vec_def = phi <vec_init, VECT_DEF> # REDUCTION_PHI
3879 VECT_DEF = vector_stmt # vectorized form of STMT
3880 s_loop = scalar_stmt # (scalar) STMT
3881 loop_exit:
3882 s_out0 = phi <s_loop> # (scalar) EXIT_PHI
3883 v_out1 = phi <VECT_DEF> # NEW_EXIT_PHI
3884 v_out2 = reduce <v_out1>
3885 s_out3 = extract_field <v_out2, 0>
3886 s_out4 = adjust_result <s_out3>
3887 use <s_out4>
3888 use <s_out4>
3891 static void
3892 vect_create_epilog_for_reduction (vec<tree> vect_defs, gimple stmt,
3893 int ncopies, enum tree_code reduc_code,
3894 vec<gimple> reduction_phis,
3895 int reduc_index, bool double_reduc,
3896 slp_tree slp_node)
3898 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
3899 stmt_vec_info prev_phi_info;
3900 tree vectype;
3901 machine_mode mode;
3902 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
3903 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo), *outer_loop = NULL;
3904 basic_block exit_bb;
3905 tree scalar_dest;
3906 tree scalar_type;
3907 gimple new_phi = NULL, phi;
3908 gimple_stmt_iterator exit_gsi;
3909 tree vec_dest;
3910 tree new_temp = NULL_TREE, new_dest, new_name, new_scalar_dest;
3911 gimple epilog_stmt = NULL;
3912 enum tree_code code = gimple_assign_rhs_code (stmt);
3913 gimple exit_phi;
3914 tree bitsize;
3915 tree adjustment_def = NULL;
3916 tree vec_initial_def = NULL;
3917 tree reduction_op, expr, def;
3918 tree orig_name, scalar_result;
3919 imm_use_iterator imm_iter, phi_imm_iter;
3920 use_operand_p use_p, phi_use_p;
3921 gimple use_stmt, orig_stmt, reduction_phi = NULL;
3922 bool nested_in_vect_loop = false;
3923 auto_vec<gimple> new_phis;
3924 auto_vec<gimple> inner_phis;
3925 enum vect_def_type dt = vect_unknown_def_type;
3926 int j, i;
3927 auto_vec<tree> scalar_results;
3928 unsigned int group_size = 1, k, ratio;
3929 auto_vec<tree> vec_initial_defs;
3930 auto_vec<gimple> phis;
3931 bool slp_reduc = false;
3932 tree new_phi_result;
3933 gimple inner_phi = NULL;
3935 if (slp_node)
3936 group_size = SLP_TREE_SCALAR_STMTS (slp_node).length ();
3938 if (nested_in_vect_loop_p (loop, stmt))
3940 outer_loop = loop;
3941 loop = loop->inner;
3942 nested_in_vect_loop = true;
3943 gcc_assert (!slp_node);
3946 switch (get_gimple_rhs_class (gimple_assign_rhs_code (stmt)))
3948 case GIMPLE_SINGLE_RHS:
3949 gcc_assert (TREE_OPERAND_LENGTH (gimple_assign_rhs1 (stmt))
3950 == ternary_op);
3951 reduction_op = TREE_OPERAND (gimple_assign_rhs1 (stmt), reduc_index);
3952 break;
3953 case GIMPLE_UNARY_RHS:
3954 reduction_op = gimple_assign_rhs1 (stmt);
3955 break;
3956 case GIMPLE_BINARY_RHS:
3957 reduction_op = reduc_index ?
3958 gimple_assign_rhs2 (stmt) : gimple_assign_rhs1 (stmt);
3959 break;
3960 case GIMPLE_TERNARY_RHS:
3961 reduction_op = gimple_op (stmt, reduc_index + 1);
3962 break;
3963 default:
3964 gcc_unreachable ();
3967 vectype = get_vectype_for_scalar_type (TREE_TYPE (reduction_op));
3968 gcc_assert (vectype);
3969 mode = TYPE_MODE (vectype);
3971 /* 1. Create the reduction def-use cycle:
3972 Set the arguments of REDUCTION_PHIS, i.e., transform
3974 loop:
3975 vec_def = phi <null, null> # REDUCTION_PHI
3976 VECT_DEF = vector_stmt # vectorized form of STMT
3979 into:
3981 loop:
3982 vec_def = phi <vec_init, VECT_DEF> # REDUCTION_PHI
3983 VECT_DEF = vector_stmt # vectorized form of STMT
3986 (in case of SLP, do it for all the phis). */
3988 /* Get the loop-entry arguments. */
3989 if (slp_node)
3990 vect_get_vec_defs (reduction_op, NULL_TREE, stmt, &vec_initial_defs,
3991 NULL, slp_node, reduc_index);
3992 else
3994 vec_initial_defs.create (1);
3995 /* For the case of reduction, vect_get_vec_def_for_operand returns
3996 the scalar def before the loop, that defines the initial value
3997 of the reduction variable. */
3998 vec_initial_def = vect_get_vec_def_for_operand (reduction_op, stmt,
3999 &adjustment_def);
4000 vec_initial_defs.quick_push (vec_initial_def);
4003 /* Set phi nodes arguments. */
4004 FOR_EACH_VEC_ELT (reduction_phis, i, phi)
4006 tree vec_init_def, def;
4007 gimple_seq stmts;
4008 vec_init_def = force_gimple_operand (vec_initial_defs[i], &stmts,
4009 true, NULL_TREE);
4010 gsi_insert_seq_on_edge_immediate (loop_preheader_edge (loop), stmts);
4011 def = vect_defs[i];
4012 for (j = 0; j < ncopies; j++)
4014 /* Set the loop-entry arg of the reduction-phi. */
4015 add_phi_arg (as_a <gphi *> (phi), vec_init_def,
4016 loop_preheader_edge (loop), UNKNOWN_LOCATION);
4018 /* Set the loop-latch arg for the reduction-phi. */
4019 if (j > 0)
4020 def = vect_get_vec_def_for_stmt_copy (vect_unknown_def_type, def);
4022 add_phi_arg (as_a <gphi *> (phi), def, loop_latch_edge (loop),
4023 UNKNOWN_LOCATION);
4025 if (dump_enabled_p ())
4027 dump_printf_loc (MSG_NOTE, vect_location,
4028 "transform reduction: created def-use cycle: ");
4029 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, phi, 0);
4030 dump_printf (MSG_NOTE, "\n");
4031 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, SSA_NAME_DEF_STMT (def), 0);
4032 dump_printf (MSG_NOTE, "\n");
4035 phi = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (phi));
4039 /* 2. Create epilog code.
4040 The reduction epilog code operates across the elements of the vector
4041 of partial results computed by the vectorized loop.
4042 The reduction epilog code consists of:
4044 step 1: compute the scalar result in a vector (v_out2)
4045 step 2: extract the scalar result (s_out3) from the vector (v_out2)
4046 step 3: adjust the scalar result (s_out3) if needed.
4048 Step 1 can be accomplished using one the following three schemes:
4049 (scheme 1) using reduc_code, if available.
4050 (scheme 2) using whole-vector shifts, if available.
4051 (scheme 3) using a scalar loop. In this case steps 1+2 above are
4052 combined.
4054 The overall epilog code looks like this:
4056 s_out0 = phi <s_loop> # original EXIT_PHI
4057 v_out1 = phi <VECT_DEF> # NEW_EXIT_PHI
4058 v_out2 = reduce <v_out1> # step 1
4059 s_out3 = extract_field <v_out2, 0> # step 2
4060 s_out4 = adjust_result <s_out3> # step 3
4062 (step 3 is optional, and steps 1 and 2 may be combined).
4063 Lastly, the uses of s_out0 are replaced by s_out4. */
4066 /* 2.1 Create new loop-exit-phis to preserve loop-closed form:
4067 v_out1 = phi <VECT_DEF>
4068 Store them in NEW_PHIS. */
4070 exit_bb = single_exit (loop)->dest;
4071 prev_phi_info = NULL;
4072 new_phis.create (vect_defs.length ());
4073 FOR_EACH_VEC_ELT (vect_defs, i, def)
4075 for (j = 0; j < ncopies; j++)
4077 tree new_def = copy_ssa_name (def, NULL);
4078 phi = create_phi_node (new_def, exit_bb);
4079 set_vinfo_for_stmt (phi, new_stmt_vec_info (phi, loop_vinfo, NULL));
4080 if (j == 0)
4081 new_phis.quick_push (phi);
4082 else
4084 def = vect_get_vec_def_for_stmt_copy (dt, def);
4085 STMT_VINFO_RELATED_STMT (prev_phi_info) = phi;
4088 SET_PHI_ARG_DEF (phi, single_exit (loop)->dest_idx, def);
4089 prev_phi_info = vinfo_for_stmt (phi);
4093 /* The epilogue is created for the outer-loop, i.e., for the loop being
4094 vectorized. Create exit phis for the outer loop. */
4095 if (double_reduc)
4097 loop = outer_loop;
4098 exit_bb = single_exit (loop)->dest;
4099 inner_phis.create (vect_defs.length ());
4100 FOR_EACH_VEC_ELT (new_phis, i, phi)
4102 tree new_result = copy_ssa_name (PHI_RESULT (phi), NULL);
4103 gphi *outer_phi = create_phi_node (new_result, exit_bb);
4104 SET_PHI_ARG_DEF (outer_phi, single_exit (loop)->dest_idx,
4105 PHI_RESULT (phi));
4106 set_vinfo_for_stmt (outer_phi, new_stmt_vec_info (outer_phi,
4107 loop_vinfo, NULL));
4108 inner_phis.quick_push (phi);
4109 new_phis[i] = outer_phi;
4110 prev_phi_info = vinfo_for_stmt (outer_phi);
4111 while (STMT_VINFO_RELATED_STMT (vinfo_for_stmt (phi)))
4113 phi = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (phi));
4114 new_result = copy_ssa_name (PHI_RESULT (phi), NULL);
4115 outer_phi = create_phi_node (new_result, exit_bb);
4116 SET_PHI_ARG_DEF (outer_phi, single_exit (loop)->dest_idx,
4117 PHI_RESULT (phi));
4118 set_vinfo_for_stmt (outer_phi, new_stmt_vec_info (outer_phi,
4119 loop_vinfo, NULL));
4120 STMT_VINFO_RELATED_STMT (prev_phi_info) = outer_phi;
4121 prev_phi_info = vinfo_for_stmt (outer_phi);
4126 exit_gsi = gsi_after_labels (exit_bb);
4128 /* 2.2 Get the relevant tree-code to use in the epilog for schemes 2,3
4129 (i.e. when reduc_code is not available) and in the final adjustment
4130 code (if needed). Also get the original scalar reduction variable as
4131 defined in the loop. In case STMT is a "pattern-stmt" (i.e. - it
4132 represents a reduction pattern), the tree-code and scalar-def are
4133 taken from the original stmt that the pattern-stmt (STMT) replaces.
4134 Otherwise (it is a regular reduction) - the tree-code and scalar-def
4135 are taken from STMT. */
4137 orig_stmt = STMT_VINFO_RELATED_STMT (stmt_info);
4138 if (!orig_stmt)
4140 /* Regular reduction */
4141 orig_stmt = stmt;
4143 else
4145 /* Reduction pattern */
4146 stmt_vec_info stmt_vinfo = vinfo_for_stmt (orig_stmt);
4147 gcc_assert (STMT_VINFO_IN_PATTERN_P (stmt_vinfo));
4148 gcc_assert (STMT_VINFO_RELATED_STMT (stmt_vinfo) == stmt);
4151 code = gimple_assign_rhs_code (orig_stmt);
4152 /* For MINUS_EXPR the initial vector is [init_val,0,...,0], therefore,
4153 partial results are added and not subtracted. */
4154 if (code == MINUS_EXPR)
4155 code = PLUS_EXPR;
4157 scalar_dest = gimple_assign_lhs (orig_stmt);
4158 scalar_type = TREE_TYPE (scalar_dest);
4159 scalar_results.create (group_size);
4160 new_scalar_dest = vect_create_destination_var (scalar_dest, NULL);
4161 bitsize = TYPE_SIZE (scalar_type);
4163 /* In case this is a reduction in an inner-loop while vectorizing an outer
4164 loop - we don't need to extract a single scalar result at the end of the
4165 inner-loop (unless it is double reduction, i.e., the use of reduction is
4166 outside the outer-loop). The final vector of partial results will be used
4167 in the vectorized outer-loop, or reduced to a scalar result at the end of
4168 the outer-loop. */
4169 if (nested_in_vect_loop && !double_reduc)
4170 goto vect_finalize_reduction;
4172 /* SLP reduction without reduction chain, e.g.,
4173 # a1 = phi <a2, a0>
4174 # b1 = phi <b2, b0>
4175 a2 = operation (a1)
4176 b2 = operation (b1) */
4177 slp_reduc = (slp_node && !GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)));
4179 /* In case of reduction chain, e.g.,
4180 # a1 = phi <a3, a0>
4181 a2 = operation (a1)
4182 a3 = operation (a2),
4184 we may end up with more than one vector result. Here we reduce them to
4185 one vector. */
4186 if (GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)))
4188 tree first_vect = PHI_RESULT (new_phis[0]);
4189 tree tmp;
4190 gassign *new_vec_stmt = NULL;
4192 vec_dest = vect_create_destination_var (scalar_dest, vectype);
4193 for (k = 1; k < new_phis.length (); k++)
4195 gimple next_phi = new_phis[k];
4196 tree second_vect = PHI_RESULT (next_phi);
4198 tmp = build2 (code, vectype, first_vect, second_vect);
4199 new_vec_stmt = gimple_build_assign (vec_dest, tmp);
4200 first_vect = make_ssa_name (vec_dest, new_vec_stmt);
4201 gimple_assign_set_lhs (new_vec_stmt, first_vect);
4202 gsi_insert_before (&exit_gsi, new_vec_stmt, GSI_SAME_STMT);
4205 new_phi_result = first_vect;
4206 if (new_vec_stmt)
4208 new_phis.truncate (0);
4209 new_phis.safe_push (new_vec_stmt);
4212 else
4213 new_phi_result = PHI_RESULT (new_phis[0]);
4215 /* 2.3 Create the reduction code, using one of the three schemes described
4216 above. In SLP we simply need to extract all the elements from the
4217 vector (without reducing them), so we use scalar shifts. */
4218 if (reduc_code != ERROR_MARK && !slp_reduc)
4220 tree tmp;
4221 tree vec_elem_type;
4223 /*** Case 1: Create:
4224 v_out2 = reduc_expr <v_out1> */
4226 if (dump_enabled_p ())
4227 dump_printf_loc (MSG_NOTE, vect_location,
4228 "Reduce using direct vector reduction.\n");
4230 vec_elem_type = TREE_TYPE (TREE_TYPE (new_phi_result));
4231 if (!useless_type_conversion_p (scalar_type, vec_elem_type))
4233 tree tmp_dest =
4234 vect_create_destination_var (scalar_dest, vec_elem_type);
4235 tmp = build1 (reduc_code, vec_elem_type, new_phi_result);
4236 epilog_stmt = gimple_build_assign (tmp_dest, tmp);
4237 new_temp = make_ssa_name (tmp_dest, epilog_stmt);
4238 gimple_assign_set_lhs (epilog_stmt, new_temp);
4239 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
4241 tmp = build1 (NOP_EXPR, scalar_type, new_temp);
4243 else
4244 tmp = build1 (reduc_code, scalar_type, new_phi_result);
4245 epilog_stmt = gimple_build_assign (new_scalar_dest, tmp);
4246 new_temp = make_ssa_name (new_scalar_dest, epilog_stmt);
4247 gimple_assign_set_lhs (epilog_stmt, new_temp);
4248 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
4249 scalar_results.safe_push (new_temp);
4251 else
4253 bool reduce_with_shift = have_whole_vector_shift (mode);
4254 int element_bitsize = tree_to_uhwi (bitsize);
4255 int vec_size_in_bits = tree_to_uhwi (TYPE_SIZE (vectype));
4256 tree vec_temp;
4258 /* Regardless of whether we have a whole vector shift, if we're
4259 emulating the operation via tree-vect-generic, we don't want
4260 to use it. Only the first round of the reduction is likely
4261 to still be profitable via emulation. */
4262 /* ??? It might be better to emit a reduction tree code here, so that
4263 tree-vect-generic can expand the first round via bit tricks. */
4264 if (!VECTOR_MODE_P (mode))
4265 reduce_with_shift = false;
4266 else
4268 optab optab = optab_for_tree_code (code, vectype, optab_default);
4269 if (optab_handler (optab, mode) == CODE_FOR_nothing)
4270 reduce_with_shift = false;
4273 if (reduce_with_shift && !slp_reduc)
4275 int nelements = vec_size_in_bits / element_bitsize;
4276 unsigned char *sel = XALLOCAVEC (unsigned char, nelements);
4278 int elt_offset;
4280 tree zero_vec = build_zero_cst (vectype);
4281 /*** Case 2: Create:
4282 for (offset = nelements/2; offset >= 1; offset/=2)
4284 Create: va' = vec_shift <va, offset>
4285 Create: va = vop <va, va'>
4286 } */
4288 tree rhs;
4290 if (dump_enabled_p ())
4291 dump_printf_loc (MSG_NOTE, vect_location,
4292 "Reduce using vector shifts\n");
4294 vec_dest = vect_create_destination_var (scalar_dest, vectype);
4295 new_temp = new_phi_result;
4296 for (elt_offset = nelements / 2;
4297 elt_offset >= 1;
4298 elt_offset /= 2)
4300 calc_vec_perm_mask_for_shift (mode, elt_offset, sel);
4301 tree mask = vect_gen_perm_mask_any (vectype, sel);
4302 epilog_stmt = gimple_build_assign_with_ops (VEC_PERM_EXPR,
4303 vec_dest, new_temp,
4304 zero_vec, mask);
4305 new_name = make_ssa_name (vec_dest, epilog_stmt);
4306 gimple_assign_set_lhs (epilog_stmt, new_name);
4307 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
4309 epilog_stmt = gimple_build_assign_with_ops (code, vec_dest,
4310 new_name, new_temp);
4311 new_temp = make_ssa_name (vec_dest, epilog_stmt);
4312 gimple_assign_set_lhs (epilog_stmt, new_temp);
4313 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
4316 /* 2.4 Extract the final scalar result. Create:
4317 s_out3 = extract_field <v_out2, bitpos> */
4319 if (dump_enabled_p ())
4320 dump_printf_loc (MSG_NOTE, vect_location,
4321 "extract scalar result\n");
4323 rhs = build3 (BIT_FIELD_REF, scalar_type, new_temp,
4324 bitsize, bitsize_zero_node);
4325 epilog_stmt = gimple_build_assign (new_scalar_dest, rhs);
4326 new_temp = make_ssa_name (new_scalar_dest, epilog_stmt);
4327 gimple_assign_set_lhs (epilog_stmt, new_temp);
4328 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
4329 scalar_results.safe_push (new_temp);
4331 else
4333 /*** Case 3: Create:
4334 s = extract_field <v_out2, 0>
4335 for (offset = element_size;
4336 offset < vector_size;
4337 offset += element_size;)
4339 Create: s' = extract_field <v_out2, offset>
4340 Create: s = op <s, s'> // For non SLP cases
4341 } */
4343 if (dump_enabled_p ())
4344 dump_printf_loc (MSG_NOTE, vect_location,
4345 "Reduce using scalar code.\n");
4347 vec_size_in_bits = tree_to_uhwi (TYPE_SIZE (vectype));
4348 FOR_EACH_VEC_ELT (new_phis, i, new_phi)
4350 int bit_offset;
4351 if (gimple_code (new_phi) == GIMPLE_PHI)
4352 vec_temp = PHI_RESULT (new_phi);
4353 else
4354 vec_temp = gimple_assign_lhs (new_phi);
4355 tree rhs = build3 (BIT_FIELD_REF, scalar_type, vec_temp, bitsize,
4356 bitsize_zero_node);
4357 epilog_stmt = gimple_build_assign (new_scalar_dest, rhs);
4358 new_temp = make_ssa_name (new_scalar_dest, epilog_stmt);
4359 gimple_assign_set_lhs (epilog_stmt, new_temp);
4360 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
4362 /* In SLP we don't need to apply reduction operation, so we just
4363 collect s' values in SCALAR_RESULTS. */
4364 if (slp_reduc)
4365 scalar_results.safe_push (new_temp);
4367 for (bit_offset = element_bitsize;
4368 bit_offset < vec_size_in_bits;
4369 bit_offset += element_bitsize)
4371 tree bitpos = bitsize_int (bit_offset);
4372 tree rhs = build3 (BIT_FIELD_REF, scalar_type, vec_temp,
4373 bitsize, bitpos);
4375 epilog_stmt = gimple_build_assign (new_scalar_dest, rhs);
4376 new_name = make_ssa_name (new_scalar_dest, epilog_stmt);
4377 gimple_assign_set_lhs (epilog_stmt, new_name);
4378 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
4380 if (slp_reduc)
4382 /* In SLP we don't need to apply reduction operation, so
4383 we just collect s' values in SCALAR_RESULTS. */
4384 new_temp = new_name;
4385 scalar_results.safe_push (new_name);
4387 else
4389 epilog_stmt = gimple_build_assign_with_ops (code,
4390 new_scalar_dest, new_name, new_temp);
4391 new_temp = make_ssa_name (new_scalar_dest, epilog_stmt);
4392 gimple_assign_set_lhs (epilog_stmt, new_temp);
4393 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
4398 /* The only case where we need to reduce scalar results in SLP, is
4399 unrolling. If the size of SCALAR_RESULTS is greater than
4400 GROUP_SIZE, we reduce them combining elements modulo
4401 GROUP_SIZE. */
4402 if (slp_reduc)
4404 tree res, first_res, new_res;
4405 gimple new_stmt;
4407 /* Reduce multiple scalar results in case of SLP unrolling. */
4408 for (j = group_size; scalar_results.iterate (j, &res);
4409 j++)
4411 first_res = scalar_results[j % group_size];
4412 new_stmt = gimple_build_assign_with_ops (code,
4413 new_scalar_dest, first_res, res);
4414 new_res = make_ssa_name (new_scalar_dest, new_stmt);
4415 gimple_assign_set_lhs (new_stmt, new_res);
4416 gsi_insert_before (&exit_gsi, new_stmt, GSI_SAME_STMT);
4417 scalar_results[j % group_size] = new_res;
4420 else
4421 /* Not SLP - we have one scalar to keep in SCALAR_RESULTS. */
4422 scalar_results.safe_push (new_temp);
4426 vect_finalize_reduction:
4428 if (double_reduc)
4429 loop = loop->inner;
4431 /* 2.5 Adjust the final result by the initial value of the reduction
4432 variable. (When such adjustment is not needed, then
4433 'adjustment_def' is zero). For example, if code is PLUS we create:
4434 new_temp = loop_exit_def + adjustment_def */
4436 if (adjustment_def)
4438 gcc_assert (!slp_reduc);
4439 if (nested_in_vect_loop)
4441 new_phi = new_phis[0];
4442 gcc_assert (TREE_CODE (TREE_TYPE (adjustment_def)) == VECTOR_TYPE);
4443 expr = build2 (code, vectype, PHI_RESULT (new_phi), adjustment_def);
4444 new_dest = vect_create_destination_var (scalar_dest, vectype);
4446 else
4448 new_temp = scalar_results[0];
4449 gcc_assert (TREE_CODE (TREE_TYPE (adjustment_def)) != VECTOR_TYPE);
4450 expr = build2 (code, scalar_type, new_temp, adjustment_def);
4451 new_dest = vect_create_destination_var (scalar_dest, scalar_type);
4454 epilog_stmt = gimple_build_assign (new_dest, expr);
4455 new_temp = make_ssa_name (new_dest, epilog_stmt);
4456 gimple_assign_set_lhs (epilog_stmt, new_temp);
4457 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
4458 if (nested_in_vect_loop)
4460 set_vinfo_for_stmt (epilog_stmt,
4461 new_stmt_vec_info (epilog_stmt, loop_vinfo,
4462 NULL));
4463 STMT_VINFO_RELATED_STMT (vinfo_for_stmt (epilog_stmt)) =
4464 STMT_VINFO_RELATED_STMT (vinfo_for_stmt (new_phi));
4466 if (!double_reduc)
4467 scalar_results.quick_push (new_temp);
4468 else
4469 scalar_results[0] = new_temp;
4471 else
4472 scalar_results[0] = new_temp;
4474 new_phis[0] = epilog_stmt;
4477 /* 2.6 Handle the loop-exit phis. Replace the uses of scalar loop-exit
4478 phis with new adjusted scalar results, i.e., replace use <s_out0>
4479 with use <s_out4>.
4481 Transform:
4482 loop_exit:
4483 s_out0 = phi <s_loop> # (scalar) EXIT_PHI
4484 v_out1 = phi <VECT_DEF> # NEW_EXIT_PHI
4485 v_out2 = reduce <v_out1>
4486 s_out3 = extract_field <v_out2, 0>
4487 s_out4 = adjust_result <s_out3>
4488 use <s_out0>
4489 use <s_out0>
4491 into:
4493 loop_exit:
4494 s_out0 = phi <s_loop> # (scalar) EXIT_PHI
4495 v_out1 = phi <VECT_DEF> # NEW_EXIT_PHI
4496 v_out2 = reduce <v_out1>
4497 s_out3 = extract_field <v_out2, 0>
4498 s_out4 = adjust_result <s_out3>
4499 use <s_out4>
4500 use <s_out4> */
4503 /* In SLP reduction chain we reduce vector results into one vector if
4504 necessary, hence we set here GROUP_SIZE to 1. SCALAR_DEST is the LHS of
4505 the last stmt in the reduction chain, since we are looking for the loop
4506 exit phi node. */
4507 if (GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)))
4509 scalar_dest = gimple_assign_lhs (
4510 SLP_TREE_SCALAR_STMTS (slp_node)[group_size - 1]);
4511 group_size = 1;
4514 /* In SLP we may have several statements in NEW_PHIS and REDUCTION_PHIS (in
4515 case that GROUP_SIZE is greater than vectorization factor). Therefore, we
4516 need to match SCALAR_RESULTS with corresponding statements. The first
4517 (GROUP_SIZE / number of new vector stmts) scalar results correspond to
4518 the first vector stmt, etc.
4519 (RATIO is equal to (GROUP_SIZE / number of new vector stmts)). */
4520 if (group_size > new_phis.length ())
4522 ratio = group_size / new_phis.length ();
4523 gcc_assert (!(group_size % new_phis.length ()));
4525 else
4526 ratio = 1;
4528 for (k = 0; k < group_size; k++)
4530 if (k % ratio == 0)
4532 epilog_stmt = new_phis[k / ratio];
4533 reduction_phi = reduction_phis[k / ratio];
4534 if (double_reduc)
4535 inner_phi = inner_phis[k / ratio];
4538 if (slp_reduc)
4540 gimple current_stmt = SLP_TREE_SCALAR_STMTS (slp_node)[k];
4542 orig_stmt = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (current_stmt));
4543 /* SLP statements can't participate in patterns. */
4544 gcc_assert (!orig_stmt);
4545 scalar_dest = gimple_assign_lhs (current_stmt);
4548 phis.create (3);
4549 /* Find the loop-closed-use at the loop exit of the original scalar
4550 result. (The reduction result is expected to have two immediate uses -
4551 one at the latch block, and one at the loop exit). */
4552 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, scalar_dest)
4553 if (!flow_bb_inside_loop_p (loop, gimple_bb (USE_STMT (use_p)))
4554 && !is_gimple_debug (USE_STMT (use_p)))
4555 phis.safe_push (USE_STMT (use_p));
4557 /* While we expect to have found an exit_phi because of loop-closed-ssa
4558 form we can end up without one if the scalar cycle is dead. */
4560 FOR_EACH_VEC_ELT (phis, i, exit_phi)
4562 if (outer_loop)
4564 stmt_vec_info exit_phi_vinfo = vinfo_for_stmt (exit_phi);
4565 gphi *vect_phi;
4567 /* FORNOW. Currently not supporting the case that an inner-loop
4568 reduction is not used in the outer-loop (but only outside the
4569 outer-loop), unless it is double reduction. */
4570 gcc_assert ((STMT_VINFO_RELEVANT_P (exit_phi_vinfo)
4571 && !STMT_VINFO_LIVE_P (exit_phi_vinfo))
4572 || double_reduc);
4574 STMT_VINFO_VEC_STMT (exit_phi_vinfo) = epilog_stmt;
4575 if (!double_reduc
4576 || STMT_VINFO_DEF_TYPE (exit_phi_vinfo)
4577 != vect_double_reduction_def)
4578 continue;
4580 /* Handle double reduction:
4582 stmt1: s1 = phi <s0, s2> - double reduction phi (outer loop)
4583 stmt2: s3 = phi <s1, s4> - (regular) reduc phi (inner loop)
4584 stmt3: s4 = use (s3) - (regular) reduc stmt (inner loop)
4585 stmt4: s2 = phi <s4> - double reduction stmt (outer loop)
4587 At that point the regular reduction (stmt2 and stmt3) is
4588 already vectorized, as well as the exit phi node, stmt4.
4589 Here we vectorize the phi node of double reduction, stmt1, and
4590 update all relevant statements. */
4592 /* Go through all the uses of s2 to find double reduction phi
4593 node, i.e., stmt1 above. */
4594 orig_name = PHI_RESULT (exit_phi);
4595 FOR_EACH_IMM_USE_STMT (use_stmt, imm_iter, orig_name)
4597 stmt_vec_info use_stmt_vinfo;
4598 stmt_vec_info new_phi_vinfo;
4599 tree vect_phi_init, preheader_arg, vect_phi_res, init_def;
4600 basic_block bb = gimple_bb (use_stmt);
4601 gimple use;
4603 /* Check that USE_STMT is really double reduction phi
4604 node. */
4605 if (gimple_code (use_stmt) != GIMPLE_PHI
4606 || gimple_phi_num_args (use_stmt) != 2
4607 || bb->loop_father != outer_loop)
4608 continue;
4609 use_stmt_vinfo = vinfo_for_stmt (use_stmt);
4610 if (!use_stmt_vinfo
4611 || STMT_VINFO_DEF_TYPE (use_stmt_vinfo)
4612 != vect_double_reduction_def)
4613 continue;
4615 /* Create vector phi node for double reduction:
4616 vs1 = phi <vs0, vs2>
4617 vs1 was created previously in this function by a call to
4618 vect_get_vec_def_for_operand and is stored in
4619 vec_initial_def;
4620 vs2 is defined by INNER_PHI, the vectorized EXIT_PHI;
4621 vs0 is created here. */
4623 /* Create vector phi node. */
4624 vect_phi = create_phi_node (vec_initial_def, bb);
4625 new_phi_vinfo = new_stmt_vec_info (vect_phi,
4626 loop_vec_info_for_loop (outer_loop), NULL);
4627 set_vinfo_for_stmt (vect_phi, new_phi_vinfo);
4629 /* Create vs0 - initial def of the double reduction phi. */
4630 preheader_arg = PHI_ARG_DEF_FROM_EDGE (use_stmt,
4631 loop_preheader_edge (outer_loop));
4632 init_def = get_initial_def_for_reduction (stmt,
4633 preheader_arg, NULL);
4634 vect_phi_init = vect_init_vector (use_stmt, init_def,
4635 vectype, NULL);
4637 /* Update phi node arguments with vs0 and vs2. */
4638 add_phi_arg (vect_phi, vect_phi_init,
4639 loop_preheader_edge (outer_loop),
4640 UNKNOWN_LOCATION);
4641 add_phi_arg (vect_phi, PHI_RESULT (inner_phi),
4642 loop_latch_edge (outer_loop), UNKNOWN_LOCATION);
4643 if (dump_enabled_p ())
4645 dump_printf_loc (MSG_NOTE, vect_location,
4646 "created double reduction phi node: ");
4647 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, vect_phi, 0);
4648 dump_printf (MSG_NOTE, "\n");
4651 vect_phi_res = PHI_RESULT (vect_phi);
4653 /* Replace the use, i.e., set the correct vs1 in the regular
4654 reduction phi node. FORNOW, NCOPIES is always 1, so the
4655 loop is redundant. */
4656 use = reduction_phi;
4657 for (j = 0; j < ncopies; j++)
4659 edge pr_edge = loop_preheader_edge (loop);
4660 SET_PHI_ARG_DEF (use, pr_edge->dest_idx, vect_phi_res);
4661 use = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (use));
4667 phis.release ();
4668 if (nested_in_vect_loop)
4670 if (double_reduc)
4671 loop = outer_loop;
4672 else
4673 continue;
4676 phis.create (3);
4677 /* Find the loop-closed-use at the loop exit of the original scalar
4678 result. (The reduction result is expected to have two immediate uses,
4679 one at the latch block, and one at the loop exit). For double
4680 reductions we are looking for exit phis of the outer loop. */
4681 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, scalar_dest)
4683 if (!flow_bb_inside_loop_p (loop, gimple_bb (USE_STMT (use_p))))
4685 if (!is_gimple_debug (USE_STMT (use_p)))
4686 phis.safe_push (USE_STMT (use_p));
4688 else
4690 if (double_reduc && gimple_code (USE_STMT (use_p)) == GIMPLE_PHI)
4692 tree phi_res = PHI_RESULT (USE_STMT (use_p));
4694 FOR_EACH_IMM_USE_FAST (phi_use_p, phi_imm_iter, phi_res)
4696 if (!flow_bb_inside_loop_p (loop,
4697 gimple_bb (USE_STMT (phi_use_p)))
4698 && !is_gimple_debug (USE_STMT (phi_use_p)))
4699 phis.safe_push (USE_STMT (phi_use_p));
4705 FOR_EACH_VEC_ELT (phis, i, exit_phi)
4707 /* Replace the uses: */
4708 orig_name = PHI_RESULT (exit_phi);
4709 scalar_result = scalar_results[k];
4710 FOR_EACH_IMM_USE_STMT (use_stmt, imm_iter, orig_name)
4711 FOR_EACH_IMM_USE_ON_STMT (use_p, imm_iter)
4712 SET_USE (use_p, scalar_result);
4715 phis.release ();
4720 /* Function vectorizable_reduction.
4722 Check if STMT performs a reduction operation that can be vectorized.
4723 If VEC_STMT is also passed, vectorize the STMT: create a vectorized
4724 stmt to replace it, put it in VEC_STMT, and insert it at GSI.
4725 Return FALSE if not a vectorizable STMT, TRUE otherwise.
4727 This function also handles reduction idioms (patterns) that have been
4728 recognized in advance during vect_pattern_recog. In this case, STMT may be
4729 of this form:
4730 X = pattern_expr (arg0, arg1, ..., X)
4731 and it's STMT_VINFO_RELATED_STMT points to the last stmt in the original
4732 sequence that had been detected and replaced by the pattern-stmt (STMT).
4734 In some cases of reduction patterns, the type of the reduction variable X is
4735 different than the type of the other arguments of STMT.
4736 In such cases, the vectype that is used when transforming STMT into a vector
4737 stmt is different than the vectype that is used to determine the
4738 vectorization factor, because it consists of a different number of elements
4739 than the actual number of elements that are being operated upon in parallel.
4741 For example, consider an accumulation of shorts into an int accumulator.
4742 On some targets it's possible to vectorize this pattern operating on 8
4743 shorts at a time (hence, the vectype for purposes of determining the
4744 vectorization factor should be V8HI); on the other hand, the vectype that
4745 is used to create the vector form is actually V4SI (the type of the result).
4747 Upon entry to this function, STMT_VINFO_VECTYPE records the vectype that
4748 indicates what is the actual level of parallelism (V8HI in the example), so
4749 that the right vectorization factor would be derived. This vectype
4750 corresponds to the type of arguments to the reduction stmt, and should *NOT*
4751 be used to create the vectorized stmt. The right vectype for the vectorized
4752 stmt is obtained from the type of the result X:
4753 get_vectype_for_scalar_type (TREE_TYPE (X))
4755 This means that, contrary to "regular" reductions (or "regular" stmts in
4756 general), the following equation:
4757 STMT_VINFO_VECTYPE == get_vectype_for_scalar_type (TREE_TYPE (X))
4758 does *NOT* necessarily hold for reduction patterns. */
4760 bool
4761 vectorizable_reduction (gimple stmt, gimple_stmt_iterator *gsi,
4762 gimple *vec_stmt, slp_tree slp_node)
4764 tree vec_dest;
4765 tree scalar_dest;
4766 tree loop_vec_def0 = NULL_TREE, loop_vec_def1 = NULL_TREE;
4767 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
4768 tree vectype_out = STMT_VINFO_VECTYPE (stmt_info);
4769 tree vectype_in = NULL_TREE;
4770 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
4771 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
4772 enum tree_code code, orig_code, epilog_reduc_code;
4773 machine_mode vec_mode;
4774 int op_type;
4775 optab optab, reduc_optab;
4776 tree new_temp = NULL_TREE;
4777 tree def;
4778 gimple def_stmt;
4779 enum vect_def_type dt;
4780 gphi *new_phi = NULL;
4781 tree scalar_type;
4782 bool is_simple_use;
4783 gimple orig_stmt;
4784 stmt_vec_info orig_stmt_info;
4785 tree expr = NULL_TREE;
4786 int i;
4787 int ncopies;
4788 int epilog_copies;
4789 stmt_vec_info prev_stmt_info, prev_phi_info;
4790 bool single_defuse_cycle = false;
4791 tree reduc_def = NULL_TREE;
4792 gimple new_stmt = NULL;
4793 int j;
4794 tree ops[3];
4795 bool nested_cycle = false, found_nested_cycle_def = false;
4796 gimple reduc_def_stmt = NULL;
4797 /* The default is that the reduction variable is the last in statement. */
4798 int reduc_index = 2;
4799 bool double_reduc = false, dummy;
4800 basic_block def_bb;
4801 struct loop * def_stmt_loop, *outer_loop = NULL;
4802 tree def_arg;
4803 gimple def_arg_stmt;
4804 auto_vec<tree> vec_oprnds0;
4805 auto_vec<tree> vec_oprnds1;
4806 auto_vec<tree> vect_defs;
4807 auto_vec<gimple> phis;
4808 int vec_num;
4809 tree def0, def1, tem, op0, op1 = NULL_TREE;
4811 /* In case of reduction chain we switch to the first stmt in the chain, but
4812 we don't update STMT_INFO, since only the last stmt is marked as reduction
4813 and has reduction properties. */
4814 if (GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)))
4815 stmt = GROUP_FIRST_ELEMENT (stmt_info);
4817 if (nested_in_vect_loop_p (loop, stmt))
4819 outer_loop = loop;
4820 loop = loop->inner;
4821 nested_cycle = true;
4824 /* 1. Is vectorizable reduction? */
4825 /* Not supportable if the reduction variable is used in the loop, unless
4826 it's a reduction chain. */
4827 if (STMT_VINFO_RELEVANT (stmt_info) > vect_used_in_outer
4828 && !GROUP_FIRST_ELEMENT (stmt_info))
4829 return false;
4831 /* Reductions that are not used even in an enclosing outer-loop,
4832 are expected to be "live" (used out of the loop). */
4833 if (STMT_VINFO_RELEVANT (stmt_info) == vect_unused_in_scope
4834 && !STMT_VINFO_LIVE_P (stmt_info))
4835 return false;
4837 /* Make sure it was already recognized as a reduction computation. */
4838 if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_reduction_def
4839 && STMT_VINFO_DEF_TYPE (stmt_info) != vect_nested_cycle)
4840 return false;
4842 /* 2. Has this been recognized as a reduction pattern?
4844 Check if STMT represents a pattern that has been recognized
4845 in earlier analysis stages. For stmts that represent a pattern,
4846 the STMT_VINFO_RELATED_STMT field records the last stmt in
4847 the original sequence that constitutes the pattern. */
4849 orig_stmt = STMT_VINFO_RELATED_STMT (stmt_info);
4850 if (orig_stmt)
4852 orig_stmt_info = vinfo_for_stmt (orig_stmt);
4853 gcc_assert (STMT_VINFO_IN_PATTERN_P (orig_stmt_info));
4854 gcc_assert (!STMT_VINFO_IN_PATTERN_P (stmt_info));
4857 /* 3. Check the operands of the operation. The first operands are defined
4858 inside the loop body. The last operand is the reduction variable,
4859 which is defined by the loop-header-phi. */
4861 gcc_assert (is_gimple_assign (stmt));
4863 /* Flatten RHS. */
4864 switch (get_gimple_rhs_class (gimple_assign_rhs_code (stmt)))
4866 case GIMPLE_SINGLE_RHS:
4867 op_type = TREE_OPERAND_LENGTH (gimple_assign_rhs1 (stmt));
4868 if (op_type == ternary_op)
4870 tree rhs = gimple_assign_rhs1 (stmt);
4871 ops[0] = TREE_OPERAND (rhs, 0);
4872 ops[1] = TREE_OPERAND (rhs, 1);
4873 ops[2] = TREE_OPERAND (rhs, 2);
4874 code = TREE_CODE (rhs);
4876 else
4877 return false;
4878 break;
4880 case GIMPLE_BINARY_RHS:
4881 code = gimple_assign_rhs_code (stmt);
4882 op_type = TREE_CODE_LENGTH (code);
4883 gcc_assert (op_type == binary_op);
4884 ops[0] = gimple_assign_rhs1 (stmt);
4885 ops[1] = gimple_assign_rhs2 (stmt);
4886 break;
4888 case GIMPLE_TERNARY_RHS:
4889 code = gimple_assign_rhs_code (stmt);
4890 op_type = TREE_CODE_LENGTH (code);
4891 gcc_assert (op_type == ternary_op);
4892 ops[0] = gimple_assign_rhs1 (stmt);
4893 ops[1] = gimple_assign_rhs2 (stmt);
4894 ops[2] = gimple_assign_rhs3 (stmt);
4895 break;
4897 case GIMPLE_UNARY_RHS:
4898 return false;
4900 default:
4901 gcc_unreachable ();
4904 if (code == COND_EXPR && slp_node)
4905 return false;
4907 scalar_dest = gimple_assign_lhs (stmt);
4908 scalar_type = TREE_TYPE (scalar_dest);
4909 if (!POINTER_TYPE_P (scalar_type) && !INTEGRAL_TYPE_P (scalar_type)
4910 && !SCALAR_FLOAT_TYPE_P (scalar_type))
4911 return false;
4913 /* Do not try to vectorize bit-precision reductions. */
4914 if ((TYPE_PRECISION (scalar_type)
4915 != GET_MODE_PRECISION (TYPE_MODE (scalar_type))))
4916 return false;
4918 /* All uses but the last are expected to be defined in the loop.
4919 The last use is the reduction variable. In case of nested cycle this
4920 assumption is not true: we use reduc_index to record the index of the
4921 reduction variable. */
4922 for (i = 0; i < op_type - 1; i++)
4924 /* The condition of COND_EXPR is checked in vectorizable_condition(). */
4925 if (i == 0 && code == COND_EXPR)
4926 continue;
4928 is_simple_use = vect_is_simple_use_1 (ops[i], stmt, loop_vinfo, NULL,
4929 &def_stmt, &def, &dt, &tem);
4930 if (!vectype_in)
4931 vectype_in = tem;
4932 gcc_assert (is_simple_use);
4934 if (dt != vect_internal_def
4935 && dt != vect_external_def
4936 && dt != vect_constant_def
4937 && dt != vect_induction_def
4938 && !(dt == vect_nested_cycle && nested_cycle))
4939 return false;
4941 if (dt == vect_nested_cycle)
4943 found_nested_cycle_def = true;
4944 reduc_def_stmt = def_stmt;
4945 reduc_index = i;
4949 is_simple_use = vect_is_simple_use_1 (ops[i], stmt, loop_vinfo, NULL,
4950 &def_stmt, &def, &dt, &tem);
4951 if (!vectype_in)
4952 vectype_in = tem;
4953 gcc_assert (is_simple_use);
4954 if (!(dt == vect_reduction_def
4955 || dt == vect_nested_cycle
4956 || ((dt == vect_internal_def || dt == vect_external_def
4957 || dt == vect_constant_def || dt == vect_induction_def)
4958 && nested_cycle && found_nested_cycle_def)))
4960 /* For pattern recognized stmts, orig_stmt might be a reduction,
4961 but some helper statements for the pattern might not, or
4962 might be COND_EXPRs with reduction uses in the condition. */
4963 gcc_assert (orig_stmt);
4964 return false;
4966 if (!found_nested_cycle_def)
4967 reduc_def_stmt = def_stmt;
4969 gcc_assert (gimple_code (reduc_def_stmt) == GIMPLE_PHI);
4970 if (orig_stmt)
4971 gcc_assert (orig_stmt == vect_is_simple_reduction (loop_vinfo,
4972 reduc_def_stmt,
4973 !nested_cycle,
4974 &dummy));
4975 else
4977 gimple tmp = vect_is_simple_reduction (loop_vinfo, reduc_def_stmt,
4978 !nested_cycle, &dummy);
4979 /* We changed STMT to be the first stmt in reduction chain, hence we
4980 check that in this case the first element in the chain is STMT. */
4981 gcc_assert (stmt == tmp
4982 || GROUP_FIRST_ELEMENT (vinfo_for_stmt (tmp)) == stmt);
4985 if (STMT_VINFO_LIVE_P (vinfo_for_stmt (reduc_def_stmt)))
4986 return false;
4988 if (slp_node || PURE_SLP_STMT (stmt_info))
4989 ncopies = 1;
4990 else
4991 ncopies = (LOOP_VINFO_VECT_FACTOR (loop_vinfo)
4992 / TYPE_VECTOR_SUBPARTS (vectype_in));
4994 gcc_assert (ncopies >= 1);
4996 vec_mode = TYPE_MODE (vectype_in);
4998 if (code == COND_EXPR)
5000 if (!vectorizable_condition (stmt, gsi, NULL, ops[reduc_index], 0, NULL))
5002 if (dump_enabled_p ())
5003 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5004 "unsupported condition in reduction\n");
5006 return false;
5009 else
5011 /* 4. Supportable by target? */
5013 if (code == LSHIFT_EXPR || code == RSHIFT_EXPR
5014 || code == LROTATE_EXPR || code == RROTATE_EXPR)
5016 /* Shifts and rotates are only supported by vectorizable_shifts,
5017 not vectorizable_reduction. */
5018 if (dump_enabled_p ())
5019 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5020 "unsupported shift or rotation.\n");
5021 return false;
5024 /* 4.1. check support for the operation in the loop */
5025 optab = optab_for_tree_code (code, vectype_in, optab_default);
5026 if (!optab)
5028 if (dump_enabled_p ())
5029 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5030 "no optab.\n");
5032 return false;
5035 if (optab_handler (optab, vec_mode) == CODE_FOR_nothing)
5037 if (dump_enabled_p ())
5038 dump_printf (MSG_NOTE, "op not supported by target.\n");
5040 if (GET_MODE_SIZE (vec_mode) != UNITS_PER_WORD
5041 || LOOP_VINFO_VECT_FACTOR (loop_vinfo)
5042 < vect_min_worthwhile_factor (code))
5043 return false;
5045 if (dump_enabled_p ())
5046 dump_printf (MSG_NOTE, "proceeding using word mode.\n");
5049 /* Worthwhile without SIMD support? */
5050 if (!VECTOR_MODE_P (TYPE_MODE (vectype_in))
5051 && LOOP_VINFO_VECT_FACTOR (loop_vinfo)
5052 < vect_min_worthwhile_factor (code))
5054 if (dump_enabled_p ())
5055 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5056 "not worthwhile without SIMD support.\n");
5058 return false;
5062 /* 4.2. Check support for the epilog operation.
5064 If STMT represents a reduction pattern, then the type of the
5065 reduction variable may be different than the type of the rest
5066 of the arguments. For example, consider the case of accumulation
5067 of shorts into an int accumulator; The original code:
5068 S1: int_a = (int) short_a;
5069 orig_stmt-> S2: int_acc = plus <int_a ,int_acc>;
5071 was replaced with:
5072 STMT: int_acc = widen_sum <short_a, int_acc>
5074 This means that:
5075 1. The tree-code that is used to create the vector operation in the
5076 epilog code (that reduces the partial results) is not the
5077 tree-code of STMT, but is rather the tree-code of the original
5078 stmt from the pattern that STMT is replacing. I.e, in the example
5079 above we want to use 'widen_sum' in the loop, but 'plus' in the
5080 epilog.
5081 2. The type (mode) we use to check available target support
5082 for the vector operation to be created in the *epilog*, is
5083 determined by the type of the reduction variable (in the example
5084 above we'd check this: optab_handler (plus_optab, vect_int_mode])).
5085 However the type (mode) we use to check available target support
5086 for the vector operation to be created *inside the loop*, is
5087 determined by the type of the other arguments to STMT (in the
5088 example we'd check this: optab_handler (widen_sum_optab,
5089 vect_short_mode)).
5091 This is contrary to "regular" reductions, in which the types of all
5092 the arguments are the same as the type of the reduction variable.
5093 For "regular" reductions we can therefore use the same vector type
5094 (and also the same tree-code) when generating the epilog code and
5095 when generating the code inside the loop. */
5097 if (orig_stmt)
5099 /* This is a reduction pattern: get the vectype from the type of the
5100 reduction variable, and get the tree-code from orig_stmt. */
5101 orig_code = gimple_assign_rhs_code (orig_stmt);
5102 gcc_assert (vectype_out);
5103 vec_mode = TYPE_MODE (vectype_out);
5105 else
5107 /* Regular reduction: use the same vectype and tree-code as used for
5108 the vector code inside the loop can be used for the epilog code. */
5109 orig_code = code;
5112 if (nested_cycle)
5114 def_bb = gimple_bb (reduc_def_stmt);
5115 def_stmt_loop = def_bb->loop_father;
5116 def_arg = PHI_ARG_DEF_FROM_EDGE (reduc_def_stmt,
5117 loop_preheader_edge (def_stmt_loop));
5118 if (TREE_CODE (def_arg) == SSA_NAME
5119 && (def_arg_stmt = SSA_NAME_DEF_STMT (def_arg))
5120 && gimple_code (def_arg_stmt) == GIMPLE_PHI
5121 && flow_bb_inside_loop_p (outer_loop, gimple_bb (def_arg_stmt))
5122 && vinfo_for_stmt (def_arg_stmt)
5123 && STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def_arg_stmt))
5124 == vect_double_reduction_def)
5125 double_reduc = true;
5128 epilog_reduc_code = ERROR_MARK;
5129 if (reduction_code_for_scalar_code (orig_code, &epilog_reduc_code))
5131 reduc_optab = optab_for_tree_code (epilog_reduc_code, vectype_out,
5132 optab_default);
5133 if (!reduc_optab)
5135 if (dump_enabled_p ())
5136 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5137 "no optab for reduction.\n");
5139 epilog_reduc_code = ERROR_MARK;
5141 else if (optab_handler (reduc_optab, vec_mode) == CODE_FOR_nothing)
5143 optab = scalar_reduc_to_vector (reduc_optab, vectype_out);
5144 if (optab_handler (optab, vec_mode) == CODE_FOR_nothing)
5146 if (dump_enabled_p ())
5147 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5148 "reduc op not supported by target.\n");
5150 epilog_reduc_code = ERROR_MARK;
5154 else
5156 if (!nested_cycle || double_reduc)
5158 if (dump_enabled_p ())
5159 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5160 "no reduc code for scalar code.\n");
5162 return false;
5166 if (double_reduc && ncopies > 1)
5168 if (dump_enabled_p ())
5169 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5170 "multiple types in double reduction\n");
5172 return false;
5175 /* In case of widenning multiplication by a constant, we update the type
5176 of the constant to be the type of the other operand. We check that the
5177 constant fits the type in the pattern recognition pass. */
5178 if (code == DOT_PROD_EXPR
5179 && !types_compatible_p (TREE_TYPE (ops[0]), TREE_TYPE (ops[1])))
5181 if (TREE_CODE (ops[0]) == INTEGER_CST)
5182 ops[0] = fold_convert (TREE_TYPE (ops[1]), ops[0]);
5183 else if (TREE_CODE (ops[1]) == INTEGER_CST)
5184 ops[1] = fold_convert (TREE_TYPE (ops[0]), ops[1]);
5185 else
5187 if (dump_enabled_p ())
5188 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5189 "invalid types in dot-prod\n");
5191 return false;
5195 if (!vec_stmt) /* transformation not required. */
5197 if (!vect_model_reduction_cost (stmt_info, epilog_reduc_code, ncopies))
5198 return false;
5199 STMT_VINFO_TYPE (stmt_info) = reduc_vec_info_type;
5200 return true;
5203 /** Transform. **/
5205 if (dump_enabled_p ())
5206 dump_printf_loc (MSG_NOTE, vect_location, "transform reduction.\n");
5208 /* FORNOW: Multiple types are not supported for condition. */
5209 if (code == COND_EXPR)
5210 gcc_assert (ncopies == 1);
5212 /* Create the destination vector */
5213 vec_dest = vect_create_destination_var (scalar_dest, vectype_out);
5215 /* In case the vectorization factor (VF) is bigger than the number
5216 of elements that we can fit in a vectype (nunits), we have to generate
5217 more than one vector stmt - i.e - we need to "unroll" the
5218 vector stmt by a factor VF/nunits. For more details see documentation
5219 in vectorizable_operation. */
5221 /* If the reduction is used in an outer loop we need to generate
5222 VF intermediate results, like so (e.g. for ncopies=2):
5223 r0 = phi (init, r0)
5224 r1 = phi (init, r1)
5225 r0 = x0 + r0;
5226 r1 = x1 + r1;
5227 (i.e. we generate VF results in 2 registers).
5228 In this case we have a separate def-use cycle for each copy, and therefore
5229 for each copy we get the vector def for the reduction variable from the
5230 respective phi node created for this copy.
5232 Otherwise (the reduction is unused in the loop nest), we can combine
5233 together intermediate results, like so (e.g. for ncopies=2):
5234 r = phi (init, r)
5235 r = x0 + r;
5236 r = x1 + r;
5237 (i.e. we generate VF/2 results in a single register).
5238 In this case for each copy we get the vector def for the reduction variable
5239 from the vectorized reduction operation generated in the previous iteration.
5242 if (STMT_VINFO_RELEVANT (stmt_info) == vect_unused_in_scope)
5244 single_defuse_cycle = true;
5245 epilog_copies = 1;
5247 else
5248 epilog_copies = ncopies;
5250 prev_stmt_info = NULL;
5251 prev_phi_info = NULL;
5252 if (slp_node)
5254 vec_num = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node);
5255 gcc_assert (TYPE_VECTOR_SUBPARTS (vectype_out)
5256 == TYPE_VECTOR_SUBPARTS (vectype_in));
5258 else
5260 vec_num = 1;
5261 vec_oprnds0.create (1);
5262 if (op_type == ternary_op)
5263 vec_oprnds1.create (1);
5266 phis.create (vec_num);
5267 vect_defs.create (vec_num);
5268 if (!slp_node)
5269 vect_defs.quick_push (NULL_TREE);
5271 for (j = 0; j < ncopies; j++)
5273 if (j == 0 || !single_defuse_cycle)
5275 for (i = 0; i < vec_num; i++)
5277 /* Create the reduction-phi that defines the reduction
5278 operand. */
5279 new_phi = create_phi_node (vec_dest, loop->header);
5280 set_vinfo_for_stmt (new_phi,
5281 new_stmt_vec_info (new_phi, loop_vinfo,
5282 NULL));
5283 if (j == 0 || slp_node)
5284 phis.quick_push (new_phi);
5288 if (code == COND_EXPR)
5290 gcc_assert (!slp_node);
5291 vectorizable_condition (stmt, gsi, vec_stmt,
5292 PHI_RESULT (phis[0]),
5293 reduc_index, NULL);
5294 /* Multiple types are not supported for condition. */
5295 break;
5298 /* Handle uses. */
5299 if (j == 0)
5301 op0 = ops[!reduc_index];
5302 if (op_type == ternary_op)
5304 if (reduc_index == 0)
5305 op1 = ops[2];
5306 else
5307 op1 = ops[1];
5310 if (slp_node)
5311 vect_get_vec_defs (op0, op1, stmt, &vec_oprnds0, &vec_oprnds1,
5312 slp_node, -1);
5313 else
5315 loop_vec_def0 = vect_get_vec_def_for_operand (ops[!reduc_index],
5316 stmt, NULL);
5317 vec_oprnds0.quick_push (loop_vec_def0);
5318 if (op_type == ternary_op)
5320 loop_vec_def1 = vect_get_vec_def_for_operand (op1, stmt,
5321 NULL);
5322 vec_oprnds1.quick_push (loop_vec_def1);
5326 else
5328 if (!slp_node)
5330 enum vect_def_type dt;
5331 gimple dummy_stmt;
5332 tree dummy;
5334 vect_is_simple_use (ops[!reduc_index], stmt, loop_vinfo, NULL,
5335 &dummy_stmt, &dummy, &dt);
5336 loop_vec_def0 = vect_get_vec_def_for_stmt_copy (dt,
5337 loop_vec_def0);
5338 vec_oprnds0[0] = loop_vec_def0;
5339 if (op_type == ternary_op)
5341 vect_is_simple_use (op1, stmt, loop_vinfo, NULL, &dummy_stmt,
5342 &dummy, &dt);
5343 loop_vec_def1 = vect_get_vec_def_for_stmt_copy (dt,
5344 loop_vec_def1);
5345 vec_oprnds1[0] = loop_vec_def1;
5349 if (single_defuse_cycle)
5350 reduc_def = gimple_assign_lhs (new_stmt);
5352 STMT_VINFO_RELATED_STMT (prev_phi_info) = new_phi;
5355 FOR_EACH_VEC_ELT (vec_oprnds0, i, def0)
5357 if (slp_node)
5358 reduc_def = PHI_RESULT (phis[i]);
5359 else
5361 if (!single_defuse_cycle || j == 0)
5362 reduc_def = PHI_RESULT (new_phi);
5365 def1 = ((op_type == ternary_op)
5366 ? vec_oprnds1[i] : NULL);
5367 if (op_type == binary_op)
5369 if (reduc_index == 0)
5370 expr = build2 (code, vectype_out, reduc_def, def0);
5371 else
5372 expr = build2 (code, vectype_out, def0, reduc_def);
5374 else
5376 if (reduc_index == 0)
5377 expr = build3 (code, vectype_out, reduc_def, def0, def1);
5378 else
5380 if (reduc_index == 1)
5381 expr = build3 (code, vectype_out, def0, reduc_def, def1);
5382 else
5383 expr = build3 (code, vectype_out, def0, def1, reduc_def);
5387 new_stmt = gimple_build_assign (vec_dest, expr);
5388 new_temp = make_ssa_name (vec_dest, new_stmt);
5389 gimple_assign_set_lhs (new_stmt, new_temp);
5390 vect_finish_stmt_generation (stmt, new_stmt, gsi);
5392 if (slp_node)
5394 SLP_TREE_VEC_STMTS (slp_node).quick_push (new_stmt);
5395 vect_defs.quick_push (new_temp);
5397 else
5398 vect_defs[0] = new_temp;
5401 if (slp_node)
5402 continue;
5404 if (j == 0)
5405 STMT_VINFO_VEC_STMT (stmt_info) = *vec_stmt = new_stmt;
5406 else
5407 STMT_VINFO_RELATED_STMT (prev_stmt_info) = new_stmt;
5409 prev_stmt_info = vinfo_for_stmt (new_stmt);
5410 prev_phi_info = vinfo_for_stmt (new_phi);
5413 /* Finalize the reduction-phi (set its arguments) and create the
5414 epilog reduction code. */
5415 if ((!single_defuse_cycle || code == COND_EXPR) && !slp_node)
5417 new_temp = gimple_assign_lhs (*vec_stmt);
5418 vect_defs[0] = new_temp;
5421 vect_create_epilog_for_reduction (vect_defs, stmt, epilog_copies,
5422 epilog_reduc_code, phis, reduc_index,
5423 double_reduc, slp_node);
5425 return true;
5428 /* Function vect_min_worthwhile_factor.
5430 For a loop where we could vectorize the operation indicated by CODE,
5431 return the minimum vectorization factor that makes it worthwhile
5432 to use generic vectors. */
5434 vect_min_worthwhile_factor (enum tree_code code)
5436 switch (code)
5438 case PLUS_EXPR:
5439 case MINUS_EXPR:
5440 case NEGATE_EXPR:
5441 return 4;
5443 case BIT_AND_EXPR:
5444 case BIT_IOR_EXPR:
5445 case BIT_XOR_EXPR:
5446 case BIT_NOT_EXPR:
5447 return 2;
5449 default:
5450 return INT_MAX;
5455 /* Function vectorizable_induction
5457 Check if PHI performs an induction computation that can be vectorized.
5458 If VEC_STMT is also passed, vectorize the induction PHI: create a vectorized
5459 phi to replace it, put it in VEC_STMT, and add it to the same basic block.
5460 Return FALSE if not a vectorizable STMT, TRUE otherwise. */
5462 bool
5463 vectorizable_induction (gimple phi, gimple_stmt_iterator *gsi ATTRIBUTE_UNUSED,
5464 gimple *vec_stmt)
5466 stmt_vec_info stmt_info = vinfo_for_stmt (phi);
5467 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
5468 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
5469 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
5470 int nunits = TYPE_VECTOR_SUBPARTS (vectype);
5471 int ncopies = LOOP_VINFO_VECT_FACTOR (loop_vinfo) / nunits;
5472 tree vec_def;
5474 gcc_assert (ncopies >= 1);
5475 /* FORNOW. These restrictions should be relaxed. */
5476 if (nested_in_vect_loop_p (loop, phi))
5478 imm_use_iterator imm_iter;
5479 use_operand_p use_p;
5480 gimple exit_phi;
5481 edge latch_e;
5482 tree loop_arg;
5484 if (ncopies > 1)
5486 if (dump_enabled_p ())
5487 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5488 "multiple types in nested loop.\n");
5489 return false;
5492 exit_phi = NULL;
5493 latch_e = loop_latch_edge (loop->inner);
5494 loop_arg = PHI_ARG_DEF_FROM_EDGE (phi, latch_e);
5495 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, loop_arg)
5497 gimple use_stmt = USE_STMT (use_p);
5498 if (is_gimple_debug (use_stmt))
5499 continue;
5501 if (!flow_bb_inside_loop_p (loop->inner, gimple_bb (use_stmt)))
5503 exit_phi = use_stmt;
5504 break;
5507 if (exit_phi)
5509 stmt_vec_info exit_phi_vinfo = vinfo_for_stmt (exit_phi);
5510 if (!(STMT_VINFO_RELEVANT_P (exit_phi_vinfo)
5511 && !STMT_VINFO_LIVE_P (exit_phi_vinfo)))
5513 if (dump_enabled_p ())
5514 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5515 "inner-loop induction only used outside "
5516 "of the outer vectorized loop.\n");
5517 return false;
5522 if (!STMT_VINFO_RELEVANT_P (stmt_info))
5523 return false;
5525 /* FORNOW: SLP not supported. */
5526 if (STMT_SLP_TYPE (stmt_info))
5527 return false;
5529 gcc_assert (STMT_VINFO_DEF_TYPE (stmt_info) == vect_induction_def);
5531 if (gimple_code (phi) != GIMPLE_PHI)
5532 return false;
5534 if (!vec_stmt) /* transformation not required. */
5536 STMT_VINFO_TYPE (stmt_info) = induc_vec_info_type;
5537 if (dump_enabled_p ())
5538 dump_printf_loc (MSG_NOTE, vect_location,
5539 "=== vectorizable_induction ===\n");
5540 vect_model_induction_cost (stmt_info, ncopies);
5541 return true;
5544 /** Transform. **/
5546 if (dump_enabled_p ())
5547 dump_printf_loc (MSG_NOTE, vect_location, "transform induction phi.\n");
5549 vec_def = get_initial_def_for_induction (phi);
5550 *vec_stmt = SSA_NAME_DEF_STMT (vec_def);
5551 return true;
5554 /* Function vectorizable_live_operation.
5556 STMT computes a value that is used outside the loop. Check if
5557 it can be supported. */
5559 bool
5560 vectorizable_live_operation (gimple stmt,
5561 gimple_stmt_iterator *gsi ATTRIBUTE_UNUSED,
5562 gimple *vec_stmt)
5564 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
5565 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
5566 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
5567 int i;
5568 int op_type;
5569 tree op;
5570 tree def;
5571 gimple def_stmt;
5572 enum vect_def_type dt;
5573 enum tree_code code;
5574 enum gimple_rhs_class rhs_class;
5576 gcc_assert (STMT_VINFO_LIVE_P (stmt_info));
5578 if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_reduction_def)
5579 return false;
5581 if (!is_gimple_assign (stmt))
5583 if (gimple_call_internal_p (stmt)
5584 && gimple_call_internal_fn (stmt) == IFN_GOMP_SIMD_LANE
5585 && gimple_call_lhs (stmt)
5586 && loop->simduid
5587 && TREE_CODE (gimple_call_arg (stmt, 0)) == SSA_NAME
5588 && loop->simduid
5589 == SSA_NAME_VAR (gimple_call_arg (stmt, 0)))
5591 edge e = single_exit (loop);
5592 basic_block merge_bb = e->dest;
5593 imm_use_iterator imm_iter;
5594 use_operand_p use_p;
5595 tree lhs = gimple_call_lhs (stmt);
5597 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, lhs)
5599 gimple use_stmt = USE_STMT (use_p);
5600 if (gimple_code (use_stmt) == GIMPLE_PHI
5601 && gimple_bb (use_stmt) == merge_bb)
5603 if (vec_stmt)
5605 tree vfm1
5606 = build_int_cst (unsigned_type_node,
5607 loop_vinfo->vectorization_factor - 1);
5608 SET_PHI_ARG_DEF (use_stmt, e->dest_idx, vfm1);
5610 return true;
5615 return false;
5618 if (TREE_CODE (gimple_assign_lhs (stmt)) != SSA_NAME)
5619 return false;
5621 /* FORNOW. CHECKME. */
5622 if (nested_in_vect_loop_p (loop, stmt))
5623 return false;
5625 code = gimple_assign_rhs_code (stmt);
5626 op_type = TREE_CODE_LENGTH (code);
5627 rhs_class = get_gimple_rhs_class (code);
5628 gcc_assert (rhs_class != GIMPLE_UNARY_RHS || op_type == unary_op);
5629 gcc_assert (rhs_class != GIMPLE_BINARY_RHS || op_type == binary_op);
5631 /* FORNOW: support only if all uses are invariant. This means
5632 that the scalar operations can remain in place, unvectorized.
5633 The original last scalar value that they compute will be used. */
5635 for (i = 0; i < op_type; i++)
5637 if (rhs_class == GIMPLE_SINGLE_RHS)
5638 op = TREE_OPERAND (gimple_op (stmt, 1), i);
5639 else
5640 op = gimple_op (stmt, i + 1);
5641 if (op
5642 && !vect_is_simple_use (op, stmt, loop_vinfo, NULL, &def_stmt, &def,
5643 &dt))
5645 if (dump_enabled_p ())
5646 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5647 "use not simple.\n");
5648 return false;
5651 if (dt != vect_external_def && dt != vect_constant_def)
5652 return false;
5655 /* No transformation is required for the cases we currently support. */
5656 return true;
5659 /* Kill any debug uses outside LOOP of SSA names defined in STMT. */
5661 static void
5662 vect_loop_kill_debug_uses (struct loop *loop, gimple stmt)
5664 ssa_op_iter op_iter;
5665 imm_use_iterator imm_iter;
5666 def_operand_p def_p;
5667 gimple ustmt;
5669 FOR_EACH_PHI_OR_STMT_DEF (def_p, stmt, op_iter, SSA_OP_DEF)
5671 FOR_EACH_IMM_USE_STMT (ustmt, imm_iter, DEF_FROM_PTR (def_p))
5673 basic_block bb;
5675 if (!is_gimple_debug (ustmt))
5676 continue;
5678 bb = gimple_bb (ustmt);
5680 if (!flow_bb_inside_loop_p (loop, bb))
5682 if (gimple_debug_bind_p (ustmt))
5684 if (dump_enabled_p ())
5685 dump_printf_loc (MSG_NOTE, vect_location,
5686 "killing debug use\n");
5688 gimple_debug_bind_reset_value (ustmt);
5689 update_stmt (ustmt);
5691 else
5692 gcc_unreachable ();
5699 /* This function builds ni_name = number of iterations. Statements
5700 are emitted on the loop preheader edge. */
5702 static tree
5703 vect_build_loop_niters (loop_vec_info loop_vinfo)
5705 tree ni = unshare_expr (LOOP_VINFO_NITERS (loop_vinfo));
5706 if (TREE_CODE (ni) == INTEGER_CST)
5707 return ni;
5708 else
5710 tree ni_name, var;
5711 gimple_seq stmts = NULL;
5712 edge pe = loop_preheader_edge (LOOP_VINFO_LOOP (loop_vinfo));
5714 var = create_tmp_var (TREE_TYPE (ni), "niters");
5715 ni_name = force_gimple_operand (ni, &stmts, false, var);
5716 if (stmts)
5717 gsi_insert_seq_on_edge_immediate (pe, stmts);
5719 return ni_name;
5724 /* This function generates the following statements:
5726 ni_name = number of iterations loop executes
5727 ratio = ni_name / vf
5728 ratio_mult_vf_name = ratio * vf
5730 and places them on the loop preheader edge. */
5732 static void
5733 vect_generate_tmps_on_preheader (loop_vec_info loop_vinfo,
5734 tree ni_name,
5735 tree *ratio_mult_vf_name_ptr,
5736 tree *ratio_name_ptr)
5738 tree ni_minus_gap_name;
5739 tree var;
5740 tree ratio_name;
5741 tree ratio_mult_vf_name;
5742 int vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
5743 edge pe = loop_preheader_edge (LOOP_VINFO_LOOP (loop_vinfo));
5744 tree log_vf;
5746 log_vf = build_int_cst (TREE_TYPE (ni_name), exact_log2 (vf));
5748 /* If epilogue loop is required because of data accesses with gaps, we
5749 subtract one iteration from the total number of iterations here for
5750 correct calculation of RATIO. */
5751 if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo))
5753 ni_minus_gap_name = fold_build2 (MINUS_EXPR, TREE_TYPE (ni_name),
5754 ni_name,
5755 build_one_cst (TREE_TYPE (ni_name)));
5756 if (!is_gimple_val (ni_minus_gap_name))
5758 var = create_tmp_var (TREE_TYPE (ni_name), "ni_gap");
5759 gimple stmts = NULL;
5760 ni_minus_gap_name = force_gimple_operand (ni_minus_gap_name, &stmts,
5761 true, var);
5762 gsi_insert_seq_on_edge_immediate (pe, stmts);
5765 else
5766 ni_minus_gap_name = ni_name;
5768 /* Create: ratio = ni >> log2(vf) */
5769 /* ??? As we have ni == number of latch executions + 1, ni could
5770 have overflown to zero. So avoid computing ratio based on ni
5771 but compute it using the fact that we know ratio will be at least
5772 one, thus via (ni - vf) >> log2(vf) + 1. */
5773 ratio_name
5774 = fold_build2 (PLUS_EXPR, TREE_TYPE (ni_name),
5775 fold_build2 (RSHIFT_EXPR, TREE_TYPE (ni_name),
5776 fold_build2 (MINUS_EXPR, TREE_TYPE (ni_name),
5777 ni_minus_gap_name,
5778 build_int_cst
5779 (TREE_TYPE (ni_name), vf)),
5780 log_vf),
5781 build_int_cst (TREE_TYPE (ni_name), 1));
5782 if (!is_gimple_val (ratio_name))
5784 var = create_tmp_var (TREE_TYPE (ni_name), "bnd");
5785 gimple stmts = NULL;
5786 ratio_name = force_gimple_operand (ratio_name, &stmts, true, var);
5787 gsi_insert_seq_on_edge_immediate (pe, stmts);
5789 *ratio_name_ptr = ratio_name;
5791 /* Create: ratio_mult_vf = ratio << log2 (vf). */
5793 if (ratio_mult_vf_name_ptr)
5795 ratio_mult_vf_name = fold_build2 (LSHIFT_EXPR, TREE_TYPE (ratio_name),
5796 ratio_name, log_vf);
5797 if (!is_gimple_val (ratio_mult_vf_name))
5799 var = create_tmp_var (TREE_TYPE (ni_name), "ratio_mult_vf");
5800 gimple stmts = NULL;
5801 ratio_mult_vf_name = force_gimple_operand (ratio_mult_vf_name, &stmts,
5802 true, var);
5803 gsi_insert_seq_on_edge_immediate (pe, stmts);
5805 *ratio_mult_vf_name_ptr = ratio_mult_vf_name;
5808 return;
5812 /* Function vect_transform_loop.
5814 The analysis phase has determined that the loop is vectorizable.
5815 Vectorize the loop - created vectorized stmts to replace the scalar
5816 stmts in the loop, and update the loop exit condition. */
5818 void
5819 vect_transform_loop (loop_vec_info loop_vinfo)
5821 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
5822 basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
5823 int nbbs = loop->num_nodes;
5824 int i;
5825 tree ratio = NULL;
5826 int vectorization_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
5827 bool grouped_store;
5828 bool slp_scheduled = false;
5829 gimple stmt, pattern_stmt;
5830 gimple_seq pattern_def_seq = NULL;
5831 gimple_stmt_iterator pattern_def_si = gsi_none ();
5832 bool transform_pattern_stmt = false;
5833 bool check_profitability = false;
5834 int th;
5835 /* Record number of iterations before we started tampering with the profile. */
5836 gcov_type expected_iterations = expected_loop_iterations_unbounded (loop);
5838 if (dump_enabled_p ())
5839 dump_printf_loc (MSG_NOTE, vect_location, "=== vec_transform_loop ===\n");
5841 /* If profile is inprecise, we have chance to fix it up. */
5842 if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
5843 expected_iterations = LOOP_VINFO_INT_NITERS (loop_vinfo);
5845 /* Use the more conservative vectorization threshold. If the number
5846 of iterations is constant assume the cost check has been performed
5847 by our caller. If the threshold makes all loops profitable that
5848 run at least the vectorization factor number of times checking
5849 is pointless, too. */
5850 th = LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo);
5851 if (th >= LOOP_VINFO_VECT_FACTOR (loop_vinfo) - 1
5852 && !LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
5854 if (dump_enabled_p ())
5855 dump_printf_loc (MSG_NOTE, vect_location,
5856 "Profitability threshold is %d loop iterations.\n",
5857 th);
5858 check_profitability = true;
5861 /* Version the loop first, if required, so the profitability check
5862 comes first. */
5864 if (LOOP_REQUIRES_VERSIONING_FOR_ALIGNMENT (loop_vinfo)
5865 || LOOP_REQUIRES_VERSIONING_FOR_ALIAS (loop_vinfo))
5867 vect_loop_versioning (loop_vinfo, th, check_profitability);
5868 check_profitability = false;
5871 tree ni_name = vect_build_loop_niters (loop_vinfo);
5872 LOOP_VINFO_NITERS_UNCHANGED (loop_vinfo) = ni_name;
5874 /* Peel the loop if there are data refs with unknown alignment.
5875 Only one data ref with unknown store is allowed. */
5877 if (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo))
5879 vect_do_peeling_for_alignment (loop_vinfo, ni_name,
5880 th, check_profitability);
5881 check_profitability = false;
5882 /* The above adjusts LOOP_VINFO_NITERS, so cause ni_name to
5883 be re-computed. */
5884 ni_name = NULL_TREE;
5887 /* If the loop has a symbolic number of iterations 'n' (i.e. it's not a
5888 compile time constant), or it is a constant that doesn't divide by the
5889 vectorization factor, then an epilog loop needs to be created.
5890 We therefore duplicate the loop: the original loop will be vectorized,
5891 and will compute the first (n/VF) iterations. The second copy of the loop
5892 will remain scalar and will compute the remaining (n%VF) iterations.
5893 (VF is the vectorization factor). */
5895 if (LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo)
5896 || LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo))
5898 tree ratio_mult_vf;
5899 if (!ni_name)
5900 ni_name = vect_build_loop_niters (loop_vinfo);
5901 vect_generate_tmps_on_preheader (loop_vinfo, ni_name, &ratio_mult_vf,
5902 &ratio);
5903 vect_do_peeling_for_loop_bound (loop_vinfo, ni_name, ratio_mult_vf,
5904 th, check_profitability);
5906 else if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
5907 ratio = build_int_cst (TREE_TYPE (LOOP_VINFO_NITERS (loop_vinfo)),
5908 LOOP_VINFO_INT_NITERS (loop_vinfo) / vectorization_factor);
5909 else
5911 if (!ni_name)
5912 ni_name = vect_build_loop_niters (loop_vinfo);
5913 vect_generate_tmps_on_preheader (loop_vinfo, ni_name, NULL, &ratio);
5916 /* 1) Make sure the loop header has exactly two entries
5917 2) Make sure we have a preheader basic block. */
5919 gcc_assert (EDGE_COUNT (loop->header->preds) == 2);
5921 split_edge (loop_preheader_edge (loop));
5923 /* FORNOW: the vectorizer supports only loops which body consist
5924 of one basic block (header + empty latch). When the vectorizer will
5925 support more involved loop forms, the order by which the BBs are
5926 traversed need to be reconsidered. */
5928 for (i = 0; i < nbbs; i++)
5930 basic_block bb = bbs[i];
5931 stmt_vec_info stmt_info;
5933 for (gphi_iterator si = gsi_start_phis (bb); !gsi_end_p (si);
5934 gsi_next (&si))
5936 gphi *phi = si.phi ();
5937 if (dump_enabled_p ())
5939 dump_printf_loc (MSG_NOTE, vect_location,
5940 "------>vectorizing phi: ");
5941 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, phi, 0);
5942 dump_printf (MSG_NOTE, "\n");
5944 stmt_info = vinfo_for_stmt (phi);
5945 if (!stmt_info)
5946 continue;
5948 if (MAY_HAVE_DEBUG_STMTS && !STMT_VINFO_LIVE_P (stmt_info))
5949 vect_loop_kill_debug_uses (loop, phi);
5951 if (!STMT_VINFO_RELEVANT_P (stmt_info)
5952 && !STMT_VINFO_LIVE_P (stmt_info))
5953 continue;
5955 if (STMT_VINFO_VECTYPE (stmt_info)
5956 && (TYPE_VECTOR_SUBPARTS (STMT_VINFO_VECTYPE (stmt_info))
5957 != (unsigned HOST_WIDE_INT) vectorization_factor)
5958 && dump_enabled_p ())
5959 dump_printf_loc (MSG_NOTE, vect_location, "multiple-types.\n");
5961 if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_induction_def)
5963 if (dump_enabled_p ())
5964 dump_printf_loc (MSG_NOTE, vect_location, "transform phi.\n");
5965 vect_transform_stmt (phi, NULL, NULL, NULL, NULL);
5969 pattern_stmt = NULL;
5970 for (gimple_stmt_iterator si = gsi_start_bb (bb);
5971 !gsi_end_p (si) || transform_pattern_stmt;)
5973 bool is_store;
5975 if (transform_pattern_stmt)
5976 stmt = pattern_stmt;
5977 else
5979 stmt = gsi_stmt (si);
5980 /* During vectorization remove existing clobber stmts. */
5981 if (gimple_clobber_p (stmt))
5983 unlink_stmt_vdef (stmt);
5984 gsi_remove (&si, true);
5985 release_defs (stmt);
5986 continue;
5990 if (dump_enabled_p ())
5992 dump_printf_loc (MSG_NOTE, vect_location,
5993 "------>vectorizing statement: ");
5994 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, stmt, 0);
5995 dump_printf (MSG_NOTE, "\n");
5998 stmt_info = vinfo_for_stmt (stmt);
6000 /* vector stmts created in the outer-loop during vectorization of
6001 stmts in an inner-loop may not have a stmt_info, and do not
6002 need to be vectorized. */
6003 if (!stmt_info)
6005 gsi_next (&si);
6006 continue;
6009 if (MAY_HAVE_DEBUG_STMTS && !STMT_VINFO_LIVE_P (stmt_info))
6010 vect_loop_kill_debug_uses (loop, stmt);
6012 if (!STMT_VINFO_RELEVANT_P (stmt_info)
6013 && !STMT_VINFO_LIVE_P (stmt_info))
6015 if (STMT_VINFO_IN_PATTERN_P (stmt_info)
6016 && (pattern_stmt = STMT_VINFO_RELATED_STMT (stmt_info))
6017 && (STMT_VINFO_RELEVANT_P (vinfo_for_stmt (pattern_stmt))
6018 || STMT_VINFO_LIVE_P (vinfo_for_stmt (pattern_stmt))))
6020 stmt = pattern_stmt;
6021 stmt_info = vinfo_for_stmt (stmt);
6023 else
6025 gsi_next (&si);
6026 continue;
6029 else if (STMT_VINFO_IN_PATTERN_P (stmt_info)
6030 && (pattern_stmt = STMT_VINFO_RELATED_STMT (stmt_info))
6031 && (STMT_VINFO_RELEVANT_P (vinfo_for_stmt (pattern_stmt))
6032 || STMT_VINFO_LIVE_P (vinfo_for_stmt (pattern_stmt))))
6033 transform_pattern_stmt = true;
6035 /* If pattern statement has def stmts, vectorize them too. */
6036 if (is_pattern_stmt_p (stmt_info))
6038 if (pattern_def_seq == NULL)
6040 pattern_def_seq = STMT_VINFO_PATTERN_DEF_SEQ (stmt_info);
6041 pattern_def_si = gsi_start (pattern_def_seq);
6043 else if (!gsi_end_p (pattern_def_si))
6044 gsi_next (&pattern_def_si);
6045 if (pattern_def_seq != NULL)
6047 gimple pattern_def_stmt = NULL;
6048 stmt_vec_info pattern_def_stmt_info = NULL;
6050 while (!gsi_end_p (pattern_def_si))
6052 pattern_def_stmt = gsi_stmt (pattern_def_si);
6053 pattern_def_stmt_info
6054 = vinfo_for_stmt (pattern_def_stmt);
6055 if (STMT_VINFO_RELEVANT_P (pattern_def_stmt_info)
6056 || STMT_VINFO_LIVE_P (pattern_def_stmt_info))
6057 break;
6058 gsi_next (&pattern_def_si);
6061 if (!gsi_end_p (pattern_def_si))
6063 if (dump_enabled_p ())
6065 dump_printf_loc (MSG_NOTE, vect_location,
6066 "==> vectorizing pattern def "
6067 "stmt: ");
6068 dump_gimple_stmt (MSG_NOTE, TDF_SLIM,
6069 pattern_def_stmt, 0);
6070 dump_printf (MSG_NOTE, "\n");
6073 stmt = pattern_def_stmt;
6074 stmt_info = pattern_def_stmt_info;
6076 else
6078 pattern_def_si = gsi_none ();
6079 transform_pattern_stmt = false;
6082 else
6083 transform_pattern_stmt = false;
6086 if (STMT_VINFO_VECTYPE (stmt_info))
6088 unsigned int nunits
6089 = (unsigned int)
6090 TYPE_VECTOR_SUBPARTS (STMT_VINFO_VECTYPE (stmt_info));
6091 if (!STMT_SLP_TYPE (stmt_info)
6092 && nunits != (unsigned int) vectorization_factor
6093 && dump_enabled_p ())
6094 /* For SLP VF is set according to unrolling factor, and not
6095 to vector size, hence for SLP this print is not valid. */
6096 dump_printf_loc (MSG_NOTE, vect_location, "multiple-types.\n");
6099 /* SLP. Schedule all the SLP instances when the first SLP stmt is
6100 reached. */
6101 if (STMT_SLP_TYPE (stmt_info))
6103 if (!slp_scheduled)
6105 slp_scheduled = true;
6107 if (dump_enabled_p ())
6108 dump_printf_loc (MSG_NOTE, vect_location,
6109 "=== scheduling SLP instances ===\n");
6111 vect_schedule_slp (loop_vinfo, NULL);
6114 /* Hybrid SLP stmts must be vectorized in addition to SLP. */
6115 if (!vinfo_for_stmt (stmt) || PURE_SLP_STMT (stmt_info))
6117 if (!transform_pattern_stmt && gsi_end_p (pattern_def_si))
6119 pattern_def_seq = NULL;
6120 gsi_next (&si);
6122 continue;
6126 /* -------- vectorize statement ------------ */
6127 if (dump_enabled_p ())
6128 dump_printf_loc (MSG_NOTE, vect_location, "transform statement.\n");
6130 grouped_store = false;
6131 is_store = vect_transform_stmt (stmt, &si, &grouped_store, NULL, NULL);
6132 if (is_store)
6134 if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
6136 /* Interleaving. If IS_STORE is TRUE, the vectorization of the
6137 interleaving chain was completed - free all the stores in
6138 the chain. */
6139 gsi_next (&si);
6140 vect_remove_stores (GROUP_FIRST_ELEMENT (stmt_info));
6142 else
6144 /* Free the attached stmt_vec_info and remove the stmt. */
6145 gimple store = gsi_stmt (si);
6146 free_stmt_vec_info (store);
6147 unlink_stmt_vdef (store);
6148 gsi_remove (&si, true);
6149 release_defs (store);
6152 /* Stores can only appear at the end of pattern statements. */
6153 gcc_assert (!transform_pattern_stmt);
6154 pattern_def_seq = NULL;
6156 else if (!transform_pattern_stmt && gsi_end_p (pattern_def_si))
6158 pattern_def_seq = NULL;
6159 gsi_next (&si);
6161 } /* stmts in BB */
6162 } /* BBs in loop */
6164 slpeel_make_loop_iterate_ntimes (loop, ratio);
6166 /* Reduce loop iterations by the vectorization factor. */
6167 scale_loop_profile (loop, GCOV_COMPUTE_SCALE (1, vectorization_factor),
6168 expected_iterations / vectorization_factor);
6169 loop->nb_iterations_upper_bound
6170 = wi::udiv_floor (loop->nb_iterations_upper_bound, vectorization_factor);
6171 if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo)
6172 && loop->nb_iterations_upper_bound != 0)
6173 loop->nb_iterations_upper_bound = loop->nb_iterations_upper_bound - 1;
6174 if (loop->any_estimate)
6176 loop->nb_iterations_estimate
6177 = wi::udiv_floor (loop->nb_iterations_estimate, vectorization_factor);
6178 if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo)
6179 && loop->nb_iterations_estimate != 0)
6180 loop->nb_iterations_estimate = loop->nb_iterations_estimate - 1;
6183 if (dump_enabled_p ())
6185 dump_printf_loc (MSG_NOTE, vect_location,
6186 "LOOP VECTORIZED\n");
6187 if (loop->inner)
6188 dump_printf_loc (MSG_NOTE, vect_location,
6189 "OUTER LOOP VECTORIZED\n");
6190 dump_printf (MSG_NOTE, "\n");