gcc/tree-vect-loop.c
1 /* Loop Vectorization
2 Copyright (C) 2003-2018 Free Software Foundation, Inc.
3 Contributed by Dorit Naishlos <dorit@il.ibm.com> and
4 Ira Rosen <irar@il.ibm.com>
6 This file is part of GCC.
8 GCC is free software; you can redistribute it and/or modify it under
9 the terms of the GNU General Public License as published by the Free
10 Software Foundation; either version 3, or (at your option) any later
11 version.
13 GCC is distributed in the hope that it will be useful, but WITHOUT ANY
14 WARRANTY; without even the implied warranty of MERCHANTABILITY or
15 FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
16 for more details.
18 You should have received a copy of the GNU General Public License
19 along with GCC; see the file COPYING3. If not see
20 <http://www.gnu.org/licenses/>. */
22 #include "config.h"
23 #include "system.h"
24 #include "coretypes.h"
25 #include "backend.h"
26 #include "target.h"
27 #include "rtl.h"
28 #include "tree.h"
29 #include "gimple.h"
30 #include "cfghooks.h"
31 #include "tree-pass.h"
32 #include "ssa.h"
33 #include "optabs-tree.h"
34 #include "diagnostic-core.h"
35 #include "fold-const.h"
36 #include "stor-layout.h"
37 #include "cfganal.h"
38 #include "gimplify.h"
39 #include "gimple-iterator.h"
40 #include "gimplify-me.h"
41 #include "tree-ssa-loop-ivopts.h"
42 #include "tree-ssa-loop-manip.h"
43 #include "tree-ssa-loop-niter.h"
44 #include "tree-ssa-loop.h"
45 #include "cfgloop.h"
46 #include "params.h"
47 #include "tree-scalar-evolution.h"
48 #include "tree-vectorizer.h"
49 #include "gimple-fold.h"
50 #include "cgraph.h"
51 #include "tree-cfg.h"
52 #include "tree-if-conv.h"
53 #include "internal-fn.h"
54 #include "tree-vector-builder.h"
55 #include "vec-perm-indices.h"
56 #include "tree-eh.h"
58 /* Loop Vectorization Pass.
60 This pass tries to vectorize loops.
62 For example, the vectorizer transforms the following simple loop:
64 short a[N]; short b[N]; short c[N]; int i;
66 for (i=0; i<N; i++){
67 a[i] = b[i] + c[i];
70    as if it had been manually vectorized by rewriting the source code into:
72 typedef int __attribute__((mode(V8HI))) v8hi;
73 short a[N]; short b[N]; short c[N]; int i;
74 v8hi *pa = (v8hi*)a, *pb = (v8hi*)b, *pc = (v8hi*)c;
75 v8hi va, vb, vc;
77 for (i=0; i<N/8; i++){
78 vb = pb[i];
79 vc = pc[i];
80 va = vb + vc;
81 pa[i] = va;
84 The main entry to this pass is vectorize_loops(), in which
85 the vectorizer applies a set of analyses on a given set of loops,
86 followed by the actual vectorization transformation for the loops that
87 had successfully passed the analysis phase.
88 Throughout this pass we make a distinction between two types of
89 data: scalars (which are represented by SSA_NAMES), and memory references
90 ("data-refs"). These two types of data require different handling both
91 during analysis and transformation. The types of data-refs that the
92    vectorizer currently supports are ARRAY_REFS whose base is an array DECL
93 (not a pointer), and INDIRECT_REFS through pointers; both array and pointer
94 accesses are required to have a simple (consecutive) access pattern.
96 Analysis phase:
97 ===============
98 The driver for the analysis phase is vect_analyze_loop().
99 It applies a set of analyses, some of which rely on the scalar evolution
100 analyzer (scev) developed by Sebastian Pop.
102 During the analysis phase the vectorizer records some information
103 per stmt in a "stmt_vec_info" struct which is attached to each stmt in the
104 loop, as well as general information about the loop as a whole, which is
105 recorded in a "loop_vec_info" struct attached to each loop.
107 Transformation phase:
108 =====================
109 The loop transformation phase scans all the stmts in the loop, and
110 creates a vector stmt (or a sequence of stmts) for each scalar stmt S in
111 the loop that needs to be vectorized. It inserts the vector code sequence
112 just before the scalar stmt S, and records a pointer to the vector code
113 in STMT_VINFO_VEC_STMT (stmt_info) (stmt_info is the stmt_vec_info struct
114 attached to S). This pointer will be used for the vectorization of following
115 stmts which use the def of stmt S. Stmt S is removed if it writes to memory;
116 otherwise, we rely on dead code elimination for removing it.
118 For example, say stmt S1 was vectorized into stmt VS1:
120 VS1: vb = px[i];
121 S1: b = x[i]; STMT_VINFO_VEC_STMT (stmt_info (S1)) = VS1
122 S2: a = b;
124 To vectorize stmt S2, the vectorizer first finds the stmt that defines
125 the operand 'b' (S1), and gets the relevant vector def 'vb' from the
126 vector stmt VS1 pointed to by STMT_VINFO_VEC_STMT (stmt_info (S1)). The
127 resulting sequence would be:
129 VS1: vb = px[i];
130 S1: b = x[i]; STMT_VINFO_VEC_STMT (stmt_info (S1)) = VS1
131 VS2: va = vb;
132 S2: a = b; STMT_VINFO_VEC_STMT (stmt_info (S2)) = VS2
134    Operands that are not SSA_NAMEs are data-refs that appear in
135 load/store operations (like 'x[i]' in S1), and are handled differently.
137 Target modeling:
138 =================
139 Currently the only target specific information that is used is the
140 size of the vector (in bytes) - "TARGET_VECTORIZE_UNITS_PER_SIMD_WORD".
141    Targets that can support different sizes of vectors will, for now, need
142 to specify one value for "TARGET_VECTORIZE_UNITS_PER_SIMD_WORD". More
143 flexibility will be added in the future.
145    Since we only vectorize operations whose vector form can be
146 expressed using existing tree codes, to verify that an operation is
147 supported, the vectorizer checks the relevant optab at the relevant
148    machine_mode (e.g., optab_handler (add_optab, V8HImode)).  If
149 the value found is CODE_FOR_nothing, then there's no target support, and
150 we can't vectorize the stmt.
152 For additional information on this project see:
153    http://gcc.gnu.org/projects/tree-ssa/vectorization.html  */
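/* A minimal, self-contained sketch of the transformation described above,
   written here with GNU C vector extensions rather than the mode attribute;
   N is assumed to be a multiple of 8 so that no scalar epilogue is needed:

     typedef short v8hi __attribute__ ((vector_size (16)));

     #define N 1024
     short a[N], b[N], c[N];

     void
     scalar_add (void)
     {
       for (int i = 0; i < N; i++)
         a[i] = b[i] + c[i];
     }

     void
     vectorized_add (void)
     {
       v8hi *pa = (v8hi *) a, *pb = (v8hi *) b, *pc = (v8hi *) c;

       for (int i = 0; i < N / 8; i++)
         pa[i] = pb[i] + pc[i];
     }
*/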
156 static void vect_estimate_min_profitable_iters (loop_vec_info, int *, int *);
158 /* Subroutine of vect_determine_vf_for_stmt that handles only one
159 statement. VECTYPE_MAYBE_SET_P is true if STMT_VINFO_VECTYPE
160 may already be set for general statements (not just data refs). */
162 static bool
163 vect_determine_vf_for_stmt_1 (stmt_vec_info stmt_info,
164 bool vectype_maybe_set_p,
165 poly_uint64 *vf,
166 vec<stmt_vec_info > *mask_producers)
168 gimple *stmt = stmt_info->stmt;
170 if ((!STMT_VINFO_RELEVANT_P (stmt_info)
171 && !STMT_VINFO_LIVE_P (stmt_info))
172 || gimple_clobber_p (stmt))
174 if (dump_enabled_p ())
175 dump_printf_loc (MSG_NOTE, vect_location, "skip.\n");
176 return true;
179 tree stmt_vectype, nunits_vectype;
180 if (!vect_get_vector_types_for_stmt (stmt_info, &stmt_vectype,
181 &nunits_vectype))
182 return false;
184 if (stmt_vectype)
186 if (STMT_VINFO_VECTYPE (stmt_info))
187 /* The only case when a vectype had been already set is for stmts
188 that contain a data ref, or for "pattern-stmts" (stmts generated
189 by the vectorizer to represent/replace a certain idiom). */
190 gcc_assert ((STMT_VINFO_DATA_REF (stmt_info)
191 || vectype_maybe_set_p)
192 && STMT_VINFO_VECTYPE (stmt_info) == stmt_vectype);
193 else if (stmt_vectype == boolean_type_node)
194 mask_producers->safe_push (stmt_info);
195 else
196 STMT_VINFO_VECTYPE (stmt_info) = stmt_vectype;
199 if (nunits_vectype)
200 vect_update_max_nunits (vf, nunits_vectype);
202 return true;
205 /* Subroutine of vect_determine_vectorization_factor. Set the vector
206 types of STMT_INFO and all attached pattern statements and update
207 the vectorization factor VF accordingly. If some of the statements
208 produce a mask result whose vector type can only be calculated later,
209 add them to MASK_PRODUCERS. Return true on success or false if
210 something prevented vectorization. */
212 static bool
213 vect_determine_vf_for_stmt (stmt_vec_info stmt_info, poly_uint64 *vf,
214 vec<stmt_vec_info > *mask_producers)
216 if (dump_enabled_p ())
218 dump_printf_loc (MSG_NOTE, vect_location, "==> examining statement: ");
219 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, stmt_info->stmt, 0);
221 if (!vect_determine_vf_for_stmt_1 (stmt_info, false, vf, mask_producers))
222 return false;
224 if (STMT_VINFO_IN_PATTERN_P (stmt_info)
225 && STMT_VINFO_RELATED_STMT (stmt_info))
227 stmt_info = vinfo_for_stmt (STMT_VINFO_RELATED_STMT (stmt_info));
229 /* If a pattern statement has def stmts, analyze them too. */
230 gimple *pattern_def_seq = STMT_VINFO_PATTERN_DEF_SEQ (stmt_info);
231 for (gimple_stmt_iterator si = gsi_start (pattern_def_seq);
232 !gsi_end_p (si); gsi_next (&si))
234 stmt_vec_info def_stmt_info = vinfo_for_stmt (gsi_stmt (si));
235 if (dump_enabled_p ())
237 dump_printf_loc (MSG_NOTE, vect_location,
238 "==> examining pattern def stmt: ");
239 dump_gimple_stmt (MSG_NOTE, TDF_SLIM,
240 def_stmt_info->stmt, 0);
242 if (!vect_determine_vf_for_stmt_1 (def_stmt_info, true,
243 vf, mask_producers))
244 return false;
247 if (dump_enabled_p ())
249 dump_printf_loc (MSG_NOTE, vect_location,
250 "==> examining pattern statement: ");
251 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, stmt_info->stmt, 0);
253 if (!vect_determine_vf_for_stmt_1 (stmt_info, true, vf, mask_producers))
254 return false;
257 return true;
260 /* Function vect_determine_vectorization_factor
262 Determine the vectorization factor (VF). VF is the number of data elements
263 that are operated upon in parallel in a single iteration of the vectorized
264    loop.  For example, when vectorizing a loop that operates on 4-byte elements,
265    on a target with a 16-byte vector size (VS), the VF is set to 4, since 4
266 elements can fit in a single vector register.
268 We currently support vectorization of loops in which all types operated upon
269 are of the same size. Therefore this function currently sets VF according to
270 the size of the types operated upon, and fails if there are multiple sizes
271 in the loop.
273 VF is also the factor by which the loop iterations are strip-mined, e.g.:
274 original loop:
275 for (i=0; i<N; i++){
276 a[i] = b[i] + c[i];
279 vectorized loop:
280 for (i=0; i<N; i+=VF){
281        a[i:VF] = b[i:VF] + c[i:VF];
        }  */
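/* A worked instance of the computation above (a sketch, assuming a 16-byte
   vector size): if the elements operated upon are 4-byte ints,
   VF = 16 / 4 = 4 and the strip-mined loop becomes

     for (i = 0; i < N; i += 4)
       a[i:4] = b[i:4] + c[i:4];

   whereas 2-byte shorts on the same target would give VF = 16 / 2 = 8.  */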
285 static bool
286 vect_determine_vectorization_factor (loop_vec_info loop_vinfo)
288 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
289 basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
290 unsigned nbbs = loop->num_nodes;
291 poly_uint64 vectorization_factor = 1;
292 tree scalar_type = NULL_TREE;
293 gphi *phi;
294 tree vectype;
295 stmt_vec_info stmt_info;
296 unsigned i;
297 auto_vec<stmt_vec_info> mask_producers;
299 if (dump_enabled_p ())
300 dump_printf_loc (MSG_NOTE, vect_location,
301 "=== vect_determine_vectorization_factor ===\n");
303 for (i = 0; i < nbbs; i++)
305 basic_block bb = bbs[i];
307 for (gphi_iterator si = gsi_start_phis (bb); !gsi_end_p (si);
308 gsi_next (&si))
310 phi = si.phi ();
311 stmt_info = vinfo_for_stmt (phi);
312 if (dump_enabled_p ())
314 dump_printf_loc (MSG_NOTE, vect_location, "==> examining phi: ");
315 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, phi, 0);
318 gcc_assert (stmt_info);
320 if (STMT_VINFO_RELEVANT_P (stmt_info)
321 || STMT_VINFO_LIVE_P (stmt_info))
323 gcc_assert (!STMT_VINFO_VECTYPE (stmt_info));
324 scalar_type = TREE_TYPE (PHI_RESULT (phi));
326 if (dump_enabled_p ())
328 dump_printf_loc (MSG_NOTE, vect_location,
329 "get vectype for scalar type: ");
330 dump_generic_expr (MSG_NOTE, TDF_SLIM, scalar_type);
331 dump_printf (MSG_NOTE, "\n");
334 vectype = get_vectype_for_scalar_type (scalar_type);
335 if (!vectype)
337 if (dump_enabled_p ())
339 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
340 "not vectorized: unsupported "
341 "data-type ");
342 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
343 scalar_type);
344 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
346 return false;
348 STMT_VINFO_VECTYPE (stmt_info) = vectype;
350 if (dump_enabled_p ())
352 dump_printf_loc (MSG_NOTE, vect_location, "vectype: ");
353 dump_generic_expr (MSG_NOTE, TDF_SLIM, vectype);
354 dump_printf (MSG_NOTE, "\n");
357 if (dump_enabled_p ())
359 dump_printf_loc (MSG_NOTE, vect_location, "nunits = ");
360 dump_dec (MSG_NOTE, TYPE_VECTOR_SUBPARTS (vectype));
361 dump_printf (MSG_NOTE, "\n");
364 vect_update_max_nunits (&vectorization_factor, vectype);
368 for (gimple_stmt_iterator si = gsi_start_bb (bb); !gsi_end_p (si);
369 gsi_next (&si))
371 stmt_info = vinfo_for_stmt (gsi_stmt (si));
372 if (!vect_determine_vf_for_stmt (stmt_info, &vectorization_factor,
373 &mask_producers))
374 return false;
378   /* TODO: Analyze cost.  Decide if worthwhile to vectorize.  */
379 if (dump_enabled_p ())
381 dump_printf_loc (MSG_NOTE, vect_location, "vectorization factor = ");
382 dump_dec (MSG_NOTE, vectorization_factor);
383 dump_printf (MSG_NOTE, "\n");
386 if (known_le (vectorization_factor, 1U))
388 if (dump_enabled_p ())
389 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
390 "not vectorized: unsupported data-type\n");
391 return false;
393 LOOP_VINFO_VECT_FACTOR (loop_vinfo) = vectorization_factor;
395 for (i = 0; i < mask_producers.length (); i++)
397 stmt_info = mask_producers[i];
398 tree mask_type = vect_get_mask_type_for_stmt (stmt_info);
399 if (!mask_type)
400 return false;
401 STMT_VINFO_VECTYPE (stmt_info) = mask_type;
404 return true;
408 /* Function vect_is_simple_iv_evolution.
410    FORNOW: A simple evolution of an induction variable in the loop is
411 considered a polynomial evolution. */
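/* For example (illustrative only): the scalar evolution analyzer describes
   a simple IV such as "i" in

     for (i = 0; i < n; i++)

   as the chrec {0, +, 1}_1 (initial value 0, step 1 in loop number 1);
   this function accepts it and returns INIT = 0, STEP = 1.  An access
   function whose step is itself a chrec, e.g. {0, +, {1, +, 1}_1}_1, is a
   polynomial of degree >= 2 and is rejected as not "simple".  */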
413 static bool
414 vect_is_simple_iv_evolution (unsigned loop_nb, tree access_fn, tree * init,
415 tree * step)
417 tree init_expr;
418 tree step_expr;
419 tree evolution_part = evolution_part_in_loop_num (access_fn, loop_nb);
420 basic_block bb;
422 /* When there is no evolution in this loop, the evolution function
423 is not "simple". */
424 if (evolution_part == NULL_TREE)
425 return false;
427 /* When the evolution is a polynomial of degree >= 2
428 the evolution function is not "simple". */
429 if (tree_is_chrec (evolution_part))
430 return false;
432 step_expr = evolution_part;
433 init_expr = unshare_expr (initial_condition_in_loop_num (access_fn, loop_nb));
435 if (dump_enabled_p ())
437 dump_printf_loc (MSG_NOTE, vect_location, "step: ");
438 dump_generic_expr (MSG_NOTE, TDF_SLIM, step_expr);
439 dump_printf (MSG_NOTE, ", init: ");
440 dump_generic_expr (MSG_NOTE, TDF_SLIM, init_expr);
441 dump_printf (MSG_NOTE, "\n");
444 *init = init_expr;
445 *step = step_expr;
447 if (TREE_CODE (step_expr) != INTEGER_CST
448 && (TREE_CODE (step_expr) != SSA_NAME
449 || ((bb = gimple_bb (SSA_NAME_DEF_STMT (step_expr)))
450 && flow_bb_inside_loop_p (get_loop (cfun, loop_nb), bb))
451 || (!INTEGRAL_TYPE_P (TREE_TYPE (step_expr))
452 && (!SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr))
453 || !flag_associative_math)))
454 && (TREE_CODE (step_expr) != REAL_CST
455 || !flag_associative_math))
457 if (dump_enabled_p ())
458 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
459 "step unknown.\n");
460 return false;
463 return true;
466 /* Function vect_analyze_scalar_cycles_1.
468 Examine the cross iteration def-use cycles of scalar variables
469 in LOOP. LOOP_VINFO represents the loop that is now being
470 considered for vectorization (can be LOOP, or an outer-loop
471 enclosing LOOP). */
473 static void
474 vect_analyze_scalar_cycles_1 (loop_vec_info loop_vinfo, struct loop *loop)
476 basic_block bb = loop->header;
477 tree init, step;
478 auto_vec<gimple *, 64> worklist;
479 gphi_iterator gsi;
480 bool double_reduc;
482 if (dump_enabled_p ())
483 dump_printf_loc (MSG_NOTE, vect_location,
484 "=== vect_analyze_scalar_cycles ===\n");
486 /* First - identify all inductions. Reduction detection assumes that all the
487 inductions have been identified, therefore, this order must not be
488 changed. */
489 for (gsi = gsi_start_phis (bb); !gsi_end_p (gsi); gsi_next (&gsi))
491 gphi *phi = gsi.phi ();
492 tree access_fn = NULL;
493 tree def = PHI_RESULT (phi);
494 stmt_vec_info stmt_vinfo = vinfo_for_stmt (phi);
496 if (dump_enabled_p ())
498 dump_printf_loc (MSG_NOTE, vect_location, "Analyze phi: ");
499 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, phi, 0);
502 /* Skip virtual phi's. The data dependences that are associated with
503 virtual defs/uses (i.e., memory accesses) are analyzed elsewhere. */
504 if (virtual_operand_p (def))
505 continue;
507 STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_unknown_def_type;
509 /* Analyze the evolution function. */
510 access_fn = analyze_scalar_evolution (loop, def);
511 if (access_fn)
513 STRIP_NOPS (access_fn);
514 if (dump_enabled_p ())
516 dump_printf_loc (MSG_NOTE, vect_location,
517 "Access function of PHI: ");
518 dump_generic_expr (MSG_NOTE, TDF_SLIM, access_fn);
519 dump_printf (MSG_NOTE, "\n");
521 STMT_VINFO_LOOP_PHI_EVOLUTION_BASE_UNCHANGED (stmt_vinfo)
522 = initial_condition_in_loop_num (access_fn, loop->num);
523 STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_vinfo)
524 = evolution_part_in_loop_num (access_fn, loop->num);
527 if (!access_fn
528 || !vect_is_simple_iv_evolution (loop->num, access_fn, &init, &step)
529 || (LOOP_VINFO_LOOP (loop_vinfo) != loop
530 && TREE_CODE (step) != INTEGER_CST))
532 worklist.safe_push (phi);
533 continue;
536 gcc_assert (STMT_VINFO_LOOP_PHI_EVOLUTION_BASE_UNCHANGED (stmt_vinfo)
537 != NULL_TREE);
538 gcc_assert (STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_vinfo) != NULL_TREE);
540 if (dump_enabled_p ())
541 dump_printf_loc (MSG_NOTE, vect_location, "Detected induction.\n");
542 STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_induction_def;
546 /* Second - identify all reductions and nested cycles. */
547 while (worklist.length () > 0)
549 gimple *phi = worklist.pop ();
550 tree def = PHI_RESULT (phi);
551 stmt_vec_info stmt_vinfo = vinfo_for_stmt (phi);
552 gimple *reduc_stmt;
554 if (dump_enabled_p ())
556 dump_printf_loc (MSG_NOTE, vect_location, "Analyze phi: ");
557 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, phi, 0);
560 gcc_assert (!virtual_operand_p (def)
561 && STMT_VINFO_DEF_TYPE (stmt_vinfo) == vect_unknown_def_type);
563 reduc_stmt = vect_force_simple_reduction (loop_vinfo, phi,
564 &double_reduc, false);
565 if (reduc_stmt)
567 if (double_reduc)
569 if (dump_enabled_p ())
570 dump_printf_loc (MSG_NOTE, vect_location,
571 "Detected double reduction.\n");
573 STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_double_reduction_def;
574 STMT_VINFO_DEF_TYPE (vinfo_for_stmt (reduc_stmt)) =
575 vect_double_reduction_def;
577 else
579 if (loop != LOOP_VINFO_LOOP (loop_vinfo))
581 if (dump_enabled_p ())
582 dump_printf_loc (MSG_NOTE, vect_location,
583 "Detected vectorizable nested cycle.\n");
585 STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_nested_cycle;
586 STMT_VINFO_DEF_TYPE (vinfo_for_stmt (reduc_stmt)) =
587 vect_nested_cycle;
589 else
591 if (dump_enabled_p ())
592 dump_printf_loc (MSG_NOTE, vect_location,
593 "Detected reduction.\n");
595 STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_reduction_def;
596 STMT_VINFO_DEF_TYPE (vinfo_for_stmt (reduc_stmt)) =
597 vect_reduction_def;
598 /* Store the reduction cycles for possible vectorization in
599 loop-aware SLP if it was not detected as reduction
600 chain. */
601 if (! GROUP_FIRST_ELEMENT (vinfo_for_stmt (reduc_stmt)))
602 LOOP_VINFO_REDUCTIONS (loop_vinfo).safe_push (reduc_stmt);
606 else
607 if (dump_enabled_p ())
608 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
609 "Unknown def-use cycle pattern.\n");
614 /* Function vect_analyze_scalar_cycles.
616 Examine the cross iteration def-use cycles of scalar variables, by
617 analyzing the loop-header PHIs of scalar variables. Classify each
618 cycle as one of the following: invariant, induction, reduction, unknown.
619    We do that for the loop represented by LOOP_VINFO, and also for its
620    inner-loop, if it exists.
621 Examples for scalar cycles:
623 Example1: reduction:
625 loop1:
626 for (i=0; i<N; i++)
627 sum += a[i];
629 Example2: induction:
631 loop2:
632 for (i=0; i<N; i++)
633 a[i] = i; */
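/* A further example (a sketch): when LOOP_VINFO represents the outer loop
   below, the inner-loop PHI for "s" is classified as a nested cycle rather
   than as a reduction of the loop being vectorized:

     loop3:
       for (i = 0; i < N; i++)
         {
           s = 0;
           for (j = 0; j < M; j++)
             s += b[i][j];
           a[i] = s;
         }
*/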
635 static void
636 vect_analyze_scalar_cycles (loop_vec_info loop_vinfo)
638 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
640 vect_analyze_scalar_cycles_1 (loop_vinfo, loop);
642 /* When vectorizing an outer-loop, the inner-loop is executed sequentially.
643     Reductions in such an inner-loop therefore have different properties than
644 the reductions in the nest that gets vectorized:
645 1. When vectorized, they are executed in the same order as in the original
646 scalar loop, so we can't change the order of computation when
647 vectorizing them.
648 2. FIXME: Inner-loop reductions can be used in the inner-loop, so the
649 current checks are too strict. */
651 if (loop->inner)
652 vect_analyze_scalar_cycles_1 (loop_vinfo, loop->inner);
655 /* Transfer group and reduction information from STMT to its pattern stmt. */
657 static void
658 vect_fixup_reduc_chain (gimple *stmt)
660 gimple *firstp = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (stmt));
661 gimple *stmtp;
662 gcc_assert (!GROUP_FIRST_ELEMENT (vinfo_for_stmt (firstp))
663 && GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)));
664 GROUP_SIZE (vinfo_for_stmt (firstp)) = GROUP_SIZE (vinfo_for_stmt (stmt));
667 stmtp = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (stmt));
668 GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmtp)) = firstp;
669 stmt = GROUP_NEXT_ELEMENT (vinfo_for_stmt (stmt));
670 if (stmt)
671 GROUP_NEXT_ELEMENT (vinfo_for_stmt (stmtp))
672 = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (stmt));
674 while (stmt);
675 STMT_VINFO_DEF_TYPE (vinfo_for_stmt (stmtp)) = vect_reduction_def;
678 /* Fixup scalar cycles that now have their stmts detected as patterns. */
680 static void
681 vect_fixup_scalar_cycles_with_patterns (loop_vec_info loop_vinfo)
683 gimple *first;
684 unsigned i;
686 FOR_EACH_VEC_ELT (LOOP_VINFO_REDUCTION_CHAINS (loop_vinfo), i, first)
687 if (STMT_VINFO_IN_PATTERN_P (vinfo_for_stmt (first)))
689 gimple *next = GROUP_NEXT_ELEMENT (vinfo_for_stmt (first));
690 while (next)
692 if (! STMT_VINFO_IN_PATTERN_P (vinfo_for_stmt (next)))
693 break;
694 next = GROUP_NEXT_ELEMENT (vinfo_for_stmt (next));
696        /* If not all stmts in the chain are patterns, try to handle
697 the chain without patterns. */
698 if (! next)
700 vect_fixup_reduc_chain (first);
701 LOOP_VINFO_REDUCTION_CHAINS (loop_vinfo)[i]
702 = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (first));
707 /* Function vect_get_loop_niters.
709 Determine how many iterations the loop is executed and place it
710 in NUMBER_OF_ITERATIONS. Place the number of latch iterations
711 in NUMBER_OF_ITERATIONSM1. Place the condition under which the
712 niter information holds in ASSUMPTIONS.
714 Return the loop exit condition. */
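/* For example (illustrative only): for

     for (i = 0; i < n; i++)
       a[i] = 0;

   with n known to be positive, the latch executes n - 1 times, so
   NUMBER_OF_ITERATIONSM1 is n - 1 and NUMBER_OF_ITERATIONS (the number of
   header executions) is n.  If the niter analysis only holds under some
   condition, e.g. that n does not overflow the IV type, that condition is
   returned in ASSUMPTIONS.  */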
717 static gcond *
718 vect_get_loop_niters (struct loop *loop, tree *assumptions,
719 tree *number_of_iterations, tree *number_of_iterationsm1)
721 edge exit = single_exit (loop);
722 struct tree_niter_desc niter_desc;
723 tree niter_assumptions, niter, may_be_zero;
724 gcond *cond = get_loop_exit_condition (loop);
726 *assumptions = boolean_true_node;
727 *number_of_iterationsm1 = chrec_dont_know;
728 *number_of_iterations = chrec_dont_know;
729 if (dump_enabled_p ())
730 dump_printf_loc (MSG_NOTE, vect_location,
731 "=== get_loop_niters ===\n");
733 if (!exit)
734 return cond;
736 niter = chrec_dont_know;
737 may_be_zero = NULL_TREE;
738 niter_assumptions = boolean_true_node;
739 if (!number_of_iterations_exit_assumptions (loop, exit, &niter_desc, NULL)
740 || chrec_contains_undetermined (niter_desc.niter))
741 return cond;
743 niter_assumptions = niter_desc.assumptions;
744 may_be_zero = niter_desc.may_be_zero;
745 niter = niter_desc.niter;
747 if (may_be_zero && integer_zerop (may_be_zero))
748 may_be_zero = NULL_TREE;
750 if (may_be_zero)
752 if (COMPARISON_CLASS_P (may_be_zero))
754 /* Try to combine may_be_zero with assumptions, this can simplify
755 computation of niter expression. */
756 if (niter_assumptions && !integer_nonzerop (niter_assumptions))
757 niter_assumptions = fold_build2 (TRUTH_AND_EXPR, boolean_type_node,
758 niter_assumptions,
759 fold_build1 (TRUTH_NOT_EXPR,
760 boolean_type_node,
761 may_be_zero));
762 else
763 niter = fold_build3 (COND_EXPR, TREE_TYPE (niter), may_be_zero,
764 build_int_cst (TREE_TYPE (niter), 0),
765 rewrite_to_non_trapping_overflow (niter));
767 may_be_zero = NULL_TREE;
769 else if (integer_nonzerop (may_be_zero))
771 *number_of_iterationsm1 = build_int_cst (TREE_TYPE (niter), 0);
772 *number_of_iterations = build_int_cst (TREE_TYPE (niter), 1);
773 return cond;
775 else
776 return cond;
779 *assumptions = niter_assumptions;
780 *number_of_iterationsm1 = niter;
782 /* We want the number of loop header executions which is the number
783 of latch executions plus one.
784 ??? For UINT_MAX latch executions this number overflows to zero
785 for loops like do { n++; } while (n != 0); */
786 if (niter && !chrec_contains_undetermined (niter))
787 niter = fold_build2 (PLUS_EXPR, TREE_TYPE (niter), unshare_expr (niter),
788 build_int_cst (TREE_TYPE (niter), 1));
789 *number_of_iterations = niter;
791 return cond;
794 /* Function bb_in_loop_p
796 Used as predicate for dfs order traversal of the loop bbs. */
798 static bool
799 bb_in_loop_p (const_basic_block bb, const void *data)
801 const struct loop *const loop = (const struct loop *)data;
802 if (flow_bb_inside_loop_p (loop, bb))
803 return true;
804 return false;
808 /* Create and initialize a new loop_vec_info struct for LOOP_IN, as well as
809 stmt_vec_info structs for all the stmts in LOOP_IN. */
811 _loop_vec_info::_loop_vec_info (struct loop *loop_in)
812 : vec_info (vec_info::loop, init_cost (loop_in)),
813 loop (loop_in),
814 bbs (XCNEWVEC (basic_block, loop->num_nodes)),
815 num_itersm1 (NULL_TREE),
816 num_iters (NULL_TREE),
817 num_iters_unchanged (NULL_TREE),
818 num_iters_assumptions (NULL_TREE),
819 th (0),
820 versioning_threshold (0),
821 vectorization_factor (0),
822 max_vectorization_factor (0),
823 mask_skip_niters (NULL_TREE),
824 mask_compare_type (NULL_TREE),
825 unaligned_dr (NULL),
826 peeling_for_alignment (0),
827 ptr_mask (0),
828 ivexpr_map (NULL),
829 slp_unrolling_factor (1),
830 single_scalar_iteration_cost (0),
831 vectorizable (false),
832 can_fully_mask_p (true),
833 fully_masked_p (false),
834 peeling_for_gaps (false),
835 peeling_for_niter (false),
836 operands_swapped (false),
837 no_data_dependencies (false),
838 has_mask_store (false),
839 scalar_loop (NULL),
840 orig_loop_info (NULL)
842 /* Create/Update stmt_info for all stmts in the loop. */
843 basic_block *body = get_loop_body (loop);
844 for (unsigned int i = 0; i < loop->num_nodes; i++)
846 basic_block bb = body[i];
847 gimple_stmt_iterator si;
849 for (si = gsi_start_phis (bb); !gsi_end_p (si); gsi_next (&si))
851 gimple *phi = gsi_stmt (si);
852 gimple_set_uid (phi, 0);
853 set_vinfo_for_stmt (phi, new_stmt_vec_info (phi, this));
856 for (si = gsi_start_bb (bb); !gsi_end_p (si); gsi_next (&si))
858 gimple *stmt = gsi_stmt (si);
859 gimple_set_uid (stmt, 0);
860 set_vinfo_for_stmt (stmt, new_stmt_vec_info (stmt, this));
863 free (body);
865 /* CHECKME: We want to visit all BBs before their successors (except for
866 latch blocks, for which this assertion wouldn't hold). In the simple
867      case of the loop forms we allow, a dfs order of the BBs would be the same
868 as reversed postorder traversal, so we are safe. */
870 unsigned int nbbs = dfs_enumerate_from (loop->header, 0, bb_in_loop_p,
871 bbs, loop->num_nodes, loop);
872 gcc_assert (nbbs == loop->num_nodes);
875 /* Free all levels of MASKS. */
877 void
878 release_vec_loop_masks (vec_loop_masks *masks)
880 rgroup_masks *rgm;
881 unsigned int i;
882 FOR_EACH_VEC_ELT (*masks, i, rgm)
883 rgm->masks.release ();
884 masks->release ();
887 /* Free all memory used by the _loop_vec_info, as well as all the
888 stmt_vec_info structs of all the stmts in the loop. */
890 _loop_vec_info::~_loop_vec_info ()
892 int nbbs;
893 gimple_stmt_iterator si;
894 int j;
896 nbbs = loop->num_nodes;
897 for (j = 0; j < nbbs; j++)
899 basic_block bb = bbs[j];
900 for (si = gsi_start_phis (bb); !gsi_end_p (si); gsi_next (&si))
901 free_stmt_vec_info (gsi_stmt (si));
903 for (si = gsi_start_bb (bb); !gsi_end_p (si); )
905 gimple *stmt = gsi_stmt (si);
907 /* We may have broken canonical form by moving a constant
908 into RHS1 of a commutative op. Fix such occurrences. */
909 if (operands_swapped && is_gimple_assign (stmt))
911 enum tree_code code = gimple_assign_rhs_code (stmt);
913 if ((code == PLUS_EXPR
914 || code == POINTER_PLUS_EXPR
915 || code == MULT_EXPR)
916 && CONSTANT_CLASS_P (gimple_assign_rhs1 (stmt)))
917 swap_ssa_operands (stmt,
918 gimple_assign_rhs1_ptr (stmt),
919 gimple_assign_rhs2_ptr (stmt));
920 else if (code == COND_EXPR
921 && CONSTANT_CLASS_P (gimple_assign_rhs2 (stmt)))
923 tree cond_expr = gimple_assign_rhs1 (stmt);
924 enum tree_code cond_code = TREE_CODE (cond_expr);
926 if (TREE_CODE_CLASS (cond_code) == tcc_comparison)
928 bool honor_nans = HONOR_NANS (TREE_OPERAND (cond_expr,
929 0));
930 cond_code = invert_tree_comparison (cond_code,
931 honor_nans);
932 if (cond_code != ERROR_MARK)
934 TREE_SET_CODE (cond_expr, cond_code);
935 swap_ssa_operands (stmt,
936 gimple_assign_rhs2_ptr (stmt),
937 gimple_assign_rhs3_ptr (stmt));
943 /* Free stmt_vec_info. */
944 free_stmt_vec_info (stmt);
945 gsi_next (&si);
949 free (bbs);
951 release_vec_loop_masks (&masks);
952 delete ivexpr_map;
954 loop->aux = NULL;
957 /* Return an invariant or register for EXPR and emit necessary
958 computations in the LOOP_VINFO loop preheader. */
960 tree
961 cse_and_gimplify_to_preheader (loop_vec_info loop_vinfo, tree expr)
963 if (is_gimple_reg (expr)
964 || is_gimple_min_invariant (expr))
965 return expr;
967 if (! loop_vinfo->ivexpr_map)
968 loop_vinfo->ivexpr_map = new hash_map<tree_operand_hash, tree>;
969 tree &cached = loop_vinfo->ivexpr_map->get_or_insert (expr);
970 if (! cached)
972 gimple_seq stmts = NULL;
973 cached = force_gimple_operand (unshare_expr (expr),
974 &stmts, true, NULL_TREE);
975 if (stmts)
977 edge e = loop_preheader_edge (LOOP_VINFO_LOOP (loop_vinfo));
978 gsi_insert_seq_on_edge_immediate (e, stmts);
981 return cached;
984 /* Return true if we can use CMP_TYPE as the comparison type to produce
985 all masks required to mask LOOP_VINFO. */
987 static bool
988 can_produce_all_loop_masks_p (loop_vec_info loop_vinfo, tree cmp_type)
990 rgroup_masks *rgm;
991 unsigned int i;
992 FOR_EACH_VEC_ELT (LOOP_VINFO_MASKS (loop_vinfo), i, rgm)
993 if (rgm->mask_type != NULL_TREE
994 && !direct_internal_fn_supported_p (IFN_WHILE_ULT,
995 cmp_type, rgm->mask_type,
996 OPTIMIZE_FOR_SPEED))
997 return false;
998 return true;
1001 /* Calculate the maximum number of scalars per iteration for every
1002 rgroup in LOOP_VINFO. */
1004 static unsigned int
1005 vect_get_max_nscalars_per_iter (loop_vec_info loop_vinfo)
1007 unsigned int res = 1;
1008 unsigned int i;
1009 rgroup_masks *rgm;
1010 FOR_EACH_VEC_ELT (LOOP_VINFO_MASKS (loop_vinfo), i, rgm)
1011 res = MAX (res, rgm->max_nscalars_per_iter);
1012 return res;
1015 /* Each statement in LOOP_VINFO can be masked where necessary. Check
1016 whether we can actually generate the masks required. Return true if so,
1017 storing the type of the scalar IV in LOOP_VINFO_MASK_COMPARE_TYPE. */
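/* For illustration (a sketch, not the exact IL that is generated): in a
   fully-masked loop with vectorization factor 8, vector iteration i
   operates under a mask that is conceptually

     mask = { i*8+0 < n, i*8+1 < n, ..., i*8+7 < n }

   i.e. the result of WHILE_ULT (i*8, n).  This function checks that the
   target can produce such masks for every rgroup and chooses the scalar
   type used for the "i*8" and "n" operands of that comparison.  */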
1019 static bool
1020 vect_verify_full_masking (loop_vec_info loop_vinfo)
1022 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1023 unsigned int min_ni_width;
1025 /* Use a normal loop if there are no statements that need masking.
1026 This only happens in rare degenerate cases: it means that the loop
1027 has no loads, no stores, and no live-out values. */
1028 if (LOOP_VINFO_MASKS (loop_vinfo).is_empty ())
1029 return false;
1031 /* Get the maximum number of iterations that is representable
1032 in the counter type. */
1033 tree ni_type = TREE_TYPE (LOOP_VINFO_NITERSM1 (loop_vinfo));
1034 widest_int max_ni = wi::to_widest (TYPE_MAX_VALUE (ni_type)) + 1;
1036 /* Get a more refined estimate for the number of iterations. */
1037 widest_int max_back_edges;
1038 if (max_loop_iterations (loop, &max_back_edges))
1039 max_ni = wi::smin (max_ni, max_back_edges + 1);
1041 /* Account for rgroup masks, in which each bit is replicated N times. */
1042 max_ni *= vect_get_max_nscalars_per_iter (loop_vinfo);
1044 /* Work out how many bits we need to represent the limit. */
1045 min_ni_width = wi::min_precision (max_ni, UNSIGNED);
1047 /* Find a scalar mode for which WHILE_ULT is supported. */
1048 opt_scalar_int_mode cmp_mode_iter;
1049 tree cmp_type = NULL_TREE;
1050 FOR_EACH_MODE_IN_CLASS (cmp_mode_iter, MODE_INT)
1052 unsigned int cmp_bits = GET_MODE_BITSIZE (cmp_mode_iter.require ());
1053 if (cmp_bits >= min_ni_width
1054 && targetm.scalar_mode_supported_p (cmp_mode_iter.require ()))
1056 tree this_type = build_nonstandard_integer_type (cmp_bits, true);
1057 if (this_type
1058 && can_produce_all_loop_masks_p (loop_vinfo, this_type))
1060 /* Although we could stop as soon as we find a valid mode,
1061 it's often better to continue until we hit Pmode, since the
1062 operands to the WHILE are more likely to be reusable in
1063 address calculations. */
1064 cmp_type = this_type;
1065 if (cmp_bits >= GET_MODE_BITSIZE (Pmode))
1066 break;
1071 if (!cmp_type)
1072 return false;
1074 LOOP_VINFO_MASK_COMPARE_TYPE (loop_vinfo) = cmp_type;
1075 return true;
1078 /* Calculate the cost of one scalar iteration of the loop. */
1079 static void
1080 vect_compute_single_scalar_iteration_cost (loop_vec_info loop_vinfo)
1082 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1083 basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
1084 int nbbs = loop->num_nodes, factor;
1085 int innerloop_iters, i;
1087 /* Gather costs for statements in the scalar loop. */
1089 /* FORNOW. */
1090 innerloop_iters = 1;
1091 if (loop->inner)
1092 innerloop_iters = 50; /* FIXME */
1094 for (i = 0; i < nbbs; i++)
1096 gimple_stmt_iterator si;
1097 basic_block bb = bbs[i];
1099 if (bb->loop_father == loop->inner)
1100 factor = innerloop_iters;
1101 else
1102 factor = 1;
1104 for (si = gsi_start_bb (bb); !gsi_end_p (si); gsi_next (&si))
1106 gimple *stmt = gsi_stmt (si);
1107 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
1109 if (!is_gimple_assign (stmt) && !is_gimple_call (stmt))
1110 continue;
1112 /* Skip stmts that are not vectorized inside the loop. */
1113 if (stmt_info
1114 && !STMT_VINFO_RELEVANT_P (stmt_info)
1115 && (!STMT_VINFO_LIVE_P (stmt_info)
1116 || !VECTORIZABLE_CYCLE_DEF (STMT_VINFO_DEF_TYPE (stmt_info)))
1117 && !STMT_VINFO_IN_PATTERN_P (stmt_info))
1118 continue;
1120 vect_cost_for_stmt kind;
1121 if (STMT_VINFO_DATA_REF (stmt_info))
1123 if (DR_IS_READ (STMT_VINFO_DATA_REF (stmt_info)))
1124 kind = scalar_load;
1125 else
1126 kind = scalar_store;
1128 else
1129 kind = scalar_stmt;
1131 record_stmt_cost (&LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo),
1132 factor, kind, stmt_info, 0, vect_prologue);
1136 /* Now accumulate cost. */
1137 void *target_cost_data = init_cost (loop);
1138 stmt_info_for_cost *si;
1139 int j;
1140 FOR_EACH_VEC_ELT (LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo),
1141 j, si)
1143 struct _stmt_vec_info *stmt_info
1144 = si->stmt ? vinfo_for_stmt (si->stmt) : NULL;
1145 (void) add_stmt_cost (target_cost_data, si->count,
1146 si->kind, stmt_info, si->misalign,
1147 vect_body);
1149 unsigned dummy, body_cost = 0;
1150 finish_cost (target_cost_data, &dummy, &body_cost, &dummy);
1151 destroy_cost_data (target_cost_data);
1152 LOOP_VINFO_SINGLE_SCALAR_ITERATION_COST (loop_vinfo) = body_cost;
1156 /* Function vect_analyze_loop_form_1.
1158 Verify that certain CFG restrictions hold, including:
1159 - the loop has a pre-header
1160 - the loop has a single entry and exit
1161 - the loop exit condition is simple enough
1162    - the number of iterations can be analyzed, i.e., a countable loop.  The
1163 niter could be analyzed under some assumptions. */
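/* For example (illustrative only), a countable loop in the expected
   do-while form after loop-header copying:

     if (n > 0)
       {
         i = 0;
         do
           {
             a[i] = b[i] + c[i];
             i++;
           }
         while (i < n);
       }

   It has a single exit, an empty latch and an analyzable iteration count.
   A loop with an additional data-dependent "break" would have multiple
   exits and is rejected below.  */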
1165 bool
1166 vect_analyze_loop_form_1 (struct loop *loop, gcond **loop_cond,
1167 tree *assumptions, tree *number_of_iterationsm1,
1168 tree *number_of_iterations, gcond **inner_loop_cond)
1170 if (dump_enabled_p ())
1171 dump_printf_loc (MSG_NOTE, vect_location,
1172 "=== vect_analyze_loop_form ===\n");
1174 /* Different restrictions apply when we are considering an inner-most loop,
1175 vs. an outer (nested) loop.
1176 (FORNOW. May want to relax some of these restrictions in the future). */
1178 if (!loop->inner)
1180 /* Inner-most loop. We currently require that the number of BBs is
1181 exactly 2 (the header and latch). Vectorizable inner-most loops
1182 look like this:
1184 (pre-header)
1186 header <--------+
1187 | | |
1188 | +--> latch --+
1190 (exit-bb) */
1192 if (loop->num_nodes != 2)
1194 if (dump_enabled_p ())
1195 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1196 "not vectorized: control flow in loop.\n");
1197 return false;
1200 if (empty_block_p (loop->header))
1202 if (dump_enabled_p ())
1203 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1204 "not vectorized: empty loop.\n");
1205 return false;
1208 else
1210 struct loop *innerloop = loop->inner;
1211 edge entryedge;
1213 /* Nested loop. We currently require that the loop is doubly-nested,
1214 contains a single inner loop, and the number of BBs is exactly 5.
1215 Vectorizable outer-loops look like this:
1217 (pre-header)
1219 header <---+
1221 inner-loop |
1223 tail ------+
1225 (exit-bb)
1227 The inner-loop has the properties expected of inner-most loops
1228 as described above. */
1230 if ((loop->inner)->inner || (loop->inner)->next)
1232 if (dump_enabled_p ())
1233 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1234 "not vectorized: multiple nested loops.\n");
1235 return false;
1238 if (loop->num_nodes != 5)
1240 if (dump_enabled_p ())
1241 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1242 "not vectorized: control flow in loop.\n");
1243 return false;
1246 entryedge = loop_preheader_edge (innerloop);
1247 if (entryedge->src != loop->header
1248 || !single_exit (innerloop)
1249 || single_exit (innerloop)->dest != EDGE_PRED (loop->latch, 0)->src)
1251 if (dump_enabled_p ())
1252 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1253 "not vectorized: unsupported outerloop form.\n");
1254 return false;
1257 /* Analyze the inner-loop. */
1258 tree inner_niterm1, inner_niter, inner_assumptions;
1259 if (! vect_analyze_loop_form_1 (loop->inner, inner_loop_cond,
1260 &inner_assumptions, &inner_niterm1,
1261 &inner_niter, NULL)
1262 /* Don't support analyzing niter under assumptions for inner
1263 loop. */
1264 || !integer_onep (inner_assumptions))
1266 if (dump_enabled_p ())
1267 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1268 "not vectorized: Bad inner loop.\n");
1269 return false;
1272 if (!expr_invariant_in_loop_p (loop, inner_niter))
1274 if (dump_enabled_p ())
1275 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1276 "not vectorized: inner-loop count not"
1277 " invariant.\n");
1278 return false;
1281 if (dump_enabled_p ())
1282 dump_printf_loc (MSG_NOTE, vect_location,
1283 "Considering outer-loop vectorization.\n");
1286 if (!single_exit (loop)
1287 || EDGE_COUNT (loop->header->preds) != 2)
1289 if (dump_enabled_p ())
1291 if (!single_exit (loop))
1292 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1293 "not vectorized: multiple exits.\n");
1294 else if (EDGE_COUNT (loop->header->preds) != 2)
1295 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1296 "not vectorized: too many incoming edges.\n");
1298 return false;
1301   /* We assume that the loop exit condition is at the end of the loop, i.e.,
1302 that the loop is represented as a do-while (with a proper if-guard
1303 before the loop if needed), where the loop header contains all the
1304 executable statements, and the latch is empty. */
1305 if (!empty_block_p (loop->latch)
1306 || !gimple_seq_empty_p (phi_nodes (loop->latch)))
1308 if (dump_enabled_p ())
1309 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1310 "not vectorized: latch block not empty.\n");
1311 return false;
1314 /* Make sure the exit is not abnormal. */
1315 edge e = single_exit (loop);
1316 if (e->flags & EDGE_ABNORMAL)
1318 if (dump_enabled_p ())
1319 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1320 "not vectorized: abnormal loop exit edge.\n");
1321 return false;
1324 *loop_cond = vect_get_loop_niters (loop, assumptions, number_of_iterations,
1325 number_of_iterationsm1);
1326 if (!*loop_cond)
1328 if (dump_enabled_p ())
1329 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1330 "not vectorized: complicated exit condition.\n");
1331 return false;
1334 if (integer_zerop (*assumptions)
1335 || !*number_of_iterations
1336 || chrec_contains_undetermined (*number_of_iterations))
1338 if (dump_enabled_p ())
1339 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1340 "not vectorized: number of iterations cannot be "
1341 "computed.\n");
1342 return false;
1345 if (integer_zerop (*number_of_iterations))
1347 if (dump_enabled_p ())
1348 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1349 "not vectorized: number of iterations = 0.\n");
1350 return false;
1353 return true;
1356 /* Analyze LOOP form and return a loop_vec_info if it is of suitable form. */
1358 loop_vec_info
1359 vect_analyze_loop_form (struct loop *loop)
1361 tree assumptions, number_of_iterations, number_of_iterationsm1;
1362 gcond *loop_cond, *inner_loop_cond = NULL;
1364 if (! vect_analyze_loop_form_1 (loop, &loop_cond,
1365 &assumptions, &number_of_iterationsm1,
1366 &number_of_iterations, &inner_loop_cond))
1367 return NULL;
1369 loop_vec_info loop_vinfo = new _loop_vec_info (loop);
1370 LOOP_VINFO_NITERSM1 (loop_vinfo) = number_of_iterationsm1;
1371 LOOP_VINFO_NITERS (loop_vinfo) = number_of_iterations;
1372 LOOP_VINFO_NITERS_UNCHANGED (loop_vinfo) = number_of_iterations;
1373 if (!integer_onep (assumptions))
1375 /* We consider to vectorize this loop by versioning it under
1376 some assumptions. In order to do this, we need to clear
1377 existing information computed by scev and niter analyzer. */
1378 scev_reset_htab ();
1379 free_numbers_of_iterations_estimates (loop);
1380 /* Also set flag for this loop so that following scev and niter
1381 analysis are done under the assumptions. */
1382 loop_constraint_set (loop, LOOP_C_FINITE);
1383 /* Also record the assumptions for versioning. */
1384 LOOP_VINFO_NITERS_ASSUMPTIONS (loop_vinfo) = assumptions;
1387 if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
1389 if (dump_enabled_p ())
1391 dump_printf_loc (MSG_NOTE, vect_location,
1392 "Symbolic number of iterations is ");
1393 dump_generic_expr (MSG_NOTE, TDF_DETAILS, number_of_iterations);
1394 dump_printf (MSG_NOTE, "\n");
1398 STMT_VINFO_TYPE (vinfo_for_stmt (loop_cond)) = loop_exit_ctrl_vec_info_type;
1399 if (inner_loop_cond)
1400 STMT_VINFO_TYPE (vinfo_for_stmt (inner_loop_cond))
1401 = loop_exit_ctrl_vec_info_type;
1403 gcc_assert (!loop->aux);
1404 loop->aux = loop_vinfo;
1405 return loop_vinfo;
1410 /* Scan the loop stmts and, depending on whether there are any (non-)SLP
1411    statements, update the vectorization factor.  */
1413 static void
1414 vect_update_vf_for_slp (loop_vec_info loop_vinfo)
1416 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1417 basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
1418 int nbbs = loop->num_nodes;
1419 poly_uint64 vectorization_factor;
1420 int i;
1422 if (dump_enabled_p ())
1423 dump_printf_loc (MSG_NOTE, vect_location,
1424 "=== vect_update_vf_for_slp ===\n");
1426 vectorization_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
1427 gcc_assert (known_ne (vectorization_factor, 0U));
1429 /* If all the stmts in the loop can be SLPed, we perform only SLP, and
1430 vectorization factor of the loop is the unrolling factor required by
1431      the SLP instances.  If that unrolling factor is 1, we say that we
1432      perform pure SLP on the loop - cross-iteration parallelism is not
1433 exploited. */
1434 bool only_slp_in_loop = true;
1435 for (i = 0; i < nbbs; i++)
1437 basic_block bb = bbs[i];
1438 for (gimple_stmt_iterator si = gsi_start_bb (bb); !gsi_end_p (si);
1439 gsi_next (&si))
1441 gimple *stmt = gsi_stmt (si);
1442 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
1443 if (STMT_VINFO_IN_PATTERN_P (stmt_info)
1444 && STMT_VINFO_RELATED_STMT (stmt_info))
1446 stmt = STMT_VINFO_RELATED_STMT (stmt_info);
1447 stmt_info = vinfo_for_stmt (stmt);
1449 if ((STMT_VINFO_RELEVANT_P (stmt_info)
1450 || VECTORIZABLE_CYCLE_DEF (STMT_VINFO_DEF_TYPE (stmt_info)))
1451 && !PURE_SLP_STMT (stmt_info))
1452 /* STMT needs both SLP and loop-based vectorization. */
1453 only_slp_in_loop = false;
1457 if (only_slp_in_loop)
1459 dump_printf_loc (MSG_NOTE, vect_location,
1460 "Loop contains only SLP stmts\n");
1461 vectorization_factor = LOOP_VINFO_SLP_UNROLLING_FACTOR (loop_vinfo);
1463 else
1465 dump_printf_loc (MSG_NOTE, vect_location,
1466 "Loop contains SLP and non-SLP stmts\n");
1467 /* Both the vectorization factor and unroll factor have the form
1468 current_vector_size * X for some rational X, so they must have
1469 a common multiple. */
1470 vectorization_factor
1471 = force_common_multiple (vectorization_factor,
1472 LOOP_VINFO_SLP_UNROLLING_FACTOR (loop_vinfo));
1475 LOOP_VINFO_VECT_FACTOR (loop_vinfo) = vectorization_factor;
1476 if (dump_enabled_p ())
1478 dump_printf_loc (MSG_NOTE, vect_location,
1479 "Updating vectorization factor to ");
1480 dump_dec (MSG_NOTE, vectorization_factor);
1481 dump_printf (MSG_NOTE, ".\n");
1485 /* Return true if STMT_INFO describes a double reduction phi and if
1486 the other phi in the reduction is also relevant for vectorization.
1487 This rejects cases such as:
1489 outer1:
1490 x_1 = PHI <x_3(outer2), ...>;
1493 inner:
1494 x_2 = ...;
1497 outer2:
1498 x_3 = PHI <x_2(inner)>;
1500 if nothing in x_2 or elsewhere makes x_1 relevant. */
1502 static bool
1503 vect_active_double_reduction_p (stmt_vec_info stmt_info)
1505 if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_double_reduction_def)
1506 return false;
1508 gimple *other_phi = STMT_VINFO_REDUC_DEF (stmt_info);
1509 return STMT_VINFO_RELEVANT_P (vinfo_for_stmt (other_phi));
1512 /* Function vect_analyze_loop_operations.
1514 Scan the loop stmts and make sure they are all vectorizable. */
1516 static bool
1517 vect_analyze_loop_operations (loop_vec_info loop_vinfo)
1519 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1520 basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
1521 int nbbs = loop->num_nodes;
1522 int i;
1523 stmt_vec_info stmt_info;
1524 bool need_to_vectorize = false;
1525 bool ok;
1527 if (dump_enabled_p ())
1528 dump_printf_loc (MSG_NOTE, vect_location,
1529 "=== vect_analyze_loop_operations ===\n");
1531 stmt_vector_for_cost cost_vec;
1532 cost_vec.create (2);
1534 for (i = 0; i < nbbs; i++)
1536 basic_block bb = bbs[i];
1538 for (gphi_iterator si = gsi_start_phis (bb); !gsi_end_p (si);
1539 gsi_next (&si))
1541 gphi *phi = si.phi ();
1542 ok = true;
1544 stmt_info = vinfo_for_stmt (phi);
1545 if (dump_enabled_p ())
1547 dump_printf_loc (MSG_NOTE, vect_location, "examining phi: ");
1548 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, phi, 0);
1550 if (virtual_operand_p (gimple_phi_result (phi)))
1551 continue;
1553 /* Inner-loop loop-closed exit phi in outer-loop vectorization
1554 (i.e., a phi in the tail of the outer-loop). */
1555 if (! is_loop_header_bb_p (bb))
1557 /* FORNOW: we currently don't support the case that these phis
1558              are not used in the outer-loop (unless it is a double reduction,
1559              i.e., this phi is vect_reduction_def), because this case
1560              would require us to actually do something here.  */
1561 if (STMT_VINFO_LIVE_P (stmt_info)
1562 && !vect_active_double_reduction_p (stmt_info))
1564 if (dump_enabled_p ())
1565 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1566 "Unsupported loop-closed phi in "
1567 "outer-loop.\n");
1568 return false;
1571 /* If PHI is used in the outer loop, we check that its operand
1572 is defined in the inner loop. */
1573 if (STMT_VINFO_RELEVANT_P (stmt_info))
1575 tree phi_op;
1576 gimple *op_def_stmt;
1578 if (gimple_phi_num_args (phi) != 1)
1579 return false;
1581 phi_op = PHI_ARG_DEF (phi, 0);
1582 if (TREE_CODE (phi_op) != SSA_NAME)
1583 return false;
1585 op_def_stmt = SSA_NAME_DEF_STMT (phi_op);
1586 if (gimple_nop_p (op_def_stmt)
1587 || !flow_bb_inside_loop_p (loop, gimple_bb (op_def_stmt))
1588 || !vinfo_for_stmt (op_def_stmt))
1589 return false;
1591 if (STMT_VINFO_RELEVANT (vinfo_for_stmt (op_def_stmt))
1592 != vect_used_in_outer
1593 && STMT_VINFO_RELEVANT (vinfo_for_stmt (op_def_stmt))
1594 != vect_used_in_outer_by_reduction)
1595 return false;
1598 continue;
1601 gcc_assert (stmt_info);
1603 if ((STMT_VINFO_RELEVANT (stmt_info) == vect_used_in_scope
1604 || STMT_VINFO_LIVE_P (stmt_info))
1605 && STMT_VINFO_DEF_TYPE (stmt_info) != vect_induction_def)
1607 /* A scalar-dependence cycle that we don't support. */
1608 if (dump_enabled_p ())
1609 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1610 "not vectorized: scalar dependence cycle.\n");
1611 return false;
1614 if (STMT_VINFO_RELEVANT_P (stmt_info))
1616 need_to_vectorize = true;
1617 if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_induction_def
1618 && ! PURE_SLP_STMT (stmt_info))
1619 ok = vectorizable_induction (phi, NULL, NULL, NULL, &cost_vec);
1620 else if ((STMT_VINFO_DEF_TYPE (stmt_info) == vect_reduction_def
1621 || STMT_VINFO_DEF_TYPE (stmt_info) == vect_nested_cycle)
1622 && ! PURE_SLP_STMT (stmt_info))
1623 ok = vectorizable_reduction (phi, NULL, NULL, NULL, NULL,
1624 &cost_vec);
1627 /* SLP PHIs are tested by vect_slp_analyze_node_operations. */
1628 if (ok
1629 && STMT_VINFO_LIVE_P (stmt_info)
1630 && !PURE_SLP_STMT (stmt_info))
1631 ok = vectorizable_live_operation (phi, NULL, NULL, -1, NULL,
1632 &cost_vec);
1634 if (!ok)
1636 if (dump_enabled_p ())
1638 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1639 "not vectorized: relevant phi not "
1640 "supported: ");
1641 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, phi, 0);
1643 return false;
1647 for (gimple_stmt_iterator si = gsi_start_bb (bb); !gsi_end_p (si);
1648 gsi_next (&si))
1650 gimple *stmt = gsi_stmt (si);
1651 if (!gimple_clobber_p (stmt)
1652 && !vect_analyze_stmt (stmt, &need_to_vectorize, NULL, NULL,
1653 &cost_vec))
1654 return false;
1656 } /* bbs */
1658 add_stmt_costs (loop_vinfo->target_cost_data, &cost_vec);
1659 cost_vec.release ();
1661 /* All operations in the loop are either irrelevant (deal with loop
1662 control, or dead), or only used outside the loop and can be moved
1663 out of the loop (e.g. invariants, inductions). The loop can be
1664 optimized away by scalar optimizations. We're better off not
1665 touching this loop. */
1666 if (!need_to_vectorize)
1668 if (dump_enabled_p ())
1669 dump_printf_loc (MSG_NOTE, vect_location,
1670 "All the computation can be taken out of the loop.\n");
1671 if (dump_enabled_p ())
1672 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1673 "not vectorized: redundant loop. no profit to "
1674 "vectorize.\n");
1675 return false;
1678 return true;
1681 /* Analyze the cost of the loop described by LOOP_VINFO. Decide if it
1682    is worthwhile to vectorize.  Return 1 if definitely yes, 0 if
1683 definitely no, or -1 if it's worth retrying. */
1685 static int
1686 vect_analyze_loop_costing (loop_vec_info loop_vinfo)
1688 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1689 unsigned int assumed_vf = vect_vf_for_cost (loop_vinfo);
1691 /* Only fully-masked loops can have iteration counts less than the
1692 vectorization factor. */
1693 if (!LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
1695 HOST_WIDE_INT max_niter;
1697 if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
1698 max_niter = LOOP_VINFO_INT_NITERS (loop_vinfo);
1699 else
1700 max_niter = max_stmt_executions_int (loop);
1702 if (max_niter != -1
1703 && (unsigned HOST_WIDE_INT) max_niter < assumed_vf)
1705 if (dump_enabled_p ())
1706 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1707 "not vectorized: iteration count smaller than "
1708 "vectorization factor.\n");
1709 return 0;
1713 int min_profitable_iters, min_profitable_estimate;
1714 vect_estimate_min_profitable_iters (loop_vinfo, &min_profitable_iters,
1715 &min_profitable_estimate);
1717 if (min_profitable_iters < 0)
1719 if (dump_enabled_p ())
1720 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1721 "not vectorized: vectorization not profitable.\n");
1722 if (dump_enabled_p ())
1723 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1724 "not vectorized: vector version will never be "
1725 "profitable.\n");
1726 return -1;
1729 int min_scalar_loop_bound = (PARAM_VALUE (PARAM_MIN_VECT_LOOP_BOUND)
1730 * assumed_vf);
1732   /* Use the cost model only if it is more conservative than the user-specified
1733 threshold. */
1734 unsigned int th = (unsigned) MAX (min_scalar_loop_bound,
1735 min_profitable_iters);
1737 LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo) = th;
1739 if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
1740 && LOOP_VINFO_INT_NITERS (loop_vinfo) < th)
1742 if (dump_enabled_p ())
1743 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1744 "not vectorized: vectorization not profitable.\n");
1745 if (dump_enabled_p ())
1746 dump_printf_loc (MSG_NOTE, vect_location,
1747 "not vectorized: iteration count smaller than user "
1748 "specified loop bound parameter or minimum profitable "
1749 "iterations (whichever is more conservative).\n");
1750 return 0;
1753 HOST_WIDE_INT estimated_niter = estimated_stmt_executions_int (loop);
1754 if (estimated_niter == -1)
1755 estimated_niter = likely_max_stmt_executions_int (loop);
1756 if (estimated_niter != -1
1757 && ((unsigned HOST_WIDE_INT) estimated_niter
1758 < MAX (th, (unsigned) min_profitable_estimate)))
1760 if (dump_enabled_p ())
1761 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1762 "not vectorized: estimated iteration count too "
1763 "small.\n");
1764 if (dump_enabled_p ())
1765 dump_printf_loc (MSG_NOTE, vect_location,
1766 "not vectorized: estimated iteration count smaller "
1767 "than specified loop bound parameter or minimum "
1768 "profitable iterations (whichever is more "
1769 "conservative).\n");
1770 return -1;
1773 return 1;
1777 /* Function vect_analyze_loop_2.
1779 Apply a set of analyses on LOOP, and create a loop_vec_info struct
1780 for it. The different analyses will record information in the
1781 loop_vec_info struct. */
1782 static bool
1783 vect_analyze_loop_2 (loop_vec_info loop_vinfo, bool &fatal)
1785 bool ok;
1786 int res;
1787 unsigned int max_vf = MAX_VECTORIZATION_FACTOR;
1788 poly_uint64 min_vf = 2;
1789 unsigned int n_stmts = 0;
1791 /* The first group of checks is independent of the vector size. */
1792 fatal = true;
1794 /* Find all data references in the loop (which correspond to vdefs/vuses)
1795 and analyze their evolution in the loop. */
1797 basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
1799 loop_p loop = LOOP_VINFO_LOOP (loop_vinfo);
1800 if (!find_loop_nest (loop, &LOOP_VINFO_LOOP_NEST (loop_vinfo)))
1802 if (dump_enabled_p ())
1803 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1804 "not vectorized: loop nest containing two "
1805 "or more consecutive inner loops cannot be "
1806 "vectorized\n");
1807 return false;
1810 for (unsigned i = 0; i < loop->num_nodes; i++)
1811 for (gimple_stmt_iterator gsi = gsi_start_bb (bbs[i]);
1812 !gsi_end_p (gsi); gsi_next (&gsi))
1814 gimple *stmt = gsi_stmt (gsi);
1815 if (is_gimple_debug (stmt))
1816 continue;
1817 ++n_stmts;
1818 if (!find_data_references_in_stmt (loop, stmt,
1819 &LOOP_VINFO_DATAREFS (loop_vinfo)))
1821 if (is_gimple_call (stmt) && loop->safelen)
1823 tree fndecl = gimple_call_fndecl (stmt), op;
1824 if (fndecl != NULL_TREE)
1826 cgraph_node *node = cgraph_node::get (fndecl);
1827 if (node != NULL && node->simd_clones != NULL)
1829 unsigned int j, n = gimple_call_num_args (stmt);
1830 for (j = 0; j < n; j++)
1832 op = gimple_call_arg (stmt, j);
1833 if (DECL_P (op)
1834 || (REFERENCE_CLASS_P (op)
1835 && get_base_address (op)))
1836 break;
1838 op = gimple_call_lhs (stmt);
1839 /* Ignore #pragma omp declare simd functions
1840 if they don't have data references in the
1841 call stmt itself. */
1842 if (j == n
1843 && !(op
1844 && (DECL_P (op)
1845 || (REFERENCE_CLASS_P (op)
1846 && get_base_address (op)))))
1847 continue;
1851 if (dump_enabled_p ())
1852 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1853 "not vectorized: loop contains function "
1854 "calls or data references that cannot "
1855 "be analyzed\n");
1856 return false;
1860 /* Analyze the data references and also adjust the minimal
1861 vectorization factor according to the loads and stores. */
1863 ok = vect_analyze_data_refs (loop_vinfo, &min_vf);
1864 if (!ok)
1866 if (dump_enabled_p ())
1867 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1868 "bad data references.\n");
1869 return false;
1872 /* Classify all cross-iteration scalar data-flow cycles.
1873 Cross-iteration cycles caused by virtual phis are analyzed separately. */
1874 vect_analyze_scalar_cycles (loop_vinfo);
1876 vect_pattern_recog (loop_vinfo);
1878 vect_fixup_scalar_cycles_with_patterns (loop_vinfo);
1880 /* Analyze the access patterns of the data-refs in the loop (consecutive,
1881 complex, etc.). FORNOW: Only handle consecutive access pattern. */
1883 ok = vect_analyze_data_ref_accesses (loop_vinfo);
1884 if (!ok)
1886 if (dump_enabled_p ())
1887 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1888 "bad data access.\n");
1889 return false;
1892 /* Data-flow analysis to detect stmts that do not need to be vectorized. */
1894 ok = vect_mark_stmts_to_be_vectorized (loop_vinfo);
1895 if (!ok)
1897 if (dump_enabled_p ())
1898 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1899 "unexpected pattern.\n");
1900 return false;
1903 /* The rest of the analysis below depends on the vector size in some way,
     so failures from here on are no longer treated as fatal. */
1904 fatal = false;
1906 /* Analyze data dependences between the data-refs in the loop
1907 and adjust the maximum vectorization factor according to
1908 the dependences.
1909 FORNOW: fail at the first data dependence that we encounter. */
1911 ok = vect_analyze_data_ref_dependences (loop_vinfo, &max_vf);
1912 if (!ok
1913 || (max_vf != MAX_VECTORIZATION_FACTOR
1914 && maybe_lt (max_vf, min_vf)))
1916 if (dump_enabled_p ())
1917 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1918 "bad data dependence.\n");
1919 return false;
1921 LOOP_VINFO_MAX_VECT_FACTOR (loop_vinfo) = max_vf;
1923 ok = vect_determine_vectorization_factor (loop_vinfo);
1924 if (!ok)
1926 if (dump_enabled_p ())
1927 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1928 "can't determine vectorization factor.\n");
1929 return false;
1931 if (max_vf != MAX_VECTORIZATION_FACTOR
1932 && maybe_lt (max_vf, LOOP_VINFO_VECT_FACTOR (loop_vinfo)))
1934 if (dump_enabled_p ())
1935 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1936 "bad data dependence.\n");
1937 return false;
1940 /* Compute the scalar iteration cost. */
1941 vect_compute_single_scalar_iteration_cost (loop_vinfo);
1943 poly_uint64 saved_vectorization_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
1944 unsigned th;
1946 /* Check the SLP opportunities in the loop, analyze and build SLP trees. */
1947 ok = vect_analyze_slp (loop_vinfo, n_stmts);
1948 if (!ok)
1949 return false;
1951 /* If there are any SLP instances mark them as pure_slp. */
1952 bool slp = vect_make_slp_decision (loop_vinfo);
1953 if (slp)
1955 /* Find stmts that need to be both vectorized and SLPed. */
1956 vect_detect_hybrid_slp (loop_vinfo);
1958 /* Update the vectorization factor based on the SLP decision. */
1959 vect_update_vf_for_slp (loop_vinfo);
1962 bool saved_can_fully_mask_p = LOOP_VINFO_CAN_FULLY_MASK_P (loop_vinfo);
1964 /* We don't expect to have to roll back to anything other than an empty
1965 set of rgroups. */
1966 gcc_assert (LOOP_VINFO_MASKS (loop_vinfo).is_empty ());
1968 /* This is the point where we can re-start analysis with SLP forced off. */
1969 start_over:
1971 /* Now the vectorization factor is final. */
1972 poly_uint64 vectorization_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
1973 gcc_assert (known_ne (vectorization_factor, 0U));
1975 if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo) && dump_enabled_p ())
1977 dump_printf_loc (MSG_NOTE, vect_location,
1978 "vectorization_factor = ");
1979 dump_dec (MSG_NOTE, vectorization_factor);
1980 dump_printf (MSG_NOTE, ", niters = " HOST_WIDE_INT_PRINT_DEC "\n",
1981 LOOP_VINFO_INT_NITERS (loop_vinfo));
1984 HOST_WIDE_INT max_niter
1985 = likely_max_stmt_executions_int (LOOP_VINFO_LOOP (loop_vinfo));
1987 /* Analyze the alignment of the data-refs in the loop.
1988 Fail if a data reference is found that cannot be vectorized. */
1990 ok = vect_analyze_data_refs_alignment (loop_vinfo);
1991 if (!ok)
1993 if (dump_enabled_p ())
1994 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1995 "bad data alignment.\n");
1996 return false;
1999 /* Prune the list of ddrs to be tested at run-time by versioning for alias.
2000 It is important to call pruning after vect_analyze_data_ref_accesses,
2001 since we use grouping information gathered by interleaving analysis. */
2002 ok = vect_prune_runtime_alias_test_list (loop_vinfo);
2003 if (!ok)
2004 return false;
2006 /* Do not invoke vect_enhance_data_refs_alignment for epilogue
2007 vectorization. */
2008 if (!LOOP_VINFO_EPILOGUE_P (loop_vinfo))
2010 /* This pass will decide on using loop versioning and/or loop peeling in
2011 order to enhance the alignment of data references in the loop. */
2012 ok = vect_enhance_data_refs_alignment (loop_vinfo);
2013 if (!ok)
2015 if (dump_enabled_p ())
2016 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2017 "bad data alignment.\n");
2018 return false;
2022 if (slp)
2024 /* Analyze operations in the SLP instances. Note this may
2025 remove unsupported SLP instances which makes the above
2026 SLP kind detection invalid. */
2027 unsigned old_size = LOOP_VINFO_SLP_INSTANCES (loop_vinfo).length ();
2028 vect_slp_analyze_operations (loop_vinfo);
2029 if (LOOP_VINFO_SLP_INSTANCES (loop_vinfo).length () != old_size)
2030 goto again;
2033 /* Scan all the remaining operations in the loop that are not subject
2034 to SLP and make sure they are vectorizable. */
2035 ok = vect_analyze_loop_operations (loop_vinfo);
2036 if (!ok)
2038 if (dump_enabled_p ())
2039 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2040 "bad operation or unsupported loop bound.\n");
2041 return false;
2044 /* Decide whether to use a fully-masked loop for this vectorization
2045 factor. */
2046 LOOP_VINFO_FULLY_MASKED_P (loop_vinfo)
2047 = (LOOP_VINFO_CAN_FULLY_MASK_P (loop_vinfo)
2048 && vect_verify_full_masking (loop_vinfo));
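 /* As an illustration, with a vector size of 4 a fully-masked version of

      for (int i = 0; i < n; i++)
        a[i] += 1;

    executes (n + 3) / 4 vector iterations, the last one under a mask that
    disables the lanes beyond n, so no scalar epilogue is needed.  */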
2049 if (dump_enabled_p ())
2051 if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
2052 dump_printf_loc (MSG_NOTE, vect_location,
2053 "using a fully-masked loop.\n");
2054 else
2055 dump_printf_loc (MSG_NOTE, vect_location,
2056 "not using a fully-masked loop.\n");
2059 /* If an epilogue loop is required because of data accesses with gaps,
2060 one additional iteration needs to be peeled. Check if there are
2061 enough iterations for vectorization. */
2062 if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo)
2063 && LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
2064 && !LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
2066 poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
2067 tree scalar_niters = LOOP_VINFO_NITERSM1 (loop_vinfo);
2069 if (known_lt (wi::to_widest (scalar_niters), vf))
2071 if (dump_enabled_p ())
2072 dump_printf_loc (MSG_NOTE, vect_location,
2073 "loop has no enough iterations to support"
2074 " peeling for gaps.\n");
2075 return false;
2079 /* Check that the costings of the loop make vectorizing worthwhile. */
2080 res = vect_analyze_loop_costing (loop_vinfo);
2081 if (res < 0)
2082 goto again;
2083 if (!res)
2085 if (dump_enabled_p ())
2086 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2087 "Loop costings not worthwhile.\n");
2088 return false;
2091 /* Decide whether we need to create an epilogue loop to handle
2092 remaining scalar iterations. */
2093 th = LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo);
2095 unsigned HOST_WIDE_INT const_vf;
2096 if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
2097 /* The main loop handles all iterations. */
2098 LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo) = false;
2099 else if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
2100 && LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) > 0)
2102 if (!multiple_p (LOOP_VINFO_INT_NITERS (loop_vinfo)
2103 - LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo),
2104 LOOP_VINFO_VECT_FACTOR (loop_vinfo)))
2105 LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo) = true;
2107 else if (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo)
2108 || !LOOP_VINFO_VECT_FACTOR (loop_vinfo).is_constant (&const_vf)
2109 || ((tree_ctz (LOOP_VINFO_NITERS (loop_vinfo))
2110 < (unsigned) exact_log2 (const_vf))
2111 /* In case of versioning, check if the maximum number of
2112 iterations is greater than th. If they are identical,
2113 the epilogue is unnecessary. */
2114 && (!LOOP_REQUIRES_VERSIONING (loop_vinfo)
2115 || ((unsigned HOST_WIDE_INT) max_niter
2116 > (th / const_vf) * const_vf))))
2117 LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo) = true;
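 /* For example, with a known trip count of 100, no peeling for alignment,
    no versioning and a vectorization factor of 8, the checks above set
    PEELING_FOR_NITER: the vector loop covers 96 iterations and the
    remaining 100 % 8 = 4 iterations must be handled by an epilogue.  */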
2119 /* If an epilogue loop is required, make sure we can create one. */
2120 if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo)
2121 || LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo))
2123 if (dump_enabled_p ())
2124 dump_printf_loc (MSG_NOTE, vect_location, "epilog loop required\n");
2125 if (!vect_can_advance_ivs_p (loop_vinfo)
2126 || !slpeel_can_duplicate_loop_p (LOOP_VINFO_LOOP (loop_vinfo),
2127 single_exit (LOOP_VINFO_LOOP
2128 (loop_vinfo))))
2130 if (dump_enabled_p ())
2131 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2132 "not vectorized: can't create required "
2133 "epilog loop\n");
2134 goto again;
2138 /* During peeling, we need to check whether the number of loop iterations is
2139 enough for both the peeled prologue loop and the vector loop. This check
2140 can be merged with the threshold check of loop versioning, so
2141 increase the threshold for this case if necessary. */
2142 if (LOOP_REQUIRES_VERSIONING (loop_vinfo))
2144 poly_uint64 niters_th = 0;
2146 if (!vect_use_loop_mask_for_alignment_p (loop_vinfo))
2148 /* Niters for peeled prolog loop. */
2149 if (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) < 0)
2151 struct data_reference *dr = LOOP_VINFO_UNALIGNED_DR (loop_vinfo);
2152 tree vectype
2153 = STMT_VINFO_VECTYPE (vinfo_for_stmt (DR_STMT (dr)));
2154 niters_th += TYPE_VECTOR_SUBPARTS (vectype) - 1;
2156 else
2157 niters_th += LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo);
2160 /* Niters for at least one iteration of vectorized loop. */
2161 if (!LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
2162 niters_th += LOOP_VINFO_VECT_FACTOR (loop_vinfo);
2163 /* One additional iteration because of peeling for gap. */
2164 if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo))
2165 niters_th += 1;
2166 LOOP_VINFO_VERSIONING_THRESHOLD (loop_vinfo) = niters_th;
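 /* For example, if the prologue peels 3 iterations for alignment, the
    vectorization factor is 4 and one extra iteration is peeled for gaps,
    niters_th becomes 3 + 4 + 1 = 8, and the loop versioning check will
    send shorter loops down the scalar path.  */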
2169 gcc_assert (known_eq (vectorization_factor,
2170 LOOP_VINFO_VECT_FACTOR (loop_vinfo)));
2172 /* Ok to vectorize! */
2173 return true;
2175 again:
2176 /* Try again with SLP forced off, but if we didn't do any SLP there is
2177 no point in re-trying. */
2178 if (!slp)
2179 return false;
2181 /* If there are reduction chains, re-trying will fail anyway. */
2182 if (! LOOP_VINFO_REDUCTION_CHAINS (loop_vinfo).is_empty ())
2183 return false;
2185 /* Likewise if the grouped loads or stores in the SLP cannot be handled
2186 via interleaving or lane instructions. */
2187 slp_instance instance;
2188 slp_tree node;
2189 unsigned i, j;
2190 FOR_EACH_VEC_ELT (LOOP_VINFO_SLP_INSTANCES (loop_vinfo), i, instance)
2192 stmt_vec_info vinfo;
2193 vinfo = vinfo_for_stmt
2194 (SLP_TREE_SCALAR_STMTS (SLP_INSTANCE_TREE (instance))[0]);
2195 if (! STMT_VINFO_GROUPED_ACCESS (vinfo))
2196 continue;
2197 vinfo = vinfo_for_stmt (STMT_VINFO_GROUP_FIRST_ELEMENT (vinfo));
2198 unsigned int size = STMT_VINFO_GROUP_SIZE (vinfo);
2199 tree vectype = STMT_VINFO_VECTYPE (vinfo);
2200 if (! vect_store_lanes_supported (vectype, size, false)
2201 && ! known_eq (TYPE_VECTOR_SUBPARTS (vectype), 1U)
2202 && ! vect_grouped_store_supported (vectype, size))
2203 return false;
2204 FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (instance), j, node)
2206 vinfo = vinfo_for_stmt (SLP_TREE_SCALAR_STMTS (node)[0]);
2207 vinfo = vinfo_for_stmt (STMT_VINFO_GROUP_FIRST_ELEMENT (vinfo));
2208 bool single_element_p = !STMT_VINFO_GROUP_NEXT_ELEMENT (vinfo);
2209 size = STMT_VINFO_GROUP_SIZE (vinfo);
2210 vectype = STMT_VINFO_VECTYPE (vinfo);
2211 if (! vect_load_lanes_supported (vectype, size, false)
2212 && ! vect_grouped_load_supported (vectype, single_element_p,
2213 size))
2214 return false;
2218 if (dump_enabled_p ())
2219 dump_printf_loc (MSG_NOTE, vect_location,
2220 "re-trying with SLP disabled\n");
2222 /* Roll back state appropriately. No SLP this time. */
2223 slp = false;
2225 /* Restore the vectorization factor as it was without SLP. */
2225 LOOP_VINFO_VECT_FACTOR (loop_vinfo) = saved_vectorization_factor;
2226 /* Free the SLP instances. */
2227 FOR_EACH_VEC_ELT (LOOP_VINFO_SLP_INSTANCES (loop_vinfo), j, instance)
2228 vect_free_slp_instance (instance);
2229 LOOP_VINFO_SLP_INSTANCES (loop_vinfo).release ();
2230 /* Reset SLP type to loop_vect on all stmts. */
2231 for (i = 0; i < LOOP_VINFO_LOOP (loop_vinfo)->num_nodes; ++i)
2233 basic_block bb = LOOP_VINFO_BBS (loop_vinfo)[i];
2234 for (gimple_stmt_iterator si = gsi_start_phis (bb);
2235 !gsi_end_p (si); gsi_next (&si))
2237 stmt_vec_info stmt_info = vinfo_for_stmt (gsi_stmt (si));
2238 STMT_SLP_TYPE (stmt_info) = loop_vect;
2240 for (gimple_stmt_iterator si = gsi_start_bb (bb);
2241 !gsi_end_p (si); gsi_next (&si))
2243 stmt_vec_info stmt_info = vinfo_for_stmt (gsi_stmt (si));
2244 STMT_SLP_TYPE (stmt_info) = loop_vect;
2245 if (STMT_VINFO_IN_PATTERN_P (stmt_info))
2247 stmt_info = vinfo_for_stmt (STMT_VINFO_RELATED_STMT (stmt_info));
2248 STMT_SLP_TYPE (stmt_info) = loop_vect;
2249 for (gimple_stmt_iterator pi
2250 = gsi_start (STMT_VINFO_PATTERN_DEF_SEQ (stmt_info));
2251 !gsi_end_p (pi); gsi_next (&pi))
2253 gimple *pstmt = gsi_stmt (pi);
2254 STMT_SLP_TYPE (vinfo_for_stmt (pstmt)) = loop_vect;
2259 /* Free optimized alias test DDRS. */
2260 LOOP_VINFO_LOWER_BOUNDS (loop_vinfo).truncate (0);
2261 LOOP_VINFO_COMP_ALIAS_DDRS (loop_vinfo).release ();
2262 LOOP_VINFO_CHECK_UNEQUAL_ADDRS (loop_vinfo).release ();
2263 /* Reset target cost data. */
2264 destroy_cost_data (LOOP_VINFO_TARGET_COST_DATA (loop_vinfo));
2265 LOOP_VINFO_TARGET_COST_DATA (loop_vinfo)
2266 = init_cost (LOOP_VINFO_LOOP (loop_vinfo));
2267 /* Reset accumulated rgroup information. */
2268 release_vec_loop_masks (&LOOP_VINFO_MASKS (loop_vinfo));
2269 /* Reset assorted flags. */
2270 LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo) = false;
2271 LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo) = false;
2272 LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo) = 0;
2273 LOOP_VINFO_VERSIONING_THRESHOLD (loop_vinfo) = 0;
2274 LOOP_VINFO_CAN_FULLY_MASK_P (loop_vinfo) = saved_can_fully_mask_p;
2276 goto start_over;
2279 /* Function vect_analyze_loop.
2281 Apply a set of analyses on LOOP, and create a loop_vec_info struct
2282 for it. The different analyses will record information in the
2283 loop_vec_info struct. If ORIG_LOOP_VINFO is not NULL, the epilogue must
2284 be vectorized. */
2285 loop_vec_info
2286 vect_analyze_loop (struct loop *loop, loop_vec_info orig_loop_vinfo)
2288 loop_vec_info loop_vinfo;
2289 auto_vector_sizes vector_sizes;
2291 /* Autodetect first vector size we try. */
2292 current_vector_size = 0;
2293 targetm.vectorize.autovectorize_vector_sizes (&vector_sizes);
2294 unsigned int next_size = 0;
2296 if (dump_enabled_p ())
2297 dump_printf_loc (MSG_NOTE, vect_location,
2298 "===== analyze_loop_nest =====\n");
2300 if (loop_outer (loop)
2301 && loop_vec_info_for_loop (loop_outer (loop))
2302 && LOOP_VINFO_VECTORIZABLE_P (loop_vec_info_for_loop (loop_outer (loop))))
2304 if (dump_enabled_p ())
2305 dump_printf_loc (MSG_NOTE, vect_location,
2306 "outer-loop already vectorized.\n");
2307 return NULL;
2310 poly_uint64 autodetected_vector_size = 0;
2311 while (1)
2313 /* Check the CFG characteristics of the loop (nesting, entry/exit). */
2314 loop_vinfo = vect_analyze_loop_form (loop);
2315 if (!loop_vinfo)
2317 if (dump_enabled_p ())
2318 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2319 "bad loop form.\n");
2320 return NULL;
2323 bool fatal = false;
2325 if (orig_loop_vinfo)
2326 LOOP_VINFO_ORIG_LOOP_INFO (loop_vinfo) = orig_loop_vinfo;
2328 if (vect_analyze_loop_2 (loop_vinfo, fatal))
2330 LOOP_VINFO_VECTORIZABLE_P (loop_vinfo) = 1;
2332 return loop_vinfo;
2335 delete loop_vinfo;
2337 if (next_size == 0)
2338 autodetected_vector_size = current_vector_size;
2340 if (next_size < vector_sizes.length ()
2341 && known_eq (vector_sizes[next_size], autodetected_vector_size))
2342 next_size += 1;
2344 if (fatal
2345 || next_size == vector_sizes.length ()
2346 || known_eq (current_vector_size, 0U))
2347 return NULL;
2349 /* Try the next biggest vector size. */
2350 current_vector_size = vector_sizes[next_size++];
2351 if (dump_enabled_p ())
2353 dump_printf_loc (MSG_NOTE, vect_location,
2354 "***** Re-trying analysis with "
2355 "vector size ");
2356 dump_dec (MSG_NOTE, current_vector_size);
2357 dump_printf (MSG_NOTE, "\n");
2362 /* Return true if there is an in-order reduction function for CODE, storing
2363 it in *REDUC_FN if so. */
2365 static bool
2366 fold_left_reduction_fn (tree_code code, internal_fn *reduc_fn)
2368 switch (code)
2370 case PLUS_EXPR:
2371 *reduc_fn = IFN_FOLD_LEFT_PLUS;
2372 return true;
2374 default:
2375 return false;
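 /* For example, without -fassociative-math the floating-point accumulation

      double sum = 0.0;
      for (int i = 0; i < n; i++)
        sum += a[i];

    must preserve the original order of the additions, so it can only be
    vectorized as a fold-left reduction, using IFN_FOLD_LEFT_PLUS when the
    target provides it.  */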
2379 /* Function reduction_fn_for_scalar_code
2381 Input:
2382 CODE - tree_code of a reduction operation.
2384 Output:
2385 REDUC_FN - the corresponding internal function to be used to reduce the
2386 vector of partial results into a single scalar result, or IFN_LAST
2387 if the operation is a supported reduction operation, but does not have
2388 such an internal function.
2390 Return FALSE if CODE currently cannot be vectorized as a reduction. */
2392 static bool
2393 reduction_fn_for_scalar_code (enum tree_code code, internal_fn *reduc_fn)
2395 switch (code)
2397 case MAX_EXPR:
2398 *reduc_fn = IFN_REDUC_MAX;
2399 return true;
2401 case MIN_EXPR:
2402 *reduc_fn = IFN_REDUC_MIN;
2403 return true;
2405 case PLUS_EXPR:
2406 *reduc_fn = IFN_REDUC_PLUS;
2407 return true;
2409 case BIT_AND_EXPR:
2410 *reduc_fn = IFN_REDUC_AND;
2411 return true;
2413 case BIT_IOR_EXPR:
2414 *reduc_fn = IFN_REDUC_IOR;
2415 return true;
2417 case BIT_XOR_EXPR:
2418 *reduc_fn = IFN_REDUC_XOR;
2419 return true;
2421 case MULT_EXPR:
2422 case MINUS_EXPR:
2423 *reduc_fn = IFN_LAST;
2424 return true;
2426 default:
2427 return false;
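 /* For example, a vectorized version of

      int sum = 0;
      for (int i = 0; i < n; i++)
        sum += a[i];

    keeps a vector of partial sums inside the loop and applies IFN_REDUC_PLUS
    once in the epilogue to collapse it into the scalar result.  MULT_EXPR and
    MINUS_EXPR return IFN_LAST above because the final reduction is instead
    open-coded (e.g. by repeatedly shifting and combining the partial-result
    vector).  */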
2431 /* If there is a neutral value X such that SLP reduction NODE would not
2432 be affected by the introduction of additional X elements, return that X,
2433 otherwise return null. CODE is the code of the reduction. REDUC_CHAIN
2434 is true if the SLP statements perform a single reduction, false if each
2435 statement performs an independent reduction. */
2437 static tree
2438 neutral_op_for_slp_reduction (slp_tree slp_node, tree_code code,
2439 bool reduc_chain)
2441 vec<gimple *> stmts = SLP_TREE_SCALAR_STMTS (slp_node);
2442 gimple *stmt = stmts[0];
2443 stmt_vec_info stmt_vinfo = vinfo_for_stmt (stmt);
2444 tree vector_type = STMT_VINFO_VECTYPE (stmt_vinfo);
2445 tree scalar_type = TREE_TYPE (vector_type);
2446 struct loop *loop = gimple_bb (stmt)->loop_father;
2447 gcc_assert (loop);
2449 switch (code)
2451 case WIDEN_SUM_EXPR:
2452 case DOT_PROD_EXPR:
2453 case SAD_EXPR:
2454 case PLUS_EXPR:
2455 case MINUS_EXPR:
2456 case BIT_IOR_EXPR:
2457 case BIT_XOR_EXPR:
2458 return build_zero_cst (scalar_type);
2460 case MULT_EXPR:
2461 return build_one_cst (scalar_type);
2463 case BIT_AND_EXPR:
2464 return build_all_ones_cst (scalar_type);
2466 case MAX_EXPR:
2467 case MIN_EXPR:
2468 /* For MIN/MAX the initial values are neutral. A reduction chain
2469 has only a single initial value, so that value is neutral for
2470 all statements. */
2471 if (reduc_chain)
2472 return PHI_ARG_DEF_FROM_EDGE (stmt, loop_preheader_edge (loop));
2473 return NULL_TREE;
2475 default:
2476 return NULL_TREE;
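 /* For example, if an SLP node holds four independent "+" reductions but the
    vector type has eight lanes, the remaining lanes can be filled with the
    neutral value 0 without changing any of the four results; "*" would use 1
    and "&" an all-ones value.  */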
2480 /* Error reporting helper for vect_is_simple_reduction below. GIMPLE statement
2481 STMT is printed with a message MSG. */
2483 static void
2484 report_vect_op (dump_flags_t msg_type, gimple *stmt, const char *msg)
2486 dump_printf_loc (msg_type, vect_location, "%s", msg);
2487 dump_gimple_stmt (msg_type, TDF_SLIM, stmt, 0);
2491 /* Detect SLP reduction of the form:
2493 #a1 = phi <a5, a0>
2494 a2 = operation (a1)
2495 a3 = operation (a2)
2496 a4 = operation (a3)
2497 a5 = operation (a4)
2499 #a = phi <a5>
2501 PHI is the reduction phi node (#a1 = phi <a5, a0> above)
2502 FIRST_STMT is the first reduction stmt in the chain
2503 (a2 = operation (a1)).
2505 Return TRUE if a reduction chain was detected. */
2507 static bool
2508 vect_is_slp_reduction (loop_vec_info loop_info, gimple *phi,
2509 gimple *first_stmt)
2511 struct loop *loop = (gimple_bb (phi))->loop_father;
2512 struct loop *vect_loop = LOOP_VINFO_LOOP (loop_info);
2513 enum tree_code code;
2514 gimple *current_stmt = NULL, *loop_use_stmt = NULL, *first, *next_stmt;
2515 stmt_vec_info use_stmt_info, current_stmt_info;
2516 tree lhs;
2517 imm_use_iterator imm_iter;
2518 use_operand_p use_p;
2519 int nloop_uses, size = 0, n_out_of_loop_uses;
2520 bool found = false;
2522 if (loop != vect_loop)
2523 return false;
2525 lhs = PHI_RESULT (phi);
2526 code = gimple_assign_rhs_code (first_stmt);
2527 while (1)
2529 nloop_uses = 0;
2530 n_out_of_loop_uses = 0;
2531 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, lhs)
2533 gimple *use_stmt = USE_STMT (use_p);
2534 if (is_gimple_debug (use_stmt))
2535 continue;
2537 /* Check if we got back to the reduction phi. */
2538 if (use_stmt == phi)
2540 loop_use_stmt = use_stmt;
2541 found = true;
2542 break;
2545 if (flow_bb_inside_loop_p (loop, gimple_bb (use_stmt)))
2547 loop_use_stmt = use_stmt;
2548 nloop_uses++;
2550 else
2551 n_out_of_loop_uses++;
2553 /* There can be either a single use in the loop or two uses in
2554 phi nodes. */
2555 if (nloop_uses > 1 || (n_out_of_loop_uses && nloop_uses))
2556 return false;
2559 if (found)
2560 break;
2562 /* We reached a statement with no loop uses. */
2563 if (nloop_uses == 0)
2564 return false;
2566 /* This is a loop exit phi, and we haven't reached the reduction phi. */
2567 if (gimple_code (loop_use_stmt) == GIMPLE_PHI)
2568 return false;
2570 if (!is_gimple_assign (loop_use_stmt)
2571 || code != gimple_assign_rhs_code (loop_use_stmt)
2572 || !flow_bb_inside_loop_p (loop, gimple_bb (loop_use_stmt)))
2573 return false;
2575 /* Insert USE_STMT into reduction chain. */
2576 use_stmt_info = vinfo_for_stmt (loop_use_stmt);
2577 if (current_stmt)
2579 current_stmt_info = vinfo_for_stmt (current_stmt);
2580 GROUP_NEXT_ELEMENT (current_stmt_info) = loop_use_stmt;
2581 GROUP_FIRST_ELEMENT (use_stmt_info)
2582 = GROUP_FIRST_ELEMENT (current_stmt_info);
2584 else
2585 GROUP_FIRST_ELEMENT (use_stmt_info) = loop_use_stmt;
2587 lhs = gimple_assign_lhs (loop_use_stmt);
2588 current_stmt = loop_use_stmt;
2589 size++;
2592 if (!found || loop_use_stmt != phi || size < 2)
2593 return false;
2595 /* Swap the operands, if needed, to make the reduction operand be the second
2596 operand. */
2597 lhs = PHI_RESULT (phi);
2598 next_stmt = GROUP_FIRST_ELEMENT (vinfo_for_stmt (current_stmt));
2599 while (next_stmt)
2601 if (gimple_assign_rhs2 (next_stmt) == lhs)
2603 tree op = gimple_assign_rhs1 (next_stmt);
2604 gimple *def_stmt = NULL;
2606 if (TREE_CODE (op) == SSA_NAME)
2607 def_stmt = SSA_NAME_DEF_STMT (op);
2609 /* Check that the other def is either defined in the loop
2610 ("vect_internal_def"), or it's an induction (defined by a
2611 loop-header phi-node). */
2612 if (def_stmt
2613 && gimple_bb (def_stmt)
2614 && flow_bb_inside_loop_p (loop, gimple_bb (def_stmt))
2615 && (is_gimple_assign (def_stmt)
2616 || is_gimple_call (def_stmt)
2617 || STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def_stmt))
2618 == vect_induction_def
2619 || (gimple_code (def_stmt) == GIMPLE_PHI
2620 && STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def_stmt))
2621 == vect_internal_def
2622 && !is_loop_header_bb_p (gimple_bb (def_stmt)))))
2624 lhs = gimple_assign_lhs (next_stmt);
2625 next_stmt = GROUP_NEXT_ELEMENT (vinfo_for_stmt (next_stmt));
2626 continue;
2629 return false;
2631 else
2633 tree op = gimple_assign_rhs2 (next_stmt);
2634 gimple *def_stmt = NULL;
2636 if (TREE_CODE (op) == SSA_NAME)
2637 def_stmt = SSA_NAME_DEF_STMT (op);
2639 /* Check that the other def is either defined in the loop
2640 ("vect_internal_def"), or it's an induction (defined by a
2641 loop-header phi-node). */
2642 if (def_stmt
2643 && gimple_bb (def_stmt)
2644 && flow_bb_inside_loop_p (loop, gimple_bb (def_stmt))
2645 && (is_gimple_assign (def_stmt)
2646 || is_gimple_call (def_stmt)
2647 || STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def_stmt))
2648 == vect_induction_def
2649 || (gimple_code (def_stmt) == GIMPLE_PHI
2650 && STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def_stmt))
2651 == vect_internal_def
2652 && !is_loop_header_bb_p (gimple_bb (def_stmt)))))
2654 if (dump_enabled_p ())
2656 dump_printf_loc (MSG_NOTE, vect_location, "swapping oprnds: ");
2657 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, next_stmt, 0);
2660 swap_ssa_operands (next_stmt,
2661 gimple_assign_rhs1_ptr (next_stmt),
2662 gimple_assign_rhs2_ptr (next_stmt));
2663 update_stmt (next_stmt);
2665 if (CONSTANT_CLASS_P (gimple_assign_rhs1 (next_stmt)))
2666 LOOP_VINFO_OPERANDS_SWAPPED (loop_info) = true;
2668 else
2669 return false;
2672 lhs = gimple_assign_lhs (next_stmt);
2673 next_stmt = GROUP_NEXT_ELEMENT (vinfo_for_stmt (next_stmt));
2676 /* Save the chain for further analysis in SLP detection. */
2677 first = GROUP_FIRST_ELEMENT (vinfo_for_stmt (current_stmt));
2678 LOOP_VINFO_REDUCTION_CHAINS (loop_info).safe_push (first);
2679 GROUP_SIZE (vinfo_for_stmt (first)) = size;
2681 return true;
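 /* Source code producing such a chain typically looks like

      int sum = 0;
      for (int i = 0; i < n; i++)
        {
          sum += a[2 * i];
          sum += a[2 * i + 1];
        }

    where every statement accumulates into the previous value of SUM and only
    the final value is used after the loop.  */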
2684 /* Return true if we need an in-order reduction for operation CODE
2685 on type TYPE. NEED_WRAPPING_INTEGRAL_OVERFLOW is true if integer
2686 overflow must wrap. */
2688 static bool
2689 needs_fold_left_reduction_p (tree type, tree_code code,
2690 bool need_wrapping_integral_overflow)
2692 /* CHECKME: check for !flag_finite_math_only too? */
2693 if (SCALAR_FLOAT_TYPE_P (type))
2694 switch (code)
2696 case MIN_EXPR:
2697 case MAX_EXPR:
2698 return false;
2700 default:
2701 return !flag_associative_math;
2704 if (INTEGRAL_TYPE_P (type))
2706 if (!operation_no_trapping_overflow (type, code))
2707 return true;
2708 if (need_wrapping_integral_overflow
2709 && !TYPE_OVERFLOW_WRAPS (type)
2710 && operation_can_overflow (code))
2711 return true;
2712 return false;
2715 if (SAT_FIXED_POINT_TYPE_P (type))
2716 return true;
2718 return false;
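 /* For example, with -ftrapv the signed accumulation

      int sum = 0;
      for (int i = 0; i < n; i++)
        sum += a[i];

    cannot be reassociated: a reordered sum could trap on an intermediate
    overflow that the original evaluation order would never produce, so an
    in-order reduction is required.  */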
2721 /* Return true if the reduction PHI in LOOP with latch arg LOOP_ARG and
2722 reduction operation CODE has a handled computation expression. */
2724 bool
2725 check_reduction_path (location_t loc, loop_p loop, gphi *phi, tree loop_arg,
2726 enum tree_code code)
2728 auto_vec<std::pair<ssa_op_iter, use_operand_p> > path;
2729 auto_bitmap visited;
2730 tree lookfor = PHI_RESULT (phi);
2731 ssa_op_iter curri;
2732 use_operand_p curr = op_iter_init_phiuse (&curri, phi, SSA_OP_USE);
2733 while (USE_FROM_PTR (curr) != loop_arg)
2734 curr = op_iter_next_use (&curri);
2735 curri.i = curri.numops;
2738 path.safe_push (std::make_pair (curri, curr));
2739 tree use = USE_FROM_PTR (curr);
2740 if (use == lookfor)
2741 break;
2742 gimple *def = SSA_NAME_DEF_STMT (use);
2743 if (gimple_nop_p (def)
2744 || ! flow_bb_inside_loop_p (loop, gimple_bb (def)))
2746 pop:
2749 std::pair<ssa_op_iter, use_operand_p> x = path.pop ();
2750 curri = x.first;
2751 curr = x.second;
2753 curr = op_iter_next_use (&curri);
2754 /* Skip already visited or non-SSA operands (from iterating
2755 over PHI args). */
2756 while (curr != NULL_USE_OPERAND_P
2757 && (TREE_CODE (USE_FROM_PTR (curr)) != SSA_NAME
2758 || ! bitmap_set_bit (visited,
2759 SSA_NAME_VERSION
2760 (USE_FROM_PTR (curr)))));
2762 while (curr == NULL_USE_OPERAND_P && ! path.is_empty ());
2763 if (curr == NULL_USE_OPERAND_P)
2764 break;
2766 else
2768 if (gimple_code (def) == GIMPLE_PHI)
2769 curr = op_iter_init_phiuse (&curri, as_a <gphi *>(def), SSA_OP_USE);
2770 else
2771 curr = op_iter_init_use (&curri, def, SSA_OP_USE);
2772 while (curr != NULL_USE_OPERAND_P
2773 && (TREE_CODE (USE_FROM_PTR (curr)) != SSA_NAME
2774 || ! bitmap_set_bit (visited,
2775 SSA_NAME_VERSION
2776 (USE_FROM_PTR (curr)))))
2777 curr = op_iter_next_use (&curri);
2778 if (curr == NULL_USE_OPERAND_P)
2779 goto pop;
2782 while (1);
2783 if (dump_file && (dump_flags & TDF_DETAILS))
2785 dump_printf_loc (MSG_NOTE, loc, "reduction path: ");
2786 unsigned i;
2787 std::pair<ssa_op_iter, use_operand_p> *x;
2788 FOR_EACH_VEC_ELT (path, i, x)
2790 dump_generic_expr (MSG_NOTE, TDF_SLIM, USE_FROM_PTR (x->second));
2791 dump_printf (MSG_NOTE, " ");
2793 dump_printf (MSG_NOTE, "\n");
2796 /* Check whether the reduction path detected is valid. */
2797 bool fail = path.length () == 0;
2798 bool neg = false;
2799 for (unsigned i = 1; i < path.length (); ++i)
2801 gimple *use_stmt = USE_STMT (path[i].second);
2802 tree op = USE_FROM_PTR (path[i].second);
2803 if (! has_single_use (op)
2804 || ! is_gimple_assign (use_stmt))
2806 fail = true;
2807 break;
2809 if (gimple_assign_rhs_code (use_stmt) != code)
2811 if (code == PLUS_EXPR
2812 && gimple_assign_rhs_code (use_stmt) == MINUS_EXPR)
2814 /* Track whether we negate the reduction value each iteration. */
2815 if (gimple_assign_rhs2 (use_stmt) == op)
2816 neg = ! neg;
2818 else
2820 fail = true;
2821 break;
2825 return ! fail && ! neg;
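 /* For example, with CODE == PLUS_EXPR the walk above accepts

      for (int i = 0; i < n; i++)
        s = (s + a[i]) - b[i];

    because the MINUS_EXPR only subtracts b[i], not the running value, but it
    rejects

      for (int i = 0; i < n; i++)
        s = b[i] - (s + a[i]);

    because there the running value itself is negated on every iteration,
    which is what NEG tracks.  */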
2829 /* Function vect_is_simple_reduction
2831 (1) Detect a cross-iteration def-use cycle that represents a simple
2832 reduction computation. We look for the following pattern:
2834 loop_header:
2835 a1 = phi < a0, a2 >
2836 a3 = ...
2837 a2 = operation (a3, a1)
2841 a3 = ...
2842 loop_header:
2843 a1 = phi < a0, a2 >
2844 a2 = operation (a3, a1)
2846 such that:
2847 1. operation is commutative and associative and it is safe to
2848 change the order of the computation
2849 2. no uses for a2 in the loop (a2 is used out of the loop)
2850 3. no uses of a1 in the loop besides the reduction operation
2851 4. no uses of a1 outside the loop.
2853 Conditions 1,4 are tested here.
2854 Conditions 2,3 are tested in vect_mark_stmts_to_be_vectorized.
2856 (2) Detect a cross-iteration def-use cycle in nested loops, i.e.,
2857 nested cycles.
2859 (3) Detect cycles of phi nodes in outer-loop vectorization, i.e., double
2860 reductions:
2862 a1 = phi < a0, a2 >
2863 inner loop (def of a3)
2864 a2 = phi < a3 >
2866 (4) Detect condition expressions, i.e.:
2867 for (int i = 0; i < N; i++)
2868 if (a[i] < val)
2869 ret_val = a[i];
2873 static gimple *
2874 vect_is_simple_reduction (loop_vec_info loop_info, gimple *phi,
2875 bool *double_reduc,
2876 bool need_wrapping_integral_overflow,
2877 enum vect_reduction_type *v_reduc_type)
2879 struct loop *loop = (gimple_bb (phi))->loop_father;
2880 struct loop *vect_loop = LOOP_VINFO_LOOP (loop_info);
2881 gimple *def_stmt, *def1 = NULL, *def2 = NULL, *phi_use_stmt = NULL;
2882 enum tree_code orig_code, code;
2883 tree op1, op2, op3 = NULL_TREE, op4 = NULL_TREE;
2884 tree type;
2885 int nloop_uses;
2886 tree name;
2887 imm_use_iterator imm_iter;
2888 use_operand_p use_p;
2889 bool phi_def;
2891 *double_reduc = false;
2892 *v_reduc_type = TREE_CODE_REDUCTION;
2894 tree phi_name = PHI_RESULT (phi);
2895 /* ??? If there are no uses of the PHI result the inner loop reduction
2896 won't be detected as possibly double-reduction by vectorizable_reduction
2897 because that tries to walk the PHI arg from the preheader edge which
2898 can be constant. See PR60382. */
2899 if (has_zero_uses (phi_name))
2900 return NULL;
2901 nloop_uses = 0;
2902 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, phi_name)
2904 gimple *use_stmt = USE_STMT (use_p);
2905 if (is_gimple_debug (use_stmt))
2906 continue;
2908 if (!flow_bb_inside_loop_p (loop, gimple_bb (use_stmt)))
2910 if (dump_enabled_p ())
2911 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2912 "intermediate value used outside loop.\n");
2914 return NULL;
2917 nloop_uses++;
2918 if (nloop_uses > 1)
2920 if (dump_enabled_p ())
2921 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2922 "reduction value used in loop.\n");
2923 return NULL;
2926 phi_use_stmt = use_stmt;
2929 edge latch_e = loop_latch_edge (loop);
2930 tree loop_arg = PHI_ARG_DEF_FROM_EDGE (phi, latch_e);
2931 if (TREE_CODE (loop_arg) != SSA_NAME)
2933 if (dump_enabled_p ())
2935 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2936 "reduction: not ssa_name: ");
2937 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, loop_arg);
2938 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
2940 return NULL;
2943 def_stmt = SSA_NAME_DEF_STMT (loop_arg);
2944 if (is_gimple_assign (def_stmt))
2946 name = gimple_assign_lhs (def_stmt);
2947 phi_def = false;
2949 else if (gimple_code (def_stmt) == GIMPLE_PHI)
2951 name = PHI_RESULT (def_stmt);
2952 phi_def = true;
2954 else
2956 if (dump_enabled_p ())
2958 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2959 "reduction: unhandled reduction operation: ");
2960 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, def_stmt, 0);
2962 return NULL;
2965 if (! flow_bb_inside_loop_p (loop, gimple_bb (def_stmt)))
2966 return NULL;
2968 nloop_uses = 0;
2969 auto_vec<gphi *, 3> lcphis;
2970 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, name)
2972 gimple *use_stmt = USE_STMT (use_p);
2973 if (is_gimple_debug (use_stmt))
2974 continue;
2975 if (flow_bb_inside_loop_p (loop, gimple_bb (use_stmt)))
2976 nloop_uses++;
2977 else
2978 /* We can have more than one loop-closed PHI. */
2979 lcphis.safe_push (as_a <gphi *> (use_stmt));
2980 if (nloop_uses > 1)
2982 if (dump_enabled_p ())
2983 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2984 "reduction used in loop.\n");
2985 return NULL;
2989 /* If DEF_STMT is a phi node itself, we expect it to have a single argument
2990 defined in the inner loop. */
2991 if (phi_def)
2993 op1 = PHI_ARG_DEF (def_stmt, 0);
2995 if (gimple_phi_num_args (def_stmt) != 1
2996 || TREE_CODE (op1) != SSA_NAME)
2998 if (dump_enabled_p ())
2999 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3000 "unsupported phi node definition.\n");
3002 return NULL;
3005 def1 = SSA_NAME_DEF_STMT (op1);
3006 if (gimple_bb (def1)
3007 && flow_bb_inside_loop_p (loop, gimple_bb (def_stmt))
3008 && loop->inner
3009 && flow_bb_inside_loop_p (loop->inner, gimple_bb (def1))
3010 && is_gimple_assign (def1)
3011 && flow_bb_inside_loop_p (loop->inner, gimple_bb (phi_use_stmt)))
3013 if (dump_enabled_p ())
3014 report_vect_op (MSG_NOTE, def_stmt,
3015 "detected double reduction: ");
3017 *double_reduc = true;
3018 return def_stmt;
3021 return NULL;
3024 /* If we are vectorizing an inner reduction, we execute it in the
3025 original order only when we are not dealing with a
3026 double reduction. */
3027 bool check_reduction = true;
3028 if (flow_loop_nested_p (vect_loop, loop))
3030 gphi *lcphi;
3031 unsigned i;
3032 check_reduction = false;
3033 FOR_EACH_VEC_ELT (lcphis, i, lcphi)
3034 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, gimple_phi_result (lcphi))
3036 gimple *use_stmt = USE_STMT (use_p);
3037 if (is_gimple_debug (use_stmt))
3038 continue;
3039 if (! flow_bb_inside_loop_p (vect_loop, gimple_bb (use_stmt)))
3040 check_reduction = true;
3044 bool nested_in_vect_loop = flow_loop_nested_p (vect_loop, loop);
3045 code = orig_code = gimple_assign_rhs_code (def_stmt);
3047 /* We can handle "res -= x[i]", which is non-associative, by
3048 simply rewriting this into "res += -x[i]". Avoid changing
3049 gimple instruction for the first simple tests and only do this
3050 if we're allowed to change code at all. */
3051 if (code == MINUS_EXPR && gimple_assign_rhs2 (def_stmt) != phi_name)
3052 code = PLUS_EXPR;
3054 if (code == COND_EXPR)
3056 if (! nested_in_vect_loop)
3057 *v_reduc_type = COND_REDUCTION;
3059 op3 = gimple_assign_rhs1 (def_stmt);
3060 if (COMPARISON_CLASS_P (op3))
3062 op4 = TREE_OPERAND (op3, 1);
3063 op3 = TREE_OPERAND (op3, 0);
3065 if (op3 == phi_name || op4 == phi_name)
3067 if (dump_enabled_p ())
3068 report_vect_op (MSG_MISSED_OPTIMIZATION, def_stmt,
3069 "reduction: condition depends on previous"
3070 " iteration: ");
3071 return NULL;
3074 op1 = gimple_assign_rhs2 (def_stmt);
3075 op2 = gimple_assign_rhs3 (def_stmt);
3077 else if (!commutative_tree_code (code) || !associative_tree_code (code))
3079 if (dump_enabled_p ())
3080 report_vect_op (MSG_MISSED_OPTIMIZATION, def_stmt,
3081 "reduction: not commutative/associative: ");
3082 return NULL;
3084 else if (get_gimple_rhs_class (code) == GIMPLE_BINARY_RHS)
3086 op1 = gimple_assign_rhs1 (def_stmt);
3087 op2 = gimple_assign_rhs2 (def_stmt);
3089 else
3091 if (dump_enabled_p ())
3092 report_vect_op (MSG_MISSED_OPTIMIZATION, def_stmt,
3093 "reduction: not handled operation: ");
3094 return NULL;
3097 if (TREE_CODE (op1) != SSA_NAME && TREE_CODE (op2) != SSA_NAME)
3099 if (dump_enabled_p ())
3100 report_vect_op (MSG_MISSED_OPTIMIZATION, def_stmt,
3101 "reduction: both uses not ssa_names: ");
3103 return NULL;
3106 type = TREE_TYPE (gimple_assign_lhs (def_stmt));
3107 if ((TREE_CODE (op1) == SSA_NAME
3108 && !types_compatible_p (type,TREE_TYPE (op1)))
3109 || (TREE_CODE (op2) == SSA_NAME
3110 && !types_compatible_p (type, TREE_TYPE (op2)))
3111 || (op3 && TREE_CODE (op3) == SSA_NAME
3112 && !types_compatible_p (type, TREE_TYPE (op3)))
3113 || (op4 && TREE_CODE (op4) == SSA_NAME
3114 && !types_compatible_p (type, TREE_TYPE (op4))))
3116 if (dump_enabled_p ())
3118 dump_printf_loc (MSG_NOTE, vect_location,
3119 "reduction: multiple types: operation type: ");
3120 dump_generic_expr (MSG_NOTE, TDF_SLIM, type);
3121 dump_printf (MSG_NOTE, ", operands types: ");
3122 dump_generic_expr (MSG_NOTE, TDF_SLIM,
3123 TREE_TYPE (op1));
3124 dump_printf (MSG_NOTE, ",");
3125 dump_generic_expr (MSG_NOTE, TDF_SLIM,
3126 TREE_TYPE (op2));
3127 if (op3)
3129 dump_printf (MSG_NOTE, ",");
3130 dump_generic_expr (MSG_NOTE, TDF_SLIM,
3131 TREE_TYPE (op3));
3134 if (op4)
3136 dump_printf (MSG_NOTE, ",");
3137 dump_generic_expr (MSG_NOTE, TDF_SLIM,
3138 TREE_TYPE (op4));
3140 dump_printf (MSG_NOTE, "\n");
3143 return NULL;
3146 /* Check whether it's ok to change the order of the computation.
3147 Generally, when vectorizing a reduction we change the order of the
3148 computation. This may change the behavior of the program in some
3149 cases, so we need to check that this is ok. One exception is when
3150 vectorizing an outer-loop: the inner-loop is executed sequentially,
3151 and therefore vectorizing reductions in the inner-loop during
3152 outer-loop vectorization is safe. */
3153 if (check_reduction
3154 && *v_reduc_type == TREE_CODE_REDUCTION
3155 && needs_fold_left_reduction_p (type, code,
3156 need_wrapping_integral_overflow))
3157 *v_reduc_type = FOLD_LEFT_REDUCTION;
3159 /* Reduction is safe. We're dealing with one of the following:
3160 1) integer arithmetic and no trapv
3161 2) floating point arithmetic, and special flags permit this optimization
3162 3) nested cycle (i.e., outer loop vectorization). */
3163 if (TREE_CODE (op1) == SSA_NAME)
3164 def1 = SSA_NAME_DEF_STMT (op1);
3166 if (TREE_CODE (op2) == SSA_NAME)
3167 def2 = SSA_NAME_DEF_STMT (op2);
3169 if (code != COND_EXPR
3170 && ((!def1 || gimple_nop_p (def1)) && (!def2 || gimple_nop_p (def2))))
3172 if (dump_enabled_p ())
3173 report_vect_op (MSG_NOTE, def_stmt, "reduction: no defs for operands: ");
3174 return NULL;
3177 /* Check that one def is the reduction def, defined by PHI,
3178 the other def is either defined in the loop ("vect_internal_def"),
3179 or it's an induction (defined by a loop-header phi-node). */
3181 if (def2 && def2 == phi
3182 && (code == COND_EXPR
3183 || !def1 || gimple_nop_p (def1)
3184 || !flow_bb_inside_loop_p (loop, gimple_bb (def1))
3185 || (def1 && flow_bb_inside_loop_p (loop, gimple_bb (def1))
3186 && (is_gimple_assign (def1)
3187 || is_gimple_call (def1)
3188 || STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def1))
3189 == vect_induction_def
3190 || (gimple_code (def1) == GIMPLE_PHI
3191 && STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def1))
3192 == vect_internal_def
3193 && !is_loop_header_bb_p (gimple_bb (def1)))))))
3195 if (dump_enabled_p ())
3196 report_vect_op (MSG_NOTE, def_stmt, "detected reduction: ");
3197 return def_stmt;
3200 if (def1 && def1 == phi
3201 && (code == COND_EXPR
3202 || !def2 || gimple_nop_p (def2)
3203 || !flow_bb_inside_loop_p (loop, gimple_bb (def2))
3204 || (def2 && flow_bb_inside_loop_p (loop, gimple_bb (def2))
3205 && (is_gimple_assign (def2)
3206 || is_gimple_call (def2)
3207 || STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def2))
3208 == vect_induction_def
3209 || (gimple_code (def2) == GIMPLE_PHI
3210 && STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def2))
3211 == vect_internal_def
3212 && !is_loop_header_bb_p (gimple_bb (def2)))))))
3214 if (! nested_in_vect_loop && orig_code != MINUS_EXPR)
3216 /* Check if we can swap operands (just for simplicity - so that
3217 the rest of the code can assume that the reduction variable
3218 is always the last (second) argument). */
3219 if (code == COND_EXPR)
3221 /* Swap cond_expr by inverting the condition. */
3222 tree cond_expr = gimple_assign_rhs1 (def_stmt);
3223 enum tree_code invert_code = ERROR_MARK;
3224 enum tree_code cond_code = TREE_CODE (cond_expr);
3226 if (TREE_CODE_CLASS (cond_code) == tcc_comparison)
3228 bool honor_nans = HONOR_NANS (TREE_OPERAND (cond_expr, 0));
3229 invert_code = invert_tree_comparison (cond_code, honor_nans);
3231 if (invert_code != ERROR_MARK)
3233 TREE_SET_CODE (cond_expr, invert_code);
3234 swap_ssa_operands (def_stmt,
3235 gimple_assign_rhs2_ptr (def_stmt),
3236 gimple_assign_rhs3_ptr (def_stmt));
3238 else
3240 if (dump_enabled_p ())
3241 report_vect_op (MSG_NOTE, def_stmt,
3242 "detected reduction: cannot swap operands "
3243 "for cond_expr");
3244 return NULL;
3247 else
3248 swap_ssa_operands (def_stmt, gimple_assign_rhs1_ptr (def_stmt),
3249 gimple_assign_rhs2_ptr (def_stmt));
3251 if (dump_enabled_p ())
3252 report_vect_op (MSG_NOTE, def_stmt,
3253 "detected reduction: need to swap operands: ");
3255 if (CONSTANT_CLASS_P (gimple_assign_rhs1 (def_stmt)))
3256 LOOP_VINFO_OPERANDS_SWAPPED (loop_info) = true;
3258 else
3260 if (dump_enabled_p ())
3261 report_vect_op (MSG_NOTE, def_stmt, "detected reduction: ");
3264 return def_stmt;
3267 /* Try to find SLP reduction chain. */
3268 if (! nested_in_vect_loop
3269 && code != COND_EXPR
3270 && orig_code != MINUS_EXPR
3271 && vect_is_slp_reduction (loop_info, phi, def_stmt))
3273 if (dump_enabled_p ())
3274 report_vect_op (MSG_NOTE, def_stmt,
3275 "reduction: detected reduction chain: ");
3277 return def_stmt;
3280 /* Dissolve any group that vect_is_slp_reduction may have half-built. */
3281 gimple *first = GROUP_FIRST_ELEMENT (vinfo_for_stmt (def_stmt));
3282 while (first)
3284 gimple *next = GROUP_NEXT_ELEMENT (vinfo_for_stmt (first));
3285 GROUP_FIRST_ELEMENT (vinfo_for_stmt (first)) = NULL;
3286 GROUP_NEXT_ELEMENT (vinfo_for_stmt (first)) = NULL;
3287 first = next;
3290 /* Look for the expression computing loop_arg from loop PHI result. */
3291 if (check_reduction_path (vect_location, loop, as_a <gphi *> (phi), loop_arg,
3292 code))
3293 return def_stmt;
3295 if (dump_enabled_p ())
3297 report_vect_op (MSG_MISSED_OPTIMIZATION, def_stmt,
3298 "reduction: unknown pattern: ");
3301 return NULL;
3304 /* Wrapper around vect_is_simple_reduction, which will modify code
3305 in-place if it enables detection of more reductions. Arguments
3306 are as for vect_is_simple_reduction. */
3308 gimple *
3309 vect_force_simple_reduction (loop_vec_info loop_info, gimple *phi,
3310 bool *double_reduc,
3311 bool need_wrapping_integral_overflow)
3313 enum vect_reduction_type v_reduc_type;
3314 gimple *def = vect_is_simple_reduction (loop_info, phi, double_reduc,
3315 need_wrapping_integral_overflow,
3316 &v_reduc_type);
3317 if (def)
3319 stmt_vec_info reduc_def_info = vinfo_for_stmt (phi);
3320 STMT_VINFO_REDUC_TYPE (reduc_def_info) = v_reduc_type;
3321 STMT_VINFO_REDUC_DEF (reduc_def_info) = def;
3322 reduc_def_info = vinfo_for_stmt (def);
3323 STMT_VINFO_REDUC_TYPE (reduc_def_info) = v_reduc_type;
3324 STMT_VINFO_REDUC_DEF (reduc_def_info) = phi;
3326 return def;
3329 /* Calculate cost of peeling the loop PEEL_ITERS_PROLOGUE times. */
3330 int
3331 vect_get_known_peeling_cost (loop_vec_info loop_vinfo, int peel_iters_prologue,
3332 int *peel_iters_epilogue,
3333 stmt_vector_for_cost *scalar_cost_vec,
3334 stmt_vector_for_cost *prologue_cost_vec,
3335 stmt_vector_for_cost *epilogue_cost_vec)
3337 int retval = 0;
3338 int assumed_vf = vect_vf_for_cost (loop_vinfo);
3340 if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
3342 *peel_iters_epilogue = assumed_vf / 2;
3343 if (dump_enabled_p ())
3344 dump_printf_loc (MSG_NOTE, vect_location,
3345 "cost model: epilogue peel iters set to vf/2 "
3346 "because loop iterations are unknown .\n");
3348 /* If peeled iterations are known but the number of scalar loop
3349 iterations is unknown, count a taken branch per peeled loop. */
3350 retval = record_stmt_cost (prologue_cost_vec, 1, cond_branch_taken,
3351 NULL, 0, vect_prologue);
3352 retval = record_stmt_cost (prologue_cost_vec, 1, cond_branch_taken,
3353 NULL, 0, vect_epilogue);
3355 else
3357 int niters = LOOP_VINFO_INT_NITERS (loop_vinfo);
3358 peel_iters_prologue = niters < peel_iters_prologue ?
3359 niters : peel_iters_prologue;
3360 *peel_iters_epilogue = (niters - peel_iters_prologue) % assumed_vf;
3361 /* If we need to peel for gaps but no epilogue peeling would otherwise be
3362 required, we have to peel VF iterations. */
3363 if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo) && !*peel_iters_epilogue)
3364 *peel_iters_epilogue = assumed_vf;
3367 stmt_info_for_cost *si;
3368 int j;
3369 if (peel_iters_prologue)
3370 FOR_EACH_VEC_ELT (*scalar_cost_vec, j, si)
3372 stmt_vec_info stmt_info
3373 = si->stmt ? vinfo_for_stmt (si->stmt) : NULL;
3374 retval += record_stmt_cost (prologue_cost_vec,
3375 si->count * peel_iters_prologue,
3376 si->kind, stmt_info, si->misalign,
3377 vect_prologue);
3379 if (*peel_iters_epilogue)
3380 FOR_EACH_VEC_ELT (*scalar_cost_vec, j, si)
3382 stmt_vec_info stmt_info
3383 = si->stmt ? vinfo_for_stmt (si->stmt) : NULL;
3384 retval += record_stmt_cost (epilogue_cost_vec,
3385 si->count * *peel_iters_epilogue,
3386 si->kind, stmt_info, si->misalign,
3387 vect_epilogue);
3390 return retval;
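 /* Worked example with made-up numbers: for a loop with 100 known scalar
    iterations, an assumed vectorization factor of 8 and
    PEEL_ITERS_PROLOGUE == 3, the code above sets *PEEL_ITERS_EPILOGUE to
    (100 - 3) % 8 == 1 and records 3 copies of the scalar iteration cost for
    the prologue and 1 copy for the epilogue.  */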
3393 /* Function vect_estimate_min_profitable_iters
3395 Return the number of iterations required for the vector version of the
3396 loop to be profitable relative to the cost of the scalar version of the
3397 loop.
3399 *RET_MIN_PROFITABLE_NITERS is a cost model profitability threshold
3400 of iterations for vectorization. -1 value means loop vectorization
3401 is not profitable. This returned value may be used for dynamic
3402 profitability check.
3404 *RET_MIN_PROFITABLE_ESTIMATE is a profitability threshold to be used
3405 for static check against estimated number of iterations. */
3407 static void
3408 vect_estimate_min_profitable_iters (loop_vec_info loop_vinfo,
3409 int *ret_min_profitable_niters,
3410 int *ret_min_profitable_estimate)
3412 int min_profitable_iters;
3413 int min_profitable_estimate;
3414 int peel_iters_prologue;
3415 int peel_iters_epilogue;
3416 unsigned vec_inside_cost = 0;
3417 int vec_outside_cost = 0;
3418 unsigned vec_prologue_cost = 0;
3419 unsigned vec_epilogue_cost = 0;
3420 int scalar_single_iter_cost = 0;
3421 int scalar_outside_cost = 0;
3422 int assumed_vf = vect_vf_for_cost (loop_vinfo);
3423 int npeel = LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo);
3424 void *target_cost_data = LOOP_VINFO_TARGET_COST_DATA (loop_vinfo);
3426 /* Cost model disabled. */
3427 if (unlimited_cost_model (LOOP_VINFO_LOOP (loop_vinfo)))
3429 dump_printf_loc (MSG_NOTE, vect_location, "cost model disabled.\n");
3430 *ret_min_profitable_niters = 0;
3431 *ret_min_profitable_estimate = 0;
3432 return;
3435 /* Requires loop versioning tests to handle misalignment. */
3436 if (LOOP_REQUIRES_VERSIONING_FOR_ALIGNMENT (loop_vinfo))
3438 /* FIXME: Make cost depend on complexity of individual check. */
3439 unsigned len = LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).length ();
3440 (void) add_stmt_cost (target_cost_data, len, vector_stmt, NULL, 0,
3441 vect_prologue);
3442 dump_printf (MSG_NOTE,
3443 "cost model: Adding cost of checks for loop "
3444 "versioning to treat misalignment.\n");
3447 /* Requires loop versioning with alias checks. */
3448 if (LOOP_REQUIRES_VERSIONING_FOR_ALIAS (loop_vinfo))
3450 /* FIXME: Make cost depend on complexity of individual check. */
3451 unsigned len = LOOP_VINFO_COMP_ALIAS_DDRS (loop_vinfo).length ();
3452 (void) add_stmt_cost (target_cost_data, len, vector_stmt, NULL, 0,
3453 vect_prologue);
3454 len = LOOP_VINFO_CHECK_UNEQUAL_ADDRS (loop_vinfo).length ();
3455 if (len)
3456 /* Count LEN - 1 ANDs and LEN comparisons. */
3457 (void) add_stmt_cost (target_cost_data, len * 2 - 1, scalar_stmt,
3458 NULL, 0, vect_prologue);
3459 len = LOOP_VINFO_LOWER_BOUNDS (loop_vinfo).length ();
3460 if (len)
3462 /* Count LEN - 1 ANDs and LEN comparisons. */
3463 unsigned int nstmts = len * 2 - 1;
3464 /* +1 for each bias that needs adding. */
3465 for (unsigned int i = 0; i < len; ++i)
3466 if (!LOOP_VINFO_LOWER_BOUNDS (loop_vinfo)[i].unsigned_p)
3467 nstmts += 1;
3468 (void) add_stmt_cost (target_cost_data, nstmts, scalar_stmt,
3469 NULL, 0, vect_prologue);
3471 dump_printf (MSG_NOTE,
3472 "cost model: Adding cost of checks for loop "
3473 "versioning aliasing.\n");
3476 /* Requires loop versioning with niter checks. */
3477 if (LOOP_REQUIRES_VERSIONING_FOR_NITERS (loop_vinfo))
3479 /* FIXME: Make cost depend on complexity of individual check. */
3480 (void) add_stmt_cost (target_cost_data, 1, vector_stmt, NULL, 0,
3481 vect_prologue);
3482 dump_printf (MSG_NOTE,
3483 "cost model: Adding cost of checks for loop "
3484 "versioning niters.\n");
3487 if (LOOP_REQUIRES_VERSIONING (loop_vinfo))
3488 (void) add_stmt_cost (target_cost_data, 1, cond_branch_taken, NULL, 0,
3489 vect_prologue);
3491 /* Count statements in scalar loop. Using this as scalar cost for a single
3492 iteration for now.
3494 TODO: Add outer loop support.
3496 TODO: Consider assigning different costs to different scalar
3497 statements. */
3499 scalar_single_iter_cost
3500 = LOOP_VINFO_SINGLE_SCALAR_ITERATION_COST (loop_vinfo);
3502 /* Add additional cost for the peeled instructions in prologue and epilogue
3503 loop. (For fully-masked loops there will be no peeling.)
3505 FORNOW: If we don't know the value of peel_iters for prologue or epilogue
3506 at compile time, we assume it's vf/2 (the worst would be vf-1).
3508 TODO: Build an expression that represents peel_iters for prologue and
3509 epilogue to be used in a run-time test. */
3511 if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
3513 peel_iters_prologue = 0;
3514 peel_iters_epilogue = 0;
3516 if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo))
3518 /* We need to peel exactly one iteration. */
3519 peel_iters_epilogue += 1;
3520 stmt_info_for_cost *si;
3521 int j;
3522 FOR_EACH_VEC_ELT (LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo),
3523 j, si)
3525 struct _stmt_vec_info *stmt_info
3526 = si->stmt ? vinfo_for_stmt (si->stmt) : NULL;
3527 (void) add_stmt_cost (target_cost_data, si->count,
3528 si->kind, stmt_info, si->misalign,
3529 vect_epilogue);
3533 else if (npeel < 0)
3535 peel_iters_prologue = assumed_vf / 2;
3536 dump_printf (MSG_NOTE, "cost model: "
3537 "prologue peel iters set to vf/2.\n");
3539 /* If peeling for alignment is unknown, the loop bound of the main loop
3540 becomes unknown. */
3541 peel_iters_epilogue = assumed_vf / 2;
3542 dump_printf (MSG_NOTE, "cost model: "
3543 "epilogue peel iters set to vf/2 because "
3544 "peeling for alignment is unknown.\n");
3546 /* If peeled iterations are unknown, count a taken branch and a not taken
3547 branch per peeled loop. Even if scalar loop iterations are known,
3548 vector iterations are not known since peeled prologue iterations are
3549 not known. Hence guards remain the same. */
3550 (void) add_stmt_cost (target_cost_data, 1, cond_branch_taken,
3551 NULL, 0, vect_prologue);
3552 (void) add_stmt_cost (target_cost_data, 1, cond_branch_not_taken,
3553 NULL, 0, vect_prologue);
3554 (void) add_stmt_cost (target_cost_data, 1, cond_branch_taken,
3555 NULL, 0, vect_epilogue);
3556 (void) add_stmt_cost (target_cost_data, 1, cond_branch_not_taken,
3557 NULL, 0, vect_epilogue);
3558 stmt_info_for_cost *si;
3559 int j;
3560 FOR_EACH_VEC_ELT (LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo), j, si)
3562 struct _stmt_vec_info *stmt_info
3563 = si->stmt ? vinfo_for_stmt (si->stmt) : NULL;
3564 (void) add_stmt_cost (target_cost_data,
3565 si->count * peel_iters_prologue,
3566 si->kind, stmt_info, si->misalign,
3567 vect_prologue);
3568 (void) add_stmt_cost (target_cost_data,
3569 si->count * peel_iters_epilogue,
3570 si->kind, stmt_info, si->misalign,
3571 vect_epilogue);
3574 else
3576 stmt_vector_for_cost prologue_cost_vec, epilogue_cost_vec;
3577 stmt_info_for_cost *si;
3578 int j;
3579 void *data = LOOP_VINFO_TARGET_COST_DATA (loop_vinfo);
3581 prologue_cost_vec.create (2);
3582 epilogue_cost_vec.create (2);
3583 peel_iters_prologue = npeel;
3585 (void) vect_get_known_peeling_cost (loop_vinfo, peel_iters_prologue,
3586 &peel_iters_epilogue,
3587 &LOOP_VINFO_SCALAR_ITERATION_COST
3588 (loop_vinfo),
3589 &prologue_cost_vec,
3590 &epilogue_cost_vec);
3592 FOR_EACH_VEC_ELT (prologue_cost_vec, j, si)
3594 struct _stmt_vec_info *stmt_info
3595 = si->stmt ? vinfo_for_stmt (si->stmt) : NULL;
3596 (void) add_stmt_cost (data, si->count, si->kind, stmt_info,
3597 si->misalign, vect_prologue);
3600 FOR_EACH_VEC_ELT (epilogue_cost_vec, j, si)
3602 struct _stmt_vec_info *stmt_info
3603 = si->stmt ? vinfo_for_stmt (si->stmt) : NULL;
3604 (void) add_stmt_cost (data, si->count, si->kind, stmt_info,
3605 si->misalign, vect_epilogue);
3608 prologue_cost_vec.release ();
3609 epilogue_cost_vec.release ();
3612 /* FORNOW: The scalar outside cost is incremented in one of the
3613 following ways:
3615 1. The vectorizer checks for alignment and aliasing and generates
3616 a condition that allows dynamic vectorization. A cost model
3617 check is ANDED with the versioning condition. Hence scalar code
3618 path now has the added cost of the versioning check.
3620 if (cost > th & versioning_check)
3621 jmp to vector code
3623 Hence the run-time scalar cost is incremented by the not-taken branch cost.
3625 2. The vectorizer then checks if a prologue is required. If the
3626 cost model check was not done before during versioning, it has to
3627 be done before the prologue check.
3629 if (cost <= th)
3630 prologue = scalar_iters
3631 if (prologue == 0)
3632 jmp to vector code
3633 else
3634 execute prologue
3635 if (prologue == num_iters)
3636 go to exit
3638 Hence the run-time scalar cost is incremented by a taken branch,
3639 plus a not-taken branch, plus a taken branch cost.
3641 3. The vectorizer then checks if an epilogue is required. If the
3642 cost model check was not done before during prologue check, it
3643 has to be done with the epilogue check.
3645 if (prologue == 0)
3646 jmp to vector code
3647 else
3648 execute prologue
3649 if (prologue == num_iters)
3650 go to exit
3651 vector code:
3652 if ((cost <= th) | (scalar_iters-prologue-epilogue == 0))
3653 jmp to epilogue
3655 Hence the run-time scalar cost should be incremented by 2 taken
3656 branches.
3658 TODO: The back end may reorder the BBs differently and reverse
3659 conditions/branch directions. Change the estimates below to
3660 something more reasonable. */
3662 /* If the number of iterations is known and we do not do versioning, we can
3663 decide whether to vectorize at compile time. Hence the scalar version
3664 does not carry cost model guard costs. */
3665 if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
3666 || LOOP_REQUIRES_VERSIONING (loop_vinfo))
3668 /* Cost model check occurs at versioning. */
3669 if (LOOP_REQUIRES_VERSIONING (loop_vinfo))
3670 scalar_outside_cost += vect_get_stmt_cost (cond_branch_not_taken);
3671 else
3673 /* Cost model check occurs at prologue generation. */
3674 if (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) < 0)
3675 scalar_outside_cost += 2 * vect_get_stmt_cost (cond_branch_taken)
3676 + vect_get_stmt_cost (cond_branch_not_taken);
3677 /* Cost model check occurs at epilogue generation. */
3678 else
3679 scalar_outside_cost += 2 * vect_get_stmt_cost (cond_branch_taken);
3683 /* Complete the target-specific cost calculations. */
3684 finish_cost (LOOP_VINFO_TARGET_COST_DATA (loop_vinfo), &vec_prologue_cost,
3685 &vec_inside_cost, &vec_epilogue_cost);
3687 vec_outside_cost = (int)(vec_prologue_cost + vec_epilogue_cost);
3689 if (dump_enabled_p ())
3691 dump_printf_loc (MSG_NOTE, vect_location, "Cost model analysis: \n");
3692 dump_printf (MSG_NOTE, " Vector inside of loop cost: %d\n",
3693 vec_inside_cost);
3694 dump_printf (MSG_NOTE, " Vector prologue cost: %d\n",
3695 vec_prologue_cost);
3696 dump_printf (MSG_NOTE, " Vector epilogue cost: %d\n",
3697 vec_epilogue_cost);
3698 dump_printf (MSG_NOTE, " Scalar iteration cost: %d\n",
3699 scalar_single_iter_cost);
3700 dump_printf (MSG_NOTE, " Scalar outside cost: %d\n",
3701 scalar_outside_cost);
3702 dump_printf (MSG_NOTE, " Vector outside cost: %d\n",
3703 vec_outside_cost);
3704 dump_printf (MSG_NOTE, " prologue iterations: %d\n",
3705 peel_iters_prologue);
3706 dump_printf (MSG_NOTE, " epilogue iterations: %d\n",
3707 peel_iters_epilogue);
3710 /* Calculate number of iterations required to make the vector version
3711 profitable, relative to the loop bodies only. The following condition
3712 must hold true:
3713 SIC * niters + SOC > VIC * ((niters-PL_ITERS-EP_ITERS)/VF) + VOC
3714 where
3715 SIC = scalar iteration cost, VIC = vector iteration cost,
3716 VOC = vector outside cost, VF = vectorization factor,
3717 PL_ITERS = prologue iterations, EP_ITERS= epilogue iterations
3718 SOC = scalar outside cost for run time cost model check. */
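 /* Illustrative example (hypothetical numbers, added for exposition and not
 tied to any target): with SIC = 4, VIC = 6, VF = 4, SOC = 8, VOC = 40 and
 no peeling, the condition reads 4 * niters + 8 > 6 * (niters / 4) + 40,
 i.e. niters > 12.8. The code below computes
 (40 - 8) * 4 / (4 * 4 - 6) = 12 and then bumps the result to 13, since
 at 12 iterations the scalar variant would still be at least as cheap. */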
3720 if ((scalar_single_iter_cost * assumed_vf) > (int) vec_inside_cost)
3722 min_profitable_iters = ((vec_outside_cost - scalar_outside_cost)
3723 * assumed_vf
3724 - vec_inside_cost * peel_iters_prologue
3725 - vec_inside_cost * peel_iters_epilogue);
3726 if (min_profitable_iters <= 0)
3727 min_profitable_iters = 0;
3728 else
3730 min_profitable_iters /= ((scalar_single_iter_cost * assumed_vf)
3731 - vec_inside_cost);
3733 if ((scalar_single_iter_cost * assumed_vf * min_profitable_iters)
3734 <= (((int) vec_inside_cost * min_profitable_iters)
3735 + (((int) vec_outside_cost - scalar_outside_cost)
3736 * assumed_vf)))
3737 min_profitable_iters++;
3740 /* vector version will never be profitable. */
3741 else
3743 if (LOOP_VINFO_LOOP (loop_vinfo)->force_vectorize)
3744 warning_at (vect_location, OPT_Wopenmp_simd, "vectorization "
3745 "did not happen for a simd loop");
3747 if (dump_enabled_p ())
3748 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3749 "cost model: the vector iteration cost = %d "
3750 "divided by the scalar iteration cost = %d "
3751 "is greater or equal to the vectorization factor = %d"
3752 ".\n",
3753 vec_inside_cost, scalar_single_iter_cost, assumed_vf);
3754 *ret_min_profitable_niters = -1;
3755 *ret_min_profitable_estimate = -1;
3756 return;
3759 dump_printf (MSG_NOTE,
3760 " Calculated minimum iters for profitability: %d\n",
3761 min_profitable_iters);
3763 if (!LOOP_VINFO_FULLY_MASKED_P (loop_vinfo)
3764 && min_profitable_iters < (assumed_vf + peel_iters_prologue))
3765 /* We want the vectorized loop to execute at least once. */
3766 min_profitable_iters = assumed_vf + peel_iters_prologue;
3768 if (dump_enabled_p ())
3769 dump_printf_loc (MSG_NOTE, vect_location,
3770 " Runtime profitability threshold = %d\n",
3771 min_profitable_iters);
3773 *ret_min_profitable_niters = min_profitable_iters;
3775 /* Calculate number of iterations required to make the vector version
3776 profitable, relative to the loop bodies only.
3778 The cost of the non-vectorized variant is SIC * niters and it must win over
3779 the vector variant on the expected loop trip count. The following condition must hold true:
3780 SIC * niters > VIC * ((niters-PL_ITERS-EP_ITERS)/VF) + VOC + SOC */
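 /* Continuing the illustrative numbers above (hypothetical: VOC = 40,
 SOC = 8, SIC = 4, VIC = 6, VF = 4, no peeling), this evaluates to
 (40 + 8) * 4 / (4 * 4 - 6) = 19, so the static estimate threshold ends up
 higher than the runtime threshold of 13. */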
3782 if (vec_outside_cost <= 0)
3783 min_profitable_estimate = 0;
3784 else
3786 min_profitable_estimate = ((vec_outside_cost + scalar_outside_cost)
3787 * assumed_vf
3788 - vec_inside_cost * peel_iters_prologue
3789 - vec_inside_cost * peel_iters_epilogue)
3790 / ((scalar_single_iter_cost * assumed_vf)
3791 - vec_inside_cost);
3793 min_profitable_estimate = MAX (min_profitable_estimate, min_profitable_iters);
3794 if (dump_enabled_p ())
3795 dump_printf_loc (MSG_NOTE, vect_location,
3796 " Static estimate profitability threshold = %d\n",
3797 min_profitable_estimate);
3799 *ret_min_profitable_estimate = min_profitable_estimate;
3802 /* Writes into SEL a mask for a vec_perm, equivalent to a vec_shr by OFFSET
3803 vector elements (not bits) for a vector with NELT elements. */
3804 static void
3805 calc_vec_perm_mask_for_shift (unsigned int offset, unsigned int nelt,
3806 vec_perm_builder *sel)
3808 /* The encoding is a single stepped pattern. Any wrap-around is handled
3809 by vec_perm_indices. */
3810 sel->new_vector (nelt, 1, 3);
3811 for (unsigned int i = 0; i < 3; i++)
3812 sel->quick_push (i + offset);
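 /* Illustration (added for exposition): with OFFSET = 2 and NELT = 8 the
 three encoded elements are {2, 3, 4}; vec_perm_indices extends the series
 to {2, 3, ..., 9}, i.e. a two-input permute that takes elements 2..7 of
 the first input followed by elements 0..1 of the second. With a zero
 second input this is exactly a whole-vector shift by two elements. */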
3815 /* Checks whether the target supports whole-vector shifts for vectors of mode
3816 MODE. This is the case if _either_ the platform handles vec_shr_optab, _or_
3817 it supports vec_perm_const with masks for all necessary shift amounts. */
3818 static bool
3819 have_whole_vector_shift (machine_mode mode)
3821 if (optab_handler (vec_shr_optab, mode) != CODE_FOR_nothing)
3822 return true;
3824 /* Variable-length vectors should be handled via the optab. */
3825 unsigned int nelt;
3826 if (!GET_MODE_NUNITS (mode).is_constant (&nelt))
3827 return false;
3829 vec_perm_builder sel;
3830 vec_perm_indices indices;
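 /* The loop below checks shift amounts NELT/2, NELT/4, ..., 1 (e.g. 4, 2
 and 1 for an eight-element vector); these are the same offsets that the
 shift-based reduction epilogue generates. */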
3831 for (unsigned int i = nelt / 2; i >= 1; i /= 2)
3833 calc_vec_perm_mask_for_shift (i, nelt, &sel);
3834 indices.new_vector (sel, 2, nelt);
3835 if (!can_vec_perm_const_p (mode, indices, false))
3836 return false;
3838 return true;
3841 /* TODO: Close dependency between vect_model_*_cost and vectorizable_*
3842 functions. Design better to avoid maintenance issues. */
3844 /* Function vect_model_reduction_cost.
3846 Models cost for a reduction operation, including the vector ops
3847 generated within the strip-mine loop, the initial definition before
3848 the loop, and the epilogue code that must be generated. */
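 /* For example, for a plain sum reduction with a supported REDUC_FN and
 NCOPIES == 1, the code below records one scalar_to_vec in the prologue,
 one vector_stmt in the loop body, and one vector_stmt plus one
 vec_to_scalar in the epilogue (illustrative summary of the cases
 handled below). */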
3850 static void
3851 vect_model_reduction_cost (stmt_vec_info stmt_info, internal_fn reduc_fn,
3852 int ncopies, stmt_vector_for_cost *cost_vec)
3854 int prologue_cost = 0, epilogue_cost = 0, inside_cost;
3855 enum tree_code code;
3856 optab optab;
3857 tree vectype;
3858 gimple *orig_stmt;
3859 machine_mode mode;
3860 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
3861 struct loop *loop = NULL;
3863 if (loop_vinfo)
3864 loop = LOOP_VINFO_LOOP (loop_vinfo);
3866 /* Condition reductions generate two reductions in the loop. */
3867 vect_reduction_type reduction_type
3868 = STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info);
3869 if (reduction_type == COND_REDUCTION)
3870 ncopies *= 2;
3872 vectype = STMT_VINFO_VECTYPE (stmt_info);
3873 mode = TYPE_MODE (vectype);
3874 orig_stmt = STMT_VINFO_RELATED_STMT (stmt_info);
3876 if (!orig_stmt)
3877 orig_stmt = STMT_VINFO_STMT (stmt_info);
3879 code = gimple_assign_rhs_code (orig_stmt);
3881 if (reduction_type == EXTRACT_LAST_REDUCTION
3882 || reduction_type == FOLD_LEFT_REDUCTION)
3884 /* No extra instructions needed in the prologue. */
3885 prologue_cost = 0;
3887 if (reduction_type == EXTRACT_LAST_REDUCTION || reduc_fn != IFN_LAST)
3888 /* Count one reduction-like operation per vector. */
3889 inside_cost = record_stmt_cost (cost_vec, ncopies, vec_to_scalar,
3890 stmt_info, 0, vect_body);
3891 else
3893 /* Use NELEMENTS extracts and NELEMENTS scalar ops. */
3894 unsigned int nelements = ncopies * vect_nunits_for_cost (vectype);
3895 inside_cost = record_stmt_cost (cost_vec, nelements,
3896 vec_to_scalar, stmt_info, 0,
3897 vect_body);
3898 inside_cost += record_stmt_cost (cost_vec, nelements,
3899 scalar_stmt, stmt_info, 0,
3900 vect_body);
3903 else
3905 /* Add in cost for initial definition.
3906 For cond reduction we have four vectors: initial index, step,
3907 initial result of the data reduction, initial value of the index
3908 reduction. */
3909 int prologue_stmts = reduction_type == COND_REDUCTION ? 4 : 1;
3910 prologue_cost += record_stmt_cost (cost_vec, prologue_stmts,
3911 scalar_to_vec, stmt_info, 0,
3912 vect_prologue);
3914 /* Cost of reduction op inside loop. */
3915 inside_cost = record_stmt_cost (cost_vec, ncopies, vector_stmt,
3916 stmt_info, 0, vect_body);
3919 /* Determine cost of epilogue code.
3921 We have a reduction operator that will reduce the vector in one statement.
3922 Also requires scalar extract. */
3924 if (!loop || !nested_in_vect_loop_p (loop, orig_stmt))
3926 if (reduc_fn != IFN_LAST)
3928 if (reduction_type == COND_REDUCTION)
3930 /* An EQ stmt and a COND_EXPR stmt. */
3931 epilogue_cost += record_stmt_cost (cost_vec, 2,
3932 vector_stmt, stmt_info, 0,
3933 vect_epilogue);
3934 /* Reduction of the max index and a reduction of the found
3935 values. */
3936 epilogue_cost += record_stmt_cost (cost_vec, 2,
3937 vec_to_scalar, stmt_info, 0,
3938 vect_epilogue);
3939 /* A broadcast of the max value. */
3940 epilogue_cost += record_stmt_cost (cost_vec, 1,
3941 scalar_to_vec, stmt_info, 0,
3942 vect_epilogue);
3944 else
3946 epilogue_cost += record_stmt_cost (cost_vec, 1, vector_stmt,
3947 stmt_info, 0, vect_epilogue);
3948 epilogue_cost += record_stmt_cost (cost_vec, 1,
3949 vec_to_scalar, stmt_info, 0,
3950 vect_epilogue);
3953 else if (reduction_type == COND_REDUCTION)
3955 unsigned estimated_nunits = vect_nunits_for_cost (vectype);
3956 /* Extraction of scalar elements. */
3957 epilogue_cost += record_stmt_cost (cost_vec,
3958 2 * estimated_nunits,
3959 vec_to_scalar, stmt_info, 0,
3960 vect_epilogue);
3961 /* Scalar max reductions via COND_EXPR / MAX_EXPR. */
3962 epilogue_cost += record_stmt_cost (cost_vec,
3963 2 * estimated_nunits - 3,
3964 scalar_stmt, stmt_info, 0,
3965 vect_epilogue);
3967 else if (reduction_type == EXTRACT_LAST_REDUCTION
3968 || reduction_type == FOLD_LEFT_REDUCTION)
3969 /* No extra instructions are needed in the epilogue. */
3971 else
3973 int vec_size_in_bits = tree_to_uhwi (TYPE_SIZE (vectype));
3974 tree bitsize
3975 = TYPE_SIZE (TREE_TYPE (gimple_assign_lhs (orig_stmt)));
3976 int element_bitsize = tree_to_uhwi (bitsize);
3977 int nelements = vec_size_in_bits / element_bitsize;
3979 if (code == COND_EXPR)
3980 code = MAX_EXPR;
3982 optab = optab_for_tree_code (code, vectype, optab_default);
3984 /* We have a whole vector shift available. */
3985 if (optab != unknown_optab
3986 && VECTOR_MODE_P (mode)
3987 && optab_handler (optab, mode) != CODE_FOR_nothing
3988 && have_whole_vector_shift (mode))
3990 /* Final reduction via vector shifts and the reduction operator.
3991 Also requires scalar extract. */
3992 epilogue_cost += record_stmt_cost (cost_vec,
3993 exact_log2 (nelements) * 2,
3994 vector_stmt, stmt_info, 0,
3995 vect_epilogue);
3996 epilogue_cost += record_stmt_cost (cost_vec, 1,
3997 vec_to_scalar, stmt_info, 0,
3998 vect_epilogue);
4000 else
4001 /* Use extracts and reduction op for final reduction. For N
4002 elements, we have N extracts and N-1 reduction ops. */
4003 epilogue_cost += record_stmt_cost (cost_vec,
4004 nelements + nelements - 1,
4005 vector_stmt, stmt_info, 0,
4006 vect_epilogue);
4010 if (dump_enabled_p ())
4011 dump_printf (MSG_NOTE,
4012 "vect_model_reduction_cost: inside_cost = %d, "
4013 "prologue_cost = %d, epilogue_cost = %d .\n", inside_cost,
4014 prologue_cost, epilogue_cost);
4018 /* Function vect_model_induction_cost.
4020 Models cost for induction operations. */
4022 static void
4023 vect_model_induction_cost (stmt_vec_info stmt_info, int ncopies,
4024 stmt_vector_for_cost *cost_vec)
4026 unsigned inside_cost, prologue_cost;
4028 if (PURE_SLP_STMT (stmt_info))
4029 return;
4031 /* loop cost for vec_loop. */
4032 inside_cost = record_stmt_cost (cost_vec, ncopies, vector_stmt,
4033 stmt_info, 0, vect_body);
4035 /* prologue cost for vec_init and vec_step. */
4036 prologue_cost = record_stmt_cost (cost_vec, 2, scalar_to_vec,
4037 stmt_info, 0, vect_prologue);
4039 if (dump_enabled_p ())
4040 dump_printf_loc (MSG_NOTE, vect_location,
4041 "vect_model_induction_cost: inside_cost = %d, "
4042 "prologue_cost = %d .\n", inside_cost, prologue_cost);
4047 /* Function get_initial_def_for_reduction
4049 Input:
4050 STMT - a stmt that performs a reduction operation in the loop.
4051 INIT_VAL - the initial value of the reduction variable
4053 Output:
4054 ADJUSTMENT_DEF - a tree that holds a value to be added to the final result
4055 of the reduction (used for adjusting the epilog - see below).
4056 Return a vector variable, initialized according to the operation that STMT
4057 performs. This vector will be used as the initial value of the
4058 vector of partial results.
4060 Option1 (adjust in epilog): Initialize the vector as follows:
4061 add/bit or/xor: [0,0,...,0,0]
4062 mult/bit and: [1,1,...,1,1]
4063 min/max/cond_expr: [init_val,init_val,..,init_val,init_val]
4064 and when necessary (e.g. add/mult case) let the caller know
4065 that it needs to adjust the result by init_val.
4067 Option2: Initialize the vector as follows:
4068 add/bit or/xor: [init_val,0,0,...,0]
4069 mult/bit and: [init_val,1,1,...,1]
4070 min/max/cond_expr: [init_val,init_val,...,init_val]
4071 and no adjustments are needed.
4073 For example, for the following code:
4075 s = init_val;
4076 for (i=0;i<n;i++)
4077 s = s + a[i];
4079 STMT is 's = s + a[i]', and the reduction variable is 's'.
4080 For a vector of 4 units, we want to return either [0,0,0,init_val],
4081 or [0,0,0,0] and let the caller know that it needs to adjust
4082 the result at the end by 'init_val'.
4084 FORNOW, we are using the 'adjust in epilog' scheme (Option1) when
4085 ADJUSTMENT_DEF is not NULL, because this way the initialization vector is
4086 simpler (same element in all entries), and Option2 otherwise.
4088 A cost model should help decide between these two schemes. */
4090 tree
4091 get_initial_def_for_reduction (gimple *stmt, tree init_val,
4092 tree *adjustment_def)
4094 stmt_vec_info stmt_vinfo = vinfo_for_stmt (stmt);
4095 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_vinfo);
4096 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
4097 tree scalar_type = TREE_TYPE (init_val);
4098 tree vectype = get_vectype_for_scalar_type (scalar_type);
4099 enum tree_code code = gimple_assign_rhs_code (stmt);
4100 tree def_for_init;
4101 tree init_def;
4102 bool nested_in_vect_loop = false;
4103 REAL_VALUE_TYPE real_init_val = dconst0;
4104 int int_init_val = 0;
4105 gimple *def_stmt = NULL;
4106 gimple_seq stmts = NULL;
4108 gcc_assert (vectype);
4110 gcc_assert (POINTER_TYPE_P (scalar_type) || INTEGRAL_TYPE_P (scalar_type)
4111 || SCALAR_FLOAT_TYPE_P (scalar_type));
4113 if (nested_in_vect_loop_p (loop, stmt))
4114 nested_in_vect_loop = true;
4115 else
4116 gcc_assert (loop == (gimple_bb (stmt))->loop_father);
4118 /* In case of double reduction we only create a vector variable to be put
4119 in the reduction phi node. The actual statement creation is done in
4120 vect_create_epilog_for_reduction. */
4121 if (adjustment_def && nested_in_vect_loop
4122 && TREE_CODE (init_val) == SSA_NAME
4123 && (def_stmt = SSA_NAME_DEF_STMT (init_val))
4124 && gimple_code (def_stmt) == GIMPLE_PHI
4125 && flow_bb_inside_loop_p (loop, gimple_bb (def_stmt))
4126 && vinfo_for_stmt (def_stmt)
4127 && STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def_stmt))
4128 == vect_double_reduction_def)
4130 *adjustment_def = NULL;
4131 return vect_create_destination_var (init_val, vectype);
4134 vect_reduction_type reduction_type
4135 = STMT_VINFO_VEC_REDUCTION_TYPE (stmt_vinfo);
4137 /* In case of a nested reduction do not use an adjustment def as
4138 that case is not supported by the epilogue generation correctly
4139 if ncopies is not one. */
4140 if (adjustment_def && nested_in_vect_loop)
4142 *adjustment_def = NULL;
4143 return vect_get_vec_def_for_operand (init_val, stmt);
4146 switch (code)
4148 case WIDEN_SUM_EXPR:
4149 case DOT_PROD_EXPR:
4150 case SAD_EXPR:
4151 case PLUS_EXPR:
4152 case MINUS_EXPR:
4153 case BIT_IOR_EXPR:
4154 case BIT_XOR_EXPR:
4155 case MULT_EXPR:
4156 case BIT_AND_EXPR:
4158 /* ADJUSTMENT_DEF is NULL when called from
4159 vect_create_epilog_for_reduction to vectorize double reduction. */
4160 if (adjustment_def)
4161 *adjustment_def = init_val;
4163 if (code == MULT_EXPR)
4165 real_init_val = dconst1;
4166 int_init_val = 1;
4169 if (code == BIT_AND_EXPR)
4170 int_init_val = -1;
4172 if (SCALAR_FLOAT_TYPE_P (scalar_type))
4173 def_for_init = build_real (scalar_type, real_init_val);
4174 else
4175 def_for_init = build_int_cst (scalar_type, int_init_val);
4177 if (adjustment_def)
4178 /* Option1: the first element is '0' or '1' as well. */
4179 init_def = gimple_build_vector_from_val (&stmts, vectype,
4180 def_for_init);
4181 else if (!TYPE_VECTOR_SUBPARTS (vectype).is_constant ())
4183 /* Option2 (variable length): the first element is INIT_VAL. */
4184 init_def = build_vector_from_val (vectype, def_for_init);
4185 gcall *call = gimple_build_call_internal (IFN_VEC_SHL_INSERT,
4186 2, init_def, init_val);
4187 init_def = make_ssa_name (vectype);
4188 gimple_call_set_lhs (call, init_def);
4189 gimple_seq_add_stmt (&stmts, call);
4191 else
4193 /* Option2: the first element is INIT_VAL. */
4194 tree_vector_builder elts (vectype, 1, 2);
4195 elts.quick_push (init_val);
4196 elts.quick_push (def_for_init);
4197 init_def = gimple_build_vector (&stmts, &elts);
4200 break;
4202 case MIN_EXPR:
4203 case MAX_EXPR:
4204 case COND_EXPR:
4206 if (adjustment_def)
4208 *adjustment_def = NULL_TREE;
4209 if (reduction_type != COND_REDUCTION
4210 && reduction_type != EXTRACT_LAST_REDUCTION)
4212 init_def = vect_get_vec_def_for_operand (init_val, stmt);
4213 break;
4216 init_val = gimple_convert (&stmts, TREE_TYPE (vectype), init_val);
4217 init_def = gimple_build_vector_from_val (&stmts, vectype, init_val);
4219 break;
4221 default:
4222 gcc_unreachable ();
4225 if (stmts)
4226 gsi_insert_seq_on_edge_immediate (loop_preheader_edge (loop), stmts);
4227 return init_def;
4230 /* Get at the initial defs for the reduction PHIs in SLP_NODE.
4231 NUMBER_OF_VECTORS is the number of vector defs to create.
4232 If NEUTRAL_OP is nonnull, introducing extra elements of that
4233 value will not change the result. */
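 /* For example (illustration only), the neutral value is 0 for a
 PLUS_EXPR or BIT_IOR_EXPR reduction and 1 for a MULT_EXPR reduction,
 while MIN_EXPR/MAX_EXPR reductions have no universal neutral value and
 fall back to the initial scalar value. */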
4235 static void
4236 get_initial_defs_for_reduction (slp_tree slp_node,
4237 vec<tree> *vec_oprnds,
4238 unsigned int number_of_vectors,
4239 bool reduc_chain, tree neutral_op)
4241 vec<gimple *> stmts = SLP_TREE_SCALAR_STMTS (slp_node);
4242 gimple *stmt = stmts[0];
4243 stmt_vec_info stmt_vinfo = vinfo_for_stmt (stmt);
4244 unsigned HOST_WIDE_INT nunits;
4245 unsigned j, number_of_places_left_in_vector;
4246 tree vector_type;
4247 tree vop;
4248 int group_size = stmts.length ();
4249 unsigned int vec_num, i;
4250 unsigned number_of_copies = 1;
4251 vec<tree> voprnds;
4252 voprnds.create (number_of_vectors);
4253 struct loop *loop;
4254 auto_vec<tree, 16> permute_results;
4256 vector_type = STMT_VINFO_VECTYPE (stmt_vinfo);
4258 gcc_assert (STMT_VINFO_DEF_TYPE (stmt_vinfo) == vect_reduction_def);
4260 loop = (gimple_bb (stmt))->loop_father;
4261 gcc_assert (loop);
4262 edge pe = loop_preheader_edge (loop);
4264 gcc_assert (!reduc_chain || neutral_op);
4266 /* NUMBER_OF_COPIES is the number of times we need to use the same values in
4267 created vectors. It is greater than 1 if unrolling is performed.
4269 For example, we have two scalar operands, s1 and s2 (e.g., group of
4270 strided accesses of size two), while NUNITS is four (i.e., four scalars
4271 of this type can be packed in a vector). The output vector will contain
4272 two copies of each scalar operand: {s1, s2, s1, s2}. (NUMBER_OF_COPIES
4273 will be 2).
4275 If GROUP_SIZE > NUNITS, the scalars will be split into several vectors
4276 containing the operands.
4278 For example, NUNITS is four as before, and the group size is 8
4279 (s1, s2, ..., s8). We will create two vectors {s1, s2, s3, s4} and
4280 {s5, s6, s7, s8}. */
4282 if (!TYPE_VECTOR_SUBPARTS (vector_type).is_constant (&nunits))
4283 nunits = group_size;
4285 number_of_copies = nunits * number_of_vectors / group_size;
4287 number_of_places_left_in_vector = nunits;
4288 bool constant_p = true;
4289 tree_vector_builder elts (vector_type, nunits, 1);
4290 elts.quick_grow (nunits);
4291 for (j = 0; j < number_of_copies; j++)
4293 for (i = group_size - 1; stmts.iterate (i, &stmt); i--)
4295 tree op;
4296 /* Get the def before the loop. In reduction chain we have only
4297 one initial value. */
4298 if ((j != (number_of_copies - 1)
4299 || (reduc_chain && i != 0))
4300 && neutral_op)
4301 op = neutral_op;
4302 else
4303 op = PHI_ARG_DEF_FROM_EDGE (stmt, pe);
4305 /* Create 'vect_ = {op0,op1,...,opn}'. */
4306 number_of_places_left_in_vector--;
4307 elts[number_of_places_left_in_vector] = op;
4308 if (!CONSTANT_CLASS_P (op))
4309 constant_p = false;
4311 if (number_of_places_left_in_vector == 0)
4313 gimple_seq ctor_seq = NULL;
4314 tree init;
4315 if (constant_p && !neutral_op
4316 ? multiple_p (TYPE_VECTOR_SUBPARTS (vector_type), nunits)
4317 : known_eq (TYPE_VECTOR_SUBPARTS (vector_type), nunits))
4318 /* Build the vector directly from ELTS. */
4319 init = gimple_build_vector (&ctor_seq, &elts);
4320 else if (neutral_op)
4322 /* Build a vector of the neutral value and shift the
4323 other elements into place. */
4324 init = gimple_build_vector_from_val (&ctor_seq, vector_type,
4325 neutral_op);
4326 int k = nunits;
4327 while (k > 0 && elts[k - 1] == neutral_op)
4328 k -= 1;
4329 while (k > 0)
4331 k -= 1;
4332 gcall *call = gimple_build_call_internal
4333 (IFN_VEC_SHL_INSERT, 2, init, elts[k]);
4334 init = make_ssa_name (vector_type);
4335 gimple_call_set_lhs (call, init);
4336 gimple_seq_add_stmt (&ctor_seq, call);
4339 else
4341 /* First time round, duplicate ELTS to fill the
4342 required number of vectors, then cherry pick the
4343 appropriate result for each iteration. */
4344 if (vec_oprnds->is_empty ())
4345 duplicate_and_interleave (&ctor_seq, vector_type, elts,
4346 number_of_vectors,
4347 permute_results);
4348 init = permute_results[number_of_vectors - j - 1];
4350 if (ctor_seq != NULL)
4351 gsi_insert_seq_on_edge_immediate (pe, ctor_seq);
4352 voprnds.quick_push (init);
4354 number_of_places_left_in_vector = nunits;
4355 elts.new_vector (vector_type, nunits, 1);
4356 elts.quick_grow (nunits);
4357 constant_p = true;
4362 /* Since the vectors are created in the reverse order, we should invert
4363 them. */
4364 vec_num = voprnds.length ();
4365 for (j = vec_num; j != 0; j--)
4367 vop = voprnds[j - 1];
4368 vec_oprnds->quick_push (vop);
4371 voprnds.release ();
4373 /* In case that VF is greater than the unrolling factor needed for the SLP
4374 group of stmts, NUMBER_OF_VECTORS to be created is greater than
4375 NUMBER_OF_SCALARS/NUNITS or NUNITS/NUMBER_OF_SCALARS, and hence we have
4376 to replicate the vectors. */
4377 tree neutral_vec = NULL;
4378 while (number_of_vectors > vec_oprnds->length ())
4380 if (neutral_op)
4382 if (!neutral_vec)
4384 gimple_seq ctor_seq = NULL;
4385 neutral_vec = gimple_build_vector_from_val
4386 (&ctor_seq, vector_type, neutral_op);
4387 if (ctor_seq != NULL)
4388 gsi_insert_seq_on_edge_immediate (pe, ctor_seq);
4390 vec_oprnds->quick_push (neutral_vec);
4392 else
4394 for (i = 0; vec_oprnds->iterate (i, &vop) && i < vec_num; i++)
4395 vec_oprnds->quick_push (vop);
4401 /* Function vect_create_epilog_for_reduction
4403 Create code at the loop-epilog to finalize the result of a reduction
4404 computation.
4406 VECT_DEFS is list of vector of partial results, i.e., the lhs's of vector
4407 reduction statements.
4408 STMT is the scalar reduction stmt that is being vectorized.
4409 NCOPIES is > 1 in case the vectorization factor (VF) is bigger than the
4410 number of elements that we can fit in a vectype (nunits). In this case
4411 we have to generate more than one vector stmt - i.e - we need to "unroll"
4412 the vector stmt by a factor VF/nunits. For more details see documentation
4413 in vectorizable_operation.
4414 REDUC_FN is the internal function for the epilog reduction.
4415 REDUCTION_PHIS is a list of the phi-nodes that carry the reduction
4416 computation.
4417 REDUC_INDEX is the index of the operand in the right hand side of the
4418 statement that is defined by REDUCTION_PHI.
4419 DOUBLE_REDUC is TRUE if double reduction phi nodes should be handled.
4420 SLP_NODE is an SLP node containing a group of reduction statements. The
4421 first one in this group is STMT.
4422 INDUC_VAL is, for INTEGER_INDUC_COND_REDUCTION, the value to use for the case
4423 when the COND_EXPR is never true in the loop. For MAX_EXPR, it needs to
4424 be smaller than any value of the IV in the loop, for MIN_EXPR larger than
4425 any value of the IV in the loop.
4426 INDUC_CODE is the code for epilog reduction if INTEGER_INDUC_COND_REDUCTION.
4427 NEUTRAL_OP is the value given by neutral_op_for_slp_reduction; it is
4428 null if this is not an SLP reduction.
4430 This function:
4431 1. Creates the reduction def-use cycles: sets the arguments for
4432 REDUCTION_PHIS:
4433 The loop-entry argument is the vectorized initial-value of the reduction.
4434 The loop-latch argument is taken from VECT_DEFS - the vector of partial
4435 sums.
4436 2. "Reduces" each vector of partial results VECT_DEFS into a single result,
4437 by calling the function specified by REDUC_FN if available, or by
4438 other means (whole-vector shifts or a scalar loop).
4439 The function also creates a new phi node at the loop exit to preserve
4440 loop-closed form, as illustrated below.
4442 The flow at the entry to this function:
4444 loop:
4445 vec_def = phi <null, null> # REDUCTION_PHI
4446 VECT_DEF = vector_stmt # vectorized form of STMT
4447 s_loop = scalar_stmt # (scalar) STMT
4448 loop_exit:
4449 s_out0 = phi <s_loop> # (scalar) EXIT_PHI
4450 use <s_out0>
4451 use <s_out0>
4453 The above is transformed by this function into:
4455 loop:
4456 vec_def = phi <vec_init, VECT_DEF> # REDUCTION_PHI
4457 VECT_DEF = vector_stmt # vectorized form of STMT
4458 s_loop = scalar_stmt # (scalar) STMT
4459 loop_exit:
4460 s_out0 = phi <s_loop> # (scalar) EXIT_PHI
4461 v_out1 = phi <VECT_DEF> # NEW_EXIT_PHI
4462 v_out2 = reduce <v_out1>
4463 s_out3 = extract_field <v_out2, 0>
4464 s_out4 = adjust_result <s_out3>
4465 use <s_out4>
4466 use <s_out4>
4469 static void
4470 vect_create_epilog_for_reduction (vec<tree> vect_defs, gimple *stmt,
4471 gimple *reduc_def_stmt,
4472 int ncopies, internal_fn reduc_fn,
4473 vec<gimple *> reduction_phis,
4474 bool double_reduc,
4475 slp_tree slp_node,
4476 slp_instance slp_node_instance,
4477 tree induc_val, enum tree_code induc_code,
4478 tree neutral_op)
4480 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
4481 stmt_vec_info prev_phi_info;
4482 tree vectype;
4483 machine_mode mode;
4484 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
4485 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo), *outer_loop = NULL;
4486 basic_block exit_bb;
4487 tree scalar_dest;
4488 tree scalar_type;
4489 gimple *new_phi = NULL, *phi;
4490 gimple_stmt_iterator exit_gsi;
4491 tree vec_dest;
4492 tree new_temp = NULL_TREE, new_dest, new_name, new_scalar_dest;
4493 gimple *epilog_stmt = NULL;
4494 enum tree_code code = gimple_assign_rhs_code (stmt);
4495 gimple *exit_phi;
4496 tree bitsize;
4497 tree adjustment_def = NULL;
4498 tree vec_initial_def = NULL;
4499 tree expr, def, initial_def = NULL;
4500 tree orig_name, scalar_result;
4501 imm_use_iterator imm_iter, phi_imm_iter;
4502 use_operand_p use_p, phi_use_p;
4503 gimple *use_stmt, *orig_stmt, *reduction_phi = NULL;
4504 bool nested_in_vect_loop = false;
4505 auto_vec<gimple *> new_phis;
4506 auto_vec<gimple *> inner_phis;
4507 enum vect_def_type dt = vect_unknown_def_type;
4508 int j, i;
4509 auto_vec<tree> scalar_results;
4510 unsigned int group_size = 1, k, ratio;
4511 auto_vec<tree> vec_initial_defs;
4512 auto_vec<gimple *> phis;
4513 bool slp_reduc = false;
4514 bool direct_slp_reduc;
4515 tree new_phi_result;
4516 gimple *inner_phi = NULL;
4517 tree induction_index = NULL_TREE;
4519 if (slp_node)
4520 group_size = SLP_TREE_SCALAR_STMTS (slp_node).length ();
4522 if (nested_in_vect_loop_p (loop, stmt))
4524 outer_loop = loop;
4525 loop = loop->inner;
4526 nested_in_vect_loop = true;
4527 gcc_assert (!slp_node);
4530 vectype = STMT_VINFO_VECTYPE (stmt_info);
4531 gcc_assert (vectype);
4532 mode = TYPE_MODE (vectype);
4534 /* 1. Create the reduction def-use cycle:
4535 Set the arguments of REDUCTION_PHIS, i.e., transform
4537 loop:
4538 vec_def = phi <null, null> # REDUCTION_PHI
4539 VECT_DEF = vector_stmt # vectorized form of STMT
4542 into:
4544 loop:
4545 vec_def = phi <vec_init, VECT_DEF> # REDUCTION_PHI
4546 VECT_DEF = vector_stmt # vectorized form of STMT
4549 (in case of SLP, do it for all the phis). */
4551 /* Get the loop-entry arguments. */
4552 enum vect_def_type initial_def_dt = vect_unknown_def_type;
4553 if (slp_node)
4555 unsigned vec_num = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node);
4556 vec_initial_defs.reserve (vec_num);
4557 get_initial_defs_for_reduction (slp_node_instance->reduc_phis,
4558 &vec_initial_defs, vec_num,
4559 GROUP_FIRST_ELEMENT (stmt_info),
4560 neutral_op);
4562 else
4564 /* Get at the scalar def before the loop, that defines the initial value
4565 of the reduction variable. */
4566 gimple *def_stmt;
4567 initial_def = PHI_ARG_DEF_FROM_EDGE (reduc_def_stmt,
4568 loop_preheader_edge (loop));
4569 /* Optimize: if initial_def is for REDUC_MAX smaller than the base
4570 and we can't use zero for induc_val, use initial_def. Similarly
4571 for REDUC_MIN and initial_def larger than the base. */
4572 if (TREE_CODE (initial_def) == INTEGER_CST
4573 && (STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info)
4574 == INTEGER_INDUC_COND_REDUCTION)
4575 && !integer_zerop (induc_val)
4576 && ((induc_code == MAX_EXPR
4577 && tree_int_cst_lt (initial_def, induc_val))
4578 || (induc_code == MIN_EXPR
4579 && tree_int_cst_lt (induc_val, initial_def))))
4580 induc_val = initial_def;
4581 vect_is_simple_use (initial_def, loop_vinfo, &def_stmt, &initial_def_dt);
4582 vec_initial_def = get_initial_def_for_reduction (stmt, initial_def,
4583 &adjustment_def);
4584 vec_initial_defs.create (1);
4585 vec_initial_defs.quick_push (vec_initial_def);
4588 /* Set phi nodes arguments. */
4589 FOR_EACH_VEC_ELT (reduction_phis, i, phi)
4591 tree vec_init_def = vec_initial_defs[i];
4592 tree def = vect_defs[i];
4593 for (j = 0; j < ncopies; j++)
4595 if (j != 0)
4597 phi = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (phi));
4598 if (nested_in_vect_loop)
4599 vec_init_def
4600 = vect_get_vec_def_for_stmt_copy (initial_def_dt,
4601 vec_init_def);
4604 /* Set the loop-entry arg of the reduction-phi. */
4606 if (STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info)
4607 == INTEGER_INDUC_COND_REDUCTION)
4609 /* Initialise the reduction phi to zero. This prevents non-zero initial
4610 values from interfering with the reduction op. */
4611 gcc_assert (ncopies == 1);
4612 gcc_assert (i == 0);
4614 tree vec_init_def_type = TREE_TYPE (vec_init_def);
4615 tree induc_val_vec
4616 = build_vector_from_val (vec_init_def_type, induc_val);
4618 add_phi_arg (as_a <gphi *> (phi), induc_val_vec,
4619 loop_preheader_edge (loop), UNKNOWN_LOCATION);
4621 else
4622 add_phi_arg (as_a <gphi *> (phi), vec_init_def,
4623 loop_preheader_edge (loop), UNKNOWN_LOCATION);
4625 /* Set the loop-latch arg for the reduction-phi. */
4626 if (j > 0)
4627 def = vect_get_vec_def_for_stmt_copy (vect_unknown_def_type, def);
4629 add_phi_arg (as_a <gphi *> (phi), def, loop_latch_edge (loop),
4630 UNKNOWN_LOCATION);
4632 if (dump_enabled_p ())
4634 dump_printf_loc (MSG_NOTE, vect_location,
4635 "transform reduction: created def-use cycle: ");
4636 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, phi, 0);
4637 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, SSA_NAME_DEF_STMT (def), 0);
4642 /* For cond reductions we want to create a new vector (INDEX_COND_EXPR)
4643 which is updated with the current index of the loop for every match of
4644 the original loop's cond_expr (VEC_STMT). This results in a vector
4645 containing the last time the condition passed for that vector lane.
4646 The first match will be a 1 to allow 0 to be used for non-matching
4647 indexes. If there are no matches at all then the vector will be all
4648 zeroes. */
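 /* For example (hypothetical values): with four lanes the induction
 variable takes the values {1,2,3,4}, {5,6,7,8}, {9,10,11,12}, ... on
 successive vector iterations; a lane whose condition last matched in the
 third vector iteration ends up holding one of {9, 10, 11, 12}, while a
 lane that never matched stays 0. */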
4649 if (STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info) == COND_REDUCTION)
4651 tree indx_before_incr, indx_after_incr;
4652 poly_uint64 nunits_out = TYPE_VECTOR_SUBPARTS (vectype);
4654 gimple *vec_stmt = STMT_VINFO_VEC_STMT (stmt_info);
4655 gcc_assert (gimple_assign_rhs_code (vec_stmt) == VEC_COND_EXPR);
4657 int scalar_precision
4658 = GET_MODE_PRECISION (SCALAR_TYPE_MODE (TREE_TYPE (vectype)));
4659 tree cr_index_scalar_type = make_unsigned_type (scalar_precision);
4660 tree cr_index_vector_type = build_vector_type
4661 (cr_index_scalar_type, TYPE_VECTOR_SUBPARTS (vectype));
4663 /* First we create a simple vector induction variable which starts
4664 with the values {1,2,3,...} (SERIES_VECT) and increments by the
4665 vector size (STEP). */
4667 /* Create a {1,2,3,...} vector. */
4668 tree series_vect = build_index_vector (cr_index_vector_type, 1, 1);
4670 /* Create a vector of the step value. */
4671 tree step = build_int_cst (cr_index_scalar_type, nunits_out);
4672 tree vec_step = build_vector_from_val (cr_index_vector_type, step);
4674 /* Create an induction variable. */
4675 gimple_stmt_iterator incr_gsi;
4676 bool insert_after;
4677 standard_iv_increment_position (loop, &incr_gsi, &insert_after);
4678 create_iv (series_vect, vec_step, NULL_TREE, loop, &incr_gsi,
4679 insert_after, &indx_before_incr, &indx_after_incr);
4681 /* Next create a new phi node vector (NEW_PHI_TREE) which starts
4682 filled with zeros (VEC_ZERO). */
4684 /* Create a vector of 0s. */
4685 tree zero = build_zero_cst (cr_index_scalar_type);
4686 tree vec_zero = build_vector_from_val (cr_index_vector_type, zero);
4688 /* Create a vector phi node. */
4689 tree new_phi_tree = make_ssa_name (cr_index_vector_type);
4690 new_phi = create_phi_node (new_phi_tree, loop->header);
4691 set_vinfo_for_stmt (new_phi,
4692 new_stmt_vec_info (new_phi, loop_vinfo));
4693 add_phi_arg (as_a <gphi *> (new_phi), vec_zero,
4694 loop_preheader_edge (loop), UNKNOWN_LOCATION);
4696 /* Now take the condition from the loop's original cond_expr
4697 (VEC_STMT) and produce a new cond_expr (INDEX_COND_EXPR) which for
4698 every match uses values from the induction variable
4699 (INDEX_BEFORE_INCR) otherwise uses values from the phi node
4700 (NEW_PHI_TREE).
4701 Finally, we update the phi (NEW_PHI_TREE) to take the value of
4702 the new cond_expr (INDEX_COND_EXPR). */
4704 /* Duplicate the condition from vec_stmt. */
4705 tree ccompare = unshare_expr (gimple_assign_rhs1 (vec_stmt));
4707 /* Create a conditional, where the condition is taken from vec_stmt
4708 (CCOMPARE), then is the induction index (INDEX_BEFORE_INCR) and
4709 else is the phi (NEW_PHI_TREE). */
4710 tree index_cond_expr = build3 (VEC_COND_EXPR, cr_index_vector_type,
4711 ccompare, indx_before_incr,
4712 new_phi_tree);
4713 induction_index = make_ssa_name (cr_index_vector_type);
4714 gimple *index_condition = gimple_build_assign (induction_index,
4715 index_cond_expr);
4716 gsi_insert_before (&incr_gsi, index_condition, GSI_SAME_STMT);
4717 stmt_vec_info index_vec_info = new_stmt_vec_info (index_condition,
4718 loop_vinfo);
4719 STMT_VINFO_VECTYPE (index_vec_info) = cr_index_vector_type;
4720 set_vinfo_for_stmt (index_condition, index_vec_info);
4722 /* Update the phi with the vec cond. */
4723 add_phi_arg (as_a <gphi *> (new_phi), induction_index,
4724 loop_latch_edge (loop), UNKNOWN_LOCATION);
4727 /* 2. Create epilog code.
4728 The reduction epilog code operates across the elements of the vector
4729 of partial results computed by the vectorized loop.
4730 The reduction epilog code consists of:
4732 step 1: compute the scalar result in a vector (v_out2)
4733 step 2: extract the scalar result (s_out3) from the vector (v_out2)
4734 step 3: adjust the scalar result (s_out3) if needed.
4736 Step 1 can be accomplished using one of the following three schemes:
4737 (scheme 1) using reduc_fn, if available.
4738 (scheme 2) using whole-vector shifts, if available.
4739 (scheme 3) using a scalar loop. In this case steps 1+2 above are
4740 combined.
4742 The overall epilog code looks like this:
4744 s_out0 = phi <s_loop> # original EXIT_PHI
4745 v_out1 = phi <VECT_DEF> # NEW_EXIT_PHI
4746 v_out2 = reduce <v_out1> # step 1
4747 s_out3 = extract_field <v_out2, 0> # step 2
4748 s_out4 = adjust_result <s_out3> # step 3
4750 (step 3 is optional, and steps 1 and 2 may be combined).
4751 Lastly, the uses of s_out0 are replaced by s_out4. */
4754 /* 2.1 Create new loop-exit-phis to preserve loop-closed form:
4755 v_out1 = phi <VECT_DEF>
4756 Store them in NEW_PHIS. */
4758 exit_bb = single_exit (loop)->dest;
4759 prev_phi_info = NULL;
4760 new_phis.create (vect_defs.length ());
4761 FOR_EACH_VEC_ELT (vect_defs, i, def)
4763 for (j = 0; j < ncopies; j++)
4765 tree new_def = copy_ssa_name (def);
4766 phi = create_phi_node (new_def, exit_bb);
4767 set_vinfo_for_stmt (phi, new_stmt_vec_info (phi, loop_vinfo));
4768 if (j == 0)
4769 new_phis.quick_push (phi);
4770 else
4772 def = vect_get_vec_def_for_stmt_copy (dt, def);
4773 STMT_VINFO_RELATED_STMT (prev_phi_info) = phi;
4776 SET_PHI_ARG_DEF (phi, single_exit (loop)->dest_idx, def);
4777 prev_phi_info = vinfo_for_stmt (phi);
4781 /* The epilogue is created for the outer-loop, i.e., for the loop being
4782 vectorized. Create exit phis for the outer loop. */
4783 if (double_reduc)
4785 loop = outer_loop;
4786 exit_bb = single_exit (loop)->dest;
4787 inner_phis.create (vect_defs.length ());
4788 FOR_EACH_VEC_ELT (new_phis, i, phi)
4790 tree new_result = copy_ssa_name (PHI_RESULT (phi));
4791 gphi *outer_phi = create_phi_node (new_result, exit_bb);
4792 SET_PHI_ARG_DEF (outer_phi, single_exit (loop)->dest_idx,
4793 PHI_RESULT (phi));
4794 set_vinfo_for_stmt (outer_phi, new_stmt_vec_info (outer_phi,
4795 loop_vinfo));
4796 inner_phis.quick_push (phi);
4797 new_phis[i] = outer_phi;
4798 prev_phi_info = vinfo_for_stmt (outer_phi);
4799 while (STMT_VINFO_RELATED_STMT (vinfo_for_stmt (phi)))
4801 phi = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (phi));
4802 new_result = copy_ssa_name (PHI_RESULT (phi));
4803 outer_phi = create_phi_node (new_result, exit_bb);
4804 SET_PHI_ARG_DEF (outer_phi, single_exit (loop)->dest_idx,
4805 PHI_RESULT (phi));
4806 set_vinfo_for_stmt (outer_phi, new_stmt_vec_info (outer_phi,
4807 loop_vinfo));
4808 STMT_VINFO_RELATED_STMT (prev_phi_info) = outer_phi;
4809 prev_phi_info = vinfo_for_stmt (outer_phi);
4814 exit_gsi = gsi_after_labels (exit_bb);
4816 /* 2.2 Get the relevant tree-code to use in the epilog for schemes 2,3
4817 (i.e. when reduc_fn is not available) and in the final adjustment
4818 code (if needed). Also get the original scalar reduction variable as
4819 defined in the loop. In case STMT is a "pattern-stmt" (i.e. - it
4820 represents a reduction pattern), the tree-code and scalar-def are
4821 taken from the original stmt that the pattern-stmt (STMT) replaces.
4822 Otherwise (it is a regular reduction) - the tree-code and scalar-def
4823 are taken from STMT. */
4825 orig_stmt = STMT_VINFO_RELATED_STMT (stmt_info);
4826 if (!orig_stmt)
4828 /* Regular reduction */
4829 orig_stmt = stmt;
4831 else
4833 /* Reduction pattern */
4834 stmt_vec_info stmt_vinfo = vinfo_for_stmt (orig_stmt);
4835 gcc_assert (STMT_VINFO_IN_PATTERN_P (stmt_vinfo));
4836 gcc_assert (STMT_VINFO_RELATED_STMT (stmt_vinfo) == stmt);
4839 code = gimple_assign_rhs_code (orig_stmt);
4840 /* For MINUS_EXPR the initial vector is [init_val,0,...,0], therefore,
4841 partial results are added and not subtracted. */
4842 if (code == MINUS_EXPR)
4843 code = PLUS_EXPR;
4845 scalar_dest = gimple_assign_lhs (orig_stmt);
4846 scalar_type = TREE_TYPE (scalar_dest);
4847 scalar_results.create (group_size);
4848 new_scalar_dest = vect_create_destination_var (scalar_dest, NULL);
4849 bitsize = TYPE_SIZE (scalar_type);
4851 /* In case this is a reduction in an inner-loop while vectorizing an outer
4852 loop - we don't need to extract a single scalar result at the end of the
4853 inner-loop (unless it is double reduction, i.e., the use of reduction is
4854 outside the outer-loop). The final vector of partial results will be used
4855 in the vectorized outer-loop, or reduced to a scalar result at the end of
4856 the outer-loop. */
4857 if (nested_in_vect_loop && !double_reduc)
4858 goto vect_finalize_reduction;
4860 /* SLP reduction without reduction chain, e.g.,
4861 # a1 = phi <a2, a0>
4862 # b1 = phi <b2, b0>
4863 a2 = operation (a1)
4864 b2 = operation (b1) */
4865 slp_reduc = (slp_node && !GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)));
4867 /* True if we should implement SLP_REDUC using native reduction operations
4868 instead of scalar operations. */
4869 direct_slp_reduc = (reduc_fn != IFN_LAST
4870 && slp_reduc
4871 && !TYPE_VECTOR_SUBPARTS (vectype).is_constant ());
4873 /* In case of reduction chain, e.g.,
4874 # a1 = phi <a3, a0>
4875 a2 = operation (a1)
4876 a3 = operation (a2),
4878 we may end up with more than one vector result. Here we reduce them to
4879 one vector. */
4880 if (GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)) || direct_slp_reduc)
4882 tree first_vect = PHI_RESULT (new_phis[0]);
4883 gassign *new_vec_stmt = NULL;
4884 vec_dest = vect_create_destination_var (scalar_dest, vectype);
4885 for (k = 1; k < new_phis.length (); k++)
4887 gimple *next_phi = new_phis[k];
4888 tree second_vect = PHI_RESULT (next_phi);
4889 tree tem = make_ssa_name (vec_dest, new_vec_stmt);
4890 new_vec_stmt = gimple_build_assign (tem, code,
4891 first_vect, second_vect);
4892 gsi_insert_before (&exit_gsi, new_vec_stmt, GSI_SAME_STMT);
4893 first_vect = tem;
4896 new_phi_result = first_vect;
4897 if (new_vec_stmt)
4899 new_phis.truncate (0);
4900 new_phis.safe_push (new_vec_stmt);
4903 /* Likewise if we couldn't use a single def-use cycle. */
4904 else if (ncopies > 1)
4906 gcc_assert (new_phis.length () == 1);
4907 tree first_vect = PHI_RESULT (new_phis[0]);
4908 gassign *new_vec_stmt = NULL;
4909 vec_dest = vect_create_destination_var (scalar_dest, vectype);
4910 gimple *next_phi = new_phis[0];
4911 for (int k = 1; k < ncopies; ++k)
4913 next_phi = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (next_phi));
4914 tree second_vect = PHI_RESULT (next_phi);
4915 tree tem = make_ssa_name (vec_dest, new_vec_stmt);
4916 new_vec_stmt = gimple_build_assign (tem, code,
4917 first_vect, second_vect);
4918 gsi_insert_before (&exit_gsi, new_vec_stmt, GSI_SAME_STMT);
4919 first_vect = tem;
4921 new_phi_result = first_vect;
4922 new_phis.truncate (0);
4923 new_phis.safe_push (new_vec_stmt);
4925 else
4926 new_phi_result = PHI_RESULT (new_phis[0]);
4928 if (STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info) == COND_REDUCTION
4929 && reduc_fn != IFN_LAST)
4931 /* For condition reductions, we have a vector (NEW_PHI_RESULT) containing
4932 various data values where the condition matched and another vector
4933 (INDUCTION_INDEX) containing all the indexes of those matches. We
4934 need to extract the last matching index (which will be the index with
4935 highest value) and use this to index into the data vector.
4936 For the case where there were no matches, the data vector will contain
4937 all default values and the index vector will be all zeros. */
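 /* Illustration (hypothetical values): with NEW_PHI_RESULT = {0, 42, 0, 7}
 and INDUCTION_INDEX = {0, 5, 0, 11}, the max index is 11, the comparison
 selects only the last lane, giving {0, 0, 0, 7}, and the final
 IFN_REDUC_MAX extracts 7. */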
4939 /* Get various versions of the type of the vector of indexes. */
4940 tree index_vec_type = TREE_TYPE (induction_index);
4941 gcc_checking_assert (TYPE_UNSIGNED (index_vec_type));
4942 tree index_scalar_type = TREE_TYPE (index_vec_type);
4943 tree index_vec_cmp_type = build_same_sized_truth_vector_type
4944 (index_vec_type);
4946 /* Get an unsigned integer version of the type of the data vector. */
4947 int scalar_precision
4948 = GET_MODE_PRECISION (SCALAR_TYPE_MODE (scalar_type));
4949 tree scalar_type_unsigned = make_unsigned_type (scalar_precision);
4950 tree vectype_unsigned = build_vector_type
4951 (scalar_type_unsigned, TYPE_VECTOR_SUBPARTS (vectype));
4953 /* First we need to create a vector (ZERO_VEC) of zeros and another
4954 vector (MAX_INDEX_VEC) filled with the last matching index, which we
4955 can create using a MAX reduction and then expanding.
4956 In the case where the loop never made any matches, the max index will
4957 be zero. */
4959 /* Vector of {0, 0, 0,...}. */
4960 tree zero_vec = make_ssa_name (vectype);
4961 tree zero_vec_rhs = build_zero_cst (vectype);
4962 gimple *zero_vec_stmt = gimple_build_assign (zero_vec, zero_vec_rhs);
4963 gsi_insert_before (&exit_gsi, zero_vec_stmt, GSI_SAME_STMT);
4965 /* Find maximum value from the vector of found indexes. */
4966 tree max_index = make_ssa_name (index_scalar_type);
4967 gcall *max_index_stmt = gimple_build_call_internal (IFN_REDUC_MAX,
4968 1, induction_index);
4969 gimple_call_set_lhs (max_index_stmt, max_index);
4970 gsi_insert_before (&exit_gsi, max_index_stmt, GSI_SAME_STMT);
4972 /* Vector of {max_index, max_index, max_index,...}. */
4973 tree max_index_vec = make_ssa_name (index_vec_type);
4974 tree max_index_vec_rhs = build_vector_from_val (index_vec_type,
4975 max_index);
4976 gimple *max_index_vec_stmt = gimple_build_assign (max_index_vec,
4977 max_index_vec_rhs);
4978 gsi_insert_before (&exit_gsi, max_index_vec_stmt, GSI_SAME_STMT);
4980 /* Next we compare the new vector (MAX_INDEX_VEC) full of max indexes
4981 with the vector (INDUCTION_INDEX) of found indexes, choosing values
4982 from the data vector (NEW_PHI_RESULT) for matches, 0 (ZERO_VEC)
4983 otherwise. Only one value should match, resulting in a vector
4984 (VEC_COND) with one data value and the rest zeros.
4985 In the case where the loop never made any matches, every index will
4986 match, resulting in a vector with all data values (which will all be
4987 the default value). */
4989 /* Compare the max index vector to the vector of found indexes to find
4990 the position of the max value. */
4991 tree vec_compare = make_ssa_name (index_vec_cmp_type);
4992 gimple *vec_compare_stmt = gimple_build_assign (vec_compare, EQ_EXPR,
4993 induction_index,
4994 max_index_vec);
4995 gsi_insert_before (&exit_gsi, vec_compare_stmt, GSI_SAME_STMT);
4997 /* Use the compare to choose either values from the data vector or
4998 zero. */
4999 tree vec_cond = make_ssa_name (vectype);
5000 gimple *vec_cond_stmt = gimple_build_assign (vec_cond, VEC_COND_EXPR,
5001 vec_compare, new_phi_result,
5002 zero_vec);
5003 gsi_insert_before (&exit_gsi, vec_cond_stmt, GSI_SAME_STMT);
5005 /* Finally we need to extract the data value from the vector (VEC_COND)
5006 into a scalar (MATCHED_DATA_REDUC). Logically we want to do an OR
5007 reduction, but because this doesn't exist, we can use a MAX reduction
5008 instead. The data value might be signed or a float so we need to cast
5009 it first.
5010 In the case where the loop never made any matches, the data values are
5011 all identical, and so will reduce down correctly. */
5013 /* Make the matched data values unsigned. */
5014 tree vec_cond_cast = make_ssa_name (vectype_unsigned);
5015 tree vec_cond_cast_rhs = build1 (VIEW_CONVERT_EXPR, vectype_unsigned,
5016 vec_cond);
5017 gimple *vec_cond_cast_stmt = gimple_build_assign (vec_cond_cast,
5018 VIEW_CONVERT_EXPR,
5019 vec_cond_cast_rhs);
5020 gsi_insert_before (&exit_gsi, vec_cond_cast_stmt, GSI_SAME_STMT);
5022 /* Reduce down to a scalar value. */
5023 tree data_reduc = make_ssa_name (scalar_type_unsigned);
5024 gcall *data_reduc_stmt = gimple_build_call_internal (IFN_REDUC_MAX,
5025 1, vec_cond_cast);
5026 gimple_call_set_lhs (data_reduc_stmt, data_reduc);
5027 gsi_insert_before (&exit_gsi, data_reduc_stmt, GSI_SAME_STMT);
5029 /* Convert the reduced value back to the result type and set as the
5030 result. */
5031 gimple_seq stmts = NULL;
5032 new_temp = gimple_build (&stmts, VIEW_CONVERT_EXPR, scalar_type,
5033 data_reduc);
5034 gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
5035 scalar_results.safe_push (new_temp);
5037 else if (STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info) == COND_REDUCTION
5038 && reduc_fn == IFN_LAST)
5040 /* Condition reduction without supported IFN_REDUC_MAX. Generate
5041 idx = 0;
5042 idx_val = induction_index[0];
5043 val = data_reduc[0];
5044 for (idx = 0, val = init, i = 0; i < nelts; ++i)
5045 if (induction_index[i] > idx_val)
5046 val = data_reduc[i], idx_val = induction_index[i];
5047 return val; */
5049 tree data_eltype = TREE_TYPE (TREE_TYPE (new_phi_result));
5050 tree idx_eltype = TREE_TYPE (TREE_TYPE (induction_index));
5051 unsigned HOST_WIDE_INT el_size = tree_to_uhwi (TYPE_SIZE (idx_eltype));
5052 poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (TREE_TYPE (induction_index));
5053 /* Enforced by vectorizable_reduction, which ensures we have target
5054 support before allowing a conditional reduction on variable-length
5055 vectors. */
5056 unsigned HOST_WIDE_INT v_size = el_size * nunits.to_constant ();
5057 tree idx_val = NULL_TREE, val = NULL_TREE;
5058 for (unsigned HOST_WIDE_INT off = 0; off < v_size; off += el_size)
5060 tree old_idx_val = idx_val;
5061 tree old_val = val;
5062 idx_val = make_ssa_name (idx_eltype);
5063 epilog_stmt = gimple_build_assign (idx_val, BIT_FIELD_REF,
5064 build3 (BIT_FIELD_REF, idx_eltype,
5065 induction_index,
5066 bitsize_int (el_size),
5067 bitsize_int (off)));
5068 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5069 val = make_ssa_name (data_eltype);
5070 epilog_stmt = gimple_build_assign (val, BIT_FIELD_REF,
5071 build3 (BIT_FIELD_REF,
5072 data_eltype,
5073 new_phi_result,
5074 bitsize_int (el_size),
5075 bitsize_int (off)));
5076 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5077 if (off != 0)
5079 tree new_idx_val = idx_val;
5080 tree new_val = val;
5081 if (off != v_size - el_size)
5083 new_idx_val = make_ssa_name (idx_eltype);
5084 epilog_stmt = gimple_build_assign (new_idx_val,
5085 MAX_EXPR, idx_val,
5086 old_idx_val);
5087 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5089 new_val = make_ssa_name (data_eltype);
5090 epilog_stmt = gimple_build_assign (new_val,
5091 COND_EXPR,
5092 build2 (GT_EXPR,
5093 boolean_type_node,
5094 idx_val,
5095 old_idx_val),
5096 val, old_val);
5097 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5098 idx_val = new_idx_val;
5099 val = new_val;
5102 /* Convert the reduced value back to the result type and set as the
5103 result. */
5104 gimple_seq stmts = NULL;
5105 val = gimple_convert (&stmts, scalar_type, val);
5106 gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
5107 scalar_results.safe_push (val);
5110 /* 2.3 Create the reduction code, using one of the three schemes described
5111 above. In SLP we simply need to extract all the elements from the
5112 vector (without reducing them), so we use scalar shifts. */
5113 else if (reduc_fn != IFN_LAST && !slp_reduc)
5115 tree tmp;
5116 tree vec_elem_type;
5118 /* Case 1: Create:
5119 v_out2 = reduc_expr <v_out1> */
5121 if (dump_enabled_p ())
5122 dump_printf_loc (MSG_NOTE, vect_location,
5123 "Reduce using direct vector reduction.\n");
5125 vec_elem_type = TREE_TYPE (TREE_TYPE (new_phi_result));
5126 if (!useless_type_conversion_p (scalar_type, vec_elem_type))
5128 tree tmp_dest
5129 = vect_create_destination_var (scalar_dest, vec_elem_type);
5130 epilog_stmt = gimple_build_call_internal (reduc_fn, 1,
5131 new_phi_result);
5132 gimple_set_lhs (epilog_stmt, tmp_dest);
5133 new_temp = make_ssa_name (tmp_dest, epilog_stmt);
5134 gimple_set_lhs (epilog_stmt, new_temp);
5135 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5137 epilog_stmt = gimple_build_assign (new_scalar_dest, NOP_EXPR,
5138 new_temp);
5140 else
5142 epilog_stmt = gimple_build_call_internal (reduc_fn, 1,
5143 new_phi_result);
5144 gimple_set_lhs (epilog_stmt, new_scalar_dest);
5147 new_temp = make_ssa_name (new_scalar_dest, epilog_stmt);
5148 gimple_set_lhs (epilog_stmt, new_temp);
5149 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5151 if ((STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info)
5152 == INTEGER_INDUC_COND_REDUCTION)
5153 && !operand_equal_p (initial_def, induc_val, 0))
5155 /* Earlier we set the initial value to be a vector of induc_val
5156 values. Check the result and if it is induc_val then replace
5157 with the original initial value, unless induc_val is
5158 the same as initial_def already. */
5159 tree zcompare = build2 (EQ_EXPR, boolean_type_node, new_temp,
5160 induc_val);
5162 tmp = make_ssa_name (new_scalar_dest);
5163 epilog_stmt = gimple_build_assign (tmp, COND_EXPR, zcompare,
5164 initial_def, new_temp);
5165 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5166 new_temp = tmp;
5169 scalar_results.safe_push (new_temp);
5171 else if (direct_slp_reduc)
5173 /* Here we create one vector for each of the GROUP_SIZE results,
5174 with the elements for other SLP statements replaced with the
5175 neutral value. We can then do a normal reduction on each vector. */
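 /* For example (illustration only): with GROUP_SIZE == 2 and a vector of
 partial results {a0, b0, a1, b1}, the first reduction operates on
 {a0, neutral, a1, neutral} and the second on {neutral, b0, neutral, b1}. */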
5177 /* Enforced by vectorizable_reduction. */
5178 gcc_assert (new_phis.length () == 1);
5179 gcc_assert (pow2p_hwi (group_size));
5181 slp_tree orig_phis_slp_node = slp_node_instance->reduc_phis;
5182 vec<gimple *> orig_phis = SLP_TREE_SCALAR_STMTS (orig_phis_slp_node);
5183 gimple_seq seq = NULL;
5185 /* Build a vector {0, 1, 2, ...}, with the same number of elements
5186 and the same element size as VECTYPE. */
5187 tree index = build_index_vector (vectype, 0, 1);
5188 tree index_type = TREE_TYPE (index);
5189 tree index_elt_type = TREE_TYPE (index_type);
5190 tree mask_type = build_same_sized_truth_vector_type (index_type);
5192 /* Create a vector that, for each element, identifies which of
5193 the GROUP_SIZE results should use it. */
5194 tree index_mask = build_int_cst (index_elt_type, group_size - 1);
5195 index = gimple_build (&seq, BIT_AND_EXPR, index_type, index,
5196 build_vector_from_val (index_type, index_mask));
5198 /* Get a neutral vector value. This is simply a splat of the neutral
5199 scalar value if we have one, otherwise the initial scalar value
5200 is itself a neutral value. */
5201 tree vector_identity = NULL_TREE;
5202 if (neutral_op)
5203 vector_identity = gimple_build_vector_from_val (&seq, vectype,
5204 neutral_op);
5205 for (unsigned int i = 0; i < group_size; ++i)
5207 /* If there's no universal neutral value, we can use the
5208 initial scalar value from the original PHI. This is used
5209 for MIN and MAX reduction, for example. */
5210 if (!neutral_op)
5212 tree scalar_value
5213 = PHI_ARG_DEF_FROM_EDGE (orig_phis[i],
5214 loop_preheader_edge (loop));
5215 vector_identity = gimple_build_vector_from_val (&seq, vectype,
5216 scalar_value);
5219 /* Calculate the equivalent of:
5221 sel[j] = (index[j] == i);
5223 which selects the elements of NEW_PHI_RESULT that should
5224 be included in the result. */
5225 tree compare_val = build_int_cst (index_elt_type, i);
5226 compare_val = build_vector_from_val (index_type, compare_val);
5227 tree sel = gimple_build (&seq, EQ_EXPR, mask_type,
5228 index, compare_val);
5230 /* Calculate the equivalent of:
5232 vec = sel ? new_phi_result : vector_identity;
5234 VEC is now suitable for a full vector reduction. */
5235 tree vec = gimple_build (&seq, VEC_COND_EXPR, vectype,
5236 sel, new_phi_result, vector_identity);
5238 /* Do the reduction and convert it to the appropriate type. */
5239 gcall *call = gimple_build_call_internal (reduc_fn, 1, vec);
5240 tree scalar = make_ssa_name (TREE_TYPE (vectype));
5241 gimple_call_set_lhs (call, scalar);
5242 gimple_seq_add_stmt (&seq, call);
5243 scalar = gimple_convert (&seq, scalar_type, scalar);
5244 scalar_results.safe_push (scalar);
5246 gsi_insert_seq_before (&exit_gsi, seq, GSI_SAME_STMT);
5248 else
5250 bool reduce_with_shift;
5251 tree vec_temp;
5253 /* COND reductions all do the final reduction with MAX_EXPR
5254 or MIN_EXPR. */
5255 if (code == COND_EXPR)
5257 if (STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info)
5258 == INTEGER_INDUC_COND_REDUCTION)
5259 code = induc_code;
5260 else
5261 code = MAX_EXPR;
5264 /* See if the target wants to do the final (shift) reduction
5265 in a vector mode of smaller size and first reduce upper/lower
5266 halves against each other. */
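 /* For example, a target might prefer to reduce a 256-bit partial result
 by first combining its two 128-bit halves and then doing the remaining
 shift reduction in the 128-bit mode. */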
5267 enum machine_mode mode1 = mode;
5268 tree vectype1 = vectype;
5269 unsigned sz = tree_to_uhwi (TYPE_SIZE_UNIT (vectype));
5270 unsigned sz1 = sz;
5271 if (!slp_reduc
5272 && (mode1 = targetm.vectorize.split_reduction (mode)) != mode)
5273 sz1 = GET_MODE_SIZE (mode1).to_constant ();
5275 vectype1 = get_vectype_for_scalar_type_and_size (scalar_type, sz1);
5276 reduce_with_shift = have_whole_vector_shift (mode1);
5277 if (!VECTOR_MODE_P (mode1))
5278 reduce_with_shift = false;
5279 else
5281 optab optab = optab_for_tree_code (code, vectype1, optab_default);
5282 if (optab_handler (optab, mode1) == CODE_FOR_nothing)
5283 reduce_with_shift = false;
5286 /* First reduce the vector to the size we want to do the shift
5287 reduction on, by combining upper and lower halves. */
5288 new_temp = new_phi_result;
5289 while (sz > sz1)
5291 gcc_assert (!slp_reduc);
5292 sz /= 2;
5293 vectype1 = get_vectype_for_scalar_type_and_size (scalar_type, sz);
5295 /* The target has to make sure we support lowpart/highpart
5296 extraction, either via direct vector extract or through
5297 an integer mode punning. */
5298 tree dst1, dst2;
5299 if (convert_optab_handler (vec_extract_optab,
5300 TYPE_MODE (TREE_TYPE (new_temp)),
5301 TYPE_MODE (vectype1))
5302 != CODE_FOR_nothing)
5304 /* Extract sub-vectors directly once vec_extract becomes
5305 a conversion optab. */
5306 dst1 = make_ssa_name (vectype1);
5307 epilog_stmt
5308 = gimple_build_assign (dst1, BIT_FIELD_REF,
5309 build3 (BIT_FIELD_REF, vectype1,
5310 new_temp, TYPE_SIZE (vectype1),
5311 bitsize_int (0)));
5312 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5313 dst2 = make_ssa_name (vectype1);
5314 epilog_stmt
5315 = gimple_build_assign (dst2, BIT_FIELD_REF,
5316 build3 (BIT_FIELD_REF, vectype1,
5317 new_temp, TYPE_SIZE (vectype1),
5318 bitsize_int (sz * BITS_PER_UNIT)));
5319 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5321 else
5323 /* Extract via punning to an appropriately sized integer mode
5324 vector. */
5325 tree eltype = build_nonstandard_integer_type (sz * BITS_PER_UNIT,
5326 1);
5327 tree etype = build_vector_type (eltype, 2);
5328 gcc_assert (convert_optab_handler (vec_extract_optab,
5329 TYPE_MODE (etype),
5330 TYPE_MODE (eltype))
5331 != CODE_FOR_nothing);
5332 tree tem = make_ssa_name (etype);
5333 epilog_stmt = gimple_build_assign (tem, VIEW_CONVERT_EXPR,
5334 build1 (VIEW_CONVERT_EXPR,
5335 etype, new_temp));
5336 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5337 new_temp = tem;
5338 tem = make_ssa_name (eltype);
5339 epilog_stmt
5340 = gimple_build_assign (tem, BIT_FIELD_REF,
5341 build3 (BIT_FIELD_REF, eltype,
5342 new_temp, TYPE_SIZE (eltype),
5343 bitsize_int (0)));
5344 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5345 dst1 = make_ssa_name (vectype1);
5346 epilog_stmt = gimple_build_assign (dst1, VIEW_CONVERT_EXPR,
5347 build1 (VIEW_CONVERT_EXPR,
5348 vectype1, tem));
5349 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5350 tem = make_ssa_name (eltype);
5351 epilog_stmt
5352 = gimple_build_assign (tem, BIT_FIELD_REF,
5353 build3 (BIT_FIELD_REF, eltype,
5354 new_temp, TYPE_SIZE (eltype),
5355 bitsize_int (sz * BITS_PER_UNIT)));
5356 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5357 dst2 = make_ssa_name (vectype1);
5358 epilog_stmt = gimple_build_assign (dst2, VIEW_CONVERT_EXPR,
5359 build1 (VIEW_CONVERT_EXPR,
5360 vectype1, tem));
5361 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5364 new_temp = make_ssa_name (vectype1);
5365 epilog_stmt = gimple_build_assign (new_temp, code, dst1, dst2);
5366 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5369 if (reduce_with_shift && !slp_reduc)
5371 int element_bitsize = tree_to_uhwi (bitsize);
5372 /* Enforced by vectorizable_reduction, which disallows SLP reductions
5373 for variable-length vectors and also requires direct target support
5374 for loop reductions. */
5375 int vec_size_in_bits = tree_to_uhwi (TYPE_SIZE (vectype1));
5376 int nelements = vec_size_in_bits / element_bitsize;
5377 vec_perm_builder sel;
5378 vec_perm_indices indices;
5380 int elt_offset;
5382 tree zero_vec = build_zero_cst (vectype1);
5383 /* Case 2: Create:
5384 for (offset = nelements/2; offset >= 1; offset/=2)
5386 Create: va' = vec_shift <va, offset>
5387 Create: va = vop <va, va'>
5388 } */
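/* Concretely, for a four-element vector {a, b, c, d} and PLUS:
   shifting by two elements gives {c, d, 0, 0}, and adding yields
   {a+c, b+d, ...}; shifting that by one element and adding again
   leaves a+b+c+d in element zero, which step 2.4 below extracts.  */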
5390 tree rhs;
5392 if (dump_enabled_p ())
5393 dump_printf_loc (MSG_NOTE, vect_location,
5394 "Reduce using vector shifts\n");
5396 mode1 = TYPE_MODE (vectype1);
5397 vec_dest = vect_create_destination_var (scalar_dest, vectype1);
5398 for (elt_offset = nelements / 2;
5399 elt_offset >= 1;
5400 elt_offset /= 2)
5402 calc_vec_perm_mask_for_shift (elt_offset, nelements, &sel);
5403 indices.new_vector (sel, 2, nelements);
5404 tree mask = vect_gen_perm_mask_any (vectype1, indices);
5405 epilog_stmt = gimple_build_assign (vec_dest, VEC_PERM_EXPR,
5406 new_temp, zero_vec, mask);
5407 new_name = make_ssa_name (vec_dest, epilog_stmt);
5408 gimple_assign_set_lhs (epilog_stmt, new_name);
5409 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5411 epilog_stmt = gimple_build_assign (vec_dest, code, new_name,
5412 new_temp);
5413 new_temp = make_ssa_name (vec_dest, epilog_stmt);
5414 gimple_assign_set_lhs (epilog_stmt, new_temp);
5415 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5418 /* 2.4 Extract the final scalar result. Create:
5419 s_out3 = extract_field <v_out2, bitpos> */
5421 if (dump_enabled_p ())
5422 dump_printf_loc (MSG_NOTE, vect_location,
5423 "extract scalar result\n");
5425 rhs = build3 (BIT_FIELD_REF, scalar_type, new_temp,
5426 bitsize, bitsize_zero_node);
5427 epilog_stmt = gimple_build_assign (new_scalar_dest, rhs);
5428 new_temp = make_ssa_name (new_scalar_dest, epilog_stmt);
5429 gimple_assign_set_lhs (epilog_stmt, new_temp);
5430 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5431 scalar_results.safe_push (new_temp);
5433 else
5435 /* Case 3: Create:
5436 s = extract_field <v_out2, 0>
5437 for (offset = element_size;
5438 offset < vector_size;
5439 offset += element_size;)
5441 Create: s' = extract_field <v_out2, offset>
5442 Create: s = op <s, s'> // For non SLP cases
5443 } */
5445 if (dump_enabled_p ())
5446 dump_printf_loc (MSG_NOTE, vect_location,
5447 "Reduce using scalar code.\n");
5449 int vec_size_in_bits = tree_to_uhwi (TYPE_SIZE (vectype1));
5450 int element_bitsize = tree_to_uhwi (bitsize);
5451 FOR_EACH_VEC_ELT (new_phis, i, new_phi)
5453 int bit_offset;
5454 if (gimple_code (new_phi) == GIMPLE_PHI)
5455 vec_temp = PHI_RESULT (new_phi);
5456 else
5457 vec_temp = gimple_assign_lhs (new_phi);
5458 tree rhs = build3 (BIT_FIELD_REF, scalar_type, vec_temp, bitsize,
5459 bitsize_zero_node);
5460 epilog_stmt = gimple_build_assign (new_scalar_dest, rhs);
5461 new_temp = make_ssa_name (new_scalar_dest, epilog_stmt);
5462 gimple_assign_set_lhs (epilog_stmt, new_temp);
5463 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5465 /* In SLP we don't need to apply the reduction operation, so we just
5466 collect s' values in SCALAR_RESULTS. */
5467 if (slp_reduc)
5468 scalar_results.safe_push (new_temp);
5470 for (bit_offset = element_bitsize;
5471 bit_offset < vec_size_in_bits;
5472 bit_offset += element_bitsize)
5474 tree bitpos = bitsize_int (bit_offset);
5475 tree rhs = build3 (BIT_FIELD_REF, scalar_type, vec_temp,
5476 bitsize, bitpos);
5478 epilog_stmt = gimple_build_assign (new_scalar_dest, rhs);
5479 new_name = make_ssa_name (new_scalar_dest, epilog_stmt);
5480 gimple_assign_set_lhs (epilog_stmt, new_name);
5481 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5483 if (slp_reduc)
5485 /* In SLP we don't need to apply the reduction operation, so
5486 we just collect s' values in SCALAR_RESULTS. */
5487 new_temp = new_name;
5488 scalar_results.safe_push (new_name);
5490 else
5492 epilog_stmt = gimple_build_assign (new_scalar_dest, code,
5493 new_name, new_temp);
5494 new_temp = make_ssa_name (new_scalar_dest, epilog_stmt);
5495 gimple_assign_set_lhs (epilog_stmt, new_temp);
5496 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5501 /* The only case where we need to reduce scalar results in SLP is
5502 unrolling. If the size of SCALAR_RESULTS is greater than
5503 GROUP_SIZE, we reduce them by combining elements modulo
5504 GROUP_SIZE. */
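/* E.g. with GROUP_SIZE 2 and four scalar results r0..r3 (SLP unrolled
   twice), the loop below folds r2 into r0 and r3 into r1.  */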
5505 if (slp_reduc)
5507 tree res, first_res, new_res;
5508 gimple *new_stmt;
5510 /* Reduce multiple scalar results in case of SLP unrolling. */
5511 for (j = group_size; scalar_results.iterate (j, &res);
5512 j++)
5514 first_res = scalar_results[j % group_size];
5515 new_stmt = gimple_build_assign (new_scalar_dest, code,
5516 first_res, res);
5517 new_res = make_ssa_name (new_scalar_dest, new_stmt);
5518 gimple_assign_set_lhs (new_stmt, new_res);
5519 gsi_insert_before (&exit_gsi, new_stmt, GSI_SAME_STMT);
5520 scalar_results[j % group_size] = new_res;
5523 else
5524 /* Not SLP - we have one scalar to keep in SCALAR_RESULTS. */
5525 scalar_results.safe_push (new_temp);
5528 if ((STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info)
5529 == INTEGER_INDUC_COND_REDUCTION)
5530 && !operand_equal_p (initial_def, induc_val, 0))
5532 /* Earlier we set the initial value to be a vector of induc_val
5533 values. Check the result and if it is induc_val then replace
5534 it with the original initial value, unless induc_val is
5535 the same as initial_def already. */
5536 tree zcompare = build2 (EQ_EXPR, boolean_type_node, new_temp,
5537 induc_val);
5539 tree tmp = make_ssa_name (new_scalar_dest);
5540 epilog_stmt = gimple_build_assign (tmp, COND_EXPR, zcompare,
5541 initial_def, new_temp);
5542 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5543 scalar_results[0] = tmp;
5547 vect_finalize_reduction:
5549 if (double_reduc)
5550 loop = loop->inner;
5552 /* 2.5 Adjust the final result by the initial value of the reduction
5553 variable. (When such adjustment is not needed, then
5554 'adjustment_def' is zero). For example, if code is PLUS we create:
5555 new_temp = loop_exit_def + adjustment_def */
5557 if (adjustment_def)
5559 gcc_assert (!slp_reduc);
5560 if (nested_in_vect_loop)
5562 new_phi = new_phis[0];
5563 gcc_assert (TREE_CODE (TREE_TYPE (adjustment_def)) == VECTOR_TYPE);
5564 expr = build2 (code, vectype, PHI_RESULT (new_phi), adjustment_def);
5565 new_dest = vect_create_destination_var (scalar_dest, vectype);
5567 else
5569 new_temp = scalar_results[0];
5570 gcc_assert (TREE_CODE (TREE_TYPE (adjustment_def)) != VECTOR_TYPE);
5571 expr = build2 (code, scalar_type, new_temp, adjustment_def);
5572 new_dest = vect_create_destination_var (scalar_dest, scalar_type);
5575 epilog_stmt = gimple_build_assign (new_dest, expr);
5576 new_temp = make_ssa_name (new_dest, epilog_stmt);
5577 gimple_assign_set_lhs (epilog_stmt, new_temp);
5578 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5579 if (nested_in_vect_loop)
5581 set_vinfo_for_stmt (epilog_stmt,
5582 new_stmt_vec_info (epilog_stmt, loop_vinfo));
5583 STMT_VINFO_RELATED_STMT (vinfo_for_stmt (epilog_stmt)) =
5584 STMT_VINFO_RELATED_STMT (vinfo_for_stmt (new_phi));
5586 if (!double_reduc)
5587 scalar_results.quick_push (new_temp);
5588 else
5589 scalar_results[0] = new_temp;
5591 else
5592 scalar_results[0] = new_temp;
5594 new_phis[0] = epilog_stmt;
5597 /* 2.6 Handle the loop-exit phis. Replace the uses of scalar loop-exit
5598 phis with new adjusted scalar results, i.e., replace use <s_out0>
5599 with use <s_out4>.
5601 Transform:
5602 loop_exit:
5603 s_out0 = phi <s_loop> # (scalar) EXIT_PHI
5604 v_out1 = phi <VECT_DEF> # NEW_EXIT_PHI
5605 v_out2 = reduce <v_out1>
5606 s_out3 = extract_field <v_out2, 0>
5607 s_out4 = adjust_result <s_out3>
5608 use <s_out0>
5609 use <s_out0>
5611 into:
5613 loop_exit:
5614 s_out0 = phi <s_loop> # (scalar) EXIT_PHI
5615 v_out1 = phi <VECT_DEF> # NEW_EXIT_PHI
5616 v_out2 = reduce <v_out1>
5617 s_out3 = extract_field <v_out2, 0>
5618 s_out4 = adjust_result <s_out3>
5619 use <s_out4>
5620 use <s_out4> */
5623 /* In an SLP reduction chain we reduce the vector results into one vector
5624 if necessary, hence we set GROUP_SIZE to 1 here. SCALAR_DEST is the LHS
5625 of the last stmt in the reduction chain, since we are looking for the
5626 loop exit phi node. */
5627 if (GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)))
5629 gimple *dest_stmt = SLP_TREE_SCALAR_STMTS (slp_node)[group_size - 1];
5630 /* Handle reduction patterns. */
5631 if (STMT_VINFO_RELATED_STMT (vinfo_for_stmt (dest_stmt)))
5632 dest_stmt = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (dest_stmt));
5634 scalar_dest = gimple_assign_lhs (dest_stmt);
5635 group_size = 1;
5638 /* In SLP we may have several statements in NEW_PHIS and REDUCTION_PHIS (in
5639 case GROUP_SIZE is greater than the vectorization factor). Therefore, we
5640 need to match SCALAR_RESULTS with corresponding statements. The first
5641 (GROUP_SIZE / number of new vector stmts) scalar results correspond to
5642 the first vector stmt, etc.
5643 (RATIO is equal to (GROUP_SIZE / number of new vector stmts)). */
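/* E.g. with GROUP_SIZE 4 and two vector stmts, RATIO is 2: scalar results
   0 and 1 belong to the first vector stmt, 2 and 3 to the second.  */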
5644 if (group_size > new_phis.length ())
5646 ratio = group_size / new_phis.length ();
5647 gcc_assert (!(group_size % new_phis.length ()));
5649 else
5650 ratio = 1;
5652 for (k = 0; k < group_size; k++)
5654 if (k % ratio == 0)
5656 epilog_stmt = new_phis[k / ratio];
5657 reduction_phi = reduction_phis[k / ratio];
5658 if (double_reduc)
5659 inner_phi = inner_phis[k / ratio];
5662 if (slp_reduc)
5664 gimple *current_stmt = SLP_TREE_SCALAR_STMTS (slp_node)[k];
5666 orig_stmt = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (current_stmt));
5667 /* SLP statements can't participate in patterns. */
5668 gcc_assert (!orig_stmt);
5669 scalar_dest = gimple_assign_lhs (current_stmt);
5672 phis.create (3);
5673 /* Find the loop-closed-use at the loop exit of the original scalar
5674 result. (The reduction result is expected to have two immediate uses -
5675 one at the latch block, and one at the loop exit). */
5676 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, scalar_dest)
5677 if (!flow_bb_inside_loop_p (loop, gimple_bb (USE_STMT (use_p)))
5678 && !is_gimple_debug (USE_STMT (use_p)))
5679 phis.safe_push (USE_STMT (use_p));
5681 /* While we expect to have found an exit_phi because of loop-closed-ssa
5682 form we can end up without one if the scalar cycle is dead. */
5684 FOR_EACH_VEC_ELT (phis, i, exit_phi)
5686 if (outer_loop)
5688 stmt_vec_info exit_phi_vinfo = vinfo_for_stmt (exit_phi);
5689 gphi *vect_phi;
5691 /* FORNOW. Currently not supporting the case that an inner-loop
5692 reduction is not used in the outer-loop (but only outside the
5693 outer-loop), unless it is a double reduction. */
5694 gcc_assert ((STMT_VINFO_RELEVANT_P (exit_phi_vinfo)
5695 && !STMT_VINFO_LIVE_P (exit_phi_vinfo))
5696 || double_reduc);
5698 if (double_reduc)
5699 STMT_VINFO_VEC_STMT (exit_phi_vinfo) = inner_phi;
5700 else
5701 STMT_VINFO_VEC_STMT (exit_phi_vinfo) = epilog_stmt;
5702 if (!double_reduc
5703 || STMT_VINFO_DEF_TYPE (exit_phi_vinfo)
5704 != vect_double_reduction_def)
5705 continue;
5707 /* Handle double reduction:
5709 stmt1: s1 = phi <s0, s2> - double reduction phi (outer loop)
5710 stmt2: s3 = phi <s1, s4> - (regular) reduc phi (inner loop)
5711 stmt3: s4 = use (s3) - (regular) reduc stmt (inner loop)
5712 stmt4: s2 = phi <s4> - double reduction stmt (outer loop)
5714 At that point the regular reduction (stmt2 and stmt3) is
5715 already vectorized, as well as the exit phi node, stmt4.
5716 Here we vectorize the phi node of double reduction, stmt1, and
5717 update all relevant statements. */
5719 /* Go through all the uses of s2 to find double reduction phi
5720 node, i.e., stmt1 above. */
5721 orig_name = PHI_RESULT (exit_phi);
5722 FOR_EACH_IMM_USE_STMT (use_stmt, imm_iter, orig_name)
5724 stmt_vec_info use_stmt_vinfo;
5725 stmt_vec_info new_phi_vinfo;
5726 tree vect_phi_init, preheader_arg, vect_phi_res;
5727 basic_block bb = gimple_bb (use_stmt);
5728 gimple *use;
5730 /* Check that USE_STMT is really a double reduction phi
5731 node. */
5732 if (gimple_code (use_stmt) != GIMPLE_PHI
5733 || gimple_phi_num_args (use_stmt) != 2
5734 || bb->loop_father != outer_loop)
5735 continue;
5736 use_stmt_vinfo = vinfo_for_stmt (use_stmt);
5737 if (!use_stmt_vinfo
5738 || STMT_VINFO_DEF_TYPE (use_stmt_vinfo)
5739 != vect_double_reduction_def)
5740 continue;
5742 /* Create vector phi node for double reduction:
5743 vs1 = phi <vs0, vs2>
5744 vs1 was created previously in this function by a call to
5745 vect_get_vec_def_for_operand and is stored in
5746 vec_initial_def;
5747 vs2 is defined by INNER_PHI, the vectorized EXIT_PHI;
5748 vs0 is created here. */
5750 /* Create vector phi node. */
5751 vect_phi = create_phi_node (vec_initial_def, bb);
5752 new_phi_vinfo = new_stmt_vec_info (vect_phi,
5753 loop_vec_info_for_loop (outer_loop));
5754 set_vinfo_for_stmt (vect_phi, new_phi_vinfo);
5756 /* Create vs0 - initial def of the double reduction phi. */
5757 preheader_arg = PHI_ARG_DEF_FROM_EDGE (use_stmt,
5758 loop_preheader_edge (outer_loop));
5759 vect_phi_init = get_initial_def_for_reduction
5760 (stmt, preheader_arg, NULL);
5762 /* Update phi node arguments with vs0 and vs2. */
5763 add_phi_arg (vect_phi, vect_phi_init,
5764 loop_preheader_edge (outer_loop),
5765 UNKNOWN_LOCATION);
5766 add_phi_arg (vect_phi, PHI_RESULT (inner_phi),
5767 loop_latch_edge (outer_loop), UNKNOWN_LOCATION);
5768 if (dump_enabled_p ())
5770 dump_printf_loc (MSG_NOTE, vect_location,
5771 "created double reduction phi node: ");
5772 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, vect_phi, 0);
5775 vect_phi_res = PHI_RESULT (vect_phi);
5777 /* Replace the use, i.e., set the correct vs1 in the regular
5778 reduction phi node. FORNOW, NCOPIES is always 1, so the
5779 loop is redundant. */
5780 use = reduction_phi;
5781 for (j = 0; j < ncopies; j++)
5783 edge pr_edge = loop_preheader_edge (loop);
5784 SET_PHI_ARG_DEF (use, pr_edge->dest_idx, vect_phi_res);
5785 use = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (use));
5791 phis.release ();
5792 if (nested_in_vect_loop)
5794 if (double_reduc)
5795 loop = outer_loop;
5796 else
5797 continue;
5800 phis.create (3);
5801 /* Find the loop-closed-use at the loop exit of the original scalar
5802 result. (The reduction result is expected to have two immediate uses,
5803 one at the latch block, and one at the loop exit). For double
5804 reductions we are looking for exit phis of the outer loop. */
5805 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, scalar_dest)
5807 if (!flow_bb_inside_loop_p (loop, gimple_bb (USE_STMT (use_p))))
5809 if (!is_gimple_debug (USE_STMT (use_p)))
5810 phis.safe_push (USE_STMT (use_p));
5812 else
5814 if (double_reduc && gimple_code (USE_STMT (use_p)) == GIMPLE_PHI)
5816 tree phi_res = PHI_RESULT (USE_STMT (use_p));
5818 FOR_EACH_IMM_USE_FAST (phi_use_p, phi_imm_iter, phi_res)
5820 if (!flow_bb_inside_loop_p (loop,
5821 gimple_bb (USE_STMT (phi_use_p)))
5822 && !is_gimple_debug (USE_STMT (phi_use_p)))
5823 phis.safe_push (USE_STMT (phi_use_p));
5829 FOR_EACH_VEC_ELT (phis, i, exit_phi)
5831 /* Replace the uses: */
5832 orig_name = PHI_RESULT (exit_phi);
5833 scalar_result = scalar_results[k];
5834 FOR_EACH_IMM_USE_STMT (use_stmt, imm_iter, orig_name)
5835 FOR_EACH_IMM_USE_ON_STMT (use_p, imm_iter)
5836 SET_USE (use_p, scalar_result);
5839 phis.release ();
5843 /* Return a vector of type VECTYPE that is equal to the vector select
5844 operation "MASK ? VEC : IDENTITY". Insert the select statements
5845 before GSI. */
5847 static tree
5848 merge_with_identity (gimple_stmt_iterator *gsi, tree mask, tree vectype,
5849 tree vec, tree identity)
5851 tree cond = make_temp_ssa_name (vectype, NULL, "cond");
5852 gimple *new_stmt = gimple_build_assign (cond, VEC_COND_EXPR,
5853 mask, vec, identity);
5854 gsi_insert_before (gsi, new_stmt, GSI_SAME_STMT);
5855 return cond;
5858 /* Successively apply CODE to each element of VECTOR_RHS, in left-to-right
5859 order, starting with LHS. Insert the extraction statements before GSI and
5860 associate the new scalar SSA names with variable SCALAR_DEST.
5861 Return the SSA name for the result. */
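/* For example, with CODE == PLUS_EXPR and a four-element VECTOR_RHS the
   expansion below is equivalent to

       t0 = LHS + VECTOR_RHS[0];
       t1 = t0 + VECTOR_RHS[1];
       t2 = t1 + VECTOR_RHS[2];
       t3 = t2 + VECTOR_RHS[3];

   and T3 is returned, preserving the original left-to-right association.  */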
5863 static tree
5864 vect_expand_fold_left (gimple_stmt_iterator *gsi, tree scalar_dest,
5865 tree_code code, tree lhs, tree vector_rhs)
5867 tree vectype = TREE_TYPE (vector_rhs);
5868 tree scalar_type = TREE_TYPE (vectype);
5869 tree bitsize = TYPE_SIZE (scalar_type);
5870 unsigned HOST_WIDE_INT vec_size_in_bits = tree_to_uhwi (TYPE_SIZE (vectype));
5871 unsigned HOST_WIDE_INT element_bitsize = tree_to_uhwi (bitsize);
5873 for (unsigned HOST_WIDE_INT bit_offset = 0;
5874 bit_offset < vec_size_in_bits;
5875 bit_offset += element_bitsize)
5877 tree bitpos = bitsize_int (bit_offset);
5878 tree rhs = build3 (BIT_FIELD_REF, scalar_type, vector_rhs,
5879 bitsize, bitpos);
5881 gassign *stmt = gimple_build_assign (scalar_dest, rhs);
5882 rhs = make_ssa_name (scalar_dest, stmt);
5883 gimple_assign_set_lhs (stmt, rhs);
5884 gsi_insert_before (gsi, stmt, GSI_SAME_STMT);
5886 stmt = gimple_build_assign (scalar_dest, code, lhs, rhs);
5887 tree new_name = make_ssa_name (scalar_dest, stmt);
5888 gimple_assign_set_lhs (stmt, new_name);
5889 gsi_insert_before (gsi, stmt, GSI_SAME_STMT);
5890 lhs = new_name;
5892 return lhs;
5895 /* Perform an in-order reduction (FOLD_LEFT_REDUCTION). STMT is the
5896 statement that sets the live-out value. REDUC_DEF_STMT is the phi
5897 statement. CODE is the operation performed by STMT and OPS are
5898 its scalar operands. REDUC_INDEX is the index of the operand in
5899 OPS that is set by REDUC_DEF_STMT. REDUC_FN is the function that
5900 implements in-order reduction, or IFN_LAST if we should open-code it.
5901 VECTYPE_IN is the type of the vector input. MASKS specifies the masks
5902 that should be used to control the operation in a fully-masked loop. */
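/* A typical source of such reductions is an in-order floating-point
   accumulation such as

       for (int i = 0; i < n; ++i)
	 res += a[i];

   compiled without -ffast-math, where reassociation is not allowed.  Each
   vector of A is then folded into the scalar accumulator element by element,
   either through REDUC_FN or, when that is IFN_LAST, through the open-coded
   expansion in vect_expand_fold_left above.  */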
5904 static bool
5905 vectorize_fold_left_reduction (gimple *stmt, gimple_stmt_iterator *gsi,
5906 gimple **vec_stmt, slp_tree slp_node,
5907 gimple *reduc_def_stmt,
5908 tree_code code, internal_fn reduc_fn,
5909 tree ops[3], tree vectype_in,
5910 int reduc_index, vec_loop_masks *masks)
5912 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
5913 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
5914 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
5915 tree vectype_out = STMT_VINFO_VECTYPE (stmt_info);
5916 gimple *new_stmt = NULL;
5918 int ncopies;
5919 if (slp_node)
5920 ncopies = 1;
5921 else
5922 ncopies = vect_get_num_copies (loop_vinfo, vectype_in);
5924 gcc_assert (!nested_in_vect_loop_p (loop, stmt));
5925 gcc_assert (ncopies == 1);
5926 gcc_assert (TREE_CODE_LENGTH (code) == binary_op);
5927 gcc_assert (reduc_index == (code == MINUS_EXPR ? 0 : 1));
5928 gcc_assert (STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info)
5929 == FOLD_LEFT_REDUCTION);
5931 if (slp_node)
5932 gcc_assert (known_eq (TYPE_VECTOR_SUBPARTS (vectype_out),
5933 TYPE_VECTOR_SUBPARTS (vectype_in)));
5935 tree op0 = ops[1 - reduc_index];
5937 int group_size = 1;
5938 gimple *scalar_dest_def;
5939 auto_vec<tree> vec_oprnds0;
5940 if (slp_node)
5942 vect_get_vec_defs (op0, NULL_TREE, stmt, &vec_oprnds0, NULL, slp_node);
5943 group_size = SLP_TREE_SCALAR_STMTS (slp_node).length ();
5944 scalar_dest_def = SLP_TREE_SCALAR_STMTS (slp_node)[group_size - 1];
5946 else
5948 tree loop_vec_def0 = vect_get_vec_def_for_operand (op0, stmt);
5949 vec_oprnds0.create (1);
5950 vec_oprnds0.quick_push (loop_vec_def0);
5951 scalar_dest_def = stmt;
5954 tree scalar_dest = gimple_assign_lhs (scalar_dest_def);
5955 tree scalar_type = TREE_TYPE (scalar_dest);
5956 tree reduc_var = gimple_phi_result (reduc_def_stmt);
5958 int vec_num = vec_oprnds0.length ();
5959 gcc_assert (vec_num == 1 || slp_node);
5960 tree vec_elem_type = TREE_TYPE (vectype_out);
5961 gcc_checking_assert (useless_type_conversion_p (scalar_type, vec_elem_type));
5963 tree vector_identity = NULL_TREE;
5964 if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
5965 vector_identity = build_zero_cst (vectype_out);
5967 tree scalar_dest_var = vect_create_destination_var (scalar_dest, NULL);
5968 int i;
5969 tree def0;
5970 FOR_EACH_VEC_ELT (vec_oprnds0, i, def0)
5972 tree mask = NULL_TREE;
5973 if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
5974 mask = vect_get_loop_mask (gsi, masks, vec_num, vectype_in, i);
5976 /* Handle MINUS by adding the negative. */
5977 if (reduc_fn != IFN_LAST && code == MINUS_EXPR)
5979 tree negated = make_ssa_name (vectype_out);
5980 new_stmt = gimple_build_assign (negated, NEGATE_EXPR, def0);
5981 gsi_insert_before (gsi, new_stmt, GSI_SAME_STMT);
5982 def0 = negated;
5985 if (mask)
5986 def0 = merge_with_identity (gsi, mask, vectype_out, def0,
5987 vector_identity);
5989 /* On the first iteration the input is simply the scalar phi
5990 result, and for subsequent iterations it is the output of
5991 the preceding operation. */
5992 if (reduc_fn != IFN_LAST)
5994 new_stmt = gimple_build_call_internal (reduc_fn, 2, reduc_var, def0);
5995 /* For chained SLP reductions the output of the previous reduction
5996 operation serves as the input of the next. For the final statement
5997 the output cannot be a temporary - we reuse the original
5998 scalar destination of the last statement. */
5999 if (i != vec_num - 1)
6001 gimple_set_lhs (new_stmt, scalar_dest_var);
6002 reduc_var = make_ssa_name (scalar_dest_var, new_stmt);
6003 gimple_set_lhs (new_stmt, reduc_var);
6006 else
6008 reduc_var = vect_expand_fold_left (gsi, scalar_dest_var, code,
6009 reduc_var, def0);
6010 new_stmt = SSA_NAME_DEF_STMT (reduc_var);
6011 /* Remove the statement, so that we can use the same code paths
6012 as for statements that we've just created. */
6013 gimple_stmt_iterator tmp_gsi = gsi_for_stmt (new_stmt);
6014 gsi_remove (&tmp_gsi, false);
6017 if (i == vec_num - 1)
6019 gimple_set_lhs (new_stmt, scalar_dest);
6020 vect_finish_replace_stmt (scalar_dest_def, new_stmt);
6022 else
6023 vect_finish_stmt_generation (scalar_dest_def, new_stmt, gsi);
6025 if (slp_node)
6026 SLP_TREE_VEC_STMTS (slp_node).quick_push (new_stmt);
6029 if (!slp_node)
6030 STMT_VINFO_VEC_STMT (stmt_info) = *vec_stmt = new_stmt;
6032 return true;
6035 /* Function is_nonwrapping_integer_induction.
6037 Check if the induction defined by STMT (which is part of loop LOOP)
6038 is increasing and cannot overflow. */
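/* For example, an induction with base 0 and step 4 in a loop that executes
   at most 100 times reaches at most 0 + 4 * 100 = 400, which needs far
   fewer bits than a 32-bit induction variable provides, so it cannot wrap.
   (If overflow is undefined for the type, we simply assume it does not
   happen.)  */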
6040 static bool
6041 is_nonwrapping_integer_induction (gimple *stmt, struct loop *loop)
6043 stmt_vec_info stmt_vinfo = vinfo_for_stmt (stmt);
6044 tree base = STMT_VINFO_LOOP_PHI_EVOLUTION_BASE_UNCHANGED (stmt_vinfo);
6045 tree step = STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_vinfo);
6046 tree lhs_type = TREE_TYPE (gimple_phi_result (stmt));
6047 widest_int ni, max_loop_value, lhs_max;
6048 bool overflow = false;
6050 /* Make sure the loop is integer based. */
6051 if (TREE_CODE (base) != INTEGER_CST
6052 || TREE_CODE (step) != INTEGER_CST)
6053 return false;
6055 /* Check that the max size of the loop will not wrap. */
6057 if (TYPE_OVERFLOW_UNDEFINED (lhs_type))
6058 return true;
6060 if (! max_stmt_executions (loop, &ni))
6061 return false;
6063 max_loop_value = wi::mul (wi::to_widest (step), ni, TYPE_SIGN (lhs_type),
6064 &overflow);
6065 if (overflow)
6066 return false;
6068 max_loop_value = wi::add (wi::to_widest (base), max_loop_value,
6069 TYPE_SIGN (lhs_type), &overflow);
6070 if (overflow)
6071 return false;
6073 return (wi::min_precision (max_loop_value, TYPE_SIGN (lhs_type))
6074 <= TYPE_PRECISION (lhs_type));
6077 /* Function vectorizable_reduction.
6079 Check if STMT performs a reduction operation that can be vectorized.
6080 If VEC_STMT is also passed, vectorize the STMT: create a vectorized
6081 stmt to replace it, put it in VEC_STMT, and insert it at GSI.
6082 Return FALSE if not a vectorizable STMT, TRUE otherwise.
6084 This function also handles reduction idioms (patterns) that have been
6085 recognized in advance during vect_pattern_recog. In this case, STMT may be
6086 of this form:
6087 X = pattern_expr (arg0, arg1, ..., X)
6088 and its STMT_VINFO_RELATED_STMT points to the last stmt in the original
6089 sequence that had been detected and replaced by the pattern-stmt (STMT).
6091 This function also handles reduction of condition expressions, for example:
6092 for (int i = 0; i < N; i++)
6093 if (a[i] < value)
6094 last = a[i];
6095 This is handled by vectorising the loop and creating an additional vector
6096 containing the loop indexes for which "a[i] < value" was true. In the
6097 function epilogue this is reduced to a single max value and then used to
6098 index into the vector of results.
6100 In some cases of reduction patterns, the type of the reduction variable X is
6101 different than the type of the other arguments of STMT.
6102 In such cases, the vectype that is used when transforming STMT into a vector
6103 stmt is different than the vectype that is used to determine the
6104 vectorization factor, because it consists of a different number of elements
6105 than the actual number of elements that are being operated upon in parallel.
6107 For example, consider an accumulation of shorts into an int accumulator.
6108 On some targets it's possible to vectorize this pattern operating on 8
6109 shorts at a time (hence, the vectype for purposes of determining the
6110 vectorization factor should be V8HI); on the other hand, the vectype that
6111 is used to create the vector form is actually V4SI (the type of the result).
6113 Upon entry to this function, STMT_VINFO_VECTYPE records the vectype that
6114 indicates what is the actual level of parallelism (V8HI in the example), so
6115 that the right vectorization factor would be derived. This vectype
6116 corresponds to the type of arguments to the reduction stmt, and should *NOT*
6117 be used to create the vectorized stmt. The right vectype for the vectorized
6118 stmt is obtained from the type of the result X:
6119 get_vectype_for_scalar_type (TREE_TYPE (X))
6121 This means that, contrary to "regular" reductions (or "regular" stmts in
6122 general), the following equation:
6123 STMT_VINFO_VECTYPE == get_vectype_for_scalar_type (TREE_TYPE (X))
6124 does *NOT* necessarily hold for reduction patterns. */
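/* As a minimal sketch of the widening accumulation described above (modes
   shown for the target assumed in that example):

       short s[N];
       int sum = 0;
       for (int i = 0; i < N; i++)
	 sum += s[i];

   STMT_VINFO_VECTYPE is V8HI, reflecting the eight shorts processed per
   vector iteration, while the vectorized statement itself is created with
   the V4SI type returned by get_vectype_for_scalar_type (TREE_TYPE (X)).  */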
6126 bool
6127 vectorizable_reduction (gimple *stmt, gimple_stmt_iterator *gsi,
6128 gimple **vec_stmt, slp_tree slp_node,
6129 slp_instance slp_node_instance,
6130 stmt_vector_for_cost *cost_vec)
6132 tree vec_dest;
6133 tree scalar_dest;
6134 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
6135 tree vectype_out = STMT_VINFO_VECTYPE (stmt_info);
6136 tree vectype_in = NULL_TREE;
6137 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
6138 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
6139 enum tree_code code, orig_code;
6140 internal_fn reduc_fn;
6141 machine_mode vec_mode;
6142 int op_type;
6143 optab optab;
6144 tree new_temp = NULL_TREE;
6145 gimple *def_stmt;
6146 enum vect_def_type dt, cond_reduc_dt = vect_unknown_def_type;
6147 gimple *cond_reduc_def_stmt = NULL;
6148 enum tree_code cond_reduc_op_code = ERROR_MARK;
6149 tree scalar_type;
6150 bool is_simple_use;
6151 gimple *orig_stmt;
6152 stmt_vec_info orig_stmt_info = NULL;
6153 int i;
6154 int ncopies;
6155 int epilog_copies;
6156 stmt_vec_info prev_stmt_info, prev_phi_info;
6157 bool single_defuse_cycle = false;
6158 gimple *new_stmt = NULL;
6159 int j;
6160 tree ops[3];
6161 enum vect_def_type dts[3];
6162 bool nested_cycle = false, found_nested_cycle_def = false;
6163 bool double_reduc = false;
6164 basic_block def_bb;
6165 struct loop * def_stmt_loop, *outer_loop = NULL;
6166 tree def_arg;
6167 gimple *def_arg_stmt;
6168 auto_vec<tree> vec_oprnds0;
6169 auto_vec<tree> vec_oprnds1;
6170 auto_vec<tree> vec_oprnds2;
6171 auto_vec<tree> vect_defs;
6172 auto_vec<gimple *> phis;
6173 int vec_num;
6174 tree def0, tem;
6175 bool first_p = true;
6176 tree cr_index_scalar_type = NULL_TREE, cr_index_vector_type = NULL_TREE;
6177 tree cond_reduc_val = NULL_TREE;
6179 /* Make sure it was already recognized as a reduction computation. */
6180 if (STMT_VINFO_DEF_TYPE (vinfo_for_stmt (stmt)) != vect_reduction_def
6181 && STMT_VINFO_DEF_TYPE (vinfo_for_stmt (stmt)) != vect_nested_cycle)
6182 return false;
6184 if (nested_in_vect_loop_p (loop, stmt))
6186 outer_loop = loop;
6187 loop = loop->inner;
6188 nested_cycle = true;
6191 /* In case of reduction chain we switch to the first stmt in the chain, but
6192 we don't update STMT_INFO, since only the last stmt is marked as reduction
6193 and has reduction properties. */
6194 if (GROUP_FIRST_ELEMENT (stmt_info)
6195 && GROUP_FIRST_ELEMENT (stmt_info) != stmt)
6197 stmt = GROUP_FIRST_ELEMENT (stmt_info);
6198 first_p = false;
6201 if (gimple_code (stmt) == GIMPLE_PHI)
6203 /* Analysis is fully done on the reduction stmt invocation. */
6204 if (! vec_stmt)
6206 if (slp_node)
6207 slp_node_instance->reduc_phis = slp_node;
6209 STMT_VINFO_TYPE (stmt_info) = reduc_vec_info_type;
6210 return true;
6213 if (STMT_VINFO_REDUC_TYPE (stmt_info) == FOLD_LEFT_REDUCTION)
6214 /* Leave the scalar phi in place. Note that checking
6215 STMT_VINFO_VEC_REDUCTION_TYPE (as below) only works
6216 for reductions involving a single statement. */
6217 return true;
6219 gimple *reduc_stmt = STMT_VINFO_REDUC_DEF (stmt_info);
6220 if (STMT_VINFO_IN_PATTERN_P (vinfo_for_stmt (reduc_stmt)))
6221 reduc_stmt = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (reduc_stmt));
6223 if (STMT_VINFO_VEC_REDUCTION_TYPE (vinfo_for_stmt (reduc_stmt))
6224 == EXTRACT_LAST_REDUCTION)
6225 /* Leave the scalar phi in place. */
6226 return true;
6228 gcc_assert (is_gimple_assign (reduc_stmt));
6229 for (unsigned k = 1; k < gimple_num_ops (reduc_stmt); ++k)
6231 tree op = gimple_op (reduc_stmt, k);
6232 if (op == gimple_phi_result (stmt))
6233 continue;
6234 if (k == 1
6235 && gimple_assign_rhs_code (reduc_stmt) == COND_EXPR)
6236 continue;
6237 if (!vectype_in
6238 || (GET_MODE_SIZE (SCALAR_TYPE_MODE (TREE_TYPE (vectype_in)))
6239 < GET_MODE_SIZE (SCALAR_TYPE_MODE (TREE_TYPE (op)))))
6240 vectype_in = get_vectype_for_scalar_type (TREE_TYPE (op));
6241 break;
6243 gcc_assert (vectype_in);
6245 if (slp_node)
6246 ncopies = 1;
6247 else
6248 ncopies = vect_get_num_copies (loop_vinfo, vectype_in);
6250 use_operand_p use_p;
6251 gimple *use_stmt;
6252 if (ncopies > 1
6253 && (STMT_VINFO_RELEVANT (vinfo_for_stmt (reduc_stmt))
6254 <= vect_used_only_live)
6255 && single_imm_use (gimple_phi_result (stmt), &use_p, &use_stmt)
6256 && (use_stmt == reduc_stmt
6257 || (STMT_VINFO_RELATED_STMT (vinfo_for_stmt (use_stmt))
6258 == reduc_stmt)))
6259 single_defuse_cycle = true;
6261 /* Create the destination vector */
6262 scalar_dest = gimple_assign_lhs (reduc_stmt);
6263 vec_dest = vect_create_destination_var (scalar_dest, vectype_out);
6265 if (slp_node)
6266 /* The size vect_schedule_slp_instance computes is off for us. */
6267 vec_num = vect_get_num_vectors
6268 (LOOP_VINFO_VECT_FACTOR (loop_vinfo)
6269 * SLP_TREE_SCALAR_STMTS (slp_node).length (),
6270 vectype_in);
6271 else
6272 vec_num = 1;
6274 /* Generate the reduction PHIs upfront. */
6275 prev_phi_info = NULL;
6276 for (j = 0; j < ncopies; j++)
6278 if (j == 0 || !single_defuse_cycle)
6280 for (i = 0; i < vec_num; i++)
6282 /* Create the reduction-phi that defines the reduction
6283 operand. */
6284 gimple *new_phi = create_phi_node (vec_dest, loop->header);
6285 set_vinfo_for_stmt (new_phi,
6286 new_stmt_vec_info (new_phi, loop_vinfo));
6288 if (slp_node)
6289 SLP_TREE_VEC_STMTS (slp_node).quick_push (new_phi);
6290 else
6292 if (j == 0)
6293 STMT_VINFO_VEC_STMT (stmt_info) = *vec_stmt = new_phi;
6294 else
6295 STMT_VINFO_RELATED_STMT (prev_phi_info) = new_phi;
6296 prev_phi_info = vinfo_for_stmt (new_phi);
6302 return true;
6305 /* 1. Is vectorizable reduction? */
6306 /* Not supportable if the reduction variable is used in the loop, unless
6307 it's a reduction chain. */
6308 if (STMT_VINFO_RELEVANT (stmt_info) > vect_used_in_outer
6309 && !GROUP_FIRST_ELEMENT (stmt_info))
6310 return false;
6312 /* Reductions that are not used even in an enclosing outer-loop,
6313 are expected to be "live" (used out of the loop). */
6314 if (STMT_VINFO_RELEVANT (stmt_info) == vect_unused_in_scope
6315 && !STMT_VINFO_LIVE_P (stmt_info))
6316 return false;
6318 /* 2. Has this been recognized as a reduction pattern?
6320 Check if STMT represents a pattern that has been recognized
6321 in earlier analysis stages. For stmts that represent a pattern,
6322 the STMT_VINFO_RELATED_STMT field records the last stmt in
6323 the original sequence that constitutes the pattern. */
6325 orig_stmt = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (stmt));
6326 if (orig_stmt)
6328 orig_stmt_info = vinfo_for_stmt (orig_stmt);
6329 gcc_assert (STMT_VINFO_IN_PATTERN_P (orig_stmt_info));
6330 gcc_assert (!STMT_VINFO_IN_PATTERN_P (stmt_info));
6333 /* 3. Check the operands of the operation. The first operands are defined
6334 inside the loop body. The last operand is the reduction variable,
6335 which is defined by the loop-header-phi. */
6337 gcc_assert (is_gimple_assign (stmt));
6339 /* Flatten RHS. */
6340 switch (get_gimple_rhs_class (gimple_assign_rhs_code (stmt)))
6342 case GIMPLE_BINARY_RHS:
6343 code = gimple_assign_rhs_code (stmt);
6344 op_type = TREE_CODE_LENGTH (code);
6345 gcc_assert (op_type == binary_op);
6346 ops[0] = gimple_assign_rhs1 (stmt);
6347 ops[1] = gimple_assign_rhs2 (stmt);
6348 break;
6350 case GIMPLE_TERNARY_RHS:
6351 code = gimple_assign_rhs_code (stmt);
6352 op_type = TREE_CODE_LENGTH (code);
6353 gcc_assert (op_type == ternary_op);
6354 ops[0] = gimple_assign_rhs1 (stmt);
6355 ops[1] = gimple_assign_rhs2 (stmt);
6356 ops[2] = gimple_assign_rhs3 (stmt);
6357 break;
6359 case GIMPLE_UNARY_RHS:
6360 return false;
6362 default:
6363 gcc_unreachable ();
6366 if (code == COND_EXPR && slp_node)
6367 return false;
6369 scalar_dest = gimple_assign_lhs (stmt);
6370 scalar_type = TREE_TYPE (scalar_dest);
6371 if (!POINTER_TYPE_P (scalar_type) && !INTEGRAL_TYPE_P (scalar_type)
6372 && !SCALAR_FLOAT_TYPE_P (scalar_type))
6373 return false;
6375 /* Do not try to vectorize bit-precision reductions. */
6376 if (!type_has_mode_precision_p (scalar_type))
6377 return false;
6379 /* All uses but the last are expected to be defined in the loop.
6380 The last use is the reduction variable. In case of nested cycle this
6381 assumption is not true: we use reduc_index to record the index of the
6382 reduction variable. */
6383 gimple *reduc_def_stmt = NULL;
6384 int reduc_index = -1;
6385 for (i = 0; i < op_type; i++)
6387 /* The condition of COND_EXPR is checked in vectorizable_condition(). */
6388 if (i == 0 && code == COND_EXPR)
6389 continue;
6391 is_simple_use = vect_is_simple_use (ops[i], loop_vinfo,
6392 &def_stmt, &dts[i], &tem);
6393 dt = dts[i];
6394 gcc_assert (is_simple_use);
6395 if (dt == vect_reduction_def)
6397 reduc_def_stmt = def_stmt;
6398 reduc_index = i;
6399 continue;
6401 else if (tem)
6403 /* To properly compute ncopies we are interested in the widest
6404 input type in case we're looking at a widening accumulation. */
6405 if (!vectype_in
6406 || (GET_MODE_SIZE (SCALAR_TYPE_MODE (TREE_TYPE (vectype_in)))
6407 < GET_MODE_SIZE (SCALAR_TYPE_MODE (TREE_TYPE (tem)))))
6408 vectype_in = tem;
6411 if (dt != vect_internal_def
6412 && dt != vect_external_def
6413 && dt != vect_constant_def
6414 && dt != vect_induction_def
6415 && !(dt == vect_nested_cycle && nested_cycle))
6416 return false;
6418 if (dt == vect_nested_cycle)
6420 found_nested_cycle_def = true;
6421 reduc_def_stmt = def_stmt;
6422 reduc_index = i;
6425 if (i == 1 && code == COND_EXPR)
6427 /* Record how value of COND_EXPR is defined. */
6428 if (dt == vect_constant_def)
6430 cond_reduc_dt = dt;
6431 cond_reduc_val = ops[i];
6433 if (dt == vect_induction_def
6434 && def_stmt != NULL
6435 && is_nonwrapping_integer_induction (def_stmt, loop))
6437 cond_reduc_dt = dt;
6438 cond_reduc_def_stmt = def_stmt;
6443 if (!vectype_in)
6444 vectype_in = vectype_out;
6446 /* When vectorizing a reduction chain w/o SLP the reduction PHI is not
6447 directly used in stmt. */
6448 if (reduc_index == -1)
6450 if (STMT_VINFO_REDUC_TYPE (stmt_info) == FOLD_LEFT_REDUCTION)
6452 if (dump_enabled_p ())
6453 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6454 "in-order reduction chain without SLP.\n");
6455 return false;
6458 if (orig_stmt)
6459 reduc_def_stmt = STMT_VINFO_REDUC_DEF (orig_stmt_info);
6460 else
6461 reduc_def_stmt = STMT_VINFO_REDUC_DEF (stmt_info);
6464 if (! reduc_def_stmt || gimple_code (reduc_def_stmt) != GIMPLE_PHI)
6465 return false;
6467 if (!(reduc_index == -1
6468 || dts[reduc_index] == vect_reduction_def
6469 || dts[reduc_index] == vect_nested_cycle
6470 || ((dts[reduc_index] == vect_internal_def
6471 || dts[reduc_index] == vect_external_def
6472 || dts[reduc_index] == vect_constant_def
6473 || dts[reduc_index] == vect_induction_def)
6474 && nested_cycle && found_nested_cycle_def)))
6476 /* For pattern recognized stmts, orig_stmt might be a reduction,
6477 but some helper statements for the pattern might not, or
6478 might be COND_EXPRs with reduction uses in the condition. */
6479 gcc_assert (orig_stmt);
6480 return false;
6483 stmt_vec_info reduc_def_info = vinfo_for_stmt (reduc_def_stmt);
6484 enum vect_reduction_type v_reduc_type
6485 = STMT_VINFO_REDUC_TYPE (reduc_def_info);
6486 gimple *tmp = STMT_VINFO_REDUC_DEF (reduc_def_info);
6488 STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info) = v_reduc_type;
6489 /* If we have a condition reduction, see if we can simplify it further. */
6490 if (v_reduc_type == COND_REDUCTION)
6492 /* TODO: We can't yet handle reduction chains, since we need to treat
6493 each COND_EXPR in the chain specially, not just the last one.
6494 E.g. for:
6496 x_1 = PHI <x_3, ...>
6497 x_2 = a_2 ? ... : x_1;
6498 x_3 = a_3 ? ... : x_2;
6500 we're interested in the last element in x_3 for which a_2 || a_3
6501 is true, whereas the current reduction chain handling would
6502 vectorize x_2 as a normal VEC_COND_EXPR and only treat x_3
6503 as a reduction operation. */
6504 if (reduc_index == -1)
6506 if (dump_enabled_p ())
6507 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6508 "conditional reduction chains not supported\n");
6509 return false;
6512 /* vect_is_simple_reduction ensured that operand 2 is the
6513 loop-carried operand. */
6514 gcc_assert (reduc_index == 2);
6516 /* Loop peeling modifies the initial value of the reduction PHI, which
6517 makes the reduction stmt to be transformed differ from the
6518 original stmt analyzed. We need to record the reduction code for
6519 CONST_COND_REDUCTION type reductions at the analysis stage, so that
6520 it can be used directly at the transform stage. */
6521 if (STMT_VINFO_VEC_CONST_COND_REDUC_CODE (stmt_info) == MAX_EXPR
6522 || STMT_VINFO_VEC_CONST_COND_REDUC_CODE (stmt_info) == MIN_EXPR)
6524 /* Also set the reduction type to CONST_COND_REDUCTION. */
6525 gcc_assert (cond_reduc_dt == vect_constant_def);
6526 STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info) = CONST_COND_REDUCTION;
6528 else if (direct_internal_fn_supported_p (IFN_FOLD_EXTRACT_LAST,
6529 vectype_in, OPTIMIZE_FOR_SPEED))
6531 if (dump_enabled_p ())
6532 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6533 "optimizing condition reduction with"
6534 " FOLD_EXTRACT_LAST.\n");
6535 STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info) = EXTRACT_LAST_REDUCTION;
6537 else if (cond_reduc_dt == vect_induction_def)
6539 stmt_vec_info cond_stmt_vinfo = vinfo_for_stmt (cond_reduc_def_stmt);
6540 tree base
6541 = STMT_VINFO_LOOP_PHI_EVOLUTION_BASE_UNCHANGED (cond_stmt_vinfo);
6542 tree step = STMT_VINFO_LOOP_PHI_EVOLUTION_PART (cond_stmt_vinfo);
6544 gcc_assert (TREE_CODE (base) == INTEGER_CST
6545 && TREE_CODE (step) == INTEGER_CST);
6546 cond_reduc_val = NULL_TREE;
6547 /* Find a suitable value: below base for MAX_EXPR, above base for
6548 MIN_EXPR; for now punt if base is the minimum value of the type
6549 for MAX_EXPR or the maximum value of the type for MIN_EXPR. */
6550 if (tree_int_cst_sgn (step) == -1)
6552 cond_reduc_op_code = MIN_EXPR;
6553 if (tree_int_cst_sgn (base) == -1)
6554 cond_reduc_val = build_int_cst (TREE_TYPE (base), 0);
6555 else if (tree_int_cst_lt (base,
6556 TYPE_MAX_VALUE (TREE_TYPE (base))))
6557 cond_reduc_val
6558 = int_const_binop (PLUS_EXPR, base, integer_one_node);
6560 else
6562 cond_reduc_op_code = MAX_EXPR;
6563 if (tree_int_cst_sgn (base) == 1)
6564 cond_reduc_val = build_int_cst (TREE_TYPE (base), 0);
6565 else if (tree_int_cst_lt (TYPE_MIN_VALUE (TREE_TYPE (base)),
6566 base))
6567 cond_reduc_val
6568 = int_const_binop (MINUS_EXPR, base, integer_one_node);
6570 if (cond_reduc_val)
6572 if (dump_enabled_p ())
6573 dump_printf_loc (MSG_NOTE, vect_location,
6574 "condition expression based on "
6575 "integer induction.\n");
6576 STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info)
6577 = INTEGER_INDUC_COND_REDUCTION;
6580 else if (cond_reduc_dt == vect_constant_def)
6582 enum vect_def_type cond_initial_dt;
6583 gimple *def_stmt = SSA_NAME_DEF_STMT (ops[reduc_index]);
6584 tree cond_initial_val
6585 = PHI_ARG_DEF_FROM_EDGE (def_stmt, loop_preheader_edge (loop));
6587 gcc_assert (cond_reduc_val != NULL_TREE);
6588 vect_is_simple_use (cond_initial_val, loop_vinfo,
6589 &def_stmt, &cond_initial_dt);
6590 if (cond_initial_dt == vect_constant_def
6591 && types_compatible_p (TREE_TYPE (cond_initial_val),
6592 TREE_TYPE (cond_reduc_val)))
6594 tree e = fold_binary (LE_EXPR, boolean_type_node,
6595 cond_initial_val, cond_reduc_val);
6596 if (e && (integer_onep (e) || integer_zerop (e)))
6598 if (dump_enabled_p ())
6599 dump_printf_loc (MSG_NOTE, vect_location,
6600 "condition expression based on "
6601 "compile time constant.\n");
6602 /* Record reduction code at analysis stage. */
6603 STMT_VINFO_VEC_CONST_COND_REDUC_CODE (stmt_info)
6604 = integer_onep (e) ? MAX_EXPR : MIN_EXPR;
6605 STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info)
6606 = CONST_COND_REDUCTION;
6612 if (orig_stmt)
6613 gcc_assert (tmp == orig_stmt
6614 || GROUP_FIRST_ELEMENT (vinfo_for_stmt (tmp)) == orig_stmt);
6615 else
6616 /* We changed STMT to be the first stmt in reduction chain, hence we
6617 check that in this case the first element in the chain is STMT. */
6618 gcc_assert (stmt == tmp
6619 || GROUP_FIRST_ELEMENT (vinfo_for_stmt (tmp)) == stmt);
6621 if (STMT_VINFO_LIVE_P (vinfo_for_stmt (reduc_def_stmt)))
6622 return false;
6624 if (slp_node)
6625 ncopies = 1;
6626 else
6627 ncopies = vect_get_num_copies (loop_vinfo, vectype_in);
6629 gcc_assert (ncopies >= 1);
6631 vec_mode = TYPE_MODE (vectype_in);
6632 poly_uint64 nunits_out = TYPE_VECTOR_SUBPARTS (vectype_out);
6634 if (code == COND_EXPR)
6636 /* Only call during the analysis stage, otherwise we'll lose
6637 STMT_VINFO_TYPE. */
6638 if (!vec_stmt && !vectorizable_condition (stmt, gsi, NULL,
6639 ops[reduc_index], 0, NULL,
6640 cost_vec))
6642 if (dump_enabled_p ())
6643 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6644 "unsupported condition in reduction\n");
6645 return false;
6648 else
6650 /* 4. Supportable by target? */
6652 if (code == LSHIFT_EXPR || code == RSHIFT_EXPR
6653 || code == LROTATE_EXPR || code == RROTATE_EXPR)
6655 /* Shifts and rotates are only supported by vectorizable_shifts,
6656 not vectorizable_reduction. */
6657 if (dump_enabled_p ())
6658 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6659 "unsupported shift or rotation.\n");
6660 return false;
6663 /* 4.1. check support for the operation in the loop */
6664 optab = optab_for_tree_code (code, vectype_in, optab_default);
6665 if (!optab)
6667 if (dump_enabled_p ())
6668 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6669 "no optab.\n");
6671 return false;
6674 if (optab_handler (optab, vec_mode) == CODE_FOR_nothing)
6676 if (dump_enabled_p ())
6677 dump_printf (MSG_NOTE, "op not supported by target.\n");
6679 if (maybe_ne (GET_MODE_SIZE (vec_mode), UNITS_PER_WORD)
6680 || !vect_worthwhile_without_simd_p (loop_vinfo, code))
6681 return false;
6683 if (dump_enabled_p ())
6684 dump_printf (MSG_NOTE, "proceeding using word mode.\n");
6687 /* Worthwhile without SIMD support? */
6688 if (!VECTOR_MODE_P (TYPE_MODE (vectype_in))
6689 && !vect_worthwhile_without_simd_p (loop_vinfo, code))
6691 if (dump_enabled_p ())
6692 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6693 "not worthwhile without SIMD support.\n");
6695 return false;
6699 /* 4.2. Check support for the epilog operation.
6701 If STMT represents a reduction pattern, then the type of the
6702 reduction variable may be different than the type of the rest
6703 of the arguments. For example, consider the case of accumulation
6704 of shorts into an int accumulator. The original code:
6705 S1: int_a = (int) short_a;
6706 orig_stmt-> S2: int_acc = plus <int_a ,int_acc>;
6708 was replaced with:
6709 STMT: int_acc = widen_sum <short_a, int_acc>
6711 This means that:
6712 1. The tree-code that is used to create the vector operation in the
6713 epilog code (that reduces the partial results) is not the
6714 tree-code of STMT, but is rather the tree-code of the original
6715 stmt from the pattern that STMT is replacing. I.e, in the example
6716 above we want to use 'widen_sum' in the loop, but 'plus' in the
6717 epilog.
6718 2. The type (mode) we use to check available target support
6719 for the vector operation to be created in the *epilog*, is
6720 determined by the type of the reduction variable (in the example
6721 above we'd check this: optab_handler (plus_optab, vect_int_mode])).
6722 However the type (mode) we use to check available target support
6723 for the vector operation to be created *inside the loop*, is
6724 determined by the type of the other arguments to STMT (in the
6725 example we'd check this: optab_handler (widen_sum_optab,
6726 vect_short_mode)).
6728 This is contrary to "regular" reductions, in which the types of all
6729 the arguments are the same as the type of the reduction variable.
6730 For "regular" reductions we can therefore use the same vector type
6731 (and also the same tree-code) when generating the epilog code and
6732 when generating the code inside the loop. */
6734 vect_reduction_type reduction_type
6735 = STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info);
6736 if (orig_stmt
6737 && (reduction_type == TREE_CODE_REDUCTION
6738 || reduction_type == FOLD_LEFT_REDUCTION))
6740 /* This is a reduction pattern: get the vectype from the type of the
6741 reduction variable, and get the tree-code from orig_stmt. */
6742 orig_code = gimple_assign_rhs_code (orig_stmt);
6743 gcc_assert (vectype_out);
6744 vec_mode = TYPE_MODE (vectype_out);
6746 else
6748 /* Regular reduction: the same vectype and tree-code as used for the
6749 vector code inside the loop can be used for the epilog code. */
6750 orig_code = code;
6752 if (code == MINUS_EXPR)
6753 orig_code = PLUS_EXPR;
6755 /* For simple condition reductions, replace with the actual expression
6756 we want to base our reduction around. */
6757 if (reduction_type == CONST_COND_REDUCTION)
6759 orig_code = STMT_VINFO_VEC_CONST_COND_REDUC_CODE (stmt_info);
6760 gcc_assert (orig_code == MAX_EXPR || orig_code == MIN_EXPR);
6762 else if (reduction_type == INTEGER_INDUC_COND_REDUCTION)
6763 orig_code = cond_reduc_op_code;
6766 if (nested_cycle)
6768 def_bb = gimple_bb (reduc_def_stmt);
6769 def_stmt_loop = def_bb->loop_father;
6770 def_arg = PHI_ARG_DEF_FROM_EDGE (reduc_def_stmt,
6771 loop_preheader_edge (def_stmt_loop));
6772 if (TREE_CODE (def_arg) == SSA_NAME
6773 && (def_arg_stmt = SSA_NAME_DEF_STMT (def_arg))
6774 && gimple_code (def_arg_stmt) == GIMPLE_PHI
6775 && flow_bb_inside_loop_p (outer_loop, gimple_bb (def_arg_stmt))
6776 && vinfo_for_stmt (def_arg_stmt)
6777 && STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def_arg_stmt))
6778 == vect_double_reduction_def)
6779 double_reduc = true;
6782 reduc_fn = IFN_LAST;
6784 if (reduction_type == TREE_CODE_REDUCTION
6785 || reduction_type == FOLD_LEFT_REDUCTION
6786 || reduction_type == INTEGER_INDUC_COND_REDUCTION
6787 || reduction_type == CONST_COND_REDUCTION)
6789 if (reduction_type == FOLD_LEFT_REDUCTION
6790 ? fold_left_reduction_fn (orig_code, &reduc_fn)
6791 : reduction_fn_for_scalar_code (orig_code, &reduc_fn))
6793 if (reduc_fn != IFN_LAST
6794 && !direct_internal_fn_supported_p (reduc_fn, vectype_out,
6795 OPTIMIZE_FOR_SPEED))
6797 if (dump_enabled_p ())
6798 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6799 "reduc op not supported by target.\n");
6801 reduc_fn = IFN_LAST;
6804 else
6806 if (!nested_cycle || double_reduc)
6808 if (dump_enabled_p ())
6809 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6810 "no reduc code for scalar code.\n");
6812 return false;
6816 else if (reduction_type == COND_REDUCTION)
6818 int scalar_precision
6819 = GET_MODE_PRECISION (SCALAR_TYPE_MODE (scalar_type));
6820 cr_index_scalar_type = make_unsigned_type (scalar_precision);
6821 cr_index_vector_type = build_vector_type (cr_index_scalar_type,
6822 nunits_out);
6824 if (direct_internal_fn_supported_p (IFN_REDUC_MAX, cr_index_vector_type,
6825 OPTIMIZE_FOR_SPEED))
6826 reduc_fn = IFN_REDUC_MAX;
6829 if (reduction_type != EXTRACT_LAST_REDUCTION
6830 && reduc_fn == IFN_LAST
6831 && !nunits_out.is_constant ())
6833 if (dump_enabled_p ())
6834 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6835 "missing target support for reduction on"
6836 " variable-length vectors.\n");
6837 return false;
6840 if ((double_reduc || reduction_type != TREE_CODE_REDUCTION)
6841 && ncopies > 1)
6843 if (dump_enabled_p ())
6844 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6845 "multiple types in double reduction or condition "
6846 "reduction.\n");
6847 return false;
6850 /* For SLP reductions, see if there is a neutral value we can use. */
6851 tree neutral_op = NULL_TREE;
6852 if (slp_node)
6853 neutral_op
6854 = neutral_op_for_slp_reduction (slp_node_instance->reduc_phis, code,
6855 GROUP_FIRST_ELEMENT (stmt_info) != NULL);
6857 if (double_reduc && reduction_type == FOLD_LEFT_REDUCTION)
6859 /* We can't support in-order reductions of code such as this:
6861 for (int i = 0; i < n1; ++i)
6862 for (int j = 0; j < n2; ++j)
6863 l += a[j];
6865 since GCC effectively transforms the loop when vectorizing:
6867 for (int i = 0; i < n1 / VF; ++i)
6868 for (int j = 0; j < n2; ++j)
6869 for (int k = 0; k < VF; ++k)
6870 l += a[j];
6872 which is a reassociation of the original operation. */
6873 if (dump_enabled_p ())
6874 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6875 "in-order double reduction not supported.\n");
6877 return false;
6880 if (reduction_type == FOLD_LEFT_REDUCTION
6881 && slp_node
6882 && !GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)))
6884 /* We cannot use in-order reductions in this case because there is
6885 an implicit reassociation of the operations involved. */
6886 if (dump_enabled_p ())
6887 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6888 "in-order unchained SLP reductions not supported.\n");
6889 return false;
6892 /* For double reductions, and for SLP reductions with a neutral value,
6893 we construct a variable-length initial vector by loading a vector
6894 full of the neutral value and then shift-and-inserting the start
6895 values into the low-numbered elements. */
6896 if ((double_reduc || neutral_op)
6897 && !nunits_out.is_constant ()
6898 && !direct_internal_fn_supported_p (IFN_VEC_SHL_INSERT,
6899 vectype_out, OPTIMIZE_FOR_SPEED))
6901 if (dump_enabled_p ())
6902 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6903 "reduction on variable-length vectors requires"
6904 " target support for a vector-shift-and-insert"
6905 " operation.\n");
6906 return false;
6909 /* Check extra constraints for variable-length unchained SLP reductions. */
6910 if (STMT_SLP_TYPE (stmt_info)
6911 && !GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt))
6912 && !nunits_out.is_constant ())
6914 /* We checked above that we could build the initial vector when
6915 there's a neutral element value. Check here for the case in
6916 which each SLP statement has its own initial value and in which
6917 that value needs to be repeated for every instance of the
6918 statement within the initial vector. */
6919 unsigned int group_size = SLP_TREE_SCALAR_STMTS (slp_node).length ();
6920 scalar_mode elt_mode = SCALAR_TYPE_MODE (TREE_TYPE (vectype_out));
6921 if (!neutral_op
6922 && !can_duplicate_and_interleave_p (group_size, elt_mode))
6924 if (dump_enabled_p ())
6925 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6926 "unsupported form of SLP reduction for"
6927 " variable-length vectors: cannot build"
6928 " initial vector.\n");
6929 return false;
6931 /* The epilogue code relies on the number of elements being a multiple
6932 of the group size. The duplicate-and-interleave approach to setting
6933 up the initial vector does too. */
6934 if (!multiple_p (nunits_out, group_size))
6936 if (dump_enabled_p ())
6937 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6938 "unsupported form of SLP reduction for"
6939 " variable-length vectors: the vector size"
6940 " is not a multiple of the number of results.\n");
6941 return false;
6945 /* In case of widening multiplication by a constant, we update the type
6946 of the constant to be the type of the other operand. We check that the
6947 constant fits the type in the pattern recognition pass. */
6948 if (code == DOT_PROD_EXPR
6949 && !types_compatible_p (TREE_TYPE (ops[0]), TREE_TYPE (ops[1])))
6951 if (TREE_CODE (ops[0]) == INTEGER_CST)
6952 ops[0] = fold_convert (TREE_TYPE (ops[1]), ops[0]);
6953 else if (TREE_CODE (ops[1]) == INTEGER_CST)
6954 ops[1] = fold_convert (TREE_TYPE (ops[0]), ops[1]);
6955 else
6957 if (dump_enabled_p ())
6958 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6959 "invalid types in dot-prod\n");
6961 return false;
6965 if (reduction_type == COND_REDUCTION)
6967 widest_int ni;
6969 if (! max_loop_iterations (loop, &ni))
6971 if (dump_enabled_p ())
6972 dump_printf_loc (MSG_NOTE, vect_location,
6973 "loop count not known, cannot create cond "
6974 "reduction.\n");
6975 return false;
6977 /* Convert backedges to iterations. */
6978 ni += 1;
6980 /* The additional index will be the same type as the condition. Check
6981 that the loop count fits into this type less one (because we use up
6982 the zero slot for the case when there are no matches). */
6983 tree max_index = TYPE_MAX_VALUE (cr_index_scalar_type);
6984 if (wi::geu_p (ni, wi::to_widest (max_index)))
6986 if (dump_enabled_p ())
6987 dump_printf_loc (MSG_NOTE, vect_location,
6988 "loop size is greater than data size.\n");
6989 return false;
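/* For illustration, assuming cr_index_scalar_type is an 8-bit unsigned
   type: max_index is 255, so after the ni += 1 conversion above the loop
   may run at most 254 iterations; index 0 stays reserved for the
   no-match case, and larger iteration counts are rejected here.  */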
6993 /* In case the vectorization factor (VF) is bigger than the number
6994 of elements that we can fit in a vectype (nunits), we have to generate
6995 more than one vector stmt - i.e. we need to "unroll" the
6996 vector stmt by a factor VF/nunits. For more details see documentation
6997 in vectorizable_operation. */
6999 /* If the reduction is used in an outer loop we need to generate
7000 VF intermediate results, like so (e.g. for ncopies=2):
7001 r0 = phi (init, r0)
7002 r1 = phi (init, r1)
7003 r0 = x0 + r0;
7004 r1 = x1 + r1;
7005 (i.e. we generate VF results in 2 registers).
7006 In this case we have a separate def-use cycle for each copy, and therefore
7007 for each copy we get the vector def for the reduction variable from the
7008 respective phi node created for this copy.
7010 Otherwise (the reduction is unused in the loop nest), we can combine
7011 together intermediate results, like so (e.g. for ncopies=2):
7012 r = phi (init, r)
7013 r = x0 + r;
7014 r = x1 + r;
7015 (i.e. we generate VF/2 results in a single register).
7016 In this case for each copy we get the vector def for the reduction variable
7017 from the vectorized reduction operation generated in the previous iteration.
7019 This only works when we see both the reduction PHI and its only consumer
7020 in vectorizable_reduction and there are no intermediate stmts
7021 participating. */
7022 use_operand_p use_p;
7023 gimple *use_stmt;
7024 if (ncopies > 1
7025 && (STMT_VINFO_RELEVANT (stmt_info) <= vect_used_only_live)
7026 && single_imm_use (gimple_phi_result (reduc_def_stmt), &use_p, &use_stmt)
7027 && (use_stmt == stmt
7028 || STMT_VINFO_RELATED_STMT (vinfo_for_stmt (use_stmt)) == stmt))
7030 single_defuse_cycle = true;
7031 epilog_copies = 1;
7033 else
7034 epilog_copies = ncopies;
7036 /* If the reduction stmt is one of the patterns that have lane
7037 reduction embedded we cannot handle the case of ! single_defuse_cycle. */
7038 if ((ncopies > 1
7039 && ! single_defuse_cycle)
7040 && (code == DOT_PROD_EXPR
7041 || code == WIDEN_SUM_EXPR
7042 || code == SAD_EXPR))
7044 if (dump_enabled_p ())
7045 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7046 "multi def-use cycle not possible for lane-reducing "
7047 "reduction operation\n");
7048 return false;
7051 if (slp_node)
7052 vec_num = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node);
7053 else
7054 vec_num = 1;
7056 internal_fn cond_fn = get_conditional_internal_fn (code);
7057 vec_loop_masks *masks = &LOOP_VINFO_MASKS (loop_vinfo);
7059 if (!vec_stmt) /* transformation not required. */
7061 if (first_p)
7062 vect_model_reduction_cost (stmt_info, reduc_fn, ncopies, cost_vec);
7063 if (loop_vinfo && LOOP_VINFO_CAN_FULLY_MASK_P (loop_vinfo))
7065 if (reduction_type != FOLD_LEFT_REDUCTION
7066 && (cond_fn == IFN_LAST
7067 || !direct_internal_fn_supported_p (cond_fn, vectype_in,
7068 OPTIMIZE_FOR_SPEED)))
7070 if (dump_enabled_p ())
7071 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7072 "can't use a fully-masked loop because no"
7073 " conditional operation is available.\n");
7074 LOOP_VINFO_CAN_FULLY_MASK_P (loop_vinfo) = false;
7076 else if (reduc_index == -1)
7078 if (dump_enabled_p ())
7079 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7080 "can't use a fully-masked loop for chained"
7081 " reductions.\n");
7082 LOOP_VINFO_CAN_FULLY_MASK_P (loop_vinfo) = false;
7084 else
7085 vect_record_loop_mask (loop_vinfo, masks, ncopies * vec_num,
7086 vectype_in);
7088 if (dump_enabled_p ()
7089 && reduction_type == FOLD_LEFT_REDUCTION)
7090 dump_printf_loc (MSG_NOTE, vect_location,
7091 "using an in-order (fold-left) reduction.\n");
7092 STMT_VINFO_TYPE (stmt_info) = reduc_vec_info_type;
7093 return true;
7096 /* Transform. */
7098 if (dump_enabled_p ())
7099 dump_printf_loc (MSG_NOTE, vect_location, "transform reduction.\n");
7101 /* FORNOW: Multiple types are not supported for condition. */
7102 if (code == COND_EXPR)
7103 gcc_assert (ncopies == 1);
7105 bool masked_loop_p = LOOP_VINFO_FULLY_MASKED_P (loop_vinfo);
7107 if (reduction_type == FOLD_LEFT_REDUCTION)
7108 return vectorize_fold_left_reduction
7109 (stmt, gsi, vec_stmt, slp_node, reduc_def_stmt, code,
7110 reduc_fn, ops, vectype_in, reduc_index, masks);
7112 if (reduction_type == EXTRACT_LAST_REDUCTION)
7114 gcc_assert (!slp_node);
7115 return vectorizable_condition (stmt, gsi, vec_stmt,
7116 NULL, reduc_index, NULL, NULL);
7119 /* Create the destination vector.  */
7120 vec_dest = vect_create_destination_var (scalar_dest, vectype_out);
7122 prev_stmt_info = NULL;
7123 prev_phi_info = NULL;
7124 if (!slp_node)
7126 vec_oprnds0.create (1);
7127 vec_oprnds1.create (1);
7128 if (op_type == ternary_op)
7129 vec_oprnds2.create (1);
7132 phis.create (vec_num);
7133 vect_defs.create (vec_num);
7134 if (!slp_node)
7135 vect_defs.quick_push (NULL_TREE);
7137 if (slp_node)
7138 phis.splice (SLP_TREE_VEC_STMTS (slp_node_instance->reduc_phis));
7139 else
7140 phis.quick_push (STMT_VINFO_VEC_STMT (vinfo_for_stmt (reduc_def_stmt)));
7142 for (j = 0; j < ncopies; j++)
7144 if (code == COND_EXPR)
7146 gcc_assert (!slp_node);
7147 vectorizable_condition (stmt, gsi, vec_stmt,
7148 PHI_RESULT (phis[0]),
7149 reduc_index, NULL, NULL);
7150 /* Multiple types are not supported for condition. */
7151 break;
7154 /* Handle uses. */
7155 if (j == 0)
7157 if (slp_node)
7159 /* Get vec defs for all the operands except the reduction index,
7160 ensuring the ordering of the ops in the vector is kept. */
7161 auto_vec<tree, 3> slp_ops;
7162 auto_vec<vec<tree>, 3> vec_defs;
7164 slp_ops.quick_push (ops[0]);
7165 slp_ops.quick_push (ops[1]);
7166 if (op_type == ternary_op)
7167 slp_ops.quick_push (ops[2]);
7169 vect_get_slp_defs (slp_ops, slp_node, &vec_defs);
7171 vec_oprnds0.safe_splice (vec_defs[0]);
7172 vec_defs[0].release ();
7173 vec_oprnds1.safe_splice (vec_defs[1]);
7174 vec_defs[1].release ();
7175 if (op_type == ternary_op)
7177 vec_oprnds2.safe_splice (vec_defs[2]);
7178 vec_defs[2].release ();
7181 else
7183 vec_oprnds0.quick_push
7184 (vect_get_vec_def_for_operand (ops[0], stmt));
7185 vec_oprnds1.quick_push
7186 (vect_get_vec_def_for_operand (ops[1], stmt));
7187 if (op_type == ternary_op)
7188 vec_oprnds2.quick_push
7189 (vect_get_vec_def_for_operand (ops[2], stmt));
7192 else
7194 if (!slp_node)
7196 gcc_assert (reduc_index != -1 || ! single_defuse_cycle);
7198 if (single_defuse_cycle && reduc_index == 0)
7199 vec_oprnds0[0] = gimple_get_lhs (new_stmt);
7200 else
7201 vec_oprnds0[0]
7202 = vect_get_vec_def_for_stmt_copy (dts[0], vec_oprnds0[0]);
7203 if (single_defuse_cycle && reduc_index == 1)
7204 vec_oprnds1[0] = gimple_get_lhs (new_stmt);
7205 else
7206 vec_oprnds1[0]
7207 = vect_get_vec_def_for_stmt_copy (dts[1], vec_oprnds1[0]);
7208 if (op_type == ternary_op)
7210 if (single_defuse_cycle && reduc_index == 2)
7211 vec_oprnds2[0] = gimple_get_lhs (new_stmt);
7212 else
7213 vec_oprnds2[0]
7214 = vect_get_vec_def_for_stmt_copy (dts[2], vec_oprnds2[0]);
7219 FOR_EACH_VEC_ELT (vec_oprnds0, i, def0)
7221 tree vop[3] = { def0, vec_oprnds1[i], NULL_TREE };
7222 if (masked_loop_p)
7224 /* Make sure that the reduction accumulator is vop[0]. */
7225 if (reduc_index == 1)
7227 gcc_assert (commutative_tree_code (code));
7228 std::swap (vop[0], vop[1]);
7230 tree mask = vect_get_loop_mask (gsi, masks, vec_num * ncopies,
7231 vectype_in, i * ncopies + j);
7232 gcall *call = gimple_build_call_internal (cond_fn, 3, mask,
7233 vop[0], vop[1]);
7234 new_temp = make_ssa_name (vec_dest, call);
7235 gimple_call_set_lhs (call, new_temp);
7236 gimple_call_set_nothrow (call, true);
7237 new_stmt = call;
7239 else
7241 if (op_type == ternary_op)
7242 vop[2] = vec_oprnds2[i];
7244 new_temp = make_ssa_name (vec_dest, new_stmt);
7245 new_stmt = gimple_build_assign (new_temp, code,
7246 vop[0], vop[1], vop[2]);
7248 vect_finish_stmt_generation (stmt, new_stmt, gsi);
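/* A rough sketch of the masked statement built above for a PLUS_EXPR
   reduction (SSA names hypothetical):
       vect_sum_new = .COND_ADD (loop_mask, vect_sum_prev, vect_x);
   Inactive lanes carry the accumulator value through unchanged, which is
   why the accumulator had to be moved into vop[0] before building the
   call.  The unmasked path instead emits a plain vector assignment.  */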
7250 if (slp_node)
7252 SLP_TREE_VEC_STMTS (slp_node).quick_push (new_stmt);
7253 vect_defs.quick_push (new_temp);
7255 else
7256 vect_defs[0] = new_temp;
7259 if (slp_node)
7260 continue;
7262 if (j == 0)
7263 STMT_VINFO_VEC_STMT (stmt_info) = *vec_stmt = new_stmt;
7264 else
7265 STMT_VINFO_RELATED_STMT (prev_stmt_info) = new_stmt;
7267 prev_stmt_info = vinfo_for_stmt (new_stmt);
7270 /* Finalize the reduction-phi (set its arguments) and create the
7271 epilog reduction code. */
7272 if ((!single_defuse_cycle || code == COND_EXPR) && !slp_node)
7273 vect_defs[0] = gimple_get_lhs (*vec_stmt);
7275 vect_create_epilog_for_reduction (vect_defs, stmt, reduc_def_stmt,
7276 epilog_copies, reduc_fn, phis,
7277 double_reduc, slp_node, slp_node_instance,
7278 cond_reduc_val, cond_reduc_op_code,
7279 neutral_op);
7281 return true;
7284 /* Function vect_min_worthwhile_factor.
7286 For a loop where we could vectorize the operation indicated by CODE,
7287 return the minimum vectorization factor that makes it worthwhile
7288 to use generic vectors. */
7289 static unsigned int
7290 vect_min_worthwhile_factor (enum tree_code code)
7292 switch (code)
7294 case PLUS_EXPR:
7295 case MINUS_EXPR:
7296 case NEGATE_EXPR:
7297 return 4;
7299 case BIT_AND_EXPR:
7300 case BIT_IOR_EXPR:
7301 case BIT_XOR_EXPR:
7302 case BIT_NOT_EXPR:
7303 return 2;
7305 default:
7306 return INT_MAX;
7310 /* Return true if VINFO indicates we are doing loop vectorization and if
7311 it is worth decomposing CODE operations into scalar operations for
7312 that loop's vectorization factor. */
7314 bool
7315 vect_worthwhile_without_simd_p (vec_info *vinfo, tree_code code)
7317 loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo);
7318 unsigned HOST_WIDE_INT value;
7319 return (loop_vinfo
7320 && LOOP_VINFO_VECT_FACTOR (loop_vinfo).is_constant (&value)
7321 && value >= vect_min_worthwhile_factor (code));
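/* For illustration: with a constant vectorization factor of 4, decomposing
   a PLUS_EXPR into four scalar additions is considered worthwhile
   (4 >= vect_min_worthwhile_factor (PLUS_EXPR) == 4), whereas a factor of
   2 is not; bitwise codes such as BIT_AND_EXPR already pay off at 2.  */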
7324 /* Function vectorizable_induction
7326 Check if PHI performs an induction computation that can be vectorized.
7327 If VEC_STMT is also passed, vectorize the induction PHI: create a vectorized
7328 phi to replace it, put it in VEC_STMT, and add it to the same basic block.
7329 Return FALSE if not a vectorizable STMT, TRUE otherwise. */
7331 bool
7332 vectorizable_induction (gimple *phi,
7333 gimple_stmt_iterator *gsi ATTRIBUTE_UNUSED,
7334 gimple **vec_stmt, slp_tree slp_node,
7335 stmt_vector_for_cost *cost_vec)
7337 stmt_vec_info stmt_info = vinfo_for_stmt (phi);
7338 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
7339 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
7340 unsigned ncopies;
7341 bool nested_in_vect_loop = false;
7342 struct loop *iv_loop;
7343 tree vec_def;
7344 edge pe = loop_preheader_edge (loop);
7345 basic_block new_bb;
7346 tree new_vec, vec_init, vec_step, t;
7347 tree new_name;
7348 gimple *new_stmt;
7349 gphi *induction_phi;
7350 tree induc_def, vec_dest;
7351 tree init_expr, step_expr;
7352 poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
7353 unsigned i;
7354 tree expr;
7355 gimple_seq stmts;
7356 imm_use_iterator imm_iter;
7357 use_operand_p use_p;
7358 gimple *exit_phi;
7359 edge latch_e;
7360 tree loop_arg;
7361 gimple_stmt_iterator si;
7362 basic_block bb = gimple_bb (phi);
7364 if (gimple_code (phi) != GIMPLE_PHI)
7365 return false;
7367 if (!STMT_VINFO_RELEVANT_P (stmt_info))
7368 return false;
7370 /* Make sure it was recognized as induction computation. */
7371 if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_induction_def)
7372 return false;
7374 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
7375 poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (vectype);
7377 if (slp_node)
7378 ncopies = 1;
7379 else
7380 ncopies = vect_get_num_copies (loop_vinfo, vectype);
7381 gcc_assert (ncopies >= 1);
7383 /* FORNOW. These restrictions should be relaxed. */
7384 if (nested_in_vect_loop_p (loop, phi))
7386 imm_use_iterator imm_iter;
7387 use_operand_p use_p;
7388 gimple *exit_phi;
7389 edge latch_e;
7390 tree loop_arg;
7392 if (ncopies > 1)
7394 if (dump_enabled_p ())
7395 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7396 "multiple types in nested loop.\n");
7397 return false;
7400 /* FORNOW: outer loop induction with SLP not supported. */
7401 if (STMT_SLP_TYPE (stmt_info))
7402 return false;
7404 exit_phi = NULL;
7405 latch_e = loop_latch_edge (loop->inner);
7406 loop_arg = PHI_ARG_DEF_FROM_EDGE (phi, latch_e);
7407 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, loop_arg)
7409 gimple *use_stmt = USE_STMT (use_p);
7410 if (is_gimple_debug (use_stmt))
7411 continue;
7413 if (!flow_bb_inside_loop_p (loop->inner, gimple_bb (use_stmt)))
7415 exit_phi = use_stmt;
7416 break;
7419 if (exit_phi)
7421 stmt_vec_info exit_phi_vinfo = vinfo_for_stmt (exit_phi);
7422 if (!(STMT_VINFO_RELEVANT_P (exit_phi_vinfo)
7423 && !STMT_VINFO_LIVE_P (exit_phi_vinfo)))
7425 if (dump_enabled_p ())
7426 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7427 "inner-loop induction only used outside "
7428 "of the outer vectorized loop.\n");
7429 return false;
7433 nested_in_vect_loop = true;
7434 iv_loop = loop->inner;
7436 else
7437 iv_loop = loop;
7438 gcc_assert (iv_loop == (gimple_bb (phi))->loop_father);
7440 if (slp_node && !nunits.is_constant ())
7442 /* The current SLP code creates the initial value element-by-element. */
7443 if (dump_enabled_p ())
7444 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7445 "SLP induction not supported for variable-length"
7446 " vectors.\n");
7447 return false;
7450 if (!vec_stmt) /* transformation not required. */
7452 STMT_VINFO_TYPE (stmt_info) = induc_vec_info_type;
7453 if (dump_enabled_p ())
7454 dump_printf_loc (MSG_NOTE, vect_location,
7455 "=== vectorizable_induction ===\n");
7456 vect_model_induction_cost (stmt_info, ncopies, cost_vec);
7457 return true;
7460 /* Transform. */
7462 /* Compute a vector variable, initialized with the first VF values of
7463 the induction variable. E.g., for an iv with IV_PHI='X' and
7464 evolution S, for a vector of 4 units, we want to compute:
7465 [X, X + S, X + 2*S, X + 3*S]. */
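/* A small worked instance of the comment above (numbers invented): for
   X = 3, S = 2, four elements per vector and a vectorization factor of 4,
   the prolog computes
       vec_init = { 3, 5, 7, 9 }
       vec_step = { 8, 8, 8, 8 }   (VF * S in each lane)
   and every vector iteration adds vec_step to the induction vector.  */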
7467 if (dump_enabled_p ())
7468 dump_printf_loc (MSG_NOTE, vect_location, "transform induction phi.\n");
7470 latch_e = loop_latch_edge (iv_loop);
7471 loop_arg = PHI_ARG_DEF_FROM_EDGE (phi, latch_e);
7473 step_expr = STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_info);
7474 gcc_assert (step_expr != NULL_TREE);
7476 pe = loop_preheader_edge (iv_loop);
7477 init_expr = PHI_ARG_DEF_FROM_EDGE (phi,
7478 loop_preheader_edge (iv_loop));
7480 stmts = NULL;
7481 if (!nested_in_vect_loop)
7483 /* Convert the initial value to the desired type. */
7484 tree new_type = TREE_TYPE (vectype);
7485 init_expr = gimple_convert (&stmts, new_type, init_expr);
7487 /* If we are using the loop mask to "peel" for alignment then we need
7488 to adjust the start value here. */
7489 tree skip_niters = LOOP_VINFO_MASK_SKIP_NITERS (loop_vinfo);
7490 if (skip_niters != NULL_TREE)
7492 if (FLOAT_TYPE_P (vectype))
7493 skip_niters = gimple_build (&stmts, FLOAT_EXPR, new_type,
7494 skip_niters);
7495 else
7496 skip_niters = gimple_convert (&stmts, new_type, skip_niters);
7497 tree skip_step = gimple_build (&stmts, MULT_EXPR, new_type,
7498 skip_niters, step_expr);
7499 init_expr = gimple_build (&stmts, MINUS_EXPR, new_type,
7500 init_expr, skip_step);
7504 /* Convert the step to the desired type. */
7505 step_expr = gimple_convert (&stmts, TREE_TYPE (vectype), step_expr);
7507 if (stmts)
7509 new_bb = gsi_insert_seq_on_edge_immediate (pe, stmts);
7510 gcc_assert (!new_bb);
7513 /* Find the first insertion point in the BB. */
7514 si = gsi_after_labels (bb);
7516 /* For SLP induction we have to generate several IVs as for example
7517 with group size 3 we need [i, i, i, i + S] [i + S, i + S, i + 2*S, i + 2*S]
7518 [i + 2*S, i + 3*S, i + 3*S, i + 3*S]. The step is the same uniform
7519 [VF*S, VF*S, VF*S, VF*S] for all. */
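/* Worked numbers for the comment above (hypothetical sizes): with group
   size 3 and const_nunits 4, nivs = lcm (3, 4) / 4 = 3 initial IV vectors
   are built element by element; with group size 2 and const_nunits 4 only
   nivs = 1 vector is needed, and later copies are derived by adding a
   step of VF' * S with VF' = lcm (2, 4) / 2 = 2, as in the re-use code
   below.  */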
7520 if (slp_node)
7522 /* Enforced above. */
7523 unsigned int const_nunits = nunits.to_constant ();
7525 /* Generate [VF*S, VF*S, ... ]. */
7526 if (SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr)))
7528 expr = build_int_cst (integer_type_node, vf);
7529 expr = fold_convert (TREE_TYPE (step_expr), expr);
7531 else
7532 expr = build_int_cst (TREE_TYPE (step_expr), vf);
7533 new_name = fold_build2 (MULT_EXPR, TREE_TYPE (step_expr),
7534 expr, step_expr);
7535 if (! CONSTANT_CLASS_P (new_name))
7536 new_name = vect_init_vector (phi, new_name,
7537 TREE_TYPE (step_expr), NULL);
7538 new_vec = build_vector_from_val (vectype, new_name);
7539 vec_step = vect_init_vector (phi, new_vec, vectype, NULL);
7541 /* Now generate the IVs. */
7542 unsigned group_size = SLP_TREE_SCALAR_STMTS (slp_node).length ();
7543 unsigned nvects = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node);
7544 unsigned elts = const_nunits * nvects;
7545 unsigned nivs = least_common_multiple (group_size,
7546 const_nunits) / const_nunits;
7547 gcc_assert (elts % group_size == 0);
7548 tree elt = init_expr;
7549 unsigned ivn;
7550 for (ivn = 0; ivn < nivs; ++ivn)
7552 tree_vector_builder elts (vectype, const_nunits, 1);
7553 stmts = NULL;
7554 for (unsigned eltn = 0; eltn < const_nunits; ++eltn)
7556 if (ivn*const_nunits + eltn >= group_size
7557 && (ivn * const_nunits + eltn) % group_size == 0)
7558 elt = gimple_build (&stmts, PLUS_EXPR, TREE_TYPE (elt),
7559 elt, step_expr);
7560 elts.quick_push (elt);
7562 vec_init = gimple_build_vector (&stmts, &elts);
7563 if (stmts)
7565 new_bb = gsi_insert_seq_on_edge_immediate (pe, stmts);
7566 gcc_assert (!new_bb);
7569 /* Create the induction-phi that defines the induction-operand. */
7570 vec_dest = vect_get_new_vect_var (vectype, vect_simple_var, "vec_iv_");
7571 induction_phi = create_phi_node (vec_dest, iv_loop->header);
7572 set_vinfo_for_stmt (induction_phi,
7573 new_stmt_vec_info (induction_phi, loop_vinfo));
7574 induc_def = PHI_RESULT (induction_phi);
7576 /* Create the iv update inside the loop.  */
7577 vec_def = make_ssa_name (vec_dest);
7578 new_stmt = gimple_build_assign (vec_def, PLUS_EXPR, induc_def, vec_step);
7579 gsi_insert_before (&si, new_stmt, GSI_SAME_STMT);
7580 set_vinfo_for_stmt (new_stmt, new_stmt_vec_info (new_stmt, loop_vinfo));
7582 /* Set the arguments of the phi node: */
7583 add_phi_arg (induction_phi, vec_init, pe, UNKNOWN_LOCATION);
7584 add_phi_arg (induction_phi, vec_def, loop_latch_edge (iv_loop),
7585 UNKNOWN_LOCATION);
7587 SLP_TREE_VEC_STMTS (slp_node).quick_push (induction_phi);
7590 /* Re-use IVs when we can. */
7591 if (ivn < nvects)
7593 unsigned vfp
7594 = least_common_multiple (group_size, const_nunits) / group_size;
7595 /* Generate [VF'*S, VF'*S, ... ]. */
7596 if (SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr)))
7598 expr = build_int_cst (integer_type_node, vfp);
7599 expr = fold_convert (TREE_TYPE (step_expr), expr);
7601 else
7602 expr = build_int_cst (TREE_TYPE (step_expr), vfp);
7603 new_name = fold_build2 (MULT_EXPR, TREE_TYPE (step_expr),
7604 expr, step_expr);
7605 if (! CONSTANT_CLASS_P (new_name))
7606 new_name = vect_init_vector (phi, new_name,
7607 TREE_TYPE (step_expr), NULL);
7608 new_vec = build_vector_from_val (vectype, new_name);
7609 vec_step = vect_init_vector (phi, new_vec, vectype, NULL);
7610 for (; ivn < nvects; ++ivn)
7612 gimple *iv = SLP_TREE_VEC_STMTS (slp_node)[ivn - nivs];
7613 tree def;
7614 if (gimple_code (iv) == GIMPLE_PHI)
7615 def = gimple_phi_result (iv);
7616 else
7617 def = gimple_assign_lhs (iv);
7618 new_stmt = gimple_build_assign (make_ssa_name (vectype),
7619 PLUS_EXPR,
7620 def, vec_step);
7621 if (gimple_code (iv) == GIMPLE_PHI)
7622 gsi_insert_before (&si, new_stmt, GSI_SAME_STMT);
7623 else
7625 gimple_stmt_iterator tgsi = gsi_for_stmt (iv);
7626 gsi_insert_after (&tgsi, new_stmt, GSI_CONTINUE_LINKING);
7628 set_vinfo_for_stmt (new_stmt,
7629 new_stmt_vec_info (new_stmt, loop_vinfo));
7630 SLP_TREE_VEC_STMTS (slp_node).quick_push (new_stmt);
7634 return true;
7637 /* Create the vector that holds the initial_value of the induction. */
7638 if (nested_in_vect_loop)
7640 /* iv_loop is nested in the loop to be vectorized. init_expr had already
7641 been created during vectorization of previous stmts. We obtain it
7642 from the STMT_VINFO_VEC_STMT of the defining stmt. */
7643 vec_init = vect_get_vec_def_for_operand (init_expr, phi);
7644 /* If the initial value is not of proper type, convert it. */
7645 if (!useless_type_conversion_p (vectype, TREE_TYPE (vec_init)))
7647 new_stmt
7648 = gimple_build_assign (vect_get_new_ssa_name (vectype,
7649 vect_simple_var,
7650 "vec_iv_"),
7651 VIEW_CONVERT_EXPR,
7652 build1 (VIEW_CONVERT_EXPR, vectype,
7653 vec_init));
7654 vec_init = gimple_assign_lhs (new_stmt);
7655 new_bb = gsi_insert_on_edge_immediate (loop_preheader_edge (iv_loop),
7656 new_stmt);
7657 gcc_assert (!new_bb);
7658 set_vinfo_for_stmt (new_stmt,
7659 new_stmt_vec_info (new_stmt, loop_vinfo));
7662 else
7664 /* iv_loop is the loop to be vectorized. Create:
7665 vec_init = [X, X+S, X+2*S, X+3*S] (S = step_expr, X = init_expr) */
7666 stmts = NULL;
7667 new_name = gimple_convert (&stmts, TREE_TYPE (vectype), init_expr);
7669 unsigned HOST_WIDE_INT const_nunits;
7670 if (nunits.is_constant (&const_nunits))
7672 tree_vector_builder elts (vectype, const_nunits, 1);
7673 elts.quick_push (new_name);
7674 for (i = 1; i < const_nunits; i++)
7676 /* Create: new_name_i = new_name + step_expr */
7677 new_name = gimple_build (&stmts, PLUS_EXPR, TREE_TYPE (new_name),
7678 new_name, step_expr);
7679 elts.quick_push (new_name);
7681 /* Create a vector from [new_name_0, new_name_1, ...,
7682 new_name_nunits-1] */
7683 vec_init = gimple_build_vector (&stmts, &elts);
7685 else if (INTEGRAL_TYPE_P (TREE_TYPE (step_expr)))
7686 /* Build the initial value directly from a VEC_SERIES_EXPR. */
7687 vec_init = gimple_build (&stmts, VEC_SERIES_EXPR, vectype,
7688 new_name, step_expr);
7689 else
7691 /* Build:
7692 [base, base, base, ...]
7693 + (vectype) [0, 1, 2, ...] * [step, step, step, ...]. */
7694 gcc_assert (SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr)));
7695 gcc_assert (flag_associative_math);
7696 tree index = build_index_vector (vectype, 0, 1);
7697 tree base_vec = gimple_build_vector_from_val (&stmts, vectype,
7698 new_name);
7699 tree step_vec = gimple_build_vector_from_val (&stmts, vectype,
7700 step_expr);
7701 vec_init = gimple_build (&stmts, FLOAT_EXPR, vectype, index);
7702 vec_init = gimple_build (&stmts, MULT_EXPR, vectype,
7703 vec_init, step_vec);
7704 vec_init = gimple_build (&stmts, PLUS_EXPR, vectype,
7705 vec_init, base_vec);
7708 if (stmts)
7710 new_bb = gsi_insert_seq_on_edge_immediate (pe, stmts);
7711 gcc_assert (!new_bb);
7716 /* Create the vector that holds the step of the induction. */
7717 if (nested_in_vect_loop)
7718 /* iv_loop is nested in the loop to be vectorized. Generate:
7719 vec_step = [S, S, S, S] */
7720 new_name = step_expr;
7721 else
7723 /* iv_loop is the loop to be vectorized. Generate:
7724 vec_step = [VF*S, VF*S, VF*S, VF*S] */
7725 gimple_seq seq = NULL;
7726 if (SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr)))
7728 expr = build_int_cst (integer_type_node, vf);
7729 expr = gimple_build (&seq, FLOAT_EXPR, TREE_TYPE (step_expr), expr);
7731 else
7732 expr = build_int_cst (TREE_TYPE (step_expr), vf);
7733 new_name = gimple_build (&seq, MULT_EXPR, TREE_TYPE (step_expr),
7734 expr, step_expr);
7735 if (seq)
7737 new_bb = gsi_insert_seq_on_edge_immediate (pe, seq);
7738 gcc_assert (!new_bb);
7742 t = unshare_expr (new_name);
7743 gcc_assert (CONSTANT_CLASS_P (new_name)
7744 || TREE_CODE (new_name) == SSA_NAME);
7745 new_vec = build_vector_from_val (vectype, t);
7746 vec_step = vect_init_vector (phi, new_vec, vectype, NULL);
7749 /* Create the following def-use cycle:
7750 loop prolog:
7751 vec_init = ...
7752 vec_step = ...
7753 loop:
7754 vec_iv = PHI <vec_init, vec_loop>
7756 STMT
7758 vec_loop = vec_iv + vec_step; */
7760 /* Create the induction-phi that defines the induction-operand. */
7761 vec_dest = vect_get_new_vect_var (vectype, vect_simple_var, "vec_iv_");
7762 induction_phi = create_phi_node (vec_dest, iv_loop->header);
7763 set_vinfo_for_stmt (induction_phi,
7764 new_stmt_vec_info (induction_phi, loop_vinfo));
7765 induc_def = PHI_RESULT (induction_phi);
7767 /* Create the iv update inside the loop.  */
7768 vec_def = make_ssa_name (vec_dest);
7769 new_stmt = gimple_build_assign (vec_def, PLUS_EXPR, induc_def, vec_step);
7770 gsi_insert_before (&si, new_stmt, GSI_SAME_STMT);
7771 set_vinfo_for_stmt (new_stmt, new_stmt_vec_info (new_stmt, loop_vinfo));
7773 /* Set the arguments of the phi node: */
7774 add_phi_arg (induction_phi, vec_init, pe, UNKNOWN_LOCATION);
7775 add_phi_arg (induction_phi, vec_def, loop_latch_edge (iv_loop),
7776 UNKNOWN_LOCATION);
7778 STMT_VINFO_VEC_STMT (stmt_info) = *vec_stmt = induction_phi;
7780 /* In case the vectorization factor (VF) is bigger than the number
7781 of elements that we can fit in a vectype (nunits), we have to generate
7782 more than one vector stmt - i.e. we need to "unroll" the
7783 vector stmt by a factor VF/nunits. For more details see documentation
7784 in vectorizable_operation. */
7786 if (ncopies > 1)
7788 gimple_seq seq = NULL;
7789 stmt_vec_info prev_stmt_vinfo;
7790 /* FORNOW. This restriction should be relaxed. */
7791 gcc_assert (!nested_in_vect_loop);
7793 /* Create the vector that holds the step of the induction. */
7794 if (SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr)))
7796 expr = build_int_cst (integer_type_node, nunits);
7797 expr = gimple_build (&seq, FLOAT_EXPR, TREE_TYPE (step_expr), expr);
7799 else
7800 expr = build_int_cst (TREE_TYPE (step_expr), nunits);
7801 new_name = gimple_build (&seq, MULT_EXPR, TREE_TYPE (step_expr),
7802 expr, step_expr);
7803 if (seq)
7805 new_bb = gsi_insert_seq_on_edge_immediate (pe, seq);
7806 gcc_assert (!new_bb);
7809 t = unshare_expr (new_name);
7810 gcc_assert (CONSTANT_CLASS_P (new_name)
7811 || TREE_CODE (new_name) == SSA_NAME);
7812 new_vec = build_vector_from_val (vectype, t);
7813 vec_step = vect_init_vector (phi, new_vec, vectype, NULL);
7815 vec_def = induc_def;
7816 prev_stmt_vinfo = vinfo_for_stmt (induction_phi);
7817 for (i = 1; i < ncopies; i++)
7819 /* vec_i = vec_prev + vec_step */
7820 new_stmt = gimple_build_assign (vec_dest, PLUS_EXPR,
7821 vec_def, vec_step);
7822 vec_def = make_ssa_name (vec_dest, new_stmt);
7823 gimple_assign_set_lhs (new_stmt, vec_def);
7825 gsi_insert_before (&si, new_stmt, GSI_SAME_STMT);
7826 set_vinfo_for_stmt (new_stmt,
7827 new_stmt_vec_info (new_stmt, loop_vinfo));
7828 STMT_VINFO_RELATED_STMT (prev_stmt_vinfo) = new_stmt;
7829 prev_stmt_vinfo = vinfo_for_stmt (new_stmt);
7833 if (nested_in_vect_loop)
7835 /* Find the loop-closed exit-phi of the induction, and record
7836 the final vector of induction results: */
7837 exit_phi = NULL;
7838 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, loop_arg)
7840 gimple *use_stmt = USE_STMT (use_p);
7841 if (is_gimple_debug (use_stmt))
7842 continue;
7844 if (!flow_bb_inside_loop_p (iv_loop, gimple_bb (use_stmt)))
7846 exit_phi = use_stmt;
7847 break;
7850 if (exit_phi)
7852 stmt_vec_info stmt_vinfo = vinfo_for_stmt (exit_phi);
7853 /* FORNOW.  We do not yet support the case in which an inner-loop induction
7854 is used only outside the outer loop (i.e. not in the outer loop at all). */
7855 gcc_assert (STMT_VINFO_RELEVANT_P (stmt_vinfo)
7856 && !STMT_VINFO_LIVE_P (stmt_vinfo));
7858 STMT_VINFO_VEC_STMT (stmt_vinfo) = new_stmt;
7859 if (dump_enabled_p ())
7861 dump_printf_loc (MSG_NOTE, vect_location,
7862 "vector of inductions after inner-loop:");
7863 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, new_stmt, 0);
7869 if (dump_enabled_p ())
7871 dump_printf_loc (MSG_NOTE, vect_location,
7872 "transform induction: created def-use cycle: ");
7873 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, induction_phi, 0);
7874 dump_gimple_stmt (MSG_NOTE, TDF_SLIM,
7875 SSA_NAME_DEF_STMT (vec_def), 0);
7878 return true;
7881 /* Function vectorizable_live_operation.
7883 STMT computes a value that is used outside the loop. Check if
7884 it can be supported. */
7886 bool
7887 vectorizable_live_operation (gimple *stmt,
7888 gimple_stmt_iterator *gsi ATTRIBUTE_UNUSED,
7889 slp_tree slp_node, int slp_index,
7890 gimple **vec_stmt,
7891 stmt_vector_for_cost *)
7893 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
7894 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
7895 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
7896 imm_use_iterator imm_iter;
7897 tree lhs, lhs_type, bitsize, vec_bitsize;
7898 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
7899 poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (vectype);
7900 int ncopies;
7901 gimple *use_stmt;
7902 auto_vec<tree> vec_oprnds;
7903 int vec_entry = 0;
7904 poly_uint64 vec_index = 0;
7906 gcc_assert (STMT_VINFO_LIVE_P (stmt_info));
7908 if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_reduction_def)
7909 return false;
7911 /* FORNOW. CHECKME. */
7912 if (nested_in_vect_loop_p (loop, stmt))
7913 return false;
7915 /* If STMT is not relevant and it is a simple assignment and its inputs are
7916 invariant then it can remain in place, unvectorized. The original last
7917 scalar value that it computes will be used. */
7918 if (!STMT_VINFO_RELEVANT_P (stmt_info))
7920 gcc_assert (is_simple_and_all_uses_invariant (stmt, loop_vinfo));
7921 if (dump_enabled_p ())
7922 dump_printf_loc (MSG_NOTE, vect_location,
7923 "statement is simple and uses invariant. Leaving in "
7924 "place.\n");
7925 return true;
7928 if (slp_node)
7929 ncopies = 1;
7930 else
7931 ncopies = vect_get_num_copies (loop_vinfo, vectype);
7933 if (slp_node)
7935 gcc_assert (slp_index >= 0);
7937 int num_scalar = SLP_TREE_SCALAR_STMTS (slp_node).length ();
7938 int num_vec = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node);
7940 /* Get the last occurrence of the scalar index from the concatenation of
7941 all the slp vectors. Calculate which slp vector it is and the index
7942 within. */
7943 poly_uint64 pos = (num_vec * nunits) - num_scalar + slp_index;
7945 /* Calculate which vector contains the result, and which lane of
7946 that vector we need. */
7947 if (!can_div_trunc_p (pos, nunits, &vec_entry, &vec_index))
7949 if (dump_enabled_p ())
7950 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7951 "Cannot determine which vector holds the"
7952 " final result.\n");
7953 return false;
7957 if (!vec_stmt)
7959 /* No transformation required. */
7960 if (LOOP_VINFO_CAN_FULLY_MASK_P (loop_vinfo))
7962 if (!direct_internal_fn_supported_p (IFN_EXTRACT_LAST, vectype,
7963 OPTIMIZE_FOR_SPEED))
7965 if (dump_enabled_p ())
7966 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7967 "can't use a fully-masked loop because "
7968 "the target doesn't support extract last "
7969 "reduction.\n");
7970 LOOP_VINFO_CAN_FULLY_MASK_P (loop_vinfo) = false;
7972 else if (slp_node)
7974 if (dump_enabled_p ())
7975 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7976 "can't use a fully-masked loop because an "
7977 "SLP statement is live after the loop.\n");
7978 LOOP_VINFO_CAN_FULLY_MASK_P (loop_vinfo) = false;
7980 else if (ncopies > 1)
7982 if (dump_enabled_p ())
7983 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7984 "can't use a fully-masked loop because"
7985 " ncopies is greater than 1.\n");
7986 LOOP_VINFO_CAN_FULLY_MASK_P (loop_vinfo) = false;
7988 else
7990 gcc_assert (ncopies == 1 && !slp_node);
7991 vect_record_loop_mask (loop_vinfo,
7992 &LOOP_VINFO_MASKS (loop_vinfo),
7993 1, vectype);
7996 return true;
7999 /* If stmt has a related stmt, then use that for getting the lhs. */
8000 if (is_pattern_stmt_p (stmt_info))
8001 stmt = STMT_VINFO_RELATED_STMT (stmt_info);
8003 lhs = (is_a <gphi *> (stmt)) ? gimple_phi_result (stmt)
8004 : gimple_get_lhs (stmt);
8005 lhs_type = TREE_TYPE (lhs);
8007 bitsize = (VECTOR_BOOLEAN_TYPE_P (vectype)
8008 ? bitsize_int (TYPE_PRECISION (TREE_TYPE (vectype)))
8009 : TYPE_SIZE (TREE_TYPE (vectype)));
8010 vec_bitsize = TYPE_SIZE (vectype);
8012 /* Get the vectorized lhs of STMT and the lane to use (counted in bits). */
8013 tree vec_lhs, bitstart;
8014 if (slp_node)
8016 gcc_assert (!LOOP_VINFO_FULLY_MASKED_P (loop_vinfo));
8018 /* Get the correct slp vectorized stmt. */
8019 gimple *vec_stmt = SLP_TREE_VEC_STMTS (slp_node)[vec_entry];
8020 if (gphi *phi = dyn_cast <gphi *> (vec_stmt))
8021 vec_lhs = gimple_phi_result (phi);
8022 else
8023 vec_lhs = gimple_get_lhs (vec_stmt);
8025 /* Get entry to use. */
8026 bitstart = bitsize_int (vec_index);
8027 bitstart = int_const_binop (MULT_EXPR, bitsize, bitstart);
8029 else
8031 enum vect_def_type dt = STMT_VINFO_DEF_TYPE (stmt_info);
8032 vec_lhs = vect_get_vec_def_for_operand_1 (stmt, dt);
8033 gcc_checking_assert (ncopies == 1
8034 || !LOOP_VINFO_FULLY_MASKED_P (loop_vinfo));
8036 /* For multiple copies, get the last copy. */
8037 for (int i = 1; i < ncopies; ++i)
8038 vec_lhs = vect_get_vec_def_for_stmt_copy (vect_unknown_def_type,
8039 vec_lhs);
8041 /* Get the last lane in the vector. */
8042 bitstart = int_const_binop (MINUS_EXPR, vec_bitsize, bitsize);
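/* For illustration, assuming a V4SI vector type: vec_bitsize is 128 and
   bitsize is 32, so bitstart becomes 96 and the BIT_FIELD_REF built below
   extracts lane 3, the live value produced by the last vector copy.  */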
8045 gimple_seq stmts = NULL;
8046 tree new_tree;
8047 if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
8049 /* Emit:
8051 SCALAR_RES = EXTRACT_LAST <VEC_LHS, MASK>
8053 where VEC_LHS is the vectorized live-out result and MASK is
8054 the loop mask for the final iteration. */
8055 gcc_assert (ncopies == 1 && !slp_node);
8056 tree scalar_type = TREE_TYPE (STMT_VINFO_VECTYPE (stmt_info));
8057 tree scalar_res = make_ssa_name (scalar_type);
8058 tree mask = vect_get_loop_mask (gsi, &LOOP_VINFO_MASKS (loop_vinfo),
8059 1, vectype, 0);
8060 gcall *new_stmt = gimple_build_call_internal (IFN_EXTRACT_LAST,
8061 2, mask, vec_lhs);
8062 gimple_call_set_lhs (new_stmt, scalar_res);
8063 gimple_seq_add_stmt (&stmts, new_stmt);
8065 /* Convert the extracted vector element to the required scalar type. */
8066 new_tree = gimple_convert (&stmts, lhs_type, scalar_res);
8068 else
8070 tree bftype = TREE_TYPE (vectype);
8071 if (VECTOR_BOOLEAN_TYPE_P (vectype))
8072 bftype = build_nonstandard_integer_type (tree_to_uhwi (bitsize), 1);
8073 new_tree = build3 (BIT_FIELD_REF, bftype, vec_lhs, bitsize, bitstart);
8074 new_tree = force_gimple_operand (fold_convert (lhs_type, new_tree),
8075 &stmts, true, NULL_TREE);
8078 if (stmts)
8079 gsi_insert_seq_on_edge_immediate (single_exit (loop), stmts);
8081 /* Replace uses of lhs with the newly computed result.  If the use stmt is
8082 a single-arg PHI, just replace all uses of the PHI result.  This is
8083 necessary because the lcssa PHI defining lhs may precede the new stmt. */
8084 use_operand_p use_p;
8085 FOR_EACH_IMM_USE_STMT (use_stmt, imm_iter, lhs)
8086 if (!flow_bb_inside_loop_p (loop, gimple_bb (use_stmt))
8087 && !is_gimple_debug (use_stmt))
8089 if (gimple_code (use_stmt) == GIMPLE_PHI
8090 && gimple_phi_num_args (use_stmt) == 1)
8092 replace_uses_by (gimple_phi_result (use_stmt), new_tree);
8094 else
8096 FOR_EACH_IMM_USE_ON_STMT (use_p, imm_iter)
8097 SET_USE (use_p, new_tree);
8099 update_stmt (use_stmt);
8102 return true;
8105 /* Kill any debug uses outside LOOP of SSA names defined in STMT. */
8107 static void
8108 vect_loop_kill_debug_uses (struct loop *loop, gimple *stmt)
8110 ssa_op_iter op_iter;
8111 imm_use_iterator imm_iter;
8112 def_operand_p def_p;
8113 gimple *ustmt;
8115 FOR_EACH_PHI_OR_STMT_DEF (def_p, stmt, op_iter, SSA_OP_DEF)
8117 FOR_EACH_IMM_USE_STMT (ustmt, imm_iter, DEF_FROM_PTR (def_p))
8119 basic_block bb;
8121 if (!is_gimple_debug (ustmt))
8122 continue;
8124 bb = gimple_bb (ustmt);
8126 if (!flow_bb_inside_loop_p (loop, bb))
8128 if (gimple_debug_bind_p (ustmt))
8130 if (dump_enabled_p ())
8131 dump_printf_loc (MSG_NOTE, vect_location,
8132 "killing debug use\n");
8134 gimple_debug_bind_reset_value (ustmt);
8135 update_stmt (ustmt);
8137 else
8138 gcc_unreachable ();
8144 /* Given loop represented by LOOP_VINFO, return true if computation of
8145 LOOP_VINFO_NITERS (= LOOP_VINFO_NITERSM1 + 1) doesn't overflow, false
8146 otherwise. */
8148 static bool
8149 loop_niters_no_overflow (loop_vec_info loop_vinfo)
8151 /* Constant case. */
8152 if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
8154 tree cst_niters = LOOP_VINFO_NITERS (loop_vinfo);
8155 tree cst_nitersm1 = LOOP_VINFO_NITERSM1 (loop_vinfo);
8157 gcc_assert (TREE_CODE (cst_niters) == INTEGER_CST);
8158 gcc_assert (TREE_CODE (cst_nitersm1) == INTEGER_CST);
8159 if (wi::to_widest (cst_nitersm1) < wi::to_widest (cst_niters))
8160 return true;
8163 widest_int max;
8164 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
8165 /* Check the upper bound of loop niters. */
8166 if (get_max_loop_iterations (loop, &max))
8168 tree type = TREE_TYPE (LOOP_VINFO_NITERS (loop_vinfo));
8169 signop sgn = TYPE_SIGN (type);
8170 widest_int type_max = widest_int::from (wi::max_value (type), sgn);
8171 if (max < type_max)
8172 return true;
8174 return false;
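/* For illustration of the check above: if the niters type is a 32-bit
   unsigned type, the maximum latch count must be strictly below
   0xffffffff, since NITERS = NITERSM1 + 1 would otherwise wrap to zero.  */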
8177 /* Return a mask type with half the number of elements as TYPE. */
8179 tree
8180 vect_halve_mask_nunits (tree type)
8182 poly_uint64 nunits = exact_div (TYPE_VECTOR_SUBPARTS (type), 2);
8183 return build_truth_vector_type (nunits, current_vector_size);
8186 /* Return a mask type with twice as many elements as TYPE. */
8188 tree
8189 vect_double_mask_nunits (tree type)
8191 poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (type) * 2;
8192 return build_truth_vector_type (nunits, current_vector_size);
8195 /* Record that a fully-masked version of LOOP_VINFO would need MASKS to
8196 contain a sequence of NVECTORS masks that each control a vector of type
8197 VECTYPE. */
8199 void
8200 vect_record_loop_mask (loop_vec_info loop_vinfo, vec_loop_masks *masks,
8201 unsigned int nvectors, tree vectype)
8203 gcc_assert (nvectors != 0);
8204 if (masks->length () < nvectors)
8205 masks->safe_grow_cleared (nvectors);
8206 rgroup_masks *rgm = &(*masks)[nvectors - 1];
8207 /* The number of scalars per iteration and the number of vectors are
8208 both compile-time constants. */
8209 unsigned int nscalars_per_iter
8210 = exact_div (nvectors * TYPE_VECTOR_SUBPARTS (vectype),
8211 LOOP_VINFO_VECT_FACTOR (loop_vinfo)).to_constant ();
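/* For illustration (hypothetical types): with a vectorization factor of 8,
   recording nvectors = 2 masks for V8HI gives
       nscalars_per_iter = 2 * 8 / 8 = 2,
   so the rgroup at index nvectors - 1 tracks two scalars per iteration
   and uses a V8HI-sized boolean mask type.  */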
8212 if (rgm->max_nscalars_per_iter < nscalars_per_iter)
8214 rgm->max_nscalars_per_iter = nscalars_per_iter;
8215 rgm->mask_type = build_same_sized_truth_vector_type (vectype);
8219 /* Given a complete set of masks MASKS, extract mask number INDEX
8220 for an rgroup that operates on NVECTORS vectors of type VECTYPE,
8221 where 0 <= INDEX < NVECTORS. Insert any set-up statements before GSI.
8223 See the comment above vec_loop_masks for more details about the mask
8224 arrangement. */
8226 tree
8227 vect_get_loop_mask (gimple_stmt_iterator *gsi, vec_loop_masks *masks,
8228 unsigned int nvectors, tree vectype, unsigned int index)
8230 rgroup_masks *rgm = &(*masks)[nvectors - 1];
8231 tree mask_type = rgm->mask_type;
8233 /* Populate the rgroup's mask array, if this is the first time we've
8234 used it. */
8235 if (rgm->masks.is_empty ())
8237 rgm->masks.safe_grow_cleared (nvectors);
8238 for (unsigned int i = 0; i < nvectors; ++i)
8240 tree mask = make_temp_ssa_name (mask_type, NULL, "loop_mask");
8241 /* Provide a dummy definition until the real one is available. */
8242 SSA_NAME_DEF_STMT (mask) = gimple_build_nop ();
8243 rgm->masks[i] = mask;
8247 tree mask = rgm->masks[index];
8248 if (maybe_ne (TYPE_VECTOR_SUBPARTS (mask_type),
8249 TYPE_VECTOR_SUBPARTS (vectype)))
8251 /* A loop mask for data type X can be reused for data type Y
8252 if X has N times more elements than Y and if Y's elements
8253 are N times bigger than X's. In this case each sequence
8254 of N elements in the loop mask will be all-zero or all-one.
8255 We can then view-convert the mask so that each sequence of
8256 N elements is replaced by a single element. */
8257 gcc_assert (multiple_p (TYPE_VECTOR_SUBPARTS (mask_type),
8258 TYPE_VECTOR_SUBPARTS (vectype)));
8259 gimple_seq seq = NULL;
8260 mask_type = build_same_sized_truth_vector_type (vectype);
8261 mask = gimple_build (&seq, VIEW_CONVERT_EXPR, mask_type, mask);
8262 if (seq)
8263 gsi_insert_seq_before (gsi, seq, GSI_SAME_STMT);
8265 return mask;
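/* For illustration of the reuse above: a mask recorded for sixteen QImode
   elements can be reused for a vector of eight HImode elements; each pair
   of QI mask elements is known to be all-zero or all-one, so the
   VIEW_CONVERT_EXPR folds every pair into one HI-sized mask element.  */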
8268 /* Scale profiling counters by estimation for LOOP which is vectorized
8269 by factor VF. */
8271 static void
8272 scale_profile_for_vect_loop (struct loop *loop, unsigned vf)
8274 edge preheader = loop_preheader_edge (loop);
8275 /* Reduce loop iterations by the vectorization factor. */
8276 gcov_type new_est_niter = niter_for_unrolled_loop (loop, vf);
8277 profile_count freq_h = loop->header->count, freq_e = preheader->count ();
8279 if (freq_h.nonzero_p ())
8281 profile_probability p;
8283 /* Avoid dropping loop body profile counter to 0 because of zero count
8284 in loop's preheader. */
8285 if (!(freq_e == profile_count::zero ()))
8286 freq_e = freq_e.force_nonzero ();
8287 p = freq_e.apply_scale (new_est_niter + 1, 1).probability_in (freq_h);
8288 scale_loop_frequencies (loop, p);
8291 edge exit_e = single_exit (loop);
8292 exit_e->probability = profile_probability::always ()
8293 .apply_scale (1, new_est_niter + 1);
8295 edge exit_l = single_pred_edge (loop->latch);
8296 profile_probability prob = exit_l->probability;
8297 exit_l->probability = exit_e->probability.invert ();
8298 if (prob.initialized_p () && exit_l->probability.initialized_p ())
8299 scale_bbs_frequencies (&loop->latch, 1, exit_l->probability / prob);
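/* A numeric sketch of the scaling above (values invented): if
   niter_for_unrolled_loop returns 24, the exit edge gets probability
   1 / (24 + 1) = 4% and the body frequencies are scaled so that the
   header count is roughly 25 times the preheader count.  */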
8302 /* Function vect_transform_loop.
8304 The analysis phase has determined that the loop is vectorizable.
8305 Vectorize the loop: create vectorized stmts to replace the scalar
8306 stmts in the loop, and update the loop exit condition.
8307 Returns scalar epilogue loop if any. */
8309 struct loop *
8310 vect_transform_loop (loop_vec_info loop_vinfo)
8312 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
8313 struct loop *epilogue = NULL;
8314 basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
8315 int nbbs = loop->num_nodes;
8316 int i;
8317 tree niters_vector = NULL_TREE;
8318 tree step_vector = NULL_TREE;
8319 tree niters_vector_mult_vf = NULL_TREE;
8320 poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
8321 unsigned int lowest_vf = constant_lower_bound (vf);
8322 bool grouped_store;
8323 bool slp_scheduled = false;
8324 gimple *stmt, *pattern_stmt;
8325 gimple_seq pattern_def_seq = NULL;
8326 gimple_stmt_iterator pattern_def_si = gsi_none ();
8327 bool transform_pattern_stmt = false;
8328 bool check_profitability = false;
8329 unsigned int th;
8331 if (dump_enabled_p ())
8332 dump_printf_loc (MSG_NOTE, vect_location, "=== vec_transform_loop ===\n");
8334 /* Use the more conservative vectorization threshold.  If the number
8335 of iterations is constant, assume the cost check has been performed
8336 by our caller.  If the threshold makes all loops profitable that
8337 run at least the (estimated) vectorization factor number of times,
8338 checking is pointless, too. */
8339 th = LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo);
8340 if (th >= vect_vf_for_cost (loop_vinfo)
8341 && !LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
8343 if (dump_enabled_p ())
8344 dump_printf_loc (MSG_NOTE, vect_location,
8345 "Profitability threshold is %d loop iterations.\n",
8346 th);
8347 check_profitability = true;
8350 /* Make sure there exists a single-predecessor exit bb. Do this before
8351 versioning. */
8352 edge e = single_exit (loop);
8353 if (! single_pred_p (e->dest))
8355 split_loop_exit_edge (e);
8356 if (dump_enabled_p ())
8357 dump_printf (MSG_NOTE, "split exit edge\n");
8360 /* Version the loop first, if required, so the profitability check
8361 comes first. */
8363 if (LOOP_REQUIRES_VERSIONING (loop_vinfo))
8365 poly_uint64 versioning_threshold
8366 = LOOP_VINFO_VERSIONING_THRESHOLD (loop_vinfo);
8367 if (check_profitability
8368 && ordered_p (poly_uint64 (th), versioning_threshold))
8370 versioning_threshold = ordered_max (poly_uint64 (th),
8371 versioning_threshold);
8372 check_profitability = false;
8374 vect_loop_versioning (loop_vinfo, th, check_profitability,
8375 versioning_threshold);
8376 check_profitability = false;
8379 /* Make sure there exists a single-predecessor exit bb also on the
8380 scalar loop copy.  Do this after versioning but before peeling
8381 so the CFG structure is fine for both the scalar and the if-converted
8382 loop, and so slpeel_duplicate_current_defs_from_edges sees matched
8383 loop-closed PHI nodes on the exit. */
8384 if (LOOP_VINFO_SCALAR_LOOP (loop_vinfo))
8386 e = single_exit (LOOP_VINFO_SCALAR_LOOP (loop_vinfo));
8387 if (! single_pred_p (e->dest))
8389 split_loop_exit_edge (e);
8390 if (dump_enabled_p ())
8391 dump_printf (MSG_NOTE, "split exit edge of scalar loop\n");
8395 tree niters = vect_build_loop_niters (loop_vinfo);
8396 LOOP_VINFO_NITERS_UNCHANGED (loop_vinfo) = niters;
8397 tree nitersm1 = unshare_expr (LOOP_VINFO_NITERSM1 (loop_vinfo));
8398 bool niters_no_overflow = loop_niters_no_overflow (loop_vinfo);
8399 epilogue = vect_do_peeling (loop_vinfo, niters, nitersm1, &niters_vector,
8400 &step_vector, &niters_vector_mult_vf, th,
8401 check_profitability, niters_no_overflow);
8403 if (niters_vector == NULL_TREE)
8405 if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
8406 && !LOOP_VINFO_FULLY_MASKED_P (loop_vinfo)
8407 && known_eq (lowest_vf, vf))
8409 niters_vector
8410 = build_int_cst (TREE_TYPE (LOOP_VINFO_NITERS (loop_vinfo)),
8411 LOOP_VINFO_INT_NITERS (loop_vinfo) / lowest_vf);
8412 step_vector = build_one_cst (TREE_TYPE (niters));
8414 else
8415 vect_gen_vector_loop_niters (loop_vinfo, niters, &niters_vector,
8416 &step_vector, niters_no_overflow);
8419 /* 1) Make sure the loop header has exactly two entries
8420 2) Make sure we have a preheader basic block. */
8422 gcc_assert (EDGE_COUNT (loop->header->preds) == 2);
8424 split_edge (loop_preheader_edge (loop));
8426 if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo)
8427 && vect_use_loop_mask_for_alignment_p (loop_vinfo))
8428 /* This will deal with any possible peeling. */
8429 vect_prepare_for_masked_peels (loop_vinfo);
8431 /* FORNOW: the vectorizer supports only loops whose body consists
8432 of one basic block (header + empty latch).  When the vectorizer
8433 supports more involved loop forms, the order in which the BBs are
8434 traversed will need to be reconsidered. */
8436 for (i = 0; i < nbbs; i++)
8438 basic_block bb = bbs[i];
8439 stmt_vec_info stmt_info;
8441 for (gphi_iterator si = gsi_start_phis (bb); !gsi_end_p (si);
8442 gsi_next (&si))
8444 gphi *phi = si.phi ();
8445 if (dump_enabled_p ())
8447 dump_printf_loc (MSG_NOTE, vect_location,
8448 "------>vectorizing phi: ");
8449 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, phi, 0);
8451 stmt_info = vinfo_for_stmt (phi);
8452 if (!stmt_info)
8453 continue;
8455 if (MAY_HAVE_DEBUG_BIND_STMTS && !STMT_VINFO_LIVE_P (stmt_info))
8456 vect_loop_kill_debug_uses (loop, phi);
8458 if (!STMT_VINFO_RELEVANT_P (stmt_info)
8459 && !STMT_VINFO_LIVE_P (stmt_info))
8460 continue;
8462 if (STMT_VINFO_VECTYPE (stmt_info)
8463 && (maybe_ne
8464 (TYPE_VECTOR_SUBPARTS (STMT_VINFO_VECTYPE (stmt_info)), vf))
8465 && dump_enabled_p ())
8466 dump_printf_loc (MSG_NOTE, vect_location, "multiple-types.\n");
8468 if ((STMT_VINFO_DEF_TYPE (stmt_info) == vect_induction_def
8469 || STMT_VINFO_DEF_TYPE (stmt_info) == vect_reduction_def
8470 || STMT_VINFO_DEF_TYPE (stmt_info) == vect_nested_cycle)
8471 && ! PURE_SLP_STMT (stmt_info))
8473 if (dump_enabled_p ())
8474 dump_printf_loc (MSG_NOTE, vect_location, "transform phi.\n");
8475 vect_transform_stmt (phi, NULL, NULL, NULL, NULL);
8479 pattern_stmt = NULL;
8480 for (gimple_stmt_iterator si = gsi_start_bb (bb);
8481 !gsi_end_p (si) || transform_pattern_stmt;)
8483 bool is_store;
8485 if (transform_pattern_stmt)
8486 stmt = pattern_stmt;
8487 else
8489 stmt = gsi_stmt (si);
8490 /* During vectorization remove existing clobber stmts. */
8491 if (gimple_clobber_p (stmt))
8493 unlink_stmt_vdef (stmt);
8494 gsi_remove (&si, true);
8495 release_defs (stmt);
8496 continue;
8500 if (dump_enabled_p ())
8502 dump_printf_loc (MSG_NOTE, vect_location,
8503 "------>vectorizing statement: ");
8504 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, stmt, 0);
8507 stmt_info = vinfo_for_stmt (stmt);
8509 /* vector stmts created in the outer-loop during vectorization of
8510 stmts in an inner-loop may not have a stmt_info, and do not
8511 need to be vectorized. */
8512 if (!stmt_info)
8514 gsi_next (&si);
8515 continue;
8518 if (MAY_HAVE_DEBUG_BIND_STMTS && !STMT_VINFO_LIVE_P (stmt_info))
8519 vect_loop_kill_debug_uses (loop, stmt);
8521 if (!STMT_VINFO_RELEVANT_P (stmt_info)
8522 && !STMT_VINFO_LIVE_P (stmt_info))
8524 if (STMT_VINFO_IN_PATTERN_P (stmt_info)
8525 && (pattern_stmt = STMT_VINFO_RELATED_STMT (stmt_info))
8526 && (STMT_VINFO_RELEVANT_P (vinfo_for_stmt (pattern_stmt))
8527 || STMT_VINFO_LIVE_P (vinfo_for_stmt (pattern_stmt))))
8529 stmt = pattern_stmt;
8530 stmt_info = vinfo_for_stmt (stmt);
8532 else
8534 gsi_next (&si);
8535 continue;
8538 else if (STMT_VINFO_IN_PATTERN_P (stmt_info)
8539 && (pattern_stmt = STMT_VINFO_RELATED_STMT (stmt_info))
8540 && (STMT_VINFO_RELEVANT_P (vinfo_for_stmt (pattern_stmt))
8541 || STMT_VINFO_LIVE_P (vinfo_for_stmt (pattern_stmt))))
8542 transform_pattern_stmt = true;
8544 /* If pattern statement has def stmts, vectorize them too. */
8545 if (is_pattern_stmt_p (stmt_info))
8547 if (pattern_def_seq == NULL)
8549 pattern_def_seq = STMT_VINFO_PATTERN_DEF_SEQ (stmt_info);
8550 pattern_def_si = gsi_start (pattern_def_seq);
8552 else if (!gsi_end_p (pattern_def_si))
8553 gsi_next (&pattern_def_si);
8554 if (pattern_def_seq != NULL)
8556 gimple *pattern_def_stmt = NULL;
8557 stmt_vec_info pattern_def_stmt_info = NULL;
8559 while (!gsi_end_p (pattern_def_si))
8561 pattern_def_stmt = gsi_stmt (pattern_def_si);
8562 pattern_def_stmt_info
8563 = vinfo_for_stmt (pattern_def_stmt);
8564 if (STMT_VINFO_RELEVANT_P (pattern_def_stmt_info)
8565 || STMT_VINFO_LIVE_P (pattern_def_stmt_info))
8566 break;
8567 gsi_next (&pattern_def_si);
8570 if (!gsi_end_p (pattern_def_si))
8572 if (dump_enabled_p ())
8574 dump_printf_loc (MSG_NOTE, vect_location,
8575 "==> vectorizing pattern def "
8576 "stmt: ");
8577 dump_gimple_stmt (MSG_NOTE, TDF_SLIM,
8578 pattern_def_stmt, 0);
8581 stmt = pattern_def_stmt;
8582 stmt_info = pattern_def_stmt_info;
8584 else
8586 pattern_def_si = gsi_none ();
8587 transform_pattern_stmt = false;
8590 else
8591 transform_pattern_stmt = false;
8594 if (STMT_VINFO_VECTYPE (stmt_info))
8596 poly_uint64 nunits
8597 = TYPE_VECTOR_SUBPARTS (STMT_VINFO_VECTYPE (stmt_info));
8598 if (!STMT_SLP_TYPE (stmt_info)
8599 && maybe_ne (nunits, vf)
8600 && dump_enabled_p ())
8601 /* For SLP, VF is set according to the unrolling factor, not the
8602 vector size, hence for SLP this message is not valid. */
8603 dump_printf_loc (MSG_NOTE, vect_location, "multiple-types.\n");
8606 /* SLP. Schedule all the SLP instances when the first SLP stmt is
8607 reached. */
8608 if (STMT_SLP_TYPE (stmt_info))
8610 if (!slp_scheduled)
8612 slp_scheduled = true;
8614 if (dump_enabled_p ())
8615 dump_printf_loc (MSG_NOTE, vect_location,
8616 "=== scheduling SLP instances ===\n");
8618 vect_schedule_slp (loop_vinfo);
8621 /* Hybrid SLP stmts must be vectorized in addition to SLP. */
8622 if (!vinfo_for_stmt (stmt) || PURE_SLP_STMT (stmt_info))
8624 if (!transform_pattern_stmt && gsi_end_p (pattern_def_si))
8626 pattern_def_seq = NULL;
8627 gsi_next (&si);
8629 continue;
8633 /* -------- vectorize statement ------------ */
8634 if (dump_enabled_p ())
8635 dump_printf_loc (MSG_NOTE, vect_location, "transform statement.\n");
8637 grouped_store = false;
8638 is_store = vect_transform_stmt (stmt, &si, &grouped_store, NULL, NULL);
8639 if (is_store)
8641 if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
8643 /* Interleaving. If IS_STORE is TRUE, the vectorization of the
8644 interleaving chain was completed - free all the stores in
8645 the chain. */
8646 gsi_next (&si);
8647 vect_remove_stores (GROUP_FIRST_ELEMENT (stmt_info));
8649 else
8651 /* Free the attached stmt_vec_info and remove the stmt. */
8652 gimple *store = gsi_stmt (si);
8653 free_stmt_vec_info (store);
8654 unlink_stmt_vdef (store);
8655 gsi_remove (&si, true);
8656 release_defs (store);
8659 /* Stores can only appear at the end of pattern statements. */
8660 gcc_assert (!transform_pattern_stmt);
8661 pattern_def_seq = NULL;
8663 else if (!transform_pattern_stmt && gsi_end_p (pattern_def_si))
8665 pattern_def_seq = NULL;
8666 gsi_next (&si);
8668 } /* stmts in BB */
8670 /* Stub out scalar statements that must not survive vectorization.
8671 Doing this here helps with grouped statements, or statements that
8672 are involved in patterns. */
8673 for (gimple_stmt_iterator gsi = gsi_start_bb (bb);
8674 !gsi_end_p (gsi); gsi_next (&gsi))
8676 gcall *call = dyn_cast <gcall *> (gsi_stmt (gsi));
8677 if (call && gimple_call_internal_p (call, IFN_MASK_LOAD))
8679 tree lhs = gimple_get_lhs (call);
8680 if (!VECTOR_TYPE_P (TREE_TYPE (lhs)))
8682 tree zero = build_zero_cst (TREE_TYPE (lhs));
8683 gimple *new_stmt = gimple_build_assign (lhs, zero);
8684 gsi_replace (&gsi, new_stmt, true);
8688 } /* BBs in loop */
8690 /* The vectorization factor is always > 1, so if we use an IV increment of 1,
8691 a zero NITERS becomes a nonzero NITERS_VECTOR. */
8692 if (integer_onep (step_vector))
8693 niters_no_overflow = true;
8694 vect_set_loop_condition (loop, loop_vinfo, niters_vector, step_vector,
8695 niters_vector_mult_vf, !niters_no_overflow);
8697 unsigned int assumed_vf = vect_vf_for_cost (loop_vinfo);
8698 scale_profile_for_vect_loop (loop, assumed_vf);
8700 /* True if the final iteration might not handle a full vector's
8701 worth of scalar iterations. */
8702 bool final_iter_may_be_partial = LOOP_VINFO_FULLY_MASKED_P (loop_vinfo);
8703 /* The minimum number of iterations performed by the epilogue. This
8704 is 1 when peeling for gaps because we always need a final scalar
8705 iteration. */
8706 int min_epilogue_iters = LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo) ? 1 : 0;
8707 /* +1 to convert latch counts to loop iteration counts,
8708 -min_epilogue_iters to remove iterations that cannot be performed
8709 by the vector code. */
8710 int bias_for_lowest = 1 - min_epilogue_iters;
8711 int bias_for_assumed = bias_for_lowest;
8712 int alignment_npeels = LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo);
8713 if (alignment_npeels && LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
8715 /* When the amount of peeling is known at compile time, the first
8716 iteration will have exactly alignment_npeels active elements.
8717 In the worst case it will have at least one. */
8718 int min_first_active = (alignment_npeels > 0 ? alignment_npeels : 1);
8719 bias_for_lowest += lowest_vf - min_first_active;
8720 bias_for_assumed += assumed_vf - min_first_active;
8722 /* In these calculations the "- 1" converts loop iteration counts
8723 back to latch counts. */
8724 if (loop->any_upper_bound)
8725 loop->nb_iterations_upper_bound
8726 = (final_iter_may_be_partial
8727 ? wi::udiv_ceil (loop->nb_iterations_upper_bound + bias_for_lowest,
8728 lowest_vf) - 1
8729 : wi::udiv_floor (loop->nb_iterations_upper_bound + bias_for_lowest,
8730 lowest_vf) - 1);
8731 if (loop->any_likely_upper_bound)
8732 loop->nb_iterations_likely_upper_bound
8733 = (final_iter_may_be_partial
8734 ? wi::udiv_ceil (loop->nb_iterations_likely_upper_bound
8735 + bias_for_lowest, lowest_vf) - 1
8736 : wi::udiv_floor (loop->nb_iterations_likely_upper_bound
8737 + bias_for_lowest, lowest_vf) - 1);
8738 if (loop->any_estimate)
8739 loop->nb_iterations_estimate
8740 = (final_iter_may_be_partial
8741 ? wi::udiv_ceil (loop->nb_iterations_estimate + bias_for_assumed,
8742 assumed_vf) - 1
8743 : wi::udiv_floor (loop->nb_iterations_estimate + bias_for_assumed,
8744 assumed_vf) - 1);
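/* Worked example for the bounds above (numbers invented): with an upper
   bound of 100 latch iterations, lowest_vf = 4, no peeling for gaps and
   no full masking, bias_for_lowest is 1 and the new bound becomes
   floor ((100 + 1) / 4) - 1 = 24 latch iterations of the vector loop.  */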
8746 if (dump_enabled_p ())
8748 if (!LOOP_VINFO_EPILOGUE_P (loop_vinfo))
8750 dump_printf_loc (MSG_NOTE, vect_location,
8751 "LOOP VECTORIZED\n");
8752 if (loop->inner)
8753 dump_printf_loc (MSG_NOTE, vect_location,
8754 "OUTER LOOP VECTORIZED\n");
8755 dump_printf (MSG_NOTE, "\n");
8757 else
8759 dump_printf_loc (MSG_NOTE, vect_location,
8760 "LOOP EPILOGUE VECTORIZED (VS=");
8761 dump_dec (MSG_NOTE, current_vector_size);
8762 dump_printf (MSG_NOTE, ")\n");
8766 /* Free SLP instances here because otherwise stmt reference counting
8767 won't work. */
8768 slp_instance instance;
8769 FOR_EACH_VEC_ELT (LOOP_VINFO_SLP_INSTANCES (loop_vinfo), i, instance)
8770 vect_free_slp_instance (instance);
8771 LOOP_VINFO_SLP_INSTANCES (loop_vinfo).release ();
8772 /* Clear the safelen field since its value is invalid after vectorization,
8773 as the vectorized loop can have loop-carried dependencies. */
8774 loop->safelen = 0;
8776 /* Don't vectorize an epilogue of an epilogue loop. */
8777 if (LOOP_VINFO_EPILOGUE_P (loop_vinfo))
8778 epilogue = NULL;
  if (!PARAM_VALUE (PARAM_VECT_EPILOGUES_NOMASK))
    epilogue = NULL;

  if (epilogue)
    {
      auto_vector_sizes vector_sizes;
      targetm.vectorize.autovectorize_vector_sizes (&vector_sizes);
      unsigned int next_size = 0;

      if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
          && LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) >= 0
          && known_eq (vf, lowest_vf))
        {
          unsigned int eiters
            = (LOOP_VINFO_INT_NITERS (loop_vinfo)
               - LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo));
          eiters = eiters % lowest_vf;
          epilogue->nb_iterations_upper_bound = eiters - 1;
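
          /* A made-up example of the size selection below: with 100 known
             scalar iterations, no peeling for alignment and lowest_vf == 8,
             eiters is 100 % 8 == 4 and the bound above is 3.  If the target
             offered the (hypothetical) vector sizes { 32, 16 } and
             current_vector_size were 32, the 32-byte entry would be rejected
             (ratio 1, lowest_vf / 1 == 8 > 4) and the search would stop at
             the 16-byte entry (ratio 2, lowest_vf / 2 == 4 <= 4), so the
             epilogue would be considered with 16-byte vectors.  */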
          unsigned int ratio;
          while (next_size < vector_sizes.length ()
                 && !(constant_multiple_p (current_vector_size,
                                           vector_sizes[next_size], &ratio)
                      && eiters >= lowest_vf / ratio))
            next_size += 1;
        }
      else
        while (next_size < vector_sizes.length ()
               && maybe_lt (current_vector_size, vector_sizes[next_size]))
          next_size += 1;

      if (next_size == vector_sizes.length ())
        epilogue = NULL;
    }

  if (epilogue)
    {
      epilogue->force_vectorize = loop->force_vectorize;
      epilogue->safelen = loop->safelen;
      epilogue->dont_vectorize = false;

      /* We may need to if-convert the epilogue to vectorize it.  */
      if (LOOP_VINFO_SCALAR_LOOP (loop_vinfo))
        tree_if_conversion (epilogue);
    }

  return epilogue;
}
/* The code below performs a simple optimization: it reverts if-conversion
   for masked stores, i.e. if the mask of a store is zero the store is not
   performed and, where possible, neither are the statements that produce
   the stored values.  For example,

     for (i=0; i<n; i++)
       if (c[i])
         {
           p1[i] += 1;
           p2[i] = p3[i] + 2;
         }

   this transformation will produce the following semi-hammock:

   if (!(mask__ifc__42.18_165 == { 0, 0, 0, 0, 0, 0, 0, 0 }))
     {
       vect__11.19_170 = MASK_LOAD (vectp_p1.20_168, 0B, mask__ifc__42.18_165);
       vect__12.22_172 = vect__11.19_170 + vect_cst__171;
       MASK_STORE (vectp_p1.23_175, 0B, mask__ifc__42.18_165, vect__12.22_172);
       vect__18.25_182 = MASK_LOAD (vectp_p3.26_180, 0B, mask__ifc__42.18_165);
       vect__19.28_184 = vect__18.25_182 + vect_cst__183;
       MASK_STORE (vectp_p2.29_187, 0B, mask__ifc__42.18_165, vect__19.28_184);
     }
*/
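
/* Sketch of the control flow produced for each masked store below, as an
   illustration rather than a quote from the sources: the block ending in
   "if (mask == { 0, ... })" takes its EDGE_TRUE_VALUE edge straight to the
   join block, skipping the stores, while the unlikely EDGE_FALSE_VALUE edge
   leads to the new STORE_BB, into which the MASK_STOREs and, where possible,
   their value producers are sunk before falling through to the join
   block.  */
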
void
optimize_mask_stores (struct loop *loop)
{
  basic_block *bbs = get_loop_body (loop);
  unsigned nbbs = loop->num_nodes;
  unsigned i;
  basic_block bb;
  struct loop *bb_loop;
  gimple_stmt_iterator gsi;
  gimple *stmt;
  auto_vec<gimple *> worklist;

  vect_location = find_loop_location (loop);
  /* Pick up all masked stores in the loop, if any.  */
  for (i = 0; i < nbbs; i++)
    {
      bb = bbs[i];
      for (gsi = gsi_start_bb (bb); !gsi_end_p (gsi);
           gsi_next (&gsi))
        {
          stmt = gsi_stmt (gsi);
          if (gimple_call_internal_p (stmt, IFN_MASK_STORE))
            worklist.safe_push (stmt);
        }
    }

  free (bbs);
  if (worklist.is_empty ())
    return;

  /* Loop has masked stores.  */
  while (!worklist.is_empty ())
    {
      gimple *last, *last_store;
      edge e, efalse;
      tree mask;
      basic_block store_bb, join_bb;
      gimple_stmt_iterator gsi_to;
      tree vdef, new_vdef;
      gphi *phi;
      tree vectype;
      tree zero;

      last = worklist.pop ();
      mask = gimple_call_arg (last, 2);
      bb = gimple_bb (last);
      /* Create STORE_BB and the if-then structure in the CFG.  STORE_BB
         belongs to the same loop as BB; that loop can be different from
         LOOP when a two-level loop nest is vectorized and the mask store
         belongs to the inner loop.  */
      e = split_block (bb, last);
      bb_loop = bb->loop_father;
      gcc_assert (loop == bb_loop || flow_loop_nested_p (loop, bb_loop));
      join_bb = e->dest;
      store_bb = create_empty_bb (bb);
      add_bb_to_loop (store_bb, bb_loop);
      e->flags = EDGE_TRUE_VALUE;
      efalse = make_edge (bb, store_bb, EDGE_FALSE_VALUE);
      /* STORE_BB is reached on the false edge, i.e. when the mask is not
         all zero; mark that edge as unlikely.  */
      efalse->probability = profile_probability::unlikely ();
      store_bb->count = efalse->count ();
      make_single_succ_edge (store_bb, join_bb, EDGE_FALLTHRU);
      if (dom_info_available_p (CDI_DOMINATORS))
        set_immediate_dominator (CDI_DOMINATORS, store_bb, bb);
      if (dump_enabled_p ())
        dump_printf_loc (MSG_NOTE, vect_location,
                         "Create new block %d to sink mask stores.",
                         store_bb->index);
      /* Create a vector comparison with a boolean result.  */
      vectype = TREE_TYPE (mask);
      zero = build_zero_cst (vectype);
      stmt = gimple_build_cond (EQ_EXPR, mask, zero, NULL_TREE, NULL_TREE);
      gsi = gsi_last_bb (bb);
      gsi_insert_after (&gsi, stmt, GSI_SAME_STMT);
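      /* The generated condition is roughly (reusing the SSA names from the
         example in the comment above, for illustration only):
           if (mask__ifc__42.18_165 == { 0, ..., 0 })
             goto join_bb;
           else
             goto store_bb;  */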
      /* Create a new PHI node for the vdef of the last masked store:
         .MEM_2 = VDEF <.MEM_1>
         will be converted to
         .MEM_3 = VDEF <.MEM_1>
         and a new PHI node will be created in the join bb
         .MEM_2 = PHI <.MEM_1, .MEM_3>
      */
      vdef = gimple_vdef (last);
      new_vdef = make_ssa_name (gimple_vop (cfun), last);
      gimple_set_vdef (last, new_vdef);
      phi = create_phi_node (vdef, join_bb);
      add_phi_arg (phi, new_vdef, EDGE_SUCC (store_bb, 0), UNKNOWN_LOCATION);

      /* Put all masked stores with the same mask to STORE_BB if possible.  */
      while (true)
        {
          gimple_stmt_iterator gsi_from;
          gimple *stmt1 = NULL;

          /* Move masked store to STORE_BB.  */
          last_store = last;
          gsi = gsi_for_stmt (last);
          gsi_from = gsi;
          /* Shift GSI to the previous stmt for further traversal.  */
          gsi_prev (&gsi);
          gsi_to = gsi_start_bb (store_bb);
          gsi_move_before (&gsi_from, &gsi_to);
          /* Set up GSI_TO at the start of the now non-empty block.  */
          gsi_to = gsi_start_bb (store_bb);
          if (dump_enabled_p ())
            {
              dump_printf_loc (MSG_NOTE, vect_location,
                               "Move stmt to created bb\n");
              dump_gimple_stmt (MSG_NOTE, TDF_SLIM, last, 0);
            }
          /* Move all stored value producers if possible.  */
          while (!gsi_end_p (gsi))
            {
              tree lhs;
              imm_use_iterator imm_iter;
              use_operand_p use_p;
              bool res;

              /* Skip debug statements.  */
              if (is_gimple_debug (gsi_stmt (gsi)))
                {
                  gsi_prev (&gsi);
                  continue;
                }
              stmt1 = gsi_stmt (gsi);
              /* Do not consider statements writing to memory or having
                 a volatile operand.  */
              if (gimple_vdef (stmt1)
                  || gimple_has_volatile_ops (stmt1))
                break;
              gsi_from = gsi;
              gsi_prev (&gsi);
              lhs = gimple_get_lhs (stmt1);
              if (!lhs)
                break;

              /* The LHS of a vectorized stmt must be an SSA_NAME.  */
              if (TREE_CODE (lhs) != SSA_NAME)
                break;
              if (!VECTOR_TYPE_P (TREE_TYPE (lhs)))
                {
                  /* Remove dead scalar statement.  */
                  if (has_zero_uses (lhs))
                    {
                      gsi_remove (&gsi_from, true);
                      continue;
                    }
                  /* A live scalar statement stops the traversal.  */
                  break;
                }
              /* Check that LHS does not have uses outside of STORE_BB.  */
              res = true;
              FOR_EACH_IMM_USE_FAST (use_p, imm_iter, lhs)
                {
                  gimple *use_stmt;
                  use_stmt = USE_STMT (use_p);
                  if (is_gimple_debug (use_stmt))
                    continue;
                  if (gimple_bb (use_stmt) != store_bb)
                    {
                      res = false;
                      break;
                    }
                }
              if (!res)
                break;
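
              /* STMT1 may only be sunk if it reads the same memory state as
                 the store already moved to STORE_BB; a different VUSE would
                 mean an intervening memory write between them.  */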
              if (gimple_vuse (stmt1)
                  && gimple_vuse (stmt1) != gimple_vuse (last_store))
                break;

              /* Can move STMT1 to STORE_BB.  */
              if (dump_enabled_p ())
                {
                  dump_printf_loc (MSG_NOTE, vect_location,
                                   "Move stmt to created bb\n");
                  dump_gimple_stmt (MSG_NOTE, TDF_SLIM, stmt1, 0);
                }
              gsi_move_before (&gsi_from, &gsi_to);
              /* Shift GSI_TO for further insertion.  */
              gsi_prev (&gsi_to);
            }
          /* Put other masked stores with the same mask to STORE_BB.  */
          if (worklist.is_empty ()
              || gimple_call_arg (worklist.last (), 2) != mask
              || worklist.last () != stmt1)
            break;
          last = worklist.pop ();
        }
      add_phi_arg (phi, gimple_vuse (last_store), e, UNKNOWN_LOCATION);