[official-gcc.git] / gcc / tree-vect-loop.c
1 /* Loop Vectorization
2 Copyright (C) 2003-2018 Free Software Foundation, Inc.
3 Contributed by Dorit Naishlos <dorit@il.ibm.com> and
4 Ira Rosen <irar@il.ibm.com>
6 This file is part of GCC.
8 GCC is free software; you can redistribute it and/or modify it under
9 the terms of the GNU General Public License as published by the Free
10 Software Foundation; either version 3, or (at your option) any later
11 version.
13 GCC is distributed in the hope that it will be useful, but WITHOUT ANY
14 WARRANTY; without even the implied warranty of MERCHANTABILITY or
15 FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
16 for more details.
18 You should have received a copy of the GNU General Public License
19 along with GCC; see the file COPYING3. If not see
20 <http://www.gnu.org/licenses/>. */
22 #include "config.h"
23 #include "system.h"
24 #include "coretypes.h"
25 #include "backend.h"
26 #include "target.h"
27 #include "rtl.h"
28 #include "tree.h"
29 #include "gimple.h"
30 #include "cfghooks.h"
31 #include "tree-pass.h"
32 #include "ssa.h"
33 #include "optabs-tree.h"
34 #include "diagnostic-core.h"
35 #include "fold-const.h"
36 #include "stor-layout.h"
37 #include "cfganal.h"
38 #include "gimplify.h"
39 #include "gimple-iterator.h"
40 #include "gimplify-me.h"
41 #include "tree-ssa-loop-ivopts.h"
42 #include "tree-ssa-loop-manip.h"
43 #include "tree-ssa-loop-niter.h"
44 #include "tree-ssa-loop.h"
45 #include "cfgloop.h"
46 #include "params.h"
47 #include "tree-scalar-evolution.h"
48 #include "tree-vectorizer.h"
49 #include "gimple-fold.h"
50 #include "cgraph.h"
51 #include "tree-cfg.h"
52 #include "tree-if-conv.h"
53 #include "internal-fn.h"
54 #include "tree-vector-builder.h"
55 #include "vec-perm-indices.h"
56 #include "tree-eh.h"
58 /* Loop Vectorization Pass.
60 This pass tries to vectorize loops.
62 For example, the vectorizer transforms the following simple loop:
64 short a[N]; short b[N]; short c[N]; int i;
66 for (i=0; i<N; i++){
67 a[i] = b[i] + c[i];
 70 as if it were manually vectorized by rewriting the source code into:
72 typedef int __attribute__((mode(V8HI))) v8hi;
73 short a[N]; short b[N]; short c[N]; int i;
74 v8hi *pa = (v8hi*)a, *pb = (v8hi*)b, *pc = (v8hi*)c;
75 v8hi va, vb, vc;
77 for (i=0; i<N/8; i++){
78 vb = pb[i];
79 vc = pc[i];
80 va = vb + vc;
81 pa[i] = va;
84 The main entry to this pass is vectorize_loops(), in which
85 the vectorizer applies a set of analyses on a given set of loops,
86 followed by the actual vectorization transformation for the loops that
87 had successfully passed the analysis phase.
88 Throughout this pass we make a distinction between two types of
89 data: scalars (which are represented by SSA_NAMES), and memory references
90 ("data-refs"). These two types of data require different handling both
91 during analysis and transformation. The types of data-refs that the
 92 vectorizer currently supports are ARRAY_REFS whose base is an array DECL
93 (not a pointer), and INDIRECT_REFS through pointers; both array and pointer
94 accesses are required to have a simple (consecutive) access pattern.
96 Analysis phase:
97 ===============
98 The driver for the analysis phase is vect_analyze_loop().
99 It applies a set of analyses, some of which rely on the scalar evolution
100 analyzer (scev) developed by Sebastian Pop.
102 During the analysis phase the vectorizer records some information
103 per stmt in a "stmt_vec_info" struct which is attached to each stmt in the
104 loop, as well as general information about the loop as a whole, which is
105 recorded in a "loop_vec_info" struct attached to each loop.
107 Transformation phase:
108 =====================
109 The loop transformation phase scans all the stmts in the loop, and
110 creates a vector stmt (or a sequence of stmts) for each scalar stmt S in
111 the loop that needs to be vectorized. It inserts the vector code sequence
112 just before the scalar stmt S, and records a pointer to the vector code
113 in STMT_VINFO_VEC_STMT (stmt_info) (stmt_info is the stmt_vec_info struct
114 attached to S). This pointer will be used for the vectorization of following
115 stmts which use the def of stmt S. Stmt S is removed if it writes to memory;
116 otherwise, we rely on dead code elimination for removing it.
118 For example, say stmt S1 was vectorized into stmt VS1:
120 VS1: vb = px[i];
121 S1: b = x[i]; STMT_VINFO_VEC_STMT (stmt_info (S1)) = VS1
122 S2: a = b;
124 To vectorize stmt S2, the vectorizer first finds the stmt that defines
125 the operand 'b' (S1), and gets the relevant vector def 'vb' from the
126 vector stmt VS1 pointed to by STMT_VINFO_VEC_STMT (stmt_info (S1)). The
127 resulting sequence would be:
129 VS1: vb = px[i];
130 S1: b = x[i]; STMT_VINFO_VEC_STMT (stmt_info (S1)) = VS1
131 VS2: va = vb;
132 S2: a = b; STMT_VINFO_VEC_STMT (stmt_info (S2)) = VS2
134 Operands that are not SSA_NAMEs, are data-refs that appear in
135 load/store operations (like 'x[i]' in S1), and are handled differently.
137 Target modeling:
138 =================
139 Currently the only target specific information that is used is the
140 size of the vector (in bytes) - "TARGET_VECTORIZE_UNITS_PER_SIMD_WORD".
 141 Targets that can support different vector sizes will, for now, need
 142 to specify one value for "TARGET_VECTORIZE_UNITS_PER_SIMD_WORD". More
143 flexibility will be added in the future.
 145 Since we only vectorize operations whose vector form can be
146 expressed using existing tree codes, to verify that an operation is
147 supported, the vectorizer checks the relevant optab at the relevant
148 machine_mode (e.g, optab_handler (add_optab, V8HImode)). If
149 the value found is CODE_FOR_nothing, then there's no target support, and
150 we can't vectorize the stmt.
152 For additional information on this project see:
153 http://gcc.gnu.org/projects/tree-ssa/vectorization.html
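/* Illustration added for this write-up (not part of the original sources):
   a loop like the one above can be inspected for vectorization by building
   with optimization and the vectorizer's opt-info output enabled, e.g.
   "gcc -O3 -fopt-info-vec-optimized example.c" (the exact report text
   varies between releases), where example.c contains:

       #define N 256
       short a[N], b[N], c[N];

       void
       add (void)
       {
         for (int i = 0; i < N; i++)
           a[i] = b[i] + c[i];
       }
*/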
156 static void vect_estimate_min_profitable_iters (loop_vec_info, int *, int *);
158 /* Subroutine of vect_determine_vf_for_stmt that handles only one
159 statement. VECTYPE_MAYBE_SET_P is true if STMT_VINFO_VECTYPE
160 may already be set for general statements (not just data refs). */
162 static bool
163 vect_determine_vf_for_stmt_1 (stmt_vec_info stmt_info,
164 bool vectype_maybe_set_p,
165 poly_uint64 *vf,
166 vec<stmt_vec_info > *mask_producers)
168 gimple *stmt = stmt_info->stmt;
170 if ((!STMT_VINFO_RELEVANT_P (stmt_info)
171 && !STMT_VINFO_LIVE_P (stmt_info))
172 || gimple_clobber_p (stmt))
174 if (dump_enabled_p ())
175 dump_printf_loc (MSG_NOTE, vect_location, "skip.\n");
176 return true;
179 tree stmt_vectype, nunits_vectype;
180 if (!vect_get_vector_types_for_stmt (stmt_info, &stmt_vectype,
181 &nunits_vectype))
182 return false;
184 if (stmt_vectype)
186 if (STMT_VINFO_VECTYPE (stmt_info))
 187 /* The only case in which a vectype has already been set is for stmts
188 that contain a data ref, or for "pattern-stmts" (stmts generated
189 by the vectorizer to represent/replace a certain idiom). */
190 gcc_assert ((STMT_VINFO_DATA_REF (stmt_info)
191 || vectype_maybe_set_p)
192 && STMT_VINFO_VECTYPE (stmt_info) == stmt_vectype);
193 else if (stmt_vectype == boolean_type_node)
194 mask_producers->safe_push (stmt_info);
195 else
196 STMT_VINFO_VECTYPE (stmt_info) = stmt_vectype;
199 if (nunits_vectype)
200 vect_update_max_nunits (vf, nunits_vectype);
202 return true;
205 /* Subroutine of vect_determine_vectorization_factor. Set the vector
206 types of STMT_INFO and all attached pattern statements and update
207 the vectorization factor VF accordingly. If some of the statements
208 produce a mask result whose vector type can only be calculated later,
209 add them to MASK_PRODUCERS. Return true on success or false if
210 something prevented vectorization. */
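/* Illustrative sketch (assumed typical input, not taken from this file):
   a comparison produces a boolean "mask" whose vector type cannot be chosen
   until its uses are known, so such statements are queued in MASK_PRODUCERS
   and resolved only after the vectorization factor has been determined.
   For example, in GIMPLE like

       _1 = a[i_2] < limit_3;
       x_4 = _1 ? b[i_2] : c[i_2];

   the vector type of _1 is derived from the types used by the COND_EXPR,
   not from boolean_type_node itself.  */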
212 static bool
213 vect_determine_vf_for_stmt (stmt_vec_info stmt_info, poly_uint64 *vf,
214 vec<stmt_vec_info > *mask_producers)
216 if (dump_enabled_p ())
218 dump_printf_loc (MSG_NOTE, vect_location, "==> examining statement: ");
219 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, stmt_info->stmt, 0);
221 if (!vect_determine_vf_for_stmt_1 (stmt_info, false, vf, mask_producers))
222 return false;
224 if (STMT_VINFO_IN_PATTERN_P (stmt_info)
225 && STMT_VINFO_RELATED_STMT (stmt_info))
227 stmt_info = vinfo_for_stmt (STMT_VINFO_RELATED_STMT (stmt_info));
229 /* If a pattern statement has def stmts, analyze them too. */
230 gimple *pattern_def_seq = STMT_VINFO_PATTERN_DEF_SEQ (stmt_info);
231 for (gimple_stmt_iterator si = gsi_start (pattern_def_seq);
232 !gsi_end_p (si); gsi_next (&si))
234 stmt_vec_info def_stmt_info = vinfo_for_stmt (gsi_stmt (si));
235 if (dump_enabled_p ())
237 dump_printf_loc (MSG_NOTE, vect_location,
238 "==> examining pattern def stmt: ");
239 dump_gimple_stmt (MSG_NOTE, TDF_SLIM,
240 def_stmt_info->stmt, 0);
242 if (!vect_determine_vf_for_stmt_1 (def_stmt_info, true,
243 vf, mask_producers))
244 return false;
247 if (dump_enabled_p ())
249 dump_printf_loc (MSG_NOTE, vect_location,
250 "==> examining pattern statement: ");
251 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, stmt_info->stmt, 0);
253 if (!vect_determine_vf_for_stmt_1 (stmt_info, true, vf, mask_producers))
254 return false;
257 return true;
260 /* Function vect_determine_vectorization_factor
262 Determine the vectorization factor (VF). VF is the number of data elements
263 that are operated upon in parallel in a single iteration of the vectorized
 264 loop. For example, when vectorizing a loop that operates on 4-byte elements,
 265 on a target with a vector size (VS) of 16 bytes, the VF is set to 4, since 4
266 elements can fit in a single vector register.
268 We currently support vectorization of loops in which all types operated upon
269 are of the same size. Therefore this function currently sets VF according to
270 the size of the types operated upon, and fails if there are multiple sizes
271 in the loop.
273 VF is also the factor by which the loop iterations are strip-mined, e.g.:
274 original loop:
275 for (i=0; i<N; i++){
276 a[i] = b[i] + c[i];
279 vectorized loop:
280 for (i=0; i<N; i+=VF){
281 a[i:VF] = b[i:VF] + c[i:VF];
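/* Worked example (illustrative only): with 16-byte vectors and 4-byte
   elements, VF = 16 / 4 = 4, so the strip-mined loop behaves like

       int a[N], b[N], c[N];
       for (int i = 0; i + 4 <= N; i += 4)
         {
           a[i + 0] = b[i + 0] + c[i + 0];
           a[i + 1] = b[i + 1] + c[i + 1];
           a[i + 2] = b[i + 2] + c[i + 2];
           a[i + 3] = b[i + 3] + c[i + 3];
         }

   with a scalar epilogue handling the remaining N % 4 iterations.  */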
285 static bool
286 vect_determine_vectorization_factor (loop_vec_info loop_vinfo)
288 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
289 basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
290 unsigned nbbs = loop->num_nodes;
291 poly_uint64 vectorization_factor = 1;
292 tree scalar_type = NULL_TREE;
293 gphi *phi;
294 tree vectype;
295 stmt_vec_info stmt_info;
296 unsigned i;
297 auto_vec<stmt_vec_info> mask_producers;
299 if (dump_enabled_p ())
300 dump_printf_loc (MSG_NOTE, vect_location,
301 "=== vect_determine_vectorization_factor ===\n");
303 for (i = 0; i < nbbs; i++)
305 basic_block bb = bbs[i];
307 for (gphi_iterator si = gsi_start_phis (bb); !gsi_end_p (si);
308 gsi_next (&si))
310 phi = si.phi ();
311 stmt_info = vinfo_for_stmt (phi);
312 if (dump_enabled_p ())
314 dump_printf_loc (MSG_NOTE, vect_location, "==> examining phi: ");
315 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, phi, 0);
318 gcc_assert (stmt_info);
320 if (STMT_VINFO_RELEVANT_P (stmt_info)
321 || STMT_VINFO_LIVE_P (stmt_info))
323 gcc_assert (!STMT_VINFO_VECTYPE (stmt_info));
324 scalar_type = TREE_TYPE (PHI_RESULT (phi));
326 if (dump_enabled_p ())
328 dump_printf_loc (MSG_NOTE, vect_location,
329 "get vectype for scalar type: ");
330 dump_generic_expr (MSG_NOTE, TDF_SLIM, scalar_type);
331 dump_printf (MSG_NOTE, "\n");
334 vectype = get_vectype_for_scalar_type (scalar_type);
335 if (!vectype)
337 if (dump_enabled_p ())
339 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
340 "not vectorized: unsupported "
341 "data-type ");
342 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
343 scalar_type);
344 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
346 return false;
348 STMT_VINFO_VECTYPE (stmt_info) = vectype;
350 if (dump_enabled_p ())
352 dump_printf_loc (MSG_NOTE, vect_location, "vectype: ");
353 dump_generic_expr (MSG_NOTE, TDF_SLIM, vectype);
354 dump_printf (MSG_NOTE, "\n");
357 if (dump_enabled_p ())
359 dump_printf_loc (MSG_NOTE, vect_location, "nunits = ");
360 dump_dec (MSG_NOTE, TYPE_VECTOR_SUBPARTS (vectype));
361 dump_printf (MSG_NOTE, "\n");
364 vect_update_max_nunits (&vectorization_factor, vectype);
368 for (gimple_stmt_iterator si = gsi_start_bb (bb); !gsi_end_p (si);
369 gsi_next (&si))
371 stmt_info = vinfo_for_stmt (gsi_stmt (si));
372 if (!vect_determine_vf_for_stmt (stmt_info, &vectorization_factor,
373 &mask_producers))
374 return false;
378 /* TODO: Analyze cost. Decide if worth while to vectorize. */
379 if (dump_enabled_p ())
381 dump_printf_loc (MSG_NOTE, vect_location, "vectorization factor = ");
382 dump_dec (MSG_NOTE, vectorization_factor);
383 dump_printf (MSG_NOTE, "\n");
386 if (known_le (vectorization_factor, 1U))
388 if (dump_enabled_p ())
389 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
390 "not vectorized: unsupported data-type\n");
391 return false;
393 LOOP_VINFO_VECT_FACTOR (loop_vinfo) = vectorization_factor;
395 for (i = 0; i < mask_producers.length (); i++)
397 stmt_info = mask_producers[i];
398 tree mask_type = vect_get_mask_type_for_stmt (stmt_info);
399 if (!mask_type)
400 return false;
401 STMT_VINFO_VECTYPE (stmt_info) = mask_type;
404 return true;
408 /* Function vect_is_simple_iv_evolution.
 410 FORNOW: A simple evolution of an induction variable in the loop is
411 considered a polynomial evolution. */
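/* Illustration (assumed typical scev output, not from this file): for

       for (i = 0; i < n; i++)
         p = p + 4;

   the access function of the PHI for 'p' is the chrec {p_0, +, 4}_loop,
   i.e. INIT = p_0 and STEP = 4, which this function accepts.  A step that
   itself varies inside the loop (a chrec of degree two or more) is
   rejected as not "simple".  */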
413 static bool
414 vect_is_simple_iv_evolution (unsigned loop_nb, tree access_fn, tree * init,
415 tree * step)
417 tree init_expr;
418 tree step_expr;
419 tree evolution_part = evolution_part_in_loop_num (access_fn, loop_nb);
420 basic_block bb;
422 /* When there is no evolution in this loop, the evolution function
423 is not "simple". */
424 if (evolution_part == NULL_TREE)
425 return false;
427 /* When the evolution is a polynomial of degree >= 2
428 the evolution function is not "simple". */
429 if (tree_is_chrec (evolution_part))
430 return false;
432 step_expr = evolution_part;
433 init_expr = unshare_expr (initial_condition_in_loop_num (access_fn, loop_nb));
435 if (dump_enabled_p ())
437 dump_printf_loc (MSG_NOTE, vect_location, "step: ");
438 dump_generic_expr (MSG_NOTE, TDF_SLIM, step_expr);
439 dump_printf (MSG_NOTE, ", init: ");
440 dump_generic_expr (MSG_NOTE, TDF_SLIM, init_expr);
441 dump_printf (MSG_NOTE, "\n");
444 *init = init_expr;
445 *step = step_expr;
447 if (TREE_CODE (step_expr) != INTEGER_CST
448 && (TREE_CODE (step_expr) != SSA_NAME
449 || ((bb = gimple_bb (SSA_NAME_DEF_STMT (step_expr)))
450 && flow_bb_inside_loop_p (get_loop (cfun, loop_nb), bb))
451 || (!INTEGRAL_TYPE_P (TREE_TYPE (step_expr))
452 && (!SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr))
453 || !flag_associative_math)))
454 && (TREE_CODE (step_expr) != REAL_CST
455 || !flag_associative_math))
457 if (dump_enabled_p ())
458 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
459 "step unknown.\n");
460 return false;
463 return true;
466 /* Function vect_analyze_scalar_cycles_1.
468 Examine the cross iteration def-use cycles of scalar variables
469 in LOOP. LOOP_VINFO represents the loop that is now being
470 considered for vectorization (can be LOOP, or an outer-loop
471 enclosing LOOP). */
473 static void
474 vect_analyze_scalar_cycles_1 (loop_vec_info loop_vinfo, struct loop *loop)
476 basic_block bb = loop->header;
477 tree init, step;
478 auto_vec<gimple *, 64> worklist;
479 gphi_iterator gsi;
480 bool double_reduc;
482 if (dump_enabled_p ())
483 dump_printf_loc (MSG_NOTE, vect_location,
484 "=== vect_analyze_scalar_cycles ===\n");
486 /* First - identify all inductions. Reduction detection assumes that all the
487 inductions have been identified, therefore, this order must not be
488 changed. */
489 for (gsi = gsi_start_phis (bb); !gsi_end_p (gsi); gsi_next (&gsi))
491 gphi *phi = gsi.phi ();
492 tree access_fn = NULL;
493 tree def = PHI_RESULT (phi);
494 stmt_vec_info stmt_vinfo = vinfo_for_stmt (phi);
496 if (dump_enabled_p ())
498 dump_printf_loc (MSG_NOTE, vect_location, "Analyze phi: ");
499 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, phi, 0);
502 /* Skip virtual phi's. The data dependences that are associated with
503 virtual defs/uses (i.e., memory accesses) are analyzed elsewhere. */
504 if (virtual_operand_p (def))
505 continue;
507 STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_unknown_def_type;
509 /* Analyze the evolution function. */
510 access_fn = analyze_scalar_evolution (loop, def);
511 if (access_fn)
513 STRIP_NOPS (access_fn);
514 if (dump_enabled_p ())
516 dump_printf_loc (MSG_NOTE, vect_location,
517 "Access function of PHI: ");
518 dump_generic_expr (MSG_NOTE, TDF_SLIM, access_fn);
519 dump_printf (MSG_NOTE, "\n");
521 STMT_VINFO_LOOP_PHI_EVOLUTION_BASE_UNCHANGED (stmt_vinfo)
522 = initial_condition_in_loop_num (access_fn, loop->num);
523 STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_vinfo)
524 = evolution_part_in_loop_num (access_fn, loop->num);
527 if (!access_fn
528 || !vect_is_simple_iv_evolution (loop->num, access_fn, &init, &step)
529 || (LOOP_VINFO_LOOP (loop_vinfo) != loop
530 && TREE_CODE (step) != INTEGER_CST))
532 worklist.safe_push (phi);
533 continue;
536 gcc_assert (STMT_VINFO_LOOP_PHI_EVOLUTION_BASE_UNCHANGED (stmt_vinfo)
537 != NULL_TREE);
538 gcc_assert (STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_vinfo) != NULL_TREE);
540 if (dump_enabled_p ())
541 dump_printf_loc (MSG_NOTE, vect_location, "Detected induction.\n");
542 STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_induction_def;
546 /* Second - identify all reductions and nested cycles. */
547 while (worklist.length () > 0)
549 gimple *phi = worklist.pop ();
550 tree def = PHI_RESULT (phi);
551 stmt_vec_info stmt_vinfo = vinfo_for_stmt (phi);
552 gimple *reduc_stmt;
554 if (dump_enabled_p ())
556 dump_printf_loc (MSG_NOTE, vect_location, "Analyze phi: ");
557 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, phi, 0);
560 gcc_assert (!virtual_operand_p (def)
561 && STMT_VINFO_DEF_TYPE (stmt_vinfo) == vect_unknown_def_type);
563 reduc_stmt = vect_force_simple_reduction (loop_vinfo, phi,
564 &double_reduc, false);
565 if (reduc_stmt)
567 if (double_reduc)
569 if (dump_enabled_p ())
570 dump_printf_loc (MSG_NOTE, vect_location,
571 "Detected double reduction.\n");
573 STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_double_reduction_def;
574 STMT_VINFO_DEF_TYPE (vinfo_for_stmt (reduc_stmt)) =
575 vect_double_reduction_def;
577 else
579 if (loop != LOOP_VINFO_LOOP (loop_vinfo))
581 if (dump_enabled_p ())
582 dump_printf_loc (MSG_NOTE, vect_location,
583 "Detected vectorizable nested cycle.\n");
585 STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_nested_cycle;
586 STMT_VINFO_DEF_TYPE (vinfo_for_stmt (reduc_stmt)) =
587 vect_nested_cycle;
589 else
591 if (dump_enabled_p ())
592 dump_printf_loc (MSG_NOTE, vect_location,
593 "Detected reduction.\n");
595 STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_reduction_def;
596 STMT_VINFO_DEF_TYPE (vinfo_for_stmt (reduc_stmt)) =
597 vect_reduction_def;
598 /* Store the reduction cycles for possible vectorization in
599 loop-aware SLP if it was not detected as reduction
600 chain. */
601 if (! REDUC_GROUP_FIRST_ELEMENT (vinfo_for_stmt (reduc_stmt)))
602 LOOP_VINFO_REDUCTIONS (loop_vinfo).safe_push (reduc_stmt);
606 else
607 if (dump_enabled_p ())
608 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
609 "Unknown def-use cycle pattern.\n");
614 /* Function vect_analyze_scalar_cycles.
616 Examine the cross iteration def-use cycles of scalar variables, by
617 analyzing the loop-header PHIs of scalar variables. Classify each
618 cycle as one of the following: invariant, induction, reduction, unknown.
 619 We do that for the loop represented by LOOP_VINFO, and also for its
 620 inner-loop, if it exists.
621 Examples for scalar cycles:
623 Example1: reduction:
625 loop1:
626 for (i=0; i<N; i++)
627 sum += a[i];
629 Example2: induction:
631 loop2:
632 for (i=0; i<N; i++)
633 a[i] = i; */
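/* Additional illustration (not part of the original comment): a "double
   reduction" is a reduction carried by the outer loop of a nest, e.g.

       int sum = 0;
       for (i = 0; i < N; i++)        <- outer loop
         for (j = 0; j < M; j++)      <- inner loop
           sum += a[i][j];

   Here the outer-loop PHI of 'sum' is classified as a double reduction
   (vect_double_reduction_def) by the analysis above, rather than as an
   ordinary reduction of the loop being vectorized.  */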
635 static void
636 vect_analyze_scalar_cycles (loop_vec_info loop_vinfo)
638 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
640 vect_analyze_scalar_cycles_1 (loop_vinfo, loop);
642 /* When vectorizing an outer-loop, the inner-loop is executed sequentially.
 643 Reductions in such an inner-loop therefore have different properties than
644 the reductions in the nest that gets vectorized:
645 1. When vectorized, they are executed in the same order as in the original
646 scalar loop, so we can't change the order of computation when
647 vectorizing them.
648 2. FIXME: Inner-loop reductions can be used in the inner-loop, so the
649 current checks are too strict. */
651 if (loop->inner)
652 vect_analyze_scalar_cycles_1 (loop_vinfo, loop->inner);
655 /* Transfer group and reduction information from STMT to its pattern stmt. */
657 static void
658 vect_fixup_reduc_chain (gimple *stmt)
660 gimple *firstp = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (stmt));
661 gimple *stmtp;
662 gcc_assert (!REDUC_GROUP_FIRST_ELEMENT (vinfo_for_stmt (firstp))
663 && REDUC_GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)));
664 REDUC_GROUP_SIZE (vinfo_for_stmt (firstp))
665 = REDUC_GROUP_SIZE (vinfo_for_stmt (stmt));
668 stmtp = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (stmt));
669 REDUC_GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmtp)) = firstp;
670 stmt = REDUC_GROUP_NEXT_ELEMENT (vinfo_for_stmt (stmt));
671 if (stmt)
672 REDUC_GROUP_NEXT_ELEMENT (vinfo_for_stmt (stmtp))
673 = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (stmt));
675 while (stmt);
676 STMT_VINFO_DEF_TYPE (vinfo_for_stmt (stmtp)) = vect_reduction_def;
679 /* Fixup scalar cycles that now have their stmts detected as patterns. */
681 static void
682 vect_fixup_scalar_cycles_with_patterns (loop_vec_info loop_vinfo)
684 gimple *first;
685 unsigned i;
687 FOR_EACH_VEC_ELT (LOOP_VINFO_REDUCTION_CHAINS (loop_vinfo), i, first)
688 if (STMT_VINFO_IN_PATTERN_P (vinfo_for_stmt (first)))
690 gimple *next = REDUC_GROUP_NEXT_ELEMENT (vinfo_for_stmt (first));
691 while (next)
693 if (! STMT_VINFO_IN_PATTERN_P (vinfo_for_stmt (next)))
694 break;
695 next = REDUC_GROUP_NEXT_ELEMENT (vinfo_for_stmt (next));
 697 /* If not all stmts in the chain are patterns, try to handle
698 the chain without patterns. */
699 if (! next)
701 vect_fixup_reduc_chain (first);
702 LOOP_VINFO_REDUCTION_CHAINS (loop_vinfo)[i]
703 = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (first));
708 /* Function vect_get_loop_niters.
710 Determine how many iterations the loop is executed and place it
711 in NUMBER_OF_ITERATIONS. Place the number of latch iterations
712 in NUMBER_OF_ITERATIONSM1. Place the condition under which the
713 niter information holds in ASSUMPTIONS.
715 Return the loop exit condition. */
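/* Worked example (illustration only): for

       for (i = 0; i < n; i++)
         a[i] = 0;

   with n > 0, the latch runs n - 1 times, so NUMBER_OF_ITERATIONSM1 is
   n - 1 and NUMBER_OF_ITERATIONS (header executions, i.e. latch
   executions plus one) is n.  ASSUMPTIONS records any condition, such as
   n > 0, under which this niter information is valid.  */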
718 static gcond *
719 vect_get_loop_niters (struct loop *loop, tree *assumptions,
720 tree *number_of_iterations, tree *number_of_iterationsm1)
722 edge exit = single_exit (loop);
723 struct tree_niter_desc niter_desc;
724 tree niter_assumptions, niter, may_be_zero;
725 gcond *cond = get_loop_exit_condition (loop);
727 *assumptions = boolean_true_node;
728 *number_of_iterationsm1 = chrec_dont_know;
729 *number_of_iterations = chrec_dont_know;
730 if (dump_enabled_p ())
731 dump_printf_loc (MSG_NOTE, vect_location,
732 "=== get_loop_niters ===\n");
734 if (!exit)
735 return cond;
737 niter = chrec_dont_know;
738 may_be_zero = NULL_TREE;
739 niter_assumptions = boolean_true_node;
740 if (!number_of_iterations_exit_assumptions (loop, exit, &niter_desc, NULL)
741 || chrec_contains_undetermined (niter_desc.niter))
742 return cond;
744 niter_assumptions = niter_desc.assumptions;
745 may_be_zero = niter_desc.may_be_zero;
746 niter = niter_desc.niter;
748 if (may_be_zero && integer_zerop (may_be_zero))
749 may_be_zero = NULL_TREE;
751 if (may_be_zero)
753 if (COMPARISON_CLASS_P (may_be_zero))
755 /* Try to combine may_be_zero with assumptions, this can simplify
756 computation of niter expression. */
757 if (niter_assumptions && !integer_nonzerop (niter_assumptions))
758 niter_assumptions = fold_build2 (TRUTH_AND_EXPR, boolean_type_node,
759 niter_assumptions,
760 fold_build1 (TRUTH_NOT_EXPR,
761 boolean_type_node,
762 may_be_zero));
763 else
764 niter = fold_build3 (COND_EXPR, TREE_TYPE (niter), may_be_zero,
765 build_int_cst (TREE_TYPE (niter), 0),
766 rewrite_to_non_trapping_overflow (niter));
768 may_be_zero = NULL_TREE;
770 else if (integer_nonzerop (may_be_zero))
772 *number_of_iterationsm1 = build_int_cst (TREE_TYPE (niter), 0);
773 *number_of_iterations = build_int_cst (TREE_TYPE (niter), 1);
774 return cond;
776 else
777 return cond;
780 *assumptions = niter_assumptions;
781 *number_of_iterationsm1 = niter;
783 /* We want the number of loop header executions which is the number
784 of latch executions plus one.
785 ??? For UINT_MAX latch executions this number overflows to zero
786 for loops like do { n++; } while (n != 0); */
787 if (niter && !chrec_contains_undetermined (niter))
788 niter = fold_build2 (PLUS_EXPR, TREE_TYPE (niter), unshare_expr (niter),
789 build_int_cst (TREE_TYPE (niter), 1));
790 *number_of_iterations = niter;
792 return cond;
795 /* Function bb_in_loop_p
797 Used as predicate for dfs order traversal of the loop bbs. */
799 static bool
800 bb_in_loop_p (const_basic_block bb, const void *data)
802 const struct loop *const loop = (const struct loop *)data;
803 if (flow_bb_inside_loop_p (loop, bb))
804 return true;
805 return false;
809 /* Create and initialize a new loop_vec_info struct for LOOP_IN, as well as
810 stmt_vec_info structs for all the stmts in LOOP_IN. */
812 _loop_vec_info::_loop_vec_info (struct loop *loop_in)
813 : vec_info (vec_info::loop, init_cost (loop_in)),
814 loop (loop_in),
815 bbs (XCNEWVEC (basic_block, loop->num_nodes)),
816 num_itersm1 (NULL_TREE),
817 num_iters (NULL_TREE),
818 num_iters_unchanged (NULL_TREE),
819 num_iters_assumptions (NULL_TREE),
820 th (0),
821 versioning_threshold (0),
822 vectorization_factor (0),
823 max_vectorization_factor (0),
824 mask_skip_niters (NULL_TREE),
825 mask_compare_type (NULL_TREE),
826 unaligned_dr (NULL),
827 peeling_for_alignment (0),
828 ptr_mask (0),
829 ivexpr_map (NULL),
830 slp_unrolling_factor (1),
831 single_scalar_iteration_cost (0),
832 vectorizable (false),
833 can_fully_mask_p (true),
834 fully_masked_p (false),
835 peeling_for_gaps (false),
836 peeling_for_niter (false),
837 operands_swapped (false),
838 no_data_dependencies (false),
839 has_mask_store (false),
840 scalar_loop (NULL),
841 orig_loop_info (NULL)
843 /* Create/Update stmt_info for all stmts in the loop. */
844 basic_block *body = get_loop_body (loop);
845 for (unsigned int i = 0; i < loop->num_nodes; i++)
847 basic_block bb = body[i];
848 gimple_stmt_iterator si;
850 for (si = gsi_start_phis (bb); !gsi_end_p (si); gsi_next (&si))
852 gimple *phi = gsi_stmt (si);
853 gimple_set_uid (phi, 0);
854 set_vinfo_for_stmt (phi, new_stmt_vec_info (phi, this));
857 for (si = gsi_start_bb (bb); !gsi_end_p (si); gsi_next (&si))
859 gimple *stmt = gsi_stmt (si);
860 gimple_set_uid (stmt, 0);
861 set_vinfo_for_stmt (stmt, new_stmt_vec_info (stmt, this));
864 free (body);
866 /* CHECKME: We want to visit all BBs before their successors (except for
867 latch blocks, for which this assertion wouldn't hold). In the simple
 868 case of the loop forms we allow, a dfs order of the BBs would be the same
869 as reversed postorder traversal, so we are safe. */
871 unsigned int nbbs = dfs_enumerate_from (loop->header, 0, bb_in_loop_p,
872 bbs, loop->num_nodes, loop);
873 gcc_assert (nbbs == loop->num_nodes);
876 /* Free all levels of MASKS. */
878 void
879 release_vec_loop_masks (vec_loop_masks *masks)
881 rgroup_masks *rgm;
882 unsigned int i;
883 FOR_EACH_VEC_ELT (*masks, i, rgm)
884 rgm->masks.release ();
885 masks->release ();
888 /* Free all memory used by the _loop_vec_info, as well as all the
889 stmt_vec_info structs of all the stmts in the loop. */
891 _loop_vec_info::~_loop_vec_info ()
893 int nbbs;
894 gimple_stmt_iterator si;
895 int j;
897 /* ??? We're releasing loop_vinfos en-block. */
898 set_stmt_vec_info_vec (&stmt_vec_infos);
899 nbbs = loop->num_nodes;
900 for (j = 0; j < nbbs; j++)
902 basic_block bb = bbs[j];
903 for (si = gsi_start_phis (bb); !gsi_end_p (si); gsi_next (&si))
904 free_stmt_vec_info (gsi_stmt (si));
906 for (si = gsi_start_bb (bb); !gsi_end_p (si); )
908 gimple *stmt = gsi_stmt (si);
910 /* We may have broken canonical form by moving a constant
911 into RHS1 of a commutative op. Fix such occurrences. */
912 if (operands_swapped && is_gimple_assign (stmt))
914 enum tree_code code = gimple_assign_rhs_code (stmt);
916 if ((code == PLUS_EXPR
917 || code == POINTER_PLUS_EXPR
918 || code == MULT_EXPR)
919 && CONSTANT_CLASS_P (gimple_assign_rhs1 (stmt)))
920 swap_ssa_operands (stmt,
921 gimple_assign_rhs1_ptr (stmt),
922 gimple_assign_rhs2_ptr (stmt));
923 else if (code == COND_EXPR
924 && CONSTANT_CLASS_P (gimple_assign_rhs2 (stmt)))
926 tree cond_expr = gimple_assign_rhs1 (stmt);
927 enum tree_code cond_code = TREE_CODE (cond_expr);
929 if (TREE_CODE_CLASS (cond_code) == tcc_comparison)
931 bool honor_nans = HONOR_NANS (TREE_OPERAND (cond_expr,
932 0));
933 cond_code = invert_tree_comparison (cond_code,
934 honor_nans);
935 if (cond_code != ERROR_MARK)
937 TREE_SET_CODE (cond_expr, cond_code);
938 swap_ssa_operands (stmt,
939 gimple_assign_rhs2_ptr (stmt),
940 gimple_assign_rhs3_ptr (stmt));
946 /* Free stmt_vec_info. */
947 free_stmt_vec_info (stmt);
948 gsi_next (&si);
952 free (bbs);
954 release_vec_loop_masks (&masks);
955 delete ivexpr_map;
957 loop->aux = NULL;
960 /* Return an invariant or register for EXPR and emit necessary
961 computations in the LOOP_VINFO loop preheader. */
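/* Usage sketch (illustration only): if EXPR is already an SSA name or a
   gimple invariant it is returned unchanged.  Otherwise, for an invariant
   expression such as n_5 * 16, a statement along the lines of

       _7 = n_5 * 16;

   is emitted on the loop preheader edge and _7 is returned; the result is
   cached in ivexpr_map so later requests for the same expression reuse
   the same SSA name.  */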
963 tree
964 cse_and_gimplify_to_preheader (loop_vec_info loop_vinfo, tree expr)
966 if (is_gimple_reg (expr)
967 || is_gimple_min_invariant (expr))
968 return expr;
970 if (! loop_vinfo->ivexpr_map)
971 loop_vinfo->ivexpr_map = new hash_map<tree_operand_hash, tree>;
972 tree &cached = loop_vinfo->ivexpr_map->get_or_insert (expr);
973 if (! cached)
975 gimple_seq stmts = NULL;
976 cached = force_gimple_operand (unshare_expr (expr),
977 &stmts, true, NULL_TREE);
978 if (stmts)
980 edge e = loop_preheader_edge (LOOP_VINFO_LOOP (loop_vinfo));
981 gsi_insert_seq_on_edge_immediate (e, stmts);
984 return cached;
987 /* Return true if we can use CMP_TYPE as the comparison type to produce
988 all masks required to mask LOOP_VINFO. */
990 static bool
991 can_produce_all_loop_masks_p (loop_vec_info loop_vinfo, tree cmp_type)
993 rgroup_masks *rgm;
994 unsigned int i;
995 FOR_EACH_VEC_ELT (LOOP_VINFO_MASKS (loop_vinfo), i, rgm)
996 if (rgm->mask_type != NULL_TREE
997 && !direct_internal_fn_supported_p (IFN_WHILE_ULT,
998 cmp_type, rgm->mask_type,
999 OPTIMIZE_FOR_SPEED))
1000 return false;
1001 return true;
1004 /* Calculate the maximum number of scalars per iteration for every
1005 rgroup in LOOP_VINFO. */
1007 static unsigned int
1008 vect_get_max_nscalars_per_iter (loop_vec_info loop_vinfo)
1010 unsigned int res = 1;
1011 unsigned int i;
1012 rgroup_masks *rgm;
1013 FOR_EACH_VEC_ELT (LOOP_VINFO_MASKS (loop_vinfo), i, rgm)
1014 res = MAX (res, rgm->max_nscalars_per_iter);
1015 return res;
1018 /* Each statement in LOOP_VINFO can be masked where necessary. Check
1019 whether we can actually generate the masks required. Return true if so,
1020 storing the type of the scalar IV in LOOP_VINFO_MASK_COMPARE_TYPE. */
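/* Conceptual sketch (an informal reading of IFN_WHILE_ULT, not a
   definition): a fully-masked loop replaces the scalar tail with
   per-element masks.  For a mask of NUNITS elements, WHILE_ULT (index,
   limit) behaves roughly like

       for (unsigned int j = 0; j < nunits; j++)
         mask[j] = (index + j < limit);

   so the final iteration simply executes under a partially-true mask
   instead of needing a scalar epilogue.  The comparison type chosen
   below must be wide enough to represent the largest index/limit
   value.  */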
1022 static bool
1023 vect_verify_full_masking (loop_vec_info loop_vinfo)
1025 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1026 unsigned int min_ni_width;
1028 /* Use a normal loop if there are no statements that need masking.
1029 This only happens in rare degenerate cases: it means that the loop
1030 has no loads, no stores, and no live-out values. */
1031 if (LOOP_VINFO_MASKS (loop_vinfo).is_empty ())
1032 return false;
1034 /* Get the maximum number of iterations that is representable
1035 in the counter type. */
1036 tree ni_type = TREE_TYPE (LOOP_VINFO_NITERSM1 (loop_vinfo));
1037 widest_int max_ni = wi::to_widest (TYPE_MAX_VALUE (ni_type)) + 1;
1039 /* Get a more refined estimate for the number of iterations. */
1040 widest_int max_back_edges;
1041 if (max_loop_iterations (loop, &max_back_edges))
1042 max_ni = wi::smin (max_ni, max_back_edges + 1);
1044 /* Account for rgroup masks, in which each bit is replicated N times. */
1045 max_ni *= vect_get_max_nscalars_per_iter (loop_vinfo);
1047 /* Work out how many bits we need to represent the limit. */
1048 min_ni_width = wi::min_precision (max_ni, UNSIGNED);
1050 /* Find a scalar mode for which WHILE_ULT is supported. */
1051 opt_scalar_int_mode cmp_mode_iter;
1052 tree cmp_type = NULL_TREE;
1053 FOR_EACH_MODE_IN_CLASS (cmp_mode_iter, MODE_INT)
1055 unsigned int cmp_bits = GET_MODE_BITSIZE (cmp_mode_iter.require ());
1056 if (cmp_bits >= min_ni_width
1057 && targetm.scalar_mode_supported_p (cmp_mode_iter.require ()))
1059 tree this_type = build_nonstandard_integer_type (cmp_bits, true);
1060 if (this_type
1061 && can_produce_all_loop_masks_p (loop_vinfo, this_type))
1063 /* Although we could stop as soon as we find a valid mode,
1064 it's often better to continue until we hit Pmode, since the
1065 operands to the WHILE are more likely to be reusable in
1066 address calculations. */
1067 cmp_type = this_type;
1068 if (cmp_bits >= GET_MODE_BITSIZE (Pmode))
1069 break;
1074 if (!cmp_type)
1075 return false;
1077 LOOP_VINFO_MASK_COMPARE_TYPE (loop_vinfo) = cmp_type;
1078 return true;
1081 /* Calculate the cost of one scalar iteration of the loop. */
1082 static void
1083 vect_compute_single_scalar_iteration_cost (loop_vec_info loop_vinfo)
1085 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1086 basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
1087 int nbbs = loop->num_nodes, factor;
1088 int innerloop_iters, i;
1090 /* Gather costs for statements in the scalar loop. */
1092 /* FORNOW. */
1093 innerloop_iters = 1;
1094 if (loop->inner)
1095 innerloop_iters = 50; /* FIXME */
1097 for (i = 0; i < nbbs; i++)
1099 gimple_stmt_iterator si;
1100 basic_block bb = bbs[i];
1102 if (bb->loop_father == loop->inner)
1103 factor = innerloop_iters;
1104 else
1105 factor = 1;
1107 for (si = gsi_start_bb (bb); !gsi_end_p (si); gsi_next (&si))
1109 gimple *stmt = gsi_stmt (si);
1110 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
1112 if (!is_gimple_assign (stmt) && !is_gimple_call (stmt))
1113 continue;
1115 /* Skip stmts that are not vectorized inside the loop. */
1116 if (stmt_info
1117 && !STMT_VINFO_RELEVANT_P (stmt_info)
1118 && (!STMT_VINFO_LIVE_P (stmt_info)
1119 || !VECTORIZABLE_CYCLE_DEF (STMT_VINFO_DEF_TYPE (stmt_info)))
1120 && !STMT_VINFO_IN_PATTERN_P (stmt_info))
1121 continue;
1123 vect_cost_for_stmt kind;
1124 if (STMT_VINFO_DATA_REF (stmt_info))
1126 if (DR_IS_READ (STMT_VINFO_DATA_REF (stmt_info)))
1127 kind = scalar_load;
1128 else
1129 kind = scalar_store;
1131 else
1132 kind = scalar_stmt;
1134 record_stmt_cost (&LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo),
1135 factor, kind, stmt_info, 0, vect_prologue);
1139 /* Now accumulate cost. */
1140 void *target_cost_data = init_cost (loop);
1141 stmt_info_for_cost *si;
1142 int j;
1143 FOR_EACH_VEC_ELT (LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo),
1144 j, si)
1146 struct _stmt_vec_info *stmt_info
1147 = si->stmt ? vinfo_for_stmt (si->stmt) : NULL;
1148 (void) add_stmt_cost (target_cost_data, si->count,
1149 si->kind, stmt_info, si->misalign,
1150 vect_body);
1152 unsigned dummy, body_cost = 0;
1153 finish_cost (target_cost_data, &dummy, &body_cost, &dummy);
1154 destroy_cost_data (target_cost_data);
1155 LOOP_VINFO_SINGLE_SCALAR_ITERATION_COST (loop_vinfo) = body_cost;
1159 /* Function vect_analyze_loop_form_1.
1161 Verify that certain CFG restrictions hold, including:
1162 - the loop has a pre-header
1163 - the loop has a single entry and exit
1164 - the loop exit condition is simple enough
1165 - the number of iterations can be analyzed, i.e, a countable loop. The
1166 niter could be analyzed under some assumptions. */
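/* Illustration (not from the original comment): a countable, single-exit
   loop such as

       for (i = 0; i < n; i++)
         a[i] = b[i] * 2;

   satisfies these restrictions, whereas a loop whose body contains an
   early exit, e.g.

       for (i = 0; i < n; i++)
         {
           if (b[i] < 0)
             break;
           a[i] = b[i] * 2;
         }

   has more than one exit (and more than two basic blocks) and is
   rejected below.  */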
1168 bool
1169 vect_analyze_loop_form_1 (struct loop *loop, gcond **loop_cond,
1170 tree *assumptions, tree *number_of_iterationsm1,
1171 tree *number_of_iterations, gcond **inner_loop_cond)
1173 if (dump_enabled_p ())
1174 dump_printf_loc (MSG_NOTE, vect_location,
1175 "=== vect_analyze_loop_form ===\n");
1177 /* Different restrictions apply when we are considering an inner-most loop,
1178 vs. an outer (nested) loop.
1179 (FORNOW. May want to relax some of these restrictions in the future). */
1181 if (!loop->inner)
1183 /* Inner-most loop. We currently require that the number of BBs is
1184 exactly 2 (the header and latch). Vectorizable inner-most loops
1185 look like this:
1187 (pre-header)
1189 header <--------+
1190 | | |
1191 | +--> latch --+
1193 (exit-bb) */
1195 if (loop->num_nodes != 2)
1197 if (dump_enabled_p ())
1198 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1199 "not vectorized: control flow in loop.\n");
1200 return false;
1203 if (empty_block_p (loop->header))
1205 if (dump_enabled_p ())
1206 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1207 "not vectorized: empty loop.\n");
1208 return false;
1211 else
1213 struct loop *innerloop = loop->inner;
1214 edge entryedge;
1216 /* Nested loop. We currently require that the loop is doubly-nested,
1217 contains a single inner loop, and the number of BBs is exactly 5.
1218 Vectorizable outer-loops look like this:
1220 (pre-header)
1222 header <---+
1224 inner-loop |
1226 tail ------+
1228 (exit-bb)
1230 The inner-loop has the properties expected of inner-most loops
1231 as described above. */
1233 if ((loop->inner)->inner || (loop->inner)->next)
1235 if (dump_enabled_p ())
1236 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1237 "not vectorized: multiple nested loops.\n");
1238 return false;
1241 if (loop->num_nodes != 5)
1243 if (dump_enabled_p ())
1244 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1245 "not vectorized: control flow in loop.\n");
1246 return false;
1249 entryedge = loop_preheader_edge (innerloop);
1250 if (entryedge->src != loop->header
1251 || !single_exit (innerloop)
1252 || single_exit (innerloop)->dest != EDGE_PRED (loop->latch, 0)->src)
1254 if (dump_enabled_p ())
1255 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1256 "not vectorized: unsupported outerloop form.\n");
1257 return false;
1260 /* Analyze the inner-loop. */
1261 tree inner_niterm1, inner_niter, inner_assumptions;
1262 if (! vect_analyze_loop_form_1 (loop->inner, inner_loop_cond,
1263 &inner_assumptions, &inner_niterm1,
1264 &inner_niter, NULL)
1265 /* Don't support analyzing niter under assumptions for inner
1266 loop. */
1267 || !integer_onep (inner_assumptions))
1269 if (dump_enabled_p ())
1270 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1271 "not vectorized: Bad inner loop.\n");
1272 return false;
1275 if (!expr_invariant_in_loop_p (loop, inner_niter))
1277 if (dump_enabled_p ())
1278 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1279 "not vectorized: inner-loop count not"
1280 " invariant.\n");
1281 return false;
1284 if (dump_enabled_p ())
1285 dump_printf_loc (MSG_NOTE, vect_location,
1286 "Considering outer-loop vectorization.\n");
1289 if (!single_exit (loop)
1290 || EDGE_COUNT (loop->header->preds) != 2)
1292 if (dump_enabled_p ())
1294 if (!single_exit (loop))
1295 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1296 "not vectorized: multiple exits.\n");
1297 else if (EDGE_COUNT (loop->header->preds) != 2)
1298 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1299 "not vectorized: too many incoming edges.\n");
1301 return false;
 1304 /* We assume that the loop exit condition is at the end of the loop, i.e.,
1305 that the loop is represented as a do-while (with a proper if-guard
1306 before the loop if needed), where the loop header contains all the
1307 executable statements, and the latch is empty. */
1308 if (!empty_block_p (loop->latch)
1309 || !gimple_seq_empty_p (phi_nodes (loop->latch)))
1311 if (dump_enabled_p ())
1312 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1313 "not vectorized: latch block not empty.\n");
1314 return false;
1317 /* Make sure the exit is not abnormal. */
1318 edge e = single_exit (loop);
1319 if (e->flags & EDGE_ABNORMAL)
1321 if (dump_enabled_p ())
1322 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1323 "not vectorized: abnormal loop exit edge.\n");
1324 return false;
1327 *loop_cond = vect_get_loop_niters (loop, assumptions, number_of_iterations,
1328 number_of_iterationsm1);
1329 if (!*loop_cond)
1331 if (dump_enabled_p ())
1332 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1333 "not vectorized: complicated exit condition.\n");
1334 return false;
1337 if (integer_zerop (*assumptions)
1338 || !*number_of_iterations
1339 || chrec_contains_undetermined (*number_of_iterations))
1341 if (dump_enabled_p ())
1342 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1343 "not vectorized: number of iterations cannot be "
1344 "computed.\n");
1345 return false;
1348 if (integer_zerop (*number_of_iterations))
1350 if (dump_enabled_p ())
1351 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1352 "not vectorized: number of iterations = 0.\n");
1353 return false;
1356 return true;
1359 /* Analyze LOOP form and return a loop_vec_info if it is of suitable form. */
1361 loop_vec_info
1362 vect_analyze_loop_form (struct loop *loop)
1364 tree assumptions, number_of_iterations, number_of_iterationsm1;
1365 gcond *loop_cond, *inner_loop_cond = NULL;
1367 if (! vect_analyze_loop_form_1 (loop, &loop_cond,
1368 &assumptions, &number_of_iterationsm1,
1369 &number_of_iterations, &inner_loop_cond))
1370 return NULL;
1372 loop_vec_info loop_vinfo = new _loop_vec_info (loop);
1373 LOOP_VINFO_NITERSM1 (loop_vinfo) = number_of_iterationsm1;
1374 LOOP_VINFO_NITERS (loop_vinfo) = number_of_iterations;
1375 LOOP_VINFO_NITERS_UNCHANGED (loop_vinfo) = number_of_iterations;
1376 if (!integer_onep (assumptions))
1378 /* We consider to vectorize this loop by versioning it under
1379 some assumptions. In order to do this, we need to clear
1380 existing information computed by scev and niter analyzer. */
1381 scev_reset_htab ();
1382 free_numbers_of_iterations_estimates (loop);
1383 /* Also set flag for this loop so that following scev and niter
1384 analysis are done under the assumptions. */
1385 loop_constraint_set (loop, LOOP_C_FINITE);
1386 /* Also record the assumptions for versioning. */
1387 LOOP_VINFO_NITERS_ASSUMPTIONS (loop_vinfo) = assumptions;
1390 if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
1392 if (dump_enabled_p ())
1394 dump_printf_loc (MSG_NOTE, vect_location,
1395 "Symbolic number of iterations is ");
1396 dump_generic_expr (MSG_NOTE, TDF_DETAILS, number_of_iterations);
1397 dump_printf (MSG_NOTE, "\n");
1401 STMT_VINFO_TYPE (vinfo_for_stmt (loop_cond)) = loop_exit_ctrl_vec_info_type;
1402 if (inner_loop_cond)
1403 STMT_VINFO_TYPE (vinfo_for_stmt (inner_loop_cond))
1404 = loop_exit_ctrl_vec_info_type;
1406 gcc_assert (!loop->aux);
1407 loop->aux = loop_vinfo;
1408 return loop_vinfo;
1413 /* Scan the loop stmts and dependent on whether there are any (non-)SLP
1414 statements update the vectorization factor. */
1416 static void
1417 vect_update_vf_for_slp (loop_vec_info loop_vinfo)
1419 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1420 basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
1421 int nbbs = loop->num_nodes;
1422 poly_uint64 vectorization_factor;
1423 int i;
1425 if (dump_enabled_p ())
1426 dump_printf_loc (MSG_NOTE, vect_location,
1427 "=== vect_update_vf_for_slp ===\n");
1429 vectorization_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
1430 gcc_assert (known_ne (vectorization_factor, 0U));
1432 /* If all the stmts in the loop can be SLPed, we perform only SLP, and
1433 vectorization factor of the loop is the unrolling factor required by
 1434 the SLP instances. If that unrolling factor is 1, we say that we
 1435 perform pure SLP on the loop - cross-iteration parallelism is not
1436 exploited. */
1437 bool only_slp_in_loop = true;
1438 for (i = 0; i < nbbs; i++)
1440 basic_block bb = bbs[i];
1441 for (gimple_stmt_iterator si = gsi_start_bb (bb); !gsi_end_p (si);
1442 gsi_next (&si))
1444 gimple *stmt = gsi_stmt (si);
1445 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
1446 if (STMT_VINFO_IN_PATTERN_P (stmt_info)
1447 && STMT_VINFO_RELATED_STMT (stmt_info))
1449 stmt = STMT_VINFO_RELATED_STMT (stmt_info);
1450 stmt_info = vinfo_for_stmt (stmt);
1452 if ((STMT_VINFO_RELEVANT_P (stmt_info)
1453 || VECTORIZABLE_CYCLE_DEF (STMT_VINFO_DEF_TYPE (stmt_info)))
1454 && !PURE_SLP_STMT (stmt_info))
1455 /* STMT needs both SLP and loop-based vectorization. */
1456 only_slp_in_loop = false;
1460 if (only_slp_in_loop)
1462 dump_printf_loc (MSG_NOTE, vect_location,
1463 "Loop contains only SLP stmts\n");
1464 vectorization_factor = LOOP_VINFO_SLP_UNROLLING_FACTOR (loop_vinfo);
1466 else
1468 dump_printf_loc (MSG_NOTE, vect_location,
1469 "Loop contains SLP and non-SLP stmts\n");
1470 /* Both the vectorization factor and unroll factor have the form
1471 current_vector_size * X for some rational X, so they must have
1472 a common multiple. */
1473 vectorization_factor
1474 = force_common_multiple (vectorization_factor,
1475 LOOP_VINFO_SLP_UNROLLING_FACTOR (loop_vinfo));
1478 LOOP_VINFO_VECT_FACTOR (loop_vinfo) = vectorization_factor;
1479 if (dump_enabled_p ())
1481 dump_printf_loc (MSG_NOTE, vect_location,
1482 "Updating vectorization factor to ");
1483 dump_dec (MSG_NOTE, vectorization_factor);
1484 dump_printf (MSG_NOTE, ".\n");
1488 /* Return true if STMT_INFO describes a double reduction phi and if
1489 the other phi in the reduction is also relevant for vectorization.
1490 This rejects cases such as:
1492 outer1:
1493 x_1 = PHI <x_3(outer2), ...>;
1496 inner:
1497 x_2 = ...;
1500 outer2:
1501 x_3 = PHI <x_2(inner)>;
1503 if nothing in x_2 or elsewhere makes x_1 relevant. */
1505 static bool
1506 vect_active_double_reduction_p (stmt_vec_info stmt_info)
1508 if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_double_reduction_def)
1509 return false;
1511 gimple *other_phi = STMT_VINFO_REDUC_DEF (stmt_info);
1512 return STMT_VINFO_RELEVANT_P (vinfo_for_stmt (other_phi));
1515 /* Function vect_analyze_loop_operations.
1517 Scan the loop stmts and make sure they are all vectorizable. */
1519 static bool
1520 vect_analyze_loop_operations (loop_vec_info loop_vinfo)
1522 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1523 basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
1524 int nbbs = loop->num_nodes;
1525 int i;
1526 stmt_vec_info stmt_info;
1527 bool need_to_vectorize = false;
1528 bool ok;
1530 if (dump_enabled_p ())
1531 dump_printf_loc (MSG_NOTE, vect_location,
1532 "=== vect_analyze_loop_operations ===\n");
1534 stmt_vector_for_cost cost_vec;
1535 cost_vec.create (2);
1537 for (i = 0; i < nbbs; i++)
1539 basic_block bb = bbs[i];
1541 for (gphi_iterator si = gsi_start_phis (bb); !gsi_end_p (si);
1542 gsi_next (&si))
1544 gphi *phi = si.phi ();
1545 ok = true;
1547 stmt_info = vinfo_for_stmt (phi);
1548 if (dump_enabled_p ())
1550 dump_printf_loc (MSG_NOTE, vect_location, "examining phi: ");
1551 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, phi, 0);
1553 if (virtual_operand_p (gimple_phi_result (phi)))
1554 continue;
1556 /* Inner-loop loop-closed exit phi in outer-loop vectorization
1557 (i.e., a phi in the tail of the outer-loop). */
1558 if (! is_loop_header_bb_p (bb))
1560 /* FORNOW: we currently don't support the case that these phis
1561 are not used in the outerloop (unless it is double reduction,
1562 i.e., this phi is vect_reduction_def), cause this case
1563 requires to actually do something here. */
1564 if (STMT_VINFO_LIVE_P (stmt_info)
1565 && !vect_active_double_reduction_p (stmt_info))
1567 if (dump_enabled_p ())
1568 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1569 "Unsupported loop-closed phi in "
1570 "outer-loop.\n");
1571 return false;
1574 /* If PHI is used in the outer loop, we check that its operand
1575 is defined in the inner loop. */
1576 if (STMT_VINFO_RELEVANT_P (stmt_info))
1578 tree phi_op;
1579 gimple *op_def_stmt;
1581 if (gimple_phi_num_args (phi) != 1)
1582 return false;
1584 phi_op = PHI_ARG_DEF (phi, 0);
1585 if (TREE_CODE (phi_op) != SSA_NAME)
1586 return false;
1588 op_def_stmt = SSA_NAME_DEF_STMT (phi_op);
1589 if (gimple_nop_p (op_def_stmt)
1590 || !flow_bb_inside_loop_p (loop, gimple_bb (op_def_stmt))
1591 || !vinfo_for_stmt (op_def_stmt))
1592 return false;
1594 if (STMT_VINFO_RELEVANT (vinfo_for_stmt (op_def_stmt))
1595 != vect_used_in_outer
1596 && STMT_VINFO_RELEVANT (vinfo_for_stmt (op_def_stmt))
1597 != vect_used_in_outer_by_reduction)
1598 return false;
1601 continue;
1604 gcc_assert (stmt_info);
1606 if ((STMT_VINFO_RELEVANT (stmt_info) == vect_used_in_scope
1607 || STMT_VINFO_LIVE_P (stmt_info))
1608 && STMT_VINFO_DEF_TYPE (stmt_info) != vect_induction_def)
1610 /* A scalar-dependence cycle that we don't support. */
1611 if (dump_enabled_p ())
1612 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1613 "not vectorized: scalar dependence cycle.\n");
1614 return false;
1617 if (STMT_VINFO_RELEVANT_P (stmt_info))
1619 need_to_vectorize = true;
1620 if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_induction_def
1621 && ! PURE_SLP_STMT (stmt_info))
1622 ok = vectorizable_induction (phi, NULL, NULL, NULL, &cost_vec);
1623 else if ((STMT_VINFO_DEF_TYPE (stmt_info) == vect_reduction_def
1624 || STMT_VINFO_DEF_TYPE (stmt_info) == vect_nested_cycle)
1625 && ! PURE_SLP_STMT (stmt_info))
1626 ok = vectorizable_reduction (phi, NULL, NULL, NULL, NULL,
1627 &cost_vec);
1630 /* SLP PHIs are tested by vect_slp_analyze_node_operations. */
1631 if (ok
1632 && STMT_VINFO_LIVE_P (stmt_info)
1633 && !PURE_SLP_STMT (stmt_info))
1634 ok = vectorizable_live_operation (phi, NULL, NULL, -1, NULL,
1635 &cost_vec);
1637 if (!ok)
1639 if (dump_enabled_p ())
1641 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1642 "not vectorized: relevant phi not "
1643 "supported: ");
1644 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, phi, 0);
1646 return false;
1650 for (gimple_stmt_iterator si = gsi_start_bb (bb); !gsi_end_p (si);
1651 gsi_next (&si))
1653 gimple *stmt = gsi_stmt (si);
1654 if (!gimple_clobber_p (stmt)
1655 && !vect_analyze_stmt (stmt, &need_to_vectorize, NULL, NULL,
1656 &cost_vec))
1657 return false;
1659 } /* bbs */
1661 add_stmt_costs (loop_vinfo->target_cost_data, &cost_vec);
1662 cost_vec.release ();
 1664 /* All operations in the loop are either irrelevant (they deal with loop
 1665 control, or are dead), or are only used outside the loop and can be moved
1666 out of the loop (e.g. invariants, inductions). The loop can be
1667 optimized away by scalar optimizations. We're better off not
1668 touching this loop. */
1669 if (!need_to_vectorize)
1671 if (dump_enabled_p ())
1672 dump_printf_loc (MSG_NOTE, vect_location,
1673 "All the computation can be taken out of the loop.\n");
1674 if (dump_enabled_p ())
1675 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1676 "not vectorized: redundant loop. no profit to "
1677 "vectorize.\n");
1678 return false;
1681 return true;
1684 /* Analyze the cost of the loop described by LOOP_VINFO. Decide if it
1685 is worthwhile to vectorize. Return 1 if definitely yes, 0 if
1686 definitely no, or -1 if it's worth retrying. */
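/* Worked example (illustrative numbers only): suppose the cost model
   returns min_profitable_iters = 12, PARAM_MIN_VECT_LOOP_BOUND is 0 and
   the assumed VF is 4.  Then min_scalar_loop_bound = 0 * 4 = 0 and the
   threshold is th = MAX (0, 12) = 12, so a loop known to execute only 8
   iterations is rejected as unprofitable, while one estimated to execute
   1000 iterations passes this check.  */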
1688 static int
1689 vect_analyze_loop_costing (loop_vec_info loop_vinfo)
1691 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1692 unsigned int assumed_vf = vect_vf_for_cost (loop_vinfo);
1694 /* Only fully-masked loops can have iteration counts less than the
1695 vectorization factor. */
1696 if (!LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
1698 HOST_WIDE_INT max_niter;
1700 if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
1701 max_niter = LOOP_VINFO_INT_NITERS (loop_vinfo);
1702 else
1703 max_niter = max_stmt_executions_int (loop);
1705 if (max_niter != -1
1706 && (unsigned HOST_WIDE_INT) max_niter < assumed_vf)
1708 if (dump_enabled_p ())
1709 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1710 "not vectorized: iteration count smaller than "
1711 "vectorization factor.\n");
1712 return 0;
1716 int min_profitable_iters, min_profitable_estimate;
1717 vect_estimate_min_profitable_iters (loop_vinfo, &min_profitable_iters,
1718 &min_profitable_estimate);
1720 if (min_profitable_iters < 0)
1722 if (dump_enabled_p ())
1723 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1724 "not vectorized: vectorization not profitable.\n");
1725 if (dump_enabled_p ())
1726 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1727 "not vectorized: vector version will never be "
1728 "profitable.\n");
1729 return -1;
1732 int min_scalar_loop_bound = (PARAM_VALUE (PARAM_MIN_VECT_LOOP_BOUND)
1733 * assumed_vf);
1735 /* Use the cost model only if it is more conservative than user specified
1736 threshold. */
1737 unsigned int th = (unsigned) MAX (min_scalar_loop_bound,
1738 min_profitable_iters);
1740 LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo) = th;
1742 if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
1743 && LOOP_VINFO_INT_NITERS (loop_vinfo) < th)
1745 if (dump_enabled_p ())
1746 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1747 "not vectorized: vectorization not profitable.\n");
1748 if (dump_enabled_p ())
1749 dump_printf_loc (MSG_NOTE, vect_location,
1750 "not vectorized: iteration count smaller than user "
1751 "specified loop bound parameter or minimum profitable "
1752 "iterations (whichever is more conservative).\n");
1753 return 0;
1756 HOST_WIDE_INT estimated_niter = estimated_stmt_executions_int (loop);
1757 if (estimated_niter == -1)
1758 estimated_niter = likely_max_stmt_executions_int (loop);
1759 if (estimated_niter != -1
1760 && ((unsigned HOST_WIDE_INT) estimated_niter
1761 < MAX (th, (unsigned) min_profitable_estimate)))
1763 if (dump_enabled_p ())
1764 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1765 "not vectorized: estimated iteration count too "
1766 "small.\n");
1767 if (dump_enabled_p ())
1768 dump_printf_loc (MSG_NOTE, vect_location,
1769 "not vectorized: estimated iteration count smaller "
1770 "than specified loop bound parameter or minimum "
1771 "profitable iterations (whichever is more "
1772 "conservative).\n");
1773 return -1;
1776 return 1;
1779 static bool
1780 vect_get_datarefs_in_loop (loop_p loop, basic_block *bbs,
1781 vec<data_reference_p> *datarefs,
1782 unsigned int *n_stmts)
1784 *n_stmts = 0;
1785 for (unsigned i = 0; i < loop->num_nodes; i++)
1786 for (gimple_stmt_iterator gsi = gsi_start_bb (bbs[i]);
1787 !gsi_end_p (gsi); gsi_next (&gsi))
1789 gimple *stmt = gsi_stmt (gsi);
1790 if (is_gimple_debug (stmt))
1791 continue;
1792 ++(*n_stmts);
1793 if (!vect_find_stmt_data_reference (loop, stmt, datarefs))
1795 if (is_gimple_call (stmt) && loop->safelen)
1797 tree fndecl = gimple_call_fndecl (stmt), op;
1798 if (fndecl != NULL_TREE)
1800 cgraph_node *node = cgraph_node::get (fndecl);
1801 if (node != NULL && node->simd_clones != NULL)
1803 unsigned int j, n = gimple_call_num_args (stmt);
1804 for (j = 0; j < n; j++)
1806 op = gimple_call_arg (stmt, j);
1807 if (DECL_P (op)
1808 || (REFERENCE_CLASS_P (op)
1809 && get_base_address (op)))
1810 break;
1812 op = gimple_call_lhs (stmt);
1813 /* Ignore #pragma omp declare simd functions
1814 if they don't have data references in the
1815 call stmt itself. */
1816 if (j == n
1817 && !(op
1818 && (DECL_P (op)
1819 || (REFERENCE_CLASS_P (op)
1820 && get_base_address (op)))))
1821 continue;
1825 return false;
1828 return true;
1831 /* Function vect_analyze_loop_2.
1833 Apply a set of analyses on LOOP, and create a loop_vec_info struct
1834 for it. The different analyses will record information in the
1835 loop_vec_info struct. */
1836 static bool
1837 vect_analyze_loop_2 (loop_vec_info loop_vinfo, bool &fatal)
1839 bool ok;
1840 int res;
1841 unsigned int max_vf = MAX_VECTORIZATION_FACTOR;
1842 poly_uint64 min_vf = 2;
1844 /* The first group of checks is independent of the vector size. */
1845 fatal = true;
1847 /* Find all data references in the loop (which correspond to vdefs/vuses)
1848 and analyze their evolution in the loop. */
1850 loop_p loop = LOOP_VINFO_LOOP (loop_vinfo);
1851 if (!find_loop_nest (loop, &LOOP_VINFO_LOOP_NEST (loop_vinfo)))
1853 if (dump_enabled_p ())
1854 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1855 "not vectorized: loop nest containing two "
1856 "or more consecutive inner loops cannot be "
1857 "vectorized\n");
1858 return false;
1861 /* Gather the data references and count stmts in the loop. */
1862 unsigned int n_stmts;
1863 if (!vect_get_datarefs_in_loop (loop, LOOP_VINFO_BBS (loop_vinfo),
1864 &LOOP_VINFO_DATAREFS (loop_vinfo),
1865 &n_stmts))
1867 if (dump_enabled_p ())
1868 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1869 "not vectorized: loop contains function "
1870 "calls or data references that cannot "
1871 "be analyzed\n");
1872 return false;
1875 /* Analyze the data references and also adjust the minimal
1876 vectorization factor according to the loads and stores. */
1878 ok = vect_analyze_data_refs (loop_vinfo, &min_vf);
1879 if (!ok)
1881 if (dump_enabled_p ())
1882 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1883 "bad data references.\n");
1884 return false;
1887 /* Classify all cross-iteration scalar data-flow cycles.
1888 Cross-iteration cycles caused by virtual phis are analyzed separately. */
1889 vect_analyze_scalar_cycles (loop_vinfo);
1891 vect_pattern_recog (loop_vinfo);
1893 vect_fixup_scalar_cycles_with_patterns (loop_vinfo);
1895 /* Analyze the access patterns of the data-refs in the loop (consecutive,
1896 complex, etc.). FORNOW: Only handle consecutive access pattern. */
1898 ok = vect_analyze_data_ref_accesses (loop_vinfo);
1899 if (!ok)
1901 if (dump_enabled_p ())
1902 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1903 "bad data access.\n");
1904 return false;
1907 /* Data-flow analysis to detect stmts that do not need to be vectorized. */
1909 ok = vect_mark_stmts_to_be_vectorized (loop_vinfo);
1910 if (!ok)
1912 if (dump_enabled_p ())
1913 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1914 "unexpected pattern.\n");
1915 return false;
1918 /* The rest of the analysis below depends on the vector size in some way, so from here on a failure is not fatal (we may retry with a different vector size). */
1919 fatal = false;
1921 /* Analyze data dependences between the data-refs in the loop
1922 and adjust the maximum vectorization factor according to
1923 the dependences.
1924 FORNOW: fail at the first data dependence that we encounter. */
1926 ok = vect_analyze_data_ref_dependences (loop_vinfo, &max_vf);
1927 if (!ok
1928 || (max_vf != MAX_VECTORIZATION_FACTOR
1929 && maybe_lt (max_vf, min_vf)))
1931 if (dump_enabled_p ())
1932 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1933 "bad data dependence.\n");
1934 return false;
1936 LOOP_VINFO_MAX_VECT_FACTOR (loop_vinfo) = max_vf;
1938 ok = vect_determine_vectorization_factor (loop_vinfo);
1939 if (!ok)
1941 if (dump_enabled_p ())
1942 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1943 "can't determine vectorization factor.\n");
1944 return false;
1946 if (max_vf != MAX_VECTORIZATION_FACTOR
1947 && maybe_lt (max_vf, LOOP_VINFO_VECT_FACTOR (loop_vinfo)))
1949 if (dump_enabled_p ())
1950 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1951 "bad data dependence.\n");
1952 return false;
1955 /* Compute the scalar iteration cost. */
1956 vect_compute_single_scalar_iteration_cost (loop_vinfo);
1958 poly_uint64 saved_vectorization_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
1959 unsigned th;
1961 /* Check the SLP opportunities in the loop, analyze and build SLP trees. */
1962 ok = vect_analyze_slp (loop_vinfo, n_stmts);
1963 if (!ok)
1964 return false;
1966 /* If there are any SLP instances mark them as pure_slp. */
1967 bool slp = vect_make_slp_decision (loop_vinfo);
1968 if (slp)
1970 /* Find stmts that need to be both vectorized and SLPed. */
1971 vect_detect_hybrid_slp (loop_vinfo);
1973 /* Update the vectorization factor based on the SLP decision. */
1974 vect_update_vf_for_slp (loop_vinfo);
1977 bool saved_can_fully_mask_p = LOOP_VINFO_CAN_FULLY_MASK_P (loop_vinfo);
1979 /* We don't expect to have to roll back to anything other than an empty
1980 set of rgroups. */
1981 gcc_assert (LOOP_VINFO_MASKS (loop_vinfo).is_empty ());
1983 /* This is the point where we can re-start analysis with SLP forced off. */
1984 start_over:
1986 /* Now the vectorization factor is final. */
1987 poly_uint64 vectorization_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
1988 gcc_assert (known_ne (vectorization_factor, 0U));
1990 if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo) && dump_enabled_p ())
1992 dump_printf_loc (MSG_NOTE, vect_location,
1993 "vectorization_factor = ");
1994 dump_dec (MSG_NOTE, vectorization_factor);
1995 dump_printf (MSG_NOTE, ", niters = " HOST_WIDE_INT_PRINT_DEC "\n",
1996 LOOP_VINFO_INT_NITERS (loop_vinfo));
1999 HOST_WIDE_INT max_niter
2000 = likely_max_stmt_executions_int (LOOP_VINFO_LOOP (loop_vinfo));
2002 /* Analyze the alignment of the data-refs in the loop.
2003 Fail if a data reference is found that cannot be vectorized. */
2005 ok = vect_analyze_data_refs_alignment (loop_vinfo);
2006 if (!ok)
2008 if (dump_enabled_p ())
2009 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2010 "bad data alignment.\n");
2011 return false;
2014 /* Prune the list of ddrs to be tested at run-time by versioning for alias.
2015 It is important to call pruning after vect_analyze_data_ref_accesses,
2016 since we use grouping information gathered by interleaving analysis. */
2017 ok = vect_prune_runtime_alias_test_list (loop_vinfo);
2018 if (!ok)
2019 return false;
2021 /* Do not invoke vect_enhance_data_refs_alignment for epilogue
2022 vectorization. */
2023 if (!LOOP_VINFO_EPILOGUE_P (loop_vinfo))
2025 /* This pass will decide on using loop versioning and/or loop peeling in
2026 order to enhance the alignment of data references in the loop. */
2027 ok = vect_enhance_data_refs_alignment (loop_vinfo);
2028 if (!ok)
2030 if (dump_enabled_p ())
2031 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2032 "bad data alignment.\n");
2033 return false;
2037 if (slp)
2039 /* Analyze operations in the SLP instances. Note this may
2040 remove unsupported SLP instances which makes the above
2041 SLP kind detection invalid. */
2042 unsigned old_size = LOOP_VINFO_SLP_INSTANCES (loop_vinfo).length ();
2043 vect_slp_analyze_operations (loop_vinfo);
2044 if (LOOP_VINFO_SLP_INSTANCES (loop_vinfo).length () != old_size)
2045 goto again;
2048 /* Scan all the remaining operations in the loop that are not subject
2049 to SLP and make sure they are vectorizable. */
2050 ok = vect_analyze_loop_operations (loop_vinfo);
2051 if (!ok)
2053 if (dump_enabled_p ())
2054 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2055 "bad operation or unsupported loop bound.\n");
2056 return false;
2059 /* Decide whether to use a fully-masked loop for this vectorization
2060 factor. */
2061 LOOP_VINFO_FULLY_MASKED_P (loop_vinfo)
2062 = (LOOP_VINFO_CAN_FULLY_MASK_P (loop_vinfo)
2063 && vect_verify_full_masking (loop_vinfo));
2064 if (dump_enabled_p ())
2066 if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
2067 dump_printf_loc (MSG_NOTE, vect_location,
2068 "using a fully-masked loop.\n");
2069 else
2070 dump_printf_loc (MSG_NOTE, vect_location,
2071 "not using a fully-masked loop.\n");
2074 /* If epilog loop is required because of data accesses with gaps,
2075 one additional iteration needs to be peeled. Check if there is
2076 enough iterations for vectorization. */
2077 if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo)
2078 && LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
2079 && !LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
2081 poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
2082 tree scalar_niters = LOOP_VINFO_NITERSM1 (loop_vinfo);
2084 if (known_lt (wi::to_widest (scalar_niters), vf))
2086 if (dump_enabled_p ())
2087 dump_printf_loc (MSG_NOTE, vect_location,
2088 "loop has no enough iterations to support"
2089 " peeling for gaps.\n");
2090 return false;
2094 /* Check the costings of the loop make vectorizing worthwhile. */
2095 res = vect_analyze_loop_costing (loop_vinfo);
2096 if (res < 0)
2097 goto again;
2098 if (!res)
2100 if (dump_enabled_p ())
2101 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2102 "Loop costings not worthwhile.\n");
2103 return false;
2106 /* Decide whether we need to create an epilogue loop to handle
2107 remaining scalar iterations. */
2108 th = LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo);
2110 unsigned HOST_WIDE_INT const_vf;
2111 if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
2112 /* The main loop handles all iterations. */
2113 LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo) = false;
2114 else if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
2115 && LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) > 0)
2117 if (!multiple_p (LOOP_VINFO_INT_NITERS (loop_vinfo)
2118 - LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo),
2119 LOOP_VINFO_VECT_FACTOR (loop_vinfo)))
2120 LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo) = true;
2122 else if (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo)
2123 || !LOOP_VINFO_VECT_FACTOR (loop_vinfo).is_constant (&const_vf)
2124 || ((tree_ctz (LOOP_VINFO_NITERS (loop_vinfo))
2125 < (unsigned) exact_log2 (const_vf))
2126 /* In case of versioning, check if the maximum number of
2127 iterations is greater than th. If they are identical,
2128 the epilogue is unnecessary. */
2129 && (!LOOP_REQUIRES_VERSIONING (loop_vinfo)
2130 || ((unsigned HOST_WIDE_INT) max_niter
2131 > (th / const_vf) * const_vf))))
2132 LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo) = true;
2134 /* If an epilogue loop is required make sure we can create one. */
2135 if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo)
2136 || LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo))
2138 if (dump_enabled_p ())
2139 dump_printf_loc (MSG_NOTE, vect_location, "epilog loop required\n");
2140 if (!vect_can_advance_ivs_p (loop_vinfo)
2141 || !slpeel_can_duplicate_loop_p (LOOP_VINFO_LOOP (loop_vinfo),
2142 single_exit (LOOP_VINFO_LOOP
2143 (loop_vinfo))))
2145 if (dump_enabled_p ())
2146 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2147 "not vectorized: can't create required "
2148 "epilog loop\n");
2149 goto again;
2153 /* During peeling, we need to check if number of loop iterations is
2154 enough for both peeled prolog loop and vector loop. This check
2155 can be merged along with threshold check of loop versioning, so
2156 increase threshold for this case if necessary. */
2157 if (LOOP_REQUIRES_VERSIONING (loop_vinfo))
2159 poly_uint64 niters_th = 0;
2161 if (!vect_use_loop_mask_for_alignment_p (loop_vinfo))
2163 /* Niters for peeled prolog loop. */
2164 if (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) < 0)
2166 struct data_reference *dr = LOOP_VINFO_UNALIGNED_DR (loop_vinfo);
2167 tree vectype
2168 = STMT_VINFO_VECTYPE (vinfo_for_stmt (vect_dr_stmt (dr)));
2169 niters_th += TYPE_VECTOR_SUBPARTS (vectype) - 1;
2171 else
2172 niters_th += LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo);
2175 /* Niters for at least one iteration of vectorized loop. */
2176 if (!LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
2177 niters_th += LOOP_VINFO_VECT_FACTOR (loop_vinfo);
2178 /* One additional iteration because of peeling for gap. */
2179 if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo))
2180 niters_th += 1;
2181 LOOP_VINFO_VERSIONING_THRESHOLD (loop_vinfo) = niters_th;
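/* Worked example for the threshold just stored (illustrative, made-up
   numbers; not part of tree-vect-loop.c).  With a known prologue peel of 3
   iterations for alignment, VF = 8, no full masking and peeling for gaps
   required:

     niters_th = 3 (prologue peel) + 8 (one vector iteration) + 1 (gap) = 12

   so, roughly, the runtime versioning check only selects the vectorized
   path when around 12 or more iterations are available.  */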
2184 gcc_assert (known_eq (vectorization_factor,
2185 LOOP_VINFO_VECT_FACTOR (loop_vinfo)));
2187 /* Ok to vectorize! */
2188 return true;
2190 again:
2191 /* Try again with SLP forced off but if we didn't do any SLP there is
2192 no point in re-trying. */
2193 if (!slp)
2194 return false;
2196 /* If there are reduction chains re-trying will fail anyway. */
2197 if (! LOOP_VINFO_REDUCTION_CHAINS (loop_vinfo).is_empty ())
2198 return false;
2200 /* Likewise if the grouped loads or stores in the SLP cannot be handled
2201 via interleaving or lane instructions. */
2202 slp_instance instance;
2203 slp_tree node;
2204 unsigned i, j;
2205 FOR_EACH_VEC_ELT (LOOP_VINFO_SLP_INSTANCES (loop_vinfo), i, instance)
2207 stmt_vec_info vinfo;
2208 vinfo = vinfo_for_stmt
2209 (SLP_TREE_SCALAR_STMTS (SLP_INSTANCE_TREE (instance))[0]);
2210 if (! STMT_VINFO_GROUPED_ACCESS (vinfo))
2211 continue;
2212 vinfo = vinfo_for_stmt (DR_GROUP_FIRST_ELEMENT (vinfo));
2213 unsigned int size = DR_GROUP_SIZE (vinfo);
2214 tree vectype = STMT_VINFO_VECTYPE (vinfo);
2215 if (! vect_store_lanes_supported (vectype, size, false)
2216 && ! known_eq (TYPE_VECTOR_SUBPARTS (vectype), 1U)
2217 && ! vect_grouped_store_supported (vectype, size))
2218 return false;
2219 FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (instance), j, node)
2221 vinfo = vinfo_for_stmt (SLP_TREE_SCALAR_STMTS (node)[0]);
2222 vinfo = vinfo_for_stmt (DR_GROUP_FIRST_ELEMENT (vinfo));
2223 bool single_element_p = !DR_GROUP_NEXT_ELEMENT (vinfo);
2224 size = DR_GROUP_SIZE (vinfo);
2225 vectype = STMT_VINFO_VECTYPE (vinfo);
2226 if (! vect_load_lanes_supported (vectype, size, false)
2227 && ! vect_grouped_load_supported (vectype, single_element_p,
2228 size))
2229 return false;
2233 if (dump_enabled_p ())
2234 dump_printf_loc (MSG_NOTE, vect_location,
2235 "re-trying with SLP disabled\n");
2237 /* Roll back state appropriately. No SLP this time. */
2238 slp = false;
2239 /* Restore the vectorization factor to what it would be without SLP. */
2240 LOOP_VINFO_VECT_FACTOR (loop_vinfo) = saved_vectorization_factor;
2241 /* Free the SLP instances. */
2242 FOR_EACH_VEC_ELT (LOOP_VINFO_SLP_INSTANCES (loop_vinfo), j, instance)
2243 vect_free_slp_instance (instance);
2244 LOOP_VINFO_SLP_INSTANCES (loop_vinfo).release ();
2245 /* Reset SLP type to loop_vect on all stmts. */
2246 for (i = 0; i < LOOP_VINFO_LOOP (loop_vinfo)->num_nodes; ++i)
2248 basic_block bb = LOOP_VINFO_BBS (loop_vinfo)[i];
2249 for (gimple_stmt_iterator si = gsi_start_phis (bb);
2250 !gsi_end_p (si); gsi_next (&si))
2252 stmt_vec_info stmt_info = vinfo_for_stmt (gsi_stmt (si));
2253 STMT_SLP_TYPE (stmt_info) = loop_vect;
2255 for (gimple_stmt_iterator si = gsi_start_bb (bb);
2256 !gsi_end_p (si); gsi_next (&si))
2258 stmt_vec_info stmt_info = vinfo_for_stmt (gsi_stmt (si));
2259 STMT_SLP_TYPE (stmt_info) = loop_vect;
2260 if (STMT_VINFO_IN_PATTERN_P (stmt_info))
2262 stmt_info = vinfo_for_stmt (STMT_VINFO_RELATED_STMT (stmt_info));
2263 STMT_SLP_TYPE (stmt_info) = loop_vect;
2264 for (gimple_stmt_iterator pi
2265 = gsi_start (STMT_VINFO_PATTERN_DEF_SEQ (stmt_info));
2266 !gsi_end_p (pi); gsi_next (&pi))
2268 gimple *pstmt = gsi_stmt (pi);
2269 STMT_SLP_TYPE (vinfo_for_stmt (pstmt)) = loop_vect;
2274 /* Free optimized alias test DDRS. */
2275 LOOP_VINFO_LOWER_BOUNDS (loop_vinfo).truncate (0);
2276 LOOP_VINFO_COMP_ALIAS_DDRS (loop_vinfo).release ();
2277 LOOP_VINFO_CHECK_UNEQUAL_ADDRS (loop_vinfo).release ();
2278 /* Reset target cost data. */
2279 destroy_cost_data (LOOP_VINFO_TARGET_COST_DATA (loop_vinfo));
2280 LOOP_VINFO_TARGET_COST_DATA (loop_vinfo)
2281 = init_cost (LOOP_VINFO_LOOP (loop_vinfo));
2282 /* Reset accumulated rgroup information. */
2283 release_vec_loop_masks (&LOOP_VINFO_MASKS (loop_vinfo));
2284 /* Reset assorted flags. */
2285 LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo) = false;
2286 LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo) = false;
2287 LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo) = 0;
2288 LOOP_VINFO_VERSIONING_THRESHOLD (loop_vinfo) = 0;
2289 LOOP_VINFO_CAN_FULLY_MASK_P (loop_vinfo) = saved_can_fully_mask_p;
2291 goto start_over;
2294 /* Function vect_analyze_loop.
2296 Apply a set of analyses on LOOP, and create a loop_vec_info struct
2297 for it. The different analyses will record information in the
2298 loop_vec_info struct. If ORIG_LOOP_VINFO is not NULL epilogue must
2299 be vectorized. */
2300 loop_vec_info
2301 vect_analyze_loop (struct loop *loop, loop_vec_info orig_loop_vinfo)
2303 loop_vec_info loop_vinfo;
2304 auto_vector_sizes vector_sizes;
2306 /* Autodetect first vector size we try. */
2307 current_vector_size = 0;
2308 targetm.vectorize.autovectorize_vector_sizes (&vector_sizes);
2309 unsigned int next_size = 0;
2311 if (dump_enabled_p ())
2312 dump_printf_loc (MSG_NOTE, vect_location,
2313 "===== analyze_loop_nest =====\n");
2315 if (loop_outer (loop)
2316 && loop_vec_info_for_loop (loop_outer (loop))
2317 && LOOP_VINFO_VECTORIZABLE_P (loop_vec_info_for_loop (loop_outer (loop))))
2319 if (dump_enabled_p ())
2320 dump_printf_loc (MSG_NOTE, vect_location,
2321 "outer-loop already vectorized.\n");
2322 return NULL;
2325 poly_uint64 autodetected_vector_size = 0;
2326 while (1)
2328 /* Check the CFG characteristics of the loop (nesting, entry/exit). */
2329 loop_vinfo = vect_analyze_loop_form (loop);
2330 if (!loop_vinfo)
2332 if (dump_enabled_p ())
2333 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2334 "bad loop form.\n");
2335 return NULL;
2338 bool fatal = false;
2340 if (orig_loop_vinfo)
2341 LOOP_VINFO_ORIG_LOOP_INFO (loop_vinfo) = orig_loop_vinfo;
2343 if (vect_analyze_loop_2 (loop_vinfo, fatal))
2345 LOOP_VINFO_VECTORIZABLE_P (loop_vinfo) = 1;
2347 return loop_vinfo;
2350 delete loop_vinfo;
2352 if (next_size == 0)
2353 autodetected_vector_size = current_vector_size;
2355 if (next_size < vector_sizes.length ()
2356 && known_eq (vector_sizes[next_size], autodetected_vector_size))
2357 next_size += 1;
2359 if (fatal
2360 || next_size == vector_sizes.length ()
2361 || known_eq (current_vector_size, 0U))
2362 return NULL;
2364 /* Try the next biggest vector size. */
2365 current_vector_size = vector_sizes[next_size++];
2366 if (dump_enabled_p ())
2368 dump_printf_loc (MSG_NOTE, vect_location,
2369 "***** Re-trying analysis with "
2370 "vector size ");
2371 dump_dec (MSG_NOTE, current_vector_size);
2372 dump_printf (MSG_NOTE, "\n");
2377 /* Return true if there is an in-order reduction function for CODE, storing
2378 it in *REDUC_FN if so. */
2380 static bool
2381 fold_left_reduction_fn (tree_code code, internal_fn *reduc_fn)
2383 switch (code)
2385 case PLUS_EXPR:
2386 *reduc_fn = IFN_FOLD_LEFT_PLUS;
2387 return true;
2389 default:
2390 return false;
2394 /* Function reduction_fn_for_scalar_code
2396 Input:
2397 CODE - tree_code of a reduction operations.
2399 Output:
2400 REDUC_FN - the corresponding internal function to be used to reduce the
2401 vector of partial results into a single scalar result, or IFN_LAST
2402 if the operation is a supported reduction operation, but does not have
2403 such an internal function.
2405 Return FALSE if CODE currently cannot be vectorized as reduction. */
2407 static bool
2408 reduction_fn_for_scalar_code (enum tree_code code, internal_fn *reduc_fn)
2410 switch (code)
2412 case MAX_EXPR:
2413 *reduc_fn = IFN_REDUC_MAX;
2414 return true;
2416 case MIN_EXPR:
2417 *reduc_fn = IFN_REDUC_MIN;
2418 return true;
2420 case PLUS_EXPR:
2421 *reduc_fn = IFN_REDUC_PLUS;
2422 return true;
2424 case BIT_AND_EXPR:
2425 *reduc_fn = IFN_REDUC_AND;
2426 return true;
2428 case BIT_IOR_EXPR:
2429 *reduc_fn = IFN_REDUC_IOR;
2430 return true;
2432 case BIT_XOR_EXPR:
2433 *reduc_fn = IFN_REDUC_XOR;
2434 return true;
2436 case MULT_EXPR:
2437 case MINUS_EXPR:
2438 *reduc_fn = IFN_LAST;
2439 return true;
2441 default:
2442 return false;
2446 /* If there is a neutral value X such that SLP reduction NODE would not
2447 be affected by the introduction of additional X elements, return that X,
2448 otherwise return null. CODE is the code of the reduction. REDUC_CHAIN
2449 is true if the SLP statements perform a single reduction, false if each
2450 statement performs an independent reduction. */
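/* Illustrative note, not part of tree-vect-loop.c: what "neutral" means
   here.  If an SLP reduction over the values { a, b, c } is widened so
   that a spare vector lane appears, that lane can be filled with the
   neutral value X without changing the result:

     PLUS_EXPR:     a + b + c  ==  a + b + c + 0
     MULT_EXPR:     a * b * c  ==  a * b * c * 1
     BIT_AND_EXPR:  a & b & c  ==  a & b & c & ~0

   MIN_EXPR/MAX_EXPR have no universal neutral value, which is why the code
   below can only reuse the single initial value of a reduction chain for
   them.  */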
2452 static tree
2453 neutral_op_for_slp_reduction (slp_tree slp_node, tree_code code,
2454 bool reduc_chain)
2456 vec<gimple *> stmts = SLP_TREE_SCALAR_STMTS (slp_node);
2457 gimple *stmt = stmts[0];
2458 stmt_vec_info stmt_vinfo = vinfo_for_stmt (stmt);
2459 tree vector_type = STMT_VINFO_VECTYPE (stmt_vinfo);
2460 tree scalar_type = TREE_TYPE (vector_type);
2461 struct loop *loop = gimple_bb (stmt)->loop_father;
2462 gcc_assert (loop);
2464 switch (code)
2466 case WIDEN_SUM_EXPR:
2467 case DOT_PROD_EXPR:
2468 case SAD_EXPR:
2469 case PLUS_EXPR:
2470 case MINUS_EXPR:
2471 case BIT_IOR_EXPR:
2472 case BIT_XOR_EXPR:
2473 return build_zero_cst (scalar_type);
2475 case MULT_EXPR:
2476 return build_one_cst (scalar_type);
2478 case BIT_AND_EXPR:
2479 return build_all_ones_cst (scalar_type);
2481 case MAX_EXPR:
2482 case MIN_EXPR:
2483 /* For MIN/MAX the initial values are neutral. A reduction chain
2484 has only a single initial value, so that value is neutral for
2485 all statements. */
2486 if (reduc_chain)
2487 return PHI_ARG_DEF_FROM_EDGE (stmt, loop_preheader_edge (loop));
2488 return NULL_TREE;
2490 default:
2491 return NULL_TREE;
2495 /* Error reporting helper for vect_is_simple_reduction below. GIMPLE statement
2496 STMT is printed with a message MSG. */
2498 static void
2499 report_vect_op (dump_flags_t msg_type, gimple *stmt, const char *msg)
2501 dump_printf_loc (msg_type, vect_location, "%s", msg);
2502 dump_gimple_stmt (msg_type, TDF_SLIM, stmt, 0);
2506 /* Detect SLP reduction of the form:
2508 #a1 = phi <a5, a0>
2509 a2 = operation (a1)
2510 a3 = operation (a2)
2511 a4 = operation (a3)
2512 a5 = operation (a4)
2514 #a = phi <a5>
2516 PHI is the reduction phi node (#a1 = phi <a5, a0> above)
2517 FIRST_STMT is the first reduction stmt in the chain
2518 (a2 = operation (a1)).
2520 Return TRUE if a reduction chain was detected. */
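/* Illustrative source-level example, not part of tree-vect-loop.c (the
   function name sum4 is made up): a loop whose GIMPLE matches the chain
   sketched above.  Each addition feeds the next one and only the last
   value reaches the loop PHI again.  */

int
sum4 (int *a, int n)
{
  int s = 0;
  for (int i = 0; i < n; i++)
    {
      s = s + a[4 * i + 0];	/* a2 = operation (a1) */
      s = s + a[4 * i + 1];	/* a3 = operation (a2) */
      s = s + a[4 * i + 2];	/* a4 = operation (a3) */
      s = s + a[4 * i + 3];	/* a5 = operation (a4) */
    }
  return s;
}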
2522 static bool
2523 vect_is_slp_reduction (loop_vec_info loop_info, gimple *phi,
2524 gimple *first_stmt)
2526 struct loop *loop = (gimple_bb (phi))->loop_father;
2527 struct loop *vect_loop = LOOP_VINFO_LOOP (loop_info);
2528 enum tree_code code;
2529 gimple *current_stmt = NULL, *loop_use_stmt = NULL, *first, *next_stmt;
2530 stmt_vec_info use_stmt_info, current_stmt_info;
2531 tree lhs;
2532 imm_use_iterator imm_iter;
2533 use_operand_p use_p;
2534 int nloop_uses, size = 0, n_out_of_loop_uses;
2535 bool found = false;
2537 if (loop != vect_loop)
2538 return false;
2540 lhs = PHI_RESULT (phi);
2541 code = gimple_assign_rhs_code (first_stmt);
2542 while (1)
2544 nloop_uses = 0;
2545 n_out_of_loop_uses = 0;
2546 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, lhs)
2548 gimple *use_stmt = USE_STMT (use_p);
2549 if (is_gimple_debug (use_stmt))
2550 continue;
2552 /* Check if we got back to the reduction phi. */
2553 if (use_stmt == phi)
2555 loop_use_stmt = use_stmt;
2556 found = true;
2557 break;
2560 if (flow_bb_inside_loop_p (loop, gimple_bb (use_stmt)))
2562 loop_use_stmt = use_stmt;
2563 nloop_uses++;
2565 else
2566 n_out_of_loop_uses++;
2568 /* There can be either a single use in the loop or two uses in
2569 phi nodes. */
2570 if (nloop_uses > 1 || (n_out_of_loop_uses && nloop_uses))
2571 return false;
2574 if (found)
2575 break;
2577 /* We reached a statement with no loop uses. */
2578 if (nloop_uses == 0)
2579 return false;
2581 /* This is a loop exit phi, and we haven't reached the reduction phi. */
2582 if (gimple_code (loop_use_stmt) == GIMPLE_PHI)
2583 return false;
2585 if (!is_gimple_assign (loop_use_stmt)
2586 || code != gimple_assign_rhs_code (loop_use_stmt)
2587 || !flow_bb_inside_loop_p (loop, gimple_bb (loop_use_stmt)))
2588 return false;
2590 /* Insert USE_STMT into reduction chain. */
2591 use_stmt_info = vinfo_for_stmt (loop_use_stmt);
2592 if (current_stmt)
2594 current_stmt_info = vinfo_for_stmt (current_stmt);
2595 REDUC_GROUP_NEXT_ELEMENT (current_stmt_info) = loop_use_stmt;
2596 REDUC_GROUP_FIRST_ELEMENT (use_stmt_info)
2597 = REDUC_GROUP_FIRST_ELEMENT (current_stmt_info);
2599 else
2600 REDUC_GROUP_FIRST_ELEMENT (use_stmt_info) = loop_use_stmt;
2602 lhs = gimple_assign_lhs (loop_use_stmt);
2603 current_stmt = loop_use_stmt;
2604 size++;
2607 if (!found || loop_use_stmt != phi || size < 2)
2608 return false;
2610 /* Swap the operands, if needed, to make the reduction operand be the second
2611 operand. */
2612 lhs = PHI_RESULT (phi);
2613 next_stmt = REDUC_GROUP_FIRST_ELEMENT (vinfo_for_stmt (current_stmt));
2614 while (next_stmt)
2616 if (gimple_assign_rhs2 (next_stmt) == lhs)
2618 tree op = gimple_assign_rhs1 (next_stmt);
2619 gimple *def_stmt = NULL;
2621 if (TREE_CODE (op) == SSA_NAME)
2622 def_stmt = SSA_NAME_DEF_STMT (op);
2624 /* Check that the other def is either defined in the loop
2625 ("vect_internal_def"), or it's an induction (defined by a
2626 loop-header phi-node). */
2627 if (def_stmt
2628 && gimple_bb (def_stmt)
2629 && flow_bb_inside_loop_p (loop, gimple_bb (def_stmt))
2630 && (is_gimple_assign (def_stmt)
2631 || is_gimple_call (def_stmt)
2632 || STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def_stmt))
2633 == vect_induction_def
2634 || (gimple_code (def_stmt) == GIMPLE_PHI
2635 && STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def_stmt))
2636 == vect_internal_def
2637 && !is_loop_header_bb_p (gimple_bb (def_stmt)))))
2639 lhs = gimple_assign_lhs (next_stmt);
2640 next_stmt = REDUC_GROUP_NEXT_ELEMENT (vinfo_for_stmt (next_stmt));
2641 continue;
2644 return false;
2646 else
2648 tree op = gimple_assign_rhs2 (next_stmt);
2649 gimple *def_stmt = NULL;
2651 if (TREE_CODE (op) == SSA_NAME)
2652 def_stmt = SSA_NAME_DEF_STMT (op);
2654 /* Check that the other def is either defined in the loop
2655 ("vect_internal_def"), or it's an induction (defined by a
2656 loop-header phi-node). */
2657 if (def_stmt
2658 && gimple_bb (def_stmt)
2659 && flow_bb_inside_loop_p (loop, gimple_bb (def_stmt))
2660 && (is_gimple_assign (def_stmt)
2661 || is_gimple_call (def_stmt)
2662 || STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def_stmt))
2663 == vect_induction_def
2664 || (gimple_code (def_stmt) == GIMPLE_PHI
2665 && STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def_stmt))
2666 == vect_internal_def
2667 && !is_loop_header_bb_p (gimple_bb (def_stmt)))))
2669 if (dump_enabled_p ())
2671 dump_printf_loc (MSG_NOTE, vect_location, "swapping oprnds: ");
2672 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, next_stmt, 0);
2675 swap_ssa_operands (next_stmt,
2676 gimple_assign_rhs1_ptr (next_stmt),
2677 gimple_assign_rhs2_ptr (next_stmt));
2678 update_stmt (next_stmt);
2680 if (CONSTANT_CLASS_P (gimple_assign_rhs1 (next_stmt)))
2681 LOOP_VINFO_OPERANDS_SWAPPED (loop_info) = true;
2683 else
2684 return false;
2687 lhs = gimple_assign_lhs (next_stmt);
2688 next_stmt = REDUC_GROUP_NEXT_ELEMENT (vinfo_for_stmt (next_stmt));
2691 /* Save the chain for further analysis in SLP detection. */
2692 first = REDUC_GROUP_FIRST_ELEMENT (vinfo_for_stmt (current_stmt));
2693 LOOP_VINFO_REDUCTION_CHAINS (loop_info).safe_push (first);
2694 REDUC_GROUP_SIZE (vinfo_for_stmt (first)) = size;
2696 return true;
2699 /* Return true if we need an in-order reduction for operation CODE
2700 on type TYPE. NEED_WRAPPING_INTEGRAL_OVERFLOW is true if integer
2701 overflow must wrap. */
2703 static bool
2704 needs_fold_left_reduction_p (tree type, tree_code code,
2705 bool need_wrapping_integral_overflow)
2707 /* CHECKME: check for !flag_finite_math_only too? */
2708 if (SCALAR_FLOAT_TYPE_P (type))
2709 switch (code)
2711 case MIN_EXPR:
2712 case MAX_EXPR:
2713 return false;
2715 default:
2716 return !flag_associative_math;
2719 if (INTEGRAL_TYPE_P (type))
2721 if (!operation_no_trapping_overflow (type, code))
2722 return true;
2723 if (need_wrapping_integral_overflow
2724 && !TYPE_OVERFLOW_WRAPS (type)
2725 && operation_can_overflow (code))
2726 return true;
2727 return false;
2730 if (SAT_FIXED_POINT_TYPE_P (type))
2731 return true;
2733 return false;
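/* Illustrative classification, not part of tree-vect-loop.c, of a few
   common reductions under the predicate above (the stated outcomes assume
   default GCC option settings):

     double s = 0;  for (...) s += x[i];
       -> true unless -fassociative-math is given: FP addition is not
          associative, so out-of-order partial sums could round differently
          from the scalar loop.

     a MIN_EXPR/MAX_EXPR reduction on float or double
       -> false: the code treats min/max reductions as safe to reorder.

     int s = 0;  for (...) s += x[i];  compiled with -ftrapv
       -> true: reassociation could introduce a signed-overflow trap that
          the original evaluation order would not have raised.  */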
2736 /* Return true if the reduction PHI in LOOP with latch arg LOOP_ARG and
2737 reduction operation CODE has a handled computation expression. */
2739 bool
2740 check_reduction_path (location_t loc, loop_p loop, gphi *phi, tree loop_arg,
2741 enum tree_code code)
2743 auto_vec<std::pair<ssa_op_iter, use_operand_p> > path;
2744 auto_bitmap visited;
2745 tree lookfor = PHI_RESULT (phi);
2746 ssa_op_iter curri;
2747 use_operand_p curr = op_iter_init_phiuse (&curri, phi, SSA_OP_USE);
2748 while (USE_FROM_PTR (curr) != loop_arg)
2749 curr = op_iter_next_use (&curri);
2750 curri.i = curri.numops;
2753 path.safe_push (std::make_pair (curri, curr));
2754 tree use = USE_FROM_PTR (curr);
2755 if (use == lookfor)
2756 break;
2757 gimple *def = SSA_NAME_DEF_STMT (use);
2758 if (gimple_nop_p (def)
2759 || ! flow_bb_inside_loop_p (loop, gimple_bb (def)))
2761 pop:
2764 std::pair<ssa_op_iter, use_operand_p> x = path.pop ();
2765 curri = x.first;
2766 curr = x.second;
2768 curr = op_iter_next_use (&curri);
2769 /* Skip already visited or non-SSA operands (from iterating
2770 over PHI args). */
2771 while (curr != NULL_USE_OPERAND_P
2772 && (TREE_CODE (USE_FROM_PTR (curr)) != SSA_NAME
2773 || ! bitmap_set_bit (visited,
2774 SSA_NAME_VERSION
2775 (USE_FROM_PTR (curr)))));
2777 while (curr == NULL_USE_OPERAND_P && ! path.is_empty ());
2778 if (curr == NULL_USE_OPERAND_P)
2779 break;
2781 else
2783 if (gimple_code (def) == GIMPLE_PHI)
2784 curr = op_iter_init_phiuse (&curri, as_a <gphi *>(def), SSA_OP_USE);
2785 else
2786 curr = op_iter_init_use (&curri, def, SSA_OP_USE);
2787 while (curr != NULL_USE_OPERAND_P
2788 && (TREE_CODE (USE_FROM_PTR (curr)) != SSA_NAME
2789 || ! bitmap_set_bit (visited,
2790 SSA_NAME_VERSION
2791 (USE_FROM_PTR (curr)))))
2792 curr = op_iter_next_use (&curri);
2793 if (curr == NULL_USE_OPERAND_P)
2794 goto pop;
2797 while (1);
2798 if (dump_file && (dump_flags & TDF_DETAILS))
2800 dump_printf_loc (MSG_NOTE, loc, "reduction path: ");
2801 unsigned i;
2802 std::pair<ssa_op_iter, use_operand_p> *x;
2803 FOR_EACH_VEC_ELT (path, i, x)
2805 dump_generic_expr (MSG_NOTE, TDF_SLIM, USE_FROM_PTR (x->second));
2806 dump_printf (MSG_NOTE, " ");
2808 dump_printf (MSG_NOTE, "\n");
2811 /* Check whether the reduction path detected is valid. */
2812 bool fail = path.length () == 0;
2813 bool neg = false;
2814 for (unsigned i = 1; i < path.length (); ++i)
2816 gimple *use_stmt = USE_STMT (path[i].second);
2817 tree op = USE_FROM_PTR (path[i].second);
2818 if (! has_single_use (op)
2819 || ! is_gimple_assign (use_stmt))
2821 fail = true;
2822 break;
2824 if (gimple_assign_rhs_code (use_stmt) != code)
2826 if (code == PLUS_EXPR
2827 && gimple_assign_rhs_code (use_stmt) == MINUS_EXPR)
2829 /* Track whether we negate the reduction value each iteration. */
2830 if (gimple_assign_rhs2 (use_stmt) == op)
2831 neg = ! neg;
2833 else
2835 fail = true;
2836 break;
2840 return ! fail && ! neg;
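/* Illustrative examples, not part of tree-vect-loop.c, of the "neg"
   tracking above for a PLUS_EXPR reduction path (variable names made up):

     s = s - x[i];   reduction value is the first operand, neg stays false,
                     the path is accepted (and handled elsewhere by viewing
                     it as s += -x[i]);

     s = x[i] - s;   reduction value is the second operand, neg flips on
                     every step, so the chain computes an alternating sign
                     and the path is rejected.  */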
2844 /* Function vect_is_simple_reduction
2846 (1) Detect a cross-iteration def-use cycle that represents a simple
2847 reduction computation. We look for the following pattern:
2849 loop_header:
2850 a1 = phi < a0, a2 >
2851 a3 = ...
2852 a2 = operation (a3, a1)

   or

2856 a3 = ...
2857 loop_header:
2858 a1 = phi < a0, a2 >
2859 a2 = operation (a3, a1)
2861 such that:
2862 1. operation is commutative and associative and it is safe to
2863 change the order of the computation
2864 2. no uses for a2 in the loop (a2 is used out of the loop)
2865 3. no uses of a1 in the loop besides the reduction operation
2866 4. no uses of a1 outside the loop.
2868 Conditions 1,4 are tested here.
2869 Conditions 2,3 are tested in vect_mark_stmts_to_be_vectorized.
2871 (2) Detect a cross-iteration def-use cycle in nested loops, i.e.,
2872 nested cycles.
2874 (3) Detect cycles of phi nodes in outer-loop vectorization, i.e., double
2875 reductions:
2877 a1 = phi < a0, a2 >
2878 inner loop (def of a3)
2879 a2 = phi < a3 >
2881 (4) Detect condition expressions, i.e.:
2882 for (int i = 0; i < N; i++)
2883 if (a[i] < val)
2884 ret_val = a[i];
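/* Illustrative source-level example, not part of tree-vect-loop.c (the
   function name sum is made up): a loop producing exactly the def-use
   cycle of case (1) above.  In SSA form a1 is the loop PHI for s, a3 the
   loaded element and a2 the updated sum, whose only use is after the
   loop.  */

int
sum (int *x, int n)
{
  int s = 0;			/* a0 */
  for (int i = 0; i < n; i++)
    s = s + x[i];		/* a3 = x[i];  a2 = a3 + a1 */
  return s;
}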
2888 static gimple *
2889 vect_is_simple_reduction (loop_vec_info loop_info, gimple *phi,
2890 bool *double_reduc,
2891 bool need_wrapping_integral_overflow,
2892 enum vect_reduction_type *v_reduc_type)
2894 struct loop *loop = (gimple_bb (phi))->loop_father;
2895 struct loop *vect_loop = LOOP_VINFO_LOOP (loop_info);
2896 gimple *def_stmt, *def1 = NULL, *def2 = NULL, *phi_use_stmt = NULL;
2897 enum tree_code orig_code, code;
2898 tree op1, op2, op3 = NULL_TREE, op4 = NULL_TREE;
2899 tree type;
2900 int nloop_uses;
2901 tree name;
2902 imm_use_iterator imm_iter;
2903 use_operand_p use_p;
2904 bool phi_def;
2906 *double_reduc = false;
2907 *v_reduc_type = TREE_CODE_REDUCTION;
2909 tree phi_name = PHI_RESULT (phi);
2910 /* ??? If there are no uses of the PHI result the inner loop reduction
2911 won't be detected as possibly double-reduction by vectorizable_reduction
2912 because that tries to walk the PHI arg from the preheader edge which
2913 can be constant. See PR60382. */
2914 if (has_zero_uses (phi_name))
2915 return NULL;
2916 nloop_uses = 0;
2917 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, phi_name)
2919 gimple *use_stmt = USE_STMT (use_p);
2920 if (is_gimple_debug (use_stmt))
2921 continue;
2923 if (!flow_bb_inside_loop_p (loop, gimple_bb (use_stmt)))
2925 if (dump_enabled_p ())
2926 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2927 "intermediate value used outside loop.\n");
2929 return NULL;
2932 nloop_uses++;
2933 if (nloop_uses > 1)
2935 if (dump_enabled_p ())
2936 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2937 "reduction value used in loop.\n");
2938 return NULL;
2941 phi_use_stmt = use_stmt;
2944 edge latch_e = loop_latch_edge (loop);
2945 tree loop_arg = PHI_ARG_DEF_FROM_EDGE (phi, latch_e);
2946 if (TREE_CODE (loop_arg) != SSA_NAME)
2948 if (dump_enabled_p ())
2950 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2951 "reduction: not ssa_name: ");
2952 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, loop_arg);
2953 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
2955 return NULL;
2958 def_stmt = SSA_NAME_DEF_STMT (loop_arg);
2959 if (is_gimple_assign (def_stmt))
2961 name = gimple_assign_lhs (def_stmt);
2962 phi_def = false;
2964 else if (gimple_code (def_stmt) == GIMPLE_PHI)
2966 name = PHI_RESULT (def_stmt);
2967 phi_def = true;
2969 else
2971 if (dump_enabled_p ())
2973 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2974 "reduction: unhandled reduction operation: ");
2975 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, def_stmt, 0);
2977 return NULL;
2980 if (! flow_bb_inside_loop_p (loop, gimple_bb (def_stmt)))
2981 return NULL;
2983 nloop_uses = 0;
2984 auto_vec<gphi *, 3> lcphis;
2985 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, name)
2987 gimple *use_stmt = USE_STMT (use_p);
2988 if (is_gimple_debug (use_stmt))
2989 continue;
2990 if (flow_bb_inside_loop_p (loop, gimple_bb (use_stmt)))
2991 nloop_uses++;
2992 else
2993 /* We can have more than one loop-closed PHI. */
2994 lcphis.safe_push (as_a <gphi *> (use_stmt));
2995 if (nloop_uses > 1)
2997 if (dump_enabled_p ())
2998 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2999 "reduction used in loop.\n");
3000 return NULL;
3004 /* If DEF_STMT is a phi node itself, we expect it to have a single argument
3005 defined in the inner loop. */
3006 if (phi_def)
3008 op1 = PHI_ARG_DEF (def_stmt, 0);
3010 if (gimple_phi_num_args (def_stmt) != 1
3011 || TREE_CODE (op1) != SSA_NAME)
3013 if (dump_enabled_p ())
3014 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3015 "unsupported phi node definition.\n");
3017 return NULL;
3020 def1 = SSA_NAME_DEF_STMT (op1);
3021 if (gimple_bb (def1)
3022 && flow_bb_inside_loop_p (loop, gimple_bb (def_stmt))
3023 && loop->inner
3024 && flow_bb_inside_loop_p (loop->inner, gimple_bb (def1))
3025 && is_gimple_assign (def1)
3026 && flow_bb_inside_loop_p (loop->inner, gimple_bb (phi_use_stmt)))
3028 if (dump_enabled_p ())
3029 report_vect_op (MSG_NOTE, def_stmt,
3030 "detected double reduction: ");
3032 *double_reduc = true;
3033 return def_stmt;
3036 return NULL;
3039 /* If we are vectorizing an inner reduction, we execute it in the
3040 original order only when we are not dealing with a double
3041 reduction. */
3042 bool check_reduction = true;
3043 if (flow_loop_nested_p (vect_loop, loop))
3045 gphi *lcphi;
3046 unsigned i;
3047 check_reduction = false;
3048 FOR_EACH_VEC_ELT (lcphis, i, lcphi)
3049 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, gimple_phi_result (lcphi))
3051 gimple *use_stmt = USE_STMT (use_p);
3052 if (is_gimple_debug (use_stmt))
3053 continue;
3054 if (! flow_bb_inside_loop_p (vect_loop, gimple_bb (use_stmt)))
3055 check_reduction = true;
3059 bool nested_in_vect_loop = flow_loop_nested_p (vect_loop, loop);
3060 code = orig_code = gimple_assign_rhs_code (def_stmt);
3062 /* We can handle "res -= x[i]", which is non-associative, by
3063 simply rewriting this into "res += -x[i]". Avoid changing the
3064 gimple instruction for the first simple tests and only do this
3065 if we're allowed to change code at all. */
3066 if (code == MINUS_EXPR && gimple_assign_rhs2 (def_stmt) != phi_name)
3067 code = PLUS_EXPR;
3069 if (code == COND_EXPR)
3071 if (! nested_in_vect_loop)
3072 *v_reduc_type = COND_REDUCTION;
3074 op3 = gimple_assign_rhs1 (def_stmt);
3075 if (COMPARISON_CLASS_P (op3))
3077 op4 = TREE_OPERAND (op3, 1);
3078 op3 = TREE_OPERAND (op3, 0);
3080 if (op3 == phi_name || op4 == phi_name)
3082 if (dump_enabled_p ())
3083 report_vect_op (MSG_MISSED_OPTIMIZATION, def_stmt,
3084 "reduction: condition depends on previous"
3085 " iteration: ");
3086 return NULL;
3089 op1 = gimple_assign_rhs2 (def_stmt);
3090 op2 = gimple_assign_rhs3 (def_stmt);
3092 else if (!commutative_tree_code (code) || !associative_tree_code (code))
3094 if (dump_enabled_p ())
3095 report_vect_op (MSG_MISSED_OPTIMIZATION, def_stmt,
3096 "reduction: not commutative/associative: ");
3097 return NULL;
3099 else if (get_gimple_rhs_class (code) == GIMPLE_BINARY_RHS)
3101 op1 = gimple_assign_rhs1 (def_stmt);
3102 op2 = gimple_assign_rhs2 (def_stmt);
3104 else
3106 if (dump_enabled_p ())
3107 report_vect_op (MSG_MISSED_OPTIMIZATION, def_stmt,
3108 "reduction: not handled operation: ");
3109 return NULL;
3112 if (TREE_CODE (op1) != SSA_NAME && TREE_CODE (op2) != SSA_NAME)
3114 if (dump_enabled_p ())
3115 report_vect_op (MSG_MISSED_OPTIMIZATION, def_stmt,
3116 "reduction: both uses not ssa_names: ");
3118 return NULL;
3121 type = TREE_TYPE (gimple_assign_lhs (def_stmt));
3122 if ((TREE_CODE (op1) == SSA_NAME
3123 && !types_compatible_p (type, TREE_TYPE (op1)))
3124 || (TREE_CODE (op2) == SSA_NAME
3125 && !types_compatible_p (type, TREE_TYPE (op2)))
3126 || (op3 && TREE_CODE (op3) == SSA_NAME
3127 && !types_compatible_p (type, TREE_TYPE (op3)))
3128 || (op4 && TREE_CODE (op4) == SSA_NAME
3129 && !types_compatible_p (type, TREE_TYPE (op4))))
3131 if (dump_enabled_p ())
3133 dump_printf_loc (MSG_NOTE, vect_location,
3134 "reduction: multiple types: operation type: ");
3135 dump_generic_expr (MSG_NOTE, TDF_SLIM, type);
3136 dump_printf (MSG_NOTE, ", operands types: ");
3137 dump_generic_expr (MSG_NOTE, TDF_SLIM,
3138 TREE_TYPE (op1));
3139 dump_printf (MSG_NOTE, ",");
3140 dump_generic_expr (MSG_NOTE, TDF_SLIM,
3141 TREE_TYPE (op2));
3142 if (op3)
3144 dump_printf (MSG_NOTE, ",");
3145 dump_generic_expr (MSG_NOTE, TDF_SLIM,
3146 TREE_TYPE (op3));
3149 if (op4)
3151 dump_printf (MSG_NOTE, ",");
3152 dump_generic_expr (MSG_NOTE, TDF_SLIM,
3153 TREE_TYPE (op4));
3155 dump_printf (MSG_NOTE, "\n");
3158 return NULL;
3161 /* Check whether it's ok to change the order of the computation.
3162 Generally, when vectorizing a reduction we change the order of the
3163 computation. This may change the behavior of the program in some
3164 cases, so we need to check that this is ok. One exception is when
3165 vectorizing an outer-loop: the inner-loop is executed sequentially,
3166 and therefore vectorizing reductions in the inner-loop during
3167 outer-loop vectorization is safe. */
3168 if (check_reduction
3169 && *v_reduc_type == TREE_CODE_REDUCTION
3170 && needs_fold_left_reduction_p (type, code,
3171 need_wrapping_integral_overflow))
3172 *v_reduc_type = FOLD_LEFT_REDUCTION;
3174 /* Reduction is safe. We're dealing with one of the following:
3175 1) integer arithmetic and no trapv
3176 2) floating point arithmetic, and special flags permit this optimization
3177 3) nested cycle (i.e., outer loop vectorization). */
3178 if (TREE_CODE (op1) == SSA_NAME)
3179 def1 = SSA_NAME_DEF_STMT (op1);
3181 if (TREE_CODE (op2) == SSA_NAME)
3182 def2 = SSA_NAME_DEF_STMT (op2);
3184 if (code != COND_EXPR
3185 && ((!def1 || gimple_nop_p (def1)) && (!def2 || gimple_nop_p (def2))))
3187 if (dump_enabled_p ())
3188 report_vect_op (MSG_NOTE, def_stmt, "reduction: no defs for operands: ");
3189 return NULL;
3192 /* Check that one def is the reduction def, defined by PHI,
3193 the other def is either defined in the loop ("vect_internal_def"),
3194 or it's an induction (defined by a loop-header phi-node). */
3196 if (def2 && def2 == phi
3197 && (code == COND_EXPR
3198 || !def1 || gimple_nop_p (def1)
3199 || !flow_bb_inside_loop_p (loop, gimple_bb (def1))
3200 || (def1 && flow_bb_inside_loop_p (loop, gimple_bb (def1))
3201 && (is_gimple_assign (def1)
3202 || is_gimple_call (def1)
3203 || STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def1))
3204 == vect_induction_def
3205 || (gimple_code (def1) == GIMPLE_PHI
3206 && STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def1))
3207 == vect_internal_def
3208 && !is_loop_header_bb_p (gimple_bb (def1)))))))
3210 if (dump_enabled_p ())
3211 report_vect_op (MSG_NOTE, def_stmt, "detected reduction: ");
3212 return def_stmt;
3215 if (def1 && def1 == phi
3216 && (code == COND_EXPR
3217 || !def2 || gimple_nop_p (def2)
3218 || !flow_bb_inside_loop_p (loop, gimple_bb (def2))
3219 || (def2 && flow_bb_inside_loop_p (loop, gimple_bb (def2))
3220 && (is_gimple_assign (def2)
3221 || is_gimple_call (def2)
3222 || STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def2))
3223 == vect_induction_def
3224 || (gimple_code (def2) == GIMPLE_PHI
3225 && STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def2))
3226 == vect_internal_def
3227 && !is_loop_header_bb_p (gimple_bb (def2)))))))
3229 if (! nested_in_vect_loop && orig_code != MINUS_EXPR)
3231 /* Check if we can swap operands (just for simplicity - so that
3232 the rest of the code can assume that the reduction variable
3233 is always the last (second) argument). */
3234 if (code == COND_EXPR)
3236 /* Swap cond_expr by inverting the condition. */
3237 tree cond_expr = gimple_assign_rhs1 (def_stmt);
3238 enum tree_code invert_code = ERROR_MARK;
3239 enum tree_code cond_code = TREE_CODE (cond_expr);
3241 if (TREE_CODE_CLASS (cond_code) == tcc_comparison)
3243 bool honor_nans = HONOR_NANS (TREE_OPERAND (cond_expr, 0));
3244 invert_code = invert_tree_comparison (cond_code, honor_nans);
3246 if (invert_code != ERROR_MARK)
3248 TREE_SET_CODE (cond_expr, invert_code);
3249 swap_ssa_operands (def_stmt,
3250 gimple_assign_rhs2_ptr (def_stmt),
3251 gimple_assign_rhs3_ptr (def_stmt));
3253 else
3255 if (dump_enabled_p ())
3256 report_vect_op (MSG_NOTE, def_stmt,
3257 "detected reduction: cannot swap operands "
3258 "for cond_expr");
3259 return NULL;
3262 else
3263 swap_ssa_operands (def_stmt, gimple_assign_rhs1_ptr (def_stmt),
3264 gimple_assign_rhs2_ptr (def_stmt));
3266 if (dump_enabled_p ())
3267 report_vect_op (MSG_NOTE, def_stmt,
3268 "detected reduction: need to swap operands: ");
3270 if (CONSTANT_CLASS_P (gimple_assign_rhs1 (def_stmt)))
3271 LOOP_VINFO_OPERANDS_SWAPPED (loop_info) = true;
3273 else
3275 if (dump_enabled_p ())
3276 report_vect_op (MSG_NOTE, def_stmt, "detected reduction: ");
3279 return def_stmt;
3282 /* Try to find SLP reduction chain. */
3283 if (! nested_in_vect_loop
3284 && code != COND_EXPR
3285 && orig_code != MINUS_EXPR
3286 && vect_is_slp_reduction (loop_info, phi, def_stmt))
3288 if (dump_enabled_p ())
3289 report_vect_op (MSG_NOTE, def_stmt,
3290 "reduction: detected reduction chain: ");
3292 return def_stmt;
3295 /* Dissolve a group possibly half-built by vect_is_slp_reduction. */
3296 gimple *first = REDUC_GROUP_FIRST_ELEMENT (vinfo_for_stmt (def_stmt));
3297 while (first)
3299 gimple *next = REDUC_GROUP_NEXT_ELEMENT (vinfo_for_stmt (first));
3300 REDUC_GROUP_FIRST_ELEMENT (vinfo_for_stmt (first)) = NULL;
3301 REDUC_GROUP_NEXT_ELEMENT (vinfo_for_stmt (first)) = NULL;
3302 first = next;
3305 /* Look for the expression computing loop_arg from loop PHI result. */
3306 if (check_reduction_path (vect_location, loop, as_a <gphi *> (phi), loop_arg,
3307 code))
3308 return def_stmt;
3310 if (dump_enabled_p ())
3312 report_vect_op (MSG_MISSED_OPTIMIZATION, def_stmt,
3313 "reduction: unknown pattern: ");
3316 return NULL;
3319 /* Wrapper around vect_is_simple_reduction, which will modify code
3320 in-place if it enables detection of more reductions. Arguments
3321 as there. */
3323 gimple *
3324 vect_force_simple_reduction (loop_vec_info loop_info, gimple *phi,
3325 bool *double_reduc,
3326 bool need_wrapping_integral_overflow)
3328 enum vect_reduction_type v_reduc_type;
3329 gimple *def = vect_is_simple_reduction (loop_info, phi, double_reduc,
3330 need_wrapping_integral_overflow,
3331 &v_reduc_type);
3332 if (def)
3334 stmt_vec_info reduc_def_info = vinfo_for_stmt (phi);
3335 STMT_VINFO_REDUC_TYPE (reduc_def_info) = v_reduc_type;
3336 STMT_VINFO_REDUC_DEF (reduc_def_info) = def;
3337 reduc_def_info = vinfo_for_stmt (def);
3338 STMT_VINFO_REDUC_TYPE (reduc_def_info) = v_reduc_type;
3339 STMT_VINFO_REDUC_DEF (reduc_def_info) = phi;
3341 return def;
3344 /* Calculate cost of peeling the loop PEEL_ITERS_PROLOGUE times. */
3346 vect_get_known_peeling_cost (loop_vec_info loop_vinfo, int peel_iters_prologue,
3347 int *peel_iters_epilogue,
3348 stmt_vector_for_cost *scalar_cost_vec,
3349 stmt_vector_for_cost *prologue_cost_vec,
3350 stmt_vector_for_cost *epilogue_cost_vec)
3352 int retval = 0;
3353 int assumed_vf = vect_vf_for_cost (loop_vinfo);
3355 if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
3357 *peel_iters_epilogue = assumed_vf / 2;
3358 if (dump_enabled_p ())
3359 dump_printf_loc (MSG_NOTE, vect_location,
3360 "cost model: epilogue peel iters set to vf/2 "
3361 "because loop iterations are unknown .\n");
3363 /* If peeled iterations are known but the number of scalar loop
3364 iterations is unknown, count a taken branch per peeled loop. */
3365 retval = record_stmt_cost (prologue_cost_vec, 1, cond_branch_taken,
3366 NULL, 0, vect_prologue);
3367 retval = record_stmt_cost (prologue_cost_vec, 1, cond_branch_taken,
3368 NULL, 0, vect_epilogue);
3370 else
3372 int niters = LOOP_VINFO_INT_NITERS (loop_vinfo);
3373 peel_iters_prologue = niters < peel_iters_prologue ?
3374 niters : peel_iters_prologue;
3375 *peel_iters_epilogue = (niters - peel_iters_prologue) % assumed_vf;
3376 /* If we need to peel for gaps, but no peeling is required, we have to
3377 peel VF iterations. */
3378 if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo) && !*peel_iters_epilogue)
3379 *peel_iters_epilogue = assumed_vf;
3382 stmt_info_for_cost *si;
3383 int j;
3384 if (peel_iters_prologue)
3385 FOR_EACH_VEC_ELT (*scalar_cost_vec, j, si)
3387 stmt_vec_info stmt_info
3388 = si->stmt ? vinfo_for_stmt (si->stmt) : NULL;
3389 retval += record_stmt_cost (prologue_cost_vec,
3390 si->count * peel_iters_prologue,
3391 si->kind, stmt_info, si->misalign,
3392 vect_prologue);
3394 if (*peel_iters_epilogue)
3395 FOR_EACH_VEC_ELT (*scalar_cost_vec, j, si)
3397 stmt_vec_info stmt_info
3398 = si->stmt ? vinfo_for_stmt (si->stmt) : NULL;
3399 retval += record_stmt_cost (epilogue_cost_vec,
3400 si->count * *peel_iters_epilogue,
3401 si->kind, stmt_info, si->misalign,
3402 vect_epilogue);
3405 return retval;
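/* Worked example for the known-niters branch above (illustrative, made-up
   numbers; not part of tree-vect-loop.c).  With niters = 100, an assumed
   VF of 8 and a prologue peel of 3 iterations for alignment:

     peel_iters_prologue = MIN (100, 3)   = 3
     peel_iters_epilogue = (100 - 3) % 8  = 1

   so 3 scalar prologue iterations, 12 vector iterations and 1 scalar
   epilogue iteration cover the loop, and the scalar cost vector is charged
   3 and 1 times respectively.  Had the remainder been 0 while peeling for
   gaps was required, the epilogue count would be bumped to a full VF.  */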
3408 /* Function vect_estimate_min_profitable_iters
3410 Return the number of iterations required for the vector version of the
3411 loop to be profitable relative to the cost of the scalar version of the
3412 loop.
3414 *RET_MIN_PROFITABLE_NITERS is a cost model profitability threshold
3415 of iterations for vectorization. A value of -1 means loop vectorization
3416 is not profitable. This returned value may be used for a dynamic
3417 profitability check.
3419 *RET_MIN_PROFITABLE_ESTIMATE is a profitability threshold to be used
3420 for static check against estimated number of iterations. */
3422 static void
3423 vect_estimate_min_profitable_iters (loop_vec_info loop_vinfo,
3424 int *ret_min_profitable_niters,
3425 int *ret_min_profitable_estimate)
3427 int min_profitable_iters;
3428 int min_profitable_estimate;
3429 int peel_iters_prologue;
3430 int peel_iters_epilogue;
3431 unsigned vec_inside_cost = 0;
3432 int vec_outside_cost = 0;
3433 unsigned vec_prologue_cost = 0;
3434 unsigned vec_epilogue_cost = 0;
3435 int scalar_single_iter_cost = 0;
3436 int scalar_outside_cost = 0;
3437 int assumed_vf = vect_vf_for_cost (loop_vinfo);
3438 int npeel = LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo);
3439 void *target_cost_data = LOOP_VINFO_TARGET_COST_DATA (loop_vinfo);
3441 /* Cost model disabled. */
3442 if (unlimited_cost_model (LOOP_VINFO_LOOP (loop_vinfo)))
3444 dump_printf_loc (MSG_NOTE, vect_location, "cost model disabled.\n");
3445 *ret_min_profitable_niters = 0;
3446 *ret_min_profitable_estimate = 0;
3447 return;
3450 /* Requires loop versioning tests to handle misalignment. */
3451 if (LOOP_REQUIRES_VERSIONING_FOR_ALIGNMENT (loop_vinfo))
3453 /* FIXME: Make cost depend on complexity of individual check. */
3454 unsigned len = LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).length ();
3455 (void) add_stmt_cost (target_cost_data, len, vector_stmt, NULL, 0,
3456 vect_prologue);
3457 dump_printf (MSG_NOTE,
3458 "cost model: Adding cost of checks for loop "
3459 "versioning to treat misalignment.\n");
3462 /* Requires loop versioning with alias checks. */
3463 if (LOOP_REQUIRES_VERSIONING_FOR_ALIAS (loop_vinfo))
3465 /* FIXME: Make cost depend on complexity of individual check. */
3466 unsigned len = LOOP_VINFO_COMP_ALIAS_DDRS (loop_vinfo).length ();
3467 (void) add_stmt_cost (target_cost_data, len, vector_stmt, NULL, 0,
3468 vect_prologue);
3469 len = LOOP_VINFO_CHECK_UNEQUAL_ADDRS (loop_vinfo).length ();
3470 if (len)
3471 /* Count LEN - 1 ANDs and LEN comparisons. */
3472 (void) add_stmt_cost (target_cost_data, len * 2 - 1, scalar_stmt,
3473 NULL, 0, vect_prologue);
3474 len = LOOP_VINFO_LOWER_BOUNDS (loop_vinfo).length ();
3475 if (len)
3477 /* Count LEN - 1 ANDs and LEN comparisons. */
3478 unsigned int nstmts = len * 2 - 1;
3479 /* +1 for each bias that needs adding. */
3480 for (unsigned int i = 0; i < len; ++i)
3481 if (!LOOP_VINFO_LOWER_BOUNDS (loop_vinfo)[i].unsigned_p)
3482 nstmts += 1;
3483 (void) add_stmt_cost (target_cost_data, nstmts, scalar_stmt,
3484 NULL, 0, vect_prologue);
3486 dump_printf (MSG_NOTE,
3487 "cost model: Adding cost of checks for loop "
3488 "versioning aliasing.\n");
3491 /* Requires loop versioning with niter checks. */
3492 if (LOOP_REQUIRES_VERSIONING_FOR_NITERS (loop_vinfo))
3494 /* FIXME: Make cost depend on complexity of individual check. */
3495 (void) add_stmt_cost (target_cost_data, 1, vector_stmt, NULL, 0,
3496 vect_prologue);
3497 dump_printf (MSG_NOTE,
3498 "cost model: Adding cost of checks for loop "
3499 "versioning niters.\n");
3502 if (LOOP_REQUIRES_VERSIONING (loop_vinfo))
3503 (void) add_stmt_cost (target_cost_data, 1, cond_branch_taken, NULL, 0,
3504 vect_prologue);
3506 /* Count statements in scalar loop. Using this as scalar cost for a single
3507 iteration for now.
3509 TODO: Add outer loop support.
3511 TODO: Consider assigning different costs to different scalar
3512 statements. */
3514 scalar_single_iter_cost
3515 = LOOP_VINFO_SINGLE_SCALAR_ITERATION_COST (loop_vinfo);
3517 /* Add additional cost for the peeled instructions in prologue and epilogue
3518 loop. (For fully-masked loops there will be no peeling.)
3520 FORNOW: If we don't know the value of peel_iters for prologue or epilogue
3521 at compile-time - we assume it's vf/2 (the worst would be vf-1).
3523 TODO: Build an expression that represents peel_iters for prologue and
3524 epilogue to be used in a run-time test. */
3526 if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
3528 peel_iters_prologue = 0;
3529 peel_iters_epilogue = 0;
3531 if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo))
3533 /* We need to peel exactly one iteration. */
3534 peel_iters_epilogue += 1;
3535 stmt_info_for_cost *si;
3536 int j;
3537 FOR_EACH_VEC_ELT (LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo),
3538 j, si)
3540 struct _stmt_vec_info *stmt_info
3541 = si->stmt ? vinfo_for_stmt (si->stmt) : NULL;
3542 (void) add_stmt_cost (target_cost_data, si->count,
3543 si->kind, stmt_info, si->misalign,
3544 vect_epilogue);
3548 else if (npeel < 0)
3550 peel_iters_prologue = assumed_vf / 2;
3551 dump_printf (MSG_NOTE, "cost model: "
3552 "prologue peel iters set to vf/2.\n");
3554 /* If peeling for alignment is unknown, loop bound of main loop becomes
3555 unknown. */
3556 peel_iters_epilogue = assumed_vf / 2;
3557 dump_printf (MSG_NOTE, "cost model: "
3558 "epilogue peel iters set to vf/2 because "
3559 "peeling for alignment is unknown.\n");
3561 /* If peeled iterations are unknown, count a taken branch and a not taken
3562 branch per peeled loop. Even if scalar loop iterations are known,
3563 vector iterations are not known since peeled prologue iterations are
3564 not known. Hence guards remain the same. */
3565 (void) add_stmt_cost (target_cost_data, 1, cond_branch_taken,
3566 NULL, 0, vect_prologue);
3567 (void) add_stmt_cost (target_cost_data, 1, cond_branch_not_taken,
3568 NULL, 0, vect_prologue);
3569 (void) add_stmt_cost (target_cost_data, 1, cond_branch_taken,
3570 NULL, 0, vect_epilogue);
3571 (void) add_stmt_cost (target_cost_data, 1, cond_branch_not_taken,
3572 NULL, 0, vect_epilogue);
3573 stmt_info_for_cost *si;
3574 int j;
3575 FOR_EACH_VEC_ELT (LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo), j, si)
3577 struct _stmt_vec_info *stmt_info
3578 = si->stmt ? vinfo_for_stmt (si->stmt) : NULL;
3579 (void) add_stmt_cost (target_cost_data,
3580 si->count * peel_iters_prologue,
3581 si->kind, stmt_info, si->misalign,
3582 vect_prologue);
3583 (void) add_stmt_cost (target_cost_data,
3584 si->count * peel_iters_epilogue,
3585 si->kind, stmt_info, si->misalign,
3586 vect_epilogue);
3589 else
3591 stmt_vector_for_cost prologue_cost_vec, epilogue_cost_vec;
3592 stmt_info_for_cost *si;
3593 int j;
3594 void *data = LOOP_VINFO_TARGET_COST_DATA (loop_vinfo);
3596 prologue_cost_vec.create (2);
3597 epilogue_cost_vec.create (2);
3598 peel_iters_prologue = npeel;
3600 (void) vect_get_known_peeling_cost (loop_vinfo, peel_iters_prologue,
3601 &peel_iters_epilogue,
3602 &LOOP_VINFO_SCALAR_ITERATION_COST
3603 (loop_vinfo),
3604 &prologue_cost_vec,
3605 &epilogue_cost_vec);
3607 FOR_EACH_VEC_ELT (prologue_cost_vec, j, si)
3609 struct _stmt_vec_info *stmt_info
3610 = si->stmt ? vinfo_for_stmt (si->stmt) : NULL;
3611 (void) add_stmt_cost (data, si->count, si->kind, stmt_info,
3612 si->misalign, vect_prologue);
3615 FOR_EACH_VEC_ELT (epilogue_cost_vec, j, si)
3617 struct _stmt_vec_info *stmt_info
3618 = si->stmt ? vinfo_for_stmt (si->stmt) : NULL;
3619 (void) add_stmt_cost (data, si->count, si->kind, stmt_info,
3620 si->misalign, vect_epilogue);
3623 prologue_cost_vec.release ();
3624 epilogue_cost_vec.release ();
3627 /* FORNOW: The scalar outside cost is incremented in one of the
3628 following ways:
3630 1. The vectorizer checks for alignment and aliasing and generates
3631 a condition that allows dynamic vectorization. A cost model
3632 check is ANDED with the versioning condition. Hence scalar code
3633 path now has the added cost of the versioning check.
3635 if (cost > th & versioning_check)
3636 jmp to vector code
3638 Hence the run-time scalar cost is incremented by a not-taken branch cost.
3640 2. The vectorizer then checks if a prologue is required. If the
3641 cost model check was not done before during versioning, it has to
3642 be done before the prologue check.
3644 if (cost <= th)
3645 prologue = scalar_iters
3646 if (prologue == 0)
3647 jmp to vector code
3648 else
3649 execute prologue
3650 if (prologue == num_iters)
3651 go to exit
3653 Hence the run-time scalar cost is incremented by a taken branch,
3654 plus a not-taken branch, plus a taken branch cost.
3656 3. The vectorizer then checks if an epilogue is required. If the
3657 cost model check was not done before during prologue check, it
3658 has to be done with the epilogue check.
3660 if (prologue == 0)
3661 jmp to vector code
3662 else
3663 execute prologue
3664 if (prologue == num_iters)
3665 go to exit
3666 vector code:
3667 if ((cost <= th) | (scalar_iters-prologue-epilogue == 0))
3668 jmp to epilogue
3670 Hence the run-time scalar cost should be incremented by 2 taken
3671 branches.
3673 TODO: The back end may reorder the BBS's differently and reverse
3674 conditions/branch directions. Change the estimates below to
3675 something more reasonable. */
3677 /* If the number of iterations is known and we do not do versioning, we can
3678 decide whether to vectorize at compile time. Hence the scalar version
3679 does not carry cost model guard costs. */
3680 if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
3681 || LOOP_REQUIRES_VERSIONING (loop_vinfo))
3683 /* Cost model check occurs at versioning. */
3684 if (LOOP_REQUIRES_VERSIONING (loop_vinfo))
3685 scalar_outside_cost += vect_get_stmt_cost (cond_branch_not_taken);
3686 else
3688 /* Cost model check occurs at prologue generation. */
3689 if (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) < 0)
3690 scalar_outside_cost += 2 * vect_get_stmt_cost (cond_branch_taken)
3691 + vect_get_stmt_cost (cond_branch_not_taken);
3692 /* Cost model check occurs at epilogue generation. */
3693 else
3694 scalar_outside_cost += 2 * vect_get_stmt_cost (cond_branch_taken);
3698 /* Complete the target-specific cost calculations. */
3699 finish_cost (LOOP_VINFO_TARGET_COST_DATA (loop_vinfo), &vec_prologue_cost,
3700 &vec_inside_cost, &vec_epilogue_cost);
3702 vec_outside_cost = (int)(vec_prologue_cost + vec_epilogue_cost);
3704 if (dump_enabled_p ())
3706 dump_printf_loc (MSG_NOTE, vect_location, "Cost model analysis: \n");
3707 dump_printf (MSG_NOTE, " Vector inside of loop cost: %d\n",
3708 vec_inside_cost);
3709 dump_printf (MSG_NOTE, " Vector prologue cost: %d\n",
3710 vec_prologue_cost);
3711 dump_printf (MSG_NOTE, " Vector epilogue cost: %d\n",
3712 vec_epilogue_cost);
3713 dump_printf (MSG_NOTE, " Scalar iteration cost: %d\n",
3714 scalar_single_iter_cost);
3715 dump_printf (MSG_NOTE, " Scalar outside cost: %d\n",
3716 scalar_outside_cost);
3717 dump_printf (MSG_NOTE, " Vector outside cost: %d\n",
3718 vec_outside_cost);
3719 dump_printf (MSG_NOTE, " prologue iterations: %d\n",
3720 peel_iters_prologue);
3721 dump_printf (MSG_NOTE, " epilogue iterations: %d\n",
3722 peel_iters_epilogue);
3725 /* Calculate number of iterations required to make the vector version
3726 profitable, relative to the loop bodies only. The following condition
3727 must hold true:
3728 SIC * niters + SOC > VIC * ((niters-PL_ITERS-EP_ITERS)/VF) + VOC
3729 where
3730 SIC = scalar iteration cost, VIC = vector iteration cost,
3731 VOC = vector outside cost, VF = vectorization factor,
3732 PL_ITERS = prologue iterations, EP_ITERS= epilogue iterations
3733 SOC = scalar outside cost for run time cost model check. */
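/* Purely illustrative example (hypothetical costs, not taken from any
target): with SIC = 4, VIC = 6, VF = 4, VOC = 20 and
SOC = PL_ITERS = EP_ITERS = 0, the condition above (after multiplying
through by VF) reduces to

niters * (SIC * VF - VIC) > (VOC - SOC) * VF
niters * (16 - 6) > 80

so the scalar loop wins up to niters = 8 and the code below computes
min_profitable_iters = 9. */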
3735 if ((scalar_single_iter_cost * assumed_vf) > (int) vec_inside_cost)
3737 min_profitable_iters = ((vec_outside_cost - scalar_outside_cost)
3738 * assumed_vf
3739 - vec_inside_cost * peel_iters_prologue
3740 - vec_inside_cost * peel_iters_epilogue);
3741 if (min_profitable_iters <= 0)
3742 min_profitable_iters = 0;
3743 else
3745 min_profitable_iters /= ((scalar_single_iter_cost * assumed_vf)
3746 - vec_inside_cost);
3748 if ((scalar_single_iter_cost * assumed_vf * min_profitable_iters)
3749 <= (((int) vec_inside_cost * min_profitable_iters)
3750 + (((int) vec_outside_cost - scalar_outside_cost)
3751 * assumed_vf)))
3752 min_profitable_iters++;
3755 /* The vector version will never be profitable. */
3756 else
3758 if (LOOP_VINFO_LOOP (loop_vinfo)->force_vectorize)
3759 warning_at (vect_location, OPT_Wopenmp_simd, "vectorization "
3760 "did not happen for a simd loop");
3762 if (dump_enabled_p ())
3763 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3764 "cost model: the vector iteration cost = %d "
3765 "divided by the scalar iteration cost = %d "
3766 "is greater or equal to the vectorization factor = %d"
3767 ".\n",
3768 vec_inside_cost, scalar_single_iter_cost, assumed_vf);
3769 *ret_min_profitable_niters = -1;
3770 *ret_min_profitable_estimate = -1;
3771 return;
3774 dump_printf (MSG_NOTE,
3775 " Calculated minimum iters for profitability: %d\n",
3776 min_profitable_iters);
3778 if (!LOOP_VINFO_FULLY_MASKED_P (loop_vinfo)
3779 && min_profitable_iters < (assumed_vf + peel_iters_prologue))
3780 /* We want the vectorized loop to execute at least once. */
3781 min_profitable_iters = assumed_vf + peel_iters_prologue;
3783 if (dump_enabled_p ())
3784 dump_printf_loc (MSG_NOTE, vect_location,
3785 " Runtime profitability threshold = %d\n",
3786 min_profitable_iters);
3788 *ret_min_profitable_niters = min_profitable_iters;
3790 /* Calculate number of iterations required to make the vector version
3791 profitable, relative to the loop bodies only.
3793 The non-vectorized variant costs SIC * niters and it must win over the vector
3794 variant on the expected loop trip count. The following condition must hold true:
3795 SIC * niters > VIC * ((niters-PL_ITERS-EP_ITERS)/VF) + VOC + SOC */
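/* Illustrative continuation of the example above (hypothetical costs): with
SIC = 4, VIC = 6, VF = 4, VOC = 20, SOC = 8 and no peeling, the runtime
threshold above solves niters * 10 > (20 - 8) * 4 and gives 5, whereas the
static estimate below divides (20 + 8) * 4 by 10 and gives 11. SOC moves to
the vector side here because the plain scalar loop carries no runtime guard
costs, while the vectorized variant still does. */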
3797 if (vec_outside_cost <= 0)
3798 min_profitable_estimate = 0;
3799 else
3801 min_profitable_estimate = ((vec_outside_cost + scalar_outside_cost)
3802 * assumed_vf
3803 - vec_inside_cost * peel_iters_prologue
3804 - vec_inside_cost * peel_iters_epilogue)
3805 / ((scalar_single_iter_cost * assumed_vf)
3806 - vec_inside_cost);
3808 min_profitable_estimate = MAX (min_profitable_estimate, min_profitable_iters);
3809 if (dump_enabled_p ())
3810 dump_printf_loc (MSG_NOTE, vect_location,
3811 " Static estimate profitability threshold = %d\n",
3812 min_profitable_estimate);
3814 *ret_min_profitable_estimate = min_profitable_estimate;
3817 /* Writes into SEL a mask for a vec_perm, equivalent to a vec_shr by OFFSET
3818 vector elements (not bits) for a vector with NELT elements. */
3819 static void
3820 calc_vec_perm_mask_for_shift (unsigned int offset, unsigned int nelt,
3821 vec_perm_builder *sel)
3823 /* The encoding is a single stepped pattern. Any wrap-around is handled
3824 by vec_perm_indices. */
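/* For example (hypothetical values): OFFSET == 2 and NELT == 8 push the
stepped series 2, 3, 4, which vec_perm_indices expands to the selector
{ 2, 3, 4, 5, 6, 7, 8, 9 }. Indices 8 and 9 wrap into the second input
vector, which the shift-based reduction epilogue below supplies as a
zero vector. */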
3825 sel->new_vector (nelt, 1, 3);
3826 for (unsigned int i = 0; i < 3; i++)
3827 sel->quick_push (i + offset);
3830 /* Checks whether the target supports whole-vector shifts for vectors of mode
3831 MODE. This is the case if _either_ the platform handles vec_shr_optab, _or_
3832 it supports vec_perm_const with masks for all necessary shift amounts. */
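/* For example, for NELT == 8 the loop below checks shift amounts of 4, 2
and 1 elements, which are exactly the offsets the log2-step shift
reduction in the epilogue needs. */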
3833 static bool
3834 have_whole_vector_shift (machine_mode mode)
3836 if (optab_handler (vec_shr_optab, mode) != CODE_FOR_nothing)
3837 return true;
3839 /* Variable-length vectors should be handled via the optab. */
3840 unsigned int nelt;
3841 if (!GET_MODE_NUNITS (mode).is_constant (&nelt))
3842 return false;
3844 vec_perm_builder sel;
3845 vec_perm_indices indices;
3846 for (unsigned int i = nelt / 2; i >= 1; i /= 2)
3848 calc_vec_perm_mask_for_shift (i, nelt, &sel);
3849 indices.new_vector (sel, 2, nelt);
3850 if (!can_vec_perm_const_p (mode, indices, false))
3851 return false;
3853 return true;
3856 /* TODO: Close dependency between vect_model_*_cost and vectorizable_*
3857 functions. Design better to avoid maintenance issues. */
3859 /* Function vect_model_reduction_cost.
3861 Models cost for a reduction operation, including the vector ops
3862 generated within the strip-mine loop, the initial definition before
3863 the loop, and the epilogue code that must be generated. */
3865 static void
3866 vect_model_reduction_cost (stmt_vec_info stmt_info, internal_fn reduc_fn,
3867 int ncopies, stmt_vector_for_cost *cost_vec)
3869 int prologue_cost = 0, epilogue_cost = 0, inside_cost;
3870 enum tree_code code;
3871 optab optab;
3872 tree vectype;
3873 gimple *orig_stmt;
3874 machine_mode mode;
3875 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
3876 struct loop *loop = NULL;
3878 if (loop_vinfo)
3879 loop = LOOP_VINFO_LOOP (loop_vinfo);
3881 /* Condition reductions generate two reductions in the loop. */
3882 vect_reduction_type reduction_type
3883 = STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info);
3884 if (reduction_type == COND_REDUCTION)
3885 ncopies *= 2;
3887 vectype = STMT_VINFO_VECTYPE (stmt_info);
3888 mode = TYPE_MODE (vectype);
3889 orig_stmt = STMT_VINFO_RELATED_STMT (stmt_info);
3891 if (!orig_stmt)
3892 orig_stmt = STMT_VINFO_STMT (stmt_info);
3894 code = gimple_assign_rhs_code (orig_stmt);
3896 if (reduction_type == EXTRACT_LAST_REDUCTION
3897 || reduction_type == FOLD_LEFT_REDUCTION)
3899 /* No extra instructions needed in the prologue. */
3900 prologue_cost = 0;
3902 if (reduction_type == EXTRACT_LAST_REDUCTION || reduc_fn != IFN_LAST)
3903 /* Count one reduction-like operation per vector. */
3904 inside_cost = record_stmt_cost (cost_vec, ncopies, vec_to_scalar,
3905 stmt_info, 0, vect_body);
3906 else
3908 /* Use NELEMENTS extracts and NELEMENTS scalar ops. */
3909 unsigned int nelements = ncopies * vect_nunits_for_cost (vectype);
3910 inside_cost = record_stmt_cost (cost_vec, nelements,
3911 vec_to_scalar, stmt_info, 0,
3912 vect_body);
3913 inside_cost += record_stmt_cost (cost_vec, nelements,
3914 scalar_stmt, stmt_info, 0,
3915 vect_body);
3918 else
3920 /* Add in cost for initial definition.
3921 For cond reduction we have four vectors: initial index, step,
3922 initial result of the data reduction, initial value of the index
3923 reduction. */
3924 int prologue_stmts = reduction_type == COND_REDUCTION ? 4 : 1;
3925 prologue_cost += record_stmt_cost (cost_vec, prologue_stmts,
3926 scalar_to_vec, stmt_info, 0,
3927 vect_prologue);
3929 /* Cost of reduction op inside loop. */
3930 inside_cost = record_stmt_cost (cost_vec, ncopies, vector_stmt,
3931 stmt_info, 0, vect_body);
3934 /* Determine cost of epilogue code.
3936 We have a reduction operator that will reduce the vector in one statement.
3937 Also requires scalar extract. */
3939 if (!loop || !nested_in_vect_loop_p (loop, orig_stmt))
3941 if (reduc_fn != IFN_LAST)
3943 if (reduction_type == COND_REDUCTION)
3945 /* An EQ stmt and a COND_EXPR stmt. */
3946 epilogue_cost += record_stmt_cost (cost_vec, 2,
3947 vector_stmt, stmt_info, 0,
3948 vect_epilogue);
3949 /* Reduction of the max index and a reduction of the found
3950 values. */
3951 epilogue_cost += record_stmt_cost (cost_vec, 2,
3952 vec_to_scalar, stmt_info, 0,
3953 vect_epilogue);
3954 /* A broadcast of the max value. */
3955 epilogue_cost += record_stmt_cost (cost_vec, 1,
3956 scalar_to_vec, stmt_info, 0,
3957 vect_epilogue);
3959 else
3961 epilogue_cost += record_stmt_cost (cost_vec, 1, vector_stmt,
3962 stmt_info, 0, vect_epilogue);
3963 epilogue_cost += record_stmt_cost (cost_vec, 1,
3964 vec_to_scalar, stmt_info, 0,
3965 vect_epilogue);
3968 else if (reduction_type == COND_REDUCTION)
3970 unsigned estimated_nunits = vect_nunits_for_cost (vectype);
3971 /* Extraction of scalar elements. */
3972 epilogue_cost += record_stmt_cost (cost_vec,
3973 2 * estimated_nunits,
3974 vec_to_scalar, stmt_info, 0,
3975 vect_epilogue);
3976 /* Scalar max reductions via COND_EXPR / MAX_EXPR. */
3977 epilogue_cost += record_stmt_cost (cost_vec,
3978 2 * estimated_nunits - 3,
3979 scalar_stmt, stmt_info, 0,
3980 vect_epilogue);
3982 else if (reduction_type == EXTRACT_LAST_REDUCTION
3983 || reduction_type == FOLD_LEFT_REDUCTION)
3984 /* No extra instructions needed in the epilogue. */
3986 else
3988 int vec_size_in_bits = tree_to_uhwi (TYPE_SIZE (vectype));
3989 tree bitsize =
3990 TYPE_SIZE (TREE_TYPE (gimple_assign_lhs (orig_stmt)));
3991 int element_bitsize = tree_to_uhwi (bitsize);
3992 int nelements = vec_size_in_bits / element_bitsize;
3994 if (code == COND_EXPR)
3995 code = MAX_EXPR;
3997 optab = optab_for_tree_code (code, vectype, optab_default);
3999 /* We have a whole vector shift available. */
4000 if (optab != unknown_optab
4001 && VECTOR_MODE_P (mode)
4002 && optab_handler (optab, mode) != CODE_FOR_nothing
4003 && have_whole_vector_shift (mode))
4005 /* Final reduction via vector shifts and the reduction operator.
4006 Also requires scalar extract. */
4007 epilogue_cost += record_stmt_cost (cost_vec,
4008 exact_log2 (nelements) * 2,
4009 vector_stmt, stmt_info, 0,
4010 vect_epilogue);
4011 epilogue_cost += record_stmt_cost (cost_vec, 1,
4012 vec_to_scalar, stmt_info, 0,
4013 vect_epilogue);
4015 else
4016 /* Use extracts and reduction op for final reduction. For N
4017 elements, we have N extracts and N-1 reduction ops. */
4018 epilogue_cost += record_stmt_cost (cost_vec,
4019 nelements + nelements - 1,
4020 vector_stmt, stmt_info, 0,
4021 vect_epilogue);
4025 if (dump_enabled_p ())
4026 dump_printf (MSG_NOTE,
4027 "vect_model_reduction_cost: inside_cost = %d, "
4028 "prologue_cost = %d, epilogue_cost = %d .\n", inside_cost,
4029 prologue_cost, epilogue_cost);
4033 /* Function vect_model_induction_cost.
4035 Models cost for induction operations. */
4037 static void
4038 vect_model_induction_cost (stmt_vec_info stmt_info, int ncopies,
4039 stmt_vector_for_cost *cost_vec)
4041 unsigned inside_cost, prologue_cost;
4043 if (PURE_SLP_STMT (stmt_info))
4044 return;
4046 /* loop cost for vec_loop. */
4047 inside_cost = record_stmt_cost (cost_vec, ncopies, vector_stmt,
4048 stmt_info, 0, vect_body);
4050 /* prologue cost for vec_init and vec_step. */
4051 prologue_cost = record_stmt_cost (cost_vec, 2, scalar_to_vec,
4052 stmt_info, 0, vect_prologue);
4054 if (dump_enabled_p ())
4055 dump_printf_loc (MSG_NOTE, vect_location,
4056 "vect_model_induction_cost: inside_cost = %d, "
4057 "prologue_cost = %d .\n", inside_cost, prologue_cost);
4062 /* Function get_initial_def_for_reduction
4064 Input:
4065 STMT - a stmt that performs a reduction operation in the loop.
4066 INIT_VAL - the initial value of the reduction variable
4068 Output:
4069 ADJUSTMENT_DEF - a tree that holds a value to be added to the final result
4070 of the reduction (used for adjusting the epilog - see below).
4071 Return a vector variable, initialized according to the operation that STMT
4072 performs. This vector will be used as the initial value of the
4073 vector of partial results.
4075 Option1 (adjust in epilog): Initialize the vector as follows:
4076 add/bit or/xor: [0,0,...,0,0]
4077 mult/bit and: [1,1,...,1,1]
4078 min/max/cond_expr: [init_val,init_val,..,init_val,init_val]
4079 and when necessary (e.g. add/mult case) let the caller know
4080 that it needs to adjust the result by init_val.
4082 Option2: Initialize the vector as follows:
4083 add/bit or/xor: [init_val,0,0,...,0]
4084 mult/bit and: [init_val,1,1,...,1]
4085 min/max/cond_expr: [init_val,init_val,...,init_val]
4086 and no adjustments are needed.
4088 For example, for the following code:
4090 s = init_val;
4091 for (i=0;i<n;i++)
4092 s = s + a[i];
4094 STMT is 's = s + a[i]', and the reduction variable is 's'.
4095 For a vector of 4 units, we want to return either [0,0,0,init_val],
4096 or [0,0,0,0] and let the caller know that it needs to adjust
4097 the result at the end by 'init_val'.
4099 FORNOW, we use Option1 (the 'adjust in epilog' scheme) if ADJUSTMENT_DEF
4100 is not NULL, because this way the initialization vector is simpler (the
4101 same element in all entries), and Option2 otherwise.
4103 A cost model should help decide between these two schemes. */
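/* For instance, for a product reduction

s = init_val;
for (i=0;i<n;i++)
s = s * a[i];

Option1 would use [1,1,...,1] and ask the caller to multiply the reduced
result by init_val in the epilog, while Option2 would use
[init_val,1,...,1] with no adjustment. (Illustration only; the code below
follows the FORNOW rule above.) */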
4105 tree
4106 get_initial_def_for_reduction (gimple *stmt, tree init_val,
4107 tree *adjustment_def)
4109 stmt_vec_info stmt_vinfo = vinfo_for_stmt (stmt);
4110 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_vinfo);
4111 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
4112 tree scalar_type = TREE_TYPE (init_val);
4113 tree vectype = get_vectype_for_scalar_type (scalar_type);
4114 enum tree_code code = gimple_assign_rhs_code (stmt);
4115 tree def_for_init;
4116 tree init_def;
4117 bool nested_in_vect_loop = false;
4118 REAL_VALUE_TYPE real_init_val = dconst0;
4119 int int_init_val = 0;
4120 gimple *def_stmt = NULL;
4121 gimple_seq stmts = NULL;
4123 gcc_assert (vectype);
4125 gcc_assert (POINTER_TYPE_P (scalar_type) || INTEGRAL_TYPE_P (scalar_type)
4126 || SCALAR_FLOAT_TYPE_P (scalar_type));
4128 if (nested_in_vect_loop_p (loop, stmt))
4129 nested_in_vect_loop = true;
4130 else
4131 gcc_assert (loop == (gimple_bb (stmt))->loop_father);
4133 /* In case of double reduction we only create a vector variable to be put
4134 in the reduction phi node. The actual statement creation is done in
4135 vect_create_epilog_for_reduction. */
4136 if (adjustment_def && nested_in_vect_loop
4137 && TREE_CODE (init_val) == SSA_NAME
4138 && (def_stmt = SSA_NAME_DEF_STMT (init_val))
4139 && gimple_code (def_stmt) == GIMPLE_PHI
4140 && flow_bb_inside_loop_p (loop, gimple_bb (def_stmt))
4141 && vinfo_for_stmt (def_stmt)
4142 && STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def_stmt))
4143 == vect_double_reduction_def)
4145 *adjustment_def = NULL;
4146 return vect_create_destination_var (init_val, vectype);
4149 vect_reduction_type reduction_type
4150 = STMT_VINFO_VEC_REDUCTION_TYPE (stmt_vinfo);
4152 /* In case of a nested reduction do not use an adjustment def, as
4153 that case is not handled correctly by the epilogue generation
4154 when ncopies is not one. */
4155 if (adjustment_def && nested_in_vect_loop)
4157 *adjustment_def = NULL;
4158 return vect_get_vec_def_for_operand (init_val, stmt);
4161 switch (code)
4163 case WIDEN_SUM_EXPR:
4164 case DOT_PROD_EXPR:
4165 case SAD_EXPR:
4166 case PLUS_EXPR:
4167 case MINUS_EXPR:
4168 case BIT_IOR_EXPR:
4169 case BIT_XOR_EXPR:
4170 case MULT_EXPR:
4171 case BIT_AND_EXPR:
4173 /* ADJUSTMENT_DEF is NULL when called from
4174 vect_create_epilog_for_reduction to vectorize double reduction. */
4175 if (adjustment_def)
4176 *adjustment_def = init_val;
4178 if (code == MULT_EXPR)
4180 real_init_val = dconst1;
4181 int_init_val = 1;
4184 if (code == BIT_AND_EXPR)
4185 int_init_val = -1;
4187 if (SCALAR_FLOAT_TYPE_P (scalar_type))
4188 def_for_init = build_real (scalar_type, real_init_val);
4189 else
4190 def_for_init = build_int_cst (scalar_type, int_init_val);
4192 if (adjustment_def)
4193 /* Option1: the first element is '0' or '1' as well. */
4194 init_def = gimple_build_vector_from_val (&stmts, vectype,
4195 def_for_init);
4196 else if (!TYPE_VECTOR_SUBPARTS (vectype).is_constant ())
4198 /* Option2 (variable length): the first element is INIT_VAL. */
4199 init_def = gimple_build_vector_from_val (&stmts, vectype,
4200 def_for_init);
4201 init_def = gimple_build (&stmts, CFN_VEC_SHL_INSERT,
4202 vectype, init_def, init_val);
4204 else
4206 /* Option2: the first element is INIT_VAL. */
4207 tree_vector_builder elts (vectype, 1, 2);
4208 elts.quick_push (init_val);
4209 elts.quick_push (def_for_init);
4210 init_def = gimple_build_vector (&stmts, &elts);
4213 break;
4215 case MIN_EXPR:
4216 case MAX_EXPR:
4217 case COND_EXPR:
4219 if (adjustment_def)
4221 *adjustment_def = NULL_TREE;
4222 if (reduction_type != COND_REDUCTION
4223 && reduction_type != EXTRACT_LAST_REDUCTION)
4225 init_def = vect_get_vec_def_for_operand (init_val, stmt);
4226 break;
4229 init_val = gimple_convert (&stmts, TREE_TYPE (vectype), init_val);
4230 init_def = gimple_build_vector_from_val (&stmts, vectype, init_val);
4232 break;
4234 default:
4235 gcc_unreachable ();
4238 if (stmts)
4239 gsi_insert_seq_on_edge_immediate (loop_preheader_edge (loop), stmts);
4240 return init_def;
4243 /* Get at the initial defs for the reduction PHIs in SLP_NODE.
4244 NUMBER_OF_VECTORS is the number of vector defs to create.
4245 If NEUTRAL_OP is nonnull, introducing extra elements of that
4246 value will not change the result. */
4248 static void
4249 get_initial_defs_for_reduction (slp_tree slp_node,
4250 vec<tree> *vec_oprnds,
4251 unsigned int number_of_vectors,
4252 bool reduc_chain, tree neutral_op)
4254 vec<gimple *> stmts = SLP_TREE_SCALAR_STMTS (slp_node);
4255 gimple *stmt = stmts[0];
4256 stmt_vec_info stmt_vinfo = vinfo_for_stmt (stmt);
4257 unsigned HOST_WIDE_INT nunits;
4258 unsigned j, number_of_places_left_in_vector;
4259 tree vector_type;
4260 tree vop;
4261 int group_size = stmts.length ();
4262 unsigned int vec_num, i;
4263 unsigned number_of_copies = 1;
4264 vec<tree> voprnds;
4265 voprnds.create (number_of_vectors);
4266 struct loop *loop;
4267 auto_vec<tree, 16> permute_results;
4269 vector_type = STMT_VINFO_VECTYPE (stmt_vinfo);
4271 gcc_assert (STMT_VINFO_DEF_TYPE (stmt_vinfo) == vect_reduction_def);
4273 loop = (gimple_bb (stmt))->loop_father;
4274 gcc_assert (loop);
4275 edge pe = loop_preheader_edge (loop);
4277 gcc_assert (!reduc_chain || neutral_op);
4279 /* NUMBER_OF_COPIES is the number of times we need to use the same values in
4280 created vectors. It is greater than 1 if unrolling is performed.
4282 For example, we have two scalar operands, s1 and s2 (e.g., group of
4283 strided accesses of size two), while NUNITS is four (i.e., four scalars
4284 of this type can be packed in a vector). The output vector will contain
4285 two copies of each scalar operand: {s1, s2, s1, s2}. (NUMBER_OF_COPIES
4286 will be 2).
4288 If REDUC_GROUP_SIZE > NUNITS, the scalars will be split into several
4289 vectors containing the operands.
4291 For example, NUNITS is four as before, and the group size is 8
4292 (s1, s2, ..., s8). We will create two vectors {s1, s2, s3, s4} and
4293 {s5, s6, s7, s8}. */
4295 if (!TYPE_VECTOR_SUBPARTS (vector_type).is_constant (&nunits))
4296 nunits = group_size;
4298 number_of_copies = nunits * number_of_vectors / group_size;
4300 number_of_places_left_in_vector = nunits;
4301 bool constant_p = true;
4302 tree_vector_builder elts (vector_type, nunits, 1);
4303 elts.quick_grow (nunits);
4304 for (j = 0; j < number_of_copies; j++)
4306 for (i = group_size - 1; stmts.iterate (i, &stmt); i--)
4308 tree op;
4309 /* Get the def before the loop. In a reduction chain we have only
4310 one initial value. */
4311 if ((j != (number_of_copies - 1)
4312 || (reduc_chain && i != 0))
4313 && neutral_op)
4314 op = neutral_op;
4315 else
4316 op = PHI_ARG_DEF_FROM_EDGE (stmt, pe);
4318 /* Create 'vect_ = {op0,op1,...,opn}'. */
4319 number_of_places_left_in_vector--;
4320 elts[number_of_places_left_in_vector] = op;
4321 if (!CONSTANT_CLASS_P (op))
4322 constant_p = false;
4324 if (number_of_places_left_in_vector == 0)
4326 gimple_seq ctor_seq = NULL;
4327 tree init;
4328 if (constant_p && !neutral_op
4329 ? multiple_p (TYPE_VECTOR_SUBPARTS (vector_type), nunits)
4330 : known_eq (TYPE_VECTOR_SUBPARTS (vector_type), nunits))
4331 /* Build the vector directly from ELTS. */
4332 init = gimple_build_vector (&ctor_seq, &elts);
4333 else if (neutral_op)
4335 /* Build a vector of the neutral value and shift the
4336 other elements into place. */
4337 init = gimple_build_vector_from_val (&ctor_seq, vector_type,
4338 neutral_op);
4339 int k = nunits;
4340 while (k > 0 && elts[k - 1] == neutral_op)
4341 k -= 1;
4342 while (k > 0)
4344 k -= 1;
4345 init = gimple_build (&ctor_seq, CFN_VEC_SHL_INSERT,
4346 vector_type, init, elts[k]);
4349 else
4351 /* First time round, duplicate ELTS to fill the
4352 required number of vectors, then cherry pick the
4353 appropriate result for each iteration. */
4354 if (vec_oprnds->is_empty ())
4355 duplicate_and_interleave (&ctor_seq, vector_type, elts,
4356 number_of_vectors,
4357 permute_results);
4358 init = permute_results[number_of_vectors - j - 1];
4360 if (ctor_seq != NULL)
4361 gsi_insert_seq_on_edge_immediate (pe, ctor_seq);
4362 voprnds.quick_push (init);
4364 number_of_places_left_in_vector = nunits;
4365 elts.new_vector (vector_type, nunits, 1);
4366 elts.quick_grow (nunits);
4367 constant_p = true;
4372 /* Since the vectors are created in the reverse order, we should invert
4373 them. */
4374 vec_num = voprnds.length ();
4375 for (j = vec_num; j != 0; j--)
4377 vop = voprnds[j - 1];
4378 vec_oprnds->quick_push (vop);
4381 voprnds.release ();
4383 /* In case the VF is greater than the unrolling factor needed for the SLP
4384 group of stmts, NUMBER_OF_VECTORS to be created is greater than
4385 NUMBER_OF_SCALARS/NUNITS or NUNITS/NUMBER_OF_SCALARS, and hence we have
4386 to replicate the vectors. */
4387 tree neutral_vec = NULL;
4388 while (number_of_vectors > vec_oprnds->length ())
4390 if (neutral_op)
4392 if (!neutral_vec)
4394 gimple_seq ctor_seq = NULL;
4395 neutral_vec = gimple_build_vector_from_val
4396 (&ctor_seq, vector_type, neutral_op);
4397 if (ctor_seq != NULL)
4398 gsi_insert_seq_on_edge_immediate (pe, ctor_seq);
4400 vec_oprnds->quick_push (neutral_vec);
4402 else
4404 for (i = 0; vec_oprnds->iterate (i, &vop) && i < vec_num; i++)
4405 vec_oprnds->quick_push (vop);
4411 /* Function vect_create_epilog_for_reduction
4413 Create code at the loop-epilog to finalize the result of a reduction
4414 computation.
4416 VECT_DEFS is the list of vectors of partial results, i.e., the lhs's of vector
4417 reduction statements.
4418 STMT is the scalar reduction stmt that is being vectorized.
4419 NCOPIES is > 1 in case the vectorization factor (VF) is bigger than the
4420 number of elements that we can fit in a vectype (nunits). In this case
4421 we have to generate more than one vector stmt - i.e - we need to "unroll"
4422 the vector stmt by a factor VF/nunits. For more details see documentation
4423 in vectorizable_operation.
4424 REDUC_FN is the internal function for the epilog reduction.
4425 REDUCTION_PHIS is a list of the phi-nodes that carry the reduction
4426 computation.
4427 REDUC_INDEX is the index of the operand in the right hand side of the
4428 statement that is defined by REDUCTION_PHI.
4429 DOUBLE_REDUC is TRUE if double reduction phi nodes should be handled.
4430 SLP_NODE is an SLP node containing a group of reduction statements. The
4431 first one in this group is STMT.
4432 INDUC_VAL is for INTEGER_INDUC_COND_REDUCTION the value to use for the case
4433 when the COND_EXPR is never true in the loop. For MAX_EXPR, it needs to
4434 be smaller than any value of the IV in the loop, for MIN_EXPR larger than
4435 any value of the IV in the loop.
4436 INDUC_CODE is the code for epilog reduction if INTEGER_INDUC_COND_REDUCTION.
4437 NEUTRAL_OP is the value given by neutral_op_for_slp_reduction; it is
4438 null if this is not an SLP reduction.
4440 This function:
4441 1. Creates the reduction def-use cycles: sets the arguments for
4442 REDUCTION_PHIS:
4443 The loop-entry argument is the vectorized initial-value of the reduction.
4444 The loop-latch argument is taken from VECT_DEFS - the vector of partial
4445 sums.
4446 2. "Reduces" each vector of partial results VECT_DEFS into a single result,
4447 by calling the function specified by REDUC_FN if available, or by
4448 other means (whole-vector shifts or a scalar loop).
4449 The function also creates a new phi node at the loop exit to preserve
4450 loop-closed form, as illustrated below.
4452 The flow at the entry to this function:
4454 loop:
4455 vec_def = phi <null, null> # REDUCTION_PHI
4456 VECT_DEF = vector_stmt # vectorized form of STMT
4457 s_loop = scalar_stmt # (scalar) STMT
4458 loop_exit:
4459 s_out0 = phi <s_loop> # (scalar) EXIT_PHI
4460 use <s_out0>
4461 use <s_out0>
4463 The above is transformed by this function into:
4465 loop:
4466 vec_def = phi <vec_init, VECT_DEF> # REDUCTION_PHI
4467 VECT_DEF = vector_stmt # vectorized form of STMT
4468 s_loop = scalar_stmt # (scalar) STMT
4469 loop_exit:
4470 s_out0 = phi <s_loop> # (scalar) EXIT_PHI
4471 v_out1 = phi <VECT_DEF> # NEW_EXIT_PHI
4472 v_out2 = reduce <v_out1>
4473 s_out3 = extract_field <v_out2, 0>
4474 s_out4 = adjust_result <s_out3>
4475 use <s_out4>
4476 use <s_out4>
4479 static void
4480 vect_create_epilog_for_reduction (vec<tree> vect_defs, gimple *stmt,
4481 gimple *reduc_def_stmt,
4482 int ncopies, internal_fn reduc_fn,
4483 vec<gimple *> reduction_phis,
4484 bool double_reduc,
4485 slp_tree slp_node,
4486 slp_instance slp_node_instance,
4487 tree induc_val, enum tree_code induc_code,
4488 tree neutral_op)
4490 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
4491 stmt_vec_info prev_phi_info;
4492 tree vectype;
4493 machine_mode mode;
4494 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
4495 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo), *outer_loop = NULL;
4496 basic_block exit_bb;
4497 tree scalar_dest;
4498 tree scalar_type;
4499 gimple *new_phi = NULL, *phi;
4500 gimple_stmt_iterator exit_gsi;
4501 tree vec_dest;
4502 tree new_temp = NULL_TREE, new_dest, new_name, new_scalar_dest;
4503 gimple *epilog_stmt = NULL;
4504 enum tree_code code = gimple_assign_rhs_code (stmt);
4505 gimple *exit_phi;
4506 tree bitsize;
4507 tree adjustment_def = NULL;
4508 tree vec_initial_def = NULL;
4509 tree expr, def, initial_def = NULL;
4510 tree orig_name, scalar_result;
4511 imm_use_iterator imm_iter, phi_imm_iter;
4512 use_operand_p use_p, phi_use_p;
4513 gimple *use_stmt, *orig_stmt, *reduction_phi = NULL;
4514 bool nested_in_vect_loop = false;
4515 auto_vec<gimple *> new_phis;
4516 auto_vec<gimple *> inner_phis;
4517 enum vect_def_type dt = vect_unknown_def_type;
4518 int j, i;
4519 auto_vec<tree> scalar_results;
4520 unsigned int group_size = 1, k, ratio;
4521 auto_vec<tree> vec_initial_defs;
4522 auto_vec<gimple *> phis;
4523 bool slp_reduc = false;
4524 bool direct_slp_reduc;
4525 tree new_phi_result;
4526 gimple *inner_phi = NULL;
4527 tree induction_index = NULL_TREE;
4529 if (slp_node)
4530 group_size = SLP_TREE_SCALAR_STMTS (slp_node).length ();
4532 if (nested_in_vect_loop_p (loop, stmt))
4534 outer_loop = loop;
4535 loop = loop->inner;
4536 nested_in_vect_loop = true;
4537 gcc_assert (!slp_node);
4540 vectype = STMT_VINFO_VECTYPE (stmt_info);
4541 gcc_assert (vectype);
4542 mode = TYPE_MODE (vectype);
4544 /* 1. Create the reduction def-use cycle:
4545 Set the arguments of REDUCTION_PHIS, i.e., transform
4547 loop:
4548 vec_def = phi <null, null> # REDUCTION_PHI
4549 VECT_DEF = vector_stmt # vectorized form of STMT
4552 into:
4554 loop:
4555 vec_def = phi <vec_init, VECT_DEF> # REDUCTION_PHI
4556 VECT_DEF = vector_stmt # vectorized form of STMT
4559 (in case of SLP, do it for all the phis). */
4561 /* Get the loop-entry arguments. */
4562 enum vect_def_type initial_def_dt = vect_unknown_def_type;
4563 if (slp_node)
4565 unsigned vec_num = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node);
4566 vec_initial_defs.reserve (vec_num);
4567 get_initial_defs_for_reduction (slp_node_instance->reduc_phis,
4568 &vec_initial_defs, vec_num,
4569 REDUC_GROUP_FIRST_ELEMENT (stmt_info),
4570 neutral_op);
4572 else
4574 /* Get at the scalar def before the loop, that defines the initial value
4575 of the reduction variable. */
4576 gimple *def_stmt;
4577 initial_def = PHI_ARG_DEF_FROM_EDGE (reduc_def_stmt,
4578 loop_preheader_edge (loop));
4579 /* Optimize: if initial_def is for REDUC_MAX smaller than the base
4580 and we can't use zero for induc_val, use initial_def. Similarly
4581 for REDUC_MIN and initial_def larger than the base. */
4582 if (TREE_CODE (initial_def) == INTEGER_CST
4583 && (STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info)
4584 == INTEGER_INDUC_COND_REDUCTION)
4585 && !integer_zerop (induc_val)
4586 && ((induc_code == MAX_EXPR
4587 && tree_int_cst_lt (initial_def, induc_val))
4588 || (induc_code == MIN_EXPR
4589 && tree_int_cst_lt (induc_val, initial_def))))
4590 induc_val = initial_def;
4591 vect_is_simple_use (initial_def, loop_vinfo, &def_stmt, &initial_def_dt);
4592 vec_initial_def = get_initial_def_for_reduction (stmt, initial_def,
4593 &adjustment_def);
4594 vec_initial_defs.create (1);
4595 vec_initial_defs.quick_push (vec_initial_def);
4598 /* Set phi nodes arguments. */
4599 FOR_EACH_VEC_ELT (reduction_phis, i, phi)
4601 tree vec_init_def = vec_initial_defs[i];
4602 tree def = vect_defs[i];
4603 for (j = 0; j < ncopies; j++)
4605 if (j != 0)
4607 phi = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (phi));
4608 if (nested_in_vect_loop)
4609 vec_init_def
4610 = vect_get_vec_def_for_stmt_copy (initial_def_dt,
4611 vec_init_def);
4614 /* Set the loop-entry arg of the reduction-phi. */
4616 if (STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info)
4617 == INTEGER_INDUC_COND_REDUCTION)
4619 /* Initialise the reduction phi to zero. This prevents non-zero
4620 initial values from interfering with the reduction op. */
4621 gcc_assert (ncopies == 1);
4622 gcc_assert (i == 0);
4624 tree vec_init_def_type = TREE_TYPE (vec_init_def);
4625 tree induc_val_vec
4626 = build_vector_from_val (vec_init_def_type, induc_val);
4628 add_phi_arg (as_a <gphi *> (phi), induc_val_vec,
4629 loop_preheader_edge (loop), UNKNOWN_LOCATION);
4631 else
4632 add_phi_arg (as_a <gphi *> (phi), vec_init_def,
4633 loop_preheader_edge (loop), UNKNOWN_LOCATION);
4635 /* Set the loop-latch arg for the reduction-phi. */
4636 if (j > 0)
4637 def = vect_get_vec_def_for_stmt_copy (vect_unknown_def_type, def);
4639 add_phi_arg (as_a <gphi *> (phi), def, loop_latch_edge (loop),
4640 UNKNOWN_LOCATION);
4642 if (dump_enabled_p ())
4644 dump_printf_loc (MSG_NOTE, vect_location,
4645 "transform reduction: created def-use cycle: ");
4646 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, phi, 0);
4647 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, SSA_NAME_DEF_STMT (def), 0);
4652 /* For cond reductions we want to create a new vector (INDEX_COND_EXPR)
4653 which is updated with the current index of the loop for every match of
4654 the original loop's cond_expr (VEC_STMT). This results in a vector
4655 containing the last time the condition passed for that vector lane.
4656 The first match will be a 1 to allow 0 to be used for non-matching
4657 indexes. If there are no matches at all then the vector will be all
4658 zeroes. */
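/* Illustrative example (hypothetical lanes, VF == 4): the index IV below
starts at { 1, 2, 3, 4 } and is bumped by 4 every vector iteration. If
lane 0 last matched in the first vector iteration, lane 1 last matched in
the second, and lanes 2 and 3 never matched, the vector left after the
loop is { 1, 6, 0, 0 }: each lane holds the 1-based position of its last
match, with 0 meaning no match. */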
4659 if (STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info) == COND_REDUCTION)
4661 tree indx_before_incr, indx_after_incr;
4662 poly_uint64 nunits_out = TYPE_VECTOR_SUBPARTS (vectype);
4664 gimple *vec_stmt = STMT_VINFO_VEC_STMT (stmt_info);
4665 gcc_assert (gimple_assign_rhs_code (vec_stmt) == VEC_COND_EXPR);
4667 int scalar_precision
4668 = GET_MODE_PRECISION (SCALAR_TYPE_MODE (TREE_TYPE (vectype)));
4669 tree cr_index_scalar_type = make_unsigned_type (scalar_precision);
4670 tree cr_index_vector_type = build_vector_type
4671 (cr_index_scalar_type, TYPE_VECTOR_SUBPARTS (vectype));
4673 /* First we create a simple vector induction variable which starts
4674 with the values {1,2,3,...} (SERIES_VECT) and increments by the
4675 vector size (STEP). */
4677 /* Create a {1,2,3,...} vector. */
4678 tree series_vect = build_index_vector (cr_index_vector_type, 1, 1);
4680 /* Create a vector of the step value. */
4681 tree step = build_int_cst (cr_index_scalar_type, nunits_out);
4682 tree vec_step = build_vector_from_val (cr_index_vector_type, step);
4684 /* Create an induction variable. */
4685 gimple_stmt_iterator incr_gsi;
4686 bool insert_after;
4687 standard_iv_increment_position (loop, &incr_gsi, &insert_after);
4688 create_iv (series_vect, vec_step, NULL_TREE, loop, &incr_gsi,
4689 insert_after, &indx_before_incr, &indx_after_incr);
4691 /* Next create a new phi node vector (NEW_PHI_TREE) which starts
4692 filled with zeros (VEC_ZERO). */
4694 /* Create a vector of 0s. */
4695 tree zero = build_zero_cst (cr_index_scalar_type);
4696 tree vec_zero = build_vector_from_val (cr_index_vector_type, zero);
4698 /* Create a vector phi node. */
4699 tree new_phi_tree = make_ssa_name (cr_index_vector_type);
4700 new_phi = create_phi_node (new_phi_tree, loop->header);
4701 set_vinfo_for_stmt (new_phi,
4702 new_stmt_vec_info (new_phi, loop_vinfo));
4703 add_phi_arg (as_a <gphi *> (new_phi), vec_zero,
4704 loop_preheader_edge (loop), UNKNOWN_LOCATION);
4706 /* Now take the condition from the loop's original cond_expr
4707 (VEC_STMT) and produce a new cond_expr (INDEX_COND_EXPR) which for
4708 every match uses values from the induction variable
4709 (INDEX_BEFORE_INCR) otherwise uses values from the phi node
4710 (NEW_PHI_TREE).
4711 Finally, we update the phi (NEW_PHI_TREE) to take the value of
4712 the new cond_expr (INDEX_COND_EXPR). */
4714 /* Duplicate the condition from vec_stmt. */
4715 tree ccompare = unshare_expr (gimple_assign_rhs1 (vec_stmt));
4717 /* Create a conditional, where the condition is taken from vec_stmt
4718 (CCOMPARE), the 'then' value is the induction index (INDEX_BEFORE_INCR)
4719 and the 'else' value is the phi (NEW_PHI_TREE). */
4720 tree index_cond_expr = build3 (VEC_COND_EXPR, cr_index_vector_type,
4721 ccompare, indx_before_incr,
4722 new_phi_tree);
4723 induction_index = make_ssa_name (cr_index_vector_type);
4724 gimple *index_condition = gimple_build_assign (induction_index,
4725 index_cond_expr);
4726 gsi_insert_before (&incr_gsi, index_condition, GSI_SAME_STMT);
4727 stmt_vec_info index_vec_info = new_stmt_vec_info (index_condition,
4728 loop_vinfo);
4729 STMT_VINFO_VECTYPE (index_vec_info) = cr_index_vector_type;
4730 set_vinfo_for_stmt (index_condition, index_vec_info);
4732 /* Update the phi with the vec cond. */
4733 add_phi_arg (as_a <gphi *> (new_phi), induction_index,
4734 loop_latch_edge (loop), UNKNOWN_LOCATION);
4737 /* 2. Create epilog code.
4738 The reduction epilog code operates across the elements of the vector
4739 of partial results computed by the vectorized loop.
4740 The reduction epilog code consists of:
4742 step 1: compute the scalar result in a vector (v_out2)
4743 step 2: extract the scalar result (s_out3) from the vector (v_out2)
4744 step 3: adjust the scalar result (s_out3) if needed.
4746 Step 1 can be accomplished using one of the following three schemes:
4747 (scheme 1) using reduc_fn, if available.
4748 (scheme 2) using whole-vector shifts, if available.
4749 (scheme 3) using a scalar loop. In this case steps 1+2 above are
4750 combined.
4752 The overall epilog code looks like this:
4754 s_out0 = phi <s_loop> # original EXIT_PHI
4755 v_out1 = phi <VECT_DEF> # NEW_EXIT_PHI
4756 v_out2 = reduce <v_out1> # step 1
4757 s_out3 = extract_field <v_out2, 0> # step 2
4758 s_out4 = adjust_result <s_out3> # step 3
4760 (step 3 is optional, and steps 1 and 2 may be combined).
4761 Lastly, the uses of s_out0 are replaced by s_out4. */
4764 /* 2.1 Create new loop-exit-phis to preserve loop-closed form:
4765 v_out1 = phi <VECT_DEF>
4766 Store them in NEW_PHIS. */
4768 exit_bb = single_exit (loop)->dest;
4769 prev_phi_info = NULL;
4770 new_phis.create (vect_defs.length ());
4771 FOR_EACH_VEC_ELT (vect_defs, i, def)
4773 for (j = 0; j < ncopies; j++)
4775 tree new_def = copy_ssa_name (def);
4776 phi = create_phi_node (new_def, exit_bb);
4777 set_vinfo_for_stmt (phi, new_stmt_vec_info (phi, loop_vinfo));
4778 if (j == 0)
4779 new_phis.quick_push (phi);
4780 else
4782 def = vect_get_vec_def_for_stmt_copy (dt, def);
4783 STMT_VINFO_RELATED_STMT (prev_phi_info) = phi;
4786 SET_PHI_ARG_DEF (phi, single_exit (loop)->dest_idx, def);
4787 prev_phi_info = vinfo_for_stmt (phi);
4791 /* The epilogue is created for the outer-loop, i.e., for the loop being
4792 vectorized. Create exit phis for the outer loop. */
4793 if (double_reduc)
4795 loop = outer_loop;
4796 exit_bb = single_exit (loop)->dest;
4797 inner_phis.create (vect_defs.length ());
4798 FOR_EACH_VEC_ELT (new_phis, i, phi)
4800 tree new_result = copy_ssa_name (PHI_RESULT (phi));
4801 gphi *outer_phi = create_phi_node (new_result, exit_bb);
4802 SET_PHI_ARG_DEF (outer_phi, single_exit (loop)->dest_idx,
4803 PHI_RESULT (phi));
4804 set_vinfo_for_stmt (outer_phi, new_stmt_vec_info (outer_phi,
4805 loop_vinfo));
4806 inner_phis.quick_push (phi);
4807 new_phis[i] = outer_phi;
4808 prev_phi_info = vinfo_for_stmt (outer_phi);
4809 while (STMT_VINFO_RELATED_STMT (vinfo_for_stmt (phi)))
4811 phi = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (phi));
4812 new_result = copy_ssa_name (PHI_RESULT (phi));
4813 outer_phi = create_phi_node (new_result, exit_bb);
4814 SET_PHI_ARG_DEF (outer_phi, single_exit (loop)->dest_idx,
4815 PHI_RESULT (phi));
4816 set_vinfo_for_stmt (outer_phi, new_stmt_vec_info (outer_phi,
4817 loop_vinfo));
4818 STMT_VINFO_RELATED_STMT (prev_phi_info) = outer_phi;
4819 prev_phi_info = vinfo_for_stmt (outer_phi);
4824 exit_gsi = gsi_after_labels (exit_bb);
4826 /* 2.2 Get the relevant tree-code to use in the epilog for schemes 2,3
4827 (i.e. when reduc_fn is not available) and in the final adjustment
4828 code (if needed). Also get the original scalar reduction variable as
4829 defined in the loop. In case STMT is a "pattern-stmt" (i.e. - it
4830 represents a reduction pattern), the tree-code and scalar-def are
4831 taken from the original stmt that the pattern-stmt (STMT) replaces.
4832 Otherwise (it is a regular reduction) - the tree-code and scalar-def
4833 are taken from STMT. */
4835 orig_stmt = STMT_VINFO_RELATED_STMT (stmt_info);
4836 if (!orig_stmt)
4838 /* Regular reduction */
4839 orig_stmt = stmt;
4841 else
4843 /* Reduction pattern */
4844 stmt_vec_info stmt_vinfo = vinfo_for_stmt (orig_stmt);
4845 gcc_assert (STMT_VINFO_IN_PATTERN_P (stmt_vinfo));
4846 gcc_assert (STMT_VINFO_RELATED_STMT (stmt_vinfo) == stmt);
4849 code = gimple_assign_rhs_code (orig_stmt);
4850 /* For MINUS_EXPR the initial vector is [init_val,0,...,0], therefore,
4851 partial results are added and not subtracted. */
4852 if (code == MINUS_EXPR)
4853 code = PLUS_EXPR;
4855 scalar_dest = gimple_assign_lhs (orig_stmt);
4856 scalar_type = TREE_TYPE (scalar_dest);
4857 scalar_results.create (group_size);
4858 new_scalar_dest = vect_create_destination_var (scalar_dest, NULL);
4859 bitsize = TYPE_SIZE (scalar_type);
4861 /* In case this is a reduction in an inner-loop while vectorizing an outer
4862 loop - we don't need to extract a single scalar result at the end of the
4863 inner-loop (unless it is a double reduction, i.e., the use of the reduction is
4864 outside the outer-loop). The final vector of partial results will be used
4865 in the vectorized outer-loop, or reduced to a scalar result at the end of
4866 the outer-loop. */
4867 if (nested_in_vect_loop && !double_reduc)
4868 goto vect_finalize_reduction;
4870 /* SLP reduction without reduction chain, e.g.,
4871 # a1 = phi <a2, a0>
4872 # b1 = phi <b2, b0>
4873 a2 = operation (a1)
4874 b2 = operation (b1) */
4875 slp_reduc = (slp_node && !REDUC_GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)));
4877 /* True if we should implement SLP_REDUC using native reduction operations
4878 instead of scalar operations. */
4879 direct_slp_reduc = (reduc_fn != IFN_LAST
4880 && slp_reduc
4881 && !TYPE_VECTOR_SUBPARTS (vectype).is_constant ());
4883 /* In case of reduction chain, e.g.,
4884 # a1 = phi <a3, a0>
4885 a2 = operation (a1)
4886 a3 = operation (a2),
4888 we may end up with more than one vector result. Here we reduce them to
4889 one vector. */
4890 if (REDUC_GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)) || direct_slp_reduc)
4892 tree first_vect = PHI_RESULT (new_phis[0]);
4893 gassign *new_vec_stmt = NULL;
4894 vec_dest = vect_create_destination_var (scalar_dest, vectype);
4895 for (k = 1; k < new_phis.length (); k++)
4897 gimple *next_phi = new_phis[k];
4898 tree second_vect = PHI_RESULT (next_phi);
4899 tree tem = make_ssa_name (vec_dest, new_vec_stmt);
4900 new_vec_stmt = gimple_build_assign (tem, code,
4901 first_vect, second_vect);
4902 gsi_insert_before (&exit_gsi, new_vec_stmt, GSI_SAME_STMT);
4903 first_vect = tem;
4906 new_phi_result = first_vect;
4907 if (new_vec_stmt)
4909 new_phis.truncate (0);
4910 new_phis.safe_push (new_vec_stmt);
4913 /* Likewise if we couldn't use a single defuse cycle. */
4914 else if (ncopies > 1)
4916 gcc_assert (new_phis.length () == 1);
4917 tree first_vect = PHI_RESULT (new_phis[0]);
4918 gassign *new_vec_stmt = NULL;
4919 vec_dest = vect_create_destination_var (scalar_dest, vectype);
4920 gimple *next_phi = new_phis[0];
4921 for (int k = 1; k < ncopies; ++k)
4923 next_phi = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (next_phi));
4924 tree second_vect = PHI_RESULT (next_phi);
4925 tree tem = make_ssa_name (vec_dest, new_vec_stmt);
4926 new_vec_stmt = gimple_build_assign (tem, code,
4927 first_vect, second_vect);
4928 gsi_insert_before (&exit_gsi, new_vec_stmt, GSI_SAME_STMT);
4929 first_vect = tem;
4931 new_phi_result = first_vect;
4932 new_phis.truncate (0);
4933 new_phis.safe_push (new_vec_stmt);
4935 else
4936 new_phi_result = PHI_RESULT (new_phis[0]);
4938 if (STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info) == COND_REDUCTION
4939 && reduc_fn != IFN_LAST)
4941 /* For condition reductions, we have a vector (NEW_PHI_RESULT) containing
4942 various data values where the condition matched and another vector
4943 (INDUCTION_INDEX) containing all the indexes of those matches. We
4944 need to extract the last matching index (which will be the index with
4945 highest value) and use this to index into the data vector.
4946 For the case where there were no matches, the data vector will contain
4947 all default values and the index vector will be all zeros. */
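/* Continuing the illustrative example above: with NEW_PHI_RESULT
= { a, b, d, d } (d being the default value) and INDUCTION_INDEX
= { 1, 6, 0, 0 }, the REDUC_MAX below yields 6, the comparison selects
lane 1, and the extracted scalar result is b. */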
4949 /* Get various versions of the type of the vector of indexes. */
4950 tree index_vec_type = TREE_TYPE (induction_index);
4951 gcc_checking_assert (TYPE_UNSIGNED (index_vec_type));
4952 tree index_scalar_type = TREE_TYPE (index_vec_type);
4953 tree index_vec_cmp_type = build_same_sized_truth_vector_type
4954 (index_vec_type);
4956 /* Get an unsigned integer version of the type of the data vector. */
4957 int scalar_precision
4958 = GET_MODE_PRECISION (SCALAR_TYPE_MODE (scalar_type));
4959 tree scalar_type_unsigned = make_unsigned_type (scalar_precision);
4960 tree vectype_unsigned = build_vector_type
4961 (scalar_type_unsigned, TYPE_VECTOR_SUBPARTS (vectype));
4963 /* First we need to create a vector (ZERO_VEC) of zeros and another
4964 vector (MAX_INDEX_VEC) filled with the last matching index, which we
4965 can create using a MAX reduction and then expanding.
4966 In the case where the loop never made any matches, the max index will
4967 be zero. */
4969 /* Vector of {0, 0, 0,...}. */
4970 tree zero_vec = make_ssa_name (vectype);
4971 tree zero_vec_rhs = build_zero_cst (vectype);
4972 gimple *zero_vec_stmt = gimple_build_assign (zero_vec, zero_vec_rhs);
4973 gsi_insert_before (&exit_gsi, zero_vec_stmt, GSI_SAME_STMT);
4975 /* Find maximum value from the vector of found indexes. */
4976 tree max_index = make_ssa_name (index_scalar_type);
4977 gcall *max_index_stmt = gimple_build_call_internal (IFN_REDUC_MAX,
4978 1, induction_index);
4979 gimple_call_set_lhs (max_index_stmt, max_index);
4980 gsi_insert_before (&exit_gsi, max_index_stmt, GSI_SAME_STMT);
4982 /* Vector of {max_index, max_index, max_index,...}. */
4983 tree max_index_vec = make_ssa_name (index_vec_type);
4984 tree max_index_vec_rhs = build_vector_from_val (index_vec_type,
4985 max_index);
4986 gimple *max_index_vec_stmt = gimple_build_assign (max_index_vec,
4987 max_index_vec_rhs);
4988 gsi_insert_before (&exit_gsi, max_index_vec_stmt, GSI_SAME_STMT);
4990 /* Next we compare the new vector (MAX_INDEX_VEC) full of max indexes
4991 with the vector (INDUCTION_INDEX) of found indexes, choosing values
4992 from the data vector (NEW_PHI_RESULT) for matches, 0 (ZERO_VEC)
4993 otherwise. Only one value should match, resulting in a vector
4994 (VEC_COND) with one data value and the rest zeros.
4995 In the case where the loop never made any matches, every index will
4996 match, resulting in a vector with all data values (which will all be
4997 the default value). */
4999 /* Compare the max index vector to the vector of found indexes to find
5000 the position of the max value. */
5001 tree vec_compare = make_ssa_name (index_vec_cmp_type);
5002 gimple *vec_compare_stmt = gimple_build_assign (vec_compare, EQ_EXPR,
5003 induction_index,
5004 max_index_vec);
5005 gsi_insert_before (&exit_gsi, vec_compare_stmt, GSI_SAME_STMT);
5007 /* Use the compare to choose either values from the data vector or
5008 zero. */
5009 tree vec_cond = make_ssa_name (vectype);
5010 gimple *vec_cond_stmt = gimple_build_assign (vec_cond, VEC_COND_EXPR,
5011 vec_compare, new_phi_result,
5012 zero_vec);
5013 gsi_insert_before (&exit_gsi, vec_cond_stmt, GSI_SAME_STMT);
5015 /* Finally we need to extract the data value from the vector (VEC_COND)
5016 into a scalar (MATCHED_DATA_REDUC). Logically we want to do an OR
5017 reduction, but because this doesn't exist, we can use a MAX reduction
5018 instead. The data value might be signed or a float so we need to cast
5019 it first.
5020 In the case where the loop never made any matches, the data values are
5021 all identical, and so will reduce down correctly. */
5023 /* Make the matched data values unsigned. */
5024 tree vec_cond_cast = make_ssa_name (vectype_unsigned);
5025 tree vec_cond_cast_rhs = build1 (VIEW_CONVERT_EXPR, vectype_unsigned,
5026 vec_cond);
5027 gimple *vec_cond_cast_stmt = gimple_build_assign (vec_cond_cast,
5028 VIEW_CONVERT_EXPR,
5029 vec_cond_cast_rhs);
5030 gsi_insert_before (&exit_gsi, vec_cond_cast_stmt, GSI_SAME_STMT);
5032 /* Reduce down to a scalar value. */
5033 tree data_reduc = make_ssa_name (scalar_type_unsigned);
5034 gcall *data_reduc_stmt = gimple_build_call_internal (IFN_REDUC_MAX,
5035 1, vec_cond_cast);
5036 gimple_call_set_lhs (data_reduc_stmt, data_reduc);
5037 gsi_insert_before (&exit_gsi, data_reduc_stmt, GSI_SAME_STMT);
5039 /* Convert the reduced value back to the result type and set as the
5040 result. */
5041 gimple_seq stmts = NULL;
5042 new_temp = gimple_build (&stmts, VIEW_CONVERT_EXPR, scalar_type,
5043 data_reduc);
5044 gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
5045 scalar_results.safe_push (new_temp);
5047 else if (STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info) == COND_REDUCTION
5048 && reduc_fn == IFN_LAST)
5050 /* Condition reduction without supported IFN_REDUC_MAX. Generate
5051 idx = 0;
5052 idx_val = induction_index[0];
5053 val = data_reduc[0];
5054 for (idx = 0, val = init, i = 0; i < nelts; ++i)
5055 if (induction_index[i] > idx_val)
5056 val = data_reduc[i], idx_val = induction_index[i];
5057 return val; */
5059 tree data_eltype = TREE_TYPE (TREE_TYPE (new_phi_result));
5060 tree idx_eltype = TREE_TYPE (TREE_TYPE (induction_index));
5061 unsigned HOST_WIDE_INT el_size = tree_to_uhwi (TYPE_SIZE (idx_eltype));
5062 poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (TREE_TYPE (induction_index));
5063 /* Enforced by vectorizable_reduction, which ensures we have target
5064 support before allowing a conditional reduction on variable-length
5065 vectors. */
5066 unsigned HOST_WIDE_INT v_size = el_size * nunits.to_constant ();
5067 tree idx_val = NULL_TREE, val = NULL_TREE;
5068 for (unsigned HOST_WIDE_INT off = 0; off < v_size; off += el_size)
5070 tree old_idx_val = idx_val;
5071 tree old_val = val;
5072 idx_val = make_ssa_name (idx_eltype);
5073 epilog_stmt = gimple_build_assign (idx_val, BIT_FIELD_REF,
5074 build3 (BIT_FIELD_REF, idx_eltype,
5075 induction_index,
5076 bitsize_int (el_size),
5077 bitsize_int (off)));
5078 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5079 val = make_ssa_name (data_eltype);
5080 epilog_stmt = gimple_build_assign (val, BIT_FIELD_REF,
5081 build3 (BIT_FIELD_REF,
5082 data_eltype,
5083 new_phi_result,
5084 bitsize_int (el_size),
5085 bitsize_int (off)));
5086 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5087 if (off != 0)
5089 tree new_idx_val = idx_val;
5090 tree new_val = val;
5091 if (off != v_size - el_size)
5093 new_idx_val = make_ssa_name (idx_eltype);
5094 epilog_stmt = gimple_build_assign (new_idx_val,
5095 MAX_EXPR, idx_val,
5096 old_idx_val);
5097 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5099 new_val = make_ssa_name (data_eltype);
5100 epilog_stmt = gimple_build_assign (new_val,
5101 COND_EXPR,
5102 build2 (GT_EXPR,
5103 boolean_type_node,
5104 idx_val,
5105 old_idx_val),
5106 val, old_val);
5107 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5108 idx_val = new_idx_val;
5109 val = new_val;
5112 /* Convert the reduced value back to the result type and set as the
5113 result. */
5114 gimple_seq stmts = NULL;
5115 val = gimple_convert (&stmts, scalar_type, val);
5116 gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
5117 scalar_results.safe_push (val);
5120 /* 2.3 Create the reduction code, using one of the three schemes described
5121 above. In SLP we simply need to extract all the elements from the
5122 vector (without reducing them), so we use scalar shifts. */
5123 else if (reduc_fn != IFN_LAST && !slp_reduc)
5125 tree tmp;
5126 tree vec_elem_type;
5128 /* Case 1: Create:
5129 v_out2 = reduc_expr <v_out1> */
5131 if (dump_enabled_p ())
5132 dump_printf_loc (MSG_NOTE, vect_location,
5133 "Reduce using direct vector reduction.\n");
5135 vec_elem_type = TREE_TYPE (TREE_TYPE (new_phi_result));
5136 if (!useless_type_conversion_p (scalar_type, vec_elem_type))
5138 tree tmp_dest
5139 = vect_create_destination_var (scalar_dest, vec_elem_type);
5140 epilog_stmt = gimple_build_call_internal (reduc_fn, 1,
5141 new_phi_result);
5142 gimple_set_lhs (epilog_stmt, tmp_dest);
5143 new_temp = make_ssa_name (tmp_dest, epilog_stmt);
5144 gimple_set_lhs (epilog_stmt, new_temp);
5145 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5147 epilog_stmt = gimple_build_assign (new_scalar_dest, NOP_EXPR,
5148 new_temp);
5150 else
5152 epilog_stmt = gimple_build_call_internal (reduc_fn, 1,
5153 new_phi_result);
5154 gimple_set_lhs (epilog_stmt, new_scalar_dest);
5157 new_temp = make_ssa_name (new_scalar_dest, epilog_stmt);
5158 gimple_set_lhs (epilog_stmt, new_temp);
5159 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5161 if ((STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info)
5162 == INTEGER_INDUC_COND_REDUCTION)
5163 && !operand_equal_p (initial_def, induc_val, 0))
5165 /* Earlier we set the initial value to be a vector of induc_val
5166 values. Check the result and if it is induc_val then replace
5167 with the original initial value, unless induc_val is
5168 the same as initial_def already. */
5169 tree zcompare = build2 (EQ_EXPR, boolean_type_node, new_temp,
5170 induc_val);
5172 tmp = make_ssa_name (new_scalar_dest);
5173 epilog_stmt = gimple_build_assign (tmp, COND_EXPR, zcompare,
5174 initial_def, new_temp);
5175 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5176 new_temp = tmp;
5179 scalar_results.safe_push (new_temp);
5181 else if (direct_slp_reduc)
5183 /* Here we create one vector for each of the REDUC_GROUP_SIZE results,
5184 with the elements for other SLP statements replaced with the
5185 neutral value. We can then do a normal reduction on each vector. */
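/* Illustrative example (hypothetical, an add reduction with
REDUC_GROUP_SIZE == 2 and NEW_PHI_RESULT = { a0, b0, a1, b1 }): masking
the index vector { 0, 1, 2, 3 } with group_size - 1 gives { 0, 1, 0, 1 };
for i == 0 the VEC_COND below selects { a0, 0, a1, 0 } and reduces to
a0 + a1, for i == 1 it selects { 0, b0, 0, b1 } and reduces to b0 + b1. */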
5187 /* Enforced by vectorizable_reduction. */
5188 gcc_assert (new_phis.length () == 1);
5189 gcc_assert (pow2p_hwi (group_size));
5191 slp_tree orig_phis_slp_node = slp_node_instance->reduc_phis;
5192 vec<gimple *> orig_phis = SLP_TREE_SCALAR_STMTS (orig_phis_slp_node);
5193 gimple_seq seq = NULL;
5195 /* Build a vector {0, 1, 2, ...}, with the same number of elements
5196 and the same element size as VECTYPE. */
5197 tree index = build_index_vector (vectype, 0, 1);
5198 tree index_type = TREE_TYPE (index);
5199 tree index_elt_type = TREE_TYPE (index_type);
5200 tree mask_type = build_same_sized_truth_vector_type (index_type);
5202 /* Create a vector that, for each element, identifies which of
5203 the REDUC_GROUP_SIZE results should use it. */
5204 tree index_mask = build_int_cst (index_elt_type, group_size - 1);
5205 index = gimple_build (&seq, BIT_AND_EXPR, index_type, index,
5206 build_vector_from_val (index_type, index_mask));
5208 /* Get a neutral vector value. This is simply a splat of the neutral
5209 scalar value if we have one, otherwise the initial scalar value
5210 is itself a neutral value. */
5211 tree vector_identity = NULL_TREE;
5212 if (neutral_op)
5213 vector_identity = gimple_build_vector_from_val (&seq, vectype,
5214 neutral_op);
5215 for (unsigned int i = 0; i < group_size; ++i)
5217 /* If there's no universal neutral value, we can use the
5218 initial scalar value from the original PHI. This is used
5219 for MIN and MAX reduction, for example. */
5220 if (!neutral_op)
5222 tree scalar_value
5223 = PHI_ARG_DEF_FROM_EDGE (orig_phis[i],
5224 loop_preheader_edge (loop));
5225 vector_identity = gimple_build_vector_from_val (&seq, vectype,
5226 scalar_value);
5229 /* Calculate the equivalent of:
5231 sel[j] = (index[j] == i);
5233 which selects the elements of NEW_PHI_RESULT that should
5234 be included in the result. */
5235 tree compare_val = build_int_cst (index_elt_type, i);
5236 compare_val = build_vector_from_val (index_type, compare_val);
5237 tree sel = gimple_build (&seq, EQ_EXPR, mask_type,
5238 index, compare_val);
5240 /* Calculate the equivalent of:
5242 vec = sel ? new_phi_result : vector_identity;
5244 VEC is now suitable for a full vector reduction. */
5245 tree vec = gimple_build (&seq, VEC_COND_EXPR, vectype,
5246 sel, new_phi_result, vector_identity);
5248 /* Do the reduction and convert it to the appropriate type. */
5249 tree scalar = gimple_build (&seq, as_combined_fn (reduc_fn),
5250 TREE_TYPE (vectype), vec);
5251 scalar = gimple_convert (&seq, scalar_type, scalar);
5252 scalar_results.safe_push (scalar);
5254 gsi_insert_seq_before (&exit_gsi, seq, GSI_SAME_STMT);
5256 else
5258 bool reduce_with_shift;
5259 tree vec_temp;
5261 /* COND reductions all do the final reduction with MAX_EXPR
5262 or MIN_EXPR. */
5263 if (code == COND_EXPR)
5265 if (STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info)
5266 == INTEGER_INDUC_COND_REDUCTION)
5267 code = induc_code;
5268 else
5269 code = MAX_EXPR;
5272 /* See if the target wants to do the final (shift) reduction
5273 in a vector mode of smaller size and first reduce upper/lower
5274 halves against each other. */
5275 enum machine_mode mode1 = mode;
5276 tree vectype1 = vectype;
5277 unsigned sz = tree_to_uhwi (TYPE_SIZE_UNIT (vectype));
5278 unsigned sz1 = sz;
5279 if (!slp_reduc
5280 && (mode1 = targetm.vectorize.split_reduction (mode)) != mode)
5281 sz1 = GET_MODE_SIZE (mode1).to_constant ();
5283 vectype1 = get_vectype_for_scalar_type_and_size (scalar_type, sz1);
5284 reduce_with_shift = have_whole_vector_shift (mode1);
5285 if (!VECTOR_MODE_P (mode1))
5286 reduce_with_shift = false;
5287 else
5289 optab optab = optab_for_tree_code (code, vectype1, optab_default);
5290 if (optab_handler (optab, mode1) == CODE_FOR_nothing)
5291 reduce_with_shift = false;
5294 /* First reduce the vector to the size on which we should do the
5295 shift reduction, by combining the upper and lower halves. */
5296 new_temp = new_phi_result;
5297 while (sz > sz1)
5299 gcc_assert (!slp_reduc);
5300 sz /= 2;
5301 vectype1 = get_vectype_for_scalar_type_and_size (scalar_type, sz);
5303 /* The target has to make sure we support lowpart/highpart
5304 extraction, either via direct vector extract or through
5305 integer mode punning. */
5306 tree dst1, dst2;
5307 if (convert_optab_handler (vec_extract_optab,
5308 TYPE_MODE (TREE_TYPE (new_temp)),
5309 TYPE_MODE (vectype1))
5310 != CODE_FOR_nothing)
5312 /* Extract sub-vectors directly once vec_extract becomes
5313 a conversion optab. */
5314 dst1 = make_ssa_name (vectype1);
5315 epilog_stmt
5316 = gimple_build_assign (dst1, BIT_FIELD_REF,
5317 build3 (BIT_FIELD_REF, vectype1,
5318 new_temp, TYPE_SIZE (vectype1),
5319 bitsize_int (0)));
5320 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5321 dst2 = make_ssa_name (vectype1);
5322 epilog_stmt
5323 = gimple_build_assign (dst2, BIT_FIELD_REF,
5324 build3 (BIT_FIELD_REF, vectype1,
5325 new_temp, TYPE_SIZE (vectype1),
5326 bitsize_int (sz * BITS_PER_UNIT)));
5327 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5329 else
5331 /* Extract via punning to appropriately sized integer mode
5332 vector. */
5333 tree eltype = build_nonstandard_integer_type (sz * BITS_PER_UNIT, 1);
5335 tree etype = build_vector_type (eltype, 2);
5336 gcc_assert (convert_optab_handler (vec_extract_optab,
5337 TYPE_MODE (etype),
5338 TYPE_MODE (eltype))
5339 != CODE_FOR_nothing);
5340 tree tem = make_ssa_name (etype);
5341 epilog_stmt = gimple_build_assign (tem, VIEW_CONVERT_EXPR,
5342 build1 (VIEW_CONVERT_EXPR,
5343 etype, new_temp));
5344 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5345 new_temp = tem;
5346 tem = make_ssa_name (eltype);
5347 epilog_stmt
5348 = gimple_build_assign (tem, BIT_FIELD_REF,
5349 build3 (BIT_FIELD_REF, eltype,
5350 new_temp, TYPE_SIZE (eltype),
5351 bitsize_int (0)));
5352 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5353 dst1 = make_ssa_name (vectype1);
5354 epilog_stmt = gimple_build_assign (dst1, VIEW_CONVERT_EXPR,
5355 build1 (VIEW_CONVERT_EXPR,
5356 vectype1, tem));
5357 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5358 tem = make_ssa_name (eltype);
5359 epilog_stmt
5360 = gimple_build_assign (tem, BIT_FIELD_REF,
5361 build3 (BIT_FIELD_REF, eltype,
5362 new_temp, TYPE_SIZE (eltype),
5363 bitsize_int (sz * BITS_PER_UNIT)));
5364 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5365 dst2 = make_ssa_name (vectype1);
5366 epilog_stmt = gimple_build_assign (dst2, VIEW_CONVERT_EXPR,
5367 build1 (VIEW_CONVERT_EXPR,
5368 vectype1, tem));
5369 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5372 new_temp = make_ssa_name (vectype1);
5373 epilog_stmt = gimple_build_assign (new_temp, code, dst1, dst2);
5374 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
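/* Editorial sketch (not part of GCC): assuming a 256-bit V8SI accumulator
   and a target whose split_reduction hook prefers 128-bit vectors, one
   trip through the loop above extracts the low and high V4SI halves
   (directly or via the integer-mode punning path) and combines them with
   CODE, so the shift-based or scalar epilogue below only has to reduce a
   single V4SI.  */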
5377 if (reduce_with_shift && !slp_reduc)
5379 int element_bitsize = tree_to_uhwi (bitsize);
5380 /* Enforced by vectorizable_reduction, which disallows SLP reductions
5381 for variable-length vectors and also requires direct target support
5382 for loop reductions. */
5383 int vec_size_in_bits = tree_to_uhwi (TYPE_SIZE (vectype1));
5384 int nelements = vec_size_in_bits / element_bitsize;
5385 vec_perm_builder sel;
5386 vec_perm_indices indices;
5388 int elt_offset;
5390 tree zero_vec = build_zero_cst (vectype1);
5391 /* Case 2: Create:
5392 for (offset = nelements/2; offset >= 1; offset/=2)
5394 Create: va' = vec_shift <va, offset>
5395 Create: va = vop <va, va'>
5396 } */
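/* Editorial sketch (not part of GCC): for a PLUS reduction of the
   4-element vector {a, b, c, d} (ignoring endianness details), the loop
   performs:

     offset 2:  va' = {c, d, 0, 0}      va = {a+c, b+d, ...}
     offset 1:  va' = {b+d, ..., 0}     va = {a+b+c+d, ...}

   after which lane 0 holds the full reduction and is extracted in
   step 2.4 below.  */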
5398 tree rhs;
5400 if (dump_enabled_p ())
5401 dump_printf_loc (MSG_NOTE, vect_location,
5402 "Reduce using vector shifts\n");
5404 mode1 = TYPE_MODE (vectype1);
5405 vec_dest = vect_create_destination_var (scalar_dest, vectype1);
5406 for (elt_offset = nelements / 2;
5407 elt_offset >= 1;
5408 elt_offset /= 2)
5410 calc_vec_perm_mask_for_shift (elt_offset, nelements, &sel);
5411 indices.new_vector (sel, 2, nelements);
5412 tree mask = vect_gen_perm_mask_any (vectype1, indices);
5413 epilog_stmt = gimple_build_assign (vec_dest, VEC_PERM_EXPR,
5414 new_temp, zero_vec, mask);
5415 new_name = make_ssa_name (vec_dest, epilog_stmt);
5416 gimple_assign_set_lhs (epilog_stmt, new_name);
5417 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5419 epilog_stmt = gimple_build_assign (vec_dest, code, new_name,
5420 new_temp);
5421 new_temp = make_ssa_name (vec_dest, epilog_stmt);
5422 gimple_assign_set_lhs (epilog_stmt, new_temp);
5423 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5426 /* 2.4 Extract the final scalar result. Create:
5427 s_out3 = extract_field <v_out2, bitpos> */
5429 if (dump_enabled_p ())
5430 dump_printf_loc (MSG_NOTE, vect_location,
5431 "extract scalar result\n");
5433 rhs = build3 (BIT_FIELD_REF, scalar_type, new_temp,
5434 bitsize, bitsize_zero_node);
5435 epilog_stmt = gimple_build_assign (new_scalar_dest, rhs);
5436 new_temp = make_ssa_name (new_scalar_dest, epilog_stmt);
5437 gimple_assign_set_lhs (epilog_stmt, new_temp);
5438 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5439 scalar_results.safe_push (new_temp);
5441 else
5443 /* Case 3: Create:
5444 s = extract_field <v_out2, 0>
5445 for (offset = element_size;
5446 offset < vector_size;
5447 offset += element_size;)
5449 Create: s' = extract_field <v_out2, offset>
5450 Create: s = op <s, s'> // For non SLP cases
5451 } */
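/* Editorial sketch (not part of GCC): for a PLUS reduction of the
   4-element vector {a, b, c, d} this open-coded path computes, per
   vector,

     s = a;  s = s + b;  s = s + c;  s = s + d;

   i.e. ((a + b) + c) + d, while in the SLP case each extracted element
   is kept as a separate scalar result instead of being combined.  */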
5453 if (dump_enabled_p ())
5454 dump_printf_loc (MSG_NOTE, vect_location,
5455 "Reduce using scalar code.\n");
5457 int vec_size_in_bits = tree_to_uhwi (TYPE_SIZE (vectype1));
5458 int element_bitsize = tree_to_uhwi (bitsize);
5459 FOR_EACH_VEC_ELT (new_phis, i, new_phi)
5461 int bit_offset;
5462 if (gimple_code (new_phi) == GIMPLE_PHI)
5463 vec_temp = PHI_RESULT (new_phi);
5464 else
5465 vec_temp = gimple_assign_lhs (new_phi);
5466 tree rhs = build3 (BIT_FIELD_REF, scalar_type, vec_temp, bitsize,
5467 bitsize_zero_node);
5468 epilog_stmt = gimple_build_assign (new_scalar_dest, rhs);
5469 new_temp = make_ssa_name (new_scalar_dest, epilog_stmt);
5470 gimple_assign_set_lhs (epilog_stmt, new_temp);
5471 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5473 /* In SLP we don't need to apply reduction operation, so we just
5474 collect s' values in SCALAR_RESULTS. */
5475 if (slp_reduc)
5476 scalar_results.safe_push (new_temp);
5478 for (bit_offset = element_bitsize;
5479 bit_offset < vec_size_in_bits;
5480 bit_offset += element_bitsize)
5482 tree bitpos = bitsize_int (bit_offset);
5483 tree rhs = build3 (BIT_FIELD_REF, scalar_type, vec_temp,
5484 bitsize, bitpos);
5486 epilog_stmt = gimple_build_assign (new_scalar_dest, rhs);
5487 new_name = make_ssa_name (new_scalar_dest, epilog_stmt);
5488 gimple_assign_set_lhs (epilog_stmt, new_name);
5489 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5491 if (slp_reduc)
5493 /* In SLP we don't need to apply reduction operation, so
5494 we just collect s' values in SCALAR_RESULTS. */
5495 new_temp = new_name;
5496 scalar_results.safe_push (new_name);
5498 else
5500 epilog_stmt = gimple_build_assign (new_scalar_dest, code,
5501 new_name, new_temp);
5502 new_temp = make_ssa_name (new_scalar_dest, epilog_stmt);
5503 gimple_assign_set_lhs (epilog_stmt, new_temp);
5504 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5509 /* The only case where we need to reduce scalar results in SLP is
5510 unrolling. If the size of SCALAR_RESULTS is greater than
5511 REDUC_GROUP_SIZE, we reduce them by combining elements modulo
5512 REDUC_GROUP_SIZE. */
5513 if (slp_reduc)
5515 tree res, first_res, new_res;
5516 gimple *new_stmt;
5518 /* Reduce multiple scalar results in case of SLP unrolling. */
5519 for (j = group_size; scalar_results.iterate (j, &res);
5520 j++)
5522 first_res = scalar_results[j % group_size];
5523 new_stmt = gimple_build_assign (new_scalar_dest, code,
5524 first_res, res);
5525 new_res = make_ssa_name (new_scalar_dest, new_stmt);
5526 gimple_assign_set_lhs (new_stmt, new_res);
5527 gsi_insert_before (&exit_gsi, new_stmt, GSI_SAME_STMT);
5528 scalar_results[j % group_size] = new_res;
5531 else
5532 /* Not SLP - we have one scalar to keep in SCALAR_RESULTS. */
5533 scalar_results.safe_push (new_temp);
5536 if ((STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info)
5537 == INTEGER_INDUC_COND_REDUCTION)
5538 && !operand_equal_p (initial_def, induc_val, 0))
5540 /* Earlier we set the initial value to be a vector of induc_val
5541 values. Check the result and if it is induc_val then replace
5542 it with the original initial value, unless induc_val is
5543 the same as initial_def already. */
5544 tree zcompare = build2 (EQ_EXPR, boolean_type_node, new_temp,
5545 induc_val);
5547 tree tmp = make_ssa_name (new_scalar_dest);
5548 epilog_stmt = gimple_build_assign (tmp, COND_EXPR, zcompare,
5549 initial_def, new_temp);
5550 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5551 scalar_results[0] = tmp;
5555 vect_finalize_reduction:
5557 if (double_reduc)
5558 loop = loop->inner;
5560 /* 2.5 Adjust the final result by the initial value of the reduction
5561 variable. (When such adjustment is not needed, then
5562 'adjustment_def' is zero). For example, if code is PLUS we create:
5563 new_temp = loop_exit_def + adjustment_def */
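/* Editorial sketch (not part of GCC): for a scalar loop of the form

     s = 10;
     for (i = 0; i < n; i++)
       s += a[i];

   the vector accumulator can be started at {0, 0, 0, 0} and
   ADJUSTMENT_DEF records the original initial value 10, which the code
   below adds back to the reduced scalar result (or to the vector result
   in the nested-loop case).  */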
5565 if (adjustment_def)
5567 gcc_assert (!slp_reduc);
5568 if (nested_in_vect_loop)
5570 new_phi = new_phis[0];
5571 gcc_assert (TREE_CODE (TREE_TYPE (adjustment_def)) == VECTOR_TYPE);
5572 expr = build2 (code, vectype, PHI_RESULT (new_phi), adjustment_def);
5573 new_dest = vect_create_destination_var (scalar_dest, vectype);
5575 else
5577 new_temp = scalar_results[0];
5578 gcc_assert (TREE_CODE (TREE_TYPE (adjustment_def)) != VECTOR_TYPE);
5579 expr = build2 (code, scalar_type, new_temp, adjustment_def);
5580 new_dest = vect_create_destination_var (scalar_dest, scalar_type);
5583 epilog_stmt = gimple_build_assign (new_dest, expr);
5584 new_temp = make_ssa_name (new_dest, epilog_stmt);
5585 gimple_assign_set_lhs (epilog_stmt, new_temp);
5586 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5587 if (nested_in_vect_loop)
5589 set_vinfo_for_stmt (epilog_stmt,
5590 new_stmt_vec_info (epilog_stmt, loop_vinfo));
5591 STMT_VINFO_RELATED_STMT (vinfo_for_stmt (epilog_stmt)) =
5592 STMT_VINFO_RELATED_STMT (vinfo_for_stmt (new_phi));
5594 if (!double_reduc)
5595 scalar_results.quick_push (new_temp);
5596 else
5597 scalar_results[0] = new_temp;
5599 else
5600 scalar_results[0] = new_temp;
5602 new_phis[0] = epilog_stmt;
5605 /* 2.6 Handle the loop-exit phis. Replace the uses of scalar loop-exit
5606 phis with new adjusted scalar results, i.e., replace use <s_out0>
5607 with use <s_out4>.
5609 Transform:
5610 loop_exit:
5611 s_out0 = phi <s_loop> # (scalar) EXIT_PHI
5612 v_out1 = phi <VECT_DEF> # NEW_EXIT_PHI
5613 v_out2 = reduce <v_out1>
5614 s_out3 = extract_field <v_out2, 0>
5615 s_out4 = adjust_result <s_out3>
5616 use <s_out0>
5617 use <s_out0>
5619 into:
5621 loop_exit:
5622 s_out0 = phi <s_loop> # (scalar) EXIT_PHI
5623 v_out1 = phi <VECT_DEF> # NEW_EXIT_PHI
5624 v_out2 = reduce <v_out1>
5625 s_out3 = extract_field <v_out2, 0>
5626 s_out4 = adjust_result <s_out3>
5627 use <s_out4>
5628 use <s_out4> */
5631 /* In SLP reduction chain we reduce vector results into one vector if
5632 necessary, hence we set here REDUC_GROUP_SIZE to 1. SCALAR_DEST is the
5633 LHS of the last stmt in the reduction chain, since we are looking for
5634 the loop exit phi node. */
5635 if (REDUC_GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)))
5637 gimple *dest_stmt = SLP_TREE_SCALAR_STMTS (slp_node)[group_size - 1];
5638 /* Handle reduction patterns. */
5639 if (STMT_VINFO_RELATED_STMT (vinfo_for_stmt (dest_stmt)))
5640 dest_stmt = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (dest_stmt));
5642 scalar_dest = gimple_assign_lhs (dest_stmt);
5643 group_size = 1;
5646 /* In SLP we may have several statements in NEW_PHIS and REDUCTION_PHIS (in
5647 case that REDUC_GROUP_SIZE is greater than vectorization factor).
5648 Therefore, we need to match SCALAR_RESULTS with corresponding statements.
5649 The first (REDUC_GROUP_SIZE / number of new vector stmts) scalar results
5650 correspond to the first vector stmt, etc.
5651 (RATIO is equal to (REDUC_GROUP_SIZE / number of new vector stmts)). */
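/* Editorial sketch (not part of GCC): with REDUC_GROUP_SIZE == 4 and two
   vector statements in NEW_PHIS, RATIO is 2, so SCALAR_RESULTS[0] and
   SCALAR_RESULTS[1] pair up with NEW_PHIS[0] while SCALAR_RESULTS[2] and
   SCALAR_RESULTS[3] pair up with NEW_PHIS[1].  */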
5652 if (group_size > new_phis.length ())
5654 ratio = group_size / new_phis.length ();
5655 gcc_assert (!(group_size % new_phis.length ()));
5657 else
5658 ratio = 1;
5660 for (k = 0; k < group_size; k++)
5662 if (k % ratio == 0)
5664 epilog_stmt = new_phis[k / ratio];
5665 reduction_phi = reduction_phis[k / ratio];
5666 if (double_reduc)
5667 inner_phi = inner_phis[k / ratio];
5670 if (slp_reduc)
5672 gimple *current_stmt = SLP_TREE_SCALAR_STMTS (slp_node)[k];
5674 orig_stmt = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (current_stmt));
5675 /* SLP statements can't participate in patterns. */
5676 gcc_assert (!orig_stmt);
5677 scalar_dest = gimple_assign_lhs (current_stmt);
5680 phis.create (3);
5681 /* Find the loop-closed-use at the loop exit of the original scalar
5682 result. (The reduction result is expected to have two immediate uses -
5683 one at the latch block, and one at the loop exit). */
5684 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, scalar_dest)
5685 if (!flow_bb_inside_loop_p (loop, gimple_bb (USE_STMT (use_p)))
5686 && !is_gimple_debug (USE_STMT (use_p)))
5687 phis.safe_push (USE_STMT (use_p));
5689 /* While we expect to have found an exit_phi because of loop-closed-ssa
5690 form we can end up without one if the scalar cycle is dead. */
5692 FOR_EACH_VEC_ELT (phis, i, exit_phi)
5694 if (outer_loop)
5696 stmt_vec_info exit_phi_vinfo = vinfo_for_stmt (exit_phi);
5697 gphi *vect_phi;
5699 /* FORNOW. Currently not supporting the case that an inner-loop
5700 reduction is not used in the outer-loop (but only outside the
5701 outer-loop), unless it is double reduction. */
5702 gcc_assert ((STMT_VINFO_RELEVANT_P (exit_phi_vinfo)
5703 && !STMT_VINFO_LIVE_P (exit_phi_vinfo))
5704 || double_reduc);
5706 if (double_reduc)
5707 STMT_VINFO_VEC_STMT (exit_phi_vinfo) = inner_phi;
5708 else
5709 STMT_VINFO_VEC_STMT (exit_phi_vinfo) = epilog_stmt;
5710 if (!double_reduc
5711 || STMT_VINFO_DEF_TYPE (exit_phi_vinfo)
5712 != vect_double_reduction_def)
5713 continue;
5715 /* Handle double reduction:
5717 stmt1: s1 = phi <s0, s2> - double reduction phi (outer loop)
5718 stmt2: s3 = phi <s1, s4> - (regular) reduc phi (inner loop)
5719 stmt3: s4 = use (s3) - (regular) reduc stmt (inner loop)
5720 stmt4: s2 = phi <s4> - double reduction stmt (outer loop)
5722 At that point the regular reduction (stmt2 and stmt3) is
5723 already vectorized, as well as the exit phi node, stmt4.
5724 Here we vectorize the phi node of double reduction, stmt1, and
5725 update all relevant statements. */
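/* Editorial sketch (not part of GCC): at the source level a double
   reduction typically looks like

     int s = 0;                          <-- s0
     for (int i = 0; i < n; i++)         <-- outer loop (stmt1, stmt4)
       for (int j = 0; j < m; j++)       <-- inner loop (stmt2, stmt3)
         s += a[i][j];

   where stmt1 is the outer-loop phi of s, stmt2 the inner-loop phi,
   stmt3 the addition and stmt4 the outer-loop exit phi.  */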
5727 /* Go through all the uses of s2 to find double reduction phi
5728 node, i.e., stmt1 above. */
5729 orig_name = PHI_RESULT (exit_phi);
5730 FOR_EACH_IMM_USE_STMT (use_stmt, imm_iter, orig_name)
5732 stmt_vec_info use_stmt_vinfo;
5733 stmt_vec_info new_phi_vinfo;
5734 tree vect_phi_init, preheader_arg, vect_phi_res;
5735 basic_block bb = gimple_bb (use_stmt);
5736 gimple *use;
5738 /* Check that USE_STMT is really double reduction phi
5739 node. */
5740 if (gimple_code (use_stmt) != GIMPLE_PHI
5741 || gimple_phi_num_args (use_stmt) != 2
5742 || bb->loop_father != outer_loop)
5743 continue;
5744 use_stmt_vinfo = vinfo_for_stmt (use_stmt);
5745 if (!use_stmt_vinfo
5746 || STMT_VINFO_DEF_TYPE (use_stmt_vinfo)
5747 != vect_double_reduction_def)
5748 continue;
5750 /* Create vector phi node for double reduction:
5751 vs1 = phi <vs0, vs2>
5752 vs1 was created previously in this function by a call to
5753 vect_get_vec_def_for_operand and is stored in
5754 vec_initial_def;
5755 vs2 is defined by INNER_PHI, the vectorized EXIT_PHI;
5756 vs0 is created here. */
5758 /* Create vector phi node. */
5759 vect_phi = create_phi_node (vec_initial_def, bb);
5760 new_phi_vinfo = new_stmt_vec_info (vect_phi,
5761 loop_vec_info_for_loop (outer_loop));
5762 set_vinfo_for_stmt (vect_phi, new_phi_vinfo);
5764 /* Create vs0 - initial def of the double reduction phi. */
5765 preheader_arg = PHI_ARG_DEF_FROM_EDGE (use_stmt,
5766 loop_preheader_edge (outer_loop));
5767 vect_phi_init = get_initial_def_for_reduction
5768 (stmt, preheader_arg, NULL);
5770 /* Update phi node arguments with vs0 and vs2. */
5771 add_phi_arg (vect_phi, vect_phi_init,
5772 loop_preheader_edge (outer_loop),
5773 UNKNOWN_LOCATION);
5774 add_phi_arg (vect_phi, PHI_RESULT (inner_phi),
5775 loop_latch_edge (outer_loop), UNKNOWN_LOCATION);
5776 if (dump_enabled_p ())
5778 dump_printf_loc (MSG_NOTE, vect_location,
5779 "created double reduction phi node: ");
5780 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, vect_phi, 0);
5783 vect_phi_res = PHI_RESULT (vect_phi);
5785 /* Replace the use, i.e., set the correct vs1 in the regular
5786 reduction phi node. FORNOW, NCOPIES is always 1, so the
5787 loop is redundant. */
5788 use = reduction_phi;
5789 for (j = 0; j < ncopies; j++)
5791 edge pr_edge = loop_preheader_edge (loop);
5792 SET_PHI_ARG_DEF (use, pr_edge->dest_idx, vect_phi_res);
5793 use = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (use));
5799 phis.release ();
5800 if (nested_in_vect_loop)
5802 if (double_reduc)
5803 loop = outer_loop;
5804 else
5805 continue;
5808 phis.create (3);
5809 /* Find the loop-closed-use at the loop exit of the original scalar
5810 result. (The reduction result is expected to have two immediate uses,
5811 one at the latch block, and one at the loop exit). For double
5812 reductions we are looking for exit phis of the outer loop. */
5813 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, scalar_dest)
5815 if (!flow_bb_inside_loop_p (loop, gimple_bb (USE_STMT (use_p))))
5817 if (!is_gimple_debug (USE_STMT (use_p)))
5818 phis.safe_push (USE_STMT (use_p));
5820 else
5822 if (double_reduc && gimple_code (USE_STMT (use_p)) == GIMPLE_PHI)
5824 tree phi_res = PHI_RESULT (USE_STMT (use_p));
5826 FOR_EACH_IMM_USE_FAST (phi_use_p, phi_imm_iter, phi_res)
5828 if (!flow_bb_inside_loop_p (loop,
5829 gimple_bb (USE_STMT (phi_use_p)))
5830 && !is_gimple_debug (USE_STMT (phi_use_p)))
5831 phis.safe_push (USE_STMT (phi_use_p));
5837 FOR_EACH_VEC_ELT (phis, i, exit_phi)
5839 /* Replace the uses: */
5840 orig_name = PHI_RESULT (exit_phi);
5841 scalar_result = scalar_results[k];
5842 FOR_EACH_IMM_USE_STMT (use_stmt, imm_iter, orig_name)
5843 FOR_EACH_IMM_USE_ON_STMT (use_p, imm_iter)
5844 SET_USE (use_p, scalar_result);
5847 phis.release ();
5851 /* Return a vector of type VECTYPE that is equal to the vector select
5852 operation "MASK ? VEC : IDENTITY". Insert the select statements
5853 before GSI. */
5855 static tree
5856 merge_with_identity (gimple_stmt_iterator *gsi, tree mask, tree vectype,
5857 tree vec, tree identity)
5859 tree cond = make_temp_ssa_name (vectype, NULL, "cond");
5860 gimple *new_stmt = gimple_build_assign (cond, VEC_COND_EXPR,
5861 mask, vec, identity);
5862 gsi_insert_before (gsi, new_stmt, GSI_SAME_STMT);
5863 return cond;
5866 /* Successively apply CODE to each element of VECTOR_RHS, in left-to-right
5867 order, starting with LHS. Insert the extraction statements before GSI and
5868 associate the new scalar SSA names with variable SCALAR_DEST.
5869 Return the SSA name for the result. */
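/* Editorial sketch (not part of GCC): with CODE == PLUS_EXPR, LHS == l
   and VECTOR_RHS == {v0, v1, v2, v3}, the expansion below produces
   (((l + v0) + v1) + v2) + v3, preserving the original left-to-right
   association.  */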
5871 static tree
5872 vect_expand_fold_left (gimple_stmt_iterator *gsi, tree scalar_dest,
5873 tree_code code, tree lhs, tree vector_rhs)
5875 tree vectype = TREE_TYPE (vector_rhs);
5876 tree scalar_type = TREE_TYPE (vectype);
5877 tree bitsize = TYPE_SIZE (scalar_type);
5878 unsigned HOST_WIDE_INT vec_size_in_bits = tree_to_uhwi (TYPE_SIZE (vectype));
5879 unsigned HOST_WIDE_INT element_bitsize = tree_to_uhwi (bitsize);
5881 for (unsigned HOST_WIDE_INT bit_offset = 0;
5882 bit_offset < vec_size_in_bits;
5883 bit_offset += element_bitsize)
5885 tree bitpos = bitsize_int (bit_offset);
5886 tree rhs = build3 (BIT_FIELD_REF, scalar_type, vector_rhs,
5887 bitsize, bitpos);
5889 gassign *stmt = gimple_build_assign (scalar_dest, rhs);
5890 rhs = make_ssa_name (scalar_dest, stmt);
5891 gimple_assign_set_lhs (stmt, rhs);
5892 gsi_insert_before (gsi, stmt, GSI_SAME_STMT);
5894 stmt = gimple_build_assign (scalar_dest, code, lhs, rhs);
5895 tree new_name = make_ssa_name (scalar_dest, stmt);
5896 gimple_assign_set_lhs (stmt, new_name);
5897 gsi_insert_before (gsi, stmt, GSI_SAME_STMT);
5898 lhs = new_name;
5900 return lhs;
5903 /* Perform an in-order reduction (FOLD_LEFT_REDUCTION). STMT is the
5904 statement that sets the live-out value. REDUC_DEF_STMT is the phi
5905 statement. CODE is the operation performed by STMT and OPS are
5906 its scalar operands. REDUC_INDEX is the index of the operand in
5907 OPS that is set by REDUC_DEF_STMT. REDUC_FN is the function that
5908 implements in-order reduction, or IFN_LAST if we should open-code it.
5909 VECTYPE_IN is the type of the vector input. MASKS specifies the masks
5910 that should be used to control the operation in a fully-masked loop. */
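/* Editorial sketch (not part of GCC): the canonical candidate for this
   path is a floating-point accumulation that may not be reassociated,
   e.g.

     double s = 0.0;
     for (int i = 0; i < n; i++)
       s += a[i];

   compiled without -ffast-math; each vector of loaded elements is folded
   into the scalar accumulator in original order, either with REDUC_FN or
   with the open-coded expansion above.  */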
5912 static bool
5913 vectorize_fold_left_reduction (gimple *stmt, gimple_stmt_iterator *gsi,
5914 gimple **vec_stmt, slp_tree slp_node,
5915 gimple *reduc_def_stmt,
5916 tree_code code, internal_fn reduc_fn,
5917 tree ops[3], tree vectype_in,
5918 int reduc_index, vec_loop_masks *masks)
5920 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
5921 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
5922 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
5923 tree vectype_out = STMT_VINFO_VECTYPE (stmt_info);
5924 gimple *new_stmt = NULL;
5926 int ncopies;
5927 if (slp_node)
5928 ncopies = 1;
5929 else
5930 ncopies = vect_get_num_copies (loop_vinfo, vectype_in);
5932 gcc_assert (!nested_in_vect_loop_p (loop, stmt));
5933 gcc_assert (ncopies == 1);
5934 gcc_assert (TREE_CODE_LENGTH (code) == binary_op);
5935 gcc_assert (reduc_index == (code == MINUS_EXPR ? 0 : 1));
5936 gcc_assert (STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info)
5937 == FOLD_LEFT_REDUCTION);
5939 if (slp_node)
5940 gcc_assert (known_eq (TYPE_VECTOR_SUBPARTS (vectype_out),
5941 TYPE_VECTOR_SUBPARTS (vectype_in)));
5943 tree op0 = ops[1 - reduc_index];
5945 int group_size = 1;
5946 gimple *scalar_dest_def;
5947 auto_vec<tree> vec_oprnds0;
5948 if (slp_node)
5950 vect_get_vec_defs (op0, NULL_TREE, stmt, &vec_oprnds0, NULL, slp_node);
5951 group_size = SLP_TREE_SCALAR_STMTS (slp_node).length ();
5952 scalar_dest_def = SLP_TREE_SCALAR_STMTS (slp_node)[group_size - 1];
5954 else
5956 tree loop_vec_def0 = vect_get_vec_def_for_operand (op0, stmt);
5957 vec_oprnds0.create (1);
5958 vec_oprnds0.quick_push (loop_vec_def0);
5959 scalar_dest_def = stmt;
5962 tree scalar_dest = gimple_assign_lhs (scalar_dest_def);
5963 tree scalar_type = TREE_TYPE (scalar_dest);
5964 tree reduc_var = gimple_phi_result (reduc_def_stmt);
5966 int vec_num = vec_oprnds0.length ();
5967 gcc_assert (vec_num == 1 || slp_node);
5968 tree vec_elem_type = TREE_TYPE (vectype_out);
5969 gcc_checking_assert (useless_type_conversion_p (scalar_type, vec_elem_type));
5971 tree vector_identity = NULL_TREE;
5972 if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
5973 vector_identity = build_zero_cst (vectype_out);
5975 tree scalar_dest_var = vect_create_destination_var (scalar_dest, NULL);
5976 int i;
5977 tree def0;
5978 FOR_EACH_VEC_ELT (vec_oprnds0, i, def0)
5980 tree mask = NULL_TREE;
5981 if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
5982 mask = vect_get_loop_mask (gsi, masks, vec_num, vectype_in, i);
5984 /* Handle MINUS by adding the negative. */
5985 if (reduc_fn != IFN_LAST && code == MINUS_EXPR)
5987 tree negated = make_ssa_name (vectype_out);
5988 new_stmt = gimple_build_assign (negated, NEGATE_EXPR, def0);
5989 gsi_insert_before (gsi, new_stmt, GSI_SAME_STMT);
5990 def0 = negated;
5993 if (mask)
5994 def0 = merge_with_identity (gsi, mask, vectype_out, def0,
5995 vector_identity);
5997 /* On the first iteration the input is simply the scalar phi
5998 result, and for subsequent iterations it is the output of
5999 the preceding operation. */
6000 if (reduc_fn != IFN_LAST)
6002 new_stmt = gimple_build_call_internal (reduc_fn, 2, reduc_var, def0);
6003 /* For chained SLP reductions the output of the previous reduction
6004 operation serves as the input of the next. For the final statement
6005 the output cannot be a temporary - we reuse the original
6006 scalar destination of the last statement. */
6007 if (i != vec_num - 1)
6009 gimple_set_lhs (new_stmt, scalar_dest_var);
6010 reduc_var = make_ssa_name (scalar_dest_var, new_stmt);
6011 gimple_set_lhs (new_stmt, reduc_var);
6014 else
6016 reduc_var = vect_expand_fold_left (gsi, scalar_dest_var, code,
6017 reduc_var, def0);
6018 new_stmt = SSA_NAME_DEF_STMT (reduc_var);
6019 /* Remove the statement, so that we can use the same code paths
6020 as for statements that we've just created. */
6021 gimple_stmt_iterator tmp_gsi = gsi_for_stmt (new_stmt);
6022 gsi_remove (&tmp_gsi, false);
6025 if (i == vec_num - 1)
6027 gimple_set_lhs (new_stmt, scalar_dest);
6028 vect_finish_replace_stmt (scalar_dest_def, new_stmt);
6030 else
6031 vect_finish_stmt_generation (scalar_dest_def, new_stmt, gsi);
6033 if (slp_node)
6034 SLP_TREE_VEC_STMTS (slp_node).quick_push (new_stmt);
6037 if (!slp_node)
6038 STMT_VINFO_VEC_STMT (stmt_info) = *vec_stmt = new_stmt;
6040 return true;
6043 /* Function is_nonwrapping_integer_induction.
6045 Check if STMT (which is part of loop LOOP) describes an induction that
6046 both increments and does not cause overflow. */
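/* Editorial sketch (not part of GCC): for an induction in a 16-bit
   unsigned (wrapping) type with base 0 and step 4, a loop bounded by at
   most 1000 iterations gives a maximum value of 4000, which fits in 16
   bits, so the function returns true; with a bound of 20000 iterations
   the maximum 80000 needs 17 bits and the function returns false.  */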
6048 static bool
6049 is_nonwrapping_integer_induction (gimple *stmt, struct loop *loop)
6051 stmt_vec_info stmt_vinfo = vinfo_for_stmt (stmt);
6052 tree base = STMT_VINFO_LOOP_PHI_EVOLUTION_BASE_UNCHANGED (stmt_vinfo);
6053 tree step = STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_vinfo);
6054 tree lhs_type = TREE_TYPE (gimple_phi_result (stmt));
6055 widest_int ni, max_loop_value, lhs_max;
6056 bool overflow = false;
6058 /* Make sure the loop is integer based. */
6059 if (TREE_CODE (base) != INTEGER_CST
6060 || TREE_CODE (step) != INTEGER_CST)
6061 return false;
6063 /* Check that the max size of the loop will not wrap. */
6065 if (TYPE_OVERFLOW_UNDEFINED (lhs_type))
6066 return true;
6068 if (! max_stmt_executions (loop, &ni))
6069 return false;
6071 max_loop_value = wi::mul (wi::to_widest (step), ni, TYPE_SIGN (lhs_type),
6072 &overflow);
6073 if (overflow)
6074 return false;
6076 max_loop_value = wi::add (wi::to_widest (base), max_loop_value,
6077 TYPE_SIGN (lhs_type), &overflow);
6078 if (overflow)
6079 return false;
6081 return (wi::min_precision (max_loop_value, TYPE_SIGN (lhs_type))
6082 <= TYPE_PRECISION (lhs_type));
6085 /* Function vectorizable_reduction.
6087 Check if STMT performs a reduction operation that can be vectorized.
6088 If VEC_STMT is also passed, vectorize the STMT: create a vectorized
6089 stmt to replace it, put it in VEC_STMT, and insert it at GSI.
6090 Return FALSE if not a vectorizable STMT, TRUE otherwise.
6092 This function also handles reduction idioms (patterns) that have been
6093 recognized in advance during vect_pattern_recog. In this case, STMT may be
6094 of this form:
6095 X = pattern_expr (arg0, arg1, ..., X)
6096 and its STMT_VINFO_RELATED_STMT points to the last stmt in the original
6097 sequence that had been detected and replaced by the pattern-stmt (STMT).
6099 This function also handles reduction of condition expressions, for example:
6100 for (int i = 0; i < N; i++)
6101 if (a[i] < value)
6102 last = a[i];
6103 This is handled by vectorizing the loop and creating an additional vector
6104 containing the loop indexes for which "a[i] < value" was true. In the
6105 function epilogue this is reduced to a single max value and then used to
6106 index into the vector of results.
6108 In some cases of reduction patterns, the type of the reduction variable X is
6109 different than the type of the other arguments of STMT.
6110 In such cases, the vectype that is used when transforming STMT into a vector
6111 stmt is different than the vectype that is used to determine the
6112 vectorization factor, because it consists of a different number of elements
6113 than the actual number of elements that are being operated upon in parallel.
6115 For example, consider an accumulation of shorts into an int accumulator.
6116 On some targets it's possible to vectorize this pattern operating on 8
6117 shorts at a time (hence, the vectype for purposes of determining the
6118 vectorization factor should be V8HI); on the other hand, the vectype that
6119 is used to create the vector form is actually V4SI (the type of the result).
6121 Upon entry to this function, STMT_VINFO_VECTYPE records the vectype that
6122 indicates what is the actual level of parallelism (V8HI in the example), so
6123 that the right vectorization factor would be derived. This vectype
6124 corresponds to the type of arguments to the reduction stmt, and should *NOT*
6125 be used to create the vectorized stmt. The right vectype for the vectorized
6126 stmt is obtained from the type of the result X:
6127 get_vectype_for_scalar_type (TREE_TYPE (X))
6129 This means that, contrary to "regular" reductions (or "regular" stmts in
6130 general), the following equation:
6131 STMT_VINFO_VECTYPE == get_vectype_for_scalar_type (TREE_TYPE (X))
6132 does *NOT* necessarily hold for reduction patterns. */
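/* Editorial sketch (not part of GCC): the shorts-into-int case above
   corresponds to source code such as

     short a[N];
     int sum = 0;
     for (int i = 0; i < N; i++)
       sum += a[i];

   where, assuming 128-bit vectors, STMT_VINFO_VECTYPE is V8HI (and
   determines the vectorization factor) while the vectorized statement
   itself produces V4SI values derived from TREE_TYPE (sum).  */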
6134 bool
6135 vectorizable_reduction (gimple *stmt, gimple_stmt_iterator *gsi,
6136 gimple **vec_stmt, slp_tree slp_node,
6137 slp_instance slp_node_instance,
6138 stmt_vector_for_cost *cost_vec)
6140 tree vec_dest;
6141 tree scalar_dest;
6142 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
6143 tree vectype_out = STMT_VINFO_VECTYPE (stmt_info);
6144 tree vectype_in = NULL_TREE;
6145 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
6146 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
6147 enum tree_code code, orig_code;
6148 internal_fn reduc_fn;
6149 machine_mode vec_mode;
6150 int op_type;
6151 optab optab;
6152 tree new_temp = NULL_TREE;
6153 gimple *def_stmt;
6154 enum vect_def_type dt, cond_reduc_dt = vect_unknown_def_type;
6155 gimple *cond_reduc_def_stmt = NULL;
6156 enum tree_code cond_reduc_op_code = ERROR_MARK;
6157 tree scalar_type;
6158 bool is_simple_use;
6159 gimple *orig_stmt;
6160 stmt_vec_info orig_stmt_info = NULL;
6161 int i;
6162 int ncopies;
6163 int epilog_copies;
6164 stmt_vec_info prev_stmt_info, prev_phi_info;
6165 bool single_defuse_cycle = false;
6166 gimple *new_stmt = NULL;
6167 int j;
6168 tree ops[3];
6169 enum vect_def_type dts[3];
6170 bool nested_cycle = false, found_nested_cycle_def = false;
6171 bool double_reduc = false;
6172 basic_block def_bb;
6173 struct loop * def_stmt_loop, *outer_loop = NULL;
6174 tree def_arg;
6175 gimple *def_arg_stmt;
6176 auto_vec<tree> vec_oprnds0;
6177 auto_vec<tree> vec_oprnds1;
6178 auto_vec<tree> vec_oprnds2;
6179 auto_vec<tree> vect_defs;
6180 auto_vec<gimple *> phis;
6181 int vec_num;
6182 tree def0, tem;
6183 bool first_p = true;
6184 tree cr_index_scalar_type = NULL_TREE, cr_index_vector_type = NULL_TREE;
6185 tree cond_reduc_val = NULL_TREE;
6187 /* Make sure it was already recognized as a reduction computation. */
6188 if (STMT_VINFO_DEF_TYPE (vinfo_for_stmt (stmt)) != vect_reduction_def
6189 && STMT_VINFO_DEF_TYPE (vinfo_for_stmt (stmt)) != vect_nested_cycle)
6190 return false;
6192 if (nested_in_vect_loop_p (loop, stmt))
6194 outer_loop = loop;
6195 loop = loop->inner;
6196 nested_cycle = true;
6199 /* In case of reduction chain we switch to the first stmt in the chain, but
6200 we don't update STMT_INFO, since only the last stmt is marked as reduction
6201 and has reduction properties. */
6202 if (REDUC_GROUP_FIRST_ELEMENT (stmt_info)
6203 && REDUC_GROUP_FIRST_ELEMENT (stmt_info) != stmt)
6205 stmt = REDUC_GROUP_FIRST_ELEMENT (stmt_info);
6206 first_p = false;
6209 if (gimple_code (stmt) == GIMPLE_PHI)
6211 /* Analysis is fully done on the reduction stmt invocation. */
6212 if (! vec_stmt)
6214 if (slp_node)
6215 slp_node_instance->reduc_phis = slp_node;
6217 STMT_VINFO_TYPE (stmt_info) = reduc_vec_info_type;
6218 return true;
6221 if (STMT_VINFO_REDUC_TYPE (stmt_info) == FOLD_LEFT_REDUCTION)
6222 /* Leave the scalar phi in place. Note that checking
6223 STMT_VINFO_VEC_REDUCTION_TYPE (as below) only works
6224 for reductions involving a single statement. */
6225 return true;
6227 gimple *reduc_stmt = STMT_VINFO_REDUC_DEF (stmt_info);
6228 if (STMT_VINFO_IN_PATTERN_P (vinfo_for_stmt (reduc_stmt)))
6229 reduc_stmt = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (reduc_stmt));
6231 if (STMT_VINFO_VEC_REDUCTION_TYPE (vinfo_for_stmt (reduc_stmt))
6232 == EXTRACT_LAST_REDUCTION)
6233 /* Leave the scalar phi in place. */
6234 return true;
6236 gcc_assert (is_gimple_assign (reduc_stmt));
6237 for (unsigned k = 1; k < gimple_num_ops (reduc_stmt); ++k)
6239 tree op = gimple_op (reduc_stmt, k);
6240 if (op == gimple_phi_result (stmt))
6241 continue;
6242 if (k == 1
6243 && gimple_assign_rhs_code (reduc_stmt) == COND_EXPR)
6244 continue;
6245 if (!vectype_in
6246 || (GET_MODE_SIZE (SCALAR_TYPE_MODE (TREE_TYPE (vectype_in)))
6247 < GET_MODE_SIZE (SCALAR_TYPE_MODE (TREE_TYPE (op)))))
6248 vectype_in = get_vectype_for_scalar_type (TREE_TYPE (op));
6249 break;
6251 gcc_assert (vectype_in);
6253 if (slp_node)
6254 ncopies = 1;
6255 else
6256 ncopies = vect_get_num_copies (loop_vinfo, vectype_in);
6258 use_operand_p use_p;
6259 gimple *use_stmt;
6260 if (ncopies > 1
6261 && (STMT_VINFO_RELEVANT (vinfo_for_stmt (reduc_stmt))
6262 <= vect_used_only_live)
6263 && single_imm_use (gimple_phi_result (stmt), &use_p, &use_stmt)
6264 && (use_stmt == reduc_stmt
6265 || (STMT_VINFO_RELATED_STMT (vinfo_for_stmt (use_stmt))
6266 == reduc_stmt)))
6267 single_defuse_cycle = true;
6269 /* Create the destination vector */
6270 scalar_dest = gimple_assign_lhs (reduc_stmt);
6271 vec_dest = vect_create_destination_var (scalar_dest, vectype_out);
6273 if (slp_node)
6274 /* The size vect_schedule_slp_instance computes is off for us. */
6275 vec_num = vect_get_num_vectors
6276 (LOOP_VINFO_VECT_FACTOR (loop_vinfo)
6277 * SLP_TREE_SCALAR_STMTS (slp_node).length (),
6278 vectype_in);
6279 else
6280 vec_num = 1;
6282 /* Generate the reduction PHIs upfront. */
6283 prev_phi_info = NULL;
6284 for (j = 0; j < ncopies; j++)
6286 if (j == 0 || !single_defuse_cycle)
6288 for (i = 0; i < vec_num; i++)
6290 /* Create the reduction-phi that defines the reduction
6291 operand. */
6292 gimple *new_phi = create_phi_node (vec_dest, loop->header);
6293 set_vinfo_for_stmt (new_phi,
6294 new_stmt_vec_info (new_phi, loop_vinfo));
6296 if (slp_node)
6297 SLP_TREE_VEC_STMTS (slp_node).quick_push (new_phi);
6298 else
6300 if (j == 0)
6301 STMT_VINFO_VEC_STMT (stmt_info) = *vec_stmt = new_phi;
6302 else
6303 STMT_VINFO_RELATED_STMT (prev_phi_info) = new_phi;
6304 prev_phi_info = vinfo_for_stmt (new_phi);
6310 return true;
6313 /* 1. Is vectorizable reduction? */
6314 /* Not supportable if the reduction variable is used in the loop, unless
6315 it's a reduction chain. */
6316 if (STMT_VINFO_RELEVANT (stmt_info) > vect_used_in_outer
6317 && !REDUC_GROUP_FIRST_ELEMENT (stmt_info))
6318 return false;
6320 /* Reductions that are not used even in an enclosing outer-loop,
6321 are expected to be "live" (used out of the loop). */
6322 if (STMT_VINFO_RELEVANT (stmt_info) == vect_unused_in_scope
6323 && !STMT_VINFO_LIVE_P (stmt_info))
6324 return false;
6326 /* 2. Has this been recognized as a reduction pattern?
6328 Check if STMT represents a pattern that has been recognized
6329 in earlier analysis stages. For stmts that represent a pattern,
6330 the STMT_VINFO_RELATED_STMT field records the last stmt in
6331 the original sequence that constitutes the pattern. */
6333 orig_stmt = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (stmt));
6334 if (orig_stmt)
6336 orig_stmt_info = vinfo_for_stmt (orig_stmt);
6337 gcc_assert (STMT_VINFO_IN_PATTERN_P (orig_stmt_info));
6338 gcc_assert (!STMT_VINFO_IN_PATTERN_P (stmt_info));
6341 /* 3. Check the operands of the operation. The first operands are defined
6342 inside the loop body. The last operand is the reduction variable,
6343 which is defined by the loop-header-phi. */
6345 gcc_assert (is_gimple_assign (stmt));
6347 /* Flatten RHS. */
6348 switch (get_gimple_rhs_class (gimple_assign_rhs_code (stmt)))
6350 case GIMPLE_BINARY_RHS:
6351 code = gimple_assign_rhs_code (stmt);
6352 op_type = TREE_CODE_LENGTH (code);
6353 gcc_assert (op_type == binary_op);
6354 ops[0] = gimple_assign_rhs1 (stmt);
6355 ops[1] = gimple_assign_rhs2 (stmt);
6356 break;
6358 case GIMPLE_TERNARY_RHS:
6359 code = gimple_assign_rhs_code (stmt);
6360 op_type = TREE_CODE_LENGTH (code);
6361 gcc_assert (op_type == ternary_op);
6362 ops[0] = gimple_assign_rhs1 (stmt);
6363 ops[1] = gimple_assign_rhs2 (stmt);
6364 ops[2] = gimple_assign_rhs3 (stmt);
6365 break;
6367 case GIMPLE_UNARY_RHS:
6368 return false;
6370 default:
6371 gcc_unreachable ();
6374 if (code == COND_EXPR && slp_node)
6375 return false;
6377 scalar_dest = gimple_assign_lhs (stmt);
6378 scalar_type = TREE_TYPE (scalar_dest);
6379 if (!POINTER_TYPE_P (scalar_type) && !INTEGRAL_TYPE_P (scalar_type)
6380 && !SCALAR_FLOAT_TYPE_P (scalar_type))
6381 return false;
6383 /* Do not try to vectorize bit-precision reductions. */
6384 if (!type_has_mode_precision_p (scalar_type))
6385 return false;
6387 /* All uses but the last are expected to be defined in the loop.
6388 The last use is the reduction variable. In case of nested cycle this
6389 assumption is not true: we use reduc_index to record the index of the
6390 reduction variable. */
6391 gimple *reduc_def_stmt = NULL;
6392 int reduc_index = -1;
6393 for (i = 0; i < op_type; i++)
6395 /* The condition of COND_EXPR is checked in vectorizable_condition(). */
6396 if (i == 0 && code == COND_EXPR)
6397 continue;
6399 is_simple_use = vect_is_simple_use (ops[i], loop_vinfo,
6400 &def_stmt, &dts[i], &tem);
6401 dt = dts[i];
6402 gcc_assert (is_simple_use);
6403 if (dt == vect_reduction_def)
6405 reduc_def_stmt = def_stmt;
6406 reduc_index = i;
6407 continue;
6409 else if (tem)
6411 /* To properly compute ncopies we are interested in the widest
6412 input type in case we're looking at a widening accumulation. */
6413 if (!vectype_in
6414 || (GET_MODE_SIZE (SCALAR_TYPE_MODE (TREE_TYPE (vectype_in)))
6415 < GET_MODE_SIZE (SCALAR_TYPE_MODE (TREE_TYPE (tem)))))
6416 vectype_in = tem;
6419 if (dt != vect_internal_def
6420 && dt != vect_external_def
6421 && dt != vect_constant_def
6422 && dt != vect_induction_def
6423 && !(dt == vect_nested_cycle && nested_cycle))
6424 return false;
6426 if (dt == vect_nested_cycle)
6428 found_nested_cycle_def = true;
6429 reduc_def_stmt = def_stmt;
6430 reduc_index = i;
6433 if (i == 1 && code == COND_EXPR)
6435 /* Record how value of COND_EXPR is defined. */
6436 if (dt == vect_constant_def)
6438 cond_reduc_dt = dt;
6439 cond_reduc_val = ops[i];
6441 if (dt == vect_induction_def
6442 && def_stmt != NULL
6443 && is_nonwrapping_integer_induction (def_stmt, loop))
6445 cond_reduc_dt = dt;
6446 cond_reduc_def_stmt = def_stmt;
6451 if (!vectype_in)
6452 vectype_in = vectype_out;
6454 /* When vectorizing a reduction chain w/o SLP the reduction PHI is not
6455 directly used in stmt. */
6456 if (reduc_index == -1)
6458 if (STMT_VINFO_REDUC_TYPE (stmt_info) == FOLD_LEFT_REDUCTION)
6460 if (dump_enabled_p ())
6461 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6462 "in-order reduction chain without SLP.\n");
6463 return false;
6466 if (orig_stmt)
6467 reduc_def_stmt = STMT_VINFO_REDUC_DEF (orig_stmt_info);
6468 else
6469 reduc_def_stmt = STMT_VINFO_REDUC_DEF (stmt_info);
6472 if (! reduc_def_stmt || gimple_code (reduc_def_stmt) != GIMPLE_PHI)
6473 return false;
6475 if (!(reduc_index == -1
6476 || dts[reduc_index] == vect_reduction_def
6477 || dts[reduc_index] == vect_nested_cycle
6478 || ((dts[reduc_index] == vect_internal_def
6479 || dts[reduc_index] == vect_external_def
6480 || dts[reduc_index] == vect_constant_def
6481 || dts[reduc_index] == vect_induction_def)
6482 && nested_cycle && found_nested_cycle_def)))
6484 /* For pattern recognized stmts, orig_stmt might be a reduction,
6485 but some helper statements for the pattern might not, or
6486 might be COND_EXPRs with reduction uses in the condition. */
6487 gcc_assert (orig_stmt);
6488 return false;
6491 stmt_vec_info reduc_def_info = vinfo_for_stmt (reduc_def_stmt);
6492 enum vect_reduction_type v_reduc_type
6493 = STMT_VINFO_REDUC_TYPE (reduc_def_info);
6494 gimple *tmp = STMT_VINFO_REDUC_DEF (reduc_def_info);
6496 STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info) = v_reduc_type;
6497 /* If we have a condition reduction, see if we can simplify it further. */
6498 if (v_reduc_type == COND_REDUCTION)
6500 /* TODO: We can't yet handle reduction chains, since we need to treat
6501 each COND_EXPR in the chain specially, not just the last one.
6502 E.g. for:
6504 x_1 = PHI <x_3, ...>
6505 x_2 = a_2 ? ... : x_1;
6506 x_3 = a_3 ? ... : x_2;
6508 we're interested in the last element in x_3 for which a_2 || a_3
6509 is true, whereas the current reduction chain handling would
6510 vectorize x_2 as a normal VEC_COND_EXPR and only treat x_3
6511 as a reduction operation. */
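/* Editorial sketch (not part of GCC): such a chain arises from source
   like

     for (int i = 0; i < n; i++)
       {
         if (a[i])  last = b[i];
         if (c[i])  last = d[i];
       }

   where each iteration contains two conditional assignments feeding one
   another.  */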
6512 if (reduc_index == -1)
6514 if (dump_enabled_p ())
6515 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6516 "conditional reduction chains not supported\n");
6517 return false;
6520 /* vect_is_simple_reduction ensured that operand 2 is the
6521 loop-carried operand. */
6522 gcc_assert (reduc_index == 2);
6524 /* Loop peeling modifies initial value of reduction PHI, which
6525 makes the reduction stmt to be transformed different to the
6526 original stmt analyzed. We need to record reduction code for
6527 CONST_COND_REDUCTION type reduction at analyzing stage, thus
6528 it can be used directly at transform stage. */
6529 if (STMT_VINFO_VEC_CONST_COND_REDUC_CODE (stmt_info) == MAX_EXPR
6530 || STMT_VINFO_VEC_CONST_COND_REDUC_CODE (stmt_info) == MIN_EXPR)
6532 /* Also set the reduction type to CONST_COND_REDUCTION. */
6533 gcc_assert (cond_reduc_dt == vect_constant_def);
6534 STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info) = CONST_COND_REDUCTION;
6536 else if (direct_internal_fn_supported_p (IFN_FOLD_EXTRACT_LAST,
6537 vectype_in, OPTIMIZE_FOR_SPEED))
6539 if (dump_enabled_p ())
6540 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6541 "optimizing condition reduction with"
6542 " FOLD_EXTRACT_LAST.\n");
6543 STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info) = EXTRACT_LAST_REDUCTION;
6545 else if (cond_reduc_dt == vect_induction_def)
6547 stmt_vec_info cond_stmt_vinfo = vinfo_for_stmt (cond_reduc_def_stmt);
6548 tree base
6549 = STMT_VINFO_LOOP_PHI_EVOLUTION_BASE_UNCHANGED (cond_stmt_vinfo);
6550 tree step = STMT_VINFO_LOOP_PHI_EVOLUTION_PART (cond_stmt_vinfo);
6552 gcc_assert (TREE_CODE (base) == INTEGER_CST
6553 && TREE_CODE (step) == INTEGER_CST);
6554 cond_reduc_val = NULL_TREE;
6555 /* Find a suitable value, for MAX_EXPR below base, for MIN_EXPR
6556 above base; punt if base is the minimum value of the type for
6557 MAX_EXPR or maximum value of the type for MIN_EXPR for now. */
6558 if (tree_int_cst_sgn (step) == -1)
6560 cond_reduc_op_code = MIN_EXPR;
6561 if (tree_int_cst_sgn (base) == -1)
6562 cond_reduc_val = build_int_cst (TREE_TYPE (base), 0);
6563 else if (tree_int_cst_lt (base,
6564 TYPE_MAX_VALUE (TREE_TYPE (base))))
6565 cond_reduc_val
6566 = int_const_binop (PLUS_EXPR, base, integer_one_node);
6568 else
6570 cond_reduc_op_code = MAX_EXPR;
6571 if (tree_int_cst_sgn (base) == 1)
6572 cond_reduc_val = build_int_cst (TREE_TYPE (base), 0);
6573 else if (tree_int_cst_lt (TYPE_MIN_VALUE (TREE_TYPE (base)),
6574 base))
6575 cond_reduc_val
6576 = int_const_binop (MINUS_EXPR, base, integer_one_node);
6578 if (cond_reduc_val)
6580 if (dump_enabled_p ())
6581 dump_printf_loc (MSG_NOTE, vect_location,
6582 "condition expression based on "
6583 "integer induction.\n");
6584 STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info)
6585 = INTEGER_INDUC_COND_REDUCTION;
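/* Editorial sketch (not part of GCC): for a decreasing induction with
   base 10 and step -1 the code above picks MIN_EXPR and
   COND_REDUC_VAL == 11 (a value strictly above the base); for an
   increasing induction with base -5 it picks MAX_EXPR and
   COND_REDUC_VAL == -6.  */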
6588 else if (cond_reduc_dt == vect_constant_def)
6590 enum vect_def_type cond_initial_dt;
6591 gimple *def_stmt = SSA_NAME_DEF_STMT (ops[reduc_index]);
6592 tree cond_initial_val
6593 = PHI_ARG_DEF_FROM_EDGE (def_stmt, loop_preheader_edge (loop));
6595 gcc_assert (cond_reduc_val != NULL_TREE);
6596 vect_is_simple_use (cond_initial_val, loop_vinfo,
6597 &def_stmt, &cond_initial_dt);
6598 if (cond_initial_dt == vect_constant_def
6599 && types_compatible_p (TREE_TYPE (cond_initial_val),
6600 TREE_TYPE (cond_reduc_val)))
6602 tree e = fold_binary (LE_EXPR, boolean_type_node,
6603 cond_initial_val, cond_reduc_val);
6604 if (e && (integer_onep (e) || integer_zerop (e)))
6606 if (dump_enabled_p ())
6607 dump_printf_loc (MSG_NOTE, vect_location,
6608 "condition expression based on "
6609 "compile time constant.\n");
6610 /* Record reduction code at analysis stage. */
6611 STMT_VINFO_VEC_CONST_COND_REDUC_CODE (stmt_info)
6612 = integer_onep (e) ? MAX_EXPR : MIN_EXPR;
6613 STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info)
6614 = CONST_COND_REDUCTION;
6620 if (orig_stmt)
6621 gcc_assert (tmp == orig_stmt
6622 || (REDUC_GROUP_FIRST_ELEMENT (vinfo_for_stmt (tmp))
6623 == orig_stmt));
6624 else
6625 /* We changed STMT to be the first stmt in reduction chain, hence we
6626 check that in this case the first element in the chain is STMT. */
6627 gcc_assert (stmt == tmp
6628 || REDUC_GROUP_FIRST_ELEMENT (vinfo_for_stmt (tmp)) == stmt);
6630 if (STMT_VINFO_LIVE_P (vinfo_for_stmt (reduc_def_stmt)))
6631 return false;
6633 if (slp_node)
6634 ncopies = 1;
6635 else
6636 ncopies = vect_get_num_copies (loop_vinfo, vectype_in);
6638 gcc_assert (ncopies >= 1);
6640 vec_mode = TYPE_MODE (vectype_in);
6641 poly_uint64 nunits_out = TYPE_VECTOR_SUBPARTS (vectype_out);
6643 if (code == COND_EXPR)
6645 /* Only call during the analysis stage, otherwise we'll lose
6646 STMT_VINFO_TYPE. */
6647 if (!vec_stmt && !vectorizable_condition (stmt, gsi, NULL,
6648 ops[reduc_index], 0, NULL,
6649 cost_vec))
6651 if (dump_enabled_p ())
6652 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6653 "unsupported condition in reduction\n");
6654 return false;
6657 else
6659 /* 4. Supportable by target? */
6661 if (code == LSHIFT_EXPR || code == RSHIFT_EXPR
6662 || code == LROTATE_EXPR || code == RROTATE_EXPR)
6664 /* Shifts and rotates are only supported by vectorizable_shift,
6665 not vectorizable_reduction. */
6666 if (dump_enabled_p ())
6667 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6668 "unsupported shift or rotation.\n");
6669 return false;
6672 /* 4.1. check support for the operation in the loop */
6673 optab = optab_for_tree_code (code, vectype_in, optab_default);
6674 if (!optab)
6676 if (dump_enabled_p ())
6677 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6678 "no optab.\n");
6680 return false;
6683 if (optab_handler (optab, vec_mode) == CODE_FOR_nothing)
6685 if (dump_enabled_p ())
6686 dump_printf (MSG_NOTE, "op not supported by target.\n");
6688 if (maybe_ne (GET_MODE_SIZE (vec_mode), UNITS_PER_WORD)
6689 || !vect_worthwhile_without_simd_p (loop_vinfo, code))
6690 return false;
6692 if (dump_enabled_p ())
6693 dump_printf (MSG_NOTE, "proceeding using word mode.\n");
6696 /* Worthwhile without SIMD support? */
6697 if (!VECTOR_MODE_P (TYPE_MODE (vectype_in))
6698 && !vect_worthwhile_without_simd_p (loop_vinfo, code))
6700 if (dump_enabled_p ())
6701 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6702 "not worthwhile without SIMD support.\n");
6704 return false;
6708 /* 4.2. Check support for the epilog operation.
6710 If STMT represents a reduction pattern, then the type of the
6711 reduction variable may be different than the type of the rest
6712 of the arguments. For example, consider the case of accumulation
6713 of shorts into an int accumulator; The original code:
6714 S1: int_a = (int) short_a;
6715 orig_stmt-> S2: int_acc = plus <int_a ,int_acc>;
6717 was replaced with:
6718 STMT: int_acc = widen_sum <short_a, int_acc>
6720 This means that:
6721 1. The tree-code that is used to create the vector operation in the
6722 epilog code (that reduces the partial results) is not the
6723 tree-code of STMT, but is rather the tree-code of the original
6724 stmt from the pattern that STMT is replacing. I.e, in the example
6725 above we want to use 'widen_sum' in the loop, but 'plus' in the
6726 epilog.
6727 2. The type (mode) we use to check available target support
6728 for the vector operation to be created in the *epilog*, is
6729 determined by the type of the reduction variable (in the example
6730 above we'd check this: optab_handler (plus_optab, vect_int_mode])).
6731 However the type (mode) we use to check available target support
6732 for the vector operation to be created *inside the loop*, is
6733 determined by the type of the other arguments to STMT (in the
6734 example we'd check this: optab_handler (widen_sum_optab,
6735 vect_short_mode)).
6737 This is contrary to "regular" reductions, in which the types of all
6738 the arguments are the same as the type of the reduction variable.
6739 For "regular" reductions we can therefore use the same vector type
6740 (and also the same tree-code) when generating the epilog code and
6741 when generating the code inside the loop. */
6743 vect_reduction_type reduction_type
6744 = STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info);
6745 if (orig_stmt
6746 && (reduction_type == TREE_CODE_REDUCTION
6747 || reduction_type == FOLD_LEFT_REDUCTION))
6749 /* This is a reduction pattern: get the vectype from the type of the
6750 reduction variable, and get the tree-code from orig_stmt. */
6751 orig_code = gimple_assign_rhs_code (orig_stmt);
6752 gcc_assert (vectype_out);
6753 vec_mode = TYPE_MODE (vectype_out);
6755 else
6757 /* Regular reduction: the same vectype and tree-code as used for
6758 the vector code inside the loop can also be used for the epilog code. */
6759 orig_code = code;
6761 if (code == MINUS_EXPR)
6762 orig_code = PLUS_EXPR;
6764 /* For simple condition reductions, replace with the actual expression
6765 we want to base our reduction around. */
6766 if (reduction_type == CONST_COND_REDUCTION)
6768 orig_code = STMT_VINFO_VEC_CONST_COND_REDUC_CODE (stmt_info);
6769 gcc_assert (orig_code == MAX_EXPR || orig_code == MIN_EXPR);
6771 else if (reduction_type == INTEGER_INDUC_COND_REDUCTION)
6772 orig_code = cond_reduc_op_code;
6775 if (nested_cycle)
6777 def_bb = gimple_bb (reduc_def_stmt);
6778 def_stmt_loop = def_bb->loop_father;
6779 def_arg = PHI_ARG_DEF_FROM_EDGE (reduc_def_stmt,
6780 loop_preheader_edge (def_stmt_loop));
6781 if (TREE_CODE (def_arg) == SSA_NAME
6782 && (def_arg_stmt = SSA_NAME_DEF_STMT (def_arg))
6783 && gimple_code (def_arg_stmt) == GIMPLE_PHI
6784 && flow_bb_inside_loop_p (outer_loop, gimple_bb (def_arg_stmt))
6785 && vinfo_for_stmt (def_arg_stmt)
6786 && STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def_arg_stmt))
6787 == vect_double_reduction_def)
6788 double_reduc = true;
6791 reduc_fn = IFN_LAST;
6793 if (reduction_type == TREE_CODE_REDUCTION
6794 || reduction_type == FOLD_LEFT_REDUCTION
6795 || reduction_type == INTEGER_INDUC_COND_REDUCTION
6796 || reduction_type == CONST_COND_REDUCTION)
6798 if (reduction_type == FOLD_LEFT_REDUCTION
6799 ? fold_left_reduction_fn (orig_code, &reduc_fn)
6800 : reduction_fn_for_scalar_code (orig_code, &reduc_fn))
6802 if (reduc_fn != IFN_LAST
6803 && !direct_internal_fn_supported_p (reduc_fn, vectype_out,
6804 OPTIMIZE_FOR_SPEED))
6806 if (dump_enabled_p ())
6807 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6808 "reduc op not supported by target.\n");
6810 reduc_fn = IFN_LAST;
6813 else
6815 if (!nested_cycle || double_reduc)
6817 if (dump_enabled_p ())
6818 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6819 "no reduc code for scalar code.\n");
6821 return false;
6825 else if (reduction_type == COND_REDUCTION)
6827 int scalar_precision
6828 = GET_MODE_PRECISION (SCALAR_TYPE_MODE (scalar_type));
6829 cr_index_scalar_type = make_unsigned_type (scalar_precision);
6830 cr_index_vector_type = build_vector_type (cr_index_scalar_type,
6831 nunits_out);
6833 if (direct_internal_fn_supported_p (IFN_REDUC_MAX, cr_index_vector_type,
6834 OPTIMIZE_FOR_SPEED))
6835 reduc_fn = IFN_REDUC_MAX;
6838 if (reduction_type != EXTRACT_LAST_REDUCTION
6839 && reduc_fn == IFN_LAST
6840 && !nunits_out.is_constant ())
6842 if (dump_enabled_p ())
6843 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6844 "missing target support for reduction on"
6845 " variable-length vectors.\n");
6846 return false;
6849 if ((double_reduc || reduction_type != TREE_CODE_REDUCTION)
6850 && ncopies > 1)
6852 if (dump_enabled_p ())
6853 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6854 "multiple types in double reduction or condition "
6855 "reduction.\n");
6856 return false;
6859 /* For SLP reductions, see if there is a neutral value we can use. */
6860 tree neutral_op = NULL_TREE;
6861 if (slp_node)
6862 neutral_op = neutral_op_for_slp_reduction
6863 (slp_node_instance->reduc_phis, code,
6864 REDUC_GROUP_FIRST_ELEMENT (stmt_info) != NULL);
6866 if (double_reduc && reduction_type == FOLD_LEFT_REDUCTION)
6868 /* We can't support in-order reductions of code such as this:
6870 for (int i = 0; i < n1; ++i)
6871 for (int j = 0; j < n2; ++j)
6872 l += a[j];
6874 since GCC effectively transforms the loop when vectorizing:
6876 for (int i = 0; i < n1 / VF; ++i)
6877 for (int j = 0; j < n2; ++j)
6878 for (int k = 0; k < VF; ++k)
6879 l += a[j];
6881 which is a reassociation of the original operation. */
6882 if (dump_enabled_p ())
6883 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6884 "in-order double reduction not supported.\n");
6886 return false;
6889 if (reduction_type == FOLD_LEFT_REDUCTION
6890 && slp_node
6891 && !REDUC_GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)))
6893 /* We cannot use in-order reductions in this case because there is
6894 an implicit reassociation of the operations involved. */
6895 if (dump_enabled_p ())
6896 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6897 "in-order unchained SLP reductions not supported.\n");
6898 return false;
6901 /* For double reductions, and for SLP reductions with a neutral value,
6902 we construct a variable-length initial vector by loading a vector
6903 full of the neutral value and then shift-and-inserting the start
6904 values into the low-numbered elements. */
6905 if ((double_reduc || neutral_op)
6906 && !nunits_out.is_constant ()
6907 && !direct_internal_fn_supported_p (IFN_VEC_SHL_INSERT,
6908 vectype_out, OPTIMIZE_FOR_SPEED))
6910 if (dump_enabled_p ())
6911 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6912 "reduction on variable-length vectors requires"
6913 " target support for a vector-shift-and-insert"
6914 " operation.\n");
6915 return false;
6918 /* Check extra constraints for variable-length unchained SLP reductions. */
6919 if (STMT_SLP_TYPE (stmt_info)
6920 && !REDUC_GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt))
6921 && !nunits_out.is_constant ())
6923 /* We checked above that we could build the initial vector when
6924 there's a neutral element value. Check here for the case in
6925 which each SLP statement has its own initial value and in which
6926 that value needs to be repeated for every instance of the
6927 statement within the initial vector. */
6928 unsigned int group_size = SLP_TREE_SCALAR_STMTS (slp_node).length ();
6929 scalar_mode elt_mode = SCALAR_TYPE_MODE (TREE_TYPE (vectype_out));
6930 if (!neutral_op
6931 && !can_duplicate_and_interleave_p (group_size, elt_mode))
6933 if (dump_enabled_p ())
6934 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6935 "unsupported form of SLP reduction for"
6936 " variable-length vectors: cannot build"
6937 " initial vector.\n");
6938 return false;
6940 /* The epilogue code relies on the number of elements being a multiple
6941 of the group size. The duplicate-and-interleave approach to setting
6942 up the initial vector does too. */
6943 if (!multiple_p (nunits_out, group_size))
6945 if (dump_enabled_p ())
6946 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6947 "unsupported form of SLP reduction for"
6948 " variable-length vectors: the vector size"
6949 " is not a multiple of the number of results.\n");
6950 return false;
6954 /* In case of a widening multiplication by a constant, we update the type
6955 of the constant to be the type of the other operand. We check that the
6956 constant fits the type in the pattern recognition pass. */
6957 if (code == DOT_PROD_EXPR
6958 && !types_compatible_p (TREE_TYPE (ops[0]), TREE_TYPE (ops[1])))
6960 if (TREE_CODE (ops[0]) == INTEGER_CST)
6961 ops[0] = fold_convert (TREE_TYPE (ops[1]), ops[0]);
6962 else if (TREE_CODE (ops[1]) == INTEGER_CST)
6963 ops[1] = fold_convert (TREE_TYPE (ops[0]), ops[1]);
6964 else
6966 if (dump_enabled_p ())
6967 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6968 "invalid types in dot-prod\n");
6970 return false;
6974 if (reduction_type == COND_REDUCTION)
6976 widest_int ni;
6978 if (! max_loop_iterations (loop, &ni))
6980 if (dump_enabled_p ())
6981 dump_printf_loc (MSG_NOTE, vect_location,
6982 "loop count not known, cannot create cond "
6983 "reduction.\n");
6984 return false;
6986 /* Convert backedges to iterations. */
6987 ni += 1;
6989 /* The additional index will be the same type as the condition. Check
6990 that the loop count can fit into this type less one (because we'll use up the
6991 zero slot for when there are no matches). */
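/* For instance (illustrative only): if the index type were unsigned
   char, max_index would be 255 and the loop could run for at most 254
   iterations, since index 0 is reserved for "no match".  */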
6992 tree max_index = TYPE_MAX_VALUE (cr_index_scalar_type);
6993 if (wi::geu_p (ni, wi::to_widest (max_index)))
6995 if (dump_enabled_p ())
6996 dump_printf_loc (MSG_NOTE, vect_location,
6997 "loop size is greater than data size.\n");
6998 return false;
7002 /* In case the vectorization factor (VF) is bigger than the number
7003 of elements that we can fit in a vectype (nunits), we have to generate
7004 more than one vector stmt - i.e. we need to "unroll" the
7005 vector stmt by a factor VF/nunits. For more details see documentation
7006 in vectorizable_operation. */
7008 /* If the reduction is used in an outer loop we need to generate
7009 VF intermediate results, like so (e.g. for ncopies=2):
7010 r0 = phi (init, r0)
7011 r1 = phi (init, r1)
7012 r0 = x0 + r0;
7013 r1 = x1 + r1;
7014 (i.e. we generate VF results in 2 registers).
7015 In this case we have a separate def-use cycle for each copy, and therefore
7016 for each copy we get the vector def for the reduction variable from the
7017 respective phi node created for this copy.
7019 Otherwise (the reduction is unused in the loop nest), we can combine
7020 together intermediate results, like so (e.g. for ncopies=2):
7021 r = phi (init, r)
7022 r = x0 + r;
7023 r = x1 + r;
7024 (i.e. we generate VF/2 results in a single register).
7025 In this case for each copy we get the vector def for the reduction variable
7026 from the vectorized reduction operation generated in the previous iteration.
7028 This only works when we see both the reduction PHI and its only consumer
7029 in vectorizable_reduction and there are no intermediate stmts
7030 participating. */
7031 use_operand_p use_p;
7032 gimple *use_stmt;
7033 if (ncopies > 1
7034 && (STMT_VINFO_RELEVANT (stmt_info) <= vect_used_only_live)
7035 && single_imm_use (gimple_phi_result (reduc_def_stmt), &use_p, &use_stmt)
7036 && (use_stmt == stmt
7037 || STMT_VINFO_RELATED_STMT (vinfo_for_stmt (use_stmt)) == stmt))
7039 single_defuse_cycle = true;
7040 epilog_copies = 1;
7042 else
7043 epilog_copies = ncopies;
7045 /* If the reduction stmt is one of the patterns that have lane
7046 reduction embedded we cannot handle the case of ! single_defuse_cycle. */
7047 if ((ncopies > 1
7048 && ! single_defuse_cycle)
7049 && (code == DOT_PROD_EXPR
7050 || code == WIDEN_SUM_EXPR
7051 || code == SAD_EXPR))
7053 if (dump_enabled_p ())
7054 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7055 "multi def-use cycle not possible for lane-reducing "
7056 "reduction operation\n");
7057 return false;
7060 if (slp_node)
7061 vec_num = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node);
7062 else
7063 vec_num = 1;
7065 internal_fn cond_fn = get_conditional_internal_fn (code);
7066 vec_loop_masks *masks = &LOOP_VINFO_MASKS (loop_vinfo);
7068 if (!vec_stmt) /* transformation not required. */
7070 if (first_p)
7071 vect_model_reduction_cost (stmt_info, reduc_fn, ncopies, cost_vec);
7072 if (loop_vinfo && LOOP_VINFO_CAN_FULLY_MASK_P (loop_vinfo))
7074 if (reduction_type != FOLD_LEFT_REDUCTION
7075 && (cond_fn == IFN_LAST
7076 || !direct_internal_fn_supported_p (cond_fn, vectype_in,
7077 OPTIMIZE_FOR_SPEED)))
7079 if (dump_enabled_p ())
7080 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7081 "can't use a fully-masked loop because no"
7082 " conditional operation is available.\n");
7083 LOOP_VINFO_CAN_FULLY_MASK_P (loop_vinfo) = false;
7085 else if (reduc_index == -1)
7087 if (dump_enabled_p ())
7088 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7089 "can't use a fully-masked loop for chained"
7090 " reductions.\n");
7091 LOOP_VINFO_CAN_FULLY_MASK_P (loop_vinfo) = false;
7093 else
7094 vect_record_loop_mask (loop_vinfo, masks, ncopies * vec_num,
7095 vectype_in);
7097 if (dump_enabled_p ()
7098 && reduction_type == FOLD_LEFT_REDUCTION)
7099 dump_printf_loc (MSG_NOTE, vect_location,
7100 "using an in-order (fold-left) reduction.\n");
7101 STMT_VINFO_TYPE (stmt_info) = reduc_vec_info_type;
7102 return true;
7105 /* Transform. */
7107 if (dump_enabled_p ())
7108 dump_printf_loc (MSG_NOTE, vect_location, "transform reduction.\n");
7110 /* FORNOW: Multiple types are not supported for condition. */
7111 if (code == COND_EXPR)
7112 gcc_assert (ncopies == 1);
7114 bool masked_loop_p = LOOP_VINFO_FULLY_MASKED_P (loop_vinfo);
7116 if (reduction_type == FOLD_LEFT_REDUCTION)
7117 return vectorize_fold_left_reduction
7118 (stmt, gsi, vec_stmt, slp_node, reduc_def_stmt, code,
7119 reduc_fn, ops, vectype_in, reduc_index, masks);
7121 if (reduction_type == EXTRACT_LAST_REDUCTION)
7123 gcc_assert (!slp_node);
7124 return vectorizable_condition (stmt, gsi, vec_stmt,
7125 NULL, reduc_index, NULL, NULL);
7128 /* Create the destination vector */
7129 vec_dest = vect_create_destination_var (scalar_dest, vectype_out);
7131 prev_stmt_info = NULL;
7132 prev_phi_info = NULL;
7133 if (!slp_node)
7135 vec_oprnds0.create (1);
7136 vec_oprnds1.create (1);
7137 if (op_type == ternary_op)
7138 vec_oprnds2.create (1);
7141 phis.create (vec_num);
7142 vect_defs.create (vec_num);
7143 if (!slp_node)
7144 vect_defs.quick_push (NULL_TREE);
7146 if (slp_node)
7147 phis.splice (SLP_TREE_VEC_STMTS (slp_node_instance->reduc_phis));
7148 else
7149 phis.quick_push (STMT_VINFO_VEC_STMT (vinfo_for_stmt (reduc_def_stmt)));
7151 for (j = 0; j < ncopies; j++)
7153 if (code == COND_EXPR)
7155 gcc_assert (!slp_node);
7156 vectorizable_condition (stmt, gsi, vec_stmt,
7157 PHI_RESULT (phis[0]),
7158 reduc_index, NULL, NULL);
7159 /* Multiple types are not supported for condition. */
7160 break;
7163 /* Handle uses. */
7164 if (j == 0)
7166 if (slp_node)
7168 /* Get vec defs for all the operands except the reduction index,
7169 ensuring the ordering of the ops in the vector is kept. */
7170 auto_vec<tree, 3> slp_ops;
7171 auto_vec<vec<tree>, 3> vec_defs;
7173 slp_ops.quick_push (ops[0]);
7174 slp_ops.quick_push (ops[1]);
7175 if (op_type == ternary_op)
7176 slp_ops.quick_push (ops[2]);
7178 vect_get_slp_defs (slp_ops, slp_node, &vec_defs);
7180 vec_oprnds0.safe_splice (vec_defs[0]);
7181 vec_defs[0].release ();
7182 vec_oprnds1.safe_splice (vec_defs[1]);
7183 vec_defs[1].release ();
7184 if (op_type == ternary_op)
7186 vec_oprnds2.safe_splice (vec_defs[2]);
7187 vec_defs[2].release ();
7190 else
7192 vec_oprnds0.quick_push
7193 (vect_get_vec_def_for_operand (ops[0], stmt));
7194 vec_oprnds1.quick_push
7195 (vect_get_vec_def_for_operand (ops[1], stmt));
7196 if (op_type == ternary_op)
7197 vec_oprnds2.quick_push
7198 (vect_get_vec_def_for_operand (ops[2], stmt));
7201 else
7203 if (!slp_node)
7205 gcc_assert (reduc_index != -1 || ! single_defuse_cycle);
7207 if (single_defuse_cycle && reduc_index == 0)
7208 vec_oprnds0[0] = gimple_get_lhs (new_stmt);
7209 else
7210 vec_oprnds0[0]
7211 = vect_get_vec_def_for_stmt_copy (dts[0], vec_oprnds0[0]);
7212 if (single_defuse_cycle && reduc_index == 1)
7213 vec_oprnds1[0] = gimple_get_lhs (new_stmt);
7214 else
7215 vec_oprnds1[0]
7216 = vect_get_vec_def_for_stmt_copy (dts[1], vec_oprnds1[0]);
7217 if (op_type == ternary_op)
7219 if (single_defuse_cycle && reduc_index == 2)
7220 vec_oprnds2[0] = gimple_get_lhs (new_stmt);
7221 else
7222 vec_oprnds2[0]
7223 = vect_get_vec_def_for_stmt_copy (dts[2], vec_oprnds2[0]);
7228 FOR_EACH_VEC_ELT (vec_oprnds0, i, def0)
7230 tree vop[3] = { def0, vec_oprnds1[i], NULL_TREE };
7231 if (masked_loop_p)
7233 /* Make sure that the reduction accumulator is vop[0]. */
7234 if (reduc_index == 1)
7236 gcc_assert (commutative_tree_code (code));
7237 std::swap (vop[0], vop[1]);
7239 tree mask = vect_get_loop_mask (gsi, masks, vec_num * ncopies,
7240 vectype_in, i * ncopies + j);
7241 gcall *call = gimple_build_call_internal (cond_fn, 4, mask,
7242 vop[0], vop[1],
7243 vop[0]);
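/* Informally, the conditional internal function built above computes
   mask ? vop[0] CODE vop[1] : vop[0], so inactive lanes simply pass
   the accumulator through unchanged.  */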
7244 new_temp = make_ssa_name (vec_dest, call);
7245 gimple_call_set_lhs (call, new_temp);
7246 gimple_call_set_nothrow (call, true);
7247 new_stmt = call;
7249 else
7251 if (op_type == ternary_op)
7252 vop[2] = vec_oprnds2[i];
7254 new_temp = make_ssa_name (vec_dest, new_stmt);
7255 new_stmt = gimple_build_assign (new_temp, code,
7256 vop[0], vop[1], vop[2]);
7258 vect_finish_stmt_generation (stmt, new_stmt, gsi);
7260 if (slp_node)
7262 SLP_TREE_VEC_STMTS (slp_node).quick_push (new_stmt);
7263 vect_defs.quick_push (new_temp);
7265 else
7266 vect_defs[0] = new_temp;
7269 if (slp_node)
7270 continue;
7272 if (j == 0)
7273 STMT_VINFO_VEC_STMT (stmt_info) = *vec_stmt = new_stmt;
7274 else
7275 STMT_VINFO_RELATED_STMT (prev_stmt_info) = new_stmt;
7277 prev_stmt_info = vinfo_for_stmt (new_stmt);
7280 /* Finalize the reduction-phi (set its arguments) and create the
7281 epilog reduction code. */
7282 if ((!single_defuse_cycle || code == COND_EXPR) && !slp_node)
7283 vect_defs[0] = gimple_get_lhs (*vec_stmt);
7285 vect_create_epilog_for_reduction (vect_defs, stmt, reduc_def_stmt,
7286 epilog_copies, reduc_fn, phis,
7287 double_reduc, slp_node, slp_node_instance,
7288 cond_reduc_val, cond_reduc_op_code,
7289 neutral_op);
7291 return true;
7294 /* Function vect_min_worthwhile_factor.
7296 For a loop where we could vectorize the operation indicated by CODE,
7297 return the minimum vectorization factor that makes it worthwhile
7298 to use generic vectors. */
7299 static unsigned int
7300 vect_min_worthwhile_factor (enum tree_code code)
7302 switch (code)
7304 case PLUS_EXPR:
7305 case MINUS_EXPR:
7306 case NEGATE_EXPR:
7307 return 4;
7309 case BIT_AND_EXPR:
7310 case BIT_IOR_EXPR:
7311 case BIT_XOR_EXPR:
7312 case BIT_NOT_EXPR:
7313 return 2;
7315 default:
7316 return INT_MAX;
7320 /* Return true if VINFO indicates we are doing loop vectorization and if
7321 it is worth decomposing CODE operations into scalar operations for
7322 that loop's vectorization factor. */
7324 bool
7325 vect_worthwhile_without_simd_p (vec_info *vinfo, tree_code code)
7327 loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo);
7328 unsigned HOST_WIDE_INT value;
7329 return (loop_vinfo
7330 && LOOP_VINFO_VECT_FACTOR (loop_vinfo).is_constant (&value)
7331 && value >= vect_min_worthwhile_factor (code));
7334 /* Function vectorizable_induction
7336 Check if PHI performs an induction computation that can be vectorized.
7337 If VEC_STMT is also passed, vectorize the induction PHI: create a vectorized
7338 phi to replace it, put it in VEC_STMT, and add it to the same basic block.
7339 Return FALSE if not a vectorizable STMT, TRUE otherwise. */
7341 bool
7342 vectorizable_induction (gimple *phi,
7343 gimple_stmt_iterator *gsi ATTRIBUTE_UNUSED,
7344 gimple **vec_stmt, slp_tree slp_node,
7345 stmt_vector_for_cost *cost_vec)
7347 stmt_vec_info stmt_info = vinfo_for_stmt (phi);
7348 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
7349 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
7350 unsigned ncopies;
7351 bool nested_in_vect_loop = false;
7352 struct loop *iv_loop;
7353 tree vec_def;
7354 edge pe = loop_preheader_edge (loop);
7355 basic_block new_bb;
7356 tree new_vec, vec_init, vec_step, t;
7357 tree new_name;
7358 gimple *new_stmt;
7359 gphi *induction_phi;
7360 tree induc_def, vec_dest;
7361 tree init_expr, step_expr;
7362 poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
7363 unsigned i;
7364 tree expr;
7365 gimple_seq stmts;
7366 imm_use_iterator imm_iter;
7367 use_operand_p use_p;
7368 gimple *exit_phi;
7369 edge latch_e;
7370 tree loop_arg;
7371 gimple_stmt_iterator si;
7372 basic_block bb = gimple_bb (phi);
7374 if (gimple_code (phi) != GIMPLE_PHI)
7375 return false;
7377 if (!STMT_VINFO_RELEVANT_P (stmt_info))
7378 return false;
7380 /* Make sure it was recognized as induction computation. */
7381 if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_induction_def)
7382 return false;
7384 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
7385 poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (vectype);
7387 if (slp_node)
7388 ncopies = 1;
7389 else
7390 ncopies = vect_get_num_copies (loop_vinfo, vectype);
7391 gcc_assert (ncopies >= 1);
7393 /* FORNOW. These restrictions should be relaxed. */
7394 if (nested_in_vect_loop_p (loop, phi))
7396 imm_use_iterator imm_iter;
7397 use_operand_p use_p;
7398 gimple *exit_phi;
7399 edge latch_e;
7400 tree loop_arg;
7402 if (ncopies > 1)
7404 if (dump_enabled_p ())
7405 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7406 "multiple types in nested loop.\n");
7407 return false;
7410 /* FORNOW: outer loop induction with SLP not supported. */
7411 if (STMT_SLP_TYPE (stmt_info))
7412 return false;
7414 exit_phi = NULL;
7415 latch_e = loop_latch_edge (loop->inner);
7416 loop_arg = PHI_ARG_DEF_FROM_EDGE (phi, latch_e);
7417 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, loop_arg)
7419 gimple *use_stmt = USE_STMT (use_p);
7420 if (is_gimple_debug (use_stmt))
7421 continue;
7423 if (!flow_bb_inside_loop_p (loop->inner, gimple_bb (use_stmt)))
7425 exit_phi = use_stmt;
7426 break;
7429 if (exit_phi)
7431 stmt_vec_info exit_phi_vinfo = vinfo_for_stmt (exit_phi);
7432 if (!(STMT_VINFO_RELEVANT_P (exit_phi_vinfo)
7433 && !STMT_VINFO_LIVE_P (exit_phi_vinfo)))
7435 if (dump_enabled_p ())
7436 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7437 "inner-loop induction only used outside "
7438 "of the outer vectorized loop.\n");
7439 return false;
7443 nested_in_vect_loop = true;
7444 iv_loop = loop->inner;
7446 else
7447 iv_loop = loop;
7448 gcc_assert (iv_loop == (gimple_bb (phi))->loop_father);
7450 if (slp_node && !nunits.is_constant ())
7452 /* The current SLP code creates the initial value element-by-element. */
7453 if (dump_enabled_p ())
7454 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7455 "SLP induction not supported for variable-length"
7456 " vectors.\n");
7457 return false;
7460 if (!vec_stmt) /* transformation not required. */
7462 STMT_VINFO_TYPE (stmt_info) = induc_vec_info_type;
7463 if (dump_enabled_p ())
7464 dump_printf_loc (MSG_NOTE, vect_location,
7465 "=== vectorizable_induction ===\n");
7466 vect_model_induction_cost (stmt_info, ncopies, cost_vec);
7467 return true;
7470 /* Transform. */
7472 /* Compute a vector variable, initialized with the first VF values of
7473 the induction variable. E.g., for an iv with IV_PHI='X' and
7474 evolution S, for a vector of 4 units, we want to compute:
7475 [X, X + S, X + 2*S, X + 3*S]. */
7477 if (dump_enabled_p ())
7478 dump_printf_loc (MSG_NOTE, vect_location, "transform induction phi.\n");
7480 latch_e = loop_latch_edge (iv_loop);
7481 loop_arg = PHI_ARG_DEF_FROM_EDGE (phi, latch_e);
7483 step_expr = STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_info);
7484 gcc_assert (step_expr != NULL_TREE);
7486 pe = loop_preheader_edge (iv_loop);
7487 init_expr = PHI_ARG_DEF_FROM_EDGE (phi,
7488 loop_preheader_edge (iv_loop));
7490 stmts = NULL;
7491 if (!nested_in_vect_loop)
7493 /* Convert the initial value to the desired type. */
7494 tree new_type = TREE_TYPE (vectype);
7495 init_expr = gimple_convert (&stmts, new_type, init_expr);
7497 /* If we are using the loop mask to "peel" for alignment then we need
7498 to adjust the start value here. */
7499 tree skip_niters = LOOP_VINFO_MASK_SKIP_NITERS (loop_vinfo);
7500 if (skip_niters != NULL_TREE)
7502 if (FLOAT_TYPE_P (vectype))
7503 skip_niters = gimple_build (&stmts, FLOAT_EXPR, new_type,
7504 skip_niters);
7505 else
7506 skip_niters = gimple_convert (&stmts, new_type, skip_niters);
7507 tree skip_step = gimple_build (&stmts, MULT_EXPR, new_type,
7508 skip_niters, step_expr);
7509 init_expr = gimple_build (&stmts, MINUS_EXPR, new_type,
7510 init_expr, skip_step);
7514 /* Convert the step to the desired type. */
7515 step_expr = gimple_convert (&stmts, TREE_TYPE (vectype), step_expr);
7517 if (stmts)
7519 new_bb = gsi_insert_seq_on_edge_immediate (pe, stmts);
7520 gcc_assert (!new_bb);
7523 /* Find the first insertion point in the BB. */
7524 si = gsi_after_labels (bb);
7526 /* For SLP induction we have to generate several IVs; for example,
7527 with group size 3 we need [i, i, i, i + S] [i + S, i + S, i + 2*S, i + 2*S]
7528 [i + 2*S, i + 3*S, i + 3*S, i + 3*S]. The step is the same uniform
7529 [VF*S, VF*S, VF*S, VF*S] for all. */
7530 if (slp_node)
7532 /* Enforced above. */
7533 unsigned int const_nunits = nunits.to_constant ();
7535 /* Generate [VF*S, VF*S, ... ]. */
7536 if (SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr)))
7538 expr = build_int_cst (integer_type_node, vf);
7539 expr = fold_convert (TREE_TYPE (step_expr), expr);
7541 else
7542 expr = build_int_cst (TREE_TYPE (step_expr), vf);
7543 new_name = fold_build2 (MULT_EXPR, TREE_TYPE (step_expr),
7544 expr, step_expr);
7545 if (! CONSTANT_CLASS_P (new_name))
7546 new_name = vect_init_vector (phi, new_name,
7547 TREE_TYPE (step_expr), NULL);
7548 new_vec = build_vector_from_val (vectype, new_name);
7549 vec_step = vect_init_vector (phi, new_vec, vectype, NULL);
7551 /* Now generate the IVs. */
7552 unsigned group_size = SLP_TREE_SCALAR_STMTS (slp_node).length ();
7553 unsigned nvects = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node);
7554 unsigned elts = const_nunits * nvects;
7555 unsigned nivs = least_common_multiple (group_size,
7556 const_nunits) / const_nunits;
7557 gcc_assert (elts % group_size == 0);
7558 tree elt = init_expr;
7559 unsigned ivn;
7560 for (ivn = 0; ivn < nivs; ++ivn)
7562 tree_vector_builder elts (vectype, const_nunits, 1);
7563 stmts = NULL;
7564 for (unsigned eltn = 0; eltn < const_nunits; ++eltn)
7566 if (ivn*const_nunits + eltn >= group_size
7567 && (ivn * const_nunits + eltn) % group_size == 0)
7568 elt = gimple_build (&stmts, PLUS_EXPR, TREE_TYPE (elt),
7569 elt, step_expr);
7570 elts.quick_push (elt);
7572 vec_init = gimple_build_vector (&stmts, &elts);
7573 if (stmts)
7575 new_bb = gsi_insert_seq_on_edge_immediate (pe, stmts);
7576 gcc_assert (!new_bb);
7579 /* Create the induction-phi that defines the induction-operand. */
7580 vec_dest = vect_get_new_vect_var (vectype, vect_simple_var, "vec_iv_");
7581 induction_phi = create_phi_node (vec_dest, iv_loop->header);
7582 set_vinfo_for_stmt (induction_phi,
7583 new_stmt_vec_info (induction_phi, loop_vinfo));
7584 induc_def = PHI_RESULT (induction_phi);
7586 /* Create the iv update inside the loop */
7587 vec_def = make_ssa_name (vec_dest);
7588 new_stmt = gimple_build_assign (vec_def, PLUS_EXPR, induc_def, vec_step);
7589 gsi_insert_before (&si, new_stmt, GSI_SAME_STMT);
7590 set_vinfo_for_stmt (new_stmt, new_stmt_vec_info (new_stmt, loop_vinfo));
7592 /* Set the arguments of the phi node: */
7593 add_phi_arg (induction_phi, vec_init, pe, UNKNOWN_LOCATION);
7594 add_phi_arg (induction_phi, vec_def, loop_latch_edge (iv_loop),
7595 UNKNOWN_LOCATION);
7597 SLP_TREE_VEC_STMTS (slp_node).quick_push (induction_phi);
7600 /* Re-use IVs when we can. */
7601 if (ivn < nvects)
7603 unsigned vfp
7604 = least_common_multiple (group_size, const_nunits) / group_size;
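/* E.g. (continuing the illustrative group-size-3, 4-element example
   above): nivs = lcm (3, 4) / 4 = 3 and vfp = lcm (3, 4) / 3 = 4, so
   each further vector is the vector nivs positions earlier plus
   [4*S, 4*S, 4*S, 4*S].  */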
7605 /* Generate [VF'*S, VF'*S, ... ]. */
7606 if (SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr)))
7608 expr = build_int_cst (integer_type_node, vfp);
7609 expr = fold_convert (TREE_TYPE (step_expr), expr);
7611 else
7612 expr = build_int_cst (TREE_TYPE (step_expr), vfp);
7613 new_name = fold_build2 (MULT_EXPR, TREE_TYPE (step_expr),
7614 expr, step_expr);
7615 if (! CONSTANT_CLASS_P (new_name))
7616 new_name = vect_init_vector (phi, new_name,
7617 TREE_TYPE (step_expr), NULL);
7618 new_vec = build_vector_from_val (vectype, new_name);
7619 vec_step = vect_init_vector (phi, new_vec, vectype, NULL);
7620 for (; ivn < nvects; ++ivn)
7622 gimple *iv = SLP_TREE_VEC_STMTS (slp_node)[ivn - nivs];
7623 tree def;
7624 if (gimple_code (iv) == GIMPLE_PHI)
7625 def = gimple_phi_result (iv);
7626 else
7627 def = gimple_assign_lhs (iv);
7628 new_stmt = gimple_build_assign (make_ssa_name (vectype),
7629 PLUS_EXPR,
7630 def, vec_step);
7631 if (gimple_code (iv) == GIMPLE_PHI)
7632 gsi_insert_before (&si, new_stmt, GSI_SAME_STMT);
7633 else
7635 gimple_stmt_iterator tgsi = gsi_for_stmt (iv);
7636 gsi_insert_after (&tgsi, new_stmt, GSI_CONTINUE_LINKING);
7638 set_vinfo_for_stmt (new_stmt,
7639 new_stmt_vec_info (new_stmt, loop_vinfo));
7640 SLP_TREE_VEC_STMTS (slp_node).quick_push (new_stmt);
7644 return true;
7647 /* Create the vector that holds the initial_value of the induction. */
7648 if (nested_in_vect_loop)
7650 /* iv_loop is nested in the loop to be vectorized. init_expr had already
7651 been created during vectorization of previous stmts. We obtain it
7652 from the STMT_VINFO_VEC_STMT of the defining stmt. */
7653 vec_init = vect_get_vec_def_for_operand (init_expr, phi);
7654 /* If the initial value is not of proper type, convert it. */
7655 if (!useless_type_conversion_p (vectype, TREE_TYPE (vec_init)))
7657 new_stmt
7658 = gimple_build_assign (vect_get_new_ssa_name (vectype,
7659 vect_simple_var,
7660 "vec_iv_"),
7661 VIEW_CONVERT_EXPR,
7662 build1 (VIEW_CONVERT_EXPR, vectype,
7663 vec_init));
7664 vec_init = gimple_assign_lhs (new_stmt);
7665 new_bb = gsi_insert_on_edge_immediate (loop_preheader_edge (iv_loop),
7666 new_stmt);
7667 gcc_assert (!new_bb);
7668 set_vinfo_for_stmt (new_stmt,
7669 new_stmt_vec_info (new_stmt, loop_vinfo));
7672 else
7674 /* iv_loop is the loop to be vectorized. Create:
7675 vec_init = [X, X+S, X+2*S, X+3*S] (S = step_expr, X = init_expr) */
7676 stmts = NULL;
7677 new_name = gimple_convert (&stmts, TREE_TYPE (vectype), init_expr);
7679 unsigned HOST_WIDE_INT const_nunits;
7680 if (nunits.is_constant (&const_nunits))
7682 tree_vector_builder elts (vectype, const_nunits, 1);
7683 elts.quick_push (new_name);
7684 for (i = 1; i < const_nunits; i++)
7686 /* Create: new_name_i = new_name + step_expr */
7687 new_name = gimple_build (&stmts, PLUS_EXPR, TREE_TYPE (new_name),
7688 new_name, step_expr);
7689 elts.quick_push (new_name);
7691 /* Create a vector from [new_name_0, new_name_1, ...,
7692 new_name_nunits-1] */
7693 vec_init = gimple_build_vector (&stmts, &elts);
7695 else if (INTEGRAL_TYPE_P (TREE_TYPE (step_expr)))
7696 /* Build the initial value directly from a VEC_SERIES_EXPR. */
7697 vec_init = gimple_build (&stmts, VEC_SERIES_EXPR, vectype,
7698 new_name, step_expr);
7699 else
7701 /* Build:
7702 [base, base, base, ...]
7703 + (vectype) [0, 1, 2, ...] * [step, step, step, ...]. */
7704 gcc_assert (SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr)));
7705 gcc_assert (flag_associative_math);
7706 tree index = build_index_vector (vectype, 0, 1);
7707 tree base_vec = gimple_build_vector_from_val (&stmts, vectype,
7708 new_name);
7709 tree step_vec = gimple_build_vector_from_val (&stmts, vectype,
7710 step_expr);
7711 vec_init = gimple_build (&stmts, FLOAT_EXPR, vectype, index);
7712 vec_init = gimple_build (&stmts, MULT_EXPR, vectype,
7713 vec_init, step_vec);
7714 vec_init = gimple_build (&stmts, PLUS_EXPR, vectype,
7715 vec_init, base_vec);
7718 if (stmts)
7720 new_bb = gsi_insert_seq_on_edge_immediate (pe, stmts);
7721 gcc_assert (!new_bb);
7726 /* Create the vector that holds the step of the induction. */
7727 if (nested_in_vect_loop)
7728 /* iv_loop is nested in the loop to be vectorized. Generate:
7729 vec_step = [S, S, S, S] */
7730 new_name = step_expr;
7731 else
7733 /* iv_loop is the loop to be vectorized. Generate:
7734 vec_step = [VF*S, VF*S, VF*S, VF*S] */
7735 gimple_seq seq = NULL;
7736 if (SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr)))
7738 expr = build_int_cst (integer_type_node, vf);
7739 expr = gimple_build (&seq, FLOAT_EXPR, TREE_TYPE (step_expr), expr);
7741 else
7742 expr = build_int_cst (TREE_TYPE (step_expr), vf);
7743 new_name = gimple_build (&seq, MULT_EXPR, TREE_TYPE (step_expr),
7744 expr, step_expr);
7745 if (seq)
7747 new_bb = gsi_insert_seq_on_edge_immediate (pe, seq);
7748 gcc_assert (!new_bb);
7752 t = unshare_expr (new_name);
7753 gcc_assert (CONSTANT_CLASS_P (new_name)
7754 || TREE_CODE (new_name) == SSA_NAME);
7755 new_vec = build_vector_from_val (vectype, t);
7756 vec_step = vect_init_vector (phi, new_vec, vectype, NULL);
7759 /* Create the following def-use cycle:
7760 loop prolog:
7761 vec_init = ...
7762 vec_step = ...
7763 loop:
7764 vec_iv = PHI <vec_init, vec_loop>
7766 STMT
7768 vec_loop = vec_iv + vec_step; */
7770 /* Create the induction-phi that defines the induction-operand. */
7771 vec_dest = vect_get_new_vect_var (vectype, vect_simple_var, "vec_iv_");
7772 induction_phi = create_phi_node (vec_dest, iv_loop->header);
7773 set_vinfo_for_stmt (induction_phi,
7774 new_stmt_vec_info (induction_phi, loop_vinfo));
7775 induc_def = PHI_RESULT (induction_phi);
7777 /* Create the iv update inside the loop */
7778 vec_def = make_ssa_name (vec_dest);
7779 new_stmt = gimple_build_assign (vec_def, PLUS_EXPR, induc_def, vec_step);
7780 gsi_insert_before (&si, new_stmt, GSI_SAME_STMT);
7781 set_vinfo_for_stmt (new_stmt, new_stmt_vec_info (new_stmt, loop_vinfo));
7783 /* Set the arguments of the phi node: */
7784 add_phi_arg (induction_phi, vec_init, pe, UNKNOWN_LOCATION);
7785 add_phi_arg (induction_phi, vec_def, loop_latch_edge (iv_loop),
7786 UNKNOWN_LOCATION);
7788 STMT_VINFO_VEC_STMT (stmt_info) = *vec_stmt = induction_phi;
7790 /* In case the vectorization factor (VF) is bigger than the number
7791 of elements that we can fit in a vectype (nunits), we have to generate
7792 more than one vector stmt - i.e. we need to "unroll" the
7793 vector stmt by a factor VF/nunits. For more details see documentation
7794 in vectorizable_operation. */
7796 if (ncopies > 1)
7798 gimple_seq seq = NULL;
7799 stmt_vec_info prev_stmt_vinfo;
7800 /* FORNOW. This restriction should be relaxed. */
7801 gcc_assert (!nested_in_vect_loop);
7803 /* Create the vector that holds the step of the induction. */
7804 if (SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr)))
7806 expr = build_int_cst (integer_type_node, nunits);
7807 expr = gimple_build (&seq, FLOAT_EXPR, TREE_TYPE (step_expr), expr);
7809 else
7810 expr = build_int_cst (TREE_TYPE (step_expr), nunits);
7811 new_name = gimple_build (&seq, MULT_EXPR, TREE_TYPE (step_expr),
7812 expr, step_expr);
7813 if (seq)
7815 new_bb = gsi_insert_seq_on_edge_immediate (pe, seq);
7816 gcc_assert (!new_bb);
7819 t = unshare_expr (new_name);
7820 gcc_assert (CONSTANT_CLASS_P (new_name)
7821 || TREE_CODE (new_name) == SSA_NAME);
7822 new_vec = build_vector_from_val (vectype, t);
7823 vec_step = vect_init_vector (phi, new_vec, vectype, NULL);
7825 vec_def = induc_def;
7826 prev_stmt_vinfo = vinfo_for_stmt (induction_phi);
7827 for (i = 1; i < ncopies; i++)
7829 /* vec_i = vec_prev + vec_step */
7830 new_stmt = gimple_build_assign (vec_dest, PLUS_EXPR,
7831 vec_def, vec_step);
7832 vec_def = make_ssa_name (vec_dest, new_stmt);
7833 gimple_assign_set_lhs (new_stmt, vec_def);
7835 gsi_insert_before (&si, new_stmt, GSI_SAME_STMT);
7836 set_vinfo_for_stmt (new_stmt,
7837 new_stmt_vec_info (new_stmt, loop_vinfo));
7838 STMT_VINFO_RELATED_STMT (prev_stmt_vinfo) = new_stmt;
7839 prev_stmt_vinfo = vinfo_for_stmt (new_stmt);
7843 if (nested_in_vect_loop)
7845 /* Find the loop-closed exit-phi of the induction, and record
7846 the final vector of induction results: */
7847 exit_phi = NULL;
7848 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, loop_arg)
7850 gimple *use_stmt = USE_STMT (use_p);
7851 if (is_gimple_debug (use_stmt))
7852 continue;
7854 if (!flow_bb_inside_loop_p (iv_loop, gimple_bb (use_stmt)))
7856 exit_phi = use_stmt;
7857 break;
7860 if (exit_phi)
7862 stmt_vec_info stmt_vinfo = vinfo_for_stmt (exit_phi);
7863 /* FORNOW. Currently not supporting the case that an inner-loop induction
7864 is not used in the outer-loop (i.e. only outside the outer-loop). */
7865 gcc_assert (STMT_VINFO_RELEVANT_P (stmt_vinfo)
7866 && !STMT_VINFO_LIVE_P (stmt_vinfo));
7868 STMT_VINFO_VEC_STMT (stmt_vinfo) = new_stmt;
7869 if (dump_enabled_p ())
7871 dump_printf_loc (MSG_NOTE, vect_location,
7872 "vector of inductions after inner-loop:");
7873 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, new_stmt, 0);
7879 if (dump_enabled_p ())
7881 dump_printf_loc (MSG_NOTE, vect_location,
7882 "transform induction: created def-use cycle: ");
7883 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, induction_phi, 0);
7884 dump_gimple_stmt (MSG_NOTE, TDF_SLIM,
7885 SSA_NAME_DEF_STMT (vec_def), 0);
7888 return true;
7891 /* Function vectorizable_live_operation.
7893 STMT computes a value that is used outside the loop. Check if
7894 it can be supported. */
7896 bool
7897 vectorizable_live_operation (gimple *stmt,
7898 gimple_stmt_iterator *gsi ATTRIBUTE_UNUSED,
7899 slp_tree slp_node, int slp_index,
7900 gimple **vec_stmt,
7901 stmt_vector_for_cost *)
7903 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
7904 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
7905 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
7906 imm_use_iterator imm_iter;
7907 tree lhs, lhs_type, bitsize, vec_bitsize;
7908 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
7909 poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (vectype);
7910 int ncopies;
7911 gimple *use_stmt;
7912 auto_vec<tree> vec_oprnds;
7913 int vec_entry = 0;
7914 poly_uint64 vec_index = 0;
7916 gcc_assert (STMT_VINFO_LIVE_P (stmt_info));
7918 if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_reduction_def)
7919 return false;
7921 /* FORNOW. CHECKME. */
7922 if (nested_in_vect_loop_p (loop, stmt))
7923 return false;
7925 /* If STMT is not relevant and it is a simple assignment and its inputs are
7926 invariant then it can remain in place, unvectorized. The original last
7927 scalar value that it computes will be used. */
7928 if (!STMT_VINFO_RELEVANT_P (stmt_info))
7930 gcc_assert (is_simple_and_all_uses_invariant (stmt, loop_vinfo));
7931 if (dump_enabled_p ())
7932 dump_printf_loc (MSG_NOTE, vect_location,
7933 "statement is simple and uses invariant. Leaving in "
7934 "place.\n");
7935 return true;
7938 if (slp_node)
7939 ncopies = 1;
7940 else
7941 ncopies = vect_get_num_copies (loop_vinfo, vectype);
7943 if (slp_node)
7945 gcc_assert (slp_index >= 0);
7947 int num_scalar = SLP_TREE_SCALAR_STMTS (slp_node).length ();
7948 int num_vec = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node);
7950 /* Get the last occurrence of the scalar index from the concatenation of
7951 all the slp vectors. Calculate which slp vector it is and the index
7952 within. */
7953 poly_uint64 pos = (num_vec * nunits) - num_scalar + slp_index;
7955 /* Calculate which vector contains the result, and which lane of
7956 that vector we need. */
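/* For example (illustrative numbers only): with num_vec = 2,
   nunits = 4, num_scalar = 6 and slp_index = 1 we get
   pos = 8 - 6 + 1 = 3, giving vec_entry = 0 and vec_index = 3.  */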
7957 if (!can_div_trunc_p (pos, nunits, &vec_entry, &vec_index))
7959 if (dump_enabled_p ())
7960 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7961 "Cannot determine which vector holds the"
7962 " final result.\n");
7963 return false;
7967 if (!vec_stmt)
7969 /* No transformation required. */
7970 if (LOOP_VINFO_CAN_FULLY_MASK_P (loop_vinfo))
7972 if (!direct_internal_fn_supported_p (IFN_EXTRACT_LAST, vectype,
7973 OPTIMIZE_FOR_SPEED))
7975 if (dump_enabled_p ())
7976 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7977 "can't use a fully-masked loop because "
7978 "the target doesn't support extract last "
7979 "reduction.\n");
7980 LOOP_VINFO_CAN_FULLY_MASK_P (loop_vinfo) = false;
7982 else if (slp_node)
7984 if (dump_enabled_p ())
7985 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7986 "can't use a fully-masked loop because an "
7987 "SLP statement is live after the loop.\n");
7988 LOOP_VINFO_CAN_FULLY_MASK_P (loop_vinfo) = false;
7990 else if (ncopies > 1)
7992 if (dump_enabled_p ())
7993 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7994 "can't use a fully-masked loop because"
7995 " ncopies is greater than 1.\n");
7996 LOOP_VINFO_CAN_FULLY_MASK_P (loop_vinfo) = false;
7998 else
8000 gcc_assert (ncopies == 1 && !slp_node);
8001 vect_record_loop_mask (loop_vinfo,
8002 &LOOP_VINFO_MASKS (loop_vinfo),
8003 1, vectype);
8006 return true;
8009 /* If stmt has a related stmt, then use that for getting the lhs. */
8010 if (is_pattern_stmt_p (stmt_info))
8011 stmt = STMT_VINFO_RELATED_STMT (stmt_info);
8013 lhs = (is_a <gphi *> (stmt)) ? gimple_phi_result (stmt)
8014 : gimple_get_lhs (stmt);
8015 lhs_type = TREE_TYPE (lhs);
8017 bitsize = (VECTOR_BOOLEAN_TYPE_P (vectype)
8018 ? bitsize_int (TYPE_PRECISION (TREE_TYPE (vectype)))
8019 : TYPE_SIZE (TREE_TYPE (vectype)));
8020 vec_bitsize = TYPE_SIZE (vectype);
8022 /* Get the vectorized lhs of STMT and the lane to use (counted in bits). */
8023 tree vec_lhs, bitstart;
8024 if (slp_node)
8026 gcc_assert (!LOOP_VINFO_FULLY_MASKED_P (loop_vinfo));
8028 /* Get the correct slp vectorized stmt. */
8029 gimple *vec_stmt = SLP_TREE_VEC_STMTS (slp_node)[vec_entry];
8030 if (gphi *phi = dyn_cast <gphi *> (vec_stmt))
8031 vec_lhs = gimple_phi_result (phi);
8032 else
8033 vec_lhs = gimple_get_lhs (vec_stmt);
8035 /* Get entry to use. */
8036 bitstart = bitsize_int (vec_index);
8037 bitstart = int_const_binop (MULT_EXPR, bitsize, bitstart);
8039 else
8041 enum vect_def_type dt = STMT_VINFO_DEF_TYPE (stmt_info);
8042 vec_lhs = vect_get_vec_def_for_operand_1 (stmt, dt);
8043 gcc_checking_assert (ncopies == 1
8044 || !LOOP_VINFO_FULLY_MASKED_P (loop_vinfo));
8046 /* For multiple copies, get the last copy. */
8047 for (int i = 1; i < ncopies; ++i)
8048 vec_lhs = vect_get_vec_def_for_stmt_copy (vect_unknown_def_type,
8049 vec_lhs);
8051 /* Get the last lane in the vector. */
8052 bitstart = int_const_binop (MINUS_EXPR, vec_bitsize, bitsize);
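/* E.g. (illustrative): for a 4 x 32-bit vector, vec_bitsize is 128 and
   bitsize is 32, so bitstart is 96 and the BIT_FIELD_REF built below
   extracts bits 96..127, i.e. the last lane.  */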
8055 gimple_seq stmts = NULL;
8056 tree new_tree;
8057 if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
8059 /* Emit:
8061 SCALAR_RES = EXTRACT_LAST <VEC_LHS, MASK>
8063 where VEC_LHS is the vectorized live-out result and MASK is
8064 the loop mask for the final iteration. */
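/* Illustrative example: with 4-element vectors and 10 scalar
   iterations, the mask for the final vector iteration is
   { 1, 1, 0, 0 } and EXTRACT_LAST returns the element in the last
   active lane (lane 1).  */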
8065 gcc_assert (ncopies == 1 && !slp_node);
8066 tree scalar_type = TREE_TYPE (STMT_VINFO_VECTYPE (stmt_info));
8067 tree mask = vect_get_loop_mask (gsi, &LOOP_VINFO_MASKS (loop_vinfo),
8068 1, vectype, 0);
8069 tree scalar_res = gimple_build (&stmts, CFN_EXTRACT_LAST,
8070 scalar_type, mask, vec_lhs);
8072 /* Convert the extracted vector element to the required scalar type. */
8073 new_tree = gimple_convert (&stmts, lhs_type, scalar_res);
8075 else
8077 tree bftype = TREE_TYPE (vectype);
8078 if (VECTOR_BOOLEAN_TYPE_P (vectype))
8079 bftype = build_nonstandard_integer_type (tree_to_uhwi (bitsize), 1);
8080 new_tree = build3 (BIT_FIELD_REF, bftype, vec_lhs, bitsize, bitstart);
8081 new_tree = force_gimple_operand (fold_convert (lhs_type, new_tree),
8082 &stmts, true, NULL_TREE);
8085 if (stmts)
8086 gsi_insert_seq_on_edge_immediate (single_exit (loop), stmts);
8088 /* Replace use of lhs with newly computed result. If the use stmt is a
8089 single arg PHI, just replace all uses of PHI result. It's necessary
8090 because lcssa PHI defining lhs may be before newly inserted stmt. */
8091 use_operand_p use_p;
8092 FOR_EACH_IMM_USE_STMT (use_stmt, imm_iter, lhs)
8093 if (!flow_bb_inside_loop_p (loop, gimple_bb (use_stmt))
8094 && !is_gimple_debug (use_stmt))
8096 if (gimple_code (use_stmt) == GIMPLE_PHI
8097 && gimple_phi_num_args (use_stmt) == 1)
8099 replace_uses_by (gimple_phi_result (use_stmt), new_tree);
8101 else
8103 FOR_EACH_IMM_USE_ON_STMT (use_p, imm_iter)
8104 SET_USE (use_p, new_tree);
8106 update_stmt (use_stmt);
8109 return true;
8112 /* Kill any debug uses outside LOOP of SSA names defined in STMT. */
8114 static void
8115 vect_loop_kill_debug_uses (struct loop *loop, gimple *stmt)
8117 ssa_op_iter op_iter;
8118 imm_use_iterator imm_iter;
8119 def_operand_p def_p;
8120 gimple *ustmt;
8122 FOR_EACH_PHI_OR_STMT_DEF (def_p, stmt, op_iter, SSA_OP_DEF)
8124 FOR_EACH_IMM_USE_STMT (ustmt, imm_iter, DEF_FROM_PTR (def_p))
8126 basic_block bb;
8128 if (!is_gimple_debug (ustmt))
8129 continue;
8131 bb = gimple_bb (ustmt);
8133 if (!flow_bb_inside_loop_p (loop, bb))
8135 if (gimple_debug_bind_p (ustmt))
8137 if (dump_enabled_p ())
8138 dump_printf_loc (MSG_NOTE, vect_location,
8139 "killing debug use\n");
8141 gimple_debug_bind_reset_value (ustmt);
8142 update_stmt (ustmt);
8144 else
8145 gcc_unreachable ();
8151 /* Given loop represented by LOOP_VINFO, return true if computation of
8152 LOOP_VINFO_NITERS (= LOOP_VINFO_NITERSM1 + 1) doesn't overflow, false
8153 otherwise. */
8155 static bool
8156 loop_niters_no_overflow (loop_vec_info loop_vinfo)
8158 /* Constant case. */
8159 if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
8161 tree cst_niters = LOOP_VINFO_NITERS (loop_vinfo);
8162 tree cst_nitersm1 = LOOP_VINFO_NITERSM1 (loop_vinfo);
8164 gcc_assert (TREE_CODE (cst_niters) == INTEGER_CST);
8165 gcc_assert (TREE_CODE (cst_nitersm1) == INTEGER_CST);
8166 if (wi::to_widest (cst_nitersm1) < wi::to_widest (cst_niters))
8167 return true;
8170 widest_int max;
8171 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
8172 /* Check the upper bound of loop niters. */
8173 if (get_max_loop_iterations (loop, &max))
8175 tree type = TREE_TYPE (LOOP_VINFO_NITERS (loop_vinfo));
8176 signop sgn = TYPE_SIGN (type);
8177 widest_int type_max = widest_int::from (wi::max_value (type), sgn);
8178 if (max < type_max)
8179 return true;
8181 return false;
8184 /* Return a mask type with half the number of elements as TYPE. */
8186 tree
8187 vect_halve_mask_nunits (tree type)
8189 poly_uint64 nunits = exact_div (TYPE_VECTOR_SUBPARTS (type), 2);
8190 return build_truth_vector_type (nunits, current_vector_size);
8193 /* Return a mask type with twice as many elements as TYPE. */
8195 tree
8196 vect_double_mask_nunits (tree type)
8198 poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (type) * 2;
8199 return build_truth_vector_type (nunits, current_vector_size);
8202 /* Record that a fully-masked version of LOOP_VINFO would need MASKS to
8203 contain a sequence of NVECTORS masks that each control a vector of type
8204 VECTYPE. */
8206 void
8207 vect_record_loop_mask (loop_vec_info loop_vinfo, vec_loop_masks *masks,
8208 unsigned int nvectors, tree vectype)
8210 gcc_assert (nvectors != 0);
8211 if (masks->length () < nvectors)
8212 masks->safe_grow_cleared (nvectors);
8213 rgroup_masks *rgm = &(*masks)[nvectors - 1];
8214 /* The number of scalars per iteration and the number of vectors are
8215 both compile-time constants. */
8216 unsigned int nscalars_per_iter
8217 = exact_div (nvectors * TYPE_VECTOR_SUBPARTS (vectype),
8218 LOOP_VINFO_VECT_FACTOR (loop_vinfo)).to_constant ();
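/* E.g. (illustrative): 2 vectors of 8 elements with a vectorization
   factor of 16 give nscalars_per_iter = 2 * 8 / 16 = 1, whereas
   2 vectors of 16 elements would give 2.  */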
8219 if (rgm->max_nscalars_per_iter < nscalars_per_iter)
8221 rgm->max_nscalars_per_iter = nscalars_per_iter;
8222 rgm->mask_type = build_same_sized_truth_vector_type (vectype);
8226 /* Given a complete set of masks MASKS, extract mask number INDEX
8227 for an rgroup that operates on NVECTORS vectors of type VECTYPE,
8228 where 0 <= INDEX < NVECTORS. Insert any set-up statements before GSI.
8230 See the comment above vec_loop_masks for more details about the mask
8231 arrangement. */
8233 tree
8234 vect_get_loop_mask (gimple_stmt_iterator *gsi, vec_loop_masks *masks,
8235 unsigned int nvectors, tree vectype, unsigned int index)
8237 rgroup_masks *rgm = &(*masks)[nvectors - 1];
8238 tree mask_type = rgm->mask_type;
8240 /* Populate the rgroup's mask array, if this is the first time we've
8241 used it. */
8242 if (rgm->masks.is_empty ())
8244 rgm->masks.safe_grow_cleared (nvectors);
8245 for (unsigned int i = 0; i < nvectors; ++i)
8247 tree mask = make_temp_ssa_name (mask_type, NULL, "loop_mask");
8248 /* Provide a dummy definition until the real one is available. */
8249 SSA_NAME_DEF_STMT (mask) = gimple_build_nop ();
8250 rgm->masks[i] = mask;
8254 tree mask = rgm->masks[index];
8255 if (maybe_ne (TYPE_VECTOR_SUBPARTS (mask_type),
8256 TYPE_VECTOR_SUBPARTS (vectype)))
8258 /* A loop mask for data type X can be reused for data type Y
8259 if X has N times more elements than Y and if Y's elements
8260 are N times bigger than X's. In this case each sequence
8261 of N elements in the loop mask will be all-zero or all-one.
8262 We can then view-convert the mask so that each sequence of
8263 N elements is replaced by a single element. */
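/* Concretely (illustrative): a mask computed for 8 x 16-bit elements
   can serve a vector of 4 x 32-bit elements, because each pair of mask
   elements is known to be all-zero or all-one and the
   VIEW_CONVERT_EXPR below collapses each pair into a single element.  */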
8264 gcc_assert (multiple_p (TYPE_VECTOR_SUBPARTS (mask_type),
8265 TYPE_VECTOR_SUBPARTS (vectype)));
8266 gimple_seq seq = NULL;
8267 mask_type = build_same_sized_truth_vector_type (vectype);
8268 mask = gimple_build (&seq, VIEW_CONVERT_EXPR, mask_type, mask);
8269 if (seq)
8270 gsi_insert_seq_before (gsi, seq, GSI_SAME_STMT);
8272 return mask;
8275 /* Scale profiling counters by estimation for LOOP which is vectorized
8276 by factor VF. */
8278 static void
8279 scale_profile_for_vect_loop (struct loop *loop, unsigned vf)
8281 edge preheader = loop_preheader_edge (loop);
8282 /* Reduce loop iterations by the vectorization factor. */
8283 gcov_type new_est_niter = niter_for_unrolled_loop (loop, vf);
8284 profile_count freq_h = loop->header->count, freq_e = preheader->count ();
8286 if (freq_h.nonzero_p ())
8288 profile_probability p;
8290 /* Avoid dropping loop body profile counter to 0 because of zero count
8291 in loop's preheader. */
8292 if (!(freq_e == profile_count::zero ()))
8293 freq_e = freq_e.force_nonzero ();
8294 p = freq_e.apply_scale (new_est_niter + 1, 1).probability_in (freq_h);
8295 scale_loop_frequencies (loop, p);
8298 edge exit_e = single_exit (loop);
8299 exit_e->probability = profile_probability::always ()
8300 .apply_scale (1, new_est_niter + 1);
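/* E.g. (illustrative): if the vector loop is expected to iterate
   new_est_niter = 8 times, the exit edge gets probability 1/9 and the
   latch edge, set just below, gets 8/9.  */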
8302 edge exit_l = single_pred_edge (loop->latch);
8303 profile_probability prob = exit_l->probability;
8304 exit_l->probability = exit_e->probability.invert ();
8305 if (prob.initialized_p () && exit_l->probability.initialized_p ())
8306 scale_bbs_frequencies (&loop->latch, 1, exit_l->probability / prob);
8309 /* Function vect_transform_loop.
8311 The analysis phase has determined that the loop is vectorizable.
8312 Vectorize the loop - create vectorized stmts to replace the scalar
8313 stmts in the loop, and update the loop exit condition.
8314 Returns scalar epilogue loop if any. */
8316 struct loop *
8317 vect_transform_loop (loop_vec_info loop_vinfo)
8319 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
8320 struct loop *epilogue = NULL;
8321 basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
8322 int nbbs = loop->num_nodes;
8323 int i;
8324 tree niters_vector = NULL_TREE;
8325 tree step_vector = NULL_TREE;
8326 tree niters_vector_mult_vf = NULL_TREE;
8327 poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
8328 unsigned int lowest_vf = constant_lower_bound (vf);
8329 bool grouped_store;
8330 bool slp_scheduled = false;
8331 gimple *stmt, *pattern_stmt;
8332 gimple_seq pattern_def_seq = NULL;
8333 gimple_stmt_iterator pattern_def_si = gsi_none ();
8334 bool transform_pattern_stmt = false;
8335 bool check_profitability = false;
8336 unsigned int th;
8338 if (dump_enabled_p ())
8339 dump_printf_loc (MSG_NOTE, vect_location, "=== vec_transform_loop ===\n");
8341 /* Use the more conservative vectorization threshold. If the number
8342 of iterations is constant assume the cost check has been performed
8343 by our caller. If the threshold makes all loops profitable that
8344 run at least the (estimated) vectorization factor number of times,
8345 checking is pointless, too. */
8346 th = LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo);
8347 if (th >= vect_vf_for_cost (loop_vinfo)
8348 && !LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
8350 if (dump_enabled_p ())
8351 dump_printf_loc (MSG_NOTE, vect_location,
8352 "Profitability threshold is %d loop iterations.\n",
8353 th);
8354 check_profitability = true;
8357 /* Make sure there exists a single-predecessor exit bb. Do this before
8358 versioning. */
8359 edge e = single_exit (loop);
8360 if (! single_pred_p (e->dest))
8362 split_loop_exit_edge (e);
8363 if (dump_enabled_p ())
8364 dump_printf (MSG_NOTE, "split exit edge\n");
8367 /* Version the loop first, if required, so the profitability check
8368 comes first. */
8370 if (LOOP_REQUIRES_VERSIONING (loop_vinfo))
8372 poly_uint64 versioning_threshold
8373 = LOOP_VINFO_VERSIONING_THRESHOLD (loop_vinfo);
8374 if (check_profitability
8375 && ordered_p (poly_uint64 (th), versioning_threshold))
8377 versioning_threshold = ordered_max (poly_uint64 (th),
8378 versioning_threshold);
8379 check_profitability = false;
8381 vect_loop_versioning (loop_vinfo, th, check_profitability,
8382 versioning_threshold);
8383 check_profitability = false;
8386 /* Make sure there exists a single-predecessor exit bb also on the
8387 scalar loop copy. Do this after versioning but before peeling
8388 so CFG structure is fine for both scalar and if-converted loop
8389 to make slpeel_duplicate_current_defs_from_edges face matched
8390 loop closed PHI nodes on the exit. */
8391 if (LOOP_VINFO_SCALAR_LOOP (loop_vinfo))
8393 e = single_exit (LOOP_VINFO_SCALAR_LOOP (loop_vinfo));
8394 if (! single_pred_p (e->dest))
8396 split_loop_exit_edge (e);
8397 if (dump_enabled_p ())
8398 dump_printf (MSG_NOTE, "split exit edge of scalar loop\n");
8402 tree niters = vect_build_loop_niters (loop_vinfo);
8403 LOOP_VINFO_NITERS_UNCHANGED (loop_vinfo) = niters;
8404 tree nitersm1 = unshare_expr (LOOP_VINFO_NITERSM1 (loop_vinfo));
8405 bool niters_no_overflow = loop_niters_no_overflow (loop_vinfo);
8406 epilogue = vect_do_peeling (loop_vinfo, niters, nitersm1, &niters_vector,
8407 &step_vector, &niters_vector_mult_vf, th,
8408 check_profitability, niters_no_overflow);
8410 if (niters_vector == NULL_TREE)
8412 if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
8413 && !LOOP_VINFO_FULLY_MASKED_P (loop_vinfo)
8414 && known_eq (lowest_vf, vf))
8416 niters_vector
8417 = build_int_cst (TREE_TYPE (LOOP_VINFO_NITERS (loop_vinfo)),
8418 LOOP_VINFO_INT_NITERS (loop_vinfo) / lowest_vf);
8419 step_vector = build_one_cst (TREE_TYPE (niters));
8421 else
8422 vect_gen_vector_loop_niters (loop_vinfo, niters, &niters_vector,
8423 &step_vector, niters_no_overflow);
8426 /* 1) Make sure the loop header has exactly two entries
8427 2) Make sure we have a preheader basic block. */
8429 gcc_assert (EDGE_COUNT (loop->header->preds) == 2);
8431 split_edge (loop_preheader_edge (loop));
8433 if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo)
8434 && vect_use_loop_mask_for_alignment_p (loop_vinfo))
8435 /* This will deal with any possible peeling. */
8436 vect_prepare_for_masked_peels (loop_vinfo);
8438 /* FORNOW: the vectorizer supports only loops whose body consists
8439 of one basic block (header + empty latch). When the vectorizer
8440 supports more involved loop forms, the order in which the BBs are
8441 traversed needs to be reconsidered. */
8443 for (i = 0; i < nbbs; i++)
8445 basic_block bb = bbs[i];
8446 stmt_vec_info stmt_info;
8448 for (gphi_iterator si = gsi_start_phis (bb); !gsi_end_p (si);
8449 gsi_next (&si))
8451 gphi *phi = si.phi ();
8452 if (dump_enabled_p ())
8454 dump_printf_loc (MSG_NOTE, vect_location,
8455 "------>vectorizing phi: ");
8456 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, phi, 0);
8458 stmt_info = vinfo_for_stmt (phi);
8459 if (!stmt_info)
8460 continue;
8462 if (MAY_HAVE_DEBUG_BIND_STMTS && !STMT_VINFO_LIVE_P (stmt_info))
8463 vect_loop_kill_debug_uses (loop, phi);
8465 if (!STMT_VINFO_RELEVANT_P (stmt_info)
8466 && !STMT_VINFO_LIVE_P (stmt_info))
8467 continue;
8469 if (STMT_VINFO_VECTYPE (stmt_info)
8470 && (maybe_ne
8471 (TYPE_VECTOR_SUBPARTS (STMT_VINFO_VECTYPE (stmt_info)), vf))
8472 && dump_enabled_p ())
8473 dump_printf_loc (MSG_NOTE, vect_location, "multiple-types.\n");
8475 if ((STMT_VINFO_DEF_TYPE (stmt_info) == vect_induction_def
8476 || STMT_VINFO_DEF_TYPE (stmt_info) == vect_reduction_def
8477 || STMT_VINFO_DEF_TYPE (stmt_info) == vect_nested_cycle)
8478 && ! PURE_SLP_STMT (stmt_info))
8480 if (dump_enabled_p ())
8481 dump_printf_loc (MSG_NOTE, vect_location, "transform phi.\n");
8482 vect_transform_stmt (phi, NULL, NULL, NULL, NULL);
8486 pattern_stmt = NULL;
8487 for (gimple_stmt_iterator si = gsi_start_bb (bb);
8488 !gsi_end_p (si) || transform_pattern_stmt;)
8490 bool is_store;
8492 if (transform_pattern_stmt)
8493 stmt = pattern_stmt;
8494 else
8496 stmt = gsi_stmt (si);
8497 /* During vectorization remove existing clobber stmts. */
8498 if (gimple_clobber_p (stmt))
8500 unlink_stmt_vdef (stmt);
8501 gsi_remove (&si, true);
8502 release_defs (stmt);
8503 continue;
8507 if (dump_enabled_p ())
8509 dump_printf_loc (MSG_NOTE, vect_location,
8510 "------>vectorizing statement: ");
8511 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, stmt, 0);
8514 stmt_info = vinfo_for_stmt (stmt);
8516 /* vector stmts created in the outer-loop during vectorization of
8517 stmts in an inner-loop may not have a stmt_info, and do not
8518 need to be vectorized. */
8519 if (!stmt_info)
8521 gsi_next (&si);
8522 continue;
8525 if (MAY_HAVE_DEBUG_BIND_STMTS && !STMT_VINFO_LIVE_P (stmt_info))
8526 vect_loop_kill_debug_uses (loop, stmt);
8528 if (!STMT_VINFO_RELEVANT_P (stmt_info)
8529 && !STMT_VINFO_LIVE_P (stmt_info))
8531 if (STMT_VINFO_IN_PATTERN_P (stmt_info)
8532 && (pattern_stmt = STMT_VINFO_RELATED_STMT (stmt_info))
8533 && (STMT_VINFO_RELEVANT_P (vinfo_for_stmt (pattern_stmt))
8534 || STMT_VINFO_LIVE_P (vinfo_for_stmt (pattern_stmt))))
8536 stmt = pattern_stmt;
8537 stmt_info = vinfo_for_stmt (stmt);
8539 else
8541 gsi_next (&si);
8542 continue;
8545 else if (STMT_VINFO_IN_PATTERN_P (stmt_info)
8546 && (pattern_stmt = STMT_VINFO_RELATED_STMT (stmt_info))
8547 && (STMT_VINFO_RELEVANT_P (vinfo_for_stmt (pattern_stmt))
8548 || STMT_VINFO_LIVE_P (vinfo_for_stmt (pattern_stmt))))
8549 transform_pattern_stmt = true;
8551 /* If pattern statement has def stmts, vectorize them too. */
8552 if (is_pattern_stmt_p (stmt_info))
8554 if (pattern_def_seq == NULL)
8556 pattern_def_seq = STMT_VINFO_PATTERN_DEF_SEQ (stmt_info);
8557 pattern_def_si = gsi_start (pattern_def_seq);
8559 else if (!gsi_end_p (pattern_def_si))
8560 gsi_next (&pattern_def_si);
8561 if (pattern_def_seq != NULL)
8563 gimple *pattern_def_stmt = NULL;
8564 stmt_vec_info pattern_def_stmt_info = NULL;
8566 while (!gsi_end_p (pattern_def_si))
8568 pattern_def_stmt = gsi_stmt (pattern_def_si);
8569 pattern_def_stmt_info
8570 = vinfo_for_stmt (pattern_def_stmt);
8571 if (STMT_VINFO_RELEVANT_P (pattern_def_stmt_info)
8572 || STMT_VINFO_LIVE_P (pattern_def_stmt_info))
8573 break;
8574 gsi_next (&pattern_def_si);
8577 if (!gsi_end_p (pattern_def_si))
8579 if (dump_enabled_p ())
8581 dump_printf_loc (MSG_NOTE, vect_location,
8582 "==> vectorizing pattern def "
8583 "stmt: ");
8584 dump_gimple_stmt (MSG_NOTE, TDF_SLIM,
8585 pattern_def_stmt, 0);
8588 stmt = pattern_def_stmt;
8589 stmt_info = pattern_def_stmt_info;
8591 else
8593 pattern_def_si = gsi_none ();
8594 transform_pattern_stmt = false;
8597 else
8598 transform_pattern_stmt = false;
8601 if (STMT_VINFO_VECTYPE (stmt_info))
8603 poly_uint64 nunits
8604 = TYPE_VECTOR_SUBPARTS (STMT_VINFO_VECTYPE (stmt_info));
8605 if (!STMT_SLP_TYPE (stmt_info)
8606 && maybe_ne (nunits, vf)
8607 && dump_enabled_p ())
8608 /* For SLP, VF is set according to the unrolling factor, and not
8609 to the vector size; hence for SLP this print is not valid. */
8610 dump_printf_loc (MSG_NOTE, vect_location, "multiple-types.\n");
8613 /* SLP. Schedule all the SLP instances when the first SLP stmt is
8614 reached. */
8615 if (STMT_SLP_TYPE (stmt_info))
8617 if (!slp_scheduled)
8619 slp_scheduled = true;
8621 if (dump_enabled_p ())
8622 dump_printf_loc (MSG_NOTE, vect_location,
8623 "=== scheduling SLP instances ===\n");
8625 vect_schedule_slp (loop_vinfo);
8628 /* Hybrid SLP stmts must be vectorized in addition to SLP. */
8629 if (!vinfo_for_stmt (stmt) || PURE_SLP_STMT (stmt_info))
8631 if (!transform_pattern_stmt && gsi_end_p (pattern_def_si))
8633 pattern_def_seq = NULL;
8634 gsi_next (&si);
8636 continue;
8640 /* -------- vectorize statement ------------ */
8641 if (dump_enabled_p ())
8642 dump_printf_loc (MSG_NOTE, vect_location, "transform statement.\n");
8644 grouped_store = false;
8645 is_store = vect_transform_stmt (stmt, &si, &grouped_store, NULL, NULL);
8646 if (is_store)
8648 if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
8650 /* Interleaving. If IS_STORE is TRUE, the vectorization of the
8651 interleaving chain was completed - free all the stores in
8652 the chain. */
8653 gsi_next (&si);
8654 vect_remove_stores (DR_GROUP_FIRST_ELEMENT (stmt_info));
8656 else
8658 /* Free the attached stmt_vec_info and remove the stmt. */
8659 gimple *store = gsi_stmt (si);
8660 free_stmt_vec_info (store);
8661 unlink_stmt_vdef (store);
8662 gsi_remove (&si, true);
8663 release_defs (store);
8666 /* Stores can only appear at the end of pattern statements. */
8667 gcc_assert (!transform_pattern_stmt);
8668 pattern_def_seq = NULL;
8670 else if (!transform_pattern_stmt && gsi_end_p (pattern_def_si))
8672 pattern_def_seq = NULL;
8673 gsi_next (&si);
8675 } /* stmts in BB */
8677 /* Stub out scalar statements that must not survive vectorization.
8678 Doing this here helps with grouped statements, or statements that
8679 are involved in patterns. */
8680 for (gimple_stmt_iterator gsi = gsi_start_bb (bb);
8681 !gsi_end_p (gsi); gsi_next (&gsi))
8683 gcall *call = dyn_cast <gcall *> (gsi_stmt (gsi));
8684 if (call && gimple_call_internal_p (call, IFN_MASK_LOAD))
8686 tree lhs = gimple_get_lhs (call);
8687 if (!VECTOR_TYPE_P (TREE_TYPE (lhs)))
8689 tree zero = build_zero_cst (TREE_TYPE (lhs));
8690 gimple *new_stmt = gimple_build_assign (lhs, zero);
8691 gsi_replace (&gsi, new_stmt, true);
8695 } /* BBs in loop */
8697 /* The vectorization factor is always > 1, so if we use an IV increment of 1,
8698 a zero NITERS becomes a nonzero NITERS_VECTOR. */
8699 if (integer_onep (step_vector))
8700 niters_no_overflow = true;
8701 vect_set_loop_condition (loop, loop_vinfo, niters_vector, step_vector,
8702 niters_vector_mult_vf, !niters_no_overflow);
8704 unsigned int assumed_vf = vect_vf_for_cost (loop_vinfo);
8705 scale_profile_for_vect_loop (loop, assumed_vf);
8707 /* True if the final iteration might not handle a full vector's
8708 worth of scalar iterations. */
8709 bool final_iter_may_be_partial = LOOP_VINFO_FULLY_MASKED_P (loop_vinfo);
8710 /* The minimum number of iterations performed by the epilogue. This
8711 is 1 when peeling for gaps because we always need a final scalar
8712 iteration. */
8713 int min_epilogue_iters = LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo) ? 1 : 0;
8714 /* +1 to convert latch counts to loop iteration counts,
8715 -min_epilogue_iters to remove iterations that cannot be performed
8716 by the vector code. */
8717 int bias_for_lowest = 1 - min_epilogue_iters;
8718 int bias_for_assumed = bias_for_lowest;
8719 int alignment_npeels = LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo);
8720 if (alignment_npeels && LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
8722 /* When the amount of peeling is known at compile time, the first
8723 iteration will have exactly alignment_npeels active elements.
8724 In the worst case it will have at least one. */
8725 int min_first_active = (alignment_npeels > 0 ? alignment_npeels : 1);
8726 bias_for_lowest += lowest_vf - min_first_active;
8727 bias_for_assumed += assumed_vf - min_first_active;
8729 /* In these calculations the "- 1" converts loop iteration counts
8730 back to latch counts. */
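/* Worked example (illustrative, assuming no peeling for gaps or
   alignment and a full final vector): with lowest_vf = 4,
   bias_for_lowest is 1, so a scalar latch bound of 11 (12 iterations)
   becomes floor ((11 + 1) / 4) - 1 = 2, i.e. 3 vector iterations.  */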
8731 if (loop->any_upper_bound)
8732 loop->nb_iterations_upper_bound
8733 = (final_iter_may_be_partial
8734 ? wi::udiv_ceil (loop->nb_iterations_upper_bound + bias_for_lowest,
8735 lowest_vf) - 1
8736 : wi::udiv_floor (loop->nb_iterations_upper_bound + bias_for_lowest,
8737 lowest_vf) - 1);
8738 if (loop->any_likely_upper_bound)
8739 loop->nb_iterations_likely_upper_bound
8740 = (final_iter_may_be_partial
8741 ? wi::udiv_ceil (loop->nb_iterations_likely_upper_bound
8742 + bias_for_lowest, lowest_vf) - 1
8743 : wi::udiv_floor (loop->nb_iterations_likely_upper_bound
8744 + bias_for_lowest, lowest_vf) - 1);
8745 if (loop->any_estimate)
8746 loop->nb_iterations_estimate
8747 = (final_iter_may_be_partial
8748 ? wi::udiv_ceil (loop->nb_iterations_estimate + bias_for_assumed,
8749 assumed_vf) - 1
8750 : wi::udiv_floor (loop->nb_iterations_estimate + bias_for_assumed,
8751 assumed_vf) - 1);
8753 if (dump_enabled_p ())
8755 if (!LOOP_VINFO_EPILOGUE_P (loop_vinfo))
8757 dump_printf_loc (MSG_NOTE, vect_location,
8758 "LOOP VECTORIZED\n");
8759 if (loop->inner)
8760 dump_printf_loc (MSG_NOTE, vect_location,
8761 "OUTER LOOP VECTORIZED\n");
8762 dump_printf (MSG_NOTE, "\n");
8764 else
8766 dump_printf_loc (MSG_NOTE, vect_location,
8767 "LOOP EPILOGUE VECTORIZED (VS=");
8768 dump_dec (MSG_NOTE, current_vector_size);
8769 dump_printf (MSG_NOTE, ")\n");
8773 /* Free SLP instances here because otherwise stmt reference counting
8774 won't work. */
8775 slp_instance instance;
8776 FOR_EACH_VEC_ELT (LOOP_VINFO_SLP_INSTANCES (loop_vinfo), i, instance)
8777 vect_free_slp_instance (instance);
8778 LOOP_VINFO_SLP_INSTANCES (loop_vinfo).release ();
8779 /* Clear up the safelen field since its value is invalid after vectorization,
8780 as the vectorized loop can have loop-carried dependencies. */
8781 loop->safelen = 0;
8783 /* Don't vectorize epilogue for epilogue. */
8784 if (LOOP_VINFO_EPILOGUE_P (loop_vinfo))
8785 epilogue = NULL;
8787 if (!PARAM_VALUE (PARAM_VECT_EPILOGUES_NOMASK))
8788 epilogue = NULL;
8790 if (epilogue)
8791 {
8792 auto_vector_sizes vector_sizes;
8793 targetm.vectorize.autovectorize_vector_sizes (&vector_sizes);
8794 unsigned int next_size = 0;
8796 if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
8797 && LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) >= 0
8798 && known_eq (vf, lowest_vf))
8799 {
8800 unsigned int eiters
8801 = (LOOP_VINFO_INT_NITERS (loop_vinfo)
8802 - LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo));
8803 eiters = eiters % lowest_vf;
8804 epilogue->nb_iterations_upper_bound = eiters - 1;
8806 unsigned int ratio;
8807 while (next_size < vector_sizes.length ()
8808 && !(constant_multiple_p (current_vector_size,
8809 vector_sizes[next_size], &ratio)
8810 && eiters >= lowest_vf / ratio))
8811 next_size += 1;
8812 }
8813 else
8814 while (next_size < vector_sizes.length ()
8815 && maybe_lt (current_vector_size, vector_sizes[next_size]))
8816 next_size += 1;
8818 if (next_size == vector_sizes.length ())
8819 epilogue = NULL;
8820 }
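/* A worked example with assumed numbers (the vector sizes come from the
   target hook and are only assumed here): if LOOP_VINFO_INT_NITERS is 103,
   there is no peeling for alignment and lowest_vf == 16, the epilogue runs
   at most eiters = 103 % 16 = 7 scalar iterations and its
   nb_iterations_upper_bound is set to 6.  With target vector sizes
   {64, 32, 16} bytes and current_vector_size == 64, the search above skips
   64 (ratio 1, needs eiters >= 16) and 32 (ratio 2, needs eiters >= 8) and
   settles on 16-byte vectors (ratio 4, needs eiters >= 4).  */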
8822 if (epilogue)
8823 {
8824 epilogue->force_vectorize = loop->force_vectorize;
8825 epilogue->safelen = loop->safelen;
8826 epilogue->dont_vectorize = false;
8828 /* We may need to if-convert the epilogue to vectorize it. */
8829 if (LOOP_VINFO_SCALAR_LOOP (loop_vinfo))
8830 tree_if_conversion (epilogue);
8831 }
8833 return epilogue;
8834 }
8836 /* The code below performs a simple optimization: it reverts if-conversion
8837 for masked stores, i.e. if the mask of a store is all-zero, the store is
8838 skipped, and so are the producers of the stored values where possible.
8839 For example,
8840 for (i=0; i<n; i++)
8841 if (c[i])
8842 {
8843 p1[i] += 1;
8844 p2[i] = p3[i] + 2;
8845 }
8846 this transformation will produce the following semi-hammock:
8848 if (!mask__ifc__42.18_165 == { 0, 0, 0, 0, 0, 0, 0, 0 })
8849 {
8850 vect__11.19_170 = MASK_LOAD (vectp_p1.20_168, 0B, mask__ifc__42.18_165);
8851 vect__12.22_172 = vect__11.19_170 + vect_cst__171;
8852 MASK_STORE (vectp_p1.23_175, 0B, mask__ifc__42.18_165, vect__12.22_172);
8853 vect__18.25_182 = MASK_LOAD (vectp_p3.26_180, 0B, mask__ifc__42.18_165);
8854 vect__19.28_184 = vect__18.25_182 + vect_cst__183;
8855 MASK_STORE (vectp_p2.29_187, 0B, mask__ifc__42.18_165, vect__19.28_184);
8856 }
8857 */
8859 void
8860 optimize_mask_stores (struct loop *loop)
8861 {
8862 basic_block *bbs = get_loop_body (loop);
8863 unsigned nbbs = loop->num_nodes;
8864 unsigned i;
8865 basic_block bb;
8866 struct loop *bb_loop;
8867 gimple_stmt_iterator gsi;
8868 gimple *stmt;
8869 auto_vec<gimple *> worklist;
8871 vect_location = find_loop_location (loop);
8872 /* Pick up all masked stores in loop if any. */
8873 for (i = 0; i < nbbs; i++)
8874 {
8875 bb = bbs[i];
8876 for (gsi = gsi_start_bb (bb); !gsi_end_p (gsi);
8877 gsi_next (&gsi))
8878 {
8879 stmt = gsi_stmt (gsi);
8880 if (gimple_call_internal_p (stmt, IFN_MASK_STORE))
8881 worklist.safe_push (stmt);
8882 }
8883 }
8885 free (bbs);
8886 if (worklist.is_empty ())
8887 return;
8889 /* Loop has masked stores. */
8890 while (!worklist.is_empty ())
8891 {
8892 gimple *last, *last_store;
8893 edge e, efalse;
8894 tree mask;
8895 basic_block store_bb, join_bb;
8896 gimple_stmt_iterator gsi_to;
8897 tree vdef, new_vdef;
8898 gphi *phi;
8899 tree vectype;
8900 tree zero;
8902 last = worklist.pop ();
8903 mask = gimple_call_arg (last, 2);
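/* For IFN_MASK_STORE the call operands are (pointer, alignment, mask,
   stored value), as in the MASK_STORE calls shown in the comment before
   this function, so operand 2 is the mask.  */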
8904 bb = gimple_bb (last);
8905 /* Create then_bb and the if-then structure in the CFG; then_bb belongs
8906 to the same loop as if_bb. That loop can differ from LOOP when a
8907 two-level loop nest is vectorized and the mask_store belongs to the
8908 inner loop. */
8909 e = split_block (bb, last);
8910 bb_loop = bb->loop_father;
8911 gcc_assert (loop == bb_loop || flow_loop_nested_p (loop, bb_loop));
8912 join_bb = e->dest;
8913 store_bb = create_empty_bb (bb);
8914 add_bb_to_loop (store_bb, bb_loop);
8915 e->flags = EDGE_TRUE_VALUE;
8916 efalse = make_edge (bb, store_bb, EDGE_FALSE_VALUE);
8917 /* Put STORE_BB on the unlikely path. */
8918 efalse->probability = profile_probability::unlikely ();
8919 store_bb->count = efalse->count ();
8920 make_single_succ_edge (store_bb, join_bb, EDGE_FALLTHRU);
8921 if (dom_info_available_p (CDI_DOMINATORS))
8922 set_immediate_dominator (CDI_DOMINATORS, store_bb, bb);
8923 if (dump_enabled_p ())
8924 dump_printf_loc (MSG_NOTE, vect_location,
8925 "Create new block %d to sink mask stores.",
8926 store_bb->index);
8927 /* Create vector comparison with boolean result. */
8928 vectype = TREE_TYPE (mask);
8929 zero = build_zero_cst (vectype);
8930 stmt = gimple_build_cond (EQ_EXPR, mask, zero, NULL_TREE, NULL_TREE);
8931 gsi = gsi_last_bb (bb);
8932 gsi_insert_after (&gsi, stmt, GSI_SAME_STMT);
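/* At this point the CFG around the masked store looks like this (sketch):

     bb:   ... ; if (mask == { 0, ... })   <- EQ_EXPR condition built above
      | \
      |  \ EDGE_FALSE_VALUE (some mask element is nonzero, marked unlikely)
      |   v
      |  store_bb   <- the masked stores are sunk here below
      |   |
      |   | EDGE_FALLTHRU
      v   v
     join_bb

   The EDGE_TRUE_VALUE edge from BB to JOIN_BB bypasses STORE_BB entirely
   when the whole mask is zero.  */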
8933 /* Create new PHI node for vdef of the last masked store:
8934 .MEM_2 = VDEF <.MEM_1>
8935 will be converted to
8936 .MEM.3 = VDEF <.MEM_1>
8937 and new PHI node will be created in join bb
8938 .MEM_2 = PHI <.MEM_1, .MEM_3>
8939 */
8940 vdef = gimple_vdef (last);
8941 new_vdef = make_ssa_name (gimple_vop (cfun), last);
8942 gimple_set_vdef (last, new_vdef);
8943 phi = create_phi_node (vdef, join_bb);
8944 add_phi_arg (phi, new_vdef, EDGE_SUCC (store_bb, 0), UNKNOWN_LOCATION);
8946 /* Put all masked stores with the same mask to STORE_BB if possible. */
8947 while (true)
8948 {
8949 gimple_stmt_iterator gsi_from;
8950 gimple *stmt1 = NULL;
8952 /* Move masked store to STORE_BB. */
8953 last_store = last;
8954 gsi = gsi_for_stmt (last);
8955 gsi_from = gsi;
8956 /* Shift GSI to the previous stmt for further traversal. */
8957 gsi_prev (&gsi);
8958 gsi_to = gsi_start_bb (store_bb);
8959 gsi_move_before (&gsi_from, &gsi_to);
8960 /* Set GSI_TO to the start of the now non-empty STORE_BB. */
8961 gsi_to = gsi_start_bb (store_bb);
8962 if (dump_enabled_p ())
8963 {
8964 dump_printf_loc (MSG_NOTE, vect_location,
8965 "Move stmt to created bb\n");
8966 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, last, 0);
8967 }
8968 /* Move all stored value producers if possible. */
8969 while (!gsi_end_p (gsi))
8970 {
8971 tree lhs;
8972 imm_use_iterator imm_iter;
8973 use_operand_p use_p;
8974 bool res;
8976 /* Skip debug statements. */
8977 if (is_gimple_debug (gsi_stmt (gsi)))
8978 {
8979 gsi_prev (&gsi);
8980 continue;
8981 }
8982 stmt1 = gsi_stmt (gsi);
8983 /* Do not consider statements writing to memory or having
8984 a volatile operand. */
8985 if (gimple_vdef (stmt1)
8986 || gimple_has_volatile_ops (stmt1))
8987 break;
8988 gsi_from = gsi;
8989 gsi_prev (&gsi);
8990 lhs = gimple_get_lhs (stmt1);
8991 if (!lhs)
8992 break;
8994 /* LHS of vectorized stmt must be SSA_NAME. */
8995 if (TREE_CODE (lhs) != SSA_NAME)
8996 break;
8998 if (!VECTOR_TYPE_P (TREE_TYPE (lhs)))
8999 {
9000 /* Remove dead scalar statement. */
9001 if (has_zero_uses (lhs))
9002 {
9003 gsi_remove (&gsi_from, true);
9004 continue;
9005 }
9006 break;
9007 }
9008 /* Check that LHS does not have uses outside of STORE_BB. */
9009 res = true;
9010 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, lhs)
9011 {
9012 gimple *use_stmt;
9013 use_stmt = USE_STMT (use_p);
9014 if (is_gimple_debug (use_stmt))
9015 continue;
9016 if (gimple_bb (use_stmt) != store_bb)
9017 {
9018 res = false;
9019 break;
9020 }
9021 }
9022 if (!res)
9023 break;
9025 if (gimple_vuse (stmt1)
9026 && gimple_vuse (stmt1) != gimple_vuse (last_store))
9027 break;
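/* For instance, in the semi-hammock example in the comment before this
   function, the vector addition
   vect__12.22_172 = vect__11.19_170 + vect_cst__171 has a vector SSA lhs,
   no VDEF and no VUSE, and is used only by the MASK_STORE already sunk
   into STORE_BB, so it passes every check above and is moved as well.  */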
9029 /* Can move STMT1 to STORE_BB. */
9030 if (dump_enabled_p ())
9031 {
9032 dump_printf_loc (MSG_NOTE, vect_location,
9033 "Move stmt to created bb\n");
9034 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, stmt1, 0);
9035 }
9036 gsi_move_before (&gsi_from, &gsi_to);
9037 /* Shift GSI_TO for further insertion. */
9038 gsi_prev (&gsi_to);
9039 }
9040 /* Put other masked stores with the same mask to STORE_BB. */
9041 if (worklist.is_empty ()
9042 || gimple_call_arg (worklist.last (), 2) != mask
9043 || worklist.last () != stmt1)
9044 break;
9045 last = worklist.pop ();
9046 }
9047 add_phi_arg (phi, gimple_vuse (last_store), e, UNKNOWN_LOCATION);