gcc/tree-vect-loop.c
/* Loop Vectorization
   Copyright (C) 2003-2018 Free Software Foundation, Inc.
   Contributed by Dorit Naishlos <dorit@il.ibm.com> and
   Ira Rosen <irar@il.ibm.com>

This file is part of GCC.

GCC is free software; you can redistribute it and/or modify it under
the terms of the GNU General Public License as published by the Free
Software Foundation; either version 3, or (at your option) any later
version.

GCC is distributed in the hope that it will be useful, but WITHOUT ANY
WARRANTY; without even the implied warranty of MERCHANTABILITY or
FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
for more details.

You should have received a copy of the GNU General Public License
along with GCC; see the file COPYING3.  If not see
<http://www.gnu.org/licenses/>.  */
22 #include "config.h"
23 #include "system.h"
24 #include "coretypes.h"
25 #include "backend.h"
26 #include "target.h"
27 #include "rtl.h"
28 #include "tree.h"
29 #include "gimple.h"
30 #include "cfghooks.h"
31 #include "tree-pass.h"
32 #include "ssa.h"
33 #include "optabs-tree.h"
34 #include "diagnostic-core.h"
35 #include "fold-const.h"
36 #include "stor-layout.h"
37 #include "cfganal.h"
38 #include "gimplify.h"
39 #include "gimple-iterator.h"
40 #include "gimplify-me.h"
41 #include "tree-ssa-loop-ivopts.h"
42 #include "tree-ssa-loop-manip.h"
43 #include "tree-ssa-loop-niter.h"
44 #include "tree-ssa-loop.h"
45 #include "cfgloop.h"
46 #include "params.h"
47 #include "tree-scalar-evolution.h"
48 #include "tree-vectorizer.h"
49 #include "gimple-fold.h"
50 #include "cgraph.h"
51 #include "tree-cfg.h"
52 #include "tree-if-conv.h"
53 #include "internal-fn.h"
54 #include "tree-vector-builder.h"
55 #include "vec-perm-indices.h"
56 #include "tree-eh.h"
/* Loop Vectorization Pass.

   This pass tries to vectorize loops.

   For example, the vectorizer transforms the following simple loop:

        short a[N]; short b[N]; short c[N]; int i;

        for (i=0; i<N; i++){
          a[i] = b[i] + c[i];
        }

   as if it had been manually vectorized by rewriting the source code into:

        typedef int __attribute__((mode(V8HI))) v8hi;
        short a[N]; short b[N]; short c[N]; int i;
        v8hi *pa = (v8hi*)a, *pb = (v8hi*)b, *pc = (v8hi*)c;
        v8hi va, vb, vc;

        for (i=0; i<N/8; i++){
          vb = pb[i];
          vc = pc[i];
          va = vb + vc;
          pa[i] = va;
        }

   The main entry to this pass is vectorize_loops(), in which
   the vectorizer applies a set of analyses on a given set of loops,
   followed by the actual vectorization transformation for the loops that
   had successfully passed the analysis phase.
   Throughout this pass we make a distinction between two types of
   data: scalars (which are represented by SSA_NAMES), and memory references
   ("data-refs").  These two types of data require different handling both
   during analysis and transformation.  The types of data-refs that the
   vectorizer currently supports are ARRAY_REFS whose base is an array DECL
   (not a pointer), and INDIRECT_REFS through pointers; both array and pointer
   accesses are required to have a simple (consecutive) access pattern.

   Analysis phase:
   ===============
   The driver for the analysis phase is vect_analyze_loop().
   It applies a set of analyses, some of which rely on the scalar evolution
   analyzer (scev) developed by Sebastian Pop.

   During the analysis phase the vectorizer records some information
   per stmt in a "stmt_vec_info" struct which is attached to each stmt in the
   loop, as well as general information about the loop as a whole, which is
   recorded in a "loop_vec_info" struct attached to each loop.

   Transformation phase:
   =====================
   The loop transformation phase scans all the stmts in the loop, and
   creates a vector stmt (or a sequence of stmts) for each scalar stmt S in
   the loop that needs to be vectorized.  It inserts the vector code sequence
   just before the scalar stmt S, and records a pointer to the vector code
   in STMT_VINFO_VEC_STMT (stmt_info) (stmt_info is the stmt_vec_info struct
   attached to S).  This pointer will be used for the vectorization of following
   stmts which use the def of stmt S.  Stmt S is removed if it writes to memory;
   otherwise, we rely on dead code elimination for removing it.

   For example, say stmt S1 was vectorized into stmt VS1:

   VS1: vb = px[i];
   S1:  b = x[i];    STMT_VINFO_VEC_STMT (stmt_info (S1)) = VS1
   S2:  a = b;

   To vectorize stmt S2, the vectorizer first finds the stmt that defines
   the operand 'b' (S1), and gets the relevant vector def 'vb' from the
   vector stmt VS1 pointed to by STMT_VINFO_VEC_STMT (stmt_info (S1)).  The
   resulting sequence would be:

   VS1: vb = px[i];
   S1:  b = x[i];    STMT_VINFO_VEC_STMT (stmt_info (S1)) = VS1
   VS2: va = vb;
   S2:  a = b;       STMT_VINFO_VEC_STMT (stmt_info (S2)) = VS2

   Operands that are not SSA_NAMEs are data-refs that appear in
   load/store operations (like 'x[i]' in S1), and are handled differently.

   Target modeling:
   =================
   Currently the only target specific information that is used is the
   size of the vector (in bytes) - "TARGET_VECTORIZE_UNITS_PER_SIMD_WORD".
   Targets that can support different sizes of vectors will, for now, need
   to specify one value for "TARGET_VECTORIZE_UNITS_PER_SIMD_WORD".  More
   flexibility will be added in the future.

   Since we only vectorize operations whose vector form can be
   expressed using existing tree codes, to verify that an operation is
   supported, the vectorizer checks the relevant optab at the relevant
   machine_mode (e.g., optab_handler (add_optab, V8HImode)).  If
   the value found is CODE_FOR_nothing, then there's no target support, and
   we can't vectorize the stmt.

   For additional information on this project see:
   http://gcc.gnu.org/projects/tree-ssa/vectorization.html
*/
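/* For instance, the target-support check described above amounts to a query
   of the following shape (an illustrative sketch only, not the exact code
   the vectorizer uses):

     if (optab_handler (add_optab, V8HImode) == CODE_FOR_nothing)
       return false;

   i.e. a V8HI addition is only considered vectorizable if the target
   provides an instruction pattern for add_optab at V8HImode.  */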
156 static void vect_estimate_min_profitable_iters (loop_vec_info, int *, int *);
158 /* Subroutine of vect_determine_vf_for_stmt that handles only one
159 statement. VECTYPE_MAYBE_SET_P is true if STMT_VINFO_VECTYPE
160 may already be set for general statements (not just data refs). */
162 static bool
163 vect_determine_vf_for_stmt_1 (stmt_vec_info stmt_info,
164 bool vectype_maybe_set_p,
165 poly_uint64 *vf,
166 vec<stmt_vec_info > *mask_producers)
168 gimple *stmt = stmt_info->stmt;
170 if ((!STMT_VINFO_RELEVANT_P (stmt_info)
171 && !STMT_VINFO_LIVE_P (stmt_info))
172 || gimple_clobber_p (stmt))
174 if (dump_enabled_p ())
175 dump_printf_loc (MSG_NOTE, vect_location, "skip.\n");
176 return true;
179 tree stmt_vectype, nunits_vectype;
180 if (!vect_get_vector_types_for_stmt (stmt_info, &stmt_vectype,
181 &nunits_vectype))
182 return false;
184 if (stmt_vectype)
186 if (STMT_VINFO_VECTYPE (stmt_info))
      /* The only case in which a vectype has already been set is for stmts
	 that contain a data ref, or for "pattern-stmts" (stmts generated
	 by the vectorizer to represent/replace a certain idiom).  */
190 gcc_assert ((STMT_VINFO_DATA_REF (stmt_info)
191 || vectype_maybe_set_p)
192 && STMT_VINFO_VECTYPE (stmt_info) == stmt_vectype);
193 else if (stmt_vectype == boolean_type_node)
194 mask_producers->safe_push (stmt_info);
195 else
196 STMT_VINFO_VECTYPE (stmt_info) = stmt_vectype;
199 if (nunits_vectype)
200 vect_update_max_nunits (vf, nunits_vectype);
202 return true;
205 /* Subroutine of vect_determine_vectorization_factor. Set the vector
206 types of STMT_INFO and all attached pattern statements and update
207 the vectorization factor VF accordingly. If some of the statements
208 produce a mask result whose vector type can only be calculated later,
209 add them to MASK_PRODUCERS. Return true on success or false if
210 something prevented vectorization. */
212 static bool
213 vect_determine_vf_for_stmt (stmt_vec_info stmt_info, poly_uint64 *vf,
214 vec<stmt_vec_info > *mask_producers)
216 if (dump_enabled_p ())
218 dump_printf_loc (MSG_NOTE, vect_location, "==> examining statement: ");
219 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, stmt_info->stmt, 0);
221 if (!vect_determine_vf_for_stmt_1 (stmt_info, false, vf, mask_producers))
222 return false;
224 if (STMT_VINFO_IN_PATTERN_P (stmt_info)
225 && STMT_VINFO_RELATED_STMT (stmt_info))
227 stmt_info = vinfo_for_stmt (STMT_VINFO_RELATED_STMT (stmt_info));
229 /* If a pattern statement has def stmts, analyze them too. */
230 gimple *pattern_def_seq = STMT_VINFO_PATTERN_DEF_SEQ (stmt_info);
231 for (gimple_stmt_iterator si = gsi_start (pattern_def_seq);
232 !gsi_end_p (si); gsi_next (&si))
234 stmt_vec_info def_stmt_info = vinfo_for_stmt (gsi_stmt (si));
235 if (dump_enabled_p ())
237 dump_printf_loc (MSG_NOTE, vect_location,
238 "==> examining pattern def stmt: ");
239 dump_gimple_stmt (MSG_NOTE, TDF_SLIM,
240 def_stmt_info->stmt, 0);
242 if (!vect_determine_vf_for_stmt_1 (def_stmt_info, true,
243 vf, mask_producers))
244 return false;
247 if (dump_enabled_p ())
249 dump_printf_loc (MSG_NOTE, vect_location,
250 "==> examining pattern statement: ");
251 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, stmt_info->stmt, 0);
253 if (!vect_determine_vf_for_stmt_1 (stmt_info, true, vf, mask_producers))
254 return false;
257 return true;
/* Function vect_determine_vectorization_factor

   Determine the vectorization factor (VF).  VF is the number of data elements
   that are operated upon in parallel in a single iteration of the vectorized
   loop.  For example, when vectorizing a loop that operates on 4-byte elements,
   on a target with a vector size (VS) of 16 bytes, the VF is set to 4, since
   4 elements can fit in a single vector register.

   We currently support vectorization of loops in which all types operated upon
   are of the same size.  Therefore this function currently sets VF according to
   the size of the types operated upon, and fails if there are multiple sizes
   in the loop.

   VF is also the factor by which the loop iterations are strip-mined, e.g.:
   original loop:
        for (i=0; i<N; i++){
          a[i] = b[i] + c[i];
        }

   vectorized loop:
        for (i=0; i<N; i+=VF){
          a[i:VF] = b[i:VF] + c[i:VF];
        }
*/
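/* As a concrete instance of the strip-mining above (an illustrative sketch
   only, assuming 4-byte ints and a 16-byte vector size, so VF == 4), the
   vectorized loop corresponds to GNU C along the lines of:

     typedef int __attribute__ ((vector_size (16))) v4si;
     for (i = 0; i < N; i += 4)
       *(v4si *) &a[i] = *(v4si *) &b[i] + *(v4si *) &c[i];
*/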
285 static bool
286 vect_determine_vectorization_factor (loop_vec_info loop_vinfo)
288 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
289 basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
290 unsigned nbbs = loop->num_nodes;
291 poly_uint64 vectorization_factor = 1;
292 tree scalar_type = NULL_TREE;
293 gphi *phi;
294 tree vectype;
295 stmt_vec_info stmt_info;
296 unsigned i;
297 auto_vec<stmt_vec_info> mask_producers;
299 if (dump_enabled_p ())
300 dump_printf_loc (MSG_NOTE, vect_location,
301 "=== vect_determine_vectorization_factor ===\n");
303 for (i = 0; i < nbbs; i++)
305 basic_block bb = bbs[i];
307 for (gphi_iterator si = gsi_start_phis (bb); !gsi_end_p (si);
308 gsi_next (&si))
310 phi = si.phi ();
311 stmt_info = vinfo_for_stmt (phi);
312 if (dump_enabled_p ())
314 dump_printf_loc (MSG_NOTE, vect_location, "==> examining phi: ");
315 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, phi, 0);
318 gcc_assert (stmt_info);
320 if (STMT_VINFO_RELEVANT_P (stmt_info)
321 || STMT_VINFO_LIVE_P (stmt_info))
323 gcc_assert (!STMT_VINFO_VECTYPE (stmt_info));
324 scalar_type = TREE_TYPE (PHI_RESULT (phi));
326 if (dump_enabled_p ())
328 dump_printf_loc (MSG_NOTE, vect_location,
329 "get vectype for scalar type: ");
330 dump_generic_expr (MSG_NOTE, TDF_SLIM, scalar_type);
331 dump_printf (MSG_NOTE, "\n");
334 vectype = get_vectype_for_scalar_type (scalar_type);
335 if (!vectype)
337 if (dump_enabled_p ())
339 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
340 "not vectorized: unsupported "
341 "data-type ");
342 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
343 scalar_type);
344 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
346 return false;
348 STMT_VINFO_VECTYPE (stmt_info) = vectype;
350 if (dump_enabled_p ())
352 dump_printf_loc (MSG_NOTE, vect_location, "vectype: ");
353 dump_generic_expr (MSG_NOTE, TDF_SLIM, vectype);
354 dump_printf (MSG_NOTE, "\n");
357 if (dump_enabled_p ())
359 dump_printf_loc (MSG_NOTE, vect_location, "nunits = ");
360 dump_dec (MSG_NOTE, TYPE_VECTOR_SUBPARTS (vectype));
361 dump_printf (MSG_NOTE, "\n");
364 vect_update_max_nunits (&vectorization_factor, vectype);
368 for (gimple_stmt_iterator si = gsi_start_bb (bb); !gsi_end_p (si);
369 gsi_next (&si))
371 stmt_info = vinfo_for_stmt (gsi_stmt (si));
372 if (!vect_determine_vf_for_stmt (stmt_info, &vectorization_factor,
373 &mask_producers))
374 return false;
378 /* TODO: Analyze cost. Decide if worth while to vectorize. */
379 if (dump_enabled_p ())
381 dump_printf_loc (MSG_NOTE, vect_location, "vectorization factor = ");
382 dump_dec (MSG_NOTE, vectorization_factor);
383 dump_printf (MSG_NOTE, "\n");
386 if (known_le (vectorization_factor, 1U))
388 if (dump_enabled_p ())
389 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
390 "not vectorized: unsupported data-type\n");
391 return false;
393 LOOP_VINFO_VECT_FACTOR (loop_vinfo) = vectorization_factor;
395 for (i = 0; i < mask_producers.length (); i++)
397 stmt_info = mask_producers[i];
398 tree mask_type = vect_get_mask_type_for_stmt (stmt_info);
399 if (!mask_type)
400 return false;
401 STMT_VINFO_VECTYPE (stmt_info) = mask_type;
404 return true;
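/* Worked example for vect_determine_vectorization_factor above (assuming
   fixed-size 16-byte vectors): if every relevant stmt in the loop operates
   on 4-byte ints, each vectype is V4SI with 4 subparts and the resulting VF
   is 4.  If the loop also contains 2-byte short operations (vectype V8HI,
   8 subparts), vect_update_max_nunits raises the VF to their common
   multiple, 8.  */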
/* Function vect_is_simple_iv_evolution.

   FORNOW: A simple evolution of an induction variable in the loop is
   considered a polynomial evolution.  */
413 static bool
414 vect_is_simple_iv_evolution (unsigned loop_nb, tree access_fn, tree * init,
415 tree * step)
417 tree init_expr;
418 tree step_expr;
419 tree evolution_part = evolution_part_in_loop_num (access_fn, loop_nb);
420 basic_block bb;
422 /* When there is no evolution in this loop, the evolution function
423 is not "simple". */
424 if (evolution_part == NULL_TREE)
425 return false;
427 /* When the evolution is a polynomial of degree >= 2
428 the evolution function is not "simple". */
429 if (tree_is_chrec (evolution_part))
430 return false;
432 step_expr = evolution_part;
433 init_expr = unshare_expr (initial_condition_in_loop_num (access_fn, loop_nb));
435 if (dump_enabled_p ())
437 dump_printf_loc (MSG_NOTE, vect_location, "step: ");
438 dump_generic_expr (MSG_NOTE, TDF_SLIM, step_expr);
439 dump_printf (MSG_NOTE, ", init: ");
440 dump_generic_expr (MSG_NOTE, TDF_SLIM, init_expr);
441 dump_printf (MSG_NOTE, "\n");
444 *init = init_expr;
445 *step = step_expr;
447 if (TREE_CODE (step_expr) != INTEGER_CST
448 && (TREE_CODE (step_expr) != SSA_NAME
449 || ((bb = gimple_bb (SSA_NAME_DEF_STMT (step_expr)))
450 && flow_bb_inside_loop_p (get_loop (cfun, loop_nb), bb))
451 || (!INTEGRAL_TYPE_P (TREE_TYPE (step_expr))
452 && (!SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr))
453 || !flag_associative_math)))
454 && (TREE_CODE (step_expr) != REAL_CST
455 || !flag_associative_math))
457 if (dump_enabled_p ())
458 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
459 "step unknown.\n");
460 return false;
463 return true;
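/* Example of a "simple" evolution in the sense of vect_is_simple_iv_evolution
   (an illustrative sketch of the common case): for the C loop

     for (i = 0; i < n; i++)
       ...

   the access function of i is the chrec {0, +, 1}_loop, whose evolution
   part is the integer constant 1, so the function succeeds with
   *INIT == 0 and *STEP == 1.  */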
466 /* Function vect_analyze_scalar_cycles_1.
468 Examine the cross iteration def-use cycles of scalar variables
469 in LOOP. LOOP_VINFO represents the loop that is now being
470 considered for vectorization (can be LOOP, or an outer-loop
471 enclosing LOOP). */
473 static void
474 vect_analyze_scalar_cycles_1 (loop_vec_info loop_vinfo, struct loop *loop)
476 basic_block bb = loop->header;
477 tree init, step;
478 auto_vec<gimple *, 64> worklist;
479 gphi_iterator gsi;
480 bool double_reduc;
482 if (dump_enabled_p ())
483 dump_printf_loc (MSG_NOTE, vect_location,
484 "=== vect_analyze_scalar_cycles ===\n");
486 /* First - identify all inductions. Reduction detection assumes that all the
487 inductions have been identified, therefore, this order must not be
488 changed. */
489 for (gsi = gsi_start_phis (bb); !gsi_end_p (gsi); gsi_next (&gsi))
491 gphi *phi = gsi.phi ();
492 tree access_fn = NULL;
493 tree def = PHI_RESULT (phi);
494 stmt_vec_info stmt_vinfo = vinfo_for_stmt (phi);
496 if (dump_enabled_p ())
498 dump_printf_loc (MSG_NOTE, vect_location, "Analyze phi: ");
499 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, phi, 0);
502 /* Skip virtual phi's. The data dependences that are associated with
503 virtual defs/uses (i.e., memory accesses) are analyzed elsewhere. */
504 if (virtual_operand_p (def))
505 continue;
507 STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_unknown_def_type;
509 /* Analyze the evolution function. */
510 access_fn = analyze_scalar_evolution (loop, def);
511 if (access_fn)
513 STRIP_NOPS (access_fn);
514 if (dump_enabled_p ())
516 dump_printf_loc (MSG_NOTE, vect_location,
517 "Access function of PHI: ");
518 dump_generic_expr (MSG_NOTE, TDF_SLIM, access_fn);
519 dump_printf (MSG_NOTE, "\n");
521 STMT_VINFO_LOOP_PHI_EVOLUTION_BASE_UNCHANGED (stmt_vinfo)
522 = initial_condition_in_loop_num (access_fn, loop->num);
523 STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_vinfo)
524 = evolution_part_in_loop_num (access_fn, loop->num);
527 if (!access_fn
528 || !vect_is_simple_iv_evolution (loop->num, access_fn, &init, &step)
529 || (LOOP_VINFO_LOOP (loop_vinfo) != loop
530 && TREE_CODE (step) != INTEGER_CST))
532 worklist.safe_push (phi);
533 continue;
536 gcc_assert (STMT_VINFO_LOOP_PHI_EVOLUTION_BASE_UNCHANGED (stmt_vinfo)
537 != NULL_TREE);
538 gcc_assert (STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_vinfo) != NULL_TREE);
540 if (dump_enabled_p ())
541 dump_printf_loc (MSG_NOTE, vect_location, "Detected induction.\n");
542 STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_induction_def;
546 /* Second - identify all reductions and nested cycles. */
547 while (worklist.length () > 0)
549 gimple *phi = worklist.pop ();
550 tree def = PHI_RESULT (phi);
551 stmt_vec_info stmt_vinfo = vinfo_for_stmt (phi);
552 gimple *reduc_stmt;
554 if (dump_enabled_p ())
556 dump_printf_loc (MSG_NOTE, vect_location, "Analyze phi: ");
557 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, phi, 0);
560 gcc_assert (!virtual_operand_p (def)
561 && STMT_VINFO_DEF_TYPE (stmt_vinfo) == vect_unknown_def_type);
563 reduc_stmt = vect_force_simple_reduction (loop_vinfo, phi,
564 &double_reduc, false);
565 if (reduc_stmt)
567 if (double_reduc)
569 if (dump_enabled_p ())
570 dump_printf_loc (MSG_NOTE, vect_location,
571 "Detected double reduction.\n");
573 STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_double_reduction_def;
574 STMT_VINFO_DEF_TYPE (vinfo_for_stmt (reduc_stmt)) =
575 vect_double_reduction_def;
577 else
579 if (loop != LOOP_VINFO_LOOP (loop_vinfo))
581 if (dump_enabled_p ())
582 dump_printf_loc (MSG_NOTE, vect_location,
583 "Detected vectorizable nested cycle.\n");
585 STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_nested_cycle;
586 STMT_VINFO_DEF_TYPE (vinfo_for_stmt (reduc_stmt)) =
587 vect_nested_cycle;
589 else
591 if (dump_enabled_p ())
592 dump_printf_loc (MSG_NOTE, vect_location,
593 "Detected reduction.\n");
595 STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_reduction_def;
596 STMT_VINFO_DEF_TYPE (vinfo_for_stmt (reduc_stmt)) =
597 vect_reduction_def;
	      /* Store the reduction cycles for possible vectorization in
		 loop-aware SLP if it was not detected as a reduction
		 chain.  */
601 if (! GROUP_FIRST_ELEMENT (vinfo_for_stmt (reduc_stmt)))
602 LOOP_VINFO_REDUCTIONS (loop_vinfo).safe_push (reduc_stmt);
606 else
607 if (dump_enabled_p ())
608 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
609 "Unknown def-use cycle pattern.\n");
614 /* Function vect_analyze_scalar_cycles.
616 Examine the cross iteration def-use cycles of scalar variables, by
617 analyzing the loop-header PHIs of scalar variables. Classify each
618 cycle as one of the following: invariant, induction, reduction, unknown.
   We do that for the loop represented by LOOP_VINFO, and also for its
   inner loop, if it exists.
621 Examples for scalar cycles:
623 Example1: reduction:
625 loop1:
626 for (i=0; i<N; i++)
627 sum += a[i];
629 Example2: induction:
631 loop2:
632 for (i=0; i<N; i++)
633 a[i] = i; */
635 static void
636 vect_analyze_scalar_cycles (loop_vec_info loop_vinfo)
638 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
640 vect_analyze_scalar_cycles_1 (loop_vinfo, loop);
642 /* When vectorizing an outer-loop, the inner-loop is executed sequentially.
643 Reductions in such inner-loop therefore have different properties than
644 the reductions in the nest that gets vectorized:
645 1. When vectorized, they are executed in the same order as in the original
646 scalar loop, so we can't change the order of computation when
647 vectorizing them.
648 2. FIXME: Inner-loop reductions can be used in the inner-loop, so the
649 current checks are too strict. */
651 if (loop->inner)
652 vect_analyze_scalar_cycles_1 (loop_vinfo, loop->inner);
655 /* Transfer group and reduction information from STMT to its pattern stmt. */
657 static void
658 vect_fixup_reduc_chain (gimple *stmt)
660 gimple *firstp = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (stmt));
661 gimple *stmtp;
662 gcc_assert (!GROUP_FIRST_ELEMENT (vinfo_for_stmt (firstp))
663 && GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)));
664 GROUP_SIZE (vinfo_for_stmt (firstp)) = GROUP_SIZE (vinfo_for_stmt (stmt));
667 stmtp = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (stmt));
668 GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmtp)) = firstp;
669 stmt = GROUP_NEXT_ELEMENT (vinfo_for_stmt (stmt));
670 if (stmt)
671 GROUP_NEXT_ELEMENT (vinfo_for_stmt (stmtp))
672 = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (stmt));
674 while (stmt);
675 STMT_VINFO_DEF_TYPE (vinfo_for_stmt (stmtp)) = vect_reduction_def;
678 /* Fixup scalar cycles that now have their stmts detected as patterns. */
680 static void
681 vect_fixup_scalar_cycles_with_patterns (loop_vec_info loop_vinfo)
683 gimple *first;
684 unsigned i;
686 FOR_EACH_VEC_ELT (LOOP_VINFO_REDUCTION_CHAINS (loop_vinfo), i, first)
687 if (STMT_VINFO_IN_PATTERN_P (vinfo_for_stmt (first)))
689 gimple *next = GROUP_NEXT_ELEMENT (vinfo_for_stmt (first));
690 while (next)
692 if (! STMT_VINFO_IN_PATTERN_P (vinfo_for_stmt (next)))
693 break;
694 next = GROUP_NEXT_ELEMENT (vinfo_for_stmt (next));
	/* If not all stmts in the chain are patterns, try to handle
	   the chain without patterns.  */
698 if (! next)
700 vect_fixup_reduc_chain (first);
701 LOOP_VINFO_REDUCTION_CHAINS (loop_vinfo)[i]
702 = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (first));
707 /* Function vect_get_loop_niters.
709 Determine how many iterations the loop is executed and place it
710 in NUMBER_OF_ITERATIONS. Place the number of latch iterations
711 in NUMBER_OF_ITERATIONSM1. Place the condition under which the
712 niter information holds in ASSUMPTIONS.
714 Return the loop exit condition. */
717 static gcond *
718 vect_get_loop_niters (struct loop *loop, tree *assumptions,
719 tree *number_of_iterations, tree *number_of_iterationsm1)
721 edge exit = single_exit (loop);
722 struct tree_niter_desc niter_desc;
723 tree niter_assumptions, niter, may_be_zero;
724 gcond *cond = get_loop_exit_condition (loop);
726 *assumptions = boolean_true_node;
727 *number_of_iterationsm1 = chrec_dont_know;
728 *number_of_iterations = chrec_dont_know;
729 if (dump_enabled_p ())
730 dump_printf_loc (MSG_NOTE, vect_location,
731 "=== get_loop_niters ===\n");
733 if (!exit)
734 return cond;
736 niter = chrec_dont_know;
737 may_be_zero = NULL_TREE;
738 niter_assumptions = boolean_true_node;
739 if (!number_of_iterations_exit_assumptions (loop, exit, &niter_desc, NULL)
740 || chrec_contains_undetermined (niter_desc.niter))
741 return cond;
743 niter_assumptions = niter_desc.assumptions;
744 may_be_zero = niter_desc.may_be_zero;
745 niter = niter_desc.niter;
747 if (may_be_zero && integer_zerop (may_be_zero))
748 may_be_zero = NULL_TREE;
750 if (may_be_zero)
752 if (COMPARISON_CLASS_P (may_be_zero))
754 /* Try to combine may_be_zero with assumptions, this can simplify
755 computation of niter expression. */
756 if (niter_assumptions && !integer_nonzerop (niter_assumptions))
757 niter_assumptions = fold_build2 (TRUTH_AND_EXPR, boolean_type_node,
758 niter_assumptions,
759 fold_build1 (TRUTH_NOT_EXPR,
760 boolean_type_node,
761 may_be_zero));
762 else
763 niter = fold_build3 (COND_EXPR, TREE_TYPE (niter), may_be_zero,
764 build_int_cst (TREE_TYPE (niter), 0),
765 rewrite_to_non_trapping_overflow (niter));
767 may_be_zero = NULL_TREE;
769 else if (integer_nonzerop (may_be_zero))
771 *number_of_iterationsm1 = build_int_cst (TREE_TYPE (niter), 0);
772 *number_of_iterations = build_int_cst (TREE_TYPE (niter), 1);
773 return cond;
775 else
776 return cond;
779 *assumptions = niter_assumptions;
780 *number_of_iterationsm1 = niter;
782 /* We want the number of loop header executions which is the number
783 of latch executions plus one.
784 ??? For UINT_MAX latch executions this number overflows to zero
785 for loops like do { n++; } while (n != 0); */
786 if (niter && !chrec_contains_undetermined (niter))
787 niter = fold_build2 (PLUS_EXPR, TREE_TYPE (niter), unshare_expr (niter),
788 build_int_cst (TREE_TYPE (niter), 1));
789 *number_of_iterations = niter;
791 return cond;
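/* Illustrative example of the two counts computed by vect_get_loop_niters:
   for a loop whose body executes n times (n > 0), the latch is executed
   n - 1 times, so NUMBER_OF_ITERATIONSM1 is n - 1 while NUMBER_OF_ITERATIONS,
   the number of loop header executions, is n.  */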
794 /* Function bb_in_loop_p
796 Used as predicate for dfs order traversal of the loop bbs. */
798 static bool
799 bb_in_loop_p (const_basic_block bb, const void *data)
801 const struct loop *const loop = (const struct loop *)data;
802 if (flow_bb_inside_loop_p (loop, bb))
803 return true;
804 return false;
808 /* Create and initialize a new loop_vec_info struct for LOOP_IN, as well as
809 stmt_vec_info structs for all the stmts in LOOP_IN. */
811 _loop_vec_info::_loop_vec_info (struct loop *loop_in)
812 : vec_info (vec_info::loop, init_cost (loop_in)),
813 loop (loop_in),
814 bbs (XCNEWVEC (basic_block, loop->num_nodes)),
815 num_itersm1 (NULL_TREE),
816 num_iters (NULL_TREE),
817 num_iters_unchanged (NULL_TREE),
818 num_iters_assumptions (NULL_TREE),
819 th (0),
820 versioning_threshold (0),
821 vectorization_factor (0),
822 max_vectorization_factor (0),
823 mask_skip_niters (NULL_TREE),
824 mask_compare_type (NULL_TREE),
825 unaligned_dr (NULL),
826 peeling_for_alignment (0),
827 ptr_mask (0),
828 ivexpr_map (NULL),
829 slp_unrolling_factor (1),
830 single_scalar_iteration_cost (0),
831 vectorizable (false),
832 can_fully_mask_p (true),
833 fully_masked_p (false),
834 peeling_for_gaps (false),
835 peeling_for_niter (false),
836 operands_swapped (false),
837 no_data_dependencies (false),
838 has_mask_store (false),
839 scalar_loop (NULL),
840 orig_loop_info (NULL)
842 /* Create/Update stmt_info for all stmts in the loop. */
843 basic_block *body = get_loop_body (loop);
844 for (unsigned int i = 0; i < loop->num_nodes; i++)
846 basic_block bb = body[i];
847 gimple_stmt_iterator si;
849 for (si = gsi_start_phis (bb); !gsi_end_p (si); gsi_next (&si))
851 gimple *phi = gsi_stmt (si);
852 gimple_set_uid (phi, 0);
853 set_vinfo_for_stmt (phi, new_stmt_vec_info (phi, this));
856 for (si = gsi_start_bb (bb); !gsi_end_p (si); gsi_next (&si))
858 gimple *stmt = gsi_stmt (si);
859 gimple_set_uid (stmt, 0);
860 set_vinfo_for_stmt (stmt, new_stmt_vec_info (stmt, this));
863 free (body);
865 /* CHECKME: We want to visit all BBs before their successors (except for
866 latch blocks, for which this assertion wouldn't hold). In the simple
     case of the loop forms we allow, a dfs order of the BBs would be the same
868 as reversed postorder traversal, so we are safe. */
870 unsigned int nbbs = dfs_enumerate_from (loop->header, 0, bb_in_loop_p,
871 bbs, loop->num_nodes, loop);
872 gcc_assert (nbbs == loop->num_nodes);
875 /* Free all levels of MASKS. */
877 void
878 release_vec_loop_masks (vec_loop_masks *masks)
880 rgroup_masks *rgm;
881 unsigned int i;
882 FOR_EACH_VEC_ELT (*masks, i, rgm)
883 rgm->masks.release ();
884 masks->release ();
887 /* Free all memory used by the _loop_vec_info, as well as all the
888 stmt_vec_info structs of all the stmts in the loop. */
890 _loop_vec_info::~_loop_vec_info ()
892 int nbbs;
893 gimple_stmt_iterator si;
894 int j;
896 nbbs = loop->num_nodes;
897 for (j = 0; j < nbbs; j++)
899 basic_block bb = bbs[j];
900 for (si = gsi_start_phis (bb); !gsi_end_p (si); gsi_next (&si))
901 free_stmt_vec_info (gsi_stmt (si));
903 for (si = gsi_start_bb (bb); !gsi_end_p (si); )
905 gimple *stmt = gsi_stmt (si);
907 /* We may have broken canonical form by moving a constant
908 into RHS1 of a commutative op. Fix such occurrences. */
909 if (operands_swapped && is_gimple_assign (stmt))
911 enum tree_code code = gimple_assign_rhs_code (stmt);
913 if ((code == PLUS_EXPR
914 || code == POINTER_PLUS_EXPR
915 || code == MULT_EXPR)
916 && CONSTANT_CLASS_P (gimple_assign_rhs1 (stmt)))
917 swap_ssa_operands (stmt,
918 gimple_assign_rhs1_ptr (stmt),
919 gimple_assign_rhs2_ptr (stmt));
920 else if (code == COND_EXPR
921 && CONSTANT_CLASS_P (gimple_assign_rhs2 (stmt)))
923 tree cond_expr = gimple_assign_rhs1 (stmt);
924 enum tree_code cond_code = TREE_CODE (cond_expr);
926 if (TREE_CODE_CLASS (cond_code) == tcc_comparison)
928 bool honor_nans = HONOR_NANS (TREE_OPERAND (cond_expr,
929 0));
930 cond_code = invert_tree_comparison (cond_code,
931 honor_nans);
932 if (cond_code != ERROR_MARK)
934 TREE_SET_CODE (cond_expr, cond_code);
935 swap_ssa_operands (stmt,
936 gimple_assign_rhs2_ptr (stmt),
937 gimple_assign_rhs3_ptr (stmt));
943 /* Free stmt_vec_info. */
944 free_stmt_vec_info (stmt);
945 gsi_next (&si);
949 free (bbs);
951 release_vec_loop_masks (&masks);
952 delete ivexpr_map;
954 loop->aux = NULL;
957 /* Return an invariant or register for EXPR and emit necessary
958 computations in the LOOP_VINFO loop preheader. */
960 tree
961 cse_and_gimplify_to_preheader (loop_vec_info loop_vinfo, tree expr)
963 if (is_gimple_reg (expr)
964 || is_gimple_min_invariant (expr))
965 return expr;
967 if (! loop_vinfo->ivexpr_map)
968 loop_vinfo->ivexpr_map = new hash_map<tree_operand_hash, tree>;
969 tree &cached = loop_vinfo->ivexpr_map->get_or_insert (expr);
970 if (! cached)
972 gimple_seq stmts = NULL;
973 cached = force_gimple_operand (unshare_expr (expr),
974 &stmts, true, NULL_TREE);
975 if (stmts)
977 edge e = loop_preheader_edge (LOOP_VINFO_LOOP (loop_vinfo));
978 gsi_insert_seq_on_edge_immediate (e, stmts);
981 return cached;
984 /* Return true if we can use CMP_TYPE as the comparison type to produce
985 all masks required to mask LOOP_VINFO. */
987 static bool
988 can_produce_all_loop_masks_p (loop_vec_info loop_vinfo, tree cmp_type)
990 rgroup_masks *rgm;
991 unsigned int i;
992 FOR_EACH_VEC_ELT (LOOP_VINFO_MASKS (loop_vinfo), i, rgm)
993 if (rgm->mask_type != NULL_TREE
994 && !direct_internal_fn_supported_p (IFN_WHILE_ULT,
995 cmp_type, rgm->mask_type,
996 OPTIMIZE_FOR_SPEED))
997 return false;
998 return true;
1001 /* Calculate the maximum number of scalars per iteration for every
1002 rgroup in LOOP_VINFO. */
1004 static unsigned int
1005 vect_get_max_nscalars_per_iter (loop_vec_info loop_vinfo)
1007 unsigned int res = 1;
1008 unsigned int i;
1009 rgroup_masks *rgm;
1010 FOR_EACH_VEC_ELT (LOOP_VINFO_MASKS (loop_vinfo), i, rgm)
1011 res = MAX (res, rgm->max_nscalars_per_iter);
1012 return res;
1015 /* Each statement in LOOP_VINFO can be masked where necessary. Check
1016 whether we can actually generate the masks required. Return true if so,
1017 storing the type of the scalar IV in LOOP_VINFO_MASK_COMPARE_TYPE. */
1019 static bool
1020 vect_verify_full_masking (loop_vec_info loop_vinfo)
1022 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1023 unsigned int min_ni_width;
1025 /* Use a normal loop if there are no statements that need masking.
1026 This only happens in rare degenerate cases: it means that the loop
1027 has no loads, no stores, and no live-out values. */
1028 if (LOOP_VINFO_MASKS (loop_vinfo).is_empty ())
1029 return false;
1031 /* Get the maximum number of iterations that is representable
1032 in the counter type. */
1033 tree ni_type = TREE_TYPE (LOOP_VINFO_NITERSM1 (loop_vinfo));
1034 widest_int max_ni = wi::to_widest (TYPE_MAX_VALUE (ni_type)) + 1;
1036 /* Get a more refined estimate for the number of iterations. */
1037 widest_int max_back_edges;
1038 if (max_loop_iterations (loop, &max_back_edges))
1039 max_ni = wi::smin (max_ni, max_back_edges + 1);
1041 /* Account for rgroup masks, in which each bit is replicated N times. */
1042 max_ni *= vect_get_max_nscalars_per_iter (loop_vinfo);
1044 /* Work out how many bits we need to represent the limit. */
1045 min_ni_width = wi::min_precision (max_ni, UNSIGNED);
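  /* Worked example (illustrative numbers only): if the niter analysis
     bounds the loop at 1000 latch executions, max_ni is at most 1001
     header iterations; with at most 2 scalars per iteration across the
     rgroups this becomes 2002, and wi::min_precision (2002, UNSIGNED)
     is 11, so any comparison type of at least 11 bits is wide enough.  */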
1047 /* Find a scalar mode for which WHILE_ULT is supported. */
1048 opt_scalar_int_mode cmp_mode_iter;
1049 tree cmp_type = NULL_TREE;
1050 FOR_EACH_MODE_IN_CLASS (cmp_mode_iter, MODE_INT)
1052 unsigned int cmp_bits = GET_MODE_BITSIZE (cmp_mode_iter.require ());
1053 if (cmp_bits >= min_ni_width
1054 && targetm.scalar_mode_supported_p (cmp_mode_iter.require ()))
1056 tree this_type = build_nonstandard_integer_type (cmp_bits, true);
1057 if (this_type
1058 && can_produce_all_loop_masks_p (loop_vinfo, this_type))
1060 /* Although we could stop as soon as we find a valid mode,
1061 it's often better to continue until we hit Pmode, since the
1062 operands to the WHILE are more likely to be reusable in
1063 address calculations. */
1064 cmp_type = this_type;
1065 if (cmp_bits >= GET_MODE_BITSIZE (Pmode))
1066 break;
1071 if (!cmp_type)
1072 return false;
1074 LOOP_VINFO_MASK_COMPARE_TYPE (loop_vinfo) = cmp_type;
1075 return true;
1078 /* Calculate the cost of one scalar iteration of the loop. */
1079 static void
1080 vect_compute_single_scalar_iteration_cost (loop_vec_info loop_vinfo)
1082 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1083 basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
1084 int nbbs = loop->num_nodes, factor;
1085 int innerloop_iters, i;
1087 /* Gather costs for statements in the scalar loop. */
1089 /* FORNOW. */
1090 innerloop_iters = 1;
1091 if (loop->inner)
1092 innerloop_iters = 50; /* FIXME */
1094 for (i = 0; i < nbbs; i++)
1096 gimple_stmt_iterator si;
1097 basic_block bb = bbs[i];
1099 if (bb->loop_father == loop->inner)
1100 factor = innerloop_iters;
1101 else
1102 factor = 1;
1104 for (si = gsi_start_bb (bb); !gsi_end_p (si); gsi_next (&si))
1106 gimple *stmt = gsi_stmt (si);
1107 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
1109 if (!is_gimple_assign (stmt) && !is_gimple_call (stmt))
1110 continue;
1112 /* Skip stmts that are not vectorized inside the loop. */
1113 if (stmt_info
1114 && !STMT_VINFO_RELEVANT_P (stmt_info)
1115 && (!STMT_VINFO_LIVE_P (stmt_info)
1116 || !VECTORIZABLE_CYCLE_DEF (STMT_VINFO_DEF_TYPE (stmt_info)))
1117 && !STMT_VINFO_IN_PATTERN_P (stmt_info))
1118 continue;
1120 vect_cost_for_stmt kind;
1121 if (STMT_VINFO_DATA_REF (stmt_info))
1123 if (DR_IS_READ (STMT_VINFO_DATA_REF (stmt_info)))
1124 kind = scalar_load;
1125 else
1126 kind = scalar_store;
1128 else
1129 kind = scalar_stmt;
1131 record_stmt_cost (&LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo),
1132 factor, kind, stmt_info, 0, vect_prologue);
1136 /* Now accumulate cost. */
1137 void *target_cost_data = init_cost (loop);
1138 stmt_info_for_cost *si;
1139 int j;
1140 FOR_EACH_VEC_ELT (LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo),
1141 j, si)
1143 struct _stmt_vec_info *stmt_info
1144 = si->stmt ? vinfo_for_stmt (si->stmt) : NULL;
1145 (void) add_stmt_cost (target_cost_data, si->count,
1146 si->kind, stmt_info, si->misalign,
1147 vect_body);
1149 unsigned dummy, body_cost = 0;
1150 finish_cost (target_cost_data, &dummy, &body_cost, &dummy);
1151 destroy_cost_data (target_cost_data);
1152 LOOP_VINFO_SINGLE_SCALAR_ITERATION_COST (loop_vinfo) = body_cost;
1156 /* Function vect_analyze_loop_form_1.
1158 Verify that certain CFG restrictions hold, including:
1159 - the loop has a pre-header
1160 - the loop has a single entry and exit
1161 - the loop exit condition is simple enough
   - the number of iterations can be analyzed, i.e., a countable loop.  The
     niter could be analyzed under some assumptions.  */
1165 bool
1166 vect_analyze_loop_form_1 (struct loop *loop, gcond **loop_cond,
1167 tree *assumptions, tree *number_of_iterationsm1,
1168 tree *number_of_iterations, gcond **inner_loop_cond)
1170 if (dump_enabled_p ())
1171 dump_printf_loc (MSG_NOTE, vect_location,
1172 "=== vect_analyze_loop_form ===\n");
1174 /* Different restrictions apply when we are considering an inner-most loop,
1175 vs. an outer (nested) loop.
1176 (FORNOW. May want to relax some of these restrictions in the future). */
1178 if (!loop->inner)
1180 /* Inner-most loop. We currently require that the number of BBs is
1181 exactly 2 (the header and latch). Vectorizable inner-most loops
1182 look like this:
1184 (pre-header)
1186 header <--------+
1187 | | |
1188 | +--> latch --+
1190 (exit-bb) */
1192 if (loop->num_nodes != 2)
1194 if (dump_enabled_p ())
1195 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1196 "not vectorized: control flow in loop.\n");
1197 return false;
1200 if (empty_block_p (loop->header))
1202 if (dump_enabled_p ())
1203 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1204 "not vectorized: empty loop.\n");
1205 return false;
1208 else
1210 struct loop *innerloop = loop->inner;
1211 edge entryedge;
1213 /* Nested loop. We currently require that the loop is doubly-nested,
1214 contains a single inner loop, and the number of BBs is exactly 5.
1215 Vectorizable outer-loops look like this:
1217 (pre-header)
1219 header <---+
1221 inner-loop |
1223 tail ------+
1225 (exit-bb)
1227 The inner-loop has the properties expected of inner-most loops
1228 as described above. */
1230 if ((loop->inner)->inner || (loop->inner)->next)
1232 if (dump_enabled_p ())
1233 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1234 "not vectorized: multiple nested loops.\n");
1235 return false;
1238 if (loop->num_nodes != 5)
1240 if (dump_enabled_p ())
1241 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1242 "not vectorized: control flow in loop.\n");
1243 return false;
1246 entryedge = loop_preheader_edge (innerloop);
1247 if (entryedge->src != loop->header
1248 || !single_exit (innerloop)
1249 || single_exit (innerloop)->dest != EDGE_PRED (loop->latch, 0)->src)
1251 if (dump_enabled_p ())
1252 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1253 "not vectorized: unsupported outerloop form.\n");
1254 return false;
1257 /* Analyze the inner-loop. */
1258 tree inner_niterm1, inner_niter, inner_assumptions;
1259 if (! vect_analyze_loop_form_1 (loop->inner, inner_loop_cond,
1260 &inner_assumptions, &inner_niterm1,
1261 &inner_niter, NULL)
1262 /* Don't support analyzing niter under assumptions for inner
1263 loop. */
1264 || !integer_onep (inner_assumptions))
1266 if (dump_enabled_p ())
1267 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1268 "not vectorized: Bad inner loop.\n");
1269 return false;
1272 if (!expr_invariant_in_loop_p (loop, inner_niter))
1274 if (dump_enabled_p ())
1275 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1276 "not vectorized: inner-loop count not"
1277 " invariant.\n");
1278 return false;
1281 if (dump_enabled_p ())
1282 dump_printf_loc (MSG_NOTE, vect_location,
1283 "Considering outer-loop vectorization.\n");
1286 if (!single_exit (loop)
1287 || EDGE_COUNT (loop->header->preds) != 2)
1289 if (dump_enabled_p ())
1291 if (!single_exit (loop))
1292 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1293 "not vectorized: multiple exits.\n");
1294 else if (EDGE_COUNT (loop->header->preds) != 2)
1295 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1296 "not vectorized: too many incoming edges.\n");
1298 return false;
  /* We assume that the loop exit condition is at the end of the loop, i.e.,
1302 that the loop is represented as a do-while (with a proper if-guard
1303 before the loop if needed), where the loop header contains all the
1304 executable statements, and the latch is empty. */
1305 if (!empty_block_p (loop->latch)
1306 || !gimple_seq_empty_p (phi_nodes (loop->latch)))
1308 if (dump_enabled_p ())
1309 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1310 "not vectorized: latch block not empty.\n");
1311 return false;
1314 /* Make sure the exit is not abnormal. */
1315 edge e = single_exit (loop);
1316 if (e->flags & EDGE_ABNORMAL)
1318 if (dump_enabled_p ())
1319 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1320 "not vectorized: abnormal loop exit edge.\n");
1321 return false;
1324 *loop_cond = vect_get_loop_niters (loop, assumptions, number_of_iterations,
1325 number_of_iterationsm1);
1326 if (!*loop_cond)
1328 if (dump_enabled_p ())
1329 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1330 "not vectorized: complicated exit condition.\n");
1331 return false;
1334 if (integer_zerop (*assumptions)
1335 || !*number_of_iterations
1336 || chrec_contains_undetermined (*number_of_iterations))
1338 if (dump_enabled_p ())
1339 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1340 "not vectorized: number of iterations cannot be "
1341 "computed.\n");
1342 return false;
1345 if (integer_zerop (*number_of_iterations))
1347 if (dump_enabled_p ())
1348 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1349 "not vectorized: number of iterations = 0.\n");
1350 return false;
1353 return true;
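/* Illustrative examples of the restrictions checked by
   vect_analyze_loop_form_1: a plain

     for (i = 0; i < n; i++)
       a[i] = b[i];

   loop has exactly two basic blocks (header and latch) and a single exit,
   so it passes.  A conditional store inside the body introduces extra
   basic blocks, and the loop is then rejected with "control flow in loop"
   unless if-conversion has already flattened it.  */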
1356 /* Analyze LOOP form and return a loop_vec_info if it is of suitable form. */
1358 loop_vec_info
1359 vect_analyze_loop_form (struct loop *loop)
1361 tree assumptions, number_of_iterations, number_of_iterationsm1;
1362 gcond *loop_cond, *inner_loop_cond = NULL;
1364 if (! vect_analyze_loop_form_1 (loop, &loop_cond,
1365 &assumptions, &number_of_iterationsm1,
1366 &number_of_iterations, &inner_loop_cond))
1367 return NULL;
1369 loop_vec_info loop_vinfo = new _loop_vec_info (loop);
1370 LOOP_VINFO_NITERSM1 (loop_vinfo) = number_of_iterationsm1;
1371 LOOP_VINFO_NITERS (loop_vinfo) = number_of_iterations;
1372 LOOP_VINFO_NITERS_UNCHANGED (loop_vinfo) = number_of_iterations;
1373 if (!integer_onep (assumptions))
      /* We consider vectorizing this loop by versioning it under
	 some assumptions.  In order to do this, we need to clear
	 existing information computed by scev and the niter analyzer.  */
      scev_reset_htab ();
      free_numbers_of_iterations_estimates (loop);
      /* Also set a flag for this loop so that the following scev and niter
	 analyses are done under the assumptions.  */
      loop_constraint_set (loop, LOOP_C_FINITE);
      /* Also record the assumptions for versioning.  */
      LOOP_VINFO_NITERS_ASSUMPTIONS (loop_vinfo) = assumptions;
1387 if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
1389 if (dump_enabled_p ())
1391 dump_printf_loc (MSG_NOTE, vect_location,
1392 "Symbolic number of iterations is ");
1393 dump_generic_expr (MSG_NOTE, TDF_DETAILS, number_of_iterations);
1394 dump_printf (MSG_NOTE, "\n");
1398 STMT_VINFO_TYPE (vinfo_for_stmt (loop_cond)) = loop_exit_ctrl_vec_info_type;
1399 if (inner_loop_cond)
1400 STMT_VINFO_TYPE (vinfo_for_stmt (inner_loop_cond))
1401 = loop_exit_ctrl_vec_info_type;
1403 gcc_assert (!loop->aux);
1404 loop->aux = loop_vinfo;
1405 return loop_vinfo;
/* Scan the loop stmts and, depending on whether there are any (non-)SLP
   statements, update the vectorization factor.  */
1413 static void
1414 vect_update_vf_for_slp (loop_vec_info loop_vinfo)
1416 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1417 basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
1418 int nbbs = loop->num_nodes;
1419 poly_uint64 vectorization_factor;
1420 int i;
1422 if (dump_enabled_p ())
1423 dump_printf_loc (MSG_NOTE, vect_location,
1424 "=== vect_update_vf_for_slp ===\n");
1426 vectorization_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
1427 gcc_assert (known_ne (vectorization_factor, 0U));
  /* If all the stmts in the loop can be SLPed, we perform only SLP, and the
     vectorization factor of the loop is the unrolling factor required by
     the SLP instances.  If that unrolling factor is 1, we say that we
     perform pure SLP on the loop - cross-iteration parallelism is not
     exploited.  */
1434 bool only_slp_in_loop = true;
1435 for (i = 0; i < nbbs; i++)
1437 basic_block bb = bbs[i];
1438 for (gimple_stmt_iterator si = gsi_start_bb (bb); !gsi_end_p (si);
1439 gsi_next (&si))
1441 gimple *stmt = gsi_stmt (si);
1442 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
1443 if (STMT_VINFO_IN_PATTERN_P (stmt_info)
1444 && STMT_VINFO_RELATED_STMT (stmt_info))
1446 stmt = STMT_VINFO_RELATED_STMT (stmt_info);
1447 stmt_info = vinfo_for_stmt (stmt);
1449 if ((STMT_VINFO_RELEVANT_P (stmt_info)
1450 || VECTORIZABLE_CYCLE_DEF (STMT_VINFO_DEF_TYPE (stmt_info)))
1451 && !PURE_SLP_STMT (stmt_info))
1452 /* STMT needs both SLP and loop-based vectorization. */
1453 only_slp_in_loop = false;
1457 if (only_slp_in_loop)
1459 dump_printf_loc (MSG_NOTE, vect_location,
1460 "Loop contains only SLP stmts\n");
1461 vectorization_factor = LOOP_VINFO_SLP_UNROLLING_FACTOR (loop_vinfo);
1463 else
1465 dump_printf_loc (MSG_NOTE, vect_location,
1466 "Loop contains SLP and non-SLP stmts\n");
1467 /* Both the vectorization factor and unroll factor have the form
1468 current_vector_size * X for some rational X, so they must have
1469 a common multiple. */
1470 vectorization_factor
1471 = force_common_multiple (vectorization_factor,
1472 LOOP_VINFO_SLP_UNROLLING_FACTOR (loop_vinfo));
1475 LOOP_VINFO_VECT_FACTOR (loop_vinfo) = vectorization_factor;
1476 if (dump_enabled_p ())
1478 dump_printf_loc (MSG_NOTE, vect_location,
1479 "Updating vectorization factor to ");
1480 dump_dec (MSG_NOTE, vectorization_factor);
1481 dump_printf (MSG_NOTE, ".\n");
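/* For example (illustrative, assuming fixed-size vectors): if the loop-based
   vectorization factor in vect_update_vf_for_slp is 4 and the SLP instances
   require an unrolling factor of 2, the combined factor stays 4; with an
   unrolling factor of 8 it becomes 8, the least common multiple of the
   two.  */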
1485 /* Return true if STMT_INFO describes a double reduction phi and if
1486 the other phi in the reduction is also relevant for vectorization.
1487 This rejects cases such as:
1489 outer1:
1490 x_1 = PHI <x_3(outer2), ...>;
1493 inner:
1494 x_2 = ...;
1497 outer2:
1498 x_3 = PHI <x_2(inner)>;
1500 if nothing in x_2 or elsewhere makes x_1 relevant. */
1502 static bool
1503 vect_active_double_reduction_p (stmt_vec_info stmt_info)
1505 if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_double_reduction_def)
1506 return false;
1508 gimple *other_phi = STMT_VINFO_REDUC_DEF (stmt_info);
1509 return STMT_VINFO_RELEVANT_P (vinfo_for_stmt (other_phi));
1512 /* Function vect_analyze_loop_operations.
1514 Scan the loop stmts and make sure they are all vectorizable. */
1516 static bool
1517 vect_analyze_loop_operations (loop_vec_info loop_vinfo)
1519 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1520 basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
1521 int nbbs = loop->num_nodes;
1522 int i;
1523 stmt_vec_info stmt_info;
1524 bool need_to_vectorize = false;
1525 bool ok;
1527 if (dump_enabled_p ())
1528 dump_printf_loc (MSG_NOTE, vect_location,
1529 "=== vect_analyze_loop_operations ===\n");
1531 stmt_vector_for_cost cost_vec;
1532 cost_vec.create (2);
1534 for (i = 0; i < nbbs; i++)
1536 basic_block bb = bbs[i];
1538 for (gphi_iterator si = gsi_start_phis (bb); !gsi_end_p (si);
1539 gsi_next (&si))
1541 gphi *phi = si.phi ();
1542 ok = true;
1544 stmt_info = vinfo_for_stmt (phi);
1545 if (dump_enabled_p ())
1547 dump_printf_loc (MSG_NOTE, vect_location, "examining phi: ");
1548 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, phi, 0);
1550 if (virtual_operand_p (gimple_phi_result (phi)))
1551 continue;
1553 /* Inner-loop loop-closed exit phi in outer-loop vectorization
1554 (i.e., a phi in the tail of the outer-loop). */
1555 if (! is_loop_header_bb_p (bb))
	  /* FORNOW: we currently don't support the case where these phis
	     are not used in the outer loop (unless it is a double reduction,
	     i.e., this phi is vect_reduction_def), because this case
	     requires us to actually do something here.  */
1561 if (STMT_VINFO_LIVE_P (stmt_info)
1562 && !vect_active_double_reduction_p (stmt_info))
1564 if (dump_enabled_p ())
1565 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1566 "Unsupported loop-closed phi in "
1567 "outer-loop.\n");
1568 return false;
1571 /* If PHI is used in the outer loop, we check that its operand
1572 is defined in the inner loop. */
1573 if (STMT_VINFO_RELEVANT_P (stmt_info))
1575 tree phi_op;
1576 gimple *op_def_stmt;
1578 if (gimple_phi_num_args (phi) != 1)
1579 return false;
1581 phi_op = PHI_ARG_DEF (phi, 0);
1582 if (TREE_CODE (phi_op) != SSA_NAME)
1583 return false;
1585 op_def_stmt = SSA_NAME_DEF_STMT (phi_op);
1586 if (gimple_nop_p (op_def_stmt)
1587 || !flow_bb_inside_loop_p (loop, gimple_bb (op_def_stmt))
1588 || !vinfo_for_stmt (op_def_stmt))
1589 return false;
1591 if (STMT_VINFO_RELEVANT (vinfo_for_stmt (op_def_stmt))
1592 != vect_used_in_outer
1593 && STMT_VINFO_RELEVANT (vinfo_for_stmt (op_def_stmt))
1594 != vect_used_in_outer_by_reduction)
1595 return false;
1598 continue;
1601 gcc_assert (stmt_info);
1603 if ((STMT_VINFO_RELEVANT (stmt_info) == vect_used_in_scope
1604 || STMT_VINFO_LIVE_P (stmt_info))
1605 && STMT_VINFO_DEF_TYPE (stmt_info) != vect_induction_def)
1607 /* A scalar-dependence cycle that we don't support. */
1608 if (dump_enabled_p ())
1609 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1610 "not vectorized: scalar dependence cycle.\n");
1611 return false;
1614 if (STMT_VINFO_RELEVANT_P (stmt_info))
1616 need_to_vectorize = true;
1617 if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_induction_def
1618 && ! PURE_SLP_STMT (stmt_info))
1619 ok = vectorizable_induction (phi, NULL, NULL, NULL, &cost_vec);
1620 else if ((STMT_VINFO_DEF_TYPE (stmt_info) == vect_reduction_def
1621 || STMT_VINFO_DEF_TYPE (stmt_info) == vect_nested_cycle)
1622 && ! PURE_SLP_STMT (stmt_info))
1623 ok = vectorizable_reduction (phi, NULL, NULL, NULL, NULL,
1624 &cost_vec);
1627 /* SLP PHIs are tested by vect_slp_analyze_node_operations. */
1628 if (ok
1629 && STMT_VINFO_LIVE_P (stmt_info)
1630 && !PURE_SLP_STMT (stmt_info))
1631 ok = vectorizable_live_operation (phi, NULL, NULL, -1, NULL,
1632 &cost_vec);
1634 if (!ok)
1636 if (dump_enabled_p ())
1638 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1639 "not vectorized: relevant phi not "
1640 "supported: ");
1641 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, phi, 0);
1643 return false;
1647 for (gimple_stmt_iterator si = gsi_start_bb (bb); !gsi_end_p (si);
1648 gsi_next (&si))
1650 gimple *stmt = gsi_stmt (si);
1651 if (!gimple_clobber_p (stmt)
1652 && !vect_analyze_stmt (stmt, &need_to_vectorize, NULL, NULL,
1653 &cost_vec))
1654 return false;
1656 } /* bbs */
1658 add_stmt_costs (loop_vinfo->target_cost_data, &cost_vec);
1659 cost_vec.release ();
  /* All operations in the loop are either irrelevant (they deal with loop
     control, or are dead), or are only used outside the loop and can be moved
     out of the loop (e.g. invariants, inductions).  The loop can be
     optimized away by scalar optimizations.  We're better off not
     touching this loop.  */
1666 if (!need_to_vectorize)
1668 if (dump_enabled_p ())
1669 dump_printf_loc (MSG_NOTE, vect_location,
1670 "All the computation can be taken out of the loop.\n");
1671 if (dump_enabled_p ())
1672 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1673 "not vectorized: redundant loop. no profit to "
1674 "vectorize.\n");
1675 return false;
1678 return true;
1681 /* Analyze the cost of the loop described by LOOP_VINFO. Decide if it
1682 is worthwhile to vectorize. Return 1 if definitely yes, 0 if
1683 definitely no, or -1 if it's worth retrying. */
1685 static int
1686 vect_analyze_loop_costing (loop_vec_info loop_vinfo)
1688 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1689 unsigned int assumed_vf = vect_vf_for_cost (loop_vinfo);
1691 /* Only fully-masked loops can have iteration counts less than the
1692 vectorization factor. */
1693 if (!LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
1695 HOST_WIDE_INT max_niter;
1697 if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
1698 max_niter = LOOP_VINFO_INT_NITERS (loop_vinfo);
1699 else
1700 max_niter = max_stmt_executions_int (loop);
1702 if (max_niter != -1
1703 && (unsigned HOST_WIDE_INT) max_niter < assumed_vf)
1705 if (dump_enabled_p ())
1706 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1707 "not vectorized: iteration count smaller than "
1708 "vectorization factor.\n");
1709 return 0;
1713 int min_profitable_iters, min_profitable_estimate;
1714 vect_estimate_min_profitable_iters (loop_vinfo, &min_profitable_iters,
1715 &min_profitable_estimate);
1717 if (min_profitable_iters < 0)
1719 if (dump_enabled_p ())
1720 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1721 "not vectorized: vectorization not profitable.\n");
1722 if (dump_enabled_p ())
1723 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1724 "not vectorized: vector version will never be "
1725 "profitable.\n");
1726 return -1;
1729 int min_scalar_loop_bound = (PARAM_VALUE (PARAM_MIN_VECT_LOOP_BOUND)
1730 * assumed_vf);
1732 /* Use the cost model only if it is more conservative than user specified
1733 threshold. */
1734 unsigned int th = (unsigned) MAX (min_scalar_loop_bound,
1735 min_profitable_iters);
1737 LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo) = th;
1739 if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
1740 && LOOP_VINFO_INT_NITERS (loop_vinfo) < th)
1742 if (dump_enabled_p ())
1743 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1744 "not vectorized: vectorization not profitable.\n");
1745 if (dump_enabled_p ())
1746 dump_printf_loc (MSG_NOTE, vect_location,
1747 "not vectorized: iteration count smaller than user "
1748 "specified loop bound parameter or minimum profitable "
1749 "iterations (whichever is more conservative).\n");
1750 return 0;
1753 HOST_WIDE_INT estimated_niter = estimated_stmt_executions_int (loop);
1754 if (estimated_niter == -1)
1755 estimated_niter = likely_max_stmt_executions_int (loop);
1756 if (estimated_niter != -1
1757 && ((unsigned HOST_WIDE_INT) estimated_niter
1758 < MAX (th, (unsigned) min_profitable_estimate)))
1760 if (dump_enabled_p ())
1761 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1762 "not vectorized: estimated iteration count too "
1763 "small.\n");
1764 if (dump_enabled_p ())
1765 dump_printf_loc (MSG_NOTE, vect_location,
1766 "not vectorized: estimated iteration count smaller "
1767 "than specified loop bound parameter or minimum "
1768 "profitable iterations (whichever is more "
1769 "conservative).\n");
1770 return -1;
1773 return 1;
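/* Worked example of the threshold logic in vect_analyze_loop_costing
   (illustrative numbers only): with assumed_vf == 4,
   PARAM_MIN_VECT_LOOP_BOUND == 0 and a computed min_profitable_iters of 10,
   the threshold th is MAX (0 * 4, 10) == 10, so a loop known to execute
   only 8 iterations is rejected as not profitable at this point.  */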
1777 /* Function vect_analyze_loop_2.
1779 Apply a set of analyses on LOOP, and create a loop_vec_info struct
1780 for it. The different analyses will record information in the
1781 loop_vec_info struct. */
1782 static bool
1783 vect_analyze_loop_2 (loop_vec_info loop_vinfo, bool &fatal)
1785 bool ok;
1786 int res;
1787 unsigned int max_vf = MAX_VECTORIZATION_FACTOR;
1788 poly_uint64 min_vf = 2;
1789 unsigned int n_stmts = 0;
1791 /* The first group of checks is independent of the vector size. */
1792 fatal = true;
1794 /* Find all data references in the loop (which correspond to vdefs/vuses)
1795 and analyze their evolution in the loop. */
1797 basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
1799 loop_p loop = LOOP_VINFO_LOOP (loop_vinfo);
1800 if (!find_loop_nest (loop, &LOOP_VINFO_LOOP_NEST (loop_vinfo)))
1802 if (dump_enabled_p ())
1803 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1804 "not vectorized: loop nest containing two "
1805 "or more consecutive inner loops cannot be "
1806 "vectorized\n");
1807 return false;
1810 for (unsigned i = 0; i < loop->num_nodes; i++)
1811 for (gimple_stmt_iterator gsi = gsi_start_bb (bbs[i]);
1812 !gsi_end_p (gsi); gsi_next (&gsi))
1814 gimple *stmt = gsi_stmt (gsi);
1815 if (is_gimple_debug (stmt))
1816 continue;
1817 ++n_stmts;
1818 if (!find_data_references_in_stmt (loop, stmt,
1819 &LOOP_VINFO_DATAREFS (loop_vinfo)))
1821 if (is_gimple_call (stmt) && loop->safelen)
1823 tree fndecl = gimple_call_fndecl (stmt), op;
1824 if (fndecl != NULL_TREE)
1826 cgraph_node *node = cgraph_node::get (fndecl);
1827 if (node != NULL && node->simd_clones != NULL)
1829 unsigned int j, n = gimple_call_num_args (stmt);
1830 for (j = 0; j < n; j++)
1832 op = gimple_call_arg (stmt, j);
1833 if (DECL_P (op)
1834 || (REFERENCE_CLASS_P (op)
1835 && get_base_address (op)))
1836 break;
1838 op = gimple_call_lhs (stmt);
1839 /* Ignore #pragma omp declare simd functions
1840 if they don't have data references in the
1841 call stmt itself. */
1842 if (j == n
1843 && !(op
1844 && (DECL_P (op)
1845 || (REFERENCE_CLASS_P (op)
1846 && get_base_address (op)))))
1847 continue;
1851 if (dump_enabled_p ())
1852 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1853 "not vectorized: loop contains function "
1854 "calls or data references that cannot "
1855 "be analyzed\n");
1856 return false;
1860 /* Analyze the data references and also adjust the minimal
1861 vectorization factor according to the loads and stores. */
1863 ok = vect_analyze_data_refs (loop_vinfo, &min_vf);
1864 if (!ok)
1866 if (dump_enabled_p ())
1867 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1868 "bad data references.\n");
1869 return false;
1872 /* Classify all cross-iteration scalar data-flow cycles.
1873 Cross-iteration cycles caused by virtual phis are analyzed separately. */
1874 vect_analyze_scalar_cycles (loop_vinfo);
1876 vect_pattern_recog (loop_vinfo);
1878 vect_fixup_scalar_cycles_with_patterns (loop_vinfo);
1880 /* Analyze the access patterns of the data-refs in the loop (consecutive,
1881 complex, etc.). FORNOW: Only handle consecutive access pattern. */
1883 ok = vect_analyze_data_ref_accesses (loop_vinfo);
1884 if (!ok)
1886 if (dump_enabled_p ())
1887 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1888 "bad data access.\n");
1889 return false;
1892 /* Data-flow analysis to detect stmts that do not need to be vectorized. */
1894 ok = vect_mark_stmts_to_be_vectorized (loop_vinfo);
1895 if (!ok)
1897 if (dump_enabled_p ())
1898 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1899 "unexpected pattern.\n");
1900 return false;
1903	  /* The rest of the analysis below depends on the vector size in some way, so from here on failures are not fatal: a different vector size may succeed.  */
1904 fatal = false;
1906 /* Analyze data dependences between the data-refs in the loop
1907 and adjust the maximum vectorization factor according to
1908 the dependences.
1909 FORNOW: fail at the first data dependence that we encounter. */
1911 ok = vect_analyze_data_ref_dependences (loop_vinfo, &max_vf);
1912 if (!ok
1913 || (max_vf != MAX_VECTORIZATION_FACTOR
1914 && maybe_lt (max_vf, min_vf)))
1916 if (dump_enabled_p ())
1917 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1918 "bad data dependence.\n");
1919 return false;
1921 LOOP_VINFO_MAX_VECT_FACTOR (loop_vinfo) = max_vf;
1923 ok = vect_determine_vectorization_factor (loop_vinfo);
1924 if (!ok)
1926 if (dump_enabled_p ())
1927 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1928 "can't determine vectorization factor.\n");
1929 return false;
1931 if (max_vf != MAX_VECTORIZATION_FACTOR
1932 && maybe_lt (max_vf, LOOP_VINFO_VECT_FACTOR (loop_vinfo)))
1934 if (dump_enabled_p ())
1935 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1936 "bad data dependence.\n");
1937 return false;
1940 /* Compute the scalar iteration cost. */
1941 vect_compute_single_scalar_iteration_cost (loop_vinfo);
1943 poly_uint64 saved_vectorization_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
1944 unsigned th;
1946 /* Check the SLP opportunities in the loop, analyze and build SLP trees. */
1947 ok = vect_analyze_slp (loop_vinfo, n_stmts);
1948 if (!ok)
1949 return false;
1951 /* If there are any SLP instances mark them as pure_slp. */
1952 bool slp = vect_make_slp_decision (loop_vinfo);
1953 if (slp)
1955 /* Find stmts that need to be both vectorized and SLPed. */
1956 vect_detect_hybrid_slp (loop_vinfo);
1958 /* Update the vectorization factor based on the SLP decision. */
1959 vect_update_vf_for_slp (loop_vinfo);
1962 bool saved_can_fully_mask_p = LOOP_VINFO_CAN_FULLY_MASK_P (loop_vinfo);
1964 /* We don't expect to have to roll back to anything other than an empty
1965 set of rgroups. */
1966 gcc_assert (LOOP_VINFO_MASKS (loop_vinfo).is_empty ());
1968 /* This is the point where we can re-start analysis with SLP forced off. */
1969 start_over:
1971 /* Now the vectorization factor is final. */
1972 poly_uint64 vectorization_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
1973 gcc_assert (known_ne (vectorization_factor, 0U));
1975 if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo) && dump_enabled_p ())
1977 dump_printf_loc (MSG_NOTE, vect_location,
1978 "vectorization_factor = ");
1979 dump_dec (MSG_NOTE, vectorization_factor);
1980 dump_printf (MSG_NOTE, ", niters = " HOST_WIDE_INT_PRINT_DEC "\n",
1981 LOOP_VINFO_INT_NITERS (loop_vinfo));
1984 HOST_WIDE_INT max_niter
1985 = likely_max_stmt_executions_int (LOOP_VINFO_LOOP (loop_vinfo));
1987 /* Analyze the alignment of the data-refs in the loop.
1988 Fail if a data reference is found that cannot be vectorized. */
1990 ok = vect_analyze_data_refs_alignment (loop_vinfo);
1991 if (!ok)
1993 if (dump_enabled_p ())
1994 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1995 "bad data alignment.\n");
1996 return false;
1999 /* Prune the list of ddrs to be tested at run-time by versioning for alias.
2000 It is important to call pruning after vect_analyze_data_ref_accesses,
2001 since we use grouping information gathered by interleaving analysis. */
2002 ok = vect_prune_runtime_alias_test_list (loop_vinfo);
2003 if (!ok)
2004 return false;
2006	  /* Do not invoke vect_enhance_data_refs_alignment for epilogue
2007 vectorization. */
2008 if (!LOOP_VINFO_EPILOGUE_P (loop_vinfo))
2010 /* This pass will decide on using loop versioning and/or loop peeling in
2011 order to enhance the alignment of data references in the loop. */
2012 ok = vect_enhance_data_refs_alignment (loop_vinfo);
2013 if (!ok)
2015 if (dump_enabled_p ())
2016 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2017 "bad data alignment.\n");
2018 return false;
2022 if (slp)
2024 /* Analyze operations in the SLP instances. Note this may
2025 remove unsupported SLP instances which makes the above
2026 SLP kind detection invalid. */
2027 unsigned old_size = LOOP_VINFO_SLP_INSTANCES (loop_vinfo).length ();
2028 vect_slp_analyze_operations (loop_vinfo);
2029 if (LOOP_VINFO_SLP_INSTANCES (loop_vinfo).length () != old_size)
2030 goto again;
2033 /* Scan all the remaining operations in the loop that are not subject
2034 to SLP and make sure they are vectorizable. */
2035 ok = vect_analyze_loop_operations (loop_vinfo);
2036 if (!ok)
2038 if (dump_enabled_p ())
2039 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2040 "bad operation or unsupported loop bound.\n");
2041 return false;
2044 /* Decide whether to use a fully-masked loop for this vectorization
2045 factor. */
2046 LOOP_VINFO_FULLY_MASKED_P (loop_vinfo)
2047 = (LOOP_VINFO_CAN_FULLY_MASK_P (loop_vinfo)
2048 && vect_verify_full_masking (loop_vinfo));
2049 if (dump_enabled_p ())
2051 if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
2052 dump_printf_loc (MSG_NOTE, vect_location,
2053 "using a fully-masked loop.\n");
2054 else
2055 dump_printf_loc (MSG_NOTE, vect_location,
2056 "not using a fully-masked loop.\n");
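  /* Roughly speaking, a fully-masked loop handles even the final partial
     vector inside the vector loop under a per-lane predicate, so no scalar
     epilogue is needed: with VF = 4 and 10 scalar iterations the third
     vector iteration would run with only two lanes active.  The exact
     mechanics depend on the target's support for masked operations.  */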
2059 /* If epilog loop is required because of data accesses with gaps,
2060	     one additional iteration needs to be peeled.  Check if there are
2061 enough iterations for vectorization. */
2062 if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo)
2063 && LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
2064 && !LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
2066 poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
2067 tree scalar_niters = LOOP_VINFO_NITERSM1 (loop_vinfo);
2069 if (known_lt (wi::to_widest (scalar_niters), vf))
2071 if (dump_enabled_p ())
2072 dump_printf_loc (MSG_NOTE, vect_location,
2073	                         "loop does not have enough iterations to support"
2074 " peeling for gaps.\n");
2075 return false;
2079	  /* Check that the cost of the loop makes vectorizing worthwhile.  */
2080 res = vect_analyze_loop_costing (loop_vinfo);
2081 if (res < 0)
2082 goto again;
2083 if (!res)
2085 if (dump_enabled_p ())
2086 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2087 "Loop costings not worthwhile.\n");
2088 return false;
2091 /* Decide whether we need to create an epilogue loop to handle
2092 remaining scalar iterations. */
2093 th = LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo);
2095 unsigned HOST_WIDE_INT const_vf;
2096 if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
2097 /* The main loop handles all iterations. */
2098 LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo) = false;
2099 else if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
2100 && LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) > 0)
2102 if (!multiple_p (LOOP_VINFO_INT_NITERS (loop_vinfo)
2103 - LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo),
2104 LOOP_VINFO_VECT_FACTOR (loop_vinfo)))
2105 LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo) = true;
2107 else if (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo)
2108 || !LOOP_VINFO_VECT_FACTOR (loop_vinfo).is_constant (&const_vf)
2109 || ((tree_ctz (LOOP_VINFO_NITERS (loop_vinfo))
2110 < (unsigned) exact_log2 (const_vf))
2111 /* In case of versioning, check if the maximum number of
2112 iterations is greater than th. If they are identical,
2113 the epilogue is unnecessary. */
2114 && (!LOOP_REQUIRES_VERSIONING (loop_vinfo)
2115 || ((unsigned HOST_WIDE_INT) max_niter
2116 > (th / const_vf) * const_vf))))
2117 LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo) = true;
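  /* Worked example (numbers made up): with a known iteration count of 100,
     no peeling for alignment, no versioning and a constant VF of 8,
     tree_ctz (100) == 2 is smaller than log2 (8) == 3, so
     PEELING_FOR_NITER is set: 12 vector iterations cover 96 scalar
     iterations and the epilogue loop handles the remaining 4.  */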
2119 /* If an epilogue loop is required make sure we can create one. */
2120 if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo)
2121 || LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo))
2123 if (dump_enabled_p ())
2124 dump_printf_loc (MSG_NOTE, vect_location, "epilog loop required\n");
2125 if (!vect_can_advance_ivs_p (loop_vinfo)
2126 || !slpeel_can_duplicate_loop_p (LOOP_VINFO_LOOP (loop_vinfo),
2127 single_exit (LOOP_VINFO_LOOP
2128 (loop_vinfo))))
2130 if (dump_enabled_p ())
2131 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2132 "not vectorized: can't create required "
2133 "epilog loop\n");
2134 goto again;
2138	  /* During peeling, we need to check whether the number of loop iterations
2139	     is enough for both the peeled prolog loop and the vector loop.  This
2140	     check can be merged with the threshold check of loop versioning, so
2141	     increase the threshold for this case if necessary.  */
2142 if (LOOP_REQUIRES_VERSIONING (loop_vinfo))
2144 poly_uint64 niters_th = 0;
2146 if (!vect_use_loop_mask_for_alignment_p (loop_vinfo))
2148 /* Niters for peeled prolog loop. */
2149 if (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) < 0)
2151 struct data_reference *dr = LOOP_VINFO_UNALIGNED_DR (loop_vinfo);
2152 tree vectype
2153 = STMT_VINFO_VECTYPE (vinfo_for_stmt (DR_STMT (dr)));
2154 niters_th += TYPE_VECTOR_SUBPARTS (vectype) - 1;
2156 else
2157 niters_th += LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo);
2160 /* Niters for at least one iteration of vectorized loop. */
2161 if (!LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
2162 niters_th += LOOP_VINFO_VECT_FACTOR (loop_vinfo);
2163 /* One additional iteration because of peeling for gap. */
2164 if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo))
2165 niters_th += 1;
2166 LOOP_VINFO_VERSIONING_THRESHOLD (loop_vinfo) = niters_th;
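      /* Worked example (numbers made up): with VF = 4, an unknown peeling
	 amount for alignment (charged as TYPE_VECTOR_SUBPARTS - 1 == 3
	 prologue iterations for a 4-element vector), no full masking and
	 peeling for gaps, the versioning threshold is 3 + 4 + 1 = 8 scalar
	 iterations.  */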
2169 gcc_assert (known_eq (vectorization_factor,
2170 LOOP_VINFO_VECT_FACTOR (loop_vinfo)));
2172 /* Ok to vectorize! */
2173 return true;
2175 again:
2176	  /* Try again with SLP forced off, but if we didn't do any SLP there is
2177 no point in re-trying. */
2178 if (!slp)
2179 return false;
2181 /* If there are reduction chains re-trying will fail anyway. */
2182 if (! LOOP_VINFO_REDUCTION_CHAINS (loop_vinfo).is_empty ())
2183 return false;
2185 /* Likewise if the grouped loads or stores in the SLP cannot be handled
2186 via interleaving or lane instructions. */
2187 slp_instance instance;
2188 slp_tree node;
2189 unsigned i, j;
2190 FOR_EACH_VEC_ELT (LOOP_VINFO_SLP_INSTANCES (loop_vinfo), i, instance)
2192 stmt_vec_info vinfo;
2193 vinfo = vinfo_for_stmt
2194 (SLP_TREE_SCALAR_STMTS (SLP_INSTANCE_TREE (instance))[0]);
2195 if (! STMT_VINFO_GROUPED_ACCESS (vinfo))
2196 continue;
2197 vinfo = vinfo_for_stmt (STMT_VINFO_GROUP_FIRST_ELEMENT (vinfo));
2198 unsigned int size = STMT_VINFO_GROUP_SIZE (vinfo);
2199 tree vectype = STMT_VINFO_VECTYPE (vinfo);
2200 if (! vect_store_lanes_supported (vectype, size, false)
2201 && ! known_eq (TYPE_VECTOR_SUBPARTS (vectype), 1U)
2202 && ! vect_grouped_store_supported (vectype, size))
2203 return false;
2204 FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (instance), j, node)
2206 vinfo = vinfo_for_stmt (SLP_TREE_SCALAR_STMTS (node)[0]);
2207 vinfo = vinfo_for_stmt (STMT_VINFO_GROUP_FIRST_ELEMENT (vinfo));
2208 bool single_element_p = !STMT_VINFO_GROUP_NEXT_ELEMENT (vinfo);
2209 size = STMT_VINFO_GROUP_SIZE (vinfo);
2210 vectype = STMT_VINFO_VECTYPE (vinfo);
2211 if (! vect_load_lanes_supported (vectype, size, false)
2212 && ! vect_grouped_load_supported (vectype, single_element_p,
2213 size))
2214 return false;
2218 if (dump_enabled_p ())
2219 dump_printf_loc (MSG_NOTE, vect_location,
2220 "re-trying with SLP disabled\n");
2222 /* Roll back state appropriately. No SLP this time. */
2223 slp = false;
2224	  /* Restore the vectorization factor as it was without SLP.  */
2225 LOOP_VINFO_VECT_FACTOR (loop_vinfo) = saved_vectorization_factor;
2226 /* Free the SLP instances. */
2227 FOR_EACH_VEC_ELT (LOOP_VINFO_SLP_INSTANCES (loop_vinfo), j, instance)
2228 vect_free_slp_instance (instance);
2229 LOOP_VINFO_SLP_INSTANCES (loop_vinfo).release ();
2230 /* Reset SLP type to loop_vect on all stmts. */
2231 for (i = 0; i < LOOP_VINFO_LOOP (loop_vinfo)->num_nodes; ++i)
2233 basic_block bb = LOOP_VINFO_BBS (loop_vinfo)[i];
2234 for (gimple_stmt_iterator si = gsi_start_phis (bb);
2235 !gsi_end_p (si); gsi_next (&si))
2237 stmt_vec_info stmt_info = vinfo_for_stmt (gsi_stmt (si));
2238 STMT_SLP_TYPE (stmt_info) = loop_vect;
2240 for (gimple_stmt_iterator si = gsi_start_bb (bb);
2241 !gsi_end_p (si); gsi_next (&si))
2243 stmt_vec_info stmt_info = vinfo_for_stmt (gsi_stmt (si));
2244 STMT_SLP_TYPE (stmt_info) = loop_vect;
2245 if (STMT_VINFO_IN_PATTERN_P (stmt_info))
2247 stmt_info = vinfo_for_stmt (STMT_VINFO_RELATED_STMT (stmt_info));
2248 STMT_SLP_TYPE (stmt_info) = loop_vect;
2249 for (gimple_stmt_iterator pi
2250 = gsi_start (STMT_VINFO_PATTERN_DEF_SEQ (stmt_info));
2251 !gsi_end_p (pi); gsi_next (&pi))
2253 gimple *pstmt = gsi_stmt (pi);
2254 STMT_SLP_TYPE (vinfo_for_stmt (pstmt)) = loop_vect;
2259 /* Free optimized alias test DDRS. */
2260 LOOP_VINFO_LOWER_BOUNDS (loop_vinfo).truncate (0);
2261 LOOP_VINFO_COMP_ALIAS_DDRS (loop_vinfo).release ();
2262 LOOP_VINFO_CHECK_UNEQUAL_ADDRS (loop_vinfo).release ();
2263 /* Reset target cost data. */
2264 destroy_cost_data (LOOP_VINFO_TARGET_COST_DATA (loop_vinfo));
2265 LOOP_VINFO_TARGET_COST_DATA (loop_vinfo)
2266 = init_cost (LOOP_VINFO_LOOP (loop_vinfo));
2267 /* Reset accumulated rgroup information. */
2268 release_vec_loop_masks (&LOOP_VINFO_MASKS (loop_vinfo));
2269 /* Reset assorted flags. */
2270 LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo) = false;
2271 LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo) = false;
2272 LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo) = 0;
2273 LOOP_VINFO_VERSIONING_THRESHOLD (loop_vinfo) = 0;
2274 LOOP_VINFO_CAN_FULLY_MASK_P (loop_vinfo) = saved_can_fully_mask_p;
2276 goto start_over;
2279 /* Function vect_analyze_loop.
2281 Apply a set of analyses on LOOP, and create a loop_vec_info struct
2282 for it. The different analyses will record information in the
2283 loop_vec_info struct. If ORIG_LOOP_VINFO is not NULL epilogue must
2284 be vectorized. */
2285 loop_vec_info
2286 vect_analyze_loop (struct loop *loop, loop_vec_info orig_loop_vinfo)
2288 loop_vec_info loop_vinfo;
2289 auto_vector_sizes vector_sizes;
2291 /* Autodetect first vector size we try. */
2292 current_vector_size = 0;
2293 targetm.vectorize.autovectorize_vector_sizes (&vector_sizes);
2294 unsigned int next_size = 0;
2296 if (dump_enabled_p ())
2297 dump_printf_loc (MSG_NOTE, vect_location,
2298 "===== analyze_loop_nest =====\n");
2300 if (loop_outer (loop)
2301 && loop_vec_info_for_loop (loop_outer (loop))
2302 && LOOP_VINFO_VECTORIZABLE_P (loop_vec_info_for_loop (loop_outer (loop))))
2304 if (dump_enabled_p ())
2305 dump_printf_loc (MSG_NOTE, vect_location,
2306 "outer-loop already vectorized.\n");
2307 return NULL;
2310 poly_uint64 autodetected_vector_size = 0;
2311 while (1)
2313 /* Check the CFG characteristics of the loop (nesting, entry/exit). */
2314 loop_vinfo = vect_analyze_loop_form (loop);
2315 if (!loop_vinfo)
2317 if (dump_enabled_p ())
2318 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2319 "bad loop form.\n");
2320 return NULL;
2323 bool fatal = false;
2325 if (orig_loop_vinfo)
2326 LOOP_VINFO_ORIG_LOOP_INFO (loop_vinfo) = orig_loop_vinfo;
2328 if (vect_analyze_loop_2 (loop_vinfo, fatal))
2330 LOOP_VINFO_VECTORIZABLE_P (loop_vinfo) = 1;
2332 return loop_vinfo;
2335 delete loop_vinfo;
2337 if (next_size == 0)
2338 autodetected_vector_size = current_vector_size;
2340 if (next_size < vector_sizes.length ()
2341 && known_eq (vector_sizes[next_size], autodetected_vector_size))
2342 next_size += 1;
2344 if (fatal
2345 || next_size == vector_sizes.length ()
2346 || known_eq (current_vector_size, 0U))
2347 return NULL;
2349 /* Try the next biggest vector size. */
2350 current_vector_size = vector_sizes[next_size++];
2351 if (dump_enabled_p ())
2353 dump_printf_loc (MSG_NOTE, vect_location,
2354 "***** Re-trying analysis with "
2355 "vector size ");
2356 dump_dec (MSG_NOTE, current_vector_size);
2357 dump_printf (MSG_NOTE, "\n");
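      /* For example, on a target that advertises both 32-byte and 16-byte
	 vectors, an analysis that failed with the autodetected 32-byte size
	 is re-run with 16-byte vectors before giving up, unless the failure
	 was fatal (i.e. independent of the vector size).  */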
2362 /* Return true if there is an in-order reduction function for CODE, storing
2363 it in *REDUC_FN if so. */
2365 static bool
2366 fold_left_reduction_fn (tree_code code, internal_fn *reduc_fn)
2368 switch (code)
2370 case PLUS_EXPR:
2371 *reduc_fn = IFN_FOLD_LEFT_PLUS;
2372 return true;
2374 default:
2375 return false;
2379 /* Function reduction_fn_for_scalar_code
2381 Input:
2382	   CODE - tree_code of a reduction operation.
2384 Output:
2385 REDUC_FN - the corresponding internal function to be used to reduce the
2386 vector of partial results into a single scalar result, or IFN_LAST
2387 if the operation is a supported reduction operation, but does not have
2388 such an internal function.
2390	   Return FALSE if CODE currently cannot be vectorized as a reduction.  */
2392 static bool
2393 reduction_fn_for_scalar_code (enum tree_code code, internal_fn *reduc_fn)
2395 switch (code)
2397 case MAX_EXPR:
2398 *reduc_fn = IFN_REDUC_MAX;
2399 return true;
2401 case MIN_EXPR:
2402 *reduc_fn = IFN_REDUC_MIN;
2403 return true;
2405 case PLUS_EXPR:
2406 *reduc_fn = IFN_REDUC_PLUS;
2407 return true;
2409 case BIT_AND_EXPR:
2410 *reduc_fn = IFN_REDUC_AND;
2411 return true;
2413 case BIT_IOR_EXPR:
2414 *reduc_fn = IFN_REDUC_IOR;
2415 return true;
2417 case BIT_XOR_EXPR:
2418 *reduc_fn = IFN_REDUC_XOR;
2419 return true;
2421 case MULT_EXPR:
2422 case MINUS_EXPR:
2423 *reduc_fn = IFN_LAST;
2424 return true;
2426 default:
2427 return false;
2431 /* If there is a neutral value X such that SLP reduction NODE would not
2432 be affected by the introduction of additional X elements, return that X,
2433 otherwise return null. CODE is the code of the reduction. REDUC_CHAIN
2434 is true if the SLP statements perform a single reduction, false if each
2435 statement performs an independent reduction. */
2437 static tree
2438 neutral_op_for_slp_reduction (slp_tree slp_node, tree_code code,
2439 bool reduc_chain)
2441 vec<gimple *> stmts = SLP_TREE_SCALAR_STMTS (slp_node);
2442 gimple *stmt = stmts[0];
2443 stmt_vec_info stmt_vinfo = vinfo_for_stmt (stmt);
2444 tree vector_type = STMT_VINFO_VECTYPE (stmt_vinfo);
2445 tree scalar_type = TREE_TYPE (vector_type);
2446 struct loop *loop = gimple_bb (stmt)->loop_father;
2447 gcc_assert (loop);
2449 switch (code)
2451 case WIDEN_SUM_EXPR:
2452 case DOT_PROD_EXPR:
2453 case SAD_EXPR:
2454 case PLUS_EXPR:
2455 case MINUS_EXPR:
2456 case BIT_IOR_EXPR:
2457 case BIT_XOR_EXPR:
2458 return build_zero_cst (scalar_type);
2460 case MULT_EXPR:
2461 return build_one_cst (scalar_type);
2463 case BIT_AND_EXPR:
2464 return build_all_ones_cst (scalar_type);
2466 case MAX_EXPR:
2467 case MIN_EXPR:
2468 /* For MIN/MAX the initial values are neutral. A reduction chain
2469 has only a single initial value, so that value is neutral for
2470 all statements. */
2471 if (reduc_chain)
2472 return PHI_ARG_DEF_FROM_EDGE (stmt, loop_preheader_edge (loop));
2473 return NULL_TREE;
2475 default:
2476 return NULL_TREE;
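/* For example, 0 is neutral for an SLP node of "+" reductions: padding the
   vectors with extra zero elements leaves every sum unchanged.  Likewise 1
   is neutral for "*" and all-ones for "&".  MIN and MAX have no universal
   neutral value, so the reduction's own initial value is used instead, and
   only for reduction chains, which have a single initial value.  */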
2480 /* Error reporting helper for vect_is_simple_reduction below. GIMPLE statement
2481 STMT is printed with a message MSG. */
2483 static void
2484 report_vect_op (dump_flags_t msg_type, gimple *stmt, const char *msg)
2486 dump_printf_loc (msg_type, vect_location, "%s", msg);
2487 dump_gimple_stmt (msg_type, TDF_SLIM, stmt, 0);
2491 /* Detect SLP reduction of the form:
2493 #a1 = phi <a5, a0>
2494 a2 = operation (a1)
2495 a3 = operation (a2)
2496 a4 = operation (a3)
2497 a5 = operation (a4)
2499 #a = phi <a5>
2501 PHI is the reduction phi node (#a1 = phi <a5, a0> above)
2502 FIRST_STMT is the first reduction stmt in the chain
2503 (a2 = operation (a1)).
2505 Return TRUE if a reduction chain was detected. */
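/* At the source level such a chain typically comes from a manually unrolled
   accumulation, e.g. (illustrative only):

     for (i = 0; i < n; i += 4)
       s = s + a[i] + a[i+1] + a[i+2] + a[i+3];

   where each intermediate sum feeds only the next statement and the final
   sum feeds back into the reduction PHI.  */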
2507 static bool
2508 vect_is_slp_reduction (loop_vec_info loop_info, gimple *phi,
2509 gimple *first_stmt)
2511 struct loop *loop = (gimple_bb (phi))->loop_father;
2512 struct loop *vect_loop = LOOP_VINFO_LOOP (loop_info);
2513 enum tree_code code;
2514 gimple *current_stmt = NULL, *loop_use_stmt = NULL, *first, *next_stmt;
2515 stmt_vec_info use_stmt_info, current_stmt_info;
2516 tree lhs;
2517 imm_use_iterator imm_iter;
2518 use_operand_p use_p;
2519 int nloop_uses, size = 0, n_out_of_loop_uses;
2520 bool found = false;
2522 if (loop != vect_loop)
2523 return false;
2525 lhs = PHI_RESULT (phi);
2526 code = gimple_assign_rhs_code (first_stmt);
2527 while (1)
2529 nloop_uses = 0;
2530 n_out_of_loop_uses = 0;
2531 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, lhs)
2533 gimple *use_stmt = USE_STMT (use_p);
2534 if (is_gimple_debug (use_stmt))
2535 continue;
2537 /* Check if we got back to the reduction phi. */
2538 if (use_stmt == phi)
2540 loop_use_stmt = use_stmt;
2541 found = true;
2542 break;
2545 if (flow_bb_inside_loop_p (loop, gimple_bb (use_stmt)))
2547 loop_use_stmt = use_stmt;
2548 nloop_uses++;
2550 else
2551 n_out_of_loop_uses++;
2553	     /* There can be either a single use in the loop or two uses in
2554 phi nodes. */
2555 if (nloop_uses > 1 || (n_out_of_loop_uses && nloop_uses))
2556 return false;
2559 if (found)
2560 break;
2562 /* We reached a statement with no loop uses. */
2563 if (nloop_uses == 0)
2564 return false;
2566 /* This is a loop exit phi, and we haven't reached the reduction phi. */
2567 if (gimple_code (loop_use_stmt) == GIMPLE_PHI)
2568 return false;
2570 if (!is_gimple_assign (loop_use_stmt)
2571 || code != gimple_assign_rhs_code (loop_use_stmt)
2572 || !flow_bb_inside_loop_p (loop, gimple_bb (loop_use_stmt)))
2573 return false;
2575 /* Insert USE_STMT into reduction chain. */
2576 use_stmt_info = vinfo_for_stmt (loop_use_stmt);
2577 if (current_stmt)
2579 current_stmt_info = vinfo_for_stmt (current_stmt);
2580 GROUP_NEXT_ELEMENT (current_stmt_info) = loop_use_stmt;
2581 GROUP_FIRST_ELEMENT (use_stmt_info)
2582 = GROUP_FIRST_ELEMENT (current_stmt_info);
2584 else
2585 GROUP_FIRST_ELEMENT (use_stmt_info) = loop_use_stmt;
2587 lhs = gimple_assign_lhs (loop_use_stmt);
2588 current_stmt = loop_use_stmt;
2589 size++;
2592 if (!found || loop_use_stmt != phi || size < 2)
2593 return false;
2595	  /* Swap the operands, if needed, to make the reduction operand the second
2596 operand. */
2597 lhs = PHI_RESULT (phi);
2598 next_stmt = GROUP_FIRST_ELEMENT (vinfo_for_stmt (current_stmt));
2599 while (next_stmt)
2601 if (gimple_assign_rhs2 (next_stmt) == lhs)
2603 tree op = gimple_assign_rhs1 (next_stmt);
2604 gimple *def_stmt = NULL;
2606 if (TREE_CODE (op) == SSA_NAME)
2607 def_stmt = SSA_NAME_DEF_STMT (op);
2609 /* Check that the other def is either defined in the loop
2610 ("vect_internal_def"), or it's an induction (defined by a
2611 loop-header phi-node). */
2612 if (def_stmt
2613 && gimple_bb (def_stmt)
2614 && flow_bb_inside_loop_p (loop, gimple_bb (def_stmt))
2615 && (is_gimple_assign (def_stmt)
2616 || is_gimple_call (def_stmt)
2617 || STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def_stmt))
2618 == vect_induction_def
2619 || (gimple_code (def_stmt) == GIMPLE_PHI
2620 && STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def_stmt))
2621 == vect_internal_def
2622 && !is_loop_header_bb_p (gimple_bb (def_stmt)))))
2624 lhs = gimple_assign_lhs (next_stmt);
2625 next_stmt = GROUP_NEXT_ELEMENT (vinfo_for_stmt (next_stmt));
2626 continue;
2629 return false;
2631 else
2633 tree op = gimple_assign_rhs2 (next_stmt);
2634 gimple *def_stmt = NULL;
2636 if (TREE_CODE (op) == SSA_NAME)
2637 def_stmt = SSA_NAME_DEF_STMT (op);
2639 /* Check that the other def is either defined in the loop
2640 ("vect_internal_def"), or it's an induction (defined by a
2641 loop-header phi-node). */
2642 if (def_stmt
2643 && gimple_bb (def_stmt)
2644 && flow_bb_inside_loop_p (loop, gimple_bb (def_stmt))
2645 && (is_gimple_assign (def_stmt)
2646 || is_gimple_call (def_stmt)
2647 || STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def_stmt))
2648 == vect_induction_def
2649 || (gimple_code (def_stmt) == GIMPLE_PHI
2650 && STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def_stmt))
2651 == vect_internal_def
2652 && !is_loop_header_bb_p (gimple_bb (def_stmt)))))
2654 if (dump_enabled_p ())
2656 dump_printf_loc (MSG_NOTE, vect_location, "swapping oprnds: ");
2657 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, next_stmt, 0);
2660 swap_ssa_operands (next_stmt,
2661 gimple_assign_rhs1_ptr (next_stmt),
2662 gimple_assign_rhs2_ptr (next_stmt));
2663 update_stmt (next_stmt);
2665 if (CONSTANT_CLASS_P (gimple_assign_rhs1 (next_stmt)))
2666 LOOP_VINFO_OPERANDS_SWAPPED (loop_info) = true;
2668 else
2669 return false;
2672 lhs = gimple_assign_lhs (next_stmt);
2673 next_stmt = GROUP_NEXT_ELEMENT (vinfo_for_stmt (next_stmt));
2676 /* Save the chain for further analysis in SLP detection. */
2677 first = GROUP_FIRST_ELEMENT (vinfo_for_stmt (current_stmt));
2678 LOOP_VINFO_REDUCTION_CHAINS (loop_info).safe_push (first);
2679 GROUP_SIZE (vinfo_for_stmt (first)) = size;
2681 return true;
2684 /* Return true if we need an in-order reduction for operation CODE
2685 on type TYPE. NEED_WRAPPING_INTEGRAL_OVERFLOW is true if integer
2686 overflow must wrap. */
2688 static bool
2689 needs_fold_left_reduction_p (tree type, tree_code code,
2690 bool need_wrapping_integral_overflow)
2692 /* CHECKME: check for !flag_finite_math_only too? */
2693 if (SCALAR_FLOAT_TYPE_P (type))
2694 switch (code)
2696 case MIN_EXPR:
2697 case MAX_EXPR:
2698 return false;
2700 default:
2701 return !flag_associative_math;
2704 if (INTEGRAL_TYPE_P (type))
2706 if (!operation_no_trapping_overflow (type, code))
2707 return true;
2708 if (need_wrapping_integral_overflow
2709 && !TYPE_OVERFLOW_WRAPS (type)
2710 && operation_can_overflow (code))
2711 return true;
2712 return false;
2715 if (SAT_FIXED_POINT_TYPE_P (type))
2716 return true;
2718 return false;
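/* For example, without -fassociative-math the loop

     double s = 0;
     for (i = 0; i < n; i++)
       s += x[i];

   has to preserve the left-to-right order of the additions and can only be
   vectorized as a FOLD_LEFT_REDUCTION; a signed integer sum compiled with
   -ftrapv is treated the same way, since reassociating it could introduce
   overflow traps the scalar loop would not have had.  */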
2721 /* Return true if the reduction PHI in LOOP with latch arg LOOP_ARG and
2722 reduction operation CODE has a handled computation expression. */
2724 bool
2725 check_reduction_path (location_t loc, loop_p loop, gphi *phi, tree loop_arg,
2726 enum tree_code code)
2728 auto_vec<std::pair<ssa_op_iter, use_operand_p> > path;
2729 auto_bitmap visited;
2730 tree lookfor = PHI_RESULT (phi);
2731 ssa_op_iter curri;
2732 use_operand_p curr = op_iter_init_phiuse (&curri, phi, SSA_OP_USE);
2733 while (USE_FROM_PTR (curr) != loop_arg)
2734 curr = op_iter_next_use (&curri);
2735 curri.i = curri.numops;
2738 path.safe_push (std::make_pair (curri, curr));
2739 tree use = USE_FROM_PTR (curr);
2740 if (use == lookfor)
2741 break;
2742 gimple *def = SSA_NAME_DEF_STMT (use);
2743 if (gimple_nop_p (def)
2744 || ! flow_bb_inside_loop_p (loop, gimple_bb (def)))
2746 pop:
2749 std::pair<ssa_op_iter, use_operand_p> x = path.pop ();
2750 curri = x.first;
2751 curr = x.second;
2753 curr = op_iter_next_use (&curri);
2754 /* Skip already visited or non-SSA operands (from iterating
2755 over PHI args). */
2756 while (curr != NULL_USE_OPERAND_P
2757 && (TREE_CODE (USE_FROM_PTR (curr)) != SSA_NAME
2758 || ! bitmap_set_bit (visited,
2759 SSA_NAME_VERSION
2760 (USE_FROM_PTR (curr)))));
2762 while (curr == NULL_USE_OPERAND_P && ! path.is_empty ());
2763 if (curr == NULL_USE_OPERAND_P)
2764 break;
2766 else
2768 if (gimple_code (def) == GIMPLE_PHI)
2769 curr = op_iter_init_phiuse (&curri, as_a <gphi *>(def), SSA_OP_USE);
2770 else
2771 curr = op_iter_init_use (&curri, def, SSA_OP_USE);
2772 while (curr != NULL_USE_OPERAND_P
2773 && (TREE_CODE (USE_FROM_PTR (curr)) != SSA_NAME
2774 || ! bitmap_set_bit (visited,
2775 SSA_NAME_VERSION
2776 (USE_FROM_PTR (curr)))))
2777 curr = op_iter_next_use (&curri);
2778 if (curr == NULL_USE_OPERAND_P)
2779 goto pop;
2782 while (1);
2783 if (dump_file && (dump_flags & TDF_DETAILS))
2785 dump_printf_loc (MSG_NOTE, loc, "reduction path: ");
2786 unsigned i;
2787 std::pair<ssa_op_iter, use_operand_p> *x;
2788 FOR_EACH_VEC_ELT (path, i, x)
2790 dump_generic_expr (MSG_NOTE, TDF_SLIM, USE_FROM_PTR (x->second));
2791 dump_printf (MSG_NOTE, " ");
2793 dump_printf (MSG_NOTE, "\n");
2796 /* Check whether the reduction path detected is valid. */
2797 bool fail = path.length () == 0;
2798 bool neg = false;
2799 for (unsigned i = 1; i < path.length (); ++i)
2801 gimple *use_stmt = USE_STMT (path[i].second);
2802 tree op = USE_FROM_PTR (path[i].second);
2803 if (! has_single_use (op)
2804 || ! is_gimple_assign (use_stmt))
2806 fail = true;
2807 break;
2809 if (gimple_assign_rhs_code (use_stmt) != code)
2811 if (code == PLUS_EXPR
2812 && gimple_assign_rhs_code (use_stmt) == MINUS_EXPR)
2814 /* Track whether we negate the reduction value each iteration. */
2815 if (gimple_assign_rhs2 (use_stmt) == op)
2816 neg = ! neg;
2818 else
2820 fail = true;
2821 break;
2825 return ! fail && ! neg;
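/* Roughly, the path for "res = res + x[i]" is just the reduction PHI use
   followed by the single PLUS statement, each intermediate value having a
   single use, so it is accepted.  A statement computing "b[i] - res",
   which subtracts the running value rather than subtracting from it, flips
   the sign of the accumulator every iteration and causes the whole path to
   be rejected, whereas "res - b[i]" is simply treated as adding a negated
   operand.  */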
2829 /* Function vect_is_simple_reduction
2831 (1) Detect a cross-iteration def-use cycle that represents a simple
2832 reduction computation. We look for the following pattern:
2834 loop_header:
2835 a1 = phi < a0, a2 >
2836 a3 = ...
2837 a2 = operation (a3, a1)
2841 a3 = ...
2842 loop_header:
2843 a1 = phi < a0, a2 >
2844 a2 = operation (a3, a1)
2846 such that:
2847 1. operation is commutative and associative and it is safe to
2848 change the order of the computation
2849 2. no uses for a2 in the loop (a2 is used out of the loop)
2850 3. no uses of a1 in the loop besides the reduction operation
2851 4. no uses of a1 outside the loop.
2853 Conditions 1,4 are tested here.
2854 Conditions 2,3 are tested in vect_mark_stmts_to_be_vectorized.
2856 (2) Detect a cross-iteration def-use cycle in nested loops, i.e.,
2857 nested cycles.
2859 (3) Detect cycles of phi nodes in outer-loop vectorization, i.e., double
2860 reductions:
2862 a1 = phi < a0, a2 >
2863 inner loop (def of a3)
2864 a2 = phi < a3 >
2866	   (4) Detect condition expressions, i.e.:
2867 for (int i = 0; i < N; i++)
2868 if (a[i] < val)
2869 ret_val = a[i];
2873 static gimple *
2874 vect_is_simple_reduction (loop_vec_info loop_info, gimple *phi,
2875 bool *double_reduc,
2876 bool need_wrapping_integral_overflow,
2877 enum vect_reduction_type *v_reduc_type)
2879 struct loop *loop = (gimple_bb (phi))->loop_father;
2880 struct loop *vect_loop = LOOP_VINFO_LOOP (loop_info);
2881 gimple *def_stmt, *def1 = NULL, *def2 = NULL, *phi_use_stmt = NULL;
2882 enum tree_code orig_code, code;
2883 tree op1, op2, op3 = NULL_TREE, op4 = NULL_TREE;
2884 tree type;
2885 int nloop_uses;
2886 tree name;
2887 imm_use_iterator imm_iter;
2888 use_operand_p use_p;
2889 bool phi_def;
2891 *double_reduc = false;
2892 *v_reduc_type = TREE_CODE_REDUCTION;
2894 tree phi_name = PHI_RESULT (phi);
2895 /* ??? If there are no uses of the PHI result the inner loop reduction
2896 won't be detected as possibly double-reduction by vectorizable_reduction
2897 because that tries to walk the PHI arg from the preheader edge which
2898 can be constant. See PR60382. */
2899 if (has_zero_uses (phi_name))
2900 return NULL;
2901 nloop_uses = 0;
2902 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, phi_name)
2904 gimple *use_stmt = USE_STMT (use_p);
2905 if (is_gimple_debug (use_stmt))
2906 continue;
2908 if (!flow_bb_inside_loop_p (loop, gimple_bb (use_stmt)))
2910 if (dump_enabled_p ())
2911 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2912 "intermediate value used outside loop.\n");
2914 return NULL;
2917 nloop_uses++;
2918 if (nloop_uses > 1)
2920 if (dump_enabled_p ())
2921 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2922 "reduction value used in loop.\n");
2923 return NULL;
2926 phi_use_stmt = use_stmt;
2929 edge latch_e = loop_latch_edge (loop);
2930 tree loop_arg = PHI_ARG_DEF_FROM_EDGE (phi, latch_e);
2931 if (TREE_CODE (loop_arg) != SSA_NAME)
2933 if (dump_enabled_p ())
2935 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2936 "reduction: not ssa_name: ");
2937 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, loop_arg);
2938 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
2940 return NULL;
2943 def_stmt = SSA_NAME_DEF_STMT (loop_arg);
2944 if (is_gimple_assign (def_stmt))
2946 name = gimple_assign_lhs (def_stmt);
2947 phi_def = false;
2949 else if (gimple_code (def_stmt) == GIMPLE_PHI)
2951 name = PHI_RESULT (def_stmt);
2952 phi_def = true;
2954 else
2956 if (dump_enabled_p ())
2958 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2959 "reduction: unhandled reduction operation: ");
2960 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, def_stmt, 0);
2962 return NULL;
2965 if (! flow_bb_inside_loop_p (loop, gimple_bb (def_stmt)))
2966 return NULL;
2968 nloop_uses = 0;
2969 auto_vec<gphi *, 3> lcphis;
2970 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, name)
2972 gimple *use_stmt = USE_STMT (use_p);
2973 if (is_gimple_debug (use_stmt))
2974 continue;
2975 if (flow_bb_inside_loop_p (loop, gimple_bb (use_stmt)))
2976 nloop_uses++;
2977 else
2978 /* We can have more than one loop-closed PHI. */
2979 lcphis.safe_push (as_a <gphi *> (use_stmt));
2980 if (nloop_uses > 1)
2982 if (dump_enabled_p ())
2983 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2984 "reduction used in loop.\n");
2985 return NULL;
2989 /* If DEF_STMT is a phi node itself, we expect it to have a single argument
2990 defined in the inner loop. */
2991 if (phi_def)
2993 op1 = PHI_ARG_DEF (def_stmt, 0);
2995 if (gimple_phi_num_args (def_stmt) != 1
2996 || TREE_CODE (op1) != SSA_NAME)
2998 if (dump_enabled_p ())
2999 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3000 "unsupported phi node definition.\n");
3002 return NULL;
3005 def1 = SSA_NAME_DEF_STMT (op1);
3006 if (gimple_bb (def1)
3007 && flow_bb_inside_loop_p (loop, gimple_bb (def_stmt))
3008 && loop->inner
3009 && flow_bb_inside_loop_p (loop->inner, gimple_bb (def1))
3010 && is_gimple_assign (def1)
3011 && flow_bb_inside_loop_p (loop->inner, gimple_bb (phi_use_stmt)))
3013 if (dump_enabled_p ())
3014 report_vect_op (MSG_NOTE, def_stmt,
3015 "detected double reduction: ");
3017 *double_reduc = true;
3018 return def_stmt;
3021 return NULL;
3024	  /* If we are vectorizing an inner reduction, it is executed in the
3025	     original order only when we are not dealing with a double
3026	     reduction.  */
3027 bool check_reduction = true;
3028 if (flow_loop_nested_p (vect_loop, loop))
3030 gphi *lcphi;
3031 unsigned i;
3032 check_reduction = false;
3033 FOR_EACH_VEC_ELT (lcphis, i, lcphi)
3034 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, gimple_phi_result (lcphi))
3036 gimple *use_stmt = USE_STMT (use_p);
3037 if (is_gimple_debug (use_stmt))
3038 continue;
3039 if (! flow_bb_inside_loop_p (vect_loop, gimple_bb (use_stmt)))
3040 check_reduction = true;
3044 bool nested_in_vect_loop = flow_loop_nested_p (vect_loop, loop);
3045 code = orig_code = gimple_assign_rhs_code (def_stmt);
3047	  /* We can handle "res -= x[i]", which is non-associative, by
3048	     simply rewriting it as "res += -x[i]".  Avoid changing the
3049	     gimple instruction during the first simple tests and only do this
3050	     if we're allowed to change the code at all.  */
3051 if (code == MINUS_EXPR && gimple_assign_rhs2 (def_stmt) != phi_name)
3052 code = PLUS_EXPR;
3054 if (code == COND_EXPR)
3056 if (! nested_in_vect_loop)
3057 *v_reduc_type = COND_REDUCTION;
3059 op3 = gimple_assign_rhs1 (def_stmt);
3060 if (COMPARISON_CLASS_P (op3))
3062 op4 = TREE_OPERAND (op3, 1);
3063 op3 = TREE_OPERAND (op3, 0);
3065 if (op3 == phi_name || op4 == phi_name)
3067 if (dump_enabled_p ())
3068 report_vect_op (MSG_MISSED_OPTIMIZATION, def_stmt,
3069 "reduction: condition depends on previous"
3070 " iteration: ");
3071 return NULL;
3074 op1 = gimple_assign_rhs2 (def_stmt);
3075 op2 = gimple_assign_rhs3 (def_stmt);
3077 else if (!commutative_tree_code (code) || !associative_tree_code (code))
3079 if (dump_enabled_p ())
3080 report_vect_op (MSG_MISSED_OPTIMIZATION, def_stmt,
3081 "reduction: not commutative/associative: ");
3082 return NULL;
3084 else if (get_gimple_rhs_class (code) == GIMPLE_BINARY_RHS)
3086 op1 = gimple_assign_rhs1 (def_stmt);
3087 op2 = gimple_assign_rhs2 (def_stmt);
3089 else
3091 if (dump_enabled_p ())
3092 report_vect_op (MSG_MISSED_OPTIMIZATION, def_stmt,
3093 "reduction: not handled operation: ");
3094 return NULL;
3097 if (TREE_CODE (op1) != SSA_NAME && TREE_CODE (op2) != SSA_NAME)
3099 if (dump_enabled_p ())
3100 report_vect_op (MSG_MISSED_OPTIMIZATION, def_stmt,
3101 "reduction: both uses not ssa_names: ");
3103 return NULL;
3106 type = TREE_TYPE (gimple_assign_lhs (def_stmt));
3107 if ((TREE_CODE (op1) == SSA_NAME
3108	       && !types_compatible_p (type, TREE_TYPE (op1)))
3109 || (TREE_CODE (op2) == SSA_NAME
3110 && !types_compatible_p (type, TREE_TYPE (op2)))
3111 || (op3 && TREE_CODE (op3) == SSA_NAME
3112 && !types_compatible_p (type, TREE_TYPE (op3)))
3113 || (op4 && TREE_CODE (op4) == SSA_NAME
3114 && !types_compatible_p (type, TREE_TYPE (op4))))
3116 if (dump_enabled_p ())
3118 dump_printf_loc (MSG_NOTE, vect_location,
3119 "reduction: multiple types: operation type: ");
3120 dump_generic_expr (MSG_NOTE, TDF_SLIM, type);
3121 dump_printf (MSG_NOTE, ", operands types: ");
3122 dump_generic_expr (MSG_NOTE, TDF_SLIM,
3123 TREE_TYPE (op1));
3124 dump_printf (MSG_NOTE, ",");
3125 dump_generic_expr (MSG_NOTE, TDF_SLIM,
3126 TREE_TYPE (op2));
3127 if (op3)
3129 dump_printf (MSG_NOTE, ",");
3130 dump_generic_expr (MSG_NOTE, TDF_SLIM,
3131 TREE_TYPE (op3));
3134 if (op4)
3136 dump_printf (MSG_NOTE, ",");
3137 dump_generic_expr (MSG_NOTE, TDF_SLIM,
3138 TREE_TYPE (op4));
3140 dump_printf (MSG_NOTE, "\n");
3143 return NULL;
3146 /* Check whether it's ok to change the order of the computation.
3147 Generally, when vectorizing a reduction we change the order of the
3148 computation. This may change the behavior of the program in some
3149 cases, so we need to check that this is ok. One exception is when
3150 vectorizing an outer-loop: the inner-loop is executed sequentially,
3151 and therefore vectorizing reductions in the inner-loop during
3152 outer-loop vectorization is safe. */
3153 if (check_reduction
3154 && *v_reduc_type == TREE_CODE_REDUCTION
3155 && needs_fold_left_reduction_p (type, code,
3156 need_wrapping_integral_overflow))
3157 *v_reduc_type = FOLD_LEFT_REDUCTION;
3159 /* Reduction is safe. We're dealing with one of the following:
3160 1) integer arithmetic and no trapv
3161 2) floating point arithmetic, and special flags permit this optimization
3162 3) nested cycle (i.e., outer loop vectorization). */
3163 if (TREE_CODE (op1) == SSA_NAME)
3164 def1 = SSA_NAME_DEF_STMT (op1);
3166 if (TREE_CODE (op2) == SSA_NAME)
3167 def2 = SSA_NAME_DEF_STMT (op2);
3169 if (code != COND_EXPR
3170 && ((!def1 || gimple_nop_p (def1)) && (!def2 || gimple_nop_p (def2))))
3172 if (dump_enabled_p ())
3173 report_vect_op (MSG_NOTE, def_stmt, "reduction: no defs for operands: ");
3174 return NULL;
3177 /* Check that one def is the reduction def, defined by PHI,
3178 the other def is either defined in the loop ("vect_internal_def"),
3179 or it's an induction (defined by a loop-header phi-node). */
3181 if (def2 && def2 == phi
3182 && (code == COND_EXPR
3183 || !def1 || gimple_nop_p (def1)
3184 || !flow_bb_inside_loop_p (loop, gimple_bb (def1))
3185 || (def1 && flow_bb_inside_loop_p (loop, gimple_bb (def1))
3186 && (is_gimple_assign (def1)
3187 || is_gimple_call (def1)
3188 || STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def1))
3189 == vect_induction_def
3190 || (gimple_code (def1) == GIMPLE_PHI
3191 && STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def1))
3192 == vect_internal_def
3193 && !is_loop_header_bb_p (gimple_bb (def1)))))))
3195 if (dump_enabled_p ())
3196 report_vect_op (MSG_NOTE, def_stmt, "detected reduction: ");
3197 return def_stmt;
3200 if (def1 && def1 == phi
3201 && (code == COND_EXPR
3202 || !def2 || gimple_nop_p (def2)
3203 || !flow_bb_inside_loop_p (loop, gimple_bb (def2))
3204 || (def2 && flow_bb_inside_loop_p (loop, gimple_bb (def2))
3205 && (is_gimple_assign (def2)
3206 || is_gimple_call (def2)
3207 || STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def2))
3208 == vect_induction_def
3209 || (gimple_code (def2) == GIMPLE_PHI
3210 && STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def2))
3211 == vect_internal_def
3212 && !is_loop_header_bb_p (gimple_bb (def2)))))))
3214 if (! nested_in_vect_loop && orig_code != MINUS_EXPR)
3216 /* Check if we can swap operands (just for simplicity - so that
3217 the rest of the code can assume that the reduction variable
3218 is always the last (second) argument). */
3219 if (code == COND_EXPR)
3221 /* Swap cond_expr by inverting the condition. */
3222 tree cond_expr = gimple_assign_rhs1 (def_stmt);
3223 enum tree_code invert_code = ERROR_MARK;
3224 enum tree_code cond_code = TREE_CODE (cond_expr);
3226 if (TREE_CODE_CLASS (cond_code) == tcc_comparison)
3228 bool honor_nans = HONOR_NANS (TREE_OPERAND (cond_expr, 0));
3229 invert_code = invert_tree_comparison (cond_code, honor_nans);
3231 if (invert_code != ERROR_MARK)
3233 TREE_SET_CODE (cond_expr, invert_code);
3234 swap_ssa_operands (def_stmt,
3235 gimple_assign_rhs2_ptr (def_stmt),
3236 gimple_assign_rhs3_ptr (def_stmt));
3238 else
3240 if (dump_enabled_p ())
3241 report_vect_op (MSG_NOTE, def_stmt,
3242 "detected reduction: cannot swap operands "
3243 "for cond_expr");
3244 return NULL;
3247 else
3248 swap_ssa_operands (def_stmt, gimple_assign_rhs1_ptr (def_stmt),
3249 gimple_assign_rhs2_ptr (def_stmt));
3251 if (dump_enabled_p ())
3252 report_vect_op (MSG_NOTE, def_stmt,
3253 "detected reduction: need to swap operands: ");
3255 if (CONSTANT_CLASS_P (gimple_assign_rhs1 (def_stmt)))
3256 LOOP_VINFO_OPERANDS_SWAPPED (loop_info) = true;
3258 else
3260 if (dump_enabled_p ())
3261 report_vect_op (MSG_NOTE, def_stmt, "detected reduction: ");
3264 return def_stmt;
3267 /* Try to find SLP reduction chain. */
3268 if (! nested_in_vect_loop
3269 && code != COND_EXPR
3270 && orig_code != MINUS_EXPR
3271 && vect_is_slp_reduction (loop_info, phi, def_stmt))
3273 if (dump_enabled_p ())
3274 report_vect_op (MSG_NOTE, def_stmt,
3275 "reduction: detected reduction chain: ");
3277 return def_stmt;
3280	  /* Dissolve a group possibly half-built by vect_is_slp_reduction.  */
3281 gimple *first = GROUP_FIRST_ELEMENT (vinfo_for_stmt (def_stmt));
3282 while (first)
3284 gimple *next = GROUP_NEXT_ELEMENT (vinfo_for_stmt (first));
3285 GROUP_FIRST_ELEMENT (vinfo_for_stmt (first)) = NULL;
3286 GROUP_NEXT_ELEMENT (vinfo_for_stmt (first)) = NULL;
3287 first = next;
3290 /* Look for the expression computing loop_arg from loop PHI result. */
3291 if (check_reduction_path (vect_location, loop, as_a <gphi *> (phi), loop_arg,
3292 code))
3293 return def_stmt;
3295 if (dump_enabled_p ())
3297 report_vect_op (MSG_MISSED_OPTIMIZATION, def_stmt,
3298 "reduction: unknown pattern: ");
3301 return NULL;
3304 /* Wrapper around vect_is_simple_reduction, which will modify code
3305	   in-place if it enables detection of more reductions.  The arguments
3306	   are the same as for vect_is_simple_reduction.  */
3308 gimple *
3309 vect_force_simple_reduction (loop_vec_info loop_info, gimple *phi,
3310 bool *double_reduc,
3311 bool need_wrapping_integral_overflow)
3313 enum vect_reduction_type v_reduc_type;
3314 gimple *def = vect_is_simple_reduction (loop_info, phi, double_reduc,
3315 need_wrapping_integral_overflow,
3316 &v_reduc_type);
3317 if (def)
3319 stmt_vec_info reduc_def_info = vinfo_for_stmt (phi);
3320 STMT_VINFO_REDUC_TYPE (reduc_def_info) = v_reduc_type;
3321 STMT_VINFO_REDUC_DEF (reduc_def_info) = def;
3322 reduc_def_info = vinfo_for_stmt (def);
3323 STMT_VINFO_REDUC_TYPE (reduc_def_info) = v_reduc_type;
3324 STMT_VINFO_REDUC_DEF (reduc_def_info) = phi;
3326 return def;
3329 /* Calculate cost of peeling the loop PEEL_ITERS_PROLOGUE times. */
3331 vect_get_known_peeling_cost (loop_vec_info loop_vinfo, int peel_iters_prologue,
3332 int *peel_iters_epilogue,
3333 stmt_vector_for_cost *scalar_cost_vec,
3334 stmt_vector_for_cost *prologue_cost_vec,
3335 stmt_vector_for_cost *epilogue_cost_vec)
3337 int retval = 0;
3338 int assumed_vf = vect_vf_for_cost (loop_vinfo);
3340 if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
3342 *peel_iters_epilogue = assumed_vf / 2;
3343 if (dump_enabled_p ())
3344 dump_printf_loc (MSG_NOTE, vect_location,
3345 "cost model: epilogue peel iters set to vf/2 "
3346	                         "because loop iterations are unknown.\n");
3348	      /* If peeled iterations are known but the number of scalar loop
3349	         iterations is unknown, count a taken branch per peeled loop.  */
3350 retval = record_stmt_cost (prologue_cost_vec, 1, cond_branch_taken,
3351 NULL, 0, vect_prologue);
3352 retval = record_stmt_cost (prologue_cost_vec, 1, cond_branch_taken,
3353 NULL, 0, vect_epilogue);
3355 else
3357 int niters = LOOP_VINFO_INT_NITERS (loop_vinfo);
3358 peel_iters_prologue = niters < peel_iters_prologue ?
3359 niters : peel_iters_prologue;
3360 *peel_iters_epilogue = (niters - peel_iters_prologue) % assumed_vf;
3361	      /* If we need to peel for gaps but no epilogue peeling would otherwise
3362	         be required, we have to peel VF iterations.  */
3363 if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo) && !*peel_iters_epilogue)
3364 *peel_iters_epilogue = assumed_vf;
3367 stmt_info_for_cost *si;
3368 int j;
3369 if (peel_iters_prologue)
3370 FOR_EACH_VEC_ELT (*scalar_cost_vec, j, si)
3372 stmt_vec_info stmt_info
3373 = si->stmt ? vinfo_for_stmt (si->stmt) : NULL;
3374 retval += record_stmt_cost (prologue_cost_vec,
3375 si->count * peel_iters_prologue,
3376 si->kind, stmt_info, si->misalign,
3377 vect_prologue);
3379 if (*peel_iters_epilogue)
3380 FOR_EACH_VEC_ELT (*scalar_cost_vec, j, si)
3382 stmt_vec_info stmt_info
3383 = si->stmt ? vinfo_for_stmt (si->stmt) : NULL;
3384 retval += record_stmt_cost (epilogue_cost_vec,
3385 si->count * *peel_iters_epilogue,
3386 si->kind, stmt_info, si->misalign,
3387 vect_epilogue);
3390 return retval;
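/* Worked example (numbers made up): with a known iteration count of 100,
   3 prologue peel iterations and an assumed VF of 8, the epilogue gets
   (100 - 3) % 8 = 1 peel iteration; each scalar statement cost is then
   charged 3 times to the prologue and once to the epilogue.  */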
3393 /* Function vect_estimate_min_profitable_iters
3395 Return the number of iterations required for the vector version of the
3396 loop to be profitable relative to the cost of the scalar version of the
3397 loop.
3399 *RET_MIN_PROFITABLE_NITERS is a cost model profitability threshold
3400 of iterations for vectorization. -1 value means loop vectorization
3401 is not profitable. This returned value may be used for dynamic
3402 profitability check.
3404 *RET_MIN_PROFITABLE_ESTIMATE is a profitability threshold to be used
3405 for static check against estimated number of iterations. */
3407 static void
3408 vect_estimate_min_profitable_iters (loop_vec_info loop_vinfo,
3409 int *ret_min_profitable_niters,
3410 int *ret_min_profitable_estimate)
3412 int min_profitable_iters;
3413 int min_profitable_estimate;
3414 int peel_iters_prologue;
3415 int peel_iters_epilogue;
3416 unsigned vec_inside_cost = 0;
3417 int vec_outside_cost = 0;
3418 unsigned vec_prologue_cost = 0;
3419 unsigned vec_epilogue_cost = 0;
3420 int scalar_single_iter_cost = 0;
3421 int scalar_outside_cost = 0;
3422 int assumed_vf = vect_vf_for_cost (loop_vinfo);
3423 int npeel = LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo);
3424 void *target_cost_data = LOOP_VINFO_TARGET_COST_DATA (loop_vinfo);
3426 /* Cost model disabled. */
3427 if (unlimited_cost_model (LOOP_VINFO_LOOP (loop_vinfo)))
3429 dump_printf_loc (MSG_NOTE, vect_location, "cost model disabled.\n");
3430 *ret_min_profitable_niters = 0;
3431 *ret_min_profitable_estimate = 0;
3432 return;
3435 /* Requires loop versioning tests to handle misalignment. */
3436 if (LOOP_REQUIRES_VERSIONING_FOR_ALIGNMENT (loop_vinfo))
3438 /* FIXME: Make cost depend on complexity of individual check. */
3439 unsigned len = LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).length ();
3440 (void) add_stmt_cost (target_cost_data, len, vector_stmt, NULL, 0,
3441 vect_prologue);
3442 dump_printf (MSG_NOTE,
3443 "cost model: Adding cost of checks for loop "
3444 "versioning to treat misalignment.\n");
3447 /* Requires loop versioning with alias checks. */
3448 if (LOOP_REQUIRES_VERSIONING_FOR_ALIAS (loop_vinfo))
3450 /* FIXME: Make cost depend on complexity of individual check. */
3451 unsigned len = LOOP_VINFO_COMP_ALIAS_DDRS (loop_vinfo).length ();
3452 (void) add_stmt_cost (target_cost_data, len, vector_stmt, NULL, 0,
3453 vect_prologue);
3454 len = LOOP_VINFO_CHECK_UNEQUAL_ADDRS (loop_vinfo).length ();
3455 if (len)
3456 /* Count LEN - 1 ANDs and LEN comparisons. */
3457 (void) add_stmt_cost (target_cost_data, len * 2 - 1, scalar_stmt,
3458 NULL, 0, vect_prologue);
3459 len = LOOP_VINFO_LOWER_BOUNDS (loop_vinfo).length ();
3460 if (len)
3462 /* Count LEN - 1 ANDs and LEN comparisons. */
3463 unsigned int nstmts = len * 2 - 1;
3464 /* +1 for each bias that needs adding. */
3465 for (unsigned int i = 0; i < len; ++i)
3466 if (!LOOP_VINFO_LOWER_BOUNDS (loop_vinfo)[i].unsigned_p)
3467 nstmts += 1;
3468 (void) add_stmt_cost (target_cost_data, nstmts, scalar_stmt,
3469 NULL, 0, vect_prologue);
3471 dump_printf (MSG_NOTE,
3472 "cost model: Adding cost of checks for loop "
3473 "versioning aliasing.\n");
3476 /* Requires loop versioning with niter checks. */
3477 if (LOOP_REQUIRES_VERSIONING_FOR_NITERS (loop_vinfo))
3479 /* FIXME: Make cost depend on complexity of individual check. */
3480 (void) add_stmt_cost (target_cost_data, 1, vector_stmt, NULL, 0,
3481 vect_prologue);
3482 dump_printf (MSG_NOTE,
3483 "cost model: Adding cost of checks for loop "
3484 "versioning niters.\n");
3487 if (LOOP_REQUIRES_VERSIONING (loop_vinfo))
3488 (void) add_stmt_cost (target_cost_data, 1, cond_branch_taken, NULL, 0,
3489 vect_prologue);
3491 /* Count statements in scalar loop. Using this as scalar cost for a single
3492 iteration for now.
3494 TODO: Add outer loop support.
3496 TODO: Consider assigning different costs to different scalar
3497 statements. */
3499 scalar_single_iter_cost
3500 = LOOP_VINFO_SINGLE_SCALAR_ITERATION_COST (loop_vinfo);
3502 /* Add additional cost for the peeled instructions in prologue and epilogue
3503 loop. (For fully-masked loops there will be no peeling.)
3505 FORNOW: If we don't know the value of peel_iters for prologue or epilogue
3506	     at compile-time, we assume it's vf/2 (the worst would be vf-1).
3508 TODO: Build an expression that represents peel_iters for prologue and
3509 epilogue to be used in a run-time test. */
3511 if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
3513 peel_iters_prologue = 0;
3514 peel_iters_epilogue = 0;
3516 if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo))
3518 /* We need to peel exactly one iteration. */
3519 peel_iters_epilogue += 1;
3520 stmt_info_for_cost *si;
3521 int j;
3522 FOR_EACH_VEC_ELT (LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo),
3523 j, si)
3525 struct _stmt_vec_info *stmt_info
3526 = si->stmt ? vinfo_for_stmt (si->stmt) : NULL;
3527 (void) add_stmt_cost (target_cost_data, si->count,
3528 si->kind, stmt_info, si->misalign,
3529 vect_epilogue);
3533 else if (npeel < 0)
3535 peel_iters_prologue = assumed_vf / 2;
3536 dump_printf (MSG_NOTE, "cost model: "
3537 "prologue peel iters set to vf/2.\n");
3539	      /* If peeling for alignment is unknown, the loop bound of the main
3540	         loop becomes unknown.  */
3541 peel_iters_epilogue = assumed_vf / 2;
3542 dump_printf (MSG_NOTE, "cost model: "
3543 "epilogue peel iters set to vf/2 because "
3544 "peeling for alignment is unknown.\n");
3546 /* If peeled iterations are unknown, count a taken branch and a not taken
3547 branch per peeled loop. Even if scalar loop iterations are known,
3548 vector iterations are not known since peeled prologue iterations are
3549 not known. Hence guards remain the same. */
3550 (void) add_stmt_cost (target_cost_data, 1, cond_branch_taken,
3551 NULL, 0, vect_prologue);
3552 (void) add_stmt_cost (target_cost_data, 1, cond_branch_not_taken,
3553 NULL, 0, vect_prologue);
3554 (void) add_stmt_cost (target_cost_data, 1, cond_branch_taken,
3555 NULL, 0, vect_epilogue);
3556 (void) add_stmt_cost (target_cost_data, 1, cond_branch_not_taken,
3557 NULL, 0, vect_epilogue);
3558 stmt_info_for_cost *si;
3559 int j;
3560 FOR_EACH_VEC_ELT (LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo), j, si)
3562 struct _stmt_vec_info *stmt_info
3563 = si->stmt ? vinfo_for_stmt (si->stmt) : NULL;
3564 (void) add_stmt_cost (target_cost_data,
3565 si->count * peel_iters_prologue,
3566 si->kind, stmt_info, si->misalign,
3567 vect_prologue);
3568 (void) add_stmt_cost (target_cost_data,
3569 si->count * peel_iters_epilogue,
3570 si->kind, stmt_info, si->misalign,
3571 vect_epilogue);
3574 else
3576 stmt_vector_for_cost prologue_cost_vec, epilogue_cost_vec;
3577 stmt_info_for_cost *si;
3578 int j;
3579 void *data = LOOP_VINFO_TARGET_COST_DATA (loop_vinfo);
3581 prologue_cost_vec.create (2);
3582 epilogue_cost_vec.create (2);
3583 peel_iters_prologue = npeel;
3585 (void) vect_get_known_peeling_cost (loop_vinfo, peel_iters_prologue,
3586 &peel_iters_epilogue,
3587 &LOOP_VINFO_SCALAR_ITERATION_COST
3588 (loop_vinfo),
3589 &prologue_cost_vec,
3590 &epilogue_cost_vec);
3592 FOR_EACH_VEC_ELT (prologue_cost_vec, j, si)
3594 struct _stmt_vec_info *stmt_info
3595 = si->stmt ? vinfo_for_stmt (si->stmt) : NULL;
3596 (void) add_stmt_cost (data, si->count, si->kind, stmt_info,
3597 si->misalign, vect_prologue);
3600 FOR_EACH_VEC_ELT (epilogue_cost_vec, j, si)
3602 struct _stmt_vec_info *stmt_info
3603 = si->stmt ? vinfo_for_stmt (si->stmt) : NULL;
3604 (void) add_stmt_cost (data, si->count, si->kind, stmt_info,
3605 si->misalign, vect_epilogue);
3608 prologue_cost_vec.release ();
3609 epilogue_cost_vec.release ();
3612 /* FORNOW: The scalar outside cost is incremented in one of the
3613 following ways:
3615 1. The vectorizer checks for alignment and aliasing and generates
3616 a condition that allows dynamic vectorization. A cost model
3617 check is ANDED with the versioning condition. Hence scalar code
3618 path now has the added cost of the versioning check.
3620 if (cost > th & versioning_check)
3621 jmp to vector code
3623 Hence run-time scalar is incremented by not-taken branch cost.
3625 2. The vectorizer then checks if a prologue is required. If the
3626 cost model check was not done before during versioning, it has to
3627 be done before the prologue check.
3629 if (cost <= th)
3630 prologue = scalar_iters
3631 if (prologue == 0)
3632 jmp to vector code
3633 else
3634 execute prologue
3635 if (prologue == num_iters)
3636 go to exit
3638 Hence the run-time scalar cost is incremented by a taken branch,
3639 plus a not-taken branch, plus a taken branch cost.
3641 3. The vectorizer then checks if an epilogue is required. If the
3642 cost model check was not done before during prologue check, it
3643 has to be done with the epilogue check.
3645 if (prologue == 0)
3646 jmp to vector code
3647 else
3648 execute prologue
3649 if (prologue == num_iters)
3650 go to exit
3651 vector code:
3652 if ((cost <= th) | (scalar_iters-prologue-epilogue == 0))
3653 jmp to epilogue
3655 Hence the run-time scalar cost should be incremented by 2 taken
3656 branches.
3658 TODO: The back end may reorder the BBS's differently and reverse
3659 conditions/branch directions. Change the estimates below to
3660 something more reasonable. */
3662 /* If the number of iterations is known and we do not do versioning, we can
3663 decide whether to vectorize at compile time. Hence the scalar version
3664	     does not carry cost model guard costs.  */
3665 if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
3666 || LOOP_REQUIRES_VERSIONING (loop_vinfo))
3668 /* Cost model check occurs at versioning. */
3669 if (LOOP_REQUIRES_VERSIONING (loop_vinfo))
3670 scalar_outside_cost += vect_get_stmt_cost (cond_branch_not_taken);
3671 else
3673 /* Cost model check occurs at prologue generation. */
3674 if (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) < 0)
3675 scalar_outside_cost += 2 * vect_get_stmt_cost (cond_branch_taken)
3676 + vect_get_stmt_cost (cond_branch_not_taken);
3677 /* Cost model check occurs at epilogue generation. */
3678 else
3679 scalar_outside_cost += 2 * vect_get_stmt_cost (cond_branch_taken);
3683 /* Complete the target-specific cost calculations. */
3684 finish_cost (LOOP_VINFO_TARGET_COST_DATA (loop_vinfo), &vec_prologue_cost,
3685 &vec_inside_cost, &vec_epilogue_cost);
3687 vec_outside_cost = (int)(vec_prologue_cost + vec_epilogue_cost);
3689 if (dump_enabled_p ())
3691 dump_printf_loc (MSG_NOTE, vect_location, "Cost model analysis: \n");
3692 dump_printf (MSG_NOTE, " Vector inside of loop cost: %d\n",
3693 vec_inside_cost);
3694 dump_printf (MSG_NOTE, " Vector prologue cost: %d\n",
3695 vec_prologue_cost);
3696 dump_printf (MSG_NOTE, " Vector epilogue cost: %d\n",
3697 vec_epilogue_cost);
3698 dump_printf (MSG_NOTE, " Scalar iteration cost: %d\n",
3699 scalar_single_iter_cost);
3700 dump_printf (MSG_NOTE, " Scalar outside cost: %d\n",
3701 scalar_outside_cost);
3702 dump_printf (MSG_NOTE, " Vector outside cost: %d\n",
3703 vec_outside_cost);
3704 dump_printf (MSG_NOTE, " prologue iterations: %d\n",
3705 peel_iters_prologue);
3706 dump_printf (MSG_NOTE, " epilogue iterations: %d\n",
3707 peel_iters_epilogue);
3710 /* Calculate number of iterations required to make the vector version
3711 profitable, relative to the loop bodies only. The following condition
3712 must hold true:
3713 SIC * niters + SOC > VIC * ((niters-PL_ITERS-EP_ITERS)/VF) + VOC
3714 where
3715 SIC = scalar iteration cost, VIC = vector iteration cost,
3716 VOC = vector outside cost, VF = vectorization factor,
3717 PL_ITERS = prologue iterations, EP_ITERS= epilogue iterations
3718 SOC = scalar outside cost for run time cost model check. */
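 /* Rearranging the inequality above (multiply both sides by VF and solve
    for niters) gives

      niters > ((VOC - SOC) * VF - VIC * (PL_ITERS + EP_ITERS))
               / (SIC * VF - VIC)

    which is the computation performed below when SIC * VF > VIC.  */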
3720 if ((scalar_single_iter_cost * assumed_vf) > (int) vec_inside_cost)
3722 min_profitable_iters = ((vec_outside_cost - scalar_outside_cost)
3723 * assumed_vf
3724 - vec_inside_cost * peel_iters_prologue
3725 - vec_inside_cost * peel_iters_epilogue);
3726 if (min_profitable_iters <= 0)
3727 min_profitable_iters = 0;
3728 else
3730 min_profitable_iters /= ((scalar_single_iter_cost * assumed_vf)
3731 - vec_inside_cost);
3733 if ((scalar_single_iter_cost * assumed_vf * min_profitable_iters)
3734 <= (((int) vec_inside_cost * min_profitable_iters)
3735 + (((int) vec_outside_cost - scalar_outside_cost)
3736 * assumed_vf)))
3737 min_profitable_iters++;
3740 /* vector version will never be profitable. */
3741 else
3743 if (LOOP_VINFO_LOOP (loop_vinfo)->force_vectorize)
3744 warning_at (vect_location, OPT_Wopenmp_simd, "vectorization "
3745 "did not happen for a simd loop");
3747 if (dump_enabled_p ())
3748 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3749 "cost model: the vector iteration cost = %d "
3750 "divided by the scalar iteration cost = %d "
3751 "is greater or equal to the vectorization factor = %d"
3752 ".\n",
3753 vec_inside_cost, scalar_single_iter_cost, assumed_vf);
3754 *ret_min_profitable_niters = -1;
3755 *ret_min_profitable_estimate = -1;
3756 return;
3759 dump_printf (MSG_NOTE,
3760 " Calculated minimum iters for profitability: %d\n",
3761 min_profitable_iters);
3763 if (!LOOP_VINFO_FULLY_MASKED_P (loop_vinfo)
3764 && min_profitable_iters < (assumed_vf + peel_iters_prologue))
3765 /* We want the vectorized loop to execute at least once. */
3766 min_profitable_iters = assumed_vf + peel_iters_prologue;
3768 if (dump_enabled_p ())
3769 dump_printf_loc (MSG_NOTE, vect_location,
3770 " Runtime profitability threshold = %d\n",
3771 min_profitable_iters);
3773 *ret_min_profitable_niters = min_profitable_iters;
3775 /* Calculate number of iterations required to make the vector version
3776 profitable, relative to the loop bodies only.
3778 The non-vectorized variant costs SIC * niters and it must win over the vector
3779 variant on the expected loop trip count.  The following condition must hold true:
3780 SIC * niters > VIC * ((niters-PL_ITERS-EP_ITERS)/VF) + VOC + SOC */
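 /* Solving for niters as above gives

      niters > ((VOC + SOC) * VF - VIC * (PL_ITERS + EP_ITERS))
               / (SIC * VF - VIC)

    which is the computation performed below whenever the vector outside
    cost is positive.  */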
3782 if (vec_outside_cost <= 0)
3783 min_profitable_estimate = 0;
3784 else
3786 min_profitable_estimate = ((vec_outside_cost + scalar_outside_cost)
3787 * assumed_vf
3788 - vec_inside_cost * peel_iters_prologue
3789 - vec_inside_cost * peel_iters_epilogue)
3790 / ((scalar_single_iter_cost * assumed_vf)
3791 - vec_inside_cost);
3793 min_profitable_estimate = MAX (min_profitable_estimate, min_profitable_iters);
3794 if (dump_enabled_p ())
3795 dump_printf_loc (MSG_NOTE, vect_location,
3796 " Static estimate profitability threshold = %d\n",
3797 min_profitable_estimate);
3799 *ret_min_profitable_estimate = min_profitable_estimate;
3802 /* Writes into SEL a mask for a vec_perm, equivalent to a vec_shr by OFFSET
3803 vector elements (not bits) for a vector with NELT elements. */
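 /* For example (illustrative): with OFFSET == 2 and NELT == 8 the encoded
    series { 2, 3, 4 } expands to { 2, 3, 4, 5, 6, 7, 8, 9 }.  Selectors
    that are >= NELT pick elements of the second input of a two-input
    permutation, so permuting { V, zero-vector } with this mask shifts V
    down by two elements and fills the tail with zeros.  */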
3804 static void
3805 calc_vec_perm_mask_for_shift (unsigned int offset, unsigned int nelt,
3806 vec_perm_builder *sel)
3808 /* The encoding is a single stepped pattern. Any wrap-around is handled
3809 by vec_perm_indices. */
3810 sel->new_vector (nelt, 1, 3);
3811 for (unsigned int i = 0; i < 3; i++)
3812 sel->quick_push (i + offset);
3815 /* Checks whether the target supports whole-vector shifts for vectors of mode
3816 MODE. This is the case if _either_ the platform handles vec_shr_optab, _or_
3817 it supports vec_perm_const with masks for all necessary shift amounts. */
3818 static bool
3819 have_whole_vector_shift (machine_mode mode)
3821 if (optab_handler (vec_shr_optab, mode) != CODE_FOR_nothing)
3822 return true;
3824 /* Variable-length vectors should be handled via the optab. */
3825 unsigned int nelt;
3826 if (!GET_MODE_NUNITS (mode).is_constant (&nelt))
3827 return false;
3829 vec_perm_builder sel;
3830 vec_perm_indices indices;
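 /* These are exactly the shift amounts (nelt/2, nelt/4, ..., 1 elements)
    that the shift-based reduction epilogue may ask for; check that each
    of them can be done as a constant permutation.  */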
3831 for (unsigned int i = nelt / 2; i >= 1; i /= 2)
3833 calc_vec_perm_mask_for_shift (i, nelt, &sel);
3834 indices.new_vector (sel, 2, nelt);
3835 if (!can_vec_perm_const_p (mode, indices, false))
3836 return false;
3838 return true;
3841 /* TODO: There is a close dependency between the vect_model_*_cost and
3842 vectorizable_* functions.  Improve the design to avoid maintenance issues.  */
3844 /* Function vect_model_reduction_cost.
3846 Models cost for a reduction operation, including the vector ops
3847 generated within the strip-mine loop, the initial definition before
3848 the loop, and the epilogue code that must be generated. */
3850 static void
3851 vect_model_reduction_cost (stmt_vec_info stmt_info, internal_fn reduc_fn,
3852 int ncopies, stmt_vector_for_cost *cost_vec)
3854 int prologue_cost = 0, epilogue_cost = 0, inside_cost;
3855 enum tree_code code;
3856 optab optab;
3857 tree vectype;
3858 gimple *orig_stmt;
3859 machine_mode mode;
3860 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
3861 struct loop *loop = NULL;
3863 if (loop_vinfo)
3864 loop = LOOP_VINFO_LOOP (loop_vinfo);
3866 /* Condition reductions generate two reductions in the loop. */
3867 vect_reduction_type reduction_type
3868 = STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info);
3869 if (reduction_type == COND_REDUCTION)
3870 ncopies *= 2;
3872 vectype = STMT_VINFO_VECTYPE (stmt_info);
3873 mode = TYPE_MODE (vectype);
3874 orig_stmt = STMT_VINFO_RELATED_STMT (stmt_info);
3876 if (!orig_stmt)
3877 orig_stmt = STMT_VINFO_STMT (stmt_info);
3879 code = gimple_assign_rhs_code (orig_stmt);
3881 if (reduction_type == EXTRACT_LAST_REDUCTION
3882 || reduction_type == FOLD_LEFT_REDUCTION)
3884 /* No extra instructions needed in the prologue. */
3885 prologue_cost = 0;
3887 if (reduction_type == EXTRACT_LAST_REDUCTION || reduc_fn != IFN_LAST)
3888 /* Count one reduction-like operation per vector. */
3889 inside_cost = record_stmt_cost (cost_vec, ncopies, vec_to_scalar,
3890 stmt_info, 0, vect_body);
3891 else
3893 /* Use NELEMENTS extracts and NELEMENTS scalar ops. */
3894 unsigned int nelements = ncopies * vect_nunits_for_cost (vectype);
3895 inside_cost = record_stmt_cost (cost_vec, nelements,
3896 vec_to_scalar, stmt_info, 0,
3897 vect_body);
3898 inside_cost += record_stmt_cost (cost_vec, nelements,
3899 scalar_stmt, stmt_info, 0,
3900 vect_body);
3903 else
3905 /* Add in cost for initial definition.
3906 For cond reduction we have four vectors: initial index, step,
3907 initial result of the data reduction, initial value of the index
3908 reduction. */
3909 int prologue_stmts = reduction_type == COND_REDUCTION ? 4 : 1;
3910 prologue_cost += record_stmt_cost (cost_vec, prologue_stmts,
3911 scalar_to_vec, stmt_info, 0,
3912 vect_prologue);
3914 /* Cost of reduction op inside loop. */
3915 inside_cost = record_stmt_cost (cost_vec, ncopies, vector_stmt,
3916 stmt_info, 0, vect_body);
3919 /* Determine cost of epilogue code.
3921 We have a reduction operator that will reduce the vector in one statement.
3922 Also requires scalar extract. */
3924 if (!loop || !nested_in_vect_loop_p (loop, orig_stmt))
3926 if (reduc_fn != IFN_LAST)
3928 if (reduction_type == COND_REDUCTION)
3930 /* An EQ stmt and a COND_EXPR stmt.  */
3931 epilogue_cost += record_stmt_cost (cost_vec, 2,
3932 vector_stmt, stmt_info, 0,
3933 vect_epilogue);
3934 /* Reduction of the max index and a reduction of the found
3935 values. */
3936 epilogue_cost += record_stmt_cost (cost_vec, 2,
3937 vec_to_scalar, stmt_info, 0,
3938 vect_epilogue);
3939 /* A broadcast of the max value. */
3940 epilogue_cost += record_stmt_cost (cost_vec, 1,
3941 scalar_to_vec, stmt_info, 0,
3942 vect_epilogue);
3944 else
3946 epilogue_cost += record_stmt_cost (cost_vec, 1, vector_stmt,
3947 stmt_info, 0, vect_epilogue);
3948 epilogue_cost += record_stmt_cost (cost_vec, 1,
3949 vec_to_scalar, stmt_info, 0,
3950 vect_epilogue);
3953 else if (reduction_type == COND_REDUCTION)
3955 unsigned estimated_nunits = vect_nunits_for_cost (vectype);
3956 /* Extraction of scalar elements. */
3957 epilogue_cost += record_stmt_cost (cost_vec,
3958 2 * estimated_nunits,
3959 vec_to_scalar, stmt_info, 0,
3960 vect_epilogue);
3961 /* Scalar max reductions via COND_EXPR / MAX_EXPR. */
3962 epilogue_cost += record_stmt_cost (cost_vec,
3963 2 * estimated_nunits - 3,
3964 scalar_stmt, stmt_info, 0,
3965 vect_epilogue);
3967 else if (reduction_type == EXTRACT_LAST_REDUCTION
3968 || reduction_type == FOLD_LEFT_REDUCTION)
3969 /* No extra instructions are needed in the epilogue.  */
3971 else
3973 int vec_size_in_bits = tree_to_uhwi (TYPE_SIZE (vectype));
3974 tree bitsize =
3975 TYPE_SIZE (TREE_TYPE (gimple_assign_lhs (orig_stmt)));
3976 int element_bitsize = tree_to_uhwi (bitsize);
3977 int nelements = vec_size_in_bits / element_bitsize;
3979 if (code == COND_EXPR)
3980 code = MAX_EXPR;
3982 optab = optab_for_tree_code (code, vectype, optab_default);
3984 /* We have a whole vector shift available. */
3985 if (optab != unknown_optab
3986 && VECTOR_MODE_P (mode)
3987 && optab_handler (optab, mode) != CODE_FOR_nothing
3988 && have_whole_vector_shift (mode))
3990 /* Final reduction via vector shifts and the reduction operator.
3991 Also requires scalar extract. */
3992 epilogue_cost += record_stmt_cost (cost_vec,
3993 exact_log2 (nelements) * 2,
3994 vector_stmt, stmt_info, 0,
3995 vect_epilogue);
3996 epilogue_cost += record_stmt_cost (cost_vec, 1,
3997 vec_to_scalar, stmt_info, 0,
3998 vect_epilogue);
4000 else
4001 /* Use extracts and reduction op for final reduction. For N
4002 elements, we have N extracts and N-1 reduction ops. */
4003 epilogue_cost += record_stmt_cost (cost_vec,
4004 nelements + nelements - 1,
4005 vector_stmt, stmt_info, 0,
4006 vect_epilogue);
4010 if (dump_enabled_p ())
4011 dump_printf (MSG_NOTE,
4012 "vect_model_reduction_cost: inside_cost = %d, "
4013 "prologue_cost = %d, epilogue_cost = %d .\n", inside_cost,
4014 prologue_cost, epilogue_cost);
4018 /* Function vect_model_induction_cost.
4020 Models cost for induction operations. */
4022 static void
4023 vect_model_induction_cost (stmt_vec_info stmt_info, int ncopies,
4024 stmt_vector_for_cost *cost_vec)
4026 unsigned inside_cost, prologue_cost;
4028 if (PURE_SLP_STMT (stmt_info))
4029 return;
4031 /* loop cost for vec_loop. */
4032 inside_cost = record_stmt_cost (cost_vec, ncopies, vector_stmt,
4033 stmt_info, 0, vect_body);
4035 /* prologue cost for vec_init and vec_step. */
4036 prologue_cost = record_stmt_cost (cost_vec, 2, scalar_to_vec,
4037 stmt_info, 0, vect_prologue);
4039 if (dump_enabled_p ())
4040 dump_printf_loc (MSG_NOTE, vect_location,
4041 "vect_model_induction_cost: inside_cost = %d, "
4042 "prologue_cost = %d .\n", inside_cost, prologue_cost);
4047 /* Function get_initial_def_for_reduction
4049 Input:
4050 STMT - a stmt that performs a reduction operation in the loop.
4051 INIT_VAL - the initial value of the reduction variable
4053 Output:
4054 ADJUSTMENT_DEF - a tree that holds a value to be added to the final result
4055 of the reduction (used for adjusting the epilog - see below).
4056 Return a vector variable, initialized according to the operation that STMT
4057 performs. This vector will be used as the initial value of the
4058 vector of partial results.
4060 Option1 (adjust in epilog): Initialize the vector as follows:
4061 add/bit or/xor: [0,0,...,0,0]
4062 mult/bit and: [1,1,...,1,1]
4063 min/max/cond_expr: [init_val,init_val,..,init_val,init_val]
4064 and when necessary (e.g. add/mult case) let the caller know
4065 that it needs to adjust the result by init_val.
4067 Option2: Initialize the vector as follows:
4068 add/bit or/xor: [init_val,0,0,...,0]
4069 mult/bit and: [init_val,1,1,...,1]
4070 min/max/cond_expr: [init_val,init_val,...,init_val]
4071 and no adjustments are needed.
4073 For example, for the following code:
4075 s = init_val;
4076 for (i=0;i<n;i++)
4077 s = s + a[i];
4079 STMT is 's = s + a[i]', and the reduction variable is 's'.
4080 For a vector of 4 units, we want to return either [0,0,0,init_val],
4081 or [0,0,0,0] and let the caller know that it needs to adjust
4082 the result at the end by 'init_val'.
4084 FORNOW, we use the 'adjust in epilog' scheme (Option1) when ADJUSTMENT_DEF
4085 is not NULL, because this way the initialization vector is simpler (same
4086 element in all entries), and Option2 otherwise.
4088 A cost model should help decide between these two schemes. */
4090 tree
4091 get_initial_def_for_reduction (gimple *stmt, tree init_val,
4092 tree *adjustment_def)
4094 stmt_vec_info stmt_vinfo = vinfo_for_stmt (stmt);
4095 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_vinfo);
4096 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
4097 tree scalar_type = TREE_TYPE (init_val);
4098 tree vectype = get_vectype_for_scalar_type (scalar_type);
4099 enum tree_code code = gimple_assign_rhs_code (stmt);
4100 tree def_for_init;
4101 tree init_def;
4102 bool nested_in_vect_loop = false;
4103 REAL_VALUE_TYPE real_init_val = dconst0;
4104 int int_init_val = 0;
4105 gimple *def_stmt = NULL;
4106 gimple_seq stmts = NULL;
4108 gcc_assert (vectype);
4110 gcc_assert (POINTER_TYPE_P (scalar_type) || INTEGRAL_TYPE_P (scalar_type)
4111 || SCALAR_FLOAT_TYPE_P (scalar_type));
4113 if (nested_in_vect_loop_p (loop, stmt))
4114 nested_in_vect_loop = true;
4115 else
4116 gcc_assert (loop == (gimple_bb (stmt))->loop_father);
4118 /* In case of double reduction we only create a vector variable to be put
4119 in the reduction phi node. The actual statement creation is done in
4120 vect_create_epilog_for_reduction. */
4121 if (adjustment_def && nested_in_vect_loop
4122 && TREE_CODE (init_val) == SSA_NAME
4123 && (def_stmt = SSA_NAME_DEF_STMT (init_val))
4124 && gimple_code (def_stmt) == GIMPLE_PHI
4125 && flow_bb_inside_loop_p (loop, gimple_bb (def_stmt))
4126 && vinfo_for_stmt (def_stmt)
4127 && STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def_stmt))
4128 == vect_double_reduction_def)
4130 *adjustment_def = NULL;
4131 return vect_create_destination_var (init_val, vectype);
4134 vect_reduction_type reduction_type
4135 = STMT_VINFO_VEC_REDUCTION_TYPE (stmt_vinfo);
4137 /* In case of a nested reduction do not use an adjustment def, as
4138 that case is not handled correctly by the epilogue generation
4139 if ncopies is not one.  */
4140 if (adjustment_def && nested_in_vect_loop)
4142 *adjustment_def = NULL;
4143 return vect_get_vec_def_for_operand (init_val, stmt);
4146 switch (code)
4148 case WIDEN_SUM_EXPR:
4149 case DOT_PROD_EXPR:
4150 case SAD_EXPR:
4151 case PLUS_EXPR:
4152 case MINUS_EXPR:
4153 case BIT_IOR_EXPR:
4154 case BIT_XOR_EXPR:
4155 case MULT_EXPR:
4156 case BIT_AND_EXPR:
4158 /* ADJUSTMENT_DEF is NULL when called from
4159 vect_create_epilog_for_reduction to vectorize double reduction. */
4160 if (adjustment_def)
4161 *adjustment_def = init_val;
4163 if (code == MULT_EXPR)
4165 real_init_val = dconst1;
4166 int_init_val = 1;
4169 if (code == BIT_AND_EXPR)
4170 int_init_val = -1;
4172 if (SCALAR_FLOAT_TYPE_P (scalar_type))
4173 def_for_init = build_real (scalar_type, real_init_val);
4174 else
4175 def_for_init = build_int_cst (scalar_type, int_init_val);
4177 if (adjustment_def)
4178 /* Option1: the first element is '0' or '1' as well. */
4179 init_def = gimple_build_vector_from_val (&stmts, vectype,
4180 def_for_init);
4181 else if (!TYPE_VECTOR_SUBPARTS (vectype).is_constant ())
4183 /* Option2 (variable length): the first element is INIT_VAL. */
4184 init_def = gimple_build_vector_from_val (&stmts, vectype,
4185 def_for_init);
4186 init_def = gimple_build (&stmts, CFN_VEC_SHL_INSERT,
4187 vectype, init_def, init_val);
4189 else
4191 /* Option2: the first element is INIT_VAL. */
4192 tree_vector_builder elts (vectype, 1, 2);
4193 elts.quick_push (init_val);
4194 elts.quick_push (def_for_init);
4195 init_def = gimple_build_vector (&stmts, &elts);
4198 break;
4200 case MIN_EXPR:
4201 case MAX_EXPR:
4202 case COND_EXPR:
4204 if (adjustment_def)
4206 *adjustment_def = NULL_TREE;
4207 if (reduction_type != COND_REDUCTION
4208 && reduction_type != EXTRACT_LAST_REDUCTION)
4210 init_def = vect_get_vec_def_for_operand (init_val, stmt);
4211 break;
4214 init_val = gimple_convert (&stmts, TREE_TYPE (vectype), init_val);
4215 init_def = gimple_build_vector_from_val (&stmts, vectype, init_val);
4217 break;
4219 default:
4220 gcc_unreachable ();
4223 if (stmts)
4224 gsi_insert_seq_on_edge_immediate (loop_preheader_edge (loop), stmts);
4225 return init_def;
4228 /* Get at the initial defs for the reduction PHIs in SLP_NODE.
4229 NUMBER_OF_VECTORS is the number of vector defs to create.
4230 If NEUTRAL_OP is nonnull, introducing extra elements of that
4231 value will not change the result. */
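 /* For example, the neutral value is 0 for a PLUS reduction and 1 for a
    MULT reduction; MIN and MAX reductions generally have no universal
    neutral value and pass a null NEUTRAL_OP instead (the initial value
    itself is then neutral; see the direct_slp_reduc handling below).  */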
4233 static void
4234 get_initial_defs_for_reduction (slp_tree slp_node,
4235 vec<tree> *vec_oprnds,
4236 unsigned int number_of_vectors,
4237 bool reduc_chain, tree neutral_op)
4239 vec<gimple *> stmts = SLP_TREE_SCALAR_STMTS (slp_node);
4240 gimple *stmt = stmts[0];
4241 stmt_vec_info stmt_vinfo = vinfo_for_stmt (stmt);
4242 unsigned HOST_WIDE_INT nunits;
4243 unsigned j, number_of_places_left_in_vector;
4244 tree vector_type;
4245 tree vop;
4246 int group_size = stmts.length ();
4247 unsigned int vec_num, i;
4248 unsigned number_of_copies = 1;
4249 vec<tree> voprnds;
4250 voprnds.create (number_of_vectors);
4251 struct loop *loop;
4252 auto_vec<tree, 16> permute_results;
4254 vector_type = STMT_VINFO_VECTYPE (stmt_vinfo);
4256 gcc_assert (STMT_VINFO_DEF_TYPE (stmt_vinfo) == vect_reduction_def);
4258 loop = (gimple_bb (stmt))->loop_father;
4259 gcc_assert (loop);
4260 edge pe = loop_preheader_edge (loop);
4262 gcc_assert (!reduc_chain || neutral_op);
4264 /* NUMBER_OF_COPIES is the number of times we need to use the same values in
4265 created vectors. It is greater than 1 if unrolling is performed.
4267 For example, we have two scalar operands, s1 and s2 (e.g., group of
4268 strided accesses of size two), while NUNITS is four (i.e., four scalars
4269 of this type can be packed in a vector). The output vector will contain
4270 two copies of each scalar operand: {s1, s2, s1, s2}. (NUMBER_OF_COPIES
4271 will be 2).
4273 If GROUP_SIZE > NUNITS, the scalars will be split into several vectors
4274 containing the operands.
4276 For example, NUNITS is four as before, and the group size is 8
4277 (s1, s2, ..., s8). We will create two vectors {s1, s2, s3, s4} and
4278 {s5, s6, s7, s8}. */
4280 if (!TYPE_VECTOR_SUBPARTS (vector_type).is_constant (&nunits))
4281 nunits = group_size;
4283 number_of_copies = nunits * number_of_vectors / group_size;
4285 number_of_places_left_in_vector = nunits;
4286 bool constant_p = true;
4287 tree_vector_builder elts (vector_type, nunits, 1);
4288 elts.quick_grow (nunits);
4289 for (j = 0; j < number_of_copies; j++)
4291 for (i = group_size - 1; stmts.iterate (i, &stmt); i--)
4293 tree op;
4294 /* Get the def before the loop.  In a reduction chain we have only
4295 one initial value. */
4296 if ((j != (number_of_copies - 1)
4297 || (reduc_chain && i != 0))
4298 && neutral_op)
4299 op = neutral_op;
4300 else
4301 op = PHI_ARG_DEF_FROM_EDGE (stmt, pe);
4303 /* Create 'vect_ = {op0,op1,...,opn}'. */
4304 number_of_places_left_in_vector--;
4305 elts[number_of_places_left_in_vector] = op;
4306 if (!CONSTANT_CLASS_P (op))
4307 constant_p = false;
4309 if (number_of_places_left_in_vector == 0)
4311 gimple_seq ctor_seq = NULL;
4312 tree init;
4313 if (constant_p && !neutral_op
4314 ? multiple_p (TYPE_VECTOR_SUBPARTS (vector_type), nunits)
4315 : known_eq (TYPE_VECTOR_SUBPARTS (vector_type), nunits))
4316 /* Build the vector directly from ELTS. */
4317 init = gimple_build_vector (&ctor_seq, &elts);
4318 else if (neutral_op)
4320 /* Build a vector of the neutral value and shift the
4321 other elements into place. */
4322 init = gimple_build_vector_from_val (&ctor_seq, vector_type,
4323 neutral_op);
4324 int k = nunits;
4325 while (k > 0 && elts[k - 1] == neutral_op)
4326 k -= 1;
4327 while (k > 0)
4329 k -= 1;
4330 init = gimple_build (&ctor_seq, CFN_VEC_SHL_INSERT,
4331 vector_type, init, elts[k]);
4334 else
4336 /* First time round, duplicate ELTS to fill the
4337 required number of vectors, then cherry pick the
4338 appropriate result for each iteration. */
4339 if (vec_oprnds->is_empty ())
4340 duplicate_and_interleave (&ctor_seq, vector_type, elts,
4341 number_of_vectors,
4342 permute_results);
4343 init = permute_results[number_of_vectors - j - 1];
4345 if (ctor_seq != NULL)
4346 gsi_insert_seq_on_edge_immediate (pe, ctor_seq);
4347 voprnds.quick_push (init);
4349 number_of_places_left_in_vector = nunits;
4350 elts.new_vector (vector_type, nunits, 1);
4351 elts.quick_grow (nunits);
4352 constant_p = true;
4357 /* Since the vectors are created in reverse order, we should reverse
4358 them.  */
4359 vec_num = voprnds.length ();
4360 for (j = vec_num; j != 0; j--)
4362 vop = voprnds[j - 1];
4363 vec_oprnds->quick_push (vop);
4366 voprnds.release ();
4368 /* In case that VF is greater than the unrolling factor needed for the SLP
4369 group of stmts, NUMBER_OF_VECTORS to be created is greater than
4370 NUMBER_OF_SCALARS/NUNITS or NUNITS/NUMBER_OF_SCALARS, and hence we have
4371 to replicate the vectors. */
4372 tree neutral_vec = NULL;
4373 while (number_of_vectors > vec_oprnds->length ())
4375 if (neutral_op)
4377 if (!neutral_vec)
4379 gimple_seq ctor_seq = NULL;
4380 neutral_vec = gimple_build_vector_from_val
4381 (&ctor_seq, vector_type, neutral_op);
4382 if (ctor_seq != NULL)
4383 gsi_insert_seq_on_edge_immediate (pe, ctor_seq);
4385 vec_oprnds->quick_push (neutral_vec);
4387 else
4389 for (i = 0; vec_oprnds->iterate (i, &vop) && i < vec_num; i++)
4390 vec_oprnds->quick_push (vop);
4396 /* Function vect_create_epilog_for_reduction
4398 Create code at the loop-epilog to finalize the result of a reduction
4399 computation.
4401 VECT_DEFS is list of vector of partial results, i.e., the lhs's of vector
4402 reduction statements.
4403 STMT is the scalar reduction stmt that is being vectorized.
4404 NCOPIES is > 1 in case the vectorization factor (VF) is bigger than the
4405 number of elements that we can fit in a vectype (nunits). In this case
4406 we have to generate more than one vector stmt - i.e. - we need to "unroll"
4407 the vector stmt by a factor VF/nunits. For more details see documentation
4408 in vectorizable_operation.
4409 REDUC_FN is the internal function for the epilog reduction.
4410 REDUCTION_PHIS is a list of the phi-nodes that carry the reduction
4411 computation.
4412 REDUC_INDEX is the index of the operand in the right hand side of the
4413 statement that is defined by REDUCTION_PHI.
4414 DOUBLE_REDUC is TRUE if double reduction phi nodes should be handled.
4415 SLP_NODE is an SLP node containing a group of reduction statements. The
4416 first one in this group is STMT.
4417 INDUC_VAL is for INTEGER_INDUC_COND_REDUCTION the value to use for the case
4418 when the COND_EXPR is never true in the loop. For MAX_EXPR, it needs to
4419 be smaller than any value of the IV in the loop, for MIN_EXPR larger than
4420 any value of the IV in the loop.
4421 INDUC_CODE is the code for epilog reduction if INTEGER_INDUC_COND_REDUCTION.
4422 NEUTRAL_OP is the value given by neutral_op_for_slp_reduction; it is
4423 null if this is not an SLP reduction.
4425 This function:
4426 1. Creates the reduction def-use cycles: sets the arguments for
4427 REDUCTION_PHIS:
4428 The loop-entry argument is the vectorized initial-value of the reduction.
4429 The loop-latch argument is taken from VECT_DEFS - the vector of partial
4430 sums.
4431 2. "Reduces" each vector of partial results VECT_DEFS into a single result,
4432 by calling the function specified by REDUC_FN if available, or by
4433 other means (whole-vector shifts or a scalar loop).
4434 The function also creates a new phi node at the loop exit to preserve
4435 loop-closed form, as illustrated below.
4437 The flow at the entry to this function:
4439 loop:
4440 vec_def = phi <null, null> # REDUCTION_PHI
4441 VECT_DEF = vector_stmt # vectorized form of STMT
4442 s_loop = scalar_stmt # (scalar) STMT
4443 loop_exit:
4444 s_out0 = phi <s_loop> # (scalar) EXIT_PHI
4445 use <s_out0>
4446 use <s_out0>
4448 The above is transformed by this function into:
4450 loop:
4451 vec_def = phi <vec_init, VECT_DEF> # REDUCTION_PHI
4452 VECT_DEF = vector_stmt # vectorized form of STMT
4453 s_loop = scalar_stmt # (scalar) STMT
4454 loop_exit:
4455 s_out0 = phi <s_loop> # (scalar) EXIT_PHI
4456 v_out1 = phi <VECT_DEF> # NEW_EXIT_PHI
4457 v_out2 = reduce <v_out1>
4458 s_out3 = extract_field <v_out2, 0>
4459 s_out4 = adjust_result <s_out3>
4460 use <s_out4>
4461 use <s_out4>
4464 static void
4465 vect_create_epilog_for_reduction (vec<tree> vect_defs, gimple *stmt,
4466 gimple *reduc_def_stmt,
4467 int ncopies, internal_fn reduc_fn,
4468 vec<gimple *> reduction_phis,
4469 bool double_reduc,
4470 slp_tree slp_node,
4471 slp_instance slp_node_instance,
4472 tree induc_val, enum tree_code induc_code,
4473 tree neutral_op)
4475 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
4476 stmt_vec_info prev_phi_info;
4477 tree vectype;
4478 machine_mode mode;
4479 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
4480 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo), *outer_loop = NULL;
4481 basic_block exit_bb;
4482 tree scalar_dest;
4483 tree scalar_type;
4484 gimple *new_phi = NULL, *phi;
4485 gimple_stmt_iterator exit_gsi;
4486 tree vec_dest;
4487 tree new_temp = NULL_TREE, new_dest, new_name, new_scalar_dest;
4488 gimple *epilog_stmt = NULL;
4489 enum tree_code code = gimple_assign_rhs_code (stmt);
4490 gimple *exit_phi;
4491 tree bitsize;
4492 tree adjustment_def = NULL;
4493 tree vec_initial_def = NULL;
4494 tree expr, def, initial_def = NULL;
4495 tree orig_name, scalar_result;
4496 imm_use_iterator imm_iter, phi_imm_iter;
4497 use_operand_p use_p, phi_use_p;
4498 gimple *use_stmt, *orig_stmt, *reduction_phi = NULL;
4499 bool nested_in_vect_loop = false;
4500 auto_vec<gimple *> new_phis;
4501 auto_vec<gimple *> inner_phis;
4502 enum vect_def_type dt = vect_unknown_def_type;
4503 int j, i;
4504 auto_vec<tree> scalar_results;
4505 unsigned int group_size = 1, k, ratio;
4506 auto_vec<tree> vec_initial_defs;
4507 auto_vec<gimple *> phis;
4508 bool slp_reduc = false;
4509 bool direct_slp_reduc;
4510 tree new_phi_result;
4511 gimple *inner_phi = NULL;
4512 tree induction_index = NULL_TREE;
4514 if (slp_node)
4515 group_size = SLP_TREE_SCALAR_STMTS (slp_node).length ();
4517 if (nested_in_vect_loop_p (loop, stmt))
4519 outer_loop = loop;
4520 loop = loop->inner;
4521 nested_in_vect_loop = true;
4522 gcc_assert (!slp_node);
4525 vectype = STMT_VINFO_VECTYPE (stmt_info);
4526 gcc_assert (vectype);
4527 mode = TYPE_MODE (vectype);
4529 /* 1. Create the reduction def-use cycle:
4530 Set the arguments of REDUCTION_PHIS, i.e., transform
4532 loop:
4533 vec_def = phi <null, null> # REDUCTION_PHI
4534 VECT_DEF = vector_stmt # vectorized form of STMT
4537 into:
4539 loop:
4540 vec_def = phi <vec_init, VECT_DEF> # REDUCTION_PHI
4541 VECT_DEF = vector_stmt # vectorized form of STMT
4544 (in case of SLP, do it for all the phis). */
4546 /* Get the loop-entry arguments. */
4547 enum vect_def_type initial_def_dt = vect_unknown_def_type;
4548 if (slp_node)
4550 unsigned vec_num = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node);
4551 vec_initial_defs.reserve (vec_num);
4552 get_initial_defs_for_reduction (slp_node_instance->reduc_phis,
4553 &vec_initial_defs, vec_num,
4554 GROUP_FIRST_ELEMENT (stmt_info),
4555 neutral_op);
4557 else
4559 /* Get at the scalar def before the loop, that defines the initial value
4560 of the reduction variable. */
4561 gimple *def_stmt;
4562 initial_def = PHI_ARG_DEF_FROM_EDGE (reduc_def_stmt,
4563 loop_preheader_edge (loop));
4564 /* Optimize: if initial_def is for REDUC_MAX smaller than the base
4565 and we can't use zero for induc_val, use initial_def. Similarly
4566 for REDUC_MIN and initial_def larger than the base. */
4567 if (TREE_CODE (initial_def) == INTEGER_CST
4568 && (STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info)
4569 == INTEGER_INDUC_COND_REDUCTION)
4570 && !integer_zerop (induc_val)
4571 && ((induc_code == MAX_EXPR
4572 && tree_int_cst_lt (initial_def, induc_val))
4573 || (induc_code == MIN_EXPR
4574 && tree_int_cst_lt (induc_val, initial_def))))
4575 induc_val = initial_def;
4576 vect_is_simple_use (initial_def, loop_vinfo, &def_stmt, &initial_def_dt);
4577 vec_initial_def = get_initial_def_for_reduction (stmt, initial_def,
4578 &adjustment_def);
4579 vec_initial_defs.create (1);
4580 vec_initial_defs.quick_push (vec_initial_def);
4583 /* Set phi nodes arguments. */
4584 FOR_EACH_VEC_ELT (reduction_phis, i, phi)
4586 tree vec_init_def = vec_initial_defs[i];
4587 tree def = vect_defs[i];
4588 for (j = 0; j < ncopies; j++)
4590 if (j != 0)
4592 phi = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (phi));
4593 if (nested_in_vect_loop)
4594 vec_init_def
4595 = vect_get_vec_def_for_stmt_copy (initial_def_dt,
4596 vec_init_def);
4599 /* Set the loop-entry arg of the reduction-phi. */
4601 if (STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info)
4602 == INTEGER_INDUC_COND_REDUCTION)
4604 /* Initialise the reduction phi to zero.  This prevents non-zero
4605 initial values from interfering with the reduction op.  */
4606 gcc_assert (ncopies == 1);
4607 gcc_assert (i == 0);
4609 tree vec_init_def_type = TREE_TYPE (vec_init_def);
4610 tree induc_val_vec
4611 = build_vector_from_val (vec_init_def_type, induc_val);
4613 add_phi_arg (as_a <gphi *> (phi), induc_val_vec,
4614 loop_preheader_edge (loop), UNKNOWN_LOCATION);
4616 else
4617 add_phi_arg (as_a <gphi *> (phi), vec_init_def,
4618 loop_preheader_edge (loop), UNKNOWN_LOCATION);
4620 /* Set the loop-latch arg for the reduction-phi. */
4621 if (j > 0)
4622 def = vect_get_vec_def_for_stmt_copy (vect_unknown_def_type, def);
4624 add_phi_arg (as_a <gphi *> (phi), def, loop_latch_edge (loop),
4625 UNKNOWN_LOCATION);
4627 if (dump_enabled_p ())
4629 dump_printf_loc (MSG_NOTE, vect_location,
4630 "transform reduction: created def-use cycle: ");
4631 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, phi, 0);
4632 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, SSA_NAME_DEF_STMT (def), 0);
4637 /* For cond reductions we want to create a new vector (INDEX_COND_EXPR)
4638 which is updated with the current index of the loop for every match of
4639 the original loop's cond_expr (VEC_STMT). This results in a vector
4640 containing the last time the condition passed for that vector lane.
4641 The first match will be a 1 to allow 0 to be used for non-matching
4642 indexes. If there are no matches at all then the vector will be all
4643 zeroes. */
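 /* Illustrative example: with a 4-lane vector and two vector iterations
    the index IV takes the values { 1, 2, 3, 4 } and then { 5, 6, 7, 8 }.
    If the condition matched only for scalar iterations 3 and 6, the final
    vector is { 0, 6, 3, 0 }: each lane holds the last matching index seen
    in that lane, or 0 if the lane never matched.  */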
4644 if (STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info) == COND_REDUCTION)
4646 tree indx_before_incr, indx_after_incr;
4647 poly_uint64 nunits_out = TYPE_VECTOR_SUBPARTS (vectype);
4649 gimple *vec_stmt = STMT_VINFO_VEC_STMT (stmt_info);
4650 gcc_assert (gimple_assign_rhs_code (vec_stmt) == VEC_COND_EXPR);
4652 int scalar_precision
4653 = GET_MODE_PRECISION (SCALAR_TYPE_MODE (TREE_TYPE (vectype)));
4654 tree cr_index_scalar_type = make_unsigned_type (scalar_precision);
4655 tree cr_index_vector_type = build_vector_type
4656 (cr_index_scalar_type, TYPE_VECTOR_SUBPARTS (vectype));
4658 /* First we create a simple vector induction variable which starts
4659 with the values {1,2,3,...} (SERIES_VECT) and increments by the
4660 vector size (STEP). */
4662 /* Create a {1,2,3,...} vector. */
4663 tree series_vect = build_index_vector (cr_index_vector_type, 1, 1);
4665 /* Create a vector of the step value. */
4666 tree step = build_int_cst (cr_index_scalar_type, nunits_out);
4667 tree vec_step = build_vector_from_val (cr_index_vector_type, step);
4669 /* Create an induction variable. */
4670 gimple_stmt_iterator incr_gsi;
4671 bool insert_after;
4672 standard_iv_increment_position (loop, &incr_gsi, &insert_after);
4673 create_iv (series_vect, vec_step, NULL_TREE, loop, &incr_gsi,
4674 insert_after, &indx_before_incr, &indx_after_incr);
4676 /* Next create a new phi node vector (NEW_PHI_TREE) which starts
4677 filled with zeros (VEC_ZERO). */
4679 /* Create a vector of 0s. */
4680 tree zero = build_zero_cst (cr_index_scalar_type);
4681 tree vec_zero = build_vector_from_val (cr_index_vector_type, zero);
4683 /* Create a vector phi node. */
4684 tree new_phi_tree = make_ssa_name (cr_index_vector_type);
4685 new_phi = create_phi_node (new_phi_tree, loop->header);
4686 set_vinfo_for_stmt (new_phi,
4687 new_stmt_vec_info (new_phi, loop_vinfo));
4688 add_phi_arg (as_a <gphi *> (new_phi), vec_zero,
4689 loop_preheader_edge (loop), UNKNOWN_LOCATION);
4691 /* Now take the condition from the loops original cond_expr
4692 (VEC_STMT) and produce a new cond_expr (INDEX_COND_EXPR) which for
4693 every match uses values from the induction variable
4694 (INDEX_BEFORE_INCR) otherwise uses values from the phi node
4695 (NEW_PHI_TREE).
4696 Finally, we update the phi (NEW_PHI_TREE) to take the value of
4697 the new cond_expr (INDEX_COND_EXPR). */
4699 /* Duplicate the condition from vec_stmt. */
4700 tree ccompare = unshare_expr (gimple_assign_rhs1 (vec_stmt));
4702 /* Create a conditional, where the condition is taken from vec_stmt
4703 (CCOMPARE), the "then" value is the induction index (INDEX_BEFORE_INCR)
4704 and the "else" value is the phi (NEW_PHI_TREE).  */
4705 tree index_cond_expr = build3 (VEC_COND_EXPR, cr_index_vector_type,
4706 ccompare, indx_before_incr,
4707 new_phi_tree);
4708 induction_index = make_ssa_name (cr_index_vector_type);
4709 gimple *index_condition = gimple_build_assign (induction_index,
4710 index_cond_expr);
4711 gsi_insert_before (&incr_gsi, index_condition, GSI_SAME_STMT);
4712 stmt_vec_info index_vec_info = new_stmt_vec_info (index_condition,
4713 loop_vinfo);
4714 STMT_VINFO_VECTYPE (index_vec_info) = cr_index_vector_type;
4715 set_vinfo_for_stmt (index_condition, index_vec_info);
4717 /* Update the phi with the vec cond. */
4718 add_phi_arg (as_a <gphi *> (new_phi), induction_index,
4719 loop_latch_edge (loop), UNKNOWN_LOCATION);
4722 /* 2. Create epilog code.
4723 The reduction epilog code operates across the elements of the vector
4724 of partial results computed by the vectorized loop.
4725 The reduction epilog code consists of:
4727 step 1: compute the scalar result in a vector (v_out2)
4728 step 2: extract the scalar result (s_out3) from the vector (v_out2)
4729 step 3: adjust the scalar result (s_out3) if needed.
4731 Step 1 can be accomplished using one of the following three schemes:
4732 (scheme 1) using reduc_fn, if available.
4733 (scheme 2) using whole-vector shifts, if available.
4734 (scheme 3) using a scalar loop. In this case steps 1+2 above are
4735 combined.
4737 The overall epilog code looks like this:
4739 s_out0 = phi <s_loop> # original EXIT_PHI
4740 v_out1 = phi <VECT_DEF> # NEW_EXIT_PHI
4741 v_out2 = reduce <v_out1> # step 1
4742 s_out3 = extract_field <v_out2, 0> # step 2
4743 s_out4 = adjust_result <s_out3> # step 3
4745 (step 3 is optional, and steps 1 and 2 may be combined).
4746 Lastly, the uses of s_out0 are replaced by s_out4. */
4749 /* 2.1 Create new loop-exit-phis to preserve loop-closed form:
4750 v_out1 = phi <VECT_DEF>
4751 Store them in NEW_PHIS. */
4753 exit_bb = single_exit (loop)->dest;
4754 prev_phi_info = NULL;
4755 new_phis.create (vect_defs.length ());
4756 FOR_EACH_VEC_ELT (vect_defs, i, def)
4758 for (j = 0; j < ncopies; j++)
4760 tree new_def = copy_ssa_name (def);
4761 phi = create_phi_node (new_def, exit_bb);
4762 set_vinfo_for_stmt (phi, new_stmt_vec_info (phi, loop_vinfo));
4763 if (j == 0)
4764 new_phis.quick_push (phi);
4765 else
4767 def = vect_get_vec_def_for_stmt_copy (dt, def);
4768 STMT_VINFO_RELATED_STMT (prev_phi_info) = phi;
4771 SET_PHI_ARG_DEF (phi, single_exit (loop)->dest_idx, def);
4772 prev_phi_info = vinfo_for_stmt (phi);
4776 /* The epilogue is created for the outer-loop, i.e., for the loop being
4777 vectorized. Create exit phis for the outer loop. */
4778 if (double_reduc)
4780 loop = outer_loop;
4781 exit_bb = single_exit (loop)->dest;
4782 inner_phis.create (vect_defs.length ());
4783 FOR_EACH_VEC_ELT (new_phis, i, phi)
4785 tree new_result = copy_ssa_name (PHI_RESULT (phi));
4786 gphi *outer_phi = create_phi_node (new_result, exit_bb);
4787 SET_PHI_ARG_DEF (outer_phi, single_exit (loop)->dest_idx,
4788 PHI_RESULT (phi));
4789 set_vinfo_for_stmt (outer_phi, new_stmt_vec_info (outer_phi,
4790 loop_vinfo));
4791 inner_phis.quick_push (phi);
4792 new_phis[i] = outer_phi;
4793 prev_phi_info = vinfo_for_stmt (outer_phi);
4794 while (STMT_VINFO_RELATED_STMT (vinfo_for_stmt (phi)))
4796 phi = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (phi));
4797 new_result = copy_ssa_name (PHI_RESULT (phi));
4798 outer_phi = create_phi_node (new_result, exit_bb);
4799 SET_PHI_ARG_DEF (outer_phi, single_exit (loop)->dest_idx,
4800 PHI_RESULT (phi));
4801 set_vinfo_for_stmt (outer_phi, new_stmt_vec_info (outer_phi,
4802 loop_vinfo));
4803 STMT_VINFO_RELATED_STMT (prev_phi_info) = outer_phi;
4804 prev_phi_info = vinfo_for_stmt (outer_phi);
4809 exit_gsi = gsi_after_labels (exit_bb);
4811 /* 2.2 Get the relevant tree-code to use in the epilog for schemes 2,3
4812 (i.e. when reduc_fn is not available) and in the final adjustment
4813 code (if needed). Also get the original scalar reduction variable as
4814 defined in the loop. In case STMT is a "pattern-stmt" (i.e. - it
4815 represents a reduction pattern), the tree-code and scalar-def are
4816 taken from the original stmt that the pattern-stmt (STMT) replaces.
4817 Otherwise (it is a regular reduction) - the tree-code and scalar-def
4818 are taken from STMT. */
4820 orig_stmt = STMT_VINFO_RELATED_STMT (stmt_info);
4821 if (!orig_stmt)
4823 /* Regular reduction */
4824 orig_stmt = stmt;
4826 else
4828 /* Reduction pattern */
4829 stmt_vec_info stmt_vinfo = vinfo_for_stmt (orig_stmt);
4830 gcc_assert (STMT_VINFO_IN_PATTERN_P (stmt_vinfo));
4831 gcc_assert (STMT_VINFO_RELATED_STMT (stmt_vinfo) == stmt);
4834 code = gimple_assign_rhs_code (orig_stmt);
4835 /* For MINUS_EXPR the initial vector is [init_val,0,...,0], therefore,
4836 partial results are added and not subtracted. */
4837 if (code == MINUS_EXPR)
4838 code = PLUS_EXPR;
4840 scalar_dest = gimple_assign_lhs (orig_stmt);
4841 scalar_type = TREE_TYPE (scalar_dest);
4842 scalar_results.create (group_size);
4843 new_scalar_dest = vect_create_destination_var (scalar_dest, NULL);
4844 bitsize = TYPE_SIZE (scalar_type);
4846 /* In case this is a reduction in an inner-loop while vectorizing an outer
4847 loop - we don't need to extract a single scalar result at the end of the
4848 inner-loop (unless it is double reduction, i.e., the use of reduction is
4849 outside the outer-loop). The final vector of partial results will be used
4850 in the vectorized outer-loop, or reduced to a scalar result at the end of
4851 the outer-loop. */
4852 if (nested_in_vect_loop && !double_reduc)
4853 goto vect_finalize_reduction;
4855 /* SLP reduction without reduction chain, e.g.,
4856 # a1 = phi <a2, a0>
4857 # b1 = phi <b2, b0>
4858 a2 = operation (a1)
4859 b2 = operation (b1) */
4860 slp_reduc = (slp_node && !GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)));
4862 /* True if we should implement SLP_REDUC using native reduction operations
4863 instead of scalar operations. */
4864 direct_slp_reduc = (reduc_fn != IFN_LAST
4865 && slp_reduc
4866 && !TYPE_VECTOR_SUBPARTS (vectype).is_constant ());
4868 /* In case of reduction chain, e.g.,
4869 # a1 = phi <a3, a0>
4870 a2 = operation (a1)
4871 a3 = operation (a2),
4873 we may end up with more than one vector result. Here we reduce them to
4874 one vector. */
4875 if (GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)) || direct_slp_reduc)
4877 tree first_vect = PHI_RESULT (new_phis[0]);
4878 gassign *new_vec_stmt = NULL;
4879 vec_dest = vect_create_destination_var (scalar_dest, vectype);
4880 for (k = 1; k < new_phis.length (); k++)
4882 gimple *next_phi = new_phis[k];
4883 tree second_vect = PHI_RESULT (next_phi);
4884 tree tem = make_ssa_name (vec_dest, new_vec_stmt);
4885 new_vec_stmt = gimple_build_assign (tem, code,
4886 first_vect, second_vect);
4887 gsi_insert_before (&exit_gsi, new_vec_stmt, GSI_SAME_STMT);
4888 first_vect = tem;
4891 new_phi_result = first_vect;
4892 if (new_vec_stmt)
4894 new_phis.truncate (0);
4895 new_phis.safe_push (new_vec_stmt);
4898 /* Likewise if we couldn't use a single def-use cycle.  */
4899 else if (ncopies > 1)
4901 gcc_assert (new_phis.length () == 1);
4902 tree first_vect = PHI_RESULT (new_phis[0]);
4903 gassign *new_vec_stmt = NULL;
4904 vec_dest = vect_create_destination_var (scalar_dest, vectype);
4905 gimple *next_phi = new_phis[0];
4906 for (int k = 1; k < ncopies; ++k)
4908 next_phi = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (next_phi));
4909 tree second_vect = PHI_RESULT (next_phi);
4910 tree tem = make_ssa_name (vec_dest, new_vec_stmt);
4911 new_vec_stmt = gimple_build_assign (tem, code,
4912 first_vect, second_vect);
4913 gsi_insert_before (&exit_gsi, new_vec_stmt, GSI_SAME_STMT);
4914 first_vect = tem;
4916 new_phi_result = first_vect;
4917 new_phis.truncate (0);
4918 new_phis.safe_push (new_vec_stmt);
4920 else
4921 new_phi_result = PHI_RESULT (new_phis[0]);
4923 if (STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info) == COND_REDUCTION
4924 && reduc_fn != IFN_LAST)
4926 /* For condition reductions, we have a vector (NEW_PHI_RESULT) containing
4927 various data values where the condition matched and another vector
4928 (INDUCTION_INDEX) containing all the indexes of those matches. We
4929 need to extract the last matching index (which will be the index with
4930 highest value) and use this to index into the data vector.
4931 For the case where there were no matches, the data vector will contain
4932 all default values and the index vector will be all zeros. */
4934 /* Get various versions of the type of the vector of indexes. */
4935 tree index_vec_type = TREE_TYPE (induction_index);
4936 gcc_checking_assert (TYPE_UNSIGNED (index_vec_type));
4937 tree index_scalar_type = TREE_TYPE (index_vec_type);
4938 tree index_vec_cmp_type = build_same_sized_truth_vector_type
4939 (index_vec_type);
4941 /* Get an unsigned integer version of the type of the data vector. */
4942 int scalar_precision
4943 = GET_MODE_PRECISION (SCALAR_TYPE_MODE (scalar_type));
4944 tree scalar_type_unsigned = make_unsigned_type (scalar_precision);
4945 tree vectype_unsigned = build_vector_type
4946 (scalar_type_unsigned, TYPE_VECTOR_SUBPARTS (vectype));
4948 /* First we need to create a vector (ZERO_VEC) of zeros and another
4949 vector (MAX_INDEX_VEC) filled with the last matching index, which we
4950 can create using a MAX reduction and then expanding.
4951 In the case where the loop never made any matches, the max index will
4952 be zero. */
4954 /* Vector of {0, 0, 0,...}. */
4955 tree zero_vec = make_ssa_name (vectype);
4956 tree zero_vec_rhs = build_zero_cst (vectype);
4957 gimple *zero_vec_stmt = gimple_build_assign (zero_vec, zero_vec_rhs);
4958 gsi_insert_before (&exit_gsi, zero_vec_stmt, GSI_SAME_STMT);
4960 /* Find maximum value from the vector of found indexes. */
4961 tree max_index = make_ssa_name (index_scalar_type);
4962 gcall *max_index_stmt = gimple_build_call_internal (IFN_REDUC_MAX,
4963 1, induction_index);
4964 gimple_call_set_lhs (max_index_stmt, max_index);
4965 gsi_insert_before (&exit_gsi, max_index_stmt, GSI_SAME_STMT);
4967 /* Vector of {max_index, max_index, max_index,...}. */
4968 tree max_index_vec = make_ssa_name (index_vec_type);
4969 tree max_index_vec_rhs = build_vector_from_val (index_vec_type,
4970 max_index);
4971 gimple *max_index_vec_stmt = gimple_build_assign (max_index_vec,
4972 max_index_vec_rhs);
4973 gsi_insert_before (&exit_gsi, max_index_vec_stmt, GSI_SAME_STMT);
4975 /* Next we compare the new vector (MAX_INDEX_VEC) full of max indexes
4976 with the vector (INDUCTION_INDEX) of found indexes, choosing values
4977 from the data vector (NEW_PHI_RESULT) for matches, 0 (ZERO_VEC)
4978 otherwise. Only one value should match, resulting in a vector
4979 (VEC_COND) with one data value and the rest zeros.
4980 In the case where the loop never made any matches, every index will
4981 match, resulting in a vector with all data values (which will all be
4982 the default value). */
4984 /* Compare the max index vector to the vector of found indexes to find
4985 the position of the max value. */
4986 tree vec_compare = make_ssa_name (index_vec_cmp_type);
4987 gimple *vec_compare_stmt = gimple_build_assign (vec_compare, EQ_EXPR,
4988 induction_index,
4989 max_index_vec);
4990 gsi_insert_before (&exit_gsi, vec_compare_stmt, GSI_SAME_STMT);
4992 /* Use the compare to choose either values from the data vector or
4993 zero. */
4994 tree vec_cond = make_ssa_name (vectype);
4995 gimple *vec_cond_stmt = gimple_build_assign (vec_cond, VEC_COND_EXPR,
4996 vec_compare, new_phi_result,
4997 zero_vec);
4998 gsi_insert_before (&exit_gsi, vec_cond_stmt, GSI_SAME_STMT);
5000 /* Finally we need to extract the data value from the vector (VEC_COND)
5001 into a scalar (MATCHED_DATA_REDUC).  Logically we want to do an OR
5002 reduction, but because this doesn't exist, we can use a MAX reduction
5003 instead. The data value might be signed or a float so we need to cast
5004 it first.
5005 In the case where the loop never made any matches, the data values are
5006 all identical, and so will reduce down correctly. */
5008 /* Make the matched data values unsigned. */
5009 tree vec_cond_cast = make_ssa_name (vectype_unsigned);
5010 tree vec_cond_cast_rhs = build1 (VIEW_CONVERT_EXPR, vectype_unsigned,
5011 vec_cond);
5012 gimple *vec_cond_cast_stmt = gimple_build_assign (vec_cond_cast,
5013 VIEW_CONVERT_EXPR,
5014 vec_cond_cast_rhs);
5015 gsi_insert_before (&exit_gsi, vec_cond_cast_stmt, GSI_SAME_STMT);
5017 /* Reduce down to a scalar value. */
5018 tree data_reduc = make_ssa_name (scalar_type_unsigned);
5019 gcall *data_reduc_stmt = gimple_build_call_internal (IFN_REDUC_MAX,
5020 1, vec_cond_cast);
5021 gimple_call_set_lhs (data_reduc_stmt, data_reduc);
5022 gsi_insert_before (&exit_gsi, data_reduc_stmt, GSI_SAME_STMT);
5024 /* Convert the reduced value back to the result type and set as the
5025 result. */
5026 gimple_seq stmts = NULL;
5027 new_temp = gimple_build (&stmts, VIEW_CONVERT_EXPR, scalar_type,
5028 data_reduc);
5029 gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
5030 scalar_results.safe_push (new_temp);
5032 else if (STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info) == COND_REDUCTION
5033 && reduc_fn == IFN_LAST)
5035 /* Condition reduction without supported IFN_REDUC_MAX. Generate
5036 idx = 0;
5037 idx_val = induction_index[0];
5038 val = data_reduc[0];
5039 for (idx = 0, val = init, i = 0; i < nelts; ++i)
5040 if (induction_index[i] > idx_val)
5041 val = data_reduc[i], idx_val = induction_index[i];
5042 return val; */
5044 tree data_eltype = TREE_TYPE (TREE_TYPE (new_phi_result));
5045 tree idx_eltype = TREE_TYPE (TREE_TYPE (induction_index));
5046 unsigned HOST_WIDE_INT el_size = tree_to_uhwi (TYPE_SIZE (idx_eltype));
5047 poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (TREE_TYPE (induction_index));
5048 /* Enforced by vectorizable_reduction, which ensures we have target
5049 support before allowing a conditional reduction on variable-length
5050 vectors. */
5051 unsigned HOST_WIDE_INT v_size = el_size * nunits.to_constant ();
5052 tree idx_val = NULL_TREE, val = NULL_TREE;
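 /* The loop below is fully unrolled at compile time: one BIT_FIELD_REF
    pair per vector element, combined with MAX_EXPR / COND_EXPR selections
    to keep the value whose index is largest.  */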
5053 for (unsigned HOST_WIDE_INT off = 0; off < v_size; off += el_size)
5055 tree old_idx_val = idx_val;
5056 tree old_val = val;
5057 idx_val = make_ssa_name (idx_eltype);
5058 epilog_stmt = gimple_build_assign (idx_val, BIT_FIELD_REF,
5059 build3 (BIT_FIELD_REF, idx_eltype,
5060 induction_index,
5061 bitsize_int (el_size),
5062 bitsize_int (off)));
5063 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5064 val = make_ssa_name (data_eltype);
5065 epilog_stmt = gimple_build_assign (val, BIT_FIELD_REF,
5066 build3 (BIT_FIELD_REF,
5067 data_eltype,
5068 new_phi_result,
5069 bitsize_int (el_size),
5070 bitsize_int (off)));
5071 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5072 if (off != 0)
5074 tree new_idx_val = idx_val;
5075 tree new_val = val;
5076 if (off != v_size - el_size)
5078 new_idx_val = make_ssa_name (idx_eltype);
5079 epilog_stmt = gimple_build_assign (new_idx_val,
5080 MAX_EXPR, idx_val,
5081 old_idx_val);
5082 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5084 new_val = make_ssa_name (data_eltype);
5085 epilog_stmt = gimple_build_assign (new_val,
5086 COND_EXPR,
5087 build2 (GT_EXPR,
5088 boolean_type_node,
5089 idx_val,
5090 old_idx_val),
5091 val, old_val);
5092 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5093 idx_val = new_idx_val;
5094 val = new_val;
5097 /* Convert the reduced value back to the result type and set as the
5098 result. */
5099 gimple_seq stmts = NULL;
5100 val = gimple_convert (&stmts, scalar_type, val);
5101 gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
5102 scalar_results.safe_push (val);
5105 /* 2.3 Create the reduction code, using one of the three schemes described
5106 above. In SLP we simply need to extract all the elements from the
5107 vector (without reducing them), so we use scalar shifts. */
5108 else if (reduc_fn != IFN_LAST && !slp_reduc)
5110 tree tmp;
5111 tree vec_elem_type;
5113 /* Case 1: Create:
5114 v_out2 = reduc_expr <v_out1> */
5116 if (dump_enabled_p ())
5117 dump_printf_loc (MSG_NOTE, vect_location,
5118 "Reduce using direct vector reduction.\n");
5120 vec_elem_type = TREE_TYPE (TREE_TYPE (new_phi_result));
5121 if (!useless_type_conversion_p (scalar_type, vec_elem_type))
5123 tree tmp_dest
5124 = vect_create_destination_var (scalar_dest, vec_elem_type);
5125 epilog_stmt = gimple_build_call_internal (reduc_fn, 1,
5126 new_phi_result);
5127 gimple_set_lhs (epilog_stmt, tmp_dest);
5128 new_temp = make_ssa_name (tmp_dest, epilog_stmt);
5129 gimple_set_lhs (epilog_stmt, new_temp);
5130 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5132 epilog_stmt = gimple_build_assign (new_scalar_dest, NOP_EXPR,
5133 new_temp);
5135 else
5137 epilog_stmt = gimple_build_call_internal (reduc_fn, 1,
5138 new_phi_result);
5139 gimple_set_lhs (epilog_stmt, new_scalar_dest);
5142 new_temp = make_ssa_name (new_scalar_dest, epilog_stmt);
5143 gimple_set_lhs (epilog_stmt, new_temp);
5144 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5146 if ((STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info)
5147 == INTEGER_INDUC_COND_REDUCTION)
5148 && !operand_equal_p (initial_def, induc_val, 0))
5150 /* Earlier we set the initial value to be a vector of induc_val
5151 values.  Check the result and if it is induc_val then replace it
5152 with the original initial value, unless induc_val is already
5153 the same as initial_def.  */
5154 tree zcompare = build2 (EQ_EXPR, boolean_type_node, new_temp,
5155 induc_val);
5157 tmp = make_ssa_name (new_scalar_dest);
5158 epilog_stmt = gimple_build_assign (tmp, COND_EXPR, zcompare,
5159 initial_def, new_temp);
5160 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5161 new_temp = tmp;
5164 scalar_results.safe_push (new_temp);
5166 else if (direct_slp_reduc)
5168 /* Here we create one vector for each of the GROUP_SIZE results,
5169 with the elements for other SLP statements replaced with the
5170 neutral value. We can then do a normal reduction on each vector. */
5172 /* Enforced by vectorizable_reduction. */
5173 gcc_assert (new_phis.length () == 1);
5174 gcc_assert (pow2p_hwi (group_size));
5176 slp_tree orig_phis_slp_node = slp_node_instance->reduc_phis;
5177 vec<gimple *> orig_phis = SLP_TREE_SCALAR_STMTS (orig_phis_slp_node);
5178 gimple_seq seq = NULL;
5180 /* Build a vector {0, 1, 2, ...}, with the same number of elements
5181 and the same element size as VECTYPE. */
5182 tree index = build_index_vector (vectype, 0, 1);
5183 tree index_type = TREE_TYPE (index);
5184 tree index_elt_type = TREE_TYPE (index_type);
5185 tree mask_type = build_same_sized_truth_vector_type (index_type);
5187 /* Create a vector that, for each element, identifies which of
5188 the GROUP_SIZE results should use it. */
5189 tree index_mask = build_int_cst (index_elt_type, group_size - 1);
5190 index = gimple_build (&seq, BIT_AND_EXPR, index_type, index,
5191 build_vector_from_val (index_type, index_mask));
5193 /* Get a neutral vector value. This is simply a splat of the neutral
5194 scalar value if we have one, otherwise the initial scalar value
5195 is itself a neutral value. */
5196 tree vector_identity = NULL_TREE;
5197 if (neutral_op)
5198 vector_identity = gimple_build_vector_from_val (&seq, vectype,
5199 neutral_op);
5200 for (unsigned int i = 0; i < group_size; ++i)
5202 /* If there's no universal neutral value, we can use the
5203 initial scalar value from the original PHI. This is used
5204 for MIN and MAX reduction, for example. */
5205 if (!neutral_op)
5207 tree scalar_value
5208 = PHI_ARG_DEF_FROM_EDGE (orig_phis[i],
5209 loop_preheader_edge (loop));
5210 vector_identity = gimple_build_vector_from_val (&seq, vectype,
5211 scalar_value);
5214 /* Calculate the equivalent of:
5216 sel[j] = (index[j] == i);
5218 which selects the elements of NEW_PHI_RESULT that should
5219 be included in the result. */
5220 tree compare_val = build_int_cst (index_elt_type, i);
5221 compare_val = build_vector_from_val (index_type, compare_val);
5222 tree sel = gimple_build (&seq, EQ_EXPR, mask_type,
5223 index, compare_val);
5225 /* Calculate the equivalent of:
5227 vec = sel ? new_phi_result : vector_identity;
5229 VEC is now suitable for a full vector reduction. */
5230 tree vec = gimple_build (&seq, VEC_COND_EXPR, vectype,
5231 sel, new_phi_result, vector_identity);
5233 /* Do the reduction and convert it to the appropriate type. */
5234 tree scalar = gimple_build (&seq, as_combined_fn (reduc_fn),
5235 TREE_TYPE (vectype), vec);
5236 scalar = gimple_convert (&seq, scalar_type, scalar);
5237 scalar_results.safe_push (scalar);
5239 gsi_insert_seq_before (&exit_gsi, seq, GSI_SAME_STMT);
5241 else
5243 bool reduce_with_shift;
5244 tree vec_temp;
5246 /* COND reductions all do the final reduction with MAX_EXPR
5247 or MIN_EXPR. */
5248 if (code == COND_EXPR)
5250 if (STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info)
5251 == INTEGER_INDUC_COND_REDUCTION)
5252 code = induc_code;
5253 else
5254 code = MAX_EXPR;
5257 /* See if the target wants to do the final (shift) reduction
5258 in a vector mode of smaller size and first reduce upper/lower
5259 halves against each other. */
5260 enum machine_mode mode1 = mode;
5261 tree vectype1 = vectype;
5262 unsigned sz = tree_to_uhwi (TYPE_SIZE_UNIT (vectype));
5263 unsigned sz1 = sz;
5264 if (!slp_reduc
5265 && (mode1 = targetm.vectorize.split_reduction (mode)) != mode)
5266 sz1 = GET_MODE_SIZE (mode1).to_constant ();
5268 vectype1 = get_vectype_for_scalar_type_and_size (scalar_type, sz1);
5269 reduce_with_shift = have_whole_vector_shift (mode1);
5270 if (!VECTOR_MODE_P (mode1))
5271 reduce_with_shift = false;
5272 else
5274 optab optab = optab_for_tree_code (code, vectype1, optab_default);
5275 if (optab_handler (optab, mode1) == CODE_FOR_nothing)
5276 reduce_with_shift = false;
5279 /* First reduce the vector to the vector size we should do the shift
5280 reduction on, by combining the upper and lower halves.  */
5281 new_temp = new_phi_result;
5282 while (sz > sz1)
5284 gcc_assert (!slp_reduc);
5285 sz /= 2;
5286 vectype1 = get_vectype_for_scalar_type_and_size (scalar_type, sz);
5288 /* The target has to make sure we support lowpart/highpart
5289 extraction, either via direct vector extract or through
5290 integer mode punning.  */
5291 tree dst1, dst2;
5292 if (convert_optab_handler (vec_extract_optab,
5293 TYPE_MODE (TREE_TYPE (new_temp)),
5294 TYPE_MODE (vectype1))
5295 != CODE_FOR_nothing)
5297 /* Extract sub-vectors directly once vec_extract becomes
5298 a conversion optab. */
5299 dst1 = make_ssa_name (vectype1);
5300 epilog_stmt
5301 = gimple_build_assign (dst1, BIT_FIELD_REF,
5302 build3 (BIT_FIELD_REF, vectype1,
5303 new_temp, TYPE_SIZE (vectype1),
5304 bitsize_int (0)));
5305 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5306 dst2 = make_ssa_name (vectype1);
5307 epilog_stmt
5308 = gimple_build_assign (dst2, BIT_FIELD_REF,
5309 build3 (BIT_FIELD_REF, vectype1,
5310 new_temp, TYPE_SIZE (vectype1),
5311 bitsize_int (sz * BITS_PER_UNIT)));
5312 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5314 else
5316 /* Extract via punning to an appropriately sized integer mode
5317 vector. */
5318 tree eltype = build_nonstandard_integer_type (sz * BITS_PER_UNIT, 1);
5320 tree etype = build_vector_type (eltype, 2);
5321 gcc_assert (convert_optab_handler (vec_extract_optab,
5322 TYPE_MODE (etype),
5323 TYPE_MODE (eltype))
5324 != CODE_FOR_nothing);
5325 tree tem = make_ssa_name (etype);
5326 epilog_stmt = gimple_build_assign (tem, VIEW_CONVERT_EXPR,
5327 build1 (VIEW_CONVERT_EXPR,
5328 etype, new_temp));
5329 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5330 new_temp = tem;
5331 tem = make_ssa_name (eltype);
5332 epilog_stmt
5333 = gimple_build_assign (tem, BIT_FIELD_REF,
5334 build3 (BIT_FIELD_REF, eltype,
5335 new_temp, TYPE_SIZE (eltype),
5336 bitsize_int (0)));
5337 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5338 dst1 = make_ssa_name (vectype1);
5339 epilog_stmt = gimple_build_assign (dst1, VIEW_CONVERT_EXPR,
5340 build1 (VIEW_CONVERT_EXPR,
5341 vectype1, tem));
5342 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5343 tem = make_ssa_name (eltype);
5344 epilog_stmt
5345 = gimple_build_assign (tem, BIT_FIELD_REF,
5346 build3 (BIT_FIELD_REF, eltype,
5347 new_temp, TYPE_SIZE (eltype),
5348 bitsize_int (sz * BITS_PER_UNIT)));
5349 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5350 dst2 = make_ssa_name (vectype1);
5351 epilog_stmt = gimple_build_assign (dst2, VIEW_CONVERT_EXPR,
5352 build1 (VIEW_CONVERT_EXPR,
5353 vectype1, tem));
5354 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5357 new_temp = make_ssa_name (vectype1);
5358 epilog_stmt = gimple_build_assign (new_temp, code, dst1, dst2);
5359 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5362 if (reduce_with_shift && !slp_reduc)
5364 int element_bitsize = tree_to_uhwi (bitsize);
5365 /* Enforced by vectorizable_reduction, which disallows SLP reductions
5366 for variable-length vectors and also requires direct target support
5367 for loop reductions. */
5368 int vec_size_in_bits = tree_to_uhwi (TYPE_SIZE (vectype1));
5369 int nelements = vec_size_in_bits / element_bitsize;
5370 vec_perm_builder sel;
5371 vec_perm_indices indices;
5373 int elt_offset;
5375 tree zero_vec = build_zero_cst (vectype1);
5376 /* Case 2: Create:
5377 for (offset = nelements/2; offset >= 1; offset/=2)
5379 Create: va' = vec_shift <va, offset>
5380 Create: va = vop <va, va'>
5381 } */
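/* Editor's worked example (illustrative): with nelements == 4 and
   CODE == PLUS_EXPR, starting from va = {a0, a1, a2, a3}:
     offset 2:  va' = {a2, a3, 0, 0}        va = {a0+a2, a1+a3, _, _}
     offset 1:  va' = {a1+a3, _, _, 0}      va = {a0+a2+a1+a3, _, _, _}
   after which only element 0 of VA is meaningful and is extracted in
   step 2.4 below (the '_' lanes are don't-care values).  */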
5383 tree rhs;
5385 if (dump_enabled_p ())
5386 dump_printf_loc (MSG_NOTE, vect_location,
5387 "Reduce using vector shifts\n");
5389 mode1 = TYPE_MODE (vectype1);
5390 vec_dest = vect_create_destination_var (scalar_dest, vectype1);
5391 for (elt_offset = nelements / 2;
5392 elt_offset >= 1;
5393 elt_offset /= 2)
5395 calc_vec_perm_mask_for_shift (elt_offset, nelements, &sel);
5396 indices.new_vector (sel, 2, nelements);
5397 tree mask = vect_gen_perm_mask_any (vectype1, indices);
5398 epilog_stmt = gimple_build_assign (vec_dest, VEC_PERM_EXPR,
5399 new_temp, zero_vec, mask);
5400 new_name = make_ssa_name (vec_dest, epilog_stmt);
5401 gimple_assign_set_lhs (epilog_stmt, new_name);
5402 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5404 epilog_stmt = gimple_build_assign (vec_dest, code, new_name,
5405 new_temp);
5406 new_temp = make_ssa_name (vec_dest, epilog_stmt);
5407 gimple_assign_set_lhs (epilog_stmt, new_temp);
5408 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5411 /* 2.4 Extract the final scalar result. Create:
5412 s_out3 = extract_field <v_out2, bitpos> */
5414 if (dump_enabled_p ())
5415 dump_printf_loc (MSG_NOTE, vect_location,
5416 "extract scalar result\n");
5418 rhs = build3 (BIT_FIELD_REF, scalar_type, new_temp,
5419 bitsize, bitsize_zero_node);
5420 epilog_stmt = gimple_build_assign (new_scalar_dest, rhs);
5421 new_temp = make_ssa_name (new_scalar_dest, epilog_stmt);
5422 gimple_assign_set_lhs (epilog_stmt, new_temp);
5423 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5424 scalar_results.safe_push (new_temp);
5426 else
5428 /* Case 3: Create:
5429 s = extract_field <v_out2, 0>
5430 for (offset = element_size;
5431 offset < vector_size;
5432 offset += element_size;)
5434 Create: s' = extract_field <v_out2, offset>
5435 Create: s = op <s, s'> // For non SLP cases
5436 } */
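/* Editor's sketch: for a single V4SI accumulator v = {a0, a1, a2, a3} and
   CODE == PLUS_EXPR the loop below emits, roughly,
     s  = BIT_FIELD_REF <v, 32, 0>
     s' = BIT_FIELD_REF <v, 32, 32>;  s = s + s'
     s' = BIT_FIELD_REF <v, 32, 64>;  s = s + s'
     s' = BIT_FIELD_REF <v, 32, 96>;  s = s + s'
   i.e. one extraction per element plus a scalar combine; for SLP the
   extracted values are collected instead of combined.  */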
5438 if (dump_enabled_p ())
5439 dump_printf_loc (MSG_NOTE, vect_location,
5440 "Reduce using scalar code.\n");
5442 int vec_size_in_bits = tree_to_uhwi (TYPE_SIZE (vectype1));
5443 int element_bitsize = tree_to_uhwi (bitsize);
5444 FOR_EACH_VEC_ELT (new_phis, i, new_phi)
5446 int bit_offset;
5447 if (gimple_code (new_phi) == GIMPLE_PHI)
5448 vec_temp = PHI_RESULT (new_phi);
5449 else
5450 vec_temp = gimple_assign_lhs (new_phi);
5451 tree rhs = build3 (BIT_FIELD_REF, scalar_type, vec_temp, bitsize,
5452 bitsize_zero_node);
5453 epilog_stmt = gimple_build_assign (new_scalar_dest, rhs);
5454 new_temp = make_ssa_name (new_scalar_dest, epilog_stmt);
5455 gimple_assign_set_lhs (epilog_stmt, new_temp);
5456 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5458 /* In SLP we don't need to apply the reduction operation, so we just
5459 collect the s' values in SCALAR_RESULTS. */
5460 if (slp_reduc)
5461 scalar_results.safe_push (new_temp);
5463 for (bit_offset = element_bitsize;
5464 bit_offset < vec_size_in_bits;
5465 bit_offset += element_bitsize)
5467 tree bitpos = bitsize_int (bit_offset);
5468 tree rhs = build3 (BIT_FIELD_REF, scalar_type, vec_temp,
5469 bitsize, bitpos);
5471 epilog_stmt = gimple_build_assign (new_scalar_dest, rhs);
5472 new_name = make_ssa_name (new_scalar_dest, epilog_stmt);
5473 gimple_assign_set_lhs (epilog_stmt, new_name);
5474 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5476 if (slp_reduc)
5478 /* In SLP we don't need to apply the reduction operation, so
5479 we just collect the s' values in SCALAR_RESULTS. */
5480 new_temp = new_name;
5481 scalar_results.safe_push (new_name);
5483 else
5485 epilog_stmt = gimple_build_assign (new_scalar_dest, code,
5486 new_name, new_temp);
5487 new_temp = make_ssa_name (new_scalar_dest, epilog_stmt);
5488 gimple_assign_set_lhs (epilog_stmt, new_temp);
5489 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5494 /* The only case where we need to reduce scalar results in SLP is
5495 unrolling. If the size of SCALAR_RESULTS is greater than
5496 GROUP_SIZE, we reduce them by combining elements modulo
5497 GROUP_SIZE. */
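/* Editor's sketch: with GROUP_SIZE == 2 and SCALAR_RESULTS == {a0, b0, a1, b1}
   (two unrolled copies of each of the two reductions), the loop below leaves
     scalar_results[0] = a0 CODE a1
     scalar_results[1] = b0 CODE b1.  */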
5498 if (slp_reduc)
5500 tree res, first_res, new_res;
5501 gimple *new_stmt;
5503 /* Reduce multiple scalar results in case of SLP unrolling. */
5504 for (j = group_size; scalar_results.iterate (j, &res);
5505 j++)
5507 first_res = scalar_results[j % group_size];
5508 new_stmt = gimple_build_assign (new_scalar_dest, code,
5509 first_res, res);
5510 new_res = make_ssa_name (new_scalar_dest, new_stmt);
5511 gimple_assign_set_lhs (new_stmt, new_res);
5512 gsi_insert_before (&exit_gsi, new_stmt, GSI_SAME_STMT);
5513 scalar_results[j % group_size] = new_res;
5516 else
5517 /* Not SLP - we have one scalar to keep in SCALAR_RESULTS. */
5518 scalar_results.safe_push (new_temp);
5521 if ((STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info)
5522 == INTEGER_INDUC_COND_REDUCTION)
5523 && !operand_equal_p (initial_def, induc_val, 0))
5525 /* Earlier we set the initial value to be a vector of induc_val
5526 values. Check the result and if it is induc_val then replace
5527 it with the original initial value, unless induc_val is
5528 already the same as initial_def. */
5529 tree zcompare = build2 (EQ_EXPR, boolean_type_node, new_temp,
5530 induc_val);
5532 tree tmp = make_ssa_name (new_scalar_dest);
5533 epilog_stmt = gimple_build_assign (tmp, COND_EXPR, zcompare,
5534 initial_def, new_temp);
5535 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5536 scalar_results[0] = tmp;
5540 vect_finalize_reduction:
5542 if (double_reduc)
5543 loop = loop->inner;
5545 /* 2.5 Adjust the final result by the initial value of the reduction
5546 variable. (When such adjustment is not needed, then
5547 'adjustment_def' is zero). For example, if code is PLUS we create:
5548 new_temp = loop_exit_def + adjustment_def */
5550 if (adjustment_def)
5552 gcc_assert (!slp_reduc);
5553 if (nested_in_vect_loop)
5555 new_phi = new_phis[0];
5556 gcc_assert (TREE_CODE (TREE_TYPE (adjustment_def)) == VECTOR_TYPE);
5557 expr = build2 (code, vectype, PHI_RESULT (new_phi), adjustment_def);
5558 new_dest = vect_create_destination_var (scalar_dest, vectype);
5560 else
5562 new_temp = scalar_results[0];
5563 gcc_assert (TREE_CODE (TREE_TYPE (adjustment_def)) != VECTOR_TYPE);
5564 expr = build2 (code, scalar_type, new_temp, adjustment_def);
5565 new_dest = vect_create_destination_var (scalar_dest, scalar_type);
5568 epilog_stmt = gimple_build_assign (new_dest, expr);
5569 new_temp = make_ssa_name (new_dest, epilog_stmt);
5570 gimple_assign_set_lhs (epilog_stmt, new_temp);
5571 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5572 if (nested_in_vect_loop)
5574 set_vinfo_for_stmt (epilog_stmt,
5575 new_stmt_vec_info (epilog_stmt, loop_vinfo));
5576 STMT_VINFO_RELATED_STMT (vinfo_for_stmt (epilog_stmt)) =
5577 STMT_VINFO_RELATED_STMT (vinfo_for_stmt (new_phi));
5579 if (!double_reduc)
5580 scalar_results.quick_push (new_temp);
5581 else
5582 scalar_results[0] = new_temp;
5584 else
5585 scalar_results[0] = new_temp;
5587 new_phis[0] = epilog_stmt;
5590 /* 2.6 Handle the loop-exit phis. Replace the uses of scalar loop-exit
5591 phis with new adjusted scalar results, i.e., replace use <s_out0>
5592 with use <s_out4>.
5594 Transform:
5595 loop_exit:
5596 s_out0 = phi <s_loop> # (scalar) EXIT_PHI
5597 v_out1 = phi <VECT_DEF> # NEW_EXIT_PHI
5598 v_out2 = reduce <v_out1>
5599 s_out3 = extract_field <v_out2, 0>
5600 s_out4 = adjust_result <s_out3>
5601 use <s_out0>
5602 use <s_out0>
5604 into:
5606 loop_exit:
5607 s_out0 = phi <s_loop> # (scalar) EXIT_PHI
5608 v_out1 = phi <VECT_DEF> # NEW_EXIT_PHI
5609 v_out2 = reduce <v_out1>
5610 s_out3 = extract_field <v_out2, 0>
5611 s_out4 = adjust_result <s_out3>
5612 use <s_out4>
5613 use <s_out4> */
5616 /* In an SLP reduction chain we reduce the vector results into one vector if
5617 necessary, hence here we set GROUP_SIZE to 1. SCALAR_DEST is the LHS of
5618 the last stmt in the reduction chain, since we are looking for the loop
5619 exit phi node. */
5620 if (GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)))
5622 gimple *dest_stmt = SLP_TREE_SCALAR_STMTS (slp_node)[group_size - 1];
5623 /* Handle reduction patterns. */
5624 if (STMT_VINFO_RELATED_STMT (vinfo_for_stmt (dest_stmt)))
5625 dest_stmt = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (dest_stmt));
5627 scalar_dest = gimple_assign_lhs (dest_stmt);
5628 group_size = 1;
5631 /* In SLP we may have several statements in NEW_PHIS and REDUCTION_PHIS (in
5632 case GROUP_SIZE is greater than the vectorization factor). Therefore, we
5633 need to match SCALAR_RESULTS with the corresponding statements. The first
5634 (GROUP_SIZE / number of new vector stmts) scalar results correspond to
5635 the first vector stmt, etc.
5636 (RATIO is equal to (GROUP_SIZE / number of new vector stmts)). */
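/* Editor's example: with GROUP_SIZE == 4 and two vector stmts in NEW_PHIS,
   RATIO is 2, so scalar results 0-1 are matched with the first vector stmt
   and results 2-3 with the second.  */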
5637 if (group_size > new_phis.length ())
5639 ratio = group_size / new_phis.length ();
5640 gcc_assert (!(group_size % new_phis.length ()));
5642 else
5643 ratio = 1;
5645 for (k = 0; k < group_size; k++)
5647 if (k % ratio == 0)
5649 epilog_stmt = new_phis[k / ratio];
5650 reduction_phi = reduction_phis[k / ratio];
5651 if (double_reduc)
5652 inner_phi = inner_phis[k / ratio];
5655 if (slp_reduc)
5657 gimple *current_stmt = SLP_TREE_SCALAR_STMTS (slp_node)[k];
5659 orig_stmt = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (current_stmt));
5660 /* SLP statements can't participate in patterns. */
5661 gcc_assert (!orig_stmt);
5662 scalar_dest = gimple_assign_lhs (current_stmt);
5665 phis.create (3);
5666 /* Find the loop-closed-use at the loop exit of the original scalar
5667 result. (The reduction result is expected to have two immediate uses -
5668 one at the latch block, and one at the loop exit). */
5669 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, scalar_dest)
5670 if (!flow_bb_inside_loop_p (loop, gimple_bb (USE_STMT (use_p)))
5671 && !is_gimple_debug (USE_STMT (use_p)))
5672 phis.safe_push (USE_STMT (use_p));
5674 /* While we expect to have found an exit_phi because of loop-closed-ssa
5675 form we can end up without one if the scalar cycle is dead. */
5677 FOR_EACH_VEC_ELT (phis, i, exit_phi)
5679 if (outer_loop)
5681 stmt_vec_info exit_phi_vinfo = vinfo_for_stmt (exit_phi);
5682 gphi *vect_phi;
5684 /* FORNOW. Currently not supporting the case that an inner-loop
5685 reduction is not used in the outer-loop (but only outside the
5686 outer-loop), unless it is a double reduction. */
5687 gcc_assert ((STMT_VINFO_RELEVANT_P (exit_phi_vinfo)
5688 && !STMT_VINFO_LIVE_P (exit_phi_vinfo))
5689 || double_reduc);
5691 if (double_reduc)
5692 STMT_VINFO_VEC_STMT (exit_phi_vinfo) = inner_phi;
5693 else
5694 STMT_VINFO_VEC_STMT (exit_phi_vinfo) = epilog_stmt;
5695 if (!double_reduc
5696 || STMT_VINFO_DEF_TYPE (exit_phi_vinfo)
5697 != vect_double_reduction_def)
5698 continue;
5700 /* Handle double reduction:
5702 stmt1: s1 = phi <s0, s2> - double reduction phi (outer loop)
5703 stmt2: s3 = phi <s1, s4> - (regular) reduc phi (inner loop)
5704 stmt3: s4 = use (s3) - (regular) reduc stmt (inner loop)
5705 stmt4: s2 = phi <s4> - double reduction stmt (outer loop)
5707 At that point the regular reduction (stmt2 and stmt3) is
5708 already vectorized, as well as the exit phi node, stmt4.
5709 Here we vectorize the phi node of double reduction, stmt1, and
5710 update all relevant statements. */
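/* Editor's note - a minimal source-level example of such a double reduction,
   assuming the inner loop is the one being vectorized:

       s = 0;
       for (i = 0; i < n; i++)          <-- outer loop (stmt1, stmt4)
         for (j = 0; j < m; j++)        <-- inner loop (stmt2, stmt3)
           s += a[i][j];
       use (s);                         <-- S only used after the outer loop

   The inner-loop accumulation is the regular reduction; the outer-loop phi
   carrying S is the double reduction phi handled here.  */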
5712 /* Go through all the uses of s2 to find double reduction phi
5713 node, i.e., stmt1 above. */
5714 orig_name = PHI_RESULT (exit_phi);
5715 FOR_EACH_IMM_USE_STMT (use_stmt, imm_iter, orig_name)
5717 stmt_vec_info use_stmt_vinfo;
5718 stmt_vec_info new_phi_vinfo;
5719 tree vect_phi_init, preheader_arg, vect_phi_res;
5720 basic_block bb = gimple_bb (use_stmt);
5721 gimple *use;
5723 /* Check that USE_STMT is really a double reduction phi
5724 node. */
5725 if (gimple_code (use_stmt) != GIMPLE_PHI
5726 || gimple_phi_num_args (use_stmt) != 2
5727 || bb->loop_father != outer_loop)
5728 continue;
5729 use_stmt_vinfo = vinfo_for_stmt (use_stmt);
5730 if (!use_stmt_vinfo
5731 || STMT_VINFO_DEF_TYPE (use_stmt_vinfo)
5732 != vect_double_reduction_def)
5733 continue;
5735 /* Create vector phi node for double reduction:
5736 vs1 = phi <vs0, vs2>
5737 vs1 was created previously in this function by a call to
5738 vect_get_vec_def_for_operand and is stored in
5739 vec_initial_def;
5740 vs2 is defined by INNER_PHI, the vectorized EXIT_PHI;
5741 vs0 is created here. */
5743 /* Create vector phi node. */
5744 vect_phi = create_phi_node (vec_initial_def, bb);
5745 new_phi_vinfo = new_stmt_vec_info (vect_phi,
5746 loop_vec_info_for_loop (outer_loop));
5747 set_vinfo_for_stmt (vect_phi, new_phi_vinfo);
5749 /* Create vs0 - initial def of the double reduction phi. */
5750 preheader_arg = PHI_ARG_DEF_FROM_EDGE (use_stmt,
5751 loop_preheader_edge (outer_loop));
5752 vect_phi_init = get_initial_def_for_reduction
5753 (stmt, preheader_arg, NULL);
5755 /* Update phi node arguments with vs0 and vs2. */
5756 add_phi_arg (vect_phi, vect_phi_init,
5757 loop_preheader_edge (outer_loop),
5758 UNKNOWN_LOCATION);
5759 add_phi_arg (vect_phi, PHI_RESULT (inner_phi),
5760 loop_latch_edge (outer_loop), UNKNOWN_LOCATION);
5761 if (dump_enabled_p ())
5763 dump_printf_loc (MSG_NOTE, vect_location,
5764 "created double reduction phi node: ");
5765 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, vect_phi, 0);
5768 vect_phi_res = PHI_RESULT (vect_phi);
5770 /* Replace the use, i.e., set the correct vs1 in the regular
5771 reduction phi node. FORNOW, NCOPIES is always 1, so the
5772 loop is redundant. */
5773 use = reduction_phi;
5774 for (j = 0; j < ncopies; j++)
5776 edge pr_edge = loop_preheader_edge (loop);
5777 SET_PHI_ARG_DEF (use, pr_edge->dest_idx, vect_phi_res);
5778 use = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (use));
5784 phis.release ();
5785 if (nested_in_vect_loop)
5787 if (double_reduc)
5788 loop = outer_loop;
5789 else
5790 continue;
5793 phis.create (3);
5794 /* Find the loop-closed-use at the loop exit of the original scalar
5795 result. (The reduction result is expected to have two immediate uses,
5796 one at the latch block, and one at the loop exit). For double
5797 reductions we are looking for exit phis of the outer loop. */
5798 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, scalar_dest)
5800 if (!flow_bb_inside_loop_p (loop, gimple_bb (USE_STMT (use_p))))
5802 if (!is_gimple_debug (USE_STMT (use_p)))
5803 phis.safe_push (USE_STMT (use_p));
5805 else
5807 if (double_reduc && gimple_code (USE_STMT (use_p)) == GIMPLE_PHI)
5809 tree phi_res = PHI_RESULT (USE_STMT (use_p));
5811 FOR_EACH_IMM_USE_FAST (phi_use_p, phi_imm_iter, phi_res)
5813 if (!flow_bb_inside_loop_p (loop,
5814 gimple_bb (USE_STMT (phi_use_p)))
5815 && !is_gimple_debug (USE_STMT (phi_use_p)))
5816 phis.safe_push (USE_STMT (phi_use_p));
5822 FOR_EACH_VEC_ELT (phis, i, exit_phi)
5824 /* Replace the uses: */
5825 orig_name = PHI_RESULT (exit_phi);
5826 scalar_result = scalar_results[k];
5827 FOR_EACH_IMM_USE_STMT (use_stmt, imm_iter, orig_name)
5828 FOR_EACH_IMM_USE_ON_STMT (use_p, imm_iter)
5829 SET_USE (use_p, scalar_result);
5832 phis.release ();
5836 /* Return a vector of type VECTYPE that is equal to the vector select
5837 operation "MASK ? VEC : IDENTITY". Insert the select statements
5838 before GSI. */
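/* Editor's note: for a fully-masked PLUS reduction, IDENTITY is a zero
   vector, so the select built here amounts to

     cond = MASK ? VEC : { 0, ..., 0 };

   leaving the masked-out lanes with no effect on the fold-left
   accumulation performed by the caller.  */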
5840 static tree
5841 merge_with_identity (gimple_stmt_iterator *gsi, tree mask, tree vectype,
5842 tree vec, tree identity)
5844 tree cond = make_temp_ssa_name (vectype, NULL, "cond");
5845 gimple *new_stmt = gimple_build_assign (cond, VEC_COND_EXPR,
5846 mask, vec, identity);
5847 gsi_insert_before (gsi, new_stmt, GSI_SAME_STMT);
5848 return cond;
5851 /* Successively apply CODE to each element of VECTOR_RHS, in left-to-right
5852 order, starting with LHS. Insert the extraction statements before GSI and
5853 associate the new scalar SSA names with variable SCALAR_DEST.
5854 Return the SSA name for the result. */
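/* Editor's sketch of the expansion, assuming a V4SI VECTOR_RHS, a 32-bit
   scalar type and CODE == PLUS_EXPR with initial value LHS:

     x0 = BIT_FIELD_REF <rhs, 32, 0>;    s1 = lhs + x0
     x1 = BIT_FIELD_REF <rhs, 32, 32>;   s2 = s1 + x1
     x2 = BIT_FIELD_REF <rhs, 32, 64>;   s3 = s2 + x2
     x3 = BIT_FIELD_REF <rhs, 32, 96>;   s4 = s3 + x3

   and S4 is returned; the strict left-to-right order is what preserves the
   in-order (non-reassociated) semantics.  */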
5856 static tree
5857 vect_expand_fold_left (gimple_stmt_iterator *gsi, tree scalar_dest,
5858 tree_code code, tree lhs, tree vector_rhs)
5860 tree vectype = TREE_TYPE (vector_rhs);
5861 tree scalar_type = TREE_TYPE (vectype);
5862 tree bitsize = TYPE_SIZE (scalar_type);
5863 unsigned HOST_WIDE_INT vec_size_in_bits = tree_to_uhwi (TYPE_SIZE (vectype));
5864 unsigned HOST_WIDE_INT element_bitsize = tree_to_uhwi (bitsize);
5866 for (unsigned HOST_WIDE_INT bit_offset = 0;
5867 bit_offset < vec_size_in_bits;
5868 bit_offset += element_bitsize)
5870 tree bitpos = bitsize_int (bit_offset);
5871 tree rhs = build3 (BIT_FIELD_REF, scalar_type, vector_rhs,
5872 bitsize, bitpos);
5874 gassign *stmt = gimple_build_assign (scalar_dest, rhs);
5875 rhs = make_ssa_name (scalar_dest, stmt);
5876 gimple_assign_set_lhs (stmt, rhs);
5877 gsi_insert_before (gsi, stmt, GSI_SAME_STMT);
5879 stmt = gimple_build_assign (scalar_dest, code, lhs, rhs);
5880 tree new_name = make_ssa_name (scalar_dest, stmt);
5881 gimple_assign_set_lhs (stmt, new_name);
5882 gsi_insert_before (gsi, stmt, GSI_SAME_STMT);
5883 lhs = new_name;
5885 return lhs;
5888 /* Perform an in-order reduction (FOLD_LEFT_REDUCTION). STMT is the
5889 statement that sets the live-out value. REDUC_DEF_STMT is the phi
5890 statement. CODE is the operation performed by STMT and OPS are
5891 its scalar operands. REDUC_INDEX is the index of the operand in
5892 OPS that is set by REDUC_DEF_STMT. REDUC_FN is the function that
5893 implements in-order reduction, or IFN_LAST if we should open-code it.
5894 VECTYPE_IN is the type of the vector input. MASKS specifies the masks
5895 that should be used to control the operation in a fully-masked loop. */
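/* Editor's note - an illustrative source loop that needs this in-order
   handling (without -ffast-math, FP addition may not be reassociated):

     double s = init;
     for (int i = 0; i < n; i++)
       s += a[i];

   With target support (e.g. IFN_FOLD_LEFT_PLUS) each vector of A is folded
   into S in element order; otherwise the open-coded expansion via
   vect_expand_fold_left above is used.  */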
5897 static bool
5898 vectorize_fold_left_reduction (gimple *stmt, gimple_stmt_iterator *gsi,
5899 gimple **vec_stmt, slp_tree slp_node,
5900 gimple *reduc_def_stmt,
5901 tree_code code, internal_fn reduc_fn,
5902 tree ops[3], tree vectype_in,
5903 int reduc_index, vec_loop_masks *masks)
5905 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
5906 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
5907 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
5908 tree vectype_out = STMT_VINFO_VECTYPE (stmt_info);
5909 gimple *new_stmt = NULL;
5911 int ncopies;
5912 if (slp_node)
5913 ncopies = 1;
5914 else
5915 ncopies = vect_get_num_copies (loop_vinfo, vectype_in);
5917 gcc_assert (!nested_in_vect_loop_p (loop, stmt));
5918 gcc_assert (ncopies == 1);
5919 gcc_assert (TREE_CODE_LENGTH (code) == binary_op);
5920 gcc_assert (reduc_index == (code == MINUS_EXPR ? 0 : 1));
5921 gcc_assert (STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info)
5922 == FOLD_LEFT_REDUCTION);
5924 if (slp_node)
5925 gcc_assert (known_eq (TYPE_VECTOR_SUBPARTS (vectype_out),
5926 TYPE_VECTOR_SUBPARTS (vectype_in)));
5928 tree op0 = ops[1 - reduc_index];
5930 int group_size = 1;
5931 gimple *scalar_dest_def;
5932 auto_vec<tree> vec_oprnds0;
5933 if (slp_node)
5935 vect_get_vec_defs (op0, NULL_TREE, stmt, &vec_oprnds0, NULL, slp_node);
5936 group_size = SLP_TREE_SCALAR_STMTS (slp_node).length ();
5937 scalar_dest_def = SLP_TREE_SCALAR_STMTS (slp_node)[group_size - 1];
5939 else
5941 tree loop_vec_def0 = vect_get_vec_def_for_operand (op0, stmt);
5942 vec_oprnds0.create (1);
5943 vec_oprnds0.quick_push (loop_vec_def0);
5944 scalar_dest_def = stmt;
5947 tree scalar_dest = gimple_assign_lhs (scalar_dest_def);
5948 tree scalar_type = TREE_TYPE (scalar_dest);
5949 tree reduc_var = gimple_phi_result (reduc_def_stmt);
5951 int vec_num = vec_oprnds0.length ();
5952 gcc_assert (vec_num == 1 || slp_node);
5953 tree vec_elem_type = TREE_TYPE (vectype_out);
5954 gcc_checking_assert (useless_type_conversion_p (scalar_type, vec_elem_type));
5956 tree vector_identity = NULL_TREE;
5957 if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
5958 vector_identity = build_zero_cst (vectype_out);
5960 tree scalar_dest_var = vect_create_destination_var (scalar_dest, NULL);
5961 int i;
5962 tree def0;
5963 FOR_EACH_VEC_ELT (vec_oprnds0, i, def0)
5965 tree mask = NULL_TREE;
5966 if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
5967 mask = vect_get_loop_mask (gsi, masks, vec_num, vectype_in, i);
5969 /* Handle MINUS by adding the negative. */
5970 if (reduc_fn != IFN_LAST && code == MINUS_EXPR)
5972 tree negated = make_ssa_name (vectype_out);
5973 new_stmt = gimple_build_assign (negated, NEGATE_EXPR, def0);
5974 gsi_insert_before (gsi, new_stmt, GSI_SAME_STMT);
5975 def0 = negated;
5978 if (mask)
5979 def0 = merge_with_identity (gsi, mask, vectype_out, def0,
5980 vector_identity);
5982 /* On the first iteration the input is simply the scalar phi
5983 result, and for subsequent iterations it is the output of
5984 the preceding operation. */
5985 if (reduc_fn != IFN_LAST)
5987 new_stmt = gimple_build_call_internal (reduc_fn, 2, reduc_var, def0);
5988 /* For chained SLP reductions the output of the previous reduction
5989 operation serves as the input of the next. For the final statement
5990 the output cannot be a temporary - we reuse the original
5991 scalar destination of the last statement. */
5992 if (i != vec_num - 1)
5994 gimple_set_lhs (new_stmt, scalar_dest_var);
5995 reduc_var = make_ssa_name (scalar_dest_var, new_stmt);
5996 gimple_set_lhs (new_stmt, reduc_var);
5999 else
6001 reduc_var = vect_expand_fold_left (gsi, scalar_dest_var, code,
6002 reduc_var, def0);
6003 new_stmt = SSA_NAME_DEF_STMT (reduc_var);
6004 /* Remove the statement, so that we can use the same code paths
6005 as for statements that we've just created. */
6006 gimple_stmt_iterator tmp_gsi = gsi_for_stmt (new_stmt);
6007 gsi_remove (&tmp_gsi, false);
6010 if (i == vec_num - 1)
6012 gimple_set_lhs (new_stmt, scalar_dest);
6013 vect_finish_replace_stmt (scalar_dest_def, new_stmt);
6015 else
6016 vect_finish_stmt_generation (scalar_dest_def, new_stmt, gsi);
6018 if (slp_node)
6019 SLP_TREE_VEC_STMTS (slp_node).quick_push (new_stmt);
6022 if (!slp_node)
6023 STMT_VINFO_VEC_STMT (stmt_info) = *vec_stmt = new_stmt;
6025 return true;
6028 /* Function is_nonwrapping_integer_induction.
6030 Check that STMT, which is part of loop LOOP, is an integer-based
6031 induction whose value never overflows. */
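/* Editor's note: the check below is essentially

     max_loop_value = base + step * max_stmt_executions (loop)

   computed in widest_int arithmetic, accepting the induction when that
   value still fits in the precision of the phi result type (or trivially
   when signed overflow is undefined for that type).  */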
6033 static bool
6034 is_nonwrapping_integer_induction (gimple *stmt, struct loop *loop)
6036 stmt_vec_info stmt_vinfo = vinfo_for_stmt (stmt);
6037 tree base = STMT_VINFO_LOOP_PHI_EVOLUTION_BASE_UNCHANGED (stmt_vinfo);
6038 tree step = STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_vinfo);
6039 tree lhs_type = TREE_TYPE (gimple_phi_result (stmt));
6040 widest_int ni, max_loop_value, lhs_max;
6041 bool overflow = false;
6043 /* Make sure the loop is integer based. */
6044 if (TREE_CODE (base) != INTEGER_CST
6045 || TREE_CODE (step) != INTEGER_CST)
6046 return false;
6048 /* Check that the max size of the loop will not wrap. */
6050 if (TYPE_OVERFLOW_UNDEFINED (lhs_type))
6051 return true;
6053 if (! max_stmt_executions (loop, &ni))
6054 return false;
6056 max_loop_value = wi::mul (wi::to_widest (step), ni, TYPE_SIGN (lhs_type),
6057 &overflow);
6058 if (overflow)
6059 return false;
6061 max_loop_value = wi::add (wi::to_widest (base), max_loop_value,
6062 TYPE_SIGN (lhs_type), &overflow);
6063 if (overflow)
6064 return false;
6066 return (wi::min_precision (max_loop_value, TYPE_SIGN (lhs_type))
6067 <= TYPE_PRECISION (lhs_type));
6070 /* Function vectorizable_reduction.
6072 Check if STMT performs a reduction operation that can be vectorized.
6073 If VEC_STMT is also passed, vectorize the STMT: create a vectorized
6074 stmt to replace it, put it in VEC_STMT, and insert it at GSI.
6075 Return FALSE if not a vectorizable STMT, TRUE otherwise.
6077 This function also handles reduction idioms (patterns) that have been
6078 recognized in advance during vect_pattern_recog. In this case, STMT may be
6079 of this form:
6080 X = pattern_expr (arg0, arg1, ..., X)
6081 and its STMT_VINFO_RELATED_STMT points to the last stmt in the original
6082 sequence that had been detected and replaced by the pattern-stmt (STMT).
6084 This function also handles reduction of condition expressions, for example:
6085 for (int i = 0; i < N; i++)
6086 if (a[i] < value)
6087 last = a[i];
6088 This is handled by vectorizing the loop and creating an additional vector
6089 containing the loop indexes for which "a[i] < value" was true. In the
6090 function epilogue this is reduced to a single max value and then used to
6091 index into the vector of results.
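   (Editor's illustrative summary: this is done by keeping, alongside the
   data, an induction vector of loop indexes and a vector that records, per
   lane, the index at which the condition last held, roughly
       last_idx = a[i] < value ? cur_idx : last_idx;
   the epilogue then applies REDUC_MAX to the index vector and uses the
   winning index to select the matching data element.)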
6093 In some cases of reduction patterns, the type of the reduction variable X is
6094 different than the type of the other arguments of STMT.
6095 In such cases, the vectype that is used when transforming STMT into a vector
6096 stmt is different than the vectype that is used to determine the
6097 vectorization factor, because it consists of a different number of elements
6098 than the actual number of elements that are being operated upon in parallel.
6100 For example, consider an accumulation of shorts into an int accumulator.
6101 On some targets it's possible to vectorize this pattern operating on 8
6102 shorts at a time (hence, the vectype for purposes of determining the
6103 vectorization factor should be V8HI); on the other hand, the vectype that
6104 is used to create the vector form is actually V4SI (the type of the result).
6106 Upon entry to this function, STMT_VINFO_VECTYPE records the vectype that
6107 indicates what is the actual level of parallelism (V8HI in the example), so
6108 that the right vectorization factor would be derived. This vectype
6109 corresponds to the type of arguments to the reduction stmt, and should *NOT*
6110 be used to create the vectorized stmt. The right vectype for the vectorized
6111 stmt is obtained from the type of the result X:
6112 get_vectype_for_scalar_type (TREE_TYPE (X))
6114 This means that, contrary to "regular" reductions (or "regular" stmts in
6115 general), the following equation:
6116 STMT_VINFO_VECTYPE == get_vectype_for_scalar_type (TREE_TYPE (X))
6117 does *NOT* necessarily hold for reduction patterns. */
6119 bool
6120 vectorizable_reduction (gimple *stmt, gimple_stmt_iterator *gsi,
6121 gimple **vec_stmt, slp_tree slp_node,
6122 slp_instance slp_node_instance,
6123 stmt_vector_for_cost *cost_vec)
6125 tree vec_dest;
6126 tree scalar_dest;
6127 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
6128 tree vectype_out = STMT_VINFO_VECTYPE (stmt_info);
6129 tree vectype_in = NULL_TREE;
6130 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
6131 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
6132 enum tree_code code, orig_code;
6133 internal_fn reduc_fn;
6134 machine_mode vec_mode;
6135 int op_type;
6136 optab optab;
6137 tree new_temp = NULL_TREE;
6138 gimple *def_stmt;
6139 enum vect_def_type dt, cond_reduc_dt = vect_unknown_def_type;
6140 gimple *cond_reduc_def_stmt = NULL;
6141 enum tree_code cond_reduc_op_code = ERROR_MARK;
6142 tree scalar_type;
6143 bool is_simple_use;
6144 gimple *orig_stmt;
6145 stmt_vec_info orig_stmt_info = NULL;
6146 int i;
6147 int ncopies;
6148 int epilog_copies;
6149 stmt_vec_info prev_stmt_info, prev_phi_info;
6150 bool single_defuse_cycle = false;
6151 gimple *new_stmt = NULL;
6152 int j;
6153 tree ops[3];
6154 enum vect_def_type dts[3];
6155 bool nested_cycle = false, found_nested_cycle_def = false;
6156 bool double_reduc = false;
6157 basic_block def_bb;
6158 struct loop * def_stmt_loop, *outer_loop = NULL;
6159 tree def_arg;
6160 gimple *def_arg_stmt;
6161 auto_vec<tree> vec_oprnds0;
6162 auto_vec<tree> vec_oprnds1;
6163 auto_vec<tree> vec_oprnds2;
6164 auto_vec<tree> vect_defs;
6165 auto_vec<gimple *> phis;
6166 int vec_num;
6167 tree def0, tem;
6168 bool first_p = true;
6169 tree cr_index_scalar_type = NULL_TREE, cr_index_vector_type = NULL_TREE;
6170 tree cond_reduc_val = NULL_TREE;
6172 /* Make sure it was already recognized as a reduction computation. */
6173 if (STMT_VINFO_DEF_TYPE (vinfo_for_stmt (stmt)) != vect_reduction_def
6174 && STMT_VINFO_DEF_TYPE (vinfo_for_stmt (stmt)) != vect_nested_cycle)
6175 return false;
6177 if (nested_in_vect_loop_p (loop, stmt))
6179 outer_loop = loop;
6180 loop = loop->inner;
6181 nested_cycle = true;
6184 /* In case of a reduction chain we switch to the first stmt in the chain, but
6185 we don't update STMT_INFO, since only the last stmt is marked as a reduction
6186 and has reduction properties. */
6187 if (GROUP_FIRST_ELEMENT (stmt_info)
6188 && GROUP_FIRST_ELEMENT (stmt_info) != stmt)
6190 stmt = GROUP_FIRST_ELEMENT (stmt_info);
6191 first_p = false;
6194 if (gimple_code (stmt) == GIMPLE_PHI)
6196 /* Analysis is fully done on the reduction stmt invocation. */
6197 if (! vec_stmt)
6199 if (slp_node)
6200 slp_node_instance->reduc_phis = slp_node;
6202 STMT_VINFO_TYPE (stmt_info) = reduc_vec_info_type;
6203 return true;
6206 if (STMT_VINFO_REDUC_TYPE (stmt_info) == FOLD_LEFT_REDUCTION)
6207 /* Leave the scalar phi in place. Note that checking
6208 STMT_VINFO_VEC_REDUCTION_TYPE (as below) only works
6209 for reductions involving a single statement. */
6210 return true;
6212 gimple *reduc_stmt = STMT_VINFO_REDUC_DEF (stmt_info);
6213 if (STMT_VINFO_IN_PATTERN_P (vinfo_for_stmt (reduc_stmt)))
6214 reduc_stmt = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (reduc_stmt));
6216 if (STMT_VINFO_VEC_REDUCTION_TYPE (vinfo_for_stmt (reduc_stmt))
6217 == EXTRACT_LAST_REDUCTION)
6218 /* Leave the scalar phi in place. */
6219 return true;
6221 gcc_assert (is_gimple_assign (reduc_stmt));
6222 for (unsigned k = 1; k < gimple_num_ops (reduc_stmt); ++k)
6224 tree op = gimple_op (reduc_stmt, k);
6225 if (op == gimple_phi_result (stmt))
6226 continue;
6227 if (k == 1
6228 && gimple_assign_rhs_code (reduc_stmt) == COND_EXPR)
6229 continue;
6230 if (!vectype_in
6231 || (GET_MODE_SIZE (SCALAR_TYPE_MODE (TREE_TYPE (vectype_in)))
6232 < GET_MODE_SIZE (SCALAR_TYPE_MODE (TREE_TYPE (op)))))
6233 vectype_in = get_vectype_for_scalar_type (TREE_TYPE (op));
6234 break;
6236 gcc_assert (vectype_in);
6238 if (slp_node)
6239 ncopies = 1;
6240 else
6241 ncopies = vect_get_num_copies (loop_vinfo, vectype_in);
6243 use_operand_p use_p;
6244 gimple *use_stmt;
6245 if (ncopies > 1
6246 && (STMT_VINFO_RELEVANT (vinfo_for_stmt (reduc_stmt))
6247 <= vect_used_only_live)
6248 && single_imm_use (gimple_phi_result (stmt), &use_p, &use_stmt)
6249 && (use_stmt == reduc_stmt
6250 || (STMT_VINFO_RELATED_STMT (vinfo_for_stmt (use_stmt))
6251 == reduc_stmt)))
6252 single_defuse_cycle = true;
6254 /* Create the destination vector. */
6255 scalar_dest = gimple_assign_lhs (reduc_stmt);
6256 vec_dest = vect_create_destination_var (scalar_dest, vectype_out);
6258 if (slp_node)
6259 /* The size vect_schedule_slp_instance computes is off for us. */
6260 vec_num = vect_get_num_vectors
6261 (LOOP_VINFO_VECT_FACTOR (loop_vinfo)
6262 * SLP_TREE_SCALAR_STMTS (slp_node).length (),
6263 vectype_in);
6264 else
6265 vec_num = 1;
6267 /* Generate the reduction PHIs upfront. */
6268 prev_phi_info = NULL;
6269 for (j = 0; j < ncopies; j++)
6271 if (j == 0 || !single_defuse_cycle)
6273 for (i = 0; i < vec_num; i++)
6275 /* Create the reduction-phi that defines the reduction
6276 operand. */
6277 gimple *new_phi = create_phi_node (vec_dest, loop->header);
6278 set_vinfo_for_stmt (new_phi,
6279 new_stmt_vec_info (new_phi, loop_vinfo));
6281 if (slp_node)
6282 SLP_TREE_VEC_STMTS (slp_node).quick_push (new_phi);
6283 else
6285 if (j == 0)
6286 STMT_VINFO_VEC_STMT (stmt_info) = *vec_stmt = new_phi;
6287 else
6288 STMT_VINFO_RELATED_STMT (prev_phi_info) = new_phi;
6289 prev_phi_info = vinfo_for_stmt (new_phi);
6295 return true;
6298 /* 1. Is vectorizable reduction? */
6299 /* Not supportable if the reduction variable is used in the loop, unless
6300 it's a reduction chain. */
6301 if (STMT_VINFO_RELEVANT (stmt_info) > vect_used_in_outer
6302 && !GROUP_FIRST_ELEMENT (stmt_info))
6303 return false;
6305 /* Reductions that are not used even in an enclosing outer-loop
6306 are expected to be "live" (used out of the loop). */
6307 if (STMT_VINFO_RELEVANT (stmt_info) == vect_unused_in_scope
6308 && !STMT_VINFO_LIVE_P (stmt_info))
6309 return false;
6311 /* 2. Has this been recognized as a reduction pattern?
6313 Check if STMT represents a pattern that has been recognized
6314 in earlier analysis stages. For stmts that represent a pattern,
6315 the STMT_VINFO_RELATED_STMT field records the last stmt in
6316 the original sequence that constitutes the pattern. */
6318 orig_stmt = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (stmt));
6319 if (orig_stmt)
6321 orig_stmt_info = vinfo_for_stmt (orig_stmt);
6322 gcc_assert (STMT_VINFO_IN_PATTERN_P (orig_stmt_info));
6323 gcc_assert (!STMT_VINFO_IN_PATTERN_P (stmt_info));
6326 /* 3. Check the operands of the operation. The first operands are defined
6327 inside the loop body. The last operand is the reduction variable,
6328 which is defined by the loop-header-phi. */
6330 gcc_assert (is_gimple_assign (stmt));
6332 /* Flatten RHS. */
6333 switch (get_gimple_rhs_class (gimple_assign_rhs_code (stmt)))
6335 case GIMPLE_BINARY_RHS:
6336 code = gimple_assign_rhs_code (stmt);
6337 op_type = TREE_CODE_LENGTH (code);
6338 gcc_assert (op_type == binary_op);
6339 ops[0] = gimple_assign_rhs1 (stmt);
6340 ops[1] = gimple_assign_rhs2 (stmt);
6341 break;
6343 case GIMPLE_TERNARY_RHS:
6344 code = gimple_assign_rhs_code (stmt);
6345 op_type = TREE_CODE_LENGTH (code);
6346 gcc_assert (op_type == ternary_op);
6347 ops[0] = gimple_assign_rhs1 (stmt);
6348 ops[1] = gimple_assign_rhs2 (stmt);
6349 ops[2] = gimple_assign_rhs3 (stmt);
6350 break;
6352 case GIMPLE_UNARY_RHS:
6353 return false;
6355 default:
6356 gcc_unreachable ();
6359 if (code == COND_EXPR && slp_node)
6360 return false;
6362 scalar_dest = gimple_assign_lhs (stmt);
6363 scalar_type = TREE_TYPE (scalar_dest);
6364 if (!POINTER_TYPE_P (scalar_type) && !INTEGRAL_TYPE_P (scalar_type)
6365 && !SCALAR_FLOAT_TYPE_P (scalar_type))
6366 return false;
6368 /* Do not try to vectorize bit-precision reductions. */
6369 if (!type_has_mode_precision_p (scalar_type))
6370 return false;
6372 /* All uses but the last are expected to be defined in the loop.
6373 The last use is the reduction variable. In case of a nested cycle this
6374 assumption is not true: we use reduc_index to record the index of the
6375 reduction variable. */
6376 gimple *reduc_def_stmt = NULL;
6377 int reduc_index = -1;
6378 for (i = 0; i < op_type; i++)
6380 /* The condition of COND_EXPR is checked in vectorizable_condition(). */
6381 if (i == 0 && code == COND_EXPR)
6382 continue;
6384 is_simple_use = vect_is_simple_use (ops[i], loop_vinfo,
6385 &def_stmt, &dts[i], &tem);
6386 dt = dts[i];
6387 gcc_assert (is_simple_use);
6388 if (dt == vect_reduction_def)
6390 reduc_def_stmt = def_stmt;
6391 reduc_index = i;
6392 continue;
6394 else if (tem)
6396 /* To properly compute ncopies we are interested in the widest
6397 input type in case we're looking at a widening accumulation. */
6398 if (!vectype_in
6399 || (GET_MODE_SIZE (SCALAR_TYPE_MODE (TREE_TYPE (vectype_in)))
6400 < GET_MODE_SIZE (SCALAR_TYPE_MODE (TREE_TYPE (tem)))))
6401 vectype_in = tem;
6404 if (dt != vect_internal_def
6405 && dt != vect_external_def
6406 && dt != vect_constant_def
6407 && dt != vect_induction_def
6408 && !(dt == vect_nested_cycle && nested_cycle))
6409 return false;
6411 if (dt == vect_nested_cycle)
6413 found_nested_cycle_def = true;
6414 reduc_def_stmt = def_stmt;
6415 reduc_index = i;
6418 if (i == 1 && code == COND_EXPR)
6420 /* Record how value of COND_EXPR is defined. */
6421 if (dt == vect_constant_def)
6423 cond_reduc_dt = dt;
6424 cond_reduc_val = ops[i];
6426 if (dt == vect_induction_def
6427 && def_stmt != NULL
6428 && is_nonwrapping_integer_induction (def_stmt, loop))
6430 cond_reduc_dt = dt;
6431 cond_reduc_def_stmt = def_stmt;
6436 if (!vectype_in)
6437 vectype_in = vectype_out;
6439 /* When vectorizing a reduction chain w/o SLP the reduction PHI is not
6440 directly used in stmt. */
6441 if (reduc_index == -1)
6443 if (STMT_VINFO_REDUC_TYPE (stmt_info) == FOLD_LEFT_REDUCTION)
6445 if (dump_enabled_p ())
6446 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6447 "in-order reduction chain without SLP.\n");
6448 return false;
6451 if (orig_stmt)
6452 reduc_def_stmt = STMT_VINFO_REDUC_DEF (orig_stmt_info);
6453 else
6454 reduc_def_stmt = STMT_VINFO_REDUC_DEF (stmt_info);
6457 if (! reduc_def_stmt || gimple_code (reduc_def_stmt) != GIMPLE_PHI)
6458 return false;
6460 if (!(reduc_index == -1
6461 || dts[reduc_index] == vect_reduction_def
6462 || dts[reduc_index] == vect_nested_cycle
6463 || ((dts[reduc_index] == vect_internal_def
6464 || dts[reduc_index] == vect_external_def
6465 || dts[reduc_index] == vect_constant_def
6466 || dts[reduc_index] == vect_induction_def)
6467 && nested_cycle && found_nested_cycle_def)))
6469 /* For pattern recognized stmts, orig_stmt might be a reduction,
6470 but some helper statements for the pattern might not, or
6471 might be COND_EXPRs with reduction uses in the condition. */
6472 gcc_assert (orig_stmt);
6473 return false;
6476 stmt_vec_info reduc_def_info = vinfo_for_stmt (reduc_def_stmt);
6477 enum vect_reduction_type v_reduc_type
6478 = STMT_VINFO_REDUC_TYPE (reduc_def_info);
6479 gimple *tmp = STMT_VINFO_REDUC_DEF (reduc_def_info);
6481 STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info) = v_reduc_type;
6482 /* If we have a condition reduction, see if we can simplify it further. */
6483 if (v_reduc_type == COND_REDUCTION)
6485 /* TODO: We can't yet handle reduction chains, since we need to treat
6486 each COND_EXPR in the chain specially, not just the last one.
6487 E.g. for:
6489 x_1 = PHI <x_3, ...>
6490 x_2 = a_2 ? ... : x_1;
6491 x_3 = a_3 ? ... : x_2;
6493 we're interested in the last element in x_3 for which a_2 || a_3
6494 is true, whereas the current reduction chain handling would
6495 vectorize x_2 as a normal VEC_COND_EXPR and only treat x_3
6496 as a reduction operation. */
6497 if (reduc_index == -1)
6499 if (dump_enabled_p ())
6500 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6501 "conditional reduction chains not supported\n");
6502 return false;
6505 /* vect_is_simple_reduction ensured that operand 2 is the
6506 loop-carried operand. */
6507 gcc_assert (reduc_index == 2);
6509 /* Loop peeling modifies the initial value of the reduction PHI, which
6510 makes the reduction stmt that is transformed differ from the
6511 original stmt analyzed. We need to record the reduction code for
6512 CONST_COND_REDUCTION type reductions at the analysis stage, so that
6513 it can be used directly at the transform stage. */
6514 if (STMT_VINFO_VEC_CONST_COND_REDUC_CODE (stmt_info) == MAX_EXPR
6515 || STMT_VINFO_VEC_CONST_COND_REDUC_CODE (stmt_info) == MIN_EXPR)
6517 /* Also set the reduction type to CONST_COND_REDUCTION. */
6518 gcc_assert (cond_reduc_dt == vect_constant_def);
6519 STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info) = CONST_COND_REDUCTION;
6521 else if (direct_internal_fn_supported_p (IFN_FOLD_EXTRACT_LAST,
6522 vectype_in, OPTIMIZE_FOR_SPEED))
6524 if (dump_enabled_p ())
6525 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6526 "optimizing condition reduction with"
6527 " FOLD_EXTRACT_LAST.\n");
6528 STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info) = EXTRACT_LAST_REDUCTION;
6530 else if (cond_reduc_dt == vect_induction_def)
6532 stmt_vec_info cond_stmt_vinfo = vinfo_for_stmt (cond_reduc_def_stmt);
6533 tree base
6534 = STMT_VINFO_LOOP_PHI_EVOLUTION_BASE_UNCHANGED (cond_stmt_vinfo);
6535 tree step = STMT_VINFO_LOOP_PHI_EVOLUTION_PART (cond_stmt_vinfo);
6537 gcc_assert (TREE_CODE (base) == INTEGER_CST
6538 && TREE_CODE (step) == INTEGER_CST);
6539 cond_reduc_val = NULL_TREE;
6540 /* Find a suitable value, for MAX_EXPR below base, for MIN_EXPR
6541 above base; punt if base is the minimum value of the type for
6542 MAX_EXPR or maximum value of the type for MIN_EXPR for now. */
6543 if (tree_int_cst_sgn (step) == -1)
6545 cond_reduc_op_code = MIN_EXPR;
6546 if (tree_int_cst_sgn (base) == -1)
6547 cond_reduc_val = build_int_cst (TREE_TYPE (base), 0);
6548 else if (tree_int_cst_lt (base,
6549 TYPE_MAX_VALUE (TREE_TYPE (base))))
6550 cond_reduc_val
6551 = int_const_binop (PLUS_EXPR, base, integer_one_node);
6553 else
6555 cond_reduc_op_code = MAX_EXPR;
6556 if (tree_int_cst_sgn (base) == 1)
6557 cond_reduc_val = build_int_cst (TREE_TYPE (base), 0);
6558 else if (tree_int_cst_lt (TYPE_MIN_VALUE (TREE_TYPE (base)),
6559 base))
6560 cond_reduc_val
6561 = int_const_binop (MINUS_EXPR, base, integer_one_node);
6563 if (cond_reduc_val)
6565 if (dump_enabled_p ())
6566 dump_printf_loc (MSG_NOTE, vect_location,
6567 "condition expression based on "
6568 "integer induction.\n");
6569 STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info)
6570 = INTEGER_INDUC_COND_REDUCTION;
6573 else if (cond_reduc_dt == vect_constant_def)
6575 enum vect_def_type cond_initial_dt;
6576 gimple *def_stmt = SSA_NAME_DEF_STMT (ops[reduc_index]);
6577 tree cond_initial_val
6578 = PHI_ARG_DEF_FROM_EDGE (def_stmt, loop_preheader_edge (loop));
6580 gcc_assert (cond_reduc_val != NULL_TREE);
6581 vect_is_simple_use (cond_initial_val, loop_vinfo,
6582 &def_stmt, &cond_initial_dt);
6583 if (cond_initial_dt == vect_constant_def
6584 && types_compatible_p (TREE_TYPE (cond_initial_val),
6585 TREE_TYPE (cond_reduc_val)))
6587 tree e = fold_binary (LE_EXPR, boolean_type_node,
6588 cond_initial_val, cond_reduc_val);
6589 if (e && (integer_onep (e) || integer_zerop (e)))
6591 if (dump_enabled_p ())
6592 dump_printf_loc (MSG_NOTE, vect_location,
6593 "condition expression based on "
6594 "compile time constant.\n");
6595 /* Record reduction code at analysis stage. */
6596 STMT_VINFO_VEC_CONST_COND_REDUC_CODE (stmt_info)
6597 = integer_onep (e) ? MAX_EXPR : MIN_EXPR;
6598 STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info)
6599 = CONST_COND_REDUCTION;
6605 if (orig_stmt)
6606 gcc_assert (tmp == orig_stmt
6607 || GROUP_FIRST_ELEMENT (vinfo_for_stmt (tmp)) == orig_stmt);
6608 else
6609 /* We changed STMT to be the first stmt in the reduction chain, hence we
6610 check that in this case the first element in the chain is STMT. */
6611 gcc_assert (stmt == tmp
6612 || GROUP_FIRST_ELEMENT (vinfo_for_stmt (tmp)) == stmt);
6614 if (STMT_VINFO_LIVE_P (vinfo_for_stmt (reduc_def_stmt)))
6615 return false;
6617 if (slp_node)
6618 ncopies = 1;
6619 else
6620 ncopies = vect_get_num_copies (loop_vinfo, vectype_in);
6622 gcc_assert (ncopies >= 1);
6624 vec_mode = TYPE_MODE (vectype_in);
6625 poly_uint64 nunits_out = TYPE_VECTOR_SUBPARTS (vectype_out);
6627 if (code == COND_EXPR)
6629 /* Only call during the analysis stage, otherwise we'll lose
6630 STMT_VINFO_TYPE. */
6631 if (!vec_stmt && !vectorizable_condition (stmt, gsi, NULL,
6632 ops[reduc_index], 0, NULL,
6633 cost_vec))
6635 if (dump_enabled_p ())
6636 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6637 "unsupported condition in reduction\n");
6638 return false;
6641 else
6643 /* 4. Supportable by target? */
6645 if (code == LSHIFT_EXPR || code == RSHIFT_EXPR
6646 || code == LROTATE_EXPR || code == RROTATE_EXPR)
6648 /* Shifts and rotates are only supported by vectorizable_shifts,
6649 not vectorizable_reduction. */
6650 if (dump_enabled_p ())
6651 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6652 "unsupported shift or rotation.\n");
6653 return false;
6656 /* 4.1. check support for the operation in the loop */
6657 optab = optab_for_tree_code (code, vectype_in, optab_default);
6658 if (!optab)
6660 if (dump_enabled_p ())
6661 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6662 "no optab.\n");
6664 return false;
6667 if (optab_handler (optab, vec_mode) == CODE_FOR_nothing)
6669 if (dump_enabled_p ())
6670 dump_printf (MSG_NOTE, "op not supported by target.\n");
6672 if (maybe_ne (GET_MODE_SIZE (vec_mode), UNITS_PER_WORD)
6673 || !vect_worthwhile_without_simd_p (loop_vinfo, code))
6674 return false;
6676 if (dump_enabled_p ())
6677 dump_printf (MSG_NOTE, "proceeding using word mode.\n");
6680 /* Worthwhile without SIMD support? */
6681 if (!VECTOR_MODE_P (TYPE_MODE (vectype_in))
6682 && !vect_worthwhile_without_simd_p (loop_vinfo, code))
6684 if (dump_enabled_p ())
6685 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6686 "not worthwhile without SIMD support.\n");
6688 return false;
6692 /* 4.2. Check support for the epilog operation.
6694 If STMT represents a reduction pattern, then the type of the
6695 reduction variable may be different than the type of the rest
6696 of the arguments. For example, consider the case of accumulation
6697 of shorts into an int accumulator. The original code:
6698 S1: int_a = (int) short_a;
6699 orig_stmt-> S2: int_acc = plus <int_a ,int_acc>;
6701 was replaced with:
6702 STMT: int_acc = widen_sum <short_a, int_acc>
6704 This means that:
6705 1. The tree-code that is used to create the vector operation in the
6706 epilog code (that reduces the partial results) is not the
6707 tree-code of STMT, but is rather the tree-code of the original
6708 stmt from the pattern that STMT is replacing. I.e, in the example
6709 above we want to use 'widen_sum' in the loop, but 'plus' in the
6710 epilog.
6711 2. The type (mode) we use to check available target support
6712 for the vector operation to be created in the *epilog*, is
6713 determined by the type of the reduction variable (in the example
6714 above we'd check this: optab_handler (plus_optab, vect_int_mode)).
6715 However the type (mode) we use to check available target support
6716 for the vector operation to be created *inside the loop*, is
6717 determined by the type of the other arguments to STMT (in the
6718 example we'd check this: optab_handler (widen_sum_optab,
6719 vect_short_mode)).
6721 This is contrary to "regular" reductions, in which the types of all
6722 the arguments are the same as the type of the reduction variable.
6723 For "regular" reductions we can therefore use the same vector type
6724 (and also the same tree-code) when generating the epilog code and
6725 when generating the code inside the loop. */
6727 vect_reduction_type reduction_type
6728 = STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info);
6729 if (orig_stmt
6730 && (reduction_type == TREE_CODE_REDUCTION
6731 || reduction_type == FOLD_LEFT_REDUCTION))
6733 /* This is a reduction pattern: get the vectype from the type of the
6734 reduction variable, and get the tree-code from orig_stmt. */
6735 orig_code = gimple_assign_rhs_code (orig_stmt);
6736 gcc_assert (vectype_out);
6737 vec_mode = TYPE_MODE (vectype_out);
6739 else
6741 /* Regular reduction: the same vectype and tree-code that are used for
6742 the vector code inside the loop can also be used for the epilog code. */
6743 orig_code = code;
6745 if (code == MINUS_EXPR)
6746 orig_code = PLUS_EXPR;
6748 /* For simple condition reductions, replace with the actual expression
6749 we want to base our reduction around. */
6750 if (reduction_type == CONST_COND_REDUCTION)
6752 orig_code = STMT_VINFO_VEC_CONST_COND_REDUC_CODE (stmt_info);
6753 gcc_assert (orig_code == MAX_EXPR || orig_code == MIN_EXPR);
6755 else if (reduction_type == INTEGER_INDUC_COND_REDUCTION)
6756 orig_code = cond_reduc_op_code;
6759 if (nested_cycle)
6761 def_bb = gimple_bb (reduc_def_stmt);
6762 def_stmt_loop = def_bb->loop_father;
6763 def_arg = PHI_ARG_DEF_FROM_EDGE (reduc_def_stmt,
6764 loop_preheader_edge (def_stmt_loop));
6765 if (TREE_CODE (def_arg) == SSA_NAME
6766 && (def_arg_stmt = SSA_NAME_DEF_STMT (def_arg))
6767 && gimple_code (def_arg_stmt) == GIMPLE_PHI
6768 && flow_bb_inside_loop_p (outer_loop, gimple_bb (def_arg_stmt))
6769 && vinfo_for_stmt (def_arg_stmt)
6770 && STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def_arg_stmt))
6771 == vect_double_reduction_def)
6772 double_reduc = true;
6775 reduc_fn = IFN_LAST;
6777 if (reduction_type == TREE_CODE_REDUCTION
6778 || reduction_type == FOLD_LEFT_REDUCTION
6779 || reduction_type == INTEGER_INDUC_COND_REDUCTION
6780 || reduction_type == CONST_COND_REDUCTION)
6782 if (reduction_type == FOLD_LEFT_REDUCTION
6783 ? fold_left_reduction_fn (orig_code, &reduc_fn)
6784 : reduction_fn_for_scalar_code (orig_code, &reduc_fn))
6786 if (reduc_fn != IFN_LAST
6787 && !direct_internal_fn_supported_p (reduc_fn, vectype_out,
6788 OPTIMIZE_FOR_SPEED))
6790 if (dump_enabled_p ())
6791 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6792 "reduc op not supported by target.\n");
6794 reduc_fn = IFN_LAST;
6797 else
6799 if (!nested_cycle || double_reduc)
6801 if (dump_enabled_p ())
6802 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6803 "no reduc code for scalar code.\n");
6805 return false;
6809 else if (reduction_type == COND_REDUCTION)
6811 int scalar_precision
6812 = GET_MODE_PRECISION (SCALAR_TYPE_MODE (scalar_type));
6813 cr_index_scalar_type = make_unsigned_type (scalar_precision);
6814 cr_index_vector_type = build_vector_type (cr_index_scalar_type,
6815 nunits_out);
6817 if (direct_internal_fn_supported_p (IFN_REDUC_MAX, cr_index_vector_type,
6818 OPTIMIZE_FOR_SPEED))
6819 reduc_fn = IFN_REDUC_MAX;
6822 if (reduction_type != EXTRACT_LAST_REDUCTION
6823 && reduc_fn == IFN_LAST
6824 && !nunits_out.is_constant ())
6826 if (dump_enabled_p ())
6827 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6828 "missing target support for reduction on"
6829 " variable-length vectors.\n");
6830 return false;
6833 if ((double_reduc || reduction_type != TREE_CODE_REDUCTION)
6834 && ncopies > 1)
6836 if (dump_enabled_p ())
6837 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6838 "multiple types in double reduction or condition "
6839 "reduction.\n");
6840 return false;
6843 /* For SLP reductions, see if there is a neutral value we can use. */
6844 tree neutral_op = NULL_TREE;
6845 if (slp_node)
6846 neutral_op
6847 = neutral_op_for_slp_reduction (slp_node_instance->reduc_phis, code,
6848 GROUP_FIRST_ELEMENT (stmt_info) != NULL);
6850 if (double_reduc && reduction_type == FOLD_LEFT_REDUCTION)
6852 /* We can't support in-order reductions of code such as this:
6854 for (int i = 0; i < n1; ++i)
6855 for (int j = 0; j < n2; ++j)
6856 l += a[j];
6858 since GCC effectively transforms the loop when vectorizing:
6860 for (int i = 0; i < n1 / VF; ++i)
6861 for (int j = 0; j < n2; ++j)
6862 for (int k = 0; k < VF; ++k)
6863 l += a[j];
6865 which is a reassociation of the original operation. */
6866 if (dump_enabled_p ())
6867 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6868 "in-order double reduction not supported.\n");
6870 return false;
6873 if (reduction_type == FOLD_LEFT_REDUCTION
6874 && slp_node
6875 && !GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)))
6877 /* We cannot use in-order reductions in this case because there is
6878 an implicit reassociation of the operations involved. */
6879 if (dump_enabled_p ())
6880 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6881 "in-order unchained SLP reductions not supported.\n");
6882 return false;
6885 /* For double reductions, and for SLP reductions with a neutral value,
6886 we construct a variable-length initial vector by loading a vector
6887 full of the neutral value and then shift-and-inserting the start
6888 values into the low-numbered elements. */
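/* Editor's sketch of the shift-and-insert construction, assuming a neutral
   value N and two start values i0 and i1:

     v = { N, N, ..., N }
     v = VEC_SHL_INSERT (v, i0)   -->  { i0, N, ..., N }
     v = VEC_SHL_INSERT (v, i1)   -->  { i1, i0, N, ..., N }

   so the start values end up in the low-numbered elements (the insertion
   order chosen by the vectorizer determines which value lands where) while
   every remaining lane holds the neutral value.  */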
6889 if ((double_reduc || neutral_op)
6890 && !nunits_out.is_constant ()
6891 && !direct_internal_fn_supported_p (IFN_VEC_SHL_INSERT,
6892 vectype_out, OPTIMIZE_FOR_SPEED))
6894 if (dump_enabled_p ())
6895 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6896 "reduction on variable-length vectors requires"
6897 " target support for a vector-shift-and-insert"
6898 " operation.\n");
6899 return false;
6902 /* Check extra constraints for variable-length unchained SLP reductions. */
6903 if (STMT_SLP_TYPE (stmt_info)
6904 && !GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt))
6905 && !nunits_out.is_constant ())
6907 /* We checked above that we could build the initial vector when
6908 there's a neutral element value. Check here for the case in
6909 which each SLP statement has its own initial value and in which
6910 that value needs to be repeated for every instance of the
6911 statement within the initial vector. */
6912 unsigned int group_size = SLP_TREE_SCALAR_STMTS (slp_node).length ();
6913 scalar_mode elt_mode = SCALAR_TYPE_MODE (TREE_TYPE (vectype_out));
6914 if (!neutral_op
6915 && !can_duplicate_and_interleave_p (group_size, elt_mode))
6917 if (dump_enabled_p ())
6918 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6919 "unsupported form of SLP reduction for"
6920 " variable-length vectors: cannot build"
6921 " initial vector.\n");
6922 return false;
6924 /* The epilogue code relies on the number of elements being a multiple
6925 of the group size. The duplicate-and-interleave approach to setting
6926 up the initial vector does too. */
6927 if (!multiple_p (nunits_out, group_size))
6929 if (dump_enabled_p ())
6930 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6931 "unsupported form of SLP reduction for"
6932 " variable-length vectors: the vector size"
6933 " is not a multiple of the number of results.\n");
6934 return false;
6938 /* In case of widening multiplication by a constant, we update the type
6939 of the constant to be the type of the other operand. We check that the
6940 constant fits the type in the pattern recognition pass. */
6941 if (code == DOT_PROD_EXPR
6942 && !types_compatible_p (TREE_TYPE (ops[0]), TREE_TYPE (ops[1])))
6944 if (TREE_CODE (ops[0]) == INTEGER_CST)
6945 ops[0] = fold_convert (TREE_TYPE (ops[1]), ops[0]);
6946 else if (TREE_CODE (ops[1]) == INTEGER_CST)
6947 ops[1] = fold_convert (TREE_TYPE (ops[0]), ops[1]);
6948 else
6950 if (dump_enabled_p ())
6951 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6952 "invalid types in dot-prod\n");
6954 return false;
6958 if (reduction_type == COND_REDUCTION)
6960 widest_int ni;
6962 if (! max_loop_iterations (loop, &ni))
6964 if (dump_enabled_p ())
6965 dump_printf_loc (MSG_NOTE, vect_location,
6966 "loop count not known, cannot create cond "
6967 "reduction.\n");
6968 return false;
6970 /* Convert backedges to iterations. */
6971 ni += 1;
6973 /* The additional index will have the same type as the condition. Check
6974 that the loop iteration count fits into this type less one (because we
6975 use up the zero slot for when there are no matches). */
6976 tree max_index = TYPE_MAX_VALUE (cr_index_scalar_type);
6977 if (wi::geu_p (ni, wi::to_widest (max_index)))
6979 if (dump_enabled_p ())
6980 dump_printf_loc (MSG_NOTE, vect_location,
6981 "loop size is greater than data size.\n");
6982 return false;
6986 /* In case the vectorization factor (VF) is bigger than the number
6987 of elements that we can fit in a vectype (nunits), we have to generate
6988 more than one vector stmt - i.e., we need to "unroll" the
6989 vector stmt by a factor VF/nunits. For more details see documentation
6990 in vectorizable_operation. */
6992 /* If the reduction is used in an outer loop we need to generate
6993 VF intermediate results, like so (e.g. for ncopies=2):
6994 r0 = phi (init, r0)
6995 r1 = phi (init, r1)
6996 r0 = x0 + r0;
6997 r1 = x1 + r1;
6998 (i.e. we generate VF results in 2 registers).
6999 In this case we have a separate def-use cycle for each copy, and therefore
7000 for each copy we get the vector def for the reduction variable from the
7001 respective phi node created for this copy.
7003 Otherwise (the reduction is unused in the loop nest), we can combine
7004 together intermediate results, like so (e.g. for ncopies=2):
7005 r = phi (init, r)
7006 r = x0 + r;
7007 r = x1 + r;
7008 (i.e. we generate VF/2 results in a single register).
7009 In this case for each copy we get the vector def for the reduction variable
7010 from the vectorized reduction operation generated in the previous iteration.
7012 This only works when we see both the reduction PHI and its only consumer
7013 in vectorizable_reduction and there are no intermediate stmts
7014 participating. */
7015 use_operand_p use_p;
7016 gimple *use_stmt;
7017 if (ncopies > 1
7018 && (STMT_VINFO_RELEVANT (stmt_info) <= vect_used_only_live)
7019 && single_imm_use (gimple_phi_result (reduc_def_stmt), &use_p, &use_stmt)
7020 && (use_stmt == stmt
7021 || STMT_VINFO_RELATED_STMT (vinfo_for_stmt (use_stmt)) == stmt))
7023 single_defuse_cycle = true;
7024 epilog_copies = 1;
7026 else
7027 epilog_copies = ncopies;
7029 /* If the reduction stmt is one of the patterns that have lane
7030 reduction embedded we cannot handle the case of ! single_defuse_cycle. */
7031 if ((ncopies > 1
7032 && ! single_defuse_cycle)
7033 && (code == DOT_PROD_EXPR
7034 || code == WIDEN_SUM_EXPR
7035 || code == SAD_EXPR))
7037 if (dump_enabled_p ())
7038 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7039 "multi def-use cycle not possible for lane-reducing "
7040 "reduction operation\n");
7041 return false;
7044 if (slp_node)
7045 vec_num = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node);
7046 else
7047 vec_num = 1;
7049 internal_fn cond_fn = get_conditional_internal_fn (code);
7050 vec_loop_masks *masks = &LOOP_VINFO_MASKS (loop_vinfo);
7052 if (!vec_stmt) /* transformation not required. */
7054 if (first_p)
7055 vect_model_reduction_cost (stmt_info, reduc_fn, ncopies, cost_vec);
7056 if (loop_vinfo && LOOP_VINFO_CAN_FULLY_MASK_P (loop_vinfo))
7058 if (reduction_type != FOLD_LEFT_REDUCTION
7059 && (cond_fn == IFN_LAST
7060 || !direct_internal_fn_supported_p (cond_fn, vectype_in,
7061 OPTIMIZE_FOR_SPEED)))
7063 if (dump_enabled_p ())
7064 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7065 "can't use a fully-masked loop because no"
7066 " conditional operation is available.\n");
7067 LOOP_VINFO_CAN_FULLY_MASK_P (loop_vinfo) = false;
7069 else if (reduc_index == -1)
7071 if (dump_enabled_p ())
7072 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7073 "can't use a fully-masked loop for chained"
7074 " reductions.\n");
7075 LOOP_VINFO_CAN_FULLY_MASK_P (loop_vinfo) = false;
7077 else
7078 vect_record_loop_mask (loop_vinfo, masks, ncopies * vec_num,
7079 vectype_in);
7081 if (dump_enabled_p ()
7082 && reduction_type == FOLD_LEFT_REDUCTION)
7083 dump_printf_loc (MSG_NOTE, vect_location,
7084 "using an in-order (fold-left) reduction.\n");
7085 STMT_VINFO_TYPE (stmt_info) = reduc_vec_info_type;
7086 return true;
7089 /* Transform. */
7091 if (dump_enabled_p ())
7092 dump_printf_loc (MSG_NOTE, vect_location, "transform reduction.\n");
7094 /* FORNOW: Multiple types are not supported for condition. */
7095 if (code == COND_EXPR)
7096 gcc_assert (ncopies == 1);
7098 bool masked_loop_p = LOOP_VINFO_FULLY_MASKED_P (loop_vinfo);
7100 if (reduction_type == FOLD_LEFT_REDUCTION)
7101 return vectorize_fold_left_reduction
7102 (stmt, gsi, vec_stmt, slp_node, reduc_def_stmt, code,
7103 reduc_fn, ops, vectype_in, reduc_index, masks);
7105 if (reduction_type == EXTRACT_LAST_REDUCTION)
7107 gcc_assert (!slp_node);
7108 return vectorizable_condition (stmt, gsi, vec_stmt,
7109 NULL, reduc_index, NULL, NULL);
7112 /* Create the destination vector */
7113 vec_dest = vect_create_destination_var (scalar_dest, vectype_out);
7115 prev_stmt_info = NULL;
7116 prev_phi_info = NULL;
7117 if (!slp_node)
7119 vec_oprnds0.create (1);
7120 vec_oprnds1.create (1);
7121 if (op_type == ternary_op)
7122 vec_oprnds2.create (1);
7125 phis.create (vec_num);
7126 vect_defs.create (vec_num);
7127 if (!slp_node)
7128 vect_defs.quick_push (NULL_TREE);
7130 if (slp_node)
7131 phis.splice (SLP_TREE_VEC_STMTS (slp_node_instance->reduc_phis));
7132 else
7133 phis.quick_push (STMT_VINFO_VEC_STMT (vinfo_for_stmt (reduc_def_stmt)));
7135 for (j = 0; j < ncopies; j++)
7137 if (code == COND_EXPR)
7139 gcc_assert (!slp_node);
7140 vectorizable_condition (stmt, gsi, vec_stmt,
7141 PHI_RESULT (phis[0]),
7142 reduc_index, NULL, NULL);
7143 /* Multiple types are not supported for condition. */
7144 break;
7147 /* Handle uses. */
7148 if (j == 0)
7150 if (slp_node)
7152 /* Get vec defs for all the operands except the reduction index,
7153 ensuring the ordering of the ops in the vector is kept. */
7154 auto_vec<tree, 3> slp_ops;
7155 auto_vec<vec<tree>, 3> vec_defs;
7157 slp_ops.quick_push (ops[0]);
7158 slp_ops.quick_push (ops[1]);
7159 if (op_type == ternary_op)
7160 slp_ops.quick_push (ops[2]);
7162 vect_get_slp_defs (slp_ops, slp_node, &vec_defs);
7164 vec_oprnds0.safe_splice (vec_defs[0]);
7165 vec_defs[0].release ();
7166 vec_oprnds1.safe_splice (vec_defs[1]);
7167 vec_defs[1].release ();
7168 if (op_type == ternary_op)
7170 vec_oprnds2.safe_splice (vec_defs[2]);
7171 vec_defs[2].release ();
7174 else
7176 vec_oprnds0.quick_push
7177 (vect_get_vec_def_for_operand (ops[0], stmt));
7178 vec_oprnds1.quick_push
7179 (vect_get_vec_def_for_operand (ops[1], stmt));
7180 if (op_type == ternary_op)
7181 vec_oprnds2.quick_push
7182 (vect_get_vec_def_for_operand (ops[2], stmt));
7185 else
7187 if (!slp_node)
7189 gcc_assert (reduc_index != -1 || ! single_defuse_cycle);
7191 if (single_defuse_cycle && reduc_index == 0)
7192 vec_oprnds0[0] = gimple_get_lhs (new_stmt);
7193 else
7194 vec_oprnds0[0]
7195 = vect_get_vec_def_for_stmt_copy (dts[0], vec_oprnds0[0]);
7196 if (single_defuse_cycle && reduc_index == 1)
7197 vec_oprnds1[0] = gimple_get_lhs (new_stmt);
7198 else
7199 vec_oprnds1[0]
7200 = vect_get_vec_def_for_stmt_copy (dts[1], vec_oprnds1[0]);
7201 if (op_type == ternary_op)
7203 if (single_defuse_cycle && reduc_index == 2)
7204 vec_oprnds2[0] = gimple_get_lhs (new_stmt);
7205 else
7206 vec_oprnds2[0]
7207 = vect_get_vec_def_for_stmt_copy (dts[2], vec_oprnds2[0]);
7212 FOR_EACH_VEC_ELT (vec_oprnds0, i, def0)
7214 tree vop[3] = { def0, vec_oprnds1[i], NULL_TREE };
7215 if (masked_loop_p)
7217 /* Make sure that the reduction accumulator is vop[0]. */
7218 if (reduc_index == 1)
7220 gcc_assert (commutative_tree_code (code));
7221 std::swap (vop[0], vop[1]);
7223 tree mask = vect_get_loop_mask (gsi, masks, vec_num * ncopies,
7224 vectype_in, i * ncopies + j);
7225 gcall *call = gimple_build_call_internal (cond_fn, 3, mask,
7226 vop[0], vop[1]);
7227 new_temp = make_ssa_name (vec_dest, call);
7228 gimple_call_set_lhs (call, new_temp);
7229 gimple_call_set_nothrow (call, true);
7230 new_stmt = call;
7232 else
7234 if (op_type == ternary_op)
7235 vop[2] = vec_oprnds2[i];
7237 new_temp = make_ssa_name (vec_dest, new_stmt);
7238 new_stmt = gimple_build_assign (new_temp, code,
7239 vop[0], vop[1], vop[2]);
7241 vect_finish_stmt_generation (stmt, new_stmt, gsi);
7243 if (slp_node)
7245 SLP_TREE_VEC_STMTS (slp_node).quick_push (new_stmt);
7246 vect_defs.quick_push (new_temp);
7248 else
7249 vect_defs[0] = new_temp;
7252 if (slp_node)
7253 continue;
7255 if (j == 0)
7256 STMT_VINFO_VEC_STMT (stmt_info) = *vec_stmt = new_stmt;
7257 else
7258 STMT_VINFO_RELATED_STMT (prev_stmt_info) = new_stmt;
7260 prev_stmt_info = vinfo_for_stmt (new_stmt);
7263 /* Finalize the reduction-phi (set its arguments) and create the
7264 epilog reduction code. */
7265 if ((!single_defuse_cycle || code == COND_EXPR) && !slp_node)
7266 vect_defs[0] = gimple_get_lhs (*vec_stmt);
7268 vect_create_epilog_for_reduction (vect_defs, stmt, reduc_def_stmt,
7269 epilog_copies, reduc_fn, phis,
7270 double_reduc, slp_node, slp_node_instance,
7271 cond_reduc_val, cond_reduc_op_code,
7272 neutral_op);
7274 return true;
7277 /* Function vect_min_worthwhile_factor.
7279 For a loop where we could vectorize the operation indicated by CODE,
7280 return the minimum vectorization factor that makes it worthwhile
7281 to use generic vectors. */
7282 static unsigned int
7283 vect_min_worthwhile_factor (enum tree_code code)
7285 switch (code)
7287 case PLUS_EXPR:
7288 case MINUS_EXPR:
7289 case NEGATE_EXPR:
7290 return 4;
7292 case BIT_AND_EXPR:
7293 case BIT_IOR_EXPR:
7294 case BIT_XOR_EXPR:
7295 case BIT_NOT_EXPR:
7296 return 2;
7298 default:
7299 return INT_MAX;
7303 /* Return true if VINFO indicates we are doing loop vectorization and if
7304 it is worth decomposing CODE operations into scalar operations for
7305 that loop's vectorization factor. */
7307 bool
7308 vect_worthwhile_without_simd_p (vec_info *vinfo, tree_code code)
7310 loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo);
7311 unsigned HOST_WIDE_INT value;
7312 return (loop_vinfo
7313 && LOOP_VINFO_VECT_FACTOR (loop_vinfo).is_constant (&value)
7314 && value >= vect_min_worthwhile_factor (code));
7317 /* Function vectorizable_induction
7319 Check if PHI performs an induction computation that can be vectorized.
7320 If VEC_STMT is also passed, vectorize the induction PHI: create a vectorized
7321 phi to replace it, put it in VEC_STMT, and add it to the same basic block.
7322 Return FALSE if not a vectorizable STMT, TRUE otherwise. */
7324 bool
7325 vectorizable_induction (gimple *phi,
7326 gimple_stmt_iterator *gsi ATTRIBUTE_UNUSED,
7327 gimple **vec_stmt, slp_tree slp_node,
7328 stmt_vector_for_cost *cost_vec)
7330 stmt_vec_info stmt_info = vinfo_for_stmt (phi);
7331 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
7332 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
7333 unsigned ncopies;
7334 bool nested_in_vect_loop = false;
7335 struct loop *iv_loop;
7336 tree vec_def;
7337 edge pe = loop_preheader_edge (loop);
7338 basic_block new_bb;
7339 tree new_vec, vec_init, vec_step, t;
7340 tree new_name;
7341 gimple *new_stmt;
7342 gphi *induction_phi;
7343 tree induc_def, vec_dest;
7344 tree init_expr, step_expr;
7345 poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
7346 unsigned i;
7347 tree expr;
7348 gimple_seq stmts;
7349 imm_use_iterator imm_iter;
7350 use_operand_p use_p;
7351 gimple *exit_phi;
7352 edge latch_e;
7353 tree loop_arg;
7354 gimple_stmt_iterator si;
7355 basic_block bb = gimple_bb (phi);
7357 if (gimple_code (phi) != GIMPLE_PHI)
7358 return false;
7360 if (!STMT_VINFO_RELEVANT_P (stmt_info))
7361 return false;
7363 /* Make sure it was recognized as induction computation. */
7364 if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_induction_def)
7365 return false;
7367 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
7368 poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (vectype);
7370 if (slp_node)
7371 ncopies = 1;
7372 else
7373 ncopies = vect_get_num_copies (loop_vinfo, vectype);
7374 gcc_assert (ncopies >= 1);
7376 /* FORNOW. These restrictions should be relaxed. */
7377 if (nested_in_vect_loop_p (loop, phi))
7379 imm_use_iterator imm_iter;
7380 use_operand_p use_p;
7381 gimple *exit_phi;
7382 edge latch_e;
7383 tree loop_arg;
7385 if (ncopies > 1)
7387 if (dump_enabled_p ())
7388 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7389 "multiple types in nested loop.\n");
7390 return false;
7393 /* FORNOW: outer loop induction with SLP not supported. */
7394 if (STMT_SLP_TYPE (stmt_info))
7395 return false;
7397 exit_phi = NULL;
7398 latch_e = loop_latch_edge (loop->inner);
7399 loop_arg = PHI_ARG_DEF_FROM_EDGE (phi, latch_e);
7400 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, loop_arg)
7402 gimple *use_stmt = USE_STMT (use_p);
7403 if (is_gimple_debug (use_stmt))
7404 continue;
7406 if (!flow_bb_inside_loop_p (loop->inner, gimple_bb (use_stmt)))
7408 exit_phi = use_stmt;
7409 break;
7412 if (exit_phi)
7414 stmt_vec_info exit_phi_vinfo = vinfo_for_stmt (exit_phi);
7415 if (!(STMT_VINFO_RELEVANT_P (exit_phi_vinfo)
7416 && !STMT_VINFO_LIVE_P (exit_phi_vinfo)))
7418 if (dump_enabled_p ())
7419 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7420 "inner-loop induction only used outside "
7421 "of the outer vectorized loop.\n");
7422 return false;
7426 nested_in_vect_loop = true;
7427 iv_loop = loop->inner;
7429 else
7430 iv_loop = loop;
7431 gcc_assert (iv_loop == (gimple_bb (phi))->loop_father);
7433 if (slp_node && !nunits.is_constant ())
7435 /* The current SLP code creates the initial value element-by-element. */
7436 if (dump_enabled_p ())
7437 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7438 "SLP induction not supported for variable-length"
7439 " vectors.\n");
7440 return false;
7443 if (!vec_stmt) /* transformation not required. */
7445 STMT_VINFO_TYPE (stmt_info) = induc_vec_info_type;
7446 if (dump_enabled_p ())
7447 dump_printf_loc (MSG_NOTE, vect_location,
7448 "=== vectorizable_induction ===\n");
7449 vect_model_induction_cost (stmt_info, ncopies, cost_vec);
7450 return true;
7453 /* Transform. */
7455 /* Compute a vector variable, initialized with the first VF values of
7456 the induction variable. E.g., for an iv with IV_PHI='X' and
7457 evolution S, for a vector of 4 units, we want to compute:
7458 [X, X + S, X + 2*S, X + 3*S]. */
7460 if (dump_enabled_p ())
7461 dump_printf_loc (MSG_NOTE, vect_location, "transform induction phi.\n");
7463 latch_e = loop_latch_edge (iv_loop);
7464 loop_arg = PHI_ARG_DEF_FROM_EDGE (phi, latch_e);
7466 step_expr = STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_info);
7467 gcc_assert (step_expr != NULL_TREE);
7469 pe = loop_preheader_edge (iv_loop);
7470 init_expr = PHI_ARG_DEF_FROM_EDGE (phi,
7471 loop_preheader_edge (iv_loop));
7473 stmts = NULL;
7474 if (!nested_in_vect_loop)
7476 /* Convert the initial value to the desired type. */
7477 tree new_type = TREE_TYPE (vectype);
7478 init_expr = gimple_convert (&stmts, new_type, init_expr);
7480 /* If we are using the loop mask to "peel" for alignment then we need
7481 to adjust the start value here. */
7482 tree skip_niters = LOOP_VINFO_MASK_SKIP_NITERS (loop_vinfo);
7483 if (skip_niters != NULL_TREE)
7485 if (FLOAT_TYPE_P (vectype))
7486 skip_niters = gimple_build (&stmts, FLOAT_EXPR, new_type,
7487 skip_niters);
7488 else
7489 skip_niters = gimple_convert (&stmts, new_type, skip_niters);
7490 tree skip_step = gimple_build (&stmts, MULT_EXPR, new_type,
7491 skip_niters, step_expr);
7492 init_expr = gimple_build (&stmts, MINUS_EXPR, new_type,
7493 init_expr, skip_step);
7497 /* Convert the step to the desired type. */
7498 step_expr = gimple_convert (&stmts, TREE_TYPE (vectype), step_expr);
7500 if (stmts)
7502 new_bb = gsi_insert_seq_on_edge_immediate (pe, stmts);
7503 gcc_assert (!new_bb);
7506 /* Find the first insertion point in the BB. */
7507 si = gsi_after_labels (bb);
7509 /* For SLP induction we have to generate several IVs as for example
7510 with group size 3 we need [i, i, i, i + S] [i + S, i + S, i + 2*S, i + 2*S]
7511 [i + 2*S, i + 3*S, i + 3*S, i + 3*S]. The step is the same uniform
7512 [VF*S, VF*S, VF*S, VF*S] for all. */
7513 if (slp_node)
7515 /* Enforced above. */
7516 unsigned int const_nunits = nunits.to_constant ();
7518 /* Generate [VF*S, VF*S, ... ]. */
7519 if (SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr)))
7521 expr = build_int_cst (integer_type_node, vf);
7522 expr = fold_convert (TREE_TYPE (step_expr), expr);
7524 else
7525 expr = build_int_cst (TREE_TYPE (step_expr), vf);
7526 new_name = fold_build2 (MULT_EXPR, TREE_TYPE (step_expr),
7527 expr, step_expr);
7528 if (! CONSTANT_CLASS_P (new_name))
7529 new_name = vect_init_vector (phi, new_name,
7530 TREE_TYPE (step_expr), NULL);
7531 new_vec = build_vector_from_val (vectype, new_name);
7532 vec_step = vect_init_vector (phi, new_vec, vectype, NULL);
7534 /* Now generate the IVs. */
7535 unsigned group_size = SLP_TREE_SCALAR_STMTS (slp_node).length ();
7536 unsigned nvects = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node);
7537 unsigned elts = const_nunits * nvects;
7538 unsigned nivs = least_common_multiple (group_size,
7539 const_nunits) / const_nunits;
7540 gcc_assert (elts % group_size == 0);
7541 tree elt = init_expr;
7542 unsigned ivn;
7543 for (ivn = 0; ivn < nivs; ++ivn)
7545 tree_vector_builder elts (vectype, const_nunits, 1);
7546 stmts = NULL;
7547 for (unsigned eltn = 0; eltn < const_nunits; ++eltn)
7549 if (ivn*const_nunits + eltn >= group_size
7550 && (ivn * const_nunits + eltn) % group_size == 0)
7551 elt = gimple_build (&stmts, PLUS_EXPR, TREE_TYPE (elt),
7552 elt, step_expr);
7553 elts.quick_push (elt);
7555 vec_init = gimple_build_vector (&stmts, &elts);
7556 if (stmts)
7558 new_bb = gsi_insert_seq_on_edge_immediate (pe, stmts);
7559 gcc_assert (!new_bb);
7562 /* Create the induction-phi that defines the induction-operand. */
7563 vec_dest = vect_get_new_vect_var (vectype, vect_simple_var, "vec_iv_");
7564 induction_phi = create_phi_node (vec_dest, iv_loop->header);
7565 set_vinfo_for_stmt (induction_phi,
7566 new_stmt_vec_info (induction_phi, loop_vinfo));
7567 induc_def = PHI_RESULT (induction_phi);
7569 /* Create the iv update inside the loop */
7570 vec_def = make_ssa_name (vec_dest);
7571 new_stmt = gimple_build_assign (vec_def, PLUS_EXPR, induc_def, vec_step);
7572 gsi_insert_before (&si, new_stmt, GSI_SAME_STMT);
7573 set_vinfo_for_stmt (new_stmt, new_stmt_vec_info (new_stmt, loop_vinfo));
7575 /* Set the arguments of the phi node: */
7576 add_phi_arg (induction_phi, vec_init, pe, UNKNOWN_LOCATION);
7577 add_phi_arg (induction_phi, vec_def, loop_latch_edge (iv_loop),
7578 UNKNOWN_LOCATION);
7580 SLP_TREE_VEC_STMTS (slp_node).quick_push (induction_phi);
7583 /* Re-use IVs when we can. */
7584 if (ivn < nvects)
7586 unsigned vfp
7587 = least_common_multiple (group_size, const_nunits) / group_size;
7588 /* Generate [VF'*S, VF'*S, ... ]. */
7589 if (SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr)))
7591 expr = build_int_cst (integer_type_node, vfp);
7592 expr = fold_convert (TREE_TYPE (step_expr), expr);
7594 else
7595 expr = build_int_cst (TREE_TYPE (step_expr), vfp);
7596 new_name = fold_build2 (MULT_EXPR, TREE_TYPE (step_expr),
7597 expr, step_expr);
7598 if (! CONSTANT_CLASS_P (new_name))
7599 new_name = vect_init_vector (phi, new_name,
7600 TREE_TYPE (step_expr), NULL);
7601 new_vec = build_vector_from_val (vectype, new_name);
7602 vec_step = vect_init_vector (phi, new_vec, vectype, NULL);
7603 for (; ivn < nvects; ++ivn)
7605 gimple *iv = SLP_TREE_VEC_STMTS (slp_node)[ivn - nivs];
7606 tree def;
7607 if (gimple_code (iv) == GIMPLE_PHI)
7608 def = gimple_phi_result (iv);
7609 else
7610 def = gimple_assign_lhs (iv);
7611 new_stmt = gimple_build_assign (make_ssa_name (vectype),
7612 PLUS_EXPR,
7613 def, vec_step);
7614 if (gimple_code (iv) == GIMPLE_PHI)
7615 gsi_insert_before (&si, new_stmt, GSI_SAME_STMT);
7616 else
7618 gimple_stmt_iterator tgsi = gsi_for_stmt (iv);
7619 gsi_insert_after (&tgsi, new_stmt, GSI_CONTINUE_LINKING);
7621 set_vinfo_for_stmt (new_stmt,
7622 new_stmt_vec_info (new_stmt, loop_vinfo));
7623 SLP_TREE_VEC_STMTS (slp_node).quick_push (new_stmt);
7627 return true;
7630 /* Create the vector that holds the initial_value of the induction. */
7631 if (nested_in_vect_loop)
7633 /* iv_loop is nested in the loop to be vectorized. init_expr had already
7634 been created during vectorization of previous stmts. We obtain it
7635 from the STMT_VINFO_VEC_STMT of the defining stmt. */
7636 vec_init = vect_get_vec_def_for_operand (init_expr, phi);
7637 /* If the initial value is not of proper type, convert it. */
7638 if (!useless_type_conversion_p (vectype, TREE_TYPE (vec_init)))
7640 new_stmt
7641 = gimple_build_assign (vect_get_new_ssa_name (vectype,
7642 vect_simple_var,
7643 "vec_iv_"),
7644 VIEW_CONVERT_EXPR,
7645 build1 (VIEW_CONVERT_EXPR, vectype,
7646 vec_init));
7647 vec_init = gimple_assign_lhs (new_stmt);
7648 new_bb = gsi_insert_on_edge_immediate (loop_preheader_edge (iv_loop),
7649 new_stmt);
7650 gcc_assert (!new_bb);
7651 set_vinfo_for_stmt (new_stmt,
7652 new_stmt_vec_info (new_stmt, loop_vinfo));
7655 else
7657 /* iv_loop is the loop to be vectorized. Create:
7658 vec_init = [X, X+S, X+2*S, X+3*S] (S = step_expr, X = init_expr) */
7659 stmts = NULL;
7660 new_name = gimple_convert (&stmts, TREE_TYPE (vectype), init_expr);
7662 unsigned HOST_WIDE_INT const_nunits;
7663 if (nunits.is_constant (&const_nunits))
7665 tree_vector_builder elts (vectype, const_nunits, 1);
7666 elts.quick_push (new_name);
7667 for (i = 1; i < const_nunits; i++)
7669 /* Create: new_name_i = new_name + step_expr */
7670 new_name = gimple_build (&stmts, PLUS_EXPR, TREE_TYPE (new_name),
7671 new_name, step_expr);
7672 elts.quick_push (new_name);
7674 /* Create a vector from [new_name_0, new_name_1, ...,
7675 new_name_nunits-1] */
7676 vec_init = gimple_build_vector (&stmts, &elts);
7678 else if (INTEGRAL_TYPE_P (TREE_TYPE (step_expr)))
7679 /* Build the initial value directly from a VEC_SERIES_EXPR. */
7680 vec_init = gimple_build (&stmts, VEC_SERIES_EXPR, vectype,
7681 new_name, step_expr);
7682 else
7684 /* Build:
7685 [base, base, base, ...]
7686 + (vectype) [0, 1, 2, ...] * [step, step, step, ...]. */
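	  /* For example (illustrative values, not from the original source):
	     with base 1.0, step 0.5 and four lanes this computes
	     [1.0, 1.0, 1.0, 1.0] + [0, 1, 2, 3] * [0.5, 0.5, 0.5, 0.5]
	     = [1.0, 1.5, 2.0, 2.5].  */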
7687 gcc_assert (SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr)));
7688 gcc_assert (flag_associative_math);
7689 tree index = build_index_vector (vectype, 0, 1);
7690 tree base_vec = gimple_build_vector_from_val (&stmts, vectype,
7691 new_name);
7692 tree step_vec = gimple_build_vector_from_val (&stmts, vectype,
7693 step_expr);
7694 vec_init = gimple_build (&stmts, FLOAT_EXPR, vectype, index);
7695 vec_init = gimple_build (&stmts, MULT_EXPR, vectype,
7696 vec_init, step_vec);
7697 vec_init = gimple_build (&stmts, PLUS_EXPR, vectype,
7698 vec_init, base_vec);
7701 if (stmts)
7703 new_bb = gsi_insert_seq_on_edge_immediate (pe, stmts);
7704 gcc_assert (!new_bb);
7709 /* Create the vector that holds the step of the induction. */
7710 if (nested_in_vect_loop)
7711 /* iv_loop is nested in the loop to be vectorized. Generate:
7712 vec_step = [S, S, S, S] */
7713 new_name = step_expr;
7714 else
7716 /* iv_loop is the loop to be vectorized. Generate:
7717 vec_step = [VF*S, VF*S, VF*S, VF*S] */
7718 gimple_seq seq = NULL;
7719 if (SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr)))
7721 expr = build_int_cst (integer_type_node, vf);
7722 expr = gimple_build (&seq, FLOAT_EXPR, TREE_TYPE (step_expr), expr);
7724 else
7725 expr = build_int_cst (TREE_TYPE (step_expr), vf);
7726 new_name = gimple_build (&seq, MULT_EXPR, TREE_TYPE (step_expr),
7727 expr, step_expr);
7728 if (seq)
7730 new_bb = gsi_insert_seq_on_edge_immediate (pe, seq);
7731 gcc_assert (!new_bb);
7735 t = unshare_expr (new_name);
7736 gcc_assert (CONSTANT_CLASS_P (new_name)
7737 || TREE_CODE (new_name) == SSA_NAME);
7738 new_vec = build_vector_from_val (vectype, t);
7739 vec_step = vect_init_vector (phi, new_vec, vectype, NULL);
7742 /* Create the following def-use cycle:
7743 loop prolog:
7744 vec_init = ...
7745 vec_step = ...
7746 loop:
7747 vec_iv = PHI <vec_init, vec_loop>
7749 STMT
7751 vec_loop = vec_iv + vec_step; */
7753 /* Create the induction-phi that defines the induction-operand. */
7754 vec_dest = vect_get_new_vect_var (vectype, vect_simple_var, "vec_iv_");
7755 induction_phi = create_phi_node (vec_dest, iv_loop->header);
7756 set_vinfo_for_stmt (induction_phi,
7757 new_stmt_vec_info (induction_phi, loop_vinfo));
7758 induc_def = PHI_RESULT (induction_phi);
7760 /* Create the iv update inside the loop */
7761 vec_def = make_ssa_name (vec_dest);
7762 new_stmt = gimple_build_assign (vec_def, PLUS_EXPR, induc_def, vec_step);
7763 gsi_insert_before (&si, new_stmt, GSI_SAME_STMT);
7764 set_vinfo_for_stmt (new_stmt, new_stmt_vec_info (new_stmt, loop_vinfo));
7766 /* Set the arguments of the phi node: */
7767 add_phi_arg (induction_phi, vec_init, pe, UNKNOWN_LOCATION);
7768 add_phi_arg (induction_phi, vec_def, loop_latch_edge (iv_loop),
7769 UNKNOWN_LOCATION);
7771 STMT_VINFO_VEC_STMT (stmt_info) = *vec_stmt = induction_phi;
7773 /* In case the vectorization factor (VF) is bigger than the number
7774 of elements that we can fit in a vectype (nunits), we have to generate
7775 more than one vector stmt - i.e - we need to "unroll" the
7776 vector stmt by a factor VF/nunits. For more details see documentation
7777 in vectorizable_operation. */
7779 if (ncopies > 1)
7781 gimple_seq seq = NULL;
7782 stmt_vec_info prev_stmt_vinfo;
7783 /* FORNOW. This restriction should be relaxed. */
7784 gcc_assert (!nested_in_vect_loop);
7786 /* Create the vector that holds the step of the induction. */
7787 if (SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr)))
7789 expr = build_int_cst (integer_type_node, nunits);
7790 expr = gimple_build (&seq, FLOAT_EXPR, TREE_TYPE (step_expr), expr);
7792 else
7793 expr = build_int_cst (TREE_TYPE (step_expr), nunits);
7794 new_name = gimple_build (&seq, MULT_EXPR, TREE_TYPE (step_expr),
7795 expr, step_expr);
7796 if (seq)
7798 new_bb = gsi_insert_seq_on_edge_immediate (pe, seq);
7799 gcc_assert (!new_bb);
7802 t = unshare_expr (new_name);
7803 gcc_assert (CONSTANT_CLASS_P (new_name)
7804 || TREE_CODE (new_name) == SSA_NAME);
7805 new_vec = build_vector_from_val (vectype, t);
7806 vec_step = vect_init_vector (phi, new_vec, vectype, NULL);
7808 vec_def = induc_def;
7809 prev_stmt_vinfo = vinfo_for_stmt (induction_phi);
7810 for (i = 1; i < ncopies; i++)
7812 /* vec_i = vec_prev + vec_step */
7813 new_stmt = gimple_build_assign (vec_dest, PLUS_EXPR,
7814 vec_def, vec_step);
7815 vec_def = make_ssa_name (vec_dest, new_stmt);
7816 gimple_assign_set_lhs (new_stmt, vec_def);
7818 gsi_insert_before (&si, new_stmt, GSI_SAME_STMT);
7819 set_vinfo_for_stmt (new_stmt,
7820 new_stmt_vec_info (new_stmt, loop_vinfo));
7821 STMT_VINFO_RELATED_STMT (prev_stmt_vinfo) = new_stmt;
7822 prev_stmt_vinfo = vinfo_for_stmt (new_stmt);
7826 if (nested_in_vect_loop)
7828 /* Find the loop-closed exit-phi of the induction, and record
7829 the final vector of induction results: */
7830 exit_phi = NULL;
7831 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, loop_arg)
7833 gimple *use_stmt = USE_STMT (use_p);
7834 if (is_gimple_debug (use_stmt))
7835 continue;
7837 if (!flow_bb_inside_loop_p (iv_loop, gimple_bb (use_stmt)))
7839 exit_phi = use_stmt;
7840 break;
7843 if (exit_phi)
7845 stmt_vec_info stmt_vinfo = vinfo_for_stmt (exit_phi);
7846 /* FORNOW. Currently not supporting the case that an inner-loop induction
7847 is not used in the outer-loop (i.e. only outside the outer-loop). */
7848 gcc_assert (STMT_VINFO_RELEVANT_P (stmt_vinfo)
7849 && !STMT_VINFO_LIVE_P (stmt_vinfo));
7851 STMT_VINFO_VEC_STMT (stmt_vinfo) = new_stmt;
7852 if (dump_enabled_p ())
7854 dump_printf_loc (MSG_NOTE, vect_location,
7855 "vector of inductions after inner-loop:");
7856 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, new_stmt, 0);
7862 if (dump_enabled_p ())
7864 dump_printf_loc (MSG_NOTE, vect_location,
7865 "transform induction: created def-use cycle: ");
7866 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, induction_phi, 0);
7867 dump_gimple_stmt (MSG_NOTE, TDF_SLIM,
7868 SSA_NAME_DEF_STMT (vec_def), 0);
7871 return true;
7874 /* Function vectorizable_live_operation.
7876 STMT computes a value that is used outside the loop. Check if
7877 it can be supported. */
7879 bool
7880 vectorizable_live_operation (gimple *stmt,
7881 gimple_stmt_iterator *gsi ATTRIBUTE_UNUSED,
7882 slp_tree slp_node, int slp_index,
7883 gimple **vec_stmt,
7884 stmt_vector_for_cost *)
7886 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
7887 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
7888 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
7889 imm_use_iterator imm_iter;
7890 tree lhs, lhs_type, bitsize, vec_bitsize;
7891 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
7892 poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (vectype);
7893 int ncopies;
7894 gimple *use_stmt;
7895 auto_vec<tree> vec_oprnds;
7896 int vec_entry = 0;
7897 poly_uint64 vec_index = 0;
7899 gcc_assert (STMT_VINFO_LIVE_P (stmt_info));
7901 if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_reduction_def)
7902 return false;
7904 /* FORNOW. CHECKME. */
7905 if (nested_in_vect_loop_p (loop, stmt))
7906 return false;
7908 /* If STMT is not relevant and it is a simple assignment and its inputs are
7909 invariant then it can remain in place, unvectorized. The original last
7910 scalar value that it computes will be used. */
7911 if (!STMT_VINFO_RELEVANT_P (stmt_info))
7913 gcc_assert (is_simple_and_all_uses_invariant (stmt, loop_vinfo));
7914 if (dump_enabled_p ())
7915 dump_printf_loc (MSG_NOTE, vect_location,
7916 "statement is simple and uses invariant. Leaving in "
7917 "place.\n");
7918 return true;
7921 if (slp_node)
7922 ncopies = 1;
7923 else
7924 ncopies = vect_get_num_copies (loop_vinfo, vectype);
7926 if (slp_node)
7928 gcc_assert (slp_index >= 0);
7930 int num_scalar = SLP_TREE_SCALAR_STMTS (slp_node).length ();
7931 int num_vec = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node);
7933 /* Get the last occurrence of the scalar index from the concatenation of
7934 all the slp vectors. Calculate which slp vector it is and the index
7935 within. */
7936 poly_uint64 pos = (num_vec * nunits) - num_scalar + slp_index;
7938 /* Calculate which vector contains the result, and which lane of
7939 that vector we need. */
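      /* Worked example (added; values are illustrative): with num_vec = 2,
	 nunits = 4, num_scalar = 3 and slp_index = 1, pos = 2*4 - 3 + 1 = 6,
	 so the final value lives in vector 1 (6 / 4) at lane 2 (6 % 4).  */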
7940 if (!can_div_trunc_p (pos, nunits, &vec_entry, &vec_index))
7942 if (dump_enabled_p ())
7943 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7944 "Cannot determine which vector holds the"
7945 " final result.\n");
7946 return false;
7950 if (!vec_stmt)
7952 /* No transformation required. */
7953 if (LOOP_VINFO_CAN_FULLY_MASK_P (loop_vinfo))
7955 if (!direct_internal_fn_supported_p (IFN_EXTRACT_LAST, vectype,
7956 OPTIMIZE_FOR_SPEED))
7958 if (dump_enabled_p ())
7959 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7960 "can't use a fully-masked loop because "
7961 "the target doesn't support extract last "
7962 "reduction.\n");
7963 LOOP_VINFO_CAN_FULLY_MASK_P (loop_vinfo) = false;
7965 else if (slp_node)
7967 if (dump_enabled_p ())
7968 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7969 "can't use a fully-masked loop because an "
7970 "SLP statement is live after the loop.\n");
7971 LOOP_VINFO_CAN_FULLY_MASK_P (loop_vinfo) = false;
7973 else if (ncopies > 1)
7975 if (dump_enabled_p ())
7976 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7977 "can't use a fully-masked loop because"
7978 " ncopies is greater than 1.\n");
7979 LOOP_VINFO_CAN_FULLY_MASK_P (loop_vinfo) = false;
7981 else
7983 gcc_assert (ncopies == 1 && !slp_node);
7984 vect_record_loop_mask (loop_vinfo,
7985 &LOOP_VINFO_MASKS (loop_vinfo),
7986 1, vectype);
7989 return true;
7992 /* If stmt has a related stmt, then use that for getting the lhs. */
7993 if (is_pattern_stmt_p (stmt_info))
7994 stmt = STMT_VINFO_RELATED_STMT (stmt_info);
7996 lhs = (is_a <gphi *> (stmt)) ? gimple_phi_result (stmt)
7997 : gimple_get_lhs (stmt);
7998 lhs_type = TREE_TYPE (lhs);
8000 bitsize = (VECTOR_BOOLEAN_TYPE_P (vectype)
8001 ? bitsize_int (TYPE_PRECISION (TREE_TYPE (vectype)))
8002 : TYPE_SIZE (TREE_TYPE (vectype)));
8003 vec_bitsize = TYPE_SIZE (vectype);
8005 /* Get the vectorized lhs of STMT and the lane to use (counted in bits). */
8006 tree vec_lhs, bitstart;
8007 if (slp_node)
8009 gcc_assert (!LOOP_VINFO_FULLY_MASKED_P (loop_vinfo));
8011 /* Get the correct slp vectorized stmt. */
8012 gimple *vec_stmt = SLP_TREE_VEC_STMTS (slp_node)[vec_entry];
8013 if (gphi *phi = dyn_cast <gphi *> (vec_stmt))
8014 vec_lhs = gimple_phi_result (phi);
8015 else
8016 vec_lhs = gimple_get_lhs (vec_stmt);
8018 /* Get entry to use. */
8019 bitstart = bitsize_int (vec_index);
8020 bitstart = int_const_binop (MULT_EXPR, bitsize, bitstart);
8022 else
8024 enum vect_def_type dt = STMT_VINFO_DEF_TYPE (stmt_info);
8025 vec_lhs = vect_get_vec_def_for_operand_1 (stmt, dt);
8026 gcc_checking_assert (ncopies == 1
8027 || !LOOP_VINFO_FULLY_MASKED_P (loop_vinfo));
8029 /* For multiple copies, get the last copy. */
8030 for (int i = 1; i < ncopies; ++i)
8031 vec_lhs = vect_get_vec_def_for_stmt_copy (vect_unknown_def_type,
8032 vec_lhs);
8034 /* Get the last lane in the vector. */
8035 bitstart = int_const_binop (MINUS_EXPR, vec_bitsize, bitsize);
8038 gimple_seq stmts = NULL;
8039 tree new_tree;
8040 if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
8042 /* Emit:
8044 SCALAR_RES = EXTRACT_LAST <VEC_LHS, MASK>
8046 where VEC_LHS is the vectorized live-out result and MASK is
8047 the loop mask for the final iteration. */
8048 gcc_assert (ncopies == 1 && !slp_node);
8049 tree scalar_type = TREE_TYPE (STMT_VINFO_VECTYPE (stmt_info));
8050 tree mask = vect_get_loop_mask (gsi, &LOOP_VINFO_MASKS (loop_vinfo),
8051 1, vectype, 0);
8052 tree scalar_res = gimple_build (&stmts, CFN_EXTRACT_LAST,
8053 scalar_type, mask, vec_lhs);
8055 /* Convert the extracted vector element to the required scalar type. */
8056 new_tree = gimple_convert (&stmts, lhs_type, scalar_res);
8058 else
8060 tree bftype = TREE_TYPE (vectype);
8061 if (VECTOR_BOOLEAN_TYPE_P (vectype))
8062 bftype = build_nonstandard_integer_type (tree_to_uhwi (bitsize), 1);
8063 new_tree = build3 (BIT_FIELD_REF, bftype, vec_lhs, bitsize, bitstart);
8064 new_tree = force_gimple_operand (fold_convert (lhs_type, new_tree),
8065 &stmts, true, NULL_TREE);
8068 if (stmts)
8069 gsi_insert_seq_on_edge_immediate (single_exit (loop), stmts);
8071 /* Replace uses of the lhs with the newly computed result. If the use stmt
8072 is a single-argument PHI, just replace all uses of the PHI result. This is
8073 necessary because the lcssa PHI defining the lhs may appear before the newly inserted stmt. */
8074 use_operand_p use_p;
8075 FOR_EACH_IMM_USE_STMT (use_stmt, imm_iter, lhs)
8076 if (!flow_bb_inside_loop_p (loop, gimple_bb (use_stmt))
8077 && !is_gimple_debug (use_stmt))
8079 if (gimple_code (use_stmt) == GIMPLE_PHI
8080 && gimple_phi_num_args (use_stmt) == 1)
8082 replace_uses_by (gimple_phi_result (use_stmt), new_tree);
8084 else
8086 FOR_EACH_IMM_USE_ON_STMT (use_p, imm_iter)
8087 SET_USE (use_p, new_tree);
8089 update_stmt (use_stmt);
8092 return true;
8095 /* Kill any debug uses outside LOOP of SSA names defined in STMT. */
8097 static void
8098 vect_loop_kill_debug_uses (struct loop *loop, gimple *stmt)
8100 ssa_op_iter op_iter;
8101 imm_use_iterator imm_iter;
8102 def_operand_p def_p;
8103 gimple *ustmt;
8105 FOR_EACH_PHI_OR_STMT_DEF (def_p, stmt, op_iter, SSA_OP_DEF)
8107 FOR_EACH_IMM_USE_STMT (ustmt, imm_iter, DEF_FROM_PTR (def_p))
8109 basic_block bb;
8111 if (!is_gimple_debug (ustmt))
8112 continue;
8114 bb = gimple_bb (ustmt);
8116 if (!flow_bb_inside_loop_p (loop, bb))
8118 if (gimple_debug_bind_p (ustmt))
8120 if (dump_enabled_p ())
8121 dump_printf_loc (MSG_NOTE, vect_location,
8122 "killing debug use\n");
8124 gimple_debug_bind_reset_value (ustmt);
8125 update_stmt (ustmt);
8127 else
8128 gcc_unreachable ();
8134 /* Given loop represented by LOOP_VINFO, return true if computation of
8135 LOOP_VINFO_NITERS (= LOOP_VINFO_NITERSM1 + 1) doesn't overflow, false
8136 otherwise. */
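/* For example (illustrative): if the type of LOOP_VINFO_NITERS is a 32-bit
   unsigned type and the latch count can be 0xffffffff, then adding 1 wraps
   to 0, so the computation of LOOP_VINFO_NITERS would overflow and we must
   return false; any strictly smaller upper bound is fine.  */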
8138 static bool
8139 loop_niters_no_overflow (loop_vec_info loop_vinfo)
8141 /* Constant case. */
8142 if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
8144 tree cst_niters = LOOP_VINFO_NITERS (loop_vinfo);
8145 tree cst_nitersm1 = LOOP_VINFO_NITERSM1 (loop_vinfo);
8147 gcc_assert (TREE_CODE (cst_niters) == INTEGER_CST);
8148 gcc_assert (TREE_CODE (cst_nitersm1) == INTEGER_CST);
8149 if (wi::to_widest (cst_nitersm1) < wi::to_widest (cst_niters))
8150 return true;
8153 widest_int max;
8154 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
8155 /* Check the upper bound of loop niters. */
8156 if (get_max_loop_iterations (loop, &max))
8158 tree type = TREE_TYPE (LOOP_VINFO_NITERS (loop_vinfo));
8159 signop sgn = TYPE_SIGN (type);
8160 widest_int type_max = widest_int::from (wi::max_value (type), sgn);
8161 if (max < type_max)
8162 return true;
8164 return false;
8167 /* Return a mask type with half the number of elements as TYPE. */
8169 tree
8170 vect_halve_mask_nunits (tree type)
8172 poly_uint64 nunits = exact_div (TYPE_VECTOR_SUBPARTS (type), 2);
8173 return build_truth_vector_type (nunits, current_vector_size);
8176 /* Return a mask type with twice as many elements as TYPE. */
8178 tree
8179 vect_double_mask_nunits (tree type)
8181 poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (type) * 2;
8182 return build_truth_vector_type (nunits, current_vector_size);
8185 /* Record that a fully-masked version of LOOP_VINFO would need MASKS to
8186 contain a sequence of NVECTORS masks that each control a vector of type
8187 VECTYPE. */
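/* Worked example (added; values are illustrative): with a vectorization
   factor of 16 and a VECTYPE of 8 elements, NVECTORS == 2 corresponds to
   2 * 8 / 16 = 1 scalar of this type per scalar iteration, while
   NVECTORS == 4 corresponds to 2.  */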
8189 void
8190 vect_record_loop_mask (loop_vec_info loop_vinfo, vec_loop_masks *masks,
8191 unsigned int nvectors, tree vectype)
8193 gcc_assert (nvectors != 0);
8194 if (masks->length () < nvectors)
8195 masks->safe_grow_cleared (nvectors);
8196 rgroup_masks *rgm = &(*masks)[nvectors - 1];
8197 /* The number of scalars per iteration and the number of vectors are
8198 both compile-time constants. */
8199 unsigned int nscalars_per_iter
8200 = exact_div (nvectors * TYPE_VECTOR_SUBPARTS (vectype),
8201 LOOP_VINFO_VECT_FACTOR (loop_vinfo)).to_constant ();
8202 if (rgm->max_nscalars_per_iter < nscalars_per_iter)
8204 rgm->max_nscalars_per_iter = nscalars_per_iter;
8205 rgm->mask_type = build_same_sized_truth_vector_type (vectype);
8209 /* Given a complete set of masks MASKS, extract mask number INDEX
8210 for an rgroup that operates on NVECTORS vectors of type VECTYPE,
8211 where 0 <= INDEX < NVECTORS. Insert any set-up statements before GSI.
8213 See the comment above vec_loop_masks for more details about the mask
8214 arrangement. */
8216 tree
8217 vect_get_loop_mask (gimple_stmt_iterator *gsi, vec_loop_masks *masks,
8218 unsigned int nvectors, tree vectype, unsigned int index)
8220 rgroup_masks *rgm = &(*masks)[nvectors - 1];
8221 tree mask_type = rgm->mask_type;
8223 /* Populate the rgroup's mask array, if this is the first time we've
8224 used it. */
8225 if (rgm->masks.is_empty ())
8227 rgm->masks.safe_grow_cleared (nvectors);
8228 for (unsigned int i = 0; i < nvectors; ++i)
8230 tree mask = make_temp_ssa_name (mask_type, NULL, "loop_mask");
8231 /* Provide a dummy definition until the real one is available. */
8232 SSA_NAME_DEF_STMT (mask) = gimple_build_nop ();
8233 rgm->masks[i] = mask;
8237 tree mask = rgm->masks[index];
8238 if (maybe_ne (TYPE_VECTOR_SUBPARTS (mask_type),
8239 TYPE_VECTOR_SUBPARTS (vectype)))
8241 /* A loop mask for data type X can be reused for data type Y
8242 if X has N times more elements than Y and if Y's elements
8243 are N times bigger than X's. In this case each sequence
8244 of N elements in the loop mask will be all-zero or all-one.
8245 We can then view-convert the mask so that each sequence of
8246 N elements is replaced by a single element. */
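      /* Concrete illustration (added; not part of the original source):
	 a mask built for 16 one-byte elements can control a vector of
	 8 two-byte elements (N = 2); each adjacent pair of mask elements
	 is known to be all-zero or all-one, so the VIEW_CONVERT below
	 merges every pair into a single element of the wider mask type.  */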
8247 gcc_assert (multiple_p (TYPE_VECTOR_SUBPARTS (mask_type),
8248 TYPE_VECTOR_SUBPARTS (vectype)));
8249 gimple_seq seq = NULL;
8250 mask_type = build_same_sized_truth_vector_type (vectype);
8251 mask = gimple_build (&seq, VIEW_CONVERT_EXPR, mask_type, mask);
8252 if (seq)
8253 gsi_insert_seq_before (gsi, seq, GSI_SAME_STMT);
8255 return mask;
8258 /* Scale profiling counters by estimation for LOOP which is vectorized
8259 by factor VF. */
8261 static void
8262 scale_profile_for_vect_loop (struct loop *loop, unsigned vf)
8264 edge preheader = loop_preheader_edge (loop);
8265 /* Reduce loop iterations by the vectorization factor. */
8266 gcov_type new_est_niter = niter_for_unrolled_loop (loop, vf);
8267 profile_count freq_h = loop->header->count, freq_e = preheader->count ();
8269 if (freq_h.nonzero_p ())
8271 profile_probability p;
8273 /* Avoid dropping loop body profile counter to 0 because of zero count
8274 in loop's preheader. */
8275 if (!(freq_e == profile_count::zero ()))
8276 freq_e = freq_e.force_nonzero ();
8277 p = freq_e.apply_scale (new_est_niter + 1, 1).probability_in (freq_h);
8278 scale_loop_frequencies (loop, p);
8281 edge exit_e = single_exit (loop);
8282 exit_e->probability = profile_probability::always ()
8283 .apply_scale (1, new_est_niter + 1);
8285 edge exit_l = single_pred_edge (loop->latch);
8286 profile_probability prob = exit_l->probability;
8287 exit_l->probability = exit_e->probability.invert ();
8288 if (prob.initialized_p () && exit_l->probability.initialized_p ())
8289 scale_bbs_frequencies (&loop->latch, 1, exit_l->probability / prob);
8292 /* Function vect_transform_loop.
8294 The analysis phase has determined that the loop is vectorizable.
8295 Vectorize the loop - create vectorized stmts to replace the scalar
8296 stmts in the loop, and update the loop exit condition.
8297 Returns scalar epilogue loop if any. */
8299 struct loop *
8300 vect_transform_loop (loop_vec_info loop_vinfo)
8302 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
8303 struct loop *epilogue = NULL;
8304 basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
8305 int nbbs = loop->num_nodes;
8306 int i;
8307 tree niters_vector = NULL_TREE;
8308 tree step_vector = NULL_TREE;
8309 tree niters_vector_mult_vf = NULL_TREE;
8310 poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
8311 unsigned int lowest_vf = constant_lower_bound (vf);
8312 bool grouped_store;
8313 bool slp_scheduled = false;
8314 gimple *stmt, *pattern_stmt;
8315 gimple_seq pattern_def_seq = NULL;
8316 gimple_stmt_iterator pattern_def_si = gsi_none ();
8317 bool transform_pattern_stmt = false;
8318 bool check_profitability = false;
8319 unsigned int th;
8321 if (dump_enabled_p ())
8322 dump_printf_loc (MSG_NOTE, vect_location, "=== vec_transform_loop ===\n");
8324 /* Use the more conservative vectorization threshold. If the number
8325 of iterations is constant assume the cost check has been performed
8326 by our caller. If the threshold makes all loops profitable that
8327 run at least the (estimated) vectorization factor number of times
8328 checking is pointless, too. */
8329 th = LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo);
8330 if (th >= vect_vf_for_cost (loop_vinfo)
8331 && !LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
8333 if (dump_enabled_p ())
8334 dump_printf_loc (MSG_NOTE, vect_location,
8335 "Profitability threshold is %d loop iterations.\n",
8336 th);
8337 check_profitability = true;
8340 /* Make sure there exists a single-predecessor exit bb. Do this before
8341 versioning. */
8342 edge e = single_exit (loop);
8343 if (! single_pred_p (e->dest))
8345 split_loop_exit_edge (e);
8346 if (dump_enabled_p ())
8347 dump_printf (MSG_NOTE, "split exit edge\n");
8350 /* Version the loop first, if required, so the profitability check
8351 comes first. */
8353 if (LOOP_REQUIRES_VERSIONING (loop_vinfo))
8355 poly_uint64 versioning_threshold
8356 = LOOP_VINFO_VERSIONING_THRESHOLD (loop_vinfo);
8357 if (check_profitability
8358 && ordered_p (poly_uint64 (th), versioning_threshold))
8360 versioning_threshold = ordered_max (poly_uint64 (th),
8361 versioning_threshold);
8362 check_profitability = false;
8364 vect_loop_versioning (loop_vinfo, th, check_profitability,
8365 versioning_threshold);
8366 check_profitability = false;
8369 /* Make sure there exists a single-predecessor exit bb also on the
8370 scalar loop copy. Do this after versioning but before peeling
8371 so CFG structure is fine for both scalar and if-converted loop
8372 to make slpeel_duplicate_current_defs_from_edges face matched
8373 loop closed PHI nodes on the exit. */
8374 if (LOOP_VINFO_SCALAR_LOOP (loop_vinfo))
8376 e = single_exit (LOOP_VINFO_SCALAR_LOOP (loop_vinfo));
8377 if (! single_pred_p (e->dest))
8379 split_loop_exit_edge (e);
8380 if (dump_enabled_p ())
8381 dump_printf (MSG_NOTE, "split exit edge of scalar loop\n");
8385 tree niters = vect_build_loop_niters (loop_vinfo);
8386 LOOP_VINFO_NITERS_UNCHANGED (loop_vinfo) = niters;
8387 tree nitersm1 = unshare_expr (LOOP_VINFO_NITERSM1 (loop_vinfo));
8388 bool niters_no_overflow = loop_niters_no_overflow (loop_vinfo);
8389 epilogue = vect_do_peeling (loop_vinfo, niters, nitersm1, &niters_vector,
8390 &step_vector, &niters_vector_mult_vf, th,
8391 check_profitability, niters_no_overflow);
8393 if (niters_vector == NULL_TREE)
8395 if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
8396 && !LOOP_VINFO_FULLY_MASKED_P (loop_vinfo)
8397 && known_eq (lowest_vf, vf))
8399 niters_vector
8400 = build_int_cst (TREE_TYPE (LOOP_VINFO_NITERS (loop_vinfo)),
8401 LOOP_VINFO_INT_NITERS (loop_vinfo) / lowest_vf);
8402 step_vector = build_one_cst (TREE_TYPE (niters));
8404 else
8405 vect_gen_vector_loop_niters (loop_vinfo, niters, &niters_vector,
8406 &step_vector, niters_no_overflow);
8409 /* 1) Make sure the loop header has exactly two entries
8410 2) Make sure we have a preheader basic block. */
8412 gcc_assert (EDGE_COUNT (loop->header->preds) == 2);
8414 split_edge (loop_preheader_edge (loop));
8416 if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo)
8417 && vect_use_loop_mask_for_alignment_p (loop_vinfo))
8418 /* This will deal with any possible peeling. */
8419 vect_prepare_for_masked_peels (loop_vinfo);
8421 /* FORNOW: the vectorizer supports only loops whose body consists
8422 of one basic block (header + empty latch). When the vectorizer
8423 supports more involved loop forms, the order in which the BBs are
8424 traversed will need to be reconsidered. */
8426 for (i = 0; i < nbbs; i++)
8428 basic_block bb = bbs[i];
8429 stmt_vec_info stmt_info;
8431 for (gphi_iterator si = gsi_start_phis (bb); !gsi_end_p (si);
8432 gsi_next (&si))
8434 gphi *phi = si.phi ();
8435 if (dump_enabled_p ())
8437 dump_printf_loc (MSG_NOTE, vect_location,
8438 "------>vectorizing phi: ");
8439 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, phi, 0);
8441 stmt_info = vinfo_for_stmt (phi);
8442 if (!stmt_info)
8443 continue;
8445 if (MAY_HAVE_DEBUG_BIND_STMTS && !STMT_VINFO_LIVE_P (stmt_info))
8446 vect_loop_kill_debug_uses (loop, phi);
8448 if (!STMT_VINFO_RELEVANT_P (stmt_info)
8449 && !STMT_VINFO_LIVE_P (stmt_info))
8450 continue;
8452 if (STMT_VINFO_VECTYPE (stmt_info)
8453 && (maybe_ne
8454 (TYPE_VECTOR_SUBPARTS (STMT_VINFO_VECTYPE (stmt_info)), vf))
8455 && dump_enabled_p ())
8456 dump_printf_loc (MSG_NOTE, vect_location, "multiple-types.\n");
8458 if ((STMT_VINFO_DEF_TYPE (stmt_info) == vect_induction_def
8459 || STMT_VINFO_DEF_TYPE (stmt_info) == vect_reduction_def
8460 || STMT_VINFO_DEF_TYPE (stmt_info) == vect_nested_cycle)
8461 && ! PURE_SLP_STMT (stmt_info))
8463 if (dump_enabled_p ())
8464 dump_printf_loc (MSG_NOTE, vect_location, "transform phi.\n");
8465 vect_transform_stmt (phi, NULL, NULL, NULL, NULL);
8469 pattern_stmt = NULL;
8470 for (gimple_stmt_iterator si = gsi_start_bb (bb);
8471 !gsi_end_p (si) || transform_pattern_stmt;)
8473 bool is_store;
8475 if (transform_pattern_stmt)
8476 stmt = pattern_stmt;
8477 else
8479 stmt = gsi_stmt (si);
8480 /* During vectorization remove existing clobber stmts. */
8481 if (gimple_clobber_p (stmt))
8483 unlink_stmt_vdef (stmt);
8484 gsi_remove (&si, true);
8485 release_defs (stmt);
8486 continue;
8490 if (dump_enabled_p ())
8492 dump_printf_loc (MSG_NOTE, vect_location,
8493 "------>vectorizing statement: ");
8494 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, stmt, 0);
8497 stmt_info = vinfo_for_stmt (stmt);
8499 /* vector stmts created in the outer-loop during vectorization of
8500 stmts in an inner-loop may not have a stmt_info, and do not
8501 need to be vectorized. */
8502 if (!stmt_info)
8504 gsi_next (&si);
8505 continue;
8508 if (MAY_HAVE_DEBUG_BIND_STMTS && !STMT_VINFO_LIVE_P (stmt_info))
8509 vect_loop_kill_debug_uses (loop, stmt);
8511 if (!STMT_VINFO_RELEVANT_P (stmt_info)
8512 && !STMT_VINFO_LIVE_P (stmt_info))
8514 if (STMT_VINFO_IN_PATTERN_P (stmt_info)
8515 && (pattern_stmt = STMT_VINFO_RELATED_STMT (stmt_info))
8516 && (STMT_VINFO_RELEVANT_P (vinfo_for_stmt (pattern_stmt))
8517 || STMT_VINFO_LIVE_P (vinfo_for_stmt (pattern_stmt))))
8519 stmt = pattern_stmt;
8520 stmt_info = vinfo_for_stmt (stmt);
8522 else
8524 gsi_next (&si);
8525 continue;
8528 else if (STMT_VINFO_IN_PATTERN_P (stmt_info)
8529 && (pattern_stmt = STMT_VINFO_RELATED_STMT (stmt_info))
8530 && (STMT_VINFO_RELEVANT_P (vinfo_for_stmt (pattern_stmt))
8531 || STMT_VINFO_LIVE_P (vinfo_for_stmt (pattern_stmt))))
8532 transform_pattern_stmt = true;
8534 /* If pattern statement has def stmts, vectorize them too. */
8535 if (is_pattern_stmt_p (stmt_info))
8537 if (pattern_def_seq == NULL)
8539 pattern_def_seq = STMT_VINFO_PATTERN_DEF_SEQ (stmt_info);
8540 pattern_def_si = gsi_start (pattern_def_seq);
8542 else if (!gsi_end_p (pattern_def_si))
8543 gsi_next (&pattern_def_si);
8544 if (pattern_def_seq != NULL)
8546 gimple *pattern_def_stmt = NULL;
8547 stmt_vec_info pattern_def_stmt_info = NULL;
8549 while (!gsi_end_p (pattern_def_si))
8551 pattern_def_stmt = gsi_stmt (pattern_def_si);
8552 pattern_def_stmt_info
8553 = vinfo_for_stmt (pattern_def_stmt);
8554 if (STMT_VINFO_RELEVANT_P (pattern_def_stmt_info)
8555 || STMT_VINFO_LIVE_P (pattern_def_stmt_info))
8556 break;
8557 gsi_next (&pattern_def_si);
8560 if (!gsi_end_p (pattern_def_si))
8562 if (dump_enabled_p ())
8564 dump_printf_loc (MSG_NOTE, vect_location,
8565 "==> vectorizing pattern def "
8566 "stmt: ");
8567 dump_gimple_stmt (MSG_NOTE, TDF_SLIM,
8568 pattern_def_stmt, 0);
8571 stmt = pattern_def_stmt;
8572 stmt_info = pattern_def_stmt_info;
8574 else
8576 pattern_def_si = gsi_none ();
8577 transform_pattern_stmt = false;
8580 else
8581 transform_pattern_stmt = false;
8584 if (STMT_VINFO_VECTYPE (stmt_info))
8586 poly_uint64 nunits
8587 = TYPE_VECTOR_SUBPARTS (STMT_VINFO_VECTYPE (stmt_info));
8588 if (!STMT_SLP_TYPE (stmt_info)
8589 && maybe_ne (nunits, vf)
8590 && dump_enabled_p ())
8591 /* For SLP, VF is set according to the unrolling factor rather than
8592 the vector size, hence this diagnostic is not valid for SLP. */
8593 dump_printf_loc (MSG_NOTE, vect_location, "multiple-types.\n");
8596 /* SLP. Schedule all the SLP instances when the first SLP stmt is
8597 reached. */
8598 if (STMT_SLP_TYPE (stmt_info))
8600 if (!slp_scheduled)
8602 slp_scheduled = true;
8604 if (dump_enabled_p ())
8605 dump_printf_loc (MSG_NOTE, vect_location,
8606 "=== scheduling SLP instances ===\n");
8608 vect_schedule_slp (loop_vinfo);
8611 /* Hybrid SLP stmts must be vectorized in addition to SLP. */
8612 if (!vinfo_for_stmt (stmt) || PURE_SLP_STMT (stmt_info))
8614 if (!transform_pattern_stmt && gsi_end_p (pattern_def_si))
8616 pattern_def_seq = NULL;
8617 gsi_next (&si);
8619 continue;
8623 /* -------- vectorize statement ------------ */
8624 if (dump_enabled_p ())
8625 dump_printf_loc (MSG_NOTE, vect_location, "transform statement.\n");
8627 grouped_store = false;
8628 is_store = vect_transform_stmt (stmt, &si, &grouped_store, NULL, NULL);
8629 if (is_store)
8631 if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
8633 /* Interleaving. If IS_STORE is TRUE, the vectorization of the
8634 interleaving chain was completed - free all the stores in
8635 the chain. */
8636 gsi_next (&si);
8637 vect_remove_stores (GROUP_FIRST_ELEMENT (stmt_info));
8639 else
8641 /* Free the attached stmt_vec_info and remove the stmt. */
8642 gimple *store = gsi_stmt (si);
8643 free_stmt_vec_info (store);
8644 unlink_stmt_vdef (store);
8645 gsi_remove (&si, true);
8646 release_defs (store);
8649 /* Stores can only appear at the end of pattern statements. */
8650 gcc_assert (!transform_pattern_stmt);
8651 pattern_def_seq = NULL;
8653 else if (!transform_pattern_stmt && gsi_end_p (pattern_def_si))
8655 pattern_def_seq = NULL;
8656 gsi_next (&si);
8658 } /* stmts in BB */
8660 /* Stub out scalar statements that must not survive vectorization.
8661 Doing this here helps with grouped statements, or statements that
8662 are involved in patterns. */
8663 for (gimple_stmt_iterator gsi = gsi_start_bb (bb);
8664 !gsi_end_p (gsi); gsi_next (&gsi))
8666 gcall *call = dyn_cast <gcall *> (gsi_stmt (gsi));
8667 if (call && gimple_call_internal_p (call, IFN_MASK_LOAD))
8669 tree lhs = gimple_get_lhs (call);
8670 if (!VECTOR_TYPE_P (TREE_TYPE (lhs)))
8672 tree zero = build_zero_cst (TREE_TYPE (lhs));
8673 gimple *new_stmt = gimple_build_assign (lhs, zero);
8674 gsi_replace (&gsi, new_stmt, true);
8678 } /* BBs in loop */
8680 /* The vectorization factor is always > 1, so if we use an IV increment of 1,
8681 a zero NITERS becomes a nonzero NITERS_VECTOR. */
8682 if (integer_onep (step_vector))
8683 niters_no_overflow = true;
8684 vect_set_loop_condition (loop, loop_vinfo, niters_vector, step_vector,
8685 niters_vector_mult_vf, !niters_no_overflow);
8687 unsigned int assumed_vf = vect_vf_for_cost (loop_vinfo);
8688 scale_profile_for_vect_loop (loop, assumed_vf);
8690 /* True if the final iteration might not handle a full vector's
8691 worth of scalar iterations. */
8692 bool final_iter_may_be_partial = LOOP_VINFO_FULLY_MASKED_P (loop_vinfo);
8693 /* The minimum number of iterations performed by the epilogue. This
8694 is 1 when peeling for gaps because we always need a final scalar
8695 iteration. */
8696 int min_epilogue_iters = LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo) ? 1 : 0;
8697 /* +1 to convert latch counts to loop iteration counts,
8698 -min_epilogue_iters to remove iterations that cannot be performed
8699 by the vector code. */
8700 int bias_for_lowest = 1 - min_epilogue_iters;
8701 int bias_for_assumed = bias_for_lowest;
8702 int alignment_npeels = LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo);
8703 if (alignment_npeels && LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
8705 /* When the amount of peeling is known at compile time, the first
8706 iteration will have exactly alignment_npeels active elements.
8707 In the worst case it will have at least one. */
8708 int min_first_active = (alignment_npeels > 0 ? alignment_npeels : 1);
8709 bias_for_lowest += lowest_vf - min_first_active;
8710 bias_for_assumed += assumed_vf - min_first_active;
8712 /* In these calculations the "- 1" converts loop iteration counts
8713 back to latch counts. */
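  /* Worked example (added; values are illustrative): with a scalar latch
     count bound of 99 (100 iterations), LOWEST_VF of 4, no peeling for gaps
     and no full masking, BIAS_FOR_LOWEST is 1 and the vector loop's bound
     becomes (99 + 1) / 4 - 1 = 24 latch iterations, i.e. 25 vector
     iterations.  */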
8714 if (loop->any_upper_bound)
8715 loop->nb_iterations_upper_bound
8716 = (final_iter_may_be_partial
8717 ? wi::udiv_ceil (loop->nb_iterations_upper_bound + bias_for_lowest,
8718 lowest_vf) - 1
8719 : wi::udiv_floor (loop->nb_iterations_upper_bound + bias_for_lowest,
8720 lowest_vf) - 1);
8721 if (loop->any_likely_upper_bound)
8722 loop->nb_iterations_likely_upper_bound
8723 = (final_iter_may_be_partial
8724 ? wi::udiv_ceil (loop->nb_iterations_likely_upper_bound
8725 + bias_for_lowest, lowest_vf) - 1
8726 : wi::udiv_floor (loop->nb_iterations_likely_upper_bound
8727 + bias_for_lowest, lowest_vf) - 1);
8728 if (loop->any_estimate)
8729 loop->nb_iterations_estimate
8730 = (final_iter_may_be_partial
8731 ? wi::udiv_ceil (loop->nb_iterations_estimate + bias_for_assumed,
8732 assumed_vf) - 1
8733 : wi::udiv_floor (loop->nb_iterations_estimate + bias_for_assumed,
8734 assumed_vf) - 1);
8736 if (dump_enabled_p ())
8738 if (!LOOP_VINFO_EPILOGUE_P (loop_vinfo))
8740 dump_printf_loc (MSG_NOTE, vect_location,
8741 "LOOP VECTORIZED\n");
8742 if (loop->inner)
8743 dump_printf_loc (MSG_NOTE, vect_location,
8744 "OUTER LOOP VECTORIZED\n");
8745 dump_printf (MSG_NOTE, "\n");
8747 else
8749 dump_printf_loc (MSG_NOTE, vect_location,
8750 "LOOP EPILOGUE VECTORIZED (VS=");
8751 dump_dec (MSG_NOTE, current_vector_size);
8752 dump_printf (MSG_NOTE, ")\n");
8756 /* Free SLP instances here because otherwise stmt reference counting
8757 won't work. */
8758 slp_instance instance;
8759 FOR_EACH_VEC_ELT (LOOP_VINFO_SLP_INSTANCES (loop_vinfo), i, instance)
8760 vect_free_slp_instance (instance);
8761 LOOP_VINFO_SLP_INSTANCES (loop_vinfo).release ();
8762 /* Clear the safelen field since its value is invalid after vectorization:
8763 the vectorized loop can have loop-carried dependencies. */
8764 loop->safelen = 0;
8766 /* Don't vectorize the epilogue of an epilogue loop. */
8767 if (LOOP_VINFO_EPILOGUE_P (loop_vinfo))
8768 epilogue = NULL;
8770 if (!PARAM_VALUE (PARAM_VECT_EPILOGUES_NOMASK))
8771 epilogue = NULL;
8773 if (epilogue)
8774 {
8775 auto_vector_sizes vector_sizes;
8776 targetm.vectorize.autovectorize_vector_sizes (&vector_sizes);
8777 unsigned int next_size = 0;
8779 if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
8780 && LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) >= 0
8781 && known_eq (vf, lowest_vf))
8782 {
8783 unsigned int eiters
8784 = (LOOP_VINFO_INT_NITERS (loop_vinfo)
8785 - LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo));
8786 eiters = eiters % lowest_vf;
8787 epilogue->nb_iterations_upper_bound = eiters - 1;
8789 unsigned int ratio;
8790 while (next_size < vector_sizes.length ()
8791 && !(constant_multiple_p (current_vector_size,
8792 vector_sizes[next_size], &ratio)
8793 && eiters >= lowest_vf / ratio))
8794 next_size += 1;
8795 }
8796 else
8797 while (next_size < vector_sizes.length ()
8798 && maybe_lt (current_vector_size, vector_sizes[next_size]))
8799 next_size += 1;
8801 if (next_size == vector_sizes.length ())
8802 epilogue = NULL;
8803 }
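/* Illustrative example of the size selection above (assumed values, not
   from any particular target): with current_vector_size == 32 bytes,
   vector_sizes == { 32, 16 }, lowest_vf == 8 and eiters == 5, size 32 is
   rejected (ratio == 1 and 5 < 8 / 1) but size 16 qualifies (ratio == 2
   and 5 >= 8 / 2), so next_size == 1 and the epilogue loop is kept.
   With eiters == 3 no size qualifies, next_size reaches
   vector_sizes.length () and the epilogue is dropped.  */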
8805 if (epilogue)
8806 {
8807 epilogue->force_vectorize = loop->force_vectorize;
8808 epilogue->safelen = loop->safelen;
8809 epilogue->dont_vectorize = false;
8811 /* We may need to if-convert epilogue to vectorize it. */
8812 if (LOOP_VINFO_SCALAR_LOOP (loop_vinfo))
8813 tree_if_conversion (epilogue);
8814 }
8816 return epilogue;
8817 }
8819 /* The code below performs a simple optimization: it reverts if-conversion
8820 for masked stores, i.e. if the mask of a store is all-zero the store is
8821 not performed, and, where possible, neither are the statements that
8822 produce the stored values. For example,
8823 for (i=0; i<n; i++)
8824 if (c[i])
8825 {
8826 p1[i] += 1;
8827 p2[i] = p3[i] + 2;
8828 }
8829 this transformation will produce the following semi-hammock:
8831 if (mask__ifc__42.18_165 != { 0, 0, 0, 0, 0, 0, 0, 0 })
8832 {
8833 vect__11.19_170 = MASK_LOAD (vectp_p1.20_168, 0B, mask__ifc__42.18_165);
8834 vect__12.22_172 = vect__11.19_170 + vect_cst__171;
8835 MASK_STORE (vectp_p1.23_175, 0B, mask__ifc__42.18_165, vect__12.22_172);
8836 vect__18.25_182 = MASK_LOAD (vectp_p3.26_180, 0B, mask__ifc__42.18_165);
8837 vect__19.28_184 = vect__18.25_182 + vect_cst__183;
8838 MASK_STORE (vectp_p2.29_187, 0B, mask__ifc__42.18_165, vect__19.28_184);
8839 }
8840 */
8842 void
8843 optimize_mask_stores (struct loop *loop)
8844 {
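/* Overview (informal summary of the code below): first collect every
   IFN_MASK_STORE call in LOOP into a worklist.  Then, for each masked
   store, split its block after the store, create a new STORE_BB that is
   entered only when the mask has a nonzero element, and sink the store
   into it, followed by any producers of the stored values whose results
   are used nowhere else; virtual operands are fixed up with a new PHI in
   the join block.  */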
8845 basic_block *bbs = get_loop_body (loop);
8846 unsigned nbbs = loop->num_nodes;
8847 unsigned i;
8848 basic_block bb;
8849 struct loop *bb_loop;
8850 gimple_stmt_iterator gsi;
8851 gimple *stmt;
8852 auto_vec<gimple *> worklist;
8854 vect_location = find_loop_location (loop);
8855 /* Pick up all masked stores in loop if any. */
8856 for (i = 0; i < nbbs; i++)
8857 {
8858 bb = bbs[i];
8859 for (gsi = gsi_start_bb (bb); !gsi_end_p (gsi);
8860 gsi_next (&gsi))
8861 {
8862 stmt = gsi_stmt (gsi);
8863 if (gimple_call_internal_p (stmt, IFN_MASK_STORE))
8864 worklist.safe_push (stmt);
8865 }
8866 }
8868 free (bbs);
8869 if (worklist.is_empty ())
8870 return;
8872 /* Loop has masked stores. */
8873 while (!worklist.is_empty ())
8874 {
8875 gimple *last, *last_store;
8876 edge e, efalse;
8877 tree mask;
8878 basic_block store_bb, join_bb;
8879 gimple_stmt_iterator gsi_to;
8880 tree vdef, new_vdef;
8881 gphi *phi;
8882 tree vectype;
8883 tree zero;
8885 last = worklist.pop ();
8886 mask = gimple_call_arg (last, 2);
8887 bb = gimple_bb (last);
8888 /* Create STORE_BB and the if-then structure in the CFG; STORE_BB
8889 belongs to the same loop as BB. That loop can be different from
8890 LOOP when a two-level loop nest is vectorized and the masked store
8891 belongs to the inner loop. */
8892 e = split_block (bb, last);
8893 bb_loop = bb->loop_father;
8894 gcc_assert (loop == bb_loop || flow_loop_nested_p (loop, bb_loop));
8895 join_bb = e->dest;
8896 store_bb = create_empty_bb (bb);
8897 add_bb_to_loop (store_bb, bb_loop);
8898 e->flags = EDGE_TRUE_VALUE;
8899 efalse = make_edge (bb, store_bb, EDGE_FALSE_VALUE);
8900 /* The branch into STORE_BB is predicted not taken. */
8901 efalse->probability = profile_probability::unlikely ();
8902 store_bb->count = efalse->count ();
8903 make_single_succ_edge (store_bb, join_bb, EDGE_FALLTHRU);
8904 if (dom_info_available_p (CDI_DOMINATORS))
8905 set_immediate_dominator (CDI_DOMINATORS, store_bb, bb);
8906 if (dump_enabled_p ())
8907 dump_printf_loc (MSG_NOTE, vect_location,
8908 "Create new block %d to sink mask stores.",
8909 store_bb->index);
8910 /* Create vector comparison with boolean result. */
8911 vectype = TREE_TYPE (mask);
8912 zero = build_zero_cst (vectype);
8913 stmt = gimple_build_cond (EQ_EXPR, mask, zero, NULL_TREE, NULL_TREE);
8914 gsi = gsi_last_bb (bb);
8915 gsi_insert_after (&gsi, stmt, GSI_SAME_STMT);
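/* Illustrative sketch of the CFG created above (block names as in this
   function):

      BB:  ...  if (mask == { 0, ..., 0 })
       | true (all-zero mask)      \ false (some element set)
       |                            v
       |                        STORE_BB   (masked stores will be sunk here)
       |                            |
       v                            v
      JOIN_BB:  code that followed the masked store  */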
8916 /* Create new PHI node for vdef of the last masked store:
8917 .MEM_2 = VDEF <.MEM_1>
8918 will be converted to
8919 .MEM_3 = VDEF <.MEM_1>
8920 and new PHI node will be created in join bb
8921 .MEM_2 = PHI <.MEM_1, .MEM_3>
8922 */
8923 vdef = gimple_vdef (last);
8924 new_vdef = make_ssa_name (gimple_vop (cfun), last);
8925 gimple_set_vdef (last, new_vdef);
8926 phi = create_phi_node (vdef, join_bb);
8927 add_phi_arg (phi, new_vdef, EDGE_SUCC (store_bb, 0), UNKNOWN_LOCATION);
8929 /* Put all masked stores with the same mask to STORE_BB if possible. */
8930 while (true)
8931 {
8932 gimple_stmt_iterator gsi_from;
8933 gimple *stmt1 = NULL;
8935 /* Move masked store to STORE_BB. */
8936 last_store = last;
8937 gsi = gsi_for_stmt (last);
8938 gsi_from = gsi;
8939 /* Shift GSI to the previous stmt for further traversal. */
8940 gsi_prev (&gsi);
8941 gsi_to = gsi_start_bb (store_bb);
8942 gsi_move_before (&gsi_from, &gsi_to);
8944 /* Set GSI_TO to the start of the now non-empty STORE_BB. */
8944 gsi_to = gsi_start_bb (store_bb);
8945 if (dump_enabled_p ())
8946 {
8947 dump_printf_loc (MSG_NOTE, vect_location,
8948 "Move stmt to created bb\n");
8949 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, last, 0);
8950 }
8951 /* Move all stored value producers if possible. */
8952 while (!gsi_end_p (gsi))
8953 {
8954 tree lhs;
8955 imm_use_iterator imm_iter;
8956 use_operand_p use_p;
8957 bool res;
8959 /* Skip debug statements. */
8960 if (is_gimple_debug (gsi_stmt (gsi)))
8961 {
8962 gsi_prev (&gsi);
8963 continue;
8964 }
8965 stmt1 = gsi_stmt (gsi);
8966 /* Do not consider statements writing to memory or having
8967 a volatile operand. */
8968 if (gimple_vdef (stmt1)
8969 || gimple_has_volatile_ops (stmt1))
8970 break;
8971 gsi_from = gsi;
8972 gsi_prev (&gsi);
8973 lhs = gimple_get_lhs (stmt1);
8974 if (!lhs)
8975 break;
8977 /* LHS of vectorized stmt must be SSA_NAME. */
8978 if (TREE_CODE (lhs) != SSA_NAME)
8979 break;
8981 if (!VECTOR_TYPE_P (TREE_TYPE (lhs)))
8982 {
8983 /* Remove dead scalar statement. */
8984 if (has_zero_uses (lhs))
8985 {
8986 gsi_remove (&gsi_from, true);
8987 continue;
8988 }
8989 break;
8990 }
8991 /* Check that LHS does not have uses outside of STORE_BB. */
8992 res = true;
8993 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, lhs)
8994 {
8995 gimple *use_stmt;
8996 use_stmt = USE_STMT (use_p);
8997 if (is_gimple_debug (use_stmt))
8998 continue;
8999 if (gimple_bb (use_stmt) != store_bb)
9000 {
9001 res = false;
9002 break;
9003 }
9004 }
9005 if (!res)
9006 break;
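/* Note on the check below: a producer that reads memory (i.e. has a
   virtual use) is sunk only if it reads exactly the same memory state
   as the masked store being moved; otherwise the traversal stops.  */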
9008 if (gimple_vuse (stmt1)
9009 && gimple_vuse (stmt1) != gimple_vuse (last_store))
9010 break;
9012 /* Can move STMT1 to STORE_BB. */
9013 if (dump_enabled_p ())
9014 {
9015 dump_printf_loc (MSG_NOTE, vect_location,
9016 "Move stmt to created bb\n");
9017 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, stmt1, 0);
9018 }
9019 gsi_move_before (&gsi_from, &gsi_to);
9020 /* Shift GSI_TO for further insertion. */
9021 gsi_prev (&gsi_to);
9022 }
9023 /* Put other masked stores with the same mask to STORE_BB. */
9024 if (worklist.is_empty ()
9025 || gimple_call_arg (worklist.last (), 2) != mask
9026 || worklist.last () != stmt1)
9027 break;
9028 last = worklist.pop ();
9029 }
9030 add_phi_arg (phi, gimple_vuse (last_store), e, UNKNOWN_LOCATION);