gcc/tree-vect-loop.c
1 /* Loop Vectorization
2 Copyright (C) 2003-2018 Free Software Foundation, Inc.
3 Contributed by Dorit Naishlos <dorit@il.ibm.com> and
4 Ira Rosen <irar@il.ibm.com>
6 This file is part of GCC.
8 GCC is free software; you can redistribute it and/or modify it under
9 the terms of the GNU General Public License as published by the Free
10 Software Foundation; either version 3, or (at your option) any later
11 version.
13 GCC is distributed in the hope that it will be useful, but WITHOUT ANY
14 WARRANTY; without even the implied warranty of MERCHANTABILITY or
15 FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
16 for more details.
18 You should have received a copy of the GNU General Public License
19 along with GCC; see the file COPYING3. If not see
20 <http://www.gnu.org/licenses/>. */
22 #include "config.h"
23 #include "system.h"
24 #include "coretypes.h"
25 #include "backend.h"
26 #include "target.h"
27 #include "rtl.h"
28 #include "tree.h"
29 #include "gimple.h"
30 #include "cfghooks.h"
31 #include "tree-pass.h"
32 #include "ssa.h"
33 #include "optabs-tree.h"
34 #include "diagnostic-core.h"
35 #include "fold-const.h"
36 #include "stor-layout.h"
37 #include "cfganal.h"
38 #include "gimplify.h"
39 #include "gimple-iterator.h"
40 #include "gimplify-me.h"
41 #include "tree-ssa-loop-ivopts.h"
42 #include "tree-ssa-loop-manip.h"
43 #include "tree-ssa-loop-niter.h"
44 #include "tree-ssa-loop.h"
45 #include "cfgloop.h"
46 #include "params.h"
47 #include "tree-scalar-evolution.h"
48 #include "tree-vectorizer.h"
49 #include "gimple-fold.h"
50 #include "cgraph.h"
51 #include "tree-cfg.h"
52 #include "tree-if-conv.h"
53 #include "internal-fn.h"
54 #include "tree-vector-builder.h"
55 #include "vec-perm-indices.h"
56 #include "tree-eh.h"
58 /* Loop Vectorization Pass.
60 This pass tries to vectorize loops.
62 For example, the vectorizer transforms the following simple loop:
64 short a[N]; short b[N]; short c[N]; int i;
66 for (i=0; i<N; i++){
67 a[i] = b[i] + c[i];
70 as if it had been manually vectorized by rewriting the source code into:
72 typedef int __attribute__((mode(V8HI))) v8hi;
73 short a[N]; short b[N]; short c[N]; int i;
74 v8hi *pa = (v8hi*)a, *pb = (v8hi*)b, *pc = (v8hi*)c;
75 v8hi va, vb, vc;
77 for (i=0; i<N/8; i++){
78 vb = pb[i];
79 vc = pc[i];
80 va = vb + vc;
81 pa[i] = va;
84 The main entry to this pass is vectorize_loops(), in which
85 the vectorizer applies a set of analyses on a given set of loops,
86 followed by the actual vectorization transformation for the loops that
87 had successfully passed the analysis phase.
88 Throughout this pass we make a distinction between two types of
89 data: scalars (which are represented by SSA_NAMES), and memory references
90 ("data-refs"). These two types of data require different handling both
91 during analysis and transformation. The types of data-refs that the
92 vectorizer currently supports are ARRAY_REFS whose base is an array DECL
93 (not a pointer), and INDIRECT_REFS through pointers; both array and pointer
94 accesses are required to have a simple (consecutive) access pattern.
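(Editorial illustration, not part of the original comment; 'a' and 'p'
below are hypothetical names.)  The supported vs. unsupported access
shapes look like:

        a[i] = ...;          consecutive ARRAY_REF with an array DECL base
        *(p + i) = ...;      consecutive access through a pointer
        a[2*i] = ...;        strided, hence not a simple consecutive pattern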
96 Analysis phase:
97 ===============
98 The driver for the analysis phase is vect_analyze_loop().
99 It applies a set of analyses, some of which rely on the scalar evolution
100 analyzer (scev) developed by Sebastian Pop.
102 During the analysis phase the vectorizer records some information
103 per stmt in a "stmt_vec_info" struct which is attached to each stmt in the
104 loop, as well as general information about the loop as a whole, which is
105 recorded in a "loop_vec_info" struct attached to each loop.
107 Transformation phase:
108 =====================
109 The loop transformation phase scans all the stmts in the loop, and
110 creates a vector stmt (or a sequence of stmts) for each scalar stmt S in
111 the loop that needs to be vectorized. It inserts the vector code sequence
112 just before the scalar stmt S, and records a pointer to the vector code
113 in STMT_VINFO_VEC_STMT (stmt_info) (stmt_info is the stmt_vec_info struct
114 attached to S). This pointer will be used for the vectorization of following
115 stmts which use the def of stmt S. Stmt S is removed if it writes to memory;
116 otherwise, we rely on dead code elimination for removing it.
118 For example, say stmt S1 was vectorized into stmt VS1:
120 VS1: vb = px[i];
121 S1: b = x[i]; STMT_VINFO_VEC_STMT (stmt_info (S1)) = VS1
122 S2: a = b;
124 To vectorize stmt S2, the vectorizer first finds the stmt that defines
125 the operand 'b' (S1), and gets the relevant vector def 'vb' from the
126 vector stmt VS1 pointed to by STMT_VINFO_VEC_STMT (stmt_info (S1)). The
127 resulting sequence would be:
129 VS1: vb = px[i];
130 S1: b = x[i]; STMT_VINFO_VEC_STMT (stmt_info (S1)) = VS1
131 VS2: va = vb;
132 S2: a = b; STMT_VINFO_VEC_STMT (stmt_info (S2)) = VS2
134 Operands that are not SSA_NAMEs are data-refs that appear in
135 load/store operations (like 'x[i]' in S1), and are handled differently.
137 Target modeling:
138 =================
139 Currently the only target-specific information that is used is the
140 size of the vector (in bytes) - "TARGET_VECTORIZE_UNITS_PER_SIMD_WORD".
141 Targets that support multiple vector sizes will, for now, need to
142 specify a single value for "TARGET_VECTORIZE_UNITS_PER_SIMD_WORD". More
143 flexibility will be added in the future.
145 Since we only vectorize operations whose vector form can be
146 expressed using existing tree codes, to verify that an operation is
147 supported, the vectorizer checks the relevant optab at the relevant
148 machine_mode (e.g., optab_handler (add_optab, V8HImode)). If
149 the value found is CODE_FOR_nothing, then there's no target support, and
150 we can't vectorize the stmt.
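(Editorial sketch, not part of the original comment; it only restates the
check described above, using the same names - optab_handler, add_optab,
V8HImode and CODE_FOR_nothing.)  In outline:

        if (optab_handler (add_optab, V8HImode) == CODE_FOR_nothing)
          ... no target support for a V8HI addition, so the stmt
              cannot be vectorized ...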
152 For additional information on this project see:
153 http://gcc.gnu.org/projects/tree-ssa/vectorization.html
156 static void vect_estimate_min_profitable_iters (loop_vec_info, int *, int *);
158 /* Subroutine of vect_determine_vf_for_stmt that handles only one
159 statement. VECTYPE_MAYBE_SET_P is true if STMT_VINFO_VECTYPE
160 may already be set for general statements (not just data refs). */
162 static opt_result
163 vect_determine_vf_for_stmt_1 (stmt_vec_info stmt_info,
164 bool vectype_maybe_set_p,
165 poly_uint64 *vf,
166 vec<stmt_vec_info > *mask_producers)
168 gimple *stmt = stmt_info->stmt;
170 if ((!STMT_VINFO_RELEVANT_P (stmt_info)
171 && !STMT_VINFO_LIVE_P (stmt_info))
172 || gimple_clobber_p (stmt))
174 if (dump_enabled_p ())
175 dump_printf_loc (MSG_NOTE, vect_location, "skip.\n");
176 return opt_result::success ();
179 tree stmt_vectype, nunits_vectype;
180 opt_result res = vect_get_vector_types_for_stmt (stmt_info, &stmt_vectype,
181 &nunits_vectype);
182 if (!res)
183 return res;
185 if (stmt_vectype)
187 if (STMT_VINFO_VECTYPE (stmt_info))
188 /* The only case when a vectype had been already set is for stmts
189 that contain a data ref, or for "pattern-stmts" (stmts generated
190 by the vectorizer to represent/replace a certain idiom). */
191 gcc_assert ((STMT_VINFO_DATA_REF (stmt_info)
192 || vectype_maybe_set_p)
193 && STMT_VINFO_VECTYPE (stmt_info) == stmt_vectype);
194 else if (stmt_vectype == boolean_type_node)
195 mask_producers->safe_push (stmt_info);
196 else
197 STMT_VINFO_VECTYPE (stmt_info) = stmt_vectype;
200 if (nunits_vectype)
201 vect_update_max_nunits (vf, nunits_vectype);
203 return opt_result::success ();
206 /* Subroutine of vect_determine_vectorization_factor. Set the vector
207 types of STMT_INFO and all attached pattern statements and update
208 the vectorization factor VF accordingly. If some of the statements
209 produce a mask result whose vector type can only be calculated later,
210 add them to MASK_PRODUCERS. Return true on success or false if
211 something prevented vectorization. */
213 static opt_result
214 vect_determine_vf_for_stmt (stmt_vec_info stmt_info, poly_uint64 *vf,
215 vec<stmt_vec_info > *mask_producers)
217 vec_info *vinfo = stmt_info->vinfo;
218 if (dump_enabled_p ())
219 dump_printf_loc (MSG_NOTE, vect_location, "==> examining statement: %G",
220 stmt_info->stmt);
221 opt_result res
222 = vect_determine_vf_for_stmt_1 (stmt_info, false, vf, mask_producers);
223 if (!res)
224 return res;
226 if (STMT_VINFO_IN_PATTERN_P (stmt_info)
227 && STMT_VINFO_RELATED_STMT (stmt_info))
229 gimple *pattern_def_seq = STMT_VINFO_PATTERN_DEF_SEQ (stmt_info);
230 stmt_info = STMT_VINFO_RELATED_STMT (stmt_info);
232 /* If a pattern statement has def stmts, analyze them too. */
233 for (gimple_stmt_iterator si = gsi_start (pattern_def_seq);
234 !gsi_end_p (si); gsi_next (&si))
236 stmt_vec_info def_stmt_info = vinfo->lookup_stmt (gsi_stmt (si));
237 if (dump_enabled_p ())
238 dump_printf_loc (MSG_NOTE, vect_location,
239 "==> examining pattern def stmt: %G",
240 def_stmt_info->stmt);
241 res = vect_determine_vf_for_stmt_1 (def_stmt_info, true,
242 vf, mask_producers);
245 if (!res)
246 return res;
249 if (dump_enabled_p ())
250 dump_printf_loc (MSG_NOTE, vect_location,
251 "==> examining pattern statement: %G",
252 stmt_info->stmt);
253 res = vect_determine_vf_for_stmt_1 (stmt_info, true, vf, mask_producers);
254 if (!res)
255 return res;
258 return opt_result::success ();
261 /* Function vect_determine_vectorization_factor
263 Determine the vectorization factor (VF). VF is the number of data elements
264 that are operated upon in parallel in a single iteration of the vectorized
265 loop. For example, when vectorizing a loop that operates on 4-byte elements,
266 on a target with a vector size (VS) of 16 bytes, the VF is set to 4, since 4
267 elements can fit in a single vector register.
269 We currently support vectorization of loops in which all types operated upon
270 are of the same size. Therefore this function currently sets VF according to
271 the size of the types operated upon, and fails if there are multiple sizes
272 in the loop.
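For illustration (an editorial example, not from the original comment;
'sa' and 'ib' are hypothetical arrays):

        short sa[N]; int ib[N];
        for (i=0; i<N; i++)
          ib[i] = sa[i];

operates on both 2-byte and 4-byte elements (assuming the usual short/int
sizes), which is the multiple-size situation described above.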
274 VF is also the factor by which the loop iterations are strip-mined, e.g.:
275 original loop:
276 for (i=0; i<N; i++){
277 a[i] = b[i] + c[i];
280 vectorized loop:
281 for (i=0; i<N; i+=VF){
282 a[i:VF] = b[i:VF] + c[i:VF];
286 static opt_result
287 vect_determine_vectorization_factor (loop_vec_info loop_vinfo)
289 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
290 basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
291 unsigned nbbs = loop->num_nodes;
292 poly_uint64 vectorization_factor = 1;
293 tree scalar_type = NULL_TREE;
294 gphi *phi;
295 tree vectype;
296 stmt_vec_info stmt_info;
297 unsigned i;
298 auto_vec<stmt_vec_info> mask_producers;
300 DUMP_VECT_SCOPE ("vect_determine_vectorization_factor");
302 for (i = 0; i < nbbs; i++)
304 basic_block bb = bbs[i];
306 for (gphi_iterator si = gsi_start_phis (bb); !gsi_end_p (si);
307 gsi_next (&si))
309 phi = si.phi ();
310 stmt_info = loop_vinfo->lookup_stmt (phi);
311 if (dump_enabled_p ())
312 dump_printf_loc (MSG_NOTE, vect_location, "==> examining phi: %G",
313 phi);
315 gcc_assert (stmt_info);
317 if (STMT_VINFO_RELEVANT_P (stmt_info)
318 || STMT_VINFO_LIVE_P (stmt_info))
320 gcc_assert (!STMT_VINFO_VECTYPE (stmt_info));
321 scalar_type = TREE_TYPE (PHI_RESULT (phi));
323 if (dump_enabled_p ())
324 dump_printf_loc (MSG_NOTE, vect_location,
325 "get vectype for scalar type: %T\n",
326 scalar_type);
328 vectype = get_vectype_for_scalar_type (scalar_type);
329 if (!vectype)
330 return opt_result::failure_at (phi,
331 "not vectorized: unsupported "
332 "data-type %T\n",
333 scalar_type);
334 STMT_VINFO_VECTYPE (stmt_info) = vectype;
336 if (dump_enabled_p ())
337 dump_printf_loc (MSG_NOTE, vect_location, "vectype: %T\n",
338 vectype);
340 if (dump_enabled_p ())
342 dump_printf_loc (MSG_NOTE, vect_location, "nunits = ");
343 dump_dec (MSG_NOTE, TYPE_VECTOR_SUBPARTS (vectype));
344 dump_printf (MSG_NOTE, "\n");
347 vect_update_max_nunits (&vectorization_factor, vectype);
351 for (gimple_stmt_iterator si = gsi_start_bb (bb); !gsi_end_p (si);
352 gsi_next (&si))
354 stmt_info = loop_vinfo->lookup_stmt (gsi_stmt (si));
355 opt_result res
356 = vect_determine_vf_for_stmt (stmt_info, &vectorization_factor,
357 &mask_producers);
358 if (!res)
359 return res;
363 /* TODO: Analyze cost. Decide if worth while to vectorize. */
364 if (dump_enabled_p ())
366 dump_printf_loc (MSG_NOTE, vect_location, "vectorization factor = ");
367 dump_dec (MSG_NOTE, vectorization_factor);
368 dump_printf (MSG_NOTE, "\n");
371 if (known_le (vectorization_factor, 1U))
372 return opt_result::failure_at (vect_location,
373 "not vectorized: unsupported data-type\n");
374 LOOP_VINFO_VECT_FACTOR (loop_vinfo) = vectorization_factor;
376 for (i = 0; i < mask_producers.length (); i++)
378 stmt_info = mask_producers[i];
379 opt_tree mask_type = vect_get_mask_type_for_stmt (stmt_info);
380 if (!mask_type)
381 return opt_result::propagate_failure (mask_type);
382 STMT_VINFO_VECTYPE (stmt_info) = mask_type;
385 return opt_result::success ();
389 /* Function vect_is_simple_iv_evolution.
391 FORNOW: A simple evolution of an induction variable in the loop is
392 considered a polynomial evolution. */
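/* Editorial note (an illustration added for clarity, not from the original
   source): the scalar evolution analyzer expresses a simple induction as a
   linear chrec {init, +, step}_n, where n is the loop number.  For example,
   for

       for (i = 0; i < N; i++)

   the access function of 'i' would be {0, +, 1}_1, assuming the loop has
   number 1.  vect_is_simple_iv_evolution extracts 'init' and 'step' from
   such a chrec and rejects evolutions of higher degree.  */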
394 static bool
395 vect_is_simple_iv_evolution (unsigned loop_nb, tree access_fn, tree * init,
396 tree * step)
398 tree init_expr;
399 tree step_expr;
400 tree evolution_part = evolution_part_in_loop_num (access_fn, loop_nb);
401 basic_block bb;
403 /* When there is no evolution in this loop, the evolution function
404 is not "simple". */
405 if (evolution_part == NULL_TREE)
406 return false;
408 /* When the evolution is a polynomial of degree >= 2
409 the evolution function is not "simple". */
410 if (tree_is_chrec (evolution_part))
411 return false;
413 step_expr = evolution_part;
414 init_expr = unshare_expr (initial_condition_in_loop_num (access_fn, loop_nb));
416 if (dump_enabled_p ())
417 dump_printf_loc (MSG_NOTE, vect_location, "step: %T, init: %T\n",
418 step_expr, init_expr);
420 *init = init_expr;
421 *step = step_expr;
423 if (TREE_CODE (step_expr) != INTEGER_CST
424 && (TREE_CODE (step_expr) != SSA_NAME
425 || ((bb = gimple_bb (SSA_NAME_DEF_STMT (step_expr)))
426 && flow_bb_inside_loop_p (get_loop (cfun, loop_nb), bb))
427 || (!INTEGRAL_TYPE_P (TREE_TYPE (step_expr))
428 && (!SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr))
429 || !flag_associative_math)))
430 && (TREE_CODE (step_expr) != REAL_CST
431 || !flag_associative_math))
433 if (dump_enabled_p ())
434 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
435 "step unknown.\n");
436 return false;
439 return true;
442 /* Return true if PHI, described by STMT_INFO, is the inner PHI in
443 what we are assuming is a double reduction. For example, given
444 a structure like this:
446 outer1:
447 x_1 = PHI <x_4(outer2), ...>;
450 inner:
451 x_2 = PHI <x_1(outer1), ...>;
453 x_3 = ...;
456 outer2:
457 x_4 = PHI <x_3(inner)>;
460 outer loop analysis would treat x_1 as a double reduction phi and
461 this function would then return true for x_2. */
463 static bool
464 vect_inner_phi_in_double_reduction_p (stmt_vec_info stmt_info, gphi *phi)
466 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
467 use_operand_p use_p;
468 ssa_op_iter op_iter;
469 FOR_EACH_PHI_ARG (use_p, phi, op_iter, SSA_OP_USE)
470 if (stmt_vec_info def_info = loop_vinfo->lookup_def (USE_FROM_PTR (use_p)))
471 if (STMT_VINFO_DEF_TYPE (def_info) == vect_double_reduction_def)
472 return true;
473 return false;
476 /* Function vect_analyze_scalar_cycles_1.
478 Examine the cross iteration def-use cycles of scalar variables
479 in LOOP. LOOP_VINFO represents the loop that is now being
480 considered for vectorization (can be LOOP, or an outer-loop
481 enclosing LOOP). */
483 static void
484 vect_analyze_scalar_cycles_1 (loop_vec_info loop_vinfo, struct loop *loop)
486 basic_block bb = loop->header;
487 tree init, step;
488 auto_vec<stmt_vec_info, 64> worklist;
489 gphi_iterator gsi;
490 bool double_reduc;
492 DUMP_VECT_SCOPE ("vect_analyze_scalar_cycles");
494 /* First - identify all inductions. Reduction detection assumes that all the
495 inductions have been identified, therefore, this order must not be
496 changed. */
497 for (gsi = gsi_start_phis (bb); !gsi_end_p (gsi); gsi_next (&gsi))
499 gphi *phi = gsi.phi ();
500 tree access_fn = NULL;
501 tree def = PHI_RESULT (phi);
502 stmt_vec_info stmt_vinfo = loop_vinfo->lookup_stmt (phi);
504 if (dump_enabled_p ())
505 dump_printf_loc (MSG_NOTE, vect_location, "Analyze phi: %G", phi);
507 /* Skip virtual phi's. The data dependences that are associated with
508 virtual defs/uses (i.e., memory accesses) are analyzed elsewhere. */
509 if (virtual_operand_p (def))
510 continue;
512 STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_unknown_def_type;
514 /* Analyze the evolution function. */
515 access_fn = analyze_scalar_evolution (loop, def);
516 if (access_fn)
518 STRIP_NOPS (access_fn);
519 if (dump_enabled_p ())
520 dump_printf_loc (MSG_NOTE, vect_location,
521 "Access function of PHI: %T\n", access_fn);
522 STMT_VINFO_LOOP_PHI_EVOLUTION_BASE_UNCHANGED (stmt_vinfo)
523 = initial_condition_in_loop_num (access_fn, loop->num);
524 STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_vinfo)
525 = evolution_part_in_loop_num (access_fn, loop->num);
528 if (!access_fn
529 || vect_inner_phi_in_double_reduction_p (stmt_vinfo, phi)
530 || !vect_is_simple_iv_evolution (loop->num, access_fn, &init, &step)
531 || (LOOP_VINFO_LOOP (loop_vinfo) != loop
532 && TREE_CODE (step) != INTEGER_CST))
534 worklist.safe_push (stmt_vinfo);
535 continue;
538 gcc_assert (STMT_VINFO_LOOP_PHI_EVOLUTION_BASE_UNCHANGED (stmt_vinfo)
539 != NULL_TREE);
540 gcc_assert (STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_vinfo) != NULL_TREE);
542 if (dump_enabled_p ())
543 dump_printf_loc (MSG_NOTE, vect_location, "Detected induction.\n");
544 STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_induction_def;
548 /* Second - identify all reductions and nested cycles. */
549 while (worklist.length () > 0)
551 stmt_vec_info stmt_vinfo = worklist.pop ();
552 gphi *phi = as_a <gphi *> (stmt_vinfo->stmt);
553 tree def = PHI_RESULT (phi);
555 if (dump_enabled_p ())
556 dump_printf_loc (MSG_NOTE, vect_location, "Analyze phi: %G", phi);
558 gcc_assert (!virtual_operand_p (def)
559 && STMT_VINFO_DEF_TYPE (stmt_vinfo) == vect_unknown_def_type);
561 stmt_vec_info reduc_stmt_info
562 = vect_force_simple_reduction (loop_vinfo, stmt_vinfo,
563 &double_reduc, false);
564 if (reduc_stmt_info)
566 if (double_reduc)
568 if (dump_enabled_p ())
569 dump_printf_loc (MSG_NOTE, vect_location,
570 "Detected double reduction.\n");
572 STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_double_reduction_def;
573 STMT_VINFO_DEF_TYPE (reduc_stmt_info)
574 = vect_double_reduction_def;
576 else
578 if (loop != LOOP_VINFO_LOOP (loop_vinfo))
580 if (dump_enabled_p ())
581 dump_printf_loc (MSG_NOTE, vect_location,
582 "Detected vectorizable nested cycle.\n");
584 STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_nested_cycle;
585 STMT_VINFO_DEF_TYPE (reduc_stmt_info) = vect_nested_cycle;
587 else
589 if (dump_enabled_p ())
590 dump_printf_loc (MSG_NOTE, vect_location,
591 "Detected reduction.\n");
593 STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_reduction_def;
594 STMT_VINFO_DEF_TYPE (reduc_stmt_info) = vect_reduction_def;
595 /* Store the reduction cycles for possible vectorization in
596 loop-aware SLP if it was not detected as a reduction
597 chain. */
598 if (! REDUC_GROUP_FIRST_ELEMENT (reduc_stmt_info))
599 LOOP_VINFO_REDUCTIONS (loop_vinfo).safe_push
600 (reduc_stmt_info);
604 else
605 if (dump_enabled_p ())
606 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
607 "Unknown def-use cycle pattern.\n");
612 /* Function vect_analyze_scalar_cycles.
614 Examine the cross iteration def-use cycles of scalar variables, by
615 analyzing the loop-header PHIs of scalar variables. Classify each
616 cycle as one of the following: invariant, induction, reduction, unknown.
617 We do that for the loop represented by LOOP_VINFO, and also for its
618 inner loop, if it exists.
619 Examples for scalar cycles:
621 Example1: reduction:
623 loop1:
624 for (i=0; i<N; i++)
625 sum += a[i];
627 Example2: induction:
629 loop2:
630 for (i=0; i<N; i++)
631 a[i] = i; */
633 static void
634 vect_analyze_scalar_cycles (loop_vec_info loop_vinfo)
636 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
638 vect_analyze_scalar_cycles_1 (loop_vinfo, loop);
640 /* When vectorizing an outer-loop, the inner-loop is executed sequentially.
641 Reductions in such an inner loop therefore have different properties than
642 the reductions in the nest that gets vectorized:
643 1. When vectorized, they are executed in the same order as in the original
644 scalar loop, so we can't change the order of computation when
645 vectorizing them.
646 2. FIXME: Inner-loop reductions can be used in the inner-loop, so the
647 current checks are too strict. */
649 if (loop->inner)
650 vect_analyze_scalar_cycles_1 (loop_vinfo, loop->inner);
653 /* Transfer group and reduction information from STMT_INFO to its
654 pattern stmt. */
656 static void
657 vect_fixup_reduc_chain (stmt_vec_info stmt_info)
659 stmt_vec_info firstp = STMT_VINFO_RELATED_STMT (stmt_info);
660 stmt_vec_info stmtp;
661 gcc_assert (!REDUC_GROUP_FIRST_ELEMENT (firstp)
662 && REDUC_GROUP_FIRST_ELEMENT (stmt_info));
663 REDUC_GROUP_SIZE (firstp) = REDUC_GROUP_SIZE (stmt_info);
666 stmtp = STMT_VINFO_RELATED_STMT (stmt_info);
667 REDUC_GROUP_FIRST_ELEMENT (stmtp) = firstp;
668 stmt_info = REDUC_GROUP_NEXT_ELEMENT (stmt_info);
669 if (stmt_info)
670 REDUC_GROUP_NEXT_ELEMENT (stmtp)
671 = STMT_VINFO_RELATED_STMT (stmt_info);
673 while (stmt_info);
674 STMT_VINFO_DEF_TYPE (stmtp) = vect_reduction_def;
677 /* Fixup scalar cycles that now have their stmts detected as patterns. */
679 static void
680 vect_fixup_scalar_cycles_with_patterns (loop_vec_info loop_vinfo)
682 stmt_vec_info first;
683 unsigned i;
685 FOR_EACH_VEC_ELT (LOOP_VINFO_REDUCTION_CHAINS (loop_vinfo), i, first)
686 if (STMT_VINFO_IN_PATTERN_P (first))
688 stmt_vec_info next = REDUC_GROUP_NEXT_ELEMENT (first);
689 while (next)
691 if (! STMT_VINFO_IN_PATTERN_P (next))
692 break;
693 next = REDUC_GROUP_NEXT_ELEMENT (next);
695 /* If not all stmts in the chain are patterns, try to handle
696 the chain without patterns. */
697 if (! next)
699 vect_fixup_reduc_chain (first);
700 LOOP_VINFO_REDUCTION_CHAINS (loop_vinfo)[i]
701 = STMT_VINFO_RELATED_STMT (first);
706 /* Function vect_get_loop_niters.
708 Determine how many iterations the loop executes and place the result
709 in NUMBER_OF_ITERATIONS. Place the number of latch iterations
710 in NUMBER_OF_ITERATIONSM1. Place the condition under which the
711 niter information holds in ASSUMPTIONS.
713 Return the loop exit condition. */
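/* Editorial illustration (not from the original source): for a loop such as

       for (i = 0; i < n; i++)   with n known to be at least 1,

   the latch runs n - 1 times, so NUMBER_OF_ITERATIONSM1 is n - 1 and
   NUMBER_OF_ITERATIONS (the number of header executions) is n, which is
   what the "+ 1" adjustment at the end of this function produces.  */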
716 static gcond *
717 vect_get_loop_niters (struct loop *loop, tree *assumptions,
718 tree *number_of_iterations, tree *number_of_iterationsm1)
720 edge exit = single_exit (loop);
721 struct tree_niter_desc niter_desc;
722 tree niter_assumptions, niter, may_be_zero;
723 gcond *cond = get_loop_exit_condition (loop);
725 *assumptions = boolean_true_node;
726 *number_of_iterationsm1 = chrec_dont_know;
727 *number_of_iterations = chrec_dont_know;
728 DUMP_VECT_SCOPE ("get_loop_niters");
730 if (!exit)
731 return cond;
733 niter = chrec_dont_know;
734 may_be_zero = NULL_TREE;
735 niter_assumptions = boolean_true_node;
736 if (!number_of_iterations_exit_assumptions (loop, exit, &niter_desc, NULL)
737 || chrec_contains_undetermined (niter_desc.niter))
738 return cond;
740 niter_assumptions = niter_desc.assumptions;
741 may_be_zero = niter_desc.may_be_zero;
742 niter = niter_desc.niter;
744 if (may_be_zero && integer_zerop (may_be_zero))
745 may_be_zero = NULL_TREE;
747 if (may_be_zero)
749 if (COMPARISON_CLASS_P (may_be_zero))
751 /* Try to combine may_be_zero with assumptions, this can simplify
752 computation of niter expression. */
753 if (niter_assumptions && !integer_nonzerop (niter_assumptions))
754 niter_assumptions = fold_build2 (TRUTH_AND_EXPR, boolean_type_node,
755 niter_assumptions,
756 fold_build1 (TRUTH_NOT_EXPR,
757 boolean_type_node,
758 may_be_zero));
759 else
760 niter = fold_build3 (COND_EXPR, TREE_TYPE (niter), may_be_zero,
761 build_int_cst (TREE_TYPE (niter), 0),
762 rewrite_to_non_trapping_overflow (niter));
764 may_be_zero = NULL_TREE;
766 else if (integer_nonzerop (may_be_zero))
768 *number_of_iterationsm1 = build_int_cst (TREE_TYPE (niter), 0);
769 *number_of_iterations = build_int_cst (TREE_TYPE (niter), 1);
770 return cond;
772 else
773 return cond;
776 *assumptions = niter_assumptions;
777 *number_of_iterationsm1 = niter;
779 /* We want the number of loop header executions which is the number
780 of latch executions plus one.
781 ??? For UINT_MAX latch executions this number overflows to zero
782 for loops like do { n++; } while (n != 0); */
783 if (niter && !chrec_contains_undetermined (niter))
784 niter = fold_build2 (PLUS_EXPR, TREE_TYPE (niter), unshare_expr (niter),
785 build_int_cst (TREE_TYPE (niter), 1));
786 *number_of_iterations = niter;
788 return cond;
791 /* Function bb_in_loop_p
793 Used as predicate for dfs order traversal of the loop bbs. */
795 static bool
796 bb_in_loop_p (const_basic_block bb, const void *data)
798 const struct loop *const loop = (const struct loop *)data;
799 if (flow_bb_inside_loop_p (loop, bb))
800 return true;
801 return false;
805 /* Create and initialize a new loop_vec_info struct for LOOP_IN, as well as
806 stmt_vec_info structs for all the stmts in LOOP_IN. */
808 _loop_vec_info::_loop_vec_info (struct loop *loop_in, vec_info_shared *shared)
809 : vec_info (vec_info::loop, init_cost (loop_in), shared),
810 loop (loop_in),
811 bbs (XCNEWVEC (basic_block, loop->num_nodes)),
812 num_itersm1 (NULL_TREE),
813 num_iters (NULL_TREE),
814 num_iters_unchanged (NULL_TREE),
815 num_iters_assumptions (NULL_TREE),
816 th (0),
817 versioning_threshold (0),
818 vectorization_factor (0),
819 max_vectorization_factor (0),
820 mask_skip_niters (NULL_TREE),
821 mask_compare_type (NULL_TREE),
822 unaligned_dr (NULL),
823 peeling_for_alignment (0),
824 ptr_mask (0),
825 ivexpr_map (NULL),
826 slp_unrolling_factor (1),
827 single_scalar_iteration_cost (0),
828 vectorizable (false),
829 can_fully_mask_p (true),
830 fully_masked_p (false),
831 peeling_for_gaps (false),
832 peeling_for_niter (false),
833 operands_swapped (false),
834 no_data_dependencies (false),
835 has_mask_store (false),
836 scalar_loop (NULL),
837 orig_loop_info (NULL)
839 /* CHECKME: We want to visit all BBs before their successors (except for
840 latch blocks, for which this assertion wouldn't hold). In the simple
841 case of the loop forms we allow, a dfs order of the BBs would be the same
842 as reversed postorder traversal, so we are safe. */
844 unsigned int nbbs = dfs_enumerate_from (loop->header, 0, bb_in_loop_p,
845 bbs, loop->num_nodes, loop);
846 gcc_assert (nbbs == loop->num_nodes);
848 for (unsigned int i = 0; i < nbbs; i++)
850 basic_block bb = bbs[i];
851 gimple_stmt_iterator si;
853 for (si = gsi_start_phis (bb); !gsi_end_p (si); gsi_next (&si))
855 gimple *phi = gsi_stmt (si);
856 gimple_set_uid (phi, 0);
857 add_stmt (phi);
860 for (si = gsi_start_bb (bb); !gsi_end_p (si); gsi_next (&si))
862 gimple *stmt = gsi_stmt (si);
863 gimple_set_uid (stmt, 0);
864 add_stmt (stmt);
869 /* Free all levels of MASKS. */
871 void
872 release_vec_loop_masks (vec_loop_masks *masks)
874 rgroup_masks *rgm;
875 unsigned int i;
876 FOR_EACH_VEC_ELT (*masks, i, rgm)
877 rgm->masks.release ();
878 masks->release ();
881 /* Free all memory used by the _loop_vec_info, as well as all the
882 stmt_vec_info structs of all the stmts in the loop. */
884 _loop_vec_info::~_loop_vec_info ()
886 int nbbs;
887 gimple_stmt_iterator si;
888 int j;
890 nbbs = loop->num_nodes;
891 for (j = 0; j < nbbs; j++)
893 basic_block bb = bbs[j];
894 for (si = gsi_start_bb (bb); !gsi_end_p (si); )
896 gimple *stmt = gsi_stmt (si);
898 /* We may have broken canonical form by moving a constant
899 into RHS1 of a commutative op. Fix such occurrences. */
900 if (operands_swapped && is_gimple_assign (stmt))
902 enum tree_code code = gimple_assign_rhs_code (stmt);
904 if ((code == PLUS_EXPR
905 || code == POINTER_PLUS_EXPR
906 || code == MULT_EXPR)
907 && CONSTANT_CLASS_P (gimple_assign_rhs1 (stmt)))
908 swap_ssa_operands (stmt,
909 gimple_assign_rhs1_ptr (stmt),
910 gimple_assign_rhs2_ptr (stmt));
911 else if (code == COND_EXPR
912 && CONSTANT_CLASS_P (gimple_assign_rhs2 (stmt)))
914 tree cond_expr = gimple_assign_rhs1 (stmt);
915 enum tree_code cond_code = TREE_CODE (cond_expr);
917 if (TREE_CODE_CLASS (cond_code) == tcc_comparison)
919 bool honor_nans = HONOR_NANS (TREE_OPERAND (cond_expr,
920 0));
921 cond_code = invert_tree_comparison (cond_code,
922 honor_nans);
923 if (cond_code != ERROR_MARK)
925 TREE_SET_CODE (cond_expr, cond_code);
926 swap_ssa_operands (stmt,
927 gimple_assign_rhs2_ptr (stmt),
928 gimple_assign_rhs3_ptr (stmt));
933 gsi_next (&si);
937 free (bbs);
939 release_vec_loop_masks (&masks);
940 delete ivexpr_map;
942 loop->aux = NULL;
945 /* Return an invariant or register for EXPR and emit necessary
946 computations in the LOOP_VINFO loop preheader. */
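/* Editorial illustration (not from the original source; 'n_1' is a
   hypothetical SSA name): a call with EXPR = n_1 + 7 gimplifies the
   addition once, inserts the resulting statement on the preheader edge,
   and caches the new SSA name in ivexpr_map so that later calls with an
   equal expression reuse it instead of emitting the computation again.  */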
948 tree
949 cse_and_gimplify_to_preheader (loop_vec_info loop_vinfo, tree expr)
951 if (is_gimple_reg (expr)
952 || is_gimple_min_invariant (expr))
953 return expr;
955 if (! loop_vinfo->ivexpr_map)
956 loop_vinfo->ivexpr_map = new hash_map<tree_operand_hash, tree>;
957 tree &cached = loop_vinfo->ivexpr_map->get_or_insert (expr);
958 if (! cached)
960 gimple_seq stmts = NULL;
961 cached = force_gimple_operand (unshare_expr (expr),
962 &stmts, true, NULL_TREE);
963 if (stmts)
965 edge e = loop_preheader_edge (LOOP_VINFO_LOOP (loop_vinfo));
966 gsi_insert_seq_on_edge_immediate (e, stmts);
969 return cached;
972 /* Return true if we can use CMP_TYPE as the comparison type to produce
973 all masks required to mask LOOP_VINFO. */
975 static bool
976 can_produce_all_loop_masks_p (loop_vec_info loop_vinfo, tree cmp_type)
978 rgroup_masks *rgm;
979 unsigned int i;
980 FOR_EACH_VEC_ELT (LOOP_VINFO_MASKS (loop_vinfo), i, rgm)
981 if (rgm->mask_type != NULL_TREE
982 && !direct_internal_fn_supported_p (IFN_WHILE_ULT,
983 cmp_type, rgm->mask_type,
984 OPTIMIZE_FOR_SPEED))
985 return false;
986 return true;
989 /* Calculate the maximum number of scalars per iteration for every
990 rgroup in LOOP_VINFO. */
992 static unsigned int
993 vect_get_max_nscalars_per_iter (loop_vec_info loop_vinfo)
995 unsigned int res = 1;
996 unsigned int i;
997 rgroup_masks *rgm;
998 FOR_EACH_VEC_ELT (LOOP_VINFO_MASKS (loop_vinfo), i, rgm)
999 res = MAX (res, rgm->max_nscalars_per_iter);
1000 return res;
1003 /* Each statement in LOOP_VINFO can be masked where necessary. Check
1004 whether we can actually generate the masks required. Return true if so,
1005 storing the type of the scalar IV in LOOP_VINFO_MASK_COMPARE_TYPE. */
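/* Editorial illustration (an assumption-laden sketch, not from the original
   source): if the loop is known to run at most 1000 times and the largest
   rgroup handles 2 scalars per iteration (so needs 2 mask bits per
   iteration), then max_ni below becomes 2 * 1000 = 2000 and min_ni_width
   is 11 bits, so any integer mode of at least 11 bits whose IFN_WHILE_ULT
   is supported can serve as the comparison type, with the search
   continuing up to Pmode as the code below explains.  */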
1007 static bool
1008 vect_verify_full_masking (loop_vec_info loop_vinfo)
1010 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1011 unsigned int min_ni_width;
1013 /* Use a normal loop if there are no statements that need masking.
1014 This only happens in rare degenerate cases: it means that the loop
1015 has no loads, no stores, and no live-out values. */
1016 if (LOOP_VINFO_MASKS (loop_vinfo).is_empty ())
1017 return false;
1019 /* Get the maximum number of iterations that is representable
1020 in the counter type. */
1021 tree ni_type = TREE_TYPE (LOOP_VINFO_NITERSM1 (loop_vinfo));
1022 widest_int max_ni = wi::to_widest (TYPE_MAX_VALUE (ni_type)) + 1;
1024 /* Get a more refined estimate for the number of iterations. */
1025 widest_int max_back_edges;
1026 if (max_loop_iterations (loop, &max_back_edges))
1027 max_ni = wi::smin (max_ni, max_back_edges + 1);
1029 /* Account for rgroup masks, in which each bit is replicated N times. */
1030 max_ni *= vect_get_max_nscalars_per_iter (loop_vinfo);
1032 /* Work out how many bits we need to represent the limit. */
1033 min_ni_width = wi::min_precision (max_ni, UNSIGNED);
1035 /* Find a scalar mode for which WHILE_ULT is supported. */
1036 opt_scalar_int_mode cmp_mode_iter;
1037 tree cmp_type = NULL_TREE;
1038 FOR_EACH_MODE_IN_CLASS (cmp_mode_iter, MODE_INT)
1040 unsigned int cmp_bits = GET_MODE_BITSIZE (cmp_mode_iter.require ());
1041 if (cmp_bits >= min_ni_width
1042 && targetm.scalar_mode_supported_p (cmp_mode_iter.require ()))
1044 tree this_type = build_nonstandard_integer_type (cmp_bits, true);
1045 if (this_type
1046 && can_produce_all_loop_masks_p (loop_vinfo, this_type))
1048 /* Although we could stop as soon as we find a valid mode,
1049 it's often better to continue until we hit Pmode, since the
1050 operands to the WHILE are more likely to be reusable in
1051 address calculations. */
1052 cmp_type = this_type;
1053 if (cmp_bits >= GET_MODE_BITSIZE (Pmode))
1054 break;
1059 if (!cmp_type)
1060 return false;
1062 LOOP_VINFO_MASK_COMPARE_TYPE (loop_vinfo) = cmp_type;
1063 return true;
1066 /* Calculate the cost of one scalar iteration of the loop. */
1067 static void
1068 vect_compute_single_scalar_iteration_cost (loop_vec_info loop_vinfo)
1070 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1071 basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
1072 int nbbs = loop->num_nodes, factor;
1073 int innerloop_iters, i;
1075 DUMP_VECT_SCOPE ("vect_compute_single_scalar_iteration_cost");
1077 /* Gather costs for statements in the scalar loop. */
1079 /* FORNOW. */
1080 innerloop_iters = 1;
1081 if (loop->inner)
1082 innerloop_iters = 50; /* FIXME */
1084 for (i = 0; i < nbbs; i++)
1086 gimple_stmt_iterator si;
1087 basic_block bb = bbs[i];
1089 if (bb->loop_father == loop->inner)
1090 factor = innerloop_iters;
1091 else
1092 factor = 1;
1094 for (si = gsi_start_bb (bb); !gsi_end_p (si); gsi_next (&si))
1096 gimple *stmt = gsi_stmt (si);
1097 stmt_vec_info stmt_info = loop_vinfo->lookup_stmt (stmt);
1099 if (!is_gimple_assign (stmt) && !is_gimple_call (stmt))
1100 continue;
1102 /* Skip stmts that are not vectorized inside the loop. */
1103 if (stmt_info
1104 && !STMT_VINFO_RELEVANT_P (stmt_info)
1105 && (!STMT_VINFO_LIVE_P (stmt_info)
1106 || !VECTORIZABLE_CYCLE_DEF (STMT_VINFO_DEF_TYPE (stmt_info)))
1107 && !STMT_VINFO_IN_PATTERN_P (stmt_info))
1108 continue;
1110 vect_cost_for_stmt kind;
1111 if (STMT_VINFO_DATA_REF (stmt_info))
1113 if (DR_IS_READ (STMT_VINFO_DATA_REF (stmt_info)))
1114 kind = scalar_load;
1115 else
1116 kind = scalar_store;
1118 else
1119 kind = scalar_stmt;
1121 record_stmt_cost (&LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo),
1122 factor, kind, stmt_info, 0, vect_prologue);
1126 /* Now accumulate cost. */
1127 void *target_cost_data = init_cost (loop);
1128 stmt_info_for_cost *si;
1129 int j;
1130 FOR_EACH_VEC_ELT (LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo),
1131 j, si)
1132 (void) add_stmt_cost (target_cost_data, si->count,
1133 si->kind, si->stmt_info, si->misalign,
1134 vect_body);
1135 unsigned dummy, body_cost = 0;
1136 finish_cost (target_cost_data, &dummy, &body_cost, &dummy);
1137 destroy_cost_data (target_cost_data);
1138 LOOP_VINFO_SINGLE_SCALAR_ITERATION_COST (loop_vinfo) = body_cost;
1142 /* Function vect_analyze_loop_form_1.
1144 Verify that certain CFG restrictions hold, including:
1145 - the loop has a pre-header
1146 - the loop has a single entry and exit
1147 - the loop exit condition is simple enough
1148 - the number of iterations can be analyzed, i.e., the loop is countable. The
1149 niter could be analyzed under some assumptions. */
1151 opt_result
1152 vect_analyze_loop_form_1 (struct loop *loop, gcond **loop_cond,
1153 tree *assumptions, tree *number_of_iterationsm1,
1154 tree *number_of_iterations, gcond **inner_loop_cond)
1156 DUMP_VECT_SCOPE ("vect_analyze_loop_form");
1158 /* Different restrictions apply when we are considering an inner-most loop,
1159 vs. an outer (nested) loop.
1160 (FORNOW. May want to relax some of these restrictions in the future). */
1162 if (!loop->inner)
1164 /* Inner-most loop. We currently require that the number of BBs is
1165 exactly 2 (the header and latch). Vectorizable inner-most loops
1166 look like this:
1168 (pre-header)
1170 header <--------+
1171 | | |
1172 | +--> latch --+
1174 (exit-bb) */
1176 if (loop->num_nodes != 2)
1177 return opt_result::failure_at (vect_location,
1178 "not vectorized:"
1179 " control flow in loop.\n");
1181 if (empty_block_p (loop->header))
1182 return opt_result::failure_at (vect_location,
1183 "not vectorized: empty loop.\n");
1185 else
1187 struct loop *innerloop = loop->inner;
1188 edge entryedge;
1190 /* Nested loop. We currently require that the loop is doubly-nested,
1191 contains a single inner loop, and the number of BBs is exactly 5.
1192 Vectorizable outer-loops look like this:
1194 (pre-header)
1196 header <---+
1198 inner-loop |
1200 tail ------+
1202 (exit-bb)
1204 The inner-loop has the properties expected of inner-most loops
1205 as described above. */
1207 if ((loop->inner)->inner || (loop->inner)->next)
1208 return opt_result::failure_at (vect_location,
1209 "not vectorized:"
1210 " multiple nested loops.\n");
1212 if (loop->num_nodes != 5)
1213 return opt_result::failure_at (vect_location,
1214 "not vectorized:"
1215 " control flow in loop.\n");
1217 entryedge = loop_preheader_edge (innerloop);
1218 if (entryedge->src != loop->header
1219 || !single_exit (innerloop)
1220 || single_exit (innerloop)->dest != EDGE_PRED (loop->latch, 0)->src)
1221 return opt_result::failure_at (vect_location,
1222 "not vectorized:"
1223 " unsupported outerloop form.\n");
1225 /* Analyze the inner-loop. */
1226 tree inner_niterm1, inner_niter, inner_assumptions;
1227 opt_result res
1228 = vect_analyze_loop_form_1 (loop->inner, inner_loop_cond,
1229 &inner_assumptions, &inner_niterm1,
1230 &inner_niter, NULL);
1231 if (!res)
1233 if (dump_enabled_p ())
1234 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1235 "not vectorized: Bad inner loop.\n");
1236 return res;
1239 /* Don't support analyzing niter under assumptions for inner
1240 loop. */
1241 if (!integer_onep (inner_assumptions))
1242 return opt_result::failure_at (vect_location,
1243 "not vectorized: Bad inner loop.\n");
1245 if (!expr_invariant_in_loop_p (loop, inner_niter))
1246 return opt_result::failure_at (vect_location,
1247 "not vectorized: inner-loop count not"
1248 " invariant.\n");
1250 if (dump_enabled_p ())
1251 dump_printf_loc (MSG_NOTE, vect_location,
1252 "Considering outer-loop vectorization.\n");
1255 if (!single_exit (loop))
1256 return opt_result::failure_at (vect_location,
1257 "not vectorized: multiple exits.\n");
1258 if (EDGE_COUNT (loop->header->preds) != 2)
1259 return opt_result::failure_at (vect_location,
1260 "not vectorized:"
1261 " too many incoming edges.\n");
1263 /* We assume that the loop exit condition is at the end of the loop, i.e.,
1264 that the loop is represented as a do-while (with a proper if-guard
1265 before the loop if needed), where the loop header contains all the
1266 executable statements, and the latch is empty. */
1267 if (!empty_block_p (loop->latch)
1268 || !gimple_seq_empty_p (phi_nodes (loop->latch)))
1269 return opt_result::failure_at (vect_location,
1270 "not vectorized: latch block not empty.\n");
1272 /* Make sure the exit is not abnormal. */
1273 edge e = single_exit (loop);
1274 if (e->flags & EDGE_ABNORMAL)
1275 return opt_result::failure_at (vect_location,
1276 "not vectorized:"
1277 " abnormal loop exit edge.\n");
1279 *loop_cond = vect_get_loop_niters (loop, assumptions, number_of_iterations,
1280 number_of_iterationsm1);
1281 if (!*loop_cond)
1282 return opt_result::failure_at
1283 (vect_location,
1284 "not vectorized: complicated exit condition.\n");
1286 if (integer_zerop (*assumptions)
1287 || !*number_of_iterations
1288 || chrec_contains_undetermined (*number_of_iterations))
1289 return opt_result::failure_at
1290 (*loop_cond,
1291 "not vectorized: number of iterations cannot be computed.\n");
1293 if (integer_zerop (*number_of_iterations))
1294 return opt_result::failure_at
1295 (*loop_cond,
1296 "not vectorized: number of iterations = 0.\n");
1298 return opt_result::success ();
1301 /* Analyze LOOP form and return a loop_vec_info if it is of suitable form. */
1303 opt_loop_vec_info
1304 vect_analyze_loop_form (struct loop *loop, vec_info_shared *shared)
1306 tree assumptions, number_of_iterations, number_of_iterationsm1;
1307 gcond *loop_cond, *inner_loop_cond = NULL;
1309 opt_result res
1310 = vect_analyze_loop_form_1 (loop, &loop_cond,
1311 &assumptions, &number_of_iterationsm1,
1312 &number_of_iterations, &inner_loop_cond);
1313 if (!res)
1314 return opt_loop_vec_info::propagate_failure (res);
1316 loop_vec_info loop_vinfo = new _loop_vec_info (loop, shared);
1317 LOOP_VINFO_NITERSM1 (loop_vinfo) = number_of_iterationsm1;
1318 LOOP_VINFO_NITERS (loop_vinfo) = number_of_iterations;
1319 LOOP_VINFO_NITERS_UNCHANGED (loop_vinfo) = number_of_iterations;
1320 if (!integer_onep (assumptions))
1322 /* We consider vectorizing this loop by versioning it under
1323 some assumptions. In order to do this, we need to clear
1324 existing information computed by scev and niter analyzer. */
1325 scev_reset_htab ();
1326 free_numbers_of_iterations_estimates (loop);
1327 /* Also set a flag for this loop so that the following scev and niter
1328 analyses are done under the assumptions. */
1329 loop_constraint_set (loop, LOOP_C_FINITE);
1330 /* Also record the assumptions for versioning. */
1331 LOOP_VINFO_NITERS_ASSUMPTIONS (loop_vinfo) = assumptions;
1334 if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
1336 if (dump_enabled_p ())
1338 dump_printf_loc (MSG_NOTE, vect_location,
1339 "Symbolic number of iterations is ");
1340 dump_generic_expr (MSG_NOTE, TDF_DETAILS, number_of_iterations);
1341 dump_printf (MSG_NOTE, "\n");
1345 stmt_vec_info loop_cond_info = loop_vinfo->lookup_stmt (loop_cond);
1346 STMT_VINFO_TYPE (loop_cond_info) = loop_exit_ctrl_vec_info_type;
1347 if (inner_loop_cond)
1349 stmt_vec_info inner_loop_cond_info
1350 = loop_vinfo->lookup_stmt (inner_loop_cond);
1351 STMT_VINFO_TYPE (inner_loop_cond_info) = loop_exit_ctrl_vec_info_type;
1354 gcc_assert (!loop->aux);
1355 loop->aux = loop_vinfo;
1356 return opt_loop_vec_info::success (loop_vinfo);
1361 /* Scan the loop stmts and, depending on whether there are any (non-)SLP
1362 statements, update the vectorization factor. */
1364 static void
1365 vect_update_vf_for_slp (loop_vec_info loop_vinfo)
1367 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1368 basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
1369 int nbbs = loop->num_nodes;
1370 poly_uint64 vectorization_factor;
1371 int i;
1373 DUMP_VECT_SCOPE ("vect_update_vf_for_slp");
1375 vectorization_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
1376 gcc_assert (known_ne (vectorization_factor, 0U));
1378 /* If all the stmts in the loop can be SLPed, we perform only SLP, and the
1379 vectorization factor of the loop is the unrolling factor required by
1380 the SLP instances. If that unrolling factor is 1, we say that we
1381 perform pure SLP on the loop; cross-iteration parallelism is not
1382 exploited. */
1383 bool only_slp_in_loop = true;
1384 for (i = 0; i < nbbs; i++)
1386 basic_block bb = bbs[i];
1387 for (gimple_stmt_iterator si = gsi_start_bb (bb); !gsi_end_p (si);
1388 gsi_next (&si))
1390 stmt_vec_info stmt_info = loop_vinfo->lookup_stmt (gsi_stmt (si));
1391 stmt_info = vect_stmt_to_vectorize (stmt_info);
1392 if ((STMT_VINFO_RELEVANT_P (stmt_info)
1393 || VECTORIZABLE_CYCLE_DEF (STMT_VINFO_DEF_TYPE (stmt_info)))
1394 && !PURE_SLP_STMT (stmt_info))
1395 /* STMT needs both SLP and loop-based vectorization. */
1396 only_slp_in_loop = false;
1400 if (only_slp_in_loop)
1402 if (dump_enabled_p ())
1403 dump_printf_loc (MSG_NOTE, vect_location,
1404 "Loop contains only SLP stmts\n");
1405 vectorization_factor = LOOP_VINFO_SLP_UNROLLING_FACTOR (loop_vinfo);
1407 else
1409 if (dump_enabled_p ())
1410 dump_printf_loc (MSG_NOTE, vect_location,
1411 "Loop contains SLP and non-SLP stmts\n");
1412 /* Both the vectorization factor and unroll factor have the form
1413 current_vector_size * X for some rational X, so they must have
1414 a common multiple. */
1415 vectorization_factor
1416 = force_common_multiple (vectorization_factor,
1417 LOOP_VINFO_SLP_UNROLLING_FACTOR (loop_vinfo));
1420 LOOP_VINFO_VECT_FACTOR (loop_vinfo) = vectorization_factor;
1421 if (dump_enabled_p ())
1423 dump_printf_loc (MSG_NOTE, vect_location,
1424 "Updating vectorization factor to ");
1425 dump_dec (MSG_NOTE, vectorization_factor);
1426 dump_printf (MSG_NOTE, ".\n");
1430 /* Return true if STMT_INFO describes a double reduction phi and if
1431 the other phi in the reduction is also relevant for vectorization.
1432 This rejects cases such as:
1434 outer1:
1435 x_1 = PHI <x_3(outer2), ...>;
1438 inner:
1439 x_2 = ...;
1442 outer2:
1443 x_3 = PHI <x_2(inner)>;
1445 if nothing in x_2 or elsewhere makes x_1 relevant. */
1447 static bool
1448 vect_active_double_reduction_p (stmt_vec_info stmt_info)
1450 if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_double_reduction_def)
1451 return false;
1453 return STMT_VINFO_RELEVANT_P (STMT_VINFO_REDUC_DEF (stmt_info));
1456 /* Function vect_analyze_loop_operations.
1458 Scan the loop stmts and make sure they are all vectorizable. */
1460 static opt_result
1461 vect_analyze_loop_operations (loop_vec_info loop_vinfo)
1463 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1464 basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
1465 int nbbs = loop->num_nodes;
1466 int i;
1467 stmt_vec_info stmt_info;
1468 bool need_to_vectorize = false;
1469 bool ok;
1471 DUMP_VECT_SCOPE ("vect_analyze_loop_operations");
1473 stmt_vector_for_cost cost_vec;
1474 cost_vec.create (2);
1476 for (i = 0; i < nbbs; i++)
1478 basic_block bb = bbs[i];
1480 for (gphi_iterator si = gsi_start_phis (bb); !gsi_end_p (si);
1481 gsi_next (&si))
1483 gphi *phi = si.phi ();
1484 ok = true;
1486 stmt_info = loop_vinfo->lookup_stmt (phi);
1487 if (dump_enabled_p ())
1488 dump_printf_loc (MSG_NOTE, vect_location, "examining phi: %G", phi);
1489 if (virtual_operand_p (gimple_phi_result (phi)))
1490 continue;
1492 /* Inner-loop loop-closed exit phi in outer-loop vectorization
1493 (i.e., a phi in the tail of the outer-loop). */
1494 if (! is_loop_header_bb_p (bb))
1496 /* FORNOW: we currently don't support the case that these phis
1497 are not used in the outer loop (unless it is a double reduction,
1498 i.e., this phi is vect_reduction_def), because this case
1499 requires actually doing something here. */
1500 if (STMT_VINFO_LIVE_P (stmt_info)
1501 && !vect_active_double_reduction_p (stmt_info))
1502 return opt_result::failure_at (phi,
1503 "Unsupported loop-closed phi"
1504 " in outer-loop.\n");
1506 /* If PHI is used in the outer loop, we check that its operand
1507 is defined in the inner loop. */
1508 if (STMT_VINFO_RELEVANT_P (stmt_info))
1510 tree phi_op;
1512 if (gimple_phi_num_args (phi) != 1)
1513 return opt_result::failure_at (phi, "unsupported phi");
1515 phi_op = PHI_ARG_DEF (phi, 0);
1516 stmt_vec_info op_def_info = loop_vinfo->lookup_def (phi_op);
1517 if (!op_def_info)
1518 return opt_result::failure_at (phi, "unsupported phi");
1520 if (STMT_VINFO_RELEVANT (op_def_info) != vect_used_in_outer
1521 && (STMT_VINFO_RELEVANT (op_def_info)
1522 != vect_used_in_outer_by_reduction))
1523 return opt_result::failure_at (phi, "unsupported phi");
1526 continue;
1529 gcc_assert (stmt_info);
1531 if ((STMT_VINFO_RELEVANT (stmt_info) == vect_used_in_scope
1532 || STMT_VINFO_LIVE_P (stmt_info))
1533 && STMT_VINFO_DEF_TYPE (stmt_info) != vect_induction_def)
1534 /* A scalar-dependence cycle that we don't support. */
1535 return opt_result::failure_at (phi,
1536 "not vectorized:"
1537 " scalar dependence cycle.\n");
1539 if (STMT_VINFO_RELEVANT_P (stmt_info))
1541 need_to_vectorize = true;
1542 if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_induction_def
1543 && ! PURE_SLP_STMT (stmt_info))
1544 ok = vectorizable_induction (stmt_info, NULL, NULL, NULL,
1545 &cost_vec);
1546 else if ((STMT_VINFO_DEF_TYPE (stmt_info) == vect_reduction_def
1547 || STMT_VINFO_DEF_TYPE (stmt_info) == vect_nested_cycle)
1548 && ! PURE_SLP_STMT (stmt_info))
1549 ok = vectorizable_reduction (stmt_info, NULL, NULL, NULL, NULL,
1550 &cost_vec);
1553 /* SLP PHIs are tested by vect_slp_analyze_node_operations. */
1554 if (ok
1555 && STMT_VINFO_LIVE_P (stmt_info)
1556 && !PURE_SLP_STMT (stmt_info))
1557 ok = vectorizable_live_operation (stmt_info, NULL, NULL, -1, NULL,
1558 &cost_vec);
1560 if (!ok)
1561 return opt_result::failure_at (phi,
1562 "not vectorized: relevant phi not "
1563 "supported: %G",
1564 static_cast <gimple *> (phi));
1567 for (gimple_stmt_iterator si = gsi_start_bb (bb); !gsi_end_p (si);
1568 gsi_next (&si))
1570 gimple *stmt = gsi_stmt (si);
1571 if (!gimple_clobber_p (stmt))
1573 opt_result res
1574 = vect_analyze_stmt (loop_vinfo->lookup_stmt (stmt),
1575 &need_to_vectorize,
1576 NULL, NULL, &cost_vec);
1577 if (!res)
1578 return res;
1581 } /* bbs */
1583 add_stmt_costs (loop_vinfo->target_cost_data, &cost_vec);
1584 cost_vec.release ();
1586 /* All operations in the loop are either irrelevant (deal with loop
1587 control, or dead), or only used outside the loop and can be moved
1588 out of the loop (e.g. invariants, inductions). The loop can be
1589 optimized away by scalar optimizations. We're better off not
1590 touching this loop. */
1591 if (!need_to_vectorize)
1593 if (dump_enabled_p ())
1594 dump_printf_loc (MSG_NOTE, vect_location,
1595 "All the computation can be taken out of the loop.\n");
1596 return opt_result::failure_at
1597 (vect_location,
1598 "not vectorized: redundant loop. no profit to vectorize.\n");
1601 return opt_result::success ();
1604 /* Analyze the cost of the loop described by LOOP_VINFO. Decide if it
1605 is worthwhile to vectorize. Return 1 if definitely yes, 0 if
1606 definitely no, or -1 if it's worth retrying. */
1608 static int
1609 vect_analyze_loop_costing (loop_vec_info loop_vinfo)
1611 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1612 unsigned int assumed_vf = vect_vf_for_cost (loop_vinfo);
1614 /* Only fully-masked loops can have iteration counts less than the
1615 vectorization factor. */
1616 if (!LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
1618 HOST_WIDE_INT max_niter;
1620 if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
1621 max_niter = LOOP_VINFO_INT_NITERS (loop_vinfo);
1622 else
1623 max_niter = max_stmt_executions_int (loop);
1625 if (max_niter != -1
1626 && (unsigned HOST_WIDE_INT) max_niter < assumed_vf)
1628 if (dump_enabled_p ())
1629 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1630 "not vectorized: iteration count smaller than "
1631 "vectorization factor.\n");
1632 return 0;
1636 int min_profitable_iters, min_profitable_estimate;
1637 vect_estimate_min_profitable_iters (loop_vinfo, &min_profitable_iters,
1638 &min_profitable_estimate);
1640 if (min_profitable_iters < 0)
1642 if (dump_enabled_p ())
1643 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1644 "not vectorized: vectorization not profitable.\n");
1645 if (dump_enabled_p ())
1646 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1647 "not vectorized: vector version will never be "
1648 "profitable.\n");
1649 return -1;
1652 int min_scalar_loop_bound = (PARAM_VALUE (PARAM_MIN_VECT_LOOP_BOUND)
1653 * assumed_vf);
1655 /* Use the cost model only if it is more conservative than the user-specified
1656 threshold. */
1657 unsigned int th = (unsigned) MAX (min_scalar_loop_bound,
1658 min_profitable_iters);
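/* Editorial example (not from the original source): with
   --param min-vect-loop-bound=2, an assumed VF of 4 and
   min_profitable_iters of 10, min_scalar_loop_bound is 8 and th becomes
   MAX (8, 10) = 10, so loops with a known iteration count below 10 are
   rejected by the check below.  */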
1660 LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo) = th;
1662 if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
1663 && LOOP_VINFO_INT_NITERS (loop_vinfo) < th)
1665 if (dump_enabled_p ())
1666 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1667 "not vectorized: vectorization not profitable.\n");
1668 if (dump_enabled_p ())
1669 dump_printf_loc (MSG_NOTE, vect_location,
1670 "not vectorized: iteration count smaller than user "
1671 "specified loop bound parameter or minimum profitable "
1672 "iterations (whichever is more conservative).\n");
1673 return 0;
1676 HOST_WIDE_INT estimated_niter = estimated_stmt_executions_int (loop);
1677 if (estimated_niter == -1)
1678 estimated_niter = likely_max_stmt_executions_int (loop);
1679 if (estimated_niter != -1
1680 && ((unsigned HOST_WIDE_INT) estimated_niter
1681 < MAX (th, (unsigned) min_profitable_estimate)))
1683 if (dump_enabled_p ())
1684 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1685 "not vectorized: estimated iteration count too "
1686 "small.\n");
1687 if (dump_enabled_p ())
1688 dump_printf_loc (MSG_NOTE, vect_location,
1689 "not vectorized: estimated iteration count smaller "
1690 "than specified loop bound parameter or minimum "
1691 "profitable iterations (whichever is more "
1692 "conservative).\n");
1693 return -1;
1696 return 1;
1699 static opt_result
1700 vect_get_datarefs_in_loop (loop_p loop, basic_block *bbs,
1701 vec<data_reference_p> *datarefs,
1702 unsigned int *n_stmts)
1704 *n_stmts = 0;
1705 for (unsigned i = 0; i < loop->num_nodes; i++)
1706 for (gimple_stmt_iterator gsi = gsi_start_bb (bbs[i]);
1707 !gsi_end_p (gsi); gsi_next (&gsi))
1709 gimple *stmt = gsi_stmt (gsi);
1710 if (is_gimple_debug (stmt))
1711 continue;
1712 ++(*n_stmts);
1713 opt_result res = vect_find_stmt_data_reference (loop, stmt, datarefs);
1714 if (!res)
1716 if (is_gimple_call (stmt) && loop->safelen)
1718 tree fndecl = gimple_call_fndecl (stmt), op;
1719 if (fndecl != NULL_TREE)
1721 cgraph_node *node = cgraph_node::get (fndecl);
1722 if (node != NULL && node->simd_clones != NULL)
1724 unsigned int j, n = gimple_call_num_args (stmt);
1725 for (j = 0; j < n; j++)
1727 op = gimple_call_arg (stmt, j);
1728 if (DECL_P (op)
1729 || (REFERENCE_CLASS_P (op)
1730 && get_base_address (op)))
1731 break;
1733 op = gimple_call_lhs (stmt);
1734 /* Ignore #pragma omp declare simd functions
1735 if they don't have data references in the
1736 call stmt itself. */
1737 if (j == n
1738 && !(op
1739 && (DECL_P (op)
1740 || (REFERENCE_CLASS_P (op)
1741 && get_base_address (op)))))
1742 continue;
1746 return res;
1748 /* If dependence analysis would give up due to the limit on the
1749 number of datarefs, stop here and fail fatally. */
1750 if (datarefs->length ()
1751 > (unsigned)PARAM_VALUE (PARAM_LOOP_MAX_DATAREFS_FOR_DATADEPS))
1752 return opt_result::failure_at (stmt, "exceeded param "
1753 "loop-max-datarefs-for-datadeps\n");
1755 return opt_result::success ();
1758 /* Function vect_analyze_loop_2.
1760 Apply a set of analyses on LOOP, and create a loop_vec_info struct
1761 for it. The different analyses will record information in the
1762 loop_vec_info struct. */
1763 static opt_result
1764 vect_analyze_loop_2 (loop_vec_info loop_vinfo, bool &fatal, unsigned *n_stmts)
1766 opt_result ok = opt_result::success ();
1767 int res;
1768 unsigned int max_vf = MAX_VECTORIZATION_FACTOR;
1769 poly_uint64 min_vf = 2;
1771 /* The first group of checks is independent of the vector size. */
1772 fatal = true;
1774 /* Find all data references in the loop (which correspond to vdefs/vuses)
1775 and analyze their evolution in the loop. */
1777 loop_p loop = LOOP_VINFO_LOOP (loop_vinfo);
1779 /* Gather the data references and count stmts in the loop. */
1780 if (!LOOP_VINFO_DATAREFS (loop_vinfo).exists ())
1782 opt_result res
1783 = vect_get_datarefs_in_loop (loop, LOOP_VINFO_BBS (loop_vinfo),
1784 &LOOP_VINFO_DATAREFS (loop_vinfo),
1785 n_stmts);
1786 if (!res)
1788 if (dump_enabled_p ())
1789 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1790 "not vectorized: loop contains function "
1791 "calls or data references that cannot "
1792 "be analyzed\n");
1793 return res;
1795 loop_vinfo->shared->save_datarefs ();
1797 else
1798 loop_vinfo->shared->check_datarefs ();
1800 /* Analyze the data references and also adjust the minimal
1801 vectorization factor according to the loads and stores. */
1803 ok = vect_analyze_data_refs (loop_vinfo, &min_vf);
1804 if (!ok)
1806 if (dump_enabled_p ())
1807 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1808 "bad data references.\n");
1809 return ok;
1812 /* Classify all cross-iteration scalar data-flow cycles.
1813 Cross-iteration cycles caused by virtual phis are analyzed separately. */
1814 vect_analyze_scalar_cycles (loop_vinfo);
1816 vect_pattern_recog (loop_vinfo);
1818 vect_fixup_scalar_cycles_with_patterns (loop_vinfo);
1820 /* Analyze the access patterns of the data-refs in the loop (consecutive,
1821 complex, etc.). FORNOW: Only handle consecutive access pattern. */
1823 ok = vect_analyze_data_ref_accesses (loop_vinfo);
1824 if (!ok)
1826 if (dump_enabled_p ())
1827 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1828 "bad data access.\n");
1829 return ok;
1832 /* Data-flow analysis to detect stmts that do not need to be vectorized. */
1834 ok = vect_mark_stmts_to_be_vectorized (loop_vinfo);
1835 if (!ok)
1837 if (dump_enabled_p ())
1838 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1839 "unexpected pattern.\n");
1840 return ok;
1843 /* Even though the rest of the analysis below depends on the checks
above in some way, failures from here on are no longer fatal. */
1844 fatal = false;
1846 /* Analyze data dependences between the data-refs in the loop
1847 and adjust the maximum vectorization factor according to
1848 the dependences.
1849 FORNOW: fail at the first data dependence that we encounter. */
1851 ok = vect_analyze_data_ref_dependences (loop_vinfo, &max_vf);
1852 if (!ok)
1854 if (dump_enabled_p ())
1855 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1856 "bad data dependence.\n");
1857 return ok;
1859 if (max_vf != MAX_VECTORIZATION_FACTOR
1860 && maybe_lt (max_vf, min_vf))
1861 return opt_result::failure_at (vect_location, "bad data dependence.\n");
1862 LOOP_VINFO_MAX_VECT_FACTOR (loop_vinfo) = max_vf;
1864 ok = vect_determine_vectorization_factor (loop_vinfo);
1865 if (!ok)
1867 if (dump_enabled_p ())
1868 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1869 "can't determine vectorization factor.\n");
1870 return ok;
1872 if (max_vf != MAX_VECTORIZATION_FACTOR
1873 && maybe_lt (max_vf, LOOP_VINFO_VECT_FACTOR (loop_vinfo)))
1874 return opt_result::failure_at (vect_location, "bad data dependence.\n");
1876 /* Compute the scalar iteration cost. */
1877 vect_compute_single_scalar_iteration_cost (loop_vinfo);
1879 poly_uint64 saved_vectorization_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
1880 unsigned th;
1882 /* Check the SLP opportunities in the loop, analyze and build SLP trees. */
1883 ok = vect_analyze_slp (loop_vinfo, *n_stmts);
1884 if (!ok)
1885 return ok;
1887 /* If there are any SLP instances mark them as pure_slp. */
1888 bool slp = vect_make_slp_decision (loop_vinfo);
1889 if (slp)
1891 /* Find stmts that need to be both vectorized and SLPed. */
1892 vect_detect_hybrid_slp (loop_vinfo);
1894 /* Update the vectorization factor based on the SLP decision. */
1895 vect_update_vf_for_slp (loop_vinfo);
1898 bool saved_can_fully_mask_p = LOOP_VINFO_CAN_FULLY_MASK_P (loop_vinfo);
1900 /* We don't expect to have to roll back to anything other than an empty
1901 set of rgroups. */
1902 gcc_assert (LOOP_VINFO_MASKS (loop_vinfo).is_empty ());
1904 /* This is the point where we can re-start analysis with SLP forced off. */
1905 start_over:
1907 /* Now the vectorization factor is final. */
1908 poly_uint64 vectorization_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
1909 gcc_assert (known_ne (vectorization_factor, 0U));
1911 if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo) && dump_enabled_p ())
1913 dump_printf_loc (MSG_NOTE, vect_location,
1914 "vectorization_factor = ");
1915 dump_dec (MSG_NOTE, vectorization_factor);
1916 dump_printf (MSG_NOTE, ", niters = %wd\n",
1917 LOOP_VINFO_INT_NITERS (loop_vinfo));
1920 HOST_WIDE_INT max_niter
1921 = likely_max_stmt_executions_int (LOOP_VINFO_LOOP (loop_vinfo));
1923 /* Analyze the alignment of the data-refs in the loop.
1924 Fail if a data reference is found that cannot be vectorized. */
1926 ok = vect_analyze_data_refs_alignment (loop_vinfo);
1927 if (!ok)
1929 if (dump_enabled_p ())
1930 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1931 "bad data alignment.\n");
1932 return ok;
1935 /* Prune the list of ddrs to be tested at run-time by versioning for alias.
1936 It is important to call pruning after vect_analyze_data_ref_accesses,
1937 since we use grouping information gathered by interleaving analysis. */
1938 ok = vect_prune_runtime_alias_test_list (loop_vinfo);
1939 if (!ok)
1940 return ok;
1942 /* Do not invoke vect_enhance_data_refs_alignment for epilogue
1943 vectorization, since we do not want to add extra peeling or
1944 versioning for alignment. */
1945 if (!LOOP_VINFO_EPILOGUE_P (loop_vinfo))
1946 /* This pass will decide on using loop versioning and/or loop peeling in
1947 order to enhance the alignment of data references in the loop. */
1948 ok = vect_enhance_data_refs_alignment (loop_vinfo);
1949 else
1950 ok = vect_verify_datarefs_alignment (loop_vinfo);
1951 if (!ok)
1952 return ok;
1954 if (slp)
1956 /* Analyze operations in the SLP instances. Note this may
1957 remove unsupported SLP instances which makes the above
1958 SLP kind detection invalid. */
1959 unsigned old_size = LOOP_VINFO_SLP_INSTANCES (loop_vinfo).length ();
1960 vect_slp_analyze_operations (loop_vinfo);
1961 if (LOOP_VINFO_SLP_INSTANCES (loop_vinfo).length () != old_size)
1963 ok = opt_result::failure_at (vect_location,
1964 "unsupported SLP instances\n");
1965 goto again;
1969 /* Scan all the remaining operations in the loop that are not subject
1970 to SLP and make sure they are vectorizable. */
1971 ok = vect_analyze_loop_operations (loop_vinfo);
1972 if (!ok)
1974 if (dump_enabled_p ())
1975 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1976 "bad operation or unsupported loop bound.\n");
1977 return ok;
1980 /* Decide whether to use a fully-masked loop for this vectorization
1981 factor. */
1982 LOOP_VINFO_FULLY_MASKED_P (loop_vinfo)
1983 = (LOOP_VINFO_CAN_FULLY_MASK_P (loop_vinfo)
1984 && vect_verify_full_masking (loop_vinfo));
1985 if (dump_enabled_p ())
1987 if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
1988 dump_printf_loc (MSG_NOTE, vect_location,
1989 "using a fully-masked loop.\n");
1990 else
1991 dump_printf_loc (MSG_NOTE, vect_location,
1992 "not using a fully-masked loop.\n");
1995 /* If an epilog loop is required because of data accesses with gaps,
1996 one additional iteration needs to be peeled. Check if there are
1997 enough iterations for vectorization. */
1998 if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo)
1999 && LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
2000 && !LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
2002 poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
2003 tree scalar_niters = LOOP_VINFO_NITERSM1 (loop_vinfo);
2005 if (known_lt (wi::to_widest (scalar_niters), vf))
2006 return opt_result::failure_at (vect_location,
2007 "loop has no enough iterations to"
2008 " support peeling for gaps.\n");
2011 /* Check that the costings of the loop make vectorizing worthwhile. */
2012 res = vect_analyze_loop_costing (loop_vinfo);
2013 if (res < 0)
2015 ok = opt_result::failure_at (vect_location,
2016 "Loop costings may not be worthwhile.\n");
2017 goto again;
2019 if (!res)
2020 return opt_result::failure_at (vect_location,
2021 "Loop costings not worthwhile.\n");
2023 /* Decide whether we need to create an epilogue loop to handle
2024 remaining scalar iterations. */
2025 th = LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo);
2027 unsigned HOST_WIDE_INT const_vf;
2028 if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
2029 /* The main loop handles all iterations. */
2030 LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo) = false;
2031 else if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
2032 && LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) >= 0)
2034 /* Work out the (constant) number of iterations that need to be
2035 peeled for reasons other than niters. */
2036 unsigned int peel_niter = LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo);
2037 if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo))
2038 peel_niter += 1;
2039 if (!multiple_p (LOOP_VINFO_INT_NITERS (loop_vinfo) - peel_niter,
2040 LOOP_VINFO_VECT_FACTOR (loop_vinfo)))
2041 LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo) = true;
2043 else if (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo)
2044 /* ??? When peeling for gaps but not alignment, we could
2045 try to check whether the (variable) niters is known to be
2046 VF * N + 1. That's something of a niche case though. */
2047 || LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo)
2048 || !LOOP_VINFO_VECT_FACTOR (loop_vinfo).is_constant (&const_vf)
2049 || ((tree_ctz (LOOP_VINFO_NITERS (loop_vinfo))
2050 < (unsigned) exact_log2 (const_vf))
2051 /* In case of versioning, check if the maximum number of
2052 iterations is greater than th. If they are identical,
2053 the epilogue is unnecessary. */
2054 && (!LOOP_REQUIRES_VERSIONING (loop_vinfo)
2055 || ((unsigned HOST_WIDE_INT) max_niter
2056 > (th / const_vf) * const_vf))))
2057 LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo) = true;
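/* Worked example with purely illustrative numbers: for known niters = 100,
   a vectorization factor of 8, no peeling for alignment and no peeling for
   gaps, peel_niter is 0 and 100 - 0 is not a multiple of 8, so
   LOOP_VINFO_PEELING_FOR_NITER is set and an epilogue loop will handle the
   remaining 100 % 8 = 4 scalar iterations.  */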
2059 /* If an epilogue loop is required make sure we can create one. */
2060 if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo)
2061 || LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo))
2063 if (dump_enabled_p ())
2064 dump_printf_loc (MSG_NOTE, vect_location, "epilog loop required\n");
2065 if (!vect_can_advance_ivs_p (loop_vinfo)
2066 || !slpeel_can_duplicate_loop_p (LOOP_VINFO_LOOP (loop_vinfo),
2067 single_exit (LOOP_VINFO_LOOP
2068 (loop_vinfo))))
2070 ok = opt_result::failure_at (vect_location,
2071 "not vectorized: can't create required "
2072 "epilog loop\n");
2073 goto again;
2077 /* During peeling, we need to check if the number of loop iterations is
2078 enough for both the peeled prolog loop and the vector loop. This check
2079 can be merged with the threshold check of loop versioning, so
2080 increase the threshold for this case if necessary. */
2081 if (LOOP_REQUIRES_VERSIONING (loop_vinfo))
2083 poly_uint64 niters_th = 0;
2085 if (!vect_use_loop_mask_for_alignment_p (loop_vinfo))
2087 /* Niters for peeled prolog loop. */
2088 if (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) < 0)
2090 dr_vec_info *dr_info = LOOP_VINFO_UNALIGNED_DR (loop_vinfo);
2091 tree vectype = STMT_VINFO_VECTYPE (dr_info->stmt);
2092 niters_th += TYPE_VECTOR_SUBPARTS (vectype) - 1;
2094 else
2095 niters_th += LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo);
2098 /* Niters for at least one iteration of vectorized loop. */
2099 if (!LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
2100 niters_th += LOOP_VINFO_VECT_FACTOR (loop_vinfo);
2101 /* One additional iteration because of peeling for gaps. */
2102 if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo))
2103 niters_th += 1;
2104 LOOP_VINFO_VERSIONING_THRESHOLD (loop_vinfo) = niters_th;
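/* Worked example with purely illustrative numbers: with a vectorization
   factor of 4, a known prologue peel of 3 iterations for alignment and
   peeling for gaps required, niters_th = 3 + 4 + 1 = 8, i.e. the runtime
   versioning check requires enough iterations for the peeled prologue,
   at least one vector iteration and the extra gap iteration.  */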
2107 gcc_assert (known_eq (vectorization_factor,
2108 LOOP_VINFO_VECT_FACTOR (loop_vinfo)));
2110 /* Ok to vectorize! */
2111 return opt_result::success ();
2113 again:
2114 /* Ensure that "ok" is false (with an opt_problem if dumping is enabled). */
2115 gcc_assert (!ok);
2117 /* Try again with SLP forced off but if we didn't do any SLP there is
2118 no point in re-trying. */
2119 if (!slp)
2120 return ok;
2122 /* If there are reduction chains re-trying will fail anyway. */
2123 if (! LOOP_VINFO_REDUCTION_CHAINS (loop_vinfo).is_empty ())
2124 return ok;
2126 /* Likewise if the grouped loads or stores in the SLP cannot be handled
2127 via interleaving or lane instructions. */
2128 slp_instance instance;
2129 slp_tree node;
2130 unsigned i, j;
2131 FOR_EACH_VEC_ELT (LOOP_VINFO_SLP_INSTANCES (loop_vinfo), i, instance)
2133 stmt_vec_info vinfo;
2134 vinfo = SLP_TREE_SCALAR_STMTS (SLP_INSTANCE_TREE (instance))[0];
2135 if (! STMT_VINFO_GROUPED_ACCESS (vinfo))
2136 continue;
2137 vinfo = DR_GROUP_FIRST_ELEMENT (vinfo);
2138 unsigned int size = DR_GROUP_SIZE (vinfo);
2139 tree vectype = STMT_VINFO_VECTYPE (vinfo);
2140 if (! vect_store_lanes_supported (vectype, size, false)
2141 && ! known_eq (TYPE_VECTOR_SUBPARTS (vectype), 1U)
2142 && ! vect_grouped_store_supported (vectype, size))
2143 return opt_result::failure_at (vinfo->stmt,
2144 "unsupported grouped store\n");
2145 FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (instance), j, node)
2147 vinfo = SLP_TREE_SCALAR_STMTS (node)[0];
2148 vinfo = DR_GROUP_FIRST_ELEMENT (vinfo);
2149 bool single_element_p = !DR_GROUP_NEXT_ELEMENT (vinfo);
2150 size = DR_GROUP_SIZE (vinfo);
2151 vectype = STMT_VINFO_VECTYPE (vinfo);
2152 if (! vect_load_lanes_supported (vectype, size, false)
2153 && ! vect_grouped_load_supported (vectype, single_element_p,
2154 size))
2155 return opt_result::failure_at (vinfo->stmt,
2156 "unsupported grouped load\n");
2160 if (dump_enabled_p ())
2161 dump_printf_loc (MSG_NOTE, vect_location,
2162 "re-trying with SLP disabled\n");
2164 /* Roll back state appropriately. No SLP this time. */
2165 slp = false;
2166 /* Restore the vectorization factor as it would be without SLP. */
2167 LOOP_VINFO_VECT_FACTOR (loop_vinfo) = saved_vectorization_factor;
2168 /* Free the SLP instances. */
2169 FOR_EACH_VEC_ELT (LOOP_VINFO_SLP_INSTANCES (loop_vinfo), j, instance)
2170 vect_free_slp_instance (instance, false);
2171 LOOP_VINFO_SLP_INSTANCES (loop_vinfo).release ();
2172 /* Reset SLP type to loop_vect on all stmts. */
2173 for (i = 0; i < LOOP_VINFO_LOOP (loop_vinfo)->num_nodes; ++i)
2175 basic_block bb = LOOP_VINFO_BBS (loop_vinfo)[i];
2176 for (gimple_stmt_iterator si = gsi_start_phis (bb);
2177 !gsi_end_p (si); gsi_next (&si))
2179 stmt_vec_info stmt_info = loop_vinfo->lookup_stmt (gsi_stmt (si));
2180 STMT_SLP_TYPE (stmt_info) = loop_vect;
2182 for (gimple_stmt_iterator si = gsi_start_bb (bb);
2183 !gsi_end_p (si); gsi_next (&si))
2185 stmt_vec_info stmt_info = loop_vinfo->lookup_stmt (gsi_stmt (si));
2186 STMT_SLP_TYPE (stmt_info) = loop_vect;
2187 if (STMT_VINFO_IN_PATTERN_P (stmt_info))
2189 gimple *pattern_def_seq = STMT_VINFO_PATTERN_DEF_SEQ (stmt_info);
2190 stmt_info = STMT_VINFO_RELATED_STMT (stmt_info);
2191 STMT_SLP_TYPE (stmt_info) = loop_vect;
2192 for (gimple_stmt_iterator pi = gsi_start (pattern_def_seq);
2193 !gsi_end_p (pi); gsi_next (&pi))
2194 STMT_SLP_TYPE (loop_vinfo->lookup_stmt (gsi_stmt (pi)))
2195 = loop_vect;
2199 /* Free optimized alias test DDRS. */
2200 LOOP_VINFO_LOWER_BOUNDS (loop_vinfo).truncate (0);
2201 LOOP_VINFO_COMP_ALIAS_DDRS (loop_vinfo).release ();
2202 LOOP_VINFO_CHECK_UNEQUAL_ADDRS (loop_vinfo).release ();
2203 /* Reset target cost data. */
2204 destroy_cost_data (LOOP_VINFO_TARGET_COST_DATA (loop_vinfo));
2205 LOOP_VINFO_TARGET_COST_DATA (loop_vinfo)
2206 = init_cost (LOOP_VINFO_LOOP (loop_vinfo));
2207 /* Reset accumulated rgroup information. */
2208 release_vec_loop_masks (&LOOP_VINFO_MASKS (loop_vinfo));
2209 /* Reset assorted flags. */
2210 LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo) = false;
2211 LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo) = false;
2212 LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo) = 0;
2213 LOOP_VINFO_VERSIONING_THRESHOLD (loop_vinfo) = 0;
2214 LOOP_VINFO_CAN_FULLY_MASK_P (loop_vinfo) = saved_can_fully_mask_p;
2216 goto start_over;
2219 /* Function vect_analyze_loop.
2221 Apply a set of analyses on LOOP, and create a loop_vec_info struct
2222 for it. The different analyses will record information in the
2223 loop_vec_info struct. If ORIG_LOOP_VINFO is not NULL, the epilogue must
2224 be vectorized. */
2225 opt_loop_vec_info
2226 vect_analyze_loop (struct loop *loop, loop_vec_info orig_loop_vinfo,
2227 vec_info_shared *shared)
2229 auto_vector_sizes vector_sizes;
2231 /* Autodetect first vector size we try. */
2232 current_vector_size = 0;
2233 targetm.vectorize.autovectorize_vector_sizes (&vector_sizes);
2234 unsigned int next_size = 0;
2236 DUMP_VECT_SCOPE ("analyze_loop_nest");
2238 if (loop_outer (loop)
2239 && loop_vec_info_for_loop (loop_outer (loop))
2240 && LOOP_VINFO_VECTORIZABLE_P (loop_vec_info_for_loop (loop_outer (loop))))
2241 return opt_loop_vec_info::failure_at (vect_location,
2242 "outer-loop already vectorized.\n");
2244 if (!find_loop_nest (loop, &shared->loop_nest))
2245 return opt_loop_vec_info::failure_at
2246 (vect_location,
2247 "not vectorized: loop nest containing two or more consecutive inner"
2248 " loops cannot be vectorized\n");
2250 unsigned n_stmts = 0;
2251 poly_uint64 autodetected_vector_size = 0;
2252 while (1)
2254 /* Check the CFG characteristics of the loop (nesting, entry/exit). */
2255 opt_loop_vec_info loop_vinfo
2256 = vect_analyze_loop_form (loop, shared);
2257 if (!loop_vinfo)
2259 if (dump_enabled_p ())
2260 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2261 "bad loop form.\n");
2262 return loop_vinfo;
2265 bool fatal = false;
2267 if (orig_loop_vinfo)
2268 LOOP_VINFO_ORIG_LOOP_INFO (loop_vinfo) = orig_loop_vinfo;
2270 opt_result res = vect_analyze_loop_2 (loop_vinfo, fatal, &n_stmts);
2271 if (res)
2273 LOOP_VINFO_VECTORIZABLE_P (loop_vinfo) = 1;
2275 return loop_vinfo;
2278 delete loop_vinfo;
2280 if (next_size == 0)
2281 autodetected_vector_size = current_vector_size;
2283 if (next_size < vector_sizes.length ()
2284 && known_eq (vector_sizes[next_size], autodetected_vector_size))
2285 next_size += 1;
2287 if (fatal
2288 || next_size == vector_sizes.length ()
2289 || known_eq (current_vector_size, 0U))
2290 return opt_loop_vec_info::propagate_failure (res);
2292 /* Try the next biggest vector size. */
2293 current_vector_size = vector_sizes[next_size++];
2294 if (dump_enabled_p ())
2296 dump_printf_loc (MSG_NOTE, vect_location,
2297 "***** Re-trying analysis with "
2298 "vector size ");
2299 dump_dec (MSG_NOTE, current_vector_size);
2300 dump_printf (MSG_NOTE, "\n");
2305 /* Return true if there is an in-order reduction function for CODE, storing
2306 it in *REDUC_FN if so. */
2308 static bool
2309 fold_left_reduction_fn (tree_code code, internal_fn *reduc_fn)
2311 switch (code)
2313 case PLUS_EXPR:
2314 *reduc_fn = IFN_FOLD_LEFT_PLUS;
2315 return true;
2317 default:
2318 return false;
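/* Illustrative example (added annotation): a float accumulation such as

     float s = 0.0f;
     for (int i = 0; i < n; i++)
       s += x[i];

   compiled without -fassociative-math must preserve the original
   left-to-right evaluation order, so the only in-order reduction handled
   here is PLUS_EXPR, mapped to IFN_FOLD_LEFT_PLUS when the target
   provides it.  */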
2322 /* Function reduction_fn_for_scalar_code
2324 Input:
2325 CODE - tree_code of a reduction operation.
2327 Output:
2328 REDUC_FN - the corresponding internal function to be used to reduce the
2329 vector of partial results into a single scalar result, or IFN_LAST
2330 if the operation is a supported reduction operation, but does not have
2331 such an internal function.
2333 Return FALSE if CODE currently cannot be vectorized as a reduction. */
2335 static bool
2336 reduction_fn_for_scalar_code (enum tree_code code, internal_fn *reduc_fn)
2338 switch (code)
2340 case MAX_EXPR:
2341 *reduc_fn = IFN_REDUC_MAX;
2342 return true;
2344 case MIN_EXPR:
2345 *reduc_fn = IFN_REDUC_MIN;
2346 return true;
2348 case PLUS_EXPR:
2349 *reduc_fn = IFN_REDUC_PLUS;
2350 return true;
2352 case BIT_AND_EXPR:
2353 *reduc_fn = IFN_REDUC_AND;
2354 return true;
2356 case BIT_IOR_EXPR:
2357 *reduc_fn = IFN_REDUC_IOR;
2358 return true;
2360 case BIT_XOR_EXPR:
2361 *reduc_fn = IFN_REDUC_XOR;
2362 return true;
2364 case MULT_EXPR:
2365 case MINUS_EXPR:
2366 *reduc_fn = IFN_LAST;
2367 return true;
2369 default:
2370 return false;
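/* Illustrative example (added annotation): a max reduction such as

     int m = INT_MIN;
     for (int i = 0; i < n; i++)
       m = m > x[i] ? m : x[i];

   uses MAX_EXPR and hence IFN_REDUC_MAX to reduce the vector of partial
   maxima to a scalar.  A product reduction (MULT_EXPR) is still accepted
   but gets IFN_LAST, so the epilogue has to reduce the vector by other
   means, for instance an element-shift-and-multiply sequence.  */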
2374 /* If there is a neutral value X such that SLP reduction NODE would not
2375 be affected by the introduction of additional X elements, return that X,
2376 otherwise return null. CODE is the code of the reduction. REDUC_CHAIN
2377 is true if the SLP statements perform a single reduction, false if each
2378 statement performs an independent reduction. */
2380 static tree
2381 neutral_op_for_slp_reduction (slp_tree slp_node, tree_code code,
2382 bool reduc_chain)
2384 vec<stmt_vec_info> stmts = SLP_TREE_SCALAR_STMTS (slp_node);
2385 stmt_vec_info stmt_vinfo = stmts[0];
2386 tree vector_type = STMT_VINFO_VECTYPE (stmt_vinfo);
2387 tree scalar_type = TREE_TYPE (vector_type);
2388 struct loop *loop = gimple_bb (stmt_vinfo->stmt)->loop_father;
2389 gcc_assert (loop);
2391 switch (code)
2393 case WIDEN_SUM_EXPR:
2394 case DOT_PROD_EXPR:
2395 case SAD_EXPR:
2396 case PLUS_EXPR:
2397 case MINUS_EXPR:
2398 case BIT_IOR_EXPR:
2399 case BIT_XOR_EXPR:
2400 return build_zero_cst (scalar_type);
2402 case MULT_EXPR:
2403 return build_one_cst (scalar_type);
2405 case BIT_AND_EXPR:
2406 return build_all_ones_cst (scalar_type);
2408 case MAX_EXPR:
2409 case MIN_EXPR:
2410 /* For MIN/MAX the initial values are neutral. A reduction chain
2411 has only a single initial value, so that value is neutral for
2412 all statements. */
2413 if (reduc_chain)
2414 return PHI_ARG_DEF_FROM_EDGE (stmt_vinfo->stmt,
2415 loop_preheader_edge (loop));
2416 return NULL_TREE;
2418 default:
2419 return NULL_TREE;
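/* Illustrative example (added annotation): when

     for (int i = 0; i < n; i++)
       {
         s0 += x[2 * i];
         s1 += x[2 * i + 1];
       }

   is SLP-vectorized, extra vector lanes beyond the two scalar
   accumulators can be initialized with the neutral value returned here
   without changing the result: 0 for PLUS_EXPR, 1 for MULT_EXPR,
   all-ones for BIT_AND_EXPR, and so on.  */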
2423 /* Error reporting helper for vect_is_simple_reduction below. GIMPLE statement
2424 STMT is printed with a message MSG. */
2426 static void
2427 report_vect_op (dump_flags_t msg_type, gimple *stmt, const char *msg)
2429 dump_printf_loc (msg_type, vect_location, "%s%G", msg, stmt);
2432 /* DEF_STMT_INFO occurs in a loop that contains a potential reduction
2433 operation. Return true if the results of DEF_STMT_INFO are something
2434 that can be accumulated by such a reduction. */
2436 static bool
2437 vect_valid_reduction_input_p (stmt_vec_info def_stmt_info)
2439 return (is_gimple_assign (def_stmt_info->stmt)
2440 || is_gimple_call (def_stmt_info->stmt)
2441 || STMT_VINFO_DEF_TYPE (def_stmt_info) == vect_induction_def
2442 || (gimple_code (def_stmt_info->stmt) == GIMPLE_PHI
2443 && STMT_VINFO_DEF_TYPE (def_stmt_info) == vect_internal_def
2444 && !is_loop_header_bb_p (gimple_bb (def_stmt_info->stmt))));
2447 /* Detect SLP reduction of the form:
2449 #a1 = phi <a5, a0>
2450 a2 = operation (a1)
2451 a3 = operation (a2)
2452 a4 = operation (a3)
2453 a5 = operation (a4)
2455 #a = phi <a5>
2457 PHI is the reduction phi node (#a1 = phi <a5, a0> above)
2458 FIRST_STMT is the first reduction stmt in the chain
2459 (a2 = operation (a1)).
2461 Return TRUE if a reduction chain was detected. */
2463 static bool
2464 vect_is_slp_reduction (loop_vec_info loop_info, gimple *phi,
2465 gimple *first_stmt)
2467 struct loop *loop = (gimple_bb (phi))->loop_father;
2468 struct loop *vect_loop = LOOP_VINFO_LOOP (loop_info);
2469 enum tree_code code;
2470 gimple *loop_use_stmt = NULL;
2471 stmt_vec_info use_stmt_info;
2472 tree lhs;
2473 imm_use_iterator imm_iter;
2474 use_operand_p use_p;
2475 int nloop_uses, size = 0, n_out_of_loop_uses;
2476 bool found = false;
2478 if (loop != vect_loop)
2479 return false;
2481 auto_vec<stmt_vec_info, 8> reduc_chain;
2482 lhs = PHI_RESULT (phi);
2483 code = gimple_assign_rhs_code (first_stmt);
2484 while (1)
2486 nloop_uses = 0;
2487 n_out_of_loop_uses = 0;
2488 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, lhs)
2490 gimple *use_stmt = USE_STMT (use_p);
2491 if (is_gimple_debug (use_stmt))
2492 continue;
2494 /* Check if we got back to the reduction phi. */
2495 if (use_stmt == phi)
2497 loop_use_stmt = use_stmt;
2498 found = true;
2499 break;
2502 if (flow_bb_inside_loop_p (loop, gimple_bb (use_stmt)))
2504 loop_use_stmt = use_stmt;
2505 nloop_uses++;
2507 else
2508 n_out_of_loop_uses++;
2510 /* There can be either a single use in the loop or two uses in
2511 phi nodes. */
2512 if (nloop_uses > 1 || (n_out_of_loop_uses && nloop_uses))
2513 return false;
2516 if (found)
2517 break;
2519 /* We reached a statement with no loop uses. */
2520 if (nloop_uses == 0)
2521 return false;
2523 /* This is a loop exit phi, and we haven't reached the reduction phi. */
2524 if (gimple_code (loop_use_stmt) == GIMPLE_PHI)
2525 return false;
2527 if (!is_gimple_assign (loop_use_stmt)
2528 || code != gimple_assign_rhs_code (loop_use_stmt)
2529 || !flow_bb_inside_loop_p (loop, gimple_bb (loop_use_stmt)))
2530 return false;
2532 /* Insert USE_STMT into reduction chain. */
2533 use_stmt_info = loop_info->lookup_stmt (loop_use_stmt);
2534 reduc_chain.safe_push (use_stmt_info);
2536 lhs = gimple_assign_lhs (loop_use_stmt);
2537 size++;
2540 if (!found || loop_use_stmt != phi || size < 2)
2541 return false;
2543 /* Swap the operands, if needed, to make the reduction operand be the second
2544 operand. */
2545 lhs = PHI_RESULT (phi);
2546 for (unsigned i = 0; i < reduc_chain.length (); ++i)
2548 gassign *next_stmt = as_a <gassign *> (reduc_chain[i]->stmt);
2549 if (gimple_assign_rhs2 (next_stmt) == lhs)
2551 tree op = gimple_assign_rhs1 (next_stmt);
2552 stmt_vec_info def_stmt_info = loop_info->lookup_def (op);
2554 /* Check that the other def is either defined in the loop
2555 ("vect_internal_def"), or it's an induction (defined by a
2556 loop-header phi-node). */
2557 if (def_stmt_info
2558 && flow_bb_inside_loop_p (loop, gimple_bb (def_stmt_info->stmt))
2559 && vect_valid_reduction_input_p (def_stmt_info))
2561 lhs = gimple_assign_lhs (next_stmt);
2562 continue;
2565 return false;
2567 else
2569 tree op = gimple_assign_rhs2 (next_stmt);
2570 stmt_vec_info def_stmt_info = loop_info->lookup_def (op);
2572 /* Check that the other def is either defined in the loop
2573 ("vect_internal_def"), or it's an induction (defined by a
2574 loop-header phi-node). */
2575 if (def_stmt_info
2576 && flow_bb_inside_loop_p (loop, gimple_bb (def_stmt_info->stmt))
2577 && vect_valid_reduction_input_p (def_stmt_info))
2579 if (dump_enabled_p ())
2580 dump_printf_loc (MSG_NOTE, vect_location, "swapping oprnds: %G",
2581 next_stmt);
2583 swap_ssa_operands (next_stmt,
2584 gimple_assign_rhs1_ptr (next_stmt),
2585 gimple_assign_rhs2_ptr (next_stmt));
2586 update_stmt (next_stmt);
2588 if (CONSTANT_CLASS_P (gimple_assign_rhs1 (next_stmt)))
2589 LOOP_VINFO_OPERANDS_SWAPPED (loop_info) = true;
2591 else
2592 return false;
2595 lhs = gimple_assign_lhs (next_stmt);
2598 /* Build up the actual chain. */
2599 for (unsigned i = 0; i < reduc_chain.length () - 1; ++i)
2601 REDUC_GROUP_FIRST_ELEMENT (reduc_chain[i]) = reduc_chain[0];
2602 REDUC_GROUP_NEXT_ELEMENT (reduc_chain[i]) = reduc_chain[i+1];
2604 REDUC_GROUP_FIRST_ELEMENT (reduc_chain.last ()) = reduc_chain[0];
2605 REDUC_GROUP_NEXT_ELEMENT (reduc_chain.last ()) = NULL;
2607 /* Save the chain for further analysis in SLP detection. */
2608 LOOP_VINFO_REDUCTION_CHAINS (loop_info).safe_push (reduc_chain[0]);
2609 REDUC_GROUP_SIZE (reduc_chain[0]) = size;
2611 return true;
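/* Illustrative source-level example (added annotation) of a reduction
   chain detected above:

     int s = 0;
     for (int i = 0; i < n; i++)
       {
         s += a[i];
         s += b[i];
         s += c[i];
       }

   Each iteration feeds the accumulator through three PLUS_EXPR stmts, so
   a chain of size 3 is recorded in LOOP_VINFO_REDUCTION_CHAINS for later
   SLP analysis.  */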
2614 /* Return true if we need an in-order reduction for operation CODE
2615 on type TYPE. NEED_WRAPPING_INTEGRAL_OVERFLOW is true if integer
2616 overflow must wrap. */
2618 static bool
2619 needs_fold_left_reduction_p (tree type, tree_code code,
2620 bool need_wrapping_integral_overflow)
2622 /* CHECKME: check for !flag_finite_math_only too? */
2623 if (SCALAR_FLOAT_TYPE_P (type))
2624 switch (code)
2626 case MIN_EXPR:
2627 case MAX_EXPR:
2628 return false;
2630 default:
2631 return !flag_associative_math;
2634 if (INTEGRAL_TYPE_P (type))
2636 if (!operation_no_trapping_overflow (type, code))
2637 return true;
2638 if (need_wrapping_integral_overflow
2639 && !TYPE_OVERFLOW_WRAPS (type)
2640 && operation_can_overflow (code))
2641 return true;
2642 return false;
2645 if (SAT_FIXED_POINT_TYPE_P (type))
2646 return true;
2648 return false;
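/* Worked examples (added annotation): a signed int addition compiled with
   -ftrapv can trap on overflow, so it needs an in-order reduction; the
   same addition on an unsigned (wrapping) type does not; a float addition
   needs one unless -fassociative-math allows reassociation; and any
   saturating fixed-point type always does.  */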
2651 /* Return true if the reduction PHI in LOOP with latch arg LOOP_ARG and
2652 reduction operation CODE has a handled computation expression. */
2654 bool
2655 check_reduction_path (dump_user_location_t loc, loop_p loop, gphi *phi,
2656 tree loop_arg, enum tree_code code)
2658 auto_vec<std::pair<ssa_op_iter, use_operand_p> > path;
2659 auto_bitmap visited;
2660 tree lookfor = PHI_RESULT (phi);
2661 ssa_op_iter curri;
2662 use_operand_p curr = op_iter_init_phiuse (&curri, phi, SSA_OP_USE);
2663 while (USE_FROM_PTR (curr) != loop_arg)
2664 curr = op_iter_next_use (&curri);
2665 curri.i = curri.numops;
2668 path.safe_push (std::make_pair (curri, curr));
2669 tree use = USE_FROM_PTR (curr);
2670 if (use == lookfor)
2671 break;
2672 gimple *def = SSA_NAME_DEF_STMT (use);
2673 if (gimple_nop_p (def)
2674 || ! flow_bb_inside_loop_p (loop, gimple_bb (def)))
2676 pop:
2679 std::pair<ssa_op_iter, use_operand_p> x = path.pop ();
2680 curri = x.first;
2681 curr = x.second;
2683 curr = op_iter_next_use (&curri);
2684 /* Skip already visited or non-SSA operands (from iterating
2685 over PHI args). */
2686 while (curr != NULL_USE_OPERAND_P
2687 && (TREE_CODE (USE_FROM_PTR (curr)) != SSA_NAME
2688 || ! bitmap_set_bit (visited,
2689 SSA_NAME_VERSION
2690 (USE_FROM_PTR (curr)))));
2692 while (curr == NULL_USE_OPERAND_P && ! path.is_empty ());
2693 if (curr == NULL_USE_OPERAND_P)
2694 break;
2696 else
2698 if (gimple_code (def) == GIMPLE_PHI)
2699 curr = op_iter_init_phiuse (&curri, as_a <gphi *>(def), SSA_OP_USE);
2700 else
2701 curr = op_iter_init_use (&curri, def, SSA_OP_USE);
2702 while (curr != NULL_USE_OPERAND_P
2703 && (TREE_CODE (USE_FROM_PTR (curr)) != SSA_NAME
2704 || ! bitmap_set_bit (visited,
2705 SSA_NAME_VERSION
2706 (USE_FROM_PTR (curr)))))
2707 curr = op_iter_next_use (&curri);
2708 if (curr == NULL_USE_OPERAND_P)
2709 goto pop;
2712 while (1);
2713 if (dump_file && (dump_flags & TDF_DETAILS))
2715 dump_printf_loc (MSG_NOTE, loc, "reduction path: ");
2716 unsigned i;
2717 std::pair<ssa_op_iter, use_operand_p> *x;
2718 FOR_EACH_VEC_ELT (path, i, x)
2719 dump_printf (MSG_NOTE, "%T ", USE_FROM_PTR (x->second));
2720 dump_printf (MSG_NOTE, "\n");
2723 /* Check whether the reduction path detected is valid. */
2724 bool fail = path.length () == 0;
2725 bool neg = false;
2726 for (unsigned i = 1; i < path.length (); ++i)
2728 gimple *use_stmt = USE_STMT (path[i].second);
2729 tree op = USE_FROM_PTR (path[i].second);
2730 if (! has_single_use (op)
2731 || ! is_gimple_assign (use_stmt))
2733 fail = true;
2734 break;
2736 if (gimple_assign_rhs_code (use_stmt) != code)
2738 if (code == PLUS_EXPR
2739 && gimple_assign_rhs_code (use_stmt) == MINUS_EXPR)
2741 /* Track whether we negate the reduction value each iteration. */
2742 if (gimple_assign_rhs2 (use_stmt) == op)
2743 neg = ! neg;
2745 else
2747 fail = true;
2748 break;
2752 return ! fail && ! neg;
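/* Illustrative example (added annotation): for

     for (int i = 0; i < n; i++)
       s = (s + a[i]) + b[i];

   the path from the PHI result back to the latch value consists of two
   PLUS_EXPR stmts whose intermediate results have single uses, so the
   check succeeds.  A variant such as "s = b[i] - (s + a[i])", where the
   running value is negated every iteration, ends with NEG set and is
   rejected.  */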
2756 /* Function vect_is_simple_reduction
2758 (1) Detect a cross-iteration def-use cycle that represents a simple
2759 reduction computation. We look for the following pattern:
2761 loop_header:
2762 a1 = phi < a0, a2 >
2763 a3 = ...
2764 a2 = operation (a3, a1)
2768 a3 = ...
2769 loop_header:
2770 a1 = phi < a0, a2 >
2771 a2 = operation (a3, a1)
2773 such that:
2774 1. operation is commutative and associative and it is safe to
2775 change the order of the computation
2776 2. no uses for a2 in the loop (a2 is used out of the loop)
2777 3. no uses of a1 in the loop besides the reduction operation
2778 4. no uses of a1 outside the loop.
2780 Conditions 1,4 are tested here.
2781 Conditions 2,3 are tested in vect_mark_stmts_to_be_vectorized.
2783 (2) Detect a cross-iteration def-use cycle in nested loops, i.e.,
2784 nested cycles.
2786 (3) Detect cycles of phi nodes in outer-loop vectorization, i.e., double
2787 reductions:
2789 a1 = phi < a0, a2 >
2790 inner loop (def of a3)
2791 a2 = phi < a3 >
2793 (4) Detect condition expressions, i.e.:
2794 for (int i = 0; i < N; i++)
2795 if (a[i] < val)
2796 ret_val = a[i];
2800 static stmt_vec_info
2801 vect_is_simple_reduction (loop_vec_info loop_info, stmt_vec_info phi_info,
2802 bool *double_reduc,
2803 bool need_wrapping_integral_overflow,
2804 enum vect_reduction_type *v_reduc_type)
2806 gphi *phi = as_a <gphi *> (phi_info->stmt);
2807 struct loop *loop = (gimple_bb (phi))->loop_father;
2808 struct loop *vect_loop = LOOP_VINFO_LOOP (loop_info);
2809 bool nested_in_vect_loop = flow_loop_nested_p (vect_loop, loop);
2810 gimple *phi_use_stmt = NULL;
2811 enum tree_code orig_code, code;
2812 tree op1, op2, op3 = NULL_TREE, op4 = NULL_TREE;
2813 tree type;
2814 tree name;
2815 imm_use_iterator imm_iter;
2816 use_operand_p use_p;
2817 bool phi_def;
2819 *double_reduc = false;
2820 *v_reduc_type = TREE_CODE_REDUCTION;
2822 tree phi_name = PHI_RESULT (phi);
2823 /* ??? If there are no uses of the PHI result the inner loop reduction
2824 won't be detected as possibly double-reduction by vectorizable_reduction
2825 because that tries to walk the PHI arg from the preheader edge which
2826 can be constant. See PR60382. */
2827 if (has_zero_uses (phi_name))
2828 return NULL;
2829 unsigned nphi_def_loop_uses = 0;
2830 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, phi_name)
2832 gimple *use_stmt = USE_STMT (use_p);
2833 if (is_gimple_debug (use_stmt))
2834 continue;
2836 if (!flow_bb_inside_loop_p (loop, gimple_bb (use_stmt)))
2838 if (dump_enabled_p ())
2839 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2840 "intermediate value used outside loop.\n");
2842 return NULL;
2845 nphi_def_loop_uses++;
2846 phi_use_stmt = use_stmt;
2849 edge latch_e = loop_latch_edge (loop);
2850 tree loop_arg = PHI_ARG_DEF_FROM_EDGE (phi, latch_e);
2851 if (TREE_CODE (loop_arg) != SSA_NAME)
2853 if (dump_enabled_p ())
2854 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2855 "reduction: not ssa_name: %T\n", loop_arg);
2856 return NULL;
2859 stmt_vec_info def_stmt_info = loop_info->lookup_def (loop_arg);
2860 if (!def_stmt_info
2861 || !flow_bb_inside_loop_p (loop, gimple_bb (def_stmt_info->stmt)))
2862 return NULL;
2864 if (gassign *def_stmt = dyn_cast <gassign *> (def_stmt_info->stmt))
2866 name = gimple_assign_lhs (def_stmt);
2867 phi_def = false;
2869 else if (gphi *def_stmt = dyn_cast <gphi *> (def_stmt_info->stmt))
2871 name = PHI_RESULT (def_stmt);
2872 phi_def = true;
2874 else
2876 if (dump_enabled_p ())
2877 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2878 "reduction: unhandled reduction operation: %G",
2879 def_stmt_info->stmt);
2880 return NULL;
2883 unsigned nlatch_def_loop_uses = 0;
2884 auto_vec<gphi *, 3> lcphis;
2885 bool inner_loop_of_double_reduc = false;
2886 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, name)
2888 gimple *use_stmt = USE_STMT (use_p);
2889 if (is_gimple_debug (use_stmt))
2890 continue;
2891 if (flow_bb_inside_loop_p (loop, gimple_bb (use_stmt)))
2892 nlatch_def_loop_uses++;
2893 else
2895 /* We can have more than one loop-closed PHI. */
2896 lcphis.safe_push (as_a <gphi *> (use_stmt));
2897 if (nested_in_vect_loop
2898 && (STMT_VINFO_DEF_TYPE (loop_info->lookup_stmt (use_stmt))
2899 == vect_double_reduction_def))
2900 inner_loop_of_double_reduc = true;
2904 /* If this isn't a nested cycle, or if the nested cycle reduction value
2905 is used outside of the inner loop, we cannot handle uses of the reduction
2906 value. */
2907 if ((!nested_in_vect_loop || inner_loop_of_double_reduc)
2908 && (nlatch_def_loop_uses > 1 || nphi_def_loop_uses > 1))
2910 if (dump_enabled_p ())
2911 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2912 "reduction used in loop.\n");
2913 return NULL;
2916 /* If DEF_STMT is a phi node itself, we expect it to have a single argument
2917 defined in the inner loop. */
2918 if (phi_def)
2920 gphi *def_stmt = as_a <gphi *> (def_stmt_info->stmt);
2921 op1 = PHI_ARG_DEF (def_stmt, 0);
2923 if (gimple_phi_num_args (def_stmt) != 1
2924 || TREE_CODE (op1) != SSA_NAME)
2926 if (dump_enabled_p ())
2927 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2928 "unsupported phi node definition.\n");
2930 return NULL;
2933 gimple *def1 = SSA_NAME_DEF_STMT (op1);
2934 if (gimple_bb (def1)
2935 && flow_bb_inside_loop_p (loop, gimple_bb (def_stmt))
2936 && loop->inner
2937 && flow_bb_inside_loop_p (loop->inner, gimple_bb (def1))
2938 && is_gimple_assign (def1)
2939 && is_a <gphi *> (phi_use_stmt)
2940 && flow_bb_inside_loop_p (loop->inner, gimple_bb (phi_use_stmt)))
2942 if (dump_enabled_p ())
2943 report_vect_op (MSG_NOTE, def_stmt,
2944 "detected double reduction: ");
2946 *double_reduc = true;
2947 return def_stmt_info;
2950 return NULL;
2953 /* If we are vectorizing an inner reduction, we execute it in the
2954 original order only when we are not dealing with a double
2955 reduction. */
2956 bool check_reduction = true;
2957 if (flow_loop_nested_p (vect_loop, loop))
2959 gphi *lcphi;
2960 unsigned i;
2961 check_reduction = false;
2962 FOR_EACH_VEC_ELT (lcphis, i, lcphi)
2963 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, gimple_phi_result (lcphi))
2965 gimple *use_stmt = USE_STMT (use_p);
2966 if (is_gimple_debug (use_stmt))
2967 continue;
2968 if (! flow_bb_inside_loop_p (vect_loop, gimple_bb (use_stmt)))
2969 check_reduction = true;
2973 gassign *def_stmt = as_a <gassign *> (def_stmt_info->stmt);
2974 code = orig_code = gimple_assign_rhs_code (def_stmt);
2976 if (nested_in_vect_loop && !check_reduction)
2978 /* FIXME: Even for non-reductions, code generation is funneled
2979 through vectorizable_reduction for the stmt defining the
2980 PHI latch value. So we have to artificially restrict ourselves
2981 to the supported operations. */
2982 switch (get_gimple_rhs_class (code))
2984 case GIMPLE_BINARY_RHS:
2985 case GIMPLE_TERNARY_RHS:
2986 break;
2987 default:
2988 /* Not supported by vectorizable_reduction. */
2989 if (dump_enabled_p ())
2990 report_vect_op (MSG_MISSED_OPTIMIZATION, def_stmt,
2991 "nested cycle: not handled operation: ");
2992 return NULL;
2994 if (dump_enabled_p ())
2995 report_vect_op (MSG_NOTE, def_stmt, "detected nested cycle: ");
2996 return def_stmt_info;
2999 /* We can handle "res -= x[i]", which is non-associative, by
3000 simply rewriting it into "res += -x[i]". Avoid changing the
3001 gimple instruction for the first simple tests and only do this
3002 if we're allowed to change the code at all. */
3003 if (code == MINUS_EXPR && gimple_assign_rhs2 (def_stmt) != phi_name)
3004 code = PLUS_EXPR;
3006 if (code == COND_EXPR)
3008 if (! nested_in_vect_loop)
3009 *v_reduc_type = COND_REDUCTION;
3011 op3 = gimple_assign_rhs1 (def_stmt);
3012 if (COMPARISON_CLASS_P (op3))
3014 op4 = TREE_OPERAND (op3, 1);
3015 op3 = TREE_OPERAND (op3, 0);
3017 if (op3 == phi_name || op4 == phi_name)
3019 if (dump_enabled_p ())
3020 report_vect_op (MSG_MISSED_OPTIMIZATION, def_stmt,
3021 "reduction: condition depends on previous"
3022 " iteration: ");
3023 return NULL;
3026 op1 = gimple_assign_rhs2 (def_stmt);
3027 op2 = gimple_assign_rhs3 (def_stmt);
3029 else if (!commutative_tree_code (code) || !associative_tree_code (code))
3031 if (dump_enabled_p ())
3032 report_vect_op (MSG_MISSED_OPTIMIZATION, def_stmt,
3033 "reduction: not commutative/associative: ");
3034 return NULL;
3036 else if (get_gimple_rhs_class (code) == GIMPLE_BINARY_RHS)
3038 op1 = gimple_assign_rhs1 (def_stmt);
3039 op2 = gimple_assign_rhs2 (def_stmt);
3041 else
3043 if (dump_enabled_p ())
3044 report_vect_op (MSG_MISSED_OPTIMIZATION, def_stmt,
3045 "reduction: not handled operation: ");
3046 return NULL;
3049 if (TREE_CODE (op1) != SSA_NAME && TREE_CODE (op2) != SSA_NAME)
3051 if (dump_enabled_p ())
3052 report_vect_op (MSG_MISSED_OPTIMIZATION, def_stmt,
3053 "reduction: both uses not ssa_names: ");
3055 return NULL;
3058 type = TREE_TYPE (gimple_assign_lhs (def_stmt));
3059 if ((TREE_CODE (op1) == SSA_NAME
3060 && !types_compatible_p (type, TREE_TYPE (op1)))
3061 || (TREE_CODE (op2) == SSA_NAME
3062 && !types_compatible_p (type, TREE_TYPE (op2)))
3063 || (op3 && TREE_CODE (op3) == SSA_NAME
3064 && !types_compatible_p (type, TREE_TYPE (op3)))
3065 || (op4 && TREE_CODE (op4) == SSA_NAME
3066 && !types_compatible_p (type, TREE_TYPE (op4))))
3068 if (dump_enabled_p ())
3070 dump_printf_loc (MSG_NOTE, vect_location,
3071 "reduction: multiple types: operation type: "
3072 "%T, operands types: %T,%T",
3073 type, TREE_TYPE (op1), TREE_TYPE (op2));
3074 if (op3)
3075 dump_printf (MSG_NOTE, ",%T", TREE_TYPE (op3));
3077 if (op4)
3078 dump_printf (MSG_NOTE, ",%T", TREE_TYPE (op4));
3079 dump_printf (MSG_NOTE, "\n");
3082 return NULL;
3085 /* Check whether it's ok to change the order of the computation.
3086 Generally, when vectorizing a reduction we change the order of the
3087 computation. This may change the behavior of the program in some
3088 cases, so we need to check that this is ok. One exception is when
3089 vectorizing an outer-loop: the inner-loop is executed sequentially,
3090 and therefore vectorizing reductions in the inner-loop during
3091 outer-loop vectorization is safe. */
3092 if (check_reduction
3093 && *v_reduc_type == TREE_CODE_REDUCTION
3094 && needs_fold_left_reduction_p (type, code,
3095 need_wrapping_integral_overflow))
3096 *v_reduc_type = FOLD_LEFT_REDUCTION;
3098 /* Reduction is safe. We're dealing with one of the following:
3099 1) integer arithmetic and no trapv
3100 2) floating point arithmetic, and special flags permit this optimization
3101 3) nested cycle (i.e., outer loop vectorization). */
3102 stmt_vec_info def1_info = loop_info->lookup_def (op1);
3103 stmt_vec_info def2_info = loop_info->lookup_def (op2);
3104 if (code != COND_EXPR && !def1_info && !def2_info)
3106 if (dump_enabled_p ())
3107 report_vect_op (MSG_NOTE, def_stmt, "reduction: no defs for operands: ");
3108 return NULL;
3111 /* Check that one def is the reduction def, defined by PHI,
3112 the other def is either defined in the loop ("vect_internal_def"),
3113 or it's an induction (defined by a loop-header phi-node). */
3115 if (def2_info
3116 && def2_info->stmt == phi
3117 && (code == COND_EXPR
3118 || !def1_info
3119 || !flow_bb_inside_loop_p (loop, gimple_bb (def1_info->stmt))
3120 || vect_valid_reduction_input_p (def1_info)))
3122 if (dump_enabled_p ())
3123 report_vect_op (MSG_NOTE, def_stmt, "detected reduction: ");
3124 return def_stmt_info;
3127 if (def1_info
3128 && def1_info->stmt == phi
3129 && (code == COND_EXPR
3130 || !def2_info
3131 || !flow_bb_inside_loop_p (loop, gimple_bb (def2_info->stmt))
3132 || vect_valid_reduction_input_p (def2_info)))
3134 if (! nested_in_vect_loop && orig_code != MINUS_EXPR)
3136 /* Check if we can swap operands (just for simplicity - so that
3137 the rest of the code can assume that the reduction variable
3138 is always the last (second) argument). */
3139 if (code == COND_EXPR)
3141 /* Swap cond_expr by inverting the condition. */
3142 tree cond_expr = gimple_assign_rhs1 (def_stmt);
3143 enum tree_code invert_code = ERROR_MARK;
3144 enum tree_code cond_code = TREE_CODE (cond_expr);
3146 if (TREE_CODE_CLASS (cond_code) == tcc_comparison)
3148 bool honor_nans = HONOR_NANS (TREE_OPERAND (cond_expr, 0));
3149 invert_code = invert_tree_comparison (cond_code, honor_nans);
3151 if (invert_code != ERROR_MARK)
3153 TREE_SET_CODE (cond_expr, invert_code);
3154 swap_ssa_operands (def_stmt,
3155 gimple_assign_rhs2_ptr (def_stmt),
3156 gimple_assign_rhs3_ptr (def_stmt));
3158 else
3160 if (dump_enabled_p ())
3161 report_vect_op (MSG_NOTE, def_stmt,
3162 "detected reduction: cannot swap operands "
3163 "for cond_expr");
3164 return NULL;
3167 else
3168 swap_ssa_operands (def_stmt, gimple_assign_rhs1_ptr (def_stmt),
3169 gimple_assign_rhs2_ptr (def_stmt));
3171 if (dump_enabled_p ())
3172 report_vect_op (MSG_NOTE, def_stmt,
3173 "detected reduction: need to swap operands: ");
3175 if (CONSTANT_CLASS_P (gimple_assign_rhs1 (def_stmt)))
3176 LOOP_VINFO_OPERANDS_SWAPPED (loop_info) = true;
3178 else
3180 if (dump_enabled_p ())
3181 report_vect_op (MSG_NOTE, def_stmt, "detected reduction: ");
3184 return def_stmt_info;
3187 /* Try to find SLP reduction chain. */
3188 if (! nested_in_vect_loop
3189 && code != COND_EXPR
3190 && orig_code != MINUS_EXPR
3191 && vect_is_slp_reduction (loop_info, phi, def_stmt))
3193 if (dump_enabled_p ())
3194 report_vect_op (MSG_NOTE, def_stmt,
3195 "reduction: detected reduction chain: ");
3197 return def_stmt_info;
3200 /* Look for the expression computing loop_arg from loop PHI result. */
3201 if (check_reduction_path (vect_location, loop, phi, loop_arg, code))
3202 return def_stmt_info;
3204 if (dump_enabled_p ())
3206 report_vect_op (MSG_MISSED_OPTIMIZATION, def_stmt,
3207 "reduction: unknown pattern: ");
3210 return NULL;
3213 /* Wrapper around vect_is_simple_reduction, which will modify code
3214 in-place if it enables detection of more reductions. Arguments
3215 as there. */
3217 stmt_vec_info
3218 vect_force_simple_reduction (loop_vec_info loop_info, stmt_vec_info phi_info,
3219 bool *double_reduc,
3220 bool need_wrapping_integral_overflow)
3222 enum vect_reduction_type v_reduc_type;
3223 stmt_vec_info def_info
3224 = vect_is_simple_reduction (loop_info, phi_info, double_reduc,
3225 need_wrapping_integral_overflow,
3226 &v_reduc_type);
3227 if (def_info)
3229 STMT_VINFO_REDUC_TYPE (phi_info) = v_reduc_type;
3230 STMT_VINFO_REDUC_DEF (phi_info) = def_info;
3231 STMT_VINFO_REDUC_TYPE (def_info) = v_reduc_type;
3232 STMT_VINFO_REDUC_DEF (def_info) = phi_info;
3234 return def_info;
3237 /* Calculate cost of peeling the loop PEEL_ITERS_PROLOGUE times. */
3238 int
3239 vect_get_known_peeling_cost (loop_vec_info loop_vinfo, int peel_iters_prologue,
3240 int *peel_iters_epilogue,
3241 stmt_vector_for_cost *scalar_cost_vec,
3242 stmt_vector_for_cost *prologue_cost_vec,
3243 stmt_vector_for_cost *epilogue_cost_vec)
3245 int retval = 0;
3246 int assumed_vf = vect_vf_for_cost (loop_vinfo);
3248 if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
3250 *peel_iters_epilogue = assumed_vf / 2;
3251 if (dump_enabled_p ())
3252 dump_printf_loc (MSG_NOTE, vect_location,
3253 "cost model: epilogue peel iters set to vf/2 "
3254 "because loop iterations are unknown .\n");
3256 /* If peeled iterations are known but the number of scalar loop
3257 iterations is unknown, count a taken branch per peeled loop. */
3258 retval = record_stmt_cost (prologue_cost_vec, 1, cond_branch_taken,
3259 NULL, 0, vect_prologue);
3260 retval = record_stmt_cost (prologue_cost_vec, 1, cond_branch_taken,
3261 NULL, 0, vect_epilogue);
3263 else
3265 int niters = LOOP_VINFO_INT_NITERS (loop_vinfo);
3266 peel_iters_prologue = niters < peel_iters_prologue ?
3267 niters : peel_iters_prologue;
3268 *peel_iters_epilogue = (niters - peel_iters_prologue) % assumed_vf;
3269 /* If we need to peel for gaps, but no epilogue peeling is otherwise
3270 required, we have to peel VF iterations. */
3271 if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo) && !*peel_iters_epilogue)
3272 *peel_iters_epilogue = assumed_vf;
3275 stmt_info_for_cost *si;
3276 int j;
3277 if (peel_iters_prologue)
3278 FOR_EACH_VEC_ELT (*scalar_cost_vec, j, si)
3279 retval += record_stmt_cost (prologue_cost_vec,
3280 si->count * peel_iters_prologue,
3281 si->kind, si->stmt_info, si->misalign,
3282 vect_prologue);
3283 if (*peel_iters_epilogue)
3284 FOR_EACH_VEC_ELT (*scalar_cost_vec, j, si)
3285 retval += record_stmt_cost (epilogue_cost_vec,
3286 si->count * *peel_iters_epilogue,
3287 si->kind, si->stmt_info, si->misalign,
3288 vect_epilogue);
3290 return retval;
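/* Worked example with purely illustrative numbers: for known niters = 100,
   an assumed vectorization factor of 8 and a prologue peel of 3
   iterations, the epilogue peel count is (100 - 3) % 8 = 1; the scalar
   cost vector is then charged 3 times into PROLOGUE_COST_VEC and once
   into EPILOGUE_COST_VEC.  */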
3293 /* Function vect_estimate_min_profitable_iters
3295 Return the number of iterations required for the vector version of the
3296 loop to be profitable relative to the cost of the scalar version of the
3297 loop.
3299 *RET_MIN_PROFITABLE_NITERS is a cost model profitability threshold
3300 of iterations for vectorization. -1 value means loop vectorization
3301 is not profitable. This returned value may be used for dynamic
3302 profitability check.
3304 *RET_MIN_PROFITABLE_ESTIMATE is a profitability threshold to be used
3305 for static check against estimated number of iterations. */
3307 static void
3308 vect_estimate_min_profitable_iters (loop_vec_info loop_vinfo,
3309 int *ret_min_profitable_niters,
3310 int *ret_min_profitable_estimate)
3312 int min_profitable_iters;
3313 int min_profitable_estimate;
3314 int peel_iters_prologue;
3315 int peel_iters_epilogue;
3316 unsigned vec_inside_cost = 0;
3317 int vec_outside_cost = 0;
3318 unsigned vec_prologue_cost = 0;
3319 unsigned vec_epilogue_cost = 0;
3320 int scalar_single_iter_cost = 0;
3321 int scalar_outside_cost = 0;
3322 int assumed_vf = vect_vf_for_cost (loop_vinfo);
3323 int npeel = LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo);
3324 void *target_cost_data = LOOP_VINFO_TARGET_COST_DATA (loop_vinfo);
3326 /* Cost model disabled. */
3327 if (unlimited_cost_model (LOOP_VINFO_LOOP (loop_vinfo)))
3329 if (dump_enabled_p ())
3330 dump_printf_loc (MSG_NOTE, vect_location, "cost model disabled.\n");
3331 *ret_min_profitable_niters = 0;
3332 *ret_min_profitable_estimate = 0;
3333 return;
3336 /* Requires loop versioning tests to handle misalignment. */
3337 if (LOOP_REQUIRES_VERSIONING_FOR_ALIGNMENT (loop_vinfo))
3339 /* FIXME: Make cost depend on complexity of individual check. */
3340 unsigned len = LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).length ();
3341 (void) add_stmt_cost (target_cost_data, len, vector_stmt, NULL, 0,
3342 vect_prologue);
3343 if (dump_enabled_p ())
3344 dump_printf (MSG_NOTE,
3345 "cost model: Adding cost of checks for loop "
3346 "versioning to treat misalignment.\n");
3349 /* Requires loop versioning with alias checks. */
3350 if (LOOP_REQUIRES_VERSIONING_FOR_ALIAS (loop_vinfo))
3352 /* FIXME: Make cost depend on complexity of individual check. */
3353 unsigned len = LOOP_VINFO_COMP_ALIAS_DDRS (loop_vinfo).length ();
3354 (void) add_stmt_cost (target_cost_data, len, vector_stmt, NULL, 0,
3355 vect_prologue);
3356 len = LOOP_VINFO_CHECK_UNEQUAL_ADDRS (loop_vinfo).length ();
3357 if (len)
3358 /* Count LEN - 1 ANDs and LEN comparisons. */
3359 (void) add_stmt_cost (target_cost_data, len * 2 - 1, scalar_stmt,
3360 NULL, 0, vect_prologue);
3361 len = LOOP_VINFO_LOWER_BOUNDS (loop_vinfo).length ();
3362 if (len)
3364 /* Count LEN - 1 ANDs and LEN comparisons. */
3365 unsigned int nstmts = len * 2 - 1;
3366 /* +1 for each bias that needs adding. */
3367 for (unsigned int i = 0; i < len; ++i)
3368 if (!LOOP_VINFO_LOWER_BOUNDS (loop_vinfo)[i].unsigned_p)
3369 nstmts += 1;
3370 (void) add_stmt_cost (target_cost_data, nstmts, scalar_stmt,
3371 NULL, 0, vect_prologue);
3373 if (dump_enabled_p ())
3374 dump_printf (MSG_NOTE,
3375 "cost model: Adding cost of checks for loop "
3376 "versioning aliasing.\n");
3379 /* Requires loop versioning with niter checks. */
3380 if (LOOP_REQUIRES_VERSIONING_FOR_NITERS (loop_vinfo))
3382 /* FIXME: Make cost depend on complexity of individual check. */
3383 (void) add_stmt_cost (target_cost_data, 1, vector_stmt, NULL, 0,
3384 vect_prologue);
3385 if (dump_enabled_p ())
3386 dump_printf (MSG_NOTE,
3387 "cost model: Adding cost of checks for loop "
3388 "versioning niters.\n");
3391 if (LOOP_REQUIRES_VERSIONING (loop_vinfo))
3392 (void) add_stmt_cost (target_cost_data, 1, cond_branch_taken, NULL, 0,
3393 vect_prologue);
3395 /* Count statements in scalar loop. Using this as scalar cost for a single
3396 iteration for now.
3398 TODO: Add outer loop support.
3400 TODO: Consider assigning different costs to different scalar
3401 statements. */
3403 scalar_single_iter_cost
3404 = LOOP_VINFO_SINGLE_SCALAR_ITERATION_COST (loop_vinfo);
3406 /* Add additional cost for the peeled instructions in prologue and epilogue
3407 loop. (For fully-masked loops there will be no peeling.)
3409 FORNOW: If we don't know the value of peel_iters for prologue or epilogue
3410 at compile-time - we assume it's vf/2 (the worst would be vf-1).
3412 TODO: Build an expression that represents peel_iters for prologue and
3413 epilogue to be used in a run-time test. */
3415 if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
3417 peel_iters_prologue = 0;
3418 peel_iters_epilogue = 0;
3420 if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo))
3422 /* We need to peel exactly one iteration. */
3423 peel_iters_epilogue += 1;
3424 stmt_info_for_cost *si;
3425 int j;
3426 FOR_EACH_VEC_ELT (LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo),
3427 j, si)
3428 (void) add_stmt_cost (target_cost_data, si->count,
3429 si->kind, si->stmt_info, si->misalign,
3430 vect_epilogue);
3433 else if (npeel < 0)
3435 peel_iters_prologue = assumed_vf / 2;
3436 if (dump_enabled_p ())
3437 dump_printf (MSG_NOTE, "cost model: "
3438 "prologue peel iters set to vf/2.\n");
3440 /* If peeling for alignment is unknown, loop bound of main loop becomes
3441 unknown. */
3442 peel_iters_epilogue = assumed_vf / 2;
3443 if (dump_enabled_p ())
3444 dump_printf (MSG_NOTE, "cost model: "
3445 "epilogue peel iters set to vf/2 because "
3446 "peeling for alignment is unknown.\n");
3448 /* If peeled iterations are unknown, count a taken branch and a not taken
3449 branch per peeled loop. Even if scalar loop iterations are known,
3450 vector iterations are not known since peeled prologue iterations are
3451 not known. Hence guards remain the same. */
3452 (void) add_stmt_cost (target_cost_data, 1, cond_branch_taken,
3453 NULL, 0, vect_prologue);
3454 (void) add_stmt_cost (target_cost_data, 1, cond_branch_not_taken,
3455 NULL, 0, vect_prologue);
3456 (void) add_stmt_cost (target_cost_data, 1, cond_branch_taken,
3457 NULL, 0, vect_epilogue);
3458 (void) add_stmt_cost (target_cost_data, 1, cond_branch_not_taken,
3459 NULL, 0, vect_epilogue);
3460 stmt_info_for_cost *si;
3461 int j;
3462 FOR_EACH_VEC_ELT (LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo), j, si)
3464 (void) add_stmt_cost (target_cost_data,
3465 si->count * peel_iters_prologue,
3466 si->kind, si->stmt_info, si->misalign,
3467 vect_prologue);
3468 (void) add_stmt_cost (target_cost_data,
3469 si->count * peel_iters_epilogue,
3470 si->kind, si->stmt_info, si->misalign,
3471 vect_epilogue);
3474 else
3476 stmt_vector_for_cost prologue_cost_vec, epilogue_cost_vec;
3477 stmt_info_for_cost *si;
3478 int j;
3479 void *data = LOOP_VINFO_TARGET_COST_DATA (loop_vinfo);
3481 prologue_cost_vec.create (2);
3482 epilogue_cost_vec.create (2);
3483 peel_iters_prologue = npeel;
3485 (void) vect_get_known_peeling_cost (loop_vinfo, peel_iters_prologue,
3486 &peel_iters_epilogue,
3487 &LOOP_VINFO_SCALAR_ITERATION_COST
3488 (loop_vinfo),
3489 &prologue_cost_vec,
3490 &epilogue_cost_vec);
3492 FOR_EACH_VEC_ELT (prologue_cost_vec, j, si)
3493 (void) add_stmt_cost (data, si->count, si->kind, si->stmt_info,
3494 si->misalign, vect_prologue);
3496 FOR_EACH_VEC_ELT (epilogue_cost_vec, j, si)
3497 (void) add_stmt_cost (data, si->count, si->kind, si->stmt_info,
3498 si->misalign, vect_epilogue);
3500 prologue_cost_vec.release ();
3501 epilogue_cost_vec.release ();
3504 /* FORNOW: The scalar outside cost is incremented in one of the
3505 following ways:
3507 1. The vectorizer checks for alignment and aliasing and generates
3508 a condition that allows dynamic vectorization. A cost model
3509 check is ANDED with the versioning condition. Hence scalar code
3510 path now has the added cost of the versioning check.
3512 if (cost > th & versioning_check)
3513 jmp to vector code
3515 Hence run-time scalar is incremented by not-taken branch cost.
3517 2. The vectorizer then checks if a prologue is required. If the
3518 cost model check was not done before during versioning, it has to
3519 be done before the prologue check.
3521 if (cost <= th)
3522 prologue = scalar_iters
3523 if (prologue == 0)
3524 jmp to vector code
3525 else
3526 execute prologue
3527 if (prologue == num_iters)
3528 go to exit
3530 Hence the run-time scalar cost is incremented by a taken branch,
3531 plus a not-taken branch, plus a taken branch cost.
3533 3. The vectorizer then checks if an epilogue is required. If the
3534 cost model check was not done before during prologue check, it
3535 has to be done with the epilogue check.
3537 if (prologue == 0)
3538 jmp to vector code
3539 else
3540 execute prologue
3541 if (prologue == num_iters)
3542 go to exit
3543 vector code:
3544 if ((cost <= th) | (scalar_iters-prologue-epilogue == 0))
3545 jmp to epilogue
3547 Hence the run-time scalar cost should be incremented by 2 taken
3548 branches.
3550 TODO: The back end may reorder the BBS's differently and reverse
3551 conditions/branch directions. Change the estimates below to
3552 something more reasonable. */
3554 /* If the number of iterations is known and we do not do versioning, we can
3555 decide whether to vectorize at compile time. Hence the scalar version
3556 does not carry cost model guard costs. */
3557 if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
3558 || LOOP_REQUIRES_VERSIONING (loop_vinfo))
3560 /* Cost model check occurs at versioning. */
3561 if (LOOP_REQUIRES_VERSIONING (loop_vinfo))
3562 scalar_outside_cost += vect_get_stmt_cost (cond_branch_not_taken);
3563 else
3565 /* Cost model check occurs at prologue generation. */
3566 if (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) < 0)
3567 scalar_outside_cost += 2 * vect_get_stmt_cost (cond_branch_taken)
3568 + vect_get_stmt_cost (cond_branch_not_taken);
3569 /* Cost model check occurs at epilogue generation. */
3570 else
3571 scalar_outside_cost += 2 * vect_get_stmt_cost (cond_branch_taken);
3575 /* Complete the target-specific cost calculations. */
3576 finish_cost (LOOP_VINFO_TARGET_COST_DATA (loop_vinfo), &vec_prologue_cost,
3577 &vec_inside_cost, &vec_epilogue_cost);
3579 vec_outside_cost = (int)(vec_prologue_cost + vec_epilogue_cost);
3581 if (dump_enabled_p ())
3583 dump_printf_loc (MSG_NOTE, vect_location, "Cost model analysis: \n");
3584 dump_printf (MSG_NOTE, " Vector inside of loop cost: %d\n",
3585 vec_inside_cost);
3586 dump_printf (MSG_NOTE, " Vector prologue cost: %d\n",
3587 vec_prologue_cost);
3588 dump_printf (MSG_NOTE, " Vector epilogue cost: %d\n",
3589 vec_epilogue_cost);
3590 dump_printf (MSG_NOTE, " Scalar iteration cost: %d\n",
3591 scalar_single_iter_cost);
3592 dump_printf (MSG_NOTE, " Scalar outside cost: %d\n",
3593 scalar_outside_cost);
3594 dump_printf (MSG_NOTE, " Vector outside cost: %d\n",
3595 vec_outside_cost);
3596 dump_printf (MSG_NOTE, " prologue iterations: %d\n",
3597 peel_iters_prologue);
3598 dump_printf (MSG_NOTE, " epilogue iterations: %d\n",
3599 peel_iters_epilogue);
3602 /* Calculate number of iterations required to make the vector version
3603 profitable, relative to the loop bodies only. The following condition
3604 must hold true:
3605 SIC * niters + SOC > VIC * ((niters-PL_ITERS-EP_ITERS)/VF) + VOC
3606 where
3607 SIC = scalar iteration cost, VIC = vector iteration cost,
3608 VOC = vector outside cost, VF = vectorization factor,
3609 PL_ITERS = prologue iterations, EP_ITERS= epilogue iterations
3610 SOC = scalar outside cost for run time cost model check. */
3612 if ((scalar_single_iter_cost * assumed_vf) > (int) vec_inside_cost)
3614 min_profitable_iters = ((vec_outside_cost - scalar_outside_cost)
3615 * assumed_vf
3616 - vec_inside_cost * peel_iters_prologue
3617 - vec_inside_cost * peel_iters_epilogue);
3618 if (min_profitable_iters <= 0)
3619 min_profitable_iters = 0;
3620 else
3622 min_profitable_iters /= ((scalar_single_iter_cost * assumed_vf)
3623 - vec_inside_cost);
3625 if ((scalar_single_iter_cost * assumed_vf * min_profitable_iters)
3626 <= (((int) vec_inside_cost * min_profitable_iters)
3627 + (((int) vec_outside_cost - scalar_outside_cost)
3628 * assumed_vf)))
3629 min_profitable_iters++;
3632 /* vector version will never be profitable. */
3633 else
3635 if (LOOP_VINFO_LOOP (loop_vinfo)->force_vectorize)
3636 warning_at (vect_location.get_location_t (), OPT_Wopenmp_simd,
3637 "vectorization did not happen for a simd loop");
3639 if (dump_enabled_p ())
3640 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3641 "cost model: the vector iteration cost = %d "
3642 "divided by the scalar iteration cost = %d "
3643 "is greater or equal to the vectorization factor = %d"
3644 ".\n",
3645 vec_inside_cost, scalar_single_iter_cost, assumed_vf);
3646 *ret_min_profitable_niters = -1;
3647 *ret_min_profitable_estimate = -1;
3648 return;
3651 if (dump_enabled_p ())
3652 dump_printf (MSG_NOTE,
3653 " Calculated minimum iters for profitability: %d\n",
3654 min_profitable_iters);
3656 if (!LOOP_VINFO_FULLY_MASKED_P (loop_vinfo)
3657 && min_profitable_iters < (assumed_vf + peel_iters_prologue))
3658 /* We want the vectorized loop to execute at least once. */
3659 min_profitable_iters = assumed_vf + peel_iters_prologue;
3661 if (dump_enabled_p ())
3662 dump_printf_loc (MSG_NOTE, vect_location,
3663 " Runtime profitability threshold = %d\n",
3664 min_profitable_iters);
3666 *ret_min_profitable_niters = min_profitable_iters;
3668 /* Calculate number of iterations required to make the vector version
3669 profitable, relative to the loop bodies only.
3671 The non-vectorized variant costs SIC * niters and it must win over the vector
3672 variant on the expected loop trip count. The following condition must hold true:
3673 SIC * niters > VIC * ((niters-PL_ITERS-EP_ITERS)/VF) + VOC + SOC */
3675 if (vec_outside_cost <= 0)
3676 min_profitable_estimate = 0;
3677 else
3679 min_profitable_estimate = ((vec_outside_cost + scalar_outside_cost)
3680 * assumed_vf
3681 - vec_inside_cost * peel_iters_prologue
3682 - vec_inside_cost * peel_iters_epilogue)
3683 / ((scalar_single_iter_cost * assumed_vf)
3684 - vec_inside_cost);
3686 min_profitable_estimate = MAX (min_profitable_estimate, min_profitable_iters);
3687 if (dump_enabled_p ())
3688 dump_printf_loc (MSG_NOTE, vect_location,
3689 " Static estimate profitability threshold = %d\n",
3690 min_profitable_estimate);
3692 *ret_min_profitable_estimate = min_profitable_estimate;
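/* The sketch below is an illustrative, self-contained restatement of the
   profitability inequality documented above
   (SIC * niters + SOC > VIC * ((niters - PL_ITERS - EP_ITERS) / VF) + VOC);
   the function name and parameters are examples only and are not part of
   the vectorizer.  For instance, SIC = 4, VIC = 8, SOC = 0, VOC = 20,
   VF = 4 and no peeling gives a threshold of 11 iterations.  */

static int
illustrative_min_profitable_iters (int sic, int vic, int soc, int voc,
				   int vf, int pl_iters, int ep_iters)
{
  /* Denominator of the solved inequality; if it is not positive the
     vector body never beats the scalar body.  */
  int d = sic * vf - vic;
  if (d <= 0)
    return -1;
  int num = (voc - soc) * vf - vic * (pl_iters + ep_iters);
  if (num < 0)
    return 0;
  /* Smallest niters with niters * d strictly greater than num.  */
  return num / d + 1;
}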
3695 /* Writes into SEL a mask for a vec_perm, equivalent to a vec_shr by OFFSET
3696 vector elements (not bits) for a vector with NELT elements. */
3697 static void
3698 calc_vec_perm_mask_for_shift (unsigned int offset, unsigned int nelt,
3699 vec_perm_builder *sel)
3701 /* The encoding is a single stepped pattern. Any wrap-around is handled
3702 by vec_perm_indices. */
3703 sel->new_vector (nelt, 1, 3);
3704 for (unsigned int i = 0; i < 3; i++)
3705 sel->quick_push (i + offset);
3708 /* Checks whether the target supports whole-vector shifts for vectors of mode
3709 MODE. This is the case if _either_ the platform handles vec_shr_optab, _or_
3710 it supports vec_perm_const with masks for all necessary shift amounts. */
3711 static bool
3712 have_whole_vector_shift (machine_mode mode)
3714 if (optab_handler (vec_shr_optab, mode) != CODE_FOR_nothing)
3715 return true;
3717 /* Variable-length vectors should be handled via the optab. */
3718 unsigned int nelt;
3719 if (!GET_MODE_NUNITS (mode).is_constant (&nelt))
3720 return false;
3722 vec_perm_builder sel;
3723 vec_perm_indices indices;
3724 for (unsigned int i = nelt / 2; i >= 1; i /= 2)
3726 calc_vec_perm_mask_for_shift (i, nelt, &sel);
3727 indices.new_vector (sel, 2, nelt);
3728 if (!can_vec_perm_const_p (mode, indices, false))
3729 return false;
3731 return true;
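/* Illustrative only (not part of the vectorizer): the selector that
   calc_vec_perm_mask_for_shift encodes is simply { OFFSET, OFFSET + 1,
   ..., OFFSET + NELT - 1 }, where indices >= NELT pick lanes from the
   zero vector passed as the second VEC_PERM_EXPR operand.  For NELT = 8
   and OFFSET = 2 that is { 2, 3, 4, 5, 6, 7, 8, 9 };
   have_whole_vector_shift probes the offsets NELT/2, NELT/4, ..., 1.  */

static void
illustrative_shift_selector (unsigned int offset, unsigned int nelt,
			     unsigned int *sel)
{
  for (unsigned int i = 0; i < nelt; i++)
    /* Lane I of the result comes from lane I + OFFSET of the
       concatenation of the shifted vector and the zero vector.  */
    sel[i] = i + offset;
}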
3734 /* TODO: Close dependency between vect_model_*_cost and vectorizable_*
3735 functions. Design better to avoid maintenance issues. */
3737 /* Function vect_model_reduction_cost.
3739 Models cost for a reduction operation, including the vector ops
3740 generated within the strip-mine loop, the initial definition before
3741 the loop, and the epilogue code that must be generated. */
3743 static void
3744 vect_model_reduction_cost (stmt_vec_info stmt_info, internal_fn reduc_fn,
3745 int ncopies, stmt_vector_for_cost *cost_vec)
3747 int prologue_cost = 0, epilogue_cost = 0, inside_cost;
3748 enum tree_code code;
3749 optab optab;
3750 tree vectype;
3751 machine_mode mode;
3752 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
3753 struct loop *loop = NULL;
3755 if (loop_vinfo)
3756 loop = LOOP_VINFO_LOOP (loop_vinfo);
3758 /* Condition reductions generate two reductions in the loop. */
3759 vect_reduction_type reduction_type
3760 = STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info);
3761 if (reduction_type == COND_REDUCTION)
3762 ncopies *= 2;
3764 vectype = STMT_VINFO_VECTYPE (stmt_info);
3765 mode = TYPE_MODE (vectype);
3766 stmt_vec_info orig_stmt_info = vect_orig_stmt (stmt_info);
3768 code = gimple_assign_rhs_code (orig_stmt_info->stmt);
3770 if (reduction_type == EXTRACT_LAST_REDUCTION
3771 || reduction_type == FOLD_LEFT_REDUCTION)
3773 /* No extra instructions needed in the prologue. */
3774 prologue_cost = 0;
3776 if (reduction_type == EXTRACT_LAST_REDUCTION || reduc_fn != IFN_LAST)
3777 /* Count one reduction-like operation per vector. */
3778 inside_cost = record_stmt_cost (cost_vec, ncopies, vec_to_scalar,
3779 stmt_info, 0, vect_body);
3780 else
3782 /* Use NELEMENTS extracts and NELEMENTS scalar ops. */
3783 unsigned int nelements = ncopies * vect_nunits_for_cost (vectype);
3784 inside_cost = record_stmt_cost (cost_vec, nelements,
3785 vec_to_scalar, stmt_info, 0,
3786 vect_body);
3787 inside_cost += record_stmt_cost (cost_vec, nelements,
3788 scalar_stmt, stmt_info, 0,
3789 vect_body);
3792 else
3794 /* Add in cost for initial definition.
3795 For cond reduction we have four vectors: initial index, step,
3796 initial result of the data reduction, initial value of the index
3797 reduction. */
3798 int prologue_stmts = reduction_type == COND_REDUCTION ? 4 : 1;
3799 prologue_cost += record_stmt_cost (cost_vec, prologue_stmts,
3800 scalar_to_vec, stmt_info, 0,
3801 vect_prologue);
3803 /* Cost of reduction op inside loop. */
3804 inside_cost = record_stmt_cost (cost_vec, ncopies, vector_stmt,
3805 stmt_info, 0, vect_body);
3808 /* Determine cost of epilogue code.
3810 We have a reduction operator that will reduce the vector in one statement.
3811 Also requires scalar extract. */
3813 if (!loop || !nested_in_vect_loop_p (loop, orig_stmt_info))
3815 if (reduc_fn != IFN_LAST)
3817 if (reduction_type == COND_REDUCTION)
3819 /* An EQ stmt and a COND_EXPR stmt. */
3820 epilogue_cost += record_stmt_cost (cost_vec, 2,
3821 vector_stmt, stmt_info, 0,
3822 vect_epilogue);
3823 /* Reduction of the max index and a reduction of the found
3824 values. */
3825 epilogue_cost += record_stmt_cost (cost_vec, 2,
3826 vec_to_scalar, stmt_info, 0,
3827 vect_epilogue);
3828 /* A broadcast of the max value. */
3829 epilogue_cost += record_stmt_cost (cost_vec, 1,
3830 scalar_to_vec, stmt_info, 0,
3831 vect_epilogue);
3833 else
3835 epilogue_cost += record_stmt_cost (cost_vec, 1, vector_stmt,
3836 stmt_info, 0, vect_epilogue);
3837 epilogue_cost += record_stmt_cost (cost_vec, 1,
3838 vec_to_scalar, stmt_info, 0,
3839 vect_epilogue);
3842 else if (reduction_type == COND_REDUCTION)
3844 unsigned estimated_nunits = vect_nunits_for_cost (vectype);
3845 /* Extraction of scalar elements. */
3846 epilogue_cost += record_stmt_cost (cost_vec,
3847 2 * estimated_nunits,
3848 vec_to_scalar, stmt_info, 0,
3849 vect_epilogue);
3850 /* Scalar max reductions via COND_EXPR / MAX_EXPR. */
3851 epilogue_cost += record_stmt_cost (cost_vec,
3852 2 * estimated_nunits - 3,
3853 scalar_stmt, stmt_info, 0,
3854 vect_epilogue);
3856 else if (reduction_type == EXTRACT_LAST_REDUCTION
3857 || reduction_type == FOLD_LEFT_REDUCTION)
3858 /* No extra instructions needed in the epilogue. */
3860 else
3862 int vec_size_in_bits = tree_to_uhwi (TYPE_SIZE (vectype));
3863 tree bitsize =
3864 TYPE_SIZE (TREE_TYPE (gimple_assign_lhs (orig_stmt_info->stmt)));
3865 int element_bitsize = tree_to_uhwi (bitsize);
3866 int nelements = vec_size_in_bits / element_bitsize;
3868 if (code == COND_EXPR)
3869 code = MAX_EXPR;
3871 optab = optab_for_tree_code (code, vectype, optab_default);
3873 /* We have a whole vector shift available. */
3874 if (optab != unknown_optab
3875 && VECTOR_MODE_P (mode)
3876 && optab_handler (optab, mode) != CODE_FOR_nothing
3877 && have_whole_vector_shift (mode))
3879 /* Final reduction via vector shifts and the reduction operator.
3880 Also requires scalar extract. */
3881 epilogue_cost += record_stmt_cost (cost_vec,
3882 exact_log2 (nelements) * 2,
3883 vector_stmt, stmt_info, 0,
3884 vect_epilogue);
3885 epilogue_cost += record_stmt_cost (cost_vec, 1,
3886 vec_to_scalar, stmt_info, 0,
3887 vect_epilogue);
3889 else
3890 /* Use extracts and reduction op for final reduction. For N
3891 elements, we have N extracts and N-1 reduction ops. */
3892 epilogue_cost += record_stmt_cost (cost_vec,
3893 nelements + nelements - 1,
3894 vector_stmt, stmt_info, 0,
3895 vect_epilogue);
3899 if (dump_enabled_p ())
3900 dump_printf (MSG_NOTE,
3901 "vect_model_reduction_cost: inside_cost = %d, "
3902 "prologue_cost = %d, epilogue_cost = %d .\n", inside_cost,
3903 prologue_cost, epilogue_cost);
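/* Illustrative arithmetic for the epilogue counts recorded above; the
   helper name is an example and not part of the cost model.  With a
   whole-vector shift the epilogue needs 2 * log2 (N) vector statements
   plus one extract, without it N extracts plus N - 1 scalar ops, e.g.
   7 vs. 15 statements for N = 8.  */

static int
illustrative_epilogue_stmt_count (int nelements, int have_shift)
{
  if (!have_shift)
    /* N extracts plus N - 1 scalar reduction operations.  */
    return nelements + nelements - 1;

  /* Compute floor (log2 (N)) without library calls.  */
  int log2n = 0;
  while ((1 << (log2n + 1)) <= nelements)
    log2n++;

  /* log2 (N) shift/reduce pairs plus one final scalar extract.  */
  return 2 * log2n + 1;
}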
3907 /* Function vect_model_induction_cost.
3909 Models cost for induction operations. */
3911 static void
3912 vect_model_induction_cost (stmt_vec_info stmt_info, int ncopies,
3913 stmt_vector_for_cost *cost_vec)
3915 unsigned inside_cost, prologue_cost;
3917 if (PURE_SLP_STMT (stmt_info))
3918 return;
3920 /* loop cost for vec_loop. */
3921 inside_cost = record_stmt_cost (cost_vec, ncopies, vector_stmt,
3922 stmt_info, 0, vect_body);
3924 /* prologue cost for vec_init and vec_step. */
3925 prologue_cost = record_stmt_cost (cost_vec, 2, scalar_to_vec,
3926 stmt_info, 0, vect_prologue);
3928 if (dump_enabled_p ())
3929 dump_printf_loc (MSG_NOTE, vect_location,
3930 "vect_model_induction_cost: inside_cost = %d, "
3931 "prologue_cost = %d .\n", inside_cost, prologue_cost);
3936 /* Function get_initial_def_for_reduction
3938 Input:
3939 STMT_VINFO - a stmt that performs a reduction operation in the loop.
3940 INIT_VAL - the initial value of the reduction variable
3942 Output:
3943 ADJUSTMENT_DEF - a tree that holds a value to be added to the final result
3944 of the reduction (used for adjusting the epilog - see below).
3945 Return a vector variable, initialized according to the operation that
3946 STMT_VINFO performs. This vector will be used as the initial value
3947 of the vector of partial results.
3949 Option1 (adjust in epilog): Initialize the vector as follows:
3950 add/bit or/xor: [0,0,...,0,0]
3951 mult/bit and: [1,1,...,1,1]
3952 min/max/cond_expr: [init_val,init_val,..,init_val,init_val]
3953 and when necessary (e.g. add/mult case) let the caller know
3954 that it needs to adjust the result by init_val.
3956 Option2: Initialize the vector as follows:
3957 add/bit or/xor: [init_val,0,0,...,0]
3958 mult/bit and: [init_val,1,1,...,1]
3959 min/max/cond_expr: [init_val,init_val,...,init_val]
3960 and no adjustments are needed.
3962 For example, for the following code:
3964 s = init_val;
3965 for (i=0;i<n;i++)
3966 s = s + a[i];
3968 STMT_VINFO is 's = s + a[i]', and the reduction variable is 's'.
3969 For a vector of 4 units, we want to return either [0,0,0,init_val],
3970 or [0,0,0,0] and let the caller know that it needs to adjust
3971 the result at the end by 'init_val'.
3973 FORNOW, we use the 'adjust in epilog' scheme (Option1) when ADJUSTMENT_DEF
3974 is not NULL, because this way the initialization vector is simpler (same
3975 element in all entries), and Option2 otherwise.
3977 A cost model should help decide between these two schemes. */
3979 tree
3980 get_initial_def_for_reduction (stmt_vec_info stmt_vinfo, tree init_val,
3981 tree *adjustment_def)
3983 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_vinfo);
3984 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
3985 tree scalar_type = TREE_TYPE (init_val);
3986 tree vectype = get_vectype_for_scalar_type (scalar_type);
3987 enum tree_code code = gimple_assign_rhs_code (stmt_vinfo->stmt);
3988 tree def_for_init;
3989 tree init_def;
3990 REAL_VALUE_TYPE real_init_val = dconst0;
3991 int int_init_val = 0;
3992 gimple_seq stmts = NULL;
3994 gcc_assert (vectype);
3996 gcc_assert (POINTER_TYPE_P (scalar_type) || INTEGRAL_TYPE_P (scalar_type)
3997 || SCALAR_FLOAT_TYPE_P (scalar_type));
3999 gcc_assert (nested_in_vect_loop_p (loop, stmt_vinfo)
4000 || loop == (gimple_bb (stmt_vinfo->stmt))->loop_father);
4002 vect_reduction_type reduction_type
4003 = STMT_VINFO_VEC_REDUCTION_TYPE (stmt_vinfo);
4005 switch (code)
4007 case WIDEN_SUM_EXPR:
4008 case DOT_PROD_EXPR:
4009 case SAD_EXPR:
4010 case PLUS_EXPR:
4011 case MINUS_EXPR:
4012 case BIT_IOR_EXPR:
4013 case BIT_XOR_EXPR:
4014 case MULT_EXPR:
4015 case BIT_AND_EXPR:
4017 /* ADJUSTMENT_DEF is NULL when called from
4018 vect_create_epilog_for_reduction to vectorize double reduction. */
4019 if (adjustment_def)
4020 *adjustment_def = init_val;
4022 if (code == MULT_EXPR)
4024 real_init_val = dconst1;
4025 int_init_val = 1;
4028 if (code == BIT_AND_EXPR)
4029 int_init_val = -1;
4031 if (SCALAR_FLOAT_TYPE_P (scalar_type))
4032 def_for_init = build_real (scalar_type, real_init_val);
4033 else
4034 def_for_init = build_int_cst (scalar_type, int_init_val);
4036 if (adjustment_def)
4037 /* Option1: the first element is '0' or '1' as well. */
4038 init_def = gimple_build_vector_from_val (&stmts, vectype,
4039 def_for_init);
4040 else if (!TYPE_VECTOR_SUBPARTS (vectype).is_constant ())
4042 /* Option2 (variable length): the first element is INIT_VAL. */
4043 init_def = gimple_build_vector_from_val (&stmts, vectype,
4044 def_for_init);
4045 init_def = gimple_build (&stmts, CFN_VEC_SHL_INSERT,
4046 vectype, init_def, init_val);
4048 else
4050 /* Option2: the first element is INIT_VAL. */
4051 tree_vector_builder elts (vectype, 1, 2);
4052 elts.quick_push (init_val);
4053 elts.quick_push (def_for_init);
4054 init_def = gimple_build_vector (&stmts, &elts);
4057 break;
4059 case MIN_EXPR:
4060 case MAX_EXPR:
4061 case COND_EXPR:
4063 if (adjustment_def)
4065 *adjustment_def = NULL_TREE;
4066 if (reduction_type != COND_REDUCTION
4067 && reduction_type != EXTRACT_LAST_REDUCTION)
4069 init_def = vect_get_vec_def_for_operand (init_val, stmt_vinfo);
4070 break;
4073 init_val = gimple_convert (&stmts, TREE_TYPE (vectype), init_val);
4074 init_def = gimple_build_vector_from_val (&stmts, vectype, init_val);
4076 break;
4078 default:
4079 gcc_unreachable ();
4082 if (stmts)
4083 gsi_insert_seq_on_edge_immediate (loop_preheader_edge (loop), stmts);
4084 return init_def;
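/* Illustrative sketch of the Option2 layout described above for a
   fixed-length vector; the helper and its int-array representation are
   examples only.  For a sum reduction with init_val = 5 and four lanes
   this produces { 5, 0, 0, 0 }, whereas Option1 would produce
   { 0, 0, 0, 0 } and report an adjustment of 5 via ADJUSTMENT_DEF.  */

static void
illustrative_option2_init (int init_val, int neutral, int nelt, int *vec)
{
  /* The first lane carries the initial value of the reduction ...  */
  vec[0] = init_val;
  /* ... and every other lane holds the operation's neutral element
     (0 for add, 1 for mult, and so on).  */
  for (int i = 1; i < nelt; i++)
    vec[i] = neutral;
}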
4087 /* Get at the initial defs for the reduction PHIs in SLP_NODE.
4088 NUMBER_OF_VECTORS is the number of vector defs to create.
4089 If NEUTRAL_OP is nonnull, introducing extra elements of that
4090 value will not change the result. */
4092 static void
4093 get_initial_defs_for_reduction (slp_tree slp_node,
4094 vec<tree> *vec_oprnds,
4095 unsigned int number_of_vectors,
4096 bool reduc_chain, tree neutral_op)
4098 vec<stmt_vec_info> stmts = SLP_TREE_SCALAR_STMTS (slp_node);
4099 stmt_vec_info stmt_vinfo = stmts[0];
4100 unsigned HOST_WIDE_INT nunits;
4101 unsigned j, number_of_places_left_in_vector;
4102 tree vector_type;
4103 tree vop;
4104 int group_size = stmts.length ();
4105 unsigned int vec_num, i;
4106 unsigned number_of_copies = 1;
4107 vec<tree> voprnds;
4108 voprnds.create (number_of_vectors);
4109 struct loop *loop;
4110 auto_vec<tree, 16> permute_results;
4112 vector_type = STMT_VINFO_VECTYPE (stmt_vinfo);
4114 gcc_assert (STMT_VINFO_DEF_TYPE (stmt_vinfo) == vect_reduction_def);
4116 loop = (gimple_bb (stmt_vinfo->stmt))->loop_father;
4117 gcc_assert (loop);
4118 edge pe = loop_preheader_edge (loop);
4120 gcc_assert (!reduc_chain || neutral_op);
4122 /* NUMBER_OF_COPIES is the number of times we need to use the same values in
4123 created vectors. It is greater than 1 if unrolling is performed.
4125 For example, we have two scalar operands, s1 and s2 (e.g., group of
4126 strided accesses of size two), while NUNITS is four (i.e., four scalars
4127 of this type can be packed in a vector). The output vector will contain
4128 two copies of each scalar operand: {s1, s2, s1, s2}. (NUMBER_OF_COPIES
4129 will be 2).
4131 If REDUC_GROUP_SIZE > NUNITS, the scalars will be split into several
4132 vectors containing the operands.
4134 For example, NUNITS is four as before, and the group size is 8
4135 (s1, s2, ..., s8). We will create two vectors {s1, s2, s3, s4} and
4136 {s5, s6, s7, s8}. */
4138 if (!TYPE_VECTOR_SUBPARTS (vector_type).is_constant (&nunits))
4139 nunits = group_size;
4141 number_of_copies = nunits * number_of_vectors / group_size;
4143 number_of_places_left_in_vector = nunits;
4144 bool constant_p = true;
4145 tree_vector_builder elts (vector_type, nunits, 1);
4146 elts.quick_grow (nunits);
4147 for (j = 0; j < number_of_copies; j++)
4149 for (i = group_size - 1; stmts.iterate (i, &stmt_vinfo); i--)
4151 tree op;
4152 /* Get the def before the loop. In reduction chain we have only
4153 one initial value. */
4154 if ((j != (number_of_copies - 1)
4155 || (reduc_chain && i != 0))
4156 && neutral_op)
4157 op = neutral_op;
4158 else
4159 op = PHI_ARG_DEF_FROM_EDGE (stmt_vinfo->stmt, pe);
4161 /* Create 'vect_ = {op0,op1,...,opn}'. */
4162 number_of_places_left_in_vector--;
4163 elts[number_of_places_left_in_vector] = op;
4164 if (!CONSTANT_CLASS_P (op))
4165 constant_p = false;
4167 if (number_of_places_left_in_vector == 0)
4169 gimple_seq ctor_seq = NULL;
4170 tree init;
4171 if (constant_p && !neutral_op
4172 ? multiple_p (TYPE_VECTOR_SUBPARTS (vector_type), nunits)
4173 : known_eq (TYPE_VECTOR_SUBPARTS (vector_type), nunits))
4174 /* Build the vector directly from ELTS. */
4175 init = gimple_build_vector (&ctor_seq, &elts);
4176 else if (neutral_op)
4178 /* Build a vector of the neutral value and shift the
4179 other elements into place. */
4180 init = gimple_build_vector_from_val (&ctor_seq, vector_type,
4181 neutral_op);
4182 int k = nunits;
4183 while (k > 0 && elts[k - 1] == neutral_op)
4184 k -= 1;
4185 while (k > 0)
4187 k -= 1;
4188 init = gimple_build (&ctor_seq, CFN_VEC_SHL_INSERT,
4189 vector_type, init, elts[k]);
4192 else
4194 /* First time round, duplicate ELTS to fill the
4195 required number of vectors, then cherry pick the
4196 appropriate result for each iteration. */
4197 if (vec_oprnds->is_empty ())
4198 duplicate_and_interleave (&ctor_seq, vector_type, elts,
4199 number_of_vectors,
4200 permute_results);
4201 init = permute_results[number_of_vectors - j - 1];
4203 if (ctor_seq != NULL)
4204 gsi_insert_seq_on_edge_immediate (pe, ctor_seq);
4205 voprnds.quick_push (init);
4207 number_of_places_left_in_vector = nunits;
4208 elts.new_vector (vector_type, nunits, 1);
4209 elts.quick_grow (nunits);
4210 constant_p = true;
4215 /* Since the vectors are created in the reverse order, we should invert
4216 them. */
4217 vec_num = voprnds.length ();
4218 for (j = vec_num; j != 0; j--)
4220 vop = voprnds[j - 1];
4221 vec_oprnds->quick_push (vop);
4224 voprnds.release ();
4226 /* In case that VF is greater than the unrolling factor needed for the SLP
4227 group of stmts, NUMBER_OF_VECTORS to be created is greater than
4228 NUMBER_OF_SCALARS/NUNITS or NUNITS/NUMBER_OF_SCALARS, and hence we have
4229 to replicate the vectors. */
4230 tree neutral_vec = NULL;
4231 while (number_of_vectors > vec_oprnds->length ())
4233 if (neutral_op)
4235 if (!neutral_vec)
4237 gimple_seq ctor_seq = NULL;
4238 neutral_vec = gimple_build_vector_from_val
4239 (&ctor_seq, vector_type, neutral_op);
4240 if (ctor_seq != NULL)
4241 gsi_insert_seq_on_edge_immediate (pe, ctor_seq);
4243 vec_oprnds->quick_push (neutral_vec);
4245 else
4247 for (i = 0; vec_oprnds->iterate (i, &vop) && i < vec_num; i++)
4248 vec_oprnds->quick_push (vop);
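/* Worked example of the scheme above, with illustrative values: for an
   SLP PLUS reduction over two accumulators s1 and s2, four lanes and a
   single vector def, NUMBER_OF_COPIES is 2 and the neutral value 0 is
   used for every copy except the last, so the initial vector is roughly
   { s1_init, s2_init, 0, 0 } and each scalar initial value is counted
   exactly once despite the unrolling.  */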
4254 /* Function vect_create_epilog_for_reduction
4256 Create code at the loop-epilog to finalize the result of a reduction
4257 computation.
4259 VECT_DEFS is the list of vectors of partial results, i.e., the lhs's of vector
4260 reduction statements.
4261 STMT_INFO is the scalar reduction stmt that is being vectorized.
4262 NCOPIES is > 1 in case the vectorization factor (VF) is bigger than the
4263 number of elements that we can fit in a vectype (nunits). In this case
4264 we have to generate more than one vector stmt - i.e - we need to "unroll"
4265 the vector stmt by a factor VF/nunits. For more details see documentation
4266 in vectorizable_operation.
4267 REDUC_FN is the internal function for the epilog reduction.
4268 REDUCTION_PHIS is a list of the phi-nodes that carry the reduction
4269 computation.
4270 REDUC_INDEX is the index of the operand in the right hand side of the
4271 statement that is defined by REDUCTION_PHI.
4272 DOUBLE_REDUC is TRUE if double reduction phi nodes should be handled.
4273 SLP_NODE is an SLP node containing a group of reduction statements. The
4274 first one in this group is STMT_INFO.
4275 INDUC_VAL is for INTEGER_INDUC_COND_REDUCTION the value to use for the case
4276 when the COND_EXPR is never true in the loop. For MAX_EXPR, it needs to
4277 be smaller than any value of the IV in the loop, for MIN_EXPR larger than
4278 any value of the IV in the loop.
4279 INDUC_CODE is the code for epilog reduction if INTEGER_INDUC_COND_REDUCTION.
4280 NEUTRAL_OP is the value given by neutral_op_for_slp_reduction; it is
4281 null if this is not an SLP reduction
4283 This function:
4284 1. Creates the reduction def-use cycles: sets the arguments for
4285 REDUCTION_PHIS:
4286 The loop-entry argument is the vectorized initial-value of the reduction.
4287 The loop-latch argument is taken from VECT_DEFS - the vector of partial
4288 sums.
4289 2. "Reduces" each vector of partial results VECT_DEFS into a single result,
4290 by calling the function specified by REDUC_FN if available, or by
4291 other means (whole-vector shifts or a scalar loop).
4292 The function also creates a new phi node at the loop exit to preserve
4293 loop-closed form, as illustrated below.
4295 The flow at the entry to this function:
4297 loop:
4298 vec_def = phi <null, null> # REDUCTION_PHI
4299 VECT_DEF = vector_stmt # vectorized form of STMT_INFO
4300 s_loop = scalar_stmt # (scalar) STMT_INFO
4301 loop_exit:
4302 s_out0 = phi <s_loop> # (scalar) EXIT_PHI
4303 use <s_out0>
4304 use <s_out0>
4306 The above is transformed by this function into:
4308 loop:
4309 vec_def = phi <vec_init, VECT_DEF> # REDUCTION_PHI
4310 VECT_DEF = vector_stmt # vectorized form of STMT_INFO
4311 s_loop = scalar_stmt # (scalar) STMT_INFO
4312 loop_exit:
4313 s_out0 = phi <s_loop> # (scalar) EXIT_PHI
4314 v_out1 = phi <VECT_DEF> # NEW_EXIT_PHI
4315 v_out2 = reduce <v_out1>
4316 s_out3 = extract_field <v_out2, 0>
4317 s_out4 = adjust_result <s_out3>
4318 use <s_out4>
4319 use <s_out4>
4322 static void
4323 vect_create_epilog_for_reduction (vec<tree> vect_defs,
4324 stmt_vec_info stmt_info,
4325 gimple *reduc_def_stmt,
4326 int ncopies, internal_fn reduc_fn,
4327 vec<stmt_vec_info> reduction_phis,
4328 bool double_reduc,
4329 slp_tree slp_node,
4330 slp_instance slp_node_instance,
4331 tree induc_val, enum tree_code induc_code,
4332 tree neutral_op)
4334 stmt_vec_info prev_phi_info;
4335 tree vectype;
4336 machine_mode mode;
4337 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
4338 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo), *outer_loop = NULL;
4339 basic_block exit_bb;
4340 tree scalar_dest;
4341 tree scalar_type;
4342 gimple *new_phi = NULL, *phi;
4343 stmt_vec_info phi_info;
4344 gimple_stmt_iterator exit_gsi;
4345 tree vec_dest;
4346 tree new_temp = NULL_TREE, new_dest, new_name, new_scalar_dest;
4347 gimple *epilog_stmt = NULL;
4348 enum tree_code code = gimple_assign_rhs_code (stmt_info->stmt);
4349 gimple *exit_phi;
4350 tree bitsize;
4351 tree adjustment_def = NULL;
4352 tree vec_initial_def = NULL;
4353 tree expr, def, initial_def = NULL;
4354 tree orig_name, scalar_result;
4355 imm_use_iterator imm_iter, phi_imm_iter;
4356 use_operand_p use_p, phi_use_p;
4357 gimple *use_stmt;
4358 stmt_vec_info reduction_phi_info = NULL;
4359 bool nested_in_vect_loop = false;
4360 auto_vec<gimple *> new_phis;
4361 auto_vec<stmt_vec_info> inner_phis;
4362 int j, i;
4363 auto_vec<tree> scalar_results;
4364 unsigned int group_size = 1, k, ratio;
4365 auto_vec<tree> vec_initial_defs;
4366 auto_vec<gimple *> phis;
4367 bool slp_reduc = false;
4368 bool direct_slp_reduc;
4369 tree new_phi_result;
4370 stmt_vec_info inner_phi = NULL;
4371 tree induction_index = NULL_TREE;
4373 if (slp_node)
4374 group_size = SLP_TREE_SCALAR_STMTS (slp_node).length ();
4376 if (nested_in_vect_loop_p (loop, stmt_info))
4378 outer_loop = loop;
4379 loop = loop->inner;
4380 nested_in_vect_loop = true;
4381 gcc_assert (!slp_node);
4384 vectype = STMT_VINFO_VECTYPE (stmt_info);
4385 gcc_assert (vectype);
4386 mode = TYPE_MODE (vectype);
4388 /* 1. Create the reduction def-use cycle:
4389 Set the arguments of REDUCTION_PHIS, i.e., transform
4391 loop:
4392 vec_def = phi <null, null> # REDUCTION_PHI
4393 VECT_DEF = vector_stmt # vectorized form of STMT
4396 into:
4398 loop:
4399 vec_def = phi <vec_init, VECT_DEF> # REDUCTION_PHI
4400 VECT_DEF = vector_stmt # vectorized form of STMT
4403 (in case of SLP, do it for all the phis). */
4405 /* Get the loop-entry arguments. */
4406 enum vect_def_type initial_def_dt = vect_unknown_def_type;
4407 if (slp_node)
4409 unsigned vec_num = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node);
4410 vec_initial_defs.reserve (vec_num);
4411 get_initial_defs_for_reduction (slp_node_instance->reduc_phis,
4412 &vec_initial_defs, vec_num,
4413 REDUC_GROUP_FIRST_ELEMENT (stmt_info),
4414 neutral_op);
4416 else
4418 /* Get at the scalar def before the loop, that defines the initial value
4419 of the reduction variable. */
4420 initial_def = PHI_ARG_DEF_FROM_EDGE (reduc_def_stmt,
4421 loop_preheader_edge (loop));
4422 /* Optimize: if initial_def is for REDUC_MAX smaller than the base
4423 and we can't use zero for induc_val, use initial_def. Similarly
4424 for REDUC_MIN and initial_def larger than the base. */
4425 if (TREE_CODE (initial_def) == INTEGER_CST
4426 && (STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info)
4427 == INTEGER_INDUC_COND_REDUCTION)
4428 && !integer_zerop (induc_val)
4429 && ((induc_code == MAX_EXPR
4430 && tree_int_cst_lt (initial_def, induc_val))
4431 || (induc_code == MIN_EXPR
4432 && tree_int_cst_lt (induc_val, initial_def))))
4433 induc_val = initial_def;
4435 if (double_reduc)
4436 /* In case of double reduction we only create a vector variable
4437 to be put in the reduction phi node. The actual statement
4438 creation is done later in this function. */
4439 vec_initial_def = vect_create_destination_var (initial_def, vectype);
4440 else if (nested_in_vect_loop)
4442 /* Do not use an adjustment def as that case is not supported
4443 correctly if ncopies is not one. */
4444 vect_is_simple_use (initial_def, loop_vinfo, &initial_def_dt);
4445 vec_initial_def = vect_get_vec_def_for_operand (initial_def,
4446 stmt_info);
4448 else
4449 vec_initial_def
4450 = get_initial_def_for_reduction (stmt_info, initial_def,
4451 &adjustment_def);
4452 vec_initial_defs.create (1);
4453 vec_initial_defs.quick_push (vec_initial_def);
4456 /* Set phi nodes arguments. */
4457 FOR_EACH_VEC_ELT (reduction_phis, i, phi_info)
4459 tree vec_init_def = vec_initial_defs[i];
4460 tree def = vect_defs[i];
4461 for (j = 0; j < ncopies; j++)
4463 if (j != 0)
4465 phi_info = STMT_VINFO_RELATED_STMT (phi_info);
4466 if (nested_in_vect_loop)
4467 vec_init_def
4468 = vect_get_vec_def_for_stmt_copy (loop_vinfo, vec_init_def);
4471 /* Set the loop-entry arg of the reduction-phi. */
4473 gphi *phi = as_a <gphi *> (phi_info->stmt);
4474 if (STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info)
4475 == INTEGER_INDUC_COND_REDUCTION)
4477 /* Initialise the reduction phi to zero. This prevents non-zero
4478 initial values from interfering with the reduction op. */
4479 gcc_assert (ncopies == 1);
4480 gcc_assert (i == 0);
4482 tree vec_init_def_type = TREE_TYPE (vec_init_def);
4483 tree induc_val_vec
4484 = build_vector_from_val (vec_init_def_type, induc_val);
4486 add_phi_arg (phi, induc_val_vec, loop_preheader_edge (loop),
4487 UNKNOWN_LOCATION);
4489 else
4490 add_phi_arg (phi, vec_init_def, loop_preheader_edge (loop),
4491 UNKNOWN_LOCATION);
4493 /* Set the loop-latch arg for the reduction-phi. */
4494 if (j > 0)
4495 def = vect_get_vec_def_for_stmt_copy (loop_vinfo, def);
4497 add_phi_arg (phi, def, loop_latch_edge (loop), UNKNOWN_LOCATION);
4499 if (dump_enabled_p ())
4500 dump_printf_loc (MSG_NOTE, vect_location,
4501 "transform reduction: created def-use cycle: %G%G",
4502 phi, SSA_NAME_DEF_STMT (def));
4506 /* For cond reductions we want to create a new vector (INDEX_COND_EXPR)
4507 which is updated with the current index of the loop for every match of
4508 the original loop's cond_expr (VEC_STMT). This results in a vector
4509 containing, for each vector lane, the last index at which the condition passed.
4510 The first match will be a 1 to allow 0 to be used for non-matching
4511 indexes. If there are no matches at all then the vector will be all
4512 zeroes. */
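/* Worked example with illustrative values: for four lanes the index IV
   starts at { 1, 2, 3, 4 } and steps by 4 each vector iteration.  If
   lane 2 matches in the first iteration and lane 0 in the second, the
   phi evolves { 0, 0, 0, 0 } -> { 0, 0, 3, 0 } -> { 5, 0, 3, 0 }, so
   each lane ends up holding the last index at which its condition held
   (or 0 if it never did).  */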
4513 if (STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info) == COND_REDUCTION)
4515 tree indx_before_incr, indx_after_incr;
4516 poly_uint64 nunits_out = TYPE_VECTOR_SUBPARTS (vectype);
4518 gimple *vec_stmt = STMT_VINFO_VEC_STMT (stmt_info)->stmt;
4519 gcc_assert (gimple_assign_rhs_code (vec_stmt) == VEC_COND_EXPR);
4521 int scalar_precision
4522 = GET_MODE_PRECISION (SCALAR_TYPE_MODE (TREE_TYPE (vectype)));
4523 tree cr_index_scalar_type = make_unsigned_type (scalar_precision);
4524 tree cr_index_vector_type = build_vector_type
4525 (cr_index_scalar_type, TYPE_VECTOR_SUBPARTS (vectype));
4527 /* First we create a simple vector induction variable which starts
4528 with the values {1,2,3,...} (SERIES_VECT) and increments by the
4529 vector size (STEP). */
4531 /* Create a {1,2,3,...} vector. */
4532 tree series_vect = build_index_vector (cr_index_vector_type, 1, 1);
4534 /* Create a vector of the step value. */
4535 tree step = build_int_cst (cr_index_scalar_type, nunits_out);
4536 tree vec_step = build_vector_from_val (cr_index_vector_type, step);
4538 /* Create an induction variable. */
4539 gimple_stmt_iterator incr_gsi;
4540 bool insert_after;
4541 standard_iv_increment_position (loop, &incr_gsi, &insert_after);
4542 create_iv (series_vect, vec_step, NULL_TREE, loop, &incr_gsi,
4543 insert_after, &indx_before_incr, &indx_after_incr);
4545 /* Next create a new phi node vector (NEW_PHI_TREE) which starts
4546 filled with zeros (VEC_ZERO). */
4548 /* Create a vector of 0s. */
4549 tree zero = build_zero_cst (cr_index_scalar_type);
4550 tree vec_zero = build_vector_from_val (cr_index_vector_type, zero);
4552 /* Create a vector phi node. */
4553 tree new_phi_tree = make_ssa_name (cr_index_vector_type);
4554 new_phi = create_phi_node (new_phi_tree, loop->header);
4555 loop_vinfo->add_stmt (new_phi);
4556 add_phi_arg (as_a <gphi *> (new_phi), vec_zero,
4557 loop_preheader_edge (loop), UNKNOWN_LOCATION);
4559 /* Now take the condition from the loop's original cond_expr
4560 (VEC_STMT) and produce a new cond_expr (INDEX_COND_EXPR) which for
4561 every match uses values from the induction variable
4562 (INDEX_BEFORE_INCR) otherwise uses values from the phi node
4563 (NEW_PHI_TREE).
4564 Finally, we update the phi (NEW_PHI_TREE) to take the value of
4565 the new cond_expr (INDEX_COND_EXPR). */
4567 /* Duplicate the condition from vec_stmt. */
4568 tree ccompare = unshare_expr (gimple_assign_rhs1 (vec_stmt));
4570 /* Create a conditional whose condition is taken from vec_stmt
4571 (CCOMPARE), whose 'then' value is the induction index (INDEX_BEFORE_INCR)
4572 and whose 'else' value is the phi (NEW_PHI_TREE). */
4573 tree index_cond_expr = build3 (VEC_COND_EXPR, cr_index_vector_type,
4574 ccompare, indx_before_incr,
4575 new_phi_tree);
4576 induction_index = make_ssa_name (cr_index_vector_type);
4577 gimple *index_condition = gimple_build_assign (induction_index,
4578 index_cond_expr);
4579 gsi_insert_before (&incr_gsi, index_condition, GSI_SAME_STMT);
4580 stmt_vec_info index_vec_info = loop_vinfo->add_stmt (index_condition);
4581 STMT_VINFO_VECTYPE (index_vec_info) = cr_index_vector_type;
4583 /* Update the phi with the vec cond. */
4584 add_phi_arg (as_a <gphi *> (new_phi), induction_index,
4585 loop_latch_edge (loop), UNKNOWN_LOCATION);
4588 /* 2. Create epilog code.
4589 The reduction epilog code operates across the elements of the vector
4590 of partial results computed by the vectorized loop.
4591 The reduction epilog code consists of:
4593 step 1: compute the scalar result in a vector (v_out2)
4594 step 2: extract the scalar result (s_out3) from the vector (v_out2)
4595 step 3: adjust the scalar result (s_out3) if needed.
4597 Step 1 can be accomplished using one of the following three schemes:
4598 (scheme 1) using reduc_fn, if available.
4599 (scheme 2) using whole-vector shifts, if available.
4600 (scheme 3) using a scalar loop. In this case steps 1+2 above are
4601 combined.
4603 The overall epilog code looks like this:
4605 s_out0 = phi <s_loop> # original EXIT_PHI
4606 v_out1 = phi <VECT_DEF> # NEW_EXIT_PHI
4607 v_out2 = reduce <v_out1> # step 1
4608 s_out3 = extract_field <v_out2, 0> # step 2
4609 s_out4 = adjust_result <s_out3> # step 3
4611 (step 3 is optional, and steps 1 and 2 may be combined).
4612 Lastly, the uses of s_out0 are replaced by s_out4. */
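/* Worked example with illustrative values: for a PLUS reduction with
   VF = 4, v_out1 might be { a, b, c, d }.  Step 1 reduces it to
   a + b + c + d (directly via reduc_fn, or with log2 (4) = 2 shift/add
   rounds, or with a scalar loop), step 2 extracts that scalar, and
   step 3 adds back the initial value when the "adjust in epilog"
   initialization was used.  */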
4615 /* 2.1 Create new loop-exit-phis to preserve loop-closed form:
4616 v_out1 = phi <VECT_DEF>
4617 Store them in NEW_PHIS. */
4619 exit_bb = single_exit (loop)->dest;
4620 prev_phi_info = NULL;
4621 new_phis.create (vect_defs.length ());
4622 FOR_EACH_VEC_ELT (vect_defs, i, def)
4624 for (j = 0; j < ncopies; j++)
4626 tree new_def = copy_ssa_name (def);
4627 phi = create_phi_node (new_def, exit_bb);
4628 stmt_vec_info phi_info = loop_vinfo->add_stmt (phi);
4629 if (j == 0)
4630 new_phis.quick_push (phi);
4631 else
4633 def = vect_get_vec_def_for_stmt_copy (loop_vinfo, def);
4634 STMT_VINFO_RELATED_STMT (prev_phi_info) = phi_info;
4637 SET_PHI_ARG_DEF (phi, single_exit (loop)->dest_idx, def);
4638 prev_phi_info = phi_info;
4642 /* The epilogue is created for the outer-loop, i.e., for the loop being
4643 vectorized. Create exit phis for the outer loop. */
4644 if (double_reduc)
4646 loop = outer_loop;
4647 exit_bb = single_exit (loop)->dest;
4648 inner_phis.create (vect_defs.length ());
4649 FOR_EACH_VEC_ELT (new_phis, i, phi)
4651 stmt_vec_info phi_info = loop_vinfo->lookup_stmt (phi);
4652 tree new_result = copy_ssa_name (PHI_RESULT (phi));
4653 gphi *outer_phi = create_phi_node (new_result, exit_bb);
4654 SET_PHI_ARG_DEF (outer_phi, single_exit (loop)->dest_idx,
4655 PHI_RESULT (phi));
4656 prev_phi_info = loop_vinfo->add_stmt (outer_phi);
4657 inner_phis.quick_push (phi_info);
4658 new_phis[i] = outer_phi;
4659 while (STMT_VINFO_RELATED_STMT (phi_info))
4661 phi_info = STMT_VINFO_RELATED_STMT (phi_info);
4662 new_result = copy_ssa_name (PHI_RESULT (phi_info->stmt));
4663 outer_phi = create_phi_node (new_result, exit_bb);
4664 SET_PHI_ARG_DEF (outer_phi, single_exit (loop)->dest_idx,
4665 PHI_RESULT (phi_info->stmt));
4666 stmt_vec_info outer_phi_info = loop_vinfo->add_stmt (outer_phi);
4667 STMT_VINFO_RELATED_STMT (prev_phi_info) = outer_phi_info;
4668 prev_phi_info = outer_phi_info;
4673 exit_gsi = gsi_after_labels (exit_bb);
4675 /* 2.2 Get the relevant tree-code to use in the epilog for schemes 2,3
4676 (i.e. when reduc_fn is not available) and in the final adjustment
4677 code (if needed). Also get the original scalar reduction variable as
4678 defined in the loop. In case STMT is a "pattern-stmt" (i.e. - it
4679 represents a reduction pattern), the tree-code and scalar-def are
4680 taken from the original stmt that the pattern-stmt (STMT) replaces.
4681 Otherwise (it is a regular reduction) - the tree-code and scalar-def
4682 are taken from STMT. */
4684 stmt_vec_info orig_stmt_info = vect_orig_stmt (stmt_info);
4685 if (orig_stmt_info != stmt_info)
4687 /* Reduction pattern */
4688 gcc_assert (STMT_VINFO_IN_PATTERN_P (orig_stmt_info));
4689 gcc_assert (STMT_VINFO_RELATED_STMT (orig_stmt_info) == stmt_info);
4692 code = gimple_assign_rhs_code (orig_stmt_info->stmt);
4693 /* For MINUS_EXPR the initial vector is [init_val,0,...,0], therefore,
4694 partial results are added and not subtracted. */
4695 if (code == MINUS_EXPR)
4696 code = PLUS_EXPR;
4698 scalar_dest = gimple_assign_lhs (orig_stmt_info->stmt);
4699 scalar_type = TREE_TYPE (scalar_dest);
4700 scalar_results.create (group_size);
4701 new_scalar_dest = vect_create_destination_var (scalar_dest, NULL);
4702 bitsize = TYPE_SIZE (scalar_type);
4704 /* In case this is a reduction in an inner-loop while vectorizing an outer
4705 loop - we don't need to extract a single scalar result at the end of the
4706 inner-loop (unless it is double reduction, i.e., the use of reduction is
4707 outside the outer-loop). The final vector of partial results will be used
4708 in the vectorized outer-loop, or reduced to a scalar result at the end of
4709 the outer-loop. */
4710 if (nested_in_vect_loop && !double_reduc)
4711 goto vect_finalize_reduction;
4713 /* SLP reduction without reduction chain, e.g.,
4714 # a1 = phi <a2, a0>
4715 # b1 = phi <b2, b0>
4716 a2 = operation (a1)
4717 b2 = operation (b1) */
4718 slp_reduc = (slp_node && !REDUC_GROUP_FIRST_ELEMENT (stmt_info));
4720 /* True if we should implement SLP_REDUC using native reduction operations
4721 instead of scalar operations. */
4722 direct_slp_reduc = (reduc_fn != IFN_LAST
4723 && slp_reduc
4724 && !TYPE_VECTOR_SUBPARTS (vectype).is_constant ());
4726 /* In case of reduction chain, e.g.,
4727 # a1 = phi <a3, a0>
4728 a2 = operation (a1)
4729 a3 = operation (a2),
4731 we may end up with more than one vector result. Here we reduce them to
4732 one vector. */
4733 if (REDUC_GROUP_FIRST_ELEMENT (stmt_info) || direct_slp_reduc)
4735 tree first_vect = PHI_RESULT (new_phis[0]);
4736 gassign *new_vec_stmt = NULL;
4737 vec_dest = vect_create_destination_var (scalar_dest, vectype);
4738 for (k = 1; k < new_phis.length (); k++)
4740 gimple *next_phi = new_phis[k];
4741 tree second_vect = PHI_RESULT (next_phi);
4742 tree tem = make_ssa_name (vec_dest, new_vec_stmt);
4743 new_vec_stmt = gimple_build_assign (tem, code,
4744 first_vect, second_vect);
4745 gsi_insert_before (&exit_gsi, new_vec_stmt, GSI_SAME_STMT);
4746 first_vect = tem;
4749 new_phi_result = first_vect;
4750 if (new_vec_stmt)
4752 new_phis.truncate (0);
4753 new_phis.safe_push (new_vec_stmt);
4756 /* Likewise if we couldn't use a single def-use cycle. */
4757 else if (ncopies > 1)
4759 gcc_assert (new_phis.length () == 1);
4760 tree first_vect = PHI_RESULT (new_phis[0]);
4761 gassign *new_vec_stmt = NULL;
4762 vec_dest = vect_create_destination_var (scalar_dest, vectype);
4763 stmt_vec_info next_phi_info = loop_vinfo->lookup_stmt (new_phis[0]);
4764 for (int k = 1; k < ncopies; ++k)
4766 next_phi_info = STMT_VINFO_RELATED_STMT (next_phi_info);
4767 tree second_vect = PHI_RESULT (next_phi_info->stmt);
4768 tree tem = make_ssa_name (vec_dest, new_vec_stmt);
4769 new_vec_stmt = gimple_build_assign (tem, code,
4770 first_vect, second_vect);
4771 gsi_insert_before (&exit_gsi, new_vec_stmt, GSI_SAME_STMT);
4772 first_vect = tem;
4774 new_phi_result = first_vect;
4775 new_phis.truncate (0);
4776 new_phis.safe_push (new_vec_stmt);
4778 else
4779 new_phi_result = PHI_RESULT (new_phis[0]);
4781 if (STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info) == COND_REDUCTION
4782 && reduc_fn != IFN_LAST)
4784 /* For condition reductions, we have a vector (NEW_PHI_RESULT) containing
4785 various data values where the condition matched and another vector
4786 (INDUCTION_INDEX) containing all the indexes of those matches. We
4787 need to extract the last matching index (which will be the index with
4788 highest value) and use this to index into the data vector.
4789 For the case where there were no matches, the data vector will contain
4790 all default values and the index vector will be all zeros. */
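/* Worked example with illustrative values: with
   NEW_PHI_RESULT = { d5, x, d3, x } (x being the default value) and
   INDUCTION_INDEX = { 5, 0, 3, 0 }, the maximum index is 5, the
   comparison selects lane 0 only, the VEC_COND therefore keeps
   { d5, 0, 0, 0 }, and the final MAX reduction yields d5.  */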
4792 /* Get various versions of the type of the vector of indexes. */
4793 tree index_vec_type = TREE_TYPE (induction_index);
4794 gcc_checking_assert (TYPE_UNSIGNED (index_vec_type));
4795 tree index_scalar_type = TREE_TYPE (index_vec_type);
4796 tree index_vec_cmp_type = build_same_sized_truth_vector_type
4797 (index_vec_type);
4799 /* Get an unsigned integer version of the type of the data vector. */
4800 int scalar_precision
4801 = GET_MODE_PRECISION (SCALAR_TYPE_MODE (scalar_type));
4802 tree scalar_type_unsigned = make_unsigned_type (scalar_precision);
4803 tree vectype_unsigned = build_vector_type
4804 (scalar_type_unsigned, TYPE_VECTOR_SUBPARTS (vectype));
4806 /* First we need to create a vector (ZERO_VEC) of zeros and another
4807 vector (MAX_INDEX_VEC) filled with the last matching index, which we
4808 can create using a MAX reduction and then expanding.
4809 In the case where the loop never made any matches, the max index will
4810 be zero. */
4812 /* Vector of {0, 0, 0,...}. */
4813 tree zero_vec = make_ssa_name (vectype);
4814 tree zero_vec_rhs = build_zero_cst (vectype);
4815 gimple *zero_vec_stmt = gimple_build_assign (zero_vec, zero_vec_rhs);
4816 gsi_insert_before (&exit_gsi, zero_vec_stmt, GSI_SAME_STMT);
4818 /* Find maximum value from the vector of found indexes. */
4819 tree max_index = make_ssa_name (index_scalar_type);
4820 gcall *max_index_stmt = gimple_build_call_internal (IFN_REDUC_MAX,
4821 1, induction_index);
4822 gimple_call_set_lhs (max_index_stmt, max_index);
4823 gsi_insert_before (&exit_gsi, max_index_stmt, GSI_SAME_STMT);
4825 /* Vector of {max_index, max_index, max_index,...}. */
4826 tree max_index_vec = make_ssa_name (index_vec_type);
4827 tree max_index_vec_rhs = build_vector_from_val (index_vec_type,
4828 max_index);
4829 gimple *max_index_vec_stmt = gimple_build_assign (max_index_vec,
4830 max_index_vec_rhs);
4831 gsi_insert_before (&exit_gsi, max_index_vec_stmt, GSI_SAME_STMT);
4833 /* Next we compare the new vector (MAX_INDEX_VEC) full of max indexes
4834 with the vector (INDUCTION_INDEX) of found indexes, choosing values
4835 from the data vector (NEW_PHI_RESULT) for matches, 0 (ZERO_VEC)
4836 otherwise. Only one value should match, resulting in a vector
4837 (VEC_COND) with one data value and the rest zeros.
4838 In the case where the loop never made any matches, every index will
4839 match, resulting in a vector with all data values (which will all be
4840 the default value). */
4842 /* Compare the max index vector to the vector of found indexes to find
4843 the position of the max value. */
4844 tree vec_compare = make_ssa_name (index_vec_cmp_type);
4845 gimple *vec_compare_stmt = gimple_build_assign (vec_compare, EQ_EXPR,
4846 induction_index,
4847 max_index_vec);
4848 gsi_insert_before (&exit_gsi, vec_compare_stmt, GSI_SAME_STMT);
4850 /* Use the compare to choose either values from the data vector or
4851 zero. */
4852 tree vec_cond = make_ssa_name (vectype);
4853 gimple *vec_cond_stmt = gimple_build_assign (vec_cond, VEC_COND_EXPR,
4854 vec_compare, new_phi_result,
4855 zero_vec);
4856 gsi_insert_before (&exit_gsi, vec_cond_stmt, GSI_SAME_STMT);
4858 /* Finally we need to extract the data value from the vector (VEC_COND)
4859 into a scalar (MATCHED_DATA_REDUC). Logically we want to do an OR
4860 reduction, but because this doesn't exist, we can use a MAX reduction
4861 instead. The data value might be signed or a float so we need to cast
4862 it first.
4863 In the case where the loop never made any matches, the data values are
4864 all identical, and so will reduce down correctly. */
4866 /* Make the matched data values unsigned. */
4867 tree vec_cond_cast = make_ssa_name (vectype_unsigned);
4868 tree vec_cond_cast_rhs = build1 (VIEW_CONVERT_EXPR, vectype_unsigned,
4869 vec_cond);
4870 gimple *vec_cond_cast_stmt = gimple_build_assign (vec_cond_cast,
4871 VIEW_CONVERT_EXPR,
4872 vec_cond_cast_rhs);
4873 gsi_insert_before (&exit_gsi, vec_cond_cast_stmt, GSI_SAME_STMT);
4875 /* Reduce down to a scalar value. */
4876 tree data_reduc = make_ssa_name (scalar_type_unsigned);
4877 gcall *data_reduc_stmt = gimple_build_call_internal (IFN_REDUC_MAX,
4878 1, vec_cond_cast);
4879 gimple_call_set_lhs (data_reduc_stmt, data_reduc);
4880 gsi_insert_before (&exit_gsi, data_reduc_stmt, GSI_SAME_STMT);
4882 /* Convert the reduced value back to the result type and set as the
4883 result. */
4884 gimple_seq stmts = NULL;
4885 new_temp = gimple_build (&stmts, VIEW_CONVERT_EXPR, scalar_type,
4886 data_reduc);
4887 gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
4888 scalar_results.safe_push (new_temp);
4890 else if (STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info) == COND_REDUCTION
4891 && reduc_fn == IFN_LAST)
4893 /* Condition reduction without supported IFN_REDUC_MAX. Generate
4894 idx = 0;
4895 idx_val = induction_index[0];
4896 val = data_reduc[0];
4897 for (idx = 0, val = init, i = 0; i < nelts; ++i)
4898 if (induction_index[i] > idx_val)
4899 val = data_reduc[i], idx_val = induction_index[i];
4900 return val; */
4902 tree data_eltype = TREE_TYPE (TREE_TYPE (new_phi_result));
4903 tree idx_eltype = TREE_TYPE (TREE_TYPE (induction_index));
4904 unsigned HOST_WIDE_INT el_size = tree_to_uhwi (TYPE_SIZE (idx_eltype));
4905 poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (TREE_TYPE (induction_index));
4906 /* Enforced by vectorizable_reduction, which ensures we have target
4907 support before allowing a conditional reduction on variable-length
4908 vectors. */
4909 unsigned HOST_WIDE_INT v_size = el_size * nunits.to_constant ();
4910 tree idx_val = NULL_TREE, val = NULL_TREE;
4911 for (unsigned HOST_WIDE_INT off = 0; off < v_size; off += el_size)
4913 tree old_idx_val = idx_val;
4914 tree old_val = val;
4915 idx_val = make_ssa_name (idx_eltype);
4916 epilog_stmt = gimple_build_assign (idx_val, BIT_FIELD_REF,
4917 build3 (BIT_FIELD_REF, idx_eltype,
4918 induction_index,
4919 bitsize_int (el_size),
4920 bitsize_int (off)));
4921 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
4922 val = make_ssa_name (data_eltype);
4923 epilog_stmt = gimple_build_assign (val, BIT_FIELD_REF,
4924 build3 (BIT_FIELD_REF,
4925 data_eltype,
4926 new_phi_result,
4927 bitsize_int (el_size),
4928 bitsize_int (off)));
4929 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
4930 if (off != 0)
4932 tree new_idx_val = idx_val;
4933 tree new_val = val;
4934 if (off != v_size - el_size)
4936 new_idx_val = make_ssa_name (idx_eltype);
4937 epilog_stmt = gimple_build_assign (new_idx_val,
4938 MAX_EXPR, idx_val,
4939 old_idx_val);
4940 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
4942 new_val = make_ssa_name (data_eltype);
4943 epilog_stmt = gimple_build_assign (new_val,
4944 COND_EXPR,
4945 build2 (GT_EXPR,
4946 boolean_type_node,
4947 idx_val,
4948 old_idx_val),
4949 val, old_val);
4950 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
4951 idx_val = new_idx_val;
4952 val = new_val;
4955 /* Convert the reduced value back to the result type and set as the
4956 result. */
4957 gimple_seq stmts = NULL;
4958 val = gimple_convert (&stmts, scalar_type, val);
4959 gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
4960 scalar_results.safe_push (val);
4963 /* 2.3 Create the reduction code, using one of the three schemes described
4964 above. In SLP we simply need to extract all the elements from the
4965 vector (without reducing them), so we use scalar shifts. */
4966 else if (reduc_fn != IFN_LAST && !slp_reduc)
4968 tree tmp;
4969 tree vec_elem_type;
4971 /* Case 1: Create:
4972 v_out2 = reduc_expr <v_out1> */
4974 if (dump_enabled_p ())
4975 dump_printf_loc (MSG_NOTE, vect_location,
4976 "Reduce using direct vector reduction.\n");
4978 vec_elem_type = TREE_TYPE (TREE_TYPE (new_phi_result));
4979 if (!useless_type_conversion_p (scalar_type, vec_elem_type))
4981 tree tmp_dest
4982 = vect_create_destination_var (scalar_dest, vec_elem_type);
4983 epilog_stmt = gimple_build_call_internal (reduc_fn, 1,
4984 new_phi_result);
4985 gimple_set_lhs (epilog_stmt, tmp_dest);
4986 new_temp = make_ssa_name (tmp_dest, epilog_stmt);
4987 gimple_set_lhs (epilog_stmt, new_temp);
4988 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
4990 epilog_stmt = gimple_build_assign (new_scalar_dest, NOP_EXPR,
4991 new_temp);
4993 else
4995 epilog_stmt = gimple_build_call_internal (reduc_fn, 1,
4996 new_phi_result);
4997 gimple_set_lhs (epilog_stmt, new_scalar_dest);
5000 new_temp = make_ssa_name (new_scalar_dest, epilog_stmt);
5001 gimple_set_lhs (epilog_stmt, new_temp);
5002 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5004 if ((STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info)
5005 == INTEGER_INDUC_COND_REDUCTION)
5006 && !operand_equal_p (initial_def, induc_val, 0))
5008 /* Earlier we set the initial value to be a vector of induc_val
5009 values. Check the result and if it is induc_val then replace
5010 with the original initial value, unless induc_val is
5011 the same as initial_def already. */
5012 tree zcompare = build2 (EQ_EXPR, boolean_type_node, new_temp,
5013 induc_val);
5015 tmp = make_ssa_name (new_scalar_dest);
5016 epilog_stmt = gimple_build_assign (tmp, COND_EXPR, zcompare,
5017 initial_def, new_temp);
5018 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5019 new_temp = tmp;
5022 scalar_results.safe_push (new_temp);
5024 else if (direct_slp_reduc)
5026 /* Here we create one vector for each of the REDUC_GROUP_SIZE results,
5027 with the elements for other SLP statements replaced with the
5028 neutral value. We can then do a normal reduction on each vector. */
5030 /* Enforced by vectorizable_reduction. */
5031 gcc_assert (new_phis.length () == 1);
5032 gcc_assert (pow2p_hwi (group_size));
5034 slp_tree orig_phis_slp_node = slp_node_instance->reduc_phis;
5035 vec<stmt_vec_info> orig_phis
5036 = SLP_TREE_SCALAR_STMTS (orig_phis_slp_node);
5037 gimple_seq seq = NULL;
5039 /* Build a vector {0, 1, 2, ...}, with the same number of elements
5040 and the same element size as VECTYPE. */
5041 tree index = build_index_vector (vectype, 0, 1);
5042 tree index_type = TREE_TYPE (index);
5043 tree index_elt_type = TREE_TYPE (index_type);
5044 tree mask_type = build_same_sized_truth_vector_type (index_type);
5046 /* Create a vector that, for each element, identifies which of
5047 the REDUC_GROUP_SIZE results should use it. */
5048 tree index_mask = build_int_cst (index_elt_type, group_size - 1);
5049 index = gimple_build (&seq, BIT_AND_EXPR, index_type, index,
5050 build_vector_from_val (index_type, index_mask));
5052 /* Get a neutral vector value. This is simply a splat of the neutral
5053 scalar value if we have one, otherwise the initial scalar value
5054 is itself a neutral value. */
5055 tree vector_identity = NULL_TREE;
5056 if (neutral_op)
5057 vector_identity = gimple_build_vector_from_val (&seq, vectype,
5058 neutral_op);
5059 for (unsigned int i = 0; i < group_size; ++i)
5061 /* If there's no universal neutral value, we can use the
5062 initial scalar value from the original PHI. This is used
5063 for MIN and MAX reduction, for example. */
5064 if (!neutral_op)
5066 tree scalar_value
5067 = PHI_ARG_DEF_FROM_EDGE (orig_phis[i]->stmt,
5068 loop_preheader_edge (loop));
5069 vector_identity = gimple_build_vector_from_val (&seq, vectype,
5070 scalar_value);
5073 /* Calculate the equivalent of:
5075 sel[j] = (index[j] == i);
5077 which selects the elements of NEW_PHI_RESULT that should
5078 be included in the result. */
5079 tree compare_val = build_int_cst (index_elt_type, i);
5080 compare_val = build_vector_from_val (index_type, compare_val);
5081 tree sel = gimple_build (&seq, EQ_EXPR, mask_type,
5082 index, compare_val);
5084 /* Calculate the equivalent of:
5086 vec = sel ? new_phi_result : vector_identity;
5088 VEC is now suitable for a full vector reduction. */
5089 tree vec = gimple_build (&seq, VEC_COND_EXPR, vectype,
5090 sel, new_phi_result, vector_identity);
5092 /* Do the reduction and convert it to the appropriate type. */
5093 tree scalar = gimple_build (&seq, as_combined_fn (reduc_fn),
5094 TREE_TYPE (vectype), vec);
5095 scalar = gimple_convert (&seq, scalar_type, scalar);
5096 scalar_results.safe_push (scalar);
5098 gsi_insert_seq_before (&exit_gsi, seq, GSI_SAME_STMT);
5100 else
5102 bool reduce_with_shift;
5103 tree vec_temp;
5105 /* COND reductions all do the final reduction with MAX_EXPR
5106 or MIN_EXPR. */
5107 if (code == COND_EXPR)
5109 if (STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info)
5110 == INTEGER_INDUC_COND_REDUCTION)
5111 code = induc_code;
5112 else if (STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info)
5113 == CONST_COND_REDUCTION)
5114 code = STMT_VINFO_VEC_CONST_COND_REDUC_CODE (stmt_info);
5115 else
5116 code = MAX_EXPR;
5119 /* See if the target wants to do the final (shift) reduction
5120 in a vector mode of smaller size and first reduce upper/lower
5121 halves against each other. */
5122 enum machine_mode mode1 = mode;
5123 tree vectype1 = vectype;
5124 unsigned sz = tree_to_uhwi (TYPE_SIZE_UNIT (vectype));
5125 unsigned sz1 = sz;
5126 if (!slp_reduc
5127 && (mode1 = targetm.vectorize.split_reduction (mode)) != mode)
5128 sz1 = GET_MODE_SIZE (mode1).to_constant ();
5130 vectype1 = get_vectype_for_scalar_type_and_size (scalar_type, sz1);
5131 reduce_with_shift = have_whole_vector_shift (mode1);
5132 if (!VECTOR_MODE_P (mode1))
5133 reduce_with_shift = false;
5134 else
5136 optab optab = optab_for_tree_code (code, vectype1, optab_default);
5137 if (optab_handler (optab, mode1) == CODE_FOR_nothing)
5138 reduce_with_shift = false;
5141 /* First reduce the vector to the desired size on which we should
5142 do the shift reduction, by combining upper and lower halves. */
5143 new_temp = new_phi_result;
5144 while (sz > sz1)
5146 gcc_assert (!slp_reduc);
5147 sz /= 2;
5148 vectype1 = get_vectype_for_scalar_type_and_size (scalar_type, sz);
5150 /* The target has to make sure we support lowpart/highpart
5151 extraction, either via direct vector extract or through
5152 integer mode punning. */
5153 tree dst1, dst2;
5154 if (convert_optab_handler (vec_extract_optab,
5155 TYPE_MODE (TREE_TYPE (new_temp)),
5156 TYPE_MODE (vectype1))
5157 != CODE_FOR_nothing)
5159 /* Extract sub-vectors directly once vec_extract becomes
5160 a conversion optab. */
5161 dst1 = make_ssa_name (vectype1);
5162 epilog_stmt
5163 = gimple_build_assign (dst1, BIT_FIELD_REF,
5164 build3 (BIT_FIELD_REF, vectype1,
5165 new_temp, TYPE_SIZE (vectype1),
5166 bitsize_int (0)));
5167 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5168 dst2 = make_ssa_name (vectype1);
5169 epilog_stmt
5170 = gimple_build_assign (dst2, BIT_FIELD_REF,
5171 build3 (BIT_FIELD_REF, vectype1,
5172 new_temp, TYPE_SIZE (vectype1),
5173 bitsize_int (sz * BITS_PER_UNIT)));
5174 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5176 else
5178 /* Extract via punning to an appropriately sized integer mode
5179 vector. */
5180 tree eltype = build_nonstandard_integer_type (sz * BITS_PER_UNIT, 1);
5182 tree etype = build_vector_type (eltype, 2);
5183 gcc_assert (convert_optab_handler (vec_extract_optab,
5184 TYPE_MODE (etype),
5185 TYPE_MODE (eltype))
5186 != CODE_FOR_nothing);
5187 tree tem = make_ssa_name (etype);
5188 epilog_stmt = gimple_build_assign (tem, VIEW_CONVERT_EXPR,
5189 build1 (VIEW_CONVERT_EXPR,
5190 etype, new_temp));
5191 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5192 new_temp = tem;
5193 tem = make_ssa_name (eltype);
5194 epilog_stmt
5195 = gimple_build_assign (tem, BIT_FIELD_REF,
5196 build3 (BIT_FIELD_REF, eltype,
5197 new_temp, TYPE_SIZE (eltype),
5198 bitsize_int (0)));
5199 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5200 dst1 = make_ssa_name (vectype1);
5201 epilog_stmt = gimple_build_assign (dst1, VIEW_CONVERT_EXPR,
5202 build1 (VIEW_CONVERT_EXPR,
5203 vectype1, tem));
5204 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5205 tem = make_ssa_name (eltype);
5206 epilog_stmt
5207 = gimple_build_assign (tem, BIT_FIELD_REF,
5208 build3 (BIT_FIELD_REF, eltype,
5209 new_temp, TYPE_SIZE (eltype),
5210 bitsize_int (sz * BITS_PER_UNIT)));
5211 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5212 dst2 = make_ssa_name (vectype1);
5213 epilog_stmt = gimple_build_assign (dst2, VIEW_CONVERT_EXPR,
5214 build1 (VIEW_CONVERT_EXPR,
5215 vectype1, tem));
5216 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5219 new_temp = make_ssa_name (vectype1);
5220 epilog_stmt = gimple_build_assign (new_temp, code, dst1, dst2);
5221 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5224 if (reduce_with_shift && !slp_reduc)
5226 int element_bitsize = tree_to_uhwi (bitsize);
5227 /* Enforced by vectorizable_reduction, which disallows SLP reductions
5228 for variable-length vectors and also requires direct target support
5229 for loop reductions. */
5230 int vec_size_in_bits = tree_to_uhwi (TYPE_SIZE (vectype1));
5231 int nelements = vec_size_in_bits / element_bitsize;
5232 vec_perm_builder sel;
5233 vec_perm_indices indices;
5235 int elt_offset;
5237 tree zero_vec = build_zero_cst (vectype1);
5238 /* Case 2: Create:
5239 for (offset = nelements/2; offset >= 1; offset/=2)
5241 Create: va' = vec_shift <va, offset>
5242 Create: va = vop <va, va'>
5243 } */
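/* Illustrative scalar model of the sequence generated below (assumed
   helper and element values, not part of this file), for a PLUS
   reduction over a power-of-two number of int lanes:

     static int
     reduce_with_shifts (int v[], int nelements)
     {
       for (int off = nelements / 2; off >= 1; off /= 2)
         for (int i = 0; i < off; i++)
           v[i] += v[i + off];     // va = vop <va, vec_shift <va, off>>
       return v[0];                // s_out3 = extract_field <v_out2, 0>
     }

   e.g. {1, 2, 3, 4} folds to {4, 6, ...}, then {10, ...}, giving 10.  */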
5245 tree rhs;
5247 if (dump_enabled_p ())
5248 dump_printf_loc (MSG_NOTE, vect_location,
5249 "Reduce using vector shifts\n");
5251 mode1 = TYPE_MODE (vectype1);
5252 vec_dest = vect_create_destination_var (scalar_dest, vectype1);
5253 for (elt_offset = nelements / 2;
5254 elt_offset >= 1;
5255 elt_offset /= 2)
5257 calc_vec_perm_mask_for_shift (elt_offset, nelements, &sel);
5258 indices.new_vector (sel, 2, nelements);
5259 tree mask = vect_gen_perm_mask_any (vectype1, indices);
5260 epilog_stmt = gimple_build_assign (vec_dest, VEC_PERM_EXPR,
5261 new_temp, zero_vec, mask);
5262 new_name = make_ssa_name (vec_dest, epilog_stmt);
5263 gimple_assign_set_lhs (epilog_stmt, new_name);
5264 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5266 epilog_stmt = gimple_build_assign (vec_dest, code, new_name,
5267 new_temp);
5268 new_temp = make_ssa_name (vec_dest, epilog_stmt);
5269 gimple_assign_set_lhs (epilog_stmt, new_temp);
5270 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5273 /* 2.4 Extract the final scalar result. Create:
5274 s_out3 = extract_field <v_out2, bitpos> */
5276 if (dump_enabled_p ())
5277 dump_printf_loc (MSG_NOTE, vect_location,
5278 "extract scalar result\n");
5280 rhs = build3 (BIT_FIELD_REF, scalar_type, new_temp,
5281 bitsize, bitsize_zero_node);
5282 epilog_stmt = gimple_build_assign (new_scalar_dest, rhs);
5283 new_temp = make_ssa_name (new_scalar_dest, epilog_stmt);
5284 gimple_assign_set_lhs (epilog_stmt, new_temp);
5285 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5286 scalar_results.safe_push (new_temp);
5288 else
5290 /* Case 3: Create:
5291 s = extract_field <v_out2, 0>
5292 for (offset = element_size;
5293 offset < vector_size;
5294 offset += element_size;)
5296 Create: s' = extract_field <v_out2, offset>
5297 Create: s = op <s, s'> // For non SLP cases
5298 } */
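/* Illustrative scalar model of the extraction loop below (assumed helper,
   not part of this file), for the non-SLP case:

     static int
     reduce_with_scalar_ops (const int *v, int nelements)
     {
       int s = v[0];               // s = extract_field <v_out2, 0>
       for (int i = 1; i < nelements; i++)
         s = s + v[i];             // s = op <s, extract_field <v_out2, i>>
       return s;
     }

   In the SLP case the extracted elements are instead collected one by
   one in SCALAR_RESULTS, as the code below does.  */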
5300 if (dump_enabled_p ())
5301 dump_printf_loc (MSG_NOTE, vect_location,
5302 "Reduce using scalar code.\n");
5304 int vec_size_in_bits = tree_to_uhwi (TYPE_SIZE (vectype1));
5305 int element_bitsize = tree_to_uhwi (bitsize);
5306 FOR_EACH_VEC_ELT (new_phis, i, new_phi)
5308 int bit_offset;
5309 if (gimple_code (new_phi) == GIMPLE_PHI)
5310 vec_temp = PHI_RESULT (new_phi);
5311 else
5312 vec_temp = gimple_assign_lhs (new_phi);
5313 tree rhs = build3 (BIT_FIELD_REF, scalar_type, vec_temp, bitsize,
5314 bitsize_zero_node);
5315 epilog_stmt = gimple_build_assign (new_scalar_dest, rhs);
5316 new_temp = make_ssa_name (new_scalar_dest, epilog_stmt);
5317 gimple_assign_set_lhs (epilog_stmt, new_temp);
5318 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5320 /* In SLP we don't need to apply the reduction operation, so we just
5321 collect the s' values in SCALAR_RESULTS. */
5322 if (slp_reduc)
5323 scalar_results.safe_push (new_temp);
5325 for (bit_offset = element_bitsize;
5326 bit_offset < vec_size_in_bits;
5327 bit_offset += element_bitsize)
5329 tree bitpos = bitsize_int (bit_offset);
5330 tree rhs = build3 (BIT_FIELD_REF, scalar_type, vec_temp,
5331 bitsize, bitpos);
5333 epilog_stmt = gimple_build_assign (new_scalar_dest, rhs);
5334 new_name = make_ssa_name (new_scalar_dest, epilog_stmt);
5335 gimple_assign_set_lhs (epilog_stmt, new_name);
5336 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5338 if (slp_reduc)
5340 /* In SLP we don't need to apply the reduction operation, so
5341 we just collect the s' values in SCALAR_RESULTS. */
5342 new_temp = new_name;
5343 scalar_results.safe_push (new_name);
5345 else
5347 epilog_stmt = gimple_build_assign (new_scalar_dest, code,
5348 new_name, new_temp);
5349 new_temp = make_ssa_name (new_scalar_dest, epilog_stmt);
5350 gimple_assign_set_lhs (epilog_stmt, new_temp);
5351 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5356 /* The only case in which we need to reduce scalar results in SLP is
5357 unrolling. If the size of SCALAR_RESULTS is greater than
5358 REDUC_GROUP_SIZE, we reduce them by combining elements modulo
5359 REDUC_GROUP_SIZE. */
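/* For example (illustrative numbers): with REDUC_GROUP_SIZE == 2 and four
   scalar results s0..s3 produced by an unrolled SLP instance, the loop
   below leaves

     scalar_results[0] = s0 OP s2;
     scalar_results[1] = s1 OP s3;

   i.e. one combined result per position within the reduction group.  */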
5360 if (slp_reduc)
5362 tree res, first_res, new_res;
5363 gimple *new_stmt;
5365 /* Reduce multiple scalar results in case of SLP unrolling. */
5366 for (j = group_size; scalar_results.iterate (j, &res);
5367 j++)
5369 first_res = scalar_results[j % group_size];
5370 new_stmt = gimple_build_assign (new_scalar_dest, code,
5371 first_res, res);
5372 new_res = make_ssa_name (new_scalar_dest, new_stmt);
5373 gimple_assign_set_lhs (new_stmt, new_res);
5374 gsi_insert_before (&exit_gsi, new_stmt, GSI_SAME_STMT);
5375 scalar_results[j % group_size] = new_res;
5378 else
5379 /* Not SLP - we have one scalar to keep in SCALAR_RESULTS. */
5380 scalar_results.safe_push (new_temp);
5383 if ((STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info)
5384 == INTEGER_INDUC_COND_REDUCTION)
5385 && !operand_equal_p (initial_def, induc_val, 0))
5387 /* Earlier we set the initial value to be a vector of induc_val
5388 values. Check the result, and if it is induc_val then replace
5389 it with the original initial value, unless induc_val is
5390 already the same as initial_def. */
5391 tree zcompare = build2 (EQ_EXPR, boolean_type_node, new_temp,
5392 induc_val);
5394 tree tmp = make_ssa_name (new_scalar_dest);
5395 epilog_stmt = gimple_build_assign (tmp, COND_EXPR, zcompare,
5396 initial_def, new_temp);
5397 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5398 scalar_results[0] = tmp;
5402 vect_finalize_reduction:
5404 if (double_reduc)
5405 loop = loop->inner;
5407 /* 2.5 Adjust the final result by the initial value of the reduction
5408 variable. (When such adjustment is not needed, then
5409 'adjustment_def' is zero). For example, if code is PLUS we create:
5410 new_temp = loop_exit_def + adjustment_def */
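/* Illustrative source-level view (an assumed example, not from this
   file): for

     int sum = 10;
     for (int i = 0; i < n; i++)
       sum += a[i];

   the vector partial sums are accumulated starting from zero, and the
   original initial value 10 is ADJUSTMENT_DEF, added back here once the
   partial results have been reduced to a scalar.  */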
5412 if (adjustment_def)
5414 gcc_assert (!slp_reduc);
5415 if (nested_in_vect_loop)
5417 new_phi = new_phis[0];
5418 gcc_assert (TREE_CODE (TREE_TYPE (adjustment_def)) == VECTOR_TYPE);
5419 expr = build2 (code, vectype, PHI_RESULT (new_phi), adjustment_def);
5420 new_dest = vect_create_destination_var (scalar_dest, vectype);
5422 else
5424 new_temp = scalar_results[0];
5425 gcc_assert (TREE_CODE (TREE_TYPE (adjustment_def)) != VECTOR_TYPE);
5426 expr = build2 (code, scalar_type, new_temp, adjustment_def);
5427 new_dest = vect_create_destination_var (scalar_dest, scalar_type);
5430 epilog_stmt = gimple_build_assign (new_dest, expr);
5431 new_temp = make_ssa_name (new_dest, epilog_stmt);
5432 gimple_assign_set_lhs (epilog_stmt, new_temp);
5433 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5434 if (nested_in_vect_loop)
5436 stmt_vec_info epilog_stmt_info = loop_vinfo->add_stmt (epilog_stmt);
5437 STMT_VINFO_RELATED_STMT (epilog_stmt_info)
5438 = STMT_VINFO_RELATED_STMT (loop_vinfo->lookup_stmt (new_phi));
5440 if (!double_reduc)
5441 scalar_results.quick_push (new_temp);
5442 else
5443 scalar_results[0] = new_temp;
5445 else
5446 scalar_results[0] = new_temp;
5448 new_phis[0] = epilog_stmt;
5451 /* 2.6 Handle the loop-exit phis. Replace the uses of scalar loop-exit
5452 phis with new adjusted scalar results, i.e., replace use <s_out0>
5453 with use <s_out4>.
5455 Transform:
5456 loop_exit:
5457 s_out0 = phi <s_loop> # (scalar) EXIT_PHI
5458 v_out1 = phi <VECT_DEF> # NEW_EXIT_PHI
5459 v_out2 = reduce <v_out1>
5460 s_out3 = extract_field <v_out2, 0>
5461 s_out4 = adjust_result <s_out3>
5462 use <s_out0>
5463 use <s_out0>
5465 into:
5467 loop_exit:
5468 s_out0 = phi <s_loop> # (scalar) EXIT_PHI
5469 v_out1 = phi <VECT_DEF> # NEW_EXIT_PHI
5470 v_out2 = reduce <v_out1>
5471 s_out3 = extract_field <v_out2, 0>
5472 s_out4 = adjust_result <s_out3>
5473 use <s_out4>
5474 use <s_out4> */
5477 /* In an SLP reduction chain we reduce the vector results into one vector
5478 if necessary, hence we set REDUC_GROUP_SIZE to 1 here. SCALAR_DEST is the
5479 LHS of the last stmt in the reduction chain, since we are looking for
5480 the loop exit phi node. */
5481 if (REDUC_GROUP_FIRST_ELEMENT (stmt_info))
5483 stmt_vec_info dest_stmt_info
5484 = vect_orig_stmt (SLP_TREE_SCALAR_STMTS (slp_node)[group_size - 1]);
5485 scalar_dest = gimple_assign_lhs (dest_stmt_info->stmt);
5486 group_size = 1;
5489 /* In SLP we may have several statements in NEW_PHIS and REDUCTION_PHIS (in
5490 case REDUC_GROUP_SIZE is greater than the vectorization factor).
5491 Therefore, we need to match SCALAR_RESULTS with corresponding statements.
5492 The first (REDUC_GROUP_SIZE / number of new vector stmts) scalar results
5493 correspond to the first vector stmt, etc.
5494 (RATIO is equal to (REDUC_GROUP_SIZE / number of new vector stmts)). */
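/* For instance (illustrative numbers): with REDUC_GROUP_SIZE == 4 and two
   vector stmts in NEW_PHIS, RATIO is 2, so scalar results 0-1 are matched
   with the first vector stmt and scalar results 2-3 with the second.  */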
5495 if (group_size > new_phis.length ())
5497 ratio = group_size / new_phis.length ();
5498 gcc_assert (!(group_size % new_phis.length ()));
5500 else
5501 ratio = 1;
5503 stmt_vec_info epilog_stmt_info = NULL;
5504 for (k = 0; k < group_size; k++)
5506 if (k % ratio == 0)
5508 epilog_stmt_info = loop_vinfo->lookup_stmt (new_phis[k / ratio]);
5509 reduction_phi_info = reduction_phis[k / ratio];
5510 if (double_reduc)
5511 inner_phi = inner_phis[k / ratio];
5514 if (slp_reduc)
5516 stmt_vec_info scalar_stmt_info = SLP_TREE_SCALAR_STMTS (slp_node)[k];
5518 orig_stmt_info = STMT_VINFO_RELATED_STMT (scalar_stmt_info);
5519 /* SLP statements can't participate in patterns. */
5520 gcc_assert (!orig_stmt_info);
5521 scalar_dest = gimple_assign_lhs (scalar_stmt_info->stmt);
5524 phis.create (3);
5525 /* Find the loop-closed-use at the loop exit of the original scalar
5526 result. (The reduction result is expected to have two immediate uses -
5527 one at the latch block, and one at the loop exit). */
5528 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, scalar_dest)
5529 if (!flow_bb_inside_loop_p (loop, gimple_bb (USE_STMT (use_p)))
5530 && !is_gimple_debug (USE_STMT (use_p)))
5531 phis.safe_push (USE_STMT (use_p));
5533 /* While we expect to have found an exit_phi because of loop-closed-ssa
5534 form, we can end up without one if the scalar cycle is dead. */
5536 FOR_EACH_VEC_ELT (phis, i, exit_phi)
5538 if (outer_loop)
5540 stmt_vec_info exit_phi_vinfo
5541 = loop_vinfo->lookup_stmt (exit_phi);
5542 gphi *vect_phi;
5544 /* FORNOW. We do not currently support the case in which an inner-loop
5545 reduction is not used in the outer-loop (but only outside the
5546 outer-loop), unless it is a double reduction. */
5547 gcc_assert ((STMT_VINFO_RELEVANT_P (exit_phi_vinfo)
5548 && !STMT_VINFO_LIVE_P (exit_phi_vinfo))
5549 || double_reduc);
5551 if (double_reduc)
5552 STMT_VINFO_VEC_STMT (exit_phi_vinfo) = inner_phi;
5553 else
5554 STMT_VINFO_VEC_STMT (exit_phi_vinfo) = epilog_stmt_info;
5555 if (!double_reduc
5556 || STMT_VINFO_DEF_TYPE (exit_phi_vinfo)
5557 != vect_double_reduction_def)
5558 continue;
5560 /* Handle double reduction:
5562 stmt1: s1 = phi <s0, s2> - double reduction phi (outer loop)
5563 stmt2: s3 = phi <s1, s4> - (regular) reduc phi (inner loop)
5564 stmt3: s4 = use (s3) - (regular) reduc stmt (inner loop)
5565 stmt4: s2 = phi <s4> - double reduction stmt (outer loop)
5567 At that point the regular reduction (stmt2 and stmt3) is
5568 already vectorized, as well as the exit phi node, stmt4.
5569 Here we vectorize the phi node of double reduction, stmt1, and
5570 update all relevant statements. */
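/* A source form that gives rise to this shape (illustrative, assuming
   the outer loop is the one being vectorized):

     int s = 0;                      // s0
     for (int i = 0; i < n; i++)     // outer loop: s1 = phi <s0, s2>
       for (int j = 0; j < m; j++)   // inner loop: s3 = phi <s1, s4>
         s += a[i][j];               //             s4 = s3 + a[i][j]
                                     // after inner loop: s2 = phi <s4>
   */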
5572 /* Go through all the uses of s2 to find double reduction phi
5573 node, i.e., stmt1 above. */
5574 orig_name = PHI_RESULT (exit_phi);
5575 FOR_EACH_IMM_USE_STMT (use_stmt, imm_iter, orig_name)
5577 stmt_vec_info use_stmt_vinfo;
5578 tree vect_phi_init, preheader_arg, vect_phi_res;
5579 basic_block bb = gimple_bb (use_stmt);
5581 /* Check that USE_STMT is really double reduction phi
5582 node. */
5583 if (gimple_code (use_stmt) != GIMPLE_PHI
5584 || gimple_phi_num_args (use_stmt) != 2
5585 || bb->loop_father != outer_loop)
5586 continue;
5587 use_stmt_vinfo = loop_vinfo->lookup_stmt (use_stmt);
5588 if (!use_stmt_vinfo
5589 || STMT_VINFO_DEF_TYPE (use_stmt_vinfo)
5590 != vect_double_reduction_def)
5591 continue;
5593 /* Create vector phi node for double reduction:
5594 vs1 = phi <vs0, vs2>
5595 vs1 was created previously in this function by a call to
5596 vect_get_vec_def_for_operand and is stored in
5597 vec_initial_def;
5598 vs2 is defined by INNER_PHI, the vectorized EXIT_PHI;
5599 vs0 is created here. */
5601 /* Create vector phi node. */
5602 vect_phi = create_phi_node (vec_initial_def, bb);
5603 loop_vec_info_for_loop (outer_loop)->add_stmt (vect_phi);
5605 /* Create vs0 - initial def of the double reduction phi. */
5606 preheader_arg = PHI_ARG_DEF_FROM_EDGE (use_stmt,
5607 loop_preheader_edge (outer_loop));
5608 vect_phi_init = get_initial_def_for_reduction
5609 (stmt_info, preheader_arg, NULL);
5611 /* Update phi node arguments with vs0 and vs2. */
5612 add_phi_arg (vect_phi, vect_phi_init,
5613 loop_preheader_edge (outer_loop),
5614 UNKNOWN_LOCATION);
5615 add_phi_arg (vect_phi, PHI_RESULT (inner_phi->stmt),
5616 loop_latch_edge (outer_loop), UNKNOWN_LOCATION);
5617 if (dump_enabled_p ())
5618 dump_printf_loc (MSG_NOTE, vect_location,
5619 "created double reduction phi node: %G",
5620 vect_phi);
5622 vect_phi_res = PHI_RESULT (vect_phi);
5624 /* Replace the use, i.e., set the correct vs1 in the regular
5625 reduction phi node. FORNOW, NCOPIES is always 1, so the
5626 loop is redundant. */
5627 stmt_vec_info use_info = reduction_phi_info;
5628 for (j = 0; j < ncopies; j++)
5630 edge pr_edge = loop_preheader_edge (loop);
5631 SET_PHI_ARG_DEF (as_a <gphi *> (use_info->stmt),
5632 pr_edge->dest_idx, vect_phi_res);
5633 use_info = STMT_VINFO_RELATED_STMT (use_info);
5639 phis.release ();
5640 if (nested_in_vect_loop)
5642 if (double_reduc)
5643 loop = outer_loop;
5644 else
5645 continue;
5648 phis.create (3);
5649 /* Find the loop-closed-use at the loop exit of the original scalar
5650 result. (The reduction result is expected to have two immediate uses,
5651 one at the latch block, and one at the loop exit). For double
5652 reductions we are looking for exit phis of the outer loop. */
5653 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, scalar_dest)
5655 if (!flow_bb_inside_loop_p (loop, gimple_bb (USE_STMT (use_p))))
5657 if (!is_gimple_debug (USE_STMT (use_p)))
5658 phis.safe_push (USE_STMT (use_p));
5660 else
5662 if (double_reduc && gimple_code (USE_STMT (use_p)) == GIMPLE_PHI)
5664 tree phi_res = PHI_RESULT (USE_STMT (use_p));
5666 FOR_EACH_IMM_USE_FAST (phi_use_p, phi_imm_iter, phi_res)
5668 if (!flow_bb_inside_loop_p (loop,
5669 gimple_bb (USE_STMT (phi_use_p)))
5670 && !is_gimple_debug (USE_STMT (phi_use_p)))
5671 phis.safe_push (USE_STMT (phi_use_p));
5677 FOR_EACH_VEC_ELT (phis, i, exit_phi)
5679 /* Replace the uses: */
5680 orig_name = PHI_RESULT (exit_phi);
5681 scalar_result = scalar_results[k];
5682 FOR_EACH_IMM_USE_STMT (use_stmt, imm_iter, orig_name)
5683 FOR_EACH_IMM_USE_ON_STMT (use_p, imm_iter)
5684 SET_USE (use_p, scalar_result);
5687 phis.release ();
5691 /* Return a vector of type VECTYPE that is equal to the vector select
5692 operation "MASK ? VEC : IDENTITY". Insert the select statements
5693 before GSI. */
5695 static tree
5696 merge_with_identity (gimple_stmt_iterator *gsi, tree mask, tree vectype,
5697 tree vec, tree identity)
5699 tree cond = make_temp_ssa_name (vectype, NULL, "cond");
5700 gimple *new_stmt = gimple_build_assign (cond, VEC_COND_EXPR,
5701 mask, vec, identity);
5702 gsi_insert_before (gsi, new_stmt, GSI_SAME_STMT);
5703 return cond;
5706 /* Successively apply CODE to each element of VECTOR_RHS, in left-to-right
5707 order, starting with LHS. Insert the extraction statements before GSI and
5708 associate the new scalar SSA names with variable SCALAR_DEST.
5709 Return the SSA name for the result. */
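/* Illustrative scalar equivalent of the expansion (assumed helper and
   element type, not part of this file):

     static double
     expand_fold_left (double lhs, const double *v, int n)
     {
       for (int i = 0; i < n; i++)
         lhs = lhs + v[i];   // lhs = CODE <lhs, BIT_FIELD_REF <v, i>>
       return lhs;
     }

   The strict left-to-right order is what preserves the scalar semantics
   of reductions that must not be reassociated.  */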
5711 static tree
5712 vect_expand_fold_left (gimple_stmt_iterator *gsi, tree scalar_dest,
5713 tree_code code, tree lhs, tree vector_rhs)
5715 tree vectype = TREE_TYPE (vector_rhs);
5716 tree scalar_type = TREE_TYPE (vectype);
5717 tree bitsize = TYPE_SIZE (scalar_type);
5718 unsigned HOST_WIDE_INT vec_size_in_bits = tree_to_uhwi (TYPE_SIZE (vectype));
5719 unsigned HOST_WIDE_INT element_bitsize = tree_to_uhwi (bitsize);
5721 for (unsigned HOST_WIDE_INT bit_offset = 0;
5722 bit_offset < vec_size_in_bits;
5723 bit_offset += element_bitsize)
5725 tree bitpos = bitsize_int (bit_offset);
5726 tree rhs = build3 (BIT_FIELD_REF, scalar_type, vector_rhs,
5727 bitsize, bitpos);
5729 gassign *stmt = gimple_build_assign (scalar_dest, rhs);
5730 rhs = make_ssa_name (scalar_dest, stmt);
5731 gimple_assign_set_lhs (stmt, rhs);
5732 gsi_insert_before (gsi, stmt, GSI_SAME_STMT);
5734 stmt = gimple_build_assign (scalar_dest, code, lhs, rhs);
5735 tree new_name = make_ssa_name (scalar_dest, stmt);
5736 gimple_assign_set_lhs (stmt, new_name);
5737 gsi_insert_before (gsi, stmt, GSI_SAME_STMT);
5738 lhs = new_name;
5740 return lhs;
5743 /* Perform an in-order reduction (FOLD_LEFT_REDUCTION). STMT_INFO is the
5744 statement that sets the live-out value. REDUC_DEF_STMT is the phi
5745 statement. CODE is the operation performed by STMT_INFO and OPS are
5746 its scalar operands. REDUC_INDEX is the index of the operand in
5747 OPS that is set by REDUC_DEF_STMT. REDUC_FN is the function that
5748 implements in-order reduction, or IFN_LAST if we should open-code it.
5749 VECTYPE_IN is the type of the vector input. MASKS specifies the masks
5750 that should be used to control the operation in a fully-masked loop. */
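/* A typical source pattern that reaches this function (illustrative):
   a floating-point accumulation compiled without -ffast-math,

     double s = 0.0;
     for (int i = 0; i < n; i++)
       s += a[i];

   where reassociating the additions could change the rounding.  Each
   vector of A's elements is therefore folded into S strictly in order,
   either via REDUC_FN (e.g. IFN_FOLD_LEFT_PLUS) or via the open-coded
   vect_expand_fold_left above.  */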
5752 static bool
5753 vectorize_fold_left_reduction (stmt_vec_info stmt_info,
5754 gimple_stmt_iterator *gsi,
5755 stmt_vec_info *vec_stmt, slp_tree slp_node,
5756 gimple *reduc_def_stmt,
5757 tree_code code, internal_fn reduc_fn,
5758 tree ops[3], tree vectype_in,
5759 int reduc_index, vec_loop_masks *masks)
5761 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
5762 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
5763 tree vectype_out = STMT_VINFO_VECTYPE (stmt_info);
5764 stmt_vec_info new_stmt_info = NULL;
5766 int ncopies;
5767 if (slp_node)
5768 ncopies = 1;
5769 else
5770 ncopies = vect_get_num_copies (loop_vinfo, vectype_in);
5772 gcc_assert (!nested_in_vect_loop_p (loop, stmt_info));
5773 gcc_assert (ncopies == 1);
5774 gcc_assert (TREE_CODE_LENGTH (code) == binary_op);
5775 gcc_assert (reduc_index == (code == MINUS_EXPR ? 0 : 1));
5776 gcc_assert (STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info)
5777 == FOLD_LEFT_REDUCTION);
5779 if (slp_node)
5780 gcc_assert (known_eq (TYPE_VECTOR_SUBPARTS (vectype_out),
5781 TYPE_VECTOR_SUBPARTS (vectype_in)));
5783 tree op0 = ops[1 - reduc_index];
5785 int group_size = 1;
5786 stmt_vec_info scalar_dest_def_info;
5787 auto_vec<tree> vec_oprnds0;
5788 if (slp_node)
5790 vect_get_vec_defs (op0, NULL_TREE, stmt_info, &vec_oprnds0, NULL,
5791 slp_node);
5792 group_size = SLP_TREE_SCALAR_STMTS (slp_node).length ();
5793 scalar_dest_def_info = SLP_TREE_SCALAR_STMTS (slp_node)[group_size - 1];
5795 else
5797 tree loop_vec_def0 = vect_get_vec_def_for_operand (op0, stmt_info);
5798 vec_oprnds0.create (1);
5799 vec_oprnds0.quick_push (loop_vec_def0);
5800 scalar_dest_def_info = stmt_info;
5803 tree scalar_dest = gimple_assign_lhs (scalar_dest_def_info->stmt);
5804 tree scalar_type = TREE_TYPE (scalar_dest);
5805 tree reduc_var = gimple_phi_result (reduc_def_stmt);
5807 int vec_num = vec_oprnds0.length ();
5808 gcc_assert (vec_num == 1 || slp_node);
5809 tree vec_elem_type = TREE_TYPE (vectype_out);
5810 gcc_checking_assert (useless_type_conversion_p (scalar_type, vec_elem_type));
5812 tree vector_identity = NULL_TREE;
5813 if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
5814 vector_identity = build_zero_cst (vectype_out);
5816 tree scalar_dest_var = vect_create_destination_var (scalar_dest, NULL);
5817 int i;
5818 tree def0;
5819 FOR_EACH_VEC_ELT (vec_oprnds0, i, def0)
5821 gimple *new_stmt;
5822 tree mask = NULL_TREE;
5823 if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
5824 mask = vect_get_loop_mask (gsi, masks, vec_num, vectype_in, i);
5826 /* Handle MINUS by adding the negative. */
5827 if (reduc_fn != IFN_LAST && code == MINUS_EXPR)
5829 tree negated = make_ssa_name (vectype_out);
5830 new_stmt = gimple_build_assign (negated, NEGATE_EXPR, def0);
5831 gsi_insert_before (gsi, new_stmt, GSI_SAME_STMT);
5832 def0 = negated;
5835 if (mask)
5836 def0 = merge_with_identity (gsi, mask, vectype_out, def0,
5837 vector_identity);
5839 /* On the first iteration the input is simply the scalar phi
5840 result, and for subsequent iterations it is the output of
5841 the preceding operation. */
5842 if (reduc_fn != IFN_LAST)
5844 new_stmt = gimple_build_call_internal (reduc_fn, 2, reduc_var, def0);
5845 /* For chained SLP reductions the output of the previous reduction
5846 operation serves as the input of the next. For the final statement
5847 the output cannot be a temporary - we reuse the original
5848 scalar destination of the last statement. */
5849 if (i != vec_num - 1)
5851 gimple_set_lhs (new_stmt, scalar_dest_var);
5852 reduc_var = make_ssa_name (scalar_dest_var, new_stmt);
5853 gimple_set_lhs (new_stmt, reduc_var);
5856 else
5858 reduc_var = vect_expand_fold_left (gsi, scalar_dest_var, code,
5859 reduc_var, def0);
5860 new_stmt = SSA_NAME_DEF_STMT (reduc_var);
5861 /* Remove the statement, so that we can use the same code paths
5862 as for statements that we've just created. */
5863 gimple_stmt_iterator tmp_gsi = gsi_for_stmt (new_stmt);
5864 gsi_remove (&tmp_gsi, false);
5867 if (i == vec_num - 1)
5869 gimple_set_lhs (new_stmt, scalar_dest);
5870 new_stmt_info = vect_finish_replace_stmt (scalar_dest_def_info,
5871 new_stmt);
5873 else
5874 new_stmt_info = vect_finish_stmt_generation (scalar_dest_def_info,
5875 new_stmt, gsi);
5877 if (slp_node)
5878 SLP_TREE_VEC_STMTS (slp_node).quick_push (new_stmt_info);
5881 if (!slp_node)
5882 STMT_VINFO_VEC_STMT (stmt_info) = *vec_stmt = new_stmt_info;
5884 return true;
5887 /* Function is_nonwrapping_integer_induction.
5889 Check whether the induction described by STMT_VINFO (which is part of
5890 loop LOOP) both increments and does not cause overflow. */
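/* For example (illustrative numbers): a 32-bit induction with base 0,
   step 4 and at most 1000 iterations reaches at most 0 + 4 * 1000 = 4000,
   which needs far fewer than 32 bits of precision, so the induction is
   known not to wrap.  */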
5892 static bool
5893 is_nonwrapping_integer_induction (stmt_vec_info stmt_vinfo, struct loop *loop)
5895 gphi *phi = as_a <gphi *> (stmt_vinfo->stmt);
5896 tree base = STMT_VINFO_LOOP_PHI_EVOLUTION_BASE_UNCHANGED (stmt_vinfo);
5897 tree step = STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_vinfo);
5898 tree lhs_type = TREE_TYPE (gimple_phi_result (phi));
5899 widest_int ni, max_loop_value, lhs_max;
5900 wi::overflow_type overflow = wi::OVF_NONE;
5902 /* Make sure the loop is integer based. */
5903 if (TREE_CODE (base) != INTEGER_CST
5904 || TREE_CODE (step) != INTEGER_CST)
5905 return false;
5907 /* Check that the max size of the loop will not wrap. */
5909 if (TYPE_OVERFLOW_UNDEFINED (lhs_type))
5910 return true;
5912 if (! max_stmt_executions (loop, &ni))
5913 return false;
5915 max_loop_value = wi::mul (wi::to_widest (step), ni, TYPE_SIGN (lhs_type),
5916 &overflow);
5917 if (overflow)
5918 return false;
5920 max_loop_value = wi::add (wi::to_widest (base), max_loop_value,
5921 TYPE_SIGN (lhs_type), &overflow);
5922 if (overflow)
5923 return false;
5925 return (wi::min_precision (max_loop_value, TYPE_SIGN (lhs_type))
5926 <= TYPE_PRECISION (lhs_type));
5929 /* Function vectorizable_reduction.
5931 Check if STMT_INFO performs a reduction operation that can be vectorized.
5932 If VEC_STMT is also passed, vectorize STMT_INFO: create a vectorized
5933 stmt to replace it, put it in VEC_STMT, and insert it at GSI.
5934 Return true if STMT_INFO is vectorizable in this way.
5936 This function also handles reduction idioms (patterns) that have been
5937 recognized in advance during vect_pattern_recog. In this case, STMT_INFO
5938 may be of this form:
5939 X = pattern_expr (arg0, arg1, ..., X)
5940 and its STMT_VINFO_RELATED_STMT points to the last stmt in the original
5941 sequence that had been detected and replaced by the pattern-stmt
5942 (STMT_INFO).
5944 This function also handles reduction of condition expressions, for example:
5945 for (int i = 0; i < N; i++)
5946 if (a[i] < value)
5947 last = a[i];
5948 This is handled by vectorising the loop and creating an additional vector
5949 containing the loop indexes for which "a[i] < value" was true. In the
5950 function epilogue this is reduced to a single max value and then used to
5951 index into the vector of results.
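   Conceptually (an illustrative scalar model, not the exact gimple that
   is generated):

     int best = 0;                     // 0 is reserved for "no match"
     for (int i = 0; i < N; i++)
       if (a[i] < value)
         best = i + 1;
     last = best ? a[best - 1] : last;

   where the per-lane values of BEST live in the additional index vector,
   their maximum is taken with a REDUC_MAX-style reduction, and the
   result selects the matching element of the data vector.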
5953 In some cases of reduction patterns, the type of the reduction variable X is
5954 different than the type of the other arguments of STMT_INFO.
5955 In such cases, the vectype that is used when transforming STMT_INFO into
5956 a vector stmt is different than the vectype that is used to determine the
5957 vectorization factor, because it consists of a different number of elements
5958 than the actual number of elements that are being operated upon in parallel.
5960 For example, consider an accumulation of shorts into an int accumulator.
5961 On some targets it's possible to vectorize this pattern operating on 8
5962 shorts at a time (hence, the vectype for purposes of determining the
5963 vectorization factor should be V8HI); on the other hand, the vectype that
5964 is used to create the vector form is actually V4SI (the type of the result).
5966 Upon entry to this function, STMT_VINFO_VECTYPE records the vectype that
5967 indicates the actual level of parallelism (V8HI in the example), so
5968 that the right vectorization factor is derived. This vectype
5969 corresponds to the type of arguments to the reduction stmt, and should *NOT*
5970 be used to create the vectorized stmt. The right vectype for the vectorized
5971 stmt is obtained from the type of the result X:
5972 get_vectype_for_scalar_type (TREE_TYPE (X))
5974 This means that, contrary to "regular" reductions (or "regular" stmts in
5975 general), the following equation:
5976 STMT_VINFO_VECTYPE == get_vectype_for_scalar_type (TREE_TYPE (X))
5977 does *NOT* necessarily hold for reduction patterns. */
5979 bool
5980 vectorizable_reduction (stmt_vec_info stmt_info, gimple_stmt_iterator *gsi,
5981 stmt_vec_info *vec_stmt, slp_tree slp_node,
5982 slp_instance slp_node_instance,
5983 stmt_vector_for_cost *cost_vec)
5985 tree vec_dest;
5986 tree scalar_dest;
5987 tree vectype_out = STMT_VINFO_VECTYPE (stmt_info);
5988 tree vectype_in = NULL_TREE;
5989 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
5990 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
5991 enum tree_code code, orig_code;
5992 internal_fn reduc_fn;
5993 machine_mode vec_mode;
5994 int op_type;
5995 optab optab;
5996 tree new_temp = NULL_TREE;
5997 enum vect_def_type dt, cond_reduc_dt = vect_unknown_def_type;
5998 stmt_vec_info cond_stmt_vinfo = NULL;
5999 enum tree_code cond_reduc_op_code = ERROR_MARK;
6000 tree scalar_type;
6001 bool is_simple_use;
6002 int i;
6003 int ncopies;
6004 int epilog_copies;
6005 stmt_vec_info prev_stmt_info, prev_phi_info;
6006 bool single_defuse_cycle = false;
6007 stmt_vec_info new_stmt_info = NULL;
6008 int j;
6009 tree ops[3];
6010 enum vect_def_type dts[3];
6011 bool nested_cycle = false, found_nested_cycle_def = false;
6012 bool double_reduc = false;
6013 basic_block def_bb;
6014 struct loop * def_stmt_loop;
6015 tree def_arg;
6016 auto_vec<tree> vec_oprnds0;
6017 auto_vec<tree> vec_oprnds1;
6018 auto_vec<tree> vec_oprnds2;
6019 auto_vec<tree> vect_defs;
6020 auto_vec<stmt_vec_info> phis;
6021 int vec_num;
6022 tree def0, tem;
6023 tree cr_index_scalar_type = NULL_TREE, cr_index_vector_type = NULL_TREE;
6024 tree cond_reduc_val = NULL_TREE;
6026 /* Make sure it was already recognized as a reduction computation. */
6027 if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_reduction_def
6028 && STMT_VINFO_DEF_TYPE (stmt_info) != vect_nested_cycle)
6029 return false;
6031 if (nested_in_vect_loop_p (loop, stmt_info))
6033 loop = loop->inner;
6034 nested_cycle = true;
6037 if (REDUC_GROUP_FIRST_ELEMENT (stmt_info))
6038 gcc_assert (slp_node
6039 && REDUC_GROUP_FIRST_ELEMENT (stmt_info) == stmt_info);
6041 if (gphi *phi = dyn_cast <gphi *> (stmt_info->stmt))
6043 tree phi_result = gimple_phi_result (phi);
6044 /* Analysis is fully done on the reduction stmt invocation. */
6045 if (! vec_stmt)
6047 if (slp_node)
6048 slp_node_instance->reduc_phis = slp_node;
6050 STMT_VINFO_TYPE (stmt_info) = reduc_vec_info_type;
6051 return true;
6054 if (STMT_VINFO_REDUC_TYPE (stmt_info) == FOLD_LEFT_REDUCTION)
6055 /* Leave the scalar phi in place. Note that checking
6056 STMT_VINFO_VEC_REDUCTION_TYPE (as below) only works
6057 for reductions involving a single statement. */
6058 return true;
6060 stmt_vec_info reduc_stmt_info = STMT_VINFO_REDUC_DEF (stmt_info);
6061 reduc_stmt_info = vect_stmt_to_vectorize (reduc_stmt_info);
6063 if (STMT_VINFO_VEC_REDUCTION_TYPE (reduc_stmt_info)
6064 == EXTRACT_LAST_REDUCTION)
6065 /* Leave the scalar phi in place. */
6066 return true;
6068 gassign *reduc_stmt = as_a <gassign *> (reduc_stmt_info->stmt);
6069 code = gimple_assign_rhs_code (reduc_stmt);
6070 for (unsigned k = 1; k < gimple_num_ops (reduc_stmt); ++k)
6072 tree op = gimple_op (reduc_stmt, k);
6073 if (op == phi_result)
6074 continue;
6075 if (k == 1 && code == COND_EXPR)
6076 continue;
6077 bool is_simple_use = vect_is_simple_use (op, loop_vinfo, &dt);
6078 gcc_assert (is_simple_use);
6079 if (dt == vect_constant_def || dt == vect_external_def)
6080 continue;
6081 if (!vectype_in
6082 || (GET_MODE_SIZE (SCALAR_TYPE_MODE (TREE_TYPE (vectype_in)))
6083 < GET_MODE_SIZE (SCALAR_TYPE_MODE (TREE_TYPE (op)))))
6084 vectype_in = get_vectype_for_scalar_type (TREE_TYPE (op));
6085 break;
6087 /* For a nested cycle we might end up with an operation like
6088 phi_result * phi_result. */
6089 if (!vectype_in)
6090 vectype_in = STMT_VINFO_VECTYPE (stmt_info);
6091 gcc_assert (vectype_in);
6093 if (slp_node)
6094 ncopies = 1;
6095 else
6096 ncopies = vect_get_num_copies (loop_vinfo, vectype_in);
6098 stmt_vec_info use_stmt_info;
6099 if (ncopies > 1
6100 && STMT_VINFO_RELEVANT (reduc_stmt_info) <= vect_used_only_live
6101 && (use_stmt_info = loop_vinfo->lookup_single_use (phi_result))
6102 && vect_stmt_to_vectorize (use_stmt_info) == reduc_stmt_info)
6103 single_defuse_cycle = true;
6105 /* Create the destination vector */
6106 scalar_dest = gimple_assign_lhs (reduc_stmt);
6107 vec_dest = vect_create_destination_var (scalar_dest, vectype_out);
6109 if (slp_node)
6110 /* The size vect_schedule_slp_instance computes is off for us. */
6111 vec_num = vect_get_num_vectors
6112 (LOOP_VINFO_VECT_FACTOR (loop_vinfo)
6113 * SLP_TREE_SCALAR_STMTS (slp_node).length (),
6114 vectype_in);
6115 else
6116 vec_num = 1;
6118 /* Generate the reduction PHIs upfront. */
6119 prev_phi_info = NULL;
6120 for (j = 0; j < ncopies; j++)
6122 if (j == 0 || !single_defuse_cycle)
6124 for (i = 0; i < vec_num; i++)
6126 /* Create the reduction-phi that defines the reduction
6127 operand. */
6128 gimple *new_phi = create_phi_node (vec_dest, loop->header);
6129 stmt_vec_info new_phi_info = loop_vinfo->add_stmt (new_phi);
6131 if (slp_node)
6132 SLP_TREE_VEC_STMTS (slp_node).quick_push (new_phi_info);
6133 else
6135 if (j == 0)
6136 STMT_VINFO_VEC_STMT (stmt_info)
6137 = *vec_stmt = new_phi_info;
6138 else
6139 STMT_VINFO_RELATED_STMT (prev_phi_info) = new_phi_info;
6140 prev_phi_info = new_phi_info;
6146 return true;
6149 /* 1. Is vectorizable reduction? */
6150 /* Not supportable if the reduction variable is used in the loop, unless
6151 it's a reduction chain. */
6152 if (STMT_VINFO_RELEVANT (stmt_info) > vect_used_in_outer
6153 && !REDUC_GROUP_FIRST_ELEMENT (stmt_info))
6154 return false;
6156 /* Reductions that are not used even in an enclosing outer-loop,
6157 are expected to be "live" (used out of the loop). */
6158 if (STMT_VINFO_RELEVANT (stmt_info) == vect_unused_in_scope
6159 && !STMT_VINFO_LIVE_P (stmt_info))
6160 return false;
6162 /* 2. Has this been recognized as a reduction pattern?
6164 Check if STMT represents a pattern that has been recognized
6165 in earlier analysis stages. For stmts that represent a pattern,
6166 the STMT_VINFO_RELATED_STMT field records the last stmt in
6167 the original sequence that constitutes the pattern. */
6169 stmt_vec_info orig_stmt_info = STMT_VINFO_RELATED_STMT (stmt_info);
6170 if (orig_stmt_info)
6172 gcc_assert (STMT_VINFO_IN_PATTERN_P (orig_stmt_info));
6173 gcc_assert (!STMT_VINFO_IN_PATTERN_P (stmt_info));
6176 /* 3. Check the operands of the operation. The first operands are defined
6177 inside the loop body. The last operand is the reduction variable,
6178 which is defined by the loop-header-phi. */
6180 gassign *stmt = as_a <gassign *> (stmt_info->stmt);
6182 /* Flatten RHS. */
6183 switch (get_gimple_rhs_class (gimple_assign_rhs_code (stmt)))
6185 case GIMPLE_BINARY_RHS:
6186 code = gimple_assign_rhs_code (stmt);
6187 op_type = TREE_CODE_LENGTH (code);
6188 gcc_assert (op_type == binary_op);
6189 ops[0] = gimple_assign_rhs1 (stmt);
6190 ops[1] = gimple_assign_rhs2 (stmt);
6191 break;
6193 case GIMPLE_TERNARY_RHS:
6194 code = gimple_assign_rhs_code (stmt);
6195 op_type = TREE_CODE_LENGTH (code);
6196 gcc_assert (op_type == ternary_op);
6197 ops[0] = gimple_assign_rhs1 (stmt);
6198 ops[1] = gimple_assign_rhs2 (stmt);
6199 ops[2] = gimple_assign_rhs3 (stmt);
6200 break;
6202 case GIMPLE_UNARY_RHS:
6203 return false;
6205 default:
6206 gcc_unreachable ();
6209 if (code == COND_EXPR && slp_node)
6210 return false;
6212 scalar_dest = gimple_assign_lhs (stmt);
6213 scalar_type = TREE_TYPE (scalar_dest);
6214 if (!POINTER_TYPE_P (scalar_type) && !INTEGRAL_TYPE_P (scalar_type)
6215 && !SCALAR_FLOAT_TYPE_P (scalar_type))
6216 return false;
6218 /* Do not try to vectorize bit-precision reductions. */
6219 if (!type_has_mode_precision_p (scalar_type))
6220 return false;
6222 /* All uses but the last are expected to be defined in the loop.
6223 The last use is the reduction variable. In case of nested cycle this
6224 assumption is not true: we use reduc_index to record the index of the
6225 reduction variable. */
6226 stmt_vec_info reduc_def_info = NULL;
6227 int reduc_index = -1;
6228 for (i = 0; i < op_type; i++)
6230 /* The condition of COND_EXPR is checked in vectorizable_condition(). */
6231 if (i == 0 && code == COND_EXPR)
6232 continue;
6234 stmt_vec_info def_stmt_info;
6235 is_simple_use = vect_is_simple_use (ops[i], loop_vinfo, &dts[i], &tem,
6236 &def_stmt_info);
6237 dt = dts[i];
6238 gcc_assert (is_simple_use);
6239 if (dt == vect_reduction_def)
6241 reduc_def_info = def_stmt_info;
6242 reduc_index = i;
6243 continue;
6245 else if (tem)
6247 /* To properly compute ncopies we are interested in the widest
6248 input type in case we're looking at a widening accumulation. */
6249 if (!vectype_in
6250 || (GET_MODE_SIZE (SCALAR_TYPE_MODE (TREE_TYPE (vectype_in)))
6251 < GET_MODE_SIZE (SCALAR_TYPE_MODE (TREE_TYPE (tem)))))
6252 vectype_in = tem;
6255 if (dt != vect_internal_def
6256 && dt != vect_external_def
6257 && dt != vect_constant_def
6258 && dt != vect_induction_def
6259 && !(dt == vect_nested_cycle && nested_cycle))
6260 return false;
6262 if (dt == vect_nested_cycle)
6264 found_nested_cycle_def = true;
6265 reduc_def_info = def_stmt_info;
6266 reduc_index = i;
6269 if (i == 1 && code == COND_EXPR)
6271 /* Record how value of COND_EXPR is defined. */
6272 if (dt == vect_constant_def)
6274 cond_reduc_dt = dt;
6275 cond_reduc_val = ops[i];
6277 if (dt == vect_induction_def
6278 && def_stmt_info
6279 && is_nonwrapping_integer_induction (def_stmt_info, loop))
6281 cond_reduc_dt = dt;
6282 cond_stmt_vinfo = def_stmt_info;
6287 if (!vectype_in)
6288 vectype_in = vectype_out;
6290 /* When vectorizing a reduction chain w/o SLP the reduction PHI is not
6291 directly used in the stmt. */
6292 if (reduc_index == -1)
6294 if (STMT_VINFO_REDUC_TYPE (stmt_info) == FOLD_LEFT_REDUCTION)
6296 if (dump_enabled_p ())
6297 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6298 "in-order reduction chain without SLP.\n");
6299 return false;
6302 if (orig_stmt_info)
6303 reduc_def_info = STMT_VINFO_REDUC_DEF (orig_stmt_info);
6304 else
6305 reduc_def_info = STMT_VINFO_REDUC_DEF (stmt_info);
6308 if (! reduc_def_info)
6309 return false;
6311 gphi *reduc_def_phi = dyn_cast <gphi *> (reduc_def_info->stmt);
6312 if (!reduc_def_phi)
6313 return false;
6315 if (!(reduc_index == -1
6316 || dts[reduc_index] == vect_reduction_def
6317 || dts[reduc_index] == vect_nested_cycle
6318 || ((dts[reduc_index] == vect_internal_def
6319 || dts[reduc_index] == vect_external_def
6320 || dts[reduc_index] == vect_constant_def
6321 || dts[reduc_index] == vect_induction_def)
6322 && nested_cycle && found_nested_cycle_def)))
6324 /* For pattern recognized stmts, orig_stmt might be a reduction,
6325 but some helper statements for the pattern might not, or
6326 might be COND_EXPRs with reduction uses in the condition. */
6327 gcc_assert (orig_stmt_info);
6328 return false;
6331 /* PHIs should not participate in patterns. */
6332 gcc_assert (!STMT_VINFO_RELATED_STMT (reduc_def_info));
6333 enum vect_reduction_type v_reduc_type
6334 = STMT_VINFO_REDUC_TYPE (reduc_def_info);
6335 stmt_vec_info tmp = STMT_VINFO_REDUC_DEF (reduc_def_info);
6337 STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info) = v_reduc_type;
6338 /* If we have a condition reduction, see if we can simplify it further. */
6339 if (v_reduc_type == COND_REDUCTION)
6341 /* TODO: We can't yet handle reduction chains, since we need to treat
6342 each COND_EXPR in the chain specially, not just the last one.
6343 E.g. for:
6345 x_1 = PHI <x_3, ...>
6346 x_2 = a_2 ? ... : x_1;
6347 x_3 = a_3 ? ... : x_2;
6349 we're interested in the last element in x_3 for which a_2 || a_3
6350 is true, whereas the current reduction chain handling would
6351 vectorize x_2 as a normal VEC_COND_EXPR and only treat x_3
6352 as a reduction operation. */
6353 if (reduc_index == -1)
6355 if (dump_enabled_p ())
6356 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6357 "conditional reduction chains not supported\n");
6358 return false;
6361 /* vect_is_simple_reduction ensured that operand 2 is the
6362 loop-carried operand. */
6363 gcc_assert (reduc_index == 2);
6365 /* Loop peeling modifies the initial value of the reduction PHI, which
6366 makes the reduction stmt to be transformed differ from the
6367 original stmt that was analyzed. We need to record the reduction
6368 code for a CONST_COND_REDUCTION type reduction at the analysis stage,
6369 so that it can be used directly at the transform stage. */
6370 if (STMT_VINFO_VEC_CONST_COND_REDUC_CODE (stmt_info) == MAX_EXPR
6371 || STMT_VINFO_VEC_CONST_COND_REDUC_CODE (stmt_info) == MIN_EXPR)
6373 /* Also set the reduction type to CONST_COND_REDUCTION. */
6374 gcc_assert (cond_reduc_dt == vect_constant_def);
6375 STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info) = CONST_COND_REDUCTION;
6377 else if (direct_internal_fn_supported_p (IFN_FOLD_EXTRACT_LAST,
6378 vectype_in, OPTIMIZE_FOR_SPEED))
6380 if (dump_enabled_p ())
6381 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6382 "optimizing condition reduction with"
6383 " FOLD_EXTRACT_LAST.\n");
6384 STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info) = EXTRACT_LAST_REDUCTION;
6386 else if (cond_reduc_dt == vect_induction_def)
6388 tree base
6389 = STMT_VINFO_LOOP_PHI_EVOLUTION_BASE_UNCHANGED (cond_stmt_vinfo);
6390 tree step = STMT_VINFO_LOOP_PHI_EVOLUTION_PART (cond_stmt_vinfo);
6392 gcc_assert (TREE_CODE (base) == INTEGER_CST
6393 && TREE_CODE (step) == INTEGER_CST);
6394 cond_reduc_val = NULL_TREE;
6395 /* Find a suitable value: below BASE for MAX_EXPR, above BASE for
6396 MIN_EXPR; for now, punt if BASE is the minimum value of the type
6397 for MAX_EXPR or the maximum value of the type for MIN_EXPR. */
6398 if (tree_int_cst_sgn (step) == -1)
6400 cond_reduc_op_code = MIN_EXPR;
6401 if (tree_int_cst_sgn (base) == -1)
6402 cond_reduc_val = build_int_cst (TREE_TYPE (base), 0);
6403 else if (tree_int_cst_lt (base,
6404 TYPE_MAX_VALUE (TREE_TYPE (base))))
6405 cond_reduc_val
6406 = int_const_binop (PLUS_EXPR, base, integer_one_node);
6408 else
6410 cond_reduc_op_code = MAX_EXPR;
6411 if (tree_int_cst_sgn (base) == 1)
6412 cond_reduc_val = build_int_cst (TREE_TYPE (base), 0);
6413 else if (tree_int_cst_lt (TYPE_MIN_VALUE (TREE_TYPE (base)),
6414 base))
6415 cond_reduc_val
6416 = int_const_binop (MINUS_EXPR, base, integer_one_node);
6418 if (cond_reduc_val)
6420 if (dump_enabled_p ())
6421 dump_printf_loc (MSG_NOTE, vect_location,
6422 "condition expression based on "
6423 "integer induction.\n");
6424 STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info)
6425 = INTEGER_INDUC_COND_REDUCTION;
6428 else if (cond_reduc_dt == vect_constant_def)
6430 enum vect_def_type cond_initial_dt;
6431 gimple *def_stmt = SSA_NAME_DEF_STMT (ops[reduc_index]);
6432 tree cond_initial_val
6433 = PHI_ARG_DEF_FROM_EDGE (def_stmt, loop_preheader_edge (loop));
6435 gcc_assert (cond_reduc_val != NULL_TREE);
6436 vect_is_simple_use (cond_initial_val, loop_vinfo, &cond_initial_dt);
6437 if (cond_initial_dt == vect_constant_def
6438 && types_compatible_p (TREE_TYPE (cond_initial_val),
6439 TREE_TYPE (cond_reduc_val)))
6441 tree e = fold_binary (LE_EXPR, boolean_type_node,
6442 cond_initial_val, cond_reduc_val);
6443 if (e && (integer_onep (e) || integer_zerop (e)))
6445 if (dump_enabled_p ())
6446 dump_printf_loc (MSG_NOTE, vect_location,
6447 "condition expression based on "
6448 "compile time constant.\n");
6449 /* Record reduction code at analysis stage. */
6450 STMT_VINFO_VEC_CONST_COND_REDUC_CODE (stmt_info)
6451 = integer_onep (e) ? MAX_EXPR : MIN_EXPR;
6452 STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info)
6453 = CONST_COND_REDUCTION;
6459 if (orig_stmt_info)
6460 gcc_assert (tmp == orig_stmt_info
6461 || REDUC_GROUP_FIRST_ELEMENT (tmp) == orig_stmt_info);
6462 else
6463 /* We changed STMT to be the first stmt in reduction chain, hence we
6464 check that in this case the first element in the chain is STMT. */
6465 gcc_assert (tmp == stmt_info
6466 || REDUC_GROUP_FIRST_ELEMENT (tmp) == stmt_info);
6468 if (STMT_VINFO_LIVE_P (reduc_def_info))
6469 return false;
6471 if (slp_node)
6472 ncopies = 1;
6473 else
6474 ncopies = vect_get_num_copies (loop_vinfo, vectype_in);
6476 gcc_assert (ncopies >= 1);
6478 vec_mode = TYPE_MODE (vectype_in);
6479 poly_uint64 nunits_out = TYPE_VECTOR_SUBPARTS (vectype_out);
6481 if (nested_cycle)
6483 def_bb = gimple_bb (reduc_def_phi);
6484 def_stmt_loop = def_bb->loop_father;
6485 def_arg = PHI_ARG_DEF_FROM_EDGE (reduc_def_phi,
6486 loop_preheader_edge (def_stmt_loop));
6487 stmt_vec_info def_arg_stmt_info = loop_vinfo->lookup_def (def_arg);
6488 if (def_arg_stmt_info
6489 && (STMT_VINFO_DEF_TYPE (def_arg_stmt_info)
6490 == vect_double_reduction_def))
6491 double_reduc = true;
6494 if (code == COND_EXPR)
6496 /* Only call during the analysis stage, otherwise we'll lose
6497 STMT_VINFO_TYPE. We'll pass ops[0] as reduc_op, it's only
6498 used as a flag during analysis. */
6499 if (!vec_stmt && !vectorizable_condition (stmt_info, gsi, NULL,
6500 ops[0], 0, NULL,
6501 cost_vec))
6503 if (dump_enabled_p ())
6504 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6505 "unsupported condition in reduction\n");
6506 return false;
6509 else if (code == LSHIFT_EXPR || code == RSHIFT_EXPR
6510 || code == LROTATE_EXPR || code == RROTATE_EXPR)
6512 /* Only call during the analysis stage, otherwise we'll lose
6513 STMT_VINFO_TYPE. We only support this for nested cycles
6514 without double reductions at the moment. */
6515 if (!nested_cycle
6516 || double_reduc
6517 || (!vec_stmt && !vectorizable_shift (stmt_info, gsi, NULL,
6518 NULL, cost_vec)))
6520 if (dump_enabled_p ())
6521 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6522 "unsupported shift or rotation in reduction\n");
6523 return false;
6526 else
6528 /* 4. Supportable by target? */
6530 /* 4.1. check support for the operation in the loop */
6531 optab = optab_for_tree_code (code, vectype_in, optab_default);
6532 if (!optab)
6534 if (dump_enabled_p ())
6535 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6536 "no optab.\n");
6538 return false;
6541 if (optab_handler (optab, vec_mode) == CODE_FOR_nothing)
6543 if (dump_enabled_p ())
6544 dump_printf (MSG_NOTE, "op not supported by target.\n");
6546 if (maybe_ne (GET_MODE_SIZE (vec_mode), UNITS_PER_WORD)
6547 || !vect_worthwhile_without_simd_p (loop_vinfo, code))
6548 return false;
6550 if (dump_enabled_p ())
6551 dump_printf (MSG_NOTE, "proceeding using word mode.\n");
6554 /* Worthwhile without SIMD support? */
6555 if (!VECTOR_MODE_P (TYPE_MODE (vectype_in))
6556 && !vect_worthwhile_without_simd_p (loop_vinfo, code))
6558 if (dump_enabled_p ())
6559 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6560 "not worthwhile without SIMD support.\n");
6562 return false;
6566 /* 4.2. Check support for the epilog operation.
6568 If STMT represents a reduction pattern, then the type of the
6569 reduction variable may be different than the type of the rest
6570 of the arguments. For example, consider the case of accumulation
6571 of shorts into an int accumulator; The original code:
6572 S1: int_a = (int) short_a;
6573 orig_stmt-> S2: int_acc = plus <int_a ,int_acc>;
6575 was replaced with:
6576 STMT: int_acc = widen_sum <short_a, int_acc>
6578 This means that:
6579 1. The tree-code that is used to create the vector operation in the
6580 epilog code (that reduces the partial results) is not the
6581 tree-code of STMT, but is rather the tree-code of the original
6582 stmt from the pattern that STMT is replacing. I.e, in the example
6583 above we want to use 'widen_sum' in the loop, but 'plus' in the
6584 epilog.
6585 2. The type (mode) we use to check available target support
6586 for the vector operation to be created in the *epilog*, is
6587 determined by the type of the reduction variable (in the example
6588 above we'd check this: optab_handler (plus_optab, vect_int_mode])).
6589 However the type (mode) we use to check available target support
6590 for the vector operation to be created *inside the loop*, is
6591 determined by the type of the other arguments to STMT (in the
6592 example we'd check this: optab_handler (widen_sum_optab,
6593 vect_short_mode)).
6595 This is contrary to "regular" reductions, in which the types of all
6596 the arguments are the same as the type of the reduction variable.
6597 For "regular" reductions we can therefore use the same vector type
6598 (and also the same tree-code) when generating the epilog code and
6599 when generating the code inside the loop. */
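/* Source-level example of the situation described above (illustrative,
   with assumed types):

     short a[N];
     int acc = 0;
     for (int i = 0; i < N; i++)
       acc += a[i];                  // S2: int_acc = plus <int_a, int_acc>

   Inside the loop the operation becomes a widening sum over the short
   vector type, while the epilog that folds the int partial sums is a
   plain PLUS checked against the int vector mode.  */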
6601 vect_reduction_type reduction_type
6602 = STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info);
6603 if (orig_stmt_info
6604 && (reduction_type == TREE_CODE_REDUCTION
6605 || reduction_type == FOLD_LEFT_REDUCTION))
6607 /* This is a reduction pattern: get the vectype from the type of the
6608 reduction variable, and get the tree-code from orig_stmt. */
6609 orig_code = gimple_assign_rhs_code (orig_stmt_info->stmt);
6610 gcc_assert (vectype_out);
6611 vec_mode = TYPE_MODE (vectype_out);
6613 else
6615 /* Regular reduction: the same vectype and tree-code that are used for
6616 the vector code inside the loop can also be used for the epilog code. */
6617 orig_code = code;
6619 if (code == MINUS_EXPR)
6620 orig_code = PLUS_EXPR;
6622 /* For simple condition reductions, replace with the actual expression
6623 we want to base our reduction around. */
6624 if (reduction_type == CONST_COND_REDUCTION)
6626 orig_code = STMT_VINFO_VEC_CONST_COND_REDUC_CODE (stmt_info);
6627 gcc_assert (orig_code == MAX_EXPR || orig_code == MIN_EXPR);
6629 else if (reduction_type == INTEGER_INDUC_COND_REDUCTION)
6630 orig_code = cond_reduc_op_code;
6633 reduc_fn = IFN_LAST;
6635 if (reduction_type == TREE_CODE_REDUCTION
6636 || reduction_type == FOLD_LEFT_REDUCTION
6637 || reduction_type == INTEGER_INDUC_COND_REDUCTION
6638 || reduction_type == CONST_COND_REDUCTION)
6640 if (reduction_type == FOLD_LEFT_REDUCTION
6641 ? fold_left_reduction_fn (orig_code, &reduc_fn)
6642 : reduction_fn_for_scalar_code (orig_code, &reduc_fn))
6644 if (reduc_fn != IFN_LAST
6645 && !direct_internal_fn_supported_p (reduc_fn, vectype_out,
6646 OPTIMIZE_FOR_SPEED))
6648 if (dump_enabled_p ())
6649 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6650 "reduc op not supported by target.\n");
6652 reduc_fn = IFN_LAST;
6655 else
6657 if (!nested_cycle || double_reduc)
6659 if (dump_enabled_p ())
6660 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6661 "no reduc code for scalar code.\n");
6663 return false;
6667 else if (reduction_type == COND_REDUCTION)
6669 int scalar_precision
6670 = GET_MODE_PRECISION (SCALAR_TYPE_MODE (scalar_type));
6671 cr_index_scalar_type = make_unsigned_type (scalar_precision);
6672 cr_index_vector_type = build_vector_type (cr_index_scalar_type,
6673 nunits_out);
6675 if (direct_internal_fn_supported_p (IFN_REDUC_MAX, cr_index_vector_type,
6676 OPTIMIZE_FOR_SPEED))
6677 reduc_fn = IFN_REDUC_MAX;
6680 if (reduction_type != EXTRACT_LAST_REDUCTION
6681 && (!nested_cycle || double_reduc)
6682 && reduc_fn == IFN_LAST
6683 && !nunits_out.is_constant ())
6685 if (dump_enabled_p ())
6686 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6687 "missing target support for reduction on"
6688 " variable-length vectors.\n");
6689 return false;
6692 if ((double_reduc || reduction_type != TREE_CODE_REDUCTION)
6693 && ncopies > 1)
6695 if (dump_enabled_p ())
6696 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6697 "multiple types in double reduction or condition "
6698 "reduction.\n");
6699 return false;
6702 /* For SLP reductions, see if there is a neutral value we can use. */
6703 tree neutral_op = NULL_TREE;
6704 if (slp_node)
6705 neutral_op = neutral_op_for_slp_reduction
6706 (slp_node_instance->reduc_phis, code,
6707 REDUC_GROUP_FIRST_ELEMENT (stmt_info) != NULL);
6709 if (double_reduc && reduction_type == FOLD_LEFT_REDUCTION)
6711 /* We can't support in-order reductions of code such as this:
6713 for (int i = 0; i < n1; ++i)
6714 for (int j = 0; j < n2; ++j)
6715 l += a[j];
6717 since GCC effectively transforms the loop when vectorizing:
6719 for (int i = 0; i < n1 / VF; ++i)
6720 for (int j = 0; j < n2; ++j)
6721 for (int k = 0; k < VF; ++k)
6722 l += a[j];
6724 which is a reassociation of the original operation. */
6725 if (dump_enabled_p ())
6726 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6727 "in-order double reduction not supported.\n");
6729 return false;
6732 if (reduction_type == FOLD_LEFT_REDUCTION
6733 && slp_node
6734 && !REDUC_GROUP_FIRST_ELEMENT (stmt_info))
6736 /* We cannot use in-order reductions in this case because there is
6737 an implicit reassociation of the operations involved. */
6738 if (dump_enabled_p ())
6739 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6740 "in-order unchained SLP reductions not supported.\n");
6741 return false;
6744 /* For double reductions, and for SLP reductions with a neutral value,
6745 we construct a variable-length initial vector by loading a vector
6746 full of the neutral value and then shift-and-inserting the start
6747 values into the low-numbered elements. */
6748 if ((double_reduc || neutral_op)
6749 && !nunits_out.is_constant ()
6750 && !direct_internal_fn_supported_p (IFN_VEC_SHL_INSERT,
6751 vectype_out, OPTIMIZE_FOR_SPEED))
6753 if (dump_enabled_p ())
6754 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6755 "reduction on variable-length vectors requires"
6756 " target support for a vector-shift-and-insert"
6757 " operation.\n");
6758 return false;
6761 /* Check extra constraints for variable-length unchained SLP reductions. */
6762 if (STMT_SLP_TYPE (stmt_info)
6763 && !REDUC_GROUP_FIRST_ELEMENT (stmt_info)
6764 && !nunits_out.is_constant ())
6766 /* We checked above that we could build the initial vector when
6767 there's a neutral element value. Check here for the case in
6768 which each SLP statement has its own initial value and in which
6769 that value needs to be repeated for every instance of the
6770 statement within the initial vector. */
6771 unsigned int group_size = SLP_TREE_SCALAR_STMTS (slp_node).length ();
6772 scalar_mode elt_mode = SCALAR_TYPE_MODE (TREE_TYPE (vectype_out));
6773 if (!neutral_op
6774 && !can_duplicate_and_interleave_p (group_size, elt_mode))
6776 if (dump_enabled_p ())
6777 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6778 "unsupported form of SLP reduction for"
6779 " variable-length vectors: cannot build"
6780 " initial vector.\n");
6781 return false;
6783 /* The epilogue code relies on the number of elements being a multiple
6784 of the group size. The duplicate-and-interleave approach to setting
6785 up the initial vector does too. */
6786 if (!multiple_p (nunits_out, group_size))
6788 if (dump_enabled_p ())
6789 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6790 "unsupported form of SLP reduction for"
6791 " variable-length vectors: the vector size"
6792 " is not a multiple of the number of results.\n");
6793 return false;
6797 /* In case of widening multiplication by a constant, we update the type
6798 of the constant to be the type of the other operand. We check that the
6799 constant fits the type in the pattern recognition pass. */
6800 if (code == DOT_PROD_EXPR
6801 && !types_compatible_p (TREE_TYPE (ops[0]), TREE_TYPE (ops[1])))
6803 if (TREE_CODE (ops[0]) == INTEGER_CST)
6804 ops[0] = fold_convert (TREE_TYPE (ops[1]), ops[0]);
6805 else if (TREE_CODE (ops[1]) == INTEGER_CST)
6806 ops[1] = fold_convert (TREE_TYPE (ops[0]), ops[1]);
6807 else
6809 if (dump_enabled_p ())
6810 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6811 "invalid types in dot-prod\n");
6813 return false;
6817 if (reduction_type == COND_REDUCTION)
6819 widest_int ni;
6821 if (! max_loop_iterations (loop, &ni))
6823 if (dump_enabled_p ())
6824 dump_printf_loc (MSG_NOTE, vect_location,
6825 "loop count not known, cannot create cond "
6826 "reduction.\n");
6827 return false;
6829 /* Convert backedges to iterations. */
6830 ni += 1;
6832 /* The additional index will be the same type as the condition. Check
6833 that the loop iteration count fits into this type less one (because
6834 we use up the zero slot for when there are no matches). */
6835 tree max_index = TYPE_MAX_VALUE (cr_index_scalar_type);
6836 if (wi::geu_p (ni, wi::to_widest (max_index)))
6838 if (dump_enabled_p ())
6839 dump_printf_loc (MSG_NOTE, vect_location,
6840 "loop size is greater than data size.\n");
6841 return false;
6845 /* In case the vectorization factor (VF) is bigger than the number
6846 of elements that we can fit in a vectype (nunits), we have to generate
6847 more than one vector stmt, i.e., we need to "unroll" the
6848 vector stmt by a factor VF/nunits. For more details see documentation
6849 in vectorizable_operation. */
6851 /* If the reduction is used in an outer loop we need to generate
6852 VF intermediate results, like so (e.g. for ncopies=2):
6853 r0 = phi (init, r0)
6854 r1 = phi (init, r1)
6855 r0 = x0 + r0;
6856 r1 = x1 + r1;
6857 (i.e. we generate VF results in 2 registers).
6858 In this case we have a separate def-use cycle for each copy, and therefore
6859 for each copy we get the vector def for the reduction variable from the
6860 respective phi node created for this copy.
6862 Otherwise (the reduction is unused in the loop nest), we can combine
6863 together intermediate results, like so (e.g. for ncopies=2):
6864 r = phi (init, r)
6865 r = x0 + r;
6866 r = x1 + r;
6867 (i.e. we generate VF/2 results in a single register).
6868 In this case for each copy we get the vector def for the reduction variable
6869 from the vectorized reduction operation generated in the previous iteration.
6871 This only works when we see both the reduction PHI and its only consumer
6872 in vectorizable_reduction and there are no intermediate stmts
6873 participating. */
6874 stmt_vec_info use_stmt_info;
6875 tree reduc_phi_result = gimple_phi_result (reduc_def_phi);
6876 if (ncopies > 1
6877 && (STMT_VINFO_RELEVANT (stmt_info) <= vect_used_only_live)
6878 && (use_stmt_info = loop_vinfo->lookup_single_use (reduc_phi_result))
6879 && vect_stmt_to_vectorize (use_stmt_info) == stmt_info)
6881 single_defuse_cycle = true;
6882 epilog_copies = 1;
6884 else
6885 epilog_copies = ncopies;
6887 /* If the reduction stmt is one of the patterns that have lane
6888 reduction embedded, we cannot handle the case of ! single_defuse_cycle. */
6889 if ((ncopies > 1
6890 && ! single_defuse_cycle)
6891 && (code == DOT_PROD_EXPR
6892 || code == WIDEN_SUM_EXPR
6893 || code == SAD_EXPR))
6895 if (dump_enabled_p ())
6896 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6897 "multi def-use cycle not possible for lane-reducing "
6898 "reduction operation\n");
6899 return false;
6902 if (slp_node)
6903 vec_num = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node);
6904 else
6905 vec_num = 1;
6907 internal_fn cond_fn = get_conditional_internal_fn (code);
6908 vec_loop_masks *masks = &LOOP_VINFO_MASKS (loop_vinfo);
6910 if (!vec_stmt) /* transformation not required. */
6912 vect_model_reduction_cost (stmt_info, reduc_fn, ncopies, cost_vec);
6913 if (loop_vinfo && LOOP_VINFO_CAN_FULLY_MASK_P (loop_vinfo))
6915 if (reduction_type != FOLD_LEFT_REDUCTION
6916 && (cond_fn == IFN_LAST
6917 || !direct_internal_fn_supported_p (cond_fn, vectype_in,
6918 OPTIMIZE_FOR_SPEED)))
6920 if (dump_enabled_p ())
6921 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6922 "can't use a fully-masked loop because no"
6923 " conditional operation is available.\n");
6924 LOOP_VINFO_CAN_FULLY_MASK_P (loop_vinfo) = false;
6926 else if (reduc_index == -1)
6928 if (dump_enabled_p ())
6929 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6930 "can't use a fully-masked loop for chained"
6931 " reductions.\n");
6932 LOOP_VINFO_CAN_FULLY_MASK_P (loop_vinfo) = false;
6934 else
6935 vect_record_loop_mask (loop_vinfo, masks, ncopies * vec_num,
6936 vectype_in);
6938 if (dump_enabled_p ()
6939 && reduction_type == FOLD_LEFT_REDUCTION)
6940 dump_printf_loc (MSG_NOTE, vect_location,
6941 "using an in-order (fold-left) reduction.\n");
6942 STMT_VINFO_TYPE (stmt_info) = reduc_vec_info_type;
6943 return true;
6946 /* Transform. */
6948 if (dump_enabled_p ())
6949 dump_printf_loc (MSG_NOTE, vect_location, "transform reduction.\n");
6951 /* FORNOW: Multiple types are not supported for condition. */
6952 if (code == COND_EXPR)
6953 gcc_assert (ncopies == 1);
6955 bool masked_loop_p = LOOP_VINFO_FULLY_MASKED_P (loop_vinfo);
6957 if (reduction_type == FOLD_LEFT_REDUCTION)
6958 return vectorize_fold_left_reduction
6959 (stmt_info, gsi, vec_stmt, slp_node, reduc_def_phi, code,
6960 reduc_fn, ops, vectype_in, reduc_index, masks);
6962 if (reduction_type == EXTRACT_LAST_REDUCTION)
6964 gcc_assert (!slp_node);
6965 return vectorizable_condition (stmt_info, gsi, vec_stmt,
6966 NULL, reduc_index, NULL, NULL);
6969 /* Create the destination vector */
6970 vec_dest = vect_create_destination_var (scalar_dest, vectype_out);
6972 prev_stmt_info = NULL;
6973 prev_phi_info = NULL;
6974 if (!slp_node)
6976 vec_oprnds0.create (1);
6977 vec_oprnds1.create (1);
6978 if (op_type == ternary_op)
6979 vec_oprnds2.create (1);
6982 phis.create (vec_num);
6983 vect_defs.create (vec_num);
6984 if (!slp_node)
6985 vect_defs.quick_push (NULL_TREE);
6987 if (slp_node)
6988 phis.splice (SLP_TREE_VEC_STMTS (slp_node_instance->reduc_phis));
6989 else
6990 phis.quick_push (STMT_VINFO_VEC_STMT (reduc_def_info));
6992 for (j = 0; j < ncopies; j++)
6994 if (code == COND_EXPR)
6996 gcc_assert (!slp_node);
6997 vectorizable_condition (stmt_info, gsi, vec_stmt,
6998 PHI_RESULT (phis[0]->stmt),
6999 reduc_index, NULL, NULL);
7000 /* Multiple types are not supported for condition. */
7001 break;
7003 if (code == LSHIFT_EXPR
7004 || code == RSHIFT_EXPR)
7006 vectorizable_shift (stmt_info, gsi, vec_stmt, slp_node, NULL);
7007 break;
7010 /* Handle uses. */
7011 if (j == 0)
7013 if (slp_node)
7015 /* Get vec defs for all the operands except the reduction index,
7016 ensuring the ordering of the ops in the vector is kept. */
7017 auto_vec<tree, 3> slp_ops;
7018 auto_vec<vec<tree>, 3> vec_defs;
7020 slp_ops.quick_push (ops[0]);
7021 slp_ops.quick_push (ops[1]);
7022 if (op_type == ternary_op)
7023 slp_ops.quick_push (ops[2]);
7025 vect_get_slp_defs (slp_ops, slp_node, &vec_defs);
7027 vec_oprnds0.safe_splice (vec_defs[0]);
7028 vec_defs[0].release ();
7029 vec_oprnds1.safe_splice (vec_defs[1]);
7030 vec_defs[1].release ();
7031 if (op_type == ternary_op)
7033 vec_oprnds2.safe_splice (vec_defs[2]);
7034 vec_defs[2].release ();
7037 else
7039 vec_oprnds0.quick_push
7040 (vect_get_vec_def_for_operand (ops[0], stmt_info));
7041 vec_oprnds1.quick_push
7042 (vect_get_vec_def_for_operand (ops[1], stmt_info));
7043 if (op_type == ternary_op)
7044 vec_oprnds2.quick_push
7045 (vect_get_vec_def_for_operand (ops[2], stmt_info));
7048 else
7050 if (!slp_node)
7052 gcc_assert (reduc_index != -1 || ! single_defuse_cycle);
7054 if (single_defuse_cycle && reduc_index == 0)
7055 vec_oprnds0[0] = gimple_get_lhs (new_stmt_info->stmt);
7056 else
7057 vec_oprnds0[0]
7058 = vect_get_vec_def_for_stmt_copy (loop_vinfo,
7059 vec_oprnds0[0]);
7060 if (single_defuse_cycle && reduc_index == 1)
7061 vec_oprnds1[0] = gimple_get_lhs (new_stmt_info->stmt);
7062 else
7063 vec_oprnds1[0]
7064 = vect_get_vec_def_for_stmt_copy (loop_vinfo,
7065 vec_oprnds1[0]);
7066 if (op_type == ternary_op)
7068 if (single_defuse_cycle && reduc_index == 2)
7069 vec_oprnds2[0] = gimple_get_lhs (new_stmt_info->stmt);
7070 else
7071 vec_oprnds2[0]
7072 = vect_get_vec_def_for_stmt_copy (loop_vinfo,
7073 vec_oprnds2[0]);
7078 FOR_EACH_VEC_ELT (vec_oprnds0, i, def0)
7080 tree vop[3] = { def0, vec_oprnds1[i], NULL_TREE };
7081 if (masked_loop_p)
7083 /* Make sure that the reduction accumulator is vop[0]. */
7084 if (reduc_index == 1)
7086 gcc_assert (commutative_tree_code (code));
7087 std::swap (vop[0], vop[1]);
7089 tree mask = vect_get_loop_mask (gsi, masks, vec_num * ncopies,
7090 vectype_in, i * ncopies + j);
7091 gcall *call = gimple_build_call_internal (cond_fn, 4, mask,
7092 vop[0], vop[1],
7093 vop[0]);
7094 new_temp = make_ssa_name (vec_dest, call);
7095 gimple_call_set_lhs (call, new_temp);
7096 gimple_call_set_nothrow (call, true);
7097 new_stmt_info
7098 = vect_finish_stmt_generation (stmt_info, call, gsi);
7100 else
7102 if (op_type == ternary_op)
7103 vop[2] = vec_oprnds2[i];
7105 gassign *new_stmt = gimple_build_assign (vec_dest, code,
7106 vop[0], vop[1], vop[2]);
7107 new_temp = make_ssa_name (vec_dest, new_stmt);
7108 gimple_assign_set_lhs (new_stmt, new_temp);
7109 new_stmt_info
7110 = vect_finish_stmt_generation (stmt_info, new_stmt, gsi);
7113 if (slp_node)
7115 SLP_TREE_VEC_STMTS (slp_node).quick_push (new_stmt_info);
7116 vect_defs.quick_push (new_temp);
7118 else
7119 vect_defs[0] = new_temp;
7122 if (slp_node)
7123 continue;
7125 if (j == 0)
7126 STMT_VINFO_VEC_STMT (stmt_info) = *vec_stmt = new_stmt_info;
7127 else
7128 STMT_VINFO_RELATED_STMT (prev_stmt_info) = new_stmt_info;
7130 prev_stmt_info = new_stmt_info;
7133 /* Finalize the reduction-phi (set its arguments) and create the
7134 epilog reduction code. */
7135 if ((!single_defuse_cycle || code == COND_EXPR) && !slp_node)
7136 vect_defs[0] = gimple_get_lhs ((*vec_stmt)->stmt);
7138 vect_create_epilog_for_reduction (vect_defs, stmt_info, reduc_def_phi,
7139 epilog_copies, reduc_fn, phis,
7140 double_reduc, slp_node, slp_node_instance,
7141 cond_reduc_val, cond_reduc_op_code,
7142 neutral_op);
7144 return true;
7147 /* Function vect_min_worthwhile_factor.
7149 For a loop where we could vectorize the operation indicated by CODE,
7150 return the minimum vectorization factor that makes it worthwhile
7151 to use generic vectors. */
7152 static unsigned int
7153 vect_min_worthwhile_factor (enum tree_code code)
7155 switch (code)
7157 case PLUS_EXPR:
7158 case MINUS_EXPR:
7159 case NEGATE_EXPR:
7160 return 4;
7162 case BIT_AND_EXPR:
7163 case BIT_IOR_EXPR:
7164 case BIT_XOR_EXPR:
7165 case BIT_NOT_EXPR:
7166 return 2;
7168 default:
7169 return INT_MAX;
7173 /* Return true if VINFO indicates we are doing loop vectorization and if
7174 it is worth decomposing CODE operations into scalar operations for
7175 that loop's vectorization factor. */
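/* As a rough illustration: with a constant vectorization factor of 4, both
   PLUS_EXPR (threshold 4) and BIT_AND_EXPR (threshold 2) are considered
   worthwhile, whereas with a factor of 2 only the bitwise codes qualify.  */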
7177 bool
7178 vect_worthwhile_without_simd_p (vec_info *vinfo, tree_code code)
7180 loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo);
7181 unsigned HOST_WIDE_INT value;
7182 return (loop_vinfo
7183 && LOOP_VINFO_VECT_FACTOR (loop_vinfo).is_constant (&value)
7184 && value >= vect_min_worthwhile_factor (code));
7187 /* Function vectorizable_induction
7189 Check if STMT_INFO performs an induction computation that can be vectorized.
7190 If VEC_STMT is also passed, vectorize the induction PHI: create a vectorized
7191 phi to replace it, put it in VEC_STMT, and add it to the same basic block.
7192 Return true if STMT_INFO is vectorizable in this way. */
7194 bool
7195 vectorizable_induction (stmt_vec_info stmt_info,
7196 gimple_stmt_iterator *gsi ATTRIBUTE_UNUSED,
7197 stmt_vec_info *vec_stmt, slp_tree slp_node,
7198 stmt_vector_for_cost *cost_vec)
7200 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
7201 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
7202 unsigned ncopies;
7203 bool nested_in_vect_loop = false;
7204 struct loop *iv_loop;
7205 tree vec_def;
7206 edge pe = loop_preheader_edge (loop);
7207 basic_block new_bb;
7208 tree new_vec, vec_init, vec_step, t;
7209 tree new_name;
7210 gimple *new_stmt;
7211 gphi *induction_phi;
7212 tree induc_def, vec_dest;
7213 tree init_expr, step_expr;
7214 poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
7215 unsigned i;
7216 tree expr;
7217 gimple_seq stmts;
7218 imm_use_iterator imm_iter;
7219 use_operand_p use_p;
7220 gimple *exit_phi;
7221 edge latch_e;
7222 tree loop_arg;
7223 gimple_stmt_iterator si;
7225 gphi *phi = dyn_cast <gphi *> (stmt_info->stmt);
7226 if (!phi)
7227 return false;
7229 if (!STMT_VINFO_RELEVANT_P (stmt_info))
7230 return false;
7232 /* Make sure it was recognized as induction computation. */
7233 if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_induction_def)
7234 return false;
7236 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
7237 poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (vectype);
7239 if (slp_node)
7240 ncopies = 1;
7241 else
7242 ncopies = vect_get_num_copies (loop_vinfo, vectype);
7243 gcc_assert (ncopies >= 1);
7245 /* FORNOW. These restrictions should be relaxed. */
7246 if (nested_in_vect_loop_p (loop, stmt_info))
7248 imm_use_iterator imm_iter;
7249 use_operand_p use_p;
7250 gimple *exit_phi;
7251 edge latch_e;
7252 tree loop_arg;
7254 if (ncopies > 1)
7256 if (dump_enabled_p ())
7257 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7258 "multiple types in nested loop.\n");
7259 return false;
7262 /* FORNOW: outer loop induction with SLP not supported. */
7263 if (STMT_SLP_TYPE (stmt_info))
7264 return false;
7266 exit_phi = NULL;
7267 latch_e = loop_latch_edge (loop->inner);
7268 loop_arg = PHI_ARG_DEF_FROM_EDGE (phi, latch_e);
7269 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, loop_arg)
7271 gimple *use_stmt = USE_STMT (use_p);
7272 if (is_gimple_debug (use_stmt))
7273 continue;
7275 if (!flow_bb_inside_loop_p (loop->inner, gimple_bb (use_stmt)))
7277 exit_phi = use_stmt;
7278 break;
7281 if (exit_phi)
7283 stmt_vec_info exit_phi_vinfo = loop_vinfo->lookup_stmt (exit_phi);
7284 if (!(STMT_VINFO_RELEVANT_P (exit_phi_vinfo)
7285 && !STMT_VINFO_LIVE_P (exit_phi_vinfo)))
7287 if (dump_enabled_p ())
7288 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7289 "inner-loop induction only used outside "
7290 "of the outer vectorized loop.\n");
7291 return false;
7295 nested_in_vect_loop = true;
7296 iv_loop = loop->inner;
7298 else
7299 iv_loop = loop;
7300 gcc_assert (iv_loop == (gimple_bb (phi))->loop_father);
7302 if (slp_node && !nunits.is_constant ())
7304 /* The current SLP code creates the initial value element-by-element. */
7305 if (dump_enabled_p ())
7306 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7307 "SLP induction not supported for variable-length"
7308 " vectors.\n");
7309 return false;
7312 if (!vec_stmt) /* transformation not required. */
7314 STMT_VINFO_TYPE (stmt_info) = induc_vec_info_type;
7315 DUMP_VECT_SCOPE ("vectorizable_induction");
7316 vect_model_induction_cost (stmt_info, ncopies, cost_vec);
7317 return true;
7320 /* Transform. */
7322 /* Compute a vector variable, initialized with the first VF values of
7323 the induction variable. E.g., for an iv with IV_PHI='X' and
7324 evolution S, for a vector of 4 units, we want to compute:
7325 [X, X + S, X + 2*S, X + 3*S]. */
7327 if (dump_enabled_p ())
7328 dump_printf_loc (MSG_NOTE, vect_location, "transform induction phi.\n");
7330 latch_e = loop_latch_edge (iv_loop);
7331 loop_arg = PHI_ARG_DEF_FROM_EDGE (phi, latch_e);
7333 step_expr = STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_info);
7334 gcc_assert (step_expr != NULL_TREE);
7336 pe = loop_preheader_edge (iv_loop);
7337 init_expr = PHI_ARG_DEF_FROM_EDGE (phi,
7338 loop_preheader_edge (iv_loop));
7340 stmts = NULL;
7341 if (!nested_in_vect_loop)
7343 /* Convert the initial value to the desired type. */
7344 tree new_type = TREE_TYPE (vectype);
7345 init_expr = gimple_convert (&stmts, new_type, init_expr);
7347 /* If we are using the loop mask to "peel" for alignment then we need
7348 to adjust the start value here. */
7349 tree skip_niters = LOOP_VINFO_MASK_SKIP_NITERS (loop_vinfo);
7350 if (skip_niters != NULL_TREE)
7352 if (FLOAT_TYPE_P (vectype))
7353 skip_niters = gimple_build (&stmts, FLOAT_EXPR, new_type,
7354 skip_niters);
7355 else
7356 skip_niters = gimple_convert (&stmts, new_type, skip_niters);
7357 tree skip_step = gimple_build (&stmts, MULT_EXPR, new_type,
7358 skip_niters, step_expr);
7359 init_expr = gimple_build (&stmts, MINUS_EXPR, new_type,
7360 init_expr, skip_step);
7364 /* Convert the step to the desired type. */
7365 step_expr = gimple_convert (&stmts, TREE_TYPE (vectype), step_expr);
7367 if (stmts)
7369 new_bb = gsi_insert_seq_on_edge_immediate (pe, stmts);
7370 gcc_assert (!new_bb);
7373 /* Find the first insertion point in the BB. */
7374 basic_block bb = gimple_bb (phi);
7375 si = gsi_after_labels (bb);
7377 /* For SLP induction we have to generate several IVs as for example
7378 with group size 3 we need [i, i, i, i + S] [i + S, i + S, i + 2*S, i + 2*S]
7379 [i + 2*S, i + 3*S, i + 3*S, i + 3*S]. The step is the same uniform
7380 [VF*S, VF*S, VF*S, VF*S] for all. */
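/* Continuing the group-size-3, four-lane illustration above:
   nivs = least_common_multiple (3, 4) / 4 = 3 distinct IVs are created,
   and any additional vector stmts reuse them with a step of VF'*S where
   VF' = least_common_multiple (3, 4) / 3 = 4.  */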
7381 if (slp_node)
7383 /* Enforced above. */
7384 unsigned int const_nunits = nunits.to_constant ();
7386 /* Generate [VF*S, VF*S, ... ]. */
7387 if (SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr)))
7389 expr = build_int_cst (integer_type_node, vf);
7390 expr = fold_convert (TREE_TYPE (step_expr), expr);
7392 else
7393 expr = build_int_cst (TREE_TYPE (step_expr), vf);
7394 new_name = fold_build2 (MULT_EXPR, TREE_TYPE (step_expr),
7395 expr, step_expr);
7396 if (! CONSTANT_CLASS_P (new_name))
7397 new_name = vect_init_vector (stmt_info, new_name,
7398 TREE_TYPE (step_expr), NULL);
7399 new_vec = build_vector_from_val (vectype, new_name);
7400 vec_step = vect_init_vector (stmt_info, new_vec, vectype, NULL);
7402 /* Now generate the IVs. */
7403 unsigned group_size = SLP_TREE_SCALAR_STMTS (slp_node).length ();
7404 unsigned nvects = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node);
7405 unsigned elts = const_nunits * nvects;
7406 unsigned nivs = least_common_multiple (group_size,
7407 const_nunits) / const_nunits;
7408 gcc_assert (elts % group_size == 0);
7409 tree elt = init_expr;
7410 unsigned ivn;
7411 for (ivn = 0; ivn < nivs; ++ivn)
7413 tree_vector_builder elts (vectype, const_nunits, 1);
7414 stmts = NULL;
7415 for (unsigned eltn = 0; eltn < const_nunits; ++eltn)
7417 if (ivn*const_nunits + eltn >= group_size
7418 && (ivn * const_nunits + eltn) % group_size == 0)
7419 elt = gimple_build (&stmts, PLUS_EXPR, TREE_TYPE (elt),
7420 elt, step_expr);
7421 elts.quick_push (elt);
7423 vec_init = gimple_build_vector (&stmts, &elts);
7424 if (stmts)
7426 new_bb = gsi_insert_seq_on_edge_immediate (pe, stmts);
7427 gcc_assert (!new_bb);
7430 /* Create the induction-phi that defines the induction-operand. */
7431 vec_dest = vect_get_new_vect_var (vectype, vect_simple_var, "vec_iv_");
7432 induction_phi = create_phi_node (vec_dest, iv_loop->header);
7433 stmt_vec_info induction_phi_info
7434 = loop_vinfo->add_stmt (induction_phi);
7435 induc_def = PHI_RESULT (induction_phi);
7437 /* Create the iv update inside the loop */
7438 vec_def = make_ssa_name (vec_dest);
7439 new_stmt = gimple_build_assign (vec_def, PLUS_EXPR, induc_def, vec_step);
7440 gsi_insert_before (&si, new_stmt, GSI_SAME_STMT);
7441 loop_vinfo->add_stmt (new_stmt);
7443 /* Set the arguments of the phi node: */
7444 add_phi_arg (induction_phi, vec_init, pe, UNKNOWN_LOCATION);
7445 add_phi_arg (induction_phi, vec_def, loop_latch_edge (iv_loop),
7446 UNKNOWN_LOCATION);
7448 SLP_TREE_VEC_STMTS (slp_node).quick_push (induction_phi_info);
7451 /* Re-use IVs when we can. */
7452 if (ivn < nvects)
7454 unsigned vfp
7455 = least_common_multiple (group_size, const_nunits) / group_size;
7456 /* Generate [VF'*S, VF'*S, ... ]. */
7457 if (SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr)))
7459 expr = build_int_cst (integer_type_node, vfp);
7460 expr = fold_convert (TREE_TYPE (step_expr), expr);
7462 else
7463 expr = build_int_cst (TREE_TYPE (step_expr), vfp);
7464 new_name = fold_build2 (MULT_EXPR, TREE_TYPE (step_expr),
7465 expr, step_expr);
7466 if (! CONSTANT_CLASS_P (new_name))
7467 new_name = vect_init_vector (stmt_info, new_name,
7468 TREE_TYPE (step_expr), NULL);
7469 new_vec = build_vector_from_val (vectype, new_name);
7470 vec_step = vect_init_vector (stmt_info, new_vec, vectype, NULL);
7471 for (; ivn < nvects; ++ivn)
7473 gimple *iv = SLP_TREE_VEC_STMTS (slp_node)[ivn - nivs]->stmt;
7474 tree def;
7475 if (gimple_code (iv) == GIMPLE_PHI)
7476 def = gimple_phi_result (iv);
7477 else
7478 def = gimple_assign_lhs (iv);
7479 new_stmt = gimple_build_assign (make_ssa_name (vectype),
7480 PLUS_EXPR,
7481 def, vec_step);
7482 if (gimple_code (iv) == GIMPLE_PHI)
7483 gsi_insert_before (&si, new_stmt, GSI_SAME_STMT);
7484 else
7486 gimple_stmt_iterator tgsi = gsi_for_stmt (iv);
7487 gsi_insert_after (&tgsi, new_stmt, GSI_CONTINUE_LINKING);
7489 SLP_TREE_VEC_STMTS (slp_node).quick_push
7490 (loop_vinfo->add_stmt (new_stmt));
7494 return true;
7497 /* Create the vector that holds the initial_value of the induction. */
7498 if (nested_in_vect_loop)
7500 /* iv_loop is nested in the loop to be vectorized. init_expr had already
7501 been created during vectorization of previous stmts. We obtain it
7502 from the STMT_VINFO_VEC_STMT of the defining stmt. */
7503 vec_init = vect_get_vec_def_for_operand (init_expr, stmt_info);
7504 /* If the initial value is not of proper type, convert it. */
7505 if (!useless_type_conversion_p (vectype, TREE_TYPE (vec_init)))
7507 new_stmt
7508 = gimple_build_assign (vect_get_new_ssa_name (vectype,
7509 vect_simple_var,
7510 "vec_iv_"),
7511 VIEW_CONVERT_EXPR,
7512 build1 (VIEW_CONVERT_EXPR, vectype,
7513 vec_init));
7514 vec_init = gimple_assign_lhs (new_stmt);
7515 new_bb = gsi_insert_on_edge_immediate (loop_preheader_edge (iv_loop),
7516 new_stmt);
7517 gcc_assert (!new_bb);
7518 loop_vinfo->add_stmt (new_stmt);
7521 else
7523 /* iv_loop is the loop to be vectorized. Create:
7524 vec_init = [X, X+S, X+2*S, X+3*S] (S = step_expr, X = init_expr) */
7525 stmts = NULL;
7526 new_name = gimple_convert (&stmts, TREE_TYPE (vectype), init_expr);
7528 unsigned HOST_WIDE_INT const_nunits;
7529 if (nunits.is_constant (&const_nunits))
7531 tree_vector_builder elts (vectype, const_nunits, 1);
7532 elts.quick_push (new_name);
7533 for (i = 1; i < const_nunits; i++)
7535 /* Create: new_name_i = new_name + step_expr */
7536 new_name = gimple_build (&stmts, PLUS_EXPR, TREE_TYPE (new_name),
7537 new_name, step_expr);
7538 elts.quick_push (new_name);
7540 /* Create a vector from [new_name_0, new_name_1, ...,
7541 new_name_nunits-1] */
7542 vec_init = gimple_build_vector (&stmts, &elts);
7544 else if (INTEGRAL_TYPE_P (TREE_TYPE (step_expr)))
7545 /* Build the initial value directly from a VEC_SERIES_EXPR. */
7546 vec_init = gimple_build (&stmts, VEC_SERIES_EXPR, vectype,
7547 new_name, step_expr);
7548 else
7550 /* Build:
7551 [base, base, base, ...]
7552 + (vectype) [0, 1, 2, ...] * [step, step, step, ...]. */
7553 gcc_assert (SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr)));
7554 gcc_assert (flag_associative_math);
7555 tree index = build_index_vector (vectype, 0, 1);
7556 tree base_vec = gimple_build_vector_from_val (&stmts, vectype,
7557 new_name);
7558 tree step_vec = gimple_build_vector_from_val (&stmts, vectype,
7559 step_expr);
7560 vec_init = gimple_build (&stmts, FLOAT_EXPR, vectype, index);
7561 vec_init = gimple_build (&stmts, MULT_EXPR, vectype,
7562 vec_init, step_vec);
7563 vec_init = gimple_build (&stmts, PLUS_EXPR, vectype,
7564 vec_init, base_vec);
7567 if (stmts)
7569 new_bb = gsi_insert_seq_on_edge_immediate (pe, stmts);
7570 gcc_assert (!new_bb);
7575 /* Create the vector that holds the step of the induction. */
7576 if (nested_in_vect_loop)
7577 /* iv_loop is nested in the loop to be vectorized. Generate:
7578 vec_step = [S, S, S, S] */
7579 new_name = step_expr;
7580 else
7582 /* iv_loop is the loop to be vectorized. Generate:
7583 vec_step = [VF*S, VF*S, VF*S, VF*S] */
7584 gimple_seq seq = NULL;
7585 if (SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr)))
7587 expr = build_int_cst (integer_type_node, vf);
7588 expr = gimple_build (&seq, FLOAT_EXPR, TREE_TYPE (step_expr), expr);
7590 else
7591 expr = build_int_cst (TREE_TYPE (step_expr), vf);
7592 new_name = gimple_build (&seq, MULT_EXPR, TREE_TYPE (step_expr),
7593 expr, step_expr);
7594 if (seq)
7596 new_bb = gsi_insert_seq_on_edge_immediate (pe, seq);
7597 gcc_assert (!new_bb);
7601 t = unshare_expr (new_name);
7602 gcc_assert (CONSTANT_CLASS_P (new_name)
7603 || TREE_CODE (new_name) == SSA_NAME);
7604 new_vec = build_vector_from_val (vectype, t);
7605 vec_step = vect_init_vector (stmt_info, new_vec, vectype, NULL);
7608 /* Create the following def-use cycle:
7609 loop prolog:
7610 vec_init = ...
7611 vec_step = ...
7612 loop:
7613 vec_iv = PHI <vec_init, vec_loop>
7615 STMT
7617 vec_loop = vec_iv + vec_step; */
7619 /* Create the induction-phi that defines the induction-operand. */
7620 vec_dest = vect_get_new_vect_var (vectype, vect_simple_var, "vec_iv_");
7621 induction_phi = create_phi_node (vec_dest, iv_loop->header);
7622 stmt_vec_info induction_phi_info = loop_vinfo->add_stmt (induction_phi);
7623 induc_def = PHI_RESULT (induction_phi);
7625 /* Create the iv update inside the loop */
7626 vec_def = make_ssa_name (vec_dest);
7627 new_stmt = gimple_build_assign (vec_def, PLUS_EXPR, induc_def, vec_step);
7628 gsi_insert_before (&si, new_stmt, GSI_SAME_STMT);
7629 stmt_vec_info new_stmt_info = loop_vinfo->add_stmt (new_stmt);
7631 /* Set the arguments of the phi node: */
7632 add_phi_arg (induction_phi, vec_init, pe, UNKNOWN_LOCATION);
7633 add_phi_arg (induction_phi, vec_def, loop_latch_edge (iv_loop),
7634 UNKNOWN_LOCATION);
7636 STMT_VINFO_VEC_STMT (stmt_info) = *vec_stmt = induction_phi_info;
7638 /* In case the vectorization factor (VF) is bigger than the number
7639 of elements that we can fit in a vectype (nunits), we have to generate
7640 more than one vector stmt - i.e - we need to "unroll" the
7641 vector stmt by a factor VF/nunits. For more details see documentation
7642 in vectorizable_operation. */
7644 if (ncopies > 1)
7646 gimple_seq seq = NULL;
7647 stmt_vec_info prev_stmt_vinfo;
7648 /* FORNOW. This restriction should be relaxed. */
7649 gcc_assert (!nested_in_vect_loop);
7651 /* Create the vector that holds the step of the induction. */
7652 if (SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr)))
7654 expr = build_int_cst (integer_type_node, nunits);
7655 expr = gimple_build (&seq, FLOAT_EXPR, TREE_TYPE (step_expr), expr);
7657 else
7658 expr = build_int_cst (TREE_TYPE (step_expr), nunits);
7659 new_name = gimple_build (&seq, MULT_EXPR, TREE_TYPE (step_expr),
7660 expr, step_expr);
7661 if (seq)
7663 new_bb = gsi_insert_seq_on_edge_immediate (pe, seq);
7664 gcc_assert (!new_bb);
7667 t = unshare_expr (new_name);
7668 gcc_assert (CONSTANT_CLASS_P (new_name)
7669 || TREE_CODE (new_name) == SSA_NAME);
7670 new_vec = build_vector_from_val (vectype, t);
7671 vec_step = vect_init_vector (stmt_info, new_vec, vectype, NULL);
7673 vec_def = induc_def;
7674 prev_stmt_vinfo = induction_phi_info;
7675 for (i = 1; i < ncopies; i++)
7677 /* vec_i = vec_prev + vec_step */
7678 new_stmt = gimple_build_assign (vec_dest, PLUS_EXPR,
7679 vec_def, vec_step);
7680 vec_def = make_ssa_name (vec_dest, new_stmt);
7681 gimple_assign_set_lhs (new_stmt, vec_def);
7683 gsi_insert_before (&si, new_stmt, GSI_SAME_STMT);
7684 new_stmt_info = loop_vinfo->add_stmt (new_stmt);
7685 STMT_VINFO_RELATED_STMT (prev_stmt_vinfo) = new_stmt_info;
7686 prev_stmt_vinfo = new_stmt_info;
7690 if (nested_in_vect_loop)
7692 /* Find the loop-closed exit-phi of the induction, and record
7693 the final vector of induction results: */
7694 exit_phi = NULL;
7695 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, loop_arg)
7697 gimple *use_stmt = USE_STMT (use_p);
7698 if (is_gimple_debug (use_stmt))
7699 continue;
7701 if (!flow_bb_inside_loop_p (iv_loop, gimple_bb (use_stmt)))
7703 exit_phi = use_stmt;
7704 break;
7707 if (exit_phi)
7709 stmt_vec_info stmt_vinfo = loop_vinfo->lookup_stmt (exit_phi);
7710 /* FORNOW. Currently not supporting the case that an inner-loop induction
7711 is not used in the outer-loop (i.e. only outside the outer-loop). */
7712 gcc_assert (STMT_VINFO_RELEVANT_P (stmt_vinfo)
7713 && !STMT_VINFO_LIVE_P (stmt_vinfo));
7715 STMT_VINFO_VEC_STMT (stmt_vinfo) = new_stmt_info;
7716 if (dump_enabled_p ())
7717 dump_printf_loc (MSG_NOTE, vect_location,
7718 "vector of inductions after inner-loop:%G",
7719 new_stmt);
7724 if (dump_enabled_p ())
7725 dump_printf_loc (MSG_NOTE, vect_location,
7726 "transform induction: created def-use cycle: %G%G",
7727 induction_phi, SSA_NAME_DEF_STMT (vec_def));
7729 return true;
7732 /* Function vectorizable_live_operation.
7734 STMT_INFO computes a value that is used outside the loop. Check if
7735 it can be supported. */
7737 bool
7738 vectorizable_live_operation (stmt_vec_info stmt_info,
7739 gimple_stmt_iterator *gsi ATTRIBUTE_UNUSED,
7740 slp_tree slp_node, int slp_index,
7741 stmt_vec_info *vec_stmt,
7742 stmt_vector_for_cost *)
7744 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
7745 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
7746 imm_use_iterator imm_iter;
7747 tree lhs, lhs_type, bitsize, vec_bitsize;
7748 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
7749 poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (vectype);
7750 int ncopies;
7751 gimple *use_stmt;
7752 auto_vec<tree> vec_oprnds;
7753 int vec_entry = 0;
7754 poly_uint64 vec_index = 0;
7756 gcc_assert (STMT_VINFO_LIVE_P (stmt_info));
7758 if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_reduction_def)
7759 return false;
7761 /* FORNOW. CHECKME. */
7762 if (nested_in_vect_loop_p (loop, stmt_info))
7763 return false;
7765 /* If STMT is not relevant and it is a simple assignment and its inputs are
7766 invariant then it can remain in place, unvectorized. The original last
7767 scalar value that it computes will be used. */
7768 if (!STMT_VINFO_RELEVANT_P (stmt_info))
7770 gcc_assert (is_simple_and_all_uses_invariant (stmt_info, loop_vinfo));
7771 if (dump_enabled_p ())
7772 dump_printf_loc (MSG_NOTE, vect_location,
7773 "statement is simple and uses invariant. Leaving in "
7774 "place.\n");
7775 return true;
7778 if (slp_node)
7779 ncopies = 1;
7780 else
7781 ncopies = vect_get_num_copies (loop_vinfo, vectype);
7783 if (slp_node)
7785 gcc_assert (slp_index >= 0);
7787 int num_scalar = SLP_TREE_SCALAR_STMTS (slp_node).length ();
7788 int num_vec = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node);
7790 /* Get the last occurrence of the scalar index from the concatenation of
7791 all the slp vectors. Calculate which slp vector it is and the index
7792 within. */
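/* For example (illustrative numbers): two vectors of four lanes holding an
   SLP group of two scalars give lanes s0 s1 s0 s1 | s0 s1 s0 s1, so for
   slp_index 1 the last occurrence is pos = 2*4 - 2 + 1 = 7, which maps to
   vec_entry 1, vec_index 3.  */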
7793 poly_uint64 pos = (num_vec * nunits) - num_scalar + slp_index;
7795 /* Calculate which vector contains the result, and which lane of
7796 that vector we need. */
7797 if (!can_div_trunc_p (pos, nunits, &vec_entry, &vec_index))
7799 if (dump_enabled_p ())
7800 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7801 "Cannot determine which vector holds the"
7802 " final result.\n");
7803 return false;
7807 if (!vec_stmt)
7809 /* No transformation required. */
7810 if (LOOP_VINFO_CAN_FULLY_MASK_P (loop_vinfo))
7812 if (!direct_internal_fn_supported_p (IFN_EXTRACT_LAST, vectype,
7813 OPTIMIZE_FOR_SPEED))
7815 if (dump_enabled_p ())
7816 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7817 "can't use a fully-masked loop because "
7818 "the target doesn't support extract last "
7819 "reduction.\n");
7820 LOOP_VINFO_CAN_FULLY_MASK_P (loop_vinfo) = false;
7822 else if (slp_node)
7824 if (dump_enabled_p ())
7825 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7826 "can't use a fully-masked loop because an "
7827 "SLP statement is live after the loop.\n");
7828 LOOP_VINFO_CAN_FULLY_MASK_P (loop_vinfo) = false;
7830 else if (ncopies > 1)
7832 if (dump_enabled_p ())
7833 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7834 "can't use a fully-masked loop because"
7835 " ncopies is greater than 1.\n");
7836 LOOP_VINFO_CAN_FULLY_MASK_P (loop_vinfo) = false;
7838 else
7840 gcc_assert (ncopies == 1 && !slp_node);
7841 vect_record_loop_mask (loop_vinfo,
7842 &LOOP_VINFO_MASKS (loop_vinfo),
7843 1, vectype);
7846 return true;
7849 /* Use the lhs of the original scalar statement. */
7850 gimple *stmt = vect_orig_stmt (stmt_info)->stmt;
7852 lhs = (is_a <gphi *> (stmt)) ? gimple_phi_result (stmt)
7853 : gimple_get_lhs (stmt);
7854 lhs_type = TREE_TYPE (lhs);
7856 bitsize = (VECTOR_BOOLEAN_TYPE_P (vectype)
7857 ? bitsize_int (TYPE_PRECISION (TREE_TYPE (vectype)))
7858 : TYPE_SIZE (TREE_TYPE (vectype)));
7859 vec_bitsize = TYPE_SIZE (vectype);
7861 /* Get the vectorized lhs of STMT and the lane to use (counted in bits). */
7862 tree vec_lhs, bitstart;
7863 if (slp_node)
7865 gcc_assert (!LOOP_VINFO_FULLY_MASKED_P (loop_vinfo));
7867 /* Get the correct slp vectorized stmt. */
7868 gimple *vec_stmt = SLP_TREE_VEC_STMTS (slp_node)[vec_entry]->stmt;
7869 if (gphi *phi = dyn_cast <gphi *> (vec_stmt))
7870 vec_lhs = gimple_phi_result (phi);
7871 else
7872 vec_lhs = gimple_get_lhs (vec_stmt);
7874 /* Get entry to use. */
7875 bitstart = bitsize_int (vec_index);
7876 bitstart = int_const_binop (MULT_EXPR, bitsize, bitstart);
7878 else
7880 enum vect_def_type dt = STMT_VINFO_DEF_TYPE (stmt_info);
7881 vec_lhs = vect_get_vec_def_for_operand_1 (stmt_info, dt);
7882 gcc_checking_assert (ncopies == 1
7883 || !LOOP_VINFO_FULLY_MASKED_P (loop_vinfo));
7885 /* For multiple copies, get the last copy. */
7886 for (int i = 1; i < ncopies; ++i)
7887 vec_lhs = vect_get_vec_def_for_stmt_copy (loop_vinfo, vec_lhs);
7889 /* Get the last lane in the vector. */
7890 bitstart = int_const_binop (MINUS_EXPR, vec_bitsize, bitsize);
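/* E.g. for a V4SI vector (illustrative): vec_bitsize is 128 and bitsize
   is 32, so bitstart = 96 selects the last lane.  */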
7893 gimple_seq stmts = NULL;
7894 tree new_tree;
7895 if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
7897 /* Emit:
7899 SCALAR_RES = EXTRACT_LAST <VEC_LHS, MASK>
7901 where VEC_LHS is the vectorized live-out result and MASK is
7902 the loop mask for the final iteration. */
7903 gcc_assert (ncopies == 1 && !slp_node);
7904 tree scalar_type = TREE_TYPE (STMT_VINFO_VECTYPE (stmt_info));
7905 tree mask = vect_get_loop_mask (gsi, &LOOP_VINFO_MASKS (loop_vinfo),
7906 1, vectype, 0);
7907 tree scalar_res = gimple_build (&stmts, CFN_EXTRACT_LAST,
7908 scalar_type, mask, vec_lhs);
7910 /* Convert the extracted vector element to the required scalar type. */
7911 new_tree = gimple_convert (&stmts, lhs_type, scalar_res);
7913 else
7915 tree bftype = TREE_TYPE (vectype);
7916 if (VECTOR_BOOLEAN_TYPE_P (vectype))
7917 bftype = build_nonstandard_integer_type (tree_to_uhwi (bitsize), 1);
7918 new_tree = build3 (BIT_FIELD_REF, bftype, vec_lhs, bitsize, bitstart);
7919 new_tree = force_gimple_operand (fold_convert (lhs_type, new_tree),
7920 &stmts, true, NULL_TREE);
7923 if (stmts)
7924 gsi_insert_seq_on_edge_immediate (single_exit (loop), stmts);
7926 /* Replace the use of lhs with the newly computed result. If the use stmt is a
7927 single-arg PHI, just replace all uses of the PHI result. This is necessary
7928 because the lcssa PHI defining lhs may appear before the newly inserted stmt. */
7929 use_operand_p use_p;
7930 FOR_EACH_IMM_USE_STMT (use_stmt, imm_iter, lhs)
7931 if (!flow_bb_inside_loop_p (loop, gimple_bb (use_stmt))
7932 && !is_gimple_debug (use_stmt))
7934 if (gimple_code (use_stmt) == GIMPLE_PHI
7935 && gimple_phi_num_args (use_stmt) == 1)
7937 replace_uses_by (gimple_phi_result (use_stmt), new_tree);
7939 else
7941 FOR_EACH_IMM_USE_ON_STMT (use_p, imm_iter)
7942 SET_USE (use_p, new_tree);
7944 update_stmt (use_stmt);
7947 return true;
7950 /* Kill any debug uses outside LOOP of SSA names defined in STMT_INFO. */
7952 static void
7953 vect_loop_kill_debug_uses (struct loop *loop, stmt_vec_info stmt_info)
7955 ssa_op_iter op_iter;
7956 imm_use_iterator imm_iter;
7957 def_operand_p def_p;
7958 gimple *ustmt;
7960 FOR_EACH_PHI_OR_STMT_DEF (def_p, stmt_info->stmt, op_iter, SSA_OP_DEF)
7962 FOR_EACH_IMM_USE_STMT (ustmt, imm_iter, DEF_FROM_PTR (def_p))
7964 basic_block bb;
7966 if (!is_gimple_debug (ustmt))
7967 continue;
7969 bb = gimple_bb (ustmt);
7971 if (!flow_bb_inside_loop_p (loop, bb))
7973 if (gimple_debug_bind_p (ustmt))
7975 if (dump_enabled_p ())
7976 dump_printf_loc (MSG_NOTE, vect_location,
7977 "killing debug use\n");
7979 gimple_debug_bind_reset_value (ustmt);
7980 update_stmt (ustmt);
7982 else
7983 gcc_unreachable ();
7989 /* Given loop represented by LOOP_VINFO, return true if computation of
7990 LOOP_VINFO_NITERS (= LOOP_VINFO_NITERSM1 + 1) doesn't overflow, false
7991 otherwise. */
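/* For instance, with a 32-bit unsigned niters type and a latch that may
   execute 0xffffffff times, NITERSM1 + 1 wraps to zero, so we must answer
   false; any smaller maximum latch count lets us answer true.  */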
7993 static bool
7994 loop_niters_no_overflow (loop_vec_info loop_vinfo)
7996 /* Constant case. */
7997 if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
7999 tree cst_niters = LOOP_VINFO_NITERS (loop_vinfo);
8000 tree cst_nitersm1 = LOOP_VINFO_NITERSM1 (loop_vinfo);
8002 gcc_assert (TREE_CODE (cst_niters) == INTEGER_CST);
8003 gcc_assert (TREE_CODE (cst_nitersm1) == INTEGER_CST);
8004 if (wi::to_widest (cst_nitersm1) < wi::to_widest (cst_niters))
8005 return true;
8008 widest_int max;
8009 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
8010 /* Check the upper bound of loop niters. */
8011 if (get_max_loop_iterations (loop, &max))
8013 tree type = TREE_TYPE (LOOP_VINFO_NITERS (loop_vinfo));
8014 signop sgn = TYPE_SIGN (type);
8015 widest_int type_max = widest_int::from (wi::max_value (type), sgn);
8016 if (max < type_max)
8017 return true;
8019 return false;
8022 /* Return a mask type with half the number of elements as TYPE. */
8024 tree
8025 vect_halve_mask_nunits (tree type)
8027 poly_uint64 nunits = exact_div (TYPE_VECTOR_SUBPARTS (type), 2);
8028 return build_truth_vector_type (nunits, current_vector_size);
8031 /* Return a mask type with twice as many elements as TYPE. */
8033 tree
8034 vect_double_mask_nunits (tree type)
8036 poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (type) * 2;
8037 return build_truth_vector_type (nunits, current_vector_size);
8040 /* Record that a fully-masked version of LOOP_VINFO would need MASKS to
8041 contain a sequence of NVECTORS masks that each control a vector of type
8042 VECTYPE. */
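/* Illustrative example (assumed numbers): recording nvectors = 2 masks for
   a V8HI vectype in a loop with vectorization factor 16 gives
   nscalars_per_iter = 2 * 8 / 16 = 1 for the two-vector rgroup.  */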
8044 void
8045 vect_record_loop_mask (loop_vec_info loop_vinfo, vec_loop_masks *masks,
8046 unsigned int nvectors, tree vectype)
8048 gcc_assert (nvectors != 0);
8049 if (masks->length () < nvectors)
8050 masks->safe_grow_cleared (nvectors);
8051 rgroup_masks *rgm = &(*masks)[nvectors - 1];
8052 /* The number of scalars per iteration and the number of vectors are
8053 both compile-time constants. */
8054 unsigned int nscalars_per_iter
8055 = exact_div (nvectors * TYPE_VECTOR_SUBPARTS (vectype),
8056 LOOP_VINFO_VECT_FACTOR (loop_vinfo)).to_constant ();
8057 if (rgm->max_nscalars_per_iter < nscalars_per_iter)
8059 rgm->max_nscalars_per_iter = nscalars_per_iter;
8060 rgm->mask_type = build_same_sized_truth_vector_type (vectype);
8064 /* Given a complete set of masks MASKS, extract mask number INDEX
8065 for an rgroup that operates on NVECTORS vectors of type VECTYPE,
8066 where 0 <= INDEX < NVECTORS. Insert any set-up statements before GSI.
8068 See the comment above vec_loop_masks for more details about the mask
8069 arrangement. */
8071 tree
8072 vect_get_loop_mask (gimple_stmt_iterator *gsi, vec_loop_masks *masks,
8073 unsigned int nvectors, tree vectype, unsigned int index)
8075 rgroup_masks *rgm = &(*masks)[nvectors - 1];
8076 tree mask_type = rgm->mask_type;
8078 /* Populate the rgroup's mask array, if this is the first time we've
8079 used it. */
8080 if (rgm->masks.is_empty ())
8082 rgm->masks.safe_grow_cleared (nvectors);
8083 for (unsigned int i = 0; i < nvectors; ++i)
8085 tree mask = make_temp_ssa_name (mask_type, NULL, "loop_mask");
8086 /* Provide a dummy definition until the real one is available. */
8087 SSA_NAME_DEF_STMT (mask) = gimple_build_nop ();
8088 rgm->masks[i] = mask;
8092 tree mask = rgm->masks[index];
8093 if (maybe_ne (TYPE_VECTOR_SUBPARTS (mask_type),
8094 TYPE_VECTOR_SUBPARTS (vectype)))
8096 /* A loop mask for data type X can be reused for data type Y
8097 if X has N times more elements than Y and if Y's elements
8098 are N times bigger than X's. In this case each sequence
8099 of N elements in the loop mask will be all-zero or all-one.
8100 We can then view-convert the mask so that each sequence of
8101 N elements is replaced by a single element. */
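/* For instance (not tied to a particular target): a mask recorded for
   sixteen QImode elements can be reused for eight HImode elements; each
   adjacent pair of mask elements is all-zero or all-one, so the
   view-convert collapses every pair into a single wider element.  */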
8102 gcc_assert (multiple_p (TYPE_VECTOR_SUBPARTS (mask_type),
8103 TYPE_VECTOR_SUBPARTS (vectype)));
8104 gimple_seq seq = NULL;
8105 mask_type = build_same_sized_truth_vector_type (vectype);
8106 mask = gimple_build (&seq, VIEW_CONVERT_EXPR, mask_type, mask);
8107 if (seq)
8108 gsi_insert_seq_before (gsi, seq, GSI_SAME_STMT);
8110 return mask;
8113 /* Scale profiling counters by estimation for LOOP which is vectorized
8114 by factor VF. */
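/* Rough illustration (assumed profile): if the loop was estimated to
   iterate about 100 times per entry and VF is 4, new_est_niter is roughly
   25, so the body counts are scaled to about a quarter and the exit edge
   probability becomes approximately 1 / 26.  */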
8116 static void
8117 scale_profile_for_vect_loop (struct loop *loop, unsigned vf)
8119 edge preheader = loop_preheader_edge (loop);
8120 /* Reduce loop iterations by the vectorization factor. */
8121 gcov_type new_est_niter = niter_for_unrolled_loop (loop, vf);
8122 profile_count freq_h = loop->header->count, freq_e = preheader->count ();
8124 if (freq_h.nonzero_p ())
8126 profile_probability p;
8128 /* Avoid dropping loop body profile counter to 0 because of zero count
8129 in loop's preheader. */
8130 if (!(freq_e == profile_count::zero ()))
8131 freq_e = freq_e.force_nonzero ();
8132 p = freq_e.apply_scale (new_est_niter + 1, 1).probability_in (freq_h);
8133 scale_loop_frequencies (loop, p);
8136 edge exit_e = single_exit (loop);
8137 exit_e->probability = profile_probability::always ()
8138 .apply_scale (1, new_est_niter + 1);
8140 edge exit_l = single_pred_edge (loop->latch);
8141 profile_probability prob = exit_l->probability;
8142 exit_l->probability = exit_e->probability.invert ();
8143 if (prob.initialized_p () && exit_l->probability.initialized_p ())
8144 scale_bbs_frequencies (&loop->latch, 1, exit_l->probability / prob);
8147 /* Vectorize STMT_INFO if relevant, inserting any new instructions before GSI.
8148 When vectorizing STMT_INFO as a store, set *SEEN_STORE to its
8149 stmt_vec_info. */
8151 static void
8152 vect_transform_loop_stmt (loop_vec_info loop_vinfo, stmt_vec_info stmt_info,
8153 gimple_stmt_iterator *gsi, stmt_vec_info *seen_store)
8155 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
8156 poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
8158 if (dump_enabled_p ())
8159 dump_printf_loc (MSG_NOTE, vect_location,
8160 "------>vectorizing statement: %G", stmt_info->stmt);
8162 if (MAY_HAVE_DEBUG_BIND_STMTS && !STMT_VINFO_LIVE_P (stmt_info))
8163 vect_loop_kill_debug_uses (loop, stmt_info);
8165 if (!STMT_VINFO_RELEVANT_P (stmt_info)
8166 && !STMT_VINFO_LIVE_P (stmt_info))
8167 return;
8169 if (STMT_VINFO_VECTYPE (stmt_info))
8171 poly_uint64 nunits
8172 = TYPE_VECTOR_SUBPARTS (STMT_VINFO_VECTYPE (stmt_info));
8173 if (!STMT_SLP_TYPE (stmt_info)
8174 && maybe_ne (nunits, vf)
8175 && dump_enabled_p ())
8176 /* For SLP, VF is set according to the unrolling factor, not
8177 the vector size, hence for SLP this print is not valid. */
8178 dump_printf_loc (MSG_NOTE, vect_location, "multiple-types.\n");
8181 /* Pure SLP statements have already been vectorized. We still need
8182 to apply loop vectorization to hybrid SLP statements. */
8183 if (PURE_SLP_STMT (stmt_info))
8184 return;
8186 if (dump_enabled_p ())
8187 dump_printf_loc (MSG_NOTE, vect_location, "transform statement.\n");
8189 if (vect_transform_stmt (stmt_info, gsi, NULL, NULL))
8190 *seen_store = stmt_info;
8193 /* Function vect_transform_loop.
8195 The analysis phase has determined that the loop is vectorizable.
8196 Vectorize the loop - create vectorized stmts to replace the scalar
8197 stmts in the loop, and update the loop exit condition.
8198 Returns the scalar epilogue loop if any. */
8200 struct loop *
8201 vect_transform_loop (loop_vec_info loop_vinfo)
8203 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
8204 struct loop *epilogue = NULL;
8205 basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
8206 int nbbs = loop->num_nodes;
8207 int i;
8208 tree niters_vector = NULL_TREE;
8209 tree step_vector = NULL_TREE;
8210 tree niters_vector_mult_vf = NULL_TREE;
8211 poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
8212 unsigned int lowest_vf = constant_lower_bound (vf);
8213 gimple *stmt;
8214 bool check_profitability = false;
8215 unsigned int th;
8217 DUMP_VECT_SCOPE ("vec_transform_loop");
8219 loop_vinfo->shared->check_datarefs ();
8221 /* Use the more conservative vectorization threshold. If the number
8222 of iterations is constant, assume the cost check has been performed
8223 by our caller. If the threshold makes all loops profitable that
8224 run at least the (estimated) vectorization factor number of times,
8225 checking is pointless, too. */
8226 th = LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo);
8227 if (th >= vect_vf_for_cost (loop_vinfo)
8228 && !LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
8230 if (dump_enabled_p ())
8231 dump_printf_loc (MSG_NOTE, vect_location,
8232 "Profitability threshold is %d loop iterations.\n",
8233 th);
8234 check_profitability = true;
8237 /* Make sure there exists a single-predecessor exit bb. Do this before
8238 versioning. */
8239 edge e = single_exit (loop);
8240 if (! single_pred_p (e->dest))
8242 split_loop_exit_edge (e, true);
8243 if (dump_enabled_p ())
8244 dump_printf (MSG_NOTE, "split exit edge\n");
8247 /* Version the loop first, if required, so the profitability check
8248 comes first. */
8250 if (LOOP_REQUIRES_VERSIONING (loop_vinfo))
8252 poly_uint64 versioning_threshold
8253 = LOOP_VINFO_VERSIONING_THRESHOLD (loop_vinfo);
8254 if (check_profitability
8255 && ordered_p (poly_uint64 (th), versioning_threshold))
8257 versioning_threshold = ordered_max (poly_uint64 (th),
8258 versioning_threshold);
8259 check_profitability = false;
8261 vect_loop_versioning (loop_vinfo, th, check_profitability,
8262 versioning_threshold);
8263 check_profitability = false;
8266 /* Make sure there exists a single-predecessor exit bb also on the
8267 scalar loop copy. Do this after versioning but before peeling
8268 so CFG structure is fine for both scalar and if-converted loop
8269 to make slpeel_duplicate_current_defs_from_edges face matched
8270 loop closed PHI nodes on the exit. */
8271 if (LOOP_VINFO_SCALAR_LOOP (loop_vinfo))
8273 e = single_exit (LOOP_VINFO_SCALAR_LOOP (loop_vinfo));
8274 if (! single_pred_p (e->dest))
8276 split_loop_exit_edge (e, true);
8277 if (dump_enabled_p ())
8278 dump_printf (MSG_NOTE, "split exit edge of scalar loop\n");
8282 tree niters = vect_build_loop_niters (loop_vinfo);
8283 LOOP_VINFO_NITERS_UNCHANGED (loop_vinfo) = niters;
8284 tree nitersm1 = unshare_expr (LOOP_VINFO_NITERSM1 (loop_vinfo));
8285 bool niters_no_overflow = loop_niters_no_overflow (loop_vinfo);
8286 epilogue = vect_do_peeling (loop_vinfo, niters, nitersm1, &niters_vector,
8287 &step_vector, &niters_vector_mult_vf, th,
8288 check_profitability, niters_no_overflow);
8290 if (niters_vector == NULL_TREE)
8292 if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
8293 && !LOOP_VINFO_FULLY_MASKED_P (loop_vinfo)
8294 && known_eq (lowest_vf, vf))
8296 niters_vector
8297 = build_int_cst (TREE_TYPE (LOOP_VINFO_NITERS (loop_vinfo)),
8298 LOOP_VINFO_INT_NITERS (loop_vinfo) / lowest_vf);
8299 step_vector = build_one_cst (TREE_TYPE (niters));
8301 else
8302 vect_gen_vector_loop_niters (loop_vinfo, niters, &niters_vector,
8303 &step_vector, niters_no_overflow);
8306 /* 1) Make sure the loop header has exactly two entries
8307 2) Make sure we have a preheader basic block. */
8309 gcc_assert (EDGE_COUNT (loop->header->preds) == 2);
8311 split_edge (loop_preheader_edge (loop));
8313 if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo)
8314 && vect_use_loop_mask_for_alignment_p (loop_vinfo))
8315 /* This will deal with any possible peeling. */
8316 vect_prepare_for_masked_peels (loop_vinfo);
8318 /* Schedule the SLP instances first, then handle loop vectorization
8319 below. */
8320 if (!loop_vinfo->slp_instances.is_empty ())
8322 DUMP_VECT_SCOPE ("scheduling SLP instances");
8323 vect_schedule_slp (loop_vinfo);
8326 /* FORNOW: the vectorizer supports only loops whose body consists
8327 of one basic block (header + empty latch). When the vectorizer
8328 supports more involved loop forms, the order in which the BBs are
8329 traversed will need to be reconsidered. */
8331 for (i = 0; i < nbbs; i++)
8333 basic_block bb = bbs[i];
8334 stmt_vec_info stmt_info;
8336 for (gphi_iterator si = gsi_start_phis (bb); !gsi_end_p (si);
8337 gsi_next (&si))
8339 gphi *phi = si.phi ();
8340 if (dump_enabled_p ())
8341 dump_printf_loc (MSG_NOTE, vect_location,
8342 "------>vectorizing phi: %G", phi);
8343 stmt_info = loop_vinfo->lookup_stmt (phi);
8344 if (!stmt_info)
8345 continue;
8347 if (MAY_HAVE_DEBUG_BIND_STMTS && !STMT_VINFO_LIVE_P (stmt_info))
8348 vect_loop_kill_debug_uses (loop, stmt_info);
8350 if (!STMT_VINFO_RELEVANT_P (stmt_info)
8351 && !STMT_VINFO_LIVE_P (stmt_info))
8352 continue;
8354 if (STMT_VINFO_VECTYPE (stmt_info)
8355 && (maybe_ne
8356 (TYPE_VECTOR_SUBPARTS (STMT_VINFO_VECTYPE (stmt_info)), vf))
8357 && dump_enabled_p ())
8358 dump_printf_loc (MSG_NOTE, vect_location, "multiple-types.\n");
8360 if ((STMT_VINFO_DEF_TYPE (stmt_info) == vect_induction_def
8361 || STMT_VINFO_DEF_TYPE (stmt_info) == vect_reduction_def
8362 || STMT_VINFO_DEF_TYPE (stmt_info) == vect_nested_cycle)
8363 && ! PURE_SLP_STMT (stmt_info))
8365 if (dump_enabled_p ())
8366 dump_printf_loc (MSG_NOTE, vect_location, "transform phi.\n");
8367 vect_transform_stmt (stmt_info, NULL, NULL, NULL);
8371 for (gimple_stmt_iterator si = gsi_start_bb (bb);
8372 !gsi_end_p (si);)
8374 stmt = gsi_stmt (si);
8375 /* During vectorization remove existing clobber stmts. */
8376 if (gimple_clobber_p (stmt))
8378 unlink_stmt_vdef (stmt);
8379 gsi_remove (&si, true);
8380 release_defs (stmt);
8382 else
8384 stmt_info = loop_vinfo->lookup_stmt (stmt);
8386 /* vector stmts created in the outer-loop during vectorization of
8387 stmts in an inner-loop may not have a stmt_info, and do not
8388 need to be vectorized. */
8389 stmt_vec_info seen_store = NULL;
8390 if (stmt_info)
8392 if (STMT_VINFO_IN_PATTERN_P (stmt_info))
8394 gimple *def_seq = STMT_VINFO_PATTERN_DEF_SEQ (stmt_info);
8395 for (gimple_stmt_iterator subsi = gsi_start (def_seq);
8396 !gsi_end_p (subsi); gsi_next (&subsi))
8398 stmt_vec_info pat_stmt_info
8399 = loop_vinfo->lookup_stmt (gsi_stmt (subsi));
8400 vect_transform_loop_stmt (loop_vinfo, pat_stmt_info,
8401 &si, &seen_store);
8403 stmt_vec_info pat_stmt_info
8404 = STMT_VINFO_RELATED_STMT (stmt_info);
8405 vect_transform_loop_stmt (loop_vinfo, pat_stmt_info, &si,
8406 &seen_store);
8408 vect_transform_loop_stmt (loop_vinfo, stmt_info, &si,
8409 &seen_store);
8411 gsi_next (&si);
8412 if (seen_store)
8414 if (STMT_VINFO_GROUPED_ACCESS (seen_store))
8415 /* Interleaving. The vectorization of the
8416 interleaving chain was completed - free
8417 all the stores in the chain. */
8418 vect_remove_stores (DR_GROUP_FIRST_ELEMENT (seen_store));
8419 else
8420 /* Free the attached stmt_vec_info and remove the stmt. */
8421 loop_vinfo->remove_stmt (stmt_info);
8426 /* Stub out scalar statements that must not survive vectorization.
8427 Doing this here helps with grouped statements, or statements that
8428 are involved in patterns. */
8429 for (gimple_stmt_iterator gsi = gsi_start_bb (bb);
8430 !gsi_end_p (gsi); gsi_next (&gsi))
8432 gcall *call = dyn_cast <gcall *> (gsi_stmt (gsi));
8433 if (call && gimple_call_internal_p (call, IFN_MASK_LOAD))
8435 tree lhs = gimple_get_lhs (call);
8436 if (!VECTOR_TYPE_P (TREE_TYPE (lhs)))
8438 tree zero = build_zero_cst (TREE_TYPE (lhs));
8439 gimple *new_stmt = gimple_build_assign (lhs, zero);
8440 gsi_replace (&gsi, new_stmt, true);
8444 } /* BBs in loop */
8446 /* The vectorization factor is always > 1, so if we use an IV increment of 1,
8447 a zero NITERS becomes a nonzero NITERS_VECTOR. */
8448 if (integer_onep (step_vector))
8449 niters_no_overflow = true;
8450 vect_set_loop_condition (loop, loop_vinfo, niters_vector, step_vector,
8451 niters_vector_mult_vf, !niters_no_overflow);
8453 unsigned int assumed_vf = vect_vf_for_cost (loop_vinfo);
8454 scale_profile_for_vect_loop (loop, assumed_vf);
8456 /* True if the final iteration might not handle a full vector's
8457 worth of scalar iterations. */
8458 bool final_iter_may_be_partial = LOOP_VINFO_FULLY_MASKED_P (loop_vinfo);
8459 /* The minimum number of iterations performed by the epilogue. This
8460 is 1 when peeling for gaps because we always need a final scalar
8461 iteration. */
8462 int min_epilogue_iters = LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo) ? 1 : 0;
8463 /* +1 to convert latch counts to loop iteration counts,
8464 -min_epilogue_iters to remove iterations that cannot be performed
8465 by the vector code. */
8466 int bias_for_lowest = 1 - min_epilogue_iters;
8467 int bias_for_assumed = bias_for_lowest;
8468 int alignment_npeels = LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo);
8469 if (alignment_npeels && LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
8471 /* When the amount of peeling is known at compile time, the first
8472 iteration will have exactly alignment_npeels active elements.
8473 In the worst case it will have at least one. */
8474 int min_first_active = (alignment_npeels > 0 ? alignment_npeels : 1);
8475 bias_for_lowest += lowest_vf - min_first_active;
8476 bias_for_assumed += assumed_vf - min_first_active;
8478 /* In these calculations the "- 1" converts loop iteration counts
8479 back to latch counts. */
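/* Worked example under simple assumptions: with lowest_vf = 4, no
   epilogue iterations (bias_for_lowest = 1) and an upper bound of 11
   latch iterations (12 loop iterations), the vector loop gets
   floor ((11 + 1) / 4) - 1 = 2 latch iterations, i.e. 3 vector
   iterations covering the 12 scalar ones.  */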
8480 if (loop->any_upper_bound)
8481 loop->nb_iterations_upper_bound
8482 = (final_iter_may_be_partial
8483 ? wi::udiv_ceil (loop->nb_iterations_upper_bound + bias_for_lowest,
8484 lowest_vf) - 1
8485 : wi::udiv_floor (loop->nb_iterations_upper_bound + bias_for_lowest,
8486 lowest_vf) - 1);
8487 if (loop->any_likely_upper_bound)
8488 loop->nb_iterations_likely_upper_bound
8489 = (final_iter_may_be_partial
8490 ? wi::udiv_ceil (loop->nb_iterations_likely_upper_bound
8491 + bias_for_lowest, lowest_vf) - 1
8492 : wi::udiv_floor (loop->nb_iterations_likely_upper_bound
8493 + bias_for_lowest, lowest_vf) - 1);
8494 if (loop->any_estimate)
8495 loop->nb_iterations_estimate
8496 = (final_iter_may_be_partial
8497 ? wi::udiv_ceil (loop->nb_iterations_estimate + bias_for_assumed,
8498 assumed_vf) - 1
8499 : wi::udiv_floor (loop->nb_iterations_estimate + bias_for_assumed,
8500 assumed_vf) - 1);
8502 if (dump_enabled_p ())
8504 if (!LOOP_VINFO_EPILOGUE_P (loop_vinfo))
8506 dump_printf_loc (MSG_NOTE, vect_location,
8507 "LOOP VECTORIZED\n");
8508 if (loop->inner)
8509 dump_printf_loc (MSG_NOTE, vect_location,
8510 "OUTER LOOP VECTORIZED\n");
8511 dump_printf (MSG_NOTE, "\n");
8513 else
8515 dump_printf_loc (MSG_NOTE, vect_location,
8516 "LOOP EPILOGUE VECTORIZED (VS=");
8517 dump_dec (MSG_NOTE, current_vector_size);
8518 dump_printf (MSG_NOTE, ")\n");
8522 /* Free SLP instances here because otherwise stmt reference counting
8523 won't work. */
8524 slp_instance instance;
8525 FOR_EACH_VEC_ELT (LOOP_VINFO_SLP_INSTANCES (loop_vinfo), i, instance)
8526 vect_free_slp_instance (instance, true);
8527 LOOP_VINFO_SLP_INSTANCES (loop_vinfo).release ();
8528 /* Clear the safelen field since its value is invalid after vectorization,
8529 as the vectorized loop can have loop-carried dependencies. */
8530 loop->safelen = 0;
8532 /* Don't vectorize an epilogue of an epilogue loop. */
8533 if (LOOP_VINFO_EPILOGUE_P (loop_vinfo))
8534 epilogue = NULL;
8536 if (!PARAM_VALUE (PARAM_VECT_EPILOGUES_NOMASK))
8537 epilogue = NULL;
8539 if (epilogue)
8541 auto_vector_sizes vector_sizes;
8542 targetm.vectorize.autovectorize_vector_sizes (&vector_sizes);
8543 unsigned int next_size = 0;
8545 if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
8546 && LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) >= 0
8547 && known_eq (vf, lowest_vf))
8549 unsigned int eiters
8550 = (LOOP_VINFO_INT_NITERS (loop_vinfo)
8551 - LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo));
8552 eiters = eiters % lowest_vf;
8553 epilogue->nb_iterations_upper_bound = eiters - 1;
8555 unsigned int ratio;
8556 while (next_size < vector_sizes.length ()
8557 && !(constant_multiple_p (current_vector_size,
8558 vector_sizes[next_size], &ratio)
8559 && eiters >= lowest_vf / ratio))
8560 next_size += 1;
8562 else
8563 while (next_size < vector_sizes.length ()
8564 && maybe_lt (current_vector_size, vector_sizes[next_size]))
8565 next_size += 1;
8567 if (next_size == vector_sizes.length ())
8568 epilogue = NULL;
8571 if (epilogue)
8573 epilogue->force_vectorize = loop->force_vectorize;
8574 epilogue->safelen = loop->safelen;
8575 epilogue->dont_vectorize = false;
8577 /* We may need to if-convert epilogue to vectorize it. */
8578 if (LOOP_VINFO_SCALAR_LOOP (loop_vinfo))
8579 tree_if_conversion (epilogue);
8582 return epilogue;
8585 /* The code below attempts a simple optimization - reverting
8586 if-conversion for masked stores: if the mask of a store is zero,
8587 do not perform it, and skip the producers of the stored values as well, if possible.
8588 For example,
8589 for (i=0; i<n; i++)
8590 if (c[i])
8592 p1[i] += 1;
8593 p2[i] = p3[i] +2;
8595 this transformation will produce the following semi-hammock:
8597 if (!mask__ifc__42.18_165 == { 0, 0, 0, 0, 0, 0, 0, 0 })
8599 vect__11.19_170 = MASK_LOAD (vectp_p1.20_168, 0B, mask__ifc__42.18_165);
8600 vect__12.22_172 = vect__11.19_170 + vect_cst__171;
8601 MASK_STORE (vectp_p1.23_175, 0B, mask__ifc__42.18_165, vect__12.22_172);
8602 vect__18.25_182 = MASK_LOAD (vectp_p3.26_180, 0B, mask__ifc__42.18_165);
8603 vect__19.28_184 = vect__18.25_182 + vect_cst__183;
8604 MASK_STORE (vectp_p2.29_187, 0B, mask__ifc__42.18_165, vect__19.28_184);
8608 void
8609 optimize_mask_stores (struct loop *loop)
8611 basic_block *bbs = get_loop_body (loop);
8612 unsigned nbbs = loop->num_nodes;
8613 unsigned i;
8614 basic_block bb;
8615 struct loop *bb_loop;
8616 gimple_stmt_iterator gsi;
8617 gimple *stmt;
8618 auto_vec<gimple *> worklist;
8620 vect_location = find_loop_location (loop);
8621 /* Pick up all masked stores in loop if any. */
8622 for (i = 0; i < nbbs; i++)
8624 bb = bbs[i];
8625 for (gsi = gsi_start_bb (bb); !gsi_end_p (gsi);
8626 gsi_next (&gsi))
8628 stmt = gsi_stmt (gsi);
8629 if (gimple_call_internal_p (stmt, IFN_MASK_STORE))
8630 worklist.safe_push (stmt);
8634 free (bbs);
8635 if (worklist.is_empty ())
8636 return;
8638 /* Loop has masked stores. */
8639 while (!worklist.is_empty ())
8641 gimple *last, *last_store;
8642 edge e, efalse;
8643 tree mask;
8644 basic_block store_bb, join_bb;
8645 gimple_stmt_iterator gsi_to;
8646 tree vdef, new_vdef;
8647 gphi *phi;
8648 tree vectype;
8649 tree zero;
8651 last = worklist.pop ();
8652 mask = gimple_call_arg (last, 2);
8653 bb = gimple_bb (last);
8654 /* Create then_bb and an if-then structure in the CFG; then_bb belongs to
8655 the same loop as if_bb. It can be different from LOOP when a
8656 two-level loop nest is vectorized and the mask_store belongs to the
8657 inner one. */
8658 e = split_block (bb, last);
8659 bb_loop = bb->loop_father;
8660 gcc_assert (loop == bb_loop || flow_loop_nested_p (loop, bb_loop));
8661 join_bb = e->dest;
8662 store_bb = create_empty_bb (bb);
8663 add_bb_to_loop (store_bb, bb_loop);
8664 e->flags = EDGE_TRUE_VALUE;
8665 efalse = make_edge (bb, store_bb, EDGE_FALSE_VALUE);
8666 /* Put STORE_BB on the unlikely path of the new branch.  */
8667 efalse->probability = profile_probability::unlikely ();
8668 store_bb->count = efalse->count ();
8669 make_single_succ_edge (store_bb, join_bb, EDGE_FALLTHRU);
8670 if (dom_info_available_p (CDI_DOMINATORS))
8671 set_immediate_dominator (CDI_DOMINATORS, store_bb, bb);
8672 if (dump_enabled_p ())
8673 dump_printf_loc (MSG_NOTE, vect_location,
8674 "Create new block %d to sink mask stores.",
8675 store_bb->index);
8676 /* Create vector comparison with boolean result. */
8677 vectype = TREE_TYPE (mask);
8678 zero = build_zero_cst (vectype);
8679 stmt = gimple_build_cond (EQ_EXPR, mask, zero, NULL_TREE, NULL_TREE);
8680 gsi = gsi_last_bb (bb);
8681 gsi_insert_after (&gsi, stmt, GSI_SAME_STMT);
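      /* The CFG around the masked store now looks like this (a sketch;
	 BB ends in the comparison just inserted):

	   BB: if (mask == { 0, ..., 0 })
	     | true edge (likely) ------------------> JOIN_BB
	     | false edge (unlikely) -> STORE_BB ---> JOIN_BB

	 The masked store and, where possible, the producers of its stored
	 value are sunk into STORE_BB below.  */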
8682 /* Create a new PHI node for the vdef of the last masked store:
8683    .MEM_2 = VDEF <.MEM_1>
8684    will be converted to
8685    .MEM_3 = VDEF <.MEM_1>
8686    and a new PHI node will be created in the join bb:
8687    .MEM_2 = PHI <.MEM_1, .MEM_3>
8689 vdef = gimple_vdef (last);
8690 new_vdef = make_ssa_name (gimple_vop (cfun), last);
8691 gimple_set_vdef (last, new_vdef);
8692 phi = create_phi_node (vdef, join_bb);
8693 add_phi_arg (phi, new_vdef, EDGE_SUCC (store_bb, 0), UNKNOWN_LOCATION);
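      /* For the example in the comment before this function, the loop below
	 ends up sinking into STORE_BB, in their original order, the MASK_LOAD
	 of p1, its add, the MASK_STORE to p1, the MASK_LOAD of p3, its add
	 and the MASK_STORE to p2, since both stores use the same mask and the
	 intermediate values have no uses outside STORE_BB (a sketch of the
	 expected outcome, not a verbatim dump).  */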
8695 /* Put all masked stores with the same mask into STORE_BB if possible.  */
8696 while (true)
8698 gimple_stmt_iterator gsi_from;
8699 gimple *stmt1 = NULL;
8701 /* Move masked store to STORE_BB. */
8702 last_store = last;
8703 gsi = gsi_for_stmt (last);
8704 gsi_from = gsi;
8705 /* Shift GSI to the previous stmt for further traversal. */
8706 gsi_prev (&gsi);
8707 gsi_to = gsi_start_bb (store_bb);
8708 gsi_move_before (&gsi_from, &gsi_to);
8709 /* Reset GSI_TO to the start of STORE_BB, which is no longer empty.  */
8710 gsi_to = gsi_start_bb (store_bb);
8711 if (dump_enabled_p ())
8712 dump_printf_loc (MSG_NOTE, vect_location,
8713 "Move stmt to created bb\n%G", last);
8714 /* Move all stored value producers if possible. */
8715 while (!gsi_end_p (gsi))
8717 tree lhs;
8718 imm_use_iterator imm_iter;
8719 use_operand_p use_p;
8720 bool res;
8722 /* Skip debug statements. */
8723 if (is_gimple_debug (gsi_stmt (gsi)))
8725 gsi_prev (&gsi);
8726 continue;
8728 stmt1 = gsi_stmt (gsi);
8729 /* Do not consider statements that write to memory or have a
8730    volatile operand.  */
8731 if (gimple_vdef (stmt1)
8732 || gimple_has_volatile_ops (stmt1))
8733 break;
8734 gsi_from = gsi;
8735 gsi_prev (&gsi);
8736 lhs = gimple_get_lhs (stmt1);
8737 if (!lhs)
8738 break;
8740 /* LHS of vectorized stmt must be SSA_NAME. */
8741 if (TREE_CODE (lhs) != SSA_NAME)
8742 break;
8744 if (!VECTOR_TYPE_P (TREE_TYPE (lhs)))
8746 /* Remove dead scalar statement. */
8747 if (has_zero_uses (lhs))
8749 gsi_remove (&gsi_from, true);
8750 continue;
8754 /* Check that LHS does not have uses outside of STORE_BB. */
8755 res = true;
8756 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, lhs)
8758 gimple *use_stmt;
8759 use_stmt = USE_STMT (use_p);
8760 if (is_gimple_debug (use_stmt))
8761 continue;
8762 if (gimple_bb (use_stmt) != store_bb)
8764 res = false;
8765 break;
8768 if (!res)
8769 break;
8771 if (gimple_vuse (stmt1)
8772 && gimple_vuse (stmt1) != gimple_vuse (last_store))
8773 break;
8775 /* Can move STMT1 to STORE_BB. */
8776 if (dump_enabled_p ())
8777 dump_printf_loc (MSG_NOTE, vect_location,
8778 "Move stmt to created bb\n%G", stmt1);
8779 gsi_move_before (&gsi_from, &gsi_to);
8780 /* Shift GSI_TO to the stmt just moved, so that earlier producers are inserted before it and keep their original order.  */
8781 gsi_prev (&gsi_to);
8783 /* Put other masked stores with the same mask into STORE_BB, provided the next one on the worklist is the stmt the walk above stopped at.  */
8784 if (worklist.is_empty ()
8785 || gimple_call_arg (worklist.last (), 2) != mask
8786 || worklist.last () != stmt1)
8787 break;
8788 last = worklist.pop ();
8790 add_phi_arg (phi, gimple_vuse (last_store), e, UNKNOWN_LOCATION);