gcc/tree-vect-loop.c
1 /* Loop Vectorization
2 Copyright (C) 2003-2019 Free Software Foundation, Inc.
3 Contributed by Dorit Naishlos <dorit@il.ibm.com> and
4 Ira Rosen <irar@il.ibm.com>
6 This file is part of GCC.
8 GCC is free software; you can redistribute it and/or modify it under
9 the terms of the GNU General Public License as published by the Free
10 Software Foundation; either version 3, or (at your option) any later
11 version.
13 GCC is distributed in the hope that it will be useful, but WITHOUT ANY
14 WARRANTY; without even the implied warranty of MERCHANTABILITY or
15 FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
16 for more details.
18 You should have received a copy of the GNU General Public License
19 along with GCC; see the file COPYING3. If not see
20 <http://www.gnu.org/licenses/>. */
22 #include "config.h"
23 #include "system.h"
24 #include "coretypes.h"
25 #include "backend.h"
26 #include "target.h"
27 #include "rtl.h"
28 #include "tree.h"
29 #include "gimple.h"
30 #include "cfghooks.h"
31 #include "tree-pass.h"
32 #include "ssa.h"
33 #include "optabs-tree.h"
34 #include "diagnostic-core.h"
35 #include "fold-const.h"
36 #include "stor-layout.h"
37 #include "cfganal.h"
38 #include "gimplify.h"
39 #include "gimple-iterator.h"
40 #include "gimplify-me.h"
41 #include "tree-ssa-loop-ivopts.h"
42 #include "tree-ssa-loop-manip.h"
43 #include "tree-ssa-loop-niter.h"
44 #include "tree-ssa-loop.h"
45 #include "cfgloop.h"
46 #include "params.h"
47 #include "tree-scalar-evolution.h"
48 #include "tree-vectorizer.h"
49 #include "gimple-fold.h"
50 #include "cgraph.h"
51 #include "tree-cfg.h"
52 #include "tree-if-conv.h"
53 #include "internal-fn.h"
54 #include "tree-vector-builder.h"
55 #include "vec-perm-indices.h"
56 #include "tree-eh.h"
58 /* Loop Vectorization Pass.
60 This pass tries to vectorize loops.
62 For example, the vectorizer transforms the following simple loop:
64 short a[N]; short b[N]; short c[N]; int i;
66 for (i=0; i<N; i++){
67 a[i] = b[i] + c[i];
 70 as if it were manually vectorized by rewriting the source code into:
72 typedef int __attribute__((mode(V8HI))) v8hi;
73 short a[N]; short b[N]; short c[N]; int i;
74 v8hi *pa = (v8hi*)a, *pb = (v8hi*)b, *pc = (v8hi*)c;
75 v8hi va, vb, vc;
77 for (i=0; i<N/8; i++){
78 vb = pb[i];
79 vc = pc[i];
80 va = vb + vc;
81 pa[i] = va;
84 The main entry to this pass is vectorize_loops(), in which
85 the vectorizer applies a set of analyses on a given set of loops,
86 followed by the actual vectorization transformation for the loops that
87 had successfully passed the analysis phase.
88 Throughout this pass we make a distinction between two types of
89 data: scalars (which are represented by SSA_NAMES), and memory references
90 ("data-refs"). These two types of data require different handling both
91 during analysis and transformation. The types of data-refs that the
 92 vectorizer currently supports are ARRAY_REFS whose base is an array DECL
93 (not a pointer), and INDIRECT_REFS through pointers; both array and pointer
94 accesses are required to have a simple (consecutive) access pattern.
96 Analysis phase:
97 ===============
98 The driver for the analysis phase is vect_analyze_loop().
99 It applies a set of analyses, some of which rely on the scalar evolution
100 analyzer (scev) developed by Sebastian Pop.
102 During the analysis phase the vectorizer records some information
103 per stmt in a "stmt_vec_info" struct which is attached to each stmt in the
104 loop, as well as general information about the loop as a whole, which is
105 recorded in a "loop_vec_info" struct attached to each loop.
107 Transformation phase:
108 =====================
109 The loop transformation phase scans all the stmts in the loop, and
110 creates a vector stmt (or a sequence of stmts) for each scalar stmt S in
111 the loop that needs to be vectorized. It inserts the vector code sequence
112 just before the scalar stmt S, and records a pointer to the vector code
113 in STMT_VINFO_VEC_STMT (stmt_info) (stmt_info is the stmt_vec_info struct
114 attached to S). This pointer will be used for the vectorization of following
115 stmts which use the def of stmt S. Stmt S is removed if it writes to memory;
116 otherwise, we rely on dead code elimination for removing it.
118 For example, say stmt S1 was vectorized into stmt VS1:
120 VS1: vb = px[i];
121 S1: b = x[i]; STMT_VINFO_VEC_STMT (stmt_info (S1)) = VS1
122 S2: a = b;
124 To vectorize stmt S2, the vectorizer first finds the stmt that defines
125 the operand 'b' (S1), and gets the relevant vector def 'vb' from the
126 vector stmt VS1 pointed to by STMT_VINFO_VEC_STMT (stmt_info (S1)). The
127 resulting sequence would be:
129 VS1: vb = px[i];
130 S1: b = x[i]; STMT_VINFO_VEC_STMT (stmt_info (S1)) = VS1
131 VS2: va = vb;
132 S2: a = b; STMT_VINFO_VEC_STMT (stmt_info (S2)) = VS2
 134 Operands that are not SSA_NAMEs are data-refs that appear in
135 load/store operations (like 'x[i]' in S1), and are handled differently.
137 Target modeling:
138 =================
139 Currently the only target specific information that is used is the
140 size of the vector (in bytes) - "TARGET_VECTORIZE_UNITS_PER_SIMD_WORD".
 141 Targets that can support different sizes of vectors will, for now, need
 142 to specify one value for "TARGET_VECTORIZE_UNITS_PER_SIMD_WORD". More
143 flexibility will be added in the future.
 145 Since we only vectorize operations whose vector form can be
146 expressed using existing tree codes, to verify that an operation is
147 supported, the vectorizer checks the relevant optab at the relevant
 148 machine_mode (e.g., optab_handler (add_optab, V8HImode)). If
149 the value found is CODE_FOR_nothing, then there's no target support, and
150 we can't vectorize the stmt.
152 For additional information on this project see:
153 http://gcc.gnu.org/projects/tree-ssa/vectorization.html
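/* As a concrete, self-contained illustration of the transformation described
   above (an assumed example using the GNU vector_size extension rather than
   the mode attribute; N is taken to be a multiple of 8):

     #define N 1024
     typedef short v8hi __attribute__ ((vector_size (16)));    8 x 16-bit lanes
     short a[N], b[N], c[N];

     void scalar_add (void)
     {
       for (int i = 0; i < N; i++)
         a[i] = b[i] + c[i];
     }

     void manual_vector_add (void)
     {
       v8hi *pa = (v8hi *) a, *pb = (v8hi *) b, *pc = (v8hi *) c;
       for (int i = 0; i < N / 8; i++)
         pa[i] = pb[i] + pc[i];                 8 short additions per iteration
     }

   The vectorizer performs the same rewrite on GIMPLE, using the analyses and
   transformations implemented in this file.  */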
156 static void vect_estimate_min_profitable_iters (loop_vec_info, int *, int *);
158 /* Subroutine of vect_determine_vf_for_stmt that handles only one
159 statement. VECTYPE_MAYBE_SET_P is true if STMT_VINFO_VECTYPE
160 may already be set for general statements (not just data refs). */
162 static opt_result
163 vect_determine_vf_for_stmt_1 (stmt_vec_info stmt_info,
164 bool vectype_maybe_set_p,
165 poly_uint64 *vf,
 166 vec<stmt_vec_info> *mask_producers)
168 gimple *stmt = stmt_info->stmt;
170 if ((!STMT_VINFO_RELEVANT_P (stmt_info)
171 && !STMT_VINFO_LIVE_P (stmt_info))
172 || gimple_clobber_p (stmt))
174 if (dump_enabled_p ())
175 dump_printf_loc (MSG_NOTE, vect_location, "skip.\n");
176 return opt_result::success ();
179 tree stmt_vectype, nunits_vectype;
180 opt_result res = vect_get_vector_types_for_stmt (stmt_info, &stmt_vectype,
181 &nunits_vectype);
182 if (!res)
183 return res;
185 if (stmt_vectype)
187 if (STMT_VINFO_VECTYPE (stmt_info))
188 /* The only case when a vectype had been already set is for stmts
189 that contain a data ref, or for "pattern-stmts" (stmts generated
190 by the vectorizer to represent/replace a certain idiom). */
191 gcc_assert ((STMT_VINFO_DATA_REF (stmt_info)
192 || vectype_maybe_set_p)
193 && STMT_VINFO_VECTYPE (stmt_info) == stmt_vectype);
194 else if (stmt_vectype == boolean_type_node)
195 mask_producers->safe_push (stmt_info);
196 else
197 STMT_VINFO_VECTYPE (stmt_info) = stmt_vectype;
200 if (nunits_vectype)
201 vect_update_max_nunits (vf, nunits_vectype);
203 return opt_result::success ();
206 /* Subroutine of vect_determine_vectorization_factor. Set the vector
207 types of STMT_INFO and all attached pattern statements and update
208 the vectorization factor VF accordingly. If some of the statements
209 produce a mask result whose vector type can only be calculated later,
210 add them to MASK_PRODUCERS. Return true on success or false if
211 something prevented vectorization. */
213 static opt_result
214 vect_determine_vf_for_stmt (stmt_vec_info stmt_info, poly_uint64 *vf,
 215 vec<stmt_vec_info> *mask_producers)
217 vec_info *vinfo = stmt_info->vinfo;
218 if (dump_enabled_p ())
219 dump_printf_loc (MSG_NOTE, vect_location, "==> examining statement: %G",
220 stmt_info->stmt);
221 opt_result res
222 = vect_determine_vf_for_stmt_1 (stmt_info, false, vf, mask_producers);
223 if (!res)
224 return res;
226 if (STMT_VINFO_IN_PATTERN_P (stmt_info)
227 && STMT_VINFO_RELATED_STMT (stmt_info))
229 gimple *pattern_def_seq = STMT_VINFO_PATTERN_DEF_SEQ (stmt_info);
230 stmt_info = STMT_VINFO_RELATED_STMT (stmt_info);
232 /* If a pattern statement has def stmts, analyze them too. */
233 for (gimple_stmt_iterator si = gsi_start (pattern_def_seq);
234 !gsi_end_p (si); gsi_next (&si))
236 stmt_vec_info def_stmt_info = vinfo->lookup_stmt (gsi_stmt (si));
237 if (dump_enabled_p ())
238 dump_printf_loc (MSG_NOTE, vect_location,
239 "==> examining pattern def stmt: %G",
240 def_stmt_info->stmt);
 241 res = vect_determine_vf_for_stmt_1 (def_stmt_info, true,
 242 vf, mask_producers);
245 if (!res)
246 return res;
249 if (dump_enabled_p ())
250 dump_printf_loc (MSG_NOTE, vect_location,
251 "==> examining pattern statement: %G",
252 stmt_info->stmt);
253 res = vect_determine_vf_for_stmt_1 (stmt_info, true, vf, mask_producers);
254 if (!res)
255 return res;
258 return opt_result::success ();
261 /* Function vect_determine_vectorization_factor
263 Determine the vectorization factor (VF). VF is the number of data elements
264 that are operated upon in parallel in a single iteration of the vectorized
 265 loop. For example, when vectorizing a loop that operates on 4-byte elements,
 266 on a target with a vector size (VS) of 16 bytes, the VF is set to 4, since 4
267 elements can fit in a single vector register.
269 We currently support vectorization of loops in which all types operated upon
270 are of the same size. Therefore this function currently sets VF according to
271 the size of the types operated upon, and fails if there are multiple sizes
272 in the loop.
274 VF is also the factor by which the loop iterations are strip-mined, e.g.:
275 original loop:
276 for (i=0; i<N; i++){
277 a[i] = b[i] + c[i];
280 vectorized loop:
281 for (i=0; i<N; i+=VF){
282 a[i:VF] = b[i:VF] + c[i:VF];
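/* A worked instance of the above (illustrative numbers): with 4-byte ints and
   a 16-byte vector size, VF = 16 / 4 = 4 elements per vector, and the
   strip-mined loop processes 4 scalar iterations per vector iteration:

     for (i = 0; i < (N / 4) * 4; i += 4)
       a[i:4] = b[i:4] + c[i:4];        4 elements at a time, same pseudo-notation
     for (; i < N; i++)                 scalar epilogue for the remaining N % 4
       a[i] = b[i] + c[i];

   The epilogue (or an alternative such as peeling or masking) is produced by
   later parts of the vectorizer, not by this function.  */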
286 static opt_result
287 vect_determine_vectorization_factor (loop_vec_info loop_vinfo)
289 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
290 basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
291 unsigned nbbs = loop->num_nodes;
292 poly_uint64 vectorization_factor = 1;
293 tree scalar_type = NULL_TREE;
294 gphi *phi;
295 tree vectype;
296 stmt_vec_info stmt_info;
297 unsigned i;
298 auto_vec<stmt_vec_info> mask_producers;
300 DUMP_VECT_SCOPE ("vect_determine_vectorization_factor");
302 for (i = 0; i < nbbs; i++)
304 basic_block bb = bbs[i];
306 for (gphi_iterator si = gsi_start_phis (bb); !gsi_end_p (si);
307 gsi_next (&si))
309 phi = si.phi ();
310 stmt_info = loop_vinfo->lookup_stmt (phi);
311 if (dump_enabled_p ())
312 dump_printf_loc (MSG_NOTE, vect_location, "==> examining phi: %G",
313 phi);
315 gcc_assert (stmt_info);
317 if (STMT_VINFO_RELEVANT_P (stmt_info)
318 || STMT_VINFO_LIVE_P (stmt_info))
320 gcc_assert (!STMT_VINFO_VECTYPE (stmt_info));
321 scalar_type = TREE_TYPE (PHI_RESULT (phi));
323 if (dump_enabled_p ())
324 dump_printf_loc (MSG_NOTE, vect_location,
325 "get vectype for scalar type: %T\n",
326 scalar_type);
328 vectype = get_vectype_for_scalar_type (scalar_type);
329 if (!vectype)
330 return opt_result::failure_at (phi,
331 "not vectorized: unsupported "
332 "data-type %T\n",
333 scalar_type);
334 STMT_VINFO_VECTYPE (stmt_info) = vectype;
336 if (dump_enabled_p ())
337 dump_printf_loc (MSG_NOTE, vect_location, "vectype: %T\n",
338 vectype);
340 if (dump_enabled_p ())
342 dump_printf_loc (MSG_NOTE, vect_location, "nunits = ");
343 dump_dec (MSG_NOTE, TYPE_VECTOR_SUBPARTS (vectype));
344 dump_printf (MSG_NOTE, "\n");
347 vect_update_max_nunits (&vectorization_factor, vectype);
351 for (gimple_stmt_iterator si = gsi_start_bb (bb); !gsi_end_p (si);
352 gsi_next (&si))
354 stmt_info = loop_vinfo->lookup_stmt (gsi_stmt (si));
355 opt_result res
356 = vect_determine_vf_for_stmt (stmt_info, &vectorization_factor,
357 &mask_producers);
358 if (!res)
359 return res;
363 /* TODO: Analyze cost. Decide if worth while to vectorize. */
364 if (dump_enabled_p ())
366 dump_printf_loc (MSG_NOTE, vect_location, "vectorization factor = ");
367 dump_dec (MSG_NOTE, vectorization_factor);
368 dump_printf (MSG_NOTE, "\n");
371 if (known_le (vectorization_factor, 1U))
372 return opt_result::failure_at (vect_location,
373 "not vectorized: unsupported data-type\n");
374 LOOP_VINFO_VECT_FACTOR (loop_vinfo) = vectorization_factor;
376 for (i = 0; i < mask_producers.length (); i++)
378 stmt_info = mask_producers[i];
379 opt_tree mask_type = vect_get_mask_type_for_stmt (stmt_info);
380 if (!mask_type)
381 return opt_result::propagate_failure (mask_type);
382 STMT_VINFO_VECTYPE (stmt_info) = mask_type;
385 return opt_result::success ();
389 /* Function vect_is_simple_iv_evolution.
 391 FORNOW: A simple evolution of an induction variable in the loop is
392 considered a polynomial evolution. */
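/* Illustration (assumed example): for a pointer induction such as

     for (i = 0; i < n; i++)
       p = p + 4;

   the access function returned by the scalar evolution analyzer is the
   polynomial chrec {p_0, +, 4}_L (L being the loop number), so
   initial_condition_in_loop_num yields p_0 (reported through *INIT below)
   and evolution_part_in_loop_num yields 4 (reported through *STEP).
   A step that is itself a chrec, or that is defined inside the loop, makes
   the evolution "not simple" and this function returns false.  */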
394 static bool
395 vect_is_simple_iv_evolution (unsigned loop_nb, tree access_fn, tree * init,
396 tree * step)
398 tree init_expr;
399 tree step_expr;
400 tree evolution_part = evolution_part_in_loop_num (access_fn, loop_nb);
401 basic_block bb;
403 /* When there is no evolution in this loop, the evolution function
404 is not "simple". */
405 if (evolution_part == NULL_TREE)
406 return false;
408 /* When the evolution is a polynomial of degree >= 2
409 the evolution function is not "simple". */
410 if (tree_is_chrec (evolution_part))
411 return false;
413 step_expr = evolution_part;
414 init_expr = unshare_expr (initial_condition_in_loop_num (access_fn, loop_nb));
416 if (dump_enabled_p ())
417 dump_printf_loc (MSG_NOTE, vect_location, "step: %T, init: %T\n",
418 step_expr, init_expr);
420 *init = init_expr;
421 *step = step_expr;
423 if (TREE_CODE (step_expr) != INTEGER_CST
424 && (TREE_CODE (step_expr) != SSA_NAME
425 || ((bb = gimple_bb (SSA_NAME_DEF_STMT (step_expr)))
426 && flow_bb_inside_loop_p (get_loop (cfun, loop_nb), bb))
427 || (!INTEGRAL_TYPE_P (TREE_TYPE (step_expr))
428 && (!SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr))
429 || !flag_associative_math)))
430 && (TREE_CODE (step_expr) != REAL_CST
431 || !flag_associative_math))
433 if (dump_enabled_p ())
434 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
435 "step unknown.\n");
436 return false;
439 return true;
442 /* Return true if PHI, described by STMT_INFO, is the inner PHI in
443 what we are assuming is a double reduction. For example, given
444 a structure like this:
446 outer1:
447 x_1 = PHI <x_4(outer2), ...>;
450 inner:
451 x_2 = PHI <x_1(outer1), ...>;
453 x_3 = ...;
456 outer2:
457 x_4 = PHI <x_3(inner)>;
460 outer loop analysis would treat x_1 as a double reduction phi and
461 this function would then return true for x_2. */
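/* A source-level shape that can give rise to the PHI structure above, when
   the outer loop is the one being analyzed (illustrative only):

     int s = 0;
     for (j = 0; j < M; j++)      outer1: x_1 = PHI <x_4(outer2), 0>
       for (i = 0; i < N; i++)    inner:  x_2 = PHI <x_1(outer1), x_3(inner)>
         s += a[j][i];                    x_3 = x_2 + a[j][i];
                                  outer2: x_4 = PHI <x_3(inner)>

   Outer-loop analysis classifies x_1 as a double reduction; this predicate
   then recognizes x_2 as the corresponding inner PHI.  */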
463 static bool
464 vect_inner_phi_in_double_reduction_p (stmt_vec_info stmt_info, gphi *phi)
466 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
467 use_operand_p use_p;
468 ssa_op_iter op_iter;
469 FOR_EACH_PHI_ARG (use_p, phi, op_iter, SSA_OP_USE)
470 if (stmt_vec_info def_info = loop_vinfo->lookup_def (USE_FROM_PTR (use_p)))
471 if (STMT_VINFO_DEF_TYPE (def_info) == vect_double_reduction_def)
472 return true;
473 return false;
476 /* Function vect_analyze_scalar_cycles_1.
478 Examine the cross iteration def-use cycles of scalar variables
479 in LOOP. LOOP_VINFO represents the loop that is now being
480 considered for vectorization (can be LOOP, or an outer-loop
481 enclosing LOOP). */
483 static void
484 vect_analyze_scalar_cycles_1 (loop_vec_info loop_vinfo, struct loop *loop)
486 basic_block bb = loop->header;
487 tree init, step;
488 auto_vec<stmt_vec_info, 64> worklist;
489 gphi_iterator gsi;
490 bool double_reduc;
492 DUMP_VECT_SCOPE ("vect_analyze_scalar_cycles");
494 /* First - identify all inductions. Reduction detection assumes that all the
495 inductions have been identified, therefore, this order must not be
496 changed. */
497 for (gsi = gsi_start_phis (bb); !gsi_end_p (gsi); gsi_next (&gsi))
499 gphi *phi = gsi.phi ();
500 tree access_fn = NULL;
501 tree def = PHI_RESULT (phi);
502 stmt_vec_info stmt_vinfo = loop_vinfo->lookup_stmt (phi);
504 if (dump_enabled_p ())
505 dump_printf_loc (MSG_NOTE, vect_location, "Analyze phi: %G", phi);
507 /* Skip virtual phi's. The data dependences that are associated with
508 virtual defs/uses (i.e., memory accesses) are analyzed elsewhere. */
509 if (virtual_operand_p (def))
510 continue;
512 STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_unknown_def_type;
514 /* Analyze the evolution function. */
515 access_fn = analyze_scalar_evolution (loop, def);
516 if (access_fn)
518 STRIP_NOPS (access_fn);
519 if (dump_enabled_p ())
520 dump_printf_loc (MSG_NOTE, vect_location,
521 "Access function of PHI: %T\n", access_fn);
522 STMT_VINFO_LOOP_PHI_EVOLUTION_BASE_UNCHANGED (stmt_vinfo)
523 = initial_condition_in_loop_num (access_fn, loop->num);
524 STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_vinfo)
525 = evolution_part_in_loop_num (access_fn, loop->num);
528 if (!access_fn
529 || vect_inner_phi_in_double_reduction_p (stmt_vinfo, phi)
530 || !vect_is_simple_iv_evolution (loop->num, access_fn, &init, &step)
531 || (LOOP_VINFO_LOOP (loop_vinfo) != loop
532 && TREE_CODE (step) != INTEGER_CST))
534 worklist.safe_push (stmt_vinfo);
535 continue;
538 gcc_assert (STMT_VINFO_LOOP_PHI_EVOLUTION_BASE_UNCHANGED (stmt_vinfo)
539 != NULL_TREE);
540 gcc_assert (STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_vinfo) != NULL_TREE);
542 if (dump_enabled_p ())
543 dump_printf_loc (MSG_NOTE, vect_location, "Detected induction.\n");
544 STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_induction_def;
548 /* Second - identify all reductions and nested cycles. */
549 while (worklist.length () > 0)
551 stmt_vec_info stmt_vinfo = worklist.pop ();
552 gphi *phi = as_a <gphi *> (stmt_vinfo->stmt);
553 tree def = PHI_RESULT (phi);
555 if (dump_enabled_p ())
556 dump_printf_loc (MSG_NOTE, vect_location, "Analyze phi: %G", phi);
558 gcc_assert (!virtual_operand_p (def)
559 && STMT_VINFO_DEF_TYPE (stmt_vinfo) == vect_unknown_def_type);
561 stmt_vec_info reduc_stmt_info
562 = vect_force_simple_reduction (loop_vinfo, stmt_vinfo,
563 &double_reduc, false);
564 if (reduc_stmt_info)
566 if (double_reduc)
568 if (dump_enabled_p ())
569 dump_printf_loc (MSG_NOTE, vect_location,
570 "Detected double reduction.\n");
572 STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_double_reduction_def;
573 STMT_VINFO_DEF_TYPE (reduc_stmt_info)
574 = vect_double_reduction_def;
576 else
578 if (loop != LOOP_VINFO_LOOP (loop_vinfo))
580 if (dump_enabled_p ())
581 dump_printf_loc (MSG_NOTE, vect_location,
582 "Detected vectorizable nested cycle.\n");
584 STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_nested_cycle;
585 STMT_VINFO_DEF_TYPE (reduc_stmt_info) = vect_nested_cycle;
587 else
589 if (dump_enabled_p ())
590 dump_printf_loc (MSG_NOTE, vect_location,
591 "Detected reduction.\n");
593 STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_reduction_def;
594 STMT_VINFO_DEF_TYPE (reduc_stmt_info) = vect_reduction_def;
595 /* Store the reduction cycles for possible vectorization in
596 loop-aware SLP if it was not detected as reduction
597 chain. */
598 if (! REDUC_GROUP_FIRST_ELEMENT (reduc_stmt_info))
599 LOOP_VINFO_REDUCTIONS (loop_vinfo).safe_push
600 (reduc_stmt_info);
604 else
605 if (dump_enabled_p ())
606 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
607 "Unknown def-use cycle pattern.\n");
612 /* Function vect_analyze_scalar_cycles.
614 Examine the cross iteration def-use cycles of scalar variables, by
615 analyzing the loop-header PHIs of scalar variables. Classify each
616 cycle as one of the following: invariant, induction, reduction, unknown.
 617 We do that for the loop represented by LOOP_VINFO, and also for its
 618 inner-loop, if it exists.
619 Examples for scalar cycles:
621 Example1: reduction:
623 loop1:
624 for (i=0; i<N; i++)
625 sum += a[i];
627 Example2: induction:
629 loop2:
630 for (i=0; i<N; i++)
631 a[i] = i; */
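/* Example3: nested cycle (only relevant when the enclosing outer loop is the
   one being vectorized; illustrative):

   loop3:
   for (j = 0; j < M; j++)
     {
       x = init[j];
       for (i = 0; i < N; i++)
         x = x + a[j][i];          cross-iteration cycle of x in the inner loop
       out[j] = x;
     }

   Because the inner loop stays sequential under outer-loop vectorization,
   vect_analyze_scalar_cycles_1 classifies such a cycle as vect_nested_cycle
   rather than as a reduction of the vectorized loop.  */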
633 static void
634 vect_analyze_scalar_cycles (loop_vec_info loop_vinfo)
636 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
638 vect_analyze_scalar_cycles_1 (loop_vinfo, loop);
640 /* When vectorizing an outer-loop, the inner-loop is executed sequentially.
641 Reductions in such inner-loop therefore have different properties than
642 the reductions in the nest that gets vectorized:
643 1. When vectorized, they are executed in the same order as in the original
644 scalar loop, so we can't change the order of computation when
645 vectorizing them.
646 2. FIXME: Inner-loop reductions can be used in the inner-loop, so the
647 current checks are too strict. */
649 if (loop->inner)
650 vect_analyze_scalar_cycles_1 (loop_vinfo, loop->inner);
653 /* Transfer group and reduction information from STMT_INFO to its
654 pattern stmt. */
656 static void
657 vect_fixup_reduc_chain (stmt_vec_info stmt_info)
659 stmt_vec_info firstp = STMT_VINFO_RELATED_STMT (stmt_info);
660 stmt_vec_info stmtp;
661 gcc_assert (!REDUC_GROUP_FIRST_ELEMENT (firstp)
662 && REDUC_GROUP_FIRST_ELEMENT (stmt_info));
663 REDUC_GROUP_SIZE (firstp) = REDUC_GROUP_SIZE (stmt_info);
666 stmtp = STMT_VINFO_RELATED_STMT (stmt_info);
667 REDUC_GROUP_FIRST_ELEMENT (stmtp) = firstp;
668 stmt_info = REDUC_GROUP_NEXT_ELEMENT (stmt_info);
669 if (stmt_info)
670 REDUC_GROUP_NEXT_ELEMENT (stmtp)
671 = STMT_VINFO_RELATED_STMT (stmt_info);
673 while (stmt_info);
674 STMT_VINFO_DEF_TYPE (stmtp) = vect_reduction_def;
677 /* Fixup scalar cycles that now have their stmts detected as patterns. */
679 static void
680 vect_fixup_scalar_cycles_with_patterns (loop_vec_info loop_vinfo)
682 stmt_vec_info first;
683 unsigned i;
685 FOR_EACH_VEC_ELT (LOOP_VINFO_REDUCTION_CHAINS (loop_vinfo), i, first)
686 if (STMT_VINFO_IN_PATTERN_P (first))
688 stmt_vec_info next = REDUC_GROUP_NEXT_ELEMENT (first);
689 while (next)
691 if (! STMT_VINFO_IN_PATTERN_P (next))
692 break;
693 next = REDUC_GROUP_NEXT_ELEMENT (next);
 695 /* If not all stmts in the chain are patterns, try to handle
 696 the chain without patterns. */
697 if (! next)
699 vect_fixup_reduc_chain (first);
700 LOOP_VINFO_REDUCTION_CHAINS (loop_vinfo)[i]
701 = STMT_VINFO_RELATED_STMT (first);
706 /* Function vect_get_loop_niters.
708 Determine how many iterations the loop is executed and place it
709 in NUMBER_OF_ITERATIONS. Place the number of latch iterations
710 in NUMBER_OF_ITERATIONSM1. Place the condition under which the
711 niter information holds in ASSUMPTIONS.
713 Return the loop exit condition. */
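/* A worked instance (illustrative): for

     for (i = 0; i < n; i++)
       ...

   the latch runs n - 1 times when n > 0, so NUMBER_OF_ITERATIONSM1 is n - 1
   and NUMBER_OF_ITERATIONS is n.  When the niter analysis can only prove the
   count under a premise (for example that the loop body executes at all),
   that premise is either folded into ASSUMPTIONS, on which the loop can later
   be versioned, or expressed through the MAY_BE_ZERO condition handled
   below.  */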
716 static gcond *
717 vect_get_loop_niters (struct loop *loop, tree *assumptions,
718 tree *number_of_iterations, tree *number_of_iterationsm1)
720 edge exit = single_exit (loop);
721 struct tree_niter_desc niter_desc;
722 tree niter_assumptions, niter, may_be_zero;
723 gcond *cond = get_loop_exit_condition (loop);
725 *assumptions = boolean_true_node;
726 *number_of_iterationsm1 = chrec_dont_know;
727 *number_of_iterations = chrec_dont_know;
728 DUMP_VECT_SCOPE ("get_loop_niters");
730 if (!exit)
731 return cond;
733 niter = chrec_dont_know;
734 may_be_zero = NULL_TREE;
735 niter_assumptions = boolean_true_node;
736 if (!number_of_iterations_exit_assumptions (loop, exit, &niter_desc, NULL)
737 || chrec_contains_undetermined (niter_desc.niter))
738 return cond;
740 niter_assumptions = niter_desc.assumptions;
741 may_be_zero = niter_desc.may_be_zero;
742 niter = niter_desc.niter;
744 if (may_be_zero && integer_zerop (may_be_zero))
745 may_be_zero = NULL_TREE;
747 if (may_be_zero)
749 if (COMPARISON_CLASS_P (may_be_zero))
751 /* Try to combine may_be_zero with assumptions, this can simplify
752 computation of niter expression. */
753 if (niter_assumptions && !integer_nonzerop (niter_assumptions))
754 niter_assumptions = fold_build2 (TRUTH_AND_EXPR, boolean_type_node,
755 niter_assumptions,
756 fold_build1 (TRUTH_NOT_EXPR,
757 boolean_type_node,
758 may_be_zero));
759 else
760 niter = fold_build3 (COND_EXPR, TREE_TYPE (niter), may_be_zero,
761 build_int_cst (TREE_TYPE (niter), 0),
762 rewrite_to_non_trapping_overflow (niter));
764 may_be_zero = NULL_TREE;
766 else if (integer_nonzerop (may_be_zero))
768 *number_of_iterationsm1 = build_int_cst (TREE_TYPE (niter), 0);
769 *number_of_iterations = build_int_cst (TREE_TYPE (niter), 1);
770 return cond;
772 else
773 return cond;
776 *assumptions = niter_assumptions;
777 *number_of_iterationsm1 = niter;
779 /* We want the number of loop header executions which is the number
780 of latch executions plus one.
781 ??? For UINT_MAX latch executions this number overflows to zero
782 for loops like do { n++; } while (n != 0); */
783 if (niter && !chrec_contains_undetermined (niter))
784 niter = fold_build2 (PLUS_EXPR, TREE_TYPE (niter), unshare_expr (niter),
785 build_int_cst (TREE_TYPE (niter), 1));
786 *number_of_iterations = niter;
788 return cond;
791 /* Function bb_in_loop_p
793 Used as predicate for dfs order traversal of the loop bbs. */
795 static bool
796 bb_in_loop_p (const_basic_block bb, const void *data)
798 const struct loop *const loop = (const struct loop *)data;
799 if (flow_bb_inside_loop_p (loop, bb))
800 return true;
801 return false;
805 /* Create and initialize a new loop_vec_info struct for LOOP_IN, as well as
806 stmt_vec_info structs for all the stmts in LOOP_IN. */
808 _loop_vec_info::_loop_vec_info (struct loop *loop_in, vec_info_shared *shared)
809 : vec_info (vec_info::loop, init_cost (loop_in), shared),
810 loop (loop_in),
811 bbs (XCNEWVEC (basic_block, loop->num_nodes)),
812 num_itersm1 (NULL_TREE),
813 num_iters (NULL_TREE),
814 num_iters_unchanged (NULL_TREE),
815 num_iters_assumptions (NULL_TREE),
816 th (0),
817 versioning_threshold (0),
818 vectorization_factor (0),
819 max_vectorization_factor (0),
820 mask_skip_niters (NULL_TREE),
821 mask_compare_type (NULL_TREE),
822 simd_if_cond (NULL_TREE),
823 unaligned_dr (NULL),
824 peeling_for_alignment (0),
825 ptr_mask (0),
826 ivexpr_map (NULL),
827 scan_map (NULL),
828 slp_unrolling_factor (1),
829 single_scalar_iteration_cost (0),
830 vectorizable (false),
831 can_fully_mask_p (true),
832 fully_masked_p (false),
833 peeling_for_gaps (false),
834 peeling_for_niter (false),
835 operands_swapped (false),
836 no_data_dependencies (false),
837 has_mask_store (false),
838 scalar_loop (NULL),
839 orig_loop_info (NULL)
841 /* CHECKME: We want to visit all BBs before their successors (except for
842 latch blocks, for which this assertion wouldn't hold). In the simple
 843 case of the loop forms we allow, a dfs order of the BBs would be the same
 844 as a reversed postorder traversal, so we are safe. */
846 unsigned int nbbs = dfs_enumerate_from (loop->header, 0, bb_in_loop_p,
847 bbs, loop->num_nodes, loop);
848 gcc_assert (nbbs == loop->num_nodes);
850 for (unsigned int i = 0; i < nbbs; i++)
852 basic_block bb = bbs[i];
853 gimple_stmt_iterator si;
855 for (si = gsi_start_phis (bb); !gsi_end_p (si); gsi_next (&si))
857 gimple *phi = gsi_stmt (si);
858 gimple_set_uid (phi, 0);
859 add_stmt (phi);
862 for (si = gsi_start_bb (bb); !gsi_end_p (si); gsi_next (&si))
864 gimple *stmt = gsi_stmt (si);
865 gimple_set_uid (stmt, 0);
866 add_stmt (stmt);
 867 /* If the .GOMP_SIMD_LANE call for the current loop has 3 arguments, the
 868 third argument is the #pragma omp simd if (x) condition: when it is 0,
 869 the loop shouldn't be vectorized; when it is a non-zero constant, it
 870 should be vectorized normally; otherwise the loop is versioned, with the
 871 vectorized copy taken if the condition is non-zero at runtime. */
872 if (loop_in->simduid
873 && is_gimple_call (stmt)
874 && gimple_call_internal_p (stmt)
875 && gimple_call_internal_fn (stmt) == IFN_GOMP_SIMD_LANE
876 && gimple_call_num_args (stmt) >= 3
877 && TREE_CODE (gimple_call_arg (stmt, 0)) == SSA_NAME
878 && (loop_in->simduid
879 == SSA_NAME_VAR (gimple_call_arg (stmt, 0))))
881 tree arg = gimple_call_arg (stmt, 2);
882 if (integer_zerop (arg) || TREE_CODE (arg) == SSA_NAME)
883 simd_if_cond = arg;
884 else
885 gcc_assert (integer_nonzerop (arg));
891 /* Free all levels of MASKS. */
893 void
894 release_vec_loop_masks (vec_loop_masks *masks)
896 rgroup_masks *rgm;
897 unsigned int i;
898 FOR_EACH_VEC_ELT (*masks, i, rgm)
899 rgm->masks.release ();
900 masks->release ();
903 /* Free all memory used by the _loop_vec_info, as well as all the
904 stmt_vec_info structs of all the stmts in the loop. */
906 _loop_vec_info::~_loop_vec_info ()
908 int nbbs;
909 gimple_stmt_iterator si;
910 int j;
912 nbbs = loop->num_nodes;
913 for (j = 0; j < nbbs; j++)
915 basic_block bb = bbs[j];
916 for (si = gsi_start_bb (bb); !gsi_end_p (si); )
918 gimple *stmt = gsi_stmt (si);
920 /* We may have broken canonical form by moving a constant
921 into RHS1 of a commutative op. Fix such occurrences. */
922 if (operands_swapped && is_gimple_assign (stmt))
924 enum tree_code code = gimple_assign_rhs_code (stmt);
926 if ((code == PLUS_EXPR
927 || code == POINTER_PLUS_EXPR
928 || code == MULT_EXPR)
929 && CONSTANT_CLASS_P (gimple_assign_rhs1 (stmt)))
930 swap_ssa_operands (stmt,
931 gimple_assign_rhs1_ptr (stmt),
932 gimple_assign_rhs2_ptr (stmt));
933 else if (code == COND_EXPR
934 && CONSTANT_CLASS_P (gimple_assign_rhs2 (stmt)))
936 tree cond_expr = gimple_assign_rhs1 (stmt);
937 enum tree_code cond_code = TREE_CODE (cond_expr);
939 if (TREE_CODE_CLASS (cond_code) == tcc_comparison)
941 bool honor_nans = HONOR_NANS (TREE_OPERAND (cond_expr,
942 0));
943 cond_code = invert_tree_comparison (cond_code,
944 honor_nans);
945 if (cond_code != ERROR_MARK)
947 TREE_SET_CODE (cond_expr, cond_code);
948 swap_ssa_operands (stmt,
949 gimple_assign_rhs2_ptr (stmt),
950 gimple_assign_rhs3_ptr (stmt));
955 gsi_next (&si);
959 free (bbs);
961 release_vec_loop_masks (&masks);
962 delete ivexpr_map;
963 delete scan_map;
965 loop->aux = NULL;
968 /* Return an invariant or register for EXPR and emit necessary
969 computations in the LOOP_VINFO loop preheader. */
971 tree
972 cse_and_gimplify_to_preheader (loop_vec_info loop_vinfo, tree expr)
974 if (is_gimple_reg (expr)
975 || is_gimple_min_invariant (expr))
976 return expr;
978 if (! loop_vinfo->ivexpr_map)
979 loop_vinfo->ivexpr_map = new hash_map<tree_operand_hash, tree>;
980 tree &cached = loop_vinfo->ivexpr_map->get_or_insert (expr);
981 if (! cached)
983 gimple_seq stmts = NULL;
984 cached = force_gimple_operand (unshare_expr (expr),
985 &stmts, true, NULL_TREE);
986 if (stmts)
988 edge e = loop_preheader_edge (LOOP_VINFO_LOOP (loop_vinfo));
989 gsi_insert_seq_on_edge_immediate (e, stmts);
992 return cached;
995 /* Return true if we can use CMP_TYPE as the comparison type to produce
996 all masks required to mask LOOP_VINFO. */
998 static bool
999 can_produce_all_loop_masks_p (loop_vec_info loop_vinfo, tree cmp_type)
1001 rgroup_masks *rgm;
1002 unsigned int i;
1003 FOR_EACH_VEC_ELT (LOOP_VINFO_MASKS (loop_vinfo), i, rgm)
1004 if (rgm->mask_type != NULL_TREE
1005 && !direct_internal_fn_supported_p (IFN_WHILE_ULT,
1006 cmp_type, rgm->mask_type,
1007 OPTIMIZE_FOR_SPEED))
1008 return false;
1009 return true;
1012 /* Calculate the maximum number of scalars per iteration for every
1013 rgroup in LOOP_VINFO. */
1015 static unsigned int
1016 vect_get_max_nscalars_per_iter (loop_vec_info loop_vinfo)
1018 unsigned int res = 1;
1019 unsigned int i;
1020 rgroup_masks *rgm;
1021 FOR_EACH_VEC_ELT (LOOP_VINFO_MASKS (loop_vinfo), i, rgm)
1022 res = MAX (res, rgm->max_nscalars_per_iter);
1023 return res;
1026 /* Each statement in LOOP_VINFO can be masked where necessary. Check
1027 whether we can actually generate the masks required. Return true if so,
1028 storing the type of the scalar IV in LOOP_VINFO_MASK_COMPARE_TYPE. */
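/* A worked instance of the width computation below (illustrative numbers):
   if the latch is known to run at most 999 times, the header runs at most
   1000 times; with the largest rgroup needing 2 mask bits per scalar
   iteration, the comparison IV must be able to count to 2000, so
   min_ni_width = 11 bits, and a WHILE_ULT-capable integer mode of at least
   that width (preferring Pmode-or-wider IV types, as explained in the
   comment further down) is chosen as the comparison type.  */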
1030 static bool
1031 vect_verify_full_masking (loop_vec_info loop_vinfo)
1033 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1034 unsigned int min_ni_width;
1035 unsigned int max_nscalars_per_iter
1036 = vect_get_max_nscalars_per_iter (loop_vinfo);
1038 /* Use a normal loop if there are no statements that need masking.
1039 This only happens in rare degenerate cases: it means that the loop
1040 has no loads, no stores, and no live-out values. */
1041 if (LOOP_VINFO_MASKS (loop_vinfo).is_empty ())
1042 return false;
1044 /* Get the maximum number of iterations that is representable
1045 in the counter type. */
1046 tree ni_type = TREE_TYPE (LOOP_VINFO_NITERSM1 (loop_vinfo));
1047 widest_int max_ni = wi::to_widest (TYPE_MAX_VALUE (ni_type)) + 1;
1049 /* Get a more refined estimate for the number of iterations. */
1050 widest_int max_back_edges;
1051 if (max_loop_iterations (loop, &max_back_edges))
1052 max_ni = wi::smin (max_ni, max_back_edges + 1);
1054 /* Account for rgroup masks, in which each bit is replicated N times. */
1055 max_ni *= max_nscalars_per_iter;
1057 /* Work out how many bits we need to represent the limit. */
1058 min_ni_width = wi::min_precision (max_ni, UNSIGNED);
1060 /* Find a scalar mode for which WHILE_ULT is supported. */
1061 opt_scalar_int_mode cmp_mode_iter;
1062 tree cmp_type = NULL_TREE;
1063 tree iv_type = NULL_TREE;
1064 widest_int iv_limit = vect_iv_limit_for_full_masking (loop_vinfo);
1065 unsigned int iv_precision = UINT_MAX;
1067 if (iv_limit != -1)
1068 iv_precision = wi::min_precision (iv_limit * max_nscalars_per_iter,
1069 UNSIGNED);
1071 FOR_EACH_MODE_IN_CLASS (cmp_mode_iter, MODE_INT)
1073 unsigned int cmp_bits = GET_MODE_BITSIZE (cmp_mode_iter.require ());
1074 if (cmp_bits >= min_ni_width
1075 && targetm.scalar_mode_supported_p (cmp_mode_iter.require ()))
1077 tree this_type = build_nonstandard_integer_type (cmp_bits, true);
1078 if (this_type
1079 && can_produce_all_loop_masks_p (loop_vinfo, this_type))
1081 /* Although we could stop as soon as we find a valid mode,
1082 there are at least two reasons why that's not always the
1083 best choice:
1085 - An IV that's Pmode or wider is more likely to be reusable
1086 in address calculations than an IV that's narrower than
1087 Pmode.
1089 - Doing the comparison in IV_PRECISION or wider allows
1090 a natural 0-based IV, whereas using a narrower comparison
1091 type requires mitigations against wrap-around.
1093 Conversely, if the IV limit is variable, doing the comparison
1094 in a wider type than the original type can introduce
1095 unnecessary extensions, so picking the widest valid mode
1096 is not always a good choice either.
1098 Here we prefer the first IV type that's Pmode or wider,
1099 and the first comparison type that's IV_PRECISION or wider.
1100 (The comparison type must be no wider than the IV type,
1101 to avoid extensions in the vector loop.)
1103 ??? We might want to try continuing beyond Pmode for ILP32
1104 targets if CMP_BITS < IV_PRECISION. */
1105 iv_type = this_type;
1106 if (!cmp_type || iv_precision > TYPE_PRECISION (cmp_type))
1107 cmp_type = this_type;
1108 if (cmp_bits >= GET_MODE_BITSIZE (Pmode))
1109 break;
1114 if (!cmp_type)
1115 return false;
1117 LOOP_VINFO_MASK_COMPARE_TYPE (loop_vinfo) = cmp_type;
1118 LOOP_VINFO_MASK_IV_TYPE (loop_vinfo) = iv_type;
1119 return true;
1122 /* Calculate the cost of one scalar iteration of the loop. */
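/* A worked instance (illustrative numbers): a loop body consisting of two
   loads, one store and one addition contributes 2 * scalar_load,
   1 * scalar_store and 1 * scalar_stmt entries to the cost vector, each
   with factor 1 (statements sitting in an inner loop instead use the
   FORNOW factor of 50 chosen in the body of this function); the target's
   add_stmt_cost/finish_cost hooks then turn those counts into
   LOOP_VINFO_SINGLE_SCALAR_ITERATION_COST.  */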
1123 static void
1124 vect_compute_single_scalar_iteration_cost (loop_vec_info loop_vinfo)
1126 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1127 basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
1128 int nbbs = loop->num_nodes, factor;
1129 int innerloop_iters, i;
1131 DUMP_VECT_SCOPE ("vect_compute_single_scalar_iteration_cost");
1133 /* Gather costs for statements in the scalar loop. */
1135 /* FORNOW. */
1136 innerloop_iters = 1;
1137 if (loop->inner)
1138 innerloop_iters = 50; /* FIXME */
1140 for (i = 0; i < nbbs; i++)
1142 gimple_stmt_iterator si;
1143 basic_block bb = bbs[i];
1145 if (bb->loop_father == loop->inner)
1146 factor = innerloop_iters;
1147 else
1148 factor = 1;
1150 for (si = gsi_start_bb (bb); !gsi_end_p (si); gsi_next (&si))
1152 gimple *stmt = gsi_stmt (si);
1153 stmt_vec_info stmt_info = loop_vinfo->lookup_stmt (stmt);
1155 if (!is_gimple_assign (stmt) && !is_gimple_call (stmt))
1156 continue;
1158 /* Skip stmts that are not vectorized inside the loop. */
1159 stmt_vec_info vstmt_info = vect_stmt_to_vectorize (stmt_info);
1160 if (!STMT_VINFO_RELEVANT_P (vstmt_info)
1161 && (!STMT_VINFO_LIVE_P (vstmt_info)
1162 || !VECTORIZABLE_CYCLE_DEF
1163 (STMT_VINFO_DEF_TYPE (vstmt_info))))
1164 continue;
1166 vect_cost_for_stmt kind;
1167 if (STMT_VINFO_DATA_REF (stmt_info))
1169 if (DR_IS_READ (STMT_VINFO_DATA_REF (stmt_info)))
1170 kind = scalar_load;
1171 else
1172 kind = scalar_store;
1174 else
1175 kind = scalar_stmt;
1177 record_stmt_cost (&LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo),
1178 factor, kind, stmt_info, 0, vect_prologue);
1182 /* Now accumulate cost. */
1183 void *target_cost_data = init_cost (loop);
1184 stmt_info_for_cost *si;
1185 int j;
1186 FOR_EACH_VEC_ELT (LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo),
1187 j, si)
1188 (void) add_stmt_cost (target_cost_data, si->count,
1189 si->kind, si->stmt_info, si->misalign,
1190 vect_body);
1191 unsigned dummy, body_cost = 0;
1192 finish_cost (target_cost_data, &dummy, &body_cost, &dummy);
1193 destroy_cost_data (target_cost_data);
1194 LOOP_VINFO_SINGLE_SCALAR_ITERATION_COST (loop_vinfo) = body_cost;
1198 /* Function vect_analyze_loop_form_1.
1200 Verify that certain CFG restrictions hold, including:
1201 - the loop has a pre-header
1202 - the loop has a single entry and exit
1203 - the loop exit condition is simple enough
 1204 - the number of iterations can be analyzed, i.e., a countable loop. The
1205 niter could be analyzed under some assumptions. */
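/* Source-level intuition (illustrative): a vectorizable inner-most loop is
   expected to look, after if-conversion and canonicalization, like

     if (n > 0)
       do
         {
           a[i] = b[i] + c[i];
           i++;
         }
       while (i < n);              single exit, tested at the bottom

   i.e. a header block holding the body and the exit test plus an empty
   latch (two basic blocks in total), whereas an extra exit such as a
   "break" that if-conversion could not remove introduces additional blocks
   and edges and is rejected below as "control flow in loop" or
   "multiple exits".  */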
1207 opt_result
1208 vect_analyze_loop_form_1 (struct loop *loop, gcond **loop_cond,
1209 tree *assumptions, tree *number_of_iterationsm1,
1210 tree *number_of_iterations, gcond **inner_loop_cond)
1212 DUMP_VECT_SCOPE ("vect_analyze_loop_form");
1214 /* Different restrictions apply when we are considering an inner-most loop,
1215 vs. an outer (nested) loop.
1216 (FORNOW. May want to relax some of these restrictions in the future). */
1218 if (!loop->inner)
1220 /* Inner-most loop. We currently require that the number of BBs is
1221 exactly 2 (the header and latch). Vectorizable inner-most loops
1222 look like this:
1224 (pre-header)
1226 header <--------+
1227 | | |
1228 | +--> latch --+
1230 (exit-bb) */
1232 if (loop->num_nodes != 2)
1233 return opt_result::failure_at (vect_location,
1234 "not vectorized:"
1235 " control flow in loop.\n");
1237 if (empty_block_p (loop->header))
1238 return opt_result::failure_at (vect_location,
1239 "not vectorized: empty loop.\n");
1241 else
1243 struct loop *innerloop = loop->inner;
1244 edge entryedge;
1246 /* Nested loop. We currently require that the loop is doubly-nested,
1247 contains a single inner loop, and the number of BBs is exactly 5.
1248 Vectorizable outer-loops look like this:
1250 (pre-header)
1252 header <---+
1254 inner-loop |
1256 tail ------+
1258 (exit-bb)
1260 The inner-loop has the properties expected of inner-most loops
1261 as described above. */
1263 if ((loop->inner)->inner || (loop->inner)->next)
1264 return opt_result::failure_at (vect_location,
1265 "not vectorized:"
1266 " multiple nested loops.\n");
1268 if (loop->num_nodes != 5)
1269 return opt_result::failure_at (vect_location,
1270 "not vectorized:"
1271 " control flow in loop.\n");
1273 entryedge = loop_preheader_edge (innerloop);
1274 if (entryedge->src != loop->header
1275 || !single_exit (innerloop)
1276 || single_exit (innerloop)->dest != EDGE_PRED (loop->latch, 0)->src)
1277 return opt_result::failure_at (vect_location,
1278 "not vectorized:"
1279 " unsupported outerloop form.\n");
1281 /* Analyze the inner-loop. */
1282 tree inner_niterm1, inner_niter, inner_assumptions;
1283 opt_result res
1284 = vect_analyze_loop_form_1 (loop->inner, inner_loop_cond,
1285 &inner_assumptions, &inner_niterm1,
1286 &inner_niter, NULL);
1287 if (!res)
1289 if (dump_enabled_p ())
1290 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1291 "not vectorized: Bad inner loop.\n");
1292 return res;
1295 /* Don't support analyzing niter under assumptions for inner
1296 loop. */
1297 if (!integer_onep (inner_assumptions))
1298 return opt_result::failure_at (vect_location,
1299 "not vectorized: Bad inner loop.\n");
1301 if (!expr_invariant_in_loop_p (loop, inner_niter))
1302 return opt_result::failure_at (vect_location,
1303 "not vectorized: inner-loop count not"
1304 " invariant.\n");
1306 if (dump_enabled_p ())
1307 dump_printf_loc (MSG_NOTE, vect_location,
1308 "Considering outer-loop vectorization.\n");
1311 if (!single_exit (loop))
1312 return opt_result::failure_at (vect_location,
1313 "not vectorized: multiple exits.\n");
1314 if (EDGE_COUNT (loop->header->preds) != 2)
1315 return opt_result::failure_at (vect_location,
1316 "not vectorized:"
1317 " too many incoming edges.\n");
 1319 /* We assume that the loop exit condition is at the end of the loop, i.e.,
1320 that the loop is represented as a do-while (with a proper if-guard
1321 before the loop if needed), where the loop header contains all the
1322 executable statements, and the latch is empty. */
1323 if (!empty_block_p (loop->latch)
1324 || !gimple_seq_empty_p (phi_nodes (loop->latch)))
1325 return opt_result::failure_at (vect_location,
1326 "not vectorized: latch block not empty.\n");
1328 /* Make sure the exit is not abnormal. */
1329 edge e = single_exit (loop);
1330 if (e->flags & EDGE_ABNORMAL)
1331 return opt_result::failure_at (vect_location,
1332 "not vectorized:"
1333 " abnormal loop exit edge.\n");
1335 *loop_cond = vect_get_loop_niters (loop, assumptions, number_of_iterations,
1336 number_of_iterationsm1);
1337 if (!*loop_cond)
1338 return opt_result::failure_at
1339 (vect_location,
1340 "not vectorized: complicated exit condition.\n");
1342 if (integer_zerop (*assumptions)
1343 || !*number_of_iterations
1344 || chrec_contains_undetermined (*number_of_iterations))
1345 return opt_result::failure_at
1346 (*loop_cond,
1347 "not vectorized: number of iterations cannot be computed.\n");
1349 if (integer_zerop (*number_of_iterations))
1350 return opt_result::failure_at
1351 (*loop_cond,
1352 "not vectorized: number of iterations = 0.\n");
1354 return opt_result::success ();
1357 /* Analyze LOOP form and return a loop_vec_info if it is of suitable form. */
1359 opt_loop_vec_info
1360 vect_analyze_loop_form (struct loop *loop, vec_info_shared *shared)
1362 tree assumptions, number_of_iterations, number_of_iterationsm1;
1363 gcond *loop_cond, *inner_loop_cond = NULL;
1365 opt_result res
1366 = vect_analyze_loop_form_1 (loop, &loop_cond,
1367 &assumptions, &number_of_iterationsm1,
1368 &number_of_iterations, &inner_loop_cond);
1369 if (!res)
1370 return opt_loop_vec_info::propagate_failure (res);
1372 loop_vec_info loop_vinfo = new _loop_vec_info (loop, shared);
1373 LOOP_VINFO_NITERSM1 (loop_vinfo) = number_of_iterationsm1;
1374 LOOP_VINFO_NITERS (loop_vinfo) = number_of_iterations;
1375 LOOP_VINFO_NITERS_UNCHANGED (loop_vinfo) = number_of_iterations;
1376 if (!integer_onep (assumptions))
1378 /* We consider to vectorize this loop by versioning it under
1379 some assumptions. In order to do this, we need to clear
1380 existing information computed by scev and niter analyzer. */
1381 scev_reset_htab ();
1382 free_numbers_of_iterations_estimates (loop);
1383 /* Also set flag for this loop so that following scev and niter
1384 analysis are done under the assumptions. */
1385 loop_constraint_set (loop, LOOP_C_FINITE);
1386 /* Also record the assumptions for versioning. */
1387 LOOP_VINFO_NITERS_ASSUMPTIONS (loop_vinfo) = assumptions;
1390 if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
1392 if (dump_enabled_p ())
1394 dump_printf_loc (MSG_NOTE, vect_location,
1395 "Symbolic number of iterations is ");
1396 dump_generic_expr (MSG_NOTE, TDF_DETAILS, number_of_iterations);
1397 dump_printf (MSG_NOTE, "\n");
1401 stmt_vec_info loop_cond_info = loop_vinfo->lookup_stmt (loop_cond);
1402 STMT_VINFO_TYPE (loop_cond_info) = loop_exit_ctrl_vec_info_type;
1403 if (inner_loop_cond)
1405 stmt_vec_info inner_loop_cond_info
1406 = loop_vinfo->lookup_stmt (inner_loop_cond);
1407 STMT_VINFO_TYPE (inner_loop_cond_info) = loop_exit_ctrl_vec_info_type;
1410 gcc_assert (!loop->aux);
1411 loop->aux = loop_vinfo;
1412 return opt_loop_vec_info::success (loop_vinfo);
1417 /* Scan the loop stmts and dependent on whether there are any (non-)SLP
1418 statements update the vectorization factor. */
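/* A worked instance (illustrative numbers): if the non-SLP statements in the
   loop require VF = 2 (say, 8-byte elements in 16-byte vectors) while an SLP
   instance needs an unrolling factor of 3, force_common_multiple yields 6,
   so after this update each vector iteration covers 6 scalar iterations.
   When every statement is covered by SLP, the SLP unrolling factor is used
   directly instead.  */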
1420 static void
1421 vect_update_vf_for_slp (loop_vec_info loop_vinfo)
1423 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1424 basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
1425 int nbbs = loop->num_nodes;
1426 poly_uint64 vectorization_factor;
1427 int i;
1429 DUMP_VECT_SCOPE ("vect_update_vf_for_slp");
1431 vectorization_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
1432 gcc_assert (known_ne (vectorization_factor, 0U));
1434 /* If all the stmts in the loop can be SLPed, we perform only SLP, and
1435 vectorization factor of the loop is the unrolling factor required by
 1436 the SLP instances. If that unrolling factor is 1, we say that we
 1437 perform pure SLP on the loop; cross-iteration parallelism is not
 1438 exploited. */
1439 bool only_slp_in_loop = true;
1440 for (i = 0; i < nbbs; i++)
1442 basic_block bb = bbs[i];
1443 for (gimple_stmt_iterator si = gsi_start_bb (bb); !gsi_end_p (si);
1444 gsi_next (&si))
1446 stmt_vec_info stmt_info = loop_vinfo->lookup_stmt (gsi_stmt (si));
1447 stmt_info = vect_stmt_to_vectorize (stmt_info);
1448 if ((STMT_VINFO_RELEVANT_P (stmt_info)
1449 || VECTORIZABLE_CYCLE_DEF (STMT_VINFO_DEF_TYPE (stmt_info)))
1450 && !PURE_SLP_STMT (stmt_info))
1451 /* STMT needs both SLP and loop-based vectorization. */
1452 only_slp_in_loop = false;
1456 if (only_slp_in_loop)
1458 if (dump_enabled_p ())
1459 dump_printf_loc (MSG_NOTE, vect_location,
1460 "Loop contains only SLP stmts\n");
1461 vectorization_factor = LOOP_VINFO_SLP_UNROLLING_FACTOR (loop_vinfo);
1463 else
1465 if (dump_enabled_p ())
1466 dump_printf_loc (MSG_NOTE, vect_location,
1467 "Loop contains SLP and non-SLP stmts\n");
1468 /* Both the vectorization factor and unroll factor have the form
1469 current_vector_size * X for some rational X, so they must have
1470 a common multiple. */
1471 vectorization_factor
1472 = force_common_multiple (vectorization_factor,
1473 LOOP_VINFO_SLP_UNROLLING_FACTOR (loop_vinfo));
1476 LOOP_VINFO_VECT_FACTOR (loop_vinfo) = vectorization_factor;
1477 if (dump_enabled_p ())
1479 dump_printf_loc (MSG_NOTE, vect_location,
1480 "Updating vectorization factor to ");
1481 dump_dec (MSG_NOTE, vectorization_factor);
1482 dump_printf (MSG_NOTE, ".\n");
1486 /* Return true if STMT_INFO describes a double reduction phi and if
1487 the other phi in the reduction is also relevant for vectorization.
1488 This rejects cases such as:
1490 outer1:
1491 x_1 = PHI <x_3(outer2), ...>;
1494 inner:
1495 x_2 = ...;
1498 outer2:
1499 x_3 = PHI <x_2(inner)>;
1501 if nothing in x_2 or elsewhere makes x_1 relevant. */
1503 static bool
1504 vect_active_double_reduction_p (stmt_vec_info stmt_info)
1506 if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_double_reduction_def)
1507 return false;
1509 return STMT_VINFO_RELEVANT_P (STMT_VINFO_REDUC_DEF (stmt_info));
1512 /* Function vect_analyze_loop_operations.
1514 Scan the loop stmts and make sure they are all vectorizable. */
1516 static opt_result
1517 vect_analyze_loop_operations (loop_vec_info loop_vinfo)
1519 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1520 basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
1521 int nbbs = loop->num_nodes;
1522 int i;
1523 stmt_vec_info stmt_info;
1524 bool need_to_vectorize = false;
1525 bool ok;
1527 DUMP_VECT_SCOPE ("vect_analyze_loop_operations");
1529 auto_vec<stmt_info_for_cost> cost_vec;
1531 for (i = 0; i < nbbs; i++)
1533 basic_block bb = bbs[i];
1535 for (gphi_iterator si = gsi_start_phis (bb); !gsi_end_p (si);
1536 gsi_next (&si))
1538 gphi *phi = si.phi ();
1539 ok = true;
1541 stmt_info = loop_vinfo->lookup_stmt (phi);
1542 if (dump_enabled_p ())
1543 dump_printf_loc (MSG_NOTE, vect_location, "examining phi: %G", phi);
1544 if (virtual_operand_p (gimple_phi_result (phi)))
1545 continue;
1547 /* Inner-loop loop-closed exit phi in outer-loop vectorization
1548 (i.e., a phi in the tail of the outer-loop). */
1549 if (! is_loop_header_bb_p (bb))
1551 /* FORNOW: we currently don't support the case that these phis
1552 are not used in the outerloop (unless it is double reduction,
1553 i.e., this phi is vect_reduction_def), cause this case
1554 requires to actually do something here. */
1555 if (STMT_VINFO_LIVE_P (stmt_info)
1556 && !vect_active_double_reduction_p (stmt_info))
1557 return opt_result::failure_at (phi,
1558 "Unsupported loop-closed phi"
1559 " in outer-loop.\n");
1561 /* If PHI is used in the outer loop, we check that its operand
1562 is defined in the inner loop. */
1563 if (STMT_VINFO_RELEVANT_P (stmt_info))
1565 tree phi_op;
1567 if (gimple_phi_num_args (phi) != 1)
1568 return opt_result::failure_at (phi, "unsupported phi");
1570 phi_op = PHI_ARG_DEF (phi, 0);
1571 stmt_vec_info op_def_info = loop_vinfo->lookup_def (phi_op);
1572 if (!op_def_info)
1573 return opt_result::failure_at (phi, "unsupported phi");
1575 if (STMT_VINFO_RELEVANT (op_def_info) != vect_used_in_outer
1576 && (STMT_VINFO_RELEVANT (op_def_info)
1577 != vect_used_in_outer_by_reduction))
1578 return opt_result::failure_at (phi, "unsupported phi");
1581 continue;
1584 gcc_assert (stmt_info);
1586 if ((STMT_VINFO_RELEVANT (stmt_info) == vect_used_in_scope
1587 || STMT_VINFO_LIVE_P (stmt_info))
1588 && STMT_VINFO_DEF_TYPE (stmt_info) != vect_induction_def)
1589 /* A scalar-dependence cycle that we don't support. */
1590 return opt_result::failure_at (phi,
1591 "not vectorized:"
1592 " scalar dependence cycle.\n");
1594 if (STMT_VINFO_RELEVANT_P (stmt_info))
1596 need_to_vectorize = true;
1597 if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_induction_def
1598 && ! PURE_SLP_STMT (stmt_info))
1599 ok = vectorizable_induction (stmt_info, NULL, NULL, NULL,
1600 &cost_vec);
1601 else if ((STMT_VINFO_DEF_TYPE (stmt_info) == vect_reduction_def
1602 || STMT_VINFO_DEF_TYPE (stmt_info) == vect_nested_cycle)
1603 && ! PURE_SLP_STMT (stmt_info))
1604 ok = vectorizable_reduction (stmt_info, NULL, NULL, NULL, NULL,
1605 &cost_vec);
1608 /* SLP PHIs are tested by vect_slp_analyze_node_operations. */
1609 if (ok
1610 && STMT_VINFO_LIVE_P (stmt_info)
1611 && !PURE_SLP_STMT (stmt_info))
1612 ok = vectorizable_live_operation (stmt_info, NULL, NULL, -1, NULL,
1613 &cost_vec);
1615 if (!ok)
1616 return opt_result::failure_at (phi,
1617 "not vectorized: relevant phi not "
1618 "supported: %G",
1619 static_cast <gimple *> (phi));
1622 for (gimple_stmt_iterator si = gsi_start_bb (bb); !gsi_end_p (si);
1623 gsi_next (&si))
1625 gimple *stmt = gsi_stmt (si);
1626 if (!gimple_clobber_p (stmt))
1628 opt_result res
1629 = vect_analyze_stmt (loop_vinfo->lookup_stmt (stmt),
1630 &need_to_vectorize,
1631 NULL, NULL, &cost_vec);
1632 if (!res)
1633 return res;
1636 } /* bbs */
1638 add_stmt_costs (loop_vinfo->target_cost_data, &cost_vec);
1640 /* All operations in the loop are either irrelevant (deal with loop
1641 control, or dead), or only used outside the loop and can be moved
1642 out of the loop (e.g. invariants, inductions). The loop can be
1643 optimized away by scalar optimizations. We're better off not
1644 touching this loop. */
1645 if (!need_to_vectorize)
1647 if (dump_enabled_p ())
1648 dump_printf_loc (MSG_NOTE, vect_location,
1649 "All the computation can be taken out of the loop.\n");
1650 return opt_result::failure_at
1651 (vect_location,
1652 "not vectorized: redundant loop. no profit to vectorize.\n");
1655 return opt_result::success ();
1658 /* Analyze the cost of the loop described by LOOP_VINFO. Decide if it
1659 is worthwhile to vectorize. Return 1 if definitely yes, 0 if
1660 definitely no, or -1 if it's worth retrying. */
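/* A worked instance of the thresholds computed below (illustrative numbers):
   with an assumed VF of 4, min_profitable_iters = 7 from the cost model and
   --param min-vect-loop-bound=3, the threshold is
   th = MAX (3 * 4, 7) = 12; a loop whose iteration count is known to be
   below 12 is rejected outright (return 0), while one whose *estimated*
   count is below MAX (th, min_profitable_estimate) is rejected as worth
   retrying with a different vector size (return -1).  */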
1662 static int
1663 vect_analyze_loop_costing (loop_vec_info loop_vinfo)
1665 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1666 unsigned int assumed_vf = vect_vf_for_cost (loop_vinfo);
1668 /* Only fully-masked loops can have iteration counts less than the
1669 vectorization factor. */
1670 if (!LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
1672 HOST_WIDE_INT max_niter;
1674 if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
1675 max_niter = LOOP_VINFO_INT_NITERS (loop_vinfo);
1676 else
1677 max_niter = max_stmt_executions_int (loop);
1679 if (max_niter != -1
1680 && (unsigned HOST_WIDE_INT) max_niter < assumed_vf)
1682 if (dump_enabled_p ())
1683 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1684 "not vectorized: iteration count smaller than "
1685 "vectorization factor.\n");
1686 return 0;
1690 int min_profitable_iters, min_profitable_estimate;
1691 vect_estimate_min_profitable_iters (loop_vinfo, &min_profitable_iters,
1692 &min_profitable_estimate);
1694 if (min_profitable_iters < 0)
1696 if (dump_enabled_p ())
1697 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1698 "not vectorized: vectorization not profitable.\n");
1699 if (dump_enabled_p ())
1700 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1701 "not vectorized: vector version will never be "
1702 "profitable.\n");
1703 return -1;
1706 int min_scalar_loop_bound = (PARAM_VALUE (PARAM_MIN_VECT_LOOP_BOUND)
1707 * assumed_vf);
1709 /* Use the cost model only if it is more conservative than user specified
1710 threshold. */
1711 unsigned int th = (unsigned) MAX (min_scalar_loop_bound,
1712 min_profitable_iters);
1714 LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo) = th;
1716 if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
1717 && LOOP_VINFO_INT_NITERS (loop_vinfo) < th)
1719 if (dump_enabled_p ())
1720 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1721 "not vectorized: vectorization not profitable.\n");
1722 if (dump_enabled_p ())
1723 dump_printf_loc (MSG_NOTE, vect_location,
1724 "not vectorized: iteration count smaller than user "
1725 "specified loop bound parameter or minimum profitable "
1726 "iterations (whichever is more conservative).\n");
1727 return 0;
1730 HOST_WIDE_INT estimated_niter = estimated_stmt_executions_int (loop);
1731 if (estimated_niter == -1)
1732 estimated_niter = likely_max_stmt_executions_int (loop);
1733 if (estimated_niter != -1
1734 && ((unsigned HOST_WIDE_INT) estimated_niter
1735 < MAX (th, (unsigned) min_profitable_estimate)))
1737 if (dump_enabled_p ())
1738 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1739 "not vectorized: estimated iteration count too "
1740 "small.\n");
1741 if (dump_enabled_p ())
1742 dump_printf_loc (MSG_NOTE, vect_location,
1743 "not vectorized: estimated iteration count smaller "
1744 "than specified loop bound parameter or minimum "
1745 "profitable iterations (whichever is more "
1746 "conservative).\n");
1747 return -1;
1750 return 1;
1753 static opt_result
1754 vect_get_datarefs_in_loop (loop_p loop, basic_block *bbs,
1755 vec<data_reference_p> *datarefs,
1756 unsigned int *n_stmts)
1758 *n_stmts = 0;
1759 for (unsigned i = 0; i < loop->num_nodes; i++)
1760 for (gimple_stmt_iterator gsi = gsi_start_bb (bbs[i]);
1761 !gsi_end_p (gsi); gsi_next (&gsi))
1763 gimple *stmt = gsi_stmt (gsi);
1764 if (is_gimple_debug (stmt))
1765 continue;
1766 ++(*n_stmts);
1767 opt_result res = vect_find_stmt_data_reference (loop, stmt, datarefs);
1768 if (!res)
1770 if (is_gimple_call (stmt) && loop->safelen)
1772 tree fndecl = gimple_call_fndecl (stmt), op;
1773 if (fndecl != NULL_TREE)
1775 cgraph_node *node = cgraph_node::get (fndecl);
1776 if (node != NULL && node->simd_clones != NULL)
1778 unsigned int j, n = gimple_call_num_args (stmt);
1779 for (j = 0; j < n; j++)
1781 op = gimple_call_arg (stmt, j);
1782 if (DECL_P (op)
1783 || (REFERENCE_CLASS_P (op)
1784 && get_base_address (op)))
1785 break;
1787 op = gimple_call_lhs (stmt);
1788 /* Ignore #pragma omp declare simd functions
1789 if they don't have data references in the
1790 call stmt itself. */
1791 if (j == n
1792 && !(op
1793 && (DECL_P (op)
1794 || (REFERENCE_CLASS_P (op)
1795 && get_base_address (op)))))
1796 continue;
1800 return res;
1802 /* If dependence analysis will give up due to the limit on the
1803 number of datarefs stop here and fail fatally. */
1804 if (datarefs->length ()
1805 > (unsigned)PARAM_VALUE (PARAM_LOOP_MAX_DATAREFS_FOR_DATADEPS))
1806 return opt_result::failure_at (stmt, "exceeded param "
1807 "loop-max-datarefs-for-datadeps\n");
1809 return opt_result::success ();
1812 /* Look for SLP-only access groups and turn each individual access into its own
1813 group. */
1814 static void
1815 vect_dissolve_slp_only_groups (loop_vec_info loop_vinfo)
1817 unsigned int i;
1818 struct data_reference *dr;
1820 DUMP_VECT_SCOPE ("vect_dissolve_slp_only_groups");
1822 vec<data_reference_p> datarefs = loop_vinfo->shared->datarefs;
1823 FOR_EACH_VEC_ELT (datarefs, i, dr)
1825 gcc_assert (DR_REF (dr));
1826 stmt_vec_info stmt_info = loop_vinfo->lookup_stmt (DR_STMT (dr));
1828 /* Check if the load is a part of an interleaving chain. */
1829 if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
1831 stmt_vec_info first_element = DR_GROUP_FIRST_ELEMENT (stmt_info);
1832 unsigned int group_size = DR_GROUP_SIZE (first_element);
1834 /* Check if this is an SLP-only group. */
1835 if (!STMT_SLP_TYPE (stmt_info)
1836 && STMT_VINFO_SLP_VECT_ONLY (first_element))
1838 /* Dissolve the group. */
1839 STMT_VINFO_SLP_VECT_ONLY (first_element) = false;
1841 stmt_vec_info vinfo = first_element;
1842 while (vinfo)
1844 stmt_vec_info next = DR_GROUP_NEXT_ELEMENT (vinfo);
1845 DR_GROUP_FIRST_ELEMENT (vinfo) = vinfo;
1846 DR_GROUP_NEXT_ELEMENT (vinfo) = NULL;
1847 DR_GROUP_SIZE (vinfo) = 1;
1848 DR_GROUP_GAP (vinfo) = group_size - 1;
1849 vinfo = next;
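/* Minimal sketch (hypothetical types, illustrative only) of the group
   dissolution performed above, on a plain singly linked list: every
   element becomes a group of its own, and the original stride is kept
   as a gap so the access pattern stays the same.  */
struct example_group_elt
{
  struct example_group_elt *first, *next;
  unsigned int size, gap;
};

static void
example_dissolve_group (struct example_group_elt *first_element,
                        unsigned int group_size)
{
  struct example_group_elt *elt = first_element;
  while (elt)
    {
      struct example_group_elt *next = elt->next;
      elt->first = elt;
      elt->next = 0;            /* i.e. NULL */
      elt->size = 1;
      elt->gap = group_size - 1;
      elt = next;
    }
}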
1856 /* Function vect_analyze_loop_2.
1858 Apply a set of analyses on LOOP, and create a loop_vec_info struct
1859 for it. The different analyses will record information in the
1860 loop_vec_info struct. */
1861 static opt_result
1862 vect_analyze_loop_2 (loop_vec_info loop_vinfo, bool &fatal, unsigned *n_stmts)
1864 opt_result ok = opt_result::success ();
1865 int res;
1866 unsigned int max_vf = MAX_VECTORIZATION_FACTOR;
1867 poly_uint64 min_vf = 2;
1869 /* The first group of checks is independent of the vector size. */
1870 fatal = true;
1872 if (LOOP_VINFO_SIMD_IF_COND (loop_vinfo)
1873 && integer_zerop (LOOP_VINFO_SIMD_IF_COND (loop_vinfo)))
1874 return opt_result::failure_at (vect_location,
1875 "not vectorized: simd if(0)\n");
1877 /* Find all data references in the loop (which correspond to vdefs/vuses)
1878 and analyze their evolution in the loop. */
1880 loop_p loop = LOOP_VINFO_LOOP (loop_vinfo);
1882 /* Gather the data references and count stmts in the loop. */
1883 if (!LOOP_VINFO_DATAREFS (loop_vinfo).exists ())
1885 opt_result res
1886 = vect_get_datarefs_in_loop (loop, LOOP_VINFO_BBS (loop_vinfo),
1887 &LOOP_VINFO_DATAREFS (loop_vinfo),
1888 n_stmts);
1889 if (!res)
1891 if (dump_enabled_p ())
1892 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1893 "not vectorized: loop contains function "
1894 "calls or data references that cannot "
1895 "be analyzed\n");
1896 return res;
1898 loop_vinfo->shared->save_datarefs ();
1900 else
1901 loop_vinfo->shared->check_datarefs ();
1903 /* Analyze the data references and also adjust the minimal
1904 vectorization factor according to the loads and stores. */
1906 ok = vect_analyze_data_refs (loop_vinfo, &min_vf);
1907 if (!ok)
1909 if (dump_enabled_p ())
1910 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1911 "bad data references.\n");
1912 return ok;
1915 /* Classify all cross-iteration scalar data-flow cycles.
1916 Cross-iteration cycles caused by virtual phis are analyzed separately. */
1917 vect_analyze_scalar_cycles (loop_vinfo);
1919 vect_pattern_recog (loop_vinfo);
1921 vect_fixup_scalar_cycles_with_patterns (loop_vinfo);
1923 /* Analyze the access patterns of the data-refs in the loop (consecutive,
1924 complex, etc.). FORNOW: Only handle consecutive access pattern. */
1926 ok = vect_analyze_data_ref_accesses (loop_vinfo);
1927 if (!ok)
1929 if (dump_enabled_p ())
1930 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1931 "bad data access.\n");
1932 return ok;
1935 /* Data-flow analysis to detect stmts that do not need to be vectorized. */
1937 ok = vect_mark_stmts_to_be_vectorized (loop_vinfo);
1938 if (!ok)
1940 if (dump_enabled_p ())
1941 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1942 "unexpected pattern.\n");
1943 return ok;
1946 /* The rest of the analysis below depends on the vector size in some way, so a failure is no longer fatal. */
1947 fatal = false;
1949 /* Analyze data dependences between the data-refs in the loop
1950 and adjust the maximum vectorization factor according to
1951 the dependences.
1952 FORNOW: fail at the first data dependence that we encounter. */
1954 ok = vect_analyze_data_ref_dependences (loop_vinfo, &max_vf);
1955 if (!ok)
1957 if (dump_enabled_p ())
1958 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1959 "bad data dependence.\n");
1960 return ok;
1962 if (max_vf != MAX_VECTORIZATION_FACTOR
1963 && maybe_lt (max_vf, min_vf))
1964 return opt_result::failure_at (vect_location, "bad data dependence.\n");
1965 LOOP_VINFO_MAX_VECT_FACTOR (loop_vinfo) = max_vf;
1967 ok = vect_determine_vectorization_factor (loop_vinfo);
1968 if (!ok)
1970 if (dump_enabled_p ())
1971 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1972 "can't determine vectorization factor.\n");
1973 return ok;
1975 if (max_vf != MAX_VECTORIZATION_FACTOR
1976 && maybe_lt (max_vf, LOOP_VINFO_VECT_FACTOR (loop_vinfo)))
1977 return opt_result::failure_at (vect_location, "bad data dependence.\n");
1979 /* Compute the scalar iteration cost. */
1980 vect_compute_single_scalar_iteration_cost (loop_vinfo);
1982 poly_uint64 saved_vectorization_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
1983 unsigned th;
1985 /* Check the SLP opportunities in the loop, analyze and build SLP trees. */
1986 ok = vect_analyze_slp (loop_vinfo, *n_stmts);
1987 if (!ok)
1988 return ok;
1990 /* If there are any SLP instances mark them as pure_slp. */
1991 bool slp = vect_make_slp_decision (loop_vinfo);
1992 if (slp)
1994 /* Find stmts that need to be both vectorized and SLPed. */
1995 vect_detect_hybrid_slp (loop_vinfo);
1997 /* Update the vectorization factor based on the SLP decision. */
1998 vect_update_vf_for_slp (loop_vinfo);
2001 bool saved_can_fully_mask_p = LOOP_VINFO_CAN_FULLY_MASK_P (loop_vinfo);
2003 /* We don't expect to have to roll back to anything other than an empty
2004 set of rgroups. */
2005 gcc_assert (LOOP_VINFO_MASKS (loop_vinfo).is_empty ());
2007 /* This is the point where we can re-start analysis with SLP forced off. */
2008 start_over:
2010 /* Now the vectorization factor is final. */
2011 poly_uint64 vectorization_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
2012 gcc_assert (known_ne (vectorization_factor, 0U));
2014 if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo) && dump_enabled_p ())
2016 dump_printf_loc (MSG_NOTE, vect_location,
2017 "vectorization_factor = ");
2018 dump_dec (MSG_NOTE, vectorization_factor);
2019 dump_printf (MSG_NOTE, ", niters = %wd\n",
2020 LOOP_VINFO_INT_NITERS (loop_vinfo));
2023 HOST_WIDE_INT max_niter
2024 = likely_max_stmt_executions_int (LOOP_VINFO_LOOP (loop_vinfo));
2026 /* Analyze the alignment of the data-refs in the loop.
2027 Fail if a data reference is found that cannot be vectorized. */
2029 ok = vect_analyze_data_refs_alignment (loop_vinfo);
2030 if (!ok)
2032 if (dump_enabled_p ())
2033 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2034 "bad data alignment.\n");
2035 return ok;
2038 /* Prune the list of ddrs to be tested at run-time by versioning for alias.
2039 It is important to call pruning after vect_analyze_data_ref_accesses,
2040 since we use grouping information gathered by interleaving analysis. */
2041 ok = vect_prune_runtime_alias_test_list (loop_vinfo);
2042 if (!ok)
2043 return ok;
2045 /* Do not invoke vect_enhance_data_refs_alignment for epilogue
2046 vectorization, since we do not want to add extra peeling or
2047 add versioning for alignment. */
2048 if (!LOOP_VINFO_EPILOGUE_P (loop_vinfo))
2049 /* This pass will decide on using loop versioning and/or loop peeling in
2050 order to enhance the alignment of data references in the loop. */
2051 ok = vect_enhance_data_refs_alignment (loop_vinfo);
2052 else
2053 ok = vect_verify_datarefs_alignment (loop_vinfo);
2054 if (!ok)
2055 return ok;
2057 if (slp)
2059 /* Analyze operations in the SLP instances. Note this may
2060 remove unsupported SLP instances which makes the above
2061 SLP kind detection invalid. */
2062 unsigned old_size = LOOP_VINFO_SLP_INSTANCES (loop_vinfo).length ();
2063 vect_slp_analyze_operations (loop_vinfo);
2064 if (LOOP_VINFO_SLP_INSTANCES (loop_vinfo).length () != old_size)
2066 ok = opt_result::failure_at (vect_location,
2067 "unsupported SLP instances\n");
2068 goto again;
2072 /* Dissolve SLP-only groups. */
2073 vect_dissolve_slp_only_groups (loop_vinfo);
2075 /* Scan all the remaining operations in the loop that are not subject
2076 to SLP and make sure they are vectorizable. */
2077 ok = vect_analyze_loop_operations (loop_vinfo);
2078 if (!ok)
2080 if (dump_enabled_p ())
2081 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2082 "bad operation or unsupported loop bound.\n");
2083 return ok;
2086 /* Decide whether to use a fully-masked loop for this vectorization
2087 factor. */
2088 LOOP_VINFO_FULLY_MASKED_P (loop_vinfo)
2089 = (LOOP_VINFO_CAN_FULLY_MASK_P (loop_vinfo)
2090 && vect_verify_full_masking (loop_vinfo));
2091 if (dump_enabled_p ())
2093 if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
2094 dump_printf_loc (MSG_NOTE, vect_location,
2095 "using a fully-masked loop.\n");
2096 else
2097 dump_printf_loc (MSG_NOTE, vect_location,
2098 "not using a fully-masked loop.\n");
2101 /* If an epilogue loop is required because of data accesses with gaps,
2102 one additional iteration needs to be peeled. Check if there are
2103 enough iterations for vectorization. */
2104 if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo)
2105 && LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
2106 && !LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
2108 poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
2109 tree scalar_niters = LOOP_VINFO_NITERSM1 (loop_vinfo);
2111 if (known_lt (wi::to_widest (scalar_niters), vf))
2112 return opt_result::failure_at (vect_location,
2113 "loop has no enough iterations to"
2114 " support peeling for gaps.\n");
2117 /* Check that the costings of the loop make vectorizing worthwhile. */
2118 res = vect_analyze_loop_costing (loop_vinfo);
2119 if (res < 0)
2121 ok = opt_result::failure_at (vect_location,
2122 "Loop costings may not be worthwhile.\n");
2123 goto again;
2125 if (!res)
2126 return opt_result::failure_at (vect_location,
2127 "Loop costings not worthwhile.\n");
2129 /* Decide whether we need to create an epilogue loop to handle
2130 remaining scalar iterations. */
2131 th = LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo);
2133 unsigned HOST_WIDE_INT const_vf;
2134 if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
2135 /* The main loop handles all iterations. */
2136 LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo) = false;
2137 else if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
2138 && LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) >= 0)
2140 /* Work out the (constant) number of iterations that need to be
2141 peeled for reasons other than niters. */
2142 unsigned int peel_niter = LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo);
2143 if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo))
2144 peel_niter += 1;
2145 if (!multiple_p (LOOP_VINFO_INT_NITERS (loop_vinfo) - peel_niter,
2146 LOOP_VINFO_VECT_FACTOR (loop_vinfo)))
2147 LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo) = true;
2149 else if (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo)
2150 /* ??? When peeling for gaps but not alignment, we could
2151 try to check whether the (variable) niters is known to be
2152 VF * N + 1. That's something of a niche case though. */
2153 || LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo)
2154 || !LOOP_VINFO_VECT_FACTOR (loop_vinfo).is_constant (&const_vf)
2155 || ((tree_ctz (LOOP_VINFO_NITERS (loop_vinfo))
2156 < (unsigned) exact_log2 (const_vf))
2157 /* In case of versioning, check if the maximum number of
2158 iterations is greater than th. If they are identical,
2159 the epilogue is unnecessary. */
2160 && (!LOOP_REQUIRES_VERSIONING (loop_vinfo)
2161 || ((unsigned HOST_WIDE_INT) max_niter
2162 > (th / const_vf) * const_vf))))
2163 LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo) = true;
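  /* Worked example of the epilogue decision above (illustrative numbers
     only): with LOOP_VINFO_INT_NITERS = 100, a vectorization factor of 8
     and peel_niter = 3 (alignment peel of 2 plus 1 for gaps), 100 - 3 = 97
     is not a multiple of 8, so PEELING_FOR_NITER is set and an epilogue
     loop handles the remaining 97 % 8 = 1 scalar iteration.  */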
2165 /* If an epilogue loop is required make sure we can create one. */
2166 if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo)
2167 || LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo))
2169 if (dump_enabled_p ())
2170 dump_printf_loc (MSG_NOTE, vect_location, "epilog loop required\n");
2171 if (!vect_can_advance_ivs_p (loop_vinfo)
2172 || !slpeel_can_duplicate_loop_p (LOOP_VINFO_LOOP (loop_vinfo),
2173 single_exit (LOOP_VINFO_LOOP
2174 (loop_vinfo))))
2176 ok = opt_result::failure_at (vect_location,
2177 "not vectorized: can't create required "
2178 "epilog loop\n");
2179 goto again;
2183 /* During peeling, we need to check whether the number of loop iterations
2184 is enough for both the peeled prolog loop and the vector loop. This
2185 check can be merged with the threshold check of loop versioning, so
2186 increase the threshold for this case if necessary. */
2187 if (LOOP_REQUIRES_VERSIONING (loop_vinfo))
2189 poly_uint64 niters_th = 0;
2191 if (!vect_use_loop_mask_for_alignment_p (loop_vinfo))
2193 /* Niters for peeled prolog loop. */
2194 if (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) < 0)
2196 dr_vec_info *dr_info = LOOP_VINFO_UNALIGNED_DR (loop_vinfo);
2197 tree vectype = STMT_VINFO_VECTYPE (dr_info->stmt);
2198 niters_th += TYPE_VECTOR_SUBPARTS (vectype) - 1;
2200 else
2201 niters_th += LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo);
2204 /* Niters for at least one iteration of vectorized loop. */
2205 if (!LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
2206 niters_th += LOOP_VINFO_VECT_FACTOR (loop_vinfo);
2207 /* One additional iteration because of peeling for gaps. */
2208 if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo))
2209 niters_th += 1;
2210 LOOP_VINFO_VERSIONING_THRESHOLD (loop_vinfo) = niters_th;
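  /* Worked example of the versioning threshold above (illustrative
     numbers only): unknown alignment peeling of a V8HI data reference
     contributes 8 - 1 = 7 iterations, one full vector iteration adds
     VF = 8, and peeling for gaps adds 1, so the runtime check requires
     at least 7 + 8 + 1 = 16 scalar iterations to take the vector path.  */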
2213 gcc_assert (known_eq (vectorization_factor,
2214 LOOP_VINFO_VECT_FACTOR (loop_vinfo)));
2216 /* Ok to vectorize! */
2217 return opt_result::success ();
2219 again:
2220 /* Ensure that "ok" is false (with an opt_problem if dumping is enabled). */
2221 gcc_assert (!ok);
2223 /* Try again with SLP forced off but if we didn't do any SLP there is
2224 no point in re-trying. */
2225 if (!slp)
2226 return ok;
2228 /* If there are reduction chains re-trying will fail anyway. */
2229 if (! LOOP_VINFO_REDUCTION_CHAINS (loop_vinfo).is_empty ())
2230 return ok;
2232 /* Likewise if the grouped loads or stores in the SLP cannot be handled
2233 via interleaving or lane instructions. */
2234 slp_instance instance;
2235 slp_tree node;
2236 unsigned i, j;
2237 FOR_EACH_VEC_ELT (LOOP_VINFO_SLP_INSTANCES (loop_vinfo), i, instance)
2239 stmt_vec_info vinfo;
2240 vinfo = SLP_TREE_SCALAR_STMTS (SLP_INSTANCE_TREE (instance))[0];
2241 if (! STMT_VINFO_GROUPED_ACCESS (vinfo))
2242 continue;
2243 vinfo = DR_GROUP_FIRST_ELEMENT (vinfo);
2244 unsigned int size = DR_GROUP_SIZE (vinfo);
2245 tree vectype = STMT_VINFO_VECTYPE (vinfo);
2246 if (! vect_store_lanes_supported (vectype, size, false)
2247 && ! known_eq (TYPE_VECTOR_SUBPARTS (vectype), 1U)
2248 && ! vect_grouped_store_supported (vectype, size))
2249 return opt_result::failure_at (vinfo->stmt,
2250 "unsupported grouped store\n");
2251 FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (instance), j, node)
2253 vinfo = SLP_TREE_SCALAR_STMTS (node)[0];
2254 vinfo = DR_GROUP_FIRST_ELEMENT (vinfo);
2255 bool single_element_p = !DR_GROUP_NEXT_ELEMENT (vinfo);
2256 size = DR_GROUP_SIZE (vinfo);
2257 vectype = STMT_VINFO_VECTYPE (vinfo);
2258 if (! vect_load_lanes_supported (vectype, size, false)
2259 && ! vect_grouped_load_supported (vectype, single_element_p,
2260 size))
2261 return opt_result::failure_at (vinfo->stmt,
2262 "unsupported grouped load\n");
2266 if (dump_enabled_p ())
2267 dump_printf_loc (MSG_NOTE, vect_location,
2268 "re-trying with SLP disabled\n");
2270 /* Roll back state appropriately. No SLP this time. */
2271 slp = false;
2272 /* Restore the vectorization factor as it was without SLP. */
2273 LOOP_VINFO_VECT_FACTOR (loop_vinfo) = saved_vectorization_factor;
2274 /* Free the SLP instances. */
2275 FOR_EACH_VEC_ELT (LOOP_VINFO_SLP_INSTANCES (loop_vinfo), j, instance)
2276 vect_free_slp_instance (instance, false);
2277 LOOP_VINFO_SLP_INSTANCES (loop_vinfo).release ();
2278 /* Reset SLP type to loop_vect on all stmts. */
2279 for (i = 0; i < LOOP_VINFO_LOOP (loop_vinfo)->num_nodes; ++i)
2281 basic_block bb = LOOP_VINFO_BBS (loop_vinfo)[i];
2282 for (gimple_stmt_iterator si = gsi_start_phis (bb);
2283 !gsi_end_p (si); gsi_next (&si))
2285 stmt_vec_info stmt_info = loop_vinfo->lookup_stmt (gsi_stmt (si));
2286 STMT_SLP_TYPE (stmt_info) = loop_vect;
2288 for (gimple_stmt_iterator si = gsi_start_bb (bb);
2289 !gsi_end_p (si); gsi_next (&si))
2291 stmt_vec_info stmt_info = loop_vinfo->lookup_stmt (gsi_stmt (si));
2292 STMT_SLP_TYPE (stmt_info) = loop_vect;
2293 if (STMT_VINFO_IN_PATTERN_P (stmt_info))
2295 gimple *pattern_def_seq = STMT_VINFO_PATTERN_DEF_SEQ (stmt_info);
2296 stmt_info = STMT_VINFO_RELATED_STMT (stmt_info);
2297 STMT_SLP_TYPE (stmt_info) = loop_vect;
2298 for (gimple_stmt_iterator pi = gsi_start (pattern_def_seq);
2299 !gsi_end_p (pi); gsi_next (&pi))
2300 STMT_SLP_TYPE (loop_vinfo->lookup_stmt (gsi_stmt (pi)))
2301 = loop_vect;
2305 /* Free optimized alias test DDRS. */
2306 LOOP_VINFO_LOWER_BOUNDS (loop_vinfo).truncate (0);
2307 LOOP_VINFO_COMP_ALIAS_DDRS (loop_vinfo).release ();
2308 LOOP_VINFO_CHECK_UNEQUAL_ADDRS (loop_vinfo).release ();
2309 /* Reset target cost data. */
2310 destroy_cost_data (LOOP_VINFO_TARGET_COST_DATA (loop_vinfo));
2311 LOOP_VINFO_TARGET_COST_DATA (loop_vinfo)
2312 = init_cost (LOOP_VINFO_LOOP (loop_vinfo));
2313 /* Reset accumulated rgroup information. */
2314 release_vec_loop_masks (&LOOP_VINFO_MASKS (loop_vinfo));
2315 /* Reset assorted flags. */
2316 LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo) = false;
2317 LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo) = false;
2318 LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo) = 0;
2319 LOOP_VINFO_VERSIONING_THRESHOLD (loop_vinfo) = 0;
2320 LOOP_VINFO_CAN_FULLY_MASK_P (loop_vinfo) = saved_can_fully_mask_p;
2322 goto start_over;
2325 /* Function vect_analyze_loop.
2327 Apply a set of analyses on LOOP, and create a loop_vec_info struct
2328 for it. The different analyses will record information in the
2329 loop_vec_info struct. If ORIG_LOOP_VINFO is not NULL, the epilogue must
2330 be vectorized. */
2331 opt_loop_vec_info
2332 vect_analyze_loop (struct loop *loop, loop_vec_info orig_loop_vinfo,
2333 vec_info_shared *shared)
2335 auto_vector_sizes vector_sizes;
2337 /* Autodetect first vector size we try. */
2338 current_vector_size = 0;
2339 targetm.vectorize.autovectorize_vector_sizes (&vector_sizes,
2340 loop->simdlen != 0);
2341 unsigned int next_size = 0;
2343 DUMP_VECT_SCOPE ("analyze_loop_nest");
2345 if (loop_outer (loop)
2346 && loop_vec_info_for_loop (loop_outer (loop))
2347 && LOOP_VINFO_VECTORIZABLE_P (loop_vec_info_for_loop (loop_outer (loop))))
2348 return opt_loop_vec_info::failure_at (vect_location,
2349 "outer-loop already vectorized.\n");
2351 if (!find_loop_nest (loop, &shared->loop_nest))
2352 return opt_loop_vec_info::failure_at
2353 (vect_location,
2354 "not vectorized: loop nest containing two or more consecutive inner"
2355 " loops cannot be vectorized\n");
2357 unsigned n_stmts = 0;
2358 poly_uint64 autodetected_vector_size = 0;
2359 opt_loop_vec_info first_loop_vinfo = opt_loop_vec_info::success (NULL);
2360 poly_uint64 first_vector_size = 0;
2361 while (1)
2363 /* Check the CFG characteristics of the loop (nesting, entry/exit). */
2364 opt_loop_vec_info loop_vinfo
2365 = vect_analyze_loop_form (loop, shared);
2366 if (!loop_vinfo)
2368 if (dump_enabled_p ())
2369 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2370 "bad loop form.\n");
2371 gcc_checking_assert (first_loop_vinfo == NULL);
2372 return loop_vinfo;
2375 bool fatal = false;
2377 if (orig_loop_vinfo)
2378 LOOP_VINFO_ORIG_LOOP_INFO (loop_vinfo) = orig_loop_vinfo;
2380 opt_result res = vect_analyze_loop_2 (loop_vinfo, fatal, &n_stmts);
2381 if (res)
2383 LOOP_VINFO_VECTORIZABLE_P (loop_vinfo) = 1;
2385 if (loop->simdlen
2386 && maybe_ne (LOOP_VINFO_VECT_FACTOR (loop_vinfo),
2387 (unsigned HOST_WIDE_INT) loop->simdlen))
2389 if (first_loop_vinfo == NULL)
2391 first_loop_vinfo = loop_vinfo;
2392 first_vector_size = current_vector_size;
2393 loop->aux = NULL;
2395 else
2396 delete loop_vinfo;
2398 else
2400 delete first_loop_vinfo;
2401 return loop_vinfo;
2404 else
2405 delete loop_vinfo;
2407 if (next_size == 0)
2408 autodetected_vector_size = current_vector_size;
2410 if (next_size < vector_sizes.length ()
2411 && known_eq (vector_sizes[next_size], autodetected_vector_size))
2412 next_size += 1;
2414 if (fatal)
2416 gcc_checking_assert (first_loop_vinfo == NULL);
2417 return opt_loop_vec_info::propagate_failure (res);
2420 if (next_size == vector_sizes.length ()
2421 || known_eq (current_vector_size, 0U))
2423 if (first_loop_vinfo)
2425 current_vector_size = first_vector_size;
2426 loop->aux = (loop_vec_info) first_loop_vinfo;
2427 if (dump_enabled_p ())
2429 dump_printf_loc (MSG_NOTE, vect_location,
2430 "***** Choosing vector size ");
2431 dump_dec (MSG_NOTE, current_vector_size);
2432 dump_printf (MSG_NOTE, "\n");
2434 return first_loop_vinfo;
2436 else
2437 return opt_loop_vec_info::propagate_failure (res);
2440 /* Try the next biggest vector size. */
2441 current_vector_size = vector_sizes[next_size++];
2442 if (dump_enabled_p ())
2444 dump_printf_loc (MSG_NOTE, vect_location,
2445 "***** Re-trying analysis with "
2446 "vector size ");
2447 dump_dec (MSG_NOTE, current_vector_size);
2448 dump_printf (MSG_NOTE, "\n");
2453 /* Return true if there is an in-order reduction function for CODE, storing
2454 it in *REDUC_FN if so. */
2456 static bool
2457 fold_left_reduction_fn (tree_code code, internal_fn *reduc_fn)
2459 switch (code)
2461 case PLUS_EXPR:
2462 *reduc_fn = IFN_FOLD_LEFT_PLUS;
2463 return true;
2465 default:
2466 return false;
2470 /* Function reduction_fn_for_scalar_code
2472 Input:
2473 CODE - tree_code of the reduction operation.
2475 Output:
2476 REDUC_FN - the corresponding internal function to be used to reduce the
2477 vector of partial results into a single scalar result, or IFN_LAST
2478 if the operation is a supported reduction operation, but does not have
2479 such an internal function.
2481 Return FALSE if CODE currently cannot be vectorized as a reduction. */
2483 static bool
2484 reduction_fn_for_scalar_code (enum tree_code code, internal_fn *reduc_fn)
2486 switch (code)
2488 case MAX_EXPR:
2489 *reduc_fn = IFN_REDUC_MAX;
2490 return true;
2492 case MIN_EXPR:
2493 *reduc_fn = IFN_REDUC_MIN;
2494 return true;
2496 case PLUS_EXPR:
2497 *reduc_fn = IFN_REDUC_PLUS;
2498 return true;
2500 case BIT_AND_EXPR:
2501 *reduc_fn = IFN_REDUC_AND;
2502 return true;
2504 case BIT_IOR_EXPR:
2505 *reduc_fn = IFN_REDUC_IOR;
2506 return true;
2508 case BIT_XOR_EXPR:
2509 *reduc_fn = IFN_REDUC_XOR;
2510 return true;
2512 case MULT_EXPR:
2513 case MINUS_EXPR:
2514 *reduc_fn = IFN_LAST;
2515 return true;
2517 default:
2518 return false;
2522 /* If there is a neutral value X such that SLP reduction NODE would not
2523 be affected by the introduction of additional X elements, return that X,
2524 otherwise return null. CODE is the code of the reduction. REDUC_CHAIN
2525 is true if the SLP statements perform a single reduction, false if each
2526 statement performs an independent reduction. */
2528 static tree
2529 neutral_op_for_slp_reduction (slp_tree slp_node, tree_code code,
2530 bool reduc_chain)
2532 vec<stmt_vec_info> stmts = SLP_TREE_SCALAR_STMTS (slp_node);
2533 stmt_vec_info stmt_vinfo = stmts[0];
2534 tree vector_type = STMT_VINFO_VECTYPE (stmt_vinfo);
2535 tree scalar_type = TREE_TYPE (vector_type);
2536 struct loop *loop = gimple_bb (stmt_vinfo->stmt)->loop_father;
2537 gcc_assert (loop);
2539 switch (code)
2541 case WIDEN_SUM_EXPR:
2542 case DOT_PROD_EXPR:
2543 case SAD_EXPR:
2544 case PLUS_EXPR:
2545 case MINUS_EXPR:
2546 case BIT_IOR_EXPR:
2547 case BIT_XOR_EXPR:
2548 return build_zero_cst (scalar_type);
2550 case MULT_EXPR:
2551 return build_one_cst (scalar_type);
2553 case BIT_AND_EXPR:
2554 return build_all_ones_cst (scalar_type);
2556 case MAX_EXPR:
2557 case MIN_EXPR:
2558 /* For MIN/MAX the initial values are neutral. A reduction chain
2559 has only a single initial value, so that value is neutral for
2560 all statements. */
2561 if (reduc_chain)
2562 return PHI_ARG_DEF_FROM_EDGE (stmt_vinfo->stmt,
2563 loop_preheader_edge (loop));
2564 return NULL_TREE;
2566 default:
2567 return NULL_TREE;
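/* Illustrative example (not from this file) of the neutral values
   returned above: padding a partial vector with the neutral element
   leaves the reduction result unchanged.  */
static int
example_neutral_padding (void)
{
  int tail[3] = { 5, 7, 11 };
  int sum = 0, prod = 1, all = -1;      /* 0, 1 and all-ones are neutral.  */
  for (int i = 0; i < 3; i++)
    {
      sum += tail[i];
      prod *= tail[i];
      all &= tail[i];
    }
  /* A fourth element holding 0 (for +), 1 (for *) or ~0 (for &) would
     not change sum, prod or all, so a 4-lane vector can be used.  */
  return sum + prod + all;
}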
2571 /* Error reporting helper for vect_is_simple_reduction below. GIMPLE statement
2572 STMT is printed with a message MSG. */
2574 static void
2575 report_vect_op (dump_flags_t msg_type, gimple *stmt, const char *msg)
2577 dump_printf_loc (msg_type, vect_location, "%s%G", msg, stmt);
2580 /* DEF_STMT_INFO occurs in a loop that contains a potential reduction
2581 operation. Return true if the results of DEF_STMT_INFO are something
2582 that can be accumulated by such a reduction. */
2584 static bool
2585 vect_valid_reduction_input_p (stmt_vec_info def_stmt_info)
2587 return (is_gimple_assign (def_stmt_info->stmt)
2588 || is_gimple_call (def_stmt_info->stmt)
2589 || STMT_VINFO_DEF_TYPE (def_stmt_info) == vect_induction_def
2590 || (gimple_code (def_stmt_info->stmt) == GIMPLE_PHI
2591 && STMT_VINFO_DEF_TYPE (def_stmt_info) == vect_internal_def
2592 && !is_loop_header_bb_p (gimple_bb (def_stmt_info->stmt))));
2595 /* Detect SLP reduction of the form:
2597 #a1 = phi <a5, a0>
2598 a2 = operation (a1)
2599 a3 = operation (a2)
2600 a4 = operation (a3)
2601 a5 = operation (a4)
2603 #a = phi <a5>
2605 PHI is the reduction phi node (#a1 = phi <a5, a0> above)
2606 FIRST_STMT is the first reduction stmt in the chain
2607 (a2 = operation (a1)).
2609 Return TRUE if a reduction chain was detected. */
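/* A hypothetical source loop (illustrative only) whose gimple form
   matches the chain documented above: each addition becomes one
   statement a2..a5 feeding back into the reduction phi.  */
static int
example_reduction_chain (const int *a, int n)
{
  int s = 0;
  for (int i = 0; i < n; i++)
    s = s + a[4 * i] + a[4 * i + 1] + a[4 * i + 2] + a[4 * i + 3];
  return s;
}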
2611 static bool
2612 vect_is_slp_reduction (loop_vec_info loop_info, gimple *phi,
2613 gimple *first_stmt)
2615 struct loop *loop = (gimple_bb (phi))->loop_father;
2616 struct loop *vect_loop = LOOP_VINFO_LOOP (loop_info);
2617 enum tree_code code;
2618 gimple *loop_use_stmt = NULL;
2619 stmt_vec_info use_stmt_info;
2620 tree lhs;
2621 imm_use_iterator imm_iter;
2622 use_operand_p use_p;
2623 int nloop_uses, size = 0, n_out_of_loop_uses;
2624 bool found = false;
2626 if (loop != vect_loop)
2627 return false;
2629 auto_vec<stmt_vec_info, 8> reduc_chain;
2630 lhs = PHI_RESULT (phi);
2631 code = gimple_assign_rhs_code (first_stmt);
2632 while (1)
2634 nloop_uses = 0;
2635 n_out_of_loop_uses = 0;
2636 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, lhs)
2638 gimple *use_stmt = USE_STMT (use_p);
2639 if (is_gimple_debug (use_stmt))
2640 continue;
2642 /* Check if we got back to the reduction phi. */
2643 if (use_stmt == phi)
2645 loop_use_stmt = use_stmt;
2646 found = true;
2647 break;
2650 if (flow_bb_inside_loop_p (loop, gimple_bb (use_stmt)))
2652 loop_use_stmt = use_stmt;
2653 nloop_uses++;
2655 else
2656 n_out_of_loop_uses++;
2658 /* There can be either a single use in the loop or two uses in
2659 phi nodes. */
2660 if (nloop_uses > 1 || (n_out_of_loop_uses && nloop_uses))
2661 return false;
2664 if (found)
2665 break;
2667 /* We reached a statement with no loop uses. */
2668 if (nloop_uses == 0)
2669 return false;
2671 /* This is a loop exit phi, and we haven't reached the reduction phi. */
2672 if (gimple_code (loop_use_stmt) == GIMPLE_PHI)
2673 return false;
2675 if (!is_gimple_assign (loop_use_stmt)
2676 || code != gimple_assign_rhs_code (loop_use_stmt)
2677 || !flow_bb_inside_loop_p (loop, gimple_bb (loop_use_stmt)))
2678 return false;
2680 /* Insert USE_STMT into reduction chain. */
2681 use_stmt_info = loop_info->lookup_stmt (loop_use_stmt);
2682 reduc_chain.safe_push (use_stmt_info);
2684 lhs = gimple_assign_lhs (loop_use_stmt);
2685 size++;
2688 if (!found || loop_use_stmt != phi || size < 2)
2689 return false;
2691 /* Swap the operands, if needed, to make the reduction operand be the second
2692 operand. */
2693 lhs = PHI_RESULT (phi);
2694 for (unsigned i = 0; i < reduc_chain.length (); ++i)
2696 gassign *next_stmt = as_a <gassign *> (reduc_chain[i]->stmt);
2697 if (gimple_assign_rhs2 (next_stmt) == lhs)
2699 tree op = gimple_assign_rhs1 (next_stmt);
2700 stmt_vec_info def_stmt_info = loop_info->lookup_def (op);
2702 /* Check that the other def is either defined in the loop
2703 ("vect_internal_def"), or it's an induction (defined by a
2704 loop-header phi-node). */
2705 if (def_stmt_info
2706 && flow_bb_inside_loop_p (loop, gimple_bb (def_stmt_info->stmt))
2707 && vect_valid_reduction_input_p (def_stmt_info))
2709 lhs = gimple_assign_lhs (next_stmt);
2710 continue;
2713 return false;
2715 else
2717 tree op = gimple_assign_rhs2 (next_stmt);
2718 stmt_vec_info def_stmt_info = loop_info->lookup_def (op);
2720 /* Check that the other def is either defined in the loop
2721 ("vect_internal_def"), or it's an induction (defined by a
2722 loop-header phi-node). */
2723 if (def_stmt_info
2724 && flow_bb_inside_loop_p (loop, gimple_bb (def_stmt_info->stmt))
2725 && vect_valid_reduction_input_p (def_stmt_info))
2727 if (dump_enabled_p ())
2728 dump_printf_loc (MSG_NOTE, vect_location, "swapping oprnds: %G",
2729 next_stmt);
2731 swap_ssa_operands (next_stmt,
2732 gimple_assign_rhs1_ptr (next_stmt),
2733 gimple_assign_rhs2_ptr (next_stmt));
2734 update_stmt (next_stmt);
2736 if (CONSTANT_CLASS_P (gimple_assign_rhs1 (next_stmt)))
2737 LOOP_VINFO_OPERANDS_SWAPPED (loop_info) = true;
2739 else
2740 return false;
2743 lhs = gimple_assign_lhs (next_stmt);
2746 /* Build up the actual chain. */
2747 for (unsigned i = 0; i < reduc_chain.length () - 1; ++i)
2749 REDUC_GROUP_FIRST_ELEMENT (reduc_chain[i]) = reduc_chain[0];
2750 REDUC_GROUP_NEXT_ELEMENT (reduc_chain[i]) = reduc_chain[i+1];
2752 REDUC_GROUP_FIRST_ELEMENT (reduc_chain.last ()) = reduc_chain[0];
2753 REDUC_GROUP_NEXT_ELEMENT (reduc_chain.last ()) = NULL;
2755 /* Save the chain for further analysis in SLP detection. */
2756 LOOP_VINFO_REDUCTION_CHAINS (loop_info).safe_push (reduc_chain[0]);
2757 REDUC_GROUP_SIZE (reduc_chain[0]) = size;
2759 return true;
2762 /* Return true if we need an in-order reduction for operation CODE
2763 on type TYPE. NEED_WRAPPING_INTEGRAL_OVERFLOW is true if integer
2764 overflow must wrap. */
2766 static bool
2767 needs_fold_left_reduction_p (tree type, tree_code code,
2768 bool need_wrapping_integral_overflow)
2770 /* CHECKME: check for !flag_finite_math_only too? */
2771 if (SCALAR_FLOAT_TYPE_P (type))
2772 switch (code)
2774 case MIN_EXPR:
2775 case MAX_EXPR:
2776 return false;
2778 default:
2779 return !flag_associative_math;
2782 if (INTEGRAL_TYPE_P (type))
2784 if (!operation_no_trapping_overflow (type, code))
2785 return true;
2786 if (need_wrapping_integral_overflow
2787 && !TYPE_OVERFLOW_WRAPS (type)
2788 && operation_can_overflow (code))
2789 return true;
2790 return false;
2793 if (SAT_FIXED_POINT_TYPE_P (type))
2794 return true;
2796 return false;
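/* Illustrative example (not from this file) of why the check above
   forces an in-order (fold-left) reduction for floating point unless
   -fassociative-math is given: reassociating a float sum can change
   the rounded result, so the accumulation order must be preserved.  */
static float
example_in_order_sum (const float *a, int n)
{
  float s = 0.0f;
  for (int i = 0; i < n; i++)
    s += a[i];          /* Must be accumulated strictly left to right.  */
  return s;
}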
2799 /* Return true if the reduction PHI in LOOP with latch arg LOOP_ARG and
2800 reduction operation CODE has a handled computation expression. */
2802 bool
2803 check_reduction_path (dump_user_location_t loc, loop_p loop, gphi *phi,
2804 tree loop_arg, enum tree_code code)
2806 auto_vec<std::pair<ssa_op_iter, use_operand_p> > path;
2807 auto_bitmap visited;
2808 tree lookfor = PHI_RESULT (phi);
2809 ssa_op_iter curri;
2810 use_operand_p curr = op_iter_init_phiuse (&curri, phi, SSA_OP_USE);
2811 while (USE_FROM_PTR (curr) != loop_arg)
2812 curr = op_iter_next_use (&curri);
2813 curri.i = curri.numops;
2816 path.safe_push (std::make_pair (curri, curr));
2817 tree use = USE_FROM_PTR (curr);
2818 if (use == lookfor)
2819 break;
2820 gimple *def = SSA_NAME_DEF_STMT (use);
2821 if (gimple_nop_p (def)
2822 || ! flow_bb_inside_loop_p (loop, gimple_bb (def)))
2824 pop:
2827 std::pair<ssa_op_iter, use_operand_p> x = path.pop ();
2828 curri = x.first;
2829 curr = x.second;
2831 curr = op_iter_next_use (&curri);
2832 /* Skip already visited or non-SSA operands (from iterating
2833 over PHI args). */
2834 while (curr != NULL_USE_OPERAND_P
2835 && (TREE_CODE (USE_FROM_PTR (curr)) != SSA_NAME
2836 || ! bitmap_set_bit (visited,
2837 SSA_NAME_VERSION
2838 (USE_FROM_PTR (curr)))));
2840 while (curr == NULL_USE_OPERAND_P && ! path.is_empty ());
2841 if (curr == NULL_USE_OPERAND_P)
2842 break;
2844 else
2846 if (gimple_code (def) == GIMPLE_PHI)
2847 curr = op_iter_init_phiuse (&curri, as_a <gphi *>(def), SSA_OP_USE);
2848 else
2849 curr = op_iter_init_use (&curri, def, SSA_OP_USE);
2850 while (curr != NULL_USE_OPERAND_P
2851 && (TREE_CODE (USE_FROM_PTR (curr)) != SSA_NAME
2852 || ! bitmap_set_bit (visited,
2853 SSA_NAME_VERSION
2854 (USE_FROM_PTR (curr)))))
2855 curr = op_iter_next_use (&curri);
2856 if (curr == NULL_USE_OPERAND_P)
2857 goto pop;
2860 while (1);
2861 if (dump_file && (dump_flags & TDF_DETAILS))
2863 dump_printf_loc (MSG_NOTE, loc, "reduction path: ");
2864 unsigned i;
2865 std::pair<ssa_op_iter, use_operand_p> *x;
2866 FOR_EACH_VEC_ELT (path, i, x)
2867 dump_printf (MSG_NOTE, "%T ", USE_FROM_PTR (x->second));
2868 dump_printf (MSG_NOTE, "\n");
2871 /* Check whether the reduction path detected is valid. */
2872 bool fail = path.length () == 0;
2873 bool neg = false;
2874 for (unsigned i = 1; i < path.length (); ++i)
2876 gimple *use_stmt = USE_STMT (path[i].second);
2877 tree op = USE_FROM_PTR (path[i].second);
2878 if (! has_single_use (op)
2879 || ! is_gimple_assign (use_stmt))
2881 fail = true;
2882 break;
2884 if (gimple_assign_rhs_code (use_stmt) != code)
2886 if (code == PLUS_EXPR
2887 && gimple_assign_rhs_code (use_stmt) == MINUS_EXPR)
2889 /* Track whether we negate the reduction value each iteration. */
2890 if (gimple_assign_rhs2 (use_stmt) == op)
2891 neg = ! neg;
2893 else
2895 fail = true;
2896 break;
2900 return ! fail && ! neg;
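/* Illustrative example (not from this file) of the 'neg' rejection
   above: here the accumulator is the subtrahend, so its sign flips on
   every iteration and the loop is not a simple PLUS reduction.  */
static int
example_alternating_accumulator (const int *a, int n)
{
  int s = 0;
  for (int i = 0; i < n; i++)
    s = a[i] - s;       /* s is negated each iteration.  */
  return s;
}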
2904 /* Function vect_is_simple_reduction
2906 (1) Detect a cross-iteration def-use cycle that represents a simple
2907 reduction computation. We look for the following pattern:
2909 loop_header:
2910 a1 = phi < a0, a2 >
2911 a3 = ...
2912 a2 = operation (a3, a1)
2916 a3 = ...
2917 loop_header:
2918 a1 = phi < a0, a2 >
2919 a2 = operation (a3, a1)
2921 such that:
2922 1. operation is commutative and associative and it is safe to
2923 change the order of the computation
2924 2. no uses for a2 in the loop (a2 is used out of the loop)
2925 3. no uses of a1 in the loop besides the reduction operation
2926 4. no uses of a1 outside the loop.
2928 Conditions 1,4 are tested here.
2929 Conditions 2,3 are tested in vect_mark_stmts_to_be_vectorized.
2931 (2) Detect a cross-iteration def-use cycle in nested loops, i.e.,
2932 nested cycles.
2934 (3) Detect cycles of phi nodes in outer-loop vectorization, i.e., double
2935 reductions:
2937 a1 = phi < a0, a2 >
2938 inner loop (def of a3)
2939 a2 = phi < a3 >
2941 (4) Detect condition expressions, i.e.:
2942 for (int i = 0; i < N; i++)
2943 if (a[i] < val)
2944 ret_val = a[i];
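/* A hypothetical source loop (illustrative only) matching pattern (1)
   above: a1 is the phi of the accumulator, a3 the loaded value and a2
   the updated accumulator carried to the next iteration.  */
static int
example_simple_reduction (const int *a, int n)
{
  int sum = 0;
  for (int i = 0; i < n; i++)
    sum = sum + a[i];
  return sum;
}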
2948 static stmt_vec_info
2949 vect_is_simple_reduction (loop_vec_info loop_info, stmt_vec_info phi_info,
2950 bool *double_reduc,
2951 bool need_wrapping_integral_overflow,
2952 enum vect_reduction_type *v_reduc_type)
2954 gphi *phi = as_a <gphi *> (phi_info->stmt);
2955 struct loop *loop = (gimple_bb (phi))->loop_father;
2956 struct loop *vect_loop = LOOP_VINFO_LOOP (loop_info);
2957 bool nested_in_vect_loop = flow_loop_nested_p (vect_loop, loop);
2958 gimple *phi_use_stmt = NULL;
2959 enum tree_code orig_code, code;
2960 tree op1, op2, op3 = NULL_TREE, op4 = NULL_TREE;
2961 tree type;
2962 tree name;
2963 imm_use_iterator imm_iter;
2964 use_operand_p use_p;
2965 bool phi_def;
2967 *double_reduc = false;
2968 *v_reduc_type = TREE_CODE_REDUCTION;
2970 tree phi_name = PHI_RESULT (phi);
2971 /* ??? If there are no uses of the PHI result the inner loop reduction
2972 won't be detected as possibly double-reduction by vectorizable_reduction
2973 because that tries to walk the PHI arg from the preheader edge which
2974 can be constant. See PR60382. */
2975 if (has_zero_uses (phi_name))
2976 return NULL;
2977 unsigned nphi_def_loop_uses = 0;
2978 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, phi_name)
2980 gimple *use_stmt = USE_STMT (use_p);
2981 if (is_gimple_debug (use_stmt))
2982 continue;
2984 if (!flow_bb_inside_loop_p (loop, gimple_bb (use_stmt)))
2986 if (dump_enabled_p ())
2987 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2988 "intermediate value used outside loop.\n");
2990 return NULL;
2993 nphi_def_loop_uses++;
2994 phi_use_stmt = use_stmt;
2997 edge latch_e = loop_latch_edge (loop);
2998 tree loop_arg = PHI_ARG_DEF_FROM_EDGE (phi, latch_e);
2999 if (TREE_CODE (loop_arg) != SSA_NAME)
3001 if (dump_enabled_p ())
3002 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3003 "reduction: not ssa_name: %T\n", loop_arg);
3004 return NULL;
3007 stmt_vec_info def_stmt_info = loop_info->lookup_def (loop_arg);
3008 if (!def_stmt_info
3009 || !flow_bb_inside_loop_p (loop, gimple_bb (def_stmt_info->stmt)))
3010 return NULL;
3012 if (gassign *def_stmt = dyn_cast <gassign *> (def_stmt_info->stmt))
3014 name = gimple_assign_lhs (def_stmt);
3015 phi_def = false;
3017 else if (gphi *def_stmt = dyn_cast <gphi *> (def_stmt_info->stmt))
3019 name = PHI_RESULT (def_stmt);
3020 phi_def = true;
3022 else
3024 if (dump_enabled_p ())
3025 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3026 "reduction: unhandled reduction operation: %G",
3027 def_stmt_info->stmt);
3028 return NULL;
3031 unsigned nlatch_def_loop_uses = 0;
3032 auto_vec<gphi *, 3> lcphis;
3033 bool inner_loop_of_double_reduc = false;
3034 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, name)
3036 gimple *use_stmt = USE_STMT (use_p);
3037 if (is_gimple_debug (use_stmt))
3038 continue;
3039 if (flow_bb_inside_loop_p (loop, gimple_bb (use_stmt)))
3040 nlatch_def_loop_uses++;
3041 else
3043 /* We can have more than one loop-closed PHI. */
3044 lcphis.safe_push (as_a <gphi *> (use_stmt));
3045 if (nested_in_vect_loop
3046 && (STMT_VINFO_DEF_TYPE (loop_info->lookup_stmt (use_stmt))
3047 == vect_double_reduction_def))
3048 inner_loop_of_double_reduc = true;
3052 /* If this isn't a nested cycle or if the nested cycle reduction value
3053 is used outside of the inner loop, we cannot handle uses of the reduction
3054 value. */
3055 if ((!nested_in_vect_loop || inner_loop_of_double_reduc)
3056 && (nlatch_def_loop_uses > 1 || nphi_def_loop_uses > 1))
3058 if (dump_enabled_p ())
3059 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3060 "reduction used in loop.\n");
3061 return NULL;
3064 /* If DEF_STMT is a phi node itself, we expect it to have a single argument
3065 defined in the inner loop. */
3066 if (phi_def)
3068 gphi *def_stmt = as_a <gphi *> (def_stmt_info->stmt);
3069 op1 = PHI_ARG_DEF (def_stmt, 0);
3071 if (gimple_phi_num_args (def_stmt) != 1
3072 || TREE_CODE (op1) != SSA_NAME)
3074 if (dump_enabled_p ())
3075 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3076 "unsupported phi node definition.\n");
3078 return NULL;
3081 gimple *def1 = SSA_NAME_DEF_STMT (op1);
3082 if (gimple_bb (def1)
3083 && flow_bb_inside_loop_p (loop, gimple_bb (def_stmt))
3084 && loop->inner
3085 && flow_bb_inside_loop_p (loop->inner, gimple_bb (def1))
3086 && is_gimple_assign (def1)
3087 && is_a <gphi *> (phi_use_stmt)
3088 && flow_bb_inside_loop_p (loop->inner, gimple_bb (phi_use_stmt)))
3090 if (dump_enabled_p ())
3091 report_vect_op (MSG_NOTE, def_stmt,
3092 "detected double reduction: ");
3094 *double_reduc = true;
3095 return def_stmt_info;
3098 return NULL;
3101 /* If we are vectorizing an inner reduction, we execute it in the
3102 original order only when we are not dealing with a
3103 double reduction. */
3104 bool check_reduction = true;
3105 if (flow_loop_nested_p (vect_loop, loop))
3107 gphi *lcphi;
3108 unsigned i;
3109 check_reduction = false;
3110 FOR_EACH_VEC_ELT (lcphis, i, lcphi)
3111 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, gimple_phi_result (lcphi))
3113 gimple *use_stmt = USE_STMT (use_p);
3114 if (is_gimple_debug (use_stmt))
3115 continue;
3116 if (! flow_bb_inside_loop_p (vect_loop, gimple_bb (use_stmt)))
3117 check_reduction = true;
3121 gassign *def_stmt = as_a <gassign *> (def_stmt_info->stmt);
3122 code = orig_code = gimple_assign_rhs_code (def_stmt);
3124 if (nested_in_vect_loop && !check_reduction)
3126 /* FIXME: Even for non-reductions code generation is funneled
3127 through vectorizable_reduction for the stmt defining the
3128 PHI latch value. So we have to artificially restrict ourselves
3129 to the supported operations. */
3130 switch (get_gimple_rhs_class (code))
3132 case GIMPLE_BINARY_RHS:
3133 case GIMPLE_TERNARY_RHS:
3134 break;
3135 default:
3136 /* Not supported by vectorizable_reduction. */
3137 if (dump_enabled_p ())
3138 report_vect_op (MSG_MISSED_OPTIMIZATION, def_stmt,
3139 "nested cycle: not handled operation: ");
3140 return NULL;
3142 if (dump_enabled_p ())
3143 report_vect_op (MSG_NOTE, def_stmt, "detected nested cycle: ");
3144 return def_stmt_info;
3147 /* We can handle "res -= x[i]", which is non-associative, by
3148 simply rewriting this into "res += -x[i]". Avoid changing the
3149 gimple instruction for the first simple tests and only do this
3150 if we're allowed to change code at all. */
3151 if (code == MINUS_EXPR && gimple_assign_rhs2 (def_stmt) != phi_name)
3152 code = PLUS_EXPR;
3154 if (code == COND_EXPR)
3156 if (! nested_in_vect_loop)
3157 *v_reduc_type = COND_REDUCTION;
3159 op3 = gimple_assign_rhs1 (def_stmt);
3160 if (COMPARISON_CLASS_P (op3))
3162 op4 = TREE_OPERAND (op3, 1);
3163 op3 = TREE_OPERAND (op3, 0);
3165 if (op3 == phi_name || op4 == phi_name)
3167 if (dump_enabled_p ())
3168 report_vect_op (MSG_MISSED_OPTIMIZATION, def_stmt,
3169 "reduction: condition depends on previous"
3170 " iteration: ");
3171 return NULL;
3174 op1 = gimple_assign_rhs2 (def_stmt);
3175 op2 = gimple_assign_rhs3 (def_stmt);
3177 else if (!commutative_tree_code (code) || !associative_tree_code (code))
3179 if (dump_enabled_p ())
3180 report_vect_op (MSG_MISSED_OPTIMIZATION, def_stmt,
3181 "reduction: not commutative/associative: ");
3182 return NULL;
3184 else if (get_gimple_rhs_class (code) == GIMPLE_BINARY_RHS)
3186 op1 = gimple_assign_rhs1 (def_stmt);
3187 op2 = gimple_assign_rhs2 (def_stmt);
3189 else
3191 if (dump_enabled_p ())
3192 report_vect_op (MSG_MISSED_OPTIMIZATION, def_stmt,
3193 "reduction: not handled operation: ");
3194 return NULL;
3197 if (TREE_CODE (op1) != SSA_NAME && TREE_CODE (op2) != SSA_NAME)
3199 if (dump_enabled_p ())
3200 report_vect_op (MSG_MISSED_OPTIMIZATION, def_stmt,
3201 "reduction: both uses not ssa_names: ");
3203 return NULL;
3206 type = TREE_TYPE (gimple_assign_lhs (def_stmt));
3207 if ((TREE_CODE (op1) == SSA_NAME
3208 && !types_compatible_p (type,TREE_TYPE (op1)))
3209 || (TREE_CODE (op2) == SSA_NAME
3210 && !types_compatible_p (type, TREE_TYPE (op2)))
3211 || (op3 && TREE_CODE (op3) == SSA_NAME
3212 && !types_compatible_p (type, TREE_TYPE (op3)))
3213 || (op4 && TREE_CODE (op4) == SSA_NAME
3214 && !types_compatible_p (type, TREE_TYPE (op4))))
3216 if (dump_enabled_p ())
3218 dump_printf_loc (MSG_NOTE, vect_location,
3219 "reduction: multiple types: operation type: "
3220 "%T, operands types: %T,%T",
3221 type, TREE_TYPE (op1), TREE_TYPE (op2));
3222 if (op3)
3223 dump_printf (MSG_NOTE, ",%T", TREE_TYPE (op3));
3225 if (op4)
3226 dump_printf (MSG_NOTE, ",%T", TREE_TYPE (op4));
3227 dump_printf (MSG_NOTE, "\n");
3230 return NULL;
3233 /* Check whether it's ok to change the order of the computation.
3234 Generally, when vectorizing a reduction we change the order of the
3235 computation. This may change the behavior of the program in some
3236 cases, so we need to check that this is ok. One exception is when
3237 vectorizing an outer-loop: the inner-loop is executed sequentially,
3238 and therefore vectorizing reductions in the inner-loop during
3239 outer-loop vectorization is safe. */
3240 if (check_reduction
3241 && *v_reduc_type == TREE_CODE_REDUCTION
3242 && needs_fold_left_reduction_p (type, code,
3243 need_wrapping_integral_overflow))
3244 *v_reduc_type = FOLD_LEFT_REDUCTION;
3246 /* Reduction is safe. We're dealing with one of the following:
3247 1) integer arithmetic and no trapv
3248 2) floating point arithmetic, and special flags permit this optimization
3249 3) nested cycle (i.e., outer loop vectorization). */
3250 stmt_vec_info def1_info = loop_info->lookup_def (op1);
3251 stmt_vec_info def2_info = loop_info->lookup_def (op2);
3252 if (code != COND_EXPR && !def1_info && !def2_info)
3254 if (dump_enabled_p ())
3255 report_vect_op (MSG_NOTE, def_stmt, "reduction: no defs for operands: ");
3256 return NULL;
3259 /* Check that one def is the reduction def, defined by PHI,
3260 the other def is either defined in the loop ("vect_internal_def"),
3261 or it's an induction (defined by a loop-header phi-node). */
3263 if (def2_info
3264 && def2_info->stmt == phi
3265 && (code == COND_EXPR
3266 || !def1_info
3267 || !flow_bb_inside_loop_p (loop, gimple_bb (def1_info->stmt))
3268 || vect_valid_reduction_input_p (def1_info)))
3270 if (dump_enabled_p ())
3271 report_vect_op (MSG_NOTE, def_stmt, "detected reduction: ");
3272 return def_stmt_info;
3275 if (def1_info
3276 && def1_info->stmt == phi
3277 && (code == COND_EXPR
3278 || !def2_info
3279 || !flow_bb_inside_loop_p (loop, gimple_bb (def2_info->stmt))
3280 || vect_valid_reduction_input_p (def2_info)))
3282 if (! nested_in_vect_loop && orig_code != MINUS_EXPR)
3284 /* Check if we can swap operands (just for simplicity - so that
3285 the rest of the code can assume that the reduction variable
3286 is always the last (second) argument). */
3287 if (code == COND_EXPR)
3289 /* Swap cond_expr by inverting the condition. */
3290 tree cond_expr = gimple_assign_rhs1 (def_stmt);
3291 enum tree_code invert_code = ERROR_MARK;
3292 enum tree_code cond_code = TREE_CODE (cond_expr);
3294 if (TREE_CODE_CLASS (cond_code) == tcc_comparison)
3296 bool honor_nans = HONOR_NANS (TREE_OPERAND (cond_expr, 0));
3297 invert_code = invert_tree_comparison (cond_code, honor_nans);
3299 if (invert_code != ERROR_MARK)
3301 TREE_SET_CODE (cond_expr, invert_code);
3302 swap_ssa_operands (def_stmt,
3303 gimple_assign_rhs2_ptr (def_stmt),
3304 gimple_assign_rhs3_ptr (def_stmt));
3306 else
3308 if (dump_enabled_p ())
3309 report_vect_op (MSG_NOTE, def_stmt,
3310 "detected reduction: cannot swap operands "
3311 "for cond_expr");
3312 return NULL;
3315 else
3316 swap_ssa_operands (def_stmt, gimple_assign_rhs1_ptr (def_stmt),
3317 gimple_assign_rhs2_ptr (def_stmt));
3319 if (dump_enabled_p ())
3320 report_vect_op (MSG_NOTE, def_stmt,
3321 "detected reduction: need to swap operands: ");
3323 if (CONSTANT_CLASS_P (gimple_assign_rhs1 (def_stmt)))
3324 LOOP_VINFO_OPERANDS_SWAPPED (loop_info) = true;
3326 else
3328 if (dump_enabled_p ())
3329 report_vect_op (MSG_NOTE, def_stmt, "detected reduction: ");
3332 return def_stmt_info;
3335 /* Try to find SLP reduction chain. */
3336 if (! nested_in_vect_loop
3337 && code != COND_EXPR
3338 && orig_code != MINUS_EXPR
3339 && vect_is_slp_reduction (loop_info, phi, def_stmt))
3341 if (dump_enabled_p ())
3342 report_vect_op (MSG_NOTE, def_stmt,
3343 "reduction: detected reduction chain: ");
3345 return def_stmt_info;
3348 /* Look for the expression computing loop_arg from loop PHI result. */
3349 if (check_reduction_path (vect_location, loop, phi, loop_arg, code))
3350 return def_stmt_info;
3352 if (dump_enabled_p ())
3354 report_vect_op (MSG_MISSED_OPTIMIZATION, def_stmt,
3355 "reduction: unknown pattern: ");
3358 return NULL;
3361 /* Wrapper around vect_is_simple_reduction, which will modify code
3362 in-place if it enables detection of more reductions. Arguments
3363 as there. */
3365 stmt_vec_info
3366 vect_force_simple_reduction (loop_vec_info loop_info, stmt_vec_info phi_info,
3367 bool *double_reduc,
3368 bool need_wrapping_integral_overflow)
3370 enum vect_reduction_type v_reduc_type;
3371 stmt_vec_info def_info
3372 = vect_is_simple_reduction (loop_info, phi_info, double_reduc,
3373 need_wrapping_integral_overflow,
3374 &v_reduc_type);
3375 if (def_info)
3377 STMT_VINFO_REDUC_TYPE (phi_info) = v_reduc_type;
3378 STMT_VINFO_REDUC_DEF (phi_info) = def_info;
3379 STMT_VINFO_REDUC_TYPE (def_info) = v_reduc_type;
3380 STMT_VINFO_REDUC_DEF (def_info) = phi_info;
3382 return def_info;
3385 /* Calculate cost of peeling the loop PEEL_ITERS_PROLOGUE times. */
3386 int
3387 vect_get_known_peeling_cost (loop_vec_info loop_vinfo, int peel_iters_prologue,
3388 int *peel_iters_epilogue,
3389 stmt_vector_for_cost *scalar_cost_vec,
3390 stmt_vector_for_cost *prologue_cost_vec,
3391 stmt_vector_for_cost *epilogue_cost_vec)
3393 int retval = 0;
3394 int assumed_vf = vect_vf_for_cost (loop_vinfo);
3396 if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
3398 *peel_iters_epilogue = assumed_vf / 2;
3399 if (dump_enabled_p ())
3400 dump_printf_loc (MSG_NOTE, vect_location,
3401 "cost model: epilogue peel iters set to vf/2 "
3402 "because loop iterations are unknown .\n");
3404 /* If peeled iterations are known but the number of scalar loop
3405 iterations is unknown, count a taken branch per peeled loop. */
3406 retval = record_stmt_cost (prologue_cost_vec, 1, cond_branch_taken,
3407 NULL, 0, vect_prologue);
3408 retval = record_stmt_cost (prologue_cost_vec, 1, cond_branch_taken,
3409 NULL, 0, vect_epilogue);
3411 else
3413 int niters = LOOP_VINFO_INT_NITERS (loop_vinfo);
3414 peel_iters_prologue = niters < peel_iters_prologue ?
3415 niters : peel_iters_prologue;
3416 *peel_iters_epilogue = (niters - peel_iters_prologue) % assumed_vf;
3417 /* If we need to peel for gaps but the computed epilogue peel count is
3418 zero, we still have to peel VF iterations for the epilogue. */
3419 if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo) && !*peel_iters_epilogue)
3420 *peel_iters_epilogue = assumed_vf;
3423 stmt_info_for_cost *si;
3424 int j;
3425 if (peel_iters_prologue)
3426 FOR_EACH_VEC_ELT (*scalar_cost_vec, j, si)
3427 retval += record_stmt_cost (prologue_cost_vec,
3428 si->count * peel_iters_prologue,
3429 si->kind, si->stmt_info, si->misalign,
3430 vect_prologue);
3431 if (*peel_iters_epilogue)
3432 FOR_EACH_VEC_ELT (*scalar_cost_vec, j, si)
3433 retval += record_stmt_cost (epilogue_cost_vec,
3434 si->count * *peel_iters_epilogue,
3435 si->kind, si->stmt_info, si->misalign,
3436 vect_epilogue);
3438 return retval;
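/* Worked sketch (illustrative only, ignoring the cost-vector
   bookkeeping) of the peeling arithmetic above: with known niters the
   epilogue peel count is the remainder left after the prologue.  */
static int
example_peel_iters_epilogue (int niters, int peel_iters_prologue,
                             int assumed_vf, int peeling_for_gaps)
{
  if (peel_iters_prologue > niters)
    peel_iters_prologue = niters;
  int epilogue = (niters - peel_iters_prologue) % assumed_vf;
  /* E.g. niters = 100, prologue = 3, vf = 8 gives (100 - 3) % 8 = 1.  */
  if (peeling_for_gaps && epilogue == 0)
    epilogue = assumed_vf;
  return epilogue;
}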
3441 /* Function vect_estimate_min_profitable_iters
3443 Return the number of iterations required for the vector version of the
3444 loop to be profitable relative to the cost of the scalar version of the
3445 loop.
3447 *RET_MIN_PROFITABLE_NITERS is a cost model profitability threshold
3448 of iterations for vectorization. A value of -1 means loop vectorization
3449 is not profitable. This returned value may be used for a dynamic
3450 profitability check.
3452 *RET_MIN_PROFITABLE_ESTIMATE is a profitability threshold to be used
3453 for a static check against the estimated number of iterations. */
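/* Rough sketch (illustrative only; the real computation below uses the
   target cost hooks and also accounts for peeling and versioning) of
   the break-even condition this function evaluates: vectorization pays
   off once niters * scalar_iter_cost exceeds
   vec_outside_cost + (niters / vf) * vec_inside_cost.  */
static int
example_min_profitable_iters (int scalar_iter_cost, int vec_inside_cost,
                              int vec_outside_cost, int vf)
{
  /* Saving obtained by replacing VF scalar iterations with one vector
     iteration.  */
  int saving = scalar_iter_cost * vf - vec_inside_cost;
  if (saving <= 0)
    return -1;                  /* Never profitable.  */
  /* Vector iterations needed to amortize the one-off outside costs,
     rounded up, converted back to scalar iterations.  */
  int nvect = (vec_outside_cost + saving - 1) / saving;
  return nvect * vf;
}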
3455 static void
3456 vect_estimate_min_profitable_iters (loop_vec_info loop_vinfo,
3457 int *ret_min_profitable_niters,
3458 int *ret_min_profitable_estimate)
3460 int min_profitable_iters;
3461 int min_profitable_estimate;
3462 int peel_iters_prologue;
3463 int peel_iters_epilogue;
3464 unsigned vec_inside_cost = 0;
3465 int vec_outside_cost = 0;
3466 unsigned vec_prologue_cost = 0;
3467 unsigned vec_epilogue_cost = 0;
3468 int scalar_single_iter_cost = 0;
3469 int scalar_outside_cost = 0;
3470 int assumed_vf = vect_vf_for_cost (loop_vinfo);
3471 int npeel = LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo);
3472 void *target_cost_data = LOOP_VINFO_TARGET_COST_DATA (loop_vinfo);
3474 /* Cost model disabled. */
3475 if (unlimited_cost_model (LOOP_VINFO_LOOP (loop_vinfo)))
3477 if (dump_enabled_p ())
3478 dump_printf_loc (MSG_NOTE, vect_location, "cost model disabled.\n");
3479 *ret_min_profitable_niters = 0;
3480 *ret_min_profitable_estimate = 0;
3481 return;
3484 /* Requires loop versioning tests to handle misalignment. */
3485 if (LOOP_REQUIRES_VERSIONING_FOR_ALIGNMENT (loop_vinfo))
3487 /* FIXME: Make cost depend on complexity of individual check. */
3488 unsigned len = LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).length ();
3489 (void) add_stmt_cost (target_cost_data, len, vector_stmt, NULL, 0,
3490 vect_prologue);
3491 if (dump_enabled_p ())
3492 dump_printf (MSG_NOTE,
3493 "cost model: Adding cost of checks for loop "
3494 "versioning to treat misalignment.\n");
3497 /* Requires loop versioning with alias checks. */
3498 if (LOOP_REQUIRES_VERSIONING_FOR_ALIAS (loop_vinfo))
3500 /* FIXME: Make cost depend on complexity of individual check. */
3501 unsigned len = LOOP_VINFO_COMP_ALIAS_DDRS (loop_vinfo).length ();
3502 (void) add_stmt_cost (target_cost_data, len, vector_stmt, NULL, 0,
3503 vect_prologue);
3504 len = LOOP_VINFO_CHECK_UNEQUAL_ADDRS (loop_vinfo).length ();
3505 if (len)
3506 /* Count LEN - 1 ANDs and LEN comparisons. */
3507 (void) add_stmt_cost (target_cost_data, len * 2 - 1, scalar_stmt,
3508 NULL, 0, vect_prologue);
3509 len = LOOP_VINFO_LOWER_BOUNDS (loop_vinfo).length ();
3510 if (len)
3512 /* Count LEN - 1 ANDs and LEN comparisons. */
3513 unsigned int nstmts = len * 2 - 1;
3514 /* +1 for each bias that needs adding. */
3515 for (unsigned int i = 0; i < len; ++i)
3516 if (!LOOP_VINFO_LOWER_BOUNDS (loop_vinfo)[i].unsigned_p)
3517 nstmts += 1;
3518 (void) add_stmt_cost (target_cost_data, nstmts, scalar_stmt,
3519 NULL, 0, vect_prologue);
3521 if (dump_enabled_p ())
3522 dump_printf (MSG_NOTE,
3523 "cost model: Adding cost of checks for loop "
3524 "versioning aliasing.\n");
3527 /* Requires loop versioning with niter checks. */
3528 if (LOOP_REQUIRES_VERSIONING_FOR_NITERS (loop_vinfo))
3530 /* FIXME: Make cost depend on complexity of individual check. */
3531 (void) add_stmt_cost (target_cost_data, 1, vector_stmt, NULL, 0,
3532 vect_prologue);
3533 if (dump_enabled_p ())
3534 dump_printf (MSG_NOTE,
3535 "cost model: Adding cost of checks for loop "
3536 "versioning niters.\n");
3539 if (LOOP_REQUIRES_VERSIONING (loop_vinfo))
3540 (void) add_stmt_cost (target_cost_data, 1, cond_branch_taken, NULL, 0,
3541 vect_prologue);
3543 /* Count statements in scalar loop. Using this as scalar cost for a single
3544 iteration for now.
3546 TODO: Add outer loop support.
3548 TODO: Consider assigning different costs to different scalar
3549 statements. */
3551 scalar_single_iter_cost
3552 = LOOP_VINFO_SINGLE_SCALAR_ITERATION_COST (loop_vinfo);
3554 /* Add additional cost for the peeled instructions in prologue and epilogue
3555 loop. (For fully-masked loops there will be no peeling.)
3557 FORNOW: If we don't know the value of peel_iters for prologue or epilogue
3558 at compile time, we assume it's vf/2 (the worst would be vf-1).
3560 TODO: Build an expression that represents peel_iters for prologue and
3561 epilogue to be used in a run-time test. */
3563 if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
3565 peel_iters_prologue = 0;
3566 peel_iters_epilogue = 0;
3568 if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo))
3570 /* We need to peel exactly one iteration. */
3571 peel_iters_epilogue += 1;
3572 stmt_info_for_cost *si;
3573 int j;
3574 FOR_EACH_VEC_ELT (LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo),
3575 j, si)
3576 (void) add_stmt_cost (target_cost_data, si->count,
3577 si->kind, si->stmt_info, si->misalign,
3578 vect_epilogue);
3581 else if (npeel < 0)
3583 peel_iters_prologue = assumed_vf / 2;
3584 if (dump_enabled_p ())
3585 dump_printf (MSG_NOTE, "cost model: "
3586 "prologue peel iters set to vf/2.\n");
3588 /* If peeling for alignment is unknown, the loop bound of the main loop
3589 becomes unknown. */
3590 peel_iters_epilogue = assumed_vf / 2;
3591 if (dump_enabled_p ())
3592 dump_printf (MSG_NOTE, "cost model: "
3593 "epilogue peel iters set to vf/2 because "
3594 "peeling for alignment is unknown.\n");
3596 /* If peeled iterations are unknown, count a taken branch and a not taken
3597 branch per peeled loop. Even if scalar loop iterations are known,
3598 vector iterations are not known since peeled prologue iterations are
3599 not known. Hence guards remain the same. */
3600 (void) add_stmt_cost (target_cost_data, 1, cond_branch_taken,
3601 NULL, 0, vect_prologue);
3602 (void) add_stmt_cost (target_cost_data, 1, cond_branch_not_taken,
3603 NULL, 0, vect_prologue);
3604 (void) add_stmt_cost (target_cost_data, 1, cond_branch_taken,
3605 NULL, 0, vect_epilogue);
3606 (void) add_stmt_cost (target_cost_data, 1, cond_branch_not_taken,
3607 NULL, 0, vect_epilogue);
3608 stmt_info_for_cost *si;
3609 int j;
3610 FOR_EACH_VEC_ELT (LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo), j, si)
3612 (void) add_stmt_cost (target_cost_data,
3613 si->count * peel_iters_prologue,
3614 si->kind, si->stmt_info, si->misalign,
3615 vect_prologue);
3616 (void) add_stmt_cost (target_cost_data,
3617 si->count * peel_iters_epilogue,
3618 si->kind, si->stmt_info, si->misalign,
3619 vect_epilogue);
3622 else
3624 stmt_vector_for_cost prologue_cost_vec, epilogue_cost_vec;
3625 stmt_info_for_cost *si;
3626 int j;
3627 void *data = LOOP_VINFO_TARGET_COST_DATA (loop_vinfo);
3629 prologue_cost_vec.create (2);
3630 epilogue_cost_vec.create (2);
3631 peel_iters_prologue = npeel;
3633 (void) vect_get_known_peeling_cost (loop_vinfo, peel_iters_prologue,
3634 &peel_iters_epilogue,
3635 &LOOP_VINFO_SCALAR_ITERATION_COST
3636 (loop_vinfo),
3637 &prologue_cost_vec,
3638 &epilogue_cost_vec);
3640 FOR_EACH_VEC_ELT (prologue_cost_vec, j, si)
3641 (void) add_stmt_cost (data, si->count, si->kind, si->stmt_info,
3642 si->misalign, vect_prologue);
3644 FOR_EACH_VEC_ELT (epilogue_cost_vec, j, si)
3645 (void) add_stmt_cost (data, si->count, si->kind, si->stmt_info,
3646 si->misalign, vect_epilogue);
3648 prologue_cost_vec.release ();
3649 epilogue_cost_vec.release ();
3652 /* FORNOW: The scalar outside cost is incremented in one of the
3653 following ways:
3655 1. The vectorizer checks for alignment and aliasing and generates
3656 a condition that allows dynamic vectorization. A cost model
3657 check is ANDED with the versioning condition. Hence scalar code
3658 path now has the added cost of the versioning check.
3660 if (cost > th & versioning_check)
3661 jmp to vector code
3663 Hence run-time scalar is incremented by not-taken branch cost.
3665 2. The vectorizer then checks if a prologue is required. If the
3666 cost model check was not done before during versioning, it has to
3667 be done before the prologue check.
3669 if (cost <= th)
3670 prologue = scalar_iters
3671 if (prologue == 0)
3672 jmp to vector code
3673 else
3674 execute prologue
3675 if (prologue == num_iters)
3676 go to exit
3678 Hence the run-time scalar cost is incremented by a taken branch,
3679 plus a not-taken branch, plus a taken branch cost.
3681 3. The vectorizer then checks if an epilogue is required. If the
3682 cost model check was not done before during prologue check, it
3683 has to be done with the epilogue check.
3685 if (prologue == 0)
3686 jmp to vector code
3687 else
3688 execute prologue
3689 if (prologue == num_iters)
3690 go to exit
3691 vector code:
3692 if ((cost <= th) | (scalar_iters-prologue-epilogue == 0))
3693 jmp to epilogue
3695 Hence the run-time scalar cost should be incremented by 2 taken
3696 branches.
3698 TODO: The back end may reorder the BBs differently and reverse
3699 conditions/branch directions. Change the estimates below to
3700 something more reasonable. */
3702 /* If the number of iterations is known and we do not do versioning, we can
3703 decide whether to vectorize at compile time. Hence the scalar version
3704 does not carry cost model guard costs. */
3705 if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
3706 || LOOP_REQUIRES_VERSIONING (loop_vinfo))
3708 /* Cost model check occurs at versioning. */
3709 if (LOOP_REQUIRES_VERSIONING (loop_vinfo))
3710 scalar_outside_cost += vect_get_stmt_cost (cond_branch_not_taken);
3711 else
3713 /* Cost model check occurs at prologue generation. */
3714 if (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) < 0)
3715 scalar_outside_cost += 2 * vect_get_stmt_cost (cond_branch_taken)
3716 + vect_get_stmt_cost (cond_branch_not_taken);
3717 /* Cost model check occurs at epilogue generation. */
3718 else
3719 scalar_outside_cost += 2 * vect_get_stmt_cost (cond_branch_taken);
3723 /* Complete the target-specific cost calculations. */
3724 finish_cost (LOOP_VINFO_TARGET_COST_DATA (loop_vinfo), &vec_prologue_cost,
3725 &vec_inside_cost, &vec_epilogue_cost);
3727 vec_outside_cost = (int)(vec_prologue_cost + vec_epilogue_cost);
3729 if (dump_enabled_p ())
3731 dump_printf_loc (MSG_NOTE, vect_location, "Cost model analysis: \n");
3732 dump_printf (MSG_NOTE, " Vector inside of loop cost: %d\n",
3733 vec_inside_cost);
3734 dump_printf (MSG_NOTE, " Vector prologue cost: %d\n",
3735 vec_prologue_cost);
3736 dump_printf (MSG_NOTE, " Vector epilogue cost: %d\n",
3737 vec_epilogue_cost);
3738 dump_printf (MSG_NOTE, " Scalar iteration cost: %d\n",
3739 scalar_single_iter_cost);
3740 dump_printf (MSG_NOTE, " Scalar outside cost: %d\n",
3741 scalar_outside_cost);
3742 dump_printf (MSG_NOTE, " Vector outside cost: %d\n",
3743 vec_outside_cost);
3744 dump_printf (MSG_NOTE, " prologue iterations: %d\n",
3745 peel_iters_prologue);
3746 dump_printf (MSG_NOTE, " epilogue iterations: %d\n",
3747 peel_iters_epilogue);
3750 /* Calculate number of iterations required to make the vector version
3751 profitable, relative to the loop bodies only. The following condition
3752 must hold true:
3753 SIC * niters + SOC > VIC * ((niters - NPEEL) / VF) + VOC
3754 where
3755 SIC = scalar iteration cost, VIC = vector iteration cost,
3756 VOC = vector outside cost, VF = vectorization factor,
3757 NPEEL = prologue iterations + epilogue iterations,
3758 SOC = scalar outside cost for run time cost model check. */
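  /* As a purely hypothetical illustration (the numbers below are not taken
     from any target cost model): with SIC = 4, VIC = 6, VF = 4, NPEEL = 2,
     SOC = 10 and VOC = 30 the condition becomes
       4 * niters + 10 > 1.5 * niters + 27
     (ignoring the floor in (niters - NPEEL) / VF), which first holds at
     niters = 7.  */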
3760 int saving_per_viter = (scalar_single_iter_cost * assumed_vf
3761 - vec_inside_cost);
3762 if (saving_per_viter <= 0)
3764 if (LOOP_VINFO_LOOP (loop_vinfo)->force_vectorize)
3765 warning_at (vect_location.get_location_t (), OPT_Wopenmp_simd,
3766 "vectorization did not happen for a simd loop");
3768 if (dump_enabled_p ())
3769 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3770 "cost model: the vector iteration cost = %d "
3771 "divided by the scalar iteration cost = %d "
3772 "is greater or equal to the vectorization factor = %d"
3773 ".\n",
3774 vec_inside_cost, scalar_single_iter_cost, assumed_vf);
3775 *ret_min_profitable_niters = -1;
3776 *ret_min_profitable_estimate = -1;
3777 return;
3780 /* ??? The "if" arm is written to handle all cases; see below for what
3781 we would do for !LOOP_VINFO_FULLY_MASKED_P. */
3782 if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
3784 /* Rewriting the condition above in terms of the number of
3785 vector iterations (vniters) rather than the number of
3786 scalar iterations (niters) gives:
3788 SIC * (vniters * VF + NPEEL) + SOC > VIC * vniters + VOC
3790 <==> vniters * (SIC * VF - VIC) > VOC - SIC * NPEEL - SOC
3792 For integer N, X and Y when X > 0:
3794 N * X > Y <==> N >= (Y /[floor] X) + 1. */
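  /* Continuing the hypothetical numbers used in the illustration above
     (SIC = 4, VIC = 6, VF = 4, NPEEL = 2, SOC = 10, VOC = 30):
     X = SIC * VF - VIC = 10 and Y = VOC - SIC * NPEEL - SOC = 12,
     so the smallest qualifying vniters is 12 / 10 + 1 = 2.  */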
3795 int outside_overhead = (vec_outside_cost
3796 - scalar_single_iter_cost * peel_iters_prologue
3797 - scalar_single_iter_cost * peel_iters_epilogue
3798 - scalar_outside_cost);
3799 /* We're only interested in cases that require at least one
3800 vector iteration. */
3801 int min_vec_niters = 1;
3802 if (outside_overhead > 0)
3803 min_vec_niters = outside_overhead / saving_per_viter + 1;
3805 if (dump_enabled_p ())
3806 dump_printf (MSG_NOTE, " Minimum number of vector iterations: %d\n",
3807 min_vec_niters);
3809 if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
3811 /* Now that we know the minimum number of vector iterations,
3812 find the minimum niters for which the scalar cost is larger:
3814 SIC * niters > VIC * vniters + VOC - SOC
3816 We know that the minimum niters is no more than
3817 vniters * VF + NPEEL, but it might be (and often is) less
3818 than that if a partial vector iteration is cheaper than the
3819 equivalent scalar code. */
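      /* With the hypothetical numbers above and min_vec_niters = 2 this gives
	 a threshold of 6 * 2 + 30 - 10 = 32 and hence
	 min_profitable_iters = 32 / 4 + 1 = 9.  */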
3820 int threshold = (vec_inside_cost * min_vec_niters
3821 + vec_outside_cost
3822 - scalar_outside_cost);
3823 if (threshold <= 0)
3824 min_profitable_iters = 1;
3825 else
3826 min_profitable_iters = threshold / scalar_single_iter_cost + 1;
3828 else
3829 /* Convert the number of vector iterations into a number of
3830 scalar iterations. */
3831 min_profitable_iters = (min_vec_niters * assumed_vf
3832 + peel_iters_prologue
3833 + peel_iters_epilogue);
3835 else
3837 min_profitable_iters = ((vec_outside_cost - scalar_outside_cost)
3838 * assumed_vf
3839 - vec_inside_cost * peel_iters_prologue
3840 - vec_inside_cost * peel_iters_epilogue);
3841 if (min_profitable_iters <= 0)
3842 min_profitable_iters = 0;
3843 else
3845 min_profitable_iters /= saving_per_viter;
3847 if ((scalar_single_iter_cost * assumed_vf * min_profitable_iters)
3848 <= (((int) vec_inside_cost * min_profitable_iters)
3849 + (((int) vec_outside_cost - scalar_outside_cost)
3850 * assumed_vf)))
3851 min_profitable_iters++;
3855 if (dump_enabled_p ())
3856 dump_printf (MSG_NOTE,
3857 " Calculated minimum iters for profitability: %d\n",
3858 min_profitable_iters);
3860 if (!LOOP_VINFO_FULLY_MASKED_P (loop_vinfo)
3861 && min_profitable_iters < (assumed_vf + peel_iters_prologue))
3862 /* We want the vectorized loop to execute at least once. */
3863 min_profitable_iters = assumed_vf + peel_iters_prologue;
3865 if (dump_enabled_p ())
3866 dump_printf_loc (MSG_NOTE, vect_location,
3867 " Runtime profitability threshold = %d\n",
3868 min_profitable_iters);
3870 *ret_min_profitable_niters = min_profitable_iters;
3872 /* Calculate number of iterations required to make the vector version
3873 profitable, relative to the loop bodies only.
3875 The non-vectorized variant costs SIC * niters and must win over the vector
3876 variant for the expected loop trip count. The following condition must hold true:
3877 SIC * niters > VIC * ((niters - NPEEL) / VF) + VOC + SOC */
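  /* With the same hypothetical numbers as before, this estimate condition is
     4 * niters > 1.5 * niters + 37 (ignoring the floor), which first holds
     at niters = 15.  */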
3879 if (vec_outside_cost <= 0)
3880 min_profitable_estimate = 0;
3881 else if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
3883 /* This is a repeat of the code above, but with + SOC rather
3884 than - SOC. */
3885 int outside_overhead = (vec_outside_cost
3886 - scalar_single_iter_cost * peel_iters_prologue
3887 - scalar_single_iter_cost * peel_iters_epilogue
3888 + scalar_outside_cost);
3889 int min_vec_niters = 1;
3890 if (outside_overhead > 0)
3891 min_vec_niters = outside_overhead / saving_per_viter + 1;
3893 if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
3895 int threshold = (vec_inside_cost * min_vec_niters
3896 + vec_outside_cost
3897 + scalar_outside_cost);
3898 min_profitable_estimate = threshold / scalar_single_iter_cost + 1;
3900 else
3901 min_profitable_estimate = (min_vec_niters * assumed_vf
3902 + peel_iters_prologue
3903 + peel_iters_epilogue);
3905 else
3907 min_profitable_estimate = ((vec_outside_cost + scalar_outside_cost)
3908 * assumed_vf
3909 - vec_inside_cost * peel_iters_prologue
3910 - vec_inside_cost * peel_iters_epilogue)
3911 / ((scalar_single_iter_cost * assumed_vf)
3912 - vec_inside_cost);
3914 min_profitable_estimate = MAX (min_profitable_estimate, min_profitable_iters);
3915 if (dump_enabled_p ())
3916 dump_printf_loc (MSG_NOTE, vect_location,
3917 " Static estimate profitability threshold = %d\n",
3918 min_profitable_estimate);
3920 *ret_min_profitable_estimate = min_profitable_estimate;
3923 /* Writes into SEL a mask for a vec_perm, equivalent to a vec_shr by OFFSET
3924 vector elements (not bits) for a vector with NELT elements. */
3925 static void
3926 calc_vec_perm_mask_for_shift (unsigned int offset, unsigned int nelt,
3927 vec_perm_builder *sel)
3929 /* The encoding is a single stepped pattern. Any wrap-around is handled
3930 by vec_perm_indices. */
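  /* For example, for OFFSET == 2 and NELT == 8 the encoded elements are
     { 2, 3, 4 }, which vec_perm_indices extends to { 2, 3, 4, 5, 6, 7, 8, 9 };
     indices 8 and 9 select elements 0 and 1 of the second vec_perm operand.  */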
3931 sel->new_vector (nelt, 1, 3);
3932 for (unsigned int i = 0; i < 3; i++)
3933 sel->quick_push (i + offset);
3936 /* Checks whether the target supports whole-vector shifts for vectors of mode
3937 MODE. This is the case if _either_ the platform handles vec_shr_optab, _or_
3938 it supports vec_perm_const with masks for all necessary shift amounts. */
3939 static bool
3940 have_whole_vector_shift (machine_mode mode)
3942 if (optab_handler (vec_shr_optab, mode) != CODE_FOR_nothing)
3943 return true;
3945 /* Variable-length vectors should be handled via the optab. */
3946 unsigned int nelt;
3947 if (!GET_MODE_NUNITS (mode).is_constant (&nelt))
3948 return false;
3950 vec_perm_builder sel;
3951 vec_perm_indices indices;
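  /* E.g. for NELT == 8 the loop below checks shifts by 4, 2 and 1 elements,
     which are the shift amounts a log2-style epilogue reduction needs.  */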
3952 for (unsigned int i = nelt / 2; i >= 1; i /= 2)
3954 calc_vec_perm_mask_for_shift (i, nelt, &sel);
3955 indices.new_vector (sel, 2, nelt);
3956 if (!can_vec_perm_const_p (mode, indices, false))
3957 return false;
3959 return true;
3962 /* TODO: There is a close dependency between the vect_model_*_cost and
3963 vectorizable_* functions. Design better to avoid maintenance issues. */
3965 /* Function vect_model_reduction_cost.
3967 Models cost for a reduction operation, including the vector ops
3968 generated within the strip-mine loop, the initial definition before
3969 the loop, and the epilogue code that must be generated. */
3971 static void
3972 vect_model_reduction_cost (stmt_vec_info stmt_info, internal_fn reduc_fn,
3973 int ncopies, stmt_vector_for_cost *cost_vec)
3975 int prologue_cost = 0, epilogue_cost = 0, inside_cost;
3976 enum tree_code code;
3977 optab optab;
3978 tree vectype;
3979 machine_mode mode;
3980 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
3981 struct loop *loop = NULL;
3983 if (loop_vinfo)
3984 loop = LOOP_VINFO_LOOP (loop_vinfo);
3986 /* Condition reductions generate two reductions in the loop. */
3987 vect_reduction_type reduction_type
3988 = STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info);
3989 if (reduction_type == COND_REDUCTION)
3990 ncopies *= 2;
3992 vectype = STMT_VINFO_VECTYPE (stmt_info);
3993 mode = TYPE_MODE (vectype);
3994 stmt_vec_info orig_stmt_info = vect_orig_stmt (stmt_info);
3996 code = gimple_assign_rhs_code (orig_stmt_info->stmt);
3998 if (reduction_type == EXTRACT_LAST_REDUCTION
3999 || reduction_type == FOLD_LEFT_REDUCTION)
4001 /* No extra instructions needed in the prologue. */
4002 prologue_cost = 0;
4004 if (reduction_type == EXTRACT_LAST_REDUCTION || reduc_fn != IFN_LAST)
4005 /* Count one reduction-like operation per vector. */
4006 inside_cost = record_stmt_cost (cost_vec, ncopies, vec_to_scalar,
4007 stmt_info, 0, vect_body);
4008 else
4010 /* Use NELEMENTS extracts and NELEMENTS scalar ops. */
4011 unsigned int nelements = ncopies * vect_nunits_for_cost (vectype);
4012 inside_cost = record_stmt_cost (cost_vec, nelements,
4013 vec_to_scalar, stmt_info, 0,
4014 vect_body);
4015 inside_cost += record_stmt_cost (cost_vec, nelements,
4016 scalar_stmt, stmt_info, 0,
4017 vect_body);
4020 else
4022 /* Add in cost for initial definition.
4023 For cond reduction we have four vectors: initial index, step,
4024 initial result of the data reduction, initial value of the index
4025 reduction. */
4026 int prologue_stmts = reduction_type == COND_REDUCTION ? 4 : 1;
4027 prologue_cost += record_stmt_cost (cost_vec, prologue_stmts,
4028 scalar_to_vec, stmt_info, 0,
4029 vect_prologue);
4031 /* Cost of reduction op inside loop. */
4032 inside_cost = record_stmt_cost (cost_vec, ncopies, vector_stmt,
4033 stmt_info, 0, vect_body);
4036 /* Determine cost of epilogue code.
4038 We have a reduction operator that will reduce the vector in one statement.
4039 Also requires scalar extract. */
4041 if (!loop || !nested_in_vect_loop_p (loop, orig_stmt_info))
4043 if (reduc_fn != IFN_LAST)
4045 if (reduction_type == COND_REDUCTION)
4047 /* An EQ stmt and a COND_EXPR stmt. */
4048 epilogue_cost += record_stmt_cost (cost_vec, 2,
4049 vector_stmt, stmt_info, 0,
4050 vect_epilogue);
4051 /* Reduction of the max index and a reduction of the found
4052 values. */
4053 epilogue_cost += record_stmt_cost (cost_vec, 2,
4054 vec_to_scalar, stmt_info, 0,
4055 vect_epilogue);
4056 /* A broadcast of the max value. */
4057 epilogue_cost += record_stmt_cost (cost_vec, 1,
4058 scalar_to_vec, stmt_info, 0,
4059 vect_epilogue);
4061 else
4063 epilogue_cost += record_stmt_cost (cost_vec, 1, vector_stmt,
4064 stmt_info, 0, vect_epilogue);
4065 epilogue_cost += record_stmt_cost (cost_vec, 1,
4066 vec_to_scalar, stmt_info, 0,
4067 vect_epilogue);
4070 else if (reduction_type == COND_REDUCTION)
4072 unsigned estimated_nunits = vect_nunits_for_cost (vectype);
4073 /* Extraction of scalar elements. */
4074 epilogue_cost += record_stmt_cost (cost_vec,
4075 2 * estimated_nunits,
4076 vec_to_scalar, stmt_info, 0,
4077 vect_epilogue);
4078 /* Scalar max reductions via COND_EXPR / MAX_EXPR. */
4079 epilogue_cost += record_stmt_cost (cost_vec,
4080 2 * estimated_nunits - 3,
4081 scalar_stmt, stmt_info, 0,
4082 vect_epilogue);
4084 else if (reduction_type == EXTRACT_LAST_REDUCTION
4085 || reduction_type == FOLD_LEFT_REDUCTION)
4086 /* No extra instructions are needed in the epilogue. */
4088 else
4090 int vec_size_in_bits = tree_to_uhwi (TYPE_SIZE (vectype));
4091 tree bitsize =
4092 TYPE_SIZE (TREE_TYPE (gimple_assign_lhs (orig_stmt_info->stmt)));
4093 int element_bitsize = tree_to_uhwi (bitsize);
4094 int nelements = vec_size_in_bits / element_bitsize;
4096 if (code == COND_EXPR)
4097 code = MAX_EXPR;
4099 optab = optab_for_tree_code (code, vectype, optab_default);
4101 /* We have a whole vector shift available. */
4102 if (optab != unknown_optab
4103 && VECTOR_MODE_P (mode)
4104 && optab_handler (optab, mode) != CODE_FOR_nothing
4105 && have_whole_vector_shift (mode))
4107 /* Final reduction via vector shifts and the reduction operator.
4108 Also requires scalar extract. */
4109 epilogue_cost += record_stmt_cost (cost_vec,
4110 exact_log2 (nelements) * 2,
4111 vector_stmt, stmt_info, 0,
4112 vect_epilogue);
4113 epilogue_cost += record_stmt_cost (cost_vec, 1,
4114 vec_to_scalar, stmt_info, 0,
4115 vect_epilogue);
4117 else
4118 /* Use extracts and reduction op for final reduction. For N
4119 elements, we have N extracts and N-1 reduction ops. */
4120 epilogue_cost += record_stmt_cost (cost_vec,
4121 nelements + nelements - 1,
4122 vector_stmt, stmt_info, 0,
4123 vect_epilogue);
4127 if (dump_enabled_p ())
4128 dump_printf (MSG_NOTE,
4129 "vect_model_reduction_cost: inside_cost = %d, "
4130 "prologue_cost = %d, epilogue_cost = %d .\n", inside_cost,
4131 prologue_cost, epilogue_cost);
4135 /* Function vect_model_induction_cost.
4137 Models cost for induction operations. */
4139 static void
4140 vect_model_induction_cost (stmt_vec_info stmt_info, int ncopies,
4141 stmt_vector_for_cost *cost_vec)
4143 unsigned inside_cost, prologue_cost;
4145 if (PURE_SLP_STMT (stmt_info))
4146 return;
4148 /* loop cost for vec_loop. */
4149 inside_cost = record_stmt_cost (cost_vec, ncopies, vector_stmt,
4150 stmt_info, 0, vect_body);
4152 /* prologue cost for vec_init and vec_step. */
4153 prologue_cost = record_stmt_cost (cost_vec, 2, scalar_to_vec,
4154 stmt_info, 0, vect_prologue);
4156 if (dump_enabled_p ())
4157 dump_printf_loc (MSG_NOTE, vect_location,
4158 "vect_model_induction_cost: inside_cost = %d, "
4159 "prologue_cost = %d .\n", inside_cost, prologue_cost);
4164 /* Function get_initial_def_for_reduction
4166 Input:
4167 STMT_VINFO - a stmt that performs a reduction operation in the loop.
4168 INIT_VAL - the initial value of the reduction variable
4170 Output:
4171 ADJUSTMENT_DEF - a tree that holds a value to be added to the final result
4172 of the reduction (used for adjusting the epilog - see below).
4173 Return a vector variable, initialized according to the operation that
4174 STMT_VINFO performs. This vector will be used as the initial value
4175 of the vector of partial results.
4177 Option1 (adjust in epilog): Initialize the vector as follows:
4178 add/bit or/xor: [0,0,...,0,0]
4179 mult/bit and: [1,1,...,1,1]
4180 min/max/cond_expr: [init_val,init_val,..,init_val,init_val]
4181 and when necessary (e.g. add/mult case) let the caller know
4182 that it needs to adjust the result by init_val.
4184 Option2: Initialize the vector as follows:
4185 add/bit or/xor: [init_val,0,0,...,0]
4186 mult/bit and: [init_val,1,1,...,1]
4187 min/max/cond_expr: [init_val,init_val,...,init_val]
4188 and no adjustments are needed.
4190 For example, for the following code:
4192 s = init_val;
4193 for (i=0;i<n;i++)
4194 s = s + a[i];
4196 STMT_VINFO is 's = s + a[i]', and the reduction variable is 's'.
4197 For a vector of 4 units, we want to return either [0,0,0,init_val],
4198 or [0,0,0,0] and let the caller know that it needs to adjust
4199 the result at the end by 'init_val'.
4201 FORNOW, we use the 'adjust in epilog' scheme (Option1) if ADJUSTMENT_DEF
4202 is not NULL, because this way the initialization vector is simpler (the
4203 same element in all entries), and Option2 otherwise.
4205 A cost model should help decide between these two schemes. */
4207 tree
4208 get_initial_def_for_reduction (stmt_vec_info stmt_vinfo, tree init_val,
4209 tree *adjustment_def)
4211 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_vinfo);
4212 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
4213 tree scalar_type = TREE_TYPE (init_val);
4214 tree vectype = get_vectype_for_scalar_type (scalar_type);
4215 enum tree_code code = gimple_assign_rhs_code (stmt_vinfo->stmt);
4216 tree def_for_init;
4217 tree init_def;
4218 REAL_VALUE_TYPE real_init_val = dconst0;
4219 int int_init_val = 0;
4220 gimple_seq stmts = NULL;
4222 gcc_assert (vectype);
4224 gcc_assert (POINTER_TYPE_P (scalar_type) || INTEGRAL_TYPE_P (scalar_type)
4225 || SCALAR_FLOAT_TYPE_P (scalar_type));
4227 gcc_assert (nested_in_vect_loop_p (loop, stmt_vinfo)
4228 || loop == (gimple_bb (stmt_vinfo->stmt))->loop_father);
4230 vect_reduction_type reduction_type
4231 = STMT_VINFO_VEC_REDUCTION_TYPE (stmt_vinfo);
4233 switch (code)
4235 case WIDEN_SUM_EXPR:
4236 case DOT_PROD_EXPR:
4237 case SAD_EXPR:
4238 case PLUS_EXPR:
4239 case MINUS_EXPR:
4240 case BIT_IOR_EXPR:
4241 case BIT_XOR_EXPR:
4242 case MULT_EXPR:
4243 case BIT_AND_EXPR:
4245 /* ADJUSTMENT_DEF is NULL when called from
4246 vect_create_epilog_for_reduction to vectorize double reduction. */
4247 if (adjustment_def)
4248 *adjustment_def = init_val;
4250 if (code == MULT_EXPR)
4252 real_init_val = dconst1;
4253 int_init_val = 1;
4256 if (code == BIT_AND_EXPR)
4257 int_init_val = -1;
4259 if (SCALAR_FLOAT_TYPE_P (scalar_type))
4260 def_for_init = build_real (scalar_type, real_init_val);
4261 else
4262 def_for_init = build_int_cst (scalar_type, int_init_val);
4264 if (adjustment_def)
4265 /* Option1: the first element is '0' or '1' as well. */
4266 init_def = gimple_build_vector_from_val (&stmts, vectype,
4267 def_for_init);
4268 else if (!TYPE_VECTOR_SUBPARTS (vectype).is_constant ())
4270 /* Option2 (variable length): the first element is INIT_VAL. */
4271 init_def = gimple_build_vector_from_val (&stmts, vectype,
4272 def_for_init);
4273 init_def = gimple_build (&stmts, CFN_VEC_SHL_INSERT,
4274 vectype, init_def, init_val);
4276 else
4278 /* Option2: the first element is INIT_VAL. */
4279 tree_vector_builder elts (vectype, 1, 2);
4280 elts.quick_push (init_val);
4281 elts.quick_push (def_for_init);
4282 init_def = gimple_build_vector (&stmts, &elts);
4285 break;
4287 case MIN_EXPR:
4288 case MAX_EXPR:
4289 case COND_EXPR:
4291 if (adjustment_def)
4293 *adjustment_def = NULL_TREE;
4294 if (reduction_type != COND_REDUCTION
4295 && reduction_type != EXTRACT_LAST_REDUCTION)
4297 init_def = vect_get_vec_def_for_operand (init_val, stmt_vinfo);
4298 break;
4301 init_val = gimple_convert (&stmts, TREE_TYPE (vectype), init_val);
4302 init_def = gimple_build_vector_from_val (&stmts, vectype, init_val);
4304 break;
4306 default:
4307 gcc_unreachable ();
4310 if (stmts)
4311 gsi_insert_seq_on_edge_immediate (loop_preheader_edge (loop), stmts);
4312 return init_def;
4315 /* Get at the initial defs for the reduction PHIs in SLP_NODE.
4316 NUMBER_OF_VECTORS is the number of vector defs to create.
4317 If NEUTRAL_OP is nonnull, introducing extra elements of that
4318 value will not change the result. */
4320 static void
4321 get_initial_defs_for_reduction (slp_tree slp_node,
4322 vec<tree> *vec_oprnds,
4323 unsigned int number_of_vectors,
4324 bool reduc_chain, tree neutral_op)
4326 vec<stmt_vec_info> stmts = SLP_TREE_SCALAR_STMTS (slp_node);
4327 stmt_vec_info stmt_vinfo = stmts[0];
4328 unsigned HOST_WIDE_INT nunits;
4329 unsigned j, number_of_places_left_in_vector;
4330 tree vector_type;
4331 unsigned int group_size = stmts.length ();
4332 unsigned int i;
4333 struct loop *loop;
4335 vector_type = STMT_VINFO_VECTYPE (stmt_vinfo);
4337 gcc_assert (STMT_VINFO_DEF_TYPE (stmt_vinfo) == vect_reduction_def);
4339 loop = (gimple_bb (stmt_vinfo->stmt))->loop_father;
4340 gcc_assert (loop);
4341 edge pe = loop_preheader_edge (loop);
4343 gcc_assert (!reduc_chain || neutral_op);
4345 /* NUMBER_OF_COPIES is the number of times we need to use the same values in
4346 created vectors. It is greater than 1 if unrolling is performed.
4348 For example, we have two scalar operands, s1 and s2 (e.g., group of
4349 strided accesses of size two), while NUNITS is four (i.e., four scalars
4350 of this type can be packed in a vector). The output vector will contain
4351 two copies of each scalar operand: {s1, s2, s1, s2}. (NUMBER_OF_COPIES
4352 will be 2).
4354 If REDUC_GROUP_SIZE > NUNITS, the scalars will be split into several
4355 vectors containing the operands.
4357 For example, NUNITS is four as before, and the group size is 8
4358 (s1, s2, ..., s8). We will create two vectors {s1, s2, s3, s4} and
4359 {s5, s6, s7, s8}. */
4361 if (!TYPE_VECTOR_SUBPARTS (vector_type).is_constant (&nunits))
4362 nunits = group_size;
4364 number_of_places_left_in_vector = nunits;
4365 bool constant_p = true;
4366 tree_vector_builder elts (vector_type, nunits, 1);
4367 elts.quick_grow (nunits);
4368 gimple_seq ctor_seq = NULL;
4369 for (j = 0; j < nunits * number_of_vectors; ++j)
4371 tree op;
4372 i = j % group_size;
4373 stmt_vinfo = stmts[i];
4375 /* Get the def before the loop. In a reduction chain we have only
4376 one initial value; otherwise we have as many as there are PHIs in the group. */
4377 if (reduc_chain)
4378 op = j != 0 ? neutral_op : PHI_ARG_DEF_FROM_EDGE (stmt_vinfo->stmt, pe);
4379 else if (((vec_oprnds->length () + 1) * nunits
4380 - number_of_places_left_in_vector >= group_size)
4381 && neutral_op)
4382 op = neutral_op;
4383 else
4384 op = PHI_ARG_DEF_FROM_EDGE (stmt_vinfo->stmt, pe);
4386 /* Create 'vect_ = {op0,op1,...,opn}'. */
4387 number_of_places_left_in_vector--;
4388 elts[nunits - number_of_places_left_in_vector - 1] = op;
4389 if (!CONSTANT_CLASS_P (op))
4390 constant_p = false;
4392 if (number_of_places_left_in_vector == 0)
4394 tree init;
4395 if (constant_p && !neutral_op
4396 ? multiple_p (TYPE_VECTOR_SUBPARTS (vector_type), nunits)
4397 : known_eq (TYPE_VECTOR_SUBPARTS (vector_type), nunits))
4398 /* Build the vector directly from ELTS. */
4399 init = gimple_build_vector (&ctor_seq, &elts);
4400 else if (neutral_op)
4402 /* Build a vector of the neutral value and shift the
4403 other elements into place. */
4404 init = gimple_build_vector_from_val (&ctor_seq, vector_type,
4405 neutral_op);
4406 int k = nunits;
4407 while (k > 0 && elts[k - 1] == neutral_op)
4408 k -= 1;
4409 while (k > 0)
4411 k -= 1;
4412 init = gimple_build (&ctor_seq, CFN_VEC_SHL_INSERT,
4413 vector_type, init, elts[k]);
4416 else
4418 /* First time round, duplicate ELTS to fill the
4419 required number of vectors. */
4420 duplicate_and_interleave (&ctor_seq, vector_type, elts,
4421 number_of_vectors, *vec_oprnds);
4422 break;
4424 vec_oprnds->quick_push (init);
4426 number_of_places_left_in_vector = nunits;
4427 elts.new_vector (vector_type, nunits, 1);
4428 elts.quick_grow (nunits);
4429 constant_p = true;
4432 if (ctor_seq != NULL)
4433 gsi_insert_seq_on_edge_immediate (pe, ctor_seq);
4437 /* Function vect_create_epilog_for_reduction
4439 Create code at the loop-epilog to finalize the result of a reduction
4440 computation.
4442 VECT_DEFS is list of vector of partial results, i.e., the lhs's of vector
4443 reduction statements.
4444 STMT_INFO is the scalar reduction stmt that is being vectorized.
4445 NCOPIES is > 1 in case the vectorization factor (VF) is bigger than the
4446 number of elements that we can fit in a vectype (nunits). In this case
4447 we have to generate more than one vector stmt - i.e., we need to "unroll"
4448 the vector stmt by a factor VF/nunits. For more details see documentation
4449 in vectorizable_operation.
4450 REDUC_FN is the internal function for the epilog reduction.
4451 REDUCTION_PHIS is a list of the phi-nodes that carry the reduction
4452 computation.
4453 REDUC_INDEX is the index of the operand in the right hand side of the
4454 statement that is defined by REDUCTION_PHI.
4455 DOUBLE_REDUC is TRUE if double reduction phi nodes should be handled.
4456 SLP_NODE is an SLP node containing a group of reduction statements. The
4457 first one in this group is STMT_INFO.
4458 INDUC_VAL is for INTEGER_INDUC_COND_REDUCTION the value to use for the case
4459 when the COND_EXPR is never true in the loop. For MAX_EXPR, it needs to
4460 be smaller than any value of the IV in the loop, for MIN_EXPR larger than
4461 any value of the IV in the loop.
4462 INDUC_CODE is the code for epilog reduction if INTEGER_INDUC_COND_REDUCTION.
4463 NEUTRAL_OP is the value given by neutral_op_for_slp_reduction; it is
4464 null if this is not an SLP reduction.
4466 This function:
4467 1. Creates the reduction def-use cycles: sets the arguments for
4468 REDUCTION_PHIS:
4469 The loop-entry argument is the vectorized initial-value of the reduction.
4470 The loop-latch argument is taken from VECT_DEFS - the vector of partial
4471 sums.
4472 2. "Reduces" each vector of partial results VECT_DEFS into a single result,
4473 by calling the function specified by REDUC_FN if available, or by
4474 other means (whole-vector shifts or a scalar loop).
4475 The function also creates a new phi node at the loop exit to preserve
4476 loop-closed form, as illustrated below.
4478 The flow at the entry to this function:
4480 loop:
4481 vec_def = phi <null, null> # REDUCTION_PHI
4482 VECT_DEF = vector_stmt # vectorized form of STMT_INFO
4483 s_loop = scalar_stmt # (scalar) STMT_INFO
4484 loop_exit:
4485 s_out0 = phi <s_loop> # (scalar) EXIT_PHI
4486 use <s_out0>
4487 use <s_out0>
4489 The above is transformed by this function into:
4491 loop:
4492 vec_def = phi <vec_init, VECT_DEF> # REDUCTION_PHI
4493 VECT_DEF = vector_stmt # vectorized form of STMT_INFO
4494 s_loop = scalar_stmt # (scalar) STMT_INFO
4495 loop_exit:
4496 s_out0 = phi <s_loop> # (scalar) EXIT_PHI
4497 v_out1 = phi <VECT_DEF> # NEW_EXIT_PHI
4498 v_out2 = reduce <v_out1>
4499 s_out3 = extract_field <v_out2, 0>
4500 s_out4 = adjust_result <s_out3>
4501 use <s_out4>
4502 use <s_out4>
4505 static void
4506 vect_create_epilog_for_reduction (vec<tree> vect_defs,
4507 stmt_vec_info stmt_info,
4508 gimple *reduc_def_stmt,
4509 int ncopies, internal_fn reduc_fn,
4510 vec<stmt_vec_info> reduction_phis,
4511 bool double_reduc,
4512 slp_tree slp_node,
4513 slp_instance slp_node_instance,
4514 tree induc_val, enum tree_code induc_code,
4515 tree neutral_op)
4517 stmt_vec_info prev_phi_info;
4518 tree vectype;
4519 machine_mode mode;
4520 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
4521 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo), *outer_loop = NULL;
4522 basic_block exit_bb;
4523 tree scalar_dest;
4524 tree scalar_type;
4525 gimple *new_phi = NULL, *phi;
4526 stmt_vec_info phi_info;
4527 gimple_stmt_iterator exit_gsi;
4528 tree vec_dest;
4529 tree new_temp = NULL_TREE, new_dest, new_name, new_scalar_dest;
4530 gimple *epilog_stmt = NULL;
4531 enum tree_code code = gimple_assign_rhs_code (stmt_info->stmt);
4532 gimple *exit_phi;
4533 tree bitsize;
4534 tree adjustment_def = NULL;
4535 tree vec_initial_def = NULL;
4536 tree expr, def, initial_def = NULL;
4537 tree orig_name, scalar_result;
4538 imm_use_iterator imm_iter, phi_imm_iter;
4539 use_operand_p use_p, phi_use_p;
4540 gimple *use_stmt;
4541 stmt_vec_info reduction_phi_info = NULL;
4542 bool nested_in_vect_loop = false;
4543 auto_vec<gimple *> new_phis;
4544 auto_vec<stmt_vec_info> inner_phis;
4545 int j, i;
4546 auto_vec<tree> scalar_results;
4547 unsigned int group_size = 1, k, ratio;
4548 auto_vec<tree> vec_initial_defs;
4549 auto_vec<gimple *> phis;
4550 bool slp_reduc = false;
4551 bool direct_slp_reduc;
4552 tree new_phi_result;
4553 stmt_vec_info inner_phi = NULL;
4554 tree induction_index = NULL_TREE;
4556 if (slp_node)
4557 group_size = SLP_TREE_SCALAR_STMTS (slp_node).length ();
4559 if (nested_in_vect_loop_p (loop, stmt_info))
4561 outer_loop = loop;
4562 loop = loop->inner;
4563 nested_in_vect_loop = true;
4564 gcc_assert (!slp_node);
4567 vectype = STMT_VINFO_VECTYPE (stmt_info);
4568 gcc_assert (vectype);
4569 mode = TYPE_MODE (vectype);
4571 /* 1. Create the reduction def-use cycle:
4572 Set the arguments of REDUCTION_PHIS, i.e., transform
4574 loop:
4575 vec_def = phi <null, null> # REDUCTION_PHI
4576 VECT_DEF = vector_stmt # vectorized form of STMT
4579 into:
4581 loop:
4582 vec_def = phi <vec_init, VECT_DEF> # REDUCTION_PHI
4583 VECT_DEF = vector_stmt # vectorized form of STMT
4586 (in case of SLP, do it for all the phis). */
4588 /* Get the loop-entry arguments. */
4589 enum vect_def_type initial_def_dt = vect_unknown_def_type;
4590 if (slp_node)
4592 unsigned vec_num = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node);
4593 vec_initial_defs.reserve (vec_num);
4594 get_initial_defs_for_reduction (slp_node_instance->reduc_phis,
4595 &vec_initial_defs, vec_num,
4596 REDUC_GROUP_FIRST_ELEMENT (stmt_info),
4597 neutral_op);
4599 else
4601 /* Get at the scalar def before the loop, that defines the initial value
4602 of the reduction variable. */
4603 initial_def = PHI_ARG_DEF_FROM_EDGE (reduc_def_stmt,
4604 loop_preheader_edge (loop));
4605 /* Optimize: for REDUC_MAX, if initial_def is smaller than the base
4606 and we can't use zero for induc_val, use initial_def as induc_val.
4607 Similarly for REDUC_MIN when initial_def is larger than the base. */
4608 if (TREE_CODE (initial_def) == INTEGER_CST
4609 && (STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info)
4610 == INTEGER_INDUC_COND_REDUCTION)
4611 && !integer_zerop (induc_val)
4612 && ((induc_code == MAX_EXPR
4613 && tree_int_cst_lt (initial_def, induc_val))
4614 || (induc_code == MIN_EXPR
4615 && tree_int_cst_lt (induc_val, initial_def))))
4616 induc_val = initial_def;
4618 if (double_reduc)
4619 /* In case of double reduction we only create a vector variable
4620 to be put in the reduction phi node. The actual statement
4621 creation is done later in this function. */
4622 vec_initial_def = vect_create_destination_var (initial_def, vectype);
4623 else if (nested_in_vect_loop)
4625 /* Do not use an adjustment def as that case is not supported
4626 correctly if ncopies is not one. */
4627 vect_is_simple_use (initial_def, loop_vinfo, &initial_def_dt);
4628 vec_initial_def = vect_get_vec_def_for_operand (initial_def,
4629 stmt_info);
4631 else
4632 vec_initial_def
4633 = get_initial_def_for_reduction (stmt_info, initial_def,
4634 &adjustment_def);
4635 vec_initial_defs.create (1);
4636 vec_initial_defs.quick_push (vec_initial_def);
4639 /* Set phi nodes arguments. */
4640 FOR_EACH_VEC_ELT (reduction_phis, i, phi_info)
4642 tree vec_init_def = vec_initial_defs[i];
4643 tree def = vect_defs[i];
4644 for (j = 0; j < ncopies; j++)
4646 if (j != 0)
4648 phi_info = STMT_VINFO_RELATED_STMT (phi_info);
4649 if (nested_in_vect_loop)
4650 vec_init_def
4651 = vect_get_vec_def_for_stmt_copy (loop_vinfo, vec_init_def);
4654 /* Set the loop-entry arg of the reduction-phi. */
4656 gphi *phi = as_a <gphi *> (phi_info->stmt);
4657 if (STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info)
4658 == INTEGER_INDUC_COND_REDUCTION)
4660 /* Initialise the reduction phi to zero. This prevents non-zero
4661 initial values from interfering with the reduction op. */
4662 gcc_assert (ncopies == 1);
4663 gcc_assert (i == 0);
4665 tree vec_init_def_type = TREE_TYPE (vec_init_def);
4666 tree induc_val_vec
4667 = build_vector_from_val (vec_init_def_type, induc_val);
4669 add_phi_arg (phi, induc_val_vec, loop_preheader_edge (loop),
4670 UNKNOWN_LOCATION);
4672 else
4673 add_phi_arg (phi, vec_init_def, loop_preheader_edge (loop),
4674 UNKNOWN_LOCATION);
4676 /* Set the loop-latch arg for the reduction-phi. */
4677 if (j > 0)
4678 def = vect_get_vec_def_for_stmt_copy (loop_vinfo, def);
4680 add_phi_arg (phi, def, loop_latch_edge (loop), UNKNOWN_LOCATION);
4682 if (dump_enabled_p ())
4683 dump_printf_loc (MSG_NOTE, vect_location,
4684 "transform reduction: created def-use cycle: %G%G",
4685 phi, SSA_NAME_DEF_STMT (def));
4689 /* For cond reductions we want to create a new vector (INDEX_COND_EXPR)
4690 which is updated with the current index of the loop for every match of
4691 the original loop's cond_expr (VEC_STMT). This results in a vector
4692 containing the last time the condition passed for that vector lane.
4693 The first match will be a 1 to allow 0 to be used for non-matching
4694 indexes. If there are no matches at all then the vector will be all
4695 zeroes. */
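  /* As a hypothetical illustration with four lanes (VF == 4): if the
     condition is true only in scalar iterations 1 and 6 (counting from 0),
     the index vector ends up as { 0, 2, 7, 0 } - lane 1 recorded index 2 in
     the first vector iteration and lane 2 recorded index 7 in the second,
     while the other lanes keep the initial 0.  */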
4696 if (STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info) == COND_REDUCTION)
4698 tree indx_before_incr, indx_after_incr;
4699 poly_uint64 nunits_out = TYPE_VECTOR_SUBPARTS (vectype);
4701 gimple *vec_stmt = STMT_VINFO_VEC_STMT (stmt_info)->stmt;
4702 gcc_assert (gimple_assign_rhs_code (vec_stmt) == VEC_COND_EXPR);
4704 int scalar_precision
4705 = GET_MODE_PRECISION (SCALAR_TYPE_MODE (TREE_TYPE (vectype)));
4706 tree cr_index_scalar_type = make_unsigned_type (scalar_precision);
4707 tree cr_index_vector_type = build_vector_type
4708 (cr_index_scalar_type, TYPE_VECTOR_SUBPARTS (vectype));
4710 /* First we create a simple vector induction variable which starts
4711 with the values {1,2,3,...} (SERIES_VECT) and increments by the
4712 vector size (STEP). */
4714 /* Create a {1,2,3,...} vector. */
4715 tree series_vect = build_index_vector (cr_index_vector_type, 1, 1);
4717 /* Create a vector of the step value. */
4718 tree step = build_int_cst (cr_index_scalar_type, nunits_out);
4719 tree vec_step = build_vector_from_val (cr_index_vector_type, step);
4721 /* Create an induction variable. */
4722 gimple_stmt_iterator incr_gsi;
4723 bool insert_after;
4724 standard_iv_increment_position (loop, &incr_gsi, &insert_after);
4725 create_iv (series_vect, vec_step, NULL_TREE, loop, &incr_gsi,
4726 insert_after, &indx_before_incr, &indx_after_incr);
4728 /* Next create a new phi node vector (NEW_PHI_TREE) which starts
4729 filled with zeros (VEC_ZERO). */
4731 /* Create a vector of 0s. */
4732 tree zero = build_zero_cst (cr_index_scalar_type);
4733 tree vec_zero = build_vector_from_val (cr_index_vector_type, zero);
4735 /* Create a vector phi node. */
4736 tree new_phi_tree = make_ssa_name (cr_index_vector_type);
4737 new_phi = create_phi_node (new_phi_tree, loop->header);
4738 loop_vinfo->add_stmt (new_phi);
4739 add_phi_arg (as_a <gphi *> (new_phi), vec_zero,
4740 loop_preheader_edge (loop), UNKNOWN_LOCATION);
4742 /* Now take the condition from the loop's original cond_expr
4743 (VEC_STMT) and produce a new cond_expr (INDEX_COND_EXPR) which for
4744 every match uses values from the induction variable
4745 (INDEX_BEFORE_INCR) otherwise uses values from the phi node
4746 (NEW_PHI_TREE).
4747 Finally, we update the phi (NEW_PHI_TREE) to take the value of
4748 the new cond_expr (INDEX_COND_EXPR). */
4750 /* Duplicate the condition from vec_stmt. */
4751 tree ccompare = unshare_expr (gimple_assign_rhs1 (vec_stmt));
4753 /* Create a conditional, where the condition is taken from vec_stmt
4754 (CCOMPARE), then is the induction index (INDEX_BEFORE_INCR) and
4755 else is the phi (NEW_PHI_TREE). */
4756 tree index_cond_expr = build3 (VEC_COND_EXPR, cr_index_vector_type,
4757 ccompare, indx_before_incr,
4758 new_phi_tree);
4759 induction_index = make_ssa_name (cr_index_vector_type);
4760 gimple *index_condition = gimple_build_assign (induction_index,
4761 index_cond_expr);
4762 gsi_insert_before (&incr_gsi, index_condition, GSI_SAME_STMT);
4763 stmt_vec_info index_vec_info = loop_vinfo->add_stmt (index_condition);
4764 STMT_VINFO_VECTYPE (index_vec_info) = cr_index_vector_type;
4766 /* Update the phi with the vec cond. */
4767 add_phi_arg (as_a <gphi *> (new_phi), induction_index,
4768 loop_latch_edge (loop), UNKNOWN_LOCATION);
4771 /* 2. Create epilog code.
4772 The reduction epilog code operates across the elements of the vector
4773 of partial results computed by the vectorized loop.
4774 The reduction epilog code consists of:
4776 step 1: compute the scalar result in a vector (v_out2)
4777 step 2: extract the scalar result (s_out3) from the vector (v_out2)
4778 step 3: adjust the scalar result (s_out3) if needed.
4780 Step 1 can be accomplished using one of the following three schemes:
4781 (scheme 1) using reduc_fn, if available.
4782 (scheme 2) using whole-vector shifts, if available.
4783 (scheme 3) using a scalar loop. In this case steps 1+2 above are
4784 combined.
4786 The overall epilog code looks like this:
4788 s_out0 = phi <s_loop> # original EXIT_PHI
4789 v_out1 = phi <VECT_DEF> # NEW_EXIT_PHI
4790 v_out2 = reduce <v_out1> # step 1
4791 s_out3 = extract_field <v_out2, 0> # step 2
4792 s_out4 = adjust_result <s_out3> # step 3
4794 (step 3 is optional, and steps 1 and 2 may be combined).
4795 Lastly, the uses of s_out0 are replaced by s_out4. */
4798 /* 2.1 Create new loop-exit-phis to preserve loop-closed form:
4799 v_out1 = phi <VECT_DEF>
4800 Store them in NEW_PHIS. */
4802 exit_bb = single_exit (loop)->dest;
4803 prev_phi_info = NULL;
4804 new_phis.create (vect_defs.length ());
4805 FOR_EACH_VEC_ELT (vect_defs, i, def)
4807 for (j = 0; j < ncopies; j++)
4809 tree new_def = copy_ssa_name (def);
4810 phi = create_phi_node (new_def, exit_bb);
4811 stmt_vec_info phi_info = loop_vinfo->add_stmt (phi);
4812 if (j == 0)
4813 new_phis.quick_push (phi);
4814 else
4816 def = vect_get_vec_def_for_stmt_copy (loop_vinfo, def);
4817 STMT_VINFO_RELATED_STMT (prev_phi_info) = phi_info;
4820 SET_PHI_ARG_DEF (phi, single_exit (loop)->dest_idx, def);
4821 prev_phi_info = phi_info;
4825 /* The epilogue is created for the outer-loop, i.e., for the loop being
4826 vectorized. Create exit phis for the outer loop. */
4827 if (double_reduc)
4829 loop = outer_loop;
4830 exit_bb = single_exit (loop)->dest;
4831 inner_phis.create (vect_defs.length ());
4832 FOR_EACH_VEC_ELT (new_phis, i, phi)
4834 stmt_vec_info phi_info = loop_vinfo->lookup_stmt (phi);
4835 tree new_result = copy_ssa_name (PHI_RESULT (phi));
4836 gphi *outer_phi = create_phi_node (new_result, exit_bb);
4837 SET_PHI_ARG_DEF (outer_phi, single_exit (loop)->dest_idx,
4838 PHI_RESULT (phi));
4839 prev_phi_info = loop_vinfo->add_stmt (outer_phi);
4840 inner_phis.quick_push (phi_info);
4841 new_phis[i] = outer_phi;
4842 while (STMT_VINFO_RELATED_STMT (phi_info))
4844 phi_info = STMT_VINFO_RELATED_STMT (phi_info);
4845 new_result = copy_ssa_name (PHI_RESULT (phi_info->stmt));
4846 outer_phi = create_phi_node (new_result, exit_bb);
4847 SET_PHI_ARG_DEF (outer_phi, single_exit (loop)->dest_idx,
4848 PHI_RESULT (phi_info->stmt));
4849 stmt_vec_info outer_phi_info = loop_vinfo->add_stmt (outer_phi);
4850 STMT_VINFO_RELATED_STMT (prev_phi_info) = outer_phi_info;
4851 prev_phi_info = outer_phi_info;
4856 exit_gsi = gsi_after_labels (exit_bb);
4858 /* 2.2 Get the relevant tree-code to use in the epilog for schemes 2,3
4859 (i.e. when reduc_fn is not available) and in the final adjustment
4860 code (if needed). Also get the original scalar reduction variable as
4861 defined in the loop. In case STMT is a "pattern-stmt" (i.e. - it
4862 represents a reduction pattern), the tree-code and scalar-def are
4863 taken from the original stmt that the pattern-stmt (STMT) replaces.
4864 Otherwise (it is a regular reduction) - the tree-code and scalar-def
4865 are taken from STMT. */
4867 stmt_vec_info orig_stmt_info = vect_orig_stmt (stmt_info);
4868 if (orig_stmt_info != stmt_info)
4870 /* Reduction pattern */
4871 gcc_assert (STMT_VINFO_IN_PATTERN_P (orig_stmt_info));
4872 gcc_assert (STMT_VINFO_RELATED_STMT (orig_stmt_info) == stmt_info);
4875 code = gimple_assign_rhs_code (orig_stmt_info->stmt);
4876 /* For MINUS_EXPR the initial vector is [init_val,0,...,0], therefore,
4877 partial results are added and not subtracted. */
4878 if (code == MINUS_EXPR)
4879 code = PLUS_EXPR;
4881 scalar_dest = gimple_assign_lhs (orig_stmt_info->stmt);
4882 scalar_type = TREE_TYPE (scalar_dest);
4883 scalar_results.create (group_size);
4884 new_scalar_dest = vect_create_destination_var (scalar_dest, NULL);
4885 bitsize = TYPE_SIZE (scalar_type);
4887 /* In case this is a reduction in an inner-loop while vectorizing an outer
4888 loop - we don't need to extract a single scalar result at the end of the
4889 inner-loop (unless it is double reduction, i.e., the use of reduction is
4890 outside the outer-loop). The final vector of partial results will be used
4891 in the vectorized outer-loop, or reduced to a scalar result at the end of
4892 the outer-loop. */
4893 if (nested_in_vect_loop && !double_reduc)
4894 goto vect_finalize_reduction;
4896 /* SLP reduction without reduction chain, e.g.,
4897 # a1 = phi <a2, a0>
4898 # b1 = phi <b2, b0>
4899 a2 = operation (a1)
4900 b2 = operation (b1) */
4901 slp_reduc = (slp_node && !REDUC_GROUP_FIRST_ELEMENT (stmt_info));
4903 /* True if we should implement SLP_REDUC using native reduction operations
4904 instead of scalar operations. */
4905 direct_slp_reduc = (reduc_fn != IFN_LAST
4906 && slp_reduc
4907 && !TYPE_VECTOR_SUBPARTS (vectype).is_constant ());
4909 /* In case of reduction chain, e.g.,
4910 # a1 = phi <a3, a0>
4911 a2 = operation (a1)
4912 a3 = operation (a2),
4914 we may end up with more than one vector result. Here we reduce them to
4915 one vector. */
4916 if (REDUC_GROUP_FIRST_ELEMENT (stmt_info) || direct_slp_reduc)
4918 tree first_vect = PHI_RESULT (new_phis[0]);
4919 gassign *new_vec_stmt = NULL;
4920 vec_dest = vect_create_destination_var (scalar_dest, vectype);
4921 for (k = 1; k < new_phis.length (); k++)
4923 gimple *next_phi = new_phis[k];
4924 tree second_vect = PHI_RESULT (next_phi);
4925 tree tem = make_ssa_name (vec_dest, new_vec_stmt);
4926 new_vec_stmt = gimple_build_assign (tem, code,
4927 first_vect, second_vect);
4928 gsi_insert_before (&exit_gsi, new_vec_stmt, GSI_SAME_STMT);
4929 first_vect = tem;
4932 new_phi_result = first_vect;
4933 if (new_vec_stmt)
4935 new_phis.truncate (0);
4936 new_phis.safe_push (new_vec_stmt);
4939 /* Likewise if we couldn't use a single def-use cycle. */
4940 else if (ncopies > 1)
4942 gcc_assert (new_phis.length () == 1);
4943 tree first_vect = PHI_RESULT (new_phis[0]);
4944 gassign *new_vec_stmt = NULL;
4945 vec_dest = vect_create_destination_var (scalar_dest, vectype);
4946 stmt_vec_info next_phi_info = loop_vinfo->lookup_stmt (new_phis[0]);
4947 for (int k = 1; k < ncopies; ++k)
4949 next_phi_info = STMT_VINFO_RELATED_STMT (next_phi_info);
4950 tree second_vect = PHI_RESULT (next_phi_info->stmt);
4951 tree tem = make_ssa_name (vec_dest, new_vec_stmt);
4952 new_vec_stmt = gimple_build_assign (tem, code,
4953 first_vect, second_vect);
4954 gsi_insert_before (&exit_gsi, new_vec_stmt, GSI_SAME_STMT);
4955 first_vect = tem;
4957 new_phi_result = first_vect;
4958 new_phis.truncate (0);
4959 new_phis.safe_push (new_vec_stmt);
4961 else
4962 new_phi_result = PHI_RESULT (new_phis[0]);
4964 if (STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info) == COND_REDUCTION
4965 && reduc_fn != IFN_LAST)
4967 /* For condition reductions, we have a vector (NEW_PHI_RESULT) containing
4968 various data values where the condition matched and another vector
4969 (INDUCTION_INDEX) containing all the indexes of those matches. We
4970 need to extract the last matching index (which will be the index with
4971 highest value) and use this to index into the data vector.
4972 For the case where there were no matches, the data vector will contain
4973 all default values and the index vector will be all zeros. */
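      /* Continuing the hypothetical four-lane illustration from the
	 induction-index comment above: with INDUCTION_INDEX = { 0, 2, 7, 0 }
	 the REDUC_MAX below yields 7, the comparison against { 7, 7, 7, 7 }
	 is true only for lane 2, so the VEC_COND keeps only that lane's data
	 value and the final unsigned REDUC_MAX extracts it.  */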
4975 /* Get various versions of the type of the vector of indexes. */
4976 tree index_vec_type = TREE_TYPE (induction_index);
4977 gcc_checking_assert (TYPE_UNSIGNED (index_vec_type));
4978 tree index_scalar_type = TREE_TYPE (index_vec_type);
4979 tree index_vec_cmp_type = build_same_sized_truth_vector_type
4980 (index_vec_type);
4982 /* Get an unsigned integer version of the type of the data vector. */
4983 int scalar_precision
4984 = GET_MODE_PRECISION (SCALAR_TYPE_MODE (scalar_type));
4985 tree scalar_type_unsigned = make_unsigned_type (scalar_precision);
4986 tree vectype_unsigned = build_vector_type
4987 (scalar_type_unsigned, TYPE_VECTOR_SUBPARTS (vectype));
4989 /* First we need to create a vector (ZERO_VEC) of zeros and another
4990 vector (MAX_INDEX_VEC) filled with the last matching index, which we
4991 can create using a MAX reduction and then expanding.
4992 In the case where the loop never made any matches, the max index will
4993 be zero. */
4995 /* Vector of {0, 0, 0,...}. */
4996 tree zero_vec = make_ssa_name (vectype);
4997 tree zero_vec_rhs = build_zero_cst (vectype);
4998 gimple *zero_vec_stmt = gimple_build_assign (zero_vec, zero_vec_rhs);
4999 gsi_insert_before (&exit_gsi, zero_vec_stmt, GSI_SAME_STMT);
5001 /* Find maximum value from the vector of found indexes. */
5002 tree max_index = make_ssa_name (index_scalar_type);
5003 gcall *max_index_stmt = gimple_build_call_internal (IFN_REDUC_MAX,
5004 1, induction_index);
5005 gimple_call_set_lhs (max_index_stmt, max_index);
5006 gsi_insert_before (&exit_gsi, max_index_stmt, GSI_SAME_STMT);
5008 /* Vector of {max_index, max_index, max_index,...}. */
5009 tree max_index_vec = make_ssa_name (index_vec_type);
5010 tree max_index_vec_rhs = build_vector_from_val (index_vec_type,
5011 max_index);
5012 gimple *max_index_vec_stmt = gimple_build_assign (max_index_vec,
5013 max_index_vec_rhs);
5014 gsi_insert_before (&exit_gsi, max_index_vec_stmt, GSI_SAME_STMT);
5016 /* Next we compare the new vector (MAX_INDEX_VEC) full of max indexes
5017 with the vector (INDUCTION_INDEX) of found indexes, choosing values
5018 from the data vector (NEW_PHI_RESULT) for matches, 0 (ZERO_VEC)
5019 otherwise. Only one value should match, resulting in a vector
5020 (VEC_COND) with one data value and the rest zeros.
5021 In the case where the loop never made any matches, every index will
5022 match, resulting in a vector with all data values (which will all be
5023 the default value). */
5025 /* Compare the max index vector to the vector of found indexes to find
5026 the position of the max value. */
5027 tree vec_compare = make_ssa_name (index_vec_cmp_type);
5028 gimple *vec_compare_stmt = gimple_build_assign (vec_compare, EQ_EXPR,
5029 induction_index,
5030 max_index_vec);
5031 gsi_insert_before (&exit_gsi, vec_compare_stmt, GSI_SAME_STMT);
5033 /* Use the compare to choose either values from the data vector or
5034 zero. */
5035 tree vec_cond = make_ssa_name (vectype);
5036 gimple *vec_cond_stmt = gimple_build_assign (vec_cond, VEC_COND_EXPR,
5037 vec_compare, new_phi_result,
5038 zero_vec);
5039 gsi_insert_before (&exit_gsi, vec_cond_stmt, GSI_SAME_STMT);
5041 /* Finally we need to extract the data value from the vector (VEC_COND)
5042 into a scalar (MATCHED_DATA_REDUC). Logically we want to do an OR
5043 reduction, but because this doesn't exist, we can use a MAX reduction
5044 instead. The data value might be signed or a float so we need to cast
5045 it first.
5046 In the case where the loop never made any matches, the data values are
5047 all identical, and so will reduce down correctly. */
5049 /* Make the matched data values unsigned. */
5050 tree vec_cond_cast = make_ssa_name (vectype_unsigned);
5051 tree vec_cond_cast_rhs = build1 (VIEW_CONVERT_EXPR, vectype_unsigned,
5052 vec_cond);
5053 gimple *vec_cond_cast_stmt = gimple_build_assign (vec_cond_cast,
5054 VIEW_CONVERT_EXPR,
5055 vec_cond_cast_rhs);
5056 gsi_insert_before (&exit_gsi, vec_cond_cast_stmt, GSI_SAME_STMT);
5058 /* Reduce down to a scalar value. */
5059 tree data_reduc = make_ssa_name (scalar_type_unsigned);
5060 gcall *data_reduc_stmt = gimple_build_call_internal (IFN_REDUC_MAX,
5061 1, vec_cond_cast);
5062 gimple_call_set_lhs (data_reduc_stmt, data_reduc);
5063 gsi_insert_before (&exit_gsi, data_reduc_stmt, GSI_SAME_STMT);
5065 /* Convert the reduced value back to the result type and set as the
5066 result. */
5067 gimple_seq stmts = NULL;
5068 new_temp = gimple_build (&stmts, VIEW_CONVERT_EXPR, scalar_type,
5069 data_reduc);
5070 gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
5071 scalar_results.safe_push (new_temp);
5073 else if (STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info) == COND_REDUCTION
5074 && reduc_fn == IFN_LAST)
5076 /* Condition reduction without supported IFN_REDUC_MAX. Generate
5077 idx = 0;
5078 idx_val = induction_index[0];
5079 val = data_reduc[0];
5080 for (idx = 0, val = init, i = 0; i < nelts; ++i)
5081 if (induction_index[i] > idx_val)
5082 val = data_reduc[i], idx_val = induction_index[i];
5083 return val; */
5085 tree data_eltype = TREE_TYPE (TREE_TYPE (new_phi_result));
5086 tree idx_eltype = TREE_TYPE (TREE_TYPE (induction_index));
5087 unsigned HOST_WIDE_INT el_size = tree_to_uhwi (TYPE_SIZE (idx_eltype));
5088 poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (TREE_TYPE (induction_index));
5089 /* Enforced by vectorizable_reduction, which ensures we have target
5090 support before allowing a conditional reduction on variable-length
5091 vectors. */
5092 unsigned HOST_WIDE_INT v_size = el_size * nunits.to_constant ();
5093 tree idx_val = NULL_TREE, val = NULL_TREE;
5094 for (unsigned HOST_WIDE_INT off = 0; off < v_size; off += el_size)
5096 tree old_idx_val = idx_val;
5097 tree old_val = val;
5098 idx_val = make_ssa_name (idx_eltype);
5099 epilog_stmt = gimple_build_assign (idx_val, BIT_FIELD_REF,
5100 build3 (BIT_FIELD_REF, idx_eltype,
5101 induction_index,
5102 bitsize_int (el_size),
5103 bitsize_int (off)));
5104 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5105 val = make_ssa_name (data_eltype);
5106 epilog_stmt = gimple_build_assign (val, BIT_FIELD_REF,
5107 build3 (BIT_FIELD_REF,
5108 data_eltype,
5109 new_phi_result,
5110 bitsize_int (el_size),
5111 bitsize_int (off)));
5112 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5113 if (off != 0)
5115 tree new_idx_val = idx_val;
5116 tree new_val = val;
5117 if (off != v_size - el_size)
5119 new_idx_val = make_ssa_name (idx_eltype);
5120 epilog_stmt = gimple_build_assign (new_idx_val,
5121 MAX_EXPR, idx_val,
5122 old_idx_val);
5123 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5125 new_val = make_ssa_name (data_eltype);
5126 epilog_stmt = gimple_build_assign (new_val,
5127 COND_EXPR,
5128 build2 (GT_EXPR,
5129 boolean_type_node,
5130 idx_val,
5131 old_idx_val),
5132 val, old_val);
5133 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5134 idx_val = new_idx_val;
5135 val = new_val;
5138 /* Convert the reduced value back to the result type and set as the
5139 result. */
5140 gimple_seq stmts = NULL;
5141 val = gimple_convert (&stmts, scalar_type, val);
5142 gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
5143 scalar_results.safe_push (val);
5146 /* 2.3 Create the reduction code, using one of the three schemes described
5147 above. In SLP we simply need to extract all the elements from the
5148 vector (without reducing them), so we use scalar shifts. */
5149 else if (reduc_fn != IFN_LAST && !slp_reduc)
5151 tree tmp;
5152 tree vec_elem_type;
5154 /* Case 1: Create:
5155 v_out2 = reduc_expr <v_out1> */
5157 if (dump_enabled_p ())
5158 dump_printf_loc (MSG_NOTE, vect_location,
5159 "Reduce using direct vector reduction.\n");
5161 vec_elem_type = TREE_TYPE (TREE_TYPE (new_phi_result));
5162 if (!useless_type_conversion_p (scalar_type, vec_elem_type))
5164 tree tmp_dest
5165 = vect_create_destination_var (scalar_dest, vec_elem_type);
5166 epilog_stmt = gimple_build_call_internal (reduc_fn, 1,
5167 new_phi_result);
5168 gimple_set_lhs (epilog_stmt, tmp_dest);
5169 new_temp = make_ssa_name (tmp_dest, epilog_stmt);
5170 gimple_set_lhs (epilog_stmt, new_temp);
5171 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5173 epilog_stmt = gimple_build_assign (new_scalar_dest, NOP_EXPR,
5174 new_temp);
5176 else
5178 epilog_stmt = gimple_build_call_internal (reduc_fn, 1,
5179 new_phi_result);
5180 gimple_set_lhs (epilog_stmt, new_scalar_dest);
5183 new_temp = make_ssa_name (new_scalar_dest, epilog_stmt);
5184 gimple_set_lhs (epilog_stmt, new_temp);
5185 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5187 if ((STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info)
5188 == INTEGER_INDUC_COND_REDUCTION)
5189 && !operand_equal_p (initial_def, induc_val, 0))
5191 /* Earlier we set the initial value to be a vector of induc_val
5192 values. Check the result; if it is induc_val, replace it
5193 with the original initial value, unless induc_val is
5194 the same as initial_def already. */
5195 tree zcompare = build2 (EQ_EXPR, boolean_type_node, new_temp,
5196 induc_val);
5198 tmp = make_ssa_name (new_scalar_dest);
5199 epilog_stmt = gimple_build_assign (tmp, COND_EXPR, zcompare,
5200 initial_def, new_temp);
5201 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5202 new_temp = tmp;
5205 scalar_results.safe_push (new_temp);
5207 else if (direct_slp_reduc)
5209 /* Here we create one vector for each of the REDUC_GROUP_SIZE results,
5210 with the elements for other SLP statements replaced with the
5211 neutral value. We can then do a normal reduction on each vector. */
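      /* For instance, with REDUC_GROUP_SIZE == 2 the masked index vector
	 built below alternates { 0, 1, 0, 1, ... }; for SLP statement 0 we
	 keep the lanes whose index is 0 and replace the others with the
	 neutral value, and likewise with index 1 for SLP statement 1.  */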
5213 /* Enforced by vectorizable_reduction. */
5214 gcc_assert (new_phis.length () == 1);
5215 gcc_assert (pow2p_hwi (group_size));
5217 slp_tree orig_phis_slp_node = slp_node_instance->reduc_phis;
5218 vec<stmt_vec_info> orig_phis
5219 = SLP_TREE_SCALAR_STMTS (orig_phis_slp_node);
5220 gimple_seq seq = NULL;
5222 /* Build a vector {0, 1, 2, ...}, with the same number of elements
5223 and the same element size as VECTYPE. */
5224 tree index = build_index_vector (vectype, 0, 1);
5225 tree index_type = TREE_TYPE (index);
5226 tree index_elt_type = TREE_TYPE (index_type);
5227 tree mask_type = build_same_sized_truth_vector_type (index_type);
5229 /* Create a vector that, for each element, identifies which of
5230 the REDUC_GROUP_SIZE results should use it. */
5231 tree index_mask = build_int_cst (index_elt_type, group_size - 1);
5232 index = gimple_build (&seq, BIT_AND_EXPR, index_type, index,
5233 build_vector_from_val (index_type, index_mask));
5235 /* Get a neutral vector value. This is simply a splat of the neutral
5236 scalar value if we have one, otherwise the initial scalar value
5237 is itself a neutral value. */
5238 tree vector_identity = NULL_TREE;
5239 if (neutral_op)
5240 vector_identity = gimple_build_vector_from_val (&seq, vectype,
5241 neutral_op);
5242 for (unsigned int i = 0; i < group_size; ++i)
5244 /* If there's no universal neutral value, we can use the
5245 initial scalar value from the original PHI. This is used
5246 for MIN and MAX reduction, for example. */
5247 if (!neutral_op)
5249 tree scalar_value
5250 = PHI_ARG_DEF_FROM_EDGE (orig_phis[i]->stmt,
5251 loop_preheader_edge (loop));
5252 vector_identity = gimple_build_vector_from_val (&seq, vectype,
5253 scalar_value);
5256 /* Calculate the equivalent of:
5258 sel[j] = (index[j] == i);
5260 which selects the elements of NEW_PHI_RESULT that should
5261 be included in the result. */
5262 tree compare_val = build_int_cst (index_elt_type, i);
5263 compare_val = build_vector_from_val (index_type, compare_val);
5264 tree sel = gimple_build (&seq, EQ_EXPR, mask_type,
5265 index, compare_val);
5267 /* Calculate the equivalent of:
5269 vec = sel ? new_phi_result : vector_identity;
5271 VEC is now suitable for a full vector reduction. */
5272 tree vec = gimple_build (&seq, VEC_COND_EXPR, vectype,
5273 sel, new_phi_result, vector_identity);
5275 /* Do the reduction and convert it to the appropriate type. */
5276 tree scalar = gimple_build (&seq, as_combined_fn (reduc_fn),
5277 TREE_TYPE (vectype), vec);
5278 scalar = gimple_convert (&seq, scalar_type, scalar);
5279 scalar_results.safe_push (scalar);
5281 gsi_insert_seq_before (&exit_gsi, seq, GSI_SAME_STMT);
5283 else
5285 bool reduce_with_shift;
5286 tree vec_temp;
5288 /* COND reductions all do the final reduction with MAX_EXPR
5289 or MIN_EXPR. */
5290 if (code == COND_EXPR)
5292 if (STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info)
5293 == INTEGER_INDUC_COND_REDUCTION)
5294 code = induc_code;
5295 else if (STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info)
5296 == CONST_COND_REDUCTION)
5297 code = STMT_VINFO_VEC_CONST_COND_REDUC_CODE (stmt_info);
5298 else
5299 code = MAX_EXPR;
5302 /* See if the target wants to do the final (shift) reduction
5303 in a vector mode of smaller size and first reduce upper/lower
5304 halves against each other. */
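/* Sketch of the idea: if the target chooses to split a V8SI reduction
   into V4SI, the loop below first combines the low and high V4SI halves
   with CODE (e.g. lo + hi for a sum) and then the usual shift or scalar
   reduction runs on the narrower V4SI value.  Whether to split at all is
   the target's call via targetm.vectorize.split_reduction.  */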
5305 enum machine_mode mode1 = mode;
5306 tree vectype1 = vectype;
5307 unsigned sz = tree_to_uhwi (TYPE_SIZE_UNIT (vectype));
5308 unsigned sz1 = sz;
5309 if (!slp_reduc
5310 && (mode1 = targetm.vectorize.split_reduction (mode)) != mode)
5311 sz1 = GET_MODE_SIZE (mode1).to_constant ();
5313 vectype1 = get_vectype_for_scalar_type_and_size (scalar_type, sz1);
5314 reduce_with_shift = have_whole_vector_shift (mode1);
5315 if (!VECTOR_MODE_P (mode1))
5316 reduce_with_shift = false;
5317 else
5319 optab optab = optab_for_tree_code (code, vectype1, optab_default);
5320 if (optab_handler (optab, mode1) == CODE_FOR_nothing)
5321 reduce_with_shift = false;
5324 /* First reduce the vector to the desired vector size we should
5325 do shift reduction on by combining upper and lower halves. */
5326 new_temp = new_phi_result;
5327 while (sz > sz1)
5329 gcc_assert (!slp_reduc);
5330 sz /= 2;
5331 vectype1 = get_vectype_for_scalar_type_and_size (scalar_type, sz);
5333 /* The target has to make sure we support lowpart/highpart
5334 extraction, either via direct vector extract or through
5335 an integer mode punning. */
5336 tree dst1, dst2;
5337 if (convert_optab_handler (vec_extract_optab,
5338 TYPE_MODE (TREE_TYPE (new_temp)),
5339 TYPE_MODE (vectype1))
5340 != CODE_FOR_nothing)
5342 /* Extract sub-vectors directly once vec_extract becomes
5343 a conversion optab. */
5344 dst1 = make_ssa_name (vectype1);
5345 epilog_stmt
5346 = gimple_build_assign (dst1, BIT_FIELD_REF,
5347 build3 (BIT_FIELD_REF, vectype1,
5348 new_temp, TYPE_SIZE (vectype1),
5349 bitsize_int (0)));
5350 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5351 dst2 = make_ssa_name (vectype1);
5352 epilog_stmt
5353 = gimple_build_assign (dst2, BIT_FIELD_REF,
5354 build3 (BIT_FIELD_REF, vectype1,
5355 new_temp, TYPE_SIZE (vectype1),
5356 bitsize_int (sz * BITS_PER_UNIT)));
5357 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5359 else
5361 /* Extract via punning to appropriately sized integer mode
5362 vector. */
5363 tree eltype = build_nonstandard_integer_type (sz * BITS_PER_UNIT, 1);
5365 tree etype = build_vector_type (eltype, 2);
5366 gcc_assert (convert_optab_handler (vec_extract_optab,
5367 TYPE_MODE (etype),
5368 TYPE_MODE (eltype))
5369 != CODE_FOR_nothing);
5370 tree tem = make_ssa_name (etype);
5371 epilog_stmt = gimple_build_assign (tem, VIEW_CONVERT_EXPR,
5372 build1 (VIEW_CONVERT_EXPR,
5373 etype, new_temp));
5374 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5375 new_temp = tem;
5376 tem = make_ssa_name (eltype);
5377 epilog_stmt
5378 = gimple_build_assign (tem, BIT_FIELD_REF,
5379 build3 (BIT_FIELD_REF, eltype,
5380 new_temp, TYPE_SIZE (eltype),
5381 bitsize_int (0)));
5382 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5383 dst1 = make_ssa_name (vectype1);
5384 epilog_stmt = gimple_build_assign (dst1, VIEW_CONVERT_EXPR,
5385 build1 (VIEW_CONVERT_EXPR,
5386 vectype1, tem));
5387 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5388 tem = make_ssa_name (eltype);
5389 epilog_stmt
5390 = gimple_build_assign (tem, BIT_FIELD_REF,
5391 build3 (BIT_FIELD_REF, eltype,
5392 new_temp, TYPE_SIZE (eltype),
5393 bitsize_int (sz * BITS_PER_UNIT)));
5394 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5395 dst2 = make_ssa_name (vectype1);
5396 epilog_stmt = gimple_build_assign (dst2, VIEW_CONVERT_EXPR,
5397 build1 (VIEW_CONVERT_EXPR,
5398 vectype1, tem));
5399 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5402 new_temp = make_ssa_name (vectype1);
5403 epilog_stmt = gimple_build_assign (new_temp, code, dst1, dst2);
5404 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5407 if (reduce_with_shift && !slp_reduc)
5409 int element_bitsize = tree_to_uhwi (bitsize);
5410 /* Enforced by vectorizable_reduction, which disallows SLP reductions
5411 for variable-length vectors and also requires direct target support
5412 for loop reductions. */
5413 int vec_size_in_bits = tree_to_uhwi (TYPE_SIZE (vectype1));
5414 int nelements = vec_size_in_bits / element_bitsize;
5415 vec_perm_builder sel;
5416 vec_perm_indices indices;
5418 int elt_offset;
5420 tree zero_vec = build_zero_cst (vectype1);
5421 /* Case 2: Create:
5422 for (offset = nelements/2; offset >= 1; offset/=2)
5424 Create: va' = vec_shift <va, offset>
5425 Create: va = vop <va, va'>
5426 } */
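/* A concrete sketch for a V4SI sum with va = {a0, a1, a2, a3}:

     offset 2:  va' = vec_shift <va, 2>  ->  {a2, a3, 0, 0}
                va  = va + va'           ->  {a0+a2, a1+a3, a2, a3}
     offset 1:  va' = vec_shift <va, 1>  ->  {a1+a3, a2, a3, 0}
                va  = va + va'           ->  {a0+a1+a2+a3, ...}

   so element 0 ends up holding the scalar result that step 2.4 extracts;
   the remaining lanes hold irrelevant partial values.  */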
5428 tree rhs;
5430 if (dump_enabled_p ())
5431 dump_printf_loc (MSG_NOTE, vect_location,
5432 "Reduce using vector shifts\n");
5434 mode1 = TYPE_MODE (vectype1);
5435 vec_dest = vect_create_destination_var (scalar_dest, vectype1);
5436 for (elt_offset = nelements / 2;
5437 elt_offset >= 1;
5438 elt_offset /= 2)
5440 calc_vec_perm_mask_for_shift (elt_offset, nelements, &sel);
5441 indices.new_vector (sel, 2, nelements);
5442 tree mask = vect_gen_perm_mask_any (vectype1, indices);
5443 epilog_stmt = gimple_build_assign (vec_dest, VEC_PERM_EXPR,
5444 new_temp, zero_vec, mask);
5445 new_name = make_ssa_name (vec_dest, epilog_stmt);
5446 gimple_assign_set_lhs (epilog_stmt, new_name);
5447 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5449 epilog_stmt = gimple_build_assign (vec_dest, code, new_name,
5450 new_temp);
5451 new_temp = make_ssa_name (vec_dest, epilog_stmt);
5452 gimple_assign_set_lhs (epilog_stmt, new_temp);
5453 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5456 /* 2.4 Extract the final scalar result. Create:
5457 s_out3 = extract_field <v_out2, bitpos> */
5459 if (dump_enabled_p ())
5460 dump_printf_loc (MSG_NOTE, vect_location,
5461 "extract scalar result\n");
5463 rhs = build3 (BIT_FIELD_REF, scalar_type, new_temp,
5464 bitsize, bitsize_zero_node);
5465 epilog_stmt = gimple_build_assign (new_scalar_dest, rhs);
5466 new_temp = make_ssa_name (new_scalar_dest, epilog_stmt);
5467 gimple_assign_set_lhs (epilog_stmt, new_temp);
5468 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5469 scalar_results.safe_push (new_temp);
5471 else
5473 /* Case 3: Create:
5474 s = extract_field <v_out2, 0>
5475 for (offset = element_size;
5476 offset < vector_size;
5477 offset += element_size;)
5479 Create: s' = extract_field <v_out2, offset>
5480 Create: s = op <s, s'> // For non SLP cases
5481 } */
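/* Sketch for a V4SI sum v_out2 = {a0, a1, a2, a3} in the non-SLP case
   (bit offsets assume 32-bit elements):

     s  = extract_field <v_out2, 0>                  // a0
     s' = extract_field <v_out2, 32>;  s = s + s';   // a0+a1
     s' = extract_field <v_out2, 64>;  s = s + s';   // a0+a1+a2
     s' = extract_field <v_out2, 96>;  s = s + s';   // a0+a1+a2+a3

   In the SLP case the extracted values are pushed into SCALAR_RESULTS
   without being combined here.  */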
5483 if (dump_enabled_p ())
5484 dump_printf_loc (MSG_NOTE, vect_location,
5485 "Reduce using scalar code.\n");
5487 int vec_size_in_bits = tree_to_uhwi (TYPE_SIZE (vectype1));
5488 int element_bitsize = tree_to_uhwi (bitsize);
5489 FOR_EACH_VEC_ELT (new_phis, i, new_phi)
5491 int bit_offset;
5492 if (gimple_code (new_phi) == GIMPLE_PHI)
5493 vec_temp = PHI_RESULT (new_phi);
5494 else
5495 vec_temp = gimple_assign_lhs (new_phi);
5496 tree rhs = build3 (BIT_FIELD_REF, scalar_type, vec_temp, bitsize,
5497 bitsize_zero_node);
5498 epilog_stmt = gimple_build_assign (new_scalar_dest, rhs);
5499 new_temp = make_ssa_name (new_scalar_dest, epilog_stmt);
5500 gimple_assign_set_lhs (epilog_stmt, new_temp);
5501 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5503 /* In SLP we don't need to apply reduction operation, so we just
5504 collect s' values in SCALAR_RESULTS. */
5505 if (slp_reduc)
5506 scalar_results.safe_push (new_temp);
5508 for (bit_offset = element_bitsize;
5509 bit_offset < vec_size_in_bits;
5510 bit_offset += element_bitsize)
5512 tree bitpos = bitsize_int (bit_offset);
5513 tree rhs = build3 (BIT_FIELD_REF, scalar_type, vec_temp,
5514 bitsize, bitpos);
5516 epilog_stmt = gimple_build_assign (new_scalar_dest, rhs);
5517 new_name = make_ssa_name (new_scalar_dest, epilog_stmt);
5518 gimple_assign_set_lhs (epilog_stmt, new_name);
5519 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5521 if (slp_reduc)
5523 /* In SLP we don't need to apply reduction operation, so
5524 we just collect s' values in SCALAR_RESULTS. */
5525 new_temp = new_name;
5526 scalar_results.safe_push (new_name);
5528 else
5530 epilog_stmt = gimple_build_assign (new_scalar_dest, code,
5531 new_name, new_temp);
5532 new_temp = make_ssa_name (new_scalar_dest, epilog_stmt);
5533 gimple_assign_set_lhs (epilog_stmt, new_temp);
5534 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5539 /* The only case where we need to reduce scalar results in SLP is
5540 unrolling. If the size of SCALAR_RESULTS is greater than
5541 REDUC_GROUP_SIZE, we reduce them combining elements modulo
5542 REDUC_GROUP_SIZE. */
5543 if (slp_reduc)
5545 tree res, first_res, new_res;
5546 gimple *new_stmt;
5548 /* Reduce multiple scalar results in case of SLP unrolling. */
5549 for (j = group_size; scalar_results.iterate (j, &res);
5550 j++)
5552 first_res = scalar_results[j % group_size];
5553 new_stmt = gimple_build_assign (new_scalar_dest, code,
5554 first_res, res);
5555 new_res = make_ssa_name (new_scalar_dest, new_stmt);
5556 gimple_assign_set_lhs (new_stmt, new_res);
5557 gsi_insert_before (&exit_gsi, new_stmt, GSI_SAME_STMT);
5558 scalar_results[j % group_size] = new_res;
5561 else
5562 /* Not SLP - we have one scalar to keep in SCALAR_RESULTS. */
5563 scalar_results.safe_push (new_temp);
5566 if ((STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info)
5567 == INTEGER_INDUC_COND_REDUCTION)
5568 && !operand_equal_p (initial_def, induc_val, 0))
5570 /* Earlier we set the initial value to be a vector of induc_val
5571 values. Check the result and if it is induc_val then replace
5572 with the original initial value, unless induc_val is
5573 the same as initial_def already. */
5574 tree zcompare = build2 (EQ_EXPR, boolean_type_node, new_temp,
5575 induc_val);
5577 tree tmp = make_ssa_name (new_scalar_dest);
5578 epilog_stmt = gimple_build_assign (tmp, COND_EXPR, zcompare,
5579 initial_def, new_temp);
5580 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5581 scalar_results[0] = tmp;
5585 vect_finalize_reduction:
5587 if (double_reduc)
5588 loop = loop->inner;
5590 /* 2.5 Adjust the final result by the initial value of the reduction
5591 variable. (When such adjustment is not needed, then
5592 'adjustment_def' is zero). For example, if code is PLUS we create:
5593 new_temp = loop_exit_def + adjustment_def */
5595 if (adjustment_def)
5597 gcc_assert (!slp_reduc);
5598 if (nested_in_vect_loop)
5600 new_phi = new_phis[0];
5601 gcc_assert (TREE_CODE (TREE_TYPE (adjustment_def)) == VECTOR_TYPE);
5602 expr = build2 (code, vectype, PHI_RESULT (new_phi), adjustment_def);
5603 new_dest = vect_create_destination_var (scalar_dest, vectype);
5605 else
5607 new_temp = scalar_results[0];
5608 gcc_assert (TREE_CODE (TREE_TYPE (adjustment_def)) != VECTOR_TYPE);
5609 expr = build2 (code, scalar_type, new_temp, adjustment_def);
5610 new_dest = vect_create_destination_var (scalar_dest, scalar_type);
5613 epilog_stmt = gimple_build_assign (new_dest, expr);
5614 new_temp = make_ssa_name (new_dest, epilog_stmt);
5615 gimple_assign_set_lhs (epilog_stmt, new_temp);
5616 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5617 if (nested_in_vect_loop)
5619 stmt_vec_info epilog_stmt_info = loop_vinfo->add_stmt (epilog_stmt);
5620 STMT_VINFO_RELATED_STMT (epilog_stmt_info)
5621 = STMT_VINFO_RELATED_STMT (loop_vinfo->lookup_stmt (new_phi));
5623 if (!double_reduc)
5624 scalar_results.quick_push (new_temp);
5625 else
5626 scalar_results[0] = new_temp;
5628 else
5629 scalar_results[0] = new_temp;
5631 new_phis[0] = epilog_stmt;
5634 /* 2.6 Handle the loop-exit phis. Replace the uses of scalar loop-exit
5635 phis with new adjusted scalar results, i.e., replace use <s_out0>
5636 with use <s_out4>.
5638 Transform:
5639 loop_exit:
5640 s_out0 = phi <s_loop> # (scalar) EXIT_PHI
5641 v_out1 = phi <VECT_DEF> # NEW_EXIT_PHI
5642 v_out2 = reduce <v_out1>
5643 s_out3 = extract_field <v_out2, 0>
5644 s_out4 = adjust_result <s_out3>
5645 use <s_out0>
5646 use <s_out0>
5648 into:
5650 loop_exit:
5651 s_out0 = phi <s_loop> # (scalar) EXIT_PHI
5652 v_out1 = phi <VECT_DEF> # NEW_EXIT_PHI
5653 v_out2 = reduce <v_out1>
5654 s_out3 = extract_field <v_out2, 0>
5655 s_out4 = adjust_result <s_out3>
5656 use <s_out4>
5657 use <s_out4> */
5660 /* In SLP reduction chain we reduce vector results into one vector if
5661 necessary, hence we set here REDUC_GROUP_SIZE to 1. SCALAR_DEST is the
5662 LHS of the last stmt in the reduction chain, since we are looking for
5663 the loop exit phi node. */
5664 if (REDUC_GROUP_FIRST_ELEMENT (stmt_info))
5666 stmt_vec_info dest_stmt_info
5667 = vect_orig_stmt (SLP_TREE_SCALAR_STMTS (slp_node)[group_size - 1]);
5668 scalar_dest = gimple_assign_lhs (dest_stmt_info->stmt);
5669 group_size = 1;
5672 /* In SLP we may have several statements in NEW_PHIS and REDUCTION_PHIS (in
5673 case that REDUC_GROUP_SIZE is greater than vectorization factor).
5674 Therefore, we need to match SCALAR_RESULTS with corresponding statements.
5675 The first (REDUC_GROUP_SIZE / number of new vector stmts) scalar results
5676 correspond to the first vector stmt, etc.
5677 (RATIO is equal to (REDUC_GROUP_SIZE / number of new vector stmts)). */
5678 if (group_size > new_phis.length ())
5680 ratio = group_size / new_phis.length ();
5681 gcc_assert (!(group_size % new_phis.length ()));
5683 else
5684 ratio = 1;
5686 stmt_vec_info epilog_stmt_info = NULL;
5687 for (k = 0; k < group_size; k++)
5689 if (k % ratio == 0)
5691 epilog_stmt_info = loop_vinfo->lookup_stmt (new_phis[k / ratio]);
5692 reduction_phi_info = reduction_phis[k / ratio];
5693 if (double_reduc)
5694 inner_phi = inner_phis[k / ratio];
5697 if (slp_reduc)
5699 stmt_vec_info scalar_stmt_info = SLP_TREE_SCALAR_STMTS (slp_node)[k];
5701 orig_stmt_info = STMT_VINFO_RELATED_STMT (scalar_stmt_info);
5702 /* SLP statements can't participate in patterns. */
5703 gcc_assert (!orig_stmt_info);
5704 scalar_dest = gimple_assign_lhs (scalar_stmt_info->stmt);
5707 phis.create (3);
5708 /* Find the loop-closed-use at the loop exit of the original scalar
5709 result. (The reduction result is expected to have two immediate uses -
5710 one at the latch block, and one at the loop exit). */
5711 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, scalar_dest)
5712 if (!flow_bb_inside_loop_p (loop, gimple_bb (USE_STMT (use_p)))
5713 && !is_gimple_debug (USE_STMT (use_p)))
5714 phis.safe_push (USE_STMT (use_p));
5716 /* While we expect to have found an exit_phi because of loop-closed-ssa
5717 form we can end up without one if the scalar cycle is dead. */
5719 FOR_EACH_VEC_ELT (phis, i, exit_phi)
5721 if (outer_loop)
5723 stmt_vec_info exit_phi_vinfo
5724 = loop_vinfo->lookup_stmt (exit_phi);
5725 gphi *vect_phi;
5727 if (double_reduc)
5728 STMT_VINFO_VEC_STMT (exit_phi_vinfo) = inner_phi;
5729 else
5730 STMT_VINFO_VEC_STMT (exit_phi_vinfo) = epilog_stmt_info;
5731 if (!double_reduc
5732 || STMT_VINFO_DEF_TYPE (exit_phi_vinfo)
5733 != vect_double_reduction_def)
5734 continue;
5736 /* Handle double reduction:
5738 stmt1: s1 = phi <s0, s2> - double reduction phi (outer loop)
5739 stmt2: s3 = phi <s1, s4> - (regular) reduc phi (inner loop)
5740 stmt3: s4 = use (s3) - (regular) reduc stmt (inner loop)
5741 stmt4: s2 = phi <s4> - double reduction stmt (outer loop)
5743 At that point the regular reduction (stmt2 and stmt3) is
5744 already vectorized, as well as the exit phi node, stmt4.
5745 Here we vectorize the phi node of double reduction, stmt1, and
5746 update all relevant statements. */
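/* A typical source-level shape of such a double reduction, shown only
   for illustration, is an outer-loop accumulation whose inner loop also
   accumulates into the same variable:

     int s = 0;
     for (i = 0; i < n; i++)      // outer loop, the one being vectorized
       for (j = 0; j < m; j++)    // inner loop
         s += a[i][j];

   s1/s2 above correspond to the outer-loop phi and its latch value,
   while s3/s4 belong to the inner-loop reduction cycle.  */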
5748 /* Go through all the uses of s2 to find double reduction phi
5749 node, i.e., stmt1 above. */
5750 orig_name = PHI_RESULT (exit_phi);
5751 FOR_EACH_IMM_USE_STMT (use_stmt, imm_iter, orig_name)
5753 stmt_vec_info use_stmt_vinfo;
5754 tree vect_phi_init, preheader_arg, vect_phi_res;
5755 basic_block bb = gimple_bb (use_stmt);
5757 /* Check that USE_STMT is really double reduction phi
5758 node. */
5759 if (gimple_code (use_stmt) != GIMPLE_PHI
5760 || gimple_phi_num_args (use_stmt) != 2
5761 || bb->loop_father != outer_loop)
5762 continue;
5763 use_stmt_vinfo = loop_vinfo->lookup_stmt (use_stmt);
5764 if (!use_stmt_vinfo
5765 || STMT_VINFO_DEF_TYPE (use_stmt_vinfo)
5766 != vect_double_reduction_def)
5767 continue;
5769 /* Create vector phi node for double reduction:
5770 vs1 = phi <vs0, vs2>
5771 vs1 was created previously in this function by a call to
5772 vect_get_vec_def_for_operand and is stored in
5773 vec_initial_def;
5774 vs2 is defined by INNER_PHI, the vectorized EXIT_PHI;
5775 vs0 is created here. */
5777 /* Create vector phi node. */
5778 vect_phi = create_phi_node (vec_initial_def, bb);
5779 loop_vec_info_for_loop (outer_loop)->add_stmt (vect_phi);
5781 /* Create vs0 - initial def of the double reduction phi. */
5782 preheader_arg = PHI_ARG_DEF_FROM_EDGE (use_stmt,
5783 loop_preheader_edge (outer_loop));
5784 vect_phi_init = get_initial_def_for_reduction
5785 (stmt_info, preheader_arg, NULL);
5787 /* Update phi node arguments with vs0 and vs2. */
5788 add_phi_arg (vect_phi, vect_phi_init,
5789 loop_preheader_edge (outer_loop),
5790 UNKNOWN_LOCATION);
5791 add_phi_arg (vect_phi, PHI_RESULT (inner_phi->stmt),
5792 loop_latch_edge (outer_loop), UNKNOWN_LOCATION);
5793 if (dump_enabled_p ())
5794 dump_printf_loc (MSG_NOTE, vect_location,
5795 "created double reduction phi node: %G",
5796 vect_phi);
5798 vect_phi_res = PHI_RESULT (vect_phi);
5800 /* Replace the use, i.e., set the correct vs1 in the regular
5801 reduction phi node. FORNOW, NCOPIES is always 1, so the
5802 loop is redundant. */
5803 stmt_vec_info use_info = reduction_phi_info;
5804 for (j = 0; j < ncopies; j++)
5806 edge pr_edge = loop_preheader_edge (loop);
5807 SET_PHI_ARG_DEF (as_a <gphi *> (use_info->stmt),
5808 pr_edge->dest_idx, vect_phi_res);
5809 use_info = STMT_VINFO_RELATED_STMT (use_info);
5815 phis.release ();
5816 if (nested_in_vect_loop)
5818 if (double_reduc)
5819 loop = outer_loop;
5820 else
5821 continue;
5824 phis.create (3);
5825 /* Find the loop-closed-use at the loop exit of the original scalar
5826 result. (The reduction result is expected to have two immediate uses,
5827 one at the latch block, and one at the loop exit). For double
5828 reductions we are looking for exit phis of the outer loop. */
5829 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, scalar_dest)
5831 if (!flow_bb_inside_loop_p (loop, gimple_bb (USE_STMT (use_p))))
5833 if (!is_gimple_debug (USE_STMT (use_p)))
5834 phis.safe_push (USE_STMT (use_p));
5836 else
5838 if (double_reduc && gimple_code (USE_STMT (use_p)) == GIMPLE_PHI)
5840 tree phi_res = PHI_RESULT (USE_STMT (use_p));
5842 FOR_EACH_IMM_USE_FAST (phi_use_p, phi_imm_iter, phi_res)
5844 if (!flow_bb_inside_loop_p (loop,
5845 gimple_bb (USE_STMT (phi_use_p)))
5846 && !is_gimple_debug (USE_STMT (phi_use_p)))
5847 phis.safe_push (USE_STMT (phi_use_p));
5853 FOR_EACH_VEC_ELT (phis, i, exit_phi)
5855 /* Replace the uses: */
5856 orig_name = PHI_RESULT (exit_phi);
5857 scalar_result = scalar_results[k];
5858 FOR_EACH_IMM_USE_STMT (use_stmt, imm_iter, orig_name)
5859 FOR_EACH_IMM_USE_ON_STMT (use_p, imm_iter)
5860 SET_USE (use_p, scalar_result);
5863 phis.release ();
5867 /* Return a vector of type VECTYPE that is equal to the vector select
5868 operation "MASK ? VEC : IDENTITY". Insert the select statements
5869 before GSI. */
5871 static tree
5872 merge_with_identity (gimple_stmt_iterator *gsi, tree mask, tree vectype,
5873 tree vec, tree identity)
5875 tree cond = make_temp_ssa_name (vectype, NULL, "cond");
5876 gimple *new_stmt = gimple_build_assign (cond, VEC_COND_EXPR,
5877 mask, vec, identity);
5878 gsi_insert_before (gsi, new_stmt, GSI_SAME_STMT);
5879 return cond;
5882 /* Successively apply CODE to each element of VECTOR_RHS, in left-to-right
5883 order, starting with LHS. Insert the extraction statements before GSI and
5884 associate the new scalar SSA names with variable SCALAR_DEST.
5885 Return the SSA name for the result. */
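/* For example (purely as a sketch): with a four-element VECTOR_RHS v
   and CODE == PLUS_EXPR the expansion below is equivalent to

     s0 = lhs + v[0];
     s1 = s0 + v[1];
     s2 = s1 + v[2];
     s3 = s2 + v[3];

   returning s3, i.e. it keeps the strict left-to-right association that
   an in-order (fold-left) reduction requires.  */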
5887 static tree
5888 vect_expand_fold_left (gimple_stmt_iterator *gsi, tree scalar_dest,
5889 tree_code code, tree lhs, tree vector_rhs)
5891 tree vectype = TREE_TYPE (vector_rhs);
5892 tree scalar_type = TREE_TYPE (vectype);
5893 tree bitsize = TYPE_SIZE (scalar_type);
5894 unsigned HOST_WIDE_INT vec_size_in_bits = tree_to_uhwi (TYPE_SIZE (vectype));
5895 unsigned HOST_WIDE_INT element_bitsize = tree_to_uhwi (bitsize);
5897 for (unsigned HOST_WIDE_INT bit_offset = 0;
5898 bit_offset < vec_size_in_bits;
5899 bit_offset += element_bitsize)
5901 tree bitpos = bitsize_int (bit_offset);
5902 tree rhs = build3 (BIT_FIELD_REF, scalar_type, vector_rhs,
5903 bitsize, bitpos);
5905 gassign *stmt = gimple_build_assign (scalar_dest, rhs);
5906 rhs = make_ssa_name (scalar_dest, stmt);
5907 gimple_assign_set_lhs (stmt, rhs);
5908 gsi_insert_before (gsi, stmt, GSI_SAME_STMT);
5910 stmt = gimple_build_assign (scalar_dest, code, lhs, rhs);
5911 tree new_name = make_ssa_name (scalar_dest, stmt);
5912 gimple_assign_set_lhs (stmt, new_name);
5913 gsi_insert_before (gsi, stmt, GSI_SAME_STMT);
5914 lhs = new_name;
5916 return lhs;
5919 /* Get a masked internal function equivalent to REDUC_FN. VECTYPE_IN is the
5920 type of the vector input. */
5922 static internal_fn
5923 get_masked_reduction_fn (internal_fn reduc_fn, tree vectype_in)
5925 internal_fn mask_reduc_fn;
5927 switch (reduc_fn)
5929 case IFN_FOLD_LEFT_PLUS:
5930 mask_reduc_fn = IFN_MASK_FOLD_LEFT_PLUS;
5931 break;
5933 default:
5934 return IFN_LAST;
5937 if (direct_internal_fn_supported_p (mask_reduc_fn, vectype_in,
5938 OPTIMIZE_FOR_SPEED))
5939 return mask_reduc_fn;
5940 return IFN_LAST;
5943 /* Perform an in-order reduction (FOLD_LEFT_REDUCTION). STMT_INFO is the
5944 statement that sets the live-out value. REDUC_DEF_STMT is the phi
5945 statement. CODE is the operation performed by STMT_INFO and OPS are
5946 its scalar operands. REDUC_INDEX is the index of the operand in
5947 OPS that is set by REDUC_DEF_STMT. REDUC_FN is the function that
5948 implements in-order reduction, or IFN_LAST if we should open-code it.
5949 VECTYPE_IN is the type of the vector input. MASKS specifies the masks
5950 that should be used to control the operation in a fully-masked loop. */
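/* As a rough picture of what this function handles: a float accumulation
   such as

     for (i = 0; i < n; i++)
       res += a[i];

   compiled without reassociation permission must preserve the original
   left-to-right evaluation order, so each vector of loaded elements is
   folded into the scalar accumulator in order, either via REDUC_FN (or
   its masked variant) or with the open-coded expansion in
   vect_expand_fold_left above.  */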
5952 static bool
5953 vectorize_fold_left_reduction (stmt_vec_info stmt_info,
5954 gimple_stmt_iterator *gsi,
5955 stmt_vec_info *vec_stmt, slp_tree slp_node,
5956 gimple *reduc_def_stmt,
5957 tree_code code, internal_fn reduc_fn,
5958 tree ops[3], tree vectype_in,
5959 int reduc_index, vec_loop_masks *masks)
5961 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
5962 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
5963 tree vectype_out = STMT_VINFO_VECTYPE (stmt_info);
5964 stmt_vec_info new_stmt_info = NULL;
5965 internal_fn mask_reduc_fn = get_masked_reduction_fn (reduc_fn, vectype_in);
5967 int ncopies;
5968 if (slp_node)
5969 ncopies = 1;
5970 else
5971 ncopies = vect_get_num_copies (loop_vinfo, vectype_in);
5973 gcc_assert (!nested_in_vect_loop_p (loop, stmt_info));
5974 gcc_assert (ncopies == 1);
5975 gcc_assert (TREE_CODE_LENGTH (code) == binary_op);
5976 gcc_assert (reduc_index == (code == MINUS_EXPR ? 0 : 1));
5977 gcc_assert (STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info)
5978 == FOLD_LEFT_REDUCTION);
5980 if (slp_node)
5981 gcc_assert (known_eq (TYPE_VECTOR_SUBPARTS (vectype_out),
5982 TYPE_VECTOR_SUBPARTS (vectype_in)));
5984 tree op0 = ops[1 - reduc_index];
5986 int group_size = 1;
5987 stmt_vec_info scalar_dest_def_info;
5988 auto_vec<tree> vec_oprnds0;
5989 if (slp_node)
5991 auto_vec<vec<tree> > vec_defs (2);
5992 auto_vec<tree> sops(2);
5993 sops.quick_push (ops[0]);
5994 sops.quick_push (ops[1]);
5995 vect_get_slp_defs (sops, slp_node, &vec_defs);
5996 vec_oprnds0.safe_splice (vec_defs[1 - reduc_index]);
5997 vec_defs[0].release ();
5998 vec_defs[1].release ();
5999 group_size = SLP_TREE_SCALAR_STMTS (slp_node).length ();
6000 scalar_dest_def_info = SLP_TREE_SCALAR_STMTS (slp_node)[group_size - 1];
6002 else
6004 tree loop_vec_def0 = vect_get_vec_def_for_operand (op0, stmt_info);
6005 vec_oprnds0.create (1);
6006 vec_oprnds0.quick_push (loop_vec_def0);
6007 scalar_dest_def_info = stmt_info;
6010 tree scalar_dest = gimple_assign_lhs (scalar_dest_def_info->stmt);
6011 tree scalar_type = TREE_TYPE (scalar_dest);
6012 tree reduc_var = gimple_phi_result (reduc_def_stmt);
6014 int vec_num = vec_oprnds0.length ();
6015 gcc_assert (vec_num == 1 || slp_node);
6016 tree vec_elem_type = TREE_TYPE (vectype_out);
6017 gcc_checking_assert (useless_type_conversion_p (scalar_type, vec_elem_type));
6019 tree vector_identity = NULL_TREE;
6020 if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
6021 vector_identity = build_zero_cst (vectype_out);
6023 tree scalar_dest_var = vect_create_destination_var (scalar_dest, NULL);
6024 int i;
6025 tree def0;
6026 FOR_EACH_VEC_ELT (vec_oprnds0, i, def0)
6028 gimple *new_stmt;
6029 tree mask = NULL_TREE;
6030 if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
6031 mask = vect_get_loop_mask (gsi, masks, vec_num, vectype_in, i);
6033 /* Handle MINUS by adding the negative. */
6034 if (reduc_fn != IFN_LAST && code == MINUS_EXPR)
6036 tree negated = make_ssa_name (vectype_out);
6037 new_stmt = gimple_build_assign (negated, NEGATE_EXPR, def0);
6038 gsi_insert_before (gsi, new_stmt, GSI_SAME_STMT);
6039 def0 = negated;
6042 if (mask && mask_reduc_fn == IFN_LAST)
6043 def0 = merge_with_identity (gsi, mask, vectype_out, def0,
6044 vector_identity);
6046 /* On the first iteration the input is simply the scalar phi
6047 result, and for subsequent iterations it is the output of
6048 the preceding operation. */
6049 if (reduc_fn != IFN_LAST || (mask && mask_reduc_fn != IFN_LAST))
6051 if (mask && mask_reduc_fn != IFN_LAST)
6052 new_stmt = gimple_build_call_internal (mask_reduc_fn, 3, reduc_var,
6053 def0, mask);
6054 else
6055 new_stmt = gimple_build_call_internal (reduc_fn, 2, reduc_var,
6056 def0);
6057 /* For chained SLP reductions the output of the previous reduction
6058 operation serves as the input of the next. For the final statement
6059 the output cannot be a temporary - we reuse the original
6060 scalar destination of the last statement. */
6061 if (i != vec_num - 1)
6063 gimple_set_lhs (new_stmt, scalar_dest_var);
6064 reduc_var = make_ssa_name (scalar_dest_var, new_stmt);
6065 gimple_set_lhs (new_stmt, reduc_var);
6068 else
6070 reduc_var = vect_expand_fold_left (gsi, scalar_dest_var, code,
6071 reduc_var, def0);
6072 new_stmt = SSA_NAME_DEF_STMT (reduc_var);
6073 /* Remove the statement, so that we can use the same code paths
6074 as for statements that we've just created. */
6075 gimple_stmt_iterator tmp_gsi = gsi_for_stmt (new_stmt);
6076 gsi_remove (&tmp_gsi, true);
6079 if (i == vec_num - 1)
6081 gimple_set_lhs (new_stmt, scalar_dest);
6082 new_stmt_info = vect_finish_replace_stmt (scalar_dest_def_info,
6083 new_stmt);
6085 else
6086 new_stmt_info = vect_finish_stmt_generation (scalar_dest_def_info,
6087 new_stmt, gsi);
6089 if (slp_node)
6090 SLP_TREE_VEC_STMTS (slp_node).quick_push (new_stmt_info);
6093 if (!slp_node)
6094 STMT_VINFO_VEC_STMT (stmt_info) = *vec_stmt = new_stmt_info;
6096 return true;
6099 /* Function is_nonwrapping_integer_induction.
6101 Check if STMT_VINFO (which is part of loop LOOP) both increments and
6102 does not cause overflow. */
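/* Put differently: with BASE and STEP the evolution of the induction and
   NI an upper bound on the number of iterations, the code below requires

     BASE + STEP * NI

   computed in infinite precision to fit in the precision of the result
   type, unless overflow is undefined for that type anyway.  */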
6104 static bool
6105 is_nonwrapping_integer_induction (stmt_vec_info stmt_vinfo, struct loop *loop)
6107 gphi *phi = as_a <gphi *> (stmt_vinfo->stmt);
6108 tree base = STMT_VINFO_LOOP_PHI_EVOLUTION_BASE_UNCHANGED (stmt_vinfo);
6109 tree step = STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_vinfo);
6110 tree lhs_type = TREE_TYPE (gimple_phi_result (phi));
6111 widest_int ni, max_loop_value, lhs_max;
6112 wi::overflow_type overflow = wi::OVF_NONE;
6114 /* Make sure the loop is integer based. */
6115 if (TREE_CODE (base) != INTEGER_CST
6116 || TREE_CODE (step) != INTEGER_CST)
6117 return false;
6119 /* Check that the max size of the loop will not wrap. */
6121 if (TYPE_OVERFLOW_UNDEFINED (lhs_type))
6122 return true;
6124 if (! max_stmt_executions (loop, &ni))
6125 return false;
6127 max_loop_value = wi::mul (wi::to_widest (step), ni, TYPE_SIGN (lhs_type),
6128 &overflow);
6129 if (overflow)
6130 return false;
6132 max_loop_value = wi::add (wi::to_widest (base), max_loop_value,
6133 TYPE_SIGN (lhs_type), &overflow);
6134 if (overflow)
6135 return false;
6137 return (wi::min_precision (max_loop_value, TYPE_SIGN (lhs_type))
6138 <= TYPE_PRECISION (lhs_type));
6141 /* Check if masking can be supported by inserting a conditional expression.
6142 CODE is the code for the operation. COND_FN is the conditional internal
6143 function, if it exists. VECTYPE_IN is the type of the vector input. */
6144 static bool
6145 use_mask_by_cond_expr_p (enum tree_code code, internal_fn cond_fn,
6146 tree vectype_in)
6148 if (cond_fn != IFN_LAST
6149 && direct_internal_fn_supported_p (cond_fn, vectype_in,
6150 OPTIMIZE_FOR_SPEED))
6151 return false;
6153 switch (code)
6155 case DOT_PROD_EXPR:
6156 case SAD_EXPR:
6157 return true;
6159 default:
6160 return false;
6164 /* Insert a conditional expression to enable masked vectorization. CODE is the
6165 code for the operation. VOP is the array of operands. MASK is the loop
6166 mask. GSI is a statement iterator used to place the new conditional
6167 expression. */
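/* The replacement values are chosen so that masked-off lanes leave the
   accumulator unchanged; roughly:

     DOT_PROD_EXPR:  masked_op1 = mask ? op1 : 0;    // op0 * 0 adds 0
     SAD_EXPR:       masked_op1 = mask ? op1 : op0;  // |op0 - op0| adds 0

   This is only a summary of the two cases handled below.  */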
6168 static void
6169 build_vect_cond_expr (enum tree_code code, tree vop[3], tree mask,
6170 gimple_stmt_iterator *gsi)
6172 switch (code)
6174 case DOT_PROD_EXPR:
6176 tree vectype = TREE_TYPE (vop[1]);
6177 tree zero = build_zero_cst (vectype);
6178 tree masked_op1 = make_temp_ssa_name (vectype, NULL, "masked_op1");
6179 gassign *select = gimple_build_assign (masked_op1, VEC_COND_EXPR,
6180 mask, vop[1], zero);
6181 gsi_insert_before (gsi, select, GSI_SAME_STMT);
6182 vop[1] = masked_op1;
6183 break;
6186 case SAD_EXPR:
6188 tree vectype = TREE_TYPE (vop[1]);
6189 tree masked_op1 = make_temp_ssa_name (vectype, NULL, "masked_op1");
6190 gassign *select = gimple_build_assign (masked_op1, VEC_COND_EXPR,
6191 mask, vop[1], vop[0]);
6192 gsi_insert_before (gsi, select, GSI_SAME_STMT);
6193 vop[1] = masked_op1;
6194 break;
6197 default:
6198 gcc_unreachable ();
6202 /* Function vectorizable_reduction.
6204 Check if STMT_INFO performs a reduction operation that can be vectorized.
6205 If VEC_STMT is also passed, vectorize STMT_INFO: create a vectorized
6206 stmt to replace it, put it in VEC_STMT, and insert it at GSI.
6207 Return true if STMT_INFO is vectorizable in this way.
6209 This function also handles reduction idioms (patterns) that have been
6210 recognized in advance during vect_pattern_recog. In this case, STMT_INFO
6211 may be of this form:
6212 X = pattern_expr (arg0, arg1, ..., X)
6213 and its STMT_VINFO_RELATED_STMT points to the last stmt in the original
6214 sequence that had been detected and replaced by the pattern-stmt
6215 (STMT_INFO).
6217 This function also handles reduction of condition expressions, for example:
6218 for (int i = 0; i < N; i++)
6219 if (a[i] < value)
6220 last = a[i];
6221 This is handled by vectorising the loop and creating an additional vector
6222 containing the loop indexes for which "a[i] < value" was true. In the
6223 function epilogue this is reduced to a single max value and then used to
6224 index into the vector of results.
6226 In some cases of reduction patterns, the type of the reduction variable X is
6227 different than the type of the other arguments of STMT_INFO.
6228 In such cases, the vectype that is used when transforming STMT_INFO into
6229 a vector stmt is different than the vectype that is used to determine the
6230 vectorization factor, because it consists of a different number of elements
6231 than the actual number of elements that are being operated upon in parallel.
6233 For example, consider an accumulation of shorts into an int accumulator.
6234 On some targets it's possible to vectorize this pattern operating on 8
6235 shorts at a time (hence, the vectype for purposes of determining the
6236 vectorization factor should be V8HI); on the other hand, the vectype that
6237 is used to create the vector form is actually V4SI (the type of the result).
6239 Upon entry to this function, STMT_VINFO_VECTYPE records the vectype that
6240 indicates what is the actual level of parallelism (V8HI in the example), so
6241 that the right vectorization factor would be derived. This vectype
6242 corresponds to the type of arguments to the reduction stmt, and should *NOT*
6243 be used to create the vectorized stmt. The right vectype for the vectorized
6244 stmt is obtained from the type of the result X:
6245 get_vectype_for_scalar_type (TREE_TYPE (X))
6247 This means that, contrary to "regular" reductions (or "regular" stmts in
6248 general), the following equation:
6249 STMT_VINFO_VECTYPE == get_vectype_for_scalar_type (TREE_TYPE (X))
6250 does *NOT* necessarily hold for reduction patterns. */
6252 bool
6253 vectorizable_reduction (stmt_vec_info stmt_info, gimple_stmt_iterator *gsi,
6254 stmt_vec_info *vec_stmt, slp_tree slp_node,
6255 slp_instance slp_node_instance,
6256 stmt_vector_for_cost *cost_vec)
6258 tree vec_dest;
6259 tree scalar_dest;
6260 tree vectype_out = STMT_VINFO_VECTYPE (stmt_info);
6261 tree vectype_in = NULL_TREE;
6262 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
6263 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
6264 enum tree_code code, orig_code;
6265 internal_fn reduc_fn;
6266 machine_mode vec_mode;
6267 int op_type;
6268 optab optab;
6269 tree new_temp = NULL_TREE;
6270 enum vect_def_type dt, cond_reduc_dt = vect_unknown_def_type;
6271 stmt_vec_info cond_stmt_vinfo = NULL;
6272 enum tree_code cond_reduc_op_code = ERROR_MARK;
6273 tree scalar_type;
6274 bool is_simple_use;
6275 int i;
6276 int ncopies;
6277 int epilog_copies;
6278 stmt_vec_info prev_stmt_info, prev_phi_info;
6279 bool single_defuse_cycle = false;
6280 stmt_vec_info new_stmt_info = NULL;
6281 int j;
6282 tree ops[3];
6283 enum vect_def_type dts[3];
6284 bool nested_cycle = false, found_nested_cycle_def = false;
6285 bool double_reduc = false;
6286 basic_block def_bb;
6287 struct loop * def_stmt_loop;
6288 tree def_arg;
6289 auto_vec<tree> vec_oprnds0;
6290 auto_vec<tree> vec_oprnds1;
6291 auto_vec<tree> vec_oprnds2;
6292 auto_vec<tree> vect_defs;
6293 auto_vec<stmt_vec_info> phis;
6294 int vec_num;
6295 tree def0, tem;
6296 tree cr_index_scalar_type = NULL_TREE, cr_index_vector_type = NULL_TREE;
6297 tree cond_reduc_val = NULL_TREE;
6299 /* Make sure it was already recognized as a reduction computation. */
6300 if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_reduction_def
6301 && STMT_VINFO_DEF_TYPE (stmt_info) != vect_nested_cycle)
6302 return false;
6304 if (nested_in_vect_loop_p (loop, stmt_info))
6306 loop = loop->inner;
6307 nested_cycle = true;
6310 if (REDUC_GROUP_FIRST_ELEMENT (stmt_info))
6311 gcc_assert (slp_node
6312 && REDUC_GROUP_FIRST_ELEMENT (stmt_info) == stmt_info);
6314 if (gphi *phi = dyn_cast <gphi *> (stmt_info->stmt))
6316 tree phi_result = gimple_phi_result (phi);
6317 /* Analysis is fully done on the reduction stmt invocation. */
6318 if (! vec_stmt)
6320 if (slp_node)
6321 slp_node_instance->reduc_phis = slp_node;
6323 STMT_VINFO_TYPE (stmt_info) = reduc_vec_info_type;
6324 return true;
6327 if (STMT_VINFO_REDUC_TYPE (stmt_info) == FOLD_LEFT_REDUCTION)
6328 /* Leave the scalar phi in place. Note that checking
6329 STMT_VINFO_VEC_REDUCTION_TYPE (as below) only works
6330 for reductions involving a single statement. */
6331 return true;
6333 stmt_vec_info reduc_stmt_info = STMT_VINFO_REDUC_DEF (stmt_info);
6334 reduc_stmt_info = vect_stmt_to_vectorize (reduc_stmt_info);
6336 if (STMT_VINFO_VEC_REDUCTION_TYPE (reduc_stmt_info)
6337 == EXTRACT_LAST_REDUCTION)
6338 /* Leave the scalar phi in place. */
6339 return true;
6341 gassign *reduc_stmt = as_a <gassign *> (reduc_stmt_info->stmt);
6342 code = gimple_assign_rhs_code (reduc_stmt);
6343 for (unsigned k = 1; k < gimple_num_ops (reduc_stmt); ++k)
6345 tree op = gimple_op (reduc_stmt, k);
6346 if (op == phi_result)
6347 continue;
6348 if (k == 1 && code == COND_EXPR)
6349 continue;
6350 bool is_simple_use = vect_is_simple_use (op, loop_vinfo, &dt);
6351 gcc_assert (is_simple_use);
6352 if (dt == vect_constant_def || dt == vect_external_def)
6353 continue;
6354 if (!vectype_in
6355 || (GET_MODE_SIZE (SCALAR_TYPE_MODE (TREE_TYPE (vectype_in)))
6356 < GET_MODE_SIZE (SCALAR_TYPE_MODE (TREE_TYPE (op)))))
6357 vectype_in = get_vectype_for_scalar_type (TREE_TYPE (op));
6358 break;
6360 /* For a nested cycle we might end up with an operation like
6361 phi_result * phi_result. */
6362 if (!vectype_in)
6363 vectype_in = STMT_VINFO_VECTYPE (stmt_info);
6364 gcc_assert (vectype_in);
6366 if (slp_node)
6367 ncopies = 1;
6368 else
6369 ncopies = vect_get_num_copies (loop_vinfo, vectype_in);
6371 stmt_vec_info use_stmt_info;
6372 if (ncopies > 1
6373 && STMT_VINFO_RELEVANT (reduc_stmt_info) <= vect_used_only_live
6374 && (use_stmt_info = loop_vinfo->lookup_single_use (phi_result))
6375 && vect_stmt_to_vectorize (use_stmt_info) == reduc_stmt_info)
6376 single_defuse_cycle = true;
6378 /* Create the destination vector */
6379 scalar_dest = gimple_assign_lhs (reduc_stmt);
6380 vec_dest = vect_create_destination_var (scalar_dest, vectype_out);
6382 if (slp_node)
6383 /* The size vect_schedule_slp_instance computes is off for us. */
6384 vec_num = vect_get_num_vectors
6385 (LOOP_VINFO_VECT_FACTOR (loop_vinfo)
6386 * SLP_TREE_SCALAR_STMTS (slp_node).length (),
6387 vectype_in);
6388 else
6389 vec_num = 1;
6391 /* Generate the reduction PHIs upfront. */
6392 prev_phi_info = NULL;
6393 for (j = 0; j < ncopies; j++)
6395 if (j == 0 || !single_defuse_cycle)
6397 for (i = 0; i < vec_num; i++)
6399 /* Create the reduction-phi that defines the reduction
6400 operand. */
6401 gimple *new_phi = create_phi_node (vec_dest, loop->header);
6402 stmt_vec_info new_phi_info = loop_vinfo->add_stmt (new_phi);
6404 if (slp_node)
6405 SLP_TREE_VEC_STMTS (slp_node).quick_push (new_phi_info);
6406 else
6408 if (j == 0)
6409 STMT_VINFO_VEC_STMT (stmt_info)
6410 = *vec_stmt = new_phi_info;
6411 else
6412 STMT_VINFO_RELATED_STMT (prev_phi_info) = new_phi_info;
6413 prev_phi_info = new_phi_info;
6419 return true;
6422 /* 1. Is vectorizable reduction? */
6423 /* Not supportable if the reduction variable is used in the loop, unless
6424 it's a reduction chain. */
6425 if (STMT_VINFO_RELEVANT (stmt_info) > vect_used_in_outer
6426 && !REDUC_GROUP_FIRST_ELEMENT (stmt_info))
6427 return false;
6429 /* Reductions that are not used even in an enclosing outer-loop
6430 are expected to be "live" (used out of the loop). */
6431 if (STMT_VINFO_RELEVANT (stmt_info) == vect_unused_in_scope
6432 && !STMT_VINFO_LIVE_P (stmt_info))
6433 return false;
6435 /* 2. Has this been recognized as a reduction pattern?
6437 Check if STMT represents a pattern that has been recognized
6438 in earlier analysis stages. For stmts that represent a pattern,
6439 the STMT_VINFO_RELATED_STMT field records the last stmt in
6440 the original sequence that constitutes the pattern. */
6442 stmt_vec_info orig_stmt_info = STMT_VINFO_RELATED_STMT (stmt_info);
6443 if (orig_stmt_info)
6445 gcc_assert (STMT_VINFO_IN_PATTERN_P (orig_stmt_info));
6446 gcc_assert (!STMT_VINFO_IN_PATTERN_P (stmt_info));
6449 /* 3. Check the operands of the operation. The first operands are defined
6450 inside the loop body. The last operand is the reduction variable,
6451 which is defined by the loop-header-phi. */
6453 gassign *stmt = as_a <gassign *> (stmt_info->stmt);
6455 /* Flatten RHS. */
6456 switch (get_gimple_rhs_class (gimple_assign_rhs_code (stmt)))
6458 case GIMPLE_BINARY_RHS:
6459 code = gimple_assign_rhs_code (stmt);
6460 op_type = TREE_CODE_LENGTH (code);
6461 gcc_assert (op_type == binary_op);
6462 ops[0] = gimple_assign_rhs1 (stmt);
6463 ops[1] = gimple_assign_rhs2 (stmt);
6464 break;
6466 case GIMPLE_TERNARY_RHS:
6467 code = gimple_assign_rhs_code (stmt);
6468 op_type = TREE_CODE_LENGTH (code);
6469 gcc_assert (op_type == ternary_op);
6470 ops[0] = gimple_assign_rhs1 (stmt);
6471 ops[1] = gimple_assign_rhs2 (stmt);
6472 ops[2] = gimple_assign_rhs3 (stmt);
6473 break;
6475 case GIMPLE_UNARY_RHS:
6476 return false;
6478 default:
6479 gcc_unreachable ();
6482 if (code == COND_EXPR && slp_node)
6483 return false;
6485 scalar_dest = gimple_assign_lhs (stmt);
6486 scalar_type = TREE_TYPE (scalar_dest);
6487 if (!POINTER_TYPE_P (scalar_type) && !INTEGRAL_TYPE_P (scalar_type)
6488 && !SCALAR_FLOAT_TYPE_P (scalar_type))
6489 return false;
6491 /* Do not try to vectorize bit-precision reductions. */
6492 if (!type_has_mode_precision_p (scalar_type))
6493 return false;
6495 /* All uses but the last are expected to be defined in the loop.
6496 The last use is the reduction variable. In case of nested cycle this
6497 assumption is not true: we use reduc_index to record the index of the
6498 reduction variable. */
6499 stmt_vec_info reduc_def_info;
6500 if (orig_stmt_info)
6501 reduc_def_info = STMT_VINFO_REDUC_DEF (orig_stmt_info);
6502 else
6503 reduc_def_info = STMT_VINFO_REDUC_DEF (stmt_info);
6504 gcc_assert (reduc_def_info);
6505 gphi *reduc_def_phi = as_a <gphi *> (reduc_def_info->stmt);
6506 tree reduc_def = PHI_RESULT (reduc_def_phi);
6507 int reduc_index = -1;
6508 for (i = 0; i < op_type; i++)
6510 /* The condition of COND_EXPR is checked in vectorizable_condition(). */
6511 if (i == 0 && code == COND_EXPR)
6512 continue;
6514 stmt_vec_info def_stmt_info;
6515 is_simple_use = vect_is_simple_use (ops[i], loop_vinfo, &dts[i], &tem,
6516 &def_stmt_info);
6517 dt = dts[i];
6518 gcc_assert (is_simple_use);
6519 if (dt == vect_reduction_def
6520 && ops[i] == reduc_def)
6522 reduc_index = i;
6523 continue;
6525 else if (tem)
6527 /* To properly compute ncopies we are interested in the widest
6528 input type in case we're looking at a widening accumulation. */
6529 if (!vectype_in
6530 || (GET_MODE_SIZE (SCALAR_TYPE_MODE (TREE_TYPE (vectype_in)))
6531 < GET_MODE_SIZE (SCALAR_TYPE_MODE (TREE_TYPE (tem)))))
6532 vectype_in = tem;
6535 if (dt != vect_internal_def
6536 && dt != vect_external_def
6537 && dt != vect_constant_def
6538 && dt != vect_induction_def
6539 && !(dt == vect_nested_cycle && nested_cycle))
6540 return false;
6542 if (dt == vect_nested_cycle
6543 && ops[i] == reduc_def)
6545 found_nested_cycle_def = true;
6546 reduc_index = i;
6549 if (i == 1 && code == COND_EXPR)
6551 /* Record how value of COND_EXPR is defined. */
6552 if (dt == vect_constant_def)
6554 cond_reduc_dt = dt;
6555 cond_reduc_val = ops[i];
6557 if (dt == vect_induction_def
6558 && def_stmt_info
6559 && is_nonwrapping_integer_induction (def_stmt_info, loop))
6561 cond_reduc_dt = dt;
6562 cond_stmt_vinfo = def_stmt_info;
6567 if (!vectype_in)
6568 vectype_in = vectype_out;
6570 /* When vectorizing a reduction chain w/o SLP the reduction PHI is not
6571 directly used in stmt. */
6572 if (reduc_index == -1)
6574 if (STMT_VINFO_REDUC_TYPE (stmt_info) == FOLD_LEFT_REDUCTION)
6576 if (dump_enabled_p ())
6577 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6578 "in-order reduction chain without SLP.\n");
6579 return false;
6583 if (!(reduc_index == -1
6584 || dts[reduc_index] == vect_reduction_def
6585 || dts[reduc_index] == vect_nested_cycle
6586 || ((dts[reduc_index] == vect_internal_def
6587 || dts[reduc_index] == vect_external_def
6588 || dts[reduc_index] == vect_constant_def
6589 || dts[reduc_index] == vect_induction_def)
6590 && nested_cycle && found_nested_cycle_def)))
6592 /* For pattern recognized stmts, orig_stmt might be a reduction,
6593 but some helper statements for the pattern might not, or
6594 might be COND_EXPRs with reduction uses in the condition. */
6595 gcc_assert (orig_stmt_info);
6596 return false;
6599 /* PHIs should not participate in patterns. */
6600 gcc_assert (!STMT_VINFO_RELATED_STMT (reduc_def_info));
6601 enum vect_reduction_type v_reduc_type
6602 = STMT_VINFO_REDUC_TYPE (reduc_def_info);
6603 stmt_vec_info tmp = STMT_VINFO_REDUC_DEF (reduc_def_info);
6605 STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info) = v_reduc_type;
6606 /* If we have a condition reduction, see if we can simplify it further. */
6607 if (v_reduc_type == COND_REDUCTION)
6609 /* TODO: We can't yet handle reduction chains, since we need to treat
6610 each COND_EXPR in the chain specially, not just the last one.
6611 E.g. for:
6613 x_1 = PHI <x_3, ...>
6614 x_2 = a_2 ? ... : x_1;
6615 x_3 = a_3 ? ... : x_2;
6617 we're interested in the last element in x_3 for which a_2 || a_3
6618 is true, whereas the current reduction chain handling would
6619 vectorize x_2 as a normal VEC_COND_EXPR and only treat x_3
6620 as a reduction operation. */
6621 if (reduc_index == -1)
6623 if (dump_enabled_p ())
6624 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6625 "conditional reduction chains not supported\n");
6626 return false;
6629 /* vect_is_simple_reduction ensured that operand 2 is the
6630 loop-carried operand. */
6631 gcc_assert (reduc_index == 2);
6633 /* Loop peeling modifies initial value of reduction PHI, which
6634 makes the reduction stmt to be transformed different from the
6635 original stmt analyzed. We need to record reduction code for
6636 CONST_COND_REDUCTION type reduction at analyzing stage, thus
6637 it can be used directly at transform stage. */
6638 if (STMT_VINFO_VEC_CONST_COND_REDUC_CODE (stmt_info) == MAX_EXPR
6639 || STMT_VINFO_VEC_CONST_COND_REDUC_CODE (stmt_info) == MIN_EXPR)
6641 /* Also set the reduction type to CONST_COND_REDUCTION. */
6642 gcc_assert (cond_reduc_dt == vect_constant_def);
6643 STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info) = CONST_COND_REDUCTION;
6645 else if (direct_internal_fn_supported_p (IFN_FOLD_EXTRACT_LAST,
6646 vectype_in, OPTIMIZE_FOR_SPEED))
6648 if (dump_enabled_p ())
6649 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6650 "optimizing condition reduction with"
6651 " FOLD_EXTRACT_LAST.\n");
6652 STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info) = EXTRACT_LAST_REDUCTION;
6654 else if (cond_reduc_dt == vect_induction_def)
6656 tree base
6657 = STMT_VINFO_LOOP_PHI_EVOLUTION_BASE_UNCHANGED (cond_stmt_vinfo);
6658 tree step = STMT_VINFO_LOOP_PHI_EVOLUTION_PART (cond_stmt_vinfo);
6660 gcc_assert (TREE_CODE (base) == INTEGER_CST
6661 && TREE_CODE (step) == INTEGER_CST);
6662 cond_reduc_val = NULL_TREE;
6663 /* Find a suitable value, for MAX_EXPR below base, for MIN_EXPR
6664 above base; punt if base is the minimum value of the type for
6665 MAX_EXPR or maximum value of the type for MIN_EXPR for now. */
6666 if (tree_int_cst_sgn (step) == -1)
6668 cond_reduc_op_code = MIN_EXPR;
6669 if (tree_int_cst_sgn (base) == -1)
6670 cond_reduc_val = build_int_cst (TREE_TYPE (base), 0);
6671 else if (tree_int_cst_lt (base,
6672 TYPE_MAX_VALUE (TREE_TYPE (base))))
6673 cond_reduc_val
6674 = int_const_binop (PLUS_EXPR, base, integer_one_node);
6676 else
6678 cond_reduc_op_code = MAX_EXPR;
6679 if (tree_int_cst_sgn (base) == 1)
6680 cond_reduc_val = build_int_cst (TREE_TYPE (base), 0);
6681 else if (tree_int_cst_lt (TYPE_MIN_VALUE (TREE_TYPE (base)),
6682 base))
6683 cond_reduc_val
6684 = int_const_binop (MINUS_EXPR, base, integer_one_node);
6686 if (cond_reduc_val)
6688 if (dump_enabled_p ())
6689 dump_printf_loc (MSG_NOTE, vect_location,
6690 "condition expression based on "
6691 "integer induction.\n");
6692 STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info)
6693 = INTEGER_INDUC_COND_REDUCTION;
6696 else if (cond_reduc_dt == vect_constant_def)
6698 enum vect_def_type cond_initial_dt;
6699 gimple *def_stmt = SSA_NAME_DEF_STMT (ops[reduc_index]);
6700 tree cond_initial_val
6701 = PHI_ARG_DEF_FROM_EDGE (def_stmt, loop_preheader_edge (loop));
6703 gcc_assert (cond_reduc_val != NULL_TREE);
6704 vect_is_simple_use (cond_initial_val, loop_vinfo, &cond_initial_dt);
6705 if (cond_initial_dt == vect_constant_def
6706 && types_compatible_p (TREE_TYPE (cond_initial_val),
6707 TREE_TYPE (cond_reduc_val)))
6709 tree e = fold_binary (LE_EXPR, boolean_type_node,
6710 cond_initial_val, cond_reduc_val);
6711 if (e && (integer_onep (e) || integer_zerop (e)))
6713 if (dump_enabled_p ())
6714 dump_printf_loc (MSG_NOTE, vect_location,
6715 "condition expression based on "
6716 "compile time constant.\n");
6717 /* Record reduction code at analysis stage. */
6718 STMT_VINFO_VEC_CONST_COND_REDUC_CODE (stmt_info)
6719 = integer_onep (e) ? MAX_EXPR : MIN_EXPR;
6720 STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info)
6721 = CONST_COND_REDUCTION;
6727 if (orig_stmt_info)
6728 gcc_assert (tmp == orig_stmt_info
6729 || REDUC_GROUP_FIRST_ELEMENT (tmp) == orig_stmt_info);
6730 else
6731 /* We changed STMT to be the first stmt in reduction chain, hence we
6732 check that in this case the first element in the chain is STMT. */
6733 gcc_assert (tmp == stmt_info
6734 || REDUC_GROUP_FIRST_ELEMENT (tmp) == stmt_info);
6736 if (STMT_VINFO_LIVE_P (reduc_def_info))
6737 return false;
6739 if (slp_node)
6740 ncopies = 1;
6741 else
6742 ncopies = vect_get_num_copies (loop_vinfo, vectype_in);
6744 gcc_assert (ncopies >= 1);
6746 vec_mode = TYPE_MODE (vectype_in);
6747 poly_uint64 nunits_out = TYPE_VECTOR_SUBPARTS (vectype_out);
6749 if (nested_cycle)
6751 def_bb = gimple_bb (reduc_def_phi);
6752 def_stmt_loop = def_bb->loop_father;
6753 def_arg = PHI_ARG_DEF_FROM_EDGE (reduc_def_phi,
6754 loop_preheader_edge (def_stmt_loop));
6755 stmt_vec_info def_arg_stmt_info = loop_vinfo->lookup_def (def_arg);
6756 if (def_arg_stmt_info
6757 && (STMT_VINFO_DEF_TYPE (def_arg_stmt_info)
6758 == vect_double_reduction_def))
6759 double_reduc = true;
6762 vect_reduction_type reduction_type
6763 = STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info);
6764 if ((double_reduc || reduction_type != TREE_CODE_REDUCTION)
6765 && ncopies > 1)
6767 if (dump_enabled_p ())
6768 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6769 "multiple types in double reduction or condition "
6770 "reduction.\n");
6771 return false;
6774 if (code == COND_EXPR)
6776 /* Only call during the analysis stage, otherwise we'll lose
6777 STMT_VINFO_TYPE. */
6778 if (!vec_stmt && !vectorizable_condition (stmt_info, gsi, NULL,
6779 true, NULL, cost_vec))
6781 if (dump_enabled_p ())
6782 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6783 "unsupported condition in reduction\n");
6784 return false;
6787 else if (code == LSHIFT_EXPR || code == RSHIFT_EXPR
6788 || code == LROTATE_EXPR || code == RROTATE_EXPR)
6790 /* Only call during the analysis stage, otherwise we'll lose
6791 STMT_VINFO_TYPE. We only support this for nested cycles
6792 without double reductions at the moment. */
6793 if (!nested_cycle
6794 || double_reduc
6795 || (!vec_stmt && !vectorizable_shift (stmt_info, gsi, NULL,
6796 NULL, cost_vec)))
6798 if (dump_enabled_p ())
6799 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6800 "unsupported shift or rotation in reduction\n");
6801 return false;
6804 else
6806 /* 4. Supportable by target? */
6808 /* 4.1. check support for the operation in the loop */
6809 optab = optab_for_tree_code (code, vectype_in, optab_default);
6810 if (!optab)
6812 if (dump_enabled_p ())
6813 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6814 "no optab.\n");
6816 return false;
6819 if (optab_handler (optab, vec_mode) == CODE_FOR_nothing)
6821 if (dump_enabled_p ())
6822 dump_printf (MSG_NOTE, "op not supported by target.\n");
6824 if (maybe_ne (GET_MODE_SIZE (vec_mode), UNITS_PER_WORD)
6825 || !vect_worthwhile_without_simd_p (loop_vinfo, code))
6826 return false;
6828 if (dump_enabled_p ())
6829 dump_printf (MSG_NOTE, "proceeding using word mode.\n");
6832 /* Worthwhile without SIMD support? */
6833 if (!VECTOR_MODE_P (TYPE_MODE (vectype_in))
6834 && !vect_worthwhile_without_simd_p (loop_vinfo, code))
6836 if (dump_enabled_p ())
6837 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6838 "not worthwhile without SIMD support.\n");
6840 return false;
6844 /* 4.2. Check support for the epilog operation.
6846 If STMT represents a reduction pattern, then the type of the
6847 reduction variable may be different than the type of the rest
6848 of the arguments. For example, consider the case of accumulation
6849 of shorts into an int accumulator. The original code:
6850 S1: int_a = (int) short_a;
6851 orig_stmt-> S2: int_acc = plus <int_a ,int_acc>;
6853 was replaced with:
6854 STMT: int_acc = widen_sum <short_a, int_acc>
6856 This means that:
6857 1. The tree-code that is used to create the vector operation in the
6858 epilog code (that reduces the partial results) is not the
6859 tree-code of STMT, but is rather the tree-code of the original
6860 stmt from the pattern that STMT is replacing. I.e, in the example
6861 above we want to use 'widen_sum' in the loop, but 'plus' in the
6862 epilog.
6863 2. The type (mode) we use to check available target support
6864 for the vector operation to be created in the *epilog*, is
6865 determined by the type of the reduction variable (in the example
6866 above we'd check this: optab_handler (plus_optab, vect_int_mode])).
6867 However the type (mode) we use to check available target support
6868 for the vector operation to be created *inside the loop*, is
6869 determined by the type of the other arguments to STMT (in the
6870 example we'd check this: optab_handler (widen_sum_optab,
6871 vect_short_mode)).
6873 This is contrary to "regular" reductions, in which the types of all
6874 the arguments are the same as the type of the reduction variable.
6875 For "regular" reductions we can therefore use the same vector type
6876 (and also the same tree-code) when generating the epilog code and
6877 when generating the code inside the loop. */
6879 if (orig_stmt_info
6880 && (reduction_type == TREE_CODE_REDUCTION
6881 || reduction_type == FOLD_LEFT_REDUCTION))
6883 /* This is a reduction pattern: get the vectype from the type of the
6884 reduction variable, and get the tree-code from orig_stmt. */
6885 orig_code = gimple_assign_rhs_code (orig_stmt_info->stmt);
6886 gcc_assert (vectype_out);
6887 vec_mode = TYPE_MODE (vectype_out);
6889 else
6891 /* Regular reduction: the same vectype and tree-code as used for
6892 the vector code inside the loop can be used for the epilog code. */
6893 orig_code = code;
6895 if (code == MINUS_EXPR)
6896 orig_code = PLUS_EXPR;
6898 /* For simple condition reductions, replace with the actual expression
6899 we want to base our reduction around. */
6900 if (reduction_type == CONST_COND_REDUCTION)
6902 orig_code = STMT_VINFO_VEC_CONST_COND_REDUC_CODE (stmt_info);
6903 gcc_assert (orig_code == MAX_EXPR || orig_code == MIN_EXPR);
6905 else if (reduction_type == INTEGER_INDUC_COND_REDUCTION)
6906 orig_code = cond_reduc_op_code;
6909 reduc_fn = IFN_LAST;
6911 if (reduction_type == TREE_CODE_REDUCTION
6912 || reduction_type == FOLD_LEFT_REDUCTION
6913 || reduction_type == INTEGER_INDUC_COND_REDUCTION
6914 || reduction_type == CONST_COND_REDUCTION)
6916 if (reduction_type == FOLD_LEFT_REDUCTION
6917 ? fold_left_reduction_fn (orig_code, &reduc_fn)
6918 : reduction_fn_for_scalar_code (orig_code, &reduc_fn))
6920 if (reduc_fn != IFN_LAST
6921 && !direct_internal_fn_supported_p (reduc_fn, vectype_out,
6922 OPTIMIZE_FOR_SPEED))
6924 if (dump_enabled_p ())
6925 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6926 "reduc op not supported by target.\n");
6928 reduc_fn = IFN_LAST;
6931 else
6933 if (!nested_cycle || double_reduc)
6935 if (dump_enabled_p ())
6936 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6937 "no reduc code for scalar code.\n");
6939 return false;
6943 else if (reduction_type == COND_REDUCTION)
6945 int scalar_precision
6946 = GET_MODE_PRECISION (SCALAR_TYPE_MODE (scalar_type));
6947 cr_index_scalar_type = make_unsigned_type (scalar_precision);
6948 cr_index_vector_type = build_vector_type (cr_index_scalar_type,
6949 nunits_out);
6951 if (direct_internal_fn_supported_p (IFN_REDUC_MAX, cr_index_vector_type,
6952 OPTIMIZE_FOR_SPEED))
6953 reduc_fn = IFN_REDUC_MAX;
6956 if (reduction_type != EXTRACT_LAST_REDUCTION
6957 && (!nested_cycle || double_reduc)
6958 && reduc_fn == IFN_LAST
6959 && !nunits_out.is_constant ())
6961 if (dump_enabled_p ())
6962 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6963 "missing target support for reduction on"
6964 " variable-length vectors.\n");
6965 return false;
6968 /* For SLP reductions, see if there is a neutral value we can use. */
6969 tree neutral_op = NULL_TREE;
6970 if (slp_node)
6971 neutral_op = neutral_op_for_slp_reduction
6972 (slp_node_instance->reduc_phis, code,
6973 REDUC_GROUP_FIRST_ELEMENT (stmt_info) != NULL);
6975 if (double_reduc && reduction_type == FOLD_LEFT_REDUCTION)
6977 /* We can't support in-order reductions of code such as this:
6979 for (int i = 0; i < n1; ++i)
6980 for (int j = 0; j < n2; ++j)
6981 l += a[j];
6983 since GCC effectively transforms the loop when vectorizing:
6985 for (int i = 0; i < n1 / VF; ++i)
6986 for (int j = 0; j < n2; ++j)
6987 for (int k = 0; k < VF; ++k)
6988 l += a[j];
6990 which is a reassociation of the original operation. */
6991 if (dump_enabled_p ())
6992 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6993 "in-order double reduction not supported.\n");
6995 return false;
6998 if (reduction_type == FOLD_LEFT_REDUCTION
6999 && slp_node
7000 && !REDUC_GROUP_FIRST_ELEMENT (stmt_info))
7002 /* We cannot use in-order reductions in this case because there is
7003 an implicit reassociation of the operations involved. */
7004 if (dump_enabled_p ())
7005 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7006 "in-order unchained SLP reductions not supported.\n");
7007 return false;
7010 /* For double reductions, and for SLP reductions with a neutral value,
7011 we construct a variable-length initial vector by loading a vector
7012 full of the neutral value and then shift-and-inserting the start
7013 values into the low-numbered elements. */
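/* For instance (a sketch, assuming a sum reduction whose neutral value
   is 0): the initial vector is conceptually { init, 0, 0, ... }, built
   by loading an all-zeros vector and shift-inserting the start value
   into its lowest element with the IFN_VEC_SHL_INSERT operation.  */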
7014 if ((double_reduc || neutral_op)
7015 && !nunits_out.is_constant ()
7016 && !direct_internal_fn_supported_p (IFN_VEC_SHL_INSERT,
7017 vectype_out, OPTIMIZE_FOR_SPEED))
7019 if (dump_enabled_p ())
7020 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7021 "reduction on variable-length vectors requires"
7022 " target support for a vector-shift-and-insert"
7023 " operation.\n");
7024 return false;
7027 /* Check extra constraints for variable-length unchained SLP reductions. */
7028 if (STMT_SLP_TYPE (stmt_info)
7029 && !REDUC_GROUP_FIRST_ELEMENT (stmt_info)
7030 && !nunits_out.is_constant ())
7032 /* We checked above that we could build the initial vector when
7033 there's a neutral element value. Check here for the case in
7034 which each SLP statement has its own initial value and in which
7035 that value needs to be repeated for every instance of the
7036 statement within the initial vector. */
7037 unsigned int group_size = SLP_TREE_SCALAR_STMTS (slp_node).length ();
7038 scalar_mode elt_mode = SCALAR_TYPE_MODE (TREE_TYPE (vectype_out));
7039 if (!neutral_op
7040 && !can_duplicate_and_interleave_p (group_size, elt_mode))
7042 if (dump_enabled_p ())
7043 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7044 "unsupported form of SLP reduction for"
7045 " variable-length vectors: cannot build"
7046 " initial vector.\n");
7047 return false;
7049 /* The epilogue code relies on the number of elements being a multiple
7050 of the group size. The duplicate-and-interleave approach to setting
7051 up the initial vector does too. */
7052 if (!multiple_p (nunits_out, group_size))
7054 if (dump_enabled_p ())
7055 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7056 "unsupported form of SLP reduction for"
7057 " variable-length vectors: the vector size"
7058 " is not a multiple of the number of results.\n");
7059 return false;
7063 /* In case of widening multiplication by a constant, we update the type
7064 of the constant to be the type of the other operand. We check that the
7065 constant fits the type in the pattern recognition pass. */
7066 if (code == DOT_PROD_EXPR
7067 && !types_compatible_p (TREE_TYPE (ops[0]), TREE_TYPE (ops[1])))
7069 if (TREE_CODE (ops[0]) == INTEGER_CST)
7070 ops[0] = fold_convert (TREE_TYPE (ops[1]), ops[0]);
7071 else if (TREE_CODE (ops[1]) == INTEGER_CST)
7072 ops[1] = fold_convert (TREE_TYPE (ops[0]), ops[1]);
7073 else
7075 if (dump_enabled_p ())
7076 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7077 "invalid types in dot-prod\n");
7079 return false;
7083 if (reduction_type == COND_REDUCTION)
7085 widest_int ni;
7087 if (! max_loop_iterations (loop, &ni))
7089 if (dump_enabled_p ())
7090 dump_printf_loc (MSG_NOTE, vect_location,
7091 "loop count not known, cannot create cond "
7092 "reduction.\n");
7093 return false;
7095 /* Convert backedges to iterations. */
7096 ni += 1;
7098 /* The additional index will be the same type as the condition. Check
7099 that the loop iteration count fits into this type less one (because
7100 we'll use up the zero slot for when there are no matches). */
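/* E.g. with an 8-bit unsigned index type (maximum value 255), the check
   below rejects loops that may run 255 or more iterations, since index 0
   is reserved for the no-match case (illustrative width).  */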
7101 tree max_index = TYPE_MAX_VALUE (cr_index_scalar_type);
7102 if (wi::geu_p (ni, wi::to_widest (max_index)))
7104 if (dump_enabled_p ())
7105 dump_printf_loc (MSG_NOTE, vect_location,
7106 "loop size is greater than data size.\n");
7107 return false;
7111 /* In case the vectorization factor (VF) is bigger than the number
7112 of elements that we can fit in a vectype (nunits), we have to generate
7113 more than one vector stmt - i.e., we need to "unroll" the
7114 vector stmt by a factor VF/nunits. For more details see documentation
7115 in vectorizable_operation. */
7117 /* If the reduction is used in an outer loop we need to generate
7118 VF intermediate results, like so (e.g. for ncopies=2):
7119 r0 = phi (init, r0)
7120 r1 = phi (init, r1)
7121 r0 = x0 + r0;
7122 r1 = x1 + r1;
7123 (i.e. we generate VF results in 2 registers).
7124 In this case we have a separate def-use cycle for each copy, and therefore
7125 for each copy we get the vector def for the reduction variable from the
7126 respective phi node created for this copy.
7128 Otherwise (the reduction is unused in the loop nest), we can combine
7129 together intermediate results, like so (e.g. for ncopies=2):
7130 r = phi (init, r)
7131 r = x0 + r;
7132 r = x1 + r;
7133 (i.e. we generate VF/2 results in a single register).
7134 In this case for each copy we get the vector def for the reduction variable
7135 from the vectorized reduction operation generated in the previous iteration.
7137 This only works when we see both the reduction PHI and its only consumer
7138 in vectorizable_reduction and there are no intermediate stmts
7139 participating. */
7140 stmt_vec_info use_stmt_info;
7141 tree reduc_phi_result = gimple_phi_result (reduc_def_phi);
7142 if (ncopies > 1
7143 && (STMT_VINFO_RELEVANT (stmt_info) <= vect_used_only_live)
7144 && (use_stmt_info = loop_vinfo->lookup_single_use (reduc_phi_result))
7145 && vect_stmt_to_vectorize (use_stmt_info) == stmt_info)
7147 single_defuse_cycle = true;
7148 epilog_copies = 1;
7150 else
7151 epilog_copies = ncopies;
7153 /* If the reduction stmt is one of the patterns that have an embedded
7154 lane reduction, we cannot handle the case of ! single_defuse_cycle. */
7155 if ((ncopies > 1
7156 && ! single_defuse_cycle)
7157 && (code == DOT_PROD_EXPR
7158 || code == WIDEN_SUM_EXPR
7159 || code == SAD_EXPR))
7161 if (dump_enabled_p ())
7162 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7163 "multi def-use cycle not possible for lane-reducing "
7164 "reduction operation\n");
7165 return false;
7168 if (slp_node)
7169 vec_num = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node);
7170 else
7171 vec_num = 1;
7173 internal_fn cond_fn = get_conditional_internal_fn (code);
7174 vec_loop_masks *masks = &LOOP_VINFO_MASKS (loop_vinfo);
7175 bool mask_by_cond_expr = use_mask_by_cond_expr_p (code, cond_fn, vectype_in);
7177 if (!vec_stmt) /* transformation not required. */
7179 vect_model_reduction_cost (stmt_info, reduc_fn, ncopies, cost_vec);
7180 if (loop_vinfo && LOOP_VINFO_CAN_FULLY_MASK_P (loop_vinfo))
7182 if (reduction_type != FOLD_LEFT_REDUCTION
7183 && !mask_by_cond_expr
7184 && (cond_fn == IFN_LAST
7185 || !direct_internal_fn_supported_p (cond_fn, vectype_in,
7186 OPTIMIZE_FOR_SPEED)))
7188 if (dump_enabled_p ())
7189 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7190 "can't use a fully-masked loop because no"
7191 " conditional operation is available.\n");
7192 LOOP_VINFO_CAN_FULLY_MASK_P (loop_vinfo) = false;
7194 else if (reduc_index == -1)
7196 if (dump_enabled_p ())
7197 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7198 "can't use a fully-masked loop for chained"
7199 " reductions.\n");
7200 LOOP_VINFO_CAN_FULLY_MASK_P (loop_vinfo) = false;
7202 else
7203 vect_record_loop_mask (loop_vinfo, masks, ncopies * vec_num,
7204 vectype_in);
7206 if (dump_enabled_p ()
7207 && reduction_type == FOLD_LEFT_REDUCTION)
7208 dump_printf_loc (MSG_NOTE, vect_location,
7209 "using an in-order (fold-left) reduction.\n");
7210 STMT_VINFO_TYPE (stmt_info) = reduc_vec_info_type;
7211 return true;
7214 /* Transform. */
7216 if (dump_enabled_p ())
7217 dump_printf_loc (MSG_NOTE, vect_location, "transform reduction.\n");
7219 /* FORNOW: Multiple types are not supported for condition. */
7220 if (code == COND_EXPR)
7221 gcc_assert (ncopies == 1);
7223 bool masked_loop_p = LOOP_VINFO_FULLY_MASKED_P (loop_vinfo);
7225 if (reduction_type == FOLD_LEFT_REDUCTION)
7226 return vectorize_fold_left_reduction
7227 (stmt_info, gsi, vec_stmt, slp_node, reduc_def_phi, code,
7228 reduc_fn, ops, vectype_in, reduc_index, masks);
7230 if (reduction_type == EXTRACT_LAST_REDUCTION)
7232 gcc_assert (!slp_node);
7233 return vectorizable_condition (stmt_info, gsi, vec_stmt,
7234 true, NULL, NULL);
7237 /* Create the destination vector */
7238 vec_dest = vect_create_destination_var (scalar_dest, vectype_out);
7240 prev_stmt_info = NULL;
7241 prev_phi_info = NULL;
7242 if (!slp_node)
7244 vec_oprnds0.create (1);
7245 vec_oprnds1.create (1);
7246 if (op_type == ternary_op)
7247 vec_oprnds2.create (1);
7250 phis.create (vec_num);
7251 vect_defs.create (vec_num);
7252 if (!slp_node)
7253 vect_defs.quick_push (NULL_TREE);
7255 if (slp_node)
7256 phis.splice (SLP_TREE_VEC_STMTS (slp_node_instance->reduc_phis));
7257 else
7258 phis.quick_push (STMT_VINFO_VEC_STMT (reduc_def_info));
7260 for (j = 0; j < ncopies; j++)
7262 if (code == COND_EXPR)
7264 gcc_assert (!slp_node);
7265 vectorizable_condition (stmt_info, gsi, vec_stmt,
7266 true, NULL, NULL);
7267 break;
7269 if (code == LSHIFT_EXPR
7270 || code == RSHIFT_EXPR)
7272 vectorizable_shift (stmt_info, gsi, vec_stmt, slp_node, NULL);
7273 break;
7276 /* Handle uses. */
7277 if (j == 0)
7279 if (slp_node)
7281 /* Get vec defs for all the operands except the reduction index,
7282 ensuring the ordering of the ops in the vector is kept. */
7283 auto_vec<tree, 3> slp_ops;
7284 auto_vec<vec<tree>, 3> vec_defs;
7286 slp_ops.quick_push (ops[0]);
7287 slp_ops.quick_push (ops[1]);
7288 if (op_type == ternary_op)
7289 slp_ops.quick_push (ops[2]);
7291 vect_get_slp_defs (slp_ops, slp_node, &vec_defs);
7293 vec_oprnds0.safe_splice (vec_defs[0]);
7294 vec_defs[0].release ();
7295 vec_oprnds1.safe_splice (vec_defs[1]);
7296 vec_defs[1].release ();
7297 if (op_type == ternary_op)
7299 vec_oprnds2.safe_splice (vec_defs[2]);
7300 vec_defs[2].release ();
7303 else
7305 vec_oprnds0.quick_push
7306 (vect_get_vec_def_for_operand (ops[0], stmt_info));
7307 vec_oprnds1.quick_push
7308 (vect_get_vec_def_for_operand (ops[1], stmt_info));
7309 if (op_type == ternary_op)
7310 vec_oprnds2.quick_push
7311 (vect_get_vec_def_for_operand (ops[2], stmt_info));
7314 else
7316 if (!slp_node)
7318 gcc_assert (reduc_index != -1 || ! single_defuse_cycle);
7320 if (single_defuse_cycle && reduc_index == 0)
7321 vec_oprnds0[0] = gimple_get_lhs (new_stmt_info->stmt);
7322 else
7323 vec_oprnds0[0]
7324 = vect_get_vec_def_for_stmt_copy (loop_vinfo,
7325 vec_oprnds0[0]);
7326 if (single_defuse_cycle && reduc_index == 1)
7327 vec_oprnds1[0] = gimple_get_lhs (new_stmt_info->stmt);
7328 else
7329 vec_oprnds1[0]
7330 = vect_get_vec_def_for_stmt_copy (loop_vinfo,
7331 vec_oprnds1[0]);
7332 if (op_type == ternary_op)
7334 if (single_defuse_cycle && reduc_index == 2)
7335 vec_oprnds2[0] = gimple_get_lhs (new_stmt_info->stmt);
7336 else
7337 vec_oprnds2[0]
7338 = vect_get_vec_def_for_stmt_copy (loop_vinfo,
7339 vec_oprnds2[0]);
7344 FOR_EACH_VEC_ELT (vec_oprnds0, i, def0)
7346 tree vop[3] = { def0, vec_oprnds1[i], NULL_TREE };
7347 if (masked_loop_p && !mask_by_cond_expr)
7349 /* Make sure that the reduction accumulator is vop[0]. */
7350 if (reduc_index == 1)
7352 gcc_assert (commutative_tree_code (code));
7353 std::swap (vop[0], vop[1]);
7355 tree mask = vect_get_loop_mask (gsi, masks, vec_num * ncopies,
7356 vectype_in, i * ncopies + j);
7357 gcall *call = gimple_build_call_internal (cond_fn, 4, mask,
7358 vop[0], vop[1],
7359 vop[0]);
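/* E.g. for a PLUS_EXPR reduction this builds
   IFN_COND_ADD (loop_mask, acc, x, acc), so lanes disabled by the loop
   mask simply keep the accumulator value (a sketch of the conditional
   call built here).  */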
7360 new_temp = make_ssa_name (vec_dest, call);
7361 gimple_call_set_lhs (call, new_temp);
7362 gimple_call_set_nothrow (call, true);
7363 new_stmt_info
7364 = vect_finish_stmt_generation (stmt_info, call, gsi);
7366 else
7368 if (op_type == ternary_op)
7369 vop[2] = vec_oprnds2[i];
7371 if (masked_loop_p && mask_by_cond_expr)
7373 tree mask = vect_get_loop_mask (gsi, masks,
7374 vec_num * ncopies,
7375 vectype_in, i * ncopies + j);
7376 build_vect_cond_expr (code, vop, mask, gsi);
7379 gassign *new_stmt = gimple_build_assign (vec_dest, code,
7380 vop[0], vop[1], vop[2]);
7381 new_temp = make_ssa_name (vec_dest, new_stmt);
7382 gimple_assign_set_lhs (new_stmt, new_temp);
7383 new_stmt_info
7384 = vect_finish_stmt_generation (stmt_info, new_stmt, gsi);
7387 if (slp_node)
7389 SLP_TREE_VEC_STMTS (slp_node).quick_push (new_stmt_info);
7390 vect_defs.quick_push (new_temp);
7392 else
7393 vect_defs[0] = new_temp;
7396 if (slp_node)
7397 continue;
7399 if (j == 0)
7400 STMT_VINFO_VEC_STMT (stmt_info) = *vec_stmt = new_stmt_info;
7401 else
7402 STMT_VINFO_RELATED_STMT (prev_stmt_info) = new_stmt_info;
7404 prev_stmt_info = new_stmt_info;
7407 /* Finalize the reduction-phi (set its arguments) and create the
7408 epilog reduction code. */
7409 if ((!single_defuse_cycle || code == COND_EXPR) && !slp_node)
7410 vect_defs[0] = gimple_get_lhs ((*vec_stmt)->stmt);
7412 vect_create_epilog_for_reduction (vect_defs, stmt_info, reduc_def_phi,
7413 epilog_copies, reduc_fn, phis,
7414 double_reduc, slp_node, slp_node_instance,
7415 cond_reduc_val, cond_reduc_op_code,
7416 neutral_op);
7418 return true;
7421 /* Function vect_min_worthwhile_factor.
7423 For a loop where we could vectorize the operation indicated by CODE,
7424 return the minimum vectorization factor that makes it worthwhile
7425 to use generic vectors. */
7426 static unsigned int
7427 vect_min_worthwhile_factor (enum tree_code code)
7429 switch (code)
7431 case PLUS_EXPR:
7432 case MINUS_EXPR:
7433 case NEGATE_EXPR:
7434 return 4;
7436 case BIT_AND_EXPR:
7437 case BIT_IOR_EXPR:
7438 case BIT_XOR_EXPR:
7439 case BIT_NOT_EXPR:
7440 return 2;
7442 default:
7443 return INT_MAX;
7447 /* Return true if VINFO indicates we are doing loop vectorization and if
7448 it is worth decomposing CODE operations into scalar operations for
7449 that loop's vectorization factor. */
7451 bool
7452 vect_worthwhile_without_simd_p (vec_info *vinfo, tree_code code)
7454 loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo);
7455 unsigned HOST_WIDE_INT value;
7456 return (loop_vinfo
7457 && LOOP_VINFO_VECT_FACTOR (loop_vinfo).is_constant (&value)
7458 && value >= vect_min_worthwhile_factor (code));
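/* For instance, with a constant vectorization factor of 2 the bitwise
   codes qualify (minimum factor 2), while PLUS_EXPR would need a factor
   of at least 4; see vect_min_worthwhile_factor above.  */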
7461 /* Function vectorizable_induction
7463 Check if STMT_INFO performs an induction computation that can be vectorized.
7464 If VEC_STMT is also passed, vectorize the induction PHI: create a vectorized
7465 phi to replace it, put it in VEC_STMT, and add it to the same basic block.
7466 Return true if STMT_INFO is vectorizable in this way. */
7468 bool
7469 vectorizable_induction (stmt_vec_info stmt_info,
7470 gimple_stmt_iterator *gsi ATTRIBUTE_UNUSED,
7471 stmt_vec_info *vec_stmt, slp_tree slp_node,
7472 stmt_vector_for_cost *cost_vec)
7474 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
7475 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
7476 unsigned ncopies;
7477 bool nested_in_vect_loop = false;
7478 struct loop *iv_loop;
7479 tree vec_def;
7480 edge pe = loop_preheader_edge (loop);
7481 basic_block new_bb;
7482 tree new_vec, vec_init, vec_step, t;
7483 tree new_name;
7484 gimple *new_stmt;
7485 gphi *induction_phi;
7486 tree induc_def, vec_dest;
7487 tree init_expr, step_expr;
7488 poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
7489 unsigned i;
7490 tree expr;
7491 gimple_seq stmts;
7492 imm_use_iterator imm_iter;
7493 use_operand_p use_p;
7494 gimple *exit_phi;
7495 edge latch_e;
7496 tree loop_arg;
7497 gimple_stmt_iterator si;
7499 gphi *phi = dyn_cast <gphi *> (stmt_info->stmt);
7500 if (!phi)
7501 return false;
7503 if (!STMT_VINFO_RELEVANT_P (stmt_info))
7504 return false;
7506 /* Make sure it was recognized as induction computation. */
7507 if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_induction_def)
7508 return false;
7510 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
7511 poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (vectype);
7513 if (slp_node)
7514 ncopies = 1;
7515 else
7516 ncopies = vect_get_num_copies (loop_vinfo, vectype);
7517 gcc_assert (ncopies >= 1);
7519 /* FORNOW. These restrictions should be relaxed. */
7520 if (nested_in_vect_loop_p (loop, stmt_info))
7522 imm_use_iterator imm_iter;
7523 use_operand_p use_p;
7524 gimple *exit_phi;
7525 edge latch_e;
7526 tree loop_arg;
7528 if (ncopies > 1)
7530 if (dump_enabled_p ())
7531 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7532 "multiple types in nested loop.\n");
7533 return false;
7536 /* FORNOW: outer loop induction with SLP not supported. */
7537 if (STMT_SLP_TYPE (stmt_info))
7538 return false;
7540 exit_phi = NULL;
7541 latch_e = loop_latch_edge (loop->inner);
7542 loop_arg = PHI_ARG_DEF_FROM_EDGE (phi, latch_e);
7543 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, loop_arg)
7545 gimple *use_stmt = USE_STMT (use_p);
7546 if (is_gimple_debug (use_stmt))
7547 continue;
7549 if (!flow_bb_inside_loop_p (loop->inner, gimple_bb (use_stmt)))
7551 exit_phi = use_stmt;
7552 break;
7555 if (exit_phi)
7557 stmt_vec_info exit_phi_vinfo = loop_vinfo->lookup_stmt (exit_phi);
7558 if (!(STMT_VINFO_RELEVANT_P (exit_phi_vinfo)
7559 && !STMT_VINFO_LIVE_P (exit_phi_vinfo)))
7561 if (dump_enabled_p ())
7562 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7563 "inner-loop induction only used outside "
7564 "of the outer vectorized loop.\n");
7565 return false;
7569 nested_in_vect_loop = true;
7570 iv_loop = loop->inner;
7572 else
7573 iv_loop = loop;
7574 gcc_assert (iv_loop == (gimple_bb (phi))->loop_father);
7576 if (slp_node && !nunits.is_constant ())
7578 /* The current SLP code creates the initial value element-by-element. */
7579 if (dump_enabled_p ())
7580 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7581 "SLP induction not supported for variable-length"
7582 " vectors.\n");
7583 return false;
7586 if (!vec_stmt) /* transformation not required. */
7588 STMT_VINFO_TYPE (stmt_info) = induc_vec_info_type;
7589 DUMP_VECT_SCOPE ("vectorizable_induction");
7590 vect_model_induction_cost (stmt_info, ncopies, cost_vec);
7591 return true;
7594 /* Transform. */
7596 /* Compute a vector variable, initialized with the first VF values of
7597 the induction variable. E.g., for an iv with IV_PHI='X' and
7598 evolution S, for a vector of 4 units, we want to compute:
7599 [X, X + S, X + 2*S, X + 3*S]. */
7601 if (dump_enabled_p ())
7602 dump_printf_loc (MSG_NOTE, vect_location, "transform induction phi.\n");
7604 latch_e = loop_latch_edge (iv_loop);
7605 loop_arg = PHI_ARG_DEF_FROM_EDGE (phi, latch_e);
7607 step_expr = STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_info);
7608 gcc_assert (step_expr != NULL_TREE);
7610 pe = loop_preheader_edge (iv_loop);
7611 init_expr = PHI_ARG_DEF_FROM_EDGE (phi,
7612 loop_preheader_edge (iv_loop));
7614 stmts = NULL;
7615 if (!nested_in_vect_loop)
7617 /* Convert the initial value to the desired type. */
7618 tree new_type = TREE_TYPE (vectype);
7619 init_expr = gimple_convert (&stmts, new_type, init_expr);
7621 /* If we are using the loop mask to "peel" for alignment then we need
7622 to adjust the start value here. */
7623 tree skip_niters = LOOP_VINFO_MASK_SKIP_NITERS (loop_vinfo);
7624 if (skip_niters != NULL_TREE)
7626 if (FLOAT_TYPE_P (vectype))
7627 skip_niters = gimple_build (&stmts, FLOAT_EXPR, new_type,
7628 skip_niters);
7629 else
7630 skip_niters = gimple_convert (&stmts, new_type, skip_niters);
7631 tree skip_step = gimple_build (&stmts, MULT_EXPR, new_type,
7632 skip_niters, step_expr);
7633 init_expr = gimple_build (&stmts, MINUS_EXPR, new_type,
7634 init_expr, skip_step);
7638 /* Convert the step to the desired type. */
7639 step_expr = gimple_convert (&stmts, TREE_TYPE (vectype), step_expr);
7641 if (stmts)
7643 new_bb = gsi_insert_seq_on_edge_immediate (pe, stmts);
7644 gcc_assert (!new_bb);
7647 /* Find the first insertion point in the BB. */
7648 basic_block bb = gimple_bb (phi);
7649 si = gsi_after_labels (bb);
7651 /* For SLP induction we have to generate several IVs as for example
7652 with group size 3 we need [i, i, i, i + S] [i + S, i + S, i + 2*S, i + 2*S]
7653 [i + 2*S, i + 3*S, i + 3*S, i + 3*S]. The step is the same uniform
7654 [VF*S, VF*S, VF*S, VF*S] for all. */
7655 if (slp_node)
7657 /* Enforced above. */
7658 unsigned int const_nunits = nunits.to_constant ();
7660 /* Generate [VF*S, VF*S, ... ]. */
7661 if (SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr)))
7663 expr = build_int_cst (integer_type_node, vf);
7664 expr = fold_convert (TREE_TYPE (step_expr), expr);
7666 else
7667 expr = build_int_cst (TREE_TYPE (step_expr), vf);
7668 new_name = fold_build2 (MULT_EXPR, TREE_TYPE (step_expr),
7669 expr, step_expr);
7670 if (! CONSTANT_CLASS_P (new_name))
7671 new_name = vect_init_vector (stmt_info, new_name,
7672 TREE_TYPE (step_expr), NULL);
7673 new_vec = build_vector_from_val (vectype, new_name);
7674 vec_step = vect_init_vector (stmt_info, new_vec, vectype, NULL);
7676 /* Now generate the IVs. */
7677 unsigned group_size = SLP_TREE_SCALAR_STMTS (slp_node).length ();
7678 unsigned nvects = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node);
7679 unsigned elts = const_nunits * nvects;
7680 unsigned nivs = least_common_multiple (group_size,
7681 const_nunits) / const_nunits;
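/* E.g. for the group-size-3, 4-lane example in the comment above,
   nivs = least_common_multiple (3, 4) / 4 = 3, matching the three
   initial IV vectors shown there (illustrative numbers).  */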
7682 gcc_assert (elts % group_size == 0);
7683 tree elt = init_expr;
7684 unsigned ivn;
7685 for (ivn = 0; ivn < nivs; ++ivn)
7687 tree_vector_builder elts (vectype, const_nunits, 1);
7688 stmts = NULL;
7689 for (unsigned eltn = 0; eltn < const_nunits; ++eltn)
7691 if (ivn*const_nunits + eltn >= group_size
7692 && (ivn * const_nunits + eltn) % group_size == 0)
7693 elt = gimple_build (&stmts, PLUS_EXPR, TREE_TYPE (elt),
7694 elt, step_expr);
7695 elts.quick_push (elt);
7697 vec_init = gimple_build_vector (&stmts, &elts);
7698 if (stmts)
7700 new_bb = gsi_insert_seq_on_edge_immediate (pe, stmts);
7701 gcc_assert (!new_bb);
7704 /* Create the induction-phi that defines the induction-operand. */
7705 vec_dest = vect_get_new_vect_var (vectype, vect_simple_var, "vec_iv_");
7706 induction_phi = create_phi_node (vec_dest, iv_loop->header);
7707 stmt_vec_info induction_phi_info
7708 = loop_vinfo->add_stmt (induction_phi);
7709 induc_def = PHI_RESULT (induction_phi);
7711 /* Create the iv update inside the loop */
7712 vec_def = make_ssa_name (vec_dest);
7713 new_stmt = gimple_build_assign (vec_def, PLUS_EXPR, induc_def, vec_step);
7714 gsi_insert_before (&si, new_stmt, GSI_SAME_STMT);
7715 loop_vinfo->add_stmt (new_stmt);
7717 /* Set the arguments of the phi node: */
7718 add_phi_arg (induction_phi, vec_init, pe, UNKNOWN_LOCATION);
7719 add_phi_arg (induction_phi, vec_def, loop_latch_edge (iv_loop),
7720 UNKNOWN_LOCATION);
7722 SLP_TREE_VEC_STMTS (slp_node).quick_push (induction_phi_info);
7725 /* Re-use IVs when we can. */
7726 if (ivn < nvects)
7728 unsigned vfp
7729 = least_common_multiple (group_size, const_nunits) / group_size;
7730 /* Generate [VF'*S, VF'*S, ... ]. */
7731 if (SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr)))
7733 expr = build_int_cst (integer_type_node, vfp);
7734 expr = fold_convert (TREE_TYPE (step_expr), expr);
7736 else
7737 expr = build_int_cst (TREE_TYPE (step_expr), vfp);
7738 new_name = fold_build2 (MULT_EXPR, TREE_TYPE (step_expr),
7739 expr, step_expr);
7740 if (! CONSTANT_CLASS_P (new_name))
7741 new_name = vect_init_vector (stmt_info, new_name,
7742 TREE_TYPE (step_expr), NULL);
7743 new_vec = build_vector_from_val (vectype, new_name);
7744 vec_step = vect_init_vector (stmt_info, new_vec, vectype, NULL);
7745 for (; ivn < nvects; ++ivn)
7747 gimple *iv = SLP_TREE_VEC_STMTS (slp_node)[ivn - nivs]->stmt;
7748 tree def;
7749 if (gimple_code (iv) == GIMPLE_PHI)
7750 def = gimple_phi_result (iv);
7751 else
7752 def = gimple_assign_lhs (iv);
7753 new_stmt = gimple_build_assign (make_ssa_name (vectype),
7754 PLUS_EXPR,
7755 def, vec_step);
7756 if (gimple_code (iv) == GIMPLE_PHI)
7757 gsi_insert_before (&si, new_stmt, GSI_SAME_STMT);
7758 else
7760 gimple_stmt_iterator tgsi = gsi_for_stmt (iv);
7761 gsi_insert_after (&tgsi, new_stmt, GSI_CONTINUE_LINKING);
7763 SLP_TREE_VEC_STMTS (slp_node).quick_push
7764 (loop_vinfo->add_stmt (new_stmt));
7768 return true;
7771 /* Create the vector that holds the initial_value of the induction. */
7772 if (nested_in_vect_loop)
7774 /* iv_loop is nested in the loop to be vectorized. init_expr had already
7775 been created during vectorization of previous stmts. We obtain it
7776 from the STMT_VINFO_VEC_STMT of the defining stmt. */
7777 vec_init = vect_get_vec_def_for_operand (init_expr, stmt_info);
7778 /* If the initial value is not of proper type, convert it. */
7779 if (!useless_type_conversion_p (vectype, TREE_TYPE (vec_init)))
7781 new_stmt
7782 = gimple_build_assign (vect_get_new_ssa_name (vectype,
7783 vect_simple_var,
7784 "vec_iv_"),
7785 VIEW_CONVERT_EXPR,
7786 build1 (VIEW_CONVERT_EXPR, vectype,
7787 vec_init));
7788 vec_init = gimple_assign_lhs (new_stmt);
7789 new_bb = gsi_insert_on_edge_immediate (loop_preheader_edge (iv_loop),
7790 new_stmt);
7791 gcc_assert (!new_bb);
7792 loop_vinfo->add_stmt (new_stmt);
7795 else
7797 /* iv_loop is the loop to be vectorized. Create:
7798 vec_init = [X, X+S, X+2*S, X+3*S] (S = step_expr, X = init_expr) */
7799 stmts = NULL;
7800 new_name = gimple_convert (&stmts, TREE_TYPE (vectype), init_expr);
7802 unsigned HOST_WIDE_INT const_nunits;
7803 if (nunits.is_constant (&const_nunits))
7805 tree_vector_builder elts (vectype, const_nunits, 1);
7806 elts.quick_push (new_name);
7807 for (i = 1; i < const_nunits; i++)
7809 /* Create: new_name_i = new_name + step_expr */
7810 new_name = gimple_build (&stmts, PLUS_EXPR, TREE_TYPE (new_name),
7811 new_name, step_expr);
7812 elts.quick_push (new_name);
7814 /* Create a vector from [new_name_0, new_name_1, ...,
7815 new_name_nunits-1] */
7816 vec_init = gimple_build_vector (&stmts, &elts);
7818 else if (INTEGRAL_TYPE_P (TREE_TYPE (step_expr)))
7819 /* Build the initial value directly from a VEC_SERIES_EXPR. */
7820 vec_init = gimple_build (&stmts, VEC_SERIES_EXPR, vectype,
7821 new_name, step_expr);
7822 else
7824 /* Build:
7825 [base, base, base, ...]
7826 + (vectype) [0, 1, 2, ...] * [step, step, step, ...]. */
7827 gcc_assert (SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr)));
7828 gcc_assert (flag_associative_math);
7829 tree index = build_index_vector (vectype, 0, 1);
7830 tree base_vec = gimple_build_vector_from_val (&stmts, vectype,
7831 new_name);
7832 tree step_vec = gimple_build_vector_from_val (&stmts, vectype,
7833 step_expr);
7834 vec_init = gimple_build (&stmts, FLOAT_EXPR, vectype, index);
7835 vec_init = gimple_build (&stmts, MULT_EXPR, vectype,
7836 vec_init, step_vec);
7837 vec_init = gimple_build (&stmts, PLUS_EXPR, vectype,
7838 vec_init, base_vec);
7841 if (stmts)
7843 new_bb = gsi_insert_seq_on_edge_immediate (pe, stmts);
7844 gcc_assert (!new_bb);
7849 /* Create the vector that holds the step of the induction. */
7850 if (nested_in_vect_loop)
7851 /* iv_loop is nested in the loop to be vectorized. Generate:
7852 vec_step = [S, S, S, S] */
7853 new_name = step_expr;
7854 else
7856 /* iv_loop is the loop to be vectorized. Generate:
7857 vec_step = [VF*S, VF*S, VF*S, VF*S] */
7858 gimple_seq seq = NULL;
7859 if (SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr)))
7861 expr = build_int_cst (integer_type_node, vf);
7862 expr = gimple_build (&seq, FLOAT_EXPR, TREE_TYPE (step_expr), expr);
7864 else
7865 expr = build_int_cst (TREE_TYPE (step_expr), vf);
7866 new_name = gimple_build (&seq, MULT_EXPR, TREE_TYPE (step_expr),
7867 expr, step_expr);
7868 if (seq)
7870 new_bb = gsi_insert_seq_on_edge_immediate (pe, seq);
7871 gcc_assert (!new_bb);
7875 t = unshare_expr (new_name);
7876 gcc_assert (CONSTANT_CLASS_P (new_name)
7877 || TREE_CODE (new_name) == SSA_NAME);
7878 new_vec = build_vector_from_val (vectype, t);
7879 vec_step = vect_init_vector (stmt_info, new_vec, vectype, NULL);
7882 /* Create the following def-use cycle:
7883 loop prolog:
7884 vec_init = ...
7885 vec_step = ...
7886 loop:
7887 vec_iv = PHI <vec_init, vec_loop>
7889 STMT
7891 vec_loop = vec_iv + vec_step; */
7893 /* Create the induction-phi that defines the induction-operand. */
7894 vec_dest = vect_get_new_vect_var (vectype, vect_simple_var, "vec_iv_");
7895 induction_phi = create_phi_node (vec_dest, iv_loop->header);
7896 stmt_vec_info induction_phi_info = loop_vinfo->add_stmt (induction_phi);
7897 induc_def = PHI_RESULT (induction_phi);
7899 /* Create the iv update inside the loop */
7900 vec_def = make_ssa_name (vec_dest);
7901 new_stmt = gimple_build_assign (vec_def, PLUS_EXPR, induc_def, vec_step);
7902 gsi_insert_before (&si, new_stmt, GSI_SAME_STMT);
7903 stmt_vec_info new_stmt_info = loop_vinfo->add_stmt (new_stmt);
7905 /* Set the arguments of the phi node: */
7906 add_phi_arg (induction_phi, vec_init, pe, UNKNOWN_LOCATION);
7907 add_phi_arg (induction_phi, vec_def, loop_latch_edge (iv_loop),
7908 UNKNOWN_LOCATION);
7910 STMT_VINFO_VEC_STMT (stmt_info) = *vec_stmt = induction_phi_info;
7912 /* In case the vectorization factor (VF) is bigger than the number
7913 of elements that we can fit in a vectype (nunits), we have to generate
7914 more than one vector stmt - i.e., we need to "unroll" the
7915 vector stmt by a factor VF/nunits. For more details see documentation
7916 in vectorizable_operation. */
7918 if (ncopies > 1)
7920 gimple_seq seq = NULL;
7921 stmt_vec_info prev_stmt_vinfo;
7922 /* FORNOW. This restriction should be relaxed. */
7923 gcc_assert (!nested_in_vect_loop);
7925 /* Create the vector that holds the step of the induction. */
7926 if (SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr)))
7928 expr = build_int_cst (integer_type_node, nunits);
7929 expr = gimple_build (&seq, FLOAT_EXPR, TREE_TYPE (step_expr), expr);
7931 else
7932 expr = build_int_cst (TREE_TYPE (step_expr), nunits);
7933 new_name = gimple_build (&seq, MULT_EXPR, TREE_TYPE (step_expr),
7934 expr, step_expr);
7935 if (seq)
7937 new_bb = gsi_insert_seq_on_edge_immediate (pe, seq);
7938 gcc_assert (!new_bb);
7941 t = unshare_expr (new_name);
7942 gcc_assert (CONSTANT_CLASS_P (new_name)
7943 || TREE_CODE (new_name) == SSA_NAME);
7944 new_vec = build_vector_from_val (vectype, t);
7945 vec_step = vect_init_vector (stmt_info, new_vec, vectype, NULL);
7947 vec_def = induc_def;
7948 prev_stmt_vinfo = induction_phi_info;
7949 for (i = 1; i < ncopies; i++)
7951 /* vec_i = vec_prev + vec_step */
7952 new_stmt = gimple_build_assign (vec_dest, PLUS_EXPR,
7953 vec_def, vec_step);
7954 vec_def = make_ssa_name (vec_dest, new_stmt);
7955 gimple_assign_set_lhs (new_stmt, vec_def);
7957 gsi_insert_before (&si, new_stmt, GSI_SAME_STMT);
7958 new_stmt_info = loop_vinfo->add_stmt (new_stmt);
7959 STMT_VINFO_RELATED_STMT (prev_stmt_vinfo) = new_stmt_info;
7960 prev_stmt_vinfo = new_stmt_info;
7964 if (nested_in_vect_loop)
7966 /* Find the loop-closed exit-phi of the induction, and record
7967 the final vector of induction results: */
7968 exit_phi = NULL;
7969 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, loop_arg)
7971 gimple *use_stmt = USE_STMT (use_p);
7972 if (is_gimple_debug (use_stmt))
7973 continue;
7975 if (!flow_bb_inside_loop_p (iv_loop, gimple_bb (use_stmt)))
7977 exit_phi = use_stmt;
7978 break;
7981 if (exit_phi)
7983 stmt_vec_info stmt_vinfo = loop_vinfo->lookup_stmt (exit_phi);
7984 /* FORNOW. Currently not supporting the case that an inner-loop induction
7985 is not used in the outer-loop (i.e. only outside the outer-loop). */
7986 gcc_assert (STMT_VINFO_RELEVANT_P (stmt_vinfo)
7987 && !STMT_VINFO_LIVE_P (stmt_vinfo));
7989 STMT_VINFO_VEC_STMT (stmt_vinfo) = new_stmt_info;
7990 if (dump_enabled_p ())
7991 dump_printf_loc (MSG_NOTE, vect_location,
7992 "vector of inductions after inner-loop:%G",
7993 new_stmt);
7998 if (dump_enabled_p ())
7999 dump_printf_loc (MSG_NOTE, vect_location,
8000 "transform induction: created def-use cycle: %G%G",
8001 induction_phi, SSA_NAME_DEF_STMT (vec_def));
8003 return true;
8006 /* Function vectorizable_live_operation.
8008 STMT_INFO computes a value that is used outside the loop. Check if
8009 it can be supported. */
8011 bool
8012 vectorizable_live_operation (stmt_vec_info stmt_info,
8013 gimple_stmt_iterator *gsi ATTRIBUTE_UNUSED,
8014 slp_tree slp_node, int slp_index,
8015 stmt_vec_info *vec_stmt,
8016 stmt_vector_for_cost *)
8018 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
8019 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
8020 imm_use_iterator imm_iter;
8021 tree lhs, lhs_type, bitsize, vec_bitsize;
8022 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
8023 poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (vectype);
8024 int ncopies;
8025 gimple *use_stmt;
8026 auto_vec<tree> vec_oprnds;
8027 int vec_entry = 0;
8028 poly_uint64 vec_index = 0;
8030 gcc_assert (STMT_VINFO_LIVE_P (stmt_info));
8032 if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_reduction_def)
8033 return false;
8035 /* FORNOW. CHECKME. */
8036 if (nested_in_vect_loop_p (loop, stmt_info))
8037 return false;
8039 /* If STMT is not relevant and it is a simple assignment and its inputs are
8040 invariant then it can remain in place, unvectorized. The original last
8041 scalar value that it computes will be used. */
8042 if (!STMT_VINFO_RELEVANT_P (stmt_info))
8044 gcc_assert (is_simple_and_all_uses_invariant (stmt_info, loop_vinfo));
8045 if (dump_enabled_p ())
8046 dump_printf_loc (MSG_NOTE, vect_location,
8047 "statement is simple and uses invariant. Leaving in "
8048 "place.\n");
8049 return true;
8052 if (slp_node)
8053 ncopies = 1;
8054 else
8055 ncopies = vect_get_num_copies (loop_vinfo, vectype);
8057 if (slp_node)
8059 gcc_assert (slp_index >= 0);
8061 int num_scalar = SLP_TREE_SCALAR_STMTS (slp_node).length ();
8062 int num_vec = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node);
8064 /* Get the last occurrence of the scalar index from the concatenation of
8065 all the slp vectors. Calculate which slp vector it is and the index
8066 within. */
8067 poly_uint64 pos = (num_vec * nunits) - num_scalar + slp_index;
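/* For example (illustrative numbers): with num_vec = 2 vectors of
   nunits = 4 lanes holding num_scalar = 6 scalar results, slp_index 5
   gives pos = 2 * 4 - 6 + 5 = 7, i.e. vector 1, lane 3.  */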
8069 /* Calculate which vector contains the result, and which lane of
8070 that vector we need. */
8071 if (!can_div_trunc_p (pos, nunits, &vec_entry, &vec_index))
8073 if (dump_enabled_p ())
8074 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
8075 "Cannot determine which vector holds the"
8076 " final result.\n");
8077 return false;
8081 if (!vec_stmt)
8083 /* No transformation required. */
8084 if (LOOP_VINFO_CAN_FULLY_MASK_P (loop_vinfo))
8086 if (!direct_internal_fn_supported_p (IFN_EXTRACT_LAST, vectype,
8087 OPTIMIZE_FOR_SPEED))
8089 if (dump_enabled_p ())
8090 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
8091 "can't use a fully-masked loop because "
8092 "the target doesn't support extract last "
8093 "reduction.\n");
8094 LOOP_VINFO_CAN_FULLY_MASK_P (loop_vinfo) = false;
8096 else if (slp_node)
8098 if (dump_enabled_p ())
8099 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
8100 "can't use a fully-masked loop because an "
8101 "SLP statement is live after the loop.\n");
8102 LOOP_VINFO_CAN_FULLY_MASK_P (loop_vinfo) = false;
8104 else if (ncopies > 1)
8106 if (dump_enabled_p ())
8107 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
8108 "can't use a fully-masked loop because"
8109 " ncopies is greater than 1.\n");
8110 LOOP_VINFO_CAN_FULLY_MASK_P (loop_vinfo) = false;
8112 else
8114 gcc_assert (ncopies == 1 && !slp_node);
8115 vect_record_loop_mask (loop_vinfo,
8116 &LOOP_VINFO_MASKS (loop_vinfo),
8117 1, vectype);
8120 return true;
8123 /* Use the lhs of the original scalar statement. */
8124 gimple *stmt = vect_orig_stmt (stmt_info)->stmt;
8126 lhs = (is_a <gphi *> (stmt)) ? gimple_phi_result (stmt)
8127 : gimple_get_lhs (stmt);
8128 lhs_type = TREE_TYPE (lhs);
8130 bitsize = (VECTOR_BOOLEAN_TYPE_P (vectype)
8131 ? bitsize_int (TYPE_PRECISION (TREE_TYPE (vectype)))
8132 : TYPE_SIZE (TREE_TYPE (vectype)));
8133 vec_bitsize = TYPE_SIZE (vectype);
8135 /* Get the vectorized lhs of STMT and the lane to use (counted in bits). */
8136 tree vec_lhs, bitstart;
8137 if (slp_node)
8139 gcc_assert (!LOOP_VINFO_FULLY_MASKED_P (loop_vinfo));
8141 /* Get the correct slp vectorized stmt. */
8142 gimple *vec_stmt = SLP_TREE_VEC_STMTS (slp_node)[vec_entry]->stmt;
8143 if (gphi *phi = dyn_cast <gphi *> (vec_stmt))
8144 vec_lhs = gimple_phi_result (phi);
8145 else
8146 vec_lhs = gimple_get_lhs (vec_stmt);
8148 /* Get entry to use. */
8149 bitstart = bitsize_int (vec_index);
8150 bitstart = int_const_binop (MULT_EXPR, bitsize, bitstart);
8152 else
8154 enum vect_def_type dt = STMT_VINFO_DEF_TYPE (stmt_info);
8155 vec_lhs = vect_get_vec_def_for_operand_1 (stmt_info, dt);
8156 gcc_checking_assert (ncopies == 1
8157 || !LOOP_VINFO_FULLY_MASKED_P (loop_vinfo));
8159 /* For multiple copies, get the last copy. */
8160 for (int i = 1; i < ncopies; ++i)
8161 vec_lhs = vect_get_vec_def_for_stmt_copy (loop_vinfo, vec_lhs);
8163 /* Get the last lane in the vector. */
8164 bitstart = int_const_binop (MINUS_EXPR, vec_bitsize, bitsize);
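/* E.g. for a V4SI result, vec_bitsize is 128 and bitsize is 32, so
   bitstart is 96, the first bit of the last lane (illustrative type).  */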
8167 gimple_seq stmts = NULL;
8168 tree new_tree;
8169 if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
8171 /* Emit:
8173 SCALAR_RES = EXTRACT_LAST <VEC_LHS, MASK>
8175 where VEC_LHS is the vectorized live-out result and MASK is
8176 the loop mask for the final iteration. */
8177 gcc_assert (ncopies == 1 && !slp_node);
8178 tree scalar_type = TREE_TYPE (STMT_VINFO_VECTYPE (stmt_info));
8179 tree mask = vect_get_loop_mask (gsi, &LOOP_VINFO_MASKS (loop_vinfo),
8180 1, vectype, 0);
8181 tree scalar_res = gimple_build (&stmts, CFN_EXTRACT_LAST,
8182 scalar_type, mask, vec_lhs);
8184 /* Convert the extracted vector element to the required scalar type. */
8185 new_tree = gimple_convert (&stmts, lhs_type, scalar_res);
8187 else
8189 tree bftype = TREE_TYPE (vectype);
8190 if (VECTOR_BOOLEAN_TYPE_P (vectype))
8191 bftype = build_nonstandard_integer_type (tree_to_uhwi (bitsize), 1);
8192 new_tree = build3 (BIT_FIELD_REF, bftype, vec_lhs, bitsize, bitstart);
8193 new_tree = force_gimple_operand (fold_convert (lhs_type, new_tree),
8194 &stmts, true, NULL_TREE);
8197 if (stmts)
8198 gsi_insert_seq_on_edge_immediate (single_exit (loop), stmts);
8200 /* Replace use of lhs with newly computed result. If the use stmt is a
8201 single arg PHI, just replace all uses of PHI result. It's necessary
8202 because lcssa PHI defining lhs may be before newly inserted stmt. */
8203 use_operand_p use_p;
8204 FOR_EACH_IMM_USE_STMT (use_stmt, imm_iter, lhs)
8205 if (!flow_bb_inside_loop_p (loop, gimple_bb (use_stmt))
8206 && !is_gimple_debug (use_stmt))
8208 if (gimple_code (use_stmt) == GIMPLE_PHI
8209 && gimple_phi_num_args (use_stmt) == 1)
8211 replace_uses_by (gimple_phi_result (use_stmt), new_tree);
8213 else
8215 FOR_EACH_IMM_USE_ON_STMT (use_p, imm_iter)
8216 SET_USE (use_p, new_tree);
8218 update_stmt (use_stmt);
8221 return true;
8224 /* Kill any debug uses outside LOOP of SSA names defined in STMT_INFO. */
8226 static void
8227 vect_loop_kill_debug_uses (struct loop *loop, stmt_vec_info stmt_info)
8229 ssa_op_iter op_iter;
8230 imm_use_iterator imm_iter;
8231 def_operand_p def_p;
8232 gimple *ustmt;
8234 FOR_EACH_PHI_OR_STMT_DEF (def_p, stmt_info->stmt, op_iter, SSA_OP_DEF)
8236 FOR_EACH_IMM_USE_STMT (ustmt, imm_iter, DEF_FROM_PTR (def_p))
8238 basic_block bb;
8240 if (!is_gimple_debug (ustmt))
8241 continue;
8243 bb = gimple_bb (ustmt);
8245 if (!flow_bb_inside_loop_p (loop, bb))
8247 if (gimple_debug_bind_p (ustmt))
8249 if (dump_enabled_p ())
8250 dump_printf_loc (MSG_NOTE, vect_location,
8251 "killing debug use\n");
8253 gimple_debug_bind_reset_value (ustmt);
8254 update_stmt (ustmt);
8256 else
8257 gcc_unreachable ();
8263 /* Given loop represented by LOOP_VINFO, return true if computation of
8264 LOOP_VINFO_NITERS (= LOOP_VINFO_NITERSM1 + 1) doesn't overflow, false
8265 otherwise. */
8267 static bool
8268 loop_niters_no_overflow (loop_vec_info loop_vinfo)
8270 /* Constant case. */
8271 if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
8273 tree cst_niters = LOOP_VINFO_NITERS (loop_vinfo);
8274 tree cst_nitersm1 = LOOP_VINFO_NITERSM1 (loop_vinfo);
8276 gcc_assert (TREE_CODE (cst_niters) == INTEGER_CST);
8277 gcc_assert (TREE_CODE (cst_nitersm1) == INTEGER_CST);
8278 if (wi::to_widest (cst_nitersm1) < wi::to_widest (cst_niters))
8279 return true;
8282 widest_int max;
8283 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
8284 /* Check the upper bound of loop niters. */
8285 if (get_max_loop_iterations (loop, &max))
8287 tree type = TREE_TYPE (LOOP_VINFO_NITERS (loop_vinfo));
8288 signop sgn = TYPE_SIGN (type);
8289 widest_int type_max = widest_int::from (wi::max_value (type), sgn);
8290 if (max < type_max)
8291 return true;
8293 return false;
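/* E.g. if NITERS has a 32-bit unsigned type, a proven bound on the
   latch count that is strictly below 0xffffffff means NITERSM1 + 1
   cannot wrap, so the function returns true (an illustrative case).  */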
8296 /* Return a mask type with half the number of elements as TYPE. */
8298 tree
8299 vect_halve_mask_nunits (tree type)
8301 poly_uint64 nunits = exact_div (TYPE_VECTOR_SUBPARTS (type), 2);
8302 return build_truth_vector_type (nunits, current_vector_size);
8305 /* Return a mask type with twice as many elements as TYPE. */
8307 tree
8308 vect_double_mask_nunits (tree type)
8310 poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (type) * 2;
8311 return build_truth_vector_type (nunits, current_vector_size);
8314 /* Record that a fully-masked version of LOOP_VINFO would need MASKS to
8315 contain a sequence of NVECTORS masks that each control a vector of type
8316 VECTYPE. */
8318 void
8319 vect_record_loop_mask (loop_vec_info loop_vinfo, vec_loop_masks *masks,
8320 unsigned int nvectors, tree vectype)
8322 gcc_assert (nvectors != 0);
8323 if (masks->length () < nvectors)
8324 masks->safe_grow_cleared (nvectors);
8325 rgroup_masks *rgm = &(*masks)[nvectors - 1];
8326 /* The number of scalars per iteration and the number of vectors are
8327 both compile-time constants. */
8328 unsigned int nscalars_per_iter
8329 = exact_div (nvectors * TYPE_VECTOR_SUBPARTS (vectype),
8330 LOOP_VINFO_VECT_FACTOR (loop_vinfo)).to_constant ();
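/* Illustrative numbers: with a vectorization factor of 16 and two
   8-element vectors per iteration, this records rgroup 1 (nvectors - 1)
   with nscalars_per_iter = 2 * 8 / 16 = 1.  */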
8331 if (rgm->max_nscalars_per_iter < nscalars_per_iter)
8333 rgm->max_nscalars_per_iter = nscalars_per_iter;
8334 rgm->mask_type = build_same_sized_truth_vector_type (vectype);
8338 /* Given a complete set of masks MASKS, extract mask number INDEX
8339 for an rgroup that operates on NVECTORS vectors of type VECTYPE,
8340 where 0 <= INDEX < NVECTORS. Insert any set-up statements before GSI.
8342 See the comment above vec_loop_masks for more details about the mask
8343 arrangement. */
8345 tree
8346 vect_get_loop_mask (gimple_stmt_iterator *gsi, vec_loop_masks *masks,
8347 unsigned int nvectors, tree vectype, unsigned int index)
8349 rgroup_masks *rgm = &(*masks)[nvectors - 1];
8350 tree mask_type = rgm->mask_type;
8352 /* Populate the rgroup's mask array, if this is the first time we've
8353 used it. */
8354 if (rgm->masks.is_empty ())
8356 rgm->masks.safe_grow_cleared (nvectors);
8357 for (unsigned int i = 0; i < nvectors; ++i)
8359 tree mask = make_temp_ssa_name (mask_type, NULL, "loop_mask");
8360 /* Provide a dummy definition until the real one is available. */
8361 SSA_NAME_DEF_STMT (mask) = gimple_build_nop ();
8362 rgm->masks[i] = mask;
8366 tree mask = rgm->masks[index];
8367 if (maybe_ne (TYPE_VECTOR_SUBPARTS (mask_type),
8368 TYPE_VECTOR_SUBPARTS (vectype)))
8370 /* A loop mask for data type X can be reused for data type Y
8371 if X has N times more elements than Y and if Y's elements
8372 are N times bigger than X's. In this case each sequence
8373 of N elements in the loop mask will be all-zero or all-one.
8374 We can then view-convert the mask so that each sequence of
8375 N elements is replaced by a single element. */
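/* E.g. (illustrative types) a mask created for 8-element short vectors
   can be reused for 4-element int vectors: each adjacent pair of mask
   elements is known to be all-zero or all-one and view-converts to a
   single element of the wider mask.  */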
8376 gcc_assert (multiple_p (TYPE_VECTOR_SUBPARTS (mask_type),
8377 TYPE_VECTOR_SUBPARTS (vectype)));
8378 gimple_seq seq = NULL;
8379 mask_type = build_same_sized_truth_vector_type (vectype);
8380 mask = gimple_build (&seq, VIEW_CONVERT_EXPR, mask_type, mask);
8381 if (seq)
8382 gsi_insert_seq_before (gsi, seq, GSI_SAME_STMT);
8384 return mask;
8387 /* Scale profiling counters by estimation for LOOP which is vectorized
8388 by factor VF. */
8390 static void
8391 scale_profile_for_vect_loop (struct loop *loop, unsigned vf)
8393 edge preheader = loop_preheader_edge (loop);
8394 /* Reduce loop iterations by the vectorization factor. */
8395 gcov_type new_est_niter = niter_for_unrolled_loop (loop, vf);
8396 profile_count freq_h = loop->header->count, freq_e = preheader->count ();
8398 if (freq_h.nonzero_p ())
8400 profile_probability p;
8402 /* Avoid dropping loop body profile counter to 0 because of zero count
8403 in loop's preheader. */
8404 if (!(freq_e == profile_count::zero ()))
8405 freq_e = freq_e.force_nonzero ();
8406 p = freq_e.apply_scale (new_est_niter + 1, 1).probability_in (freq_h);
8407 scale_loop_frequencies (loop, p);
8410 edge exit_e = single_exit (loop);
8411 exit_e->probability = profile_probability::always ()
8412 .apply_scale (1, new_est_niter + 1);
8414 edge exit_l = single_pred_edge (loop->latch);
8415 profile_probability prob = exit_l->probability;
8416 exit_l->probability = exit_e->probability.invert ();
8417 if (prob.initialized_p () && exit_l->probability.initialized_p ())
8418 scale_bbs_frequencies (&loop->latch, 1, exit_l->probability / prob);
8421 /* Vectorize STMT_INFO if relevant, inserting any new instructions before GSI.
8422 When vectorizing STMT_INFO as a store, set *SEEN_STORE to its
8423 stmt_vec_info. */
8425 static void
8426 vect_transform_loop_stmt (loop_vec_info loop_vinfo, stmt_vec_info stmt_info,
8427 gimple_stmt_iterator *gsi, stmt_vec_info *seen_store)
8429 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
8430 poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
8432 if (dump_enabled_p ())
8433 dump_printf_loc (MSG_NOTE, vect_location,
8434 "------>vectorizing statement: %G", stmt_info->stmt);
8436 if (MAY_HAVE_DEBUG_BIND_STMTS && !STMT_VINFO_LIVE_P (stmt_info))
8437 vect_loop_kill_debug_uses (loop, stmt_info);
8439 if (!STMT_VINFO_RELEVANT_P (stmt_info)
8440 && !STMT_VINFO_LIVE_P (stmt_info))
8441 return;
8443 if (STMT_VINFO_VECTYPE (stmt_info))
8445 poly_uint64 nunits
8446 = TYPE_VECTOR_SUBPARTS (STMT_VINFO_VECTYPE (stmt_info));
8447 if (!STMT_SLP_TYPE (stmt_info)
8448 && maybe_ne (nunits, vf)
8449 && dump_enabled_p ())
8450 /* For SLP, VF is set according to the unrolling factor, and not
8451 to the vector size, hence for SLP this print is not valid. */
8452 dump_printf_loc (MSG_NOTE, vect_location, "multiple-types.\n");
8455 /* Pure SLP statements have already been vectorized. We still need
8456 to apply loop vectorization to hybrid SLP statements. */
8457 if (PURE_SLP_STMT (stmt_info))
8458 return;
8460 if (dump_enabled_p ())
8461 dump_printf_loc (MSG_NOTE, vect_location, "transform statement.\n");
8463 if (vect_transform_stmt (stmt_info, gsi, NULL, NULL))
8464 *seen_store = stmt_info;
8467 /* Function vect_transform_loop.
8469 The analysis phase has determined that the loop is vectorizable.
8470 Vectorize the loop - created vectorized stmts to replace the scalar
8471 stmts in the loop, and update the loop exit condition.
8472 Returns scalar epilogue loop if any. */
8474 struct loop *
8475 vect_transform_loop (loop_vec_info loop_vinfo)
8477 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
8478 struct loop *epilogue = NULL;
8479 basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
8480 int nbbs = loop->num_nodes;
8481 int i;
8482 tree niters_vector = NULL_TREE;
8483 tree step_vector = NULL_TREE;
8484 tree niters_vector_mult_vf = NULL_TREE;
8485 poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
8486 unsigned int lowest_vf = constant_lower_bound (vf);
8487 gimple *stmt;
8488 bool check_profitability = false;
8489 unsigned int th;
8491 DUMP_VECT_SCOPE ("vec_transform_loop");
8493 loop_vinfo->shared->check_datarefs ();
8495 /* Use the more conservative vectorization threshold. If the number
8496 of iterations is constant assume the cost check has been performed
8497 by our caller. If the threshold makes all loops profitable that
8498 run at least the (estimated) vectorization factor number of times
8499 checking is pointless, too. */
8500 th = LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo);
8501 if (th >= vect_vf_for_cost (loop_vinfo)
8502 && !LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
8504 if (dump_enabled_p ())
8505 dump_printf_loc (MSG_NOTE, vect_location,
8506 "Profitability threshold is %d loop iterations.\n",
8507 th);
8508 check_profitability = true;
8511 /* Make sure there exists a single-predecessor exit bb. Do this before
8512 versioning. */
8513 edge e = single_exit (loop);
8514 if (! single_pred_p (e->dest))
8516 split_loop_exit_edge (e, true);
8517 if (dump_enabled_p ())
8518 dump_printf (MSG_NOTE, "split exit edge\n");
8521 /* Version the loop first, if required, so the profitability check
8522 comes first. */
8524 if (LOOP_REQUIRES_VERSIONING (loop_vinfo))
8526 poly_uint64 versioning_threshold
8527 = LOOP_VINFO_VERSIONING_THRESHOLD (loop_vinfo);
8528 if (check_profitability
8529 && ordered_p (poly_uint64 (th), versioning_threshold))
8531 versioning_threshold = ordered_max (poly_uint64 (th),
8532 versioning_threshold);
8533 check_profitability = false;
8535 struct loop *sloop
8536 = vect_loop_versioning (loop_vinfo, th, check_profitability,
8537 versioning_threshold);
8538 sloop->force_vectorize = false;
8539 check_profitability = false;
8542 /* Make sure there exists a single-predecessor exit bb also on the
8543 scalar loop copy. Do this after versioning but before peeling
8544 so CFG structure is fine for both scalar and if-converted loop
8545 to make slpeel_duplicate_current_defs_from_edges face matched
8546 loop closed PHI nodes on the exit. */
8547 if (LOOP_VINFO_SCALAR_LOOP (loop_vinfo))
8549 e = single_exit (LOOP_VINFO_SCALAR_LOOP (loop_vinfo));
8550 if (! single_pred_p (e->dest))
8552 split_loop_exit_edge (e, true);
8553 if (dump_enabled_p ())
8554 dump_printf (MSG_NOTE, "split exit edge of scalar loop\n");
8558 tree niters = vect_build_loop_niters (loop_vinfo);
8559 LOOP_VINFO_NITERS_UNCHANGED (loop_vinfo) = niters;
8560 tree nitersm1 = unshare_expr (LOOP_VINFO_NITERSM1 (loop_vinfo));
8561 bool niters_no_overflow = loop_niters_no_overflow (loop_vinfo);
8562 epilogue = vect_do_peeling (loop_vinfo, niters, nitersm1, &niters_vector,
8563 &step_vector, &niters_vector_mult_vf, th,
8564 check_profitability, niters_no_overflow);
8566 if (niters_vector == NULL_TREE)
8568 if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
8569 && !LOOP_VINFO_FULLY_MASKED_P (loop_vinfo)
8570 && known_eq (lowest_vf, vf))
8572 niters_vector
8573 = build_int_cst (TREE_TYPE (LOOP_VINFO_NITERS (loop_vinfo)),
8574 LOOP_VINFO_INT_NITERS (loop_vinfo) / lowest_vf);
8575 step_vector = build_one_cst (TREE_TYPE (niters));
8577 else
8578 vect_gen_vector_loop_niters (loop_vinfo, niters, &niters_vector,
8579 &step_vector, niters_no_overflow);
8582 /* 1) Make sure the loop header has exactly two entries
8583 2) Make sure we have a preheader basic block. */
8585 gcc_assert (EDGE_COUNT (loop->header->preds) == 2);
8587 split_edge (loop_preheader_edge (loop));
8589 if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo)
8590 && vect_use_loop_mask_for_alignment_p (loop_vinfo))
8591 /* This will deal with any possible peeling. */
8592 vect_prepare_for_masked_peels (loop_vinfo);
8594 /* Schedule the SLP instances first, then handle loop vectorization
8595 below. */
8596 if (!loop_vinfo->slp_instances.is_empty ())
8598 DUMP_VECT_SCOPE ("scheduling SLP instances");
8599 vect_schedule_slp (loop_vinfo);
8602 /* FORNOW: the vectorizer supports only loops whose body consists
8603 of one basic block (header + empty latch). When the vectorizer
8604 supports more involved loop forms, the order in which the BBs are
8605 traversed will need to be reconsidered. */
8607 for (i = 0; i < nbbs; i++)
8609 basic_block bb = bbs[i];
8610 stmt_vec_info stmt_info;
8612 for (gphi_iterator si = gsi_start_phis (bb); !gsi_end_p (si);
8613 gsi_next (&si))
8615 gphi *phi = si.phi ();
8616 if (dump_enabled_p ())
8617 dump_printf_loc (MSG_NOTE, vect_location,
8618 "------>vectorizing phi: %G", phi);
8619 stmt_info = loop_vinfo->lookup_stmt (phi);
8620 if (!stmt_info)
8621 continue;
8623 if (MAY_HAVE_DEBUG_BIND_STMTS && !STMT_VINFO_LIVE_P (stmt_info))
8624 vect_loop_kill_debug_uses (loop, stmt_info);
8626 if (!STMT_VINFO_RELEVANT_P (stmt_info)
8627 && !STMT_VINFO_LIVE_P (stmt_info))
8628 continue;
8630 if (STMT_VINFO_VECTYPE (stmt_info)
8631 && (maybe_ne
8632 (TYPE_VECTOR_SUBPARTS (STMT_VINFO_VECTYPE (stmt_info)), vf))
8633 && dump_enabled_p ())
8634 dump_printf_loc (MSG_NOTE, vect_location, "multiple-types.\n");
8636 if ((STMT_VINFO_DEF_TYPE (stmt_info) == vect_induction_def
8637 || STMT_VINFO_DEF_TYPE (stmt_info) == vect_reduction_def
8638 || STMT_VINFO_DEF_TYPE (stmt_info) == vect_nested_cycle)
8639 && ! PURE_SLP_STMT (stmt_info))
8641 if (dump_enabled_p ())
8642 dump_printf_loc (MSG_NOTE, vect_location, "transform phi.\n");
8643 vect_transform_stmt (stmt_info, NULL, NULL, NULL);
8647 for (gimple_stmt_iterator si = gsi_start_bb (bb);
8648 !gsi_end_p (si);)
8650 stmt = gsi_stmt (si);
8651 /* During vectorization remove existing clobber stmts. */
8652 if (gimple_clobber_p (stmt))
8654 unlink_stmt_vdef (stmt);
8655 gsi_remove (&si, true);
8656 release_defs (stmt);
8658 else
8660 stmt_info = loop_vinfo->lookup_stmt (stmt);
8662 /* vector stmts created in the outer-loop during vectorization of
8663 stmts in an inner-loop may not have a stmt_info, and do not
8664 need to be vectorized. */
8665 stmt_vec_info seen_store = NULL;
8666 if (stmt_info)
8668 if (STMT_VINFO_IN_PATTERN_P (stmt_info))
8670 gimple *def_seq = STMT_VINFO_PATTERN_DEF_SEQ (stmt_info);
8671 for (gimple_stmt_iterator subsi = gsi_start (def_seq);
8672 !gsi_end_p (subsi); gsi_next (&subsi))
8674 stmt_vec_info pat_stmt_info
8675 = loop_vinfo->lookup_stmt (gsi_stmt (subsi));
8676 vect_transform_loop_stmt (loop_vinfo, pat_stmt_info,
8677 &si, &seen_store);
8679 stmt_vec_info pat_stmt_info
8680 = STMT_VINFO_RELATED_STMT (stmt_info);
8681 vect_transform_loop_stmt (loop_vinfo, pat_stmt_info, &si,
8682 &seen_store);
8684 vect_transform_loop_stmt (loop_vinfo, stmt_info, &si,
8685 &seen_store);
8687 gsi_next (&si);
8688 if (seen_store)
8690 if (STMT_VINFO_GROUPED_ACCESS (seen_store))
8691 /* Interleaving. The vectorization of the
8692 interleaving chain has been completed;
8693 free all the stores in the chain. */
8694 vect_remove_stores (DR_GROUP_FIRST_ELEMENT (seen_store));
8695 else
8696 /* Free the attached stmt_vec_info and remove the stmt. */
8697 loop_vinfo->remove_stmt (stmt_info);
8702 /* Stub out scalar statements that must not survive vectorization.
8703 Doing this here helps with grouped statements, or statements that
8704 are involved in patterns. */
8705 for (gimple_stmt_iterator gsi = gsi_start_bb (bb);
8706 !gsi_end_p (gsi); gsi_next (&gsi))
8708 gcall *call = dyn_cast <gcall *> (gsi_stmt (gsi));
8709 if (call && gimple_call_internal_p (call, IFN_MASK_LOAD))
8711 tree lhs = gimple_get_lhs (call);
8712 if (!VECTOR_TYPE_P (TREE_TYPE (lhs)))
8714 tree zero = build_zero_cst (TREE_TYPE (lhs));
8715 gimple *new_stmt = gimple_build_assign (lhs, zero);
8716 gsi_replace (&gsi, new_stmt, true);
8720 } /* BBs in loop */
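/* A made-up sketch of the stubbing above: a scalar .MASK_LOAD whose result
   is no longer needed after vectorization, e.g.

       _5 = .MASK_LOAD (p_3, 32B, mask_4);

   is replaced by a plain zero assignment so that no scalar (potentially
   faulting) masked access survives:

       _5 = 0;  */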
8722 /* The vectorization factor is always > 1, so if we use an IV increment of 1,
8723 a zero NITERS becomes a nonzero NITERS_VECTOR. */
8724 if (integer_onep (step_vector))
8725 niters_no_overflow = true;
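/* For example (purely illustrative): if a 32-bit NITERS wrapped to 0
   because the loop actually runs 2^32 times, then with VF == 8 the value
   of NITERS_VECTOR is 2^32 / 8 == 2^29, which is nonzero and representable,
   so an IV stepping by 1 up to NITERS_VECTOR cannot overflow.  */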
8726 vect_set_loop_condition (loop, loop_vinfo, niters_vector, step_vector,
8727 niters_vector_mult_vf, !niters_no_overflow);
8729 unsigned int assumed_vf = vect_vf_for_cost (loop_vinfo);
8730 scale_profile_for_vect_loop (loop, assumed_vf);
8732 /* True if the final iteration might not handle a full vector's
8733 worth of scalar iterations. */
8734 bool final_iter_may_be_partial = LOOP_VINFO_FULLY_MASKED_P (loop_vinfo);
8735 /* The minimum number of iterations performed by the epilogue. This
8736 is 1 when peeling for gaps because we always need a final scalar
8737 iteration. */
8738 int min_epilogue_iters = LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo) ? 1 : 0;
8739 /* +1 to convert latch counts to loop iteration counts,
8740 -min_epilogue_iters to remove iterations that cannot be performed
8741 by the vector code. */
8742 int bias_for_lowest = 1 - min_epilogue_iters;
8743 int bias_for_assumed = bias_for_lowest;
8744 int alignment_npeels = LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo);
8745 if (alignment_npeels && LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
8747 /* When the amount of peeling is known at compile time, the first
8748 iteration will have exactly alignment_npeels active elements.
8749 In the worst case it will have at least one. */
8750 int min_first_active = (alignment_npeels > 0 ? alignment_npeels : 1);
8751 bias_for_lowest += lowest_vf - min_first_active;
8752 bias_for_assumed += assumed_vf - min_first_active;
8754 /* In these calculations the "- 1" converts loop iteration counts
8755 back to latch counts. */
8756 if (loop->any_upper_bound)
8757 loop->nb_iterations_upper_bound
8758 = (final_iter_may_be_partial
8759 ? wi::udiv_ceil (loop->nb_iterations_upper_bound + bias_for_lowest,
8760 lowest_vf) - 1
8761 : wi::udiv_floor (loop->nb_iterations_upper_bound + bias_for_lowest,
8762 lowest_vf) - 1);
8763 if (loop->any_likely_upper_bound)
8764 loop->nb_iterations_likely_upper_bound
8765 = (final_iter_may_be_partial
8766 ? wi::udiv_ceil (loop->nb_iterations_likely_upper_bound
8767 + bias_for_lowest, lowest_vf) - 1
8768 : wi::udiv_floor (loop->nb_iterations_likely_upper_bound
8769 + bias_for_lowest, lowest_vf) - 1);
8770 if (loop->any_estimate)
8771 loop->nb_iterations_estimate
8772 = (final_iter_may_be_partial
8773 ? wi::udiv_ceil (loop->nb_iterations_estimate + bias_for_assumed,
8774 assumed_vf) - 1
8775 : wi::udiv_floor (loop->nb_iterations_estimate + bias_for_assumed,
8776 assumed_vf) - 1);
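/* A made-up example of the scaling above: with no peeling for gaps and no
   fully-masked loop, min_epilogue_iters == 0 and bias_for_lowest == 1.
   If lowest_vf == 4 and the original latch-count upper bound is 11
   (i.e. at most 12 iterations), the new bound is
   floor ((11 + 1) / 4) - 1 == 2, i.e. at most 3 vector iterations.  */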
8778 if (dump_enabled_p ())
8780 if (!LOOP_VINFO_EPILOGUE_P (loop_vinfo))
8782 dump_printf_loc (MSG_NOTE, vect_location,
8783 "LOOP VECTORIZED\n");
8784 if (loop->inner)
8785 dump_printf_loc (MSG_NOTE, vect_location,
8786 "OUTER LOOP VECTORIZED\n");
8787 dump_printf (MSG_NOTE, "\n");
8789 else
8791 dump_printf_loc (MSG_NOTE, vect_location,
8792 "LOOP EPILOGUE VECTORIZED (VS=");
8793 dump_dec (MSG_NOTE, current_vector_size);
8794 dump_printf (MSG_NOTE, ")\n");
8798 /* Loops vectorized with a variable factor won't benefit from
8799 unrolling/peeling. */
8800 if (!vf.is_constant ())
8802 loop->unroll = 1;
8803 if (dump_enabled_p ())
8804 dump_printf_loc (MSG_NOTE, vect_location, "Disabling unrolling due to"
8805 " variable-length vectorization factor\n");
8807 /* Free SLP instances here because otherwise stmt reference counting
8808 won't work. */
8809 slp_instance instance;
8810 FOR_EACH_VEC_ELT (LOOP_VINFO_SLP_INSTANCES (loop_vinfo), i, instance)
8811 vect_free_slp_instance (instance, true);
8812 LOOP_VINFO_SLP_INSTANCES (loop_vinfo).release ();
8813 /* Clear the safelen field since its value is invalid after vectorization,
8814 as the vectorized loop can have loop-carried dependencies. */
8815 loop->safelen = 0;
8817 /* Don't vectorize the epilogue of an epilogue loop. */
8818 if (LOOP_VINFO_EPILOGUE_P (loop_vinfo))
8819 epilogue = NULL;
8821 if (!PARAM_VALUE (PARAM_VECT_EPILOGUES_NOMASK))
8822 epilogue = NULL;
8824 if (epilogue)
8826 auto_vector_sizes vector_sizes;
8827 targetm.vectorize.autovectorize_vector_sizes (&vector_sizes, false);
8828 unsigned int next_size = 0;
8830 /* Note LOOP_VINFO_NITERS_KNOWN_P and LOOP_VINFO_INT_NITERS work
8831 on niters already adjusted for the iterations of the prologue. */
8832 if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
8833 && known_eq (vf, lowest_vf))
8835 unsigned HOST_WIDE_INT eiters
8836 = (LOOP_VINFO_INT_NITERS (loop_vinfo)
8837 - LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo));
8838 eiters
8839 = eiters % lowest_vf + LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo);
8840 epilogue->nb_iterations_upper_bound = eiters - 1;
8841 epilogue->any_upper_bound = true;
8843 unsigned int ratio;
8844 while (next_size < vector_sizes.length ()
8845 && !(constant_multiple_p (current_vector_size,
8846 vector_sizes[next_size], &ratio)
8847 && eiters >= lowest_vf / ratio))
8848 next_size += 1;
8850 else
8851 while (next_size < vector_sizes.length ()
8852 && maybe_lt (current_vector_size, vector_sizes[next_size]))
8853 next_size += 1;
8855 if (next_size == vector_sizes.length ())
8856 epilogue = NULL;
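/* An illustrative walk-through with invented numbers: suppose NITERS is a
   known 101, lowest_vf == 16 and there is no peeling for gaps, so
   eiters == 101 % 16 == 5.  On a target offering 64-, 32- and 16-byte
   vectors with current_vector_size == 64, the 64- and 32-byte sizes are
   skipped (they would need at least 16 and 16 / 2 == 8 epilogue iterations
   respectively) and the 16-byte size qualifies because 5 >= 16 / 4.  If no
   smaller size qualifies, EPILOGUE is cleared and the epilogue stays
   scalar.  */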
8859 if (epilogue)
8861 epilogue->force_vectorize = loop->force_vectorize;
8862 epilogue->safelen = loop->safelen;
8863 epilogue->dont_vectorize = false;
8865 /* We may need to if-convert the epilogue to vectorize it. */
8866 if (LOOP_VINFO_SCALAR_LOOP (loop_vinfo))
8867 tree_if_conversion (epilogue);
8870 return epilogue;
8873 /* The code below performs a simple optimization: it reverts
8874 if-conversion for masked stores, i.e. if the mask of a store is zero,
8875 the store is skipped and, if possible, so are the producers of the stored values.
8876 For example,
8877 for (i=0; i<n; i++)
8878 if (c[i])
8880 p1[i] += 1;
8881 p2[i] = p3[i] +2;
8883 this transformation will produce the following semi-hammock:
8885 if (!mask__ifc__42.18_165 == { 0, 0, 0, 0, 0, 0, 0, 0 })
8887 vect__11.19_170 = MASK_LOAD (vectp_p1.20_168, 0B, mask__ifc__42.18_165);
8888 vect__12.22_172 = vect__11.19_170 + vect_cst__171;
8889 MASK_STORE (vectp_p1.23_175, 0B, mask__ifc__42.18_165, vect__12.22_172);
8890 vect__18.25_182 = MASK_LOAD (vectp_p3.26_180, 0B, mask__ifc__42.18_165);
8891 vect__19.28_184 = vect__18.25_182 + vect_cst__183;
8892 MASK_STORE (vectp_p2.29_187, 0B, mask__ifc__42.18_165, vect__19.28_184);
8896 void
8897 optimize_mask_stores (struct loop *loop)
8899 basic_block *bbs = get_loop_body (loop);
8900 unsigned nbbs = loop->num_nodes;
8901 unsigned i;
8902 basic_block bb;
8903 struct loop *bb_loop;
8904 gimple_stmt_iterator gsi;
8905 gimple *stmt;
8906 auto_vec<gimple *> worklist;
8907 auto_purge_vect_location sentinel;
8909 vect_location = find_loop_location (loop);
8910 /* Pick up all masked stores in loop if any. */
8911 for (i = 0; i < nbbs; i++)
8913 bb = bbs[i];
8914 for (gsi = gsi_start_bb (bb); !gsi_end_p (gsi);
8915 gsi_next (&gsi))
8917 stmt = gsi_stmt (gsi);
8918 if (gimple_call_internal_p (stmt, IFN_MASK_STORE))
8919 worklist.safe_push (stmt);
8923 free (bbs);
8924 if (worklist.is_empty ())
8925 return;
8927 /* Loop has masked stores. */
8928 while (!worklist.is_empty ())
8930 gimple *last, *last_store;
8931 edge e, efalse;
8932 tree mask;
8933 basic_block store_bb, join_bb;
8934 gimple_stmt_iterator gsi_to;
8935 tree vdef, new_vdef;
8936 gphi *phi;
8937 tree vectype;
8938 tree zero;
8940 last = worklist.pop ();
8941 mask = gimple_call_arg (last, 2);
8942 bb = gimple_bb (last);
8943 /* Create then_bb and the if-then structure in the CFG; then_bb belongs
8944 to the same loop as if_bb. This may differ from LOOP when a two-level
8945 loop nest is vectorized and the mask_store belongs to the inner
8946 one. */
8947 e = split_block (bb, last);
8948 bb_loop = bb->loop_father;
8949 gcc_assert (loop == bb_loop || flow_loop_nested_p (loop, bb_loop));
8950 join_bb = e->dest;
8951 store_bb = create_empty_bb (bb);
8952 add_bb_to_loop (store_bb, bb_loop);
8953 e->flags = EDGE_TRUE_VALUE;
8954 efalse = make_edge (bb, store_bb, EDGE_FALSE_VALUE);
8955 /* Put STORE_BB to likely part. */
8956 efalse->probability = profile_probability::unlikely ();
8957 store_bb->count = efalse->count ();
8958 make_single_succ_edge (store_bb, join_bb, EDGE_FALLTHRU);
8959 if (dom_info_available_p (CDI_DOMINATORS))
8960 set_immediate_dominator (CDI_DOMINATORS, store_bb, bb);
8961 if (dump_enabled_p ())
8962 dump_printf_loc (MSG_NOTE, vect_location,
8963 "Create new block %d to sink mask stores.",
8964 store_bb->index);
8965 /* Create vector comparison with boolean result. */
8966 vectype = TREE_TYPE (mask);
8967 zero = build_zero_cst (vectype);
8968 stmt = gimple_build_cond (EQ_EXPR, mask, zero, NULL_TREE, NULL_TREE);
8969 gsi = gsi_last_bb (bb);
8970 gsi_insert_after (&gsi, stmt, GSI_SAME_STMT);
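/* Illustratively (SSA names invented), the guard built above looks like

       if (mask__ifc_42_165 == { 0, ..., 0 })
         goto join_bb;   <-- EDGE_TRUE_VALUE: all lanes inactive, skip stores
       else
         goto store_bb;  <-- EDGE_FALSE_VALUE: execute the masked stores  */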
8971 /* Create new PHI node for vdef of the last masked store:
8972 .MEM_2 = VDEF <.MEM_1>
8973 will be converted to
8974 .MEM.3 = VDEF <.MEM_1>
8975 and new PHI node will be created in join bb
8976 .MEM_2 = PHI <.MEM_1, .MEM_3>
8978 vdef = gimple_vdef (last);
8979 new_vdef = make_ssa_name (gimple_vop (cfun), last);
8980 gimple_set_vdef (last, new_vdef);
8981 phi = create_phi_node (vdef, join_bb);
8982 add_phi_arg (phi, new_vdef, EDGE_SUCC (store_bb, 0), UNKNOWN_LOCATION);
8984 /* Put all masked stores with the same mask to STORE_BB if possible. */
8985 while (true)
8987 gimple_stmt_iterator gsi_from;
8988 gimple *stmt1 = NULL;
8990 /* Move masked store to STORE_BB. */
8991 last_store = last;
8992 gsi = gsi_for_stmt (last);
8993 gsi_from = gsi;
8994 /* Shift GSI to the previous stmt for further traversal. */
8995 gsi_prev (&gsi);
8996 gsi_to = gsi_start_bb (store_bb);
8997 gsi_move_before (&gsi_from, &gsi_to);
8998 /* Setup GSI_TO to the non-empty block start. */
8999 gsi_to = gsi_start_bb (store_bb);
9000 if (dump_enabled_p ())
9001 dump_printf_loc (MSG_NOTE, vect_location,
9002 "Move stmt to created bb\n%G", last);
9003 /* Move all stored value producers if possible. */
9004 while (!gsi_end_p (gsi))
9006 tree lhs;
9007 imm_use_iterator imm_iter;
9008 use_operand_p use_p;
9009 bool res;
9011 /* Skip debug statements. */
9012 if (is_gimple_debug (gsi_stmt (gsi)))
9014 gsi_prev (&gsi);
9015 continue;
9017 stmt1 = gsi_stmt (gsi);
9018 /* Do not consider statements writing to memory or having
9019 volatile operands. */
9020 if (gimple_vdef (stmt1)
9021 || gimple_has_volatile_ops (stmt1))
9022 break;
9023 gsi_from = gsi;
9024 gsi_prev (&gsi);
9025 lhs = gimple_get_lhs (stmt1);
9026 if (!lhs)
9027 break;
9029 /* LHS of vectorized stmt must be SSA_NAME. */
9030 if (TREE_CODE (lhs) != SSA_NAME)
9031 break;
9033 if (!VECTOR_TYPE_P (TREE_TYPE (lhs)))
9035 /* Remove dead scalar statement. */
9036 if (has_zero_uses (lhs))
9038 gsi_remove (&gsi_from, true);
9039 continue;
9043 /* Check that LHS does not have uses outside of STORE_BB. */
9044 res = true;
9045 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, lhs)
9047 gimple *use_stmt;
9048 use_stmt = USE_STMT (use_p);
9049 if (is_gimple_debug (use_stmt))
9050 continue;
9051 if (gimple_bb (use_stmt) != store_bb)
9053 res = false;
9054 break;
9057 if (!res)
9058 break;
9060 if (gimple_vuse (stmt1)
9061 && gimple_vuse (stmt1) != gimple_vuse (last_store))
9062 break;
9064 /* Can move STMT1 to STORE_BB. */
9065 if (dump_enabled_p ())
9066 dump_printf_loc (MSG_NOTE, vect_location,
9067 "Move stmt to created bb\n%G", stmt1);
9068 gsi_move_before (&gsi_from, &gsi_to);
9069 /* Shift GSI_TO for further insertion. */
9070 gsi_prev (&gsi_to);
9072 /* Put other masked stores with the same mask to STORE_BB. */
9073 if (worklist.is_empty ()
9074 || gimple_call_arg (worklist.last (), 2) != mask
9075 || worklist.last () != stmt1)
9076 break;
9077 last = worklist.pop ();
9079 add_phi_arg (phi, gimple_vuse (last_store), e, UNKNOWN_LOCATION);
9083 /* Decide whether it is possible to use a zero-based induction variable
9084 when vectorizing LOOP_VINFO with a fully-masked loop. If it is,
9085 return the value that the induction variable must be able to hold
9086 in order to ensure that the loop ends with an all-false mask.
9087 Return -1 otherwise. */
9088 widest_int
9089 vect_iv_limit_for_full_masking (loop_vec_info loop_vinfo)
9091 tree niters_skip = LOOP_VINFO_MASK_SKIP_NITERS (loop_vinfo);
9092 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
9093 unsigned HOST_WIDE_INT max_vf = vect_max_vf (loop_vinfo);
9095 /* Calculate the value that the induction variable must be able
9096 to hit in order to ensure that we end the loop with an all-false mask.
9097 This involves adding the maximum number of inactive trailing scalar
9098 iterations. */
9099 widest_int iv_limit = -1;
9100 if (max_loop_iterations (loop, &iv_limit))
9102 if (niters_skip)
9104 /* Add the maximum number of skipped iterations to the
9105 maximum iteration count. */
9106 if (TREE_CODE (niters_skip) == INTEGER_CST)
9107 iv_limit += wi::to_widest (niters_skip);
9108 else
9109 iv_limit += max_vf - 1;
9111 else if (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo))
9112 /* Make a conservatively-correct assumption. */
9113 iv_limit += max_vf - 1;
9115 /* IV_LIMIT is the maximum number of latch iterations, which is also
9116 the maximum in-range IV value. Round this value down to the previous
9117 vector alignment boundary and then add an extra full iteration. */
9118 poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
9119 iv_limit = (iv_limit & -(int) known_alignment (vf)) + max_vf;
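/* A made-up numeric example: with a constant VF of 8, max_vf == 8 and a
   maximum latch count of 17 (and no skipped or peeled iterations added
   above), IV_LIMIT becomes (17 & -8) + 8 == 16 + 8 == 24.  */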
9121 return iv_limit;