gcc/tree-vect-loop.c
1 /* Loop Vectorization
2 Copyright (C) 2003-2018 Free Software Foundation, Inc.
3 Contributed by Dorit Naishlos <dorit@il.ibm.com> and
4 Ira Rosen <irar@il.ibm.com>
6 This file is part of GCC.
8 GCC is free software; you can redistribute it and/or modify it under
9 the terms of the GNU General Public License as published by the Free
10 Software Foundation; either version 3, or (at your option) any later
11 version.
13 GCC is distributed in the hope that it will be useful, but WITHOUT ANY
14 WARRANTY; without even the implied warranty of MERCHANTABILITY or
15 FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
16 for more details.
18 You should have received a copy of the GNU General Public License
19 along with GCC; see the file COPYING3. If not see
20 <http://www.gnu.org/licenses/>. */
22 #include "config.h"
23 #include "system.h"
24 #include "coretypes.h"
25 #include "backend.h"
26 #include "target.h"
27 #include "rtl.h"
28 #include "tree.h"
29 #include "gimple.h"
30 #include "cfghooks.h"
31 #include "tree-pass.h"
32 #include "ssa.h"
33 #include "optabs-tree.h"
34 #include "diagnostic-core.h"
35 #include "fold-const.h"
36 #include "stor-layout.h"
37 #include "cfganal.h"
38 #include "gimplify.h"
39 #include "gimple-iterator.h"
40 #include "gimplify-me.h"
41 #include "tree-ssa-loop-ivopts.h"
42 #include "tree-ssa-loop-manip.h"
43 #include "tree-ssa-loop-niter.h"
44 #include "tree-ssa-loop.h"
45 #include "cfgloop.h"
46 #include "params.h"
47 #include "tree-scalar-evolution.h"
48 #include "tree-vectorizer.h"
49 #include "gimple-fold.h"
50 #include "cgraph.h"
51 #include "tree-cfg.h"
52 #include "tree-if-conv.h"
53 #include "internal-fn.h"
54 #include "tree-vector-builder.h"
55 #include "vec-perm-indices.h"
56 #include "tree-eh.h"
58 /* Loop Vectorization Pass.
60 This pass tries to vectorize loops.
62 For example, the vectorizer transforms the following simple loop:
64 short a[N]; short b[N]; short c[N]; int i;
66 for (i=0; i<N; i++){
67 a[i] = b[i] + c[i];
70 as if it was manually vectorized by rewriting the source code into:
72 typedef int __attribute__((mode(V8HI))) v8hi;
73 short a[N]; short b[N]; short c[N]; int i;
74 v8hi *pa = (v8hi*)a, *pb = (v8hi*)b, *pc = (v8hi*)c;
75 v8hi va, vb, vc;
77 for (i=0; i<N/8; i++){
78 vb = pb[i];
79 vc = pc[i];
80 va = vb + vc;
81 pa[i] = va;
84 The main entry to this pass is vectorize_loops(), in which
85 the vectorizer applies a set of analyses on a given set of loops,
86 followed by the actual vectorization transformation for the loops that
87 had successfully passed the analysis phase.
88 Throughout this pass we make a distinction between two types of
89 data: scalars (which are represented by SSA_NAMES), and memory references
90 ("data-refs"). These two types of data require different handling both
91 during analysis and transformation. The types of data-refs that the
92 vectorizer currently supports are ARRAY_REFS whose base is an array DECL
93 (not a pointer), and INDIRECT_REFS through pointers; both array and pointer
94 accesses are required to have a simple (consecutive) access pattern.
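   (For instance, the accesses a[i], b[i] and c[i] in the example above are
   consecutive in this sense; a strided access like a[2*i] or an indirect
   access like a[b[i]] would not be.)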
96 Analysis phase:
97 ===============
98 The driver for the analysis phase is vect_analyze_loop().
99 It applies a set of analyses, some of which rely on the scalar evolution
100 analyzer (scev) developed by Sebastian Pop.
102 During the analysis phase the vectorizer records some information
103 per stmt in a "stmt_vec_info" struct which is attached to each stmt in the
104 loop, as well as general information about the loop as a whole, which is
105 recorded in a "loop_vec_info" struct attached to each loop.
107 Transformation phase:
108 =====================
109 The loop transformation phase scans all the stmts in the loop, and
110 creates a vector stmt (or a sequence of stmts) for each scalar stmt S in
111 the loop that needs to be vectorized. It inserts the vector code sequence
112 just before the scalar stmt S, and records a pointer to the vector code
113 in STMT_VINFO_VEC_STMT (stmt_info) (stmt_info is the stmt_vec_info struct
114 attached to S). This pointer will be used for the vectorization of following
115 stmts which use the def of stmt S. Stmt S is removed if it writes to memory;
116 otherwise, we rely on dead code elimination for removing it.
118 For example, say stmt S1 was vectorized into stmt VS1:
120 VS1: vb = px[i];
121 S1: b = x[i]; STMT_VINFO_VEC_STMT (stmt_info (S1)) = VS1
122 S2: a = b;
124 To vectorize stmt S2, the vectorizer first finds the stmt that defines
125 the operand 'b' (S1), and gets the relevant vector def 'vb' from the
126 vector stmt VS1 pointed to by STMT_VINFO_VEC_STMT (stmt_info (S1)). The
127 resulting sequence would be:
129 VS1: vb = px[i];
130 S1: b = x[i]; STMT_VINFO_VEC_STMT (stmt_info (S1)) = VS1
131 VS2: va = vb;
132 S2: a = b; STMT_VINFO_VEC_STMT (stmt_info (S2)) = VS2
134 Operands that are not SSA_NAMEs are data-refs that appear in
135 load/store operations (like 'x[i]' in S1), and are handled differently.
137 Target modeling:
138 =================
139 Currently the only target specific information that is used is the
140 size of the vector (in bytes) - "TARGET_VECTORIZE_UNITS_PER_SIMD_WORD".
141 Targets that can support different sizes of vectors, for now will need
142 to specify one value for "TARGET_VECTORIZE_UNITS_PER_SIMD_WORD". More
143 flexibility will be added in the future.
145 Since we only vectorize operations whose vector form can be
146 expressed using existing tree codes, to verify that an operation is
147 supported, the vectorizer checks the relevant optab at the relevant
148 machine_mode (e.g, optab_handler (add_optab, V8HImode)). If
149 the value found is CODE_FOR_nothing, then there's no target support, and
150 we can't vectorize the stmt.
152 For additional information on this project see:
153 http://gcc.gnu.org/projects/tree-ssa/vectorization.html  */
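/* For instance, the operation-support check described above boils down to
   a query of the following shape (an illustrative sketch only, reusing the
   V8HI addition from the example above):

     if (optab_handler (add_optab, V8HImode) == CODE_FOR_nothing)
       ... no target support, so the stmt cannot be vectorized ...  */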
156 static void vect_estimate_min_profitable_iters (loop_vec_info, int *, int *);
158 /* Subroutine of vect_determine_vf_for_stmt that handles only one
159 statement. VECTYPE_MAYBE_SET_P is true if STMT_VINFO_VECTYPE
160 may already be set for general statements (not just data refs). */
162 static opt_result
163 vect_determine_vf_for_stmt_1 (stmt_vec_info stmt_info,
164 bool vectype_maybe_set_p,
165 poly_uint64 *vf,
166 vec<stmt_vec_info > *mask_producers)
168 gimple *stmt = stmt_info->stmt;
170 if ((!STMT_VINFO_RELEVANT_P (stmt_info)
171 && !STMT_VINFO_LIVE_P (stmt_info))
172 || gimple_clobber_p (stmt))
174 if (dump_enabled_p ())
175 dump_printf_loc (MSG_NOTE, vect_location, "skip.\n");
176 return opt_result::success ();
179 tree stmt_vectype, nunits_vectype;
180 opt_result res = vect_get_vector_types_for_stmt (stmt_info, &stmt_vectype,
181 &nunits_vectype);
182 if (!res)
183 return res;
185 if (stmt_vectype)
187 if (STMT_VINFO_VECTYPE (stmt_info))
188 /* The only case when a vectype had been already set is for stmts
189 that contain a data ref, or for "pattern-stmts" (stmts generated
190 by the vectorizer to represent/replace a certain idiom). */
191 gcc_assert ((STMT_VINFO_DATA_REF (stmt_info)
192 || vectype_maybe_set_p)
193 && STMT_VINFO_VECTYPE (stmt_info) == stmt_vectype);
194 else if (stmt_vectype == boolean_type_node)
195 mask_producers->safe_push (stmt_info);
196 else
197 STMT_VINFO_VECTYPE (stmt_info) = stmt_vectype;
200 if (nunits_vectype)
201 vect_update_max_nunits (vf, nunits_vectype);
203 return opt_result::success ();
206 /* Subroutine of vect_determine_vectorization_factor. Set the vector
207 types of STMT_INFO and all attached pattern statements and update
208 the vectorization factor VF accordingly. If some of the statements
209 produce a mask result whose vector type can only be calculated later,
210 add them to MASK_PRODUCERS. Return true on success or false if
211 something prevented vectorization. */
213 static opt_result
214 vect_determine_vf_for_stmt (stmt_vec_info stmt_info, poly_uint64 *vf,
215 vec<stmt_vec_info > *mask_producers)
217 vec_info *vinfo = stmt_info->vinfo;
218 if (dump_enabled_p ())
219 dump_printf_loc (MSG_NOTE, vect_location, "==> examining statement: %G",
220 stmt_info->stmt);
221 opt_result res
222 = vect_determine_vf_for_stmt_1 (stmt_info, false, vf, mask_producers);
223 if (!res)
224 return res;
226 if (STMT_VINFO_IN_PATTERN_P (stmt_info)
227 && STMT_VINFO_RELATED_STMT (stmt_info))
229 gimple *pattern_def_seq = STMT_VINFO_PATTERN_DEF_SEQ (stmt_info);
230 stmt_info = STMT_VINFO_RELATED_STMT (stmt_info);
232 /* If a pattern statement has def stmts, analyze them too. */
233 for (gimple_stmt_iterator si = gsi_start (pattern_def_seq);
234 !gsi_end_p (si); gsi_next (&si))
236 stmt_vec_info def_stmt_info = vinfo->lookup_stmt (gsi_stmt (si));
237 if (dump_enabled_p ())
238 dump_printf_loc (MSG_NOTE, vect_location,
239 "==> examining pattern def stmt: %G",
240 def_stmt_info->stmt);
243 res = vect_determine_vf_for_stmt_1 (def_stmt_info, true,
244 vf, mask_producers);
245 if (!res)
246 return res;
249 if (dump_enabled_p ())
250 dump_printf_loc (MSG_NOTE, vect_location,
251 "==> examining pattern statement: %G",
252 stmt_info->stmt);
253 res = vect_determine_vf_for_stmt_1 (stmt_info, true, vf, mask_producers);
254 if (!res)
255 return res;
258 return opt_result::success ();
261 /* Function vect_determine_vectorization_factor
263 Determine the vectorization factor (VF). VF is the number of data elements
264 that are operated upon in parallel in a single iteration of the vectorized
265 loop. For example, when vectorizing a loop that operates on 4-byte elements,
266 on a target with a vector size (VS) of 16 bytes, the VF is set to 4, since 4
267 elements can fit in a single vector register.
269 We currently support vectorization of loops in which all types operated upon
270 are of the same size. Therefore this function currently sets VF according to
271 the size of the types operated upon, and fails if there are multiple sizes
272 in the loop.
274 VF is also the factor by which the loop iterations are strip-mined, e.g.:
275 original loop:
276 for (i=0; i<N; i++){
277 a[i] = b[i] + c[i];
280 vectorized loop:
281 for (i=0; i<N; i+=VF){
282 a[i:VF] = b[i:VF] + c[i:VF];
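   For instance, with the 4-byte elements and 16-byte vectors mentioned
   above (VF = 4), each iteration of the strip-mined loop processes
   a[i..i+3], b[i..i+3] and c[i..i+3] and advances i by 4.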
286 static opt_result
287 vect_determine_vectorization_factor (loop_vec_info loop_vinfo)
289 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
290 basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
291 unsigned nbbs = loop->num_nodes;
292 poly_uint64 vectorization_factor = 1;
293 tree scalar_type = NULL_TREE;
294 gphi *phi;
295 tree vectype;
296 stmt_vec_info stmt_info;
297 unsigned i;
298 auto_vec<stmt_vec_info> mask_producers;
300 DUMP_VECT_SCOPE ("vect_determine_vectorization_factor");
302 for (i = 0; i < nbbs; i++)
304 basic_block bb = bbs[i];
306 for (gphi_iterator si = gsi_start_phis (bb); !gsi_end_p (si);
307 gsi_next (&si))
309 phi = si.phi ();
310 stmt_info = loop_vinfo->lookup_stmt (phi);
311 if (dump_enabled_p ())
312 dump_printf_loc (MSG_NOTE, vect_location, "==> examining phi: %G",
313 phi);
315 gcc_assert (stmt_info);
317 if (STMT_VINFO_RELEVANT_P (stmt_info)
318 || STMT_VINFO_LIVE_P (stmt_info))
320 gcc_assert (!STMT_VINFO_VECTYPE (stmt_info));
321 scalar_type = TREE_TYPE (PHI_RESULT (phi));
323 if (dump_enabled_p ())
324 dump_printf_loc (MSG_NOTE, vect_location,
325 "get vectype for scalar type: %T\n",
326 scalar_type);
328 vectype = get_vectype_for_scalar_type (scalar_type);
329 if (!vectype)
330 return opt_result::failure_at (phi,
331 "not vectorized: unsupported "
332 "data-type %T\n",
333 scalar_type);
334 STMT_VINFO_VECTYPE (stmt_info) = vectype;
336 if (dump_enabled_p ())
337 dump_printf_loc (MSG_NOTE, vect_location, "vectype: %T\n",
338 vectype);
340 if (dump_enabled_p ())
342 dump_printf_loc (MSG_NOTE, vect_location, "nunits = ");
343 dump_dec (MSG_NOTE, TYPE_VECTOR_SUBPARTS (vectype));
344 dump_printf (MSG_NOTE, "\n");
347 vect_update_max_nunits (&vectorization_factor, vectype);
351 for (gimple_stmt_iterator si = gsi_start_bb (bb); !gsi_end_p (si);
352 gsi_next (&si))
354 stmt_info = loop_vinfo->lookup_stmt (gsi_stmt (si));
355 opt_result res
356 = vect_determine_vf_for_stmt (stmt_info, &vectorization_factor,
357 &mask_producers);
358 if (!res)
359 return res;
363 /* TODO: Analyze cost. Decide if worth while to vectorize. */
364 if (dump_enabled_p ())
366 dump_printf_loc (MSG_NOTE, vect_location, "vectorization factor = ");
367 dump_dec (MSG_NOTE, vectorization_factor);
368 dump_printf (MSG_NOTE, "\n");
371 if (known_le (vectorization_factor, 1U))
372 return opt_result::failure_at (vect_location,
373 "not vectorized: unsupported data-type\n");
374 LOOP_VINFO_VECT_FACTOR (loop_vinfo) = vectorization_factor;
376 for (i = 0; i < mask_producers.length (); i++)
378 stmt_info = mask_producers[i];
379 opt_tree mask_type = vect_get_mask_type_for_stmt (stmt_info);
380 if (!mask_type)
381 return opt_result::propagate_failure (mask_type);
382 STMT_VINFO_VECTYPE (stmt_info) = mask_type;
385 return opt_result::success ();
389 /* Function vect_is_simple_iv_evolution.
391 FORNOW: A simple evolution of an induction variable in the loop is
392 considered a polynomial evolution. */
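/* For instance (an illustrative example), the IV "i" of
   "for (i = 0; i < n; i++)" has the degree-1 evolution {0, +, 1}_loop,
   which is accepted below with *INIT = 0 and *STEP = 1, whereas an
   evolution whose step is itself a CHREC (degree >= 2) is rejected.  */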
394 static bool
395 vect_is_simple_iv_evolution (unsigned loop_nb, tree access_fn, tree * init,
396 tree * step)
398 tree init_expr;
399 tree step_expr;
400 tree evolution_part = evolution_part_in_loop_num (access_fn, loop_nb);
401 basic_block bb;
403 /* When there is no evolution in this loop, the evolution function
404 is not "simple". */
405 if (evolution_part == NULL_TREE)
406 return false;
408 /* When the evolution is a polynomial of degree >= 2
409 the evolution function is not "simple". */
410 if (tree_is_chrec (evolution_part))
411 return false;
413 step_expr = evolution_part;
414 init_expr = unshare_expr (initial_condition_in_loop_num (access_fn, loop_nb));
416 if (dump_enabled_p ())
417 dump_printf_loc (MSG_NOTE, vect_location, "step: %T, init: %T\n",
418 step_expr, init_expr);
420 *init = init_expr;
421 *step = step_expr;
423 if (TREE_CODE (step_expr) != INTEGER_CST
424 && (TREE_CODE (step_expr) != SSA_NAME
425 || ((bb = gimple_bb (SSA_NAME_DEF_STMT (step_expr)))
426 && flow_bb_inside_loop_p (get_loop (cfun, loop_nb), bb))
427 || (!INTEGRAL_TYPE_P (TREE_TYPE (step_expr))
428 && (!SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr))
429 || !flag_associative_math)))
430 && (TREE_CODE (step_expr) != REAL_CST
431 || !flag_associative_math))
433 if (dump_enabled_p ())
434 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
435 "step unknown.\n");
436 return false;
439 return true;
442 /* Return true if PHI, described by STMT_INFO, is the inner PHI in
443 what we are assuming is a double reduction. For example, given
444 a structure like this:
446 outer1:
447 x_1 = PHI <x_4(outer2), ...>;
450 inner:
451 x_2 = PHI <x_1(outer1), ...>;
453 x_3 = ...;
456 outer2:
457 x_4 = PHI <x_3(inner)>;
460 outer loop analysis would treat x_1 as a double reduction phi and
461 this function would then return true for x_2. */
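/* At the source level such a double reduction typically arises from a
   nest like the following (an illustrative sketch only):

     for (j = 0; j < M; j++)
       for (i = 0; i < N; i++)
         x += a[j][i];

   with x_1/x_2 above being the outer and inner loop-header PHIs of x.  */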
463 static bool
464 vect_inner_phi_in_double_reduction_p (stmt_vec_info stmt_info, gphi *phi)
466 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
467 use_operand_p use_p;
468 ssa_op_iter op_iter;
469 FOR_EACH_PHI_ARG (use_p, phi, op_iter, SSA_OP_USE)
470 if (stmt_vec_info def_info = loop_vinfo->lookup_def (USE_FROM_PTR (use_p)))
471 if (STMT_VINFO_DEF_TYPE (def_info) == vect_double_reduction_def)
472 return true;
473 return false;
476 /* Function vect_analyze_scalar_cycles_1.
478 Examine the cross iteration def-use cycles of scalar variables
479 in LOOP. LOOP_VINFO represents the loop that is now being
480 considered for vectorization (can be LOOP, or an outer-loop
481 enclosing LOOP). */
483 static void
484 vect_analyze_scalar_cycles_1 (loop_vec_info loop_vinfo, struct loop *loop)
486 basic_block bb = loop->header;
487 tree init, step;
488 auto_vec<stmt_vec_info, 64> worklist;
489 gphi_iterator gsi;
490 bool double_reduc;
492 DUMP_VECT_SCOPE ("vect_analyze_scalar_cycles");
494 /* First - identify all inductions. Reduction detection assumes that all the
495 inductions have been identified, therefore, this order must not be
496 changed. */
497 for (gsi = gsi_start_phis (bb); !gsi_end_p (gsi); gsi_next (&gsi))
499 gphi *phi = gsi.phi ();
500 tree access_fn = NULL;
501 tree def = PHI_RESULT (phi);
502 stmt_vec_info stmt_vinfo = loop_vinfo->lookup_stmt (phi);
504 if (dump_enabled_p ())
505 dump_printf_loc (MSG_NOTE, vect_location, "Analyze phi: %G", phi);
507 /* Skip virtual phi's. The data dependences that are associated with
508 virtual defs/uses (i.e., memory accesses) are analyzed elsewhere. */
509 if (virtual_operand_p (def))
510 continue;
512 STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_unknown_def_type;
514 /* Analyze the evolution function. */
515 access_fn = analyze_scalar_evolution (loop, def);
516 if (access_fn)
518 STRIP_NOPS (access_fn);
519 if (dump_enabled_p ())
520 dump_printf_loc (MSG_NOTE, vect_location,
521 "Access function of PHI: %T\n", access_fn);
522 STMT_VINFO_LOOP_PHI_EVOLUTION_BASE_UNCHANGED (stmt_vinfo)
523 = initial_condition_in_loop_num (access_fn, loop->num);
524 STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_vinfo)
525 = evolution_part_in_loop_num (access_fn, loop->num);
528 if (!access_fn
529 || vect_inner_phi_in_double_reduction_p (stmt_vinfo, phi)
530 || !vect_is_simple_iv_evolution (loop->num, access_fn, &init, &step)
531 || (LOOP_VINFO_LOOP (loop_vinfo) != loop
532 && TREE_CODE (step) != INTEGER_CST))
534 worklist.safe_push (stmt_vinfo);
535 continue;
538 gcc_assert (STMT_VINFO_LOOP_PHI_EVOLUTION_BASE_UNCHANGED (stmt_vinfo)
539 != NULL_TREE);
540 gcc_assert (STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_vinfo) != NULL_TREE);
542 if (dump_enabled_p ())
543 dump_printf_loc (MSG_NOTE, vect_location, "Detected induction.\n");
544 STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_induction_def;
548 /* Second - identify all reductions and nested cycles. */
549 while (worklist.length () > 0)
551 stmt_vec_info stmt_vinfo = worklist.pop ();
552 gphi *phi = as_a <gphi *> (stmt_vinfo->stmt);
553 tree def = PHI_RESULT (phi);
555 if (dump_enabled_p ())
556 dump_printf_loc (MSG_NOTE, vect_location, "Analyze phi: %G", phi);
558 gcc_assert (!virtual_operand_p (def)
559 && STMT_VINFO_DEF_TYPE (stmt_vinfo) == vect_unknown_def_type);
561 stmt_vec_info reduc_stmt_info
562 = vect_force_simple_reduction (loop_vinfo, stmt_vinfo,
563 &double_reduc, false);
564 if (reduc_stmt_info)
566 if (double_reduc)
568 if (dump_enabled_p ())
569 dump_printf_loc (MSG_NOTE, vect_location,
570 "Detected double reduction.\n");
572 STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_double_reduction_def;
573 STMT_VINFO_DEF_TYPE (reduc_stmt_info)
574 = vect_double_reduction_def;
576 else
578 if (loop != LOOP_VINFO_LOOP (loop_vinfo))
580 if (dump_enabled_p ())
581 dump_printf_loc (MSG_NOTE, vect_location,
582 "Detected vectorizable nested cycle.\n");
584 STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_nested_cycle;
585 STMT_VINFO_DEF_TYPE (reduc_stmt_info) = vect_nested_cycle;
587 else
589 if (dump_enabled_p ())
590 dump_printf_loc (MSG_NOTE, vect_location,
591 "Detected reduction.\n");
593 STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_reduction_def;
594 STMT_VINFO_DEF_TYPE (reduc_stmt_info) = vect_reduction_def;
595 /* Store the reduction cycles for possible vectorization in
596 loop-aware SLP if it was not detected as a reduction
597 chain. */
598 if (! REDUC_GROUP_FIRST_ELEMENT (reduc_stmt_info))
599 LOOP_VINFO_REDUCTIONS (loop_vinfo).safe_push
600 (reduc_stmt_info);
604 else
605 if (dump_enabled_p ())
606 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
607 "Unknown def-use cycle pattern.\n");
612 /* Function vect_analyze_scalar_cycles.
614 Examine the cross iteration def-use cycles of scalar variables, by
615 analyzing the loop-header PHIs of scalar variables. Classify each
616 cycle as one of the following: invariant, induction, reduction, unknown.
617 We do that for the loop represented by LOOP_VINFO, and also for its
618 inner-loop, if it exists.
619 Examples for scalar cycles:
621 Example1: reduction:
623 loop1:
624 for (i=0; i<N; i++)
625 sum += a[i];
627 Example2: induction:
629 loop2:
630 for (i=0; i<N; i++)
631 a[i] = i; */
633 static void
634 vect_analyze_scalar_cycles (loop_vec_info loop_vinfo)
636 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
638 vect_analyze_scalar_cycles_1 (loop_vinfo, loop);
640 /* When vectorizing an outer-loop, the inner-loop is executed sequentially.
641 Reductions in such inner-loop therefore have different properties than
642 the reductions in the nest that gets vectorized:
643 1. When vectorized, they are executed in the same order as in the original
644 scalar loop, so we can't change the order of computation when
645 vectorizing them.
646 2. FIXME: Inner-loop reductions can be used in the inner-loop, so the
647 current checks are too strict. */
649 if (loop->inner)
650 vect_analyze_scalar_cycles_1 (loop_vinfo, loop->inner);
653 /* Transfer group and reduction information from STMT_INFO to its
654 pattern stmt. */
656 static void
657 vect_fixup_reduc_chain (stmt_vec_info stmt_info)
659 stmt_vec_info firstp = STMT_VINFO_RELATED_STMT (stmt_info);
660 stmt_vec_info stmtp;
661 gcc_assert (!REDUC_GROUP_FIRST_ELEMENT (firstp)
662 && REDUC_GROUP_FIRST_ELEMENT (stmt_info));
663 REDUC_GROUP_SIZE (firstp) = REDUC_GROUP_SIZE (stmt_info);
666 stmtp = STMT_VINFO_RELATED_STMT (stmt_info);
667 REDUC_GROUP_FIRST_ELEMENT (stmtp) = firstp;
668 stmt_info = REDUC_GROUP_NEXT_ELEMENT (stmt_info);
669 if (stmt_info)
670 REDUC_GROUP_NEXT_ELEMENT (stmtp)
671 = STMT_VINFO_RELATED_STMT (stmt_info);
673 while (stmt_info);
674 STMT_VINFO_DEF_TYPE (stmtp) = vect_reduction_def;
677 /* Fixup scalar cycles that now have their stmts detected as patterns. */
679 static void
680 vect_fixup_scalar_cycles_with_patterns (loop_vec_info loop_vinfo)
682 stmt_vec_info first;
683 unsigned i;
685 FOR_EACH_VEC_ELT (LOOP_VINFO_REDUCTION_CHAINS (loop_vinfo), i, first)
686 if (STMT_VINFO_IN_PATTERN_P (first))
688 stmt_vec_info next = REDUC_GROUP_NEXT_ELEMENT (first);
689 while (next)
691 if (! STMT_VINFO_IN_PATTERN_P (next))
692 break;
693 next = REDUC_GROUP_NEXT_ELEMENT (next);
695 /* If not all stmts in the chain are patterns, try to handle
696 the chain without patterns. */
697 if (! next)
699 vect_fixup_reduc_chain (first);
700 LOOP_VINFO_REDUCTION_CHAINS (loop_vinfo)[i]
701 = STMT_VINFO_RELATED_STMT (first);
706 /* Function vect_get_loop_niters.
708 Determine how many iterations the loop is executed and place it
709 in NUMBER_OF_ITERATIONS. Place the number of latch iterations
710 in NUMBER_OF_ITERATIONSM1. Place the condition under which the
711 niter information holds in ASSUMPTIONS.
713 Return the loop exit condition. */
716 static gcond *
717 vect_get_loop_niters (struct loop *loop, tree *assumptions,
718 tree *number_of_iterations, tree *number_of_iterationsm1)
720 edge exit = single_exit (loop);
721 struct tree_niter_desc niter_desc;
722 tree niter_assumptions, niter, may_be_zero;
723 gcond *cond = get_loop_exit_condition (loop);
725 *assumptions = boolean_true_node;
726 *number_of_iterationsm1 = chrec_dont_know;
727 *number_of_iterations = chrec_dont_know;
728 DUMP_VECT_SCOPE ("get_loop_niters");
730 if (!exit)
731 return cond;
733 niter = chrec_dont_know;
734 may_be_zero = NULL_TREE;
735 niter_assumptions = boolean_true_node;
736 if (!number_of_iterations_exit_assumptions (loop, exit, &niter_desc, NULL)
737 || chrec_contains_undetermined (niter_desc.niter))
738 return cond;
740 niter_assumptions = niter_desc.assumptions;
741 may_be_zero = niter_desc.may_be_zero;
742 niter = niter_desc.niter;
744 if (may_be_zero && integer_zerop (may_be_zero))
745 may_be_zero = NULL_TREE;
747 if (may_be_zero)
749 if (COMPARISON_CLASS_P (may_be_zero))
751 /* Try to combine may_be_zero with assumptions; this can simplify
752 computation of niter expression. */
753 if (niter_assumptions && !integer_nonzerop (niter_assumptions))
754 niter_assumptions = fold_build2 (TRUTH_AND_EXPR, boolean_type_node,
755 niter_assumptions,
756 fold_build1 (TRUTH_NOT_EXPR,
757 boolean_type_node,
758 may_be_zero));
759 else
760 niter = fold_build3 (COND_EXPR, TREE_TYPE (niter), may_be_zero,
761 build_int_cst (TREE_TYPE (niter), 0),
762 rewrite_to_non_trapping_overflow (niter));
764 may_be_zero = NULL_TREE;
766 else if (integer_nonzerop (may_be_zero))
768 *number_of_iterationsm1 = build_int_cst (TREE_TYPE (niter), 0);
769 *number_of_iterations = build_int_cst (TREE_TYPE (niter), 1);
770 return cond;
772 else
773 return cond;
776 *assumptions = niter_assumptions;
777 *number_of_iterationsm1 = niter;
779 /* We want the number of loop header executions which is the number
780 of latch executions plus one.
781 ??? For UINT_MAX latch executions this number overflows to zero
782 for loops like do { n++; } while (n != 0); */
783 if (niter && !chrec_contains_undetermined (niter))
784 niter = fold_build2 (PLUS_EXPR, TREE_TYPE (niter), unshare_expr (niter),
785 build_int_cst (TREE_TYPE (niter), 1));
786 *number_of_iterations = niter;
788 return cond;
791 /* Function bb_in_loop_p
793 Used as predicate for dfs order traversal of the loop bbs. */
795 static bool
796 bb_in_loop_p (const_basic_block bb, const void *data)
798 const struct loop *const loop = (const struct loop *)data;
799 if (flow_bb_inside_loop_p (loop, bb))
800 return true;
801 return false;
805 /* Create and initialize a new loop_vec_info struct for LOOP_IN, as well as
806 stmt_vec_info structs for all the stmts in LOOP_IN. */
808 _loop_vec_info::_loop_vec_info (struct loop *loop_in, vec_info_shared *shared)
809 : vec_info (vec_info::loop, init_cost (loop_in), shared),
810 loop (loop_in),
811 bbs (XCNEWVEC (basic_block, loop->num_nodes)),
812 num_itersm1 (NULL_TREE),
813 num_iters (NULL_TREE),
814 num_iters_unchanged (NULL_TREE),
815 num_iters_assumptions (NULL_TREE),
816 th (0),
817 versioning_threshold (0),
818 vectorization_factor (0),
819 max_vectorization_factor (0),
820 mask_skip_niters (NULL_TREE),
821 mask_compare_type (NULL_TREE),
822 unaligned_dr (NULL),
823 peeling_for_alignment (0),
824 ptr_mask (0),
825 ivexpr_map (NULL),
826 slp_unrolling_factor (1),
827 single_scalar_iteration_cost (0),
828 vectorizable (false),
829 can_fully_mask_p (true),
830 fully_masked_p (false),
831 peeling_for_gaps (false),
832 peeling_for_niter (false),
833 operands_swapped (false),
834 no_data_dependencies (false),
835 has_mask_store (false),
836 scalar_loop (NULL),
837 orig_loop_info (NULL)
839 /* CHECKME: We want to visit all BBs before their successors (except for
840 latch blocks, for which this assertion wouldn't hold). In the simple
841 case of the loop forms we allow, a dfs order of the BBs would be the same
842 as reversed postorder traversal, so we are safe. */
844 unsigned int nbbs = dfs_enumerate_from (loop->header, 0, bb_in_loop_p,
845 bbs, loop->num_nodes, loop);
846 gcc_assert (nbbs == loop->num_nodes);
848 for (unsigned int i = 0; i < nbbs; i++)
850 basic_block bb = bbs[i];
851 gimple_stmt_iterator si;
853 for (si = gsi_start_phis (bb); !gsi_end_p (si); gsi_next (&si))
855 gimple *phi = gsi_stmt (si);
856 gimple_set_uid (phi, 0);
857 add_stmt (phi);
860 for (si = gsi_start_bb (bb); !gsi_end_p (si); gsi_next (&si))
862 gimple *stmt = gsi_stmt (si);
863 gimple_set_uid (stmt, 0);
864 add_stmt (stmt);
869 /* Free all levels of MASKS. */
871 void
872 release_vec_loop_masks (vec_loop_masks *masks)
874 rgroup_masks *rgm;
875 unsigned int i;
876 FOR_EACH_VEC_ELT (*masks, i, rgm)
877 rgm->masks.release ();
878 masks->release ();
881 /* Free all memory used by the _loop_vec_info, as well as all the
882 stmt_vec_info structs of all the stmts in the loop. */
884 _loop_vec_info::~_loop_vec_info ()
886 int nbbs;
887 gimple_stmt_iterator si;
888 int j;
890 nbbs = loop->num_nodes;
891 for (j = 0; j < nbbs; j++)
893 basic_block bb = bbs[j];
894 for (si = gsi_start_bb (bb); !gsi_end_p (si); )
896 gimple *stmt = gsi_stmt (si);
898 /* We may have broken canonical form by moving a constant
899 into RHS1 of a commutative op. Fix such occurrences. */
900 if (operands_swapped && is_gimple_assign (stmt))
902 enum tree_code code = gimple_assign_rhs_code (stmt);
904 if ((code == PLUS_EXPR
905 || code == POINTER_PLUS_EXPR
906 || code == MULT_EXPR)
907 && CONSTANT_CLASS_P (gimple_assign_rhs1 (stmt)))
908 swap_ssa_operands (stmt,
909 gimple_assign_rhs1_ptr (stmt),
910 gimple_assign_rhs2_ptr (stmt));
911 else if (code == COND_EXPR
912 && CONSTANT_CLASS_P (gimple_assign_rhs2 (stmt)))
914 tree cond_expr = gimple_assign_rhs1 (stmt);
915 enum tree_code cond_code = TREE_CODE (cond_expr);
917 if (TREE_CODE_CLASS (cond_code) == tcc_comparison)
919 bool honor_nans = HONOR_NANS (TREE_OPERAND (cond_expr,
920 0));
921 cond_code = invert_tree_comparison (cond_code,
922 honor_nans);
923 if (cond_code != ERROR_MARK)
925 TREE_SET_CODE (cond_expr, cond_code);
926 swap_ssa_operands (stmt,
927 gimple_assign_rhs2_ptr (stmt),
928 gimple_assign_rhs3_ptr (stmt));
933 gsi_next (&si);
937 free (bbs);
939 release_vec_loop_masks (&masks);
940 delete ivexpr_map;
942 loop->aux = NULL;
945 /* Return an invariant or register for EXPR and emit necessary
946 computations in the LOOP_VINFO loop preheader. */
948 tree
949 cse_and_gimplify_to_preheader (loop_vec_info loop_vinfo, tree expr)
951 if (is_gimple_reg (expr)
952 || is_gimple_min_invariant (expr))
953 return expr;
955 if (! loop_vinfo->ivexpr_map)
956 loop_vinfo->ivexpr_map = new hash_map<tree_operand_hash, tree>;
957 tree &cached = loop_vinfo->ivexpr_map->get_or_insert (expr);
958 if (! cached)
960 gimple_seq stmts = NULL;
961 cached = force_gimple_operand (unshare_expr (expr),
962 &stmts, true, NULL_TREE);
963 if (stmts)
965 edge e = loop_preheader_edge (LOOP_VINFO_LOOP (loop_vinfo));
966 gsi_insert_seq_on_edge_immediate (e, stmts);
969 return cached;
972 /* Return true if we can use CMP_TYPE as the comparison type to produce
973 all masks required to mask LOOP_VINFO. */
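/* (The masks themselves are produced by the IFN_WHILE_ULT internal
   function checked below: its result is a mask in which element I is
   true iff operand 1 + I is less than operand 2, with the comparison
   performed in CMP_TYPE.)  */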
975 static bool
976 can_produce_all_loop_masks_p (loop_vec_info loop_vinfo, tree cmp_type)
978 rgroup_masks *rgm;
979 unsigned int i;
980 FOR_EACH_VEC_ELT (LOOP_VINFO_MASKS (loop_vinfo), i, rgm)
981 if (rgm->mask_type != NULL_TREE
982 && !direct_internal_fn_supported_p (IFN_WHILE_ULT,
983 cmp_type, rgm->mask_type,
984 OPTIMIZE_FOR_SPEED))
985 return false;
986 return true;
989 /* Calculate the maximum number of scalars per iteration for every
990 rgroup in LOOP_VINFO. */
992 static unsigned int
993 vect_get_max_nscalars_per_iter (loop_vec_info loop_vinfo)
995 unsigned int res = 1;
996 unsigned int i;
997 rgroup_masks *rgm;
998 FOR_EACH_VEC_ELT (LOOP_VINFO_MASKS (loop_vinfo), i, rgm)
999 res = MAX (res, rgm->max_nscalars_per_iter);
1000 return res;
1003 /* Each statement in LOOP_VINFO can be masked where necessary. Check
1004 whether we can actually generate the masks required. Return true if so,
1005 storing the type of the scalar IV in LOOP_VINFO_MASK_COMPARE_TYPE. */
1007 static bool
1008 vect_verify_full_masking (loop_vec_info loop_vinfo)
1010 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1011 unsigned int min_ni_width;
1013 /* Use a normal loop if there are no statements that need masking.
1014 This only happens in rare degenerate cases: it means that the loop
1015 has no loads, no stores, and no live-out values. */
1016 if (LOOP_VINFO_MASKS (loop_vinfo).is_empty ())
1017 return false;
1019 /* Get the maximum number of iterations that is representable
1020 in the counter type. */
1021 tree ni_type = TREE_TYPE (LOOP_VINFO_NITERSM1 (loop_vinfo));
1022 widest_int max_ni = wi::to_widest (TYPE_MAX_VALUE (ni_type)) + 1;
1024 /* Get a more refined estimate for the number of iterations. */
1025 widest_int max_back_edges;
1026 if (max_loop_iterations (loop, &max_back_edges))
1027 max_ni = wi::smin (max_ni, max_back_edges + 1);
1029 /* Account for rgroup masks, in which each bit is replicated N times. */
1030 max_ni *= vect_get_max_nscalars_per_iter (loop_vinfo);
1032 /* Work out how many bits we need to represent the limit. */
1033 min_ni_width = wi::min_precision (max_ni, UNSIGNED);
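  /* For instance (illustrative numbers only): for a loop that runs at
     most 1000 iterations with at most 2 scalars per iteration in an
     rgroup, max_ni is 2000 and min_ni_width is 11, since 2000 < 2^11.  */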
1035 /* Find a scalar mode for which WHILE_ULT is supported. */
1036 opt_scalar_int_mode cmp_mode_iter;
1037 tree cmp_type = NULL_TREE;
1038 FOR_EACH_MODE_IN_CLASS (cmp_mode_iter, MODE_INT)
1040 unsigned int cmp_bits = GET_MODE_BITSIZE (cmp_mode_iter.require ());
1041 if (cmp_bits >= min_ni_width
1042 && targetm.scalar_mode_supported_p (cmp_mode_iter.require ()))
1044 tree this_type = build_nonstandard_integer_type (cmp_bits, true);
1045 if (this_type
1046 && can_produce_all_loop_masks_p (loop_vinfo, this_type))
1048 /* Although we could stop as soon as we find a valid mode,
1049 it's often better to continue until we hit Pmode, since the
1050 operands to the WHILE are more likely to be reusable in
1051 address calculations. */
1052 cmp_type = this_type;
1053 if (cmp_bits >= GET_MODE_BITSIZE (Pmode))
1054 break;
1059 if (!cmp_type)
1060 return false;
1062 LOOP_VINFO_MASK_COMPARE_TYPE (loop_vinfo) = cmp_type;
1063 return true;
1066 /* Calculate the cost of one scalar iteration of the loop. */
1067 static void
1068 vect_compute_single_scalar_iteration_cost (loop_vec_info loop_vinfo)
1070 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1071 basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
1072 int nbbs = loop->num_nodes, factor;
1073 int innerloop_iters, i;
1075 DUMP_VECT_SCOPE ("vect_compute_single_scalar_iteration_cost");
1077 /* Gather costs for statements in the scalar loop. */
1079 /* FORNOW. */
1080 innerloop_iters = 1;
1081 if (loop->inner)
1082 innerloop_iters = 50; /* FIXME */
1084 for (i = 0; i < nbbs; i++)
1086 gimple_stmt_iterator si;
1087 basic_block bb = bbs[i];
1089 if (bb->loop_father == loop->inner)
1090 factor = innerloop_iters;
1091 else
1092 factor = 1;
1094 for (si = gsi_start_bb (bb); !gsi_end_p (si); gsi_next (&si))
1096 gimple *stmt = gsi_stmt (si);
1097 stmt_vec_info stmt_info = loop_vinfo->lookup_stmt (stmt);
1099 if (!is_gimple_assign (stmt) && !is_gimple_call (stmt))
1100 continue;
1102 /* Skip stmts that are not vectorized inside the loop. */
1103 if (stmt_info
1104 && !STMT_VINFO_RELEVANT_P (stmt_info)
1105 && (!STMT_VINFO_LIVE_P (stmt_info)
1106 || !VECTORIZABLE_CYCLE_DEF (STMT_VINFO_DEF_TYPE (stmt_info)))
1107 && !STMT_VINFO_IN_PATTERN_P (stmt_info))
1108 continue;
1110 vect_cost_for_stmt kind;
1111 if (STMT_VINFO_DATA_REF (stmt_info))
1113 if (DR_IS_READ (STMT_VINFO_DATA_REF (stmt_info)))
1114 kind = scalar_load;
1115 else
1116 kind = scalar_store;
1118 else
1119 kind = scalar_stmt;
1121 record_stmt_cost (&LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo),
1122 factor, kind, stmt_info, 0, vect_prologue);
1126 /* Now accumulate cost. */
1127 void *target_cost_data = init_cost (loop);
1128 stmt_info_for_cost *si;
1129 int j;
1130 FOR_EACH_VEC_ELT (LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo),
1131 j, si)
1132 (void) add_stmt_cost (target_cost_data, si->count,
1133 si->kind, si->stmt_info, si->misalign,
1134 vect_body);
1135 unsigned dummy, body_cost = 0;
1136 finish_cost (target_cost_data, &dummy, &body_cost, &dummy);
1137 destroy_cost_data (target_cost_data);
1138 LOOP_VINFO_SINGLE_SCALAR_ITERATION_COST (loop_vinfo) = body_cost;
1142 /* Function vect_analyze_loop_form_1.
1144 Verify that certain CFG restrictions hold, including:
1145 - the loop has a pre-header
1146 - the loop has a single entry and exit
1147 - the loop exit condition is simple enough
1148 - the number of iterations can be analyzed, i.e., a countable loop. The
1149 niter could be analyzed under some assumptions. */
1151 opt_result
1152 vect_analyze_loop_form_1 (struct loop *loop, gcond **loop_cond,
1153 tree *assumptions, tree *number_of_iterationsm1,
1154 tree *number_of_iterations, gcond **inner_loop_cond)
1156 DUMP_VECT_SCOPE ("vect_analyze_loop_form");
1158 /* Different restrictions apply when we are considering an inner-most loop,
1159 vs. an outer (nested) loop.
1160 (FORNOW. May want to relax some of these restrictions in the future). */
1162 if (!loop->inner)
1164 /* Inner-most loop. We currently require that the number of BBs is
1165 exactly 2 (the header and latch). Vectorizable inner-most loops
1166 look like this:
1168 (pre-header)
1170 header <--------+
1171 | | |
1172 | +--> latch --+
1174 (exit-bb) */
1176 if (loop->num_nodes != 2)
1177 return opt_result::failure_at (vect_location,
1178 "not vectorized:"
1179 " control flow in loop.\n");
1181 if (empty_block_p (loop->header))
1182 return opt_result::failure_at (vect_location,
1183 "not vectorized: empty loop.\n");
1185 else
1187 struct loop *innerloop = loop->inner;
1188 edge entryedge;
1190 /* Nested loop. We currently require that the loop is doubly-nested,
1191 contains a single inner loop, and the number of BBs is exactly 5.
1192 Vectorizable outer-loops look like this:
1194 (pre-header)
1196 header <---+
1198 inner-loop |
1200 tail ------+
1202 (exit-bb)
1204 The inner-loop has the properties expected of inner-most loops
1205 as described above. */
1207 if ((loop->inner)->inner || (loop->inner)->next)
1208 return opt_result::failure_at (vect_location,
1209 "not vectorized:"
1210 " multiple nested loops.\n");
1212 if (loop->num_nodes != 5)
1213 return opt_result::failure_at (vect_location,
1214 "not vectorized:"
1215 " control flow in loop.\n");
1217 entryedge = loop_preheader_edge (innerloop);
1218 if (entryedge->src != loop->header
1219 || !single_exit (innerloop)
1220 || single_exit (innerloop)->dest != EDGE_PRED (loop->latch, 0)->src)
1221 return opt_result::failure_at (vect_location,
1222 "not vectorized:"
1223 " unsupported outerloop form.\n");
1225 /* Analyze the inner-loop. */
1226 tree inner_niterm1, inner_niter, inner_assumptions;
1227 opt_result res
1228 = vect_analyze_loop_form_1 (loop->inner, inner_loop_cond,
1229 &inner_assumptions, &inner_niterm1,
1230 &inner_niter, NULL);
1231 if (!res)
1233 if (dump_enabled_p ())
1234 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1235 "not vectorized: Bad inner loop.\n");
1236 return res;
1239 /* Don't support analyzing niter under assumptions for inner
1240 loop. */
1241 if (!integer_onep (inner_assumptions))
1242 return opt_result::failure_at (vect_location,
1243 "not vectorized: Bad inner loop.\n");
1245 if (!expr_invariant_in_loop_p (loop, inner_niter))
1246 return opt_result::failure_at (vect_location,
1247 "not vectorized: inner-loop count not"
1248 " invariant.\n");
1250 if (dump_enabled_p ())
1251 dump_printf_loc (MSG_NOTE, vect_location,
1252 "Considering outer-loop vectorization.\n");
1255 if (!single_exit (loop))
1256 return opt_result::failure_at (vect_location,
1257 "not vectorized: multiple exits.\n");
1258 if (EDGE_COUNT (loop->header->preds) != 2)
1259 return opt_result::failure_at (vect_location,
1260 "not vectorized:"
1261 " too many incoming edges.\n");
1263 /* We assume that the loop exit condition is at the end of the loop, i.e.,
1264 that the loop is represented as a do-while (with a proper if-guard
1265 before the loop if needed), where the loop header contains all the
1266 executable statements, and the latch is empty. */
1267 if (!empty_block_p (loop->latch)
1268 || !gimple_seq_empty_p (phi_nodes (loop->latch)))
1269 return opt_result::failure_at (vect_location,
1270 "not vectorized: latch block not empty.\n");
1272 /* Make sure the exit is not abnormal. */
1273 edge e = single_exit (loop);
1274 if (e->flags & EDGE_ABNORMAL)
1275 return opt_result::failure_at (vect_location,
1276 "not vectorized:"
1277 " abnormal loop exit edge.\n");
1279 *loop_cond = vect_get_loop_niters (loop, assumptions, number_of_iterations,
1280 number_of_iterationsm1);
1281 if (!*loop_cond)
1282 return opt_result::failure_at
1283 (vect_location,
1284 "not vectorized: complicated exit condition.\n");
1286 if (integer_zerop (*assumptions)
1287 || !*number_of_iterations
1288 || chrec_contains_undetermined (*number_of_iterations))
1289 return opt_result::failure_at
1290 (*loop_cond,
1291 "not vectorized: number of iterations cannot be computed.\n");
1293 if (integer_zerop (*number_of_iterations))
1294 return opt_result::failure_at
1295 (*loop_cond,
1296 "not vectorized: number of iterations = 0.\n");
1298 return opt_result::success ();
1301 /* Analyze LOOP form and return a loop_vec_info if it is of suitable form. */
1303 opt_loop_vec_info
1304 vect_analyze_loop_form (struct loop *loop, vec_info_shared *shared)
1306 tree assumptions, number_of_iterations, number_of_iterationsm1;
1307 gcond *loop_cond, *inner_loop_cond = NULL;
1309 opt_result res
1310 = vect_analyze_loop_form_1 (loop, &loop_cond,
1311 &assumptions, &number_of_iterationsm1,
1312 &number_of_iterations, &inner_loop_cond);
1313 if (!res)
1314 return opt_loop_vec_info::propagate_failure (res);
1316 loop_vec_info loop_vinfo = new _loop_vec_info (loop, shared);
1317 LOOP_VINFO_NITERSM1 (loop_vinfo) = number_of_iterationsm1;
1318 LOOP_VINFO_NITERS (loop_vinfo) = number_of_iterations;
1319 LOOP_VINFO_NITERS_UNCHANGED (loop_vinfo) = number_of_iterations;
1320 if (!integer_onep (assumptions))
1322 /* We consider vectorizing this loop by versioning it under
1323 some assumptions. In order to do this, we need to clear
1324 existing information computed by scev and niter analyzer. */
1325 scev_reset_htab ();
1326 free_numbers_of_iterations_estimates (loop);
1327 /* Also set flag for this loop so that following scev and niter
1328 analysis are done under the assumptions. */
1329 loop_constraint_set (loop, LOOP_C_FINITE);
1330 /* Also record the assumptions for versioning. */
1331 LOOP_VINFO_NITERS_ASSUMPTIONS (loop_vinfo) = assumptions;
1334 if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
1336 if (dump_enabled_p ())
1338 dump_printf_loc (MSG_NOTE, vect_location,
1339 "Symbolic number of iterations is ");
1340 dump_generic_expr (MSG_NOTE, TDF_DETAILS, number_of_iterations);
1341 dump_printf (MSG_NOTE, "\n");
1345 stmt_vec_info loop_cond_info = loop_vinfo->lookup_stmt (loop_cond);
1346 STMT_VINFO_TYPE (loop_cond_info) = loop_exit_ctrl_vec_info_type;
1347 if (inner_loop_cond)
1349 stmt_vec_info inner_loop_cond_info
1350 = loop_vinfo->lookup_stmt (inner_loop_cond);
1351 STMT_VINFO_TYPE (inner_loop_cond_info) = loop_exit_ctrl_vec_info_type;
1354 gcc_assert (!loop->aux);
1355 loop->aux = loop_vinfo;
1356 return opt_loop_vec_info::success (loop_vinfo);
1361 /* Scan the loop stmts and, depending on whether there are any (non-)SLP
1362 statements, update the vectorization factor. */
1364 static void
1365 vect_update_vf_for_slp (loop_vec_info loop_vinfo)
1367 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1368 basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
1369 int nbbs = loop->num_nodes;
1370 poly_uint64 vectorization_factor;
1371 int i;
1373 DUMP_VECT_SCOPE ("vect_update_vf_for_slp");
1375 vectorization_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
1376 gcc_assert (known_ne (vectorization_factor, 0U));
1378 /* If all the stmts in the loop can be SLPed, we perform only SLP, and
1379 the vectorization factor of the loop is the unrolling factor required by
1380 the SLP instances. If that unrolling factor is 1, we say that we
1381 perform pure SLP on the loop - cross-iteration parallelism is not
1382 exploited. */
1383 bool only_slp_in_loop = true;
1384 for (i = 0; i < nbbs; i++)
1386 basic_block bb = bbs[i];
1387 for (gimple_stmt_iterator si = gsi_start_bb (bb); !gsi_end_p (si);
1388 gsi_next (&si))
1390 stmt_vec_info stmt_info = loop_vinfo->lookup_stmt (gsi_stmt (si));
1391 stmt_info = vect_stmt_to_vectorize (stmt_info);
1392 if ((STMT_VINFO_RELEVANT_P (stmt_info)
1393 || VECTORIZABLE_CYCLE_DEF (STMT_VINFO_DEF_TYPE (stmt_info)))
1394 && !PURE_SLP_STMT (stmt_info))
1395 /* STMT needs both SLP and loop-based vectorization. */
1396 only_slp_in_loop = false;
1400 if (only_slp_in_loop)
1402 dump_printf_loc (MSG_NOTE, vect_location,
1403 "Loop contains only SLP stmts\n");
1404 vectorization_factor = LOOP_VINFO_SLP_UNROLLING_FACTOR (loop_vinfo);
1406 else
1408 dump_printf_loc (MSG_NOTE, vect_location,
1409 "Loop contains SLP and non-SLP stmts\n");
1410 /* Both the vectorization factor and unroll factor have the form
1411 current_vector_size * X for some rational X, so they must have
1412 a common multiple. */
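      /* For instance (illustrative numbers only): a loop-based
         vectorization factor of 4 combined with an SLP unrolling factor
         of 2 stays at 4, while an SLP unrolling factor of 8 would raise
         it to 8.  */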
1413 vectorization_factor
1414 = force_common_multiple (vectorization_factor,
1415 LOOP_VINFO_SLP_UNROLLING_FACTOR (loop_vinfo));
1418 LOOP_VINFO_VECT_FACTOR (loop_vinfo) = vectorization_factor;
1419 if (dump_enabled_p ())
1421 dump_printf_loc (MSG_NOTE, vect_location,
1422 "Updating vectorization factor to ");
1423 dump_dec (MSG_NOTE, vectorization_factor);
1424 dump_printf (MSG_NOTE, ".\n");
1428 /* Return true if STMT_INFO describes a double reduction phi and if
1429 the other phi in the reduction is also relevant for vectorization.
1430 This rejects cases such as:
1432 outer1:
1433 x_1 = PHI <x_3(outer2), ...>;
1436 inner:
1437 x_2 = ...;
1440 outer2:
1441 x_3 = PHI <x_2(inner)>;
1443 if nothing in x_2 or elsewhere makes x_1 relevant. */
1445 static bool
1446 vect_active_double_reduction_p (stmt_vec_info stmt_info)
1448 if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_double_reduction_def)
1449 return false;
1451 return STMT_VINFO_RELEVANT_P (STMT_VINFO_REDUC_DEF (stmt_info));
1454 /* Function vect_analyze_loop_operations.
1456 Scan the loop stmts and make sure they are all vectorizable. */
1458 static opt_result
1459 vect_analyze_loop_operations (loop_vec_info loop_vinfo)
1461 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1462 basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
1463 int nbbs = loop->num_nodes;
1464 int i;
1465 stmt_vec_info stmt_info;
1466 bool need_to_vectorize = false;
1467 bool ok;
1469 DUMP_VECT_SCOPE ("vect_analyze_loop_operations");
1471 stmt_vector_for_cost cost_vec;
1472 cost_vec.create (2);
1474 for (i = 0; i < nbbs; i++)
1476 basic_block bb = bbs[i];
1478 for (gphi_iterator si = gsi_start_phis (bb); !gsi_end_p (si);
1479 gsi_next (&si))
1481 gphi *phi = si.phi ();
1482 ok = true;
1484 stmt_info = loop_vinfo->lookup_stmt (phi);
1485 if (dump_enabled_p ())
1486 dump_printf_loc (MSG_NOTE, vect_location, "examining phi: %G", phi);
1487 if (virtual_operand_p (gimple_phi_result (phi)))
1488 continue;
1490 /* Inner-loop loop-closed exit phi in outer-loop vectorization
1491 (i.e., a phi in the tail of the outer-loop). */
1492 if (! is_loop_header_bb_p (bb))
1494 /* FORNOW: we currently don't support the case that these phis
1495 are not used in the outerloop (unless it is double reduction,
1496 i.e., this phi is vect_reduction_def), because this case
1497 requires us to actually do something here. */
1498 if (STMT_VINFO_LIVE_P (stmt_info)
1499 && !vect_active_double_reduction_p (stmt_info))
1500 return opt_result::failure_at (phi,
1501 "Unsupported loop-closed phi"
1502 " in outer-loop.\n");
1504 /* If PHI is used in the outer loop, we check that its operand
1505 is defined in the inner loop. */
1506 if (STMT_VINFO_RELEVANT_P (stmt_info))
1508 tree phi_op;
1510 if (gimple_phi_num_args (phi) != 1)
1511 return opt_result::failure_at (phi, "unsupported phi");
1513 phi_op = PHI_ARG_DEF (phi, 0);
1514 stmt_vec_info op_def_info = loop_vinfo->lookup_def (phi_op);
1515 if (!op_def_info)
1516 return opt_result::failure_at (phi, "unsupported phi");
1518 if (STMT_VINFO_RELEVANT (op_def_info) != vect_used_in_outer
1519 && (STMT_VINFO_RELEVANT (op_def_info)
1520 != vect_used_in_outer_by_reduction))
1521 return opt_result::failure_at (phi, "unsupported phi");
1524 continue;
1527 gcc_assert (stmt_info);
1529 if ((STMT_VINFO_RELEVANT (stmt_info) == vect_used_in_scope
1530 || STMT_VINFO_LIVE_P (stmt_info))
1531 && STMT_VINFO_DEF_TYPE (stmt_info) != vect_induction_def)
1532 /* A scalar-dependence cycle that we don't support. */
1533 return opt_result::failure_at (phi,
1534 "not vectorized:"
1535 " scalar dependence cycle.\n");
1537 if (STMT_VINFO_RELEVANT_P (stmt_info))
1539 need_to_vectorize = true;
1540 if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_induction_def
1541 && ! PURE_SLP_STMT (stmt_info))
1542 ok = vectorizable_induction (stmt_info, NULL, NULL, NULL,
1543 &cost_vec);
1544 else if ((STMT_VINFO_DEF_TYPE (stmt_info) == vect_reduction_def
1545 || STMT_VINFO_DEF_TYPE (stmt_info) == vect_nested_cycle)
1546 && ! PURE_SLP_STMT (stmt_info))
1547 ok = vectorizable_reduction (stmt_info, NULL, NULL, NULL, NULL,
1548 &cost_vec);
1551 /* SLP PHIs are tested by vect_slp_analyze_node_operations. */
1552 if (ok
1553 && STMT_VINFO_LIVE_P (stmt_info)
1554 && !PURE_SLP_STMT (stmt_info))
1555 ok = vectorizable_live_operation (stmt_info, NULL, NULL, -1, NULL,
1556 &cost_vec);
1558 if (!ok)
1559 return opt_result::failure_at (phi,
1560 "not vectorized: relevant phi not "
1561 "supported: %G",
1562 static_cast <gimple *> (phi));
1565 for (gimple_stmt_iterator si = gsi_start_bb (bb); !gsi_end_p (si);
1566 gsi_next (&si))
1568 gimple *stmt = gsi_stmt (si);
1569 if (!gimple_clobber_p (stmt))
1571 opt_result res
1572 = vect_analyze_stmt (loop_vinfo->lookup_stmt (stmt),
1573 &need_to_vectorize,
1574 NULL, NULL, &cost_vec);
1575 if (!res)
1576 return res;
1579 } /* bbs */
1581 add_stmt_costs (loop_vinfo->target_cost_data, &cost_vec);
1582 cost_vec.release ();
1584 /* All operations in the loop are either irrelevant (deal with loop
1585 control, or dead), or only used outside the loop and can be moved
1586 out of the loop (e.g. invariants, inductions). The loop can be
1587 optimized away by scalar optimizations. We're better off not
1588 touching this loop. */
1589 if (!need_to_vectorize)
1591 if (dump_enabled_p ())
1592 dump_printf_loc (MSG_NOTE, vect_location,
1593 "All the computation can be taken out of the loop.\n");
1594 return opt_result::failure_at
1595 (vect_location,
1596 "not vectorized: redundant loop. no profit to vectorize.\n");
1599 return opt_result::success ();
1602 /* Analyze the cost of the loop described by LOOP_VINFO. Decide if it
1603 is worthwhile to vectorize. Return 1 if definitely yes, 0 if
1604 definitely no, or -1 if it's worth retrying. */
1606 static int
1607 vect_analyze_loop_costing (loop_vec_info loop_vinfo)
1609 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1610 unsigned int assumed_vf = vect_vf_for_cost (loop_vinfo);
1612 /* Only fully-masked loops can have iteration counts less than the
1613 vectorization factor. */
1614 if (!LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
1616 HOST_WIDE_INT max_niter;
1618 if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
1619 max_niter = LOOP_VINFO_INT_NITERS (loop_vinfo);
1620 else
1621 max_niter = max_stmt_executions_int (loop);
1623 if (max_niter != -1
1624 && (unsigned HOST_WIDE_INT) max_niter < assumed_vf)
1626 if (dump_enabled_p ())
1627 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1628 "not vectorized: iteration count smaller than "
1629 "vectorization factor.\n");
1630 return 0;
1634 int min_profitable_iters, min_profitable_estimate;
1635 vect_estimate_min_profitable_iters (loop_vinfo, &min_profitable_iters,
1636 &min_profitable_estimate);
1638 if (min_profitable_iters < 0)
1640 if (dump_enabled_p ())
1641 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1642 "not vectorized: vectorization not profitable.\n");
1643 if (dump_enabled_p ())
1644 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1645 "not vectorized: vector version will never be "
1646 "profitable.\n");
1647 return -1;
1650 int min_scalar_loop_bound = (PARAM_VALUE (PARAM_MIN_VECT_LOOP_BOUND)
1651 * assumed_vf);
1653 /* Use the cost model only if it is more conservative than the user-specified
1654 threshold. */
1655 unsigned int th = (unsigned) MAX (min_scalar_loop_bound,
1656 min_profitable_iters);
1658 LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo) = th;
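  /* For instance (illustrative numbers only): with
     --param min-vect-loop-bound=2 and an assumed VF of 4,
     min_scalar_loop_bound is 8; if the cost model computed a
     min_profitable_iters of 12, TH is 12 and a loop known to run only
     10 iterations is rejected below.  */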
1660 if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
1661 && LOOP_VINFO_INT_NITERS (loop_vinfo) < th)
1663 if (dump_enabled_p ())
1664 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1665 "not vectorized: vectorization not profitable.\n");
1666 if (dump_enabled_p ())
1667 dump_printf_loc (MSG_NOTE, vect_location,
1668 "not vectorized: iteration count smaller than user "
1669 "specified loop bound parameter or minimum profitable "
1670 "iterations (whichever is more conservative).\n");
1671 return 0;
1674 HOST_WIDE_INT estimated_niter = estimated_stmt_executions_int (loop);
1675 if (estimated_niter == -1)
1676 estimated_niter = likely_max_stmt_executions_int (loop);
1677 if (estimated_niter != -1
1678 && ((unsigned HOST_WIDE_INT) estimated_niter
1679 < MAX (th, (unsigned) min_profitable_estimate)))
1681 if (dump_enabled_p ())
1682 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1683 "not vectorized: estimated iteration count too "
1684 "small.\n");
1685 if (dump_enabled_p ())
1686 dump_printf_loc (MSG_NOTE, vect_location,
1687 "not vectorized: estimated iteration count smaller "
1688 "than specified loop bound parameter or minimum "
1689 "profitable iterations (whichever is more "
1690 "conservative).\n");
1691 return -1;
1694 return 1;
1697 static opt_result
1698 vect_get_datarefs_in_loop (loop_p loop, basic_block *bbs,
1699 vec<data_reference_p> *datarefs,
1700 unsigned int *n_stmts)
1702 *n_stmts = 0;
1703 for (unsigned i = 0; i < loop->num_nodes; i++)
1704 for (gimple_stmt_iterator gsi = gsi_start_bb (bbs[i]);
1705 !gsi_end_p (gsi); gsi_next (&gsi))
1707 gimple *stmt = gsi_stmt (gsi);
1708 if (is_gimple_debug (stmt))
1709 continue;
1710 ++(*n_stmts);
1711 opt_result res = vect_find_stmt_data_reference (loop, stmt, datarefs);
1712 if (!res)
1714 if (is_gimple_call (stmt) && loop->safelen)
1716 tree fndecl = gimple_call_fndecl (stmt), op;
1717 if (fndecl != NULL_TREE)
1719 cgraph_node *node = cgraph_node::get (fndecl);
1720 if (node != NULL && node->simd_clones != NULL)
1722 unsigned int j, n = gimple_call_num_args (stmt);
1723 for (j = 0; j < n; j++)
1725 op = gimple_call_arg (stmt, j);
1726 if (DECL_P (op)
1727 || (REFERENCE_CLASS_P (op)
1728 && get_base_address (op)))
1729 break;
1731 op = gimple_call_lhs (stmt);
1732 /* Ignore #pragma omp declare simd functions
1733 if they don't have data references in the
1734 call stmt itself. */
1735 if (j == n
1736 && !(op
1737 && (DECL_P (op)
1738 || (REFERENCE_CLASS_P (op)
1739 && get_base_address (op)))))
1740 continue;
1744 return res;
1746 /* If dependence analysis will give up due to the limit on the
1747 number of datarefs stop here and fail fatally. */
1748 if (datarefs->length ()
1749 > (unsigned)PARAM_VALUE (PARAM_LOOP_MAX_DATAREFS_FOR_DATADEPS))
1750 return opt_result::failure_at (stmt, "exceeded param "
1751 "loop-max-datarefs-for-datadeps\n");
1753 return opt_result::success ();
1756 /* Function vect_analyze_loop_2.
1758 Apply a set of analyses on LOOP, and create a loop_vec_info struct
1759 for it. The different analyses will record information in the
1760 loop_vec_info struct. */
1761 static opt_result
1762 vect_analyze_loop_2 (loop_vec_info loop_vinfo, bool &fatal, unsigned *n_stmts)
1764 opt_result ok = opt_result::success ();
1765 int res;
1766 unsigned int max_vf = MAX_VECTORIZATION_FACTOR;
1767 poly_uint64 min_vf = 2;
1769 /* The first group of checks is independent of the vector size. */
1770 fatal = true;
1772 /* Find all data references in the loop (which correspond to vdefs/vuses)
1773 and analyze their evolution in the loop. */
1775 loop_p loop = LOOP_VINFO_LOOP (loop_vinfo);
1777 /* Gather the data references and count stmts in the loop. */
1778 if (!LOOP_VINFO_DATAREFS (loop_vinfo).exists ())
1780 opt_result res
1781 = vect_get_datarefs_in_loop (loop, LOOP_VINFO_BBS (loop_vinfo),
1782 &LOOP_VINFO_DATAREFS (loop_vinfo),
1783 n_stmts);
1784 if (!res)
1786 if (dump_enabled_p ())
1787 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1788 "not vectorized: loop contains function "
1789 "calls or data references that cannot "
1790 "be analyzed\n");
1791 return res;
1793 loop_vinfo->shared->save_datarefs ();
1795 else
1796 loop_vinfo->shared->check_datarefs ();
1798 /* Analyze the data references and also adjust the minimal
1799 vectorization factor according to the loads and stores. */
1801 ok = vect_analyze_data_refs (loop_vinfo, &min_vf);
1802 if (!ok)
1804 if (dump_enabled_p ())
1805 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1806 "bad data references.\n");
1807 return ok;
1810 /* Classify all cross-iteration scalar data-flow cycles.
1811 Cross-iteration cycles caused by virtual phis are analyzed separately. */
1812 vect_analyze_scalar_cycles (loop_vinfo);
1814 vect_pattern_recog (loop_vinfo);
1816 vect_fixup_scalar_cycles_with_patterns (loop_vinfo);
1818 /* Analyze the access patterns of the data-refs in the loop (consecutive,
1819 complex, etc.). FORNOW: Only handle consecutive access pattern. */
1821 ok = vect_analyze_data_ref_accesses (loop_vinfo);
1822 if (!ok)
1824 if (dump_enabled_p ())
1825 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1826 "bad data access.\n");
1827 return ok;
1830 /* Data-flow analysis to detect stmts that do not need to be vectorized. */
1832 ok = vect_mark_stmts_to_be_vectorized (loop_vinfo);
1833 if (!ok)
1835 if (dump_enabled_p ())
1836 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1837 "unexpected pattern.\n");
1838 return ok;
1841 /* The rest of the analysis below depends on the vector size in some way,
so failures from here on are not necessarily fatal. */
1842 fatal = false;
1844 /* Analyze data dependences between the data-refs in the loop
1845 and adjust the maximum vectorization factor according to
1846 the dependences.
1847 FORNOW: fail at the first data dependence that we encounter. */
1849 ok = vect_analyze_data_ref_dependences (loop_vinfo, &max_vf);
1850 if (!ok)
1852 if (dump_enabled_p ())
1853 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1854 "bad data dependence.\n");
1855 return ok;
1857 if (max_vf != MAX_VECTORIZATION_FACTOR
1858 && maybe_lt (max_vf, min_vf))
1859 return opt_result::failure_at (vect_location, "bad data dependence.\n");
1860 LOOP_VINFO_MAX_VECT_FACTOR (loop_vinfo) = max_vf;
1862 ok = vect_determine_vectorization_factor (loop_vinfo);
1863 if (!ok)
1865 if (dump_enabled_p ())
1866 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1867 "can't determine vectorization factor.\n");
1868 return ok;
1870 if (max_vf != MAX_VECTORIZATION_FACTOR
1871 && maybe_lt (max_vf, LOOP_VINFO_VECT_FACTOR (loop_vinfo)))
1872 return opt_result::failure_at (vect_location, "bad data dependence.\n");
1874 /* Compute the scalar iteration cost. */
1875 vect_compute_single_scalar_iteration_cost (loop_vinfo);
1877 poly_uint64 saved_vectorization_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
1878 unsigned th;
1880 /* Check the SLP opportunities in the loop, analyze and build SLP trees. */
1881 ok = vect_analyze_slp (loop_vinfo, *n_stmts);
1882 if (!ok)
1883 return ok;
1885 /* If there are any SLP instances mark them as pure_slp. */
1886 bool slp = vect_make_slp_decision (loop_vinfo);
1887 if (slp)
1889 /* Find stmts that need to be both vectorized and SLPed. */
1890 vect_detect_hybrid_slp (loop_vinfo);
1892 /* Update the vectorization factor based on the SLP decision. */
1893 vect_update_vf_for_slp (loop_vinfo);
1896 bool saved_can_fully_mask_p = LOOP_VINFO_CAN_FULLY_MASK_P (loop_vinfo);
1898 /* We don't expect to have to roll back to anything other than an empty
1899 set of rgroups. */
1900 gcc_assert (LOOP_VINFO_MASKS (loop_vinfo).is_empty ());
1902 /* This is the point where we can re-start analysis with SLP forced off. */
1903 start_over:
1905 /* Now the vectorization factor is final. */
1906 poly_uint64 vectorization_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
1907 gcc_assert (known_ne (vectorization_factor, 0U));
1909 if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo) && dump_enabled_p ())
1911 dump_printf_loc (MSG_NOTE, vect_location,
1912 "vectorization_factor = ");
1913 dump_dec (MSG_NOTE, vectorization_factor);
1914 dump_printf (MSG_NOTE, ", niters = %wd\n",
1915 LOOP_VINFO_INT_NITERS (loop_vinfo));
1918 HOST_WIDE_INT max_niter
1919 = likely_max_stmt_executions_int (LOOP_VINFO_LOOP (loop_vinfo));
1921 /* Analyze the alignment of the data-refs in the loop.
1922 Fail if a data reference is found that cannot be vectorized. */
1924 ok = vect_analyze_data_refs_alignment (loop_vinfo);
1925 if (!ok)
1927 if (dump_enabled_p ())
1928 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1929 "bad data alignment.\n");
1930 return ok;
1933 /* Prune the list of ddrs to be tested at run-time by versioning for alias.
1934 It is important to call pruning after vect_analyze_data_ref_accesses,
1935 since we use grouping information gathered by interleaving analysis. */
1936 ok = vect_prune_runtime_alias_test_list (loop_vinfo);
1937 if (!ok)
1938 return ok;
1940 /* Do not invoke vect_enhance_data_refs_alignment for epilogue
1941 vectorization, since we do not want to add extra peeling or
1942 versioning for alignment. */
1943 if (!LOOP_VINFO_EPILOGUE_P (loop_vinfo))
1944 /* This pass will decide on using loop versioning and/or loop peeling in
1945 order to enhance the alignment of data references in the loop. */
1946 ok = vect_enhance_data_refs_alignment (loop_vinfo);
1947 else
1948 ok = vect_verify_datarefs_alignment (loop_vinfo);
1949 if (!ok)
1950 return ok;
1952 if (slp)
1954 /* Analyze operations in the SLP instances. Note this may
1955 remove unsupported SLP instances which makes the above
1956 SLP kind detection invalid. */
1957 unsigned old_size = LOOP_VINFO_SLP_INSTANCES (loop_vinfo).length ();
1958 vect_slp_analyze_operations (loop_vinfo);
1959 if (LOOP_VINFO_SLP_INSTANCES (loop_vinfo).length () != old_size)
1961 ok = opt_result::failure_at (vect_location,
1962 "unsupported SLP instances\n");
1963 goto again;
1967 /* Scan all the remaining operations in the loop that are not subject
1968 to SLP and make sure they are vectorizable. */
1969 ok = vect_analyze_loop_operations (loop_vinfo);
1970 if (!ok)
1972 if (dump_enabled_p ())
1973 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1974 "bad operation or unsupported loop bound.\n");
1975 return ok;
1978 /* Decide whether to use a fully-masked loop for this vectorization
1979 factor. */
1980 LOOP_VINFO_FULLY_MASKED_P (loop_vinfo)
1981 = (LOOP_VINFO_CAN_FULLY_MASK_P (loop_vinfo)
1982 && vect_verify_full_masking (loop_vinfo));
1983 if (dump_enabled_p ())
1985 if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
1986 dump_printf_loc (MSG_NOTE, vect_location,
1987 "using a fully-masked loop.\n");
1988 else
1989 dump_printf_loc (MSG_NOTE, vect_location,
1990 "not using a fully-masked loop.\n");
1993 /* If an epilogue loop is required because of data accesses with gaps,
1994 one additional iteration needs to be peeled. Check if there are
1995 enough iterations for vectorization. */
1996 if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo)
1997 && LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
1998 && !LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
2000 poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
2001 tree scalar_niters = LOOP_VINFO_NITERSM1 (loop_vinfo);
2003 if (known_lt (wi::to_widest (scalar_niters), vf))
2004 return opt_result::failure_at (vect_location,
2005 "loop has no enough iterations to"
2006 " support peeling for gaps.\n");
2009 /* Check that the costings of the loop make vectorizing worthwhile. */
2010 res = vect_analyze_loop_costing (loop_vinfo);
2011 if (res < 0)
2013 ok = opt_result::failure_at (vect_location,
2014 "Loop costings may not be worthwhile.\n");
2015 goto again;
2017 if (!res)
2018 return opt_result::failure_at (vect_location,
2019 "Loop costings not worthwhile.\n");
2021 /* Decide whether we need to create an epilogue loop to handle
2022 remaining scalar iterations. */
2023 th = LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo);
2025 unsigned HOST_WIDE_INT const_vf;
2026 if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
2027 /* The main loop handles all iterations. */
2028 LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo) = false;
2029 else if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
2030 && LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) >= 0)
2032 /* Work out the (constant) number of iterations that need to be
2033 peeled for reasons other than niters. */
2034 unsigned int peel_niter = LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo);
2035 if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo))
2036 peel_niter += 1;
2037 if (!multiple_p (LOOP_VINFO_INT_NITERS (loop_vinfo) - peel_niter,
2038 LOOP_VINFO_VECT_FACTOR (loop_vinfo)))
2039 LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo) = true;
2041 else if (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo)
2042 /* ??? When peeling for gaps but not alignment, we could
2043 try to check whether the (variable) niters is known to be
2044 VF * N + 1. That's something of a niche case though. */
2045 || LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo)
2046 || !LOOP_VINFO_VECT_FACTOR (loop_vinfo).is_constant (&const_vf)
2047 || ((tree_ctz (LOOP_VINFO_NITERS (loop_vinfo))
2048 < (unsigned) exact_log2 (const_vf))
2049 /* In case of versioning, check if the maximum number of
2050 iterations is greater than th. If they are identical,
2051 the epilogue is unnecessary. */
2052 && (!LOOP_REQUIRES_VERSIONING (loop_vinfo)
2053 || ((unsigned HOST_WIDE_INT) max_niter
2054 > (th / const_vf) * const_vf))))
2055 LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo) = true;
2057 /* If an epilogue loop is required make sure we can create one. */
2058 if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo)
2059 || LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo))
2061 if (dump_enabled_p ())
2062 dump_printf_loc (MSG_NOTE, vect_location, "epilog loop required\n");
2063 if (!vect_can_advance_ivs_p (loop_vinfo)
2064 || !slpeel_can_duplicate_loop_p (LOOP_VINFO_LOOP (loop_vinfo),
2065 single_exit (LOOP_VINFO_LOOP
2066 (loop_vinfo))))
2068 ok = opt_result::failure_at (vect_location,
2069 "not vectorized: can't create required "
2070 "epilog loop\n");
2071 goto again;
2075 /* During peeling, we need to check if the number of loop iterations is
2076 enough for both the peeled prolog loop and the vector loop. This check
2077 can be merged with the threshold check of loop versioning, so
2078 increase the threshold for this case if necessary. */
2079 if (LOOP_REQUIRES_VERSIONING (loop_vinfo))
2081 poly_uint64 niters_th = 0;
2083 if (!vect_use_loop_mask_for_alignment_p (loop_vinfo))
2085 /* Niters for peeled prolog loop. */
2086 if (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) < 0)
2088 dr_vec_info *dr_info = LOOP_VINFO_UNALIGNED_DR (loop_vinfo);
2089 tree vectype = STMT_VINFO_VECTYPE (dr_info->stmt);
2090 niters_th += TYPE_VECTOR_SUBPARTS (vectype) - 1;
2092 else
2093 niters_th += LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo);
2096 /* Niters for at least one iteration of vectorized loop. */
2097 if (!LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
2098 niters_th += LOOP_VINFO_VECT_FACTOR (loop_vinfo);
2099 /* One additional iteration because of peeling for gaps. */
2100 if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo))
2101 niters_th += 1;
2102 LOOP_VINFO_VERSIONING_THRESHOLD (loop_vinfo) = niters_th;
2105 gcc_assert (known_eq (vectorization_factor,
2106 LOOP_VINFO_VECT_FACTOR (loop_vinfo)));
2108 /* Ok to vectorize! */
2109 return opt_result::success ();
2111 again:
2112 /* Ensure that "ok" is false (with an opt_problem if dumping is enabled). */
2113 gcc_assert (!ok);
2115 /* Try again with SLP forced off but if we didn't do any SLP there is
2116 no point in re-trying. */
2117 if (!slp)
2118 return ok;
2120 /* If there are reduction chains re-trying will fail anyway. */
2121 if (! LOOP_VINFO_REDUCTION_CHAINS (loop_vinfo).is_empty ())
2122 return ok;
2124 /* Likewise if the grouped loads or stores in the SLP cannot be handled
2125 via interleaving or lane instructions. */
2126 slp_instance instance;
2127 slp_tree node;
2128 unsigned i, j;
2129 FOR_EACH_VEC_ELT (LOOP_VINFO_SLP_INSTANCES (loop_vinfo), i, instance)
2131 stmt_vec_info vinfo;
2132 vinfo = SLP_TREE_SCALAR_STMTS (SLP_INSTANCE_TREE (instance))[0];
2133 if (! STMT_VINFO_GROUPED_ACCESS (vinfo))
2134 continue;
2135 vinfo = DR_GROUP_FIRST_ELEMENT (vinfo);
2136 unsigned int size = DR_GROUP_SIZE (vinfo);
2137 tree vectype = STMT_VINFO_VECTYPE (vinfo);
2138 if (! vect_store_lanes_supported (vectype, size, false)
2139 && ! known_eq (TYPE_VECTOR_SUBPARTS (vectype), 1U)
2140 && ! vect_grouped_store_supported (vectype, size))
2141 return opt_result::failure_at (vinfo->stmt,
2142 "unsupported grouped store\n");
2143 FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (instance), j, node)
2145 vinfo = SLP_TREE_SCALAR_STMTS (node)[0];
2146 vinfo = DR_GROUP_FIRST_ELEMENT (vinfo);
2147 bool single_element_p = !DR_GROUP_NEXT_ELEMENT (vinfo);
2148 size = DR_GROUP_SIZE (vinfo);
2149 vectype = STMT_VINFO_VECTYPE (vinfo);
2150 if (! vect_load_lanes_supported (vectype, size, false)
2151 && ! vect_grouped_load_supported (vectype, single_element_p,
2152 size))
2153 return opt_result::failure_at (vinfo->stmt,
2154 "unsupported grouped load\n");
2158 if (dump_enabled_p ())
2159 dump_printf_loc (MSG_NOTE, vect_location,
2160 "re-trying with SLP disabled\n");
2162 /* Roll back state appropriately. No SLP this time. */
2163 slp = false;
2164 /* Restore the vectorization factor as it was without SLP. */
2165 LOOP_VINFO_VECT_FACTOR (loop_vinfo) = saved_vectorization_factor;
2166 /* Free the SLP instances. */
2167 FOR_EACH_VEC_ELT (LOOP_VINFO_SLP_INSTANCES (loop_vinfo), j, instance)
2168 vect_free_slp_instance (instance, false);
2169 LOOP_VINFO_SLP_INSTANCES (loop_vinfo).release ();
2170 /* Reset SLP type to loop_vect on all stmts. */
2171 for (i = 0; i < LOOP_VINFO_LOOP (loop_vinfo)->num_nodes; ++i)
2173 basic_block bb = LOOP_VINFO_BBS (loop_vinfo)[i];
2174 for (gimple_stmt_iterator si = gsi_start_phis (bb);
2175 !gsi_end_p (si); gsi_next (&si))
2177 stmt_vec_info stmt_info = loop_vinfo->lookup_stmt (gsi_stmt (si));
2178 STMT_SLP_TYPE (stmt_info) = loop_vect;
2180 for (gimple_stmt_iterator si = gsi_start_bb (bb);
2181 !gsi_end_p (si); gsi_next (&si))
2183 stmt_vec_info stmt_info = loop_vinfo->lookup_stmt (gsi_stmt (si));
2184 STMT_SLP_TYPE (stmt_info) = loop_vect;
2185 if (STMT_VINFO_IN_PATTERN_P (stmt_info))
2187 gimple *pattern_def_seq = STMT_VINFO_PATTERN_DEF_SEQ (stmt_info);
2188 stmt_info = STMT_VINFO_RELATED_STMT (stmt_info);
2189 STMT_SLP_TYPE (stmt_info) = loop_vect;
2190 for (gimple_stmt_iterator pi = gsi_start (pattern_def_seq);
2191 !gsi_end_p (pi); gsi_next (&pi))
2192 STMT_SLP_TYPE (loop_vinfo->lookup_stmt (gsi_stmt (pi)))
2193 = loop_vect;
2197 /* Free optimized alias test DDRS. */
2198 LOOP_VINFO_LOWER_BOUNDS (loop_vinfo).truncate (0);
2199 LOOP_VINFO_COMP_ALIAS_DDRS (loop_vinfo).release ();
2200 LOOP_VINFO_CHECK_UNEQUAL_ADDRS (loop_vinfo).release ();
2201 /* Reset target cost data. */
2202 destroy_cost_data (LOOP_VINFO_TARGET_COST_DATA (loop_vinfo));
2203 LOOP_VINFO_TARGET_COST_DATA (loop_vinfo)
2204 = init_cost (LOOP_VINFO_LOOP (loop_vinfo));
2205 /* Reset accumulated rgroup information. */
2206 release_vec_loop_masks (&LOOP_VINFO_MASKS (loop_vinfo));
2207 /* Reset assorted flags. */
2208 LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo) = false;
2209 LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo) = false;
2210 LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo) = 0;
2211 LOOP_VINFO_VERSIONING_THRESHOLD (loop_vinfo) = 0;
2212 LOOP_VINFO_CAN_FULLY_MASK_P (loop_vinfo) = saved_can_fully_mask_p;
2214 goto start_over;
2217 /* Function vect_analyze_loop.
2219 Apply a set of analyses on LOOP, and create a loop_vec_info struct
2220 for it. The different analyses will record information in the
2221 loop_vec_info struct. If ORIG_LOOP_VINFO is not NULL, an epilogue must
2222 be vectorized. */
2223 opt_loop_vec_info
2224 vect_analyze_loop (struct loop *loop, loop_vec_info orig_loop_vinfo,
2225 vec_info_shared *shared)
2227 auto_vector_sizes vector_sizes;
2229 /* Autodetect first vector size we try. */
2230 current_vector_size = 0;
2231 targetm.vectorize.autovectorize_vector_sizes (&vector_sizes);
2232 unsigned int next_size = 0;
2234 DUMP_VECT_SCOPE ("analyze_loop_nest");
2236 if (loop_outer (loop)
2237 && loop_vec_info_for_loop (loop_outer (loop))
2238 && LOOP_VINFO_VECTORIZABLE_P (loop_vec_info_for_loop (loop_outer (loop))))
2239 return opt_loop_vec_info::failure_at (vect_location,
2240 "outer-loop already vectorized.\n");
2242 if (!find_loop_nest (loop, &shared->loop_nest))
2243 return opt_loop_vec_info::failure_at
2244 (vect_location,
2245 "not vectorized: loop nest containing two or more consecutive inner"
2246 " loops cannot be vectorized\n");
2248 unsigned n_stmts = 0;
2249 poly_uint64 autodetected_vector_size = 0;
2250 while (1)
2252 /* Check the CFG characteristics of the loop (nesting, entry/exit). */
2253 opt_loop_vec_info loop_vinfo
2254 = vect_analyze_loop_form (loop, shared);
2255 if (!loop_vinfo)
2257 if (dump_enabled_p ())
2258 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2259 "bad loop form.\n");
2260 return loop_vinfo;
2263 bool fatal = false;
2265 if (orig_loop_vinfo)
2266 LOOP_VINFO_ORIG_LOOP_INFO (loop_vinfo) = orig_loop_vinfo;
2268 opt_result res = vect_analyze_loop_2 (loop_vinfo, fatal, &n_stmts);
2269 if (res)
2271 LOOP_VINFO_VECTORIZABLE_P (loop_vinfo) = 1;
2273 return loop_vinfo;
2276 delete loop_vinfo;
2278 if (next_size == 0)
2279 autodetected_vector_size = current_vector_size;
2281 if (next_size < vector_sizes.length ()
2282 && known_eq (vector_sizes[next_size], autodetected_vector_size))
2283 next_size += 1;
2285 if (fatal
2286 || next_size == vector_sizes.length ()
2287 || known_eq (current_vector_size, 0U))
2288 return opt_loop_vec_info::propagate_failure (res);
2290 /* Try the next biggest vector size. */
2291 current_vector_size = vector_sizes[next_size++];
2292 if (dump_enabled_p ())
2294 dump_printf_loc (MSG_NOTE, vect_location,
2295 "***** Re-trying analysis with "
2296 "vector size ");
2297 dump_dec (MSG_NOTE, current_vector_size);
2298 dump_printf (MSG_NOTE, "\n");
2303 /* Return true if there is an in-order reduction function for CODE, storing
2304 it in *REDUC_FN if so. */
2306 static bool
2307 fold_left_reduction_fn (tree_code code, internal_fn *reduc_fn)
2309 switch (code)
2311 case PLUS_EXPR:
2312 *reduc_fn = IFN_FOLD_LEFT_PLUS;
2313 return true;
2315 default:
2316 return false;
2320 /* Function reduction_fn_for_scalar_code
2322 Input:
2323 CODE - tree_code of a reduction operation.
2325 Output:
2326 REDUC_FN - the corresponding internal function to be used to reduce the
2327 vector of partial results into a single scalar result, or IFN_LAST
2328 if the operation is a supported reduction operation, but does not have
2329 such an internal function.
2331 Return FALSE if CODE currently cannot be vectorized as a reduction. */
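/* Illustrative usage sketch (added for exposition, an assumption rather than
   part of the original sources): a caller would typically do

     internal_fn reduc_fn;
     if (reduction_fn_for_scalar_code (MAX_EXPR, &reduc_fn))
       use reduc_fn, here IFN_REDUC_MAX; a value of IFN_LAST means the
       reduction is supported but has no single-call internal function.  */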
2333 static bool
2334 reduction_fn_for_scalar_code (enum tree_code code, internal_fn *reduc_fn)
2336 switch (code)
2338 case MAX_EXPR:
2339 *reduc_fn = IFN_REDUC_MAX;
2340 return true;
2342 case MIN_EXPR:
2343 *reduc_fn = IFN_REDUC_MIN;
2344 return true;
2346 case PLUS_EXPR:
2347 *reduc_fn = IFN_REDUC_PLUS;
2348 return true;
2350 case BIT_AND_EXPR:
2351 *reduc_fn = IFN_REDUC_AND;
2352 return true;
2354 case BIT_IOR_EXPR:
2355 *reduc_fn = IFN_REDUC_IOR;
2356 return true;
2358 case BIT_XOR_EXPR:
2359 *reduc_fn = IFN_REDUC_XOR;
2360 return true;
2362 case MULT_EXPR:
2363 case MINUS_EXPR:
2364 *reduc_fn = IFN_LAST;
2365 return true;
2367 default:
2368 return false;
2372 /* If there is a neutral value X such that SLP reduction NODE would not
2373 be affected by the introduction of additional X elements, return that X,
2374 otherwise return null. CODE is the code of the reduction. REDUC_CHAIN
2375 is true if the SLP statements perform a single reduction, false if each
2376 statement performs an independent reduction. */
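/* For example (illustrative): padding a PLUS_EXPR SLP reduction group
   { a0, a1, a2 } with the neutral value 0 up to the vector width leaves the
   final sum unchanged (a0 + a1 + a2 + 0 == a0 + a1 + a2); likewise 1 is
   neutral for MULT_EXPR and an all-ones constant for BIT_AND_EXPR.  */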
2378 static tree
2379 neutral_op_for_slp_reduction (slp_tree slp_node, tree_code code,
2380 bool reduc_chain)
2382 vec<stmt_vec_info> stmts = SLP_TREE_SCALAR_STMTS (slp_node);
2383 stmt_vec_info stmt_vinfo = stmts[0];
2384 tree vector_type = STMT_VINFO_VECTYPE (stmt_vinfo);
2385 tree scalar_type = TREE_TYPE (vector_type);
2386 struct loop *loop = gimple_bb (stmt_vinfo->stmt)->loop_father;
2387 gcc_assert (loop);
2389 switch (code)
2391 case WIDEN_SUM_EXPR:
2392 case DOT_PROD_EXPR:
2393 case SAD_EXPR:
2394 case PLUS_EXPR:
2395 case MINUS_EXPR:
2396 case BIT_IOR_EXPR:
2397 case BIT_XOR_EXPR:
2398 return build_zero_cst (scalar_type);
2400 case MULT_EXPR:
2401 return build_one_cst (scalar_type);
2403 case BIT_AND_EXPR:
2404 return build_all_ones_cst (scalar_type);
2406 case MAX_EXPR:
2407 case MIN_EXPR:
2408 /* For MIN/MAX the initial values are neutral. A reduction chain
2409 has only a single initial value, so that value is neutral for
2410 all statements. */
2411 if (reduc_chain)
2412 return PHI_ARG_DEF_FROM_EDGE (stmt_vinfo->stmt,
2413 loop_preheader_edge (loop));
2414 return NULL_TREE;
2416 default:
2417 return NULL_TREE;
2421 /* Error reporting helper for vect_is_simple_reduction below. GIMPLE statement
2422 STMT is printed with a message MSG. */
2424 static void
2425 report_vect_op (dump_flags_t msg_type, gimple *stmt, const char *msg)
2427 dump_printf_loc (msg_type, vect_location, "%s%G", msg, stmt);
2430 /* DEF_STMT_INFO occurs in a loop that contains a potential reduction
2431 operation. Return true if the results of DEF_STMT_INFO are something
2432 that can be accumulated by such a reduction. */
2434 static bool
2435 vect_valid_reduction_input_p (stmt_vec_info def_stmt_info)
2437 return (is_gimple_assign (def_stmt_info->stmt)
2438 || is_gimple_call (def_stmt_info->stmt)
2439 || STMT_VINFO_DEF_TYPE (def_stmt_info) == vect_induction_def
2440 || (gimple_code (def_stmt_info->stmt) == GIMPLE_PHI
2441 && STMT_VINFO_DEF_TYPE (def_stmt_info) == vect_internal_def
2442 && !is_loop_header_bb_p (gimple_bb (def_stmt_info->stmt))));
2445 /* Detect SLP reduction of the form:
2447 #a1 = phi <a5, a0>
2448 a2 = operation (a1)
2449 a3 = operation (a2)
2450 a4 = operation (a3)
2451 a5 = operation (a4)
2453 #a = phi <a5>
2455 PHI is the reduction phi node (#a1 = phi <a5, a0> above)
2456 FIRST_STMT is the first reduction stmt in the chain
2457 (a2 = operation (a1)).
2459 Return TRUE if a reduction chain was detected. */
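/* For illustration (an assumed source form, not taken from the testsuite),
   such a chain typically comes from a loop that accumulates several elements
   per iteration with the same operation, e.g.

     for (i = 0; i < n; i++)
       {
         s = s + a[4*i];
         s = s + a[4*i + 1];
         s = s + a[4*i + 2];
         s = s + a[4*i + 3];
       }

   which yields four chained reduction statements per iteration.  */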
2461 static bool
2462 vect_is_slp_reduction (loop_vec_info loop_info, gimple *phi,
2463 gimple *first_stmt)
2465 struct loop *loop = (gimple_bb (phi))->loop_father;
2466 struct loop *vect_loop = LOOP_VINFO_LOOP (loop_info);
2467 enum tree_code code;
2468 gimple *loop_use_stmt = NULL;
2469 stmt_vec_info use_stmt_info, current_stmt_info = NULL;
2470 tree lhs;
2471 imm_use_iterator imm_iter;
2472 use_operand_p use_p;
2473 int nloop_uses, size = 0, n_out_of_loop_uses;
2474 bool found = false;
2476 if (loop != vect_loop)
2477 return false;
2479 lhs = PHI_RESULT (phi);
2480 code = gimple_assign_rhs_code (first_stmt);
2481 while (1)
2483 nloop_uses = 0;
2484 n_out_of_loop_uses = 0;
2485 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, lhs)
2487 gimple *use_stmt = USE_STMT (use_p);
2488 if (is_gimple_debug (use_stmt))
2489 continue;
2491 /* Check if we got back to the reduction phi. */
2492 if (use_stmt == phi)
2494 loop_use_stmt = use_stmt;
2495 found = true;
2496 break;
2499 if (flow_bb_inside_loop_p (loop, gimple_bb (use_stmt)))
2501 loop_use_stmt = use_stmt;
2502 nloop_uses++;
2504 else
2505 n_out_of_loop_uses++;
2507 /* There can be either a single use in the loop or two uses in
2508 phi nodes. */
2509 if (nloop_uses > 1 || (n_out_of_loop_uses && nloop_uses))
2510 return false;
2513 if (found)
2514 break;
2516 /* We reached a statement with no loop uses. */
2517 if (nloop_uses == 0)
2518 return false;
2520 /* This is a loop exit phi, and we haven't reached the reduction phi. */
2521 if (gimple_code (loop_use_stmt) == GIMPLE_PHI)
2522 return false;
2524 if (!is_gimple_assign (loop_use_stmt)
2525 || code != gimple_assign_rhs_code (loop_use_stmt)
2526 || !flow_bb_inside_loop_p (loop, gimple_bb (loop_use_stmt)))
2527 return false;
2529 /* Insert USE_STMT into reduction chain. */
2530 use_stmt_info = loop_info->lookup_stmt (loop_use_stmt);
2531 if (current_stmt_info)
2533 REDUC_GROUP_NEXT_ELEMENT (current_stmt_info) = use_stmt_info;
2534 REDUC_GROUP_FIRST_ELEMENT (use_stmt_info)
2535 = REDUC_GROUP_FIRST_ELEMENT (current_stmt_info);
2537 else
2538 REDUC_GROUP_FIRST_ELEMENT (use_stmt_info) = use_stmt_info;
2540 lhs = gimple_assign_lhs (loop_use_stmt);
2541 current_stmt_info = use_stmt_info;
2542 size++;
2545 if (!found || loop_use_stmt != phi || size < 2)
2546 return false;
2548 /* Swap the operands, if needed, to make the reduction operand be the second
2549 operand. */
2550 lhs = PHI_RESULT (phi);
2551 stmt_vec_info next_stmt_info = REDUC_GROUP_FIRST_ELEMENT (current_stmt_info);
2552 while (next_stmt_info)
2554 gassign *next_stmt = as_a <gassign *> (next_stmt_info->stmt);
2555 if (gimple_assign_rhs2 (next_stmt) == lhs)
2557 tree op = gimple_assign_rhs1 (next_stmt);
2558 stmt_vec_info def_stmt_info = loop_info->lookup_def (op);
2560 /* Check that the other def is either defined in the loop
2561 ("vect_internal_def"), or it's an induction (defined by a
2562 loop-header phi-node). */
2563 if (def_stmt_info
2564 && flow_bb_inside_loop_p (loop, gimple_bb (def_stmt_info->stmt))
2565 && vect_valid_reduction_input_p (def_stmt_info))
2567 lhs = gimple_assign_lhs (next_stmt);
2568 next_stmt_info = REDUC_GROUP_NEXT_ELEMENT (next_stmt_info);
2569 continue;
2572 return false;
2574 else
2576 tree op = gimple_assign_rhs2 (next_stmt);
2577 stmt_vec_info def_stmt_info = loop_info->lookup_def (op);
2579 /* Check that the other def is either defined in the loop
2580 ("vect_internal_def"), or it's an induction (defined by a
2581 loop-header phi-node). */
2582 if (def_stmt_info
2583 && flow_bb_inside_loop_p (loop, gimple_bb (def_stmt_info->stmt))
2584 && vect_valid_reduction_input_p (def_stmt_info))
2586 if (dump_enabled_p ())
2587 dump_printf_loc (MSG_NOTE, vect_location, "swapping oprnds: %G",
2588 next_stmt);
2590 swap_ssa_operands (next_stmt,
2591 gimple_assign_rhs1_ptr (next_stmt),
2592 gimple_assign_rhs2_ptr (next_stmt));
2593 update_stmt (next_stmt);
2595 if (CONSTANT_CLASS_P (gimple_assign_rhs1 (next_stmt)))
2596 LOOP_VINFO_OPERANDS_SWAPPED (loop_info) = true;
2598 else
2599 return false;
2602 lhs = gimple_assign_lhs (next_stmt);
2603 next_stmt_info = REDUC_GROUP_NEXT_ELEMENT (next_stmt_info);
2606 /* Save the chain for further analysis in SLP detection. */
2607 stmt_vec_info first_stmt_info
2608 = REDUC_GROUP_FIRST_ELEMENT (current_stmt_info);
2609 LOOP_VINFO_REDUCTION_CHAINS (loop_info).safe_push (first_stmt_info);
2610 REDUC_GROUP_SIZE (first_stmt_info) = size;
2612 return true;
2615 /* Return true if we need an in-order reduction for operation CODE
2616 on type TYPE. NEED_WRAPPING_INTEGRAL_OVERFLOW is true if integer
2617 overflow must wrap. */
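/* For example (illustrative): a double-precision accumulation such as

     double s = 0.0;
     for (i = 0; i < n; i++)
       s += a[i];

   needs an in-order (fold-left) reduction unless -fassociative-math is in
   effect, because reassociating the additions into vector lanes can change
   the rounding of the final result.  */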
2619 static bool
2620 needs_fold_left_reduction_p (tree type, tree_code code,
2621 bool need_wrapping_integral_overflow)
2623 /* CHECKME: check for !flag_finite_math_only too? */
2624 if (SCALAR_FLOAT_TYPE_P (type))
2625 switch (code)
2627 case MIN_EXPR:
2628 case MAX_EXPR:
2629 return false;
2631 default:
2632 return !flag_associative_math;
2635 if (INTEGRAL_TYPE_P (type))
2637 if (!operation_no_trapping_overflow (type, code))
2638 return true;
2639 if (need_wrapping_integral_overflow
2640 && !TYPE_OVERFLOW_WRAPS (type)
2641 && operation_can_overflow (code))
2642 return true;
2643 return false;
2646 if (SAT_FIXED_POINT_TYPE_P (type))
2647 return true;
2649 return false;
2652 /* Return true if the reduction PHI in LOOP with latch arg LOOP_ARG and
2653 reduction operation CODE has a handled computation expression. */
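/* For illustration (hypothetical SSA names): a handled multi-statement
   reduction path for code == PLUS_EXPR looks like

     x_1 = PHI <x_0 (preheader), x_4 (latch)>
     x_2 = x_1 + a[i];
     x_3 = x_2 + b[i];
     x_4 = x_3 + c[i];

   where every statement on the path from the latch definition x_4 back to
   the PHI result x_1 uses the same code and has a single use.  */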
2655 bool
2656 check_reduction_path (dump_user_location_t loc, loop_p loop, gphi *phi,
2657 tree loop_arg, enum tree_code code)
2659 auto_vec<std::pair<ssa_op_iter, use_operand_p> > path;
2660 auto_bitmap visited;
2661 tree lookfor = PHI_RESULT (phi);
2662 ssa_op_iter curri;
2663 use_operand_p curr = op_iter_init_phiuse (&curri, phi, SSA_OP_USE);
2664 while (USE_FROM_PTR (curr) != loop_arg)
2665 curr = op_iter_next_use (&curri);
2666 curri.i = curri.numops;
2669 path.safe_push (std::make_pair (curri, curr));
2670 tree use = USE_FROM_PTR (curr);
2671 if (use == lookfor)
2672 break;
2673 gimple *def = SSA_NAME_DEF_STMT (use);
2674 if (gimple_nop_p (def)
2675 || ! flow_bb_inside_loop_p (loop, gimple_bb (def)))
2677 pop:
2680 std::pair<ssa_op_iter, use_operand_p> x = path.pop ();
2681 curri = x.first;
2682 curr = x.second;
2684 curr = op_iter_next_use (&curri);
2685 /* Skip already visited or non-SSA operands (from iterating
2686 over PHI args). */
2687 while (curr != NULL_USE_OPERAND_P
2688 && (TREE_CODE (USE_FROM_PTR (curr)) != SSA_NAME
2689 || ! bitmap_set_bit (visited,
2690 SSA_NAME_VERSION
2691 (USE_FROM_PTR (curr)))));
2693 while (curr == NULL_USE_OPERAND_P && ! path.is_empty ());
2694 if (curr == NULL_USE_OPERAND_P)
2695 break;
2697 else
2699 if (gimple_code (def) == GIMPLE_PHI)
2700 curr = op_iter_init_phiuse (&curri, as_a <gphi *>(def), SSA_OP_USE);
2701 else
2702 curr = op_iter_init_use (&curri, def, SSA_OP_USE);
2703 while (curr != NULL_USE_OPERAND_P
2704 && (TREE_CODE (USE_FROM_PTR (curr)) != SSA_NAME
2705 || ! bitmap_set_bit (visited,
2706 SSA_NAME_VERSION
2707 (USE_FROM_PTR (curr)))))
2708 curr = op_iter_next_use (&curri);
2709 if (curr == NULL_USE_OPERAND_P)
2710 goto pop;
2713 while (1);
2714 if (dump_file && (dump_flags & TDF_DETAILS))
2716 dump_printf_loc (MSG_NOTE, loc, "reduction path: ");
2717 unsigned i;
2718 std::pair<ssa_op_iter, use_operand_p> *x;
2719 FOR_EACH_VEC_ELT (path, i, x)
2720 dump_printf (MSG_NOTE, "%T ", USE_FROM_PTR (x->second));
2721 dump_printf (MSG_NOTE, "\n");
2724 /* Check whether the reduction path detected is valid. */
2725 bool fail = path.length () == 0;
2726 bool neg = false;
2727 for (unsigned i = 1; i < path.length (); ++i)
2729 gimple *use_stmt = USE_STMT (path[i].second);
2730 tree op = USE_FROM_PTR (path[i].second);
2731 if (! has_single_use (op)
2732 || ! is_gimple_assign (use_stmt))
2734 fail = true;
2735 break;
2737 if (gimple_assign_rhs_code (use_stmt) != code)
2739 if (code == PLUS_EXPR
2740 && gimple_assign_rhs_code (use_stmt) == MINUS_EXPR)
2742 /* Track whether we negate the reduction value each iteration. */
2743 if (gimple_assign_rhs2 (use_stmt) == op)
2744 neg = ! neg;
2746 else
2748 fail = true;
2749 break;
2753 return ! fail && ! neg;
2757 /* Function vect_is_simple_reduction
2759 (1) Detect a cross-iteration def-use cycle that represents a simple
2760 reduction computation. We look for the following pattern:
2762 loop_header:
2763 a1 = phi < a0, a2 >
2764 a3 = ...
2765 a2 = operation (a3, a1)
2769 a3 = ...
2770 loop_header:
2771 a1 = phi < a0, a2 >
2772 a2 = operation (a3, a1)
2774 such that:
2775 1. operation is commutative and associative and it is safe to
2776 change the order of the computation
2777 2. no uses of a2 in the loop (a2 is used outside the loop)
2778 3. no uses of a1 in the loop besides the reduction operation
2779 4. no uses of a1 outside the loop.
2781 Conditions 1,4 are tested here.
2782 Conditions 2,3 are tested in vect_mark_stmts_to_be_vectorized.
2784 (2) Detect a cross-iteration def-use cycle in nested loops, i.e.,
2785 nested cycles.
2787 (3) Detect cycles of phi nodes in outer-loop vectorization, i.e., double
2788 reductions:
2790 a1 = phi < a0, a2 >
2791 inner loop (def of a3)
2792 a2 = phi < a3 >
2794 (4) Detect condition expressions, i.e.:
2795 for (int i = 0; i < N; i++)
2796 if (a[i] < val)
2797 ret_val = a[i];
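   For illustration (hypothetical gimple form added for exposition), the
   COND_REDUCTION case above roughly corresponds to

     ret_val_1 = PHI <ret_val_0 (preheader), ret_val_2 (latch)>
     _1 = a[i];
     _2 = _1 < val;
     ret_val_2 = _2 ? _1 : ret_val_1;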
2801 static stmt_vec_info
2802 vect_is_simple_reduction (loop_vec_info loop_info, stmt_vec_info phi_info,
2803 bool *double_reduc,
2804 bool need_wrapping_integral_overflow,
2805 enum vect_reduction_type *v_reduc_type)
2807 gphi *phi = as_a <gphi *> (phi_info->stmt);
2808 struct loop *loop = (gimple_bb (phi))->loop_father;
2809 struct loop *vect_loop = LOOP_VINFO_LOOP (loop_info);
2810 bool nested_in_vect_loop = flow_loop_nested_p (vect_loop, loop);
2811 gimple *phi_use_stmt = NULL;
2812 enum tree_code orig_code, code;
2813 tree op1, op2, op3 = NULL_TREE, op4 = NULL_TREE;
2814 tree type;
2815 tree name;
2816 imm_use_iterator imm_iter;
2817 use_operand_p use_p;
2818 bool phi_def;
2820 *double_reduc = false;
2821 *v_reduc_type = TREE_CODE_REDUCTION;
2823 tree phi_name = PHI_RESULT (phi);
2824 /* ??? If there are no uses of the PHI result the inner loop reduction
2825 won't be detected as a possible double reduction by vectorizable_reduction,
2826 because that tries to walk the PHI arg from the preheader edge, which
2827 can be constant. See PR60382. */
2828 if (has_zero_uses (phi_name))
2829 return NULL;
2830 unsigned nphi_def_loop_uses = 0;
2831 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, phi_name)
2833 gimple *use_stmt = USE_STMT (use_p);
2834 if (is_gimple_debug (use_stmt))
2835 continue;
2837 if (!flow_bb_inside_loop_p (loop, gimple_bb (use_stmt)))
2839 if (dump_enabled_p ())
2840 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2841 "intermediate value used outside loop.\n");
2843 return NULL;
2846 nphi_def_loop_uses++;
2847 phi_use_stmt = use_stmt;
2850 edge latch_e = loop_latch_edge (loop);
2851 tree loop_arg = PHI_ARG_DEF_FROM_EDGE (phi, latch_e);
2852 if (TREE_CODE (loop_arg) != SSA_NAME)
2854 if (dump_enabled_p ())
2855 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2856 "reduction: not ssa_name: %T\n", loop_arg);
2857 return NULL;
2860 stmt_vec_info def_stmt_info = loop_info->lookup_def (loop_arg);
2861 if (!def_stmt_info
2862 || !flow_bb_inside_loop_p (loop, gimple_bb (def_stmt_info->stmt)))
2863 return NULL;
2865 if (gassign *def_stmt = dyn_cast <gassign *> (def_stmt_info->stmt))
2867 name = gimple_assign_lhs (def_stmt);
2868 phi_def = false;
2870 else if (gphi *def_stmt = dyn_cast <gphi *> (def_stmt_info->stmt))
2872 name = PHI_RESULT (def_stmt);
2873 phi_def = true;
2875 else
2877 if (dump_enabled_p ())
2878 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2879 "reduction: unhandled reduction operation: %G",
2880 def_stmt_info->stmt);
2881 return NULL;
2884 unsigned nlatch_def_loop_uses = 0;
2885 auto_vec<gphi *, 3> lcphis;
2886 bool inner_loop_of_double_reduc = false;
2887 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, name)
2889 gimple *use_stmt = USE_STMT (use_p);
2890 if (is_gimple_debug (use_stmt))
2891 continue;
2892 if (flow_bb_inside_loop_p (loop, gimple_bb (use_stmt)))
2893 nlatch_def_loop_uses++;
2894 else
2896 /* We can have more than one loop-closed PHI. */
2897 lcphis.safe_push (as_a <gphi *> (use_stmt));
2898 if (nested_in_vect_loop
2899 && (STMT_VINFO_DEF_TYPE (loop_info->lookup_stmt (use_stmt))
2900 == vect_double_reduction_def))
2901 inner_loop_of_double_reduc = true;
2905 /* If this isn't a nested cycle, or if the nested cycle reduction value
2906 is used outside of the inner loop, we cannot handle uses of the reduction
2907 value. */
2908 if ((!nested_in_vect_loop || inner_loop_of_double_reduc)
2909 && (nlatch_def_loop_uses > 1 || nphi_def_loop_uses > 1))
2911 if (dump_enabled_p ())
2912 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2913 "reduction used in loop.\n");
2914 return NULL;
2917 /* If DEF_STMT is a phi node itself, we expect it to have a single argument
2918 defined in the inner loop. */
2919 if (phi_def)
2921 gphi *def_stmt = as_a <gphi *> (def_stmt_info->stmt);
2922 op1 = PHI_ARG_DEF (def_stmt, 0);
2924 if (gimple_phi_num_args (def_stmt) != 1
2925 || TREE_CODE (op1) != SSA_NAME)
2927 if (dump_enabled_p ())
2928 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2929 "unsupported phi node definition.\n");
2931 return NULL;
2934 gimple *def1 = SSA_NAME_DEF_STMT (op1);
2935 if (gimple_bb (def1)
2936 && flow_bb_inside_loop_p (loop, gimple_bb (def_stmt))
2937 && loop->inner
2938 && flow_bb_inside_loop_p (loop->inner, gimple_bb (def1))
2939 && is_gimple_assign (def1)
2940 && is_a <gphi *> (phi_use_stmt)
2941 && flow_bb_inside_loop_p (loop->inner, gimple_bb (phi_use_stmt)))
2943 if (dump_enabled_p ())
2944 report_vect_op (MSG_NOTE, def_stmt,
2945 "detected double reduction: ");
2947 *double_reduc = true;
2948 return def_stmt_info;
2951 return NULL;
2954 /* If we are vectorizing an inner reduction, we execute it
2955 in the original order only if we are not dealing with a
2956 double reduction. */
2957 bool check_reduction = true;
2958 if (flow_loop_nested_p (vect_loop, loop))
2960 gphi *lcphi;
2961 unsigned i;
2962 check_reduction = false;
2963 FOR_EACH_VEC_ELT (lcphis, i, lcphi)
2964 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, gimple_phi_result (lcphi))
2966 gimple *use_stmt = USE_STMT (use_p);
2967 if (is_gimple_debug (use_stmt))
2968 continue;
2969 if (! flow_bb_inside_loop_p (vect_loop, gimple_bb (use_stmt)))
2970 check_reduction = true;
2974 gassign *def_stmt = as_a <gassign *> (def_stmt_info->stmt);
2975 code = orig_code = gimple_assign_rhs_code (def_stmt);
2977 if (nested_in_vect_loop && !check_reduction)
2979 /* FIXME: Even for non-reductions code generation is funneled
2980 through vectorizable_reduction for the stmt defining the
2981 PHI latch value. So we have to artificially restrict ourselves
2982 for the supported operations. */
2983 switch (get_gimple_rhs_class (code))
2985 case GIMPLE_BINARY_RHS:
2986 case GIMPLE_TERNARY_RHS:
2987 break;
2988 default:
2989 /* Not supported by vectorizable_reduction. */
2990 if (dump_enabled_p ())
2991 report_vect_op (MSG_MISSED_OPTIMIZATION, def_stmt,
2992 "nested cycle: not handled operation: ");
2993 return NULL;
2995 if (dump_enabled_p ())
2996 report_vect_op (MSG_NOTE, def_stmt, "detected nested cycle: ");
2997 return def_stmt_info;
3000 /* We can handle "res -= x[i]", which is non-associative, by
3001 simply rewriting this into "res += -x[i]". Avoid changing the
3002 gimple instruction for the first simple tests and only do this
3003 if we're allowed to change code at all. */
3004 if (code == MINUS_EXPR && gimple_assign_rhs2 (def_stmt) != phi_name)
3005 code = PLUS_EXPR;
3007 if (code == COND_EXPR)
3009 if (! nested_in_vect_loop)
3010 *v_reduc_type = COND_REDUCTION;
3012 op3 = gimple_assign_rhs1 (def_stmt);
3013 if (COMPARISON_CLASS_P (op3))
3015 op4 = TREE_OPERAND (op3, 1);
3016 op3 = TREE_OPERAND (op3, 0);
3018 if (op3 == phi_name || op4 == phi_name)
3020 if (dump_enabled_p ())
3021 report_vect_op (MSG_MISSED_OPTIMIZATION, def_stmt,
3022 "reduction: condition depends on previous"
3023 " iteration: ");
3024 return NULL;
3027 op1 = gimple_assign_rhs2 (def_stmt);
3028 op2 = gimple_assign_rhs3 (def_stmt);
3030 else if (!commutative_tree_code (code) || !associative_tree_code (code))
3032 if (dump_enabled_p ())
3033 report_vect_op (MSG_MISSED_OPTIMIZATION, def_stmt,
3034 "reduction: not commutative/associative: ");
3035 return NULL;
3037 else if (get_gimple_rhs_class (code) == GIMPLE_BINARY_RHS)
3039 op1 = gimple_assign_rhs1 (def_stmt);
3040 op2 = gimple_assign_rhs2 (def_stmt);
3042 else
3044 if (dump_enabled_p ())
3045 report_vect_op (MSG_MISSED_OPTIMIZATION, def_stmt,
3046 "reduction: not handled operation: ");
3047 return NULL;
3050 if (TREE_CODE (op1) != SSA_NAME && TREE_CODE (op2) != SSA_NAME)
3052 if (dump_enabled_p ())
3053 report_vect_op (MSG_MISSED_OPTIMIZATION, def_stmt,
3054 "reduction: both uses not ssa_names: ");
3056 return NULL;
3059 type = TREE_TYPE (gimple_assign_lhs (def_stmt));
3060 if ((TREE_CODE (op1) == SSA_NAME
3061 && !types_compatible_p (type,TREE_TYPE (op1)))
3062 || (TREE_CODE (op2) == SSA_NAME
3063 && !types_compatible_p (type, TREE_TYPE (op2)))
3064 || (op3 && TREE_CODE (op3) == SSA_NAME
3065 && !types_compatible_p (type, TREE_TYPE (op3)))
3066 || (op4 && TREE_CODE (op4) == SSA_NAME
3067 && !types_compatible_p (type, TREE_TYPE (op4))))
3069 if (dump_enabled_p ())
3071 dump_printf_loc (MSG_NOTE, vect_location,
3072 "reduction: multiple types: operation type: "
3073 "%T, operands types: %T,%T",
3074 type, TREE_TYPE (op1), TREE_TYPE (op2));
3075 if (op3)
3076 dump_printf (MSG_NOTE, ",%T", TREE_TYPE (op3));
3078 if (op4)
3079 dump_printf (MSG_NOTE, ",%T", TREE_TYPE (op4));
3080 dump_printf (MSG_NOTE, "\n");
3083 return NULL;
3086 /* Check whether it's ok to change the order of the computation.
3087 Generally, when vectorizing a reduction we change the order of the
3088 computation. This may change the behavior of the program in some
3089 cases, so we need to check that this is ok. One exception is when
3090 vectorizing an outer-loop: the inner-loop is executed sequentially,
3091 and therefore vectorizing reductions in the inner-loop during
3092 outer-loop vectorization is safe. */
3093 if (check_reduction
3094 && *v_reduc_type == TREE_CODE_REDUCTION
3095 && needs_fold_left_reduction_p (type, code,
3096 need_wrapping_integral_overflow))
3097 *v_reduc_type = FOLD_LEFT_REDUCTION;
3099 /* Reduction is safe. We're dealing with one of the following:
3100 1) integer arithmetic and no trapv
3101 2) floating point arithmetic, and special flags permit this optimization
3102 3) nested cycle (i.e., outer loop vectorization). */
3103 stmt_vec_info def1_info = loop_info->lookup_def (op1);
3104 stmt_vec_info def2_info = loop_info->lookup_def (op2);
3105 if (code != COND_EXPR && !def1_info && !def2_info)
3107 if (dump_enabled_p ())
3108 report_vect_op (MSG_NOTE, def_stmt, "reduction: no defs for operands: ");
3109 return NULL;
3112 /* Check that one def is the reduction def, defined by PHI,
3113 the other def is either defined in the loop ("vect_internal_def"),
3114 or it's an induction (defined by a loop-header phi-node). */
3116 if (def2_info
3117 && def2_info->stmt == phi
3118 && (code == COND_EXPR
3119 || !def1_info
3120 || !flow_bb_inside_loop_p (loop, gimple_bb (def1_info->stmt))
3121 || vect_valid_reduction_input_p (def1_info)))
3123 if (dump_enabled_p ())
3124 report_vect_op (MSG_NOTE, def_stmt, "detected reduction: ");
3125 return def_stmt_info;
3128 if (def1_info
3129 && def1_info->stmt == phi
3130 && (code == COND_EXPR
3131 || !def2_info
3132 || !flow_bb_inside_loop_p (loop, gimple_bb (def2_info->stmt))
3133 || vect_valid_reduction_input_p (def2_info)))
3135 if (! nested_in_vect_loop && orig_code != MINUS_EXPR)
3137 /* Check if we can swap operands (just for simplicity - so that
3138 the rest of the code can assume that the reduction variable
3139 is always the last (second) argument). */
3140 if (code == COND_EXPR)
3142 /* Swap cond_expr by inverting the condition. */
3143 tree cond_expr = gimple_assign_rhs1 (def_stmt);
3144 enum tree_code invert_code = ERROR_MARK;
3145 enum tree_code cond_code = TREE_CODE (cond_expr);
3147 if (TREE_CODE_CLASS (cond_code) == tcc_comparison)
3149 bool honor_nans = HONOR_NANS (TREE_OPERAND (cond_expr, 0));
3150 invert_code = invert_tree_comparison (cond_code, honor_nans);
3152 if (invert_code != ERROR_MARK)
3154 TREE_SET_CODE (cond_expr, invert_code);
3155 swap_ssa_operands (def_stmt,
3156 gimple_assign_rhs2_ptr (def_stmt),
3157 gimple_assign_rhs3_ptr (def_stmt));
3159 else
3161 if (dump_enabled_p ())
3162 report_vect_op (MSG_NOTE, def_stmt,
3163 "detected reduction: cannot swap operands "
3164 "for cond_expr");
3165 return NULL;
3168 else
3169 swap_ssa_operands (def_stmt, gimple_assign_rhs1_ptr (def_stmt),
3170 gimple_assign_rhs2_ptr (def_stmt));
3172 if (dump_enabled_p ())
3173 report_vect_op (MSG_NOTE, def_stmt,
3174 "detected reduction: need to swap operands: ");
3176 if (CONSTANT_CLASS_P (gimple_assign_rhs1 (def_stmt)))
3177 LOOP_VINFO_OPERANDS_SWAPPED (loop_info) = true;
3179 else
3181 if (dump_enabled_p ())
3182 report_vect_op (MSG_NOTE, def_stmt, "detected reduction: ");
3185 return def_stmt_info;
3188 /* Try to find SLP reduction chain. */
3189 if (! nested_in_vect_loop
3190 && code != COND_EXPR
3191 && orig_code != MINUS_EXPR
3192 && vect_is_slp_reduction (loop_info, phi, def_stmt))
3194 if (dump_enabled_p ())
3195 report_vect_op (MSG_NOTE, def_stmt,
3196 "reduction: detected reduction chain: ");
3198 return def_stmt_info;
3201 /* Dissolve a group possibly half-built by vect_is_slp_reduction. */
3202 stmt_vec_info first = REDUC_GROUP_FIRST_ELEMENT (def_stmt_info);
3203 while (first)
3205 stmt_vec_info next = REDUC_GROUP_NEXT_ELEMENT (first);
3206 REDUC_GROUP_FIRST_ELEMENT (first) = NULL;
3207 REDUC_GROUP_NEXT_ELEMENT (first) = NULL;
3208 first = next;
3211 /* Look for the expression computing loop_arg from loop PHI result. */
3212 if (check_reduction_path (vect_location, loop, phi, loop_arg, code))
3213 return def_stmt_info;
3215 if (dump_enabled_p ())
3217 report_vect_op (MSG_MISSED_OPTIMIZATION, def_stmt,
3218 "reduction: unknown pattern: ");
3221 return NULL;
3224 /* Wrapper around vect_is_simple_reduction, which will modify code
3225 in-place if it enables detection of more reductions. Arguments
3226 as there. */
3228 stmt_vec_info
3229 vect_force_simple_reduction (loop_vec_info loop_info, stmt_vec_info phi_info,
3230 bool *double_reduc,
3231 bool need_wrapping_integral_overflow)
3233 enum vect_reduction_type v_reduc_type;
3234 stmt_vec_info def_info
3235 = vect_is_simple_reduction (loop_info, phi_info, double_reduc,
3236 need_wrapping_integral_overflow,
3237 &v_reduc_type);
3238 if (def_info)
3240 STMT_VINFO_REDUC_TYPE (phi_info) = v_reduc_type;
3241 STMT_VINFO_REDUC_DEF (phi_info) = def_info;
3242 STMT_VINFO_REDUC_TYPE (def_info) = v_reduc_type;
3243 STMT_VINFO_REDUC_DEF (def_info) = phi_info;
3245 return def_info;
3248 /* Calculate the cost of peeling the loop PEEL_ITERS_PROLOGUE times. */
3249 int
3250 vect_get_known_peeling_cost (loop_vec_info loop_vinfo, int peel_iters_prologue,
3251 int *peel_iters_epilogue,
3252 stmt_vector_for_cost *scalar_cost_vec,
3253 stmt_vector_for_cost *prologue_cost_vec,
3254 stmt_vector_for_cost *epilogue_cost_vec)
3256 int retval = 0;
3257 int assumed_vf = vect_vf_for_cost (loop_vinfo);
3259 if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
3261 *peel_iters_epilogue = assumed_vf / 2;
3262 if (dump_enabled_p ())
3263 dump_printf_loc (MSG_NOTE, vect_location,
3264 "cost model: epilogue peel iters set to vf/2 "
3265 "because loop iterations are unknown .\n");
3267 /* If peeled iterations are known but the number of scalar loop
3268 iterations is unknown, count a taken branch per peeled loop. */
3269 retval = record_stmt_cost (prologue_cost_vec, 1, cond_branch_taken,
3270 NULL, 0, vect_prologue);
3271 retval = record_stmt_cost (prologue_cost_vec, 1, cond_branch_taken,
3272 NULL, 0, vect_epilogue);
3274 else
3276 int niters = LOOP_VINFO_INT_NITERS (loop_vinfo);
3277 peel_iters_prologue = niters < peel_iters_prologue ?
3278 niters : peel_iters_prologue;
3279 *peel_iters_epilogue = (niters - peel_iters_prologue) % assumed_vf;
3280 /* If we need to peel for gaps but the epilogue would otherwise require
3281 no peeling, we have to peel VF iterations. */
3282 if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo) && !*peel_iters_epilogue)
3283 *peel_iters_epilogue = assumed_vf;
3286 stmt_info_for_cost *si;
3287 int j;
3288 if (peel_iters_prologue)
3289 FOR_EACH_VEC_ELT (*scalar_cost_vec, j, si)
3290 retval += record_stmt_cost (prologue_cost_vec,
3291 si->count * peel_iters_prologue,
3292 si->kind, si->stmt_info, si->misalign,
3293 vect_prologue);
3294 if (*peel_iters_epilogue)
3295 FOR_EACH_VEC_ELT (*scalar_cost_vec, j, si)
3296 retval += record_stmt_cost (epilogue_cost_vec,
3297 si->count * *peel_iters_epilogue,
3298 si->kind, si->stmt_info, si->misalign,
3299 vect_epilogue);
3301 return retval;
3304 /* Function vect_estimate_min_profitable_iters
3306 Return the number of iterations required for the vector version of the
3307 loop to be profitable relative to the cost of the scalar version of the
3308 loop.
3310 *RET_MIN_PROFITABLE_NITERS is a cost model profitability threshold
3311 of iterations for vectorization. A value of -1 means loop vectorization
3312 is not profitable. This returned value may be used for dynamic
3313 profitability check.
3315 *RET_MIN_PROFITABLE_ESTIMATE is a profitability threshold to be used
3316 for static check against estimated number of iterations. */
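/* Illustrative note (added for exposition, not from the original sources):
   the runtime threshold ends up guarding the versioned/peeled loop roughly as

     if (niters >= th)
       run the vectorized loop;
     else
       run the scalar loop;

   whereas *RET_MIN_PROFITABLE_ESTIMATE is only compared against the
   statically estimated trip count during analysis.  */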
3318 static void
3319 vect_estimate_min_profitable_iters (loop_vec_info loop_vinfo,
3320 int *ret_min_profitable_niters,
3321 int *ret_min_profitable_estimate)
3323 int min_profitable_iters;
3324 int min_profitable_estimate;
3325 int peel_iters_prologue;
3326 int peel_iters_epilogue;
3327 unsigned vec_inside_cost = 0;
3328 int vec_outside_cost = 0;
3329 unsigned vec_prologue_cost = 0;
3330 unsigned vec_epilogue_cost = 0;
3331 int scalar_single_iter_cost = 0;
3332 int scalar_outside_cost = 0;
3333 int assumed_vf = vect_vf_for_cost (loop_vinfo);
3334 int npeel = LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo);
3335 void *target_cost_data = LOOP_VINFO_TARGET_COST_DATA (loop_vinfo);
3337 /* Cost model disabled. */
3338 if (unlimited_cost_model (LOOP_VINFO_LOOP (loop_vinfo)))
3340 dump_printf_loc (MSG_NOTE, vect_location, "cost model disabled.\n");
3341 *ret_min_profitable_niters = 0;
3342 *ret_min_profitable_estimate = 0;
3343 return;
3346 /* Requires loop versioning tests to handle misalignment. */
3347 if (LOOP_REQUIRES_VERSIONING_FOR_ALIGNMENT (loop_vinfo))
3349 /* FIXME: Make cost depend on complexity of individual check. */
3350 unsigned len = LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).length ();
3351 (void) add_stmt_cost (target_cost_data, len, vector_stmt, NULL, 0,
3352 vect_prologue);
3353 dump_printf (MSG_NOTE,
3354 "cost model: Adding cost of checks for loop "
3355 "versioning to treat misalignment.\n");
3358 /* Requires loop versioning with alias checks. */
3359 if (LOOP_REQUIRES_VERSIONING_FOR_ALIAS (loop_vinfo))
3361 /* FIXME: Make cost depend on complexity of individual check. */
3362 unsigned len = LOOP_VINFO_COMP_ALIAS_DDRS (loop_vinfo).length ();
3363 (void) add_stmt_cost (target_cost_data, len, vector_stmt, NULL, 0,
3364 vect_prologue);
3365 len = LOOP_VINFO_CHECK_UNEQUAL_ADDRS (loop_vinfo).length ();
3366 if (len)
3367 /* Count LEN - 1 ANDs and LEN comparisons. */
3368 (void) add_stmt_cost (target_cost_data, len * 2 - 1, scalar_stmt,
3369 NULL, 0, vect_prologue);
3370 len = LOOP_VINFO_LOWER_BOUNDS (loop_vinfo).length ();
3371 if (len)
3373 /* Count LEN - 1 ANDs and LEN comparisons. */
3374 unsigned int nstmts = len * 2 - 1;
3375 /* +1 for each bias that needs adding. */
3376 for (unsigned int i = 0; i < len; ++i)
3377 if (!LOOP_VINFO_LOWER_BOUNDS (loop_vinfo)[i].unsigned_p)
3378 nstmts += 1;
3379 (void) add_stmt_cost (target_cost_data, nstmts, scalar_stmt,
3380 NULL, 0, vect_prologue);
3382 dump_printf (MSG_NOTE,
3383 "cost model: Adding cost of checks for loop "
3384 "versioning aliasing.\n");
3387 /* Requires loop versioning with niter checks. */
3388 if (LOOP_REQUIRES_VERSIONING_FOR_NITERS (loop_vinfo))
3390 /* FIXME: Make cost depend on complexity of individual check. */
3391 (void) add_stmt_cost (target_cost_data, 1, vector_stmt, NULL, 0,
3392 vect_prologue);
3393 dump_printf (MSG_NOTE,
3394 "cost model: Adding cost of checks for loop "
3395 "versioning niters.\n");
3398 if (LOOP_REQUIRES_VERSIONING (loop_vinfo))
3399 (void) add_stmt_cost (target_cost_data, 1, cond_branch_taken, NULL, 0,
3400 vect_prologue);
3402 /* Count statements in scalar loop. Using this as scalar cost for a single
3403 iteration for now.
3405 TODO: Add outer loop support.
3407 TODO: Consider assigning different costs to different scalar
3408 statements. */
3410 scalar_single_iter_cost
3411 = LOOP_VINFO_SINGLE_SCALAR_ITERATION_COST (loop_vinfo);
3413 /* Add additional cost for the peeled instructions in prologue and epilogue
3414 loop. (For fully-masked loops there will be no peeling.)
3416 FORNOW: If we don't know the value of peel_iters for prologue or epilogue
3417 at compile-time - we assume it's vf/2 (the worst would be vf-1).
3419 TODO: Build an expression that represents peel_iters for prologue and
3420 epilogue to be used in a run-time test. */
3422 if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
3424 peel_iters_prologue = 0;
3425 peel_iters_epilogue = 0;
3427 if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo))
3429 /* We need to peel exactly one iteration. */
3430 peel_iters_epilogue += 1;
3431 stmt_info_for_cost *si;
3432 int j;
3433 FOR_EACH_VEC_ELT (LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo),
3434 j, si)
3435 (void) add_stmt_cost (target_cost_data, si->count,
3436 si->kind, si->stmt_info, si->misalign,
3437 vect_epilogue);
3440 else if (npeel < 0)
3442 peel_iters_prologue = assumed_vf / 2;
3443 dump_printf (MSG_NOTE, "cost model: "
3444 "prologue peel iters set to vf/2.\n");
3446 /* If peeling for alignment is unknown, the loop bound of the main loop
3447 becomes unknown. */
3448 peel_iters_epilogue = assumed_vf / 2;
3449 dump_printf (MSG_NOTE, "cost model: "
3450 "epilogue peel iters set to vf/2 because "
3451 "peeling for alignment is unknown.\n");
3453 /* If peeled iterations are unknown, count a taken branch and a not taken
3454 branch per peeled loop. Even if scalar loop iterations are known,
3455 vector iterations are not known since peeled prologue iterations are
3456 not known. Hence guards remain the same. */
3457 (void) add_stmt_cost (target_cost_data, 1, cond_branch_taken,
3458 NULL, 0, vect_prologue);
3459 (void) add_stmt_cost (target_cost_data, 1, cond_branch_not_taken,
3460 NULL, 0, vect_prologue);
3461 (void) add_stmt_cost (target_cost_data, 1, cond_branch_taken,
3462 NULL, 0, vect_epilogue);
3463 (void) add_stmt_cost (target_cost_data, 1, cond_branch_not_taken,
3464 NULL, 0, vect_epilogue);
3465 stmt_info_for_cost *si;
3466 int j;
3467 FOR_EACH_VEC_ELT (LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo), j, si)
3469 (void) add_stmt_cost (target_cost_data,
3470 si->count * peel_iters_prologue,
3471 si->kind, si->stmt_info, si->misalign,
3472 vect_prologue);
3473 (void) add_stmt_cost (target_cost_data,
3474 si->count * peel_iters_epilogue,
3475 si->kind, si->stmt_info, si->misalign,
3476 vect_epilogue);
3479 else
3481 stmt_vector_for_cost prologue_cost_vec, epilogue_cost_vec;
3482 stmt_info_for_cost *si;
3483 int j;
3484 void *data = LOOP_VINFO_TARGET_COST_DATA (loop_vinfo);
3486 prologue_cost_vec.create (2);
3487 epilogue_cost_vec.create (2);
3488 peel_iters_prologue = npeel;
3490 (void) vect_get_known_peeling_cost (loop_vinfo, peel_iters_prologue,
3491 &peel_iters_epilogue,
3492 &LOOP_VINFO_SCALAR_ITERATION_COST
3493 (loop_vinfo),
3494 &prologue_cost_vec,
3495 &epilogue_cost_vec);
3497 FOR_EACH_VEC_ELT (prologue_cost_vec, j, si)
3498 (void) add_stmt_cost (data, si->count, si->kind, si->stmt_info,
3499 si->misalign, vect_prologue);
3501 FOR_EACH_VEC_ELT (epilogue_cost_vec, j, si)
3502 (void) add_stmt_cost (data, si->count, si->kind, si->stmt_info,
3503 si->misalign, vect_epilogue);
3505 prologue_cost_vec.release ();
3506 epilogue_cost_vec.release ();
3509 /* FORNOW: The scalar outside cost is incremented in one of the
3510 following ways:
3512 1. The vectorizer checks for alignment and aliasing and generates
3513 a condition that allows dynamic vectorization. A cost model
3514 check is ANDED with the versioning condition. Hence scalar code
3515 path now has the added cost of the versioning check.
3517 if (cost > th & versioning_check)
3518 jmp to vector code
3520 Hence the run-time scalar cost is incremented by a not-taken branch cost.
3522 2. The vectorizer then checks if a prologue is required. If the
3523 cost model check was not done before during versioning, it has to
3524 be done before the prologue check.
3526 if (cost <= th)
3527 prologue = scalar_iters
3528 if (prologue == 0)
3529 jmp to vector code
3530 else
3531 execute prologue
3532 if (prologue == num_iters)
3533 go to exit
3535 Hence the run-time scalar cost is incremented by a taken branch,
3536 plus a not-taken branch, plus a taken branch cost.
3538 3. The vectorizer then checks if an epilogue is required. If the
3539 cost model check was not done before during prologue check, it
3540 has to be done with the epilogue check.
3542 if (prologue == 0)
3543 jmp to vector code
3544 else
3545 execute prologue
3546 if (prologue == num_iters)
3547 go to exit
3548 vector code:
3549 if ((cost <= th) | (scalar_iters-prologue-epilogue == 0))
3550 jmp to epilogue
3552 Hence the run-time scalar cost should be incremented by 2 taken
3553 branches.
3555 TODO: The back end may reorder the BBs differently and reverse
3556 conditions/branch directions. Change the estimates below to
3557 something more reasonable. */
3559 /* If the number of iterations is known and we do not do versioning, we can
3560 decide whether to vectorize at compile time. Hence the scalar version
3561 does not carry cost model guard costs. */
3562 if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
3563 || LOOP_REQUIRES_VERSIONING (loop_vinfo))
3565 /* Cost model check occurs at versioning. */
3566 if (LOOP_REQUIRES_VERSIONING (loop_vinfo))
3567 scalar_outside_cost += vect_get_stmt_cost (cond_branch_not_taken);
3568 else
3570 /* Cost model check occurs at prologue generation. */
3571 if (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) < 0)
3572 scalar_outside_cost += 2 * vect_get_stmt_cost (cond_branch_taken)
3573 + vect_get_stmt_cost (cond_branch_not_taken);
3574 /* Cost model check occurs at epilogue generation. */
3575 else
3576 scalar_outside_cost += 2 * vect_get_stmt_cost (cond_branch_taken);
3580 /* Complete the target-specific cost calculations. */
3581 finish_cost (LOOP_VINFO_TARGET_COST_DATA (loop_vinfo), &vec_prologue_cost,
3582 &vec_inside_cost, &vec_epilogue_cost);
3584 vec_outside_cost = (int)(vec_prologue_cost + vec_epilogue_cost);
3586 if (dump_enabled_p ())
3588 dump_printf_loc (MSG_NOTE, vect_location, "Cost model analysis: \n");
3589 dump_printf (MSG_NOTE, " Vector inside of loop cost: %d\n",
3590 vec_inside_cost);
3591 dump_printf (MSG_NOTE, " Vector prologue cost: %d\n",
3592 vec_prologue_cost);
3593 dump_printf (MSG_NOTE, " Vector epilogue cost: %d\n",
3594 vec_epilogue_cost);
3595 dump_printf (MSG_NOTE, " Scalar iteration cost: %d\n",
3596 scalar_single_iter_cost);
3597 dump_printf (MSG_NOTE, " Scalar outside cost: %d\n",
3598 scalar_outside_cost);
3599 dump_printf (MSG_NOTE, " Vector outside cost: %d\n",
3600 vec_outside_cost);
3601 dump_printf (MSG_NOTE, " prologue iterations: %d\n",
3602 peel_iters_prologue);
3603 dump_printf (MSG_NOTE, " epilogue iterations: %d\n",
3604 peel_iters_epilogue);
3607 /* Calculate number of iterations required to make the vector version
3608 profitable, relative to the loop bodies only. The following condition
3609 must hold true:
3610 SIC * niters + SOC > VIC * ((niters-PL_ITERS-EP_ITERS)/VF) + VOC
3611 where
3612 SIC = scalar iteration cost, VIC = vector iteration cost,
3613 VOC = vector outside cost, VF = vectorization factor,
3614 PL_ITERS = prologue iterations, EP_ITERS= epilogue iterations
3615 SOC = scalar outside cost for run time cost model check. */
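/* For example, assuming SIC = 4, VIC = 8, VOC = 20, SOC = 0, VF = 4 and
   no peeling, the condition above becomes 16 * niters > 8 * niters + 80,
   i.e. niters > 10, so the computation below arrives at a runtime
   threshold of 11 iterations.  */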
3617 if ((scalar_single_iter_cost * assumed_vf) > (int) vec_inside_cost)
3619 min_profitable_iters = ((vec_outside_cost - scalar_outside_cost)
3620 * assumed_vf
3621 - vec_inside_cost * peel_iters_prologue
3622 - vec_inside_cost * peel_iters_epilogue);
3623 if (min_profitable_iters <= 0)
3624 min_profitable_iters = 0;
3625 else
3627 min_profitable_iters /= ((scalar_single_iter_cost * assumed_vf)
3628 - vec_inside_cost);
3630 if ((scalar_single_iter_cost * assumed_vf * min_profitable_iters)
3631 <= (((int) vec_inside_cost * min_profitable_iters)
3632 + (((int) vec_outside_cost - scalar_outside_cost)
3633 * assumed_vf)))
3634 min_profitable_iters++;
3637 /* vector version will never be profitable. */
3638 else
3640 if (LOOP_VINFO_LOOP (loop_vinfo)->force_vectorize)
3641 warning_at (vect_location.get_location_t (), OPT_Wopenmp_simd,
3642 "vectorization did not happen for a simd loop");
3644 if (dump_enabled_p ())
3645 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3646 "cost model: the vector iteration cost = %d "
3647 "divided by the scalar iteration cost = %d "
3648 "is greater or equal to the vectorization factor = %d"
3649 ".\n",
3650 vec_inside_cost, scalar_single_iter_cost, assumed_vf);
3651 *ret_min_profitable_niters = -1;
3652 *ret_min_profitable_estimate = -1;
3653 return;
3656 dump_printf (MSG_NOTE,
3657 " Calculated minimum iters for profitability: %d\n",
3658 min_profitable_iters);
3660 if (!LOOP_VINFO_FULLY_MASKED_P (loop_vinfo)
3661 && min_profitable_iters < (assumed_vf + peel_iters_prologue))
3662 /* We want the vectorized loop to execute at least once. */
3663 min_profitable_iters = assumed_vf + peel_iters_prologue;
3665 if (dump_enabled_p ())
3666 dump_printf_loc (MSG_NOTE, vect_location,
3667 " Runtime profitability threshold = %d\n",
3668 min_profitable_iters);
3670 *ret_min_profitable_niters = min_profitable_iters;
3672 /* Calculate number of iterations required to make the vector version
3673 profitable, relative to the loop bodies only.
3675 The non-vectorized variant costs SIC * niters and it must win over the
3676 vector variant on the expected loop trip count. The following condition must hold true:
3677 SIC * niters > VIC * ((niters-PL_ITERS-EP_ITERS)/VF) + VOC + SOC */
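/* Continuing the example above (SIC = 4, VIC = 8, VOC = 20, VF = 4, no
   peeling), but now with SOC = 8 charged to the vector side, the estimate
   below is (20 + 8) * 4 / (4 * 4 - 8) = 14 iterations.  */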
3679 if (vec_outside_cost <= 0)
3680 min_profitable_estimate = 0;
3681 else
3683 min_profitable_estimate = ((vec_outside_cost + scalar_outside_cost)
3684 * assumed_vf
3685 - vec_inside_cost * peel_iters_prologue
3686 - vec_inside_cost * peel_iters_epilogue)
3687 / ((scalar_single_iter_cost * assumed_vf)
3688 - vec_inside_cost);
3690 min_profitable_estimate = MAX (min_profitable_estimate, min_profitable_iters);
3691 if (dump_enabled_p ())
3692 dump_printf_loc (MSG_NOTE, vect_location,
3693 " Static estimate profitability threshold = %d\n",
3694 min_profitable_estimate);
3696 *ret_min_profitable_estimate = min_profitable_estimate;
3699 /* Writes into SEL a mask for a vec_perm, equivalent to a vec_shr by OFFSET
3700 vector elements (not bits) for a vector with NELT elements. */
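/* For example, OFFSET = 2 and NELT = 8 encodes the series {2, 3, 4, ...},
   which vec_perm_indices extends to the selector {2, 3, 4, 5, 6, 7, 8, 9};
   applied to {v0, ..., v7} with a zero second operand this yields
   {v2, ..., v7, 0, 0}, i.e. a shift down by two elements.  */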
3701 static void
3702 calc_vec_perm_mask_for_shift (unsigned int offset, unsigned int nelt,
3703 vec_perm_builder *sel)
3705 /* The encoding is a single stepped pattern. Any wrap-around is handled
3706 by vec_perm_indices. */
3707 sel->new_vector (nelt, 1, 3);
3708 for (unsigned int i = 0; i < 3; i++)
3709 sel->quick_push (i + offset);
3712 /* Checks whether the target supports whole-vector shifts for vectors of mode
3713 MODE. This is the case if _either_ the platform handles vec_shr_optab, _or_
3714 it supports vec_perm_const with masks for all necessary shift amounts. */
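/* For example, for V8HI (NELT = 8) the loop below checks permute masks for
   the shift amounts 4, 2 and 1, which are exactly the amounts used when
   reducing with whole-vector shifts in the reduction epilogue.  */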
3715 static bool
3716 have_whole_vector_shift (machine_mode mode)
3718 if (optab_handler (vec_shr_optab, mode) != CODE_FOR_nothing)
3719 return true;
3721 /* Variable-length vectors should be handled via the optab. */
3722 unsigned int nelt;
3723 if (!GET_MODE_NUNITS (mode).is_constant (&nelt))
3724 return false;
3726 vec_perm_builder sel;
3727 vec_perm_indices indices;
3728 for (unsigned int i = nelt / 2; i >= 1; i /= 2)
3730 calc_vec_perm_mask_for_shift (i, nelt, &sel);
3731 indices.new_vector (sel, 2, nelt);
3732 if (!can_vec_perm_const_p (mode, indices, false))
3733 return false;
3735 return true;
3738 /* TODO: There is a close dependency between the vect_model_*_cost and
3739 vectorizable_* functions. A better design would avoid such maintenance issues. */
3741 /* Function vect_model_reduction_cost.
3743 Models cost for a reduction operation, including the vector ops
3744 generated within the strip-mine loop, the initial definition before
3745 the loop, and the epilogue code that must be generated. */
3747 static void
3748 vect_model_reduction_cost (stmt_vec_info stmt_info, internal_fn reduc_fn,
3749 int ncopies, stmt_vector_for_cost *cost_vec)
3751 int prologue_cost = 0, epilogue_cost = 0, inside_cost;
3752 enum tree_code code;
3753 optab optab;
3754 tree vectype;
3755 machine_mode mode;
3756 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
3757 struct loop *loop = NULL;
3759 if (loop_vinfo)
3760 loop = LOOP_VINFO_LOOP (loop_vinfo);
3762 /* Condition reductions generate two reductions in the loop. */
3763 vect_reduction_type reduction_type
3764 = STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info);
3765 if (reduction_type == COND_REDUCTION)
3766 ncopies *= 2;
3768 vectype = STMT_VINFO_VECTYPE (stmt_info);
3769 mode = TYPE_MODE (vectype);
3770 stmt_vec_info orig_stmt_info = vect_orig_stmt (stmt_info);
3772 code = gimple_assign_rhs_code (orig_stmt_info->stmt);
3774 if (reduction_type == EXTRACT_LAST_REDUCTION
3775 || reduction_type == FOLD_LEFT_REDUCTION)
3777 /* No extra instructions needed in the prologue. */
3778 prologue_cost = 0;
3780 if (reduction_type == EXTRACT_LAST_REDUCTION || reduc_fn != IFN_LAST)
3781 /* Count one reduction-like operation per vector. */
3782 inside_cost = record_stmt_cost (cost_vec, ncopies, vec_to_scalar,
3783 stmt_info, 0, vect_body);
3784 else
3786 /* Use NELEMENTS extracts and NELEMENTS scalar ops. */
3787 unsigned int nelements = ncopies * vect_nunits_for_cost (vectype);
3788 inside_cost = record_stmt_cost (cost_vec, nelements,
3789 vec_to_scalar, stmt_info, 0,
3790 vect_body);
3791 inside_cost += record_stmt_cost (cost_vec, nelements,
3792 scalar_stmt, stmt_info, 0,
3793 vect_body);
3796 else
3798 /* Add in cost for initial definition.
3799 For cond reduction we have four vectors: initial index, step,
3800 initial result of the data reduction, initial value of the index
3801 reduction. */
3802 int prologue_stmts = reduction_type == COND_REDUCTION ? 4 : 1;
3803 prologue_cost += record_stmt_cost (cost_vec, prologue_stmts,
3804 scalar_to_vec, stmt_info, 0,
3805 vect_prologue);
3807 /* Cost of reduction op inside loop. */
3808 inside_cost = record_stmt_cost (cost_vec, ncopies, vector_stmt,
3809 stmt_info, 0, vect_body);
3812 /* Determine cost of epilogue code.
3814 We have a reduction operator that will reduce the vector in one statement.
3815 Also requires scalar extract. */
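/* For example, for a plain V8HI reduction with reduc_fn == IFN_LAST, the
   whole-vector shift scheme below costs exact_log2 (8) * 2 = 6 vector
   statements plus one extract, whereas the extract-based fallback costs
   8 + 7 = 15 vector statements.  */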
3817 if (!loop || !nested_in_vect_loop_p (loop, orig_stmt_info))
3819 if (reduc_fn != IFN_LAST)
3821 if (reduction_type == COND_REDUCTION)
3823 /* An EQ stmt and a COND_EXPR stmt. */
3824 epilogue_cost += record_stmt_cost (cost_vec, 2,
3825 vector_stmt, stmt_info, 0,
3826 vect_epilogue);
3827 /* Reduction of the max index and a reduction of the found
3828 values. */
3829 epilogue_cost += record_stmt_cost (cost_vec, 2,
3830 vec_to_scalar, stmt_info, 0,
3831 vect_epilogue);
3832 /* A broadcast of the max value. */
3833 epilogue_cost += record_stmt_cost (cost_vec, 1,
3834 scalar_to_vec, stmt_info, 0,
3835 vect_epilogue);
3837 else
3839 epilogue_cost += record_stmt_cost (cost_vec, 1, vector_stmt,
3840 stmt_info, 0, vect_epilogue);
3841 epilogue_cost += record_stmt_cost (cost_vec, 1,
3842 vec_to_scalar, stmt_info, 0,
3843 vect_epilogue);
3846 else if (reduction_type == COND_REDUCTION)
3848 unsigned estimated_nunits = vect_nunits_for_cost (vectype);
3849 /* Extraction of scalar elements. */
3850 epilogue_cost += record_stmt_cost (cost_vec,
3851 2 * estimated_nunits,
3852 vec_to_scalar, stmt_info, 0,
3853 vect_epilogue);
3854 /* Scalar max reductions via COND_EXPR / MAX_EXPR. */
3855 epilogue_cost += record_stmt_cost (cost_vec,
3856 2 * estimated_nunits - 3,
3857 scalar_stmt, stmt_info, 0,
3858 vect_epilogue);
3860 else if (reduction_type == EXTRACT_LAST_REDUCTION
3861 || reduction_type == FOLD_LEFT_REDUCTION)
3862 /* No extra instructions needed in the epilogue. */
3864 else
3866 int vec_size_in_bits = tree_to_uhwi (TYPE_SIZE (vectype));
3867 tree bitsize =
3868 TYPE_SIZE (TREE_TYPE (gimple_assign_lhs (orig_stmt_info->stmt)));
3869 int element_bitsize = tree_to_uhwi (bitsize);
3870 int nelements = vec_size_in_bits / element_bitsize;
3872 if (code == COND_EXPR)
3873 code = MAX_EXPR;
3875 optab = optab_for_tree_code (code, vectype, optab_default);
3877 /* We have a whole vector shift available. */
3878 if (optab != unknown_optab
3879 && VECTOR_MODE_P (mode)
3880 && optab_handler (optab, mode) != CODE_FOR_nothing
3881 && have_whole_vector_shift (mode))
3883 /* Final reduction via vector shifts and the reduction operator.
3884 Also requires scalar extract. */
3885 epilogue_cost += record_stmt_cost (cost_vec,
3886 exact_log2 (nelements) * 2,
3887 vector_stmt, stmt_info, 0,
3888 vect_epilogue);
3889 epilogue_cost += record_stmt_cost (cost_vec, 1,
3890 vec_to_scalar, stmt_info, 0,
3891 vect_epilogue);
3893 else
3894 /* Use extracts and reduction op for final reduction. For N
3895 elements, we have N extracts and N-1 reduction ops. */
3896 epilogue_cost += record_stmt_cost (cost_vec,
3897 nelements + nelements - 1,
3898 vector_stmt, stmt_info, 0,
3899 vect_epilogue);
3903 if (dump_enabled_p ())
3904 dump_printf (MSG_NOTE,
3905 "vect_model_reduction_cost: inside_cost = %d, "
3906 "prologue_cost = %d, epilogue_cost = %d .\n", inside_cost,
3907 prologue_cost, epilogue_cost);
3911 /* Function vect_model_induction_cost.
3913 Models cost for induction operations. */
3915 static void
3916 vect_model_induction_cost (stmt_vec_info stmt_info, int ncopies,
3917 stmt_vector_for_cost *cost_vec)
3919 unsigned inside_cost, prologue_cost;
3921 if (PURE_SLP_STMT (stmt_info))
3922 return;
3924 /* loop cost for vec_loop. */
3925 inside_cost = record_stmt_cost (cost_vec, ncopies, vector_stmt,
3926 stmt_info, 0, vect_body);
3928 /* prologue cost for vec_init and vec_step. */
3929 prologue_cost = record_stmt_cost (cost_vec, 2, scalar_to_vec,
3930 stmt_info, 0, vect_prologue);
3932 if (dump_enabled_p ())
3933 dump_printf_loc (MSG_NOTE, vect_location,
3934 "vect_model_induction_cost: inside_cost = %d, "
3935 "prologue_cost = %d .\n", inside_cost, prologue_cost);
3940 /* Function get_initial_def_for_reduction
3942 Input:
3943 STMT_VINFO - a stmt that performs a reduction operation in the loop.
3944 INIT_VAL - the initial value of the reduction variable
3946 Output:
3947 ADJUSTMENT_DEF - a tree that holds a value to be added to the final result
3948 of the reduction (used for adjusting the epilog - see below).
3949 Return a vector variable, initialized according to the operation that
3950 STMT_VINFO performs. This vector will be used as the initial value
3951 of the vector of partial results.
3953 Option1 (adjust in epilog): Initialize the vector as follows:
3954 add/bit or/xor: [0,0,...,0,0]
3955 mult/bit and: [1,1,...,1,1]
3956 min/max/cond_expr: [init_val,init_val,..,init_val,init_val]
3957 and when necessary (e.g. add/mult case) let the caller know
3958 that it needs to adjust the result by init_val.
3960 Option2: Initialize the vector as follows:
3961 add/bit or/xor: [init_val,0,0,...,0]
3962 mult/bit and: [init_val,1,1,...,1]
3963 min/max/cond_expr: [init_val,init_val,...,init_val]
3964 and no adjustments are needed.
3966 For example, for the following code:
3968 s = init_val;
3969 for (i=0;i<n;i++)
3970 s = s + a[i];
3972 STMT_VINFO is 's = s + a[i]', and the reduction variable is 's'.
3973 For a vector of 4 units, we want to return either [0,0,0,init_val],
3974 or [0,0,0,0] and let the caller know that it needs to adjust
3975 the result at the end by 'init_val'.
3977 FORNOW, we use Option1 (the 'adjust in epilog' scheme) when ADJUSTMENT_DEF
3978 is not NULL, because then the initialization vector is simpler (the same
3979 element in all entries), and Option2 otherwise.
3981 A cost model should help decide between these two schemes. */
3983 tree
3984 get_initial_def_for_reduction (stmt_vec_info stmt_vinfo, tree init_val,
3985 tree *adjustment_def)
3987 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_vinfo);
3988 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
3989 tree scalar_type = TREE_TYPE (init_val);
3990 tree vectype = get_vectype_for_scalar_type (scalar_type);
3991 enum tree_code code = gimple_assign_rhs_code (stmt_vinfo->stmt);
3992 tree def_for_init;
3993 tree init_def;
3994 REAL_VALUE_TYPE real_init_val = dconst0;
3995 int int_init_val = 0;
3996 gimple_seq stmts = NULL;
3998 gcc_assert (vectype);
4000 gcc_assert (POINTER_TYPE_P (scalar_type) || INTEGRAL_TYPE_P (scalar_type)
4001 || SCALAR_FLOAT_TYPE_P (scalar_type));
4003 gcc_assert (nested_in_vect_loop_p (loop, stmt_vinfo)
4004 || loop == (gimple_bb (stmt_vinfo->stmt))->loop_father);
4006 vect_reduction_type reduction_type
4007 = STMT_VINFO_VEC_REDUCTION_TYPE (stmt_vinfo);
4009 switch (code)
4011 case WIDEN_SUM_EXPR:
4012 case DOT_PROD_EXPR:
4013 case SAD_EXPR:
4014 case PLUS_EXPR:
4015 case MINUS_EXPR:
4016 case BIT_IOR_EXPR:
4017 case BIT_XOR_EXPR:
4018 case MULT_EXPR:
4019 case BIT_AND_EXPR:
4021 /* ADJUSTMENT_DEF is NULL when called from
4022 vect_create_epilog_for_reduction to vectorize double reduction. */
4023 if (adjustment_def)
4024 *adjustment_def = init_val;
4026 if (code == MULT_EXPR)
4028 real_init_val = dconst1;
4029 int_init_val = 1;
4032 if (code == BIT_AND_EXPR)
4033 int_init_val = -1;
4035 if (SCALAR_FLOAT_TYPE_P (scalar_type))
4036 def_for_init = build_real (scalar_type, real_init_val);
4037 else
4038 def_for_init = build_int_cst (scalar_type, int_init_val);
4040 if (adjustment_def)
4041 /* Option1: the first element is '0' or '1' as well. */
4042 init_def = gimple_build_vector_from_val (&stmts, vectype,
4043 def_for_init);
4044 else if (!TYPE_VECTOR_SUBPARTS (vectype).is_constant ())
4046 /* Option2 (variable length): the first element is INIT_VAL. */
4047 init_def = gimple_build_vector_from_val (&stmts, vectype,
4048 def_for_init);
4049 init_def = gimple_build (&stmts, CFN_VEC_SHL_INSERT,
4050 vectype, init_def, init_val);
4052 else
4054 /* Option2: the first element is INIT_VAL. */
4055 tree_vector_builder elts (vectype, 1, 2);
4056 elts.quick_push (init_val);
4057 elts.quick_push (def_for_init);
4058 init_def = gimple_build_vector (&stmts, &elts);
4061 break;
4063 case MIN_EXPR:
4064 case MAX_EXPR:
4065 case COND_EXPR:
4067 if (adjustment_def)
4069 *adjustment_def = NULL_TREE;
4070 if (reduction_type != COND_REDUCTION
4071 && reduction_type != EXTRACT_LAST_REDUCTION)
4073 init_def = vect_get_vec_def_for_operand (init_val, stmt_vinfo);
4074 break;
4077 init_val = gimple_convert (&stmts, TREE_TYPE (vectype), init_val);
4078 init_def = gimple_build_vector_from_val (&stmts, vectype, init_val);
4080 break;
4082 default:
4083 gcc_unreachable ();
4086 if (stmts)
4087 gsi_insert_seq_on_edge_immediate (loop_preheader_edge (loop), stmts);
4088 return init_def;
4091 /* Get at the initial defs for the reduction PHIs in SLP_NODE.
4092 NUMBER_OF_VECTORS is the number of vector defs to create.
4093 If NEUTRAL_OP is nonnull, introducing extra elements of that
4094 value will not change the result. */
4096 static void
4097 get_initial_defs_for_reduction (slp_tree slp_node,
4098 vec<tree> *vec_oprnds,
4099 unsigned int number_of_vectors,
4100 bool reduc_chain, tree neutral_op)
4102 vec<stmt_vec_info> stmts = SLP_TREE_SCALAR_STMTS (slp_node);
4103 stmt_vec_info stmt_vinfo = stmts[0];
4104 unsigned HOST_WIDE_INT nunits;
4105 unsigned j, number_of_places_left_in_vector;
4106 tree vector_type;
4107 tree vop;
4108 int group_size = stmts.length ();
4109 unsigned int vec_num, i;
4110 unsigned number_of_copies = 1;
4111 vec<tree> voprnds;
4112 voprnds.create (number_of_vectors);
4113 struct loop *loop;
4114 auto_vec<tree, 16> permute_results;
4116 vector_type = STMT_VINFO_VECTYPE (stmt_vinfo);
4118 gcc_assert (STMT_VINFO_DEF_TYPE (stmt_vinfo) == vect_reduction_def);
4120 loop = (gimple_bb (stmt_vinfo->stmt))->loop_father;
4121 gcc_assert (loop);
4122 edge pe = loop_preheader_edge (loop);
4124 gcc_assert (!reduc_chain || neutral_op);
4126 /* NUMBER_OF_COPIES is the number of times we need to use the same values in
4127 created vectors. It is greater than 1 if unrolling is performed.
4129 For example, we have two scalar operands, s1 and s2 (e.g., group of
4130 strided accesses of size two), while NUNITS is four (i.e., four scalars
4131 of this type can be packed in a vector). The output vector will contain
4132 two copies of each scalar operand: {s1, s2, s1, s2}. (NUMBER_OF_COPIES
4133 will be 2).
4135 If REDUC_GROUP_SIZE > NUNITS, the scalars will be split into several
4136 vectors containing the operands.
4138 For example, NUNITS is four as before, and the group size is 8
4139 (s1, s2, ..., s8). We will create two vectors {s1, s2, s3, s4} and
4140 {s5, s6, s7, s8}. */
4142 if (!TYPE_VECTOR_SUBPARTS (vector_type).is_constant (&nunits))
4143 nunits = group_size;
4145 number_of_copies = nunits * number_of_vectors / group_size;
4147 number_of_places_left_in_vector = nunits;
4148 bool constant_p = true;
4149 tree_vector_builder elts (vector_type, nunits, 1);
4150 elts.quick_grow (nunits);
4151 for (j = 0; j < number_of_copies; j++)
4153 for (i = group_size - 1; stmts.iterate (i, &stmt_vinfo); i--)
4155 tree op;
4156 /* Get the def before the loop. In a reduction chain we have only
4157 one initial value. */
4158 if ((j != (number_of_copies - 1)
4159 || (reduc_chain && i != 0))
4160 && neutral_op)
4161 op = neutral_op;
4162 else
4163 op = PHI_ARG_DEF_FROM_EDGE (stmt_vinfo->stmt, pe);
4165 /* Create 'vect_ = {op0,op1,...,opn}'. */
4166 number_of_places_left_in_vector--;
4167 elts[number_of_places_left_in_vector] = op;
4168 if (!CONSTANT_CLASS_P (op))
4169 constant_p = false;
4171 if (number_of_places_left_in_vector == 0)
4173 gimple_seq ctor_seq = NULL;
4174 tree init;
4175 if (constant_p && !neutral_op
4176 ? multiple_p (TYPE_VECTOR_SUBPARTS (vector_type), nunits)
4177 : known_eq (TYPE_VECTOR_SUBPARTS (vector_type), nunits))
4178 /* Build the vector directly from ELTS. */
4179 init = gimple_build_vector (&ctor_seq, &elts);
4180 else if (neutral_op)
4182 /* Build a vector of the neutral value and shift the
4183 other elements into place. */
4184 init = gimple_build_vector_from_val (&ctor_seq, vector_type,
4185 neutral_op);
4186 int k = nunits;
4187 while (k > 0 && elts[k - 1] == neutral_op)
4188 k -= 1;
4189 while (k > 0)
4191 k -= 1;
4192 init = gimple_build (&ctor_seq, CFN_VEC_SHL_INSERT,
4193 vector_type, init, elts[k]);
4196 else
4198 /* First time round, duplicate ELTS to fill the
4199 required number of vectors, then cherry pick the
4200 appropriate result for each iteration. */
4201 if (vec_oprnds->is_empty ())
4202 duplicate_and_interleave (&ctor_seq, vector_type, elts,
4203 number_of_vectors,
4204 permute_results);
4205 init = permute_results[number_of_vectors - j - 1];
4207 if (ctor_seq != NULL)
4208 gsi_insert_seq_on_edge_immediate (pe, ctor_seq);
4209 voprnds.quick_push (init);
4211 number_of_places_left_in_vector = nunits;
4212 elts.new_vector (vector_type, nunits, 1);
4213 elts.quick_grow (nunits);
4214 constant_p = true;
4219 /* Since the vectors were created in reverse order, we now need to reverse
4220 them. */
4221 vec_num = voprnds.length ();
4222 for (j = vec_num; j != 0; j--)
4224 vop = voprnds[j - 1];
4225 vec_oprnds->quick_push (vop);
4228 voprnds.release ();
4230 /* In case that VF is greater than the unrolling factor needed for the SLP
4231 group of stmts, NUMBER_OF_VECTORS to be created is greater than
4232 NUMBER_OF_SCALARS/NUNITS or NUNITS/NUMBER_OF_SCALARS, and hence we have
4233 to replicate the vectors. */
4234 tree neutral_vec = NULL;
4235 while (number_of_vectors > vec_oprnds->length ())
4237 if (neutral_op)
4239 if (!neutral_vec)
4241 gimple_seq ctor_seq = NULL;
4242 neutral_vec = gimple_build_vector_from_val
4243 (&ctor_seq, vector_type, neutral_op);
4244 if (ctor_seq != NULL)
4245 gsi_insert_seq_on_edge_immediate (pe, ctor_seq);
4247 vec_oprnds->quick_push (neutral_vec);
4249 else
4251 for (i = 0; vec_oprnds->iterate (i, &vop) && i < vec_num; i++)
4252 vec_oprnds->quick_push (vop);
4258 /* Function vect_create_epilog_for_reduction
4260 Create code at the loop-epilog to finalize the result of a reduction
4261 computation.
4263 VECT_DEFS is list of vector of partial results, i.e., the lhs's of vector
4264 reduction statements.
4265 STMT_INFO is the scalar reduction stmt that is being vectorized.
4266 NCOPIES is > 1 in case the vectorization factor (VF) is bigger than the
4267 number of elements that we can fit in a vectype (nunits). In this case
4268 we have to generate more than one vector stmt - i.e - we need to "unroll"
4269 the vector stmt by a factor VF/nunits. For more details see documentation
4270 in vectorizable_operation.
4271 REDUC_FN is the internal function for the epilog reduction.
4272 REDUCTION_PHIS is a list of the phi-nodes that carry the reduction
4273 computation.
4274 REDUC_INDEX is the index of the operand in the right hand side of the
4275 statement that is defined by REDUCTION_PHI.
4276 DOUBLE_REDUC is TRUE if double reduction phi nodes should be handled.
4277 SLP_NODE is an SLP node containing a group of reduction statements. The
4278 first one in this group is STMT_INFO.
4279 INDUC_VAL is for INTEGER_INDUC_COND_REDUCTION the value to use for the case
4280 when the COND_EXPR is never true in the loop. For MAX_EXPR, it needs to
4281 be smaller than any value of the IV in the loop, for MIN_EXPR larger than
4282 any value of the IV in the loop.
4283 INDUC_CODE is the code for epilog reduction if INTEGER_INDUC_COND_REDUCTION.
4284 NEUTRAL_OP is the value given by neutral_op_for_slp_reduction; it is
4285 null if this is not an SLP reduction
4287 This function:
4288 1. Creates the reduction def-use cycles: sets the arguments for
4289 REDUCTION_PHIS:
4290 The loop-entry argument is the vectorized initial-value of the reduction.
4291 The loop-latch argument is taken from VECT_DEFS - the vector of partial
4292 sums.
4293 2. "Reduces" each vector of partial results VECT_DEFS into a single result,
4294 by calling the function specified by REDUC_FN if available, or by
4295 other means (whole-vector shifts or a scalar loop).
4296 The function also creates a new phi node at the loop exit to preserve
4297 loop-closed form, as illustrated below.
4299 The flow at the entry to this function:
4301 loop:
4302 vec_def = phi <null, null> # REDUCTION_PHI
4303 VECT_DEF = vector_stmt # vectorized form of STMT_INFO
4304 s_loop = scalar_stmt # (scalar) STMT_INFO
4305 loop_exit:
4306 s_out0 = phi <s_loop> # (scalar) EXIT_PHI
4307 use <s_out0>
4308 use <s_out0>
4310 The above is transformed by this function into:
4312 loop:
4313 vec_def = phi <vec_init, VECT_DEF> # REDUCTION_PHI
4314 VECT_DEF = vector_stmt # vectorized form of STMT_INFO
4315 s_loop = scalar_stmt # (scalar) STMT_INFO
4316 loop_exit:
4317 s_out0 = phi <s_loop> # (scalar) EXIT_PHI
4318 v_out1 = phi <VECT_DEF> # NEW_EXIT_PHI
4319 v_out2 = reduce <v_out1>
4320 s_out3 = extract_field <v_out2, 0>
4321 s_out4 = adjust_result <s_out3>
4322 use <s_out4>
4323 use <s_out4>
4326 static void
4327 vect_create_epilog_for_reduction (vec<tree> vect_defs,
4328 stmt_vec_info stmt_info,
4329 gimple *reduc_def_stmt,
4330 int ncopies, internal_fn reduc_fn,
4331 vec<stmt_vec_info> reduction_phis,
4332 bool double_reduc,
4333 slp_tree slp_node,
4334 slp_instance slp_node_instance,
4335 tree induc_val, enum tree_code induc_code,
4336 tree neutral_op)
4338 stmt_vec_info prev_phi_info;
4339 tree vectype;
4340 machine_mode mode;
4341 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
4342 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo), *outer_loop = NULL;
4343 basic_block exit_bb;
4344 tree scalar_dest;
4345 tree scalar_type;
4346 gimple *new_phi = NULL, *phi;
4347 stmt_vec_info phi_info;
4348 gimple_stmt_iterator exit_gsi;
4349 tree vec_dest;
4350 tree new_temp = NULL_TREE, new_dest, new_name, new_scalar_dest;
4351 gimple *epilog_stmt = NULL;
4352 enum tree_code code = gimple_assign_rhs_code (stmt_info->stmt);
4353 gimple *exit_phi;
4354 tree bitsize;
4355 tree adjustment_def = NULL;
4356 tree vec_initial_def = NULL;
4357 tree expr, def, initial_def = NULL;
4358 tree orig_name, scalar_result;
4359 imm_use_iterator imm_iter, phi_imm_iter;
4360 use_operand_p use_p, phi_use_p;
4361 gimple *use_stmt;
4362 stmt_vec_info reduction_phi_info = NULL;
4363 bool nested_in_vect_loop = false;
4364 auto_vec<gimple *> new_phis;
4365 auto_vec<stmt_vec_info> inner_phis;
4366 int j, i;
4367 auto_vec<tree> scalar_results;
4368 unsigned int group_size = 1, k, ratio;
4369 auto_vec<tree> vec_initial_defs;
4370 auto_vec<gimple *> phis;
4371 bool slp_reduc = false;
4372 bool direct_slp_reduc;
4373 tree new_phi_result;
4374 stmt_vec_info inner_phi = NULL;
4375 tree induction_index = NULL_TREE;
4377 if (slp_node)
4378 group_size = SLP_TREE_SCALAR_STMTS (slp_node).length ();
4380 if (nested_in_vect_loop_p (loop, stmt_info))
4382 outer_loop = loop;
4383 loop = loop->inner;
4384 nested_in_vect_loop = true;
4385 gcc_assert (!slp_node);
4388 vectype = STMT_VINFO_VECTYPE (stmt_info);
4389 gcc_assert (vectype);
4390 mode = TYPE_MODE (vectype);
4392 /* 1. Create the reduction def-use cycle:
4393 Set the arguments of REDUCTION_PHIS, i.e., transform
4395 loop:
4396 vec_def = phi <null, null> # REDUCTION_PHI
4397 VECT_DEF = vector_stmt # vectorized form of STMT
4400 into:
4402 loop:
4403 vec_def = phi <vec_init, VECT_DEF> # REDUCTION_PHI
4404 VECT_DEF = vector_stmt # vectorized form of STMT
4407 (in case of SLP, do it for all the phis). */
4409 /* Get the loop-entry arguments. */
4410 enum vect_def_type initial_def_dt = vect_unknown_def_type;
4411 if (slp_node)
4413 unsigned vec_num = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node);
4414 vec_initial_defs.reserve (vec_num);
4415 get_initial_defs_for_reduction (slp_node_instance->reduc_phis,
4416 &vec_initial_defs, vec_num,
4417 REDUC_GROUP_FIRST_ELEMENT (stmt_info),
4418 neutral_op);
4420 else
4422 /* Get at the scalar def before the loop, that defines the initial value
4423 of the reduction variable. */
4424 initial_def = PHI_ARG_DEF_FROM_EDGE (reduc_def_stmt,
4425 loop_preheader_edge (loop));
4426 /* Optimize: if initial_def is for REDUC_MAX smaller than the base
4427 and we can't use zero for induc_val, use initial_def. Similarly
4428 for REDUC_MIN and initial_def larger than the base. */
4429 if (TREE_CODE (initial_def) == INTEGER_CST
4430 && (STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info)
4431 == INTEGER_INDUC_COND_REDUCTION)
4432 && !integer_zerop (induc_val)
4433 && ((induc_code == MAX_EXPR
4434 && tree_int_cst_lt (initial_def, induc_val))
4435 || (induc_code == MIN_EXPR
4436 && tree_int_cst_lt (induc_val, initial_def))))
4437 induc_val = initial_def;
4439 if (double_reduc)
4440 /* In case of double reduction we only create a vector variable
4441 to be put in the reduction phi node. The actual statement
4442 creation is done later in this function. */
4443 vec_initial_def = vect_create_destination_var (initial_def, vectype);
4444 else if (nested_in_vect_loop)
4446 /* Do not use an adjustment def as that case is not supported
4447 correctly if ncopies is not one. */
4448 vect_is_simple_use (initial_def, loop_vinfo, &initial_def_dt);
4449 vec_initial_def = vect_get_vec_def_for_operand (initial_def,
4450 stmt_info);
4452 else
4453 vec_initial_def
4454 = get_initial_def_for_reduction (stmt_info, initial_def,
4455 &adjustment_def);
4456 vec_initial_defs.create (1);
4457 vec_initial_defs.quick_push (vec_initial_def);
4460 /* Set phi nodes arguments. */
4461 FOR_EACH_VEC_ELT (reduction_phis, i, phi_info)
4463 tree vec_init_def = vec_initial_defs[i];
4464 tree def = vect_defs[i];
4465 for (j = 0; j < ncopies; j++)
4467 if (j != 0)
4469 phi_info = STMT_VINFO_RELATED_STMT (phi_info);
4470 if (nested_in_vect_loop)
4471 vec_init_def
4472 = vect_get_vec_def_for_stmt_copy (loop_vinfo, vec_init_def);
4475 /* Set the loop-entry arg of the reduction-phi. */
4477 gphi *phi = as_a <gphi *> (phi_info->stmt);
4478 if (STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info)
4479 == INTEGER_INDUC_COND_REDUCTION)
4481 /* Initialise the reduction phi to zero. This prevents non-zero
4482 initial values from interfering with the reduction op. */
4483 gcc_assert (ncopies == 1);
4484 gcc_assert (i == 0);
4486 tree vec_init_def_type = TREE_TYPE (vec_init_def);
4487 tree induc_val_vec
4488 = build_vector_from_val (vec_init_def_type, induc_val);
4490 add_phi_arg (phi, induc_val_vec, loop_preheader_edge (loop),
4491 UNKNOWN_LOCATION);
4493 else
4494 add_phi_arg (phi, vec_init_def, loop_preheader_edge (loop),
4495 UNKNOWN_LOCATION);
4497 /* Set the loop-latch arg for the reduction-phi. */
4498 if (j > 0)
4499 def = vect_get_vec_def_for_stmt_copy (loop_vinfo, def);
4501 add_phi_arg (phi, def, loop_latch_edge (loop), UNKNOWN_LOCATION);
4503 if (dump_enabled_p ())
4504 dump_printf_loc (MSG_NOTE, vect_location,
4505 "transform reduction: created def-use cycle: %G%G",
4506 phi, SSA_NAME_DEF_STMT (def));
4510 /* For cond reductions we want to create a new vector (INDEX_COND_EXPR)
4511 which is updated with the current index of the loop for every match of
4512 the original loop's cond_expr (VEC_STMT). This results in a vector
4513 containing the last time the condition passed for that vector lane.
4514 The first match will be a 1 to allow 0 to be used for non-matching
4515 indexes. If there are no matches at all then the vector will be all
4516 zeroes. */
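/* For example, for V4SI the phi starts as {0, 0, 0, 0}; if lanes 1 and 3
   match in the first vector iteration (indexes {1, 2, 3, 4}) it becomes
   {0, 2, 0, 4}, and if only lane 2 matches in the second iteration
   (indexes {5, 6, 7, 8}) it becomes {0, 2, 7, 4}, i.e. each lane records
   the last index at which its condition held, with 0 meaning never.  */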
4517 if (STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info) == COND_REDUCTION)
4519 tree indx_before_incr, indx_after_incr;
4520 poly_uint64 nunits_out = TYPE_VECTOR_SUBPARTS (vectype);
4522 gimple *vec_stmt = STMT_VINFO_VEC_STMT (stmt_info)->stmt;
4523 gcc_assert (gimple_assign_rhs_code (vec_stmt) == VEC_COND_EXPR);
4525 int scalar_precision
4526 = GET_MODE_PRECISION (SCALAR_TYPE_MODE (TREE_TYPE (vectype)));
4527 tree cr_index_scalar_type = make_unsigned_type (scalar_precision);
4528 tree cr_index_vector_type = build_vector_type
4529 (cr_index_scalar_type, TYPE_VECTOR_SUBPARTS (vectype));
4531 /* First we create a simple vector induction variable which starts
4532 with the values {1,2,3,...} (SERIES_VECT) and increments by the
4533 vector size (STEP). */
4535 /* Create a {1,2,3,...} vector. */
4536 tree series_vect = build_index_vector (cr_index_vector_type, 1, 1);
4538 /* Create a vector of the step value. */
4539 tree step = build_int_cst (cr_index_scalar_type, nunits_out);
4540 tree vec_step = build_vector_from_val (cr_index_vector_type, step);
4542 /* Create an induction variable. */
4543 gimple_stmt_iterator incr_gsi;
4544 bool insert_after;
4545 standard_iv_increment_position (loop, &incr_gsi, &insert_after);
4546 create_iv (series_vect, vec_step, NULL_TREE, loop, &incr_gsi,
4547 insert_after, &indx_before_incr, &indx_after_incr);
4549 /* Next create a new phi node vector (NEW_PHI_TREE) which starts
4550 filled with zeros (VEC_ZERO). */
4552 /* Create a vector of 0s. */
4553 tree zero = build_zero_cst (cr_index_scalar_type);
4554 tree vec_zero = build_vector_from_val (cr_index_vector_type, zero);
4556 /* Create a vector phi node. */
4557 tree new_phi_tree = make_ssa_name (cr_index_vector_type);
4558 new_phi = create_phi_node (new_phi_tree, loop->header);
4559 loop_vinfo->add_stmt (new_phi);
4560 add_phi_arg (as_a <gphi *> (new_phi), vec_zero,
4561 loop_preheader_edge (loop), UNKNOWN_LOCATION);
4563 /* Now take the condition from the loop's original cond_expr
4564 (VEC_STMT) and produce a new cond_expr (INDEX_COND_EXPR) which for
4565 every match uses values from the induction variable
4566 (INDEX_BEFORE_INCR) otherwise uses values from the phi node
4567 (NEW_PHI_TREE).
4568 Finally, we update the phi (NEW_PHI_TREE) to take the value of
4569 the new cond_expr (INDEX_COND_EXPR). */
4571 /* Duplicate the condition from vec_stmt. */
4572 tree ccompare = unshare_expr (gimple_assign_rhs1 (vec_stmt));
4574 /* Create a conditional, where the condition is taken from vec_stmt
4575 (CCOMPARE), then is the induction index (INDEX_BEFORE_INCR) and
4576 else is the phi (NEW_PHI_TREE). */
4577 tree index_cond_expr = build3 (VEC_COND_EXPR, cr_index_vector_type,
4578 ccompare, indx_before_incr,
4579 new_phi_tree);
4580 induction_index = make_ssa_name (cr_index_vector_type);
4581 gimple *index_condition = gimple_build_assign (induction_index,
4582 index_cond_expr);
4583 gsi_insert_before (&incr_gsi, index_condition, GSI_SAME_STMT);
4584 stmt_vec_info index_vec_info = loop_vinfo->add_stmt (index_condition);
4585 STMT_VINFO_VECTYPE (index_vec_info) = cr_index_vector_type;
4587 /* Update the phi with the vec cond. */
4588 add_phi_arg (as_a <gphi *> (new_phi), induction_index,
4589 loop_latch_edge (loop), UNKNOWN_LOCATION);
4592 /* 2. Create epilog code.
4593 The reduction epilog code operates across the elements of the vector
4594 of partial results computed by the vectorized loop.
4595 The reduction epilog code consists of:
4597 step 1: compute the scalar result in a vector (v_out2)
4598 step 2: extract the scalar result (s_out3) from the vector (v_out2)
4599 step 3: adjust the scalar result (s_out3) if needed.
4601 Step 1 can be accomplished using one of the following three schemes:
4602 (scheme 1) using reduc_fn, if available.
4603 (scheme 2) using whole-vector shifts, if available.
4604 (scheme 3) using a scalar loop. In this case steps 1+2 above are
4605 combined.
4607 The overall epilog code looks like this:
4609 s_out0 = phi <s_loop> # original EXIT_PHI
4610 v_out1 = phi <VECT_DEF> # NEW_EXIT_PHI
4611 v_out2 = reduce <v_out1> # step 1
4612 s_out3 = extract_field <v_out2, 0> # step 2
4613 s_out4 = adjust_result <s_out3> # step 3
4615 (step 3 is optional, and steps 1 and 2 may be combined).
4616 Lastly, the uses of s_out0 are replaced by s_out4. */
4619 /* 2.1 Create new loop-exit-phis to preserve loop-closed form:
4620 v_out1 = phi <VECT_DEF>
4621 Store them in NEW_PHIS. */
4623 exit_bb = single_exit (loop)->dest;
4624 prev_phi_info = NULL;
4625 new_phis.create (vect_defs.length ());
4626 FOR_EACH_VEC_ELT (vect_defs, i, def)
4628 for (j = 0; j < ncopies; j++)
4630 tree new_def = copy_ssa_name (def);
4631 phi = create_phi_node (new_def, exit_bb);
4632 stmt_vec_info phi_info = loop_vinfo->add_stmt (phi);
4633 if (j == 0)
4634 new_phis.quick_push (phi);
4635 else
4637 def = vect_get_vec_def_for_stmt_copy (loop_vinfo, def);
4638 STMT_VINFO_RELATED_STMT (prev_phi_info) = phi_info;
4641 SET_PHI_ARG_DEF (phi, single_exit (loop)->dest_idx, def);
4642 prev_phi_info = phi_info;
4646 /* The epilogue is created for the outer-loop, i.e., for the loop being
4647 vectorized. Create exit phis for the outer loop. */
4648 if (double_reduc)
4650 loop = outer_loop;
4651 exit_bb = single_exit (loop)->dest;
4652 inner_phis.create (vect_defs.length ());
4653 FOR_EACH_VEC_ELT (new_phis, i, phi)
4655 stmt_vec_info phi_info = loop_vinfo->lookup_stmt (phi);
4656 tree new_result = copy_ssa_name (PHI_RESULT (phi));
4657 gphi *outer_phi = create_phi_node (new_result, exit_bb);
4658 SET_PHI_ARG_DEF (outer_phi, single_exit (loop)->dest_idx,
4659 PHI_RESULT (phi));
4660 prev_phi_info = loop_vinfo->add_stmt (outer_phi);
4661 inner_phis.quick_push (phi_info);
4662 new_phis[i] = outer_phi;
4663 while (STMT_VINFO_RELATED_STMT (phi_info))
4665 phi_info = STMT_VINFO_RELATED_STMT (phi_info);
4666 new_result = copy_ssa_name (PHI_RESULT (phi_info->stmt));
4667 outer_phi = create_phi_node (new_result, exit_bb);
4668 SET_PHI_ARG_DEF (outer_phi, single_exit (loop)->dest_idx,
4669 PHI_RESULT (phi_info->stmt));
4670 stmt_vec_info outer_phi_info = loop_vinfo->add_stmt (outer_phi);
4671 STMT_VINFO_RELATED_STMT (prev_phi_info) = outer_phi_info;
4672 prev_phi_info = outer_phi_info;
4677 exit_gsi = gsi_after_labels (exit_bb);
4679 /* 2.2 Get the relevant tree-code to use in the epilog for schemes 2,3
4680 (i.e. when reduc_fn is not available) and in the final adjustment
4681 code (if needed). Also get the original scalar reduction variable as
4682 defined in the loop. In case STMT is a "pattern-stmt" (i.e. - it
4683 represents a reduction pattern), the tree-code and scalar-def are
4684 taken from the original stmt that the pattern-stmt (STMT) replaces.
4685 Otherwise (it is a regular reduction) - the tree-code and scalar-def
4686 are taken from STMT. */
4688 stmt_vec_info orig_stmt_info = vect_orig_stmt (stmt_info);
4689 if (orig_stmt_info != stmt_info)
4691 /* Reduction pattern */
4692 gcc_assert (STMT_VINFO_IN_PATTERN_P (orig_stmt_info));
4693 gcc_assert (STMT_VINFO_RELATED_STMT (orig_stmt_info) == stmt_info);
4696 code = gimple_assign_rhs_code (orig_stmt_info->stmt);
4697 /* For MINUS_EXPR the initial vector is [init_val,0,...,0], therefore,
4698 partial results are added and not subtracted. */
4699 if (code == MINUS_EXPR)
4700 code = PLUS_EXPR;
4702 scalar_dest = gimple_assign_lhs (orig_stmt_info->stmt);
4703 scalar_type = TREE_TYPE (scalar_dest);
4704 scalar_results.create (group_size);
4705 new_scalar_dest = vect_create_destination_var (scalar_dest, NULL);
4706 bitsize = TYPE_SIZE (scalar_type);
4708 /* In case this is a reduction in an inner-loop while vectorizing an outer
4709 loop - we don't need to extract a single scalar result at the end of the
4710 inner-loop (unless it is double reduction, i.e., the use of reduction is
4711 outside the outer-loop). The final vector of partial results will be used
4712 in the vectorized outer-loop, or reduced to a scalar result at the end of
4713 the outer-loop. */
4714 if (nested_in_vect_loop && !double_reduc)
4715 goto vect_finalize_reduction;
4717 /* SLP reduction without reduction chain, e.g.,
4718 # a1 = phi <a2, a0>
4719 # b1 = phi <b2, b0>
4720 a2 = operation (a1)
4721 b2 = operation (b1) */
4722 slp_reduc = (slp_node && !REDUC_GROUP_FIRST_ELEMENT (stmt_info));
4724 /* True if we should implement SLP_REDUC using native reduction operations
4725 instead of scalar operations. */
4726 direct_slp_reduc = (reduc_fn != IFN_LAST
4727 && slp_reduc
4728 && !TYPE_VECTOR_SUBPARTS (vectype).is_constant ());
4730 /* In case of reduction chain, e.g.,
4731 # a1 = phi <a3, a0>
4732 a2 = operation (a1)
4733 a3 = operation (a2),
4735 we may end up with more than one vector result. Here we reduce them to
4736 one vector. */
4737 if (REDUC_GROUP_FIRST_ELEMENT (stmt_info) || direct_slp_reduc)
4739 tree first_vect = PHI_RESULT (new_phis[0]);
4740 gassign *new_vec_stmt = NULL;
4741 vec_dest = vect_create_destination_var (scalar_dest, vectype);
4742 for (k = 1; k < new_phis.length (); k++)
4744 gimple *next_phi = new_phis[k];
4745 tree second_vect = PHI_RESULT (next_phi);
4746 tree tem = make_ssa_name (vec_dest, new_vec_stmt);
4747 new_vec_stmt = gimple_build_assign (tem, code,
4748 first_vect, second_vect);
4749 gsi_insert_before (&exit_gsi, new_vec_stmt, GSI_SAME_STMT);
4750 first_vect = tem;
4753 new_phi_result = first_vect;
4754 if (new_vec_stmt)
4756 new_phis.truncate (0);
4757 new_phis.safe_push (new_vec_stmt);
4760 /* Likewise if we couldn't use a single def-use cycle. */
4761 else if (ncopies > 1)
4763 gcc_assert (new_phis.length () == 1);
4764 tree first_vect = PHI_RESULT (new_phis[0]);
4765 gassign *new_vec_stmt = NULL;
4766 vec_dest = vect_create_destination_var (scalar_dest, vectype);
4767 stmt_vec_info next_phi_info = loop_vinfo->lookup_stmt (new_phis[0]);
4768 for (int k = 1; k < ncopies; ++k)
4770 next_phi_info = STMT_VINFO_RELATED_STMT (next_phi_info);
4771 tree second_vect = PHI_RESULT (next_phi_info->stmt);
4772 tree tem = make_ssa_name (vec_dest, new_vec_stmt);
4773 new_vec_stmt = gimple_build_assign (tem, code,
4774 first_vect, second_vect);
4775 gsi_insert_before (&exit_gsi, new_vec_stmt, GSI_SAME_STMT);
4776 first_vect = tem;
4778 new_phi_result = first_vect;
4779 new_phis.truncate (0);
4780 new_phis.safe_push (new_vec_stmt);
4782 else
4783 new_phi_result = PHI_RESULT (new_phis[0]);
4785 if (STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info) == COND_REDUCTION
4786 && reduc_fn != IFN_LAST)
4788 /* For condition reductions, we have a vector (NEW_PHI_RESULT) containing
4789 various data values where the condition matched and another vector
4790 (INDUCTION_INDEX) containing all the indexes of those matches. We
4791 need to extract the last matching index (which will be the index with
4792 highest value) and use this to index into the data vector.
4793 For the case where there were no matches, the data vector will contain
4794 all default values and the index vector will be all zeros. */
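/* For example, continuing the earlier example, if INDUCTION_INDEX is
   {0, 2, 7, 4} the max index is 7; comparing {7, 7, 7, 7} against the
   index vector selects only lane 2, so the VEC_COND below keeps that
   lane of the data vector (zero elsewhere) and the final max reduction
   extracts that lane's data value.  */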
4796 /* Get various versions of the type of the vector of indexes. */
4797 tree index_vec_type = TREE_TYPE (induction_index);
4798 gcc_checking_assert (TYPE_UNSIGNED (index_vec_type));
4799 tree index_scalar_type = TREE_TYPE (index_vec_type);
4800 tree index_vec_cmp_type = build_same_sized_truth_vector_type
4801 (index_vec_type);
4803 /* Get an unsigned integer version of the type of the data vector. */
4804 int scalar_precision
4805 = GET_MODE_PRECISION (SCALAR_TYPE_MODE (scalar_type));
4806 tree scalar_type_unsigned = make_unsigned_type (scalar_precision);
4807 tree vectype_unsigned = build_vector_type
4808 (scalar_type_unsigned, TYPE_VECTOR_SUBPARTS (vectype));
4810 /* First we need to create a vector (ZERO_VEC) of zeros and another
4811 vector (MAX_INDEX_VEC) filled with the last matching index, which we
4812 can create using a MAX reduction and then expanding.
4813 In the case where the loop never made any matches, the max index will
4814 be zero. */
4816 /* Vector of {0, 0, 0,...}. */
4817 tree zero_vec = make_ssa_name (vectype);
4818 tree zero_vec_rhs = build_zero_cst (vectype);
4819 gimple *zero_vec_stmt = gimple_build_assign (zero_vec, zero_vec_rhs);
4820 gsi_insert_before (&exit_gsi, zero_vec_stmt, GSI_SAME_STMT);
4822 /* Find maximum value from the vector of found indexes. */
4823 tree max_index = make_ssa_name (index_scalar_type);
4824 gcall *max_index_stmt = gimple_build_call_internal (IFN_REDUC_MAX,
4825 1, induction_index);
4826 gimple_call_set_lhs (max_index_stmt, max_index);
4827 gsi_insert_before (&exit_gsi, max_index_stmt, GSI_SAME_STMT);
4829 /* Vector of {max_index, max_index, max_index,...}. */
4830 tree max_index_vec = make_ssa_name (index_vec_type);
4831 tree max_index_vec_rhs = build_vector_from_val (index_vec_type,
4832 max_index);
4833 gimple *max_index_vec_stmt = gimple_build_assign (max_index_vec,
4834 max_index_vec_rhs);
4835 gsi_insert_before (&exit_gsi, max_index_vec_stmt, GSI_SAME_STMT);
4837 /* Next we compare the new vector (MAX_INDEX_VEC) full of max indexes
4838 with the vector (INDUCTION_INDEX) of found indexes, choosing values
4839 from the data vector (NEW_PHI_RESULT) for matches, 0 (ZERO_VEC)
4840 otherwise. Only one value should match, resulting in a vector
4841 (VEC_COND) with one data value and the rest zeros.
4842 In the case where the loop never made any matches, every index will
4843 match, resulting in a vector with all data values (which will all be
4844 the default value). */
4846 /* Compare the max index vector to the vector of found indexes to find
4847 the position of the max value. */
4848 tree vec_compare = make_ssa_name (index_vec_cmp_type);
4849 gimple *vec_compare_stmt = gimple_build_assign (vec_compare, EQ_EXPR,
4850 induction_index,
4851 max_index_vec);
4852 gsi_insert_before (&exit_gsi, vec_compare_stmt, GSI_SAME_STMT);
4854 /* Use the compare to choose either values from the data vector or
4855 zero. */
4856 tree vec_cond = make_ssa_name (vectype);
4857 gimple *vec_cond_stmt = gimple_build_assign (vec_cond, VEC_COND_EXPR,
4858 vec_compare, new_phi_result,
4859 zero_vec);
4860 gsi_insert_before (&exit_gsi, vec_cond_stmt, GSI_SAME_STMT);
4862 /* Finally we need to extract the data value from the vector (VEC_COND)
4863 into a scalar (MATCHED_DATA_REDUC). Logically we want to do an OR
4864 reduction, but because this doesn't exist, we can use a MAX reduction
4865 instead. The data value might be signed or a float so we need to cast
4866 it first.
4867 In the case where the loop never made any matches, the data values are
4868 all identical, and so will reduce down correctly. */
4870 /* Make the matched data values unsigned. */
4871 tree vec_cond_cast = make_ssa_name (vectype_unsigned);
4872 tree vec_cond_cast_rhs = build1 (VIEW_CONVERT_EXPR, vectype_unsigned,
4873 vec_cond);
4874 gimple *vec_cond_cast_stmt = gimple_build_assign (vec_cond_cast,
4875 VIEW_CONVERT_EXPR,
4876 vec_cond_cast_rhs);
4877 gsi_insert_before (&exit_gsi, vec_cond_cast_stmt, GSI_SAME_STMT);
4879 /* Reduce down to a scalar value. */
4880 tree data_reduc = make_ssa_name (scalar_type_unsigned);
4881 gcall *data_reduc_stmt = gimple_build_call_internal (IFN_REDUC_MAX,
4882 1, vec_cond_cast);
4883 gimple_call_set_lhs (data_reduc_stmt, data_reduc);
4884 gsi_insert_before (&exit_gsi, data_reduc_stmt, GSI_SAME_STMT);
4886 /* Convert the reduced value back to the result type and set as the
4887 result. */
4888 gimple_seq stmts = NULL;
4889 new_temp = gimple_build (&stmts, VIEW_CONVERT_EXPR, scalar_type,
4890 data_reduc);
4891 gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
4892 scalar_results.safe_push (new_temp);
4894 else if (STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info) == COND_REDUCTION
4895 && reduc_fn == IFN_LAST)
4897 /* Condition reduction without supported IFN_REDUC_MAX. Generate
4898 idx = 0;
4899 idx_val = induction_index[0];
4900 val = data_reduc[0];
4901 for (idx = 0, val = init, i = 0; i < nelts; ++i)
4902 if (induction_index[i] > idx_val)
4903 val = data_reduc[i], idx_val = induction_index[i];
4904 return val; */
4906 tree data_eltype = TREE_TYPE (TREE_TYPE (new_phi_result));
4907 tree idx_eltype = TREE_TYPE (TREE_TYPE (induction_index));
4908 unsigned HOST_WIDE_INT el_size = tree_to_uhwi (TYPE_SIZE (idx_eltype));
4909 poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (TREE_TYPE (induction_index));
4910 /* Enforced by vectorizable_reduction, which ensures we have target
4911 support before allowing a conditional reduction on variable-length
4912 vectors. */
4913 unsigned HOST_WIDE_INT v_size = el_size * nunits.to_constant ();
4914 tree idx_val = NULL_TREE, val = NULL_TREE;
4915 for (unsigned HOST_WIDE_INT off = 0; off < v_size; off += el_size)
4917 tree old_idx_val = idx_val;
4918 tree old_val = val;
4919 idx_val = make_ssa_name (idx_eltype);
4920 epilog_stmt = gimple_build_assign (idx_val, BIT_FIELD_REF,
4921 build3 (BIT_FIELD_REF, idx_eltype,
4922 induction_index,
4923 bitsize_int (el_size),
4924 bitsize_int (off)));
4925 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
4926 val = make_ssa_name (data_eltype);
4927 epilog_stmt = gimple_build_assign (val, BIT_FIELD_REF,
4928 build3 (BIT_FIELD_REF,
4929 data_eltype,
4930 new_phi_result,
4931 bitsize_int (el_size),
4932 bitsize_int (off)));
4933 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
4934 if (off != 0)
4936 tree new_idx_val = idx_val;
4937 tree new_val = val;
4938 if (off != v_size - el_size)
4940 new_idx_val = make_ssa_name (idx_eltype);
4941 epilog_stmt = gimple_build_assign (new_idx_val,
4942 MAX_EXPR, idx_val,
4943 old_idx_val);
4944 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
4946 new_val = make_ssa_name (data_eltype);
4947 epilog_stmt = gimple_build_assign (new_val,
4948 COND_EXPR,
4949 build2 (GT_EXPR,
4950 boolean_type_node,
4951 idx_val,
4952 old_idx_val),
4953 val, old_val);
4954 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
4955 idx_val = new_idx_val;
4956 val = new_val;
4959 /* Convert the reduced value back to the result type and set as the
4960 result. */
4961 gimple_seq stmts = NULL;
4962 val = gimple_convert (&stmts, scalar_type, val);
4963 gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
4964 scalar_results.safe_push (val);
4967 /* 2.3 Create the reduction code, using one of the three schemes described
4968 above. In SLP we simply need to extract all the elements from the
4969 vector (without reducing them), so we use scalar shifts. */
4970 else if (reduc_fn != IFN_LAST && !slp_reduc)
4972 tree tmp;
4973 tree vec_elem_type;
4975 /* Case 1: Create:
4976 v_out2 = reduc_expr <v_out1> */
4978 if (dump_enabled_p ())
4979 dump_printf_loc (MSG_NOTE, vect_location,
4980 "Reduce using direct vector reduction.\n");
4982 vec_elem_type = TREE_TYPE (TREE_TYPE (new_phi_result));
4983 if (!useless_type_conversion_p (scalar_type, vec_elem_type))
4985 tree tmp_dest
4986 = vect_create_destination_var (scalar_dest, vec_elem_type);
4987 epilog_stmt = gimple_build_call_internal (reduc_fn, 1,
4988 new_phi_result);
4989 gimple_set_lhs (epilog_stmt, tmp_dest);
4990 new_temp = make_ssa_name (tmp_dest, epilog_stmt);
4991 gimple_set_lhs (epilog_stmt, new_temp);
4992 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
4994 epilog_stmt = gimple_build_assign (new_scalar_dest, NOP_EXPR,
4995 new_temp);
4997 else
4999 epilog_stmt = gimple_build_call_internal (reduc_fn, 1,
5000 new_phi_result);
5001 gimple_set_lhs (epilog_stmt, new_scalar_dest);
5004 new_temp = make_ssa_name (new_scalar_dest, epilog_stmt);
5005 gimple_set_lhs (epilog_stmt, new_temp);
5006 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5008 if ((STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info)
5009 == INTEGER_INDUC_COND_REDUCTION)
5010 && !operand_equal_p (initial_def, induc_val, 0))
5012 /* Earlier we set the initial value to be a vector of induc_val
5013 values. Check the result and if it is induc_val then replace
5014 with the original initial value, unless induc_val is
5015 the same as initial_def already. */
5016 tree zcompare = build2 (EQ_EXPR, boolean_type_node, new_temp,
5017 induc_val);
5019 tmp = make_ssa_name (new_scalar_dest);
5020 epilog_stmt = gimple_build_assign (tmp, COND_EXPR, zcompare,
5021 initial_def, new_temp);
5022 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5023 new_temp = tmp;
5026 scalar_results.safe_push (new_temp);
5028 else if (direct_slp_reduc)
5030 /* Here we create one vector for each of the REDUC_GROUP_SIZE results,
5031 with the elements for other SLP statements replaced with the
5032 neutral value. We can then do a normal reduction on each vector. */
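/* For example, with a group of two reductions a and b and a vector of
   partial results {a0, b0, a1, b1}, the masked index vector built below is
   {0, 1, 0, 1}; for i == 0 the comparison selects lanes 0 and 2, the
   VEC_COND_EXPR gives {a0, neutral, a1, neutral}, and the full-vector
   reduction of that yields the scalar result for a alone.  */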
5034 /* Enforced by vectorizable_reduction. */
5035 gcc_assert (new_phis.length () == 1);
5036 gcc_assert (pow2p_hwi (group_size));
5038 slp_tree orig_phis_slp_node = slp_node_instance->reduc_phis;
5039 vec<stmt_vec_info> orig_phis
5040 = SLP_TREE_SCALAR_STMTS (orig_phis_slp_node);
5041 gimple_seq seq = NULL;
5043 /* Build a vector {0, 1, 2, ...}, with the same number of elements
5044 and the same element size as VECTYPE. */
5045 tree index = build_index_vector (vectype, 0, 1);
5046 tree index_type = TREE_TYPE (index);
5047 tree index_elt_type = TREE_TYPE (index_type);
5048 tree mask_type = build_same_sized_truth_vector_type (index_type);
5050 /* Create a vector that, for each element, identifies which of
5051 the REDUC_GROUP_SIZE results should use it. */
5052 tree index_mask = build_int_cst (index_elt_type, group_size - 1);
5053 index = gimple_build (&seq, BIT_AND_EXPR, index_type, index,
5054 build_vector_from_val (index_type, index_mask));
5056 /* Get a neutral vector value. This is simply a splat of the neutral
5057 scalar value if we have one, otherwise the initial scalar value
5058 is itself a neutral value. */
5059 tree vector_identity = NULL_TREE;
5060 if (neutral_op)
5061 vector_identity = gimple_build_vector_from_val (&seq, vectype,
5062 neutral_op);
5063 for (unsigned int i = 0; i < group_size; ++i)
5065 /* If there's no universal neutral value, we can use the
5066 initial scalar value from the original PHI. This is used
5067 for MIN and MAX reduction, for example. */
5068 if (!neutral_op)
5070 tree scalar_value
5071 = PHI_ARG_DEF_FROM_EDGE (orig_phis[i]->stmt,
5072 loop_preheader_edge (loop));
5073 vector_identity = gimple_build_vector_from_val (&seq, vectype,
5074 scalar_value);
5077 /* Calculate the equivalent of:
5079 sel[j] = (index[j] == i);
5081 which selects the elements of NEW_PHI_RESULT that should
5082 be included in the result. */
5083 tree compare_val = build_int_cst (index_elt_type, i);
5084 compare_val = build_vector_from_val (index_type, compare_val);
5085 tree sel = gimple_build (&seq, EQ_EXPR, mask_type,
5086 index, compare_val);
5088 /* Calculate the equivalent of:
5090 vec = sel ? new_phi_result : vector_identity;
5092 VEC is now suitable for a full vector reduction. */
5093 tree vec = gimple_build (&seq, VEC_COND_EXPR, vectype,
5094 sel, new_phi_result, vector_identity);
5096 /* Do the reduction and convert it to the appropriate type. */
5097 tree scalar = gimple_build (&seq, as_combined_fn (reduc_fn),
5098 TREE_TYPE (vectype), vec);
5099 scalar = gimple_convert (&seq, scalar_type, scalar);
5100 scalar_results.safe_push (scalar);
5102 gsi_insert_seq_before (&exit_gsi, seq, GSI_SAME_STMT);
5104 else
5106 bool reduce_with_shift;
5107 tree vec_temp;
5109 /* COND reductions all do the final reduction with MAX_EXPR
5110 or MIN_EXPR. */
5111 if (code == COND_EXPR)
5113 if (STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info)
5114 == INTEGER_INDUC_COND_REDUCTION)
5115 code = induc_code;
5116 else if (STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info)
5117 == CONST_COND_REDUCTION)
5118 code = STMT_VINFO_VEC_CONST_COND_REDUC_CODE (stmt_info);
5119 else
5120 code = MAX_EXPR;
5123 /* See if the target wants to do the final (shift) reduction
5124 in a vector mode of smaller size and first reduce upper/lower
5125 halves against each other. */
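/* For example, if the target's split_reduction hook prefers V2DI for a
   V4DI reduction, the loop below extracts the low and high V2DI halves,
   combines them with CODE, and only then falls through to the shift- or
   extract-based reduction on the narrower vector.  */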
5126 enum machine_mode mode1 = mode;
5127 tree vectype1 = vectype;
5128 unsigned sz = tree_to_uhwi (TYPE_SIZE_UNIT (vectype));
5129 unsigned sz1 = sz;
5130 if (!slp_reduc
5131 && (mode1 = targetm.vectorize.split_reduction (mode)) != mode)
5132 sz1 = GET_MODE_SIZE (mode1).to_constant ();
5134 vectype1 = get_vectype_for_scalar_type_and_size (scalar_type, sz1);
5135 reduce_with_shift = have_whole_vector_shift (mode1);
5136 if (!VECTOR_MODE_P (mode1))
5137 reduce_with_shift = false;
5138 else
5140 optab optab = optab_for_tree_code (code, vectype1, optab_default);
5141 if (optab_handler (optab, mode1) == CODE_FOR_nothing)
5142 reduce_with_shift = false;
5145 /* First reduce the vector to the vector size we should do the shift
5146 reduction on, by combining the upper and lower halves. */
5147 new_temp = new_phi_result;
5148 while (sz > sz1)
5150 gcc_assert (!slp_reduc);
5151 sz /= 2;
5152 vectype1 = get_vectype_for_scalar_type_and_size (scalar_type, sz);
5154 /* The target has to make sure we support lowpart/highpart
5155 extraction, either via direct vector extract or through
5156 integer mode punning. */
5157 tree dst1, dst2;
5158 if (convert_optab_handler (vec_extract_optab,
5159 TYPE_MODE (TREE_TYPE (new_temp)),
5160 TYPE_MODE (vectype1))
5161 != CODE_FOR_nothing)
5163 /* Extract sub-vectors directly once vec_extract becomes
5164 a conversion optab. */
5165 dst1 = make_ssa_name (vectype1);
5166 epilog_stmt
5167 = gimple_build_assign (dst1, BIT_FIELD_REF,
5168 build3 (BIT_FIELD_REF, vectype1,
5169 new_temp, TYPE_SIZE (vectype1),
5170 bitsize_int (0)));
5171 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5172 dst2 = make_ssa_name (vectype1);
5173 epilog_stmt
5174 = gimple_build_assign (dst2, BIT_FIELD_REF,
5175 build3 (BIT_FIELD_REF, vectype1,
5176 new_temp, TYPE_SIZE (vectype1),
5177 bitsize_int (sz * BITS_PER_UNIT)));
5178 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5180 else
5182 /* Extract via punning to an appropriately sized integer mode
5183 vector. */
5184 tree eltype = build_nonstandard_integer_type (sz * BITS_PER_UNIT, 1);
5186 tree etype = build_vector_type (eltype, 2);
5187 gcc_assert (convert_optab_handler (vec_extract_optab,
5188 TYPE_MODE (etype),
5189 TYPE_MODE (eltype))
5190 != CODE_FOR_nothing);
5191 tree tem = make_ssa_name (etype);
5192 epilog_stmt = gimple_build_assign (tem, VIEW_CONVERT_EXPR,
5193 build1 (VIEW_CONVERT_EXPR,
5194 etype, new_temp));
5195 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5196 new_temp = tem;
5197 tem = make_ssa_name (eltype);
5198 epilog_stmt
5199 = gimple_build_assign (tem, BIT_FIELD_REF,
5200 build3 (BIT_FIELD_REF, eltype,
5201 new_temp, TYPE_SIZE (eltype),
5202 bitsize_int (0)));
5203 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5204 dst1 = make_ssa_name (vectype1);
5205 epilog_stmt = gimple_build_assign (dst1, VIEW_CONVERT_EXPR,
5206 build1 (VIEW_CONVERT_EXPR,
5207 vectype1, tem));
5208 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5209 tem = make_ssa_name (eltype);
5210 epilog_stmt
5211 = gimple_build_assign (tem, BIT_FIELD_REF,
5212 build3 (BIT_FIELD_REF, eltype,
5213 new_temp, TYPE_SIZE (eltype),
5214 bitsize_int (sz * BITS_PER_UNIT)));
5215 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5216 dst2 = make_ssa_name (vectype1);
5217 epilog_stmt = gimple_build_assign (dst2, VIEW_CONVERT_EXPR,
5218 build1 (VIEW_CONVERT_EXPR,
5219 vectype1, tem));
5220 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5223 new_temp = make_ssa_name (vectype1);
5224 epilog_stmt = gimple_build_assign (new_temp, code, dst1, dst2);
5225 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5228 if (reduce_with_shift && !slp_reduc)
5230 int element_bitsize = tree_to_uhwi (bitsize);
5231 /* Enforced by vectorizable_reduction, which disallows SLP reductions
5232 for variable-length vectors and also requires direct target support
5233 for loop reductions. */
5234 int vec_size_in_bits = tree_to_uhwi (TYPE_SIZE (vectype1));
5235 int nelements = vec_size_in_bits / element_bitsize;
5236 vec_perm_builder sel;
5237 vec_perm_indices indices;
5239 int elt_offset;
5241 tree zero_vec = build_zero_cst (vectype1);
5242 /* Case 2: Create:
5243 for (offset = nelements/2; offset >= 1; offset/=2)
5245 Create: va' = vec_shift <va, offset>
5246 Create: va = vop <va, va'>
5247 } */
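/* As an illustration (the IL is not generated literally like this), for a
   four-element vector VA = {a, b, c, d} and a PLUS reduction this builds:
     t = vec_shift <VA, 2>     -> {c, d, 0, 0}
     VA = VA + t               -> {a+c, b+d, c, d}
     t = vec_shift <VA, 1>     -> {b+d, c, d, 0}
     VA = VA + t               -> {a+b+c+d, ...}
   and the scalar result is afterwards extracted from element 0.  */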
5249 tree rhs;
5251 if (dump_enabled_p ())
5252 dump_printf_loc (MSG_NOTE, vect_location,
5253 "Reduce using vector shifts\n");
5255 mode1 = TYPE_MODE (vectype1);
5256 vec_dest = vect_create_destination_var (scalar_dest, vectype1);
5257 for (elt_offset = nelements / 2;
5258 elt_offset >= 1;
5259 elt_offset /= 2)
5261 calc_vec_perm_mask_for_shift (elt_offset, nelements, &sel);
5262 indices.new_vector (sel, 2, nelements);
5263 tree mask = vect_gen_perm_mask_any (vectype1, indices);
5264 epilog_stmt = gimple_build_assign (vec_dest, VEC_PERM_EXPR,
5265 new_temp, zero_vec, mask);
5266 new_name = make_ssa_name (vec_dest, epilog_stmt);
5267 gimple_assign_set_lhs (epilog_stmt, new_name);
5268 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5270 epilog_stmt = gimple_build_assign (vec_dest, code, new_name,
5271 new_temp);
5272 new_temp = make_ssa_name (vec_dest, epilog_stmt);
5273 gimple_assign_set_lhs (epilog_stmt, new_temp);
5274 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5277 /* 2.4 Extract the final scalar result. Create:
5278 s_out3 = extract_field <v_out2, bitpos> */
5280 if (dump_enabled_p ())
5281 dump_printf_loc (MSG_NOTE, vect_location,
5282 "extract scalar result\n");
5284 rhs = build3 (BIT_FIELD_REF, scalar_type, new_temp,
5285 bitsize, bitsize_zero_node);
5286 epilog_stmt = gimple_build_assign (new_scalar_dest, rhs);
5287 new_temp = make_ssa_name (new_scalar_dest, epilog_stmt);
5288 gimple_assign_set_lhs (epilog_stmt, new_temp);
5289 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5290 scalar_results.safe_push (new_temp);
5292 else
5294 /* Case 3: Create:
5295 s = extract_field <v_out2, 0>
5296 for (offset = element_size;
5297 offset < vector_size;
5298 offset += element_size;)
5300 Create: s' = extract_field <v_out2, offset>
5301 Create: s = op <s, s'> // For non SLP cases
5302 } */
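/* For instance, for a single result vector V = {a, b, c, d} and a non-SLP
   PLUS reduction this emits roughly:
     s = BIT_FIELD_REF <V, elt_size, 0>
     s = s + BIT_FIELD_REF <V, elt_size, 1*elt_size>
     s = s + BIT_FIELD_REF <V, elt_size, 2*elt_size>
     s = s + BIT_FIELD_REF <V, elt_size, 3*elt_size>
   whereas for SLP the extracted elements are only collected in
   SCALAR_RESULTS and not combined here.  */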
5304 if (dump_enabled_p ())
5305 dump_printf_loc (MSG_NOTE, vect_location,
5306 "Reduce using scalar code.\n");
5308 int vec_size_in_bits = tree_to_uhwi (TYPE_SIZE (vectype1));
5309 int element_bitsize = tree_to_uhwi (bitsize);
5310 FOR_EACH_VEC_ELT (new_phis, i, new_phi)
5312 int bit_offset;
5313 if (gimple_code (new_phi) == GIMPLE_PHI)
5314 vec_temp = PHI_RESULT (new_phi);
5315 else
5316 vec_temp = gimple_assign_lhs (new_phi);
5317 tree rhs = build3 (BIT_FIELD_REF, scalar_type, vec_temp, bitsize,
5318 bitsize_zero_node);
5319 epilog_stmt = gimple_build_assign (new_scalar_dest, rhs);
5320 new_temp = make_ssa_name (new_scalar_dest, epilog_stmt);
5321 gimple_assign_set_lhs (epilog_stmt, new_temp);
5322 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5324 /* In SLP we don't need to apply reduction operation, so we just
5325 collect s' values in SCALAR_RESULTS. */
5326 if (slp_reduc)
5327 scalar_results.safe_push (new_temp);
5329 for (bit_offset = element_bitsize;
5330 bit_offset < vec_size_in_bits;
5331 bit_offset += element_bitsize)
5333 tree bitpos = bitsize_int (bit_offset);
5334 tree rhs = build3 (BIT_FIELD_REF, scalar_type, vec_temp,
5335 bitsize, bitpos);
5337 epilog_stmt = gimple_build_assign (new_scalar_dest, rhs);
5338 new_name = make_ssa_name (new_scalar_dest, epilog_stmt);
5339 gimple_assign_set_lhs (epilog_stmt, new_name);
5340 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5342 if (slp_reduc)
5344 /* In SLP we don't need to apply reduction operation, so
5345 we just collect s' values in SCALAR_RESULTS. */
5346 new_temp = new_name;
5347 scalar_results.safe_push (new_name);
5349 else
5351 epilog_stmt = gimple_build_assign (new_scalar_dest, code,
5352 new_name, new_temp);
5353 new_temp = make_ssa_name (new_scalar_dest, epilog_stmt);
5354 gimple_assign_set_lhs (epilog_stmt, new_temp);
5355 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5360 /* The only case where we need to reduce scalar results in SLP is
5361 unrolling. If the size of SCALAR_RESULTS is greater than
5362 REDUC_GROUP_SIZE, we reduce them combining elements modulo
5363 REDUC_GROUP_SIZE. */
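/* For example, with REDUC_GROUP_SIZE == 2 and four collected results
   {s0, s1, s2, s3} (the SLP instance was unrolled twice) this leaves
   {s0 op s2, s1 op s3} as the two final scalar results.  */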
5364 if (slp_reduc)
5366 tree res, first_res, new_res;
5367 gimple *new_stmt;
5369 /* Reduce multiple scalar results in case of SLP unrolling. */
5370 for (j = group_size; scalar_results.iterate (j, &res);
5371 j++)
5373 first_res = scalar_results[j % group_size];
5374 new_stmt = gimple_build_assign (new_scalar_dest, code,
5375 first_res, res);
5376 new_res = make_ssa_name (new_scalar_dest, new_stmt);
5377 gimple_assign_set_lhs (new_stmt, new_res);
5378 gsi_insert_before (&exit_gsi, new_stmt, GSI_SAME_STMT);
5379 scalar_results[j % group_size] = new_res;
5382 else
5383 /* Not SLP - we have one scalar to keep in SCALAR_RESULTS. */
5384 scalar_results.safe_push (new_temp);
5387 if ((STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info)
5388 == INTEGER_INDUC_COND_REDUCTION)
5389 && !operand_equal_p (initial_def, induc_val, 0))
5391 /* Earlier we set the initial value to be a vector of induc_val
5392 values. Check the result and if it is induc_val then replace
5393 with the original initial value, unless induc_val is
5394 the same as initial_def already. */
5395 tree zcompare = build2 (EQ_EXPR, boolean_type_node, new_temp,
5396 induc_val);
5398 tree tmp = make_ssa_name (new_scalar_dest);
5399 epilog_stmt = gimple_build_assign (tmp, COND_EXPR, zcompare,
5400 initial_def, new_temp);
5401 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5402 scalar_results[0] = tmp;
5406 vect_finalize_reduction:
5408 if (double_reduc)
5409 loop = loop->inner;
5411 /* 2.5 Adjust the final result by the initial value of the reduction
5412 variable. (When such adjustment is not needed, then
5413 'adjustment_def' is zero). For example, if code is PLUS we create:
5414 new_temp = loop_exit_def + adjustment_def */
5416 if (adjustment_def)
5418 gcc_assert (!slp_reduc);
5419 if (nested_in_vect_loop)
5421 new_phi = new_phis[0];
5422 gcc_assert (TREE_CODE (TREE_TYPE (adjustment_def)) == VECTOR_TYPE);
5423 expr = build2 (code, vectype, PHI_RESULT (new_phi), adjustment_def);
5424 new_dest = vect_create_destination_var (scalar_dest, vectype);
5426 else
5428 new_temp = scalar_results[0];
5429 gcc_assert (TREE_CODE (TREE_TYPE (adjustment_def)) != VECTOR_TYPE);
5430 expr = build2 (code, scalar_type, new_temp, adjustment_def);
5431 new_dest = vect_create_destination_var (scalar_dest, scalar_type);
5434 epilog_stmt = gimple_build_assign (new_dest, expr);
5435 new_temp = make_ssa_name (new_dest, epilog_stmt);
5436 gimple_assign_set_lhs (epilog_stmt, new_temp);
5437 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5438 if (nested_in_vect_loop)
5440 stmt_vec_info epilog_stmt_info = loop_vinfo->add_stmt (epilog_stmt);
5441 STMT_VINFO_RELATED_STMT (epilog_stmt_info)
5442 = STMT_VINFO_RELATED_STMT (loop_vinfo->lookup_stmt (new_phi));
5444 if (!double_reduc)
5445 scalar_results.quick_push (new_temp);
5446 else
5447 scalar_results[0] = new_temp;
5449 else
5450 scalar_results[0] = new_temp;
5452 new_phis[0] = epilog_stmt;
5455 /* 2.6 Handle the loop-exit phis. Replace the uses of scalar loop-exit
5456 phis with new adjusted scalar results, i.e., replace use <s_out0>
5457 with use <s_out4>.
5459 Transform:
5460 loop_exit:
5461 s_out0 = phi <s_loop> # (scalar) EXIT_PHI
5462 v_out1 = phi <VECT_DEF> # NEW_EXIT_PHI
5463 v_out2 = reduce <v_out1>
5464 s_out3 = extract_field <v_out2, 0>
5465 s_out4 = adjust_result <s_out3>
5466 use <s_out0>
5467 use <s_out0>
5469 into:
5471 loop_exit:
5472 s_out0 = phi <s_loop> # (scalar) EXIT_PHI
5473 v_out1 = phi <VECT_DEF> # NEW_EXIT_PHI
5474 v_out2 = reduce <v_out1>
5475 s_out3 = extract_field <v_out2, 0>
5476 s_out4 = adjust_result <s_out3>
5477 use <s_out4>
5478 use <s_out4> */
5481 /* In an SLP reduction chain we reduce vector results into one vector if
5482 necessary, hence we set REDUC_GROUP_SIZE to 1 here. SCALAR_DEST is the
5483 LHS of the last stmt in the reduction chain, since we are looking for
5484 the loop exit phi node. */
5485 if (REDUC_GROUP_FIRST_ELEMENT (stmt_info))
5487 stmt_vec_info dest_stmt_info
5488 = vect_orig_stmt (SLP_TREE_SCALAR_STMTS (slp_node)[group_size - 1]);
5489 scalar_dest = gimple_assign_lhs (dest_stmt_info->stmt);
5490 group_size = 1;
5493 /* In SLP we may have several statements in NEW_PHIS and REDUCTION_PHIS (in
5494 case that REDUC_GROUP_SIZE is greater than vectorization factor).
5495 Therefore, we need to match SCALAR_RESULTS with corresponding statements.
5496 The first (REDUC_GROUP_SIZE / number of new vector stmts) scalar results
5497 correspond to the first vector stmt, etc.
5498 (RATIO is equal to (REDUC_GROUP_SIZE / number of new vector stmts)). */
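/* E.g. with REDUC_GROUP_SIZE == 4 and two new vector stmts RATIO is 2:
   scalar results 0 and 1 correspond to the first vector stmt and scalar
   results 2 and 3 to the second.  */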
5499 if (group_size > new_phis.length ())
5501 ratio = group_size / new_phis.length ();
5502 gcc_assert (!(group_size % new_phis.length ()));
5504 else
5505 ratio = 1;
5507 stmt_vec_info epilog_stmt_info = NULL;
5508 for (k = 0; k < group_size; k++)
5510 if (k % ratio == 0)
5512 epilog_stmt_info = loop_vinfo->lookup_stmt (new_phis[k / ratio]);
5513 reduction_phi_info = reduction_phis[k / ratio];
5514 if (double_reduc)
5515 inner_phi = inner_phis[k / ratio];
5518 if (slp_reduc)
5520 stmt_vec_info scalar_stmt_info = SLP_TREE_SCALAR_STMTS (slp_node)[k];
5522 orig_stmt_info = STMT_VINFO_RELATED_STMT (scalar_stmt_info);
5523 /* SLP statements can't participate in patterns. */
5524 gcc_assert (!orig_stmt_info);
5525 scalar_dest = gimple_assign_lhs (scalar_stmt_info->stmt);
5528 phis.create (3);
5529 /* Find the loop-closed-use at the loop exit of the original scalar
5530 result. (The reduction result is expected to have two immediate uses -
5531 one at the latch block, and one at the loop exit). */
5532 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, scalar_dest)
5533 if (!flow_bb_inside_loop_p (loop, gimple_bb (USE_STMT (use_p)))
5534 && !is_gimple_debug (USE_STMT (use_p)))
5535 phis.safe_push (USE_STMT (use_p));
5537 /* While we expect to have found an exit_phi because of loop-closed-ssa
5538 form we can end up without one if the scalar cycle is dead. */
5540 FOR_EACH_VEC_ELT (phis, i, exit_phi)
5542 if (outer_loop)
5544 stmt_vec_info exit_phi_vinfo
5545 = loop_vinfo->lookup_stmt (exit_phi);
5546 gphi *vect_phi;
5548 /* FORNOW. Currently not supporting the case that an inner-loop
5549 reduction is not used in the outer-loop (but only outside the
5550 outer-loop), unless it is a double reduction. */
5551 gcc_assert ((STMT_VINFO_RELEVANT_P (exit_phi_vinfo)
5552 && !STMT_VINFO_LIVE_P (exit_phi_vinfo))
5553 || double_reduc);
5555 if (double_reduc)
5556 STMT_VINFO_VEC_STMT (exit_phi_vinfo) = inner_phi;
5557 else
5558 STMT_VINFO_VEC_STMT (exit_phi_vinfo) = epilog_stmt_info;
5559 if (!double_reduc
5560 || STMT_VINFO_DEF_TYPE (exit_phi_vinfo)
5561 != vect_double_reduction_def)
5562 continue;
5564 /* Handle double reduction:
5566 stmt1: s1 = phi <s0, s2> - double reduction phi (outer loop)
5567 stmt2: s3 = phi <s1, s4> - (regular) reduc phi (inner loop)
5568 stmt3: s4 = use (s3) - (regular) reduc stmt (inner loop)
5569 stmt4: s2 = phi <s4> - double reduction stmt (outer loop)
5571 At that point the regular reduction (stmt2 and stmt3) is
5572 already vectorized, as well as the exit phi node, stmt4.
5573 Here we vectorize the phi node of double reduction, stmt1, and
5574 update all relevant statements. */
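/* A typical source form of such a double reduction is

     int s = 0;
     for (i = 0; i < n; i++)      <-- outer loop (stmt1, stmt4)
       for (j = 0; j < m; j++)    <-- inner loop (stmt2, stmt3)
         s += a[i][j];

   where the result of the inner-loop reduction is itself accumulated
   across the outer loop.  */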
5576 /* Go through all the uses of s2 to find double reduction phi
5577 node, i.e., stmt1 above. */
5578 orig_name = PHI_RESULT (exit_phi);
5579 FOR_EACH_IMM_USE_STMT (use_stmt, imm_iter, orig_name)
5581 stmt_vec_info use_stmt_vinfo;
5582 tree vect_phi_init, preheader_arg, vect_phi_res;
5583 basic_block bb = gimple_bb (use_stmt);
5585 /* Check that USE_STMT is really double reduction phi
5586 node. */
5587 if (gimple_code (use_stmt) != GIMPLE_PHI
5588 || gimple_phi_num_args (use_stmt) != 2
5589 || bb->loop_father != outer_loop)
5590 continue;
5591 use_stmt_vinfo = loop_vinfo->lookup_stmt (use_stmt);
5592 if (!use_stmt_vinfo
5593 || STMT_VINFO_DEF_TYPE (use_stmt_vinfo)
5594 != vect_double_reduction_def)
5595 continue;
5597 /* Create vector phi node for double reduction:
5598 vs1 = phi <vs0, vs2>
5599 vs1 was created previously in this function by a call to
5600 vect_get_vec_def_for_operand and is stored in
5601 vec_initial_def;
5602 vs2 is defined by INNER_PHI, the vectorized EXIT_PHI;
5603 vs0 is created here. */
5605 /* Create vector phi node. */
5606 vect_phi = create_phi_node (vec_initial_def, bb);
5607 loop_vec_info_for_loop (outer_loop)->add_stmt (vect_phi);
5609 /* Create vs0 - initial def of the double reduction phi. */
5610 preheader_arg = PHI_ARG_DEF_FROM_EDGE (use_stmt,
5611 loop_preheader_edge (outer_loop));
5612 vect_phi_init = get_initial_def_for_reduction
5613 (stmt_info, preheader_arg, NULL);
5615 /* Update phi node arguments with vs0 and vs2. */
5616 add_phi_arg (vect_phi, vect_phi_init,
5617 loop_preheader_edge (outer_loop),
5618 UNKNOWN_LOCATION);
5619 add_phi_arg (vect_phi, PHI_RESULT (inner_phi->stmt),
5620 loop_latch_edge (outer_loop), UNKNOWN_LOCATION);
5621 if (dump_enabled_p ())
5622 dump_printf_loc (MSG_NOTE, vect_location,
5623 "created double reduction phi node: %G",
5624 vect_phi);
5626 vect_phi_res = PHI_RESULT (vect_phi);
5628 /* Replace the use, i.e., set the correct vs1 in the regular
5629 reduction phi node. FORNOW, NCOPIES is always 1, so the
5630 loop is redundant. */
5631 stmt_vec_info use_info = reduction_phi_info;
5632 for (j = 0; j < ncopies; j++)
5634 edge pr_edge = loop_preheader_edge (loop);
5635 SET_PHI_ARG_DEF (as_a <gphi *> (use_info->stmt),
5636 pr_edge->dest_idx, vect_phi_res);
5637 use_info = STMT_VINFO_RELATED_STMT (use_info);
5643 phis.release ();
5644 if (nested_in_vect_loop)
5646 if (double_reduc)
5647 loop = outer_loop;
5648 else
5649 continue;
5652 phis.create (3);
5653 /* Find the loop-closed-use at the loop exit of the original scalar
5654 result. (The reduction result is expected to have two immediate uses,
5655 one at the latch block, and one at the loop exit). For double
5656 reductions we are looking for exit phis of the outer loop. */
5657 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, scalar_dest)
5659 if (!flow_bb_inside_loop_p (loop, gimple_bb (USE_STMT (use_p))))
5661 if (!is_gimple_debug (USE_STMT (use_p)))
5662 phis.safe_push (USE_STMT (use_p));
5664 else
5666 if (double_reduc && gimple_code (USE_STMT (use_p)) == GIMPLE_PHI)
5668 tree phi_res = PHI_RESULT (USE_STMT (use_p));
5670 FOR_EACH_IMM_USE_FAST (phi_use_p, phi_imm_iter, phi_res)
5672 if (!flow_bb_inside_loop_p (loop,
5673 gimple_bb (USE_STMT (phi_use_p)))
5674 && !is_gimple_debug (USE_STMT (phi_use_p)))
5675 phis.safe_push (USE_STMT (phi_use_p));
5681 FOR_EACH_VEC_ELT (phis, i, exit_phi)
5683 /* Replace the uses: */
5684 orig_name = PHI_RESULT (exit_phi);
5685 scalar_result = scalar_results[k];
5686 FOR_EACH_IMM_USE_STMT (use_stmt, imm_iter, orig_name)
5687 FOR_EACH_IMM_USE_ON_STMT (use_p, imm_iter)
5688 SET_USE (use_p, scalar_result);
5691 phis.release ();
5695 /* Return a vector of type VECTYPE that is equal to the vector select
5696 operation "MASK ? VEC : IDENTITY". Insert the select statements
5697 before GSI. */
5699 static tree
5700 merge_with_identity (gimple_stmt_iterator *gsi, tree mask, tree vectype,
5701 tree vec, tree identity)
5703 tree cond = make_temp_ssa_name (vectype, NULL, "cond");
5704 gimple *new_stmt = gimple_build_assign (cond, VEC_COND_EXPR,
5705 mask, vec, identity);
5706 gsi_insert_before (gsi, new_stmt, GSI_SAME_STMT);
5707 return cond;
5710 /* Successively apply CODE to each element of VECTOR_RHS, in left-to-right
5711 order, starting with LHS. Insert the extraction statements before GSI and
5712 associate the new scalar SSA names with variable SCALAR_DEST.
5713 Return the SSA name for the result. */
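/* For example, for LHS l0 and a four-element VECTOR_RHS this emits
   approximately:
     e0 = BIT_FIELD_REF <VECTOR_RHS, elt_size, 0>
     l1 = l0 CODE e0
     e1 = BIT_FIELD_REF <VECTOR_RHS, elt_size, 1*elt_size>
     l2 = l1 CODE e1
     ...
   and returns the SSA name of the last accumulator (l4 here).  */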
5715 static tree
5716 vect_expand_fold_left (gimple_stmt_iterator *gsi, tree scalar_dest,
5717 tree_code code, tree lhs, tree vector_rhs)
5719 tree vectype = TREE_TYPE (vector_rhs);
5720 tree scalar_type = TREE_TYPE (vectype);
5721 tree bitsize = TYPE_SIZE (scalar_type);
5722 unsigned HOST_WIDE_INT vec_size_in_bits = tree_to_uhwi (TYPE_SIZE (vectype));
5723 unsigned HOST_WIDE_INT element_bitsize = tree_to_uhwi (bitsize);
5725 for (unsigned HOST_WIDE_INT bit_offset = 0;
5726 bit_offset < vec_size_in_bits;
5727 bit_offset += element_bitsize)
5729 tree bitpos = bitsize_int (bit_offset);
5730 tree rhs = build3 (BIT_FIELD_REF, scalar_type, vector_rhs,
5731 bitsize, bitpos);
5733 gassign *stmt = gimple_build_assign (scalar_dest, rhs);
5734 rhs = make_ssa_name (scalar_dest, stmt);
5735 gimple_assign_set_lhs (stmt, rhs);
5736 gsi_insert_before (gsi, stmt, GSI_SAME_STMT);
5738 stmt = gimple_build_assign (scalar_dest, code, lhs, rhs);
5739 tree new_name = make_ssa_name (scalar_dest, stmt);
5740 gimple_assign_set_lhs (stmt, new_name);
5741 gsi_insert_before (gsi, stmt, GSI_SAME_STMT);
5742 lhs = new_name;
5744 return lhs;
5747 /* Perform an in-order reduction (FOLD_LEFT_REDUCTION). STMT_INFO is the
5748 statement that sets the live-out value. REDUC_DEF_STMT is the phi
5749 statement. CODE is the operation performed by STMT_INFO and OPS are
5750 its scalar operands. REDUC_INDEX is the index of the operand in
5751 OPS that is set by REDUC_DEF_STMT. REDUC_FN is the function that
5752 implements in-order reduction, or IFN_LAST if we should open-code it.
5753 VECTYPE_IN is the type of the vector input. MASKS specifies the masks
5754 that should be used to control the operation in a fully-masked loop. */
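/* An in-order reduction is used for source loops such as

     double res = init;
     for (int i = 0; i < n; i++)
       res += a[i];

   when reassociation is not allowed (e.g. FP addition without
   -ffast-math): each vector of loaded elements is folded into the scalar
   accumulator in the original order, either via REDUC_FN
   (e.g. IFN_FOLD_LEFT_PLUS) or by the open-coded expansion in
   vect_expand_fold_left.  */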
5756 static bool
5757 vectorize_fold_left_reduction (stmt_vec_info stmt_info,
5758 gimple_stmt_iterator *gsi,
5759 stmt_vec_info *vec_stmt, slp_tree slp_node,
5760 gimple *reduc_def_stmt,
5761 tree_code code, internal_fn reduc_fn,
5762 tree ops[3], tree vectype_in,
5763 int reduc_index, vec_loop_masks *masks)
5765 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
5766 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
5767 tree vectype_out = STMT_VINFO_VECTYPE (stmt_info);
5768 stmt_vec_info new_stmt_info = NULL;
5770 int ncopies;
5771 if (slp_node)
5772 ncopies = 1;
5773 else
5774 ncopies = vect_get_num_copies (loop_vinfo, vectype_in);
5776 gcc_assert (!nested_in_vect_loop_p (loop, stmt_info));
5777 gcc_assert (ncopies == 1);
5778 gcc_assert (TREE_CODE_LENGTH (code) == binary_op);
5779 gcc_assert (reduc_index == (code == MINUS_EXPR ? 0 : 1));
5780 gcc_assert (STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info)
5781 == FOLD_LEFT_REDUCTION);
5783 if (slp_node)
5784 gcc_assert (known_eq (TYPE_VECTOR_SUBPARTS (vectype_out),
5785 TYPE_VECTOR_SUBPARTS (vectype_in)));
5787 tree op0 = ops[1 - reduc_index];
5789 int group_size = 1;
5790 stmt_vec_info scalar_dest_def_info;
5791 auto_vec<tree> vec_oprnds0;
5792 if (slp_node)
5794 vect_get_vec_defs (op0, NULL_TREE, stmt_info, &vec_oprnds0, NULL,
5795 slp_node);
5796 group_size = SLP_TREE_SCALAR_STMTS (slp_node).length ();
5797 scalar_dest_def_info = SLP_TREE_SCALAR_STMTS (slp_node)[group_size - 1];
5799 else
5801 tree loop_vec_def0 = vect_get_vec_def_for_operand (op0, stmt_info);
5802 vec_oprnds0.create (1);
5803 vec_oprnds0.quick_push (loop_vec_def0);
5804 scalar_dest_def_info = stmt_info;
5807 tree scalar_dest = gimple_assign_lhs (scalar_dest_def_info->stmt);
5808 tree scalar_type = TREE_TYPE (scalar_dest);
5809 tree reduc_var = gimple_phi_result (reduc_def_stmt);
5811 int vec_num = vec_oprnds0.length ();
5812 gcc_assert (vec_num == 1 || slp_node);
5813 tree vec_elem_type = TREE_TYPE (vectype_out);
5814 gcc_checking_assert (useless_type_conversion_p (scalar_type, vec_elem_type));
5816 tree vector_identity = NULL_TREE;
5817 if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
5818 vector_identity = build_zero_cst (vectype_out);
5820 tree scalar_dest_var = vect_create_destination_var (scalar_dest, NULL);
5821 int i;
5822 tree def0;
5823 FOR_EACH_VEC_ELT (vec_oprnds0, i, def0)
5825 gimple *new_stmt;
5826 tree mask = NULL_TREE;
5827 if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
5828 mask = vect_get_loop_mask (gsi, masks, vec_num, vectype_in, i);
5830 /* Handle MINUS by adding the negative. */
5831 if (reduc_fn != IFN_LAST && code == MINUS_EXPR)
5833 tree negated = make_ssa_name (vectype_out);
5834 new_stmt = gimple_build_assign (negated, NEGATE_EXPR, def0);
5835 gsi_insert_before (gsi, new_stmt, GSI_SAME_STMT);
5836 def0 = negated;
5839 if (mask)
5840 def0 = merge_with_identity (gsi, mask, vectype_out, def0,
5841 vector_identity);
5843 /* On the first iteration the input is simply the scalar phi
5844 result, and for subsequent iterations it is the output of
5845 the preceding operation. */
5846 if (reduc_fn != IFN_LAST)
5848 new_stmt = gimple_build_call_internal (reduc_fn, 2, reduc_var, def0);
5849 /* For chained SLP reductions the output of the previous reduction
5850 operation serves as the input of the next. For the final statement
5851 the output cannot be a temporary - we reuse the original
5852 scalar destination of the last statement. */
5853 if (i != vec_num - 1)
5855 gimple_set_lhs (new_stmt, scalar_dest_var);
5856 reduc_var = make_ssa_name (scalar_dest_var, new_stmt);
5857 gimple_set_lhs (new_stmt, reduc_var);
5860 else
5862 reduc_var = vect_expand_fold_left (gsi, scalar_dest_var, code,
5863 reduc_var, def0);
5864 new_stmt = SSA_NAME_DEF_STMT (reduc_var);
5865 /* Remove the statement, so that we can use the same code paths
5866 as for statements that we've just created. */
5867 gimple_stmt_iterator tmp_gsi = gsi_for_stmt (new_stmt);
5868 gsi_remove (&tmp_gsi, false);
5871 if (i == vec_num - 1)
5873 gimple_set_lhs (new_stmt, scalar_dest);
5874 new_stmt_info = vect_finish_replace_stmt (scalar_dest_def_info,
5875 new_stmt);
5877 else
5878 new_stmt_info = vect_finish_stmt_generation (scalar_dest_def_info,
5879 new_stmt, gsi);
5881 if (slp_node)
5882 SLP_TREE_VEC_STMTS (slp_node).quick_push (new_stmt_info);
5885 if (!slp_node)
5886 STMT_VINFO_VEC_STMT (stmt_info) = *vec_stmt = new_stmt_info;
5888 return true;
5891 /* Function is_nonwrapping_integer_induction.
5893 Check if STMT_VINFO (which is part of loop LOOP) describes an induction
5894 that both increments and does not cause overflow. */
5896 static bool
5897 is_nonwrapping_integer_induction (stmt_vec_info stmt_vinfo, struct loop *loop)
5899 gphi *phi = as_a <gphi *> (stmt_vinfo->stmt);
5900 tree base = STMT_VINFO_LOOP_PHI_EVOLUTION_BASE_UNCHANGED (stmt_vinfo);
5901 tree step = STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_vinfo);
5902 tree lhs_type = TREE_TYPE (gimple_phi_result (phi));
5903 widest_int ni, max_loop_value, lhs_max;
5904 wi::overflow_type overflow = wi::OVF_NONE;
5906 /* Make sure the loop is integer based. */
5907 if (TREE_CODE (base) != INTEGER_CST
5908 || TREE_CODE (step) != INTEGER_CST)
5909 return false;
5911 /* Check that the max size of the loop will not wrap. */
5913 if (TYPE_OVERFLOW_UNDEFINED (lhs_type))
5914 return true;
5916 if (! max_stmt_executions (loop, &ni))
5917 return false;
5919 max_loop_value = wi::mul (wi::to_widest (step), ni, TYPE_SIGN (lhs_type),
5920 &overflow);
5921 if (overflow)
5922 return false;
5924 max_loop_value = wi::add (wi::to_widest (base), max_loop_value,
5925 TYPE_SIGN (lhs_type), &overflow);
5926 if (overflow)
5927 return false;
5929 return (wi::min_precision (max_loop_value, TYPE_SIGN (lhs_type))
5930 <= TYPE_PRECISION (lhs_type));
5933 /* Function vectorizable_reduction.
5935 Check if STMT_INFO performs a reduction operation that can be vectorized.
5936 If VEC_STMT is also passed, vectorize STMT_INFO: create a vectorized
5937 stmt to replace it, put it in VEC_STMT, and insert it at GSI.
5938 Return true if STMT_INFO is vectorizable in this way.
5940 This function also handles reduction idioms (patterns) that have been
5941 recognized in advance during vect_pattern_recog. In this case, STMT_INFO
5942 may be of this form:
5943 X = pattern_expr (arg0, arg1, ..., X)
5944 and its STMT_VINFO_RELATED_STMT points to the last stmt in the original
5945 sequence that had been detected and replaced by the pattern-stmt
5946 (STMT_INFO).
5948 This function also handles reduction of condition expressions, for example:
5949 for (int i = 0; i < N; i++)
5950 if (a[i] < value)
5951 last = a[i];
5952 This is handled by vectorising the loop and creating an additional vector
5953 containing the loop indexes for which "a[i] < value" was true. In the
5954 function epilogue this is reduced to a single max value and then used to
5955 index into the vector of results.
5957 In some cases of reduction patterns, the type of the reduction variable X is
5958 different than the type of the other arguments of STMT_INFO.
5959 In such cases, the vectype that is used when transforming STMT_INFO into
5960 a vector stmt is different than the vectype that is used to determine the
5961 vectorization factor, because it consists of a different number of elements
5962 than the actual number of elements that are being operated upon in parallel.
5964 For example, consider an accumulation of shorts into an int accumulator.
5965 On some targets it's possible to vectorize this pattern operating on 8
5966 shorts at a time (hence, the vectype for purposes of determining the
5967 vectorization factor should be V8HI); on the other hand, the vectype that
5968 is used to create the vector form is actually V4SI (the type of the result).
5970 Upon entry to this function, STMT_VINFO_VECTYPE records the vectype that
5971 indicates what is the actual level of parallelism (V8HI in the example), so
5972 that the right vectorization factor would be derived. This vectype
5973 corresponds to the type of arguments to the reduction stmt, and should *NOT*
5974 be used to create the vectorized stmt. The right vectype for the vectorized
5975 stmt is obtained from the type of the result X:
5976 get_vectype_for_scalar_type (TREE_TYPE (X))
5978 This means that, contrary to "regular" reductions (or "regular" stmts in
5979 general), the following equation:
5980 STMT_VINFO_VECTYPE == get_vectype_for_scalar_type (TREE_TYPE (X))
5981 does *NOT* necessarily hold for reduction patterns. */
5983 bool
5984 vectorizable_reduction (stmt_vec_info stmt_info, gimple_stmt_iterator *gsi,
5985 stmt_vec_info *vec_stmt, slp_tree slp_node,
5986 slp_instance slp_node_instance,
5987 stmt_vector_for_cost *cost_vec)
5989 tree vec_dest;
5990 tree scalar_dest;
5991 tree vectype_out = STMT_VINFO_VECTYPE (stmt_info);
5992 tree vectype_in = NULL_TREE;
5993 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
5994 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
5995 enum tree_code code, orig_code;
5996 internal_fn reduc_fn;
5997 machine_mode vec_mode;
5998 int op_type;
5999 optab optab;
6000 tree new_temp = NULL_TREE;
6001 enum vect_def_type dt, cond_reduc_dt = vect_unknown_def_type;
6002 stmt_vec_info cond_stmt_vinfo = NULL;
6003 enum tree_code cond_reduc_op_code = ERROR_MARK;
6004 tree scalar_type;
6005 bool is_simple_use;
6006 int i;
6007 int ncopies;
6008 int epilog_copies;
6009 stmt_vec_info prev_stmt_info, prev_phi_info;
6010 bool single_defuse_cycle = false;
6011 stmt_vec_info new_stmt_info = NULL;
6012 int j;
6013 tree ops[3];
6014 enum vect_def_type dts[3];
6015 bool nested_cycle = false, found_nested_cycle_def = false;
6016 bool double_reduc = false;
6017 basic_block def_bb;
6018 struct loop * def_stmt_loop;
6019 tree def_arg;
6020 auto_vec<tree> vec_oprnds0;
6021 auto_vec<tree> vec_oprnds1;
6022 auto_vec<tree> vec_oprnds2;
6023 auto_vec<tree> vect_defs;
6024 auto_vec<stmt_vec_info> phis;
6025 int vec_num;
6026 tree def0, tem;
6027 tree cr_index_scalar_type = NULL_TREE, cr_index_vector_type = NULL_TREE;
6028 tree cond_reduc_val = NULL_TREE;
6030 /* Make sure it was already recognized as a reduction computation. */
6031 if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_reduction_def
6032 && STMT_VINFO_DEF_TYPE (stmt_info) != vect_nested_cycle)
6033 return false;
6035 if (nested_in_vect_loop_p (loop, stmt_info))
6037 loop = loop->inner;
6038 nested_cycle = true;
6041 if (REDUC_GROUP_FIRST_ELEMENT (stmt_info))
6042 gcc_assert (slp_node
6043 && REDUC_GROUP_FIRST_ELEMENT (stmt_info) == stmt_info);
6045 if (gphi *phi = dyn_cast <gphi *> (stmt_info->stmt))
6047 tree phi_result = gimple_phi_result (phi);
6048 /* Analysis is fully done on the reduction stmt invocation. */
6049 if (! vec_stmt)
6051 if (slp_node)
6052 slp_node_instance->reduc_phis = slp_node;
6054 STMT_VINFO_TYPE (stmt_info) = reduc_vec_info_type;
6055 return true;
6058 if (STMT_VINFO_REDUC_TYPE (stmt_info) == FOLD_LEFT_REDUCTION)
6059 /* Leave the scalar phi in place. Note that checking
6060 STMT_VINFO_VEC_REDUCTION_TYPE (as below) only works
6061 for reductions involving a single statement. */
6062 return true;
6064 stmt_vec_info reduc_stmt_info = STMT_VINFO_REDUC_DEF (stmt_info);
6065 reduc_stmt_info = vect_stmt_to_vectorize (reduc_stmt_info);
6067 if (STMT_VINFO_VEC_REDUCTION_TYPE (reduc_stmt_info)
6068 == EXTRACT_LAST_REDUCTION)
6069 /* Leave the scalar phi in place. */
6070 return true;
6072 gassign *reduc_stmt = as_a <gassign *> (reduc_stmt_info->stmt);
6073 for (unsigned k = 1; k < gimple_num_ops (reduc_stmt); ++k)
6075 tree op = gimple_op (reduc_stmt, k);
6076 if (op == phi_result)
6077 continue;
6078 if (k == 1
6079 && gimple_assign_rhs_code (reduc_stmt) == COND_EXPR)
6080 continue;
6081 if (!vectype_in
6082 || (GET_MODE_SIZE (SCALAR_TYPE_MODE (TREE_TYPE (vectype_in)))
6083 < GET_MODE_SIZE (SCALAR_TYPE_MODE (TREE_TYPE (op)))))
6084 vectype_in = get_vectype_for_scalar_type (TREE_TYPE (op));
6085 break;
6087 /* For a nested cycle we might end up with an operation like
6088 phi_result * phi_result. */
6089 if (!vectype_in)
6090 vectype_in = STMT_VINFO_VECTYPE (stmt_info);
6091 gcc_assert (vectype_in);
6093 if (slp_node)
6094 ncopies = 1;
6095 else
6096 ncopies = vect_get_num_copies (loop_vinfo, vectype_in);
6098 stmt_vec_info use_stmt_info;
6099 if (ncopies > 1
6100 && STMT_VINFO_RELEVANT (reduc_stmt_info) <= vect_used_only_live
6101 && (use_stmt_info = loop_vinfo->lookup_single_use (phi_result))
6102 && vect_stmt_to_vectorize (use_stmt_info) == reduc_stmt_info)
6103 single_defuse_cycle = true;
6105 /* Create the destination vector */
6106 scalar_dest = gimple_assign_lhs (reduc_stmt);
6107 vec_dest = vect_create_destination_var (scalar_dest, vectype_out);
6109 if (slp_node)
6110 /* The size vect_schedule_slp_instance computes is off for us. */
6111 vec_num = vect_get_num_vectors
6112 (LOOP_VINFO_VECT_FACTOR (loop_vinfo)
6113 * SLP_TREE_SCALAR_STMTS (slp_node).length (),
6114 vectype_in);
6115 else
6116 vec_num = 1;
6118 /* Generate the reduction PHIs upfront. */
6119 prev_phi_info = NULL;
6120 for (j = 0; j < ncopies; j++)
6122 if (j == 0 || !single_defuse_cycle)
6124 for (i = 0; i < vec_num; i++)
6126 /* Create the reduction-phi that defines the reduction
6127 operand. */
6128 gimple *new_phi = create_phi_node (vec_dest, loop->header);
6129 stmt_vec_info new_phi_info = loop_vinfo->add_stmt (new_phi);
6131 if (slp_node)
6132 SLP_TREE_VEC_STMTS (slp_node).quick_push (new_phi_info);
6133 else
6135 if (j == 0)
6136 STMT_VINFO_VEC_STMT (stmt_info)
6137 = *vec_stmt = new_phi_info;
6138 else
6139 STMT_VINFO_RELATED_STMT (prev_phi_info) = new_phi_info;
6140 prev_phi_info = new_phi_info;
6146 return true;
6149 /* 1. Is vectorizable reduction? */
6150 /* Not supportable if the reduction variable is used in the loop, unless
6151 it's a reduction chain. */
6152 if (STMT_VINFO_RELEVANT (stmt_info) > vect_used_in_outer
6153 && !REDUC_GROUP_FIRST_ELEMENT (stmt_info))
6154 return false;
6156 /* Reductions that are not used even in an enclosing outer-loop
6157 are expected to be "live" (used out of the loop). */
6158 if (STMT_VINFO_RELEVANT (stmt_info) == vect_unused_in_scope
6159 && !STMT_VINFO_LIVE_P (stmt_info))
6160 return false;
6162 /* 2. Has this been recognized as a reduction pattern?
6164 Check if STMT represents a pattern that has been recognized
6165 in earlier analysis stages. For stmts that represent a pattern,
6166 the STMT_VINFO_RELATED_STMT field records the last stmt in
6167 the original sequence that constitutes the pattern. */
6169 stmt_vec_info orig_stmt_info = STMT_VINFO_RELATED_STMT (stmt_info);
6170 if (orig_stmt_info)
6172 gcc_assert (STMT_VINFO_IN_PATTERN_P (orig_stmt_info));
6173 gcc_assert (!STMT_VINFO_IN_PATTERN_P (stmt_info));
6176 /* 3. Check the operands of the operation. The first operands are defined
6177 inside the loop body. The last operand is the reduction variable,
6178 which is defined by the loop-header-phi. */
6180 gassign *stmt = as_a <gassign *> (stmt_info->stmt);
6182 /* Flatten RHS. */
6183 switch (get_gimple_rhs_class (gimple_assign_rhs_code (stmt)))
6185 case GIMPLE_BINARY_RHS:
6186 code = gimple_assign_rhs_code (stmt);
6187 op_type = TREE_CODE_LENGTH (code);
6188 gcc_assert (op_type == binary_op);
6189 ops[0] = gimple_assign_rhs1 (stmt);
6190 ops[1] = gimple_assign_rhs2 (stmt);
6191 break;
6193 case GIMPLE_TERNARY_RHS:
6194 code = gimple_assign_rhs_code (stmt);
6195 op_type = TREE_CODE_LENGTH (code);
6196 gcc_assert (op_type == ternary_op);
6197 ops[0] = gimple_assign_rhs1 (stmt);
6198 ops[1] = gimple_assign_rhs2 (stmt);
6199 ops[2] = gimple_assign_rhs3 (stmt);
6200 break;
6202 case GIMPLE_UNARY_RHS:
6203 return false;
6205 default:
6206 gcc_unreachable ();
6209 if (code == COND_EXPR && slp_node)
6210 return false;
6212 scalar_dest = gimple_assign_lhs (stmt);
6213 scalar_type = TREE_TYPE (scalar_dest);
6214 if (!POINTER_TYPE_P (scalar_type) && !INTEGRAL_TYPE_P (scalar_type)
6215 && !SCALAR_FLOAT_TYPE_P (scalar_type))
6216 return false;
6218 /* Do not try to vectorize bit-precision reductions. */
6219 if (!type_has_mode_precision_p (scalar_type))
6220 return false;
6222 /* All uses but the last are expected to be defined in the loop.
6223 The last use is the reduction variable. In case of nested cycle this
6224 assumption is not true: we use reduc_index to record the index of the
6225 reduction variable. */
6226 stmt_vec_info reduc_def_info = NULL;
6227 int reduc_index = -1;
6228 for (i = 0; i < op_type; i++)
6230 /* The condition of COND_EXPR is checked in vectorizable_condition(). */
6231 if (i == 0 && code == COND_EXPR)
6232 continue;
6234 stmt_vec_info def_stmt_info;
6235 is_simple_use = vect_is_simple_use (ops[i], loop_vinfo, &dts[i], &tem,
6236 &def_stmt_info);
6237 dt = dts[i];
6238 gcc_assert (is_simple_use);
6239 if (dt == vect_reduction_def)
6241 reduc_def_info = def_stmt_info;
6242 reduc_index = i;
6243 continue;
6245 else if (tem)
6247 /* To properly compute ncopies we are interested in the widest
6248 input type in case we're looking at a widening accumulation. */
6249 if (!vectype_in
6250 || (GET_MODE_SIZE (SCALAR_TYPE_MODE (TREE_TYPE (vectype_in)))
6251 < GET_MODE_SIZE (SCALAR_TYPE_MODE (TREE_TYPE (tem)))))
6252 vectype_in = tem;
6255 if (dt != vect_internal_def
6256 && dt != vect_external_def
6257 && dt != vect_constant_def
6258 && dt != vect_induction_def
6259 && !(dt == vect_nested_cycle && nested_cycle))
6260 return false;
6262 if (dt == vect_nested_cycle)
6264 found_nested_cycle_def = true;
6265 reduc_def_info = def_stmt_info;
6266 reduc_index = i;
6269 if (i == 1 && code == COND_EXPR)
6271 /* Record how value of COND_EXPR is defined. */
6272 if (dt == vect_constant_def)
6274 cond_reduc_dt = dt;
6275 cond_reduc_val = ops[i];
6277 if (dt == vect_induction_def
6278 && def_stmt_info
6279 && is_nonwrapping_integer_induction (def_stmt_info, loop))
6281 cond_reduc_dt = dt;
6282 cond_stmt_vinfo = def_stmt_info;
6287 if (!vectype_in)
6288 vectype_in = vectype_out;
6290 /* When vectorizing a reduction chain w/o SLP the reduction PHI is not
6291 directly used in stmt. */
6292 if (reduc_index == -1)
6294 if (STMT_VINFO_REDUC_TYPE (stmt_info) == FOLD_LEFT_REDUCTION)
6296 if (dump_enabled_p ())
6297 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6298 "in-order reduction chain without SLP.\n");
6299 return false;
6302 if (orig_stmt_info)
6303 reduc_def_info = STMT_VINFO_REDUC_DEF (orig_stmt_info);
6304 else
6305 reduc_def_info = STMT_VINFO_REDUC_DEF (stmt_info);
6308 if (! reduc_def_info)
6309 return false;
6311 gphi *reduc_def_phi = dyn_cast <gphi *> (reduc_def_info->stmt);
6312 if (!reduc_def_phi)
6313 return false;
6315 if (!(reduc_index == -1
6316 || dts[reduc_index] == vect_reduction_def
6317 || dts[reduc_index] == vect_nested_cycle
6318 || ((dts[reduc_index] == vect_internal_def
6319 || dts[reduc_index] == vect_external_def
6320 || dts[reduc_index] == vect_constant_def
6321 || dts[reduc_index] == vect_induction_def)
6322 && nested_cycle && found_nested_cycle_def)))
6324 /* For pattern recognized stmts, orig_stmt might be a reduction,
6325 but some helper statements for the pattern might not, or
6326 might be COND_EXPRs with reduction uses in the condition. */
6327 gcc_assert (orig_stmt_info);
6328 return false;
6331 /* PHIs should not participate in patterns. */
6332 gcc_assert (!STMT_VINFO_RELATED_STMT (reduc_def_info));
6333 enum vect_reduction_type v_reduc_type
6334 = STMT_VINFO_REDUC_TYPE (reduc_def_info);
6335 stmt_vec_info tmp = STMT_VINFO_REDUC_DEF (reduc_def_info);
6337 STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info) = v_reduc_type;
6338 /* If we have a condition reduction, see if we can simplify it further. */
6339 if (v_reduc_type == COND_REDUCTION)
6341 /* TODO: We can't yet handle reduction chains, since we need to treat
6342 each COND_EXPR in the chain specially, not just the last one.
6343 E.g. for:
6345 x_1 = PHI <x_3, ...>
6346 x_2 = a_2 ? ... : x_1;
6347 x_3 = a_3 ? ... : x_2;
6349 we're interested in the last element in x_3 for which a_2 || a_3
6350 is true, whereas the current reduction chain handling would
6351 vectorize x_2 as a normal VEC_COND_EXPR and only treat x_3
6352 as a reduction operation. */
6353 if (reduc_index == -1)
6355 if (dump_enabled_p ())
6356 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6357 "conditional reduction chains not supported\n");
6358 return false;
6361 /* vect_is_simple_reduction ensured that operand 2 is the
6362 loop-carried operand. */
6363 gcc_assert (reduc_index == 2);
6365 /* Loop peeling modifies the initial value of the reduction PHI, which
6366 makes the reduction stmt to be transformed differ from the
6367 original stmt analyzed. We need to record the reduction code for
6368 CONST_COND_REDUCTION type reduction at analyzing stage, thus
6369 it can be used directly at transform stage. */
6370 if (STMT_VINFO_VEC_CONST_COND_REDUC_CODE (stmt_info) == MAX_EXPR
6371 || STMT_VINFO_VEC_CONST_COND_REDUC_CODE (stmt_info) == MIN_EXPR)
6373 /* Also set the reduction type to CONST_COND_REDUCTION. */
6374 gcc_assert (cond_reduc_dt == vect_constant_def);
6375 STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info) = CONST_COND_REDUCTION;
6377 else if (direct_internal_fn_supported_p (IFN_FOLD_EXTRACT_LAST,
6378 vectype_in, OPTIMIZE_FOR_SPEED))
6380 if (dump_enabled_p ())
6381 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6382 "optimizing condition reduction with"
6383 " FOLD_EXTRACT_LAST.\n");
6384 STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info) = EXTRACT_LAST_REDUCTION;
6386 else if (cond_reduc_dt == vect_induction_def)
6388 tree base
6389 = STMT_VINFO_LOOP_PHI_EVOLUTION_BASE_UNCHANGED (cond_stmt_vinfo);
6390 tree step = STMT_VINFO_LOOP_PHI_EVOLUTION_PART (cond_stmt_vinfo);
6392 gcc_assert (TREE_CODE (base) == INTEGER_CST
6393 && TREE_CODE (step) == INTEGER_CST);
6394 cond_reduc_val = NULL_TREE;
6395 /* Find a suitable value, for MAX_EXPR below base, for MIN_EXPR
6396 above base; punt if base is the minimum value of the type for
6397 MAX_EXPR or maximum value of the type for MIN_EXPR for now. */
6398 if (tree_int_cst_sgn (step) == -1)
6400 cond_reduc_op_code = MIN_EXPR;
6401 if (tree_int_cst_sgn (base) == -1)
6402 cond_reduc_val = build_int_cst (TREE_TYPE (base), 0);
6403 else if (tree_int_cst_lt (base,
6404 TYPE_MAX_VALUE (TREE_TYPE (base))))
6405 cond_reduc_val
6406 = int_const_binop (PLUS_EXPR, base, integer_one_node);
6408 else
6410 cond_reduc_op_code = MAX_EXPR;
6411 if (tree_int_cst_sgn (base) == 1)
6412 cond_reduc_val = build_int_cst (TREE_TYPE (base), 0);
6413 else if (tree_int_cst_lt (TYPE_MIN_VALUE (TREE_TYPE (base)),
6414 base))
6415 cond_reduc_val
6416 = int_const_binop (MINUS_EXPR, base, integer_one_node);
6418 if (cond_reduc_val)
6420 if (dump_enabled_p ())
6421 dump_printf_loc (MSG_NOTE, vect_location,
6422 "condition expression based on "
6423 "integer induction.\n");
6424 STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info)
6425 = INTEGER_INDUC_COND_REDUCTION;
6428 else if (cond_reduc_dt == vect_constant_def)
6430 enum vect_def_type cond_initial_dt;
6431 gimple *def_stmt = SSA_NAME_DEF_STMT (ops[reduc_index]);
6432 tree cond_initial_val
6433 = PHI_ARG_DEF_FROM_EDGE (def_stmt, loop_preheader_edge (loop));
6435 gcc_assert (cond_reduc_val != NULL_TREE);
6436 vect_is_simple_use (cond_initial_val, loop_vinfo, &cond_initial_dt);
6437 if (cond_initial_dt == vect_constant_def
6438 && types_compatible_p (TREE_TYPE (cond_initial_val),
6439 TREE_TYPE (cond_reduc_val)))
6441 tree e = fold_binary (LE_EXPR, boolean_type_node,
6442 cond_initial_val, cond_reduc_val);
6443 if (e && (integer_onep (e) || integer_zerop (e)))
6445 if (dump_enabled_p ())
6446 dump_printf_loc (MSG_NOTE, vect_location,
6447 "condition expression based on "
6448 "compile time constant.\n");
6449 /* Record reduction code at analysis stage. */
6450 STMT_VINFO_VEC_CONST_COND_REDUC_CODE (stmt_info)
6451 = integer_onep (e) ? MAX_EXPR : MIN_EXPR;
6452 STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info)
6453 = CONST_COND_REDUCTION;
6459 if (orig_stmt_info)
6460 gcc_assert (tmp == orig_stmt_info
6461 || REDUC_GROUP_FIRST_ELEMENT (tmp) == orig_stmt_info);
6462 else
6463 /* We changed STMT to be the first stmt in the reduction chain, hence we
6464 check that in this case the first element in the chain is STMT. */
6465 gcc_assert (tmp == stmt_info
6466 || REDUC_GROUP_FIRST_ELEMENT (tmp) == stmt_info);
6468 if (STMT_VINFO_LIVE_P (reduc_def_info))
6469 return false;
6471 if (slp_node)
6472 ncopies = 1;
6473 else
6474 ncopies = vect_get_num_copies (loop_vinfo, vectype_in);
6476 gcc_assert (ncopies >= 1);
6478 vec_mode = TYPE_MODE (vectype_in);
6479 poly_uint64 nunits_out = TYPE_VECTOR_SUBPARTS (vectype_out);
6481 if (nested_cycle)
6483 def_bb = gimple_bb (reduc_def_phi);
6484 def_stmt_loop = def_bb->loop_father;
6485 def_arg = PHI_ARG_DEF_FROM_EDGE (reduc_def_phi,
6486 loop_preheader_edge (def_stmt_loop));
6487 stmt_vec_info def_arg_stmt_info = loop_vinfo->lookup_def (def_arg);
6488 if (def_arg_stmt_info
6489 && (STMT_VINFO_DEF_TYPE (def_arg_stmt_info)
6490 == vect_double_reduction_def))
6491 double_reduc = true;
6494 if (code == COND_EXPR)
6496 /* Only call during the analysis stage, otherwise we'll lose
6497 STMT_VINFO_TYPE. We'll pass ops[0] as reduc_op, it's only
6498 used as a flag during analysis. */
6499 if (!vec_stmt && !vectorizable_condition (stmt_info, gsi, NULL,
6500 ops[0], 0, NULL,
6501 cost_vec))
6503 if (dump_enabled_p ())
6504 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6505 "unsupported condition in reduction\n");
6506 return false;
6509 else if (code == LSHIFT_EXPR || code == RSHIFT_EXPR
6510 || code == LROTATE_EXPR || code == RROTATE_EXPR)
6512 /* Only call during the analysis stage, otherwise we'll lose
6513 STMT_VINFO_TYPE. We only support this for nested cycles
6514 without double reductions at the moment. */
6515 if (!nested_cycle
6516 || double_reduc
6517 || (!vec_stmt && !vectorizable_shift (stmt_info, gsi, NULL,
6518 NULL, cost_vec)))
6520 if (dump_enabled_p ())
6521 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6522 "unsupported shift or rotation in reduction\n");
6523 return false;
6526 else
6528 /* 4. Supportable by target? */
6530 /* 4.1. check support for the operation in the loop */
6531 optab = optab_for_tree_code (code, vectype_in, optab_default);
6532 if (!optab)
6534 if (dump_enabled_p ())
6535 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6536 "no optab.\n");
6538 return false;
6541 if (optab_handler (optab, vec_mode) == CODE_FOR_nothing)
6543 if (dump_enabled_p ())
6544 dump_printf (MSG_NOTE, "op not supported by target.\n");
6546 if (maybe_ne (GET_MODE_SIZE (vec_mode), UNITS_PER_WORD)
6547 || !vect_worthwhile_without_simd_p (loop_vinfo, code))
6548 return false;
6550 if (dump_enabled_p ())
6551 dump_printf (MSG_NOTE, "proceeding using word mode.\n");
6554 /* Worthwhile without SIMD support? */
6555 if (!VECTOR_MODE_P (TYPE_MODE (vectype_in))
6556 && !vect_worthwhile_without_simd_p (loop_vinfo, code))
6558 if (dump_enabled_p ())
6559 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6560 "not worthwhile without SIMD support.\n");
6562 return false;
6566 /* 4.2. Check support for the epilog operation.
6568 If STMT represents a reduction pattern, then the type of the
6569 reduction variable may be different than the type of the rest
6570 of the arguments. For example, consider the case of accumulation
6571 of shorts into an int accumulator; the original code:
6572 S1: int_a = (int) short_a;
6573 orig_stmt-> S2: int_acc = plus <int_a ,int_acc>;
6575 was replaced with:
6576 STMT: int_acc = widen_sum <short_a, int_acc>
6578 This means that:
6579 1. The tree-code that is used to create the vector operation in the
6580 epilog code (that reduces the partial results) is not the
6581 tree-code of STMT, but is rather the tree-code of the original
6582 stmt from the pattern that STMT is replacing. I.e, in the example
6583 above we want to use 'widen_sum' in the loop, but 'plus' in the
6584 epilog.
6585 2. The type (mode) we use to check available target support
6586 for the vector operation to be created in the *epilog*, is
6587 determined by the type of the reduction variable (in the example
6588 above we'd check this: optab_handler (plus_optab, vect_int_mode])).
6589 However the type (mode) we use to check available target support
6590 for the vector operation to be created *inside the loop*, is
6591 determined by the type of the other arguments to STMT (in the
6592 example we'd check this: optab_handler (widen_sum_optab,
6593 vect_short_mode)).
6595 This is contrary to "regular" reductions, in which the types of all
6596 the arguments are the same as the type of the reduction variable.
6597 For "regular" reductions we can therefore use the same vector type
6598 (and also the same tree-code) when generating the epilog code and
6599 when generating the code inside the loop. */
6601 vect_reduction_type reduction_type
6602 = STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info);
6603 if (orig_stmt_info
6604 && (reduction_type == TREE_CODE_REDUCTION
6605 || reduction_type == FOLD_LEFT_REDUCTION))
6607 /* This is a reduction pattern: get the vectype from the type of the
6608 reduction variable, and get the tree-code from orig_stmt. */
6609 orig_code = gimple_assign_rhs_code (orig_stmt_info->stmt);
6610 gcc_assert (vectype_out);
6611 vec_mode = TYPE_MODE (vectype_out);
6613 else
6615 /* Regular reduction: the same vectype and tree-code as used for
6616 the vector code inside the loop can also be used for the epilog code. */
6617 orig_code = code;
6619 if (code == MINUS_EXPR)
6620 orig_code = PLUS_EXPR;
6622 /* For simple condition reductions, replace with the actual expression
6623 we want to base our reduction around. */
6624 if (reduction_type == CONST_COND_REDUCTION)
6626 orig_code = STMT_VINFO_VEC_CONST_COND_REDUC_CODE (stmt_info);
6627 gcc_assert (orig_code == MAX_EXPR || orig_code == MIN_EXPR);
6629 else if (reduction_type == INTEGER_INDUC_COND_REDUCTION)
6630 orig_code = cond_reduc_op_code;
6633 reduc_fn = IFN_LAST;
6635 if (reduction_type == TREE_CODE_REDUCTION
6636 || reduction_type == FOLD_LEFT_REDUCTION
6637 || reduction_type == INTEGER_INDUC_COND_REDUCTION
6638 || reduction_type == CONST_COND_REDUCTION)
6640 if (reduction_type == FOLD_LEFT_REDUCTION
6641 ? fold_left_reduction_fn (orig_code, &reduc_fn)
6642 : reduction_fn_for_scalar_code (orig_code, &reduc_fn))
6644 if (reduc_fn != IFN_LAST
6645 && !direct_internal_fn_supported_p (reduc_fn, vectype_out,
6646 OPTIMIZE_FOR_SPEED))
6648 if (dump_enabled_p ())
6649 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6650 "reduc op not supported by target.\n");
6652 reduc_fn = IFN_LAST;
6655 else
6657 if (!nested_cycle || double_reduc)
6659 if (dump_enabled_p ())
6660 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6661 "no reduc code for scalar code.\n");
6663 return false;
6667 else if (reduction_type == COND_REDUCTION)
6669 int scalar_precision
6670 = GET_MODE_PRECISION (SCALAR_TYPE_MODE (scalar_type));
6671 cr_index_scalar_type = make_unsigned_type (scalar_precision);
6672 cr_index_vector_type = build_vector_type (cr_index_scalar_type,
6673 nunits_out);
6675 if (direct_internal_fn_supported_p (IFN_REDUC_MAX, cr_index_vector_type,
6676 OPTIMIZE_FOR_SPEED))
6677 reduc_fn = IFN_REDUC_MAX;
6680 if (reduction_type != EXTRACT_LAST_REDUCTION
6681 && (!nested_cycle || double_reduc)
6682 && reduc_fn == IFN_LAST
6683 && !nunits_out.is_constant ())
6685 if (dump_enabled_p ())
6686 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6687 "missing target support for reduction on"
6688 " variable-length vectors.\n");
6689 return false;
6692 if ((double_reduc || reduction_type != TREE_CODE_REDUCTION)
6693 && ncopies > 1)
6695 if (dump_enabled_p ())
6696 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6697 "multiple types in double reduction or condition "
6698 "reduction.\n");
6699 return false;
6702 /* For SLP reductions, see if there is a neutral value we can use. */
6703 tree neutral_op = NULL_TREE;
6704 if (slp_node)
6705 neutral_op = neutral_op_for_slp_reduction
6706 (slp_node_instance->reduc_phis, code,
6707 REDUC_GROUP_FIRST_ELEMENT (stmt_info) != NULL);
6709 if (double_reduc && reduction_type == FOLD_LEFT_REDUCTION)
6711 /* We can't support in-order reductions of code such as this:
6713 for (int i = 0; i < n1; ++i)
6714 for (int j = 0; j < n2; ++j)
6715 l += a[j];
6717 since GCC effectively transforms the loop when vectorizing:
6719 for (int i = 0; i < n1 / VF; ++i)
6720 for (int j = 0; j < n2; ++j)
6721 for (int k = 0; k < VF; ++k)
6722 l += a[j];
6724 which is a reassociation of the original operation. */
6725 if (dump_enabled_p ())
6726 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6727 "in-order double reduction not supported.\n");
6729 return false;
6732 if (reduction_type == FOLD_LEFT_REDUCTION
6733 && slp_node
6734 && !REDUC_GROUP_FIRST_ELEMENT (stmt_info))
6736 /* We cannot use in-order reductions in this case because there is
6737 an implicit reassociation of the operations involved. */
6738 if (dump_enabled_p ())
6739 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6740 "in-order unchained SLP reductions not supported.\n");
6741 return false;
6744 /* For double reductions, and for SLP reductions with a neutral value,
6745 we construct a variable-length initial vector by loading a vector
6746 full of the neutral value and then shift-and-inserting the start
6747 values into the low-numbered elements. */
6748 if ((double_reduc || neutral_op)
6749 && !nunits_out.is_constant ()
6750 && !direct_internal_fn_supported_p (IFN_VEC_SHL_INSERT,
6751 vectype_out, OPTIMIZE_FOR_SPEED))
6753 if (dump_enabled_p ())
6754 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6755 "reduction on variable-length vectors requires"
6756 " target support for a vector-shift-and-insert"
6757 " operation.\n");
6758 return false;
6761 /* Check extra constraints for variable-length unchained SLP reductions. */
6762 if (STMT_SLP_TYPE (stmt_info)
6763 && !REDUC_GROUP_FIRST_ELEMENT (stmt_info)
6764 && !nunits_out.is_constant ())
6766 /* We checked above that we could build the initial vector when
6767 there's a neutral element value. Check here for the case in
6768 which each SLP statement has its own initial value and in which
6769 that value needs to be repeated for every instance of the
6770 statement within the initial vector. */
6771 unsigned int group_size = SLP_TREE_SCALAR_STMTS (slp_node).length ();
6772 scalar_mode elt_mode = SCALAR_TYPE_MODE (TREE_TYPE (vectype_out));
6773 if (!neutral_op
6774 && !can_duplicate_and_interleave_p (group_size, elt_mode))
6776 if (dump_enabled_p ())
6777 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6778 "unsupported form of SLP reduction for"
6779 " variable-length vectors: cannot build"
6780 " initial vector.\n");
6781 return false;
6783 /* The epilogue code relies on the number of elements being a multiple
6784 of the group size. The duplicate-and-interleave approach to setting
6785 up the initial vector does too. */
6786 if (!multiple_p (nunits_out, group_size))
6788 if (dump_enabled_p ())
6789 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6790 "unsupported form of SLP reduction for"
6791 " variable-length vectors: the vector size"
6792 " is not a multiple of the number of results.\n");
6793 return false;
6797 /* In case of widening multiplication by a constant, we update the type
6798 of the constant to be the type of the other operand. We check that the
6799 constant fits the type in the pattern recognition pass. */
6800 if (code == DOT_PROD_EXPR
6801 && !types_compatible_p (TREE_TYPE (ops[0]), TREE_TYPE (ops[1])))
6803 if (TREE_CODE (ops[0]) == INTEGER_CST)
6804 ops[0] = fold_convert (TREE_TYPE (ops[1]), ops[0]);
6805 else if (TREE_CODE (ops[1]) == INTEGER_CST)
6806 ops[1] = fold_convert (TREE_TYPE (ops[0]), ops[1]);
6807 else
6809 if (dump_enabled_p ())
6810 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6811 "invalid types in dot-prod\n");
6813 return false;
6817 if (reduction_type == COND_REDUCTION)
6819 widest_int ni;
6821 if (! max_loop_iterations (loop, &ni))
6823 if (dump_enabled_p ())
6824 dump_printf_loc (MSG_NOTE, vect_location,
6825 "loop count not known, cannot create cond "
6826 "reduction.\n");
6827 return false;
6829 /* Convert backedges to iterations. */
6830 ni += 1;
6832 /* The additional index will be the same type as the condition. Check
6833 that the loop iteration count fits into this type less one (the zero
6834 slot is reserved for when there are no matches). */
6835 tree max_index = TYPE_MAX_VALUE (cr_index_scalar_type);
6836 if (wi::geu_p (ni, wi::to_widest (max_index)))
6838 if (dump_enabled_p ())
6839 dump_printf_loc (MSG_NOTE, vect_location,
6840 "loop size is greater than data size.\n");
6841 return false;
6845 /* In case the vectorization factor (VF) is bigger than the number
6846 of elements that we can fit in a vectype (nunits), we have to generate
6847 more than one vector stmt - i.e - we need to "unroll" the
6848 vector stmt by a factor VF/nunits. For more details see documentation
6849 in vectorizable_operation. */
6851 /* If the reduction is used in an outer loop we need to generate
6852 VF intermediate results, like so (e.g. for ncopies=2):
6853 r0 = phi (init, r0)
6854 r1 = phi (init, r1)
6855 r0 = x0 + r0;
6856 r1 = x1 + r1;
6857 (i.e. we generate VF results in 2 registers).
6858 In this case we have a separate def-use cycle for each copy, and therefore
6859 for each copy we get the vector def for the reduction variable from the
6860 respective phi node created for this copy.
6862 Otherwise (the reduction is unused in the loop nest), we can combine
6863 together intermediate results, like so (e.g. for ncopies=2):
6864 r = phi (init, r)
6865 r = x0 + r;
6866 r = x1 + r;
6867 (i.e. we generate VF/2 results in a single register).
6868 In this case for each copy we get the vector def for the reduction variable
6869 from the vectorized reduction operation generated in the previous iteration.
6871 This only works when we see both the reduction PHI and its only consumer
6872 in vectorizable_reduction and there are no intermediate stmts
6873 participating. */
6874 stmt_vec_info use_stmt_info;
6875 tree reduc_phi_result = gimple_phi_result (reduc_def_phi);
6876 if (ncopies > 1
6877 && (STMT_VINFO_RELEVANT (stmt_info) <= vect_used_only_live)
6878 && (use_stmt_info = loop_vinfo->lookup_single_use (reduc_phi_result))
6879 && vect_stmt_to_vectorize (use_stmt_info) == stmt_info)
6881 single_defuse_cycle = true;
6882 epilog_copies = 1;
6884 else
6885 epilog_copies = ncopies;
6887 /* If the reduction stmt is one of the patterns that have lane
6888 reduction embedded we cannot handle the case of ! single_defuse_cycle. */
6889 if ((ncopies > 1
6890 && ! single_defuse_cycle)
6891 && (code == DOT_PROD_EXPR
6892 || code == WIDEN_SUM_EXPR
6893 || code == SAD_EXPR))
6895 if (dump_enabled_p ())
6896 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6897 "multi def-use cycle not possible for lane-reducing "
6898 "reduction operation\n");
6899 return false;
6902 if (slp_node)
6903 vec_num = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node);
6904 else
6905 vec_num = 1;
6907 internal_fn cond_fn = get_conditional_internal_fn (code);
6908 vec_loop_masks *masks = &LOOP_VINFO_MASKS (loop_vinfo);
6910 if (!vec_stmt) /* transformation not required. */
6912 vect_model_reduction_cost (stmt_info, reduc_fn, ncopies, cost_vec);
6913 if (loop_vinfo && LOOP_VINFO_CAN_FULLY_MASK_P (loop_vinfo))
6915 if (reduction_type != FOLD_LEFT_REDUCTION
6916 && (cond_fn == IFN_LAST
6917 || !direct_internal_fn_supported_p (cond_fn, vectype_in,
6918 OPTIMIZE_FOR_SPEED)))
6920 if (dump_enabled_p ())
6921 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6922 "can't use a fully-masked loop because no"
6923 " conditional operation is available.\n");
6924 LOOP_VINFO_CAN_FULLY_MASK_P (loop_vinfo) = false;
6926 else if (reduc_index == -1)
6928 if (dump_enabled_p ())
6929 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6930 "can't use a fully-masked loop for chained"
6931 " reductions.\n");
6932 LOOP_VINFO_CAN_FULLY_MASK_P (loop_vinfo) = false;
6934 else
6935 vect_record_loop_mask (loop_vinfo, masks, ncopies * vec_num,
6936 vectype_in);
6938 if (dump_enabled_p ()
6939 && reduction_type == FOLD_LEFT_REDUCTION)
6940 dump_printf_loc (MSG_NOTE, vect_location,
6941 "using an in-order (fold-left) reduction.\n");
6942 STMT_VINFO_TYPE (stmt_info) = reduc_vec_info_type;
6943 return true;
6946 /* Transform. */
6948 if (dump_enabled_p ())
6949 dump_printf_loc (MSG_NOTE, vect_location, "transform reduction.\n");
6951 /* FORNOW: Multiple types are not supported for condition. */
6952 if (code == COND_EXPR)
6953 gcc_assert (ncopies == 1);
6955 bool masked_loop_p = LOOP_VINFO_FULLY_MASKED_P (loop_vinfo);
6957 if (reduction_type == FOLD_LEFT_REDUCTION)
6958 return vectorize_fold_left_reduction
6959 (stmt_info, gsi, vec_stmt, slp_node, reduc_def_phi, code,
6960 reduc_fn, ops, vectype_in, reduc_index, masks);
6962 if (reduction_type == EXTRACT_LAST_REDUCTION)
6964 gcc_assert (!slp_node);
6965 return vectorizable_condition (stmt_info, gsi, vec_stmt,
6966 NULL, reduc_index, NULL, NULL);
6969 /* Create the destination vector */
6970 vec_dest = vect_create_destination_var (scalar_dest, vectype_out);
6972 prev_stmt_info = NULL;
6973 prev_phi_info = NULL;
6974 if (!slp_node)
6976 vec_oprnds0.create (1);
6977 vec_oprnds1.create (1);
6978 if (op_type == ternary_op)
6979 vec_oprnds2.create (1);
6982 phis.create (vec_num);
6983 vect_defs.create (vec_num);
6984 if (!slp_node)
6985 vect_defs.quick_push (NULL_TREE);
6987 if (slp_node)
6988 phis.splice (SLP_TREE_VEC_STMTS (slp_node_instance->reduc_phis));
6989 else
6990 phis.quick_push (STMT_VINFO_VEC_STMT (reduc_def_info));
6992 for (j = 0; j < ncopies; j++)
6994 if (code == COND_EXPR)
6996 gcc_assert (!slp_node);
6997 vectorizable_condition (stmt_info, gsi, vec_stmt,
6998 PHI_RESULT (phis[0]->stmt),
6999 reduc_index, NULL, NULL);
7000 /* Multiple types are not supported for condition. */
7001 break;
7003 if (code == LSHIFT_EXPR
7004 || code == RSHIFT_EXPR)
7006 vectorizable_shift (stmt_info, gsi, vec_stmt, slp_node, NULL);
7007 break;
7010 /* Handle uses. */
7011 if (j == 0)
7013 if (slp_node)
7015 /* Get vec defs for all the operands except the reduction index,
7016 ensuring the ordering of the ops in the vector is kept. */
7017 auto_vec<tree, 3> slp_ops;
7018 auto_vec<vec<tree>, 3> vec_defs;
7020 slp_ops.quick_push (ops[0]);
7021 slp_ops.quick_push (ops[1]);
7022 if (op_type == ternary_op)
7023 slp_ops.quick_push (ops[2]);
7025 vect_get_slp_defs (slp_ops, slp_node, &vec_defs);
7027 vec_oprnds0.safe_splice (vec_defs[0]);
7028 vec_defs[0].release ();
7029 vec_oprnds1.safe_splice (vec_defs[1]);
7030 vec_defs[1].release ();
7031 if (op_type == ternary_op)
7033 vec_oprnds2.safe_splice (vec_defs[2]);
7034 vec_defs[2].release ();
7037 else
7039 vec_oprnds0.quick_push
7040 (vect_get_vec_def_for_operand (ops[0], stmt_info));
7041 vec_oprnds1.quick_push
7042 (vect_get_vec_def_for_operand (ops[1], stmt_info));
7043 if (op_type == ternary_op)
7044 vec_oprnds2.quick_push
7045 (vect_get_vec_def_for_operand (ops[2], stmt_info));
7048 else
7050 if (!slp_node)
7052 gcc_assert (reduc_index != -1 || ! single_defuse_cycle);
7054 if (single_defuse_cycle && reduc_index == 0)
7055 vec_oprnds0[0] = gimple_get_lhs (new_stmt_info->stmt);
7056 else
7057 vec_oprnds0[0]
7058 = vect_get_vec_def_for_stmt_copy (loop_vinfo,
7059 vec_oprnds0[0]);
7060 if (single_defuse_cycle && reduc_index == 1)
7061 vec_oprnds1[0] = gimple_get_lhs (new_stmt_info->stmt);
7062 else
7063 vec_oprnds1[0]
7064 = vect_get_vec_def_for_stmt_copy (loop_vinfo,
7065 vec_oprnds1[0]);
7066 if (op_type == ternary_op)
7068 if (single_defuse_cycle && reduc_index == 2)
7069 vec_oprnds2[0] = gimple_get_lhs (new_stmt_info->stmt);
7070 else
7071 vec_oprnds2[0]
7072 = vect_get_vec_def_for_stmt_copy (loop_vinfo,
7073 vec_oprnds2[0]);
7078 FOR_EACH_VEC_ELT (vec_oprnds0, i, def0)
7080 tree vop[3] = { def0, vec_oprnds1[i], NULL_TREE };
7081 if (masked_loop_p)
7083 /* Make sure that the reduction accumulator is vop[0]. */
7084 if (reduc_index == 1)
7086 gcc_assert (commutative_tree_code (code));
7087 std::swap (vop[0], vop[1]);
7089 tree mask = vect_get_loop_mask (gsi, masks, vec_num * ncopies,
7090 vectype_in, i * ncopies + j);
7091 gcall *call = gimple_build_call_internal (cond_fn, 4, mask,
7092 vop[0], vop[1],
7093 vop[0]);
7094 new_temp = make_ssa_name (vec_dest, call);
7095 gimple_call_set_lhs (call, new_temp);
7096 gimple_call_set_nothrow (call, true);
7097 new_stmt_info
7098 = vect_finish_stmt_generation (stmt_info, call, gsi);
7100 else
7102 if (op_type == ternary_op)
7103 vop[2] = vec_oprnds2[i];
7105 gassign *new_stmt = gimple_build_assign (vec_dest, code,
7106 vop[0], vop[1], vop[2]);
7107 new_temp = make_ssa_name (vec_dest, new_stmt);
7108 gimple_assign_set_lhs (new_stmt, new_temp);
7109 new_stmt_info
7110 = vect_finish_stmt_generation (stmt_info, new_stmt, gsi);
7113 if (slp_node)
7115 SLP_TREE_VEC_STMTS (slp_node).quick_push (new_stmt_info);
7116 vect_defs.quick_push (new_temp);
7118 else
7119 vect_defs[0] = new_temp;
7122 if (slp_node)
7123 continue;
7125 if (j == 0)
7126 STMT_VINFO_VEC_STMT (stmt_info) = *vec_stmt = new_stmt_info;
7127 else
7128 STMT_VINFO_RELATED_STMT (prev_stmt_info) = new_stmt_info;
7130 prev_stmt_info = new_stmt_info;
7133 /* Finalize the reduction-phi (set its arguments) and create the
7134 epilog reduction code. */
7135 if ((!single_defuse_cycle || code == COND_EXPR) && !slp_node)
7136 vect_defs[0] = gimple_get_lhs ((*vec_stmt)->stmt);
7138 vect_create_epilog_for_reduction (vect_defs, stmt_info, reduc_def_phi,
7139 epilog_copies, reduc_fn, phis,
7140 double_reduc, slp_node, slp_node_instance,
7141 cond_reduc_val, cond_reduc_op_code,
7142 neutral_op);
7144 return true;
7147 /* Function vect_min_worthwhile_factor.
7149 For a loop where we could vectorize the operation indicated by CODE,
7150 return the minimum vectorization factor that makes it worthwhile
7151 to use generic vectors. */
7152 static unsigned int
7153 vect_min_worthwhile_factor (enum tree_code code)
7155 switch (code)
7157 case PLUS_EXPR:
7158 case MINUS_EXPR:
7159 case NEGATE_EXPR:
7160 return 4;
7162 case BIT_AND_EXPR:
7163 case BIT_IOR_EXPR:
7164 case BIT_XOR_EXPR:
7165 case BIT_NOT_EXPR:
7166 return 2;
7168 default:
7169 return INT_MAX;
7173 /* Return true if VINFO indicates we are doing loop vectorization and if
7174 it is worth decomposing CODE operations into scalar operations for
7175 that loop's vectorization factor. */
7177 bool
7178 vect_worthwhile_without_simd_p (vec_info *vinfo, tree_code code)
7180 loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo);
7181 unsigned HOST_WIDE_INT value;
7182 return (loop_vinfo
7183 && LOOP_VINFO_VECT_FACTOR (loop_vinfo).is_constant (&value)
7184 && value >= vect_min_worthwhile_factor (code));
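/* For example (illustration only): with a compile-time vectorization
   factor of 4,

     vect_worthwhile_without_simd_p (vinfo, PLUS_EXPR)

   returns true (4 >= 4); the bitwise codes are already worthwhile at
   factor 2, and any other code fails the test because its minimum
   worthwhile factor is INT_MAX.  */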
7187 /* Function vectorizable_induction
7189 Check if STMT_INFO performs an induction computation that can be vectorized.
7190 If VEC_STMT is also passed, vectorize the induction PHI: create a vectorized
7191 phi to replace it, put it in VEC_STMT, and add it to the same basic block.
7192 Return true if STMT_INFO is vectorizable in this way. */
7194 bool
7195 vectorizable_induction (stmt_vec_info stmt_info,
7196 gimple_stmt_iterator *gsi ATTRIBUTE_UNUSED,
7197 stmt_vec_info *vec_stmt, slp_tree slp_node,
7198 stmt_vector_for_cost *cost_vec)
7200 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
7201 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
7202 unsigned ncopies;
7203 bool nested_in_vect_loop = false;
7204 struct loop *iv_loop;
7205 tree vec_def;
7206 edge pe = loop_preheader_edge (loop);
7207 basic_block new_bb;
7208 tree new_vec, vec_init, vec_step, t;
7209 tree new_name;
7210 gimple *new_stmt;
7211 gphi *induction_phi;
7212 tree induc_def, vec_dest;
7213 tree init_expr, step_expr;
7214 poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
7215 unsigned i;
7216 tree expr;
7217 gimple_seq stmts;
7218 imm_use_iterator imm_iter;
7219 use_operand_p use_p;
7220 gimple *exit_phi;
7221 edge latch_e;
7222 tree loop_arg;
7223 gimple_stmt_iterator si;
7225 gphi *phi = dyn_cast <gphi *> (stmt_info->stmt);
7226 if (!phi)
7227 return false;
7229 if (!STMT_VINFO_RELEVANT_P (stmt_info))
7230 return false;
7232 /* Make sure it was recognized as induction computation. */
7233 if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_induction_def)
7234 return false;
7236 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
7237 poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (vectype);
7239 if (slp_node)
7240 ncopies = 1;
7241 else
7242 ncopies = vect_get_num_copies (loop_vinfo, vectype);
7243 gcc_assert (ncopies >= 1);
7245 /* FORNOW. These restrictions should be relaxed. */
7246 if (nested_in_vect_loop_p (loop, stmt_info))
7248 imm_use_iterator imm_iter;
7249 use_operand_p use_p;
7250 gimple *exit_phi;
7251 edge latch_e;
7252 tree loop_arg;
7254 if (ncopies > 1)
7256 if (dump_enabled_p ())
7257 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7258 "multiple types in nested loop.\n");
7259 return false;
7262 /* FORNOW: outer loop induction with SLP not supported. */
7263 if (STMT_SLP_TYPE (stmt_info))
7264 return false;
7266 exit_phi = NULL;
7267 latch_e = loop_latch_edge (loop->inner);
7268 loop_arg = PHI_ARG_DEF_FROM_EDGE (phi, latch_e);
7269 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, loop_arg)
7271 gimple *use_stmt = USE_STMT (use_p);
7272 if (is_gimple_debug (use_stmt))
7273 continue;
7275 if (!flow_bb_inside_loop_p (loop->inner, gimple_bb (use_stmt)))
7277 exit_phi = use_stmt;
7278 break;
7281 if (exit_phi)
7283 stmt_vec_info exit_phi_vinfo = loop_vinfo->lookup_stmt (exit_phi);
7284 if (!(STMT_VINFO_RELEVANT_P (exit_phi_vinfo)
7285 && !STMT_VINFO_LIVE_P (exit_phi_vinfo)))
7287 if (dump_enabled_p ())
7288 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7289 "inner-loop induction only used outside "
7290 "of the outer vectorized loop.\n");
7291 return false;
7295 nested_in_vect_loop = true;
7296 iv_loop = loop->inner;
7298 else
7299 iv_loop = loop;
7300 gcc_assert (iv_loop == (gimple_bb (phi))->loop_father);
7302 if (slp_node && !nunits.is_constant ())
7304 /* The current SLP code creates the initial value element-by-element. */
7305 if (dump_enabled_p ())
7306 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7307 "SLP induction not supported for variable-length"
7308 " vectors.\n");
7309 return false;
7312 if (!vec_stmt) /* transformation not required. */
7314 STMT_VINFO_TYPE (stmt_info) = induc_vec_info_type;
7315 DUMP_VECT_SCOPE ("vectorizable_induction");
7316 vect_model_induction_cost (stmt_info, ncopies, cost_vec);
7317 return true;
7320 /* Transform. */
7322 /* Compute a vector variable, initialized with the first VF values of
7323 the induction variable. E.g., for an iv with IV_PHI='X' and
7324 evolution S, for a vector of 4 units, we want to compute:
7325 [X, X + S, X + 2*S, X + 3*S]. */
7327 if (dump_enabled_p ())
7328 dump_printf_loc (MSG_NOTE, vect_location, "transform induction phi.\n");
7330 latch_e = loop_latch_edge (iv_loop);
7331 loop_arg = PHI_ARG_DEF_FROM_EDGE (phi, latch_e);
7333 step_expr = STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_info);
7334 gcc_assert (step_expr != NULL_TREE);
7336 pe = loop_preheader_edge (iv_loop);
7337 init_expr = PHI_ARG_DEF_FROM_EDGE (phi,
7338 loop_preheader_edge (iv_loop));
7340 stmts = NULL;
7341 if (!nested_in_vect_loop)
7343 /* Convert the initial value to the desired type. */
7344 tree new_type = TREE_TYPE (vectype);
7345 init_expr = gimple_convert (&stmts, new_type, init_expr);
7347 /* If we are using the loop mask to "peel" for alignment then we need
7348 to adjust the start value here. */
7349 tree skip_niters = LOOP_VINFO_MASK_SKIP_NITERS (loop_vinfo);
7350 if (skip_niters != NULL_TREE)
7352 if (FLOAT_TYPE_P (vectype))
7353 skip_niters = gimple_build (&stmts, FLOAT_EXPR, new_type,
7354 skip_niters);
7355 else
7356 skip_niters = gimple_convert (&stmts, new_type, skip_niters);
7357 tree skip_step = gimple_build (&stmts, MULT_EXPR, new_type,
7358 skip_niters, step_expr);
7359 init_expr = gimple_build (&stmts, MINUS_EXPR, new_type,
7360 init_expr, skip_step);
7364 /* Convert the step to the desired type. */
7365 step_expr = gimple_convert (&stmts, TREE_TYPE (vectype), step_expr);
7367 if (stmts)
7369 new_bb = gsi_insert_seq_on_edge_immediate (pe, stmts);
7370 gcc_assert (!new_bb);
7373 /* Find the first insertion point in the BB. */
7374 basic_block bb = gimple_bb (phi);
7375 si = gsi_after_labels (bb);
7377 /* For SLP induction we have to generate several IVs as for example
7378 with group size 3 we need [i, i, i, i + S] [i + S, i + S, i + 2*S, i + 2*S]
7379 [i + 2*S, i + 3*S, i + 3*S, i + 3*S]. The step is the same uniform
7380 [VF*S, VF*S, VF*S, VF*S] for all. */
7381 if (slp_node)
7383 /* Enforced above. */
7384 unsigned int const_nunits = nunits.to_constant ();
7386 /* Generate [VF*S, VF*S, ... ]. */
7387 if (SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr)))
7389 expr = build_int_cst (integer_type_node, vf);
7390 expr = fold_convert (TREE_TYPE (step_expr), expr);
7392 else
7393 expr = build_int_cst (TREE_TYPE (step_expr), vf);
7394 new_name = fold_build2 (MULT_EXPR, TREE_TYPE (step_expr),
7395 expr, step_expr);
7396 if (! CONSTANT_CLASS_P (new_name))
7397 new_name = vect_init_vector (stmt_info, new_name,
7398 TREE_TYPE (step_expr), NULL);
7399 new_vec = build_vector_from_val (vectype, new_name);
7400 vec_step = vect_init_vector (stmt_info, new_vec, vectype, NULL);
7402 /* Now generate the IVs. */
7403 unsigned group_size = SLP_TREE_SCALAR_STMTS (slp_node).length ();
7404 unsigned nvects = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node);
7405 unsigned elts = const_nunits * nvects;
7406 unsigned nivs = least_common_multiple (group_size,
7407 const_nunits) / const_nunits;
7408 gcc_assert (elts % group_size == 0);
7409 tree elt = init_expr;
7410 unsigned ivn;
7411 for (ivn = 0; ivn < nivs; ++ivn)
7413 tree_vector_builder elts (vectype, const_nunits, 1);
7414 stmts = NULL;
7415 for (unsigned eltn = 0; eltn < const_nunits; ++eltn)
7417 if (ivn*const_nunits + eltn >= group_size
7418 && (ivn * const_nunits + eltn) % group_size == 0)
7419 elt = gimple_build (&stmts, PLUS_EXPR, TREE_TYPE (elt),
7420 elt, step_expr);
7421 elts.quick_push (elt);
7423 vec_init = gimple_build_vector (&stmts, &elts);
7424 if (stmts)
7426 new_bb = gsi_insert_seq_on_edge_immediate (pe, stmts);
7427 gcc_assert (!new_bb);
7430 /* Create the induction-phi that defines the induction-operand. */
7431 vec_dest = vect_get_new_vect_var (vectype, vect_simple_var, "vec_iv_");
7432 induction_phi = create_phi_node (vec_dest, iv_loop->header);
7433 stmt_vec_info induction_phi_info
7434 = loop_vinfo->add_stmt (induction_phi);
7435 induc_def = PHI_RESULT (induction_phi);
7437 /* Create the iv update inside the loop */
7438 vec_def = make_ssa_name (vec_dest);
7439 new_stmt = gimple_build_assign (vec_def, PLUS_EXPR, induc_def, vec_step);
7440 gsi_insert_before (&si, new_stmt, GSI_SAME_STMT);
7441 loop_vinfo->add_stmt (new_stmt);
7443 /* Set the arguments of the phi node: */
7444 add_phi_arg (induction_phi, vec_init, pe, UNKNOWN_LOCATION);
7445 add_phi_arg (induction_phi, vec_def, loop_latch_edge (iv_loop),
7446 UNKNOWN_LOCATION);
7448 SLP_TREE_VEC_STMTS (slp_node).quick_push (induction_phi_info);
7451 /* Re-use IVs when we can. */
7452 if (ivn < nvects)
7454 unsigned vfp
7455 = least_common_multiple (group_size, const_nunits) / group_size;
7456 /* Generate [VF'*S, VF'*S, ... ]. */
7457 if (SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr)))
7459 expr = build_int_cst (integer_type_node, vfp);
7460 expr = fold_convert (TREE_TYPE (step_expr), expr);
7462 else
7463 expr = build_int_cst (TREE_TYPE (step_expr), vfp);
7464 new_name = fold_build2 (MULT_EXPR, TREE_TYPE (step_expr),
7465 expr, step_expr);
7466 if (! CONSTANT_CLASS_P (new_name))
7467 new_name = vect_init_vector (stmt_info, new_name,
7468 TREE_TYPE (step_expr), NULL);
7469 new_vec = build_vector_from_val (vectype, new_name);
7470 vec_step = vect_init_vector (stmt_info, new_vec, vectype, NULL);
7471 for (; ivn < nvects; ++ivn)
7473 gimple *iv = SLP_TREE_VEC_STMTS (slp_node)[ivn - nivs]->stmt;
7474 tree def;
7475 if (gimple_code (iv) == GIMPLE_PHI)
7476 def = gimple_phi_result (iv);
7477 else
7478 def = gimple_assign_lhs (iv);
7479 new_stmt = gimple_build_assign (make_ssa_name (vectype),
7480 PLUS_EXPR,
7481 def, vec_step);
7482 if (gimple_code (iv) == GIMPLE_PHI)
7483 gsi_insert_before (&si, new_stmt, GSI_SAME_STMT);
7484 else
7486 gimple_stmt_iterator tgsi = gsi_for_stmt (iv);
7487 gsi_insert_after (&tgsi, new_stmt, GSI_CONTINUE_LINKING);
7489 SLP_TREE_VEC_STMTS (slp_node).quick_push
7490 (loop_vinfo->add_stmt (new_stmt));
7494 return true;
7497 /* Create the vector that holds the initial_value of the induction. */
7498 if (nested_in_vect_loop)
7500 /* iv_loop is nested in the loop to be vectorized. init_expr had already
7501 been created during vectorization of previous stmts. We obtain it
7502 from the STMT_VINFO_VEC_STMT of the defining stmt. */
7503 vec_init = vect_get_vec_def_for_operand (init_expr, stmt_info);
7504 /* If the initial value is not of proper type, convert it. */
7505 if (!useless_type_conversion_p (vectype, TREE_TYPE (vec_init)))
7507 new_stmt
7508 = gimple_build_assign (vect_get_new_ssa_name (vectype,
7509 vect_simple_var,
7510 "vec_iv_"),
7511 VIEW_CONVERT_EXPR,
7512 build1 (VIEW_CONVERT_EXPR, vectype,
7513 vec_init));
7514 vec_init = gimple_assign_lhs (new_stmt);
7515 new_bb = gsi_insert_on_edge_immediate (loop_preheader_edge (iv_loop),
7516 new_stmt);
7517 gcc_assert (!new_bb);
7518 loop_vinfo->add_stmt (new_stmt);
7521 else
7523 /* iv_loop is the loop to be vectorized. Create:
7524 vec_init = [X, X+S, X+2*S, X+3*S] (S = step_expr, X = init_expr) */
7525 stmts = NULL;
7526 new_name = gimple_convert (&stmts, TREE_TYPE (vectype), init_expr);
7528 unsigned HOST_WIDE_INT const_nunits;
7529 if (nunits.is_constant (&const_nunits))
7531 tree_vector_builder elts (vectype, const_nunits, 1);
7532 elts.quick_push (new_name);
7533 for (i = 1; i < const_nunits; i++)
7535 /* Create: new_name_i = new_name + step_expr */
7536 new_name = gimple_build (&stmts, PLUS_EXPR, TREE_TYPE (new_name),
7537 new_name, step_expr);
7538 elts.quick_push (new_name);
7540 /* Create a vector from [new_name_0, new_name_1, ...,
7541 new_name_nunits-1] */
7542 vec_init = gimple_build_vector (&stmts, &elts);
7544 else if (INTEGRAL_TYPE_P (TREE_TYPE (step_expr)))
7545 /* Build the initial value directly from a VEC_SERIES_EXPR. */
7546 vec_init = gimple_build (&stmts, VEC_SERIES_EXPR, vectype,
7547 new_name, step_expr);
7548 else
7550 /* Build:
7551 [base, base, base, ...]
7552 + (vectype) [0, 1, 2, ...] * [step, step, step, ...]. */
7553 gcc_assert (SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr)));
7554 gcc_assert (flag_associative_math);
7555 tree index = build_index_vector (vectype, 0, 1);
7556 tree base_vec = gimple_build_vector_from_val (&stmts, vectype,
7557 new_name);
7558 tree step_vec = gimple_build_vector_from_val (&stmts, vectype,
7559 step_expr);
7560 vec_init = gimple_build (&stmts, FLOAT_EXPR, vectype, index);
7561 vec_init = gimple_build (&stmts, MULT_EXPR, vectype,
7562 vec_init, step_vec);
7563 vec_init = gimple_build (&stmts, PLUS_EXPR, vectype,
7564 vec_init, base_vec);
7567 if (stmts)
7569 new_bb = gsi_insert_seq_on_edge_immediate (pe, stmts);
7570 gcc_assert (!new_bb);
7575 /* Create the vector that holds the step of the induction. */
7576 if (nested_in_vect_loop)
7577 /* iv_loop is nested in the loop to be vectorized. Generate:
7578 vec_step = [S, S, S, S] */
7579 new_name = step_expr;
7580 else
7582 /* iv_loop is the loop to be vectorized. Generate:
7583 vec_step = [VF*S, VF*S, VF*S, VF*S] */
7584 gimple_seq seq = NULL;
7585 if (SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr)))
7587 expr = build_int_cst (integer_type_node, vf);
7588 expr = gimple_build (&seq, FLOAT_EXPR, TREE_TYPE (step_expr), expr);
7590 else
7591 expr = build_int_cst (TREE_TYPE (step_expr), vf);
7592 new_name = gimple_build (&seq, MULT_EXPR, TREE_TYPE (step_expr),
7593 expr, step_expr);
7594 if (seq)
7596 new_bb = gsi_insert_seq_on_edge_immediate (pe, seq);
7597 gcc_assert (!new_bb);
7601 t = unshare_expr (new_name);
7602 gcc_assert (CONSTANT_CLASS_P (new_name)
7603 || TREE_CODE (new_name) == SSA_NAME);
7604 new_vec = build_vector_from_val (vectype, t);
7605 vec_step = vect_init_vector (stmt_info, new_vec, vectype, NULL);
7608 /* Create the following def-use cycle:
7609 loop prolog:
7610 vec_init = ...
7611 vec_step = ...
7612 loop:
7613 vec_iv = PHI <vec_init, vec_loop>
7615 STMT
7617 vec_loop = vec_iv + vec_step; */
7619 /* Create the induction-phi that defines the induction-operand. */
7620 vec_dest = vect_get_new_vect_var (vectype, vect_simple_var, "vec_iv_");
7621 induction_phi = create_phi_node (vec_dest, iv_loop->header);
7622 stmt_vec_info induction_phi_info = loop_vinfo->add_stmt (induction_phi);
7623 induc_def = PHI_RESULT (induction_phi);
7625 /* Create the iv update inside the loop */
7626 vec_def = make_ssa_name (vec_dest);
7627 new_stmt = gimple_build_assign (vec_def, PLUS_EXPR, induc_def, vec_step);
7628 gsi_insert_before (&si, new_stmt, GSI_SAME_STMT);
7629 stmt_vec_info new_stmt_info = loop_vinfo->add_stmt (new_stmt);
7631 /* Set the arguments of the phi node: */
7632 add_phi_arg (induction_phi, vec_init, pe, UNKNOWN_LOCATION);
7633 add_phi_arg (induction_phi, vec_def, loop_latch_edge (iv_loop),
7634 UNKNOWN_LOCATION);
7636 STMT_VINFO_VEC_STMT (stmt_info) = *vec_stmt = induction_phi_info;
7638 /* In case the vectorization factor (VF) is bigger than the number
7639 of elements that we can fit in a vectype (nunits), we have to generate
7640 more than one vector stmt - i.e - we need to "unroll" the
7641 vector stmt by a factor VF/nunits. For more details see documentation
7642 in vectorizable_operation. */
7644 if (ncopies > 1)
7646 gimple_seq seq = NULL;
7647 stmt_vec_info prev_stmt_vinfo;
7648 /* FORNOW. This restriction should be relaxed. */
7649 gcc_assert (!nested_in_vect_loop);
7651 /* Create the vector that holds the step of the induction. */
7652 if (SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr)))
7654 expr = build_int_cst (integer_type_node, nunits);
7655 expr = gimple_build (&seq, FLOAT_EXPR, TREE_TYPE (step_expr), expr);
7657 else
7658 expr = build_int_cst (TREE_TYPE (step_expr), nunits);
7659 new_name = gimple_build (&seq, MULT_EXPR, TREE_TYPE (step_expr),
7660 expr, step_expr);
7661 if (seq)
7663 new_bb = gsi_insert_seq_on_edge_immediate (pe, seq);
7664 gcc_assert (!new_bb);
7667 t = unshare_expr (new_name);
7668 gcc_assert (CONSTANT_CLASS_P (new_name)
7669 || TREE_CODE (new_name) == SSA_NAME);
7670 new_vec = build_vector_from_val (vectype, t);
7671 vec_step = vect_init_vector (stmt_info, new_vec, vectype, NULL);
7673 vec_def = induc_def;
7674 prev_stmt_vinfo = induction_phi_info;
7675 for (i = 1; i < ncopies; i++)
7677 /* vec_i = vec_prev + vec_step */
7678 new_stmt = gimple_build_assign (vec_dest, PLUS_EXPR,
7679 vec_def, vec_step);
7680 vec_def = make_ssa_name (vec_dest, new_stmt);
7681 gimple_assign_set_lhs (new_stmt, vec_def);
7683 gsi_insert_before (&si, new_stmt, GSI_SAME_STMT);
7684 new_stmt_info = loop_vinfo->add_stmt (new_stmt);
7685 STMT_VINFO_RELATED_STMT (prev_stmt_vinfo) = new_stmt_info;
7686 prev_stmt_vinfo = new_stmt_info;
7690 if (nested_in_vect_loop)
7692 /* Find the loop-closed exit-phi of the induction, and record
7693 the final vector of induction results: */
7694 exit_phi = NULL;
7695 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, loop_arg)
7697 gimple *use_stmt = USE_STMT (use_p);
7698 if (is_gimple_debug (use_stmt))
7699 continue;
7701 if (!flow_bb_inside_loop_p (iv_loop, gimple_bb (use_stmt)))
7703 exit_phi = use_stmt;
7704 break;
7707 if (exit_phi)
7709 stmt_vec_info stmt_vinfo = loop_vinfo->lookup_stmt (exit_phi);
7710 /* FORNOW. We do not yet support the case in which an inner-loop induction
7711 is only used outside the outer loop (i.e. not used in the outer loop). */
7712 gcc_assert (STMT_VINFO_RELEVANT_P (stmt_vinfo)
7713 && !STMT_VINFO_LIVE_P (stmt_vinfo));
7715 STMT_VINFO_VEC_STMT (stmt_vinfo) = new_stmt_info;
7716 if (dump_enabled_p ())
7717 dump_printf_loc (MSG_NOTE, vect_location,
7718 "vector of inductions after inner-loop:%G",
7719 new_stmt);
7724 if (dump_enabled_p ())
7725 dump_printf_loc (MSG_NOTE, vect_location,
7726 "transform induction: created def-use cycle: %G%G",
7727 induction_phi, SSA_NAME_DEF_STMT (vec_def));
7729 return true;
7732 /* Function vectorizable_live_operation.
7734 STMT_INFO computes a value that is used outside the loop. Check if
7735 it can be supported. */
7737 bool
7738 vectorizable_live_operation (stmt_vec_info stmt_info,
7739 gimple_stmt_iterator *gsi ATTRIBUTE_UNUSED,
7740 slp_tree slp_node, int slp_index,
7741 stmt_vec_info *vec_stmt,
7742 stmt_vector_for_cost *)
7744 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
7745 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
7746 imm_use_iterator imm_iter;
7747 tree lhs, lhs_type, bitsize, vec_bitsize;
7748 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
7749 poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (vectype);
7750 int ncopies;
7751 gimple *use_stmt;
7752 auto_vec<tree> vec_oprnds;
7753 int vec_entry = 0;
7754 poly_uint64 vec_index = 0;
7756 gcc_assert (STMT_VINFO_LIVE_P (stmt_info));
7758 if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_reduction_def)
7759 return false;
7761 /* FORNOW. CHECKME. */
7762 if (nested_in_vect_loop_p (loop, stmt_info))
7763 return false;
7765 /* If STMT is not relevant and it is a simple assignment and its inputs are
7766 invariant then it can remain in place, unvectorized. The original last
7767 scalar value that it computes will be used. */
7768 if (!STMT_VINFO_RELEVANT_P (stmt_info))
7770 gcc_assert (is_simple_and_all_uses_invariant (stmt_info, loop_vinfo));
7771 if (dump_enabled_p ())
7772 dump_printf_loc (MSG_NOTE, vect_location,
7773 "statement is simple and uses invariant. Leaving in "
7774 "place.\n");
7775 return true;
7778 if (slp_node)
7779 ncopies = 1;
7780 else
7781 ncopies = vect_get_num_copies (loop_vinfo, vectype);
7783 if (slp_node)
7785 gcc_assert (slp_index >= 0);
7787 int num_scalar = SLP_TREE_SCALAR_STMTS (slp_node).length ();
7788 int num_vec = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node);
7790 /* Get the last occurrence of the scalar index from the concatenation of
7791 all the slp vectors. Calculate which slp vector it is and the index
7792 within. */
7793 poly_uint64 pos = (num_vec * nunits) - num_scalar + slp_index;
7795 /* Calculate which vector contains the result, and which lane of
7796 that vector we need. */
7797 if (!can_div_trunc_p (pos, nunits, &vec_entry, &vec_index))
7799 if (dump_enabled_p ())
7800 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7801 "Cannot determine which vector holds the"
7802 " final result.\n");
7803 return false;
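/* A small worked example of the calculation above (assumed numbers):
   with num_vec = 2 vectors of nunits = 4 elements, num_scalar = 3
   scalar statements and slp_index = 1, pos = 2 * 4 - 3 + 1 = 6, so
   the live value is lane vec_index = 2 of vector vec_entry = 1.  */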
7807 if (!vec_stmt)
7809 /* No transformation required. */
7810 if (LOOP_VINFO_CAN_FULLY_MASK_P (loop_vinfo))
7812 if (!direct_internal_fn_supported_p (IFN_EXTRACT_LAST, vectype,
7813 OPTIMIZE_FOR_SPEED))
7815 if (dump_enabled_p ())
7816 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7817 "can't use a fully-masked loop because "
7818 "the target doesn't support extract last "
7819 "reduction.\n");
7820 LOOP_VINFO_CAN_FULLY_MASK_P (loop_vinfo) = false;
7822 else if (slp_node)
7824 if (dump_enabled_p ())
7825 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7826 "can't use a fully-masked loop because an "
7827 "SLP statement is live after the loop.\n");
7828 LOOP_VINFO_CAN_FULLY_MASK_P (loop_vinfo) = false;
7830 else if (ncopies > 1)
7832 if (dump_enabled_p ())
7833 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7834 "can't use a fully-masked loop because"
7835 " ncopies is greater than 1.\n");
7836 LOOP_VINFO_CAN_FULLY_MASK_P (loop_vinfo) = false;
7838 else
7840 gcc_assert (ncopies == 1 && !slp_node);
7841 vect_record_loop_mask (loop_vinfo,
7842 &LOOP_VINFO_MASKS (loop_vinfo),
7843 1, vectype);
7846 return true;
7849 /* Use the lhs of the original scalar statement. */
7850 gimple *stmt = vect_orig_stmt (stmt_info)->stmt;
7852 lhs = (is_a <gphi *> (stmt)) ? gimple_phi_result (stmt)
7853 : gimple_get_lhs (stmt);
7854 lhs_type = TREE_TYPE (lhs);
7856 bitsize = (VECTOR_BOOLEAN_TYPE_P (vectype)
7857 ? bitsize_int (TYPE_PRECISION (TREE_TYPE (vectype)))
7858 : TYPE_SIZE (TREE_TYPE (vectype)));
7859 vec_bitsize = TYPE_SIZE (vectype);
7861 /* Get the vectorized lhs of STMT and the lane to use (counted in bits). */
7862 tree vec_lhs, bitstart;
7863 if (slp_node)
7865 gcc_assert (!LOOP_VINFO_FULLY_MASKED_P (loop_vinfo));
7867 /* Get the correct slp vectorized stmt. */
7868 gimple *vec_stmt = SLP_TREE_VEC_STMTS (slp_node)[vec_entry]->stmt;
7869 if (gphi *phi = dyn_cast <gphi *> (vec_stmt))
7870 vec_lhs = gimple_phi_result (phi);
7871 else
7872 vec_lhs = gimple_get_lhs (vec_stmt);
7874 /* Get entry to use. */
7875 bitstart = bitsize_int (vec_index);
7876 bitstart = int_const_binop (MULT_EXPR, bitsize, bitstart);
7878 else
7880 enum vect_def_type dt = STMT_VINFO_DEF_TYPE (stmt_info);
7881 vec_lhs = vect_get_vec_def_for_operand_1 (stmt_info, dt);
7882 gcc_checking_assert (ncopies == 1
7883 || !LOOP_VINFO_FULLY_MASKED_P (loop_vinfo));
7885 /* For multiple copies, get the last copy. */
7886 for (int i = 1; i < ncopies; ++i)
7887 vec_lhs = vect_get_vec_def_for_stmt_copy (loop_vinfo, vec_lhs);
7889 /* Get the last lane in the vector. */
7890 bitstart = int_const_binop (MINUS_EXPR, vec_bitsize, bitsize);
7893 gimple_seq stmts = NULL;
7894 tree new_tree;
7895 if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
7897 /* Emit:
7899 SCALAR_RES = EXTRACT_LAST <VEC_LHS, MASK>
7901 where VEC_LHS is the vectorized live-out result and MASK is
7902 the loop mask for the final iteration. */
7903 gcc_assert (ncopies == 1 && !slp_node);
7904 tree scalar_type = TREE_TYPE (STMT_VINFO_VECTYPE (stmt_info));
7905 tree mask = vect_get_loop_mask (gsi, &LOOP_VINFO_MASKS (loop_vinfo),
7906 1, vectype, 0);
7907 tree scalar_res = gimple_build (&stmts, CFN_EXTRACT_LAST,
7908 scalar_type, mask, vec_lhs);
7910 /* Convert the extracted vector element to the required scalar type. */
7911 new_tree = gimple_convert (&stmts, lhs_type, scalar_res);
7913 else
7915 tree bftype = TREE_TYPE (vectype);
7916 if (VECTOR_BOOLEAN_TYPE_P (vectype))
7917 bftype = build_nonstandard_integer_type (tree_to_uhwi (bitsize), 1);
7918 new_tree = build3 (BIT_FIELD_REF, bftype, vec_lhs, bitsize, bitstart);
7919 new_tree = force_gimple_operand (fold_convert (lhs_type, new_tree),
7920 &stmts, true, NULL_TREE);
7923 if (stmts)
7924 gsi_insert_seq_on_edge_immediate (single_exit (loop), stmts);
7926 /* Replace use of lhs with newly computed result. If the use stmt is a
7927 single arg PHI, just replace all uses of PHI result. It's necessary
7928 because lcssa PHI defining lhs may be before newly inserted stmt. */
7929 use_operand_p use_p;
7930 FOR_EACH_IMM_USE_STMT (use_stmt, imm_iter, lhs)
7931 if (!flow_bb_inside_loop_p (loop, gimple_bb (use_stmt))
7932 && !is_gimple_debug (use_stmt))
7934 if (gimple_code (use_stmt) == GIMPLE_PHI
7935 && gimple_phi_num_args (use_stmt) == 1)
7937 replace_uses_by (gimple_phi_result (use_stmt), new_tree);
7939 else
7941 FOR_EACH_IMM_USE_ON_STMT (use_p, imm_iter)
7942 SET_USE (use_p, new_tree);
7944 update_stmt (use_stmt);
7947 return true;
7950 /* Kill any debug uses outside LOOP of SSA names defined in STMT_INFO. */
7952 static void
7953 vect_loop_kill_debug_uses (struct loop *loop, stmt_vec_info stmt_info)
7955 ssa_op_iter op_iter;
7956 imm_use_iterator imm_iter;
7957 def_operand_p def_p;
7958 gimple *ustmt;
7960 FOR_EACH_PHI_OR_STMT_DEF (def_p, stmt_info->stmt, op_iter, SSA_OP_DEF)
7962 FOR_EACH_IMM_USE_STMT (ustmt, imm_iter, DEF_FROM_PTR (def_p))
7964 basic_block bb;
7966 if (!is_gimple_debug (ustmt))
7967 continue;
7969 bb = gimple_bb (ustmt);
7971 if (!flow_bb_inside_loop_p (loop, bb))
7973 if (gimple_debug_bind_p (ustmt))
7975 if (dump_enabled_p ())
7976 dump_printf_loc (MSG_NOTE, vect_location,
7977 "killing debug use\n");
7979 gimple_debug_bind_reset_value (ustmt);
7980 update_stmt (ustmt);
7982 else
7983 gcc_unreachable ();
7989 /* Given loop represented by LOOP_VINFO, return true if computation of
7990 LOOP_VINFO_NITERS (= LOOP_VINFO_NITERSM1 + 1) doesn't overflow, false
7991 otherwise. */
7993 static bool
7994 loop_niters_no_overflow (loop_vec_info loop_vinfo)
7996 /* Constant case. */
7997 if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
7999 tree cst_niters = LOOP_VINFO_NITERS (loop_vinfo);
8000 tree cst_nitersm1 = LOOP_VINFO_NITERSM1 (loop_vinfo);
8002 gcc_assert (TREE_CODE (cst_niters) == INTEGER_CST);
8003 gcc_assert (TREE_CODE (cst_nitersm1) == INTEGER_CST);
8004 if (wi::to_widest (cst_nitersm1) < wi::to_widest (cst_niters))
8005 return true;
8008 widest_int max;
8009 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
8010 /* Check the upper bound of loop niters. */
8011 if (get_max_loop_iterations (loop, &max))
8013 tree type = TREE_TYPE (LOOP_VINFO_NITERS (loop_vinfo));
8014 signop sgn = TYPE_SIGN (type);
8015 widest_int type_max = widest_int::from (wi::max_value (type), sgn);
8016 if (max < type_max)
8017 return true;
8019 return false;
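/* Example (illustrative): for a loop whose latch can execute up to
   UINT_MAX times with a 32-bit niters type, NITERS = NITERSM1 + 1
   wraps to zero, so the function returns false; if the latch count is
   provably smaller than the type's maximum it returns true.  */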
8022 /* Return a mask type with half the number of elements as TYPE. */
8024 tree
8025 vect_halve_mask_nunits (tree type)
8027 poly_uint64 nunits = exact_div (TYPE_VECTOR_SUBPARTS (type), 2);
8028 return build_truth_vector_type (nunits, current_vector_size);
8031 /* Return a mask type with twice as many elements as TYPE. */
8033 tree
8034 vect_double_mask_nunits (tree type)
8036 poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (type) * 2;
8037 return build_truth_vector_type (nunits, current_vector_size);
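/* For instance (illustrative only): halving a mask type with 8 + 8*x
   elements gives one with 4 + 4*x elements, and doubling that type
   gives the original back; both keep the current vector size.  */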
8040 /* Record that a fully-masked version of LOOP_VINFO would need MASKS to
8041 contain a sequence of NVECTORS masks that each control a vector of type
8042 VECTYPE. */
8044 void
8045 vect_record_loop_mask (loop_vec_info loop_vinfo, vec_loop_masks *masks,
8046 unsigned int nvectors, tree vectype)
8048 gcc_assert (nvectors != 0);
8049 if (masks->length () < nvectors)
8050 masks->safe_grow_cleared (nvectors);
8051 rgroup_masks *rgm = &(*masks)[nvectors - 1];
8052 /* The number of scalars per iteration and the number of vectors are
8053 both compile-time constants. */
8054 unsigned int nscalars_per_iter
8055 = exact_div (nvectors * TYPE_VECTOR_SUBPARTS (vectype),
8056 LOOP_VINFO_VECT_FACTOR (loop_vinfo)).to_constant ();
8057 if (rgm->max_nscalars_per_iter < nscalars_per_iter)
8059 rgm->max_nscalars_per_iter = nscalars_per_iter;
8060 rgm->mask_type = build_same_sized_truth_vector_type (vectype);
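/* A worked example with assumed numbers: for a vectorization factor
   of 16, a statement using two V8HI vectors per iteration records
   NVECTORS = 2 and gets 2 * 8 / 16 = 1 scalar per iteration, while a
   grouped access needing four such vectors records NVECTORS = 4 and
   bumps max_nscalars_per_iter of its rgroup to 4 * 8 / 16 = 2.  */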
8064 /* Given a complete set of masks MASKS, extract mask number INDEX
8065 for an rgroup that operates on NVECTORS vectors of type VECTYPE,
8066 where 0 <= INDEX < NVECTORS. Insert any set-up statements before GSI.
8068 See the comment above vec_loop_masks for more details about the mask
8069 arrangement. */
8071 tree
8072 vect_get_loop_mask (gimple_stmt_iterator *gsi, vec_loop_masks *masks,
8073 unsigned int nvectors, tree vectype, unsigned int index)
8075 rgroup_masks *rgm = &(*masks)[nvectors - 1];
8076 tree mask_type = rgm->mask_type;
8078 /* Populate the rgroup's mask array, if this is the first time we've
8079 used it. */
8080 if (rgm->masks.is_empty ())
8082 rgm->masks.safe_grow_cleared (nvectors);
8083 for (unsigned int i = 0; i < nvectors; ++i)
8085 tree mask = make_temp_ssa_name (mask_type, NULL, "loop_mask");
8086 /* Provide a dummy definition until the real one is available. */
8087 SSA_NAME_DEF_STMT (mask) = gimple_build_nop ();
8088 rgm->masks[i] = mask;
8092 tree mask = rgm->masks[index];
8093 if (maybe_ne (TYPE_VECTOR_SUBPARTS (mask_type),
8094 TYPE_VECTOR_SUBPARTS (vectype)))
8096 /* A loop mask for data type X can be reused for data type Y
8097 if X has N times more elements than Y and if Y's elements
8098 are N times bigger than X's. In this case each sequence
8099 of N elements in the loop mask will be all-zero or all-one.
8100 We can then view-convert the mask so that each sequence of
8101 N elements is replaced by a single element. */
8102 gcc_assert (multiple_p (TYPE_VECTOR_SUBPARTS (mask_type),
8103 TYPE_VECTOR_SUBPARTS (vectype)));
8104 gimple_seq seq = NULL;
8105 mask_type = build_same_sized_truth_vector_type (vectype);
8106 mask = gimple_build (&seq, VIEW_CONVERT_EXPR, mask_type, mask);
8107 if (seq)
8108 gsi_insert_seq_before (gsi, seq, GSI_SAME_STMT);
8110 return mask;
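/* Sketch of the reuse case above (illustrative types): a mask created
   for sixteen QImode elements can control eight HImode elements,
   because each consecutive pair of mask elements is known to be
   all-zero or all-one; the VIEW_CONVERT_EXPR collapses every such
   pair into one element of the eight-element mask type.  */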
8113 /* Scale profiling counters by estimation for LOOP which is vectorized
8114 by factor VF. */
8116 static void
8117 scale_profile_for_vect_loop (struct loop *loop, unsigned vf)
8119 edge preheader = loop_preheader_edge (loop);
8120 /* Reduce loop iterations by the vectorization factor. */
8121 gcov_type new_est_niter = niter_for_unrolled_loop (loop, vf);
8122 profile_count freq_h = loop->header->count, freq_e = preheader->count ();
8124 if (freq_h.nonzero_p ())
8126 profile_probability p;
8128 /* Avoid dropping loop body profile counter to 0 because of zero count
8129 in loop's preheader. */
8130 if (!(freq_e == profile_count::zero ()))
8131 freq_e = freq_e.force_nonzero ();
8132 p = freq_e.apply_scale (new_est_niter + 1, 1).probability_in (freq_h);
8133 scale_loop_frequencies (loop, p);
8136 edge exit_e = single_exit (loop);
8137 exit_e->probability = profile_probability::always ()
8138 .apply_scale (1, new_est_niter + 1);
8140 edge exit_l = single_pred_edge (loop->latch);
8141 profile_probability prob = exit_l->probability;
8142 exit_l->probability = exit_e->probability.invert ();
8143 if (prob.initialized_p () && exit_l->probability.initialized_p ())
8144 scale_bbs_frequencies (&loop->latch, 1, exit_l->probability / prob);
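/* Rough example (assumed counts): if the scalar loop was expected to
   iterate about 100 times and is vectorized by a factor of 4,
   new_est_niter is about 25, the exit edge gets probability
   1/(new_est_niter + 1) and the body counts are scaled to match the
   reduced trip count.  */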
8147 /* Vectorize STMT_INFO if relevant, inserting any new instructions before GSI.
8148 When vectorizing STMT_INFO as a store, set *SEEN_STORE to its
8149 stmt_vec_info. */
8151 static void
8152 vect_transform_loop_stmt (loop_vec_info loop_vinfo, stmt_vec_info stmt_info,
8153 gimple_stmt_iterator *gsi, stmt_vec_info *seen_store)
8155 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
8156 poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
8158 if (dump_enabled_p ())
8159 dump_printf_loc (MSG_NOTE, vect_location,
8160 "------>vectorizing statement: %G", stmt_info->stmt);
8162 if (MAY_HAVE_DEBUG_BIND_STMTS && !STMT_VINFO_LIVE_P (stmt_info))
8163 vect_loop_kill_debug_uses (loop, stmt_info);
8165 if (!STMT_VINFO_RELEVANT_P (stmt_info)
8166 && !STMT_VINFO_LIVE_P (stmt_info))
8167 return;
8169 if (STMT_VINFO_VECTYPE (stmt_info))
8171 poly_uint64 nunits
8172 = TYPE_VECTOR_SUBPARTS (STMT_VINFO_VECTYPE (stmt_info));
8173 if (!STMT_SLP_TYPE (stmt_info)
8174 && maybe_ne (nunits, vf)
8175 && dump_enabled_p ())
8176 /* For SLP VF is set according to unrolling factor, and not
8177 to vector size, hence for SLP this print is not valid. */
8178 dump_printf_loc (MSG_NOTE, vect_location, "multiple-types.\n");
8181 /* Pure SLP statements have already been vectorized. We still need
8182 to apply loop vectorization to hybrid SLP statements. */
8183 if (PURE_SLP_STMT (stmt_info))
8184 return;
8186 if (dump_enabled_p ())
8187 dump_printf_loc (MSG_NOTE, vect_location, "transform statement.\n");
8189 if (vect_transform_stmt (stmt_info, gsi, NULL, NULL))
8190 *seen_store = stmt_info;
8193 /* Function vect_transform_loop.
8195 The analysis phase has determined that the loop is vectorizable.
8196 Vectorize the loop - created vectorized stmts to replace the scalar
8197 stmts in the loop, and update the loop exit condition.
8198 Returns scalar epilogue loop if any. */
8200 struct loop *
8201 vect_transform_loop (loop_vec_info loop_vinfo)
8203 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
8204 struct loop *epilogue = NULL;
8205 basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
8206 int nbbs = loop->num_nodes;
8207 int i;
8208 tree niters_vector = NULL_TREE;
8209 tree step_vector = NULL_TREE;
8210 tree niters_vector_mult_vf = NULL_TREE;
8211 poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
8212 unsigned int lowest_vf = constant_lower_bound (vf);
8213 gimple *stmt;
8214 bool check_profitability = false;
8215 unsigned int th;
8217 DUMP_VECT_SCOPE ("vec_transform_loop");
8219 loop_vinfo->shared->check_datarefs ();
8221 /* Use the more conservative vectorization threshold. If the number
8222 of iterations is constant assume the cost check has been performed
8223 by our caller. If the threshold makes all loops profitable that
8224 run at least the (estimated) vectorization factor number of times
8225 checking is pointless, too. */
8226 th = LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo);
8227 if (th >= vect_vf_for_cost (loop_vinfo)
8228 && !LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
8230 if (dump_enabled_p ())
8231 dump_printf_loc (MSG_NOTE, vect_location,
8232 "Profitability threshold is %d loop iterations.\n",
8233 th);
8234 check_profitability = true;
8237 /* Make sure there exists a single-predecessor exit bb. Do this before
8238 versioning. */
8239 edge e = single_exit (loop);
8240 if (! single_pred_p (e->dest))
8242 split_loop_exit_edge (e, true);
8243 if (dump_enabled_p ())
8244 dump_printf (MSG_NOTE, "split exit edge\n");
8247 /* Version the loop first, if required, so the profitability check
8248 comes first. */
8250 if (LOOP_REQUIRES_VERSIONING (loop_vinfo))
8252 poly_uint64 versioning_threshold
8253 = LOOP_VINFO_VERSIONING_THRESHOLD (loop_vinfo);
8254 if (check_profitability
8255 && ordered_p (poly_uint64 (th), versioning_threshold))
8257 versioning_threshold = ordered_max (poly_uint64 (th),
8258 versioning_threshold);
8259 check_profitability = false;
8261 vect_loop_versioning (loop_vinfo, th, check_profitability,
8262 versioning_threshold);
8263 check_profitability = false;
8266 /* Make sure there exists a single-predecessor exit bb also on the
8267 scalar loop copy. Do this after versioning but before peeling
8268 so CFG structure is fine for both scalar and if-converted loop
8269 to make slpeel_duplicate_current_defs_from_edges face matched
8270 loop closed PHI nodes on the exit. */
8271 if (LOOP_VINFO_SCALAR_LOOP (loop_vinfo))
8273 e = single_exit (LOOP_VINFO_SCALAR_LOOP (loop_vinfo));
8274 if (! single_pred_p (e->dest))
8276 split_loop_exit_edge (e, true);
8277 if (dump_enabled_p ())
8278 dump_printf (MSG_NOTE, "split exit edge of scalar loop\n");
8282 tree niters = vect_build_loop_niters (loop_vinfo);
8283 LOOP_VINFO_NITERS_UNCHANGED (loop_vinfo) = niters;
8284 tree nitersm1 = unshare_expr (LOOP_VINFO_NITERSM1 (loop_vinfo));
8285 bool niters_no_overflow = loop_niters_no_overflow (loop_vinfo);
8286 epilogue = vect_do_peeling (loop_vinfo, niters, nitersm1, &niters_vector,
8287 &step_vector, &niters_vector_mult_vf, th,
8288 check_profitability, niters_no_overflow);
8290 if (niters_vector == NULL_TREE)
8292 if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
8293 && !LOOP_VINFO_FULLY_MASKED_P (loop_vinfo)
8294 && known_eq (lowest_vf, vf))
8296 niters_vector
8297 = build_int_cst (TREE_TYPE (LOOP_VINFO_NITERS (loop_vinfo)),
8298 LOOP_VINFO_INT_NITERS (loop_vinfo) / lowest_vf);
8299 step_vector = build_one_cst (TREE_TYPE (niters));
8301 else
8302 vect_gen_vector_loop_niters (loop_vinfo, niters, &niters_vector,
8303 &step_vector, niters_no_overflow);
8306 /* 1) Make sure the loop header has exactly two entries
8307 2) Make sure we have a preheader basic block. */
8309 gcc_assert (EDGE_COUNT (loop->header->preds) == 2);
8311 split_edge (loop_preheader_edge (loop));
8313 if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo)
8314 && vect_use_loop_mask_for_alignment_p (loop_vinfo))
8315 /* This will deal with any possible peeling. */
8316 vect_prepare_for_masked_peels (loop_vinfo);
8318 /* Schedule the SLP instances first, then handle loop vectorization
8319 below. */
8320 if (!loop_vinfo->slp_instances.is_empty ())
8322 DUMP_VECT_SCOPE ("scheduling SLP instances");
8323 vect_schedule_slp (loop_vinfo);
8326 /* FORNOW: the vectorizer supports only loops whose body consists
8327 of one basic block (header + empty latch). When the vectorizer
8328 supports more involved loop forms, the order in which the BBs are
8329 traversed will need to be reconsidered. */
8331 for (i = 0; i < nbbs; i++)
8333 basic_block bb = bbs[i];
8334 stmt_vec_info stmt_info;
8336 for (gphi_iterator si = gsi_start_phis (bb); !gsi_end_p (si);
8337 gsi_next (&si))
8339 gphi *phi = si.phi ();
8340 if (dump_enabled_p ())
8341 dump_printf_loc (MSG_NOTE, vect_location,
8342 "------>vectorizing phi: %G", phi);
8343 stmt_info = loop_vinfo->lookup_stmt (phi);
8344 if (!stmt_info)
8345 continue;
8347 if (MAY_HAVE_DEBUG_BIND_STMTS && !STMT_VINFO_LIVE_P (stmt_info))
8348 vect_loop_kill_debug_uses (loop, stmt_info);
8350 if (!STMT_VINFO_RELEVANT_P (stmt_info)
8351 && !STMT_VINFO_LIVE_P (stmt_info))
8352 continue;
8354 if (STMT_VINFO_VECTYPE (stmt_info)
8355 && (maybe_ne
8356 (TYPE_VECTOR_SUBPARTS (STMT_VINFO_VECTYPE (stmt_info)), vf))
8357 && dump_enabled_p ())
8358 dump_printf_loc (MSG_NOTE, vect_location, "multiple-types.\n");
8360 if ((STMT_VINFO_DEF_TYPE (stmt_info) == vect_induction_def
8361 || STMT_VINFO_DEF_TYPE (stmt_info) == vect_reduction_def
8362 || STMT_VINFO_DEF_TYPE (stmt_info) == vect_nested_cycle)
8363 && ! PURE_SLP_STMT (stmt_info))
8365 if (dump_enabled_p ())
8366 dump_printf_loc (MSG_NOTE, vect_location, "transform phi.\n");
8367 vect_transform_stmt (stmt_info, NULL, NULL, NULL);
8371 for (gimple_stmt_iterator si = gsi_start_bb (bb);
8372 !gsi_end_p (si);)
8374 stmt = gsi_stmt (si);
8375 /* During vectorization remove existing clobber stmts. */
8376 if (gimple_clobber_p (stmt))
8378 unlink_stmt_vdef (stmt);
8379 gsi_remove (&si, true);
8380 release_defs (stmt);
8382 else
8384 stmt_info = loop_vinfo->lookup_stmt (stmt);
8386 /* vector stmts created in the outer-loop during vectorization of
8387 stmts in an inner-loop may not have a stmt_info, and do not
8388 need to be vectorized. */
8389 stmt_vec_info seen_store = NULL;
8390 if (stmt_info)
8392 if (STMT_VINFO_IN_PATTERN_P (stmt_info))
8394 gimple *def_seq = STMT_VINFO_PATTERN_DEF_SEQ (stmt_info);
8395 for (gimple_stmt_iterator subsi = gsi_start (def_seq);
8396 !gsi_end_p (subsi); gsi_next (&subsi))
8398 stmt_vec_info pat_stmt_info
8399 = loop_vinfo->lookup_stmt (gsi_stmt (subsi));
8400 vect_transform_loop_stmt (loop_vinfo, pat_stmt_info,
8401 &si, &seen_store);
8403 stmt_vec_info pat_stmt_info
8404 = STMT_VINFO_RELATED_STMT (stmt_info);
8405 vect_transform_loop_stmt (loop_vinfo, pat_stmt_info, &si,
8406 &seen_store);
8408 vect_transform_loop_stmt (loop_vinfo, stmt_info, &si,
8409 &seen_store);
8411 gsi_next (&si);
8412 if (seen_store)
8414 if (STMT_VINFO_GROUPED_ACCESS (seen_store))
8415 /* Interleaving. The vectorization of the
8416 interleaving chain was completed -
8417 free all the stores in the chain. */
8418 vect_remove_stores (DR_GROUP_FIRST_ELEMENT (seen_store));
8419 else
8420 /* Free the attached stmt_vec_info and remove the stmt. */
8421 loop_vinfo->remove_stmt (stmt_info);
8426 /* Stub out scalar statements that must not survive vectorization.
8427 Doing this here helps with grouped statements, or statements that
8428 are involved in patterns. */
8429 for (gimple_stmt_iterator gsi = gsi_start_bb (bb);
8430 !gsi_end_p (gsi); gsi_next (&gsi))
8432 gcall *call = dyn_cast <gcall *> (gsi_stmt (gsi));
8433 if (call && gimple_call_internal_p (call, IFN_MASK_LOAD))
8435 tree lhs = gimple_get_lhs (call);
8436 if (!VECTOR_TYPE_P (TREE_TYPE (lhs)))
8438 tree zero = build_zero_cst (TREE_TYPE (lhs));
8439 gimple *new_stmt = gimple_build_assign (lhs, zero);
8440 gsi_replace (&gsi, new_stmt, true);
8444 } /* BBs in loop */
8446 /* The vectorization factor is always > 1, so if we use an IV increment of 1,
8447 a zero NITERS becomes a nonzero NITERS_VECTOR. */
8448 if (integer_onep (step_vector))
8449 niters_no_overflow = true;
8450 vect_set_loop_condition (loop, loop_vinfo, niters_vector, step_vector,
8451 niters_vector_mult_vf, !niters_no_overflow);
8453 unsigned int assumed_vf = vect_vf_for_cost (loop_vinfo);
8454 scale_profile_for_vect_loop (loop, assumed_vf);
8456 /* True if the final iteration might not handle a full vector's
8457 worth of scalar iterations. */
8458 bool final_iter_may_be_partial = LOOP_VINFO_FULLY_MASKED_P (loop_vinfo);
8459 /* The minimum number of iterations performed by the epilogue. This
8460 is 1 when peeling for gaps because we always need a final scalar
8461 iteration. */
8462 int min_epilogue_iters = LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo) ? 1 : 0;
8463 /* +1 to convert latch counts to loop iteration counts,
8464 -min_epilogue_iters to remove iterations that cannot be performed
8465 by the vector code. */
8466 int bias_for_lowest = 1 - min_epilogue_iters;
8467 int bias_for_assumed = bias_for_lowest;
8468 int alignment_npeels = LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo);
8469 if (alignment_npeels && LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
8471 /* When the amount of peeling is known at compile time, the first
8472 iteration will have exactly alignment_npeels active elements.
8473 In the worst case it will have at least one. */
8474 int min_first_active = (alignment_npeels > 0 ? alignment_npeels : 1);
8475 bias_for_lowest += lowest_vf - min_first_active;
8476 bias_for_assumed += assumed_vf - min_first_active;
8478 /* In these calculations the "- 1" converts loop iteration counts
8479 back to latch counts. */
8480 if (loop->any_upper_bound)
8481 loop->nb_iterations_upper_bound
8482 = (final_iter_may_be_partial
8483 ? wi::udiv_ceil (loop->nb_iterations_upper_bound + bias_for_lowest,
8484 lowest_vf) - 1
8485 : wi::udiv_floor (loop->nb_iterations_upper_bound + bias_for_lowest,
8486 lowest_vf) - 1);
8487 if (loop->any_likely_upper_bound)
8488 loop->nb_iterations_likely_upper_bound
8489 = (final_iter_may_be_partial
8490 ? wi::udiv_ceil (loop->nb_iterations_likely_upper_bound
8491 + bias_for_lowest, lowest_vf) - 1
8492 : wi::udiv_floor (loop->nb_iterations_likely_upper_bound
8493 + bias_for_lowest, lowest_vf) - 1);
8494 if (loop->any_estimate)
8495 loop->nb_iterations_estimate
8496 = (final_iter_may_be_partial
8497 ? wi::udiv_ceil (loop->nb_iterations_estimate + bias_for_assumed,
8498 assumed_vf) - 1
8499 : wi::udiv_floor (loop->nb_iterations_estimate + bias_for_assumed,
8500 assumed_vf) - 1);
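/* A worked instance of the adjustment above (assumed numbers): with a
   latch upper bound of 99 (100 iterations), lowest_vf = 4, no peeling
   for gaps and no full masking, bias_for_lowest is 1 and the new
   bound is floor ((99 + 1) / 4) - 1 = 24 latch executions of the
   vector loop.  */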
8502 if (dump_enabled_p ())
8504 if (!LOOP_VINFO_EPILOGUE_P (loop_vinfo))
8506 dump_printf_loc (MSG_NOTE, vect_location,
8507 "LOOP VECTORIZED\n");
8508 if (loop->inner)
8509 dump_printf_loc (MSG_NOTE, vect_location,
8510 "OUTER LOOP VECTORIZED\n");
8511 dump_printf (MSG_NOTE, "\n");
8513 else
8515 dump_printf_loc (MSG_NOTE, vect_location,
8516 "LOOP EPILOGUE VECTORIZED (VS=");
8517 dump_dec (MSG_NOTE, current_vector_size);
8518 dump_printf (MSG_NOTE, ")\n");
8522 /* Free SLP instances here because otherwise stmt reference counting
8523 won't work. */
8524 slp_instance instance;
8525 FOR_EACH_VEC_ELT (LOOP_VINFO_SLP_INSTANCES (loop_vinfo), i, instance)
8526 vect_free_slp_instance (instance, true);
8527 LOOP_VINFO_SLP_INSTANCES (loop_vinfo).release ();
8528 /* Clear the safelen field since its value is invalid after vectorization,
8529 because the vectorized loop can have loop-carried dependencies. */
8530 loop->safelen = 0;
8532 /* Don't vectorize epilogue for epilogue. */
8533 if (LOOP_VINFO_EPILOGUE_P (loop_vinfo))
8534 epilogue = NULL;
8536 if (!PARAM_VALUE (PARAM_VECT_EPILOGUES_NOMASK))
8537 epilogue = NULL;
8539 if (epilogue)
8541 auto_vector_sizes vector_sizes;
8542 targetm.vectorize.autovectorize_vector_sizes (&vector_sizes);
8543 unsigned int next_size = 0;
8545 if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
8546 && LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) >= 0
8547 && known_eq (vf, lowest_vf))
8549 unsigned int eiters
8550 = (LOOP_VINFO_INT_NITERS (loop_vinfo)
8551 - LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo));
8552 eiters = eiters % lowest_vf;
8553 epilogue->nb_iterations_upper_bound = eiters - 1;
8555 unsigned int ratio;
8556 while (next_size < vector_sizes.length ()
8557 && !(constant_multiple_p (current_vector_size,
8558 vector_sizes[next_size], &ratio)
8559 && eiters >= lowest_vf / ratio))
8560 next_size += 1;
8562 else
8563 while (next_size < vector_sizes.length ()
8564 && maybe_lt (current_vector_size, vector_sizes[next_size]))
8565 next_size += 1;
8567 if (next_size == vector_sizes.length ())
8568 epilogue = NULL;
8571 if (epilogue)
8573 epilogue->force_vectorize = loop->force_vectorize;
8574 epilogue->safelen = loop->safelen;
8575 epilogue->dont_vectorize = false;
8577 /* We may need to if-convert epilogue to vectorize it. */
8578 if (LOOP_VINFO_SCALAR_LOOP (loop_vinfo))
8579 tree_if_conversion (epilogue);
8582 return epilogue;
8585 /* The code below tries to perform a simple optimization - reverting
8586 if-conversion for masked stores: if the mask of a store is all zero,
8587 skip the store and, if possible, the producers of the stored values too.
8588 For example,
8589 for (i=0; i<n; i++)
8590 if (c[i])
8592 p1[i] += 1;
8593 p2[i] = p3[i] +2;
8595 this transformation will produce the following semi-hammock:
8597 if (!mask__ifc__42.18_165 == { 0, 0, 0, 0, 0, 0, 0, 0 })
8599 vect__11.19_170 = MASK_LOAD (vectp_p1.20_168, 0B, mask__ifc__42.18_165);
8600 vect__12.22_172 = vect__11.19_170 + vect_cst__171;
8601 MASK_STORE (vectp_p1.23_175, 0B, mask__ifc__42.18_165, vect__12.22_172);
8602 vect__18.25_182 = MASK_LOAD (vectp_p3.26_180, 0B, mask__ifc__42.18_165);
8603 vect__19.28_184 = vect__18.25_182 + vect_cst__183;
8604 MASK_STORE (vectp_p2.29_187, 0B, mask__ifc__42.18_165, vect__19.28_184);
8608 void
8609 optimize_mask_stores (struct loop *loop)
8611 basic_block *bbs = get_loop_body (loop);
8612 unsigned nbbs = loop->num_nodes;
8613 unsigned i;
8614 basic_block bb;
8615 struct loop *bb_loop;
8616 gimple_stmt_iterator gsi;
8617 gimple *stmt;
8618 auto_vec<gimple *> worklist;
8620 vect_location = find_loop_location (loop);
8621 /* Pick up all masked stores in loop if any. */
8622 for (i = 0; i < nbbs; i++)
8624 bb = bbs[i];
8625 for (gsi = gsi_start_bb (bb); !gsi_end_p (gsi);
8626 gsi_next (&gsi))
8628 stmt = gsi_stmt (gsi);
8629 if (gimple_call_internal_p (stmt, IFN_MASK_STORE))
8630 worklist.safe_push (stmt);
8634 free (bbs);
8635 if (worklist.is_empty ())
8636 return;
8638 /* Loop has masked stores. */
8639 while (!worklist.is_empty ())
8641 gimple *last, *last_store;
8642 edge e, efalse;
8643 tree mask;
8644 basic_block store_bb, join_bb;
8645 gimple_stmt_iterator gsi_to;
8646 tree vdef, new_vdef;
8647 gphi *phi;
8648 tree vectype;
8649 tree zero;
8651 last = worklist.pop ();
8652 mask = gimple_call_arg (last, 2);
8653 bb = gimple_bb (last);
8654 /* Create then_bb and an if-then structure in the CFG; then_bb belongs
8655 to the same loop as if_bb. It could be different from LOOP when a
8656 two-level loop nest is vectorized and the mask_store belongs to the
8657 inner one. */
      e = split_block (bb, last);
      bb_loop = bb->loop_father;
      gcc_assert (loop == bb_loop || flow_loop_nested_p (loop, bb_loop));
      join_bb = e->dest;
      store_bb = create_empty_bb (bb);
      add_bb_to_loop (store_bb, bb_loop);
      e->flags = EDGE_TRUE_VALUE;
      efalse = make_edge (bb, store_bb, EDGE_FALSE_VALUE);
      /* Put STORE_BB to likely part.  */
      efalse->probability = profile_probability::unlikely ();
      store_bb->count = efalse->count ();
      make_single_succ_edge (store_bb, join_bb, EDGE_FALLTHRU);
      if (dom_info_available_p (CDI_DOMINATORS))
        set_immediate_dominator (CDI_DOMINATORS, store_bb, bb);
      if (dump_enabled_p ())
        dump_printf_loc (MSG_NOTE, vect_location,
                         "Create new block %d to sink mask stores.",
                         store_bb->index);
      /* Create vector comparison with boolean result.  */
      vectype = TREE_TYPE (mask);
      zero = build_zero_cst (vectype);
      stmt = gimple_build_cond (EQ_EXPR, mask, zero, NULL_TREE, NULL_TREE);
      gsi = gsi_last_bb (bb);
      gsi_insert_after (&gsi, stmt, GSI_SAME_STMT);
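      /* The comparison is true only when every element of MASK is zero, so
         the TRUE edge bypasses STORE_BB and goes straight to JOIN_BB.  */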
      /* Create new PHI node for vdef of the last masked store:
         .MEM_2 = VDEF <.MEM_1>
         will be converted to
         .MEM_3 = VDEF <.MEM_1>
         and new PHI node will be created in join bb
         .MEM_2 = PHI <.MEM_1, .MEM_3>
      */
      vdef = gimple_vdef (last);
      new_vdef = make_ssa_name (gimple_vop (cfun), last);
      gimple_set_vdef (last, new_vdef);
      phi = create_phi_node (vdef, join_bb);
      add_phi_arg (phi, new_vdef, EDGE_SUCC (store_bb, 0), UNKNOWN_LOCATION);
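      /* Memory uses dominated by JOIN_BB keep referring to the original
         virtual operand, which is now defined by the PHI, while the last
         sunk store defines the new name inside STORE_BB.  */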
      /* Put all masked stores with the same mask to STORE_BB if possible.  */
      while (true)
        {
          gimple_stmt_iterator gsi_from;
          gimple *stmt1 = NULL;

          /* Move masked store to STORE_BB.  */
          last_store = last;
          gsi = gsi_for_stmt (last);
          gsi_from = gsi;
          /* Shift GSI to the previous stmt for further traversal.  */
          gsi_prev (&gsi);
          gsi_to = gsi_start_bb (store_bb);
          gsi_move_before (&gsi_from, &gsi_to);
          /* Setup GSI_TO to the non-empty block start.  */
          gsi_to = gsi_start_bb (store_bb);
          if (dump_enabled_p ())
            dump_printf_loc (MSG_NOTE, vect_location,
                             "Move stmt to created bb\n%G", last);
          /* Move all stored value producers if possible.  */
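          /* Walk backwards from the store: a producer can be sunk only if
             it has no memory side effects, its LHS is a vector SSA name
             used nowhere but in STORE_BB, and it reads from the same
             memory state as the store itself.  */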
          while (!gsi_end_p (gsi))
            {
              tree lhs;
              imm_use_iterator imm_iter;
              use_operand_p use_p;
              bool res;

              /* Skip debug statements.  */
              if (is_gimple_debug (gsi_stmt (gsi)))
                {
                  gsi_prev (&gsi);
                  continue;
                }
              stmt1 = gsi_stmt (gsi);
              /* Do not consider statements writing to memory or having
                 volatile operand.  */
              if (gimple_vdef (stmt1)
                  || gimple_has_volatile_ops (stmt1))
                break;
              gsi_from = gsi;
              gsi_prev (&gsi);
              lhs = gimple_get_lhs (stmt1);
              if (!lhs)
                break;

              /* LHS of vectorized stmt must be SSA_NAME.  */
              if (TREE_CODE (lhs) != SSA_NAME)
                break;
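              /* Non-vector statements cannot be sunk into STORE_BB; remove
                 them if they are dead, otherwise stop the walk.  */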
              if (!VECTOR_TYPE_P (TREE_TYPE (lhs)))
                {
                  /* Remove dead scalar statement.  */
                  if (has_zero_uses (lhs))
                    {
                      gsi_remove (&gsi_from, true);
                      continue;
                    }
                  break;
                }
              /* Check that LHS does not have uses outside of STORE_BB.  */
              res = true;
              FOR_EACH_IMM_USE_FAST (use_p, imm_iter, lhs)
                {
                  gimple *use_stmt;
                  use_stmt = USE_STMT (use_p);
                  if (is_gimple_debug (use_stmt))
                    continue;
                  if (gimple_bb (use_stmt) != store_bb)
                    {
                      res = false;
                      break;
                    }
                }
              if (!res)
                break;
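              /* Statements that read memory are sunk only if they use the
                 same memory state (VUSE) as the masked store itself.  */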
              if (gimple_vuse (stmt1)
                  && gimple_vuse (stmt1) != gimple_vuse (last_store))
                break;

              /* Can move STMT1 to STORE_BB.  */
              if (dump_enabled_p ())
                dump_printf_loc (MSG_NOTE, vect_location,
                                 "Move stmt to created bb\n%G", stmt1);
              gsi_move_before (&gsi_from, &gsi_to);
              /* Shift GSI_TO for further insertion.  */
              gsi_prev (&gsi_to);
            }
          /* Put other masked stores with the same mask to STORE_BB.  */
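          /* This is only done when the next store on the worklist uses the
             same mask and is exactly the statement at which the backward
             walk stopped, so it can share the same STORE_BB.  */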
          if (worklist.is_empty ()
              || gimple_call_arg (worklist.last (), 2) != mask
              || worklist.last () != stmt1)
            break;
          last = worklist.pop ();
        }
      add_phi_arg (phi, gimple_vuse (last_store), e, UNKNOWN_LOCATION);
    }
}