1 /* Loop Vectorization
2 Copyright (C) 2003-2018 Free Software Foundation, Inc.
3 Contributed by Dorit Naishlos <dorit@il.ibm.com> and
4 Ira Rosen <irar@il.ibm.com>
6 This file is part of GCC.
8 GCC is free software; you can redistribute it and/or modify it under
9 the terms of the GNU General Public License as published by the Free
10 Software Foundation; either version 3, or (at your option) any later
11 version.
13 GCC is distributed in the hope that it will be useful, but WITHOUT ANY
14 WARRANTY; without even the implied warranty of MERCHANTABILITY or
15 FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
16 for more details.
18 You should have received a copy of the GNU General Public License
19 along with GCC; see the file COPYING3. If not see
20 <http://www.gnu.org/licenses/>. */
22 #include "config.h"
23 #include "system.h"
24 #include "coretypes.h"
25 #include "backend.h"
26 #include "target.h"
27 #include "rtl.h"
28 #include "tree.h"
29 #include "gimple.h"
30 #include "cfghooks.h"
31 #include "tree-pass.h"
32 #include "ssa.h"
33 #include "optabs-tree.h"
34 #include "diagnostic-core.h"
35 #include "fold-const.h"
36 #include "stor-layout.h"
37 #include "cfganal.h"
38 #include "gimplify.h"
39 #include "gimple-iterator.h"
40 #include "gimplify-me.h"
41 #include "tree-ssa-loop-ivopts.h"
42 #include "tree-ssa-loop-manip.h"
43 #include "tree-ssa-loop-niter.h"
44 #include "tree-ssa-loop.h"
45 #include "cfgloop.h"
46 #include "params.h"
47 #include "tree-scalar-evolution.h"
48 #include "tree-vectorizer.h"
49 #include "gimple-fold.h"
50 #include "cgraph.h"
51 #include "tree-cfg.h"
52 #include "tree-if-conv.h"
53 #include "internal-fn.h"
54 #include "tree-vector-builder.h"
55 #include "vec-perm-indices.h"
56 #include "tree-eh.h"
58 /* Loop Vectorization Pass.
60 This pass tries to vectorize loops.
62 For example, the vectorizer transforms the following simple loop:
64 short a[N]; short b[N]; short c[N]; int i;
66 for (i=0; i<N; i++){
67 a[i] = b[i] + c[i];
70 as if it had been manually vectorized by rewriting the source code into:
72 typedef int __attribute__((mode(V8HI))) v8hi;
73 short a[N]; short b[N]; short c[N]; int i;
74 v8hi *pa = (v8hi*)a, *pb = (v8hi*)b, *pc = (v8hi*)c;
75 v8hi va, vb, vc;
77 for (i=0; i<N/8; i++){
78 vb = pb[i];
79 vc = pc[i];
80 va = vb + vc;
81 pa[i] = va;
84 The main entry to this pass is vectorize_loops(), in which
85 the vectorizer applies a set of analyses on a given set of loops,
86 followed by the actual vectorization transformation for the loops that
87 had successfully passed the analysis phase.
88 Throughout this pass we make a distinction between two types of
89 data: scalars (which are represented by SSA_NAMES), and memory references
90 ("data-refs"). These two types of data require different handling both
91 during analysis and transformation. The types of data-refs that the
92 vectorizer currently supports are ARRAY_REFS whose base is an array DECL
93 (not a pointer), and INDIRECT_REFS through pointers; both array and pointer
94 accesses are required to have a simple (consecutive) access pattern.
96 Analysis phase:
97 ===============
98 The driver for the analysis phase is vect_analyze_loop().
99 It applies a set of analyses, some of which rely on the scalar evolution
100 analyzer (scev) developed by Sebastian Pop.
102 During the analysis phase the vectorizer records some information
103 per stmt in a "stmt_vec_info" struct which is attached to each stmt in the
104 loop, as well as general information about the loop as a whole, which is
105 recorded in a "loop_vec_info" struct attached to each loop.
107 Transformation phase:
108 =====================
109 The loop transformation phase scans all the stmts in the loop, and
110 creates a vector stmt (or a sequence of stmts) for each scalar stmt S in
111 the loop that needs to be vectorized. It inserts the vector code sequence
112 just before the scalar stmt S, and records a pointer to the vector code
113 in STMT_VINFO_VEC_STMT (stmt_info) (stmt_info is the stmt_vec_info struct
114 attached to S). This pointer will be used for the vectorization of following
115 stmts which use the def of stmt S. Stmt S is removed if it writes to memory;
116 otherwise, we rely on dead code elimination for removing it.
118 For example, say stmt S1 was vectorized into stmt VS1:
120 VS1: vb = px[i];
121 S1: b = x[i]; STMT_VINFO_VEC_STMT (stmt_info (S1)) = VS1
122 S2: a = b;
124 To vectorize stmt S2, the vectorizer first finds the stmt that defines
125 the operand 'b' (S1), and gets the relevant vector def 'vb' from the
126 vector stmt VS1 pointed to by STMT_VINFO_VEC_STMT (stmt_info (S1)). The
127 resulting sequence would be:
129 VS1: vb = px[i];
130 S1: b = x[i]; STMT_VINFO_VEC_STMT (stmt_info (S1)) = VS1
131 VS2: va = vb;
132 S2: a = b; STMT_VINFO_VEC_STMT (stmt_info (S2)) = VS2
134 Operands that are not SSA_NAMEs are data-refs that appear in
135 load/store operations (like 'x[i]' in S1), and are handled differently.
137 Target modeling:
138 =================
139 Currently the only target specific information that is used is the
140 size of the vector (in bytes) - "TARGET_VECTORIZE_UNITS_PER_SIMD_WORD".
141 Targets that can support different sizes of vectors will, for now, need
142 to specify one value for "TARGET_VECTORIZE_UNITS_PER_SIMD_WORD". More
143 flexibility will be added in the future.
145 Since we only vectorize operations whose vector form can be
146 expressed using existing tree codes, to verify that an operation is
147 supported, the vectorizer checks the relevant optab at the relevant
148 machine_mode (e.g., optab_handler (add_optab, V8HImode)). If
149 the value found is CODE_FOR_nothing, then there's no target support, and
150 we can't vectorize the stmt.
152 For additional information on this project see:
153 http://gcc.gnu.org/projects/tree-ssa/vectorization.html
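/* As a rough illustration of the support check described above (a sketch
   only; the tree code, type and mode below are merely examples, not a fixed
   part of this pass), querying vector addition support for a vector type
   VECTYPE might look like:

     optab op = optab_for_tree_code (PLUS_EXPR, vectype, optab_default);
     bool supported
       = op && optab_handler (op, TYPE_MODE (vectype)) != CODE_FOR_nothing;

   If SUPPORTED is false there is no target support and the stmt cannot
   be vectorized.  */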
156 static void vect_estimate_min_profitable_iters (loop_vec_info, int *, int *);
158 /* Subroutine of vect_determine_vf_for_stmt that handles only one
159 statement. VECTYPE_MAYBE_SET_P is true if STMT_VINFO_VECTYPE
160 may already be set for general statements (not just data refs). */
162 static opt_result
163 vect_determine_vf_for_stmt_1 (stmt_vec_info stmt_info,
164 bool vectype_maybe_set_p,
165 poly_uint64 *vf,
166 vec<stmt_vec_info > *mask_producers)
168 gimple *stmt = stmt_info->stmt;
170 if ((!STMT_VINFO_RELEVANT_P (stmt_info)
171 && !STMT_VINFO_LIVE_P (stmt_info))
172 || gimple_clobber_p (stmt))
174 if (dump_enabled_p ())
175 dump_printf_loc (MSG_NOTE, vect_location, "skip.\n");
176 return opt_result::success ();
179 tree stmt_vectype, nunits_vectype;
180 opt_result res = vect_get_vector_types_for_stmt (stmt_info, &stmt_vectype,
181 &nunits_vectype);
182 if (!res)
183 return res;
185 if (stmt_vectype)
187 if (STMT_VINFO_VECTYPE (stmt_info))
188 /* The only case when a vectype had been already set is for stmts
189 that contain a data ref, or for "pattern-stmts" (stmts generated
190 by the vectorizer to represent/replace a certain idiom). */
191 gcc_assert ((STMT_VINFO_DATA_REF (stmt_info)
192 || vectype_maybe_set_p)
193 && STMT_VINFO_VECTYPE (stmt_info) == stmt_vectype);
194 else if (stmt_vectype == boolean_type_node)
195 mask_producers->safe_push (stmt_info);
196 else
197 STMT_VINFO_VECTYPE (stmt_info) = stmt_vectype;
200 if (nunits_vectype)
201 vect_update_max_nunits (vf, nunits_vectype);
203 return opt_result::success ();
206 /* Subroutine of vect_determine_vectorization_factor. Set the vector
207 types of STMT_INFO and all attached pattern statements and update
208 the vectorization factor VF accordingly. If some of the statements
209 produce a mask result whose vector type can only be calculated later,
210 add them to MASK_PRODUCERS. Return true on success or false if
211 something prevented vectorization. */
213 static opt_result
214 vect_determine_vf_for_stmt (stmt_vec_info stmt_info, poly_uint64 *vf,
215 vec<stmt_vec_info > *mask_producers)
217 vec_info *vinfo = stmt_info->vinfo;
218 if (dump_enabled_p ())
219 dump_printf_loc (MSG_NOTE, vect_location, "==> examining statement: %G",
220 stmt_info->stmt);
221 opt_result res
222 = vect_determine_vf_for_stmt_1 (stmt_info, false, vf, mask_producers);
223 if (!res)
224 return res;
226 if (STMT_VINFO_IN_PATTERN_P (stmt_info)
227 && STMT_VINFO_RELATED_STMT (stmt_info))
229 gimple *pattern_def_seq = STMT_VINFO_PATTERN_DEF_SEQ (stmt_info);
230 stmt_info = STMT_VINFO_RELATED_STMT (stmt_info);
232 /* If a pattern statement has def stmts, analyze them too. */
233 for (gimple_stmt_iterator si = gsi_start (pattern_def_seq);
234 !gsi_end_p (si); gsi_next (&si))
236 stmt_vec_info def_stmt_info = vinfo->lookup_stmt (gsi_stmt (si));
237 if (dump_enabled_p ())
238 dump_printf_loc (MSG_NOTE, vect_location,
239 "==> examining pattern def stmt: %G",
240 def_stmt_info->stmt);
241 res = vect_determine_vf_for_stmt_1 (def_stmt_info, true,
242 vf, mask_producers);
245 if (!res)
246 return res;
249 if (dump_enabled_p ())
250 dump_printf_loc (MSG_NOTE, vect_location,
251 "==> examining pattern statement: %G",
252 stmt_info->stmt);
253 res = vect_determine_vf_for_stmt_1 (stmt_info, true, vf, mask_producers);
254 if (!res)
255 return res;
258 return opt_result::success ();
261 /* Function vect_determine_vectorization_factor
263 Determine the vectorization factor (VF). VF is the number of data elements
264 that are operated upon in parallel in a single iteration of the vectorized
265 loop. For example, when vectorizing a loop that operates on 4-byte elements,
266 on a target with a vector size (VS) of 16 bytes, the VF is set to 4, since 4
267 elements can fit in a single vector register.
269 We currently support vectorization of loops in which all types operated upon
270 are of the same size. Therefore this function currently sets VF according to
271 the size of the types operated upon, and fails if there are multiple sizes
272 in the loop.
274 VF is also the factor by which the loop iterations are strip-mined, e.g.:
275 original loop:
276 for (i=0; i<N; i++){
277 a[i] = b[i] + c[i];
280 vectorized loop:
281 for (i=0; i<N; i+=VF){
282 a[i:VF] = b[i:VF] + c[i:VF];
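/* A small worked instance of the above, assuming (purely for illustration)
   a 16-byte vector size and 4-byte ints, so VF = 16 / 4 = 4:

     int a[N], b[N], c[N];

     for (i=0; i<N; i++)            for (i=0; i<N; i+=4)
       a[i] = b[i] + c[i];    =>      a[i:4] = b[i:4] + c[i:4];
*/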
286 static opt_result
287 vect_determine_vectorization_factor (loop_vec_info loop_vinfo)
289 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
290 basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
291 unsigned nbbs = loop->num_nodes;
292 poly_uint64 vectorization_factor = 1;
293 tree scalar_type = NULL_TREE;
294 gphi *phi;
295 tree vectype;
296 stmt_vec_info stmt_info;
297 unsigned i;
298 auto_vec<stmt_vec_info> mask_producers;
300 DUMP_VECT_SCOPE ("vect_determine_vectorization_factor");
302 for (i = 0; i < nbbs; i++)
304 basic_block bb = bbs[i];
306 for (gphi_iterator si = gsi_start_phis (bb); !gsi_end_p (si);
307 gsi_next (&si))
309 phi = si.phi ();
310 stmt_info = loop_vinfo->lookup_stmt (phi);
311 if (dump_enabled_p ())
312 dump_printf_loc (MSG_NOTE, vect_location, "==> examining phi: %G",
313 phi);
315 gcc_assert (stmt_info);
317 if (STMT_VINFO_RELEVANT_P (stmt_info)
318 || STMT_VINFO_LIVE_P (stmt_info))
320 gcc_assert (!STMT_VINFO_VECTYPE (stmt_info));
321 scalar_type = TREE_TYPE (PHI_RESULT (phi));
323 if (dump_enabled_p ())
324 dump_printf_loc (MSG_NOTE, vect_location,
325 "get vectype for scalar type: %T\n",
326 scalar_type);
328 vectype = get_vectype_for_scalar_type (scalar_type);
329 if (!vectype)
330 return opt_result::failure_at (phi,
331 "not vectorized: unsupported "
332 "data-type %T\n",
333 scalar_type);
334 STMT_VINFO_VECTYPE (stmt_info) = vectype;
336 if (dump_enabled_p ())
337 dump_printf_loc (MSG_NOTE, vect_location, "vectype: %T\n",
338 vectype);
340 if (dump_enabled_p ())
342 dump_printf_loc (MSG_NOTE, vect_location, "nunits = ");
343 dump_dec (MSG_NOTE, TYPE_VECTOR_SUBPARTS (vectype));
344 dump_printf (MSG_NOTE, "\n");
347 vect_update_max_nunits (&vectorization_factor, vectype);
351 for (gimple_stmt_iterator si = gsi_start_bb (bb); !gsi_end_p (si);
352 gsi_next (&si))
354 stmt_info = loop_vinfo->lookup_stmt (gsi_stmt (si));
355 opt_result res
356 = vect_determine_vf_for_stmt (stmt_info, &vectorization_factor,
357 &mask_producers);
358 if (!res)
359 return res;
363 /* TODO: Analyze cost. Decide if worth while to vectorize. */
364 if (dump_enabled_p ())
366 dump_printf_loc (MSG_NOTE, vect_location, "vectorization factor = ");
367 dump_dec (MSG_NOTE, vectorization_factor);
368 dump_printf (MSG_NOTE, "\n");
371 if (known_le (vectorization_factor, 1U))
372 return opt_result::failure_at (vect_location,
373 "not vectorized: unsupported data-type\n");
374 LOOP_VINFO_VECT_FACTOR (loop_vinfo) = vectorization_factor;
376 for (i = 0; i < mask_producers.length (); i++)
378 stmt_info = mask_producers[i];
379 opt_tree mask_type = vect_get_mask_type_for_stmt (stmt_info);
380 if (!mask_type)
381 return opt_result::propagate_failure (mask_type);
382 STMT_VINFO_VECTYPE (stmt_info) = mask_type;
385 return opt_result::success ();
389 /* Function vect_is_simple_iv_evolution.
391 FORNOW: A simple evolution of an induction variable in the loop is
392 considered a polynomial evolution. */
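/* Illustrative sketch only: for a loop such as

     for (i = 0; i < n; i++)
       p = p + 4;

   scev describes p by the chrec {p_0, +, 4}_loop, so *INIT is p_0 and
   *STEP is 4.  An evolution whose step is itself a chrec, e.g.
   {{0, +, 1}_loop, +, 2}_loop, has degree >= 2 and is rejected as not
   "simple" below.  */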
394 static bool
395 vect_is_simple_iv_evolution (unsigned loop_nb, tree access_fn, tree * init,
396 tree * step)
398 tree init_expr;
399 tree step_expr;
400 tree evolution_part = evolution_part_in_loop_num (access_fn, loop_nb);
401 basic_block bb;
403 /* When there is no evolution in this loop, the evolution function
404 is not "simple". */
405 if (evolution_part == NULL_TREE)
406 return false;
408 /* When the evolution is a polynomial of degree >= 2
409 the evolution function is not "simple". */
410 if (tree_is_chrec (evolution_part))
411 return false;
413 step_expr = evolution_part;
414 init_expr = unshare_expr (initial_condition_in_loop_num (access_fn, loop_nb));
416 if (dump_enabled_p ())
417 dump_printf_loc (MSG_NOTE, vect_location, "step: %T, init: %T\n",
418 step_expr, init_expr);
420 *init = init_expr;
421 *step = step_expr;
423 if (TREE_CODE (step_expr) != INTEGER_CST
424 && (TREE_CODE (step_expr) != SSA_NAME
425 || ((bb = gimple_bb (SSA_NAME_DEF_STMT (step_expr)))
426 && flow_bb_inside_loop_p (get_loop (cfun, loop_nb), bb))
427 || (!INTEGRAL_TYPE_P (TREE_TYPE (step_expr))
428 && (!SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr))
429 || !flag_associative_math)))
430 && (TREE_CODE (step_expr) != REAL_CST
431 || !flag_associative_math))
433 if (dump_enabled_p ())
434 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
435 "step unknown.\n");
436 return false;
439 return true;
442 /* Return true if PHI, described by STMT_INFO, is the inner PHI in
443 what we are assuming is a double reduction. For example, given
444 a structure like this:
446 outer1:
447 x_1 = PHI <x_4(outer2), ...>;
450 inner:
451 x_2 = PHI <x_1(outer1), ...>;
453 x_3 = ...;
456 outer2:
457 x_4 = PHI <x_3(inner)>;
460 outer loop analysis would treat x_1 as a double reduction phi and
461 this function would then return true for x_2. */
463 static bool
464 vect_inner_phi_in_double_reduction_p (stmt_vec_info stmt_info, gphi *phi)
466 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
467 use_operand_p use_p;
468 ssa_op_iter op_iter;
469 FOR_EACH_PHI_ARG (use_p, phi, op_iter, SSA_OP_USE)
470 if (stmt_vec_info def_info = loop_vinfo->lookup_def (USE_FROM_PTR (use_p)))
471 if (STMT_VINFO_DEF_TYPE (def_info) == vect_double_reduction_def)
472 return true;
473 return false;
476 /* Function vect_analyze_scalar_cycles_1.
478 Examine the cross iteration def-use cycles of scalar variables
479 in LOOP. LOOP_VINFO represents the loop that is now being
480 considered for vectorization (can be LOOP, or an outer-loop
481 enclosing LOOP). */
483 static void
484 vect_analyze_scalar_cycles_1 (loop_vec_info loop_vinfo, struct loop *loop)
486 basic_block bb = loop->header;
487 tree init, step;
488 auto_vec<stmt_vec_info, 64> worklist;
489 gphi_iterator gsi;
490 bool double_reduc;
492 DUMP_VECT_SCOPE ("vect_analyze_scalar_cycles");
494 /* First - identify all inductions. Reduction detection assumes that all the
495 inductions have been identified, therefore, this order must not be
496 changed. */
497 for (gsi = gsi_start_phis (bb); !gsi_end_p (gsi); gsi_next (&gsi))
499 gphi *phi = gsi.phi ();
500 tree access_fn = NULL;
501 tree def = PHI_RESULT (phi);
502 stmt_vec_info stmt_vinfo = loop_vinfo->lookup_stmt (phi);
504 if (dump_enabled_p ())
505 dump_printf_loc (MSG_NOTE, vect_location, "Analyze phi: %G", phi);
507 /* Skip virtual phi's. The data dependences that are associated with
508 virtual defs/uses (i.e., memory accesses) are analyzed elsewhere. */
509 if (virtual_operand_p (def))
510 continue;
512 STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_unknown_def_type;
514 /* Analyze the evolution function. */
515 access_fn = analyze_scalar_evolution (loop, def);
516 if (access_fn)
518 STRIP_NOPS (access_fn);
519 if (dump_enabled_p ())
520 dump_printf_loc (MSG_NOTE, vect_location,
521 "Access function of PHI: %T\n", access_fn);
522 STMT_VINFO_LOOP_PHI_EVOLUTION_BASE_UNCHANGED (stmt_vinfo)
523 = initial_condition_in_loop_num (access_fn, loop->num);
524 STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_vinfo)
525 = evolution_part_in_loop_num (access_fn, loop->num);
528 if (!access_fn
529 || vect_inner_phi_in_double_reduction_p (stmt_vinfo, phi)
530 || !vect_is_simple_iv_evolution (loop->num, access_fn, &init, &step)
531 || (LOOP_VINFO_LOOP (loop_vinfo) != loop
532 && TREE_CODE (step) != INTEGER_CST))
534 worklist.safe_push (stmt_vinfo);
535 continue;
538 gcc_assert (STMT_VINFO_LOOP_PHI_EVOLUTION_BASE_UNCHANGED (stmt_vinfo)
539 != NULL_TREE);
540 gcc_assert (STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_vinfo) != NULL_TREE);
542 if (dump_enabled_p ())
543 dump_printf_loc (MSG_NOTE, vect_location, "Detected induction.\n");
544 STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_induction_def;
548 /* Second - identify all reductions and nested cycles. */
549 while (worklist.length () > 0)
551 stmt_vec_info stmt_vinfo = worklist.pop ();
552 gphi *phi = as_a <gphi *> (stmt_vinfo->stmt);
553 tree def = PHI_RESULT (phi);
555 if (dump_enabled_p ())
556 dump_printf_loc (MSG_NOTE, vect_location, "Analyze phi: %G", phi);
558 gcc_assert (!virtual_operand_p (def)
559 && STMT_VINFO_DEF_TYPE (stmt_vinfo) == vect_unknown_def_type);
561 stmt_vec_info reduc_stmt_info
562 = vect_force_simple_reduction (loop_vinfo, stmt_vinfo,
563 &double_reduc, false);
564 if (reduc_stmt_info)
566 if (double_reduc)
568 if (dump_enabled_p ())
569 dump_printf_loc (MSG_NOTE, vect_location,
570 "Detected double reduction.\n");
572 STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_double_reduction_def;
573 STMT_VINFO_DEF_TYPE (reduc_stmt_info)
574 = vect_double_reduction_def;
576 else
578 if (loop != LOOP_VINFO_LOOP (loop_vinfo))
580 if (dump_enabled_p ())
581 dump_printf_loc (MSG_NOTE, vect_location,
582 "Detected vectorizable nested cycle.\n");
584 STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_nested_cycle;
585 STMT_VINFO_DEF_TYPE (reduc_stmt_info) = vect_nested_cycle;
587 else
589 if (dump_enabled_p ())
590 dump_printf_loc (MSG_NOTE, vect_location,
591 "Detected reduction.\n");
593 STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_reduction_def;
594 STMT_VINFO_DEF_TYPE (reduc_stmt_info) = vect_reduction_def;
595 /* Store the reduction cycles for possible vectorization in
596 loop-aware SLP if it was not detected as reduction
597 chain. */
598 if (! REDUC_GROUP_FIRST_ELEMENT (reduc_stmt_info))
599 LOOP_VINFO_REDUCTIONS (loop_vinfo).safe_push
600 (reduc_stmt_info);
604 else
605 if (dump_enabled_p ())
606 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
607 "Unknown def-use cycle pattern.\n");
612 /* Function vect_analyze_scalar_cycles.
614 Examine the cross iteration def-use cycles of scalar variables, by
615 analyzing the loop-header PHIs of scalar variables. Classify each
616 cycle as one of the following: invariant, induction, reduction, unknown.
617 We do that for the loop represented by LOOP_VINFO, and also for its
618 inner-loop, if it exists.
619 Examples for scalar cycles:
621 Example1: reduction:
623 loop1:
624 for (i=0; i<N; i++)
625 sum += a[i];
627 Example2: induction:
629 loop2:
630 for (i=0; i<N; i++)
631 a[i] = i; */
633 static void
634 vect_analyze_scalar_cycles (loop_vec_info loop_vinfo)
636 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
638 vect_analyze_scalar_cycles_1 (loop_vinfo, loop);
640 /* When vectorizing an outer-loop, the inner-loop is executed sequentially.
641 Reductions in such an inner-loop therefore have different properties than
642 the reductions in the nest that gets vectorized:
643 1. When vectorized, they are executed in the same order as in the original
644 scalar loop, so we can't change the order of computation when
645 vectorizing them.
646 2. FIXME: Inner-loop reductions can be used in the inner-loop, so the
647 current checks are too strict. */
649 if (loop->inner)
650 vect_analyze_scalar_cycles_1 (loop_vinfo, loop->inner);
653 /* Transfer group and reduction information from STMT_INFO to its
654 pattern stmt. */
656 static void
657 vect_fixup_reduc_chain (stmt_vec_info stmt_info)
659 stmt_vec_info firstp = STMT_VINFO_RELATED_STMT (stmt_info);
660 stmt_vec_info stmtp;
661 gcc_assert (!REDUC_GROUP_FIRST_ELEMENT (firstp)
662 && REDUC_GROUP_FIRST_ELEMENT (stmt_info));
663 REDUC_GROUP_SIZE (firstp) = REDUC_GROUP_SIZE (stmt_info);
666 stmtp = STMT_VINFO_RELATED_STMT (stmt_info);
667 REDUC_GROUP_FIRST_ELEMENT (stmtp) = firstp;
668 stmt_info = REDUC_GROUP_NEXT_ELEMENT (stmt_info);
669 if (stmt_info)
670 REDUC_GROUP_NEXT_ELEMENT (stmtp)
671 = STMT_VINFO_RELATED_STMT (stmt_info);
673 while (stmt_info);
674 STMT_VINFO_DEF_TYPE (stmtp) = vect_reduction_def;
677 /* Fixup scalar cycles that now have their stmts detected as patterns. */
679 static void
680 vect_fixup_scalar_cycles_with_patterns (loop_vec_info loop_vinfo)
682 stmt_vec_info first;
683 unsigned i;
685 FOR_EACH_VEC_ELT (LOOP_VINFO_REDUCTION_CHAINS (loop_vinfo), i, first)
686 if (STMT_VINFO_IN_PATTERN_P (first))
688 stmt_vec_info next = REDUC_GROUP_NEXT_ELEMENT (first);
689 while (next)
691 if (! STMT_VINFO_IN_PATTERN_P (next))
692 break;
693 next = REDUC_GROUP_NEXT_ELEMENT (next);
695 /* If not all stmts in the chain are patterns, try to handle
696 the chain without patterns. */
697 if (! next)
699 vect_fixup_reduc_chain (first);
700 LOOP_VINFO_REDUCTION_CHAINS (loop_vinfo)[i]
701 = STMT_VINFO_RELATED_STMT (first);
706 /* Function vect_get_loop_niters.
708 Determine how many iterations the loop executes and place it
709 in NUMBER_OF_ITERATIONS. Place the number of latch iterations
710 in NUMBER_OF_ITERATIONSM1. Place the condition under which the
711 niter information holds in ASSUMPTIONS.
713 Return the loop exit condition. */
716 static gcond *
717 vect_get_loop_niters (struct loop *loop, tree *assumptions,
718 tree *number_of_iterations, tree *number_of_iterationsm1)
720 edge exit = single_exit (loop);
721 struct tree_niter_desc niter_desc;
722 tree niter_assumptions, niter, may_be_zero;
723 gcond *cond = get_loop_exit_condition (loop);
725 *assumptions = boolean_true_node;
726 *number_of_iterationsm1 = chrec_dont_know;
727 *number_of_iterations = chrec_dont_know;
728 DUMP_VECT_SCOPE ("get_loop_niters");
730 if (!exit)
731 return cond;
733 niter = chrec_dont_know;
734 may_be_zero = NULL_TREE;
735 niter_assumptions = boolean_true_node;
736 if (!number_of_iterations_exit_assumptions (loop, exit, &niter_desc, NULL)
737 || chrec_contains_undetermined (niter_desc.niter))
738 return cond;
740 niter_assumptions = niter_desc.assumptions;
741 may_be_zero = niter_desc.may_be_zero;
742 niter = niter_desc.niter;
744 if (may_be_zero && integer_zerop (may_be_zero))
745 may_be_zero = NULL_TREE;
747 if (may_be_zero)
749 if (COMPARISON_CLASS_P (may_be_zero))
751 /* Try to combine may_be_zero with assumptions, this can simplify
752 computation of niter expression. */
753 if (niter_assumptions && !integer_nonzerop (niter_assumptions))
754 niter_assumptions = fold_build2 (TRUTH_AND_EXPR, boolean_type_node,
755 niter_assumptions,
756 fold_build1 (TRUTH_NOT_EXPR,
757 boolean_type_node,
758 may_be_zero));
759 else
760 niter = fold_build3 (COND_EXPR, TREE_TYPE (niter), may_be_zero,
761 build_int_cst (TREE_TYPE (niter), 0),
762 rewrite_to_non_trapping_overflow (niter));
764 may_be_zero = NULL_TREE;
766 else if (integer_nonzerop (may_be_zero))
768 *number_of_iterationsm1 = build_int_cst (TREE_TYPE (niter), 0);
769 *number_of_iterations = build_int_cst (TREE_TYPE (niter), 1);
770 return cond;
772 else
773 return cond;
776 *assumptions = niter_assumptions;
777 *number_of_iterationsm1 = niter;
779 /* We want the number of loop header executions which is the number
780 of latch executions plus one.
781 ??? For UINT_MAX latch executions this number overflows to zero
782 for loops like do { n++; } while (n != 0); */
783 if (niter && !chrec_contains_undetermined (niter))
784 niter = fold_build2 (PLUS_EXPR, TREE_TYPE (niter), unshare_expr (niter),
785 build_int_cst (TREE_TYPE (niter), 1));
786 *number_of_iterations = niter;
788 return cond;
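/* Numerical sketch of the adjustment above (illustrative values): for

     i = 0; do { ...; i++; } while (i < 10);

   the latch runs 9 times and the header 10 times, so *NUMBER_OF_ITERATIONSM1
   is 9 and *NUMBER_OF_ITERATIONS is 10.  As the ??? note says, a latch count
   of UINT_MAX would wrap the + 1 back to zero.  */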
791 /* Function bb_in_loop_p
793 Used as predicate for dfs order traversal of the loop bbs. */
795 static bool
796 bb_in_loop_p (const_basic_block bb, const void *data)
798 const struct loop *const loop = (const struct loop *)data;
799 if (flow_bb_inside_loop_p (loop, bb))
800 return true;
801 return false;
805 /* Create and initialize a new loop_vec_info struct for LOOP_IN, as well as
806 stmt_vec_info structs for all the stmts in LOOP_IN. */
808 _loop_vec_info::_loop_vec_info (struct loop *loop_in, vec_info_shared *shared)
809 : vec_info (vec_info::loop, init_cost (loop_in), shared),
810 loop (loop_in),
811 bbs (XCNEWVEC (basic_block, loop->num_nodes)),
812 num_itersm1 (NULL_TREE),
813 num_iters (NULL_TREE),
814 num_iters_unchanged (NULL_TREE),
815 num_iters_assumptions (NULL_TREE),
816 th (0),
817 versioning_threshold (0),
818 vectorization_factor (0),
819 max_vectorization_factor (0),
820 mask_skip_niters (NULL_TREE),
821 mask_compare_type (NULL_TREE),
822 unaligned_dr (NULL),
823 peeling_for_alignment (0),
824 ptr_mask (0),
825 ivexpr_map (NULL),
826 slp_unrolling_factor (1),
827 single_scalar_iteration_cost (0),
828 vectorizable (false),
829 can_fully_mask_p (true),
830 fully_masked_p (false),
831 peeling_for_gaps (false),
832 peeling_for_niter (false),
833 operands_swapped (false),
834 no_data_dependencies (false),
835 has_mask_store (false),
836 scalar_loop (NULL),
837 orig_loop_info (NULL)
839 /* CHECKME: We want to visit all BBs before their successors (except for
840 latch blocks, for which this assertion wouldn't hold). In the simple
841 case of the loop forms we allow, a dfs order of the BBs would be the same
842 as reversed postorder traversal, so we are safe. */
844 unsigned int nbbs = dfs_enumerate_from (loop->header, 0, bb_in_loop_p,
845 bbs, loop->num_nodes, loop);
846 gcc_assert (nbbs == loop->num_nodes);
848 for (unsigned int i = 0; i < nbbs; i++)
850 basic_block bb = bbs[i];
851 gimple_stmt_iterator si;
853 for (si = gsi_start_phis (bb); !gsi_end_p (si); gsi_next (&si))
855 gimple *phi = gsi_stmt (si);
856 gimple_set_uid (phi, 0);
857 add_stmt (phi);
860 for (si = gsi_start_bb (bb); !gsi_end_p (si); gsi_next (&si))
862 gimple *stmt = gsi_stmt (si);
863 gimple_set_uid (stmt, 0);
864 add_stmt (stmt);
869 /* Free all levels of MASKS. */
871 void
872 release_vec_loop_masks (vec_loop_masks *masks)
874 rgroup_masks *rgm;
875 unsigned int i;
876 FOR_EACH_VEC_ELT (*masks, i, rgm)
877 rgm->masks.release ();
878 masks->release ();
881 /* Free all memory used by the _loop_vec_info, as well as all the
882 stmt_vec_info structs of all the stmts in the loop. */
884 _loop_vec_info::~_loop_vec_info ()
886 int nbbs;
887 gimple_stmt_iterator si;
888 int j;
890 nbbs = loop->num_nodes;
891 for (j = 0; j < nbbs; j++)
893 basic_block bb = bbs[j];
894 for (si = gsi_start_bb (bb); !gsi_end_p (si); )
896 gimple *stmt = gsi_stmt (si);
898 /* We may have broken canonical form by moving a constant
899 into RHS1 of a commutative op. Fix such occurrences. */
900 if (operands_swapped && is_gimple_assign (stmt))
902 enum tree_code code = gimple_assign_rhs_code (stmt);
904 if ((code == PLUS_EXPR
905 || code == POINTER_PLUS_EXPR
906 || code == MULT_EXPR)
907 && CONSTANT_CLASS_P (gimple_assign_rhs1 (stmt)))
908 swap_ssa_operands (stmt,
909 gimple_assign_rhs1_ptr (stmt),
910 gimple_assign_rhs2_ptr (stmt));
911 else if (code == COND_EXPR
912 && CONSTANT_CLASS_P (gimple_assign_rhs2 (stmt)))
914 tree cond_expr = gimple_assign_rhs1 (stmt);
915 enum tree_code cond_code = TREE_CODE (cond_expr);
917 if (TREE_CODE_CLASS (cond_code) == tcc_comparison)
919 bool honor_nans = HONOR_NANS (TREE_OPERAND (cond_expr,
920 0));
921 cond_code = invert_tree_comparison (cond_code,
922 honor_nans);
923 if (cond_code != ERROR_MARK)
925 TREE_SET_CODE (cond_expr, cond_code);
926 swap_ssa_operands (stmt,
927 gimple_assign_rhs2_ptr (stmt),
928 gimple_assign_rhs3_ptr (stmt));
933 gsi_next (&si);
937 free (bbs);
939 release_vec_loop_masks (&masks);
940 delete ivexpr_map;
942 loop->aux = NULL;
945 /* Return an invariant or register for EXPR and emit necessary
946 computations in the LOOP_VINFO loop preheader. */
948 tree
949 cse_and_gimplify_to_preheader (loop_vec_info loop_vinfo, tree expr)
951 if (is_gimple_reg (expr)
952 || is_gimple_min_invariant (expr))
953 return expr;
955 if (! loop_vinfo->ivexpr_map)
956 loop_vinfo->ivexpr_map = new hash_map<tree_operand_hash, tree>;
957 tree &cached = loop_vinfo->ivexpr_map->get_or_insert (expr);
958 if (! cached)
960 gimple_seq stmts = NULL;
961 cached = force_gimple_operand (unshare_expr (expr),
962 &stmts, true, NULL_TREE);
963 if (stmts)
965 edge e = loop_preheader_edge (LOOP_VINFO_LOOP (loop_vinfo));
966 gsi_insert_seq_on_edge_immediate (e, stmts);
969 return cached;
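/* Typical use (a sketch, not a prescribed contract): a caller that has built
   an IV-related expression EXPR might do

     tree val = cse_and_gimplify_to_preheader (loop_vinfo, expr);

   and get back either EXPR itself (when it is already a register or an
   invariant) or an SSA name computed on the preheader edge, with identical
   expressions shared through the ivexpr_map cache.  */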
972 /* Return true if we can use CMP_TYPE as the comparison type to produce
973 all masks required to mask LOOP_VINFO. */
975 static bool
976 can_produce_all_loop_masks_p (loop_vec_info loop_vinfo, tree cmp_type)
978 rgroup_masks *rgm;
979 unsigned int i;
980 FOR_EACH_VEC_ELT (LOOP_VINFO_MASKS (loop_vinfo), i, rgm)
981 if (rgm->mask_type != NULL_TREE
982 && !direct_internal_fn_supported_p (IFN_WHILE_ULT,
983 cmp_type, rgm->mask_type,
984 OPTIMIZE_FOR_SPEED))
985 return false;
986 return true;
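/* For reference, IFN_WHILE_ULT (START, END) produces a mask whose element I
   is (START + I < END), with the comparison done in CMP_TYPE.  A rough
   scalar model of one such mask (illustration only):

     for (unsigned int i = 0; i < nunits; ++i)
       mask[i] = (start + i < end);
*/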
989 /* Calculate the maximum number of scalars per iteration for every
990 rgroup in LOOP_VINFO. */
992 static unsigned int
993 vect_get_max_nscalars_per_iter (loop_vec_info loop_vinfo)
995 unsigned int res = 1;
996 unsigned int i;
997 rgroup_masks *rgm;
998 FOR_EACH_VEC_ELT (LOOP_VINFO_MASKS (loop_vinfo), i, rgm)
999 res = MAX (res, rgm->max_nscalars_per_iter);
1000 return res;
1003 /* Each statement in LOOP_VINFO can be masked where necessary. Check
1004 whether we can actually generate the masks required. Return true if so,
1005 storing the type of the scalar IV in LOOP_VINFO_MASK_COMPARE_TYPE. */
1007 static bool
1008 vect_verify_full_masking (loop_vec_info loop_vinfo)
1010 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1011 unsigned int min_ni_width;
1013 /* Use a normal loop if there are no statements that need masking.
1014 This only happens in rare degenerate cases: it means that the loop
1015 has no loads, no stores, and no live-out values. */
1016 if (LOOP_VINFO_MASKS (loop_vinfo).is_empty ())
1017 return false;
1019 /* Get the maximum number of iterations that is representable
1020 in the counter type. */
1021 tree ni_type = TREE_TYPE (LOOP_VINFO_NITERSM1 (loop_vinfo));
1022 widest_int max_ni = wi::to_widest (TYPE_MAX_VALUE (ni_type)) + 1;
1024 /* Get a more refined estimate for the number of iterations. */
1025 widest_int max_back_edges;
1026 if (max_loop_iterations (loop, &max_back_edges))
1027 max_ni = wi::smin (max_ni, max_back_edges + 1);
1029 /* Account for rgroup masks, in which each bit is replicated N times. */
1030 max_ni *= vect_get_max_nscalars_per_iter (loop_vinfo);
1032 /* Work out how many bits we need to represent the limit. */
1033 min_ni_width = wi::min_precision (max_ni, UNSIGNED);
1035 /* Find a scalar mode for which WHILE_ULT is supported. */
1036 opt_scalar_int_mode cmp_mode_iter;
1037 tree cmp_type = NULL_TREE;
1038 FOR_EACH_MODE_IN_CLASS (cmp_mode_iter, MODE_INT)
1040 unsigned int cmp_bits = GET_MODE_BITSIZE (cmp_mode_iter.require ());
1041 if (cmp_bits >= min_ni_width
1042 && targetm.scalar_mode_supported_p (cmp_mode_iter.require ()))
1044 tree this_type = build_nonstandard_integer_type (cmp_bits, true);
1045 if (this_type
1046 && can_produce_all_loop_masks_p (loop_vinfo, this_type))
1048 /* Although we could stop as soon as we find a valid mode,
1049 it's often better to continue until we hit Pmode, since the
1050 operands to the WHILE are more likely to be reusable in
1051 address calculations. */
1052 cmp_type = this_type;
1053 if (cmp_bits >= GET_MODE_BITSIZE (Pmode))
1054 break;
1059 if (!cmp_type)
1060 return false;
1062 LOOP_VINFO_MASK_COMPARE_TYPE (loop_vinfo) = cmp_type;
1063 return true;
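/* Worked example of the width computation above (numbers purely
   illustrative): if the niter analysis bounds the loop at 1000 iterations
   and the largest rgroup needs 2 scalars per iteration, max_ni becomes
   2000, which needs wi::min_precision (2000, UNSIGNED) == 11 bits; any
   supported integer mode of at least 11 bits for which WHILE_ULT is
   available can then serve as the comparison type.  */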
1066 /* Calculate the cost of one scalar iteration of the loop. */
1067 static void
1068 vect_compute_single_scalar_iteration_cost (loop_vec_info loop_vinfo)
1070 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1071 basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
1072 int nbbs = loop->num_nodes, factor;
1073 int innerloop_iters, i;
1075 DUMP_VECT_SCOPE ("vect_compute_single_scalar_iteration_cost");
1077 /* Gather costs for statements in the scalar loop. */
1079 /* FORNOW. */
1080 innerloop_iters = 1;
1081 if (loop->inner)
1082 innerloop_iters = 50; /* FIXME */
1084 for (i = 0; i < nbbs; i++)
1086 gimple_stmt_iterator si;
1087 basic_block bb = bbs[i];
1089 if (bb->loop_father == loop->inner)
1090 factor = innerloop_iters;
1091 else
1092 factor = 1;
1094 for (si = gsi_start_bb (bb); !gsi_end_p (si); gsi_next (&si))
1096 gimple *stmt = gsi_stmt (si);
1097 stmt_vec_info stmt_info = loop_vinfo->lookup_stmt (stmt);
1099 if (!is_gimple_assign (stmt) && !is_gimple_call (stmt))
1100 continue;
1102 /* Skip stmts that are not vectorized inside the loop. */
1103 if (stmt_info
1104 && !STMT_VINFO_RELEVANT_P (stmt_info)
1105 && (!STMT_VINFO_LIVE_P (stmt_info)
1106 || !VECTORIZABLE_CYCLE_DEF (STMT_VINFO_DEF_TYPE (stmt_info)))
1107 && !STMT_VINFO_IN_PATTERN_P (stmt_info))
1108 continue;
1110 vect_cost_for_stmt kind;
1111 if (STMT_VINFO_DATA_REF (stmt_info))
1113 if (DR_IS_READ (STMT_VINFO_DATA_REF (stmt_info)))
1114 kind = scalar_load;
1115 else
1116 kind = scalar_store;
1118 else
1119 kind = scalar_stmt;
1121 record_stmt_cost (&LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo),
1122 factor, kind, stmt_info, 0, vect_prologue);
1126 /* Now accumulate cost. */
1127 void *target_cost_data = init_cost (loop);
1128 stmt_info_for_cost *si;
1129 int j;
1130 FOR_EACH_VEC_ELT (LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo),
1131 j, si)
1132 (void) add_stmt_cost (target_cost_data, si->count,
1133 si->kind, si->stmt_info, si->misalign,
1134 vect_body);
1135 unsigned dummy, body_cost = 0;
1136 finish_cost (target_cost_data, &dummy, &body_cost, &dummy);
1137 destroy_cost_data (target_cost_data);
1138 LOOP_VINFO_SINGLE_SCALAR_ITERATION_COST (loop_vinfo) = body_cost;
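/* Numerical sketch (the per-stmt costs are illustrative, not a statement
   about any particular target): a body with two loads, one store and one
   arithmetic stmt, each costed at 1 by the target's add_stmt_cost hook,
   gives a single-scalar-iteration cost of 4; stmts inside an inner loop
   would additionally be weighted by the FIXME factor of 50 above.  */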
1142 /* Function vect_analyze_loop_form_1.
1144 Verify that certain CFG restrictions hold, including:
1145 - the loop has a pre-header
1146 - the loop has a single entry and exit
1147 - the loop exit condition is simple enough
1148 - the number of iterations can be analyzed, i.e, a countable loop. The
1149 niter could be analyzed under some assumptions. */
1151 opt_result
1152 vect_analyze_loop_form_1 (struct loop *loop, gcond **loop_cond,
1153 tree *assumptions, tree *number_of_iterationsm1,
1154 tree *number_of_iterations, gcond **inner_loop_cond)
1156 DUMP_VECT_SCOPE ("vect_analyze_loop_form");
1158 /* Different restrictions apply when we are considering an inner-most loop,
1159 vs. an outer (nested) loop.
1160 (FORNOW. May want to relax some of these restrictions in the future). */
1162 if (!loop->inner)
1164 /* Inner-most loop. We currently require that the number of BBs is
1165 exactly 2 (the header and latch). Vectorizable inner-most loops
1166 look like this:
1168 (pre-header)
1170 header <--------+
1171 | | |
1172 | +--> latch --+
1174 (exit-bb) */
1176 if (loop->num_nodes != 2)
1177 return opt_result::failure_at (vect_location,
1178 "not vectorized:"
1179 " control flow in loop.\n");
1181 if (empty_block_p (loop->header))
1182 return opt_result::failure_at (vect_location,
1183 "not vectorized: empty loop.\n");
1185 else
1187 struct loop *innerloop = loop->inner;
1188 edge entryedge;
1190 /* Nested loop. We currently require that the loop is doubly-nested,
1191 contains a single inner loop, and the number of BBs is exactly 5.
1192 Vectorizable outer-loops look like this:
1194 (pre-header)
1196 header <---+
1198 inner-loop |
1200 tail ------+
1202 (exit-bb)
1204 The inner-loop has the properties expected of inner-most loops
1205 as described above. */
1207 if ((loop->inner)->inner || (loop->inner)->next)
1208 return opt_result::failure_at (vect_location,
1209 "not vectorized:"
1210 " multiple nested loops.\n");
1212 if (loop->num_nodes != 5)
1213 return opt_result::failure_at (vect_location,
1214 "not vectorized:"
1215 " control flow in loop.\n");
1217 entryedge = loop_preheader_edge (innerloop);
1218 if (entryedge->src != loop->header
1219 || !single_exit (innerloop)
1220 || single_exit (innerloop)->dest != EDGE_PRED (loop->latch, 0)->src)
1221 return opt_result::failure_at (vect_location,
1222 "not vectorized:"
1223 " unsupported outerloop form.\n");
1225 /* Analyze the inner-loop. */
1226 tree inner_niterm1, inner_niter, inner_assumptions;
1227 opt_result res
1228 = vect_analyze_loop_form_1 (loop->inner, inner_loop_cond,
1229 &inner_assumptions, &inner_niterm1,
1230 &inner_niter, NULL);
1231 if (!res)
1233 if (dump_enabled_p ())
1234 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1235 "not vectorized: Bad inner loop.\n");
1236 return res;
1239 /* Don't support analyzing niter under assumptions for inner
1240 loop. */
1241 if (!integer_onep (inner_assumptions))
1242 return opt_result::failure_at (vect_location,
1243 "not vectorized: Bad inner loop.\n");
1245 if (!expr_invariant_in_loop_p (loop, inner_niter))
1246 return opt_result::failure_at (vect_location,
1247 "not vectorized: inner-loop count not"
1248 " invariant.\n");
1250 if (dump_enabled_p ())
1251 dump_printf_loc (MSG_NOTE, vect_location,
1252 "Considering outer-loop vectorization.\n");
1255 if (!single_exit (loop))
1256 return opt_result::failure_at (vect_location,
1257 "not vectorized: multiple exits.\n");
1258 if (EDGE_COUNT (loop->header->preds) != 2)
1259 return opt_result::failure_at (vect_location,
1260 "not vectorized:"
1261 " too many incoming edges.\n");
1263 /* We assume that the loop exit condition is at the end of the loop, i.e.,
1264 that the loop is represented as a do-while (with a proper if-guard
1265 before the loop if needed), where the loop header contains all the
1266 executable statements, and the latch is empty. */
1267 if (!empty_block_p (loop->latch)
1268 || !gimple_seq_empty_p (phi_nodes (loop->latch)))
1269 return opt_result::failure_at (vect_location,
1270 "not vectorized: latch block not empty.\n");
1272 /* Make sure the exit is not abnormal. */
1273 edge e = single_exit (loop);
1274 if (e->flags & EDGE_ABNORMAL)
1275 return opt_result::failure_at (vect_location,
1276 "not vectorized:"
1277 " abnormal loop exit edge.\n");
1279 *loop_cond = vect_get_loop_niters (loop, assumptions, number_of_iterations,
1280 number_of_iterationsm1);
1281 if (!*loop_cond)
1282 return opt_result::failure_at
1283 (vect_location,
1284 "not vectorized: complicated exit condition.\n");
1286 if (integer_zerop (*assumptions)
1287 || !*number_of_iterations
1288 || chrec_contains_undetermined (*number_of_iterations))
1289 return opt_result::failure_at
1290 (*loop_cond,
1291 "not vectorized: number of iterations cannot be computed.\n");
1293 if (integer_zerop (*number_of_iterations))
1294 return opt_result::failure_at
1295 (*loop_cond,
1296 "not vectorized: number of iterations = 0.\n");
1298 return opt_result::success ();
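/* Source-level sketch of a loop shape that passes the checks above
   (illustration only): a single-exit inner loop whose exit test ends the
   body, as if written

     i = 0;
     do
       {
         a[i] = b[i] + c[i];
         i++;
       }
     while (i < n);

   with an empty latch and an analyzable iteration count.  Multiple exits,
   abnormal exit edges or extra control flow in the body take the failure
   paths above.  */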
1301 /* Analyze LOOP form and return a loop_vec_info if it is of suitable form. */
1303 opt_loop_vec_info
1304 vect_analyze_loop_form (struct loop *loop, vec_info_shared *shared)
1306 tree assumptions, number_of_iterations, number_of_iterationsm1;
1307 gcond *loop_cond, *inner_loop_cond = NULL;
1309 opt_result res
1310 = vect_analyze_loop_form_1 (loop, &loop_cond,
1311 &assumptions, &number_of_iterationsm1,
1312 &number_of_iterations, &inner_loop_cond);
1313 if (!res)
1314 return opt_loop_vec_info::propagate_failure (res);
1316 loop_vec_info loop_vinfo = new _loop_vec_info (loop, shared);
1317 LOOP_VINFO_NITERSM1 (loop_vinfo) = number_of_iterationsm1;
1318 LOOP_VINFO_NITERS (loop_vinfo) = number_of_iterations;
1319 LOOP_VINFO_NITERS_UNCHANGED (loop_vinfo) = number_of_iterations;
1320 if (!integer_onep (assumptions))
1322 /* We consider vectorizing this loop by versioning it under
1323 some assumptions. In order to do this, we need to clear
1324 existing information computed by scev and niter analyzer. */
1325 scev_reset_htab ();
1326 free_numbers_of_iterations_estimates (loop);
1327 /* Also set flag for this loop so that following scev and niter
1328 analysis are done under the assumptions. */
1329 loop_constraint_set (loop, LOOP_C_FINITE);
1330 /* Also record the assumptions for versioning. */
1331 LOOP_VINFO_NITERS_ASSUMPTIONS (loop_vinfo) = assumptions;
1334 if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
1336 if (dump_enabled_p ())
1338 dump_printf_loc (MSG_NOTE, vect_location,
1339 "Symbolic number of iterations is ");
1340 dump_generic_expr (MSG_NOTE, TDF_DETAILS, number_of_iterations);
1341 dump_printf (MSG_NOTE, "\n");
1345 stmt_vec_info loop_cond_info = loop_vinfo->lookup_stmt (loop_cond);
1346 STMT_VINFO_TYPE (loop_cond_info) = loop_exit_ctrl_vec_info_type;
1347 if (inner_loop_cond)
1349 stmt_vec_info inner_loop_cond_info
1350 = loop_vinfo->lookup_stmt (inner_loop_cond);
1351 STMT_VINFO_TYPE (inner_loop_cond_info) = loop_exit_ctrl_vec_info_type;
1354 gcc_assert (!loop->aux);
1355 loop->aux = loop_vinfo;
1356 return opt_loop_vec_info::success (loop_vinfo);
1361 /* Scan the loop stmts and, depending on whether there are any (non-)SLP
1362 statements, update the vectorization factor. */
1364 static void
1365 vect_update_vf_for_slp (loop_vec_info loop_vinfo)
1367 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1368 basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
1369 int nbbs = loop->num_nodes;
1370 poly_uint64 vectorization_factor;
1371 int i;
1373 DUMP_VECT_SCOPE ("vect_update_vf_for_slp");
1375 vectorization_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
1376 gcc_assert (known_ne (vectorization_factor, 0U));
1378 /* If all the stmts in the loop can be SLPed, we perform only SLP, and the
1379 vectorization factor of the loop is the unrolling factor required by
1380 the SLP instances. If that unrolling factor is 1, we say that we
1381 perform pure SLP on the loop - cross-iteration parallelism is not
1382 exploited. */
1383 bool only_slp_in_loop = true;
1384 for (i = 0; i < nbbs; i++)
1386 basic_block bb = bbs[i];
1387 for (gimple_stmt_iterator si = gsi_start_bb (bb); !gsi_end_p (si);
1388 gsi_next (&si))
1390 stmt_vec_info stmt_info = loop_vinfo->lookup_stmt (gsi_stmt (si));
1391 stmt_info = vect_stmt_to_vectorize (stmt_info);
1392 if ((STMT_VINFO_RELEVANT_P (stmt_info)
1393 || VECTORIZABLE_CYCLE_DEF (STMT_VINFO_DEF_TYPE (stmt_info)))
1394 && !PURE_SLP_STMT (stmt_info))
1395 /* STMT needs both SLP and loop-based vectorization. */
1396 only_slp_in_loop = false;
1400 if (only_slp_in_loop)
1402 if (dump_enabled_p ())
1403 dump_printf_loc (MSG_NOTE, vect_location,
1404 "Loop contains only SLP stmts\n");
1405 vectorization_factor = LOOP_VINFO_SLP_UNROLLING_FACTOR (loop_vinfo);
1407 else
1409 if (dump_enabled_p ())
1410 dump_printf_loc (MSG_NOTE, vect_location,
1411 "Loop contains SLP and non-SLP stmts\n");
1412 /* Both the vectorization factor and unroll factor have the form
1413 current_vector_size * X for some rational X, so they must have
1414 a common multiple. */
1415 vectorization_factor
1416 = force_common_multiple (vectorization_factor,
1417 LOOP_VINFO_SLP_UNROLLING_FACTOR (loop_vinfo));
1420 LOOP_VINFO_VECT_FACTOR (loop_vinfo) = vectorization_factor;
1421 if (dump_enabled_p ())
1423 dump_printf_loc (MSG_NOTE, vect_location,
1424 "Updating vectorization factor to ");
1425 dump_dec (MSG_NOTE, vectorization_factor);
1426 dump_printf (MSG_NOTE, ".\n");
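/* Numerical sketch of the update above (illustrative values only): if the
   loop-based vectorization factor is 4 and the SLP instances require an
   unrolling factor of 8, force_common_multiple yields 8; for factors 4 and
   6 it would yield their least common multiple, 12.  */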
1430 /* Return true if STMT_INFO describes a double reduction phi and if
1431 the other phi in the reduction is also relevant for vectorization.
1432 This rejects cases such as:
1434 outer1:
1435 x_1 = PHI <x_3(outer2), ...>;
1438 inner:
1439 x_2 = ...;
1442 outer2:
1443 x_3 = PHI <x_2(inner)>;
1445 if nothing in x_2 or elsewhere makes x_1 relevant. */
1447 static bool
1448 vect_active_double_reduction_p (stmt_vec_info stmt_info)
1450 if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_double_reduction_def)
1451 return false;
1453 return STMT_VINFO_RELEVANT_P (STMT_VINFO_REDUC_DEF (stmt_info));
1456 /* Function vect_analyze_loop_operations.
1458 Scan the loop stmts and make sure they are all vectorizable. */
1460 static opt_result
1461 vect_analyze_loop_operations (loop_vec_info loop_vinfo)
1463 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1464 basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
1465 int nbbs = loop->num_nodes;
1466 int i;
1467 stmt_vec_info stmt_info;
1468 bool need_to_vectorize = false;
1469 bool ok;
1471 DUMP_VECT_SCOPE ("vect_analyze_loop_operations");
1473 stmt_vector_for_cost cost_vec;
1474 cost_vec.create (2);
1476 for (i = 0; i < nbbs; i++)
1478 basic_block bb = bbs[i];
1480 for (gphi_iterator si = gsi_start_phis (bb); !gsi_end_p (si);
1481 gsi_next (&si))
1483 gphi *phi = si.phi ();
1484 ok = true;
1486 stmt_info = loop_vinfo->lookup_stmt (phi);
1487 if (dump_enabled_p ())
1488 dump_printf_loc (MSG_NOTE, vect_location, "examining phi: %G", phi);
1489 if (virtual_operand_p (gimple_phi_result (phi)))
1490 continue;
1492 /* Inner-loop loop-closed exit phi in outer-loop vectorization
1493 (i.e., a phi in the tail of the outer-loop). */
1494 if (! is_loop_header_bb_p (bb))
1496 /* FORNOW: we currently don't support the case that these phis
1497 are not used in the outerloop (unless it is a double reduction,
1498 i.e., this phi is vect_reduction_def), because this case
1499 would require us to actually do something here. */
1500 if (STMT_VINFO_LIVE_P (stmt_info)
1501 && !vect_active_double_reduction_p (stmt_info))
1502 return opt_result::failure_at (phi,
1503 "Unsupported loop-closed phi"
1504 " in outer-loop.\n");
1506 /* If PHI is used in the outer loop, we check that its operand
1507 is defined in the inner loop. */
1508 if (STMT_VINFO_RELEVANT_P (stmt_info))
1510 tree phi_op;
1512 if (gimple_phi_num_args (phi) != 1)
1513 return opt_result::failure_at (phi, "unsupported phi");
1515 phi_op = PHI_ARG_DEF (phi, 0);
1516 stmt_vec_info op_def_info = loop_vinfo->lookup_def (phi_op);
1517 if (!op_def_info)
1518 return opt_result::failure_at (phi, "unsupported phi");
1520 if (STMT_VINFO_RELEVANT (op_def_info) != vect_used_in_outer
1521 && (STMT_VINFO_RELEVANT (op_def_info)
1522 != vect_used_in_outer_by_reduction))
1523 return opt_result::failure_at (phi, "unsupported phi");
1526 continue;
1529 gcc_assert (stmt_info);
1531 if ((STMT_VINFO_RELEVANT (stmt_info) == vect_used_in_scope
1532 || STMT_VINFO_LIVE_P (stmt_info))
1533 && STMT_VINFO_DEF_TYPE (stmt_info) != vect_induction_def)
1534 /* A scalar-dependence cycle that we don't support. */
1535 return opt_result::failure_at (phi,
1536 "not vectorized:"
1537 " scalar dependence cycle.\n");
1539 if (STMT_VINFO_RELEVANT_P (stmt_info))
1541 need_to_vectorize = true;
1542 if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_induction_def
1543 && ! PURE_SLP_STMT (stmt_info))
1544 ok = vectorizable_induction (stmt_info, NULL, NULL, NULL,
1545 &cost_vec);
1546 else if ((STMT_VINFO_DEF_TYPE (stmt_info) == vect_reduction_def
1547 || STMT_VINFO_DEF_TYPE (stmt_info) == vect_nested_cycle)
1548 && ! PURE_SLP_STMT (stmt_info))
1549 ok = vectorizable_reduction (stmt_info, NULL, NULL, NULL, NULL,
1550 &cost_vec);
1553 /* SLP PHIs are tested by vect_slp_analyze_node_operations. */
1554 if (ok
1555 && STMT_VINFO_LIVE_P (stmt_info)
1556 && !PURE_SLP_STMT (stmt_info))
1557 ok = vectorizable_live_operation (stmt_info, NULL, NULL, -1, NULL,
1558 &cost_vec);
1560 if (!ok)
1561 return opt_result::failure_at (phi,
1562 "not vectorized: relevant phi not "
1563 "supported: %G",
1564 static_cast <gimple *> (phi));
1567 for (gimple_stmt_iterator si = gsi_start_bb (bb); !gsi_end_p (si);
1568 gsi_next (&si))
1570 gimple *stmt = gsi_stmt (si);
1571 if (!gimple_clobber_p (stmt))
1573 opt_result res
1574 = vect_analyze_stmt (loop_vinfo->lookup_stmt (stmt),
1575 &need_to_vectorize,
1576 NULL, NULL, &cost_vec);
1577 if (!res)
1578 return res;
1581 } /* bbs */
1583 add_stmt_costs (loop_vinfo->target_cost_data, &cost_vec);
1584 cost_vec.release ();
1586 /* All operations in the loop are either irrelevant (they deal with loop
1587 control, or are dead), or only used outside the loop and can be moved
1588 out of the loop (e.g. invariants, inductions). The loop can be
1589 optimized away by scalar optimizations. We're better off not
1590 touching this loop. */
1591 if (!need_to_vectorize)
1593 if (dump_enabled_p ())
1594 dump_printf_loc (MSG_NOTE, vect_location,
1595 "All the computation can be taken out of the loop.\n");
1596 return opt_result::failure_at
1597 (vect_location,
1598 "not vectorized: redundant loop. no profit to vectorize.\n");
1601 return opt_result::success ();
1604 /* Analyze the cost of the loop described by LOOP_VINFO. Decide if it
1605 is worthwhile to vectorize. Return 1 if definitely yes, 0 if
1606 definitely no, or -1 if it's worth retrying. */
1608 static int
1609 vect_analyze_loop_costing (loop_vec_info loop_vinfo)
1611 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1612 unsigned int assumed_vf = vect_vf_for_cost (loop_vinfo);
1614 /* Only fully-masked loops can have iteration counts less than the
1615 vectorization factor. */
1616 if (!LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
1618 HOST_WIDE_INT max_niter;
1620 if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
1621 max_niter = LOOP_VINFO_INT_NITERS (loop_vinfo);
1622 else
1623 max_niter = max_stmt_executions_int (loop);
1625 if (max_niter != -1
1626 && (unsigned HOST_WIDE_INT) max_niter < assumed_vf)
1628 if (dump_enabled_p ())
1629 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1630 "not vectorized: iteration count smaller than "
1631 "vectorization factor.\n");
1632 return 0;
1636 int min_profitable_iters, min_profitable_estimate;
1637 vect_estimate_min_profitable_iters (loop_vinfo, &min_profitable_iters,
1638 &min_profitable_estimate);
1640 if (min_profitable_iters < 0)
1642 if (dump_enabled_p ())
1643 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1644 "not vectorized: vectorization not profitable.\n");
1645 if (dump_enabled_p ())
1646 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1647 "not vectorized: vector version will never be "
1648 "profitable.\n");
1649 return -1;
1652 int min_scalar_loop_bound = (PARAM_VALUE (PARAM_MIN_VECT_LOOP_BOUND)
1653 * assumed_vf);
1655 /* Use the cost model only if it is more conservative than the user-specified
1656 threshold. */
1657 unsigned int th = (unsigned) MAX (min_scalar_loop_bound,
1658 min_profitable_iters);
1660 LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo) = th;
1662 if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
1663 && LOOP_VINFO_INT_NITERS (loop_vinfo) < th)
1665 if (dump_enabled_p ())
1666 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1667 "not vectorized: vectorization not profitable.\n");
1668 if (dump_enabled_p ())
1669 dump_printf_loc (MSG_NOTE, vect_location,
1670 "not vectorized: iteration count smaller than user "
1671 "specified loop bound parameter or minimum profitable "
1672 "iterations (whichever is more conservative).\n");
1673 return 0;
1676 HOST_WIDE_INT estimated_niter = estimated_stmt_executions_int (loop);
1677 if (estimated_niter == -1)
1678 estimated_niter = likely_max_stmt_executions_int (loop);
1679 if (estimated_niter != -1
1680 && ((unsigned HOST_WIDE_INT) estimated_niter
1681 < MAX (th, (unsigned) min_profitable_estimate)))
1683 if (dump_enabled_p ())
1684 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1685 "not vectorized: estimated iteration count too "
1686 "small.\n");
1687 if (dump_enabled_p ())
1688 dump_printf_loc (MSG_NOTE, vect_location,
1689 "not vectorized: estimated iteration count smaller "
1690 "than specified loop bound parameter or minimum "
1691 "profitable iterations (whichever is more "
1692 "conservative).\n");
1693 return -1;
1696 return 1;
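/* Worked example of the thresholds above (all numbers illustrative,
   assuming the default min-vect-loop-bound of 0): with an assumed VF of 4
   and a cost model reporting min_profitable_iters == 7, th becomes 7, so a
   loop known to run only 5 iterations is rejected, while one estimated at
   100 iterations passes both this check and the min_profitable_estimate
   check.  */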
1699 static opt_result
1700 vect_get_datarefs_in_loop (loop_p loop, basic_block *bbs,
1701 vec<data_reference_p> *datarefs,
1702 unsigned int *n_stmts)
1704 *n_stmts = 0;
1705 for (unsigned i = 0; i < loop->num_nodes; i++)
1706 for (gimple_stmt_iterator gsi = gsi_start_bb (bbs[i]);
1707 !gsi_end_p (gsi); gsi_next (&gsi))
1709 gimple *stmt = gsi_stmt (gsi);
1710 if (is_gimple_debug (stmt))
1711 continue;
1712 ++(*n_stmts);
1713 opt_result res = vect_find_stmt_data_reference (loop, stmt, datarefs);
1714 if (!res)
1716 if (is_gimple_call (stmt) && loop->safelen)
1718 tree fndecl = gimple_call_fndecl (stmt), op;
1719 if (fndecl != NULL_TREE)
1721 cgraph_node *node = cgraph_node::get (fndecl);
1722 if (node != NULL && node->simd_clones != NULL)
1724 unsigned int j, n = gimple_call_num_args (stmt);
1725 for (j = 0; j < n; j++)
1727 op = gimple_call_arg (stmt, j);
1728 if (DECL_P (op)
1729 || (REFERENCE_CLASS_P (op)
1730 && get_base_address (op)))
1731 break;
1733 op = gimple_call_lhs (stmt);
1734 /* Ignore #pragma omp declare simd functions
1735 if they don't have data references in the
1736 call stmt itself. */
1737 if (j == n
1738 && !(op
1739 && (DECL_P (op)
1740 || (REFERENCE_CLASS_P (op)
1741 && get_base_address (op)))))
1742 continue;
1746 return res;
1748 /* If dependence analysis would give up due to the limit on the
1749 number of datarefs, stop here and fail fatally. */
1750 if (datarefs->length ()
1751 > (unsigned)PARAM_VALUE (PARAM_LOOP_MAX_DATAREFS_FOR_DATADEPS))
1752 return opt_result::failure_at (stmt, "exceeded param "
1753 "loop-max-datarefs-for-datadeps\n");
1755 return opt_result::success ();
1758 /* Function vect_analyze_loop_2.
1760 Apply a set of analyses on LOOP, and create a loop_vec_info struct
1761 for it. The different analyses will record information in the
1762 loop_vec_info struct. */
1763 static opt_result
1764 vect_analyze_loop_2 (loop_vec_info loop_vinfo, bool &fatal, unsigned *n_stmts)
1766 opt_result ok = opt_result::success ();
1767 int res;
1768 unsigned int max_vf = MAX_VECTORIZATION_FACTOR;
1769 poly_uint64 min_vf = 2;
1771 /* The first group of checks is independent of the vector size. */
1772 fatal = true;
1774 /* Find all data references in the loop (which correspond to vdefs/vuses)
1775 and analyze their evolution in the loop. */
1777 loop_p loop = LOOP_VINFO_LOOP (loop_vinfo);
1779 /* Gather the data references and count stmts in the loop. */
1780 if (!LOOP_VINFO_DATAREFS (loop_vinfo).exists ())
1782 opt_result res
1783 = vect_get_datarefs_in_loop (loop, LOOP_VINFO_BBS (loop_vinfo),
1784 &LOOP_VINFO_DATAREFS (loop_vinfo),
1785 n_stmts);
1786 if (!res)
1788 if (dump_enabled_p ())
1789 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1790 "not vectorized: loop contains function "
1791 "calls or data references that cannot "
1792 "be analyzed\n");
1793 return res;
1795 loop_vinfo->shared->save_datarefs ();
1797 else
1798 loop_vinfo->shared->check_datarefs ();
1800 /* Analyze the data references and also adjust the minimal
1801 vectorization factor according to the loads and stores. */
1803 ok = vect_analyze_data_refs (loop_vinfo, &min_vf);
1804 if (!ok)
1806 if (dump_enabled_p ())
1807 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1808 "bad data references.\n");
1809 return ok;
1812 /* Classify all cross-iteration scalar data-flow cycles.
1813 Cross-iteration cycles caused by virtual phis are analyzed separately. */
1814 vect_analyze_scalar_cycles (loop_vinfo);
1816 vect_pattern_recog (loop_vinfo);
1818 vect_fixup_scalar_cycles_with_patterns (loop_vinfo);
1820 /* Analyze the access patterns of the data-refs in the loop (consecutive,
1821 complex, etc.). FORNOW: Only handle consecutive access pattern. */
1823 ok = vect_analyze_data_ref_accesses (loop_vinfo);
1824 if (!ok)
1826 if (dump_enabled_p ())
1827 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1828 "bad data access.\n");
1829 return ok;
1832 /* Data-flow analysis to detect stmts that do not need to be vectorized. */
1834 ok = vect_mark_stmts_to_be_vectorized (loop_vinfo);
1835 if (!ok)
1837 if (dump_enabled_p ())
1838 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1839 "unexpected pattern.\n");
1840 return ok;
1843 /* The rest of the analysis below depends on the vector size in some way, so failures are no longer fatal: we can retry with a different vector size. */
1844 fatal = false;
1846 /* Analyze data dependences between the data-refs in the loop
1847 and adjust the maximum vectorization factor according to
1848 the dependences.
1849 FORNOW: fail at the first data dependence that we encounter. */
1851 ok = vect_analyze_data_ref_dependences (loop_vinfo, &max_vf);
1852 if (!ok)
1854 if (dump_enabled_p ())
1855 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1856 "bad data dependence.\n");
1857 return ok;
1859 if (max_vf != MAX_VECTORIZATION_FACTOR
1860 && maybe_lt (max_vf, min_vf))
1861 return opt_result::failure_at (vect_location, "bad data dependence.\n");
1862 LOOP_VINFO_MAX_VECT_FACTOR (loop_vinfo) = max_vf;
1864 ok = vect_determine_vectorization_factor (loop_vinfo);
1865 if (!ok)
1867 if (dump_enabled_p ())
1868 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1869 "can't determine vectorization factor.\n");
1870 return ok;
1872 if (max_vf != MAX_VECTORIZATION_FACTOR
1873 && maybe_lt (max_vf, LOOP_VINFO_VECT_FACTOR (loop_vinfo)))
1874 return opt_result::failure_at (vect_location, "bad data dependence.\n");
1876 /* Compute the scalar iteration cost. */
1877 vect_compute_single_scalar_iteration_cost (loop_vinfo);
1879 poly_uint64 saved_vectorization_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
1880 unsigned th;
1882 /* Check the SLP opportunities in the loop, analyze and build SLP trees. */
1883 ok = vect_analyze_slp (loop_vinfo, *n_stmts);
1884 if (!ok)
1885 return ok;
1887 /* If there are any SLP instances mark them as pure_slp. */
1888 bool slp = vect_make_slp_decision (loop_vinfo);
1889 if (slp)
1891 /* Find stmts that need to be both vectorized and SLPed. */
1892 vect_detect_hybrid_slp (loop_vinfo);
1894 /* Update the vectorization factor based on the SLP decision. */
1895 vect_update_vf_for_slp (loop_vinfo);
1898 bool saved_can_fully_mask_p = LOOP_VINFO_CAN_FULLY_MASK_P (loop_vinfo);
1900 /* We don't expect to have to roll back to anything other than an empty
1901 set of rgroups. */
1902 gcc_assert (LOOP_VINFO_MASKS (loop_vinfo).is_empty ());
1904 /* This is the point where we can re-start analysis with SLP forced off. */
1905 start_over:
1907 /* Now the vectorization factor is final. */
1908 poly_uint64 vectorization_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
1909 gcc_assert (known_ne (vectorization_factor, 0U));
1911 if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo) && dump_enabled_p ())
1913 dump_printf_loc (MSG_NOTE, vect_location,
1914 "vectorization_factor = ");
1915 dump_dec (MSG_NOTE, vectorization_factor);
1916 dump_printf (MSG_NOTE, ", niters = %wd\n",
1917 LOOP_VINFO_INT_NITERS (loop_vinfo));
1920 HOST_WIDE_INT max_niter
1921 = likely_max_stmt_executions_int (LOOP_VINFO_LOOP (loop_vinfo));
1923 /* Analyze the alignment of the data-refs in the loop.
1924 Fail if a data reference is found that cannot be vectorized. */
1926 ok = vect_analyze_data_refs_alignment (loop_vinfo);
1927 if (!ok)
1929 if (dump_enabled_p ())
1930 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1931 "bad data alignment.\n");
1932 return ok;
1935 /* Prune the list of ddrs to be tested at run-time by versioning for alias.
1936 It is important to call pruning after vect_analyze_data_ref_accesses,
1937 since we use grouping information gathered by interleaving analysis. */
1938 ok = vect_prune_runtime_alias_test_list (loop_vinfo);
1939 if (!ok)
1940 return ok;
1942 /* Do not invoke vect_enhance_data_refs_alignment for epilogue
1943 vectorization, since we do not want to add extra peeling or
1944 add versioning for alignment. */
1945 if (!LOOP_VINFO_EPILOGUE_P (loop_vinfo))
1946 /* This pass will decide on using loop versioning and/or loop peeling in
1947 order to enhance the alignment of data references in the loop. */
1948 ok = vect_enhance_data_refs_alignment (loop_vinfo);
1949 else
1950 ok = vect_verify_datarefs_alignment (loop_vinfo);
1951 if (!ok)
1952 return ok;
1954 if (slp)
1956 /* Analyze operations in the SLP instances. Note this may
1957 remove unsupported SLP instances which makes the above
1958 SLP kind detection invalid. */
1959 unsigned old_size = LOOP_VINFO_SLP_INSTANCES (loop_vinfo).length ();
1960 vect_slp_analyze_operations (loop_vinfo);
1961 if (LOOP_VINFO_SLP_INSTANCES (loop_vinfo).length () != old_size)
1963 ok = opt_result::failure_at (vect_location,
1964 "unsupported SLP instances\n");
1965 goto again;
1969 /* Scan all the remaining operations in the loop that are not subject
1970 to SLP and make sure they are vectorizable. */
1971 ok = vect_analyze_loop_operations (loop_vinfo);
1972 if (!ok)
1974 if (dump_enabled_p ())
1975 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1976 "bad operation or unsupported loop bound.\n");
1977 return ok;
1980 /* Decide whether to use a fully-masked loop for this vectorization
1981 factor. */
1982 LOOP_VINFO_FULLY_MASKED_P (loop_vinfo)
1983 = (LOOP_VINFO_CAN_FULLY_MASK_P (loop_vinfo)
1984 && vect_verify_full_masking (loop_vinfo));
1985 if (dump_enabled_p ())
1987 if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
1988 dump_printf_loc (MSG_NOTE, vect_location,
1989 "using a fully-masked loop.\n");
1990 else
1991 dump_printf_loc (MSG_NOTE, vect_location,
1992 "not using a fully-masked loop.\n");
1995 /* If epilog loop is required because of data accesses with gaps,
1996 one additional iteration needs to be peeled. Check if there are
1997 enough iterations for vectorization. */
1998 if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo)
1999 && LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
2000 && !LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
2002 poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
2003 tree scalar_niters = LOOP_VINFO_NITERSM1 (loop_vinfo);
2005 if (known_lt (wi::to_widest (scalar_niters), vf))
2006 return opt_result::failure_at (vect_location,
2007 "loop has no enough iterations to"
2008 " support peeling for gaps.\n");
2011 /* Check that the costings of the loop make vectorizing worthwhile. */
2012 res = vect_analyze_loop_costing (loop_vinfo);
2013 if (res < 0)
2015 ok = opt_result::failure_at (vect_location,
2016 "Loop costings may not be worthwhile.\n");
2017 goto again;
2019 if (!res)
2020 return opt_result::failure_at (vect_location,
2021 "Loop costings not worthwhile.\n");
2023 /* Decide whether we need to create an epilogue loop to handle
2024 remaining scalar iterations. */
2025 th = LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo);
2027 unsigned HOST_WIDE_INT const_vf;
2028 if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
2029 /* The main loop handles all iterations. */
2030 LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo) = false;
2031 else if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
2032 && LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) >= 0)
2034 /* Work out the (constant) number of iterations that need to be
2035 peeled for reasons other than niters. */
2036 unsigned int peel_niter = LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo);
2037 if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo))
2038 peel_niter += 1;
2039 if (!multiple_p (LOOP_VINFO_INT_NITERS (loop_vinfo) - peel_niter,
2040 LOOP_VINFO_VECT_FACTOR (loop_vinfo)))
2041 LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo) = true;
2043 else if (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo)
2044 /* ??? When peeling for gaps but not alignment, we could
2045 try to check whether the (variable) niters is known to be
2046 VF * N + 1. That's something of a niche case though. */
2047 || LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo)
2048 || !LOOP_VINFO_VECT_FACTOR (loop_vinfo).is_constant (&const_vf)
2049 || ((tree_ctz (LOOP_VINFO_NITERS (loop_vinfo))
2050 < (unsigned) exact_log2 (const_vf))
2051 /* In case of versioning, check if the maximum number of
2052 iterations is greater than th. If they are identical,
2053 the epilogue is unnecessary. */
2054 && (!LOOP_REQUIRES_VERSIONING (loop_vinfo)
2055 || ((unsigned HOST_WIDE_INT) max_niter
2056 > (th / const_vf) * const_vf))))
2057 LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo) = true;
2059 /* If an epilogue loop is required make sure we can create one. */
2060 if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo)
2061 || LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo))
2063 if (dump_enabled_p ())
2064 dump_printf_loc (MSG_NOTE, vect_location, "epilog loop required\n");
2065 if (!vect_can_advance_ivs_p (loop_vinfo)
2066 || !slpeel_can_duplicate_loop_p (LOOP_VINFO_LOOP (loop_vinfo),
2067 single_exit (LOOP_VINFO_LOOP
2068 (loop_vinfo))))
2070 ok = opt_result::failure_at (vect_location,
2071 "not vectorized: can't create required "
2072 "epilog loop\n");
2073 goto again;
2077 /* During peeling, we need to check whether the number of loop iterations
2078 is enough for both the peeled prolog loop and the vector loop. This
2079 check can be merged with the threshold check of loop versioning, so
2080 increase the threshold for this case if necessary. */
2081 if (LOOP_REQUIRES_VERSIONING (loop_vinfo))
2083 poly_uint64 niters_th = 0;
2085 if (!vect_use_loop_mask_for_alignment_p (loop_vinfo))
2087 /* Niters for peeled prolog loop. */
2088 if (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) < 0)
2090 dr_vec_info *dr_info = LOOP_VINFO_UNALIGNED_DR (loop_vinfo);
2091 tree vectype = STMT_VINFO_VECTYPE (dr_info->stmt);
2092 niters_th += TYPE_VECTOR_SUBPARTS (vectype) - 1;
2094 else
2095 niters_th += LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo);
2098 /* Niters for at least one iteration of vectorized loop. */
2099 if (!LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
2100 niters_th += LOOP_VINFO_VECT_FACTOR (loop_vinfo);
2101 /* One additional iteration because of peeling for gaps. */
2102 if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo))
2103 niters_th += 1;
2104 LOOP_VINFO_VERSIONING_THRESHOLD (loop_vinfo) = niters_th;
2107 gcc_assert (known_eq (vectorization_factor,
2108 LOOP_VINFO_VECT_FACTOR (loop_vinfo)));
2110 /* Ok to vectorize! */
2111 return opt_result::success ();
2113 again:
2114 /* Ensure that "ok" is false (with an opt_problem if dumping is enabled). */
2115 gcc_assert (!ok);
2117 /* Try again with SLP forced off, but if we didn't do any SLP there is
2118 no point in re-trying. */
2119 if (!slp)
2120 return ok;
2122 /* If there are reduction chains, re-trying will fail anyway. */
2123 if (! LOOP_VINFO_REDUCTION_CHAINS (loop_vinfo).is_empty ())
2124 return ok;
2126 /* Likewise if the grouped loads or stores in the SLP cannot be handled
2127 via interleaving or lane instructions. */
2128 slp_instance instance;
2129 slp_tree node;
2130 unsigned i, j;
2131 FOR_EACH_VEC_ELT (LOOP_VINFO_SLP_INSTANCES (loop_vinfo), i, instance)
2133 stmt_vec_info vinfo;
2134 vinfo = SLP_TREE_SCALAR_STMTS (SLP_INSTANCE_TREE (instance))[0];
2135 if (! STMT_VINFO_GROUPED_ACCESS (vinfo))
2136 continue;
2137 vinfo = DR_GROUP_FIRST_ELEMENT (vinfo);
2138 unsigned int size = DR_GROUP_SIZE (vinfo);
2139 tree vectype = STMT_VINFO_VECTYPE (vinfo);
2140 if (! vect_store_lanes_supported (vectype, size, false)
2141 && ! known_eq (TYPE_VECTOR_SUBPARTS (vectype), 1U)
2142 && ! vect_grouped_store_supported (vectype, size))
2143 return opt_result::failure_at (vinfo->stmt,
2144 "unsupported grouped store\n");
2145 FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (instance), j, node)
2147 vinfo = SLP_TREE_SCALAR_STMTS (node)[0];
2148 vinfo = DR_GROUP_FIRST_ELEMENT (vinfo);
2149 bool single_element_p = !DR_GROUP_NEXT_ELEMENT (vinfo);
2150 size = DR_GROUP_SIZE (vinfo);
2151 vectype = STMT_VINFO_VECTYPE (vinfo);
2152 if (! vect_load_lanes_supported (vectype, size, false)
2153 && ! vect_grouped_load_supported (vectype, single_element_p,
2154 size))
2155 return opt_result::failure_at (vinfo->stmt,
2156 "unsupported grouped load\n");
2160 if (dump_enabled_p ())
2161 dump_printf_loc (MSG_NOTE, vect_location,
2162 "re-trying with SLP disabled\n");
2164 /* Roll back state appropriately. No SLP this time. */
2165 slp = false;
2166 /* Restore the vectorization factor as it was without SLP. */
2167 LOOP_VINFO_VECT_FACTOR (loop_vinfo) = saved_vectorization_factor;
2168 /* Free the SLP instances. */
2169 FOR_EACH_VEC_ELT (LOOP_VINFO_SLP_INSTANCES (loop_vinfo), j, instance)
2170 vect_free_slp_instance (instance, false);
2171 LOOP_VINFO_SLP_INSTANCES (loop_vinfo).release ();
2172 /* Reset SLP type to loop_vect on all stmts. */
2173 for (i = 0; i < LOOP_VINFO_LOOP (loop_vinfo)->num_nodes; ++i)
2175 basic_block bb = LOOP_VINFO_BBS (loop_vinfo)[i];
2176 for (gimple_stmt_iterator si = gsi_start_phis (bb);
2177 !gsi_end_p (si); gsi_next (&si))
2179 stmt_vec_info stmt_info = loop_vinfo->lookup_stmt (gsi_stmt (si));
2180 STMT_SLP_TYPE (stmt_info) = loop_vect;
2182 for (gimple_stmt_iterator si = gsi_start_bb (bb);
2183 !gsi_end_p (si); gsi_next (&si))
2185 stmt_vec_info stmt_info = loop_vinfo->lookup_stmt (gsi_stmt (si));
2186 STMT_SLP_TYPE (stmt_info) = loop_vect;
2187 if (STMT_VINFO_IN_PATTERN_P (stmt_info))
2189 gimple *pattern_def_seq = STMT_VINFO_PATTERN_DEF_SEQ (stmt_info);
2190 stmt_info = STMT_VINFO_RELATED_STMT (stmt_info);
2191 STMT_SLP_TYPE (stmt_info) = loop_vect;
2192 for (gimple_stmt_iterator pi = gsi_start (pattern_def_seq);
2193 !gsi_end_p (pi); gsi_next (&pi))
2194 STMT_SLP_TYPE (loop_vinfo->lookup_stmt (gsi_stmt (pi)))
2195 = loop_vect;
2199 /* Free optimized alias test DDRS. */
2200 LOOP_VINFO_LOWER_BOUNDS (loop_vinfo).truncate (0);
2201 LOOP_VINFO_COMP_ALIAS_DDRS (loop_vinfo).release ();
2202 LOOP_VINFO_CHECK_UNEQUAL_ADDRS (loop_vinfo).release ();
2203 /* Reset target cost data. */
2204 destroy_cost_data (LOOP_VINFO_TARGET_COST_DATA (loop_vinfo));
2205 LOOP_VINFO_TARGET_COST_DATA (loop_vinfo)
2206 = init_cost (LOOP_VINFO_LOOP (loop_vinfo));
2207 /* Reset accumulated rgroup information. */
2208 release_vec_loop_masks (&LOOP_VINFO_MASKS (loop_vinfo));
2209 /* Reset assorted flags. */
2210 LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo) = false;
2211 LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo) = false;
2212 LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo) = 0;
2213 LOOP_VINFO_VERSIONING_THRESHOLD (loop_vinfo) = 0;
2214 LOOP_VINFO_CAN_FULLY_MASK_P (loop_vinfo) = saved_can_fully_mask_p;
2216 goto start_over;
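/* Worked example (illustrative only, with made-up numbers): if niters is
   known to be 17, the vectorization factor is 4, peeling for alignment
   needs 1 iteration and there is no peeling for gaps, then peel_niter = 1
   and (17 - 1) % 4 == 0, so LOOP_VINFO_PEELING_FOR_NITER stays false and
   no epilogue loop is needed.  With niters = 18 instead, (18 - 1) % 4 != 0,
   so the flag is set and an epilogue loop handles the one left-over
   scalar iteration.  */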
2219 /* Function vect_analyze_loop.
2221 Apply a set of analyses on LOOP, and create a loop_vec_info struct
2222 for it. The different analyses will record information in the
2223 loop_vec_info struct. If ORIG_LOOP_VINFO is not NULL, the epilogue must
2224 be vectorized. */
2225 opt_loop_vec_info
2226 vect_analyze_loop (struct loop *loop, loop_vec_info orig_loop_vinfo,
2227 vec_info_shared *shared)
2229 auto_vector_sizes vector_sizes;
2231 /* Autodetect first vector size we try. */
2232 current_vector_size = 0;
2233 targetm.vectorize.autovectorize_vector_sizes (&vector_sizes);
2234 unsigned int next_size = 0;
2236 DUMP_VECT_SCOPE ("analyze_loop_nest");
2238 if (loop_outer (loop)
2239 && loop_vec_info_for_loop (loop_outer (loop))
2240 && LOOP_VINFO_VECTORIZABLE_P (loop_vec_info_for_loop (loop_outer (loop))))
2241 return opt_loop_vec_info::failure_at (vect_location,
2242 "outer-loop already vectorized.\n");
2244 if (!find_loop_nest (loop, &shared->loop_nest))
2245 return opt_loop_vec_info::failure_at
2246 (vect_location,
2247 "not vectorized: loop nest containing two or more consecutive inner"
2248 " loops cannot be vectorized\n");
2250 unsigned n_stmts = 0;
2251 poly_uint64 autodetected_vector_size = 0;
2252 while (1)
2254 /* Check the CFG characteristics of the loop (nesting, entry/exit). */
2255 opt_loop_vec_info loop_vinfo
2256 = vect_analyze_loop_form (loop, shared);
2257 if (!loop_vinfo)
2259 if (dump_enabled_p ())
2260 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2261 "bad loop form.\n");
2262 return loop_vinfo;
2265 bool fatal = false;
2267 if (orig_loop_vinfo)
2268 LOOP_VINFO_ORIG_LOOP_INFO (loop_vinfo) = orig_loop_vinfo;
2270 opt_result res = vect_analyze_loop_2 (loop_vinfo, fatal, &n_stmts);
2271 if (res)
2273 LOOP_VINFO_VECTORIZABLE_P (loop_vinfo) = 1;
2275 return loop_vinfo;
2278 delete loop_vinfo;
2280 if (next_size == 0)
2281 autodetected_vector_size = current_vector_size;
2283 if (next_size < vector_sizes.length ()
2284 && known_eq (vector_sizes[next_size], autodetected_vector_size))
2285 next_size += 1;
2287 if (fatal
2288 || next_size == vector_sizes.length ()
2289 || known_eq (current_vector_size, 0U))
2290 return opt_loop_vec_info::propagate_failure (res);
2292 /* Try the next biggest vector size. */
2293 current_vector_size = vector_sizes[next_size++];
2294 if (dump_enabled_p ())
2296 dump_printf_loc (MSG_NOTE, vect_location,
2297 "***** Re-trying analysis with "
2298 "vector size ");
2299 dump_dec (MSG_NOTE, current_vector_size);
2300 dump_printf (MSG_NOTE, "\n");
2305 /* Return true if there is an in-order reduction function for CODE, storing
2306 it in *REDUC_FN if so. */
2308 static bool
2309 fold_left_reduction_fn (tree_code code, internal_fn *reduc_fn)
2311 switch (code)
2313 case PLUS_EXPR:
2314 *reduc_fn = IFN_FOLD_LEFT_PLUS;
2315 return true;
2317 default:
2318 return false;
2322 /* Function reduction_fn_for_scalar_code
2324 Input:
2325 CODE - tree_code of a reduction operation.
2327 Output:
2328 REDUC_FN - the corresponding internal function to be used to reduce the
2329 vector of partial results into a single scalar result, or IFN_LAST
2330 if the operation is a supported reduction operation, but does not have
2331 such an internal function.
2333 Return FALSE if CODE currently cannot be vectorized as a reduction. */
2335 static bool
2336 reduction_fn_for_scalar_code (enum tree_code code, internal_fn *reduc_fn)
2338 switch (code)
2340 case MAX_EXPR:
2341 *reduc_fn = IFN_REDUC_MAX;
2342 return true;
2344 case MIN_EXPR:
2345 *reduc_fn = IFN_REDUC_MIN;
2346 return true;
2348 case PLUS_EXPR:
2349 *reduc_fn = IFN_REDUC_PLUS;
2350 return true;
2352 case BIT_AND_EXPR:
2353 *reduc_fn = IFN_REDUC_AND;
2354 return true;
2356 case BIT_IOR_EXPR:
2357 *reduc_fn = IFN_REDUC_IOR;
2358 return true;
2360 case BIT_XOR_EXPR:
2361 *reduc_fn = IFN_REDUC_XOR;
2362 return true;
2364 case MULT_EXPR:
2365 case MINUS_EXPR:
2366 *reduc_fn = IFN_LAST;
2367 return true;
2369 default:
2370 return false;
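/* Illustrative example (not part of the pass), assuming an array A: for a
   scalar max reduction such as

       int m = INT_MIN;
       for (int i = 0; i < n; i++)
         m = m > a[i] ? m : a[i];

   the loop body is vectorized with element-wise MAX_EXPR on a vector
   accumulator, and the function above supplies IFN_REDUC_MAX as the
   internal function used after the loop to fold the vector of partial
   maxima into the single scalar result.  */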
2374 /* If there is a neutral value X such that SLP reduction NODE would not
2375 be affected by the introduction of additional X elements, return that X,
2376 otherwise return null. CODE is the code of the reduction. REDUC_CHAIN
2377 is true if the SLP statements perform a single reduction, false if each
2378 statement performs an independent reduction. */
2380 static tree
2381 neutral_op_for_slp_reduction (slp_tree slp_node, tree_code code,
2382 bool reduc_chain)
2384 vec<stmt_vec_info> stmts = SLP_TREE_SCALAR_STMTS (slp_node);
2385 stmt_vec_info stmt_vinfo = stmts[0];
2386 tree vector_type = STMT_VINFO_VECTYPE (stmt_vinfo);
2387 tree scalar_type = TREE_TYPE (vector_type);
2388 struct loop *loop = gimple_bb (stmt_vinfo->stmt)->loop_father;
2389 gcc_assert (loop);
2391 switch (code)
2393 case WIDEN_SUM_EXPR:
2394 case DOT_PROD_EXPR:
2395 case SAD_EXPR:
2396 case PLUS_EXPR:
2397 case MINUS_EXPR:
2398 case BIT_IOR_EXPR:
2399 case BIT_XOR_EXPR:
2400 return build_zero_cst (scalar_type);
2402 case MULT_EXPR:
2403 return build_one_cst (scalar_type);
2405 case BIT_AND_EXPR:
2406 return build_all_ones_cst (scalar_type);
2408 case MAX_EXPR:
2409 case MIN_EXPR:
2410 /* For MIN/MAX the initial values are neutral. A reduction chain
2411 has only a single initial value, so that value is neutral for
2412 all statements. */
2413 if (reduc_chain)
2414 return PHI_ARG_DEF_FROM_EDGE (stmt_vinfo->stmt,
2415 loop_preheader_edge (loop));
2416 return NULL_TREE;
2418 default:
2419 return NULL_TREE;
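/* Illustrative example (not part of the pass): in a PLUS_EXPR reduction
   chain such as

       s += a[2*i]; s += a[2*i+1];

   summing in any number of extra 0 elements leaves S unchanged, so 0 is
   the neutral value; likewise 1 for MULT_EXPR and an all-ones constant
   for BIT_AND_EXPR.  MIN and MAX have no universal neutral constant,
   which is why the initial value of the reduction is used instead, and
   only when the statements form a single reduction chain.  */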
2423 /* Error reporting helper for vect_is_simple_reduction below. GIMPLE statement
2424 STMT is printed with a message MSG. */
2426 static void
2427 report_vect_op (dump_flags_t msg_type, gimple *stmt, const char *msg)
2429 dump_printf_loc (msg_type, vect_location, "%s%G", msg, stmt);
2432 /* DEF_STMT_INFO occurs in a loop that contains a potential reduction
2433 operation. Return true if the results of DEF_STMT_INFO are something
2434 that can be accumulated by such a reduction. */
2436 static bool
2437 vect_valid_reduction_input_p (stmt_vec_info def_stmt_info)
2439 return (is_gimple_assign (def_stmt_info->stmt)
2440 || is_gimple_call (def_stmt_info->stmt)
2441 || STMT_VINFO_DEF_TYPE (def_stmt_info) == vect_induction_def
2442 || (gimple_code (def_stmt_info->stmt) == GIMPLE_PHI
2443 && STMT_VINFO_DEF_TYPE (def_stmt_info) == vect_internal_def
2444 && !is_loop_header_bb_p (gimple_bb (def_stmt_info->stmt))));
2447 /* Detect SLP reduction of the form:
2449 #a1 = phi <a5, a0>
2450 a2 = operation (a1)
2451 a3 = operation (a2)
2452 a4 = operation (a3)
2453 a5 = operation (a4)
2455 #a = phi <a5>
2457 PHI is the reduction phi node (#a1 = phi <a5, a0> above)
2458 FIRST_STMT is the first reduction stmt in the chain
2459 (a2 = operation (a1)).
2461 Return TRUE if a reduction chain was detected. */
2463 static bool
2464 vect_is_slp_reduction (loop_vec_info loop_info, gimple *phi,
2465 gimple *first_stmt)
2467 struct loop *loop = (gimple_bb (phi))->loop_father;
2468 struct loop *vect_loop = LOOP_VINFO_LOOP (loop_info);
2469 enum tree_code code;
2470 gimple *loop_use_stmt = NULL;
2471 stmt_vec_info use_stmt_info;
2472 tree lhs;
2473 imm_use_iterator imm_iter;
2474 use_operand_p use_p;
2475 int nloop_uses, size = 0, n_out_of_loop_uses;
2476 bool found = false;
2478 if (loop != vect_loop)
2479 return false;
2481 auto_vec<stmt_vec_info, 8> reduc_chain;
2482 lhs = PHI_RESULT (phi);
2483 code = gimple_assign_rhs_code (first_stmt);
2484 while (1)
2486 nloop_uses = 0;
2487 n_out_of_loop_uses = 0;
2488 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, lhs)
2490 gimple *use_stmt = USE_STMT (use_p);
2491 if (is_gimple_debug (use_stmt))
2492 continue;
2494 /* Check if we got back to the reduction phi. */
2495 if (use_stmt == phi)
2497 loop_use_stmt = use_stmt;
2498 found = true;
2499 break;
2502 if (flow_bb_inside_loop_p (loop, gimple_bb (use_stmt)))
2504 loop_use_stmt = use_stmt;
2505 nloop_uses++;
2507 else
2508 n_out_of_loop_uses++;
2510 /* There can be either a single use in the loop or two uses in
2511 phi nodes. */
2512 if (nloop_uses > 1 || (n_out_of_loop_uses && nloop_uses))
2513 return false;
2516 if (found)
2517 break;
2519 /* We reached a statement with no loop uses. */
2520 if (nloop_uses == 0)
2521 return false;
2523 /* This is a loop exit phi, and we haven't reached the reduction phi. */
2524 if (gimple_code (loop_use_stmt) == GIMPLE_PHI)
2525 return false;
2527 if (!is_gimple_assign (loop_use_stmt)
2528 || code != gimple_assign_rhs_code (loop_use_stmt)
2529 || !flow_bb_inside_loop_p (loop, gimple_bb (loop_use_stmt)))
2530 return false;
2532 /* Insert USE_STMT into reduction chain. */
2533 use_stmt_info = loop_info->lookup_stmt (loop_use_stmt);
2534 reduc_chain.safe_push (use_stmt_info);
2536 lhs = gimple_assign_lhs (loop_use_stmt);
2537 size++;
2540 if (!found || loop_use_stmt != phi || size < 2)
2541 return false;
2543 /* Swap the operands, if needed, to make the reduction operand be the second
2544 operand. */
2545 lhs = PHI_RESULT (phi);
2546 for (unsigned i = 0; i < reduc_chain.length (); ++i)
2548 gassign *next_stmt = as_a <gassign *> (reduc_chain[i]->stmt);
2549 if (gimple_assign_rhs2 (next_stmt) == lhs)
2551 tree op = gimple_assign_rhs1 (next_stmt);
2552 stmt_vec_info def_stmt_info = loop_info->lookup_def (op);
2554 /* Check that the other def is either defined in the loop
2555 ("vect_internal_def"), or it's an induction (defined by a
2556 loop-header phi-node). */
2557 if (def_stmt_info
2558 && flow_bb_inside_loop_p (loop, gimple_bb (def_stmt_info->stmt))
2559 && vect_valid_reduction_input_p (def_stmt_info))
2561 lhs = gimple_assign_lhs (next_stmt);
2562 continue;
2565 return false;
2567 else
2569 tree op = gimple_assign_rhs2 (next_stmt);
2570 stmt_vec_info def_stmt_info = loop_info->lookup_def (op);
2572 /* Check that the other def is either defined in the loop
2573 ("vect_internal_def"), or it's an induction (defined by a
2574 loop-header phi-node). */
2575 if (def_stmt_info
2576 && flow_bb_inside_loop_p (loop, gimple_bb (def_stmt_info->stmt))
2577 && vect_valid_reduction_input_p (def_stmt_info))
2579 if (dump_enabled_p ())
2580 dump_printf_loc (MSG_NOTE, vect_location, "swapping oprnds: %G",
2581 next_stmt);
2583 swap_ssa_operands (next_stmt,
2584 gimple_assign_rhs1_ptr (next_stmt),
2585 gimple_assign_rhs2_ptr (next_stmt));
2586 update_stmt (next_stmt);
2588 if (CONSTANT_CLASS_P (gimple_assign_rhs1 (next_stmt)))
2589 LOOP_VINFO_OPERANDS_SWAPPED (loop_info) = true;
2591 else
2592 return false;
2595 lhs = gimple_assign_lhs (next_stmt);
2598 /* Build up the actual chain. */
2599 for (unsigned i = 0; i < reduc_chain.length () - 1; ++i)
2601 REDUC_GROUP_FIRST_ELEMENT (reduc_chain[i]) = reduc_chain[0];
2602 REDUC_GROUP_NEXT_ELEMENT (reduc_chain[i]) = reduc_chain[i+1];
2604 REDUC_GROUP_FIRST_ELEMENT (reduc_chain.last ()) = reduc_chain[0];
2605 REDUC_GROUP_NEXT_ELEMENT (reduc_chain.last ()) = NULL;
2607 /* Save the chain for further analysis in SLP detection. */
2608 LOOP_VINFO_REDUCTION_CHAINS (loop_info).safe_push (reduc_chain[0]);
2609 REDUC_GROUP_SIZE (reduc_chain[0]) = size;
2611 return true;
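/* Illustrative source-level example (not part of the pass) of the chain
   shape detected above, assuming an array A and an accumulator S:

       for (i = 0; i < n; i++)
         {
           s = s + a[4*i];      // a2 = operation (a1)
           s = s + a[4*i+1];    // a3 = operation (a2)
           s = s + a[4*i+2];    // a4 = operation (a3)
           s = s + a[4*i+3];    // a5 = operation (a4)
         }

   Each statement feeds the next and the last one feeds the loop PHI,
   giving a reduction chain of size 4 that SLP can vectorize as a group.  */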
2614 /* Return true if we need an in-order reduction for operation CODE
2615 on type TYPE. NEED_WRAPPING_INTEGRAL_OVERFLOW is true if integer
2616 overflow must wrap. */
2618 static bool
2619 needs_fold_left_reduction_p (tree type, tree_code code,
2620 bool need_wrapping_integral_overflow)
2622 /* CHECKME: check for !flag_finite_math_only too? */
2623 if (SCALAR_FLOAT_TYPE_P (type))
2624 switch (code)
2626 case MIN_EXPR:
2627 case MAX_EXPR:
2628 return false;
2630 default:
2631 return !flag_associative_math;
2634 if (INTEGRAL_TYPE_P (type))
2636 if (!operation_no_trapping_overflow (type, code))
2637 return true;
2638 if (need_wrapping_integral_overflow
2639 && !TYPE_OVERFLOW_WRAPS (type)
2640 && operation_can_overflow (code))
2641 return true;
2642 return false;
2645 if (SAT_FIXED_POINT_TYPE_P (type))
2646 return true;
2648 return false;
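/* Illustrative example (not part of the pass), assuming an array X: for

       float s = 0.0f;
       for (int i = 0; i < n; i++)
         s += x[i];

   reassociating the additions changes the rounding, so unless
   -fassociative-math is in effect the function above requests an
   in-order (fold-left) reduction.  Float MIN/MAX reductions are
   insensitive to the evaluation order and never need it, while a signed
   integer sum compiled with -ftrapv must stay in order because the
   vectorized partial sums could trap where the scalar loop would not.  */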
2651 /* Return true if the reduction PHI in LOOP with latch arg LOOP_ARG and
2652 reduction operation CODE has a handled computation expression. */
2654 bool
2655 check_reduction_path (dump_user_location_t loc, loop_p loop, gphi *phi,
2656 tree loop_arg, enum tree_code code)
2658 auto_vec<std::pair<ssa_op_iter, use_operand_p> > path;
2659 auto_bitmap visited;
2660 tree lookfor = PHI_RESULT (phi);
2661 ssa_op_iter curri;
2662 use_operand_p curr = op_iter_init_phiuse (&curri, phi, SSA_OP_USE);
2663 while (USE_FROM_PTR (curr) != loop_arg)
2664 curr = op_iter_next_use (&curri);
2665 curri.i = curri.numops;
2668 path.safe_push (std::make_pair (curri, curr));
2669 tree use = USE_FROM_PTR (curr);
2670 if (use == lookfor)
2671 break;
2672 gimple *def = SSA_NAME_DEF_STMT (use);
2673 if (gimple_nop_p (def)
2674 || ! flow_bb_inside_loop_p (loop, gimple_bb (def)))
2676 pop:
2679 std::pair<ssa_op_iter, use_operand_p> x = path.pop ();
2680 curri = x.first;
2681 curr = x.second;
2683 curr = op_iter_next_use (&curri);
2684 /* Skip already visited or non-SSA operands (from iterating
2685 over PHI args). */
2686 while (curr != NULL_USE_OPERAND_P
2687 && (TREE_CODE (USE_FROM_PTR (curr)) != SSA_NAME
2688 || ! bitmap_set_bit (visited,
2689 SSA_NAME_VERSION
2690 (USE_FROM_PTR (curr)))));
2692 while (curr == NULL_USE_OPERAND_P && ! path.is_empty ());
2693 if (curr == NULL_USE_OPERAND_P)
2694 break;
2696 else
2698 if (gimple_code (def) == GIMPLE_PHI)
2699 curr = op_iter_init_phiuse (&curri, as_a <gphi *>(def), SSA_OP_USE);
2700 else
2701 curr = op_iter_init_use (&curri, def, SSA_OP_USE);
2702 while (curr != NULL_USE_OPERAND_P
2703 && (TREE_CODE (USE_FROM_PTR (curr)) != SSA_NAME
2704 || ! bitmap_set_bit (visited,
2705 SSA_NAME_VERSION
2706 (USE_FROM_PTR (curr)))))
2707 curr = op_iter_next_use (&curri);
2708 if (curr == NULL_USE_OPERAND_P)
2709 goto pop;
2712 while (1);
2713 if (dump_file && (dump_flags & TDF_DETAILS))
2715 dump_printf_loc (MSG_NOTE, loc, "reduction path: ");
2716 unsigned i;
2717 std::pair<ssa_op_iter, use_operand_p> *x;
2718 FOR_EACH_VEC_ELT (path, i, x)
2719 dump_printf (MSG_NOTE, "%T ", USE_FROM_PTR (x->second));
2720 dump_printf (MSG_NOTE, "\n");
2723 /* Check whether the reduction path detected is valid. */
2724 bool fail = path.length () == 0;
2725 bool neg = false;
2726 for (unsigned i = 1; i < path.length (); ++i)
2728 gimple *use_stmt = USE_STMT (path[i].second);
2729 tree op = USE_FROM_PTR (path[i].second);
2730 if (! has_single_use (op)
2731 || ! is_gimple_assign (use_stmt))
2733 fail = true;
2734 break;
2736 if (gimple_assign_rhs_code (use_stmt) != code)
2738 if (code == PLUS_EXPR
2739 && gimple_assign_rhs_code (use_stmt) == MINUS_EXPR)
2741 /* Track whether we negate the reduction value each iteration. */
2742 if (gimple_assign_rhs2 (use_stmt) == op)
2743 neg = ! neg;
2745 else
2747 fail = true;
2748 break;
2752 return ! fail && ! neg;
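/* Illustrative example (not part of the pass): for a reduction PHI s_1
   whose latch value s_3 is computed as

       t_2 = s_1 + a[i];
       s_3 = t_2 + b[i];

   the walk above records the path linking s_1 to s_3 through t_2; every
   intermediate value has a single use and every statement uses the
   expected PLUS_EXPR, so the path is accepted.  If the second statement
   were "s_3 = b[i] - t_2" the accumulated value would be negated each
   iteration, NEG would end up true and the path would be rejected.  */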
2756 /* Function vect_is_simple_reduction
2758 (1) Detect a cross-iteration def-use cycle that represents a simple
2759 reduction computation. We look for the following pattern:
2761 loop_header:
2762 a1 = phi < a0, a2 >
2763 a3 = ...
2764 a2 = operation (a3, a1)
2768 a3 = ...
2769 loop_header:
2770 a1 = phi < a0, a2 >
2771 a2 = operation (a3, a1)
2773 such that:
2774 1. operation is commutative and associative and it is safe to
2775 change the order of the computation
2776 2. no uses for a2 in the loop (a2 is used out of the loop)
2777 3. no uses of a1 in the loop besides the reduction operation
2778 4. no uses of a1 outside the loop.
2780 Conditions 1,4 are tested here.
2781 Conditions 2,3 are tested in vect_mark_stmts_to_be_vectorized.
2783 (2) Detect a cross-iteration def-use cycle in nested loops, i.e.,
2784 nested cycles.
2786 (3) Detect cycles of phi nodes in outer-loop vectorization, i.e., double
2787 reductions:
2789 a1 = phi < a0, a2 >
2790 inner loop (def of a3)
2791 a2 = phi < a3 >
2793 (4) Detect condition expressions, i.e.:
2794 for (int i = 0; i < N; i++)
2795 if (a[i] < val)
2796 ret_val = a[i];
2800 static stmt_vec_info
2801 vect_is_simple_reduction (loop_vec_info loop_info, stmt_vec_info phi_info,
2802 bool *double_reduc,
2803 bool need_wrapping_integral_overflow,
2804 enum vect_reduction_type *v_reduc_type)
2806 gphi *phi = as_a <gphi *> (phi_info->stmt);
2807 struct loop *loop = (gimple_bb (phi))->loop_father;
2808 struct loop *vect_loop = LOOP_VINFO_LOOP (loop_info);
2809 bool nested_in_vect_loop = flow_loop_nested_p (vect_loop, loop);
2810 gimple *phi_use_stmt = NULL;
2811 enum tree_code orig_code, code;
2812 tree op1, op2, op3 = NULL_TREE, op4 = NULL_TREE;
2813 tree type;
2814 tree name;
2815 imm_use_iterator imm_iter;
2816 use_operand_p use_p;
2817 bool phi_def;
2819 *double_reduc = false;
2820 *v_reduc_type = TREE_CODE_REDUCTION;
2822 tree phi_name = PHI_RESULT (phi);
2823 /* ??? If there are no uses of the PHI result the inner loop reduction
2824 won't be detected as possibly double-reduction by vectorizable_reduction
2825 because that tries to walk the PHI arg from the preheader edge which
2826 can be constant. See PR60382. */
2827 if (has_zero_uses (phi_name))
2828 return NULL;
2829 unsigned nphi_def_loop_uses = 0;
2830 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, phi_name)
2832 gimple *use_stmt = USE_STMT (use_p);
2833 if (is_gimple_debug (use_stmt))
2834 continue;
2836 if (!flow_bb_inside_loop_p (loop, gimple_bb (use_stmt)))
2838 if (dump_enabled_p ())
2839 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2840 "intermediate value used outside loop.\n");
2842 return NULL;
2845 nphi_def_loop_uses++;
2846 phi_use_stmt = use_stmt;
2849 edge latch_e = loop_latch_edge (loop);
2850 tree loop_arg = PHI_ARG_DEF_FROM_EDGE (phi, latch_e);
2851 if (TREE_CODE (loop_arg) != SSA_NAME)
2853 if (dump_enabled_p ())
2854 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2855 "reduction: not ssa_name: %T\n", loop_arg);
2856 return NULL;
2859 stmt_vec_info def_stmt_info = loop_info->lookup_def (loop_arg);
2860 if (!def_stmt_info
2861 || !flow_bb_inside_loop_p (loop, gimple_bb (def_stmt_info->stmt)))
2862 return NULL;
2864 if (gassign *def_stmt = dyn_cast <gassign *> (def_stmt_info->stmt))
2866 name = gimple_assign_lhs (def_stmt);
2867 phi_def = false;
2869 else if (gphi *def_stmt = dyn_cast <gphi *> (def_stmt_info->stmt))
2871 name = PHI_RESULT (def_stmt);
2872 phi_def = true;
2874 else
2876 if (dump_enabled_p ())
2877 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2878 "reduction: unhandled reduction operation: %G",
2879 def_stmt_info->stmt);
2880 return NULL;
2883 unsigned nlatch_def_loop_uses = 0;
2884 auto_vec<gphi *, 3> lcphis;
2885 bool inner_loop_of_double_reduc = false;
2886 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, name)
2888 gimple *use_stmt = USE_STMT (use_p);
2889 if (is_gimple_debug (use_stmt))
2890 continue;
2891 if (flow_bb_inside_loop_p (loop, gimple_bb (use_stmt)))
2892 nlatch_def_loop_uses++;
2893 else
2895 /* We can have more than one loop-closed PHI. */
2896 lcphis.safe_push (as_a <gphi *> (use_stmt));
2897 if (nested_in_vect_loop
2898 && (STMT_VINFO_DEF_TYPE (loop_info->lookup_stmt (use_stmt))
2899 == vect_double_reduction_def))
2900 inner_loop_of_double_reduc = true;
2904 /* If this isn't a nested cycle, or if the nested cycle reduction value
2905 is used outside of the inner loop, we cannot handle uses of the
2906 reduction value. */
2907 if ((!nested_in_vect_loop || inner_loop_of_double_reduc)
2908 && (nlatch_def_loop_uses > 1 || nphi_def_loop_uses > 1))
2910 if (dump_enabled_p ())
2911 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2912 "reduction used in loop.\n");
2913 return NULL;
2916 /* If DEF_STMT is a phi node itself, we expect it to have a single argument
2917 defined in the inner loop. */
2918 if (phi_def)
2920 gphi *def_stmt = as_a <gphi *> (def_stmt_info->stmt);
2921 op1 = PHI_ARG_DEF (def_stmt, 0);
2923 if (gimple_phi_num_args (def_stmt) != 1
2924 || TREE_CODE (op1) != SSA_NAME)
2926 if (dump_enabled_p ())
2927 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2928 "unsupported phi node definition.\n");
2930 return NULL;
2933 gimple *def1 = SSA_NAME_DEF_STMT (op1);
2934 if (gimple_bb (def1)
2935 && flow_bb_inside_loop_p (loop, gimple_bb (def_stmt))
2936 && loop->inner
2937 && flow_bb_inside_loop_p (loop->inner, gimple_bb (def1))
2938 && is_gimple_assign (def1)
2939 && is_a <gphi *> (phi_use_stmt)
2940 && flow_bb_inside_loop_p (loop->inner, gimple_bb (phi_use_stmt)))
2942 if (dump_enabled_p ())
2943 report_vect_op (MSG_NOTE, def_stmt,
2944 "detected double reduction: ");
2946 *double_reduc = true;
2947 return def_stmt_info;
2950 return NULL;
2953 /* If we are vectorizing an inner reduction we are executing that
2954 in the original order only in case we are not dealing with a
2955 double reduction. */
2956 bool check_reduction = true;
2957 if (flow_loop_nested_p (vect_loop, loop))
2959 gphi *lcphi;
2960 unsigned i;
2961 check_reduction = false;
2962 FOR_EACH_VEC_ELT (lcphis, i, lcphi)
2963 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, gimple_phi_result (lcphi))
2965 gimple *use_stmt = USE_STMT (use_p);
2966 if (is_gimple_debug (use_stmt))
2967 continue;
2968 if (! flow_bb_inside_loop_p (vect_loop, gimple_bb (use_stmt)))
2969 check_reduction = true;
2973 gassign *def_stmt = as_a <gassign *> (def_stmt_info->stmt);
2974 code = orig_code = gimple_assign_rhs_code (def_stmt);
2976 if (nested_in_vect_loop && !check_reduction)
2978 /* FIXME: Even for non-reductions code generation is funneled
2979 through vectorizable_reduction for the stmt defining the
2980 PHI latch value. So we have to artificially restrict ourselves
2982 to the supported operations. */
2982 switch (get_gimple_rhs_class (code))
2984 case GIMPLE_BINARY_RHS:
2985 case GIMPLE_TERNARY_RHS:
2986 break;
2987 default:
2988 /* Not supported by vectorizable_reduction. */
2989 if (dump_enabled_p ())
2990 report_vect_op (MSG_MISSED_OPTIMIZATION, def_stmt,
2991 "nested cycle: not handled operation: ");
2992 return NULL;
2994 if (dump_enabled_p ())
2995 report_vect_op (MSG_NOTE, def_stmt, "detected nested cycle: ");
2996 return def_stmt_info;
2999 /* We can handle "res -= x[i]", which is non-associative, by
3000 simply rewriting it into "res += -x[i]". Avoid changing the
3001 gimple instruction for the first simple tests and only do this
3002 if we're allowed to change the code at all. */
3003 if (code == MINUS_EXPR && gimple_assign_rhs2 (def_stmt) != phi_name)
3004 code = PLUS_EXPR;
3006 if (code == COND_EXPR)
3008 if (! nested_in_vect_loop)
3009 *v_reduc_type = COND_REDUCTION;
3011 op3 = gimple_assign_rhs1 (def_stmt);
3012 if (COMPARISON_CLASS_P (op3))
3014 op4 = TREE_OPERAND (op3, 1);
3015 op3 = TREE_OPERAND (op3, 0);
3017 if (op3 == phi_name || op4 == phi_name)
3019 if (dump_enabled_p ())
3020 report_vect_op (MSG_MISSED_OPTIMIZATION, def_stmt,
3021 "reduction: condition depends on previous"
3022 " iteration: ");
3023 return NULL;
3026 op1 = gimple_assign_rhs2 (def_stmt);
3027 op2 = gimple_assign_rhs3 (def_stmt);
3029 else if (!commutative_tree_code (code) || !associative_tree_code (code))
3031 if (dump_enabled_p ())
3032 report_vect_op (MSG_MISSED_OPTIMIZATION, def_stmt,
3033 "reduction: not commutative/associative: ");
3034 return NULL;
3036 else if (get_gimple_rhs_class (code) == GIMPLE_BINARY_RHS)
3038 op1 = gimple_assign_rhs1 (def_stmt);
3039 op2 = gimple_assign_rhs2 (def_stmt);
3041 else
3043 if (dump_enabled_p ())
3044 report_vect_op (MSG_MISSED_OPTIMIZATION, def_stmt,
3045 "reduction: not handled operation: ");
3046 return NULL;
3049 if (TREE_CODE (op1) != SSA_NAME && TREE_CODE (op2) != SSA_NAME)
3051 if (dump_enabled_p ())
3052 report_vect_op (MSG_MISSED_OPTIMIZATION, def_stmt,
3053 "reduction: both uses not ssa_names: ");
3055 return NULL;
3058 type = TREE_TYPE (gimple_assign_lhs (def_stmt));
3059 if ((TREE_CODE (op1) == SSA_NAME
3060 && !types_compatible_p (type,TREE_TYPE (op1)))
3061 || (TREE_CODE (op2) == SSA_NAME
3062 && !types_compatible_p (type, TREE_TYPE (op2)))
3063 || (op3 && TREE_CODE (op3) == SSA_NAME
3064 && !types_compatible_p (type, TREE_TYPE (op3)))
3065 || (op4 && TREE_CODE (op4) == SSA_NAME
3066 && !types_compatible_p (type, TREE_TYPE (op4))))
3068 if (dump_enabled_p ())
3070 dump_printf_loc (MSG_NOTE, vect_location,
3071 "reduction: multiple types: operation type: "
3072 "%T, operands types: %T,%T",
3073 type, TREE_TYPE (op1), TREE_TYPE (op2));
3074 if (op3)
3075 dump_printf (MSG_NOTE, ",%T", TREE_TYPE (op3));
3077 if (op4)
3078 dump_printf (MSG_NOTE, ",%T", TREE_TYPE (op4));
3079 dump_printf (MSG_NOTE, "\n");
3082 return NULL;
3085 /* Check whether it's ok to change the order of the computation.
3086 Generally, when vectorizing a reduction we change the order of the
3087 computation. This may change the behavior of the program in some
3088 cases, so we need to check that this is ok. One exception is when
3089 vectorizing an outer-loop: the inner-loop is executed sequentially,
3090 and therefore vectorizing reductions in the inner-loop during
3091 outer-loop vectorization is safe. */
3092 if (check_reduction
3093 && *v_reduc_type == TREE_CODE_REDUCTION
3094 && needs_fold_left_reduction_p (type, code,
3095 need_wrapping_integral_overflow))
3096 *v_reduc_type = FOLD_LEFT_REDUCTION;
3098 /* Reduction is safe. We're dealing with one of the following:
3099 1) integer arithmetic and no trapv
3100 2) floating point arithmetic, and special flags permit this optimization
3101 3) nested cycle (i.e., outer loop vectorization). */
3102 stmt_vec_info def1_info = loop_info->lookup_def (op1);
3103 stmt_vec_info def2_info = loop_info->lookup_def (op2);
3104 if (code != COND_EXPR && !def1_info && !def2_info)
3106 if (dump_enabled_p ())
3107 report_vect_op (MSG_NOTE, def_stmt, "reduction: no defs for operands: ");
3108 return NULL;
3111 /* Check that one def is the reduction def, defined by PHI,
3112 the other def is either defined in the loop ("vect_internal_def"),
3113 or it's an induction (defined by a loop-header phi-node). */
3115 if (def2_info
3116 && def2_info->stmt == phi
3117 && (code == COND_EXPR
3118 || !def1_info
3119 || !flow_bb_inside_loop_p (loop, gimple_bb (def1_info->stmt))
3120 || vect_valid_reduction_input_p (def1_info)))
3122 if (dump_enabled_p ())
3123 report_vect_op (MSG_NOTE, def_stmt, "detected reduction: ");
3124 return def_stmt_info;
3127 if (def1_info
3128 && def1_info->stmt == phi
3129 && (code == COND_EXPR
3130 || !def2_info
3131 || !flow_bb_inside_loop_p (loop, gimple_bb (def2_info->stmt))
3132 || vect_valid_reduction_input_p (def2_info)))
3134 if (! nested_in_vect_loop && orig_code != MINUS_EXPR)
3136 /* Check if we can swap operands (just for simplicity - so that
3137 the rest of the code can assume that the reduction variable
3138 is always the last (second) argument). */
3139 if (code == COND_EXPR)
3141 /* Swap cond_expr by inverting the condition. */
3142 tree cond_expr = gimple_assign_rhs1 (def_stmt);
3143 enum tree_code invert_code = ERROR_MARK;
3144 enum tree_code cond_code = TREE_CODE (cond_expr);
3146 if (TREE_CODE_CLASS (cond_code) == tcc_comparison)
3148 bool honor_nans = HONOR_NANS (TREE_OPERAND (cond_expr, 0));
3149 invert_code = invert_tree_comparison (cond_code, honor_nans);
3151 if (invert_code != ERROR_MARK)
3153 TREE_SET_CODE (cond_expr, invert_code);
3154 swap_ssa_operands (def_stmt,
3155 gimple_assign_rhs2_ptr (def_stmt),
3156 gimple_assign_rhs3_ptr (def_stmt));
3158 else
3160 if (dump_enabled_p ())
3161 report_vect_op (MSG_NOTE, def_stmt,
3162 "detected reduction: cannot swap operands "
3163 "for cond_expr");
3164 return NULL;
3167 else
3168 swap_ssa_operands (def_stmt, gimple_assign_rhs1_ptr (def_stmt),
3169 gimple_assign_rhs2_ptr (def_stmt));
3171 if (dump_enabled_p ())
3172 report_vect_op (MSG_NOTE, def_stmt,
3173 "detected reduction: need to swap operands: ");
3175 if (CONSTANT_CLASS_P (gimple_assign_rhs1 (def_stmt)))
3176 LOOP_VINFO_OPERANDS_SWAPPED (loop_info) = true;
3178 else
3180 if (dump_enabled_p ())
3181 report_vect_op (MSG_NOTE, def_stmt, "detected reduction: ");
3184 return def_stmt_info;
3187 /* Try to find SLP reduction chain. */
3188 if (! nested_in_vect_loop
3189 && code != COND_EXPR
3190 && orig_code != MINUS_EXPR
3191 && vect_is_slp_reduction (loop_info, phi, def_stmt))
3193 if (dump_enabled_p ())
3194 report_vect_op (MSG_NOTE, def_stmt,
3195 "reduction: detected reduction chain: ");
3197 return def_stmt_info;
3200 /* Look for the expression computing loop_arg from loop PHI result. */
3201 if (check_reduction_path (vect_location, loop, phi, loop_arg, code))
3202 return def_stmt_info;
3204 if (dump_enabled_p ())
3206 report_vect_op (MSG_MISSED_OPTIMIZATION, def_stmt,
3207 "reduction: unknown pattern: ");
3210 return NULL;
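/* Illustrative example (not part of the pass): for a detected reduction

       sum_2 = sum_1 + a[i];    // sum_1 is the PHI result

   the reduction variable appears as the first operand, so the code above
   swaps the operands to get "sum_2 = a[i] + sum_1", letting the rest of
   the vectorizer assume the reduction variable is always the last
   argument.  A COND_EXPR reduction is instead handled by inverting the
   comparison and swapping its two value operands.  */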
3213 /* Wrapper around vect_is_simple_reduction, which will modify code
3214 in-place if it enables detection of more reductions. Arguments
3215 as there. */
3217 stmt_vec_info
3218 vect_force_simple_reduction (loop_vec_info loop_info, stmt_vec_info phi_info,
3219 bool *double_reduc,
3220 bool need_wrapping_integral_overflow)
3222 enum vect_reduction_type v_reduc_type;
3223 stmt_vec_info def_info
3224 = vect_is_simple_reduction (loop_info, phi_info, double_reduc,
3225 need_wrapping_integral_overflow,
3226 &v_reduc_type);
3227 if (def_info)
3229 STMT_VINFO_REDUC_TYPE (phi_info) = v_reduc_type;
3230 STMT_VINFO_REDUC_DEF (phi_info) = def_info;
3231 STMT_VINFO_REDUC_TYPE (def_info) = v_reduc_type;
3232 STMT_VINFO_REDUC_DEF (def_info) = phi_info;
3234 return def_info;
3237 /* Calculate cost of peeling the loop PEEL_ITERS_PROLOGUE times. */
3238 int
3239 vect_get_known_peeling_cost (loop_vec_info loop_vinfo, int peel_iters_prologue,
3240 int *peel_iters_epilogue,
3241 stmt_vector_for_cost *scalar_cost_vec,
3242 stmt_vector_for_cost *prologue_cost_vec,
3243 stmt_vector_for_cost *epilogue_cost_vec)
3245 int retval = 0;
3246 int assumed_vf = vect_vf_for_cost (loop_vinfo);
3248 if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
3250 *peel_iters_epilogue = assumed_vf / 2;
3251 if (dump_enabled_p ())
3252 dump_printf_loc (MSG_NOTE, vect_location,
3253 "cost model: epilogue peel iters set to vf/2 "
3254 "because loop iterations are unknown .\n");
3256 /* If peeled iterations are known but the number of scalar loop
3257 iterations is unknown, count a taken branch per peeled loop. */
3258 retval = record_stmt_cost (prologue_cost_vec, 1, cond_branch_taken,
3259 NULL, 0, vect_prologue);
3260 retval = record_stmt_cost (prologue_cost_vec, 1, cond_branch_taken,
3261 NULL, 0, vect_epilogue);
3263 else
3265 int niters = LOOP_VINFO_INT_NITERS (loop_vinfo);
3266 peel_iters_prologue = niters < peel_iters_prologue ?
3267 niters : peel_iters_prologue;
3268 *peel_iters_epilogue = (niters - peel_iters_prologue) % assumed_vf;
3269 /* If we need to peel for gaps but no epilogue peeling would otherwise
3270 be required, we have to peel VF iterations. */
3271 if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo) && !*peel_iters_epilogue)
3272 *peel_iters_epilogue = assumed_vf;
3275 stmt_info_for_cost *si;
3276 int j;
3277 if (peel_iters_prologue)
3278 FOR_EACH_VEC_ELT (*scalar_cost_vec, j, si)
3279 retval += record_stmt_cost (prologue_cost_vec,
3280 si->count * peel_iters_prologue,
3281 si->kind, si->stmt_info, si->misalign,
3282 vect_prologue);
3283 if (*peel_iters_epilogue)
3284 FOR_EACH_VEC_ELT (*scalar_cost_vec, j, si)
3285 retval += record_stmt_cost (epilogue_cost_vec,
3286 si->count * *peel_iters_epilogue,
3287 si->kind, si->stmt_info, si->misalign,
3288 vect_epilogue);
3290 return retval;
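/* Worked example (illustrative only, with made-up numbers): with known
   niters = 100, an assumed VF of 8 and peel_iters_prologue = 3, the
   function above computes peel_iters_epilogue = (100 - 3) % 8 = 1 and
   records the scalar cost vector 3 times into PROLOGUE_COST_VEC and once
   into EPILOGUE_COST_VEC.  If niters were unknown, peel_iters_epilogue
   would default to VF/2 = 4 and a taken branch would be counted for each
   of the prologue and the epilogue instead.  */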
3293 /* Function vect_estimate_min_profitable_iters
3295 Return the number of iterations required for the vector version of the
3296 loop to be profitable relative to the cost of the scalar version of the
3297 loop.
3299 *RET_MIN_PROFITABLE_NITERS is a cost model profitability threshold
3300 of iterations for vectorization. A value of -1 means loop vectorization
3301 is not profitable. This returned value may be used for a dynamic
3302 profitability check.
3304 *RET_MIN_PROFITABLE_ESTIMATE is a profitability threshold to be used
3305 for static check against estimated number of iterations. */
3307 static void
3308 vect_estimate_min_profitable_iters (loop_vec_info loop_vinfo,
3309 int *ret_min_profitable_niters,
3310 int *ret_min_profitable_estimate)
3312 int min_profitable_iters;
3313 int min_profitable_estimate;
3314 int peel_iters_prologue;
3315 int peel_iters_epilogue;
3316 unsigned vec_inside_cost = 0;
3317 int vec_outside_cost = 0;
3318 unsigned vec_prologue_cost = 0;
3319 unsigned vec_epilogue_cost = 0;
3320 int scalar_single_iter_cost = 0;
3321 int scalar_outside_cost = 0;
3322 int assumed_vf = vect_vf_for_cost (loop_vinfo);
3323 int npeel = LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo);
3324 void *target_cost_data = LOOP_VINFO_TARGET_COST_DATA (loop_vinfo);
3326 /* Cost model disabled. */
3327 if (unlimited_cost_model (LOOP_VINFO_LOOP (loop_vinfo)))
3329 if (dump_enabled_p ())
3330 dump_printf_loc (MSG_NOTE, vect_location, "cost model disabled.\n");
3331 *ret_min_profitable_niters = 0;
3332 *ret_min_profitable_estimate = 0;
3333 return;
3336 /* Requires loop versioning tests to handle misalignment. */
3337 if (LOOP_REQUIRES_VERSIONING_FOR_ALIGNMENT (loop_vinfo))
3339 /* FIXME: Make cost depend on complexity of individual check. */
3340 unsigned len = LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).length ();
3341 (void) add_stmt_cost (target_cost_data, len, vector_stmt, NULL, 0,
3342 vect_prologue);
3343 if (dump_enabled_p ())
3344 dump_printf (MSG_NOTE,
3345 "cost model: Adding cost of checks for loop "
3346 "versioning to treat misalignment.\n");
3349 /* Requires loop versioning with alias checks. */
3350 if (LOOP_REQUIRES_VERSIONING_FOR_ALIAS (loop_vinfo))
3352 /* FIXME: Make cost depend on complexity of individual check. */
3353 unsigned len = LOOP_VINFO_COMP_ALIAS_DDRS (loop_vinfo).length ();
3354 (void) add_stmt_cost (target_cost_data, len, vector_stmt, NULL, 0,
3355 vect_prologue);
3356 len = LOOP_VINFO_CHECK_UNEQUAL_ADDRS (loop_vinfo).length ();
3357 if (len)
3358 /* Count LEN - 1 ANDs and LEN comparisons. */
3359 (void) add_stmt_cost (target_cost_data, len * 2 - 1, scalar_stmt,
3360 NULL, 0, vect_prologue);
3361 len = LOOP_VINFO_LOWER_BOUNDS (loop_vinfo).length ();
3362 if (len)
3364 /* Count LEN - 1 ANDs and LEN comparisons. */
3365 unsigned int nstmts = len * 2 - 1;
3366 /* +1 for each bias that needs adding. */
3367 for (unsigned int i = 0; i < len; ++i)
3368 if (!LOOP_VINFO_LOWER_BOUNDS (loop_vinfo)[i].unsigned_p)
3369 nstmts += 1;
3370 (void) add_stmt_cost (target_cost_data, nstmts, scalar_stmt,
3371 NULL, 0, vect_prologue);
3373 if (dump_enabled_p ())
3374 dump_printf (MSG_NOTE,
3375 "cost model: Adding cost of checks for loop "
3376 "versioning aliasing.\n");
3379 /* Requires loop versioning with niter checks. */
3380 if (LOOP_REQUIRES_VERSIONING_FOR_NITERS (loop_vinfo))
3382 /* FIXME: Make cost depend on complexity of individual check. */
3383 (void) add_stmt_cost (target_cost_data, 1, vector_stmt, NULL, 0,
3384 vect_prologue);
3385 if (dump_enabled_p ())
3386 dump_printf (MSG_NOTE,
3387 "cost model: Adding cost of checks for loop "
3388 "versioning niters.\n");
3391 if (LOOP_REQUIRES_VERSIONING (loop_vinfo))
3392 (void) add_stmt_cost (target_cost_data, 1, cond_branch_taken, NULL, 0,
3393 vect_prologue);
3395 /* Count statements in scalar loop. Using this as scalar cost for a single
3396 iteration for now.
3398 TODO: Add outer loop support.
3400 TODO: Consider assigning different costs to different scalar
3401 statements. */
3403 scalar_single_iter_cost
3404 = LOOP_VINFO_SINGLE_SCALAR_ITERATION_COST (loop_vinfo);
3406 /* Add additional cost for the peeled instructions in prologue and epilogue
3407 loop. (For fully-masked loops there will be no peeling.)
3409 FORNOW: If we don't know the value of peel_iters for prologue or epilogue
3410 at compile time, we assume it's vf/2 (the worst case would be vf-1).
3412 TODO: Build an expression that represents peel_iters for prologue and
3413 epilogue to be used in a run-time test. */
3415 if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
3417 peel_iters_prologue = 0;
3418 peel_iters_epilogue = 0;
3420 if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo))
3422 /* We need to peel exactly one iteration. */
3423 peel_iters_epilogue += 1;
3424 stmt_info_for_cost *si;
3425 int j;
3426 FOR_EACH_VEC_ELT (LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo),
3427 j, si)
3428 (void) add_stmt_cost (target_cost_data, si->count,
3429 si->kind, si->stmt_info, si->misalign,
3430 vect_epilogue);
3433 else if (npeel < 0)
3435 peel_iters_prologue = assumed_vf / 2;
3436 if (dump_enabled_p ())
3437 dump_printf (MSG_NOTE, "cost model: "
3438 "prologue peel iters set to vf/2.\n");
3440 /* If peeling for alignment is unknown, the loop bound of the main loop
3441 becomes unknown. */
3442 peel_iters_epilogue = assumed_vf / 2;
3443 if (dump_enabled_p ())
3444 dump_printf (MSG_NOTE, "cost model: "
3445 "epilogue peel iters set to vf/2 because "
3446 "peeling for alignment is unknown.\n");
3448 /* If peeled iterations are unknown, count a taken branch and a not taken
3449 branch per peeled loop. Even if scalar loop iterations are known,
3450 vector iterations are not known since peeled prologue iterations are
3451 not known. Hence guards remain the same. */
3452 (void) add_stmt_cost (target_cost_data, 1, cond_branch_taken,
3453 NULL, 0, vect_prologue);
3454 (void) add_stmt_cost (target_cost_data, 1, cond_branch_not_taken,
3455 NULL, 0, vect_prologue);
3456 (void) add_stmt_cost (target_cost_data, 1, cond_branch_taken,
3457 NULL, 0, vect_epilogue);
3458 (void) add_stmt_cost (target_cost_data, 1, cond_branch_not_taken,
3459 NULL, 0, vect_epilogue);
3460 stmt_info_for_cost *si;
3461 int j;
3462 FOR_EACH_VEC_ELT (LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo), j, si)
3464 (void) add_stmt_cost (target_cost_data,
3465 si->count * peel_iters_prologue,
3466 si->kind, si->stmt_info, si->misalign,
3467 vect_prologue);
3468 (void) add_stmt_cost (target_cost_data,
3469 si->count * peel_iters_epilogue,
3470 si->kind, si->stmt_info, si->misalign,
3471 vect_epilogue);
3474 else
3476 stmt_vector_for_cost prologue_cost_vec, epilogue_cost_vec;
3477 stmt_info_for_cost *si;
3478 int j;
3479 void *data = LOOP_VINFO_TARGET_COST_DATA (loop_vinfo);
3481 prologue_cost_vec.create (2);
3482 epilogue_cost_vec.create (2);
3483 peel_iters_prologue = npeel;
3485 (void) vect_get_known_peeling_cost (loop_vinfo, peel_iters_prologue,
3486 &peel_iters_epilogue,
3487 &LOOP_VINFO_SCALAR_ITERATION_COST
3488 (loop_vinfo),
3489 &prologue_cost_vec,
3490 &epilogue_cost_vec);
3492 FOR_EACH_VEC_ELT (prologue_cost_vec, j, si)
3493 (void) add_stmt_cost (data, si->count, si->kind, si->stmt_info,
3494 si->misalign, vect_prologue);
3496 FOR_EACH_VEC_ELT (epilogue_cost_vec, j, si)
3497 (void) add_stmt_cost (data, si->count, si->kind, si->stmt_info,
3498 si->misalign, vect_epilogue);
3500 prologue_cost_vec.release ();
3501 epilogue_cost_vec.release ();
3504 /* FORNOW: The scalar outside cost is incremented in one of the
3505 following ways:
3507 1. The vectorizer checks for alignment and aliasing and generates
3508 a condition that allows dynamic vectorization. A cost model
3509 check is ANDed with the versioning condition. Hence the scalar code
3510 path now has the added cost of the versioning check.
3512 if (cost > th & versioning_check)
3513 jmp to vector code
3515 Hence the run-time scalar cost is incremented by a not-taken branch cost.
3517 2. The vectorizer then checks if a prologue is required. If the
3518 cost model check was not done before during versioning, it has to
3519 be done before the prologue check.
3521 if (cost <= th)
3522 prologue = scalar_iters
3523 if (prologue == 0)
3524 jmp to vector code
3525 else
3526 execute prologue
3527 if (prologue == num_iters)
3528 go to exit
3530 Hence the run-time scalar cost is incremented by a taken branch,
3531 plus a not-taken branch, plus a taken branch cost.
3533 3. The vectorizer then checks if an epilogue is required. If the
3534 cost model check was not done before during prologue check, it
3535 has to be done with the epilogue check.
3537 if (prologue == 0)
3538 jmp to vector code
3539 else
3540 execute prologue
3541 if (prologue == num_iters)
3542 go to exit
3543 vector code:
3544 if ((cost <= th) | (scalar_iters-prologue-epilogue == 0))
3545 jmp to epilogue
3547 Hence the run-time scalar cost should be incremented by 2 taken
3548 branches.
3550 TODO: The back end may reorder the BBs differently and reverse
3551 conditions/branch directions. Change the estimates below to
3552 something more reasonable. */
3554 /* If the number of iterations is known and we do not do versioning, we can
3555 decide whether to vectorize at compile time. Hence the scalar version
3556 does not carry cost model guard costs. */
3557 if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
3558 || LOOP_REQUIRES_VERSIONING (loop_vinfo))
3560 /* Cost model check occurs at versioning. */
3561 if (LOOP_REQUIRES_VERSIONING (loop_vinfo))
3562 scalar_outside_cost += vect_get_stmt_cost (cond_branch_not_taken);
3563 else
3565 /* Cost model check occurs at prologue generation. */
3566 if (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) < 0)
3567 scalar_outside_cost += 2 * vect_get_stmt_cost (cond_branch_taken)
3568 + vect_get_stmt_cost (cond_branch_not_taken);
3569 /* Cost model check occurs at epilogue generation. */
3570 else
3571 scalar_outside_cost += 2 * vect_get_stmt_cost (cond_branch_taken);
3575 /* Complete the target-specific cost calculations. */
3576 finish_cost (LOOP_VINFO_TARGET_COST_DATA (loop_vinfo), &vec_prologue_cost,
3577 &vec_inside_cost, &vec_epilogue_cost);
3579 vec_outside_cost = (int)(vec_prologue_cost + vec_epilogue_cost);
3581 if (dump_enabled_p ())
3583 dump_printf_loc (MSG_NOTE, vect_location, "Cost model analysis: \n");
3584 dump_printf (MSG_NOTE, " Vector inside of loop cost: %d\n",
3585 vec_inside_cost);
3586 dump_printf (MSG_NOTE, " Vector prologue cost: %d\n",
3587 vec_prologue_cost);
3588 dump_printf (MSG_NOTE, " Vector epilogue cost: %d\n",
3589 vec_epilogue_cost);
3590 dump_printf (MSG_NOTE, " Scalar iteration cost: %d\n",
3591 scalar_single_iter_cost);
3592 dump_printf (MSG_NOTE, " Scalar outside cost: %d\n",
3593 scalar_outside_cost);
3594 dump_printf (MSG_NOTE, " Vector outside cost: %d\n",
3595 vec_outside_cost);
3596 dump_printf (MSG_NOTE, " prologue iterations: %d\n",
3597 peel_iters_prologue);
3598 dump_printf (MSG_NOTE, " epilogue iterations: %d\n",
3599 peel_iters_epilogue);
3602 /* Calculate number of iterations required to make the vector version
3603 profitable, relative to the loop bodies only. The following condition
3604 must hold true:
3605 SIC * niters + SOC > VIC * ((niters-PL_ITERS-EP_ITERS)/VF) + VOC
3606 where
3607 SIC = scalar iteration cost, VIC = vector iteration cost,
3608 VOC = vector outside cost, VF = vectorization factor,
3609 PL_ITERS = prologue iterations, EP_ITERS= epilogue iterations
3610 SOC = scalar outside cost for run time cost model check. */
3612 if ((scalar_single_iter_cost * assumed_vf) > (int) vec_inside_cost)
3614 min_profitable_iters = ((vec_outside_cost - scalar_outside_cost)
3615 * assumed_vf
3616 - vec_inside_cost * peel_iters_prologue
3617 - vec_inside_cost * peel_iters_epilogue);
3618 if (min_profitable_iters <= 0)
3619 min_profitable_iters = 0;
3620 else
3622 min_profitable_iters /= ((scalar_single_iter_cost * assumed_vf)
3623 - vec_inside_cost);
3625 if ((scalar_single_iter_cost * assumed_vf * min_profitable_iters)
3626 <= (((int) vec_inside_cost * min_profitable_iters)
3627 + (((int) vec_outside_cost - scalar_outside_cost)
3628 * assumed_vf)))
3629 min_profitable_iters++;
3632 /* The vector version will never be profitable. */
3633 else
3635 if (LOOP_VINFO_LOOP (loop_vinfo)->force_vectorize)
3636 warning_at (vect_location.get_location_t (), OPT_Wopenmp_simd,
3637 "vectorization did not happen for a simd loop");
3639 if (dump_enabled_p ())
3640 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3641 "cost model: the vector iteration cost = %d "
3642 "divided by the scalar iteration cost = %d "
3643 "is greater or equal to the vectorization factor = %d"
3644 ".\n",
3645 vec_inside_cost, scalar_single_iter_cost, assumed_vf);
3646 *ret_min_profitable_niters = -1;
3647 *ret_min_profitable_estimate = -1;
3648 return;
3651 if (dump_enabled_p ())
3652 dump_printf (MSG_NOTE,
3653 " Calculated minimum iters for profitability: %d\n",
3654 min_profitable_iters);
3656 if (!LOOP_VINFO_FULLY_MASKED_P (loop_vinfo)
3657 && min_profitable_iters < (assumed_vf + peel_iters_prologue))
3658 /* We want the vectorized loop to execute at least once. */
3659 min_profitable_iters = assumed_vf + peel_iters_prologue;
3661 if (dump_enabled_p ())
3662 dump_printf_loc (MSG_NOTE, vect_location,
3663 " Runtime profitability threshold = %d\n",
3664 min_profitable_iters);
3666 *ret_min_profitable_niters = min_profitable_iters;
3668 /* Calculate number of iterations required to make the vector version
3669 profitable, relative to the loop bodies only.
3671 The non-vectorized variant costs SIC * niters and must win over the vector
3672 variant on the expected loop trip count. The following condition must hold true:
3673 SIC * niters > VIC * ((niters-PL_ITERS-EP_ITERS)/VF) + VOC + SOC */
3675 if (vec_outside_cost <= 0)
3676 min_profitable_estimate = 0;
3677 else
3679 min_profitable_estimate = ((vec_outside_cost + scalar_outside_cost)
3680 * assumed_vf
3681 - vec_inside_cost * peel_iters_prologue
3682 - vec_inside_cost * peel_iters_epilogue)
3683 / ((scalar_single_iter_cost * assumed_vf)
3684 - vec_inside_cost);
3686 min_profitable_estimate = MAX (min_profitable_estimate, min_profitable_iters);
3687 if (dump_enabled_p ())
3688 dump_printf_loc (MSG_NOTE, vect_location,
3689 " Static estimate profitability threshold = %d\n",
3690 min_profitable_estimate);
3692 *ret_min_profitable_estimate = min_profitable_estimate;
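/* Editorial illustration, not part of the vectorizer: a stand-alone sketch
   of how the thresholds above follow from
     SIC * niters + SOC > VIC * ((niters - PL_ITERS - EP_ITERS) / VF) + VOC
   using plain integer arithmetic and hypothetical unit costs.  */

static int
illustrative_min_profitable_iters (int sic, int vic, int voc, int soc,
                                   int vf, int pl_iters, int ep_iters)
{
  /* If one vector iteration is not cheaper than VF scalar iterations,
     the vector version can never be profitable.  */
  if (sic * vf <= vic)
    return -1;

  int iters = ((voc - soc) * vf
               - vic * pl_iters
               - vic * ep_iters);
  if (iters <= 0)
    iters = 0;
  else
    {
      iters /= sic * vf - vic;
      /* Bump the count if the truncating division left the vector
         version not yet strictly cheaper, mirroring the
         min_profitable_iters++ adjustment above.  */
      if (sic * vf * iters <= vic * iters + (voc - soc) * vf)
        iters++;
    }
  return iters;
}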
3695 /* Writes into SEL a mask for a vec_perm, equivalent to a vec_shr by OFFSET
3696 vector elements (not bits) for a vector with NELT elements. */
3697 static void
3698 calc_vec_perm_mask_for_shift (unsigned int offset, unsigned int nelt,
3699 vec_perm_builder *sel)
3701 /* The encoding is a single stepped pattern. Any wrap-around is handled
3702 by vec_perm_indices. */
3703 sel->new_vector (nelt, 1, 3);
3704 for (unsigned int i = 0; i < 3; i++)
3705 sel->quick_push (i + offset);
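/* Editorial illustration, not part of the vectorizer: expand the stepped
   pattern built above into an explicit NELT-element selector.  For
   OFFSET = 2 and NELT = 8 this yields { 2, 3, 4, 5, 6, 7, 8, 9 };
   indices >= NELT select lanes from the second VEC_PERM_EXPR operand,
   which in the shift reduction below is a zero vector.  */

static void
illustrative_shift_sel (unsigned int offset, unsigned int nelt,
                        unsigned int *sel)
{
  for (unsigned int i = 0; i < nelt; i++)
    sel[i] = i + offset;
}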
3708 /* Checks whether the target supports whole-vector shifts for vectors of mode
3709 MODE. This is the case if _either_ the platform handles vec_shr_optab, _or_
3710 it supports vec_perm_const with masks for all necessary shift amounts. */
3711 static bool
3712 have_whole_vector_shift (machine_mode mode)
3714 if (optab_handler (vec_shr_optab, mode) != CODE_FOR_nothing)
3715 return true;
3717 /* Variable-length vectors should be handled via the optab. */
3718 unsigned int nelt;
3719 if (!GET_MODE_NUNITS (mode).is_constant (&nelt))
3720 return false;
3722 vec_perm_builder sel;
3723 vec_perm_indices indices;
3724 for (unsigned int i = nelt / 2; i >= 1; i /= 2)
3726 calc_vec_perm_mask_for_shift (i, nelt, &sel);
3727 indices.new_vector (sel, 2, nelt);
3728 if (!can_vec_perm_const_p (mode, indices, false))
3729 return false;
3731 return true;
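/* Editorial illustration, not part of the vectorizer: the shift amounts
   probed by the loop above are NELT/2, NELT/4, ..., 1, i.e. the offsets
   later used by the log2-step shift reduction in
   vect_create_epilog_for_reduction.  For NELT = 8 this stores { 4, 2, 1 }
   and returns 3.  */

static unsigned int
illustrative_shift_amounts (unsigned int nelt, unsigned int *amounts)
{
  unsigned int n = 0;
  for (unsigned int i = nelt / 2; i >= 1; i /= 2)
    amounts[n++] = i;
  return n;
}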
3734 /* TODO: There is a close dependency between the vect_model_*_cost and
3735 vectorizable_* functions. Design better to avoid maintenance issues. */
3737 /* Function vect_model_reduction_cost.
3739 Models cost for a reduction operation, including the vector ops
3740 generated within the strip-mine loop, the initial definition before
3741 the loop, and the epilogue code that must be generated. */
3743 static void
3744 vect_model_reduction_cost (stmt_vec_info stmt_info, internal_fn reduc_fn,
3745 int ncopies, stmt_vector_for_cost *cost_vec)
3747 int prologue_cost = 0, epilogue_cost = 0, inside_cost;
3748 enum tree_code code;
3749 optab optab;
3750 tree vectype;
3751 machine_mode mode;
3752 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
3753 struct loop *loop = NULL;
3755 if (loop_vinfo)
3756 loop = LOOP_VINFO_LOOP (loop_vinfo);
3758 /* Condition reductions generate two reductions in the loop. */
3759 vect_reduction_type reduction_type
3760 = STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info);
3761 if (reduction_type == COND_REDUCTION)
3762 ncopies *= 2;
3764 vectype = STMT_VINFO_VECTYPE (stmt_info);
3765 mode = TYPE_MODE (vectype);
3766 stmt_vec_info orig_stmt_info = vect_orig_stmt (stmt_info);
3768 code = gimple_assign_rhs_code (orig_stmt_info->stmt);
3770 if (reduction_type == EXTRACT_LAST_REDUCTION
3771 || reduction_type == FOLD_LEFT_REDUCTION)
3773 /* No extra instructions needed in the prologue. */
3774 prologue_cost = 0;
3776 if (reduction_type == EXTRACT_LAST_REDUCTION || reduc_fn != IFN_LAST)
3777 /* Count one reduction-like operation per vector. */
3778 inside_cost = record_stmt_cost (cost_vec, ncopies, vec_to_scalar,
3779 stmt_info, 0, vect_body);
3780 else
3782 /* Use NELEMENTS extracts and NELEMENTS scalar ops. */
3783 unsigned int nelements = ncopies * vect_nunits_for_cost (vectype);
3784 inside_cost = record_stmt_cost (cost_vec, nelements,
3785 vec_to_scalar, stmt_info, 0,
3786 vect_body);
3787 inside_cost += record_stmt_cost (cost_vec, nelements,
3788 scalar_stmt, stmt_info, 0,
3789 vect_body);
3792 else
3794 /* Add in cost for initial definition.
3795 For cond reduction we have four vectors: initial index, step,
3796 initial result of the data reduction, initial value of the index
3797 reduction. */
3798 int prologue_stmts = reduction_type == COND_REDUCTION ? 4 : 1;
3799 prologue_cost += record_stmt_cost (cost_vec, prologue_stmts,
3800 scalar_to_vec, stmt_info, 0,
3801 vect_prologue);
3803 /* Cost of reduction op inside loop. */
3804 inside_cost = record_stmt_cost (cost_vec, ncopies, vector_stmt,
3805 stmt_info, 0, vect_body);
3808 /* Determine cost of epilogue code.
3810 We have a reduction operator that will reduce the vector in one statement.
3811 Also requires scalar extract. */
3813 if (!loop || !nested_in_vect_loop_p (loop, orig_stmt_info))
3815 if (reduc_fn != IFN_LAST)
3817 if (reduction_type == COND_REDUCTION)
3819 /* An EQ stmt and a COND_EXPR stmt. */
3820 epilogue_cost += record_stmt_cost (cost_vec, 2,
3821 vector_stmt, stmt_info, 0,
3822 vect_epilogue);
3823 /* Reduction of the max index and a reduction of the found
3824 values. */
3825 epilogue_cost += record_stmt_cost (cost_vec, 2,
3826 vec_to_scalar, stmt_info, 0,
3827 vect_epilogue);
3828 /* A broadcast of the max value. */
3829 epilogue_cost += record_stmt_cost (cost_vec, 1,
3830 scalar_to_vec, stmt_info, 0,
3831 vect_epilogue);
3833 else
3835 epilogue_cost += record_stmt_cost (cost_vec, 1, vector_stmt,
3836 stmt_info, 0, vect_epilogue);
3837 epilogue_cost += record_stmt_cost (cost_vec, 1,
3838 vec_to_scalar, stmt_info, 0,
3839 vect_epilogue);
3842 else if (reduction_type == COND_REDUCTION)
3844 unsigned estimated_nunits = vect_nunits_for_cost (vectype);
3845 /* Extraction of scalar elements. */
3846 epilogue_cost += record_stmt_cost (cost_vec,
3847 2 * estimated_nunits,
3848 vec_to_scalar, stmt_info, 0,
3849 vect_epilogue);
3850 /* Scalar max reductions via COND_EXPR / MAX_EXPR. */
3851 epilogue_cost += record_stmt_cost (cost_vec,
3852 2 * estimated_nunits - 3,
3853 scalar_stmt, stmt_info, 0,
3854 vect_epilogue);
3856 else if (reduction_type == EXTRACT_LAST_REDUCTION
3857 || reduction_type == FOLD_LEFT_REDUCTION)
3858 /* No extra instructions needed in the epilogue. */
3860 else
3862 int vec_size_in_bits = tree_to_uhwi (TYPE_SIZE (vectype));
3863 tree bitsize =
3864 TYPE_SIZE (TREE_TYPE (gimple_assign_lhs (orig_stmt_info->stmt)));
3865 int element_bitsize = tree_to_uhwi (bitsize);
3866 int nelements = vec_size_in_bits / element_bitsize;
3868 if (code == COND_EXPR)
3869 code = MAX_EXPR;
3871 optab = optab_for_tree_code (code, vectype, optab_default);
3873 /* We have a whole vector shift available. */
3874 if (optab != unknown_optab
3875 && VECTOR_MODE_P (mode)
3876 && optab_handler (optab, mode) != CODE_FOR_nothing
3877 && have_whole_vector_shift (mode))
3879 /* Final reduction via vector shifts and the reduction operator.
3880 Also requires scalar extract. */
3881 epilogue_cost += record_stmt_cost (cost_vec,
3882 exact_log2 (nelements) * 2,
3883 vector_stmt, stmt_info, 0,
3884 vect_epilogue);
3885 epilogue_cost += record_stmt_cost (cost_vec, 1,
3886 vec_to_scalar, stmt_info, 0,
3887 vect_epilogue);
3889 else
3890 /* Use extracts and reduction op for final reduction. For N
3891 elements, we have N extracts and N-1 reduction ops. */
3892 epilogue_cost += record_stmt_cost (cost_vec,
3893 nelements + nelements - 1,
3894 vector_stmt, stmt_info, 0,
3895 vect_epilogue);
3899 if (dump_enabled_p ())
3900 dump_printf (MSG_NOTE,
3901 "vect_model_reduction_cost: inside_cost = %d, "
3902 "prologue_cost = %d, epilogue_cost = %d .\n", inside_cost,
3903 prologue_cost, epilogue_cost);
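/* Editorial illustration with hypothetical unit weights, not part of the
   vectorizer: the statements recorded above for the common case of a
   non-nested, non-condition reduction (e.g. a sum) with a direct REDUC_FN
   are one scalar_to_vec in the prologue, NCOPIES vector_stmts in the
   body, and one vector_stmt plus one vec_to_scalar in the epilogue.  */

static void
illustrative_sum_reduction_costs (int ncopies, int *prologue_cost,
                                  int *inside_cost, int *epilogue_cost)
{
  const int scalar_to_vec_w = 1;   /* hypothetical unit weights */
  const int vector_stmt_w = 1;
  const int vec_to_scalar_w = 1;

  *prologue_cost = scalar_to_vec_w;
  *inside_cost = ncopies * vector_stmt_w;
  *epilogue_cost = vector_stmt_w + vec_to_scalar_w;
}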
3907 /* Function vect_model_induction_cost.
3909 Models cost for induction operations. */
3911 static void
3912 vect_model_induction_cost (stmt_vec_info stmt_info, int ncopies,
3913 stmt_vector_for_cost *cost_vec)
3915 unsigned inside_cost, prologue_cost;
3917 if (PURE_SLP_STMT (stmt_info))
3918 return;
3920 /* loop cost for vec_loop. */
3921 inside_cost = record_stmt_cost (cost_vec, ncopies, vector_stmt,
3922 stmt_info, 0, vect_body);
3924 /* prologue cost for vec_init and vec_step. */
3925 prologue_cost = record_stmt_cost (cost_vec, 2, scalar_to_vec,
3926 stmt_info, 0, vect_prologue);
3928 if (dump_enabled_p ())
3929 dump_printf_loc (MSG_NOTE, vect_location,
3930 "vect_model_induction_cost: inside_cost = %d, "
3931 "prologue_cost = %d .\n", inside_cost, prologue_cost);
3936 /* Function get_initial_def_for_reduction
3938 Input:
3939 STMT_VINFO - a stmt that performs a reduction operation in the loop.
3940 INIT_VAL - the initial value of the reduction variable
3942 Output:
3943 ADJUSTMENT_DEF - a tree that holds a value to be added to the final result
3944 of the reduction (used for adjusting the epilog - see below).
3945 Return a vector variable, initialized according to the operation that
3946 STMT_VINFO performs. This vector will be used as the initial value
3947 of the vector of partial results.
3949 Option1 (adjust in epilog): Initialize the vector as follows:
3950 add/bit or/xor: [0,0,...,0,0]
3951 mult/bit and: [1,1,...,1,1]
3952 min/max/cond_expr: [init_val,init_val,..,init_val,init_val]
3953 and when necessary (e.g. add/mult case) let the caller know
3954 that it needs to adjust the result by init_val.
3956 Option2: Initialize the vector as follows:
3957 add/bit or/xor: [init_val,0,0,...,0]
3958 mult/bit and: [init_val,1,1,...,1]
3959 min/max/cond_expr: [init_val,init_val,...,init_val]
3960 and no adjustments are needed.
3962 For example, for the following code:
3964 s = init_val;
3965 for (i=0;i<n;i++)
3966 s = s + a[i];
3968 STMT_VINFO is 's = s + a[i]', and the reduction variable is 's'.
3969 For a vector of 4 units, we want to return either [0,0,0,init_val],
3970 or [0,0,0,0] and let the caller know that it needs to adjust
3971 the result at the end by 'init_val'.
3973 FORNOW, we use the 'adjust in epilog' scheme (Option1) when ADJUSTMENT_DEF
3974 is not NULL, because this way the initialization vector is simpler (same
3975 element in all entries), and Option2 otherwise.
3977 A cost model should help decide between these two schemes. */
3979 tree
3980 get_initial_def_for_reduction (stmt_vec_info stmt_vinfo, tree init_val,
3981 tree *adjustment_def)
3983 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_vinfo);
3984 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
3985 tree scalar_type = TREE_TYPE (init_val);
3986 tree vectype = get_vectype_for_scalar_type (scalar_type);
3987 enum tree_code code = gimple_assign_rhs_code (stmt_vinfo->stmt);
3988 tree def_for_init;
3989 tree init_def;
3990 REAL_VALUE_TYPE real_init_val = dconst0;
3991 int int_init_val = 0;
3992 gimple_seq stmts = NULL;
3994 gcc_assert (vectype);
3996 gcc_assert (POINTER_TYPE_P (scalar_type) || INTEGRAL_TYPE_P (scalar_type)
3997 || SCALAR_FLOAT_TYPE_P (scalar_type));
3999 gcc_assert (nested_in_vect_loop_p (loop, stmt_vinfo)
4000 || loop == (gimple_bb (stmt_vinfo->stmt))->loop_father);
4002 vect_reduction_type reduction_type
4003 = STMT_VINFO_VEC_REDUCTION_TYPE (stmt_vinfo);
4005 switch (code)
4007 case WIDEN_SUM_EXPR:
4008 case DOT_PROD_EXPR:
4009 case SAD_EXPR:
4010 case PLUS_EXPR:
4011 case MINUS_EXPR:
4012 case BIT_IOR_EXPR:
4013 case BIT_XOR_EXPR:
4014 case MULT_EXPR:
4015 case BIT_AND_EXPR:
4017 /* ADJUSTMENT_DEF is NULL when called from
4018 vect_create_epilog_for_reduction to vectorize double reduction. */
4019 if (adjustment_def)
4020 *adjustment_def = init_val;
4022 if (code == MULT_EXPR)
4024 real_init_val = dconst1;
4025 int_init_val = 1;
4028 if (code == BIT_AND_EXPR)
4029 int_init_val = -1;
4031 if (SCALAR_FLOAT_TYPE_P (scalar_type))
4032 def_for_init = build_real (scalar_type, real_init_val);
4033 else
4034 def_for_init = build_int_cst (scalar_type, int_init_val);
4036 if (adjustment_def)
4037 /* Option1: the first element is '0' or '1' as well. */
4038 init_def = gimple_build_vector_from_val (&stmts, vectype,
4039 def_for_init);
4040 else if (!TYPE_VECTOR_SUBPARTS (vectype).is_constant ())
4042 /* Option2 (variable length): the first element is INIT_VAL. */
4043 init_def = gimple_build_vector_from_val (&stmts, vectype,
4044 def_for_init);
4045 init_def = gimple_build (&stmts, CFN_VEC_SHL_INSERT,
4046 vectype, init_def, init_val);
4048 else
4050 /* Option2: the first element is INIT_VAL. */
4051 tree_vector_builder elts (vectype, 1, 2);
4052 elts.quick_push (init_val);
4053 elts.quick_push (def_for_init);
4054 init_def = gimple_build_vector (&stmts, &elts);
4057 break;
4059 case MIN_EXPR:
4060 case MAX_EXPR:
4061 case COND_EXPR:
4063 if (adjustment_def)
4065 *adjustment_def = NULL_TREE;
4066 if (reduction_type != COND_REDUCTION
4067 && reduction_type != EXTRACT_LAST_REDUCTION)
4069 init_def = vect_get_vec_def_for_operand (init_val, stmt_vinfo);
4070 break;
4073 init_val = gimple_convert (&stmts, TREE_TYPE (vectype), init_val);
4074 init_def = gimple_build_vector_from_val (&stmts, vectype, init_val);
4076 break;
4078 default:
4079 gcc_unreachable ();
4082 if (stmts)
4083 gsi_insert_seq_on_edge_immediate (loop_preheader_edge (loop), stmts);
4084 return init_def;
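/* Editorial illustration, not part of the vectorizer: the two
   initialization schemes documented above, for a sum reduction with four
   lanes and initial value INIT_VAL.  Option1 builds { 0, 0, 0, 0 } and
   reports INIT_VAL through *ADJUSTMENT so the epilogue adds it back;
   Option2 builds { INIT_VAL, 0, 0, 0 } and needs no adjustment.  */

static void
illustrative_sum_init_vector (int init_val, int use_option1,
                              int vec[4], int *adjustment)
{
  for (int i = 0; i < 4; i++)
    vec[i] = 0;                 /* neutral value for addition */
  if (use_option1)
    *adjustment = init_val;
  else
    {
      vec[0] = init_val;
      *adjustment = 0;
    }
}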
4087 /* Get at the initial defs for the reduction PHIs in SLP_NODE.
4088 NUMBER_OF_VECTORS is the number of vector defs to create.
4089 If NEUTRAL_OP is nonnull, introducing extra elements of that
4090 value will not change the result. */
4092 static void
4093 get_initial_defs_for_reduction (slp_tree slp_node,
4094 vec<tree> *vec_oprnds,
4095 unsigned int number_of_vectors,
4096 bool reduc_chain, tree neutral_op)
4098 vec<stmt_vec_info> stmts = SLP_TREE_SCALAR_STMTS (slp_node);
4099 stmt_vec_info stmt_vinfo = stmts[0];
4100 unsigned HOST_WIDE_INT nunits;
4101 unsigned j, number_of_places_left_in_vector;
4102 tree vector_type;
4103 unsigned int group_size = stmts.length ();
4104 unsigned int i;
4105 struct loop *loop;
4106 auto_vec<tree, 16> permute_results;
4108 vector_type = STMT_VINFO_VECTYPE (stmt_vinfo);
4110 gcc_assert (STMT_VINFO_DEF_TYPE (stmt_vinfo) == vect_reduction_def);
4112 loop = (gimple_bb (stmt_vinfo->stmt))->loop_father;
4113 gcc_assert (loop);
4114 edge pe = loop_preheader_edge (loop);
4116 gcc_assert (!reduc_chain || neutral_op);
4118 /* NUMBER_OF_COPIES is the number of times we need to use the same values in
4119 created vectors. It is greater than 1 if unrolling is performed.
4121 For example, we have two scalar operands, s1 and s2 (e.g., group of
4122 strided accesses of size two), while NUNITS is four (i.e., four scalars
4123 of this type can be packed in a vector). The output vector will contain
4124 two copies of each scalar operand: {s1, s2, s1, s2}. (NUMBER_OF_COPIES
4125 will be 2).
4127 If REDUC_GROUP_SIZE > NUNITS, the scalars will be split into several
4128 vectors containing the operands.
4130 For example, NUNITS is four as before, and the group size is 8
4131 (s1, s2, ..., s8). We will create two vectors {s1, s2, s3, s4} and
4132 {s5, s6, s7, s8}. */
4134 if (!TYPE_VECTOR_SUBPARTS (vector_type).is_constant (&nunits))
4135 nunits = group_size;
4137 number_of_places_left_in_vector = nunits;
4138 bool constant_p = true;
4139 tree_vector_builder elts (vector_type, nunits, 1);
4140 elts.quick_grow (nunits);
4141 for (j = 0; j < nunits * number_of_vectors; ++j)
4143 tree op;
4144 i = j % group_size;
4145 stmt_vinfo = stmts[i];
4147 /* Get the def before the loop. In a reduction chain we have only
4148 one initial value. Else we have as many as there are PHIs in the group. */
4149 if (reduc_chain)
4150 op = j != 0 ? neutral_op : PHI_ARG_DEF_FROM_EDGE (stmt_vinfo->stmt, pe);
4151 else if (((vec_oprnds->length () + 1) * nunits
4152 - number_of_places_left_in_vector >= group_size)
4153 && neutral_op)
4154 op = neutral_op;
4155 else
4156 op = PHI_ARG_DEF_FROM_EDGE (stmt_vinfo->stmt, pe);
4158 /* Create 'vect_ = {op0,op1,...,opn}'. */
4159 number_of_places_left_in_vector--;
4160 elts[nunits - number_of_places_left_in_vector - 1] = op;
4161 if (!CONSTANT_CLASS_P (op))
4162 constant_p = false;
4164 if (number_of_places_left_in_vector == 0)
4166 gimple_seq ctor_seq = NULL;
4167 tree init;
4168 if (constant_p && !neutral_op
4169 ? multiple_p (TYPE_VECTOR_SUBPARTS (vector_type), nunits)
4170 : known_eq (TYPE_VECTOR_SUBPARTS (vector_type), nunits))
4171 /* Build the vector directly from ELTS. */
4172 init = gimple_build_vector (&ctor_seq, &elts);
4173 else if (neutral_op)
4175 /* Build a vector of the neutral value and shift the
4176 other elements into place. */
4177 init = gimple_build_vector_from_val (&ctor_seq, vector_type,
4178 neutral_op);
4179 int k = nunits;
4180 while (k > 0 && elts[k - 1] == neutral_op)
4181 k -= 1;
4182 while (k > 0)
4184 k -= 1;
4185 init = gimple_build (&ctor_seq, CFN_VEC_SHL_INSERT,
4186 vector_type, init, elts[k]);
4189 else
4191 /* First time round, duplicate ELTS to fill the
4192 required number of vectors, then cherry pick the
4193 appropriate result for each iteration. */
4194 if (vec_oprnds->is_empty ())
4195 duplicate_and_interleave (&ctor_seq, vector_type, elts,
4196 number_of_vectors,
4197 permute_results);
4198 init = permute_results[number_of_vectors - j - 1];
4200 if (ctor_seq != NULL)
4201 gsi_insert_seq_on_edge_immediate (pe, ctor_seq);
4202 vec_oprnds->quick_push (init);
4204 number_of_places_left_in_vector = nunits;
4205 elts.new_vector (vector_type, nunits, 1);
4206 elts.quick_grow (nunits);
4207 constant_p = true;
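/* Editorial illustration, not part of the vectorizer: how the loop above
   cycles the group's initial scalars into the output lanes, ignoring the
   neutral-value substitution.  With GROUP_SIZE = 2 and NUNITS = 4 one
   vector becomes { s1, s2, s1, s2 }; with GROUP_SIZE = 8 and NUNITS = 4
   two vectors become { s1, s2, s3, s4 } and { s5, s6, s7, s8 }.  */

static void
illustrative_fill_initial_defs (const int *scalars, unsigned int group_size,
                                unsigned int nunits, unsigned int nvectors,
                                int *lanes)
{
  for (unsigned int j = 0; j < nunits * nvectors; j++)
    lanes[j] = scalars[j % group_size];
}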
4213 /* Function vect_create_epilog_for_reduction
4215 Create code at the loop-epilog to finalize the result of a reduction
4216 computation.
4218 VECT_DEFS is list of vector of partial results, i.e., the lhs's of vector
4219 reduction statements.
4220 STMT_INFO is the scalar reduction stmt that is being vectorized.
4221 NCOPIES is > 1 in case the vectorization factor (VF) is bigger than the
4222 number of elements that we can fit in a vectype (nunits). In this case
4223 we have to generate more than one vector stmt - i.e - we need to "unroll"
4224 the vector stmt by a factor VF/nunits. For more details see documentation
4225 in vectorizable_operation.
4226 REDUC_FN is the internal function for the epilog reduction.
4227 REDUCTION_PHIS is a list of the phi-nodes that carry the reduction
4228 computation.
4229 REDUC_INDEX is the index of the operand in the right hand side of the
4230 statement that is defined by REDUCTION_PHI.
4231 DOUBLE_REDUC is TRUE if double reduction phi nodes should be handled.
4232 SLP_NODE is an SLP node containing a group of reduction statements. The
4233 first one in this group is STMT_INFO.
4234 INDUC_VAL is for INTEGER_INDUC_COND_REDUCTION the value to use for the case
4235 when the COND_EXPR is never true in the loop. For MAX_EXPR, it needs to
4236 be smaller than any value of the IV in the loop, for MIN_EXPR larger than
4237 any value of the IV in the loop.
4238 INDUC_CODE is the code for epilog reduction if INTEGER_INDUC_COND_REDUCTION.
4239 NEUTRAL_OP is the value given by neutral_op_for_slp_reduction; it is
4240 null if this is not an SLP reduction
4242 This function:
4243 1. Creates the reduction def-use cycles: sets the arguments for
4244 REDUCTION_PHIS:
4245 The loop-entry argument is the vectorized initial-value of the reduction.
4246 The loop-latch argument is taken from VECT_DEFS - the vector of partial
4247 sums.
4248 2. "Reduces" each vector of partial results VECT_DEFS into a single result,
4249 by calling the function specified by REDUC_FN if available, or by
4250 other means (whole-vector shifts or a scalar loop).
4251 The function also creates a new phi node at the loop exit to preserve
4252 loop-closed form, as illustrated below.
4254 The flow at the entry to this function:
4256 loop:
4257 vec_def = phi <null, null> # REDUCTION_PHI
4258 VECT_DEF = vector_stmt # vectorized form of STMT_INFO
4259 s_loop = scalar_stmt # (scalar) STMT_INFO
4260 loop_exit:
4261 s_out0 = phi <s_loop> # (scalar) EXIT_PHI
4262 use <s_out0>
4263 use <s_out0>
4265 The above is transformed by this function into:
4267 loop:
4268 vec_def = phi <vec_init, VECT_DEF> # REDUCTION_PHI
4269 VECT_DEF = vector_stmt # vectorized form of STMT_INFO
4270 s_loop = scalar_stmt # (scalar) STMT_INFO
4271 loop_exit:
4272 s_out0 = phi <s_loop> # (scalar) EXIT_PHI
4273 v_out1 = phi <VECT_DEF> # NEW_EXIT_PHI
4274 v_out2 = reduce <v_out1>
4275 s_out3 = extract_field <v_out2, 0>
4276 s_out4 = adjust_result <s_out3>
4277 use <s_out4>
4278 use <s_out4>
4281 static void
4282 vect_create_epilog_for_reduction (vec<tree> vect_defs,
4283 stmt_vec_info stmt_info,
4284 gimple *reduc_def_stmt,
4285 int ncopies, internal_fn reduc_fn,
4286 vec<stmt_vec_info> reduction_phis,
4287 bool double_reduc,
4288 slp_tree slp_node,
4289 slp_instance slp_node_instance,
4290 tree induc_val, enum tree_code induc_code,
4291 tree neutral_op)
4293 stmt_vec_info prev_phi_info;
4294 tree vectype;
4295 machine_mode mode;
4296 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
4297 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo), *outer_loop = NULL;
4298 basic_block exit_bb;
4299 tree scalar_dest;
4300 tree scalar_type;
4301 gimple *new_phi = NULL, *phi;
4302 stmt_vec_info phi_info;
4303 gimple_stmt_iterator exit_gsi;
4304 tree vec_dest;
4305 tree new_temp = NULL_TREE, new_dest, new_name, new_scalar_dest;
4306 gimple *epilog_stmt = NULL;
4307 enum tree_code code = gimple_assign_rhs_code (stmt_info->stmt);
4308 gimple *exit_phi;
4309 tree bitsize;
4310 tree adjustment_def = NULL;
4311 tree vec_initial_def = NULL;
4312 tree expr, def, initial_def = NULL;
4313 tree orig_name, scalar_result;
4314 imm_use_iterator imm_iter, phi_imm_iter;
4315 use_operand_p use_p, phi_use_p;
4316 gimple *use_stmt;
4317 stmt_vec_info reduction_phi_info = NULL;
4318 bool nested_in_vect_loop = false;
4319 auto_vec<gimple *> new_phis;
4320 auto_vec<stmt_vec_info> inner_phis;
4321 int j, i;
4322 auto_vec<tree> scalar_results;
4323 unsigned int group_size = 1, k, ratio;
4324 auto_vec<tree> vec_initial_defs;
4325 auto_vec<gimple *> phis;
4326 bool slp_reduc = false;
4327 bool direct_slp_reduc;
4328 tree new_phi_result;
4329 stmt_vec_info inner_phi = NULL;
4330 tree induction_index = NULL_TREE;
4332 if (slp_node)
4333 group_size = SLP_TREE_SCALAR_STMTS (slp_node).length ();
4335 if (nested_in_vect_loop_p (loop, stmt_info))
4337 outer_loop = loop;
4338 loop = loop->inner;
4339 nested_in_vect_loop = true;
4340 gcc_assert (!slp_node);
4343 vectype = STMT_VINFO_VECTYPE (stmt_info);
4344 gcc_assert (vectype);
4345 mode = TYPE_MODE (vectype);
4347 /* 1. Create the reduction def-use cycle:
4348 Set the arguments of REDUCTION_PHIS, i.e., transform
4350 loop:
4351 vec_def = phi <null, null> # REDUCTION_PHI
4352 VECT_DEF = vector_stmt # vectorized form of STMT
4355 into:
4357 loop:
4358 vec_def = phi <vec_init, VECT_DEF> # REDUCTION_PHI
4359 VECT_DEF = vector_stmt # vectorized form of STMT
4362 (in case of SLP, do it for all the phis). */
4364 /* Get the loop-entry arguments. */
4365 enum vect_def_type initial_def_dt = vect_unknown_def_type;
4366 if (slp_node)
4368 unsigned vec_num = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node);
4369 vec_initial_defs.reserve (vec_num);
4370 get_initial_defs_for_reduction (slp_node_instance->reduc_phis,
4371 &vec_initial_defs, vec_num,
4372 REDUC_GROUP_FIRST_ELEMENT (stmt_info),
4373 neutral_op);
4375 else
4377 /* Get at the scalar def before the loop, that defines the initial value
4378 of the reduction variable. */
4379 initial_def = PHI_ARG_DEF_FROM_EDGE (reduc_def_stmt,
4380 loop_preheader_edge (loop));
4381 /* Optimize: for REDUC_MAX, if initial_def is smaller than the base
4382 and we can't use zero for induc_val, use initial_def. Similarly
4383 for REDUC_MIN and initial_def larger than the base. */
4384 if (TREE_CODE (initial_def) == INTEGER_CST
4385 && (STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info)
4386 == INTEGER_INDUC_COND_REDUCTION)
4387 && !integer_zerop (induc_val)
4388 && ((induc_code == MAX_EXPR
4389 && tree_int_cst_lt (initial_def, induc_val))
4390 || (induc_code == MIN_EXPR
4391 && tree_int_cst_lt (induc_val, initial_def))))
4392 induc_val = initial_def;
4394 if (double_reduc)
4395 /* In case of double reduction we only create a vector variable
4396 to be put in the reduction phi node. The actual statement
4397 creation is done later in this function. */
4398 vec_initial_def = vect_create_destination_var (initial_def, vectype);
4399 else if (nested_in_vect_loop)
4401 /* Do not use an adjustment def as that case is not supported
4402 correctly if ncopies is not one. */
4403 vect_is_simple_use (initial_def, loop_vinfo, &initial_def_dt);
4404 vec_initial_def = vect_get_vec_def_for_operand (initial_def,
4405 stmt_info);
4407 else
4408 vec_initial_def
4409 = get_initial_def_for_reduction (stmt_info, initial_def,
4410 &adjustment_def);
4411 vec_initial_defs.create (1);
4412 vec_initial_defs.quick_push (vec_initial_def);
4415 /* Set phi nodes arguments. */
4416 FOR_EACH_VEC_ELT (reduction_phis, i, phi_info)
4418 tree vec_init_def = vec_initial_defs[i];
4419 tree def = vect_defs[i];
4420 for (j = 0; j < ncopies; j++)
4422 if (j != 0)
4424 phi_info = STMT_VINFO_RELATED_STMT (phi_info);
4425 if (nested_in_vect_loop)
4426 vec_init_def
4427 = vect_get_vec_def_for_stmt_copy (loop_vinfo, vec_init_def);
4430 /* Set the loop-entry arg of the reduction-phi. */
4432 gphi *phi = as_a <gphi *> (phi_info->stmt);
4433 if (STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info)
4434 == INTEGER_INDUC_COND_REDUCTION)
4436 /* Initialise the reduction phi to zero. This prevents non-zero
4437 initial values from interfering with the reduction op. */
4438 gcc_assert (ncopies == 1);
4439 gcc_assert (i == 0);
4441 tree vec_init_def_type = TREE_TYPE (vec_init_def);
4442 tree induc_val_vec
4443 = build_vector_from_val (vec_init_def_type, induc_val);
4445 add_phi_arg (phi, induc_val_vec, loop_preheader_edge (loop),
4446 UNKNOWN_LOCATION);
4448 else
4449 add_phi_arg (phi, vec_init_def, loop_preheader_edge (loop),
4450 UNKNOWN_LOCATION);
4452 /* Set the loop-latch arg for the reduction-phi. */
4453 if (j > 0)
4454 def = vect_get_vec_def_for_stmt_copy (loop_vinfo, def);
4456 add_phi_arg (phi, def, loop_latch_edge (loop), UNKNOWN_LOCATION);
4458 if (dump_enabled_p ())
4459 dump_printf_loc (MSG_NOTE, vect_location,
4460 "transform reduction: created def-use cycle: %G%G",
4461 phi, SSA_NAME_DEF_STMT (def));
4465 /* For cond reductions we want to create a new vector (INDEX_COND_EXPR)
4466 which is updated with the current index of the loop for every match of
4467 the original loop's cond_expr (VEC_STMT). This results in a vector
4468 containing the last time the condition passed for that vector lane.
4469 The first match will be a 1 to allow 0 to be used for non-matching
4470 indexes. If there are no matches at all then the vector will be all
4471 zeroes. */
4472 if (STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info) == COND_REDUCTION)
4474 tree indx_before_incr, indx_after_incr;
4475 poly_uint64 nunits_out = TYPE_VECTOR_SUBPARTS (vectype);
4477 gimple *vec_stmt = STMT_VINFO_VEC_STMT (stmt_info)->stmt;
4478 gcc_assert (gimple_assign_rhs_code (vec_stmt) == VEC_COND_EXPR);
4480 int scalar_precision
4481 = GET_MODE_PRECISION (SCALAR_TYPE_MODE (TREE_TYPE (vectype)));
4482 tree cr_index_scalar_type = make_unsigned_type (scalar_precision);
4483 tree cr_index_vector_type = build_vector_type
4484 (cr_index_scalar_type, TYPE_VECTOR_SUBPARTS (vectype));
4486 /* First we create a simple vector induction variable which starts
4487 with the values {1,2,3,...} (SERIES_VECT) and increments by the
4488 vector size (STEP). */
4490 /* Create a {1,2,3,...} vector. */
4491 tree series_vect = build_index_vector (cr_index_vector_type, 1, 1);
4493 /* Create a vector of the step value. */
4494 tree step = build_int_cst (cr_index_scalar_type, nunits_out);
4495 tree vec_step = build_vector_from_val (cr_index_vector_type, step);
4497 /* Create an induction variable. */
4498 gimple_stmt_iterator incr_gsi;
4499 bool insert_after;
4500 standard_iv_increment_position (loop, &incr_gsi, &insert_after);
4501 create_iv (series_vect, vec_step, NULL_TREE, loop, &incr_gsi,
4502 insert_after, &indx_before_incr, &indx_after_incr);
4504 /* Next create a new phi node vector (NEW_PHI_TREE) which starts
4505 filled with zeros (VEC_ZERO). */
4507 /* Create a vector of 0s. */
4508 tree zero = build_zero_cst (cr_index_scalar_type);
4509 tree vec_zero = build_vector_from_val (cr_index_vector_type, zero);
4511 /* Create a vector phi node. */
4512 tree new_phi_tree = make_ssa_name (cr_index_vector_type);
4513 new_phi = create_phi_node (new_phi_tree, loop->header);
4514 loop_vinfo->add_stmt (new_phi);
4515 add_phi_arg (as_a <gphi *> (new_phi), vec_zero,
4516 loop_preheader_edge (loop), UNKNOWN_LOCATION);
4518 /* Now take the condition from the loop's original cond_expr
4519 (VEC_STMT) and produce a new cond_expr (INDEX_COND_EXPR) which for
4520 every match uses values from the induction variable
4521 (INDEX_BEFORE_INCR) otherwise uses values from the phi node
4522 (NEW_PHI_TREE).
4523 Finally, we update the phi (NEW_PHI_TREE) to take the value of
4524 the new cond_expr (INDEX_COND_EXPR). */
4526 /* Duplicate the condition from vec_stmt. */
4527 tree ccompare = unshare_expr (gimple_assign_rhs1 (vec_stmt));
4529 /* Create a conditional, where the condition is taken from vec_stmt
4530 (CCOMPARE), the then-value is the induction index (INDEX_BEFORE_INCR)
4531 and the else-value is the phi (NEW_PHI_TREE). */
4532 tree index_cond_expr = build3 (VEC_COND_EXPR, cr_index_vector_type,
4533 ccompare, indx_before_incr,
4534 new_phi_tree);
4535 induction_index = make_ssa_name (cr_index_vector_type);
4536 gimple *index_condition = gimple_build_assign (induction_index,
4537 index_cond_expr);
4538 gsi_insert_before (&incr_gsi, index_condition, GSI_SAME_STMT);
4539 stmt_vec_info index_vec_info = loop_vinfo->add_stmt (index_condition);
4540 STMT_VINFO_VECTYPE (index_vec_info) = cr_index_vector_type;
4542 /* Update the phi with the vec cond. */
4543 add_phi_arg (as_a <gphi *> (new_phi), induction_index,
4544 loop_latch_edge (loop), UNKNOWN_LOCATION);
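/* In scalar terms, for each lane L the statements above compute

     index[L] = cond[L] ? iv[L] : index[L];

   with IV starting at { 1, 2, ..., nunits } and stepping by nunits each
   vector iteration, so after the loop INDEX[L] holds the 1-based position
   of the last scalar iteration whose condition matched in lane L, or 0 if
   it never matched there.  (Editorial summary of the code above.)  */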
4547 /* 2. Create epilog code.
4548 The reduction epilog code operates across the elements of the vector
4549 of partial results computed by the vectorized loop.
4550 The reduction epilog code consists of:
4552 step 1: compute the scalar result in a vector (v_out2)
4553 step 2: extract the scalar result (s_out3) from the vector (v_out2)
4554 step 3: adjust the scalar result (s_out3) if needed.
4556 Step 1 can be accomplished using one the following three schemes:
4557 (scheme 1) using reduc_fn, if available.
4558 (scheme 2) using whole-vector shifts, if available.
4559 (scheme 3) using a scalar loop. In this case steps 1+2 above are
4560 combined.
4562 The overall epilog code looks like this:
4564 s_out0 = phi <s_loop> # original EXIT_PHI
4565 v_out1 = phi <VECT_DEF> # NEW_EXIT_PHI
4566 v_out2 = reduce <v_out1> # step 1
4567 s_out3 = extract_field <v_out2, 0> # step 2
4568 s_out4 = adjust_result <s_out3> # step 3
4570 (step 3 is optional, and steps 1 and 2 may be combined).
4571 Lastly, the uses of s_out0 are replaced by s_out4. */
4574 /* 2.1 Create new loop-exit-phis to preserve loop-closed form:
4575 v_out1 = phi <VECT_DEF>
4576 Store them in NEW_PHIS. */
4578 exit_bb = single_exit (loop)->dest;
4579 prev_phi_info = NULL;
4580 new_phis.create (vect_defs.length ());
4581 FOR_EACH_VEC_ELT (vect_defs, i, def)
4583 for (j = 0; j < ncopies; j++)
4585 tree new_def = copy_ssa_name (def);
4586 phi = create_phi_node (new_def, exit_bb);
4587 stmt_vec_info phi_info = loop_vinfo->add_stmt (phi);
4588 if (j == 0)
4589 new_phis.quick_push (phi);
4590 else
4592 def = vect_get_vec_def_for_stmt_copy (loop_vinfo, def);
4593 STMT_VINFO_RELATED_STMT (prev_phi_info) = phi_info;
4596 SET_PHI_ARG_DEF (phi, single_exit (loop)->dest_idx, def);
4597 prev_phi_info = phi_info;
4601 /* The epilogue is created for the outer-loop, i.e., for the loop being
4602 vectorized. Create exit phis for the outer loop. */
4603 if (double_reduc)
4605 loop = outer_loop;
4606 exit_bb = single_exit (loop)->dest;
4607 inner_phis.create (vect_defs.length ());
4608 FOR_EACH_VEC_ELT (new_phis, i, phi)
4610 stmt_vec_info phi_info = loop_vinfo->lookup_stmt (phi);
4611 tree new_result = copy_ssa_name (PHI_RESULT (phi));
4612 gphi *outer_phi = create_phi_node (new_result, exit_bb);
4613 SET_PHI_ARG_DEF (outer_phi, single_exit (loop)->dest_idx,
4614 PHI_RESULT (phi));
4615 prev_phi_info = loop_vinfo->add_stmt (outer_phi);
4616 inner_phis.quick_push (phi_info);
4617 new_phis[i] = outer_phi;
4618 while (STMT_VINFO_RELATED_STMT (phi_info))
4620 phi_info = STMT_VINFO_RELATED_STMT (phi_info);
4621 new_result = copy_ssa_name (PHI_RESULT (phi_info->stmt));
4622 outer_phi = create_phi_node (new_result, exit_bb);
4623 SET_PHI_ARG_DEF (outer_phi, single_exit (loop)->dest_idx,
4624 PHI_RESULT (phi_info->stmt));
4625 stmt_vec_info outer_phi_info = loop_vinfo->add_stmt (outer_phi);
4626 STMT_VINFO_RELATED_STMT (prev_phi_info) = outer_phi_info;
4627 prev_phi_info = outer_phi_info;
4632 exit_gsi = gsi_after_labels (exit_bb);
4634 /* 2.2 Get the relevant tree-code to use in the epilog for schemes 2,3
4635 (i.e. when reduc_fn is not available) and in the final adjustment
4636 code (if needed). Also get the original scalar reduction variable as
4637 defined in the loop. In case STMT is a "pattern-stmt" (i.e. - it
4638 represents a reduction pattern), the tree-code and scalar-def are
4639 taken from the original stmt that the pattern-stmt (STMT) replaces.
4640 Otherwise (it is a regular reduction) - the tree-code and scalar-def
4641 are taken from STMT. */
4643 stmt_vec_info orig_stmt_info = vect_orig_stmt (stmt_info);
4644 if (orig_stmt_info != stmt_info)
4646 /* Reduction pattern */
4647 gcc_assert (STMT_VINFO_IN_PATTERN_P (orig_stmt_info));
4648 gcc_assert (STMT_VINFO_RELATED_STMT (orig_stmt_info) == stmt_info);
4651 code = gimple_assign_rhs_code (orig_stmt_info->stmt);
4652 /* For MINUS_EXPR the initial vector is [init_val,0,...,0], therefore,
4653 partial results are added and not subtracted. */
4654 if (code == MINUS_EXPR)
4655 code = PLUS_EXPR;
4657 scalar_dest = gimple_assign_lhs (orig_stmt_info->stmt);
4658 scalar_type = TREE_TYPE (scalar_dest);
4659 scalar_results.create (group_size);
4660 new_scalar_dest = vect_create_destination_var (scalar_dest, NULL);
4661 bitsize = TYPE_SIZE (scalar_type);
4663 /* In case this is a reduction in an inner-loop while vectorizing an outer
4664 loop - we don't need to extract a single scalar result at the end of the
4665 inner-loop (unless it is double reduction, i.e., the use of reduction is
4666 outside the outer-loop). The final vector of partial results will be used
4667 in the vectorized outer-loop, or reduced to a scalar result at the end of
4668 the outer-loop. */
4669 if (nested_in_vect_loop && !double_reduc)
4670 goto vect_finalize_reduction;
4672 /* SLP reduction without reduction chain, e.g.,
4673 # a1 = phi <a2, a0>
4674 # b1 = phi <b2, b0>
4675 a2 = operation (a1)
4676 b2 = operation (b1) */
4677 slp_reduc = (slp_node && !REDUC_GROUP_FIRST_ELEMENT (stmt_info));
4679 /* True if we should implement SLP_REDUC using native reduction operations
4680 instead of scalar operations. */
4681 direct_slp_reduc = (reduc_fn != IFN_LAST
4682 && slp_reduc
4683 && !TYPE_VECTOR_SUBPARTS (vectype).is_constant ());
4685 /* In case of reduction chain, e.g.,
4686 # a1 = phi <a3, a0>
4687 a2 = operation (a1)
4688 a3 = operation (a2),
4690 we may end up with more than one vector result. Here we reduce them to
4691 one vector. */
4692 if (REDUC_GROUP_FIRST_ELEMENT (stmt_info) || direct_slp_reduc)
4694 tree first_vect = PHI_RESULT (new_phis[0]);
4695 gassign *new_vec_stmt = NULL;
4696 vec_dest = vect_create_destination_var (scalar_dest, vectype);
4697 for (k = 1; k < new_phis.length (); k++)
4699 gimple *next_phi = new_phis[k];
4700 tree second_vect = PHI_RESULT (next_phi);
4701 tree tem = make_ssa_name (vec_dest, new_vec_stmt);
4702 new_vec_stmt = gimple_build_assign (tem, code,
4703 first_vect, second_vect);
4704 gsi_insert_before (&exit_gsi, new_vec_stmt, GSI_SAME_STMT);
4705 first_vect = tem;
4708 new_phi_result = first_vect;
4709 if (new_vec_stmt)
4711 new_phis.truncate (0);
4712 new_phis.safe_push (new_vec_stmt);
4715 /* Likewise if we couldn't use a single def-use cycle. */
4716 else if (ncopies > 1)
4718 gcc_assert (new_phis.length () == 1);
4719 tree first_vect = PHI_RESULT (new_phis[0]);
4720 gassign *new_vec_stmt = NULL;
4721 vec_dest = vect_create_destination_var (scalar_dest, vectype);
4722 stmt_vec_info next_phi_info = loop_vinfo->lookup_stmt (new_phis[0]);
4723 for (int k = 1; k < ncopies; ++k)
4725 next_phi_info = STMT_VINFO_RELATED_STMT (next_phi_info);
4726 tree second_vect = PHI_RESULT (next_phi_info->stmt);
4727 tree tem = make_ssa_name (vec_dest, new_vec_stmt);
4728 new_vec_stmt = gimple_build_assign (tem, code,
4729 first_vect, second_vect);
4730 gsi_insert_before (&exit_gsi, new_vec_stmt, GSI_SAME_STMT);
4731 first_vect = tem;
4733 new_phi_result = first_vect;
4734 new_phis.truncate (0);
4735 new_phis.safe_push (new_vec_stmt);
4737 else
4738 new_phi_result = PHI_RESULT (new_phis[0]);
4740 if (STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info) == COND_REDUCTION
4741 && reduc_fn != IFN_LAST)
4743 /* For condition reductions, we have a vector (NEW_PHI_RESULT) containing
4744 various data values where the condition matched and another vector
4745 (INDUCTION_INDEX) containing all the indexes of those matches. We
4746 need to extract the last matching index (which will be the index with
4747 highest value) and use this to index into the data vector.
4748 For the case where there were no matches, the data vector will contain
4749 all default values and the index vector will be all zeros. */
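/* Worked example (editorial, values chosen purely for illustration):
   with NEW_PHI_RESULT = { 0, 7, 0, 9 } and INDUCTION_INDEX = { 0, 2, 0, 4 },
   the maximum index is 4, the comparison selects only the last lane, the
   VEC_COND becomes { 0, 0, 0, 9 } and the final reduction yields 9, the
   data value of the last iteration whose condition matched.  */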
4751 /* Get various versions of the type of the vector of indexes. */
4752 tree index_vec_type = TREE_TYPE (induction_index);
4753 gcc_checking_assert (TYPE_UNSIGNED (index_vec_type));
4754 tree index_scalar_type = TREE_TYPE (index_vec_type);
4755 tree index_vec_cmp_type = build_same_sized_truth_vector_type
4756 (index_vec_type);
4758 /* Get an unsigned integer version of the type of the data vector. */
4759 int scalar_precision
4760 = GET_MODE_PRECISION (SCALAR_TYPE_MODE (scalar_type));
4761 tree scalar_type_unsigned = make_unsigned_type (scalar_precision);
4762 tree vectype_unsigned = build_vector_type
4763 (scalar_type_unsigned, TYPE_VECTOR_SUBPARTS (vectype));
4765 /* First we need to create a vector (ZERO_VEC) of zeros and another
4766 vector (MAX_INDEX_VEC) filled with the last matching index, which we
4767 can create using a MAX reduction and then expanding.
4768 In the case where the loop never made any matches, the max index will
4769 be zero. */
4771 /* Vector of {0, 0, 0,...}. */
4772 tree zero_vec = make_ssa_name (vectype);
4773 tree zero_vec_rhs = build_zero_cst (vectype);
4774 gimple *zero_vec_stmt = gimple_build_assign (zero_vec, zero_vec_rhs);
4775 gsi_insert_before (&exit_gsi, zero_vec_stmt, GSI_SAME_STMT);
4777 /* Find maximum value from the vector of found indexes. */
4778 tree max_index = make_ssa_name (index_scalar_type);
4779 gcall *max_index_stmt = gimple_build_call_internal (IFN_REDUC_MAX,
4780 1, induction_index);
4781 gimple_call_set_lhs (max_index_stmt, max_index);
4782 gsi_insert_before (&exit_gsi, max_index_stmt, GSI_SAME_STMT);
4784 /* Vector of {max_index, max_index, max_index,...}. */
4785 tree max_index_vec = make_ssa_name (index_vec_type);
4786 tree max_index_vec_rhs = build_vector_from_val (index_vec_type,
4787 max_index);
4788 gimple *max_index_vec_stmt = gimple_build_assign (max_index_vec,
4789 max_index_vec_rhs);
4790 gsi_insert_before (&exit_gsi, max_index_vec_stmt, GSI_SAME_STMT);
4792 /* Next we compare the new vector (MAX_INDEX_VEC) full of max indexes
4793 with the vector (INDUCTION_INDEX) of found indexes, choosing values
4794 from the data vector (NEW_PHI_RESULT) for matches, 0 (ZERO_VEC)
4795 otherwise. Only one value should match, resulting in a vector
4796 (VEC_COND) with one data value and the rest zeros.
4797 In the case where the loop never made any matches, every index will
4798 match, resulting in a vector with all data values (which will all be
4799 the default value). */
4801 /* Compare the max index vector to the vector of found indexes to find
4802 the position of the max value. */
4803 tree vec_compare = make_ssa_name (index_vec_cmp_type);
4804 gimple *vec_compare_stmt = gimple_build_assign (vec_compare, EQ_EXPR,
4805 induction_index,
4806 max_index_vec);
4807 gsi_insert_before (&exit_gsi, vec_compare_stmt, GSI_SAME_STMT);
4809 /* Use the compare to choose either values from the data vector or
4810 zero. */
4811 tree vec_cond = make_ssa_name (vectype);
4812 gimple *vec_cond_stmt = gimple_build_assign (vec_cond, VEC_COND_EXPR,
4813 vec_compare, new_phi_result,
4814 zero_vec);
4815 gsi_insert_before (&exit_gsi, vec_cond_stmt, GSI_SAME_STMT);
4817 /* Finally we need to extract the data value from the vector (VEC_COND)
4818 into a scalar (MATCHED_DATA_REDUC). Logically we want to do an OR
4819 reduction, but because this doesn't exist, we can use a MAX reduction
4820 instead. The data value might be signed or a float so we need to cast
4821 it first.
4822 In the case where the loop never made any matches, the data values are
4823 all identical, and so will reduce down correctly. */
4825 /* Make the matched data values unsigned. */
4826 tree vec_cond_cast = make_ssa_name (vectype_unsigned);
4827 tree vec_cond_cast_rhs = build1 (VIEW_CONVERT_EXPR, vectype_unsigned,
4828 vec_cond);
4829 gimple *vec_cond_cast_stmt = gimple_build_assign (vec_cond_cast,
4830 VIEW_CONVERT_EXPR,
4831 vec_cond_cast_rhs);
4832 gsi_insert_before (&exit_gsi, vec_cond_cast_stmt, GSI_SAME_STMT);
4834 /* Reduce down to a scalar value. */
4835 tree data_reduc = make_ssa_name (scalar_type_unsigned);
4836 gcall *data_reduc_stmt = gimple_build_call_internal (IFN_REDUC_MAX,
4837 1, vec_cond_cast);
4838 gimple_call_set_lhs (data_reduc_stmt, data_reduc);
4839 gsi_insert_before (&exit_gsi, data_reduc_stmt, GSI_SAME_STMT);
4841 /* Convert the reduced value back to the result type and set as the
4842 result. */
4843 gimple_seq stmts = NULL;
4844 new_temp = gimple_build (&stmts, VIEW_CONVERT_EXPR, scalar_type,
4845 data_reduc);
4846 gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
4847 scalar_results.safe_push (new_temp);
4849 else if (STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info) == COND_REDUCTION
4850 && reduc_fn == IFN_LAST)
4852 /* Condition reduction without supported IFN_REDUC_MAX. Generate
4853 idx = 0;
4854 idx_val = induction_index[0];
4855 val = data_reduc[0];
4856 for (idx = 0, val = init, i = 0; i < nelts; ++i)
4857 if (induction_index[i] > idx_val)
4858 val = data_reduc[i], idx_val = induction_index[i];
4859 return val; */
4861 tree data_eltype = TREE_TYPE (TREE_TYPE (new_phi_result));
4862 tree idx_eltype = TREE_TYPE (TREE_TYPE (induction_index));
4863 unsigned HOST_WIDE_INT el_size = tree_to_uhwi (TYPE_SIZE (idx_eltype));
4864 poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (TREE_TYPE (induction_index));
4865 /* Enforced by vectorizable_reduction, which ensures we have target
4866 support before allowing a conditional reduction on variable-length
4867 vectors. */
4868 unsigned HOST_WIDE_INT v_size = el_size * nunits.to_constant ();
4869 tree idx_val = NULL_TREE, val = NULL_TREE;
4870 for (unsigned HOST_WIDE_INT off = 0; off < v_size; off += el_size)
4872 tree old_idx_val = idx_val;
4873 tree old_val = val;
4874 idx_val = make_ssa_name (idx_eltype);
4875 epilog_stmt = gimple_build_assign (idx_val, BIT_FIELD_REF,
4876 build3 (BIT_FIELD_REF, idx_eltype,
4877 induction_index,
4878 bitsize_int (el_size),
4879 bitsize_int (off)));
4880 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
4881 val = make_ssa_name (data_eltype);
4882 epilog_stmt = gimple_build_assign (val, BIT_FIELD_REF,
4883 build3 (BIT_FIELD_REF,
4884 data_eltype,
4885 new_phi_result,
4886 bitsize_int (el_size),
4887 bitsize_int (off)));
4888 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
4889 if (off != 0)
4891 tree new_idx_val = idx_val;
4892 tree new_val = val;
4893 if (off != v_size - el_size)
4895 new_idx_val = make_ssa_name (idx_eltype);
4896 epilog_stmt = gimple_build_assign (new_idx_val,
4897 MAX_EXPR, idx_val,
4898 old_idx_val);
4899 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
4901 new_val = make_ssa_name (data_eltype);
4902 epilog_stmt = gimple_build_assign (new_val,
4903 COND_EXPR,
4904 build2 (GT_EXPR,
4905 boolean_type_node,
4906 idx_val,
4907 old_idx_val),
4908 val, old_val);
4909 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
4910 idx_val = new_idx_val;
4911 val = new_val;
4914 /* Convert the reduced value back to the result type and set as the
4915 result. */
4916 gimple_seq stmts = NULL;
4917 val = gimple_convert (&stmts, scalar_type, val);
4918 gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
4919 scalar_results.safe_push (val);
4922 /* 2.3 Create the reduction code, using one of the three schemes described
4923 above. In SLP we simply need to extract all the elements from the
4924 vector (without reducing them), so we use scalar shifts. */
4925 else if (reduc_fn != IFN_LAST && !slp_reduc)
4927 tree tmp;
4928 tree vec_elem_type;
4930 /* Case 1: Create:
4931 v_out2 = reduc_expr <v_out1> */
4933 if (dump_enabled_p ())
4934 dump_printf_loc (MSG_NOTE, vect_location,
4935 "Reduce using direct vector reduction.\n");
4937 vec_elem_type = TREE_TYPE (TREE_TYPE (new_phi_result));
4938 if (!useless_type_conversion_p (scalar_type, vec_elem_type))
4940 tree tmp_dest
4941 = vect_create_destination_var (scalar_dest, vec_elem_type);
4942 epilog_stmt = gimple_build_call_internal (reduc_fn, 1,
4943 new_phi_result);
4944 gimple_set_lhs (epilog_stmt, tmp_dest);
4945 new_temp = make_ssa_name (tmp_dest, epilog_stmt);
4946 gimple_set_lhs (epilog_stmt, new_temp);
4947 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
4949 epilog_stmt = gimple_build_assign (new_scalar_dest, NOP_EXPR,
4950 new_temp);
4952 else
4954 epilog_stmt = gimple_build_call_internal (reduc_fn, 1,
4955 new_phi_result);
4956 gimple_set_lhs (epilog_stmt, new_scalar_dest);
4959 new_temp = make_ssa_name (new_scalar_dest, epilog_stmt);
4960 gimple_set_lhs (epilog_stmt, new_temp);
4961 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
4963 if ((STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info)
4964 == INTEGER_INDUC_COND_REDUCTION)
4965 && !operand_equal_p (initial_def, induc_val, 0))
4967 /* Earlier we set the initial value to be a vector of induc_val
4968 values. Check the result and if it is induc_val then replace
4969 with the original initial value, unless induc_val is
4970 the same as initial_def already. */
4971 tree zcompare = build2 (EQ_EXPR, boolean_type_node, new_temp,
4972 induc_val);
4974 tmp = make_ssa_name (new_scalar_dest);
4975 epilog_stmt = gimple_build_assign (tmp, COND_EXPR, zcompare,
4976 initial_def, new_temp);
4977 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
4978 new_temp = tmp;
4981 scalar_results.safe_push (new_temp);
4983 else if (direct_slp_reduc)
4985 /* Here we create one vector for each of the REDUC_GROUP_SIZE results,
4986 with the elements for other SLP statements replaced with the
4987 neutral value. We can then do a normal reduction on each vector. */
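/* Worked example (editorial): for a sum reduction with GROUP_SIZE = 2 and
   NEW_PHI_RESULT = { a0, b0, a1, b1 }, the masked index is { 0, 1, 0, 1 },
   so iteration i = 0 reduces { a0, 0, a1, 0 } to a0 + a1 and iteration
   i = 1 reduces { 0, b0, 0, b1 } to b0 + b1; zero is the neutral value
   assumed for a sum.  */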
4989 /* Enforced by vectorizable_reduction. */
4990 gcc_assert (new_phis.length () == 1);
4991 gcc_assert (pow2p_hwi (group_size));
4993 slp_tree orig_phis_slp_node = slp_node_instance->reduc_phis;
4994 vec<stmt_vec_info> orig_phis
4995 = SLP_TREE_SCALAR_STMTS (orig_phis_slp_node);
4996 gimple_seq seq = NULL;
4998 /* Build a vector {0, 1, 2, ...}, with the same number of elements
4999 and the same element size as VECTYPE. */
5000 tree index = build_index_vector (vectype, 0, 1);
5001 tree index_type = TREE_TYPE (index);
5002 tree index_elt_type = TREE_TYPE (index_type);
5003 tree mask_type = build_same_sized_truth_vector_type (index_type);
5005 /* Create a vector that, for each element, identifies which of
5006 the REDUC_GROUP_SIZE results should use it. */
5007 tree index_mask = build_int_cst (index_elt_type, group_size - 1);
5008 index = gimple_build (&seq, BIT_AND_EXPR, index_type, index,
5009 build_vector_from_val (index_type, index_mask));
5011 /* Get a neutral vector value. This is simply a splat of the neutral
5012 scalar value if we have one, otherwise the initial scalar value
5013 is itself a neutral value. */
5014 tree vector_identity = NULL_TREE;
5015 if (neutral_op)
5016 vector_identity = gimple_build_vector_from_val (&seq, vectype,
5017 neutral_op);
5018 for (unsigned int i = 0; i < group_size; ++i)
5020 /* If there's no universal neutral value, we can use the
5021 initial scalar value from the original PHI. This is used
5022 for MIN and MAX reduction, for example. */
5023 if (!neutral_op)
5025 tree scalar_value
5026 = PHI_ARG_DEF_FROM_EDGE (orig_phis[i]->stmt,
5027 loop_preheader_edge (loop));
5028 vector_identity = gimple_build_vector_from_val (&seq, vectype,
5029 scalar_value);
5032 /* Calculate the equivalent of:
5034 sel[j] = (index[j] == i);
5036 which selects the elements of NEW_PHI_RESULT that should
5037 be included in the result. */
5038 tree compare_val = build_int_cst (index_elt_type, i);
5039 compare_val = build_vector_from_val (index_type, compare_val);
5040 tree sel = gimple_build (&seq, EQ_EXPR, mask_type,
5041 index, compare_val);
5043 /* Calculate the equivalent of:
5045 vec = sel ? new_phi_result : vector_identity;
5047 VEC is now suitable for a full vector reduction. */
5048 tree vec = gimple_build (&seq, VEC_COND_EXPR, vectype,
5049 sel, new_phi_result, vector_identity);
5051 /* Do the reduction and convert it to the appropriate type. */
5052 tree scalar = gimple_build (&seq, as_combined_fn (reduc_fn),
5053 TREE_TYPE (vectype), vec);
5054 scalar = gimple_convert (&seq, scalar_type, scalar);
5055 scalar_results.safe_push (scalar);
5057 gsi_insert_seq_before (&exit_gsi, seq, GSI_SAME_STMT);
5059 else
5061 bool reduce_with_shift;
5062 tree vec_temp;
5064 /* COND reductions all do the final reduction with MAX_EXPR
5065 or MIN_EXPR. */
5066 if (code == COND_EXPR)
5068 if (STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info)
5069 == INTEGER_INDUC_COND_REDUCTION)
5070 code = induc_code;
5071 else if (STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info)
5072 == CONST_COND_REDUCTION)
5073 code = STMT_VINFO_VEC_CONST_COND_REDUC_CODE (stmt_info);
5074 else
5075 code = MAX_EXPR;
5078 /* See if the target wants to do the final (shift) reduction
5079 in a vector mode of smaller size and first reduce upper/lower
5080 halves against each other. */
5081 enum machine_mode mode1 = mode;
5082 tree vectype1 = vectype;
5083 unsigned sz = tree_to_uhwi (TYPE_SIZE_UNIT (vectype));
5084 unsigned sz1 = sz;
5085 if (!slp_reduc
5086 && (mode1 = targetm.vectorize.split_reduction (mode)) != mode)
5087 sz1 = GET_MODE_SIZE (mode1).to_constant ();
5089 vectype1 = get_vectype_for_scalar_type_and_size (scalar_type, sz1);
5090 reduce_with_shift = have_whole_vector_shift (mode1);
5091 if (!VECTOR_MODE_P (mode1))
5092 reduce_with_shift = false;
5093 else
5095 optab optab = optab_for_tree_code (code, vectype1, optab_default);
5096 if (optab_handler (optab, mode1) == CODE_FOR_nothing)
5097 reduce_with_shift = false;
5100 /* First reduce the vector to the vector size we should do the shift
5101 reduction on, by combining upper and lower halves. */
5102 new_temp = new_phi_result;
5103 while (sz > sz1)
5105 gcc_assert (!slp_reduc);
5106 sz /= 2;
5107 vectype1 = get_vectype_for_scalar_type_and_size (scalar_type, sz);
5109 /* The target has to make sure we support lowpart/highpart
5110 extraction, either via direct vector extract or through
5111 integer mode punning. */
5112 tree dst1, dst2;
5113 if (convert_optab_handler (vec_extract_optab,
5114 TYPE_MODE (TREE_TYPE (new_temp)),
5115 TYPE_MODE (vectype1))
5116 != CODE_FOR_nothing)
5118 /* Extract sub-vectors directly once vec_extract becomes
5119 a conversion optab. */
5120 dst1 = make_ssa_name (vectype1);
5121 epilog_stmt
5122 = gimple_build_assign (dst1, BIT_FIELD_REF,
5123 build3 (BIT_FIELD_REF, vectype1,
5124 new_temp, TYPE_SIZE (vectype1),
5125 bitsize_int (0)));
5126 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5127 dst2 = make_ssa_name (vectype1);
5128 epilog_stmt
5129 = gimple_build_assign (dst2, BIT_FIELD_REF,
5130 build3 (BIT_FIELD_REF, vectype1,
5131 new_temp, TYPE_SIZE (vectype1),
5132 bitsize_int (sz * BITS_PER_UNIT)));
5133 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5135 else
5137 /* Extract via punning to an appropriately sized integer mode
5138 vector. */
5139 tree eltype = build_nonstandard_integer_type (sz * BITS_PER_UNIT,
5140 1);
5141 tree etype = build_vector_type (eltype, 2);
5142 gcc_assert (convert_optab_handler (vec_extract_optab,
5143 TYPE_MODE (etype),
5144 TYPE_MODE (eltype))
5145 != CODE_FOR_nothing);
5146 tree tem = make_ssa_name (etype);
5147 epilog_stmt = gimple_build_assign (tem, VIEW_CONVERT_EXPR,
5148 build1 (VIEW_CONVERT_EXPR,
5149 etype, new_temp));
5150 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5151 new_temp = tem;
5152 tem = make_ssa_name (eltype);
5153 epilog_stmt
5154 = gimple_build_assign (tem, BIT_FIELD_REF,
5155 build3 (BIT_FIELD_REF, eltype,
5156 new_temp, TYPE_SIZE (eltype),
5157 bitsize_int (0)));
5158 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5159 dst1 = make_ssa_name (vectype1);
5160 epilog_stmt = gimple_build_assign (dst1, VIEW_CONVERT_EXPR,
5161 build1 (VIEW_CONVERT_EXPR,
5162 vectype1, tem));
5163 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5164 tem = make_ssa_name (eltype);
5165 epilog_stmt
5166 = gimple_build_assign (tem, BIT_FIELD_REF,
5167 build3 (BIT_FIELD_REF, eltype,
5168 new_temp, TYPE_SIZE (eltype),
5169 bitsize_int (sz * BITS_PER_UNIT)));
5170 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5171 dst2 = make_ssa_name (vectype1);
5172 epilog_stmt = gimple_build_assign (dst2, VIEW_CONVERT_EXPR,
5173 build1 (VIEW_CONVERT_EXPR,
5174 vectype1, tem));
5175 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5178 new_temp = make_ssa_name (vectype1);
5179 epilog_stmt = gimple_build_assign (new_temp, code, dst1, dst2);
5180 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5183 if (reduce_with_shift && !slp_reduc)
5185 int element_bitsize = tree_to_uhwi (bitsize);
5186 /* Enforced by vectorizable_reduction, which disallows SLP reductions
5187 for variable-length vectors and also requires direct target support
5188 for loop reductions. */
5189 int vec_size_in_bits = tree_to_uhwi (TYPE_SIZE (vectype1));
5190 int nelements = vec_size_in_bits / element_bitsize;
5191 vec_perm_builder sel;
5192 vec_perm_indices indices;
5194 int elt_offset;
5196 tree zero_vec = build_zero_cst (vectype1);
5197 /* Case 2: Create:
5198 for (offset = nelements/2; offset >= 1; offset/=2)
5200 Create: va' = vec_shift <va, offset>
5201 Create: va = vop <va, va'>
5202 } */
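	  /* A scalar model of the shift reduction emitted below, assuming for
	     illustration NELEMENTS == 8 and a PLUS reduction (zero is the
	     identity, so the zero elements shifted in are harmless):

	       for (int off = 8 / 2; off >= 1; off /= 2)
		 for (int k = 0; k < 8; k++)
		   v[k] += (k + off < 8 ? v[k + off] : 0);
	       result = v[0];   // extracted via BIT_FIELD_REF below
	  */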
5204 tree rhs;
5206 if (dump_enabled_p ())
5207 dump_printf_loc (MSG_NOTE, vect_location,
5208 "Reduce using vector shifts\n");
5210 mode1 = TYPE_MODE (vectype1);
5211 vec_dest = vect_create_destination_var (scalar_dest, vectype1);
5212 for (elt_offset = nelements / 2;
5213 elt_offset >= 1;
5214 elt_offset /= 2)
5216 calc_vec_perm_mask_for_shift (elt_offset, nelements, &sel);
5217 indices.new_vector (sel, 2, nelements);
5218 tree mask = vect_gen_perm_mask_any (vectype1, indices);
5219 epilog_stmt = gimple_build_assign (vec_dest, VEC_PERM_EXPR,
5220 new_temp, zero_vec, mask);
5221 new_name = make_ssa_name (vec_dest, epilog_stmt);
5222 gimple_assign_set_lhs (epilog_stmt, new_name);
5223 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5225 epilog_stmt = gimple_build_assign (vec_dest, code, new_name,
5226 new_temp);
5227 new_temp = make_ssa_name (vec_dest, epilog_stmt);
5228 gimple_assign_set_lhs (epilog_stmt, new_temp);
5229 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5232 /* 2.4 Extract the final scalar result. Create:
5233 s_out3 = extract_field <v_out2, bitpos> */
5235 if (dump_enabled_p ())
5236 dump_printf_loc (MSG_NOTE, vect_location,
5237 "extract scalar result\n");
5239 rhs = build3 (BIT_FIELD_REF, scalar_type, new_temp,
5240 bitsize, bitsize_zero_node);
5241 epilog_stmt = gimple_build_assign (new_scalar_dest, rhs);
5242 new_temp = make_ssa_name (new_scalar_dest, epilog_stmt);
5243 gimple_assign_set_lhs (epilog_stmt, new_temp);
5244 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5245 scalar_results.safe_push (new_temp);
5247 else
5249 /* Case 3: Create:
5250 s = extract_field <v_out2, 0>
5251 for (offset = element_size;
5252 offset < vector_size;
5253 offset += element_size;)
5255 Create: s' = extract_field <v_out2, offset>
5256 Create: s = op <s, s'> // For non-SLP cases
5257 } */
5259 if (dump_enabled_p ())
5260 dump_printf_loc (MSG_NOTE, vect_location,
5261 "Reduce using scalar code.\n");
5263 int vec_size_in_bits = tree_to_uhwi (TYPE_SIZE (vectype1));
5264 int element_bitsize = tree_to_uhwi (bitsize);
5265 FOR_EACH_VEC_ELT (new_phis, i, new_phi)
5267 int bit_offset;
5268 if (gimple_code (new_phi) == GIMPLE_PHI)
5269 vec_temp = PHI_RESULT (new_phi);
5270 else
5271 vec_temp = gimple_assign_lhs (new_phi);
5272 tree rhs = build3 (BIT_FIELD_REF, scalar_type, vec_temp, bitsize,
5273 bitsize_zero_node);
5274 epilog_stmt = gimple_build_assign (new_scalar_dest, rhs);
5275 new_temp = make_ssa_name (new_scalar_dest, epilog_stmt);
5276 gimple_assign_set_lhs (epilog_stmt, new_temp);
5277 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5279 /* In SLP we don't need to apply the reduction operation, so we just
5280 collect s' values in SCALAR_RESULTS. */
5281 if (slp_reduc)
5282 scalar_results.safe_push (new_temp);
5284 for (bit_offset = element_bitsize;
5285 bit_offset < vec_size_in_bits;
5286 bit_offset += element_bitsize)
5288 tree bitpos = bitsize_int (bit_offset);
5289 tree rhs = build3 (BIT_FIELD_REF, scalar_type, vec_temp,
5290 bitsize, bitpos);
5292 epilog_stmt = gimple_build_assign (new_scalar_dest, rhs);
5293 new_name = make_ssa_name (new_scalar_dest, epilog_stmt);
5294 gimple_assign_set_lhs (epilog_stmt, new_name);
5295 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5297 if (slp_reduc)
5299 /* In SLP we don't need to apply the reduction operation, so
5300 we just collect s' values in SCALAR_RESULTS. */
5301 new_temp = new_name;
5302 scalar_results.safe_push (new_name);
5304 else
5306 epilog_stmt = gimple_build_assign (new_scalar_dest, code,
5307 new_name, new_temp);
5308 new_temp = make_ssa_name (new_scalar_dest, epilog_stmt);
5309 gimple_assign_set_lhs (epilog_stmt, new_temp);
5310 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5315 /* The only case where we need to reduce scalar results in SLP, is
5316 unrolling. If the size of SCALAR_RESULTS is greater than
5317 REDUC_GROUP_SIZE, we reduce them combining elements modulo
5318 REDUC_GROUP_SIZE. */
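	  /* Worked example, for illustration: with REDUC_GROUP_SIZE == 2 and
	     four collected scalar results r0..r3, the loop below computes

	       scalar_results[0] = r0 CODE r2;
	       scalar_results[1] = r1 CODE r3;

	     i.e. the extra results produced by unrolling are folded back
	     modulo the group size.  */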
5319 if (slp_reduc)
5321 tree res, first_res, new_res;
5322 gimple *new_stmt;
5324 /* Reduce multiple scalar results in case of SLP unrolling. */
5325 for (j = group_size; scalar_results.iterate (j, &res);
5326 j++)
5328 first_res = scalar_results[j % group_size];
5329 new_stmt = gimple_build_assign (new_scalar_dest, code,
5330 first_res, res);
5331 new_res = make_ssa_name (new_scalar_dest, new_stmt);
5332 gimple_assign_set_lhs (new_stmt, new_res);
5333 gsi_insert_before (&exit_gsi, new_stmt, GSI_SAME_STMT);
5334 scalar_results[j % group_size] = new_res;
5337 else
5338 /* Not SLP - we have one scalar to keep in SCALAR_RESULTS. */
5339 scalar_results.safe_push (new_temp);
5342 if ((STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info)
5343 == INTEGER_INDUC_COND_REDUCTION)
5344 && !operand_equal_p (initial_def, induc_val, 0))
5346 /* Earlier we set the initial value to be a vector of induc_val
5347 values. Check the result and if it is induc_val then replace
5348 with the original initial value, unless induc_val is
5349 the same as initial_def already. */
5350 tree zcompare = build2 (EQ_EXPR, boolean_type_node, new_temp,
5351 induc_val);
5353 tree tmp = make_ssa_name (new_scalar_dest);
5354 epilog_stmt = gimple_build_assign (tmp, COND_EXPR, zcompare,
5355 initial_def, new_temp);
5356 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5357 scalar_results[0] = tmp;
5361 vect_finalize_reduction:
5363 if (double_reduc)
5364 loop = loop->inner;
5366 /* 2.5 Adjust the final result by the initial value of the reduction
5367 variable. (When such adjustment is not needed, then
5368 'adjustment_def' is zero). For example, if code is PLUS we create:
5369 new_temp = loop_exit_def + adjustment_def */
5371 if (adjustment_def)
5373 gcc_assert (!slp_reduc);
5374 if (nested_in_vect_loop)
5376 new_phi = new_phis[0];
5377 gcc_assert (TREE_CODE (TREE_TYPE (adjustment_def)) == VECTOR_TYPE);
5378 expr = build2 (code, vectype, PHI_RESULT (new_phi), adjustment_def);
5379 new_dest = vect_create_destination_var (scalar_dest, vectype);
5381 else
5383 new_temp = scalar_results[0];
5384 gcc_assert (TREE_CODE (TREE_TYPE (adjustment_def)) != VECTOR_TYPE);
5385 expr = build2 (code, scalar_type, new_temp, adjustment_def);
5386 new_dest = vect_create_destination_var (scalar_dest, scalar_type);
5389 epilog_stmt = gimple_build_assign (new_dest, expr);
5390 new_temp = make_ssa_name (new_dest, epilog_stmt);
5391 gimple_assign_set_lhs (epilog_stmt, new_temp);
5392 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5393 if (nested_in_vect_loop)
5395 stmt_vec_info epilog_stmt_info = loop_vinfo->add_stmt (epilog_stmt);
5396 STMT_VINFO_RELATED_STMT (epilog_stmt_info)
5397 = STMT_VINFO_RELATED_STMT (loop_vinfo->lookup_stmt (new_phi));
5399 if (!double_reduc)
5400 scalar_results.quick_push (new_temp);
5401 else
5402 scalar_results[0] = new_temp;
5404 else
5405 scalar_results[0] = new_temp;
5407 new_phis[0] = epilog_stmt;
5410 /* 2.6 Handle the loop-exit phis. Replace the uses of scalar loop-exit
5411 phis with new adjusted scalar results, i.e., replace use <s_out0>
5412 with use <s_out4>.
5414 Transform:
5415 loop_exit:
5416 s_out0 = phi <s_loop> # (scalar) EXIT_PHI
5417 v_out1 = phi <VECT_DEF> # NEW_EXIT_PHI
5418 v_out2 = reduce <v_out1>
5419 s_out3 = extract_field <v_out2, 0>
5420 s_out4 = adjust_result <s_out3>
5421 use <s_out0>
5422 use <s_out0>
5424 into:
5426 loop_exit:
5427 s_out0 = phi <s_loop> # (scalar) EXIT_PHI
5428 v_out1 = phi <VECT_DEF> # NEW_EXIT_PHI
5429 v_out2 = reduce <v_out1>
5430 s_out3 = extract_field <v_out2, 0>
5431 s_out4 = adjust_result <s_out3>
5432 use <s_out4>
5433 use <s_out4> */
5436 /* In SLP reduction chain we reduce vector results into one vector if
5437 necessary, hence we set here REDUC_GROUP_SIZE to 1. SCALAR_DEST is the
5438 LHS of the last stmt in the reduction chain, since we are looking for
5439 the loop exit phi node. */
5440 if (REDUC_GROUP_FIRST_ELEMENT (stmt_info))
5442 stmt_vec_info dest_stmt_info
5443 = vect_orig_stmt (SLP_TREE_SCALAR_STMTS (slp_node)[group_size - 1]);
5444 scalar_dest = gimple_assign_lhs (dest_stmt_info->stmt);
5445 group_size = 1;
5448 /* In SLP we may have several statements in NEW_PHIS and REDUCTION_PHIS (in
5449 case that REDUC_GROUP_SIZE is greater than vectorization factor).
5450 Therefore, we need to match SCALAR_RESULTS with corresponding statements.
5451 The first (REDUC_GROUP_SIZE / number of new vector stmts) scalar results
5452 correspond to the first vector stmt, etc.
5453 (RATIO is equal to (REDUC_GROUP_SIZE / number of new vector stmts)). */
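  /* Worked example, for illustration: with REDUC_GROUP_SIZE == 4 and two
     vector stmts in NEW_PHIS, RATIO == 2, so scalar results 0 and 1 are
     matched with new_phis[0] and scalar results 2 and 3 with new_phis[1]
     by the k / ratio indexing below.  */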
5454 if (group_size > new_phis.length ())
5456 ratio = group_size / new_phis.length ();
5457 gcc_assert (!(group_size % new_phis.length ()));
5459 else
5460 ratio = 1;
5462 stmt_vec_info epilog_stmt_info = NULL;
5463 for (k = 0; k < group_size; k++)
5465 if (k % ratio == 0)
5467 epilog_stmt_info = loop_vinfo->lookup_stmt (new_phis[k / ratio]);
5468 reduction_phi_info = reduction_phis[k / ratio];
5469 if (double_reduc)
5470 inner_phi = inner_phis[k / ratio];
5473 if (slp_reduc)
5475 stmt_vec_info scalar_stmt_info = SLP_TREE_SCALAR_STMTS (slp_node)[k];
5477 orig_stmt_info = STMT_VINFO_RELATED_STMT (scalar_stmt_info);
5478 /* SLP statements can't participate in patterns. */
5479 gcc_assert (!orig_stmt_info);
5480 scalar_dest = gimple_assign_lhs (scalar_stmt_info->stmt);
5483 phis.create (3);
5484 /* Find the loop-closed-use at the loop exit of the original scalar
5485 result. (The reduction result is expected to have two immediate uses -
5486 one at the latch block, and one at the loop exit). */
5487 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, scalar_dest)
5488 if (!flow_bb_inside_loop_p (loop, gimple_bb (USE_STMT (use_p)))
5489 && !is_gimple_debug (USE_STMT (use_p)))
5490 phis.safe_push (USE_STMT (use_p));
5492 /* While we expect to have found an exit_phi because of loop-closed-ssa
5493 form we can end up without one if the scalar cycle is dead. */
5495 FOR_EACH_VEC_ELT (phis, i, exit_phi)
5497 if (outer_loop)
5499 stmt_vec_info exit_phi_vinfo
5500 = loop_vinfo->lookup_stmt (exit_phi);
5501 gphi *vect_phi;
5503 /* FORNOW. Currently not supporting the case that an inner-loop
5504 reduction is not used in the outer-loop (but only outside the
5505 outer-loop), unless it is double reduction. */
5506 gcc_assert ((STMT_VINFO_RELEVANT_P (exit_phi_vinfo)
5507 && !STMT_VINFO_LIVE_P (exit_phi_vinfo))
5508 || double_reduc);
5510 if (double_reduc)
5511 STMT_VINFO_VEC_STMT (exit_phi_vinfo) = inner_phi;
5512 else
5513 STMT_VINFO_VEC_STMT (exit_phi_vinfo) = epilog_stmt_info;
5514 if (!double_reduc
5515 || STMT_VINFO_DEF_TYPE (exit_phi_vinfo)
5516 != vect_double_reduction_def)
5517 continue;
5519 /* Handle double reduction:
5521 stmt1: s1 = phi <s0, s2> - double reduction phi (outer loop)
5522 stmt2: s3 = phi <s1, s4> - (regular) reduc phi (inner loop)
5523 stmt3: s4 = use (s3) - (regular) reduc stmt (inner loop)
5524 stmt4: s2 = phi <s4> - double reduction stmt (outer loop)
5526 At that point the regular reduction (stmt2 and stmt3) is
5527 already vectorized, as well as the exit phi node, stmt4.
5528 Here we vectorize the phi node of double reduction, stmt1, and
5529 update all relevant statements. */
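	      /* A minimal source-level sketch of such a double reduction
		 (array and bounds are illustrative only):

		   int sum = 0;
		   for (int i = 0; i < n; i++)      // outer loop: stmt1, stmt4
		     for (int j = 0; j < m; j++)    // inner loop: stmt2, stmt3
		       sum += a[i][j];

		 SUM is the double reduction variable: the inner loop performs
		 a regular reduction whose result is carried around the outer
		 loop.  */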
5531 /* Go through all the uses of s2 to find double reduction phi
5532 node, i.e., stmt1 above. */
5533 orig_name = PHI_RESULT (exit_phi);
5534 FOR_EACH_IMM_USE_STMT (use_stmt, imm_iter, orig_name)
5536 stmt_vec_info use_stmt_vinfo;
5537 tree vect_phi_init, preheader_arg, vect_phi_res;
5538 basic_block bb = gimple_bb (use_stmt);
5540 /* Check that USE_STMT is really double reduction phi
5541 node. */
5542 if (gimple_code (use_stmt) != GIMPLE_PHI
5543 || gimple_phi_num_args (use_stmt) != 2
5544 || bb->loop_father != outer_loop)
5545 continue;
5546 use_stmt_vinfo = loop_vinfo->lookup_stmt (use_stmt);
5547 if (!use_stmt_vinfo
5548 || STMT_VINFO_DEF_TYPE (use_stmt_vinfo)
5549 != vect_double_reduction_def)
5550 continue;
5552 /* Create vector phi node for double reduction:
5553 vs1 = phi <vs0, vs2>
5554 vs1 was created previously in this function by a call to
5555 vect_get_vec_def_for_operand and is stored in
5556 vec_initial_def;
5557 vs2 is defined by INNER_PHI, the vectorized EXIT_PHI;
5558 vs0 is created here. */
5560 /* Create vector phi node. */
5561 vect_phi = create_phi_node (vec_initial_def, bb);
5562 loop_vec_info_for_loop (outer_loop)->add_stmt (vect_phi);
5564 /* Create vs0 - initial def of the double reduction phi. */
5565 preheader_arg = PHI_ARG_DEF_FROM_EDGE (use_stmt,
5566 loop_preheader_edge (outer_loop));
5567 vect_phi_init = get_initial_def_for_reduction
5568 (stmt_info, preheader_arg, NULL);
5570 /* Update phi node arguments with vs0 and vs2. */
5571 add_phi_arg (vect_phi, vect_phi_init,
5572 loop_preheader_edge (outer_loop),
5573 UNKNOWN_LOCATION);
5574 add_phi_arg (vect_phi, PHI_RESULT (inner_phi->stmt),
5575 loop_latch_edge (outer_loop), UNKNOWN_LOCATION);
5576 if (dump_enabled_p ())
5577 dump_printf_loc (MSG_NOTE, vect_location,
5578 "created double reduction phi node: %G",
5579 vect_phi);
5581 vect_phi_res = PHI_RESULT (vect_phi);
5583 /* Replace the use, i.e., set the correct vs1 in the regular
5584 reduction phi node. FORNOW, NCOPIES is always 1, so the
5585 loop is redundant. */
5586 stmt_vec_info use_info = reduction_phi_info;
5587 for (j = 0; j < ncopies; j++)
5589 edge pr_edge = loop_preheader_edge (loop);
5590 SET_PHI_ARG_DEF (as_a <gphi *> (use_info->stmt),
5591 pr_edge->dest_idx, vect_phi_res);
5592 use_info = STMT_VINFO_RELATED_STMT (use_info);
5598 phis.release ();
5599 if (nested_in_vect_loop)
5601 if (double_reduc)
5602 loop = outer_loop;
5603 else
5604 continue;
5607 phis.create (3);
5608 /* Find the loop-closed-use at the loop exit of the original scalar
5609 result. (The reduction result is expected to have two immediate uses,
5610 one at the latch block, and one at the loop exit). For double
5611 reductions we are looking for exit phis of the outer loop. */
5612 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, scalar_dest)
5614 if (!flow_bb_inside_loop_p (loop, gimple_bb (USE_STMT (use_p))))
5616 if (!is_gimple_debug (USE_STMT (use_p)))
5617 phis.safe_push (USE_STMT (use_p));
5619 else
5621 if (double_reduc && gimple_code (USE_STMT (use_p)) == GIMPLE_PHI)
5623 tree phi_res = PHI_RESULT (USE_STMT (use_p));
5625 FOR_EACH_IMM_USE_FAST (phi_use_p, phi_imm_iter, phi_res)
5627 if (!flow_bb_inside_loop_p (loop,
5628 gimple_bb (USE_STMT (phi_use_p)))
5629 && !is_gimple_debug (USE_STMT (phi_use_p)))
5630 phis.safe_push (USE_STMT (phi_use_p));
5636 FOR_EACH_VEC_ELT (phis, i, exit_phi)
5638 /* Replace the uses: */
5639 orig_name = PHI_RESULT (exit_phi);
5640 scalar_result = scalar_results[k];
5641 FOR_EACH_IMM_USE_STMT (use_stmt, imm_iter, orig_name)
5642 FOR_EACH_IMM_USE_ON_STMT (use_p, imm_iter)
5643 SET_USE (use_p, scalar_result);
5646 phis.release ();
5650 /* Return a vector of type VECTYPE that is equal to the vector select
5651 operation "MASK ? VEC : IDENTITY". Insert the select statements
5652 before GSI. */
5654 static tree
5655 merge_with_identity (gimple_stmt_iterator *gsi, tree mask, tree vectype,
5656 tree vec, tree identity)
5658 tree cond = make_temp_ssa_name (vectype, NULL, "cond");
5659 gimple *new_stmt = gimple_build_assign (cond, VEC_COND_EXPR,
5660 mask, vec, identity);
5661 gsi_insert_before (gsi, new_stmt, GSI_SAME_STMT);
5662 return cond;
5665 /* Successively apply CODE to each element of VECTOR_RHS, in left-to-right
5666 order, starting with LHS. Insert the extraction statements before GSI and
5667 associate the new scalar SSA names with variable SCALAR_DEST.
5668 Return the SSA name for the result. */
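   /* For illustration, assuming a 4-element VECTOR_RHS and PLUS_EXPR, the
      loop below emits the strictly left-to-right chain

	s0 = lhs + rhs[0];
	s1 = s0  + rhs[1];
	s2 = s1  + rhs[2];
	s3 = s2  + rhs[3];

      and returns s3; the scalar operations are never reassociated.  */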
5670 static tree
5671 vect_expand_fold_left (gimple_stmt_iterator *gsi, tree scalar_dest,
5672 tree_code code, tree lhs, tree vector_rhs)
5674 tree vectype = TREE_TYPE (vector_rhs);
5675 tree scalar_type = TREE_TYPE (vectype);
5676 tree bitsize = TYPE_SIZE (scalar_type);
5677 unsigned HOST_WIDE_INT vec_size_in_bits = tree_to_uhwi (TYPE_SIZE (vectype));
5678 unsigned HOST_WIDE_INT element_bitsize = tree_to_uhwi (bitsize);
5680 for (unsigned HOST_WIDE_INT bit_offset = 0;
5681 bit_offset < vec_size_in_bits;
5682 bit_offset += element_bitsize)
5684 tree bitpos = bitsize_int (bit_offset);
5685 tree rhs = build3 (BIT_FIELD_REF, scalar_type, vector_rhs,
5686 bitsize, bitpos);
5688 gassign *stmt = gimple_build_assign (scalar_dest, rhs);
5689 rhs = make_ssa_name (scalar_dest, stmt);
5690 gimple_assign_set_lhs (stmt, rhs);
5691 gsi_insert_before (gsi, stmt, GSI_SAME_STMT);
5693 stmt = gimple_build_assign (scalar_dest, code, lhs, rhs);
5694 tree new_name = make_ssa_name (scalar_dest, stmt);
5695 gimple_assign_set_lhs (stmt, new_name);
5696 gsi_insert_before (gsi, stmt, GSI_SAME_STMT);
5697 lhs = new_name;
5699 return lhs;
5702 /* Perform an in-order reduction (FOLD_LEFT_REDUCTION). STMT_INFO is the
5703 statement that sets the live-out value. REDUC_DEF_STMT is the phi
5704 statement. CODE is the operation performed by STMT_INFO and OPS are
5705 its scalar operands. REDUC_INDEX is the index of the operand in
5706 OPS that is set by REDUC_DEF_STMT. REDUC_FN is the function that
5707 implements in-order reduction, or IFN_LAST if we should open-code it.
5708 VECTYPE_IN is the type of the vector input. MASKS specifies the masks
5709 that should be used to control the operation in a fully-masked loop. */
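   /* Typical source form that reaches this path, for illustration: a
      floating-point accumulation that may not be reassociated, e.g.

	double s = 0.0;
	for (int i = 0; i < n; i++)
	  s += a[i];

      compiled without -fassociative-math, so the additions must be
      performed in their original left-to-right order.  */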
5711 static bool
5712 vectorize_fold_left_reduction (stmt_vec_info stmt_info,
5713 gimple_stmt_iterator *gsi,
5714 stmt_vec_info *vec_stmt, slp_tree slp_node,
5715 gimple *reduc_def_stmt,
5716 tree_code code, internal_fn reduc_fn,
5717 tree ops[3], tree vectype_in,
5718 int reduc_index, vec_loop_masks *masks)
5720 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
5721 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
5722 tree vectype_out = STMT_VINFO_VECTYPE (stmt_info);
5723 stmt_vec_info new_stmt_info = NULL;
5725 int ncopies;
5726 if (slp_node)
5727 ncopies = 1;
5728 else
5729 ncopies = vect_get_num_copies (loop_vinfo, vectype_in);
5731 gcc_assert (!nested_in_vect_loop_p (loop, stmt_info));
5732 gcc_assert (ncopies == 1);
5733 gcc_assert (TREE_CODE_LENGTH (code) == binary_op);
5734 gcc_assert (reduc_index == (code == MINUS_EXPR ? 0 : 1));
5735 gcc_assert (STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info)
5736 == FOLD_LEFT_REDUCTION);
5738 if (slp_node)
5739 gcc_assert (known_eq (TYPE_VECTOR_SUBPARTS (vectype_out),
5740 TYPE_VECTOR_SUBPARTS (vectype_in)));
5742 tree op0 = ops[1 - reduc_index];
5744 int group_size = 1;
5745 stmt_vec_info scalar_dest_def_info;
5746 auto_vec<tree> vec_oprnds0;
5747 if (slp_node)
5749 vect_get_vec_defs (op0, NULL_TREE, stmt_info, &vec_oprnds0, NULL,
5750 slp_node);
5751 group_size = SLP_TREE_SCALAR_STMTS (slp_node).length ();
5752 scalar_dest_def_info = SLP_TREE_SCALAR_STMTS (slp_node)[group_size - 1];
5754 else
5756 tree loop_vec_def0 = vect_get_vec_def_for_operand (op0, stmt_info);
5757 vec_oprnds0.create (1);
5758 vec_oprnds0.quick_push (loop_vec_def0);
5759 scalar_dest_def_info = stmt_info;
5762 tree scalar_dest = gimple_assign_lhs (scalar_dest_def_info->stmt);
5763 tree scalar_type = TREE_TYPE (scalar_dest);
5764 tree reduc_var = gimple_phi_result (reduc_def_stmt);
5766 int vec_num = vec_oprnds0.length ();
5767 gcc_assert (vec_num == 1 || slp_node);
5768 tree vec_elem_type = TREE_TYPE (vectype_out);
5769 gcc_checking_assert (useless_type_conversion_p (scalar_type, vec_elem_type));
5771 tree vector_identity = NULL_TREE;
5772 if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
5773 vector_identity = build_zero_cst (vectype_out);
5775 tree scalar_dest_var = vect_create_destination_var (scalar_dest, NULL);
5776 int i;
5777 tree def0;
5778 FOR_EACH_VEC_ELT (vec_oprnds0, i, def0)
5780 gimple *new_stmt;
5781 tree mask = NULL_TREE;
5782 if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
5783 mask = vect_get_loop_mask (gsi, masks, vec_num, vectype_in, i);
5785 /* Handle MINUS by adding the negative. */
5786 if (reduc_fn != IFN_LAST && code == MINUS_EXPR)
5788 tree negated = make_ssa_name (vectype_out);
5789 new_stmt = gimple_build_assign (negated, NEGATE_EXPR, def0);
5790 gsi_insert_before (gsi, new_stmt, GSI_SAME_STMT);
5791 def0 = negated;
5794 if (mask)
5795 def0 = merge_with_identity (gsi, mask, vectype_out, def0,
5796 vector_identity);
5798 /* On the first iteration the input is simply the scalar phi
5799 result, and for subsequent iterations it is the output of
5800 the preceding operation. */
5801 if (reduc_fn != IFN_LAST)
5803 new_stmt = gimple_build_call_internal (reduc_fn, 2, reduc_var, def0);
5804 /* For chained SLP reductions the output of the previous reduction
5805 operation serves as the input of the next. For the final statement
5806 the output cannot be a temporary - we reuse the original
5807 scalar destination of the last statement. */
5808 if (i != vec_num - 1)
5810 gimple_set_lhs (new_stmt, scalar_dest_var);
5811 reduc_var = make_ssa_name (scalar_dest_var, new_stmt);
5812 gimple_set_lhs (new_stmt, reduc_var);
5815 else
5817 reduc_var = vect_expand_fold_left (gsi, scalar_dest_var, code,
5818 reduc_var, def0);
5819 new_stmt = SSA_NAME_DEF_STMT (reduc_var);
5820 /* Remove the statement, so that we can use the same code paths
5821 as for statements that we've just created. */
5822 gimple_stmt_iterator tmp_gsi = gsi_for_stmt (new_stmt);
5823 gsi_remove (&tmp_gsi, true);
5826 if (i == vec_num - 1)
5828 gimple_set_lhs (new_stmt, scalar_dest);
5829 new_stmt_info = vect_finish_replace_stmt (scalar_dest_def_info,
5830 new_stmt);
5832 else
5833 new_stmt_info = vect_finish_stmt_generation (scalar_dest_def_info,
5834 new_stmt, gsi);
5836 if (slp_node)
5837 SLP_TREE_VEC_STMTS (slp_node).quick_push (new_stmt_info);
5840 if (!slp_node)
5841 STMT_VINFO_VEC_STMT (stmt_info) = *vec_stmt = new_stmt_info;
5843 return true;
5846 /* Function is_nonwrapping_integer_induction.
5848 Check if STMT_VINFO (which is part of loop LOOP) both increments and
5849 does not cause overflow. */
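   /* Roughly, the check amounts to verifying, in infinite precision, that

	base + step * max_niters

      still fits in the precision of the phi result type, unless that type
      already has undefined behavior on overflow.  */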
5851 static bool
5852 is_nonwrapping_integer_induction (stmt_vec_info stmt_vinfo, struct loop *loop)
5854 gphi *phi = as_a <gphi *> (stmt_vinfo->stmt);
5855 tree base = STMT_VINFO_LOOP_PHI_EVOLUTION_BASE_UNCHANGED (stmt_vinfo);
5856 tree step = STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_vinfo);
5857 tree lhs_type = TREE_TYPE (gimple_phi_result (phi));
5858 widest_int ni, max_loop_value, lhs_max;
5859 wi::overflow_type overflow = wi::OVF_NONE;
5861 /* Make sure the loop is integer based. */
5862 if (TREE_CODE (base) != INTEGER_CST
5863 || TREE_CODE (step) != INTEGER_CST)
5864 return false;
5866 /* Check that the max size of the loop will not wrap. */
5868 if (TYPE_OVERFLOW_UNDEFINED (lhs_type))
5869 return true;
5871 if (! max_stmt_executions (loop, &ni))
5872 return false;
5874 max_loop_value = wi::mul (wi::to_widest (step), ni, TYPE_SIGN (lhs_type),
5875 &overflow);
5876 if (overflow)
5877 return false;
5879 max_loop_value = wi::add (wi::to_widest (base), max_loop_value,
5880 TYPE_SIGN (lhs_type), &overflow);
5881 if (overflow)
5882 return false;
5884 return (wi::min_precision (max_loop_value, TYPE_SIGN (lhs_type))
5885 <= TYPE_PRECISION (lhs_type));
5888 /* Function vectorizable_reduction.
5890 Check if STMT_INFO performs a reduction operation that can be vectorized.
5891 If VEC_STMT is also passed, vectorize STMT_INFO: create a vectorized
5892 stmt to replace it, put it in VEC_STMT, and insert it at GSI.
5893 Return true if STMT_INFO is vectorizable in this way.
5895 This function also handles reduction idioms (patterns) that have been
5896 recognized in advance during vect_pattern_recog. In this case, STMT_INFO
5897 may be of this form:
5898 X = pattern_expr (arg0, arg1, ..., X)
5899 and its STMT_VINFO_RELATED_STMT points to the last stmt in the original
5900 sequence that had been detected and replaced by the pattern-stmt
5901 (STMT_INFO).
5903 This function also handles reduction of condition expressions, for example:
5904 for (int i = 0; i < N; i++)
5905 if (a[i] < value)
5906 last = a[i];
5907 This is handled by vectorising the loop and creating an additional vector
5908 containing the loop indexes for which "a[i] < value" was true. In the
5909 function epilogue this is reduced to a single max value and then used to
5910 index into the vector of results.
5912 In some cases of reduction patterns, the type of the reduction variable X is
5913 different than the type of the other arguments of STMT_INFO.
5914 In such cases, the vectype that is used when transforming STMT_INFO into
5915 a vector stmt is different than the vectype that is used to determine the
5916 vectorization factor, because it consists of a different number of elements
5917 than the actual number of elements that are being operated upon in parallel.
5919 For example, consider an accumulation of shorts into an int accumulator.
5920 On some targets it's possible to vectorize this pattern operating on 8
5921 shorts at a time (hence, the vectype for purposes of determining the
5922 vectorization factor should be V8HI); on the other hand, the vectype that
5923 is used to create the vector form is actually V4SI (the type of the result).
5925 Upon entry to this function, STMT_VINFO_VECTYPE records the vectype that
5926 indicates what is the actual level of parallelism (V8HI in the example), so
5927 that the right vectorization factor would be derived. This vectype
5928 corresponds to the type of arguments to the reduction stmt, and should *NOT*
5929 be used to create the vectorized stmt. The right vectype for the vectorized
5930 stmt is obtained from the type of the result X:
5931 get_vectype_for_scalar_type (TREE_TYPE (X))
5933 This means that, contrary to "regular" reductions (or "regular" stmts in
5934 general), the following equation:
5935 STMT_VINFO_VECTYPE == get_vectype_for_scalar_type (TREE_TYPE (X))
5936 does *NOT* necessarily hold for reduction patterns. */
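 /* A source-level sketch of the shorts-into-int accumulation discussed
    above, for illustration:

      short a[N];
      int acc = 0;
      for (int i = 0; i < N; i++)
	acc += a[i];

    The pattern recognizer may rewrite the accumulation as a widen_sum;
    the vectorization factor is then derived from the V8HI operand type
    while the vector statement itself uses the V4SI result type.  */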
5938 bool
5939 vectorizable_reduction (stmt_vec_info stmt_info, gimple_stmt_iterator *gsi,
5940 stmt_vec_info *vec_stmt, slp_tree slp_node,
5941 slp_instance slp_node_instance,
5942 stmt_vector_for_cost *cost_vec)
5944 tree vec_dest;
5945 tree scalar_dest;
5946 tree vectype_out = STMT_VINFO_VECTYPE (stmt_info);
5947 tree vectype_in = NULL_TREE;
5948 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
5949 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
5950 enum tree_code code, orig_code;
5951 internal_fn reduc_fn;
5952 machine_mode vec_mode;
5953 int op_type;
5954 optab optab;
5955 tree new_temp = NULL_TREE;
5956 enum vect_def_type dt, cond_reduc_dt = vect_unknown_def_type;
5957 stmt_vec_info cond_stmt_vinfo = NULL;
5958 enum tree_code cond_reduc_op_code = ERROR_MARK;
5959 tree scalar_type;
5960 bool is_simple_use;
5961 int i;
5962 int ncopies;
5963 int epilog_copies;
5964 stmt_vec_info prev_stmt_info, prev_phi_info;
5965 bool single_defuse_cycle = false;
5966 stmt_vec_info new_stmt_info = NULL;
5967 int j;
5968 tree ops[3];
5969 enum vect_def_type dts[3];
5970 bool nested_cycle = false, found_nested_cycle_def = false;
5971 bool double_reduc = false;
5972 basic_block def_bb;
5973 struct loop * def_stmt_loop;
5974 tree def_arg;
5975 auto_vec<tree> vec_oprnds0;
5976 auto_vec<tree> vec_oprnds1;
5977 auto_vec<tree> vec_oprnds2;
5978 auto_vec<tree> vect_defs;
5979 auto_vec<stmt_vec_info> phis;
5980 int vec_num;
5981 tree def0, tem;
5982 tree cr_index_scalar_type = NULL_TREE, cr_index_vector_type = NULL_TREE;
5983 tree cond_reduc_val = NULL_TREE;
5985 /* Make sure it was already recognized as a reduction computation. */
5986 if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_reduction_def
5987 && STMT_VINFO_DEF_TYPE (stmt_info) != vect_nested_cycle)
5988 return false;
5990 if (nested_in_vect_loop_p (loop, stmt_info))
5992 loop = loop->inner;
5993 nested_cycle = true;
5996 if (REDUC_GROUP_FIRST_ELEMENT (stmt_info))
5997 gcc_assert (slp_node
5998 && REDUC_GROUP_FIRST_ELEMENT (stmt_info) == stmt_info);
6000 if (gphi *phi = dyn_cast <gphi *> (stmt_info->stmt))
6002 tree phi_result = gimple_phi_result (phi);
6003 /* Analysis is fully done on the reduction stmt invocation. */
6004 if (! vec_stmt)
6006 if (slp_node)
6007 slp_node_instance->reduc_phis = slp_node;
6009 STMT_VINFO_TYPE (stmt_info) = reduc_vec_info_type;
6010 return true;
6013 if (STMT_VINFO_REDUC_TYPE (stmt_info) == FOLD_LEFT_REDUCTION)
6014 /* Leave the scalar phi in place. Note that checking
6015 STMT_VINFO_VEC_REDUCTION_TYPE (as below) only works
6016 for reductions involving a single statement. */
6017 return true;
6019 stmt_vec_info reduc_stmt_info = STMT_VINFO_REDUC_DEF (stmt_info);
6020 reduc_stmt_info = vect_stmt_to_vectorize (reduc_stmt_info);
6022 if (STMT_VINFO_VEC_REDUCTION_TYPE (reduc_stmt_info)
6023 == EXTRACT_LAST_REDUCTION)
6024 /* Leave the scalar phi in place. */
6025 return true;
6027 gassign *reduc_stmt = as_a <gassign *> (reduc_stmt_info->stmt);
6028 code = gimple_assign_rhs_code (reduc_stmt);
6029 for (unsigned k = 1; k < gimple_num_ops (reduc_stmt); ++k)
6031 tree op = gimple_op (reduc_stmt, k);
6032 if (op == phi_result)
6033 continue;
6034 if (k == 1 && code == COND_EXPR)
6035 continue;
6036 bool is_simple_use = vect_is_simple_use (op, loop_vinfo, &dt);
6037 gcc_assert (is_simple_use);
6038 if (dt == vect_constant_def || dt == vect_external_def)
6039 continue;
6040 if (!vectype_in
6041 || (GET_MODE_SIZE (SCALAR_TYPE_MODE (TREE_TYPE (vectype_in)))
6042 < GET_MODE_SIZE (SCALAR_TYPE_MODE (TREE_TYPE (op)))))
6043 vectype_in = get_vectype_for_scalar_type (TREE_TYPE (op));
6044 break;
6046 /* For a nested cycle we might end up with an operation like
6047 phi_result * phi_result. */
6048 if (!vectype_in)
6049 vectype_in = STMT_VINFO_VECTYPE (stmt_info);
6050 gcc_assert (vectype_in);
6052 if (slp_node)
6053 ncopies = 1;
6054 else
6055 ncopies = vect_get_num_copies (loop_vinfo, vectype_in);
6057 stmt_vec_info use_stmt_info;
6058 if (ncopies > 1
6059 && STMT_VINFO_RELEVANT (reduc_stmt_info) <= vect_used_only_live
6060 && (use_stmt_info = loop_vinfo->lookup_single_use (phi_result))
6061 && vect_stmt_to_vectorize (use_stmt_info) == reduc_stmt_info)
6062 single_defuse_cycle = true;
6064 /* Create the destination vector */
6065 scalar_dest = gimple_assign_lhs (reduc_stmt);
6066 vec_dest = vect_create_destination_var (scalar_dest, vectype_out);
6068 if (slp_node)
6069 /* The size vect_schedule_slp_instance computes is off for us. */
6070 vec_num = vect_get_num_vectors
6071 (LOOP_VINFO_VECT_FACTOR (loop_vinfo)
6072 * SLP_TREE_SCALAR_STMTS (slp_node).length (),
6073 vectype_in);
6074 else
6075 vec_num = 1;
6077 /* Generate the reduction PHIs upfront. */
6078 prev_phi_info = NULL;
6079 for (j = 0; j < ncopies; j++)
6081 if (j == 0 || !single_defuse_cycle)
6083 for (i = 0; i < vec_num; i++)
6085 /* Create the reduction-phi that defines the reduction
6086 operand. */
6087 gimple *new_phi = create_phi_node (vec_dest, loop->header);
6088 stmt_vec_info new_phi_info = loop_vinfo->add_stmt (new_phi);
6090 if (slp_node)
6091 SLP_TREE_VEC_STMTS (slp_node).quick_push (new_phi_info);
6092 else
6094 if (j == 0)
6095 STMT_VINFO_VEC_STMT (stmt_info)
6096 = *vec_stmt = new_phi_info;
6097 else
6098 STMT_VINFO_RELATED_STMT (prev_phi_info) = new_phi_info;
6099 prev_phi_info = new_phi_info;
6105 return true;
6108 /* 1. Is vectorizable reduction? */
6109 /* Not supportable if the reduction variable is used in the loop, unless
6110 it's a reduction chain. */
6111 if (STMT_VINFO_RELEVANT (stmt_info) > vect_used_in_outer
6112 && !REDUC_GROUP_FIRST_ELEMENT (stmt_info))
6113 return false;
6115 /* Reductions that are not used even in an enclosing outer-loop,
6116 are expected to be "live" (used out of the loop). */
6117 if (STMT_VINFO_RELEVANT (stmt_info) == vect_unused_in_scope
6118 && !STMT_VINFO_LIVE_P (stmt_info))
6119 return false;
6121 /* 2. Has this been recognized as a reduction pattern?
6123 Check if STMT represents a pattern that has been recognized
6124 in earlier analysis stages. For stmts that represent a pattern,
6125 the STMT_VINFO_RELATED_STMT field records the last stmt in
6126 the original sequence that constitutes the pattern. */
6128 stmt_vec_info orig_stmt_info = STMT_VINFO_RELATED_STMT (stmt_info);
6129 if (orig_stmt_info)
6131 gcc_assert (STMT_VINFO_IN_PATTERN_P (orig_stmt_info));
6132 gcc_assert (!STMT_VINFO_IN_PATTERN_P (stmt_info));
6135 /* 3. Check the operands of the operation. The first operands are defined
6136 inside the loop body. The last operand is the reduction variable,
6137 which is defined by the loop-header-phi. */
6139 gassign *stmt = as_a <gassign *> (stmt_info->stmt);
6141 /* Flatten RHS. */
6142 switch (get_gimple_rhs_class (gimple_assign_rhs_code (stmt)))
6144 case GIMPLE_BINARY_RHS:
6145 code = gimple_assign_rhs_code (stmt);
6146 op_type = TREE_CODE_LENGTH (code);
6147 gcc_assert (op_type == binary_op);
6148 ops[0] = gimple_assign_rhs1 (stmt);
6149 ops[1] = gimple_assign_rhs2 (stmt);
6150 break;
6152 case GIMPLE_TERNARY_RHS:
6153 code = gimple_assign_rhs_code (stmt);
6154 op_type = TREE_CODE_LENGTH (code);
6155 gcc_assert (op_type == ternary_op);
6156 ops[0] = gimple_assign_rhs1 (stmt);
6157 ops[1] = gimple_assign_rhs2 (stmt);
6158 ops[2] = gimple_assign_rhs3 (stmt);
6159 break;
6161 case GIMPLE_UNARY_RHS:
6162 return false;
6164 default:
6165 gcc_unreachable ();
6168 if (code == COND_EXPR && slp_node)
6169 return false;
6171 scalar_dest = gimple_assign_lhs (stmt);
6172 scalar_type = TREE_TYPE (scalar_dest);
6173 if (!POINTER_TYPE_P (scalar_type) && !INTEGRAL_TYPE_P (scalar_type)
6174 && !SCALAR_FLOAT_TYPE_P (scalar_type))
6175 return false;
6177 /* Do not try to vectorize bit-precision reductions. */
6178 if (!type_has_mode_precision_p (scalar_type))
6179 return false;
6181 /* All uses but the last are expected to be defined in the loop.
6182 The last use is the reduction variable. In case of nested cycle this
6183 assumption is not true: we use reduc_index to record the index of the
6184 reduction variable. */
6185 stmt_vec_info reduc_def_info;
6186 if (orig_stmt_info)
6187 reduc_def_info = STMT_VINFO_REDUC_DEF (orig_stmt_info);
6188 else
6189 reduc_def_info = STMT_VINFO_REDUC_DEF (stmt_info);
6190 gcc_assert (reduc_def_info);
6191 gphi *reduc_def_phi = as_a <gphi *> (reduc_def_info->stmt);
6192 tree reduc_def = PHI_RESULT (reduc_def_phi);
6193 int reduc_index = -1;
6194 for (i = 0; i < op_type; i++)
6196 /* The condition of COND_EXPR is checked in vectorizable_condition(). */
6197 if (i == 0 && code == COND_EXPR)
6198 continue;
6200 stmt_vec_info def_stmt_info;
6201 is_simple_use = vect_is_simple_use (ops[i], loop_vinfo, &dts[i], &tem,
6202 &def_stmt_info);
6203 dt = dts[i];
6204 gcc_assert (is_simple_use);
6205 if (dt == vect_reduction_def
6206 && ops[i] == reduc_def)
6208 reduc_index = i;
6209 continue;
6211 else if (tem)
6213 /* To properly compute ncopies we are interested in the widest
6214 input type in case we're looking at a widening accumulation. */
6215 if (!vectype_in
6216 || (GET_MODE_SIZE (SCALAR_TYPE_MODE (TREE_TYPE (vectype_in)))
6217 < GET_MODE_SIZE (SCALAR_TYPE_MODE (TREE_TYPE (tem)))))
6218 vectype_in = tem;
6221 if (dt != vect_internal_def
6222 && dt != vect_external_def
6223 && dt != vect_constant_def
6224 && dt != vect_induction_def
6225 && !(dt == vect_nested_cycle && nested_cycle))
6226 return false;
6228 if (dt == vect_nested_cycle
6229 && ops[i] == reduc_def)
6231 found_nested_cycle_def = true;
6232 reduc_index = i;
6235 if (i == 1 && code == COND_EXPR)
6237 /* Record how value of COND_EXPR is defined. */
6238 if (dt == vect_constant_def)
6240 cond_reduc_dt = dt;
6241 cond_reduc_val = ops[i];
6243 if (dt == vect_induction_def
6244 && def_stmt_info
6245 && is_nonwrapping_integer_induction (def_stmt_info, loop))
6247 cond_reduc_dt = dt;
6248 cond_stmt_vinfo = def_stmt_info;
6253 if (!vectype_in)
6254 vectype_in = vectype_out;
6256 /* When vectorizing a reduction chain w/o SLP the reduction PHI is not
6257 directly used in stmt. */
6258 if (reduc_index == -1)
6260 if (STMT_VINFO_REDUC_TYPE (stmt_info) == FOLD_LEFT_REDUCTION)
6262 if (dump_enabled_p ())
6263 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6264 "in-order reduction chain without SLP.\n");
6265 return false;
6269 if (!(reduc_index == -1
6270 || dts[reduc_index] == vect_reduction_def
6271 || dts[reduc_index] == vect_nested_cycle
6272 || ((dts[reduc_index] == vect_internal_def
6273 || dts[reduc_index] == vect_external_def
6274 || dts[reduc_index] == vect_constant_def
6275 || dts[reduc_index] == vect_induction_def)
6276 && nested_cycle && found_nested_cycle_def)))
6278 /* For pattern recognized stmts, orig_stmt might be a reduction,
6279 but some helper statements for the pattern might not, or
6280 might be COND_EXPRs with reduction uses in the condition. */
6281 gcc_assert (orig_stmt_info);
6282 return false;
6285 /* PHIs should not participate in patterns. */
6286 gcc_assert (!STMT_VINFO_RELATED_STMT (reduc_def_info));
6287 enum vect_reduction_type v_reduc_type
6288 = STMT_VINFO_REDUC_TYPE (reduc_def_info);
6289 stmt_vec_info tmp = STMT_VINFO_REDUC_DEF (reduc_def_info);
6291 STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info) = v_reduc_type;
6292 /* If we have a condition reduction, see if we can simplify it further. */
6293 if (v_reduc_type == COND_REDUCTION)
6295 /* TODO: We can't yet handle reduction chains, since we need to treat
6296 each COND_EXPR in the chain specially, not just the last one.
6297 E.g. for:
6299 x_1 = PHI <x_3, ...>
6300 x_2 = a_2 ? ... : x_1;
6301 x_3 = a_3 ? ... : x_2;
6303 we're interested in the last element in x_3 for which a_2 || a_3
6304 is true, whereas the current reduction chain handling would
6305 vectorize x_2 as a normal VEC_COND_EXPR and only treat x_3
6306 as a reduction operation. */
6307 if (reduc_index == -1)
6309 if (dump_enabled_p ())
6310 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6311 "conditional reduction chains not supported\n");
6312 return false;
6315 /* vect_is_simple_reduction ensured that operand 2 is the
6316 loop-carried operand. */
6317 gcc_assert (reduc_index == 2);
6319 /* Loop peeling modifies initial value of reduction PHI, which
6320 makes the reduction stmt to be transformed different to the
6321 original stmt analyzed. We need to record reduction code for
6322 CONST_COND_REDUCTION type reduction at analyzing stage, thus
6323 it can be used directly at transform stage. */
6324 if (STMT_VINFO_VEC_CONST_COND_REDUC_CODE (stmt_info) == MAX_EXPR
6325 || STMT_VINFO_VEC_CONST_COND_REDUC_CODE (stmt_info) == MIN_EXPR)
6327 /* Also set the reduction type to CONST_COND_REDUCTION. */
6328 gcc_assert (cond_reduc_dt == vect_constant_def);
6329 STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info) = CONST_COND_REDUCTION;
6331 else if (direct_internal_fn_supported_p (IFN_FOLD_EXTRACT_LAST,
6332 vectype_in, OPTIMIZE_FOR_SPEED))
6334 if (dump_enabled_p ())
6335 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6336 "optimizing condition reduction with"
6337 " FOLD_EXTRACT_LAST.\n");
6338 STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info) = EXTRACT_LAST_REDUCTION;
6340 else if (cond_reduc_dt == vect_induction_def)
6342 tree base
6343 = STMT_VINFO_LOOP_PHI_EVOLUTION_BASE_UNCHANGED (cond_stmt_vinfo);
6344 tree step = STMT_VINFO_LOOP_PHI_EVOLUTION_PART (cond_stmt_vinfo);
6346 gcc_assert (TREE_CODE (base) == INTEGER_CST
6347 && TREE_CODE (step) == INTEGER_CST);
6348 cond_reduc_val = NULL_TREE;
6349 /* Find a suitable value, for MAX_EXPR below base, for MIN_EXPR
6350 above base; punt if base is the minimum value of the type for
6351 MAX_EXPR or maximum value of the type for MIN_EXPR for now. */
6352 if (tree_int_cst_sgn (step) == -1)
6354 cond_reduc_op_code = MIN_EXPR;
6355 if (tree_int_cst_sgn (base) == -1)
6356 cond_reduc_val = build_int_cst (TREE_TYPE (base), 0);
6357 else if (tree_int_cst_lt (base,
6358 TYPE_MAX_VALUE (TREE_TYPE (base))))
6359 cond_reduc_val
6360 = int_const_binop (PLUS_EXPR, base, integer_one_node);
6362 else
6364 cond_reduc_op_code = MAX_EXPR;
6365 if (tree_int_cst_sgn (base) == 1)
6366 cond_reduc_val = build_int_cst (TREE_TYPE (base), 0);
6367 else if (tree_int_cst_lt (TYPE_MIN_VALUE (TREE_TYPE (base)),
6368 base))
6369 cond_reduc_val
6370 = int_const_binop (MINUS_EXPR, base, integer_one_node);
6372 if (cond_reduc_val)
6374 if (dump_enabled_p ())
6375 dump_printf_loc (MSG_NOTE, vect_location,
6376 "condition expression based on "
6377 "integer induction.\n");
6378 STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info)
6379 = INTEGER_INDUC_COND_REDUCTION;
6382 else if (cond_reduc_dt == vect_constant_def)
6384 enum vect_def_type cond_initial_dt;
6385 gimple *def_stmt = SSA_NAME_DEF_STMT (ops[reduc_index]);
6386 tree cond_initial_val
6387 = PHI_ARG_DEF_FROM_EDGE (def_stmt, loop_preheader_edge (loop));
6389 gcc_assert (cond_reduc_val != NULL_TREE);
6390 vect_is_simple_use (cond_initial_val, loop_vinfo, &cond_initial_dt);
6391 if (cond_initial_dt == vect_constant_def
6392 && types_compatible_p (TREE_TYPE (cond_initial_val),
6393 TREE_TYPE (cond_reduc_val)))
6395 tree e = fold_binary (LE_EXPR, boolean_type_node,
6396 cond_initial_val, cond_reduc_val);
6397 if (e && (integer_onep (e) || integer_zerop (e)))
6399 if (dump_enabled_p ())
6400 dump_printf_loc (MSG_NOTE, vect_location,
6401 "condition expression based on "
6402 "compile time constant.\n");
6403 /* Record reduction code at analysis stage. */
6404 STMT_VINFO_VEC_CONST_COND_REDUC_CODE (stmt_info)
6405 = integer_onep (e) ? MAX_EXPR : MIN_EXPR;
6406 STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info)
6407 = CONST_COND_REDUCTION;
6413 if (orig_stmt_info)
6414 gcc_assert (tmp == orig_stmt_info
6415 || REDUC_GROUP_FIRST_ELEMENT (tmp) == orig_stmt_info);
6416 else
6417 /* We changed STMT to be the first stmt in reduction chain, hence we
6418 check that in this case the first element in the chain is STMT. */
6419 gcc_assert (tmp == stmt_info
6420 || REDUC_GROUP_FIRST_ELEMENT (tmp) == stmt_info);
6422 if (STMT_VINFO_LIVE_P (reduc_def_info))
6423 return false;
6425 if (slp_node)
6426 ncopies = 1;
6427 else
6428 ncopies = vect_get_num_copies (loop_vinfo, vectype_in);
6430 gcc_assert (ncopies >= 1);
6432 vec_mode = TYPE_MODE (vectype_in);
6433 poly_uint64 nunits_out = TYPE_VECTOR_SUBPARTS (vectype_out);
6435 if (nested_cycle)
6437 def_bb = gimple_bb (reduc_def_phi);
6438 def_stmt_loop = def_bb->loop_father;
6439 def_arg = PHI_ARG_DEF_FROM_EDGE (reduc_def_phi,
6440 loop_preheader_edge (def_stmt_loop));
6441 stmt_vec_info def_arg_stmt_info = loop_vinfo->lookup_def (def_arg);
6442 if (def_arg_stmt_info
6443 && (STMT_VINFO_DEF_TYPE (def_arg_stmt_info)
6444 == vect_double_reduction_def))
6445 double_reduc = true;
6448 vect_reduction_type reduction_type
6449 = STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info);
6450 if ((double_reduc || reduction_type != TREE_CODE_REDUCTION)
6451 && ncopies > 1)
6453 if (dump_enabled_p ())
6454 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6455 "multiple types in double reduction or condition "
6456 "reduction.\n");
6457 return false;
6460 if (code == COND_EXPR)
6462 /* Only call during the analysis stage, otherwise we'll lose
6463 STMT_VINFO_TYPE. */
6464 if (!vec_stmt && !vectorizable_condition (stmt_info, gsi, NULL,
6465 true, NULL, cost_vec))
6467 if (dump_enabled_p ())
6468 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6469 "unsupported condition in reduction\n");
6470 return false;
6473 else if (code == LSHIFT_EXPR || code == RSHIFT_EXPR
6474 || code == LROTATE_EXPR || code == RROTATE_EXPR)
6476 /* Only call during the analysis stage, otherwise we'll lose
6477 STMT_VINFO_TYPE. We only support this for nested cycles
6478 without double reductions at the moment. */
6479 if (!nested_cycle
6480 || double_reduc
6481 || (!vec_stmt && !vectorizable_shift (stmt_info, gsi, NULL,
6482 NULL, cost_vec)))
6484 if (dump_enabled_p ())
6485 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6486 "unsupported shift or rotation in reduction\n");
6487 return false;
6490 else
6492 /* 4. Supportable by target? */
6494 /* 4.1. check support for the operation in the loop */
6495 optab = optab_for_tree_code (code, vectype_in, optab_default);
6496 if (!optab)
6498 if (dump_enabled_p ())
6499 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6500 "no optab.\n");
6502 return false;
6505 if (optab_handler (optab, vec_mode) == CODE_FOR_nothing)
6507 if (dump_enabled_p ())
6508 dump_printf (MSG_NOTE, "op not supported by target.\n");
6510 if (maybe_ne (GET_MODE_SIZE (vec_mode), UNITS_PER_WORD)
6511 || !vect_worthwhile_without_simd_p (loop_vinfo, code))
6512 return false;
6514 if (dump_enabled_p ())
6515 dump_printf (MSG_NOTE, "proceeding using word mode.\n");
6518 /* Worthwhile without SIMD support? */
6519 if (!VECTOR_MODE_P (TYPE_MODE (vectype_in))
6520 && !vect_worthwhile_without_simd_p (loop_vinfo, code))
6522 if (dump_enabled_p ())
6523 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6524 "not worthwhile without SIMD support.\n");
6526 return false;
6530 /* 4.2. Check support for the epilog operation.
6532 If STMT represents a reduction pattern, then the type of the
6533 reduction variable may be different than the type of the rest
6534 of the arguments. For example, consider the case of accumulation
6535 of shorts into an int accumulator; The original code:
6536 S1: int_a = (int) short_a;
6537 orig_stmt-> S2: int_acc = plus <int_a ,int_acc>;
6539 was replaced with:
6540 STMT: int_acc = widen_sum <short_a, int_acc>
6542 This means that:
6543 1. The tree-code that is used to create the vector operation in the
6544 epilog code (that reduces the partial results) is not the
6545 tree-code of STMT, but is rather the tree-code of the original
6546 stmt from the pattern that STMT is replacing. I.e, in the example
6547 above we want to use 'widen_sum' in the loop, but 'plus' in the
6548 epilog.
6549 2. The type (mode) we use to check available target support
6550 for the vector operation to be created in the *epilog*, is
6551 determined by the type of the reduction variable (in the example
6552 above we'd check this: optab_handler (plus_optab, vect_int_mode])).
6553 However the type (mode) we use to check available target support
6554 for the vector operation to be created *inside the loop*, is
6555 determined by the type of the other arguments to STMT (in the
6556 example we'd check this: optab_handler (widen_sum_optab,
6557 vect_short_mode)).
6559 This is contrary to "regular" reductions, in which the types of all
6560 the arguments are the same as the type of the reduction variable.
6561 For "regular" reductions we can therefore use the same vector type
6562 (and also the same tree-code) when generating the epilog code and
6563 when generating the code inside the loop. */
6565 if (orig_stmt_info
6566 && (reduction_type == TREE_CODE_REDUCTION
6567 || reduction_type == FOLD_LEFT_REDUCTION))
6569 /* This is a reduction pattern: get the vectype from the type of the
6570 reduction variable, and get the tree-code from orig_stmt. */
6571 orig_code = gimple_assign_rhs_code (orig_stmt_info->stmt);
6572 gcc_assert (vectype_out);
6573 vec_mode = TYPE_MODE (vectype_out);
6575 else
6577 /* Regular reduction: use the same vectype and tree-code as used for
6578 the vector code inside the loop can be used for the epilog code. */
6579 orig_code = code;
6581 if (code == MINUS_EXPR)
6582 orig_code = PLUS_EXPR;
6584 /* For simple condition reductions, replace with the actual expression
6585 we want to base our reduction around. */
6586 if (reduction_type == CONST_COND_REDUCTION)
6588 orig_code = STMT_VINFO_VEC_CONST_COND_REDUC_CODE (stmt_info);
6589 gcc_assert (orig_code == MAX_EXPR || orig_code == MIN_EXPR);
6591 else if (reduction_type == INTEGER_INDUC_COND_REDUCTION)
6592 orig_code = cond_reduc_op_code;
6595 reduc_fn = IFN_LAST;
6597 if (reduction_type == TREE_CODE_REDUCTION
6598 || reduction_type == FOLD_LEFT_REDUCTION
6599 || reduction_type == INTEGER_INDUC_COND_REDUCTION
6600 || reduction_type == CONST_COND_REDUCTION)
6602 if (reduction_type == FOLD_LEFT_REDUCTION
6603 ? fold_left_reduction_fn (orig_code, &reduc_fn)
6604 : reduction_fn_for_scalar_code (orig_code, &reduc_fn))
6606 if (reduc_fn != IFN_LAST
6607 && !direct_internal_fn_supported_p (reduc_fn, vectype_out,
6608 OPTIMIZE_FOR_SPEED))
6610 if (dump_enabled_p ())
6611 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6612 "reduc op not supported by target.\n");
6614 reduc_fn = IFN_LAST;
6617 else
6619 if (!nested_cycle || double_reduc)
6621 if (dump_enabled_p ())
6622 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6623 "no reduc code for scalar code.\n");
6625 return false;
6629 else if (reduction_type == COND_REDUCTION)
6631 int scalar_precision
6632 = GET_MODE_PRECISION (SCALAR_TYPE_MODE (scalar_type));
6633 cr_index_scalar_type = make_unsigned_type (scalar_precision);
6634 cr_index_vector_type = build_vector_type (cr_index_scalar_type,
6635 nunits_out);
6637 if (direct_internal_fn_supported_p (IFN_REDUC_MAX, cr_index_vector_type,
6638 OPTIMIZE_FOR_SPEED))
6639 reduc_fn = IFN_REDUC_MAX;
6642 if (reduction_type != EXTRACT_LAST_REDUCTION
6643 && (!nested_cycle || double_reduc)
6644 && reduc_fn == IFN_LAST
6645 && !nunits_out.is_constant ())
6647 if (dump_enabled_p ())
6648 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6649 "missing target support for reduction on"
6650 " variable-length vectors.\n");
6651 return false;
6654 /* For SLP reductions, see if there is a neutral value we can use. */
6655 tree neutral_op = NULL_TREE;
6656 if (slp_node)
6657 neutral_op = neutral_op_for_slp_reduction
6658 (slp_node_instance->reduc_phis, code,
6659 REDUC_GROUP_FIRST_ELEMENT (stmt_info) != NULL);
6661 if (double_reduc && reduction_type == FOLD_LEFT_REDUCTION)
6663 /* We can't support in-order reductions of code such as this:
6665 for (int i = 0; i < n1; ++i)
6666 for (int j = 0; j < n2; ++j)
6667 l += a[j];
6669 since GCC effectively transforms the loop when vectorizing:
6671 for (int i = 0; i < n1 / VF; ++i)
6672 for (int j = 0; j < n2; ++j)
6673 for (int k = 0; k < VF; ++k)
6674 l += a[j];
6676 which is a reassociation of the original operation. */
6677 if (dump_enabled_p ())
6678 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6679 "in-order double reduction not supported.\n");
6681 return false;
6684 if (reduction_type == FOLD_LEFT_REDUCTION
6685 && slp_node
6686 && !REDUC_GROUP_FIRST_ELEMENT (stmt_info))
6688 /* We cannot use in-order reductions in this case because there is
6689 an implicit reassociation of the operations involved. */
6690 if (dump_enabled_p ())
6691 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6692 "in-order unchained SLP reductions not supported.\n");
6693 return false;
6696 /* For double reductions, and for SLP reductions with a neutral value,
6697 we construct a variable-length initial vector by loading a vector
6698 full of the neutral value and then shift-and-inserting the start
6699 values into the low-numbered elements. */
6700 if ((double_reduc || neutral_op)
6701 && !nunits_out.is_constant ()
6702 && !direct_internal_fn_supported_p (IFN_VEC_SHL_INSERT,
6703 vectype_out, OPTIMIZE_FOR_SPEED))
6705 if (dump_enabled_p ())
6706 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6707 "reduction on variable-length vectors requires"
6708 " target support for a vector-shift-and-insert"
6709 " operation.\n");
6710 return false;
6713 /* Check extra constraints for variable-length unchained SLP reductions. */
6714 if (STMT_SLP_TYPE (stmt_info)
6715 && !REDUC_GROUP_FIRST_ELEMENT (stmt_info)
6716 && !nunits_out.is_constant ())
6718 /* We checked above that we could build the initial vector when
6719 there's a neutral element value. Check here for the case in
6720 which each SLP statement has its own initial value and in which
6721 that value needs to be repeated for every instance of the
6722 statement within the initial vector. */
6723 unsigned int group_size = SLP_TREE_SCALAR_STMTS (slp_node).length ();
6724 scalar_mode elt_mode = SCALAR_TYPE_MODE (TREE_TYPE (vectype_out));
6725 if (!neutral_op
6726 && !can_duplicate_and_interleave_p (group_size, elt_mode))
6728 if (dump_enabled_p ())
6729 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6730 "unsupported form of SLP reduction for"
6731 " variable-length vectors: cannot build"
6732 " initial vector.\n");
6733 return false;
6735 /* The epilogue code relies on the number of elements being a multiple
6736 of the group size. The duplicate-and-interleave approach to setting
6737 up the initial vector does too. */
6738 if (!multiple_p (nunits_out, group_size))
6740 if (dump_enabled_p ())
6741 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6742 "unsupported form of SLP reduction for"
6743 " variable-length vectors: the vector size"
6744 " is not a multiple of the number of results.\n");
6745 return false;
6749 /* In case of widening multiplication by a constant, we update the type
6750 of the constant to be the type of the other operand. We check that the
6751 constant fits the type in the pattern recognition pass. */
6752 if (code == DOT_PROD_EXPR
6753 && !types_compatible_p (TREE_TYPE (ops[0]), TREE_TYPE (ops[1])))
6755 if (TREE_CODE (ops[0]) == INTEGER_CST)
6756 ops[0] = fold_convert (TREE_TYPE (ops[1]), ops[0]);
6757 else if (TREE_CODE (ops[1]) == INTEGER_CST)
6758 ops[1] = fold_convert (TREE_TYPE (ops[0]), ops[1]);
6759 else
6761 if (dump_enabled_p ())
6762 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6763 "invalid types in dot-prod\n");
6765 return false;
6769 if (reduction_type == COND_REDUCTION)
6771 widest_int ni;
6773 if (! max_loop_iterations (loop, &ni))
6775 if (dump_enabled_p ())
6776 dump_printf_loc (MSG_NOTE, vect_location,
6777 "loop count not known, cannot create cond "
6778 "reduction.\n");
6779 return false;
6781 /* Convert backedges to iterations. */
6782 ni += 1;
6784 /* The additional index will be the same type as the condition. Check
6785 that the loop can fit into this less one (because we'll use up the
6786 zero slot for when there are no matches). */
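      /* Worked example, for illustration: with a 16-bit unsigned index type
	 MAX_INDEX is 65535, so the loop may run at most 65534 iterations;
	 index value 0 stays reserved for the no-match case.  */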
6787 tree max_index = TYPE_MAX_VALUE (cr_index_scalar_type);
6788 if (wi::geu_p (ni, wi::to_widest (max_index)))
6790 if (dump_enabled_p ())
6791 dump_printf_loc (MSG_NOTE, vect_location,
6792 "loop size is greater than data size.\n");
6793 return false;
6797 /* In case the vectorization factor (VF) is bigger than the number
6798 of elements that we can fit in a vectype (nunits), we have to generate
6799 more than one vector stmt, i.e. we need to "unroll" the
6800 vector stmt by a factor VF/nunits. For more details see documentation
6801 in vectorizable_operation. */
6803 /* If the reduction is used in an outer loop we need to generate
6804 VF intermediate results, like so (e.g. for ncopies=2):
6805 r0 = phi (init, r0)
6806 r1 = phi (init, r1)
6807 r0 = x0 + r0;
6808 r1 = x1 + r1;
6809 (i.e. we generate VF results in 2 registers).
6810 In this case we have a separate def-use cycle for each copy, and therefore
6811 for each copy we get the vector def for the reduction variable from the
6812 respective phi node created for this copy.
6814 Otherwise (the reduction is unused in the loop nest), we can combine
6815 together intermediate results, like so (e.g. for ncopies=2):
6816 r = phi (init, r)
6817 r = x0 + r;
6818 r = x1 + r;
6819 (i.e. we generate VF/2 results in a single register).
6820 In this case for each copy we get the vector def for the reduction variable
6821 from the vectorized reduction operation generated in the previous iteration.
6823 This only works when we see both the reduction PHI and its only consumer
6824 in vectorizable_reduction and there are no intermediate stmts
6825 participating. */
6826 stmt_vec_info use_stmt_info;
6827 tree reduc_phi_result = gimple_phi_result (reduc_def_phi);
6828 if (ncopies > 1
6829 && (STMT_VINFO_RELEVANT (stmt_info) <= vect_used_only_live)
6830 && (use_stmt_info = loop_vinfo->lookup_single_use (reduc_phi_result))
6831 && vect_stmt_to_vectorize (use_stmt_info) == stmt_info)
6833 single_defuse_cycle = true;
6834 epilog_copies = 1;
6836 else
6837 epilog_copies = ncopies;
6839 /* If the reduction stmt is one of the patterns that have lane
6840 reduction embedded we cannot handle the case of ! single_defuse_cycle. */
6841 if ((ncopies > 1
6842 && ! single_defuse_cycle)
6843 && (code == DOT_PROD_EXPR
6844 || code == WIDEN_SUM_EXPR
6845 || code == SAD_EXPR))
6847 if (dump_enabled_p ())
6848 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6849 "multi def-use cycle not possible for lane-reducing "
6850 "reduction operation\n");
6851 return false;
6854 if (slp_node)
6855 vec_num = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node);
6856 else
6857 vec_num = 1;
6859 internal_fn cond_fn = get_conditional_internal_fn (code);
6860 vec_loop_masks *masks = &LOOP_VINFO_MASKS (loop_vinfo);
6862 if (!vec_stmt) /* transformation not required. */
6864 vect_model_reduction_cost (stmt_info, reduc_fn, ncopies, cost_vec);
6865 if (loop_vinfo && LOOP_VINFO_CAN_FULLY_MASK_P (loop_vinfo))
6867 if (reduction_type != FOLD_LEFT_REDUCTION
6868 && (cond_fn == IFN_LAST
6869 || !direct_internal_fn_supported_p (cond_fn, vectype_in,
6870 OPTIMIZE_FOR_SPEED)))
6872 if (dump_enabled_p ())
6873 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6874 "can't use a fully-masked loop because no"
6875 " conditional operation is available.\n");
6876 LOOP_VINFO_CAN_FULLY_MASK_P (loop_vinfo) = false;
6878 else if (reduc_index == -1)
6880 if (dump_enabled_p ())
6881 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6882 "can't use a fully-masked loop for chained"
6883 " reductions.\n");
6884 LOOP_VINFO_CAN_FULLY_MASK_P (loop_vinfo) = false;
6886 else
6887 vect_record_loop_mask (loop_vinfo, masks, ncopies * vec_num,
6888 vectype_in);
6890 if (dump_enabled_p ()
6891 && reduction_type == FOLD_LEFT_REDUCTION)
6892 dump_printf_loc (MSG_NOTE, vect_location,
6893 "using an in-order (fold-left) reduction.\n");
6894 STMT_VINFO_TYPE (stmt_info) = reduc_vec_info_type;
6895 return true;
6898 /* Transform. */
6900 if (dump_enabled_p ())
6901 dump_printf_loc (MSG_NOTE, vect_location, "transform reduction.\n");
6903 /* FORNOW: Multiple types are not supported for condition. */
6904 if (code == COND_EXPR)
6905 gcc_assert (ncopies == 1);
6907 bool masked_loop_p = LOOP_VINFO_FULLY_MASKED_P (loop_vinfo);
6909 if (reduction_type == FOLD_LEFT_REDUCTION)
6910 return vectorize_fold_left_reduction
6911 (stmt_info, gsi, vec_stmt, slp_node, reduc_def_phi, code,
6912 reduc_fn, ops, vectype_in, reduc_index, masks);
6914 if (reduction_type == EXTRACT_LAST_REDUCTION)
6916 gcc_assert (!slp_node);
6917 return vectorizable_condition (stmt_info, gsi, vec_stmt,
6918 true, NULL, NULL);
6921 /* Create the destination vector */
6922 vec_dest = vect_create_destination_var (scalar_dest, vectype_out);
6924 prev_stmt_info = NULL;
6925 prev_phi_info = NULL;
6926 if (!slp_node)
6928 vec_oprnds0.create (1);
6929 vec_oprnds1.create (1);
6930 if (op_type == ternary_op)
6931 vec_oprnds2.create (1);
6934 phis.create (vec_num);
6935 vect_defs.create (vec_num);
6936 if (!slp_node)
6937 vect_defs.quick_push (NULL_TREE);
6939 if (slp_node)
6940 phis.splice (SLP_TREE_VEC_STMTS (slp_node_instance->reduc_phis));
6941 else
6942 phis.quick_push (STMT_VINFO_VEC_STMT (reduc_def_info));
6944 for (j = 0; j < ncopies; j++)
6946 if (code == COND_EXPR)
6948 gcc_assert (!slp_node);
6949 vectorizable_condition (stmt_info, gsi, vec_stmt,
6950 true, NULL, NULL);
6951 break;
6953 if (code == LSHIFT_EXPR
6954 || code == RSHIFT_EXPR)
6956 vectorizable_shift (stmt_info, gsi, vec_stmt, slp_node, NULL);
6957 break;
6960 /* Handle uses. */
6961 if (j == 0)
6963 if (slp_node)
6965 /* Get vec defs for all the operands except the reduction index,
6966 ensuring the ordering of the ops in the vector is kept. */
6967 auto_vec<tree, 3> slp_ops;
6968 auto_vec<vec<tree>, 3> vec_defs;
6970 slp_ops.quick_push (ops[0]);
6971 slp_ops.quick_push (ops[1]);
6972 if (op_type == ternary_op)
6973 slp_ops.quick_push (ops[2]);
6975 vect_get_slp_defs (slp_ops, slp_node, &vec_defs);
6977 vec_oprnds0.safe_splice (vec_defs[0]);
6978 vec_defs[0].release ();
6979 vec_oprnds1.safe_splice (vec_defs[1]);
6980 vec_defs[1].release ();
6981 if (op_type == ternary_op)
6983 vec_oprnds2.safe_splice (vec_defs[2]);
6984 vec_defs[2].release ();
6987 else
6989 vec_oprnds0.quick_push
6990 (vect_get_vec_def_for_operand (ops[0], stmt_info));
6991 vec_oprnds1.quick_push
6992 (vect_get_vec_def_for_operand (ops[1], stmt_info));
6993 if (op_type == ternary_op)
6994 vec_oprnds2.quick_push
6995 (vect_get_vec_def_for_operand (ops[2], stmt_info));
6998 else
7000 if (!slp_node)
7002 gcc_assert (reduc_index != -1 || ! single_defuse_cycle);
7004 if (single_defuse_cycle && reduc_index == 0)
7005 vec_oprnds0[0] = gimple_get_lhs (new_stmt_info->stmt);
7006 else
7007 vec_oprnds0[0]
7008 = vect_get_vec_def_for_stmt_copy (loop_vinfo,
7009 vec_oprnds0[0]);
7010 if (single_defuse_cycle && reduc_index == 1)
7011 vec_oprnds1[0] = gimple_get_lhs (new_stmt_info->stmt);
7012 else
7013 vec_oprnds1[0]
7014 = vect_get_vec_def_for_stmt_copy (loop_vinfo,
7015 vec_oprnds1[0]);
7016 if (op_type == ternary_op)
7018 if (single_defuse_cycle && reduc_index == 2)
7019 vec_oprnds2[0] = gimple_get_lhs (new_stmt_info->stmt);
7020 else
7021 vec_oprnds2[0]
7022 = vect_get_vec_def_for_stmt_copy (loop_vinfo,
7023 vec_oprnds2[0]);
7028 FOR_EACH_VEC_ELT (vec_oprnds0, i, def0)
7030 tree vop[3] = { def0, vec_oprnds1[i], NULL_TREE };
7031 if (masked_loop_p)
7033 /* Make sure that the reduction accumulator is vop[0]. */
7034 if (reduc_index == 1)
7036 gcc_assert (commutative_tree_code (code));
7037 std::swap (vop[0], vop[1]);
7039 tree mask = vect_get_loop_mask (gsi, masks, vec_num * ncopies,
7040 vectype_in, i * ncopies + j);
7041 gcall *call = gimple_build_call_internal (cond_fn, 4, mask,
7042 vop[0], vop[1],
7043 vop[0]);
7044 new_temp = make_ssa_name (vec_dest, call);
7045 gimple_call_set_lhs (call, new_temp);
7046 gimple_call_set_nothrow (call, true);
7047 new_stmt_info
7048 = vect_finish_stmt_generation (stmt_info, call, gsi);
7050 else
7052 if (op_type == ternary_op)
7053 vop[2] = vec_oprnds2[i];
7055 gassign *new_stmt = gimple_build_assign (vec_dest, code,
7056 vop[0], vop[1], vop[2]);
7057 new_temp = make_ssa_name (vec_dest, new_stmt);
7058 gimple_assign_set_lhs (new_stmt, new_temp);
7059 new_stmt_info
7060 = vect_finish_stmt_generation (stmt_info, new_stmt, gsi);
7063 if (slp_node)
7065 SLP_TREE_VEC_STMTS (slp_node).quick_push (new_stmt_info);
7066 vect_defs.quick_push (new_temp);
7068 else
7069 vect_defs[0] = new_temp;
7072 if (slp_node)
7073 continue;
7075 if (j == 0)
7076 STMT_VINFO_VEC_STMT (stmt_info) = *vec_stmt = new_stmt_info;
7077 else
7078 STMT_VINFO_RELATED_STMT (prev_stmt_info) = new_stmt_info;
7080 prev_stmt_info = new_stmt_info;
7083 /* Finalize the reduction-phi (set its arguments) and create the
7084 epilog reduction code. */
7085 if ((!single_defuse_cycle || code == COND_EXPR) && !slp_node)
7086 vect_defs[0] = gimple_get_lhs ((*vec_stmt)->stmt);
7088 vect_create_epilog_for_reduction (vect_defs, stmt_info, reduc_def_phi,
7089 epilog_copies, reduc_fn, phis,
7090 double_reduc, slp_node, slp_node_instance,
7091 cond_reduc_val, cond_reduc_op_code,
7092 neutral_op);
7094 return true;
7097 /* Function vect_min_worthwhile_factor.
7099 For a loop where we could vectorize the operation indicated by CODE,
7100 return the minimum vectorization factor that makes it worthwhile
7101 to use generic vectors. */
7102 static unsigned int
7103 vect_min_worthwhile_factor (enum tree_code code)
7105 switch (code)
7107 case PLUS_EXPR:
7108 case MINUS_EXPR:
7109 case NEGATE_EXPR:
7110 return 4;
7112 case BIT_AND_EXPR:
7113 case BIT_IOR_EXPR:
7114 case BIT_XOR_EXPR:
7115 case BIT_NOT_EXPR:
7116 return 2;
7118 default:
7119 return INT_MAX;
7123 /* Return true if VINFO indicates we are doing loop vectorization and if
7124 it is worth decomposing CODE operations into scalar operations for
7125 that loop's vectorization factor. */
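/* For example, with a constant vectorization factor of 4 a PLUS_EXPR
   qualifies (minimum factor 4), whereas with a factor of 2 only the
   bitwise codes do (minimum factor 2). */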
7127 bool
7128 vect_worthwhile_without_simd_p (vec_info *vinfo, tree_code code)
7130 loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo);
7131 unsigned HOST_WIDE_INT value;
7132 return (loop_vinfo
7133 && LOOP_VINFO_VECT_FACTOR (loop_vinfo).is_constant (&value)
7134 && value >= vect_min_worthwhile_factor (code));
7137 /* Function vectorizable_induction
7139 Check if STMT_INFO performs an induction computation that can be vectorized.
7140 If VEC_STMT is also passed, vectorize the induction PHI: create a vectorized
7141 phi to replace it, put it in VEC_STMT, and add it to the same basic block.
7142 Return true if STMT_INFO is vectorizable in this way. */
7144 bool
7145 vectorizable_induction (stmt_vec_info stmt_info,
7146 gimple_stmt_iterator *gsi ATTRIBUTE_UNUSED,
7147 stmt_vec_info *vec_stmt, slp_tree slp_node,
7148 stmt_vector_for_cost *cost_vec)
7150 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
7151 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
7152 unsigned ncopies;
7153 bool nested_in_vect_loop = false;
7154 struct loop *iv_loop;
7155 tree vec_def;
7156 edge pe = loop_preheader_edge (loop);
7157 basic_block new_bb;
7158 tree new_vec, vec_init, vec_step, t;
7159 tree new_name;
7160 gimple *new_stmt;
7161 gphi *induction_phi;
7162 tree induc_def, vec_dest;
7163 tree init_expr, step_expr;
7164 poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
7165 unsigned i;
7166 tree expr;
7167 gimple_seq stmts;
7168 imm_use_iterator imm_iter;
7169 use_operand_p use_p;
7170 gimple *exit_phi;
7171 edge latch_e;
7172 tree loop_arg;
7173 gimple_stmt_iterator si;
7175 gphi *phi = dyn_cast <gphi *> (stmt_info->stmt);
7176 if (!phi)
7177 return false;
7179 if (!STMT_VINFO_RELEVANT_P (stmt_info))
7180 return false;
7182 /* Make sure it was recognized as induction computation. */
7183 if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_induction_def)
7184 return false;
7186 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
7187 poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (vectype);
7189 if (slp_node)
7190 ncopies = 1;
7191 else
7192 ncopies = vect_get_num_copies (loop_vinfo, vectype);
7193 gcc_assert (ncopies >= 1);
7195 /* FORNOW. These restrictions should be relaxed. */
7196 if (nested_in_vect_loop_p (loop, stmt_info))
7198 imm_use_iterator imm_iter;
7199 use_operand_p use_p;
7200 gimple *exit_phi;
7201 edge latch_e;
7202 tree loop_arg;
7204 if (ncopies > 1)
7206 if (dump_enabled_p ())
7207 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7208 "multiple types in nested loop.\n");
7209 return false;
7212 /* FORNOW: outer loop induction with SLP not supported. */
7213 if (STMT_SLP_TYPE (stmt_info))
7214 return false;
7216 exit_phi = NULL;
7217 latch_e = loop_latch_edge (loop->inner);
7218 loop_arg = PHI_ARG_DEF_FROM_EDGE (phi, latch_e);
7219 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, loop_arg)
7221 gimple *use_stmt = USE_STMT (use_p);
7222 if (is_gimple_debug (use_stmt))
7223 continue;
7225 if (!flow_bb_inside_loop_p (loop->inner, gimple_bb (use_stmt)))
7227 exit_phi = use_stmt;
7228 break;
7231 if (exit_phi)
7233 stmt_vec_info exit_phi_vinfo = loop_vinfo->lookup_stmt (exit_phi);
7234 if (!(STMT_VINFO_RELEVANT_P (exit_phi_vinfo)
7235 && !STMT_VINFO_LIVE_P (exit_phi_vinfo)))
7237 if (dump_enabled_p ())
7238 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7239 "inner-loop induction only used outside "
7240 "of the outer vectorized loop.\n");
7241 return false;
7245 nested_in_vect_loop = true;
7246 iv_loop = loop->inner;
7248 else
7249 iv_loop = loop;
7250 gcc_assert (iv_loop == (gimple_bb (phi))->loop_father);
7252 if (slp_node && !nunits.is_constant ())
7254 /* The current SLP code creates the initial value element-by-element. */
7255 if (dump_enabled_p ())
7256 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7257 "SLP induction not supported for variable-length"
7258 " vectors.\n");
7259 return false;
7262 if (!vec_stmt) /* transformation not required. */
7264 STMT_VINFO_TYPE (stmt_info) = induc_vec_info_type;
7265 DUMP_VECT_SCOPE ("vectorizable_induction");
7266 vect_model_induction_cost (stmt_info, ncopies, cost_vec);
7267 return true;
7270 /* Transform. */
7272 /* Compute a vector variable, initialized with the first VF values of
7273 the induction variable. E.g., for an iv with IV_PHI='X' and
7274 evolution S, for a vector of 4 units, we want to compute:
7275 [X, X + S, X + 2*S, X + 3*S]. */
7277 if (dump_enabled_p ())
7278 dump_printf_loc (MSG_NOTE, vect_location, "transform induction phi.\n");
7280 latch_e = loop_latch_edge (iv_loop);
7281 loop_arg = PHI_ARG_DEF_FROM_EDGE (phi, latch_e);
7283 step_expr = STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_info);
7284 gcc_assert (step_expr != NULL_TREE);
7286 pe = loop_preheader_edge (iv_loop);
7287 init_expr = PHI_ARG_DEF_FROM_EDGE (phi,
7288 loop_preheader_edge (iv_loop));
7290 stmts = NULL;
7291 if (!nested_in_vect_loop)
7293 /* Convert the initial value to the desired type. */
7294 tree new_type = TREE_TYPE (vectype);
7295 init_expr = gimple_convert (&stmts, new_type, init_expr);
7297 /* If we are using the loop mask to "peel" for alignment then we need
7298 to adjust the start value here. */
7299 tree skip_niters = LOOP_VINFO_MASK_SKIP_NITERS (loop_vinfo);
7300 if (skip_niters != NULL_TREE)
7302 if (FLOAT_TYPE_P (vectype))
7303 skip_niters = gimple_build (&stmts, FLOAT_EXPR, new_type,
7304 skip_niters);
7305 else
7306 skip_niters = gimple_convert (&stmts, new_type, skip_niters);
7307 tree skip_step = gimple_build (&stmts, MULT_EXPR, new_type,
7308 skip_niters, step_expr);
7309 init_expr = gimple_build (&stmts, MINUS_EXPR, new_type,
7310 init_expr, skip_step);
7314 /* Convert the step to the desired type. */
7315 step_expr = gimple_convert (&stmts, TREE_TYPE (vectype), step_expr);
7317 if (stmts)
7319 new_bb = gsi_insert_seq_on_edge_immediate (pe, stmts);
7320 gcc_assert (!new_bb);
7323 /* Find the first insertion point in the BB. */
7324 basic_block bb = gimple_bb (phi);
7325 si = gsi_after_labels (bb);
7327 /* For SLP induction we have to generate several IVs; for example,
7328 with group size 3 we need [i, i, i, i + S], [i + S, i + S, i + 2*S, i + 2*S]
7329 and [i + 2*S, i + 3*S, i + 3*S, i + 3*S]. The step is the same uniform
7330 vector [VF*S, VF*S, VF*S, VF*S] for all of them. */
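/* In the group-size-3, four-lane example above this means
   lcm (3, 4) / 4 == 3 distinct initial vectors are built element by
   element; any further vectors are derived from them by adding
   lcm (3, 4) / 3 == 4 times S to every element, as computed below. */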
7331 if (slp_node)
7333 /* Enforced above. */
7334 unsigned int const_nunits = nunits.to_constant ();
7336 /* Generate [VF*S, VF*S, ... ]. */
7337 if (SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr)))
7339 expr = build_int_cst (integer_type_node, vf);
7340 expr = fold_convert (TREE_TYPE (step_expr), expr);
7342 else
7343 expr = build_int_cst (TREE_TYPE (step_expr), vf);
7344 new_name = fold_build2 (MULT_EXPR, TREE_TYPE (step_expr),
7345 expr, step_expr);
7346 if (! CONSTANT_CLASS_P (new_name))
7347 new_name = vect_init_vector (stmt_info, new_name,
7348 TREE_TYPE (step_expr), NULL);
7349 new_vec = build_vector_from_val (vectype, new_name);
7350 vec_step = vect_init_vector (stmt_info, new_vec, vectype, NULL);
7352 /* Now generate the IVs. */
7353 unsigned group_size = SLP_TREE_SCALAR_STMTS (slp_node).length ();
7354 unsigned nvects = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node);
7355 unsigned elts = const_nunits * nvects;
7356 unsigned nivs = least_common_multiple (group_size,
7357 const_nunits) / const_nunits;
7358 gcc_assert (elts % group_size == 0);
7359 tree elt = init_expr;
7360 unsigned ivn;
7361 for (ivn = 0; ivn < nivs; ++ivn)
7363 tree_vector_builder elts (vectype, const_nunits, 1);
7364 stmts = NULL;
7365 for (unsigned eltn = 0; eltn < const_nunits; ++eltn)
7367 if (ivn*const_nunits + eltn >= group_size
7368 && (ivn * const_nunits + eltn) % group_size == 0)
7369 elt = gimple_build (&stmts, PLUS_EXPR, TREE_TYPE (elt),
7370 elt, step_expr);
7371 elts.quick_push (elt);
7373 vec_init = gimple_build_vector (&stmts, &elts);
7374 if (stmts)
7376 new_bb = gsi_insert_seq_on_edge_immediate (pe, stmts);
7377 gcc_assert (!new_bb);
7380 /* Create the induction-phi that defines the induction-operand. */
7381 vec_dest = vect_get_new_vect_var (vectype, vect_simple_var, "vec_iv_");
7382 induction_phi = create_phi_node (vec_dest, iv_loop->header);
7383 stmt_vec_info induction_phi_info
7384 = loop_vinfo->add_stmt (induction_phi);
7385 induc_def = PHI_RESULT (induction_phi);
7387 /* Create the iv update inside the loop */
7388 vec_def = make_ssa_name (vec_dest);
7389 new_stmt = gimple_build_assign (vec_def, PLUS_EXPR, induc_def, vec_step);
7390 gsi_insert_before (&si, new_stmt, GSI_SAME_STMT);
7391 loop_vinfo->add_stmt (new_stmt);
7393 /* Set the arguments of the phi node: */
7394 add_phi_arg (induction_phi, vec_init, pe, UNKNOWN_LOCATION);
7395 add_phi_arg (induction_phi, vec_def, loop_latch_edge (iv_loop),
7396 UNKNOWN_LOCATION);
7398 SLP_TREE_VEC_STMTS (slp_node).quick_push (induction_phi_info);
7401 /* Re-use IVs when we can. */
7402 if (ivn < nvects)
7404 unsigned vfp
7405 = least_common_multiple (group_size, const_nunits) / group_size;
7406 /* Generate [VF'*S, VF'*S, ... ]. */
7407 if (SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr)))
7409 expr = build_int_cst (integer_type_node, vfp);
7410 expr = fold_convert (TREE_TYPE (step_expr), expr);
7412 else
7413 expr = build_int_cst (TREE_TYPE (step_expr), vfp);
7414 new_name = fold_build2 (MULT_EXPR, TREE_TYPE (step_expr),
7415 expr, step_expr);
7416 if (! CONSTANT_CLASS_P (new_name))
7417 new_name = vect_init_vector (stmt_info, new_name,
7418 TREE_TYPE (step_expr), NULL);
7419 new_vec = build_vector_from_val (vectype, new_name);
7420 vec_step = vect_init_vector (stmt_info, new_vec, vectype, NULL);
7421 for (; ivn < nvects; ++ivn)
7423 gimple *iv = SLP_TREE_VEC_STMTS (slp_node)[ivn - nivs]->stmt;
7424 tree def;
7425 if (gimple_code (iv) == GIMPLE_PHI)
7426 def = gimple_phi_result (iv);
7427 else
7428 def = gimple_assign_lhs (iv);
7429 new_stmt = gimple_build_assign (make_ssa_name (vectype),
7430 PLUS_EXPR,
7431 def, vec_step);
7432 if (gimple_code (iv) == GIMPLE_PHI)
7433 gsi_insert_before (&si, new_stmt, GSI_SAME_STMT);
7434 else
7436 gimple_stmt_iterator tgsi = gsi_for_stmt (iv);
7437 gsi_insert_after (&tgsi, new_stmt, GSI_CONTINUE_LINKING);
7439 SLP_TREE_VEC_STMTS (slp_node).quick_push
7440 (loop_vinfo->add_stmt (new_stmt));
7444 return true;
7447 /* Create the vector that holds the initial_value of the induction. */
7448 if (nested_in_vect_loop)
7450 /* iv_loop is nested in the loop to be vectorized. init_expr has already
7451 been created during vectorization of previous stmts. We obtain it
7452 from the STMT_VINFO_VEC_STMT of the defining stmt. */
7453 vec_init = vect_get_vec_def_for_operand (init_expr, stmt_info);
7454 /* If the initial value is not of proper type, convert it. */
7455 if (!useless_type_conversion_p (vectype, TREE_TYPE (vec_init)))
7457 new_stmt
7458 = gimple_build_assign (vect_get_new_ssa_name (vectype,
7459 vect_simple_var,
7460 "vec_iv_"),
7461 VIEW_CONVERT_EXPR,
7462 build1 (VIEW_CONVERT_EXPR, vectype,
7463 vec_init));
7464 vec_init = gimple_assign_lhs (new_stmt);
7465 new_bb = gsi_insert_on_edge_immediate (loop_preheader_edge (iv_loop),
7466 new_stmt);
7467 gcc_assert (!new_bb);
7468 loop_vinfo->add_stmt (new_stmt);
7471 else
7473 /* iv_loop is the loop to be vectorized. Create:
7474 vec_init = [X, X+S, X+2*S, X+3*S] (S = step_expr, X = init_expr) */
7475 stmts = NULL;
7476 new_name = gimple_convert (&stmts, TREE_TYPE (vectype), init_expr);
7478 unsigned HOST_WIDE_INT const_nunits;
7479 if (nunits.is_constant (&const_nunits))
7481 tree_vector_builder elts (vectype, const_nunits, 1);
7482 elts.quick_push (new_name);
7483 for (i = 1; i < const_nunits; i++)
7485 /* Create: new_name_i = new_name + step_expr */
7486 new_name = gimple_build (&stmts, PLUS_EXPR, TREE_TYPE (new_name),
7487 new_name, step_expr);
7488 elts.quick_push (new_name);
7490 /* Create a vector from [new_name_0, new_name_1, ...,
7491 new_name_nunits-1] */
7492 vec_init = gimple_build_vector (&stmts, &elts);
7494 else if (INTEGRAL_TYPE_P (TREE_TYPE (step_expr)))
7495 /* Build the initial value directly from a VEC_SERIES_EXPR. */
7496 vec_init = gimple_build (&stmts, VEC_SERIES_EXPR, vectype,
7497 new_name, step_expr);
7498 else
7500 /* Build:
7501 [base, base, base, ...]
7502 + (vectype) [0, 1, 2, ...] * [step, step, step, ...]. */
7503 gcc_assert (SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr)));
7504 gcc_assert (flag_associative_math);
7505 tree index = build_index_vector (vectype, 0, 1);
7506 tree base_vec = gimple_build_vector_from_val (&stmts, vectype,
7507 new_name);
7508 tree step_vec = gimple_build_vector_from_val (&stmts, vectype,
7509 step_expr);
7510 vec_init = gimple_build (&stmts, FLOAT_EXPR, vectype, index);
7511 vec_init = gimple_build (&stmts, MULT_EXPR, vectype,
7512 vec_init, step_vec);
7513 vec_init = gimple_build (&stmts, PLUS_EXPR, vectype,
7514 vec_init, base_vec);
7517 if (stmts)
7519 new_bb = gsi_insert_seq_on_edge_immediate (pe, stmts);
7520 gcc_assert (!new_bb);
7525 /* Create the vector that holds the step of the induction. */
7526 if (nested_in_vect_loop)
7527 /* iv_loop is nested in the loop to be vectorized. Generate:
7528 vec_step = [S, S, S, S] */
7529 new_name = step_expr;
7530 else
7532 /* iv_loop is the loop to be vectorized. Generate:
7533 vec_step = [VF*S, VF*S, VF*S, VF*S] */
7534 gimple_seq seq = NULL;
7535 if (SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr)))
7537 expr = build_int_cst (integer_type_node, vf);
7538 expr = gimple_build (&seq, FLOAT_EXPR, TREE_TYPE (step_expr), expr);
7540 else
7541 expr = build_int_cst (TREE_TYPE (step_expr), vf);
7542 new_name = gimple_build (&seq, MULT_EXPR, TREE_TYPE (step_expr),
7543 expr, step_expr);
7544 if (seq)
7546 new_bb = gsi_insert_seq_on_edge_immediate (pe, seq);
7547 gcc_assert (!new_bb);
7551 t = unshare_expr (new_name);
7552 gcc_assert (CONSTANT_CLASS_P (new_name)
7553 || TREE_CODE (new_name) == SSA_NAME);
7554 new_vec = build_vector_from_val (vectype, t);
7555 vec_step = vect_init_vector (stmt_info, new_vec, vectype, NULL);
7558 /* Create the following def-use cycle:
7559 loop prolog:
7560 vec_init = ...
7561 vec_step = ...
7562 loop:
7563 vec_iv = PHI <vec_init, vec_loop>
7565 STMT
7567 vec_loop = vec_iv + vec_step; */
7569 /* Create the induction-phi that defines the induction-operand. */
7570 vec_dest = vect_get_new_vect_var (vectype, vect_simple_var, "vec_iv_");
7571 induction_phi = create_phi_node (vec_dest, iv_loop->header);
7572 stmt_vec_info induction_phi_info = loop_vinfo->add_stmt (induction_phi);
7573 induc_def = PHI_RESULT (induction_phi);
7575 /* Create the iv update inside the loop */
7576 vec_def = make_ssa_name (vec_dest);
7577 new_stmt = gimple_build_assign (vec_def, PLUS_EXPR, induc_def, vec_step);
7578 gsi_insert_before (&si, new_stmt, GSI_SAME_STMT);
7579 stmt_vec_info new_stmt_info = loop_vinfo->add_stmt (new_stmt);
7581 /* Set the arguments of the phi node: */
7582 add_phi_arg (induction_phi, vec_init, pe, UNKNOWN_LOCATION);
7583 add_phi_arg (induction_phi, vec_def, loop_latch_edge (iv_loop),
7584 UNKNOWN_LOCATION);
7586 STMT_VINFO_VEC_STMT (stmt_info) = *vec_stmt = induction_phi_info;
7588 /* In case the vectorization factor (VF) is bigger than the number
7589 of elements that we can fit in a vectype (nunits), we have to generate
7590 more than one vector stmt - i.e., we need to "unroll" the
7591 vector stmt by a factor of VF/nunits. For more details see the
7592 documentation in vectorizable_operation. */
7594 if (ncopies > 1)
7596 gimple_seq seq = NULL;
7597 stmt_vec_info prev_stmt_vinfo;
7598 /* FORNOW. This restriction should be relaxed. */
7599 gcc_assert (!nested_in_vect_loop);
7601 /* Create the vector that holds the step of the induction. */
7602 if (SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr)))
7604 expr = build_int_cst (integer_type_node, nunits);
7605 expr = gimple_build (&seq, FLOAT_EXPR, TREE_TYPE (step_expr), expr);
7607 else
7608 expr = build_int_cst (TREE_TYPE (step_expr), nunits);
7609 new_name = gimple_build (&seq, MULT_EXPR, TREE_TYPE (step_expr),
7610 expr, step_expr);
7611 if (seq)
7613 new_bb = gsi_insert_seq_on_edge_immediate (pe, seq);
7614 gcc_assert (!new_bb);
7617 t = unshare_expr (new_name);
7618 gcc_assert (CONSTANT_CLASS_P (new_name)
7619 || TREE_CODE (new_name) == SSA_NAME);
7620 new_vec = build_vector_from_val (vectype, t);
7621 vec_step = vect_init_vector (stmt_info, new_vec, vectype, NULL);
7623 vec_def = induc_def;
7624 prev_stmt_vinfo = induction_phi_info;
7625 for (i = 1; i < ncopies; i++)
7627 /* vec_i = vec_prev + vec_step */
7628 new_stmt = gimple_build_assign (vec_dest, PLUS_EXPR,
7629 vec_def, vec_step);
7630 vec_def = make_ssa_name (vec_dest, new_stmt);
7631 gimple_assign_set_lhs (new_stmt, vec_def);
7633 gsi_insert_before (&si, new_stmt, GSI_SAME_STMT);
7634 new_stmt_info = loop_vinfo->add_stmt (new_stmt);
7635 STMT_VINFO_RELATED_STMT (prev_stmt_vinfo) = new_stmt_info;
7636 prev_stmt_vinfo = new_stmt_info;
7640 if (nested_in_vect_loop)
7642 /* Find the loop-closed exit-phi of the induction, and record
7643 the final vector of induction results: */
7644 exit_phi = NULL;
7645 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, loop_arg)
7647 gimple *use_stmt = USE_STMT (use_p);
7648 if (is_gimple_debug (use_stmt))
7649 continue;
7651 if (!flow_bb_inside_loop_p (iv_loop, gimple_bb (use_stmt)))
7653 exit_phi = use_stmt;
7654 break;
7657 if (exit_phi)
7659 stmt_vec_info stmt_vinfo = loop_vinfo->lookup_stmt (exit_phi);
7660 /* FORNOW. We do not yet support the case where an inner-loop induction
7661 is used only outside the outer loop (i.e. not in the outer loop itself). */
7662 gcc_assert (STMT_VINFO_RELEVANT_P (stmt_vinfo)
7663 && !STMT_VINFO_LIVE_P (stmt_vinfo));
7665 STMT_VINFO_VEC_STMT (stmt_vinfo) = new_stmt_info;
7666 if (dump_enabled_p ())
7667 dump_printf_loc (MSG_NOTE, vect_location,
7668 "vector of inductions after inner-loop:%G",
7669 new_stmt);
7674 if (dump_enabled_p ())
7675 dump_printf_loc (MSG_NOTE, vect_location,
7676 "transform induction: created def-use cycle: %G%G",
7677 induction_phi, SSA_NAME_DEF_STMT (vec_def));
7679 return true;
7682 /* Function vectorizable_live_operation.
7684 STMT_INFO computes a value that is used outside the loop. Check if
7685 it can be supported. */
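/* A typical example is a loop whose body computes last = a[i] where last
   is used after the loop; the value of the final scalar iteration must
   then be extracted from the last vector, either as its last lane or via
   EXTRACT_LAST for fully-masked loops. */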
7687 bool
7688 vectorizable_live_operation (stmt_vec_info stmt_info,
7689 gimple_stmt_iterator *gsi ATTRIBUTE_UNUSED,
7690 slp_tree slp_node, int slp_index,
7691 stmt_vec_info *vec_stmt,
7692 stmt_vector_for_cost *)
7694 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
7695 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
7696 imm_use_iterator imm_iter;
7697 tree lhs, lhs_type, bitsize, vec_bitsize;
7698 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
7699 poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (vectype);
7700 int ncopies;
7701 gimple *use_stmt;
7702 auto_vec<tree> vec_oprnds;
7703 int vec_entry = 0;
7704 poly_uint64 vec_index = 0;
7706 gcc_assert (STMT_VINFO_LIVE_P (stmt_info));
7708 if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_reduction_def)
7709 return false;
7711 /* FORNOW. CHECKME. */
7712 if (nested_in_vect_loop_p (loop, stmt_info))
7713 return false;
7715 /* If STMT is not relevant and it is a simple assignment and its inputs are
7716 invariant then it can remain in place, unvectorized. The original last
7717 scalar value that it computes will be used. */
7718 if (!STMT_VINFO_RELEVANT_P (stmt_info))
7720 gcc_assert (is_simple_and_all_uses_invariant (stmt_info, loop_vinfo));
7721 if (dump_enabled_p ())
7722 dump_printf_loc (MSG_NOTE, vect_location,
7723 "statement is simple and uses invariant. Leaving in "
7724 "place.\n");
7725 return true;
7728 if (slp_node)
7729 ncopies = 1;
7730 else
7731 ncopies = vect_get_num_copies (loop_vinfo, vectype);
7733 if (slp_node)
7735 gcc_assert (slp_index >= 0);
7737 int num_scalar = SLP_TREE_SCALAR_STMTS (slp_node).length ();
7738 int num_vec = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node);
7740 /* Get the last occurrence of the scalar index from the concatenation of
7741 all the slp vectors. Calculate which slp vector it is and the index
7742 within. */
7743 poly_uint64 pos = (num_vec * nunits) - num_scalar + slp_index;
7745 /* Calculate which vector contains the result, and which lane of
7746 that vector we need. */
7747 if (!can_div_trunc_p (pos, nunits, &vec_entry, &vec_index))
7749 if (dump_enabled_p ())
7750 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7751 "Cannot determine which vector holds the"
7752 " final result.\n");
7753 return false;
7757 if (!vec_stmt)
7759 /* No transformation required. */
7760 if (LOOP_VINFO_CAN_FULLY_MASK_P (loop_vinfo))
7762 if (!direct_internal_fn_supported_p (IFN_EXTRACT_LAST, vectype,
7763 OPTIMIZE_FOR_SPEED))
7765 if (dump_enabled_p ())
7766 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7767 "can't use a fully-masked loop because "
7768 "the target doesn't support extract last "
7769 "reduction.\n");
7770 LOOP_VINFO_CAN_FULLY_MASK_P (loop_vinfo) = false;
7772 else if (slp_node)
7774 if (dump_enabled_p ())
7775 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7776 "can't use a fully-masked loop because an "
7777 "SLP statement is live after the loop.\n");
7778 LOOP_VINFO_CAN_FULLY_MASK_P (loop_vinfo) = false;
7780 else if (ncopies > 1)
7782 if (dump_enabled_p ())
7783 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7784 "can't use a fully-masked loop because"
7785 " ncopies is greater than 1.\n");
7786 LOOP_VINFO_CAN_FULLY_MASK_P (loop_vinfo) = false;
7788 else
7790 gcc_assert (ncopies == 1 && !slp_node);
7791 vect_record_loop_mask (loop_vinfo,
7792 &LOOP_VINFO_MASKS (loop_vinfo),
7793 1, vectype);
7796 return true;
7799 /* Use the lhs of the original scalar statement. */
7800 gimple *stmt = vect_orig_stmt (stmt_info)->stmt;
7802 lhs = (is_a <gphi *> (stmt)) ? gimple_phi_result (stmt)
7803 : gimple_get_lhs (stmt);
7804 lhs_type = TREE_TYPE (lhs);
7806 bitsize = (VECTOR_BOOLEAN_TYPE_P (vectype)
7807 ? bitsize_int (TYPE_PRECISION (TREE_TYPE (vectype)))
7808 : TYPE_SIZE (TREE_TYPE (vectype)));
7809 vec_bitsize = TYPE_SIZE (vectype);
7811 /* Get the vectorized lhs of STMT and the lane to use (counted in bits). */
7812 tree vec_lhs, bitstart;
7813 if (slp_node)
7815 gcc_assert (!LOOP_VINFO_FULLY_MASKED_P (loop_vinfo));
7817 /* Get the correct slp vectorized stmt. */
7818 gimple *vec_stmt = SLP_TREE_VEC_STMTS (slp_node)[vec_entry]->stmt;
7819 if (gphi *phi = dyn_cast <gphi *> (vec_stmt))
7820 vec_lhs = gimple_phi_result (phi);
7821 else
7822 vec_lhs = gimple_get_lhs (vec_stmt);
7824 /* Get entry to use. */
7825 bitstart = bitsize_int (vec_index);
7826 bitstart = int_const_binop (MULT_EXPR, bitsize, bitstart);
7828 else
7830 enum vect_def_type dt = STMT_VINFO_DEF_TYPE (stmt_info);
7831 vec_lhs = vect_get_vec_def_for_operand_1 (stmt_info, dt);
7832 gcc_checking_assert (ncopies == 1
7833 || !LOOP_VINFO_FULLY_MASKED_P (loop_vinfo));
7835 /* For multiple copies, get the last copy. */
7836 for (int i = 1; i < ncopies; ++i)
7837 vec_lhs = vect_get_vec_def_for_stmt_copy (loop_vinfo, vec_lhs);
7839 /* Get the last lane in the vector. */
7840 bitstart = int_const_binop (MINUS_EXPR, vec_bitsize, bitsize);
7843 gimple_seq stmts = NULL;
7844 tree new_tree;
7845 if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
7847 /* Emit:
7849 SCALAR_RES = EXTRACT_LAST <VEC_LHS, MASK>
7851 where VEC_LHS is the vectorized live-out result and MASK is
7852 the loop mask for the final iteration. */
7853 gcc_assert (ncopies == 1 && !slp_node);
7854 tree scalar_type = TREE_TYPE (STMT_VINFO_VECTYPE (stmt_info));
7855 tree mask = vect_get_loop_mask (gsi, &LOOP_VINFO_MASKS (loop_vinfo),
7856 1, vectype, 0);
7857 tree scalar_res = gimple_build (&stmts, CFN_EXTRACT_LAST,
7858 scalar_type, mask, vec_lhs);
7860 /* Convert the extracted vector element to the required scalar type. */
7861 new_tree = gimple_convert (&stmts, lhs_type, scalar_res);
7863 else
7865 tree bftype = TREE_TYPE (vectype);
7866 if (VECTOR_BOOLEAN_TYPE_P (vectype))
7867 bftype = build_nonstandard_integer_type (tree_to_uhwi (bitsize), 1);
7868 new_tree = build3 (BIT_FIELD_REF, bftype, vec_lhs, bitsize, bitstart);
7869 new_tree = force_gimple_operand (fold_convert (lhs_type, new_tree),
7870 &stmts, true, NULL_TREE);
7873 if (stmts)
7874 gsi_insert_seq_on_edge_immediate (single_exit (loop), stmts);
7876 /* Replace uses of LHS with the newly computed result. If the use stmt is a
7877 single-argument PHI, just replace all uses of the PHI result; this is
7878 necessary because the lcssa PHI defining LHS may appear before the newly inserted stmt. */
7879 use_operand_p use_p;
7880 FOR_EACH_IMM_USE_STMT (use_stmt, imm_iter, lhs)
7881 if (!flow_bb_inside_loop_p (loop, gimple_bb (use_stmt))
7882 && !is_gimple_debug (use_stmt))
7884 if (gimple_code (use_stmt) == GIMPLE_PHI
7885 && gimple_phi_num_args (use_stmt) == 1)
7887 replace_uses_by (gimple_phi_result (use_stmt), new_tree);
7889 else
7891 FOR_EACH_IMM_USE_ON_STMT (use_p, imm_iter)
7892 SET_USE (use_p, new_tree);
7894 update_stmt (use_stmt);
7897 return true;
7900 /* Kill any debug uses outside LOOP of SSA names defined in STMT_INFO. */
7902 static void
7903 vect_loop_kill_debug_uses (struct loop *loop, stmt_vec_info stmt_info)
7905 ssa_op_iter op_iter;
7906 imm_use_iterator imm_iter;
7907 def_operand_p def_p;
7908 gimple *ustmt;
7910 FOR_EACH_PHI_OR_STMT_DEF (def_p, stmt_info->stmt, op_iter, SSA_OP_DEF)
7912 FOR_EACH_IMM_USE_STMT (ustmt, imm_iter, DEF_FROM_PTR (def_p))
7914 basic_block bb;
7916 if (!is_gimple_debug (ustmt))
7917 continue;
7919 bb = gimple_bb (ustmt);
7921 if (!flow_bb_inside_loop_p (loop, bb))
7923 if (gimple_debug_bind_p (ustmt))
7925 if (dump_enabled_p ())
7926 dump_printf_loc (MSG_NOTE, vect_location,
7927 "killing debug use\n");
7929 gimple_debug_bind_reset_value (ustmt);
7930 update_stmt (ustmt);
7932 else
7933 gcc_unreachable ();
7939 /* Given a loop represented by LOOP_VINFO, return true if the computation of
7940 LOOP_VINFO_NITERS (= LOOP_VINFO_NITERSM1 + 1) doesn't overflow, false
7941 otherwise. */
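/* The sum can only overflow when NITERSM1 is the maximum value of its
   type, in which case NITERS wraps around to zero. */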
7943 static bool
7944 loop_niters_no_overflow (loop_vec_info loop_vinfo)
7946 /* Constant case. */
7947 if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
7949 tree cst_niters = LOOP_VINFO_NITERS (loop_vinfo);
7950 tree cst_nitersm1 = LOOP_VINFO_NITERSM1 (loop_vinfo);
7952 gcc_assert (TREE_CODE (cst_niters) == INTEGER_CST);
7953 gcc_assert (TREE_CODE (cst_nitersm1) == INTEGER_CST);
7954 if (wi::to_widest (cst_nitersm1) < wi::to_widest (cst_niters))
7955 return true;
7958 widest_int max;
7959 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
7960 /* Check the upper bound of loop niters. */
7961 if (get_max_loop_iterations (loop, &max))
7963 tree type = TREE_TYPE (LOOP_VINFO_NITERS (loop_vinfo));
7964 signop sgn = TYPE_SIGN (type);
7965 widest_int type_max = widest_int::from (wi::max_value (type), sgn);
7966 if (max < type_max)
7967 return true;
7969 return false;
7972 /* Return a mask type with half as many elements as TYPE. */
7974 tree
7975 vect_halve_mask_nunits (tree type)
7977 poly_uint64 nunits = exact_div (TYPE_VECTOR_SUBPARTS (type), 2);
7978 return build_truth_vector_type (nunits, current_vector_size);
7981 /* Return a mask type with twice as many elements as TYPE. */
7983 tree
7984 vect_double_mask_nunits (tree type)
7986 poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (type) * 2;
7987 return build_truth_vector_type (nunits, current_vector_size);
7990 /* Record that a fully-masked version of LOOP_VINFO would need MASKS to
7991 contain a sequence of NVECTORS masks that each control a vector of type
7992 VECTYPE. */
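/* For example, with a vectorization factor of 16, recording NVECTORS == 2
   masks for an 8-element VECTYPE gives 2 * 8 / 16 == 1 scalar per
   iteration for this rgroup. */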
7994 void
7995 vect_record_loop_mask (loop_vec_info loop_vinfo, vec_loop_masks *masks,
7996 unsigned int nvectors, tree vectype)
7998 gcc_assert (nvectors != 0);
7999 if (masks->length () < nvectors)
8000 masks->safe_grow_cleared (nvectors);
8001 rgroup_masks *rgm = &(*masks)[nvectors - 1];
8002 /* The number of scalars per iteration and the number of vectors are
8003 both compile-time constants. */
8004 unsigned int nscalars_per_iter
8005 = exact_div (nvectors * TYPE_VECTOR_SUBPARTS (vectype),
8006 LOOP_VINFO_VECT_FACTOR (loop_vinfo)).to_constant ();
8007 if (rgm->max_nscalars_per_iter < nscalars_per_iter)
8009 rgm->max_nscalars_per_iter = nscalars_per_iter;
8010 rgm->mask_type = build_same_sized_truth_vector_type (vectype);
8014 /* Given a complete set of masks MASKS, extract mask number INDEX
8015 for an rgroup that operates on NVECTORS vectors of type VECTYPE,
8016 where 0 <= INDEX < NVECTORS. Insert any set-up statements before GSI.
8018 See the comment above vec_loop_masks for more details about the mask
8019 arrangement. */
8021 tree
8022 vect_get_loop_mask (gimple_stmt_iterator *gsi, vec_loop_masks *masks,
8023 unsigned int nvectors, tree vectype, unsigned int index)
8025 rgroup_masks *rgm = &(*masks)[nvectors - 1];
8026 tree mask_type = rgm->mask_type;
8028 /* Populate the rgroup's mask array, if this is the first time we've
8029 used it. */
8030 if (rgm->masks.is_empty ())
8032 rgm->masks.safe_grow_cleared (nvectors);
8033 for (unsigned int i = 0; i < nvectors; ++i)
8035 tree mask = make_temp_ssa_name (mask_type, NULL, "loop_mask");
8036 /* Provide a dummy definition until the real one is available. */
8037 SSA_NAME_DEF_STMT (mask) = gimple_build_nop ();
8038 rgm->masks[i] = mask;
8042 tree mask = rgm->masks[index];
8043 if (maybe_ne (TYPE_VECTOR_SUBPARTS (mask_type),
8044 TYPE_VECTOR_SUBPARTS (vectype)))
8046 /* A loop mask for data type X can be reused for data type Y
8047 if X has N times more elements than Y and if Y's elements
8048 are N times bigger than X's. In this case each sequence
8049 of N elements in the loop mask will be all-zero or all-one.
8050 We can then view-convert the mask so that each sequence of
8051 N elements is replaced by a single element. */
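/* For example, a mask computed for a vector of 16 bytes can control a
   vector of 8 halfwords: each consecutive pair of mask elements is
   all-zero or all-one, so the view-convert collapses each pair into a
   single wider element. */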
8052 gcc_assert (multiple_p (TYPE_VECTOR_SUBPARTS (mask_type),
8053 TYPE_VECTOR_SUBPARTS (vectype)));
8054 gimple_seq seq = NULL;
8055 mask_type = build_same_sized_truth_vector_type (vectype);
8056 mask = gimple_build (&seq, VIEW_CONVERT_EXPR, mask_type, mask);
8057 if (seq)
8058 gsi_insert_seq_before (gsi, seq, GSI_SAME_STMT);
8060 return mask;
8063 /* Scale profiling counters by estimation for LOOP which is vectorized
8064 by factor VF. */
8066 static void
8067 scale_profile_for_vect_loop (struct loop *loop, unsigned vf)
8069 edge preheader = loop_preheader_edge (loop);
8070 /* Reduce loop iterations by the vectorization factor. */
8071 gcov_type new_est_niter = niter_for_unrolled_loop (loop, vf);
8072 profile_count freq_h = loop->header->count, freq_e = preheader->count ();
8074 if (freq_h.nonzero_p ())
8076 profile_probability p;
8078 /* Avoid dropping loop body profile counter to 0 because of zero count
8079 in loop's preheader. */
8080 if (!(freq_e == profile_count::zero ()))
8081 freq_e = freq_e.force_nonzero ();
8082 p = freq_e.apply_scale (new_est_niter + 1, 1).probability_in (freq_h);
8083 scale_loop_frequencies (loop, p);
8086 edge exit_e = single_exit (loop);
8087 exit_e->probability = profile_probability::always ()
8088 .apply_scale (1, new_est_niter + 1);
8090 edge exit_l = single_pred_edge (loop->latch);
8091 profile_probability prob = exit_l->probability;
8092 exit_l->probability = exit_e->probability.invert ();
8093 if (prob.initialized_p () && exit_l->probability.initialized_p ())
8094 scale_bbs_frequencies (&loop->latch, 1, exit_l->probability / prob);
8097 /* Vectorize STMT_INFO if relevant, inserting any new instructions before GSI.
8098 When vectorizing STMT_INFO as a store, set *SEEN_STORE to its
8099 stmt_vec_info. */
8101 static void
8102 vect_transform_loop_stmt (loop_vec_info loop_vinfo, stmt_vec_info stmt_info,
8103 gimple_stmt_iterator *gsi, stmt_vec_info *seen_store)
8105 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
8106 poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
8108 if (dump_enabled_p ())
8109 dump_printf_loc (MSG_NOTE, vect_location,
8110 "------>vectorizing statement: %G", stmt_info->stmt);
8112 if (MAY_HAVE_DEBUG_BIND_STMTS && !STMT_VINFO_LIVE_P (stmt_info))
8113 vect_loop_kill_debug_uses (loop, stmt_info);
8115 if (!STMT_VINFO_RELEVANT_P (stmt_info)
8116 && !STMT_VINFO_LIVE_P (stmt_info))
8117 return;
8119 if (STMT_VINFO_VECTYPE (stmt_info))
8121 poly_uint64 nunits
8122 = TYPE_VECTOR_SUBPARTS (STMT_VINFO_VECTYPE (stmt_info));
8123 if (!STMT_SLP_TYPE (stmt_info)
8124 && maybe_ne (nunits, vf)
8125 && dump_enabled_p ())
8126 /* For SLP, VF is set according to the unrolling factor, not to the
8127 vector size, hence this note is not valid for SLP. */
8128 dump_printf_loc (MSG_NOTE, vect_location, "multiple-types.\n");
8131 /* Pure SLP statements have already been vectorized. We still need
8132 to apply loop vectorization to hybrid SLP statements. */
8133 if (PURE_SLP_STMT (stmt_info))
8134 return;
8136 if (dump_enabled_p ())
8137 dump_printf_loc (MSG_NOTE, vect_location, "transform statement.\n");
8139 if (vect_transform_stmt (stmt_info, gsi, NULL, NULL))
8140 *seen_store = stmt_info;
8143 /* Function vect_transform_loop.
8145 The analysis phase has determined that the loop is vectorizable.
8146 Vectorize the loop - create vectorized stmts to replace the scalar
8147 stmts in the loop, and update the loop exit condition.
8148 Returns the scalar epilogue loop, if any. */
8150 struct loop *
8151 vect_transform_loop (loop_vec_info loop_vinfo)
8153 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
8154 struct loop *epilogue = NULL;
8155 basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
8156 int nbbs = loop->num_nodes;
8157 int i;
8158 tree niters_vector = NULL_TREE;
8159 tree step_vector = NULL_TREE;
8160 tree niters_vector_mult_vf = NULL_TREE;
8161 poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
8162 unsigned int lowest_vf = constant_lower_bound (vf);
8163 gimple *stmt;
8164 bool check_profitability = false;
8165 unsigned int th;
8167 DUMP_VECT_SCOPE ("vec_transform_loop");
8169 loop_vinfo->shared->check_datarefs ();
8171 /* Use the more conservative vectorization threshold. If the number
8172 of iterations is constant assume the cost check has been performed
8173 by our caller. If the threshold makes all loops profitable that
8174 run at least the (estimated) vectorization factor number of times
8175 checking is pointless, too. */
8176 th = LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo);
8177 if (th >= vect_vf_for_cost (loop_vinfo)
8178 && !LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
8180 if (dump_enabled_p ())
8181 dump_printf_loc (MSG_NOTE, vect_location,
8182 "Profitability threshold is %d loop iterations.\n",
8183 th);
8184 check_profitability = true;
8187 /* Make sure there exists a single-predecessor exit bb. Do this before
8188 versioning. */
8189 edge e = single_exit (loop);
8190 if (! single_pred_p (e->dest))
8192 split_loop_exit_edge (e, true);
8193 if (dump_enabled_p ())
8194 dump_printf (MSG_NOTE, "split exit edge\n");
8197 /* Version the loop first, if required, so the profitability check
8198 comes first. */
8200 if (LOOP_REQUIRES_VERSIONING (loop_vinfo))
8202 poly_uint64 versioning_threshold
8203 = LOOP_VINFO_VERSIONING_THRESHOLD (loop_vinfo);
8204 if (check_profitability
8205 && ordered_p (poly_uint64 (th), versioning_threshold))
8207 versioning_threshold = ordered_max (poly_uint64 (th),
8208 versioning_threshold);
8209 check_profitability = false;
8211 vect_loop_versioning (loop_vinfo, th, check_profitability,
8212 versioning_threshold);
8213 check_profitability = false;
8216 /* Make sure there exists a single-predecessor exit bb also on the
8217 scalar loop copy. Do this after versioning but before peeling
8218 so the CFG structure is fine for both the scalar and the if-converted
8219 loop and slpeel_duplicate_current_defs_from_edges sees matched
8220 loop-closed PHI nodes on the exit. */
8221 if (LOOP_VINFO_SCALAR_LOOP (loop_vinfo))
8223 e = single_exit (LOOP_VINFO_SCALAR_LOOP (loop_vinfo));
8224 if (! single_pred_p (e->dest))
8226 split_loop_exit_edge (e, true);
8227 if (dump_enabled_p ())
8228 dump_printf (MSG_NOTE, "split exit edge of scalar loop\n");
8232 tree niters = vect_build_loop_niters (loop_vinfo);
8233 LOOP_VINFO_NITERS_UNCHANGED (loop_vinfo) = niters;
8234 tree nitersm1 = unshare_expr (LOOP_VINFO_NITERSM1 (loop_vinfo));
8235 bool niters_no_overflow = loop_niters_no_overflow (loop_vinfo);
8236 epilogue = vect_do_peeling (loop_vinfo, niters, nitersm1, &niters_vector,
8237 &step_vector, &niters_vector_mult_vf, th,
8238 check_profitability, niters_no_overflow);
8240 if (niters_vector == NULL_TREE)
8242 if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
8243 && !LOOP_VINFO_FULLY_MASKED_P (loop_vinfo)
8244 && known_eq (lowest_vf, vf))
8246 niters_vector
8247 = build_int_cst (TREE_TYPE (LOOP_VINFO_NITERS (loop_vinfo)),
8248 LOOP_VINFO_INT_NITERS (loop_vinfo) / lowest_vf);
8249 step_vector = build_one_cst (TREE_TYPE (niters));
8251 else
8252 vect_gen_vector_loop_niters (loop_vinfo, niters, &niters_vector,
8253 &step_vector, niters_no_overflow);
8256 /* 1) Make sure the loop header has exactly two entries
8257 2) Make sure we have a preheader basic block. */
8259 gcc_assert (EDGE_COUNT (loop->header->preds) == 2);
8261 split_edge (loop_preheader_edge (loop));
8263 if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo)
8264 && vect_use_loop_mask_for_alignment_p (loop_vinfo))
8265 /* This will deal with any possible peeling. */
8266 vect_prepare_for_masked_peels (loop_vinfo);
8268 /* Schedule the SLP instances first, then handle loop vectorization
8269 below. */
8270 if (!loop_vinfo->slp_instances.is_empty ())
8272 DUMP_VECT_SCOPE ("scheduling SLP instances");
8273 vect_schedule_slp (loop_vinfo);
8276 /* FORNOW: the vectorizer supports only loops whose body consists
8277 of one basic block (header + empty latch). When the vectorizer
8278 supports more involved loop forms, the order in which the BBs are
8279 traversed will need to be reconsidered. */
8281 for (i = 0; i < nbbs; i++)
8283 basic_block bb = bbs[i];
8284 stmt_vec_info stmt_info;
8286 for (gphi_iterator si = gsi_start_phis (bb); !gsi_end_p (si);
8287 gsi_next (&si))
8289 gphi *phi = si.phi ();
8290 if (dump_enabled_p ())
8291 dump_printf_loc (MSG_NOTE, vect_location,
8292 "------>vectorizing phi: %G", phi);
8293 stmt_info = loop_vinfo->lookup_stmt (phi);
8294 if (!stmt_info)
8295 continue;
8297 if (MAY_HAVE_DEBUG_BIND_STMTS && !STMT_VINFO_LIVE_P (stmt_info))
8298 vect_loop_kill_debug_uses (loop, stmt_info);
8300 if (!STMT_VINFO_RELEVANT_P (stmt_info)
8301 && !STMT_VINFO_LIVE_P (stmt_info))
8302 continue;
8304 if (STMT_VINFO_VECTYPE (stmt_info)
8305 && (maybe_ne
8306 (TYPE_VECTOR_SUBPARTS (STMT_VINFO_VECTYPE (stmt_info)), vf))
8307 && dump_enabled_p ())
8308 dump_printf_loc (MSG_NOTE, vect_location, "multiple-types.\n");
8310 if ((STMT_VINFO_DEF_TYPE (stmt_info) == vect_induction_def
8311 || STMT_VINFO_DEF_TYPE (stmt_info) == vect_reduction_def
8312 || STMT_VINFO_DEF_TYPE (stmt_info) == vect_nested_cycle)
8313 && ! PURE_SLP_STMT (stmt_info))
8315 if (dump_enabled_p ())
8316 dump_printf_loc (MSG_NOTE, vect_location, "transform phi.\n");
8317 vect_transform_stmt (stmt_info, NULL, NULL, NULL);
8321 for (gimple_stmt_iterator si = gsi_start_bb (bb);
8322 !gsi_end_p (si);)
8324 stmt = gsi_stmt (si);
8325 /* During vectorization remove existing clobber stmts. */
8326 if (gimple_clobber_p (stmt))
8328 unlink_stmt_vdef (stmt);
8329 gsi_remove (&si, true);
8330 release_defs (stmt);
8332 else
8334 stmt_info = loop_vinfo->lookup_stmt (stmt);
8336 /* vector stmts created in the outer-loop during vectorization of
8337 stmts in an inner-loop may not have a stmt_info, and do not
8338 need to be vectorized. */
8339 stmt_vec_info seen_store = NULL;
8340 if (stmt_info)
8342 if (STMT_VINFO_IN_PATTERN_P (stmt_info))
8344 gimple *def_seq = STMT_VINFO_PATTERN_DEF_SEQ (stmt_info);
8345 for (gimple_stmt_iterator subsi = gsi_start (def_seq);
8346 !gsi_end_p (subsi); gsi_next (&subsi))
8348 stmt_vec_info pat_stmt_info
8349 = loop_vinfo->lookup_stmt (gsi_stmt (subsi));
8350 vect_transform_loop_stmt (loop_vinfo, pat_stmt_info,
8351 &si, &seen_store);
8353 stmt_vec_info pat_stmt_info
8354 = STMT_VINFO_RELATED_STMT (stmt_info);
8355 vect_transform_loop_stmt (loop_vinfo, pat_stmt_info, &si,
8356 &seen_store);
8358 vect_transform_loop_stmt (loop_vinfo, stmt_info, &si,
8359 &seen_store);
8361 gsi_next (&si);
8362 if (seen_store)
8364 if (STMT_VINFO_GROUPED_ACCESS (seen_store))
8365 /* Interleaving. The vectorization of the
8366 interleaving chain was completed - free
8367 all the stores in the chain. */
8368 vect_remove_stores (DR_GROUP_FIRST_ELEMENT (seen_store));
8369 else
8370 /* Free the attached stmt_vec_info and remove the stmt. */
8371 loop_vinfo->remove_stmt (stmt_info);
8376 /* Stub out scalar statements that must not survive vectorization.
8377 Doing this here helps with grouped statements, or statements that
8378 are involved in patterns. */
8379 for (gimple_stmt_iterator gsi = gsi_start_bb (bb);
8380 !gsi_end_p (gsi); gsi_next (&gsi))
8382 gcall *call = dyn_cast <gcall *> (gsi_stmt (gsi));
8383 if (call && gimple_call_internal_p (call, IFN_MASK_LOAD))
8385 tree lhs = gimple_get_lhs (call);
8386 if (!VECTOR_TYPE_P (TREE_TYPE (lhs)))
8388 tree zero = build_zero_cst (TREE_TYPE (lhs));
8389 gimple *new_stmt = gimple_build_assign (lhs, zero);
8390 gsi_replace (&gsi, new_stmt, true);
8394 } /* BBs in loop */
8396 /* The vectorization factor is always > 1, so if we use an IV increment of 1,
8397 a zero NITERS becomes a nonzero NITERS_VECTOR. */
8398 if (integer_onep (step_vector))
8399 niters_no_overflow = true;
8400 vect_set_loop_condition (loop, loop_vinfo, niters_vector, step_vector,
8401 niters_vector_mult_vf, !niters_no_overflow);
8403 unsigned int assumed_vf = vect_vf_for_cost (loop_vinfo);
8404 scale_profile_for_vect_loop (loop, assumed_vf);
8406 /* True if the final iteration might not handle a full vector's
8407 worth of scalar iterations. */
8408 bool final_iter_may_be_partial = LOOP_VINFO_FULLY_MASKED_P (loop_vinfo);
8409 /* The minimum number of iterations performed by the epilogue. This
8410 is 1 when peeling for gaps because we always need a final scalar
8411 iteration. */
8412 int min_epilogue_iters = LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo) ? 1 : 0;
8413 /* +1 to convert latch counts to loop iteration counts,
8414 -min_epilogue_iters to remove iterations that cannot be performed
8415 by the vector code. */
8416 int bias_for_lowest = 1 - min_epilogue_iters;
8417 int bias_for_assumed = bias_for_lowest;
8418 int alignment_npeels = LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo);
8419 if (alignment_npeels && LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
8421 /* When the amount of peeling is known at compile time, the first
8422 iteration will have exactly alignment_npeels active elements.
8423 In the worst case it will have at least one. */
8424 int min_first_active = (alignment_npeels > 0 ? alignment_npeels : 1);
8425 bias_for_lowest += lowest_vf - min_first_active;
8426 bias_for_assumed += assumed_vf - min_first_active;
8428 /* In these calculations the "- 1" converts loop iteration counts
8429 back to latch counts. */
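/* For example, with LOWEST_VF == 4, no peeling for gaps and no full
   masking, a latch-count upper bound of 11 (i.e. 12 iterations) becomes
   floor ((11 + 1) / 4) - 1 == 2 latch iterations of the vector loop. */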
8430 if (loop->any_upper_bound)
8431 loop->nb_iterations_upper_bound
8432 = (final_iter_may_be_partial
8433 ? wi::udiv_ceil (loop->nb_iterations_upper_bound + bias_for_lowest,
8434 lowest_vf) - 1
8435 : wi::udiv_floor (loop->nb_iterations_upper_bound + bias_for_lowest,
8436 lowest_vf) - 1);
8437 if (loop->any_likely_upper_bound)
8438 loop->nb_iterations_likely_upper_bound
8439 = (final_iter_may_be_partial
8440 ? wi::udiv_ceil (loop->nb_iterations_likely_upper_bound
8441 + bias_for_lowest, lowest_vf) - 1
8442 : wi::udiv_floor (loop->nb_iterations_likely_upper_bound
8443 + bias_for_lowest, lowest_vf) - 1);
8444 if (loop->any_estimate)
8445 loop->nb_iterations_estimate
8446 = (final_iter_may_be_partial
8447 ? wi::udiv_ceil (loop->nb_iterations_estimate + bias_for_assumed,
8448 assumed_vf) - 1
8449 : wi::udiv_floor (loop->nb_iterations_estimate + bias_for_assumed,
8450 assumed_vf) - 1);
8452 if (dump_enabled_p ())
8454 if (!LOOP_VINFO_EPILOGUE_P (loop_vinfo))
8456 dump_printf_loc (MSG_NOTE, vect_location,
8457 "LOOP VECTORIZED\n");
8458 if (loop->inner)
8459 dump_printf_loc (MSG_NOTE, vect_location,
8460 "OUTER LOOP VECTORIZED\n");
8461 dump_printf (MSG_NOTE, "\n");
8463 else
8465 dump_printf_loc (MSG_NOTE, vect_location,
8466 "LOOP EPILOGUE VECTORIZED (VS=");
8467 dump_dec (MSG_NOTE, current_vector_size);
8468 dump_printf (MSG_NOTE, ")\n");
8472 /* Loops vectorized with a variable factor won't benefit from
8473 unrolling/peeling. */
8474 if (!vf.is_constant ())
8476 loop->unroll = 1;
8477 if (dump_enabled_p ())
8478 dump_printf_loc (MSG_NOTE, vect_location, "Disabling unrolling due to"
8479 " variable-length vectorization factor\n");
8481 /* Free SLP instances here because otherwise stmt reference counting
8482 won't work. */
8483 slp_instance instance;
8484 FOR_EACH_VEC_ELT (LOOP_VINFO_SLP_INSTANCES (loop_vinfo), i, instance)
8485 vect_free_slp_instance (instance, true);
8486 LOOP_VINFO_SLP_INSTANCES (loop_vinfo).release ();
8487 /* Clear the safelen field since its value is invalid after vectorization:
8488 the vectorized loop can have loop-carried dependencies. */
8489 loop->safelen = 0;
8491 /* Don't vectorize epilogue for epilogue. */
8492 if (LOOP_VINFO_EPILOGUE_P (loop_vinfo))
8493 epilogue = NULL;
8495 if (!PARAM_VALUE (PARAM_VECT_EPILOGUES_NOMASK))
8496 epilogue = NULL;
8498 if (epilogue)
8500 auto_vector_sizes vector_sizes;
8501 targetm.vectorize.autovectorize_vector_sizes (&vector_sizes);
8502 unsigned int next_size = 0;
8504 if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
8505 && LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) >= 0
8506 && known_eq (vf, lowest_vf))
8508 unsigned int eiters
8509 = (LOOP_VINFO_INT_NITERS (loop_vinfo)
8510 - LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo)
8511 - LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo));
8512 eiters
8513 = eiters % lowest_vf + LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo);
8514 epilogue->nb_iterations_upper_bound = eiters - 1;
8515 epilogue->any_upper_bound = true;
8517 unsigned int ratio;
8518 while (next_size < vector_sizes.length ()
8519 && !(constant_multiple_p (current_vector_size,
8520 vector_sizes[next_size], &ratio)
8521 && eiters >= lowest_vf / ratio))
8522 next_size += 1;
8524 else
8525 while (next_size < vector_sizes.length ()
8526 && maybe_lt (current_vector_size, vector_sizes[next_size]))
8527 next_size += 1;
8529 if (next_size == vector_sizes.length ())
8530 epilogue = NULL;
8533 if (epilogue)
8535 epilogue->force_vectorize = loop->force_vectorize;
8536 epilogue->safelen = loop->safelen;
8537 epilogue->dont_vectorize = false;
8539 /* We may need to if-convert epilogue to vectorize it. */
8540 if (LOOP_VINFO_SCALAR_LOOP (loop_vinfo))
8541 tree_if_conversion (epilogue);
8544 return epilogue;
8547 /* The code below tries to perform a simple optimization - revert
8548 if-conversion for masked stores, i.e. if the mask of a store is zero,
8549 do not perform the store, and if possible also skip the statements producing the stored value.
8550 For example,
8551 for (i=0; i<n; i++)
8552 if (c[i])
8554 p1[i] += 1;
8555 p2[i] = p3[i] +2;
8557 this transformation will produce the following semi-hammock:
8559 if (!mask__ifc__42.18_165 == { 0, 0, 0, 0, 0, 0, 0, 0 })
8561 vect__11.19_170 = MASK_LOAD (vectp_p1.20_168, 0B, mask__ifc__42.18_165);
8562 vect__12.22_172 = vect__11.19_170 + vect_cst__171;
8563 MASK_STORE (vectp_p1.23_175, 0B, mask__ifc__42.18_165, vect__12.22_172);
8564 vect__18.25_182 = MASK_LOAD (vectp_p3.26_180, 0B, mask__ifc__42.18_165);
8565 vect__19.28_184 = vect__18.25_182 + vect_cst__183;
8566 MASK_STORE (vectp_p2.29_187, 0B, mask__ifc__42.18_165, vect__19.28_184);
8570 void
8571 optimize_mask_stores (struct loop *loop)
8573 basic_block *bbs = get_loop_body (loop);
8574 unsigned nbbs = loop->num_nodes;
8575 unsigned i;
8576 basic_block bb;
8577 struct loop *bb_loop;
8578 gimple_stmt_iterator gsi;
8579 gimple *stmt;
8580 auto_vec<gimple *> worklist;
8582 vect_location = find_loop_location (loop);
8583 /* Pick up all masked stores in loop if any. */
8584 for (i = 0; i < nbbs; i++)
8586 bb = bbs[i];
8587 for (gsi = gsi_start_bb (bb); !gsi_end_p (gsi);
8588 gsi_next (&gsi))
8590 stmt = gsi_stmt (gsi);
8591 if (gimple_call_internal_p (stmt, IFN_MASK_STORE))
8592 worklist.safe_push (stmt);
8596 free (bbs);
8597 if (worklist.is_empty ())
8598 return;
8600 /* Loop has masked stores. */
8601 while (!worklist.is_empty ())
8603 gimple *last, *last_store;
8604 edge e, efalse;
8605 tree mask;
8606 basic_block store_bb, join_bb;
8607 gimple_stmt_iterator gsi_to;
8608 tree vdef, new_vdef;
8609 gphi *phi;
8610 tree vectype;
8611 tree zero;
8613 last = worklist.pop ();
8614 mask = gimple_call_arg (last, 2);
8615 bb = gimple_bb (last);
8616 /* Create then_bb and the if-then structure in the CFG; then_bb belongs to
8617 the same loop as if_bb. It can differ from LOOP when a two-level
8618 loop nest is vectorized and the mask_store belongs to the inner
8619 loop. */
8620 e = split_block (bb, last);
8621 bb_loop = bb->loop_father;
8622 gcc_assert (loop == bb_loop || flow_loop_nested_p (loop, bb_loop));
8623 join_bb = e->dest;
8624 store_bb = create_empty_bb (bb);
8625 add_bb_to_loop (store_bb, bb_loop);
8626 e->flags = EDGE_TRUE_VALUE;
8627 efalse = make_edge (bb, store_bb, EDGE_FALSE_VALUE);
8628 /* Put STORE_BB to likely part. */
8629 efalse->probability = profile_probability::unlikely ();
8630 store_bb->count = efalse->count ();
8631 make_single_succ_edge (store_bb, join_bb, EDGE_FALLTHRU);
8632 if (dom_info_available_p (CDI_DOMINATORS))
8633 set_immediate_dominator (CDI_DOMINATORS, store_bb, bb);
8634 if (dump_enabled_p ())
8635 dump_printf_loc (MSG_NOTE, vect_location,
8636 "Create new block %d to sink mask stores.",
8637 store_bb->index);
8638 /* Create vector comparison with boolean result. */
8639 vectype = TREE_TYPE (mask);
8640 zero = build_zero_cst (vectype);
8641 stmt = gimple_build_cond (EQ_EXPR, mask, zero, NULL_TREE, NULL_TREE);
8642 gsi = gsi_last_bb (bb);
8643 gsi_insert_after (&gsi, stmt, GSI_SAME_STMT);
8644 /* Create new PHI node for vdef of the last masked store:
8645 .MEM_2 = VDEF <.MEM_1>
8646 will be converted to
8647 .MEM.3 = VDEF <.MEM_1>
8648 and new PHI node will be created in join bb
8649 .MEM_2 = PHI <.MEM_1, .MEM_3>
8651 vdef = gimple_vdef (last);
8652 new_vdef = make_ssa_name (gimple_vop (cfun), last);
8653 gimple_set_vdef (last, new_vdef);
8654 phi = create_phi_node (vdef, join_bb);
8655 add_phi_arg (phi, new_vdef, EDGE_SUCC (store_bb, 0), UNKNOWN_LOCATION);
8657 /* Put all masked stores with the same mask to STORE_BB if possible. */
8658 while (true)
8660 gimple_stmt_iterator gsi_from;
8661 gimple *stmt1 = NULL;
8663 /* Move masked store to STORE_BB. */
8664 last_store = last;
8665 gsi = gsi_for_stmt (last);
8666 gsi_from = gsi;
8667 /* Shift GSI to the previous stmt for further traversal. */
8668 gsi_prev (&gsi);
8669 gsi_to = gsi_start_bb (store_bb);
8670 gsi_move_before (&gsi_from, &gsi_to);
8671 /* Set GSI_TO to the start of the now non-empty STORE_BB. */
8672 gsi_to = gsi_start_bb (store_bb);
8673 if (dump_enabled_p ())
8674 dump_printf_loc (MSG_NOTE, vect_location,
8675 "Move stmt to created bb\n%G", last);
8676 /* Move all stored value producers if possible. */
8677 while (!gsi_end_p (gsi))
8679 tree lhs;
8680 imm_use_iterator imm_iter;
8681 use_operand_p use_p;
8682 bool res;
8684 /* Skip debug statements. */
8685 if (is_gimple_debug (gsi_stmt (gsi)))
8687 gsi_prev (&gsi);
8688 continue;
8690 stmt1 = gsi_stmt (gsi);
8691 /* Do not consider statements writing to memory or having
8692 a volatile operand. */
8693 if (gimple_vdef (stmt1)
8694 || gimple_has_volatile_ops (stmt1))
8695 break;
8696 gsi_from = gsi;
8697 gsi_prev (&gsi);
8698 lhs = gimple_get_lhs (stmt1);
8699 if (!lhs)
8700 break;
8702 /* LHS of vectorized stmt must be SSA_NAME. */
8703 if (TREE_CODE (lhs) != SSA_NAME)
8704 break;
8706 if (!VECTOR_TYPE_P (TREE_TYPE (lhs)))
8708 /* Remove dead scalar statement. */
8709 if (has_zero_uses (lhs))
8711 gsi_remove (&gsi_from, true);
8712 continue;
8716 /* Check that LHS does not have uses outside of STORE_BB. */
8717 res = true;
8718 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, lhs)
8720 gimple *use_stmt;
8721 use_stmt = USE_STMT (use_p);
8722 if (is_gimple_debug (use_stmt))
8723 continue;
8724 if (gimple_bb (use_stmt) != store_bb)
8726 res = false;
8727 break;
8730 if (!res)
8731 break;
8733 if (gimple_vuse (stmt1)
8734 && gimple_vuse (stmt1) != gimple_vuse (last_store))
8735 break;
8737 /* Can move STMT1 to STORE_BB. */
8738 if (dump_enabled_p ())
8739 dump_printf_loc (MSG_NOTE, vect_location,
8740 "Move stmt to created bb\n%G", stmt1);
8741 gsi_move_before (&gsi_from, &gsi_to);
8742 /* Shift GSI_TO for further insertion. */
8743 gsi_prev (&gsi_to);
8745 /* Put other masked stores with the same mask to STORE_BB. */
8746 if (worklist.is_empty ()
8747 || gimple_call_arg (worklist.last (), 2) != mask
8748 || worklist.last () != stmt1)
8749 break;
8750 last = worklist.pop ();
8752 add_phi_arg (phi, gimple_vuse (last_store), e, UNKNOWN_LOCATION);