[42/46] Add vec_info::replace_stmt
[official-gcc.git] / gcc / tree-vect-loop-manip.c
blob dd7463a9c242d3d0bd0ef6ebb722f5ea6715143e
1 /* Vectorizer Specific Loop Manipulations
2 Copyright (C) 2003-2018 Free Software Foundation, Inc.
3 Contributed by Dorit Naishlos <dorit@il.ibm.com>
4 and Ira Rosen <irar@il.ibm.com>
6 This file is part of GCC.
8 GCC is free software; you can redistribute it and/or modify it under
9 the terms of the GNU General Public License as published by the Free
10 Software Foundation; either version 3, or (at your option) any later
11 version.
13 GCC is distributed in the hope that it will be useful, but WITHOUT ANY
14 WARRANTY; without even the implied warranty of MERCHANTABILITY or
15 FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
16 for more details.
18 You should have received a copy of the GNU General Public License
19 along with GCC; see the file COPYING3. If not see
20 <http://www.gnu.org/licenses/>. */
22 #include "config.h"
23 #include "system.h"
24 #include "coretypes.h"
25 #include "backend.h"
26 #include "tree.h"
27 #include "gimple.h"
28 #include "cfghooks.h"
29 #include "tree-pass.h"
30 #include "ssa.h"
31 #include "fold-const.h"
32 #include "cfganal.h"
33 #include "gimplify.h"
34 #include "gimple-iterator.h"
35 #include "gimplify-me.h"
36 #include "tree-cfg.h"
37 #include "tree-ssa-loop-manip.h"
38 #include "tree-into-ssa.h"
39 #include "tree-ssa.h"
40 #include "cfgloop.h"
41 #include "tree-scalar-evolution.h"
42 #include "tree-vectorizer.h"
43 #include "tree-ssa-loop-ivopts.h"
44 #include "gimple-fold.h"
45 #include "tree-ssa-loop-niter.h"
46 #include "internal-fn.h"
47 #include "stor-layout.h"
48 #include "optabs-query.h"
49 #include "vec-perm-indices.h"
51 /*************************************************************************
52 Simple Loop Peeling Utilities
54 Utilities to support loop peeling for vectorization purposes.
55 *************************************************************************/
58 /* Renames the use *OP_P. */
60 static void
61 rename_use_op (use_operand_p op_p)
63 tree new_name;
65 if (TREE_CODE (USE_FROM_PTR (op_p)) != SSA_NAME)
66 return;
68 new_name = get_current_def (USE_FROM_PTR (op_p));
70 /* Something defined outside of the loop. */
71 if (!new_name)
72 return;
74 /* An ordinary ssa name defined in the loop. */
76 SET_USE (op_p, new_name);
80 /* Renames the variables in basic block BB. Allow renaming of PHI arguments
81 on edges incoming from outer-block header if RENAME_FROM_OUTER_LOOP is
82 true. */
84 static void
85 rename_variables_in_bb (basic_block bb, bool rename_from_outer_loop)
87 gimple *stmt;
88 use_operand_p use_p;
89 ssa_op_iter iter;
90 edge e;
91 edge_iterator ei;
92 struct loop *loop = bb->loop_father;
93 struct loop *outer_loop = NULL;
95 if (rename_from_outer_loop)
97 gcc_assert (loop);
98 outer_loop = loop_outer (loop);
101 for (gimple_stmt_iterator gsi = gsi_start_bb (bb); !gsi_end_p (gsi);
102 gsi_next (&gsi))
104 stmt = gsi_stmt (gsi);
105 FOR_EACH_SSA_USE_OPERAND (use_p, stmt, iter, SSA_OP_ALL_USES)
106 rename_use_op (use_p);
109 FOR_EACH_EDGE (e, ei, bb->preds)
111 if (!flow_bb_inside_loop_p (loop, e->src))
113 if (!rename_from_outer_loop)
114 continue;
115 if (e->src != outer_loop->header)
117 if (outer_loop->inner->next)
119 /* If outer_loop has 2 inner loops, allow there to
120 be an extra basic block which decides which of the
121 two loops to use using LOOP_VECTORIZED. */
122 if (!single_pred_p (e->src)
123 || single_pred (e->src) != outer_loop->header)
124 continue;
128 for (gphi_iterator gsi = gsi_start_phis (bb); !gsi_end_p (gsi);
129 gsi_next (&gsi))
130 rename_use_op (PHI_ARG_DEF_PTR_FROM_EDGE (gsi.phi (), e));
135 struct adjust_info
137 tree from, to;
138 basic_block bb;
141 /* A stack of values to be adjusted in debug stmts. We have to
142 process them LIFO, so that the closest substitution applies. If we
143 processed them FIFO, without the stack, we might substitute uses
144 with a PHI DEF that would soon become non-dominant, and when we got
145 to the suitable one, it wouldn't have anything to substitute any
146 more. */
147 static vec<adjust_info, va_heap> adjust_vec;
149 /* Adjust any debug stmts that referenced AI->from values to use the
150 loop-closed AI->to, if the references are dominated by AI->bb and
151 not by the definition of AI->from. */
153 static void
154 adjust_debug_stmts_now (adjust_info *ai)
156 basic_block bbphi = ai->bb;
157 tree orig_def = ai->from;
158 tree new_def = ai->to;
159 imm_use_iterator imm_iter;
160 gimple *stmt;
161 basic_block bbdef = gimple_bb (SSA_NAME_DEF_STMT (orig_def));
163 gcc_assert (dom_info_available_p (CDI_DOMINATORS));
165 /* Adjust any debug stmts that held onto non-loop-closed
166 references. */
167 FOR_EACH_IMM_USE_STMT (stmt, imm_iter, orig_def)
169 use_operand_p use_p;
170 basic_block bbuse;
172 if (!is_gimple_debug (stmt))
173 continue;
175 gcc_assert (gimple_debug_bind_p (stmt));
177 bbuse = gimple_bb (stmt);
179 if ((bbuse == bbphi
180 || dominated_by_p (CDI_DOMINATORS, bbuse, bbphi))
181 && !(bbuse == bbdef
182 || dominated_by_p (CDI_DOMINATORS, bbuse, bbdef)))
184 if (new_def)
185 FOR_EACH_IMM_USE_ON_STMT (use_p, imm_iter)
186 SET_USE (use_p, new_def);
187 else
189 gimple_debug_bind_reset_value (stmt);
190 update_stmt (stmt);
196 /* Adjust debug stmts as scheduled before. */
198 static void
199 adjust_vec_debug_stmts (void)
201 if (!MAY_HAVE_DEBUG_BIND_STMTS)
202 return;
204 gcc_assert (adjust_vec.exists ());
206 while (!adjust_vec.is_empty ())
208 adjust_debug_stmts_now (&adjust_vec.last ());
209 adjust_vec.pop ();
213 /* Adjust any debug stmts that referenced FROM values to use the
214 loop-closed TO, if the references are dominated by BB and not by
215 the definition of FROM. If adjust_vec is non-NULL, adjustments
216 will be postponed until adjust_vec_debug_stmts is called. */
218 static void
219 adjust_debug_stmts (tree from, tree to, basic_block bb)
221 adjust_info ai;
223 if (MAY_HAVE_DEBUG_BIND_STMTS
224 && TREE_CODE (from) == SSA_NAME
225 && ! SSA_NAME_IS_DEFAULT_DEF (from)
226 && ! virtual_operand_p (from))
228 ai.from = from;
229 ai.to = to;
230 ai.bb = bb;
232 if (adjust_vec.exists ())
233 adjust_vec.safe_push (ai);
234 else
235 adjust_debug_stmts_now (&ai);
239 /* Change E's phi arg in UPDATE_PHI to NEW_DEF, and record information
240 to adjust any debug stmts that referenced the old phi arg,
241 presumably non-loop-closed references left over from other
242 transformations. */
244 static void
245 adjust_phi_and_debug_stmts (gimple *update_phi, edge e, tree new_def)
247 tree orig_def = PHI_ARG_DEF_FROM_EDGE (update_phi, e);
249 SET_PHI_ARG_DEF (update_phi, e->dest_idx, new_def);
251 if (MAY_HAVE_DEBUG_BIND_STMTS)
252 adjust_debug_stmts (orig_def, PHI_RESULT (update_phi),
253 gimple_bb (update_phi));
256 /* Define one loop mask MASK from loop LOOP. INIT_MASK is the value that
257 the mask should have during the first iteration and NEXT_MASK is the
258 value that it should have on subsequent iterations. */
260 static void
261 vect_set_loop_mask (struct loop *loop, tree mask, tree init_mask,
262 tree next_mask)
264 gphi *phi = create_phi_node (mask, loop->header);
265 add_phi_arg (phi, init_mask, loop_preheader_edge (loop), UNKNOWN_LOCATION);
266 add_phi_arg (phi, next_mask, loop_latch_edge (loop), UNKNOWN_LOCATION);
269 /* Add SEQ to the end of LOOP's preheader block. */
271 static void
272 add_preheader_seq (struct loop *loop, gimple_seq seq)
274 if (seq)
276 edge pe = loop_preheader_edge (loop);
277 basic_block new_bb = gsi_insert_seq_on_edge_immediate (pe, seq);
278 gcc_assert (!new_bb);
282 /* Add SEQ to the beginning of LOOP's header block. */
284 static void
285 add_header_seq (struct loop *loop, gimple_seq seq)
287 if (seq)
289 gimple_stmt_iterator gsi = gsi_after_labels (loop->header);
290 gsi_insert_seq_before (&gsi, seq, GSI_SAME_STMT);
294 /* Return true if the target can interleave elements of two vectors.
295 OFFSET is 0 if the first half of the vectors should be interleaved
296 or 1 if the second half should. When returning true, store the
297 associated permutation in INDICES. */
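/* For example, with 8-element vectors, OFFSET == 0 encodes the low
   interleave { 0, 8, 1, 9, 2, 10, 3, 11 } and OFFSET == 1 the high
   interleave { 4, 12, 5, 13, 6, 14, 7, 15 }; only the first three
   (BASE + I, BASE + I + NELTS) pairs are pushed below and the encoded
   series extends to the rest of the vector.  */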
299 static bool
300 interleave_supported_p (vec_perm_indices *indices, tree vectype,
301 unsigned int offset)
303 poly_uint64 nelts = TYPE_VECTOR_SUBPARTS (vectype);
304 poly_uint64 base = exact_div (nelts, 2) * offset;
305 vec_perm_builder sel (nelts, 2, 3);
306 for (unsigned int i = 0; i < 3; ++i)
308 sel.quick_push (base + i);
309 sel.quick_push (base + i + nelts);
311 indices->new_vector (sel, 2, nelts);
312 return can_vec_perm_const_p (TYPE_MODE (vectype), *indices);
315 /* Try to use permutes to define the masks in DEST_RGM using the masks
316 in SRC_RGM, given that the former has twice as many masks as the
317 latter. Return true on success, adding any new statements to SEQ. */
319 static bool
320 vect_maybe_permute_loop_masks (gimple_seq *seq, rgroup_masks *dest_rgm,
321 rgroup_masks *src_rgm)
323 tree src_masktype = src_rgm->mask_type;
324 tree dest_masktype = dest_rgm->mask_type;
325 machine_mode src_mode = TYPE_MODE (src_masktype);
326 if (dest_rgm->max_nscalars_per_iter <= src_rgm->max_nscalars_per_iter
327 && optab_handler (vec_unpacku_hi_optab, src_mode) != CODE_FOR_nothing
328 && optab_handler (vec_unpacku_lo_optab, src_mode) != CODE_FOR_nothing)
330 /* Unpacking the source masks gives at least as many mask bits as
331 we need. We can then VIEW_CONVERT any excess bits away. */
332 tree unpack_masktype = vect_halve_mask_nunits (src_masktype);
333 for (unsigned int i = 0; i < dest_rgm->masks.length (); ++i)
335 tree src = src_rgm->masks[i / 2];
336 tree dest = dest_rgm->masks[i];
337 tree_code code = ((i & 1) == (BYTES_BIG_ENDIAN ? 0 : 1)
338 ? VEC_UNPACK_HI_EXPR
339 : VEC_UNPACK_LO_EXPR);
340 gassign *stmt;
341 if (dest_masktype == unpack_masktype)
342 stmt = gimple_build_assign (dest, code, src);
343 else
345 tree temp = make_ssa_name (unpack_masktype);
346 stmt = gimple_build_assign (temp, code, src);
347 gimple_seq_add_stmt (seq, stmt);
348 stmt = gimple_build_assign (dest, VIEW_CONVERT_EXPR,
349 build1 (VIEW_CONVERT_EXPR,
350 dest_masktype, temp));
352 gimple_seq_add_stmt (seq, stmt);
354 return true;
356 vec_perm_indices indices[2];
357 if (dest_masktype == src_masktype
358 && interleave_supported_p (&indices[0], src_masktype, 0)
359 && interleave_supported_p (&indices[1], src_masktype, 1))
361 /* The destination requires twice as many mask bits as the source, so
362 we can use interleaving permutes to double up the number of bits. */
363 tree masks[2];
364 for (unsigned int i = 0; i < 2; ++i)
365 masks[i] = vect_gen_perm_mask_checked (src_masktype, indices[i]);
366 for (unsigned int i = 0; i < dest_rgm->masks.length (); ++i)
368 tree src = src_rgm->masks[i / 2];
369 tree dest = dest_rgm->masks[i];
370 gimple *stmt = gimple_build_assign (dest, VEC_PERM_EXPR,
371 src, src, masks[i & 1]);
372 gimple_seq_add_stmt (seq, stmt);
374 return true;
376 return false;
379 /* Helper for vect_set_loop_condition_masked. Generate definitions for
380 all the masks in RGM and return a mask that is nonzero when the loop
381 needs to iterate. Add any new preheader statements to PREHEADER_SEQ.
382 Use LOOP_COND_GSI to insert code before the exit gcond.
384 RGM belongs to loop LOOP. The loop originally iterated NITERS
385 times and has been vectorized according to LOOP_VINFO. Each iteration
386 of the vectorized loop handles VF iterations of the scalar loop.
388 If NITERS_SKIP is nonnull, the first iteration of the vectorized loop
389 starts with NITERS_SKIP dummy iterations of the scalar loop before
390 the real work starts. The mask elements for these dummy iterations
391 must be 0, to ensure that the extra iterations do not have an effect.
393 It is known that:
395 NITERS * RGM->max_nscalars_per_iter
397 does not overflow. However, MIGHT_WRAP_P says whether an induction
398 variable that starts at 0 and has step:
400 VF * RGM->max_nscalars_per_iter
402 might overflow before hitting a value above:
404 (NITERS + NITERS_SKIP) * RGM->max_nscalars_per_iter
406 This means that we cannot guarantee that such an induction variable
407 would ever hit a value that produces a set of all-false masks for RGM. */
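/* As a concrete illustration (values chosen arbitrarily): for a
   single-mask rgroup with VF == 4, RGM->max_nscalars_per_iter == 1,
   NITERS == 6 and no NITERS_SKIP, the IV counts 0, 4, 8, ...; the mask
   for the first vector iteration is all-true, the mask for the second
   is WHILE_ULT (4, 6) = { 1, 1, 0, 0 }, and WHILE_ULT (8, 6) is
   all-false, so the exit test fires after the second iteration.  */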
409 static tree
410 vect_set_loop_masks_directly (struct loop *loop, loop_vec_info loop_vinfo,
411 gimple_seq *preheader_seq,
412 gimple_stmt_iterator loop_cond_gsi,
413 rgroup_masks *rgm, tree vf,
414 tree niters, tree niters_skip,
415 bool might_wrap_p)
417 tree compare_type = LOOP_VINFO_MASK_COMPARE_TYPE (loop_vinfo);
418 tree mask_type = rgm->mask_type;
419 unsigned int nscalars_per_iter = rgm->max_nscalars_per_iter;
420 poly_uint64 nscalars_per_mask = TYPE_VECTOR_SUBPARTS (mask_type);
422 /* Calculate the maximum number of scalar values that the rgroup
423 handles in total, the number that it handles for each iteration
424 of the vector loop, and the number that it should skip during the
425 first iteration of the vector loop. */
426 tree nscalars_total = niters;
427 tree nscalars_step = vf;
428 tree nscalars_skip = niters_skip;
429 if (nscalars_per_iter != 1)
431 /* We checked before choosing to use a fully-masked loop that these
432 multiplications don't overflow. */
433 tree factor = build_int_cst (compare_type, nscalars_per_iter);
434 nscalars_total = gimple_build (preheader_seq, MULT_EXPR, compare_type,
435 nscalars_total, factor);
436 nscalars_step = gimple_build (preheader_seq, MULT_EXPR, compare_type,
437 nscalars_step, factor);
438 if (nscalars_skip)
439 nscalars_skip = gimple_build (preheader_seq, MULT_EXPR, compare_type,
440 nscalars_skip, factor);
443 /* Create an induction variable that counts the number of scalars
444 processed. */
445 tree index_before_incr, index_after_incr;
446 gimple_stmt_iterator incr_gsi;
447 bool insert_after;
448 tree zero_index = build_int_cst (compare_type, 0);
449 standard_iv_increment_position (loop, &incr_gsi, &insert_after);
450 create_iv (zero_index, nscalars_step, NULL_TREE, loop, &incr_gsi,
451 insert_after, &index_before_incr, &index_after_incr);
453 tree test_index, test_limit, first_limit;
454 gimple_stmt_iterator *test_gsi;
455 if (might_wrap_p)
457 /* In principle the loop should stop iterating once the incremented
458 IV reaches a value greater than or equal to:
460 NSCALARS_TOTAL +[infinite-prec] NSCALARS_SKIP
462 However, there's no guarantee that this addition doesn't overflow
463 the comparison type, or that the IV hits a value above it before
464 wrapping around. We therefore adjust the limit down by one
465 IV step:
467 (NSCALARS_TOTAL +[infinite-prec] NSCALARS_SKIP)
468 -[infinite-prec] NSCALARS_STEP
470 and compare the IV against this limit _before_ incrementing it.
471 Since the comparison type is unsigned, we actually want the
472 subtraction to saturate at zero:
474 (NSCALARS_TOTAL +[infinite-prec] NSCALARS_SKIP)
475 -[sat] NSCALARS_STEP
477 And since NSCALARS_SKIP < NSCALARS_STEP, we can reassociate this as:
479 NSCALARS_TOTAL -[sat] (NSCALARS_STEP - NSCALARS_SKIP)
481 where the rightmost subtraction can be done directly in
482 COMPARE_TYPE. */
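/* The MAX_EXPR followed by MINUS_EXPR below implements that saturating
   subtraction: MAX (A, B) - B is A - B when A >= B and zero otherwise,
   which is safe because both operands are unsigned.  */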
483 test_index = index_before_incr;
484 tree adjust = nscalars_step;
485 if (nscalars_skip)
486 adjust = gimple_build (preheader_seq, MINUS_EXPR, compare_type,
487 adjust, nscalars_skip);
488 test_limit = gimple_build (preheader_seq, MAX_EXPR, compare_type,
489 nscalars_total, adjust);
490 test_limit = gimple_build (preheader_seq, MINUS_EXPR, compare_type,
491 test_limit, adjust);
492 test_gsi = &incr_gsi;
494 /* Get a safe limit for the first iteration. */
495 if (nscalars_skip)
497 /* The first vector iteration can handle at most NSCALARS_STEP
498 scalars. NSCALARS_STEP <= CONST_LIMIT, and adding
499 NSCALARS_SKIP to that cannot overflow. */
500 tree const_limit = build_int_cst (compare_type,
501 LOOP_VINFO_VECT_FACTOR (loop_vinfo)
502 * nscalars_per_iter);
503 first_limit = gimple_build (preheader_seq, MIN_EXPR, compare_type,
504 nscalars_total, const_limit);
505 first_limit = gimple_build (preheader_seq, PLUS_EXPR, compare_type,
506 first_limit, nscalars_skip);
508 else
509 /* For the first iteration it doesn't matter whether the IV hits
510 a value above NSCALARS_TOTAL. That only matters for the latch
511 condition. */
512 first_limit = nscalars_total;
514 else
516 /* Test the incremented IV, which will always hit a value above
517 the bound before wrapping. */
518 test_index = index_after_incr;
519 test_limit = nscalars_total;
520 if (nscalars_skip)
521 test_limit = gimple_build (preheader_seq, PLUS_EXPR, compare_type,
522 test_limit, nscalars_skip);
523 test_gsi = &loop_cond_gsi;
525 first_limit = test_limit;
528 /* Provide a definition of each mask in the group. */
529 tree next_mask = NULL_TREE;
530 tree mask;
531 unsigned int i;
532 FOR_EACH_VEC_ELT_REVERSE (rgm->masks, i, mask)
534 /* Previous masks will cover BIAS scalars. This mask covers the
535 next batch. */
536 poly_uint64 bias = nscalars_per_mask * i;
537 tree bias_tree = build_int_cst (compare_type, bias);
538 gimple *tmp_stmt;
540 /* See whether the first iteration of the vector loop is known
541 to have a full mask. */
542 poly_uint64 const_limit;
543 bool first_iteration_full
544 = (poly_int_tree_p (first_limit, &const_limit)
545 && known_ge (const_limit, (i + 1) * nscalars_per_mask));
547 /* Rather than have a new IV that starts at BIAS and goes up to
548 TEST_LIMIT, prefer to use the same 0-based IV for each mask
549 and adjust the bound down by BIAS. */
550 tree this_test_limit = test_limit;
551 if (i != 0)
553 this_test_limit = gimple_build (preheader_seq, MAX_EXPR,
554 compare_type, this_test_limit,
555 bias_tree);
556 this_test_limit = gimple_build (preheader_seq, MINUS_EXPR,
557 compare_type, this_test_limit,
558 bias_tree);
561 /* Create the initial mask. First include all scalars that
562 are within the loop limit. */
563 tree init_mask = NULL_TREE;
564 if (!first_iteration_full)
566 tree start, end;
567 if (first_limit == test_limit)
569 /* Use a natural test between zero (the initial IV value)
570 and the loop limit. The "else" block would be valid too,
571 but this choice can avoid the need to load BIAS_TREE into
572 a register. */
573 start = zero_index;
574 end = this_test_limit;
576 else
578 /* FIRST_LIMIT is the maximum number of scalars handled by the
579 first iteration of the vector loop. Test the portion
580 associated with this mask. */
581 start = bias_tree;
582 end = first_limit;
585 init_mask = make_temp_ssa_name (mask_type, NULL, "max_mask");
586 tmp_stmt = vect_gen_while (init_mask, start, end);
587 gimple_seq_add_stmt (preheader_seq, tmp_stmt);
590 /* Now AND out the bits that are within the number of skipped
591 scalars. */
592 poly_uint64 const_skip;
593 if (nscalars_skip
594 && !(poly_int_tree_p (nscalars_skip, &const_skip)
595 && known_le (const_skip, bias)))
597 tree unskipped_mask = vect_gen_while_not (preheader_seq, mask_type,
598 bias_tree, nscalars_skip);
599 if (init_mask)
600 init_mask = gimple_build (preheader_seq, BIT_AND_EXPR, mask_type,
601 init_mask, unskipped_mask);
602 else
603 init_mask = unskipped_mask;
606 if (!init_mask)
607 /* First iteration is full. */
608 init_mask = build_minus_one_cst (mask_type);
610 /* Get the mask value for the next iteration of the loop. */
611 next_mask = make_temp_ssa_name (mask_type, NULL, "next_mask");
612 gcall *call = vect_gen_while (next_mask, test_index, this_test_limit);
613 gsi_insert_before (test_gsi, call, GSI_SAME_STMT);
615 vect_set_loop_mask (loop, mask, init_mask, next_mask);
617 return next_mask;
620 /* Make LOOP iterate NITERS times using masking and WHILE_ULT calls.
621 LOOP_VINFO describes the vectorization of LOOP. NITERS is the
622 number of iterations of the original scalar loop that should be
623 handled by the vector loop. NITERS_MAYBE_ZERO and FINAL_IV are
624 as for vect_set_loop_condition.
626 Insert the branch-back condition before LOOP_COND_GSI and return the
627 final gcond. */
629 static gcond *
630 vect_set_loop_condition_masked (struct loop *loop, loop_vec_info loop_vinfo,
631 tree niters, tree final_iv,
632 bool niters_maybe_zero,
633 gimple_stmt_iterator loop_cond_gsi)
635 gimple_seq preheader_seq = NULL;
636 gimple_seq header_seq = NULL;
638 tree compare_type = LOOP_VINFO_MASK_COMPARE_TYPE (loop_vinfo);
639 unsigned int compare_precision = TYPE_PRECISION (compare_type);
640 unsigned HOST_WIDE_INT max_vf = vect_max_vf (loop_vinfo);
641 tree orig_niters = niters;
643 /* Type of the initial value of NITERS. */
644 tree ni_actual_type = TREE_TYPE (niters);
645 unsigned int ni_actual_precision = TYPE_PRECISION (ni_actual_type);
647 /* Convert NITERS to the same size as the compare. */
648 if (compare_precision > ni_actual_precision
649 && niters_maybe_zero)
651 /* We know that there is always at least one iteration, so if the
652 count is zero then it must have wrapped. Cope with this by
653 subtracting 1 before the conversion and adding 1 to the result. */
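/* For instance, if NITERS has an unsigned 8-bit type and is 0 because
   the real count of 256 wrapped, then (0 - 1) is 255 in that type,
   widening gives 255 in COMPARE_TYPE, and adding 1 back yields the
   intended 256.  */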
654 gcc_assert (TYPE_UNSIGNED (ni_actual_type));
655 niters = gimple_build (&preheader_seq, PLUS_EXPR, ni_actual_type,
656 niters, build_minus_one_cst (ni_actual_type));
657 niters = gimple_convert (&preheader_seq, compare_type, niters);
658 niters = gimple_build (&preheader_seq, PLUS_EXPR, compare_type,
659 niters, build_one_cst (compare_type));
661 else
662 niters = gimple_convert (&preheader_seq, compare_type, niters);
664 /* Convert skip_niters to the right type. */
665 tree niters_skip = LOOP_VINFO_MASK_SKIP_NITERS (loop_vinfo);
667 /* Now calculate the value that the induction variable must be able
668 to hit in order to ensure that we end the loop with an all-false mask.
669 This involves adding the maximum number of inactive trailing scalar
670 iterations. */
671 widest_int iv_limit;
672 bool known_max_iters = max_loop_iterations (loop, &iv_limit);
673 if (known_max_iters)
675 if (niters_skip)
677 /* Add the maximum number of skipped iterations to the
678 maximum iteration count. */
679 if (TREE_CODE (niters_skip) == INTEGER_CST)
680 iv_limit += wi::to_widest (niters_skip);
681 else
682 iv_limit += max_vf - 1;
684 /* IV_LIMIT is the maximum number of latch iterations, which is also
685 the maximum in-range IV value. Round this value down to the previous
686 vector alignment boundary and then add an extra full iteration. */
687 poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
688 iv_limit = (iv_limit & -(int) known_alignment (vf)) + max_vf;
691 /* Get the vectorization factor in tree form. */
692 tree vf = build_int_cst (compare_type,
693 LOOP_VINFO_VECT_FACTOR (loop_vinfo));
695 /* Iterate over all the rgroups and fill in their masks. We could use
696 the first mask from any rgroup for the loop condition; here we
697 arbitrarily pick the last. */
698 tree test_mask = NULL_TREE;
699 rgroup_masks *rgm;
700 unsigned int i;
701 vec_loop_masks *masks = &LOOP_VINFO_MASKS (loop_vinfo);
702 FOR_EACH_VEC_ELT (*masks, i, rgm)
703 if (!rgm->masks.is_empty ())
705 /* First try using permutes. This adds a single vector
706 instruction to the loop for each mask, but needs no extra
707 loop invariants or IVs. */
708 unsigned int nmasks = i + 1;
709 if ((nmasks & 1) == 0)
711 rgroup_masks *half_rgm = &(*masks)[nmasks / 2 - 1];
712 if (!half_rgm->masks.is_empty ()
713 && vect_maybe_permute_loop_masks (&header_seq, rgm, half_rgm))
714 continue;
717 /* See whether zero-based IV would ever generate all-false masks
718 before wrapping around. */
719 bool might_wrap_p
720 = (!known_max_iters
721 || (wi::min_precision (iv_limit * rgm->max_nscalars_per_iter,
722 UNSIGNED)
723 > compare_precision));
725 /* Set up all masks for this group. */
726 test_mask = vect_set_loop_masks_directly (loop, loop_vinfo,
727 &preheader_seq,
728 loop_cond_gsi, rgm, vf,
729 niters, niters_skip,
730 might_wrap_p);
733 /* Emit all accumulated statements. */
734 add_preheader_seq (loop, preheader_seq);
735 add_header_seq (loop, header_seq);
737 /* Get a boolean result that tells us whether to iterate. */
738 edge exit_edge = single_exit (loop);
739 tree_code code = (exit_edge->flags & EDGE_TRUE_VALUE) ? EQ_EXPR : NE_EXPR;
740 tree zero_mask = build_zero_cst (TREE_TYPE (test_mask));
741 gcond *cond_stmt = gimple_build_cond (code, test_mask, zero_mask,
742 NULL_TREE, NULL_TREE);
743 gsi_insert_before (&loop_cond_gsi, cond_stmt, GSI_SAME_STMT);
745 /* The loop iterates (NITERS - 1) / VF + 1 times.
746 Subtract one from this to get the latch count. */
747 tree step = build_int_cst (compare_type,
748 LOOP_VINFO_VECT_FACTOR (loop_vinfo));
749 tree niters_minus_one = fold_build2 (PLUS_EXPR, compare_type, niters,
750 build_minus_one_cst (compare_type));
751 loop->nb_iterations = fold_build2 (TRUNC_DIV_EXPR, compare_type,
752 niters_minus_one, step);
754 if (final_iv)
756 gassign *assign = gimple_build_assign (final_iv, orig_niters);
757 gsi_insert_on_edge_immediate (single_exit (loop), assign);
760 return cond_stmt;
763 /* Like vect_set_loop_condition, but handle the case in which there
764 are no loop masks. */
766 static gcond *
767 vect_set_loop_condition_unmasked (struct loop *loop, tree niters,
768 tree step, tree final_iv,
769 bool niters_maybe_zero,
770 gimple_stmt_iterator loop_cond_gsi)
772 tree indx_before_incr, indx_after_incr;
773 gcond *cond_stmt;
774 gcond *orig_cond;
775 edge pe = loop_preheader_edge (loop);
776 edge exit_edge = single_exit (loop);
777 gimple_stmt_iterator incr_gsi;
778 bool insert_after;
779 enum tree_code code;
780 tree niters_type = TREE_TYPE (niters);
782 orig_cond = get_loop_exit_condition (loop);
783 gcc_assert (orig_cond);
784 loop_cond_gsi = gsi_for_stmt (orig_cond);
786 tree init, limit;
787 if (!niters_maybe_zero && integer_onep (step))
789 /* In this case we can use a simple 0-based IV:
792 x = 0;
796 x += 1;
798 while (x < NITERS); */
799 code = (exit_edge->flags & EDGE_TRUE_VALUE) ? GE_EXPR : LT_EXPR;
800 init = build_zero_cst (niters_type);
801 limit = niters;
803 else
805 /* The following works for all values of NITERS except 0:
808 x = 0;
812 x += STEP;
814 while (x <= NITERS - STEP);
816 so that the loop continues to iterate if x + STEP - 1 < NITERS
817 but stops if x + STEP - 1 >= NITERS.
819 However, if NITERS is zero, x never hits a value above NITERS - STEP
820 before wrapping around. There are two obvious ways of dealing with
821 this:
823 - start at STEP - 1 and compare x before incrementing it
824 - start at -1 and compare x after incrementing it
826 The latter is simpler and is what we use. The loop in this case
827 looks like:
830 x = -1;
834 x += STEP;
836 while (x < NITERS - STEP);
838 In both cases the loop limit is NITERS - STEP. */
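/* For example, in case B with NITERS == 12 and STEP == 4 the limit is
   8 and x takes the values 4, 8 and 12 at the exit test, i.e.
   12 / 4 == 3 iterations.  In case C with an 8-bit IV, NITERS == 0 and
   STEP == 4, the limit is 252 and x runs 3, 7, ..., 255, i.e.
   256 / 4 == 64 iterations.  */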
839 gimple_seq seq = NULL;
840 limit = force_gimple_operand (niters, &seq, true, NULL_TREE);
841 limit = gimple_build (&seq, MINUS_EXPR, TREE_TYPE (limit), limit, step);
842 if (seq)
844 basic_block new_bb = gsi_insert_seq_on_edge_immediate (pe, seq);
845 gcc_assert (!new_bb);
847 if (niters_maybe_zero)
849 /* Case C. */
850 code = (exit_edge->flags & EDGE_TRUE_VALUE) ? GE_EXPR : LT_EXPR;
851 init = build_all_ones_cst (niters_type);
853 else
855 /* Case B. */
856 code = (exit_edge->flags & EDGE_TRUE_VALUE) ? GT_EXPR : LE_EXPR;
857 init = build_zero_cst (niters_type);
861 standard_iv_increment_position (loop, &incr_gsi, &insert_after);
862 create_iv (init, step, NULL_TREE, loop,
863 &incr_gsi, insert_after, &indx_before_incr, &indx_after_incr);
864 indx_after_incr = force_gimple_operand_gsi (&loop_cond_gsi, indx_after_incr,
865 true, NULL_TREE, true,
866 GSI_SAME_STMT);
867 limit = force_gimple_operand_gsi (&loop_cond_gsi, limit, true, NULL_TREE,
868 true, GSI_SAME_STMT);
870 cond_stmt = gimple_build_cond (code, indx_after_incr, limit, NULL_TREE,
871 NULL_TREE);
873 gsi_insert_before (&loop_cond_gsi, cond_stmt, GSI_SAME_STMT);
875 /* Record the number of latch iterations. */
876 if (limit == niters)
877 /* Case A: the loop iterates NITERS times. Subtract one to get the
878 latch count. */
879 loop->nb_iterations = fold_build2 (MINUS_EXPR, niters_type, niters,
880 build_int_cst (niters_type, 1));
881 else
882 /* Case B or C: the loop iterates (NITERS - STEP) / STEP + 1 times.
883 Subtract one from this to get the latch count. */
884 loop->nb_iterations = fold_build2 (TRUNC_DIV_EXPR, niters_type,
885 limit, step);
887 if (final_iv)
889 gassign *assign = gimple_build_assign (final_iv, MINUS_EXPR,
890 indx_after_incr, init);
891 gsi_insert_on_edge_immediate (single_exit (loop), assign);
894 return cond_stmt;
897 /* If we're using fully-masked loops, make LOOP iterate:
899 N == (NITERS - 1) / STEP + 1
901 times. When NITERS is zero, this is equivalent to making the loop
902 execute (1 << M) / STEP times, where M is the precision of NITERS.
903 NITERS_MAYBE_ZERO is true if this last case might occur.
905 If we're not using fully-masked loops, make LOOP iterate:
907 N == (NITERS - STEP) / STEP + 1
909 times, where NITERS is known to be outside the range [1, STEP - 1].
910 This is equivalent to making the loop execute NITERS / STEP times
911 when NITERS is nonzero and (1 << M) / STEP times otherwise.
912 NITERS_MAYBE_ZERO again indicates whether this last case might occur.
914 If FINAL_IV is nonnull, it is an SSA name that should be set to
915 N * STEP on exit from the loop.
917 Assumption: the exit-condition of LOOP is the last stmt in the loop. */
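/* For example, with NITERS == 10 and STEP == 4 the first formula gives
   (10 - 1) / 4 + 1 == 3, and with NITERS == 12 and STEP == 4 the second
   gives (12 - 4) / 4 + 1 == 3; in the latter case FINAL_IV, if given,
   is set to 3 * 4 == 12 on exit.  */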
919 void
920 vect_set_loop_condition (struct loop *loop, loop_vec_info loop_vinfo,
921 tree niters, tree step, tree final_iv,
922 bool niters_maybe_zero)
924 gcond *cond_stmt;
925 gcond *orig_cond = get_loop_exit_condition (loop);
926 gimple_stmt_iterator loop_cond_gsi = gsi_for_stmt (orig_cond);
928 if (loop_vinfo && LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
929 cond_stmt = vect_set_loop_condition_masked (loop, loop_vinfo, niters,
930 final_iv, niters_maybe_zero,
931 loop_cond_gsi);
932 else
933 cond_stmt = vect_set_loop_condition_unmasked (loop, niters, step,
934 final_iv, niters_maybe_zero,
935 loop_cond_gsi);
937 /* Remove old loop exit test. */
938 stmt_vec_info orig_cond_info;
939 if (loop_vinfo
940 && (orig_cond_info = loop_vinfo->lookup_stmt (orig_cond)))
941 loop_vinfo->remove_stmt (orig_cond_info);
942 else
943 gsi_remove (&loop_cond_gsi, true);
945 if (dump_enabled_p ())
947 dump_printf_loc (MSG_NOTE, vect_location, "New loop exit condition: ");
948 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, cond_stmt, 0);
952 /* Helper routine of slpeel_tree_duplicate_loop_to_edge_cfg.
953 For all PHI arguments in FROM->dest and TO->dest coming from those
954 edges, ensure that the current_def of each TO->dest PHI argument
955 is set to the current_def of the corresponding FROM->dest argument. */
957 static void
958 slpeel_duplicate_current_defs_from_edges (edge from, edge to)
960 gimple_stmt_iterator gsi_from, gsi_to;
962 for (gsi_from = gsi_start_phis (from->dest),
963 gsi_to = gsi_start_phis (to->dest);
964 !gsi_end_p (gsi_from) && !gsi_end_p (gsi_to);)
966 gimple *from_phi = gsi_stmt (gsi_from);
967 gimple *to_phi = gsi_stmt (gsi_to);
968 tree from_arg = PHI_ARG_DEF_FROM_EDGE (from_phi, from);
969 tree to_arg = PHI_ARG_DEF_FROM_EDGE (to_phi, to);
970 if (virtual_operand_p (from_arg))
972 gsi_next (&gsi_from);
973 continue;
975 if (virtual_operand_p (to_arg))
977 gsi_next (&gsi_to);
978 continue;
980 if (TREE_CODE (from_arg) != SSA_NAME)
981 gcc_assert (operand_equal_p (from_arg, to_arg, 0));
982 else
984 if (get_current_def (to_arg) == NULL_TREE)
985 set_current_def (to_arg, get_current_def (from_arg));
987 gsi_next (&gsi_from);
988 gsi_next (&gsi_to);
991 gphi *from_phi = get_virtual_phi (from->dest);
992 gphi *to_phi = get_virtual_phi (to->dest);
993 if (from_phi)
994 set_current_def (PHI_ARG_DEF_FROM_EDGE (to_phi, to),
995 get_current_def (PHI_ARG_DEF_FROM_EDGE (from_phi, from)));
999 /* Given LOOP this function generates a new copy of it and puts it
1000 on E which is either the entry or exit of LOOP. If SCALAR_LOOP is
1001 non-NULL, assume LOOP and SCALAR_LOOP are equivalent and copy the
1002 basic blocks from SCALAR_LOOP instead of LOOP, but to either the
1003 entry or exit of LOOP. */
1005 struct loop *
1006 slpeel_tree_duplicate_loop_to_edge_cfg (struct loop *loop,
1007 struct loop *scalar_loop, edge e)
1009 struct loop *new_loop;
1010 basic_block *new_bbs, *bbs, *pbbs;
1011 bool at_exit;
1012 bool was_imm_dom;
1013 basic_block exit_dest;
1014 edge exit, new_exit;
1015 bool duplicate_outer_loop = false;
1017 exit = single_exit (loop);
1018 at_exit = (e == exit);
1019 if (!at_exit && e != loop_preheader_edge (loop))
1020 return NULL;
1022 if (scalar_loop == NULL)
1023 scalar_loop = loop;
1025 bbs = XNEWVEC (basic_block, scalar_loop->num_nodes + 1);
1026 pbbs = bbs + 1;
1027 get_loop_body_with_size (scalar_loop, pbbs, scalar_loop->num_nodes);
1028 /* Allow duplication of outer loops. */
1029 if (scalar_loop->inner)
1030 duplicate_outer_loop = true;
1031 /* Check whether duplication is possible. */
1032 if (!can_copy_bbs_p (pbbs, scalar_loop->num_nodes))
1034 free (bbs);
1035 return NULL;
1038 /* Generate new loop structure. */
1039 new_loop = duplicate_loop (scalar_loop, loop_outer (scalar_loop));
1040 duplicate_subloops (scalar_loop, new_loop);
1042 exit_dest = exit->dest;
1043 was_imm_dom = (get_immediate_dominator (CDI_DOMINATORS,
1044 exit_dest) == loop->header ?
1045 true : false);
1047 /* Also copy the pre-header, this avoids jumping through hoops to
1048 duplicate the loop entry PHI arguments. Create an empty
1049 pre-header unconditionally for this. */
1050 basic_block preheader = split_edge (loop_preheader_edge (scalar_loop));
1051 edge entry_e = single_pred_edge (preheader);
1052 bbs[0] = preheader;
1053 new_bbs = XNEWVEC (basic_block, scalar_loop->num_nodes + 1);
1055 exit = single_exit (scalar_loop);
1056 copy_bbs (bbs, scalar_loop->num_nodes + 1, new_bbs,
1057 &exit, 1, &new_exit, NULL,
1058 at_exit ? loop->latch : e->src, true);
1059 exit = single_exit (loop);
1060 basic_block new_preheader = new_bbs[0];
1062 add_phi_args_after_copy (new_bbs, scalar_loop->num_nodes + 1, NULL);
1064 if (scalar_loop != loop)
1066 /* If we copied from SCALAR_LOOP rather than LOOP, SSA_NAMEs from
1067 SCALAR_LOOP will have current_def set to SSA_NAMEs in the new_loop,
1068 but LOOP will not. slpeel_update_phi_nodes_for_guard{1,2} expects
1069 the LOOP SSA_NAMEs (on the exit edge and edge from latch to
1070 header) to have current_def set, so copy them over. */
1071 slpeel_duplicate_current_defs_from_edges (single_exit (scalar_loop),
1072 exit);
1073 slpeel_duplicate_current_defs_from_edges (EDGE_SUCC (scalar_loop->latch,
1074 0),
1075 EDGE_SUCC (loop->latch, 0));
1078 if (at_exit) /* Add the loop copy at exit. */
1080 if (scalar_loop != loop)
1082 gphi_iterator gsi;
1083 new_exit = redirect_edge_and_branch (new_exit, exit_dest);
1085 for (gsi = gsi_start_phis (exit_dest); !gsi_end_p (gsi);
1086 gsi_next (&gsi))
1088 gphi *phi = gsi.phi ();
1089 tree orig_arg = PHI_ARG_DEF_FROM_EDGE (phi, e);
1090 location_t orig_locus
1091 = gimple_phi_arg_location_from_edge (phi, e);
1093 add_phi_arg (phi, orig_arg, new_exit, orig_locus);
1096 redirect_edge_and_branch_force (e, new_preheader);
1097 flush_pending_stmts (e);
1098 set_immediate_dominator (CDI_DOMINATORS, new_preheader, e->src);
1099 if (was_imm_dom || duplicate_outer_loop)
1100 set_immediate_dominator (CDI_DOMINATORS, exit_dest, new_exit->src);
1102 /* And remove the non-necessary forwarder again. Keep the other
1103 one so we have a proper pre-header for the loop at the exit edge. */
1104 redirect_edge_pred (single_succ_edge (preheader),
1105 single_pred (preheader));
1106 delete_basic_block (preheader);
1107 set_immediate_dominator (CDI_DOMINATORS, scalar_loop->header,
1108 loop_preheader_edge (scalar_loop)->src);
1110 else /* Add the copy at entry. */
1112 if (scalar_loop != loop)
1114 /* Remove the non-necessary forwarder of scalar_loop again. */
1115 redirect_edge_pred (single_succ_edge (preheader),
1116 single_pred (preheader));
1117 delete_basic_block (preheader);
1118 set_immediate_dominator (CDI_DOMINATORS, scalar_loop->header,
1119 loop_preheader_edge (scalar_loop)->src);
1120 preheader = split_edge (loop_preheader_edge (loop));
1121 entry_e = single_pred_edge (preheader);
1124 redirect_edge_and_branch_force (entry_e, new_preheader);
1125 flush_pending_stmts (entry_e);
1126 set_immediate_dominator (CDI_DOMINATORS, new_preheader, entry_e->src);
1128 redirect_edge_and_branch_force (new_exit, preheader);
1129 flush_pending_stmts (new_exit);
1130 set_immediate_dominator (CDI_DOMINATORS, preheader, new_exit->src);
1132 /* And remove the non-necessary forwarder again. Keep the other
1133 one so we have a proper pre-header for the loop at the exit edge. */
1134 redirect_edge_pred (single_succ_edge (new_preheader),
1135 single_pred (new_preheader));
1136 delete_basic_block (new_preheader);
1137 set_immediate_dominator (CDI_DOMINATORS, new_loop->header,
1138 loop_preheader_edge (new_loop)->src);
1141 /* Skip new preheader since it's deleted if copy loop is added at entry. */
1142 for (unsigned i = (at_exit ? 0 : 1); i < scalar_loop->num_nodes + 1; i++)
1143 rename_variables_in_bb (new_bbs[i], duplicate_outer_loop);
1145 if (scalar_loop != loop)
1147 /* Update new_loop->header PHIs, so that on the preheader
1148 edge they are the ones from loop rather than scalar_loop. */
1149 gphi_iterator gsi_orig, gsi_new;
1150 edge orig_e = loop_preheader_edge (loop);
1151 edge new_e = loop_preheader_edge (new_loop);
1153 for (gsi_orig = gsi_start_phis (loop->header),
1154 gsi_new = gsi_start_phis (new_loop->header);
1155 !gsi_end_p (gsi_orig) && !gsi_end_p (gsi_new);
1156 gsi_next (&gsi_orig), gsi_next (&gsi_new))
1158 gphi *orig_phi = gsi_orig.phi ();
1159 gphi *new_phi = gsi_new.phi ();
1160 tree orig_arg = PHI_ARG_DEF_FROM_EDGE (orig_phi, orig_e);
1161 location_t orig_locus
1162 = gimple_phi_arg_location_from_edge (orig_phi, orig_e);
1164 add_phi_arg (new_phi, orig_arg, new_e, orig_locus);
1168 free (new_bbs);
1169 free (bbs);
1171 checking_verify_dominators (CDI_DOMINATORS);
1173 return new_loop;
1177 /* Given the condition expression COND, put it as the last statement of
1178 GUARD_BB; set both edges' probability; set dominator of GUARD_TO to
1179 DOM_BB; return the skip edge. GUARD_TO is the target basic block to
1180 skip the loop. PROBABILITY is the skip edge's probability. Mark the
1181 new edge as irreducible if IRREDUCIBLE_P is true. */
1183 static edge
1184 slpeel_add_loop_guard (basic_block guard_bb, tree cond,
1185 basic_block guard_to, basic_block dom_bb,
1186 profile_probability probability, bool irreducible_p)
1188 gimple_stmt_iterator gsi;
1189 edge new_e, enter_e;
1190 gcond *cond_stmt;
1191 gimple_seq gimplify_stmt_list = NULL;
1193 enter_e = EDGE_SUCC (guard_bb, 0);
1194 enter_e->flags &= ~EDGE_FALLTHRU;
1195 enter_e->flags |= EDGE_FALSE_VALUE;
1196 gsi = gsi_last_bb (guard_bb);
1198 cond = force_gimple_operand_1 (cond, &gimplify_stmt_list, is_gimple_condexpr,
1199 NULL_TREE);
1200 if (gimplify_stmt_list)
1201 gsi_insert_seq_after (&gsi, gimplify_stmt_list, GSI_NEW_STMT);
1203 cond_stmt = gimple_build_cond_from_tree (cond, NULL_TREE, NULL_TREE);
1204 gsi = gsi_last_bb (guard_bb);
1205 gsi_insert_after (&gsi, cond_stmt, GSI_NEW_STMT);
1207 /* Add new edge to connect guard block to the merge/loop-exit block. */
1208 new_e = make_edge (guard_bb, guard_to, EDGE_TRUE_VALUE);
1210 new_e->probability = probability;
1211 if (irreducible_p)
1212 new_e->flags |= EDGE_IRREDUCIBLE_LOOP;
1214 enter_e->probability = probability.invert ();
1215 set_immediate_dominator (CDI_DOMINATORS, guard_to, dom_bb);
1217 /* Split enter_e to preserve LOOPS_HAVE_PREHEADERS. */
1218 if (enter_e->dest->loop_father->header == enter_e->dest)
1219 split_edge (enter_e);
1221 return new_e;
1225 /* This function verifies that the following restrictions apply to LOOP:
1226 (1) it consists of exactly 2 basic blocks (a header and an empty latch)
1227 for an innermost loop, or 5 basic blocks for an outer loop.
1228 (2) it is single entry, single exit
1229 (3) its exit condition is the last stmt in the header
1230 (4) E is the entry/exit edge of LOOP. */
1233 bool
1234 slpeel_can_duplicate_loop_p (const struct loop *loop, const_edge e)
1236 edge exit_e = single_exit (loop);
1237 edge entry_e = loop_preheader_edge (loop);
1238 gcond *orig_cond = get_loop_exit_condition (loop);
1239 gimple_stmt_iterator loop_exit_gsi = gsi_last_bb (exit_e->src);
1240 unsigned int num_bb = loop->inner? 5 : 2;
1242 /* All loops have an outer scope; the only case loop->outer is NULL is for
1243 the function itself. */
1244 if (!loop_outer (loop)
1245 || loop->num_nodes != num_bb
1246 || !empty_block_p (loop->latch)
1247 || !single_exit (loop)
1248 /* Verify that new loop exit condition can be trivially modified. */
1249 || (!orig_cond || orig_cond != gsi_stmt (loop_exit_gsi))
1250 || (e != exit_e && e != entry_e))
1251 return false;
1253 return true;
1256 /* If the loop has a virtual PHI, but exit bb doesn't, create a virtual PHI
1257 in the exit bb and rename all the uses after the loop. This simplifies
1258 the *guard[12] routines, which assume loop closed SSA form for all PHIs
1259 (but normally loop closed SSA form doesn't require virtual PHIs to be
1260 in the same form). Doing this early simplifies checking which
1261 uses should be renamed. */
1263 static void
1264 create_lcssa_for_virtual_phi (struct loop *loop)
1266 gphi_iterator gsi;
1267 edge exit_e = single_exit (loop);
1269 for (gsi = gsi_start_phis (loop->header); !gsi_end_p (gsi); gsi_next (&gsi))
1270 if (virtual_operand_p (gimple_phi_result (gsi_stmt (gsi))))
1272 gphi *phi = gsi.phi ();
1273 for (gsi = gsi_start_phis (exit_e->dest);
1274 !gsi_end_p (gsi); gsi_next (&gsi))
1275 if (virtual_operand_p (gimple_phi_result (gsi_stmt (gsi))))
1276 break;
1277 if (gsi_end_p (gsi))
1279 tree new_vop = copy_ssa_name (PHI_RESULT (phi));
1280 gphi *new_phi = create_phi_node (new_vop, exit_e->dest);
1281 tree vop = PHI_ARG_DEF_FROM_EDGE (phi, EDGE_SUCC (loop->latch, 0));
1282 imm_use_iterator imm_iter;
1283 gimple *stmt;
1284 use_operand_p use_p;
1286 SSA_NAME_OCCURS_IN_ABNORMAL_PHI (new_vop)
1287 = SSA_NAME_OCCURS_IN_ABNORMAL_PHI (vop);
1288 add_phi_arg (new_phi, vop, exit_e, UNKNOWN_LOCATION);
1289 gimple_phi_set_result (new_phi, new_vop);
1290 FOR_EACH_IMM_USE_STMT (stmt, imm_iter, vop)
1291 if (stmt != new_phi
1292 && !flow_bb_inside_loop_p (loop, gimple_bb (stmt)))
1293 FOR_EACH_IMM_USE_ON_STMT (use_p, imm_iter)
1294 SET_USE (use_p, new_vop);
1296 break;
1301 /* Function vect_get_loop_location.
1303 Extract the location of the loop in the source code.
1304 If the loop is not well formed for vectorization, an estimated
1305 location is calculated.
1306 Return the loop location on success and NULL if not.
1308 dump_user_location_t
1309 find_loop_location (struct loop *loop)
1311 gimple *stmt = NULL;
1312 basic_block bb;
1313 gimple_stmt_iterator si;
1315 if (!loop)
1316 return dump_user_location_t ();
1318 stmt = get_loop_exit_condition (loop);
1320 if (stmt
1321 && LOCATION_LOCUS (gimple_location (stmt)) > BUILTINS_LOCATION)
1322 return stmt;
1324 /* If we got here the loop is probably not "well formed",
1325 try to estimate the loop location */
1327 if (!loop->header)
1328 return dump_user_location_t ();
1330 bb = loop->header;
1332 for (si = gsi_start_bb (bb); !gsi_end_p (si); gsi_next (&si))
1334 stmt = gsi_stmt (si);
1335 if (LOCATION_LOCUS (gimple_location (stmt)) > BUILTINS_LOCATION)
1336 return stmt;
1339 return dump_user_location_t ();
1342 /* Return true if the phi described by STMT_INFO defines an IV of the
1343 loop to be vectorized. */
1345 static bool
1346 iv_phi_p (stmt_vec_info stmt_info)
1348 gphi *phi = as_a <gphi *> (stmt_info->stmt);
1349 if (virtual_operand_p (PHI_RESULT (phi)))
1350 return false;
1352 if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_reduction_def
1353 || STMT_VINFO_DEF_TYPE (stmt_info) == vect_double_reduction_def)
1354 return false;
1356 return true;
1359 /* Function vect_can_advance_ivs_p
1361 In case the number of iterations that LOOP iterates is unknown at compile
1362 time, an epilog loop will be generated, and the loop induction variables
1363 (IVs) will be "advanced" to the value they are supposed to take just before
1364 the epilog loop. Here we check that the access function of the loop IVs
1365 and the expression that represents the loop bound are simple enough.
1366 These restrictions will be relaxed in the future. */
1368 bool
1369 vect_can_advance_ivs_p (loop_vec_info loop_vinfo)
1371 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1372 basic_block bb = loop->header;
1373 gphi_iterator gsi;
1375 /* Analyze phi functions of the loop header. */
1377 if (dump_enabled_p ())
1378 dump_printf_loc (MSG_NOTE, vect_location, "vect_can_advance_ivs_p:\n");
1379 for (gsi = gsi_start_phis (bb); !gsi_end_p (gsi); gsi_next (&gsi))
1381 tree evolution_part;
1383 gphi *phi = gsi.phi ();
1384 stmt_vec_info phi_info = loop_vinfo->lookup_stmt (phi);
1385 if (dump_enabled_p ())
1387 dump_printf_loc (MSG_NOTE, vect_location, "Analyze phi: ");
1388 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, phi_info->stmt, 0);
1391 /* Skip virtual phi's. The data dependences that are associated with
1392 virtual defs/uses (i.e., memory accesses) are analyzed elsewhere.
1394 Skip reduction phis. */
1395 if (!iv_phi_p (phi_info))
1397 if (dump_enabled_p ())
1398 dump_printf_loc (MSG_NOTE, vect_location,
1399 "reduc or virtual phi. skip.\n");
1400 continue;
1403 /* Analyze the evolution function. */
1405 evolution_part = STMT_VINFO_LOOP_PHI_EVOLUTION_PART (phi_info);
1406 if (evolution_part == NULL_TREE)
1408 if (dump_enabled_p ())
1409 dump_printf (MSG_MISSED_OPTIMIZATION,
1410 "No access function or evolution.\n");
1411 return false;
1414 /* FORNOW: We do not transform initial conditions of IVs
1415 which evolution functions are not invariants in the loop. */
1417 if (!expr_invariant_in_loop_p (loop, evolution_part))
1419 if (dump_enabled_p ())
1420 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1421 "evolution not invariant in loop.\n");
1422 return false;
1425 /* FORNOW: We do not transform initial conditions of IVs
1426 which evolution functions are a polynomial of degree >= 2. */
1428 if (tree_is_chrec (evolution_part))
1430 if (dump_enabled_p ())
1431 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1432 "evolution is chrec.\n");
1433 return false;
1437 return true;
1441 /* Function vect_update_ivs_after_vectorizer.
1443 "Advance" the induction variables of LOOP to the value they should take
1444 after the execution of LOOP. This is currently necessary because the
1445 vectorizer does not handle induction variables that are used after the
1446 loop. Such a situation occurs when the last iterations of LOOP are
1447 peeled, because:
1448 1. We introduced new uses after LOOP for IVs that were not originally used
1449 after LOOP: the IVs of LOOP are now used by an epilog loop.
1450 2. LOOP is going to be vectorized; this means that it will iterate N/VF
1451 times, whereas the loop IVs should be bumped N times.
1453 Input:
1454 - LOOP - a loop that is going to be vectorized. The last few iterations
1455 of LOOP were peeled.
1456 - NITERS - the number of iterations that LOOP executes (before it is
1457 vectorized), i.e., the number of times the ivs should be bumped.
1458 - UPDATE_E - a successor edge of LOOP->exit that is on the (only) path
1459 coming out from LOOP on which there are uses of the LOOP ivs
1460 (this is the path from LOOP->exit to epilog_loop->preheader).
1462 The new definitions of the ivs are placed in LOOP->exit.
1463 The phi args associated with the edge UPDATE_E in the bb
1464 UPDATE_E->dest are updated accordingly.
1466 Assumption 1: Like the rest of the vectorizer, this function assumes
1467 a single loop exit that has a single predecessor.
1469 Assumption 2: The phi nodes in the LOOP header and in update_bb are
1470 organized in the same order.
1472 Assumption 3: The access function of the ivs is simple enough (see
1473 vect_can_advance_ivs_p). This assumption will be relaxed in the future.
1475 Assumption 4: Exactly one of the successors of LOOP exit-bb is on a path
1476 coming out of LOOP on which the ivs of LOOP are used (this is the path
1477 that leads to the epilog loop; other paths skip the epilog loop). This
1478 path starts with the edge UPDATE_E, and its destination (denoted update_bb)
1479 needs to have its phis updated. */
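/* For instance, an IV with initial value 0 and step 4 in a loop with
   NITERS == 100 is advanced to 0 + 100 * 4 == 400, and the phi in
   UPDATE_E->dest then uses 400 as its argument on UPDATE_E.  */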
1482 static void
1483 vect_update_ivs_after_vectorizer (loop_vec_info loop_vinfo,
1484 tree niters, edge update_e)
1486 gphi_iterator gsi, gsi1;
1487 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1488 basic_block update_bb = update_e->dest;
1489 basic_block exit_bb = single_exit (loop)->dest;
1491 /* Make sure there exists a single-predecessor exit bb: */
1492 gcc_assert (single_pred_p (exit_bb));
1493 gcc_assert (single_succ_edge (exit_bb) == update_e);
1495 for (gsi = gsi_start_phis (loop->header), gsi1 = gsi_start_phis (update_bb);
1496 !gsi_end_p (gsi) && !gsi_end_p (gsi1);
1497 gsi_next (&gsi), gsi_next (&gsi1))
1499 tree init_expr;
1500 tree step_expr, off;
1501 tree type;
1502 tree var, ni, ni_name;
1503 gimple_stmt_iterator last_gsi;
1505 gphi *phi = gsi.phi ();
1506 gphi *phi1 = gsi1.phi ();
1507 stmt_vec_info phi_info = loop_vinfo->lookup_stmt (phi);
1508 if (dump_enabled_p ())
1510 dump_printf_loc (MSG_NOTE, vect_location,
1511 "vect_update_ivs_after_vectorizer: phi: ");
1512 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, phi, 0);
1515 /* Skip reduction and virtual phis. */
1516 if (!iv_phi_p (phi_info))
1518 if (dump_enabled_p ())
1519 dump_printf_loc (MSG_NOTE, vect_location,
1520 "reduc or virtual phi. skip.\n");
1521 continue;
1524 type = TREE_TYPE (gimple_phi_result (phi));
1525 step_expr = STMT_VINFO_LOOP_PHI_EVOLUTION_PART (phi_info);
1526 step_expr = unshare_expr (step_expr);
1528 /* FORNOW: We do not support IVs whose evolution function is a polynomial
1529 of degree >= 2 or exponential. */
1530 gcc_assert (!tree_is_chrec (step_expr));
1532 init_expr = PHI_ARG_DEF_FROM_EDGE (phi, loop_preheader_edge (loop));
1534 off = fold_build2 (MULT_EXPR, TREE_TYPE (step_expr),
1535 fold_convert (TREE_TYPE (step_expr), niters),
1536 step_expr);
1537 if (POINTER_TYPE_P (type))
1538 ni = fold_build_pointer_plus (init_expr, off);
1539 else
1540 ni = fold_build2 (PLUS_EXPR, type,
1541 init_expr, fold_convert (type, off));
1543 var = create_tmp_var (type, "tmp");
1545 last_gsi = gsi_last_bb (exit_bb);
1546 gimple_seq new_stmts = NULL;
1547 ni_name = force_gimple_operand (ni, &new_stmts, false, var);
1548 /* Exit_bb shouldn't be empty. */
1549 if (!gsi_end_p (last_gsi))
1550 gsi_insert_seq_after (&last_gsi, new_stmts, GSI_SAME_STMT);
1551 else
1552 gsi_insert_seq_before (&last_gsi, new_stmts, GSI_SAME_STMT);
1554 /* Fix phi expressions in the successor bb. */
1555 adjust_phi_and_debug_stmts (phi1, update_e, ni_name);
1559 /* Return a gimple value containing the misalignment (measured in vector
1560 elements) for the loop described by LOOP_VINFO, i.e. how many elements
1561 it is away from a perfectly aligned address. Add any new statements
1562 to SEQ. */
1564 static tree
1565 get_misalign_in_elems (gimple **seq, loop_vec_info loop_vinfo)
1567 dr_vec_info *dr_info = LOOP_VINFO_UNALIGNED_DR (loop_vinfo);
1568 stmt_vec_info stmt_info = dr_info->stmt;
1569 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
1571 unsigned int target_align = DR_TARGET_ALIGNMENT (dr_info);
1572 gcc_assert (target_align != 0);
1574 bool negative = tree_int_cst_compare (DR_STEP (dr_info->dr),
1575 size_zero_node) < 0;
1576 tree offset = (negative
1577 ? size_int (-TYPE_VECTOR_SUBPARTS (vectype) + 1)
1578 : size_zero_node);
1579 tree start_addr = vect_create_addr_base_for_vector_ref (stmt_info, seq,
1580 offset);
1581 tree type = unsigned_type_for (TREE_TYPE (start_addr));
1582 tree target_align_minus_1 = build_int_cst (type, target_align - 1);
1583 HOST_WIDE_INT elem_size
1584 = int_cst_value (TYPE_SIZE_UNIT (TREE_TYPE (vectype)));
1585 tree elem_size_log = build_int_cst (type, exact_log2 (elem_size));
1587 /* Create: misalign_in_bytes = addr & (target_align - 1). */
1588 tree int_start_addr = fold_convert (type, start_addr);
1589 tree misalign_in_bytes = fold_build2 (BIT_AND_EXPR, type, int_start_addr,
1590 target_align_minus_1);
1592 /* Create: misalign_in_elems = misalign_in_bytes / element_size. */
1593 tree misalign_in_elems = fold_build2 (RSHIFT_EXPR, type, misalign_in_bytes,
1594 elem_size_log);
1596 return misalign_in_elems;
1599 /* Function vect_gen_prolog_loop_niters
1601 Generate the number of iterations which should be peeled as prolog for the
1602 loop represented by LOOP_VINFO. It is calculated as the misalignment of
1603 DR - the data reference recorded in LOOP_VINFO_UNALIGNED_DR (LOOP_VINFO).
1604 As a result, after the execution of this loop, the data reference DR will
1605 refer to an aligned location. The following computation is generated:
1607 If the misalignment of DR is known at compile time:
1608 addr_mis = int mis = DR_MISALIGNMENT (dr);
1609 Else, compute address misalignment in bytes:
1610 addr_mis = addr & (target_align - 1)
1612 prolog_niters = ((VF - addr_mis/elem_size)&(VF-1))/step
1614 (elem_size = element type size; an element is the scalar element whose type
1615 is the inner type of the vectype)
1617 The computations will be emitted at the end of BB. We also compute and
1618 store upper bound (included) of the result in BOUND.
1620 When the step of the data-ref in the loop is not 1 (as in interleaved data
1621 and SLP), the number of iterations of the prolog must be divided by the step
1622 (which is equal to the size of the interleaved group).
1624 The above formulas assume that VF == number of elements in the vector. This
1625 may not hold when there are multiple-types in the loop.
1626 In this case, for some data-references in the loop the VF does not represent
1627 the number of elements that fit in the vector. Therefore, instead of VF we
1628 use TYPE_VECTOR_SUBPARTS. */
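/* As a worked example (values chosen for illustration): with a 16-byte
   target alignment, 4-byte elements, step 1 and a misalignment of
   8 bytes, addr_mis / elem_size == 2, align_in_elems == 4, and
   ((4 - 2) & 3) / 1 == 2, so two peeled scalar iterations bring the
   data reference to an aligned address.  */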
1630 static tree
1631 vect_gen_prolog_loop_niters (loop_vec_info loop_vinfo,
1632 basic_block bb, int *bound)
1634 dr_vec_info *dr_info = LOOP_VINFO_UNALIGNED_DR (loop_vinfo);
1635 tree var;
1636 tree niters_type = TREE_TYPE (LOOP_VINFO_NITERS (loop_vinfo));
1637 gimple_seq stmts = NULL, new_stmts = NULL;
1638 tree iters, iters_name;
1639 stmt_vec_info stmt_info = dr_info->stmt;
1640 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
1641 unsigned int target_align = DR_TARGET_ALIGNMENT (dr_info);
1643 if (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) > 0)
1645 int npeel = LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo);
1647 if (dump_enabled_p ())
1648 dump_printf_loc (MSG_NOTE, vect_location,
1649 "known peeling = %d.\n", npeel);
1651 iters = build_int_cst (niters_type, npeel);
1652 *bound = LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo);
1654 else
1656 tree misalign_in_elems = get_misalign_in_elems (&stmts, loop_vinfo);
1657 tree type = TREE_TYPE (misalign_in_elems);
1658 HOST_WIDE_INT elem_size
1659 = int_cst_value (TYPE_SIZE_UNIT (TREE_TYPE (vectype)));
1660 HOST_WIDE_INT align_in_elems = target_align / elem_size;
1661 tree align_in_elems_minus_1 = build_int_cst (type, align_in_elems - 1);
1662 tree align_in_elems_tree = build_int_cst (type, align_in_elems);
1664 /* Create: (niters_type) ((align_in_elems - misalign_in_elems)
1665 & (align_in_elems - 1)). */
1666 bool negative = tree_int_cst_compare (DR_STEP (dr_info->dr),
1667 size_zero_node) < 0;
1668 if (negative)
1669 iters = fold_build2 (MINUS_EXPR, type, misalign_in_elems,
1670 align_in_elems_tree);
1671 else
1672 iters = fold_build2 (MINUS_EXPR, type, align_in_elems_tree,
1673 misalign_in_elems);
1674 iters = fold_build2 (BIT_AND_EXPR, type, iters, align_in_elems_minus_1);
1675 iters = fold_convert (niters_type, iters);
1676 *bound = align_in_elems - 1;
1679 if (dump_enabled_p ())
1681 dump_printf_loc (MSG_NOTE, vect_location,
1682 "niters for prolog loop: ");
1683 dump_generic_expr (MSG_NOTE, TDF_SLIM, iters);
1684 dump_printf (MSG_NOTE, "\n");
1687 var = create_tmp_var (niters_type, "prolog_loop_niters");
1688 iters_name = force_gimple_operand (iters, &new_stmts, false, var);
1690 if (new_stmts)
1691 gimple_seq_add_seq (&stmts, new_stmts);
1692 if (stmts)
1694 gcc_assert (single_succ_p (bb));
1695 gimple_stmt_iterator gsi = gsi_last_bb (bb);
1696 if (gsi_end_p (gsi))
1697 gsi_insert_seq_before (&gsi, stmts, GSI_SAME_STMT);
1698 else
1699 gsi_insert_seq_after (&gsi, stmts, GSI_SAME_STMT);
1701 return iters_name;
1705 /* Function vect_update_init_of_dr
1707 If CODE is PLUS, the vector loop starts NITERS iterations after the
1708 scalar one, otherwise CODE is MINUS and the vector loop starts NITERS
1709 iterations before the scalar one (using masking to skip inactive
1710 elements). This function updates the information recorded in DR to
1711 account for the difference. Specifically, it updates the OFFSET
1712 field of DR. */
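/* For instance, with DR_STEP == 4 bytes and NITERS == 2, a PLUS CODE
   advances DR_OFFSET by 8 bytes and a MINUS CODE moves it back by 8.  */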
1714 static void
1715 vect_update_init_of_dr (struct data_reference *dr, tree niters, tree_code code)
1717 tree offset = DR_OFFSET (dr);
1719 niters = fold_build2 (MULT_EXPR, sizetype,
1720 fold_convert (sizetype, niters),
1721 fold_convert (sizetype, DR_STEP (dr)));
1722 offset = fold_build2 (code, sizetype,
1723 fold_convert (sizetype, offset), niters);
1724 DR_OFFSET (dr) = offset;
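
/* A hedged sketch of the offset adjustment above, with plain signed
   integers standing in for the sizetype trees; dr_offset_sketch is an
   illustrative name, not a GCC function.  */

static long
dr_offset_sketch (long offset, long niters, long step, int code_is_plus)
{
  long delta = niters * step;	/* distance covered by NITERS iterations */
  /* PLUS: the vector loop starts NITERS iterations after the scalar one;
     MINUS: it starts NITERS iterations before it (peeling by masking).  */
  return code_is_plus ? offset + delta : offset - delta;
}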
1728 /* Function vect_update_inits_of_drs
1730 Apply vect_update_init_of_dr to all accesses in LOOP_VINFO.
1731 CODE and NITERS are as for vect_update_init_of_dr. */
1733 static void
1734 vect_update_inits_of_drs (loop_vec_info loop_vinfo, tree niters,
1735 tree_code code)
1737 unsigned int i;
1738 vec<data_reference_p> datarefs = LOOP_VINFO_DATAREFS (loop_vinfo);
1739 struct data_reference *dr;
1741 DUMP_VECT_SCOPE ("vect_update_inits_of_dr");
1743 /* Adjust niters to sizetype and insert stmts on loop preheader edge. */
1744 if (!types_compatible_p (sizetype, TREE_TYPE (niters)))
1746 gimple_seq seq;
1747 edge pe = loop_preheader_edge (LOOP_VINFO_LOOP (loop_vinfo));
1748 tree var = create_tmp_var (sizetype, "prolog_loop_adjusted_niters");
1750 niters = fold_convert (sizetype, niters);
1751 niters = force_gimple_operand (niters, &seq, false, var);
1752 if (seq)
1754 basic_block new_bb = gsi_insert_seq_on_edge_immediate (pe, seq);
1755 gcc_assert (!new_bb);
1759 FOR_EACH_VEC_ELT (datarefs, i, dr)
1761 dr_vec_info *dr_info = loop_vinfo->lookup_dr (dr);
1762 if (!STMT_VINFO_GATHER_SCATTER_P (dr_info->stmt))
1763 vect_update_init_of_dr (dr, niters, code);
1767 /* For the information recorded in LOOP_VINFO prepare the loop for peeling
1768 by masking. This involves calculating the number of iterations to
1769 be peeled and then aligning all memory references appropriately. */
1771 void
1772 vect_prepare_for_masked_peels (loop_vec_info loop_vinfo)
1774 tree misalign_in_elems;
1775 tree type = LOOP_VINFO_MASK_COMPARE_TYPE (loop_vinfo);
1777 gcc_assert (vect_use_loop_mask_for_alignment_p (loop_vinfo));
1779 /* From the information recorded in LOOP_VINFO get the number of iterations
1780 that need to be skipped via masking. */
1781 if (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) > 0)
1783 poly_int64 misalign = (LOOP_VINFO_VECT_FACTOR (loop_vinfo)
1784 - LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo));
1785 misalign_in_elems = build_int_cst (type, misalign);
1787 else
1789 gimple_seq seq1 = NULL, seq2 = NULL;
1790 misalign_in_elems = get_misalign_in_elems (&seq1, loop_vinfo);
1791 misalign_in_elems = fold_convert (type, misalign_in_elems);
1792 misalign_in_elems = force_gimple_operand (misalign_in_elems,
1793 &seq2, true, NULL_TREE);
1794 gimple_seq_add_seq (&seq1, seq2);
1795 if (seq1)
1797 edge pe = loop_preheader_edge (LOOP_VINFO_LOOP (loop_vinfo));
1798 basic_block new_bb = gsi_insert_seq_on_edge_immediate (pe, seq1);
1799 gcc_assert (!new_bb);
1803 if (dump_enabled_p ())
1805 dump_printf_loc (MSG_NOTE, vect_location,
1806 "misalignment for fully-masked loop: ");
1807 dump_generic_expr (MSG_NOTE, TDF_SLIM, misalign_in_elems);
1808 dump_printf (MSG_NOTE, "\n");
1811 LOOP_VINFO_MASK_SKIP_NITERS (loop_vinfo) = misalign_in_elems;
1813 vect_update_inits_of_drs (loop_vinfo, misalign_in_elems, MINUS_EXPR);
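
/* A small sketch, under the assumption of a constant vectorization
   factor, of the skip count recorded above; masked_skip_niters_sketch
   and its plain integer parameters are illustrative only.  */

static unsigned int
masked_skip_niters_sketch (unsigned int vf, int npeel_for_alignment,
			   unsigned int misalign_in_elems)
{
  /* With a known peeling amount, the first VF - npeel elements of the
     first vector iteration are masked off; otherwise the runtime
     misalignment in elements is used directly.  */
  if (npeel_for_alignment > 0)
    return vf - (unsigned int) npeel_for_alignment;
  return misalign_in_elems;
}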
1816 /* This function builds ni_name = number of iterations. Statements
1817 are emitted on the loop preheader edge. If NEW_VAR_P is not NULL, set
1818 it to TRUE if a new ssa_var is generated. */
1820 tree
1821 vect_build_loop_niters (loop_vec_info loop_vinfo, bool *new_var_p)
1823 tree ni = unshare_expr (LOOP_VINFO_NITERS (loop_vinfo));
1824 if (TREE_CODE (ni) == INTEGER_CST)
1825 return ni;
1826 else
1828 tree ni_name, var;
1829 gimple_seq stmts = NULL;
1830 edge pe = loop_preheader_edge (LOOP_VINFO_LOOP (loop_vinfo));
1832 var = create_tmp_var (TREE_TYPE (ni), "niters");
1833 ni_name = force_gimple_operand (ni, &stmts, false, var);
1834 if (stmts)
1836 gsi_insert_seq_on_edge_immediate (pe, stmts);
1837 if (new_var_p != NULL)
1838 *new_var_p = true;
1841 return ni_name;
1845 /* Calculate the number of iterations above which the vectorized loop will be
1846 preferred over the scalar loop. NITERS_PROLOG is the number of iterations
1847 of the prolog loop. If it is an integer constant, its value is also passed
1848 in INT_NITERS_PROLOG. BOUND_PROLOG is the upper bound (inclusive) of the
1849 number of iterations of the prolog loop. BOUND_EPILOG is the corresponding
1850 value for the epilog loop. If CHECK_PROFITABILITY is true, TH is the
1851 threshold below which the scalar (rather than vectorized) loop will be
1852 executed. This function stores the upper bound (inclusive) of the result
1853 in BOUND_SCALAR. */
1855 static tree
1856 vect_gen_scalar_loop_niters (tree niters_prolog, int int_niters_prolog,
1857 int bound_prolog, poly_int64 bound_epilog, int th,
1858 poly_uint64 *bound_scalar,
1859 bool check_profitability)
1861 tree type = TREE_TYPE (niters_prolog);
1862 tree niters = fold_build2 (PLUS_EXPR, type, niters_prolog,
1863 build_int_cst (type, bound_epilog));
1865 *bound_scalar = bound_prolog + bound_epilog;
1866 if (check_profitability)
1868 /* TH indicates the minimum niters of vectorized loop, while we
1869 compute the maximum niters of scalar loop. */
1870 th--;
1871 /* Peeling a constant number of times. */
1872 if (int_niters_prolog >= 0)
1874 *bound_scalar = upper_bound (int_niters_prolog + bound_epilog, th);
1875 return build_int_cst (type, *bound_scalar);
1877 /* Peeling an unknown number of times. Note that both BOUND_PROLOG
1878 and BOUND_EPILOG are inclusive upper bounds. */
1879 if (known_ge (th, bound_prolog + bound_epilog))
1881 *bound_scalar = th;
1882 return build_int_cst (type, th);
1884 /* Need to do runtime comparison. */
1885 else if (maybe_gt (th, bound_epilog))
1887 *bound_scalar = upper_bound (*bound_scalar, th);
1888 return fold_build2 (MAX_EXPR, type,
1889 build_int_cst (type, th), niters);
1892 return niters;
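
/* A standalone sketch of the profitability clamp above, using plain
   int arithmetic; scalar_niters_sketch is an illustrative name and
   upper_bound corresponds to taking the maximum of two values.  */

static int
scalar_niters_sketch (int niters_prolog, int int_niters_prolog,
		      int bound_prolog, int bound_epilog, int th,
		      int check_profitability)
{
  int niters = niters_prolog + bound_epilog;
  if (check_profitability)
    {
      /* TH is the minimum niters of the vectorized loop; we compute the
	 maximum niters of the scalar loop.  */
      th--;
      if (int_niters_prolog >= 0)	/* peeling a constant number of times */
	return (int_niters_prolog + bound_epilog > th
		? int_niters_prolog + bound_epilog : th);
      if (th >= bound_prolog + bound_epilog)
	return th;			/* threshold dominates both bounds */
      if (th > bound_epilog)		/* otherwise compare at runtime */
	return niters > th ? niters : th;
    }
  return niters;
}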
1895 /* NITERS is the number of times that the original scalar loop executes
1896 after peeling. Work out the maximum number of iterations N that can
1897 be handled by the vectorized form of the loop and then either:
1899 a) set *STEP_VECTOR_PTR to the vectorization factor and generate:
1901 niters_vector = N
1903 b) set *STEP_VECTOR_PTR to one and generate:
1905 niters_vector = N / vf
1907 In both cases, store niters_vector in *NITERS_VECTOR_PTR and add
1908 any new statements on the loop preheader edge. NITERS_NO_OVERFLOW
1909 is true if NITERS doesn't overflow (i.e. if NITERS is always nonzero). */
1911 void
1912 vect_gen_vector_loop_niters (loop_vec_info loop_vinfo, tree niters,
1913 tree *niters_vector_ptr, tree *step_vector_ptr,
1914 bool niters_no_overflow)
1916 tree ni_minus_gap, var;
1917 tree niters_vector, step_vector, type = TREE_TYPE (niters);
1918 poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
1919 edge pe = loop_preheader_edge (LOOP_VINFO_LOOP (loop_vinfo));
1920 tree log_vf = NULL_TREE;
1922 /* If epilogue loop is required because of data accesses with gaps, we
1923 subtract one iteration from the total number of iterations here for
1924 correct calculation of RATIO. */
1925 if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo))
1927 ni_minus_gap = fold_build2 (MINUS_EXPR, type, niters,
1928 build_one_cst (type));
1929 if (!is_gimple_val (ni_minus_gap))
1931 var = create_tmp_var (type, "ni_gap");
1932 gimple *stmts = NULL;
1933 ni_minus_gap = force_gimple_operand (ni_minus_gap, &stmts,
1934 true, var);
1935 gsi_insert_seq_on_edge_immediate (pe, stmts);
1938 else
1939 ni_minus_gap = niters;
1941 unsigned HOST_WIDE_INT const_vf;
1942 if (vf.is_constant (&const_vf)
1943 && !LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
1945 /* Create: niters >> log2(vf) */
1946 /* If it's known that niters == number of latch executions + 1 doesn't
1947 overflow, we can generate niters >> log2(vf); otherwise we generate
1948 (niters - vf) >> log2(vf) + 1 by using the fact that we know ratio
1949 will be at least one. */
1950 log_vf = build_int_cst (type, exact_log2 (const_vf));
1951 if (niters_no_overflow)
1952 niters_vector = fold_build2 (RSHIFT_EXPR, type, ni_minus_gap, log_vf);
1953 else
1954 niters_vector
1955 = fold_build2 (PLUS_EXPR, type,
1956 fold_build2 (RSHIFT_EXPR, type,
1957 fold_build2 (MINUS_EXPR, type,
1958 ni_minus_gap,
1959 build_int_cst (type, vf)),
1960 log_vf),
1961 build_int_cst (type, 1));
1962 step_vector = build_one_cst (type);
1964 else
1966 niters_vector = ni_minus_gap;
1967 step_vector = build_int_cst (type, vf);
1970 if (!is_gimple_val (niters_vector))
1972 var = create_tmp_var (type, "bnd");
1973 gimple_seq stmts = NULL;
1974 niters_vector = force_gimple_operand (niters_vector, &stmts, true, var);
1975 gsi_insert_seq_on_edge_immediate (pe, stmts);
1976 /* The peeling algorithm guarantees that the vector loop bound is at least ONE,
1977 so we set range information to make the niters analyzer's life easier. */
1978 if (stmts != NULL && log_vf)
1979 set_range_info (niters_vector, VR_RANGE,
1980 wi::to_wide (build_int_cst (type, 1)),
1981 wi::to_wide (fold_build2 (RSHIFT_EXPR, type,
1982 TYPE_MAX_VALUE (type),
1983 log_vf)));
1985 *niters_vector_ptr = niters_vector;
1986 *step_vector_ptr = step_vector;
1988 return;
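
/* A minimal sketch, in plain unsigned arithmetic, of the two ways the
   vector iteration count is formed above for a constant power-of-two
   VF; niters_vector_sketch is an illustrative name only.  */

static unsigned int
niters_vector_sketch (unsigned int niters, unsigned int vf,
		      int niters_no_overflow)
{
  unsigned int log_vf = 0;
  while ((1u << log_vf) < vf)
    log_vf++;				/* exact_log2 (vf) */
  if (niters_no_overflow)
    return niters >> log_vf;		/* simply niters / vf */
  /* Otherwise avoid the potential overflow in "latch count + 1" by using
     the fact that the ratio is known to be at least one:
     (niters - vf) / vf + 1.  */
  return ((niters - vf) >> log_vf) + 1;
}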
1991 /* Given NITERS_VECTOR which is the number of iterations for vectorized
1992 loop specified by LOOP_VINFO after vectorization, compute the number
1993 of iterations before vectorization (niters_vector * vf) and store it
1994 to NITERS_VECTOR_MULT_VF_PTR. */
1996 static void
1997 vect_gen_vector_loop_niters_mult_vf (loop_vec_info loop_vinfo,
1998 tree niters_vector,
1999 tree *niters_vector_mult_vf_ptr)
2001 /* We should be using a step_vector of VF if VF is variable. */
2002 int vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo).to_constant ();
2003 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
2004 tree type = TREE_TYPE (niters_vector);
2005 tree log_vf = build_int_cst (type, exact_log2 (vf));
2006 basic_block exit_bb = single_exit (loop)->dest;
2008 gcc_assert (niters_vector_mult_vf_ptr != NULL);
2009 tree niters_vector_mult_vf = fold_build2 (LSHIFT_EXPR, type,
2010 niters_vector, log_vf);
2011 if (!is_gimple_val (niters_vector_mult_vf))
2013 tree var = create_tmp_var (type, "niters_vector_mult_vf");
2014 gimple_seq stmts = NULL;
2015 niters_vector_mult_vf = force_gimple_operand (niters_vector_mult_vf,
2016 &stmts, true, var);
2017 gimple_stmt_iterator gsi = gsi_start_bb (exit_bb);
2018 gsi_insert_seq_before (&gsi, stmts, GSI_SAME_STMT);
2020 *niters_vector_mult_vf_ptr = niters_vector_mult_vf;
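
/* The inverse computation, sketched with plain unsigned integers: the
   number of scalar iterations covered by the vector loop is
   niters_vector * vf, done as a shift for a constant power-of-two VF.
   The name niters_mult_vf_sketch is illustrative only.  */

static unsigned int
niters_mult_vf_sketch (unsigned int niters_vector, unsigned int log_vf)
{
  return niters_vector << log_vf;	/* niters_vector * vf */
}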
2023 /* Function slpeel_tree_duplicate_loop_to_edge_cfg duplicates FIRST/SECOND
2024 from SECOND/FIRST and puts it at the original loop's preheader/exit
2025 edge; the two loops are arranged as below:
2027 preheader_a:
2028 first_loop:
2029 header_a:
2030 i_1 = PHI<i_0, i_2>;
2032 i_2 = i_1 + 1;
2033 if (cond_a)
2034 goto latch_a;
2035 else
2036 goto between_bb;
2037 latch_a:
2038 goto header_a;
2040 between_bb:
2041 ;; i_x = PHI<i_2>; ;; LCSSA phi node to be created for FIRST,
2043 second_loop:
2044 header_b:
2045 i_3 = PHI<i_0, i_4>; ;; Use of i_0 to be replaced with i_x,
2046 or with i_2 if no LCSSA phi is created
2047 under condition of CREATE_LCSSA_FOR_IV_PHIS.
2049 i_4 = i_3 + 1;
2050 if (cond_b)
2051 goto latch_b;
2052 else
2053 goto exit_bb;
2054 latch_b:
2055 goto header_b;
2057 exit_bb:
2059 This function creates loop-closed SSA for the first loop and updates the
2060 second loop's PHI nodes by replacing the argument on the incoming edge with
2061 the result of the newly created lcssa PHI nodes. If CREATE_LCSSA_FOR_IV_PHIS
2062 is false, loop-closed SSA phis will only be created for the non-iv phis of
2063 the first loop.
2065 This function assumes the exit bb of the first loop is the preheader bb of
2066 the second loop, i.e. between_bb in the example code. With the PHIs updated,
2067 the second loop will execute the remaining iterations of the first. */
2069 static void
2070 slpeel_update_phi_nodes_for_loops (loop_vec_info loop_vinfo,
2071 struct loop *first, struct loop *second,
2072 bool create_lcssa_for_iv_phis)
2074 gphi_iterator gsi_update, gsi_orig;
2075 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
2077 edge first_latch_e = EDGE_SUCC (first->latch, 0);
2078 edge second_preheader_e = loop_preheader_edge (second);
2079 basic_block between_bb = single_exit (first)->dest;
2081 gcc_assert (between_bb == second_preheader_e->src);
2082 gcc_assert (single_pred_p (between_bb) && single_succ_p (between_bb));
2083 /* Either the first loop or the second is the loop to be vectorized. */
2084 gcc_assert (loop == first || loop == second);
2086 for (gsi_orig = gsi_start_phis (first->header),
2087 gsi_update = gsi_start_phis (second->header);
2088 !gsi_end_p (gsi_orig) && !gsi_end_p (gsi_update);
2089 gsi_next (&gsi_orig), gsi_next (&gsi_update))
2091 gphi *orig_phi = gsi_orig.phi ();
2092 gphi *update_phi = gsi_update.phi ();
2094 tree arg = PHI_ARG_DEF_FROM_EDGE (orig_phi, first_latch_e);
2095 /* Generate lcssa PHI node for the first loop. */
2096 gphi *vect_phi = (loop == first) ? orig_phi : update_phi;
2097 stmt_vec_info vect_phi_info = loop_vinfo->lookup_stmt (vect_phi);
2098 if (create_lcssa_for_iv_phis || !iv_phi_p (vect_phi_info))
2100 tree new_res = copy_ssa_name (PHI_RESULT (orig_phi));
2101 gphi *lcssa_phi = create_phi_node (new_res, between_bb);
2102 add_phi_arg (lcssa_phi, arg, single_exit (first), UNKNOWN_LOCATION);
2103 arg = new_res;
2106 /* Update PHI node in the second loop by replacing arg on the loop's
2107 incoming edge. */
2108 adjust_phi_and_debug_stmts (update_phi, second_preheader_e, arg);
2112 /* Function slpeel_add_loop_guard adds a guard that skips from the beginning
2113 of SKIP_LOOP to the beginning of UPDATE_LOOP. GUARD_EDGE and MERGE_EDGE
2114 are two pred edges of the merge point before UPDATE_LOOP. The two loops
2115 appear like below:
2117 guard_bb:
2118 if (cond)
2119 goto merge_bb;
2120 else
2121 goto skip_loop;
2123 skip_loop:
2124 header_a:
2125 i_1 = PHI<i_0, i_2>;
2127 i_2 = i_1 + 1;
2128 if (cond_a)
2129 goto latch_a;
2130 else
2131 goto exit_a;
2132 latch_a:
2133 goto header_a;
2135 exit_a:
2136 i_5 = PHI<i_2>;
2138 merge_bb:
2139 ;; PHI (i_x = PHI<i_0, i_5>) to be created at merge point.
2141 update_loop:
2142 header_b:
2143 i_3 = PHI<i_5, i_4>; ;; Use of i_5 to be replaced with i_x.
2145 i_4 = i_3 + 1;
2146 if (cond_b)
2147 goto latch_b;
2148 else
2149 goto exit_bb;
2150 latch_b:
2151 goto header_b;
2153 exit_bb:
2155 This function creates PHI nodes at merge_bb and replaces the use of i_5
2156 in the update_loop's PHI node with the result of the new PHI node. */
2158 static void
2159 slpeel_update_phi_nodes_for_guard1 (struct loop *skip_loop,
2160 struct loop *update_loop,
2161 edge guard_edge, edge merge_edge)
2163 source_location merge_loc, guard_loc;
2164 edge orig_e = loop_preheader_edge (skip_loop);
2165 edge update_e = loop_preheader_edge (update_loop);
2166 gphi_iterator gsi_orig, gsi_update;
2168 for ((gsi_orig = gsi_start_phis (skip_loop->header),
2169 gsi_update = gsi_start_phis (update_loop->header));
2170 !gsi_end_p (gsi_orig) && !gsi_end_p (gsi_update);
2171 gsi_next (&gsi_orig), gsi_next (&gsi_update))
2173 gphi *orig_phi = gsi_orig.phi ();
2174 gphi *update_phi = gsi_update.phi ();
2176 /* Generate new phi node at merge bb of the guard. */
2177 tree new_res = copy_ssa_name (PHI_RESULT (orig_phi));
2178 gphi *new_phi = create_phi_node (new_res, guard_edge->dest);
2180 /* Merge bb has two incoming edges: GUARD_EDGE and MERGE_EDGE. Set the
2181 args in NEW_PHI for these edges. */
2182 tree merge_arg = PHI_ARG_DEF_FROM_EDGE (update_phi, update_e);
2183 tree guard_arg = PHI_ARG_DEF_FROM_EDGE (orig_phi, orig_e);
2184 merge_loc = gimple_phi_arg_location_from_edge (update_phi, update_e);
2185 guard_loc = gimple_phi_arg_location_from_edge (orig_phi, orig_e);
2186 add_phi_arg (new_phi, merge_arg, merge_edge, merge_loc);
2187 add_phi_arg (new_phi, guard_arg, guard_edge, guard_loc);
2189 /* Update phi in UPDATE_PHI. */
2190 adjust_phi_and_debug_stmts (update_phi, update_e, new_res);
2194 /* LCSSA_PHI is an lcssa phi of the EPILOG loop, which is copied from LOOP;
2195 this function searches for the corresponding lcssa phi node in the exit
2196 bb of LOOP. If it is found, return the phi result; otherwise return
2197 NULL. */
2199 static tree
2200 find_guard_arg (struct loop *loop, struct loop *epilog ATTRIBUTE_UNUSED,
2201 gphi *lcssa_phi)
2203 gphi_iterator gsi;
2204 edge e = single_exit (loop);
2206 gcc_assert (single_pred_p (e->dest));
2207 for (gsi = gsi_start_phis (e->dest); !gsi_end_p (gsi); gsi_next (&gsi))
2209 gphi *phi = gsi.phi ();
2210 if (operand_equal_p (PHI_ARG_DEF (phi, 0),
2211 PHI_ARG_DEF (lcssa_phi, 0), 0))
2212 return PHI_RESULT (phi);
2214 return NULL_TREE;
2217 /* LOOP and EPILOG are two consecutive loops in CFG and EPILOG is copied
2218 from LOOP. Function slpeel_add_loop_guard adds guard skipping from a
2219 point between the two loops to the end of EPILOG. Edges GUARD_EDGE
2220 and MERGE_EDGE are the two pred edges of merge_bb at the end of EPILOG.
2221 The CFG looks like:
2223 loop:
2224 header_a:
2225 i_1 = PHI<i_0, i_2>;
2227 i_2 = i_1 + 1;
2228 if (cond_a)
2229 goto latch_a;
2230 else
2231 goto exit_a;
2232 latch_a:
2233 goto header_a;
2235 exit_a:
2237 guard_bb:
2238 if (cond)
2239 goto merge_bb;
2240 else
2241 goto epilog_loop;
2243 ;; fall_through_bb
2245 epilog_loop:
2246 header_b:
2247 i_3 = PHI<i_2, i_4>;
2249 i_4 = i_3 + 1;
2250 if (cond_b)
2251 goto latch_b;
2252 else
2253 goto merge_bb;
2254 latch_b:
2255 goto header_b;
2257 merge_bb:
2258 ; PHI node (i_y = PHI<i_2, i_4>) to be created at merge point.
2260 exit_bb:
2261 i_x = PHI<i_4>; ;Use of i_4 to be replaced with i_y in merge_bb.
2263 For each name used outside EPILOG (i.e. for each name that has an lcssa
2264 phi in exit_bb) we create a new PHI in merge_bb. The new PHI has two
2265 args corresponding to GUARD_EDGE and MERGE_EDGE. Arg for MERGE_EDGE is
2266 the arg of the original PHI in exit_bb, arg for GUARD_EDGE is defined
2267 by LOOP and is found in the exit bb of LOOP. Arg of the original PHI
2268 in exit_bb will also be updated. */
2270 static void
2271 slpeel_update_phi_nodes_for_guard2 (struct loop *loop, struct loop *epilog,
2272 edge guard_edge, edge merge_edge)
2274 gphi_iterator gsi;
2275 basic_block merge_bb = guard_edge->dest;
2277 gcc_assert (single_succ_p (merge_bb));
2278 edge e = single_succ_edge (merge_bb);
2279 basic_block exit_bb = e->dest;
2280 gcc_assert (single_pred_p (exit_bb));
2281 gcc_assert (single_pred (exit_bb) == single_exit (epilog)->dest);
2283 for (gsi = gsi_start_phis (exit_bb); !gsi_end_p (gsi); gsi_next (&gsi))
2285 gphi *update_phi = gsi.phi ();
2286 tree old_arg = PHI_ARG_DEF (update_phi, 0);
2287 /* This loop-closed-phi actually doesn't represent a use out of the
2288 loop - the phi arg is a constant. */
2289 if (TREE_CODE (old_arg) != SSA_NAME)
2290 continue;
2292 tree merge_arg = get_current_def (old_arg);
2293 if (!merge_arg)
2294 merge_arg = old_arg;
2296 tree guard_arg = find_guard_arg (loop, epilog, update_phi);
2297 /* If the var is live after loop but not a reduction, we simply
2298 use the old arg. */
2299 if (!guard_arg)
2300 guard_arg = old_arg;
2302 /* Create new phi node in MERGE_BB: */
2303 tree new_res = copy_ssa_name (PHI_RESULT (update_phi));
2304 gphi *merge_phi = create_phi_node (new_res, merge_bb);
2306 /* MERGE_BB has two incoming edges: GUARD_EDGE and MERGE_EDGE, Set
2307 the two PHI args in merge_phi for these edges. */
2308 add_phi_arg (merge_phi, merge_arg, merge_edge, UNKNOWN_LOCATION);
2309 add_phi_arg (merge_phi, guard_arg, guard_edge, UNKNOWN_LOCATION);
2311 /* Update the original phi in exit_bb. */
2312 adjust_phi_and_debug_stmts (update_phi, e, new_res);
2316 /* The EPILOG loop is duplicated from the original loop for vectorizing;
2317 the arg of its loop-closed ssa PHI needs to be updated. */
2319 static void
2320 slpeel_update_phi_nodes_for_lcssa (struct loop *epilog)
2322 gphi_iterator gsi;
2323 basic_block exit_bb = single_exit (epilog)->dest;
2325 gcc_assert (single_pred_p (exit_bb));
2326 edge e = EDGE_PRED (exit_bb, 0);
2327 for (gsi = gsi_start_phis (exit_bb); !gsi_end_p (gsi); gsi_next (&gsi))
2328 rename_use_op (PHI_ARG_DEF_PTR_FROM_EDGE (gsi.phi (), e));
2331 /* Function vect_do_peeling.
2333 Input:
2334 - LOOP_VINFO: Represent a loop to be vectorized, which looks like:
2336 preheader:
2337 LOOP:
2338 header_bb:
2339 loop_body
2340 if (exit_loop_cond) goto exit_bb
2341 else goto header_bb
2342 exit_bb:
2344 - NITERS: The number of iterations of the loop.
2345 - NITERSM1: The number of iterations of the loop's latch.
2346 - NITERS_NO_OVERFLOW: No overflow in computing NITERS.
2347 - TH, CHECK_PROFITABILITY: Threshold of niters to vectorize loop if
2348 CHECK_PROFITABILITY is true.
2349 Output:
2350 - *NITERS_VECTOR and *STEP_VECTOR describe how the main loop should
2351 iterate after vectorization; see vect_set_loop_condition for details.
2352 - *NITERS_VECTOR_MULT_VF_VAR is either null or an SSA name that
2353 should be set to the number of scalar iterations handled by the
2354 vector loop. The SSA name is only used on exit from the loop.
2356 This function peels prolog and epilog from the loop, adds guards skipping
2357 PROLOG and EPILOG for various conditions. As a result, the changed CFG
2358 would look like:
2360 guard_bb_1:
2361 if (prefer_scalar_loop) goto merge_bb_1
2362 else goto guard_bb_2
2364 guard_bb_2:
2365 if (skip_prolog) goto merge_bb_2
2366 else goto prolog_preheader
2368 prolog_preheader:
2369 PROLOG:
2370 prolog_header_bb:
2371 prolog_body
2372 if (exit_prolog_cond) goto prolog_exit_bb
2373 else goto prolog_header_bb
2374 prolog_exit_bb:
2376 merge_bb_2:
2378 vector_preheader:
2379 VECTOR LOOP:
2380 vector_header_bb:
2381 vector_body
2382 if (exit_vector_cond) goto vector_exit_bb
2383 else goto vector_header_bb
2384 vector_exit_bb:
2386 guard_bb_3:
2387 if (skip_epilog) goto merge_bb_3
2388 else goto epilog_preheader
2390 merge_bb_1:
2392 epilog_preheader:
2393 EPILOG:
2394 epilog_header_bb:
2395 epilog_body
2396 if (exit_epilog_cond) goto merge_bb_3
2397 else goto epilog_header_bb
2399 merge_bb_3:
2401 Note this function peels the prolog and epilog, and adds the guards,
2402 only when necessary.
2403 Returns created epilogue or NULL.
2405 TODO: Guard for prefer_scalar_loop should be emitted along with
2406 versioning conditions if loop versioning is needed. */
2409 struct loop *
2410 vect_do_peeling (loop_vec_info loop_vinfo, tree niters, tree nitersm1,
2411 tree *niters_vector, tree *step_vector,
2412 tree *niters_vector_mult_vf_var, int th,
2413 bool check_profitability, bool niters_no_overflow)
2415 edge e, guard_e;
2416 tree type = TREE_TYPE (niters), guard_cond;
2417 basic_block guard_bb, guard_to;
2418 profile_probability prob_prolog, prob_vector, prob_epilog;
2419 int estimated_vf;
2420 int prolog_peeling = 0;
2421 if (!vect_use_loop_mask_for_alignment_p (loop_vinfo))
2422 prolog_peeling = LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo);
2424 poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
2425 poly_uint64 bound_epilog = 0;
2426 if (!LOOP_VINFO_FULLY_MASKED_P (loop_vinfo)
2427 && LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo))
2428 bound_epilog += vf - 1;
2429 if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo))
2430 bound_epilog += 1;
2431 bool epilog_peeling = maybe_ne (bound_epilog, 0U);
2432 poly_uint64 bound_scalar = bound_epilog;
2434 if (!prolog_peeling && !epilog_peeling)
2435 return NULL;
2437 prob_vector = profile_probability::guessed_always ().apply_scale (9, 10);
2438 estimated_vf = vect_vf_for_cost (loop_vinfo);
2439 if (estimated_vf == 2)
2440 estimated_vf = 3;
2441 prob_prolog = prob_epilog = profile_probability::guessed_always ()
2442 .apply_scale (estimated_vf - 1, estimated_vf);
2444 struct loop *prolog, *epilog = NULL, *loop = LOOP_VINFO_LOOP (loop_vinfo);
2445 struct loop *first_loop = loop;
2446 bool irred_flag = loop_preheader_edge (loop)->flags & EDGE_IRREDUCIBLE_LOOP;
2447 create_lcssa_for_virtual_phi (loop);
2448 update_ssa (TODO_update_ssa_only_virtuals);
2450 if (MAY_HAVE_DEBUG_BIND_STMTS)
2452 gcc_assert (!adjust_vec.exists ());
2453 adjust_vec.create (32);
2455 initialize_original_copy_tables ();
2457 /* Record the anchor bb at which the guard should be placed if the scalar
2458 loop might be preferred. */
2459 basic_block anchor = loop_preheader_edge (loop)->src;
2461 /* Generate the number of iterations for the prolog loop. We do this here
2462 so that we can also get the upper bound on the number of iterations. */
2463 tree niters_prolog;
2464 int bound_prolog = 0;
2465 if (prolog_peeling)
2466 niters_prolog = vect_gen_prolog_loop_niters (loop_vinfo, anchor,
2467 &bound_prolog);
2468 else
2469 niters_prolog = build_int_cst (type, 0);
2471 /* Prolog loop may be skipped. */
2472 bool skip_prolog = (prolog_peeling != 0);
2473 /* Skip to the epilog if the scalar loop may be preferred. This is only
2474 needed when we peel for the epilog loop and when it hasn't already been
2475 checked by loop versioning. */
2476 bool skip_vector = (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
2477 ? maybe_lt (LOOP_VINFO_INT_NITERS (loop_vinfo),
2478 bound_prolog + bound_epilog)
2479 : !LOOP_REQUIRES_VERSIONING (loop_vinfo));
2480 /* The epilog loop must be executed if the number of iterations for the
2481 epilog loop is known at compile time; otherwise we need to add a check
2482 at the end of the vector loop and skip to the end of the epilog loop. */
2483 bool skip_epilog = (prolog_peeling < 0
2484 || !LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
2485 || !vf.is_constant ());
2486 /* PEELING_FOR_GAPS is special because epilog loop must be executed. */
2487 if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo))
2488 skip_epilog = false;
2490 if (skip_vector)
2492 split_edge (loop_preheader_edge (loop));
2494 /* Due to the order in which we peel prolog and epilog, we first
2495 propagate probability to the whole loop. The purpose is to
2496 avoid adjusting probabilities of both prolog and vector loops
2497 separately. Note in this case, the probability of epilog loop
2498 needs to be scaled back later. */
2499 basic_block bb_before_loop = loop_preheader_edge (loop)->src;
2500 if (prob_vector.initialized_p ())
2502 scale_bbs_frequencies (&bb_before_loop, 1, prob_vector);
2503 scale_loop_profile (loop, prob_vector, 0);
2507 dump_user_location_t loop_loc = find_loop_location (loop);
2508 struct loop *scalar_loop = LOOP_VINFO_SCALAR_LOOP (loop_vinfo);
2509 if (prolog_peeling)
2511 e = loop_preheader_edge (loop);
2512 if (!slpeel_can_duplicate_loop_p (loop, e))
2514 dump_printf_loc (MSG_MISSED_OPTIMIZATION, loop_loc,
2515 "loop can't be duplicated to preheader edge.\n");
2516 gcc_unreachable ();
2518 /* Peel prolog and put it on preheader edge of loop. */
2519 prolog = slpeel_tree_duplicate_loop_to_edge_cfg (loop, scalar_loop, e);
2520 if (!prolog)
2522 dump_printf_loc (MSG_MISSED_OPTIMIZATION, loop_loc,
2523 "slpeel_tree_duplicate_loop_to_edge_cfg failed.\n");
2524 gcc_unreachable ();
2526 slpeel_update_phi_nodes_for_loops (loop_vinfo, prolog, loop, true);
2527 first_loop = prolog;
2528 reset_original_copy_tables ();
2530 /* Update the number of iterations for prolog loop. */
2531 tree step_prolog = build_one_cst (TREE_TYPE (niters_prolog));
2532 vect_set_loop_condition (prolog, NULL, niters_prolog,
2533 step_prolog, NULL_TREE, false);
2535 /* Skip the prolog loop. */
2536 if (skip_prolog)
2538 guard_cond = fold_build2 (EQ_EXPR, boolean_type_node,
2539 niters_prolog, build_int_cst (type, 0));
2540 guard_bb = loop_preheader_edge (prolog)->src;
2541 basic_block bb_after_prolog = loop_preheader_edge (loop)->src;
2542 guard_to = split_edge (loop_preheader_edge (loop));
2543 guard_e = slpeel_add_loop_guard (guard_bb, guard_cond,
2544 guard_to, guard_bb,
2545 prob_prolog.invert (),
2546 irred_flag);
2547 e = EDGE_PRED (guard_to, 0);
2548 e = (e != guard_e ? e : EDGE_PRED (guard_to, 1));
2549 slpeel_update_phi_nodes_for_guard1 (prolog, loop, guard_e, e);
2551 scale_bbs_frequencies (&bb_after_prolog, 1, prob_prolog);
2552 scale_loop_profile (prolog, prob_prolog, bound_prolog);
2554 /* Update init address of DRs. */
2555 vect_update_inits_of_drs (loop_vinfo, niters_prolog, PLUS_EXPR);
2556 /* Update niters for vector loop. */
2557 LOOP_VINFO_NITERS (loop_vinfo)
2558 = fold_build2 (MINUS_EXPR, type, niters, niters_prolog);
2559 LOOP_VINFO_NITERSM1 (loop_vinfo)
2560 = fold_build2 (MINUS_EXPR, type,
2561 LOOP_VINFO_NITERSM1 (loop_vinfo), niters_prolog);
2562 bool new_var_p = false;
2563 niters = vect_build_loop_niters (loop_vinfo, &new_var_p);
2564 /* It's guaranteed that the vector loop bound before vectorization is at
2565 least VF, so set range information for the newly generated var. */
2566 if (new_var_p)
2567 set_range_info (niters, VR_RANGE,
2568 wi::to_wide (build_int_cst (type, vf)),
2569 wi::to_wide (TYPE_MAX_VALUE (type)));
2571 /* Prolog iterates at most bound_prolog times, latch iterates at
2572 most bound_prolog - 1 times. */
2573 record_niter_bound (prolog, bound_prolog - 1, false, true);
2574 delete_update_ssa ();
2575 adjust_vec_debug_stmts ();
2576 scev_reset ();
2579 if (epilog_peeling)
2581 e = single_exit (loop);
2582 if (!slpeel_can_duplicate_loop_p (loop, e))
2584 dump_printf_loc (MSG_MISSED_OPTIMIZATION, loop_loc,
2585 "loop can't be duplicated to exit edge.\n");
2586 gcc_unreachable ();
2588 /* Peel epilog and put it on exit edge of loop. */
2589 epilog = slpeel_tree_duplicate_loop_to_edge_cfg (loop, scalar_loop, e);
2590 if (!epilog)
2592 dump_printf_loc (MSG_MISSED_OPTIMIZATION, loop_loc,
2593 "slpeel_tree_duplicate_loop_to_edge_cfg failed.\n");
2594 gcc_unreachable ();
2596 slpeel_update_phi_nodes_for_loops (loop_vinfo, loop, epilog, false);
2598 /* The scalar version of the loop may be preferred. In this case, add a
2599 guard and skip to the epilog. Note this only happens when the number of
2600 iterations of the loop is unknown at compile time; otherwise the loop
2601 won't be vectorized. */
2602 if (skip_vector)
2604 /* Additional epilogue iteration is peeled if gap exists. */
2605 tree t = vect_gen_scalar_loop_niters (niters_prolog, prolog_peeling,
2606 bound_prolog, bound_epilog,
2607 th, &bound_scalar,
2608 check_profitability);
2609 /* Build guard against NITERSM1 since NITERS may overflow. */
2610 guard_cond = fold_build2 (LT_EXPR, boolean_type_node, nitersm1, t);
2611 guard_bb = anchor;
2612 guard_to = split_edge (loop_preheader_edge (epilog));
2613 guard_e = slpeel_add_loop_guard (guard_bb, guard_cond,
2614 guard_to, guard_bb,
2615 prob_vector.invert (),
2616 irred_flag);
2617 e = EDGE_PRED (guard_to, 0);
2618 e = (e != guard_e ? e : EDGE_PRED (guard_to, 1));
2619 slpeel_update_phi_nodes_for_guard1 (first_loop, epilog, guard_e, e);
2621 /* Simply propagate profile info from guard_bb to guard_to which is
2622 a merge point of control flow. */
2623 guard_to->count = guard_bb->count;
2625 /* Scale probability of epilog loop back.
2626 FIXME: We should avoid scaling down and back up. Profile may
2627 get lost if we scale down to 0. */
2628 basic_block *bbs = get_loop_body (epilog);
2629 for (unsigned int i = 0; i < epilog->num_nodes; i++)
2630 bbs[i]->count = bbs[i]->count.apply_scale
2631 (bbs[i]->count,
2632 bbs[i]->count.apply_probability
2633 (prob_vector));
2634 free (bbs);
2637 basic_block bb_before_epilog = loop_preheader_edge (epilog)->src;
2638 tree niters_vector_mult_vf;
2639 /* If the loop is peeled a non-zero constant number of times, niters now
2640 refers to orig_niters - prolog_peeling, which won't overflow even if
2641 orig_niters does. */
2642 niters_no_overflow |= (prolog_peeling > 0);
2643 vect_gen_vector_loop_niters (loop_vinfo, niters,
2644 niters_vector, step_vector,
2645 niters_no_overflow);
2646 if (!integer_onep (*step_vector))
2648 /* On exit from the loop we will have an easy way of calculating
2649 NITERS_VECTOR / STEP * STEP. Install a dummy definition
2650 until then. */
2651 niters_vector_mult_vf = make_ssa_name (TREE_TYPE (*niters_vector));
2652 SSA_NAME_DEF_STMT (niters_vector_mult_vf) = gimple_build_nop ();
2653 *niters_vector_mult_vf_var = niters_vector_mult_vf;
2655 else
2656 vect_gen_vector_loop_niters_mult_vf (loop_vinfo, *niters_vector,
2657 &niters_vector_mult_vf);
2658 /* Update IVs of original loop as if they were advanced by
2659 niters_vector_mult_vf steps. */
2660 gcc_checking_assert (vect_can_advance_ivs_p (loop_vinfo));
2661 edge update_e = skip_vector ? e : loop_preheader_edge (epilog);
2662 vect_update_ivs_after_vectorizer (loop_vinfo, niters_vector_mult_vf,
2663 update_e);
2665 if (skip_epilog)
2667 guard_cond = fold_build2 (EQ_EXPR, boolean_type_node,
2668 niters, niters_vector_mult_vf);
2669 guard_bb = single_exit (loop)->dest;
2670 guard_to = split_edge (single_exit (epilog));
2671 guard_e = slpeel_add_loop_guard (guard_bb, guard_cond, guard_to,
2672 skip_vector ? anchor : guard_bb,
2673 prob_epilog.invert (),
2674 irred_flag);
2675 slpeel_update_phi_nodes_for_guard2 (loop, epilog, guard_e,
2676 single_exit (epilog));
2677 /* Only need to handle basic block before epilog loop if it's not
2678 the guard_bb, which is the case when skip_vector is true. */
2679 if (guard_bb != bb_before_epilog)
2681 prob_epilog = prob_vector * prob_epilog + prob_vector.invert ();
2683 scale_bbs_frequencies (&bb_before_epilog, 1, prob_epilog);
2685 scale_loop_profile (epilog, prob_epilog, 0);
2687 else
2688 slpeel_update_phi_nodes_for_lcssa (epilog);
2690 unsigned HOST_WIDE_INT bound;
2691 if (bound_scalar.is_constant (&bound))
2693 gcc_assert (bound != 0);
2694 /* -1 to convert loop iterations to latch iterations. */
2695 record_niter_bound (epilog, bound - 1, false, true);
2698 delete_update_ssa ();
2699 adjust_vec_debug_stmts ();
2700 scev_reset ();
2702 adjust_vec.release ();
2703 free_original_copy_tables ();
2705 return epilog;
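
/* Purely illustrative sketch of the control flow that vect_do_peeling
   arranges, written as ordinary C.  scalar_body and vector_body are
   hypothetical stand-ins for the scalar and vectorized loop bodies;
   the skip_prolog/skip_epilog guards are omitted for brevity.  */

extern void scalar_body (int i);
extern void vector_body (int i);

static void
peeling_shape_sketch (int niters, int niters_prolog, int niters_vector,
		      int vf, int prefer_scalar_loop)
{
  int i = 0;
  if (prefer_scalar_loop)
    goto epilog;			/* guard_bb_1: run everything scalar */
  for (; i < niters_prolog; i++)	/* PROLOG: scalar, fixes alignment */
    scalar_body (i);
  for (int v = 0; v < niters_vector; v++, i += vf)
    vector_body (i);			/* VECTOR LOOP */
 epilog:
  for (; i < niters; i++)		/* EPILOG: remaining scalar iterations */
    scalar_body (i);
}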
2708 /* Function vect_create_cond_for_niters_checks.
2710 Create a conditional expression that represents the run-time checks for
2711 the loop's niter. The loop is guaranteed to terminate if the run-time
2712 checks hold.
2714 Input:
2715 COND_EXPR - input conditional expression. New conditions will be chained
2716 with a logical AND operation. If it is NULL, it is simply set
2717 to the niter check.
2718 LOOP_VINFO - field LOOP_VINFO_NITERS_ASSUMPTIONS contains the niter
2719 assumptions to be checked.
2721 Output:
2722 COND_EXPR - conditional expression.
2724 The returned COND_EXPR is the conditional expression to be used in the
2725 if statement that controls which version of the loop gets executed at
2726 runtime. */
2728 static void
2729 vect_create_cond_for_niters_checks (loop_vec_info loop_vinfo, tree *cond_expr)
2731 tree part_cond_expr = LOOP_VINFO_NITERS_ASSUMPTIONS (loop_vinfo);
2733 if (*cond_expr)
2734 *cond_expr = fold_build2 (TRUTH_AND_EXPR, boolean_type_node,
2735 *cond_expr, part_cond_expr);
2736 else
2737 *cond_expr = part_cond_expr;
2740 /* Set *COND_EXPR to a tree that is true when both the original *COND_EXPR
2741 and PART_COND_EXPR are true. Treat a null *COND_EXPR as "true". */
2743 static void
2744 chain_cond_expr (tree *cond_expr, tree part_cond_expr)
2746 if (*cond_expr)
2747 *cond_expr = fold_build2 (TRUTH_AND_EXPR, boolean_type_node,
2748 *cond_expr, part_cond_expr);
2749 else
2750 *cond_expr = part_cond_expr;
2753 /* Function vect_create_cond_for_align_checks.
2755 Create a conditional expression that represents the alignment checks for
2756 all of the data references (array element references) whose alignment must be
2757 checked at runtime.
2759 Input:
2760 COND_EXPR - input conditional expression. New conditions will be chained
2761 with logical AND operation.
2762 LOOP_VINFO - two fields of the loop information are used.
2763 LOOP_VINFO_PTR_MASK is the mask used to check the alignment.
2764 LOOP_VINFO_MAY_MISALIGN_STMTS contains the refs to be checked.
2766 Output:
2767 COND_EXPR_STMT_LIST - statements needed to construct the conditional
2768 expression.
2769 The returned value is the conditional expression to be used in the if
2770 statement that controls which version of the loop gets executed at runtime.
2772 The algorithm makes two assumptions:
2773 1) The number of bytes "n" in a vector is a power of 2.
2774 2) An address "a" is aligned if a%n is zero, and this
2775 test can be done as a&(n-1) == 0. For example, for 16
2776 byte vectors the test is a&0xf == 0. */
2778 static void
2779 vect_create_cond_for_align_checks (loop_vec_info loop_vinfo,
2780 tree *cond_expr,
2781 gimple_seq *cond_expr_stmt_list)
2783 vec<stmt_vec_info> may_misalign_stmts
2784 = LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo);
2785 stmt_vec_info stmt_info;
2786 int mask = LOOP_VINFO_PTR_MASK (loop_vinfo);
2787 tree mask_cst;
2788 unsigned int i;
2789 tree int_ptrsize_type;
2790 char tmp_name[20];
2791 tree or_tmp_name = NULL_TREE;
2792 tree and_tmp_name;
2793 gimple *and_stmt;
2794 tree ptrsize_zero;
2795 tree part_cond_expr;
2797 /* Check that mask is one less than a power of 2, i.e., mask is
2798 all zeros followed by all ones. */
2799 gcc_assert ((mask != 0) && ((mask & (mask+1)) == 0));
2801 int_ptrsize_type = signed_type_for (ptr_type_node);
2803 /* Create expression (mask & (dr_1 | ... | dr_n)) where dr_i is the address
2804 of the first vector of the i'th data reference. */
2806 FOR_EACH_VEC_ELT (may_misalign_stmts, i, stmt_info)
2808 gimple_seq new_stmt_list = NULL;
2809 tree addr_base;
2810 tree addr_tmp_name;
2811 tree new_or_tmp_name;
2812 gimple *addr_stmt, *or_stmt;
2813 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
2814 bool negative = tree_int_cst_compare
2815 (DR_STEP (STMT_VINFO_DATA_REF (stmt_info)), size_zero_node) < 0;
2816 tree offset = negative
2817 ? size_int (-TYPE_VECTOR_SUBPARTS (vectype) + 1) : size_zero_node;
2819 /* create: addr_tmp = (int)(address_of_first_vector) */
2820 addr_base =
2821 vect_create_addr_base_for_vector_ref (stmt_info, &new_stmt_list,
2822 offset);
2823 if (new_stmt_list != NULL)
2824 gimple_seq_add_seq (cond_expr_stmt_list, new_stmt_list);
2826 sprintf (tmp_name, "addr2int%d", i);
2827 addr_tmp_name = make_temp_ssa_name (int_ptrsize_type, NULL, tmp_name);
2828 addr_stmt = gimple_build_assign (addr_tmp_name, NOP_EXPR, addr_base);
2829 gimple_seq_add_stmt (cond_expr_stmt_list, addr_stmt);
2831 /* The addresses are ORed together. */
2833 if (or_tmp_name != NULL_TREE)
2835 /* create: or_tmp = or_tmp | addr_tmp */
2836 sprintf (tmp_name, "orptrs%d", i);
2837 new_or_tmp_name = make_temp_ssa_name (int_ptrsize_type, NULL, tmp_name);
2838 or_stmt = gimple_build_assign (new_or_tmp_name, BIT_IOR_EXPR,
2839 or_tmp_name, addr_tmp_name);
2840 gimple_seq_add_stmt (cond_expr_stmt_list, or_stmt);
2841 or_tmp_name = new_or_tmp_name;
2843 else
2844 or_tmp_name = addr_tmp_name;
2846 } /* end for i */
2848 mask_cst = build_int_cst (int_ptrsize_type, mask);
2850 /* create: and_tmp = or_tmp & mask */
2851 and_tmp_name = make_temp_ssa_name (int_ptrsize_type, NULL, "andmask");
2853 and_stmt = gimple_build_assign (and_tmp_name, BIT_AND_EXPR,
2854 or_tmp_name, mask_cst);
2855 gimple_seq_add_stmt (cond_expr_stmt_list, and_stmt);
2857 /* Make and_tmp the left operand of the conditional test against zero.
2858 If and_tmp has a nonzero bit then some address is unaligned. */
2859 ptrsize_zero = build_int_cst (int_ptrsize_type, 0);
2860 part_cond_expr = fold_build2 (EQ_EXPR, boolean_type_node,
2861 and_tmp_name, ptrsize_zero);
2862 chain_cond_expr (cond_expr, part_cond_expr);
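
/* A self-contained illustration of the runtime alignment test built
   above: OR the first-vector addresses together and test the low bits
   against the mask.  The helper name and the use of unsigned long as a
   pointer-sized integer are assumptions for the sketch, not GCC API.  */

static int
all_aligned_sketch (const unsigned long *addrs, unsigned int n,
		    unsigned long mask)
{
  unsigned long or_tmp = 0;
  for (unsigned int i = 0; i < n; i++)
    or_tmp |= addrs[i];		/* the addresses are ORed together */
  /* A nonzero bit under the mask means some address is unaligned;
     e.g. for 16-byte vectors mask == 0xf and the test is a & 0xf == 0
     for every address a.  */
  return (or_tmp & mask) == 0;
}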
2865 /* If LOOP_VINFO_CHECK_UNEQUAL_ADDRS contains <A1, B1>, ..., <An, Bn>,
2866 create a tree representation of: (&A1 != &B1) && ... && (&An != &Bn).
2867 Set *COND_EXPR to a tree that is true when both the original *COND_EXPR
2868 and this new condition are true. Treat a null *COND_EXPR as "true". */
2870 static void
2871 vect_create_cond_for_unequal_addrs (loop_vec_info loop_vinfo, tree *cond_expr)
2873 vec<vec_object_pair> pairs = LOOP_VINFO_CHECK_UNEQUAL_ADDRS (loop_vinfo);
2874 unsigned int i;
2875 vec_object_pair *pair;
2876 FOR_EACH_VEC_ELT (pairs, i, pair)
2878 tree addr1 = build_fold_addr_expr (pair->first);
2879 tree addr2 = build_fold_addr_expr (pair->second);
2880 tree part_cond_expr = fold_build2 (NE_EXPR, boolean_type_node,
2881 addr1, addr2);
2882 chain_cond_expr (cond_expr, part_cond_expr);
2886 /* Create an expression that is true when all lower-bound conditions for
2887 the vectorized loop are met. Chain this condition with *COND_EXPR. */
2889 static void
2890 vect_create_cond_for_lower_bounds (loop_vec_info loop_vinfo, tree *cond_expr)
2892 vec<vec_lower_bound> lower_bounds = LOOP_VINFO_LOWER_BOUNDS (loop_vinfo);
2893 for (unsigned int i = 0; i < lower_bounds.length (); ++i)
2895 tree expr = lower_bounds[i].expr;
2896 tree type = unsigned_type_for (TREE_TYPE (expr));
2897 expr = fold_convert (type, expr);
2898 poly_uint64 bound = lower_bounds[i].min_value;
2899 if (!lower_bounds[i].unsigned_p)
2901 expr = fold_build2 (PLUS_EXPR, type, expr,
2902 build_int_cstu (type, bound - 1));
2903 bound += bound - 1;
2905 tree part_cond_expr = fold_build2 (GE_EXPR, boolean_type_node, expr,
2906 build_int_cstu (type, bound));
2907 chain_cond_expr (cond_expr, part_cond_expr);
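
/* A sketch of the signed-to-unsigned rewrite above: the check
   "expr >= bound", where EXPR may be signed, is rewritten as the
   unsigned comparison "(unsigned) expr + (bound - 1) >= 2 * bound - 1"
   so that negative values of EXPR fail the test.  Plain unsigned int
   stands in for unsigned_type_for; the name is illustrative only.  */

static int
lower_bound_check_sketch (int expr, unsigned int bound, int expr_unsigned_p)
{
  unsigned int uexpr = (unsigned int) expr;
  if (!expr_unsigned_p)
    return uexpr + (bound - 1) >= 2 * bound - 1;
  return uexpr >= bound;
}

/* For example, bound == 4 and expr == -1 give 0xffffffff + 3 == 2,
   which is below 7, so the versioning condition correctly fails.  */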
2911 /* Function vect_create_cond_for_alias_checks.
2913 Create a conditional expression that represents the run-time checks for
2914 overlapping of address ranges represented by a list of data references
2915 relations passed as input.
2917 Input:
2918 COND_EXPR - input conditional expression. New conditions will be chained
2919 with logical AND operation. If it is NULL, then the function
2920 is used to return the number of alias checks.
2921 LOOP_VINFO - field LOOP_VINFO_MAY_ALIAS_STMTS contains the list of ddrs
2922 to be checked.
2924 Output:
2925 COND_EXPR - conditional expression.
2927 The returned COND_EXPR is the conditional expression to be used in the if
2928 statement that controls which version of the loop gets executed at runtime.
2931 void
2932 vect_create_cond_for_alias_checks (loop_vec_info loop_vinfo, tree * cond_expr)
2934 vec<dr_with_seg_len_pair_t> comp_alias_ddrs =
2935 LOOP_VINFO_COMP_ALIAS_DDRS (loop_vinfo);
2937 if (comp_alias_ddrs.is_empty ())
2938 return;
2940 create_runtime_alias_checks (LOOP_VINFO_LOOP (loop_vinfo),
2941 &comp_alias_ddrs, cond_expr);
2942 if (dump_enabled_p ())
2943 dump_printf_loc (MSG_NOTE, vect_location,
2944 "created %u versioning for alias checks.\n",
2945 comp_alias_ddrs.length ());
2949 /* Function vect_loop_versioning.
2951 If the loop has data references that may or may not be aligned or/and
2952 has data reference relations whose independence was not proven then
2953 two versions of the loop need to be generated, one which is vectorized
2954 and one which isn't. A test is then generated to control which of the
2955 loops is executed. The test checks for the alignment of all of the
2956 data references that may or may not be aligned. An additional
2957 sequence of runtime tests is generated for each pair of DDRs whose
2958 independence was not proven. The vectorized version of the loop is
2959 executed only if both the alias and alignment tests pass.
2961 The test generated to check which version of loop is executed
2962 is modified to also check for profitability as indicated by the
2963 cost model threshold TH.
2965 The versioning precondition(s) are placed in *COND_EXPR and
2966 *COND_EXPR_STMT_LIST. */
2968 void
2969 vect_loop_versioning (loop_vec_info loop_vinfo,
2970 unsigned int th, bool check_profitability,
2971 poly_uint64 versioning_threshold)
2973 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo), *nloop;
2974 struct loop *scalar_loop = LOOP_VINFO_SCALAR_LOOP (loop_vinfo);
2975 basic_block condition_bb;
2976 gphi_iterator gsi;
2977 gimple_stmt_iterator cond_exp_gsi;
2978 basic_block merge_bb;
2979 basic_block new_exit_bb;
2980 edge new_exit_e, e;
2981 gphi *orig_phi, *new_phi;
2982 tree cond_expr = NULL_TREE;
2983 gimple_seq cond_expr_stmt_list = NULL;
2984 tree arg;
2985 profile_probability prob = profile_probability::likely ();
2986 gimple_seq gimplify_stmt_list = NULL;
2987 tree scalar_loop_iters = LOOP_VINFO_NITERSM1 (loop_vinfo);
2988 bool version_align = LOOP_REQUIRES_VERSIONING_FOR_ALIGNMENT (loop_vinfo);
2989 bool version_alias = LOOP_REQUIRES_VERSIONING_FOR_ALIAS (loop_vinfo);
2990 bool version_niter = LOOP_REQUIRES_VERSIONING_FOR_NITERS (loop_vinfo);
2992 if (check_profitability)
2993 cond_expr = fold_build2 (GE_EXPR, boolean_type_node, scalar_loop_iters,
2994 build_int_cst (TREE_TYPE (scalar_loop_iters),
2995 th - 1));
2996 if (maybe_ne (versioning_threshold, 0U))
2998 tree expr = fold_build2 (GE_EXPR, boolean_type_node, scalar_loop_iters,
2999 build_int_cst (TREE_TYPE (scalar_loop_iters),
3000 versioning_threshold - 1));
3001 if (cond_expr)
3002 cond_expr = fold_build2 (BIT_AND_EXPR, boolean_type_node,
3003 expr, cond_expr);
3004 else
3005 cond_expr = expr;
3008 if (version_niter)
3009 vect_create_cond_for_niters_checks (loop_vinfo, &cond_expr);
3011 if (cond_expr)
3012 cond_expr = force_gimple_operand_1 (cond_expr, &cond_expr_stmt_list,
3013 is_gimple_condexpr, NULL_TREE);
3015 if (version_align)
3016 vect_create_cond_for_align_checks (loop_vinfo, &cond_expr,
3017 &cond_expr_stmt_list);
3019 if (version_alias)
3021 vect_create_cond_for_unequal_addrs (loop_vinfo, &cond_expr);
3022 vect_create_cond_for_lower_bounds (loop_vinfo, &cond_expr);
3023 vect_create_cond_for_alias_checks (loop_vinfo, &cond_expr);
3026 cond_expr = force_gimple_operand_1 (unshare_expr (cond_expr),
3027 &gimplify_stmt_list,
3028 is_gimple_condexpr, NULL_TREE);
3029 gimple_seq_add_seq (&cond_expr_stmt_list, gimplify_stmt_list);
3031 initialize_original_copy_tables ();
3032 if (scalar_loop)
3034 edge scalar_e;
3035 basic_block preheader, scalar_preheader;
3037 /* We don't want to scale SCALAR_LOOP's frequencies; we need to
3038 scale LOOP's frequencies instead. */
3039 nloop = loop_version (scalar_loop, cond_expr, &condition_bb,
3040 prob, prob.invert (), prob, prob.invert (), true);
3041 scale_loop_frequencies (loop, prob);
3042 /* CONDITION_BB was created above SCALAR_LOOP's preheader,
3043 while we need to move it above LOOP's preheader. */
3044 e = loop_preheader_edge (loop);
3045 scalar_e = loop_preheader_edge (scalar_loop);
3046 /* The vector loop preheader might not be empty, since new
3047 invariants could have been created while analyzing the loop. */
3048 gcc_assert (single_pred_p (e->src));
3049 gcc_assert (empty_block_p (scalar_e->src)
3050 && single_pred_p (scalar_e->src));
3051 gcc_assert (single_pred_p (condition_bb));
3052 preheader = e->src;
3053 scalar_preheader = scalar_e->src;
3054 scalar_e = find_edge (condition_bb, scalar_preheader);
3055 e = single_pred_edge (preheader);
3056 redirect_edge_and_branch_force (single_pred_edge (condition_bb),
3057 scalar_preheader);
3058 redirect_edge_and_branch_force (scalar_e, preheader);
3059 redirect_edge_and_branch_force (e, condition_bb);
3060 set_immediate_dominator (CDI_DOMINATORS, condition_bb,
3061 single_pred (condition_bb));
3062 set_immediate_dominator (CDI_DOMINATORS, scalar_preheader,
3063 single_pred (scalar_preheader));
3064 set_immediate_dominator (CDI_DOMINATORS, preheader,
3065 condition_bb);
3067 else
3068 nloop = loop_version (loop, cond_expr, &condition_bb,
3069 prob, prob.invert (), prob, prob.invert (), true);
3071 if (version_niter)
3073 /* The versioned loop could be infinite, we need to clear existing
3074 niter information which is copied from the original loop. */
3075 gcc_assert (loop_constraint_set_p (loop, LOOP_C_FINITE));
3076 vect_free_loop_info_assumptions (nloop);
3077 /* And set constraint LOOP_C_INFINITE for niter analyzer. */
3078 loop_constraint_set (loop, LOOP_C_INFINITE);
3081 if (LOCATION_LOCUS (vect_location.get_location_t ()) != UNKNOWN_LOCATION
3082 && dump_enabled_p ())
3084 if (version_alias)
3085 dump_printf_loc (MSG_OPTIMIZED_LOCATIONS, vect_location,
3086 "loop versioned for vectorization because of "
3087 "possible aliasing\n");
3088 if (version_align)
3089 dump_printf_loc (MSG_OPTIMIZED_LOCATIONS, vect_location,
3090 "loop versioned for vectorization to enhance "
3091 "alignment\n");
3094 free_original_copy_tables ();
3096 /* Loop versioning violates an assumption we try to maintain during
3097 vectorization - that the loop exit block has a single predecessor.
3098 After versioning, the exit block of both loop versions is the same
3099 basic block (i.e. it has two predecessors). In order to simplify the
3100 following transformations in the vectorizer, we fix this situation
3101 here by adding a new (empty) block on the exit edge of the loop,
3102 with the proper loop-exit phis to maintain loop-closed form.
3103 If loop versioning wasn't done from loop but from scalar_loop instead,
3104 merge_bb will already have just a single predecessor. */
3106 merge_bb = single_exit (loop)->dest;
3107 if (scalar_loop == NULL || EDGE_COUNT (merge_bb->preds) >= 2)
3109 gcc_assert (EDGE_COUNT (merge_bb->preds) >= 2);
3110 new_exit_bb = split_edge (single_exit (loop));
3111 new_exit_e = single_exit (loop);
3112 e = EDGE_SUCC (new_exit_bb, 0);
3114 for (gsi = gsi_start_phis (merge_bb); !gsi_end_p (gsi); gsi_next (&gsi))
3116 tree new_res;
3117 orig_phi = gsi.phi ();
3118 new_res = copy_ssa_name (PHI_RESULT (orig_phi));
3119 new_phi = create_phi_node (new_res, new_exit_bb);
3120 arg = PHI_ARG_DEF_FROM_EDGE (orig_phi, e);
3121 add_phi_arg (new_phi, arg, new_exit_e,
3122 gimple_phi_arg_location_from_edge (orig_phi, e));
3123 adjust_phi_and_debug_stmts (orig_phi, e, PHI_RESULT (new_phi));
3127 /* End loop-exit-fixes after versioning. */
3129 if (cond_expr_stmt_list)
3131 cond_exp_gsi = gsi_last_bb (condition_bb);
3132 gsi_insert_seq_before (&cond_exp_gsi, cond_expr_stmt_list,
3133 GSI_SAME_STMT);
3135 update_ssa (TODO_update_ssa);
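
/* An illustrative, non-GCC sketch of what loop versioning produces:
   a single runtime condition, chaining the profitability, niter,
   alignment and alias checks, that guards the copy which will be
   vectorized.  vectorized_loop and scalar_version are hypothetical
   stand-ins for the two loop copies.  */

extern void vectorized_loop (void);
extern void scalar_version (void);

static void
loop_versioning_shape_sketch (int versioning_cond)
{
  if (versioning_cond)		/* cost threshold && niters && align && alias */
    vectorized_loop ();		/* the copy the vectorizer will transform */
  else
    scalar_version ();		/* the untouched scalar copy */
}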