gcc/tree-vect-loop-manip.c
1 /* Vectorizer Specific Loop Manipulations
2 Copyright (C) 2003-2018 Free Software Foundation, Inc.
3 Contributed by Dorit Naishlos <dorit@il.ibm.com>
4 and Ira Rosen <irar@il.ibm.com>
6 This file is part of GCC.
8 GCC is free software; you can redistribute it and/or modify it under
9 the terms of the GNU General Public License as published by the Free
10 Software Foundation; either version 3, or (at your option) any later
11 version.
13 GCC is distributed in the hope that it will be useful, but WITHOUT ANY
14 WARRANTY; without even the implied warranty of MERCHANTABILITY or
15 FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
16 for more details.
18 You should have received a copy of the GNU General Public License
19 along with GCC; see the file COPYING3. If not see
20 <http://www.gnu.org/licenses/>. */
22 #include "config.h"
23 #include "system.h"
24 #include "coretypes.h"
25 #include "backend.h"
26 #include "tree.h"
27 #include "gimple.h"
28 #include "cfghooks.h"
29 #include "tree-pass.h"
30 #include "ssa.h"
31 #include "fold-const.h"
32 #include "cfganal.h"
33 #include "gimplify.h"
34 #include "gimple-iterator.h"
35 #include "gimplify-me.h"
36 #include "tree-cfg.h"
37 #include "tree-ssa-loop-manip.h"
38 #include "tree-into-ssa.h"
39 #include "tree-ssa.h"
40 #include "cfgloop.h"
41 #include "tree-scalar-evolution.h"
42 #include "tree-vectorizer.h"
43 #include "tree-ssa-loop-ivopts.h"
44 #include "gimple-fold.h"
45 #include "tree-ssa-loop-niter.h"
46 #include "internal-fn.h"
47 #include "stor-layout.h"
48 #include "optabs-query.h"
49 #include "vec-perm-indices.h"
51 /*************************************************************************
52 Simple Loop Peeling Utilities
54 Utilities to support loop peeling for vectorization purposes.
55 *************************************************************************/
58 /* Renames the use *OP_P. */
60 static void
61 rename_use_op (use_operand_p op_p)
63 tree new_name;
65 if (TREE_CODE (USE_FROM_PTR (op_p)) != SSA_NAME)
66 return;
68 new_name = get_current_def (USE_FROM_PTR (op_p));
70 /* Something defined outside of the loop. */
71 if (!new_name)
72 return;
74 /* An ordinary ssa name defined in the loop. */
76 SET_USE (op_p, new_name);
80 /* Renames the variables in basic block BB. Allow renaming of PHI arguments
81 on edges incoming from outer-block header if RENAME_FROM_OUTER_LOOP is
82 true. */
84 static void
85 rename_variables_in_bb (basic_block bb, bool rename_from_outer_loop)
87 gimple *stmt;
88 use_operand_p use_p;
89 ssa_op_iter iter;
90 edge e;
91 edge_iterator ei;
92 struct loop *loop = bb->loop_father;
93 struct loop *outer_loop = NULL;
95 if (rename_from_outer_loop)
97 gcc_assert (loop);
98 outer_loop = loop_outer (loop);
101 for (gimple_stmt_iterator gsi = gsi_start_bb (bb); !gsi_end_p (gsi);
102 gsi_next (&gsi))
104 stmt = gsi_stmt (gsi);
105 FOR_EACH_SSA_USE_OPERAND (use_p, stmt, iter, SSA_OP_ALL_USES)
106 rename_use_op (use_p);
109 FOR_EACH_EDGE (e, ei, bb->preds)
111 if (!flow_bb_inside_loop_p (loop, e->src))
113 if (!rename_from_outer_loop)
114 continue;
115 if (e->src != outer_loop->header)
117 if (outer_loop->inner->next)
 119                 /* If outer_loop has two inner loops, allow there to
 120                    be an extra basic block which decides, via
 121                    LOOP_VECTORIZED, which of the two loops to use.  */
122 if (!single_pred_p (e->src)
123 || single_pred (e->src) != outer_loop->header)
124 continue;
128 for (gphi_iterator gsi = gsi_start_phis (bb); !gsi_end_p (gsi);
129 gsi_next (&gsi))
130 rename_use_op (PHI_ARG_DEF_PTR_FROM_EDGE (gsi.phi (), e));
135 struct adjust_info
137 tree from, to;
138 basic_block bb;
141 /* A stack of values to be adjusted in debug stmts. We have to
142 process them LIFO, so that the closest substitution applies. If we
143 processed them FIFO, without the stack, we might substitute uses
144 with a PHI DEF that would soon become non-dominant, and when we got
145 to the suitable one, it wouldn't have anything to substitute any
146 more. */
147 static vec<adjust_info, va_heap> adjust_vec;
149 /* Adjust any debug stmts that referenced AI->from values to use the
150 loop-closed AI->to, if the references are dominated by AI->bb and
151 not by the definition of AI->from. */
153 static void
154 adjust_debug_stmts_now (adjust_info *ai)
156 basic_block bbphi = ai->bb;
157 tree orig_def = ai->from;
158 tree new_def = ai->to;
159 imm_use_iterator imm_iter;
160 gimple *stmt;
161 basic_block bbdef = gimple_bb (SSA_NAME_DEF_STMT (orig_def));
163 gcc_assert (dom_info_available_p (CDI_DOMINATORS));
165 /* Adjust any debug stmts that held onto non-loop-closed
166 references. */
167 FOR_EACH_IMM_USE_STMT (stmt, imm_iter, orig_def)
169 use_operand_p use_p;
170 basic_block bbuse;
172 if (!is_gimple_debug (stmt))
173 continue;
175 gcc_assert (gimple_debug_bind_p (stmt));
177 bbuse = gimple_bb (stmt);
179 if ((bbuse == bbphi
180 || dominated_by_p (CDI_DOMINATORS, bbuse, bbphi))
181 && !(bbuse == bbdef
182 || dominated_by_p (CDI_DOMINATORS, bbuse, bbdef)))
184 if (new_def)
185 FOR_EACH_IMM_USE_ON_STMT (use_p, imm_iter)
186 SET_USE (use_p, new_def);
187 else
189 gimple_debug_bind_reset_value (stmt);
190 update_stmt (stmt);
196 /* Adjust debug stmts as scheduled before. */
198 static void
199 adjust_vec_debug_stmts (void)
201 if (!MAY_HAVE_DEBUG_BIND_STMTS)
202 return;
204 gcc_assert (adjust_vec.exists ());
206 while (!adjust_vec.is_empty ())
208 adjust_debug_stmts_now (&adjust_vec.last ());
209 adjust_vec.pop ();
213 /* Adjust any debug stmts that referenced FROM values to use the
214 loop-closed TO, if the references are dominated by BB and not by
215 the definition of FROM. If adjust_vec is non-NULL, adjustments
216 will be postponed until adjust_vec_debug_stmts is called. */
218 static void
219 adjust_debug_stmts (tree from, tree to, basic_block bb)
221 adjust_info ai;
223 if (MAY_HAVE_DEBUG_BIND_STMTS
224 && TREE_CODE (from) == SSA_NAME
225 && ! SSA_NAME_IS_DEFAULT_DEF (from)
226 && ! virtual_operand_p (from))
228 ai.from = from;
229 ai.to = to;
230 ai.bb = bb;
232 if (adjust_vec.exists ())
233 adjust_vec.safe_push (ai);
234 else
235 adjust_debug_stmts_now (&ai);
239 /* Change E's phi arg in UPDATE_PHI to NEW_DEF, and record information
240 to adjust any debug stmts that referenced the old phi arg,
241 presumably non-loop-closed references left over from other
242 transformations. */
244 static void
245 adjust_phi_and_debug_stmts (gimple *update_phi, edge e, tree new_def)
247 tree orig_def = PHI_ARG_DEF_FROM_EDGE (update_phi, e);
249 SET_PHI_ARG_DEF (update_phi, e->dest_idx, new_def);
251 if (MAY_HAVE_DEBUG_BIND_STMTS)
252 adjust_debug_stmts (orig_def, PHI_RESULT (update_phi),
253 gimple_bb (update_phi));
256 /* Define one loop mask MASK from loop LOOP. INIT_MASK is the value that
257 the mask should have during the first iteration and NEXT_MASK is the
258 value that it should have on subsequent iterations. */
260 static void
261 vect_set_loop_mask (struct loop *loop, tree mask, tree init_mask,
262 tree next_mask)
264 gphi *phi = create_phi_node (mask, loop->header);
265 add_phi_arg (phi, init_mask, loop_preheader_edge (loop), UNKNOWN_LOCATION);
266 add_phi_arg (phi, next_mask, loop_latch_edge (loop), UNKNOWN_LOCATION);
269 /* Add SEQ to the end of LOOP's preheader block. */
271 static void
272 add_preheader_seq (struct loop *loop, gimple_seq seq)
274 if (seq)
276 edge pe = loop_preheader_edge (loop);
277 basic_block new_bb = gsi_insert_seq_on_edge_immediate (pe, seq);
278 gcc_assert (!new_bb);
282 /* Add SEQ to the beginning of LOOP's header block. */
284 static void
285 add_header_seq (struct loop *loop, gimple_seq seq)
287 if (seq)
289 gimple_stmt_iterator gsi = gsi_after_labels (loop->header);
290 gsi_insert_seq_before (&gsi, seq, GSI_SAME_STMT);
294 /* Return true if the target can interleave elements of two vectors.
295 OFFSET is 0 if the first half of the vectors should be interleaved
296 or 1 if the second half should. When returning true, store the
297 associated permutation in INDICES. */
299 static bool
300 interleave_supported_p (vec_perm_indices *indices, tree vectype,
301 unsigned int offset)
303 poly_uint64 nelts = TYPE_VECTOR_SUBPARTS (vectype);
304 poly_uint64 base = exact_div (nelts, 2) * offset;
305 vec_perm_builder sel (nelts, 2, 3);
306 for (unsigned int i = 0; i < 3; ++i)
308 sel.quick_push (base + i);
309 sel.quick_push (base + i + nelts);
311 indices->new_vector (sel, 2, nelts);
312 return can_vec_perm_const_p (TYPE_MODE (vectype), *indices);
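/* A minimal standalone sketch, not part of GCC, of the permutation that
   interleave_supported_p asks the target for, expanded into plain integer
   indices.  Even result lanes select from the first input vector, odd lanes
   from the second (indices >= NELTS refer to the second input).  For
   NELTS == 8: offset 0 gives 0 8 1 9 2 10 3 11 and offset 1 gives
   4 12 5 13 6 14 7 15.  */

static void
expand_interleave_indices (unsigned int nelts, unsigned int offset,
                           unsigned int *out)
{
  unsigned int base = (nelts / 2) * offset;
  for (unsigned int i = 0; i < nelts / 2; ++i)
    {
      out[2 * i] = base + i;
      out[2 * i + 1] = base + i + nelts;
    }
}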
315 /* Try to use permutes to define the masks in DEST_RGM using the masks
316 in SRC_RGM, given that the former has twice as many masks as the
317 latter. Return true on success, adding any new statements to SEQ. */
319 static bool
320 vect_maybe_permute_loop_masks (gimple_seq *seq, rgroup_masks *dest_rgm,
321 rgroup_masks *src_rgm)
323 tree src_masktype = src_rgm->mask_type;
324 tree dest_masktype = dest_rgm->mask_type;
325 machine_mode src_mode = TYPE_MODE (src_masktype);
326 if (dest_rgm->max_nscalars_per_iter <= src_rgm->max_nscalars_per_iter
327 && optab_handler (vec_unpacku_hi_optab, src_mode) != CODE_FOR_nothing
328 && optab_handler (vec_unpacku_lo_optab, src_mode) != CODE_FOR_nothing)
330 /* Unpacking the source masks gives at least as many mask bits as
331 we need. We can then VIEW_CONVERT any excess bits away. */
332 tree unpack_masktype = vect_halve_mask_nunits (src_masktype);
333 for (unsigned int i = 0; i < dest_rgm->masks.length (); ++i)
335 tree src = src_rgm->masks[i / 2];
336 tree dest = dest_rgm->masks[i];
337 tree_code code = (i & 1 ? VEC_UNPACK_HI_EXPR
338 : VEC_UNPACK_LO_EXPR);
339 gassign *stmt;
340 if (dest_masktype == unpack_masktype)
341 stmt = gimple_build_assign (dest, code, src);
342 else
344 tree temp = make_ssa_name (unpack_masktype);
345 stmt = gimple_build_assign (temp, code, src);
346 gimple_seq_add_stmt (seq, stmt);
347 stmt = gimple_build_assign (dest, VIEW_CONVERT_EXPR,
348 build1 (VIEW_CONVERT_EXPR,
349 dest_masktype, temp));
351 gimple_seq_add_stmt (seq, stmt);
353 return true;
355 vec_perm_indices indices[2];
356 if (dest_masktype == src_masktype
357 && interleave_supported_p (&indices[0], src_masktype, 0)
358 && interleave_supported_p (&indices[1], src_masktype, 1))
360 /* The destination requires twice as many mask bits as the source, so
361 we can use interleaving permutes to double up the number of bits. */
362 tree masks[2];
363 for (unsigned int i = 0; i < 2; ++i)
364 masks[i] = vect_gen_perm_mask_checked (src_masktype, indices[i]);
365 for (unsigned int i = 0; i < dest_rgm->masks.length (); ++i)
367 tree src = src_rgm->masks[i / 2];
368 tree dest = dest_rgm->masks[i];
369 gimple *stmt = gimple_build_assign (dest, VEC_PERM_EXPR,
370 src, src, masks[i & 1]);
371 gimple_seq_add_stmt (seq, stmt);
373 return true;
375 return false;
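/* A minimal standalone sketch, not part of GCC, of the interleaving fallback
   above, modelling each mask as an array of 0/1 bytes.  Permuting SRC with
   itself using the indices from interleave_supported_p duplicates every
   source element into two adjacent destination lanes, so one source mask
   yields two destination masks: the even-numbered one from the low half of
   SRC and the odd-numbered one from the high half.  */

static void
interleave_mask_with_itself (const unsigned char *src, unsigned char *dest,
                             unsigned int nelts, unsigned int offset)
{
  unsigned int base = (nelts / 2) * offset;
  for (unsigned int i = 0; i < nelts / 2; ++i)
    {
      dest[2 * i] = src[base + i];
      dest[2 * i + 1] = src[base + i];
    }
}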
378 /* Helper for vect_set_loop_condition_masked. Generate definitions for
379 all the masks in RGM and return a mask that is nonzero when the loop
380 needs to iterate. Add any new preheader statements to PREHEADER_SEQ.
381 Use LOOP_COND_GSI to insert code before the exit gcond.
383 RGM belongs to loop LOOP. The loop originally iterated NITERS
384 times and has been vectorized according to LOOP_VINFO. Each iteration
385 of the vectorized loop handles VF iterations of the scalar loop.
387 If NITERS_SKIP is nonnull, the first iteration of the vectorized loop
388 starts with NITERS_SKIP dummy iterations of the scalar loop before
389 the real work starts. The mask elements for these dummy iterations
390 must be 0, to ensure that the extra iterations do not have an effect.
392 It is known that:
394 NITERS * RGM->max_nscalars_per_iter
396 does not overflow. However, MIGHT_WRAP_P says whether an induction
397 variable that starts at 0 and has step:
399 VF * RGM->max_nscalars_per_iter
401 might overflow before hitting a value above:
403 (NITERS + NITERS_SKIP) * RGM->max_nscalars_per_iter
405 This means that we cannot guarantee that such an induction variable
406 would ever hit a value that produces a set of all-false masks for RGM. */
408 static tree
409 vect_set_loop_masks_directly (struct loop *loop, loop_vec_info loop_vinfo,
410 gimple_seq *preheader_seq,
411 gimple_stmt_iterator loop_cond_gsi,
412 rgroup_masks *rgm, tree vf,
413 tree niters, tree niters_skip,
414 bool might_wrap_p)
416 tree compare_type = LOOP_VINFO_MASK_COMPARE_TYPE (loop_vinfo);
417 tree mask_type = rgm->mask_type;
418 unsigned int nscalars_per_iter = rgm->max_nscalars_per_iter;
419 poly_uint64 nscalars_per_mask = TYPE_VECTOR_SUBPARTS (mask_type);
421 /* Calculate the maximum number of scalar values that the rgroup
422 handles in total, the number that it handles for each iteration
423 of the vector loop, and the number that it should skip during the
424 first iteration of the vector loop. */
425 tree nscalars_total = niters;
426 tree nscalars_step = vf;
427 tree nscalars_skip = niters_skip;
428 if (nscalars_per_iter != 1)
430 /* We checked before choosing to use a fully-masked loop that these
431 multiplications don't overflow. */
432 tree factor = build_int_cst (compare_type, nscalars_per_iter);
433 nscalars_total = gimple_build (preheader_seq, MULT_EXPR, compare_type,
434 nscalars_total, factor);
435 nscalars_step = gimple_build (preheader_seq, MULT_EXPR, compare_type,
436 nscalars_step, factor);
437 if (nscalars_skip)
438 nscalars_skip = gimple_build (preheader_seq, MULT_EXPR, compare_type,
439 nscalars_skip, factor);
442 /* Create an induction variable that counts the number of scalars
443 processed. */
444 tree index_before_incr, index_after_incr;
445 gimple_stmt_iterator incr_gsi;
446 bool insert_after;
447 tree zero_index = build_int_cst (compare_type, 0);
448 standard_iv_increment_position (loop, &incr_gsi, &insert_after);
449 create_iv (zero_index, nscalars_step, NULL_TREE, loop, &incr_gsi,
450 insert_after, &index_before_incr, &index_after_incr);
452 tree test_index, test_limit, first_limit;
453 gimple_stmt_iterator *test_gsi;
454 if (might_wrap_p)
456 /* In principle the loop should stop iterating once the incremented
457 IV reaches a value greater than or equal to:
459 NSCALARS_TOTAL +[infinite-prec] NSCALARS_SKIP
461 However, there's no guarantee that this addition doesn't overflow
462 the comparison type, or that the IV hits a value above it before
463 wrapping around. We therefore adjust the limit down by one
464 IV step:
466 (NSCALARS_TOTAL +[infinite-prec] NSCALARS_SKIP)
467 -[infinite-prec] NSCALARS_STEP
469 and compare the IV against this limit _before_ incrementing it.
470 Since the comparison type is unsigned, we actually want the
471 subtraction to saturate at zero:
473 (NSCALARS_TOTAL +[infinite-prec] NSCALARS_SKIP)
474 -[sat] NSCALARS_STEP
476 And since NSCALARS_SKIP < NSCALARS_STEP, we can reassociate this as:
478 NSCALARS_TOTAL -[sat] (NSCALARS_STEP - NSCALARS_SKIP)
480 where the rightmost subtraction can be done directly in
481 COMPARE_TYPE. */
482 test_index = index_before_incr;
483 tree adjust = nscalars_step;
484 if (nscalars_skip)
485 adjust = gimple_build (preheader_seq, MINUS_EXPR, compare_type,
486 adjust, nscalars_skip);
487 test_limit = gimple_build (preheader_seq, MAX_EXPR, compare_type,
488 nscalars_total, adjust);
489 test_limit = gimple_build (preheader_seq, MINUS_EXPR, compare_type,
490 test_limit, adjust);
491 test_gsi = &incr_gsi;
493 /* Get a safe limit for the first iteration. */
494 if (nscalars_skip)
496 /* The first vector iteration can handle at most NSCALARS_STEP
497 scalars. NSCALARS_STEP <= CONST_LIMIT, and adding
498 NSCALARS_SKIP to that cannot overflow. */
499 tree const_limit = build_int_cst (compare_type,
500 LOOP_VINFO_VECT_FACTOR (loop_vinfo)
501 * nscalars_per_iter);
502 first_limit = gimple_build (preheader_seq, MIN_EXPR, compare_type,
503 nscalars_total, const_limit);
504 first_limit = gimple_build (preheader_seq, PLUS_EXPR, compare_type,
505 first_limit, nscalars_skip);
507 else
508 /* For the first iteration it doesn't matter whether the IV hits
509 a value above NSCALARS_TOTAL. That only matters for the latch
510 condition. */
511 first_limit = nscalars_total;
513 else
515 /* Test the incremented IV, which will always hit a value above
516 the bound before wrapping. */
517 test_index = index_after_incr;
518 test_limit = nscalars_total;
519 if (nscalars_skip)
520 test_limit = gimple_build (preheader_seq, PLUS_EXPR, compare_type,
521 test_limit, nscalars_skip);
522 test_gsi = &loop_cond_gsi;
524 first_limit = test_limit;
527 /* Provide a definition of each mask in the group. */
528 tree next_mask = NULL_TREE;
529 tree mask;
530 unsigned int i;
531 FOR_EACH_VEC_ELT_REVERSE (rgm->masks, i, mask)
533 /* Previous masks will cover BIAS scalars. This mask covers the
534 next batch. */
535 poly_uint64 bias = nscalars_per_mask * i;
536 tree bias_tree = build_int_cst (compare_type, bias);
537 gimple *tmp_stmt;
539 /* See whether the first iteration of the vector loop is known
540 to have a full mask. */
541 poly_uint64 const_limit;
542 bool first_iteration_full
543 = (poly_int_tree_p (first_limit, &const_limit)
544 && known_ge (const_limit, (i + 1) * nscalars_per_mask));
546 /* Rather than have a new IV that starts at BIAS and goes up to
547 TEST_LIMIT, prefer to use the same 0-based IV for each mask
548 and adjust the bound down by BIAS. */
549 tree this_test_limit = test_limit;
550 if (i != 0)
552 this_test_limit = gimple_build (preheader_seq, MAX_EXPR,
553 compare_type, this_test_limit,
554 bias_tree);
555 this_test_limit = gimple_build (preheader_seq, MINUS_EXPR,
556 compare_type, this_test_limit,
557 bias_tree);
560 /* Create the initial mask. First include all scalars that
561 are within the loop limit. */
562 tree init_mask = NULL_TREE;
563 if (!first_iteration_full)
565 tree start, end;
566 if (first_limit == test_limit)
568 /* Use a natural test between zero (the initial IV value)
569 and the loop limit. The "else" block would be valid too,
570 but this choice can avoid the need to load BIAS_TREE into
571 a register. */
572 start = zero_index;
573 end = this_test_limit;
575 else
577 /* FIRST_LIMIT is the maximum number of scalars handled by the
578 first iteration of the vector loop. Test the portion
579 associated with this mask. */
580 start = bias_tree;
581 end = first_limit;
584 init_mask = make_temp_ssa_name (mask_type, NULL, "max_mask");
585 tmp_stmt = vect_gen_while (init_mask, start, end);
586 gimple_seq_add_stmt (preheader_seq, tmp_stmt);
589 /* Now AND out the bits that are within the number of skipped
590 scalars. */
591 poly_uint64 const_skip;
592 if (nscalars_skip
593 && !(poly_int_tree_p (nscalars_skip, &const_skip)
594 && known_le (const_skip, bias)))
596 tree unskipped_mask = vect_gen_while_not (preheader_seq, mask_type,
597 bias_tree, nscalars_skip);
598 if (init_mask)
599 init_mask = gimple_build (preheader_seq, BIT_AND_EXPR, mask_type,
600 init_mask, unskipped_mask);
601 else
602 init_mask = unskipped_mask;
605 if (!init_mask)
606 /* First iteration is full. */
607 init_mask = build_minus_one_cst (mask_type);
609 /* Get the mask value for the next iteration of the loop. */
610 next_mask = make_temp_ssa_name (mask_type, NULL, "next_mask");
611 gcall *call = vect_gen_while (next_mask, test_index, this_test_limit);
612 gsi_insert_before (test_gsi, call, GSI_SAME_STMT);
614 vect_set_loop_mask (loop, mask, init_mask, next_mask);
616 return next_mask;
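/* A minimal standalone sketch, not part of GCC, of the wrap-safe limit
   computed above when MIGHT_WRAP_P: the branch-back test compares the
   pre-increment IV against NSCALARS_TOTAL -[sat] (STEP - SKIP), and the
   saturating subtraction is open-coded as MAX (total, adjust) - adjust so
   that it never goes below zero in the unsigned compare type.  */

static unsigned int
wrap_safe_test_limit (unsigned int nscalars_total,
                      unsigned int nscalars_step,
                      unsigned int nscalars_skip)
{
  unsigned int adjust = nscalars_step - nscalars_skip;
  unsigned int max = nscalars_total > adjust ? nscalars_total : adjust;
  return max - adjust;
}

/* For example, with 3 scalar iterations, step 16 and no skip the limit is 0,
   so the mask computed for the second vector iteration is already all-false
   and the loop body runs exactly once.  With 100 scalar iterations the limit
   is 84, and the loop exits once the pre-increment IV (0, 16, 32, ...)
   reaches 96, i.e. after 7 vector iterations.  */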
619 /* Make LOOP iterate NITERS times using masking and WHILE_ULT calls.
620 LOOP_VINFO describes the vectorization of LOOP. NITERS is the
621 number of iterations of the original scalar loop that should be
622 handled by the vector loop. NITERS_MAYBE_ZERO and FINAL_IV are
623 as for vect_set_loop_condition.
625 Insert the branch-back condition before LOOP_COND_GSI and return the
626 final gcond. */
628 static gcond *
629 vect_set_loop_condition_masked (struct loop *loop, loop_vec_info loop_vinfo,
630 tree niters, tree final_iv,
631 bool niters_maybe_zero,
632 gimple_stmt_iterator loop_cond_gsi)
634 gimple_seq preheader_seq = NULL;
635 gimple_seq header_seq = NULL;
637 tree compare_type = LOOP_VINFO_MASK_COMPARE_TYPE (loop_vinfo);
638 unsigned int compare_precision = TYPE_PRECISION (compare_type);
639 unsigned HOST_WIDE_INT max_vf = vect_max_vf (loop_vinfo);
640 tree orig_niters = niters;
642 /* Type of the initial value of NITERS. */
643 tree ni_actual_type = TREE_TYPE (niters);
644 unsigned int ni_actual_precision = TYPE_PRECISION (ni_actual_type);
646 /* Convert NITERS to the same size as the compare. */
647 if (compare_precision > ni_actual_precision
648 && niters_maybe_zero)
650 /* We know that there is always at least one iteration, so if the
651 count is zero then it must have wrapped. Cope with this by
652 subtracting 1 before the conversion and adding 1 to the result. */
653 gcc_assert (TYPE_UNSIGNED (ni_actual_type));
654 niters = gimple_build (&preheader_seq, PLUS_EXPR, ni_actual_type,
655 niters, build_minus_one_cst (ni_actual_type));
656 niters = gimple_convert (&preheader_seq, compare_type, niters);
657 niters = gimple_build (&preheader_seq, PLUS_EXPR, compare_type,
658 niters, build_one_cst (compare_type));
660 else
661 niters = gimple_convert (&preheader_seq, compare_type, niters);
663 /* Convert skip_niters to the right type. */
664 tree niters_skip = LOOP_VINFO_MASK_SKIP_NITERS (loop_vinfo);
666 /* Now calculate the value that the induction variable must be able
667 to hit in order to ensure that we end the loop with an all-false mask.
668 This involves adding the maximum number of inactive trailing scalar
669 iterations. */
670 widest_int iv_limit;
671 bool known_max_iters = max_loop_iterations (loop, &iv_limit);
672 if (known_max_iters)
674 if (niters_skip)
676 /* Add the maximum number of skipped iterations to the
677 maximum iteration count. */
678 if (TREE_CODE (niters_skip) == INTEGER_CST)
679 iv_limit += wi::to_widest (niters_skip);
680 else
681 iv_limit += max_vf - 1;
683 /* IV_LIMIT is the maximum number of latch iterations, which is also
684 the maximum in-range IV value. Round this value down to the previous
685 vector alignment boundary and then add an extra full iteration. */
686 poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
687 iv_limit = (iv_limit & -(int) known_alignment (vf)) + max_vf;
690 /* Get the vectorization factor in tree form. */
691 tree vf = build_int_cst (compare_type,
692 LOOP_VINFO_VECT_FACTOR (loop_vinfo));
694 /* Iterate over all the rgroups and fill in their masks. We could use
695 the first mask from any rgroup for the loop condition; here we
696 arbitrarily pick the last. */
697 tree test_mask = NULL_TREE;
698 rgroup_masks *rgm;
699 unsigned int i;
700 vec_loop_masks *masks = &LOOP_VINFO_MASKS (loop_vinfo);
701 FOR_EACH_VEC_ELT (*masks, i, rgm)
702 if (!rgm->masks.is_empty ())
704 /* First try using permutes. This adds a single vector
705 instruction to the loop for each mask, but needs no extra
706 loop invariants or IVs. */
707 unsigned int nmasks = i + 1;
708 if ((nmasks & 1) == 0)
710 rgroup_masks *half_rgm = &(*masks)[nmasks / 2 - 1];
711 if (!half_rgm->masks.is_empty ()
712 && vect_maybe_permute_loop_masks (&header_seq, rgm, half_rgm))
713 continue;
716 /* See whether zero-based IV would ever generate all-false masks
717 before wrapping around. */
718 bool might_wrap_p
719 = (!known_max_iters
720 || (wi::min_precision (iv_limit * rgm->max_nscalars_per_iter,
721 UNSIGNED)
722 > compare_precision));
724 /* Set up all masks for this group. */
725 test_mask = vect_set_loop_masks_directly (loop, loop_vinfo,
726 &preheader_seq,
727 loop_cond_gsi, rgm, vf,
728 niters, niters_skip,
729 might_wrap_p);
732 /* Emit all accumulated statements. */
733 add_preheader_seq (loop, preheader_seq);
734 add_header_seq (loop, header_seq);
736 /* Get a boolean result that tells us whether to iterate. */
737 edge exit_edge = single_exit (loop);
738 tree_code code = (exit_edge->flags & EDGE_TRUE_VALUE) ? EQ_EXPR : NE_EXPR;
739 tree zero_mask = build_zero_cst (TREE_TYPE (test_mask));
740 gcond *cond_stmt = gimple_build_cond (code, test_mask, zero_mask,
741 NULL_TREE, NULL_TREE);
742 gsi_insert_before (&loop_cond_gsi, cond_stmt, GSI_SAME_STMT);
744 /* The loop iterates (NITERS - 1) / VF + 1 times.
745 Subtract one from this to get the latch count. */
746 tree step = build_int_cst (compare_type,
747 LOOP_VINFO_VECT_FACTOR (loop_vinfo));
748 tree niters_minus_one = fold_build2 (PLUS_EXPR, compare_type, niters,
749 build_minus_one_cst (compare_type));
750 loop->nb_iterations = fold_build2 (TRUNC_DIV_EXPR, compare_type,
751 niters_minus_one, step);
753 if (final_iv)
755 gassign *assign = gimple_build_assign (final_iv, orig_niters);
756 gsi_insert_on_edge_immediate (single_exit (loop), assign);
759 return cond_stmt;
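/* A minimal standalone sketch, not part of GCC, of the conversion above for
   NITERS_MAYBE_ZERO: if the original iteration count was computed in a
   narrower unsigned type and wrapped to zero (meaning 2^precision
   iterations), subtracting one before widening and adding one afterwards
   recovers the intended value.  uint8_t stands in for the narrow type purely
   for illustration.  */

#include <stdint.h>

static uint64_t
widen_wrapped_niters (uint8_t narrow_niters)
{
  return (uint64_t) (uint8_t) (narrow_niters - 1) + 1;
}

/* widen_wrapped_niters (5) == 5, widen_wrapped_niters (0) == 256.  */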
762 /* Like vect_set_loop_condition, but handle the case in which there
763 are no loop masks. */
765 static gcond *
766 vect_set_loop_condition_unmasked (struct loop *loop, tree niters,
767 tree step, tree final_iv,
768 bool niters_maybe_zero,
769 gimple_stmt_iterator loop_cond_gsi)
771 tree indx_before_incr, indx_after_incr;
772 gcond *cond_stmt;
773 gcond *orig_cond;
774 edge pe = loop_preheader_edge (loop);
775 edge exit_edge = single_exit (loop);
776 gimple_stmt_iterator incr_gsi;
777 bool insert_after;
778 enum tree_code code;
779 tree niters_type = TREE_TYPE (niters);
781 orig_cond = get_loop_exit_condition (loop);
782 gcc_assert (orig_cond);
783 loop_cond_gsi = gsi_for_stmt (orig_cond);
785 tree init, limit;
786 if (!niters_maybe_zero && integer_onep (step))
788 /* In this case we can use a simple 0-based IV:
791 x = 0;
795 x += 1;
797 while (x < NITERS); */
798 code = (exit_edge->flags & EDGE_TRUE_VALUE) ? GE_EXPR : LT_EXPR;
799 init = build_zero_cst (niters_type);
800 limit = niters;
802 else
804 /* The following works for all values of NITERS except 0:
807 x = 0;
811 x += STEP;
813 while (x <= NITERS - STEP);
815 so that the loop continues to iterate if x + STEP - 1 < NITERS
816 but stops if x + STEP - 1 >= NITERS.
818 However, if NITERS is zero, x never hits a value above NITERS - STEP
819 before wrapping around. There are two obvious ways of dealing with
820 this:
822 - start at STEP - 1 and compare x before incrementing it
823 - start at -1 and compare x after incrementing it
825 The latter is simpler and is what we use. The loop in this case
826 looks like:
829 x = -1;
833 x += STEP;
835 while (x < NITERS - STEP);
837 In both cases the loop limit is NITERS - STEP. */
838 gimple_seq seq = NULL;
839 limit = force_gimple_operand (niters, &seq, true, NULL_TREE);
840 limit = gimple_build (&seq, MINUS_EXPR, TREE_TYPE (limit), limit, step);
841 if (seq)
843 basic_block new_bb = gsi_insert_seq_on_edge_immediate (pe, seq);
844 gcc_assert (!new_bb);
846 if (niters_maybe_zero)
848 /* Case C. */
849 code = (exit_edge->flags & EDGE_TRUE_VALUE) ? GE_EXPR : LT_EXPR;
850 init = build_all_ones_cst (niters_type);
852 else
854 /* Case B. */
855 code = (exit_edge->flags & EDGE_TRUE_VALUE) ? GT_EXPR : LE_EXPR;
856 init = build_zero_cst (niters_type);
860 standard_iv_increment_position (loop, &incr_gsi, &insert_after);
861 create_iv (init, step, NULL_TREE, loop,
862 &incr_gsi, insert_after, &indx_before_incr, &indx_after_incr);
863 indx_after_incr = force_gimple_operand_gsi (&loop_cond_gsi, indx_after_incr,
864 true, NULL_TREE, true,
865 GSI_SAME_STMT);
866 limit = force_gimple_operand_gsi (&loop_cond_gsi, limit, true, NULL_TREE,
867 true, GSI_SAME_STMT);
869 cond_stmt = gimple_build_cond (code, indx_after_incr, limit, NULL_TREE,
870 NULL_TREE);
872 gsi_insert_before (&loop_cond_gsi, cond_stmt, GSI_SAME_STMT);
874 /* Record the number of latch iterations. */
875 if (limit == niters)
876 /* Case A: the loop iterates NITERS times. Subtract one to get the
877 latch count. */
878 loop->nb_iterations = fold_build2 (MINUS_EXPR, niters_type, niters,
879 build_int_cst (niters_type, 1));
880 else
881 /* Case B or C: the loop iterates (NITERS - STEP) / STEP + 1 times.
882 Subtract one from this to get the latch count. */
883 loop->nb_iterations = fold_build2 (TRUNC_DIV_EXPR, niters_type,
884 limit, step);
886 if (final_iv)
888 gassign *assign = gimple_build_assign (final_iv, MINUS_EXPR,
889 indx_after_incr, init);
890 gsi_insert_on_edge_immediate (single_exit (loop), assign);
893 return cond_stmt;
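/* Minimal standalone sketches, not part of GCC, of the three IV shapes
   generated above, written as scalar loops.  The loop bodies are elided;
   only the induction variable and exit test are shown.  */

/* Case A: STEP == 1 and NITERS known to be nonzero.  */
static void
iv_shape_case_a (unsigned int niters)
{
  unsigned int x = 0;
  do
    x += 1;
  while (x < niters);
}

/* Case B: general STEP, NITERS known to be nonzero (and not in
   [1, STEP - 1]).  */
static void
iv_shape_case_b (unsigned int niters, unsigned int step)
{
  unsigned int x = 0;
  do
    x += step;
  while (x <= niters - step);
}

/* Case C: general STEP, NITERS possibly zero (i.e. 2^precision).  */
static void
iv_shape_case_c (unsigned int niters, unsigned int step)
{
  unsigned int x = -1U;
  do
    x += step;
  while (x < niters - step);
}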
896 /* If we're using fully-masked loops, make LOOP iterate:
898 N == (NITERS - 1) / STEP + 1
900 times. When NITERS is zero, this is equivalent to making the loop
901 execute (1 << M) / STEP times, where M is the precision of NITERS.
902 NITERS_MAYBE_ZERO is true if this last case might occur.
904 If we're not using fully-masked loops, make LOOP iterate:
906 N == (NITERS - STEP) / STEP + 1
908 times, where NITERS is known to be outside the range [1, STEP - 1].
909 This is equivalent to making the loop execute NITERS / STEP times
910 when NITERS is nonzero and (1 << M) / STEP times otherwise.
911 NITERS_MAYBE_ZERO again indicates whether this last case might occur.
913 If FINAL_IV is nonnull, it is an SSA name that should be set to
914 N * STEP on exit from the loop.
916 Assumption: the exit-condition of LOOP is the last stmt in the loop. */
918 void
919 vect_set_loop_condition (struct loop *loop, loop_vec_info loop_vinfo,
920 tree niters, tree step, tree final_iv,
921 bool niters_maybe_zero)
923 gcond *cond_stmt;
924 gcond *orig_cond = get_loop_exit_condition (loop);
925 gimple_stmt_iterator loop_cond_gsi = gsi_for_stmt (orig_cond);
927 if (loop_vinfo && LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
928 cond_stmt = vect_set_loop_condition_masked (loop, loop_vinfo, niters,
929 final_iv, niters_maybe_zero,
930 loop_cond_gsi);
931 else
932 cond_stmt = vect_set_loop_condition_unmasked (loop, niters, step,
933 final_iv, niters_maybe_zero,
934 loop_cond_gsi);
936 /* Remove old loop exit test. */
937 gsi_remove (&loop_cond_gsi, true);
938 free_stmt_vec_info (orig_cond);
940 if (dump_enabled_p ())
942 dump_printf_loc (MSG_NOTE, vect_location, "New loop exit condition: ");
943 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, cond_stmt, 0);
947 /* Helper routine of slpeel_tree_duplicate_loop_to_edge_cfg.
 948    For all PHI arguments in FROM->dest and TO->dest coming from those
 949    edges, ensure that each TO->dest PHI argument has its current_def set
 950    to the current_def of the corresponding FROM->dest argument.  */
952 static void
953 slpeel_duplicate_current_defs_from_edges (edge from, edge to)
955 gimple_stmt_iterator gsi_from, gsi_to;
957 for (gsi_from = gsi_start_phis (from->dest),
958 gsi_to = gsi_start_phis (to->dest);
959 !gsi_end_p (gsi_from) && !gsi_end_p (gsi_to);)
961 gimple *from_phi = gsi_stmt (gsi_from);
962 gimple *to_phi = gsi_stmt (gsi_to);
963 tree from_arg = PHI_ARG_DEF_FROM_EDGE (from_phi, from);
964 tree to_arg = PHI_ARG_DEF_FROM_EDGE (to_phi, to);
965 if (virtual_operand_p (from_arg))
967 gsi_next (&gsi_from);
968 continue;
970 if (virtual_operand_p (to_arg))
972 gsi_next (&gsi_to);
973 continue;
975 if (TREE_CODE (from_arg) != SSA_NAME)
976 gcc_assert (operand_equal_p (from_arg, to_arg, 0));
977 else
979 if (get_current_def (to_arg) == NULL_TREE)
980 set_current_def (to_arg, get_current_def (from_arg));
982 gsi_next (&gsi_from);
983 gsi_next (&gsi_to);
986 gphi *from_phi = get_virtual_phi (from->dest);
987 gphi *to_phi = get_virtual_phi (to->dest);
988 if (from_phi)
989 set_current_def (PHI_ARG_DEF_FROM_EDGE (to_phi, to),
990 get_current_def (PHI_ARG_DEF_FROM_EDGE (from_phi, from)));
994 /* Given LOOP this function generates a new copy of it and puts it
995 on E which is either the entry or exit of LOOP. If SCALAR_LOOP is
996 non-NULL, assume LOOP and SCALAR_LOOP are equivalent and copy the
997 basic blocks from SCALAR_LOOP instead of LOOP, but to either the
998 entry or exit of LOOP. */
1000 struct loop *
1001 slpeel_tree_duplicate_loop_to_edge_cfg (struct loop *loop,
1002 struct loop *scalar_loop, edge e)
1004 struct loop *new_loop;
1005 basic_block *new_bbs, *bbs, *pbbs;
1006 bool at_exit;
1007 bool was_imm_dom;
1008 basic_block exit_dest;
1009 edge exit, new_exit;
1010 bool duplicate_outer_loop = false;
1012 exit = single_exit (loop);
1013 at_exit = (e == exit);
1014 if (!at_exit && e != loop_preheader_edge (loop))
1015 return NULL;
1017 if (scalar_loop == NULL)
1018 scalar_loop = loop;
1020 bbs = XNEWVEC (basic_block, scalar_loop->num_nodes + 1);
1021 pbbs = bbs + 1;
1022 get_loop_body_with_size (scalar_loop, pbbs, scalar_loop->num_nodes);
1023 /* Allow duplication of outer loops. */
1024 if (scalar_loop->inner)
1025 duplicate_outer_loop = true;
1026 /* Check whether duplication is possible. */
1027 if (!can_copy_bbs_p (pbbs, scalar_loop->num_nodes))
1029 free (bbs);
1030 return NULL;
1033 /* Generate new loop structure. */
1034 new_loop = duplicate_loop (scalar_loop, loop_outer (scalar_loop));
1035 duplicate_subloops (scalar_loop, new_loop);
1037 exit_dest = exit->dest;
1038 was_imm_dom = (get_immediate_dominator (CDI_DOMINATORS,
1039 exit_dest) == loop->header ?
1040 true : false);
1042 /* Also copy the pre-header, this avoids jumping through hoops to
1043 duplicate the loop entry PHI arguments. Create an empty
1044 pre-header unconditionally for this. */
1045 basic_block preheader = split_edge (loop_preheader_edge (scalar_loop));
1046 edge entry_e = single_pred_edge (preheader);
1047 bbs[0] = preheader;
1048 new_bbs = XNEWVEC (basic_block, scalar_loop->num_nodes + 1);
1050 exit = single_exit (scalar_loop);
1051 copy_bbs (bbs, scalar_loop->num_nodes + 1, new_bbs,
1052 &exit, 1, &new_exit, NULL,
1053 at_exit ? loop->latch : e->src, true);
1054 exit = single_exit (loop);
1055 basic_block new_preheader = new_bbs[0];
1057 add_phi_args_after_copy (new_bbs, scalar_loop->num_nodes + 1, NULL);
1059 if (scalar_loop != loop)
1061 /* If we copied from SCALAR_LOOP rather than LOOP, SSA_NAMEs from
1062 SCALAR_LOOP will have current_def set to SSA_NAMEs in the new_loop,
1063 but LOOP will not. slpeel_update_phi_nodes_for_guard{1,2} expects
1064 the LOOP SSA_NAMEs (on the exit edge and edge from latch to
1065 header) to have current_def set, so copy them over. */
1066 slpeel_duplicate_current_defs_from_edges (single_exit (scalar_loop),
1067 exit);
1068 slpeel_duplicate_current_defs_from_edges (EDGE_SUCC (scalar_loop->latch,
1070 EDGE_SUCC (loop->latch, 0));
1073 if (at_exit) /* Add the loop copy at exit. */
1075 if (scalar_loop != loop)
1077 gphi_iterator gsi;
1078 new_exit = redirect_edge_and_branch (new_exit, exit_dest);
1080 for (gsi = gsi_start_phis (exit_dest); !gsi_end_p (gsi);
1081 gsi_next (&gsi))
1083 gphi *phi = gsi.phi ();
1084 tree orig_arg = PHI_ARG_DEF_FROM_EDGE (phi, e);
1085 location_t orig_locus
1086 = gimple_phi_arg_location_from_edge (phi, e);
1088 add_phi_arg (phi, orig_arg, new_exit, orig_locus);
1091 redirect_edge_and_branch_force (e, new_preheader);
1092 flush_pending_stmts (e);
1093 set_immediate_dominator (CDI_DOMINATORS, new_preheader, e->src);
1094 if (was_imm_dom || duplicate_outer_loop)
1095 set_immediate_dominator (CDI_DOMINATORS, exit_dest, new_exit->src);
1097 /* And remove the non-necessary forwarder again. Keep the other
1098 one so we have a proper pre-header for the loop at the exit edge. */
1099 redirect_edge_pred (single_succ_edge (preheader),
1100 single_pred (preheader));
1101 delete_basic_block (preheader);
1102 set_immediate_dominator (CDI_DOMINATORS, scalar_loop->header,
1103 loop_preheader_edge (scalar_loop)->src);
1105 else /* Add the copy at entry. */
1107 if (scalar_loop != loop)
1109 /* Remove the non-necessary forwarder of scalar_loop again. */
1110 redirect_edge_pred (single_succ_edge (preheader),
1111 single_pred (preheader));
1112 delete_basic_block (preheader);
1113 set_immediate_dominator (CDI_DOMINATORS, scalar_loop->header,
1114 loop_preheader_edge (scalar_loop)->src);
1115 preheader = split_edge (loop_preheader_edge (loop));
1116 entry_e = single_pred_edge (preheader);
1119 redirect_edge_and_branch_force (entry_e, new_preheader);
1120 flush_pending_stmts (entry_e);
1121 set_immediate_dominator (CDI_DOMINATORS, new_preheader, entry_e->src);
1123 redirect_edge_and_branch_force (new_exit, preheader);
1124 flush_pending_stmts (new_exit);
1125 set_immediate_dominator (CDI_DOMINATORS, preheader, new_exit->src);
1127 /* And remove the non-necessary forwarder again. Keep the other
1128 one so we have a proper pre-header for the loop at the exit edge. */
1129 redirect_edge_pred (single_succ_edge (new_preheader),
1130 single_pred (new_preheader));
1131 delete_basic_block (new_preheader);
1132 set_immediate_dominator (CDI_DOMINATORS, new_loop->header,
1133 loop_preheader_edge (new_loop)->src);
1136 /* Skip new preheader since it's deleted if copy loop is added at entry. */
1137 for (unsigned i = (at_exit ? 0 : 1); i < scalar_loop->num_nodes + 1; i++)
1138 rename_variables_in_bb (new_bbs[i], duplicate_outer_loop);
1140 if (scalar_loop != loop)
1142 /* Update new_loop->header PHIs, so that on the preheader
1143 edge they are the ones from loop rather than scalar_loop. */
1144 gphi_iterator gsi_orig, gsi_new;
1145 edge orig_e = loop_preheader_edge (loop);
1146 edge new_e = loop_preheader_edge (new_loop);
1148 for (gsi_orig = gsi_start_phis (loop->header),
1149 gsi_new = gsi_start_phis (new_loop->header);
1150 !gsi_end_p (gsi_orig) && !gsi_end_p (gsi_new);
1151 gsi_next (&gsi_orig), gsi_next (&gsi_new))
1153 gphi *orig_phi = gsi_orig.phi ();
1154 gphi *new_phi = gsi_new.phi ();
1155 tree orig_arg = PHI_ARG_DEF_FROM_EDGE (orig_phi, orig_e);
1156 location_t orig_locus
1157 = gimple_phi_arg_location_from_edge (orig_phi, orig_e);
1159 add_phi_arg (new_phi, orig_arg, new_e, orig_locus);
1163 free (new_bbs);
1164 free (bbs);
1166 checking_verify_dominators (CDI_DOMINATORS);
1168 return new_loop;
1172 /* Given the condition expression COND, put it as the last statement of
1173 GUARD_BB; set both edges' probability; set dominator of GUARD_TO to
1174 DOM_BB; return the skip edge. GUARD_TO is the target basic block to
1175 skip the loop. PROBABILITY is the skip edge's probability. Mark the
1176 new edge as irreducible if IRREDUCIBLE_P is true. */
1178 static edge
1179 slpeel_add_loop_guard (basic_block guard_bb, tree cond,
1180 basic_block guard_to, basic_block dom_bb,
1181 profile_probability probability, bool irreducible_p)
1183 gimple_stmt_iterator gsi;
1184 edge new_e, enter_e;
1185 gcond *cond_stmt;
1186 gimple_seq gimplify_stmt_list = NULL;
1188 enter_e = EDGE_SUCC (guard_bb, 0);
1189 enter_e->flags &= ~EDGE_FALLTHRU;
1190 enter_e->flags |= EDGE_FALSE_VALUE;
1191 gsi = gsi_last_bb (guard_bb);
1193 cond = force_gimple_operand_1 (cond, &gimplify_stmt_list, is_gimple_condexpr,
1194 NULL_TREE);
1195 if (gimplify_stmt_list)
1196 gsi_insert_seq_after (&gsi, gimplify_stmt_list, GSI_NEW_STMT);
1198 cond_stmt = gimple_build_cond_from_tree (cond, NULL_TREE, NULL_TREE);
1199 gsi = gsi_last_bb (guard_bb);
1200 gsi_insert_after (&gsi, cond_stmt, GSI_NEW_STMT);
1202 /* Add new edge to connect guard block to the merge/loop-exit block. */
1203 new_e = make_edge (guard_bb, guard_to, EDGE_TRUE_VALUE);
1205 new_e->probability = probability;
1206 if (irreducible_p)
1207 new_e->flags |= EDGE_IRREDUCIBLE_LOOP;
1209 enter_e->probability = probability.invert ();
1210 set_immediate_dominator (CDI_DOMINATORS, guard_to, dom_bb);
1212 /* Split enter_e to preserve LOOPS_HAVE_PREHEADERS. */
1213 if (enter_e->dest->loop_father->header == enter_e->dest)
1214 split_edge (enter_e);
1216 return new_e;
1220 /* This function verifies that the following restrictions apply to LOOP:
1221    (1) it consists of exactly 2 basic blocks (a header and an empty latch)
1222        if it is an innermost loop, or 5 basic blocks if it is an outer loop.
1223 (2) it is single entry, single exit
1224 (3) its exit condition is the last stmt in the header
1225 (4) E is the entry/exit edge of LOOP.
1228 bool
1229 slpeel_can_duplicate_loop_p (const struct loop *loop, const_edge e)
1231 edge exit_e = single_exit (loop);
1232 edge entry_e = loop_preheader_edge (loop);
1233 gcond *orig_cond = get_loop_exit_condition (loop);
1234 gimple_stmt_iterator loop_exit_gsi = gsi_last_bb (exit_e->src);
1235 unsigned int num_bb = loop->inner? 5 : 2;
1237 /* All loops have an outer scope; the only case loop->outer is NULL is for
1238 the function itself. */
1239 if (!loop_outer (loop)
1240 || loop->num_nodes != num_bb
1241 || !empty_block_p (loop->latch)
1242 || !single_exit (loop)
1243 /* Verify that new loop exit condition can be trivially modified. */
1244 || (!orig_cond || orig_cond != gsi_stmt (loop_exit_gsi))
1245 || (e != exit_e && e != entry_e))
1246 return false;
1248 return true;
1251 /* If the loop has a virtual PHI, but exit bb doesn't, create a virtual PHI
1252 in the exit bb and rename all the uses after the loop. This simplifies
1253 the *guard[12] routines, which assume loop closed SSA form for all PHIs
1254 (but normally loop closed SSA form doesn't require virtual PHIs to be
1255    in the same form).  Doing this early simplifies checking which
1256    uses should be renamed.  */
1258 static void
1259 create_lcssa_for_virtual_phi (struct loop *loop)
1261 gphi_iterator gsi;
1262 edge exit_e = single_exit (loop);
1264 for (gsi = gsi_start_phis (loop->header); !gsi_end_p (gsi); gsi_next (&gsi))
1265 if (virtual_operand_p (gimple_phi_result (gsi_stmt (gsi))))
1267 gphi *phi = gsi.phi ();
1268 for (gsi = gsi_start_phis (exit_e->dest);
1269 !gsi_end_p (gsi); gsi_next (&gsi))
1270 if (virtual_operand_p (gimple_phi_result (gsi_stmt (gsi))))
1271 break;
1272 if (gsi_end_p (gsi))
1274 tree new_vop = copy_ssa_name (PHI_RESULT (phi));
1275 gphi *new_phi = create_phi_node (new_vop, exit_e->dest);
1276 tree vop = PHI_ARG_DEF_FROM_EDGE (phi, EDGE_SUCC (loop->latch, 0));
1277 imm_use_iterator imm_iter;
1278 gimple *stmt;
1279 use_operand_p use_p;
1281 SSA_NAME_OCCURS_IN_ABNORMAL_PHI (new_vop)
1282 = SSA_NAME_OCCURS_IN_ABNORMAL_PHI (vop);
1283 add_phi_arg (new_phi, vop, exit_e, UNKNOWN_LOCATION);
1284 gimple_phi_set_result (new_phi, new_vop);
1285 FOR_EACH_IMM_USE_STMT (stmt, imm_iter, vop)
1286 if (stmt != new_phi
1287 && !flow_bb_inside_loop_p (loop, gimple_bb (stmt)))
1288 FOR_EACH_IMM_USE_ON_STMT (use_p, imm_iter)
1289 SET_USE (use_p, new_vop);
1291 break;
1296 /* Function vect_get_loop_location.
1298 Extract the location of the loop in the source code.
1299 If the loop is not well formed for vectorization, an estimated
1300 location is calculated.
1301    Return the loop location on success and UNKNOWN_LOCATION otherwise.  */
1303 source_location
1304 find_loop_location (struct loop *loop)
1306 gimple *stmt = NULL;
1307 basic_block bb;
1308 gimple_stmt_iterator si;
1310 if (!loop)
1311 return UNKNOWN_LOCATION;
1313 stmt = get_loop_exit_condition (loop);
1315 if (stmt
1316 && LOCATION_LOCUS (gimple_location (stmt)) > BUILTINS_LOCATION)
1317 return gimple_location (stmt);
1319   /* If we got here the loop is probably not "well formed";
1320      try to estimate the loop location.  */
1322 if (!loop->header)
1323 return UNKNOWN_LOCATION;
1325 bb = loop->header;
1327 for (si = gsi_start_bb (bb); !gsi_end_p (si); gsi_next (&si))
1329 stmt = gsi_stmt (si);
1330 if (LOCATION_LOCUS (gimple_location (stmt)) > BUILTINS_LOCATION)
1331 return gimple_location (stmt);
1334 return UNKNOWN_LOCATION;
1337 /* Return true if PHI defines an IV of the loop to be vectorized. */
1339 static bool
1340 iv_phi_p (gphi *phi)
1342 if (virtual_operand_p (PHI_RESULT (phi)))
1343 return false;
1345 stmt_vec_info stmt_info = vinfo_for_stmt (phi);
1346 gcc_assert (stmt_info != NULL);
1347 if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_reduction_def
1348 || STMT_VINFO_DEF_TYPE (stmt_info) == vect_double_reduction_def)
1349 return false;
1351 return true;
1354 /* Function vect_can_advance_ivs_p
1356 In case the number of iterations that LOOP iterates is unknown at compile
1357 time, an epilog loop will be generated, and the loop induction variables
1358 (IVs) will be "advanced" to the value they are supposed to take just before
1359 the epilog loop. Here we check that the access function of the loop IVs
1360 and the expression that represents the loop bound are simple enough.
1361 These restrictions will be relaxed in the future. */
1363 bool
1364 vect_can_advance_ivs_p (loop_vec_info loop_vinfo)
1366 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1367 basic_block bb = loop->header;
1368 gphi_iterator gsi;
1370 /* Analyze phi functions of the loop header. */
1372 if (dump_enabled_p ())
1373 dump_printf_loc (MSG_NOTE, vect_location, "vect_can_advance_ivs_p:\n");
1374 for (gsi = gsi_start_phis (bb); !gsi_end_p (gsi); gsi_next (&gsi))
1376 tree evolution_part;
1378 gphi *phi = gsi.phi ();
1379 if (dump_enabled_p ())
1381 dump_printf_loc (MSG_NOTE, vect_location, "Analyze phi: ");
1382 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, phi, 0);
1385 /* Skip virtual phi's. The data dependences that are associated with
1386 virtual defs/uses (i.e., memory accesses) are analyzed elsewhere.
1388 Skip reduction phis. */
1389 if (!iv_phi_p (phi))
1391 if (dump_enabled_p ())
1392 dump_printf_loc (MSG_NOTE, vect_location,
1393 "reduc or virtual phi. skip.\n");
1394 continue;
1397 /* Analyze the evolution function. */
1399 evolution_part
1400 = STMT_VINFO_LOOP_PHI_EVOLUTION_PART (vinfo_for_stmt (phi));
1401 if (evolution_part == NULL_TREE)
1403 if (dump_enabled_p ())
1404 dump_printf (MSG_MISSED_OPTIMIZATION,
1405 "No access function or evolution.\n");
1406 return false;
1409 /* FORNOW: We do not transform initial conditions of IVs
1410 which evolution functions are not invariants in the loop. */
1412 if (!expr_invariant_in_loop_p (loop, evolution_part))
1414 if (dump_enabled_p ())
1415 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1416 "evolution not invariant in loop.\n");
1417 return false;
1420 /* FORNOW: We do not transform initial conditions of IVs
1421 which evolution functions are a polynomial of degree >= 2. */
1423 if (tree_is_chrec (evolution_part))
1425 if (dump_enabled_p ())
1426 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1427 "evolution is chrec.\n");
1428 return false;
1432 return true;
1436 /* Function vect_update_ivs_after_vectorizer.
1438 "Advance" the induction variables of LOOP to the value they should take
1439 after the execution of LOOP. This is currently necessary because the
1440 vectorizer does not handle induction variables that are used after the
1441 loop. Such a situation occurs when the last iterations of LOOP are
1442 peeled, because:
1443 1. We introduced new uses after LOOP for IVs that were not originally used
1444 after LOOP: the IVs of LOOP are now used by an epilog loop.
1445 2. LOOP is going to be vectorized; this means that it will iterate N/VF
1446 times, whereas the loop IVs should be bumped N times.
1448 Input:
1449 - LOOP - a loop that is going to be vectorized. The last few iterations
1450 of LOOP were peeled.
1451 - NITERS - the number of iterations that LOOP executes (before it is
1452 vectorized). i.e, the number of times the ivs should be bumped.
1453 - UPDATE_E - a successor edge of LOOP->exit that is on the (only) path
1454 coming out from LOOP on which there are uses of the LOOP ivs
1455 (this is the path from LOOP->exit to epilog_loop->preheader).
1457 The new definitions of the ivs are placed in LOOP->exit.
1458 The phi args associated with the edge UPDATE_E in the bb
1459 UPDATE_E->dest are updated accordingly.
1461 Assumption 1: Like the rest of the vectorizer, this function assumes
1462 a single loop exit that has a single predecessor.
1464 Assumption 2: The phi nodes in the LOOP header and in update_bb are
1465 organized in the same order.
1467 Assumption 3: The access function of the ivs is simple enough (see
1468 vect_can_advance_ivs_p). This assumption will be relaxed in the future.
1470 Assumption 4: Exactly one of the successors of LOOP exit-bb is on a path
1471 coming out of LOOP on which the ivs of LOOP are used (this is the path
1472 that leads to the epilog loop; other paths skip the epilog loop). This
1473 path starts with the edge UPDATE_E, and its destination (denoted update_bb)
1474 needs to have its phis updated.
1477 static void
1478 vect_update_ivs_after_vectorizer (loop_vec_info loop_vinfo,
1479 tree niters, edge update_e)
1481 gphi_iterator gsi, gsi1;
1482 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1483 basic_block update_bb = update_e->dest;
1484 basic_block exit_bb = single_exit (loop)->dest;
1486 /* Make sure there exists a single-predecessor exit bb: */
1487 gcc_assert (single_pred_p (exit_bb));
1488 gcc_assert (single_succ_edge (exit_bb) == update_e);
1490 for (gsi = gsi_start_phis (loop->header), gsi1 = gsi_start_phis (update_bb);
1491 !gsi_end_p (gsi) && !gsi_end_p (gsi1);
1492 gsi_next (&gsi), gsi_next (&gsi1))
1494 tree init_expr;
1495 tree step_expr, off;
1496 tree type;
1497 tree var, ni, ni_name;
1498 gimple_stmt_iterator last_gsi;
1500 gphi *phi = gsi.phi ();
1501 gphi *phi1 = gsi1.phi ();
1502 if (dump_enabled_p ())
1504 dump_printf_loc (MSG_NOTE, vect_location,
1505 "vect_update_ivs_after_vectorizer: phi: ");
1506 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, phi, 0);
1509 /* Skip reduction and virtual phis. */
1510 if (!iv_phi_p (phi))
1512 if (dump_enabled_p ())
1513 dump_printf_loc (MSG_NOTE, vect_location,
1514 "reduc or virtual phi. skip.\n");
1515 continue;
1518 type = TREE_TYPE (gimple_phi_result (phi));
1519 step_expr = STMT_VINFO_LOOP_PHI_EVOLUTION_PART (vinfo_for_stmt (phi));
1520 step_expr = unshare_expr (step_expr);
1522 /* FORNOW: We do not support IVs whose evolution function is a polynomial
1523 of degree >= 2 or exponential. */
1524 gcc_assert (!tree_is_chrec (step_expr));
1526 init_expr = PHI_ARG_DEF_FROM_EDGE (phi, loop_preheader_edge (loop));
1528 off = fold_build2 (MULT_EXPR, TREE_TYPE (step_expr),
1529 fold_convert (TREE_TYPE (step_expr), niters),
1530 step_expr);
1531 if (POINTER_TYPE_P (type))
1532 ni = fold_build_pointer_plus (init_expr, off);
1533 else
1534 ni = fold_build2 (PLUS_EXPR, type,
1535 init_expr, fold_convert (type, off));
1537 var = create_tmp_var (type, "tmp");
1539 last_gsi = gsi_last_bb (exit_bb);
1540 gimple_seq new_stmts = NULL;
1541 ni_name = force_gimple_operand (ni, &new_stmts, false, var);
1542 /* Exit_bb shouldn't be empty. */
1543 if (!gsi_end_p (last_gsi))
1544 gsi_insert_seq_after (&last_gsi, new_stmts, GSI_SAME_STMT);
1545 else
1546 gsi_insert_seq_before (&last_gsi, new_stmts, GSI_SAME_STMT);
1548 /* Fix phi expressions in the successor bb. */
1549 adjust_phi_and_debug_stmts (phi1, update_e, ni_name);
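/* A minimal standalone sketch, not part of GCC, of the closed-form update
   applied to each IV above: after NITERS scalar iterations an IV with
   initial value INIT and loop-invariant step STEP holds
   INIT + NITERS * STEP (pointer IVs use an equivalent byte-offset
   addition).  */

static long
advance_iv (long init, long step, unsigned long niters)
{
  return init + (long) ((unsigned long) step * niters);
}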
1553 /* Return a gimple value containing the misalignment (measured in vector
1554 elements) for the loop described by LOOP_VINFO, i.e. how many elements
1555 it is away from a perfectly aligned address. Add any new statements
1556 to SEQ. */
1558 static tree
1559 get_misalign_in_elems (gimple **seq, loop_vec_info loop_vinfo)
1561 struct data_reference *dr = LOOP_VINFO_UNALIGNED_DR (loop_vinfo);
1562 gimple *dr_stmt = DR_STMT (dr);
1563 stmt_vec_info stmt_info = vinfo_for_stmt (dr_stmt);
1564 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
1566 unsigned int target_align = DR_TARGET_ALIGNMENT (dr);
1567 gcc_assert (target_align != 0);
1569 bool negative = tree_int_cst_compare (DR_STEP (dr), size_zero_node) < 0;
1570 tree offset = (negative
1571 ? size_int (-TYPE_VECTOR_SUBPARTS (vectype) + 1)
1572 : size_zero_node);
1573 tree start_addr = vect_create_addr_base_for_vector_ref (dr_stmt, seq,
1574 offset);
1575 tree type = unsigned_type_for (TREE_TYPE (start_addr));
1576 tree target_align_minus_1 = build_int_cst (type, target_align - 1);
1577 HOST_WIDE_INT elem_size
1578 = int_cst_value (TYPE_SIZE_UNIT (TREE_TYPE (vectype)));
1579 tree elem_size_log = build_int_cst (type, exact_log2 (elem_size));
1581 /* Create: misalign_in_bytes = addr & (target_align - 1). */
1582 tree int_start_addr = fold_convert (type, start_addr);
1583 tree misalign_in_bytes = fold_build2 (BIT_AND_EXPR, type, int_start_addr,
1584 target_align_minus_1);
1586 /* Create: misalign_in_elems = misalign_in_bytes / element_size. */
1587 tree misalign_in_elems = fold_build2 (RSHIFT_EXPR, type, misalign_in_bytes,
1588 elem_size_log);
1590 return misalign_in_elems;
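/* A minimal standalone sketch, not part of GCC, of the computation above
   using plain integers: the misalignment of ADDR with respect to
   TARGET_ALIGN (a power of two), expressed in vector elements whose size is
   1 << ELEM_SIZE_LOG2 bytes.  */

#include <stdint.h>

static uintptr_t
misalignment_in_elems (uintptr_t addr, uintptr_t target_align,
                       unsigned int elem_size_log2)
{
  uintptr_t misalign_in_bytes = addr & (target_align - 1);
  return misalign_in_bytes >> elem_size_log2;
}

/* For example, addr == 0x1008 with a 16-byte target alignment and 4-byte
   elements gives a byte misalignment of 8 and an element misalignment
   of 2.  */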
1593 /* Function vect_gen_prolog_loop_niters
1595 Generate the number of iterations which should be peeled as prolog for the
1596 loop represented by LOOP_VINFO. It is calculated as the misalignment of
1597 DR - the data reference recorded in LOOP_VINFO_UNALIGNED_DR (LOOP_VINFO).
1598 As a result, after the execution of this loop, the data reference DR will
1599 refer to an aligned location. The following computation is generated:
1601 If the misalignment of DR is known at compile time:
1602 addr_mis = int mis = DR_MISALIGNMENT (dr);
1603 Else, compute address misalignment in bytes:
1604 addr_mis = addr & (target_align - 1)
1606 prolog_niters = ((VF - addr_mis/elem_size)&(VF-1))/step
1608 (elem_size = element type size; an element is the scalar element whose type
1609 is the inner type of the vectype)
1611 The computations will be emitted at the end of BB. We also compute and
1612 store upper bound (included) of the result in BOUND.
1614 When the step of the data-ref in the loop is not 1 (as in interleaved data
1615 and SLP), the number of iterations of the prolog must be divided by the step
1616    (which is equal to the size of the interleaved group).
1618 The above formulas assume that VF == number of elements in the vector. This
1619 may not hold when there are multiple-types in the loop.
1620 In this case, for some data-references in the loop the VF does not represent
1621 the number of elements that fit in the vector. Therefore, instead of VF we
1622 use TYPE_VECTOR_SUBPARTS. */
1624 static tree
1625 vect_gen_prolog_loop_niters (loop_vec_info loop_vinfo,
1626 basic_block bb, int *bound)
1628 struct data_reference *dr = LOOP_VINFO_UNALIGNED_DR (loop_vinfo);
1629 tree var;
1630 tree niters_type = TREE_TYPE (LOOP_VINFO_NITERS (loop_vinfo));
1631 gimple_seq stmts = NULL, new_stmts = NULL;
1632 tree iters, iters_name;
1633 gimple *dr_stmt = DR_STMT (dr);
1634 stmt_vec_info stmt_info = vinfo_for_stmt (dr_stmt);
1635 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
1636 unsigned int target_align = DR_TARGET_ALIGNMENT (dr);
1638 if (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) > 0)
1640 int npeel = LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo);
1642 if (dump_enabled_p ())
1643 dump_printf_loc (MSG_NOTE, vect_location,
1644 "known peeling = %d.\n", npeel);
1646 iters = build_int_cst (niters_type, npeel);
1647 *bound = LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo);
1649 else
1651 tree misalign_in_elems = get_misalign_in_elems (&stmts, loop_vinfo);
1652 tree type = TREE_TYPE (misalign_in_elems);
1653 HOST_WIDE_INT elem_size
1654 = int_cst_value (TYPE_SIZE_UNIT (TREE_TYPE (vectype)));
1655 HOST_WIDE_INT align_in_elems = target_align / elem_size;
1656 tree align_in_elems_minus_1 = build_int_cst (type, align_in_elems - 1);
1657 tree align_in_elems_tree = build_int_cst (type, align_in_elems);
1659 /* Create: (niters_type) ((align_in_elems - misalign_in_elems)
1660 & (align_in_elems - 1)). */
1661 bool negative = tree_int_cst_compare (DR_STEP (dr), size_zero_node) < 0;
1662 if (negative)
1663 iters = fold_build2 (MINUS_EXPR, type, misalign_in_elems,
1664 align_in_elems_tree);
1665 else
1666 iters = fold_build2 (MINUS_EXPR, type, align_in_elems_tree,
1667 misalign_in_elems);
1668 iters = fold_build2 (BIT_AND_EXPR, type, iters, align_in_elems_minus_1);
1669 iters = fold_convert (niters_type, iters);
1670 *bound = align_in_elems - 1;
1673 if (dump_enabled_p ())
1675 dump_printf_loc (MSG_NOTE, vect_location,
1676 "niters for prolog loop: ");
1677 dump_generic_expr (MSG_NOTE, TDF_SLIM, iters);
1678 dump_printf (MSG_NOTE, "\n");
1681 var = create_tmp_var (niters_type, "prolog_loop_niters");
1682 iters_name = force_gimple_operand (iters, &new_stmts, false, var);
1684 if (new_stmts)
1685 gimple_seq_add_seq (&stmts, new_stmts);
1686 if (stmts)
1688 gcc_assert (single_succ_p (bb));
1689 gimple_stmt_iterator gsi = gsi_last_bb (bb);
1690 if (gsi_end_p (gsi))
1691 gsi_insert_seq_before (&gsi, stmts, GSI_SAME_STMT);
1692 else
1693 gsi_insert_seq_after (&gsi, stmts, GSI_SAME_STMT);
1695 return iters_name;
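/* A minimal standalone sketch, not part of GCC, of the runtime prolog count
   computed above for a positive-step data reference: the number of scalar
   elements needed to reach the next alignment boundary, which is zero when
   the access is already aligned.  ALIGN_IN_ELEMS is TARGET_ALIGN divided by
   the element size and is a power of two.  */

static unsigned int
prolog_niters_sketch (unsigned int misalign_in_elems,
                      unsigned int align_in_elems)
{
  return (align_in_elems - misalign_in_elems) & (align_in_elems - 1);
}

/* With align_in_elems == 4: misalignment 0 -> 0 peeled iterations,
   1 -> 3, 2 -> 2, 3 -> 1.  */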
1699 /* Function vect_update_init_of_dr
1701 If CODE is PLUS, the vector loop starts NITERS iterations after the
1702 scalar one, otherwise CODE is MINUS and the vector loop starts NITERS
1703 iterations before the scalar one (using masking to skip inactive
1704 elements). This function updates the information recorded in DR to
1705 account for the difference. Specifically, it updates the OFFSET
1706 field of DR. */
1708 static void
1709 vect_update_init_of_dr (struct data_reference *dr, tree niters, tree_code code)
1711 tree offset = DR_OFFSET (dr);
1713 niters = fold_build2 (MULT_EXPR, sizetype,
1714 fold_convert (sizetype, niters),
1715 fold_convert (sizetype, DR_STEP (dr)));
1716 offset = fold_build2 (code, sizetype,
1717 fold_convert (sizetype, offset), niters);
1718 DR_OFFSET (dr) = offset;
1722 /* Function vect_update_inits_of_drs
1724 Apply vect_update_init_of_dr to all accesses in LOOP_VINFO.
1725 CODE and NITERS are as for vect_update_init_of_dr. */
1727 static void
1728 vect_update_inits_of_drs (loop_vec_info loop_vinfo, tree niters,
1729 tree_code code)
1731 unsigned int i;
1732 vec<data_reference_p> datarefs = LOOP_VINFO_DATAREFS (loop_vinfo);
1733 struct data_reference *dr;
1735 if (dump_enabled_p ())
1736 dump_printf_loc (MSG_NOTE, vect_location,
1737 "=== vect_update_inits_of_dr ===\n");
1739 /* Adjust niters to sizetype and insert stmts on loop preheader edge. */
1740 if (!types_compatible_p (sizetype, TREE_TYPE (niters)))
1742 gimple_seq seq;
1743 edge pe = loop_preheader_edge (LOOP_VINFO_LOOP (loop_vinfo));
1744 tree var = create_tmp_var (sizetype, "prolog_loop_adjusted_niters");
1746 niters = fold_convert (sizetype, niters);
1747 niters = force_gimple_operand (niters, &seq, false, var);
1748 if (seq)
1750 basic_block new_bb = gsi_insert_seq_on_edge_immediate (pe, seq);
1751 gcc_assert (!new_bb);
1755 FOR_EACH_VEC_ELT (datarefs, i, dr)
1756 vect_update_init_of_dr (dr, niters, code);
1759 /* Using the information recorded in LOOP_VINFO, prepare the loop for peeling
1760 via masking. This involves calculating the number of iterations to
1761 be peeled and then aligning all memory references appropriately. */
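/* For example (illustrative values): with a vectorization factor of 8 and
   LOOP_VINFO_PEELING_FOR_ALIGNMENT equal to 3, the number of iterations to
   skip via masking is 8 - 3 == 5; each DR's OFFSET is then reduced by
   5 * DR_STEP and the first 5 elements of the first vector iteration are
   skipped via masking. */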
1763 void
1764 vect_prepare_for_masked_peels (loop_vec_info loop_vinfo)
1766 tree misalign_in_elems;
1767 tree type = LOOP_VINFO_MASK_COMPARE_TYPE (loop_vinfo);
1769 gcc_assert (vect_use_loop_mask_for_alignment_p (loop_vinfo));
1771 /* From the information recorded in LOOP_VINFO get the number of iterations
1772 that need to be skipped via masking. */
1773 if (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) > 0)
1775 poly_int64 misalign = (LOOP_VINFO_VECT_FACTOR (loop_vinfo)
1776 - LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo));
1777 misalign_in_elems = build_int_cst (type, misalign);
1779 else
1781 gimple_seq seq1 = NULL, seq2 = NULL;
1782 misalign_in_elems = get_misalign_in_elems (&seq1, loop_vinfo);
1783 misalign_in_elems = fold_convert (type, misalign_in_elems);
1784 misalign_in_elems = force_gimple_operand (misalign_in_elems,
1785 &seq2, true, NULL_TREE);
1786 gimple_seq_add_seq (&seq1, seq2);
1787 if (seq1)
1789 edge pe = loop_preheader_edge (LOOP_VINFO_LOOP (loop_vinfo));
1790 basic_block new_bb = gsi_insert_seq_on_edge_immediate (pe, seq1);
1791 gcc_assert (!new_bb);
1795 if (dump_enabled_p ())
1797 dump_printf_loc (MSG_NOTE, vect_location,
1798 "misalignment for fully-masked loop: ");
1799 dump_generic_expr (MSG_NOTE, TDF_SLIM, misalign_in_elems);
1800 dump_printf (MSG_NOTE, "\n");
1803 LOOP_VINFO_MASK_SKIP_NITERS (loop_vinfo) = misalign_in_elems;
1805 vect_update_inits_of_drs (loop_vinfo, misalign_in_elems, MINUS_EXPR);
1808 /* This function builds ni_name = number of iterations. Statements
1809 are emitted on the loop preheader edge. If NEW_VAR_P is not NULL, set
1810 it to TRUE if a new ssa_var is generated. */
1812 tree
1813 vect_build_loop_niters (loop_vec_info loop_vinfo, bool *new_var_p)
1815 tree ni = unshare_expr (LOOP_VINFO_NITERS (loop_vinfo));
1816 if (TREE_CODE (ni) == INTEGER_CST)
1817 return ni;
1818 else
1820 tree ni_name, var;
1821 gimple_seq stmts = NULL;
1822 edge pe = loop_preheader_edge (LOOP_VINFO_LOOP (loop_vinfo));
1824 var = create_tmp_var (TREE_TYPE (ni), "niters");
1825 ni_name = force_gimple_operand (ni, &stmts, false, var);
1826 if (stmts)
1828 gsi_insert_seq_on_edge_immediate (pe, stmts);
1829 if (new_var_p != NULL)
1830 *new_var_p = true;
1833 return ni_name;
1837 /* Calculate the number of iterations above which the vectorized loop will be
1838 preferred over the scalar loop. NITERS_PROLOG is the number of iterations
1839 of the prolog loop. If it's an integer constant, the value is also passed
1840 in INT_NITERS_PROLOG. BOUND_PROLOG is the upper bound (inclusive) of the
1841 number of iterations of the prolog loop. BOUND_EPILOG is the corresponding
1842 value for the epilog loop. If CHECK_PROFITABILITY is true, TH is the
1843 threshold below which the scalar (rather than vectorized) loop will be
1844 executed. This function stores the upper bound (inclusive) of the result
1845 in BOUND_SCALAR. */
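/* For example (illustrative values): with INT_NITERS_PROLOG == 2,
   BOUND_EPILOG == 3, CHECK_PROFITABILITY true and TH == 10, the scalar
   loop runs at most 10 - 1 == 9 iterations, so the function returns 9
   and sets *BOUND_SCALAR to 9. */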
1847 static tree
1848 vect_gen_scalar_loop_niters (tree niters_prolog, int int_niters_prolog,
1849 int bound_prolog, poly_int64 bound_epilog, int th,
1850 poly_uint64 *bound_scalar,
1851 bool check_profitability)
1853 tree type = TREE_TYPE (niters_prolog);
1854 tree niters = fold_build2 (PLUS_EXPR, type, niters_prolog,
1855 build_int_cst (type, bound_epilog));
1857 *bound_scalar = bound_prolog + bound_epilog;
1858 if (check_profitability)
1860 /* TH indicates the minimum niters of vectorized loop, while we
1861 compute the maximum niters of scalar loop. */
1862 th--;
1863 /* Peeling a constant number of times. */
1864 if (int_niters_prolog >= 0)
1866 *bound_scalar = upper_bound (int_niters_prolog + bound_epilog, th);
1867 return build_int_cst (type, *bound_scalar);
1869 /* Peeling an unknown number of times. Note that both BOUND_PROLOG
1870 and BOUND_EPILOG are inclusive upper bounds. */
1871 if (known_ge (th, bound_prolog + bound_epilog))
1873 *bound_scalar = th;
1874 return build_int_cst (type, th);
1876 /* Need to do runtime comparison. */
1877 else if (maybe_gt (th, bound_epilog))
1879 *bound_scalar = upper_bound (*bound_scalar, th);
1880 return fold_build2 (MAX_EXPR, type,
1881 build_int_cst (type, th), niters);
1884 return niters;
1887 /* NITERS is the number of times that the original scalar loop executes
1888 after peeling. Work out the maximum number of iterations N that can
1889 be handled by the vectorized form of the loop and then either:
1891 a) set *STEP_VECTOR_PTR to the vectorization factor and generate:
1893 niters_vector = N
1895 b) set *STEP_VECTOR_PTR to one and generate:
1897 niters_vector = N / vf
1899 In both cases, store niters_vector in *NITERS_VECTOR_PTR and add
1900 any new statements on the loop preheader edge. NITERS_NO_OVERFLOW
1901 is true if NITERS doesn't overflow (i.e. if NITERS is always nonzero). */
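/* For example (illustrative values, no peeling for gaps): with NITERS == 17,
   a constant VF of 4 and NITERS_NO_OVERFLOW true, case b) applies and we
   generate niters_vector = 17 >> 2 == 4 with *STEP_VECTOR_PTR == 1, leaving
   one iteration for the epilog; a fully-masked or variable-VF loop instead
   takes case a), with *STEP_VECTOR_PTR == VF. */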
1903 void
1904 vect_gen_vector_loop_niters (loop_vec_info loop_vinfo, tree niters,
1905 tree *niters_vector_ptr, tree *step_vector_ptr,
1906 bool niters_no_overflow)
1908 tree ni_minus_gap, var;
1909 tree niters_vector, step_vector, type = TREE_TYPE (niters);
1910 poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
1911 edge pe = loop_preheader_edge (LOOP_VINFO_LOOP (loop_vinfo));
1912 tree log_vf = NULL_TREE;
1914 /* If epilogue loop is required because of data accesses with gaps, we
1915 subtract one iteration from the total number of iterations here for
1916 correct calculation of RATIO. */
1917 if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo))
1919 ni_minus_gap = fold_build2 (MINUS_EXPR, type, niters,
1920 build_one_cst (type));
1921 if (!is_gimple_val (ni_minus_gap))
1923 var = create_tmp_var (type, "ni_gap");
1924 gimple *stmts = NULL;
1925 ni_minus_gap = force_gimple_operand (ni_minus_gap, &stmts,
1926 true, var);
1927 gsi_insert_seq_on_edge_immediate (pe, stmts);
1930 else
1931 ni_minus_gap = niters;
1933 unsigned HOST_WIDE_INT const_vf;
1934 if (vf.is_constant (&const_vf)
1935 && !LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
1937 /* Create: niters >> log2(vf) */
1938 /* If it's known that niters == number of latch executions + 1 doesn't
1939 overflow, we can generate niters >> log2(vf); otherwise we generate
1940 (niters - vf) >> log2(vf) + 1 by using the fact that we know ratio
1941 will be at least one. */
1942 log_vf = build_int_cst (type, exact_log2 (const_vf));
1943 if (niters_no_overflow)
1944 niters_vector = fold_build2 (RSHIFT_EXPR, type, ni_minus_gap, log_vf);
1945 else
1946 niters_vector
1947 = fold_build2 (PLUS_EXPR, type,
1948 fold_build2 (RSHIFT_EXPR, type,
1949 fold_build2 (MINUS_EXPR, type,
1950 ni_minus_gap,
1951 build_int_cst (type, vf)),
1952 log_vf),
1953 build_int_cst (type, 1));
1954 step_vector = build_one_cst (type);
1956 else
1958 niters_vector = ni_minus_gap;
1959 step_vector = build_int_cst (type, vf);
1962 if (!is_gimple_val (niters_vector))
1964 var = create_tmp_var (type, "bnd");
1965 gimple_seq stmts = NULL;
1966 niters_vector = force_gimple_operand (niters_vector, &stmts, true, var);
1967 gsi_insert_seq_on_edge_immediate (pe, stmts);
1968 /* The peeling algorithm guarantees that the vector loop bound is at least ONE,
1969 so we set range information to make the niters analyzer's life easier. */
1970 if (stmts != NULL && log_vf)
1971 set_range_info (niters_vector, VR_RANGE,
1972 wi::to_wide (build_int_cst (type, 1)),
1973 wi::to_wide (fold_build2 (RSHIFT_EXPR, type,
1974 TYPE_MAX_VALUE (type),
1975 log_vf)));
1977 *niters_vector_ptr = niters_vector;
1978 *step_vector_ptr = step_vector;
1980 return;
1983 /* Given NITERS_VECTOR which is the number of iterations for vectorized
1984 loop specified by LOOP_VINFO after vectorization, compute the number
1985 of iterations before vectorization (niters_vector * vf) and store it
1986 to NITERS_VECTOR_MULT_VF_PTR. */
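/* E.g. (illustrative values): with NITERS_VECTOR == 4 and a constant VF of 4,
   the result is 4 << 2 == 16 scalar iterations. */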
1988 static void
1989 vect_gen_vector_loop_niters_mult_vf (loop_vec_info loop_vinfo,
1990 tree niters_vector,
1991 tree *niters_vector_mult_vf_ptr)
1993 /* We should be using a step_vector of VF if VF is variable. */
1994 int vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo).to_constant ();
1995 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1996 tree type = TREE_TYPE (niters_vector);
1997 tree log_vf = build_int_cst (type, exact_log2 (vf));
1998 basic_block exit_bb = single_exit (loop)->dest;
2000 gcc_assert (niters_vector_mult_vf_ptr != NULL);
2001 tree niters_vector_mult_vf = fold_build2 (LSHIFT_EXPR, type,
2002 niters_vector, log_vf);
2003 if (!is_gimple_val (niters_vector_mult_vf))
2005 tree var = create_tmp_var (type, "niters_vector_mult_vf");
2006 gimple_seq stmts = NULL;
2007 niters_vector_mult_vf = force_gimple_operand (niters_vector_mult_vf,
2008 &stmts, true, var);
2009 gimple_stmt_iterator gsi = gsi_start_bb (exit_bb);
2010 gsi_insert_seq_before (&gsi, stmts, GSI_SAME_STMT);
2012 *niters_vector_mult_vf_ptr = niters_vector_mult_vf;
2015 /* Function slpeel_tree_duplicate_loop_to_edge_cfg duplicates FIRST/SECOND
2016 from SECOND/FIRST and puts it at the original loop's preheader/exit
2017 edge; the two loops are arranged as below:
2019 preheader_a:
2020 first_loop:
2021 header_a:
2022 i_1 = PHI<i_0, i_2>;
2024 i_2 = i_1 + 1;
2025 if (cond_a)
2026 goto latch_a;
2027 else
2028 goto between_bb;
2029 latch_a:
2030 goto header_a;
2032 between_bb:
2033 ;; i_x = PHI<i_2>; ;; LCSSA phi node to be created for FIRST,
2035 second_loop:
2036 header_b:
2037 i_3 = PHI<i_0, i_4>; ;; Use of i_0 to be replaced with i_x,
2038 or with i_2 if no LCSSA phi is created
2039 under condition of CREATE_LCSSA_FOR_IV_PHIS.
2041 i_4 = i_3 + 1;
2042 if (cond_b)
2043 goto latch_b;
2044 else
2045 goto exit_bb;
2046 latch_b:
2047 goto header_b;
2049 exit_bb:
2051 This function creates loop closed SSA for the first loop and updates the
2052 second loop's PHI nodes by replacing the argument on the incoming edge with
2053 the result of the newly created lcssa PHI nodes. If CREATE_LCSSA_FOR_IV_PHIS
2054 is false, loop closed ssa phis will only be created for non-iv phis for
2055 the first loop.
2057 This function assumes the exit bb of the first loop is the preheader bb of
2058 the second loop, i.e., between_bb in the example code. With PHIs updated,
2059 the second loop will execute the remaining iterations of the first. */
2061 static void
2062 slpeel_update_phi_nodes_for_loops (loop_vec_info loop_vinfo,
2063 struct loop *first, struct loop *second,
2064 bool create_lcssa_for_iv_phis)
2066 gphi_iterator gsi_update, gsi_orig;
2067 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
2069 edge first_latch_e = EDGE_SUCC (first->latch, 0);
2070 edge second_preheader_e = loop_preheader_edge (second);
2071 basic_block between_bb = single_exit (first)->dest;
2073 gcc_assert (between_bb == second_preheader_e->src);
2074 gcc_assert (single_pred_p (between_bb) && single_succ_p (between_bb));
2075 /* Either the first loop or the second is the loop to be vectorized. */
2076 gcc_assert (loop == first || loop == second);
2078 for (gsi_orig = gsi_start_phis (first->header),
2079 gsi_update = gsi_start_phis (second->header);
2080 !gsi_end_p (gsi_orig) && !gsi_end_p (gsi_update);
2081 gsi_next (&gsi_orig), gsi_next (&gsi_update))
2083 gphi *orig_phi = gsi_orig.phi ();
2084 gphi *update_phi = gsi_update.phi ();
2086 tree arg = PHI_ARG_DEF_FROM_EDGE (orig_phi, first_latch_e);
2087 /* Generate lcssa PHI node for the first loop. */
2088 gphi *vect_phi = (loop == first) ? orig_phi : update_phi;
2089 if (create_lcssa_for_iv_phis || !iv_phi_p (vect_phi))
2091 tree new_res = copy_ssa_name (PHI_RESULT (orig_phi));
2092 gphi *lcssa_phi = create_phi_node (new_res, between_bb);
2093 add_phi_arg (lcssa_phi, arg, single_exit (first), UNKNOWN_LOCATION);
2094 arg = new_res;
2097 /* Update PHI node in the second loop by replacing arg on the loop's
2098 incoming edge. */
2099 adjust_phi_and_debug_stmts (update_phi, second_preheader_e, arg);
2103 /* Function slpeel_add_loop_guard adds guard skipping from the beginning
2104 of SKIP_LOOP to the beginning of UPDATE_LOOP. GUARD_EDGE and MERGE_EDGE
2105 are two pred edges of the merge point before UPDATE_LOOP. The two loops
2106 appear like below:
2108 guard_bb:
2109 if (cond)
2110 goto merge_bb;
2111 else
2112 goto skip_loop;
2114 skip_loop:
2115 header_a:
2116 i_1 = PHI<i_0, i_2>;
2118 i_2 = i_1 + 1;
2119 if (cond_a)
2120 goto latch_a;
2121 else
2122 goto exit_a;
2123 latch_a:
2124 goto header_a;
2126 exit_a:
2127 i_5 = PHI<i_2>;
2129 merge_bb:
2130 ;; PHI (i_x = PHI<i_0, i_5>) to be created at merge point.
2132 update_loop:
2133 header_b:
2134 i_3 = PHI<i_5, i_4>; ;; Use of i_5 to be replaced with i_x.
2136 i_4 = i_3 + 1;
2137 if (cond_b)
2138 goto latch_b;
2139 else
2140 goto exit_bb;
2141 latch_b:
2142 goto header_b;
2144 exit_bb:
2146 This function creates PHI nodes at merge_bb and replaces the use of i_5
2147 in the update_loop's PHI node with the result of the new PHI. */
2149 static void
2150 slpeel_update_phi_nodes_for_guard1 (struct loop *skip_loop,
2151 struct loop *update_loop,
2152 edge guard_edge, edge merge_edge)
2154 source_location merge_loc, guard_loc;
2155 edge orig_e = loop_preheader_edge (skip_loop);
2156 edge update_e = loop_preheader_edge (update_loop);
2157 gphi_iterator gsi_orig, gsi_update;
2159 for ((gsi_orig = gsi_start_phis (skip_loop->header),
2160 gsi_update = gsi_start_phis (update_loop->header));
2161 !gsi_end_p (gsi_orig) && !gsi_end_p (gsi_update);
2162 gsi_next (&gsi_orig), gsi_next (&gsi_update))
2164 gphi *orig_phi = gsi_orig.phi ();
2165 gphi *update_phi = gsi_update.phi ();
2167 /* Generate new phi node at merge bb of the guard. */
2168 tree new_res = copy_ssa_name (PHI_RESULT (orig_phi));
2169 gphi *new_phi = create_phi_node (new_res, guard_edge->dest);
2171 /* Merge bb has two incoming edges: GUARD_EDGE and MERGE_EDGE. Set the
2172 args in NEW_PHI for these edges. */
2173 tree merge_arg = PHI_ARG_DEF_FROM_EDGE (update_phi, update_e);
2174 tree guard_arg = PHI_ARG_DEF_FROM_EDGE (orig_phi, orig_e);
2175 merge_loc = gimple_phi_arg_location_from_edge (update_phi, update_e);
2176 guard_loc = gimple_phi_arg_location_from_edge (orig_phi, orig_e);
2177 add_phi_arg (new_phi, merge_arg, merge_edge, merge_loc);
2178 add_phi_arg (new_phi, guard_arg, guard_edge, guard_loc);
2180 /* Update phi in UPDATE_PHI. */
2181 adjust_phi_and_debug_stmts (update_phi, update_e, new_res);
2185 /* LCSSA_PHI is an lcssa phi of the EPILOG loop, which is copied from LOOP;
2186 this function searches for the corresponding lcssa phi node in the exit
2187 bb of LOOP. If it is found, return the phi result; otherwise return
2188 NULL. */
2190 static tree
2191 find_guard_arg (struct loop *loop, struct loop *epilog ATTRIBUTE_UNUSED,
2192 gphi *lcssa_phi)
2194 gphi_iterator gsi;
2195 edge e = single_exit (loop);
2197 gcc_assert (single_pred_p (e->dest));
2198 for (gsi = gsi_start_phis (e->dest); !gsi_end_p (gsi); gsi_next (&gsi))
2200 gphi *phi = gsi.phi ();
2201 if (operand_equal_p (PHI_ARG_DEF (phi, 0),
2202 PHI_ARG_DEF (lcssa_phi, 0), 0))
2203 return PHI_RESULT (phi);
2205 return NULL_TREE;
2208 /* LOOP and EPILOG are two consecutive loops in CFG and EPILOG is copied
2209 from LOOP. Function slpeel_add_loop_guard adds guard skipping from a
2210 point between the two loops to the end of EPILOG. Edges GUARD_EDGE
2211 and MERGE_EDGE are the two pred edges of merge_bb at the end of EPILOG.
2212 The CFG looks like:
2214 loop:
2215 header_a:
2216 i_1 = PHI<i_0, i_2>;
2218 i_2 = i_1 + 1;
2219 if (cond_a)
2220 goto latch_a;
2221 else
2222 goto exit_a;
2223 latch_a:
2224 goto header_a;
2226 exit_a:
2228 guard_bb:
2229 if (cond)
2230 goto merge_bb;
2231 else
2232 goto epilog_loop;
2234 ;; fall_through_bb
2236 epilog_loop:
2237 header_b:
2238 i_3 = PHI<i_2, i_4>;
2240 i_4 = i_3 + 1;
2241 if (cond_b)
2242 goto latch_b;
2243 else
2244 goto merge_bb;
2245 latch_b:
2246 goto header_b;
2248 merge_bb:
2249 ; PHI node (i_y = PHI<i_2, i_4>) to be created at merge point.
2251 exit_bb:
2252 i_x = PHI<i_4>; ;Use of i_4 to be replaced with i_y in merge_bb.
2254 For each name used outside EPILOG (i.e. for each name that has an lcssa
2255 phi in exit_bb) we create a new PHI in merge_bb. The new PHI has two
2256 args corresponding to GUARD_EDGE and MERGE_EDGE. The arg for MERGE_EDGE is
2257 the arg of the original PHI in exit_bb; the arg for GUARD_EDGE is defined
2258 by LOOP and is found in the exit bb of LOOP. The arg of the original PHI
2259 in exit_bb will also be updated. */
2261 static void
2262 slpeel_update_phi_nodes_for_guard2 (struct loop *loop, struct loop *epilog,
2263 edge guard_edge, edge merge_edge)
2265 gphi_iterator gsi;
2266 basic_block merge_bb = guard_edge->dest;
2268 gcc_assert (single_succ_p (merge_bb));
2269 edge e = single_succ_edge (merge_bb);
2270 basic_block exit_bb = e->dest;
2271 gcc_assert (single_pred_p (exit_bb));
2272 gcc_assert (single_pred (exit_bb) == single_exit (epilog)->dest);
2274 for (gsi = gsi_start_phis (exit_bb); !gsi_end_p (gsi); gsi_next (&gsi))
2276 gphi *update_phi = gsi.phi ();
2277 tree old_arg = PHI_ARG_DEF (update_phi, 0);
2278 /* This loop-closed-phi actually doesn't represent a use out of the
2279 loop - the phi arg is a constant. */
2280 if (TREE_CODE (old_arg) != SSA_NAME)
2281 continue;
2283 tree merge_arg = get_current_def (old_arg);
2284 if (!merge_arg)
2285 merge_arg = old_arg;
2287 tree guard_arg = find_guard_arg (loop, epilog, update_phi);
2288 /* If the var is live after loop but not a reduction, we simply
2289 use the old arg. */
2290 if (!guard_arg)
2291 guard_arg = old_arg;
2293 /* Create new phi node in MERGE_BB: */
2294 tree new_res = copy_ssa_name (PHI_RESULT (update_phi));
2295 gphi *merge_phi = create_phi_node (new_res, merge_bb);
2297 /* MERGE_BB has two incoming edges: GUARD_EDGE and MERGE_EDGE, Set
2298 the two PHI args in merge_phi for these edges. */
2299 add_phi_arg (merge_phi, merge_arg, merge_edge, UNKNOWN_LOCATION);
2300 add_phi_arg (merge_phi, guard_arg, guard_edge, UNKNOWN_LOCATION);
2302 /* Update the original phi in exit_bb. */
2303 adjust_phi_and_debug_stmts (update_phi, e, new_res);
2307 /* EPILOG loop is duplicated from the original loop for vectorizing,
2308 the arg of its loop closed ssa PHI needs to be updated. */
2310 static void
2311 slpeel_update_phi_nodes_for_lcssa (struct loop *epilog)
2313 gphi_iterator gsi;
2314 basic_block exit_bb = single_exit (epilog)->dest;
2316 gcc_assert (single_pred_p (exit_bb));
2317 edge e = EDGE_PRED (exit_bb, 0);
2318 for (gsi = gsi_start_phis (exit_bb); !gsi_end_p (gsi); gsi_next (&gsi))
2319 rename_use_op (PHI_ARG_DEF_PTR_FROM_EDGE (gsi.phi (), e));
2322 /* Function vect_do_peeling.
2324 Input:
2325 - LOOP_VINFO: Represent a loop to be vectorized, which looks like:
2327 preheader:
2328 LOOP:
2329 header_bb:
2330 loop_body
2331 if (exit_loop_cond) goto exit_bb
2332 else goto header_bb
2333 exit_bb:
2335 - NITERS: The number of iterations of the loop.
2336 - NITERSM1: The number of iterations of the loop's latch.
2337 - NITERS_NO_OVERFLOW: No overflow in computing NITERS.
2338 - TH, CHECK_PROFITABILITY: Threshold of niters to vectorize loop if
2339 CHECK_PROFITABILITY is true.
2340 Output:
2341 - *NITERS_VECTOR and *STEP_VECTOR describe how the main loop should
2342 iterate after vectorization; see vect_set_loop_condition for details.
2343 - *NITERS_VECTOR_MULT_VF_VAR is either null or an SSA name that
2344 should be set to the number of scalar iterations handled by the
2345 vector loop. The SSA name is only used on exit from the loop.
2347 This function peels the prolog and epilog from the loop and adds guards that
2348 skip PROLOG and EPILOG under various conditions. As a result, the changed CFG
2349 would look like:
2351 guard_bb_1:
2352 if (prefer_scalar_loop) goto merge_bb_1
2353 else goto guard_bb_2
2355 guard_bb_2:
2356 if (skip_prolog) goto merge_bb_2
2357 else goto prolog_preheader
2359 prolog_preheader:
2360 PROLOG:
2361 prolog_header_bb:
2362 prolog_body
2363 if (exit_prolog_cond) goto prolog_exit_bb
2364 else goto prolog_header_bb
2365 prolog_exit_bb:
2367 merge_bb_2:
2369 vector_preheader:
2370 VECTOR LOOP:
2371 vector_header_bb:
2372 vector_body
2373 if (exit_vector_cond) goto vector_exit_bb
2374 else goto vector_header_bb
2375 vector_exit_bb:
2377 guard_bb_3:
2378 if (skip_epilog) goto merge_bb_3
2379 else goto epilog_preheader
2381 merge_bb_1:
2383 epilog_preheader:
2384 EPILOG:
2385 epilog_header_bb:
2386 epilog_body
2387 if (exit_epilog_cond) goto merge_bb_3
2388 else goto epilog_header_bb
2390 merge_bb_3:
2392 Note this function peels the prolog and epilog, and adds the guards, only
2393 when necessary.
2394 Returns created epilogue or NULL.
2396 TODO: Guard for prefer_scalar_loop should be emitted along with
2397 versioning conditions if loop versioning is needed. */
2400 struct loop *
2401 vect_do_peeling (loop_vec_info loop_vinfo, tree niters, tree nitersm1,
2402 tree *niters_vector, tree *step_vector,
2403 tree *niters_vector_mult_vf_var, int th,
2404 bool check_profitability, bool niters_no_overflow)
2406 edge e, guard_e;
2407 tree type = TREE_TYPE (niters), guard_cond;
2408 basic_block guard_bb, guard_to;
2409 profile_probability prob_prolog, prob_vector, prob_epilog;
2410 int estimated_vf;
2411 int prolog_peeling = 0;
2412 if (!vect_use_loop_mask_for_alignment_p (loop_vinfo))
2413 prolog_peeling = LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo);
2415 poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
2416 poly_uint64 bound_epilog = 0;
2417 if (!LOOP_VINFO_FULLY_MASKED_P (loop_vinfo)
2418 && LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo))
2419 bound_epilog += vf - 1;
2420 if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo))
2421 bound_epilog += 1;
2422 bool epilog_peeling = maybe_ne (bound_epilog, 0U);
2423 poly_uint64 bound_scalar = bound_epilog;
2425 if (!prolog_peeling && !epilog_peeling)
2426 return NULL;
2428 prob_vector = profile_probability::guessed_always ().apply_scale (9, 10);
2429 estimated_vf = vect_vf_for_cost (loop_vinfo);
2430 if (estimated_vf == 2)
2431 estimated_vf = 3;
2432 prob_prolog = prob_epilog = profile_probability::guessed_always ()
2433 .apply_scale (estimated_vf - 1, estimated_vf);
2435 struct loop *prolog, *epilog = NULL, *loop = LOOP_VINFO_LOOP (loop_vinfo);
2436 struct loop *first_loop = loop;
2437 bool irred_flag = loop_preheader_edge (loop)->flags & EDGE_IRREDUCIBLE_LOOP;
2438 create_lcssa_for_virtual_phi (loop);
2439 update_ssa (TODO_update_ssa_only_virtuals);
2441 if (MAY_HAVE_DEBUG_BIND_STMTS)
2443 gcc_assert (!adjust_vec.exists ());
2444 adjust_vec.create (32);
2446 initialize_original_copy_tables ();
2448 /* Record the anchor bb at which the guard should be placed if the scalar
2449 loop might be preferred. */
2450 basic_block anchor = loop_preheader_edge (loop)->src;
2452 /* Generate the number of iterations for the prolog loop. We do this here
2453 so that we can also get the upper bound on the number of iterations. */
2454 tree niters_prolog;
2455 int bound_prolog = 0;
2456 if (prolog_peeling)
2457 niters_prolog = vect_gen_prolog_loop_niters (loop_vinfo, anchor,
2458 &bound_prolog);
2459 else
2460 niters_prolog = build_int_cst (type, 0);
2462 /* Prolog loop may be skipped. */
2463 bool skip_prolog = (prolog_peeling != 0);
2464 /* Skip to the epilog if the scalar loop may be preferred. It's only needed
2465 when we peel for the epilog loop and when this hasn't been checked by
2466 loop versioning. */
2467 bool skip_vector = (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
2468 ? maybe_lt (LOOP_VINFO_INT_NITERS (loop_vinfo),
2469 bound_prolog + bound_epilog)
2470 : !LOOP_REQUIRES_VERSIONING (loop_vinfo));
2471 /* The epilog loop must be executed if the number of iterations for the
2472 epilog loop is known at compile time; otherwise we need to add a check at
2473 the end of the vector loop and skip to the end of the epilog loop. */
2474 bool skip_epilog = (prolog_peeling < 0
2475 || !LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
2476 || !vf.is_constant ());
2477 /* PEELING_FOR_GAPS is special because epilog loop must be executed. */
2478 if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo))
2479 skip_epilog = false;
2481 if (skip_vector)
2483 split_edge (loop_preheader_edge (loop));
2485 /* Due to the order in which we peel prolog and epilog, we first
2486 propagate probability to the whole loop. The purpose is to
2487 avoid adjusting probabilities of both prolog and vector loops
2488 separately. Note in this case, the probability of epilog loop
2489 needs to be scaled back later. */
2490 basic_block bb_before_loop = loop_preheader_edge (loop)->src;
2491 if (prob_vector.initialized_p ())
2493 scale_bbs_frequencies (&bb_before_loop, 1, prob_vector);
2494 scale_loop_profile (loop, prob_vector, 0);
2498 source_location loop_loc = find_loop_location (loop);
2499 struct loop *scalar_loop = LOOP_VINFO_SCALAR_LOOP (loop_vinfo);
2500 if (prolog_peeling)
2502 e = loop_preheader_edge (loop);
2503 if (!slpeel_can_duplicate_loop_p (loop, e))
2505 dump_printf_loc (MSG_MISSED_OPTIMIZATION, loop_loc,
2506 "loop can't be duplicated to preheader edge.\n");
2507 gcc_unreachable ();
2509 /* Peel prolog and put it on preheader edge of loop. */
2510 prolog = slpeel_tree_duplicate_loop_to_edge_cfg (loop, scalar_loop, e);
2511 if (!prolog)
2513 dump_printf_loc (MSG_MISSED_OPTIMIZATION, loop_loc,
2514 "slpeel_tree_duplicate_loop_to_edge_cfg failed.\n");
2515 gcc_unreachable ();
2517 slpeel_update_phi_nodes_for_loops (loop_vinfo, prolog, loop, true);
2518 first_loop = prolog;
2519 reset_original_copy_tables ();
2521 /* Update the number of iterations for prolog loop. */
2522 tree step_prolog = build_one_cst (TREE_TYPE (niters_prolog));
2523 vect_set_loop_condition (prolog, NULL, niters_prolog,
2524 step_prolog, NULL_TREE, false);
2526 /* Skip the prolog loop. */
2527 if (skip_prolog)
2529 guard_cond = fold_build2 (EQ_EXPR, boolean_type_node,
2530 niters_prolog, build_int_cst (type, 0));
2531 guard_bb = loop_preheader_edge (prolog)->src;
2532 basic_block bb_after_prolog = loop_preheader_edge (loop)->src;
2533 guard_to = split_edge (loop_preheader_edge (loop));
2534 guard_e = slpeel_add_loop_guard (guard_bb, guard_cond,
2535 guard_to, guard_bb,
2536 prob_prolog.invert (),
2537 irred_flag);
2538 e = EDGE_PRED (guard_to, 0);
2539 e = (e != guard_e ? e : EDGE_PRED (guard_to, 1));
2540 slpeel_update_phi_nodes_for_guard1 (prolog, loop, guard_e, e);
2542 scale_bbs_frequencies (&bb_after_prolog, 1, prob_prolog);
2543 scale_loop_profile (prolog, prob_prolog, bound_prolog);
2545 /* Update init address of DRs. */
2546 vect_update_inits_of_drs (loop_vinfo, niters_prolog, PLUS_EXPR);
2547 /* Update niters for vector loop. */
2548 LOOP_VINFO_NITERS (loop_vinfo)
2549 = fold_build2 (MINUS_EXPR, type, niters, niters_prolog);
2550 LOOP_VINFO_NITERSM1 (loop_vinfo)
2551 = fold_build2 (MINUS_EXPR, type,
2552 LOOP_VINFO_NITERSM1 (loop_vinfo), niters_prolog);
2553 bool new_var_p = false;
2554 niters = vect_build_loop_niters (loop_vinfo, &new_var_p);
2555 /* It's guaranteed that the vector loop bound before vectorization is at
2556 least VF, so set range information for the newly generated var. */
2557 if (new_var_p)
2558 set_range_info (niters, VR_RANGE,
2559 wi::to_wide (build_int_cst (type, vf)),
2560 wi::to_wide (TYPE_MAX_VALUE (type)));
2562 /* The prolog iterates at most bound_prolog times; its latch iterates at
2563 most bound_prolog - 1 times. */
2564 record_niter_bound (prolog, bound_prolog - 1, false, true);
2565 delete_update_ssa ();
2566 adjust_vec_debug_stmts ();
2567 scev_reset ();
2570 if (epilog_peeling)
2572 e = single_exit (loop);
2573 if (!slpeel_can_duplicate_loop_p (loop, e))
2575 dump_printf_loc (MSG_MISSED_OPTIMIZATION, loop_loc,
2576 "loop can't be duplicated to exit edge.\n");
2577 gcc_unreachable ();
2579 /* Peel epilog and put it on exit edge of loop. */
2580 epilog = slpeel_tree_duplicate_loop_to_edge_cfg (loop, scalar_loop, e);
2581 if (!epilog)
2583 dump_printf_loc (MSG_MISSED_OPTIMIZATION, loop_loc,
2584 "slpeel_tree_duplicate_loop_to_edge_cfg failed.\n");
2585 gcc_unreachable ();
2587 slpeel_update_phi_nodes_for_loops (loop_vinfo, loop, epilog, false);
2589 /* The scalar version of the loop may be preferred. In this case, add a
2590 guard and skip to the epilog. Note this only happens when the number of
2591 iterations of the loop is unknown at compile time; otherwise it
2592 won't be vectorized. */
2593 if (skip_vector)
2595 /* An additional epilogue iteration is peeled if a gap exists. */
2596 tree t = vect_gen_scalar_loop_niters (niters_prolog, prolog_peeling,
2597 bound_prolog, bound_epilog,
2598 th, &bound_scalar,
2599 check_profitability);
2600 /* Build guard against NITERSM1 since NITERS may overflow. */
2601 guard_cond = fold_build2 (LT_EXPR, boolean_type_node, nitersm1, t);
2602 guard_bb = anchor;
2603 guard_to = split_edge (loop_preheader_edge (epilog));
2604 guard_e = slpeel_add_loop_guard (guard_bb, guard_cond,
2605 guard_to, guard_bb,
2606 prob_vector.invert (),
2607 irred_flag);
2608 e = EDGE_PRED (guard_to, 0);
2609 e = (e != guard_e ? e : EDGE_PRED (guard_to, 1));
2610 slpeel_update_phi_nodes_for_guard1 (first_loop, epilog, guard_e, e);
2612 /* Simply propagate profile info from guard_bb to guard_to which is
2613 a merge point of control flow. */
2614 guard_to->count = guard_bb->count;
2616 /* Scale probability of epilog loop back.
2617 FIXME: We should avoid scaling down and back up. Profile may
2618 get lost if we scale down to 0. */
2619 basic_block *bbs = get_loop_body (epilog);
2620 for (unsigned int i = 0; i < epilog->num_nodes; i++)
2621 bbs[i]->count = bbs[i]->count.apply_scale
2622 (bbs[i]->count,
2623 bbs[i]->count.apply_probability
2624 (prob_vector));
2625 free (bbs);
2628 basic_block bb_before_epilog = loop_preheader_edge (epilog)->src;
2629 tree niters_vector_mult_vf;
2630 /* If the loop is peeled a non-zero constant number of times, niters now
2631 refers to orig_niters - prolog_peeling; it won't overflow even if
2632 orig_niters overflows. */
2633 niters_no_overflow |= (prolog_peeling > 0);
2634 vect_gen_vector_loop_niters (loop_vinfo, niters,
2635 niters_vector, step_vector,
2636 niters_no_overflow);
2637 if (!integer_onep (*step_vector))
2639 /* On exit from the loop we will have an easy way of calculating
2640 NITERS_VECTOR / STEP * STEP. Install a dummy definition
2641 until then. */
2642 niters_vector_mult_vf = make_ssa_name (TREE_TYPE (*niters_vector));
2643 SSA_NAME_DEF_STMT (niters_vector_mult_vf) = gimple_build_nop ();
2644 *niters_vector_mult_vf_var = niters_vector_mult_vf;
2646 else
2647 vect_gen_vector_loop_niters_mult_vf (loop_vinfo, *niters_vector,
2648 &niters_vector_mult_vf);
2649 /* Update IVs of original loop as if they were advanced by
2650 niters_vector_mult_vf steps. */
2651 gcc_checking_assert (vect_can_advance_ivs_p (loop_vinfo));
2652 edge update_e = skip_vector ? e : loop_preheader_edge (epilog);
2653 vect_update_ivs_after_vectorizer (loop_vinfo, niters_vector_mult_vf,
2654 update_e);
2656 if (skip_epilog)
2658 guard_cond = fold_build2 (EQ_EXPR, boolean_type_node,
2659 niters, niters_vector_mult_vf);
2660 guard_bb = single_exit (loop)->dest;
2661 guard_to = split_edge (single_exit (epilog));
2662 guard_e = slpeel_add_loop_guard (guard_bb, guard_cond, guard_to,
2663 skip_vector ? anchor : guard_bb,
2664 prob_epilog.invert (),
2665 irred_flag);
2666 slpeel_update_phi_nodes_for_guard2 (loop, epilog, guard_e,
2667 single_exit (epilog));
2668 /* Only need to handle basic block before epilog loop if it's not
2669 the guard_bb, which is the case when skip_vector is true. */
2670 if (guard_bb != bb_before_epilog)
2672 prob_epilog = prob_vector * prob_epilog + prob_vector.invert ();
2674 scale_bbs_frequencies (&bb_before_epilog, 1, prob_epilog);
2676 scale_loop_profile (epilog, prob_epilog, 0);
2678 else
2679 slpeel_update_phi_nodes_for_lcssa (epilog);
2681 unsigned HOST_WIDE_INT bound;
2682 if (bound_scalar.is_constant (&bound))
2684 gcc_assert (bound != 0);
2685 /* -1 to convert loop iterations to latch iterations. */
2686 record_niter_bound (epilog, bound - 1, false, true);
2689 delete_update_ssa ();
2690 adjust_vec_debug_stmts ();
2691 scev_reset ();
2693 adjust_vec.release ();
2694 free_original_copy_tables ();
2696 return epilog;
2699 /* Function vect_create_cond_for_niters_checks.
2701 Create a conditional expression that represents the run-time checks for
2702 the loop's niter. The loop is guaranteed to terminate if the run-time
2703 checks hold.
2705 Input:
2706 COND_EXPR - input conditional expression. New conditions will be chained
2707 with logical AND operation. If it is NULL, the niters
2708 assumptions are used on their own.
2709 LOOP_VINFO - field LOOP_VINFO_NITERS_ASSUMPTIONS contains the assumptions
2710 to be checked.
2712 Output:
2713 COND_EXPR - conditional expression.
2715 The returned COND_EXPR is the conditional expression to be used in the
2716 if statement that controls which version of the loop gets executed at
2717 runtime. */
2719 static void
2720 vect_create_cond_for_niters_checks (loop_vec_info loop_vinfo, tree *cond_expr)
2722 tree part_cond_expr = LOOP_VINFO_NITERS_ASSUMPTIONS (loop_vinfo);
2724 if (*cond_expr)
2725 *cond_expr = fold_build2 (TRUTH_AND_EXPR, boolean_type_node,
2726 *cond_expr, part_cond_expr);
2727 else
2728 *cond_expr = part_cond_expr;
2731 /* Set *COND_EXPR to a tree that is true when both the original *COND_EXPR
2732 and PART_COND_EXPR are true. Treat a null *COND_EXPR as "true". */
2734 static void
2735 chain_cond_expr (tree *cond_expr, tree part_cond_expr)
2737 if (*cond_expr)
2738 *cond_expr = fold_build2 (TRUTH_AND_EXPR, boolean_type_node,
2739 *cond_expr, part_cond_expr);
2740 else
2741 *cond_expr = part_cond_expr;
2744 /* Function vect_create_cond_for_align_checks.
2746 Create a conditional expression that represents the alignment checks for
2747 all of the data references (array element references) whose alignment must be
2748 checked at runtime.
2750 Input:
2751 COND_EXPR - input conditional expression. New conditions will be chained
2752 with logical AND operation.
2753 LOOP_VINFO - two fields of the loop information are used.
2754 LOOP_VINFO_PTR_MASK is the mask used to check the alignment.
2755 LOOP_VINFO_MAY_MISALIGN_STMTS contains the refs to be checked.
2757 Output:
2758 COND_EXPR_STMT_LIST - statements needed to construct the conditional
2759 expression.
2760 The returned value is the conditional expression to be used in the if
2761 statement that controls which version of the loop gets executed at runtime.
2763 The algorithm makes two assumptions:
2764 1) The number of bytes "n" in a vector is a power of 2.
2765 2) An address "a" is aligned if a%n is zero, and this
2766 test can be done as a&(n-1) == 0. For example, for 16
2767 byte vectors the test is a&0xf == 0. */
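/* For instance (illustrative): for two data references and 16-byte vectors
   (mask == 0xf), the generated condition is roughly
   (((int) addr_a | (int) addr_b) & 0xf) == 0. */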
2769 static void
2770 vect_create_cond_for_align_checks (loop_vec_info loop_vinfo,
2771 tree *cond_expr,
2772 gimple_seq *cond_expr_stmt_list)
2774 vec<gimple *> may_misalign_stmts
2775 = LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo);
2776 gimple *ref_stmt;
2777 int mask = LOOP_VINFO_PTR_MASK (loop_vinfo);
2778 tree mask_cst;
2779 unsigned int i;
2780 tree int_ptrsize_type;
2781 char tmp_name[20];
2782 tree or_tmp_name = NULL_TREE;
2783 tree and_tmp_name;
2784 gimple *and_stmt;
2785 tree ptrsize_zero;
2786 tree part_cond_expr;
2788 /* Check that mask is one less than a power of 2, i.e., mask is
2789 all zeros followed by all ones. */
2790 gcc_assert ((mask != 0) && ((mask & (mask+1)) == 0));
2792 int_ptrsize_type = signed_type_for (ptr_type_node);
2794 /* Create expression (mask & (dr_1 | ... | dr_n)) where dr_i is the address
2795 of the first vector of the i'th data reference. */
2797 FOR_EACH_VEC_ELT (may_misalign_stmts, i, ref_stmt)
2799 gimple_seq new_stmt_list = NULL;
2800 tree addr_base;
2801 tree addr_tmp_name;
2802 tree new_or_tmp_name;
2803 gimple *addr_stmt, *or_stmt;
2804 stmt_vec_info stmt_vinfo = vinfo_for_stmt (ref_stmt);
2805 tree vectype = STMT_VINFO_VECTYPE (stmt_vinfo);
2806 bool negative = tree_int_cst_compare
2807 (DR_STEP (STMT_VINFO_DATA_REF (stmt_vinfo)), size_zero_node) < 0;
2808 tree offset = negative
2809 ? size_int (-TYPE_VECTOR_SUBPARTS (vectype) + 1) : size_zero_node;
2811 /* create: addr_tmp = (int)(address_of_first_vector) */
2812 addr_base =
2813 vect_create_addr_base_for_vector_ref (ref_stmt, &new_stmt_list,
2814 offset);
2815 if (new_stmt_list != NULL)
2816 gimple_seq_add_seq (cond_expr_stmt_list, new_stmt_list);
2818 sprintf (tmp_name, "addr2int%d", i);
2819 addr_tmp_name = make_temp_ssa_name (int_ptrsize_type, NULL, tmp_name);
2820 addr_stmt = gimple_build_assign (addr_tmp_name, NOP_EXPR, addr_base);
2821 gimple_seq_add_stmt (cond_expr_stmt_list, addr_stmt);
2823 /* The addresses are ORed together. */
2825 if (or_tmp_name != NULL_TREE)
2827 /* create: or_tmp = or_tmp | addr_tmp */
2828 sprintf (tmp_name, "orptrs%d", i);
2829 new_or_tmp_name = make_temp_ssa_name (int_ptrsize_type, NULL, tmp_name);
2830 or_stmt = gimple_build_assign (new_or_tmp_name, BIT_IOR_EXPR,
2831 or_tmp_name, addr_tmp_name);
2832 gimple_seq_add_stmt (cond_expr_stmt_list, or_stmt);
2833 or_tmp_name = new_or_tmp_name;
2835 else
2836 or_tmp_name = addr_tmp_name;
2838 } /* end for i */
2840 mask_cst = build_int_cst (int_ptrsize_type, mask);
2842 /* create: and_tmp = or_tmp & mask */
2843 and_tmp_name = make_temp_ssa_name (int_ptrsize_type, NULL, "andmask");
2845 and_stmt = gimple_build_assign (and_tmp_name, BIT_AND_EXPR,
2846 or_tmp_name, mask_cst);
2847 gimple_seq_add_stmt (cond_expr_stmt_list, and_stmt);
2849 /* Make and_tmp the left operand of the conditional test against zero.
2850 If and_tmp has a nonzero bit then some address is unaligned. */
2851 ptrsize_zero = build_int_cst (int_ptrsize_type, 0);
2852 part_cond_expr = fold_build2 (EQ_EXPR, boolean_type_node,
2853 and_tmp_name, ptrsize_zero);
2854 chain_cond_expr (cond_expr, part_cond_expr);
2857 /* If LOOP_VINFO_CHECK_UNEQUAL_ADDRS contains <A1, B1>, ..., <An, Bn>,
2858 create a tree representation of: (&A1 != &B1) && ... && (&An != &Bn).
2859 Set *COND_EXPR to a tree that is true when both the original *COND_EXPR
2860 and this new condition are true. Treat a null *COND_EXPR as "true". */
2862 static void
2863 vect_create_cond_for_unequal_addrs (loop_vec_info loop_vinfo, tree *cond_expr)
2865 vec<vec_object_pair> pairs = LOOP_VINFO_CHECK_UNEQUAL_ADDRS (loop_vinfo);
2866 unsigned int i;
2867 vec_object_pair *pair;
2868 FOR_EACH_VEC_ELT (pairs, i, pair)
2870 tree addr1 = build_fold_addr_expr (pair->first);
2871 tree addr2 = build_fold_addr_expr (pair->second);
2872 tree part_cond_expr = fold_build2 (NE_EXPR, boolean_type_node,
2873 addr1, addr2);
2874 chain_cond_expr (cond_expr, part_cond_expr);
2878 /* Create an expression that is true when all lower-bound conditions for
2879 the vectorized loop are met. Chain this condition with *COND_EXPR. */
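/* E.g. (illustrative values): for an unsigned EXPR with a required minimum of
   4, the generated check is simply EXPR >= 4; for a signed EXPR the check
   becomes (unsigned) EXPR + 3 >= 7. */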
2881 static void
2882 vect_create_cond_for_lower_bounds (loop_vec_info loop_vinfo, tree *cond_expr)
2884 vec<vec_lower_bound> lower_bounds = LOOP_VINFO_LOWER_BOUNDS (loop_vinfo);
2885 for (unsigned int i = 0; i < lower_bounds.length (); ++i)
2887 tree expr = lower_bounds[i].expr;
2888 tree type = unsigned_type_for (TREE_TYPE (expr));
2889 expr = fold_convert (type, expr);
2890 poly_uint64 bound = lower_bounds[i].min_value;
2891 if (!lower_bounds[i].unsigned_p)
2893 expr = fold_build2 (PLUS_EXPR, type, expr,
2894 build_int_cstu (type, bound - 1));
2895 bound += bound - 1;
2897 tree part_cond_expr = fold_build2 (GE_EXPR, boolean_type_node, expr,
2898 build_int_cstu (type, bound));
2899 chain_cond_expr (cond_expr, part_cond_expr);
2903 /* Function vect_create_cond_for_alias_checks.
2905 Create a conditional expression that represents the run-time checks for
2906 overlapping of address ranges represented by a list of data references
2907 relations passed as input.
2909 Input:
2910 COND_EXPR - input conditional expression. New conditions will be chained
2911 with logical AND operation. If it is NULL, then the function
2912 is used to return the number of alias checks.
2913 LOOP_VINFO - field LOOP_VINFO_MAY_ALIAS_STMTS contains the list of ddrs
2914 to be checked.
2916 Output:
2917 COND_EXPR - conditional expression.
2919 The returned COND_EXPR is the conditional expression to be used in the if
2920 statement that controls which version of the loop gets executed at runtime.
2923 void
2924 vect_create_cond_for_alias_checks (loop_vec_info loop_vinfo, tree * cond_expr)
2926 vec<dr_with_seg_len_pair_t> comp_alias_ddrs =
2927 LOOP_VINFO_COMP_ALIAS_DDRS (loop_vinfo);
2929 if (comp_alias_ddrs.is_empty ())
2930 return;
2932 create_runtime_alias_checks (LOOP_VINFO_LOOP (loop_vinfo),
2933 &comp_alias_ddrs, cond_expr);
2934 if (dump_enabled_p ())
2935 dump_printf_loc (MSG_NOTE, vect_location,
2936 "created %u versioning for alias checks.\n",
2937 comp_alias_ddrs.length ());
2941 /* Function vect_loop_versioning.
2943 If the loop has data references that may or may not be aligned and/or
2944 has data reference relations whose independence was not proven, then
2945 two versions of the loop need to be generated, one which is vectorized
2946 and one which isn't. A test is then generated to control which of the
2947 loops is executed. The test checks for the alignment of all of the
2948 data references that may or may not be aligned. An additional
2949 sequence of runtime tests is generated for each pair of DDRs whose
2950 independence was not proven. The vectorized version of the loop is
2951 executed only if both alias and alignment tests are passed.
2953 The test generated to check which version of loop is executed
2954 is modified to also check for profitability as indicated by the
2955 cost model threshold TH.
2957 The versioning precondition(s) are placed in *COND_EXPR and
2958 *COND_EXPR_STMT_LIST. */
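/* As a rough illustration (operands made up): a loop that needs both a
   profitability check and an alignment check ends up guarded by something
   like
     if (nitersm1 >= th - 1 && ((addr & 0xf) == 0))
       vectorized loop;
     else
       scalar loop;  */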
2960 void
2961 vect_loop_versioning (loop_vec_info loop_vinfo,
2962 unsigned int th, bool check_profitability,
2963 poly_uint64 versioning_threshold)
2965 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo), *nloop;
2966 struct loop *scalar_loop = LOOP_VINFO_SCALAR_LOOP (loop_vinfo);
2967 basic_block condition_bb;
2968 gphi_iterator gsi;
2969 gimple_stmt_iterator cond_exp_gsi;
2970 basic_block merge_bb;
2971 basic_block new_exit_bb;
2972 edge new_exit_e, e;
2973 gphi *orig_phi, *new_phi;
2974 tree cond_expr = NULL_TREE;
2975 gimple_seq cond_expr_stmt_list = NULL;
2976 tree arg;
2977 profile_probability prob = profile_probability::likely ();
2978 gimple_seq gimplify_stmt_list = NULL;
2979 tree scalar_loop_iters = LOOP_VINFO_NITERSM1 (loop_vinfo);
2980 bool version_align = LOOP_REQUIRES_VERSIONING_FOR_ALIGNMENT (loop_vinfo);
2981 bool version_alias = LOOP_REQUIRES_VERSIONING_FOR_ALIAS (loop_vinfo);
2982 bool version_niter = LOOP_REQUIRES_VERSIONING_FOR_NITERS (loop_vinfo);
2984 if (check_profitability)
2985 cond_expr = fold_build2 (GE_EXPR, boolean_type_node, scalar_loop_iters,
2986 build_int_cst (TREE_TYPE (scalar_loop_iters),
2987 th - 1));
2988 if (maybe_ne (versioning_threshold, 0U))
2990 tree expr = fold_build2 (GE_EXPR, boolean_type_node, scalar_loop_iters,
2991 build_int_cst (TREE_TYPE (scalar_loop_iters),
2992 versioning_threshold - 1));
2993 if (cond_expr)
2994 cond_expr = fold_build2 (BIT_AND_EXPR, boolean_type_node,
2995 expr, cond_expr);
2996 else
2997 cond_expr = expr;
3000 if (version_niter)
3001 vect_create_cond_for_niters_checks (loop_vinfo, &cond_expr);
3003 if (cond_expr)
3004 cond_expr = force_gimple_operand_1 (cond_expr, &cond_expr_stmt_list,
3005 is_gimple_condexpr, NULL_TREE);
3007 if (version_align)
3008 vect_create_cond_for_align_checks (loop_vinfo, &cond_expr,
3009 &cond_expr_stmt_list);
3011 if (version_alias)
3013 vect_create_cond_for_unequal_addrs (loop_vinfo, &cond_expr);
3014 vect_create_cond_for_lower_bounds (loop_vinfo, &cond_expr);
3015 vect_create_cond_for_alias_checks (loop_vinfo, &cond_expr);
3018 cond_expr = force_gimple_operand_1 (cond_expr, &gimplify_stmt_list,
3019 is_gimple_condexpr, NULL_TREE);
3020 gimple_seq_add_seq (&cond_expr_stmt_list, gimplify_stmt_list);
3022 initialize_original_copy_tables ();
3023 if (scalar_loop)
3025 edge scalar_e;
3026 basic_block preheader, scalar_preheader;
3028 /* We don't want to scale SCALAR_LOOP's frequencies; we need to
3029 scale LOOP's frequencies instead. */
3030 nloop = loop_version (scalar_loop, cond_expr, &condition_bb,
3031 prob, prob.invert (), prob, prob.invert (), true);
3032 scale_loop_frequencies (loop, prob);
3033 /* CONDITION_BB was created above SCALAR_LOOP's preheader,
3034 but we need to move it above LOOP's preheader. */
3035 e = loop_preheader_edge (loop);
3036 scalar_e = loop_preheader_edge (scalar_loop);
3037 gcc_assert (empty_block_p (e->src)
3038 && single_pred_p (e->src));
3039 gcc_assert (empty_block_p (scalar_e->src)
3040 && single_pred_p (scalar_e->src));
3041 gcc_assert (single_pred_p (condition_bb));
3042 preheader = e->src;
3043 scalar_preheader = scalar_e->src;
3044 scalar_e = find_edge (condition_bb, scalar_preheader);
3045 e = single_pred_edge (preheader);
3046 redirect_edge_and_branch_force (single_pred_edge (condition_bb),
3047 scalar_preheader);
3048 redirect_edge_and_branch_force (scalar_e, preheader);
3049 redirect_edge_and_branch_force (e, condition_bb);
3050 set_immediate_dominator (CDI_DOMINATORS, condition_bb,
3051 single_pred (condition_bb));
3052 set_immediate_dominator (CDI_DOMINATORS, scalar_preheader,
3053 single_pred (scalar_preheader));
3054 set_immediate_dominator (CDI_DOMINATORS, preheader,
3055 condition_bb);
3057 else
3058 nloop = loop_version (loop, cond_expr, &condition_bb,
3059 prob, prob.invert (), prob, prob.invert (), true);
3061 if (version_niter)
3063 /* The versioned loop could be infinite, so we need to clear the existing
3064 niter information, which is copied from the original loop. */
3065 gcc_assert (loop_constraint_set_p (loop, LOOP_C_FINITE));
3066 vect_free_loop_info_assumptions (nloop);
3067 /* And set constraint LOOP_C_INFINITE for niter analyzer. */
3068 loop_constraint_set (loop, LOOP_C_INFINITE);
3071 if (LOCATION_LOCUS (vect_location) != UNKNOWN_LOCATION
3072 && dump_enabled_p ())
3074 if (version_alias)
3075 dump_printf_loc (MSG_OPTIMIZED_LOCATIONS, vect_location,
3076 "loop versioned for vectorization because of "
3077 "possible aliasing\n");
3078 if (version_align)
3079 dump_printf_loc (MSG_OPTIMIZED_LOCATIONS, vect_location,
3080 "loop versioned for vectorization to enhance "
3081 "alignment\n");
3084 free_original_copy_tables ();
3086 /* Loop versioning violates an assumption we try to maintain during
3087 vectorization - that the loop exit block has a single predecessor.
3088 After versioning, the exit block of both loop versions is the same
3089 basic block (i.e. it has two predecessors). Just in order to simplify
3090 following transformations in the vectorizer, we fix this situation
3091 here by adding a new (empty) block on the exit-edge of the loop,
3092 with the proper loop-exit phis to maintain loop-closed-form.
3093 If loop versioning wasn't done from loop, but scalar_loop instead,
3094 merge_bb will already have just a single predecessor. */
3096 merge_bb = single_exit (loop)->dest;
3097 if (scalar_loop == NULL || EDGE_COUNT (merge_bb->preds) >= 2)
3099 gcc_assert (EDGE_COUNT (merge_bb->preds) >= 2);
3100 new_exit_bb = split_edge (single_exit (loop));
3101 new_exit_e = single_exit (loop);
3102 e = EDGE_SUCC (new_exit_bb, 0);
3104 for (gsi = gsi_start_phis (merge_bb); !gsi_end_p (gsi); gsi_next (&gsi))
3106 tree new_res;
3107 orig_phi = gsi.phi ();
3108 new_res = copy_ssa_name (PHI_RESULT (orig_phi));
3109 new_phi = create_phi_node (new_res, new_exit_bb);
3110 arg = PHI_ARG_DEF_FROM_EDGE (orig_phi, e);
3111 add_phi_arg (new_phi, arg, new_exit_e,
3112 gimple_phi_arg_location_from_edge (orig_phi, e));
3113 adjust_phi_and_debug_stmts (orig_phi, e, PHI_RESULT (new_phi));
3117 /* End loop-exit-fixes after versioning. */
3119 if (cond_expr_stmt_list)
3121 cond_exp_gsi = gsi_last_bb (condition_bb);
3122 gsi_insert_seq_before (&cond_exp_gsi, cond_expr_stmt_list,
3123 GSI_SAME_STMT);
3125 update_ssa (TODO_update_ssa);